
Engines API

llm_ie.engines.InferenceEngine

InferenceEngine(config: LLMConfig, **kwrs)

This is an abstract class that provides the interface for LLM inference engines. Child classes that inherit from this class can be used in extractors and must implement the chat() method.

Parameters:

config : LLMConfig
    the LLM configuration. Must be a child class of LLMConfig.

Source code in package/llm-ie/src/llm_ie/engines.py
@abc.abstractmethod
def __init__(self, config:LLMConfig, **kwrs):
    """
    This is an abstract class to provide interfaces for LLM inference engines. 
    Children classes that inherit this class can be used in extractors. Must implement chat() method.

    Parameters:
    ----------
    config : LLMConfig
        the LLM configuration. Must be a child class of LLMConfig.
    """
    return NotImplemented

chat abstractmethod

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
) -> Union[str, Generator[Dict[str, str], None, None]]

This method inputs chat messages and outputs LLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, LLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.

Source code in package/llm-ie/src/llm_ie/engines.py
@abc.abstractmethod
def chat(self, messages:List[Dict[str,str]], 
         verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.  
    """
    return NotImplemented
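
The listings above define the contract a custom engine must satisfy. Below is a minimal sketch of a custom subclass, assuming only what is shown here; the EchoInferenceEngine name and its echo behavior are hypothetical, and the built-in engines in this package also implement chat_async() and LLMConfig-driven pre/post-processing.

from typing import Dict, Generator, List, Union

from llm_ie.engines import InferenceEngine


class EchoInferenceEngine(InferenceEngine):
    """Hypothetical engine that echoes the last user message. For illustration only."""

    def __init__(self, config=None, **kwrs):
        # A real engine would store an LLMConfig here, as the built-in engines do.
        self.config = config

    def chat(self, messages: List[Dict[str, str]],
             verbose: bool = False, stream: bool = False) -> Union[str, Generator[str, None, None]]:
        # Pretend the "model" simply repeats the last user message.
        res = messages[-1]["content"]
        if stream:
            return (word + " " for word in res.split())
        if verbose:
            print(res)
        return res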

llm_ie.engines.OllamaInferenceEngine

OllamaInferenceEngine(
    model_name: str,
    num_ctx: int = 4096,
    keep_alive: int = 300,
    config: LLMConfig = None,
    **kwrs
)

Bases: InferenceEngine

The Ollama inference engine.

Parameters:

model_name : str
    the model name exactly as shown in >> ollama ls
num_ctx : int, Optional
    context length that LLM will evaluate.
keep_alive : int, Optional
    seconds to hold the LLM after the last API call.
config : LLMConfig
    the LLM configuration.

Source code in package/llm-ie/src/llm_ie/engines.py
def __init__(self, model_name:str, num_ctx:int=4096, keep_alive:int=300, config:LLMConfig=None, **kwrs):
    """
    The Ollama inference engine.

    Parameters:
    ----------
    model_name : str
        the model name exactly as shown in >> ollama ls
    num_ctx : int, Optional
        context length that LLM will evaluate.
    keep_alive : int, Optional
        seconds to hold the LLM after the last API call.
    config : LLMConfig
        the LLM configuration. 
    """
    if importlib.util.find_spec("ollama") is None:
        raise ImportError("ollama-python not found. Please install ollama-python (```pip install ollama```).")

    from ollama import Client, AsyncClient
    self.client = Client(**kwrs)
    self.async_client = AsyncClient(**kwrs)
    self.model_name = model_name
    self.num_ctx = num_ctx
    self.keep_alive = keep_alive
    self.config = config if config else BasicLLMConfig()
    self.formatted_params = self._format_config()
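
For example, construction might look like the sketch below. The model name is a placeholder (use one listed by >> ollama ls), and any extra keyword arguments such as host are forwarded to ollama.Client and AsyncClient.

from llm_ie.engines import OllamaInferenceEngine

# "llama3.1:8b-instruct-q8_0" is only an example model tag; host is optional and
# defaults to the local Ollama server.
engine = OllamaInferenceEngine(model_name="llama3.1:8b-instruct-q8_0",
                               num_ctx=8192,
                               keep_alive=600,
                               host="http://localhost:11434")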

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
) -> Union[str, Generator[Dict[str, str], None, None]]

This method inputs chat messages and outputs LLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, LLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.

Source code in package/llm-ie/src/llm_ie/engines.py
def chat(self, messages:List[Dict[str,str]], 
         verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    """
    processed_messages = self.config.preprocess_messages(messages)

    options={'num_ctx': self.num_ctx, **self.formatted_params}
    if stream:
        def _stream_generator():
            response_stream = self.client.chat(
                model=self.model_name, 
                messages=processed_messages, 
                options=options,
                stream=True, 
                keep_alive=self.keep_alive
            )
            for chunk in response_stream:
                content_chunk = chunk.get('message', {}).get('content')
                if content_chunk:
                    yield content_chunk

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat(
                        model=self.model_name, 
                        messages=processed_messages, 
                        options=options,
                        stream=True,
                        keep_alive=self.keep_alive
                    )

        res = ''
        for chunk in response:
            content_chunk = chunk.get('message', {}).get('content')
            print(content_chunk, end='', flush=True)
            res += content_chunk
        print('\n')
        return self.config.postprocess_response(res)

    else:
        response = self.client.chat(
                            model=self.model_name, 
                            messages=processed_messages, 
                            options=options,
                            stream=False,
                            keep_alive=self.keep_alive
                        )
        res = response.get('message', {}).get('content')
        return self.config.postprocess_response(res)
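
A usage sketch of the three call patterns, assuming the engine constructed above. The exact chunk format in streaming mode depends on the configured LLMConfig's postprocess_response.

messages = [{"role": "system", "content": "You are a concise assistant."},
            {"role": "user", "content": "List three common clinical abbreviations."}]

# Blocking call: returns the post-processed response.
text = engine.chat(messages)

# Verbose call: prints generated text to the terminal in real-time, then returns it.
text = engine.chat(messages, verbose=True)

# Streaming call: returns a generator that yields output in real-time.
for chunk in engine.chat(messages, stream=True):
    print(chunk, end="", flush=True)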

chat_async async

chat_async(messages: List[Dict[str, str]]) -> str

Async version of chat method. Streaming is not supported.

Source code in package/llm-ie/src/llm_ie/engines.py
async def chat_async(self, messages:List[Dict[str,str]]) -> str:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.async_client.chat(
                        model=self.model_name, 
                        messages=processed_messages, 
                        options={'num_ctx': self.num_ctx, **self.formatted_params},
                        stream=False,
                        keep_alive=self.keep_alive
                    )

    res = response['message']['content']
    return self.config.postprocess_response(res)
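
A sketch of calling the async method from an event loop, assuming the engine constructed above:

import asyncio

messages = [{"role": "user", "content": "Summarize this note in one sentence."}]
text = asyncio.run(engine.chat_async(messages))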

llm_ie.engines.OpenAIInferenceEngine

OpenAIInferenceEngine(
    model: str, config: LLMConfig = None, **kwrs
)

Bases: InferenceEngine

The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
- vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
- Llama.cpp OpenAI compatible server (https://llama-cpp-python.readthedocs.io/en/latest/server/)

For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

Parameters:

model : str
    model name as described in https://platform.openai.com/docs/models
config : LLMConfig
    the LLM configuration.

Source code in package/llm-ie/src/llm_ie/engines.py
def __init__(self, model:str, config:LLMConfig=None, **kwrs):
    """
    The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
    - vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)
    - Llama.cpp OpenAI compatible server (https://llama-cpp-python.readthedocs.io/en/latest/server/)

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models
    """
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

    from openai import OpenAI, AsyncOpenAI
    self.client = OpenAI(**kwrs)
    self.async_client = AsyncOpenAI(**kwrs)
    self.model = model
    self.config = config if config else BasicLLMConfig()
    self.formatted_params = self._format_config()
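
Two typical constructions, sketched under the assumption that extra keyword arguments are forwarded to openai.OpenAI and AsyncOpenAI (as the source above shows). Model names are placeholders.

from llm_ie.engines import OpenAIInferenceEngine

# Against the OpenAI API; the key is read from the OPENAI_API_KEY environment
# variable unless api_key is passed explicitly.
engine = OpenAIInferenceEngine(model="gpt-4o-mini")

# Against an OpenAI-compatible server (e.g. vLLM or llama.cpp) running locally.
local_engine = OpenAIInferenceEngine(model="meta-llama/Llama-3.1-8B-Instruct",
                                     base_url="http://localhost:8000/v1",
                                     api_key="EMPTY")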

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
) -> Union[str, Generator[Dict[str, str], None, None]]

This method inputs chat messages and outputs LLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, LLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.

Source code in package/llm-ie/src/llm_ie/engines.py
def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    """
    processed_messages = self.config.preprocess_messages(messages)

    if stream:
        def _stream_generator():
            response_stream = self.client.chat.completions.create(
                                    model=self.model,
                                    messages=processed_messages,
                                    stream=True,
                                    **self.formatted_params
                                )
            for chunk in response_stream:
                if len(chunk.choices) > 0:
                    if chunk.choices[0].delta.content is not None:
                        yield chunk.choices[0].delta.content
                    if chunk.choices[0].finish_reason == "length":
                        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=True,
            **self.formatted_params
        )
        res = ''
        for chunk in response:
            if len(chunk.choices) > 0:
                if chunk.choices[0].delta.content is not None:
                    res += chunk.choices[0].delta.content
                    print(chunk.choices[0].delta.content, end="", flush=True)
                if chunk.choices[0].finish_reason == "length":
                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

        print('\n')
        return self.config.postprocess_response(res)
    else:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=False,
            **self.formatted_params
        )
        res = response.choices[0].message.content
        return self.config.postprocess_response(res)

chat_async async

chat_async(messages: List[Dict[str, str]]) -> str

Async version of chat method. Streaming is not supported.

Source code in package/llm-ie/src/llm_ie/engines.py
async def chat_async(self, messages:List[Dict[str,str]]) -> str:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.async_client.chat.completions.create(
        model=self.model,
        messages=processed_messages,
        stream=False,
        **self.formatted_params
    )

    if response.choices[0].finish_reason == "length":
        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

    res = response.choices[0].message.content
    return self.config.postprocess_response(res)
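
Because the engine holds an AsyncOpenAI client, several chat_async calls can be awaited concurrently. A sketch, assuming the engine constructed above (the prompts are placeholders):

import asyncio

batches = [[{"role": "user", "content": f"Question {i}"}] for i in range(3)]

async def run_all():
    # Each call is independent, so the coroutines can be gathered concurrently.
    return await asyncio.gather(*(engine.chat_async(m) for m in batches))

results = asyncio.run(run_all())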

llm_ie.engines.AzureOpenAIInferenceEngine

AzureOpenAIInferenceEngine(
    model: str,
    api_version: str,
    config: LLMConfig = None,
    **kwrs
)

Bases: OpenAIInferenceEngine

The Azure OpenAI API inference engine. For parameters and documentation, refer to:
- https://azure.microsoft.com/en-us/products/ai-services/openai-service
- https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

Parameters:

model : str
    model name as described in https://platform.openai.com/docs/models
api_version : str
    the Azure OpenAI API version
config : LLMConfig
    the LLM configuration.

Source code in package/llm-ie/src/llm_ie/engines.py
def __init__(self, model:str, api_version:str, config:LLMConfig=None, **kwrs):
    """
    The Azure OpenAI API inference engine.
    For parameters and documentation, refer to 
    - https://azure.microsoft.com/en-us/products/ai-services/openai-service
    - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models
    api_version : str
        the Azure OpenAI API version
    config : LLMConfig
        the LLM configuration.
    """
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

    from openai import AzureOpenAI, AsyncAzureOpenAI
    self.model = model
    self.api_version = api_version
    self.client = AzureOpenAI(api_version=self.api_version, 
                              **kwrs)
    self.async_client = AsyncAzureOpenAI(api_version=self.api_version, 
                                         **kwrs)
    self.config = config if config else BasicLLMConfig()
    self.formatted_params = self._format_config()
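
A construction sketch; the deployment name, API version, endpoint, and key below are placeholders. Extra keyword arguments are forwarded to openai.AzureOpenAI and AsyncAzureOpenAI, so the endpoint and key can typically also be supplied via the AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY environment variables instead.

from llm_ie.engines import AzureOpenAIInferenceEngine

engine = AzureOpenAIInferenceEngine(model="my-gpt-4o-deployment",
                                    api_version="2024-06-01",
                                    azure_endpoint="https://my-resource.openai.azure.com",
                                    api_key="<azure-api-key>")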

llm_ie.engines.HuggingFaceHubInferenceEngine

HuggingFaceHubInferenceEngine(
    model: str = None,
    token: Union[str, bool] = None,
    base_url: str = None,
    api_key: str = None,
    config: LLMConfig = None,
    **kwrs
)

Bases: InferenceEngine

The Huggingface_hub InferenceClient inference engine. For parameters and documentation, refer to https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client

Parameters:

model : str
    the model name exactly as shown in Huggingface repo
token : str, Optional
    the Huggingface token. If None, will use the token in os.environ['HF_TOKEN'].
base_url : str, Optional
    the base url for the LLM server. If None, will use the default Huggingface Hub URL.
api_key : str, Optional
    the API key for the LLM server.
config : LLMConfig
    the LLM configuration.

Source code in package/llm-ie/src/llm_ie/engines.py
def __init__(self, model:str=None, token:Union[str, bool]=None, base_url:str=None, api_key:str=None, config:LLMConfig=None, **kwrs):
    """
    The Huggingface_hub InferenceClient inference engine.
    For parameters and documentation, refer to https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client

    Parameters:
    ----------
    model : str
        the model name exactly as shown in Huggingface repo
    token : str, Optional
        the Huggingface token. If None, will use the token in os.environ['HF_TOKEN'].
    base_url : str, Optional
        the base url for the LLM server. If None, will use the default Huggingface Hub URL.
    api_key : str, Optional
        the API key for the LLM server. 
    config : LLMConfig
        the LLM configuration. 
    """
    if importlib.util.find_spec("huggingface_hub") is None:
        raise ImportError("huggingface-hub not found. Please install huggingface-hub (```pip install huggingface-hub```).")

    from huggingface_hub import InferenceClient, AsyncInferenceClient
    self.model = model
    self.base_url = base_url
    self.client = InferenceClient(model=model, token=token, base_url=base_url, api_key=api_key, **kwrs)
    self.client_async = AsyncInferenceClient(model=model, token=token, base_url=base_url, api_key=api_key, **kwrs)
    self.config = config if config else BasicLLMConfig()
    self.formatted_params = self._format_config()
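
Two construction sketches; the model id and server URL are placeholders.

from llm_ie.engines import HuggingFaceHubInferenceEngine

# Against the Hugging Face Inference API; the token falls back to os.environ['HF_TOKEN'].
engine = HuggingFaceHubInferenceEngine(model="meta-llama/Llama-3.1-8B-Instruct")

# Against a self-hosted endpoint (e.g. a text-generation-inference server).
local_engine = HuggingFaceHubInferenceEngine(base_url="http://localhost:8080")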

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
) -> Union[str, Generator[Dict[str, str], None, None]]

This method inputs chat messages and outputs LLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, LLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.

Source code in package/llm-ie/src/llm_ie/engines.py
def chat(self, messages:List[Dict[str,str]], 
         verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    """
    processed_messages = self.config.preprocess_messages(messages)

    if stream:
        def _stream_generator():
            response_stream = self.client.chat.completions.create(
                                messages=processed_messages,
                                stream=True,
                                **self.formatted_params
                            )
            for chunk in response_stream:
                content_chunk = chunk.get('choices')[0].get('delta').get('content')
                if content_chunk:
                    yield content_chunk

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat.completions.create(
                        messages=processed_messages,
                        stream=True,
                        **self.formatted_params
                    )

        res = ''
        for chunk in response:
            content_chunk = chunk.get('choices')[0].get('delta').get('content')
            if content_chunk:
                res += content_chunk
                print(content_chunk, end='', flush=True)
        return self.config.postprocess_response(res)

    else:
        response = self.client.chat.completions.create(
                            messages=processed_messages,
                            stream=False,
                            **self.formatted_params
                        )
        res = response.choices[0].message.content
        return self.config.postprocess_response(res)

chat_async async

chat_async(messages: List[Dict[str, str]]) -> str

Async version of chat method. Streaming is not supported.

Source code in package/llm-ie/src/llm_ie/engines.py
async def chat_async(self, messages:List[Dict[str,str]]) -> str:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.client_async.chat.completions.create(
                messages=processed_messages,
                stream=False,
                **self.formatted_params
            )

    res = response.choices[0].message.content
    return self.config.postprocess_response(res)

llm_ie.engines.LiteLLMInferenceEngine

LiteLLMInferenceEngine(
    model: str = None,
    base_url: str = None,
    api_key: str = None,
    config: LLMConfig = None,
)

Bases: InferenceEngine

The LiteLLM inference engine. For parameters and documentation, refer to https://github.com/BerriAI/litellm?tab=readme-ov-file

Parameters:

model : str
    the model name
base_url : str, Optional
    the base url for the LLM server
api_key : str, Optional
    the API key for the LLM server
config : LLMConfig
    the LLM configuration.

Source code in package/llm-ie/src/llm_ie/engines.py
def __init__(self, model:str=None, base_url:str=None, api_key:str=None, config:LLMConfig=None):
    """
    The LiteLLM inference engine. 
    For parameters and documentation, refer to https://github.com/BerriAI/litellm?tab=readme-ov-file

    Parameters:
    ----------
    model : str
        the model name
    base_url : str, Optional
        the base url for the LLM server
    api_key : str, Optional
        the API key for the LLM server
    config : LLMConfig
        the LLM configuration.
    """
    if importlib.util.find_spec("litellm") is None:
        raise ImportError("litellm not found. Please install litellm (```pip install litellm```).")

    import litellm 
    self.litellm = litellm
    self.model = model
    self.base_url = base_url
    self.api_key = api_key
    self.config = config if config else BasicLLMConfig()
    self.formatted_params = self._format_config()
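
Construction sketches; the model strings and URL are placeholders, and LiteLLM's provider-prefixed model naming is assumed (see the LiteLLM documentation linked above).

from llm_ie.engines import LiteLLMInferenceEngine

# Routed to OpenAI; the key is taken from the environment unless api_key is given.
engine = LiteLLMInferenceEngine(model="openai/gpt-4o-mini")

# Routed to a local Ollama server through LiteLLM.
local_engine = LiteLLMInferenceEngine(model="ollama/llama3.1",
                                      base_url="http://localhost:11434")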

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
) -> Union[str, Generator[Dict[str, str], None, None]]

This method inputs chat messages and outputs LLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, LLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.

Source code in package/llm-ie/src/llm_ie/engines.py
def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False) -> Union[str, Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"} 
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    """
    processed_messages = self.config.preprocess_messages(messages)

    if stream:
        def _stream_generator():
            response_stream = self.litellm.completion(
                model=self.model,
                messages=processed_messages,
                stream=True,
                base_url=self.base_url,
                api_key=self.api_key,
                **self.formatted_params
            )

            for chunk in response_stream:
                chunk_content = chunk.get('choices')[0].get('delta').get('content')
                if chunk_content:
                    yield chunk_content

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.litellm.completion(
            model=self.model,
            messages=processed_messages,
            stream=True,
            base_url=self.base_url,
            api_key=self.api_key,
            **self.formatted_params
        )

        res = ''
        for chunk in response:
            chunk_content = chunk.get('choices')[0].get('delta').get('content')
            if chunk_content:
                res += chunk_content
                print(chunk_content, end='', flush=True)

        return self.config.postprocess_response(res)

    else:
        response = self.litellm.completion(
                model=self.model,
                messages=processed_messages,
                stream=False,
                base_url=self.base_url,
                api_key=self.api_key,
                **self.formatted_params
            )
        res = response.choices[0].message.content
        return self.config.postprocess_response(res)

chat_async async

chat_async(messages: List[Dict[str, str]]) -> str

Async version of chat method. Streaming is not supported.

Source code in package/llm-ie/src/llm_ie/engines.py
async def chat_async(self, messages:List[Dict[str,str]]) -> str:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.litellm.acompletion(
        model=self.model,
        messages=processed_messages,
        stream=False,
        base_url=self.base_url,
        api_key=self.api_key,
        **self.formatted_params
    )

    res = response.get('choices')[0].get('message').get('content')
    return self.config.postprocess_response(res)
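
A usage sketch, assuming the engine constructed above; the prompt is a placeholder.

import asyncio

messages = [{"role": "user", "content": "Extract all medication names from: aspirin 81 mg daily."}]

# Synchronous call with real-time printing.
text = engine.chat(messages, verbose=True)

# Asynchronous call (streaming is not supported here).
text = asyncio.run(engine.chat_async(messages))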