VLM Engines Reference

vlm4ocr.vlm_engines.BasicVLMConfig

BasicVLMConfig(
    max_new_tokens: int = 2048,
    temperature: float = 0.0,
    **kwargs
)

Bases: VLMConfig

The basic VLM configuration for most non-reasoning models.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, max_new_tokens:int=2048, temperature:float=0.0, **kwargs):
    """
    The basic VLM configuration for most non-reasoning models.
    """
    super().__init__(**kwargs)
    self.max_new_tokens = max_new_tokens
    self.temperature = temperature
    self.params["max_new_tokens"] = self.max_new_tokens
    self.params["temperature"] = self.temperature
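
Example:

A minimal usage sketch: BasicVLMConfig records the sampling parameters in its params dict, which engines forward to the backend at request time.

from vlm4ocr.vlm_engines import BasicVLMConfig

# Collect sampling parameters; engines read config.params when formatting requests.
config = BasicVLMConfig(max_new_tokens=4096, temperature=0.0)
print(config.params)  # {'max_new_tokens': 4096, 'temperature': 0.0}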

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

This method preprocesses the input messages before passing them to the VLM.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    This method preprocesses the input messages before passing them to the VLM.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    """
    return messages

postprocess_response

postprocess_response(
    response: Union[
        str, Dict[str, str], Generator[str, None, None]
    ],
) -> Union[
    Dict[str, str], Generator[Dict[str, str], None, None]
]

This method postprocesses the VLM response after it is generated.

Parameters:

response : Union[str, Dict[str, str], Generator[str, None, None]]
    the VLM response. Can be a string, a dict, or a generator.

Returns:

response : Union[Dict[str, str], Generator[Dict[str, str], None, None]]
    the postprocessed VLM response. If the input is a generator, the output will be a generator yielding {"type": "response", "data": <content>}.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
    """
    This method postprocesses the VLM response after it is generated.

    Parameters:
    ----------
    response : Union[str, Generator[str, None, None]]
        the VLM response. Can be a string or a generator.

    Returns:
    -------
    response : Union[Dict[str, str], Generator[Dict[str, str], None, None]]
        the postprocessed VLM response.
        If the input is a generator, the output will be a generator yielding {"type": "response", "data": <content>}.
    """
    if isinstance(response, str):
        return {"response": response}

    elif isinstance(response, dict):
        if "response" in response:
            return response
        else:
            warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
            return {"response": ""}

    def _process_stream():
        for chunk in response:
            if isinstance(chunk, dict):
                yield chunk
            elif isinstance(chunk, str):
                yield {"type": "response", "data": chunk}

    return _process_stream()
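
Example:

A short sketch of the input shapes handled above; values are illustrative.

from vlm4ocr.vlm_engines import BasicVLMConfig

config = BasicVLMConfig()

# A plain string is wrapped in a dict.
print(config.postprocess_response("Hello"))  # {'response': 'Hello'}

# A stream of string chunks is re-tagged chunk by chunk.
stream = (part for part in ["Hel", "lo"])
for item in config.postprocess_response(stream):
    print(item)  # {'type': 'response', 'data': 'Hel'} then {'type': 'response', 'data': 'lo'}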

vlm4ocr.vlm_engines.ReasoningVLMConfig

ReasoningVLMConfig(
    thinking_token_start="<think>",
    thinking_token_end="</think>",
    **kwargs
)

Bases: VLMConfig

The general configuration for reasoning vision models.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, thinking_token_start="<think>", thinking_token_end="</think>", **kwargs):
    """
    The general configuration for reasoning vision models.
    """
    super().__init__(**kwargs)
    self.thinking_token_start = thinking_token_start
    self.thinking_token_end = thinking_token_end

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

This method preprocesses the input messages before passing them to the VLM.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    This method preprocesses the input messages before passing them to the VLM.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    """
    return messages.copy()

postprocess_response

postprocess_response(
    response: Union[
        str, Dict[str, str], Generator[str, None, None]
    ],
) -> Union[
    Dict[str, str], Generator[Dict[str, str], None, None]
]

This method postprocesses the VLM response after it is generated.

1. If the input is a string, it will extract the reasoning and response based on the thinking tokens.
2. If the input is a dict, it should contain keys "reasoning" and "response". This is for inference engines that already parse reasoning and response.
3. If the input is a generator:
    a. if the chunk is a dict, it should contain keys "type" and "data". This is for inference engines that already parse reasoning and response.
    b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

Parameters:

response : Union[str, Dict[str, str], Generator[str, None, None]]
    the VLM response. Can be a string, a dict, or a generator.

Returns:

response : Union[Dict[str,str], Generator[Dict[str,str], None, None]]
    the postprocessed VLM response as a dict {"reasoning": <reasoning>, "response": <content>}.
    If the input is a generator, the output will be a generator yielding {"type": <reasoning or response>, "data": <content>}.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None]]) -> Union[Dict[str,str], Generator[Dict[str,str], None, None]]:
    """
    This method postprocesses the VLM response after it is generated.
    1. If input is a string, it will extract the reasoning and response based on the thinking tokens.
    2. If input is a dict, it should contain keys "reasoning" and "response". This is for inference engines that already parse reasoning and response.
    3. If input is a generator, 
        a. if the chunk is a dict, it should contain keys "type" and "data". This is for inference engines that already parse reasoning and response.
        b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

    Parameters:
    ----------
    response : Union[str, Generator[str, None, None]]
        the VLM response. Can be a string or a generator.

    Returns:
    -------
    response : Union[str, Generator[str, None, None]]
        the postprocessed LLM response as a dict {"reasoning": <reasoning>, "response": <content>}
        if input is a generator, the output will be a generator {"type": <reasoning or response>, "data": <content>}.
    """
    if isinstance(response, str):
        # get contents between thinking_token_start and thinking_token_end
        pattern = f"{re.escape(self.thinking_token_start)}(.*?){re.escape(self.thinking_token_end)}"
        match = re.search(pattern, response, re.DOTALL)
        reasoning = match.group(1) if match else ""
        # get response AFTER thinking_token_end
        response = re.sub(f".*?{self.thinking_token_end}", "", response, flags=re.DOTALL).strip()
        return {"reasoning": reasoning, "response": response}

    elif isinstance(response, dict):
        if "reasoning" in response and "response" in response:
            return response
        else:
            warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
            return {"reasoning": "", "response": ""}

    elif isinstance(response, Generator):
        def _process_stream():
            think_flag = False
            buffer = ""
            for chunk in response:
                if isinstance(chunk, dict):
                    yield chunk

                elif isinstance(chunk, str):
                    buffer += chunk
                    # switch between reasoning and response
                    if self.thinking_token_start in buffer:
                        think_flag = True
                        buffer = buffer.replace(self.thinking_token_start, "")
                    elif self.thinking_token_end in buffer:
                        think_flag = False
                        buffer = buffer.replace(self.thinking_token_end, "")

                    # if chunk is in thinking block, tag it as reasoning; else tag it as response
                    if chunk not in [self.thinking_token_start, self.thinking_token_end]:
                        if think_flag:
                            yield {"type": "reasoning", "data": chunk}
                        else:
                            yield {"type": "response", "data": chunk}

        return _process_stream()

    else:
        warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
        return {"reasoning": "", "response": ""}
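
Example:

A sketch of the string path above, using the default <think> tokens; the OCR content is illustrative.

from vlm4ocr.vlm_engines import ReasoningVLMConfig

config = ReasoningVLMConfig()
raw = "<think>Two-column page, cursive handwriting.</think>Dear Sir, ..."
print(config.postprocess_response(raw))
# {'reasoning': 'Two-column page, cursive handwriting.', 'response': 'Dear Sir, ...'}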

vlm4ocr.vlm_engines.OpenAIReasoningVLMConfig

OpenAIReasoningVLMConfig(
    reasoning_effort: str = "low", **kwargs
)

Bases: ReasoningVLMConfig

The OpenAI "o" series configuration.

1. The reasoning effort is set to "low" by default.
2. The temperature parameter is not supported and will be ignored.
3. The system prompt is not supported and will be concatenated to the next user prompt.

Parameters:

reasoning_effort : str, Optional
    the reasoning effort. Must be one of {"low", "medium", "high"}. Default is "low".

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, reasoning_effort:str="low", **kwargs):
    """
    The OpenAI "o" series configuration.
    1. The reasoning effort is set to "low" by default.
    2. The temperature parameter is not supported and will be ignored.
    3. The system prompt is not supported and will be concatenated to the next user prompt.

    Parameters:
    ----------
    reasoning_effort : str, Optional
        the reasoning effort. Must be one of {"low", "medium", "high"}. Default is "low".
    """
    super().__init__(**kwargs)
    if reasoning_effort not in ["low", "medium", "high"]:
        raise ValueError("reasoning_effort must be one of {'low', 'medium', 'high'}.")

    self.reasoning_effort = reasoning_effort
    self.params["reasoning_effort"] = self.reasoning_effort

    if "temperature" in self.params:
        warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
        self.params.pop("temperature")

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

Concatenate system prompts to the next user prompt.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    Concatenate system prompts to the next user prompt.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    """
    system_prompt_holder = ""
    new_messages = []
    for i, message in enumerate(messages):
        # if system prompt, store it in system_prompt_holder
        if message['role'] == 'system':
            system_prompt_holder = message['content']
        # if user prompt, concatenate it with system_prompt_holder
        elif message['role'] == 'user':
            if system_prompt_holder:
                new_message = {'role': message['role'], 'content': f"{system_prompt_holder} {message['content']}"}
                system_prompt_holder = ""
            else:
                new_message = {'role': message['role'], 'content': message['content']}

            new_messages.append(new_message)
        # if assistant/other prompt, do nothing
        else:
            new_message = {'role': message['role'], 'content': message['content']}
            new_messages.append(new_message)

    return new_messages
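
Example:

A sketch of the system-prompt merge above; note the system message itself is dropped from the output.

from vlm4ocr.vlm_engines import OpenAIReasoningVLMConfig

config = OpenAIReasoningVLMConfig(reasoning_effort="medium")
messages = [
    {"role": "system", "content": "You are an OCR assistant."},
    {"role": "user", "content": "Transcribe this page."},
]
print(config.preprocess_messages(messages))
# [{'role': 'user', 'content': 'You are an OCR assistant. Transcribe this page.'}]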

vlm4ocr.vlm_engines.OpenAICompatibleVLMEngine

OpenAICompatibleVLMEngine(
    model: str,
    api_key: str,
    base_url: str,
    config: VLMConfig = None,
    **kwrs
)

Bases: VLMEngine

General OpenAI-compatible server inference engine.
https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

Parameters:

model : str
    model name as shown on the OpenAI-compatible server
api_key : str
    the API key for the server.
base_url : str
    the base url for the server.
config : VLMConfig
    the VLM configuration.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model:str, api_key:str, base_url:str, config:VLMConfig=None, **kwrs):
    """
    General OpenAI-compatible server inference engine.
    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as shown on the OpenAI-compatible server
    api_key : str
        the API key for the server.
    base_url : str
        the base url for the server. 
    config : VLMConfig
        the VLM configuration.
    """
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpenAI (```pip install openai```).")

    from openai import OpenAI, AsyncOpenAI
    from openai.types.chat import ChatCompletionChunk
    self.ChatCompletionChunk = ChatCompletionChunk
    super().__init__(config)
    self.client = OpenAI(api_key=api_key, base_url=base_url, **kwrs)
    self.async_client = AsyncOpenAI(api_key=api_key, base_url=base_url, **kwrs)
    self.model = model
    self.config = config if config else BasicVLMConfig()
    self.formatted_params = self._format_config()

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
    messages_logger: MessagesLogger = None,
) -> Union[
    Dict[str, str], Generator[Dict[str, str], None, None]
]

This method inputs chat messages and outputs VLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, VLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.
messages_logger : MessagesLogger, Optional
    the message logger that logs the chat messages.

Returns:

response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
    a dict {"reasoning": <reasoning>, "response": <response>} or a Generator yielding {"type": <reasoning or response>, "data": <content>}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, 
         messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, VLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    messages_logger : MessagesLogger, Optional
        the message logger that logs the chat messages.

    Returns:
    -------
    response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
        a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
    """
    processed_messages = self.config.preprocess_messages(messages)

    if stream:
        def _stream_generator():
            response_stream = self.client.chat.completions.create(
                                    model=self.model,
                                    messages=processed_messages,
                                    stream=True,
                                    **self.formatted_params
                                )
            res_text = ""
            for chunk in response_stream:
                if len(chunk.choices) > 0:
                    chunk_dict = self._format_response(chunk)
                    yield chunk_dict

                    res_text += chunk_dict["data"]
                    if chunk.choices[0].finish_reason == "length":
                        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

            # Postprocess response
            res_dict = self.config.postprocess_response(res_text)
            # Write to messages log
            if messages_logger:
                # replace images content with a placeholder "[image]" to save space
                for messages in processed_messages:
                    if "content" in messages and isinstance(messages["content"], list):
                        for content in messages["content"]:
                            if isinstance(content, dict) and content.get("type") == "image_url":
                                content["image_url"]["url"] = "[image]"

                processed_messages.append({"role": "assistant",
                                            "content": res_dict.get("response", ""),
                                            "reasoning": res_dict.get("reasoning", "")})
                messages_logger.log_messages(processed_messages)

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=True,
            **self.formatted_params
        )
        res = {"reasoning": "", "response": ""}
        phase = ""
        for chunk in response:
            if len(chunk.choices) > 0:
                chunk_dict = self._format_response(chunk)
                chunk_text = chunk_dict["data"]
                res[chunk_dict["type"]] += chunk_text
                if phase != chunk_dict["type"] and chunk_text != "":
                    print(f"\n--- {chunk_dict['type'].capitalize()} ---")
                    phase = chunk_dict["type"]

                print(chunk_text, end="", flush=True)
                if chunk.choices[0].finish_reason == "length":
                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

        print('\n')

    else:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=False,
            **self.formatted_params
        )
        res = self._format_response(response)

        if response.choices[0].finish_reason == "length":
            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "content" in messages and isinstance(messages["content"], list):
                for content in messages["content"]:
                    if isinstance(content, dict) and content.get("type") == "image_url":
                        content["image_url"]["url"] = "[image]"

        processed_messages.append({"role": "assistant", 
                                "content": res_dict.get("response", ""), 
                                "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict
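
Example:

A usage sketch, assuming the engine constructed in the sketch above.

messages = [{"role": "user", "content": "Describe the layout of this page."}]

# Blocking call: returns a dict {"reasoning": ..., "response": ...}
result = engine.chat(messages)
print(result["response"])

# Streaming call: returns a generator of {"type": ..., "data": ...} chunks
for chunk in engine.chat(messages, stream=True):
    print(chunk["data"], end="", flush=True)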

chat_async async

chat_async(
    messages: List[Dict[str, str]],
    messages_logger: MessagesLogger = None,
) -> Dict[str, str]

Async version of chat method. Streaming is not supported.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.async_client.chat.completions.create(
        model=self.model,
        messages=processed_messages,
        stream=False,
        **self.formatted_params
    )

    if response.choices[0].finish_reason == "length":
        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

    res = self._format_response(response)

    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "content" in messages and isinstance(messages["content"], list):
                for content in messages["content"]:
                    if isinstance(content, dict) and content.get("type") == "image_url":
                        content["image_url"]["url"] = "[image]"

        processed_messages.append({"role": "assistant", 
                                    "content": res_dict.get("response", ""), 
                                    "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict
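
Example:

A sketch of the async path, assuming the engine and messages from the sketches above; streaming is not available here.

import asyncio

async def main():
    result = await engine.chat_async(messages)
    print(result["response"])

asyncio.run(main())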

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    format: str = "png",
    detail: str = "high",
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method inputs an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str
    the system prompt.
user_prompt : str
    the user prompt.
image : Image.Image
    the image for OCR.
format : str, Optional
    the image format.
detail : str, Optional
    the detail level of the image. Default is "high".
few_shot_examples : List[FewShotExample], Optional
    list of few-shot examples.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', 
                     detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    This method inputs an image and returns the corresponding chat messages for the inference engine.

    Parameters:
    ----------
    system_prompt : str
        the system prompt.
    user_prompt : str
        the user prompt.
    image : Image.Image
        the image for OCR.
    format : str, Optional
        the image format. 
    detail : str, Optional
        the detail level of the image. Default is "high". 
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples.
    """
    base64_str = image_to_base64(image)
    output_messages = []
    # system message
    system_message = {"role": "system", "content": system_prompt}
    output_messages.append(system_message)

    # few-shot examples
    if few_shot_examples is not None:
        for example in few_shot_examples:
            if not isinstance(example, FewShotExample):
                raise ValueError("Few-shot example must be a FewShotExample object.")

            example_image_b64 = image_to_base64(example.image)
            example_user_message = {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{format};base64,{example_image_b64}",
                            "detail": detail
                        },
                    },
                    {"type": "text", "text": user_prompt},
                ],
            }
            example_agent_message = {"role": "assistant", "content": example.text}
            output_messages.append(example_user_message)
            output_messages.append(example_agent_message)

    # user message
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/{format};base64,{base64_str}",
                    "detail": detail
                },
            },
            {"type": "text", "text": user_prompt},
        ],
    }
    output_messages.append(user_message)
    return output_messages
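
Example:

A sketch tying the pieces together, assuming the engine from the sketches above; the file path is a placeholder.

from PIL import Image

page = Image.open("scan.png")  # placeholder path
messages = engine.get_ocr_messages(
    system_prompt="You are an OCR engine.",
    user_prompt="Transcribe all text in this image.",
    image=page,
    detail="high",
)
result = engine.chat(messages)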

vlm4ocr.vlm_engines.VLLMVLMEngine

VLLMVLMEngine(
    model: str,
    api_key: str = "",
    base_url: str = "http://localhost:8000/v1",
    config: VLMConfig = None,
    **kwrs
)

Bases: OpenAICompatibleVLMEngine

vLLM OpenAI compatible server inference engine.
https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

Parameters:

model : str
    model name as shown in the vLLM server
api_key : str, Optional
    the API key for the vLLM server.
base_url : str, Optional
    the base url for the vLLM server.
config : VLMConfig
    the VLM configuration.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:8000/v1", config:VLMConfig=None, **kwrs):
    """
    vLLM OpenAI compatible server inference engine.
    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as shown in the vLLM server
    api_key : str, Optional
        the API key for the vLLM server.
    base_url : str, Optional
        the base url for the vLLM server. 
    config : VLMConfig
        the VLM configuration.
    """
    super().__init__(model, api_key, base_url, config, **kwrs)
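
Example:

Since only the defaults differ from the parent class, construction is a one-liner; the model name is a placeholder.

from vlm4ocr.vlm_engines import VLLMVLMEngine

# Connects to http://localhost:8000/v1 with an empty API key by default.
engine = VLLMVLMEngine(model="Qwen/Qwen2.5-VL-7B-Instruct")  # placeholder model name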

vlm4ocr.vlm_engines.OpenRouterVLMEngine

OpenRouterVLMEngine(
    model: str,
    api_key: str = None,
    base_url: str = "https://openrouter.ai/api/v1",
    config: VLMConfig = None,
    **kwrs
)

Bases: OpenAICompatibleVLMEngine

OpenRouter OpenAI-compatible server inference engine.

Parameters:

model : str
    model name as shown on OpenRouter.
api_key : str, Optional
    the API key for OpenRouter. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
base_url : str, Optional
    the base url for the OpenRouter API.
config : VLMConfig
    the VLM configuration.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:VLMConfig=None, **kwrs):
    """
    OpenRouter OpenAI-compatible server inference engine.

    Parameters:
    ----------
    model : str
        model name as shown on OpenRouter.
    api_key : str, Optional
        the API key for OpenRouter. If None, will use the key in os.environ['OPENROUTER_API_KEY'].
    base_url : str, Optional
        the base url for the OpenRouter API. 
    config : VLMConfig
        the VLM configuration.
    """
    self.api_key = api_key
    if self.api_key is None:
        self.api_key = os.getenv("OPENROUTER_API_KEY")
    super().__init__(model, self.api_key, base_url, config, **kwrs)
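
Example:

A construction sketch; the key and model id below are placeholders, and the key can also be passed directly via api_key.

import os
from vlm4ocr.vlm_engines import OpenRouterVLMEngine

os.environ["OPENROUTER_API_KEY"] = "sk-or-..."                      # placeholder key
engine = OpenRouterVLMEngine(model="qwen/qwen2.5-vl-72b-instruct")  # placeholder model id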

vlm4ocr.vlm_engines.OllamaVLMEngine

OllamaVLMEngine(
    model_name: str,
    num_ctx: int = 8192,
    keep_alive: int = 300,
    config: VLMConfig = None,
    **kwrs
)

Bases: VLMEngine

The Ollama inference engine.

Parameters:

model_name : str
    the model name exactly as shown in >> ollama ls
num_ctx : int, Optional
    context length that the VLM will evaluate.
keep_alive : int, Optional
    seconds to hold the VLM after the last API call.
config : VLMConfig
    the VLM configuration.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model_name:str, num_ctx:int=8192, keep_alive:int=300, config:VLMConfig=None, **kwrs):
    """
    The Ollama inference engine.

    Parameters:
    ----------
    model_name : str
        the model name exactly as shown in >> ollama ls
    num_ctx : int, Optional
        context length that LLM will evaluate.
    keep_alive : int, Optional
        seconds to hold the LLM after the last API call.
    config : VLMConfig
        the VLM configuration. 
    """
    if importlib.util.find_spec("ollama") is None:
        raise ImportError("ollama-python not found. Please install ollama-python (```pip install ollama```).")

    from ollama import Client, AsyncClient
    self.client = Client(**kwrs)
    self.async_client = AsyncClient(**kwrs)
    self.model_name = model_name
    self.num_ctx = num_ctx
    self.keep_alive = keep_alive
    self.config = config if config else BasicVLMConfig()
    self.formatted_params = self._format_config()
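
Example:

A construction sketch, assuming a local Ollama daemon; the model name is a placeholder taken from ollama ls.

from vlm4ocr.vlm_engines import OllamaVLMEngine

engine = OllamaVLMEngine(
    model_name="llama3.2-vision",   # placeholder: a vision model pulled locally
    num_ctx=16384,                  # evaluate up to 16k tokens of context
    keep_alive=600,                 # keep the model loaded for 10 minutes after the last call
)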

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
    messages_logger: MessagesLogger = None,
) -> Union[
    Dict[str, str], Generator[Dict[str, str], None, None]
]

This method inputs chat messages and outputs VLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, VLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.
messages_logger : MessagesLogger, Optional
    the message logger that logs the chat messages.

Returns:

response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
    a dict {"reasoning": <reasoning>, "response": <response>} or a Generator yielding {"type": <reasoning or response>, "data": <content>}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, 
         messages_logger:MessagesLogger=None) -> Union[Dict[str,str], Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs VLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, VLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    messages_logger : MessagesLogger, Optional
        the message logger that logs the chat messages.

    Returns:
    -------
    response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
        a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
    """
    processed_messages = self.config.preprocess_messages(messages)

    options={'num_ctx': self.num_ctx, **self.formatted_params}
    if stream:
        def _stream_generator():
            response_stream = self.client.chat(
                model=self.model_name, 
                messages=processed_messages, 
                options=options,
                stream=True, 
                keep_alive=self.keep_alive
            )
            res = {"reasoning": "", "response": ""}
            for chunk in response_stream:
                if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
                    content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
                    res["reasoning"] += content_chunk
                    yield {"type": "reasoning", "data": content_chunk}
                else:
                    content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
                    res["response"] += content_chunk
                    yield {"type": "response", "data": content_chunk}

                if chunk.done_reason == "length":
                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

            # Postprocess response
            res_dict = self.config.postprocess_response(res)
            # Write to messages log
            if messages_logger:
                # replace images content with a placeholder "[image]" to save space
                for messages in processed_messages:
                    if "images" in messages:
                        messages["images"] = ["[image]" for _ in messages["images"]]

                processed_messages.append({"role": "assistant",
                                            "content": res_dict.get("response", ""),
                                            "reasoning": res_dict.get("reasoning", "")})
                messages_logger.log_messages(processed_messages)

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat(
                        model=self.model_name, 
                        messages=processed_messages, 
                        options=options,
                        stream=True,
                        keep_alive=self.keep_alive
                    )

        res = {"reasoning": "", "response": ""}
        phase = ""
        for chunk in response:
            if hasattr(chunk.message, 'thinking') and chunk.message.thinking:
                if phase != "reasoning":
                    print("\n--- Reasoning ---")
                    phase = "reasoning"

                content_chunk = getattr(getattr(chunk, 'message', {}), 'thinking', '')
                res["reasoning"] += content_chunk
            else:
                if phase != "response":
                    print("\n--- Response ---")
                    phase = "response"
                content_chunk = getattr(getattr(chunk, 'message', {}), 'content', '')
                res["response"] += content_chunk

            print(content_chunk, end='', flush=True)

            if chunk.done_reason == "length":
                warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
        print('\n')

    else:
        response = self.client.chat(
                            model=self.model_name, 
                            messages=processed_messages, 
                            options=options,
                            stream=False,
                            keep_alive=self.keep_alive
                        )
        res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
               "response": getattr(getattr(response, 'message', {}), 'content', '')}

        if response.done_reason == "length":
            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "images" in messages:
                messages["images"] = ["[image]" for _ in messages["images"]]

        processed_messages.append({"role": "assistant", 
                                "content": res_dict.get("response", ""), 
                                "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict

chat_async async

chat_async(
    messages: List[Dict[str, str]],
    messages_logger: MessagesLogger = None,
) -> Dict[str, str]

Async version of chat method. Streaming is not supported.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.async_client.chat(
                        model=self.model_name, 
                        messages=processed_messages, 
                        options={'num_ctx': self.num_ctx, **self.formatted_params},
                        stream=False,
                        keep_alive=self.keep_alive
                    )

    res = {"reasoning": getattr(getattr(response, 'message', {}), 'thinking', ''),
           "response": getattr(getattr(response, 'message', {}), 'content', '')}

    if response.done_reason == "length":
        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "images" in messages:
                messages["images"] = ["[image]" for _ in messages["images"]]

        processed_messages.append({"role": "assistant", 
                                    "content": res_dict.get("response", ""), 
                                    "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method inputs an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str
    the system prompt.
user_prompt : str
    the user prompt.
image : Image.Image
    the image for OCR.
few_shot_examples : List[FewShotExample], Optional
    list of few-shot examples.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    This method inputs an image and returns the corresponding chat messages for the inference engine.

    Parameters:
    ----------
    system_prompt : str
        the system prompt.
    user_prompt : str
        the user prompt.
    image : Image.Image
        the image for OCR.
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. 
    """
    base64_str = image_to_base64(image)
    output_messages = []
    # system message
    system_message = {"role": "system", "content": system_prompt}
    output_messages.append(system_message)

    # few-shot examples
    if few_shot_examples is not None:
        for example in few_shot_examples:
            if not isinstance(example, FewShotExample):
                raise ValueError("Few-shot example must be a FewShotExample object.")

            example_image_b64 = image_to_base64(example.image)
            example_user_message = {"role": "user", "content": user_prompt, "images": [example_image_b64]}
            example_agent_message = {"role": "assistant", "content": example.text}
            output_messages.append(example_user_message)
            output_messages.append(example_agent_message)

    # user message
    user_message = {"role": "user", "content": user_prompt, "images": [base64_str]}
    output_messages.append(user_message)

    return output_messages
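
Example:

A sketch of the resulting message shape, assuming the Ollama engine from the sketch above; unlike the OpenAI-style engines, images travel in a top-level "images" list of base64 strings. The file path is a placeholder.

from PIL import Image

page = Image.open("scan.png")  # placeholder path
messages = engine.get_ocr_messages(
    system_prompt="You are an OCR engine.",
    user_prompt="Transcribe all text in this image.",
    image=page,
)
# Resulting shape:
# [{'role': 'system', 'content': '...'},
#  {'role': 'user', 'content': '...', 'images': ['<base64>']}]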

vlm4ocr.vlm_engines.OpenAIVLMEngine

OpenAIVLMEngine(
    model: str, config: VLMConfig = None, **kwrs
)

Bases: VLMEngine

The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:

- vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)

For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

Parameters:

model : str
    model name as described in https://platform.openai.com/docs/models
config : VLMConfig, Optional
    the VLM configuration. Must be a child class of VLMConfig.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model:str, config:VLMConfig=None, **kwrs):
    """
    The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
    - vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models
    config : VLMConfig, Optional
        the VLM configuration. Must be a child class of VLMConfig.
    """
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpenAI (```pip install openai```).")

    from openai import OpenAI, AsyncOpenAI
    self.client = OpenAI(**kwrs)
    self.async_client = AsyncOpenAI(**kwrs)
    self.model = model
    self.config = config if config else BasicVLMConfig()
    self.formatted_params = self._format_config()
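
Example:

A construction sketch; the key is a placeholder read by the OpenAI client from the environment, and the model name is illustrative.

import os
from vlm4ocr.vlm_engines import BasicVLMConfig, OpenAIVLMEngine

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder; the OpenAI client reads it from the environment
engine = OpenAIVLMEngine(model="gpt-4o", config=BasicVLMConfig(max_new_tokens=4096))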

chat

chat(
    messages: List[Dict[str, str]],
    verbose: bool = False,
    stream: bool = False,
    messages_logger: MessagesLogger = None,
) -> Union[
    Dict[str, str], Generator[Dict[str, str], None, None]
]

This method inputs chat messages and outputs VLM generated text.

Parameters:

messages : List[Dict[str,str]]
    a list of dict with role and content. role must be one of {"system", "user", "assistant"}
verbose : bool, Optional
    if True, VLM generated text will be printed in terminal in real-time.
stream : bool, Optional
    if True, returns a generator that yields the output in real-time.
messages_logger : MessagesLogger, Optional
    the message logger that logs the chat messages.

Returns:

response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
    a dict {"reasoning": <reasoning>, "response": <response>} or a Generator yielding {"type": <reasoning or response>, "data": <content>}

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def chat(self, messages:List[Dict[str,str]], verbose:bool=False, stream:bool=False, messages_logger:MessagesLogger=None) -> Union[Dict[str, str], Generator[Dict[str, str], None, None]]:
    """
    This method inputs chat messages and outputs LLM generated text.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}
    verbose : bool, Optional
        if True, VLM generated text will be printed in terminal in real-time.
    stream : bool, Optional
        if True, returns a generator that yields the output in real-time.
    messages_logger : MessagesLogger, Optional
        the message logger that logs the chat messages.

    Returns:
    -------
    response : Union[Dict[str,str], Generator[Dict[str, str], None, None]]
        a dict {"reasoning": <reasoning>, "response": <response>} or Generator {"type": <reasoning or response>, "data": <content>}
    """
    processed_messages = self.config.preprocess_messages(messages)

    if stream:
        def _stream_generator():
            response_stream = self.client.chat.completions.create(
                                    model=self.model,
                                    messages=processed_messages,
                                    stream=True,
                                    **self.formatted_params
                                )
            res_text = ""
            for chunk in response_stream:
                if len(chunk.choices) > 0:
                    chunk_text = chunk.choices[0].delta.content
                    if chunk_text is not None:
                        res_text += chunk_text
                        yield chunk_text
                    if chunk.choices[0].finish_reason == "length":
                        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

            # Postprocess response
            res_dict = self.config.postprocess_response(res_text)
            # Write to messages log
            if messages_logger:
                # replace images content with a placeholder "[image]" to save space
                for messages in processed_messages:
                    if "content" in messages and isinstance(messages["content"], list):
                        for content in messages["content"]:
                            if isinstance(content, dict) and content.get("type") == "image_url":
                                content["image_url"]["url"] = "[image]"

                processed_messages.append({"role": "assistant",
                                            "content": res_dict.get("response", ""),
                                            "reasoning": res_dict.get("reasoning", "")})
                messages_logger.log_messages(processed_messages)

        return self.config.postprocess_response(_stream_generator())

    elif verbose:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=True,
            **self.formatted_params
        )
        res = ''
        for chunk in response:
            if len(chunk.choices) > 0:
                if chunk.choices[0].delta.content is not None:
                    res += chunk.choices[0].delta.content
                    print(chunk.choices[0].delta.content, end="", flush=True)
                if chunk.choices[0].finish_reason == "length":
                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

        print('\n')

    else:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=processed_messages,
            stream=False,
            **self.formatted_params
        )
        res = response.choices[0].message.content

    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "content" in messages and isinstance(messages["content"], list):
                for content in messages["content"]:
                    if isinstance(content, dict) and content.get("type") == "image_url":
                        content["image_url"]["url"] = "[image]"

        processed_messages.append({"role": "assistant", 
                                "content": res_dict.get("response", ""), 
                                "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict

chat_async async

chat_async(
    messages: List[Dict[str, str]],
    messages_logger: MessagesLogger = None,
) -> Dict[str, str]

Async version of chat method. Streaming is not supported.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
async def chat_async(self, messages:List[Dict[str,str]], messages_logger:MessagesLogger=None) -> Dict[str,str]:
    """
    Async version of chat method. Streaming is not supported.
    """
    processed_messages = self.config.preprocess_messages(messages)

    response = await self.async_client.chat.completions.create(
        model=self.model,
        messages=processed_messages,
        stream=False,
        **self.formatted_params
    )

    if response.choices[0].finish_reason == "length":
        warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)

    res = response.choices[0].message.content
    # Postprocess response
    res_dict = self.config.postprocess_response(res)
    # Write to messages log
    if messages_logger:
        # replace images content with a placeholder "[image]" to save space
        for messages in processed_messages:
            if "content" in messages and isinstance(messages["content"], list):
                for content in messages["content"]:
                    if isinstance(content, dict) and content.get("type") == "image_url":
                        content["image_url"]["url"] = "[image]"

        processed_messages.append({"role": "assistant", 
                                "content": res_dict.get("response", ""), 
                                "reasoning": res_dict.get("reasoning", "")})
        messages_logger.log_messages(processed_messages)

    return res_dict

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    format: str = "png",
    detail: str = "high",
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method inputs an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str
    the system prompt.
user_prompt : str
    the user prompt.
image : Image.Image
    the image for OCR.
format : str, Optional
    the image format.
detail : str, Optional
    the detail level of the image. Default is "high".
few_shot_examples : List[FewShotExample], Optional
    list of few-shot examples. Each example is a FewShotExample with an image (PIL.Image.Image) and text (str).

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', 
                     detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    This method inputs an image and returns the corresponding chat messages for the inference engine.

    Parameters:
    ----------
    system_prompt : str
        the system prompt.
    user_prompt : str
        the user prompt.
    image : Image.Image
        the image for OCR.
    format : str, Optional
        the image format. 
    detail : str, Optional
        the detail level of the image. Default is "high". 
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each example is a FewShotExample with an image (PIL.Image.Image) and text (str).
    """
    base64_str = image_to_base64(image)
    output_messages = []
    # system message
    system_message = {"role": "system", "content": system_prompt}
    output_messages.append(system_message)

    # few-shot examples
    if few_shot_examples is not None:
        for example in few_shot_examples:
            if not isinstance(example, FewShotExample):
                raise ValueError("Few-shot example must be a FewShotExample object.")

            example_image_b64 = image_to_base64(example.image)
            example_user_message = {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{format};base64,{example_image_b64}",
                            "detail": detail
                        },
                    },
                    {"type": "text", "text": user_prompt},
                ],
            }
            example_agent_message = {"role": "assistant", "content": example.text}
            output_messages.append(example_user_message)
            output_messages.append(example_agent_message)

    # user message
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/{format};base64,{base64_str}",
                    "detail": detail
                },
            },
            {"type": "text", "text": user_prompt},
        ],
    }
    output_messages.append(user_message)
    return output_messages

vlm4ocr.vlm_engines.AzureOpenAIVLMEngine

AzureOpenAIVLMEngine(
    model: str,
    api_version: str,
    config: VLMConfig = None,
    **kwrs
)

Bases: OpenAIVLMEngine

The Azure OpenAI API inference engine. For parameters and documentation, refer to:

- https://azure.microsoft.com/en-us/products/ai-services/openai-service
- https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

Parameters:

model : str
    model name as described in https://platform.openai.com/docs/models
api_version : str
    the Azure OpenAI API version
config : VLMConfig
    the VLM configuration.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def __init__(self, model:str, api_version:str, config:VLMConfig=None, **kwrs):
    """
    The Azure OpenAI API inference engine.
    For parameters and documentation, refer to 
    - https://azure.microsoft.com/en-us/products/ai-services/openai-service
    - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models
    api_version : str
        the Azure OpenAI API version
    config : VLMConfig
        the VLM configuration.
    """
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpenAI (```pip install openai```).")

    from openai import AzureOpenAI, AsyncAzureOpenAI
    self.model = model
    self.api_version = api_version
    self.client = AzureOpenAI(api_version=self.api_version, 
                              **kwrs)
    self.async_client = AsyncAzureOpenAI(api_version=self.api_version, 
                                         **kwrs)
    self.config = config if config else BasicVLMConfig()
    self.formatted_params = self._format_config()
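
Example:

A construction sketch; the deployment name, API version, endpoint, and key are placeholders, with the latter two forwarded to the AzureOpenAI client via **kwrs.

from vlm4ocr.vlm_engines import AzureOpenAIVLMEngine

engine = AzureOpenAIVLMEngine(
    model="my-gpt4o-deployment",                            # placeholder: your Azure deployment name
    api_version="2024-02-01",                               # placeholder API version
    azure_endpoint="https://my-resource.openai.azure.com",  # placeholder endpoint, forwarded to AzureOpenAI
    api_key="...",                                          # placeholder key, forwarded to AzureOpenAI
)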