Skip to content

VLM Engines Reference

vlm4ocr.vlm_engines.BasicVLMConfig

BasicVLMConfig(
    max_new_tokens: int = 2048,
    temperature: float = 0.0,
    **kwargs
)

Bases: LLMConfig

The basic LLM configuration for most non-reasoning models.

Source code in llm_inference_engine/llm_configs.py
def __init__(self, max_new_tokens:int=2048, temperature:float=0.0, **kwargs):
    """
    Set up the generation settings used for most non-reasoning models.

    Parameters:
    ----------
    max_new_tokens : int, Optional
        stored on the instance and mirrored into self.params["max_new_tokens"].
    temperature : float, Optional
        stored on the instance and mirrored into self.params["temperature"].
    **kwargs
        forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    self.max_new_tokens, self.temperature = max_new_tokens, temperature
    # NOTE(review): self.params is presumably created by the parent constructor — confirm.
    for setting in ("max_new_tokens", "temperature"):
        self.params[setting] = getattr(self, setting)

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

This method preprocesses the input messages before passing them to the LLM.

Parameters:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in llm_inference_engine/llm_configs.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    Prepare the chat messages for the LLM; this configuration changes nothing.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a shallow copy of the input list: the list object is new, the message
        dicts inside it are the same objects the caller passed in.
    """
    duplicate = messages.copy()
    return duplicate

postprocess_response

postprocess_response(
    response: Union[
        str,
        Dict[str, Any],
        Generator[str, None, None],
        AsyncGenerator[str, None],
    ],
) -> Union[
    Dict[str, Any],
    Generator[Dict[str, Any], None, None],
    AsyncGenerator[Dict[str, Any], None],
]

This method postprocesses the LLM response after it is generated.

Parameters:

response : Union[str, Dict[str, Any], Generator[str, None, None], AsyncGenerator[str, None]] the LLM response. Can be a string, a generator, or an async generator.

Returns: Union[Dict[str, Any], Generator[Dict[str, Any], None, None], AsyncGenerator[Dict[str, Any], None]] the postprocessed LLM response. If input is a string, the output will be a dict {"response": <response>}. If input is a dict containing "response" or "tool_calls", it is returned unchanged. If input is a generator, the output will be a generator of {"type": "response", "data": <content>}. If input is an async generator, the output will be an async generator of {"type": "response", "data": <content>}.

Source code in llm_inference_engine/llm_configs.py
def postprocess_response(self, response:Union[str, Dict[str, Any], Generator[str, None, None], AsyncGenerator[str, None]]) -> Union[Dict[str, Any], Generator[Dict[str, Any], None, None], AsyncGenerator[Dict[str, Any], None]]:
    """
    Normalize whatever the inference engine produced into the dict-based format.

    Parameters:
    ----------
    response : Union[str, Dict[str, Any], Generator[str, None, None], AsyncGenerator[str, None]]
        the LLM response. Can be a string, a dict, a generator, or an async generator.

    Returns: Union[Dict[str, Any], Generator[Dict[str, Any], None, None], AsyncGenerator[Dict[str, Any], None]]
        - string input: a dict {"response": <response>}.
        - dict input: returned as-is when it has a "response" or "tool_calls" key;
          otherwise a UserWarning is issued and {"response": ""} is returned.
        - generator input: a generator that forwards dict chunks untouched and wraps
          string chunks as {"type": "response", "data": <content>}.
        - async generator input: the async equivalent of the generator case.
        - anything else: a UserWarning is issued and {"response": ""} is returned.
    """
    if isinstance(response, str):
        return {"response": response}

    if isinstance(response, dict):
        has_known_key = "response" in response or "tool_calls" in response
        if has_known_key:
            return response
        warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
        return {"response": ""}

    def _tag(piece):
        # Dict chunks were already structured by the engine; plain text becomes a
        # "response" event. Any other chunk type maps to None and is skipped.
        if isinstance(piece, dict):
            return piece
        if isinstance(piece, str):
            return {"type": "response", "data": piece}
        return None

    if isinstance(response, Generator):
        def _tagged_stream():
            for piece in response:
                event = _tag(piece)
                if event is not None:
                    yield event

        return _tagged_stream()

    if isinstance(response, AsyncGenerator):
        async def _tagged_async_stream():
            async for piece in response:
                event = _tag(piece)
                if event is not None:
                    yield event

        return _tagged_async_stream()

    warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
    return {"response": ""}

vlm4ocr.vlm_engines.ReasoningVLMConfig

ReasoningVLMConfig(
    thinking_token_start="<think>",
    thinking_token_end="</think>",
    **kwargs
)

Bases: LLMConfig

The general LLM configuration for reasoning models.

Source code in llm_inference_engine/llm_configs.py
def __init__(self, thinking_token_start="<think>", thinking_token_end="</think>", **kwargs):
    """
    Set up the general configuration for reasoning models.

    Parameters:
    ----------
    thinking_token_start : str, Optional
        marker that opens a reasoning span in the raw model output.
    thinking_token_end : str, Optional
        marker that closes a reasoning span in the raw model output.
    **kwargs
        forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    # Both markers are read later when responses are postprocessed.
    self.thinking_token_start, self.thinking_token_end = thinking_token_start, thinking_token_end

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

This method preprocesses the input messages before passing them to the LLM.

Parameters:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in llm_inference_engine/llm_configs.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    Prepare the chat messages for the LLM; reasoning models need no changes here.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a shallow copy of the input list: the list object is new, the message
        dicts inside it are the same objects the caller passed in.
    """
    passthrough = messages.copy()
    return passthrough

postprocess_response

postprocess_response(
    response: Union[
        str,
        Dict[str, str],
        Generator[str, None, None],
        AsyncGenerator[str, None],
    ],
) -> Union[
    Dict[str, str],
    Generator[Dict[str, str], None, None],
    AsyncGenerator[Dict[str, str], None],
]

This method postprocesses the LLM response after it is generated. 1. If input is a string, it will extract the reasoning and response based on the thinking tokens. 2. If input is a dict, it should contain keys "reasoning", "response", or "tool_calls". This is for inference engines that already parse reasoning, response, and tool calls. 3. If input is a generator, a. if the chunk is a dict, it should contain keys "type" and "data". This is for inference engines that already parse reasoning, response, and tool calls. b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

Parameters:

response : Union[str, Dict[str, str], Generator[str, None, None], AsyncGenerator[str, None]] the LLM response. Can be a string, a dict, a generator, or an async generator.

Returns:

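response : Union[Dict[str, str], Generator[Dict[str, str], None, None], AsyncGenerator[Dict[str, str], None]] the postprocessed LLM response as a dict {"reasoning": <reasoning>, "response": <content>}. If input is a generator or an async generator, the output will be a generator or an async generator of {"type": <reasoning or response>, "data": <content>}.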

Source code in llm_inference_engine/llm_configs.py
def postprocess_response(self, response:Union[str, Dict[str, str], Generator[str, None, None], AsyncGenerator[str, None]]) -> Union[Dict[str,str], Generator[Dict[str,str], None, None], AsyncGenerator[Dict[str,str], None]]:
    """
    This method postprocesses the LLM response after it is generated.
    1. If input is a string, it will extract the reasoning and response based on the thinking tokens.
    2. If input is a dict, it should contain keys "reasoning", "response", or "tool_calls". This is for inference engines that already parse reasoning, response, and tool calls.
    3. If input is a generator or an async generator,
        a. if the chunk is a dict, it is yielded unchanged. This is for inference engines that already parse reasoning, response, and tool calls.
        b. if the chunk is a string, it will yield dicts with keys "type" and "data" based on the thinking tokens.

    Parameters:
    ----------
    response : Union[str, Dict[str, str], Generator[str, None, None], AsyncGenerator[str, None]]
        the LLM response. Can be a string, a dict, a generator, or an async generator.

    Returns:
    -------
    response : Union[Dict[str, str], Generator[Dict[str, str], None, None], AsyncGenerator[Dict[str, str], None]]
        - string input: {"reasoning": <reasoning>, "response": <content>}. The reasoning is the text
          between the first start/end token pair; the response is the stripped text after the last end token.
        - dict input: returned as-is when it has both "reasoning" and "response". When it has only some of
          "reasoning", "response", "tool_calls", a new dict is returned with the missing
          "reasoning"/"response" keys filled with "". Otherwise a UserWarning is issued and
          {"reasoning": "", "response": ""} is returned.
        - generator / async generator input: a generator / async generator of
          {"type": <reasoning or response>, "data": <content>}. The thinking tokens themselves are never
          emitted, even when they arrive inside a larger chunk or split across chunks. Text that could be
          the beginning of a token is held back until the next chunk (or the end of the stream) decides.
        - any other input: a UserWarning is issued and {"reasoning": "", "response": ""} is returned.
    """
    start_token = self.thinking_token_start
    end_token = self.thinking_token_end

    if isinstance(response, str):
        # get contents between thinking_token_start and thinking_token_end
        pattern = f"{re.escape(start_token)}(.*?){re.escape(end_token)}"
        match = re.search(pattern, response, re.DOTALL)
        reasoning = match.group(1) if match else ""
        # get response AFTER thinking_token_end. The token is escaped so that markers
        # containing regex metacharacters (e.g. "<|/think|>") are matched literally.
        answer = re.sub(f".*?{re.escape(end_token)}", "", response, flags=re.DOTALL).strip()
        return {"reasoning": reasoning, "response": answer}

    if isinstance(response, dict):
        if "reasoning" in response and "response" in response:
            return response
        if any(key in response for key in ("reasoning", "response", "tool_calls")):
            # Partially parsed output (e.g. tool calls only): keep what the engine
            # produced and guarantee the two standard keys exist.
            return {"reasoning": "", "response": "", **response}
        warnings.warn(f"Invalid response dict keys: {response.keys()}. Returning default empty dict.", UserWarning)
        return {"reasoning": "", "response": ""}

    # (token, thinking state after the token). Empty tokens are ignored because they
    # would match at every position.
    markers = [(token, state) for token, state in ((start_token, True), (end_token, False)) if token]

    def _split(text, thinking, final=False):
        """Cut text at thinking tokens; return (events, held-back tail, thinking state)."""
        events = []
        while True:
            # Locate the earliest complete token still in the text.
            hit_pos, hit_token, hit_state = -1, "", thinking
            for token, state in markers:
                pos = text.find(token)
                if pos != -1 and (hit_pos == -1 or pos < hit_pos):
                    hit_pos, hit_token, hit_state = pos, token, state
            if hit_pos == -1:
                break
            if hit_pos > 0:
                events.append({"type": "reasoning" if thinking else "response", "data": text[:hit_pos]})
            thinking = hit_state
            text = text[hit_pos + len(hit_token):]

        # No complete token remains. Unless the stream is over, hold back the longest
        # tail that could still grow into a token once the next chunk arrives.
        held = 0
        if not final:
            for token, _ in markers:
                for size in range(min(len(token) - 1, len(text)), held, -1):
                    if text.endswith(token[:size]):
                        held = size
                        break
        cut = len(text) - held
        if cut > 0:
            events.append({"type": "reasoning" if thinking else "response", "data": text[:cut]})
        return events, text[cut:], thinking

    if isinstance(response, Generator):
        def _process_stream():
            pending, thinking = "", False
            for chunk in response:
                if isinstance(chunk, dict):
                    # Already parsed by the engine; forwarded without touching the text state.
                    yield chunk
                elif isinstance(chunk, str):
                    events, pending, thinking = _split(pending + chunk, thinking)
                    yield from events
            # Flush text that was held back as a possible token prefix.
            events, _, _ = _split(pending, thinking, final=True)
            yield from events

        return _process_stream()

    if isinstance(response, AsyncGenerator):
        async def _process_async_stream():
            pending, thinking = "", False
            async for chunk in response:
                if isinstance(chunk, dict):
                    # Already parsed by the engine; forwarded without touching the text state.
                    yield chunk
                elif isinstance(chunk, str):
                    events, pending, thinking = _split(pending + chunk, thinking)
                    for event in events:
                        yield event
            # Flush text that was held back as a possible token prefix.
            events, _, _ = _split(pending, thinking, final=True)
            for event in events:
                yield event

        return _process_async_stream()

    warnings.warn(f"Invalid response type: {type(response)}. Returning default empty dict.", UserWarning)
    return {"reasoning": "", "response": ""}

vlm4ocr.vlm_engines.OpenAIReasoningVLMConfig

OpenAIReasoningVLMConfig(
    reasoning_effort: str = None, **kwargs
)

Bases: ReasoningLLMConfig

The OpenAI "o" series configuration. 1. The reasoning effort as one of {"low", "medium", "high"}. For models that do not support setting reasoning effort (e.g., o1-mini, o1-preview), set to None. 2. The temperature parameter is not supported and will be ignored. 3. The system prompt is not supported and will be concatenated to the next user prompt.

Parameters:

reasoning_effort : str, Optional the reasoning effort. Must be one of {"low", "medium", "high"}, or None to leave the parameter unset. Default is None.

Source code in llm_inference_engine/llm_configs.py
def __init__(self, reasoning_effort:str=None, **kwargs):
    """
    The OpenAI "o" series configuration.
    1. The reasoning effort as one of {"low", "medium", "high"}.
        For models that do not support setting reasoning effort (e.g., o1-mini, o1-preview), set to None.
    2. The temperature parameter is not supported and will be ignored.
    3. The system prompt is not supported and will be concatenated to the next user prompt.

    Parameters:
    ----------
    reasoning_effort : str, Optional
        the reasoning effort. Must be one of {"low", "medium", "high"}, or None to
        leave the parameter out of the request parameters. Default is None.
    **kwargs
        forwarded unchanged to the parent constructor.

    Raises:
    ------
    ValueError
        if reasoning_effort is neither None nor one of the accepted values.
    """
    super().__init__(**kwargs)
    if reasoning_effort is not None and reasoning_effort not in ["low", "medium", "high"]:
        raise ValueError("reasoning_effort must be one of {'low', 'medium', 'high'}.")

    # Always define the attribute so that reading it never raises AttributeError;
    # only a concrete value is added to the request parameters.
    self.reasoning_effort = reasoning_effort
    if reasoning_effort is not None:
        self.params["reasoning_effort"] = reasoning_effort

    # NOTE(review): presumably a "temperature" passed through **kwargs ends up in
    # self.params via the parent constructor — confirm against the base class.
    if "temperature" in self.params:
        warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
        self.params.pop("temperature")

preprocess_messages

preprocess_messages(
    messages: List[Dict[str, str]],
) -> List[Dict[str, str]]

Concatenate system prompts to the next user prompt.

Parameters:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Returns:

messages : List[Dict[str,str]] a list of dict with role and content. role must be one of {"system", "user", "assistant"}

Source code in llm_inference_engine/llm_configs.py
def preprocess_messages(self, messages:List[Dict[str,str]]) -> List[Dict[str,str]]:
    """
    Concatenate system prompts to the next user prompt.

    System messages are removed from the output. Their text is carried forward and
    merged into the first user message that follows:
    - string user content becomes "<system prompt> <user content>";
    - list user content (multimodal parts such as image_url entries) gets a
      {"type": "text", "text": <system prompt>} part inserted at the front, so the
      other parts are preserved instead of being turned into text.
    Several system messages in a row are joined with a single space. A system
    message that is not followed by any user message is dropped.

    Parameters:
    ----------
    messages : List[Dict[str,str]]
        a list of dict with role and content. role must be one of {"system", "user", "assistant"}

    Returns:
    -------
    messages : List[Dict[str,str]]
        a new list of new dicts without system messages. Keys other than "content"
        (e.g. "images") are carried over unchanged; the input is not modified.
    """
    pending_system = ""
    new_messages = []
    for message in messages:
        role, content = message['role'], message['content']
        if role == 'system':
            # Hold the prompt until a user message arrives to absorb it.
            pending_system = f"{pending_system} {content}" if pending_system else content
            continue

        # Copy so the caller's dict is untouched and extra keys survive.
        new_message = dict(message)
        if role == 'user' and pending_system:
            if isinstance(content, list):
                new_message['content'] = [{"type": "text", "text": pending_system}, *content]
            else:
                new_message['content'] = f"{pending_system} {content}"
            pending_system = ""
        new_messages.append(new_message)

    return new_messages

vlm4ocr.vlm_engines.OpenAICompatibleVLMEngine

OpenAICompatibleVLMEngine(
    model: str,
    api_key: str,
    base_url: str,
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: OpenAICompatibleInferenceEngine, VLMEngine

Source code in llm_inference_engine/engines.py
def __init__(self, model:str, api_key:str, base_url:str, config:LLMConfig=None, 
             max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    Inference engine for any server that speaks the OpenAI API
    (for example a vLLM server:
    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html).

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as exposed by the server; stored as self.model.
    api_key : str
        the API key passed to both the sync and async OpenAI clients.
    base_url : str
        the base url passed to both the sync and async OpenAI clients.
    config : LLMConfig, Optional
        the LLM configuration (forwarded to the parent constructor).
    max_concurrent_requests : int, Optional
        maximum number of concurrent requests to the LLM (forwarded to the parent constructor).
    max_requests_per_minute : int, Optional
        maximum number of requests per minute to the LLM (forwarded to the parent constructor).
    **kwrs
        extra keyword arguments passed to both OpenAI client constructors.

    Raises:
    ------
    ImportError
        if the openai package cannot be found.
    """
    openai_spec = importlib.util.find_spec("openai")
    if openai_spec is None:
        raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

    # Imported lazily so the package is only required when this engine is used.
    from openai import AsyncOpenAI
    from openai import OpenAI
    from openai.types.chat import ChatCompletion
    from openai.types.chat import ChatCompletionChunk
    self.ChatCompletion, self.ChatCompletionChunk = ChatCompletion, ChatCompletionChunk
    super().__init__(config=config, max_concurrent_requests=max_concurrent_requests, max_requests_per_minute=max_requests_per_minute)
    # One shared set of connection options keeps the two clients in agreement.
    connection = dict(api_key=api_key, base_url=base_url, **kwrs)
    self.client = OpenAI(**connection)
    self.async_client = AsyncOpenAI(**connection)
    self.model = model

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    format: str = "png",
    detail: str = "high",
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method takes an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str the system prompt. user_prompt : str the user prompt. image : Image.Image the image for OCR. format : str, Optional the image format. detail : str, Optional the detail level of the image. Default is "high". few_shot_examples : List[FewShotExample], Optional list of few-shot examples.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', 
                     detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    Build the chat messages that ask the inference engine to OCR an image.

    The result is, in order: an optional system message, one user/assistant pair
    per few-shot example, and the final user message carrying the target image.

    Parameters:
    ----------
    system_prompt : str
        the system prompt. When None, no system message is emitted.
    user_prompt : str
        the user prompt. When None, user messages contain only the image part.
    image : Image.Image
        the image for OCR.
    format : str, Optional
        the image format written into the data URL ("data:image/<format>;base64,...").
        NOTE(review): the image is encoded by image_to_base64 without this value —
        confirm that the helper's output format matches.
    detail : str, Optional
        the detail level of the image. Default is "high". 
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each example's image becomes a user message and
        its text the assistant reply.

    Returns:
    -------
    messages : List[Dict[str,str]]
        the chat messages; user message content is a list of image/text parts.

    Raises:
    ------
    ValueError
        if an item of few_shot_examples is not a FewShotExample.
    """
    def _image_turn(encoded):
        # User message for one image: the image part first, then the prompt text if any.
        parts = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/{format};base64,{encoded}",
                    "detail": detail
                },
            },
        ]
        if user_prompt is not None:
            parts.append({"type": "text", "text": user_prompt})
        return {"role": "user", "content": parts}

    target_b64 = image_to_base64(image)
    output_messages = [] if system_prompt is None else [{"role": "system", "content": system_prompt}]

    demonstrations = () if few_shot_examples is None else few_shot_examples
    for example in demonstrations:
        if not isinstance(example, FewShotExample):
            raise ValueError("Few-shot example must be a FewShotExample object.")

        output_messages.append(_image_turn(image_to_base64(example.image)))
        output_messages.append({"role": "assistant", "content": example.text})

    output_messages.append(_image_turn(target_b64))
    return output_messages

vlm4ocr.vlm_engines.VLLMVLMEngine

VLLMVLMEngine(
    model: str,
    api_key: str = "",
    base_url: str = "http://localhost:8000/v1",
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: VLLMInferenceEngine, OpenAICompatibleVLMEngine

vLLM OpenAI compatible server inference engine. https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

Parameters:

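model : str model name as shown in the vLLM server api_key : str, Optional the API key for the vLLM server. base_url : str, Optional the base url for the vLLM server. config : LLMConfig, Optional the LLM configuration. max_concurrent_requests : int, Optional the maximum number of concurrent requests. max_requests_per_minute : int, Optional the maximum number of requests per minute.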

Source code in llm_inference_engine/engines.py
def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:8000/v1", config:LLMConfig=None, 
             max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    vLLM OpenAI compatible server inference engine.
    https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html

    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Every argument is forwarded unchanged to the parent constructor; this
    constructor only supplies defaults for api_key and base_url.

    Parameters:
    ----------
    model : str
        model name as shown in the vLLM server
    api_key : str, Optional
        the API key for the vLLM server. Default is an empty string.
    base_url : str, Optional
        the base url for the vLLM server. Default is "http://localhost:8000/v1".
    config : LLMConfig, Optional
        the LLM configuration.
    max_concurrent_requests : int, Optional
        the maximum number of concurrent requests.
    max_requests_per_minute : int, Optional
        the maximum number of requests per minute.
    **kwrs
        extra keyword arguments, forwarded to the parent constructor.
    """
    # Pure delegation: all setup happens in the parent classes.
    super().__init__(model=model, api_key=api_key, base_url=base_url, config=config, 
                     max_concurrent_requests=max_concurrent_requests, 
                     max_requests_per_minute=max_requests_per_minute, **kwrs)

vlm4ocr.vlm_engines.OpenRouterVLMEngine

OpenRouterVLMEngine(
    model: str,
    api_key: str = None,
    base_url: str = "https://openrouter.ai/api/v1",
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: OpenRouterInferenceEngine, OpenAICompatibleVLMEngine

OpenRouter OpenAI-compatible server inference engine.

Parameters:

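model : str model name as shown on OpenRouter api_key : str, Optional the API key for OpenRouter. If None, will use the key in os.environ['OPENROUTER_API_KEY']. base_url : str, Optional the base url for the OpenRouter API. config : LLMConfig, Optional the LLM configuration. max_concurrent_requests : int, Optional the maximum number of concurrent requests. max_requests_per_minute : int, Optional the maximum number of requests per minute.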

Source code in llm_inference_engine/engines.py
def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, 
             max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    OpenRouter OpenAI-compatible server inference engine.

    Parameters:
    ----------
    model : str
        model name to request from OpenRouter.
    api_key : str, Optional
        the OpenRouter API key. If None, the value of the OPENROUTER_API_KEY
        environment variable is used (which is None when the variable is unset).
    base_url : str, Optional
        the base url of the OpenRouter API.
    config : LLMConfig, Optional
        the LLM configuration.
    max_concurrent_requests : int, Optional
        the maximum number of concurrent requests.
    max_requests_per_minute : int, Optional
        the maximum number of requests per minute.
    **kwrs
        extra keyword arguments, forwarded to the parent constructor.
    """
    # An explicit key wins; the environment is only the fallback.
    self.api_key = os.getenv("OPENROUTER_API_KEY") if api_key is None else api_key
    parent_options = dict(
        model=model,
        api_key=self.api_key,
        base_url=base_url,
        config=config,
        max_concurrent_requests=max_concurrent_requests,
        max_requests_per_minute=max_requests_per_minute,
    )
    super().__init__(**parent_options, **kwrs)

vlm4ocr.vlm_engines.OllamaVLMEngine

OllamaVLMEngine(
    model_name: str,
    num_ctx: int = 4096,
    keep_alive: int = 300,
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: OllamaInferenceEngine, VLMEngine

Source code in llm_inference_engine/engines.py
def __init__(self, model_name:str, num_ctx:int=4096, keep_alive:int=300, config:LLMConfig=None, 
             max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    Inference engine backed by Ollama.

    Parameters:
    ----------
    model_name : str
        the model name exactly as shown in >> ollama ls
    num_ctx : int, Optional
        context length that LLM will evaluate.
    keep_alive : int, Optional
        seconds to hold the LLM after the last API call.
    config : LLMConfig, Optional
        the LLM configuration (forwarded to the parent constructor).
    max_concurrent_requests : int, Optional
        maximum number of concurrent requests to the LLM (forwarded to the parent constructor).
    max_requests_per_minute : int, Optional
        maximum number of requests per minute to the LLM (forwarded to the parent constructor).
    **kwrs
        extra keyword arguments passed to both the sync and async Ollama clients.

    Raises:
    ------
    ImportError
        if the ollama package cannot be found.
    """
    ollama_spec = importlib.util.find_spec("ollama")
    if ollama_spec is None:
        raise ImportError("ollama-python not found. Please install ollama-python (```pip install ollama```).")

    # Imported lazily so the package is only required when this engine is used.
    from ollama import AsyncClient
    from ollama import Client
    super().__init__(config=config, max_concurrent_requests=max_concurrent_requests, max_requests_per_minute=max_requests_per_minute)
    self.client = Client(**kwrs)
    self.async_client = AsyncClient(**kwrs)
    self.model_name, self.num_ctx, self.keep_alive = model_name, num_ctx, keep_alive

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method takes an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str the system prompt. user_prompt : str the user prompt. image : Image.Image the image for OCR. few_shot_examples : List[FewShotExample], Optional list of few-shot examples.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    This method takes an image and returns the corresponding chat messages for the inference engine.

    The result is, in order: an optional system message, one user/assistant pair
    per few-shot example, and the final user message carrying the target image.
    Images are attached through the "images" key as base64 strings.

    Parameters:
    ----------
    system_prompt : str
        the system prompt. When None, no system message is emitted.
    user_prompt : str
        the user prompt. When None, every user message (few-shot and final) uses
        an empty string as its content.
    image : Image.Image
        the image for OCR.
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each example's image becomes a user message and
        its text the assistant reply.

    Returns:
    -------
    messages : List[Dict[str,str]]
        the chat messages for the inference engine.

    Raises:
    ------
    ValueError
        if an item of few_shot_examples is not a FewShotExample.
    """
    base64_str = image_to_base64(image)
    # Same text for every user turn: few-shot turns must look like the final turn,
    # so a missing prompt becomes "" everywhere instead of None in some messages.
    prompt_text = "" if user_prompt is None else user_prompt

    output_messages = []
    # system message (omitted when system_prompt is None)
    if system_prompt is not None:
        output_messages.append({"role": "system", "content": system_prompt})

    # few-shot examples
    if few_shot_examples is not None:
        for example in few_shot_examples:
            if not isinstance(example, FewShotExample):
                raise ValueError("Few-shot example must be a FewShotExample object.")

            example_image_b64 = image_to_base64(example.image)
            output_messages.append({"role": "user", "content": prompt_text, "images": [example_image_b64]})
            output_messages.append({"role": "assistant", "content": example.text})

    # user message for the target image
    output_messages.append({"role": "user", "content": prompt_text, "images": [base64_str]})

    return output_messages

vlm4ocr.vlm_engines.OpenAIVLMEngine

OpenAIVLMEngine(
    model: str,
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: OpenAIInferenceEngine, VLMEngine

Source code in llm_inference_engine/engines.py
def __init__(self, model:str, config:LLMConfig=None, max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    Inference engine backed by the OpenAI API.
    For parameters and documentation, refer to https://platform.openai.com/docs/api-reference/introduction

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models; stored as self.model.
    config : LLMConfig, Optional
        the LLM configuration (forwarded to the parent constructor).
    max_concurrent_requests : int, Optional
        maximum number of concurrent requests to the LLM (forwarded to the parent constructor).
    max_requests_per_minute : int, Optional
        maximum number of requests per minute to the LLM (forwarded to the parent constructor).
    **kwrs
        extra keyword arguments passed to both the sync and async OpenAI clients.

    Raises:
    ------
    ImportError
        if the openai package cannot be found.
    """
    openai_spec = importlib.util.find_spec("openai")
    if openai_spec is None:
        raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

    # Imported lazily so the package is only required when this engine is used.
    from openai import AsyncOpenAI
    from openai import OpenAI
    from openai.types.chat import ChatCompletion
    from openai.types.chat import ChatCompletionChunk
    super().__init__(config=config, max_concurrent_requests=max_concurrent_requests, max_requests_per_minute=max_requests_per_minute)
    self.client = OpenAI(**kwrs)
    self.async_client = AsyncOpenAI(**kwrs)
    self.model = model
    self.ChatCompletion, self.ChatCompletionChunk = ChatCompletion, ChatCompletionChunk

get_ocr_messages

get_ocr_messages(
    system_prompt: str,
    user_prompt: str,
    image: Image,
    format: str = "png",
    detail: str = "high",
    few_shot_examples: List[FewShotExample] = None,
) -> List[Dict[str, str]]

This method takes an image and returns the corresponding chat messages for the inference engine.

Parameters:

system_prompt : str the system prompt. user_prompt : str the user prompt. image : Image.Image the image for OCR. format : str, Optional the image format. detail : str, Optional the detail level of the image. Default is "high". few_shot_examples : List[FewShotExample], Optional list of few-shot examples. Each example must be a FewShotExample object with an "image" (PIL.Image.Image) and a "text" (str) attribute.

Source code in packages/vlm4ocr/vlm4ocr/vlm_engines.py
def get_ocr_messages(self, system_prompt:str, user_prompt:str, image:Image.Image, format:str='png', 
                     detail:str="high", few_shot_examples:List[FewShotExample]=None) -> List[Dict[str,str]]:
    """
    Build the chat messages that ask the inference engine to OCR one image.

    The returned list holds, in order: an optional system message, one
    user/assistant pair per few-shot example, and a final user message
    carrying the target image.

    Parameters:
    ----------
    system_prompt : str
        the system prompt. When None, no system message is emitted.
    user_prompt : str
        the user prompt. When None, user messages contain only the image part.
    image : Image.Image
        the image for OCR.
    format : str, Optional
        the image format written into the data URL. Default is "png".
    detail : str, Optional
        the detail level of the image. Default is "high". 
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each one must be a FewShotExample; its
        image becomes a user message and its text the assistant reply.

    Returns:
    -------
    messages : list
        a list of dict with role and content. The system and assistant
        contents are passed through as given; user contents are lists of
        typed parts (image_url, then optionally text).

    Raises:
    ------
    ValueError
        if an element of few_shot_examples is not a FewShotExample.
    """
    def _user_turn(encoded_image):
        # Image part always comes first; the text part is added only when a prompt exists.
        parts = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/{format};base64,{encoded_image}",
                    "detail": detail
                },
            },
        ]
        if user_prompt is not None:
            parts.append({"type": "text", "text": user_prompt})
        return {"role": "user", "content": parts}

    # NOTE(review): `format` only labels the data URL; it is not passed to
    # image_to_base64 here - confirm the helper's encoding matches.
    target_b64 = image_to_base64(image)

    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # Each few-shot example contributes a user turn followed by the expected answer.
    for shot in (few_shot_examples if few_shot_examples is not None else ()):
        if not isinstance(shot, FewShotExample):
            raise ValueError("Few-shot example must be a FewShotExample object.")
        messages.extend((
            _user_turn(image_to_base64(shot.image)),
            {"role": "assistant", "content": shot.text},
        ))

    # The actual request goes last.
    messages.append(_user_turn(target_b64))
    return messages

vlm4ocr.vlm_engines.AzureOpenAIVLMEngine

AzureOpenAIVLMEngine(
    model: str,
    api_version: str,
    config: LLMConfig = None,
    max_concurrent_requests: int = None,
    max_requests_per_minute: int = None,
    **kwrs
)

Bases: AzureOpenAIInferenceEngine, OpenAIVLMEngine

The Azure OpenAI API inference engine. For parameters and documentation, refer to - https://azure.microsoft.com/en-us/products/ai-services/openai-service - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

Parameters:

model : str model name as described in https://platform.openai.com/docs/models api_version : str the Azure OpenAI API version config : LLMConfig the LLM configuration.

Source code in llm_inference_engine/engines.py
def __init__(self, model:str, api_version:str, config:LLMConfig=None, max_concurrent_requests:int=None, max_requests_per_minute:int=None, **kwrs):
    """
    The Azure OpenAI API inference engine.
    For parameters and documentation, refer to 
    - https://azure.microsoft.com/en-us/products/ai-services/openai-service
    - https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart

    Parameters:
    ----------
    model : str
        model name as described in https://platform.openai.com/docs/models
    api_version : str
        the Azure OpenAI API version
    config : LLMConfig, Optional
        the LLM configuration.
    max_concurrent_requests : int, Optional
        maximum number of concurrent requests to the LLM.
    max_requests_per_minute : int, Optional
        maximum number of requests per minute to the LLM.
    **kwrs
        extra keyword arguments, passed unchanged (together with api_version) to both the
        AzureOpenAI and AsyncAzureOpenAI client constructors.

    Raises:
    ------
    ImportError
        if importlib cannot locate the "openai" package.
    """
    # The base initializer is called explicitly on InferenceEngine rather than through super().
    InferenceEngine.__init__(self, config=config, 
                             max_concurrent_requests=max_concurrent_requests, 
                             max_requests_per_minute=max_requests_per_minute)

    # Check availability first so a missing dependency produces an actionable message.
    if importlib.util.find_spec("openai") is None:
        raise ImportError("OpenAI Python API library not found. Please install OpenAI (```pip install openai```).")

    # Imported inside the method, only after the availability check above has passed.
    from openai import AzureOpenAI, AsyncAzureOpenAI
    from openai.types.chat import ChatCompletionChunk, ChatCompletion
    self.api_version = api_version
    # The sync and async clients are built from the same API version and keyword arguments.
    self.client = AzureOpenAI(api_version=api_version, **kwrs)
    self.async_client = AsyncAzureOpenAI(api_version=api_version, **kwrs)
    self.model = model
    # Keep the response classes on the instance; they are only imported in this scope.
    self.ChatCompletion = ChatCompletion
    self.ChatCompletionChunk = ChatCompletionChunk