OCR Engine Reference

vlm4ocr.ocr_engines.OCREngine

OCREngine(
    vlm_engine: VLMEngine,
    output_mode: str = "markdown",
    system_prompt: Union[str, None, Literal[False]] = None,
    user_prompt: Union[str, None, Literal[False]] = None,
    bbox_format: Union[BBoxFormat, None] = None,
)

This class inputs a image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.

Parameters:

vlm_engine : VLMEngine The VLM inference engine to use for OCR. output_mode : str, Optional The output format. Must be 'markdown', 'HTML', 'text', 'JSON', or 'bbox'. system_prompt : str | None | False, Optional Controls the system prompt sent to the model. - None (default): use the built-in default system prompt for the selected output_mode. - str: use this custom system prompt. - False: send no system prompt at all. user_prompt : str | None | False, Optional Controls the user-turn text sent alongside the image. - None (default): use the built-in default user prompt, or empty string in bbox mode (triggers full-text OCR). - str: custom user prompt. In bbox mode a non-empty string triggers targeted extraction. - False: send no user prompt text (image only). bbox_format : BBoxFormat | None, Optional Override the auto-resolved BBoxFormat for bbox output mode. When None (default) the format is resolved from the registry based on vlm_engine.model.

Source code in packages/vlm4ocr/vlm4ocr/ocr_engines.py

def __init__(self, vlm_engine: VLMEngine, output_mode: str = "markdown",
             system_prompt: Union[str, None, Literal[False]] = None,
             user_prompt: Union[str, None, Literal[False]] = None,
             bbox_format: Union[BBoxFormat, None] = None):
    """
    This class inputs a image or PDF file path and processes them using a VLM inference engine. Outputs plain text or markdown.

    Parameters:
    -----------
    vlm_engine : VLMEngine
        The VLM inference engine to use for OCR.
    output_mode : str, Optional
        The output format. Must be 'markdown', 'HTML', 'text', 'JSON', or 'bbox'.
    system_prompt : str | None | False, Optional
        Controls the system prompt sent to the model.
        - None (default): use the built-in default system prompt for the selected output_mode.
        - str: use this custom system prompt.
        - False: send no system prompt at all.
    user_prompt : str | None | False, Optional
        Controls the user-turn text sent alongside the image.
        - None (default): use the built-in default user prompt, or empty string in bbox mode
          (triggers full-text OCR).
        - str: custom user prompt. In bbox mode a non-empty string triggers targeted extraction.
        - False: send no user prompt text (image only).
    bbox_format : BBoxFormat | None, Optional
        Override the auto-resolved BBoxFormat for bbox output mode. When None (default) the
        format is resolved from the registry based on vlm_engine.model.
    """
    # Check inference engine
    if not isinstance(vlm_engine, VLMEngine):
        raise TypeError("vlm_engine must be an instance of VLMEngine")
    self.vlm_engine = vlm_engine

    # Check output mode
    if output_mode not in ["markdown", "HTML", "text", "JSON", "bbox"]:
        raise ValueError("output_mode must be 'markdown', 'HTML', 'text', 'JSON', or 'bbox'.")
    self.output_mode = output_mode

    # Resolve BBoxFormat (only used in bbox mode, but resolved eagerly so init fails fast)
    if output_mode == "bbox":
        from vlm4ocr.bbox import resolve_bbox_format
        if bbox_format is not None:
            if not isinstance(bbox_format, BBoxFormat):
                raise TypeError("bbox_format must be a BBoxFormat instance or None")
            self.bbox_format = bbox_format
        else:
            self.bbox_format = resolve_bbox_format(self.vlm_engine.model)
    else:
        self.bbox_format = None

    # System prompt
    if system_prompt is False:
        self.system_prompt = None
    elif isinstance(system_prompt, str) and system_prompt:
        self.system_prompt = system_prompt
    else:
        if output_mode == "bbox":
            prompt_file = self.bbox_format.system_prompt_file
        else:
            prompt_file = f'ocr_{self.output_mode}_system_prompt.txt'
        prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(prompt_file)
        with prompt_template_path.open('r', encoding='utf-8') as f:
            self.system_prompt = f.read()

    # User prompt
    if user_prompt is False:
        self.user_prompt = None
    elif isinstance(user_prompt, str) and user_prompt:
        self.user_prompt = user_prompt
    else:
        if self.output_mode == "JSON":
            raise ValueError("user_prompt must be provided when output_mode is 'JSON' to define the JSON structure.")
        if self.output_mode == "bbox":
            # None / empty → full-text OCR (empty user text)
            self.user_prompt = ""
        else:
            prompt_template_path = importlib.resources.files('vlm4ocr.assets.default_prompt_templates').joinpath(f'ocr_{self.output_mode}_user_prompt.txt')
            with prompt_template_path.open('r', encoding='utf-8') as f:
                self.user_prompt = f.read()

    # Image processor
    self.image_processor = ImageProcessor(vlm_engine=self.vlm_engine)

stream_ocr

stream_ocr(
    file_path: str,
    rotate_correction: Union[
        RotateCorrectionMethod, Literal[False]
    ] = False,
    max_dimension_pixels: int = None,
    few_shot_examples: List[FewShotExample] = None,
) -> Generator[Dict[str, str], None, None]

This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications. Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.

Parameters:

file_path : str The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp' rotate_correction : {"tesseract", "vlm", False}, Optional Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction. max_dimension_pixels : int, Optional The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied. few_shot_examples : List[FewShotExample], Optional list of few-shot examples.

Returns:

Generator[Dict[str, str], None, None] A generator that yields the output: {"type": "info", "data": msg} {"type": "ocr_chunk", "data": chunk} {"type": "page_delimiter", "data": page_delimiter}

Source code in packages/vlm4ocr/vlm4ocr/ocr_engines.py

def stream_ocr(self, file_path: str, rotate_correction:Union[RotateCorrectionMethod, Literal[False]]=False,
               max_dimension_pixels:int=None,
               few_shot_examples:List[FewShotExample]=None) -> Generator[Dict[str, str], None, None]:
    """
    This method inputs a file path (image or PDF) and stream OCR results in real-time. This is useful for frontend applications.
    Yields dictionaries with 'type' ('ocr_chunk' or 'page_delimiter') and 'data'.

    Parameters:
    -----------
    file_path : str
        The path to the image or PDF file. Must be one of '.pdf', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
    rotate_correction : {"tesseract", "vlm", False}, Optional
        Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction.
    max_dimension_pixels : int, Optional
        The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples.

    Returns:
    --------
    Generator[Dict[str, str], None, None]
        A generator that yields the output:
        {"type": "info", "data": msg}
        {"type": "ocr_chunk", "data": chunk}
        {"type": "page_delimiter", "data": page_delimiter}
    """
    # Check file path
    if not isinstance(file_path, str):
        raise TypeError("file_path must be a string")

    # Check file extension
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext not in SUPPORTED_IMAGE_EXTS:
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")

    # PDF or TIFF
    if file_ext in ['.pdf', '.tif', '.tiff']:
        data_loader = PDFDataLoader(file_path) if file_ext == '.pdf' else TIFFDataLoader(file_path)
        images = data_loader.get_all_pages()
        # Check if images were extracted
        if not images:
            raise ValueError(f"No images extracted from file: {file_path}")

        # OCR each image
        for i, image in enumerate(images):
            # Apply rotate correction if specified
            if rotate_correction:
                try:
                    image, _ = self.image_processor.rotate_correction(image, method=rotate_correction)
                except Exception as e:
                    yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}

            # Resize the image if max_dimension_pixels is specified
            if max_dimension_pixels is not None:
                try:
                    image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
                except Exception as e:
                    yield {"type": "info", "data": f"Error resizing image: {str(e)}"}

            # Get OCR messages
            messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
                                                        user_prompt=self.user_prompt,
                                                        image=image,
                                                        few_shot_examples=few_shot_examples)

            # Stream response
            response_stream = self.vlm_engine.chat_stream(messages)
            buffered_chunks = []
            for chunk in response_stream:
                if chunk["type"] == "response":
                    yield {"type": "ocr_chunk", "data": chunk["data"]}
                    buffered_chunks.append(chunk["data"])

            # bbox post-processing after stream completes
            if self.output_mode == "bbox":
                from vlm4ocr.bbox import parse_bbox_response
                img_w, img_h = image.size
                raw_response = "".join(buffered_chunks)
                bboxes = parse_bbox_response(raw_response, self.bbox_format, img_w, img_h)
                yield {"type": "bbox_result", "data": {
                    "page_idx": i,
                    "bboxes": [asdict(b) for b in bboxes],
                    "image_width": img_w,
                    "image_height": img_h,
                }}

            if i < len(images) - 1:
                yield {"type": "page_delimiter", "data": get_default_page_delimiter(self.output_mode)}

    # Image
    else:
        data_loader = ImageDataLoader(file_path)
        image = data_loader.get_page(0)

        # Apply rotate correction if specified
        if rotate_correction:
            try:
                image, _ = self.image_processor.rotate_correction(image, method=rotate_correction)
            except Exception as e:
                yield {"type": "info", "data": f"Error during rotate correction: {str(e)}"}

        # Resize the image if max_dimension_pixels is specified
        if max_dimension_pixels is not None:
            try:
                image, _ = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
            except Exception as e:
                yield {"type": "info", "data": f"Error resizing image: {str(e)}"}

        # Get OCR messages
        messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
                                                    user_prompt=self.user_prompt,
                                                    image=image,
                                                    few_shot_examples=few_shot_examples)
        # Stream response
        response_stream = self.vlm_engine.chat_stream(messages)
        buffered_chunks = []
        for chunk in response_stream:
            if chunk["type"] == "response":
                yield {"type": "ocr_chunk", "data": chunk["data"]}
                buffered_chunks.append(chunk["data"])

        # bbox post-processing after stream completes
        if self.output_mode == "bbox":
            from vlm4ocr.bbox import parse_bbox_response
            img_w, img_h = image.size
            raw_response = "".join(buffered_chunks)
            bboxes = parse_bbox_response(raw_response, self.bbox_format, img_w, img_h)
            yield {"type": "bbox_result", "data": {
                "page_idx": 0,
                "bboxes": [asdict(b) for b in bboxes],
                "image_width": img_w,
                "image_height": img_h,
            }}

sequential_ocr

sequential_ocr(
    file_paths: Union[str, Iterable[str]],
    rotate_correction: Union[
        RotateCorrectionMethod, Literal[False]
    ] = False,
    max_dimension_pixels: int = None,
    verbose: bool = False,
    few_shot_examples: List[FewShotExample] = None,
) -> List[OCRResult]

This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.

Parameters:

file_paths : Union[str, Iterable[str]] A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp' rotate_correction : {"tesseract", "vlm", False}, Optional Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction. max_dimension_pixels : int, Optional The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied. verbose : bool, Optional If True, the function will print the output in terminal. few_shot_examples : List[FewShotExample], Optional list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str).

Returns:

List[OCRResult] A list of OCR result objects.

Source code in packages/vlm4ocr/vlm4ocr/ocr_engines.py

def sequential_ocr(self, file_paths: Union[str, Iterable[str]],
                   rotate_correction:Union[RotateCorrectionMethod, Literal[False]]=False,
                   max_dimension_pixels:int=None, verbose:bool=False, few_shot_examples:List[FewShotExample]=None) -> List[OCRResult]:
    """
    This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.

    Parameters:
    -----------
    file_paths : Union[str, Iterable[str]]
        A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
    rotate_correction : {"tesseract", "vlm", False}, Optional
        Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction.
    max_dimension_pixels : int, Optional
        The maximum dimension of the image in pixels. Original dimensions will be resized to fit in. If None, no resizing is applied.
    verbose : bool, Optional
        If True, the function will print the output in terminal.
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str).

    Returns:
    --------
    List[OCRResult]
        A list of OCR result objects.
    """
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Iterate through file paths
    ocr_results = []
    for file_path in file_paths:
        # Define OCRResult object
        ocr_result = OCRResult(input_dir=file_path, output_mode=self.output_mode)
        # get file extension
        file_ext = os.path.splitext(file_path)[1].lower()
        # Check file extension
        if file_ext not in SUPPORTED_IMAGE_EXTS:
            if verbose:
                print(f"{Fore.RED}Unsupported file type:{Style.RESET_ALL} {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}")
            ocr_result.status = "error"
            ocr_result.add_page(text=f"Unsupported file type: {file_ext}. Supported types are: {SUPPORTED_IMAGE_EXTS}",
                                image_processing_status={})
            ocr_results.append(ocr_result)
            continue

        filename = os.path.basename(file_path)

        try:
            # Load images from file
            if file_ext == '.pdf':
                data_loader = PDFDataLoader(file_path) 
            elif file_ext in ['.tif', '.tiff']:
                data_loader = TIFFDataLoader(file_path)
            else:
                data_loader = ImageDataLoader(file_path)

            images = data_loader.get_all_pages()
        except Exception as e:
            if verbose:
                print(f"{Fore.RED}Error processing file {filename}:{Style.RESET_ALL} {str(e)}")
            ocr_result.status = "error"
            ocr_result.add_page(text=f"Error processing file {filename}: {str(e)}", image_processing_status={})
            ocr_results.append(ocr_result)
            continue

        # Check if images were extracted
        if not images:
            if verbose:
                print(f"{Fore.RED}No images extracted from file:{Style.RESET_ALL} {filename}. It might be empty or corrupted.")
            ocr_result.status = "error"
            ocr_result.add_page(text=f"No images extracted from file: {filename}. It might be empty or corrupted.",
                                image_processing_status={})
            ocr_results.append(ocr_result)
            continue

        # OCR images
        for i, image in enumerate(images):
            image_processing_status = {}
            # Apply rotate correction if specified
            if rotate_correction:
                try:
                    image, rotation_angle = self.image_processor.rotate_correction(image, method=rotate_correction)
                    image_processing_status["rotate_correction"] = {
                        "status": "success",
                        "rotation_angle": rotation_angle
                    }
                    if verbose:
                        print(f"{Fore.GREEN}Rotate correction applied for {filename} page {i} with angle {rotation_angle} degrees.{Style.RESET_ALL}")
                except Exception as e:
                    image_processing_status["rotate_correction"] = {
                        "status": "error",
                        "error": str(e)
                    }
                    if verbose:
                        print(f"{Fore.RED}Error during rotate correction for {filename}:{Style.RESET_ALL} {rotation_angle['error']}. OCR continues without rotate correction.")

            # Resize the image if max_dimension_pixels is specified
            if max_dimension_pixels is not None:
                try:
                    image, resized = self.image_processor.resize(image, max_dimension_pixels=max_dimension_pixels)
                    image_processing_status["resize"] = {
                        "status": "success",
                        "resized": resized
                    }
                    if verbose and resized:
                        print(f"{Fore.GREEN}Image resized for {filename} page {i} to fit within {max_dimension_pixels} pixels.{Style.RESET_ALL}")
                except Exception as e:
                    image_processing_status["resize"] = {
                        "status": "error",
                        "error": str(e)
                    }
                    if verbose:
                        print(f"{Fore.RED}Error resizing image for {filename}:{Style.RESET_ALL} {resized['error']}. OCR continues without resizing.")

            try:
                messages = self.vlm_engine.get_ocr_messages(system_prompt=self.system_prompt,
                                                            user_prompt=self.user_prompt,
                                                            image=image,
                                                            few_shot_examples=few_shot_examples)
                # Define a messages logger to capture messages
                messages_logger = MessagesLogger()
                # Generate response
                response = self.vlm_engine.chat(
                    messages,
                    verbose=verbose,
                    messages_logger=messages_logger
                )
                ocr_text = response["response"]
                bboxes = None
                img_w = img_h = None

                # Mode-specific post-processing
                if self.output_mode == "markdown":
                    ocr_text = clean_markdown(ocr_text)
                elif self.output_mode == "JSON":
                    json_list = extract_json(ocr_text)
                    ocr_text = json.dumps(json_list, indent=4)
                elif self.output_mode == "bbox":
                    from vlm4ocr.bbox import parse_bbox_response
                    img_w, img_h = image.size
                    bboxes = parse_bbox_response(ocr_text, self.bbox_format, img_w, img_h)
                    ocr_text = "\n".join(item.text for item in bboxes)

                # Add the page to the OCR result
                ocr_result.add_page(text=ocr_text,
                                    image_processing_status=image_processing_status,
                                    bboxes=bboxes,
                                    image_width=img_w,
                                    image_height=img_h)

                # Add messages log to the OCR result
                ocr_result.add_messages_to_log(messages_logger.get_messages_log())

            except Exception as page_e:
                ocr_result.status = "error"
                ocr_result.add_page(text=f"Error during OCR for a page in {filename}: {str(page_e)}",
                                    image_processing_status={})
                if verbose:
                    print(f"{Fore.RED}Error during OCR for a page in {filename}:{Style.RESET_ALL} {page_e}")

        # Add the OCR result to the list
        if ocr_result.status != "error":
            ocr_result.status = "success"
        ocr_results.append(ocr_result)

        if verbose:
            print(f"{Fore.BLUE}Processed {filename} with {len(ocr_result)} pages.{Style.RESET_ALL}")
            for page in ocr_result:
                print(page)
                print("-" * 80)

    return ocr_results

concurrent_ocr

concurrent_ocr(
    file_paths: Union[str, Iterable[str]],
    rotate_correction: Union[
        RotateCorrectionMethod, Literal[False]
    ] = False,
    max_dimension_pixels: int = None,
    few_shot_examples: List[FewShotExample] = None,
    concurrent_batch_size: int = 32,
    max_file_load: int = None,
) -> AsyncGenerator[OCRResult, None]

First complete first out. Input and output order not guaranteed. This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine. Results are processed concurrently using asyncio.

Parameters:

file_paths : Union[str, Iterable[str]] A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp' rotate_correction : {"tesseract", "vlm", False}, Optional Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction. max_dimension_pixels : int, Optional The maximum dimension of the image in pixels. Origianl dimensions will be resized to fit in. If None, no resizing is applied. few_shot_examples : List[FewShotExample], Optional list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str). concurrent_batch_size : int, Optional The number of concurrent VLM calls to make. max_file_load : int, Optional The maximum number of files to load concurrently. If None, defaults to 2 times of concurrent_batch_size.

Returns:

AsyncGenerator[OCRResult, None] A generator that yields OCR result objects as they complete.

Source code in packages/vlm4ocr/vlm4ocr/ocr_engines.py

def concurrent_ocr(self, file_paths: Union[str, Iterable[str]],
                   rotate_correction:Union[RotateCorrectionMethod, Literal[False]]=False,
                   max_dimension_pixels:int=None, few_shot_examples:List[FewShotExample]=None,
                   concurrent_batch_size: int=32, max_file_load: int=None) -> AsyncGenerator[OCRResult, None]:
    """
    First complete first out. Input and output order not guaranteed.
    This method inputs a file path or a list of file paths (image, PDF, TIFF) and performs OCR using the VLM inference engine.
    Results are processed concurrently using asyncio.

    Parameters:
    -----------
    file_paths : Union[str, Iterable[str]]
        A file path or a list of file paths to process. Must be one of '.pdf', '.tif', '.tiff', '.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'
    rotate_correction : {"tesseract", "vlm", False}, Optional
        Rotation correction method. "tesseract" uses pytesseract OSD; "vlm" prompts the VLM engine for orientation. False disables correction.
    max_dimension_pixels : int, Optional
        The maximum dimension of the image in pixels. Origianl dimensions will be resized to fit in. If None, no resizing is applied.
    few_shot_examples : List[FewShotExample], Optional
        list of few-shot examples. Each example is a dict with keys "image" (PIL.Image.Image) and "text" (str).
    concurrent_batch_size : int, Optional
        The number of concurrent VLM calls to make. 
    max_file_load : int, Optional
        The maximum number of files to load concurrently. If None, defaults to 2 times of concurrent_batch_size.

    Returns:
    --------
    AsyncGenerator[OCRResult, None]
        A generator that yields OCR result objects as they complete.
    """
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    if max_file_load is None:
        max_file_load = concurrent_batch_size * 2

    if not isinstance(max_file_load, int) or max_file_load <= 0:
        raise ValueError("max_file_load must be a positive integer")

    return self._ocr_async(file_paths=file_paths, 
                           rotate_correction=rotate_correction,
                           max_dimension_pixels=max_dimension_pixels,
                           few_shot_examples=few_shot_examples,
                           concurrent_batch_size=concurrent_batch_size, 
                           max_file_load=max_file_load)