
Chunkers API

This module provides classes for splitting documents into manageable units for processing by LLMs and for providing context to those units.

Unit Chunkers

Unit chunkers determine how a document is divided into smaller pieces for frame extraction. Each piece is a FrameExtractionUnit.

llm_ie.chunkers.UnitChunker

UnitChunker()

Bases: ABC

This is the abstract class for frame extraction unit chunkers. It chunks a document into units (e.g., sentences); the LLM then processes the document unit by unit.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This is the abstract class for frame extraction unit chunker.
    It chunks a document into units (e.g., sentences). LLMs process unit by unit. 
    """
    pass

chunk abstractmethod

chunk(
    text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    return NotImplemented

chunk_async async

chunk_async(
    text: str, doc_id: str = None, executor=None
) -> List[FrameExtractionUnit]

Asynchronous version of the chunk method.

Source code in package/llm-ie/src/llm_ie/chunkers.py
async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
    """
    asynchronous version of chunk method.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, self.chunk, text, doc_id)
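
For illustration, here is a minimal sketch of a custom unit chunker built on this interface. The class name, the fixed-size splitting logic, and the import path for FrameExtractionUnit are assumptions, not part of the package:

import uuid
from typing import List

from llm_ie.chunkers import UnitChunker
from llm_ie.data_types import FrameExtractionUnit  # import path assumed; adjust to your install

class FixedSizeUnitChunker(UnitChunker):
    """Hypothetical chunker that splits a document into fixed-size character windows."""
    def __init__(self, size: int = 500):
        super().__init__()
        self.size = size

    def chunk(self, text: str, doc_id: str = None) -> List[FrameExtractionUnit]:
        doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
        units = []
        for start in range(0, len(text), self.size):
            end = min(start + self.size, len(text))
            units.append(FrameExtractionUnit(doc_id=doc_id, start=start, end=end, text=text[start:end]))
        return units

Because chunk_async wraps chunk in an executor, a subclass gets the asynchronous variant for free.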

llm_ie.chunkers.WholeDocumentUnitChunker

WholeDocumentUnitChunker()

Bases: UnitChunker

This class chunks the whole document into a single unit (no chunking).

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This class chunks the whole document into a single unit (no chunking).
    """
    super().__init__()

chunk

chunk(
    text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    return [FrameExtractionUnit(
        doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
        start=0,
        end=len(text),
        text=text
    )]
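
A minimal usage sketch (the sample text is illustrative):

from llm_ie.chunkers import WholeDocumentUnitChunker

text = "Patient presents with chest pain. No fever."
chunker = WholeDocumentUnitChunker()
units = chunker.chunk(text, doc_id="doc-1")

print(len(units))             # 1 -- the whole document is a single unit
print(units[0].text == text)  # True (attribute names follow the constructor arguments above)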

llm_ie.chunkers.SentenceUnitChunker

SentenceUnitChunker()

Bases: UnitChunker

This class uses the NLTK PunktSentenceTokenizer to chunk a document into sentences.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This class uses the NLTK PunktSentenceTokenizer to chunk a document into sentences.
    """
    super().__init__()

chunk

chunk(
    text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
    sentences = []
    # Assumes nltk's PunktSentenceTokenizer is imported at module level.
    for start, end in PunktSentenceTokenizer().span_tokenize(text):
        sentences.append(FrameExtractionUnit(
            doc_id=doc_id,
            start=start,
            end=end,
            text=text[start:end]
        ))    
    return sentences
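
A minimal usage sketch (requires NLTK; the sample text is illustrative):

from llm_ie.chunkers import SentenceUnitChunker

text = "BP was 120/80. Patient denies chest pain. Follow up in 2 weeks."
chunker = SentenceUnitChunker()
for unit in chunker.chunk(text, doc_id="note-1"):
    print(unit.start, unit.end, repr(unit.text))
# Each unit is one sentence, and start/end are character offsets into the original document.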

llm_ie.chunkers.SeparatorUnitChunker

SeparatorUnitChunker(sep: str)

Bases: UnitChunker

This class chunks a document by the provided separator.

Parameters:

sep : str The separator string.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, sep:str):
    """
    This class chunks a document by separator provided.

    Parameters:
    ----------
    sep : str
        a separator string.
    """
    super().__init__()
    if not isinstance(sep, str):
        raise ValueError("sep must be a string")

    self.sep = sep

chunk

chunk(
    text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
    paragraphs = text.split(self.sep)
    paragraph_units = []
    start = 0
    for paragraph in paragraphs:
        end = start + len(paragraph)
        paragraph_units.append(FrameExtractionUnit(
            doc_id=doc_id,
            start=start,
            end=end,
            text=paragraph
        ))
        start = end + len(self.sep)
    return paragraph_units
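
A minimal usage sketch splitting on blank lines (the separator and text are illustrative):

from llm_ie.chunkers import SeparatorUnitChunker

text = "History of present illness ...\n\nMedications ...\n\nAssessment and plan ..."
chunker = SeparatorUnitChunker(sep="\n\n")
units = chunker.chunk(text, doc_id="note-2")
print(len(units))  # 3 paragraphs

Because start advances by len(paragraph) + len(sep), each unit's start/end still index into the original document.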

llm_ie.chunkers.TextLineUnitChunker

TextLineUnitChunker()

Bases: UnitChunker

This class chunks a document into lines.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This class chunks a document into lines.
    """
    super().__init__()

chunk

chunk(
    text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
    lines = text.split('\n')
    line_units = []
    start = 0
    for line in lines:
        end = start + len(line)
        line_units.append(FrameExtractionUnit(
            doc_id=doc_id,
            start=start,
            end=end,
            text=line
        ))
        start = end + 1 
    return line_units
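
A minimal usage sketch (the text is illustrative):

from llm_ie.chunkers import TextLineUnitChunker

chunker = TextLineUnitChunker()
units = chunker.chunk("line one\nline two\nline three", doc_id="note-3")
print([u.text for u in units])  # ['line one', 'line two', 'line three']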

llm_ie.chunkers.LLMUnitChunker

LLMUnitChunker(
    inference_engine: InferenceEngine,
    prompt_template: str = None,
    system_prompt: str = None,
)

Bases: UnitChunker

This class prompts an LLM to segment the document (e.g., into sections or paragraphs).

Parameters:

inference_engine : InferenceEngine The LLM inference engine object.

prompt_template : str The prompt template that defines how to chunk the document. Must define a JSON schema with

[
    {
        "title": "<your title here>",
        "anchor_text": "<the anchor text of the chunk here>"
    },
    {
        "title": "<your title here>",
        "anchor_text": "<the anchor text of the chunk here>"
    }
]
system_prompt : str, optional The system prompt.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
    """
    This class prompt an LLM for document segmentation (e.g., sections, paragraphs).

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object.
    prompt_template : str
        the prompt template that defines how to chunk the document. Must define a JSON schema with 
        ```json
        [
            {
                "title": "<your title here>",
                "anchor_text": "<the anchor text of the chunk here>"
            },
            {
                "title": "<your title here>",
                "anchor_text": "<the anchor text of the chunk here>"
            }
        ]
        ```
    system_prompt : str, optional
        The system prompt.
    """
    self.inference_engine = inference_engine

    if prompt_template is None:
        file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
        with open(file_path, 'r', encoding="utf-8") as f:
            self.prompt_template = f.read()
    else:
        self.prompt_template = prompt_template

    self.system_prompt = system_prompt

chunk

chunk(text, doc_id=None) -> List[FrameExtractionUnit]
Parameters:

text : str The document text.

doc_id : str, optional The document ID.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
    """
    Parameters:
    -----------
    text : str
        the document text.
    doc_id : str, optional
        the document id.
    """
    doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
    user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
    messages = []
    if self.system_prompt is not None:
        messages.append({'role': 'system', 'content': self.system_prompt})
    messages.append({'role': 'user', 'content': user_prompt})

    gen_text = self.inference_engine.chat(messages=messages)

    header_list = extract_json(gen_text=gen_text["response"])
    units = []
    start = 0
    prev_end = 0
    for header in header_list:
        if "anchor_text" not in header:
            Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
            continue
        if not isinstance(header["anchor_text"], str):
            Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
            continue

        start = prev_end
        # find the first instance of the anchor text in the rest of the text
        end = text.find(header["anchor_text"], start)
        # if not found, skip this header
        if end == -1:
            continue
        # if start == end (empty text), skip this header
        if start == end:
            continue
        # create a frame extraction unit
        units.append(FrameExtractionUnit(
            doc_id=doc_id,
            start=start,
            end=end,
            text=text[start:end]
        ))
        prev_end = end
    # add the last section
    if prev_end < len(text):
        units.append(FrameExtractionUnit(
            doc_id=doc_id,
            start=prev_end,
            end=len(text),
            text=text[prev_end:]
        ))
    return units
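
A usage sketch. Any InferenceEngine implementation can be passed; the engine class, its constructor arguments, and the sample note below are illustrative, not prescribed by this API:

from llm_ie.chunkers import LLMUnitChunker
from llm_ie.engines import OllamaInferenceEngine  # engine choice and import path assumed

clinical_note = (
    "HISTORY OF PRESENT ILLNESS: 65-year-old male with chest pain ...\n\n"
    "MEDICATIONS: aspirin, metoprolol ...\n\n"
    "ASSESSMENT AND PLAN: admit for observation ..."
)

engine = OllamaInferenceEngine(model_name="llama3.1:8b")  # hypothetical configuration
chunker = LLMUnitChunker(inference_engine=engine)         # uses the packaged default prompt template

units = chunker.chunk(clinical_note, doc_id="note-4")
for unit in units:
    print(unit.start, unit.end, unit.text[:40])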

Context Chunkers

Context chunkers determine what contextual information is provided to the LLM alongside a specific FrameExtractionUnit.

llm_ie.chunkers.ContextChunker

ContextChunker()

Bases: ABC

This is the abstract class for context chunkers. Given a frame extraction unit, it returns the context for that unit.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This is the abstract class for context chunker. Given a frame extraction unit,
    it returns the context for it.
    """
    pass

fit abstractmethod

fit(text: str, units: List[FrameExtractionUnit])
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def fit(self, text:str, units:List[FrameExtractionUnit]):
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    pass

fit_async async

fit_async(
    text: str,
    units: List[FrameExtractionUnit],
    executor=None,
)

Asynchronous version of the fit method.

Source code in package/llm-ie/src/llm_ie/chunkers.py
async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
    """
    asynchronous version of fit method.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, self.fit, text, units)

chunk abstractmethod

chunk(unit: FrameExtractionUnit) -> str
Parameters:

unit : FrameExtractionUnit The frame extraction unit.

Return : str The context for the frame extraction unit.

Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def chunk(self, unit:FrameExtractionUnit) -> str:
    """
    Parameters:
    ----------
    unit : FrameExtractionUnit
        The frame extraction unit.

    Return : str 
        The context for the frame extraction unit.
    """
    return NotImplemented

chunk_async async

chunk_async(
    unit: FrameExtractionUnit, executor=None
) -> str

Asynchronous version of the chunk method.

Source code in package/llm-ie/src/llm_ie/chunkers.py
async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
    """
    asynchronous version of chunk method.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(executor, self.chunk, unit)
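
For illustration, a minimal sketch of a custom context chunker built on this interface. The class name, the character-window logic, and the FrameExtractionUnit import path are assumptions:

from typing import List

from llm_ie.chunkers import ContextChunker
from llm_ie.data_types import FrameExtractionUnit  # import path assumed; adjust to your install

class CharacterWindowContextChunker(ContextChunker):
    """Hypothetical context chunker that returns +-n_chars of surrounding document text."""
    def __init__(self, n_chars: int = 200):
        super().__init__()
        self.n_chars = n_chars
        self.text = None

    def fit(self, text: str, units: List[FrameExtractionUnit]):
        # Store the full document so chunk() can slice around each unit.
        self.text = text

    def chunk(self, unit: FrameExtractionUnit) -> str:
        start = max(unit.start - self.n_chars, 0)
        end = min(unit.end + self.n_chars, len(self.text))
        return self.text[start:end]

As with unit chunkers, fit_async and chunk_async delegate to these synchronous methods via an executor.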

llm_ie.chunkers.NoContextChunker

NoContextChunker()

Bases: ContextChunker

This class does not provide any context.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This class does not provide any context.
    """
    super().__init__()

fit

fit(text: str, units: List[FrameExtractionUnit])
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    pass

llm_ie.chunkers.WholeDocumentContextChunker

WholeDocumentContextChunker()

Bases: ContextChunker

This class provides the whole document as context.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
    """
    This class provides the whole document as context.
    """
    super().__init__()
    self.text = None

fit

fit(text: str, units: List[FrameExtractionUnit])
Parameters:

text : str The document text.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
    """
    Parameters:
    ----------
    text : str
        The document text.
    """
    self.text = text
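
A minimal usage sketch; fit stores the document text, and per the class description the full document is returned as the context for every unit (the chunk method itself is not shown above):

from llm_ie.chunkers import SentenceUnitChunker, WholeDocumentContextChunker

text = "BP was 120/80. Patient denies chest pain."
units = SentenceUnitChunker().chunk(text, doc_id="note-5")

context_chunker = WholeDocumentContextChunker()
context_chunker.fit(text, units)
print(context_chunker.chunk(units[0]))  # expected: the full document text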

llm_ie.chunkers.SlideWindowContextChunker

SlideWindowContextChunker(window_size: int)

Bases: ContextChunker

This class provides a sliding window context. For example, +-2 sentences around a unit sentence.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, window_size:int):
    """
    This class provides a sliding window context. For example, +-2 sentences around a unit sentence. 
    """
    super().__init__()
    self.window_size = window_size
    self.units = None

fit

fit(text: str, units: List[FrameExtractionUnit])
Parameters:

units : List[FrameExtractionUnit] The list of frame extraction units.

Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
    """
    Parameters:
    ----------
    units : List[FrameExtractionUnit]
        The list of frame extraction units.
    """
    self.units = sorted(units)
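
A usage sketch combining a unit chunker with the sliding-window context chunker. The window size and text are illustrative; per the class description, chunk is expected to return the neighboring units around the given unit:

from llm_ie.chunkers import SentenceUnitChunker, SlideWindowContextChunker

text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five."
units = SentenceUnitChunker().chunk(text, doc_id="note-6")

context_chunker = SlideWindowContextChunker(window_size=2)
context_chunker.fit(text, units)

# For the middle sentence, the context should cover roughly +-2 surrounding sentences.
print(context_chunker.chunk(units[2]))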