Chunkers API
This module provides classes for splitting documents into manageable units for processing by LLMs and for providing context to those units.
Unit Chunkers
Unit chunkers determine how a document is divided into smaller pieces for frame extraction. Each piece is a FrameExtractionUnit.
llm_ie.chunkers.UnitChunker
Bases: ABC
This is the abstract base class for frame extraction unit chunkers.
It chunks a document into units (e.g., sentences); LLMs then process the document unit by unit.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This is the abstract class for frame extraction unit chunker.
It chunks a document into units (e.g., sentences). LLMs process unit by unit.
"""
pass
chunk
abstractmethod
chunk(
text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
"""
Parameters:
----------
text : str
The document text.
"""
return NotImplemented
chunk_async
async
chunk_async(
text: str, doc_id: str = None, executor=None
) -> List[FrameExtractionUnit]
Asynchronous version of the chunk method.
Source code in package/llm-ie/src/llm_ie/chunkers.py
async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
"""
asynchronous version of chunk method.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(executor, self.chunk, text, doc_id)
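Example:
To add a custom chunking strategy, subclass UnitChunker and implement chunk. The sketch below is a minimal, hypothetical example that splits a document into fixed-size character windows; it assumes FrameExtractionUnit can be imported from llm_ie.chunkers (adjust the import if the class lives in another module) and uses only the constructor arguments shown in the source above.
```python
import uuid
from typing import List

# Assumption: FrameExtractionUnit is importable from llm_ie.chunkers;
# adjust the import if the class lives in another module.
from llm_ie.chunkers import UnitChunker, FrameExtractionUnit


class FixedSizeUnitChunker(UnitChunker):
    """Hypothetical chunker that splits a document into fixed-size character windows."""
    def __init__(self, size: int = 500):
        super().__init__()
        self.size = size

    def chunk(self, text: str, doc_id: str = None) -> List[FrameExtractionUnit]:
        doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
        units = []
        for start in range(0, len(text), self.size):
            end = min(start + self.size, len(text))
            units.append(FrameExtractionUnit(doc_id=doc_id, start=start,
                                             end=end, text=text[start:end]))
        return units
```
The inherited chunk_async then works without changes, since it simply runs chunk in an executor.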
llm_ie.chunkers.WholeDocumentUnitChunker
WholeDocumentUnitChunker()
Bases: UnitChunker
This class chunks the whole document into a single unit (no chunking).
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This class chunks the whole document into a single unit (no chunking).
"""
super().__init__()
chunk
chunk(
text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
"""
Parameters:
----------
text : str
The document text.
"""
return [FrameExtractionUnit(
doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
start=0,
end=len(text),
text=text
)]
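Example:
A minimal usage sketch (the sample text is illustrative, and attribute access on FrameExtractionUnit via start, end, and text is assumed from the constructor arguments above).
```python
from llm_ie.chunkers import WholeDocumentUnitChunker

text = "Patient presents with chest pain. No prior cardiac history."
chunker = WholeDocumentUnitChunker()
units = chunker.chunk(text, doc_id="note-001")

assert len(units) == 1                 # the whole document becomes a single unit
print(units[0].start, units[0].end)    # 0 and len(text), assuming start/end attributes
```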
llm_ie.chunkers.SentenceUnitChunker
Bases: UnitChunker
This class uses the NLTK PunktSentenceTokenizer to chunk a document into sentences.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This class uses the NLTK PunktSentenceTokenizer to chunk a document into sentences.
"""
super().__init__()
chunk
chunk(
text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
"""
Parameters:
----------
text : str
The document text.
"""
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
sentences = []
for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
sentences.append(FrameExtractionUnit(
doc_id=doc_id,
start=start,
end=end,
text=text[start:end]
))
return sentences
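Example:
A minimal usage sketch (NLTK must be installed; attribute access on FrameExtractionUnit is assumed as above).
```python
from llm_ie.chunkers import SentenceUnitChunker

text = "The patient denies fever. She reports a mild cough. No dyspnea."
units = SentenceUnitChunker().chunk(text, doc_id="note-001")

for unit in units:
    print(f"[{unit.start}:{unit.end}] {unit.text}")  # one line per sentence span
```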
llm_ie.chunkers.SeparatorUnitChunker
SeparatorUnitChunker(sep: str)
Bases: UnitChunker
This class chunks a document by the provided separator.
Parameters:
sep : str
The separator string (e.g., "\n\n" to split on blank lines).
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, sep:str):
"""
This class chunks a document by separator provided.
Parameters:
----------
sep : str
a separator string.
"""
super().__init__()
if not isinstance(sep, str):
raise ValueError("sep must be a string")
self.sep = sep
chunk
chunk(
text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
"""
Parameters:
----------
text : str
The document text.
"""
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
paragraphs = text.split(self.sep)
paragraph_units = []
start = 0
for paragraph in paragraphs:
end = start + len(paragraph)
paragraph_units.append(FrameExtractionUnit(
doc_id=doc_id,
start=start,
end=end,
text=paragraph
))
start = end + len(self.sep)
return paragraph_units
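Example:
For example, splitting on blank lines yields one unit per paragraph (a minimal sketch).
```python
from llm_ie.chunkers import SeparatorUnitChunker

text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
chunker = SeparatorUnitChunker(sep="\n\n")
units = chunker.chunk(text)  # doc_id defaults to a random UUID

print(len(units))  # 3
```
Note that the separator itself is excluded from every unit, so the character spans of consecutive units are not contiguous.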
llm_ie.chunkers.TextLineUnitChunker
Bases: UnitChunker
This class chunks a document into lines.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This class chunks a document into lines.
"""
super().__init__()
chunk
chunk(
text: str, doc_id: str = None
) -> List[FrameExtractionUnit]
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
"""
Parameters:
----------
text : str
The document text.
"""
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
lines = text.split('\n')
line_units = []
start = 0
for line in lines:
end = start + len(line)
line_units.append(FrameExtractionUnit(
doc_id=doc_id,
start=start,
end=end,
text=line
))
start = end + 1
return line_units
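Example:
A minimal usage sketch (equivalent to splitting on "\n"; the text attribute on FrameExtractionUnit is assumed as above).
```python
from llm_ie.chunkers import TextLineUnitChunker

text = "Allergies: penicillin\nMedications: aspirin\nDiagnosis: angina"
units = TextLineUnitChunker().chunk(text)

print([unit.text for unit in units])  # one unit per line
```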
llm_ie.chunkers.LLMUnitChunker
LLMUnitChunker(
inference_engine: InferenceEngine,
prompt_template: str = None,
system_prompt: str = None,
)
Bases: UnitChunker
This class prompts an LLM for document segmentation (e.g., sections, paragraphs).
Parameters:
inference_engine : InferenceEngine
The LLM inference engine object.
prompt_template : str, optional
The prompt template that defines how to chunk the document. If not provided, a packaged default template is used. The template must instruct the LLM to output JSON in the following schema:
```json
[
    {
        "title": "<your title here>",
        "anchor_text": "<the anchor text of the chunk here>"
    },
    {
        "title": "<your title here>",
        "anchor_text": "<the anchor text of the chunk here>"
    }
]
```
system_prompt : str, optional
The system prompt.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
"""
This class prompts an LLM for document segmentation (e.g., sections, paragraphs).
Parameters:
----------
inference_engine : InferenceEngine
the LLM inferencing engine object.
prompt_template : str
the prompt template that defines how to chunk the document. Must define a JSON schema with
```json
[
{
"title": "<your title here>",
"anchor_text": "<the anchor text of the chunk here>"
},
{
"title": "<your title here>",
"anchor_text": "<the anchor text of the chunk here>"
}
]
```
system_prompt : str, optional
The system prompt.
"""
self.inference_engine = inference_engine
if prompt_template is None:
file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
with open(file_path, 'r', encoding="utf-8") as f:
self.prompt_template = f.read()
else:
self.prompt_template = prompt_template
self.system_prompt = system_prompt
chunk
chunk(text, doc_id=None) -> List[FrameExtractionUnit]
Parameters:
text : str
the document text.
doc_id : str, optional
the document id.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
"""
Parameters:
-----------
text : str
the document text.
doc_id : str, optional
the document id.
"""
doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append({'role': 'user', 'content': user_prompt})
gen_text = self.inference_engine.chat(messages=messages)
header_list = extract_json(gen_text=gen_text["response"])
units = []
start = 0
prev_end = 0
for header in header_list:
if "anchor_text" not in header:
Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
continue
if not isinstance(header["anchor_text"], str):
Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
continue
start = prev_end
# find the first instance of the anchor text in the rest of the text
end = text.find(header["anchor_text"], start)
# if not found, skip this header
if end == -1:
continue
# if start == end (empty text), skip this header
if start == end:
continue
# create a frame extraction unit
units.append(FrameExtractionUnit(
doc_id=doc_id,
start=start,
end=end,
text=text[start:end]
))
prev_end = end
# add the last section
if prev_end < len(text):
units.append(FrameExtractionUnit(
doc_id=doc_id,
start=prev_end,
end=len(text),
text=text[prev_end:]
))
return units
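Example:
A hedged usage sketch: engine construction is not shown here, so `engine` below is a placeholder for any configured InferenceEngine instance, and `report_text` stands in for the document string.
```python
from llm_ie.chunkers import LLMUnitChunker

engine = ...         # placeholder: any configured InferenceEngine instance
report_text = "..."  # placeholder: the document to segment

chunker = LLMUnitChunker(inference_engine=engine)  # uses the packaged default prompt template
units = chunker.chunk(report_text, doc_id="report-001")

for unit in units:
    print(unit.text[:80])  # first characters of each LLM-identified section
```
Because segmentation relies on the LLM returning valid anchor texts, headers whose anchor_text cannot be found in the document are skipped (see the source above).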
Context Chunkers
Context chunkers determine what contextual information is provided to the LLM alongside a specific FrameExtractionUnit.
llm_ie.chunkers.ContextChunker
Bases: ABC
This is the abstract base class for context chunkers. Given a frame extraction unit,
it returns the context for that unit.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This is the abstract class for context chunker. Given a frame extraction unit,
it returns the context for it.
"""
pass
fit
abstractmethod
fit(text: str, units: List[FrameExtractionUnit])
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def fit(self, text:str, units:List[FrameExtractionUnit]):
"""
Parameters:
----------
text : str
The document text.
"""
pass
fit_async
async
fit_async(
text: str,
units: List[FrameExtractionUnit],
executor=None,
)
Asynchronous version of the fit method.
Source code in package/llm-ie/src/llm_ie/chunkers.py
async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
"""
asynchronous version of fit method.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(executor, self.fit, text, units)
chunk
abstractmethod
chunk(unit: FrameExtractionUnit) -> str
Parameters:
unit : FrameExtractionUnit
The frame extraction unit.
Return : str
The context for the frame extraction unit.
Source code in package/llm-ie/src/llm_ie/chunkers.py
@abc.abstractmethod
def chunk(self, unit:FrameExtractionUnit) -> str:
"""
Parameters:
----------
unit : FrameExtractionUnit
The frame extraction unit.
Return : str
The context for the frame extraction unit.
"""
return NotImplemented
chunk_async
async
chunk_async(
unit: FrameExtractionUnit, executor=None
) -> str
Asynchronous version of the chunk method.
Source code in package/llm-ie/src/llm_ie/chunkers.py
async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
"""
asynchronous version of chunk method.
"""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(executor, self.chunk, unit)
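Example:
To supply a custom context strategy, subclass ContextChunker and implement fit and chunk. A minimal, hypothetical sketch that always returns the leading characters of the document as context (the FrameExtractionUnit import path is an assumption, as above).
```python
from typing import List

# Assumption: FrameExtractionUnit is importable from llm_ie.chunkers.
from llm_ie.chunkers import ContextChunker, FrameExtractionUnit


class LeadingTextContextChunker(ContextChunker):
    """Hypothetical context chunker that returns the first n_chars of the document."""
    def __init__(self, n_chars: int = 500):
        super().__init__()
        self.n_chars = n_chars
        self.prefix = None

    def fit(self, text: str, units: List[FrameExtractionUnit]):
        self.prefix = text[:self.n_chars]

    def chunk(self, unit: FrameExtractionUnit) -> str:
        return self.prefix
```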
llm_ie.chunkers.NoContextChunker
Bases: ContextChunker
This class does not provide any context.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This class does not provide any context.
"""
super().__init__()
fit
fit(text: str, units: List[FrameExtractionUnit])
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
"""
Parameters:
----------
text : str
The document text.
"""
pass
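Example:
NoContextChunker is appropriate when each unit is self-contained. A minimal sketch (the chunk implementation is not shown in this section; it is expected to return an empty context).
```python
from llm_ie.chunkers import SentenceUnitChunker, NoContextChunker

text = "The patient denies fever. She reports a mild cough."
units = SentenceUnitChunker().chunk(text)

context_chunker = NoContextChunker()
context_chunker.fit(text, units)           # no-op
context = context_chunker.chunk(units[0])  # expected to be empty: no extra context
```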
llm_ie.chunkers.WholeDocumentContextChunker
WholeDocumentContextChunker()
Bases: ContextChunker
This class provides the whole document as context.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self):
"""
This class provides the whole document as context.
"""
super().__init__()
self.text = None
fit
fit(text: str, units: List[FrameExtractionUnit])
Parameters:
text : str
The document text.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
"""
Parameters:
----------
text : str
The document text.
"""
self.text = text
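Example:
A minimal usage sketch; fit stores the document text, and chunk (not shown in this section) is expected to return it for every unit.
```python
from llm_ie.chunkers import SentenceUnitChunker, WholeDocumentContextChunker

text = "The patient denies fever. She reports a mild cough."
units = SentenceUnitChunker().chunk(text)

context_chunker = WholeDocumentContextChunker()
context_chunker.fit(text, units)
context = context_chunker.chunk(units[0])  # expected: the full document text
```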
llm_ie.chunkers.SlideWindowContextChunker
SlideWindowContextChunker(window_size: int)
Bases: ContextChunker
This class provides a sliding-window context, for example ±2 sentences around the unit sentence.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def __init__(self, window_size:int):
"""
This class provides a sliding window context. For example, +-2 sentences around a unit sentence.
"""
super().__init__()
self.window_size = window_size
self.units = None
fit
fit(text: str, units: List[FrameExtractionUnit])
Parameters:
units : List[FrameExtractionUnit]
The list of frame extraction units.
Source code in package/llm-ie/src/llm_ie/chunkers.py
def fit(self, text:str, units:List[FrameExtractionUnit]):
"""
Parameters:
----------
units : List[FrameExtractionUnit]
The list of frame extraction units.
"""
self.units = sorted(units)
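Example:
A minimal usage sketch pairing it with a sentence unit chunker; chunk (not shown in this section) is expected to return the neighboring sentences around the unit as a single string.
```python
from llm_ie.chunkers import SentenceUnitChunker, SlideWindowContextChunker

text = ("Sentence one. Sentence two. Sentence three. "
        "Sentence four. Sentence five.")
units = SentenceUnitChunker().chunk(text)

context_chunker = SlideWindowContextChunker(window_size=2)
context_chunker.fit(text, units)            # sorts and stores the units
context = context_chunker.chunk(units[2])   # expected: sentences around "Sentence three."
```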