Extractors API

Frame Extractors

llm_ie.extractors.DirectFrameExtractor

DirectFrameExtractor(
    inference_engine: InferenceEngine,
    unit_chunker: UnitChunker,
    prompt_template: str,
    system_prompt: str = None,
    context_chunker: ContextChunker = None,
)

Bases: FrameExtractor

This class is for general unit-context frame extraction. Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

Parameters:

inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. unit_chunker : UnitChunker the unit chunker object that determines how to chunk the document text into units. prompt_template : str prompt template with "{{}}" placeholder. system_prompt : str, Optional system prompt. context_chunker : ContextChunker the context chunker object that determines how to get context for each unit.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker, 
             prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None):
    """
    This class is for general unit-context frame extraction.
    Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    unit_chunker : UnitChunker
        the unit chunker object that determines how to chunk the document text into units.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    system_prompt : str, Optional
        system prompt.
    context_chunker : ContextChunker
        the context chunker object that determines how to get context for each unit.
    """
    super().__init__(inference_engine=inference_engine,
                     unit_chunker=unit_chunker,
                     prompt_template=prompt_template,
                     system_prompt=system_prompt,
                     context_chunker=context_chunker)

extract

extract(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
    verbose: bool = False,
    return_messages_log: bool = False,
) -> List[FrameExtractionUnitResult]

This method inputs a text and outputs a list of outputs per unit.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored. verbose : bool, Optional if True, LLM generated text will be printed in terminal in real-time. return_messages_log : bool, Optional if True, a list of messages will be returned.

Return : List[FrameExtractionUnitResult] the output from LLM for each unit. Contains the start, end, text, and generated text.

Source code in package/llm-ie/src/llm_ie/extractors.py

def extract(self, text_content:Union[str, Dict[str,str]], 
            document_key:str=None, verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
    """
    This method inputs a text and outputs a list of outputs per unit.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template. 
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is. 
        If text_content is str, this parameter will be ignored.
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time. 
    return_messages_log : bool, Optional
        if True, a list of messages will be returned.

    Return : List[FrameExtractionUnitResult]
        the output from LLM for each unit. Contains the start, end, text, and generated text.
    """
    # define output
    output = []
    # unit chunking
    if isinstance(text_content, str):
        doc_text = text_content

    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        doc_text = text_content[document_key]

    units = self.unit_chunker.chunk(doc_text)
    # context chunker init
    self.context_chunker.fit(doc_text, units)
    # messages log
    if return_messages_log:
        messages_log = []

    # generate unit by unit
    for i, unit in enumerate(units):
        # construct chat messages
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context = self.context_chunker.chunk(unit)

        if context == "":
            # no context, just place unit in user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
            else:
                unit_content = text_content.copy()
                unit_content[document_key] = unit.text
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
        else:
            # insert context to user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
            else:
                context_content = text_content.copy()
                context_content[document_key] = context
                messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})

        if verbose:
            print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
            if context != "":
                print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")

            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")


        gen_text = self.inference_engine.chat(
                        messages=messages, 
                        verbose=verbose,
                        stream=False
                    )

        if return_messages_log:
            messages.append({"role": "assistant", "content": gen_text})
            messages_log.append(messages)

        # add to output
        result = FrameExtractionUnitResult(
                        start=unit.start,
                        end=unit.end,
                        text=unit.text,
                        gen_text=gen_text)
        output.append(result)

    if return_messages_log:
        return output, messages_log

    return output

stream

stream(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
) -> Generator[
    Dict[str, Any], None, List[FrameExtractionUnitResult]
]

Streams LLM responses per unit with structured event types, and returns collected data for post-processing.

Yields:

Dict[str, Any]: (type, data) - {"type": "info", "data": str_message}: General informational messages. - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'} - {"type": "context", "data": str_context}: Context string for the current unit. - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM. - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.

Returns:

List[FrameExtractionUnitResult]: A list of FrameExtractionUnitResult objects, each containing the original unit details and the fully accumulated 'gen_text' from the LLM.

Source code in package/llm-ie/src/llm_ie/extractors.py

def stream(self, text_content: Union[str, Dict[str, str]], 
           document_key: str = None) -> Generator[Dict[str, Any], None, List[FrameExtractionUnitResult]]:
    """
    Streams LLM responses per unit with structured event types,
    and returns collected data for post-processing.

    Yields:
    -------
    Dict[str, Any]: (type, data)
        - {"type": "info", "data": str_message}: General informational messages.
        - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
        - {"type": "context", "data": str_context}: Context string for the current unit.
        - {"type": "reasoning", "data": str_chunk}: A reasoning model thinking chunk from the LLM.
        - {"type": "response", "data": str_chunk}: A response/answer chunk from the LLM.

    Returns:
    --------
    List[FrameExtractionUnitResult]:
        A list of FrameExtractionUnitResult objects, each containing the
        original unit details and the fully accumulated 'gen_text' from the LLM.
    """
    collected_results: List[FrameExtractionUnitResult] = []

    if isinstance(text_content, str):
        doc_text = text_content
    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        if document_key not in text_content:
            raise ValueError(f"document_key '{document_key}' not found in text_content.")
        doc_text = text_content[document_key]
    else:
        raise TypeError("text_content must be a string or a dictionary.")

    units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
    self.context_chunker.fit(doc_text, units)

    yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}

    for i, unit in enumerate(units):
        unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
        yield {"type": "unit", "data": unit_info_payload}

        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context_str = self.context_chunker.chunk(unit)

        # Construct prompt input based on whether text_content was str or dict
        if context_str:
            yield {"type": "context", "data": context_str}
            prompt_input_for_context = context_str
            if isinstance(text_content, dict):
                context_content_dict = text_content.copy()
                context_content_dict[document_key] = context_str
                prompt_input_for_context = context_content_dict
            messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_context)})
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            messages.append({'role': 'user', 'content': unit.text})
        else: # No context
            prompt_input_for_unit = unit.text
            if isinstance(text_content, dict):
                unit_content_dict = text_content.copy()
                unit_content_dict[document_key] = unit.text
                prompt_input_for_unit = unit_content_dict
            messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_unit)})

        current_gen_text = ""

        response_stream = self.inference_engine.chat(
            messages=messages,
            stream=True
        )
        for chunk in response_stream:
            yield chunk
            current_gen_text += chunk

        # Store the result for this unit
        result_for_unit = FrameExtractionUnitResult(
            start=unit.start,
            end=unit.end,
            text=unit.text,
            gen_text=current_gen_text
        )
        collected_results.append(result_for_unit)

    yield {"type": "info", "data": "All units processed by LLM."}
    return collected_results

extract_async `async`

extract_async(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
    concurrent_batch_size: int = 32,
    return_messages_log: bool = False,
) -> List[FrameExtractionUnitResult]

This is the asynchronous version of the extract() method.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored. concurrent_batch_size : int, Optional the batch size for concurrent processing. return_messages_log : bool, Optional if True, a list of messages will be returned.

Return : List[FrameExtractionUnitResult] the output from LLM for each unit. Contains the start, end, text, and generated text.

Source code in package/llm-ie/src/llm_ie/extractors.py

async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None, 
                        concurrent_batch_size:int=32, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
    """
    This is the asynchronous version of the extract() method.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template. 
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is. 
        If text_content is str, this parameter will be ignored.
    concurrent_batch_size : int, Optional
        the batch size for concurrent processing. 
    return_messages_log : bool, Optional
        if True, a list of messages will be returned.

    Return : List[FrameExtractionUnitResult]
        the output from LLM for each unit. Contains the start, end, text, and generated text.
    """
    if isinstance(text_content, str):
        doc_text = text_content
    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        if document_key not in text_content:
             raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
        doc_text = text_content[document_key]
    else:
        raise TypeError("text_content must be a string or a dictionary.")

    units = self.unit_chunker.chunk(doc_text)

    # context chunker init 
    self.context_chunker.fit(doc_text, units)

    # Prepare inputs for all units first
    tasks_input = []
    for i, unit in enumerate(units):
        # construct chat messages
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context = self.context_chunker.chunk(unit)

        if context == "":
             # no context, just place unit in user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
            else:
                unit_content = text_content.copy()
                unit_content[document_key] = unit.text
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
        else:
            # insert context to user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
            else:
                context_content = text_content.copy()
                context_content[document_key] = context
                messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})

        # Store unit and messages together for the task
        tasks_input.append({"unit": unit, "messages": messages, "original_index": i})

    # Process units concurrently with asyncio.Semaphore
    semaphore = asyncio.Semaphore(concurrent_batch_size)

    async def semaphore_helper(task_data: Dict, **kwrs):
        unit = task_data["unit"]
        messages = task_data["messages"]
        original_index = task_data["original_index"]

        async with semaphore:
            gen_text = await self.inference_engine.chat_async(
                messages=messages
            )
        return {"original_index": original_index, "unit": unit, "gen_text": gen_text, "messages": messages}

    # Create and gather tasks
    tasks = []
    for task_inp in tasks_input:
        task = asyncio.create_task(semaphore_helper(
            task_inp
        ))
        tasks.append(task)

    results_raw = await asyncio.gather(*tasks)

    # Sort results back into original order using the index stored
    results_raw.sort(key=lambda x: x["original_index"])

    # Restructure the results
    output: List[FrameExtractionUnitResult] = []
    messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None

    for result_data in results_raw:
        unit = result_data["unit"]
        gen_text = result_data["gen_text"]

        # Create result object
        result = FrameExtractionUnitResult(
            start=unit.start,
            end=unit.end,
            text=unit.text,
            gen_text=gen_text
        )
        output.append(result)

        # Append to messages log if requested
        if return_messages_log:
            final_messages = result_data["messages"] + [{"role": "assistant", "content": gen_text}]
            messages_log.append(final_messages)

    if return_messages_log:
        return output, messages_log
    else:
        return output

extract_frames

extract_frames(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
    verbose: bool = False,
    concurrent: bool = False,
    concurrent_batch_size: int = 32,
    case_sensitive: bool = False,
    fuzzy_match: bool = True,
    fuzzy_buffer_size: float = 0.2,
    fuzzy_score_cutoff: float = 0.8,
    allow_overlap_entities: bool = False,
    return_messages_log: bool = False,
) -> List[LLMInformationExtractionFrame]

This method inputs a text and outputs a list of LLMInformationExtractionFrame It use the extract() method and post-process outputs into frames.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored. verbose : bool, Optional if True, LLM generated text will be printed in terminal in real-time. concurrent : bool, Optional if True, the sentences will be extracted in concurrent. concurrent_batch_size : int, Optional the number of sentences to process in concurrent. Only used when concurrent is True. case_sensitive : bool, Optional if True, entity text matching will be case-sensitive. fuzzy_match : bool, Optional if True, fuzzy matching will be applied to find entity text. fuzzy_buffer_size : float, Optional the buffer size for fuzzy matching. Default is 20% of entity text length. fuzzy_score_cutoff : float, Optional the Jaccard score cutoff for fuzzy matching. Matched entity text must have a score higher than this value or a None will be returned. allow_overlap_entities : bool, Optional if True, entities can overlap in the text. Note that this can cause multiple frames to be generated on the same entity span if they have same entity text. return_messages_log : bool, Optional if True, a list of messages will be returned.

Return : str a list of frames.

Source code in package/llm-ie/src/llm_ie/extractors.py

def extract_frames(self, text_content:Union[str, Dict[str,str]], document_key:str=None, 
                   verbose:bool=False, concurrent:bool=False, concurrent_batch_size:int=32,
                    case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
                    allow_overlap_entities:bool=False, return_messages_log:bool=False) -> List[LLMInformationExtractionFrame]:
    """
    This method inputs a text and outputs a list of LLMInformationExtractionFrame
    It use the extract() method and post-process outputs into frames.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template. 
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is. 
        If text_content is str, this parameter will be ignored.
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time. 
    concurrent : bool, Optional
        if True, the sentences will be extracted in concurrent.
    concurrent_batch_size : int, Optional
        the number of sentences to process in concurrent. Only used when `concurrent` is True.
    case_sensitive : bool, Optional
        if True, entity text matching will be case-sensitive.
    fuzzy_match : bool, Optional
        if True, fuzzy matching will be applied to find entity text.
    fuzzy_buffer_size : float, Optional
        the buffer size for fuzzy matching. Default is 20% of entity text length.
    fuzzy_score_cutoff : float, Optional
        the Jaccard score cutoff for fuzzy matching. 
        Matched entity text must have a score higher than this value or a None will be returned.
    allow_overlap_entities : bool, Optional
        if True, entities can overlap in the text. 
        Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
    return_messages_log : bool, Optional
        if True, a list of messages will be returned.

    Return : str
        a list of frames.
    """
    ENTITY_KEY = "entity_text"
    if concurrent:
        if verbose:
            warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)

        nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
        extraction_results = asyncio.run(self.extract_async(text_content=text_content, 
                                            document_key=document_key,
                                            concurrent_batch_size=concurrent_batch_size,
                                            return_messages_log=return_messages_log)
                                        )
    else:
        extraction_results = self.extract(text_content=text_content, 
                                            document_key=document_key,
                                            verbose=verbose,
                                            return_messages_log=return_messages_log)

    llm_output_results, messages_log = extraction_results if return_messages_log else (extraction_results, None)

    frame_list = []
    for res in llm_output_results:
        entity_json = []
        for entity in self._extract_json(gen_text=res.gen_text):
            if ENTITY_KEY in entity:
                entity_json.append(entity)
            else:
                warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)

        spans = self._find_entity_spans(text=res.text, 
                                        entities=[e[ENTITY_KEY] for e in entity_json], 
                                        case_sensitive=case_sensitive,
                                        fuzzy_match=fuzzy_match,
                                        fuzzy_buffer_size=fuzzy_buffer_size,
                                        fuzzy_score_cutoff=fuzzy_score_cutoff,
                                        allow_overlap_entities=allow_overlap_entities)
        for ent, span in zip(entity_json, spans):
            if span is not None:
                start, end = span
                entity_text = res.text[start:end]
                start += res.start
                end += res.start
                attr = {}
                if "attr" in ent and ent["attr"] is not None:
                    attr = ent["attr"]

                frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}", 
                            start=start,
                            end=end,
                            entity_text=entity_text,
                            attr=attr)
                frame_list.append(frame)

    if return_messages_log:
        return frame_list, messages_log
    return frame_list

llm_ie.extractors.ReviewFrameExtractor

ReviewFrameExtractor(
    unit_chunker: UnitChunker,
    context_chunker: ContextChunker,
    inference_engine: InferenceEngine,
    prompt_template: str,
    review_mode: str,
    review_prompt: str = None,
    system_prompt: str = None,
)

Bases: DirectFrameExtractor

This class add a review step after the DirectFrameExtractor. The Review process asks LLM to review its output and: 1. add more frames while keep current. This is efficient for boosting recall. 2. or, regenerate frames (add new and delete existing). Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

Parameters:

unit_chunker : UnitChunker the unit chunker object that determines how to chunk the document text into units. context_chunker : ContextChunker the context chunker object that determines how to get context for each unit. inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. prompt_template : str prompt template with "{{}}" placeholder. review_prompt : str: Optional the prompt text that ask LLM to review. Specify addition or revision in the instruction. if not provided, a default review prompt will be used. review_mode : str review mode. Must be one of {"addition", "revision"} addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate. system_prompt : str, Optional system prompt.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker, inference_engine:InferenceEngine, 
             prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None):
    """
    This class add a review step after the DirectFrameExtractor.
    The Review process asks LLM to review its output and:
        1. add more frames while keep current. This is efficient for boosting recall. 
        2. or, regenerate frames (add new and delete existing). 
    Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

    Parameters:
    ----------
    unit_chunker : UnitChunker
        the unit chunker object that determines how to chunk the document text into units.
    context_chunker : ContextChunker
        the context chunker object that determines how to get context for each unit.
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    review_prompt : str: Optional
        the prompt text that ask LLM to review. Specify addition or revision in the instruction.
        if not provided, a default review prompt will be used. 
    review_mode : str
        review mode. Must be one of {"addition", "revision"}
        addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
    system_prompt : str, Optional
        system prompt.
    """
    super().__init__(inference_engine=inference_engine, 
                     unit_chunker=unit_chunker, 
                     prompt_template=prompt_template, 
                     system_prompt=system_prompt, 
                     context_chunker=context_chunker)
    # check review mode
    if review_mode not in {"addition", "revision"}: 
        raise ValueError('review_mode must be one of {"addition", "revision"}.')
    self.review_mode = review_mode
    # assign review prompt
    if review_prompt:
        self.review_prompt = review_prompt
    else:
        self.review_prompt = None
        original_class_name = self.__class__.__name__

        current_class_name = original_class_name
        for current_class_in_mro in self.__class__.__mro__:
            if current_class_in_mro is object: 
                continue

            current_class_name = current_class_in_mro.__name__
            try:
                file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                    joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
                with open(file_path, 'r', encoding="utf-8") as f:
                    self.review_prompt = f.read()
            except FileNotFoundError:
                pass

            except Exception as e:
                warnings.warn(
                    f"Error attempting to read default review prompt for '{current_class_name}' "
                    f"from '{str(file_path)}': {e}. Trying next in MRO.",
                    UserWarning
                )
                continue 

    if self.review_prompt is None:
        raise ValueError(f"Cannot find review prompt for {self.__class__.__name__} in the package. Please provide a review_prompt.")

extract

extract(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
    verbose: bool = False,
    return_messages_log: bool = False,
) -> List[FrameExtractionUnitResult]

This method inputs a text and outputs a list of outputs per unit.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored. verbose : bool, Optional if True, LLM generated text will be printed in terminal in real-time. return_messages_log : bool, Optional if True, a list of messages will be returned.

Return : List[FrameExtractionUnitResult] the output from LLM for each unit. Contains the start, end, text, and generated text.

Source code in package/llm-ie/src/llm_ie/extractors.py

def extract(self, text_content:Union[str, Dict[str,str]], document_key:str=None, 
            verbose:bool=False, return_messages_log:bool=False) -> List[FrameExtractionUnitResult]:
    """
    This method inputs a text and outputs a list of outputs per unit.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template. 
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is. 
        If text_content is str, this parameter will be ignored.
    verbose : bool, Optional
        if True, LLM generated text will be printed in terminal in real-time. 
    return_messages_log : bool, Optional
        if True, a list of messages will be returned.

    Return : List[FrameExtractionUnitResult]
        the output from LLM for each unit. Contains the start, end, text, and generated text.
    """
    # define output
    output = []
    # unit chunking
    if isinstance(text_content, str):
        doc_text = text_content

    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        doc_text = text_content[document_key]

    units = self.unit_chunker.chunk(doc_text)
    # context chunker init
    self.context_chunker.fit(doc_text, units)
    # messages log
    if return_messages_log:
        messages_log = []

    # generate unit by unit
    for i, unit in enumerate(units):
        # <--- Initial generation step --->
        # construct chat messages
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context = self.context_chunker.chunk(unit)

        if context == "":
            # no context, just place unit in user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
            else:
                unit_content = text_content.copy()
                unit_content[document_key] = unit.text
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
        else:
            # insert context to user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
            else:
                context_content = text_content.copy()
                context_content[document_key] = context
                messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})

        if verbose:
            print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
            if context != "":
                print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")

            print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")


        initial = self.inference_engine.chat(
                        messages=messages, 
                        verbose=verbose,
                        stream=False
                    )

        if return_messages_log:
            messages.append({"role": "assistant", "content": initial})
            messages_log.append(messages)

        # <--- Review step --->
        if verbose:
            print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")

        messages.append({'role': 'assistant', 'content': initial})
        messages.append({'role': 'user', 'content': self.review_prompt})

        review = self.inference_engine.chat(
                        messages=messages, 
                        verbose=verbose,
                        stream=False
                    )

        # Output
        if self.review_mode == "revision":
            gen_text = review
        elif self.review_mode == "addition":
            gen_text = initial + '\n' + review

        if return_messages_log:
            messages.append({"role": "assistant", "content": review})
            messages_log.append(messages)

        # add to output
        result = FrameExtractionUnitResult(
                        start=unit.start,
                        end=unit.end,
                        text=unit.text,
                        gen_text=gen_text)
        output.append(result)

    if return_messages_log:
        return output, messages_log

    return output

stream

stream(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
) -> Generator[str, None, None]

This method inputs a text and outputs a list of outputs per unit.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored.

Return : List[FrameExtractionUnitResult] the output from LLM for each unit. Contains the start, end, text, and generated text.

Source code in package/llm-ie/src/llm_ie/extractors.py

def stream(self, text_content:Union[str, Dict[str,str]], document_key:str=None) -> Generator[str, None, None]:
    """
    This method inputs a text and outputs a list of outputs per unit.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template. 
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is. 
        If text_content is str, this parameter will be ignored.

    Return : List[FrameExtractionUnitResult]
        the output from LLM for each unit. Contains the start, end, text, and generated text.
    """
    # unit chunking
    if isinstance(text_content, str):
        doc_text = text_content

    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        doc_text = text_content[document_key]

    units = self.unit_chunker.chunk(doc_text)
    # context chunker init
    self.context_chunker.fit(doc_text, units)

    # generate unit by unit
    for i, unit in enumerate(units):
        # <--- Initial generation step --->
        # construct chat messages
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context = self.context_chunker.chunk(unit)

        if context == "":
            # no context, just place unit in user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
            else:
                unit_content = text_content.copy()
                unit_content[document_key] = unit.text
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
        else:
            # insert context to user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
            else:
                context_content = text_content.copy()
                context_content[document_key] = context
                messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})


        yield f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n"
        if context != "":
            yield f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n"

        yield f"{Fore.BLUE}Extraction:{Style.RESET_ALL}\n"

        response_stream = self.inference_engine.chat(
                        messages=messages, 
                        stream=True
                    )

        initial = ""
        for chunk in response_stream:
            initial += chunk
            yield chunk

        # <--- Review step --->
        yield f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}"

        messages.append({'role': 'assistant', 'content': initial})
        messages.append({'role': 'user', 'content': self.review_prompt})

        response_stream = self.inference_engine.chat(
                        messages=messages, 
                        stream=True
                    )

        for chunk in response_stream:
            yield chunk

extract_async `async`

extract_async(
    text_content: Union[str, Dict[str, str]],
    document_key: str = None,
    concurrent_batch_size: int = 32,
    return_messages_log: bool = False,
    **kwrs
) -> List[FrameExtractionUnitResult]

This is the asynchronous version of the extract() method with the review step.

Parameters:

text_content : Union[str, Dict[str,str]] the input text content to put in prompt template. If str, the prompt template must has only 1 placeholder {{}}, regardless of placeholder name. If dict, all the keys must be included in the prompt template placeholder {{}}. document_key : str, Optional specify the key in text_content where document text is. If text_content is str, this parameter will be ignored. concurrent_batch_size : int, Optional the batch size for concurrent processing. return_messages_log : bool, Optional if True, a list of messages will be returned, including review steps.

Return : List[FrameExtractionUnitResult] the output from LLM for each unit after review. Contains the start, end, text, and generated text.

Source code in package/llm-ie/src/llm_ie/extractors.py

async def extract_async(self, text_content:Union[str, Dict[str,str]], document_key:str=None,
                        concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
    """
    This is the asynchronous version of the extract() method with the review step.

    Parameters:
    ----------
    text_content : Union[str, Dict[str,str]]
        the input text content to put in prompt template.
        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
    document_key : str, Optional
        specify the key in text_content where document text is.
        If text_content is str, this parameter will be ignored.
    concurrent_batch_size : int, Optional
        the batch size for concurrent processing.
    return_messages_log : bool, Optional
        if True, a list of messages will be returned, including review steps.

    Return : List[FrameExtractionUnitResult]
        the output from LLM for each unit after review. Contains the start, end, text, and generated text.
    """
    if isinstance(text_content, str):
        doc_text = text_content
    elif isinstance(text_content, dict):
        if document_key is None:
            raise ValueError("document_key must be provided when text_content is dict.")
        if document_key not in text_content:
             raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
        doc_text = text_content[document_key]
    else:
        raise TypeError("text_content must be a string or a dictionary.")

    units = self.unit_chunker.chunk(doc_text)

    # context chunker init
    self.context_chunker.fit(doc_text, units)

    # <--- Initial generation step --->
    initial_tasks_input = []
    for i, unit in enumerate(units):
        # construct chat messages for initial generation
        messages = []
        if self.system_prompt:
            messages.append({'role': 'system', 'content': self.system_prompt})

        context = self.context_chunker.chunk(unit)

        if context == "":
             # no context, just place unit in user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
            else:
                unit_content = text_content.copy()
                unit_content[document_key] = unit.text
                messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
        else:
            # insert context to user prompt
            if isinstance(text_content, str):
                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
            else:
                context_content = text_content.copy()
                context_content[document_key] = context
                messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
            # simulate conversation where assistant confirms
            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
            # place unit of interest
            messages.append({'role': 'user', 'content': unit.text})

        # Store unit and messages together for the initial task
        initial_tasks_input.append({"unit": unit, "messages": messages, "original_index": i})

    semaphore = asyncio.Semaphore(concurrent_batch_size)

    async def initial_semaphore_helper(task_data: Dict):
        unit = task_data["unit"]
        messages = task_data["messages"]
        original_index = task_data["original_index"]

        async with semaphore:
            gen_text = await self.inference_engine.chat_async(
                messages=messages
            )
        # Return initial generation result along with the messages used and the unit
        return {"original_index": original_index, "unit": unit, "initial_gen_text": gen_text, "initial_messages": messages}

    # Create and gather initial generation tasks
    initial_tasks = [
        asyncio.create_task(initial_semaphore_helper(
            task_inp
        ))
        for task_inp in initial_tasks_input
    ]

    initial_results_raw = await asyncio.gather(*initial_tasks)

    # Sort initial results back into original order
    initial_results_raw.sort(key=lambda x: x["original_index"])

    # <--- Review step --->
    review_tasks_input = []
    for result_data in initial_results_raw:
        # Prepare messages for the review step
        initial_messages = result_data["initial_messages"]
        initial_gen_text = result_data["initial_gen_text"]
        review_messages = initial_messages + [
            {'role': 'assistant', 'content': initial_gen_text},
            {'role': 'user', 'content': self.review_prompt}
        ]
        # Store data needed for review task
        review_tasks_input.append({
            "unit": result_data["unit"],
            "initial_gen_text": initial_gen_text,
            "messages": review_messages, 
            "original_index": result_data["original_index"],
            "full_initial_log": initial_messages + [{'role': 'assistant', 'content': initial_gen_text}] if return_messages_log else None # Log up to initial generation
        })


    async def review_semaphore_helper(task_data: Dict, **kwrs):
        messages = task_data["messages"] 
        original_index = task_data["original_index"]

        async with semaphore:
            review_gen_text = await self.inference_engine.chat_async(
                messages=messages
            )
        # Combine initial and review results
        task_data["review_gen_text"] = review_gen_text
        if return_messages_log:
            # Log for the review call itself
             task_data["full_review_log"] = messages + [{'role': 'assistant', 'content': review_gen_text}]
        return task_data # Return the augmented dictionary

    # Create and gather review tasks
    review_tasks = [
         asyncio.create_task(review_semaphore_helper(
            task_inp
        ))
       for task_inp in review_tasks_input
    ]

    final_results_raw = await asyncio.gather(*review_tasks)

    # Sort final results back into original order (although gather might preserve order for tasks added sequentially)
    final_results_raw.sort(key=lambda x: x["original_index"])

    # <--- Process final results --->
    output: List[FrameExtractionUnitResult] = []
    messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None

    for result_data in final_results_raw:
        unit = result_data["unit"]
        initial_gen = result_data["initial_gen_text"]
        review_gen = result_data["review_gen_text"]

        # Combine based on review mode
        if self.review_mode == "revision":
            final_gen_text = review_gen
        elif self.review_mode == "addition":
            final_gen_text = initial_gen + '\n' + review_gen
        else: # Should not happen due to init check
            final_gen_text = review_gen # Default to revision if mode is somehow invalid

        # Create final result object
        result = FrameExtractionUnitResult(
            start=unit.start,
            end=unit.end,
            text=unit.text,
            gen_text=final_gen_text # Use the combined/reviewed text
        )
        output.append(result)

        # Append full conversation log if requested
        if return_messages_log:
            full_log_for_unit = result_data.get("full_initial_log", []) + [{'role': 'user', 'content': self.review_prompt}] + [{'role': 'assistant', 'content': review_gen}]
            messages_log.append(full_log_for_unit)

    if return_messages_log:
        return output, messages_log
    else:
        return output

Convenience Frame Extractors

llm_ie.extractors.BasicFrameExtractor

BasicFrameExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    system_prompt: str = None,
)

Bases: DirectFrameExtractor

This class diretly prompt LLM for frame extraction. Input system prompt (optional), prompt template (with instruction, few-shot examples), and specify a LLM.

Parameters:

inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. prompt_template : str prompt template with "{{}}" placeholder. system_prompt : str, Optional system prompt.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None):
    """
    This class diretly prompt LLM for frame extraction.
    Input system prompt (optional), prompt template (with instruction, few-shot examples), 
    and specify a LLM.

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    system_prompt : str, Optional
        system prompt.
    """
    super().__init__(inference_engine=inference_engine, 
                     unit_chunker=WholeDocumentUnitChunker(),
                     prompt_template=prompt_template, 
                     system_prompt=system_prompt, 
                     context_chunker=NoContextChunker())

llm_ie.extractors.SentenceFrameExtractor

SentenceFrameExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    system_prompt: str = None,
    context_sentences: Union[str, int] = "all",
)

Bases: DirectFrameExtractor

This class performs sentence-by-sentence information extraction. The process is as follows: 1. system prompt (optional) 2. user prompt with instructions (schema, background, full text, few-shot example...) 3. feed a sentence (start with first sentence) 4. LLM extract entities and attributes from the sentence 5. iterate to the next sentence and repeat steps 3-4 until all sentences are processed.

Input system prompt (optional), prompt template (with user instructions), and specify a LLM.

Parameters:

inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. prompt_template : str prompt template with "{{}}" placeholder. system_prompt : str, Optional system prompt. context_sentences : Union[str, int], Optional number of sentences before and after the given sentence to provide additional context. if "all", the full text will be provided in the prompt as context. if 0, no additional context will be provided. This is good for tasks that does not require context beyond the given sentence. if > 0, the number of sentences before and after the given sentence to provide as context. This is good for tasks that require context beyond the given sentence.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
             context_sentences:Union[str, int]="all"):
    """
    This class performs sentence-by-sentence information extraction.
    The process is as follows:
        1. system prompt (optional)
        2. user prompt with instructions (schema, background, full text, few-shot example...)
        3. feed a sentence (start with first sentence)
        4. LLM extract entities and attributes from the sentence
        5. iterate to the next sentence and repeat steps 3-4 until all sentences are processed.

    Input system prompt (optional), prompt template (with user instructions), 
    and specify a LLM.

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    system_prompt : str, Optional
        system prompt.
    context_sentences : Union[str, int], Optional
        number of sentences before and after the given sentence to provide additional context. 
        if "all", the full text will be provided in the prompt as context. 
        if 0, no additional context will be provided.
            This is good for tasks that does not require context beyond the given sentence. 
        if > 0, the number of sentences before and after the given sentence to provide as context.
            This is good for tasks that require context beyond the given sentence. 
    """
    if not isinstance(context_sentences, int) and context_sentences != "all":
        raise ValueError('context_sentences must be an integer (>= 0) or "all".')

    if isinstance(context_sentences, int) and context_sentences < 0:
        raise ValueError("context_sentences must be a positive integer.")

    if isinstance(context_sentences, int):
        context_chunker = SlideWindowContextChunker(window_size=context_sentences)
    elif context_sentences == "all":
        context_chunker = WholeDocumentContextChunker()

    super().__init__(inference_engine=inference_engine, 
                     unit_chunker=SentenceUnitChunker(),
                     prompt_template=prompt_template, 
                     system_prompt=system_prompt, 
                     context_chunker=context_chunker)

llm_ie.extractors.BasicReviewFrameExtractor

BasicReviewFrameExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    review_mode: str,
    review_prompt: str = None,
    system_prompt: str = None,
)

Bases: ReviewFrameExtractor

This class add a review step after the BasicFrameExtractor. The Review process asks LLM to review its output and: 1. add more frames while keep current. This is efficient for boosting recall. 2. or, regenerate frames (add new and delete existing). Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

Parameters:

inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. prompt_template : str prompt template with "{{}}" placeholder. review_prompt : str: Optional the prompt text that ask LLM to review. Specify addition or revision in the instruction. if not provided, a default review prompt will be used. review_mode : str review mode. Must be one of {"addition", "revision"} addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate. system_prompt : str, Optional system prompt.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None):
    """
    This class add a review step after the BasicFrameExtractor.
    The Review process asks LLM to review its output and:
        1. add more frames while keep current. This is efficient for boosting recall. 
        2. or, regenerate frames (add new and delete existing). 
    Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    review_prompt : str: Optional
        the prompt text that ask LLM to review. Specify addition or revision in the instruction.
        if not provided, a default review prompt will be used. 
    review_mode : str
        review mode. Must be one of {"addition", "revision"}
        addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
    system_prompt : str, Optional
        system prompt.
    """
    super().__init__(inference_engine=inference_engine, 
                     unit_chunker=WholeDocumentUnitChunker(),
                     prompt_template=prompt_template, 
                     review_mode=review_mode,
                     review_prompt=review_prompt,
                     system_prompt=system_prompt, 
                     context_chunker=NoContextChunker())

llm_ie.extractors.SentenceReviewFrameExtractor

SentenceReviewFrameExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    review_mode: str,
    review_prompt: str = None,
    system_prompt: str = None,
    context_sentences: Union[str, int] = "all",
)

Bases: ReviewFrameExtractor

This class adds a review step after the SentenceFrameExtractor. For each sentence, the review process asks LLM to review its output and: 1. add more frames while keeping current. This is efficient for boosting recall. 2. or, regenerate frames (add new and delete existing). Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

Parameters:

inference_engine : InferenceEngine the LLM inferencing engine object. Must implements the chat() method. prompt_template : str prompt template with "{{}}" placeholder. review_prompt : str: Optional the prompt text that ask LLM to review. Specify addition or revision in the instruction. if not provided, a default review prompt will be used. review_mode : str review mode. Must be one of {"addition", "revision"} addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate. system_prompt : str, Optional system prompt. context_sentences : Union[str, int], Optional number of sentences before and after the given sentence to provide additional context. if "all", the full text will be provided in the prompt as context. if 0, no additional context will be provided. This is good for tasks that does not require context beyond the given sentence. if > 0, the number of sentences before and after the given sentence to provide as context. This is good for tasks that require context beyond the given sentence.

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str,  
             review_mode:str, review_prompt:str=None, system_prompt:str=None,
             context_sentences:Union[str, int]="all"):
    """
    This class adds a review step after the SentenceFrameExtractor.
    For each sentence, the review process asks LLM to review its output and:
        1. add more frames while keeping current. This is efficient for boosting recall. 
        2. or, regenerate frames (add new and delete existing). 
    Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.

    Parameters:
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    review_prompt : str: Optional
        the prompt text that ask LLM to review. Specify addition or revision in the instruction.
        if not provided, a default review prompt will be used. 
    review_mode : str
        review mode. Must be one of {"addition", "revision"}
        addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
    system_prompt : str, Optional
        system prompt.
    context_sentences : Union[str, int], Optional
        number of sentences before and after the given sentence to provide additional context. 
        if "all", the full text will be provided in the prompt as context. 
        if 0, no additional context will be provided.
            This is good for tasks that does not require context beyond the given sentence. 
        if > 0, the number of sentences before and after the given sentence to provide as context.
            This is good for tasks that require context beyond the given sentence. 
    """
    if not isinstance(context_sentences, int) and context_sentences != "all":
        raise ValueError('context_sentences must be an integer (>= 0) or "all".')

    if isinstance(context_sentences, int) and context_sentences < 0:
        raise ValueError("context_sentences must be a positive integer.")

    if isinstance(context_sentences, int):
        context_chunker = SlideWindowContextChunker(window_size=context_sentences)
    elif context_sentences == "all":
        context_chunker = WholeDocumentContextChunker()

    super().__init__(inference_engine=inference_engine, 
                     unit_chunker=SentenceUnitChunker(),
                     prompt_template=prompt_template,
                     review_mode=review_mode,
                     review_prompt=review_prompt, 
                     system_prompt=system_prompt, 
                     context_chunker=context_chunker)

Relation Extractors

llm_ie.extractors.BinaryRelationExtractor

BinaryRelationExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    possible_relation_func: Callable,
    system_prompt: str = None,
)

Bases: RelationExtractor

This class extracts binary (yes/no) relations between two entities. Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

Parameters:

Name	Type	Description	Default
`inference_engine`	`InferenceEngine`	the LLM inferencing engine object. Must implements the chat() method.	required
`prompt_template`	`str`	prompt template with "{{}}" placeholder.	required
`possible_relation_func`	`(Callable, Optional)`	a function that inputs 2 frames and returns a bool indicating possible relations between them.	required
`system_prompt`	`(str, Optional)`	system prompt.	`None`

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_func: Callable,
             system_prompt:str=None):
    """
    This class extracts binary (yes/no) relations between two entities.
    Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

    Parameters
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    possible_relation_func : Callable, Optional
        a function that inputs 2 frames and returns a bool indicating possible relations between them.
    system_prompt : str, Optional
        system prompt.
    """
    super().__init__(inference_engine, prompt_template, system_prompt)
    if not callable(possible_relation_func):
        raise TypeError(f"Expect possible_relation_func as a function, received {type(possible_relation_func)} instead.")

    sig = inspect.signature(possible_relation_func)
    if len(sig.parameters) != 2:
        raise ValueError("The possible_relation_func must have exactly two parameters.")

    if sig.return_annotation not in {bool, inspect.Signature.empty}:
        warnings.warn(f"Expected possible_relation_func return annotation to be bool, but got {sig.return_annotation}.")

    self.possible_relation_func = possible_relation_func

llm_ie.extractors.MultiClassRelationExtractor

MultiClassRelationExtractor(
    inference_engine: InferenceEngine,
    prompt_template: str,
    possible_relation_types_func: Callable,
    system_prompt: str = None,
)

Bases: RelationExtractor

This class extracts relations with relation types. Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

Parameters:

Name	Type	Description	Default
`inference_engine`	`InferenceEngine`	the LLM inferencing engine object. Must implements the chat() method.	required
`prompt_template`	`str`	prompt template with "{{}}" placeholder.	required
`possible_relation_types_func`	`Callable`	a function that inputs 2 frames and returns a List of possible relation types between them. If the two frames must not have relations, this function should return an empty list [].	required
`system_prompt`	`(str, Optional)`	system prompt.	`None`

Source code in package/llm-ie/src/llm_ie/extractors.py

def __init__(self, inference_engine:InferenceEngine, prompt_template:str, possible_relation_types_func: Callable, 
             system_prompt:str=None):
    """
    This class extracts relations with relation types.
    Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).

    Parameters
    ----------
    inference_engine : InferenceEngine
        the LLM inferencing engine object. Must implements the chat() method.
    prompt_template : str
        prompt template with "{{<placeholder name>}}" placeholder.
    possible_relation_types_func : Callable
        a function that inputs 2 frames and returns a List of possible relation types between them. 
        If the two frames must not have relations, this function should return an empty list [].
    system_prompt : str, Optional
        system prompt.
    """
    super().__init__(inference_engine=inference_engine,
                     prompt_template=prompt_template,
                     system_prompt=system_prompt)

    if possible_relation_types_func:
        # Check if possible_relation_types_func is a function
        if not callable(possible_relation_types_func):
            raise TypeError(f"Expect possible_relation_types_func as a function, received {type(possible_relation_types_func)} instead.")

        sig = inspect.signature(possible_relation_types_func)
        # Check if frame_1, frame_2 are in input parameters
        if len(sig.parameters) != 2:
            raise ValueError("The possible_relation_types_func must have exactly frame_1 and frame_2 as parameters.")
        if "frame_1" not in sig.parameters.keys():
            raise ValueError("The possible_relation_types_func is missing frame_1 as a parameter.")
        if "frame_2" not in sig.parameters.keys():
            raise ValueError("The possible_relation_types_func is missing frame_2 as a parameter.")
        # Check if output is a List
        if sig.return_annotation not in {inspect._empty, List, List[str]}:
            raise ValueError(f"Expect possible_relation_types_func to output a List of string, current type hint suggests {sig.return_annotation} instead.")

        self.possible_relation_types_func = possible_relation_types_func

Extractors API

Frame Extractors

llm_ie.extractors.DirectFrameExtractor

extract

stream

extract_async async

extract_frames

llm_ie.extractors.ReviewFrameExtractor

extract

stream

extract_async async

Convenience Frame Extractors

llm_ie.extractors.BasicFrameExtractor

llm_ie.extractors.SentenceFrameExtractor

llm_ie.extractors.BasicReviewFrameExtractor

llm_ie.extractors.SentenceReviewFrameExtractor

Relation Extractors

llm_ie.extractors.BinaryRelationExtractor

llm_ie.extractors.MultiClassRelationExtractor

extract_async `async`

extract_async `async`