Source code for kor.extraction.parser
from __future__ import annotations
from typing import List, Optional
from langchain_core.output_parsers import BaseOutputParser
from pydantic import Extra
from kor.encoders import Encoder
from kor.exceptions import ParseError
from kor.extraction.typedefs import Extraction
from kor.nodes import Object
from kor.validators import Validator
[docs]class KorParser(BaseOutputParser[Extraction]):
"""A Kor langchain parser integration.
This parser can use any of Kor's encoders to support encoding/decoding
different data formats.
"""
encoder: Encoder
schema_: Object
validator: Optional[Validator] = None
@property
def _type(self) -> str:
"""Declare the type property."""
return "KorEncoder"
[docs] def parse(self, text: str) -> Extraction:
"""Parse the text."""
try:
data = self.encoder.decode(text)
except ParseError as e:
return {"data": {}, "raw": text, "errors": [e], "validated_data": {}}
key_id = self.schema_.id
errors: List[Exception]
if key_id not in data:
if data: # We got something parsed, but it doesn't match the schema.
errors = [
ParseError(
"The LLM has returned structured data which does not match the"
" expected schema. Providing additional examples may help"
" improve the parse."
)
]
else:
errors = []
return {"data": {}, "raw": text, "errors": errors, "validated_data": {}}
obj_data = data[key_id]
if self.validator:
validated_data, errors = self.validator.clean_data(obj_data)
else:
validated_data, errors = {}, []
return {
"data": data,
"raw": text,
"errors": errors,
"validated_data": validated_data,
}
[docs] class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
arbitrary_types_allowed = True