Source code for kor.encoders.json_data

"""JSON encoder and decoder."""
import json
from typing import Any

from kor.exceptions import ParseError

from .typedefs import Encoder
from .utils import unwrap_tag, wrap_in_tag


class JSONEncoder(Encoder):
    """JSON encoder and decoder.

    The encoder by default adds additional <json> tags around the JSON output.
    Additional tags are added to the output to help identify the JSON content
    within the LLM response and extract it.

    The usage of <json> tags is similar to the usage of ```JSON and ``` marks.

    Examples:

    .. code-block:: python

        from kor import JSONEncoder

        json_encoder = JSONEncoder(use_tags=True)
        data = {"name": "Café"}
        json_encoder.encode(data)
        # '<json>{"name": "Café"}</json>'

        json_encoder = JSONEncoder(use_tags=True, ensure_ascii=True)
        data = {"name": "Café"}
        json_encoder.encode(data)
        # '<json>{"name": "Caf\\u00e9"}</json>'
    """

    def __init__(self, use_tags: bool = True, ensure_ascii: bool = False) -> None:
        """Initialize the JSON encoder.

        Args:
            use_tags: Whether to wrap the output in special JSON tags.
                This may help identify the JSON content in cases when
                the model attempts to add clarifying explanations.
            ensure_ascii: Whether to escape non-ASCII characters.
                Default is False to preserve non-ASCII characters, as that
                is a more sensible behavior for the extraction use cases.
        """
        self.use_tags = use_tags
        self.ensure_ascii = ensure_ascii

    def encode(self, data: Any) -> str:
        """Encode the data as JSON.

        Args:
            data: JSON serializable data.

        Returns:
            The JSON encoded data as a string, optionally wrapped in <json> tags.
        """
        # Serialize exactly once and honor ``ensure_ascii`` on both the tagged
        # and the untagged path, so the configured escaping behavior does not
        # depend on whether tags are used.
        content = json.dumps(data, ensure_ascii=self.ensure_ascii)
        if self.use_tags:
            return wrap_in_tag("json", content)
        return content

    def decode(self, text: str) -> Any:
        """Decode the text as JSON.

        If the encoder is using tags, the <json> content is identified within
        the text and then is decoded.

        Args:
            text: the text to be decoded

        Returns:
            The decoded JSON data. An empty dict is returned when
            ``unwrap_tag`` yields ``None`` for the given text.

        Raises:
            ParseError: if the content is not valid JSON.
        """
        content = unwrap_tag("json", text) if self.use_tags else text

        if content is None:
            # Nothing to decode; fall back to an empty result instead of failing.
            return {}

        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            # Chain explicitly so the original decoding error stays attached
            # as the cause of the ParseError.
            raise ParseError(e) from e

    def get_instruction_segment(self) -> str:
        """Get the format instructions for the given decoder.

        This is a specification to the LLM that tells it how to shape its
        response so that the response can be structured properly using the
        given decoder.

        Returns:
            The instruction text; when ``use_tags`` is set, it also asks for
            the JSON to be wrapped in <json> tags.
        """
        format_instructions = (
            "Please output the extracted information in JSON format. Do not output"
            " anything except for the extracted information. Do not add any clarifying"
            " information. Do not add any fields that are not in the schema. If the"
            " text contains attributes that do not appear in the schema, please ignore"
            " them. All output must be in JSON format and follow the schema specified"
            " above."
        )

        if self.use_tags:
            format_instructions += " Wrap the JSON in <json> tags."

        return format_instructions