{ "cells": [ { "cell_type": "markdown", "id": "6fcfbb4c-e91d-46aa-8653-38f094146c0c", "metadata": {}, "source": [ "# Custom Prompt\n", "\n", "Here, we'll see how to customize the instruction segment of the **Kor** prompt." ] }, { "cell_type": "code", "execution_count": 1, "id": "5419b2f8-1c28-497d-9bf0-a046d21a93d9", "metadata": { "nbsphinx": "hidden", "tags": [ "remove-cell" ] }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import sys\n", "import pprint\n", "\n", "sys.path.insert(0, \"../../\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "6dfe32e7-e80c-46d8-b173-eb9fe7d66e87", "metadata": { "tags": [] }, "outputs": [], "source": [ "from kor.extraction import create_extraction_chain\n", "from kor.nodes import Object, Text, Number\n", "from langchain.chat_models import ChatOpenAI\n", "from langchain.prompts import PromptTemplate" ] }, { "cell_type": "code", "execution_count": 3, "id": "96ed58c4-45d2-4236-a6a5-8b894f4b601c", "metadata": { "tags": [] }, "outputs": [], "source": [ "llm = ChatOpenAI(\n", " model_name=\"gpt-3.5-turbo\",\n", " temperature=0,\n", ")\n", "\n", "schema = Object(\n", " id=\"person\",\n", " description=\"Personal information\",\n", " examples=[\n", " (\"Alice and Bob are friends\", [{\"first_name\": \"Alice\"}, {\"first_name\": \"Bob\"}])\n", " ],\n", " attributes=[\n", " Text(\n", " id=\"first_name\",\n", " description=\"The first name of a person.\",\n", " )\n", " ],\n", " many=True,\n", ")" ] }, { "cell_type": "markdown", "id": "d4acce29-6720-40dc-b74e-86fcaf3db29d", "metadata": {}, "source": [ "## Create a template" ] }, { "cell_type": "markdown", "id": "0ee0a720-90de-4563-9d5e-76c663d6d5c7", "metadata": {}, "source": [ "Here we create an instruction template. \n", "\n", "The template accepts 2 optional parameters:\n", "\n", "1. `type_description` -- will be replaced with the schema type-descriptor.\n", "2. `format_instructions` -- will be replaced with the format instructions of whichever encoder is used." ] }, { "cell_type": "code", "execution_count": 4, "id": "ebc251dd-e7aa-4859-8385-ca32c7fd900c", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Pep talk for your LLM goes here]\n", "\n", "Add some type description\n", "\n", "```TypeScript\n", "\n", "person: Array<{ // Personal information\n", " first_name: string // The first name of a person.\n", "}>\n", "```\n", "\n", "\n", "Add some format instructions\n", "\n", "Please output the extracted information in CSV format in Excel dialect. Please use a | as the delimiter. \n", " Do NOT add any clarifying information. Output MUST follow the schema above. 
Do NOT add any additional columns that do not appear in the schema.\n", "Suffix here\n", "\n", "\n", "Input: Alice and Bob are friends\n", "Output: first_name\n", "Alice\n", "Bob\n", "\n", "Input: hello\n", "Output:\n" ] } ], "source": [ "instruction_template = PromptTemplate(\n", " input_variables=[\"format_instructions\", \"type_description\"],\n", " template=(\n", " \"[Pep talk for your LLM goes here]\\n\\n\"\n", " \"Add some type description\\n\\n\"\n", " \"{type_description}\\n\\n\" # Can comment out\n", " \"Add some format instructions\\n\\n\"\n", " \"{format_instructions}\\n\"\n", " \"Suffix here\\n\"\n", " ),\n", ")\n", "\n", "\n", "chain = create_extraction_chain(llm, schema, instruction_template=instruction_template)\n", "\n", "print(chain.prompt.format_prompt(text=\"hello\").to_string())" ] }, { "cell_type": "markdown", "id": "08581bc7-a134-40b7-a17b-ec8bd85de3f9", "metadata": {}, "source": [ "## Custom Encoder and TypeDescriptor" ] }, { "cell_type": "code", "execution_count": 5, "id": "e406eac6-da5b-4d13-8378-cee5cfbbf544", "metadata": { "tags": [] }, "outputs": [], "source": [ "from kor import JSONEncoder, TypeDescriptor" ] }, { "cell_type": "code", "execution_count": 6, "id": "65ae257b-983c-496c-87d2-8c59ff4f9fda", "metadata": { "tags": [] }, "outputs": [], "source": [ "class CatEncoder(JSONEncoder):\n", " def get_instruction_segment(self) -> str:\n", " return \"Encode your response as Cat JSON enclosed in <😼> tags.\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "86611975-41c8-41d4-9347-7e3f4ae5247f", "metadata": {}, "outputs": [], "source": [ "class CatType(TypeDescriptor):\n", " def describe(self, node: Object) -> str:\n", " \"\"\"Describe the schema of the node.\"\"\"\n", " return f\"A 😼 ate the schema of {type(node)} 😼\"" ] }, { "cell_type": "code", "execution_count": 8, "id": "9658425e-684a-46db-98e3-85dfda6ca97e", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Pep talk for your LLM goes here]\n", "\n", "Add some type description\n", "\n", "A 😼 ate the schema of <class 'kor.nodes.Object'> 😼\n", "\n", "Add some format instructions\n", "\n", "Encode your response as Cat JSON enclosed in <😼> tags.\n", "Suffix here\n", "\n", "\n", "Input: Alice and Bob are friends\n", "Output: {\"person\": [{\"first_name\": \"Alice\"}, {\"first_name\": \"Bob\"}]}\n", "Input: hello\n", "Output:\n" ] } ], "source": [ "instruction_template = PromptTemplate(\n", " input_variables=[\"format_instructions\", \"type_description\"],\n", " template=(\n", " \"[Pep talk for your LLM goes here]\\n\\n\"\n", " \"Add some type description\\n\\n\"\n", " \"{type_description}\\n\\n\" # Can comment out\n", " \"Add some format instructions\\n\\n\"\n", " \"{format_instructions}\\n\"\n", " \"Suffix here\\n\"\n", " ),\n", ")\n", "\n", "\n", "chain = create_extraction_chain(\n", " llm,\n", " schema,\n", " instruction_template=instruction_template,\n", " encoder_or_encoder_class=CatEncoder,\n", " type_descriptor=CatType(),\n", ")\n", "\n", "print(chain.prompt.format_prompt(text=\"hello\").to_string())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }