# Natural Language Based APIs

Being able to understand the content of text can help in tasks other than information extraction.

Here, we'll see how extracting information from text can help with powering a natural language based assistant 
that has different skills.

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.insert(0, "../../")

In [2]:
from langchain.chat_models import ChatOpenAI
from kor import create_extraction_chain, Object, Text, Number

In [3]:
# Chat model used by the extraction chains in this notebook; temperature 0
# keeps sampling as stable as the backend allows.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=2000)

## Control Music

Here's a hypothetical API for controlling music.

In [4]:
# Schema for the music-player skill: song / album / artist targets plus a
# playback action.
schema = Object(
    id="player",
    many=False,
    description=(
        "User is controlling a music player to select songs, pause or start them or play"
        " music by a particular artist."
    ),
    attributes=[
        # Songs and albums are declared without any examples.
        Text(
            id="song",
            many=True,
            description="User wants to play this song",
            examples=[],
        ),
        Text(
            id="album",
            many=True,
            description="User wants to play this album",
            examples=[],
        ),
        Text(
            id="artist",
            many=True,
            description="Music by the given artist",
            examples=[("Songs by paul simon", "paul simon")],
        ),
        # The description lists four actions; examples cover `stop`, `play`
        # and `next` only.
        Text(
            id="action",
            description="Action to take one of: `play`, `stop`, `next`, `previous`.",
            examples=[
                ("Please stop the music", "stop"),
                ("play something", "play"),
                ("play a song", "play"),
                ("next song", "next"),
            ],
        ),
    ],
)

**ATTENTION** Use the JSON encoder here rather than the default CSV encoder as it supports nested lists

In [5]:
# Build the extraction chain for the music-player schema, explicitly requesting
# the JSON encoder.
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")

## Music Player

In [6]:
chain.run("stop the music now")["data"]

{'player': {'action': 'stop'}}

In [7]:
chain.run("i want to hear a song")["data"]

{'player': {'action': 'play'}}

In [8]:
chain.run("can you play the lion king soundtrack")["data"]

{'player': {'album': ['the lion king soundtrack']}}

In [9]:
chain.run("play songs by paul simon and led zeppelin and the doors")

{'data': {'player': {'artist': ['paul simon', 'led zeppelin', 'the doors']}},
 'raw': '{"player": {"artist": ["paul simon", "led zeppelin", "the doors"]}}',
 'errors': [],
 'validated_data': {}}

In [10]:
chain.run("could you play the previous song again?")["data"]

{'player': {'action': 'previous'}}

## Ticket ordering

Here's an imaginary API for searching and buying tickets

In [11]:
# Schema for the ticket-search skill: sport, location and a price range.
schema = Object(
    id="action",
    description="User is looking for sports tickets",
    attributes=[
        # NOTE(review): some examples on `sport` and `location` return lists
        # although `many` is not set on these attributes — confirm whether
        # many=True was intended.
        Text(
            id="sport",
            description="which sports do you want to buy tickets for?",
            examples=[
                (
                    "I want to buy tickets to basketball and football games",
                    # Spelling fixed ("footbal" -> "football") so the expected
                    # output matches the word used in the example input.
                    ["basketball", "football"],
                )
            ],
        ),
        Text(
            id="location",
            description="where would you like to watch the game?",
            examples=[
                ("in boston", "boston"),
                ("in france or italy", ["france", "italy"]),
            ],
        ),
        # `price_range` declares no attributes; the keys (price_min, price_max,
        # currency) are conveyed through the examples only.
        Object(
            id="price_range",
            description="how much do you want to spend?",
            attributes=[],
            examples=[
                ("no more than $100", {"price_max": "100", "currency": "$"}),
                (
                    "between 50 and 100 dollars",
                    {"price_max": "100", "price_min": "50", "currency": "$"},
                ),
            ],
        ),
    ],
)

In [12]:
# Rebind `chain` to the ticket-search schema; cells below this point no longer
# use the music-player chain.
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")

In [13]:
chain.run("I want to buy tickets for a baseball game in LA area under $100")["data"]

{'action': {'sport': 'baseball',
 'location': 'LA area',
 'price_range': {'price_max': '100', 'currency': '$'}}}

In [14]:
chain.run(
 "I want to see a celtics game in boston somewhere between 20 and 40 dollars per ticket"
)["data"]

{'action': {'sport': 'celtics',
 'location': 'boston',
 'price_range': {'price_min': '20', 'price_max': '40', 'currency': '$'}}}

## Company Search

**ATTENTION** This is a demo that shows how to build a complex schema to run a company search that matches different criteria.

However, using this format for issuing database queries (e.g., by translating the JSON into SQL) will only work well for simple queries.

There's a better way to leverage LLMs to issue database queries, and support for that may be added to the package in the future.

In [15]:
# Company names mentioned explicitly in the query. An empty string is used for
# an example that names no specific company.
company_name = Text(
    id="company_name",
    many=True,
    description="what is the name of the company you want to find",
    examples=[
        ("Apple inc", "Apple inc"),
        ("largest 10 banks in the world", ""),
        # NOTE(review): comma-joined string rather than a list even though
        # many=True — confirm this is intended with the JSON encoder.
        ("microsoft and apple", "microsoft,apple"),
    ],
)

# Industry the requested companies belong to. The examples map descriptive
# phrases to an industry label, and use an empty string when the phrase carries
# no industry information.
industry_name = Text(
    id="industry_name",
    many=True,
    description="what is the name of the company's industry",
    examples=[
        ("companies in the steel manufacturing industry", "steel manufacturing"),
        ("large banks", "banking"),
        ("military companies", "defense"),
        ("chinese companies", ""),
        # Typo fixed: "cell" -> "sell".
        ("companies that sell cigars", "cigars"),
    ],
)

# Where the company is based. The bare word "italy" is given as a negative
# example (empty extraction).
geography_name = Text(
    id="geography_name",
    description="where is the company based?",
    examples=[
        ("chinese companies", "china"),
        ("companies based in france", "france"),
        # NOTE(review): list-valued example although `many` is not set on this
        # attribute — confirm intended.
        ("LaMaple was based in france, italy", ["france", "italy"]),
        ("italy", ""),
    ],
)

# Foundation date of the company; the single example extracts a bare year.
foundation_date = Text(
    id="foundation_date",
    examples=[("companies founded in 2023", "2023")],
    description="Foundation date of the company",
)

# Free-form filters expressed as {attribute, op, value} records. No child
# attributes are declared; the record shape is conveyed through the examples.
attribute_filter = Object(
    id="attribute_filter",
    description=(
        "Filter by a value of an attribute using a binary expression. Specify the"
        " attribute's name, an operator (>, <, =, !=, >=, <=, in, not in) and a value."
    ),
    many=True,
    attributes=[],
    examples=[
        (
            "Companies with revenue > 100",
            {"attribute": "revenue", "op": ">", "value": "100"},
        ),
        # A range is encoded with `in` and a two-element list.
        (
            "number of employees between 50 and 1000",
            {"attribute": "employees", "op": "in", "value": ["50", "1000"]},
        ),
        (
            "blue or green color",
            {"attribute": "color", "op": "in", "value": ["blue", "green"]},
        ),
        # NOTE(review): "geography-sales" is hyphenated while the separate
        # sales attribute uses the id "geography_sales" — confirm which
        # spelling is wanted.
        (
            "companies that do not sell in california",
            {"attribute": "geography-sales", "op": "not in", "value": "california"},
        ),
    ],
)

# Countries where the company sells. A bare list of countries with no sales
# wording is given as a negative example (empty extraction).
sales_geography = Text(
    id="geography_sales",
    many=True,
    description="where is the company doing sales? Please use a single country name.",
    examples=[
        ("companies with sales in france", "france"),
        ("companies that sell their products in germany", "germany"),
        ("france, italy", ""),
    ],
)

# Attributes the user asks to see (as opposed to filter on). Examples cover
# both single-attribute and multi-attribute requests.
attribute_selection_block = Text(
    id="attribute_selection",
    many=True,
    description="Asking to see the value of one or more attributes",
    examples=[
        ("What is the revenue of tech companies?", "revenue"),
        ("market cap of apple?", "market cap"),
        ("number of employees of largest company", "number of employees"),
        ("what are the revenue and market cap of apple", ["revenue", "market cap"]),
        (
            "share price and number of shares of indian companies",
            ["share price", "number of shares"],
        ),
    ],
)

# Sort request: which attribute to order by and in which direction. The
# examples use "descending" / "ascending" as direction values.
sort_by_attribute_block = Object(
    id="sort_block",
    description=(
        "Use to request to sort the results by a particular attribute. "
        "Can specify the direction"
    ),
    attributes=[
        Text(description="The direction of the sort", id="direction"),
        Text(description="The sort attribute", id="attribute"),
    ],
    examples=[
        (
            "Largest by market-cap tech companies",
            {"direction": "descending", "attribute": "market-cap"},
        ),
        (
            "sort by companies with smallest revenue ",
            {"direction": "ascending", "attribute": "revenue"},
        ),
    ],
)

# Top-level company-search schema assembled from the nodes defined in this
# cell. The attribute order is kept exactly as written.
schema = Object(
    id="search_for_companies",
    description="Search for companies matching the following criteria.",
    attributes=[
        # Identity and basic facts
        company_name,
        geography_name,
        foundation_date,
        industry_name,
        sales_geography,
        # Query shaping: filters, projected attributes, ordering
        attribute_filter,
        attribute_selection_block,
        sort_by_attribute_block,
    ],
)

**ATTENTION** Some of the queries below fail. One common reason is that more examples could be useful to show the model how to group objects together. Pay attention to failures!

In [16]:
# Rebind `chain` to the company-search schema; all remaining cells run against
# this chain.
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")

Confirm that we're not getting **false** positives

In [17]:
# Text unrelated to company search: the extraction is expected to come back
# empty.
text = (
    "Today Alice MacDonald is turning sixty days old. She had blue eyes. "
    "Bob is turning 10 years old. His eyes were bright red."
)
chain.run(text)["data"]

{}

In [18]:
# Query mixing an attribute selection (revenue, eps) with two filters
# (market cap, employee range).
# The duplicated "and and" produced by the original literal concatenation is
# removed so the query reads as intended.
text = (
    "revenue, eps of indian companies that have market cap of over 1 million, and"
    " between 20-50 employees"
)
chain.run(text)["data"]

{'search_for_companies': {'attribute_filter': [{'attribute': 'market cap',
 'op': '>',
 'value': '1 million'},
 {'attribute': 'employees', 'op': 'in', 'value': ['20', '50']}],
 'attribute_selection': ['revenue', 'eps']}}

In [19]:
# Constraint on an attribute that is not declared in the schema (building
# colour); the recorded run expressed it as an `in` filter.
text = "companies that own red and blue buildings"
chain.run(text)["data"]

{'search_for_companies': {'attribute_filter': [{'attribute': 'building_color',
 'op': 'in',
 'value': ['red', 'blue']}]}}

In [20]:
# One query combining a geography, an attribute selection and a sort request.
text = "revenue of largest german companies sorted by number of employees"
chain.run(text)["data"]

{'search_for_companies': {'geography_name': 'germany',
 'sort_block': {'direction': 'descending',
 'attribute': 'number of employees'},
 'attribute_selection': ['revenue']}}

In [21]:
# Attribute selection plus two filters, one of them on an attribute that is
# not declared in the schema (building colour).
text = (
    "revenue, eps of indian companies that have market cap of over 1 million, "
    "that own red and blue buildings"
)
chain.run(text)["data"]

{'search_for_companies': {'attribute_filter': [{'attribute': 'market cap',
 'op': '>',
 'value': '1000000'},
 {'attribute': 'building color', 'op': 'in', 'value': ['red', 'blue']}],
 'attribute_selection': ['revenue', 'eps']}}

In [22]:
print(chain.prompt.format_prompt("[user_input]").to_string())

Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

search_for_companies: { // Search for companies matching the following criteria.
 company_name: Array // what is the name of the company you want to find
 geography_name: string // where is the company based?
 foundation_date: string // Foundation date of the company
 industry_name: Array // what is the name of the company's industry
 geography_sales: Array // where is the company doing sales? Please use a single country name.
 attribute_filter: Array<{ // Filter by a value of an attribute using a binary expression. Specify the attribute's name, an operator (>, <, =, !=, >=, <=, in, not in) and a value.
 }>
 attribute_selection: Array // Asking to see the value of one or more attributes
 sort_block: { // Use 