Natural Language Based APIs
Contents
Natural Language Based APIs#
Being able to understand the content of text can help in tasks other than information extraction.
Here, we’ll see how extracting information from text can help with powering a natural language based assistant that has different skills.
from kor import create_extraction_chain, Object, Text, Number
from langchain_openai import ChatOpenAI
# Chat model shared by every extraction chain built on this page.
# temperature=0 keeps sampling as deterministic as the API allows, so the
# example outputs shown below stay reasonably reproducible.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, max_tokens=2000)
Control Music#
Here’s a hypothetical API for controlling music.
# Extraction schema for a hypothetical music-player API.
# `song`, `album` and `artist` are declared with many=True (lists of strings);
# `action` does not set `many` and is described as one of four commands.
schema = Object(
    id="player",
    description=(
        "User is controlling a music player to select songs, "
        "pause or start them or play music by a particular artist."
    ),
    many=False,
    attributes=[
        # No few-shot examples are supplied for songs or albums.
        Text(
            id="song",
            description="User wants to play this song",
            many=True,
            examples=[],
        ),
        Text(
            id="album",
            description="User wants to play this album",
            many=True,
            examples=[],
        ),
        Text(
            id="artist",
            description="Music by the given artist",
            many=True,
            examples=[("Songs by paul simon", "paul simon")],
        ),
        # The command itself; the examples map free-form requests onto the
        # command words listed in the description.
        Text(
            id="action",
            description="Action to take one of: `play`, `stop`, `next`, `previous`.",
            examples=[
                ("Please stop the music", "stop"),
                ("play something", "play"),
                ("play a song", "play"),
                ("next song", "next"),
            ],
        ),
    ],
)
ATTENTION Use the JSON encoder here rather than the default CSV encoder as it supports nested lists
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")
Music Player#
chain.invoke("stop playing the music now")['data']
{'player': {'action': 'stop'}}
chain.invoke("i want to hear a song")["data"]
{'player': {'action': 'play'}}
chain.invoke("can you play the lion king soundtrack")["data"]
{'player': {'album': ['the lion king soundtrack'], 'action': 'play'}}
chain.invoke("play songs by paul simon and led zeppelin and the doors")
{'data': {'player': {'artist': ['paul simon', 'led zeppelin', 'the doors'],
'action': 'play'}},
'raw': '<json>{"player": {"artist": ["paul simon", "led zeppelin", "the doors"], "action": "play"}}</json>',
'errors': [],
'validated_data': {}}
chain.invoke("could you play the previous song again?")["data"]
{'player': {'action': 'previous'}}
Ticket ordering#
Here’s an imaginary API for searching and buying tickets.
# Extraction schema for an imaginary ticket search / purchase API.
# Three slots are extracted: the sport, the location, and a free-form
# price range object.
schema = Object(
    id="action",
    description="User is looking for sports tickets",
    attributes=[
        Text(
            id="sport",
            description="which sports do you want to buy tickets for?",
            # NOTE(review): this example yields a list although `many` is not
            # set on the attribute -- confirm whether many=True is intended.
            examples=[
                (
                    "I want to buy tickets to basketball and football games",
                    # Fixed: was misspelled "footbal", which taught the model
                    # a wrong extraction value.
                    ["basketball", "football"],
                )
            ],
        ),
        Text(
            id="location",
            description="where would you like to watch the game?",
            # NOTE(review): the second example also yields a list without
            # `many` being set -- confirm intent.
            examples=[
                ("in boston", "boston"),
                ("in france or italy", ["france", "italy"]),
            ],
        ),
        Object(
            id="price_range",
            description="how much do you want to spend?",
            # No typed attributes are declared; the field names (price_min,
            # price_max, currency) are conveyed only through the examples.
            attributes=[],
            examples=[
                ("no more than $100", {"price_max": "100", "currency": "$"}),
                (
                    "between 50 and 100 dollars",
                    {"price_max": "100", "price_min": "50", "currency": "$"},
                ),
            ],
        ),
    ],
)
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")
chain.invoke("I want to buy tickets for a baseball game in LA area under $100")["data"]
{'action': {'sport': 'baseball',
'location': 'LA area',
'price_range': {'price_max': '100'}}}
chain.invoke(
"I want to see a celtics game in boston somewhere between 20 and 40 dollars per ticket"
)["data"]
{'action': {'sport': 'basketball',
'location': 'boston',
'price_range': {'price_max': '40', 'price_min': '20', 'currency': '$'}}}
Company Search#
ATTENTION This is a demo that shows how to build a complex schema to run a company search that matches different criteria.
However, using this format for issuing database queries (e.g., by translating the JSON into SQL) will only work well for simple queries.
There’s a better way to leverage LLMs to issue database queries, and support for that may be added to the package in the future.
# Name(s) of the company being searched for (list-valued: many=True).
# The empty-string example shows an input that names no specific company.
company_name = Text(
    id="company_name",
    description="what is the name of the company you want to find",
    examples=[
        ("Apple inc", "Apple inc"),
        ("largest 10 banks in the world", ""),
        # NOTE(review): two companies are given as one comma-joined string
        # even though many=True -- confirm whether ["microsoft", "apple"]
        # was meant.
        ("microsoft and apple", "microsoft,apple"),
    ],
    many=True,
)
# Industry of the company (list-valued: many=True). Several examples map a
# colloquial phrase to an industry label; the empty-string example shows an
# input that carries no industry information.
industry_name = Text(
    id="industry_name",
    description="what is the name of the company's industry",
    examples=[
        ("companies in the steel manufacturing industry", "steel manufacturing"),
        ("large banks", "banking"),
        ("military companies", "defense"),
        ("chinese companies", ""),
        # NOTE(review): "cell" looks like a typo for "sell"; kept verbatim
        # because the string is part of the few-shot prompt.
        ("companies that cell cigars", "cigars"),
    ],
    many=True,
)
# Where the company is based. The bare word "italy" maps to "" in the
# examples, i.e. a country name alone is not treated as a location filter.
# NOTE(review): one example yields a list although `many` is not set on this
# attribute -- confirm whether many=True is intended.
geography_name = Text(
    id="geography_name",
    examples=[
        ("chinese companies", "china"),
        ("companies based in france", "france"),
        ("LaMaple was based in france, italy", ["france", "italy"]),
        ("italy", ""),
    ],
    description="where is the company based?",
)
# Foundation date of the company, extracted as text (the example uses a
# bare year string).
foundation_date = Text(
    id="foundation_date",
    examples=[("companies founded in 2023", "2023")],
    description="Foundation date of the company",
)
# Generic "attribute <op> value" filters (list-valued: many=True).
# No typed attributes are declared; the keys `attribute`, `op` and `value`
# are conveyed only through the examples.
attribute_filter = Object(
    id="attribute_filter",
    description=(
        "Filter by a value of an attribute using a binary expression. "
        "Specify the attribute's name, an operator "
        "(>, <, =, !=, >=, <=, in, not in) and a value."
    ),
    attributes=[],
    examples=[
        (
            "Companies with revenue > 100",
            {"attribute": "revenue", "op": ">", "value": "100"},
        ),
        # A range is expressed with "in" and a two-element list.
        (
            "number of employees between 50 and 1000",
            {"attribute": "employees", "op": "in", "value": ["50", "1000"]},
        ),
        (
            "blue or green color",
            {"attribute": "color", "op": "in", "value": ["blue", "green"]},
        ),
        # NOTE(review): "geography-sales" is hyphenated while the sibling
        # attribute id in this file is "geography_sales" -- confirm which
        # spelling is intended.
        (
            "companies that do not sell in california",
            {"attribute": "geography-sales", "op": "not in", "value": "california"},
        ),
    ],
    many=True,
)
# Countries in which the company sells (list-valued: many=True).
# A bare list of countries ("france, italy") maps to "" in the examples.
sales_geography = Text(
    id="geography_sales",
    description="where is the company doing sales? Please use a single country name.",
    examples=[
        ("companies with sales in france", "france"),
        ("companies that sell their products in germany", "germany"),
        ("france, italy", ""),
    ],
    many=True,
)
# Which attribute values the user wants to see (list-valued: many=True).
# The examples cover both single-attribute and multi-attribute requests.
attribute_selection_block = Text(
    id="attribute_selection",
    description="Asking to see the value of one or more attributes",
    examples=[
        # Single attribute requested.
        ("What is the revenue of tech companies?", "revenue"),
        ("market cap of apple?", "market cap"),
        ("number of employees of largest company", "number of employees"),
        # Several attributes requested at once.
        ("what are the revenue and market cap of apple", ["revenue", "market cap"]),
        (
            "share price and number of shares of indian companies",
            ["share price", "number of shares"],
        ),
    ],
    many=True,
)
# Sort request: which attribute to sort by and in which direction.
# Unlike the filter object, this one declares its two fields explicitly.
sort_by_attribute_block = Object(
    id="sort_block",
    description=(
        "Use to request to sort the results by a particular attribute. "
        "Can specify the direction"
    ),
    examples=[
        (
            "Largest by market-cap tech companies",
            {"direction": "descending", "attribute": "market-cap"},
        ),
        # The trailing space in this input is kept verbatim from the original.
        (
            "sort by companies with smallest revenue ",
            {"direction": "ascending", "attribute": "revenue"},
        ),
    ],
    attributes=[
        Text(id="direction", description="The direction of the sort"),
        Text(id="attribute", description="The sort attribute"),
    ],
)
# Top-level company-search schema: combines the attribute nodes defined
# above. The list order is kept exactly as in the original.
schema = Object(
    id="search_for_companies",
    description="Search for companies matching the following criteria.",
    attributes=[
        # Plain descriptive slots.
        company_name,
        geography_name,
        foundation_date,
        industry_name,
        sales_geography,
        # Query-shaping blocks: filters, projections and ordering.
        attribute_filter,
        attribute_selection_block,
        sort_by_attribute_block,
    ],
)
ATTENTION Some of the queries below fail. One common reason is that more examples could be useful to show the model how to group objects together. Pay attention to failures!
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class="json")
Confirm that we’re not getting false positives
# Input with no company-search content, used to confirm that the chain does
# not produce false positives.
text = " ".join(
    [
        "Today Alice MacDonald is turning sixty days old.",
        "She had blue eyes.",
        "Bob is turning 10 years old.",
        "His eyes were bright red.",
    ]
)
chain.invoke(text)["data"]
{}
# Query combining attribute selection (revenue, eps), a geography, and two
# attribute filters (market cap threshold and an employee-count range).
# Fixed: the two string fragments previously concatenated to "... and and
# between ...", duplicating the word "and".
text = (
    "revenue, eps of indian companies that have market cap of over 1 million,"
    " and between 20-50 employees"
)
chain.invoke(text)["data"]
{'search_for_companies': {'geography_name': 'india',
'attribute_filter': [{'attribute': 'market cap',
'op': '>',
'value': '1000000'},
{'attribute': 'employees', 'op': 'in', 'value': ['20', '50']}],
'attribute_selection': ['revenue', 'eps']}}
text = "companies that own red and blue buildings"
chain.invoke(text)["data"]
{'search_for_companies': {'attribute_filter': [{'attribute': 'building color',
'op': 'in',
'value': ['red', 'blue']}]}}
text = "revenue of largest german companies sorted by number of employees"
chain.invoke(text)["data"]
{'search_for_companies': {'geography_name': 'germany',
'attribute_selection': ['revenue'],
'sort_block': {'direction': 'descending',
'attribute': 'number of employees'}}}
# Query combining attribute selection, a geography, a numeric filter and a
# multi-valued colour filter in a single sentence.
text = ", ".join(
    [
        "revenue",
        "eps of indian companies that have market cap of over 1 million",
        "that own red and blue buildings",
    ]
)
chain.invoke(text)["data"]
{'search_for_companies': {'geography_name': 'india',
'attribute_filter': [{'attribute': 'market cap',
'op': '>',
'value': '1 million'},
{'attribute': 'buildings', 'op': 'in', 'value': ['red', 'blue']}],
'attribute_selection': ['revenue', 'eps']}}
print(chain.get_prompts()[0].format_prompt("[user_input]").to_string())
Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.
```TypeScript
search_for_companies: { // Search for companies matching the following criteria.
company_name: Array<string> // what is the name of the company you want to find
geography_name: string // where is the company based?
foundation_date: string // Foundation date of the company
industry_name: Array<string> // what is the name of the company's industry
geography_sales: Array<string> // where is the company doing sales? Please use a single country name.
attribute_filter: Array<{ // Filter by a value of an attribute using a binary expression. Specify the attribute's name, an operator (>, <, =, !=, >=, <=, in, not in) and a value.
}>
attribute_selection: Array<string> // Asking to see the value of one or more attributes
sort_block: { // Use to request to sort the results by a particular attribute. Can specify the direction
direction: string // The direction of the sort
attribute: string // The sort attribute
}
}
```
Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please ignore them. All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags.
Input: Apple inc
Output: <json>{"search_for_companies": {"company_name": ["Apple inc"]}}</json>
Input: largest 10 banks in the world
Output: <json>{}</json>
Input: microsoft and apple
Output: <json>{"search_for_companies": {"company_name": ["microsoft,apple"]}}</json>
Input: chinese companies
Output: <json>{"search_for_companies": {"geography_name": "china"}}</json>
Input: companies based in france
Output: <json>{"search_for_companies": {"geography_name": "france"}}</json>
Input: LaMaple was based in france, italy
Output: <json>{"search_for_companies": {"geography_name": ["france", "italy"]}}</json>
Input: italy
Output: <json>{}</json>
Input: companies founded in 2023
Output: <json>{"search_for_companies": {"foundation_date": "2023"}}</json>
Input: companies in the steel manufacturing industry
Output: <json>{"search_for_companies": {"industry_name": ["steel manufacturing"]}}</json>
Input: large banks
Output: <json>{"search_for_companies": {"industry_name": ["banking"]}}</json>
Input: military companies
Output: <json>{"search_for_companies": {"industry_name": ["defense"]}}</json>
Input: chinese companies
Output: <json>{}</json>
Input: companies that cell cigars
Output: <json>{"search_for_companies": {"industry_name": ["cigars"]}}</json>
Input: companies with sales in france
Output: <json>{"search_for_companies": {"geography_sales": ["france"]}}</json>
Input: companies that sell their products in germany
Output: <json>{"search_for_companies": {"geography_sales": ["germany"]}}</json>
Input: france, italy
Output: <json>{}</json>
Input: Companies with revenue > 100
Output: <json>{"search_for_companies": {"attribute_filter": [{"attribute": "revenue", "op": ">", "value": "100"}]}}</json>
Input: number of employees between 50 and 1000
Output: <json>{"search_for_companies": {"attribute_filter": [{"attribute": "employees", "op": "in", "value": ["50", "1000"]}]}}</json>
Input: blue or green color
Output: <json>{"search_for_companies": {"attribute_filter": [{"attribute": "color", "op": "in", "value": ["blue", "green"]}]}}</json>
Input: companies that do not sell in california
Output: <json>{"search_for_companies": {"attribute_filter": [{"attribute": "geography-sales", "op": "not in", "value": "california"}]}}</json>
Input: What is the revenue of tech companies?
Output: <json>{"search_for_companies": {"attribute_selection": ["revenue"]}}</json>
Input: market cap of apple?
Output: <json>{"search_for_companies": {"attribute_selection": ["market cap"]}}</json>
Input: number of employees of largest company
Output: <json>{"search_for_companies": {"attribute_selection": ["number of employees"]}}</json>
Input: what are the revenue and market cap of apple
Output: <json>{"search_for_companies": {"attribute_selection": ["revenue", "market cap"]}}</json>
Input: share price and number of shares of indian companies
Output: <json>{"search_for_companies": {"attribute_selection": ["share price", "number of shares"]}}</json>
Input: Largest by market-cap tech companies
Output: <json>{"search_for_companies": {"sort_block": {"direction": "descending", "attribute": "market-cap"}}}</json>
Input: sort by companies with smallest revenue
Output: <json>{"search_for_companies": {"sort_block": {"direction": "ascending", "attribute": "revenue"}}}</json>
Input: [user_input]
Output: