# openai-knowledgebase-irembo / app_kb_handler.py
# The knowledge-base embeddings are split into 7 parts; the content is crawled
# automatically from the Irembo support website.
import os
from ast import literal_eval
from typing import List

import numpy as np
import pandas as pd
from openai import OpenAI
from scipy import spatial

# Chat model used to answer questions; it must support the chat completions API
# (the original "text-davinci-003" is a deprecated completions-only model).
GPT_MODEL = "gpt-3.5-turbo"
def get_embeddings():
    """Load the seven embedding CSV shards and concatenate them into one DataFrame."""
    frames = [pd.read_csv(f'processed/embeddings-{i}.csv') for i in range(1, 8)]
    df = pd.concat(frames, axis=0, ignore_index=True)
    df.columns = ['text', 'n_tokens', 'embedding']
    # The embedding column is stored as a string literal in the CSVs;
    # parse it back into a NumPy array for distance calculations.
    df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)
    return df
def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric: str = "cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances
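
# Quick sanity check with hypothetical 2-D vectors (not from the dataset):
# cosine distance is 0.0 for identical vectors and 1.0 for orthogonal ones, e.g.
#   distances_from_embeddings([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]])  # -> [0.0, 1.0]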
def create_context(
    question, df, client, max_len=1800, size="ada",
):
    """
    Create a context for a question by finding the most similar texts in the dataframe.
    """
    # Embed the question with the same model used for the knowledge-base embeddings
    q_embeddings = client.embeddings.create(input=[question], model="text-embedding-ada-002").data[0].embedding
    # Compute the distance from the question embedding to every stored embedding
    df['distances'] = distances_from_embeddings(q_embeddings, df['embedding'].values, distance_metric='cosine')
    returns = []
    cur_len = 0
    # Sort by distance and add texts to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        # Count this row's tokens, plus ~4 tokens for the "\n\n###\n\n" separator
        cur_len += row['n_tokens'] + 4
        # If the context would be too long, stop
        if cur_len > max_len:
            break
        # Otherwise add the text to the context being returned
        returns.append(row['text'])
    # Join the selected texts with a separator the prompt can recognise
    return "\n\n###\n\n".join(returns)
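
# For illustration (made-up rows, not real Irembo content): if the two closest
# texts were "Pay with MTN MoMo" and "Log in at irembo.gov.rw", create_context
# would return "Pay with MTN MoMo\n\n###\n\nLog in at irembo.gov.rw".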
def answer_question(
    df,
    model=GPT_MODEL,
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
    api_key=None,  # None falls back to the OPENAI_API_KEY environment variable
):
    """
    Answer a question based on the most similar context from the dataframe texts.
    """
    client = OpenAI(api_key=api_key)
    context = create_context(
        question,
        df,
        client=client,
        max_len=max_len,
        size=size,
    )
    # If debug, print the context built from the knowledge base
    if debug:
        print("Context:\n" + context)
        print("\n\n")
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}"},
                {"role": "user", "content": f"Question: {question}"},
            ],
            max_tokens=max_tokens,
            stop=stop_sequence,
        )
        # If debug, print the raw model response
        if debug:
            print(response)
        return response.choices[0].message.content
    except Exception as e:
        # Python 3 exceptions have no .message attribute; report the exception itself
        print(e)
        return f"Error processing the question: {e}"
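

# Minimal usage sketch, assuming the processed/embeddings-*.csv shards exist on
# disk and OPENAI_API_KEY is set in the environment; the question is illustrative.
if __name__ == "__main__":
    df = get_embeddings()
    answer = answer_question(
        df,
        question="How do I pay for a service on Irembo?",
        api_key=os.environ.get("OPENAI_API_KEY"),
        debug=True,
    )
    print(answer)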