DJOMGA TOUKO Peter Charles
Initial commit with embeddings split into 7 parts. The content is crawled automatically from the Irembo support website.
6183ded
# Preview the embeddings created
import pandas as pd
import numpy as np
from ast import literal_eval
# Define function to calculate distances from embeddings and answer question using embeddings search
from typing import List
from scipy import spatial
# Generate embeddings using OpenAI API
from openai import OpenAI
import os

# Chat model used to answer questions. "text-davinci-003" is a legacy
# completions model that the chat completions endpoint rejects, so a chat
# model is used instead.
GPT_MODEL = "gpt-3.5-turbo"
def get_embeddings():
    """Load the seven embedding CSV splits and merge them into one dataframe."""
    frames = [pd.read_csv(f'processed/embeddings-{i}.csv') for i in range(1, 8)]
    df = pd.concat(frames, axis=0, ignore_index=True)
    df.columns = ['text', 'n_tokens', 'embedding']
    # Embeddings are stored as string literals in the CSVs; parse them back
    # into numpy arrays.
    df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)
    return df
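
# Minimal usage sketch (assumes the processed/embeddings-*.csv splits exist on disk):
#   df = get_embeddings()
#   df.head()  # columns: text, n_tokens, embedding (one numpy array per row)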
def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric="cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances
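
# A minimal sketch of the helper's behavior on toy vectors (not real embeddings):
#   distances_from_embeddings([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]])
#   -> [0.0, 1.0]  # identical vector has cosine distance 0, orthogonal vector 1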
def create_context(
    question, df, client, max_len=1800, size="ada",
):
    """
    Create a context for a question by finding the most similar texts in the dataframe.
    """
    # Get the embedding for the question
    q_embeddings = client.embeddings.create(input=[question], model="text-embedding-ada-002").data[0].embedding
    # Get the distances between the question and each stored embedding
    df['distances'] = distances_from_embeddings(q_embeddings, df['embedding'].values, distance_metric='cosine')
    returns = []
    cur_len = 0
    # Sort by distance and add texts to the context until it grows too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        # Add the length of the text (plus separator tokens) to the current length
        cur_len += row['n_tokens'] + 4
        # If the context is too long, break
        if cur_len > max_len:
            break
        # Else add it to the text that is being returned
        returns.append(row['text'])
    # Return the context
    return "\n\n###\n\n".join(returns)
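
# Illustrative shape of the returned context (placeholder chunks, not actual
# data): the most similar texts joined by the "\n\n###\n\n" separator, e.g.
#   "<chunk 1>\n\n###\n\n<chunk 2>\n\n###\n\n<chunk 3>"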
def answer_question(
    df,
    model=GPT_MODEL,
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
    api_key="fake"
):
    """
    Answer a question based on the most similar context from the dataframe texts.
    """
    client = OpenAI(api_key=api_key)
    context = create_context(
        question,
        df,
        client=client,
        max_len=max_len,
        size=size
    )
    # If debug, print the retrieved context
    if debug:
        print("Context:\n" + context)
        print("\n\n")
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}"},
                {"role": "user", "content": f"Question: {question}"}
            ],
            # Pass through the previously unused generation settings
            max_tokens=max_tokens,
            stop=stop_sequence,
        )
        # If debug, print the raw model response
        if debug:
            print(response)
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        # Exception objects have no .message attribute in Python 3; report str(e)
        return f'Error processing request: {e}'
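
# A minimal end-to-end sketch, assuming the CSV splits exist on disk and that
# OPENAI_API_KEY is set in the environment; the sample question is hypothetical.
if __name__ == "__main__":
    df = get_embeddings()
    answer = answer_question(
        df,
        question="How do I apply for a service on Irembo?",  # hypothetical example
        debug=True,
        api_key=os.environ["OPENAI_API_KEY"],
    )
    print(answer)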