"""Embeddings-based question answering.

Loads pre-computed document embeddings from CSV shards, builds the most
relevant context for a question via embedding distance, and asks a chat
model to answer using only that context.
"""

import os
from ast import literal_eval
from typing import List, Optional

import numpy as np
import pandas as pd
from scipy import spatial

from openai import OpenAI

# NOTE(review): the original default was "text-davinci-003", a retired
# *completions* model that is rejected by the chat.completions endpoint used
# below; a chat-capable model is required for answer_question() to work.
GPT_MODEL = "gpt-3.5-turbo"


def get_embeddings() -> pd.DataFrame:
    """Load and concatenate the pre-computed embedding CSV shards.

    Reads processed/embeddings-1.csv .. embeddings-7.csv, concatenates them,
    and parses the stringified embedding column back into numpy arrays.

    Returns:
        DataFrame with columns ['text', 'n_tokens', 'embedding'].
    """
    shards = [pd.read_csv(f"processed/embeddings-{i}.csv") for i in range(1, 8)]
    df = pd.concat(shards, axis=0, ignore_index=True)
    df.columns = ["text", "n_tokens", "embedding"]
    # Embeddings are serialized as stringified Python lists in the CSVs;
    # literal_eval parses them safely, then np.array makes them numeric.
    df["embedding"] = df["embedding"].apply(literal_eval).apply(np.array)
    return df


def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric: str = "cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings.

    Args:
        query_embedding: the embedding vector of the query.
        embeddings: iterable of embedding vectors to compare against.
        distance_metric: one of "cosine", "L1", "L2", "Linf".

    Returns:
        A flat list of float distances, one per embedding, in input order.

    Raises:
        KeyError: if ``distance_metric`` is not a supported metric name.
    """
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    metric_fn = distance_metrics[distance_metric]
    return [metric_fn(query_embedding, embedding) for embedding in embeddings]


def create_context(
    question: str,
    df: pd.DataFrame,
    client: "OpenAI",
    max_len: int = 1800,
    size: str = "ada",
) -> str:
    """Create a context for a question from the most similar rows of *df*.

    Embeds the question, ranks rows of *df* by cosine distance, and joins
    the closest texts (separated by "\\n\\n###\\n\\n") until the running
    token budget ``max_len`` would be exceeded.

    Args:
        question: the user question to embed and match against.
        df: DataFrame with 'text', 'n_tokens', 'embedding' columns
            (as produced by get_embeddings). A 'distances' column is
            written onto it as a side effect.
        client: an OpenAI client used to embed the question.
        max_len: approximate token budget for the returned context.
        size: unused; kept for backward compatibility with existing callers.
    """
    # Embed the question with the same model family used for the documents.
    q_embeddings = (
        client.embeddings.create(input=[question], model="text-embedding-ada-002")
        .data[0]
        .embedding
    )

    # Distance of every document row from the question embedding.
    df["distances"] = distances_from_embeddings(
        q_embeddings, df["embedding"].values, distance_metric="cosine"
    )

    returns: List[str] = []
    cur_len = 0

    # Accumulate closest texts first; +4 approximates the separator's tokens.
    for _, row in df.sort_values("distances", ascending=True).iterrows():
        cur_len += row["n_tokens"] + 4
        if cur_len > max_len:
            break
        returns.append(row["text"])

    return "\n\n###\n\n".join(returns)


def answer_question(
    df: pd.DataFrame,
    model: str = GPT_MODEL,
    question: str = "Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len: int = 1800,
    size: str = "ada",
    debug: bool = False,
    max_tokens: int = 150,
    stop_sequence: Optional[str] = None,
    api_key: str = "fake",
) -> str:
    """Answer a question based on the most similar context from *df*.

    Builds a context with create_context and asks the chat model to answer
    strictly from it (responding "I don't know" otherwise).

    Args:
        df: embeddings DataFrame (see get_embeddings).
        model: chat model name to query.
        question: the question to answer.
        max_len: token budget forwarded to create_context.
        size: forwarded to create_context (currently unused there).
        debug: when True, print the assembled context before querying.
        max_tokens: unused; kept for backward compatibility.
        stop_sequence: unused; kept for backward compatibility.
        api_key: OpenAI API key used to construct the client.

    Returns:
        The model's answer, or an error description string on failure.
    """
    client = OpenAI(api_key=api_key)

    context = create_context(question, df, client=client, max_len=max_len, size=size)

    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}",
                },
                {"role": "user", "content": f"Question: {question}"},
            ],
        )
        print(response)
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        # Fix: Python 3 exceptions have no .message attribute — the original
        # error path raised AttributeError instead of reporting the error.
        return f"Error processing {e.__cause__}: {e}"