# Embeddings-based Q&A: load precomputed embeddings, rank text chunks by
# cosine distance to a question, and answer using the most relevant context.
import os
from ast import literal_eval
from typing import List

import numpy as np
import pandas as pd
from openai import OpenAI
from scipy import spatial


# text-davinci-003 is a legacy completions model and is not accepted by the
# chat.completions endpoint used below; use a chat model instead.
GPT_MODEL = "gpt-3.5-turbo"


def get_embeddings():
    """Load the precomputed embedding CSVs into a single dataframe."""
    # Each CSV shares the same three columns; concatenate them into one frame.
    frames = [pd.read_csv(f'processed/embeddings-{i}.csv') for i in range(1, 8)]
    df = pd.concat(frames, axis=0, ignore_index=True)
    df.columns = ['text', 'n_tokens', 'embedding']
    # Embeddings are stored as string literals; parse them into numpy arrays.
    df['embedding'] = df['embedding'].apply(literal_eval).apply(np.array)
    return df


def distances_from_embeddings(
    query_embedding: List[float],
    embeddings: List[List[float]],
    distance_metric: str = "cosine",
) -> List[float]:
    """Return the distances between a query embedding and a list of embeddings."""
    distance_metrics = {
        "cosine": spatial.distance.cosine,
        "L1": spatial.distance.cityblock,
        "L2": spatial.distance.euclidean,
        "Linf": spatial.distance.chebyshev,
    }
    distances = [
        distance_metrics[distance_metric](query_embedding, embedding)
        for embedding in embeddings
    ]
    return distances
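
# Example with hypothetical 2-D vectors (not from the real data):
# distances_from_embeddings([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]]) -> [0.0, 1.0]
# (cosine distance is 0 for identical directions and 1 for orthogonal ones).
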

def create_context(
    question, df, client, max_len=1800, size="ada",
):
    """
    Create a context for a question by finding the most similar texts in the
    dataframe. (size is accepted for compatibility but is currently unused.)
    """

    # Get the embedding for the question
    q_embeddings = client.embeddings.create(
        input=[question], model="text-embedding-ada-002"
    ).data[0].embedding

    # Compute the distance from the question to every stored embedding
    df['distances'] = distances_from_embeddings(
        q_embeddings, df['embedding'].values, distance_metric='cosine'
    )

    returns = []
    cur_len = 0

    # Sort by distance and add texts to the context until it is too long
    for _, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        # (+4 leaves room for the "\n\n###\n\n" separator tokens)
        cur_len += row['n_tokens'] + 4

        # If the context is too long, stop
        if cur_len > max_len:
            break

        # Otherwise add the text to the context being returned
        returns.append(row['text'])

    # Join the selected texts with a separator the model can recognize
    return "\n\n###\n\n".join(returns)
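
# Example (hypothetical data): create_context("What is X?", df, client) might
# return "chunk about X\n\n###\n\nanother relevant chunk", i.e. the nearest
# chunks joined by the "###" separator, truncated to roughly max_len tokens.
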

def answer_question(
    df,
    model=GPT_MODEL,
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
    api_key=None,
):
    """
    Answer a question based on the most similar context from the dataframe texts.
    """
    # Fall back to the OPENAI_API_KEY environment variable when no key is given
    client = OpenAI(api_key=api_key or os.environ.get("OPENAI_API_KEY"))
    context = create_context(
        question,
        df,
        client=client,
        max_len=max_len,
        size=size,
    )
    # If debug, print the context retrieved for the question
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"Answer the question based on the context below, in Markdown format, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}"},
                {"role": "user", "content": f"Question: {question}"}
            ],
            max_tokens=max_tokens,
            stop=stop_sequence,
        )
        # Only dump the raw API response when debugging
        if debug:
            print(response)
        return response.choices[0].message.content
    except Exception as e:
        # Exception objects have no .message attribute in Python 3; use str(e)
        print(e)
        return f'Error processing request: {e}'
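

# Minimal usage sketch (assumptions: the processed/embeddings-*.csv files exist
# and OPENAI_API_KEY is set; the question shown is the function's default).
if __name__ == "__main__":
    df = get_embeddings()
    print(answer_question(df, debug=True))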