# File size: 8,483 Bytes

# e65f5e1

import openai
import pandas as pd
import numpy as np
# from openai.embeddings_utils import get_embedding
from transformers import GPT2TokenizerFast
from tqdm.auto import tqdm
import os



# Register tqdm's `progress_apply` on pandas objects; it is used further
# down when token counts are computed for each row.
tqdm.pandas()

import spacy
# import numpy as np

# Load the packaged small English spaCy pipeline.
# NOTE(review): the original comment said "GloVe embeddings"; nothing in this
# file shows that. The `sm` packages reportedly ship without static word
# vectors — confirm what `token.vector` actually returns for this model.
import en_core_web_sm

nlp = en_core_web_sm.load()

def custom_embedding(text, model_name="text-embedding-ada-002"):
    """Embed *text* as the mean of its spaCy token vectors.

    The text is run through the module-level ``nlp`` pipeline and the
    vectors of all tokens reporting ``has_vector`` are averaged.

    Args:
        text: The string to embed.
        model_name: Label copied verbatim into the ``"model"`` field; it
            does not influence how the embedding is computed.

    Returns:
        A response-style dict with a single entry under ``"data"`` whose
        ``"embedding"`` is a list of floats, or ``None`` when no token in
        *text* has a vector.
    """
    vectors = [tok.vector for tok in nlp(text) if tok.has_vector]
    if len(vectors) == 0:
        return None

    mean_vector = np.mean(vectors, axis=0)
    # "Usage" counts whitespace-separated words, not pipeline tokens.
    word_count = len(text.split())

    return {
        "data": [
            {
                "embedding": mean_vector.tolist(),
                "index": 0,
                "object": "embedding",
            }
        ],
        "model": model_name,
        "object": "list",
        "usage": {"prompt_tokens": word_count, "total_tokens": word_count},
    }

# Example usage: embed a single word and show the result.
text = "Rome"
response = custom_embedding(text)

# custom_embedding() returns None (not a dict) when no token has a vector,
# so rule that out before indexing into the response.
if response is not None and response["data"][0]["embedding"] is not None:
    print(f"Custom Embedding for '{text}': {response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{text}'.")

print(response)


# import spacy
# import numpy as np

# Reload the small English spaCy pipeline.
# NOTE(review): `nlp` is already bound to this same pipeline earlier in the
# file, so this second load looks redundant — confirm before removing.
# import en_core_web_sm

nlp = en_core_web_sm.load()

def custom_embedding(text_list, model_name="text-embedding-ada-002"):
    """Embed each string in *text_list* as the mean of its token vectors.

    Every text is run through the module-level ``nlp`` pipeline.

    Args:
        text_list: Iterable of strings to embed.
        model_name: Label copied verbatim into the ``"model"`` field.

    Returns:
        A response-style dict whose ``"data"`` list has one entry per input
        text, in order. An entry's ``"embedding"`` is a list of floats, or
        ``None`` when no token of that text has a vector.
    """

    def _mean_vector(text):
        # Average the vectors of the tokens that have one; None if there are none.
        vectors = [tok.vector for tok in nlp(text) if tok.has_vector]
        if vectors:
            return np.mean(vectors, axis=0).tolist()
        return None

    data = []
    for position, text in enumerate(text_list):
        data.append(
            {
                "embedding": _mean_vector(text),
                "index": position,
                "object": "embedding",
            }
        )

    # "Usage" counts whitespace-separated words across all inputs.
    prompt_words = sum(len(text.split()) for text in text_list)
    total_words = sum(len(text.split()) for text in text_list)

    return {
        "data": data,
        "model": model_name,
        "object": "list",
        "usage": {"prompt_tokens": prompt_words, "total_tokens": total_words},
    }

# Demo: embed a handful of short sentences in a single call.
text = [
    "She is running",
    "Fitness is good",
    "I am hungry",
    "Basketball is healthy",
]
response = custom_embedding(text)

# Show each sentence next to its vector, or flag sentences without one.
for sentence, entry in zip(text, response["data"]):
    vector = entry["embedding"]
    if vector is None:
        print(f"No embeddings found for words in '{sentence}'.")
    else:
        print(f"Custom Embedding for '{sentence}': {vector}")

print(response)

# Pull out the four vectors by position.
emb1, emb2, emb3, emb4 = (item['embedding'] for item in response['data'][:4])

# Raw (unnormalised) dot products; the values are neither stored nor printed.
np.dot(emb1, emb2)
np.dot(emb2, emb4)

# Read the source table and discard every row that has a missing value.
df = pd.read_csv('Dronealexa.csv').dropna()
df.info()
df.head()

# Flatten the descriptive columns into one labelled string per row.
title_part = "Title: " + df['Title'].str.strip()
url_part = "; URL: " + df['URL'].str.strip()
year_part = "; Publication Year: " + df['Publication Year'].astype(str).str.strip()
abstract_part = "; Abstract: " + df['Abstract'].str.strip()
df['combined'] = title_part + url_part + year_part + abstract_part
df.head()

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def _count_tokens(passage):
    """Return how many GPT-2 tokens *passage* encodes to."""
    return len(tokenizer.encode(passage))


# Keep only rows whose combined text stays under the 8000-token cut-off.
df['n_tokens'] = df['combined'].progress_apply(_count_tokens)
df = df[df['n_tokens'] < 8000]
df.info()
df.head()


# import spacy
# import numpy as np

# Reload the small English spaCy pipeline (same package as loaded earlier).
# import en_core_web_sm

nlp = en_core_web_sm.load()

def get_embeddings(text, model):
    """Embed *text* as the mean of the token vectors produced by *model*.

    Args:
        text: The string to embed.
        model: A callable pipeline; ``model(text)`` must yield an iterable,
            sized document of tokens exposing ``vector`` and ``has_vector``,
            and ``model.meta["name"]`` supplies the reported model name.

    Returns:
        A response-style dict with a single entry under ``"data"`` whose
        ``"embedding"`` is a list of floats, or ``None`` when no token has a
        vector. ``usage["prompt_tokens"]`` counts whitespace-separated words
        while ``usage["total_tokens"]`` is the document's token count.
    """
    doc = model(text)

    vectors = [tok.vector for tok in doc if tok.has_vector]
    if len(vectors) == 0:
        return None

    entry = {
        "embedding": np.mean(vectors, axis=0).tolist(),
        "index": 0,
        "object": "embedding",
    }
    model_label = model.meta["name"]
    usage = {
        "prompt_tokens": len(text.split()),
        "total_tokens": len(doc),
    }

    return {
        "data": [entry],
        "model": model_label,
        "object": "list",
        "usage": usage,
    }

# Example usage
input_text = "Your input text goes here"
custom_model = nlp  # You can replace this with any other spaCy model

# Alias for the text that is embedded below.
text_to_process = input_text

response = get_embeddings(text_to_process, custom_model)

# get_embeddings() returns None (not a dict) when no token has a vector,
# so rule that out before indexing into the response.
if response is not None and response["data"][0]["embedding"] is not None:
    print(f"Custom Embedding for '{text_to_process}': {response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{text_to_process}'.")

print(response)

import json

from tqdm import tqdm

batch_size = 2000
model_name = 'text-embedding-ada-002'

# Embed every row of df.combined. Texts are walked in batches so tqdm reports
# progress per batch; vectors are gathered in row order.
combined_texts = df.combined.tolist()
row_embeddings = []
for i in tqdm(range(0, len(combined_texts), batch_size)):
    for text in combined_texts[i:i + batch_size]:
        response = get_embeddings(text, nlp)
        # get_embeddings() returns None when no token has a vector.
        row_embeddings.append(response["data"][0]["embedding"] if response else None)

# Attach the vectors using the frame's own index labels. After the earlier
# dropna() and token-count filter those labels have gaps, so writing to
# df.loc[<position>] would hit the wrong rows or append brand-new ones.
# The copy detaches df from the filtered slice it was derived from.
df = df.copy()
df['ada_vector'] = pd.Series(row_embeddings, index=df.index, dtype=object)

df.head()
df.info()

# Persist vectors as JSON text: a CSV cell can only hold a string, and JSON
# can be parsed back exactly without resorting to eval(). Rows without a
# vector are written as empty cells.
df['ada_vector'] = df['ada_vector'].map(
    lambda vec: json.dumps(vec) if vec is not None else None
)
df.to_csv('embeddings_chatbot.csv', index=False)
df = pd.read_csv('embeddings_chatbot.csv')

# Turn the stored text back into numeric arrays so the similarity step sees
# real vectors; empty cells come back as NaN and are mapped to None.
df['ada_vector'] = df['ada_vector'].map(
    lambda cell: np.array(json.loads(cell)) if isinstance(cell, str) else None
)

user_query = input("Enter query - ")

# Embed the query once and reuse the result. get_embeddings() returns None
# when no token has a vector, so guard before indexing into the response.
query_response = get_embeddings(user_query, nlp)
searchvector = query_response["data"][0]["embedding"] if query_response else None

if searchvector is not None:
    print(f"Embedding for '{user_query}': {searchvector}")
else:
    print(f"No embeddings found for words in '{user_query}'.")
    # Without a query vector there is nothing to rank against; stop here
    # instead of failing later inside the similarity computation.
    raise SystemExit(1)



from sklearn.metrics.pairwise import cosine_similarity


def _as_array(cell):
    """Convert list/ndarray cells to an ndarray; pass anything else through."""
    if isinstance(cell, (list, np.ndarray)):
        return np.array(cell)
    return cell


def _query_similarity(vector):
    """Cosine similarity between one stored vector and the query vector."""
    return cosine_similarity([vector], [searchvector])[0][0]


# Normalise the stored vectors, then score only the rows that really hold an
# ndarray; every other row is left without a similarity value.
df['ada_vector'] = df['ada_vector'].apply(_as_array)
valid_rows = df['ada_vector'].apply(lambda cell: isinstance(cell, np.ndarray))
scored = df.loc[valid_rows, 'ada_vector'].apply(_query_similarity)
df.loc[valid_rows, 'similarities'] = scored

df.head()

# Highest similarity first; keep the three best rows as context passages.
ranked = df.sort_values('similarities', ascending=False)
result = ranked.head(3)

result.head()

xc = list(result.combined)

def construct_prompt(query, xc, max_contexts=3):
    """Build a question-answering prompt from *query* and context passages.

    Args:
        query: The question text, placed after the context block.
        xc: Sequence of context strings; only the leading entries are used.
        max_contexts: Maximum number of leading entries of *xc* to include.
            Defaults to 3.

    Returns:
        One string made of the instruction header, up to *max_contexts*
        context passages (each followed by a newline), and the trailing
        ``Q:`` / ``A:`` scaffold.
    """
    # Slice instead of indexing 0..2 so fewer than three passages (or none)
    # no longer raises IndexError.
    context = "".join(passage + "\n" for passage in xc[:max_contexts])
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    return header + context + "\n\n Q: " + query + "\n A:"



from transformers import pipeline

summarizer = pipeline("summarization")

# The assembled prompt is kept in `Fresult`; note that it is not what gets
# passed to the summarizer below, which receives the raw context passages.
Fresult = construct_prompt(user_query, xc)

# Keep the pipeline output and print it: as a bare expression the summary
# was computed and then thrown away when this file runs as a script.
summary = summarizer("\n".join(xc), max_length=130, min_length=30, do_sample=False)
print(summary[0]["summary_text"])