"""Semantic search over a CSV of papers using averaged spaCy token vectors.

Notebook-style script: it builds one embedding per row of 'Dronealexa.csv',
caches the result in 'embeddings_chatbot.csv', ranks the rows by cosine
similarity against a user query, and summarizes the best matches.
"""

import json
import os

import numpy as np
import openai
import pandas as pd
import spacy
import en_core_web_sm
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast
from transformers import pipeline

# from openai.embeddings_utils import get_embedding

# Registers `progress_apply` on pandas objects (uses the tqdm.auto bar).
tqdm.pandas()

# NOTE(review): the original comment described this model as carrying GloVe
# embeddings, but the small spaCy pipelines are documented as shipping
# without static word vectors -- confirm, and consider en_core_web_md/lg if
# real word vectors are wanted.
# Loaded once; the original reloaded the same model before every section.
nlp = en_core_web_sm.load()


def custom_embedding(text, model_name="text-embedding-ada-002"):
    """Embed a single string as the mean of its spaCy token vectors.

    Args:
        text: The string to embed.
        model_name: Label copied verbatim into the response's "model" field;
            it does not select the model that is actually used.

    Returns:
        A dict with "data", "model", "object" and "usage" keys whose
        "data"[0]["embedding"] is a list of floats, or None when no token of
        `text` has a vector (for example when `text` is empty).
    """
    doc = nlp(text)
    word_embeddings = [token.vector for token in doc if token.has_vector]
    if not word_embeddings:
        return None

    text_embedding = np.mean(word_embeddings, axis=0)
    # Token counts are approximated by whitespace-separated words.
    word_count = len(text.split())
    return {
        "data": [
            {
                "embedding": text_embedding.tolist(),
                "index": 0,
                "object": "embedding",
            }
        ],
        "model": model_name,
        "object": "list",
        "usage": {
            "prompt_tokens": word_count,
            "total_tokens": word_count,
        },
    }


# Example usage
text = "Rome"
response = custom_embedding(text)
# custom_embedding returns None (not a dict) when nothing could be embedded,
# so the response itself has to be checked before it is indexed.
if response is not None and response["data"][0]["embedding"] is not None:
    print(f"Custom Embedding for '{text}': {response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{text}'.")
print(response)


def custom_embedding(text_list, model_name="text-embedding-ada-002"):
    """Embed several strings; list-based variant that replaces the one above.

    Args:
        text_list: Iterable of strings to embed.
        model_name: Label copied verbatim into the response's "model" field.

    Returns:
        A dict with one "data" entry per input, in input order. An entry's
        "embedding" is a list of floats, or None when no token of that text
        has a vector.
    """
    embeddings = []
    for text in text_list:
        doc = nlp(text)
        word_embeddings = [token.vector for token in doc if token.has_vector]
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0).tolist())
        else:
            embeddings.append(None)

    # Token counts are approximated by whitespace-separated words.
    word_count = sum(len(text.split()) for text in text_list)
    return {
        "data": [
            {"embedding": emb, "index": idx, "object": "embedding"}
            for idx, emb in enumerate(embeddings)
        ],
        "model": model_name,
        "object": "list",
        "usage": {
            "prompt_tokens": word_count,
            "total_tokens": word_count,
        },
    }


# Example usage
text = ["She is running", "Fitness is good", "I am hungry", "Basketball is healthy"]
response = custom_embedding(text)

for idx, embedding in enumerate(response["data"]):
    if embedding["embedding"] is not None:
        print(f"Custom Embedding for '{text[idx]}': {embedding['embedding']}")
    else:
        print(f"No embeddings found for words in '{text[idx]}'.")
print(response)

emb1 = response['data'][0]['embedding']
emb2 = response['data'][1]['embedding']
emb3 = response['data'][2]['embedding']
emb4 = response['data'][3]['embedding']

# Bare expressions below are notebook display cells; they print nothing when
# the file is run as a plain script.
np.dot(emb1, emb2)
np.dot(emb2, emb4)

df = pd.read_csv('Dronealexa.csv')
df = df.dropna()
df.info()
df.head()

df['combined'] = (
    "Title: " + df['Title'].str.strip()
    + "; URL: " + df['URL'].str.strip()
    + "; Publication Year: " + df['Publication Year'].astype(str).str.strip()
    + "; Abstract: " + df['Abstract'].str.strip()
)
df.head()

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
df['n_tokens'] = df.combined.progress_apply(lambda x: len(tokenizer.encode(x)))
df = df[df.n_tokens < 8000]
df.info()
df.head()


def get_embeddings(text, model):
    """Embed a single string with the given spaCy pipeline.

    Args:
        text: The string to embed.
        model: A loaded spaCy pipeline; it is called on `text` and its
            meta["name"] is reported in the response.

    Returns:
        A dict with "data", "model", "object" and "usage" keys whose
        "data"[0]["embedding"] is a list of floats, or None when no token of
        `text` has a vector (for example when `text` is empty).
    """
    doc = model(text)
    word_embeddings = [token.vector for token in doc if token.has_vector]
    if not word_embeddings:
        return None

    text_embedding = np.mean(word_embeddings, axis=0)
    return {
        "data": [
            {
                "embedding": text_embedding.tolist(),
                "index": 0,
                "object": "embedding",
            }
        ],
        "model": model.meta["name"],
        "object": "list",
        "usage": {
            # Whitespace word count vs. spaCy token count.
            "prompt_tokens": len(text.split()),
            "total_tokens": len(doc),
        },
    }


# Example usage
input_text = "Your input text goes here"
custom_model = nlp  # You can replace this with any other spaCy model

text_to_process = input_text
response = get_embeddings(text_to_process, custom_model)

# get_embeddings returns None when nothing could be embedded.
if response is not None and response["data"][0]["embedding"] is not None:
    print(f"Custom Embedding for '{text_to_process}': {response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{text_to_process}'.")
print(response)

# Rebinds `tqdm` to the plain console progress bar for the loop below.
from tqdm import tqdm

batch_size = 2000
model_name = 'text-embedding-ada-002'

# dropna() and the n_tokens filter leave gaps in the index. The original
# wrote results with df.loc[j, ...] using positional j as a label, which
# put embeddings on the wrong rows (or created new ones). Resetting the
# index and assigning one list in row order keeps every vector on its row.
df = df.reset_index(drop=True)
combined_texts = df.combined.tolist()

all_embeddings = []
for i in tqdm(range(0, len(combined_texts), batch_size)):
    for text in combined_texts[i:i + batch_size]:
        response = get_embeddings(text, nlp)
        if response is not None:
            all_embeddings.append(response["data"][0]["embedding"])
        else:
            # No embedding could be computed for this row.
            all_embeddings.append(None)

# Vectors are stored as JSON text so that they survive the CSV round trip
# below; a missing embedding is serialized as "null".
df['ada_vector'] = [json.dumps(vector) for vector in all_embeddings]

df.head()
df.info()

df.to_csv('embeddings_chatbot.csv', index=False)
df = pd.read_csv('embeddings_chatbot.csv')


def _parse_vector(cell):
    """Turn a JSON-encoded vector cell back into an array (None if missing)."""
    # "null" and empty cells come back from read_csv as NaN, not as str.
    if not isinstance(cell, str):
        return None
    values = json.loads(cell)
    return None if values is None else np.array(values)


# After read_csv every vector is a string again. The original left them as
# strings, so no row passed the ndarray check below and no similarity was
# ever computed.
df['ada_vector'] = df['ada_vector'].progress_apply(_parse_vector)

user_query = input("Enter query - ")
query_response = get_embeddings(user_query, nlp)

if query_response is not None and query_response["data"][0]["embedding"] is not None:
    print(f"Embedding for '{user_query}': {query_response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{user_query}'.")
    # Without a query vector there is nothing to rank against; the original
    # failed here with a TypeError on the None response.
    raise SystemExit(1)

# custom_model is the same pipeline object as nlp, so the response computed
# above is reused instead of embedding the query a second time.
searchvector = query_response["data"][0]["embedding"]

# Only rows that actually hold a vector can be scored.
valid_rows = df['ada_vector'].apply(lambda x: isinstance(x, np.ndarray))

# Create the column up front so that sorting works even if no row is valid.
df['similarities'] = np.nan
df.loc[valid_rows, 'similarities'] = df.loc[valid_rows, 'ada_vector'].apply(
    lambda x: cosine_similarity([x], [searchvector])[0][0]
)

df.head()

# Rows without a similarity score (NaN) sort to the end.
ranked = df.sort_values('similarities', ascending=False)
result = ranked.head(3)
result.head()

xc = list(result.combined)


def construct_prompt(query, xc):
    """Build a question-answering prompt from a query and context passages.

    Args:
        query: The user's question.
        xc: Sequence of context strings; at most the first three are used.

    Returns:
        The instruction header, the context passages (one per line), and the
        question, concatenated into a single prompt string.
    """
    # Slicing tolerates fewer than three passages; the original indexed
    # xc[0..2] directly and raised IndexError on shorter inputs.
    context = "".join(passage + "\n" for passage in xc[:3])

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    header += context + "\n\n Q: " + query + "\n A:"
    return header


summarizer = pipeline("summarization")

Fresult = construct_prompt(user_query, xc)
summarizer("\n".join(xc), max_length=130, min_length=30, do_sample=False)