# project2/Relevant_Documents_Agent.py
import os

import numpy as np
from openai import OpenAI
from pinecone import Pinecone

def get_text_from_document(document):
    # 'document' is a list of Document objects, each with a 'page_content' attribute.
    # Concatenate the page_content of each Document into a single text string.
    text = "".join(doc.page_content for doc in document).replace('\n\n', '\n')
    # 'text' now contains the actual text extracted from the PDF.
    print(f"Total length of text: {len(text)} characters")
    # Preview part of the extracted text; adjust the slice to inspect more of it.
    print(text[:1000])
    return text

# Get the embedding of a text using the OpenAI text-embedding-ada-002 model
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    # Read the API key from the environment rather than hardcoding a secret in source.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    return client.embeddings.create(input=[text], model=model).data[0].embedding

# Query the Pinecone vector store and return the top-k results
def query_pinecone_vector_store(query, top_k=5):
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    # Read the API key from the environment rather than hardcoding a secret in source.
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    pinecone_index_name = "ee596llm-project2"
    index = pc.Index(pinecone_index_name)
    # Query the Pinecone index with the generated embedding
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    # Extract and return the most relevant documents as (id, score, text) tuples
    relevant_docs = [
        (result['id'], result['score'], result['metadata']['text'])
        for result in query_results['matches']
    ]
    return relevant_docs

def get_completion(prompt, model="gpt-3.5-turbo"):
    message = {"role": "user", "content": prompt}
    # Read the API key from the environment rather than hardcoding a secret in source.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model=model,
        messages=[message]
    )
    return response.choices[0].message.content

def generate_answer_with_context(query, results):
    # Construct the prompt with the top-k results as context.
    # Each result is an (id, score, text) tuple, so the text is result[2].
    context_texts = "\n\n".join(
        [f"Context {idx + 1}: {result[2]}" for idx, result in enumerate(results)])
    print(f"context_texts is: {context_texts}\n\n\n")
    prompt = f"Given the following contexts related to the query '{query}', provide a detailed answer:\n\n{context_texts}\n\nAnswer the query:"
    # Generate the answer using the GPT-3.5 Turbo model with the constructed prompt
    answer = get_completion(prompt, model="gpt-3.5-turbo")
    return answer

class Relevant_Documents_Agent:
    def __init__(self, openai_client) -> None:
        self.openai_client = openai_client

    def get_relevance(self, conversation) -> str:
        # Retrieve the top-k documents relevant to the conversation and
        # generate an answer grounded in that retrieved context.
        top_k_results = query_pinecone_vector_store(conversation, top_k=4)
        answer = generate_answer_with_context(conversation, top_k_results)
        return answer
    def compute_cosine_similarity(self, vec1, vec2):
        # Ensure the vectors are numpy arrays for mathematical operations
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        # Compute the cosine similarity: dot(v1, v2) / (||v1|| * ||v2||)
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity
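

# A minimal usage sketch, not part of the original assignment code. It assumes
# OPENAI_API_KEY and PINECONE_API_KEY are set in the environment and that the
# "ee596llm-project2" index has already been populated with embedded document
# chunks; the sample query below is purely illustrative.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    agent = Relevant_Documents_Agent(client)

    # Retrieve the top matches for a sample query and print ids, scores,
    # and a short preview of each matched chunk.
    sample_query = "What is retrieval-augmented generation?"
    for doc_id, score, text in query_pinecone_vector_store(sample_query, top_k=3):
        print(f"{doc_id} (score={score:.3f}): {text[:80]}...")

    # Generate an answer grounded in the retrieved context.
    print(agent.get_relevance(sample_query))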