import os
import numpy as np
from tqdm.notebook import tqdm
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
def get_text_from_document(document):
    # Concatenate the loader output into a single text string. If the loader
    # returned Document objects, join their page_content; otherwise assume a
    # list of plain strings.
    if document and hasattr(document[0], "page_content"):
        text = "".join(doc.page_content for doc in document)
    else:
        text = "".join(document)
    text = text.replace('\n\n', '\n')
    print(f"Total length of text: {len(text)} characters")
    print(text[:1000])  # Preview the first 1000 characters; adjust as needed
    return text
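# Usage sketch (not from the original): load a PDF with the UnstructuredPDFLoader
# imported above and flatten it into one string; "example.pdf" is a hypothetical path.
# loader = UnstructuredPDFLoader("example.pdf")
# pages = loader.load()
# full_text = get_text_from_document(pages)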
# Get the embedding of a text string using the OpenAI text-embedding-ada-002 model
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # read the key from the environment instead of hardcoding it
    return client.embeddings.create(input=[text], model=model).data[0].embedding
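# Quick sanity check (sketch; the query string is illustrative):
# text-embedding-ada-002 returns 1536-dimensional vectors.
# emb = get_embedding("What is retrieval-augmented generation?")
# print(len(emb))  # -> 1536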
# Query the Pinecone vector store and return the top-k results
def query_pinecone_vector_store(query, top_k=5):
    # Generate an embedding for the query
    query_embedding = get_embedding(query)
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # read the key from the environment instead of hardcoding it
    pinecone_index_name = "ee596llm-project2"
    index = pc.Index(pinecone_index_name)
    # Query the Pinecone index with the generated embedding
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    # Extract and return (id, score, text) tuples for the most relevant chunks
    relevant_docs = [
        (result['id'], result['score'], result['metadata']['text'])
        for result in query_results['matches']
    ]
    return relevant_docs
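# Sketch of the indexing side that query_pinecone_vector_store assumes: chunk the
# document with the RecursiveCharacterTextSplitter imported above, embed each chunk,
# and upsert it with the text stored under the 'text' metadata key (the key the
# query code reads back). Chunk size/overlap and the chunk-id scheme here are
# illustrative assumptions, not taken from the original.
def index_text_in_pinecone(text, index_name="ee596llm-project2"):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index = pc.Index(index_name)
    for i, chunk in enumerate(tqdm(chunks)):
        index.upsert(vectors=[{
            "id": f"chunk-{i}",
            "values": get_embedding(chunk),
            "metadata": {"text": chunk},
        }])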
def get_completion(prompt, model="gpt-3.5-turbo"):
    message = {"role": "user", "content": prompt}
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # read the key from the environment instead of hardcoding it
    response = client.chat.completions.create(
        model=model,
        messages=[message]
    )
    return response.choices[0].message.content
def generate_answer_with_context(query, results):
    # Construct the prompt with the top-k results as context; each result is an
    # (id, score, text) tuple, so the chunk text is result[2]
    context_texts = "\n\n".join(
        [f"Context {idx + 1}: {result[2]}" for idx, result in enumerate(results)])
    print(f"context_texts is : {context_texts} \n\n\n")
    prompt = f"Given the following contexts related to the query '{query}', provide a detailed answer:\n\n{context_texts}\n\nAnswer the query:"
    # Generate the answer using the GPT-3.5 Turbo model with the constructed prompt
    answer = get_completion(prompt, model="gpt-3.5-turbo")
    return answer
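# End-to-end usage sketch (the query string is illustrative):
# results = query_pinecone_vector_store("What does the document conclude?", top_k=4)
# print(generate_answer_with_context("What does the document conclude?", results))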
class Relevant_Documents_Agent:
    def __init__(self, openai_client) -> None:
        # Keep a handle to the OpenAI client for downstream calls
        self.openai_client = openai_client

    def get_relevance(self, conversation) -> str:
        # Retrieve the top-k most similar chunks for the conversation, then
        # answer it with those chunks as context
        top_k_results = query_pinecone_vector_store(conversation, top_k=4)
        answer = generate_answer_with_context(conversation, top_k_results)
        return answer
    def compute_cosine_similarity(self, vec1, vec2):
        # Ensure the vectors are numpy arrays for mathematical operations
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)
        # Cosine similarity: dot product divided by the product of the norms
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)
        return cosine_similarity
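# Usage sketch: the agent takes an OpenAI client handle; retrieval and generation
# go through the module-level helpers above, with keys read from the environment.
# client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# agent = Relevant_Documents_Agent(client)
# print(agent.get_relevance("Summarize the main contribution of the document"))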