import os
import string

import numpy as np
import langchain
import openai
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.notebook import tqdm


def get_text_from_document(document):
    # 'document' is expected to be a list of Document objects, each exposing a
    # 'page_content' attribute; plain strings are also accepted as a fallback.
    # Concatenate the content of every item into a single string and collapse
    # the double newlines left over from PDF extraction.
    text = "".join(getattr(doc, "page_content", doc) for doc in document).replace('\n\n', '\n')

    # 'text' now holds the full text extracted from the PDF
    print(f"Total length of text: {len(text)} characters")

    # Print a short preview; adjust the slice to inspect more of the text
    print(text[:1000])

    return text
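

# Sketch (assumption): one way the function above might be fed. The PDF path is a
# caller-supplied placeholder, and UnstructuredPDFLoader is used only because it is
# already imported at the top of this file.
def load_pdf_text(pdf_path):
    loader = UnstructuredPDFLoader(pdf_path)
    documents = loader.load()  # list of Document objects with a page_content attribute
    return get_text_from_document(documents)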


# Function to get the embedding of a text using the OpenAI text-embedding-ada-002 model
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    # Read the API key from the environment instead of hard-coding it in the source
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    return client.embeddings.create(input=[text], model=model).data[0].embedding
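

# Sketch (assumption): how the Pinecone index queried below might be populated.
# This is not part of the original file; the ID scheme, chunk sizes, and metadata
# layout are illustrative guesses. RecursiveCharacterTextSplitter and tqdm are used
# because they are already imported above. The chunk text is stored under the
# "text" metadata key so that queries can return it directly.
def upsert_text_to_pinecone(text, index, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    for i, chunk in enumerate(tqdm(chunks, desc="Upserting chunks")):
        embedding = get_embedding(chunk)
        index.upsert(vectors=[{
            "id": f"chunk-{i}",
            "values": embedding,
            "metadata": {"text": chunk},
        }])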


# Query the Pinecone vector store and return the top-k most relevant results
def query_pinecone_vector_store(query, top_k=5):
    # Generate an embedding for the query
    query_embedding = get_embedding(query)

    # Read the Pinecone API key from the environment instead of hard-coding it
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    pinecone_index_name = "ee596llm-project2"
    index = pc.Index(pinecone_index_name)

    # Query the Pinecone index with the generated embedding
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Extract and return the most relevant documents as (id, score, text) tuples
    relevant_docs = [
        (result['id'], result['score'], result['metadata']['text'])
        for result in query_results['matches']
    ]
    return relevant_docs
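
# Example usage (sketch; the query string is illustrative):
#
#     matches = query_pinecone_vector_store("What is covered in lecture 3?", top_k=3)
#     for doc_id, score, text in matches:
#         print(doc_id, round(score, 3), text[:80])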


def get_completion(prompt, model="gpt-3.5-turbo"):
    message = {"role": "user", "content": prompt}
    # Read the API key from the environment instead of hard-coding it in the source
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.chat.completions.create(
        model=model,
        messages=[message]
    )
    return response.choices[0].message.content


def generate_answer_with_context(query, results):
    # Construct the prompt with the top-k results as context.
    # Each result is an (id, score, text) tuple, so the chunk text is result[2].
    context_texts = "\n\n".join(
        [f"Context {idx + 1}: {result[2]}" for idx, result in enumerate(results)])
    print(f"context_texts is: {context_texts}\n")
    prompt = f"Given the following contexts related to the query '{query}', provide a detailed answer:\n\n{context_texts}\n\nAnswer the query:"

    # Generate the answer using the GPT-3.5 Turbo model with the constructed prompt
    answer = get_completion(prompt, model="gpt-3.5-turbo")

    return answer


class Relevant_Documents_Agent:
    def __init__(self, openai_client) -> None:
        # Store the OpenAI client for use by the agent
        self.openai_client = openai_client

    def get_relevance(self, conversation) -> str:
        # Retrieve the documents most relevant to the conversation and use them
        # as context to generate an answer
        top_k_results = query_pinecone_vector_store(conversation, top_k=4)

        answer = generate_answer_with_context(conversation, top_k_results)

        return answer

    def compute_cosine_similarity(self, vec1, vec2):
        # Ensure the vectors are numpy arrays for mathematical operations
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)

        # Compute the cosine similarity
        dot_product = np.dot(vec1, vec2)
        norm_vec1 = np.linalg.norm(vec1)
        norm_vec2 = np.linalg.norm(vec2)
        cosine_similarity = dot_product / (norm_vec1 * norm_vec2)

        return cosine_similarity
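

# Sketch (assumption): a minimal end-to-end demo, not part of the original file.
# It assumes OPENAI_API_KEY and PINECONE_API_KEY are set in the environment and
# that the "ee596llm-project2" index has already been populated; the question
# string is illustrative.
if __name__ == "__main__":
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    agent = Relevant_Documents_Agent(client)
    question = "What topics does the document cover?"
    print(agent.get_relevance(question))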