import io
import os
import re
from typing import List

import docx2txt
import streamlit as st
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFaceEndpoint

from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

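# Configuration: the Hugging Face token is read from the environment; the
# client-secret path below is the original placeholder and should point at
# your own Google OAuth client JSON.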
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
CLIENT_SECRET_FILE = 'path/to/client_secret.json'

def authenticate_google_drive():
    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
    creds = flow.run_local_server(port=0)
    return build('drive', 'v3', credentials=creds)

def get_file_from_google_drive(drive_service, file_id):
    request = drive_service.files().get_media(fileId=file_id)
    file_content = request.execute()
    return file_content

def extract_text_from_pdf(pdf_content: bytes) -> str:
    # pdfminer expects a path or file-like object, so wrap the raw bytes.
    return extract_text(io.BytesIO(pdf_content))

def extract_text_from_doc(doc_content: bytes) -> str:
    # docx2txt reads .docx via zipfile, which also accepts a file-like object.
    return docx2txt.process(io.BytesIO(doc_content))

def preprocess_text(text):
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def process_files(file_contents: List[bytes]) -> str:
    """Extract, preprocess, and concatenate text from raw PDF/DOCX bytes."""
    all_text = ""
    for file_content in file_contents:
        # Both Drive downloads and uploads arrive as bytes, so an isinstance
        # check cannot tell the formats apart; dispatch on the file signature
        # instead (PDFs start with %PDF, .docx files are ZIP archives).
        if file_content.startswith(b"%PDF"):
            extracted_text = extract_text_from_pdf(file_content)
        else:
            extracted_text = extract_text_from_doc(file_content)
        all_text += preprocess_text(extracted_text) + " "
    return all_text

def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved chunk against the query with cosine similarity."""
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    query_embedding = model.encode([query])        # shape (1, dim)
    doc_embeddings = model.encode(retrieved_docs)  # shape (n_docs, dim)
    # A raw dot product over unnormalized embeddings is not cosine similarity;
    # use sklearn's normalized implementation instead.
    cosine_scores = cosine_similarity(doc_embeddings, query_embedding).flatten()
    return [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores)]

def answer_query_with_similarity(query, file_contents):
    try:
        all_text = process_files(file_contents)

        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_text(all_text)

        # Build the vector store once and query it directly; re-opening the
        # same persisted store twice, as before, added nothing.
        vector_store = Chroma.from_texts(texts, embeddings,
                                         collection_metadata={"hnsw:space": "cosine"},
                                         persist_directory="stores/insurance_cosine")
        print("Vector DB Successfully Created!")

        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")

        if not docs:
            print("No documents match the query.")
            return None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content[:200]}...")

        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument Score: {score['score']}")

        all_docs_content = " ".join(docs_content)

        template = """
                ### [INST] Instruction: Analyze the provided PDF and DOC documents, focusing specifically on extracting factual content, mathematical data, and crucial information relevant to device specifications, including descriptions. Utilize the RAG model's retrieval capabilities to ensure accuracy and minimize the risk of hallucinations in the generated content. Present the findings in a structured and clear format, incorporating:
                    Device Specifications: List all relevant device specifications, including batch numbers, ensuring accuracy and attention to detail.
                    Mathematical Calculations: Perform and report any necessary mathematical calculations found within the documents, providing step-by-step explanations to ensure clarity.
                    Numerical Data Analysis: Extract and analyze numerical data from tables included in the documents, summarizing key findings and implications.
                    Factual Information: Highlight crucial factual information extracted from the text, ensuring it is presented in a straightforward and understandable manner.
                    Ensure the response is well-organized, using bullet points or numbered lists where applicable, to enhance readability and presentation. Avoid any form of hallucination by cross-referencing facts with the document content directly.
                ### Docs : {docs}
                ### Question : {question}
                """
        # Leave {docs} and {question} for the chain to fill in; pre-formatting
        # the string left the PromptTemplate with no input variables, so the
        # later run(question=...) call would fail.
        prompt = PromptTemplate.from_template(template)

        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm = HuggingFaceEndpoint(repo_id=repo_id,
                                  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                                  temperature=0.1,
                                  top_p=0.15,
                                  max_new_tokens=512,
                                  repetition_penalty=1.1)
        llm_chain = LLMChain(prompt=prompt, llm=llm)

        answer = llm_chain.run(docs=all_docs_content, question=query)
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")

        return cleaned_answer
    except Exception as e:
        print("An error occurred while answering the query:", str(e))
        return None

def main():
    st.title("Document Query App")

    # Let the user choose where the documents come from.
    source = st.radio("Choose a document source", ("Google Drive", "Upload Files"))

    if source == "Google Drive":
        # Authenticate with Google Drive
        drive_service = authenticate_google_drive()

        # Get file IDs from user input
        file_ids = st.text_input("Enter the file IDs (comma-separated):")
        # Skip empty entries so a blank input does not yield a bogus "" ID.
        file_ids = [file_id.strip() for file_id in file_ids.split(",") if file_id.strip()]

        # Get file contents from Google Drive
        file_contents = []
        for file_id in file_ids:
            file_content = get_file_from_google_drive(drive_service, file_id)
            file_contents.append(file_content)
    else:
        # Allow user to upload files directly
        uploaded_files = st.file_uploader("Upload files", type=["pdf", "docx"], accept_multiple_files=True)
        file_contents = [file.read() for file in (uploaded_files or [])]

    query = st.text_input("Enter your query:")

    if st.button("Get Answer"):
        if file_contents and query:
            answer = answer_query_with_similarity(query, file_contents)
            if answer:
                st.write("Answer:", answer)
            else:
                st.write("No answer found.")
        else:
            st.write("Please provide files and a query.")

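# Launch with `streamlit run <this_file>.py`; executing it with plain `python`
# calls main() but does not start the Streamlit server.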
if __name__ == "__main__":
    main()