Luciferalive committed on
Commit
0175843
·
verified ·
1 Parent(s): b939811

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.chains import LLMChain
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain_community.llms import HuggingFaceEndpoint
5
+ from pdfminer.high_level import extract_text
6
+ import docx2txt
7
+ import os
8
+ import re
9
+ from typing import List
10
+ from langchain.chains import LLMChain
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain_community.llms import HuggingFaceEndpoint
13
+ from pdfminer.high_level import extract_text
14
+ from langchain.vectorstores import Chroma
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.embeddings import SentenceTransformerEmbeddings
17
+ import os
18
+ import re
19
+ from sentence_transformers import SentenceTransformer
20
+ from sklearn.metrics.pairwise import cosine_similarity
21
+ import numpy as np
22
+
23
+
24
# Read the Hugging Face API token from the process environment. The previous
# statement assigned from a name that was never defined, so importing the
# module raised NameError. The value is None when the variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
25
+
26
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` reads from *pdf_path*."""
    pdf_text = extract_text(pdf_path)
    return pdf_text
28
+
29
def extract_text_from_doc(doc_path):
    """Return the text that ``docx2txt.process`` reads from *doc_path*."""
    doc_text = docx2txt.process(doc_path)
    return doc_text
31
+
32
def preprocess_text(text):
    """Normalize raw document text before it is chunked and embedded.

    Each run of non-ASCII characters becomes a single space, the text is
    lower-cased, every character that is neither a word character nor
    whitespace is removed, and whitespace runs (including newlines and
    carriage returns) collapse to one space with both ends trimmed.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only.lower())
    # Splitting on whitespace and re-joining collapses every whitespace run
    # to one space and trims the ends in a single pass.
    return ' '.join(without_punctuation.split())
39
+
40
def process_files(file_paths: List[str]):
    """Extract, clean and concatenate the text of every supported file.

    Args:
        file_paths: Paths to ``.pdf``, ``.doc`` or ``.docx`` files. The
            extension is matched case-insensitively, so ``REPORT.PDF`` is
            handled like ``report.pdf``. Any other path is reported on
            stdout and skipped.

    Returns:
        The preprocessed text of each supported file, each followed by a
        single space, concatenated in input order. An empty string when no
        path is supported.
    """
    cleaned_parts = []
    for file_path in file_paths:
        print(file_path)
        lowered_path = file_path.lower()
        if lowered_path.endswith(".pdf"):
            extracted_text = extract_text_from_pdf(file_path)
        elif lowered_path.endswith((".doc", ".docx")):
            # NOTE(review): .doc is routed to docx2txt as well; legacy binary
            # .doc files may not be readable by it - confirm.
            extracted_text = extract_text_from_doc(file_path)
        else:
            print(f"Unsupported file type: {file_path}")
            continue
        cleaned_parts.append(preprocess_text(extracted_text))
    # Keep the historical shape: every document's text is followed by a space.
    return "".join(f"{part} " for part in cleaned_parts)
54
+
55
def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved document against the query by cosine similarity.

    Args:
        query: The user's question as a string.
        retrieved_docs: List of document strings to score.

    Returns:
        A list of ``{"doc": <text>, "score": <float>}`` dicts in the same
        order as *retrieved_docs*; an empty list when there is nothing to
        score.
    """
    # Nothing to embed: skip loading the model entirely.
    if not retrieved_docs:
        return []

    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    # Ask for NumPy output: the previous code requested torch tensors and then
    # handed them to NumPy, which fails when the tensors live on a GPU.
    query_embedding = np.asarray(
        model.encode(query, convert_to_numpy=True)
    ).reshape(1, -1)
    doc_embeddings = np.asarray(
        model.encode(retrieved_docs, convert_to_numpy=True)
    )
    # cosine_similarity normalizes both sides, so the score is a true cosine
    # rather than a raw dot product. Result shape is (n_docs, 1).
    cosine_scores = cosine_similarity(doc_embeddings, query_embedding)
    return [
        {"doc": doc, "score": float(score)}
        for doc, score in zip(retrieved_docs, cosine_scores.ravel())
    ]
62
+
63
def answer_query_with_similarity(query, file_paths):
    """Answer *query* from the given documents via retrieval plus an LLM.

    The files are extracted and cleaned, split into overlapping chunks,
    indexed in a Chroma store persisted under ``stores/insurance_cosine``,
    and the chunks most similar to the query are passed to a hosted
    Llama-3 instruct model together with the question.

    Args:
        query: The user's question.
        file_paths: Paths of the PDF/DOC/DOCX files to search.

    Returns:
        ``(answer, scores)`` on success, where ``answer`` is the cleaned model
        output and ``scores`` is the list produced by
        ``compute_cosine_similarity_scores``. ``(None, None)`` when no text
        or no matching chunk is found, or when any step raises. Both paths
        now return a 2-tuple; the answer stays at index 0.
    """
    template = """
    ### [INST] Instruction:Analyze the provided PDF and DOC documents focusing specifically on extracting factual content, mathematical data, and crucial information relevant to device specifications, including discription. Utilize the RAG model's retrieval capabilities to ensure accuracy and minimize the risk of hallucinations in the generated content. Present the findings in a structured and clear format, incorporating:

    Device Specifications: List all relevant device specifications, including batch numbers, ensuring accuracy and attention to detail.
    Mathematical Calculations: Perform and report any necessary mathematical calculations found within the documents, providing step-by-step explanations to ensure clarity.
    Numerical Data Analysis: Extract and analyze numerical data from tables included in the documents, summarizing key findings and implications.
    Factual Information: Highlight crucial factual information extracted from the text, ensuring it is presented in a straightforward and understandable manner.
    Ensure the response is well-organized, using bullet points or numbered lists where applicable, to enhance readability and presentation. Avoid any form of hallucination by cross-referencing facts with the document content directly.

    ### Docs : {docs}
    ### Question : {question}
    """
    try:
        all_text = process_files(file_paths)
        # Bail out before building an index when nothing was extracted.
        if not all_text.strip():
            print("No text could be extracted from the given files.")
            return None, None

        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        texts = text_splitter.split_text(all_text)

        # NOTE(review): the store is persisted, so repeated calls presumably
        # append to the same collection - confirm whether that is intended.
        vector_store = Chroma.from_texts(
            texts,
            embeddings,
            collection_metadata={"hnsw:space": "cosine"},
            persist_directory="stores/insurance_cosine",
        )
        print("Vector DB Successfully Created!")

        # Query the store that was just built instead of reopening the same
        # directory twice.
        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")

        if not docs:
            print("No documents match the query.")
            return None, None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content}...")

        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument Score: {score['score']}")

        all_docs_content = " ".join(docs_content)

        # Keep {docs} and {question} as real template variables. Formatting
        # the text first and then parsing the result as a template breaks as
        # soon as the query or a document contains a brace.
        prompt = PromptTemplate.from_template(template)

        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        llm = HuggingFaceEndpoint(
            repo_id=repo_id,
            temperature=0.1,
            # Documented credential parameter; read from the environment so
            # this function does not depend on a module-level global.
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
            top_p=0.15,
            max_new_tokens=512,
            repetition_penalty=1.1,
        )
        llm_chain = LLMChain(prompt=prompt, llm=llm)

        answer = llm_chain.run(docs=all_docs_content, question=query)
        # Keep only what follows the last "Answer:" marker, if the model
        # emitted one.
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")

        return cleaned_answer, cosine_similarity_scores
    except Exception as e:
        # Top-level boundary for the UI: report the failure and signal
        # "no answer" rather than crashing the app.
        print("An error occurred to get the answer: ", str(e))
        return None, None
123
+
124
def main():
    """Render the Streamlit page: collect file paths and a query, show the answer."""
    st.title("Document Query App")

    # Get user inputs
    raw_paths = st.text_input("Enter the file paths (comma-separated):")
    # "".split(",") yields [""], which is truthy; drop blank entries so an
    # empty field is correctly treated as "no paths given".
    file_paths = [path.strip() for path in raw_paths.split(",") if path.strip()]

    query = st.text_input("Enter your query:")

    if st.button("Get Answer"):
        if not (file_paths and query.strip()):
            st.write("Please provide file paths and a query.")
            return

        response = answer_query_with_similarity(query, file_paths)
        # Failure is signalled by a tuple of Nones, which is itself truthy,
        # so inspect the first element rather than the tuple.
        answer = response[0] if response else None
        if answer:
            st.write("Answer:", answer)
        else:
            st.write("No answer found.")
142
+
143
# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()