Luciferalive committed on
Commit 364d4fe · verified · 1 Parent(s): 540b20d

Update app.py

Files changed (1)
  1. app.py +49 -23
app.py CHANGED
@@ -19,14 +19,29 @@ import re
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
 
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+CLIENT_SECRET_FILE = 'path/to/client_secret.json'
 
-def extract_text_from_pdf(pdf_path):
-    return extract_text(pdf_path)
+def authenticate_google_drive():
+    flow = InstalledAppFlow.from_client_secrets_file(CLIENT_SECRET_FILE, GOOGLE_DRIVE_SCOPES)
+    creds = flow.run_local_server(port=0)
+    return build('drive', 'v3', credentials=creds)
 
-def extract_text_from_doc(doc_path):
-    return docx2txt.process(doc_path)
+def get_file_from_google_drive(drive_service, file_id):
+    request = drive_service.files().get_media(fileId=file_id)
+    file_content = request.execute()
+    return file_content
+
+def extract_text_from_pdf(pdf_content):
+    return extract_text(pdf_content)
+
+def extract_text_from_doc(doc_content):
+    return docx2txt.process(doc_content)
 
 def preprocess_text(text):
     text = text.replace('\n', ' ').replace('\r', ' ')
@@ -36,17 +51,13 @@ def preprocess_text(text):
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 
-def process_files(file_paths: List[str]):
+def process_files(file_contents: List[str]):
     all_text = ""
-    for file_path in file_paths:
-        print(file_path)
-        if file_path.endswith(".pdf"):
-            extracted_text = extract_text_from_pdf(file_path)
-        elif file_path.endswith(".doc") or file_path.endswith(".docx"):
-            extracted_text = extract_text_from_doc(file_path)
+    for file_content in file_contents:
+        if isinstance(file_content, bytes):
+            extracted_text = extract_text_from_pdf(file_content)
         else:
-            print(f"Unsupported file type: {file_path}")
-            continue
+            extracted_text = extract_text_from_doc(file_content)
         preprocessed_text = preprocess_text(extracted_text)
         all_text += preprocessed_text + " "
     return all_text
@@ -59,9 +70,9 @@ def compute_cosine_similarity_scores(query, retrieved_docs):
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
 
-def answer_query_with_similarity(query, file_paths):
+def answer_query_with_similarity(query, file_contents):
     try:
-        all_text = process_files(file_paths)
+        all_text = process_files(file_contents)
 
         embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
@@ -91,13 +102,11 @@ def answer_query_with_similarity(query, file_paths):
 
         template = """
         ### [INST] Instruction:Analyze the provided PDF and DOC documents focusing specifically on extracting factual content, mathematical data, and crucial information relevant to device specifications, including discription. Utilize the RAG model's retrieval capabilities to ensure accuracy and minimize the risk of hallucinations in the generated content. Present the findings in a structured and clear format, incorporating:
-
         Device Specifications: List all relevant device specifications, including batch numbers, ensuring accuracy and attention to detail.
         Mathematical Calculations: Perform and report any necessary mathematical calculations found within the documents, providing step-by-step explanations to ensure clarity.
         Numerical Data Analysis: Extract and analyze numerical data from tables included in the documents, summarizing key findings and implications.
         Factual Information: Highlight crucial factual information extracted from the text, ensuring it is presented in a straightforward and understandable manner.
         Ensure the response is well-organized, using bullet points or numbered lists where applicable, to enhance readability and presentation. Avoid any form of hallucination by cross-referencing facts with the document content directly.
-
         ### Docs : {docs}
         ### Question : {question}
         """
@@ -123,21 +132,38 @@ def answer_query_with_similarity(query, file_paths):
 def main():
     st.title("Document Query App")
 
-    # Get user inputs
-    file_paths = st.text_input("Enter the file paths (comma-separated):")
-    file_paths = [path.strip() for path in file_paths.split(",")]
+    # Get user input for authentication method
+    auth_method = st.radio("Choose authentication method", ("Google Drive", "Upload Files"))
+
+    if auth_method == "Google Drive":
+        # Authenticate with Google Drive
+        drive_service = authenticate_google_drive()
+
+        # Get file IDs from user input
+        file_ids = st.text_input("Enter the file IDs (comma-separated):")
+        file_ids = [file_id.strip() for file_id in file_ids.split(",")]
+
+        # Get file contents from Google Drive
+        file_contents = []
+        for file_id in file_ids:
+            file_content = get_file_from_google_drive(drive_service, file_id)
+            file_contents.append(file_content)
+    else:
+        # Allow user to upload files directly
+        uploaded_files = st.file_uploader("Upload files", accept_multiple_files=True)
+        file_contents = [file.read() for file in uploaded_files]
 
     query = st.text_input("Enter your query:")
 
     if st.button("Get Answer"):
-        if file_paths and query:
-            response = answer_query_with_similarity(query, file_paths)
+        if file_contents and query:
+            response = answer_query_with_similarity(query, file_contents)
             if response:
                 st.write("Answer:", response[0])
             else:
                 st.write("No answer found.")
         else:
-            st.write("Please provide file paths and a query.")
+            st.write("Please provide files and a query.")
 
 if __name__ == "__main__":
     main()
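A note on the authentication added above: authenticate_google_drive relies on InstalledAppFlow.run_local_server(port=0), which opens a consent page in a local browser. That works on a developer machine but not on a headless host running the Streamlit app. If the documents live in a Drive folder the app itself owns, a non-interactive service-account credential is a common alternative; a minimal sketch under that assumption (SERVICE_ACCOUNT_FILE and build_drive_service are illustrative names, not part of this commit):

from google.oauth2 import service_account
from googleapiclient.discovery import build

GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
SERVICE_ACCOUNT_FILE = 'path/to/service_account.json'  # hypothetical credentials path, not in this commit

def build_drive_service():
    # Non-interactive authentication suited to a headless deployment.
    creds = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=GOOGLE_DRIVE_SCOPES)
    return build('drive', 'v3', credentials=creds)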
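For the download step, files().get_media(fileId=...).execute() returns the raw bytes of a binary file, which is what get_file_from_google_drive relies on. For larger files, the google-api-python-client pattern is to stream the response in chunks with MediaIoBaseDownload; a hedged drop-in variant (download_drive_file is an illustrative name, assuming the same drive_service as above):

import io
from googleapiclient.http import MediaIoBaseDownload

def download_drive_file(drive_service, file_id: str) -> bytes:
    # Stream the file in chunks instead of buffering it in a single execute() call.
    request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    return buffer.getvalue()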
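On the extraction side, pdfminer's extract_text and docx2txt.process accept a file path or a binary file-like object rather than a plain bytes value, and both the Drive download and st.file_uploader(...).read() yield bytes for PDFs and DOCX files alike, so isinstance(file_content, bytes) on its own cannot tell the two formats apart. A minimal sketch of wrapping the bytes and sniffing the format by magic numbers instead (extract_text_from_bytes is an illustrative helper, not part of this commit):

import io
import docx2txt
from pdfminer.high_level import extract_text

def extract_text_from_bytes(file_content: bytes) -> str:
    # PDFs start with "%PDF"; DOCX files are ZIP archives and start with "PK".
    buffer = io.BytesIO(file_content)
    if file_content[:4] == b"%PDF":
        return extract_text(buffer)      # pdfminer accepts a binary file-like object
    if file_content[:2] == b"PK":
        return docx2txt.process(buffer)  # docx2txt hands the object to zipfile.ZipFile
    raise ValueError("Unsupported file content: expected PDF or DOCX bytes")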