Spaces:

Luciferalive
/

Rag-v10

Sleeping

App Files Files Community

Luciferalive committed on May 13, 2024

Commit

364d4fe

verified ·

1 Parent(s): 540b20d

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -23

app.py CHANGED Viewed

@@ -19,14 +19,29 @@ import re
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
-def extract_text_from_pdf(pdf_path):
-    return extract_text(pdf_path)
-def extract_text_from_doc(doc_path):
-    return docx2txt.process(doc_path)
 def preprocess_text(text):
     text = text.replace('\n', ' ').replace('\r', ' ')
@@ -36,17 +51,13 @@ def preprocess_text(text):
     text = re.sub(r'\s+', ' ', text).strip()
     return text
-def process_files(file_paths: List[str]):
     all_text = ""
-    for file_path in file_paths:
-        print(file_path)
-        if file_path.endswith(".pdf"):
-            extracted_text = extract_text_from_pdf(file_path)
-        elif file_path.endswith(".doc") or file_path.endswith(".docx"):
-            extracted_text = extract_text_from_doc(file_path)
         else:
-            print(f"Unsupported file type: {file_path}")
-            continue
         preprocessed_text = preprocess_text(extracted_text)
         all_text += preprocessed_text + " "
     return all_text
@@ -59,9 +70,9 @@ def compute_cosine_similarity_scores(query, retrieved_docs):
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
-def answer_query_with_similarity(query, file_paths):
     try:
-        all_text = process_files(file_paths)
         embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
@@ -91,13 +102,11 @@ def answer_query_with_similarity(query, file_paths):
         template = """
                 ### [INST] Instruction:Analyze the provided PDF and DOC documents focusing specifically on extracting factual content, mathematical data, and crucial information relevant to device specifications, including discription. Utilize the RAG model's retrieval capabilities to ensure accuracy and minimize the risk of hallucinations in the generated content. Present the findings in a structured and clear format, incorporating:
                     Device Specifications: List all relevant device specifications, including batch numbers, ensuring accuracy and attention to detail.
                     Mathematical Calculations: Perform and report any necessary mathematical calculations found within the documents, providing step-by-step explanations to ensure clarity.
                     Numerical Data Analysis: Extract and analyze numerical data from tables included in the documents, summarizing key findings and implications.
                     Factual Information: Highlight crucial factual information extracted from the text, ensuring it is presented in a straightforward and understandable manner.
                     Ensure the response is well-organized, using bullet points or numbered lists where applicable, to enhance readability and presentation. Avoid any form of hallucination by cross-referencing facts with the document content directly.
                 ### Docs : {docs}
                 ### Question : {question}
                 """
@@ -123,21 +132,38 @@ def answer_query_with_similarity(query, file_paths):
 def main():
     st.title("Document Query App")
-    # Get user inputs
-    file_paths = st.text_input("Enter the file paths (comma-separated):")
-    file_paths = [path.strip() for path in file_paths.split(",")]
     query = st.text_input("Enter your query:")
     if st.button("Get Answer"):
-        if file_paths and query:
-            response = answer_query_with_similarity(query, file_paths)
             if response:
                 st.write("Answer:", response[0])
             else:
                 st.write("No answer found.")
         else:
-            st.write("Please provide file paths and a query.")
 if __name__ == "__main__":
     main()

 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from googleapiclient.discovery import build
+# Hugging Face Hub token read from the environment (None when the variable is unset).
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+# OAuth scopes requested when authenticating against Google Drive.
+GOOGLE_DRIVE_SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
+# NOTE(review): this looks like a placeholder path - confirm it is replaced with
+# the real OAuth client secrets file before deployment.
+CLIENT_SECRET_FILE = 'path/to/client_secret.json'
+def authenticate_google_drive():
+    """Run the installed-app OAuth flow and return a Drive v3 API client.
+
+    The flow is configured from CLIENT_SECRET_FILE and requests the scopes
+    listed in GOOGLE_DRIVE_SCOPES; the resulting credentials are handed to
+    build() to create the service object.
+    """
+    oauth_flow = InstalledAppFlow.from_client_secrets_file(
+        CLIENT_SECRET_FILE,
+        GOOGLE_DRIVE_SCOPES,
+    )
+    return build('drive', 'v3', credentials=oauth_flow.run_local_server(port=0))
+def get_file_from_google_drive(drive_service, file_id):
+    """Download the media content of the Drive file identified by file_id.
+
+    Builds a files().get_media request on drive_service and returns whatever
+    executing that request yields.
+    """
+    return drive_service.files().get_media(fileId=file_id).execute()
+def extract_text_from_pdf(pdf_content):
+    """Extract the text of a PDF via extract_text.
+
+    Args:
+        pdf_content: Raw PDF bytes, or anything extract_text already accepts
+            (presumably a file path or binary file object - the import of
+            extract_text is not visible here; confirm it is pdfminer's).
+
+    Returns:
+        Whatever extract_text returns for the document.
+    """
+    import io
+
+    # Callers now pass downloaded/uploaded content as bytes; a bytes payload
+    # is neither a path nor a file object, so wrap it in an in-memory stream.
+    if isinstance(pdf_content, (bytes, bytearray)):
+        pdf_content = io.BytesIO(pdf_content)
+    return extract_text(pdf_content)
+def extract_text_from_doc(doc_content):
+    """Extract the text of a Word document via docx2txt.process.
+
+    Args:
+        doc_content: Raw document bytes, or anything docx2txt.process already
+            accepts (a file path or binary file object).
+
+    Returns:
+        Whatever docx2txt.process returns for the document.
+    """
+    import io
+
+    # Callers now pass downloaded/uploaded content as bytes; docx2txt opens
+    # its argument as an archive, which needs a path or a seekable stream.
+    if isinstance(doc_content, (bytes, bytearray)):
+        doc_content = io.BytesIO(doc_content)
+    return docx2txt.process(doc_content)
 def preprocess_text(text):
     text = text.replace('\n', ' ').replace('\r', ' ')
     text = re.sub(r'\s+', ' ', text).strip()
     return text
+def process_files(file_contents: List[str]):
+    """Extract, normalise and concatenate the text of every supplied document.
+
+    Args:
+        file_contents: Document payloads. Bytes payloads are routed by their
+            signature: content carrying a ``%PDF-`` header goes to
+            extract_text_from_pdf, everything else to extract_text_from_doc.
+            Non-bytes items are passed to extract_text_from_doc unchanged.
+
+    Returns:
+        The preprocessed text of all documents, each followed by one space.
+    """
+    import io
+
     all_text = ""
+    for file_content in file_contents:
+        if isinstance(file_content, (bytes, bytearray)):
+            # Both Drive downloads and uploads arrive as bytes, so the type
+            # alone cannot tell PDF from Word; look for the PDF header near
+            # the start of the payload instead.
+            buffer = io.BytesIO(file_content)
+            if b"%PDF-" in file_content[:1024]:
+                extracted_text = extract_text_from_pdf(buffer)
+            else:
+                extracted_text = extract_text_from_doc(buffer)
         else:
+            extracted_text = extract_text_from_doc(file_content)
         preprocessed_text = preprocess_text(extracted_text)
         all_text += preprocessed_text + " "
     return all_text
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
+def answer_query_with_similarity(query, file_contents):
     try:
+        all_text = process_files(file_contents)
         embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
         template = """
                 ### [INST] Instruction:Analyze the provided PDF and DOC documents focusing specifically on extracting factual content, mathematical data, and crucial information relevant to device specifications, including discription. Utilize the RAG model's retrieval capabilities to ensure accuracy and minimize the risk of hallucinations in the generated content. Present the findings in a structured and clear format, incorporating:
                     Device Specifications: List all relevant device specifications, including batch numbers, ensuring accuracy and attention to detail.
                     Mathematical Calculations: Perform and report any necessary mathematical calculations found within the documents, providing step-by-step explanations to ensure clarity.
                     Numerical Data Analysis: Extract and analyze numerical data from tables included in the documents, summarizing key findings and implications.
                     Factual Information: Highlight crucial factual information extracted from the text, ensuring it is presented in a straightforward and understandable manner.
                     Ensure the response is well-organized, using bullet points or numbered lists where applicable, to enhance readability and presentation. Avoid any form of hallucination by cross-referencing facts with the document content directly.
                 ### Docs : {docs}
                 ### Question : {question}
                 """
 def main():
+    """Render the Streamlit UI, collect documents and a query, and show the answer.
+
+    Documents come either from Google Drive (by file ID) or from direct
+    uploads. Nothing is authenticated, downloaded or read until the
+    "Get Answer" button is pressed.
+    """
     st.title("Document Query App")
+    # Get user input for authentication method
+    auth_method = st.radio("Choose authentication method", ("Google Drive", "Upload Files"))
+    file_ids = []
+    uploaded_files = []
+    if auth_method == "Google Drive":
+        # Get file IDs from user input
+        raw_file_ids = st.text_input("Enter the file IDs (comma-separated):")
+        # "".split(",") yields [""], so drop blank entries instead of
+        # requesting an empty file ID from the Drive API.
+        file_ids = [file_id.strip() for file_id in raw_file_ids.split(",") if file_id.strip()]
+    else:
+        # Allow user to upload files directly
+        uploaded_files = st.file_uploader("Upload files", accept_multiple_files=True) or []
     query = st.text_input("Enter your query:")
     if st.button("Get Answer"):
+        file_contents = []
+        if query and file_ids:
+            # Streamlit re-runs this script on every widget interaction, so
+            # the blocking OAuth flow and the downloads are deferred until
+            # the user actually asks for an answer.
+            drive_service = authenticate_google_drive()
+            file_contents = [
+                get_file_from_google_drive(drive_service, file_id)
+                for file_id in file_ids
+            ]
+        elif query and uploaded_files:
+            # getvalue() returns the whole payload regardless of the current
+            # stream position, unlike read() after an earlier read.
+            file_contents = [file.getvalue() for file in uploaded_files]
+        if file_contents and query:
+            response = answer_query_with_similarity(query, file_contents)
             if response:
                 st.write("Answer:", response[0])
             else:
                 st.write("No answer found.")
         else:
+            st.write("Please provide files and a query.")
+# Run the app only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
     main()