Spaces:

Penality
/

pdf-something

Sleeping

App Files Files Community

Penality committed on Feb 25

Commit

fecb931

verified ·

1 Parent(s): 5f799ae

Update app.py

Browse files

Updated the store-data method to generate embeddings and a FAISS index and pass them back to the Flask backend

Files changed (1) hide show

app.py +34 -56

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import numpy as np
 import re
 import unicodedata
 from dotenv import load_dotenv
 load_dotenv()
@@ -25,51 +26,34 @@ embedding_model = SentenceTransformer(
     trust_remote_code=True  # Allow remote code execution
 )
-# Define dataset storage folder
-DATASET_DIR = "/home/user/.cache/huggingface/datasets/my_documents"
-os.makedirs(DATASET_DIR, exist_ok=True)  # Ensure directory exists
-# Define file paths inside dataset folder
-INDEX_FILE = os.path.join(DATASET_DIR, "faiss_index.bin")  # FAISS index file
-METADATA_FILE = os.path.join(DATASET_DIR, "metadata.json")  # Metadata file
 embedding_dim = 768  # Adjust according to model
-# Initialize FAISS index
-index = faiss.IndexFlatL2(embedding_dim)
-# Debugging: Check working directory and available files
-print("Current working directory:", os.getcwd())
-print("Files in dataset directory:", os.listdir(DATASET_DIR))
-# Load FAISS index if it exists
-if os.path.exists(INDEX_FILE):
-    print(" FAISS index file exists")
-    index = faiss.read_index(INDEX_FILE)
-else:
-    print(" No FAISS index found. Creating a new one.")
-    index = faiss.IndexFlatL2(embedding_dim)  # Empty FAISS index
-# Load metadata
-if os.path.exists(METADATA_FILE):
-    print(" Metadata file exists")
-    with open(METADATA_FILE, "r") as f:
-        metadata = json.load(f)
-else:
-    metadata = {}
-def store_document(text):
     print(" Storing document...")
-    # Generate a unique filename inside the dataset folder
-    doc_id = len(metadata) + 1
-    filename = os.path.join(DATASET_DIR, f"doc_{doc_id}.txt")
-    print(f"Saving document at: {filename}")
-    # Save document to file
-    with open(filename, "w", encoding="utf-8") as f:
-        f.write(text)
-    print(" Document saved")
     # Generate and store embedding
     embedding = embedding_model.encode([text]).astype(np.float32)
@@ -80,16 +64,10 @@ def store_document(text):
     doc_index = index.ntotal - 1
     # Update metadata with FAISS index
-    metadata[str(doc_index)] = filename
-    with open(METADATA_FILE, "w") as f:
-        json.dump(metadata, f)
     print(" Saved Metadata")
-    # Save FAISS index
-    faiss.write_index(index, INDEX_FILE)
-    print(" FAISS index saved")
-    return f"Document stored at: {filename}"
 def retrieve_document(query):
     print(f"Retrieving document based on:\n{query}")
@@ -112,7 +90,6 @@ def retrieve_document(query):
     with open(filename, "r", encoding="utf-8") as f:
         return f.read()
 def clean_text(text):
     """Cleans extracted text for better processing by the model."""
     print("cleaning")
@@ -143,12 +120,7 @@ def chatbot(pdf_file, user_question):
     """Processes the PDF and answers the user's question."""
     print("chatbot start")
-    if pdf_file:
-        # Extract text from the PDF
-        text = extract_text_from_pdf(pdf_file)
-        if not text:
-            return "Could not extract any text from the PDF."
     # retrieve the document relevant to the query
     doc = retrieve_document(user_question)
@@ -195,7 +167,13 @@ iface = gr.TabbedInterface(
             fn=helloWorld,
             inputs="text",
             outputs="text",
-        )
     ]
 )

 import re
 import unicodedata
 from dotenv import load_dotenv
+from flask import jsonify
 load_dotenv()
     trust_remote_code=True  # Allow remote code execution
 )
 embedding_dim = 768  # Adjust according to model
+def store_document_data(PDF_FILE, METADATA_FILE, INDEX_FILE):
     print(" Storing document...")
+    if PDF_FILE:
+        # Extract text from the PDF
+        text = extract_text_from_pdf(PDF_FILE)
+        if not text:
+            return "Could not extract any text from the PDF."
+    if METADATA_FILE:
+         # extract metadata
+        print(" Metadata file exists")
+        with open(METADATA_FILE, "r") as f:
+            metadata = json.load(f)
+    else:
+        print("metadata_file is empty")
+        metadata = {}
+    if INDEX_FILE:
+        # extract Faiss
+        print("index_file recieved")
+        index = faiss.read_index(INDEX_FILE)
+    else:
+        print(" No FAISS index found. Creating a new one.")
+        index = faiss.IndexFlatL2(embedding_dim)  # Empty FAISS index
     # Generate and store embedding
     embedding = embedding_model.encode([text]).astype(np.float32)
     doc_index = index.ntotal - 1
     # Update metadata with FAISS index
+    metadata[str(doc_index)] = PDF_FILE
     print(" Saved Metadata")
+    return jsonify({"metadata" : metadata, "index" : index})
 def retrieve_document(query):
     print(f"Retrieving document based on:\n{query}")
     with open(filename, "r", encoding="utf-8") as f:
         return f.read()
 def clean_text(text):
     """Cleans extracted text for better processing by the model."""
     print("cleaning")
     """Processes the PDF and answers the user's question."""
     print("chatbot start")
     # retrieve the document relevant to the query
     doc = retrieve_document(user_question)
             fn=helloWorld,
             inputs="text",
             outputs="text",
+        ),
+        gr.Interface(
+            fn=store_document_data,
+            inputs=[gr.File(label="Upload PDF"), gr.file(label="Upload metadata"), gr.file(label="upload index")],
+            outputs=gr.Textbox(label="Answer"),
+            title="pdf file, metadata, index parsing and storing",
+        ),
     ]
 )