Penality committed on
Commit
fecb931
·
verified ·
1 Parent(s): 5f799ae

Update app.py

Browse files

Updated the store-data method to generate embeddings and a FAISS index and pass them back to the Flask backend.

Files changed (1) hide show
  1. app.py +34 -56
app.py CHANGED
@@ -9,6 +9,7 @@ import numpy as np
9
  import re
10
  import unicodedata
11
  from dotenv import load_dotenv
 
12
 
13
  load_dotenv()
14
 
@@ -25,51 +26,34 @@ embedding_model = SentenceTransformer(
25
  trust_remote_code=True # Allow remote code execution
26
  )
27
 
28
- # Define dataset storage folder
29
- DATASET_DIR = "/home/user/.cache/huggingface/datasets/my_documents"
30
- os.makedirs(DATASET_DIR, exist_ok=True) # Ensure directory exists
31
-
32
- # Define file paths inside dataset folder
33
- INDEX_FILE = os.path.join(DATASET_DIR, "faiss_index.bin") # FAISS index file
34
- METADATA_FILE = os.path.join(DATASET_DIR, "metadata.json") # Metadata file
35
-
36
  embedding_dim = 768 # Adjust according to model
37
 
38
- # Initialize FAISS index
39
- index = faiss.IndexFlatL2(embedding_dim)
40
-
41
- # Debugging: Check working directory and available files
42
- print("Current working directory:", os.getcwd())
43
- print("Files in dataset directory:", os.listdir(DATASET_DIR))
44
-
45
- # Load FAISS index if it exists
46
- if os.path.exists(INDEX_FILE):
47
- print(" FAISS index file exists")
48
- index = faiss.read_index(INDEX_FILE)
49
- else:
50
- print(" No FAISS index found. Creating a new one.")
51
- index = faiss.IndexFlatL2(embedding_dim) # Empty FAISS index
52
-
53
- # Load metadata
54
- if os.path.exists(METADATA_FILE):
55
- print(" Metadata file exists")
56
- with open(METADATA_FILE, "r") as f:
57
- metadata = json.load(f)
58
- else:
59
- metadata = {}
60
-
61
- def store_document(text):
62
  print(" Storing document...")
63
 
64
- # Generate a unique filename inside the dataset folder
65
- doc_id = len(metadata) + 1
66
- filename = os.path.join(DATASET_DIR, f"doc_{doc_id}.txt")
67
- print(f"Saving document at: {filename}")
 
 
 
 
 
 
 
 
 
 
68
 
69
- # Save document to file
70
- with open(filename, "w", encoding="utf-8") as f:
71
- f.write(text)
72
- print(" Document saved")
 
 
 
73
 
74
  # Generate and store embedding
75
  embedding = embedding_model.encode([text]).astype(np.float32)
@@ -80,16 +64,10 @@ def store_document(text):
80
  doc_index = index.ntotal - 1
81
 
82
  # Update metadata with FAISS index
83
- metadata[str(doc_index)] = filename
84
- with open(METADATA_FILE, "w") as f:
85
- json.dump(metadata, f)
86
  print(" Saved Metadata")
87
 
88
- # Save FAISS index
89
- faiss.write_index(index, INDEX_FILE)
90
- print(" FAISS index saved")
91
-
92
- return f"Document stored at: {filename}"
93
 
94
  def retrieve_document(query):
95
  print(f"Retrieving document based on:\n{query}")
@@ -112,7 +90,6 @@ def retrieve_document(query):
112
  with open(filename, "r", encoding="utf-8") as f:
113
  return f.read()
114
 
115
-
116
  def clean_text(text):
117
  """Cleans extracted text for better processing by the model."""
118
  print("cleaning")
@@ -143,12 +120,7 @@ def chatbot(pdf_file, user_question):
143
  """Processes the PDF and answers the user's question."""
144
  print("chatbot start")
145
 
146
- if pdf_file:
147
- # Extract text from the PDF
148
- text = extract_text_from_pdf(pdf_file)
149
- if not text:
150
- return "Could not extract any text from the PDF."
151
-
152
  # retrieve the document relevant to the query
153
  doc = retrieve_document(user_question)
154
 
@@ -195,7 +167,13 @@ iface = gr.TabbedInterface(
195
  fn=helloWorld,
196
  inputs="text",
197
  outputs="text",
198
- )
 
 
 
 
 
 
199
  ]
200
  )
201
 
 
9
  import re
10
  import unicodedata
11
  from dotenv import load_dotenv
12
+ from flask import jsonify
13
 
14
  load_dotenv()
15
 
 
26
  trust_remote_code=True # Allow remote code execution
27
  )
28
 
 
 
 
 
 
 
 
 
29
  embedding_dim = 768 # Adjust according to model
30
 
31
+
32
+ def store_document_data(PDF_FILE, METADATA_FILE, INDEX_FILE):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  print(" Storing document...")
34
 
35
+ if PDF_FILE:
36
+ # Extract text from the PDF
37
+ text = extract_text_from_pdf(PDF_FILE)
38
+ if not text:
39
+ return "Could not extract any text from the PDF."
40
+
41
+ if METADATA_FILE:
42
+ # extract metadata
43
+ print(" Metadata file exists")
44
+ with open(METADATA_FILE, "r") as f:
45
+ metadata = json.load(f)
46
+ else:
47
+ print("metadata_file is empty")
48
+ metadata = {}
49
 
50
+ if INDEX_FILE:
51
+ # extract Faiss
52
+ print("index_file recieved")
53
+ index = faiss.read_index(INDEX_FILE)
54
+ else:
55
+ print(" No FAISS index found. Creating a new one.")
56
+ index = faiss.IndexFlatL2(embedding_dim) # Empty FAISS index
57
 
58
  # Generate and store embedding
59
  embedding = embedding_model.encode([text]).astype(np.float32)
 
64
  doc_index = index.ntotal - 1
65
 
66
  # Update metadata with FAISS index
67
+ metadata[str(doc_index)] = PDF_FILE
 
 
68
  print(" Saved Metadata")
69
 
70
+ return jsonify({"metadata" : metadata, "index" : index})
 
 
 
 
71
 
72
  def retrieve_document(query):
73
  print(f"Retrieving document based on:\n{query}")
 
90
  with open(filename, "r", encoding="utf-8") as f:
91
  return f.read()
92
 
 
93
  def clean_text(text):
94
  """Cleans extracted text for better processing by the model."""
95
  print("cleaning")
 
120
  """Processes the PDF and answers the user's question."""
121
  print("chatbot start")
122
 
123
+
 
 
 
 
 
124
  # retrieve the document relevant to the query
125
  doc = retrieve_document(user_question)
126
 
 
167
  fn=helloWorld,
168
  inputs="text",
169
  outputs="text",
170
+ ),
171
+ gr.Interface(
172
+ fn=store_document_data,
173
+ inputs=[gr.File(label="Upload PDF"), gr.file(label="Upload metadata"), gr.file(label="upload index")],
174
+ outputs=gr.Textbox(label="Answer"),
175
+ title="pdf file, metadata, index parsing and storing",
176
+ ),
177
  ]
178
  )
179