aquibmoin committed
Commit 5ea3859 · verified · 1 Parent(s): e287940

Update app.py

Files changed (1):
  app.py (+12 -13)
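In short, this commit stops sharing one module-level FAISS index across calls: load_and_process_uploaded_pdfs now builds a fresh IndexFlatIP each time it runs and returns it to the caller alongside the chunks and embeddings, and the chunk size passed to get_chunks drops from 500 to 300 to speed up processing.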
app.py CHANGED
@@ -88,15 +88,13 @@ def get_chunks(text, chunk_size=500):
 
     return chunks
 
-# Initialize FAISS index with cosine similarity
-
-embedding_dim = 768  # NASA Bi-Encoder outputs 768-dimensional embeddings
-index = faiss.IndexFlatIP(embedding_dim)  # FAISS inner product (cosine similarity)
-
 def load_and_process_uploaded_pdfs(pdf_files):
-
     """Extracts text from PDFs, splits into chunks, generates embeddings, and stores in FAISS."""
 
+    # Reset the FAISS index on every function call
+    embedding_dim = 768  # NASA Bi-Encoder embedding size
+    index = faiss.IndexFlatIP(embedding_dim)  # Fresh FAISS index
+
     pdf_chunks = []  # Store extracted chunks
     chunk_embeddings = []  # Store embeddings
 
@@ -106,21 +104,22 @@ def load_and_process_uploaded_pdfs(pdf_files):
     for page in reader.pages:
         pdf_text += page.extract_text() + "\n"
 
-    # Split extracted text into chunks
-    chunks = get_chunks(pdf_text, chunk_size=500)
-    pdf_chunks.extend(chunks)  # Store chunks for later retrieval
+    # Reduce chunk size for faster processing
+    chunks = get_chunks(pdf_text, chunk_size=300)
+    pdf_chunks.extend(chunks)  # Store for retrieval
 
     # Generate embeddings for each chunk
     for chunk in chunks:
         chunk_embedding = encode_text(chunk).reshape(1, -1)
 
-        # Normalize the embedding for cosine similarity
+        # Normalize for cosine similarity
         chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding)
 
-        index.add(chunk_embedding)  # Add to FAISS
+        index.add(chunk_embedding)  # Now adding to the fresh FAISS index
         chunk_embeddings.append(chunk_embedding)
 
-    return pdf_chunks, chunk_embeddings  # Return both for retrieval
+    return index, pdf_chunks, chunk_embeddings  # Return fresh FAISS index and chunk data
+
 
 
 def retrieve_relevant_context(user_input, context_text, science_objectives="", index=None, pdf_chunks=None, k=3):
@@ -413,7 +412,7 @@ def gpt_response_to_dataframe(gpt_response):
 def chatbot(user_input, science_objectives="", context="", subdomain="", uploaded_pdfs=None, max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
     # Load and process uploaded PDFs (if provided)
     if uploaded_pdfs:
-        pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
+        index, pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
     else:
         pdf_chunks, chunk_embeddings = [], []  # Ensure empty list if no PDFs provided
 
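
The shape of the change is worth spelling out. IndexFlatIP scores by raw inner product, and since cos(u, v) = u·v / (‖u‖‖v‖), unit-normalizing every vector makes that inner product equal to cosine similarity; rebuilding the index inside the function means each upload searches only its own chunks instead of everything added since the process started. Below is a minimal, self-contained sketch of the same pattern, not the app's code: fake_encode is a stand-in for the app's encode_text / NASA Bi-Encoder, and build_index / search_chunks are hypothetical helpers mirroring load_and_process_uploaded_pdfs and the query side. One caveat visible in the diff itself: the chatbot else branch assigns only pdf_chunks and chunk_embeddings, so index stays unbound on the no-PDF path; passing index=None (the default retrieve_relevant_context already accepts) would be the natural guard, and the sketch's search helper includes that check.

import numpy as np
import faiss  # faiss-cpu

EMBEDDING_DIM = 768  # NASA Bi-Encoder output size, per the diff

def fake_encode(text):
    """Stand-in for the app's encode_text(): per-text pseudo-random 768-d vector."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.standard_normal(EMBEDDING_DIM).astype("float32")

def build_index(chunks):
    """Fresh IndexFlatIP per call, mirroring load_and_process_uploaded_pdfs."""
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
    for chunk in chunks:
        v = fake_encode(chunk).reshape(1, -1)
        v /= np.linalg.norm(v)   # unit length, so inner product == cosine similarity
        index.add(v)
    return index

def search_chunks(index, chunks, query, k=3):
    """Cosine top-k over the stored chunks; returns [] on the no-PDF path."""
    if index is None or index.ntotal == 0:
        return []
    q = fake_encode(query).reshape(1, -1)
    q /= np.linalg.norm(q)       # normalize the query the same way as the chunks
    scores, ids = index.search(q, min(k, index.ntotal))
    return [(chunks[i], float(s)) for i, s in zip(ids[0], scores[0])]

chunks = ["stellar UV spectra", "exoplanet transit photometry", "detector noise budget"]
index = build_index(chunks)
for text, score in search_chunks(index, chunks, "transit light curves", k=2):
    print(f"{score:+.3f}  {text}")

Because every stored vector and the query are unit length, the returned scores fall in [-1, 1] and rank chunks by cosine similarity directly.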