Update app.py
app.py
CHANGED
@@ -88,15 +88,13 @@ def get_chunks(text, chunk_size=500):

     return chunks

-# Initialize FAISS index with cosine similarity
-
-embedding_dim = 768  # NASA Bi-Encoder outputs 768-dimensional embeddings
-index = faiss.IndexFlatIP(embedding_dim)  # FAISS inner product (cosine similarity)
-
 def load_and_process_uploaded_pdfs(pdf_files):
-
     """Extracts text from PDFs, splits into chunks, generates embeddings, and stores in FAISS."""

+    # **RESET FAISS INDEX on every function call**
+    embedding_dim = 768  # NASA Bi-Encoder embedding size
+    index = faiss.IndexFlatIP(embedding_dim)  # Fresh FAISS index
+
     pdf_chunks = []  # Store extracted chunks
     chunk_embeddings = []  # Store embeddings

@@ -106,21 +104,22 @@ def load_and_process_uploaded_pdfs(pdf_files):
         for page in reader.pages:
             pdf_text += page.extract_text() + "\n"

-        #
-        chunks = get_chunks(pdf_text, chunk_size=
-        pdf_chunks.extend(chunks)  # Store
+        # **Reduce Chunk Size for Faster Processing**
+        chunks = get_chunks(pdf_text, chunk_size=300)
+        pdf_chunks.extend(chunks)  # Store for retrieval

         # Generate embeddings for each chunk
         for chunk in chunks:
             chunk_embedding = encode_text(chunk).reshape(1, -1)

-            # Normalize
+            # Normalize for cosine similarity
             chunk_embedding = chunk_embedding / np.linalg.norm(chunk_embedding)

-            index.add(chunk_embedding)  #
+            index.add(chunk_embedding)  # **Now adding to fresh FAISS index**
             chunk_embeddings.append(chunk_embedding)

-    return pdf_chunks, chunk_embeddings  # Return
+    return index, pdf_chunks, chunk_embeddings  # Return fresh FAISS index and chunk data
+

 def retrieve_relevant_context(user_input, context_text, science_objectives="", index=None, pdf_chunks=None, k=3):
@@ -413,7 +412,7 @@ def gpt_response_to_dataframe(gpt_response):
 def chatbot(user_input, science_objectives="", context="", subdomain="", uploaded_pdfs=None, max_tokens=150, temperature=0.7, top_p=0.9, frequency_penalty=0.5, presence_penalty=0.0):
     # Load and process uploaded PDFs (if provided)
     if uploaded_pdfs:
-        pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
+        index, pdf_chunks, chunk_embeddings = load_and_process_uploaded_pdfs(uploaded_pdfs)
     else:
         pdf_chunks, chunk_embeddings = [], []  # Ensure empty list if no PDFs provided

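The change above removes the module-level FAISS index and rebuilds it inside load_and_process_uploaded_pdfs, so every upload starts from an empty IndexFlatIP instead of accumulating vectors from earlier calls, and the fresh index is returned to the caller along with the chunks. Below is a minimal, self-contained sketch of that pattern; encode_text here is a hypothetical stand-in for the Space's NASA Bi-Encoder (a seeded random generator so the snippet runs on its own), and only faiss and numpy are assumed.

import faiss
import numpy as np

EMBEDDING_DIM = 768  # matches the Bi-Encoder output size in app.py

def encode_text(text):
    """Hypothetical stand-in for the real NASA Bi-Encoder."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.standard_normal(EMBEDDING_DIM).astype(np.float32)

def build_fresh_index(chunks):
    # A fresh IndexFlatIP per call: no stale vectors from previous uploads.
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
    for chunk in chunks:
        vec = encode_text(chunk).reshape(1, -1)
        vec /= np.linalg.norm(vec)  # unit norm, so inner product == cosine
        index.add(vec.astype(np.float32))
    return index

chunks = ["solar wind measurements", "exoplanet transit photometry"]
index = build_fresh_index(chunks)

query = encode_text("solar wind measurements").reshape(1, -1)
query /= np.linalg.norm(query)  # queries must be normalized too
scores, ids = index.search(query.astype(np.float32), 2)
print(ids[0], scores[0])  # nearest chunk ids and their cosine scores

Two things worth noting about the pattern: the query passed to index.search has to be normalized the same way as the stored vectors, otherwise IndexFlatIP scores revert to raw dot products (faiss.normalize_L2 performs the same normalization in place); and in the chatbot change, the else branch leaves index unbound, so the caller presumably only forwards it to retrieve_relevant_context when PDFs were actually uploaded.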