NaimaAqeel committed on
Commit
f01a813
·
verified ·
1 Parent(s): ca47d69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -17
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import pickle
3
  import numpy as np
4
  import gradio as gr
@@ -25,9 +25,10 @@ def get_embeddings(texts, is_query=False):
25
  with torch.no_grad():
26
  model_output = embedding_model(**inputs)
27
 
28
- embeddings = model_output.last_hidden_state[:, 0] # CLS token embeddings
29
  return embeddings.cpu().numpy()
30
 
 
31
  # ===============================
32
  # TEXT CHUNKING
33
  # ===============================
@@ -46,7 +47,7 @@ def chunk_text(text, chunk_size=800, overlap=100):
46
  index_path = "faiss_index.pkl"
47
  document_texts_path = "document_texts.pkl"
48
  document_texts = []
49
- embedding_dim = 768 # E5-small-v2 embedding dimension
50
 
51
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
52
  try:
@@ -87,21 +88,15 @@ def extract_text_from_docx(path):
87
  # ===============================
88
  def upload_document(file):
89
  ext = os.path.splitext(file.name)[-1].lower()
90
-
91
- # Save uploaded file temporarily
92
- temp_path = f"temp_upload{ext}"
93
- with open(temp_path, "wb") as f:
94
- f.write(file.read())
95
-
96
  if ext == ".pdf":
97
- text = extract_text_from_pdf(temp_path)
98
  elif ext == ".docx":
99
- text = extract_text_from_docx(temp_path)
100
  else:
101
  return "Unsupported file type."
102
 
103
  chunks = chunk_text(text)
104
- chunk_embeddings = get_embeddings(chunks, is_query=False)
105
  index.add(np.array(chunk_embeddings).astype('float32'))
106
  document_texts.extend(chunks)
107
 
@@ -110,12 +105,8 @@ def upload_document(file):
110
  with open(document_texts_path, "wb") as f:
111
  pickle.dump(document_texts, f)
112
 
113
- # Remove the temporary file after processing (optional)
114
- os.remove(temp_path)
115
-
116
  return "Document uploaded and indexed successfully."
117
 
118
-
119
  # ===============================
120
  # GENERATION PIPELINE (FLAN-T5)
121
  # ===============================
@@ -125,7 +116,7 @@ def generate_answer_from_file(query, top_k=10):
125
  if not document_texts:
126
  return "No documents indexed yet."
127
 
128
- query_vector = get_embeddings(query, is_query=True).astype("float32")
129
  scores, indices = index.search(query_vector, k=top_k)
130
  retrieved_chunks = [document_texts[i] for i in indices[0]]
131
  context = "\n\n".join(retrieved_chunks)
@@ -170,3 +161,5 @@ search_interface = gr.Interface(
170
 
171
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
172
  app.launch()
 
 
 
1
+ Now explain how this code works — I want to understand it deeply. import os
2
  import pickle
3
  import numpy as np
4
  import gradio as gr
 
25
  with torch.no_grad():
26
  model_output = embedding_model(**inputs)
27
 
28
+ embeddings = model_output.last_hidden_state[:, 0] # CLS token
29
  return embeddings.cpu().numpy()
30
 
31
+
32
  # ===============================
33
  # TEXT CHUNKING
34
  # ===============================
 
47
  index_path = "faiss_index.pkl"
48
  document_texts_path = "document_texts.pkl"
49
  document_texts = []
50
+ embedding_dim = 384
51
 
52
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
53
  try:
 
88
  # ===============================
89
  def upload_document(file):
90
  ext = os.path.splitext(file.name)[-1].lower()
 
 
 
 
 
 
91
  if ext == ".pdf":
92
+ text = extract_text_from_pdf(file.name)
93
  elif ext == ".docx":
94
+ text = extract_text_from_docx(file.name)
95
  else:
96
  return "Unsupported file type."
97
 
98
  chunks = chunk_text(text)
99
+ chunk_embeddings = get_embeddings(chunks)
100
  index.add(np.array(chunk_embeddings).astype('float32'))
101
  document_texts.extend(chunks)
102
 
 
105
  with open(document_texts_path, "wb") as f:
106
  pickle.dump(document_texts, f)
107
 
 
 
 
108
  return "Document uploaded and indexed successfully."
109
 
 
110
  # ===============================
111
  # GENERATION PIPELINE (FLAN-T5)
112
  # ===============================
 
116
  if not document_texts:
117
  return "No documents indexed yet."
118
 
119
+ query_vector = get_embeddings(query).astype("float32")
120
  scores, indices = index.search(query_vector, k=top_k)
121
  retrieved_chunks = [document_texts[i] for i in indices[0]]
122
  context = "\n\n".join(retrieved_chunks)
 
161
 
162
  app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
163
  app.launch()
164
+
165
+