Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed 24 days ago

Commit

d87413b

verified ·

1 Parent(s): a028e27

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -33

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import numpy as np
 import gradio as gr
 import fitz  # PyMuPDF
 from docx import Document
-from transformers import AutoModel, AutoTokenizer
 import faiss
 import torch
@@ -24,13 +24,25 @@ def get_embeddings(texts):
     return outputs.last_hidden_state[:, 0].cpu().numpy()
 # =============================================
-# DOCUMENT STORAGE SETUP
 # =============================================
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
-embedding_dim = 384  # Dimension for all-MiniLM-L6-v2
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
@@ -44,29 +56,29 @@ else:
     index = faiss.IndexFlatIP(embedding_dim)
 # =============================================
-# DOCUMENT PROCESSING FUNCTIONS
 # =============================================
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Extraction is best-effort: if opening or reading the file fails, the
    error is printed and whatever text was gathered before the failure is
    returned (an empty string if nothing was read).

    Args:
        pdf_path: Filesystem path handed to ``fitz.open``.

    Returns:
        The page texts joined in page order, with no separator.
    """
    page_texts = []
    try:
        # The context manager closes the document handle even when a page
        # fails mid-way; previously the handle was never closed.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Deliberately broad: a bad upload must not crash the app.
        print(f"PDF error: {e}")
    return "".join(page_texts)
def extract_text_from_docx(docx_path):
    """Return the paragraphs of the Word file at *docx_path*, newline-joined.

    Any failure while opening or reading the file is printed and results in
    an empty string being returned.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"DOCX error: {err}")
        return ""
 # =============================================
-# DOCUMENT UPLOAD HANDLER
 # =============================================
 def upload_document(file):
     ext = os.path.splitext(file.name)[-1].lower()
@@ -75,53 +87,56 @@ def upload_document(file):
     elif ext == ".docx":
         text = extract_text_from_docx(file.name)
     else:
-        return "Unsupported file type"
-    embedding = get_embeddings(text)
-    index.add(embedding)
-    document_texts.append(text)
-    # Save updated index and texts
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
     with open(document_texts_path, "wb") as f:
         pickle.dump(document_texts, f)
-    return "Document uploaded and indexed successfully!"
 # =============================================
-# SEMANTIC SEARCH HANDLER
 # =============================================
def search_documents(query):
    """Return a Markdown preview of the indexed document closest to *query*.

    Args:
        query: Free-text search string; it is embedded with
            ``get_embeddings`` and looked up in the module-level ``index``.

    Returns:
        A Markdown string with up to the first 1000 characters of the best
        match, or a short status message when nothing can be returned.
    """
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query)
    _scores, indices = index.search(query_vector, k=1)
    best_match_idx = indices[0][0]

    # FAISS reports -1 when it has no neighbour to return (e.g. the index and
    # the text list are out of sync); indexing with -1 would silently pick
    # the last document instead of signalling "no match".
    if not 0 <= best_match_idx < len(document_texts):
        return "No matching document found."

    best_text = document_texts[best_match_idx]
    # Only show the ellipsis when the preview was actually cut short.
    ellipsis = "..." if len(best_text) > 1000 else ""
    return f"**Best Match:**\n\n{best_text[:1000]}{ellipsis}"
 # =============================================
-# GRADIO INTERFACE
 # =============================================
 upload_interface = gr.Interface(
     fn=upload_document,
     inputs=gr.File(file_types=[".pdf", ".docx"]),
     outputs="text",
-    title="Upload PDF/DOCX",
-    description="Upload a PDF or Word document to be indexed for semantic search."
 )
 search_interface = gr.Interface(
-    fn=search_documents,
-    inputs=gr.Textbox(placeholder="Enter your question or search query here..."),
-    outputs="markdown",
-    title="Semantic Search",
-    description="Search for content in uploaded documents using natural language."
 )
-app = gr.TabbedInterface([upload_interface, search_interface], ["Upload Document", "Search Document"])
-if __name__ == "__main__":
-    app.launch()

 import gradio as gr
 import fitz  # PyMuPDF
 from docx import Document
+from transformers import AutoModel, AutoTokenizer, pipeline
 import faiss
 import torch
     return outputs.last_hidden_state[:, 0].cpu().numpy()
 # =============================================
+# TEXT CHUNKING
+# =============================================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size`` so the window always advances.

    Returns:
        A list of substrings in document order. Empty input yields ``[]``.
        The last chunk is the one that reaches the end of *text*; no extra
        tail chunk consisting only of already-covered overlap is emitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive or
            ``overlap >= chunk_size`` (either would previously loop forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window
        # would be a strict suffix of this one, so stop here.
        if end >= len(text):
            break
    return chunks
+# =============================================
+# FAISS INDEX SETUP
 # =============================================
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
+embedding_dim = 384  # for all-MiniLM-L6-v2
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
     index = faiss.IndexFlatIP(embedding_dim)
 # =============================================
+# DOCUMENT PROCESSING
 # =============================================
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at *path*.

    Extraction is best-effort: if opening or reading the file fails, the
    error is printed and whatever text was gathered before the failure is
    returned (an empty string if nothing was read).

    Args:
        path: Filesystem path handed to ``fitz.open``.

    Returns:
        The page texts joined in page order, with no separator.
    """
    page_texts = []
    try:
        # The context manager closes the document handle even when a page
        # fails mid-way; previously the handle was never closed.
        with fitz.open(path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        # Deliberately broad: a bad upload must not crash the app.
        print(f"PDF error: {e}")
    return "".join(page_texts)
def extract_text_from_docx(path):
    """Return the paragraphs of the Word file at *path*, newline-joined.

    Any failure while opening or reading the file is printed and results in
    an empty string being returned.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as err:
        print(f"DOCX error: {err}")
        return ""
 # =============================================
+# UPLOAD AND INDEX FILE
 # =============================================
 def upload_document(file):
     ext = os.path.splitext(file.name)[-1].lower()
     elif ext == ".docx":
         text = extract_text_from_docx(file.name)
     else:
+        return "Unsupported file type."
+    chunks = chunk_text(text)
+    chunk_embeddings = get_embeddings(chunks)
+    index.add(np.array(chunk_embeddings).astype('float32'))
+    document_texts.extend(chunks)
     with open(index_path, "wb") as f:
         pickle.dump(index, f)
     with open(document_texts_path, "wb") as f:
         pickle.dump(document_texts, f)
+    return "Document uploaded and indexed successfully."
 # =============================================
+# QA PIPELINE WITH FLAN-T5
 # =============================================
+qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
def generate_answer_from_file(query, top_k=3):
    """Answer *query* using the most similar indexed chunks as context.

    The query is embedded with ``get_embeddings``, the ``top_k`` nearest
    chunks are fetched from the module-level FAISS ``index``, and the joined
    chunks plus the question are passed to ``qa_pipeline`` as a prompt.

    Args:
        query: The user's question.
        top_k: Number of neighbours requested from the index.

    Returns:
        The generated answer text, or a short status message when no
        documents are indexed or no usable chunk was retrieved.
    """
    if not document_texts:
        return "No documents indexed yet."

    query_vector = get_embeddings(query).astype("float32")
    _scores, indices = index.search(query_vector, k=top_k)

    # FAISS pads the label row with -1 when the index holds fewer than top_k
    # vectors. Indexing document_texts with -1 would silently pull in the
    # last chunk, so keep only labels that point at a real entry.
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]
    if not retrieved_chunks:
        return "No relevant content found in the indexed documents."

    context = " ".join(retrieved_chunks)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    return qa_pipeline(prompt, max_length=200)[0]["generated_text"]
 # =============================================
+# GRADIO UI
 # =============================================
 upload_interface = gr.Interface(
     fn=upload_document,
     inputs=gr.File(file_types=[".pdf", ".docx"]),
     outputs="text",
+    title="Upload Document",
+    description="Upload a Word or PDF file to index it for question answering."
 )
 search_interface = gr.Interface(
+    fn=generate_answer_from_file,
+    inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
+    outputs="text",
+    title="Ask Your Document",
+    description="Ask any question. The chatbot will read the document and answer like ChatGPT."
 )
# Combine the upload and question-answering interfaces into one tabbed app.
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])

# Start the server only when this file is executed as a script, so importing
# the module (e.g. for tests or reuse) does not block on a running web server.
if __name__ == "__main__":
    app.launch()