Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed 18 days ago

Commit

0a2bb75

verified ·

1 Parent(s): 072e16f

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -152

app.py CHANGED Viewed

@@ -1,164 +1,65 @@
-import os
-import pickle
-import numpy as np
 import gradio as gr
-import fitz  # PyMuPDF
-from docx import Document
-from transformers import AutoModel, AutoTokenizer, pipeline
-import faiss
-import torch
-# ===============================
-# EMBEDDING MODEL (E5)
-# ===============================
-model_name = "intfloat/e5-small-v2"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-embedding_model = AutoModel.from_pretrained(model_name)
-def get_embeddings(texts, is_query=False):
-    if isinstance(texts, str):
-        texts = [texts]
-    prefix = "query: " if is_query else "passage: "
-    texts = [prefix + t for t in texts]
-    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
-    with torch.no_grad():
-        model_output = embedding_model(**inputs)
-    embeddings = model_output.last_hidden_state[:, 0]  # CLS token
-    return embeddings.cpu().numpy()
-# ===============================
-# TEXT CHUNKING
-# ===============================
-def chunk_text(text, chunk_size=800, overlap=100):
-    chunks = []
-    start = 0
-    while start < len(text):
-        end = min(len(text), start + chunk_size)
-        chunks.append(text[start:end])
-        start += chunk_size - overlap
-    return chunks
-# ===============================
-# FAISS INDEX SETUP
-# ===============================
-index_path = "faiss_index.pkl"
-document_texts_path = "document_texts.pkl"
-document_texts = []
-embedding_dim = 384
-if os.path.exists(index_path) and os.path.exists(document_texts_path):
-    try:
-        with open(index_path, "rb") as f:
-            index = pickle.load(f)
-        with open(document_texts_path, "rb") as f:
-            document_texts = pickle.load(f)
-    except Exception as e:
-        print(f"Error loading index: {e}")
-        index = faiss.IndexFlatIP(embedding_dim)
-else:
-    index = faiss.IndexFlatIP(embedding_dim)
-# ===============================
-# FILE EXTRACTORS
-# ===============================
-def extract_text_from_pdf(path):
-    text = ""
-    try:
-        doc = fitz.open(path)
-        for page in doc:
-            text += page.get_text()
-    except Exception as e:
-        print(f"PDF error: {e}")
     return text
-def extract_text_from_docx(path):
-    text = ""
     try:
-        doc = Document(path)
-        text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
-        print(f"DOCX error: {e}")
-    return text
-# ===============================
-# UPLOAD HANDLER
-# ===============================
-def upload_document(file):
-    ext = os.path.splitext(file.name)[-1].lower()
-    if ext == ".pdf":
-        text = extract_text_from_pdf(file.name)
-    elif ext == ".docx":
-        text = extract_text_from_docx(file.name)
-    else:
-        return "Unsupported file type."
-    chunks = chunk_text(text)
-    chunk_embeddings = get_embeddings(chunks)
-    index.add(np.array(chunk_embeddings).astype('float32'))
-    document_texts.extend(chunks)
-    with open(index_path, "wb") as f:
-        pickle.dump(index, f)
-    with open(document_texts_path, "wb") as f:
-        pickle.dump(document_texts, f)
-    return "Document uploaded and indexed successfully."
-# ===============================
-# QA GENERATION PIPELINE
-# ===============================
-# Initialize text generation pipeline (you can use a more powerful model if needed)
-qa_pipeline = pipeline("text-generation", model="gpt2")
-def generate_answer_from_file(query, top_k=10):
-    if not document_texts:
-        return "No documents indexed yet."
-    query_vector = get_embeddings(query, is_query=True).astype("float32")
-    scores, indices = index.search(query_vector, k=top_k)
-    retrieved_chunks = [document_texts[i] for i in indices[0]]
-    context = "\n\n".join(retrieved_chunks)
-    # Prompt for the model
-    prompt = (
-        "You are a helpful assistant reading student notes or textbook passages.\n\n"
-        "Based on the context provided, answer the question accurately and clearly.\n\n"
-        "### Example\n"
-        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
-        "Question: What is an Artificial System?\n"
-        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
-        "### Now answer this\n"
-        f"Context:\n{context}\n\n"
-        f"Question: {query}\n"
-        f"Answer:"
-    )
-    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
-    return result.strip()
-# ===============================
-# GRADIO INTERFACES
-# ===============================
-upload_interface = gr.Interface(
-    fn=upload_document,
-    inputs=gr.File(file_types=[".pdf", ".docx"]),
-    outputs="text",
-    title="Upload Document",
-    description="Upload your Word or PDF document for question answering."
-)
-search_interface = gr.Interface(
-    fn=generate_answer_from_file,
-    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
-    outputs="text",
-    title="Ask the Document",
-    description="Ask questions about the uploaded content. The chatbot will answer based on the document."
-)
-app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
-app.launch()

 import gradio as gr
+from PyPDF2 import PdfReader
+from transformers import pipeline
+# Question-answering pipeline, created once when the module is imported and
+# reused by answer_question for every query.
+qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+# Function to extract text from PDF
+def extract_text_from_pdf(file):
+    """Return the concatenated text of every page in a PDF.
+
+    Args:
+        file: A path, a binary file-like object, or the raw PDF bytes
+            (the form in which a ``gr.File(type="binary")`` input
+            delivers an upload).
+
+    Returns:
+        The text of all pages joined in page order. Pages for which
+        ``extract_text()`` yields nothing contribute an empty string.
+    """
+    import io
+
+    # PdfReader expects a path or a seekable stream; raw bytes have no
+    # seek()/read(), so wrap them in an in-memory stream first.
+    if isinstance(file, (bytes, bytearray)):
+        file = io.BytesIO(file)
+    reader = PdfReader(file)
+    text = ''.join(page.extract_text() or '' for page in reader.pages)
     return text
+# Module-level holder for the current document text: written by set_context,
+# read by answer_question. An empty string means no document is loaded.
+# NOTE(review): being module-level, this looks shared across all sessions of
+# the app — confirm that is intended.
+document_context = {"text": ""}
+# Function to set context from PDF or text
+def set_context(pdf_file, text_input):
+    """Store the document that later questions are answered from.
+
+    An uploaded PDF takes precedence over pasted text. When the PDF cannot
+    be read or contains no extractable text, the previously stored context
+    is left untouched and a failure message is returned.
+
+    Args:
+        pdf_file: The uploaded PDF as delivered by the file input, or a
+            falsy value when nothing was uploaded.
+        text_input: Text pasted by the user; may be empty or None.
+
+    Returns:
+        A status message to display in the UI.
+    """
+    if pdf_file:
+        try:
+            extracted = extract_text_from_pdf(pdf_file)
+        except Exception as e:
+            # UI boundary: report the failure instead of crashing the handler.
+            return f"Could not read the PDF: {e}"
+        if not extracted.strip():
+            # e.g. an image-only PDF: do not claim success with nothing to query.
+            return "No extractable text was found in the PDF."
+        document_context["text"] = extracted
+        return "PDF uploaded and processed successfully!"
+    pasted = (text_input or "").strip()
+    if pasted:
+        document_context["text"] = pasted
+        return "Text received and stored successfully!"
+    return "Please upload a PDF or provide some text."
+# Function to answer questions based on stored context
+def answer_question(question):
+    """Answer a question from the stored document text.
+
+    Args:
+        question: The question text entered by the user.
+
+    Returns:
+        The ``"answer"`` field of the QA pipeline result; a prompt message
+        when no document is stored or the question is blank; or an error
+        message if the pipeline raises.
+    """
+    context = document_context["text"]
+    # Guard order matters: a missing document is reported before a blank question.
+    if not context:
+        return "Please upload a document or enter some text first."
+    if not question.strip():
+        return "Please enter a question."
     try:
+        result = qa_pipeline(question=question, context=context)
+        return result["answer"]
     except Exception as e:
+        # Failures are returned as text so they appear in the answer box.
+        return f"Error during QA: {str(e)}"
+# Gradio Interface
+# Build the UI: document inputs, a submit button with a status box, then the
+# question box and its answer box. Statement order defines the layout.
+with gr.Blocks() as demo:
+    gr.Markdown("# 📄 Ask Questions from a Document")
+    gr.Markdown("Upload a PDF or paste some text, then ask questions about it!")
+    # The two document sources sit side by side in one row.
+    with gr.Row():
+        # NOTE(review): type="binary" presumably hands set_context the raw file
+        # contents rather than a path — confirm against the installed Gradio version.
+        pdf_input = gr.File(label="Upload PDF (optional)", type="binary")
+        text_input = gr.Textbox(label="Or paste text here", lines=8, placeholder="Paste your document text...")
+    upload_btn = gr.Button("Submit Document")
+    upload_output = gr.Textbox(label="Status", interactive=False)
+    question_input = gr.Textbox(label="Ask a Question", placeholder="Type your question here...")
+    answer_output = gr.Textbox(label="Answer", interactive=False)
+    # Clicking the button stores the document and shows the status message.
+    upload_btn.click(set_context, inputs=[pdf_input, text_input], outputs=upload_output)
+    # NOTE(review): .change appears to fire on every edit of the question box,
+    # so the QA model may run on partial questions — consider .submit; verify
+    # against the Gradio event docs.
+    question_input.change(answer_question, inputs=question_input, outputs=answer_output)
+# Start the app as soon as the module is executed.
+demo.launch()