import os
import re
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
import fitz  # PyMuPDF for PDFs
import docx  # python-docx for Word files
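# Assumed environment: the imports above correspond to the pip packages
# numpy, faiss-cpu, sentence-transformers, transformers, gradio,
# PyMuPDF (imported as fitz) and python-docx (imported as docx).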

# Initialize global variables
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

# Load or initialize FAISS index and document chunks
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    with open(index_path, "rb") as f:
        # The index is stored as a serialized byte buffer, since raw FAISS
        # (SWIG) index objects cannot be pickled directly
        index = faiss.deserialize_index(pickle.load(f))
    with open(document_texts_path, "rb") as f:
        document_texts = pickle.load(f)
else:
    # Use 384 dim for all-MiniLM-L6-v2 model
    dim = 384
    index = faiss.IndexFlatL2(dim)
    document_texts = []

# Load SentenceTransformer for embeddings
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Initialize QA pipeline with a text generation model
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
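# Note: flan-t5-small is a small (~80M parameter) encoder-decoder model whose
# tokenizer typically truncates inputs around 512 tokens, so very long
# retrieved contexts may be cut off; a larger variant such as flan-t5-base
# could be swapped in here if answer quality is a concern.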

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)

def chunk_text(text, max_len=500):
    """Split text into chunks of max_len characters, trying to split at sentence boundaries."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sent in sentences:
        if len(current_chunk) + len(sent) + 1 <= max_len:
            current_chunk += sent + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sent + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
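# Illustrative example (not executed): with max_len=40, the text
# "First sentence. Second sentence. A third one." is packed greedily into
# ["First sentence. Second sentence.", "A third one."]; note that a single
# sentence longer than max_len still ends up as one oversized chunk.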

def get_embeddings(texts, is_query=False):
    if isinstance(texts, str):
        texts = [texts]
    embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    return embeddings
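# Because the embeddings are L2-normalized above, squared L2 distance in the
# IndexFlatL2 index is monotonically related to cosine similarity
# (||a - b||^2 = 2 - 2*cos(a, b) for unit vectors), so nearest-neighbour
# search effectively ranks chunks by cosine similarity.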

def upload_document(file):
    global index, document_texts
    
    # gr.File may pass a file path string or a tempfile-like object with a
    # .name attribute, depending on the Gradio version; handle both cases
    file_path = file if isinstance(file, str) else file.name
    ext = os.path.splitext(file_path)[-1].lower()
    try:
        if ext == ".pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == ".docx":
            text = extract_text_from_docx(file_path)
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
    except Exception as e:
        return f"Failed to extract text: {str(e)}"

    if not text.strip():
        return "Failed to extract any text from the document."

    chunks = chunk_text(text)
    embeddings = get_embeddings(chunks)
    
    # Convert FAISS index to IDMap to allow adding new vectors incrementally
    if not isinstance(index, faiss.IndexIDMap):
        id_map = faiss.IndexIDMap(index)
        index = id_map
    
    start_id = len(document_texts)
    # FAISS expects 64-bit integer ids
    ids = np.arange(start_id, start_id + len(chunks), dtype="int64")
    
    index.add_with_ids(embeddings.astype('float32'), ids)
    document_texts.extend(chunks)

    # Persist the index and chunk texts (serialize the FAISS index first,
    # since raw FAISS/SWIG index objects cannot be pickled directly)
    with open(index_path, "wb") as f:
        pickle.dump(faiss.serialize_index(index), f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return f"Document uploaded and indexed successfully with {len(chunks)} chunks."

def generate_answer_from_file(query, top_k=5):
    global index, document_texts
    
    if len(document_texts) == 0:
        return "No document uploaded yet. Please upload a PDF or DOCX file first."

    query_vec = get_embeddings(query, is_query=True).astype("float32")
    scores, indices = index.search(query_vec, top_k)
    # FAISS returns -1 for missing neighbours when fewer than top_k vectors
    # are indexed, so guard the lookup against invalid ids
    retrieved_chunks = [document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)]

    context = "\n\n".join(retrieved_chunks)

    prompt = (
        "You are a helpful assistant reading a document.\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )
    
    # Generate the answer deterministically, capped at 256 output tokens
    result = qa_pipeline(prompt, max_length=256, do_sample=False)[0]['generated_text']

    return result.strip()

with gr.Blocks() as demo:
    gr.Markdown("## Document Question Answering App\nUpload a PDF or DOCX file, then ask questions based on it.")
    
    with gr.Row():
        file_input = gr.File(label="Upload PDF or DOCX file", file_types=['.pdf', '.docx'])
        upload_btn = gr.Button("Upload & Index Document")
    
    upload_output = gr.Textbox(label="Upload Status", interactive=False)

    question = gr.Textbox(label="Enter your question here")
    answer = gr.Textbox(label="Answer", interactive=False)
    ask_btn = gr.Button("Ask")

    upload_btn.click(upload_document, inputs=file_input, outputs=upload_output)
    ask_btn.click(generate_answer_from_file, inputs=question, outputs=answer)

demo.launch()
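# Running this script (e.g. `python app.py`) serves the app locally on
# Gradio's default port (http://127.0.0.1:7860); demo.launch(share=True)
# would additionally create a temporary public link if one is needed.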