Now, can you explain how this code works? I want to understand it deeply.

import os
import pickle
import numpy as np
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch

# ===============================
# EMBEDDING MODEL (E5)
# ===============================
model_name = "intfloat/e5-small-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
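# e5-small-v2 produces 384-dimensional embeddings; this must match the
# embedding_dim used to build the FAISS index below.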

def get_embeddings(texts, is_query=False):
    if isinstance(texts, str):
        texts = [texts]
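    # E5 was trained with instruction prefixes: "query: " for search
    # queries and "passage: " for indexed text. Dropping the prefix
    # noticeably hurts retrieval quality.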
    prefix = "query: " if is_query else "passage: "
    texts = [prefix + t for t in texts]

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        model_output = embedding_model(**inputs)

    # Mask-aware mean pooling over the token states (the pooling
    # e5-small-v2 was trained with), then L2-normalize so the
    # inner-product search in IndexFlatIP behaves like cosine similarity.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (model_output.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()


# ===============================
# TEXT CHUNKING
# ===============================
def chunk_text(text, chunk_size=800, overlap=100):
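    # Slide a fixed-size character window over the text; the overlap
    # means a sentence cut at one chunk boundary reappears intact in the
    # next chunk. With chunk_size=800 and overlap=100, windows start at
    # characters 0, 700, 1400, ...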
    chunks = []
    start = 0
    while start < len(text):
        end = min(len(text), start + chunk_size)
        chunks.append(text[start:end])
        if end == len(text):
            break  # final chunk reached the end; avoid emitting a duplicate tail
        start += chunk_size - overlap
    return chunks

# ===============================
# FAISS INDEX SETUP
# ===============================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384
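
# IndexFlatIP is an exact (brute-force) inner-product index; with the
# L2-normalized embeddings above, inner product equals cosine similarity.
# The index and the chunk texts are pickled so the store survives restarts,
# and they must stay aligned: vector i in the index belongs to document_texts[i].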

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []  # keep the texts aligned with the rebuilt index
else:
    index = faiss.IndexFlatIP(embedding_dim)

# ===============================
# FILE EXTRACTORS
# ===============================
def extract_text_from_pdf(path):
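    # PyMuPDF (fitz) iterates the PDF page by page and concatenates each
    # page's plain text.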
    text = ""
    try:
        doc = fitz.open(path)
        for page in doc:
            text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(path):
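    # python-docx exposes paragraph text only; text inside tables,
    # headers, and footers is not captured here.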
    text = ""
    try:
        doc = Document(path)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"DOCX error: {e}")
    return text

# ===============================
# UPLOAD HANDLER
# ===============================
def upload_document(file):
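    # Ingestion pipeline: extract raw text by file extension, split it
    # into overlapping chunks, embed every chunk, add the vectors to the
    # FAISS index, and persist both the index and the texts to disk.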
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(file.name)
    elif ext == ".docx":
        text = extract_text_from_docx(file.name)
    else:
        return "Unsupported file type."

    chunks = chunk_text(text)
    if not chunks:
        return "No text could be extracted from the file."

    chunk_embeddings = get_embeddings(chunks)
    index.add(chunk_embeddings.astype("float32"))
    document_texts.extend(chunks)

    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)

    return "Document uploaded and indexed successfully."

# ===============================
# GENERATION PIPELINE (FLAN-T5)
# ===============================
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
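# flan-t5-base is an instruction-tuned encoder-decoder model; it serves as
# the generation half of this retrieval-augmented (RAG) setup and is
# prompted to answer only from the retrieved chunks.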

def generate_answer_from_file(query, top_k=10):
    if not document_texts:
        return "No documents indexed yet."

    # Embed the question with is_query=True so it gets the "query: "
    # prefix E5 expects for search queries.
    query_vector = get_embeddings(query, is_query=True).astype("float32")
    scores, indices = index.search(query_vector, k=top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are
    # indexed; skip those so we don't wrap around to document_texts[-1].
    retrieved_chunks = [document_texts[i] for i in indices[0] if i != -1]
    context = "\n\n".join(retrieved_chunks)

    print("\n--- Retrieved Context ---\n", context)  # Debugging print

    # Prompt engineering: a one-shot example shows the model the expected
    # answer style before the real context/question pair.
    prompt = (
        "You are a helpful assistant reading student notes or textbook passages.\n\n"
        "Based on the context provided, answer the question accurately and clearly.\n\n"
        "### Example\n"
        "Context:\nArtificial systems are created by people. These systems are designed to perform specific tasks, improve efficiency, and solve problems. Examples include knowledge systems, engineering systems, and social systems.\n\n"
        "Question: What is an Artificial System?\n"
        "Answer: Artificial systems are systems created by humans to perform specific tasks, improve efficiency, and solve problems. They include systems like knowledge systems, engineering systems, and social systems.\n\n"
        "### Now answer this\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer:"
    )

    # do_sample=False means greedy decoding, so answers are deterministic;
    # max_length caps the generated answer at 512 tokens.
    result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
    return result.strip()

# ===============================
# GRADIO INTERFACES
# ===============================
upload_interface = gr.Interface(
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
    title="Upload Document",
    description="Upload your Word or PDF document for question answering."
)

search_interface = gr.Interface(
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
    outputs="text",
    title="Ask the Document",
    description="Ask questions about the uploaded content. The chatbot will answer based on the document."
)

# Both tabs run in the same Python process, so the in-memory index updated
# in "Upload" is immediately visible to "Ask".
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
app.launch()