File size: 4,449 Bytes
1649416
24d9947
944d263
24d9947
834c71a
24d9947
 
3ac4e4b
24d9947
 
 
3ac4e4b
24d9947
3ac4e4b
 
 
 
 
 
 
 
 
 
 
24d9947
 
3ac4e4b
24d9947
 
 
 
 
3ac4e4b
24d9947
 
 
 
 
 
 
3ac4e4b
 
24d9947
3ac4e4b
56ec544
24d9947
 
 
944d263
 
 
 
24d9947
944d263
 
24d9947
944d263
834c71a
944d263
 
 
 
 
 
24d9947
944d263
 
24d9947
 
 
7adb197
944d263
90bf4dc
7adb197
ac5f15c
944d263
 
 
 
7adb197
24d9947
7adb197
3ac4e4b
 
 
 
 
 
ac5f15c
944d263
24d9947
944d263
 
 
 
ac5f15c
24d9947
90bf4dc
24d9947
944d263
24d9947
944d263
3ac4e4b
 
 
24d9947
944d263
24d9947
 
 
 
944d263
24d9947
944d263
24d9947
 
 
944d263
24d9947
944d263
24d9947
 
 
944d263
 
24d9947
 
 
 
 
 
 
944d263
3ac4e4b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import pickle
import sys

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import torch
from docx import Document
from transformers import AutoModel, AutoTokenizer

# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
# Load the MiniLM encoder directly via transformers; pooling into sentence
# embeddings is done manually in get_embeddings() below. Both calls download
# model weights on first run (network I/O) and cache them locally.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    """Embed one or more texts with the MiniLM encoder.

    Args:
        texts: A single string or a list of strings.

    Returns:
        np.ndarray of shape (len(texts), 384): L2-normalized, mean-pooled
        sentence embeddings. Normalization makes the inner product computed
        by the IndexFlatIP index below equal to cosine similarity.
    """
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over real (non-padding) tokens — the pooling this model was
    # trained with — rather than taking the [CLS] token, which MiniLM was not
    # trained to use as a sentence summary.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid divide-by-zero
    embeddings = summed / counts
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()

# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
# The FAISS index and the parallel list of sentence texts are persisted with
# pickle so uploads survive restarts. They MUST stay in lockstep: row i of
# the index corresponds to document_texts[i].
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []

embedding_dim = 384  # Output dimension of all-MiniLM-L6-v2
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
        # Detect a vectors/texts mismatch up front, otherwise search results
        # would silently map to the wrong (or missing) sentences.
        if index.ntotal != len(document_texts):
            raise ValueError(
                f"index has {index.ntotal} vectors but "
                f"{len(document_texts)} texts"
            )
    except Exception as e:
        print(f"Error loading index: {e}")
        # Reset BOTH structures. Resetting only the index (as before) could
        # leave stale texts paired with an empty index when the second pickle
        # load or the consistency check fails.
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)

# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        str: All page text joined together; "" if the file cannot be read
        (the error is printed, not raised).
    """
    text = ""
    try:
        # Use the context manager so the document handle is always closed —
        # the original left the file open, leaking it if a page raised
        # mid-iteration.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docx_path: Filesystem path to the .docx file.

    Returns:
        str: Paragraph texts joined with newlines; "" if the file cannot be
        read (the error is printed, not raised).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""

# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    """Ingest uploaded PDF/DOCX files into the FAISS index.

    Extracts text per file, splits it into non-empty lines ("sentences"),
    embeds them, appends them to the global index/text store, and persists
    both to disk.

    Args:
        files: Gradio file objects; anything that is not .pdf/.docx is
            skipped silently.

    Returns:
        str: A summary message, or "Error: ..." if anything raised.
    """
    global index, document_texts
    try:
        # Accumulate across ALL files. The original reported len(sentences)
        # of only the last file, and raised NameError (masked by the broad
        # except) when no file produced any sentences.
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif file_path.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            else:
                continue  # unsupported extension

            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue

            embeddings = get_embeddings(sentences)
            index.add(embeddings)
            document_texts.extend(sentences)
            total_sentences += len(sentences)

        # Persist after every batch so a restart does not lose uploads.
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)

        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"

def query_text(query):
    """Return the stored sentences most similar to *query*.

    Args:
        query: Free-text search string.

    Returns:
        str: Up to 3 matching sentences separated by "---" dividers,
        "No matches found" when there are none, or "Query error: ..." if
        anything raised.
    """
    try:
        embedding = get_embeddings(query)
        _, neighbor_ids = index.search(embedding, k=3)

        # FAISS pads with -1 when the index holds fewer than k vectors;
        # the range check drops those sentinels.
        matches = [
            document_texts[i]
            for i in neighbor_ids[0]
            if 0 <= i < len(document_texts)
        ]
        if not matches:
            return "No matches found"
        return "\n\n---\n\n".join(matches)
    except Exception as e:
        return f"Query error: {str(e)}"

# =============================================
# GRADIO INTERFACE
# =============================================
# NOTE: component creation order inside the Blocks context defines the page
# layout, so the statement order here is significant.
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    
    # Tab 1: upload PDF/DOCX files and add them to the FAISS index.
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()  # status / error message from upload_files
    
    # Tab 2: free-text semantic search over the indexed sentences.
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()  # matched sentences from query_text
    
    # Wire buttons to the handlers defined above.
    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()