# Chatbot / app.py
# NaimaAqeel's picture
# Update app.py
# d87413b verified
# raw
# history blame
# 4.62 kB
import os
import pickle
import numpy as np
import gradio as gr
import fitz # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer, pipeline
import faiss
import torch
# =============================================
# EMBEDDING MODEL SETUP
# =============================================
# Hugging Face model id shared by the tokenizer and the encoder below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Both objects are created once at import time and reused by get_embeddings().
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
def get_embeddings(texts):
    """Embed one or more texts as L2-normalised sentence vectors.

    The encoder output is mean-pooled over real (non-padding) tokens and then
    L2-normalised, which is the pooling the all-MiniLM-L6-v2 model card
    documents. With unit-length vectors, an inner-product index ranks by
    cosine similarity.

    NOTE: vectors produced here are not comparable with vectors produced by
    first-token (CLS) pooling; an index persisted with the latter should be
    rebuilt.

    Args:
        texts: A single string or a list of strings.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    if not texts:
        # Nothing to encode; skip the tokenizer/model call entirely.
        return np.empty((0, embedding_model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Zero out padding positions so they do not dilute the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    sentence_embeddings = torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)
    return sentence_embeddings.cpu().numpy()
# =============================================
# TEXT CHUNKING
# =============================================
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into consecutive character windows that share an overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one; must be smaller than ``chunk_size``.

    Returns:
        A list of chunks in document order. An empty *text* yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = min(text_length, start + chunk_size)
        chunks.append(text[start:end])
        if end == text_length:
            # The tail is already covered; a further window would only
            # repeat characters contained in this chunk.
            break
        start += step
    return chunks
# =============================================
# FAISS INDEX SETUP
# =============================================
# On-disk locations of the pickled index and of the chunk texts it refers to.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

embedding_dim = 384 # for all-MiniLM-L6-v2

# Start with an empty store; it is replaced only if BOTH files load cleanly.
document_texts = []
index = faiss.IndexFlatIP(embedding_dim)

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # NOTE: pickle.load can execute code embedded in the file, so these
        # paths must only ever point at files written by this application.
        with open(index_path, "rb") as fh:
            _saved_index = pickle.load(fh)
        with open(document_texts_path, "rb") as fh:
            _saved_texts = pickle.load(fh)
    except Exception as e:
        print(f"Error loading index: {e}")
    else:
        index, document_texts = _saved_index, _saved_texts
# =============================================
# DOCUMENT PROCESSING
# =============================================
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at *path*.

    Extraction is best-effort: any error is printed and whatever text was
    collected before the failure is returned (possibly an empty string).

    Args:
        path: Filesystem path of the PDF file.

    Returns:
        The page texts joined together in page order.
    """
    page_texts = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
def extract_text_from_docx(path):
    """Return the paragraphs of the .docx file at *path*, newline-separated.

    Any error while opening or reading the document is printed and an empty
    string is returned instead.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""
# =============================================
# UPLOAD AND INDEX FILE
# =============================================
def upload_document(file):
    """Extract, chunk, embed and index an uploaded PDF or DOCX file.

    Args:
        file: The value delivered by the upload widget: either an object with
            a ``name`` attribute holding the file path, a plain path string,
            or ``None`` when nothing was uploaded.

    Returns:
        A human-readable status message.
    """
    if file is None:
        return "No file uploaded."

    # Accept both file-like objects exposing ``.name`` and plain path strings.
    path = file if isinstance(file, str) else file.name
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        text = extract_text_from_pdf(path)
    elif ext == ".docx":
        text = extract_text_from_docx(path)
    else:
        return "Unsupported file type."

    if not text.strip():
        # Scanned/empty documents yield no chunks; embedding an empty batch
        # would fail, and there is nothing worth persisting.
        return "No extractable text found in the document."

    chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(chunks)
    index.add(np.ascontiguousarray(chunk_embeddings, dtype="float32"))
    document_texts.extend(chunks)

    # Persist the index and its texts together so they stay aligned.
    with open(index_path, "wb") as f:
        pickle.dump(index, f)
    with open(document_texts_path, "wb") as f:
        pickle.dump(document_texts, f)
    return "Document uploaded and indexed successfully."
# =============================================
# QA PIPELINE WITH FLAN-T5
# =============================================
# Text-to-text generation pipeline (FLAN-T5 base), created once at import time;
# generate_answer_from_file() feeds it the retrieval prompt.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
def generate_answer_from_file(query, top_k=3):
    """Answer *query* using the most similar indexed document chunks.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Returns:
        The generated answer, or a short status message when the query is
        blank, nothing is indexed, or no chunk could be retrieved.
    """
    if not query or not query.strip():
        return "Please enter a question."
    if not document_texts or index.ntotal == 0:
        return "No documents indexed yet."

    # Never ask for more neighbours than the index holds.
    k = min(top_k, index.ntotal)
    query_vector = get_embeddings(query).astype("float32")
    scores, indices = index.search(query_vector, k=k)

    # FAISS reports missing neighbours as -1, which would otherwise silently
    # select the last chunk; also guard against an index/text mismatch.
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]
    if not retrieved_chunks:
        return "No relevant content found."

    context = " ".join(retrieved_chunks)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = qa_pipeline(prompt, max_length=200)[0]['generated_text']
    return result
# =============================================
# GRADIO UI
# =============================================
# Tab 1: send an uploaded file to upload_document and show its status text.
upload_interface = gr.Interface(
    title="Upload Document",
    description="Upload a Word or PDF file to index it for question answering.",
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
)

# Tab 2: send the typed question to generate_answer_from_file.
search_interface = gr.Interface(
    title="Ask Your Document",
    description="Ask any question. The chatbot will read the document and answer like ChatGPT.",
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
    outputs="text",
)

# Combine both interfaces into one tabbed app and start serving it.
app = gr.TabbedInterface(
    interface_list=[upload_interface, search_interface],
    tab_names=["Upload", "Ask"],
)
app.launch()