Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

d87413b verified 5 days ago

raw

history blame

4.62 kB

	import os
	import pickle
	import numpy as np
	import gradio as gr
	import fitz # PyMuPDF
	from docx import Document
	from transformers import AutoModel, AutoTokenizer, pipeline
	import faiss
	import torch

	# =============================================
	# EMBEDDING MODEL SETUP
	# =============================================
	# Hugging Face Hub id of the encoder used to embed document chunks and queries.
	model_name = "sentence-transformers/all-MiniLM-L6-v2"
	# Tokenizer and encoder are loaded once at import time; get_embeddings() reads
	# both as module-level globals.
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	embedding_model = AutoModel.from_pretrained(model_name)

	def get_embeddings(texts):
	    """Embed one or more texts with the module-level sentence encoder.

	    Token embeddings are mean-pooled over the attention mask and then
	    L2-normalised, so that inner-product search over the results ranks by
	    cosine similarity.

	    Args:
	        texts: A single string or a sequence of strings.

	    Returns:
	        A float32 NumPy array with one row per input text. An empty
	        sequence yields an array with zero rows.

	    Note:
	        Vectors stored by an earlier version of this function (first-token
	        pooling, unnormalised) are not comparable with these; rebuild any
	        persisted index after upgrading.
	    """
	    if isinstance(texts, str):
	        texts = [texts]
	    texts = list(texts)
	    if not texts:
	        # Nothing to embed: skip the tokenizer/model and hand back an empty
	        # matrix of the right width so callers can treat it uniformly.
	        return np.empty((0, embedding_model.config.hidden_size), dtype=np.float32)

	    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
	    with torch.no_grad():
	        outputs = embedding_model(**inputs)

	    token_embeddings = outputs.last_hidden_state
	    # Zero out padding positions so they do not dilute the average.
	    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
	    summed = (token_embeddings * mask).sum(dim=1)
	    counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
	    pooled = torch.nn.functional.normalize(summed / counts, p=2, dim=1)
	    return pooled.cpu().numpy().astype(np.float32, copy=False)

	# =============================================
	# TEXT CHUNKING
	# =============================================
	def chunk_text(text, chunk_size=500, overlap=50):
	    """Split *text* into overlapping character windows.

	    Consecutive chunks start ``chunk_size - overlap`` characters apart.
	    Chunking stops as soon as a chunk reaches the end of the text, so the
	    result never ends with a fragment that is already fully contained in
	    the previous chunk.

	    Args:
	        text: The string to split.
	        chunk_size: Maximum number of characters per chunk; must be positive.
	        overlap: Number of characters shared by neighbouring chunks; must be
	            smaller than ``chunk_size``.

	    Returns:
	        A list of chunk strings; empty when *text* is empty.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
	            smaller than ``chunk_size`` (either would prevent the window
	            from advancing).
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be positive")
	    if overlap >= chunk_size:
	        raise ValueError("overlap must be smaller than chunk_size")

	    step = chunk_size - overlap
	    total = len(text)
	    chunks = []
	    start = 0
	    while start < total:
	        end = min(total, start + chunk_size)
	        chunks.append(text[start:end])
	        if end == total:
	            # The window already covers the tail; another step would only
	            # repeat characters that are part of this chunk.
	            break
	        start += step
	    return chunks

	# =============================================
	# FAISS INDEX SETUP
	# =============================================
	# On-disk locations of the pickled FAISS index and of the chunk texts whose
	# positions correspond to the index's vector ids.
	index_path = "faiss_index.pkl"
	document_texts_path = "document_texts.pkl"
	document_texts = []

	embedding_dim = 384 # for all-MiniLM-L6-v2
	index = None
	if os.path.exists(index_path) and os.path.exists(document_texts_path):
	    try:
	        # NOTE(review): pickle.load executes arbitrary code from the file; this
	        # is only acceptable while both files are written solely by this app.
	        with open(index_path, "rb") as f:
	            _loaded_index = pickle.load(f)
	        with open(document_texts_path, "rb") as f:
	            _loaded_texts = pickle.load(f)
	        # The two files are written one after the other, so an interrupted
	        # save can leave them out of step; a mismatch would later surface as
	        # an IndexError when search results are mapped back to texts.
	        if _loaded_index.ntotal != len(_loaded_texts):
	            raise ValueError(
	                f"index holds {_loaded_index.ntotal} vectors but "
	                f"{len(_loaded_texts)} texts were stored"
	            )
	        index = _loaded_index
	        document_texts = _loaded_texts
	    except Exception as e:
	        # Best effort: fall back to an empty index rather than fail at import.
	        print(f"Error loading index: {e}")
	if index is None:
	    index = faiss.IndexFlatIP(embedding_dim)

	# =============================================
	# DOCUMENT PROCESSING
	# =============================================
	def extract_text_from_pdf(path):
	    """Return the concatenated text of every page of the PDF at *path*.

	    Extraction is best effort: any error is printed and whatever text was
	    collected before the failure (possibly none) is returned.

	    Args:
	        path: Filesystem path of the PDF file.

	    Returns:
	        The extracted text, or "" when nothing could be read.
	    """
	    pages = []
	    try:
	        # The context manager closes the document even if a page fails.
	        with fitz.open(path) as doc:
	            for page in doc:
	                pages.append(page.get_text())
	    except Exception as e:
	        print(f"PDF error: {e}")
	    return "".join(pages)

	def extract_text_from_docx(path):
	    """Return the paragraphs of the .docx file at *path*, joined by newlines.

	    Any error while opening or reading the document is printed and an empty
	    string is returned instead.
	    """
	    try:
	        paragraphs = Document(path).paragraphs
	        return "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as e:
	        print(f"DOCX error: {e}")
	        return ""

	# =============================================
	# UPLOAD AND INDEX FILE
	# =============================================
	def upload_document(file):
	    """Extract, chunk, embed and index an uploaded PDF or DOCX file.

	    New chunks are appended to the module-level FAISS index and to
	    ``document_texts``; both are then written back to disk.

	    Args:
	        file: The uploaded file, either as an object exposing the path in
	            its ``name`` attribute or as a plain path string. ``None``
	            (nothing selected) is accepted.

	    Returns:
	        A human-readable status message.
	    """
	    if file is None:
	        return "No file provided."

	    # Accept both a file-like object carrying `.name` and a bare path string.
	    path = getattr(file, "name", file)
	    ext = os.path.splitext(path)[-1].lower()
	    if ext == ".pdf":
	        text = extract_text_from_pdf(path)
	    elif ext == ".docx":
	        text = extract_text_from_docx(path)
	    else:
	        return "Unsupported file type."

	    # The extractors return "" on failure; image-only documents may also
	    # yield nothing. Stop here instead of sending an empty batch to the
	    # embedder and index.
	    if not text or not text.strip():
	        return "No text could be extracted from the document."

	    chunks = chunk_text(text)
	    chunk_embeddings = get_embeddings(chunks)
	    index.add(np.array(chunk_embeddings).astype('float32'))
	    document_texts.extend(chunks)

	    # Persist in the same pickle format the startup loader reads.
	    with open(index_path, "wb") as f:
	        pickle.dump(index, f)
	    with open(document_texts_path, "wb") as f:
	        pickle.dump(document_texts, f)

	    return "Document uploaded and indexed successfully."

	# =============================================
	# QA PIPELINE WITH FLAN-T5
	# =============================================
	# Text-to-text generation pipeline backed by google/flan-t5-base; built once at
	# import time and called by generate_answer_from_file() for every question.
	qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

	def generate_answer_from_file(query, top_k=3):
	    """Answer *query* using the most similar indexed document chunks.

	    The query is embedded, the nearest chunks are fetched from the
	    module-level FAISS index, and a prompt containing them is passed to
	    ``qa_pipeline``.

	    Args:
	        query: The user's question.
	        top_k: Maximum number of chunks to retrieve as context.

	    Returns:
	        The generated answer, or a short status message when there is
	        nothing indexed, the question is blank, or no chunk was retrieved.
	    """
	    if not document_texts:
	        return "No documents indexed yet."
	    if not query or not query.strip():
	        return "Please enter a question."

	    query_vector = get_embeddings(query).astype("float32")
	    # Never ask for more neighbours than exist: FAISS pads missing results
	    # with -1, which as a Python index would silently pick the last chunk.
	    k = max(1, min(int(top_k), len(document_texts)))
	    scores, indices = index.search(query_vector, k=k)
	    retrieved_chunks = [
	        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
	    ]
	    if not retrieved_chunks:
	        return "No relevant passages found."
	    context = " ".join(retrieved_chunks)

	    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
	    result = qa_pipeline(prompt, max_length=200)[0]['generated_text']
	    return result

	# =============================================
	# GRADIO UI
	# =============================================
	# Tab 1: accepts a .pdf or .docx upload and passes it to upload_document();
	# the returned status message is shown as plain text.
	upload_interface = gr.Interface(
	fn=upload_document,
	inputs=gr.File(file_types=[".pdf", ".docx"]),
	outputs="text",
	title="Upload Document",
	description="Upload a Word or PDF file to index it for question answering."
	)

	# Tab 2: free-text question box wired to generate_answer_from_file(); only the
	# query is supplied, so top_k keeps its default.
	search_interface = gr.Interface(
	fn=generate_answer_from_file,
	inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
	outputs="text",
	title="Ask Your Document",
	description="Ask any question. The chatbot will read the document and answer like ChatGPT."
	)

	# Combine both interfaces into one tabbed app and launch it. launch() runs at
	# module level, i.e. whenever this file is executed or imported.
	app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
	app.launch()