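"""Gradio app: semantic search over uploaded PDF and DOCX documents.

Texts are embedded with sentence-transformers/all-MiniLM-L6-v2 loaded through
plain transformers (no sentence-transformers dependency) and indexed in FAISS.
"""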
import os
import pickle
import torch
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer
import faiss
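# Likely requirements.txt for this app (an assumption inferred from the
# imports, not taken from the original source):
#   gradio
#   transformers
#   torch
#   faiss-cpu
#   PyMuPDF
#   python-docx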
# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over valid tokens, the documented usage for all-MiniLM-L6-v2
    # (the [CLS] token alone is not trained as a sentence embedding here).
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
    # L2-normalize so IndexFlatIP (inner product) scores are cosine similarities.
    return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy()
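# Illustrative sanity check (not part of the app; safe to delete):
#   vec = get_embeddings("hello world")               # numpy array, shape (1, 384)
#   assert abs((vec ** 2).sum() ** 0.5 - 1.0) < 1e-5  # unit L2 norm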
# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
index_path = "faiss_index.bin"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384  # Output dimension of all-MiniLM-L6-v2

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # FAISS indexes are SWIG objects and are not reliably picklable;
        # use FAISS's own serialization instead.
        index = faiss.read_index(index_path)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)
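# With L2-normalized vectors, the inner product computed by IndexFlatIP
# equals cosine similarity, so search scores are directly comparable.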
# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
    return text
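# A plain-text extractor could follow the same pattern (hypothetical helper,
# not in the original app):
#   def extract_text_from_txt(txt_path):
#       with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
#           return f.read()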
# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    global index, document_texts
    try:
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.lower().endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file_path.lower().endswith(".docx"):
                text = extract_text_from_docx(file_path)
            else:
                continue
            # One chunk per non-empty line of extracted text.
            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue
            embeddings = get_embeddings(sentences)
            index.add(embeddings)
            document_texts.extend(sentences)
            total_sentences += len(sentences)

        # Persist the updated index and stored texts.
        faiss.write_index(index, index_path)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)

        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"
def query_text(query):
    try:
        if index.ntotal == 0:
            return "No documents indexed yet"
        query_embedding = get_embeddings(query)
        D, I = index.search(query_embedding, k=3)
        results = []
        for idx in I[0]:
            # FAISS pads results with -1 when fewer than k neighbours exist.
            if 0 <= idx < len(document_texts):
                results.append(document_texts[idx])
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"
# =============================================
# GRADIO INTERFACE
# =============================================
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox(label="Status")
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox(label="Results")

    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()
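# On a Hugging Face Space using the Gradio SDK, this file is typically saved
# as app.py and launched automatically; locally, run `python app.py`.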