# Chatbot / app.py
import os
import pickle

import numpy as np
import torch
import gradio as gr
import fitz  # PyMuPDF
import faiss
from docx import Document
from transformers import AutoModel, AutoTokenizer
# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over non-padding tokens (the pooling this model's card specifies),
    # then L2-normalise so IndexFlatIP scores are cosine similarities.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
    embeddings = np.ascontiguousarray(embeddings.cpu().numpy(), dtype="float32")
    faiss.normalize_L2(embeddings)
    return embeddings
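# Illustrative sanity check (not part of the app flow): embeddings come back
# as a (batch, 384) float32 array with unit L2 norm, e.g.
#   vecs = get_embeddings(["hello world", "document search"])
#   assert vecs.shape == (2, 384)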
# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384 # Dimension for all-MiniLM-L6-v2
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        # Reset both so the index and the stored texts stay aligned.
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)
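# Note: pickling works for a small flat index, but FAISS also ships its own
# serialisation, which is the more conventional route:
#   faiss.write_index(index, "faiss.index")
#   index = faiss.read_index("faiss.index")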
# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        # Context manager ensures the document is closed after reading.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text
def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
    return text
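# Optional sketch: the app indexes one line at a time, which can fragment
# context. A simple alternative is to group consecutive lines into larger
# chunks before embedding. `chunk_lines` below is an illustrative helper,
# not wired into the app.
def chunk_lines(lines, max_chars=500):
    chunks, current = [], ""
    for line in lines:
        if current and len(current) + len(line) + 1 > max_chars:
            chunks.append(current)
            current = line
        else:
            current = f"{current}\n{line}" if current else line
    if current:
        chunks.append(current)
    return chunks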
# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    global index, document_texts
    if not files:
        return "No files uploaded"
    try:
        total_added = 0
        for file in files:
            file_path = file.name
            if file_path.lower().endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file_path.lower().endswith(".docx"):
                text = extract_text_from_docx(file_path)
            else:
                continue
            # Index one entry per non-empty line.
            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue
            embeddings = get_embeddings(sentences)
            index.add(embeddings)
            document_texts.extend(sentences)
            total_added += len(sentences)
        # Persist the updated index and texts
        with open(index_path, "wb") as f:
            pickle.dump(index, f)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)
        return f"Processed {len(files)} files, added {total_added} sentences"
    except Exception as e:
        return f"Error: {str(e)}"
def query_text(query):
    try:
        if index.ntotal == 0:
            return "No documents indexed yet"
        query_embedding = get_embeddings(query)
        D, I = index.search(query_embedding, k=3)
        results = []
        for idx in I[0]:
            if 0 <= idx < len(document_texts):
                results.append(document_texts[idx])
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"
# =============================================
# GRADIO INTERFACE
# =============================================
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()
    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()
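# A publicly shareable link can be requested with demo.launch(share=True);
# on Hugging Face Spaces the default launch() is sufficient.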