import os
import pickle

import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
import torch
from docx import Document
from transformers import AutoModel, AutoTokenizer

# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
embedding_model.eval()


def get_embeddings(texts):
    """Embed a string or list of strings with mean pooling and L2 normalization.

    all-MiniLM-L6-v2 was trained with mean pooling over token embeddings
    (not CLS pooling), and L2-normalized vectors make the inner-product
    index below score by cosine similarity.
    """
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over tokens, masking out padding positions.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    embeddings = summed / counts
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings.cpu().numpy()


# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
# FAISS indexes are SWIG-wrapped objects and do not pickle reliably, so the
# index is saved with faiss.write_index/read_index; only the raw sentence
# list is pickled.
index_path = "faiss_index.bin"
document_texts_path = "document_texts.pkl"
embedding_dim = 384  # output dimension of all-MiniLM-L6-v2

document_texts = []
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        index = faiss.read_index(index_path)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading saved state, starting fresh: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)


# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text


def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
    return text


# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    global index, document_texts
    try:
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.lower().endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file_path.lower().endswith(".docx"):
                text = extract_text_from_docx(file_path)
            else:
                continue

            # Each non-empty line of the document becomes one indexed unit.
            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue

            embeddings = get_embeddings(sentences)
            index.add(np.ascontiguousarray(embeddings, dtype="float32"))
            document_texts.extend(sentences)
            total_sentences += len(sentences)

        # Save the updated index and sentence store.
        faiss.write_index(index, index_path)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)

        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"


def query_text(query):
    try:
        if index.ntotal == 0:
            return "No documents indexed yet"
        query_embedding = get_embeddings(query)
        D, I = index.search(np.ascontiguousarray(query_embedding, dtype="float32"), k=3)
        results = [document_texts[idx] for idx in I[0] if 0 <= idx < len(document_texts)]
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"


# =============================================
# GRADIO INTERFACE
# =============================================
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()

    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)


if __name__ == "__main__":
    demo.launch()