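"""Gradio app: semantic search over uploaded PDF and DOCX documents.

Texts are embedded with sentence-transformers/all-MiniLM-L6-v2 loaded through
plain transformers (no sentence-transformers dependency) and indexed in FAISS.
"""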
import os
import pickle
import torch
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer
import faiss
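# Likely requirements.txt for this app (an assumption inferred from the
# imports, not taken from the original source):
#   gradio
#   transformers
#   torch
#   faiss-cpu
#   PyMuPDF
#   python-docx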
# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Mean-pool over valid tokens, the documented usage for all-MiniLM-L6-v2
    # (the [CLS] token alone is not trained as a sentence embedding here).
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
    # L2-normalize so IndexFlatIP (inner product) scores are cosine similarities.
    return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy()
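# Illustrative sanity check (not part of the app; safe to delete):
#   vec = get_embeddings("hello world")               # numpy array, shape (1, 384)
#   assert abs((vec ** 2).sum() ** 0.5 - 1.0) < 1e-5  # unit L2 norm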
# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
index_path = "faiss_index.bin"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384  # Output dimension of all-MiniLM-L6-v2

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # FAISS indexes are SWIG objects and are not reliably picklable;
        # use FAISS's own serialization instead.
        index = faiss.read_index(index_path)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)
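# With L2-normalized vectors, the inner product computed by IndexFlatIP
# equals cosine similarity, so search scores are directly comparable.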
# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        text = "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
    return text
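# A plain-text extractor could follow the same pattern (hypothetical helper,
# not in the original app):
#   def extract_text_from_txt(txt_path):
#       with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
#           return f.read()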
# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    global index, document_texts
    try:
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.lower().endswith(".pdf"):
                text = extract_text_from_pdf(file_path)
            elif file_path.lower().endswith(".docx"):
                text = extract_text_from_docx(file_path)
            else:
                continue
            # One chunk per non-empty line of extracted text.
            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue
            embeddings = get_embeddings(sentences)
            index.add(embeddings)
            document_texts.extend(sentences)
            total_sentences += len(sentences)

        # Persist the updated index and stored texts.
        faiss.write_index(index, index_path)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)

        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"
def query_text(query):
    try:
        if index.ntotal == 0:
            return "No documents indexed yet"
        query_embedding = get_embeddings(query)
        D, I = index.search(query_embedding, k=3)
        results = []
        for idx in I[0]:
            # FAISS pads results with -1 when fewer than k neighbours exist.
            if 0 <= idx < len(document_texts):
                results.append(document_texts[idx])
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"
# =============================================
# GRADIO INTERFACE
# =============================================
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox(label="Status")
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox(label="Results")

    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()
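# On a Hugging Face Space using the Gradio SDK, this file is typically saved
# as app.py and launched automatically; locally, run `python app.py`.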