import os
import sys
import pickle
import numpy as np
import torch
import gradio as gr
import fitz  # PyMuPDF
from docx import Document
from transformers import AutoModel, AutoTokenizer
import faiss

# =============================================
# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
# =============================================
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)
embedding_model.eval()  # inference only; disables dropout

def get_embeddings(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # CLS-token pooling; .contiguous() because FAISS expects C-contiguous float32 arrays
    return outputs.last_hidden_state[:, 0].contiguous().cpu().numpy()
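
# Note: all-MiniLM-L6-v2 was trained with *mean* pooling in sentence-transformers,
# so a mean-pooled variant tracks the original model's embeddings more closely
# than CLS pooling. A minimal sketch (not wired into the app; the name
# get_embeddings_mean is introduced here for illustration):
def get_embeddings_mean(texts):
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    # Average token embeddings, masking out padding positions
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).cpu().numpy()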

# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []
embedding_dim = 384  # Dimension for all-MiniLM-L6-v2

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # FAISS indexes are SWIG objects and cannot be pickled directly,
        # so the index is stored as a serialized byte array instead
        with open(index_path, "rb") as f:
            index = faiss.deserialize_index(pickle.load(f))
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
    except Exception as e:
        print(f"Error loading index: {e}")
        index = faiss.IndexFlatIP(embedding_dim)
        document_texts = []
else:
    index = faiss.IndexFlatIP(embedding_dim)
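
# IndexFlatIP ranks by raw inner product; with L2-normalized vectors this is
# exactly cosine similarity (cos(a, b) = a . b when |a| = |b| = 1), which is
# why upload_files and query_text below normalize embeddings before use.
# Quick illustrative check (not part of the app):
#   v = np.random.rand(1, embedding_dim).astype("float32")
#   faiss.normalize_L2(v)  # in-place; np.linalg.norm(v) is now ~1.0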

# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
        doc.close()
    except Exception as e:
        print(f"PDF error: {e}")
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        text = "\n".join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"DOCX error: {e}")
    return text
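
# Note: Document(...).paragraphs skips text inside tables. A sketch that also
# walks table cells (illustrative only; not wired into upload_files):
def extract_text_from_docx_with_tables(docx_path):
    doc = Document(docx_path)
    parts = [para.text for para in doc.paragraphs]
    for table in doc.tables:
        for row in table.rows:
            # Each cell's text is appended as its own line
            parts.extend(cell.text for cell in row.cells)
    return "\n".join(parts)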

# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    global index, document_texts
    try:
        total_sentences = 0
        for file in files:
            file_path = file.name
            if file_path.lower().endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif file_path.lower().endswith('.docx'):
                text = extract_text_from_docx(file_path)
            else:
                continue
            sentences = [s.strip() for s in text.split("\n") if s.strip()]
            if not sentences:
                continue
            embeddings = get_embeddings(sentences)
            faiss.normalize_L2(embeddings)  # unit vectors so inner product = cosine
            index.add(embeddings)
            document_texts.extend(sentences)
            total_sentences += len(sentences)
        # Save updated index and texts
        with open(index_path, "wb") as f:
            pickle.dump(faiss.serialize_index(index), f)
        with open(document_texts_path, "wb") as f:
            pickle.dump(document_texts, f)
        return f"Processed {len(files)} files, added {total_sentences} sentences"
    except Exception as e:
        return f"Error: {str(e)}"

def query_text(query):
    try:
        query_embedding = get_embeddings(query)
        faiss.normalize_L2(query_embedding)  # match the normalization used at index time
        D, I = index.search(query_embedding, k=3)
        results = []
        for idx in I[0]:
            # FAISS pads results with -1 when fewer than k vectors are indexed
            if 0 <= idx < len(document_texts):
                results.append(document_texts[idx])
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"

# =============================================
# GRADIO INTERFACE
# =============================================
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()
    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

if __name__ == "__main__":
    demo.launch()