# Spaces:
# Build error
# Build error
import os | |
import pickle | |
import numpy as np | |
import gradio as gr | |
import fitz # PyMuPDF | |
from docx import Document | |
from transformers import AutoModel, AutoTokenizer | |
import faiss | |
import torch | |
# =============================================
# EMBEDDING MODEL SETUP
# =============================================
# Hugging Face Hub id of the checkpoint; the tokenizer and the encoder are
# both loaded from it so that they stay in sync.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Loaded once at import time (tokenizer first, then the encoder) and reused
# by every call that needs embeddings.
tokenizer, embedding_model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)
def get_embeddings(texts):
    """Embed one string or a batch of strings with the module-level encoder.

    The sentence vector is the attention-masked mean of the token embeddings,
    L2-normalised, which is the pooling the all-MiniLM-L6-v2 model card
    prescribes. Unit-length vectors also make inner-product scores equal to
    cosine similarity.

    NOTE: vectors produced by the earlier CLS-token pooling are not comparable
    with these; an index built with the old vectors must be rebuilt.

    Args:
        texts: A single string or a list of strings.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    if isinstance(texts, (list, tuple)) and not texts:
        # Nothing to encode: return an empty matrix instead of calling the model.
        return np.empty((0, embedding_model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Zero out padding positions so they do not dilute the average.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    sentence_embeddings = torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)
    return sentence_embeddings.cpu().numpy()
# =============================================
# DOCUMENT STORAGE SETUP
# =============================================
# On-disk locations of the pickled FAISS index and of the pickled texts.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
embedding_dim = 384  # Dimension for all-MiniLM-L6-v2


def _restore_document_store():
    """Return ``(index, texts)`` from disk, or a fresh empty pair.

    A fresh inner-product index and an empty text list are returned when
    either file is missing or when loading either file raises.
    """
    both_present = os.path.exists(index_path) and os.path.exists(document_texts_path)
    if not both_present:
        return faiss.IndexFlatIP(embedding_dim), []

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # this is only acceptable while these files are written by this app alone.
    try:
        with open(index_path, "rb") as index_file:
            restored_index = pickle.load(index_file)
        with open(document_texts_path, "rb") as texts_file:
            restored_texts = pickle.load(texts_file)
    except Exception as e:
        print(f"Error loading index: {e}")
        return faiss.IndexFlatIP(embedding_dim), []
    return restored_index, restored_texts


index, document_texts = _restore_document_store()
# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, best-effort.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts concatenated in page order. If opening or reading
        fails, the error is printed and whatever text was collected before
        the failure (possibly ``""``) is returned.
    """
    page_texts = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Append page by page so earlier pages survive a later failure.
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
def extract_text_from_docx(docx_path): | |
text = "" | |
try: | |
doc = Document(docx_path) | |
text = "\n".join([para.text for para in doc.paragraphs]) | |
except Exception as e: | |