NaimaAqeel committed on
Commit
3ac4e4b
·
verified ·
1 Parent(s): 24d9947

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -46
app.py CHANGED
@@ -3,47 +3,34 @@ import sys
3
  import pickle
4
  import numpy as np
5
  import gradio as gr
6
- from typing import List
7
  import fitz # PyMuPDF
8
  from docx import Document
9
- from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
10
  import faiss
11
 
12
  # =============================================
13
- # FIX FOR HUGGINGFACE HUB IMPORT ISSUE
14
  # =============================================
15
- try:
16
- from huggingface_hub import cached_download
17
- except ImportError:
18
- from huggingface_hub.utils import cached_download
19
- import huggingface_hub
20
- sys.modules['huggingface_hub'].cached_download = cached_download
21
-
22
- # Now we can safely import sentence-transformers
23
- from sentence_transformers import SentenceTransformer
 
 
24
 
25
  # =============================================
26
- # INITIALIZE MODELS
27
  # =============================================
28
- # Initialize embedding model (using direct transformers as fallback)
29
- try:
30
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
31
- except Exception as e:
32
- print(f"Failed to load SentenceTransformer, falling back to direct transformers: {e}")
33
- model_name = "sentence-transformers/all-MiniLM-L6-v2"
34
- tokenizer = AutoTokenizer.from_pretrained(model_name)
35
- embedding_model = AutoModel.from_pretrained(model_name)
36
-
37
- def get_embeddings(texts):
38
- inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
39
- outputs = embedding_model(**inputs)
40
- return outputs.last_hidden_state.mean(dim=1).detach().numpy()
41
-
42
- # Initialize FAISS index
43
  index_path = "faiss_index.pkl"
44
  document_texts_path = "document_texts.pkl"
45
  document_texts = []
46
 
 
47
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
48
  try:
49
  with open(index_path, "rb") as f:
@@ -51,10 +38,10 @@ if os.path.exists(index_path) and os.path.exists(document_texts_path):
51
  with open(document_texts_path, "rb") as f:
52
  document_texts = pickle.load(f)
53
  except Exception as e:
54
- print(f"Error loading FAISS index: {e}")
55
- index = faiss.IndexFlatIP(384) # 384 is dim for all-MiniLM-L6-v2
56
  else:
57
- index = faiss.IndexFlatIP(384)
58
 
59
  # =============================================
60
  # DOCUMENT PROCESSING FUNCTIONS
@@ -93,14 +80,12 @@ def upload_files(files):
93
  else:
94
  continue
95
 
96
- sentences = [s for s in text.split("\n") if s.strip()]
97
-
98
- if hasattr(embedding_model, 'encode'):
99
- embeddings = embedding_model.encode(sentences, normalize_embeddings=True)
100
- else:
101
- embeddings = get_embeddings(sentences)
102
-
103
- index.add(np.array(embeddings))
104
  document_texts.extend(sentences)
105
 
106
  # Save updated index
@@ -115,12 +100,9 @@ def upload_files(files):
115
 
116
  def query_text(query):
117
  try:
118
- if hasattr(embedding_model, 'encode'):
119
- query_embedding = embedding_model.encode([query], normalize_embeddings=True)
120
- else:
121
- query_embedding = get_embeddings([query])
122
-
123
- D, I = index.search(np.array(query_embedding), k=3)
124
  results = []
125
  for idx in I[0]:
126
  if 0 <= idx < len(document_texts):
@@ -149,4 +131,5 @@ with gr.Blocks() as demo:
149
  upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
150
  search_btn.click(query_text, inputs=query_input, outputs=results_output)
151
 
152
- demo.launch()
 
 
3
  import pickle
4
  import numpy as np
5
  import gradio as gr
 
6
  import fitz # PyMuPDF
7
  from docx import Document
8
+ from transformers import AutoModel, AutoTokenizer
9
  import faiss
10
 
11
  # =============================================
12
+ # EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
13
  # =============================================
14
+ model_name = "sentence-transformers/all-MiniLM-L6-v2"
15
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
16
+ embedding_model = AutoModel.from_pretrained(model_name)
17
+
18
def get_embeddings(texts):
    """Embed one or more texts with the module-level tokenizer and encoder.

    Each text is tokenized, run through ``embedding_model`` without gradient
    tracking, mean-pooled over its non-padding tokens, and L2-normalized.
    Unit-length vectors make the inner product used by ``faiss.IndexFlatIP``
    equal to cosine similarity.

    NOTE(review): the previous body returned the un-normalized first-token
    (CLS) vector. An index persisted with those vectors is not comparable
    with the output of this function and should be rebuilt.

    Args:
        texts: A single string, or an iterable of strings.

    Returns:
        A C-contiguous ``float32`` NumPy array of shape
        ``(len(texts), hidden_size)``; ``(0, hidden_size)`` for empty input.
    """
    # Imported locally: the body needs torch, but the file's visible import
    # block never brings it into scope, so `torch.no_grad()` raised NameError.
    import torch

    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # The tokenizer rejects an empty batch; return an empty matrix instead so
    # callers can pass the result straight to `index.add`.
    if not texts:
        return np.zeros((0, embedding_model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = embedding_model(**inputs)

    # Mean-pool token embeddings, weighting by the attention mask so padding
    # positions do not dilute the sentence vector.
    token_embeddings = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    pooled = summed / counts

    pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)

    # FAISS expects contiguous float32 input.
    return np.ascontiguousarray(pooled.cpu().numpy(), dtype=np.float32)
25
 
26
  # =============================================
27
+ # DOCUMENT STORAGE SETUP
28
  # =============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  index_path = "faiss_index.pkl"
30
  document_texts_path = "document_texts.pkl"
31
  document_texts = []
32
 
33
+ embedding_dim = 384 # Dimension for all-MiniLM-L6-v2
34
  if os.path.exists(index_path) and os.path.exists(document_texts_path):
35
  try:
36
  with open(index_path, "rb") as f:
 
38
  with open(document_texts_path, "rb") as f:
39
  document_texts = pickle.load(f)
40
  except Exception as e:
41
+ print(f"Error loading index: {e}")
42
+ index = faiss.IndexFlatIP(embedding_dim)
43
  else:
44
+ index = faiss.IndexFlatIP(embedding_dim)
45
 
46
  # =============================================
47
  # DOCUMENT PROCESSING FUNCTIONS
 
80
  else:
81
  continue
82
 
83
+ sentences = [s.strip() for s in text.split("\n") if s.strip()]
84
+ if not sentences:
85
+ continue
86
+
87
+ embeddings = get_embeddings(sentences)
88
+ index.add(embeddings)
 
 
89
  document_texts.extend(sentences)
90
 
91
  # Save updated index
 
100
 
101
  def query_text(query):
102
  try:
103
+ query_embedding = get_embeddings(query)
104
+ D, I = index.search(query_embedding, k=3)
105
+
 
 
 
106
  results = []
107
  for idx in I[0]:
108
  if 0 <= idx < len(document_texts):
 
131
  upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
132
  search_btn.click(query_text, inputs=query_input, outputs=results_output)
133
 
134
+ if __name__ == "__main__":
135
+ demo.launch()