NaimaAqeel committed on
Commit
24d9947
·
verified ·
1 Parent(s): b2bfd05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -102
app.py CHANGED
@@ -1,156 +1,152 @@
1
  import os
2
- import fitz
3
- from docx import Document
4
- from sentence_transformers import SentenceTransformer
5
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
- import faiss
7
- import numpy as np
8
  import pickle
 
9
  import gradio as gr
10
  from typing import List
11
- from langchain_community.llms import HuggingFaceEndpoint
12
- from langchain_community.vectorstores import FAISS
13
- from langchain_community.embeddings import HuggingFaceEmbeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # Function to extract text from a PDF file
 
 
16
  def extract_text_from_pdf(pdf_path):
17
  text = ""
18
  try:
19
  doc = fitz.open(pdf_path)
20
- for page_num in range(len(doc)):
21
- page = doc.load_page(page_num)
22
  text += page.get_text()
23
  except Exception as e:
24
- print(f"Error extracting text from PDF: {e}")
25
  return text
26
 
27
- # Function to extract text from a Word document
28
  def extract_text_from_docx(docx_path):
29
  text = ""
30
  try:
31
  doc = Document(docx_path)
32
  text = "\n".join([para.text for para in doc.paragraphs])
33
  except Exception as e:
34
- print(f"Error extracting text from DOCX: {e}")
35
  return text
36
 
37
- # Initialize the embedding model
38
- embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
39
-
40
- # Hugging Face API token
41
- api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
42
- if not api_token:
43
- raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
44
-
45
- # Initialize RAG models from Hugging Face
46
- generator_model_name = "facebook/bart-base"
47
- retriever_model_name = "facebook/bart-base"
48
- generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
49
- generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
50
- retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
51
- retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
52
-
53
- # Initialize the HuggingFace LLM
54
- llm = HuggingFaceEndpoint(
55
- endpoint_url="https://api-inference.huggingface.co/models/gpt2",
56
- model_kwargs={"api_key": api_token}
57
- )
58
-
59
- # Initialize the HuggingFace embeddings
60
- embedding = HuggingFaceEmbeddings()
61
-
62
- # FAISS index and storage paths
63
- index_path = "faiss_index.pkl"
64
- document_texts_path = "document_texts.pkl"
65
- document_texts = []
66
-
67
- # Load or create FAISS index using cosine similarity (Inner Product + Normalized vectors)
68
- if os.path.exists(index_path) and os.path.exists(document_texts_path):
69
- try:
70
- with open(index_path, "rb") as f:
71
- index = pickle.load(f)
72
- print("Loaded FAISS index from faiss_index.pkl")
73
- with open(document_texts_path, "rb") as f:
74
- document_texts = pickle.load(f)
75
- print("Loaded document texts from document_texts.pkl")
76
- except Exception as e:
77
- print(f"Error loading FAISS index or document texts: {e}")
78
- else:
79
- index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
80
- with open(index_path, "wb") as f:
81
- pickle.dump(index, f)
82
- print("Created new FAISS index and saved to faiss_index.pkl")
83
-
84
  def upload_files(files):
85
  global index, document_texts
86
  try:
87
  for file in files:
88
  file_path = file.name
89
- file.save(file_path) # Saving file in Hugging Face space
90
-
91
  if file_path.endswith('.pdf'):
92
  text = extract_text_from_pdf(file_path)
93
  elif file_path.endswith('.docx'):
94
  text = extract_text_from_docx(file_path)
95
  else:
96
- return "Unsupported file format"
97
 
98
- print(f"Extracted text: {text[:100]}...")
 
 
 
 
 
99
 
100
- sentences = text.split("\n")
101
- embeddings = embedding_model.encode(sentences, normalize_embeddings=True) # Cosine similarity step
102
- print(f"Embeddings shape: {embeddings.shape}")
103
  index.add(np.array(embeddings))
104
  document_texts.extend(sentences)
105
 
106
- # Save updated index and texts
107
  with open(index_path, "wb") as f:
108
  pickle.dump(index, f)
109
- print("Saved updated FAISS index to faiss_index.pkl")
110
  with open(document_texts_path, "wb") as f:
111
  pickle.dump(document_texts, f)
112
- print("Saved updated document texts to document_texts.pkl")
113
 
114
- return "Files processed successfully"
115
  except Exception as e:
116
- print(f"Error processing files: {e}")
117
- return f"Error processing files: {e}"
118
 
119
- def query_text(text):
120
  try:
121
- print(f"Query text: {text}")
122
- query_embedding = embedding_model.encode([text], normalize_embeddings=True) # Cosine similarity step
123
- print(f"Query embedding shape: {query_embedding.shape}")
124
-
125
- D, I = index.search(np.array(query_embedding), k=5)
126
- print(f"Distances: {D}, Indices: {I}")
127
 
128
- top_documents = []
 
129
  for idx in I[0]:
130
- if idx != -1 and idx < len(document_texts):
131
- top_documents.append(document_texts[idx])
132
- else:
133
- print(f"Invalid index found: {idx}")
134
-
135
- return "\n\n".join(top_documents)
136
  except Exception as e:
137
- print(f"Error querying text: {e}")
138
- return f"Error querying text: {e}"
139
 
140
- # Gradio Interface
 
 
141
  with gr.Blocks() as demo:
142
- gr.Markdown("## Document Upload and Query System with Cosine Similarity")
143
 
144
- with gr.Tab("Upload Files"):
145
- upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
146
- upload_button = gr.Button("Upload")
147
  upload_output = gr.Textbox()
148
- upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
149
 
150
- with gr.Tab("Query"):
151
- query = gr.Textbox(label="Enter your query")
152
- query_button = gr.Button("Search")
153
- query_output = gr.Textbox()
154
- query_button.click(fn=query_text, inputs=query, outputs=query_output)
 
 
155
 
156
  demo.launch()
 
1
  import os
2
+ import sys
 
 
 
 
 
3
  import pickle
4
+ import numpy as np
5
  import gradio as gr
6
  from typing import List
7
+ import fitz # PyMuPDF
8
+ from docx import Document
9
+ from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
10
+ import faiss
11
+
# =============================================
# COMPATIBILITY SHIM FOR HUGGINGFACE HUB
# =============================================
# Some sentence-transformers releases execute
# `from huggingface_hub import cached_download` at import time.  When the
# installed huggingface_hub does not export that name, install a placeholder
# so the sentence-transformers import below can still succeed.
# NOTE(review): `huggingface_hub.utils` does not appear to export
# `cached_download` either, so importing it from there would only re-raise
# the ImportError -- confirm against the pinned huggingface_hub version.
try:
    from huggingface_hub import cached_download  # noqa: F401
except ImportError:
    import huggingface_hub

    def cached_download(*args, **kwargs):
        """Stand-in for a missing ``huggingface_hub.cached_download``.

        Raises:
            RuntimeError: always; the real downloader is unavailable, so any
                caller that actually needs it must fail loudly.
        """
        raise RuntimeError(
            "huggingface_hub.cached_download is not available in the installed "
            "huggingface_hub; upgrade sentence-transformers or pin a "
            "compatible huggingface_hub release"
        )

    # Expose the placeholder under the name that dependants import.
    huggingface_hub.cached_download = cached_download
21
+
22
+ # Now we can safely import sentence-transformers
23
+ from sentence_transformers import SentenceTransformer
24
+
# =============================================
# INITIALIZE MODELS
# =============================================
# Prefer SentenceTransformer; if it cannot be loaded, fall back to the plain
# transformers model and pool token embeddings by hand (see get_embeddings).
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Failed to load SentenceTransformer, falling back to direct transformers: {e}")
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    embedding_model = AutoModel.from_pretrained(model_name)


def get_embeddings(texts):
    """Embed ``texts`` with the fallback transformers model.

    Only usable when the fallback branch above ran, because it relies on the
    module-level ``tokenizer`` created there.

    Args:
        texts: List of strings to embed.

    Returns:
        A float NumPy array with one L2-normalized row per input text, so
        that inner-product search behaves like cosine similarity (matching
        ``normalize_embeddings=True`` on the SentenceTransformer path).
    """
    import torch

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = embedding_model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Average real tokens only: padding positions must not dilute the mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    pooled = torch.nn.functional.normalize(summed / counts, p=2, dim=1)
    return pooled.cpu().numpy()
41
+
# Persistent storage for the FAISS index and the sentences it indexes.
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
document_texts = []

# 384 is the embedding dimension of all-MiniLM-L6-v2.
EMBEDDING_DIM = 384

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# This is acceptable only while both files are written exclusively by this app.
index = None
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        with open(index_path, "rb") as f:
            loaded_index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            loaded_texts = pickle.load(f)
        # Row i of the index must correspond to loaded_texts[i]; a size
        # mismatch (e.g. an interrupted save) would return wrong passages.
        if loaded_index.ntotal != len(loaded_texts):
            raise ValueError(
                f"index holds {loaded_index.ntotal} vectors but "
                f"{len(loaded_texts)} texts were stored"
            )
        index, document_texts = loaded_index, loaded_texts
    except Exception as e:
        print(f"Error loading FAISS index: {e}")

if index is None:
    # Nothing usable on disk: start with an empty inner-product index.
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
58
 
59
+ # =============================================
60
+ # DOCUMENT PROCESSING FUNCTIONS
61
+ # =============================================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Best effort: if opening or reading fails, the error is printed and
    whatever text was collected so far is returned ("" if none).

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined in page order with no extra separator.
    """
    page_texts = []
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
71
 
 
def extract_text_from_docx(docx_path):
    """Return the paragraph text of the Word document at ``docx_path``.

    Best effort: if the document cannot be opened or read, the error is
    printed and an empty string is returned.

    Args:
        docx_path: Filesystem path of the .docx file to read.

    Returns:
        The text of ``doc.paragraphs`` joined with newlines, one paragraph
        per line.
    """
    try:
        doc = Document(docx_path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""
80
 
81
+ # =============================================
82
+ # CORE FUNCTIONALITY
83
+ # =============================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def upload_files(files):
    """Extract text from uploaded PDF/DOCX files and add it to the search index.

    Each supported file is split into non-blank lines, each line is embedded
    and appended to the module-level ``index`` and ``document_texts``, and
    both are then written back to ``index_path`` / ``document_texts_path``.
    Files with any other extension, or with no extractable text, are skipped.

    Args:
        files: The value delivered by the ``gr.File`` input: a list of
            uploaded files, or ``None``/empty when nothing was selected.
            Items may be objects with a ``name`` path attribute or plain
            path strings.

    Returns:
        A human-readable status message (never raises; errors are reported
        in the returned string).
    """
    global index, document_texts
    if not files:
        return "No files uploaded"

    try:
        processed = 0
        added = 0
        for file in files:
            # Accept both file-like objects (``.name``) and plain paths.
            file_path = getattr(file, "name", file)
            lowered = file_path.lower()
            if lowered.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            else:
                continue

            sentences = [s for s in text.split("\n") if s.strip()]
            if not sentences:
                # Nothing to embed; avoid handing an empty batch to the index.
                continue

            if hasattr(embedding_model, 'encode'):
                embeddings = embedding_model.encode(sentences, normalize_embeddings=True)
            else:
                embeddings = get_embeddings(sentences)

            # FAISS expects float32 vectors.
            index.add(np.asarray(embeddings, dtype=np.float32))
            document_texts.extend(sentences)
            processed += 1
            added += len(sentences)

        if added:
            # Save updated index and texts together so they stay aligned.
            with open(index_path, "wb") as f:
                pickle.dump(index, f)
            with open(document_texts_path, "wb") as f:
                pickle.dump(document_texts, f)

        return f"Processed {processed} files, added {added} sentences"
    except Exception as e:
        return f"Error: {str(e)}"
 
115
 
def query_text(query, k=3):
    """Return the indexed sentences most similar to ``query``.

    Args:
        query: Free-text search string from the UI.
        k: Maximum number of matches to return (default 3).

    Returns:
        The matching sentences separated by ``---`` rules, ``"No matches
        found"`` when the index yields nothing usable, a prompt when the
        query is empty, or a ``"Query error: ..."`` message on failure.
    """
    if not query or not query.strip():
        return "Please enter a query"

    try:
        if hasattr(embedding_model, 'encode'):
            query_embedding = embedding_model.encode([query], normalize_embeddings=True)
        else:
            query_embedding = get_embeddings([query])

        _, neighbours = index.search(np.asarray(query_embedding, dtype=np.float32), k=k)
        # The range check drops negative ids and any id that has no stored text.
        results = [
            document_texts[idx]
            for idx in neighbours[0]
            if 0 <= idx < len(document_texts)
        ]
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"
 
132
 
133
+ # =============================================
134
+ # GRADIO INTERFACE
135
+ # =============================================
# Two-tab UI: one tab ingests documents, the other searches them.
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")

    with gr.Tab("Upload Documents"):
        # Restrict the picker to the two formats upload_files can parse.
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox(label="Status")

    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox(label="Results")

    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)
    # Pressing Enter in the query box runs the same search as the button.
    query_input.submit(query_text, inputs=query_input, outputs=results_output)

demo.launch()