NaimaAqeel committed · commit 145a282 (verified) · parent: dc15ddb

Update app.py

Files changed (1):
  1. app.py +3 -73
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import sys
 import pickle
 import numpy as np
 import gradio as gr
@@ -7,9 +6,10 @@ import fitz # PyMuPDF
 from docx import Document
 from transformers import AutoModel, AutoTokenizer
 import faiss
+import torch
 
 # =============================================
-# EMBEDDING MODEL SETUP (NO sentence-transformers dependency)
+# EMBEDDING MODEL SETUP
 # =============================================
 model_name = "sentence-transformers/all-MiniLM-L6-v2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
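
For context: get_embeddings is defined in the unchanged part of app.py between these two hunks, so its body is not shown here. Given the AutoModel/AutoTokenizer setup above and the new import torch, it presumably tokenizes its input and mean-pools the last hidden state, roughly as in this sketch (the function name and model setup come from the diff; the body is an assumption):

# Sketch only: get_embeddings is not shown in this commit, so the body below
# is an assumption based on the model setup and the torch import.
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    # Accept either a single query string or a list of sentences.
    if isinstance(texts, str):
        texts = [texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool token embeddings, weighting by the attention mask.
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).cpu().numpy().astype("float32")

Returning a 2-D float32 array would satisfy both call sites in the hunk below: index.add(embeddings) and index.search(query_embedding, k=3) each expect an (n, d) float32 matrix.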
@@ -62,74 +62,4 @@ def extract_text_from_docx(docx_path):
         doc = Document(docx_path)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
-        print(f"DOCX error: {e}")
-    return text
-
-# =============================================
-# CORE FUNCTIONALITY
-# =============================================
-def upload_files(files):
-    global index, document_texts
-    try:
-        for file in files:
-            file_path = file.name
-            if file_path.endswith('.pdf'):
-                text = extract_text_from_pdf(file_path)
-            elif file_path.endswith('.docx'):
-                text = extract_text_from_docx(file_path)
-            else:
-                continue
-
-            sentences = [s.strip() for s in text.split("\n") if s.strip()]
-            if not sentences:
-                continue
-
-            embeddings = get_embeddings(sentences)
-            index.add(embeddings)
-            document_texts.extend(sentences)
-
-        # Save updated index
-        with open(index_path, "wb") as f:
-            pickle.dump(index, f)
-        with open(document_texts_path, "wb") as f:
-            pickle.dump(document_texts, f)
-
-        return f"Processed {len(files)} files, added {len(sentences)} sentences"
-    except Exception as e:
-        return f"Error: {str(e)}"
-
-def query_text(query):
-    try:
-        query_embedding = get_embeddings(query)
-        D, I = index.search(query_embedding, k=3)
-
-        results = []
-        for idx in I[0]:
-            if 0 <= idx < len(document_texts):
-                results.append(document_texts[idx])
-
-        return "\n\n---\n\n".join(results) if results else "No matches found"
-    except Exception as e:
-        return f"Query error: {str(e)}"
-
-# =============================================
-# GRADIO INTERFACE
-# =============================================
-with gr.Blocks() as demo:
-    gr.Markdown("## Document Search with Semantic Similarity")
-
-    with gr.Tab("Upload Documents"):
-        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
-        upload_btn = gr.Button("Process Files")
-        upload_output = gr.Textbox()
-
-    with gr.Tab("Search"):
-        query_input = gr.Textbox(label="Enter your query")
-        search_btn = gr.Button("Search")
-        results_output = gr.Textbox()
-
-    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
-    search_btn.click(query_text, inputs=query_input, outputs=results_output)
-
-if __name__ == "__main__":
-    demo.launch()
+        print
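
Likewise, index, index_path, and document_texts_path, used by the removed upload_files, are set up in the unchanged part of app.py. A plausible sketch of that setup, assuming a flat L2 index over all-MiniLM-L6-v2's 384-dimensional embeddings and the same pickle-based persistence the removed code used (the file-name strings here are hypothetical):

# Sketch only: the real setup is outside the diff. The variable names come
# from the removed code; the file-name strings are assumptions.
import os
import pickle
import faiss

index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"

if os.path.exists(index_path) and os.path.exists(document_texts_path):
    # Reload a previously pickled index and its parallel list of sentences.
    with open(index_path, "rb") as f:
        index = pickle.load(f)
    with open(document_texts_path, "rb") as f:
        document_texts = pickle.load(f)
else:
    index = faiss.IndexFlatL2(384)  # all-MiniLM-L6-v2 embeds into 384 dims
    document_texts = []

An IndexFlatL2 needs no training before add, and the D values returned to the removed query_text are L2 distances, so smaller means closer.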
 