Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import os
|
2 |
import pickle
|
3 |
import numpy as np
|
4 |
import gradio as gr
|
@@ -25,9 +25,10 @@ def get_embeddings(texts, is_query=False):
|
|
25 |
with torch.no_grad():
|
26 |
model_output = embedding_model(**inputs)
|
27 |
|
28 |
-
embeddings = model_output.last_hidden_state[:, 0] # CLS token
|
29 |
return embeddings.cpu().numpy()
|
30 |
|
|
|
31 |
# ===============================
|
32 |
# TEXT CHUNKING
|
33 |
# ===============================
|
@@ -46,7 +47,7 @@ def chunk_text(text, chunk_size=800, overlap=100):
|
|
46 |
index_path = "faiss_index.pkl"
|
47 |
document_texts_path = "document_texts.pkl"
|
48 |
document_texts = []
|
49 |
-
embedding_dim =
|
50 |
|
51 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
52 |
try:
|
@@ -87,21 +88,15 @@ def extract_text_from_docx(path):
|
|
87 |
# ===============================
|
88 |
def upload_document(file):
|
89 |
ext = os.path.splitext(file.name)[-1].lower()
|
90 |
-
|
91 |
-
# Save uploaded file temporarily
|
92 |
-
temp_path = f"temp_upload{ext}"
|
93 |
-
with open(temp_path, "wb") as f:
|
94 |
-
f.write(file.read())
|
95 |
-
|
96 |
if ext == ".pdf":
|
97 |
-
text = extract_text_from_pdf(
|
98 |
elif ext == ".docx":
|
99 |
-
text = extract_text_from_docx(
|
100 |
else:
|
101 |
return "Unsupported file type."
|
102 |
|
103 |
chunks = chunk_text(text)
|
104 |
-
chunk_embeddings = get_embeddings(chunks
|
105 |
index.add(np.array(chunk_embeddings).astype('float32'))
|
106 |
document_texts.extend(chunks)
|
107 |
|
@@ -110,12 +105,8 @@ def upload_document(file):
|
|
110 |
with open(document_texts_path, "wb") as f:
|
111 |
pickle.dump(document_texts, f)
|
112 |
|
113 |
-
# Remove the temporary file after processing (optional)
|
114 |
-
os.remove(temp_path)
|
115 |
-
|
116 |
return "Document uploaded and indexed successfully."
|
117 |
|
118 |
-
|
119 |
# ===============================
|
120 |
# GENERATION PIPELINE (FLAN-T5)
|
121 |
# ===============================
|
@@ -125,7 +116,7 @@ def generate_answer_from_file(query, top_k=10):
|
|
125 |
if not document_texts:
|
126 |
return "No documents indexed yet."
|
127 |
|
128 |
-
query_vector = get_embeddings(query
|
129 |
scores, indices = index.search(query_vector, k=top_k)
|
130 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
131 |
context = "\n\n".join(retrieved_chunks)
|
@@ -170,3 +161,5 @@ search_interface = gr.Interface(
|
|
170 |
|
171 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|
172 |
app.launch()
|
|
|
|
|
|
1 |
+
# NOTE(review): the chat question below was accidentally committed as line 1 of
# app.py. As bare prose it is a SyntaxError — the likely cause of this Space's
# "Runtime error". Kept here as a comment so the import is valid again.
# Now explain how this code works — I want to understand it deeply.
import os
|
2 |
import pickle
|
3 |
import numpy as np
|
4 |
import gradio as gr
|
|
|
25 |
with torch.no_grad():
|
26 |
model_output = embedding_model(**inputs)
|
27 |
|
28 |
+
embeddings = model_output.last_hidden_state[:, 0] # CLS token
|
29 |
return embeddings.cpu().numpy()
|
30 |
|
31 |
+
|
32 |
# ===============================
|
33 |
# TEXT CHUNKING
|
34 |
# ===============================
|
|
|
47 |
index_path = "faiss_index.pkl"
|
48 |
document_texts_path = "document_texts.pkl"
|
49 |
document_texts = []
|
50 |
+
embedding_dim = 384
|
51 |
|
52 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
53 |
try:
|
|
|
88 |
# ===============================
|
89 |
def upload_document(file):
|
90 |
ext = os.path.splitext(file.name)[-1].lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
if ext == ".pdf":
|
92 |
+
text = extract_text_from_pdf(file.name)
|
93 |
elif ext == ".docx":
|
94 |
+
text = extract_text_from_docx(file.name)
|
95 |
else:
|
96 |
return "Unsupported file type."
|
97 |
|
98 |
chunks = chunk_text(text)
|
99 |
+
chunk_embeddings = get_embeddings(chunks)
|
100 |
index.add(np.array(chunk_embeddings).astype('float32'))
|
101 |
document_texts.extend(chunks)
|
102 |
|
|
|
105 |
with open(document_texts_path, "wb") as f:
|
106 |
pickle.dump(document_texts, f)
|
107 |
|
|
|
|
|
|
|
108 |
return "Document uploaded and indexed successfully."
|
109 |
|
|
|
110 |
# ===============================
|
111 |
# GENERATION PIPELINE (FLAN-T5)
|
112 |
# ===============================
|
|
|
116 |
if not document_texts:
|
117 |
return "No documents indexed yet."
|
118 |
|
119 |
+
query_vector = get_embeddings(query).astype("float32")
|
120 |
scores, indices = index.search(query_vector, k=top_k)
|
121 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
122 |
context = "\n\n".join(retrieved_chunks)
|
|
|
161 |
|
162 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|
163 |
app.launch()
|
164 |
+
|
165 |
+
|