Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ from transformers import AutoModel, AutoTokenizer, pipeline
|
|
8 |
import faiss
|
9 |
import torch
|
10 |
|
11 |
-
#
|
12 |
-
# EMBEDDING MODEL
|
13 |
-
#
|
14 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
embedding_model = AutoModel.from_pretrained(model_name)
|
@@ -23,9 +23,9 @@ def get_embeddings(texts):
|
|
23 |
outputs = embedding_model(**inputs)
|
24 |
return outputs.last_hidden_state[:, 0].cpu().numpy()
|
25 |
|
26 |
-
#
|
27 |
# TEXT CHUNKING
|
28 |
-
#
|
29 |
def chunk_text(text, chunk_size=500, overlap=50):
|
30 |
chunks = []
|
31 |
start = 0
|
@@ -35,14 +35,14 @@ def chunk_text(text, chunk_size=500, overlap=50):
|
|
35 |
start += chunk_size - overlap
|
36 |
return chunks
|
37 |
|
38 |
-
#
|
39 |
# FAISS INDEX SETUP
|
40 |
-
#
|
41 |
index_path = "faiss_index.pkl"
|
42 |
document_texts_path = "document_texts.pkl"
|
43 |
document_texts = []
|
|
|
44 |
|
45 |
-
embedding_dim = 384 # for all-MiniLM-L6-v2
|
46 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
47 |
try:
|
48 |
with open(index_path, "rb") as f:
|
@@ -55,9 +55,9 @@ if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
|
55 |
else:
|
56 |
index = faiss.IndexFlatIP(embedding_dim)
|
57 |
|
58 |
-
#
|
59 |
-
#
|
60 |
-
#
|
61 |
def extract_text_from_pdf(path):
|
62 |
text = ""
|
63 |
try:
|
@@ -77,9 +77,9 @@ def extract_text_from_docx(path):
|
|
77 |
print(f"DOCX error: {e}")
|
78 |
return text
|
79 |
|
80 |
-
#
|
81 |
-
# UPLOAD
|
82 |
-
#
|
83 |
def upload_document(file):
|
84 |
ext = os.path.splitext(file.name)[-1].lower()
|
85 |
if ext == ".pdf":
|
@@ -101,12 +101,12 @@ def upload_document(file):
|
|
101 |
|
102 |
return "Document uploaded and indexed successfully."
|
103 |
|
104 |
-
#
|
105 |
-
#
|
106 |
-
#
|
107 |
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
|
108 |
|
109 |
-
def generate_answer_from_file(query, top_k=3):
|
110 |
if not document_texts:
|
111 |
return "No documents indexed yet."
|
112 |
|
@@ -115,27 +115,33 @@ def generate_answer_from_file(query, top_k=3):
|
|
115 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
116 |
context = " ".join(retrieved_chunks)
|
117 |
|
118 |
-
prompt =
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
121 |
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
upload_interface = gr.Interface(
|
126 |
fn=upload_document,
|
127 |
inputs=gr.File(file_types=[".pdf", ".docx"]),
|
128 |
outputs="text",
|
129 |
title="Upload Document",
|
130 |
-
description="Upload
|
131 |
)
|
132 |
|
133 |
search_interface = gr.Interface(
|
134 |
fn=generate_answer_from_file,
|
135 |
-
inputs=gr.Textbox(placeholder="Ask
|
136 |
outputs="text",
|
137 |
-
title="Ask
|
138 |
-
description="Ask
|
139 |
)
|
140 |
|
141 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|
|
|
8 |
import faiss
|
9 |
import torch
|
10 |
|
11 |
+
# ===============================
|
12 |
+
# EMBEDDING MODEL
|
13 |
+
# ===============================
|
14 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
15 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
16 |
embedding_model = AutoModel.from_pretrained(model_name)
|
|
|
23 |
outputs = embedding_model(**inputs)
|
24 |
return outputs.last_hidden_state[:, 0].cpu().numpy()
|
25 |
|
26 |
+
# ===============================
|
27 |
# TEXT CHUNKING
|
28 |
+
# ===============================
|
29 |
def chunk_text(text, chunk_size=500, overlap=50):
|
30 |
chunks = []
|
31 |
start = 0
|
|
|
35 |
start += chunk_size - overlap
|
36 |
return chunks
|
37 |
|
38 |
+
# ===============================
|
39 |
# FAISS INDEX SETUP
|
40 |
+
# ===============================
|
41 |
index_path = "faiss_index.pkl"
|
42 |
document_texts_path = "document_texts.pkl"
|
43 |
document_texts = []
|
44 |
+
embedding_dim = 384
|
45 |
|
|
|
46 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
47 |
try:
|
48 |
with open(index_path, "rb") as f:
|
|
|
55 |
else:
|
56 |
index = faiss.IndexFlatIP(embedding_dim)
|
57 |
|
58 |
+
# ===============================
|
59 |
+
# FILE EXTRACTORS
|
60 |
+
# ===============================
|
61 |
def extract_text_from_pdf(path):
|
62 |
text = ""
|
63 |
try:
|
|
|
77 |
print(f"DOCX error: {e}")
|
78 |
return text
|
79 |
|
80 |
+
# ===============================
|
81 |
+
# UPLOAD HANDLER
|
82 |
+
# ===============================
|
83 |
def upload_document(file):
|
84 |
ext = os.path.splitext(file.name)[-1].lower()
|
85 |
if ext == ".pdf":
|
|
|
101 |
|
102 |
return "Document uploaded and indexed successfully."
|
103 |
|
104 |
+
# ===============================
|
105 |
+
# GENERATION PIPELINE (FLAN-T5)
|
106 |
+
# ===============================
|
107 |
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
|
108 |
|
109 |
+
def generate_answer_from_file(query, top_k=5):
|
110 |
if not document_texts:
|
111 |
return "No documents indexed yet."
|
112 |
|
|
|
115 |
retrieved_chunks = [document_texts[i] for i in indices[0]]
|
116 |
context = " ".join(retrieved_chunks)
|
117 |
|
118 |
+
prompt = (
|
119 |
+
f"Use the following context from a textbook or academic document to answer the question accurately and in detail.\n\n"
|
120 |
+
f"Context:\n{context}\n\n"
|
121 |
+
f"Question: {query}\n\n"
|
122 |
+
f"Answer:"
|
123 |
+
)
|
124 |
|
125 |
+
result = qa_pipeline(prompt, max_length=512, do_sample=False)[0]['generated_text']
|
126 |
+
return result.strip()
|
127 |
+
|
128 |
+
# ===============================
|
129 |
+
# GRADIO INTERFACES
|
130 |
+
# ===============================
|
131 |
upload_interface = gr.Interface(
|
132 |
fn=upload_document,
|
133 |
inputs=gr.File(file_types=[".pdf", ".docx"]),
|
134 |
outputs="text",
|
135 |
title="Upload Document",
|
136 |
+
description="Upload your Word or PDF document for question answering."
|
137 |
)
|
138 |
|
139 |
search_interface = gr.Interface(
|
140 |
fn=generate_answer_from_file,
|
141 |
+
inputs=gr.Textbox(placeholder="Ask your question about the uploaded document..."),
|
142 |
outputs="text",
|
143 |
+
title="Ask the Document",
|
144 |
+
description="Ask questions about the uploaded content. The chatbot will answer based on the document."
|
145 |
)
|
146 |
|
147 |
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload", "Ask"])
|