Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
|
|
4 |
import gradio as gr
|
5 |
import fitz # PyMuPDF
|
6 |
from docx import Document
|
7 |
-
from transformers import AutoModel, AutoTokenizer
|
8 |
import faiss
|
9 |
import torch
|
10 |
|
@@ -24,13 +24,25 @@ def get_embeddings(texts):
|
|
24 |
return outputs.last_hidden_state[:, 0].cpu().numpy()
|
25 |
|
26 |
# =============================================
|
27 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# =============================================
|
29 |
index_path = "faiss_index.pkl"
|
30 |
document_texts_path = "document_texts.pkl"
|
31 |
document_texts = []
|
32 |
|
33 |
-
embedding_dim = 384 #
|
34 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
35 |
try:
|
36 |
with open(index_path, "rb") as f:
|
@@ -44,29 +56,29 @@ else:
|
|
44 |
index = faiss.IndexFlatIP(embedding_dim)
|
45 |
|
46 |
# =============================================
|
47 |
-
# DOCUMENT PROCESSING
|
48 |
# =============================================
|
49 |
-
def extract_text_from_pdf(
|
50 |
text = ""
|
51 |
try:
|
52 |
-
doc = fitz.open(
|
53 |
for page in doc:
|
54 |
text += page.get_text()
|
55 |
except Exception as e:
|
56 |
print(f"PDF error: {e}")
|
57 |
return text
|
58 |
|
59 |
-
def extract_text_from_docx(
|
60 |
text = ""
|
61 |
try:
|
62 |
-
doc = Document(
|
63 |
text = "\n".join([para.text for para in doc.paragraphs])
|
64 |
except Exception as e:
|
65 |
print(f"DOCX error: {e}")
|
66 |
return text
|
67 |
|
68 |
# =============================================
|
69 |
-
#
|
70 |
# =============================================
|
71 |
def upload_document(file):
|
72 |
ext = os.path.splitext(file.name)[-1].lower()
|
@@ -75,53 +87,56 @@ def upload_document(file):
|
|
75 |
elif ext == ".docx":
|
76 |
text = extract_text_from_docx(file.name)
|
77 |
else:
|
78 |
-
return "Unsupported file type"
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
83 |
|
84 |
-
# Save updated index and texts
|
85 |
with open(index_path, "wb") as f:
|
86 |
pickle.dump(index, f)
|
87 |
with open(document_texts_path, "wb") as f:
|
88 |
pickle.dump(document_texts, f)
|
89 |
|
90 |
-
return "Document uploaded and indexed successfully
|
91 |
|
92 |
# =============================================
|
93 |
-
#
|
94 |
# =============================================
|
95 |
-
|
|
|
|
|
96 |
if not document_texts:
|
97 |
return "No documents indexed yet."
|
98 |
|
99 |
-
query_vector = get_embeddings(query)
|
100 |
-
scores, indices = index.search(query_vector, k=
|
101 |
-
|
|
|
102 |
|
103 |
-
|
|
|
|
|
104 |
|
105 |
# =============================================
|
106 |
-
# GRADIO
|
107 |
# =============================================
|
108 |
upload_interface = gr.Interface(
|
109 |
fn=upload_document,
|
110 |
inputs=gr.File(file_types=[".pdf", ".docx"]),
|
111 |
outputs="text",
|
112 |
-
title="Upload
|
113 |
-
description="Upload a
|
114 |
)
|
115 |
|
116 |
search_interface = gr.Interface(
|
117 |
-
fn=
|
118 |
-
inputs=gr.Textbox(placeholder="
|
119 |
-
outputs="
|
120 |
-
title="
|
121 |
-
description="
|
122 |
)
|
123 |
|
124 |
-
app = gr.TabbedInterface([upload_interface, search_interface], ["Upload
|
125 |
-
|
126 |
-
if __name__ == "__main__":
|
127 |
-
app.launch()
|
|
|
4 |
import gradio as gr
|
5 |
import fitz # PyMuPDF
|
6 |
from docx import Document
|
7 |
+
from transformers import AutoModel, AutoTokenizer, pipeline
|
8 |
import faiss
|
9 |
import torch
|
10 |
|
|
|
24 |
return outputs.last_hidden_state[:, 0].cpu().numpy()
|
25 |
|
26 |
# =============================================
|
27 |
+
# TEXT CHUNKING
|
28 |
+
# =============================================
|
29 |
+
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: String to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive stride would make the loop never
            terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    stride = chunk_size - overlap
    # Slicing clamps at the end of the string, so no explicit min() is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]
|
37 |
+
|
38 |
+
# =============================================
|
39 |
+
# FAISS INDEX SETUP
|
40 |
# =============================================
|
41 |
index_path = "faiss_index.pkl"
|
42 |
document_texts_path = "document_texts.pkl"
|
43 |
document_texts = []
|
44 |
|
45 |
+
embedding_dim = 384 # for all-MiniLM-L6-v2
|
46 |
if os.path.exists(index_path) and os.path.exists(document_texts_path):
|
47 |
try:
|
48 |
with open(index_path, "rb") as f:
|
|
|
56 |
index = faiss.IndexFlatIP(embedding_dim)
|
57 |
|
58 |
# =============================================
|
59 |
+
# DOCUMENT PROCESSING
|
60 |
# =============================================
|
61 |
+
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at ``path``.

    Extraction is best-effort: any error raised while opening or reading the
    file is printed and whatever text was collected so far (possibly the
    empty string) is returned instead of propagating the exception.

    Args:
        path: Filesystem path of the PDF, passed to ``fitz.open``.

    Returns:
        The page texts joined in page order.
    """
    page_texts = []
    try:
        # The context manager closes the document even when a page fails to
        # parse; previously the handle was left open.
        with fitz.open(path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)
|
70 |
|
71 |
+
def extract_text_from_docx(path):
    """Return the paragraph text of the Word document at ``path``.

    Paragraphs are joined with newlines. Extraction is best-effort: any
    error raised while opening or reading the file is printed and an empty
    string is returned.

    Args:
        path: Filesystem path of the ``.docx`` file, passed to ``Document``.

    Returns:
        The newline-joined paragraph texts, or ``""`` on failure.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""
|
79 |
|
80 |
# =============================================
|
81 |
+
# UPLOAD AND INDEX FILE
|
82 |
# =============================================
|
83 |
def upload_document(file):
|
84 |
ext = os.path.splitext(file.name)[-1].lower()
|
|
|
87 |
elif ext == ".docx":
|
88 |
text = extract_text_from_docx(file.name)
|
89 |
else:
|
90 |
+
return "Unsupported file type."
|
91 |
|
92 |
+
chunks = chunk_text(text)
|
93 |
+
chunk_embeddings = get_embeddings(chunks)
|
94 |
+
index.add(np.array(chunk_embeddings).astype('float32'))
|
95 |
+
document_texts.extend(chunks)
|
96 |
|
|
|
97 |
with open(index_path, "wb") as f:
|
98 |
pickle.dump(index, f)
|
99 |
with open(document_texts_path, "wb") as f:
|
100 |
pickle.dump(document_texts, f)
|
101 |
|
102 |
+
return "Document uploaded and indexed successfully."
|
103 |
|
104 |
# =============================================
|
105 |
+
# QA PIPELINE WITH FLAN-T5
|
106 |
# =============================================
|
107 |
+
# Text-to-text generation model used to phrase the final answer; loaded once
# at import time.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")


def generate_answer_from_file(query, top_k=3):
    """Answer ``query`` using the indexed chunks most similar to it.

    Embeds the query, looks up the ``top_k`` nearest chunks in the
    module-level FAISS ``index``, joins them into a context string and asks
    ``qa_pipeline`` to generate an answer from that context.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The generated answer text, or ``"No documents indexed yet."`` when
        there is nothing to retrieve.
    """
    if not document_texts:
        return "No documents indexed yet."

    # Embed as a one-item batch, matching how upload_document calls
    # get_embeddings with a list of strings (assumes it accepts a list).
    query_vector = get_embeddings([query]).astype("float32")
    _, indices = index.search(query_vector, k=top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; indexing document_texts with -1 would silently pull in the
    # last chunk, so keep only valid positions.
    retrieved_chunks = [
        document_texts[i] for i in indices[0] if 0 <= i < len(document_texts)
    ]
    if not retrieved_chunks:
        return "No documents indexed yet."
    context = " ".join(retrieved_chunks)

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    result = qa_pipeline(prompt, max_length=200)[0]['generated_text']
    return result
|
121 |
|
122 |
# =============================================
|
123 |
+
# GRADIO UI
|
124 |
# =============================================
|
125 |
# Tab 1: accepts a PDF or DOCX file and hands it to upload_document.
upload_interface = gr.Interface(
    title="Upload Document",
    description="Upload a Word or PDF file to index it for question answering.",
    fn=upload_document,
    inputs=gr.File(file_types=[".pdf", ".docx"]),
    outputs="text",
)

# Tab 2: takes a free-text question and shows generate_answer_from_file's reply.
search_interface = gr.Interface(
    title="Ask Your Document",
    description="Ask any question. The chatbot will read the document and answer like ChatGPT.",
    fn=generate_answer_from_file,
    inputs=gr.Textbox(placeholder="Ask a question about the uploaded document..."),
    outputs="text",
)

# Combine both interfaces into one tabbed app and start serving it.
app = gr.TabbedInterface(
    interface_list=[upload_interface, search_interface],
    tab_names=["Upload", "Ask"],
)
app.launch()
|
|
|
|