Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
6 |
import gradio as gr
|
7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
8 |
|
9 |
-
from helper import get_openai_api_key, get_llama_cloud_api_key
|
10 |
from llama_parse import LlamaParse
|
11 |
from llama_index.core import (
|
12 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
@@ -24,12 +23,12 @@ Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-large")
|
|
24 |
Settings.chunk_size = 512
|
25 |
Settings.chunk_overlap = 64
|
26 |
|
27 |
-
os.
|
28 |
-
os.
|
29 |
|
30 |
# ---- 2. Parser Setup ----
|
31 |
parser = LlamaParse(
|
32 |
-
api_key =
|
33 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
34 |
result_type = "markdown",
|
35 |
content_guideline_instruction = (
|
@@ -40,22 +39,20 @@ parser = LlamaParse(
|
|
40 |
verbose=True
|
41 |
)
|
42 |
|
43 |
-
#
|
|
|
|
|
|
|
|
|
44 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
45 |
-
# Validate uploads
|
46 |
if not uploaded_files:
|
47 |
return "❗ Please upload at least one PDF."
|
48 |
if len(uploaded_files) > 5:
|
49 |
return "❗ You can upload up to 5 PDF files."
|
50 |
|
51 |
-
# Ensure user_data directory
|
52 |
-
user_dir = Path("./user_data")
|
53 |
-
user_dir.mkdir(exist_ok=True)
|
54 |
-
|
55 |
-
# Prepare list of QueryEngineTools
|
56 |
tools = []
|
57 |
for file_obj in uploaded_files:
|
58 |
-
#
|
59 |
try:
|
60 |
reader = PdfReader(file_obj.name)
|
61 |
except Exception as e:
|
@@ -63,35 +60,36 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
63 |
if len(reader.pages) > 20:
|
64 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
65 |
|
66 |
-
# Copy
|
67 |
-
dest =
|
68 |
-
shutil.copyfile(file_obj.name, dest)
|
69 |
|
70 |
-
# Parse
|
71 |
docs = parser.load_data(dest)
|
72 |
|
73 |
-
# Index folder
|
74 |
-
stem
|
75 |
idx_dir = Path(f"./index_data/{stem}")
|
76 |
|
77 |
-
# Load or build index
|
78 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
79 |
-
sc
|
80 |
idx = load_index_from_storage(sc)
|
81 |
else:
|
82 |
sc = StorageContext.from_defaults()
|
83 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
84 |
-
sc.persist(persist_dir=str(idx_dir))
|
85 |
-
|
86 |
-
#
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
91 |
)
|
92 |
-
tools.append(qe_tool)
|
93 |
|
94 |
-
# Combine into SubQuestionQueryEngine + Agent
|
95 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
96 |
tools.append(
|
97 |
QueryEngineTool.from_defaults(
|
@@ -103,27 +101,82 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
103 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
104 |
ctx = Context(agent)
|
105 |
|
106 |
-
# Run agent
|
107 |
-
|
108 |
-
return str(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
# ---- 4. Gradio UI ----
|
111 |
with gr.Blocks() as demo:
|
112 |
gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
)
|
123 |
-
output = gr.Textbox(label="Answer")
|
124 |
-
submit = gr.Button("Ask")
|
125 |
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
if __name__ == "__main__":
|
129 |
demo.launch()
|
|
|
6 |
import gradio as gr
|
7 |
from PyPDF2 import PdfReader # pip install PyPDF2
|
8 |
|
|
|
9 |
from llama_parse import LlamaParse
|
10 |
from llama_index.core import (
|
11 |
Settings, VectorStoreIndex, StorageContext, load_index_from_storage
|
|
|
23 |
Settings.chunk_size = 512
|
24 |
Settings.chunk_overlap = 64
|
25 |
|
26 |
+
# API keys are read from the environment — export these before launching.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
|
28 |
|
29 |
# ---- 2. Parser Setup ----
|
30 |
parser = LlamaParse(
|
31 |
+
api_key = LLAMA_CLOUD_API_KEY,
|
32 |
base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
|
33 |
result_type = "markdown",
|
34 |
content_guideline_instruction = (
|
|
|
39 |
verbose=True
|
40 |
)
|
41 |
|
42 |
+
# Create both data directories up front so later file copies and index
# persistence never hit a missing-folder error.
for _data_dir in ("./user_data", "./index_data"):
    Path(_data_dir).mkdir(exist_ok=True)
|
45 |
+
|
46 |
+
# ---- 3a. Upload + Answer Logic ----
|
47 |
async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
|
|
|
48 |
if not uploaded_files:
|
49 |
return "❗ Please upload at least one PDF."
|
50 |
if len(uploaded_files) > 5:
|
51 |
return "❗ You can upload up to 5 PDF files."
|
52 |
|
|
|
|
|
|
|
|
|
|
|
53 |
tools = []
|
54 |
for file_obj in uploaded_files:
|
55 |
+
# 1) Page-count check
|
56 |
try:
|
57 |
reader = PdfReader(file_obj.name)
|
58 |
except Exception as e:
|
|
|
60 |
if len(reader.pages) > 20:
|
61 |
return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
|
62 |
|
63 |
+
# 2) Copy PDF into user_data
|
64 |
+
dest = Path("./user_data") / Path(file_obj.name).name
|
65 |
+
shutil.copyfile(file_obj.name, dest)
|
66 |
|
67 |
+
# 3) Parse via LlamaParse
|
68 |
docs = parser.load_data(dest)
|
69 |
|
70 |
+
# 4) Index folder per file stem
|
71 |
+
stem = dest.stem
|
72 |
idx_dir = Path(f"./index_data/{stem}")
|
73 |
|
74 |
+
# 5) Load or build index
|
75 |
if idx_dir.exists() and any(idx_dir.iterdir()):
|
76 |
+
sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
|
77 |
idx = load_index_from_storage(sc)
|
78 |
else:
|
79 |
sc = StorageContext.from_defaults()
|
80 |
idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
|
81 |
+
sc.persist(persist_dir=str(idx_dir))
|
82 |
+
|
83 |
+
# 6) Wrap in QueryEngineTool
|
84 |
+
tools.append(
|
85 |
+
QueryEngineTool.from_defaults(
|
86 |
+
query_engine=idx.as_query_engine(),
|
87 |
+
name=f"vector_index_{stem}",
|
88 |
+
description=f"Query engine for {stem}.pdf"
|
89 |
+
)
|
90 |
)
|
|
|
91 |
|
92 |
+
# 7) Combine tools into SubQuestionQueryEngine + Agent
|
93 |
subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
|
94 |
tools.append(
|
95 |
QueryEngineTool.from_defaults(
|
|
|
101 |
agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
|
102 |
ctx = Context(agent)
|
103 |
|
104 |
+
# 8) Run agent
|
105 |
+
resp = await agent.run(question, ctx=ctx)
|
106 |
+
return str(resp)
|
107 |
+
|
108 |
+
# ---- 3b. Remove Documents Logic ----
def remove_docs(filenames: str) -> str:
    """Remove previously uploaded PDFs and their cached vector indexes.

    Args:
        filenames: Comma-separated list of exact PDF filenames including
            the ``.pdf`` extension, e.g. ``"Q1-Slides.pdf, Q2-Slides.pdf"``.

    Returns:
        A status message naming which files were removed and which were
        not found. A name counts as "removed" only when both its PDF under
        ``./user_data/`` and its index folder under ``./index_data/``
        existed and were deleted.
    """
    if not filenames.strip():
        return "❗ Enter at least one filename to remove."

    removed, not_found = [], []
    # Skip empty segments (e.g. from a trailing or doubled comma): an empty
    # name would make pdf_path resolve to ./user_data itself, whose
    # .exists() is True, and unlink() would then raise on a directory.
    for name in [f.strip() for f in filenames.split(",") if f.strip()]:
        pdf_path = Path("./user_data") / name
        idx_path = Path("./index_data") / Path(name).stem

        ok = True
        if pdf_path.exists():
            pdf_path.unlink()
        else:
            ok = False

        if idx_path.exists():
            shutil.rmtree(idx_path)
        else:
            # Original wrote `ok = ok and False`, which is always False —
            # simplified without changing the "both must exist" semantics.
            ok = False

        if ok:
            removed.append(name)
        else:
            not_found.append(name)

    msg = ""
    if removed:
        msg += f"✅ Removed: {', '.join(removed)}.\n"
    if not_found:
        msg += f"⚠️ Not found: {', '.join(not_found)}."
    return msg.strip()
|
144 |
|
145 |
# ---- 4. Gradio UI ----
# Two-tab Blocks app: one tab uploads PDFs and asks a question (handled by
# the async answer() callback), one tab deletes uploaded docs via remove_docs().
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")

    with gr.Tab("Ask Questions"):
        with gr.Row():
            # Multiple-file upload; the 5-file limit is enforced inside
            # answer(), not by the widget itself.
            file_input = gr.UploadButton(
                "Upload up to 5 PDFs",
                file_types=[".pdf"],
                file_count="multiple"
            )
        # NOTE(review): original indentation was lost in this paste — the
        # question box may have been inside the Row above; confirm layout.
        question = gr.Textbox(
            lines=2,
            placeholder="Ask your question about the uploaded slide decks..."
        )
        output = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Ask")
        # answer is a coroutine function; Gradio awaits async callbacks.
        ask_btn.click(
            fn=answer,
            inputs=[file_input, question],
            outputs=output
        )

    with gr.Tab("Remove Documents"):
        # Comma-separated exact filenames, matching what remove_docs() parses.
        remove_input = gr.Textbox(
            lines=1,
            placeholder="e.g. Q1-Slides.pdf, Q2-Slides.pdf"
        )
        remove_output = gr.Textbox(label="Removal Status")
        remove_btn = gr.Button("Remove Docs")
        remove_btn.click(
            fn=remove_docs,
            inputs=remove_input,
            outputs=remove_output
        )

if __name__ == "__main__":
    demo.launch()
|