getGO007 committed
Commit 01d9bfd · verified · 1 Parent(s): 0f7341d

Update app.py

Files changed (1)
  1. app.py +97 -44
app.py CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
 import gradio as gr
 from PyPDF2 import PdfReader  # pip install PyPDF2
 
-from helper import get_openai_api_key, get_llama_cloud_api_key
 from llama_parse import LlamaParse
 from llama_index.core import (
     Settings, VectorStoreIndex, StorageContext, load_index_from_storage
@@ -24,12 +23,12 @@ Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-large")
 Settings.chunk_size = 512
 Settings.chunk_overlap = 64
 
-os.environ["OPENAI_API_KEY"] = get_openai_api_key()
-os.environ["LLAMA_CLOUD_API_KEY"] = get_llama_cloud_api_key()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
 
 # ---- 2. Parser Setup ----
 parser = LlamaParse(
-    api_key = os.getenv("LLAMA_CLOUD_API_KEY"),
+    api_key = LLAMA_CLOUD_API_KEY,
     base_url = os.getenv("LLAMA_CLOUD_BASE_URL"),
     result_type = "markdown",
     content_guideline_instruction = (
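With the helper-based key loading removed, both keys now have to be present in the process environment (for example as Space secrets) before app.py is imported; otherwise os.getenv() returns None and LlamaParse is handed no key at all. A fail-fast guard near the top of the file could look roughly like this (illustrative sketch, not part of this commit):

import os
import sys

# Illustrative guard (not in the commit): abort early if either key is missing,
# rather than letting LlamaParse or OpenAI fail later with a less obvious error.
REQUIRED_KEYS = ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY")
missing = [k for k in REQUIRED_KEYS if not os.getenv(k)]
if missing:
    sys.exit(f"Missing environment variables: {', '.join(missing)}")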
@@ -40,22 +39,20 @@ parser = LlamaParse(
     verbose=True
 )
 
-# ---- 3. Core “Answer” Logic ----
+# Ensure directories exist
+Path("./user_data").mkdir(exist_ok=True)
+Path("./index_data").mkdir(exist_ok=True)
+
+# ---- 3a. Upload + Answer Logic ----
 async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
-    # Validate uploads
     if not uploaded_files:
         return "❗ Please upload at least one PDF."
     if len(uploaded_files) > 5:
         return "❗ You can upload up to 5 PDF files."
 
-    # Ensure user_data directory
-    user_dir = Path("./user_data")
-    user_dir.mkdir(exist_ok=True)
-
-    # Prepare list of QueryEngineTools
     tools = []
     for file_obj in uploaded_files:
-        # Read page count
+        # 1) Page-count check
        try:
            reader = PdfReader(file_obj.name)
        except Exception as e:
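The hunk ends inside the try/except, so the error return on the next line (unchanged by this commit) is not shown. Pulled out on its own, the per-file guard amounts to roughly the helper below; the wording of the unreadable-file message is an assumption, since that line sits outside the diff context.

from pathlib import Path
from PyPDF2 import PdfReader

def check_pdf(path: str, max_pages: int = 20) -> str | None:
    """Return an error message if the PDF is unreadable or too long, else None."""
    try:
        reader = PdfReader(path)
    except Exception as e:
        # Assumed wording: the real return line is outside the hunk shown above.
        return f"❗ Could not read {Path(path).name}: {e}"
    if len(reader.pages) > max_pages:
        return f"❗ {Path(path).name} has {len(reader.pages)} pages (>{max_pages})."
    return None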
@@ -63,35 +60,36 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
         if len(reader.pages) > 20:
             return f"❗ {Path(file_obj.name).name} has {len(reader.pages)} pages (>20)."
 
-        # Copy file to persistent location
-        dest = user_dir / Path(file_obj.name).name
-        shutil.copyfile(file_obj.name, dest)  # permanent copy
+        # 2) Copy PDF into user_data
+        dest = Path("./user_data") / Path(file_obj.name).name
+        shutil.copyfile(file_obj.name, dest)
 
-        # Parse PDF into Documents
+        # 3) Parse via LlamaParse
         docs = parser.load_data(dest)
 
-        # Index folder named after file stem
-        stem = dest.stem
+        # 4) Index folder per file stem
+        stem = dest.stem
         idx_dir = Path(f"./index_data/{stem}")
 
-        # Load or build index
+        # 5) Load or build index
         if idx_dir.exists() and any(idx_dir.iterdir()):
-            sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
+            sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
             idx = load_index_from_storage(sc)
         else:
             sc = StorageContext.from_defaults()
             idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
-            sc.persist(persist_dir=str(idx_dir))  # persist per-file index
-
-        # Create a QueryEngineTool for this index
-        qe_tool = QueryEngineTool.from_defaults(
-            query_engine=idx.as_query_engine(),
-            name=f"vector_index_{stem}",
-            description=f"Query engine for slides in {stem}.pdf"
+            sc.persist(persist_dir=str(idx_dir))
+
+        # 6) Wrap in QueryEngineTool
+        tools.append(
+            QueryEngineTool.from_defaults(
+                query_engine=idx.as_query_engine(),
+                name=f"vector_index_{stem}",
+                description=f"Query engine for {stem}.pdf"
+            )
         )
-        tools.append(qe_tool)
 
-    # Combine into SubQuestionQueryEngine + Agent
+    # 7) Combine tools into SubQuestionQueryEngine + Agent
     subq = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools)
     tools.append(
         QueryEngineTool.from_defaults(
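The load-or-build branch inside the loop is the usual llama_index persistence pattern: reload from persist_dir when the index folder already has content, otherwise build from the parsed documents and persist. Factored out, it reads roughly like this (illustrative refactor only; the commit keeps the logic inline):

from pathlib import Path
from llama_index.core import (
    StorageContext, VectorStoreIndex, load_index_from_storage
)

def get_or_build_index(docs, idx_dir: Path):
    """Reload the index persisted in idx_dir, or build and persist a new one."""
    if idx_dir.exists() and any(idx_dir.iterdir()):
        sc = StorageContext.from_defaults(persist_dir=str(idx_dir))
        return load_index_from_storage(sc)
    sc = StorageContext.from_defaults()
    idx = VectorStoreIndex.from_documents(docs, storage_context=sc)
    sc.persist(persist_dir=str(idx_dir))
    return idx

One consequence of keying the index folder on the file stem: re-uploading a different PDF under the same name still parses it, but then loads the previously persisted index instead of re-indexing the new content.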
@@ -103,27 +101,82 @@ async def answer(uploaded_files: list[gr.FileData], question: str) -> str:
     agent = FunctionAgent(tools=tools, llm=OpenAI(model="gpt-4o"))
     ctx = Context(agent)
 
-    # Run agent
-    response = await agent.run(question, ctx=ctx)
-    return str(response)
+    # 8) Run agent
+    resp = await agent.run(question, ctx=ctx)
+    return str(resp)
+
+# ---- 3b. Remove Documents Logic ----
+def remove_docs(filenames: str) -> str:
+    """
+    filenames: comma-separated list of exact PDF filenames (with .pdf)
+    Deletes each from ./user_data/ and its index folder under ./index_data/
+    """
+    if not filenames.strip():
+        return "❗ Enter at least one filename to remove."
+
+    removed, not_found = [], []
+    for name in [f.strip() for f in filenames.split(",")]:
+        pdf_path = Path("./user_data") / name
+        idx_path = Path("./index_data") / Path(name).stem
+
+        ok = True
+        if pdf_path.exists():
+            pdf_path.unlink()
+        else:
+            ok = False
+
+        if idx_path.exists():
+            shutil.rmtree(idx_path)
+        else:
+            ok = ok and False
+
+        if ok:
+            removed.append(name)
+        else:
+            not_found.append(name)
+
+    msg = ""
+    if removed:
+        msg += f"✅ Removed: {', '.join(removed)}.\n"
+    if not_found:
+        msg += f"⚠️ Not found: {', '.join(not_found)}."
+    return msg.strip()
 
 # ---- 4. Gradio UI ----
 with gr.Blocks() as demo:
     gr.Markdown("# 📄 PDF Slide Deck Q&A Bot")
-    with gr.Row():
-        file_input = gr.UploadButton(
-            "Upload up to 5 PDFs",
-            file_types=[".pdf"],
-            file_count="multiple"  # support multiple uploads
-        )
-        question = gr.Textbox(
-            lines=2,
-            placeholder="Ask your question about the uploaded slide decks..."
+
+    with gr.Tab("Ask Questions"):
+        with gr.Row():
+            file_input = gr.UploadButton(
+                "Upload up to 5 PDFs",
+                file_types=[".pdf"],
+                file_count="multiple"
+            )
+            question = gr.Textbox(
+                lines=2,
+                placeholder="Ask your question about the uploaded slide decks..."
+            )
+        output = gr.Textbox(label="Answer")
+        ask_btn = gr.Button("Ask")
+        ask_btn.click(
+            fn=answer,
+            inputs=[file_input, question],
+            outputs=output
         )
-    output = gr.Textbox(label="Answer")
-    submit = gr.Button("Ask")
 
-    submit.click(fn=answer, inputs=[file_input, question], outputs=output)
+    with gr.Tab("Remove Documents"):
+        remove_input = gr.Textbox(
+            lines=1,
+            placeholder="e.g. Q1-Slides.pdf, Q2-Slides.pdf"
+        )
+        remove_output = gr.Textbox(label="Removal Status")
+        remove_btn = gr.Button("Remove Docs")
+        remove_btn.click(
+            fn=remove_docs,
+            inputs=remove_input,
+            outputs=remove_output
+        )
 
 if __name__ == "__main__":
     demo.launch()
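The new remove_docs undoes both halves of that persistence (the copy under ./user_data and the index folder under ./index_data); note that ok = ok and False is just ok = False. It can also be exercised directly, outside the Gradio tab; the filenames below are simply the placeholders from the Remove Documents textbox:

# Illustrative direct call; assumes remove_docs from app.py is in scope and that
# the placeholder files may or may not exist under ./user_data and ./index_data.
print(remove_docs("Q1-Slides.pdf, Q2-Slides.pdf"))
# -> "✅ Removed: ..." and/or "⚠️ Not found: ...", depending on what was deleted.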
 
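Because each deck is persisted under ./index_data/<stem>, a persisted index can also be reloaded and queried outside the app, which is handy for checking what the agent sees. A sketch, assuming the API keys are set and the same llama_index OpenAI embedding integration that app.py uses; "Q1-Slides" is a placeholder stem, and the embedding model is pinned to the one app.py configures so the stored vectors are queried in the same embedding space:

from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.openai import OpenAIEmbedding

# Match the embedding model configured in app.py.
Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-large")

# "Q1-Slides" is a placeholder; use the stem of a PDF the app has already indexed.
sc = StorageContext.from_defaults(persist_dir="./index_data/Q1-Slides")
idx = load_index_from_storage(sc)
print(idx.as_query_engine().query("Summarize the key points of this deck."))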