adibak committed on
Commit
dade368
·
1 Parent(s): b4b5aad

move file input to sidebar and set page range max to pdf length

Browse files
Files changed (2) hide show
  1. app.py +43 -13
  2. helpers/file_manager.py +9 -4
app.py CHANGED
@@ -221,12 +221,43 @@ with st.sidebar:
221
  ),
222
  value='2024-05-01-preview',
223
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
- page_range_slider = st.slider(label=('4: Specify a page range to examine:\n\n'
226
- '(min=1, max=50)'),
227
- min_value=1, max_value=50,
228
- value=(1, 50))
229
- st.session_state['page_range'] = page_range_slider
230
 
231
 
232
  def build_ui():
@@ -283,18 +314,17 @@ def set_up_chat_ui():
283
  if prompt := st.chat_input(
284
  placeholder=APP_TEXT['chat_placeholder'],
285
  max_chars=GlobalConfig.LLM_MODEL_MAX_INPUT_LENGTH,
286
- accept_file=True,
287
  file_type=['pdf', ],
288
  ):
289
- prompt_text = prompt.text or ''
290
- if prompt['files']:
291
- # Apparently, Streamlit stores uploaded files in memory and clears on browser close
292
- # https://docs.streamlit.io/knowledge-base/using-streamlit/where-file-uploader-store-when-deleted
293
- page_range = st.session_state.get('page_range', (1, 50)) # fallback default
294
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
295
- prompt['files'][0], page_range
 
296
  )
297
- print(f'{prompt["files"]=}')
298
 
299
  provider, llm_name = llm_helper.get_provider_model(
300
  llm_provider_to_use,
 
221
  ),
222
  value='2024-05-01-preview',
223
  )
224
+
225
+ from pypdf import PdfReader
226
+
227
+ uploaded_pdf = st.file_uploader("1: Upload a PDF file", type=["pdf"])
228
+
229
+ # Detect file change: update session state
230
+ if uploaded_pdf:
231
+ # Unique hash for reset logic
232
+ new_file_hash = hash(uploaded_pdf.getvalue())
233
+
234
+ # If file is newly uploaded or changed
235
+ if st.session_state.get("current_pdf_hash") != new_file_hash:
236
+ reader = PdfReader(uploaded_pdf)
237
+ total_pages = len(reader.pages)
238
+
239
+ st.session_state["pdf_page_count"] = total_pages
240
+ st.session_state["current_pdf_hash"] = new_file_hash
241
+
242
+ # Force slider reset
243
+ st.session_state.pop("page_range", None)
244
+
245
+ # Set default page count if no file uploaded
246
+ page_count = st.session_state.get("pdf_page_count", 50)
247
+ max_slider = min(50, page_count)
248
+
249
+ # Show slider only after file upload
250
+ if "pdf_page_count" in st.session_state:
251
+ page_range_slider = st.slider(
252
+ label="2: Specify a page range to examine:",
253
+ min_value=1,
254
+ max_value=max_slider,
255
+ value=(1, max_slider),
256
+ key="page_range" # persistent + resettable
257
+ )
258
+ else:
259
+ st.info("📄 Upload a PDF to specify a page range.")
260
 
 
 
 
 
 
261
 
262
 
263
  def build_ui():
 
314
  if prompt := st.chat_input(
315
  placeholder=APP_TEXT['chat_placeholder'],
316
  max_chars=GlobalConfig.LLM_MODEL_MAX_INPUT_LENGTH,
317
+ accept_file=False,
318
  file_type=['pdf', ],
319
  ):
320
+ logger.info(f"type {type(prompt)}")
321
+ prompt_text = prompt
322
+
323
+ if uploaded_pdf and "page_range" in st.session_state:
 
324
  st.session_state[ADDITIONAL_INFO] = filem.get_pdf_contents(
325
+ uploaded_pdf,
326
+ st.session_state["page_range"]
327
  )
 
328
 
329
  provider, llm_name = llm_helper.get_provider_model(
330
  llm_provider_to_use,
helpers/file_manager.py CHANGED
@@ -26,19 +26,24 @@ def get_pdf_contents(
26
  Extract the text contents from a PDF file.
27
 
28
  :param pdf_file: The uploaded PDF file.
 
29
  :param max_pages: The max no. of pages to extract contents from.
30
  :return: The contents.
31
  """
32
 
33
  reader = PdfReader(pdf_file)
34
 
 
35
  total_pages = len(reader.pages)
36
- n_pages = min(max_pages, total_pages)
37
 
38
- start, end = page_range
39
- start = max(1, start)
40
- end = min(n_pages, end)
 
41
 
 
 
42
  text = ''
43
  for page_num in range(start - 1, end):
44
  page = reader.pages[page_num]
 
26
  Extract the text contents from a PDF file.
27
 
28
  :param pdf_file: The uploaded PDF file.
29
+ :param page_range: The range of pages to extract contents from.
30
  :param max_pages: The max no. of pages to extract contents from.
31
  :return: The contents.
32
  """
33
 
34
  reader = PdfReader(pdf_file)
35
 
36
+ # get number of pages
37
  total_pages = len(reader.pages)
38
+ n_pages = min(max_pages, total_pages) # set n_pages to min of 50 or the pdf length if the pdf is shorter than 50 pages
39
 
40
+ # ensure validity
41
+ start, end = page_range # set start and end per the range (user-specified values)
42
+ start = max(1, start) # set start to max of 1, or user-spec start
43
+ end = min(n_pages, end) # set end to min of n_pages, or user-spec end
44
 
45
+ logger.info(f"start={start}")
46
+ logger.info(f"end={end}")
47
  text = ''
48
  for page_num in range(start - 1, end):
49
  page = reader.pages[page_num]