Avanisha committed on
Commit
94ca2d7
·
verified ·
1 Parent(s): dbcc12d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -31
app.py CHANGED
@@ -460,7 +460,7 @@ def display_source_documents_with_images(source_documents, query):
460
  highlighted_snippet = highlight_query_words(snippet, query)
461
 
462
  st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
463
- # st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
464
 
465
  logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
466
 
@@ -519,41 +519,33 @@ def get_pdf_details(filename, page_number):
519
  """Get details of a specific PDF page."""
520
  logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
521
  try:
522
- # Check if running in Hugging Face space or locally
523
- if os.path.exists('/tmp'):
524
- data_path = '/tmp/data' # Hugging Face temporary storage
525
- else:
526
- data_path = 'data' # Local storage
527
-
528
  file_path = os.path.join(data_path, filename)
529
 
530
- # Ensure file exists
531
- if not os.path.exists(file_path):
532
- logger.error(f"File does not exist at {file_path}")
533
- st.error(f"File not found at {file_path}")
534
- return
535
-
536
  # Open the PDF
537
  logger.debug(f"Opening PDF file: {file_path}")
538
  doc = fitz.open(file_path)
539
-
540
  # Extract full PDF text
541
  full_text = ""
542
  for page in doc:
543
  full_text += page.get_text()
544
-
545
  # Get PDF metadata
546
  pdf_metadata = doc.metadata or {}
547
-
548
  # Extract page text and render page image
549
  page = doc.load_page(page_number)
550
  page_text = page.get_text()
551
-
552
  # Render page as image
553
  pix = page.get_pixmap()
554
  img_bytes = pix.tobytes("png")
555
  page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
556
-
557
  # Detect language
558
  try:
559
  lang_code = detect(page_text)
@@ -561,7 +553,7 @@ def get_pdf_details(filename, page_number):
561
  except Exception as e:
562
  logger.warning(f"Language detection failed: {str(e)}")
563
  language = 'Unknown'
564
-
565
  # Prepare response
566
  return {
567
  "file_path": file_path,
@@ -590,20 +582,12 @@ def get_romanized_text(filename):
590
  """Get romanized text from a PDF."""
591
  logger.info(f"Processing romanized text for file: {filename}")
592
  try:
593
- # Check if running in Hugging Face space or locally
594
- if os.path.exists('/tmp'):
595
- data_path = '/tmp/data' # Use Hugging Face's temp directory
596
- else:
597
- data_path = 'data' # Use local directory
598
-
599
  file_path = os.path.join(data_path, filename)
600
 
601
- # Ensure file exists
602
- if not os.path.exists(file_path):
603
- logger.error(f"File does not exist at {file_path}")
604
- st.error(f"File not found at {file_path}")
605
- return
606
-
607
  # Open the PDF
608
  logger.debug(f"Opening PDF file for romanization: {file_path}")
609
  doc = fitz.open(file_path)
 
460
  highlighted_snippet = highlight_query_words(snippet, query)
461
 
462
  st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
463
+ st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
464
 
465
  logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
466
 
 
519
  """Get details of a specific PDF page."""
520
  logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
521
  try:
522
+ with open(CONFIG_FILE, 'r') as f:
523
+ config = json.load(f)
524
+
525
+ data_path = config.get('data_path', '/tmp/data')
 
 
526
  file_path = os.path.join(data_path, filename)
527
 
 
 
 
 
 
 
528
  # Open the PDF
529
  logger.debug(f"Opening PDF file: {file_path}")
530
  doc = fitz.open(file_path)
531
+
532
  # Extract full PDF text
533
  full_text = ""
534
  for page in doc:
535
  full_text += page.get_text()
536
+
537
  # Get PDF metadata
538
  pdf_metadata = doc.metadata or {}
539
+
540
  # Extract page text and render page image
541
  page = doc.load_page(page_number)
542
  page_text = page.get_text()
543
+
544
  # Render page as image
545
  pix = page.get_pixmap()
546
  img_bytes = pix.tobytes("png")
547
  page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
548
+
549
  # Detect language
550
  try:
551
  lang_code = detect(page_text)
 
553
  except Exception as e:
554
  logger.warning(f"Language detection failed: {str(e)}")
555
  language = 'Unknown'
556
+
557
  # Prepare response
558
  return {
559
  "file_path": file_path,
 
582
  """Get romanized text from a PDF."""
583
  logger.info(f"Processing romanized text for file: {filename}")
584
  try:
585
+ with open(CONFIG_FILE, 'r') as f:
586
+ config = json.load(f)
587
+
588
+ data_path = config.get('data_path', '/tmp/data')
 
 
589
  file_path = os.path.join(data_path, filename)
590
 
 
 
 
 
 
 
591
  # Open the PDF
592
  logger.debug(f"Opening PDF file for romanization: {file_path}")
593
  doc = fitz.open(file_path)