Avanisha committed on
Commit
496b314
·
verified ·
1 Parent(s): 002703a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -13
app.py CHANGED
@@ -31,11 +31,16 @@ nltk.download('punkt')
31
  nltk.download('punkt_tab')
32
  nltk.download('stopwords')
33
 
34
- # Create directories if they don't exist
35
  def create_dirs_if_needed():
36
  """Create the necessary directories if they don't exist."""
37
- os.makedirs('/tmp/data', exist_ok=True)
38
- os.makedirs('/tmp/db', exist_ok=True)
 
 
 
 
 
 
39
 
40
  # Call the function at the start of your app
41
  create_dirs_if_needed()
@@ -509,35 +514,46 @@ def is_query_relevant(question, source_documents, threshold=0.1):
509
  except Exception as e:
510
  logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
511
  return False
 
512
  def get_pdf_details(filename, page_number):
513
  """Get details of a specific PDF page."""
514
  logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
515
  try:
516
- # Update the paths to point to /tmp for Hugging Face Space
517
- data_path = '/tmp/data'
 
 
 
 
518
  file_path = os.path.join(data_path, filename)
519
 
 
 
 
 
 
 
520
  # Open the PDF
521
  logger.debug(f"Opening PDF file: {file_path}")
522
  doc = fitz.open(file_path)
523
-
524
  # Extract full PDF text
525
  full_text = ""
526
  for page in doc:
527
  full_text += page.get_text()
528
-
529
  # Get PDF metadata
530
  pdf_metadata = doc.metadata or {}
531
-
532
  # Extract page text and render page image
533
  page = doc.load_page(page_number)
534
  page_text = page.get_text()
535
-
536
  # Render page as image
537
  pix = page.get_pixmap()
538
  img_bytes = pix.tobytes("png")
539
  page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
540
-
541
  # Detect language
542
  try:
543
  lang_code = detect(page_text)
@@ -545,7 +561,7 @@ def get_pdf_details(filename, page_number):
545
  except Exception as e:
546
  logger.warning(f"Language detection failed: {str(e)}")
547
  language = 'Unknown'
548
-
549
  # Prepare response
550
  return {
551
  "file_path": file_path,
@@ -574,10 +590,20 @@ def get_romanized_text(filename):
574
  """Get romanized text from a PDF."""
575
  logger.info(f"Processing romanized text for file: {filename}")
576
  try:
577
- # Update the paths to point to /tmp for Hugging Face Space
578
- data_path = '/tmp/data'
 
 
 
 
579
  file_path = os.path.join(data_path, filename)
580
 
 
 
 
 
 
 
581
  # Open the PDF
582
  logger.debug(f"Opening PDF file for romanization: {file_path}")
583
  doc = fitz.open(file_path)
 
31
  nltk.download('punkt_tab')
32
  nltk.download('stopwords')
33
 
 
34
  def create_dirs_if_needed():
35
  """Create the necessary directories if they don't exist."""
36
+ if os.path.exists('/tmp'):
37
+ # We're in Hugging Face space
38
+ os.makedirs('/tmp/data', exist_ok=True)
39
+ os.makedirs('/tmp/db', exist_ok=True)
40
+ else:
41
+ # Local environment
42
+ os.makedirs('data', exist_ok=True)
43
+ os.makedirs('db', exist_ok=True)
44
 
45
  # Call the function at the start of your app
46
  create_dirs_if_needed()
 
514
  except Exception as e:
515
  logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
516
  return False
517
+
518
  def get_pdf_details(filename, page_number):
519
  """Get details of a specific PDF page."""
520
  logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
521
  try:
522
+ # Check if running in Hugging Face space or locally
523
+ if os.path.exists('/tmp'):
524
+ data_path = '/tmp/data' # Hugging Face temporary storage
525
+ else:
526
+ data_path = 'data' # Local storage
527
+
528
  file_path = os.path.join(data_path, filename)
529
 
530
+ # Ensure file exists
531
+ if not os.path.exists(file_path):
532
+ logger.error(f"File does not exist at {file_path}")
533
+ st.error(f"File not found at {file_path}")
534
+ return
535
+
536
  # Open the PDF
537
  logger.debug(f"Opening PDF file: {file_path}")
538
  doc = fitz.open(file_path)
539
+
540
  # Extract full PDF text
541
  full_text = ""
542
  for page in doc:
543
  full_text += page.get_text()
544
+
545
  # Get PDF metadata
546
  pdf_metadata = doc.metadata or {}
547
+
548
  # Extract page text and render page image
549
  page = doc.load_page(page_number)
550
  page_text = page.get_text()
551
+
552
  # Render page as image
553
  pix = page.get_pixmap()
554
  img_bytes = pix.tobytes("png")
555
  page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
556
+
557
  # Detect language
558
  try:
559
  lang_code = detect(page_text)
 
561
  except Exception as e:
562
  logger.warning(f"Language detection failed: {str(e)}")
563
  language = 'Unknown'
564
+
565
  # Prepare response
566
  return {
567
  "file_path": file_path,
 
590
  """Get romanized text from a PDF."""
591
  logger.info(f"Processing romanized text for file: {filename}")
592
  try:
593
+ # Check if running in Hugging Face space or locally
594
+ if os.path.exists('/tmp'):
595
+ data_path = '/tmp/data' # Use Hugging Face's temp directory
596
+ else:
597
+ data_path = 'data' # Use local directory
598
+
599
  file_path = os.path.join(data_path, filename)
600
 
601
+ # Ensure file exists
602
+ if not os.path.exists(file_path):
603
+ logger.error(f"File does not exist at {file_path}")
604
+ st.error(f"File not found at {file_path}")
605
+ return
606
+
607
  # Open the PDF
608
  logger.debug(f"Opening PDF file for romanization: {file_path}")
609
  doc = fitz.open(file_path)