Update app.py
Browse files
app.py
CHANGED
@@ -31,11 +31,16 @@ nltk.download('punkt')
|
|
31 |
nltk.download('punkt_tab')
|
32 |
nltk.download('stopwords')
|
33 |
|
34 |
-
# Create directories if they don't exist
|
35 |
def create_dirs_if_needed():
|
36 |
"""Create the necessary directories if they don't exist."""
|
37 |
-
os.
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
# Call the function at the start of your app
|
41 |
create_dirs_if_needed()
|
@@ -509,35 +514,46 @@ def is_query_relevant(question, source_documents, threshold=0.1):
|
|
509 |
except Exception as e:
|
510 |
logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
|
511 |
return False
|
|
|
512 |
def get_pdf_details(filename, page_number):
|
513 |
"""Get details of a specific PDF page."""
|
514 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
515 |
try:
|
516 |
-
#
|
517 |
-
|
|
|
|
|
|
|
|
|
518 |
file_path = os.path.join(data_path, filename)
|
519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
# Open the PDF
|
521 |
logger.debug(f"Opening PDF file: {file_path}")
|
522 |
doc = fitz.open(file_path)
|
523 |
-
|
524 |
# Extract full PDF text
|
525 |
full_text = ""
|
526 |
for page in doc:
|
527 |
full_text += page.get_text()
|
528 |
-
|
529 |
# Get PDF metadata
|
530 |
pdf_metadata = doc.metadata or {}
|
531 |
-
|
532 |
# Extract page text and render page image
|
533 |
page = doc.load_page(page_number)
|
534 |
page_text = page.get_text()
|
535 |
-
|
536 |
# Render page as image
|
537 |
pix = page.get_pixmap()
|
538 |
img_bytes = pix.tobytes("png")
|
539 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
540 |
-
|
541 |
# Detect language
|
542 |
try:
|
543 |
lang_code = detect(page_text)
|
@@ -545,7 +561,7 @@ def get_pdf_details(filename, page_number):
|
|
545 |
except Exception as e:
|
546 |
logger.warning(f"Language detection failed: {str(e)}")
|
547 |
language = 'Unknown'
|
548 |
-
|
549 |
# Prepare response
|
550 |
return {
|
551 |
"file_path": file_path,
|
@@ -574,10 +590,20 @@ def get_romanized_text(filename):
|
|
574 |
"""Get romanized text from a PDF."""
|
575 |
logger.info(f"Processing romanized text for file: {filename}")
|
576 |
try:
|
577 |
-
#
|
578 |
-
|
|
|
|
|
|
|
|
|
579 |
file_path = os.path.join(data_path, filename)
|
580 |
|
|
|
|
|
|
|
|
|
|
|
|
|
581 |
# Open the PDF
|
582 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
583 |
doc = fitz.open(file_path)
|
|
|
31 |
nltk.download('punkt_tab')
|
32 |
nltk.download('stopwords')
|
33 |
|
|
|
34 |
def create_dirs_if_needed():
    """Make sure the data and database directories exist.

    When a ``/tmp`` directory is present, ``/tmp/data`` and ``/tmp/db`` are
    created (the code treats this as the Hugging Face Space case). Otherwise
    ``data`` and ``db`` are created relative to the current working
    directory (the local-environment case). Directories that already exist
    are left untouched.
    """
    # NOTE(review): the presence of /tmp is used as the "running in a
    # Hugging Face Space" signal. /tmp also exists on most Unix-like
    # machines, so the local branch is probably only reached on systems
    # without /tmp -- confirm this is intended. Other functions in this file
    # derive their data path from the same check, so any change must be made
    # everywhere at once.
    if os.path.exists('/tmp'):
        targets = ('/tmp/data', '/tmp/db')
    else:
        targets = ('data', 'db')

    for target in targets:
        # exist_ok=True makes repeated start-ups a no-op instead of an error.
        os.makedirs(target, exist_ok=True)
|
44 |
|
45 |
# Call the function at the start of your app
|
46 |
create_dirs_if_needed()
|
|
|
514 |
except Exception as e:
|
515 |
logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
|
516 |
return False
|
517 |
+
|
518 |
def get_pdf_details(filename, page_number):
|
519 |
"""Get details of a specific PDF page."""
|
520 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
521 |
try:
|
522 |
+
# Check if running in Hugging Face space or locally
|
523 |
+
if os.path.exists('/tmp'):
|
524 |
+
data_path = '/tmp/data' # Hugging Face temporary storage
|
525 |
+
else:
|
526 |
+
data_path = 'data' # Local storage
|
527 |
+
|
528 |
file_path = os.path.join(data_path, filename)
|
529 |
|
530 |
+
# Ensure file exists
|
531 |
+
if not os.path.exists(file_path):
|
532 |
+
logger.error(f"File does not exist at {file_path}")
|
533 |
+
st.error(f"File not found at {file_path}")
|
534 |
+
return
|
535 |
+
|
536 |
# Open the PDF
|
537 |
logger.debug(f"Opening PDF file: {file_path}")
|
538 |
doc = fitz.open(file_path)
|
539 |
+
|
540 |
# Extract full PDF text
|
541 |
full_text = ""
|
542 |
for page in doc:
|
543 |
full_text += page.get_text()
|
544 |
+
|
545 |
# Get PDF metadata
|
546 |
pdf_metadata = doc.metadata or {}
|
547 |
+
|
548 |
# Extract page text and render page image
|
549 |
page = doc.load_page(page_number)
|
550 |
page_text = page.get_text()
|
551 |
+
|
552 |
# Render page as image
|
553 |
pix = page.get_pixmap()
|
554 |
img_bytes = pix.tobytes("png")
|
555 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
556 |
+
|
557 |
# Detect language
|
558 |
try:
|
559 |
lang_code = detect(page_text)
|
|
|
561 |
except Exception as e:
|
562 |
logger.warning(f"Language detection failed: {str(e)}")
|
563 |
language = 'Unknown'
|
564 |
+
|
565 |
# Prepare response
|
566 |
return {
|
567 |
"file_path": file_path,
|
|
|
590 |
"""Get romanized text from a PDF."""
|
591 |
logger.info(f"Processing romanized text for file: {filename}")
|
592 |
try:
|
593 |
+
# Check if running in Hugging Face space or locally
|
594 |
+
if os.path.exists('/tmp'):
|
595 |
+
data_path = '/tmp/data' # Use Hugging Face's temp directory
|
596 |
+
else:
|
597 |
+
data_path = 'data' # Use local directory
|
598 |
+
|
599 |
file_path = os.path.join(data_path, filename)
|
600 |
|
601 |
+
# Ensure file exists
|
602 |
+
if not os.path.exists(file_path):
|
603 |
+
logger.error(f"File does not exist at {file_path}")
|
604 |
+
st.error(f"File not found at {file_path}")
|
605 |
+
return
|
606 |
+
|
607 |
# Open the PDF
|
608 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
609 |
doc = fitz.open(file_path)
|