Update app.py
Browse files
app.py
CHANGED
@@ -460,7 +460,7 @@ def display_source_documents_with_images(source_documents, query):
|
|
460 |
highlighted_snippet = highlight_query_words(snippet, query)
|
461 |
|
462 |
st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
|
463 |
-
|
464 |
|
465 |
logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
|
466 |
|
@@ -519,41 +519,33 @@ def get_pdf_details(filename, page_number):
|
|
519 |
"""Get details of a specific PDF page."""
|
520 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
521 |
try:
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
data_path = 'data' # Local storage
|
527 |
-
|
528 |
file_path = os.path.join(data_path, filename)
|
529 |
|
530 |
-
# Ensure file exists
|
531 |
-
if not os.path.exists(file_path):
|
532 |
-
logger.error(f"File does not exist at {file_path}")
|
533 |
-
st.error(f"File not found at {file_path}")
|
534 |
-
return
|
535 |
-
|
536 |
# Open the PDF
|
537 |
logger.debug(f"Opening PDF file: {file_path}")
|
538 |
doc = fitz.open(file_path)
|
539 |
-
|
540 |
# Extract full PDF text
|
541 |
full_text = ""
|
542 |
for page in doc:
|
543 |
full_text += page.get_text()
|
544 |
-
|
545 |
# Get PDF metadata
|
546 |
pdf_metadata = doc.metadata or {}
|
547 |
-
|
548 |
# Extract page text and render page image
|
549 |
page = doc.load_page(page_number)
|
550 |
page_text = page.get_text()
|
551 |
-
|
552 |
# Render page as image
|
553 |
pix = page.get_pixmap()
|
554 |
img_bytes = pix.tobytes("png")
|
555 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
556 |
-
|
557 |
# Detect language
|
558 |
try:
|
559 |
lang_code = detect(page_text)
|
@@ -561,7 +553,7 @@ def get_pdf_details(filename, page_number):
|
|
561 |
except Exception as e:
|
562 |
logger.warning(f"Language detection failed: {str(e)}")
|
563 |
language = 'Unknown'
|
564 |
-
|
565 |
# Prepare response
|
566 |
return {
|
567 |
"file_path": file_path,
|
@@ -590,20 +582,12 @@ def get_romanized_text(filename):
|
|
590 |
"""Get romanized text from a PDF."""
|
591 |
logger.info(f"Processing romanized text for file: {filename}")
|
592 |
try:
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
data_path = 'data' # Use local directory
|
598 |
-
|
599 |
file_path = os.path.join(data_path, filename)
|
600 |
|
601 |
-
# Ensure file exists
|
602 |
-
if not os.path.exists(file_path):
|
603 |
-
logger.error(f"File does not exist at {file_path}")
|
604 |
-
st.error(f"File not found at {file_path}")
|
605 |
-
return
|
606 |
-
|
607 |
# Open the PDF
|
608 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
609 |
doc = fitz.open(file_path)
|
|
|
460 |
highlighted_snippet = highlight_query_words(snippet, query)
|
461 |
|
462 |
st.markdown(f'<div class="source-content">{highlighted_snippet}</div>', unsafe_allow_html=True)
|
463 |
+
st.markdown(f"[View other results in this book](?page=pdf_details&filename={pdf_name}&page_number={page_number})", unsafe_allow_html=True)
|
464 |
|
465 |
logger.debug(f"Successfully displayed content for {pdf_name}, page {page_number + 1}")
|
466 |
|
|
|
519 |
"""Get details of a specific PDF page."""
|
520 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
521 |
try:
|
522 |
+
with open(CONFIG_FILE, 'r') as f:
|
523 |
+
config = json.load(f)
|
524 |
+
|
525 |
+
data_path = config.get('data_path', '/tmp/data')
|
|
|
|
|
526 |
file_path = os.path.join(data_path, filename)
|
527 |
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
# Open the PDF
|
529 |
logger.debug(f"Opening PDF file: {file_path}")
|
530 |
doc = fitz.open(file_path)
|
531 |
+
|
532 |
# Extract full PDF text
|
533 |
full_text = ""
|
534 |
for page in doc:
|
535 |
full_text += page.get_text()
|
536 |
+
|
537 |
# Get PDF metadata
|
538 |
pdf_metadata = doc.metadata or {}
|
539 |
+
|
540 |
# Extract page text and render page image
|
541 |
page = doc.load_page(page_number)
|
542 |
page_text = page.get_text()
|
543 |
+
|
544 |
# Render page as image
|
545 |
pix = page.get_pixmap()
|
546 |
img_bytes = pix.tobytes("png")
|
547 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
548 |
+
|
549 |
# Detect language
|
550 |
try:
|
551 |
lang_code = detect(page_text)
|
|
|
553 |
except Exception as e:
|
554 |
logger.warning(f"Language detection failed: {str(e)}")
|
555 |
language = 'Unknown'
|
556 |
+
|
557 |
# Prepare response
|
558 |
return {
|
559 |
"file_path": file_path,
|
|
|
582 |
"""Get romanized text from a PDF."""
|
583 |
logger.info(f"Processing romanized text for file: {filename}")
|
584 |
try:
|
585 |
+
with open(CONFIG_FILE, 'r') as f:
|
586 |
+
config = json.load(f)
|
587 |
+
|
588 |
+
data_path = config.get('data_path', '/tmp/data')
|
|
|
|
|
589 |
file_path = os.path.join(data_path, filename)
|
590 |
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
# Open the PDF
|
592 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
593 |
doc = fitz.open(file_path)
|