nicolasb92's picture
PDF to images
6946525
import base64
from pdf2image import convert_from_path
from io import BytesIO
from PIL import Image
from llama_index.core.schema import BaseNode
from shiny import ui, render, App
import logging
from pathlib import Path
from llama_index.core.storage.docstore import SimpleDocumentStore
import json
from structure_parsers import parse_structure
root_logger = logging.getLogger()
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
root_logger.setLevel(logging.ERROR)
parsed_structure_info = {
"landscape": """
Structure parsed from PDF using Gemini (landscape mode):
- Indentation shows parent/child relationships
- Numbers in [] represent:
* Positive numbers: page numbers
* Negative numbers: abstract nodes grouping related sections
""",
"portrait": """
Structure parsed from PDF using Gemini (portrait mode):
- Indentation shows parent/child relationships
- Numbers in [] represent:
* Positive numbers: line numbers
* Negative numbers: abstract nodes grouping related sections
""",
}
# Load the docstore
data_dir = Path(__file__).parent / "data"
docstore = SimpleDocumentStore.from_persist_path(str(data_dir / "storage_metadata" / "processed_docstore_storage.json"))
def get_pdf_as_images(pdf_path):
"""Convert PDF file to a list of base64 encoded images"""
try:
# Convert PDF to list of images
images = convert_from_path(pdf_path)
image_urls = []
for img in images:
# Convert PIL image to base64
buffered = BytesIO()
img.save(buffered, format="PNG")
img_base64 = base64.b64encode(buffered.getvalue()).decode()
image_urls.append(f"data:image/png;base64,{img_base64}")
return image_urls
except Exception as e:
logger.error(f"Error converting PDF to images: {e}")
return []
def get_str_structure(doc: BaseNode) -> str:
"""Return a string representation of the document structure."""
tree = parse_structure(doc)
return str(tree)
def format_metadata_value(value):
# Try to parse as JSON if it looks like a dict/list string
if isinstance(value, str):
if (value.startswith("{") and value.endswith("}")) or (value.startswith("[") and value.endswith("]")):
try:
parsed = json.loads(value)
return ui.tags.pre(json.dumps(parsed, indent=2, ensure_ascii=False))
except:
pass
# Handle multiline strings
if "\n" in value:
return ui.tags.pre(value)
# Handle lists and dicts directly
if isinstance(value, (dict, list)):
return ui.tags.pre(json.dumps(value, indent=2, ensure_ascii=False))
# Default case
return str(value)
# Define UI
def get_doc_choices():
return [(doc_id, doc.metadata.get("filename", doc_id)) for doc_id, doc in docstore.docs.items()]
app_ui = ui.page_fluid(
# Summary section
ui.card(
ui.h2("Document Store Summary"),
ui.output_text("doc_count"),
# min_height=200,
),
# Document selection and display
ui.card(
ui.layout_sidebar(
ui.sidebar(
ui.h3("Document Selection"),
ui.input_select("selected_doc_id", "Select Document", choices=dict(get_doc_choices())),
),
ui.h2("Document display"),
ui.output_ui("display_panel"),
ui.h2("Document Details"),
ui.output_ui("metadata_panel"),
)
),
)
def server(input, output, session):
@output
@render.text
def doc_count():
return f"Total documents: {len(docstore.docs)}"
@output
@render.ui
def display_panel():
if not input.selected_doc_id():
return ui.p("Please select a document")
try:
doc = docstore.docs.get(input.selected_doc_id())
if not doc:
logger.error(f"Document not found: {input.selected_doc_id()}")
return ui.p("Error: Document not found")
filename = doc.metadata.get("filename", "")
if not filename:
logger.error(f"Document does not have a filename: {input.selected_doc_id()}")
return ui.p("Error: Document does not have a filename")
pdf_path = data_dir / "pdfs" / filename
# pdf_data_url = get_pdf_data_url(pdf_path) if pdf_path else None
image_urls = get_pdf_as_images(pdf_path) if pdf_path.exists() else []
image_elements = [
ui.tags.img(
src=img_url,
style="width: 100%; margin-bottom: 10px; border: 1px solid #ddd;"
)
for img_url in image_urls
]
return ui.div(
ui.h3(f"Display panel for: {doc.metadata.get('filename', 'Unknown')}"),
ui.row(
ui.column(
6, # Left column (PDF)
ui.h4("PDF View"),
ui.div(
ui.div(image_elements) if image_elements else ui.p("No PDF available"),
style="height: 800px; overflow-y: auto;"
),
),
ui.column(
6, # Right column (Markdown)
ui.h4("Content"),
ui.div(
ui.markdown(doc.text),
style="height: 800px; overflow-y: auto; border: 1px solid #ddd; padding: 1rem;",
),
),
),
)
except Exception as e:
logger.exception("Error displaying document metadata")
return ui.p(f"Error: {str(e)}")
@output
@render.ui
def metadata_panel():
if not input.selected_doc_id():
return ui.p("Please select a document")
try:
doc = docstore.docs.get(input.selected_doc_id())
if not doc:
logger.error(f"Document not found: {input.selected_doc_id()}")
return ui.p("Error: Document not found")
return ui.div(
ui.h3(f"Metadata for: {doc.metadata.get('filename', 'Unknown')}"),
ui.tags.table(
ui.tags.thead(ui.tags.tr(ui.tags.th("Property"), ui.tags.th("Value"))),
ui.tags.tbody(
*[
ui.tags.tr(ui.tags.td(key), ui.tags.td(format_metadata_value(value)))
for key, value in doc.metadata.items()
]
),
class_="table table-striped",
),
ui.h3(f"Parsed Structure for: {doc.metadata.get('filename', 'Unknown')}"),
ui.markdown(parsed_structure_info.get(doc.metadata.get("format", ""), "")),
ui.tags.pre(get_str_structure(doc)),
)
except Exception as e:
logger.exception("Error displaying document metadata")
return ui.p(f"Error: {str(e)}")
app = App(app_ui, server)