"""Shiny app for browsing a persisted LlamaIndex document store.

For the selected document the app shows the source PDF rendered as images
next to the stored text, followed by the document metadata and the
structure produced by ``parse_structure``.
"""

import base64
import json
import logging
from io import BytesIO
from pathlib import Path

from llama_index.core.schema import BaseNode
from llama_index.core.storage.docstore import SimpleDocumentStore
from pdf2image import convert_from_path
from PIL import Image
from shiny import App, render, ui

from structure_parsers import parse_structure

root_logger = logging.getLogger()
logger = logging.getLogger(__name__)

# basicConfig installs the root handler; the root level is then raised to
# ERROR, so loggers without their own level only emit ERROR and above.
logging.basicConfig(level=logging.INFO)
root_logger.setLevel(logging.ERROR)

# Markdown help text shown above the parsed structure, keyed by the
# document's "format" metadata value.
parsed_structure_info = {
    "landscape": (
        "\n"
        "Structure parsed from PDF using Gemini (landscape mode):\n"
        "- Indentation shows parent/child relationships\n"
        "- Numbers in [] represent:\n"
        "  * Positive numbers: page numbers\n"
        "  * Negative numbers: abstract nodes grouping related sections\n"
    ),
    "portrait": (
        "\n"
        "Structure parsed from PDF using Gemini (portrait mode):\n"
        "- Indentation shows parent/child relationships\n"
        "- Numbers in [] represent:\n"
        "  * Positive numbers: line numbers\n"
        "  * Negative numbers: abstract nodes grouping related sections\n"
    ),
}

# Load the docstore
data_dir = Path(__file__).parent / "data"
docstore = SimpleDocumentStore.from_persist_path(
    str(data_dir / "storage_metadata" / "processed_docstore_storage.json")
)


def get_pdf_as_images(pdf_path):
    """Convert a PDF file to a list of base64-encoded PNG data URLs.

    Args:
        pdf_path: Path of the PDF, passed to ``convert_from_path``.

    Returns:
        One ``data:image/png;base64,...`` string per image returned by
        ``convert_from_path``, or an empty list if anything fails.
    """
    try:
        image_urls = []
        for img in convert_from_path(pdf_path):
            # Serialise each PIL image to PNG in memory, then base64 it.
            buffered = BytesIO()
            img.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_urls.append(f"data:image/png;base64,{img_base64}")
        return image_urls
    except Exception as e:
        # Best effort: the caller treats an empty list as "No PDF available".
        # logger.exception keeps the traceback in the log.
        logger.exception("Error converting PDF to images: %s", e)
        return []


def get_str_structure(doc: BaseNode) -> str:
    """Return a string representation of the document structure."""
    return str(parse_structure(doc))


def format_metadata_value(value):
    """Format one metadata value for display in the metadata table.

    Strings that look like a JSON object or array and parse successfully
    are pretty-printed in a ``<pre>`` tag; other strings containing a
    newline are wrapped in ``<pre>`` unchanged. Dicts and lists are dumped
    as indented JSON in ``<pre>``. Everything else is returned as ``str``.
    """
    if isinstance(value, str):
        looks_like_object = value.startswith("{") and value.endswith("}")
        looks_like_array = value.startswith("[") and value.endswith("]")
        if looks_like_object or looks_like_array:
            try:
                parsed = json.loads(value)
            except ValueError:
                # Not valid JSON after all (json.JSONDecodeError subclasses
                # ValueError); fall through and treat it as plain text.
                pass
            else:
                return ui.tags.pre(
                    json.dumps(parsed, indent=2, ensure_ascii=False)
                )

        # Handle multiline strings
        if "\n" in value:
            return ui.tags.pre(value)

    # Handle lists and dicts directly
    if isinstance(value, (dict, list)):
        return ui.tags.pre(json.dumps(value, indent=2, ensure_ascii=False))

    # Default case
    return str(value)


def get_doc_choices():
    """Return ``(doc_id, label)`` pairs for every document in the docstore.

    The label is the document's "filename" metadata, falling back to the
    document id when no filename is stored.
    """
    return [
        (doc_id, doc.metadata.get("filename", doc_id))
        for doc_id, doc in docstore.docs.items()
    ]


# Define UI
app_ui = ui.page_fluid(
    # Summary section
    ui.card(
        ui.h2("Document Store Summary"),
        ui.output_text("doc_count"),
    ),
    # Document selection and display
    ui.card(
        ui.layout_sidebar(
            ui.sidebar(
                ui.h3("Document Selection"),
                ui.input_select(
                    "selected_doc_id",
                    "Select Document",
                    choices=dict(get_doc_choices()),
                ),
            ),
            ui.h2("Document display"),
            ui.output_ui("display_panel"),
            ui.h2("Document Details"),
            ui.output_ui("metadata_panel"),
        )
    ),
)


def server(input, output, session):
    """Register the outputs declared in ``app_ui``."""

    @output
    @render.text
    def doc_count():
        return f"Total documents: {len(docstore.docs)}"

    @output
    @render.ui
    def display_panel():
        """Render the PDF pages and the stored text side by side."""
        doc_id = input.selected_doc_id()
        if not doc_id:
            return ui.p("Please select a document")

        try:
            doc = docstore.docs.get(doc_id)
            if not doc:
                logger.error(f"Document not found: {doc_id}")
                return ui.p("Error: Document not found")

            filename = doc.metadata.get("filename", "")
            if not filename:
                logger.error(f"Document does not have a filename: {doc_id}")
                return ui.p("Error: Document does not have a filename")

            pdf_path = data_dir / "pdfs" / filename
            # A missing PDF is not an error; the panel shows a placeholder.
            image_urls = get_pdf_as_images(pdf_path) if pdf_path.exists() else []
            image_elements = [
                ui.tags.img(
                    src=img_url,
                    style="width: 100%; margin-bottom: 10px; border: 1px solid #ddd;",
                )
                for img_url in image_urls
            ]

            return ui.div(
                ui.h3(f"Display panel for: {filename}"),
                ui.row(
                    ui.column(
                        6,  # Left column (PDF)
                        ui.h4("PDF View"),
                        ui.div(
                            ui.div(image_elements)
                            if image_elements
                            else ui.p("No PDF available"),
                            style="height: 800px; overflow-y: auto;",
                        ),
                    ),
                    ui.column(
                        6,  # Right column (Markdown)
                        ui.h4("Content"),
                        ui.div(
                            ui.markdown(doc.text),
                            style="height: 800px; overflow-y: auto; border: 1px solid #ddd; padding: 1rem;",
                        ),
                    ),
                ),
            )
        except Exception as e:
            # Top-level UI boundary: log with traceback, show the message.
            logger.exception("Error displaying document")
            return ui.p(f"Error: {e}")

    @output
    @render.ui
    def metadata_panel():
        """Render the metadata table and the parsed structure."""
        doc_id = input.selected_doc_id()
        if not doc_id:
            return ui.p("Please select a document")

        try:
            doc = docstore.docs.get(doc_id)
            if not doc:
                logger.error(f"Document not found: {doc_id}")
                return ui.p("Error: Document not found")

            display_name = doc.metadata.get("filename", "Unknown")
            metadata_rows = [
                ui.tags.tr(
                    ui.tags.td(key),
                    ui.tags.td(format_metadata_value(value)),
                )
                for key, value in doc.metadata.items()
            ]

            return ui.div(
                ui.h3(f"Metadata for: {display_name}"),
                ui.tags.table(
                    ui.tags.thead(
                        ui.tags.tr(ui.tags.th("Property"), ui.tags.th("Value"))
                    ),
                    ui.tags.tbody(*metadata_rows),
                    class_="table table-striped",
                ),
                ui.h3(f"Parsed Structure for: {display_name}"),
                # Unknown or missing formats render no help text.
                ui.markdown(
                    parsed_structure_info.get(doc.metadata.get("format", ""), "")
                ),
                ui.tags.pre(get_str_structure(doc)),
            )
        except Exception as e:
            # Top-level UI boundary: log with traceback, show the message.
            logger.exception("Error displaying document metadata")
            return ui.p(f"Error: {e}")


app = App(app_ui, server)