|
import base64 |
|
from pdf2image import convert_from_path |
|
from io import BytesIO |
|
from PIL import Image |
|
|
|
from llama_index.core.schema import BaseNode |
|
from shiny import ui, render, App |
|
import logging |
|
from pathlib import Path |
|
from llama_index.core.storage.docstore import SimpleDocumentStore |
|
import json |
|
|
|
from structure_parsers import parse_structure |
|
|
|
# Logging setup. Statement order matters here: basicConfig() configures the
# root logger with level INFO, and the setLevel() call that follows then
# changes the root threshold to ERROR.
root_logger = logging.getLogger()
# Module logger; no level is set on it here, so it inherits the root level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# NOTE(review): this overrides the INFO level requested just above, so
# INFO/WARNING records from this module are filtered out as well - confirm
# that this is intended.
root_logger.setLevel(logging.ERROR)
|
|
|
|
|
# Explanatory legends rendered as markdown above the parsed structure tree in
# the metadata panel. Keys are matched against the document's "format"
# metadata value; any other value yields an empty legend.
parsed_structure_info = {
    "landscape": """
    Structure parsed from PDF using Gemini (landscape mode):
    - Indentation shows parent/child relationships
    - Numbers in [] represent:
        * Positive numbers: page numbers
        * Negative numbers: abstract nodes grouping related sections
    """,
    "portrait": """
    Structure parsed from PDF using Gemini (portrait mode):
    - Indentation shows parent/child relationships
    - Numbers in [] represent:
        * Positive numbers: line numbers
        * Negative numbers: abstract nodes grouping related sections
    """,
}
|
|
|
|
|
# Directory holding the app's data files, located next to this module.
data_dir = Path(__file__).parent.joinpath("data")
# Document store loaded once at import time from its persisted JSON file.
docstore = SimpleDocumentStore.from_persist_path(
    str(data_dir.joinpath("storage_metadata", "processed_docstore_storage.json"))
)
|
|
|
|
|
def get_pdf_as_images(pdf_path):
    """Render a PDF as a list of PNG data URLs.

    Args:
        pdf_path: Location of the PDF; passed unchanged to
            ``convert_from_path``.

    Returns:
        One ``data:image/png;base64,...`` string per image returned by
        ``convert_from_path``, in the same order. If anything raises, the
        error is logged and an empty list is returned instead.
    """

    def _to_data_url(page):
        # Serialise the image to PNG in memory, then base64-encode the bytes.
        png_buffer = BytesIO()
        page.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue()).decode()
        return f"data:image/png;base64,{encoded}"

    try:
        return [_to_data_url(page) for page in convert_from_path(pdf_path)]
    except Exception as e:
        # Best-effort: callers treat an empty list as "no PDF available".
        logger.error(f"Error converting PDF to images: {e}")
        return []
|
|
|
|
|
def get_str_structure(doc: BaseNode) -> str:
    """Render the parsed structure of *doc* as text.

    The node is handed to ``parse_structure`` and the result is converted
    with ``str()``.
    """
    return str(parse_structure(doc))
|
|
|
|
|
def format_metadata_value(value):
    """Format a metadata value for display in the metadata table.

    Args:
        value: Any metadata value.

    Returns:
        A ``<pre>`` tag when the value is a JSON-looking string that parses,
        a multi-line string, or a JSON-serializable dict/list (JSON is
        pretty-printed with a 2-space indent); otherwise ``str(value)``.
    """
    if isinstance(value, str):
        looks_like_json = (value.startswith("{") and value.endswith("}")) or (
            value.startswith("[") and value.endswith("]")
        )
        if looks_like_json:
            try:
                parsed = json.loads(value)
            except json.JSONDecodeError:
                # Bracketed but not valid JSON (e.g. "{placeholder}"):
                # fall through and treat it as ordinary text.
                pass
            else:
                return ui.tags.pre(json.dumps(parsed, indent=2, ensure_ascii=False))

        # Keep line breaks visible for multi-line text.
        if "\n" in value:
            return ui.tags.pre(value)

    if isinstance(value, (dict, list)):
        try:
            pretty = json.dumps(value, indent=2, ensure_ascii=False)
        except (TypeError, ValueError):
            # Non-serializable members or circular references: fall back to
            # the plain string form rather than failing the whole table.
            pass
        else:
            return ui.tags.pre(pretty)

    return str(value)
|
|
|
|
|
|
|
def get_doc_choices():
    """Build the ``(doc_id, label)`` pairs offered in the document selector.

    The label is the document's "filename" metadata entry, or the document
    id itself when that entry is missing.
    """
    choices = []
    for doc_id, doc in docstore.docs.items():
        label = doc.metadata.get("filename", doc_id)
        choices.append((doc_id, label))
    return choices
|
|
|
|
|
# Page layout: a summary card on top, then a card with the document picker in
# a sidebar next to the display and details outputs rendered by `server`.
_summary_card = ui.card(
    ui.h2("Document Store Summary"),
    ui.output_text("doc_count"),
)

_doc_picker = ui.sidebar(
    ui.h3("Document Selection"),
    ui.input_select(
        "selected_doc_id",
        "Select Document",
        choices=dict(get_doc_choices()),
    ),
)

_browser_card = ui.card(
    ui.layout_sidebar(
        _doc_picker,
        ui.h2("Document display"),
        ui.output_ui("display_panel"),
        ui.h2("Document Details"),
        ui.output_ui("metadata_panel"),
    )
)

app_ui = ui.page_fluid(_summary_card, _browser_card)
|
|
|
|
|
def server(input, output, session):
    """Register the reactive outputs referenced by ``app_ui``.

    Args:
        input: Shiny inputs; ``input.selected_doc_id()`` is the id chosen in
            the document selector.
        output: Shiny output registry used to register the renderers below.
        session: Shiny session object (not used by this function).
    """

    @output
    @render.text
    def doc_count():
        """Show how many documents the docstore holds."""
        return f"Total documents: {len(docstore.docs)}"

    @output
    @render.ui
    def display_panel():
        """Show the selected document's PDF pages next to its text content."""
        # Read the input once so every message below refers to the same id.
        doc_id = input.selected_doc_id()
        if not doc_id:
            return ui.p("Please select a document")

        try:
            doc = docstore.docs.get(doc_id)
            if doc is None:
                logger.error("Document not found: %s", doc_id)
                return ui.p("Error: Document not found")

            filename = doc.metadata.get("filename", "")
            if not filename:
                logger.error("Document does not have a filename: %s", doc_id)
                return ui.p("Error: Document does not have a filename")

            # The PDF is optional: a missing file simply yields no images.
            pdf_path = data_dir / "pdfs" / filename
            image_urls = get_pdf_as_images(pdf_path) if pdf_path.exists() else []
            image_elements = [
                ui.tags.img(
                    src=img_url,
                    style="width: 100%; margin-bottom: 10px; border: 1px solid #ddd;",
                )
                for img_url in image_urls
            ]

            return ui.div(
                ui.h3(f"Display panel for: {filename}"),
                ui.row(
                    ui.column(
                        6,
                        ui.h4("PDF View"),
                        ui.div(
                            ui.div(image_elements) if image_elements else ui.p("No PDF available"),
                            style="height: 800px; overflow-y: auto;",
                        ),
                    ),
                    ui.column(
                        6,
                        ui.h4("Content"),
                        ui.div(
                            ui.markdown(doc.text),
                            style="height: 800px; overflow-y: auto; border: 1px solid #ddd; padding: 1rem;",
                        ),
                    ),
                ),
            )

        except Exception as e:
            # Top-level UI boundary: log with traceback and show the message
            # in the panel instead of breaking the page.
            logger.exception("Error displaying document")
            return ui.p(f"Error: {e}")

    @output
    @render.ui
    def metadata_panel():
        """Show the selected document's metadata table and parsed structure."""
        doc_id = input.selected_doc_id()
        if not doc_id:
            return ui.p("Please select a document")

        try:
            doc = docstore.docs.get(doc_id)
            if doc is None:
                logger.error("Document not found: %s", doc_id)
                return ui.p("Error: Document not found")

            filename = doc.metadata.get("filename", "Unknown")
            # One table row per metadata entry.
            rows = [
                ui.tags.tr(ui.tags.td(key), ui.tags.td(format_metadata_value(value)))
                for key, value in doc.metadata.items()
            ]

            return ui.div(
                ui.h3(f"Metadata for: {filename}"),
                ui.tags.table(
                    ui.tags.thead(ui.tags.tr(ui.tags.th("Property"), ui.tags.th("Value"))),
                    ui.tags.tbody(*rows),
                    class_="table table-striped",
                ),
                ui.h3(f"Parsed Structure for: {filename}"),
                # Legend text depends on the document's "format" metadata;
                # unknown formats get an empty legend.
                ui.markdown(parsed_structure_info.get(doc.metadata.get("format", ""), "")),
                ui.tags.pre(get_str_structure(doc)),
            )

        except Exception as e:
            # Top-level UI boundary: log with traceback and show the message
            # in the panel instead of breaking the page.
            logger.exception("Error displaying document metadata")
            return ui.p(f"Error: {e}")
|
|
|
|
|
# Shiny application object combining the page layout with the server logic.
app = App(app_ui, server)
|
|