nicolasb92's picture
Update description info
e56fb42
import re
from pathlib import Path
from typing import Literal
import streamlit as st
import os
from streamlit_pdf_viewer import pdf_viewer
st.set_page_config(layout="wide")
# Define the paths to your PDF and Markdown folders
repo_folder = Path(__file__).resolve().parent
pdf_folder = repo_folder / "pdfs"
pdf_pages = {
"2023-conocophillips-aim-presentation-1-7.pdf": 7,
"deloitte-tech-risk-sector-banking.pdf": 2,
"dttl-tax-technology-report-2023.pdf": 17,
"gx-iif-open-data.pdf": 32,
"life-sciences-smart-manufacturing-services-peak-matrix-assessment-2023.pdf": 15,
"XC9500_CPLD_Family-1-4.pdf": 4
}
pdfs_files = os.listdir(str(pdf_folder))
def read_markdown_file(
markdown_dir: Path,
pdf_filename: str,
method: Literal["docling", "llamaparse", "marker-pdf", "pymupdf"],
) -> list[str]:
match method:
case "docling" | "llamaparse":
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md")
md_content = md_path.read_text()
md_content_pages = md_content.split("\n---\n")
case "marker-pdf":
md_path = markdown_dir / pdf_filename.replace(".pdf", "") / pdf_filename.replace(".pdf", ".md")
md_content = md_path.read_text()
pattern = r"\n\{\d+\}-+\s\n*"
md_content_pages = re.split(pattern, md_content)
case "gemini" | "mistral":
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md")
md_content = md_path.read_text()
pattern = r"--- end page \d+"
md_content_pages = re.split(pattern, md_content)
if not md_content_pages[-1].strip():
md_content_pages.pop()
case "pymupdf":
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md")
md_content = md_path.read_text()
md_content_pages = md_content.split("\n-----\n")
if not md_content_pages[-1].strip():
md_content_pages.pop()
case _:
raise NotImplementedError(f"Method {method} not implemented")
return md_content_pages
@st.cache_data
def read_pdf_file(pdf_path: Path) -> bytes:
return pdf_path.read_bytes()
st.title("PDF and Markdown Viewer")
st.markdown("""
This is a demo of different PDF parsing methods.
For more information, see:
- [Blog post](https://nbrosse.github.io/posts/pdf-parsing/pdf-parsing.html)
- [GitHub repository](https://github.com/nbrosse/pdf-parsing)
""")
method = st.selectbox("Choose method", options=["docling", "llamaparse", "marker-pdf", "pymupdf", "gemini", "mistral"])
markdown_folder = repo_folder / f"{method}-folder" / "md"
filename = st.selectbox("Choose filename", options=pdfs_files)
num_pages_file = pdf_pages[filename]
page = st.selectbox("Choose page", options=list(range(1, num_pages_file + 1)))
pdf_content = read_pdf_file(pdf_folder / filename)
md_content_pages = read_markdown_file(
pdf_filename=filename,
markdown_dir=markdown_folder,
method=method,
)
if len(md_content_pages) != num_pages_file:
st.warning(f"Number of markdown pages {len(md_content_pages)} does not match the number of pdf pages {num_pages_file}")
if filename:
col1, col2 = st.columns(2)
with col1:
st.subheader("PDF Viewer")
pdf_viewer(pdf_content, width="90%", pages_to_render=[page])
with col2:
st.subheader("Markdown Content")
st.markdown(md_content_pages[page - 1])
st.divider()
st.subheader("Raw text Content")
st.text(md_content_pages[page - 1])