# Spaces: Running
import os
import re
from pathlib import Path
from typing import Literal

import streamlit as st
from streamlit_pdf_viewer import pdf_viewer
# Use the full browser width so the PDF and its markdown fit side by side.
st.set_page_config(layout="wide")

# Define the paths to your PDF and Markdown folders
repo_folder = Path(__file__).resolve().parent
pdf_folder = repo_folder / "pdfs"

# Page count of each PDF, keyed by file name; used to build the page selector.
pdf_pages = {
    "2023-conocophillips-aim-presentation-1-7.pdf": 7,
    "deloitte-tech-risk-sector-banking.pdf": 2,
    "dttl-tax-technology-report-2023.pdf": 17,
    "gx-iif-open-data.pdf": 32,
    "life-sciences-smart-manufacturing-services-peak-matrix-assessment-2023.pdf": 15,
    "XC9500_CPLD_Family-1-4.pdf": 4,
}

# os.listdir returns entries in arbitrary order and includes every directory
# entry (hidden files, sub-folders, ...). Keep only PDFs and sort them so the
# file selector is stable across runs and never offers a non-PDF entry.
pdfs_files = sorted(
    name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
)
def read_markdown_file( | |
markdown_dir: Path, | |
pdf_filename: str, | |
method: Literal["docling", "llamaparse", "marker-pdf", "pymupdf"], | |
) -> list[str]: | |
match method: | |
case "docling" | "llamaparse": | |
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md") | |
md_content = md_path.read_text() | |
md_content_pages = md_content.split("\n---\n") | |
case "marker-pdf": | |
md_path = markdown_dir / pdf_filename.replace(".pdf", "") / pdf_filename.replace(".pdf", ".md") | |
md_content = md_path.read_text() | |
pattern = r"\n\{\d+\}-+\s\n*" | |
md_content_pages = re.split(pattern, md_content) | |
case "gemini" | "mistral": | |
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md") | |
md_content = md_path.read_text() | |
pattern = r"--- end page \d+" | |
md_content_pages = re.split(pattern, md_content) | |
if not md_content_pages[-1].strip(): | |
md_content_pages.pop() | |
case "pymupdf": | |
md_path = markdown_dir / pdf_filename.replace(".pdf", ".md") | |
md_content = md_path.read_text() | |
md_content_pages = md_content.split("\n-----\n") | |
if not md_content_pages[-1].strip(): | |
md_content_pages.pop() | |
case _: | |
raise NotImplementedError(f"Method {method} not implemented") | |
return md_content_pages | |
def read_pdf_file(pdf_path: Path) -> bytes:
    """Return the raw bytes of the PDF stored at ``pdf_path``."""
    with pdf_path.open("rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    return raw_bytes
st.title("PDF and Markdown Viewer")
st.markdown("""
This is a demo of different PDF parsing methods.
For more information, see:
- [Blog post](https://nbrosse.github.io/posts/pdf-parsing/pdf-parsing.html)
- [GitHub repository](https://github.com/nbrosse/pdf-parsing)
""")

method = st.selectbox("Choose method", options=["docling", "llamaparse", "marker-pdf", "pymupdf", "gemini", "mistral"])
markdown_folder = repo_folder / f"{method}-folder" / "md"
filename = st.selectbox("Choose filename", options=pdfs_files)

# Validate the selection before it is used: with no options the selectbox
# yields no filename, and everything below needs one.
if not filename:
    st.warning("No PDF file available")
    st.stop()

# A PDF dropped into the folder without a registered page count would
# otherwise abort the app with a KeyError.
num_pages_file = pdf_pages.get(filename)
if num_pages_file is None:
    st.error(f"Number of pages unknown for {filename}")
    st.stop()

page = st.selectbox("Choose page", options=list(range(1, num_pages_file + 1)))
pdf_content = read_pdf_file(pdf_folder / filename)

# Not every method necessarily has output for every PDF; report it in the UI
# instead of crashing with a traceback.
try:
    md_content_pages = read_markdown_file(
        pdf_filename=filename,
        markdown_dir=markdown_folder,
        method=method,
    )
except FileNotFoundError:
    st.error(f"No markdown file found for {filename} with method {method}")
    st.stop()

if len(md_content_pages) != num_pages_file:
    st.warning(f"Number of markdown pages {len(md_content_pages)} does not match the number of pdf pages {num_pages_file}")

col1, col2 = st.columns(2)
with col1:
    st.subheader("PDF Viewer")
    pdf_viewer(pdf_content, width="90%", pages_to_render=[page])
with col2:
    st.subheader("Markdown Content")
    # The markdown may contain fewer chunks than the PDF has pages (the
    # mismatch warned about above), so guard the index.
    if page <= len(md_content_pages):
        page_markdown = md_content_pages[page - 1]
        st.markdown(page_markdown)
        st.divider()
        st.subheader("Raw text Content")
        st.text(page_markdown)
    else:
        st.info(f"No markdown content available for page {page}")