import json
import time
from pathlib import Path
from typing import Literal

import requests
import streamlit as st
from chunknorris.chunkers import MarkdownChunker
from chunknorris.parsers import (
    AbstractParser,
    CSVParser,
    DocxParser,
    ExcelParser,
    HTMLParser,
    MarkdownParser,
    PdfParser,
)
from chunknorris.pipelines import PdfPipeline
from streamlit import session_state as ss
from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec

st.set_page_config(
    layout="wide",
    page_icon="🔪",
    page_title="ChunkNorris demo",
    menu_items={
        "Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
        "About": "https://wikit-ai.github.io/chunknorris/",
    },
)

LOGGER = st.empty()

SAMPLE_FILE = {
    "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
    "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
    "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
    "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
}

if "parsing_time" not in ss:
    ss.parsing_time = 0
if "parsed_md" not in ss:
    ss.parsed_md = ""
if "chunks" not in ss:
    ss.chunks = []  # type: ignore | list[Chunk]


def get_parser(fileext: str) -> AbstractParser:
    """Get the parser suited to the given file extension."""
    match fileext:
        case ".md":
            parser = MarkdownParser()
        case ".html":
            parser = HTMLParser()
        case ".pdf":
            parser = PdfParser(
                use_ocr="never",
            )
        case ".docx":
            parser = DocxParser()
        case ".xls" | ".xlsx" | ".xlsm" | ".xlsb" | ".odf" | ".ods" | ".odt":
            parser = ExcelParser()
        case ".csv":
            parser = CSVParser()
        case _:
            raise ValueError("File format not supported by ChunkNorris")

    return parser


def get_md_chunker() -> MarkdownChunker:
    """Build the MarkdownChunker from the settings chosen in the sidebar."""
    return MarkdownChunker(
        max_headers_to_use=ss.max_headers_to_use,
        max_chunk_word_count=ss.max_chunk_word_count,
        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
        min_chunk_word_count=ss.min_chunk_word_count,
    )


def parse_and_chunk(uploaded_file: UploadedFile | None):
    """Parse and chunk the uploaded file."""
    if uploaded_file is None:
        log("Please upload a file.", "warning")
        return

    log("Parsing and chunking...", "info")
    try:
        fileext = Path(uploaded_file.name).suffix.lower()
        parser = get_parser(fileext)
        start_time = time.perf_counter()
        match fileext:
            case ".pdf":
                md_doc = parser.parse_string(uploaded_file.getvalue())
                chunker = PdfPipeline(parser, get_md_chunker())
                chunks = chunker._get_chunks_using_strategy()  # type: ignore
            case ".xlsx":
                md_doc = parser.parse_string(uploaded_file.getvalue())
                chunker = get_md_chunker()
                chunks = chunker.chunk(md_doc)
            case _:
                md_doc = parser.parse_string(uploaded_file.getvalue().decode("utf-8"))
                chunker = get_md_chunker()
                chunks = chunker.chunk(md_doc)
        ss.parsing_time = time.perf_counter() - start_time
        ss.parsed_md = md_doc.to_string()
        ss.chunks = chunks
        log(
            f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.",
            "success",
        )
    except Exception as e:
        log("Error when parsing file.", "warning")
        print(e)
        return


def save_parsed_md():
    """Return the parsed markdown string as UTF-8 bytes, ready for download."""
    return ss.parsed_md.encode("utf-8")
def save_chunks():
    """Return the chunks serialized as JSON bytes, ready for download."""
    return json.dumps(
        [
            {
                k: v
                for k, v in chunk.model_dump().items()
                if k not in ["headers", "content"]
            }
            | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
            for chunk in ss.chunks
        ],
        indent=4,
        ensure_ascii=False,
    ).encode("utf-8")


def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
    """Display a message in the log placeholder, with an icon matching its level."""
    match log_type:
        case "warning":
            LOGGER.warning(message, icon="⚠️")
        case "success":
            LOGGER.success(message, icon="✅")
        case "info":
            LOGGER.info(message, icon="ℹ️")


def load_sample_file(url: str):
    """Download a sample file from its URL and wrap it as an UploadedFile."""
    response = requests.get(url)
    if response.status_code == 200:
        return UploadedFile(
            record=UploadedFileRec(
                file_id="sample_file",
                name=url.split("/")[-1],
                data=response.content,
                type="application/octet-stream",
            ),
            file_urls=[url],
        )
    else:
        print(response.status_code, response.content)
        st.error("Failed to get data.")
        return None


st.title("ChunkNorris.")
st.subheader("*Fast, smart, lightweight document chunking.*")

st.sidebar.header("Chunking settings")
st.sidebar.markdown(
    "| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |"
)
st.sidebar.select_slider(
    label="Max header level to consider for chunking",
    options=["h1", "h2", "h3", "h4", "h5", "h6"],
    value="h4",
    key="max_headers_to_use",
    help="Max section header level to consider for chunking. Lower-level headers won't be used to split a chunk into smaller chunks.",
    label_visibility="visible",
)
st.sidebar.slider(
    label="Maximum words per chunk",
    value=250,
    min_value=0,
    max_value=3000,
    step=50,
    key="max_chunk_word_count",
    help="Maximum number of words per chunk. If a chunk is bigger than this, it is split using subsection headers, if any are available.",
    label_visibility="visible",
)
st.sidebar.slider(
    label="Hard maximum words per chunk",
    value=400,
    min_value=100,
    max_value=3000,
    step=50,
    key="hard_max_chunk_word_count",
    help="The hard maximum number of words per chunk. If a chunk is bigger than this, it is split on newlines, while trying to preserve the integrity of code blocks and tables.",
    label_visibility="visible",
)
st.sidebar.slider(
    label="Minimum words per chunk",
    value=10,
    min_value=0,
    max_value=50,
    step=1,
    key="min_chunk_word_count",
    help="The minimum number of words a chunk must have to avoid being discarded.",
    label_visibility="visible",
)
st.sidebar.checkbox(
    "Prepend headers to chunk's text",
    value=True,
    key="prepend_headers_to_chunks",
    label_visibility="visible",
    help="Whether all parent headers should be prepended to the chunk's text content. Might improve retrieval performance, as it preserves the chunk's context.",
)

_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])

with col1:
    uploaded_file = st.file_uploader(
        "Upload your own file...",
        type=[
            "md",
            "html",
            "pdf",
            "docx",
            "xls",
            "xlsx",
            "xlsm",
            "xlsb",
            "odf",
            "ods",
            "odt",
            "csv",
        ],
    )
with col2:
    sample_file = st.selectbox(
        "... Or choose a sample file from the list.",
        options=list(SAMPLE_FILE.keys()),
        index=None,
    )
    if sample_file is not None:
        st.markdown(f"[View file]({SAMPLE_FILE[sample_file]})")
        uploaded_file = load_sample_file(SAMPLE_FILE[sample_file])

if uploaded_file is not None:
    parse_and_chunk(uploaded_file)
    st.sidebar.button(
        "Parse & Chunk",
        on_click=parse_and_chunk,
        args=(uploaded_file,),
        type="primary",
        use_container_width=True,
    )
else:
    st.sidebar.button(
        "Parse & Chunk",
        on_click=log,
        args=(
            "You must upload a file first.",
            "warning",
        ),
        type="secondary",
        use_container_width=True,
    )
    ss.parsed_md = ""
    ss.chunks = []

col1, col2 = st.columns(2)

with col1:
    if uploaded_file and ss.parsed_md:
        file_parsed_md = save_parsed_md()
        cola, colb = st.columns([0.25, 0.75])
        with colb:
            st.subheader("⚙️ Parsed Document", divider="blue")
        with cola:
            st.markdown("\n")
            st.download_button(
                label="⬇️ Download",
                data=file_parsed_md,
                file_name="chunknorris_parsed_document.md",
                mime="text/markdown",
                use_container_width=True,
            )
        if Path(uploaded_file.name).suffix.lower() == ".pdf":
            st.info(
                "For the purpose of this demo, OCR on pdf documents is deactivated.",
                icon="ℹ️",
            )
        with st.expander("Parsed document", expanded=True):
            with st.container(height=600, border=False):
                st.markdown(ss.parsed_md)

with col2:
    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
        file_chunks = save_chunks()
        cola, colb = st.columns([0.25, 0.75])
        with colb:
            st.subheader("📦 Chunks", divider="blue")
        with cola:
            st.markdown("\n")
            st.download_button(
                label="⬇️ Download",
                data=file_chunks,
                file_name="chunknorris_chunks.json",
                mime="application/json",
                use_container_width=True,
            )
        with st.container(border=False):
            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                with st.expander(f"Chunk {i+1}", expanded=False):
                    with st.container(height=300, border=False):
                        st.markdown(
                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
                        )
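# Usage note (a minimal sketch, assuming this script is saved as app.py and the
# dependencies are available in the current environment):
#
#   pip install chunknorris streamlit requests
#   streamlit run app.py
#
# Streamlit then serves the demo locally, by default on http://localhost:8501.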