import json
import time
from pathlib import Path
from typing import Literal
import requests
import streamlit as st
from chunknorris.chunkers import MarkdownChunker
from chunknorris.parsers import (
AbstractParser,
CSVParser,
DocxParser,
ExcelParser,
HTMLParser,
MarkdownParser,
PdfParser,
)
from chunknorris.pipelines import PdfPipeline
from streamlit import session_state as ss
from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
st.set_page_config(
layout="wide",
page_icon="🔪",
page_title="ChunkNorris demo",
menu_items={
"Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
"About": "https://wikit-ai.github.io/chunknorris/",
},
)
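# Placeholder at the top of the page, reused by log() below to display status messages.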
LOGGER = st.empty()
SAMPLE_FILE = {
"sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
"sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
"sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
"sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
}
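# Initialize session state defaults so parsing results persist across Streamlit reruns.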
if "parsing_time" not in ss:
ss.parsing_time = 0
if "parsed_md" not in ss:
ss.parsed_md = ""
if "chunks" not in ss:
ss.chunks = [] # type: ignore | list[Chunk]
def get_parser(fileext: str) -> AbstractParser:
"""Get the pipeline for the given filename."""
match fileext:
case ".md":
parser = MarkdownParser()
case ".html":
parser = HTMLParser()
case ".pdf":
parser = PdfParser(
use_ocr="never",
)
case ".docx":
parser = DocxParser()
case ".xls" | ".xlsx" | ".xlsm" | ".xlsb" | ".odf" | ".ods" | ".odt":
parser = ExcelParser()
case ".csv":
parser = CSVParser()
case _:
raise ValueError("File format not supported by ChunkNorris")
return parser
def get_md_chunker() -> MarkdownChunker:
"""Considering arguments set, returns the md chunker."""
return MarkdownChunker(
max_headers_to_use=ss.max_headers_to_use,
max_chunk_word_count=ss.max_chunk_word_count,
hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
min_chunk_word_count=ss.min_chunk_word_count,
)
def parse_and_chunk(uploaded_file: UploadedFile | None):
"""Parse and chunk the file."""
if uploaded_file is None:
log("Please upload a file.", "warning")
return
log("Parsing and chunking...", "info")
try:
fileext = Path(uploaded_file.name).suffix.lower()
parser = get_parser(fileext)
start_time = time.perf_counter()
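        # Route by file type: PDFs go through the PdfPipeline, while every other
        # supported format is parsed to markdown and chunked with the MarkdownChunker.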
match fileext:
case ".pdf":
md_doc = parser.parse_string(uploaded_file.getvalue())
chunker = PdfPipeline(parser, get_md_chunker())
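                # NOTE: this calls a private PdfPipeline method (leading underscore), hence the type: ignore.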
chunks = chunker._get_chunks_using_strategy() # type: ignore
case ".xlsx":
md_doc = parser.parse_string(uploaded_file.getvalue())
chunker = get_md_chunker()
chunks = chunker.chunk(md_doc)
case _:
md_doc = parser.parse_string(uploaded_file.getvalue().decode("utf-8"))
chunker = get_md_chunker()
chunks = chunker.chunk(md_doc)
ss.parsing_time = time.perf_counter() - start_time
ss.parsed_md = md_doc.to_string()
ss.chunks = chunks
log(
f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success"
)
except Exception as e:
log(f"Error when parsing file.", "warning")
print(e)
return
def save_parsed_md():
"""Save the parsed markdown string to a md file."""
return ss.parsed_md.encode("utf-8")
def save_chunks():
"""Save the parsed chunks to a json file."""
return json.dumps(
[
{
k: v
for k, v in chunk.model_dump().items()
if k not in ["headers", "content"]
}
| {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
for chunk in ss.chunks
],
indent=4,
ensure_ascii=False,
).encode("utf-8")
def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
"""Display a warning message."""
match log_type:
case "warning":
LOGGER.warning(message, icon="⚠️")
case "success":
LOGGER.success(message, icon="✅")
case "info":
LOGGER.info(message, icon="ℹ️")
def load_sample_file(url: str):
"""Get the file from url"""
response = requests.get(url)
if response.status_code == 200:
return UploadedFile(
record=UploadedFileRec(
file_id="sample_file",
name=url.split("/")[-1],
data=response.content,
type="application/octet-stream",
),
file_urls=[url],
)
else:
print(response.status_code, response.content)
st.error("Failed to get data.")
return None
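# Page layout: title, sidebar settings, file inputs and results.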
st.title("ChunkNorris.")
st.subheader("*Fast, smart, lightweight document chunking.*")
st.sidebar.header("Chunking settings")
st.sidebar.markdown(
"| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |"
)
st.sidebar.select_slider(
label="Max header level to consider for chunking",
options=["h1", "h2", "h3", "h4", "h5", "h6"],
value="h4",
key="max_headers_to_use",
help="Max section header level to consider for chunking. Lower level headers won't be used to split a chunk into smaller chunks.",
label_visibility="visible",
)
st.sidebar.slider(
label="Maximum words per chunk",
value=250,
min_value=0,
max_value=3000,
step=50,
key="max_chunk_word_count",
help="Maximum number of words per chunk. If a chunk is bigger than this, chunk is split using subsection headers if any are available.",
label_visibility="visible",
)
st.sidebar.slider(
label="Hard maximum words per chunk",
value=400,
min_value=100,
max_value=3000,
step=50,
key="hard_max_chunk_word_count",
help="The hard maximum number of words per chunk. If a chunk is bigger than this, chunk is split using newlines, still trying to preverse code blocks or tables integrity.",
label_visibility="visible",
)
st.sidebar.slider(
label="Minumum words per chunk",
value=10,
min_value=0,
max_value=50,
step=1,
key="min_chunk_word_count",
help="The minimum words a chunk must have to avoid being discarded.",
label_visibility="visible",
)
st.sidebar.checkbox(
"Prepend headers to chunk's text",
value=True,
key="prepend_headers_to_chunks",
label_visibility="visible",
help="Whether or not all the parent headers should be prepended to the chunk's text content. Might improve retrieval performance of the chunk as it preserves context.",
)
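# File input: upload a document or pick one of the sample files.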
_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
with col1:
uploaded_file = st.file_uploader(
"Upload your own file...",
type=[
"md",
"html",
"pdf",
"docx",
"xls",
"xlsx",
"xlsm",
"xlsb",
"odf",
"ods",
"odt",
"csv",
],
)
with col2:
sample_file = st.selectbox(
"... Or choose a sample file from the list.",
options=list(SAMPLE_FILE.keys()),
index=None,
)
if sample_file is not None:
st.markdown(f"[View file]({SAMPLE_FILE[sample_file]})")
uploaded_file = load_sample_file(SAMPLE_FILE[sample_file])
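# When a file is available, the sidebar button runs parsing; otherwise it only warns and previous results are cleared.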
if uploaded_file is not None:
parse_and_chunk(uploaded_file)
st.sidebar.button(
"Parse & Chunk",
on_click=parse_and_chunk,
args=(uploaded_file,),
type="primary",
use_container_width=True,
)
else:
st.sidebar.button(
"Parse & Chunk",
on_click=log,
args=(
"You must upload a file first.",
"warning",
),
type="secondary",
use_container_width=True,
)
ss.parsed_md = ""
ss.chunks = []
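# Results: parsed markdown on the left, chunks on the right.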
col1, col2 = st.columns(2)
with col1:
if uploaded_file and ss.parsed_md:
file_parsed_md = save_parsed_md()
cola, colb = st.columns([0.25, 0.75])
with colb:
st.subheader("⚙️ Parsed Document", divider="blue")
with cola:
st.markdown("\n")
st.download_button(
label="⬇️ Download",
data=file_parsed_md,
file_name="chunknorris_parsed_document.md",
mime="text/markdown",
use_container_width=True,
)
if Path(uploaded_file.name).suffix.lower() == ".pdf":
st.info(
"For the purpose of this demo, OCR on pdf documents is deactivated.",
icon="ℹ️",
)
with st.expander("Parsed document", expanded=True):
with st.container(height=600, border=False):
st.markdown(ss.parsed_md)
with col2:
if uploaded_file and ss.chunks: # type: ignore | list[Chunk]
file_chunks = save_chunks()
cola, colb = st.columns([0.25, 0.75])
with colb:
st.subheader("📦 Chunks", divider="blue")
with cola:
st.markdown("\n")
st.download_button(
label="⬇️ Download",
data=file_chunks,
file_name="chunknorris_chunks.json",
mime="application/json",
use_container_width=True,
)
with st.container(border=False):
for i, chunk in enumerate(ss.chunks): # type: ignore | list[Chunk]
with st.expander(f"Chunk {i+1}", expanded=False):
with st.container(height=300, border=False):
st.markdown(
chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks) # type: ignore | Chunk.get_text()
)