Upload 2 files
- app.py +291 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,291 @@
from pathlib import Path
import time
import json
from typing import Literal
from io import BytesIO
import requests
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
from streamlit import session_state as ss

from chunknorris.parsers import (AbstractParser, MarkdownParser, HTMLParser, PdfParser, DocxParser, ExcelParser, CSVParser)
from chunknorris.chunkers import MarkdownChunker
from chunknorris.pipelines import PdfPipeline

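# Overview: the sidebar exposes the MarkdownChunker settings, the main area
# takes a file (uploaded or a remote sample), and parse_and_chunk() stores the
# parsed markdown and resulting chunks in Streamlit's session state.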
st.set_page_config(
    layout="wide",
    page_icon="🔪",
    page_title="ChunkNorris demo",
    menu_items={
        'Report a bug': "https://github.com/wikit-ai/chunknorris/issues",
        'About': "https://wikit-ai.github.io/chunknorris/"
    }
)

LOGGER = st.empty()

SAMPLE_FILE = {
    "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
    "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
    "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
    "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
}

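# Session-state defaults, so results survive Streamlit reruns.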
if "parsing_time" not in ss:
    ss.parsing_time = 0

if "parsed_md" not in ss:
    ss.parsed_md = ""

if "chunks" not in ss:
    ss.chunks = []

def get_parser(fileext: str) -> AbstractParser:
    """Get the parser for the given file extension."""
    match fileext:
        case ".md":
            parser = MarkdownParser()
        case ".html":
            parser = HTMLParser()
        case ".pdf":
            parser = PdfParser(
                use_ocr="never",
            )
            log("For the purpose of this demo, OCR on .pdf documents is deactivated.", "info")
        case ".docx":
            parser = DocxParser()
        case ".xls" | ".xlsx" | ".xlsm" | ".xlsb" | ".odf" | ".ods" | ".odt":
            parser = ExcelParser()
        case ".csv":
            parser = CSVParser()
        case _:
            raise ValueError(
                "File format not supported by ChunkNorris"
            )

    return parser

def get_md_chunker() -> MarkdownChunker:
    """Considering the arguments set, returns the md chunker."""
    return MarkdownChunker(
        max_headers_to_use=ss.max_headers_to_use,
        max_chunk_word_count=ss.max_chunk_word_count,
        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
        min_chunk_word_count=ss.min_chunk_word_count,
    )

def parse_and_chunk(uploaded_file: UploadedFile):
    """Parse and chunk the file."""
    if uploaded_file is None:
        log("Please upload a file.", "warning")
        return
    log("Parsing and chunking...", "info")

    try:
        fileext = Path(uploaded_file.name).suffix.lower()
        parser = get_parser(fileext)
        start_time = time.perf_counter()
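        # Dispatch on file type: PDFs go through PdfPipeline (note the call to
        # its private _get_chunks_using_strategy method); other formats are
        # parsed to markdown and chunked directly by the MarkdownChunker.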
        match fileext:
            case ".pdf":
                md_doc = parser.parse_string(uploaded_file.getvalue())
                chunker = PdfPipeline(parser, get_md_chunker())
                chunks = chunker._get_chunks_using_strategy()
            case ".xlsx":
                md_doc = parser.parse_string(uploaded_file.getvalue())
                chunker = get_md_chunker()
                chunks = chunker.chunk(md_doc)
            case _:
                md_doc = parser.parse_string(uploaded_file.getvalue().decode("utf-8"))
                chunker = get_md_chunker()
                chunks = chunker.chunk(md_doc)

        ss.parsing_time = time.perf_counter() - start_time
        ss.parsed_md = md_doc.to_string()
        ss.chunks = chunks
        log(f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success")

    except Exception as e:
        log("Error when parsing file.", "warning")
        print(e)
        return

def save_parsed_md():
    """Save the parsed markdown string to a md file."""
    return ss.parsed_md.encode("utf-8")

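# Each chunk's metadata (from model_dump) is merged, via the dict union
# operator, with a "text" field holding the chunk's rendered content.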
def save_chunks():
    """Save the parsed chunks to a json file."""
    return json.dumps(
        [
            {
                k: v
                for k, v in chunk.model_dump().items()
                if k not in ["headers", "content"]
            }
            | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
            for chunk in ss.chunks
        ],
        indent=4,
        ensure_ascii=False,
    ).encode("utf-8")

def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
    """Display a log message in the placeholder at the top of the page."""
    match log_type:
        case "warning":
            LOGGER.warning(message, icon="⚠️")
        case "success":
            LOGGER.success(message, icon="✅")
        case "info":
            LOGGER.info(message, icon="ℹ️")

def load_sample_file(url: str):
    """Get the file from url and wrap it as an UploadedFile."""
    response = requests.get(url)
    if response.status_code == 200:
        return UploadedFile(
            record=UploadedFileRec(
                file_id="sample_file",
                name=url.split("/")[-1],
                data=response.content,
                type="application/octet-stream",
            ),
            file_urls=[url],
        )
    else:
        print(response.status_code, response.content)
        st.error("Failed to get data.")
        return None

st.title("ChunkNorris.")
st.subheader("*Fast, smart, lightweight document chunking.*")

st.sidebar.header("Chunking settings")
st.sidebar.markdown("| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |")
st.sidebar.select_slider(
    label="Max header level to consider for chunking",
    options=["h1", "h2", "h3", "h4", "h5", "h6"],
    value="h4",
    key="max_headers_to_use",
    help="Max section header level to consider for chunking. Lower level headers won't be used to split a chunk into smaller chunks.",
    label_visibility="visible",
)

st.sidebar.slider(
    label="Maximum words per chunk",
    value=250,
    min_value=0,
    max_value=3000,
    step=50,
    key="max_chunk_word_count",
    help="Maximum number of words per chunk. If a chunk is bigger than this, it is split using subsection headers if any are available.",
    label_visibility="visible",
)

st.sidebar.slider(
    label="Hard maximum words per chunk",
    value=400,
    min_value=100,
    max_value=3000,
    step=50,
    key="hard_max_chunk_word_count",
    help="The hard maximum number of words per chunk. If a chunk is bigger than this, it is split using newlines, while still trying to preserve the integrity of code blocks and tables.",
    label_visibility="visible",
)

st.sidebar.slider(
    label="Minimum words per chunk",
    value=10,
    min_value=0,
    max_value=50,
    step=1,
    key="min_chunk_word_count",
    help="The minimum number of words a chunk must have to avoid being discarded.",
    label_visibility="visible",
)

st.sidebar.checkbox(
    "Prepend headers to chunk's text",
    value=True,
    key="prepend_headers_to_chunks",
    label_visibility="visible",
    help="Whether or not all the parent headers should be prepended to the chunk's text content. Might improve retrieval performance as it preserves context.",
)

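# Each widget's key stores its value in st.session_state, which is where
# get_md_chunker() reads the chunking parameters from.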
_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
with col1:
    uploaded_file = st.file_uploader(
        "Upload your own file...",
        type=["md", "html", "pdf", "docx", "xls", "xlsx", "xlsm", "xlsb", "odf", "ods", "odt", "csv"],
    )

with col2:
    sample_file = st.selectbox(
        "... Or choose a sample file from the list.",
        options=list(SAMPLE_FILE.keys()),
        index=None,
    )
    if sample_file is not None:
        st.markdown(f"[View file]({SAMPLE_FILE[sample_file]})")
        uploaded_file = load_sample_file(SAMPLE_FILE[sample_file])

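# A file is parsed as soon as it is selected; the sidebar button simply
# re-runs parse_and_chunk with the current sidebar settings.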
if uploaded_file is not None:
    parse_and_chunk(uploaded_file)
    st.sidebar.button(
        "Parse & Chunk",
        on_click=parse_and_chunk,
        args=(uploaded_file,),
        type="primary",
        use_container_width=True,
    )
else:
    st.sidebar.button(
        "Parse & Chunk",
        on_click=log,
        args=("You must upload a file first.", "warning"),
        type="secondary",
        use_container_width=True,
    )
    ss.parsed_md = ""
    ss.chunks = []

col1, col2 = st.columns(2)
with col1:
    if ss.parsed_md:
        file_parsed_md = save_parsed_md()
        cola, colb = st.columns([0.25, 0.75])
        with colb:
            st.subheader("⚙️ Parsed Document", divider="blue")
        with cola:
            st.write("\n")
            st.download_button(
                label="⬇️ Download",
                data=file_parsed_md,
                file_name="chunknorris_parsed_document.md",
                mime="text/markdown",
                use_container_width=True,
            )
        with st.expander("Parsed document", expanded=True):
            with st.container(height=600, border=False):
                st.markdown(ss.parsed_md)

with col2:
    if ss.chunks:
        file_chunks = save_chunks()
        cola, colb = st.columns([0.25, 0.75])
        with colb:
            st.subheader("📦 Chunks", divider="blue")
        with cola:
            st.write("\n")
            st.download_button(
                label="⬇️ Download",
                data=file_chunks,
                file_name="chunknorris_chunks.json",
                mime="application/json",
                use_container_width=True,
            )
        with st.container(border=False):
            for i, chunk in enumerate(ss.chunks):
                with st.expander(f"Chunk {i+1}", expanded=False):
                    with st.container(height=300, border=False):
                        st.markdown(chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks))
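# To try the Space locally (a sketch, assuming app.py and requirements.txt
# sit in the same directory):
#   pip install -r requirements.txt
#   streamlit run app.py
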
requirements.txt
ADDED
@@ -0,0 +1,3 @@
chunknorris @ git+https://github.com/wikit-ai/chunknorris
pydantic
streamlit==1.44.0