Upload app.py
app.py
CHANGED
@@ -1,32 +1,39 @@
-from pathlib import Path
-import time
 import json
+import time
+from pathlib import Path
 from typing import Literal
-
+
 import requests
 import streamlit as st
-from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
-from streamlit import session_state as ss
-
-from chunknorris.parsers import (AbstractParser, MarkdownParser, HTMLParser, PdfParser, DocxParser, ExcelParser, CSVParser)
 from chunknorris.chunkers import MarkdownChunker
+from chunknorris.parsers import (
+    AbstractParser,
+    CSVParser,
+    DocxParser,
+    ExcelParser,
+    HTMLParser,
+    MarkdownParser,
+    PdfParser,
+)
 from chunknorris.pipelines import PdfPipeline
+from streamlit import session_state as ss
+from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
 
 st.set_page_config(
     layout="wide",
     page_icon="🔪",
     page_title="ChunkNorris demo",
     menu_items={
-
-
-
-
+        "Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
+        "About": "https://wikit-ai.github.io/chunknorris/",
+    },
+)
 
 LOGGER = st.empty()
 
 SAMPLE_FILE = {
-    "sample PDF - 264 pages"
-    "sample PDF - 16 pages"
+    "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
+    "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
     "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
     "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
 }
@@ -38,9 +45,10 @@ if "parsed_md" not in ss:
     ss.parsed_md = ""
 
 if "chunks" not in ss:
-    ss.chunks = []
+    ss.chunks = []  # type: ignore | list[Chunk]
+
 
-def get_parser(fileext : str) -> AbstractParser:
+def get_parser(fileext: str) -> AbstractParser:
     """Get the pipeline for the given filename."""
     match fileext:
         case ".md":
@@ -58,22 +66,22 @@ def get_parser(fileext : str) -> AbstractParser:
         case ".csv":
             parser = CSVParser()
         case _:
-            raise ValueError(
-                "File format not supported by ChunkNorris"
-            )
+            raise ValueError("File format not supported by ChunkNorris")
 
     return parser
 
+
 def get_md_chunker() -> MarkdownChunker:
     """Considering arguments set, returns the md chunker."""
     return MarkdownChunker(
-
-
-
-
-
+        max_headers_to_use=ss.max_headers_to_use,
+        max_chunk_word_count=ss.max_chunk_word_count,
+        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
+        min_chunk_word_count=ss.min_chunk_word_count,
+    )
 
-def parse_and_chunk(uploaded_file : UploadedFile):
+
+def parse_and_chunk(uploaded_file: UploadedFile | None):
     """Parse and chunk the file."""
     if uploaded_file is None:
         log("Please upload a file.", "warning")
@@ -88,7 +96,7 @@ def parse_and_chunk(uploaded_file : UploadedFile):
             case ".pdf":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = PdfPipeline(parser, get_md_chunker())
-                chunks = chunker._get_chunks_using_strategy()
+                chunks = chunker._get_chunks_using_strategy()  # type: ignore
             case ".xlsx":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = get_md_chunker()
@@ -101,29 +109,37 @@ def parse_and_chunk(uploaded_file : UploadedFile):
         ss.parsing_time = time.perf_counter() - start_time
         ss.parsed_md = md_doc.to_string()
         ss.chunks = chunks
-        log(
+        log(
+            f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success"
+        )
 
     except Exception as e:
         log(f"Error when parsing file.", "warning")
         print(e)
         return
 
+
 def save_parsed_md():
     """Save the parsed markdown string to a md file."""
     return ss.parsed_md.encode("utf-8")
 
+
 def save_chunks():
     """Save the parsed chunks to a json file."""
-    return json.dumps(
-
-
-
-
-
-
-
-
-        ],
+    return json.dumps(
+        [
+            {
+                k: v
+                for k, v in chunk.model_dump().items()
+                if k not in ["headers", "content"]
+            }
+            | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
+            for chunk in ss.chunks
+        ],
+        indent=4,
+        ensure_ascii=False,
+    ).encode("utf-8")
+
 
 def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
     """Display a warning message."""
@@ -135,6 +151,7 @@ def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
         case "info":
             LOGGER.info(message, icon="ℹ️")
 
+
 def load_sample_file(url: str):
     """Get the file from url"""
     response = requests.get(url)
@@ -146,18 +163,21 @@ def load_sample_file(url: str):
                 data=response.content,
                 type="application/octet-stream",
             ),
-            file_urls=[url]
-
+            file_urls=[url],
+        )
     else:
         print(response.status_code, response.content)
         st.error("Failed to get data.")
         return None
 
+
 st.title("ChunkNorris.")
 st.subheader("*Fast, smart, lightweight document chunking.*")
 
 st.sidebar.header("Chunking settings")
-st.sidebar.markdown(
+st.sidebar.markdown(
+    "| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |"
+)
 st.sidebar.select_slider(
     label="Max header level to consider for chunking",
     options=["h1", "h2", "h3", "h4", "h5", "h6"],
@@ -165,7 +185,7 @@ st.sidebar.select_slider(
     key="max_headers_to_use",
     help="Max section header level to consider for chunking. Lower level headers won't be used to split a chunk into smaller chunks.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Maximum words per chunk",
@@ -176,7 +196,7 @@ st.sidebar.slider(
     key="max_chunk_word_count",
    help="Maximum number of words per chunk. If a chunk is bigger than this, chunk is split using subsection headers if any are available.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Hard maximum words per chunk",
@@ -187,7 +207,7 @@ st.sidebar.slider(
     key="hard_max_chunk_word_count",
     help="The hard maximum number of words per chunk. If a chunk is bigger than this, chunk is split using newlines, still trying to preverse code blocks or tables integrity.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Minumum words per chunk",
@@ -198,7 +218,7 @@ st.sidebar.slider(
     key="min_chunk_word_count",
     help="The minimum words a chunk must have to avoid being discarded.",
     label_visibility="visible",
-
+)
 
 st.sidebar.checkbox(
     "Prepend headers to chunk's text",
@@ -206,14 +226,27 @@ st.sidebar.checkbox(
     key="prepend_headers_to_chunks",
     label_visibility="visible",
     help="Whether or not all the parent headers should be prepended to the chunk's text content. Might improve retrieval performance of the chunk as it preserves context.",
-
+)
 
-_, col1, col2, _ = st.columns([0.1, .5, .3, 0.1])
+_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
 with col1:
     uploaded_file = st.file_uploader(
         "Upload your own file...",
-        type=[
-
+        type=[
+            "md",
+            "html",
+            "pdf",
+            "docx",
+            "xls",
+            "xlsx",
+            "xlsm",
+            "xlsb",
+            "odf",
+            "ods",
+            "odt",
+            "csv",
+        ],
+    )
 
 with col2:
     sample_file = st.selectbox(
@@ -233,29 +266,32 @@ if uploaded_file is not None:
         on_click=parse_and_chunk,
         args=(uploaded_file,),
         type="primary",
-        use_container_width=True
-
+        use_container_width=True,
+    )
 else:
     st.sidebar.button(
         "Parse & Chunk",
         on_click=log,
-        args=(
+        args=(
+            "You must upload a file first.",
+            "warning",
+        ),
         type="secondary",
-        use_container_width=True
-
+        use_container_width=True,
+    )
     ss.parsed_md = ""
     ss.chunks = []
 
 
 col1, col2 = st.columns(2)
 with col1:
-    if ss.parsed_md:
+    if uploaded_file and ss.parsed_md:
         file_parsed_md = save_parsed_md()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("⚙️ Parsed Document", divider="blue")
         with cola:
-            st.
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_parsed_md,
@@ -264,19 +300,22 @@ with col1:
                 use_container_width=True,
             )
         if Path(uploaded_file.name).suffix.lower() == ".pdf":
-            st.info(
+            st.info(
+                "For the purpose of this demo, OCR on pdf documents is deactivated.",
+                icon="ℹ️",
+            )
         with st.expander("Parsed document", expanded=True):
             with st.container(height=600, border=False):
                 st.markdown(ss.parsed_md)
 
 with col2:
-    if ss.chunks:
+    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
         file_chunks = save_chunks()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("📦 Chunks", divider="blue")
         with cola:
-            st.
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_chunks,
@@ -285,7 +324,9 @@ with col2:
                 use_container_width=True,
             )
         with st.container(border=False):
-            for i, chunk in enumerate(ss.chunks):
+            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                 with st.expander(f"Chunk {i+1}", expanded=False):
                     with st.container(height=300, border=False):
-                        st.markdown(
+                        st.markdown(
+                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
+                        )