mciancone committed
Commit 5414a3b · verified · 1 Parent(s): 63e7632

Upload 2 files

Files changed (2)
  1. app.py +291 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,291 @@
+ from pathlib import Path
+ import time
+ import json
+ from typing import Literal
+ from io import BytesIO
+ import requests
+ import streamlit as st
+ from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
+ from streamlit import session_state as ss
+
+ from chunknorris.parsers import (
+     AbstractParser,
+     MarkdownParser,
+     HTMLParser,
+     PdfParser,
+     DocxParser,
+     ExcelParser,
+     CSVParser,
+ )
+ from chunknorris.chunkers import MarkdownChunker
+ from chunknorris.pipelines import PdfPipeline
+
+ st.set_page_config(
+     layout="wide",
+     page_icon="🔪",
+     page_title="ChunkNorris demo",
+     menu_items={
+         "Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
+         "About": "https://wikit-ai.github.io/chunknorris/",
+     },
+ )
+
+ LOGGER = st.empty()
+
+ SAMPLE_FILE = {
+     "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
+     "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
+     "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
+     "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
+ }
+
+ if "parsing_time" not in ss:
+     ss.parsing_time = 0
+
+ if "parsed_md" not in ss:
+     ss.parsed_md = ""
+
+ if "chunks" not in ss:
+     ss.chunks = []
+
+ def get_parser(fileext: str) -> AbstractParser:
+     """Get the parser suited to the given file extension."""
+     match fileext:
+         case ".md":
+             parser = MarkdownParser()
+         case ".html":
+             parser = HTMLParser()
+         case ".pdf":
+             parser = PdfParser(
+                 use_ocr="never",
+             )
+             log("For the purpose of this demo, OCR on .pdf documents is deactivated.", "info")
+         case ".docx":
+             parser = DocxParser()
+         case ".xls" | ".xlsx" | ".xlsm" | ".xlsb" | ".odf" | ".ods" | ".odt":
+             parser = ExcelParser()
+         case ".csv":
+             parser = CSVParser()
+         case _:
+             raise ValueError("File format not supported by ChunkNorris")
+
+     return parser
+
+ def get_md_chunker() -> MarkdownChunker:
+     """Build the markdown chunker from the settings selected in the sidebar."""
+     return MarkdownChunker(
+         max_headers_to_use=ss.max_headers_to_use,
+         max_chunk_word_count=ss.max_chunk_word_count,
+         hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
+         min_chunk_word_count=ss.min_chunk_word_count,
+     )
+
+ def parse_and_chunk(uploaded_file: UploadedFile):
+     """Parse the uploaded file and chunk the resulting markdown document."""
+     if uploaded_file is None:
+         log("Please upload a file.", "warning")
+         return
+     log("Parsing and chunking...", "info")
+
+     try:
+         fileext = Path(uploaded_file.name).suffix.lower()
+         parser = get_parser(fileext)
+         start_time = time.perf_counter()
+         match fileext:
+             case ".pdf":
+                 md_doc = parser.parse_string(uploaded_file.getvalue())
+                 # the pipeline reuses the parser's state from parse_string to build chunks
+                 chunker = PdfPipeline(parser, get_md_chunker())
+                 chunks = chunker._get_chunks_using_strategy()
+             case ".xls" | ".xlsx" | ".xlsm" | ".xlsb" | ".odf" | ".ods" | ".odt":
+                 # spreadsheet formats are binary: pass the raw bytes to the parser
+                 md_doc = parser.parse_string(uploaded_file.getvalue())
+                 chunker = get_md_chunker()
+                 chunks = chunker.chunk(md_doc)
+             case _:
+                 md_doc = parser.parse_string(uploaded_file.getvalue().decode("utf-8"))
+                 chunker = get_md_chunker()
+                 chunks = chunker.chunk(md_doc)
+
+         ss.parsing_time = time.perf_counter() - start_time
+         ss.parsed_md = md_doc.to_string()
+         ss.chunks = chunks
+         log(f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success")
+
+     except Exception as e:
+         log("Error while parsing the file.", "warning")
+         print(e)
+         return
+
+ def save_parsed_md():
+     """Encode the parsed markdown string for download as a .md file."""
+     return ss.parsed_md.encode("utf-8")
+
+ def save_chunks():
+     """Serialize the chunks for download as a .json file."""
+     return json.dumps(
+         [
+             {
+                 k: v
+                 for k, v in chunk.model_dump().items()
+                 if k not in ["headers", "content"]
+             }
+             | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
+             for chunk in ss.chunks
+         ],
+         indent=4,
+         ensure_ascii=False,
+     ).encode("utf-8")
+
+ def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
+     """Display a log message in the placeholder at the top of the page."""
+     match log_type:
+         case "warning":
+             LOGGER.warning(message, icon="⚠️")
+         case "success":
+             LOGGER.success(message, icon="✅")
+         case "info":
+             LOGGER.info(message, icon="ℹ️")
+
+ def load_sample_file(url: str):
+     """Download the sample file at the given url and wrap it as an UploadedFile."""
+     response = requests.get(url)
+     if response.status_code == 200:
+         return UploadedFile(
+             record=UploadedFileRec(
+                 file_id="sample_file",
+                 name=url.split("/")[-1],
+                 data=response.content,
+                 type="application/octet-stream",
+             ),
+             file_urls=[url],
+         )
+     print(response.status_code, response.content)
+     st.error("Failed to get data.")
+     return None
+
+ st.title("ChunkNorris.")
+ st.subheader("*Fast, smart, lightweight document chunking.*")
+
+ st.sidebar.header("Chunking settings")
+ st.sidebar.markdown("| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |")
+ st.sidebar.select_slider(
+     label="Max header level to consider for chunking",
+     options=["h1", "h2", "h3", "h4", "h5", "h6"],
+     value="h4",
+     key="max_headers_to_use",
+     help="Max section header level to consider for chunking. Lower-level headers won't be used to split a chunk into smaller chunks.",
+     label_visibility="visible",
+ )
+
+ st.sidebar.slider(
+     label="Maximum words per chunk",
+     value=250,
+     min_value=0,
+     max_value=3000,
+     step=50,
+     key="max_chunk_word_count",
+     help="Maximum number of words per chunk. If a chunk is bigger than this, it is split using subsection headers, if any are available.",
+     label_visibility="visible",
+ )
+
+ st.sidebar.slider(
+     label="Hard maximum words per chunk",
+     value=400,
+     min_value=100,
+     max_value=3000,
+     step=50,
+     key="hard_max_chunk_word_count",
+     help="The hard maximum number of words per chunk. If a chunk is bigger than this, it is split using newlines, while still trying to preserve the integrity of code blocks and tables.",
+     label_visibility="visible",
+ )
+
+ st.sidebar.slider(
+     label="Minimum words per chunk",
+     value=10,
+     min_value=0,
+     max_value=50,
+     step=1,
+     key="min_chunk_word_count",
+     help="The minimum number of words a chunk must have to avoid being discarded.",
+     label_visibility="visible",
+ )
+
+ st.sidebar.checkbox(
+     "Prepend headers to chunk's text",
+     value=True,
+     key="prepend_headers_to_chunks",
+     label_visibility="visible",
+     help="Whether or not all the parent headers should be prepended to the chunk's text content. May improve the chunk's retrieval performance, as it preserves context.",
+ )
+
+ _, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
+ with col1:
+     uploaded_file = st.file_uploader(
+         "Upload your own file...",
+         type=["md", "html", "pdf", "docx", "xls", "xlsx", "xlsm", "xlsb", "odf", "ods", "odt", "csv"],
+     )
+
+ with col2:
+     sample_file = st.selectbox(
+         "... Or choose a sample file from the list.",
+         options=list(SAMPLE_FILE.keys()),
+         index=None,
+     )
+     if sample_file is not None:
+         st.markdown(f"[View file]({SAMPLE_FILE[sample_file]})")
+         uploaded_file = load_sample_file(SAMPLE_FILE[sample_file])
+
+
+ if uploaded_file is not None:
+     parse_and_chunk(uploaded_file)
+     st.sidebar.button(
+         "Parse & Chunk",
+         on_click=parse_and_chunk,
+         args=(uploaded_file,),
+         type="primary",
+         use_container_width=True,
+     )
+ else:
+     st.sidebar.button(
+         "Parse & Chunk",
+         on_click=log,
+         args=("You must upload a file first.", "warning"),
+         type="secondary",
+         use_container_width=True,
+     )
+     ss.parsed_md = ""
+     ss.chunks = []
+
+
+ col1, col2 = st.columns(2)
+ with col1:
+     if ss.parsed_md:
+         file_parsed_md = save_parsed_md()
+         cola, colb = st.columns([0.25, 0.75])
+         with colb:
+             st.subheader("⚙️ Parsed Document", divider="blue")
+         with cola:
+             st.write("\n")
+             st.download_button(
+                 label="⬇️ Download",
+                 data=file_parsed_md,
+                 file_name="chunknorris_parsed_document.md",
+                 mime="text/markdown",
+                 use_container_width=True,
+             )
+         with st.expander("Parsed document", expanded=True):
+             with st.container(height=600, border=False):
+                 st.markdown(ss.parsed_md)
+
+ with col2:
+     if ss.chunks:
+         file_chunks = save_chunks()
+         cola, colb = st.columns([0.25, 0.75])
+         with colb:
+             st.subheader("📦 Chunks", divider="blue")
+         with cola:
+             st.write("\n")
+             st.download_button(
+                 label="⬇️ Download",
+                 data=file_chunks,
+                 file_name="chunknorris_chunks.json",
+                 mime="application/json",
+                 use_container_width=True,
+             )
+         with st.container(border=False):
+             for i, chunk in enumerate(ss.chunks):
+                 with st.expander(f"Chunk {i+1}", expanded=False):
+                     with st.container(height=300, border=False):
+                         st.markdown(chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks))
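
For reference, the parse-then-chunk flow that app.py wires into Streamlit can be exercised on its own. Below is a minimal sketch using only the chunknorris calls that appear above (parse_string, chunk, get_text); the input string and the chunker setting are illustrative, not part of the commit:

    from chunknorris.parsers import MarkdownParser
    from chunknorris.chunkers import MarkdownChunker

    # parse a markdown string into chunknorris' document representation
    parser = MarkdownParser()
    md_doc = parser.parse_string("# Title\n\n## Section\n\nSome text to chunk.")

    # chunk it with one of the settings the sidebar exposes
    chunks = MarkdownChunker(max_chunk_word_count=250).chunk(md_doc)
    for chunk in chunks:
        print(chunk.get_text(prepend_headers=True))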
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ chunknorris @ git+https://github.com/wikit-ai/chunknorris
+ pydantic
+ streamlit==1.44.0
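
Assuming both files sit in the same directory, installing the pinned dependencies with `pip install -r requirements.txt` and launching `streamlit run app.py` (the standard Streamlit entry point) should start the demo locally.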