mciancone committed
Commit ed8fac8 · verified · 1 Parent(s): 82c01c8

Upload app.py

Files changed (1)
  1. app.py +100 -59
app.py CHANGED
@@ -1,32 +1,39 @@
-from pathlib import Path
-import time
 import json
+import time
+from pathlib import Path
 from typing import Literal
-from io import BytesIO
+
 import requests
 import streamlit as st
-from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
-from streamlit import session_state as ss
-
-from chunknorris.parsers import (AbstractParser, MarkdownParser, HTMLParser, PdfParser, DocxParser, ExcelParser, CSVParser)
 from chunknorris.chunkers import MarkdownChunker
+from chunknorris.parsers import (
+    AbstractParser,
+    CSVParser,
+    DocxParser,
+    ExcelParser,
+    HTMLParser,
+    MarkdownParser,
+    PdfParser,
+)
 from chunknorris.pipelines import PdfPipeline
+from streamlit import session_state as ss
+from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec

 st.set_page_config(
     layout="wide",
     page_icon="🔪",
     page_title="ChunkNorris demo",
     menu_items={
-        'Report a bug': "https://github.com/wikit-ai/chunknorris/issues",
-        'About': "https://wikit-ai.github.io/chunknorris/"
-    }
-)
+        "Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
+        "About": "https://wikit-ai.github.io/chunknorris/",
+    },
+)

 LOGGER = st.empty()

 SAMPLE_FILE = {
-    "sample PDF - 264 pages" : "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
-    "sample PDF - 16 pages" : "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
+    "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
+    "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
     "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
     "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
 }
@@ -38,9 +45,10 @@ if "parsed_md" not in ss:
     ss.parsed_md = ""

 if "chunks" not in ss:
-    ss.chunks = []
+    ss.chunks = []  # type: ignore | list[Chunk]
+

-def get_parser(fileext : str) -> AbstractParser:
+def get_parser(fileext: str) -> AbstractParser:
     """Get the pipeline for the given filename."""
     match fileext:
         case ".md":
@@ -58,22 +66,22 @@ def get_parser(fileext : str) -> AbstractParser:
         case ".csv":
             parser = CSVParser()
         case _:
-            raise ValueError(
-                "File format not supported by ChunkNorris"
-            )
+            raise ValueError("File format not supported by ChunkNorris")

     return parser

+
 def get_md_chunker() -> MarkdownChunker:
     """Considering arguments set, returns the md chunker."""
     return MarkdownChunker(
-        max_headers_to_use=ss.max_headers_to_use,
-        max_chunk_word_count=ss.max_chunk_word_count,
-        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
-        min_chunk_word_count=ss.min_chunk_word_count,
-        )
+        max_headers_to_use=ss.max_headers_to_use,
+        max_chunk_word_count=ss.max_chunk_word_count,
+        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
+        min_chunk_word_count=ss.min_chunk_word_count,
+    )

-def parse_and_chunk(uploaded_file : UploadedFile):
+
+def parse_and_chunk(uploaded_file: UploadedFile | None):
     """Parse and chunk the file."""
     if uploaded_file is None:
         log("Please upload a file.", "warning")
@@ -88,7 +96,7 @@ def parse_and_chunk(uploaded_file : UploadedFile):
             case ".pdf":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = PdfPipeline(parser, get_md_chunker())
-                chunks = chunker._get_chunks_using_strategy()
+                chunks = chunker._get_chunks_using_strategy()  # type: ignore
             case ".xlsx":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = get_md_chunker()
@@ -101,29 +109,37 @@ def parse_and_chunk(uploaded_file : UploadedFile):
         ss.parsing_time = time.perf_counter() - start_time
         ss.parsed_md = md_doc.to_string()
         ss.chunks = chunks
-        log(f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success")
+        log(
+            f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success"
+        )

     except Exception as e:
         log(f"Error when parsing file.", "warning")
         print(e)
         return

+
 def save_parsed_md():
     """Save the parsed markdown string to a md file."""
     return ss.parsed_md.encode("utf-8")

+
 def save_chunks():
     """Save the parsed chunks to a json file."""
-    return json.dumps([
-        {
-            k:v
-            for k,v in chunk.model_dump().items()
-            if k not in ["headers","content"]
-        } | {
-            "text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)
-        }
-        for chunk in ss.chunks
-    ], indent=4, ensure_ascii=False).encode("utf-8")
+    return json.dumps(
+        [
+            {
+                k: v
+                for k, v in chunk.model_dump().items()
+                if k not in ["headers", "content"]
+            }
+            | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
+            for chunk in ss.chunks
+        ],
+        indent=4,
+        ensure_ascii=False,
+    ).encode("utf-8")
+

 def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
     """Display a warning message."""
@@ -135,6 +151,7 @@ def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
         case "info":
             LOGGER.info(message, icon="ℹ️")

+
 def load_sample_file(url: str):
     """Get the file from url"""
     response = requests.get(url)
@@ -146,18 +163,21 @@ def load_sample_file(url: str):
                 data=response.content,
                 type="application/octet-stream",
             ),
-            file_urls=[url]
-            )
+            file_urls=[url],
+        )
     else:
         print(response.status_code, response.content)
         st.error("Failed to get data.")
         return None

+
 st.title("ChunkNorris.")
 st.subheader("*Fast, smart, lightweight document chunking.*")

 st.sidebar.header("Chunking settings")
-st.sidebar.markdown("| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |")
+st.sidebar.markdown(
+    "| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |"
+)
 st.sidebar.select_slider(
     label="Max header level to consider for chunking",
     options=["h1", "h2", "h3", "h4", "h5", "h6"],
@@ -165,7 +185,7 @@ st.sidebar.select_slider(
     key="max_headers_to_use",
     help="Max section header level to consider for chunking. Lower level headers won't be used to split a chunk into smaller chunks.",
     label_visibility="visible",
-    )
+)

 st.sidebar.slider(
     label="Maximum words per chunk",
@@ -176,7 +196,7 @@ st.sidebar.slider(
     key="max_chunk_word_count",
     help="Maximum number of words per chunk. If a chunk is bigger than this, chunk is split using subsection headers if any are available.",
     label_visibility="visible",
-    )
+)

 st.sidebar.slider(
     label="Hard maximum words per chunk",
@@ -187,7 +207,7 @@ st.sidebar.slider(
     key="hard_max_chunk_word_count",
     help="The hard maximum number of words per chunk. If a chunk is bigger than this, chunk is split using newlines, still trying to preverse code blocks or tables integrity.",
     label_visibility="visible",
-    )
+)

 st.sidebar.slider(
     label="Minumum words per chunk",
@@ -198,7 +218,7 @@ st.sidebar.slider(
     key="min_chunk_word_count",
     help="The minimum words a chunk must have to avoid being discarded.",
     label_visibility="visible",
-    )
+)

 st.sidebar.checkbox(
     "Prepend headers to chunk's text",
@@ -206,14 +226,27 @@ st.sidebar.checkbox(
     key="prepend_headers_to_chunks",
     label_visibility="visible",
     help="Whether or not all the parent headers should be prepended to the chunk's text content. Might improve retrieval performance of the chunk as it preserves context.",
-    )
+)

-_, col1, col2, _ = st.columns([0.1, .5, .3, 0.1])
+_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
 with col1:
     uploaded_file = st.file_uploader(
         "Upload your own file...",
-        type=["md", "html", "pdf", "docx", "xls", "xlsx", "xlsm", "xlsb", "odf", "ods", "odt", "csv"],
-    )
+        type=[
+            "md",
+            "html",
+            "pdf",
+            "docx",
+            "xls",
+            "xlsx",
+            "xlsm",
+            "xlsb",
+            "odf",
+            "ods",
+            "odt",
+            "csv",
+        ],
+    )

 with col2:
     sample_file = st.selectbox(
@@ -233,29 +266,32 @@ if uploaded_file is not None:
         on_click=parse_and_chunk,
         args=(uploaded_file,),
         type="primary",
-        use_container_width=True
-        )
+        use_container_width=True,
+    )
 else:
     st.sidebar.button(
         "Parse & Chunk",
         on_click=log,
-        args=("You must upload a file first.", "warning",),
+        args=(
+            "You must upload a file first.",
+            "warning",
+        ),
         type="secondary",
-        use_container_width=True
-        )
+        use_container_width=True,
+    )
     ss.parsed_md = ""
     ss.chunks = []


 col1, col2 = st.columns(2)
 with col1:
-    if ss.parsed_md:
+    if uploaded_file and ss.parsed_md:
         file_parsed_md = save_parsed_md()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("⚙️ Parsed Document", divider="blue")
         with cola:
-            st.write("\n")
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_parsed_md,
@@ -264,19 +300,22 @@ with col1:
                 use_container_width=True,
             )
         if Path(uploaded_file.name).suffix.lower() == ".pdf":
-            st.info("For the purpose of this demo, OCR on pdf documents is deactivated.", icon="ℹ️")
+            st.info(
+                "For the purpose of this demo, OCR on pdf documents is deactivated.",
+                icon="ℹ️",
+            )
         with st.expander("Parsed document", expanded=True):
             with st.container(height=600, border=False):
                 st.markdown(ss.parsed_md)

 with col2:
-    if ss.chunks:
+    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
         file_chunks = save_chunks()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("📦 Chunks", divider="blue")
         with cola:
-            st.write("\n")
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_chunks,
@@ -285,7 +324,9 @@ with col2:
                 use_container_width=True,
             )
         with st.container(border=False):
-            for i, chunk in enumerate(ss.chunks):
+            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                 with st.expander(f"Chunk {i+1}", expanded=False):
                     with st.container(height=300, border=False):
-                        st.markdown(chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks))
+                        st.markdown(
+                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
+                        )
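
For reference, the parse-and-chunk flow that app.py wraps can be reproduced outside Streamlit. The sketch below uses only calls visible in this diff (the parser and chunker constructors, the MarkdownChunker keyword arguments, parse_string(), and get_text()); the chunker.chunk(md_doc) entry point and the concrete settings values are illustrative assumptions, not taken from app.py, so check the ChunkNorris documentation linked in the sidebar before relying on them.

# Minimal sketch of the flow app.py wraps (assumes `pip install chunknorris`).
# Only the constructors, keyword names, parse_string() and get_text() appear
# in the diff above; chunker.chunk() and the settings values are assumptions.
from chunknorris.chunkers import MarkdownChunker
from chunknorris.parsers import MarkdownParser

parser = MarkdownParser()
md_doc = parser.parse_string("# Title\n\nSome markdown text to chunk.")

chunker = MarkdownChunker(
    max_headers_to_use="h4",        # the app's sidebar exposes "h1" through "h6"
    max_chunk_word_count=200,       # placeholder value
    hard_max_chunk_word_count=400,  # placeholder value
    min_chunk_word_count=15,        # placeholder value
)
chunks = chunker.chunk(md_doc)  # assumed MarkdownChunker entry point

for chunk in chunks:
    # Same call app.py makes when rendering and exporting each chunk.
    print(chunk.get_text(prepend_headers=True))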