mciancone committed
Commit ed2cad0 · verified
Parent: ed8fac8

Upload 2 files

Files changed (1)
  1. app.py +21 -7
app.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Literal
 
 import requests
-import streamlit as st
+import tiktoken
 from chunknorris.chunkers import MarkdownChunker
 from chunknorris.parsers import (
     AbstractParser,
@@ -16,6 +16,7 @@ from chunknorris.parsers import (
     PdfParser,
 )
 from chunknorris.pipelines import PdfPipeline
+import streamlit as st
 from streamlit import session_state as ss
 from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
 
@@ -78,6 +79,8 @@ def get_md_chunker() -> MarkdownChunker:
         max_chunk_word_count=ss.max_chunk_word_count,
         hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
         min_chunk_word_count=ss.min_chunk_word_count,
+        hard_max_chunk_token_count=ss.hard_max_chunk_token_count,
+        tokenizer=tiktoken.encoding_for_model("text-embedding-ada-002"),
     )
 
 
@@ -96,7 +99,7 @@ def parse_and_chunk(uploaded_file: UploadedFile | None):
         case ".pdf":
             md_doc = parser.parse_string(uploaded_file.getvalue())
             chunker = PdfPipeline(parser, get_md_chunker())
-            chunks = chunker._get_chunks_using_strategy()  # type: ignore
+            chunks = chunker._get_chunks_using_strategy()  # type: ignore
         case ".xlsx":
             md_doc = parser.parse_string(uploaded_file.getvalue())
             chunker = get_md_chunker()
@@ -188,7 +191,7 @@ st.sidebar.select_slider(
 )
 
 st.sidebar.slider(
-    label="Maximum words per chunk",
+    label="Maximum words (soft maximum) per chunk",
     value=250,
     min_value=0,
     max_value=3000,
@@ -199,7 +202,7 @@ st.sidebar.slider(
 )
 
 st.sidebar.slider(
-    label="Hard maximum words per chunk",
+    label="Maximum words (hard maximum) per chunk",
    value=400,
     min_value=100,
     max_value=3000,
@@ -209,6 +212,17 @@ st.sidebar.slider(
     label_visibility="visible",
 )
 
+st.sidebar.slider(
+    label="Maximum token (hard maximum) per chunk",
+    value=400,
+    min_value=100,
+    max_value=8000,
+    step=100,
+    key="hard_max_chunk_token_count",
+    help="The hard maximum number of tokens per chunk. If a chunk is bigger than this, chunk is split using newlines. Applied after the word-based chunking",
+    label_visibility="visible",
+)
+
 st.sidebar.slider(
     label="Minumum words per chunk",
     value=10,
@@ -309,7 +323,7 @@ with col1:
     st.markdown(ss.parsed_md)
 
 with col2:
-    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
+    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
         file_chunks = save_chunks()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
@@ -324,9 +338,9 @@ with col2:
                 use_container_width=True,
             )
         with st.container(border=False):
-            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
+            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                 with st.expander(f"Chunk {i+1}", expanded=False):
                     with st.container(height=300, border=False):
                         st.markdown(
-                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
+                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
                         )
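Note: the substance of this commit is the new token-based hard limit wired into the chunker. As a minimal stand-alone sketch of the resulting configuration, assuming get_md_chunker() passes these keywords straight to MarkdownChunker (only the keyword names come from the diff; the values below are the sidebar slider defaults, hard-coded here for illustration):

import tiktoken
from chunknorris.chunkers import MarkdownChunker

# Hypothetical stand-alone equivalent of get_md_chunker(), with the
# Streamlit session-state values replaced by the sliders' defaults.
chunker = MarkdownChunker(
    max_chunk_word_count=250,        # "Maximum words (soft maximum) per chunk"
    hard_max_chunk_word_count=400,   # "Maximum words (hard maximum) per chunk"
    min_chunk_word_count=10,         # "Minumum words per chunk"
    hard_max_chunk_token_count=400,  # new: "Maximum token (hard maximum) per chunk"
    tokenizer=tiktoken.encoding_for_model("text-embedding-ada-002"),
)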
 
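Note: the new slider's help text says that a chunk exceeding the token cap is split on newlines, after the word-based chunking. As a rough illustration of that idea (not chunknorris's actual implementation; split_on_newlines is a hypothetical helper), a greedy newline split under a tiktoken budget might look like:

import tiktoken

enc = tiktoken.encoding_for_model("text-embedding-ada-002")

def split_on_newlines(text: str, hard_max_tokens: int = 400) -> list[str]:
    # Illustrative sketch only: greedily pack whole lines until the token
    # budget is reached, then start a new piece. A single line longer than
    # the budget still becomes its own piece.
    pieces: list[str] = []
    current: list[str] = []
    used = 0
    for line in text.splitlines():
        n = len(enc.encode(line + "\n"))  # count the trailing newline too
        if current and used + n > hard_max_tokens:
            pieces.append("\n".join(current))
            current, used = [], 0
        current.append(line)
        used += n
    if current:
        pieces.append("\n".join(current))
    return pieces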