Upload 2 files
app.py CHANGED

@@ -4,7 +4,7 @@ from pathlib import Path
 from typing import Literal
 
 import requests
-import streamlit as st
+import tiktoken
 from chunknorris.chunkers import MarkdownChunker
 from chunknorris.parsers import (
     AbstractParser,
@@ -16,6 +16,7 @@ from chunknorris.parsers import (
     PdfParser,
 )
 from chunknorris.pipelines import PdfPipeline
+import streamlit as st
 from streamlit import session_state as ss
 from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
 
@@ -78,6 +79,8 @@ def get_md_chunker() -> MarkdownChunker:
         max_chunk_word_count=ss.max_chunk_word_count,
         hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
         min_chunk_word_count=ss.min_chunk_word_count,
+        hard_max_chunk_token_count=ss.hard_max_chunk_token_count,
+        tokenizer=tiktoken.encoding_for_model("text-embedding-ada-002"),
     )
 
 
@@ -96,7 +99,7 @@ def parse_and_chunk(uploaded_file: UploadedFile | None):
         case ".pdf":
             md_doc = parser.parse_string(uploaded_file.getvalue())
             chunker = PdfPipeline(parser, get_md_chunker())
-            chunks = chunker._get_chunks_using_strategy()
+            chunks = chunker._get_chunks_using_strategy()  # type: ignore
         case ".xlsx":
             md_doc = parser.parse_string(uploaded_file.getvalue())
             chunker = get_md_chunker()
@@ -188,7 +191,7 @@ st.sidebar.select_slider(
 )
 
 st.sidebar.slider(
-    label="Maximum words per chunk",
+    label="Maximum words (soft maximum) per chunk",
     value=250,
     min_value=0,
     max_value=3000,
@@ -199,7 +202,7 @@ st.sidebar.slider(
 )
 
 st.sidebar.slider(
-    label="
+    label="Maximum words (hard maximum) per chunk",
     value=400,
     min_value=100,
     max_value=3000,
@@ -209,6 +212,17 @@ st.sidebar.slider(
     label_visibility="visible",
 )
 
+st.sidebar.slider(
+    label="Maximum token (hard maximum) per chunk",
+    value=400,
+    min_value=100,
+    max_value=8000,
+    step=100,
+    key="hard_max_chunk_token_count",
+    help="The hard maximum number of tokens per chunk. If a chunk is bigger than this, chunk is split using newlines. Applied after the word-based chunking",
+    label_visibility="visible",
+)
+
 st.sidebar.slider(
     label="Minumum words per chunk",
     value=10,
@@ -309,7 +323,7 @@ with col1:
     st.markdown(ss.parsed_md)
 
 with col2:
-    if uploaded_file and ss.chunks:
+    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
        file_chunks = save_chunks()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
@@ -324,9 +338,9 @@ with col2:
                 use_container_width=True,
             )
         with st.container(border=False):
-            for i, chunk in enumerate(ss.chunks):
+            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                 with st.expander(f"Chunk {i+1}", expanded=False):
                     with st.container(height=300, border=False):
                         st.markdown(
-                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)
+                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
                         )