Upload app.py
app.py
CHANGED
@@ -1,32 +1,39 @@
-from pathlib import Path
-import time
 import json
+import time
+from pathlib import Path
 from typing import Literal
-
+
 import requests
 import streamlit as st
-from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
-from streamlit import session_state as ss
-
-from chunknorris.parsers import (AbstractParser, MarkdownParser, HTMLParser, PdfParser, DocxParser, ExcelParser, CSVParser)
 from chunknorris.chunkers import MarkdownChunker
+from chunknorris.parsers import (
+    AbstractParser,
+    CSVParser,
+    DocxParser,
+    ExcelParser,
+    HTMLParser,
+    MarkdownParser,
+    PdfParser,
+)
 from chunknorris.pipelines import PdfPipeline
+from streamlit import session_state as ss
+from streamlit.runtime.uploaded_file_manager import UploadedFile, UploadedFileRec
 
 st.set_page_config(
     layout="wide",
     page_icon="🔪",
     page_title="ChunkNorris demo",
     menu_items={
-
-
-
-
+        "Report a bug": "https://github.com/wikit-ai/chunknorris/issues",
+        "About": "https://wikit-ai.github.io/chunknorris/",
+    },
+)
 
 LOGGER = st.empty()
 
 SAMPLE_FILE = {
-    "sample PDF - 264 pages"
-    "sample PDF - 16 pages"
+    "sample PDF - 264 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.pdf",
+    "sample PDF - 16 pages": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample2.pdf",
     "sample MD": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/README.md",
     "sample XLSX": "https://raw.githubusercontent.com/wikit-ai/chunknorris/refs/heads/main/docs/examples/example_data/sample.xlsx",
 }
@@ -38,9 +45,10 @@ if "parsed_md" not in ss:
     ss.parsed_md = ""
 
 if "chunks" not in ss:
-    ss.chunks = []
+    ss.chunks = []  # type: ignore | list[Chunk]
+
 
-def get_parser(fileext : str) -> AbstractParser:
+def get_parser(fileext: str) -> AbstractParser:
     """Get the pipeline for the given filename."""
     match fileext:
         case ".md":
@@ -58,22 +66,22 @@ def get_parser(fileext : str) -> AbstractParser:
         case ".csv":
             parser = CSVParser()
         case _:
-            raise ValueError(
-                "File format not supported by ChunkNorris"
-            )
+            raise ValueError("File format not supported by ChunkNorris")
 
     return parser
 
+
 def get_md_chunker() -> MarkdownChunker:
     """Considering arguments set, returns the md chunker."""
     return MarkdownChunker(
-
-
-
-
-
+        max_headers_to_use=ss.max_headers_to_use,
+        max_chunk_word_count=ss.max_chunk_word_count,
+        hard_max_chunk_word_count=ss.hard_max_chunk_word_count,
+        min_chunk_word_count=ss.min_chunk_word_count,
+    )
 
-def parse_and_chunk(uploaded_file : UploadedFile):
+
+def parse_and_chunk(uploaded_file: UploadedFile | None):
     """Parse and chunk the file."""
     if uploaded_file is None:
         log("Please upload a file.", "warning")
@@ -88,7 +96,7 @@ def parse_and_chunk(uploaded_file : UploadedFile):
             case ".pdf":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = PdfPipeline(parser, get_md_chunker())
-                chunks = chunker._get_chunks_using_strategy()
+                chunks = chunker._get_chunks_using_strategy()  # type: ignore
             case ".xlsx":
                 md_doc = parser.parse_string(uploaded_file.getvalue())
                 chunker = get_md_chunker()
@@ -101,29 +109,37 @@ def parse_and_chunk(uploaded_file : UploadedFile):
         ss.parsing_time = time.perf_counter() - start_time
         ss.parsed_md = md_doc.to_string()
         ss.chunks = chunks
-        log(
+        log(
+            f"Parsing and chunking took {round(ss.parsing_time, 4)} seconds.", "success"
+        )
 
     except Exception as e:
         log(f"Error when parsing file.", "warning")
         print(e)
         return
 
+
 def save_parsed_md():
     """Save the parsed markdown string to a md file."""
     return ss.parsed_md.encode("utf-8")
 
+
 def save_chunks():
     """Save the parsed chunks to a json file."""
-    return json.dumps(
-
-
-
-
-
-
-
-
-        ],
+    return json.dumps(
+        [
+            {
+                k: v
+                for k, v in chunk.model_dump().items()
+                if k not in ["headers", "content"]
+            }
+            | {"text": chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)}
+            for chunk in ss.chunks
+        ],
+        indent=4,
+        ensure_ascii=False,
+    ).encode("utf-8")
+
 
 def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
     """Display a warning message."""
@@ -135,6 +151,7 @@ def log(message: str, log_type: Literal["success", "warning", "info"] = "info"):
         case "info":
             LOGGER.info(message, icon="ℹ️")
 
+
 def load_sample_file(url: str):
     """Get the file from url"""
     response = requests.get(url)
@@ -146,18 +163,21 @@ def load_sample_file(url: str):
                 data=response.content,
                 type="application/octet-stream",
             ),
-            file_urls=[url]
-
+            file_urls=[url],
+        )
     else:
         print(response.status_code, response.content)
         st.error("Failed to get data.")
         return None
 
+
 st.title("ChunkNorris.")
 st.subheader("*Fast, smart, lightweight document chunking.*")
 
 st.sidebar.header("Chunking settings")
-st.sidebar.markdown(
+st.sidebar.markdown(
+    "| [Documentation](https://wikit-ai.github.io/chunknorris/) | [Tutorials](https://wikit-ai.github.io/chunknorris/examples/) | [Repo](https://github.com/wikit-ai/chunknorris) |"
+)
 st.sidebar.select_slider(
     label="Max header level to consider for chunking",
     options=["h1", "h2", "h3", "h4", "h5", "h6"],
@@ -165,7 +185,7 @@ st.sidebar.select_slider(
     key="max_headers_to_use",
     help="Max section header level to consider for chunking. Lower level headers won't be used to split a chunk into smaller chunks.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Maximum words per chunk",
@@ -176,7 +196,7 @@ st.sidebar.slider(
     key="max_chunk_word_count",
    help="Maximum number of words per chunk. If a chunk is bigger than this, chunk is split using subsection headers if any are available.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Hard maximum words per chunk",
@@ -187,7 +207,7 @@ st.sidebar.slider(
     key="hard_max_chunk_word_count",
     help="The hard maximum number of words per chunk. If a chunk is bigger than this, chunk is split using newlines, still trying to preverse code blocks or tables integrity.",
     label_visibility="visible",
-
+)
 
 st.sidebar.slider(
     label="Minumum words per chunk",
@@ -198,7 +218,7 @@ st.sidebar.slider(
     key="min_chunk_word_count",
     help="The minimum words a chunk must have to avoid being discarded.",
     label_visibility="visible",
-
+)
 
 st.sidebar.checkbox(
     "Prepend headers to chunk's text",
@@ -206,14 +226,27 @@ st.sidebar.checkbox(
     key="prepend_headers_to_chunks",
     label_visibility="visible",
     help="Whether or not all the parent headers should be prepended to the chunk's text content. Might improve retrieval performance of the chunk as it preserves context.",
-
+)
 
-_, col1, col2, _ = st.columns([0.1, .5, .3, 0.1])
+_, col1, col2, _ = st.columns([0.1, 0.5, 0.3, 0.1])
 with col1:
     uploaded_file = st.file_uploader(
         "Upload your own file...",
-        type=[
-
+        type=[
+            "md",
+            "html",
+            "pdf",
+            "docx",
+            "xls",
+            "xlsx",
+            "xlsm",
+            "xlsb",
+            "odf",
+            "ods",
+            "odt",
+            "csv",
+        ],
+    )
 
 with col2:
     sample_file = st.selectbox(
@@ -233,29 +266,32 @@ if uploaded_file is not None:
         on_click=parse_and_chunk,
         args=(uploaded_file,),
         type="primary",
-        use_container_width=True
-
+        use_container_width=True,
+    )
 else:
     st.sidebar.button(
         "Parse & Chunk",
         on_click=log,
-        args=(
+        args=(
+            "You must upload a file first.",
+            "warning",
+        ),
         type="secondary",
-        use_container_width=True
-
+        use_container_width=True,
+    )
     ss.parsed_md = ""
     ss.chunks = []
 
 
 col1, col2 = st.columns(2)
 with col1:
-    if ss.parsed_md:
+    if uploaded_file and ss.parsed_md:
         file_parsed_md = save_parsed_md()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("⚙️ Parsed Document", divider="blue")
         with cola:
-            st.
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_parsed_md,
@@ -264,19 +300,22 @@ with col1:
                 use_container_width=True,
             )
         if Path(uploaded_file.name).suffix.lower() == ".pdf":
-            st.info(
+            st.info(
+                "For the purpose of this demo, OCR on pdf documents is deactivated.",
+                icon="ℹ️",
+            )
         with st.expander("Parsed document", expanded=True):
             with st.container(height=600, border=False):
                 st.markdown(ss.parsed_md)
 
 with col2:
-    if ss.chunks:
+    if uploaded_file and ss.chunks:  # type: ignore | list[Chunk]
         file_chunks = save_chunks()
         cola, colb = st.columns([0.25, 0.75])
         with colb:
             st.subheader("📦 Chunks", divider="blue")
         with cola:
-            st.
+            st.markdown("\n")
             st.download_button(
                 label="⬇️ Download",
                 data=file_chunks,
@@ -285,7 +324,9 @@ with col2:
                 use_container_width=True,
             )
         with st.container(border=False):
-            for i, chunk in enumerate(ss.chunks):
+            for i, chunk in enumerate(ss.chunks):  # type: ignore | list[Chunk]
                 with st.expander(f"Chunk {i+1}", expanded=False):
                     with st.container(height=300, border=False):
-                        st.markdown(
+                        st.markdown(
+                            chunk.get_text(prepend_headers=ss.prepend_headers_to_chunks)  # type: ignore | Chunk.get_text()
+                        )