# ISCC Chunker — Gradio Space demo application.
import gradio as gr
import iscc_core as ic
import iscc_sdk as idk
import pathlib

# Directory containing this file; the sample text ships alongside the app.
HERE = pathlib.Path(__file__).parent.absolute()
SAMPLE_FILEPATH = HERE / "samples/sample.txt"
# Path.read_text opens and closes the file in one call; the bare
# open(...).read() it replaces leaked the handle until garbage collection.
sample_text = SAMPLE_FILEPATH.read_text(encoding="utf-8")
# Map of non-printable line-break characters to printable stand-in symbols.
# The symbols were mojibake-corrupted in transit; each is reconstructed here
# from the Unicode character name recorded in its comment.
newline_symbols = {
    "\u000a": "\u23ce",  # Line Feed - Represented by the 'Return' symbol
    "\u000b": "\u2195",  # Vertical Tab - Represented by the 'Up Down Arrow' symbol
    "\u000c": "\u240c",  # Form Feed - Unicode Control Pictures representation
    "\u000d": "\u21b5",  # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
    "\u0085": "\u21df",  # Next Line - 'Downwards Arrow with Double Stroke' symbol
    "\u2028": "\u21b2",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
    "\u2029": "\u00b6",  # Paragraph Separator - Represented by the 'Pilcrow' symbol
}
# CSS override injected into the Blocks app: keep chunk labels in the
# HighlightedText component in their original casing (Gradio uppercases
# labels by default via text-transform).
custom_css = """
#chunked-text span.label {
    text-transform: none !important;
}
"""
def no_nl(text):
    """Replace non-printable newline characters with printable symbols.

    :param text: Input string possibly containing newline/separator characters.
    :return: Copy of *text* with every key of ``newline_symbols`` replaced by
        its printable stand-in symbol.
    """
    # str.translate performs all substitutions in a single pass over the
    # string instead of one .replace() scan per character. All keys in
    # newline_symbols are single characters, so maketrans accepts the dict
    # directly and behavior matches the chained replacements exactly.
    return text.translate(str.maketrans(newline_symbols))
def chunk_text(text, chunk_size):
    """Split *text* into content-defined chunks and pair each with its feature hash.

    :param text: Raw input text to segment.
    :param chunk_size: Target average number of characters per chunk
        (applied via the global ``idk.sdk_opts.text_avg_chunk_size`` option).
    :return: List of ``(chunk, "size:feature")`` tuples suitable for
        ``gr.HighlightedText``; chunks have newline characters replaced by
        printable symbols via :func:`no_nl`.
    """
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
        features = processed["features"]
        sizes = processed["sizes"]
        # Reconstruct chunk boundaries from the per-chunk sizes.
        chunks = []
        start = 0
        for size in sizes:
            end = start + size
            chunks.append(no_nl(cleaned[start:end]))
            start = end
        return [
            (chunk, f"{size}:{feat}")
            for chunk, size, feat in zip(chunks, sizes, features)
        ]
    finally:
        # Always restore the global SDK option — even if chunking raises —
        # so one failed call cannot change the behavior of later calls.
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size
# --- Gradio UI --------------------------------------------------------------
# NOTE(review): the original paste lost all indentation; the nesting below is
# reconstructed from the component order — confirm against the running Space.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row(variant="panel"):
        gr.Markdown(
            """
            ## ✂️ ISCC Chunker
            Demo of Content-Defined Variable-Length Chunking for Shift-Resistant Text and Data Segmentation
            """,
        )
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            in_text = gr.TextArea(
                label="Text Chunker",
                placeholder="Paste your text here",
                lines=12,
                max_lines=12,
            )
            in_chunksize = gr.Slider(
                label="Chunk Size",
                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
                minimum=64,
                maximum=2048,
                step=32,
                value=64,
            )
            gr.Examples(label="Sample Text", examples=[sample_text], inputs=[in_text])
        out_text = gr.HighlightedText(
            label="Chunked Text Output",
            interactive=False,
            elem_id="chunked-text",
        )
    with gr.Row():
        gr.ClearButton(components=[in_text, in_chunksize, out_text])
    with gr.Row(variant="panel"):
        gr.Markdown(
            """
            ## 📖 Help & Instructions
            This Demo showcases ISCC's shift-resistant chunking algorithm. Here's how to use it:
            A) **Paste your text** into the "Text Chunker" field or select the sample below.
            The **"Chunked Text Output"** will display the results, highlighting each chunk and its
            number of characters and associated similarity hash.
            B) **Edit the text** in the "Text Chunker" field
            Observe how most chunks stay the same (same length and same hash) even if you make edits
            in the beginning of the text.
            C) **Adjust the "Chunk Size"** slider to control the average number of characters per chunk.
            Observe how the chunks get smaller/larger on average. Smaller sizes result in more,
            more fine grained chunks, while larger sizes produce fewer, larger chunks on average.
            D) Use the **Clear Button** to start over.
            For more information about ISCC chunking, please visit: https://core.iscc.codes/algorithms/cdc/
            """,
        )
    gr.Markdown(
        """
        ## What is Content-Defined Chunking?
        This method segments text (or data) into chunks using a content-defined approach, which is
        resilient to shifts in the text. It ensures that changes in the beginning of the text have
        minimal impact on the chunk boundaries further in the text, making it ideal for version
        control, data deduplication, and similar applications where detecting content changes
        efficiently is crucial.
        ## How does ISCC use Content-Defined Chunking?
        The [Data-Code](https://github.com/iscc/iscc-core/blob/main/iscc_core/code_data.py) is
        generated by chunking the raw file bitstream with an average chunk size of 1024 bytes.
        The chunks are hashed with `xxhash` and processed with a `minhash` algorithm.
        It is also used by the [iscc-sdk](https://github.com/iscc/iscc-sdk) to generate granular
        syntactic similarity hashes for textual content with an average chunk size of 1024
        characters. When activated the granular chunk hashes are attached to the generated ISCC
        Metadata.
        """
    )
    # Re-chunk whenever either the input text or the chunk size changes.
    in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
    in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])

if __name__ == "__main__":
    demo.launch()