File size: 5,368 Bytes
f14de11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import iscc_core as ic
import iscc_sdk as idk
import pathlib


# Resolve paths relative to this file so the app works from any working dir.
HERE = pathlib.Path(__file__).parent.absolute()
SAMPLE_FILEPATH = HERE / "samples/sample.txt"
# Path.read_text opens AND closes the file; the original open(...).read()
# left the file handle unclosed.
sample_text = SAMPLE_FILEPATH.read_text(encoding="utf-8")

# Map non-printable line-break characters to visible placeholder glyphs so
# chunk boundaries stay visible inside the highlighted output.
newline_symbols = {
    "\u000a": "⏎",  # Line Feed - 'Return' symbol
    "\u000b": "↨",  # Vertical Tab - 'Up Down Arrow with Base' symbol
    "\u000c": "␌",  # Form Feed - Unicode Control Pictures representation
    "\u000d": "↡",  # Carriage Return - 'Downwards Two Headed Arrow' symbol
    "\u0085": "–",  # Next Line - shown as an en dash (NOTE(review): original comment said an arrow glyph; confirm intended symbol)
    "\u2028": "↲",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
    "\u2029": "¶",  # Paragraph Separator - pilcrow (fixes mojibake "ΒΆ" in original)
}

# CSS override: keep the chunk labels in their original case
# (targets the span.label elements of the #chunked-text component).
custom_css = """
#chunked-text span.label {
    text-transform: none !important;
}
"""


def no_nl(text):
    """Replace non-printable newline characters with printable symbols.

    Args:
        text (str): Input text possibly containing line-break characters.

    Returns:
        str: Text with every key of ``newline_symbols`` substituted.
    """
    # str.translate performs all single-char substitutions in one pass
    # instead of chaining one .replace() call per character.
    return text.translate(str.maketrans(newline_symbols))


def chunk_text(text, chunk_size):
    """Split *text* into content-defined chunks of ~``chunk_size`` characters.

    Temporarily overrides ``idk.sdk_opts.text_avg_chunk_size`` for the call.

    Args:
        text (str): Raw input text to chunk.
        chunk_size (int): Average number of characters per chunk.

    Returns:
        list[tuple[str, str]]: ``(chunk, "size:feature")`` pairs, shaped for
        ``gr.HighlightedText`` where the label carries the chunk length and
        its similarity hash.
    """
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
        features = processed["features"]
        sizes = processed["sizes"]
        # Reconstruct each chunk from the cleaned text via its reported size.
        start = 0
        chunks = []
        for size in sizes:
            end = start + size
            chunks.append(no_nl(cleaned[start:end]))
            start = end
        result = [
            (chunk, f"{size}:{feat}")
            for chunk, size, feat in zip(chunks, sizes, features)
        ]
    finally:
        # Always restore the global option, even if text processing raises;
        # the original code leaked the override on error.
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size
    return result


# Build the Gradio UI: input panel (text + chunk-size slider) on the left,
# highlighted chunk output on the right, help text below.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row(variant="panel"):
        gr.Markdown(
            """
        ## ✂️ ISCC Chunker
        Demo of Content-Defined Variable-Length Chunking for Shift-Resistant Text and Data Segmentation
        """,
        )
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            in_text = gr.TextArea(
                label="Text Chunker",
                placeholder="Paste your text here",
                lines=12,
                max_lines=12,
            )
            in_chunksize = gr.Slider(
                label="Chunk Size",
                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
                minimum=64,
                maximum=2048,
                step=32,
                value=64,
            )
            gr.Examples(label="Sample Text", examples=[sample_text], inputs=[in_text])

        out_text = gr.HighlightedText(
            label="Chunked Text Output",
            interactive=False,
            elem_id="chunked-text",  # targeted by custom_css to preserve label case
        )
    with gr.Row():
        gr.ClearButton(components=[in_text, in_chunksize, out_text])
    with gr.Row(variant="panel"):
        gr.Markdown(
            """
        ## 📖 Help & Instructions

        This Demo showcases ISCC's shift-resistant chunking algorithm. Here's how to use it:

        A) **Paste your text** into the "Text Chunker" field or select the sample below.

        The **"Chunked Text Output"** will display the results, highlighting each chunk and its
        number of characters and associated similarity hash.

        B) **Edit the text** in the "Text Chunker" field

        Observe how most chunks stay the same (same length and same hash) even if you make edits
        in the beginning of the text.

        C) **Adjust the "Chunk Size"** slider to control the average number of characters per chunk.

        Observe how the chunks get smaller/larger on average. Smaller sizes result in more,
        more fine grained chunks, while larger sizes produce fewer, larger chunks on average.

        D) Use the **Clear Button** to start over.

        For more information about ISCC chunking, please visit: https://core.iscc.codes/algorithms/cdc/
        """,
        )

        gr.Markdown(
            """
        ## What is Content-Defined Chunking?

        This method segments text (or data) into chunks using a content-defined approach, which is
        resilient to shifts in the text. It ensures that changes in the beginning of the text have
        minimal impact on the chunk boundaries further in the text, making it ideal for version
        control, data deduplication, and similar applications where detecting content changes
        efficiently is crucial.

        ## How does ISCC use Content-Defined Chunking?

        The [Data-Code](https://github.com/iscc/iscc-core/blob/main/iscc_core/code_data.py) is
        generated by chunking the raw file bitstream with an average chunk size of 1024 bytes.
        The chunks are hashed with `xxhash` and processed with a `minhash` algorithm.

        It is also used by the [iscc-sdk](https://github.com/iscc/iscc-sdk) to generate granular
        syntactic similarity hashes for textual content with an average chunk size of 1024
        characters. When activated the granular chunk hashes are attached to the generated ISCC
        Metadata.
        """
        )

    # Re-chunk whenever either the text or the chunk size changes.
    in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
    in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])


# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()