File size: 8,226 Bytes
e7cb6de
2c8408f
 
 
 
e7cb6de
eb84d7e
 
 
40a40cb
 
74c0a8b
9501bef
 
 
eb84d7e
 
9501bef
 
 
 
eb84d7e
40a40cb
b8e290e
51c0840
 
40a40cb
51c0840
b8e290e
40a40cb
9501bef
5323f2f
eb84d7e
40a40cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb84d7e
40a40cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb88228
e305056
2c8408f
bb88228
 
53e8388
3265b22
bb88228
3265b22
9b2e5ac
bb88228
 
 
3265b22
2c8408f
3265b22
2c8408f
3265b22
2c8408f
 
 
 
 
 
 
 
 
 
 
 
 
bb88228
2c8408f
 
51c0840
bb88228
5933b14
e304b98
 
eb84d7e
 
e304b98
eb84d7e
e304b98
 
 
1115dfa
e304b98
 
51c0840
e304b98
6a12a73
 
 
 
02271b4
6a12a73
02271b4
 
6a12a73
 
 
 
 
bb88228
 
37c61d6
bb88228
 
2c8408f
 
40a40cb
2c8408f
 
af9e1dd
 
40a40cb
af9e1dd
 
2c8408f
731bcbf
40a40cb
b7b6ca1
2c8408f
 
 
40a40cb
2c8408f
 
93f7595
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Dropdown labels for the two supported splitters. `chunk` compares the selected
# dropdown value against these constants to decide which splitter to build.
LABEL_TEXTSPLITTER = "LangChain's CharacterTextSplitter"
LABEL_RECURSIVE = "Langchain's RecursiveCharacterTextSplitter"

# BERT tokenizer, loaded once when the module is imported.
# NOTE(review): `AutoTokenizer` is not imported anywhere in the visible source
# (presumably `from transformers import AutoTokenizer` is intended) — confirm and add
# the import, otherwise this line raises NameError at import time.
bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

def extract_separators_from_string(separators_str):
    """Parse the separators textbox content into a list of separator strings.

    The text is first read as a Python list literal, so that escape sequences
    typed by the user (backslash-n, backslash-t, ...) become the characters
    they denote and separators that themselves contain ``", "`` or quotes stay
    intact. Input that is not a valid, non-empty list of strings (for example
    unquoted items such as ``[a, b]``) falls back to the lenient legacy
    parsing: drop the first and last character, split on ``", "`` and strip
    all quote characters.

    Args:
        separators_str: Text such as ``"['separator_1', 'separator_2']"``.

    Returns:
        A list of separator strings.

    Raises:
        gr.Error: If ``separators_str`` is not a string.
    """
    import ast  # local import keeps this function self-contained

    if isinstance(separators_str, str):
        try:
            parsed = ast.literal_eval(separators_str.strip())
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = None

        # An empty list is deliberately routed to the legacy path, which
        # yields [""], so callers never receive an empty separator list.
        if (
            isinstance(parsed, (list, tuple))
            and parsed
            and all(isinstance(item, str) for item in parsed)
        ):
            return list(parsed)

        # Legacy, lenient parsing for anything that is not a proper literal.
        separators = separators_str[1:-1].split(", ")
        return [separator.replace('"', "").replace("'", "") for separator in separators]

    raise gr.Error(f"""
    Did not succeed in extracting separators from string: {separators_str}.
    Please type it in the correct format: "['separator_1', 'separator_2', etc]"
    """)

def change_split_selection(text, slider_count, split_selection, separator_selection, length_unit_selection):
    """React to a change of the chunking-method dropdown.

    Args:
        text: The text to split.
        slider_count: Chunk size taken from the slider.
        split_selection: The newly selected splitter label.
        separator_selection: Current content of the separators textbox.
        length_unit_selection: Selected length unit (characters or tokens).

    Returns:
        A 2-tuple: an update that shows the separators textbox only when the
        recursive splitter is selected, and the freshly computed output of
        ``chunk`` for the current inputs.
    """
    print("Updating separator selection interactivity:")
    show_separators = split_selection == LABEL_RECURSIVE
    return (
        # `gr.update` is the component-agnostic update; the per-component
        # `gr.Textbox.update` classmethod is not available in Gradio 4+.
        gr.update(visible=show_separators),
        chunk(text, slider_count, split_selection, separator_selection, length_unit_selection),
    )

def chunk(text, length, splitter_selection, separators_str, length_unit_selection):
    """Split ``text`` into chunks and label each chunk with its index.

    Args:
        text: The text to split.
        length: Maximum chunk size, in the unit chosen by
            ``length_unit_selection``.
        splitter_selection: ``LABEL_TEXTSPLITTER`` or ``LABEL_RECURSIVE``.
        separators_str: Textual list of separators; parsed with
            ``extract_separators_from_string`` and only used by the recursive
            splitter.
        length_unit_selection: If it contains "token" (case-insensitive),
            chunk length is counted in tokens of the module-level
            ``bert_tokenizer``; otherwise in characters.

    Returns:
        A list of ``(chunk_text, label)`` tuples, where ``label`` is the
        chunk's position as a string.

    Raises:
        gr.Error: If ``splitter_selection`` is not one of the known labels.
    """
    count_in_tokens = "token" in length_unit_selection.lower()
    chunk_size = int(length)  # the slider may deliver the size as a float

    if splitter_selection == LABEL_TEXTSPLITTER:
        splitter_cls = CharacterTextSplitter
        options = dict(
            separator="",
            chunk_size=chunk_size,
            chunk_overlap=0,
            is_separator_regex=False,
        )
    elif splitter_selection == LABEL_RECURSIVE:
        splitter_cls = RecursiveCharacterTextSplitter
        options = dict(
            chunk_size=chunk_size,
            chunk_overlap=0,
            add_start_index=True,
            strip_whitespace=False,
            separators=extract_separators_from_string(separators_str),
        )
    else:
        # Previously this fell through and failed on an unbound local.
        raise gr.Error(f"Unknown chunking method: {splitter_selection}")

    if count_in_tokens:
        # `from_huggingface_tokenizer` supplies its own token-counting
        # `length_function`; passing another one raises a TypeError. The
        # tokenizer loaded once at module level is reused instead of being
        # reloaded on every call.
        text_splitter = splitter_cls.from_huggingface_tokenizer(bert_tokenizer, **options)
    else:
        text_splitter = splitter_cls(length_function=len, **options)

    documents = text_splitter.create_documents([text])
    return [(document.page_content, str(index)) for index, document in enumerate(documents)]


# Default sample text pre-filled in the input textbox: a prose excerpt followed by a
# Markdown snippet with fenced code blocks, so the demo starts with mixed content to chunk.
ESSAY = """Chapter 6

WHAT SORT OF DESPOTISM DEMOCRATIC NATIONS HAVE TO FEAR

I had remarked during my stay in the United States that a democratic state of society, similar to that of the Americans, might offer singular facilities for the establishment of despotism; and I perceived, upon my return to Europe, how much use had already been made, by most of our rulers, of the notions, the sentiments, and the wants created by this same social condition, for the purpose of extending the circle of their power. This led me to think that the nations of Christendom would perhaps eventually undergo some oppression like that which hung over several of the nations of the ancient world.
A more accurate examination of the subject, and five years of further meditation, have not diminished my fears, but have changed their object.
No sovereign ever lived in former ages so absolute or so powerful as to undertake to administer by his own agency, and without the assistance of intermediate powers, all the parts of a great empire; none ever attempted to subject all his subjects indiscriminately to strict uniformity of regulation and personally to tutor and direct every member of the community. The notion of such an undertaking never occurred to the human mind; and if any man had conceived it, the want of information, the imperfection of the administrative system, and, above all, the natural obstacles caused by the inequality of conditions would speedily have checked the execution of so vast a design.
When the Roman emperors were at the height of their power, the different nations of the empire still preserved usages and customs of great diversity; although they were subject to the same monarch, most of the provinces were separately administered; they abounded in powerful and active municipalities; and although the whole government of the empire was centered in the hands of the Emperor alone and he always remained, in case of need, the supreme arbiter in all matters, yet the details of social life and private occupations lay for the most part beyond his control. The emperors possessed, it is true, an immense and unchecked power, which allowed them to gratify all their whimsical tastes and to employ for that purpose the whole strength of the state. They frequently abused that power arbitrarily to deprive their subjects of property or of life; their tyranny was extremely onerous to the few, but it did not reach the many; it was confined to some few main objects and neglected the rest; it was violent, but its range was limited.

---

Then you can [Create a dataset repository](../huggingface_hub/quick-start#create-a-repository), for example using:

```python
from huggingface_hub import HfApi
HfApi().create_repo(repo_id="username/my_dataset", repo_type="dataset")
```
Finally, you can use [Hugging Face paths]([Hugging Face paths](https://huggingface.co/docs/huggingface_hub/guides/hf_file_system#integrations)) in Pandas:
```python
import pandas as pd
df.to_parquet("hf://datasets/username/my_dataset/data.parquet")
# or write in separate files if the dataset has train/validation/test splits
df_train.to_parquet("hf://datasets/username/my_dataset/train.parquet")
df_valid.to_parquet("hf://datasets/username/my_dataset/validation.parquet")
df_test .to_parquet("hf://datasets/username/my_dataset/test.parquet")
```
"""


# Build the UI: a text input, the splitter / length-unit / size controls and a
# highlighted output that is recomputed whenever any of the inputs changes.
# The CSS rule targets the separators textbox through its elem_id.
with gr.Blocks(theme=gr.themes.Soft(), css="#textbox_id {color: red; font-family:monospace}") as demo:
    text = gr.Textbox(label="Your text 🪶", value=ESSAY)
    with gr.Row():
        split_selection = gr.Dropdown(
            choices=[
                LABEL_TEXTSPLITTER,
                LABEL_RECURSIVE,
            ],
            value=LABEL_TEXTSPLITTER,
            label="Chunking method ",
        )
        separator_selection = gr.Textbox(
            elem_id="textbox_id",
            # A Textbox holds a string, so the default separators are given as
            # text (the same text that str() of the former list value yields).
            value="['\\n\\n', '\\n', '.', ' ', '']",
            label="Separators used in RecursiveCharacterTextSplitter",
            # Only shown once the recursive splitter is selected.
            visible=False,
        )
    with gr.Row():
        length_unit_selection = gr.Dropdown(
            choices=[
                "Character count",
                "Token count (BERT tokens)",
            ],
            value="Character count",
            label="Length function",
            info="How should we count our chunk lengths?",
        )
        slider_count = gr.Slider(
            20, 500, value=50, label="Count 🧮", info="Chunk size, in the chosen unit."
        )
    out = gr.HighlightedText(
        label="Output",
        show_legend=True,
        show_label=False,
    )

    # Every handler receives the same inputs, in the order `chunk` expects.
    chunk_inputs = [text, slider_count, split_selection, separator_selection, length_unit_selection]

    # Re-chunk whenever one of these inputs changes. The separators textbox is
    # included so that editing the separators takes effect immediately.
    for trigger in (text, length_unit_selection, slider_count, separator_selection):
        trigger.change(
            fn=chunk,
            inputs=chunk_inputs,
            outputs=out,
        )

    # Changing the splitter also toggles the visibility of the separators textbox.
    split_selection.change(
        fn=change_split_selection,
        inputs=chunk_inputs,
        outputs=[separator_selection, out],
    )
demo.launch()