davanstrien (HF staff) committed
Commit: 907b541
Parent(s): eb008d8

add state to app

Files changed (1):
  1. app.py (+109 -37)

app.py CHANGED
@@ -35,7 +35,7 @@ def load_corpus(
 
 def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     if verbose:
-        print(f"Loaded {len(docs)} docs")
+        gr.Info(f"Loaded {len(docs)} docs")
 
     parser = SentenceSplitter.from_defaults(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
@@ -43,7 +43,7 @@ def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
 
     if verbose:
-        print(f"Parsed {len(nodes)} nodes")
+        gr.Info(f"Parsed {len(nodes)} nodes")
 
     docs = {
         node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
@@ -54,17 +54,15 @@ def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     return docs
 
 
-def upload_file(
+def upload_and_preview(
     files,
     chunk_size: int = 256,
     chunk_overlap: int = 0,
-    hub_id: str = None,
-    private: bool = False,
     split_sentences: bool = True,
-    oauth_token: gr.OAuthToken = None,
 ):
     print("loading files")
     file_paths = [file.name for file in files]
+
     print("parsing into sentences")
     corpus = load_corpus(
         file_paths,
@@ -72,12 +70,64 @@ def upload_file(
         chunk_overlap=chunk_overlap,
         split_sentences=split_sentences,
     )
+    gr.Info("Creating dataset")
+    dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
+    message = f"Files uploaded and dataset preview created:\n - {len(dataset)} rows"
+
+    state = {
+        "file_paths": file_paths,
+        "dataset": dataset,
+        "chunk_size": chunk_size,
+        "chunk_overlap": chunk_overlap,
+    }
+
+    return state, dataset.to_pandas(), message
+
+
+def preview_dataset(
+    state,
+    chunk_size: int = 256,
+    chunk_overlap: int = 0,
+    split_sentences: bool = True,
+):
+    if not state.get("file_paths"):
+        raise gr.Error("Please upload files first.")
+
+    print("parsing into sentences")
+    corpus = load_corpus(
+        state["file_paths"],
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        split_sentences=split_sentences,
+    )
     print("Creating dataset")
     dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
-    message = f"Dataset created has: \n - {len(dataset)} rows"
+    message = f"Dataset preview updated:\n - {len(dataset)} rows"
+
+    state["dataset"] = dataset
+    state["chunk_size"] = chunk_size
+    state["chunk_overlap"] = chunk_overlap
+
+    return state, dataset.to_pandas(), message
+
+
+def upload_to_hub(
+    state,
+    hub_id: str = None,
+    private: bool = False,
+    oauth_token: gr.OAuthToken = None,
+):
+    if not state.get("dataset"):
+        raise gr.Error("Please preview the dataset first.")
+
+    dataset = state["dataset"]
+    chunk_size = state["chunk_size"]
+    chunk_overlap = state["chunk_overlap"]
+
+    message = f"Dataset has: \n - {len(dataset)} rows"
     if hub_id:
         if oauth_token is not None:
-            gr.Info("Uploading to Hugging Face Hub")
+            gr.Info("Uploading dataset to the Hugging Face Hub...")
             dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
             update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
             message += (
@@ -86,7 +136,7 @@ def upload_file(
     else:
         raise gr.Error("Please login to Hugging Face Hub to push to hub")
 
-    return dataset.to_pandas(), message
+    return message
 
 
 def update_dataset_card(
@@ -116,25 +166,30 @@ The resulting text chunks are stored in a dataset that can be previewed and uplo
 The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes.
 
 ### Usage:
-- Login: Start by logging in to your Hugging Face account using the provided login button.
-- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default).
-- Upload Files: Use the upload button to load file(s) for processing.
-- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
-- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""
+1. Upload Files: Use the upload button to load file(s) for processing. A preview will be automatically generated using default settings.
+2. Adjust Parameters (Optional): Customize the chunk size, overlap, and sentence splitting option according to your requirements.
+3. Update Preview (Optional): Click the 'Update Preview' button to view the updated dataset based on your parameter changes.
+4. Login: When ready to upload, log in to your Hugging Face account using the provided login button.
+5. Upload to Hub: Specify the Hub ID, choose whether to make the dataset private, and click 'Upload to Hub'."""
 
 with gr.Blocks() as demo:
+    state = gr.State({})
+
     gr.HTML(
         """<h1 style='text-align: center;'> Corpus Creator</h1>
-        <center><i> &#128193; From random files to a Hugging Face dataset in a single step &#128193; </i></center>"""
+        <center><i> &#128193; From random files to a Hugging Face dataset in a few steps &#128193; </i></center>"""
     )
     gr.Markdown(description)
+
     with gr.Row():
-        gr.LoginButton()
-        with gr.Column():
-            gr.Markdown(
-                "To upload to the Hub, add an ID for where you want to push the dataset"
-            )
-            hub_id = gr.Textbox(value=None, label="Hub ID")
+        upload_button = gr.File(
+            file_types=["text"],
+            file_count="multiple",
+            height=50,
+            interactive=True,
+            label="Upload Files",
+        )
+
     with gr.Row():
         split_sentences = gr.Checkbox(True, label="Split sentences?")
         chunk_size = gr.Number(
@@ -151,25 +206,42 @@ with gr.Blocks() as demo:
             maximum=4096,
             step=1,
         )
-        private = gr.Checkbox(False, label="Upload dataset to a private repo?")
-    upload_button = gr.File(
-        file_types=["text"], file_count="multiple", height=50, interactive=True
-    )
-    summary = gr.Markdown()
+
+    update_preview_button = gr.Button("Update Preview")
+    corpus_preview_df = gr.DataFrame(label="Dataset Preview")
+    preview_summary = gr.Markdown()
+
+    with gr.Row():
+        gr.LoginButton()
+        with gr.Column():
+            gr.Markdown(
+                "To upload to the Hub, add an ID for where you want to push the dataset"
+            )
+            hub_id = gr.Textbox(value=None, label="Hub ID")
+            private = gr.Checkbox(False, label="Upload dataset to a private repo?")
+
+    upload_hub_button = gr.Button("Upload to Hub")
+    upload_summary = gr.Markdown()
 
     with gr.Accordion("detailed logs", open=False):
         Log(log_file, dark=True, xterm_font_size=12)
-    corpus_preview_df = gr.DataFrame()
+
     upload_button.upload(
-        upload_file,
-        inputs=[
-            upload_button,
-            chunk_size,
-            chunk_overlap,
-            hub_id,
-            private,
-            split_sentences,
-        ],
-        outputs=[corpus_preview_df, summary],
+        upload_and_preview,
+        inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
+        outputs=[state, corpus_preview_df, preview_summary],
+    )
+
+    update_preview_button.click(
+        preview_dataset,
+        inputs=[state, chunk_size, chunk_overlap, split_sentences],
+        outputs=[state, corpus_preview_df, preview_summary],
     )
+
+    upload_hub_button.click(
+        upload_to_hub,
+        inputs=[state, hub_id, private],
+        outputs=[upload_summary],
+    )
+
 demo.launch(debug=True)
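
The core change in this commit is threading a single gr.State dict between otherwise independent event handlers, so uploading, re-chunking the preview, and pushing to the Hub become separate steps that share data. Below is a minimal, self-contained sketch of that pattern; the handler and component names (process, publish, etc.) are illustrative and not taken from app.py.

import gradio as gr

def process(text, state):
    # First step: store intermediate results in the shared per-session state dict.
    state["chunks"] = text.split()
    return state, f"{len(state['chunks'])} chunks ready"

def publish(state):
    # Later step: read back what the first handler stored.
    if not state.get("chunks"):
        raise gr.Error("Run the processing step first.")
    return f"Would publish {len(state['chunks'])} chunks"

with gr.Blocks() as demo:
    state = gr.State({})  # per-session value carried between event handlers
    text = gr.Textbox(label="Input text")
    status = gr.Markdown()
    process_btn = gr.Button("Process")
    publish_btn = gr.Button("Publish")

    # State is listed as both an input and an output so each handler can read and update it.
    process_btn.click(process, inputs=[text, state], outputs=[state, status])
    publish_btn.click(publish, inputs=[state], outputs=[status])

demo.launch()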