Spaces:

davanstrien
/

corpus-creator

Running

App Files Files Community

davanstrien HF staff committed on Jun 24

Commit

907b541

•

1 Parent(s): eb008d8

add state to app

Browse files

Files changed (1) hide show

app.py +109 -37

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ def load_corpus(
 def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     if verbose:
-        print(f"Loaded {len(docs)} docs")
     parser = SentenceSplitter.from_defaults(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
@@ -43,7 +43,7 @@ def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
     if verbose:
-        print(f"Parsed {len(nodes)} nodes")
     docs = {
         node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
@@ -54,17 +54,15 @@ def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     return docs
-def upload_file(
     files,
     chunk_size: int = 256,
     chunk_overlap: int = 0,
-    hub_id: str = None,
-    private: bool = False,
     split_sentences: bool = True,
-    oauth_token: gr.OAuthToken = None,
 ):
     print("loading files")
     file_paths = [file.name for file in files]
     print("parsing into sentences")
     corpus = load_corpus(
         file_paths,
@@ -72,12 +70,64 @@ def upload_file(
         chunk_overlap=chunk_overlap,
         split_sentences=split_sentences,
     )
     print("Creating dataset")
     dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
-    message = f"Dataset created has: \n - {len(dataset)} rows"
     if hub_id:
         if oauth_token is not None:
-            gr.Info("Uploading to Hugging Face Hub")
             dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
             update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
             message += (
@@ -86,7 +136,7 @@ def upload_file(
         else:
             raise gr.Error("Please login to Hugging Face Hub to push to hub")
-    return dataset.to_pandas(), message
 def update_dataset_card(
@@ -116,25 +166,30 @@ The resulting text chunks are stored in a dataset that can be previewed and uplo
 The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes.
 ### Usage:
-- Login: Start by logging in to your Hugging Face account using the provided login button.
-- Set Parameters: Customize the chunk size and overlap according to your requirements. If you want to split the text into chunks, check the 'Split sentences' box (on by default).
-- Upload Files: Use the upload button to load file(s) for processing.
-- Preview Dataset: View the created dataset in a dataframe format before uploading it to the Hugging Face Hub.
-- Upload to Hub: Optionally, specify the Hub ID and choose whether to make the dataset private before pushing it to the Hugging Face Hub."""
 with gr.Blocks() as demo:
     gr.HTML(
         """<h1 style='text-align: center;'> Corpus Creator</h1>
-        <center><i> &#128193; From random files to a Hugging Face dataset in a single step &#128193; </i></center>"""
     )
     gr.Markdown(description)
     with gr.Row():
-        gr.LoginButton()
-        with gr.Column():
-            gr.Markdown(
-                "To upload to the Hub, add an ID for where you want to push the dataset"
-            )
-            hub_id = gr.Textbox(value=None, label="Hub ID")
     with gr.Row():
         split_sentences = gr.Checkbox(True, label="Split sentences?")
         chunk_size = gr.Number(
@@ -151,25 +206,42 @@ with gr.Blocks() as demo:
             maximum=4096,
             step=1,
         )
-        private = gr.Checkbox(False, label="Upload dataset to a private repo?")
-    upload_button = gr.File(
-        file_types=["text"], file_count="multiple", height=50, interactive=True
-    )
-    summary = gr.Markdown()
     with gr.Accordion("detailed logs", open=False):
         Log(log_file, dark=True, xterm_font_size=12)
-    corpus_preview_df = gr.DataFrame()
     upload_button.upload(
-        upload_file,
-        inputs=[
-            upload_button,
-            chunk_size,
-            chunk_overlap,
-            hub_id,
-            private,
-            split_sentences,
-        ],
-        outputs=[corpus_preview_df, summary],
     )
 demo.launch(debug=True)

 def split_corpus(verbose, docs, chunk_size, chunk_overlap):
     if verbose:
+        gr.Info(f"Loaded {len(docs)} docs")
     parser = SentenceSplitter.from_defaults(
         chunk_size=chunk_size, chunk_overlap=chunk_overlap
     nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)
     if verbose:
+        gr.Info(f"Parsed {len(nodes)} nodes")
     docs = {
         node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
     return docs
+def upload_and_preview(
     files,
     chunk_size: int = 256,
     chunk_overlap: int = 0,
     split_sentences: bool = True,
 ):
     print("loading files")
     file_paths = [file.name for file in files]
     print("parsing into sentences")
     corpus = load_corpus(
         file_paths,
         chunk_overlap=chunk_overlap,
         split_sentences=split_sentences,
     )
+    gr.Info("Creating dataset")
+    dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
+    message = f"Files uploaded and dataset preview created:\n - {len(dataset)} rows"
+    state = {
+        "file_paths": file_paths,
+        "dataset": dataset,
+        "chunk_size": chunk_size,
+        "chunk_overlap": chunk_overlap,
+    }
+    return state, dataset.to_pandas(), message
+def preview_dataset(
+    state,
+    chunk_size: int = 256,
+    chunk_overlap: int = 0,
+    split_sentences: bool = True,
+):
+    if not state.get("file_paths"):
+        raise gr.Error("Please upload files first.")
+    print("parsing into sentences")
+    corpus = load_corpus(
+        state["file_paths"],
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        split_sentences=split_sentences,
+    )
     print("Creating dataset")
     dataset = Dataset.from_dict({"ids": corpus.keys(), "texts": corpus.values()})
+    message = f"Dataset preview updated:\n - {len(dataset)} rows"
+    state["dataset"] = dataset
+    state["chunk_size"] = chunk_size
+    state["chunk_overlap"] = chunk_overlap
+    return state, dataset.to_pandas(), message
+def upload_to_hub(
+    state,
+    hub_id: str = None,
+    private: bool = False,
+    oauth_token: gr.OAuthToken = None,
+):
+    if not state.get("dataset"):
+        raise gr.Error("Please preview the dataset first.")
+    dataset = state["dataset"]
+    chunk_size = state["chunk_size"]
+    chunk_overlap = state["chunk_overlap"]
+    message = f"Dataset has: \n - {len(dataset)} rows"
     if hub_id:
         if oauth_token is not None:
+            gr.Info("Uploading dataset to the Hugging Face Hub...")
             dataset.push_to_hub(hub_id, token=oauth_token.token, private=private)
             update_dataset_card(hub_id, oauth_token.token, chunk_size, chunk_overlap)
             message += (
         else:
             raise gr.Error("Please login to Hugging Face Hub to push to hub")
+    return message
 def update_dataset_card(
 The chunking is done using `Llama-index`'s [`SentenceSplitter`](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/?h=sentencesplitter#sentencesplitter) classes.
 ### Usage:
+1. Upload Files: Use the upload button to load file(s) for processing. A preview will be automatically generated using default settings.
+2. Adjust Parameters (Optional): Customize the chunk size, overlap, and sentence splitting option according to your requirements.
+3. Update Preview (Optional): Click the 'Update Preview' button to view the updated dataset based on your parameter changes.
+4. Login: When ready to upload, log in to your Hugging Face account using the provided login button.
+5. Upload to Hub: Specify the Hub ID, choose whether to make the dataset private, and click 'Upload to Hub'."""
 with gr.Blocks() as demo:
+    state = gr.State({})
     gr.HTML(
         """<h1 style='text-align: center;'> Corpus Creator</h1>
+        <center><i> &#128193; From random files to a Hugging Face dataset in a few steps &#128193; </i></center>"""
     )
     gr.Markdown(description)
     with gr.Row():
+        upload_button = gr.File(
+            file_types=["text"],
+            file_count="multiple",
+            height=50,
+            interactive=True,
+            label="Upload Files",
+        )
     with gr.Row():
         split_sentences = gr.Checkbox(True, label="Split sentences?")
         chunk_size = gr.Number(
             maximum=4096,
             step=1,
         )
+    update_preview_button = gr.Button("Update Preview")
+    corpus_preview_df = gr.DataFrame(label="Dataset Preview")
+    preview_summary = gr.Markdown()
+    with gr.Row():
+        gr.LoginButton()
+        with gr.Column():
+            gr.Markdown(
+                "To upload to the Hub, add an ID for where you want to push the dataset"
+            )
+            hub_id = gr.Textbox(value=None, label="Hub ID")
+            private = gr.Checkbox(False, label="Upload dataset to a private repo?")
+    upload_hub_button = gr.Button("Upload to Hub")
+    upload_summary = gr.Markdown()
     with gr.Accordion("detailed logs", open=False):
         Log(log_file, dark=True, xterm_font_size=12)
     upload_button.upload(
+        upload_and_preview,
+        inputs=[upload_button, chunk_size, chunk_overlap, split_sentences],
+        outputs=[state, corpus_preview_df, preview_summary],
+    )
+    update_preview_button.click(
+        preview_dataset,
+        inputs=[state, chunk_size, chunk_overlap, split_sentences],
+        outputs=[state, corpus_preview_df, preview_summary],
     )
+    upload_hub_button.click(
+        upload_to_hub,
+        inputs=[state, hub_id, private],
+        outputs=[upload_summary],
+    )
 demo.launch(debug=True)