Spaces:
Running
Running
Commit
•
f6712d8
1
Parent(s):
b717308
improve description
Browse files
app.py
CHANGED
@@ -156,14 +156,22 @@ def update_dataset_card(
|
|
156 |
card.push_to_hub(hub_id, token=token)
|
157 |
|
158 |
|
159 |
-
description = """
|
160 |
-
Corpus Creator is a tool designed to help you easily convert a collection of text files into a dataset suitable for various natural language processing (NLP) tasks.
|
161 |
-
In particular the app is focused on splitting texts into chunks of a specified size and overlap. This can be useful for preparing data for synthetic data generation, pipelines or annotation tasks.
|
162 |
|
163 |
-
|
|
|
|
|
|
|
|
|
164 |
|
165 |
-
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
"""
|
168 |
|
169 |
with gr.Blocks() as demo:
|
@@ -171,7 +179,7 @@ with gr.Blocks() as demo:
|
|
171 |
|
172 |
gr.HTML(
|
173 |
"""<h1 style='text-align: center;'> Corpus Creator</h1>
|
174 |
-
<center><i> 📁 From
|
175 |
)
|
176 |
gr.Markdown(description)
|
177 |
gr.Markdown(
|
|
|
156 |
card.push_to_hub(hub_id, token=token)
|
157 |
|
158 |
|
159 |
+
description = """Corpus Creator is a tool for transforming a collection of text files into a Hugging Face dataset, perfect for various natural language processing (NLP) tasks. Whether you're preparing data for synthetic generation, building pipelines, or setting up annotation tasks, this app simplifies the process.
|
|
|
|
|
160 |
|
161 |
+
Key features:
|
162 |
+
- 📁 Easy text file upload
|
163 |
+
- ✂️ Customizable text chunking
|
164 |
+
- 👁️ Instant dataset preview
|
165 |
+
- 🚀 One-click upload to Hugging Face Hub
|
166 |
|
167 |
+
#### Powered by Llama Index
|
168 |
+
|
169 |
+
Corpus Creator leverages the power of Llama Index, a data framework for LLM-based applications. Specifically, we use Llama Index's `SentenceSplitter` class to intelligently chunk your text. This ensures that your dataset is split in a way that preserves semantic meaning, making it ideal for downstream NLP tasks. [Learn more about Llama Index](https://www.llamaindex.ai/)
|
170 |
+
|
171 |
+
|
172 |
+
Get started by uploading your files and see your corpus take shape!
|
173 |
+
|
174 |
+
[View an example dataset](https://huggingface.co/datasets/davanstrien/MOH-Bethnal-Green) created with Corpus Creator.
|
175 |
"""
|
176 |
|
177 |
with gr.Blocks() as demo:
|
|
|
179 |
|
180 |
gr.HTML(
|
181 |
"""<h1 style='text-align: center;'> Corpus Creator</h1>
|
182 |
+
<center><i> 📁 From scattered files to a structured dataset in minutes 📁 </i></center>"""
|
183 |
)
|
184 |
gr.Markdown(description)
|
185 |
gr.Markdown(
|