Spaces:
Sleeping
Sleeping
Commit
·
0bfcadb
1
Parent(s):
bd9233d
ádd
Browse files
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
import spacy
|
|
|
4 |
import gradio as gr
|
5 |
import umap
|
6 |
from sklearn.cluster import OPTICS
|
@@ -27,15 +28,13 @@ ARTICLE = r"""<center>
|
|
27 |
|
28 |
def load_data(fileobj) -> pd.DataFrame:
    """Load the uploaded CSV and keep only its ``text`` column.

    Only the first 500 rows are read to keep the demo responsive, and
    malformed lines are skipped rather than aborting the whole load.

    Args:
        fileobj: CSV source handed straight to ``pandas.read_csv`` (a path
            or a readable file-like object).

    Returns:
        A single-column DataFrame containing the ``text`` column.

    Raises:
        ValueError: If the CSV has no column named ``text``.
    """
    data = pd.read_csv(fileobj, on_bad_lines='skip', nrows=500)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, which would turn this check into an opaque KeyError.
    if "text" not in data.columns:
        raise ValueError("The data must have a column named 'text'")
    return data[['text']]
|
33 |
|
34 |
|
35 |
def run_nlp_processing(data):
|
36 |
"""As reference for standard NLP processing"""
|
37 |
-
import os
|
38 |
-
|
39 |
# NLP processing
|
40 |
docs = []
|
41 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
|
@@ -130,7 +129,7 @@ with blocks:
|
|
130 |
)
|
131 |
in_file = gr.File()
|
132 |
gr.Markdown("## Inspect the data")
|
133 |
-
in_data = gr.Dataframe()
|
134 |
submit_button = gr.Button("Run BERTopic!")
|
135 |
gr.Examples(inputs=in_file, examples=EXAMPLES)
|
136 |
with gr.Column():
|
@@ -139,7 +138,7 @@ with blocks:
|
|
139 |
"Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
|
140 |
)
|
141 |
gr.Markdown("## Processed Text")
|
142 |
-
out_dataset = gr.Dataframe()
|
143 |
gr.Markdown("## Embedding + Projection + Clustering")
|
144 |
embedding_plot = gr.Plot(label="BERTopic projections")
|
145 |
gr.Markdown("## Extracted Topics")
|
@@ -147,7 +146,7 @@ with blocks:
|
|
147 |
gr.Markdown(ARTICLE)
|
148 |
# event listeners
|
149 |
in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
|
150 |
-
|
151 |
# out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
|
152 |
|
153 |
blocks.launch()
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
import spacy
|
4 |
+
import os
|
5 |
import gradio as gr
|
6 |
import umap
|
7 |
from sklearn.cluster import OPTICS
|
|
|
28 |
|
29 |
def load_data(fileobj) -> pd.DataFrame:
    """Load the uploaded CSV and keep only its ``text`` column.

    Only the first 500 rows are read to keep the demo responsive, and
    malformed lines are skipped rather than aborting the whole load.

    Args:
        fileobj: Upload handle whose ``name`` attribute is the path of the
            CSV on disk. A plain path string (or anything else
            ``pandas.read_csv`` accepts) is also tolerated.

    Returns:
        A single-column DataFrame containing the ``text`` column.

    Raises:
        ValueError: If the CSV has no column named ``text``.
    """
    # Prefer the on-disk path exposed as ``.name``; fall back to the object
    # itself so a bare path string does not fail with AttributeError.
    path = getattr(fileobj, "name", None)
    source = path if isinstance(path, str) else fileobj
    data = pd.read_csv(source, on_bad_lines='skip', nrows=500)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, which would turn this check into an opaque KeyError.
    if "text" not in data.columns:
        raise ValueError("The data must have a column named 'text'")
    return data[['text']]
|
34 |
|
35 |
|
36 |
def run_nlp_processing(data):
|
37 |
"""As reference for standard NLP processing"""
|
|
|
|
|
38 |
# NLP processing
|
39 |
docs = []
|
40 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
|
|
|
129 |
)
|
130 |
in_file = gr.File()
|
131 |
gr.Markdown("## Inspect the data")
|
132 |
+
in_data = gr.Dataframe(max_rows=5)
|
133 |
submit_button = gr.Button("Run BERTopic!")
|
134 |
gr.Examples(inputs=in_file, examples=EXAMPLES)
|
135 |
with gr.Column():
|
|
|
138 |
"Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
|
139 |
)
|
140 |
gr.Markdown("## Processed Text")
|
141 |
+
out_dataset = gr.Dataframe(max_rows=5)
|
142 |
gr.Markdown("## Embedding + Projection + Clustering")
|
143 |
embedding_plot = gr.Plot(label="BERTopic projections")
|
144 |
gr.Markdown("## Extracted Topics")
|
|
|
146 |
gr.Markdown(ARTICLE)
|
147 |
# event listeners
|
148 |
in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
|
149 |
+
submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
|
150 |
# out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
|
151 |
|
152 |
blocks.launch()
|