Spaces:
Sleeping
Sleeping
update app
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from typing import Iterator
|
2 |
|
3 |
import gradio as gr
|
@@ -7,10 +8,10 @@ from huggingface_hub import InferenceClient
|
|
7 |
model_id = "microsoft/Phi-3-mini-4k-instruct"
|
8 |
client = InferenceClient(model_id)
|
9 |
|
10 |
-
|
11 |
-
"A Machine Learning Practioner is looking for a dataset that matches '{
|
12 |
"Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
|
13 |
-
"be helpful. Feel free to reuse words from the query '{
|
14 |
"Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
|
15 |
)
|
16 |
|
@@ -24,12 +25,154 @@ def stream_reponse(msg: str) -> Iterator[str]:
|
|
24 |
yield message.choices[0].delta.content
|
25 |
|
26 |
|
27 |
-
def gen_datasets(
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
32 |
|
|
|
33 |
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
demo.launch()
|
|
|
1 |
+
from functools import partial
|
2 |
from typing import Iterator
|
3 |
|
4 |
import gradio as gr
|
|
|
8 |
model_id = "microsoft/Phi-3-mini-4k-instruct"
|
9 |
client = InferenceClient(model_id)
|
10 |
|
11 |
+
# Prompt template sent to the chat model. ``{search_query}`` appears twice and
# is filled in with ``str.format``; the last line shows the model the exact
# "N. DatasetName (tag1, tag2, tag3)" layout it is asked to produce.
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = "".join(
    (
        "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. ",
        "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would be helpful. ",
        "Feel free to reuse words from the query '{search_query}' to name the datasets. ",
        "Give each dataset descriptive tags/keywords and use the following format:",
        "\n1. DatasetName (tag1, tag2, tag3)",
    )
)
|
17 |
|
|
|
25 |
yield message.choices[0].delta.content
|
26 |
|
27 |
|
28 |
+
def gen_datasets(search_query: str) -> Iterator[str]:
    """Stream the model's dataset-name suggestions for a search query.

    The query is substituted into ``GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY``
    and the resulting prompt is passed to ``stream_reponse``. Tokens are
    accumulated into one string.

    Args:
        search_query: Free-text query. ``None``, an empty string or a
            whitespace-only string falls back to "topic classification".

    Yields:
        The whole text generated so far, whitespace-stripped: once each time
        the accumulated text ends with a newline, and one final time when the
        token stream is exhausted.
    """
    if not search_query or not search_query.strip():
        search_query = "topic classification"
    prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)

    generated_text = ""
    for token in stream_reponse(prompt):
        if not token:
            # The stream forwards each chunk's delta content unchanged, and a
            # chunk may carry no content (None); there is nothing to append
            # and concatenating None to a str would raise TypeError.
            continue
        generated_text += token
        if generated_text.endswith("\n"):
            # Only publish at line boundaries so consumers see whole lines.
            yield generated_text.strip()
    yield generated_text.strip()
    # Trace of the raw completion on stdout; only reached when the consumer
    # iterates the generator to exhaustion.
    print("-----\n\n" + generated_text)
|
37 |
|
38 |
+
# Number of (name, tags) entries displayed on the page.
NB_ITEMS_PER_PAGE = 10

# Entries shown before any search has run, one "N. Name (tag, tag, tag)"
# string per item.
default_output = [
    "1. NewsArticleCollection (BreakingNews, VietnameseNewsTrends, CountrySpecificTopics)",
    "2. ScienceJournalDataset (AstrophysicsTrends, EcologyPatterns, QuantumMechanicsInsights)",
    "3. TechnologyReviewDB (TechInnovationSurges, MobileDevicesAnalysis, CybersecurityBreachStudies)",
    "4. BusinessWeeklyReports (MarketTrends, E-commerceGrowth, CorporateChangeDynamics)",
    "5. HealthResearchArchive (PandemicPatterns, DiseaseOutbreakInferences, WellnessTrends)",
    "6. SportsDataCorpus (ExerciseRoutineShifts, ProfessionalLeagueShifts, InjuryImpactAnalysis)",
    "7. EducationSectorStatistics (OnlineEducationAdoption, CurriculumImpactStudies, TeacherTrainingAmendments)",
    "8. CinemaCritiqueBank (FilmGenreRotation, HollywoodProductionImpacts, GlobalEntertainmentSurveys)",
    "9. CulturalShiftSamples (FoodCuisineEvolution, SocialMediaInfluence, ArtTrendsEvolution)",
    "10. LocalLifestyleSections (UrbanAgricultureInfluence, EcoFriendlyLiving, SustainableTransportationTrends)",
]
# Sanity check: there must be exactly one default entry per page slot.
assert len(default_output) == NB_ITEMS_PER_PAGE
|
53 |
+
|
54 |
+
# Stylesheet handed to gr.Blocks(css=...). It styles the paired dataset
# buttons (.topButton / .bottomButton inside .buttonsGroup) and defines the
# .linear-background shimmer animation used by placeholder buttons.
# NOTE(review): this is a regular (non-raw) string, so the `\002` in the
# .bottomButton::before rule is a Python octal escape: the stylesheet receives
# the single control character U+0002 there - confirm that is intended.
css = """
.datasetButton {
justify-content: start;
justify-content: left;
}
.tags {
font-size: var(--button-small-text-size);
color: var(--body-text-color-subdued);
}
a {
color: var(--body-text-color);
}
.topButton {
justify-content: start;
justify-content: left;
text-align: left;
background: transparent;
box-shadow: none;
padding-bottom: 0;
}
.topButton::before {
content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E");
margin-right: .25rem;
margin-left: -.125rem;
margin-top: .25rem;
}
.bottomButton {
justify-content: start;
justify-content: left;
text-align: left;
background: transparent;
box-shadow: none;
font-size: var(--button-small-text-size);
color: var(--body-text-color-subdued);
padding-top: 0;
align-items: baseline;
}
.bottomButton::before {
content: 'tags:\002'
}
.buttonsGroup {
background: transparent;
}
.buttonsGroup:hover {
background: var(--input-background-fill);
}
.buttonsGroup div {
background: transparent;
}
@keyframes placeHolderShimmer{
0%{
background-position: -468px 0
}
100%{
background-position: 468px 0
}
}
.linear-background {
animation-duration: 1s;
animation-fill-mode: forwards;
animation-iteration-count: infinite;
animation-name: placeHolderShimmer;
animation-timing-function: linear;
background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%);
background-size: 1000px 104px;
color: transparent;
background-clip: text;
}
"""
|
123 |
+
|
124 |
+
def search_datasets(search_query):
    """Stream button updates for the datasets generated for ``search_query``.

    Starts from ``NB_ITEMS_PER_PAGE`` placeholder (name, tags) button pairs
    and, for every partial text produced by ``gen_datasets``, overwrites the
    pair of each numbered line that could be parsed.

    Args:
        search_query: Query text forwarded to ``gen_datasets``.

    Yields:
        A flat list of ``2 * NB_ITEMS_PER_PAGE`` ``gr.Button`` objects laid
        out as name, tags, name, tags, ... (the same list object each time,
        updated in place).

    Raises:
        gr.Error: If the generated text contains "I'm sorry".
    """
    # NOTE(review): the placeholder labels below look mis-encoded in this copy
    # of the file; they are kept verbatim - confirm the intended glyphs.
    output_values = [
        gr.Button("β¬β¬β¬β¬β¬β¬", elem_classes="topButton linear-background"),
        gr.Button("ββββ, ββββ, ββββ", elem_classes="bottomButton linear-background"),
    ] * NB_ITEMS_PER_PAGE
    for generated_text in gen_datasets(search_query):
        if "I'm sorry" in generated_text:
            raise gr.Error("Error: inappropriate content")
        # Only "<number>.<rest>" lines are list entries; the i-th such line
        # fills the i-th button pair.
        numbered_lines = [
            line
            for line in generated_text.split("\n")
            if "." in line and line.split(".", 1)[0].isnumeric()
        ][:NB_ITEMS_PER_PAGE]
        for i, line in enumerate(numbered_lines):
            parsed = _parse_dataset_line(line)
            if parsed is None:
                # Malformed entry: leave whatever this slot already shows
                # instead of aborting the whole stream.
                continue
            dataset_name, tags = parsed
            output_values[2 * i] = gr.Button(dataset_name, elem_classes="topButton")
            output_values[2 * i + 1] = gr.Button(tags, elem_classes="bottomButton")
        yield output_values


def _parse_dataset_line(line):
    """Split "<n>. Name (tag1, tag2)" into ``(name, tags)``.

    Returns ``None`` when the line has no "." separator or no name after it.
    A line without a " (" tag section yields the name with empty tags.
    """
    _, dot, rest = line.partition(".")
    if not dot:
        return None
    # Drop surrounding spaces and the closing parenthesis of the tag list.
    name, _, tags = rest.strip(" )").partition(" (")
    if not name:
        return None
    return name, tags
|
138 |
+
|
139 |
+
|
140 |
+
def show_dataset(*buttons_values, i):
    """Describe the dataset whose button pair sits at position ``i``.

    Args:
        *buttons_values: Flat sequence of button values laid out as
            name, tags, name, tags, ...
        i: Keyword-only index of the pair to read.

    Returns:
        A string of the form ``dataset_name='...', tags='...'``.

    Raises:
        ValueError: If the slice for pair ``i`` does not hold exactly two
            values.
    """
    start = 2 * i
    pair = buttons_values[start : start + 2]
    dataset_name, tags = pair
    return f"{dataset_name=}, {tags=}"
|
143 |
+
|
144 |
+
|
145 |
+
# Page layout: a search bar flanked by empty spacer columns, a Markdown area
# that receives the description of the clicked dataset, and
# NB_ITEMS_PER_PAGE (name, tags) button pairs pre-filled from default_output.
with gr.Blocks(css=css) as demo:
    # NOTE(review): the emoji in this heading looks mis-encoded in this copy
    # of the file; it is kept verbatim - confirm the intended character.
    gr.Markdown(
        "# π€ Infinite Dataset Hub\n\n"
        f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
    )
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
        with gr.Column(scale=9):
            search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
        with gr.Column(min_width=64):
            # NOTE(review): button label also looks mis-encoded; kept verbatim.
            search_button = gr.Button("π", variant="primary")
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
    output = gr.Markdown()
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
        with gr.Column(scale=10):
            buttons = []
            # The number of pairs must match what search_datasets yields
            # (2 * NB_ITEMS_PER_PAGE values), so use the shared constant
            # rather than a literal.
            for i in range(NB_ITEMS_PER_PAGE):
                line = default_output[i]
                dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                with gr.Group(elem_classes="buttonsGroup"):
                    top = gr.Button(dataset_name, elem_classes="topButton")
                    bottom = gr.Button(tags, elem_classes="bottomButton")
                buttons += [top, bottom]
                # partial() fixes this pair's index for the handler. The list
                # passed as inputs holds the pairs created so far, including
                # pair i, which is the only slice show_dataset reads.
                top.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
                bottom.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
    # Pressing Enter in the search bar and clicking the search button both
    # stream new values into every button.
    search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
    search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
demo.launch()
|