lhoestq HF staff committed on
Commit
e93b0ba
·
1 Parent(s): 43037cf

update app

Browse files
Files changed (1) hide show
  1. app.py +152 -9
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import Iterator
2
 
3
  import gradio as gr
@@ -7,10 +8,10 @@ from huggingface_hub import InferenceClient
7
  model_id = "microsoft/Phi-3-mini-4k-instruct"
8
  client = InferenceClient(model_id)
9
 
10
- GENERATE_DATASET_NAMES_FOR_QUERY = (
11
- "A Machine Learning Practioner is looking for a dataset that matches '{query}'. "
12
  "Generate a list of 10 names of quality dataset that don't exist but sound plausible and would "
13
- "be helpful. Feel free to reuse words from the query '{query}' to name the datasets. "
14
  "Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
15
  )
16
 
@@ -24,12 +25,154 @@ def stream_reponse(msg: str) -> Iterator[str]:
24
  yield message.choices[0].delta.content
25
 
26
 
27
- def gen_datasets(query: str) -> Iterator[str]:
28
- output = ""
29
- for token in stream_reponse(GENERATE_DATASET_NAMES_FOR_QUERY.format(query=query)):
30
- output += token
31
- yield output
 
 
 
 
32
 
 
33
 
34
- demo = gr.Interface(fn=gen_datasets, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  demo.launch()
 
1
+ from functools import partial
2
  from typing import Iterator
3
 
4
  import gradio as gr
 
8
  model_id = "microsoft/Phi-3-mini-4k-instruct"
9
  client = InferenceClient(model_id)
10
 
11
# Prompt template sent to the model; fill `{search_query}` with str.format().
# The requested count (10) matches the 10 result slots shown per page, and the
# "N. DatasetName (tag1, tag2, tag3)" format is what the result parser expects.
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
    "A Machine Learning Practitioner is looking for a dataset that matches '{search_query}'. "
    "Generate a list of 10 names of quality datasets that don't exist but sound plausible and would "
    "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
    "Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
)
17
 
 
25
  yield message.choices[0].delta.content
26
 
27
 
28
def gen_datasets(search_query: str) -> Iterator[str]:
    """Stream the model's list of made-up dataset names for a search query.

    A blank (empty or whitespace-only) query falls back to
    "topic classification".

    Yields:
        The stripped text generated so far, once each time the accumulated
        text ends with a newline, and a final time when the stream is over.
    """
    search_query = search_query if search_query.strip() else "topic classification"
    prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)
    generated_text = ""
    for token in stream_reponse(prompt):
        if not token:
            # A streamed delta's content is optional; skip chunks without text
            # instead of failing on `str += None`.
            continue
        generated_text += token
        if generated_text.endswith("\n"):
            yield generated_text.strip()
    yield generated_text.strip()
    # Leave a trace of the raw generation on stdout.
    print("-----\n\n" + generated_text)
37
 
38
# Number of dataset entries displayed on one results page.
NB_ITEMS_PER_PAGE = 10

# Entries shown before any search has been run, one "N. Name (tag, tag, tag)"
# string per result slot.
default_output = [
    "1. NewsArticleCollection (BreakingNews, VietnameseNewsTrends, CountrySpecificTopics)",
    "2. ScienceJournalDataset (AstrophysicsTrends, EcologyPatterns, QuantumMechanicsInsights)",
    "3. TechnologyReviewDB (TechInnovationSurges, MobileDevicesAnalysis, CybersecurityBreachStudies)",
    "4. BusinessWeeklyReports (MarketTrends, E-commerceGrowth, CorporateChangeDynamics)",
    "5. HealthResearchArchive (PandemicPatterns, DiseaseOutbreakInferences, WellnessTrends)",
    "6. SportsDataCorpus (ExerciseRoutineShifts, ProfessionalLeagueShifts, InjuryImpactAnalysis)",
    "7. EducationSectorStatistics (OnlineEducationAdoption, CurriculumImpactStudies, TeacherTrainingAmendments)",
    "8. CinemaCritiqueBank (FilmGenreRotation, HollywoodProductionImpacts, GlobalEntertainmentSurveys)",
    "9. CulturalShiftSamples (FoodCuisineEvolution, SocialMediaInfluence, ArtTrendsEvolution)",
    "10. LocalLifestyleSections (UrbanAgricultureInfluence, EcoFriendlyLiving, SustainableTransportationTrends)",
]
# Sanity check: there must be exactly one default entry per result slot.
assert len(default_output) == NB_ITEMS_PER_PAGE
53
+
54
+ css = """
55
+ .datasetButton {
56
+ justify-content: start;
57
+ justify-content: left;
58
+ }
59
+ .tags {
60
+ font-size: var(--button-small-text-size);
61
+ color: var(--body-text-color-subdued);
62
+ }
63
+ a {
64
+ color: var(--body-text-color);
65
+ }
66
+ .topButton {
67
+ justify-content: start;
68
+ justify-content: left;
69
+ text-align: left;
70
+ background: transparent;
71
+ box-shadow: none;
72
+ padding-bottom: 0;
73
+ }
74
+ .topButton::before {
75
+ content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E");
76
+ margin-right: .25rem;
77
+ margin-left: -.125rem;
78
+ margin-top: .25rem;
79
+ }
80
+ .bottomButton {
81
+ justify-content: start;
82
+ justify-content: left;
83
+ text-align: left;
84
+ background: transparent;
85
+ box-shadow: none;
86
+ font-size: var(--button-small-text-size);
87
+ color: var(--body-text-color-subdued);
88
+ padding-top: 0;
89
+ align-items: baseline;
90
+ }
91
+ .bottomButton::before {
92
+ content: 'tags:\002'
93
+ }
94
+ .buttonsGroup {
95
+ background: transparent;
96
+ }
97
+ .buttonsGroup:hover {
98
+ background: var(--input-background-fill);
99
+ }
100
+ .buttonsGroup div {
101
+ background: transparent;
102
+ }
103
+ @keyframes placeHolderShimmer{
104
+ 0%{
105
+ background-position: -468px 0
106
+ }
107
+ 100%{
108
+ background-position: 468px 0
109
+ }
110
+ }
111
+ .linear-background {
112
+ animation-duration: 1s;
113
+ animation-fill-mode: forwards;
114
+ animation-iteration-count: infinite;
115
+ animation-name: placeHolderShimmer;
116
+ animation-timing-function: linear;
117
+ background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%);
118
+ background-size: 1000px 104px;
119
+ color: transparent;
120
+ background-clip: text;
121
+ }
122
+ """
123
+
124
def _parse_dataset_line(line):
    """Parse one "N. DatasetName (tag1, tag2)" line into a (name, tags) pair.

    Returns None when the line is not a numbered entry (no "." or a
    non-numeric prefix). Tags are "" when the line has no " (" separator.
    """
    number, dot, entry = line.partition(".")
    if not dot or not number.isnumeric():
        return None
    # Drop surrounding spaces and the closing parenthesis before splitting.
    dataset_name, _, tags = entry.strip(" )").partition(" (")
    return dataset_name, tags


def search_datasets(search_query):
    """Stream button updates for the datasets generated for `search_query`.

    Starts from NB_ITEMS_PER_PAGE placeholder (name, tags) button pairs and
    replaces them, in order, with the entries parsed from the text generated
    so far.

    Yields:
        A flat list of 2 * NB_ITEMS_PER_PAGE buttons: name button then tags
        button for each result slot.

    Raises:
        gr.Error: if the generated text contains "I'm sorry".
    """
    output_values = [
        gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background"),
        gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background"),
    ] * NB_ITEMS_PER_PAGE
    for generated_text in gen_datasets(search_query):
        if "I'm sorry" in generated_text:
            raise gr.Error("Error: inappropriate content")
        parsed = (_parse_dataset_line(line) for line in generated_text.split("\n"))
        # Lines that are not numbered entries are ignored; malformed entries no
        # longer abort the whole search with IndexError/ValueError.
        entries = [entry for entry in parsed if entry is not None][:NB_ITEMS_PER_PAGE]
        for i, (dataset_name, tags) in enumerate(entries):
            output_values[2 * i] = gr.Button(dataset_name, elem_classes="topButton")
            output_values[2 * i + 1] = gr.Button(tags, elem_classes="bottomButton")
        yield output_values
138
+
139
+
140
def show_dataset(*buttons_values, i):
    """Describe the i-th dataset of a flat (name, tags, name, tags, ...) sequence.

    `i` is keyword-only; the result is the debug-style string
    "dataset_name=<repr>, tags=<repr>" for that pair.
    """
    offset = 2 * i
    selected_pair = buttons_values[offset : offset + 2]
    dataset_name, tags = selected_pair
    return f"{dataset_name=}, {tags=}"
143
+
144
+
145
# Page layout: title, a centered search row, a Markdown output area, and one
# (name, tags) button pair per result slot, pre-filled from `default_output`.
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "# 🤗 Infinite Dataset Hub\n\n"
        f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
    )
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
        with gr.Column(scale=9):
            search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
        with gr.Column(min_width=64):
            search_button = gr.Button("🔍", variant="primary")
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
    output = gr.Markdown()
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
        with gr.Column(scale=10):
            # Flat list [name_0, tags_0, name_1, tags_1, ...]; search_datasets
            # yields 2 * NB_ITEMS_PER_PAGE values, so the slot count must match.
            buttons = []
            for i in range(NB_ITEMS_PER_PAGE):
                line = default_output[i]
                dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                with gr.Group(elem_classes="buttonsGroup"):
                    top = gr.Button(dataset_name, elem_classes="topButton")
                    bottom = gr.Button(tags, elem_classes="bottomButton")
                    buttons += [top, bottom]
                    # partial() binds the current slot index to each handler.
                    top.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
                    bottom.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
        with gr.Column(scale=4, min_width=0):
            pass  # empty spacer column
    # Pressing Enter in the search bar and clicking the search button both run a search.
    search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
    search_button.click(search_datasets, inputs=search_bar, outputs=buttons)
demo.launch()