from functools import partial
from typing import Iterator

import gradio as gr
from huggingface_hub import InferenceClient
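
# Text-generation model served through the Hugging Face serverless Inference API.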
model_id = "microsoft/Phi-3-mini-4k-instruct"
client = InferenceClient(model_id)
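
# Prompt template: asks the model for 10 plausible-sounding (but fictional) dataset names
# that match the search query, each followed by descriptive tags in parentheses, so that
# the output can be parsed line by line.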
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
    "A Machine Learning Practitioner is looking for a dataset that matches '{search_query}'. "
    "Generate a list of 10 names of quality datasets that don't exist but sound plausible and would "
    "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
    "Give each dataset descriptive tags/keywords and use the following format:\n1. DatasetName (tag1, tag2, tag3)"
)
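

# Stream the chat completion token by token so the UI can be updated as text arrives.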
def stream_response(msg: str) -> Iterator[str]:
    for message in client.chat_completion(
        messages=[{"role": "user", "content": msg}],
        max_tokens=500,
        stream=True,
    ):
        yield message.choices[0].delta.content
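

# Accumulate the streamed tokens and yield the text generated so far every time a full
# line is completed, plus once more at the end; the final text is also printed to the logs.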
def gen_datasets(search_query: str) -> Iterator[str]:
    # Fall back to a default query if the search box is empty
    search_query = search_query if search_query.strip() else "topic classification"
    generated_text = ""
    for token in stream_response(GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query)):
        generated_text += token
        if generated_text.endswith("\n"):
            yield generated_text.strip()
    yield generated_text.strip()
    print("-----\n\n" + generated_text)
NB_ITEMS_PER_PAGE = 10

default_output = """
1. NewsArticleCollection (BreakingNews, VietnameseNewsTrends, CountrySpecificTopics)
2. ScienceJournalDataset (AstrophysicsTrends, EcologyPatterns, QuantumMechanicsInsights)
3. TechnologyReviewDB (TechInnovationSurges, MobileDevicesAnalysis, CybersecurityBreachStudies)
4. BusinessWeeklyReports (MarketTrends, E-commerceGrowth, CorporateChangeDynamics)
5. HealthResearchArchive (PandemicPatterns, DiseaseOutbreakInferences, WellnessTrends)
6. SportsDataCorpus (ExerciseRoutineShifts, ProfessionalLeagueShifts, InjuryImpactAnalysis)
7. EducationSectorStatistics (OnlineEducationAdoption, CurriculumImpactStudies, TeacherTrainingAmendments)
8. CinemaCritiqueBank (FilmGenreRotation, HollywoodProductionImpacts, GlobalEntertainmentSurveys)
9. CulturalShiftSamples (FoodCuisineEvolution, SocialMediaInfluence, ArtTrendsEvolution)
10. LocalLifestyleSections (UrbanAgricultureInfluence, EcoFriendlyLiving, SustainableTransportationTrends)
""".strip().split("\n")
assert len(default_output) == NB_ITEMS_PER_PAGE
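
# Custom CSS: each dataset card is rendered as two stacked transparent buttons (name on top,
# tags below) inside a group, and the "linear-background" class provides a shimmer animation
# used as a loading placeholder while results are streaming in.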
css = """ | |
.datasetButton { | |
justify-content: start; | |
justify-content: left; | |
} | |
.tags { | |
font-size: var(--button-small-text-size); | |
color: var(--body-text-color-subdued); | |
} | |
a { | |
color: var(--body-text-color); | |
} | |
.topButton { | |
justify-content: start; | |
justify-content: left; | |
text-align: left; | |
background: transparent; | |
box-shadow: none; | |
padding-bottom: 0; | |
} | |
.topButton::before { | |
content: url("data:image/svg+xml,%3Csvg style='color: rgb(209 213 219)' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' aria-hidden='true' focusable='false' role='img' width='1em' height='1em' preserveAspectRatio='xMidYMid meet' viewBox='0 0 25 25'%3E%3Cellipse cx='12.5' cy='5' fill='currentColor' fill-opacity='0.25' rx='7.5' ry='2'%3E%3C/ellipse%3E%3Cpath d='M12.5 15C16.6421 15 20 14.1046 20 13V20C20 21.1046 16.6421 22 12.5 22C8.35786 22 5 21.1046 5 20V13C5 14.1046 8.35786 15 12.5 15Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M12.5 7C16.6421 7 20 6.10457 20 5V11.5C20 12.6046 16.6421 13.5 12.5 13.5C8.35786 13.5 5 12.6046 5 11.5V5C5 6.10457 8.35786 7 12.5 7Z' fill='currentColor' opacity='0.5'%3E%3C/path%3E%3Cpath d='M5.23628 12C5.08204 12.1598 5 12.8273 5 13C5 14.1046 8.35786 15 12.5 15C16.6421 15 20 14.1046 20 13C20 12.8273 19.918 12.1598 19.7637 12C18.9311 12.8626 15.9947 13.5 12.5 13.5C9.0053 13.5 6.06886 12.8626 5.23628 12Z' fill='currentColor'%3E%3C/path%3E%3C/svg%3E"); | |
margin-right: .25rem; | |
margin-left: -.125rem; | |
margin-top: .25rem; | |
} | |
.bottomButton { | |
justify-content: start; | |
justify-content: left; | |
text-align: left; | |
background: transparent; | |
box-shadow: none; | |
font-size: var(--button-small-text-size); | |
color: var(--body-text-color-subdued); | |
padding-top: 0; | |
align-items: baseline; | |
} | |
.bottomButton::before { | |
content: 'tags:\002' | |
} | |
.buttonsGroup { | |
background: transparent; | |
} | |
.buttonsGroup:hover { | |
background: var(--input-background-fill); | |
} | |
.buttonsGroup div { | |
background: transparent; | |
} | |
@keyframes placeHolderShimmer{ | |
0%{ | |
background-position: -468px 0 | |
} | |
100%{ | |
background-position: 468px 0 | |
} | |
} | |
.linear-background { | |
animation-duration: 1s; | |
animation-fill-mode: forwards; | |
animation-iteration-count: infinite; | |
animation-name: placeHolderShimmer; | |
animation-timing-function: linear; | |
background-image: linear-gradient(to right, var(--body-text-color-subdued) 8%, #dddddd11 18%, var(--body-text-color-subdued) 33%); | |
background-size: 1000px 104px; | |
color: transparent; | |
background-clip: text; | |
} | |
""" | |
def search_datasets(search_query):
    # Start with placeholder buttons that shimmer while the model generates results
    output_values = [
        gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background"),
        gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background"),
    ] * NB_ITEMS_PER_PAGE
    for generated_text in gen_datasets(search_query):
        if "I'm sorry" in generated_text:
            raise gr.Error("Error: inappropriate content")
        # Keep only the numbered lines ("1. DatasetName (tag1, tag2)") generated so far
        lines = [line for line in generated_text.split("\n") if line and line.split(".", 1)[0].isnumeric()][:NB_ITEMS_PER_PAGE]
        for i, line in enumerate(lines):
            dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
            output_values[2 * i] = gr.Button(dataset_name, elem_classes="topButton")
            output_values[2 * i + 1] = gr.Button(tags, elem_classes="bottomButton")
        yield output_values
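

# Every dataset card is a pair of buttons (name, tags); the index bound with functools.partial
# identifies which pair was clicked.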
def show_dataset(*buttons_values, i):
    dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
    return f"{dataset_name=}, {tags=}"
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "# 🤗 Infinite Dataset Hub\n\n"
        f"_powered by [{model_id}](https://huggingface.co/{model_id})_"
    )
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass
        with gr.Column(scale=9):
            search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets", show_label=False, container=False)
        with gr.Column(min_width=64):
            search_button = gr.Button("🔍", variant="primary")
        with gr.Column(scale=4, min_width=0):
            pass
    output = gr.Markdown()
    with gr.Row():
        with gr.Column(scale=4, min_width=0):
            pass
        with gr.Column(scale=10):
            buttons = []
            for i in range(NB_ITEMS_PER_PAGE):
                line = default_output[i]
                dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                with gr.Group(elem_classes="buttonsGroup"):
                    top = gr.Button(dataset_name, elem_classes="topButton")
                    bottom = gr.Button(tags, elem_classes="bottomButton")
                buttons += [top, bottom]
                top.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
                bottom.click(partial(show_dataset, i=i), inputs=buttons, outputs=output)
        with gr.Column(scale=4, min_width=0):
            pass
    search_bar.submit(search_datasets, inputs=search_bar, outputs=buttons)
    search_button.click(search_datasets, inputs=search_bar, outputs=buttons)

demo.launch()
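
# To run locally (a sketch, assuming this file is saved as app.py per the usual
# Hugging Face Spaces convention):
#   pip install gradio huggingface_hub
#   python app.py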