import os

# hf_transfer speeds up downloads from the Hugging Face Hub, but
# huggingface_hub reads this variable at import time, so it must be set before
# the libraries below are imported.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import arxiv
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from cachetools import TTLCache, cached
from setfit import SetFitModel
from tqdm.auto import tqdm

CACHE_TIME = 60 * 60 * 12  # 12 hours
MAX_RESULTS = 1_000
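

# Fetch the most recent arXiv papers matching the search query. The TTL cache
# means the arXiv API is queried at most once per 12-hour window, however
# often the UI asks. (Note: newer releases of the `arxiv` package deprecate
# Search.results() in favour of arxiv.Client().results(search).)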
@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def get_arxiv_result():
    search = arxiv.Search(
        query="ti:dataset AND abs:machine learning",
        max_results=MAX_RESULTS,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    return [
        {
            "title": result.title,
            "abstract": result.summary,
            "url": result.entry_id,
            "category": result.primary_category,
            "updated": result.updated,
        }
        for result in tqdm(search.results(), total=MAX_RESULTS)
    ]
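

# Load the SetFit classifier used to flag papers that introduce a new dataset.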
def load_model():
    return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
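

# Combine title and abstract into the single text input passed to the
# classifier.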
def format_row_for_model(row):
    return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
int2label = {0: "new_dataset", 1: "not_new_dataset"}
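

# Classify the papers in batches, attaching the predicted label and its
# probability to each row.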
def get_predictions(data: list[dict], model=None, batch_size=64):
    if model is None:
        model = load_model()
    predictions = []
    for i in tqdm(range(0, len(data), batch_size)):
        batch = data[i : i + batch_size]
        text_inputs = [format_row_for_model(row) for row in batch]
        batch_predictions = model.predict_proba(text_inputs)
        for j, row in enumerate(batch):
            prediction = batch_predictions[j]
            row["prediction"] = int2label[int(prediction.argmax())]
            row["probability"] = float(prediction.max())
            predictions.append(row)
    return predictions
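

# Render a single paper as a Markdown block for the results feed.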
def create_markdown(row):
    title = row["title"]
    abstract = row["abstract"]
    arxiv_id = row["arxiv_id"]
    hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
    updated = row["updated"].strftime("%Y-%m-%d")
    broad_category = row["broad_category"]
    category = row["category"]
    return f"""## {title}

Updated: {updated} | Category: {broad_category} | Subcategory: {category}

{abstract}

[Hugging Face Papers page]({hub_paper_url})
"""
@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
def prepare_data():
    print("Downloading arXiv results...")
    arxiv_results = get_arxiv_result()
    print("Loading model...")
    model = load_model()
    print("Making predictions...")
    predictions = get_predictions(arxiv_results, model=model)
    df = pd.DataFrame(predictions)
    df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
    df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
    df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
    return df
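

# Category choices for the dropdown filters; prepare_data() is cached, so the
# repeated calls reuse the same DataFrame.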
all_possible_arxiv_categories = sorted(prepare_data().category.unique().tolist())
all_broad_categories = sorted(prepare_data().broad_category.unique().tolist())
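

# Build the Markdown feed for the current filter settings. Narrow categories,
# when selected, take precedence over the broad-category filter.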
def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
    df = prepare_data()
    if new_only:
        df = df[df["prediction"] == "new_dataset"]
    # Gradio passes an empty list (not None) when a multiselect is cleared, so
    # test truthiness rather than `is not None`.
    if narrow_categories:
        df = df[df["category"].isin(narrow_categories)]
    if categories and not narrow_categories:
        df = df[df["broad_category"].isin(categories)]
    results = "# arXiv papers related to datasets\n\n"
    results += f"Number of results: {len(df)}\n\n"
    results += "\n\n".join(df["markdown"].tolist())
    return results
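

# Refresh the data once a day. By 03:30 the 12-hour TTL has usually lapsed, so
# this run recomputes prepare_data() rather than returning the cached result.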
scheduler = BackgroundScheduler()
scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
scheduler.start()
description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
The Space works by:
- searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
- passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not
This Space is a work in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
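

# Gradio UI: a broad-category filter, an optional narrow-category override,
# and a "new datasets only" toggle; each control re-renders the Markdown feed.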
with gr.Blocks() as demo:
    gr.Markdown("# ✨ New Datasets in Machine Learning ✨")
    gr.Markdown(description)
    with gr.Row():
        broad_categories = gr.Dropdown(
            choices=all_broad_categories,
            label="Broad arXiv Category",
            multiselect=True,
            value="cs",
        )
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown(
            "Narrow by arXiv categories. **Note** this will take precedence over the"
            " broad category selection."
        )
        narrow_categories = gr.Dropdown(
            choices=all_possible_arxiv_categories,
            value=None,
            multiselect=True,
            label="Narrow arXiv Category",
        )
        gr.ClearButton(narrow_categories, value="Clear Narrow Categories", size="sm")
    with gr.Row():
        new_only = gr.Checkbox(value=True, label="New Datasets Only")
    results = gr.Markdown(create_markdown_summary())
    broad_categories.change(
        create_markdown_summary,
        inputs=[broad_categories, new_only, narrow_categories],
        outputs=results,
    )
    narrow_categories.change(
        create_markdown_summary,
        inputs=[broad_categories, new_only, narrow_categories],
        outputs=results,
    )
    new_only.change(
        create_markdown_summary,
        inputs=[broad_categories, new_only, narrow_categories],
        outputs=results,
    )
demo.launch()