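# Build the paper table for the CVPR 2025 Space: load the base dataset, apply
# the latest patched values on top, and derive markdown columns for display.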
import datasets
import polars as pl
from loguru import logger
from polars import datatypes as pdt
BASE_REPO_ID = "ai-conferences/CVPR2025"
PATCH_REPO_ID = "ai-conferences/CVPR2025-patches"
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"
def get_patch_latest_values(
df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
) -> pl.DataFrame:
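    """Collapse timestamped patch rows into one row of latest values per id.

    Rows are melted to (id, column, value) triples, the most recent non-null
    value per (id, column) pair is kept, and the result is pivoted back to one
    row per id. List columns are joined into delimited strings before the
    unpivot so every value shares one dtype, then split back into lists after
    the pivot. Columns never touched by a patch are filled with nulls so the
    output matches `all_columns`.
    """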
df = df.sort(timestamp_col)
list_cols = [
col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
]
df = df.with_columns(
[
pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
for c in list_cols
]
)
update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()
latest_rows = (
melted.sort(timestamp_col)
.group_by([id_col, "variable"])
.agg(pl.col("value").last())
.pivot("variable", index=id_col, values="value")
)
latest_rows = latest_rows.with_columns(
[
pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
for c in list_cols
]
)
missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
if missing_cols:
latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])
return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])
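
# Sketch of get_patch_latest_values on hypothetical patch rows:
#
#   paper_id  timestamp  title    github
#   "p1"      t0         null     "https://github.com/org/repo"
#   "p1"      t1         "Fixed"  null
#
# yields one row ("p1", title="Fixed", github="https://github.com/org/repo"):
# each column independently keeps its most recent non-null value.
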
def format_author_claim_ratio(row: dict) -> str:
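    """Format the "linked/total ✅" author ratio; the check mark appears only
    when at least one author has a linked account."""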
n_linked_authors = row["n_linked_authors"]
n_authors = row["n_authors"]
if n_linked_authors is None or n_authors is None:
return ""
author_linked = "✅" if n_linked_authors > 0 else ""
return f"{n_linked_authors}/{n_authors} {author_linked}".strip()
df_orig = (
datasets.load_dataset(BASE_REPO_ID, split="train")
.to_polars()
.rename({"cvf_url": "cvf"})
.with_columns(
pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
)
)
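
# Paper-page metadata (e.g. author_usernames) is joined on arxiv_id; unused
# text columns are dropped.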
df_paper_page = (
datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
.to_polars()
.drop(["summary", "author_names", "ai_keywords"])
)
df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")
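
# Overlay the most recent patch values on the base table; if the patch dataset
# is unavailable or malformed, log the error and keep the unpatched data.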
try:
df_patches = (
datasets.load_dataset(PATCH_REPO_ID, split="train")
.to_polars()
.drop("diff")
.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
)
df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
df_orig = (
df_orig.join(df_patches, on="paper_id", how="left")
.with_columns(
[
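                # prefer the patched value (the join suffixes patch columns
                # with "_right"); fall back to the original column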
pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col)
for col in df_orig.columns
if col != "paper_id"
]
)
.select(df_orig.columns)
)
except Exception as e: # noqa: BLE001
logger.warning(e)
# format authors
df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
# format links
df_orig = df_orig.with_columns(
[
pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
for col in ["cvf", "project_page", "github"]
]
)
# format paper page link
df_orig = df_orig.with_columns(
(pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
# count authors
df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
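# A linked author has a non-null entry in author_usernames.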
df_orig = df_orig.with_columns(
pl.col("author_usernames")
.map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
.alias("n_linked_authors")
)
df_orig = df_orig.with_columns(
pl.struct(["n_linked_authors", "n_authors"])
.map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
.alias("claimed")
)
# format spaces, models, datasets
for repo_id_col, markdown_col, base_url in [
("space_ids", "Spaces", "https://huggingface.co/spaces/"),
("model_ids", "Models", "https://huggingface.co/"),
("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
]:
df_orig = df_orig.with_columns(
pl.col(repo_id_col)
.map_elements(
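            # base_url is bound as a lambda default so each loop iteration
            # keeps its own URL (avoids the late-binding pitfall, flake8 B023)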
            lambda lst, base_url=base_url: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,
return_dtype=pl.Utf8,
)
.fill_null("")
.alias(markdown_col)
)