import datasets
import polars as pl
from loguru import logger
from polars import datatypes as pdt
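
# Build the CVPR 2025 paper table: load the base dataset, join Hugging Face paper-page
# metadata, overlay manual patches, and derive markdown display columns (links, author
# claim stats, linked Spaces / Models / Datasets).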

BASE_REPO_ID = "ai-conferences/CVPR2025"
PATCH_REPO_ID = "ai-conferences/CVPR2025-patches"
PAPER_PAGE_REPO_ID = "hysts-bot-data/paper-pages-slim"


def get_patch_latest_values(
    df: pl.DataFrame, all_columns: list[str], id_col: str, timestamp_col: str = "timestamp", delimiter: str = ","
) -> pl.DataFrame:
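    """Collapse a patch log to one row per ``id_col`` with the latest non-null value of each column.

    List-typed columns are temporarily joined into ``delimiter``-separated strings so that
    every value column shares a dtype for the unpivot/pivot round trip, then split back
    into lists afterwards. Columns in ``all_columns`` that were never patched are added as
    nulls so the result can be joined and selected uniformly against the base table.

    Illustrative example: rows (paper_id=1, timestamp=t1, github="a") and
    (paper_id=1, timestamp=t2, github="b") collapse to (paper_id=1, github="b").
    """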
    df = df.sort(timestamp_col)

    # Join list-typed columns into delimited strings so every value column shares a
    # dtype for the unpivot/pivot round trip below.
    list_cols = [
        col for col, dtype in df.schema.items() if col not in (id_col, timestamp_col) and dtype.base_type() is pdt.List
    ]
    df = df.with_columns(
        [
            pl.when(pl.col(c).is_not_null()).then(pl.col(c).list.join(delimiter)).otherwise(None).alias(c)
            for c in list_cols
        ]
    )

    # Melt to long form: one row per (paper, field) with a non-null patched value.
    update_columns = [col for col in df.columns if col not in (id_col, timestamp_col)]
    melted = df.unpivot(on=update_columns, index=[timestamp_col, id_col]).drop_nulls()

    # Keep only the most recent value per (paper, field), then pivot back to wide form.
    latest_rows = (
        melted.sort(timestamp_col)
        .group_by([id_col, "variable"])
        .agg(pl.col("value").last())
        .pivot("variable", index=id_col, values="value")
    )

    # Split the delimited strings back into lists for the columns that were list-typed.
    latest_rows = latest_rows.with_columns(
        [
            pl.when(pl.col(c).is_not_null()).then(pl.col(c).str.split(delimiter)).otherwise(None).alias(c)
            for c in list_cols
        ]
    )

    # Add any expected-but-never-patched columns as nulls and return them in the caller's order.
    missing_cols = [c for c in all_columns if c not in latest_rows.columns and c != id_col]
    if missing_cols:
        latest_rows = latest_rows.with_columns([pl.lit(None).alias(c) for c in missing_cols])

    return latest_rows.select([id_col] + [col for col in all_columns if col != id_col])


def format_author_claim_ratio(row: dict) -> str:
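    """Render "<n_linked>/<n_authors> ✅" (the check mark appears only when at least one
    author is linked); returns an empty string when either count is missing."""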
    n_linked_authors = row["n_linked_authors"]
    n_authors = row["n_authors"]

    if n_linked_authors is None or n_authors is None:
        return ""

    author_linked = "✅" if n_linked_authors > 0 else ""
    return f"{n_linked_authors}/{n_authors} {author_linked}".strip()


# Base conference data; the space/model/dataset ID columns start empty and can be filled
# in by patches further down.
df_orig = (
    datasets.load_dataset(BASE_REPO_ID, split="train")
    .to_polars()
    .rename({"cvf_url": "cvf"})
    .with_columns(
        pl.lit([], dtype=pl.List(pl.Utf8)).alias(col_name) for col_name in ["space_ids", "model_ids", "dataset_ids"]
    )
)

# Hugging Face paper-page metadata, joined in by arXiv ID (provides author_usernames, etc.).
df_paper_page = (
    datasets.load_dataset(PAPER_PAGE_REPO_ID, split="train")
    .to_polars()
    .drop(["summary", "author_names", "ai_keywords"])
)
df_orig = df_orig.join(df_paper_page, on="arxiv_id", how="left")

# Overlay manual patches on the merged table. The patch repo may be missing or may fail
# to load; in that case a warning is logged and the unpatched data is used as-is.
try:
    df_patches = (
        datasets.load_dataset(PATCH_REPO_ID, split="train")
        .to_polars()
        .drop("diff")
        .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, "%+"))
    )
    df_patches = get_patch_latest_values(df_patches, df_orig.columns, id_col="paper_id", timestamp_col="timestamp")
    df_orig = (
        df_orig.join(df_patches, on="paper_id", how="left")
        .with_columns(
            [
                # the left join suffixed patch columns with "_right"; prefer the patched
                # value and fall back to the original when no patch exists
                pl.coalesce([pl.col(col + "_right"), pl.col(col)]).alias(col)
                for col in df_orig.columns
                if col != "paper_id"
            ]
        )
        .select(df_orig.columns)
    )
except Exception as e:  # noqa: BLE001
    logger.warning(e)

# format authors
df_orig = df_orig.with_columns(pl.col("authors").list.join(", ").alias("authors_str"))
# format links
df_orig = df_orig.with_columns(
    [
        pl.format("[link]({})", pl.col(col)).fill_null("").alias(f"{col}_md")
        for col in ["cvf", "project_page", "github"]
    ]
)
# format paper page link
df_orig = df_orig.with_columns(
    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))

# count authors, count linked (claimed) authors, and build the "claimed" ratio column
df_orig = df_orig.with_columns(pl.col("authors").list.len().alias("n_authors"))
df_orig = df_orig.with_columns(
    pl.col("author_usernames")
    .map_elements(lambda lst: sum(x is not None for x in lst) if lst is not None else None, return_dtype=pl.Int64)
    .alias("n_linked_authors")
)
df_orig = df_orig.with_columns(
    pl.struct(["n_linked_authors", "n_authors"])
    .map_elements(format_author_claim_ratio, return_dtype=pl.Utf8)
    .alias("claimed")
)

# format linked spaces, models, and datasets as newline-separated markdown links
for repo_id_col, markdown_col, base_url in [
    ("space_ids", "Spaces", "https://huggingface.co/spaces/"),
    ("model_ids", "Models", "https://huggingface.co/"),
    ("dataset_ids", "Datasets", "https://huggingface.co/datasets/"),
]:
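    # `base_url` is consumed immediately within this loop iteration (with_columns on an
    # eager DataFrame evaluates at once), so the suppressed B023 late-binding warning is
    # a false positive here.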
    df_orig = df_orig.with_columns(
        pl.col(repo_id_col)
        .map_elements(
            lambda lst: "\n".join([f"[link]({base_url}{x})" for x in lst]) if lst is not None else None,  # noqa: B023
            return_dtype=pl.Utf8,
        )
        .fill_null("")
        .alias(markdown_col)
    )