|
import streamlit as st |
|
from huggingface_hub import HfApi |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from datetime import datetime |
|
from concurrent.futures import ThreadPoolExecutor, as_completed |
|
from functools import lru_cache |
|
import time |
|
import requests |
|
from collections import Counter |
|
|
|
# Page-level configuration: title, wide layout, and an initially open sidebar.
# NOTE(review): Streamlit conventionally expects set_page_config to be the
# first Streamlit command in the script - keep it ahead of other st.* calls.
st.set_page_config(page_title="HF Contributions", layout="wide", initial_sidebar_state="expanded")

# Inject CSS that pins the sidebar to 40% of the viewport width.
st.markdown("""
<style>
[data-testid="stSidebar"] {
min-width: 40vw !important;
max-width: 40vw !important;
}
</style>
""", unsafe_allow_html=True)

# Shared Hugging Face Hub client used by the cached lookup helpers.
api = HfApi()
|
|
|
|
|
|
|
@lru_cache(maxsize=1000)
def cached_repo_info(repo_id, repo_type):
    """Return the Hub metadata for one repository.

    Results are memoised per ``(repo_id, repo_type)`` pair (up to 1000
    entries), so both arguments must be hashable.
    """
    info = api.repo_info(repo_id=repo_id, repo_type=repo_type)
    return info
|
|
|
|
|
@lru_cache(maxsize=1000)
def cached_list_commits(repo_id, repo_type):
    """Return the commits of one repository as a list.

    The result of ``api.list_repo_commits`` is materialised into a list and
    memoised per ``(repo_id, repo_type)`` pair (up to 1000 entries). The
    cached list object is shared between callers, so treat it as read-only.
    """
    commits = api.list_repo_commits(repo_id=repo_id, repo_type=repo_type)
    return list(commits)
|
|
|
|
|
@lru_cache(maxsize=100)
def cached_list_items(username, kind):
    """Return every repository of type *kind* authored by *username*.

    *kind* is one of ``"model"``, ``"dataset"`` or ``"space"``; any other
    value yields an empty list. Results are memoised per
    ``(username, kind)`` pair (up to 100 entries).
    """
    # Map each supported kind to the HfApi method that lists it.
    lister_names = {
        "model": "list_models",
        "dataset": "list_datasets",
        "space": "list_spaces",
    }
    method_name = lister_names.get(kind)
    if method_name is None:
        return []
    lister = getattr(api, method_name)
    return list(lister(author=username))
|
|
|
|
|
|
|
def _owner_of(item):
    """Return the namespace owning a Hub listing entry, or ``""`` if unknown.

    The owner is the part of ``item["id"]`` before the first ``/``. Entries
    whose id carries no namespace fall back to their ``owner`` field.
    A missing or null ``id``/``owner`` is treated as an empty string.
    """
    repo_id = item.get("id") or ""
    if "/" in repo_id:
        return repo_id.split("/", 1)[0]
    return item.get("owner") or ""


def _rank_owners(items, limit):
    """Return up to *limit* ``(owner, repo_count)`` pairs, most repos first.

    Owners with equal counts keep the order in which they were first seen.
    """
    counts = Counter()
    for item in items:
        owner = _owner_of(item)
        # Skip entries without a usable owner: an empty/None owner (or the
        # literal string "None") would otherwise show up as a contributor.
        if owner and owner != "None":
            counts[owner] += 1
    return counts.most_common(limit)


def _combine_rankings(rankings, limit):
    """Merge several ordered owner lists into one list of at most *limit* owners.

    An owner at zero-based position ``i`` of a ranking earns ``limit - i``
    points; points are summed across rankings and owners are returned by
    descending total. Ties keep first-seen order.
    """
    scores = Counter()
    for ranking in rankings:
        for position, owner in enumerate(ranking):
            scores[owner] += limit - position
    return [owner for owner, _ in scores.most_common(limit)]


@lru_cache(maxsize=1)
def get_trending_accounts(limit=100):
    """Rank Hub accounts by how many listed spaces and models they own.

    Queries the public ``/api/spaces`` and ``/api/models`` endpoints, counts
    repositories per owner, and combines both rankings into an overall one.

    Args:
        limit: Maximum number of owners kept in each ranking.

    Returns:
        A tuple ``(trending_authors, spaces_ranking, models_ranking)`` where
        ``trending_authors`` is a list of owner names and the two rankings
        are lists of ``(owner, repo_count)`` pairs. A listing whose request
        does not return HTTP 200 contributes an empty ranking. If anything
        raises, an error is shown via ``st.error`` and a hard-coded fallback
        list is returned with zero counts.

    NOTE(review): because of ``lru_cache``, the fallback result is cached
    just like a successful one for the lifetime of this function object.
    """
    kinds = ("spaces", "models")
    try:
        # Issue both requests up front, then process whichever succeeded.
        responses = {
            kind: requests.get(
                f"https://huggingface.co/api/{kind}",
                params={"limit": 10000},
                timeout=30,
            )
            for kind in kinds
        }

        ranked = {}
        for kind, response in responses.items():
            if response.status_code == 200:
                ranked[kind] = _rank_owners(response.json(), limit)
            else:
                ranked[kind] = []

        trending_authors = _combine_rankings(
            [[owner for owner, _ in ranked[kind]] for kind in kinds],
            limit,
        )
        return trending_authors, ranked["spaces"], ranked["models"]
    except Exception as e:
        st.error(f"Error fetching trending accounts: {str(e)}")
        fallback_authors = ["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"]
        return fallback_authors, [(author, 0) for author in fallback_authors], [(author, 0) for author in fallback_authors]
|
|
|
|
|
|
|
class RateLimiter:
    """Blocking throttle that spaces calls at least ``1 / calls_per_second`` apart.

    ``wait()`` is called from worker threads in this file, so the timestamp
    bookkeeping is guarded by a lock: concurrent callers are released one at
    a time, each a full interval after the previous one.
    """

    def __init__(self, calls_per_second=10):
        # Local import, in line with the function-scope imports used
        # elsewhere in this file.
        import threading

        self.calls_per_second = calls_per_second
        # Monotonic timestamp of the most recent permitted call; -inf means
        # "never called", so the first wait() returns immediately.
        self.last_call = float("-inf")
        self._lock = threading.Lock()

    def wait(self):
        """Sleep as long as needed to respect the rate, then record the call."""
        min_interval = 1.0 / self.calls_per_second
        # Sleeping while holding the lock is intentional: it serialises
        # callers so each one observes the previous caller's timestamp.
        with self._lock:
            # monotonic() is immune to wall-clock adjustments.
            remaining = min_interval - (time.monotonic() - self.last_call)
            if remaining > 0:
                time.sleep(remaining)
            self.last_call = time.monotonic()


# Module-wide limiter shared by all Hub fetches.
rate_limiter = RateLimiter()
|
|
|
|
|
|
|
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    """Collect the dates of one repository's activity within *selected_year*.

    The repository's creation date is counted as an event, followed by the
    date of every commit returned by ``cached_list_commits``.

    Args:
        repo_id: Repository identifier passed to the Hub lookups.
        repo_type: Repository kind passed to the Hub lookups.
        username: Unused here; kept so existing call sites stay valid.
        selected_year: Calendar year (int) to keep events for.

    Returns:
        ``(dates, count)`` where ``dates`` is a list of ``datetime.date`` and
        ``count == len(dates)``. Private or gated repositories, and any
        failure while fetching, yield ``([], 0)``.
    """

    def to_naive_date(value):
        # Drop timezone information and keep only the calendar date.
        return pd.to_datetime(value).tz_localize(None).date()

    try:
        rate_limiter.wait()

        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or getattr(repo_info, "gated", False):
            # Same shape as every other return path (count is an int).
            return [], 0

        commit_dates = []

        # A missing creation timestamp must not discard the commit history.
        created_at = repo_info.created_at
        if created_at is not None:
            created_on = to_naive_date(created_at)
            if created_on.year == selected_year:
                commit_dates.append(created_on)

        for commit in cached_list_commits(repo_id, repo_type):
            committed_on = to_naive_date(commit.created_at)
            if committed_on.year == selected_year:
                commit_dates.append(committed_on)

        return commit_dates, len(commit_dates)
    except Exception:
        # Best effort: one unreadable repository must not abort the scan.
        return [], 0
|
|
|
|
|
|
|
def get_commit_events(username, kind=None, selected_year=None):
    """Gather commit dates for all of *username*'s repositories.

    Args:
        username: Account whose repositories are scanned.
        kind: ``"model"``, ``"dataset"`` or ``"space"`` to scan one type;
            a falsy value scans all three.
        selected_year: Year forwarded to ``fetch_commits_for_repo``.

    Returns:
        ``(df, items_with_type)``: a DataFrame with a single ``date`` column
        holding the distinct commit dates, and a list of
        ``(item, kind)`` pairs for every repository listed. A failure while
        handling one kind is reported with ``st.warning`` and the remaining
        kinds are still processed.
    """
    collected_dates = []
    typed_items = []
    batch_size = 5

    for k in ([kind] if kind else ["model", "dataset", "space"]):
        try:
            listed = cached_list_items(username, k)
            typed_items.extend((entry, k) for entry in listed)
            ids = [entry.id for entry in listed]

            # Fetch in batches of five, one short-lived thread pool per batch.
            for offset in range(0, len(ids), batch_size):
                batch = ids[offset:offset + batch_size]
                with ThreadPoolExecutor(max_workers=min(5, len(batch))) as pool:
                    pending = {
                        pool.submit(fetch_commits_for_repo, rid, k, username, selected_year): rid
                        for rid in batch
                    }
                    for finished in as_completed(pending):
                        dates, _count = finished.result()
                        if dates:
                            collected_dates.extend(dates)
        except Exception as e:
            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

    frame = pd.DataFrame(collected_dates, columns=["date"])
    # NOTE(review): de-duplicating keeps one row per active day, so any
    # per-day count derived from this frame is at most 1 - confirm intended.
    if not frame.empty:
        frame = frame.drop_duplicates()
    return frame, typed_items
|
|
|
|
|
|
|
def _calendar_grid(df, year):
    """Lay out per-day event counts of *year* on a weekday x week grid.

    Args:
        df: DataFrame with a ``date`` column; one row per event. It is not
            modified.
        year: Calendar year to lay out.

    Returns:
        ``(grid, month_labels, month_positions)``. ``grid`` is a DataFrame
        indexed by day of week (0 = Monday .. 6 = Sunday) with one column per
        calendar week; cells hold the number of events on that day and are
        NaN for slots that fall outside the year. ``month_labels`` are the
        abbreviated month names and ``month_positions`` the week column in
        which each month starts.
    """
    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")
    all_days = pd.date_range(start=start, end=end)

    # Events per calendar day; days without events (and only days inside
    # the year) are kept via the reindex.
    per_day = pd.to_datetime(df["date"]).dt.normalize().value_counts()
    counts = per_day.reindex(all_days, fill_value=0)

    # Shift by the weekday of Jan 1 so every column is one Monday-to-Sunday
    # week instead of a 7-day window counted from Jan 1.
    first_dow = start.dayofweek
    calendar = pd.DataFrame({
        "dow": all_days.dayofweek.to_numpy(),
        "week": (((all_days - start).days + first_dow) // 7).to_numpy(),
        "count": counts.to_numpy(),
    })
    grid = calendar.pivot(index="dow", columns="week", values="count").reindex(range(7))

    month_starts = pd.date_range(start, end, freq="MS")
    month_labels = list(month_starts.strftime("%b"))
    month_positions = [int(pos) for pos in ((month_starts - start).days + first_dow) // 7]
    return grid, month_labels, month_positions


def make_calendar_heatmap(df, title, year):
    """Render a contribution-calendar style heatmap into the Streamlit page.

    Args:
        df: DataFrame with a ``date`` column, one row per commit event. An
            empty frame shows an informational message instead of a chart.
        title: Chart title (also used, lower-cased, in the empty message).
        year: Calendar year to display.
    """
    if df.empty:
        st.info(f"No {title.lower()} found for {year}.")
        return

    grid, month_labels, month_positions = _calendar_grid(df, year)

    from matplotlib.colors import ListedColormap, BoundaryNorm
    # Five colour buckets: 0, 1-2, 3-10, 11-30 and 31+ events per day.
    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']
    bounds = [0, 1, 3, 11, 31, float('inf')]
    cmap = ListedColormap(colors)
    norm = BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots(figsize=(12, 1.2))

    # Slots outside the year (before Jan 1 / after Dec 31) are masked out
    # rather than drawn as zero-activity days.
    outside_year = grid.isna()
    sns.heatmap(grid.fillna(0).astype(int), mask=outside_year, ax=ax, cmap=cmap, norm=norm,
                linewidths=0.5, linecolor="white", square=True, cbar=False,
                yticklabels=["M", "T", "W", "T", "F", "S", "S"])

    ax.set_title(f"{title}", fontsize=12, pad=10)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks(month_positions)
    ax.set_xticklabels(month_labels, fontsize=8)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
    st.pyplot(fig)
    # Release the figure; otherwise open figures pile up in pyplot's registry.
    plt.close(fig)
|
|
|
|
|
|
|
@lru_cache(maxsize=100)
def fetch_follower_data(username):
    """Return the follower history for *username*.

    Args:
        username: Account name; it is percent-encoded before being placed in
            the request path because it can come from free-text user input.

    Returns:
        The decoded JSON body on HTTP 200. On any other status, a list of 12
        synthetic monthly ``{"date": "YYYY-MM-DD", "followers": int}``
        records ending at the current month. On an exception, an error is
        shown via ``st.error`` and ``[]`` is returned. Results (including
        the synthetic ones) are memoised per username.
    """
    try:
        from urllib.parse import quote

        # Encode the name so characters such as "/", "?" or "#" cannot
        # change which endpoint is requested.
        safe_name = quote(str(username), safe="")
        url = f"https://huggingface.co/api/users/{safe_name}/followers-history"
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            # NOTE(review): this fabricates random, monotonically growing
            # follower numbers that callers cannot tell apart from real
            # data - confirm this placeholder is acceptable to display.
            import random
            from dateutil.relativedelta import relativedelta

            today = datetime.now()
            data = []
            followers = random.randint(10, 100)

            for i in range(12):
                date = today - relativedelta(months=11-i)
                followers += random.randint(0, 10)
                data.append({
                    "date": date.strftime("%Y-%m-%d"),
                    "followers": followers
                })

            return data

        return response.json()
    except Exception as e:
        st.error(f"Error fetching follower data: {str(e)}")
        return []
|
|
|
|
|
|
|
def _follower_series(follower_data):
    """Split follower records into parallel ``(dates, followers)`` lists.

    Only dict entries carrying both a ``date`` and a ``followers`` key are
    kept; anything else (including a payload that is not a list or tuple) is
    ignored.
    """
    dates, followers = [], []
    if isinstance(follower_data, (list, tuple)):
        for item in follower_data:
            if isinstance(item, dict) and "date" in item and "followers" in item:
                dates.append(item["date"])
                followers.append(item["followers"])
    return dates, followers


def render_follower_chart(username):
    """Plot the follower history of *username* into the Streamlit page.

    Shows an informational message instead of a chart when no usable
    follower records are available.
    """
    follower_data = fetch_follower_data(username)
    dates, followers = _follower_series(follower_data)

    # Covers both "nothing returned" and "payload in an unexpected shape".
    if not dates:
        st.info(f"No follower data available for {username}")
        return

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(dates, followers, marker='o', linestyle='-', color='#60A5FA')

    ax.set_title(f"Follower Evolution for {username}", fontsize=16)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Followers", fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)

    # Work on this figure's own axes rather than pyplot's "current" state.
    ax.tick_params(axis="x", rotation=45)
    fig.tight_layout()

    st.pyplot(fig)
    # Release the figure; otherwise open figures pile up in pyplot's registry.
    plt.close(fig)
|
|
|
|
|
|
|
# Load the rankings once per run; the sidebar and main view both read them.
with st.spinner("Loading trending accounts..."):
    trending_accounts, top_owners_spaces, top_owners_models = get_trending_accounts(limit=100)

# Sidebar: leaderboards, contributor selection, year and display options.
with st.sidebar:
    st.title("๐ค Contributor")

    tab1, tab2 = st.tabs([
        "Top 100 Overall Contributors",
        "Top 100 by Spaces & Models"
    ])

    with tab1:
        st.subheader("๐ฅ Top 100 Overall Contributors")
        st.markdown("### Combined Contributors Ranking")

        if trending_accounts:
            # 1-based position of each owner in the per-type rankings.
            spaces_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_spaces)}
            models_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_models)}

            # "-" marks contributors absent from a per-type ranking. The loop
            # variable is deliberately not called `username`, which is the
            # selected contributor assigned further down.
            overall_data = [
                [
                    contributor,
                    str(spaces_rank.get(contributor, "-")),
                    str(models_rank.get(contributor, "-")),
                ]
                for contributor in trending_accounts[:100]
            ]

            ranking_data_overall = pd.DataFrame(
                overall_data,
                columns=["Contributor", "Spaces Rank", "Models Rank"]
            )
            # Show ranks starting at 1 instead of the default 0-based index.
            ranking_data_overall.index = ranking_data_overall.index + 1

            st.dataframe(
                ranking_data_overall,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Spaces Rank": st.column_config.TextColumn("Spaces Rank (top 100)"),
                    "Models Rank": st.column_config.TextColumn("Models Rank (top 100)")
                },
                use_container_width=True,
                hide_index=False
            )

    with tab2:
        st.subheader("๐ Top 100 by Spaces & Models")

        st.markdown("### Spaces Contributors Ranking")

        if top_owners_spaces:
            ranking_data_spaces = pd.DataFrame(top_owners_spaces[:100], columns=["Contributor", "Spaces Count"])
            ranking_data_spaces.index = ranking_data_spaces.index + 1

            # NOTE(review): the column label says "top 500 spaces" while the
            # listing request in get_trending_accounts asks for 10000
            # entries - confirm which figure is right.
            st.dataframe(
                ranking_data_spaces,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Spaces Count": st.column_config.NumberColumn("Spaces Count (based on top 500 spaces)", format="%d")
                },
                use_container_width=True,
                hide_index=False
            )

        with st.expander("View Top 30 Spaces Contributors Chart"):
            if top_owners_spaces:
                chart_data = pd.DataFrame(top_owners_spaces[:30], columns=["Owner", "Spaces Count"])

                fig, ax = plt.subplots(figsize=(10, 8))
                bars = ax.barh(chart_data["Owner"], chart_data["Spaces Count"])

                # Colour the bars along the viridis gradient.
                for i, bar in enumerate(bars):
                    bar.set_color(plt.cm.viridis(i/len(bars)))

                ax.set_title("Top 30 Contributors by Number of Spaces")
                ax.set_xlabel("Number of Spaces")
                plt.tight_layout()
                st.pyplot(fig)
                # Release the figure so reruns do not accumulate open figures.
                plt.close(fig)

        st.markdown("### Models Contributors Ranking")

        if top_owners_models:
            ranking_data_models = pd.DataFrame(top_owners_models[:100], columns=["Contributor", "Models Count"])
            ranking_data_models.index = ranking_data_models.index + 1

            # NOTE(review): same "top 500" vs. 10000 mismatch as for spaces.
            st.dataframe(
                ranking_data_models,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Models Count": st.column_config.NumberColumn("Models Count (based on top 500 models)", format="%d")
                },
                use_container_width=True,
                hide_index=False
            )

        with st.expander("View Top 30 Models Contributors Chart"):
            if top_owners_models:
                chart_data = pd.DataFrame(top_owners_models[:30], columns=["Owner", "Models Count"])

                fig, ax = plt.subplots(figsize=(10, 8))
                bars = ax.barh(chart_data["Owner"], chart_data["Models Count"])

                # Colour the bars along the plasma gradient.
                for i, bar in enumerate(bars):
                    bar.set_color(plt.cm.plasma(i/len(bars)))

                ax.set_title("Top 30 Contributors by Number of Models")
                ax.set_xlabel("Number of Models")
                plt.tight_layout()
                st.pyplot(fig)
                # Release the figure so reruns do not accumulate open figures.
                plt.close(fig)

    st.subheader("Select Contributor")
    selected_trending = st.selectbox(
        "Select trending account",
        options=trending_accounts[:100],
        index=0 if trending_accounts else None,
        key="trending_selectbox"
    )

    st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
    custom = st.text_input("Enter username/org", label_visibility="collapsed")

    # A typed name wins over the dropdown; "facebook" is the last resort.
    if custom.strip():
        username = custom.strip()
    elif selected_trending:
        username = selected_trending
    else:
        username = "facebook"

    st.subheader("๐๏ธ Time Period")
    # Current year first, going back to 2018.
    year_options = list(range(datetime.now().year, 2017, -1))
    selected_year = st.selectbox("Select Year", options=year_options)

    st.subheader("โ๏ธ Display Options")
    show_models = st.checkbox("Show Models", value=True)
    show_datasets = st.checkbox("Show Datasets", value=True)
    show_spaces = st.checkbox("Show Spaces", value=True)
|
|
|
|
|
# Main view: rankings, commit statistics and heatmaps for the selected account.
st.title("๐ค Hugging Face Contributions")
if username:
    with st.spinner(f"Fetching commit data for {username}..."):
        # Ranking banners, shown only for accounts in the combined top 100.
        if username in trending_accounts[:100]:
            rank = trending_accounts.index(username) + 1
            st.success(f"๐ {username} is ranked #{rank} in the top trending contributors!")

            spaces_rank = None
            for i, (owner, count) in enumerate(top_owners_spaces):
                if owner == username:
                    spaces_rank = i+1
                    st.info(f"๐ Spaces Ranking: #{spaces_rank} with {count} spaces")
                    break

            models_rank = None
            for i, (owner, count) in enumerate(top_owners_models):
                if owner == username:
                    models_rank = i+1
                    st.info(f"๐ง Models Ranking: #{models_rank} with {count} models")
                    break

            combined_info = []
            if spaces_rank and spaces_rank <= 100:
                combined_info.append(f"Spaces: #{spaces_rank}")
            if models_rank and models_rank <= 100:
                combined_info.append(f"Models: #{models_rank}")

            if combined_info:
                st.success(f"Combined Rankings (Top 100): {', '.join(combined_info)}")

        # Per repository type: the commit dates and the commit total.
        commits_by_type = {}
        commit_counts_by_type = {}

        types_to_fetch = []
        if show_models:
            types_to_fetch.append("model")
        if show_datasets:
            types_to_fetch.append("dataset")
        if show_spaces:
            types_to_fetch.append("space")

        if not types_to_fetch:
            st.warning("Please select at least one content type to display (Models, Datasets, or Spaces)")
            st.stop()

        for kind in types_to_fetch:
            try:
                items = cached_list_items(username, kind)
                repo_ids = [item.id for item in items]

                st.info(f"Found {len(repo_ids)} {kind}s for {username}")

                # Fetch in batches of five so the progress bar can advance
                # between batches.
                chunk_size = 5
                total_commits = 0
                all_commit_dates = []

                progress_bar = st.progress(0)
                for i in range(0, len(repo_ids), chunk_size):
                    chunk = repo_ids[i:i + chunk_size]
                    with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
                        future_to_repo = {
                            executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year): repo_id
                            for repo_id in chunk
                        }
                        for future in as_completed(future_to_repo):
                            repo_commits, repo_count = future.result()
                            if repo_commits:
                                all_commit_dates.extend(repo_commits)
                                total_commits += repo_count

                    progress = min(1.0, (i + len(chunk)) / max(1, len(repo_ids)))
                    progress_bar.progress(progress)

                progress_bar.progress(1.0)

                commits_by_type[kind] = all_commit_dates
                commit_counts_by_type[kind] = total_commits

            except Exception as e:
                st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
                commits_by_type[kind] = []
                commit_counts_by_type[kind] = 0

        total_commits = sum(commit_counts_by_type.values())

        st.subheader(f"{username}'s Activity in {selected_year}")

        profile_col1, profile_col2 = st.columns([1, 3])
        with profile_col1:
            try:
                avatar_url = f"https://huggingface.co/avatars/{username}"
                st.image(avatar_url, width=150)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                # are not swallowed.
                st.info("No profile image available")

        with profile_col2:
            st.metric("Total Commits", total_commits)

            # Case-insensitive match against the spaces ranking.
            for owner, count in top_owners_spaces:
                if owner.lower() == username.lower():
                    st.metric("Spaces Count", count)
                    break

            st.markdown(f"[View Profile on Hugging Face](https://huggingface.co/{username})")

        # Keep one row per commit event: make_calendar_heatmap derives the
        # per-day intensity from the number of rows per date, so collapsing
        # duplicate dates would flatten every active day to a count of 1.
        all_commits = [day for dates in commits_by_type.values() for day in dates]
        all_df = pd.DataFrame(all_commits, columns=["date"])

        make_calendar_heatmap(all_df, "All Commits", selected_year)

        st.subheader(f"๐ฅ Follower Evolution for {username}")
        render_follower_chart(username)

        # One column per selected repository type, in selection order.
        cols = st.columns(len(types_to_fetch)) if types_to_fetch else st.columns(1)

        for kind, emoji, label in [
            ("model", "๐ง ", "Models"),
            ("dataset", "๐ฆ", "Datasets"),
            ("space", "๐", "Spaces")
        ]:
            if kind in types_to_fetch:
                with cols[types_to_fetch.index(kind)]:
                    try:
                        total = len(cached_list_items(username, kind))
                        commits = commits_by_type.get(kind, [])
                        commit_count = commit_counts_by_type.get(kind, 0)
                        # One row per event, for the same reason as all_df.
                        df_kind = pd.DataFrame(commits, columns=["date"])
                        st.metric(f"{emoji} {label}", total)
                        st.metric(f"Commits in {selected_year}", commit_count)
                        make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
                    except Exception as e:
                        st.warning(f"Error processing {label}: {str(e)}")
                        st.metric(f"{emoji} {label}", 0)
                        st.metric(f"Commits in {selected_year}", 0)
                        make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
else:
    st.info("Please select an account from the sidebar to view contributions.")