Spaces:
Running
Running
import streamlit as st | |
from huggingface_hub import HfApi | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from datetime import datetime | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from functools import lru_cache | |
import time | |
# Configure the page (title and wide layout), then create the Hugging Face Hub
# client that the fetch helpers in this file use for every request.
st.set_page_config(layout="wide", page_title="HF Contributions")
api = HfApi()
# Cache for API responses | |
@lru_cache(maxsize=1024)
def cached_repo_info(repo_id, repo_type):
    """Return Hub metadata for one repository, memoized per argument pair.

    Args:
        repo_id: Repository identifier passed to ``api.repo_info``.
        repo_type: Repository kind passed to ``api.repo_info``
            (the callers in this file use "model", "dataset" or "space").

    Returns:
        Whatever ``api.repo_info`` returns for the repository. Repeated calls
        with the same arguments reuse the first result instead of hitting the
        API again; calls that raise are not cached.
    """
    return api.repo_info(repo_id=repo_id, repo_type=repo_type)
@lru_cache(maxsize=1024)
def cached_list_commits(repo_id, repo_type):
    """Return the commits of one repository as a list, memoized per argument pair.

    Args:
        repo_id: Repository identifier passed to ``api.list_repo_commits``.
        repo_type: Repository kind passed to ``api.list_repo_commits``.

    Returns:
        A list built from ``api.list_repo_commits``. The same list object is
        handed back on repeated calls, so callers must treat it as read-only.
        Calls that raise are not cached.
    """
    return list(api.list_repo_commits(repo_id=repo_id, repo_type=repo_type))
@lru_cache(maxsize=256)
def cached_list_items(username, kind):
    """List the repositories of one kind authored by ``username``, memoized.

    Args:
        username: Author (user or organization) whose repositories are listed.
        kind: One of "model", "dataset" or "space".

    Returns:
        A list of the items yielded by the matching ``api.list_*`` call, or an
        empty list for an unrecognized ``kind``. The same list object is
        returned on repeated calls with the same arguments, so callers must
        treat it as read-only. Calls that raise are not cached.
    """
    listers = {
        "model": api.list_models,
        "dataset": api.list_datasets,
        "space": api.list_spaces,
    }
    lister = listers.get(kind)
    if lister is None:
        return []
    return list(lister(author=username))
# Rate limiting | |
class RateLimiter:
    """Blocking limiter that spaces calls at least ``1 / calls_per_second`` apart.

    A single instance is shared by the worker threads that fetch commits, so
    the check-sleep-update sequence in ``wait`` runs under a lock; without it
    several threads could read the same ``last_call`` and all proceed at once.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, calls_per_second=10):
        # Function-scope import keeps the class self-contained without
        # touching the file-level import block.
        import threading

        # Maximum number of permitted calls per second.
        self.calls_per_second = calls_per_second
        # Monotonic-clock timestamp of the most recent permitted call;
        # -inf means "never called", so the first wait() returns immediately.
        self.last_call = float("-inf")
        # Serializes wait() across threads.
        self._lock = threading.Lock()

    def wait(self):
        """Block until at least one interval has passed since the last call.

        Raises:
            ZeroDivisionError: if ``calls_per_second`` is 0.
        """
        min_interval = 1.0 / self.calls_per_second
        # Sleeping while holding the lock is intentional: queued callers are
        # released one interval apart instead of all together.
        with self._lock:
            # monotonic() is immune to wall-clock adjustments, which could
            # otherwise produce a negative elapsed time and a very long sleep.
            elapsed = time.monotonic() - self.last_call
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            self.last_call = time.monotonic()


rate_limiter = RateLimiter()
# Function to fetch commits for a repository (optimized) | |
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    """Collect the dates of a repository's commits made in ``selected_year``.

    Args:
        repo_id: Repository identifier.
        repo_type: Repository kind ("model", "dataset" or "space").
        username: Unused; kept so existing callers keep working.
        selected_year: Calendar year (int) to keep commits from.

    Returns:
        A ``(commit_dates, commit_count)`` tuple: one ``datetime.date`` per
        matching commit, and the length of that list. Private or gated
        repositories, and any failure while talking to the Hub, yield
        ``([], 0)``.
    """
    try:
        rate_limiter.wait()

        # Skip private/gated repos upfront.
        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or getattr(repo_info, "gated", False):
            return [], 0

        # The commit listing already contains the repository's initial commit,
        # so the creation date is not added separately (doing so counted that
        # commit twice).
        commit_dates = []
        for commit in cached_list_commits(repo_id, repo_type):
            # Drop timezone info before reducing the timestamp to a date.
            commit_date = pd.to_datetime(commit.created_at).tz_localize(None).date()
            if commit_date.year == selected_year:
                commit_dates.append(commit_date)
        return commit_dates, len(commit_dates)
    except Exception:
        # Deliberate best-effort: this runs in worker threads, and one
        # unreadable repository must not abort the whole scan.
        return [], 0
# Function to get commit events for a user (optimized) | |
def get_commit_events(username, kind=None, selected_year=None):
    """Gather the distinct commit dates of a user's repositories.

    Args:
        username: Author (user or organization) whose repositories are scanned.
        kind: "model", "dataset" or "space" to scan a single kind; a falsy
            value scans all three.
        selected_year: Calendar year passed through to
            ``fetch_commits_for_repo``.

    Returns:
        A ``(df, items_with_type)`` tuple. ``df`` has a single "date" column
        with one row per distinct commit date. ``items_with_type`` is a list
        of ``(item, kind)`` pairs for every repository that was listed.

    A failure while processing one kind is reported with ``st.warning`` and
    the remaining kinds are still processed.
    """
    commit_dates = []
    items_with_type = []
    kinds = [kind] if kind else ["model", "dataset", "space"]

    for k in kinds:
        try:
            items = cached_list_items(username, k)
            items_with_type.extend((item, k) for item in items)
            if not items:
                continue

            # One pool of at most 5 workers for the whole kind: same
            # concurrency cap as before, but a slow repository no longer
            # holds up the start of the next batch of five.
            with ThreadPoolExecutor(max_workers=min(5, len(items))) as executor:
                futures = [
                    executor.submit(fetch_commits_for_repo, item.id, k, username, selected_year)
                    for item in items
                ]
                for future in as_completed(futures):
                    repo_commits, _ = future.result()
                    commit_dates.extend(repo_commits)
        except Exception as e:
            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

    # One row per distinct date (duplicates are removed on purpose).
    df = pd.DataFrame(commit_dates, columns=["date"]).drop_duplicates()
    return df, items_with_type
# Calendar heatmap function (optimized) | |
def _calendar_grid(df, year):
    """Return a weekday-by-week grid of daily row counts for ``year``.

    Rows are weekdays (0 = Monday ... 6 = Sunday) and columns are
    Monday-aligned week numbers counted from the week containing January 1.
    Cells that fall outside the year are NaN. ``df`` is not modified.
    """
    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")
    all_days = pd.date_range(start=start, end=end)

    # Rows per day; days without rows get 0, dates outside the year are dropped.
    per_day = pd.to_datetime(df["date"]).value_counts().reindex(all_days, fill_value=0)

    # Shifting by the weekday of January 1 makes every column a Monday-to-Sunday
    # week, so each column reads top-to-bottom in calendar order.
    offset = start.dayofweek
    grid = pd.DataFrame({
        "dow": all_days.dayofweek,
        "week": (all_days.dayofyear - 1 + offset) // 7,
        "count": per_day.to_numpy(),
    })
    return grid.pivot(index="dow", columns="week", values="count")


def make_calendar_heatmap(df, title, year):
    """Render a GitHub-style calendar heatmap of commits per day in Streamlit.

    Args:
        df: DataFrame with a "date" column holding one row per commit. It is
            left unmodified.
        title: Plot title; its lower-cased form is used in the "nothing
            found" message.
        year: Calendar year to display.

    Returns:
        None. Shows an info message instead of a plot when ``df`` is empty.
    """
    if df.empty:
        st.info(f"No {title.lower()} found for {year}.")
        return

    pivot = _calendar_grid(df, year)

    # Month labels sit on the week column that contains the first of the month.
    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")
    month_starts = pd.date_range(start, end, freq="MS")
    month_labels = month_starts.strftime("%b")
    month_positions = list((month_starts.dayofyear - 1 + start.dayofweek) // 7)

    # Colour bins: 0, 1-2, 3-10, 11-30 and 31+ commits per day.
    from matplotlib.colors import BoundaryNorm, ListedColormap
    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']  # GitHub-style green colors
    bounds = [0, 1, 3, 11, 31, float('inf')]
    cmap = ListedColormap(colors)
    norm = BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots(figsize=(12, 1.2))
    sns.heatmap(
        # Integer counts keep the bin boundaries exact.
        pivot.fillna(0).astype(int),
        # Leave cells outside the year blank rather than drawing them as zero-commit days.
        mask=pivot.isna(),
        ax=ax,
        cmap=cmap,
        norm=norm,
        linewidths=0.5,
        linecolor="white",
        square=True,
        cbar=False,
        yticklabels=["M", "T", "W", "T", "F", "S", "S"],
    )
    ax.set_title(f"{title}", fontsize=12, pad=10)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks(month_positions)
    ax.set_xticklabels(month_labels, fontsize=8)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
    st.pyplot(fig)
    # Release the figure; otherwise every call leaves another open figure behind.
    plt.close(fig)
# Sidebar | |
# Sidebar controls: choose whose activity to show and for which year.
# Defines the module-level names `username` and `selected_year`.
with st.sidebar:
    st.title("🤗 Contributor")
    username = st.selectbox(
        "Select or type a username",
        options=["Gyaneshere", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"],
        index=0,
    )
    st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
    # A non-empty label is kept for accessibility but hidden from view
    # (an empty label string triggers a Streamlit warning).
    custom = st.text_input(
        "Custom username or organization",
        placeholder="Enter custom username/org",
        label_visibility="collapsed",
    )
    # A typed name takes precedence over the dropdown selection.
    custom = custom.strip()
    if custom:
        username = custom
    # Current year first, going back to 2018.
    year_options = list(range(datetime.now().year, 2017, -1))
    selected_year = st.selectbox("🗓️ Year", options=year_options)
# Main Content | |
st.title("🤗 Hugging Face Contributions")
if username:
    # Per-kind results filled in by the fetch loop below.
    commits_by_type = {}        # kind -> list of commit dates (one per commit)
    commit_counts_by_type = {}  # kind -> number of commits in the selected year
    repo_totals_by_type = {}    # kind -> number of repositories listed

    with st.spinner("Fetching commit data..."):
        for kind in ["model", "dataset", "space"]:
            kind_dates = []
            kind_count = 0
            repo_totals_by_type[kind] = 0
            try:
                items = cached_list_items(username, kind)
                # Remember the repo count now so the columns below do not
                # have to list the repositories a second time.
                repo_totals_by_type[kind] = len(items)
                if items:
                    # One pool of at most 5 workers per kind keeps the same
                    # concurrency cap without pausing between batches.
                    with ThreadPoolExecutor(max_workers=min(5, len(items))) as executor:
                        futures = [
                            executor.submit(fetch_commits_for_repo, item.id, kind, username, selected_year)
                            for item in items
                        ]
                        for future in as_completed(futures):
                            repo_commits, repo_count = future.result()
                            if repo_commits:
                                kind_dates.extend(repo_commits)
                                kind_count += repo_count
            except Exception as e:
                st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
                # Discard partial results for this kind.
                kind_dates = []
                kind_count = 0
            commits_by_type[kind] = kind_dates
            commit_counts_by_type[kind] = kind_count

    # Overall summary across all repository kinds.
    total_commits = sum(commit_counts_by_type.values())
    st.subheader(f"{username}'s Activity in {selected_year}")
    st.metric("Total Commits", total_commits)

    # Keep one row per commit (no de-duplication): the heatmap colours days
    # by how many commits they have, and collapsing duplicates would cap
    # every active day at a count of 1.
    all_commits = [day for dates in commits_by_type.values() for day in dates]
    all_df = pd.DataFrame(all_commits, columns=["date"])
    make_calendar_heatmap(all_df, "All Commits", selected_year)

    # Metrics and heatmaps for each type.
    col1, col2, col3 = st.columns(3)
    for col, kind, emoji, label in [
        (col1, "model", "🧠", "Models"),
        (col2, "dataset", "📦", "Datasets"),
        (col3, "space", "🚀", "Spaces"),
    ]:
        with col:
            try:
                total = repo_totals_by_type.get(kind, 0)
                commit_count = commit_counts_by_type.get(kind, 0)
                df_kind = pd.DataFrame(commits_by_type.get(kind, []), columns=["date"])
                st.metric(f"{emoji} {label}", total)
                st.metric(f"Commits in {selected_year}", commit_count)
                make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
            except Exception as e:
                st.warning(f"Error processing {label}: {str(e)}")
                st.metric(f"{emoji} {label}", 0)
                st.metric(f"Commits in {selected_year}", 0)
                make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)