Contributors-Leaderboard

Running

File size: 23,088 Bytes

import streamlit as st
from huggingface_hub import HfApi
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import time
import requests
from collections import Counter

# Page-level Streamlit settings: "HF Contributions" as the page title, wide layout.
st.set_page_config(page_title="HF Contributions", layout="wide")
# Single HfApi client shared by the cached Hub helper functions in this file.
api = HfApi()


# Cache for API responses
@lru_cache(maxsize=1000)
def cached_repo_info(repo_id, repo_type):
    """Return ``api.repo_info`` for one repository, memoized by ``lru_cache``.

    Results are keyed on ``(repo_id, repo_type)``; up to 1000 entries are kept.
    """
    info = api.repo_info(repo_type=repo_type, repo_id=repo_id)
    return info


@lru_cache(maxsize=1000)
def cached_list_commits(repo_id, repo_type):
    """Return the repository's commits as a list, memoized by ``lru_cache``.

    The iterable from ``api.list_repo_commits`` is materialized so the cached
    value can be iterated more than once.
    """
    commits = api.list_repo_commits(repo_type=repo_type, repo_id=repo_id)
    return [*commits]


@lru_cache(maxsize=100)
def cached_list_items(username, kind):
    """List the repositories of ``kind`` authored by ``username`` (memoized).

    ``kind`` is one of ``"model"``, ``"dataset"`` or ``"space"``; any other
    value yields an empty list without calling the Hub.
    """
    # Map each supported kind to the name of the HfApi listing method.
    lister_names = {
        "model": "list_models",
        "dataset": "list_datasets",
        "space": "list_spaces",
    }
    lister_name = lister_names.get(kind)
    if lister_name is None:
        return []
    lister = getattr(api, lister_name)
    return list(lister(author=username))


# Helpers and entry point for fetching trending accounts and ranking stats
def _count_repo_owners(repos):
    """Count repositories per owner in a list of Hub API repo dicts.

    The owner is the part of ``id`` before the first ``/``; when ``id`` has no
    slash the ``owner`` field is used instead. Entries without a usable owner
    (missing, empty, non-string, or the literal string ``"None"``) are skipped
    so they never appear as a contributor in the rankings.
    """
    counts = Counter()
    for repo in repos:
        repo_id = repo.get("id") or ""  # tolerate a missing or null id
        if "/" in repo_id:
            owner = repo_id.split("/", 1)[0]
        else:
            owner = repo.get("owner")

        if not isinstance(owner, str) or not owner or owner == "None":
            continue
        counts[owner] += 1
    return counts


def _top_owners(counts, limit):
    """Return the ``limit`` highest ``(owner, count)`` pairs, ties in first-seen order."""
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:limit]


@lru_cache(maxsize=1)
def get_trending_accounts(limit=100):
    """Rank Hub accounts by how many spaces and models they own.

    Requests the spaces and models listings from the Hugging Face API and
    counts repositories per owner.

    Args:
        limit: Maximum number of entries kept in each ranking.

    Returns:
        A 3-tuple ``(trending_authors, top_spaces, top_models)``:
        ``trending_authors`` is a list of owner names ordered by combined
        (spaces + models) count; ``top_spaces`` and ``top_models`` are lists
        of ``(owner, count)`` pairs sorted by count, descending. A listing
        whose response status is not 200 contributes an empty ranking.

        On any exception an error is shown via ``st.error`` and a fixed
        fallback list of authors (with zero counts) is returned instead.
        Because of ``lru_cache``, whatever is returned (including the
        fallback) is reused for later calls with the same ``limit``.
    """
    try:
        endpoints = {
            "spaces": "https://huggingface.co/api/spaces",
            "models": "https://huggingface.co/api/models",
        }

        # Issue both requests before processing either, so a network failure
        # on one of them falls through to the fallback below.
        responses = {
            key: requests.get(url, params={"limit": 10000}, timeout=30)
            for key, url in endpoints.items()
        }

        rankings = {"spaces": [], "models": []}
        for key, response in responses.items():
            if response.status_code == 200:
                rankings[key] = _top_owners(_count_repo_owners(response.json()), limit)

        # Combined score is simply spaces count + models count, taken from the
        # two (already truncated) rankings.
        combined = Counter()
        for ranking in rankings.values():
            for owner, count in ranking:
                combined[owner] += count

        # Only the owner names are needed for the selection dropdown.
        trending_authors = [owner for owner, _ in _top_owners(combined, limit)]

        return trending_authors, rankings["spaces"], rankings["models"]
    except Exception as e:
        st.error(f"Error fetching trending accounts: {str(e)}")
        fallback_authors = ["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"]
        return fallback_authors, [(author, 0) for author in fallback_authors], [(author, 0) for author in fallback_authors]


# Rate limiting
class RateLimiter:
    """Space out calls so that at most ``calls_per_second`` happen per second.

    ``wait()`` is safe to call from several threads at once: callers are
    serialized by a lock, so concurrent workers cannot all observe the same
    ``last_call`` and proceed together.

    Args:
        calls_per_second: Maximum call rate; must be positive.

    Raises:
        ValueError: If ``calls_per_second`` is not positive.
    """

    def __init__(self, calls_per_second=10):
        import threading  # local import keeps this block self-contained

        if calls_per_second <= 0:
            raise ValueError("calls_per_second must be positive")
        self.calls_per_second = calls_per_second
        # -inf means "never called", so the first wait() returns immediately.
        self.last_call = float("-inf")
        self._lock = threading.Lock()

    def wait(self):
        """Block until at least ``1 / calls_per_second`` seconds separate calls."""
        min_interval = 1.0 / self.calls_per_second
        with self._lock:
            # Monotonic clock: unaffected by wall-clock adjustments, which
            # could otherwise produce a negative elapsed time and a long sleep.
            remaining = min_interval - (time.monotonic() - self.last_call)
            if remaining > 0:
                time.sleep(remaining)
            self.last_call = time.monotonic()


rate_limiter = RateLimiter()


# Function to fetch commits for a repository (optimized)
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    """Collect the dates of a repository's activity within ``selected_year``.

    Args:
        repo_id: Repository identifier passed to the cached Hub helpers.
        repo_type: Repository type passed to the cached Hub helpers.
        username: Unused; kept so existing callers keep working.
        selected_year: Only dates whose year equals this value are returned.

    Returns:
        ``(commit_dates, commit_count)`` where ``commit_dates`` is a list of
        ``datetime.date`` objects and ``commit_count`` is its length. Private
        or gated repositories, and any failure while talking to the Hub,
        yield ``([], 0)``.
    """
    try:
        rate_limiter.wait()
        # Skip private/gated repos upfront
        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or getattr(repo_info, "gated", False):
            return [], 0

        def _as_date(value):
            # Drop timezone info and keep only the calendar date.
            return pd.to_datetime(value).tz_localize(None).date()

        # The repository creation date is counted as an entry of its own,
        # ahead of the listed commits.
        # NOTE(review): if the commit listing also contains the initial commit,
        # the creation day is counted twice — confirm this is intended.
        candidate_dates = [_as_date(repo_info.created_at)]
        candidate_dates.extend(
            _as_date(commit.created_at)
            for commit in cached_list_commits(repo_id, repo_type)
        )

        commit_dates = [day for day in candidate_dates if day.year == selected_year]
        return commit_dates, len(commit_dates)
    except Exception:
        # Best effort: a repo that cannot be read contributes nothing rather
        # than aborting the whole scan.
        return [], 0


# Function to get commit events for a user (optimized)
def get_commit_events(username, kind=None, selected_year=None):
    """Gather commit dates for all of a user's repositories.

    Args:
        username: Account whose repositories are scanned.
        kind: ``"model"``, ``"dataset"`` or ``"space"`` to scan one type only;
            ``None`` scans all three.
        selected_year: Year forwarded to ``fetch_commits_for_repo``.

    Returns:
        ``(df, items_with_type)``. ``df`` has a single ``"date"`` column with
        one row per commit event, so several commits on the same day appear as
        several rows and per-day counts can be derived from it.
        ``items_with_type`` is a list of ``(item, kind)`` pairs for every
        repository found. A failure for one kind is reported with
        ``st.warning`` and that kind is skipped.
    """
    commit_dates = []
    items_with_type = []
    kinds = [kind] if kind else ["model", "dataset", "space"]

    for k in kinds:
        try:
            items = cached_list_items(username, k)
            items_with_type.extend((item, k) for item in items)
            repo_ids = [item.id for item in items]
            if not repo_ids:
                continue

            # One pool per kind, capped at 5 workers; repos are fetched as
            # workers become free instead of waiting on fixed chunks of 5.
            with ThreadPoolExecutor(max_workers=min(5, len(repo_ids))) as executor:
                futures = [
                    executor.submit(fetch_commits_for_repo, repo_id, k, username, selected_year)
                    for repo_id in repo_ids
                ]
                for future in as_completed(futures):
                    repo_commits, _ = future.result()
                    commit_dates.extend(repo_commits)
        except Exception as e:
            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

    # Keep every commit event: dropping duplicate dates here would collapse
    # all same-day commits into one and lose the per-day counts.
    df = pd.DataFrame(commit_dates, columns=["date"])
    return df, items_with_type


# Calendar heatmap function (optimized)
def make_calendar_heatmap(df, title, year):
    """Render a GitHub-style calendar heatmap of commits per day.

    Args:
        df: DataFrame with a ``"date"`` column holding one row per commit
            event. It is not modified.
        title: Plot title; also used (lower-cased) in the "nothing found"
            message.
        year: Calendar year to draw, 1 January through 31 December.

    The grid has one row per weekday (Monday first) and one column per
    calendar week. Cells are coloured in five steps by daily count: 0, 1-2,
    3-10, 11-30 and 31 or more. When ``df`` is empty an info message is shown
    instead of a plot.
    """
    if df.empty:
        st.info(f"No {title.lower()} found for {year}.")
        return

    from matplotlib.colors import ListedColormap, BoundaryNorm

    # Count events per day on a derived Series so the caller's frame is untouched.
    days = pd.to_datetime(df["date"]).dt.normalize()
    daily_counts = days.value_counts()

    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")
    all_days = pd.date_range(start=start, end=end)

    # One row per day of the year; days without commits get 0, and dates
    # outside the requested year are dropped by the reindex.
    heatmap_data = pd.DataFrame({"date": all_days})
    heatmap_data["count"] = daily_counts.reindex(all_days, fill_value=0).to_numpy()

    # Shift by the weekday of 1 January so every column is a Monday-to-Sunday
    # week; without the shift a column would mix days from two calendar weeks.
    jan1_offset = start.dayofweek
    heatmap_data["dow"] = heatmap_data["date"].dt.dayofweek
    heatmap_data["week"] = ((heatmap_data["date"] - start).dt.days + jan1_offset) // 7

    # Cells before 1 January / after 31 December in the edge weeks become 0.
    pivot = heatmap_data.pivot(index="dow", columns="week", values="count").fillna(0)

    # Put each month label on the week column containing the month's first day.
    month_starts = pd.date_range(start, end, freq="MS")
    month_labels = month_starts.strftime("%b")
    month_positions = [((first - start).days + jan1_offset) // 7 for first in month_starts]

    # Create custom colormap with specific boundaries
    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']  # GitHub-style green colors
    bounds = [0, 1, 3, 11, 31, float('inf')]  # Boundaries for color transitions
    cmap = ListedColormap(colors)
    norm = BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots(figsize=(12, 1.2))

    # Convert pivot values to integers to ensure proper color mapping
    pivot_int = pivot.astype(int)

    sns.heatmap(pivot_int, ax=ax, cmap=cmap, norm=norm, linewidths=0.5, linecolor="white",
                square=True, cbar=False, yticklabels=["M", "T", "W", "T", "F", "S", "S"])

    ax.set_title(f"{title}", fontsize=12, pad=10)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks(month_positions)
    ax.set_xticklabels(month_labels, fontsize=8)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
    st.pyplot(fig)
    # Release the figure; pyplot otherwise keeps every figure it created alive.
    plt.close(fig)


# Fetch the trending-account rankings once, up front and behind a spinner:
# the sidebar tables and the main-content rank messages below both read them.
with st.spinner("Loading trending accounts..."):
    trending_accounts, top_owners_spaces, top_owners_models = get_trending_accounts(limit=100)

# Sidebar
def _render_ranking_tab(top_owners, noun, emoji, colormap):
    """Render one ranking tab: a top-30 table plus a collapsible bar chart.

    Args:
        top_owners: List of ``(owner, count)`` pairs, highest count first.
        noun: ``"Spaces"`` or ``"Models"``; used in every label of the tab.
        emoji: Emoji shown in the tab's subheader.
        colormap: Matplotlib colormap used for the bar colour gradient.
    """
    st.subheader(f"{emoji} Top 30 by {noun}")
    st.markdown(f"### {noun} Contributors Ranking")

    count_col = f"{noun} Count"
    top_30 = top_owners[:30]

    if top_30:
        ranking = pd.DataFrame(top_30, columns=["Contributor", count_col])
        ranking.index = ranking.index + 1  # Start index from 1 for ranking
        ranking["Score"] = ranking[count_col] * 10  # Multiply by 10 for a score metric

        # 10% headroom above the best score; never below 1 so the progress
        # scale stays valid when every count is 0 (e.g. the fallback ranking).
        score_ceiling = max(float(ranking["Score"].max()) * 1.1, 1.0)

        # NOTE(review): the labels below say "top 500" but the rankings are
        # built from requests made with limit=10000 — confirm which is right.
        st.dataframe(
            ranking,
            column_config={
                "Contributor": st.column_config.TextColumn("Contributor"),
                count_col: st.column_config.NumberColumn(
                    f"{noun} Count (based on top 500 {noun.lower()})", format="%d"
                ),
                "Score": st.column_config.ProgressColumn(
                    f"Score (within TOP 500 {noun.upper()})",
                    min_value=0,
                    max_value=score_ceiling,
                    format="%d pts",
                ),
            },
            use_container_width=True,
            hide_index=False,
        )

    with st.expander(f"View Top 30 {noun} Contributors Chart"):
        if top_30:
            chart_data = pd.DataFrame(top_30, columns=["Owner", count_col])

            fig, ax = plt.subplots(figsize=(10, 8))
            bars = ax.barh(chart_data["Owner"], chart_data[count_col])

            # Add color gradient to bars
            for i, bar in enumerate(bars):
                bar.set_color(colormap(i / len(bars)))

            ax.set_title(f"Top 30 Contributors by Number of {noun}")
            ax.set_xlabel(f"Number of {noun}")
            fig.tight_layout()
            st.pyplot(fig)
            # Release the figure; pyplot otherwise keeps it alive across reruns.
            plt.close(fig)


with st.sidebar:
    st.title("👤 Contributor")

    # One tab per ranking; models use a different colormap than spaces.
    tab1, tab2 = st.tabs(["Spaces Ranking", "Models Ranking"])
    with tab1:
        _render_ranking_tab(top_owners_spaces, "Spaces", "🚀", plt.cm.viridis)
    with tab2:
        _render_ranking_tab(top_owners_models, "Models", "🧠", plt.cm.plasma)

    # Display trending accounts selection dropdown
    st.subheader("Select Contributor")
    selected_trending = st.selectbox(
        "Select trending account",
        options=trending_accounts[:30],  # Limit to top 30
        index=0 if trending_accounts else None,
        key="trending_selectbox"
    )

    # Custom account input option
    st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
    # A real label hidden with label_visibility replaces the empty label.
    custom = st.text_input(
        "Custom username or organization",
        placeholder="Enter custom username/org",
        label_visibility="collapsed",
    )

    # Custom input wins over the dropdown; "facebook" is the last-resort default.
    username = custom.strip() or selected_trending or "facebook"

    # Year selection
    st.subheader("🗓️ Time Period")
    year_options = list(range(datetime.now().year, 2017, -1))
    selected_year = st.selectbox("Select Year", options=year_options)

    # Additional options for customization
    st.subheader("⚙️ Display Options")
    show_models = st.checkbox("Show Models", value=True)
    show_datasets = st.checkbox("Show Datasets", value=True)
    show_spaces = st.checkbox("Show Spaces", value=True)

# Main Content
st.title("🤗 Hugging Face Contributions")
if username:
    with st.spinner(f"Fetching commit data for {username}..."):
        # Display contributor rank if in top 30
        if username in trending_accounts[:30]:
            rank = trending_accounts.index(username) + 1
            st.success(f"🏆 {username} is ranked #{rank} in the top trending contributors!")

            # Find user in spaces ranking
            for position, (owner, count) in enumerate(top_owners_spaces, start=1):
                if owner == username:
                    st.info(f"🚀 Spaces Ranking: #{position} with {count} spaces")
                    break

            # Find user in models ranking
            for position, (owner, count) in enumerate(top_owners_models, start=1):
                if owner == username:
                    st.info(f"🧠 Models Ranking: #{position} with {count} models")
                    break

        # Determine which types to fetch based on checkboxes
        types_to_fetch = [
            kind
            for kind, enabled in (
                ("model", show_models),
                ("dataset", show_datasets),
                ("space", show_spaces),
            )
            if enabled
        ]

        if not types_to_fetch:
            st.warning("Please select at least one content type to display (Models, Datasets, or Spaces)")
            st.stop()

        # Commit dates and commit totals, keyed by repository type
        commits_by_type = {}
        commit_counts_by_type = {}

        # Fetch commits for each selected type
        for kind in types_to_fetch:
            try:
                items = cached_list_items(username, kind)
                repo_ids = [item.id for item in items]

                st.info(f"Found {len(repo_ids)} {kind}s for {username}")

                kind_commit_dates = []
                kind_commit_count = 0

                progress_bar = st.progress(0)
                if repo_ids:
                    # One pool per type, capped at 5 workers; progress advances
                    # as each repository finishes.
                    with ThreadPoolExecutor(max_workers=min(5, len(repo_ids))) as executor:
                        futures = [
                            executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year)
                            for repo_id in repo_ids
                        ]
                        for done, future in enumerate(as_completed(futures), start=1):
                            repo_commits, repo_count = future.result()
                            if repo_commits:
                                kind_commit_dates.extend(repo_commits)
                                kind_commit_count += repo_count
                            progress_bar.progress(done / len(repo_ids))

                # Complete progress
                progress_bar.progress(1.0)

                commits_by_type[kind] = kind_commit_dates
                commit_counts_by_type[kind] = kind_commit_count

            except Exception as e:
                st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
                commits_by_type[kind] = []
                commit_counts_by_type[kind] = 0

        # Calculate total commits across all types
        total_commits = sum(commit_counts_by_type.values())

        st.subheader(f"{username}'s Activity in {selected_year}")

        # Profile information
        profile_col1, profile_col2 = st.columns([1, 3])
        with profile_col1:
            # Try to get avatar
            try:
                avatar_url = f"https://huggingface.co/avatars/{username}"
                st.image(avatar_url, width=150)
            except Exception:
                st.info("No profile image available")

        with profile_col2:
            st.metric("Total Commits", total_commits)

            # Show the spaces count when the user appears in the spaces ranking
            # (case-insensitive match).
            for owner, count in top_owners_spaces:
                if owner.lower() == username.lower():
                    st.metric("Spaces Count", count)
                    break

            st.markdown(f"[View Profile on Hugging Face](https://huggingface.co/{username})")

        # One row per commit event. Same-day rows are kept on purpose: the
        # heatmap derives its per-day counts from them, and dropping duplicate
        # dates would cap every cell at 1.
        all_commits = []
        for commits in commits_by_type.values():
            all_commits.extend(commits)
        all_df = pd.DataFrame(all_commits, columns=["date"])

        make_calendar_heatmap(all_df, "All Commits", selected_year)

        # Metrics and heatmaps for each selected type, one column per type
        kind_labels = [
            ("model", "🧠", "Models"),
            ("dataset", "📦", "Datasets"),
            ("space", "🚀", "Spaces"),
        ]
        selected_kinds = [entry for entry in kind_labels if entry[0] in types_to_fetch]

        for column, (kind, emoji, label) in zip(st.columns(len(selected_kinds)), selected_kinds):
            with column:
                try:
                    total = len(cached_list_items(username, kind))
                    commits = commits_by_type.get(kind, [])
                    commit_count = commit_counts_by_type.get(kind, 0)
                    df_kind = pd.DataFrame(commits, columns=["date"])
                    st.metric(f"{emoji} {label}", total)
                    st.metric(f"Commits in {selected_year}", commit_count)
                    make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
                except Exception as e:
                    st.warning(f"Error processing {label}: {str(e)}")
                    st.metric(f"{emoji} {label}", 0)
                    st.metric(f"Commits in {selected_year}", 0)
                    make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
else:
    st.info("Please select an account from the sidebar to view contributions.")