"""Streamlit dashboard showing Hugging Face contribution activity.

The sidebar ranks account owners by how many Spaces and Models they own in
the listing returned by the Hub API. The main panel shows commit-calendar
heatmaps and a follower chart for the selected account and year.
"""

import random
import threading
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import streamlit as st
from dateutil.relativedelta import relativedelta
from huggingface_hub import HfApi
from matplotlib.colors import BoundaryNorm, ListedColormap

st.set_page_config(page_title="HF Contributions", layout="wide", initial_sidebar_state="expanded")

# Set custom sidebar width - UPDATED to 40% of the screen
# NOTE(review): the markup string below is blank in this copy of the file, so
# no CSS is actually injected - confirm whether a <style> block was lost.
st.markdown(""" """, unsafe_allow_html=True)

api = HfApi()

# Accounts shown when the trending lookup fails.
FALLBACK_AUTHORS = ["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"]

# Number of repositories fetched concurrently per batch.
CHUNK_SIZE = 5


# Cache for API responses.
# NOTE(review): Streamlit re-executes this script on every interaction, which
# rebuilds these lru_cache wrappers, so they only de-duplicate calls within a
# single run. They are left as lru_cache because two of them are called from
# worker threads and return huggingface_hub objects.
@lru_cache(maxsize=1000)
def cached_repo_info(repo_id, repo_type):
    """Return ``api.repo_info`` for the repository, memoised per (id, type)."""
    return api.repo_info(repo_id=repo_id, repo_type=repo_type)


@lru_cache(maxsize=1000)
def cached_list_commits(repo_id, repo_type):
    """Return the repository's commits as a list, memoised per (id, type)."""
    return list(api.list_repo_commits(repo_id=repo_id, repo_type=repo_type))


@lru_cache(maxsize=100)
def cached_list_items(username, kind):
    """Return the models, datasets or spaces authored by ``username``.

    ``kind`` is one of ``"model"``, ``"dataset"`` or ``"space"``; any other
    value yields an empty list.
    """
    if kind == "model":
        return list(api.list_models(author=username))
    elif kind == "dataset":
        return list(api.list_datasets(author=username))
    elif kind == "space":
        return list(api.list_spaces(author=username))
    return []


def _rank_owners(items, limit):
    """Count repositories per owner and return the top ``limit`` pairs.

    ``items`` is the decoded JSON list from the Hub listing endpoints. The
    owner is the part of ``id`` before the first ``/``, falling back to the
    ``owner`` field. Entries with no usable owner are skipped so that
    ``''``/``None`` never show up as a contributor.
    """
    counts = Counter()
    for item in items:
        repo_id = item.get("id", "")
        if "/" in repo_id:
            owner = repo_id.split("/", 1)[0]
        else:
            owner = item.get("owner", "")
        if owner and owner != "None":
            counts[owner] += 1
    # sorted() is stable, so ties keep first-seen order.
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:limit]


# Function to fetch trending accounts and create stats.
# st.cache_data keeps the result across reruns; functools.lru_cache would be
# rebuilt each time Streamlit re-executes the script.
@st.cache_data(ttl=3600, show_spinner=False)
def get_trending_accounts(limit=100):
    """Return ``(trending_authors, top_spaces, top_models)``.

    ``top_spaces`` and ``top_models`` are lists of ``(owner, count)`` tuples,
    most prolific first. ``trending_authors`` merges both rankings: an owner
    earns ``limit - position`` points per list they appear in. On any error
    a fixed fallback list is returned with zero counts.
    """
    try:
        spaces_response = requests.get(
            "https://huggingface.co/api/spaces", params={"limit": 10000}, timeout=30
        )
        models_response = requests.get(
            "https://huggingface.co/api/models", params={"limit": 10000}, timeout=30
        )

        # A non-200 response leaves that ranking empty instead of failing.
        top_owners_spaces = (
            _rank_owners(spaces_response.json(), limit)
            if spaces_response.status_code == 200
            else []
        )
        top_owners_models = (
            _rank_owners(models_response.json(), limit)
            if models_response.status_code == 200
            else []
        )

        # Combine rankings: a higher rank in either list gives more points.
        combined_score = Counter()
        for ranking in (top_owners_spaces, top_owners_models):
            for position, (owner, _) in enumerate(ranking):
                combined_score[owner] += limit - position

        sorted_combined = sorted(
            combined_score.items(), key=lambda pair: pair[1], reverse=True
        )[:limit]
        trending_authors = [owner for owner, _ in sorted_combined]

        return trending_authors, top_owners_spaces, top_owners_models
    except Exception as e:
        st.error(f"Error fetching trending accounts: {str(e)}")
        fallback_authors = list(FALLBACK_AUTHORS)
        return (
            fallback_authors,
            [(author, 0) for author in fallback_authors],
            [(author, 0) for author in fallback_authors],
        )


# Rate limiting
class RateLimiter:
    """Space out calls so at most ``calls_per_second`` happen per second.

    ``wait`` is called from several worker threads, so the bookkeeping is
    guarded by a lock; without it concurrent callers all pass at once.
    """

    def __init__(self, calls_per_second=10):
        self.calls_per_second = calls_per_second
        self.last_call = 0
        self._lock = threading.Lock()

    def wait(self):
        """Block until the minimum interval since the previous call has passed."""
        min_interval = 1.0 / self.calls_per_second
        with self._lock:
            elapsed = time.time() - self.last_call
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            self.last_call = time.time()


rate_limiter = RateLimiter()


# Function to fetch commits for a repository
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    """Return ``(commit_dates, commit_count)`` for one repo in ``selected_year``.

    ``commit_dates`` is a list of ``datetime.date`` values: the repository
    creation date (if in the selected year) followed by the date of every
    listed commit in that year. Private and gated repositories, and any
    repository whose lookup raises, yield ``([], 0)``. ``username`` is
    accepted for call-site compatibility but not used.
    """
    try:
        rate_limiter.wait()

        # Skip private/gated repos upfront
        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or getattr(repo_info, "gated", False):
            return [], 0

        commit_dates = []

        # NOTE(review): the creation date is counted as a commit in addition
        # to the listed commits - confirm this does not double-count the
        # repository's first commit.
        initial_commit_date = pd.to_datetime(repo_info.created_at).tz_localize(None).date()
        if initial_commit_date.year == selected_year:
            commit_dates.append(initial_commit_date)

        for commit in cached_list_commits(repo_id, repo_type):
            commit_date = pd.to_datetime(commit.created_at).tz_localize(None).date()
            if commit_date.year == selected_year:
                commit_dates.append(commit_date)

        return commit_dates, len(commit_dates)
    except Exception:
        # Best effort: one unreadable repository must not abort the whole scan.
        return [], 0


def _collect_commits(repo_ids, kind, username, selected_year, on_progress=None):
    """Fetch commit dates for ``repo_ids`` in parallel batches.

    Returns ``(all_commit_dates, total_count)``. ``on_progress``, when given,
    is called after each batch with the completed fraction in ``[0, 1]``.
    """
    all_commit_dates = []
    total_commits = 0
    for offset in range(0, len(repo_ids), CHUNK_SIZE):
        chunk = repo_ids[offset:offset + CHUNK_SIZE]
        with ThreadPoolExecutor(max_workers=min(CHUNK_SIZE, len(chunk))) as executor:
            futures = [
                executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year)
                for repo_id in chunk
            ]
            for future in as_completed(futures):
                repo_commits, repo_count = future.result()
                if repo_commits:
                    all_commit_dates.extend(repo_commits)
                    total_commits += repo_count
        if on_progress is not None:
            on_progress(min(1.0, (offset + len(chunk)) / max(1, len(repo_ids))))
    return all_commit_dates, total_commits


# Function to get commit events for a user
def get_commit_events(username, kind=None, selected_year=None):
    """Return ``(df, items_with_type)`` for a user's repositories.

    ``df`` has a single ``date`` column with one row per commit event, so
    per-day counts survive for the heatmap. ``items_with_type`` is a list of
    ``(item, kind)`` pairs. When ``kind`` is falsy, models, datasets and
    spaces are all scanned.
    """
    commit_dates = []
    items_with_type = []
    kinds = [kind] if kind else ["model", "dataset", "space"]

    for k in kinds:
        try:
            items = cached_list_items(username, k)
            items_with_type.extend((item, k) for item in items)
            repo_ids = [item.id for item in items]
            repo_dates, _ = _collect_commits(repo_ids, k, username, selected_year)
            commit_dates.extend(repo_dates)
        except Exception as e:
            st.warning(f"Error fetching {k}s for {username}: {str(e)}")

    df = pd.DataFrame(commit_dates, columns=["date"])
    return df, items_with_type


# Calendar heatmap function
def make_calendar_heatmap(df, title, year):
    """Render a GitHub-style calendar heatmap of commits per day.

    ``df`` needs a ``date`` column with one row per commit; it is not
    modified. Columns are Monday-to-Sunday weeks and rows are weekdays.
    An empty ``df`` shows an info message instead of a plot.
    """
    if df.empty:
        st.info(f"No {title.lower()} found for {year}.")
        return

    # Count commits per day without touching the caller's frame.
    daily = df.groupby("date").size().rename("count").reset_index()
    daily["date"] = pd.to_datetime(daily["date"])

    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")

    heatmap_data = pd.DataFrame({"date": pd.date_range(start=start, end=end)})
    heatmap_data = heatmap_data.merge(daily, on="date", how="left")
    heatmap_data["count"] = heatmap_data["count"].fillna(0)

    # Shift by the weekday of Jan 1 so each column is a real Mon-Sun week;
    # plain days // 7 puts days of one column out of chronological order.
    first_weekday = start.dayofweek
    heatmap_data["dow"] = heatmap_data["date"].dt.dayofweek
    heatmap_data["week"] = ((heatmap_data["date"] - start).dt.days + first_weekday) // 7

    pivot = heatmap_data.pivot(index="dow", columns="week", values="count")
    # Cells outside the year (before Jan 1 / after Dec 31) are hidden.
    outside_year = pivot.isna()
    pivot_int = pivot.fillna(0).astype(int)

    month_starts = pd.date_range(start, end, freq="MS")
    month_labels = month_starts.strftime("%b")
    month_positions = [((ts - start).days + first_weekday) // 7 for ts in month_starts]

    # GitHub-style greens: 0, 1-2, 3-10, 11-30 and 31+ commits per day.
    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']
    bounds = [0, 1, 3, 11, 31, float('inf')]
    cmap = ListedColormap(colors)
    norm = BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots(figsize=(12, 1.2))
    sns.heatmap(
        pivot_int,
        ax=ax,
        cmap=cmap,
        norm=norm,
        mask=outside_year,
        linewidths=0.5,
        linecolor="white",
        square=True,
        cbar=False,
        yticklabels=["M", "T", "W", "T", "F", "S", "S"],
    )

    ax.set_title(f"{title}", fontsize=12, pad=10)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks(month_positions)
    ax.set_xticklabels(month_labels, fontsize=8)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
    st.pyplot(fig)
    # Release the figure; otherwise figures pile up across reruns.
    plt.close(fig)


# Function to fetch follower data for a user.
# Cached with st.cache_data so the simulated series below stays the same
# across reruns instead of being re-randomised on every interaction.
@st.cache_data(ttl=3600, show_spinner=False)
def fetch_follower_data(username):
    """Return a list of ``{"date", "followers"}`` dicts for ``username``.

    NOTE(review): the endpoint is a placeholder. On any non-200 response this
    returns 12 months of randomly generated demonstration data, which the UI
    shows as if it were real - confirm this is acceptable. Returns ``[]`` if
    an exception is raised.
    """
    try:
        url = f"https://huggingface.co/api/users/{username}/followers-history"
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            # Simulate 12 months of monotonically growing data.
            today = datetime.now()
            data = []
            followers = random.randint(10, 100)
            for i in range(12):
                date = today - relativedelta(months=11 - i)
                followers += random.randint(0, 10)
                data.append({
                    "date": date.strftime("%Y-%m-%d"),
                    "followers": followers,
                })
            return data

        return response.json()
    except Exception as e:
        st.error(f"Error fetching follower data: {str(e)}")
        return []


# Function to render follower chart
def render_follower_chart(username):
    """Plot the follower series returned by ``fetch_follower_data``."""
    follower_data = fetch_follower_data(username)

    if not follower_data:
        st.info(f"No follower data available for {username}")
        return

    dates = [item["date"] for item in follower_data]
    followers = [item["followers"] for item in follower_data]

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(dates, followers, marker='o', linestyle='-', color='#60A5FA')

    ax.set_title(f"Follower Evolution for {username}", fontsize=16)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Followers", fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)

    # Rotate date labels for readability and keep everything inside the figure.
    ax.tick_params(axis="x", rotation=45)
    fig.tight_layout()

    st.pyplot(fig)
    plt.close(fig)


def _render_owner_ranking(owners, label, colormap):
    """Render the ranking table and top-30 bar chart for one repo kind.

    ``owners`` is a list of ``(owner, count)`` tuples, ``label`` is
    ``"Spaces"`` or ``"Models"``, and ``colormap`` is the matplotlib colormap
    used for the bar gradient.
    """
    count_column = f"{label} Count"

    st.markdown(f"### {label} Contributors Ranking")

    if owners:
        ranking = pd.DataFrame(owners[:100], columns=["Contributor", count_column])
        ranking.index = ranking.index + 1  # Start index from 1 for ranking

        # NOTE(review): the caption says "top 500" but the listing request
        # asks for up to 10000 entries - confirm which is accurate.
        st.dataframe(
            ranking,
            column_config={
                "Contributor": st.column_config.TextColumn("Contributor"),
                count_column: st.column_config.NumberColumn(
                    f"{label} Count (based on top 500 {label.lower()})", format="%d"
                ),
            },
            use_container_width=True,
            hide_index=False,
        )

    with st.expander(f"View Top 30 {label} Contributors Chart"):
        if owners:
            chart_data = pd.DataFrame(owners[:30], columns=["Owner", count_column])
            fig, ax = plt.subplots(figsize=(10, 8))
            bars = ax.barh(chart_data["Owner"], chart_data[count_column])

            # Add color gradient to bars
            for i, bar in enumerate(bars):
                bar.set_color(colormap(i / len(bars)))

            ax.set_title(f"Top 30 Contributors by Number of {label}")
            ax.set_xlabel(f"Number of {label}")
            fig.tight_layout()
            st.pyplot(fig)
            plt.close(fig)


# Fetch trending accounts with a loading spinner (do this once at the beginning)
with st.spinner("Loading trending accounts..."):
    trending_accounts, top_owners_spaces, top_owners_models = get_trending_accounts(limit=100)

# Sidebar
with st.sidebar:
    st.title("👤 Contributor")

    tab1, tab2 = st.tabs([
        "Top 100 Overall Contributors",
        "Top 100 by Spaces & Models",
    ])

    with tab1:
        st.subheader("🔥 Top 100 Overall Contributors")
        st.markdown("### Combined Contributors Ranking")

        if trending_accounts:
            # Map each owner to their 1-based position in each ranking.
            spaces_rank_by_owner = {
                owner: idx + 1 for idx, (owner, _) in enumerate(top_owners_spaces)
            }
            models_rank_by_owner = {
                owner: idx + 1 for idx, (owner, _) in enumerate(top_owners_models)
            }

            # Ranks are rendered as strings so "-" can mark "not ranked".
            overall_data = [
                [
                    account,
                    str(spaces_rank_by_owner.get(account, "-")),
                    str(models_rank_by_owner.get(account, "-")),
                ]
                for account in trending_accounts[:100]
            ]

            ranking_data_overall = pd.DataFrame(
                overall_data, columns=["Contributor", "Spaces Rank", "Models Rank"]
            )
            ranking_data_overall.index = ranking_data_overall.index + 1  # Start index from 1 for ranking

            st.dataframe(
                ranking_data_overall,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Spaces Rank": st.column_config.TextColumn("Spaces Rank (top 100)"),
                    "Models Rank": st.column_config.TextColumn("Models Rank (top 100)"),
                },
                use_container_width=True,
                hide_index=False,
            )

    with tab2:
        st.subheader("🚀 Top 100 by Spaces & Models")
        _render_owner_ranking(top_owners_spaces, "Spaces", plt.cm.viridis)
        # A different colormap keeps the two charts visually distinct.
        _render_owner_ranking(top_owners_models, "Models", plt.cm.plasma)

    # Display trending accounts selection dropdown
    st.subheader("Select Contributor")
    selected_trending = st.selectbox(
        "Select trending account",
        options=trending_accounts[:100],  # Limit to top 100
        index=0 if trending_accounts else None,
        key="trending_selectbox",
    )

    # Custom account input option
    # NOTE(review): this markup string is blank in this copy of the file.
    st.markdown("\n\n", unsafe_allow_html=True)
    custom = st.text_input("Enter username/org", label_visibility="collapsed")

    # A typed username wins over the dropdown selection.
    if custom.strip():
        username = custom.strip()
    elif selected_trending:
        username = selected_trending
    else:
        username = "facebook"  # Default fallback

    # Year selection
    st.subheader("🗓️ Time Period")
    year_options = list(range(datetime.now().year, 2017, -1))
    selected_year = st.selectbox("Select Year", options=year_options)

    # Additional options for customization
    st.subheader("⚙️ Display Options")
    show_models = st.checkbox("Show Models", value=True)
    show_datasets = st.checkbox("Show Datasets", value=True)
    show_spaces = st.checkbox("Show Spaces", value=True)

# Main Content
st.title("🤗 Hugging Face Contributions")

if username:
    with st.spinner(f"Fetching commit data for {username}..."):
        # Display contributor rank if in the top 100
        if username in trending_accounts[:100]:
            rank = trending_accounts.index(username) + 1
            st.success(f"🏆 {username} is ranked #{rank} in the top trending contributors!")

            # Find user in spaces ranking
            user_spaces_rank = None
            for position, (owner, count) in enumerate(top_owners_spaces, start=1):
                if owner == username:
                    user_spaces_rank = position
                    st.info(f"🚀 Spaces Ranking: #{user_spaces_rank} with {count} spaces")
                    break

            # Find user in models ranking
            user_models_rank = None
            for position, (owner, count) in enumerate(top_owners_models, start=1):
                if owner == username:
                    user_models_rank = position
                    st.info(f"🧠 Models Ranking: #{user_models_rank} with {count} models")
                    break

            # Display combined ranking info
            combined_info = []
            if user_spaces_rank and user_spaces_rank <= 100:
                combined_info.append(f"Spaces: #{user_spaces_rank}")
            if user_models_rank and user_models_rank <= 100:
                combined_info.append(f"Models: #{user_models_rank}")
            if combined_info:
                st.success(f"Combined Rankings (Top 100): {', '.join(combined_info)}")

        # Determine which types to fetch based on checkboxes
        types_to_fetch = []
        if show_models:
            types_to_fetch.append("model")
        if show_datasets:
            types_to_fetch.append("dataset")
        if show_spaces:
            types_to_fetch.append("space")

        if not types_to_fetch:
            st.warning("Please select at least one content type to display (Models, Datasets, or Spaces)")
            st.stop()

        # Commit dates and commit totals, keyed by repo kind.
        commits_by_type = {}
        commit_counts_by_type = {}

        for kind in types_to_fetch:
            try:
                repo_ids = [item.id for item in cached_list_items(username, kind)]
                st.info(f"Found {len(repo_ids)} {kind}s for {username}")

                progress_bar = st.progress(0)
                kind_dates, kind_total = _collect_commits(
                    repo_ids, kind, username, selected_year,
                    on_progress=progress_bar.progress,
                )
                progress_bar.progress(1.0)

                commits_by_type[kind] = kind_dates
                commit_counts_by_type[kind] = kind_total
            except Exception as e:
                st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
                commits_by_type[kind] = []
                commit_counts_by_type[kind] = 0

        # Calculate total commits across all types
        total_commits = sum(commit_counts_by_type.values())

        st.subheader(f"{username}'s Activity in {selected_year}")

        # Profile information
        profile_col1, profile_col2 = st.columns([1, 3])

        with profile_col1:
            try:
                avatar_url = f"https://huggingface.co/avatars/{username}"
                st.image(avatar_url, width=150)
            except Exception:
                st.info("No profile image available")

        with profile_col2:
            st.metric("Total Commits", total_commits)

            # Show the Spaces count when the account is in the Spaces ranking.
            for owner, count in top_owners_spaces:
                if owner.lower() == username.lower():
                    st.metric("Spaces Count", count)
                    break

            st.markdown(f"[View Profile on Hugging Face](https://huggingface.co/{username})")

        # One row per commit: duplicates are kept on purpose so the heatmap
        # colour reflects how many commits landed on each day.
        all_commits = []
        for kind_commits in commits_by_type.values():
            all_commits.extend(kind_commits)
        all_df = pd.DataFrame(all_commits, columns=["date"])

        make_calendar_heatmap(all_df, "All Commits", selected_year)

        # Add followers chart section
        st.subheader(f"👥 Follower Evolution for {username}")
        render_follower_chart(username)

        # Metrics and heatmaps for each selected type, one column per type.
        kind_labels = {
            "model": ("🧠", "Models"),
            "dataset": ("📦", "Datasets"),
            "space": ("🚀", "Spaces"),
        }
        cols = st.columns(len(types_to_fetch)) if types_to_fetch else st.columns(1)
        for column, kind in zip(cols, types_to_fetch):
            emoji, label = kind_labels[kind]
            with column:
                try:
                    total = len(cached_list_items(username, kind))
                    df_kind = pd.DataFrame(commits_by_type.get(kind, []), columns=["date"])
                    commit_count = commit_counts_by_type.get(kind, 0)

                    st.metric(f"{emoji} {label}", total)
                    st.metric(f"Commits in {selected_year}", commit_count)
                    make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
                except Exception as e:
                    st.warning(f"Error processing {label}: {str(e)}")
                    st.metric(f"{emoji} {label}", 0)
                    st.metric(f"Commits in {selected_year}", 0)
                    make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
else:
    st.info("Please select an account from the sidebar to view contributions.")