# NOTE(review): removed Hugging Face file-viewer scrape residue
# ("kolaslab's picture / Update app.py / 3d0bb33 verified / raw /
#  history blame / 26.9 kB") — it was not part of the Python source
# and breaks parsing if left in the file.
# Standard library
import threading
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from functools import lru_cache

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import streamlit as st
from huggingface_hub import HfApi
# Page-level Streamlit configuration (must run before any other st.* call).
st.set_page_config(page_title="HF Contributions", layout="wide", initial_sidebar_state="expanded")
# Set custom sidebar width - UPDATED to 40% of the screen
# (injected as raw CSS; Streamlit has no first-class sidebar-width option)
st.markdown("""
<style>
[data-testid="stSidebar"] {
min-width: 40vw !important;
max-width: 40vw !important;
}
</style>
""", unsafe_allow_html=True)
# Anonymous (unauthenticated) Hub client shared by all helpers below.
api = HfApi()
# Cache for API responses
@lru_cache(maxsize=1000)
def cached_repo_info(repo_id, repo_type):
    """Memoized wrapper around HfApi.repo_info to avoid repeated HTTP calls."""
    return api.repo_info(repo_id=repo_id, repo_type=repo_type)


@lru_cache(maxsize=1000)
def cached_list_commits(repo_id, repo_type):
    """Memoized commit history for one repo (materialized to a list so it is cacheable)."""
    return list(api.list_repo_commits(repo_id=repo_id, repo_type=repo_type))
@lru_cache(maxsize=100)
def cached_list_items(username, kind):
    """Memoized listing of a user's repos of one kind.

    `kind` is "model", "dataset" or "space"; any other value yields [].
    """
    listers = {
        "model": api.list_models,
        "dataset": api.list_datasets,
        "space": api.list_spaces,
    }
    lister = listers.get(kind)
    if lister is None:
        return []
    return list(lister(author=username))
# Function to fetch trending accounts and create stats
@lru_cache(maxsize=1)
def get_trending_accounts(limit=100):
try:
trending_data = {"spaces": [], "models": []}
# Get spaces for stats calculation
spaces_response = requests.get("https://huggingface.co/api/spaces",
params={"limit": 10000},
timeout=30)
# Get models for stats calculation
models_response = requests.get("https://huggingface.co/api/models",
params={"limit": 10000},
timeout=30)
# Process spaces data
spaces_owners = []
if spaces_response.status_code == 200:
spaces = spaces_response.json()
# Count spaces by owner
owner_counts_spaces = {}
for space in spaces:
if '/' in space.get('id', ''):
owner, _ = space.get('id', '').split('/', 1)
else:
owner = space.get('owner', '')
if owner != 'None':
owner_counts_spaces[owner] = owner_counts_spaces.get(owner, 0) + 1
# Get top owners by count for spaces
top_owners_spaces = sorted(owner_counts_spaces.items(), key=lambda x: x[1], reverse=True)[:limit]
trending_data["spaces"] = top_owners_spaces
spaces_owners = [owner for owner, _ in top_owners_spaces]
# Process models data
models_owners = []
if models_response.status_code == 200:
models = models_response.json()
# Count models by owner
owner_counts_models = {}
for model in models:
if '/' in model.get('id', ''):
owner, _ = model.get('id', '').split('/', 1)
else:
owner = model.get('owner', '')
if owner != 'None':
owner_counts_models[owner] = owner_counts_models.get(owner, 0) + 1
# Get top owners by count for models
top_owners_models = sorted(owner_counts_models.items(), key=lambda x: x[1], reverse=True)[:limit]
trending_data["models"] = top_owners_models
models_owners = [owner for owner, _ in top_owners_models]
# Combine rankings for overall trending based on appearance in both lists
combined_score = {}
for i, owner in enumerate(spaces_owners):
if owner not in combined_score:
combined_score[owner] = 0
combined_score[owner] += (limit - i) # Higher rank gives more points
for i, owner in enumerate(models_owners):
if owner not in combined_score:
combined_score[owner] = 0
combined_score[owner] += (limit - i) # Higher rank gives more points
# Sort by combined score
sorted_combined = sorted(combined_score.items(), key=lambda x: x[1], reverse=True)[:limit]
trending_authors = [owner for owner, _ in sorted_combined]
return trending_authors, trending_data["spaces"], trending_data["models"]
except Exception as e:
st.error(f"Error fetching trending accounts: {str(e)}")
fallback_authors = ["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"]
return fallback_authors, [(author, 0) for author in fallback_authors], [(author, 0) for author in fallback_authors]
# Rate limiting
class RateLimiter:
    """Blocking throttle that spaces calls at least 1/calls_per_second apart.

    BUGFIX: wait() is invoked concurrently from ThreadPoolExecutor workers
    (see fetch_commits_for_repo), but the original implementation read and
    wrote `last_call` without synchronization, so racing threads could all
    observe a stale timestamp and pass through without sleeping. A lock now
    serializes wait(), which also makes the sleep itself enforce the spacing.
    """

    def __init__(self, calls_per_second=10):
        # Target maximum call rate (calls per second).
        self.calls_per_second = calls_per_second
        # Timestamp of the most recent permitted call (0.0 = never).
        self.last_call = 0.0
        # Guards last_call; wait() runs from multiple worker threads.
        self._lock = threading.Lock()

    def wait(self):
        """Block just long enough to honor the configured call rate."""
        min_interval = 1.0 / self.calls_per_second
        with self._lock:
            elapsed = time.time() - self.last_call
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            self.last_call = time.time()


# Module-level limiter shared by all repo-fetching threads.
rate_limiter = RateLimiter()
# Function to fetch commits for a repository (optimized)
def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
    """Return (commit_dates, commit_count) for one repo, limited to selected_year.

    `username` is unused but kept so existing positional call sites stay valid.
    Private/gated repos and any API failure yield ([], 0) — best effort.
    """
    try:
        rate_limiter.wait()
        # Skip private/gated repos upfront
        repo_info = cached_repo_info(repo_id, repo_type)
        if repo_info.private or (hasattr(repo_info, 'gated') and repo_info.gated):
            # BUGFIX: previously returned ([], []) here — the count was a
            # list while every other return path uses an int.
            return [], 0
        commit_dates = []
        commit_count = 0
        # Treat the repo-creation date as the initial commit.
        initial_commit_date = pd.to_datetime(repo_info.created_at).tz_localize(None).date()
        if initial_commit_date.year == selected_year:
            commit_dates.append(initial_commit_date)
            commit_count += 1
        # Walk the full commit history (cached per repo).
        # NOTE(review): if list_repo_commits already includes the initial
        # commit it is counted twice here — behavior preserved from original.
        for commit in cached_list_commits(repo_id, repo_type):
            commit_date = pd.to_datetime(commit.created_at).tz_localize(None).date()
            if commit_date.year == selected_year:
                commit_dates.append(commit_date)
                commit_count += 1
        return commit_dates, commit_count
    except Exception:
        # Best-effort: rate limits, 404s, auth errors all count as no commits.
        return [], 0
# Function to get commit events for a user (optimized)
def get_commit_events(username, kind=None, selected_year=None):
    """Gather commit dates across a user's repos.

    kind: restrict to one of "model"/"dataset"/"space", or None for all three.
    Returns (DataFrame with a deduplicated "date" column, list of (item, kind)).
    """
    dates = []
    typed_items = []
    kinds_to_scan = [kind] if kind else ["model", "dataset", "space"]
    for repo_kind in kinds_to_scan:
        try:
            repos = cached_list_items(username, repo_kind)
            typed_items.extend((repo, repo_kind) for repo in repos)
            ids = [repo.id for repo in repos]
            # Fetch in small batches so at most 5 repos hit the API at once.
            batch = 5
            for start in range(0, len(ids), batch):
                group = ids[start:start + batch]
                with ThreadPoolExecutor(max_workers=min(5, len(group))) as pool:
                    pending = {
                        pool.submit(fetch_commits_for_repo, rid, repo_kind, username, selected_year): rid
                        for rid in group
                    }
                    for done in as_completed(pending):
                        found, _ = done.result()
                        if found:  # skip repos that contributed nothing
                            dates.extend(found)
        except Exception as exc:
            st.warning(f"Error fetching {repo_kind}s for {username}: {str(exc)}")
    # Collapse to one row per distinct date.
    frame = pd.DataFrame(dates, columns=["date"])
    if not frame.empty:
        frame = frame.drop_duplicates()
    return frame, typed_items
# Calendar heatmap function (optimized)
def make_calendar_heatmap(df, title, year):
    """Render a GitHub-style calendar heatmap of daily commit counts.

    Parameters:
        df: DataFrame with a "date" column, one row per commit date (may be empty).
        title: heading drawn above the heatmap.
        year: calendar year the grid covers (Jan 1 - Dec 31).
    """
    if df.empty:
        st.info(f"No {title.lower()} found for {year}.")
        return
    # BUGFIX: operate on a copy — the original added a "count" column to the
    # caller's DataFrame in place (argument mutation).
    df = df.copy()
    df["count"] = 1
    df = df.groupby("date", as_index=False).sum()
    df["date"] = pd.to_datetime(df["date"])
    # Full-year date range so days with no commits still render as cells.
    start = pd.Timestamp(f"{year}-01-01")
    end = pd.Timestamp(f"{year}-12-31")
    all_days = pd.date_range(start=start, end=end)
    # Left-join the commit counts onto the full calendar; missing days -> 0.
    heatmap_data = pd.DataFrame({"date": all_days, "count": 0})
    heatmap_data = heatmap_data.merge(df, on="date", how="left", suffixes=("", "_y"))
    heatmap_data["count"] = heatmap_data["count_y"].fillna(0)
    heatmap_data = heatmap_data.drop("count_y", axis=1)
    # Grid coordinates: row = day of week (Mon=0), column = week since Jan 1.
    heatmap_data["dow"] = heatmap_data["date"].dt.dayofweek
    heatmap_data["week"] = (heatmap_data["date"] - start).dt.days // 7
    pivot = heatmap_data.pivot(index="dow", columns="week", values="count").fillna(0)
    # Month tick labels placed at the week containing the 1st of each month.
    month_labels = pd.date_range(start, end, freq="MS").strftime("%b")
    month_positions = pd.date_range(start, end, freq="MS").map(lambda x: (x - start).days // 7)
    # GitHub-style green palette with fixed count-bucket boundaries:
    # 0, 1-2, 3-10, 11-30, 31+.
    from matplotlib.colors import ListedColormap, BoundaryNorm
    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']
    bounds = [0, 1, 3, 11, 31, float('inf')]
    cmap = ListedColormap(colors)
    norm = BoundaryNorm(bounds, cmap.N)
    fig, ax = plt.subplots(figsize=(12, 1.2))
    # Integer counts so BoundaryNorm maps each cell into an exact bucket.
    pivot_int = pivot.astype(int)
    sns.heatmap(pivot_int, ax=ax, cmap=cmap, norm=norm, linewidths=0.5, linecolor="white",
                square=True, cbar=False, yticklabels=["M", "T", "W", "T", "F", "S", "S"])
    ax.set_title(f"{title}", fontsize=12, pad=10)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks(month_positions)
    ax.set_xticklabels(month_labels, fontsize=8)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
    st.pyplot(fig)
# Function to fetch follower data for a user
@lru_cache(maxsize=100)
def fetch_follower_data(username):
    """Fetch follower history for a user, or synthesize demo data.

    The followers-history endpoint is a placeholder; when it does not
    answer 200, twelve months of random monotonically-growing counts are
    generated instead so the chart still renders. Returns [] on error.
    """
    try:
        # Make API request to get follower history
        # Note: This is a placeholder. Actual API endpoint may differ
        url = f"https://huggingface.co/api/users/{username}/followers-history"
        resp = requests.get(url, timeout=30)
        if resp.status_code == 200:
            return resp.json()
        # Simulated fallback data, for demonstration only.
        import random
        from dateutil.relativedelta import relativedelta
        today = datetime.now()
        follower_total = random.randint(10, 100)
        history = []
        for month_idx in range(12):
            point_date = today - relativedelta(months=11 - month_idx)
            follower_total += random.randint(0, 10)
            history.append({
                "date": point_date.strftime("%Y-%m-%d"),
                "followers": follower_total
            })
        return history
    except Exception as e:
        st.error(f"Error fetching follower data: {str(e)}")
        return []
# Function to render follower chart
def render_follower_chart(username):
    """Plot the user's follower count over time as a line chart."""
    history = fetch_follower_data(username)
    if not history:
        st.info(f"No follower data available for {username}")
        return
    # Split the records into x (dates) and y (counts) series.
    xs = [point["date"] for point in history]
    ys = [point["followers"] for point in history]
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(xs, ys, marker='o', linestyle='-', color='#60A5FA')
    # Titles, axis labels and a light dashed grid.
    ax.set_title(f"Follower Evolution for {username}", fontsize=16)
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Followers", fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.7)
    # Angle the date labels so they don't overlap.
    plt.xticks(rotation=45)
    plt.tight_layout()
    st.pyplot(fig)
# Fetch trending accounts with a loading spinner (do this once at the beginning)
# get_trending_accounts is lru_cache'd, so Streamlit reruns reuse the result.
with st.spinner("Loading trending accounts..."):
    trending_accounts, top_owners_spaces, top_owners_models = get_trending_accounts(limit=100)
# Sidebar
# Builds the contributor-selection UI: ranking tables/charts in two tabs,
# then the account picker, year picker, and display-type checkboxes.
# Widget values (username, selected_year, show_*) are read by the main body.
with st.sidebar:
    st.title("👤 Contributor")
    # Create tabs for Spaces and Models rankings - ONLY SHOWING FIRST TWO TABS
    tab1, tab2 = st.tabs([
        "Top 100 Overall Contributors",
        "Top 100 by Spaces & Models"
    ])
    with tab1:
        # Show combined trending accounts list
        st.subheader("🔥 Top 100 Overall Contributors")
        # Display the top 100 accounts list
        st.markdown("### Combined Contributors Ranking")
        # Create a data frame for the table
        if trending_accounts:
            # Create a mapping from username to Spaces and Models rankings
            spaces_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_spaces)}
            models_rank = {owner: idx+1 for idx, (owner, _) in enumerate(top_owners_models)}
            # Create the overall ranking dataframe
            overall_data = []
            for idx, username in enumerate(trending_accounts[:100]):
                # Use strings for all rankings to avoid type conversion issues
                spaces_position = str(spaces_rank.get(username, "-"))
                models_position = str(models_rank.get(username, "-"))
                overall_data.append([username, spaces_position, models_position])
            ranking_data_overall = pd.DataFrame(
                overall_data,
                columns=["Contributor", "Spaces Rank", "Models Rank"]
            )
            ranking_data_overall.index = ranking_data_overall.index + 1  # Start index from 1 for ranking
            st.dataframe(
                ranking_data_overall,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Spaces Rank": st.column_config.TextColumn("Spaces Rank (top 100)"),
                    "Models Rank": st.column_config.TextColumn("Models Rank (top 100)")
                },
                use_container_width=True,
                hide_index=False
            )
    with tab2:
        # Show trending accounts list by Spaces
        st.subheader("🚀 Top 100 by Spaces & Models")
        # Display the top 100 accounts list
        st.markdown("### Spaces Contributors Ranking")
        # Create a data frame for the table
        if top_owners_spaces:
            ranking_data_spaces = pd.DataFrame(top_owners_spaces[:100], columns=["Contributor", "Spaces Count"])
            ranking_data_spaces.index = ranking_data_spaces.index + 1  # Start index from 1 for ranking
            st.dataframe(
                ranking_data_spaces,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Spaces Count": st.column_config.NumberColumn("Spaces Count (based on top 500 spaces)", format="%d")
                },
                use_container_width=True,
                hide_index=False
            )
        # Add stats expander with visualization
        with st.expander("View Top 30 Spaces Contributors Chart"):
            # Create a bar chart for top 30 contributors
            if top_owners_spaces:
                chart_data = pd.DataFrame(top_owners_spaces[:30], columns=["Owner", "Spaces Count"])
                fig, ax = plt.subplots(figsize=(10, 8))
                bars = ax.barh(chart_data["Owner"], chart_data["Spaces Count"])
                # Add color gradient to bars
                for i, bar in enumerate(bars):
                    bar.set_color(plt.cm.viridis(i/len(bars)))
                ax.set_title("Top 30 Contributors by Number of Spaces")
                ax.set_xlabel("Number of Spaces")
                plt.tight_layout()
                st.pyplot(fig)
        # Display the top 100 Models accounts list (ADDED SECTION)
        st.markdown("### Models Contributors Ranking")
        # Create a data frame for the Models table
        if top_owners_models:
            ranking_data_models = pd.DataFrame(top_owners_models[:100], columns=["Contributor", "Models Count"])
            ranking_data_models.index = ranking_data_models.index + 1  # Start index from 1 for ranking
            st.dataframe(
                ranking_data_models,
                column_config={
                    "Contributor": st.column_config.TextColumn("Contributor"),
                    "Models Count": st.column_config.NumberColumn("Models Count (based on top 500 models)", format="%d")
                },
                use_container_width=True,
                hide_index=False
            )
        # Add stats expander with visualization for Models (ADDED SECTION)
        with st.expander("View Top 30 Models Contributors Chart"):
            # Create a bar chart for top 30 models contributors
            if top_owners_models:
                chart_data = pd.DataFrame(top_owners_models[:30], columns=["Owner", "Models Count"])
                fig, ax = plt.subplots(figsize=(10, 8))
                bars = ax.barh(chart_data["Owner"], chart_data["Models Count"])
                # Add color gradient to bars
                for i, bar in enumerate(bars):
                    bar.set_color(plt.cm.plasma(i/len(bars)))  # Using a different colormap for distinction
                ax.set_title("Top 30 Contributors by Number of Models")
                ax.set_xlabel("Number of Models")
                plt.tight_layout()
                st.pyplot(fig)
    # Display trending accounts selection dropdown
    st.subheader("Select Contributor")
    selected_trending = st.selectbox(
        "Select trending account",
        options=trending_accounts[:100],  # Limit to top 100
        index=0 if trending_accounts else None,
        key="trending_selectbox"
    )
    # Custom account input option
    st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
    custom = st.text_input("Enter username/org", label_visibility="collapsed")
    # Set username based on selection or custom input
    # (a typed name always wins over the dropdown selection)
    if custom.strip():
        username = custom.strip()
    elif selected_trending:
        username = selected_trending
    else:
        username = "facebook"  # Default fallback
    # Year selection (current year back through 2018)
    st.subheader("🗓️ Time Period")
    year_options = list(range(datetime.now().year, 2017, -1))
    selected_year = st.selectbox("Select Year", options=year_options)
    # Additional options for customization
    st.subheader("⚙️ Display Options")
    show_models = st.checkbox("Show Models", value=True)
    show_datasets = st.checkbox("Show Datasets", value=True)
    show_spaces = st.checkbox("Show Spaces", value=True)
# Main Content
# Renders the selected user's activity: ranking banners, per-kind commit
# fetching with progress bars, calendar heatmaps, and the follower chart.
st.title("🤗 Hugging Face Contributions")
if username:
    with st.spinner(f"Fetching commit data for {username}..."):
        # Display contributor rank if in top 30
        if username in trending_accounts[:100]:
            rank = trending_accounts.index(username) + 1
            st.success(f"🏆 {username} is ranked #{rank} in the top trending contributors!")
            # Find user in spaces ranking
            spaces_rank = None
            for i, (owner, count) in enumerate(top_owners_spaces):
                if owner == username:
                    spaces_rank = i+1
                    st.info(f"🚀 Spaces Ranking: #{spaces_rank} with {count} spaces")
                    break
            # Find user in models ranking
            models_rank = None
            for i, (owner, count) in enumerate(top_owners_models):
                if owner == username:
                    models_rank = i+1
                    st.info(f"🧠 Models Ranking: #{models_rank} with {count} models")
                    break
            # Display combined ranking info
            combined_info = []
            if spaces_rank and spaces_rank <= 100:
                combined_info.append(f"Spaces: #{spaces_rank}")
            if models_rank and models_rank <= 100:
                combined_info.append(f"Models: #{models_rank}")
            if combined_info:
                st.success(f"Combined Rankings (Top 100): {', '.join(combined_info)}")
        # Create a dictionary to store commits by type
        commits_by_type = {}
        commit_counts_by_type = {}
        # Determine which types to fetch based on checkboxes
        types_to_fetch = []
        if show_models:
            types_to_fetch.append("model")
        if show_datasets:
            types_to_fetch.append("dataset")
        if show_spaces:
            types_to_fetch.append("space")
        if not types_to_fetch:
            st.warning("Please select at least one content type to display (Models, Datasets, or Spaces)")
            st.stop()
        # Fetch commits for each selected type
        for kind in types_to_fetch:
            try:
                items = cached_list_items(username, kind)
                repo_ids = [item.id for item in items]
                st.info(f"Found {len(repo_ids)} {kind}s for {username}")
                # Process repos in chunks (at most 5 concurrent API fetches)
                chunk_size = 5
                total_commits = 0
                all_commit_dates = []
                progress_bar = st.progress(0)
                for i in range(0, len(repo_ids), chunk_size):
                    chunk = repo_ids[i:i + chunk_size]
                    with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
                        future_to_repo = {
                            executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year): repo_id
                            for repo_id in chunk
                        }
                        for future in as_completed(future_to_repo):
                            repo_commits, repo_count = future.result()
                            if repo_commits:
                                all_commit_dates.extend(repo_commits)
                                total_commits += repo_count
                    # Update progress
                    progress = min(1.0, (i + len(chunk)) / max(1, len(repo_ids)))
                    progress_bar.progress(progress)
                # Complete progress
                progress_bar.progress(1.0)
                commits_by_type[kind] = all_commit_dates
                commit_counts_by_type[kind] = total_commits
            except Exception as e:
                # Best-effort: record an empty result for this kind and move on.
                st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
                commits_by_type[kind] = []
                commit_counts_by_type[kind] = 0
    # Calculate total commits across all types
    total_commits = sum(commit_counts_by_type.values())
    st.subheader(f"{username}'s Activity in {selected_year}")
    # Profile information
    profile_col1, profile_col2 = st.columns([1, 3])
    with profile_col1:
        # Try to get avatar
        try:
            avatar_url = f"https://huggingface.co/avatars/{username}"
            st.image(avatar_url, width=150)
        except:
            st.info("No profile image available")
    with profile_col2:
        st.metric("Total Commits", total_commits)
        # Show contributor rank if in top owners
        for owner, count in top_owners_spaces:
            if owner.lower() == username.lower():
                st.metric("Spaces Count", count)
                break
        st.markdown(f"[View Profile on Hugging Face](https://huggingface.co/{username})")
    # Create DataFrame for all commits
    all_commits = []
    for commits in commits_by_type.values():
        all_commits.extend(commits)
    all_df = pd.DataFrame(all_commits, columns=["date"])
    if not all_df.empty:
        all_df = all_df.drop_duplicates()  # Remove any duplicate dates
    make_calendar_heatmap(all_df, "All Commits", selected_year)
    # Add followers chart section
    st.subheader(f"👥 Follower Evolution for {username}")
    render_follower_chart(username)
    # Metrics and heatmaps for each selected type (one column per kind)
    cols = st.columns(len(types_to_fetch)) if types_to_fetch else st.columns(1)
    for i, (kind, emoji, label) in enumerate([
        ("model", "🧠", "Models"),
        ("dataset", "📦", "Datasets"),
        ("space", "🚀", "Spaces")
    ]):
        if kind in types_to_fetch:
            with cols[types_to_fetch.index(kind)]:
                try:
                    total = len(cached_list_items(username, kind))
                    commits = commits_by_type.get(kind, [])
                    commit_count = commit_counts_by_type.get(kind, 0)
                    df_kind = pd.DataFrame(commits, columns=["date"])
                    if not df_kind.empty:
                        df_kind = df_kind.drop_duplicates()  # Remove any duplicate dates
                    st.metric(f"{emoji} {label}", total)
                    st.metric(f"Commits in {selected_year}", commit_count)
                    make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
                except Exception as e:
                    # Fall back to zeroed metrics and an empty heatmap.
                    st.warning(f"Error processing {label}: {str(e)}")
                    st.metric(f"{emoji} {label}", 0)
                    st.metric(f"Commits in {selected_year}", 0)
                    make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
else:
    st.info("Please select an account from the sidebar to view contributions.")