Gyaneshere committed on
Commit c5420f7 · verified · 1 Parent(s): bfe26bd

Create app.py

Files changed (1)
  1. app.py +273 -0
app.py ADDED
@@ -0,0 +1,273 @@
+ import streamlit as st
+ from huggingface_hub import HfApi
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from datetime import datetime
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from functools import lru_cache
+ import time
+
+ st.set_page_config(page_title="HF Contributions", layout="wide")
+ api = HfApi()
+
+
+ # Cache for API responses
+ @lru_cache(maxsize=1000)
+ def cached_repo_info(repo_id, repo_type):
+     return api.repo_info(repo_id=repo_id, repo_type=repo_type)
+
+
+ @lru_cache(maxsize=1000)
+ def cached_list_commits(repo_id, repo_type):
+     return list(api.list_repo_commits(repo_id=repo_id, repo_type=repo_type))
+
+
+ @lru_cache(maxsize=100)
+ def cached_list_items(username, kind):
+     if kind == "model":
+         return list(api.list_models(author=username))
+     elif kind == "dataset":
+         return list(api.list_datasets(author=username))
+     elif kind == "space":
+         return list(api.list_spaces(author=username))
+     return []
+
+
+ # Rate limiting
+ class RateLimiter:
+     def __init__(self, calls_per_second=10):
+         self.calls_per_second = calls_per_second
+         self.last_call = 0
+
+     def wait(self):
+         current_time = time.time()
+         time_since_last_call = current_time - self.last_call
+         if time_since_last_call < (1.0 / self.calls_per_second):
+             time.sleep((1.0 / self.calls_per_second) - time_since_last_call)
+         self.last_call = time.time()
+
+
+ rate_limiter = RateLimiter()
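+
+ # The lru_cache wrappers above serve repeated lookups within a single run from
+ # memory, while this shared limiter spaces fresh Hub calls out to roughly
+ # calls_per_second requests per second.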
+
+
+ # Function to fetch commits for a repository (optimized)
+ def fetch_commits_for_repo(repo_id, repo_type, username, selected_year):
+     try:
+         rate_limiter.wait()
+         # Skip private/gated repos upfront
+         repo_info = cached_repo_info(repo_id, repo_type)
+         if repo_info.private or (hasattr(repo_info, 'gated') and repo_info.gated):
+             return [], 0
+
+         # Get initial commit date
+         initial_commit_date = pd.to_datetime(repo_info.created_at).tz_localize(None).date()
+         commit_dates = []
+         commit_count = 0
+
+         # Add initial commit if it's from the selected year
+         if initial_commit_date.year == selected_year:
+             commit_dates.append(initial_commit_date)
+             commit_count += 1
+
+         # Get all commits
+         commits = cached_list_commits(repo_id, repo_type)
+         for commit in commits:
+             commit_date = pd.to_datetime(commit.created_at).tz_localize(None).date()
+             if commit_date.year == selected_year:
+                 commit_dates.append(commit_date)
+                 commit_count += 1
+
+         return commit_dates, commit_count
+     except Exception:
+         return [], 0
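+
+ # Both early-exit paths return ([], 0), so private, gated, or failing repositories
+ # can be skipped by callers without special-casing the result shape.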
+
+
+ # Function to get commit events for a user (optimized)
+ def get_commit_events(username, kind=None, selected_year=None):
+     commit_dates = []
+     items_with_type = []
+     kinds = [kind] if kind else ["model", "dataset", "space"]
+
+     for k in kinds:
+         try:
+             items = cached_list_items(username, k)
+             items_with_type.extend((item, k) for item in items)
+             repo_ids = [item.id for item in items]
+
+             # Optimized parallel fetch with chunking
+             chunk_size = 5  # Process 5 repos at a time
+             for i in range(0, len(repo_ids), chunk_size):
+                 chunk = repo_ids[i:i + chunk_size]
+                 with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
+                     future_to_repo = {
+                         executor.submit(fetch_commits_for_repo, repo_id, k, username, selected_year): repo_id
+                         for repo_id in chunk
+                     }
+                     for future in as_completed(future_to_repo):
+                         repo_commits, repo_count = future.result()
+                         if repo_commits:  # Only extend if we got commits
+                             commit_dates.extend(repo_commits)
+         except Exception as e:
+             st.warning(f"Error fetching {k}s for {username}: {str(e)}")
+
+     # Create DataFrame with all commits
+     df = pd.DataFrame(commit_dates, columns=["date"])
+     if not df.empty:
+         df = df.drop_duplicates()  # Remove any duplicate dates
+     return df, items_with_type
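+
+ # Note: get_commit_events is not called by the page code below; the main section
+ # repeats the chunked fetch inline so it can also track per-type commit counts.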
+
+
+ # Calendar heatmap function (optimized)
+ def make_calendar_heatmap(df, title, year):
+     if df.empty:
+         st.info(f"No {title.lower()} found for {year}.")
+         return
+
+     # Optimize DataFrame operations
+     df["count"] = 1
+     df = df.groupby("date", as_index=False).sum()
+     df["date"] = pd.to_datetime(df["date"])
+
+     # Create date range more efficiently
+     start = pd.Timestamp(f"{year}-01-01")
+     end = pd.Timestamp(f"{year}-12-31")
+     all_days = pd.date_range(start=start, end=end)
+
+     # Optimize DataFrame creation and merging
+     heatmap_data = pd.DataFrame({"date": all_days, "count": 0})
+     heatmap_data = heatmap_data.merge(df, on="date", how="left", suffixes=("", "_y"))
+     heatmap_data["count"] = heatmap_data["count_y"].fillna(0)
+     heatmap_data = heatmap_data.drop("count_y", axis=1)
+
+     # Calculate week and day of week more efficiently
+     heatmap_data["dow"] = heatmap_data["date"].dt.dayofweek
+     heatmap_data["week"] = (heatmap_data["date"] - start).dt.days // 7
+
+     # Create pivot table more efficiently
+     pivot = heatmap_data.pivot(index="dow", columns="week", values="count").fillna(0)
+
+     # Optimize month labels calculation
+     month_labels = pd.date_range(start, end, freq="MS").strftime("%b")
+     month_positions = pd.date_range(start, end, freq="MS").map(lambda x: (x - start).days // 7)
+
+     # Create custom colormap with specific boundaries
+     from matplotlib.colors import ListedColormap, BoundaryNorm
+     colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']  # GitHub-style green colors
+     bounds = [0, 1, 3, 11, 31, float('inf')]  # Boundaries for color transitions
+     cmap = ListedColormap(colors)
+     norm = BoundaryNorm(bounds, cmap.N)
+
+     # Create plot more efficiently
+     fig, ax = plt.subplots(figsize=(12, 1.2))
+
+     # Convert pivot values to integers to ensure proper color mapping
+     pivot_int = pivot.astype(int)
+
+     # Create heatmap with the custom colormap and boundary norm
+     sns.heatmap(pivot_int, ax=ax, cmap=cmap, norm=norm, linewidths=0.5, linecolor="white",
+                 square=True, cbar=False, yticklabels=["M", "T", "W", "T", "F", "S", "S"])
+
+     ax.set_title(f"{title}", fontsize=12, pad=10)
+     ax.set_xlabel("")
+     ax.set_ylabel("")
+     ax.set_xticks(month_positions)
+     ax.set_xticklabels(month_labels, fontsize=8)
+     ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=8)
+     st.pyplot(fig)
+
+
+ # Sidebar
+ with st.sidebar:
+     st.title("👤 Contributor")
+     username = st.selectbox(
+         "Select or type a username",
+         options=["ritvik77", "facebook", "google", "stabilityai", "Salesforce", "tiiuae", "bigscience"],
+         index=0
+     )
+     st.markdown("<div style='text-align: center; margin: 10px 0;'>OR</div>", unsafe_allow_html=True)
+     custom = st.text_input("", placeholder="Enter custom username/org")
+     if custom.strip():
+         username = custom.strip()
+     year_options = list(range(datetime.now().year, 2017, -1))
+     selected_year = st.selectbox("🗓️ Year", options=year_options)
+
+ # Main Content
+ st.title("🤗 Hugging Face Contributions")
+ if username:
+     with st.spinner("Fetching commit data..."):
+         # Create a dictionary to store commits by type
+         commits_by_type = {}
+         commit_counts_by_type = {}
+
+         # Fetch commits for each type separately
+         for kind in ["model", "dataset", "space"]:
+             try:
+                 items = cached_list_items(username, kind)
+                 repo_ids = [item.id for item in items]
+
+                 # Process repos in chunks
+                 chunk_size = 5
+                 total_commits = 0
+                 all_commit_dates = []
+
+                 for i in range(0, len(repo_ids), chunk_size):
+                     chunk = repo_ids[i:i + chunk_size]
+                     with ThreadPoolExecutor(max_workers=min(5, len(chunk))) as executor:
+                         future_to_repo = {
+                             executor.submit(fetch_commits_for_repo, repo_id, kind, username, selected_year): repo_id
+                             for repo_id in chunk
+                         }
+                         for future in as_completed(future_to_repo):
+                             repo_commits, repo_count = future.result()
+                             if repo_commits:
+                                 all_commit_dates.extend(repo_commits)
+                                 total_commits += repo_count
+
+                 commits_by_type[kind] = all_commit_dates
+                 commit_counts_by_type[kind] = total_commits
+
+             except Exception as e:
+                 st.warning(f"Error fetching {kind}s for {username}: {str(e)}")
+                 commits_by_type[kind] = []
+                 commit_counts_by_type[kind] = 0
+
+         # Calculate total commits across all types
+         total_commits = sum(commit_counts_by_type.values())
+
+         st.subheader(f"{username}'s Activity in {selected_year}")
+         st.metric("Total Commits", total_commits)
+
+         # Create DataFrame for all commits
+         all_commits = []
+         for commits in commits_by_type.values():
+             all_commits.extend(commits)
+         all_df = pd.DataFrame(all_commits, columns=["date"])
+         if not all_df.empty:
+             all_df = all_df.drop_duplicates()  # Remove any duplicate dates
+
+         make_calendar_heatmap(all_df, "All Commits", selected_year)
+
+         # Metrics and heatmaps for each type
+         col1, col2, col3 = st.columns(3)
+         for col, kind, emoji, label in [
+             (col1, "model", "🧠", "Models"),
+             (col2, "dataset", "📦", "Datasets"),
+             (col3, "space", "🚀", "Spaces")
+         ]:
+             with col:
+                 try:
+                     total = len(cached_list_items(username, kind))
+                     commits = commits_by_type.get(kind, [])
+                     commit_count = commit_counts_by_type.get(kind, 0)
+                     df_kind = pd.DataFrame(commits, columns=["date"])
+                     if not df_kind.empty:
+                         df_kind = df_kind.drop_duplicates()  # Remove any duplicate dates
+                     st.metric(f"{emoji} {label}", total)
+                     st.metric(f"Commits in {selected_year}", commit_count)
+                     make_calendar_heatmap(df_kind, f"{label} Commits", selected_year)
+                 except Exception as e:
+                     st.warning(f"Error processing {label}: {str(e)}")
+                     st.metric(f"{emoji} {label}", 0)
+                     st.metric(f"Commits in {selected_year}", 0)
+                     make_calendar_heatmap(pd.DataFrame(), f"{label} Commits", selected_year)
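
Two short, self-contained sketches follow; neither is part of the committed file. The first illustrates how the BoundaryNorm configured in make_calendar_heatmap buckets daily commit counts into the five GitHub-style greens; it assumes only that matplotlib is installed.

    from matplotlib.colors import ListedColormap, BoundaryNorm

    colors = ['#ebedf0', '#9be9a8', '#40c463', '#30a14e', '#216e39']
    bounds = [0, 1, 3, 11, 31, float('inf')]
    norm = BoundaryNorm(bounds, ListedColormap(colors).N)

    # Daily counts map to color indices: 0 -> grey, 1-2 -> lightest green,
    # 3-10 -> light green, 11-30 -> mid green, 31 and up -> darkest green.
    print([int(i) for i in norm([0, 1, 2, 3, 10, 11, 30, 31, 100])])
    # [0, 1, 1, 2, 2, 3, 3, 4, 4]

The second mirrors the app's per-repository work outside Streamlit, assuming huggingface_hub and pandas are installed and the Hub is reachable; the username, year, and repo limit are placeholder values, not taken from the commit.

    import pandas as pd
    from huggingface_hub import HfApi

    api = HfApi()
    username, year = "bigscience", 2023  # placeholder values

    # List a user's models and count each one's commits in a given year,
    # the same per-repo work fetch_commits_for_repo performs in a thread pool.
    for model in list(api.list_models(author=username))[:3]:
        try:
            commits = api.list_repo_commits(repo_id=model.id, repo_type="model")
        except Exception:
            continue  # skip private/gated repos, as the app does
        dates = [pd.to_datetime(c.created_at).tz_localize(None).date() for c in commits]
        print(model.id, sum(d.year == year for d in dates), f"commits in {year}")

To view the page itself, the file would normally be launched with streamlit run app.py.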