# pylint: disable=no-member import pandas as pd import gradio as gr import plotly.express as px import plotly.graph_objects as go import numpy as np s3_aggregation_df = pd.read_parquet( "hf://datasets/xet-team/cas-pops-analysis-data/aggregated_s3_logs.parquet" ) aws_regions = pd.read_parquet( "hf://datasets/xet-team/cas-pops-analysis-data/regions.parquet" ) sum_request_count = s3_aggregation_df["request_count"].sum() sum_object_size = s3_aggregation_df["object_size"].sum() n_unique_countries = s3_aggregation_df["country_code"].nunique() unique_regions = list(s3_aggregation_df["region"].unique()) unique_countries = list(s3_aggregation_df["country_name"].unique()) all_regions_countries = unique_regions + unique_countries agg_by_region = ( s3_aggregation_df.groupby(["region"])[["object_size", "request_count"]] .sum() .reset_index() ) agg_by_region["object_size_pct"] = ( agg_by_region["object_size"] / agg_by_region["object_size"].sum() ) agg_by_region["request_count_pct"] = ( agg_by_region["request_count"] / agg_by_region["request_count"].sum() ) agg_by_region["object_size_pct_fmt"] = agg_by_region["object_size_pct"].apply( lambda x: f"{100*x:.2f}" ) agg_by_region["request_pct_fmt"] = agg_by_region["request_count_pct"].apply( lambda x: f"{100*x:.2f}" ) def remap_radio_value(value): return "object_size" if value == "Upload size" else "request_count" def pareto_chart(sort_by, global_filter="All"): sort_by = remap_radio_value(sort_by) title = sort_by.replace("_", " ").title() _df = ( s3_aggregation_df.groupby(["country_code", "country_name", "region"])[sort_by] .sum() .reset_index() ) if global_filter != "All": if global_filter in unique_regions: _df = _df[_df["region"] == global_filter] _df = _df.sort_values(by=sort_by, ascending=False) _df["cumulative_percentage"] = _df[sort_by].cumsum() / _df[sort_by].sum() * 100 _df = _df.head(20) if global_filter != "All": _df = _df.head(10) fig = go.Figure() fig.add_trace( go.Bar( x=_df["country_code"], y=_df[sort_by], name=title, hovertext=_df["country_name"], ) ) fig.add_trace( go.Scatter( x=_df["country_code"], y=_df["cumulative_percentage"], yaxis="y2", name="Cumulative Percentage", mode="lines+markers", ) ) region = global_filter + " region" if global_filter != "All" else "All Regions" # Update layout if title == "Object Size": title = "Uploaded Data (TB)" else: title = "Requests" fig.update_layout( title=f"Top {_df.shape[0]} Countries by Total {title} in {region}", xaxis_title="Country ISO Code", yaxis_title=title, yaxis2=dict(title="Cumulative Percentage", overlaying="y", side="right"), xaxis=dict(range=[-0.5, len(_df["country_code"]) - 0.5]), legend=dict(orientation="h"), ) fig.add_hline( y=80, line_dash="dot", annotation_text="", annotation_position="top right", yref="y2", ) return fig def manually_animated_choropleth_filter(hour, df_column, global_filter): df_column = remap_radio_value(df_column) hour = hour - 1 if global_filter != "All": min_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][ df_column ].min() max_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][ df_column ].max() else: min_range = s3_aggregation_df[df_column].min() max_range = s3_aggregation_df[df_column].max() _df = s3_aggregation_df[s3_aggregation_df["hour"] == hour] if global_filter != "All": if global_filter in unique_regions: _df = _df[_df["region"] == global_filter] title = df_column.replace("_", " ").title() fig = px.choropleth( data_frame=_df, locations="country_code", color=df_column, color_continuous_scale=px.colors.sequential.Plasma, projection="natural earth", height=800, hover_name="country_name", hover_data=df_column, range_color=[min_range, max_range], ) if title == "Object Size": title = "Global Distribution of Uploaded Data (TB)" else: title = "Global Distribution of Requests" fig.update_layout( title_text=title, geo=dict(showframe=False, showcoastlines=False), margin=dict(l=0, r=0, t=0, b=0), ) return fig with gr.Blocks(theme="citrus", fill_width=False) as demo: gr.Markdown( """ # A Global Analysis of Hub Uploads """ ) gr.Markdown( "The [Xet team's](https://huggingface.co/xet-team) backend uses a [content-addressable store (CAS)](https://en.wikipedia.org/wiki/Content-addressable_storage) for efficient deduplication and optimized data storage, making it ideal for Hugging Face Hub's scale. As we re-architect uploads and downloads on the Hub, we are inserting a CAS as the first stop for content distribution. To decide where to deploy our CAS [points of presence](https://docs.aws.amazon.com/whitepapers/latest/aws-fault-isolation-boundaries/points-of-presence.html), we analyzed a 24 hour window of global uploads to the Hub from October 11th, 2024." ) gr.HTML( f"