# Source: Hugging Face Space "gaodrew/constellation" (commit 67dafee, file size: 12,064 bytes).
# The Space page reported "Runtime error" when this listing was captured.

import streamlit as st
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import math
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
import plotly.figure_factory as ff
from community import community_louvain
import networkx as nx
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from wordcloud import WordCloud
import plotly.graph_objects as go


def create_dendrogram(X, labels):
    """Build a left-oriented, single-linkage dendrogram figure.

    Args:
        X: Feature matrix with one row per observation. Either a SciPy
            sparse matrix (anything exposing ``toarray()``) or a dense
            array-like.
        labels: One label per row of ``X``, used to annotate the leaves.

    Returns:
        The Plotly figure produced by ``ff.create_dendrogram``.
    """
    observations = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    # ``ff.create_dendrogram`` expects the raw observations and performs the
    # distance + linkage computation itself. Passing a precomputed linkage
    # matrix would cluster the linkage rows instead of the data, so the
    # single-linkage choice is supplied through ``linkagefun``.
    return ff.create_dendrogram(
        observations,
        orientation='left',
        labels=labels,
        linkagefun=lambda distances: linkage(distances, "single"),
    )
    
@st.cache_data
def load_data():
    """Read the model table from ``HuggingFaceLLMsWithParamsAndReadmeLinks.csv``.

    Decorated with ``st.cache_data``.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    return pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")

# Load the model table through the cached loader so the CSV is not parsed
# again on every Streamlit rerun (each widget interaction re-executes the
# whole script).
df = load_data()

# Page header and citation.
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models. ArXiv.org; ArXiv. https://doi.org/10.48550/arXiv.2307.09793")

# User controls read by the clustering section below.
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
wordClouds = st.checkbox("Show word clouds?")

def create_downloads_vs_likes_scatter(dataframe):
    """Build a scatter plot of downloads (x) against likes (y).

    The input frame is not modified.

    Args:
        dataframe: DataFrame with 'downloads', 'likes' and 'model_name'
            columns. 'likes' may hold non-numeric values; those are coerced
            to NaN.

    Returns:
        A ``go.Figure`` with one marker per remaining row and a hover label
        showing the model name, downloads and likes.
    """
    # Coerce into a local Series rather than writing back into the caller's
    # frame, which may itself be a slice of a larger DataFrame.
    likes = pd.to_numeric(dataframe['likes'], errors='coerce')

    # Drop the single outlier recorded with exactly 14M likes. Rows whose
    # likes could not be parsed (NaN) compare unequal and are kept.
    keep = likes != 14000000
    dataframe_filtered = dataframe.loc[keep]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=dataframe_filtered['downloads'],
        y=likes[keep],
        mode='markers',
        marker=dict(color='blue', size=7, opacity=0.7),
        text=dataframe_filtered['model_name'],
        hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>",
    ))
    fig.update_layout(
        title='Downloads vs Likes',
        xaxis_title='Downloads',
        yaxis_title='Likes',
    )
    return fig

  
if st.button("Run Clustering"):
    # ------------------------------------------------------------------
    # Select the models to analyse: popular enough, named, one row per name.
    # An explicit copy is taken because a 'cluster' column is added below.
    # ------------------------------------------------------------------
    df_filtered = df[df['downloads'] > threshold]
    df_extra_filtered = (
        df_filtered
        .dropna(subset=['model_name'])
        .drop_duplicates(subset='model_name', keep='first')
        .copy()
    )
    model_names = df_extra_filtered['model_name'].tolist()
    num_models = len(model_names)

    # Every step below (linkage, clustering, graph layout) needs at least
    # two models; stop early with a message instead of a traceback.
    if num_models < 2:
        st.warning("Fewer than two models meet the download threshold. Lower the threshold and try again.")
        st.stop()

    # ------------------------------------------------------------------
    # Dendrogram over character n-gram TF-IDF features of the model names.
    # The same TF-IDF matrix is reused for the similarity graph further down.
    # ------------------------------------------------------------------
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 8)).fit_transform(model_names)

    def distfun(features):
        """Return pairwise cosine distances in condensed (upper-triangle) form.

        ``linkage`` interprets a 2-D input as observation vectors, not as a
        distance matrix, so the square matrix must be condensed first.
        """
        square = cosine_distances(features)
        return square[np.triu_indices_from(square, k=1)]

    def linkagefun(dist_array):
        """Return the single-linkage matrix for a condensed distance array."""
        return linkage(dist_array, "single")

    fig = ff.create_dendrogram(
        tfidf.toarray(),
        orientation='bottom',
        labels=model_names,
        distfun=distfun,
        linkagefun=linkagefun,
    )
    st.plotly_chart(fig, use_container_width=True)

    # ------------------------------------------------------------------
    # Agglomerative clustering on character n-gram counts.
    # ------------------------------------------------------------------
    ngram_counts = CountVectorizer(analyzer='char', ngram_range=(3, 6)).fit_transform(model_names)
    # Honour the user's requested cluster count, but never ask for more
    # clusters than there are models.
    n_clusters = min(int(numClusters), num_models)
    clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(ngram_counts.toarray())
    df_extra_filtered['cluster'] = clustering.labels_

    cluster_counts = df_extra_filtered['cluster'].value_counts()
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
    st.plotly_chart(fig)

    # ------------------------------------------------------------------
    # Similarity graph: one node per model (node id == row position in
    # df_extra_filtered), one edge per pair with cosine similarity > 0.2.
    # ------------------------------------------------------------------
    sim_matrix = cosine_similarity(tfidf)

    G = nx.Graph()
    for node, model_name in enumerate(model_names):
        G.add_node(node, label=model_name)

    # The strict upper triangle visits each unordered pair (i < j) once, in
    # the same row-major order as a nested i/j loop.
    rows, cols = np.nonzero(np.triu(sim_matrix, k=1) > 0.2)
    G.add_weighted_edges_from(
        (int(i), int(j), float(sim_matrix[i, j])) for i, j in zip(rows, cols)
    )

    # Node positions and community assignment (node -> community id).
    pos = nx.spring_layout(G)
    partition = community_louvain.best_partition(G)

    # Group nodes by community once instead of rescanning the partition
    # for every community.
    members = {}
    for node, community in partition.items():
        members.setdefault(community, []).append(node)
    community_sizes = {community: len(nodes) for community, nodes in members.items()}

    # Shared numeric colour range so a community's background disc and its
    # nodes map to the same point on the colour scale.
    color_min = 0
    color_max = max(max(members), 1)

    # Per-node plotting data, all in partition order.
    nodes = list(partition)
    node_rows = df_extra_filtered.iloc[nodes]
    x = [pos[node][0] for node in nodes]
    y = [pos[node][1] for node in nodes]
    node_communities = [partition[node] for node in nodes]
    labels = node_rows['model_name'].tolist()
    ranks = node_rows['rank'].tolist()
    downloads = node_rows['downloads'].tolist()
    likes = node_rows['likes'].tolist()
    params = [
        value if pd.notnull(value) else 'N/A'
        for value in node_rows['params_millions']
    ]

    fig = go.Figure()

    # Background disc at the centroid of every community with more than one
    # node, sized in proportion to the community.
    for community in sorted(members):
        community_nodes = members[community]
        if len(community_nodes) <= 1:
            continue
        centroid = np.mean([pos[node] for node in community_nodes], axis=0)
        fig.add_trace(go.Scatter(
            x=[centroid[0]], y=[centroid[1]],
            mode='markers',
            marker=dict(
                size=community_sizes[community] * 5,
                color=[community],
                cmin=color_min,
                cmax=color_max,
                opacity=0.1,
            ),
            hoverinfo='none',
            showlegend=False,
        ))

    # Nodes, coloured by their own community (one colour value per node).
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='markers',
        marker=dict(size=3, color=node_communities, cmin=color_min, cmax=color_max),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        )
    ))

    # Edges: line width is the edge weight normalised by the largest weight,
    # which is computed once rather than per edge.
    max_weight = max((weight for _, _, weight in G.edges(data='weight')), default=1.0)
    for source, target, weight in G.edges(data='weight'):
        fig.add_trace(go.Scatter(
            x=[pos[source][0], pos[target][0]],
            y=[pos[source][1], pos[target][1]],
            mode='lines',
            line=dict(width=weight / max_weight),
            hoverinfo='none'
        ))

    fig.update_layout(showlegend=False, hovermode='closest')
    st.plotly_chart(fig)

    # ------------------------------------------------------------------
    # Top 20 models by number of graph connections (node degree).
    # ------------------------------------------------------------------
    degrees = dict(G.degree())
    top_20_models = sorted(degrees.items(), key=lambda item: item[1], reverse=True)[:20]

    st.subheader("Top 20 Models by Number of Connections")
    for node, degree in top_20_models:
        st.write(f"{model_names[node]}: {degree} connections")

    # ------------------------------------------------------------------
    # Representative model per community: the member with the highest degree.
    # ------------------------------------------------------------------
    communities = sorted(members)
    df_reps = pd.DataFrame({
        'Community ID': communities,
        'Size': [community_sizes[community] for community in communities],
        'Representative Model': [
            model_names[max(members[community], key=degrees.get)]
            for community in communities
        ],
    })
    df_reps.sort_values(by='Size', ascending=False, inplace=True)

    st.subheader("Representative for each community, sorted by community size.")
    st.dataframe(df_reps)

    # ------------------------------------------------------------------
    # Optional word cloud per agglomerative cluster.
    # ------------------------------------------------------------------
    if wordClouds:
        for name, group in df_extra_filtered.groupby('cluster'):
            # All model names of the cluster joined into one text.
            text = ' '.join(group['model_name'])
            image = WordCloud().generate(text).to_image()
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')

    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)