Spaces:
Running
Running
File size: 10,813 Bytes
b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 aa57274 b020462 a000445 b020462 a000445 b020462 aa57274 b020462 64a090d b020462 aa57274 b020462 aa57274 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
import streamlit as st
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Set page config (must be the first Streamlit call in the script)
st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer", page_icon="π")
# Custom CSS (keep existing styles)
# Injected as a raw <style> block: constrains overall app width, styles the
# call-to-action card (.cta-*) and the primary button incl. hover state.
st.markdown("""
<style>
.stApp {
max-width: 1200px;
margin: 0 auto;
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.cta-container {
background-color: #f0f8ff;
border-radius: 10px;
padding: 20px;
margin-top: 30px;
margin-bottom: 30px;
border: 2px solid #1e90ff;
text-align: center;
}
.cta-title {
color: #1e90ff;
font-size: 24px;
font-weight: bold;
margin-bottom: 10px;
}
.cta-description {
color: #333;
font-size: 16px;
margin-bottom: 20px;
}
.stButton > button {
background-color: #1e90ff;
color: white;
font-size: 18px;
font-weight: bold;
padding: 10px 24px;
border-radius: 5px;
border: none;
transition: all 0.3s ease;
}
.stButton > button:hover {
background-color: #0066cc;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
</style>
""", unsafe_allow_html=True)
# Title and description
st.title("π Macrocosmos HF Dataset Explorer")
st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.")
# Function to load dataset information
@st.cache_data
def load_datasets():
    """Return the static table of mined datasets as a list of row dicts.

    Each record has three keys consumed by the UI below:
    "Source", "DataSet repo link", and "Number of rows" (a free-form
    human-readable string, e.g. "6 MLN" or "44,815,182").
    """
    hf_prefix = "https://huggingface.co/datasets/"
    reddit_repos = [
        ("PlanAPlanB/reddit_dataset_69", "6 MLN"),
        ("mgrtsv/reddit_dataset_229", "44,815,182"),
        ("wenknow/reddit_dataset_88", "253,506,882"),
        ("PlanAPlanB/reddit_dataset_218", "562,042"),
        ("PlanAPlanB/reddit_dataset_13", "18,931,749"),
        ("chris241/reddit_dataset_219", "227,599,340"),
        ("icedwind/reddit_dataset_112", "301,588,714"),
        ("dataverse-scraping/reddit_dataset_71", "259,924,884"),
        ("wenknow/reddit_dataset_209", "209,698,975"),
        ("arrmlet/reddit_dataset_218", "7,064,613"),
        ("dataverse-scraping/reddit_dataset_192", "249 MLN"),
        ("icedwind/reddit_dataset_226", "303 MLN"),
        ("arrmlet/reddit_dataset_123", "1.12 MLN"),
        ("chris241/reddit_dataset_75", "132 MLN"),
        ("wenknow/reddit_dataset_242", "130 MLN"),
        ("mgrtsv/reddit_dataset_231", "31.2 MLN"),
        ("PlanAPlanB/reddit_dataset_9", "26.9 MLN"),
    ]
    x_repos = [
        ("littleGuagua/x_dataset_0", "331,611,777"),
        ("suul999922/x_dataset_71", "8,998,828"),
        ("thayallans/x_dataset_28", "178,669"),
        ("apidojo/x_dataset_242", "499,067"),
        ("icedwind/x_dataset_112", "331,500,777"),
        ("arrmlet/x_dataset_218", "1,753,878"),
        ("SAVE0x0/x_dataset_191", "92,588"),
        ("johnny8188/x_dataset_187", "52,762"),
        ("icedwind/x_dataset_19", "332 MLN"),
        ("wenknow/x_dataset", "9.9 K"),
        # NOTE(review): repo path says "reddit_dataset" but it is listed
        # under Source "X" — confirm this entry is intentional.
        ("arrmlet/reddit_dataset_123", "89 K"),
    ]

    def as_records(source, repos):
        # Expand (repo_path, row_count) pairs into the dicts the table expects.
        return [
            {"Source": source, "DataSet repo link": hf_prefix + path, "Number of rows": rows}
            for path, rows in repos
        ]

    return as_records("Reddit", reddit_repos) + as_records("X", x_repos)
# Load datasets
datasets = load_datasets()
df = pd.DataFrame(datasets)

def _parse_row_count(value):
    """Parse a human-readable row count into a float number of rows.

    Handles the three formats present in the table: plain comma-separated
    integers ("44,815,182"), millions ("6 MLN", "1.12 MLN"), and
    thousands ("9.9 K"). Unknown/absent units fall back to raw counts.
    """
    parts = str(value).split()
    number = float(parts[0].replace(',', ''))
    unit = parts[1].upper() if len(parts) > 1 else ''
    return number * {'MLN': 1e6, 'K': 1e3}.get(unit, 1.0)

# Display statistics
col1, col2, col3 = st.columns(3)
with col1:
    # Bug fix: the previous code dropped the unit suffix, so "6 MLN"
    # contributed 6 rows while "44,815,182" contributed ~44.8M, and the
    # mixed-unit total was divided by 1,000 yet labeled "B" (billions).
    # Normalize every entry to raw rows, then convert to billions.
    total_rows = sum(_parse_row_count(rows) for rows in df['Number of rows'])
    st.metric("Total Rows", f"{total_rows / 1e9:.2f}B")
with col2:
    st.metric("Total Datasets", len(df))
# col3 intentionally left empty to balance the three-column layout.
# Display the dataset table
# LinkColumn renders the repo URL as a clickable link; the index is hidden
# because row numbers carry no meaning for this static listing.
st.subheader("Dataset Overview")
st.dataframe(
    df,
    column_config={
        "Source": st.column_config.TextColumn("Source"),
        "DataSet repo link": st.column_config.LinkColumn("Repository"),
        "Number of rows": st.column_config.TextColumn("Rows"),
    },
    hide_index=True,
    use_container_width=True
)
# Call-to-action section with styled button
# Rendered as raw HTML so the .cta-* CSS classes injected above apply.
# Keep the HTML lines flush-left: leading indentation inside a Markdown
# string would be treated as a code block.
st.markdown("""
<div class="cta-container">
<div class="cta-title">π Explore Dataset Insights</div>
<div class="cta-description">
Dive deep into the rich analytics of our dataset. Uncover trends, distributions, and key metrics that will enhance your understanding and guide your research.
</div>
</div>
""", unsafe_allow_html=True)
# Centered button
# A 1:2:1 column split centers the button; it fills the middle column.
col1, col2, col3 = st.columns([1,2,1])
with col2:
    show_analysis = st.button("Reveal Dataset Analysis", use_container_width=True)
# Display dataset analysis if the button was clicked
if show_analysis:
    # Load analysis results
    @st.cache_data
    def load_analysis_results():
        """Load precomputed analysis metrics from the bundled JSON file.

        Expects `analysis_results.json` in the working directory; raises
        FileNotFoundError if it is missing (no fallback is provided).
        """
        with open('analysis_results.json', 'r') as f:
            return json.load(f)
    analysis_results = load_analysis_results()
    st.subheader("Analysis of a Sample Reddit Dataset")
    st.write("This analysis is based on a sample from one of the Reddit datasets.")
    # Display Dataset Structure
    # NOTE(review): all keys below ('structure', 'communities', etc.) come
    # from the external JSON schema — verify against its producer.
    st.subheader("Dataset Structure")
    structure = analysis_results['structure']
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Partitions", structure['total_partitions'])
    col2.metric("Total Rows", f"{structure['total_rows']:,}")
    col3.metric("Number of Columns", len(structure['columns']))
    col4.metric("Date Range", f"{structure['date_range'][0]} to {structure['date_range'][1]}")
    with st.expander("Show Columns"):
        st.write(", ".join(structure['columns']))
    # Display Top Communities
    st.subheader("Top Communities")
    communities_df = pd.DataFrame(analysis_results['communities'])
    fig = go.Figure(data=[go.Bar(
        x=communities_df['communityName'],
        y=communities_df['count'],
        # f'{x:.2%}' multiplies by 100 — assumes 'percentage' is a
        # fraction in [0, 1]; TODO confirm against the JSON producer.
        text=communities_df['percentage'].apply(lambda x: f'{x:.2%}'),
        textposition='auto',
        marker_color='#1e88e5'
    )])
    fig.update_layout(title_text='Top Communities Distribution')
    fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
    st.plotly_chart(fig, use_container_width=True)
    # Display Time Distribution
    st.subheader("Time Distribution")
    time_df = pd.DataFrame(analysis_results['time_distribution'])
    # Parse date strings so the x-axis is a proper time axis.
    time_df['date'] = pd.to_datetime(time_df['date'])
    fig = go.Figure(data=[go.Scatter(x=time_df['date'], y=time_df['count'], mode='lines+markers')])
    fig.update_layout(title_text='Posts Over Time')
    st.plotly_chart(fig, use_container_width=True)
    # Display Sentiment Distribution
    st.subheader("Sentiment Distribution")
    sentiment_df = pd.DataFrame(analysis_results['sentiment_distribution'])
    fig = go.Figure(data=[go.Pie(labels=sentiment_df['sentiment'], values=sentiment_df['count'], textinfo='percent+label')])
    fig.update_layout(title_text='Sentiment Distribution')
    # Colors pair with slices by position — assumes rows arrive ordered
    # positive, neutral, negative (green/amber/red); TODO confirm.
    fig.update_traces(marker=dict(colors=['#4CAF50', '#FFC107', '#F44336']))
    st.plotly_chart(fig, use_container_width=True)
    # Display Data Type Distribution
    st.subheader("Data Type Distribution")
    data_type_df = pd.DataFrame(analysis_results['data_type_distribution'])
    fig = go.Figure(data=[go.Pie(labels=data_type_df['dataType'], values=data_type_df['count'], textinfo='percent+label')])
    fig.update_layout(title_text='Data Type Distribution')
    # Same positional color pairing caveat as the sentiment pie above.
    fig.update_traces(marker=dict(colors=['#2196F3', '#FF9800']))
    st.plotly_chart(fig, use_container_width=True)
    # Display Top Topics
    st.subheader("Top Topics")
    topics_df = pd.DataFrame(analysis_results['top_topics'])
    st.dataframe(topics_df, use_container_width=True)
    # Display Average Text Length
    st.metric("Average Text Length", f"{analysis_results['avg_text_length']:.2f} characters")
# Add instructions for using the datasets
st.subheader("How to Use These Datasets")
# Copy-paste snippet shown to the user with Python syntax highlighting.
code = '''
from datasets import load_dataset
dataset = load_dataset("username/dataset_name")
'''
st.code(code, language='python')
st.markdown("""
1. Click on the dataset link to visit its Hugging Face page.
2. On the dataset page, you'll find information about the dataset's content, structure, and usage.
3. Use the code above to load a dataset, replacing `"username/dataset_name"` with the actual dataset identifier.
4. For these large datasets, consider using streaming or loading specific subsets to manage memory usage.
5. Always check the dataset's license and usage restrictions before incorporating it into your project.
""")
# Footer
st.markdown("---")
st.markdown("Created by Macrocosmos with β€οΈ")