vtrubamacrocosmos committed • Commit b020462
1 Parent(s): c3df010
initial commit
Files changed:
- app.py +102 -0
- macrocosmos-black.png +0 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,102 @@
import streamlit as st
import pandas as pd

# Set page config
st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer")

# Custom CSS
st.markdown("""
<style>
.stApp {
    max-width: 1200px;
    margin: 0 auto;
}
.st-emotion-cache-1r6slb0 {
    background-color: #f0f2f6;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
}
.st-emotion-cache-1wivap2 {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.company-logo {
    max-width: 200px;
    margin-bottom: 20px;
}
</style>
""", unsafe_allow_html=True)

# Company logo
st.image("macrocosmos-black.png", use_column_width=False, width=200)

# Title and description
st.title("🤗 Hugging Face Large Dataset Explorer")
st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.")

# Function to load dataset information
def load_datasets():
    return [
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332 MLN"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9.9 K"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89 K"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1.12 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31.2 MLN"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26.9 MLN"}
    ]

# Load datasets
datasets = load_datasets()
df = pd.DataFrame(datasets)

# Normalize row counts to millions before summing; without this, a value
# like "9.9 K" would be counted as 9.9 million.
def rows_in_millions(value):
    number, unit = str(value).split()
    return float(number.replace(',', '')) / (1000 if unit == "K" else 1)

# Display statistics
col1, col2 = st.columns(2)
with col1:
    total_rows = sum(rows_in_millions(rows) for rows in df['Number of rows'])
    st.metric("Total Rows", f"{total_rows:.2f} Million")
with col2:
    st.metric("Total Datasets", len(df))

# Display the dataset table
st.subheader("Dataset Overview")
st.dataframe(
    df,
    column_config={
        "Source": st.column_config.TextColumn("Source"),
        "DataSet repo link": st.column_config.LinkColumn("Repository"),
        "Number of rows": st.column_config.TextColumn("Rows"),
    },
    hide_index=True,
    use_container_width=True
)

# Add a note about the size of the datasets
st.info("⚠️ These datasets are very large, totaling approximately 100GB. They are not available for direct download through this interface. Please visit the individual dataset links for more information on accessing the data.")

# Add instructions for using the datasets
st.subheader("How to Use These Datasets")
code = '''
from datasets import load_dataset

dataset = load_dataset("username/dataset_name")
'''
st.code(code, language='python')

st.markdown("""
1. Click on the dataset link to visit its Hugging Face page.
2. On the dataset page, you'll find information about the dataset's content, structure, and usage.
3. Use the code above to load a dataset, replacing `"username/dataset_name"` with the actual dataset identifier.
4. For these large datasets, consider using streaming or loading specific subsets to manage memory usage.
5. Always check the dataset's license and usage restrictions before incorporating it into your project.
""")

# Footer
st.markdown("---")
st.markdown("Created by Macrocosmos with ❤️ ")
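The in-app instructions above recommend streaming for these large datasets. A minimal sketch of that pattern with the Hugging Face `datasets` library, using one of the repositories from the table; the split name "train" is an assumption here, so check each dataset card for the actual splits:

from datasets import load_dataset

# Stream records lazily instead of downloading the full ~100GB to disk.
# split="train" is an assumption; check the dataset card for real splits.
dataset = load_dataset(
    "dataverse-scraping/reddit_dataset_192",
    split="train",
    streaming=True,
)

# Inspect the first few rows without materializing the dataset.
for i, row in enumerate(dataset):
    print(row)
    if i >= 4:
        break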
macrocosmos-black.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
pandas
streamlit
streamlit-card
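To reproduce the Space locally, the usual Streamlit workflow should apply: install the three requirements above with `pip install -r requirements.txt`, keep `macrocosmos-black.png` next to `app.py` (the script loads it by relative path), and launch with `streamlit run app.py`. Note that `streamlit-card` is listed in the requirements but never imported by `app.py` as committed.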