Alexander Watson committed on
Commit
eb03925
·
1 Parent(s): 28fb096

initial checkin

Browse files
Files changed (9) hide show
  1. .gitignore +42 -0
  2. LICENSE +201 -0
  3. README.md +1 -14
  4. app.py +11 -0
  5. requirements.txt +14 -0
  6. src/app.py +299 -0
  7. src/utils/__init__.py +0 -0
  8. src/utils/analysis.py +486 -0
  9. src/utils/visualization.py +162 -0
.gitignore ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+
24
+ # Virtual Environment
25
+ venv/
26
+ ENV/
27
+
28
+ # IDEs
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
33
+
34
+ # OS
35
+ .DS_Store
36
+ Thumbs.db
37
+
38
+ # Streamlit
39
+ .streamlit/secrets.toml
40
+
41
+ # Local development
42
+ .env
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,14 +1 @@
1
- ---
2
- title: Dataset Card Generator
3
- emoji: 🦀
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.40.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Generate beautiful documentation for your HF datasets
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # data-card-generator
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Launcher: run the Streamlit app that lives in ``src/`` from the repo root."""
import sys
from pathlib import Path

# Put src/ at the FRONT of the import path. This launcher is itself named
# app.py, so when its own directory precedes src/ on sys.path (as it normally
# does for the script being run), `from app import main` would resolve back to
# this file instead of src/app.py and fail with a circular import.
src_path = Path(__file__).resolve().parent / "src"
src_dir = str(src_path)
if sys.path[:1] != [src_dir]:
    sys.path.insert(0, src_dir)

# Import and run the actual app
from app import main  # noqa: E402  (must follow the sys.path change)

main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.31.1
2
+ pandas==2.2.0
3
+ matplotlib==3.8.2
4
+ seaborn==0.13.2
5
+ datasets==2.17.0
6
+ huggingface-hub==0.20.3
7
+ wordcloud==1.9.3
8
+ PyYAML==6.0.1
9
+ openai==1.12.0
10
+ python-dotenv==1.0.1
11
+ plotly==5.18.0
12
+ kaleido==0.2.1
13
+ scipy==1.12.0
14
+ tiktoken==0.7.0
src/app.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ from datasets import load_dataset
6
+ from huggingface_hub import HfApi, login
7
+ from openai import OpenAI
8
+
9
+ # Import our utility functions
10
+ from utils.analysis import analyze_dataset_with_openai, generate_dataset_card
11
+ from utils.visualization import create_distribution_plot, create_wordcloud
12
+
13
# Seed each session-state slot the app reads; slots that already hold a value
# are left untouched.
for _key, _default in (
    ("openai_analysis", None),
    ("df", None),
    ("dataset_name", None),
    ("selected_dist_columns", []),
    ("selected_wordcloud_columns", []),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Page-level configuration: title, icon and wide layout.
st.set_page_config(page_title="Dataset Card Generator", page_icon="📊", layout="wide")
30
+
31
+
32
def initialize_openai_client(api_key):
    """Build and return an ``OpenAI`` client that uses ``api_key``."""
    client = OpenAI(api_key=api_key)
    return client
35
+
36
+
37
def load_and_analyze_dataset(dataset_name):
    """Load a dataset's ``train`` split and analyze a small sample with OpenAI.

    On a successful load the DataFrame and dataset name are stored in
    ``st.session_state.df`` / ``st.session_state.dataset_name``; on a
    successful analysis the result is stored in
    ``st.session_state.openai_analysis``. Progress and failures are reported
    through a Streamlit status widget. Nothing is returned and exceptions are
    not propagated to the caller.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
    """
    progress_container = st.empty()

    with progress_container.container():
        with st.status("Loading dataset...", expanded=True) as status:
            # --- Step 1: load the dataset -------------------------------
            try:
                status.write("📥 Loading dataset from HuggingFace...")
                dataset = load_dataset(dataset_name, split="train")
                df = pd.DataFrame(dataset)
                st.session_state.df = df
                st.session_state.dataset_name = dataset_name
            except Exception as e:
                status.update(label=f"❌ Error: {str(e)}", state="error")
                st.error(f"Failed to load dataset: {str(e)}")
                return

            # A new dataset was loaded: discard any analysis that belonged to
            # the previous one so it can never be used for the wrong card.
            st.session_state.openai_analysis = None

            # --- Step 2: analyze a sample -------------------------------
            try:
                status.write("🤖 Analyzing dataset ...")
                client = initialize_openai_client(st.session_state.openai_key)
                sample_data = dataset[:5]
                # default=str keeps this debug dump from raising on values
                # json cannot encode natively, which would abort the analysis.
                print(
                    "Sample data:",
                    json.dumps(sample_data, indent=2, default=str),
                )

                analysis = analyze_dataset_with_openai(client, sample_data)
                print(
                    "Analysis result:",
                    json.dumps(analysis, indent=2, default=str),
                )

                st.session_state.openai_analysis = analysis
            except Exception as e:
                print(f"Analysis error: {str(e)}")
                status.update(label=f"❌ Error: {str(e)}", state="error")
                # Stop here so the error status is not overwritten by the
                # success message below.
                return

            status.update(
                label="✅ Dataset loaded and analyzed successfully!",
                state="complete",
            )
75
+
76
+
77
def display_dataset_analysis():
    """Show the dataset preview and the visualization pickers.

    Reads the DataFrame from ``st.session_state.df`` (returns immediately when
    it is ``None``), stores the user's choices in
    ``st.session_state.selected_dist_columns`` and
    ``st.session_state.selected_wordcloud_columns``, and calls
    ``generate_and_display_card()`` when the generate button is pressed with
    at least one visualization selected.
    """
    df = st.session_state.df
    if df is None:
        return

    st.header("Dataset Analysis")

    # Dataset preview
    with st.expander("📊 Dataset Preview", expanded=True):
        st.dataframe(df.head(), use_container_width=True)

    # Column selection for visualizations
    st.subheader("Select Visualization Fields")

    col1, col2 = st.columns(2)

    with col1:
        # Distribution plot selection
        st.session_state.selected_dist_columns = st.multiselect(
            "Distribution Plots (max 2)",
            options=df.columns.tolist(),
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select columns to show value distributions. List columns will show frequency of individual items.",
        )

    with col2:
        # Word cloud selection: object-dtype columns, or columns whose first
        # value is a list. The first-value probe is skipped for an empty
        # frame, where ``iloc[0]`` would raise IndexError.
        has_rows = len(df) > 0
        text_columns = [
            col
            for col in df.columns
            if df[col].dtype == "object"
            or (has_rows and isinstance(df[col].iloc[0], list))
        ]

        st.session_state.selected_wordcloud_columns = st.multiselect(
            "Word Clouds (max 2)",
            options=text_columns,
            format_func=lambda x: get_column_type_description(df, x),
            max_selections=2,
            help="Select text columns to generate word clouds",
        )

    # Add some spacing
    st.markdown("---")

    # Generate card button
    if st.button("Generate Dataset Card", type="primary", use_container_width=True):
        if not (
            st.session_state.selected_dist_columns
            or st.session_state.selected_wordcloud_columns
        ):
            st.warning(
                "Please select at least one visualization before generating the card."
            )
            return

        generate_and_display_card()
135
+
136
+
137
def generate_and_display_card():
    """Build the dataset card, render it, and offer it as a download.

    Uses the DataFrame, dataset name, selected columns and OpenAI analysis
    held in ``st.session_state``. Shows an error and returns early when no
    analysis is available.

    Raises:
        Exception: Any error raised while creating plots or composing the
            card is logged, shown via ``st.error`` and re-raised.
    """
    if not st.session_state.openai_analysis:
        st.error(
            "Dataset analysis not available. Please try loading the dataset again."
        )
        return

    with st.status("Generating dataset card...", expanded=True) as status:
        try:
            # Create visualizations
            status.write("📊 Creating distribution plots...")
            distribution_plots = {}
            for col in st.session_state.selected_dist_columns:
                print(f"Generating distribution plot for {col}")
                distribution_plots[col] = create_distribution_plot(
                    st.session_state.df, col
                )
                print(f"Successfully created plot for {col}")

            status.write("🔤 Generating word clouds...")
            wordcloud_plots = {}
            for col in st.session_state.selected_wordcloud_columns:
                print(f"Generating word cloud for {col}")
                wordcloud_plots[col] = create_wordcloud(st.session_state.df, col)
                print(f"Successfully created word cloud for {col}")

            # Generate dataset card content
            status.write("📝 Composing dataset card...")
            dataset_info = {"dataset_name": st.session_state.dataset_name}

            readme_content = generate_dataset_card(
                dataset_info=dataset_info,
                distribution_plots=distribution_plots,
                wordcloud_plots=wordcloud_plots,
                openai_analysis=st.session_state.openai_analysis,
                df=st.session_state.df,
            )

            # Display results
            status.update(label="✅ Dataset card generated!", state="complete")

            # Display the markdown with images
            st.markdown(readme_content, unsafe_allow_html=True)

            # Add download button
            st.download_button(
                label="⬇️ Download Dataset Card",
                data=readme_content,
                file_name="README.md",
                mime="text/markdown",
                use_container_width=True,
            )

        except Exception as e:
            print(f"Error in generate_and_display_card: {str(e)}")
            # Move the widget out of its "running" state so the failure is
            # visible there as well as in the error box.
            status.update(label=f"❌ Error: {str(e)}", state="error")
            st.error(f"Error generating dataset card: {str(e)}")
            raise
195
+
196
+
197
def get_column_type_description(data, column):
    """Return ``"<column> (<kind>)"`` as a user-friendly label for a column.

    The kind is ``list`` when the column's first value is a list, ``numeric``
    for any non-boolean numeric dtype, ``text/categorical`` otherwise, and
    ``unknown`` when the column cannot be inspected (missing column or no
    rows).

    Args:
        data: DataFrame holding the column.
        column: Column name to describe.
    """
    try:
        series = data[column]
        if isinstance(series.iloc[0], list):
            return f"{column} (list)"
        # Any numeric width counts (int32, float32, nullable ints, ...);
        # booleans stay categorical.
        if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(
            series
        ):
            return f"{column} (numeric)"
        return f"{column} (text/categorical)"
    except (IndexError, KeyError, TypeError, AttributeError):
        # Best-effort label only: never let a display helper break the UI.
        return f"{column} (unknown)"
208
+
209
+
210
def get_api_keys():
    """Return ``(hf_token, openai_key)`` read from Streamlit secrets.

    Both values come from the ``api_keys`` section of ``st.secrets``
    (``huggingface`` and ``openai`` entries). If the secrets cannot be read
    or either entry is missing, ``(None, None)`` is returned.
    """
    try:
        hf_token = st.secrets["api_keys"]["huggingface"]
        openai_key = st.secrets["api_keys"]["openai"]
    except Exception:
        # Secrets are optional; any failure to read them means "not set".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None, None
    return hf_token, openai_key
219
+
220
+
221
def get_secrets():
    """Return ``(hf_token, openai_key)`` from Streamlit secrets.

    Missing entries default to empty strings; if reading the secrets raises,
    the error is printed and ``("", "")`` is returned.
    """
    try:
        hf_token = st.secrets.get("api_keys", {}).get("huggingface", "")
        openai_key = st.secrets.get("api_keys", {}).get("openai", "")
    except Exception as e:
        print(f"No secrets file found or error reading secrets: {e}")
        return "", ""
    else:
        return hf_token, openai_key
230
+
231
+
232
def main():
    """Render the app: intro text, sidebar authentication, dataset input.

    The OpenAI key is required; without it only a hint is shown. The
    HuggingFace login is attempted only when a token is supplied. Once a
    dataset has been loaded into ``st.session_state.df`` the analysis UI is
    displayed.
    """
    # Seed the state read below without overwriting existing values.
    # NOTE(review): module-level initialisation runs only when this module is
    # first imported, which may not be once per browser session when the app
    # is started through a launcher script — confirm.
    for key, default in (
        ("openai_analysis", None),
        ("df", None),
        ("dataset_name", None),
        ("selected_dist_columns", []),
        ("selected_wordcloud_columns", []),
    ):
        if key not in st.session_state:
            st.session_state[key] = default

    st.title("📊 Dataset Card Generator")
    st.markdown(
        """
Generate beautiful documentation for your HuggingFace datasets with automated analysis,
visualizations, and formatted dataset cards.
"""
    )

    # Get secrets if available
    default_hf_token, default_openai_key = get_api_keys()

    # Authentication section in sidebar
    with st.sidebar:
        st.header("🔑 Authentication")

        # Credentials are always masked. Previously a key pre-filled from
        # secrets was rendered as plain text to whoever opened the app.
        openai_key = st.text_input(
            "OpenAI API Key",
            value=default_openai_key,
            type="password",
            help="Required: Your OpenAI API key for dataset analysis",
        )

        # HuggingFace token (optional)
        hf_token = st.text_input(
            "HuggingFace Token (optional)",
            value=default_hf_token,
            type="password",
            help="Optional: Only required for private datasets",
        )

        if not openai_key:
            st.info("👆 Please enter your OpenAI API key to get started.")
            return

        try:
            # Only attempt HF login if token is provided
            if hf_token:
                login(hf_token)
                st.success("✅ HuggingFace authentication successful!")

            st.session_state.openai_key = openai_key
            st.success("✅ OpenAI API key set!")
        except Exception as e:
            st.error(f"❌ Authentication error: {str(e)}")
            return

    # Main content area (only reached with a non-empty OpenAI key)
    dataset_name = st.text_input(
        "Enter HuggingFace Dataset Name",
        placeholder="username/dataset",
        help="Enter the full path to your HuggingFace dataset (e.g., 'username/dataset')",
    )

    if dataset_name:
        if st.button("Load Dataset", type="primary"):
            load_and_analyze_dataset(dataset_name)

        if st.session_state.df is not None:
            display_dataset_analysis()
296
+
297
+
298
+ if __name__ == "__main__":
299
+ main()
src/utils/__init__.py ADDED
File without changes
src/utils/analysis.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import json
3
+ import yaml
4
+ import re
5
+ import datetime
6
+ import plotly.express as px
7
+ import plotly.graph_objects as go
8
+ import pandas as pd
9
+ import base64
10
+ import io
11
+ from collections import Counter
12
+ import tiktoken
13
+
14
+
15
def extract_json_from_response(text: str) -> str:
    """Pull a JSON object out of a model response.

    A ``{...}`` object inside a fenced code block (optionally tagged
    ``json``) is preferred; failing that, the span from the first ``{`` to
    the last ``}`` is used; failing that, ``text`` is returned unchanged.
    """
    # (pattern, capture group to return), tried in priority order.
    candidates = (
        (r"```(?:json)?\s*(\{.*?\})\s*```", 1),
        (r"\{.*\}", 0),
    )
    for pattern, group in candidates:
        found = re.search(pattern, text, re.DOTALL)
        if found is not None:
            return found.group(group)

    return text
29
+
30
+
31
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Return the number of tiktoken tokens in ``str(text)`` for ``model``.

    If looking up the encoder or encoding fails, the error is printed and 0
    is returned.
    """
    try:
        tokens = tiktoken.encoding_for_model(model).encode(str(text))
    except Exception as e:
        print(f"Error counting tokens: {e}")
        return 0
    return len(tokens)
39
+
40
+
41
def create_distribution_plot(data, column):
    """Render a distribution plot for ``column`` as a base64-encoded PNG.

    Numeric (non-boolean) columns get a 30-bin histogram. Every other column
    gets a bar chart of value counts; for columns whose first value is a
    list, the counts come from ``flatten_list_column``.

    Args:
        data: DataFrame holding the column.
        column: Name of the column to plot.

    Returns:
        The PNG image bytes encoded as a base64 string.

    Raises:
        Exception: Any failure is logged with the column name and re-raised.
    """
    try:
        series = data[column]
        # Classify the column once; both the trace type and the tick
        # rotation below depend on it.
        is_list_column = isinstance(series.iloc[0], list)
        is_numeric = (
            not is_list_column
            and pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

        if is_numeric:
            # Continuous data - use histogram
            fig = go.Figure()
            fig.add_trace(
                go.Histogram(
                    x=series,
                    name="Count",
                    nbinsx=30,
                    marker=dict(
                        color="rgba(110, 68, 255, 0.7)",
                        line=dict(color="rgba(184, 146, 255, 1)", width=1),
                    ),
                )
            )
        else:
            if is_list_column:
                print(f"Processing list column: {column}")
                value_counts = flatten_list_column(data, column)
            else:
                # Categorical data
                value_counts = series.value_counts()

            fig = go.Figure(
                [
                    go.Bar(
                        x=value_counts.index,
                        y=value_counts.values,
                        marker=dict(
                            color=value_counts.values,
                            colorscale=px.colors.sequential.Plotly3,
                        ),
                    )
                ]
            )

        # Common layout updates
        fig.update_layout(
            title=dict(text=f"Distribution of {column}", x=0.5, y=0.95),
            xaxis_title=column,
            yaxis_title="Count",
            template="plotly_white",
            margin=dict(t=50, l=50, r=30, b=50),
            width=600,
            height=400,
            showlegend=False,
            plot_bgcolor="rgba(0,0,0,0)",
            paper_bgcolor="rgba(0,0,0,0)",
        )

        # Category labels can be long; rotate them so they do not overlap.
        if not is_numeric:
            fig.update_layout(xaxis_tickangle=-45)

        # Update grid style
        fig.update_yaxes(gridcolor="rgba(128,128,128,0.1)", gridwidth=1)
        fig.update_xaxes(gridcolor="rgba(128,128,128,0.1)", gridwidth=1)

        # Convert to PNG with moderate resolution, then encode to base64
        img_bytes = fig.to_image(format="png", scale=1.5)
        return base64.b64encode(img_bytes).decode()

    except Exception as e:
        print(f"Error creating distribution plot for {column}: {str(e)}")
        raise
129
+
130
+
131
def create_wordcloud(data, column):
    """Render a word cloud for ``column`` as a base64-encoded PNG.

    For columns whose first value is a list, the items of every list-valued
    row are joined into the text; otherwise the column's values are converted
    to strings and joined.

    Args:
        data: DataFrame holding the column.
        column: Name of the column to visualize.

    Returns:
        The PNG image bytes encoded as a base64 string.

    Raises:
        Exception: Any failure is logged with the column name and re-raised.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    try:
        # Handle list columns
        if isinstance(data[column].iloc[0], list):
            text = " ".join(
                " ".join(map(str, sublist))
                for sublist in data[column]
                if isinstance(sublist, list)
            )
        else:
            # Handle regular columns
            text = " ".join(data[column].astype(str))

        wordcloud = WordCloud(
            width=600,
            height=300,
            background_color="white",
            colormap="plasma",
            max_words=100,
        ).generate(text)

        # Draw on an explicit figure and always close it, so a failure while
        # rendering or saving does not leave the figure open.
        fig = plt.figure(figsize=(8, 4))
        try:
            ax = fig.add_subplot(111)
            ax.imshow(wordcloud, interpolation="bilinear")
            ax.axis("off")
            ax.set_title(f"Word Cloud for {column}")

            # Save to bytes
            buf = io.BytesIO()
            fig.savefig(buf, format="png", bbox_inches="tight", dpi=150)
        finally:
            plt.close(fig)

        # Convert to base64
        return base64.b64encode(buf.getvalue()).decode()

    except Exception as e:
        print(f"Error creating word cloud for {column}: {str(e)}")
        raise
178
+
179
+
180
def analyze_dataset_with_openai(client: "OpenAI", dataset_sample) -> dict:
    """Ask the OpenAI chat API to describe a dataset sample.

    The sample is serialized into a prompt requesting a JSON answer with the
    keys ``description`` (``overview``, ``key_features``,
    ``ml_applications``), ``schema`` and ``example``.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client).
        dataset_sample: Sample records; when it is a non-empty list, its
            first element is used for schema inference, otherwise the whole
            sample is.

    Returns:
        The parsed JSON response. If serializing the sample, calling the API
        or parsing the reply fails, a placeholder dict with the same
        top-level keys (and empty ``schema`` / ``example``) is returned
        instead of raising.
    """
    try:
        # Get a single record for schema inference
        if isinstance(dataset_sample, list) and dataset_sample:
            single_record = dataset_sample[0]
        else:
            single_record = dataset_sample

        # default=str: values json cannot encode natively are sent as their
        # string form instead of raising before the request is made.
        sample_json = json.dumps(dataset_sample, indent=2, default=str)
        single_record_json = json.dumps(single_record, indent=2, default=str)

        prompt = f"""Analyze this dataset sample and provide the following in a JSON response:

1. A concise description that includes:
- A one-sentence overview of what the dataset contains
- A bullet-pointed list of key features and statistics
- A brief statement about potential ML/AI applications

2. A schema showing each field's type and description. Use this single record for type inference:
{single_record_json}

For schema types, use precise types like:
- "string" for text fields
- "number" for numeric fields
- "boolean" for true/false
- "array of X" for arrays where X is the type of elements
- "object" for nested objects, with nested field descriptions

3. A formatted example record

Format your response as JSON with these exact keys:

{{
"description": {{
"overview": "One clear sentence describing the dataset...",
"key_features": [
"Feature or statistic 1",
"Feature or statistic 2"
],
"ml_applications": "Brief statement about ML/AI use cases..."
}},
"schema": {{
"field_name": {{
"type": "precise type as described above",
"description": "Description of what this field contains"
}}
}},
"example": {{"key": "value"}}
}}

For context, here are more sample records to help with the overview and features:
{sample_json}
"""

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=2000,
        )

        # Get the response content
        response_text = response.choices[0].message.content
        print("OpenAI Response:", response_text)

        # Extract JSON from the response
        json_str = extract_json_from_response(response_text)
        print("Extracted JSON:", json_str)

        # Parse the JSON
        result = json.loads(json_str)
        print("Parsed Result:", result)
        return result

    except Exception as e:
        print(f"OpenAI API error: {str(e)}")
        return {
            "description": {
                "overview": "Error analyzing dataset",
                "key_features": ["Error: Failed to analyze dataset"],
                "ml_applications": "Analysis unavailable",
            },
            "schema": {},
            "example": {},
        }
266
+
267
+
268
def analyze_dataset_statistics(df):
    """Generate simplified dataset statistics with token counting.

    Args:
        df: pandas DataFrame to summarize.

    Returns:
        A dict with two entries:
        ``basic_stats`` (``total_records``, ``total_features`` and a
        human-readable ``memory_usage`` string in MB) and ``token_stats``
        (``total`` token count plus a ``by_column`` mapping). Only columns
        with object dtype, or whose first value is a list, are tokenized.
    """
    stats = {
        "basic_stats": {
            "total_records": len(df),
            "total_features": len(df.columns),
            "memory_usage": f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f} MB",
        },
        "token_stats": {
            "total": 0,
            "by_column": {},
        },
    }

    def _cell_text(value):
        # Decide per cell rather than from the first row only, so a stray
        # None/non-list value in a list column no longer aborts the column.
        if isinstance(value, list):
            return " ".join(str(item) for item in value)
        return str(value)

    # Count tokens for each column
    for column in df.columns:
        try:
            series = df[column]
            if len(series) == 0:
                # Nothing to tokenize; also avoids iloc[0] raising below.
                continue
            if not (series.dtype == "object" or isinstance(series.iloc[0], list)):
                continue

            token_counts = series.apply(lambda x: count_tokens(_cell_text(x)))
            total_tokens = int(token_counts.sum())
            stats["token_stats"]["total"] += total_tokens
            stats["token_stats"]["by_column"][column] = total_tokens
        except Exception as e:
            # Best effort: a column that cannot be tokenized is reported and
            # left out of the token statistics.
            print(f"Error processing column {column}: {str(e)}")
            continue

    return stats
300
+
301
def format_dataset_stats(stats):
    """Render the statistics dict as a markdown section.

    Args:
        stats: Dict with a ``basic_stats`` mapping (``total_records``,
            ``total_features``, ``memory_usage``) and a ``token_stats``
            mapping (``total``, ``by_column``).

    Returns:
        Markdown text with the basic statistics, followed by a token
        section when the total token count is positive.
    """
    basic = stats["basic_stats"]
    tokens = stats["token_stats"]

    parts = [
        "## Dataset Overview\n",
        "\n",
        "### Basic Statistics\n",
        f"* Total Records: {basic['total_records']:,}\n",
        f"* Total Features: {basic['total_features']}\n",
        f"* Memory Usage: {basic['memory_usage']}\n",
    ]

    # The token section is omitted entirely when nothing was counted.
    if tokens["total"] > 0:
        parts.append("\n### Token Info\n")
        parts.append(f"* Total Tokens: {tokens['total']:,}\n")
        per_column = tokens["by_column"]
        if per_column:
            parts.append("\nTokens by Column:\n")
            parts.extend(f"* {name}: {amount:,}\n" for name, amount in per_column.items())

    return "".join(parts)
321
+
322
def generate_dataset_card(
    dataset_info: dict,
    distribution_plots: dict,
    wordcloud_plots: dict,
    openai_analysis: dict,
    df: pd.DataFrame,
) -> str:
    """Assemble the full dataset card (README) text.

    Args:
        dataset_info: Must provide ``dataset_name``.
        distribution_plots: Mapping of column name to base64 PNG string.
        wordcloud_plots: Mapping of column name to base64 PNG string.
        openai_analysis: Dict with ``description`` (``overview``,
            ``key_features``, ``ml_applications``), ``schema`` and
            ``example`` entries.
        df: DataFrame used to compute the statistics section.

    Returns:
        The card as one string: YAML front matter followed by markdown.
    """

    def _image_grid(heading, grid_columns, caption, plots):
        # Renders one "### heading" section holding an HTML grid of inline
        # base64 images; yields an empty string when there are no plots.
        if not plots:
            return ""
        pieces = [
            f"\n### {heading}\n\n",
            f'<div style="display: grid; grid-template-columns: repeat({grid_columns}, 1fr); gap: 20px;">\n',
        ]
        for col, img_str in plots.items():
            pieces.append("<div>\n")
            pieces.append(f"<h4>{caption} {col}</h4>\n")
            pieces.append(
                f'<img src="data:image/png;base64,{img_str}" style="width: 100%; height: auto;">\n'
            )
            pieces.append("</div>\n")
        pieces.append("</div>\n\n")
        return "".join(pieces)

    # Front matter for the card
    front_matter = {
        "language": ["en"],
        "license": "apache-2.0",
        "multilinguality": "monolingual",
        "size_categories": ["1K<n<10K"],
        "task_categories": ["other"],
    }
    yaml_string = yaml.dump(front_matter, sort_keys=False)

    description = openai_analysis["description"]

    # Schema table and example record
    schema_table = generate_schema_table(openai_analysis["schema"])
    example_json = json.dumps(openai_analysis["example"], indent=2)
    example_block = f"```json\n{example_json}\n```"

    # Statistics section
    stats_section = format_dataset_stats(analyze_dataset_statistics(df))

    # Inline plots: distributions in a single column, word clouds two-up
    distribution_plots_md = _image_grid(
        "Distribution Plots", 1, "Distribution of", distribution_plots
    )
    wordcloud_plots_md = _image_grid("Word Clouds", 2, "Word Cloud for", wordcloud_plots)

    # Citation key: the dataset name with "/" replaced by "_"
    dataset_name = dataset_info["dataset_name"]
    clean_dataset_name = dataset_name.replace("/", "_")

    overview = description["overview"]
    feature_bullets = "\n".join(f"* {feature}" for feature in description["key_features"])
    ml_applications = description["ml_applications"]
    citation_year = datetime.datetime.now().year

    return f"""---
{yaml_string}---

# {dataset_name}

{overview}

The dataset includes:
{feature_bullets}

{ml_applications}

## Dataset Schema

{schema_table}

## Example Record

{example_block}

## Data Distribution Analysis

The following visualizations show key characteristics of the dataset:

{distribution_plots_md}
{wordcloud_plots_md}

{stats_section}

## Citation and Usage

If you use this dataset in your research or applications, please cite it as:

```bibtex
@dataset{{{clean_dataset_name},
title = {{{dataset_name}}},
author = {{Dataset Authors}},
year = {{{citation_year}}},
publisher = {{Hugging Face}},
howpublished = {{Hugging Face Datasets}},
url = {{https://huggingface.co/datasets/{dataset_name}}}
}}
```

### Usage Guidelines

This dataset is released under the Apache 2.0 License. When using this dataset:

* 📚 Cite the dataset using the BibTeX entry above
* 🤝 Consider contributing improvements or reporting issues
* 💡 Share derivative works with the community when possible

For questions or additional information, please visit the dataset repository on Hugging Face.
"""
435
+
436
+
437
def generate_schema_table(schema: dict) -> str:
    """Build the markdown schema table, flattening nested fields.

    Args:
        schema: Mapping of field name to field info, as consumed by
            ``format_schema_item``.

    Returns:
        The table header followed by one row per formatted schema entry,
        rows separated by newlines (no trailing newline after the last row).
    """
    header = "| Field | Type | Description |\n| --- | --- | --- |\n"

    # Each field may expand into several rows when it is a nested object.
    body_rows = [
        row
        for field, info in schema.items()
        for row in format_schema_item(field, info)
    ]

    return header + "\n".join(body_rows)
450
+
451
+
452
def format_schema_item(field_name: str, field_info: dict, prefix: str = "") -> list:
    """Recursively format one schema entry as markdown table rows.

    Args:
        field_name: Name of the field at this nesting level.
        field_info: Usually a dict. A dict with a ``type`` entry is a leaf
            (``description`` is optional); any other dict is treated as a
            nested object whose keys are sub-fields. A non-dict value is
            rendered as the field's type with an empty description.
        prefix: Dotted path of the enclosing objects, ending in ``.`` (or
            empty at the top level).

    Returns:
        A list of ``| name | type | description |`` row strings.
    """

    def _cell(value) -> str:
        # A raw "|" or a line break inside a cell would split the table row.
        if value is None:
            return ""
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    def _row(name, type_text, description) -> str:
        return f"| {_cell(name)} | {_cell(type_text)} | {_cell(description)} |"

    qualified_name = f"{prefix}{field_name}"

    # Bare value such as {"id": "string"}: show it as the type instead of
    # silently dropping the field from the table.
    if not isinstance(field_info, dict):
        return [_row(qualified_name, field_info, "")]

    # Leaf node. "type" alone is enough unless its value is itself a dict,
    # which means a nested object that happens to have a field named "type".
    if "type" in field_info and (
        "description" in field_info or not isinstance(field_info["type"], dict)
    ):
        return [_row(qualified_name, field_info["type"], field_info.get("description"))]

    # Nested object: recurse through its properties.
    rows = []
    child_prefix = f"{qualified_name}."
    for subfield, subinfo in field_info.items():
        # A scalar "type"/"description" here is metadata about the object
        # itself, not a sub-field, so it produces no row of its own.
        if subfield in ("type", "description") and not isinstance(subinfo, dict):
            continue
        rows.extend(format_schema_item(subfield, subinfo, child_prefix))

    return rows
473
+
474
+
475
def flatten_list_column(data, column):
    """Count how often each item occurs across a column of lists.

    Args:
        data: Table-like object indexable by column name (e.g. a DataFrame).
        column: Name of the column whose cells hold lists.

    Returns:
        A pandas Series mapping each distinct item to its occurrence count.
        Cells that are not lists are ignored.
    """
    tally = Counter()
    for cell in data[column]:
        # Skip anything that is not a list (e.g. missing values).
        if isinstance(cell, list):
            tally.update(cell)
    return pd.Series(tally)
src/utils/visualization.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.express as px
2
+ import plotly.graph_objects as go
3
+ import pandas as pd
4
+ import base64
5
+ import io
6
+
7
+
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import pandas as pd
11
+ import base64
12
+ import io
13
+ from collections import Counter
14
+
15
def flatten_list_column(data, column):
    """Tally individual items found in a column whose cells are lists.

    Args:
        data: Table-like object indexable by column name (e.g. a DataFrame).
        column: Name of the list-valued column.

    Returns:
        A pandas Series of item -> number of occurrences. Non-list cells
        contribute nothing.
    """
    list_cells = (cell for cell in data[column] if isinstance(cell, list))

    occurrences = Counter()
    for cell in list_cells:
        for item in cell:
            occurrences[item] += 1

    return pd.Series(occurrences)
22
+
23
def create_distribution_plot(data, column):
    """Create a distribution plot with Plotly and return it as a base64 PNG.

    Numeric (non-boolean) columns are drawn as a 30-bin histogram. List
    columns and all other columns are drawn as a bar chart of value counts.

    Args:
        data: DataFrame holding the column.
        column: Name of the column to plot.

    Returns:
        The rendered PNG encoded as a base64 string.

    Raises:
        Exception: Any error raised while building or rendering the figure
            is logged and re-raised unchanged.
    """
    try:
        series = data[column]

        # List-ness is judged from the first row, as for the word clouds.
        is_list_column = isinstance(series.iloc[0], list)

        # Any numeric dtype counts as continuous (int32, float32, nullable
        # ints, ...), not just int64/float64. Booleans are two categories,
        # so they stay on the bar-chart path.
        is_continuous = (
            not is_list_column
            and pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

        if is_continuous:
            # Continuous data - use histogram
            fig = go.Figure()
            fig.add_trace(go.Histogram(
                x=series,
                name='Count',
                nbinsx=30,
                marker=dict(
                    color='rgba(110, 68, 255, 0.7)',
                    line=dict(color='rgba(184, 146, 255, 1)', width=1)
                )
            ))
        else:
            if is_list_column:
                print(f"Processing list column: {column}")
                value_counts = flatten_list_column(data, column)
            else:
                # Categorical data
                value_counts = series.value_counts()

            fig = go.Figure([go.Bar(
                x=value_counts.index,
                y=value_counts.values,
                marker=dict(
                    color=value_counts.values,
                    colorscale=px.colors.sequential.Plotly3,
                ),
            )])

        # Common layout updates
        fig.update_layout(
            title=f'Distribution of {column}',
            xaxis_title=column,
            yaxis_title='Count',
            template='plotly_white',
            margin=dict(t=50, l=50, r=50, b=50),
            width=1200,
            height=800,
            showlegend=False
        )

        # Category labels can be long; tilt them on bar charts.
        if not is_continuous:
            fig.update_layout(xaxis_tickangle=-45)

        # Convert to PNG
        img_bytes = fig.to_image(format="png", scale=2.0)

        # Encode to base64
        return base64.b64encode(img_bytes).decode()

    except Exception as e:
        print(f"Error creating distribution plot for {column}: {str(e)}")
        raise
89
+
90
def create_wordcloud(data, column):
    """Create a word cloud for a column and return it as a base64 PNG.

    This is the single definition of the function: a second, later
    definition used to override the list-aware one at import time.

    Args:
        data: DataFrame holding the column.
        column: Name of the column. If its first value is a list, the
            items of every list cell are joined into the text; otherwise
            every value is converted with ``astype(str)``.

    Returns:
        The rendered PNG encoded as a base64 string.

    Raises:
        Exception: Any error raised while generating or rendering the
            word cloud is logged and re-raised unchanged.
    """
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    try:
        series = data[column]

        if isinstance(series.iloc[0], list):
            # Handle list columns; non-list cells are skipped.
            text = " ".join(
                " ".join(map(str, cell)) for cell in series if isinstance(cell, list)
            )
        else:
            # Handle regular columns
            text = " ".join(series.astype(str))

        wordcloud = WordCloud(
            width=1200,
            height=800,
            background_color="white",
            colormap="plasma",
            max_words=100,
        ).generate(text)

        # Create matplotlib figure
        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"Word Cloud for {column}")

            # Save to bytes
            buf = io.BytesIO()
            plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
        finally:
            # Close the figure even when rendering fails, so it does not
            # stay registered with pyplot.
            plt.close(fig)

        # Convert to base64
        return base64.b64encode(buf.getvalue()).decode()

    except Exception as e:
        print(f"Error creating word cloud for {column}: {str(e)}")
        raise