"""Streamlit app that combines several Hugging Face datasets and exports them as JSONL."""

import base64
import html
import json
import os
from datetime import datetime

import streamlit as st
from datasets import concatenate_datasets, load_dataset

# Fallback used when the user-supplied dataset name has no usable file-name part.
DEFAULT_OUTPUT_NAME = "combined_dataset"


def load_and_combine_datasets():
    """Load the three source datasets and concatenate their ``train`` splits.

    Returns:
        The concatenated dataset, or ``None`` when loading or concatenation
        fails (the error is reported in the Streamlit UI).
    """
    try:
        python_codes_dataset = load_dataset('flytech/python-codes-25k', split='train')
        streamlit_issues_dataset = load_dataset("andfanilo/streamlit-issues", split='train')
        streamlit_docs_dataset = load_dataset("sai-lohith/streamlit_docs", split='train')
        # NOTE(review): concatenate_datasets presumably needs the three datasets
        # to share compatible features -- confirm against their actual schemas.
        return concatenate_datasets(
            [python_codes_dataset, streamlit_issues_dataset, streamlit_docs_dataset]
        )
    except Exception as e:  # UI boundary: show the failure instead of crashing the app
        st.error(f"Error loading datasets: {e}")
        return None


def datetime_serializer(o):
    """``default`` hook for ``json.dump`` that formats ``datetime`` values.

    Args:
        o: The object ``json`` could not serialize on its own.

    Returns:
        The datetime formatted as ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        TypeError: If ``o`` is not a ``datetime``. Falling through and
            returning ``None`` would make ``json`` silently write ``null``
            in place of the unsupported value.
    """
    if isinstance(o, datetime):
        return o.strftime('%Y-%m-%d %H:%M:%S')
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")


def save_combined_dataset_as_jsonl(combined_dataset, file_path):
    """Write every example of ``combined_dataset`` to ``file_path`` as JSON Lines.

    Success or failure is reported through the Streamlit UI.

    Args:
        combined_dataset: Iterable of JSON-serializable examples.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        ``True`` if the file was written completely, ``False`` otherwise.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            for example in combined_dataset:
                json.dump(example, f, ensure_ascii=False, default=datetime_serializer)
                f.write('\n')
    except Exception as e:  # UI boundary: report and signal failure to the caller
        st.error(f"Error saving dataset: {e}")
        return False
    st.success(f"Combined dataset saved as JSONL file: {file_path}")
    return True


def _output_file_name(dataset_name):
    """Return a ``<name>.jsonl`` file name built from user-supplied ``dataset_name``.

    Directory components are dropped so the name typed into the UI cannot
    point outside the directory the caller joins it to.
    """
    base_name = os.path.basename(dataset_name.strip())
    if base_name in ("", ".", ".."):
        base_name = DEFAULT_OUTPUT_NAME
    return f"{base_name}.jsonl"


def main():
    """Render the app: preview the combined dataset and offer save/download."""
    st.title("Combined Dataset Viewer and Downloader")

    # Load and combine datasets; the loader has already reported any failure.
    combined_dataset = load_and_combine_datasets()
    if combined_dataset is None:
        return

    # Display a subset of the combined dataset
    st.write("Subset of Combined Dataset:", combined_dataset[:10])

    # Take input for output dataset name
    output_dataset_name = st.text_input(
        "Enter output dataset name (without extension):", "combined_dataset"
    )
    file_name = _output_file_name(output_dataset_name)
    file_path = os.path.join(os.getcwd(), file_name)

    # Add option to save the combined dataset as JSONL
    if st.button("Save Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)

    # Add option to download the JSONL file
    if st.button("Download Combined Dataset (JSONL)"):
        # Do not offer a stale or partially written file if the export failed.
        if not save_combined_dataset_as_jsonl(combined_dataset, file_path):
            return
        try:
            with open(file_path, "rb") as f:
                bytes_data = f.read()

            # Inline data-URI link; the file name is escaped because the
            # markup is rendered with unsafe_allow_html=True.
            b64 = base64.b64encode(bytes_data).decode()
            safe_name = html.escape(file_name, quote=True)
            href = (
                f'<a href="data:application/jsonl;base64,{b64}" '
                f'download="{safe_name}">Download JSONL File</a>'
            )
            st.markdown(href, unsafe_allow_html=True)

            # Provide download button
            st.download_button(
                label="Click to Download",
                data=bytes_data,
                file_name=file_name,
                mime="application/jsonl",
            )
        except Exception as e:  # UI boundary: report instead of crashing the app
            st.error(f"Error preparing download: {e}")


if __name__ == "__main__":
    main()