File size: 3,061 Bytes
c7cc986
ff82de9
0f6cf6a
30585ac
08e2876
24b4364
c7cc986
 
1058a79
 
 
 
 
 
 
 
 
 
 
c7cc986
24b4364
 
 
 
0f6cf6a
1058a79
 
 
 
 
 
 
 
360bd52
c7cc986
30585ac
b75c1aa
c7cc986
 
b75c1aa
1058a79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18adb0d
c7cc986
24b4364
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
from datasets import load_dataset, concatenate_datasets
import json
import os
import base64
from datetime import datetime

def load_and_combine_datasets():
    """Load the three source datasets and return them as one concatenated dataset.

    Each dataset is fetched with ``load_dataset(..., split='train')`` in a
    fixed order and the results are passed to ``concatenate_datasets``.

    Returns:
        The concatenated dataset, or ``None`` when loading or concatenation
        raised; in that case the error is shown with ``st.error``.
    """
    # Order matters: the parts are concatenated in exactly this sequence.
    dataset_ids = (
        'flytech/python-codes-25k',
        "andfanilo/streamlit-issues",
        "sai-lohith/streamlit_docs",
    )
    try:
        train_splits = []
        for dataset_id in dataset_ids:
            train_splits.append(load_dataset(dataset_id, split='train'))
        return concatenate_datasets(train_splits)
    except Exception as e:
        st.error(f"Error loading datasets: {e}")
        return None

def datetime_serializer(o):
    """Serialize ``datetime`` values for use as the ``default`` hook of ``json.dump``.

    Args:
        o: The object the JSON encoder could not serialize on its own.

    Returns:
        The datetime formatted as ``'YYYY-MM-DD HH:MM:SS'``.

    Raises:
        TypeError: If ``o`` is not a ``datetime``. Falling through and
            returning ``None`` would make the encoder write ``null`` for any
            unsupported value, silently dropping data from the output.
    """
    if isinstance(o, datetime):
        return o.strftime('%Y-%m-%d %H:%M:%S')
    # Mirror the message the json encoder itself uses for unsupported types.
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

def save_combined_dataset_as_jsonl(combined_dataset, file_path):
    """Write every example of ``combined_dataset`` to ``file_path`` as JSON Lines.

    Each example is dumped as one JSON document followed by a newline, with
    non-ASCII characters kept as-is and ``datetime_serializer`` used as the
    fallback for values the JSON encoder cannot handle.

    Args:
        combined_dataset: Iterable of examples to serialize.
        file_path: Destination path; the file is opened for writing as UTF-8.

    Returns:
        None. The outcome is reported through ``st.success`` or ``st.error``.
    """
    try:
        with open(file_path, mode='w', encoding='utf-8') as jsonl_file:
            for record in combined_dataset:
                # One record per line: JSON payload first, then the terminator.
                json.dump(
                    record,
                    jsonl_file,
                    ensure_ascii=False,
                    default=datetime_serializer,
                )
                jsonl_file.write('\n')
        st.success(f"Combined dataset saved as JSONL file: {file_path}")
    except Exception as err:
        st.error(f"Error saving dataset: {err}")

def main():
    """Render the Streamlit page for previewing and exporting the combined dataset.

    Loads the combined dataset, shows its first ten rows, asks for an output
    name, and offers two buttons: one that saves the dataset as a JSONL file
    in the current working directory, and one that saves it and then exposes
    the file through an HTML download link and a Streamlit download button.

    The output name is user input that is used both in a filesystem path and
    in raw HTML, so it is reduced to a plain file name and HTML-escaped
    before use.
    """
    # Function-scope import: only this block needs it, to escape the
    # user-supplied name before embedding it in raw HTML.
    import html

    st.title("Combined Dataset Viewer and Downloader")

    # Load and combine datasets
    combined_dataset = load_and_combine_datasets()

    if combined_dataset is None:
        # The loader has already displayed the error; nothing else to render.
        return

    # Display a subset of the combined dataset
    st.write("Subset of Combined Dataset:", combined_dataset[:10])

    # Take input for output dataset name
    output_dataset_name = st.text_input("Enter output dataset name (without extension):", "combined_dataset")

    # Keep only the final path component so a name such as "../../x" cannot
    # place the file outside the working directory; fall back to the default
    # name when nothing usable remains (empty input or a trailing separator).
    safe_name = os.path.basename(output_dataset_name.strip()) or "combined_dataset"
    file_name = f"{safe_name}.jsonl"
    file_path = os.path.join(os.getcwd(), file_name)

    # Add option to save the combined dataset as JSONL
    if st.button("Save Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)

    # Add option to download the JSONL file
    if st.button("Download Combined Dataset (JSONL)"):
        # The file is (re)written first so the download reflects the current data.
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        try:
            with open(file_path, "rb") as f:
                bytes_data = f.read()
            b64 = base64.b64encode(bytes_data).decode()
            # Escape quotes and angle brackets: the markup below is rendered
            # with unsafe_allow_html=True, so an unescaped name could break
            # out of the attribute and inject arbitrary HTML.
            download_attr = html.escape(file_name, quote=True)
            href = f'<a href="data:file/jsonl;base64,{b64}" download="{download_attr}">Download JSONL File</a>'
            st.markdown(href, unsafe_allow_html=True)

            # Provide download button
            st.download_button(label="Click to Download",
                               data=bytes_data,
                               file_name=file_name,
                               mime="application/jsonl")
        except Exception as e:
            st.error(f"Error preparing download: {e}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()