File size: 2,464 Bytes
c7cc986
ff82de9
0f6cf6a
30585ac
08e2876
c7cc986
 
 
b75c1aa
 
360bd52
c7cc986
360bd52
c7cc986
 
0f6cf6a
 
 
 
 
360bd52
c7cc986
30585ac
b75c1aa
c7cc986
 
b75c1aa
360bd52
 
 
30585ac
 
 
0f6cf6a
 
30585ac
0f6cf6a
 
48aa054
18adb0d
 
30585ac
18adb0d
 
08e2876
 
30585ac
 
 
08e2876
 
 
 
 
 
18adb0d
c7cc986
308ef5a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import streamlit as st
from datasets import load_dataset, concatenate_datasets
import json
import os
import base64

def load_and_combine_datasets():
    """Load the ``train`` split of three hub datasets and concatenate them.

    The datasets are loaded in a fixed order (Python code samples, Streamlit
    issues, Streamlit docs) and handed to ``concatenate_datasets`` as a list.

    Returns:
        Whatever ``concatenate_datasets`` returns for the three loaded splits.

    NOTE(review): this presumably relies on the three datasets having
    compatible columns for concatenation -- confirm against their schemas.
    """
    dataset_ids = (
        'flytech/python-codes-25k',
        "andfanilo/streamlit-issues",
        "sai-lohith/streamlit_docs",
    )
    # Order matters: rows appear in the combined dataset in this sequence.
    train_splits = [
        load_dataset(dataset_id, split='train') for dataset_id in dataset_ids
    ]
    return concatenate_datasets(train_splits)

def save_combined_dataset_as_jsonl(combined_dataset, file_path):
    """Write each example of ``combined_dataset`` to ``file_path`` as JSON Lines.

    Args:
        combined_dataset: Iterable of JSON-serializable examples.
        file_path: Destination path; the file is created or overwritten.

    Every example becomes one line of JSON followed by a newline. The file is
    written as UTF-8 and non-ASCII characters are kept verbatim rather than
    escaped.
    """
    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in combined_dataset
        )

def main():
    """Render the Streamlit page for viewing, saving and downloading the data.

    The page shows the first ten rows of the combined dataset, asks for an
    output name, and offers two buttons: one writes the dataset to a JSONL
    file in the current working directory, the other writes the same file and
    exposes it through an HTML data link and a Streamlit download button.
    """
    # Function-scope import: only needed to escape the download link attribute.
    import html

    st.title("Combined Dataset Viewer and Downloader")

    # Load and combine datasets
    combined_dataset = load_and_combine_datasets()

    # Display a subset of the combined dataset
    st.write("Subset of Combined Dataset:", combined_dataset[:10])

    # Take input for output dataset name
    output_dataset_name = st.text_input("Enter output dataset name (without extension):", "combined_dataset")

    # The name is free text from the browser. Keep only its final path
    # component so input such as "../x" cannot write outside the working
    # directory, and fall back to the default when nothing usable remains.
    safe_name = os.path.basename(output_dataset_name.strip()) or "combined_dataset"
    file_name = f"{safe_name}.jsonl"
    file_path = os.path.join(os.getcwd(), file_name)

    # Add option to save the combined dataset as JSONL
    if st.button("Save Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write(f"Combined dataset saved as JSONL file: {file_path}")

    # Add option to download the JSONL file
    if st.button("Download Combined Dataset (JSONL)"):
        save_combined_dataset_as_jsonl(combined_dataset, file_path)
        st.write("Download the combined dataset as JSONL file:")
        # The file is written as UTF-8, so decode it as UTF-8 explicitly
        # instead of relying on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            bytes_data = f.read().encode("utf-8")
        b64 = base64.b64encode(bytes_data).decode()
        # The link is rendered with unsafe_allow_html, so the user-supplied
        # name must be escaped before it is placed inside an attribute.
        download_attr = html.escape(file_name, quote=True)
        href = f'<a href="data:file/jsonl;base64,{b64}" download="{download_attr}">Download JSONL File</a>'
        st.markdown(href, unsafe_allow_html=True)

        # Provide download button
        st.download_button(label="Click to Download",
                           data=bytes_data,
                           file_name=file_name,
                           mime="application/jsonl")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()