# Spaces:
# Running
# Running
import json
import os

import pandas as pd
import streamlit as st
def _as_mapping(cell):
    """Return *cell* as a ``{sub-key: value}`` mapping for column expansion.

    Dicts are used as-is, lists/tuples are keyed by position, and any other
    present value is keyed by ``0``. Missing values (``None``/NaN) map to an
    empty dict so that padding cells do not create extra columns.
    """
    if isinstance(cell, dict):
        return cell
    if isinstance(cell, (list, tuple)):
        return dict(enumerate(cell))
    if pd.isna(cell):
        return {}
    return {0: cell}


def _expand_messages(messages):
    """Expand a Series of ``messages`` values into flat ``message_*`` columns.

    Each top-level entry (list position or dict key) ``k`` becomes either a
    single ``message_<k>`` column or, when any row holds a dict at that entry,
    one ``message_<k>_<field>`` column per dict field.
    """
    by_entry = pd.DataFrame(
        [_as_mapping(cell) for cell in messages], index=messages.index
    )

    parts = []
    for col in by_entry.columns:
        cells = by_entry[col]
        if cells.map(lambda cell: isinstance(cell, dict)).any():
            fields = pd.DataFrame(
                [_as_mapping(cell) for cell in cells], index=cells.index
            )
            parts.append(fields.add_prefix(f"message_{col}_"))
        else:
            parts.append(cells.rename(f"message_{col}"))

    if not parts:
        # Nothing to expand (e.g. every row had an empty or missing value).
        return pd.DataFrame(index=messages.index)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(parts, axis=1)


def process_jsonl(file, output_csv_path="expanded_messages_data.csv"):
    """
    Process an uploaded JSONL file, expand its 'messages' column, and save a CSV.

    Args:
        file: Iterable file-like object yielding one JSON document per line,
            as ``str`` or ``bytes``. Blank lines are skipped.
        output_csv_path: Where to write the CSV. Defaults to the file name
            this function has always used.

    Returns:
        str: Path to the expanded CSV file.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message names
            the 1-based line number.
    """
    # SECURITY: this used to call eval() on every line, which executes
    # arbitrary code from the uploaded file and also rejects the JSON literals
    # true/false/null. Lines are now parsed strictly as JSON, so input written
    # as Python literals (e.g. single-quoted strings) is no longer accepted.
    records = []
    for line_number, raw_line in enumerate(file, start=1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as err:
            raise ValueError(
                f"Invalid JSON on line {line_number}: {err}"
            ) from err

    df = pd.DataFrame(records)

    # Expand the 'messages' column if it exists
    if "messages" in df.columns:
        df = pd.concat(
            [df.drop(columns=["messages"]), _expand_messages(df["messages"])],
            axis=1,
        )

    # Save the expanded DataFrame to a CSV file
    df.to_csv(output_csv_path, index=False)
    return output_csv_path
# Streamlit app: upload a JSONL file, convert it, and offer the CSV for download.
st.title("JSONL to CSV Converter with Message Expansion")
st.write("Upload a JSONL file, and download the processed CSV file with the `messages` column expanded.")

uploaded_file = st.file_uploader("Upload your JSONL file", type=["jsonl"])

if uploaded_file is not None:
    st.success("File uploaded successfully!")

    # Process the uploaded file
    output_csv_path = process_jsonl(uploaded_file)

    # Read the CSV into memory and close the handle before deleting the file.
    # Previously an open file object was handed to the download button and
    # never closed, leaking the handle and risking a failed os.remove on
    # platforms that lock open files.
    try:
        with open(output_csv_path, "rb") as csv_file:
            csv_bytes = csv_file.read()
    finally:
        # Ensure cleanup of the temporary file even if reading fails
        if os.path.exists(output_csv_path):
            os.remove(output_csv_path)

    # Provide download link
    st.download_button(
        label="Download Expanded CSV",
        data=csv_bytes,
        file_name="expanded_messages_data.csv",
        mime="text/csv",
    )