"""Streamlit viewer for the YCSEP audio dataset stored in a DuckDB file.

The script downloads the database from the Hugging Face Hub when it is not
present locally, opens a cached read-only DuckDB connection, runs an optional
case-insensitive substring search over the ``text`` column, and shows the
matching rows in a table followed by one audio player per row.
"""
import os
import re  # noqa: F401  (kept: other parts of the file may rely on it)
import tempfile  # noqa: F401  (kept: other parts of the file may rely on it)
from typing import Optional

import duckdb
import pandas as pd  # noqa: F401  (kept: other parts of the file may rely on it)
import streamlit as st
from huggingface_hub import hf_hub_download

HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Columns fetched for every result row; shared by the filtered and the
# unfiltered query so the two can never drift apart.
_RESULT_COLUMNS = (
    "id, channel, video_id, speaker, start_time, end_time, "
    "upload_date, text, pos_tags, audio"
)
_ROW_LIMIT = 100
# Character used to neutralise LIKE wildcards typed by the user.
_LIKE_ESCAPE = "\\"

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Download database if missing
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    st.success("Download complete.")


# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB file read-only.

    Decorated with ``st.cache_resource`` so the connection object is created
    once and reused across Streamlit reruns.
    """
    return duckdb.connect(LOCAL_PATH, read_only=True)


def _escape_like(term: str) -> str:
    """Escape LIKE metacharacters so *term* is matched literally.

    The escape character itself is handled first; otherwise the backslashes
    inserted for ``%`` and ``_`` would be escaped a second time.
    """
    for char in (_LIKE_ESCAPE, "%", "_"):
        term = term.replace(char, _LIKE_ESCAPE + char)
    return term


def _to_audio_bytes(value) -> Optional[bytes]:
    """Convert a value from the ``audio`` column into ``bytes``.

    Accepts ``bytes``, ``bytearray``, ``memoryview`` and lists of integers.
    Returns ``None`` for any other type, for empty payloads, and for lists
    that cannot be converted to bytes.
    """
    if not isinstance(value, (bytes, bytearray, memoryview, list)):
        return None
    try:
        payload = bytes(value)
    except (TypeError, ValueError):
        # A list holding non-integers or values outside 0..255.
        return None
    return payload or None


try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as e:
    # Top-level boundary: report the failure in the UI and halt the script.
    st.error(f"DuckDB connection failed: {e}")
    st.stop()

# Search
query = st.text_input("Search text (case-insensitive)", "").strip()

if query:
    # Wildcards typed by the user are escaped so "100%" or "a_b" match
    # literally; the surrounding % still gives substring semantics.
    query_like = f"%{_escape_like(query.lower())}%"
    sql = f"""
        SELECT {_RESULT_COLUMNS}
        FROM data
        WHERE LOWER(text) LIKE ? ESCAPE '{_LIKE_ESCAPE}'
        LIMIT {_ROW_LIMIT}
    """
    df = con.execute(sql, [query_like]).df()
else:
    df = con.execute(
        f"""
        SELECT {_RESULT_COLUMNS}
        FROM data
        LIMIT {_ROW_LIMIT}
        """
    ).df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    st.markdown("### Results Table (Sortable with Audio Column)")
    st.markdown("(Scroll right to view audio controls)")
    # The raw audio blobs are left out of the table; they are played below.
    st.dataframe(df.drop(columns=["audio"]))

    st.markdown("### Audio Previews")
    for speaker, text, raw_audio in zip(df["speaker"], df["text"], df["audio"]):
        st.markdown(f"**{speaker} | {text}**")
        audio_bytes = _to_audio_bytes(raw_audio)
        if audio_bytes is None:
            st.caption("Audio unavailable for this row.")
            continue
        # Hand the bytes straight to Streamlit instead of writing temp files
        # that were never cleaned up. The MP3 MIME type is assumed from the
        # ".mp3" suffix the script previously used - confirm against the data.
        st.audio(audio_bytes, format="audio/mpeg")