"""Streamlit viewer for the YCSEP audio dataset stored in a DuckDB file.

The script downloads the database from the Hugging Face Hub when it is not
present locally, runs an optional case-insensitive substring search over the
``text`` column, and renders up to ``RESULT_LIMIT`` rows with their metadata
and an inline audio player.
"""
import html
import os
import re
import tempfile  # no longer used below; kept so the file's import set is unchanged

import duckdb
import pandas as pd  # not referenced directly; kept from the original import set
import streamlit as st
from huggingface_hub import hf_hub_download

HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Maximum number of rows fetched and rendered per run.
RESULT_LIMIT = 100

# Shared by the filtered and unfiltered queries so the column list is
# written only once.
SELECT_CLAUSE = (
    "SELECT id, channel, video_id, video_title, speaker, start_time, "
    "end_time, text, pos_tags, upload_date, audio FROM data"
)


def _highlight_matches(text, query):
    """Return *text* as HTML with every occurrence of *query* in ``<mark>``.

    Matching is literal and case-insensitive. All text segments are
    HTML-escaped, so the result is safe to render with
    ``unsafe_allow_html=True`` even when the stored text contains markup.

    Args:
        text: The value to display; ``None`` is rendered as an empty string
            and other non-string values are converted with ``str``.
        query: The search string; when empty, the text is only escaped.

    Returns:
        An HTML fragment as a string.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    if not query:
        return html.escape(text, quote=False)

    pattern = re.compile(re.escape(query), flags=re.IGNORECASE)
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        # Escape the gap and the match separately so the <mark> tags are
        # the only raw HTML in the output.
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        pieces.append(f"<mark>{html.escape(match.group(0), quote=False)}</mark>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)


def _to_audio_bytes(audio_data):
    """Convert a value from the ``audio`` column to ``bytes``.

    Args:
        audio_data: A bytes-like object or a list of integers (DuckDB
            sometimes returns the blob as ``list[int]``).

    Returns:
        The audio as ``bytes``, or ``None`` when the value has another type.

    Raises:
        ValueError: If a list contains integers outside ``range(256)``.
        TypeError: If a list contains non-integer items.
    """
    if isinstance(audio_data, (bytes, bytearray, memoryview, list)):
        return bytes(audio_data)
    return None


st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Download database if missing
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    st.success("Download complete.")


# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB file read-only; cached by Streamlit across reruns."""
    return duckdb.connect(LOCAL_PATH, read_only=True)


try:
    # The cached connection object is shared by every session. Per the
    # DuckDB Python docs a connection is not thread-safe and each thread
    # should work through its own cursor, so take one per script run.
    con = get_duckdb_connection().cursor()
    st.success("Connected to DuckDB.")
except Exception as e:  # UI boundary: report the failure and halt the run
    st.error(f"DuckDB connection failed: {e}")
    st.stop()

# Search
query = st.text_input("Search text (case-insensitive)", "").strip()

if query:
    # contains() performs a literal substring test, so '%' and '_' typed by
    # the user are not treated as LIKE wildcards and the filter agrees with
    # the literal highlighting below. The query is still a bound parameter.
    sql = (
        f"{SELECT_CLAUSE} "
        "WHERE contains(LOWER(text), LOWER(?)) "
        f"LIMIT {RESULT_LIMIT}"
    )
    df = con.execute(sql, [query]).df()
else:
    df = con.execute(f"{SELECT_CLAUSE} LIMIT {RESULT_LIMIT}").df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")

# Show table with inline audio players
for _, row in df.iterrows():
    col1, col2, col3 = st.columns([3, 5, 2])

    col1.markdown(f"**ID:** {row['id']}")
    col1.markdown(f"**Channel:** {row['channel']}")
    col1.markdown(f"**Video ID:** {row['video_id']}")
    col1.markdown(f"**Video Title:** {row['video_title']}")
    col1.markdown(f"**Speaker:** {row['speaker']}")
    col1.markdown(f"**Start Time:** {row['start_time']}")
    col1.markdown(f"**End Time:** {row['end_time']}")
    col1.markdown(f"**Upload Date:** {row['upload_date']}")

    highlighted_text = _highlight_matches(row["text"], query)
    col2.markdown(f"**Text:** {highlighted_text}", unsafe_allow_html=True)
    col2.markdown(f"**POS tags:** {row['pos_tags']}")

    try:
        audio_bytes = _to_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes straight to the player; writing a
            # delete=False temp file per row left files behind on every
            # rerun.
            col3.audio(audio_bytes, format="audio/mp3")
        else:
            col3.warning("Audio missing or invalid format.")
    except Exception as e:  # best effort: one bad clip must not stop the page
        col3.error(f"Audio error: {e}")