"""Streamlit viewer for the YCSEP audio dataset stored in a DuckDB file.

The script downloads the database from the Hugging Face Hub when it is not
present locally, runs a case-insensitive substring search over the ``text``
column, and renders up to 100 rows as an HTML table with an inline audio
player per row.
"""

import base64
import html
import os
import tempfile

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download

HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Character used to neutralise LIKE wildcards in user input. It must match
# the ESCAPE clause written in SEARCH_SQL below.
LIKE_ESCAPE_CHAR = "\\"

# NOTE(review): the stored audio format is not visible from this script;
# "audio/mpeg" is an assumption -- confirm against the dataset.
AUDIO_MIME_TYPE = "audio/mpeg"

DISPLAY_COLS = [
    "id", "channel", "video_id", "speaker", "start_time",
    "end_time", "upload_date", "text", "pos_tags", "Audio",
]

BROWSE_SQL = """
    SELECT id, channel, video_id, speaker, start_time, end_time,
           upload_date, text, pos_tags, audio
    FROM data
    LIMIT 100
"""

# The search term is bound as a parameter (never interpolated), and the
# ESCAPE clause lets us match literal '%' and '_' characters.
SEARCH_SQL = """
    SELECT id, channel, video_id, speaker, start_time, end_time,
           upload_date, text, pos_tags, audio
    FROM data
    WHERE text ILIKE ? ESCAPE '\\'
    LIMIT 100
"""

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")


def escape_like(term):
    """Return *term* with LIKE metacharacters escaped for a literal match.

    The escape character itself is handled first so that the backslashes
    added for ``%`` and ``_`` are not doubled afterwards.
    """
    for char in (LIKE_ESCAPE_CHAR, "%", "_"):
        term = term.replace(char, LIKE_ESCAPE_CHAR + char)
    return term


def get_audio_html(audio_bytes):
    """Return an HTML ``<audio>`` element embedding *audio_bytes* as a data URI.

    Accepts bytes-like objects or a list of integer byte values. Any other
    value (for example a missing cell), or a list that cannot be converted
    to bytes, yields an empty string so the table still renders.
    """
    if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
        return ""
    try:
        data = bytes(audio_bytes)
    except (TypeError, ValueError):
        # A list holding non-integers or out-of-range values.
        return ""
    b64 = base64.b64encode(data).decode("ascii")
    return f'<audio controls src="data:{AUDIO_MIME_TYPE};base64,{b64}"></audio>'


def escape_cell(value):
    """HTML-escape string cells; leave every other value untouched."""
    return html.escape(value) if isinstance(value, str) else value


# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB file read-only; cached by Streamlit."""
    return duckdb.connect(LOCAL_PATH, read_only=True)


# Download database if missing
if not os.path.exists(LOCAL_PATH):
    with st.spinner("Downloading from HF Hub..."):
        hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",
            local_dir_use_symlinks=False,
        )
    st.success("Download complete.")

try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as e:
    st.error(f"DuckDB connection failed: {e}")
    st.stop()

# Search input
query = st.text_input(
    "Search text (case-insensitive, exact substring match)", ""
).strip()

if query:
    pattern = f"%{escape_like(query)}%"
    df = con.execute(SEARCH_SQL, [pattern]).df()
else:
    df = con.execute(BROWSE_SQL).df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    df["Audio"] = df["audio"].apply(get_audio_html)

    # Reorder columns so "Audio" is last; selecting DISPLAY_COLS also drops
    # the raw "audio" bytes column.
    df = df[DISPLAY_COLS]

    # The table is rendered with escape=False so the <audio> markup works,
    # which means every other string cell must be escaped by hand to keep
    # dataset text from being interpreted as HTML.
    for col in DISPLAY_COLS:
        if col != "Audio":
            df[col] = df[col].map(escape_cell)

    # Show table with HTML rendering for audio
    st.markdown("### Results Table (Sortable with Audio Column)")
    st.write("(Scroll right to view audio controls)")

    # pandas truncates cells longer than display.max_colwidth (default 50)
    # in to_html, which would cut off both the transcript text and the
    # embedded audio markup; lift the limit for this rendering only.
    with pd.option_context("display.max_colwidth", None):
        table_html = df.to_html(escape=False, index=False)
    st.write(table_html, unsafe_allow_html=True)