# Streamlit viewer for the YCSEP audio dataset.
#
# Flow: make sure the DuckDB file exists locally (downloading it from the
# Hugging Face Hub when it is missing), open it read-only, fetch the first
# 10 rows of the `data` table, and render speaker, text and audio per row.

import os

import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download

HF_REPO_ID = "stcoats/temp-duckdb-upload"  # Replace with your actual dataset repo if needed
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download the database only when it is not already on disk.
if not os.path.exists(LOCAL_PATH):
    st.write("Database not found locally. Downloading from HF Hub...")
    try:
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",  # Saves as ./ycsep.duckdb automatically
        )
    except Exception as e:
        # Top-level UI boundary: report the failure and halt this script run.
        st.error(f"Download failed: {e}")
        st.stop()
    else:
        st.success(f"Downloaded: {downloaded_path}")
else:
    st.write("Found local DuckDB file.")

# Open the database read-only; the viewer never writes to it.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
except Exception as e:
    st.error(f"Failed to connect to DuckDB: {e}")
    st.stop()
else:
    st.success("Connected to DuckDB.")

# Query only the first page (LIMIT 10) rather than loading the whole table.
st.write("Querying first 10 rows...")
rows = []
try:
    rows = con.execute("SELECT speaker, text, audio FROM data LIMIT 10").fetchall()
except Exception as e:
    st.error(f"DuckDB query failed: {e}")
finally:
    # The rows are already materialized as Python tuples, so the connection
    # can be released immediately. Streamlit re-executes this script on every
    # interaction; without the close each rerun would leave a handle open.
    con.close()

# Rendering is kept outside the query's try block so that a UI error is not
# mislabelled as a DuckDB query failure.
for speaker, text, audio in rows:
    st.markdown(f"**Speaker:** {speaker}")
    st.markdown(f"**Text:** {text}")
    # Only string values that look like URLs are passed to the audio player.
    if isinstance(audio, str) and audio.startswith("http"):
        st.audio(audio, format="audio/mp3")
    else:
        st.warning("Audio not available")
    st.markdown("---")