import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download

# Hugging Face dataset repository that hosts the DuckDB file.
HF_REPO_ID = "stcoats/temp-duckdb-upload"   # Replace with your actual dataset repo if needed
# Name of the database file inside that repository.
HF_FILENAME = "ycsep.duckdb"
# Local copy of the database. Derived from HF_FILENAME so the existence check
# and the download target (current directory + filename) cannot drift apart.
LOCAL_PATH = f"./{HF_FILENAME}"

st.title("YCSEP Audio Dataset Viewer")

# Reuse the local database file when present; otherwise fetch it from the
# Hugging Face Hub into the current directory and halt the app on failure.
if os.path.exists(LOCAL_PATH):
    st.write("Found local DuckDB file.")
else:
    st.write("Database not found locally. Downloading from HF Hub...")
    try:
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=HF_FILENAME,
            repo_type="dataset",
            local_dir=".",  # places the file at ./<HF_FILENAME>
        )
        st.success(f"Downloaded: {downloaded_path}")
    except Exception as exc:
        # Nothing further can work without the database, so report and stop.
        st.error(f"Download failed: {exc}")
        st.stop()

# Open the database read-only. Everything below needs the connection, so a
# failure here is reported and the script is halted.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
except Exception as e:
    st.error(f"Failed to connect to DuckDB: {e}")
    st.stop()
else:
    st.success("Connected to DuckDB.")

# Query only the first page so the whole table is never loaded into memory
st.write("Querying first 10 rows...")

try:
    # The connection is needed for this single query only. Leaving the `with`
    # block closes it, so each run of the script releases its handle on the
    # database file instead of leaking it.
    with con:
        rows = con.execute("SELECT speaker, text, audio FROM data LIMIT 10").fetchall()

    if not rows:
        # Make an empty table visible instead of silently rendering nothing.
        st.info("The query returned no rows.")

    for speaker, text, audio in rows:
        st.markdown(f"**Speaker:** {speaker}")
        st.markdown(f"**Text:** {text}")
        # Only string values that look like URLs are handed to the player;
        # anything else (including NULL) is reported as unavailable.
        if isinstance(audio, str) and audio.startswith("http"):
            st.audio(audio, format="audio/mp3")
        else:
            st.warning("Audio not available")
        st.markdown("---")
except Exception as e:
    st.error(f"DuckDB query failed: {e}")