File size: 1,640 Bytes
b5ff3a4
 
f89e539
b5ff3a4
 
af343b5
f89e539
 
b5ff3a4
 
 
af343b5
f89e539
af343b5
 
 
 
 
 
 
 
 
 
 
 
b5ff3a4
af343b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548bf4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download

# Hugging Face dataset repository that hosts the database file; point this at
# a different dataset repo if the file lives elsewhere.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
# Name of the DuckDB file requested from that repository.
HF_FILENAME = "ycsep.duckdb"
# Where the database is expected on disk. The download step writes into the
# current directory under HF_FILENAME, so the path is derived from it rather
# than repeated as a second literal that could drift out of sync.
LOCAL_PATH = f"./{HF_FILENAME}"

st.title("YCSEP Audio Dataset Viewer")

# Make sure the database file is on disk, fetching it from the Hugging Face
# Hub when it is not already there.
if os.path.exists(LOCAL_PATH):
    st.write("Found local DuckDB file.")
else:
    st.write("Database not found locally. Downloading from HF Hub...")
    try:
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            filename=HF_FILENAME,
            repo_type="dataset",
            local_dir=".",  # download target: the current working directory
        )
        st.success(f"Downloaded: {downloaded_path}")
    except Exception as exc:
        # Nothing can be shown without the database: report and halt this run.
        st.error(f"Download failed: {exc}")
        st.stop()

# Open the database file with read_only=True; if that fails, show the error
# and halt this run, since everything below needs the connection.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as exc:
    failure_message = f"Failed to connect to DuckDB: {exc}"
    st.error(failure_message)
    st.stop()

# Preview the dataset: the LIMIT keeps the fetch to the first 10 rows instead
# of pulling the whole table into memory.
st.write("Querying first 10 rows...")

try:
    # Only the query is guarded here, so a rendering problem further down is
    # not misreported as a database failure.
    rows = con.execute("SELECT speaker, text, audio FROM data LIMIT 10").fetchall()
except Exception as e:
    st.error(f"DuckDB query failed: {e}")
else:
    if not rows:
        # An empty table would otherwise render nothing at all.
        st.info("No rows found.")
    for speaker, text, audio in rows:
        st.markdown(f"**Speaker:** {speaker}")
        st.markdown(f"**Text:** {text}")
        # Only absolute http(s) URLs are handed to the audio player; a bare
        # "http" prefix check would also accept strings that are not URLs.
        if isinstance(audio, str) and audio.startswith(("http://", "https://")):
            st.audio(audio, format="audio/mp3")
        else:
            st.warning("Audio not available")
        st.markdown("---")
finally:
    # Streamlit re-executes the script on each interaction, and every run
    # opens a new connection; close it so no handle outlives the run.
    con.close()