File size: 2,209 Bytes
b5ff3a4
 
f89e539
b5ff3a4
e85bb51
 
b5ff3a4
65c9ca0
f89e539
 
b5ff3a4
e85bb51
 
b5ff3a4
e85bb51
f89e539
e85bb51
416c906
4cf6559
 
 
 
 
e85bb51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548bf4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile

# Hub coordinates of the DuckDB file, and the local path the app checks/opens.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Page chrome: wide layout plus the app title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database from the Hub only when no local copy exists yet.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    st.write("Downloading from HF Hub...")
    # local_dir="." places the file in the working directory, which is
    # where LOCAL_PATH points.
    hf_hub_download(
        repo_id=HF_REPO_ID, filename=HF_FILENAME, repo_type="dataset", local_dir="."
    )
    st.success("Download complete.")

# Open the local database read-only; on failure, report it and halt the script
# so nothing below runs without a connection.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as exc:
    failure_message = f"DuckDB connection failed: {exc}"
    st.error(failure_message)
    st.stop()

# Search
# Free-text filter applied to both the speaker and text columns. Matching is
# case-insensitive because the input and both columns are lower-cased.
query = st.text_input("Search text or speaker", "").strip().lower()

if query:
    # contains() performs a literal substring test, so "%" and "_" typed by
    # the user are matched as-is instead of acting as LIKE wildcards.
    sql = """
        SELECT speaker, text, audio
        FROM data
        WHERE contains(LOWER(CAST(speaker AS VARCHAR)), ?)
           OR contains(LOWER(CAST(text AS VARCHAR)), ?)
        LIMIT 100
    """
    df = con.execute(sql, [query, query]).df()
else:
    # No filter: show the first 100 rows.
    df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")

def _coerce_audio_bytes(raw):
    """Return *raw* as ``bytes``, or ``None`` if it is not a supported type.

    Accepts ``bytes``, ``bytearray``, ``memoryview`` and ``list`` (a list of
    integer byte values). ``bytes(list)`` raises ``ValueError``/``TypeError``
    for out-of-range or non-integer items; the caller handles that.
    """
    if isinstance(raw, (bytes, bytearray, memoryview, list)):
        return bytes(raw)
    return None


# Show table with inline audio players: one three-column row per result
# (speaker | text | audio).
for _, row in df.iterrows():
    speaker_col, text_col, audio_col = st.columns([2, 5, 3])
    speaker_col.markdown(f"**{row['speaker']}**")
    text_col.markdown(row["text"])

    # Best-effort per row: a bad audio value is reported in its own cell and
    # does not stop the remaining rows from rendering.
    try:
        audio_bytes = _coerce_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes to the player directly. Writing them to a
            # NamedTemporaryFile(delete=False) left a file behind for every
            # row on every rerun.
            audio_col.audio(audio_bytes, format="audio/mp3")
        else:
            audio_col.warning("Audio missing or invalid format.")
    except Exception as e:
        audio_col.error(f"Audio error: {e}")