File size: 2,209 Bytes
b5ff3a4
 
f89e539
b5ff3a4
4cf6559
 
b5ff3a4
65c9ca0
f89e539
 
b5ff3a4
416c906
b5ff3a4
 
65c9ca0
f89e539
65c9ca0
416c906
4cf6559
 
 
 
 
416c906
 
65c9ca0
 
 
 
 
 
 
416c906
65c9ca0
 
 
416c906
65c9ca0
416c906
 
 
65c9ca0
416c906
4cf6559
65c9ca0
4cf6559
416c906
 
65c9ca0
 
 
 
 
 
 
416c906
 
 
 
65c9ca0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548bf4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile

# Where the DuckDB file comes from (Hugging Face repo id and filename) and the
# local path that is checked for an existing copy and later opened.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Page chrome: wide layout and the app title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database from the Hub only when no local copy exists yet.
database_on_disk = os.path.exists(LOCAL_PATH)
if not database_on_disk:
    st.write("Downloading from HF Hub...")
    download_request = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
    }
    hf_hub_download(**download_request)
    st.success("Download complete.")

# Open the database file read-only. If anything in this step fails, show the
# error in the page and halt the script so later code never runs without `con`.
try:
    con = duckdb.connect(database=LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    st.error(f"DuckDB connection failed: {connect_error}")
    st.stop()

# Search
# Read the search term and normalise it once; the SQL below lower-cases the
# columns, so lower-casing the term makes the match case-insensitive.
query = st.text_input("Search text or speaker", "").strip().lower()

if query:
    # contains() performs a literal substring match, so "%" and "_" typed by
    # the user are searched for as-is instead of acting as LIKE wildcards
    # (previously a search for "_" matched every non-empty row).
    # The explicit VARCHAR cast pins the parameter type for the string overload.
    sql = """
        SELECT speaker, text, audio
        FROM data
        WHERE contains(LOWER(CAST(speaker AS VARCHAR)), CAST(? AS VARCHAR))
           OR contains(LOWER(CAST(text AS VARCHAR)), CAST(? AS VARCHAR))
        LIMIT 100
    """
    df = con.execute(sql, [query, query]).df()
else:
    # No search term: show up to 100 rows without filtering.
    df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()

# Report how many rows came back and flag an empty result set.
result_count = len(df)
st.markdown(f"### Showing {result_count} results")

if not result_count:
    st.warning("No matches found.")

# Show table with inline audio players
# One three-column strip per result: speaker, transcript text, audio player.
for _, row in df.iterrows():
    speaker_col, text_col, audio_col = st.columns([2, 5, 3])
    speaker_col.markdown(f"**{row['speaker']}**")
    text_col.markdown(row["text"])

    audio_data = row["audio"]
    # Best-effort per row: a bad audio value is reported in its own cell and
    # must not stop the remaining rows from rendering.
    try:
        # Bytes-like values and list[int] (DuckDB sometimes gives list[int])
        # both convert via bytes(); anything else is treated as missing.
        if isinstance(audio_data, (bytes, bytearray, memoryview, list)):
            audio_bytes = bytes(audio_data)
        else:
            audio_bytes = None

        if audio_bytes:
            # Hand the bytes straight to the player. The previous version wrote
            # every clip to a NamedTemporaryFile(delete=False) that was never
            # removed, leaking one file per row on every rerun.
            audio_col.audio(audio_bytes, format="audio/mp3")
        else:
            audio_col.warning("Audio missing or invalid format.")
    except Exception as e:
        audio_col.error(f"Audio error: {e}")