File size: 2,915 Bytes
b5ff3a4
 
f89e539
b5ff3a4
e85bb51
 
a5de086
b5ff3a4
65c9ca0
f89e539
 
b5ff3a4
e85bb51
 
b5ff3a4
e85bb51
f89e539
a5de086
 
 
 
 
 
 
 
e85bb51
 
a31425b
 
 
 
 
e85bb51
a31425b
e85bb51
 
 
 
 
135e6d1
7974c3c
e85bb51
4c9a1f3
 
 
 
e85bb51
56677fa
a671301
e85bb51
4c9a1f3
e85bb51
 
56677fa
e85bb51
a31425b
a671301
5fde344
a31425b
 
e85bb51
 
 
 
 
a47efdc
a5de086
a671301
 
 
 
 
 
 
a5de086
 
a671301
 
 
a5de086
 
a0a9509
4c9a1f3
 
 
9302019
4c9a1f3
c173a94
4c9a1f3
 
56677fa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import base64
import html
import os
import tempfile

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download

# Location of the database on the Hugging Face Hub (fetched below with
# repo_type="dataset") and the path where this script looks for a local copy.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Page chrome: wide layout plus the page title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database only when no file exists yet at LOCAL_PATH.
# NOTE(review): the download targets local_dir="." with HF_FILENAME, which is
# expected to produce the file at LOCAL_PATH -- keep the constants in sync.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    with st.spinner("Downloading from HF Hub..."):
        hf_hub_download(**download_options)
    st.success("Download complete.")

# Connect (only once): the handle is wrapped in Streamlit's resource cache.
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the DuckDB file at LOCAL_PATH in read-only mode and return the connection.

    The function is decorated with ``st.cache_resource`` (spinner disabled),
    so the returned connection object is managed by Streamlit's cache.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection

# Obtain the database connection; on any failure, show the error in the UI
# and halt this script run so nothing below tries to use a missing handle.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    failure_message = f"DuckDB connection failed: {connect_error}"
    st.error(failure_message)
    st.stop()

# Search input
query = st.text_input("Search text (case-insensitive, exact substring match)", "").strip()

# Both branches read the same columns from the same table; only the filter differs.
base_select = """
    SELECT id, channel, video_id, speaker, start_time, end_time, upload_date, text, pos_tags, audio
    FROM data
"""

if query:
    # The search text is user input, so it is handed to DuckDB as a bound
    # parameter instead of being spliced into the SQL string.
    # LIKE wildcards (% and _) and the escape character itself are
    # backslash-escaped so they match literally -- otherwise a search for
    # "50%" or "a_b" would not be the exact substring match the label promises.
    like_escapes = str.maketrans({"\\": "\\\\", "%": "\\%", "_": "\\_"})
    pattern = f"%{query.translate(like_escapes)}%"
    df = con.execute(
        # In the SQL text the escape clause reads: ESCAPE '\'
        base_select + "WHERE text ILIKE ? ESCAPE '\\'\nLIMIT 100",
        [pattern],
    ).df()
else:
    # No search text: show the first 100 rows unfiltered.
    df = con.execute(base_select + "LIMIT 100").df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    def get_audio_html(audio_bytes):
        """Return an inline ``<audio>`` element for one cell of the audio column.

        Accepts ``bytes``, ``bytearray``, ``memoryview`` or a ``list`` (converted
        with ``bytes(...)``). The payload is base64-encoded into a ``data:`` URI.
        Any other type, or a value that cannot be converted to bytes, yields an
        empty string so the row is still shown, just without a player.
        """
        if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
            return ""
        try:
            data = bytes(audio_bytes)
        except (TypeError, ValueError):
            # e.g. a list holding non-integers or values outside 0..255
            return ""
        b64 = base64.b64encode(data).decode("utf-8")
        return f'<audio controls preload="metadata" style="height:20px;width:120px;"><source src="data:audio/mp3;base64,{b64}" type="audio/mpeg"></audio>'

    def _escape_cell(value):
        """HTML-escape string cells; return every other value unchanged."""
        return html.escape(value) if isinstance(value, str) else value

    audio_html = df["audio"].apply(get_audio_html)

    # Columns shown in the table, in display order; "Audio" is appended last.
    text_cols = ["id", "channel", "video_id", "speaker", "start_time", "end_time", "upload_date", "text", "pos_tags"]

    # The table is rendered with escape=False so the <audio> markup survives,
    # which means nothing else gets escaped by pandas either. Escape the string
    # cells of the data columns here so dataset text containing "<" or "&" is
    # displayed literally instead of being interpreted as markup.
    # NOTE(review): only str cells are escaped; if a column (e.g. pos_tags)
    # holds list-like values, their elements are rendered by pandas as-is.
    df = df[text_cols].apply(lambda col: col.map(_escape_cell))
    df["Audio"] = audio_html

    # Show table with HTML rendering for audio
    st.markdown("### Results Table (Sortable with Audio Column)")
    st.write("(Scroll right to view audio controls)")
    st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)