File size: 3,638 Bytes
b5ff3a4
 
f89e539
b5ff3a4
e85bb51
 
a671301
b5ff3a4
65c9ca0
f89e539
 
b5ff3a4
e85bb51
 
b5ff3a4
e85bb51
f89e539
e85bb51
416c906
4cf6559
 
 
a31425b
 
4cf6559
e85bb51
 
a31425b
 
 
 
 
e85bb51
a31425b
e85bb51
 
 
 
 
135e6d1
7974c3c
e85bb51
135e6d1
e85bb51
7974c3c
a671301
e85bb51
7974c3c
e85bb51
 
7974c3c
e85bb51
a31425b
a671301
5fde344
a31425b
 
e85bb51
 
 
 
 
a47efdc
c173a94
a671301
 
 
 
 
 
 
 
 
 
c173a94
a671301
 
 
c173a94
 
a0a9509
7974c3c
895e3ae
c173a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7974c3c
 
c173a94
7974c3c
c173a94
7974c3c
 
 
9302019
c173a94
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
import re

# Where the DuckDB file lives on the Hugging Face Hub and where it is kept locally.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database only when no local copy exists; later runs reuse the file.
# NOTE(review): newer huggingface_hub releases may deprecate
# `local_dir_use_symlinks` -- confirm against the pinned version before removing it.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    st.write("Downloading from HF Hub...")
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    hf_hub_download(**download_options)
    st.success("Download complete.")

# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB file in read-only mode.

    Wrapped in ``st.cache_resource`` so the connection object is created once
    and reused on later script reruns instead of being reopened each time.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection

# Obtain the shared connection; on failure show the error and halt the script
# so nothing below runs without a database.
try:
    con = get_duckdb_connection()
except Exception as exc:
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
else:
    st.success("Connected to DuckDB.")

# Search input
query = st.text_input("Search text (case-insensitive, exact substring match)", "").strip()

# Build query
# Both branches select the same columns, so the SELECT text is written once
# and only the filter differs.
base_sql = """
    SELECT id, channel, video_id, speaker, start_time, end_time, upload_date, text, pos_tags, audio
    FROM data
"""
if query:
    # contains() matches the search term literally. The earlier LIKE '%...%'
    # pattern let "%" and "_" typed by the user act as wildcards, which
    # contradicted the "exact substring match" promise in the input label.
    # The term is still passed as a bound parameter, never spliced into SQL.
    sql = base_sql + "    WHERE contains(LOWER(text), LOWER(?))\n    LIMIT 100"
    df = con.execute(sql, [query]).df()
else:
    # No search term: show the first 100 rows the table yields.
    df = con.execute(base_sql + "    LIMIT 100").df()

st.markdown(f"### Showing {len(df)} results")

# Warn when the search matched nothing; otherwise fall through to the branch
# that builds the audio players and renders the result rows.
if len(df) == 0:
    st.warning("No matches found.")
else:
    def render_audio_tag(audio_bytes, index):
        """Return an inline HTML ``<audio>`` player for one row's MP3 payload.

        The clip is embedded as a base64 ``data:`` URI. The earlier approach
        wrote each clip to a ``delete=False`` temporary file and pointed the
        player at a ``file://`` path: those files were never removed (a leak on
        every rerun), and a browser showing a page served over HTTP will not
        load ``file://`` resources, which in any case name a path on the
        server's disk rather than the viewer's.

        Args:
            audio_bytes: Raw audio as ``bytes``, ``bytearray`` or
                ``memoryview``, or a list of integer byte values (0-255).
            index: Row position. Accepted for call compatibility; not used.

        Returns:
            The ``<audio>`` markup, or ``""`` when the payload is missing,
            empty, of an unsupported type, or cannot be converted to bytes.
        """
        import base64  # local import keeps this block self-contained

        if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
            return ""
        try:
            data = bytes(audio_bytes)
        except (TypeError, ValueError):
            # A list holding non-integers or values outside 0-255 is not audio.
            return ""
        if not data:
            # Nothing to play; an empty player would only mislead the viewer.
            return ""

        encoded = base64.b64encode(data).decode("ascii")
        return (
            '<audio controls preload="metadata" style="height:20px;width:100px;">'
            f'<source src="data:audio/mpeg;base64,{encoded}" type="audio/mpeg">'
            "</audio>"
        )

    # Build one player snippet per row (row position passed as the second
    # argument), attach them as a new column, then produce a copy of the frame
    # without the raw audio bytes for display.
    row_positions = range(len(df))
    df["audio_player"] = list(map(render_audio_tag, df["audio"], row_positions))
    df_display = df.drop("audio", axis=1)

    from streamlit.components.v1 import html

    def display_html_table(df):
        """Render the result rows as an HTML ``<table>`` string.

        Data cells are HTML-escaped so characters such as ``<`` or ``&`` in a
        transcript show up as text instead of corrupting (or injecting into)
        the markup. The ``audio_player`` cell is inserted verbatim because it
        is expected to already contain HTML.

        Args:
            df: DataFrame with the nine columns named in ``text_columns`` plus
                an ``audio_player`` column holding ready-made HTML snippets.

        Returns:
            The complete table markup as a single string.
        """
        # Imported locally under an alias: this file binds the bare name
        # ``html`` to Streamlit's component helper, so the standard-library
        # module cannot be reached through that name here.
        from html import escape as escape_html

        text_columns = [
            "id", "channel", "video_id", "speaker", "start_time",
            "end_time", "upload_date", "text", "pos_tags",
        ]

        # Collect fragments and join once rather than growing a string with +=.
        parts = ["""
        <table border='1' style='border-collapse:collapse;width:100%;font-size:13px;'>
            <thead>
                <tr>
                    <th style='width:8em;'>id</th>
                    <th>channel</th>
                    <th>video_id</th>
                    <th>speaker</th>
                    <th>start_time</th>
                    <th>end_time</th>
                    <th>upload_date</th>
                    <th>text</th>
                    <th>pos_tags</th>
                    <th>audio</th>
                </tr>
            </thead>
            <tbody>
        """]
        for _, row in df.iterrows():
            cells = [f"<td>{escape_html(str(row[col]))}</td>" for col in text_columns]
            # Deliberately not escaped: this cell carries the player markup.
            cells.append(f"<td>{row['audio_player']}</td>")
            parts.append("<tr>" + "".join(cells) + "</tr>")
        parts.append("</tbody></table>")
        return "".join(parts)

    # Section heading, then the table embedded in a scrollable, 800px-high
    # component frame.
    # NOTE(review): the heading advertises sorting, but the table markup built
    # in this file carries no sorting behavior -- confirm the intended wording.
    st.markdown("### Results Table (Sortable with Audio Column)")
    results_markup = display_html_table(df_display)
    html(results_markup, scrolling=True, height=800)