File size: 3,319 Bytes
b5ff3a4
 
f89e539
b5ff3a4
e85bb51
 
a31425b
b5ff3a4
65c9ca0
f89e539
 
b5ff3a4
e85bb51
 
b5ff3a4
e85bb51
f89e539
e85bb51
416c906
4cf6559
 
 
a31425b
 
4cf6559
e85bb51
 
a31425b
 
 
 
 
e85bb51
a31425b
e85bb51
 
 
 
 
a0a9509
 
 
e85bb51
a31425b
e85bb51
 
 
5fde344
e85bb51
a0a9509
e85bb51
 
a47efdc
e85bb51
a31425b
5fde344
 
a31425b
 
e85bb51
 
 
 
 
a47efdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0a9509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a47efdc
a0a9509
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
import re

# Where the DuckDB file is published on the Hugging Face Hub, and the path
# it is expected at locally.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# The page layout is set before any other Streamlit call in this script.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database from the Hub only when no local copy exists yet.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    st.write("Downloading from HF Hub...")
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        # Download into the working directory, which is where LOCAL_PATH points.
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    hf_hub_download(**download_options)
    st.success("Download complete.")

# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB database file in read-only mode.

    The function is decorated with ``st.cache_resource`` (spinner disabled),
    so Streamlit hands back the cached connection instead of opening a new
    one each time the script calls it.

    Returns:
        The connection object returned by ``duckdb.connect`` for
        ``LOCAL_PATH``.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection

# Obtain the shared connection. Only the connection attempt itself is
# guarded, so a failure while drawing the success message is not
# misreported as a database problem.
try:
    con = get_duckdb_connection()
except Exception as exc:
    # Top-level boundary: show the problem in the page and halt this run,
    # since nothing below can work without a connection.
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
else:
    st.success("Connected to DuckDB.")

# Search
#
# The connection is opened read-only, so no full-text index can be created
# from this script. Matching is therefore done with a plain substring test:
# both sides are lower-cased to make it case-insensitive, and `contains`
# treats the search term literally (no wildcard characters to escape).
_RESULT_COLUMNS = (
    "id, channel, video_id, video_title, speaker, start_time, end_time, "
    "text, pos_tags, upload_date, audio"
)

query = st.text_input("Search text (case-insensitive)", "").strip()

if query:
    # The search term is passed as a bound parameter, never interpolated
    # into the SQL string.
    sql = f"""
        SELECT {_RESULT_COLUMNS}
        FROM data
        WHERE contains(lower(text), lower(?))
        LIMIT 100
    """
    df = con.execute(sql, [query]).df()
else:
    # No search term: show the first 100 rows as a preview.
    sql = f"""
        SELECT {_RESULT_COLUMNS}
        FROM data
        LIMIT 100
    """
    df = con.execute(sql).df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    def _audio_bytes(value):
        """Return the audio payload of one cell as ``bytes``, or ``None``.

        Args:
            value: Raw content of the ``audio`` column. Bytes-like objects
                and lists of integers are accepted.

        Returns:
            The payload as ``bytes``, or ``None`` when the cell holds any
            other type or a list that cannot be converted to bytes.
        """
        if isinstance(value, (bytes, bytearray, memoryview)):
            return bytes(value)
        if isinstance(value, list):
            try:
                return bytes(value)
            except (TypeError, ValueError):
                # Non-integer items or values outside 0..255.
                return None
        return None

    # Tabular view: every column except the raw audio payload.
    column_order = ["id", "channel", "video_id", "video_title", "speaker", "start_time", "end_time", "upload_date", "text", "pos_tags"]

    st.markdown("### Full Table View (Sortable)")
    st.dataframe(df[column_order])

    st.markdown("### Audio Previews")
    for speaker, text, audio in zip(df["speaker"], df["text"], df["audio"]):
        clip = _audio_bytes(audio)
        if not clip:
            # Nothing playable in this row.
            continue
        # The text cell may be missing; fall back to an empty snippet.
        snippet = "" if pd.isna(text) else str(text)[:80]
        # Rendered without unsafe_allow_html so markup stored in the
        # database is shown as text rather than interpreted by the browser.
        st.markdown(f"**{speaker} | {snippet}**")
        # The bytes are handed to Streamlit directly, so no temporary files
        # are written and playback does not depend on file:// URLs.
        # NOTE(review): the clips are assumed to be MP3 data — confirm.
        st.audio(clip, format="audio/mpeg")