File size: 2,028 Bytes
b5ff3a4
 
 
f89e539
b5ff3a4
f89e539
b5ff3a4
f89e539
 
 
b5ff3a4
 
 
f89e539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5ff3a4
f89e539
 
 
 
 
b5ff3a4
f89e539
 
 
b5ff3a4
f89e539
b5ff3a4
 
 
f89e539
 
 
 
 
 
 
 
 
 
 
b5ff3a4
f89e539
b5ff3a4
f89e539
 
b5ff3a4
f89e539
 
b5ff3a4
 
 
 
 
 
 
548bf4c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
import shutil

# Hugging Face dataset repository and file that hold the DuckDB database,
# and the location the rest of the script opens it from.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download if needed: the fetch is skipped whenever LOCAL_PATH already exists.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False
    )
    # Compare the files themselves, not the path spellings: the returned path
    # may be written differently from LOCAL_PATH (e.g. "ycsep.duckdb" or an
    # absolute path) while naming the same file, and shutil.copyfile raises
    # SameFileError when source and destination are the same file.
    already_in_place = os.path.exists(LOCAL_PATH) and os.path.samefile(path, LOCAL_PATH)
    if not already_in_place:
        shutil.copyfile(path, LOCAL_PATH)

# Connect (read-only: the viewer never writes to the database)
con = duckdb.connect(LOCAL_PATH, read_only=True)

# Get total row count. Streamlit re-executes the script on each interaction,
# so this query runs on every rerun rather than a single time.
total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
rows_per_page = 10
# Ceiling division of total_rows by rows_per_page, clamped to at least one
# page: for an empty table the formula yields 0, which would give the page
# selector below a max_value smaller than its min_value.
total_pages = max(1, (total_rows - 1) // rows_per_page + 1)
st.success(f"Total rows: {total_rows}")

# Select page (1-based) and convert it to a row offset for the SQL query
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
offset = (page - 1) * rows_per_page

# Optional: add search filter
query = st.text_input("Search text or speaker")

# The search text comes straight from the user, so it is passed to DuckDB as a
# bound parameter and never spliced into the SQL string. This closes the SQL
# injection hole and keeps searches containing quotes (e.g. "don't") working.
if query:
    # Case-insensitive substring match on either column.
    like_pattern = f"%{query}%"
    where_sql = "WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?"
    params = [like_pattern, like_pattern]
    filtered_rows = con.execute(
        f"SELECT COUNT(*) FROM data {where_sql}", params
    ).fetchone()[0]
    st.write(f"Filtered rows: {filtered_rows}")
else:
    where_sql = ""
    params = []

# NOTE(review): `offset` is derived from a page selector whose range is set
# outside this section; with a filter active it can point past the last
# matching row, in which case the page below is simply empty.
# LIMIT/OFFSET are coerced to int before being formatted into the statement.
query_sql = (
    f"SELECT * FROM data {where_sql} "
    f"LIMIT {int(rows_per_page)} OFFSET {int(offset)}"
)

# Run query — only small chunk loaded now
df_page = con.execute(query_sql, params).df()

# Render each row of the current page: speaker, text, then an audio player
# when the audio field is a string starting with "http", else a warning.
for _idx, record in df_page.iterrows():
    st.markdown(f"**Speaker:** {record['speaker']}")
    st.markdown(f"**Text:** {record['text']}")
    audio_src = record['audio']
    playable = isinstance(audio_src, str) and audio_src.startswith("http")
    if not playable:
        st.warning("Audio not available")
    else:
        st.audio(audio_src, format="audio/mp3")
    # Horizontal rule between entries
    st.markdown("---")