File size: 2,028 Bytes
b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 f89e539 b5ff3a4 548bf4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
import shutil
# Where the DuckDB database lives on the Hugging Face Hub, and the local
# path the rest of the script opens it from.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download only when there is no local copy yet.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    # Compare resolved paths, not raw strings: the returned path may be
    # spelled "ycsep.duckdb" or as an absolute path while still naming the
    # very file LOCAL_PATH points at, and shutil.copyfile raises
    # SameFileError when source and destination are the same file.
    if os.path.realpath(path) != os.path.realpath(LOCAL_PATH):
        shutil.copyfile(path, LOCAL_PATH)
# Connect read-only; nothing in this script writes to the database.
con = duckdb.connect(LOCAL_PATH, read_only=True)

# Total row count of the `data` table. Streamlit re-executes the script on
# each user interaction, so this runs once per rerun, not once per session.
total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
rows_per_page = 10
# Ceiling division, clamped to at least one page: with an empty table the
# unclamped result is 0, which would make max_value < min_value in the
# number_input below.
total_pages = max(1, (total_rows + rows_per_page - 1) // rows_per_page)
st.success(f"Total rows: {total_rows}")

# Select page (1-based) and translate it to a row offset for the SQL query.
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
offset = (page - 1) * rows_per_page
# Optional: case-insensitive substring search over the text and speaker
# columns.
query = st.text_input("Search text or speaker")
if query:
    # The search string is user input, so it is bound as a prepared-statement
    # parameter instead of being interpolated into the SQL text. This closes
    # the SQL-injection hole and keeps a stray quote character from breaking
    # the statement. LIKE wildcards (% and _) typed by the user still act as
    # wildcards, as before.
    pattern = f"%{query}%"
    where_clause = "WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?"
    params = [pattern, pattern]

    count_sql = f"""
        SELECT COUNT(*) FROM data
        {where_clause}
    """
    filtered_rows = con.execute(count_sql, params).fetchone()[0]
    st.write(f"Filtered rows: {filtered_rows}")

    # NOTE: `offset` is derived from a page selector bounded by the
    # unfiltered row count, so pages beyond the filtered result set simply
    # come back empty.
    query_sql = f"""
        SELECT * FROM data
        {where_clause}
        LIMIT {rows_per_page} OFFSET {offset}
    """
    page_result = con.execute(query_sql, params)
else:
    query_sql = f"SELECT * FROM data LIMIT {rows_per_page} OFFSET {offset}"
    page_result = con.execute(query_sql)

# Only the current page's rows are materialised into a DataFrame.
df_page = page_result.df()
# Render every row of the current page: speaker, text, then either an audio
# player (when the audio value is a string starting with "http") or a
# warning, followed by a horizontal rule.
for _idx, record in df_page.iterrows():
    st.markdown(f"**Speaker:** {record['speaker']}")
    st.markdown(f"**Text:** {record['text']}")

    audio_ref = record['audio']
    playable = isinstance(audio_ref, str) and audio_ref.startswith("http")
    if not playable:
        st.warning("Audio not available")
    else:
        st.audio(audio_ref, format="audio/mp3")

    st.markdown("---")
|