|
import os |
|
import duckdb |
|
import pandas as pd |
|
import streamlit as st |
|
from huggingface_hub import hf_hub_download |
|
import shutil |
|
|
|
# Hugging Face Hub dataset repository that hosts the DuckDB file.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
# Name of the database file inside that repository.
HF_FILENAME = "ycsep.duckdb"
# Local copy used by the app; derived from HF_FILENAME so the two names
# cannot drift apart.
LOCAL_PATH = f"./{HF_FILENAME}"
|
|
|
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database from the Hugging Face Hub only when no local copy exists.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        # NOTE(review): kept for older huggingface_hub releases; newer ones
        # reportedly deprecate/ignore this flag -- confirm before removing.
        local_dir_use_symlinks=False,
    )
    # The returned path may name the same file as LOCAL_PATH with a different
    # spelling (e.g. absolute vs. "./"-relative). Compare resolved paths so we
    # never ask shutil.copyfile to copy a file onto itself, which raises
    # shutil.SameFileError.
    if os.path.realpath(path) != os.path.realpath(LOCAL_PATH):
        shutil.copyfile(path, LOCAL_PATH)
|
|
|
|
|
# Open the database read-only; this script only issues SELECT statements.
con = duckdb.connect(LOCAL_PATH, read_only=True)

total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
rows_per_page = 10
st.success(f"Total rows: {total_rows}")

# The search box is read before the page selector so that the selectable page
# range can reflect the filtered result set instead of the whole table.
query = st.text_input("Search text or speaker")

if query:
    # User input is passed as bound parameters, never spliced into the SQL
    # text, so quotes in the search term cannot break or alter the statement.
    # '%' and '_' typed by the user still act as LIKE wildcards.
    where_sql = "WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?"
    pattern = f"%{query}%"
    where_params = [pattern, pattern]

    count_sql = f"SELECT COUNT(*) FROM data {where_sql}"
    filtered_rows = con.execute(count_sql, where_params).fetchone()[0]
    st.write(f"Filtered rows: {filtered_rows}")
    matching_rows = filtered_rows
else:
    where_sql = ""
    where_params = []
    matching_rows = total_rows

# Always offer at least one page: with zero matching rows the ceiling division
# yields 0, and st.number_input rejects max_value < min_value.
total_pages = max(1, (matching_rows - 1) // rows_per_page + 1)

page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
offset = (page - 1) * rows_per_page

# LIMIT/OFFSET are integers computed above (not free text); int() makes that
# explicit before they are formatted into the statement.
query_sql = (
    f"SELECT * FROM data {where_sql} "
    f"LIMIT {int(rows_per_page)} OFFSET {int(offset)}"
)

df_page = con.execute(query_sql, where_params).df()
|
|
|
|
|
# Show every row of the current page: speaker, transcript text, and an audio
# player when the 'audio' value is a string starting with "http".
for record in df_page.to_dict("records"):
    st.markdown(f"**Speaker:** {record['speaker']}")
    st.markdown(f"**Text:** {record['text']}")

    audio_ref = record["audio"]
    has_audio_url = isinstance(audio_ref, str) and audio_ref.startswith("http")
    if not has_audio_url:
        st.warning("Audio not available")
    else:
        st.audio(audio_ref, format="audio/mp3")

    # Horizontal rule separating this entry from the next one.
    st.markdown("---")
|
|
|
|