import os
import shutil

import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download

HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download the DuckDB file from the HF Hub if it is not present locally
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    # hf_hub_download may return an absolute path; only copy if it points elsewhere
    if os.path.abspath(path) != os.path.abspath(LOCAL_PATH):
        shutil.copyfile(path, LOCAL_PATH)

# Connect read-only so the database file can be shared across sessions
con = duckdb.connect(LOCAL_PATH, read_only=True)

# Get total row count (only once)
total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
rows_per_page = 10
st.success(f"Total rows: {total_rows}")

# Optional search filter on text and speaker
query = st.text_input("Search text or speaker")

if query:
    pattern = f"%{query}%"
    # Bind the search term as a parameter so quotes in user input
    # cannot break or inject into the SQL
    filtered_rows = con.execute(
        """
        SELECT COUNT(*) FROM data
        WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?
        """,
        [pattern, pattern],
    ).fetchone()[0]
    st.write(f"Filtered rows: {filtered_rows}")
    result_rows = filtered_rows
else:
    result_rows = total_rows

# Select page, paginating over the (possibly filtered) result set
total_pages = max((result_rows - 1) // rows_per_page + 1, 1)
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
offset = (page - 1) * rows_per_page

# Run the query; only the current page is loaded into memory
if query:
    df_page = con.execute(
        f"""
        SELECT * FROM data
        WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?
        LIMIT {rows_per_page} OFFSET {offset}
        """,
        [pattern, pattern],
    ).df()
else:
    df_page = con.execute(
        f"SELECT * FROM data LIMIT {rows_per_page} OFFSET {offset}"
    ).df()

# Display each row, with an audio player where a URL is available
for _, row in df_page.iterrows():
    st.markdown(f"**Speaker:** {row['speaker']}")
    st.markdown(f"**Text:** {row['text']}")
    if isinstance(row['audio'], str) and row['audio'].startswith("http"):
        st.audio(row['audio'], format="audio/mp3")
    else:
        st.warning("Audio not available")
    st.markdown("---")