|
import os |
|
import duckdb |
|
import streamlit as st |
|
from huggingface_hub import hf_hub_download |
|
import pandas as pd |
|
import tempfile |
|
|
|
# Hugging Face Hub coordinates of the DuckDB file used by this viewer.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"

# Local path that is checked before downloading and then opened by DuckDB.
LOCAL_PATH = "./ycsep.duckdb"

# Page setup: wide layout, then the viewer title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")
|
|
|
|
|
# Fetch the database from the Hugging Face Hub only when there is no local
# copy yet; later runs find the file and skip the download.
database_missing = not os.path.exists(LOCAL_PATH)
if database_missing:
    st.write("Downloading from HF Hub...")
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
    }
    hf_hub_download(**download_options)
    st.success("Download complete.")
|
|
|
|
|
# Open the database read-only. If that fails, report the error in the page
# and halt the script so nothing below runs without a connection.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as err:
    st.error(f"DuckDB connection failed: {err}")
    st.stop()
|
|
|
|
|
def _like_contains_pattern(term: str) -> str:
    """Return a LIKE pattern that matches *term* literally anywhere in a value.

    ``%`` and ``_`` are LIKE wildcards, so they (and the escape character
    ``!`` itself) are prefixed with ``!``. The pattern must be used together
    with ``ESCAPE '!'`` in the SQL statement.
    """
    escaped = term.replace("!", "!!").replace("%", "!%").replace("_", "!_")
    return f"%{escaped}%"


# Search box: the term is trimmed and lower-cased here, and the columns are
# lower-cased in SQL, so matching is case-insensitive.
query = st.text_input("Search text or speaker", "").strip().lower()

if query:
    # The term is passed as a bound parameter (never interpolated into the
    # SQL text) and escaped so user-typed % or _ are matched literally
    # instead of acting as wildcards.
    pattern = _like_contains_pattern(query)
    sql = """
        SELECT speaker, text, audio
        FROM data
        WHERE LOWER(CAST(speaker AS VARCHAR)) LIKE ? ESCAPE '!'
           OR LOWER(CAST(text AS VARCHAR)) LIKE ? ESCAPE '!'
        LIMIT 100
    """
    df = con.execute(sql, [pattern, pattern]).df()
else:
    # No search term: show the first 100 rows.
    df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()
|
|
|
# Summarise how many rows came back, and flag an empty result explicitly.
result_count = len(df)
st.markdown(f"### Showing {result_count} results")

if result_count == 0:
    st.warning("No matches found.")
|
|
|
|
|
def _coerce_audio_bytes(audio_data):
    """Return *audio_data* as ``bytes``, or ``None`` for unsupported types.

    Bytes-like values and lists (converted with ``bytes(...)``, so a list of
    integers in 0-255 is expected) are accepted. ``bytes()`` raises
    ``ValueError``/``TypeError`` for lists it cannot convert; the caller
    handles that.
    """
    if isinstance(audio_data, (bytes, bytearray, memoryview, list)):
        return bytes(audio_data)
    return None


# One three-column row per result: speaker, text, audio player.
for _, row in df.iterrows():
    col1, col2, col3 = st.columns([2, 5, 3])
    col1.markdown(f"**{row['speaker']}**")
    col2.markdown(row['text'])

    # Per-row boundary: a bad audio value shows an error in its own cell
    # instead of aborting the rest of the page.
    try:
        audio_bytes = _coerce_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes straight to the player. Writing each clip to a
            # NamedTemporaryFile(delete=False) left one orphaned file per row
            # on every rerun.
            col3.audio(audio_bytes, format="audio/mp3")
        else:
            # Covers unsupported types as well as empty payloads.
            col3.warning("Audio missing or invalid format.")
    except Exception as e:
        col3.error(f"Audio error: {e}")
|
|
|
|