File size: 2,209 Bytes
b5ff3a4 f89e539 b5ff3a4 4cf6559 b5ff3a4 65c9ca0 f89e539 b5ff3a4 416c906 b5ff3a4 65c9ca0 f89e539 65c9ca0 416c906 4cf6559 416c906 65c9ca0 416c906 65c9ca0 416c906 65c9ca0 416c906 65c9ca0 416c906 4cf6559 65c9ca0 4cf6559 416c906 65c9ca0 416c906 65c9ca0 548bf4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
# --- Dataset location ---------------------------------------------------
# Name of the DuckDB file and the Hugging Face repository that hosts it.
HF_FILENAME = "ycsep.duckdb"
HF_REPO_ID = "stcoats/temp-duckdb-upload"
# Local copy of the database file, relative to the working directory.
LOCAL_PATH = f"./{HF_FILENAME}"

# --- Page setup ---------------------------------------------------------
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")
# Download database if missing
# The file is fetched only when no local copy exists at LOCAL_PATH.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    try:
        hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",
        )
    except Exception as e:
        # Report the failure in the page and halt the run, the same way a
        # failed DuckDB connection is reported, rather than surfacing a raw
        # traceback and continuing towards a connect that cannot succeed.
        st.error(f"Download failed: {e}")
        st.stop()
    st.success("Download complete.")
# Connect
# Open the local database file read-only. On any failure, show the error in
# the page and stop the script run so nothing below touches `con`.
try:
    con = duckdb.connect(database=LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as exc:
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
# Search
# A single free-text box filters on both the speaker and the text column.
# The term is trimmed and lower-cased to match the LOWER(...) applied in SQL.
query = st.text_input("Search text or speaker", "").strip().lower()

if query:
    # contains() performs a literal substring match, so "%" and "_" typed by
    # the user are searched for as-is instead of acting as LIKE wildcards.
    # The term is still passed as a bound parameter, never spliced into SQL.
    sql = """
        SELECT speaker, text, audio
        FROM data
        WHERE contains(LOWER(CAST(speaker AS VARCHAR)), ?)
           OR contains(LOWER(CAST(text AS VARCHAR)), ?)
        LIMIT 100
    """
    df = con.execute(sql, [query, query]).df()
else:
    # No search term: show up to 100 unfiltered rows.
    df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()

st.markdown(f"### Showing {len(df)} results")
if df.empty:
    st.warning("No matches found.")
# Show table with inline audio players
def _to_audio_bytes(value):
    """Return *value* as ``bytes``, or ``None`` if it is not a byte-like blob.

    ``bytes``, ``bytearray``, ``memoryview`` and ``list`` values are converted
    with ``bytes(...)``; the original code notes that DuckDB sometimes yields
    the blob as a list of ints. Any other type yields ``None``.

    Raises:
        TypeError, ValueError: if a list holds items that are not valid byte
            values; the caller reports this next to the affected row.
    """
    if isinstance(value, (bytes, bytearray, memoryview, list)):
        return bytes(value)
    return None


# One three-column row per result: speaker | text | audio player.
for _, row in df.iterrows():
    col1, col2, col3 = st.columns([2, 5, 3])
    col1.markdown(f"**{row['speaker']}**")
    col2.markdown(row["text"])
    try:
        audio_bytes = _to_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes straight to the player. Writing them to a
            # NamedTemporaryFile(delete=False) first left one orphaned file
            # on disk per row on every script rerun.
            col3.audio(audio_bytes, format="audio/mp3")
        else:
            # Covers both unsupported types and empty blobs.
            col3.warning("Audio missing or invalid format.")
    except Exception as e:
        # Best-effort per row: one bad blob must not break the whole listing.
        col3.error(f"Audio error: {e}")
|