File size: 2,915 Bytes
b5ff3a4 f89e539 b5ff3a4 e85bb51 a5de086 b5ff3a4 65c9ca0 f89e539 b5ff3a4 e85bb51 b5ff3a4 e85bb51 f89e539 a5de086 e85bb51 a31425b e85bb51 a31425b e85bb51 135e6d1 7974c3c e85bb51 4c9a1f3 e85bb51 56677fa a671301 e85bb51 4c9a1f3 e85bb51 56677fa e85bb51 a31425b a671301 5fde344 a31425b e85bb51 a47efdc a5de086 a671301 a5de086 a671301 a5de086 a0a9509 4c9a1f3 9302019 4c9a1f3 c173a94 4c9a1f3 56677fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import base64
import html
import os
import tempfile

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
# Where the DuckDB file lives on the Hugging Face Hub, and where the local copy goes.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Page setup: wide layout and the app title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database from the Hub only when no local copy exists yet.
database_missing = not os.path.exists(LOCAL_PATH)
if database_missing:
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    with st.spinner("Downloading from HF Hub..."):
        hf_hub_download(**download_options)
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the DuckDB file at ``LOCAL_PATH`` in read-only mode.

    The function is wrapped in ``st.cache_resource`` with the spinner
    disabled, so Streamlit hands back the cached connection object on
    later calls.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
# Obtain the cached connection; abort this script run if it cannot be opened.
try:
    con = get_duckdb_connection()
except Exception as e:
    # Top-level boundary: surface whatever went wrong to the user and halt.
    st.error(f"DuckDB connection failed: {e}")
    st.stop()
else:
    # Kept out of the try body so that only genuine connection errors are
    # reported as "DuckDB connection failed".
    st.success("Connected to DuckDB.")
# Search input
query = st.text_input("Search text (case-insensitive, exact substring match)", "").strip()

# Shared SELECT/FROM part of both the filtered and the unfiltered query.
select_sql = """
SELECT id, channel, video_id, speaker, start_time, end_time, upload_date, text, pos_tags, audio
FROM data
"""

if query:
    # Escape the LIKE metacharacters (and the escape character itself) so the
    # user's text is matched literally, as the input label promises.
    literal_query = (
        query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    # The pattern is bound as a parameter rather than spliced into the SQL
    # text, so quotes in the input cannot alter the statement.
    # ILIKE provides the case-insensitive comparison.
    df = con.execute(
        select_sql + "WHERE text ILIKE ? ESCAPE '\\'\nLIMIT 100",
        [f"%{literal_query}%"],
    ).df()
else:
    # No search term: show the first 100 rows the database returns.
    df = con.execute(select_sql + "LIMIT 100").df()
def get_audio_html(audio_bytes):
    """Return an inline HTML ``<audio>`` player for one audio blob.

    Args:
        audio_bytes: The audio payload as ``bytes``, ``bytearray``,
            ``memoryview`` or a ``list`` of byte values.

    Returns:
        An ``<audio>`` tag whose source is a base64 ``data:`` URI, or an
        empty string when the value has another type or cannot be
        converted to bytes.
    """
    if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
        return ""
    try:
        data = bytes(audio_bytes)
    except (TypeError, ValueError):
        # e.g. a list holding non-integers or values outside 0..255
        return ""
    b64 = base64.b64encode(data).decode("utf-8")
    return f'<audio controls preload="metadata" style="height:20px;width:120px;"><source src="data:audio/mp3;base64,{b64}" type="audio/mpeg"></audio>'


def _escape_cell(value):
    """Return ``value`` HTML-escaped if it is a string, otherwise unchanged."""
    return html.escape(value) if isinstance(value, str) else value


st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    # Data columns in display order; the generated "Audio" column goes last.
    data_cols = ["id", "channel", "video_id", "speaker", "start_time", "end_time", "upload_date", "text", "pos_tags"]

    # The table is rendered with escape=False so the <audio> tags survive.
    # String cells from the database are therefore escaped here, so that
    # characters such as "<" or "&" display literally instead of being
    # interpreted as markup. Non-string cells are passed through unchanged.
    table = df[data_cols].apply(lambda column: column.map(_escape_cell))
    table["Audio"] = df["audio"].apply(get_audio_html)
    df = table

    # Show table with HTML rendering for audio
    st.markdown("### Results Table (Sortable with Audio Column)")
    st.write("(Scroll right to view audio controls)")
    st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|