File size: 3,319 Bytes
b5ff3a4 f89e539 b5ff3a4 e85bb51 a31425b b5ff3a4 65c9ca0 f89e539 b5ff3a4 e85bb51 b5ff3a4 e85bb51 f89e539 e85bb51 416c906 4cf6559 a31425b 4cf6559 e85bb51 a31425b e85bb51 a31425b e85bb51 a0a9509 e85bb51 a31425b e85bb51 5fde344 e85bb51 a0a9509 e85bb51 a47efdc e85bb51 a31425b 5fde344 a31425b e85bb51 a47efdc a0a9509 a47efdc a0a9509 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
import re
# Hugging Face Hub repository the database file is downloaded from
# (passed as repo_id with repo_type="dataset" further down).
HF_REPO_ID = "stcoats/temp-duckdb-upload"
# Name of the database file inside that repository.
HF_FILENAME = "ycsep.duckdb"
# Local path that is checked for existence and opened by DuckDB.
LOCAL_PATH = "./ycsep.duckdb"
# Page-wide layout setting; kept ahead of every other Streamlit call in this script.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")
# Download the database from the Hugging Face Hub if it is not already on disk.
# The file lands in the current directory under HF_FILENAME, which is the same
# location LOCAL_PATH points at, so later reruns skip the download.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    # NOTE: the former `local_dir_use_symlinks=False` argument is deprecated in
    # huggingface_hub (current releases ignore it and emit a warning), so it is
    # no longer passed; `local_dir` alone selects the download folder.
    hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
    )
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB file in read-only mode and return the connection.

    Wrapped in ``st.cache_resource`` so the connection object is created once
    and reused instead of being reopened on each script run.
    """
    connection = duckdb.connect(database=LOCAL_PATH, read_only=True)
    return connection
# Obtain the shared connection; on any failure show the error in the page and
# halt this script run so nothing below touches a missing connection.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    failure_message = f"DuckDB connection failed: {connect_error}"
    st.error(failure_message)
    st.stop()
# Search
#
# No full-text-search index is created here: the connection is opened with
# read_only=True, and building an FTS index writes to the database, so the
# former `PRAGMA create_fts_index(...)` call could not succeed (it also lacked
# the id-column argument and would have been re-run on every Streamlit rerun).
# A plain substring match needs no index and works on a read-only database.
query = st.text_input("Search text (case-insensitive)", "").strip()

if query:
    # `%` is the modulo operator in DuckDB, not a text-match operator, so the
    # filter uses contains() on lower-cased operands instead. The search term
    # is bound as a parameter (never interpolated into the SQL) and is matched
    # literally: `%` and `_` typed by the user are not treated as wildcards.
    sql = """
        SELECT id, channel, video_id, video_title, speaker, start_time,
               end_time, text, pos_tags, upload_date, audio
        FROM data
        WHERE contains(lower(text), lower(CAST(? AS VARCHAR)))
        LIMIT 100
    """
    df = con.execute(sql, [query]).df()
else:
    # Empty search box: show the first 100 rows unfiltered.
    sql = """
        SELECT id, channel, video_id, video_title, speaker, start_time,
               end_time, text, pos_tags, upload_date, audio
        FROM data
        LIMIT 100
    """
    df = con.execute(sql).df()
def _to_audio_bytes(value):
    """Return the raw bytes of an ``audio`` cell, or None if it is unusable.

    Accepts bytes-like objects and lists of integers (the two shapes the
    original code handled); anything else, or empty data, yields None.
    """
    if isinstance(value, (bytes, bytearray, memoryview)):
        return bytes(value) or None
    if isinstance(value, list):
        try:
            return bytes(value) or None
        except (TypeError, ValueError):
            # List elements that are not integers in range(256).
            return None
    return None


st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    # Tabular view: every column except the raw audio blob.
    column_order = [
        "id", "channel", "video_id", "video_title", "speaker",
        "start_time", "end_time", "upload_date", "text", "pos_tags",
    ]
    df_display = df[column_order]

    st.markdown("### Full Table View (Sortable)")
    st.dataframe(df_display)

    st.markdown("### Audio Previews")
    for _, row in df.iterrows():
        clip = _to_audio_bytes(row["audio"])
        if clip is None:
            continue
        # Text may be NULL in the database; fall back to an empty snippet
        # instead of failing on the slice.
        snippet = row["text"] if isinstance(row["text"], str) else ""
        # Dataset text is rendered WITHOUT unsafe_allow_html so that any HTML
        # contained in speaker/text values is escaped rather than injected.
        st.markdown(f"**{row['speaker']} | {snippet[:80]}**")
        # Hand the bytes straight to Streamlit's audio player. The previous
        # approach wrote a never-deleted temp file per row on every rerun and
        # linked it through a file:// URL, which a browser cannot load from
        # the server's filesystem.
        st.audio(clip, format="audio/mpeg")
|