tspace / app.py
stcoats
Add application file
a47efdc
raw
history blame
3.29 kB
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
import re
# Location of the database on the Hugging Face Hub and of its local copy.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database only when no local copy is present, so the download is
# skipped whenever the file already exists at LOCAL_PATH.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    st.write("Downloading from HF Hub...")
    # The file is requested into the current directory, which is the
    # directory LOCAL_PATH refers to.
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    hf_hub_download(**download_options)
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB database file in read-only mode.

    Decorated with ``st.cache_resource`` so the connection object is created
    once and then reused rather than reopened on every call.

    Returns:
        The DuckDB connection to ``LOCAL_PATH``.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
# Obtain the shared connection. On any failure, report it on the page and halt
# this script run, since nothing below can work without a connection.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    failure_message = f"DuckDB connection failed: {connect_error}"
    st.error(failure_message)
    st.stop()
# Search
# Both the filtered and the unfiltered query return the same columns, so the
# SELECT/FROM part is written once and only the WHERE clause differs.
base_select = (
    "SELECT id, channel, video_id, video_title, speaker, start_time, "
    "end_time, text, pos_tags, upload_date, audio "
    "FROM data"
)

query = st.text_input("Search text (case-insensitive)", "").strip()
if query:
    # contains() performs a literal substring match. The previous
    # LIKE '%' || ? || '%' form treated '%' and '_' typed by the user as
    # wildcards, so e.g. searching for "100%" or "a_b" matched unrelated rows.
    # The search term is still passed as a bound parameter, never interpolated.
    sql = f"{base_select} WHERE contains(LOWER(text), LOWER(?)) LIMIT 100"
    result = con.execute(sql, [query])
else:
    # No search term: show the first 100 rows.
    result = con.execute(f"{base_select} LIMIT 100")
df = result.df()
st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    def render_audio_cell(audio_bytes):
        """Normalise one ``audio`` cell into playable ``bytes``.

        The bytes are handed directly to ``st.audio``. No temporary file is
        written, so repeated script runs do not accumulate files on disk.

        Args:
            audio_bytes: The raw cell value. Bytes-like objects and lists of
                integers are accepted.

        Returns:
            The audio as ``bytes``, or ``None`` when the value is missing,
            empty, or of an unsupported type.
        """
        if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
            data = bytes(audio_bytes)
        elif isinstance(audio_bytes, list):
            try:
                data = bytes(audio_bytes)
            except (TypeError, ValueError):
                # The list does not consist of integers in range(256).
                return None
        else:
            return None
        # An empty payload cannot be played; treat it like a missing clip.
        return data or None

    # (label, column) pairs for the metadata panel, in display order.
    metadata_fields = (
        ("ID", "id"),
        ("Channel", "channel"),
        ("Video ID", "video_id"),
        ("Video Title", "video_title"),
        ("Speaker", "speaker"),
        ("Start Time", "start_time"),
        ("End Time", "end_time"),
        ("Upload Date", "upload_date"),
        ("POS Tags", "pos_tags"),
    )

    # Each result is rendered as a collapsible expander (metadata on the left,
    # text and audio player on the right). Note that expanders are not
    # sortable, despite the heading below.
    st.markdown("### Results Table (Sortable)")
    for _, row in df.iterrows():
        # A missing text value is not a str and cannot be sliced; fall back to
        # an empty string so one bad row does not abort the whole page.
        text = row["text"] if isinstance(row["text"], str) else ""
        with st.expander(f"? {row['speaker']} | {text[:60]}..."):
            col1, col2 = st.columns([2, 3])
            with col1:
                for label, column in metadata_fields:
                    st.write(f"**{label}:** {row[column]}")
            with col2:
                st.markdown(f"**Text:** {text}")
                audio_data = render_audio_cell(row["audio"])
                if audio_data is not None:
                    st.audio(audio_data, format="audio/mp3")
                else:
                    st.warning("Audio not available or invalid format.")