tspace / app.py
stcoats
Add application file
a0a9509
raw
history blame
3.32 kB
import os
import duckdb
import streamlit as st
from huggingface_hub import hf_hub_download
import pandas as pd
import tempfile
import re
# Location of the database on the Hugging Face Hub and of its local copy.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

# Page setup; kept as the first Streamlit call in this script.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database file from the Hub unless a local copy already exists.
database_present = os.path.exists(LOCAL_PATH)
if not database_present:
    st.write("Downloading from HF Hub...")
    download_options = {
        "repo_id": HF_REPO_ID,
        "repo_type": "dataset",
        "filename": HF_FILENAME,
        "local_dir": ".",
        "local_dir_use_symlinks": False,
    }
    hf_hub_download(**download_options)
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the DuckDB file at ``LOCAL_PATH`` in read-only mode.

    The function is wrapped in ``st.cache_resource`` (spinner disabled) so
    the connection is created once and then reused.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
# Obtain the shared connection; on any failure show the error in the UI and
# halt this script run.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    failure_message = f"DuckDB connection failed: {connect_error}"
    st.error(failure_message)
    st.stop()
# Search
#
# No full-text index is created here: the connection is opened read-only, so
# an index cannot be built on it, and `%` is the modulo operator in DuckDB,
# not a text-match operator. A case-insensitive substring match needs neither
# and matches what the input label promises.
RESULT_COLUMNS = (
    "id, channel, video_id, video_title, speaker, "
    "start_time, end_time, text, pos_tags, upload_date, audio"
)
MAX_ROWS = 100

query = st.text_input("Search text (case-insensitive)", "").strip()
if query:
    # The search term is bound as a parameter, never spliced into the SQL.
    # contains() does plain substring matching, so `%` and `_` typed by the
    # user are treated literally rather than as wildcards.
    sql = f"""
        SELECT {RESULT_COLUMNS}
        FROM data
        WHERE contains(lower(text), lower(?))
        LIMIT {MAX_ROWS}
    """
    df = con.execute(sql, [query]).df()
else:
    # Empty search box: show the first rows of the table unfiltered.
    sql = f"""
        SELECT {RESULT_COLUMNS}
        FROM data
        LIMIT {MAX_ROWS}
    """
    df = con.execute(sql).df()
st.markdown(f"### Showing {len(df)} results")


def _audio_payload(raw):
    """Return *raw* as ``bytes``, or ``None`` when it holds no usable audio.

    Accepts ``bytes``, ``bytearray``, ``memoryview`` or a list of byte
    values. Any other type, an empty payload, or a list that cannot be
    converted to bytes yields ``None``.
    """
    if isinstance(raw, (bytes, bytearray, memoryview)):
        payload = bytes(raw)
    elif isinstance(raw, list):
        try:
            payload = bytes(raw)
        except (TypeError, ValueError):
            # Non-integer or out-of-range items: not a byte sequence.
            return None
    else:
        return None
    return payload if payload else None


if df.empty:
    st.warning("No matches found.")
else:
    # Column order for the table; the raw audio blob is left out of it.
    table_columns = [
        "id",
        "channel",
        "video_id",
        "video_title",
        "speaker",
        "start_time",
        "end_time",
        "upload_date",
        "text",
        "pos_tags",
    ]
    st.markdown("### Full Table View (Sortable)")
    st.dataframe(df[table_columns])

    st.markdown("### Audio Previews")
    # The clip bytes go straight to st.audio. The earlier approach wrote one
    # undeleted temp file per row on every rerun and referenced it through a
    # file:// URL, which points at a server-side path the browser cannot
    # load. The on-page note about temp paths is dropped with it.
    for speaker, text, raw_audio in zip(df["speaker"], df["text"], df["audio"]):
        clip = _audio_payload(raw_audio)
        if clip is None:
            continue
        # text may be NULL in the database; avoid slicing a non-string.
        snippet = text[:80] if isinstance(text, str) else ""
        # Dataset text is rendered as plain markdown, not as raw HTML.
        st.markdown(f"**{speaker} | {snippet}**")
        st.audio(clip, format="audio/mpeg")