tspace / app.py
stcoats
Add application file
5fde344
raw
history blame
3.15 kB
import html
import os
import re
import tempfile

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
# Hugging Face Hub repository (fetched below with repo_type="dataset") that
# hosts the DuckDB file.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
# Name of the database file inside that repository.
HF_FILENAME = "ycsep.duckdb"
# Local path that is checked before downloading and later opened by DuckDB.
LOCAL_PATH = "./ycsep.duckdb"
# Page-level Streamlit settings: wide layout and the page title.
st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")
# Fetch the database from the Hugging Face Hub only when no local copy
# exists yet; later script reruns find the file on disk and skip this step.
database_on_disk = os.path.exists(LOCAL_PATH)
if not database_on_disk:
    st.write("Downloading from HF Hub...")
    hf_hub_download(
        repo_type="dataset",
        repo_id=HF_REPO_ID,
        filename=HF_FILENAME,
        # Download into the working directory so the file lands at LOCAL_PATH.
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the local DuckDB database file in read-only mode.

    The function is wrapped in ``st.cache_resource`` (with the spinner
    disabled), so Streamlit caches the returned connection object instead
    of opening a new one on every call.

    Returns:
        The DuckDB connection for ``LOCAL_PATH``.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
# Obtain the cached connection. Without a database there is nothing to
# display, so on failure report the error and halt this script run.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as exc:
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
# Search
# Columns fetched for every result row; shared by both queries below so the
# two SELECT lists cannot drift apart.
_RESULT_COLUMNS = (
    "id, channel, video_id, video_title, speaker, start_time, end_time, "
    "text, pos_tags, upload_date, audio"
)


def _like_contains_pattern(term):
    """Return a LIKE pattern that matches *term* literally anywhere in a string.

    ``%`` and ``_`` are LIKE wildcards, so they are escaped with ``!`` (and
    ``!`` itself is doubled). The SQL must declare ``ESCAPE '!'`` to match.
    Without this, a search for ``100%`` would match any text containing
    ``100``, while the highlighter looks for the literal ``100%``.
    """
    escaped = term.replace("!", "!!").replace("%", "!%").replace("_", "!_")
    return f"%{escaped}%"


query = st.text_input("Search text (case-insensitive)", "").strip()
if query:
    # The user's text is passed as a bound parameter, never interpolated;
    # only the constant column list is formatted into the statement.
    sql = f"""
    SELECT {_RESULT_COLUMNS}
    FROM data
    WHERE LOWER(text) LIKE LOWER(?) ESCAPE '!'
    LIMIT 100
    """
    df = con.execute(sql, [_like_contains_pattern(query)]).df()
else:
    # No search term: show the first 100 rows the database returns.
    sql = f"""
    SELECT {_RESULT_COLUMNS}
    FROM data
    LIMIT 100
    """
    df = con.execute(sql).df()

st.markdown(f"### Showing {len(df)} results")
if df.empty:
    st.warning("No matches found.")
# Show table with inline audio players
# (label, column) pairs rendered in the left-hand column of each result.
_METADATA_FIELDS = (
    ("ID", "id"),
    ("Channel", "channel"),
    ("Video ID", "video_id"),
    ("Video Title", "video_title"),
    ("Speaker", "speaker"),
    ("Start Time", "start_time"),
    ("End Time", "end_time"),
    ("Upload Date", "upload_date"),
)


def _highlight_matches(text, pattern):
    """Return *text* as HTML-safe markup with matches wrapped in ``<mark>``.

    Args:
        text: The raw transcript text; ``None`` is rendered as an empty string.
        pattern: A compiled regex with exactly one capturing group around the
            search term, or ``None`` when there is nothing to highlight.

    Returns:
        A string in which every piece of the original text has been passed
        through ``html.escape``, so markup stored in the database cannot be
        injected into the page even though it is rendered with
        ``unsafe_allow_html=True``.
    """
    text = "" if text is None else str(text)
    if pattern is None:
        return html.escape(text)
    # With a single capturing group, re.split alternates between
    # non-matching text (even indices) and matched text (odd indices).
    # Escaping each piece separately keeps matching on the raw text.
    pieces = pattern.split(text)
    return "".join(
        f"<mark>{html.escape(piece)}</mark>" if index % 2 else html.escape(piece)
        for index, piece in enumerate(pieces)
    )


def _to_audio_bytes(blob):
    """Convert a value from the ``audio`` column to ``bytes``.

    Bytes-like objects and lists of ints are converted; any other value
    (including ``None``) yields ``None``.
    """
    # DuckDB sometimes gives list[int] instead of a bytes-like object.
    if isinstance(blob, (bytes, bytearray, memoryview, list)):
        return bytes(blob)
    return None


# Compile the highlight pattern once instead of once per row.
_highlight_pattern = (
    re.compile(f"({re.escape(query)})", flags=re.IGNORECASE) if query else None
)

for _, row in df.iterrows():
    col1, col2, col3 = st.columns([3, 5, 2])

    for label, column in _METADATA_FIELDS:
        col1.markdown(f"**{label}:** {row[column]}")

    highlighted_text = _highlight_matches(row["text"], _highlight_pattern)
    col2.markdown(f"**Text:** {highlighted_text}", unsafe_allow_html=True)
    col2.markdown(f"**POS tags:** {row['pos_tags']}")

    # Best-effort: a broken audio value should produce an error message in
    # its own cell, not abort rendering of the remaining rows.
    try:
        audio_bytes = _to_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes straight to Streamlit; writing a temp file with
            # delete=False left one orphaned file per row on every rerun.
            col3.audio(audio_bytes, format="audio/mp3")
        else:
            col3.warning("Audio missing or invalid format.")
    except Exception as e:
        col3.error(f"Audio error: {e}")