Spaces:

stcoats
/

tspace

Sleeping

tspace / app.py

stcoats

Add application file

a0a9509 4 months ago

3.32 kB

	import os
	import duckdb
	import streamlit as st
	from huggingface_hub import hf_hub_download
	import pandas as pd
	import tempfile
	import re

	# Hugging Face Hub dataset repository and file name of the DuckDB database.
	HF_REPO_ID = "stcoats/temp-duckdb-upload"
	HF_FILENAME = "ycsep.duckdb"
	# Local path that is checked before downloading and opened by DuckDB.
	LOCAL_PATH = "./ycsep.duckdb"

	# Page setup: wide layout and the title shown at the top of the app.
	st.set_page_config(layout="wide")
	st.title("YCSEP Audio Dataset Viewer")

	# Fetch the database file from the Hugging Face Hub when there is no local copy
	database_missing = not os.path.exists(LOCAL_PATH)
	if database_missing:
	    st.write("Downloading from HF Hub...")
	    # Arguments gathered in one place so the call itself stays short.
	    download_kwargs = {
	        "repo_id": HF_REPO_ID,
	        "repo_type": "dataset",
	        "filename": HF_FILENAME,
	        "local_dir": ".",
	        "local_dir_use_symlinks": False,
	    }
	    hf_hub_download(**download_kwargs)
	    st.success("Download complete.")

	# Connect (only once): st.cache_resource keeps the returned object for reuse
	@st.cache_resource(show_spinner=False)
	def get_duckdb_connection():
	    """Open the local DuckDB database file in read-only mode.

	    Returns:
	        The connection object produced by ``duckdb.connect`` for
	        ``LOCAL_PATH``.
	    """
	    connection = duckdb.connect(LOCAL_PATH, read_only=True)
	    return connection

	# Open the connection; the UI is halted when the database cannot be opened.
	try:
	    con = get_duckdb_connection()
	except Exception as e:
	    # Top-level boundary: show the failure and stop this script run,
	    # because every later step needs `con`.
	    st.error(f"DuckDB connection failed: {e}")
	    st.stop()
	else:
	    # Kept outside the `try` so that only a failure to connect is
	    # reported as a connection failure.
	    st.success("Connected to DuckDB.")

	# Search
	# NOTE: the connection is opened with read_only=True, so the former
	# `PRAGMA create_fts_index('data', 'text')` call could not create an index
	# (index creation needs write access) and was re-issued on every rerun.
	# The search below is a plain substring match that needs no index.
	query = st.text_input("Search text (case-insensitive)", "").strip()

	# One column list shared by both queries so the result sets cannot drift apart.
	result_columns = (
	    "id, channel, video_id, video_title, speaker, start_time, end_time, "
	    "text, pos_tags, upload_date, audio"
	)

	if query:
	    # `%` is the modulo operator in DuckDB, not a text-match operator.
	    # Lower-casing both sides gives the case-insensitive match promised by
	    # the input label; contains() treats the term literally, so `%` and `_`
	    # typed by the user are not wildcards. The term is still bound as a
	    # parameter rather than interpolated into the SQL text.
	    sql = f"""
	    SELECT {result_columns}
	    FROM data
	    WHERE contains(lower(text), lower(?))
	    LIMIT 100
	    """
	    df = con.execute(sql, [query]).df()
	else:
	    # No search term: show the first 100 rows.
	    df = con.execute(f"""
	    SELECT {result_columns}
	    FROM data
	    LIMIT 100
	    """).df()

	st.markdown(f"### Showing {len(df)} results")

	if df.empty:
	    st.warning("No matches found.")
	else:
	    def _audio_payload(audio_value):
	        """Return the audio clip as ``bytes``, or ``None`` if it is unusable.

	        Args:
	            audio_value: Value from the ``audio`` column. Bytes-like objects
	                and lists of integer byte values are accepted; anything
	                else is treated as "no audio".

	        Returns:
	            The clip as ``bytes``, or ``None`` for unsupported, malformed
	            or empty values.
	        """
	        if not isinstance(audio_value, (bytes, bytearray, memoryview, list)):
	            return None
	        try:
	            # An empty payload has nothing to play, so map it to None too.
	            return bytes(audio_value) or None
	        except (TypeError, ValueError):
	            # A list holding non-integers or values outside 0-255.
	            return None

	    # Reorder columns; the raw audio blob is left out of the table view.
	    column_order = [
	        "id", "channel", "video_id", "video_title", "speaker",
	        "start_time", "end_time", "upload_date", "text", "pos_tags",
	    ]
	    df_display = df[column_order]

	    st.markdown("### Full Table View (Sortable)")
	    st.dataframe(df_display)

	    st.markdown("### Audio Previews")
	    for row in df.itertuples(index=False):
	        payload = _audio_payload(row.audio)
	        if payload is None:
	            continue
	        # A missing text value is not a string; slicing it would raise.
	        snippet = row.text[:80] if isinstance(row.text, str) else ""
	        # "\\|" yields the same "\|" characters as before without relying on
	        # an invalid escape sequence. HTML rendering is not needed for this
	        # caption, so dataset text is no longer passed through as raw HTML.
	        st.markdown(f"{row.speaker} \\| {snippet}")
	        # Hand the bytes to Streamlit instead of writing temp files and
	        # pointing an <audio> tag at a file:// path: those paths are on the
	        # server, which the browser cannot load, and the temp files
	        # (created with delete=False) were never removed.
	        st.audio(payload, format="audio/mpeg")