Spaces:

stcoats
/

tspace

Sleeping

tspace / app.py

stcoats

Add application file

56677fa 4 months ago

3.68 kB

	import os
	import duckdb
	import streamlit as st
	from huggingface_hub import hf_hub_download
	import pandas as pd
	import tempfile
	import base64

	# Where the DuckDB file lives on the Hugging Face Hub, and where the
	# local copy is expected on disk.
	HF_REPO_ID = "stcoats/temp-duckdb-upload"
	HF_FILENAME = "ycsep.duckdb"
	LOCAL_PATH = "./ycsep.duckdb"

	st.set_page_config(layout="wide")
	st.title("YCSEP Audio Dataset Viewer")


	def _fetch_database():
	    """Download the DuckDB file from the Hub into the current directory.

	    Shows a spinner while the download runs and a success message
	    once it has finished.
	    """
	    with st.spinner("Downloading from HF Hub..."):
	        hf_hub_download(
	            repo_id=HF_REPO_ID,
	            filename=HF_FILENAME,
	            repo_type="dataset",
	            local_dir=".",
	            local_dir_use_symlinks=False,
	        )
	    st.success("Download complete.")


	# Skip the download when a local copy is already present.
	database_present = os.path.exists(LOCAL_PATH)
	if not database_present:
	    _fetch_database()

	# One shared connection: st.cache_resource keeps the returned object
	# so the database file is not reopened on every script run.
	@st.cache_resource(show_spinner=False)
	def get_duckdb_connection():
	    """Open the local DuckDB file in read-only mode and return the connection."""
	    connection = duckdb.connect(LOCAL_PATH, read_only=True)
	    return connection


	# Without a database there is nothing to show: report the failure in the
	# page and halt the script.
	try:
	    con = get_duckdb_connection()
	    st.success("Connected to DuckDB.")
	except Exception as exc:
	    st.error(f"DuckDB connection failed: {exc}")
	    st.stop()

	# Search input
	query = st.text_input("Search text (case-insensitive, exact substring match)", "").strip()

	# Column list shared by the filtered and the unfiltered query so the two
	# can never drift apart.
	_RESULT_COLUMNS = (
	    "id, channel, video_id, speaker, start_time, end_time, "
	    "upload_date, text, pos_tags, audio"
	)

	# Build query
	sql = f"SELECT {_RESULT_COLUMNS} FROM data"
	if query:
	    # The search term is user input, so it is passed as a bound parameter
	    # rather than interpolated into the SQL text: a quote in the input can
	    # no longer break (or rewrite) the statement. contains() performs a
	    # literal substring test, so '%' and '_' typed by the user are matched
	    # as plain characters, which is what the input label promises.
	    sql += " WHERE contains(LOWER(text), LOWER(CAST(? AS VARCHAR))) LIMIT 100"
	    df = con.execute(sql, [query]).df()
	else:
	    sql += " LIMIT 100"
	    df = con.execute(sql).df()

	st.markdown(f"### Showing {len(df)} results")

	# An empty result set gets a warning instead of a table; the table
	# rendering code that follows belongs to the else branch.
	if len(df) == 0:
	st.warning("No matches found.")
	else:
	def get_audio_html(audio_bytes):
	    """Return an inline HTML ``<audio>`` player for one audio payload.

	    Args:
	        audio_bytes: The audio data as ``bytes``, ``bytearray``,
	            ``memoryview``, or a list of integer byte values.

	    Returns:
	        An ``<audio>`` element whose source is a base64 ``data:`` URI
	        holding the payload, or ``""`` when the value is of any other
	        type (for example ``None``) or cannot be converted to bytes.
	    """
	    if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
	        return ""
	    try:
	        # bytes() raises ValueError for list items outside 0..255 and
	        # TypeError for non-integer items; such a cell gets no player.
	        data = bytes(audio_bytes)
	    except (TypeError, ValueError):
	        return ""
	    b64 = base64.b64encode(data).decode("ascii")
	    return f'<audio controls preload="metadata" style="height:20px;width:120px;"><source src="data:audio/mp3;base64,{b64}" type="audio/mpeg"></audio>'

	# Add an "Audio" column holding the HTML produced by get_audio_html for
	# each row, then drop the original "audio" column it was built from.
	df["Audio"] = df["audio"].apply(get_audio_html)
	df.drop(columns=["audio"], inplace=True)

	# From here on the name `html` refers to Streamlit's component function,
	# not the standard-library `html` module.
	from streamlit.components.v1 import html

	def display_table_with_audio(df):
	    """Render the result rows as an HTML table with inline audio players.

	    Args:
	        df: DataFrame with the columns ``id``, ``channel``, ``video_id``,
	            ``speaker``, ``start_time``, ``end_time``, ``upload_date``,
	            ``text``, ``pos_tags`` and ``Audio``. The ``Audio`` column
	            must already contain HTML markup.

	    Returns:
	        The complete ``<table>`` markup as a single string.
	    """
	    # Imported locally under its own name because this file binds the
	    # module-level name `html` to Streamlit's component function.
	    from html import escape

	    # (column name, header width) in display order.
	    columns = [
	        ("id", "5em"),
	        ("channel", "6em"),
	        ("video_id", "6em"),
	        ("speaker", "6em"),
	        ("start_time", "6em"),
	        ("end_time", "6em"),
	        ("upload_date", "6em"),
	        ("text", "20em"),
	        ("pos_tags", "8em"),
	        ("Audio", "12em"),
	    ]

	    # Collect the pieces and join once instead of growing a string with +=.
	    parts = [
	        "<table border='1' style='border-collapse:collapse;width:100%;font-size:13px;'>",
	        "<thead>",
	        "<tr>",
	    ]
	    parts.extend(
	        f"<th style='width:{width};'>{name}</th>" for name, width in columns
	    )
	    parts.extend(["</tr>", "</thead>", "<tbody>"])

	    for _, row in df.iterrows():
	        parts.append("<tr>")
	        for name, _width in columns:
	            cell = str(row[name])
	            # Everything except the pre-rendered audio player is data and
	            # must be escaped, otherwise '<' or '&' in a transcript would
	            # be interpreted as markup and corrupt the table.
	            if name != "Audio":
	                cell = escape(cell)
	            parts.append(f"<td>{cell}</td>")
	        parts.append("</tr>")

	    parts.append("</tbody></table>")
	    return "\n".join(parts)

	# Show a heading, then embed the generated table in a scrollable HTML
	# component 900 pixels high.
	# NOTE(review): the heading says "Sortable", but the table markup built by
	# display_table_with_audio contains no sorting logic; confirm the wording.
	st.markdown("### Results Table (Sortable with Audio Column)")
	html(display_table_with_audio(df), height=900, scrolling=True)