File size: 3,638 Bytes
b5ff3a4 f89e539 b5ff3a4 e85bb51 a671301 b5ff3a4 65c9ca0 f89e539 b5ff3a4 e85bb51 b5ff3a4 e85bb51 f89e539 e85bb51 416c906 4cf6559 a31425b 4cf6559 e85bb51 a31425b e85bb51 a31425b e85bb51 135e6d1 7974c3c e85bb51 135e6d1 e85bb51 7974c3c a671301 e85bb51 7974c3c e85bb51 7974c3c e85bb51 a31425b a671301 5fde344 a31425b e85bb51 a47efdc c173a94 a671301 c173a94 a671301 c173a94 a0a9509 7974c3c 895e3ae c173a94 7974c3c c173a94 7974c3c c173a94 7974c3c 9302019 c173a94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import base64
import os
import re
import tempfile
from html import escape

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
# Where the DuckDB file lives on the Hugging Face Hub, and the path the app
# opens locally once the file has been fetched into the working directory.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database only when no local copy exists yet.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    st.success("Download complete.")
# Connect (only once)
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the database at ``LOCAL_PATH`` in read-only mode.

    Wrapped in ``st.cache_resource`` so the connection object is created
    once and handed back on later calls instead of being reopened.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
# Obtain the shared connection; if that fails, report the error in the UI
# and halt the script so nothing below runs without a database.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as exc:
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
# Search input
query = st.text_input("Search text (case-insensitive, exact substring match)", "").strip()

# Build query
# The column list is written once so the filtered and unfiltered queries
# cannot drift apart.
select_clause = (
    "SELECT id, channel, video_id, speaker, start_time, end_time, "
    "upload_date, text, pos_tags, audio\n"
    "FROM data\n"
)
if query:
    # contains() matches the search term literally. The previous
    # LIKE '%...%' pattern let a '%' or '_' typed by the user act as a
    # wildcard, which contradicted the "exact substring match" promise.
    # The term is still passed as a bound parameter, never interpolated.
    sql = select_clause + "WHERE contains(LOWER(text), LOWER(?))\nLIMIT 100"
    df = con.execute(sql, [query]).df()
else:
    # No search term: show the first 100 rows.
    df = con.execute(select_clause + "LIMIT 100").df()
from streamlit.components.v1 import html

# Columns rendered as plain (escaped) text, in display order.
_TEXT_COLUMNS = [
    "id",
    "channel",
    "video_id",
    "speaker",
    "start_time",
    "end_time",
    "upload_date",
    "text",
    "pos_tags",
]


def render_audio_tag(audio_bytes, index=None):
    """Return an HTML ``<audio>`` element that plays ``audio_bytes`` inline.

    The audio is embedded as a base64 ``data:`` URI. The earlier approach
    wrote every clip to a temporary file that was never removed and pointed
    the browser at a ``file://`` path on the server, which a browser cannot
    load from a page served over HTTP.

    Args:
        audio_bytes: Raw audio as ``bytes``, ``bytearray``, ``memoryview`` or
            a list of integer byte values. Any other type yields ``""``.
        index: Unused; accepted so existing callers that pass the row
            position keep working.

    Returns:
        The ``<audio>`` markup, or an empty string when there is no usable
        audio payload.
    """
    if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
        return ""
    try:
        data = bytes(audio_bytes)
    except (TypeError, ValueError):
        # A list holding non-integers or values outside 0..255.
        return ""
    if not data:
        return ""
    encoded = base64.b64encode(data).decode("ascii")
    return (
        '<audio controls preload="metadata" style="height:20px;width:100px;">'
        f'<source src="data:audio/mpeg;base64,{encoded}" type="audio/mpeg"></audio>'
    )


def display_html_table(df):
    """Build the results table as a single HTML string.

    Args:
        df: DataFrame containing every column named in ``_TEXT_COLUMNS``
            plus an ``audio_player`` column of ready-made HTML snippets.

    Returns:
        A ``<table>`` element with one row per DataFrame row.
    """
    header_cells = ["<th style='width:8em;'>id</th>"]
    header_cells.extend(f"<th>{name}</th>" for name in _TEXT_COLUMNS[1:])
    header_cells.append("<th>audio</th>")

    body_rows = []
    for _, row in df.iterrows():
        # Data values are escaped so characters such as '<' or '&' in the
        # transcript cannot break or inject markup into the table.
        cells = [f"<td>{escape(str(row[col]))}</td>" for col in _TEXT_COLUMNS]
        # audio_player already holds HTML generated by render_audio_tag and
        # must be inserted verbatim.
        cells.append(f"<td>{row['audio_player']}</td>")
        body_rows.append("<tr>" + "".join(cells) + "</tr>")

    return (
        "<table border='1' style='border-collapse:collapse;width:100%;font-size:13px;'>"
        "<thead><tr>" + "".join(header_cells) + "</tr></thead>"
        "<tbody>" + "".join(body_rows) + "</tbody></table>"
    )


st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
else:
    df["audio_player"] = [render_audio_tag(blob, i) for i, blob in enumerate(df["audio"])]
    # The raw audio bytes are no longer needed once the player markup exists.
    df_display = df.drop(columns=["audio"])

    st.markdown("### Results Table (Sortable with Audio Column)")
    html(display_html_table(df_display), height=800, scrolling=True)
|