|
import base64
import os
import tempfile
from html import escape

import duckdb
import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download
|
|
|
# Hugging Face Hub coordinates of the DuckDB file (fetched with
# repo_type="dataset" below).
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"

# Path checked before downloading and later opened by DuckDB.
# NOTE(review): expected to be where hf_hub_download(local_dir=".") places
# HF_FILENAME -- keep the two in sync.
LOCAL_PATH = "./ycsep.duckdb"

st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Fetch the database only when it is not already present on disk.
if not os.path.exists(LOCAL_PATH):
    with st.spinner("Downloading from HF Hub..."):
        hf_hub_download(
            filename=HF_FILENAME,
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            local_dir=".",
            local_dir_use_symlinks=False,
        )
    st.success("Download complete.")
|
|
|
|
|
@st.cache_resource(show_spinner=False)
def get_duckdb_connection():
    """Open the DuckDB file at ``LOCAL_PATH`` read-only and return the connection.

    The function is wrapped in ``st.cache_resource`` with the spinner
    disabled, so Streamlit decides when the body actually runs.
    """
    connection = duckdb.connect(LOCAL_PATH, read_only=True)
    return connection
|
|
|
# Obtain the database connection used by every query below. Any failure is
# shown to the user and the script run is halted, since nothing further can
# work without a connection.
try:
    con = get_duckdb_connection()
    st.success("Connected to DuckDB.")
except Exception as exc:
    st.error(f"DuckDB connection failed: {exc}")
    st.stop()
|
|
|
|
|
# Free-text filter typed by the user; surrounding whitespace is discarded.
query = st.text_input(
    "Search text (case-insensitive, exact substring match)", ""
).strip()

# Shared projection for both the filtered and the unfiltered query.
select_sql = (
    "SELECT id, channel, video_id, speaker, start_time, end_time, "
    "upload_date, text, pos_tags, audio "
    "FROM data"
)

if query:
    # The search text is passed as a bound parameter, never spliced into the
    # SQL string, so quotes in the input cannot break or alter the statement.
    # contains() performs a literal substring test, which keeps '%' and '_'
    # in the input from acting as LIKE wildcards.
    sql = f"{select_sql} WHERE contains(LOWER(text), LOWER(?)) LIMIT 100"
    df = con.execute(sql, [query]).df()
else:
    # No filter: show the first 100 rows.
    df = con.execute(f"{select_sql} LIMIT 100").df()
|
|
|
# Tell the user how many rows the query returned.
st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")
    # Guard clause replacing the former ``else:`` nesting: halt this script
    # run so the table-building code that follows only executes when there is
    # at least one row to render.
    # NOTE(review): assumes nothing after the results table must still run
    # for an empty result -- confirm if the file grows.
    st.stop()
|
def get_audio_html(audio_bytes):
    """Return an inline HTML ``<audio>`` player for a raw audio payload.

    Args:
        audio_bytes: The audio data as ``bytes``, ``bytearray``,
            ``memoryview`` or a ``list`` that ``bytes()`` can convert.

    Returns:
        An ``<audio>`` element whose source is a base64 data URI built from
        the payload, or ``""`` when the value has any other type or the
        conversion raises.
    """
    try:
        if not isinstance(audio_bytes, (bytes, bytearray, memoryview, list)):
            return ""
        encoded = base64.b64encode(bytes(audio_bytes)).decode("utf-8")
        return (
            '<audio controls preload="metadata" style="height:20px;width:120px;">'
            f'<source src="data:audio/mp3;base64,{encoded}" type="audio/mpeg">'
            "</audio>"
        )
    except Exception:
        # Best effort: a cell that cannot be converted simply gets no player.
        return ""
|
|
|
# Swap the raw "audio" column for an "Audio" column holding one embeddable
# HTML player per row; pop() removes the raw column in the same step.
df["Audio"] = df.pop("audio").apply(get_audio_html)

# Streamlit's raw-HTML component, used further down to show the results table.
from streamlit.components.v1 import html
|
|
|
def display_table_with_audio(df):
    """Render *df* as an HTML ``<table>`` string with an audio column.

    Args:
        df: DataFrame containing the columns ``id``, ``channel``,
            ``video_id``, ``speaker``, ``start_time``, ``end_time``,
            ``upload_date``, ``text``, ``pos_tags`` and ``Audio``. The
            ``Audio`` cells are inserted verbatim and must already hold
            ready-to-embed HTML.

    Returns:
        A string with a single table: one header row followed by one body
        row per DataFrame row.

    Raises:
        KeyError: If any of the expected columns is missing from *df*.
    """
    # (column name, CSS width of its header cell), in display order.
    column_spec = (
        ("id", "5em"),
        ("channel", "6em"),
        ("video_id", "6em"),
        ("speaker", "6em"),
        ("start_time", "6em"),
        ("end_time", "6em"),
        ("upload_date", "6em"),
        ("text", "20em"),
        ("pos_tags", "8em"),
        ("Audio", "12em"),
    )
    columns = [name for name, _ in column_spec]

    parts = [
        "<table border='1' style='border-collapse:collapse;width:100%;font-size:13px;'>",
        "<thead>",
        "<tr>",
    ]
    parts.extend(
        f"<th style='width:{width};'>{name}</th>" for name, width in column_spec
    )
    parts.extend(["</tr>", "</thead>", "<tbody>"])

    for values in df[columns].itertuples(index=False, name=None):
        parts.append("<tr>")
        for name, value in zip(columns, values):
            # "Audio" is markup generated by this app and must stay raw.
            # Every other cell is data and is escaped so that characters
            # such as '<' or '&' cannot corrupt or inject HTML.
            cell = str(value) if name == "Audio" else escape(str(value))
            parts.append(f"<td>{cell}</td>")
        parts.append("</tr>")

    parts.append("</tbody></table>")
    # Join once instead of growing a string with repeated +=.
    return "\n".join(parts)
|
|
|
# Heading, then the hand-built table inside a fixed-height scrollable
# HTML component.
st.markdown("### Results Table (Sortable with Audio Column)")
table_markup = display_table_with_audio(df)
html(table_markup, height=900, scrolling=True)
|
|
|
|