stcoats committed on
Commit
e85bb51
·
1 Parent(s): 0659f43

Add application file

Browse files
Files changed (1) hide show
  1. app.py +60 -20
app.py CHANGED
@@ -2,37 +2,77 @@ import os
2
  import duckdb
3
  import streamlit as st
4
  from huggingface_hub import hf_hub_download
 
 
5
 
6
  HF_REPO_ID = "stcoats/temp-duckdb-upload"
7
  HF_FILENAME = "ycsep.duckdb"
8
  LOCAL_PATH = "./ycsep.duckdb"
9
 
10
- st.title("YCSEP Audio Debug")
 
11
 
 
12
  if not os.path.exists(LOCAL_PATH):
13
- st.write("Downloading DB...")
14
  hf_hub_download(
15
  repo_id=HF_REPO_ID,
16
  repo_type="dataset",
17
  filename=HF_FILENAME,
18
  local_dir="."
19
  )
20
- st.success("Downloaded.")
21
-
22
- con = duckdb.connect(LOCAL_PATH, read_only=True)
23
-
24
- st.markdown("### Raw Preview")
25
- df = con.execute("SELECT speaker, text, audio FROM data LIMIT 5").fetchall()
26
-
27
- for row in df:
28
- speaker, text, audio = row
29
- st.write(f"Speaker: {speaker}")
30
- st.write(f"Text: {text}")
31
- st.write(f"AUDIO TYPE: {type(audio)}")
32
- st.write(f"AUDIO LENGTH: {len(audio) if hasattr(audio, '__len__') else 'n/a'}")
33
- if isinstance(audio, (bytes, bytearray, memoryview)):
34
- st.audio(audio, format="audio/mp3")
35
- else:
36
- st.warning("Audio not valid binary")
37
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
2
  import duckdb
3
  import streamlit as st
4
  from huggingface_hub import hf_hub_download
5
+ import pandas as pd
6
+ import tempfile
7
 
# Streamlit viewer for the YCSEP audio dataset stored in a DuckDB file.
#
# The script makes sure the database file exists locally (downloading it from
# the Hugging Face Hub if needed), opens it read-only, runs an optional
# case-insensitive substring search over the speaker and text columns, and
# renders up to 100 rows with an inline audio player per row.

# Hugging Face dataset repo and filename of the database, and the local path
# the app opens it from.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"


def _to_audio_bytes(audio_data) -> "bytes | None":
    """Return *audio_data* as ``bytes``, or ``None`` if it is not binary-like.

    Bytes-like objects and lists (the query result may deliver the audio
    column as a list of ints) are converted with ``bytes()``; any other type
    yields ``None``.

    Raises:
        TypeError, ValueError: if a list holds values that are not valid
            byte values (propagated from ``bytes()``).
    """
    if isinstance(audio_data, (bytes, bytearray, memoryview, list)):
        return bytes(audio_data)
    return None


st.set_page_config(layout="wide")
st.title("YCSEP Audio Dataset Viewer")

# Download database if missing
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
    )
    st.success("Download complete.")

# Connect; without a connection nothing below can work, so stop the script
# run after reporting the failure.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
except Exception as e:
    st.error(f"DuckDB connection failed: {e}")
    st.stop()
st.success("Connected to DuckDB.")

# Search
query = st.text_input("Search text or speaker", "").strip().lower()

if query:
    # The user input is bound as a parameter, never interpolated into the SQL.
    pattern = f"%{query}%"
    sql = """
        SELECT speaker, text, audio
        FROM data
        WHERE LOWER(CAST(speaker AS VARCHAR)) LIKE ? OR LOWER(CAST(text AS VARCHAR)) LIKE ?
        LIMIT 100
    """
    df = con.execute(sql, [pattern, pattern]).df()
else:
    df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()

st.markdown(f"### Showing {len(df)} results")

if df.empty:
    st.warning("No matches found.")

# Show table with inline audio players
for _, row in df.iterrows():
    col1, col2, col3 = st.columns([2, 5, 3])
    col1.markdown(f"**{row['speaker']}**")
    col2.markdown(row["text"])

    # Best-effort per row: a bad audio value is reported in its own cell
    # instead of aborting the whole page.
    try:
        audio_bytes = _to_audio_bytes(row["audio"])
        if audio_bytes:
            # Hand the bytes straight to the player. Writing each clip to a
            # NamedTemporaryFile(delete=False) left one orphaned file per row
            # on every rerun of the script.
            col3.audio(audio_bytes, format="audio/mp3")
        else:
            col3.warning("Audio missing or invalid format.")
    except Exception as e:
        col3.error(f"Audio error: {e}")
78