stcoats committed on
Commit
416c906
·
1 Parent(s): 4cf6559

Add application file

Browse files
Files changed (1) hide show
  1. app.py +40 -52
app.py CHANGED
@@ -5,69 +5,57 @@ from huggingface_hub import hf_hub_download
5
  import pandas as pd
6
  import tempfile
7
 
8
- HF_REPO_ID = "stcoats/temp-duckdb-upload"
 
9
  HF_FILENAME = "ycsep.duckdb"
10
  LOCAL_PATH = "./ycsep.duckdb"
11
 
 
12
  st.title("YCSEP Audio Dataset Viewer")
13
 
 
14
  if not os.path.exists(LOCAL_PATH):
15
- st.write("Downloading from HF Hub...")
16
- downloaded_path = hf_hub_download(
17
  repo_id=HF_REPO_ID,
18
  repo_type="dataset",
19
  filename=HF_FILENAME,
20
  local_dir="."
21
  )
22
- st.success(f"Downloaded: {downloaded_path}")
23
- else:
24
- st.write("DuckDB file already exists.")
25
-
26
- try:
27
- con = duckdb.connect(LOCAL_PATH, read_only=True)
28
- st.success("Connected to DuckDB.")
29
- except Exception as e:
30
- st.error(f"Failed to connect: {e}")
31
- st.stop()
32
-
33
- query = st.text_input("Search text or speaker (case-insensitive)")
34
-
35
- if query:
36
- query_sql = f"""
37
- SELECT speaker, text, audio FROM data
38
- WHERE LOWER(text) LIKE '%{query.lower()}%'
39
- OR LOWER(speaker) LIKE '%{query.lower()}%'
40
- LIMIT 25
41
  """
 
42
  else:
43
- query_sql = "SELECT speaker, text, audio FROM data LIMIT 25"
44
-
45
- try:
46
- df = con.execute(query_sql).df()
47
- st.write(f"Showing {len(df)} results")
48
-
49
- # Display text/speaker table
50
- st.dataframe(df[['speaker', 'text']], use_container_width=True)
51
-
52
- st.write("---")
53
- st.markdown("### Audio Previews:")
54
-
55
- for i, row in df.iterrows():
56
- st.markdown(f"**Speaker:** {row['speaker']}")
57
- st.markdown(f"{row['text']}")
58
-
59
- # Write audio bytes to temp file
60
- audio_bytes = row['audio']
61
- if isinstance(audio_bytes, (bytes, bytearray)):
62
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_audio:
63
- tmp_audio.write(audio_bytes)
64
- tmp_audio_path = tmp_audio.name
65
- st.audio(tmp_audio_path, format="audio/mp3")
66
- else:
67
- st.warning("Audio not available or in unexpected format.")
68
-
69
- st.markdown("---")
70
-
71
- except Exception as e:
72
- st.error(f"DuckDB query failed: {e}")
73
 
 
5
  import pandas as pd
6
  import tempfile
7
 
8
+ # --- Config ---
9
+ HF_REPO_ID = "stcoats/temp-duckdb-upload" # Change if needed
10
  HF_FILENAME = "ycsep.duckdb"
11
  LOCAL_PATH = "./ycsep.duckdb"
12
 
13
+ st.set_page_config(layout="wide")
14
  st.title("YCSEP Audio Dataset Viewer")
15
 
16
+ # --- Download database if not present ---
17
  if not os.path.exists(LOCAL_PATH):
18
+ st.write("Downloading ycsep.duckdb from HF Hub...")
19
+ hf_hub_download(
20
  repo_id=HF_REPO_ID,
21
  repo_type="dataset",
22
  filename=HF_FILENAME,
23
  local_dir="."
24
  )
25
+ st.success("Download complete.")
26
+
27
+ # --- Connect to DuckDB ---
28
+ con = duckdb.connect(LOCAL_PATH, read_only=True)
29
+
30
+ # --- Search input ---
31
+ query = st.text_input("Search text or speaker (case-insensitive)", "")
32
+
33
+ # --- Execute query ---
34
+ if query.strip():
35
+ search_term = f"%{query.strip().lower()}%"
36
+ sql = """
37
+ SELECT speaker, text, audio
38
+ FROM data
39
+ WHERE LOWER(speaker) LIKE ? OR LOWER(text) LIKE ?
40
+ LIMIT 100
 
 
 
41
  """
42
+ df = con.execute(sql, [search_term, search_term]).df()
43
  else:
44
+ df = con.execute("SELECT speaker, text, audio FROM data LIMIT 100").df()
45
+
46
+ # --- Show results ---
47
+ st.markdown("### Search Results")
48
+ for idx, row in df.iterrows():
49
+ col1, col2, col3 = st.columns([2, 5, 3])
50
+ col1.markdown(f"**{row['speaker']}**")
51
+ col2.markdown(row['text'])
52
+
53
+ audio_data = row['audio']
54
+ if isinstance(audio_data, (bytes, bytearray)):
55
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
56
+ tmp.write(audio_data)
57
+ tmp.flush()
58
+ col3.audio(tmp.name, format="audio/mp3")
59
+ else:
60
+ col3.warning("Audio not available or invalid format.")
 
 
 
 
 
 
 
 
 
 
 
 
 
61