stcoats committed on
Commit
4cf6559
·
1 Parent(s): af343b5

Add application file

Browse files
Files changed (1) hide show
  1. app.py +47 -27
app.py CHANGED
@@ -2,52 +2,72 @@ import os
2
  import duckdb
3
  import streamlit as st
4
  from huggingface_hub import hf_hub_download
 
 
5
 
6
# Where the DuckDB database lives on the Hugging Face Hub, and where the
# local copy is expected.
HF_REPO_ID = "stcoats/temp-duckdb-upload"  # Replace with your actual dataset repo if needed
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Make sure a local copy of the database exists, fetching it when missing.
if os.path.exists(LOCAL_PATH):
    st.write("Found local DuckDB file.")
else:
    st.write("Database not found locally. Downloading from HF Hub...")
    try:
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",  # Saves as ./ycsep.duckdb automatically
        )
        st.success(f"Downloaded: {downloaded_path}")
    except Exception as download_error:
        # Nothing below can run without the database, so report and halt.
        st.error(f"Download failed: {download_error}")
        st.stop()

# Open the database read-only; the viewer never writes to it.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as connect_error:
    st.error(f"Failed to connect to DuckDB: {connect_error}")
    st.stop()

# Only a small, fixed-size page is fetched rather than the whole table.
st.write("Querying first 10 rows...")

try:
    preview_rows = con.execute(
        "SELECT speaker, text, audio FROM data LIMIT 10"
    ).fetchall()
    for speaker, text, audio in preview_rows:
        st.markdown(f"**Speaker:** {speaker}")
        st.markdown(f"**Text:** {text}")
        # Audio is only played when the column holds an http(s) URL string.
        has_audio_url = isinstance(audio, str) and audio.startswith("http")
        if not has_audio_url:
            st.warning("Audio not available")
        else:
            st.audio(audio, format="audio/mp3")
        st.markdown("---")
except Exception as query_error:
    st.error(f"DuckDB query failed: {query_error}")
53
 
 
2
  import duckdb
3
  import streamlit as st
4
  from huggingface_hub import hf_hub_download
5
+ import pandas as pd
6
+ import tempfile
7
 
8
# Where the DuckDB database lives on the Hugging Face Hub, and where the
# local copy is expected.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download the database only when no local copy exists yet.
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    try:
        downloaded_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",
        )
    except Exception as e:
        # Nothing below can run without the database, so report the failure
        # in the page and halt instead of surfacing a raw traceback.
        st.error(f"Download failed: {e}")
        st.stop()
    else:
        st.success(f"Downloaded: {downloaded_path}")
else:
    st.write("DuckDB file already exists.")

# Open the database read-only; the viewer never writes to it.
try:
    con = duckdb.connect(LOCAL_PATH, read_only=True)
    st.success("Connected to DuckDB.")
except Exception as e:
    st.error(f"Failed to connect: {e}")
    st.stop()

query = st.text_input("Search text or speaker (case-insensitive)")

if query:
    # The search term is user input: bind it as a parameter instead of
    # formatting it into the SQL text, so quotes in the term can neither
    # break the statement nor inject SQL. LIKE wildcards (% and _) typed by
    # the user are still interpreted as wildcards.
    like_pattern = f"%{query.lower()}%"
    query_sql = """
        SELECT speaker, text, audio FROM data
        WHERE LOWER(text) LIKE ?
           OR LOWER(speaker) LIKE ?
        LIMIT 25
    """
    query_params = [like_pattern, like_pattern]
else:
    query_sql = "SELECT speaker, text, audio FROM data LIMIT 25"
    query_params = None

try:
    df = con.execute(query_sql, query_params).df()
    st.write(f"Showing {len(df)} results")

    # Display text/speaker table
    st.dataframe(df[['speaker', 'text']], use_container_width=True)

    st.write("---")
    st.markdown("### Audio Previews:")

    for _, row in df.iterrows():
        st.markdown(f"**Speaker:** {row['speaker']}")
        st.markdown(f"{row['text']}")

        # Hand the raw bytes straight to the audio widget. Writing each clip
        # to a NamedTemporaryFile(delete=False) left one orphaned file per
        # row on every rerun of the script.
        audio_bytes = row['audio']
        if isinstance(audio_bytes, (bytes, bytearray)):
            st.audio(bytes(audio_bytes), format="audio/mp3")
        else:
            st.warning("Audio not available or in unexpected format.")

        st.markdown("---")

except Exception as e:
    st.error(f"DuckDB query failed: {e}")
73