stcoats committed on
Commit
af343b5
·
1 Parent(s): f89e539

Add application file

Browse files
Files changed (1) hide show
  1. app.py +40 -57
app.py CHANGED
@@ -1,70 +1,53 @@
1
  import os
2
  import duckdb
3
- import pandas as pd
4
  import streamlit as st
5
  from huggingface_hub import hf_hub_download
6
- import shutil
7
 
8
- HF_REPO_ID = "stcoats/temp-duckdb-upload"
9
  HF_FILENAME = "ycsep.duckdb"
10
  LOCAL_PATH = "./ycsep.duckdb"
11
 
12
  st.title("YCSEP Audio Dataset Viewer")
13
 
14
- # Download if needed
15
  if not os.path.exists(LOCAL_PATH):
16
- st.write("Downloading from HF Hub...")
17
- path = hf_hub_download(
18
- repo_id=HF_REPO_ID,
19
- repo_type="dataset",
20
- filename=HF_FILENAME,
21
- local_dir=".",
22
- local_dir_use_symlinks=False
23
- )
24
- if path != LOCAL_PATH:
25
- shutil.copyfile(path, LOCAL_PATH)
26
-
27
- # Connect
28
- con = duckdb.connect(LOCAL_PATH, read_only=True)
29
-
30
- # Get total row count (only once)
31
- total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
32
- rows_per_page = 10
33
- total_pages = (total_rows - 1) // rows_per_page + 1
34
- st.success(f"Total rows: {total_rows}")
35
-
36
- # Select page
37
- page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
38
- offset = (page - 1) * rows_per_page
39
-
40
- # Optional: add search filter
41
- query = st.text_input("Search text or speaker")
42
-
43
- if query:
44
- query_sql = f"""
45
- SELECT * FROM data
46
- WHERE text ILIKE '%{query}%' OR CAST(speaker AS VARCHAR) ILIKE '%{query}%'
47
- LIMIT {rows_per_page} OFFSET {offset}
48
- """
49
- count_sql = f"""
50
- SELECT COUNT(*) FROM data
51
- WHERE text ILIKE '%{query}%' OR CAST(speaker AS VARCHAR) ILIKE '%{query}%'
52
- """
53
- filtered_rows = con.execute(count_sql).fetchone()[0]
54
- st.write(f"Filtered rows: {filtered_rows}")
55
  else:
56
- query_sql = f"SELECT * FROM data LIMIT {rows_per_page} OFFSET {offset}"
57
-
58
- # Run query — only small chunk loaded now
59
- df_page = con.execute(query_sql).df()
60
-
61
- # Display
62
- for _, row in df_page.iterrows():
63
- st.markdown(f"**Speaker:** {row['speaker']}")
64
- st.markdown(f"**Text:** {row['text']}")
65
- if isinstance(row['audio'], str) and row['audio'].startswith("http"):
66
- st.audio(row['audio'], format="audio/mp3")
67
- else:
68
- st.warning("Audio not available")
69
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
70
 
 
1
  import os
2
  import duckdb
 
3
  import streamlit as st
4
  from huggingface_hub import hf_hub_download
 
5
 
6
+ HF_REPO_ID = "stcoats/temp-duckdb-upload" # Replace with your actual dataset repo if needed
7
  HF_FILENAME = "ycsep.duckdb"
8
  LOCAL_PATH = "./ycsep.duckdb"
9
 
10
  st.title("YCSEP Audio Dataset Viewer")
11
 
12
+ # Check if file exists
13
  if not os.path.exists(LOCAL_PATH):
14
+ st.write("Database not found locally. Downloading from HF Hub...")
15
+ try:
16
+ downloaded_path = hf_hub_download(
17
+ repo_id=HF_REPO_ID,
18
+ repo_type="dataset",
19
+ filename=HF_FILENAME,
20
+ local_dir="." # Saves as ./ycsep.duckdb automatically
21
+ )
22
+ st.success(f"Downloaded: {downloaded_path}")
23
+ except Exception as e:
24
+ st.error(f"Download failed: {e}")
25
+ st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  else:
27
+ st.write("Found local DuckDB file.")
28
+
29
+ # Try connecting to the DB
30
+ try:
31
+ con = duckdb.connect(LOCAL_PATH, read_only=True)
32
+ st.success("Connected to DuckDB.")
33
+ except Exception as e:
34
+ st.error(f"Failed to connect to DuckDB: {e}")
35
+ st.stop()
36
+
37
+ # Query first page without loading everything into memory
38
+ st.write("Querying first 10 rows...")
39
+
40
+ try:
41
+ cursor = con.execute("SELECT speaker, text, audio FROM data LIMIT 10")
42
+ rows = cursor.fetchall()
43
+ for speaker, text, audio in rows:
44
+ st.markdown(f"**Speaker:** {speaker}")
45
+ st.markdown(f"**Text:** {text}")
46
+ if isinstance(audio, str) and audio.startswith("http"):
47
+ st.audio(audio, format="audio/mp3")
48
+ else:
49
+ st.warning("Audio not available")
50
+ st.markdown("---")
51
+ except Exception as e:
52
+ st.error(f"DuckDB query failed: {e}")
53