stcoats committed on
Commit
f89e539
·
1 Parent(s): 02a2df0

Add application file

Browse files
Files changed (1) hide show
  1. app.py +45 -46
app.py CHANGED
@@ -1,66 +1,65 @@
1
  import os
2
- import streamlit as st
3
  import duckdb
4
  import pandas as pd
 
5
  from huggingface_hub import hf_hub_download
 
6
 
7
# Location of the DuckDB file on local storage and where to fetch it from.
DB_PATH = "./ycsep.duckdb"
REPO_ID = "stcoats/temp-duckdb-upload"
FILENAME = "ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Step 1: Show storage status
st.write("Checking persistent storage...")
st.write(f"Expected DB location: `{DB_PATH}`")
st.write(f"File exists: {os.path.exists(DB_PATH)}")

# Step 2: Download the database from the HF Hub if it is not present locally
if not os.path.exists(DB_PATH):
    st.write("Database not found in persistent storage. Downloading from HF Hub...")
    try:
        path = hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            filename=FILENAME,
            local_dir=".",
            local_dir_use_symlinks=False,
        )
        st.success(f"Downloaded to {path}")
    except Exception as e:
        # Top-level UI boundary: report the failure and halt this script run.
        st.error(f"Download failed: {e}")
        st.stop()

# Step 3: Open the database read-only and load the whole `data` table
try:
    st.write("Connecting to DuckDB...")
    con = duckdb.connect(DB_PATH, read_only=True)
    st.write("Reading table...")
    df = con.execute("SELECT * FROM data").df()
    st.success(f"Loaded {len(df)} rows.")
except Exception as e:
    st.error(f"DuckDB load failed: {e}")
    st.stop()

# Step 4: Filter and paginate
query = st.text_input("Search text or speaker")

if query:
    # regex=False: treat the user's input as a literal substring. With the
    # default regex=True, input such as "(" raises re.error and "." matches
    # any character.
    text_hit = df["text"].str.contains(query, case=False, na=False, regex=False)
    speaker_hit = df["speaker"].astype(str).str.contains(
        query, case=False, na=False, regex=False
    )
    filtered_df = df[text_hit | speaker_hit]
else:
    filtered_df = df

rows_per_page = 10
total_rows = len(filtered_df)
# Always offer at least one page: with zero matching rows the ceiling
# division yields 0, which would make max_value smaller than min_value.
total_pages = max(1, (total_rows - 1) // rows_per_page + 1)
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)

start = (page - 1) * rows_per_page
end = start + rows_per_page
page_df = filtered_df.iloc[start:end]
62
 
63
- for _, row in page_df.iterrows():
 
64
  st.markdown(f"**Speaker:** {row['speaker']}")
65
  st.markdown(f"**Text:** {row['text']}")
66
  if isinstance(row['audio'], str) and row['audio'].startswith("http"):
 
1
  import os
 
2
  import duckdb
3
  import pandas as pd
4
+ import streamlit as st
5
  from huggingface_hub import hf_hub_download
6
+ import shutil
7
 
8
# Where the DuckDB file lives on the HF Hub and where it is kept locally.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download if needed
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )
    # Compare resolved paths rather than raw strings: the returned path may
    # be spelled differently (e.g. absolute) while naming the same file, and
    # shutil.copyfile raises SameFileError when source and destination match.
    if os.path.realpath(path) != os.path.realpath(LOCAL_PATH):
        shutil.copyfile(path, LOCAL_PATH)

# Connect
con = duckdb.connect(LOCAL_PATH, read_only=True)

# The search box is read before the page selector so that the number of
# pages can be derived from the rows that actually match the filter.
query = st.text_input("Search text or speaker")

# Total row count (re-queried on every script run)
total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
rows_per_page = 10
st.success(f"Total rows: {total_rows}")

if query:
    # User input is bound as a parameter, never interpolated into the SQL
    # text, so quotes in the search term cannot break or alter the query.
    # NOTE: "%" and "_" typed by the user still act as LIKE wildcards.
    where_sql = " WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?"
    like_pattern = f"%{query}%"
    params = [like_pattern, like_pattern]
    filtered_rows = con.execute(
        f"SELECT COUNT(*) FROM data{where_sql}", params
    ).fetchone()[0]
    st.write(f"Filtered rows: {filtered_rows}")
    matching_rows = filtered_rows
else:
    where_sql = ""
    params = []
    matching_rows = total_rows

# Select page. Always offer at least one page: with zero matching rows the
# ceiling division yields 0, which would make max_value smaller than min_value.
total_pages = max(1, (matching_rows - 1) // rows_per_page + 1)
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
offset = (page - 1) * rows_per_page

# Run query — only one page of rows is loaded.
# rows_per_page and offset are integers computed above, not user text.
# NOTE(review): there is no ORDER BY, so page contents rely on the table's
# scan order — consider ordering by a stable key if one exists.
query_sql = f"SELECT * FROM data{where_sql} LIMIT {rows_per_page} OFFSET {offset}"
df_page = con.execute(query_sql, params).df()
 
60
 
61
+ # Display
62
+ for _, row in df_page.iterrows():
63
  st.markdown(f"**Speaker:** {row['speaker']}")
64
  st.markdown(f"**Text:** {row['text']}")
65
  if isinstance(row['audio'], str) and row['audio'].startswith("http"):