stcoats committed on
Commit
b5ff3a4
·
1 Parent(s): 548bf4c

Add application file

Browse files
Files changed (1) hide show
  1. app.py +69 -1
app.py CHANGED
@@ -1,3 +1,71 @@
 
1
  import streamlit as st
2
- st.title("If you see this, the Space works")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
 
"""Streamlit viewer for the YCSEP audio dataset.

The script makes sure the DuckDB file is present at ``DB_PATH`` (downloading
it from the Hugging Face Hub when it is missing), loads the ``data`` table
into a DataFrame, and renders a searchable, paginated list of rows with
speaker, text and an audio player.
"""

import os
import streamlit as st
import duckdb
import pandas as pd
from huggingface_hub import hf_hub_download

DB_PATH = "/data/ycsep.duckdb"
REPO_ID = "stcoats/temp-duckdb-upload"
FILENAME = "ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Step 1: Show storage status
st.write("Checking persistent storage...")
st.write(f"Expected DB location: `{DB_PATH}`")
st.write(f"File exists: {os.path.exists(DB_PATH)}")

# Step 2: Try downloading if needed
if not os.path.exists(DB_PATH):
    st.write("Database not found in persistent storage. Downloading from HF Hub...")
    try:
        # NOTE(review): `local_dir_use_symlinks` appears to be deprecated in
        # recent huggingface_hub releases -- confirm against the pinned
        # version before removing it.
        path = hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            filename=FILENAME,
            local_dir="/data",
            local_dir_use_symlinks=False,
        )
        st.success(f"Downloaded to {path}")
    except Exception as e:
        # Top-level UI boundary: report the failure and halt this script run.
        st.error(f"Download failed: {e}")
        st.stop()

# Step 3: Try loading DB
con = None
try:
    st.write("Connecting to DuckDB...")
    con = duckdb.connect(DB_PATH, read_only=True)
    st.write("Reading table...")
    df = con.execute("SELECT * FROM data").df()
    st.success(f"Loaded {len(df)} rows.")
except Exception as e:
    st.error(f"DuckDB load failed: {e}")
    st.stop()
finally:
    # The whole table is now in memory; release the database handle instead
    # of leaking one connection per script rerun.
    if con is not None:
        con.close()

# Step 4: Proceed with app
query = st.text_input("Search text or speaker")

if query:
    # regex=False: treat the user's input as a literal substring so that
    # characters such as "(" or "*" cannot raise re.error or act as wildcards.
    text_hit = df["text"].str.contains(query, case=False, na=False, regex=False)
    speaker_hit = df["speaker"].astype(str).str.contains(
        query, case=False, na=False, regex=False
    )
    filtered_df = df[text_hit | speaker_hit]
else:
    filtered_df = df

rows_per_page = 10
total_rows = len(filtered_df)
# Ceiling division, clamped to at least one page: with zero matching rows the
# unclamped value is 0, which would make max_value < min_value below and
# cause st.number_input to raise.
total_pages = max(1, (total_rows - 1) // rows_per_page + 1)
page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)

start = (page - 1) * rows_per_page
end = start + rows_per_page
page_df = filtered_df.iloc[start:end]

for _, row in page_df.iterrows():
    st.markdown(f"**Speaker:** {row['speaker']}")
    st.markdown(f"**Text:** {row['text']}")
    # Only string values that look like URLs are handed to the audio player.
    if isinstance(row['audio'], str) and row['audio'].startswith("http"):
        st.audio(row['audio'], format="audio/mp3")
    else:
        st.warning("Audio not available")
    st.markdown("---")