Spaces:

stcoats
/

tspace

Sleeping

App Files Community

stcoats committed on Mar 13

Commit

f89e539

1 Parent(s): 02a2df0

Add application file

Browse files

Files changed (1) hide show

app.py +45 -46

app.py CHANGED Viewed

@@ -1,66 +1,65 @@
 import os
-import streamlit as st
 import duckdb
 import pandas as pd
 from huggingface_hub import hf_hub_download
-DB_PATH = "./ycsep.duckdb"
-REPO_ID = "stcoats/temp-duckdb-upload"
-FILENAME = "ycsep.duckdb"
 st.title("YCSEP Audio Dataset Viewer")
-# Step 1: Show storage status
-st.write("Checking persistent storage...")
-st.write(f"Expected DB location: `{DB_PATH}`")
-st.write(f"File exists: {os.path.exists(DB_PATH)}")
-# Step 2: Try downloading if needed
-if not os.path.exists(DB_PATH):
-    st.write("Database not found in persistent storage. Downloading from HF Hub...")
-    try:
-        path = hf_hub_download(
-            repo_id=REPO_ID,
-            repo_type="dataset",
-            filename=FILENAME,
-            local_dir=".",
-            local_dir_use_symlinks=False,
-        )
-        st.success(f"Downloaded to {path}")
-    except Exception as e:
-        st.error(f"Download failed: {e}")
-        st.stop()
-# Step 3: Try loading DB
-try:
-    st.write("Connecting to DuckDB...")
-    con = duckdb.connect(DB_PATH, read_only=True)
-    st.write("Reading table...")
-    df = con.execute("SELECT * FROM data").df()
-    st.success(f"Loaded {len(df)} rows.")
-except Exception as e:
-    st.error(f"DuckDB load failed: {e}")
-    st.stop()
-# Step 4: Proceed with app
 query = st.text_input("Search text or speaker")
 if query:
-    filtered_df = df[df["text"].str.contains(query, case=False, na=False) |
-                     df["speaker"].astype(str).str.contains(query, case=False, na=False)]
 else:
-    filtered_df = df
-rows_per_page = 10
-total_rows = len(filtered_df)
-total_pages = (total_rows - 1) // rows_per_page + 1
-page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
-start = (page - 1) * rows_per_page
-end = start + rows_per_page
-page_df = filtered_df.iloc[start:end]
-for _, row in page_df.iterrows():
     st.markdown(f"**Speaker:** {row['speaker']}")
     st.markdown(f"**Text:** {row['text']}")
     if isinstance(row['audio'], str) and row['audio'].startswith("http"):

 import os
 import duckdb
 import pandas as pd
+import streamlit as st
 from huggingface_hub import hf_hub_download
+import shutil
+HF_REPO_ID = "stcoats/temp-duckdb-upload"
+HF_FILENAME = "ycsep.duckdb"
+LOCAL_PATH = "./ycsep.duckdb"
 st.title("YCSEP Audio Dataset Viewer")
+# Download if needed
+if not os.path.exists(LOCAL_PATH):
+    st.write("Downloading from HF Hub...")
+    path = hf_hub_download(
+        repo_id=HF_REPO_ID,
+        repo_type="dataset",
+        filename=HF_FILENAME,
+        local_dir=".",
+        local_dir_use_symlinks=False
+    )
+    if path != LOCAL_PATH:
+        shutil.copyfile(path, LOCAL_PATH)
+# Connect
+con = duckdb.connect(LOCAL_PATH, read_only=True)
+# Get total row count (only once)
+total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
+rows_per_page = 10
+total_pages = (total_rows - 1) // rows_per_page + 1
+st.success(f"Total rows: {total_rows}")
+# Select page
+page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
+offset = (page - 1) * rows_per_page
+# Optional: add search filter
 query = st.text_input("Search text or speaker")
 if query:
+    query_sql = f"""
+    SELECT * FROM data
+    WHERE text ILIKE '%{query}%' OR CAST(speaker AS VARCHAR) ILIKE '%{query}%'
+    LIMIT {rows_per_page} OFFSET {offset}
+    """
+    count_sql = f"""
+    SELECT COUNT(*) FROM data
+    WHERE text ILIKE '%{query}%' OR CAST(speaker AS VARCHAR) ILIKE '%{query}%'
+    """
+    filtered_rows = con.execute(count_sql).fetchone()[0]
+    st.write(f"Filtered rows: {filtered_rows}")
 else:
+    query_sql = f"SELECT * FROM data LIMIT {rows_per_page} OFFSET {offset}"
+# Run query  only small chunk loaded now
+df_page = con.execute(query_sql).df()
+# Display
+for _, row in df_page.iterrows():
     st.markdown(f"**Speaker:** {row['speaker']}")
     st.markdown(f"**Text:** {row['text']}")
     if isinstance(row['audio'], str) and row['audio'].startswith("http"):