Spaces:

stcoats
/

tspace

Sleeping

App Files Files Community

stcoats committed on Mar 13

Commit

af343b5

1 Parent(s): f89e539

Add application file

Browse files

Files changed (1) hide show

app.py +40 -57

app.py CHANGED Viewed

@@ -1,70 +1,53 @@
 import os
 import duckdb
-import pandas as pd
 import streamlit as st
 from huggingface_hub import hf_hub_download
-import shutil
# Streamlit viewer for the YCSEP DuckDB file: downloads the database from the
# Hugging Face Hub when it is missing locally, then shows one page of rows,
# optionally filtered by a search string.
HF_REPO_ID = "stcoats/temp-duckdb-upload"
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"

st.title("YCSEP Audio Dataset Viewer")

# Download if needed
if not os.path.exists(LOCAL_PATH):
    st.write("Downloading from HF Hub...")
    path = hf_hub_download(
        repo_id=HF_REPO_ID,
        repo_type="dataset",
        filename=HF_FILENAME,
        local_dir=".",
        local_dir_use_symlinks=False
    )
    # Compare resolved paths rather than raw strings: the returned path may be
    # spelled differently from LOCAL_PATH while naming the same file, and
    # shutil.copyfile raises SameFileError when source and destination match.
    if os.path.realpath(path) != os.path.realpath(LOCAL_PATH):
        shutil.copyfile(path, LOCAL_PATH)

# Connect read-only; the connection is released in the `finally` below so that
# repeated script runs do not accumulate open handles.
con = duckdb.connect(LOCAL_PATH, read_only=True)
try:
    # Get total row count
    total_rows = con.execute("SELECT COUNT(*) FROM data").fetchone()[0]
    rows_per_page = 10
    # Clamp to at least one page: an empty table would otherwise yield 0 and
    # make max_value smaller than min_value in the page selector.
    total_pages = max(1, (total_rows - 1) // rows_per_page + 1)
    st.success(f"Total rows: {total_rows}")

    # Select page
    # NOTE(review): the page range is derived from the unfiltered row count, so
    # a filtered search can land on an empty page.
    page = st.number_input("Page", min_value=1, max_value=total_pages, value=1)
    offset = (int(page) - 1) * rows_per_page

    # Optional search filter. The user text is bound as a query parameter
    # instead of being pasted into the SQL string.
    query = st.text_input("Search text or speaker")
    if query:
        pattern = f"%{query}%"
        where_clause = "WHERE text ILIKE ? OR CAST(speaker AS VARCHAR) ILIKE ?"
        params = [pattern, pattern]
        filtered_rows = con.execute(
            f"SELECT COUNT(*) FROM data {where_clause}", params
        ).fetchone()[0]
        st.write(f"Filtered rows: {filtered_rows}")
    else:
        where_clause = ""
        params = []

    # LIMIT/OFFSET are integers computed above, never user-supplied text.
    query_sql = (
        f"SELECT * FROM data {where_clause} "
        f"LIMIT {int(rows_per_page)} OFFSET {int(offset)}"
    )

    # Run query: only one page of rows is loaded
    df_page = con.execute(query_sql, params).df()
finally:
    con.close()

# Display
for _, row in df_page.iterrows():
    st.markdown(f"**Speaker:** {row['speaker']}")
    st.markdown(f"**Text:** {row['text']}")
    # Only string values that look like URLs are handed to the audio player.
    if isinstance(row['audio'], str) and row['audio'].startswith("http"):
        st.audio(row['audio'], format="audio/mp3")
    else:
        st.warning("Audio not available")
    st.markdown("---")

 import os
 import duckdb
 import streamlit as st
 from huggingface_hub import hf_hub_download
# Streamlit viewer for the YCSEP DuckDB file: fetches the database from the
# Hugging Face Hub when it is missing locally, then previews the first rows.
HF_REPO_ID = "stcoats/temp-duckdb-upload"   # Replace with your actual dataset repo if needed
HF_FILENAME = "ycsep.duckdb"
LOCAL_PATH = "./ycsep.duckdb"
PREVIEW_ROWS = 10  # number of rows shown in the preview

st.title("YCSEP Audio Dataset Viewer")

# Path the connection is opened on. After a download this is the path the Hub
# client actually returned, rather than an assumption about where it saved.
db_path = LOCAL_PATH

if os.path.exists(LOCAL_PATH):
    st.write("Found local DuckDB file.")
else:
    st.write("Database not found locally. Downloading from HF Hub...")
    try:
        db_path = hf_hub_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            filename=HF_FILENAME,
            local_dir=".",
        )
    except Exception as e:
        # Top-level UI boundary: report the failure and halt this script run.
        st.error(f"Download failed: {e}")
        st.stop()
    st.success(f"Downloaded: {db_path}")

# Try connecting to the DB
try:
    con = duckdb.connect(db_path, read_only=True)
except Exception as e:
    st.error(f"Failed to connect to DuckDB: {e}")
    st.stop()
st.success("Connected to DuckDB.")

# Query only the preview rows instead of loading the whole table
st.write(f"Querying first {PREVIEW_ROWS} rows...")
try:
    rows = con.execute(
        # PREVIEW_ROWS is an integer constant defined above, not user input.
        f"SELECT speaker, text, audio FROM data LIMIT {PREVIEW_ROWS:d}"
    ).fetchall()
except Exception as e:
    st.error(f"DuckDB query failed: {e}")
    rows = []
finally:
    # Release the connection on every run, whether or not the query succeeded.
    con.close()

# Rendering happens outside the query `try` so that a display problem is not
# misreported as a database failure.
for speaker, text, audio in rows:
    st.markdown(f"**Speaker:** {speaker}")
    st.markdown(f"**Text:** {text}")
    # Only string values that look like URLs are handed to the audio player.
    if isinstance(audio, str) and audio.startswith("http"):
        st.audio(audio, format="audio/mp3")
    else:
        st.warning("Audio not available")
    st.markdown("---")