Spaces:
Running
Running
File size: 2,941 Bytes
43b66f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import streamlit as st
import pandas as pd
import os
from huggingface_hub import HfApi, list_datasets
from datasets import load_dataset
@st.cache_data(ttl=3600)
def search_huggingface_datasets(query, limit=20):
    """
    Search for datasets on Hugging Face Hub.

    Args:
        query: Free-text search string matched against dataset names/descriptions
        limit: Maximum number of results to return

    Returns:
        List of dicts with keys 'id', 'name', 'description', 'author',
        'tags', 'downloads'; an empty list if the search fails.
    """
    try:
        # Use `search=` for free-text queries. `filter=` expects tag/filter
        # values, so passing a plain query string there silently returns
        # the wrong (usually empty) result set.
        datasets = list_datasets(search=query, limit=limit)

        # Convert to list of dicts with relevant info
        results = []
        for dataset in datasets:
            # DatasetInfo fields can be missing or None depending on the
            # huggingface_hub version, so read them defensively.
            results.append({
                'id': dataset.id,
                'name': dataset.id.split('/')[-1],
                'description': getattr(dataset, 'description', None) or "No description available",
                'author': getattr(dataset, 'author', None) or "Unknown",
                'tags': getattr(dataset, 'tags', []),
                'downloads': getattr(dataset, 'downloads', 0),
            })
        return results
    except Exception as e:
        st.error(f"Error searching Hugging Face Hub: {str(e)}")
        return []
@st.cache_data(ttl=3600)
def load_huggingface_dataset(dataset_id, split='train'):
    """
    Load a dataset from Hugging Face Hub as a pandas DataFrame.

    Args:
        dataset_id: ID of the dataset on HF Hub (e.g., 'mnist', 'glue', etc.)
        split: Dataset split to load (e.g., 'train', 'test', 'validation')

    Returns:
        Pandas DataFrame containing the requested split.

    Raises:
        Exception: re-raised after reporting via st.error when loading fails.
    """
    try:
        # Fetch the split and hand it back as a DataFrame in one step.
        return load_dataset(dataset_id, split=split).to_pandas()
    except Exception as e:
        st.error(f"Error loading dataset '{dataset_id}': {str(e)}")
        raise
def upload_to_huggingface(dataset, dataset_name, token=None):
    """
    Upload a pandas DataFrame to Hugging Face Hub as a dataset.

    Args:
        dataset: Pandas DataFrame to upload
        dataset_name: Name for the dataset repository on the Hub
        token: Hugging Face API token (optional, will use environment variable if not provided)

    Returns:
        URL to the uploaded dataset.

    Raises:
        ValueError: if no token is given and HF_TOKEN is unset/empty.
        Exception: re-raised after reporting via st.error when the push fails.
    """
    # Fall back to the HF_TOKEN environment variable when no token is passed.
    token = token if token is not None else os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("No Hugging Face token provided. Set the HF_TOKEN environment variable or pass a token.")

    try:
        # Local import: `datasets.Dataset` is only needed for the upload path.
        from datasets import Dataset

        hub_dataset = Dataset.from_pandas(dataset)
        push_result = hub_dataset.push_to_hub(dataset_name, token=token)
        # NOTE(review): assumes push_to_hub returns an object exposing
        # `repo_id` (CommitInfo in recent `datasets` releases) — confirm
        # against the pinned library version.
        return f"https://huggingface.co/datasets/{push_result.repo_id}"
    except Exception as e:
        st.error(f"Error uploading to Hugging Face Hub: {str(e)}")
        raise
|