Spaces:
Update app.py
app.py
CHANGED
@@ -1,72 +1,157 @@
 import os
-import
-import
 import logging
-import
 import shodan
-import asyncio
 import aiohttp
-import
-import
-from
-from

-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(),
-        logging.FileHandler("ollama_scanner.log")
-    ]
 )
 logger = logging.getLogger(__name__)

-def load_or_create_dataset():
     """
-
     Returns:
-
     Raises:
-        ValueError:
-        ConnectionError: When network issues prevent dataset access
     """
-
     try:
-        #
-
-        dataset = datasets.load_dataset(dataset_id, token=hf_token)
-    except TypeError:
-        # Fallback: Use legacy authentication parameter
-        logger.info("Attempting legacy authentication method")
-        dataset = datasets.load_dataset(dataset_id, use_auth_token=hf_token)

-
-        else:
-            # Handle direct Dataset object (no splits)
-            return dataset
-
     except FileNotFoundError:
-
             "ip": [],
             "port": [],
             "country": [],
@@ -74,728 +159,450 @@ def load_or_create_dataset():
             "org": [],
             "models": []
         })

-
-        dataset = datasets.load_dataset(dataset_id, token=hf_token)
-    except TypeError:
-        dataset = datasets.load_dataset(dataset_id, use_auth_token=hf_token)
-
-        # Extract appropriate split
-        if isinstance(dataset, datasets.DatasetDict):
-            if "train" in dataset:
-                return dataset["train"]
-            first_split = next(iter(dataset))
-            logger.info(f"Using '{first_split}' split from newly created dataset")
-            return dataset[first_split]
-        else:
-            return dataset
-
-    except Exception as creation_error:
-        logger.error(f"Dataset creation failed: {creation_error}")
-        raise ValueError(f"Failed to create dataset: {creation_error}") from creation_error
-
-    except (ConnectionError, TimeoutError) as network_error:
-        logger.error(f"Network error accessing dataset: {network_error}")
-        raise ConnectionError(f"Network failure accessing HuggingFace Hub: {network_error}") from network_error
-
-    except Exception as general_error:
-        logger.error(f"Unexpected error accessing dataset: {general_error}")
-        raise ValueError(f"Dataset access failed: {general_error}") from general_error

-def
     """
-
     Args:
-
     Returns:
-        List of
     """
-    # API key fetch - no validation needed as it's centralized at startup
-    api_key = os.getenv("SHODAN_API_KEY")
-    shodan_query = os.getenv("SHODAN_QUERY", "product:Ollama port:11434")
-
-    api = shodan.Shodan(api_key)
-
-    try:
-        logger.info(f"Executing Shodan search_cursor with query: {shodan_query}")
-
-        # Use search_cursor to handle pagination automatically
-        cursor = api.search_cursor(shodan_query)
-
-        # Initialize scan metrics
-        instances = []
-        processed = 0
-        batch_size = 100  # Process results in batches for progress updates
-
-        progress(0, desc="Initializing Shodan data retrieval")
-
-        # Process all results from the cursor
-        results_batch = []
-        for result in cursor:
-            results_batch.append(result)
-            processed += 1
-
-            # Process in batches for efficiency
-            if len(results_batch) >= batch_size:
-                progress(min(1.0, processed / (processed + 100)), desc=f"Retrieved {processed} Ollama instances")
-
-                # Extract instance data from batch
-                for result in results_batch:
-                    instances.append({
-                        'ip': result.get('ip_str'),
-                        'port': result.get('port', 11434),
-                        'country': result.get('location', {}).get('country_name'),
-                        'region': result.get('location', {}).get('region_name'),
-                        'org': result.get('org'),
-                        'models': []
-                    })
-                results_batch = []
-
-        # Process any remaining results
-        if results_batch:
-            for result in results_batch:
-                instances.append({
-                    'ip': result.get('ip_str'),
-                    'port': result.get('port', 11434),
-                    'country': result.get('location', {}).get('country_name'),
-                    'region': result.get('location', {}).get('region_name'),
-                    'org': result.get('org'),
-                    'models': []
-                })
-
-        logger.info(f"Completed Shodan scan, retrieved {len(instances)} Ollama instances")
-        return instances
-
-    except shodan.APIError as e:
-        error_msg = str(e)
-        if "Invalid API key" in error_msg:
-            logger.error("Shodan authentication failed: Invalid API key")
-            raise ValueError("Invalid Shodan API key. Please check your SHODAN_API_KEY environment variable.")
-        elif "Request rate limit reached" in error_msg:
-            logger.error(f"Shodan rate limit exceeded: {e}")
-            raise ValueError("Shodan API rate limit exceeded. Please wait before trying again.")
-        else:
-            logger.error(f"Shodan API error: {e}")
-            raise
-    except Exception as e:
-        logger.error(f"Unhandled exception during Shodan scan: {e}")
-        raise
-
-async def check_single_endpoint(session, instance):
-    """Check a single Ollama endpoint for available models."""
-    ip = instance['ip']
-    port = instance['port']
     url = f"http://{ip}:{port}/api/tags"

     try:
-
     except asyncio.TimeoutError:
-        logger.warning(f"
-        return instance
     except Exception as e:
-        logger.
-

-
     """
-
     Args:
-
-        progress: Gradio progress bar

     Returns:
-
     """
-    if
-
-
-    progress(0, desc=f"Preparing to check {total_instances} Ollama endpoints")

-
-    conn = aiohttp.TCPConnector(limit=50, ttl_dns_cache=300)
-    timeout = aiohttp.ClientTimeout(total=30, connect=5, sock_connect=5, sock_read=20)

-
-    completed = 0
-
-    for future in asyncio.as_completed(tasks):
-        try:
-            # Process completed task
-            instance = await future
-            updated_instances.append(instance)
-
-            # Update progress with meaningful metrics
-            completed += 1
-            progress_pct = completed / total_instances
-            progress(progress_pct, desc=f"Checked {completed}/{total_instances} endpoints ({progress_pct:.1%})")
-
-            # Log models found
-            if instance.get('models'):
-                logger.info(f"Found {len(instance['models'])} models at {instance['ip']}:{instance['port']}")
-
-        except Exception as task_error:
-            # Handle per-task errors without stopping the process
-            logger.warning(f"Endpoint check failed: {task_error}")
-            # Continue processing remaining endpoints
-
-    valid_instances = [i for i in updated_instances if i.get('models')]
-    logger.info(f"Endpoint validation complete: {len(valid_instances)}/{total_instances} accessible")
-    return updated_instances

-
     """
-
-
-    Implements single-pass dataset updates with:
-    1. Optimized in-memory index of existing entries
-    2. Differential detection of new vs. modified instances
-    3. Single hub push with consolidated changes

     Args:
-
-        instances: List of Ollama instances with model information

     Returns:
-
     """
-
-    start_time = time.time()
-
-    # Optimization: Create indexed lookup of existing instances for O(1) access
-    dataset_dict = {}
-    for idx, item in enumerate(dataset):
-        key = f"{item['ip']}:{item['port']}"
-        dataset_dict[key] = {
-            'idx': idx,
-            'data': item
-        }
-
-    # Track modification metrics
-    stats = {
-        'new': 0,
-        'updated': 0,
-        'unchanged': 0,
-        'models_added': 0
-    }
-
-    # Process differentials
-    update_candidates = []
-    new_instances = []
-
-    for instance in instances:
-        # Skip instances without valid IP
-        if not instance.get('ip'):
-            continue
-
-        instance_key = f"{instance['ip']}:{instance['port']}"

-
-            existing = dataset_dict[instance_key]['data']
-            needs_update = False
-
-            # Check metadata changes
-            for field in ['country', 'region', 'org']:
-                if instance.get(field) and instance.get(field) != existing.get(field):
-                    needs_update = True
-
-            # Check model changes - only update if models were found
-            if instance.get('models'):
-                # Compare model signatures to detect changes
-                existing_models = {model.get('name', ''): model for model in existing.get('models', [])}
-                new_models = {model.get('name', ''): model for model in instance.get('models', [])}
-
-                if set(new_models.keys()) != set(existing_models.keys()):
-                    needs_update = True
-                    stats['models_added'] += len(set(new_models.keys()) - set(existing_models.keys()))
-
-            if needs_update:
-                # Create updated instance
-                updated = dict(existing)
-                updated.update({
-                    'country': instance.get('country', existing.get('country')),
-                    'region': instance.get('region', existing.get('region')),
-                    'org': instance.get('org', existing.get('org')),
-                })
-
-                # Only update models if they were found
-                if instance.get('models'):
-                    updated['models'] = instance['models']
-
-                update_candidates.append(updated)
-                stats['updated'] += 1
-            else:
-                stats['unchanged'] += 1
-        else:
-            # New instance
-            new_instances.append(instance)
-            stats['new'] += 1
-
-    # Efficiently construct updated dataset
-    if new_instances or update_candidates:
-        # Start with current dataset
-        current_data = dataset.to_list()

-        #
-
-            idx = dataset_dict[instance_key]['idx']
-            current_data[idx] = updated

-
-        #
-
-        updated_dataset.push_to_hub("latterworks/llama_checker_results", token=hf_token)

-
-        logger.info(f"Dataset synchronization complete in {execution_time:.2f}s: {stats['new']} new, {stats['updated']} updated, {stats['unchanged']} unchanged, {stats['models_added']} new models")

-

-def get_unique_values(dataset):
     """
-    Get unique values for

     Args:
-        dataset:

     Returns:
-
     """
-
-    families = set()
-    parameter_sizes = set()
-
-    # Extract unique values from models
-    for instance in dataset:
-        for model in instance.get('models', []):
-            details = model.get('details', {})
-
-            # Handle both direct details in the model and nested details
-            if isinstance(details, dict):
-                family = details.get('family')
-                parameter_size = details.get('parameter_size')
-            else:
-                family = model.get('family')
-                parameter_size = model.get('parameter_size')
-
-            if family:
-                families.add(family)
-
-            if parameter_size:
-                parameter_sizes.add(parameter_size)

-

-def search_models(dataset, family=
     """
     Search for models in the dataset based on filters.

     Args:
-        dataset:
-
-        is_admin: Whether to include IP and port information

     Returns:
-        List
     """
-

-                model_size_bytes = model.get('size', 0)
-                model_size_gb = model_size_bytes / (1024 * 1024 * 1024) if model_size_bytes else 0

             # Apply filters
-            if
                 continue
-
-            if parameter_size and model_param_size != parameter_size:
                 continue
-
-            if name_search and name_search.lower() not in model_name.lower():
                 continue

-            #
-
-                'country': country,
-                'region': region,
-                'org': org,
             }

-            #
-

-            #
             if is_admin:
-
-

-    return

-def create_interface():
     """
-    Create

     Returns:
-        gr.Blocks:
     """
-    #
-    def validate_admin():
-        """Check if current user has admin privileges based on API key"""
-        # For production systems, this would use proper authentication
-        # Currently using API key presence as simple auth mechanism
-        admin_key = os.getenv("ADMIN_KEY", "")
-        shodan_key = os.getenv("SHODAN_API_KEY", "")
-        return bool(admin_key and shodan_key)
-
     try:
-        # Initialize critical data structures once at startup
-        logger.info("Initializing application data layer")
         dataset = load_or_create_dataset()

-
-        is_admin = validate_admin()
-        admin_status = "enabled" if is_admin else "disabled"
-        logger.info(f"Administrative access: {admin_status}")
-
-        # Create interface with optimized structure
-        with gr.Blocks(
-            title="Ollama Instance Scanner",
-            theme=gr.themes.Soft(),
-            css=".footer {text-align: center; margin-top: 20px; color: #666;}"
-        ) as interface:
-            # Header section
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown("# Ollama Instance Scanner")
-                    gr.Markdown("Browse publicly accessible Ollama models and their capabilities")
-
-            # Tab container
-            with gr.Tabs() as tabs:
-                # Tab 1: Model Browser (Public)
-                with gr.TabItem("Browse Models"):
-                    with gr.Row():
-                        # Filter controls
-                        with gr.Column(scale=1):
-                            with gr.Box():
-                                gr.Markdown("### Search Filters")
-                                family_dropdown = gr.Dropdown(
-                                    choices=["All"] + unique_values['families'],
-                                    value="All",
-                                    label="Model Family",
-                                    interactive=True
-                                )
-                                parameter_size_dropdown = gr.Dropdown(
-                                    choices=["All"] + unique_values['parameter_sizes'],
-                                    value="All",
-                                    label="Parameter Size",
-                                    interactive=True
-                                )
-                                name_search = gr.Textbox(
-                                    label="Model Name",
-                                    placeholder="Enter model name...",
-                                    interactive=True
-                                )
-                                search_button = gr.Button("Search Models", variant="primary")
-
-                    # Results section
-                    with gr.Row():
-                        # Model results table
-                        results_table = gr.DataFrame(
-                            value=initial_results,
-                            headers=["name", "family", "parameter_size", "quantization_level", "size_gb", "country", "region", "org"],
-                            label="Available Models",
-                            interactive=False,
-                            wrap=True
                         )
-
-                        model_details = gr.JSON(
-                            label="Model Specifications",
-                            visible=True
                         )

-
-                    # Scanner controls
-                    with gr.Row():
-                        shodan_scan_button = gr.Button(
-                            "Start Shodan Scan",
-                            variant="primary",
-                            interactive=is_admin
-                        )
-
-                    # Status display
-                    with gr.Row():
-                        scan_status = gr.Textbox(
-                            label="Scan Status",
-                            value="Ready to scan" if is_admin else "Admin access required",
-                            interactive=False
-                        )
-
-            # Footer
-            gr.Markdown(
-                "### Ollama Instance Scanner | Powered by Shodan & Hugging Face",
-                elem_classes=["footer"]
-            )
-
-            # Define optimized event handlers
-            def on_search_click(family, parameter_size, name_search):
-                """Process model search with optimized filtering"""
-                try:
-                    # Apply filters
-                    family_filter = None if family == "All" else family
-                    param_size_filter = None if parameter_size == "All" else parameter_size
-                    name_filter = None if not name_search else name_search.strip()
-
-                    # Execute search with admin privileges if available
-                    results = search_models(
-                        dataset,
-                        family_filter,
-                        param_size_filter,
-                        name_filter,
-                        is_admin
                     )
-
-                    logger.info(f"Search completed: {len(results)} models found matching criteria")
-                    return results
-                except Exception as search_error:
-                    logger.error(f"Search failed: {search_error}")
-                    # Return empty results on error
-                    return []
-
-            def on_table_select(evt: gr.SelectData, results):
-                """Handle table row selection with error protection"""
-                try:
-                    if evt and evt.index and len(results) > evt.index[0]:
-                        selected_row = results[evt.index[0]]
-                        # Extract and return model details
-                        return selected_row.get('full_model_info', "{}")
-                    return "{}"
-                except Exception as selection_error:
-                    logger.error(f"Selection error: {selection_error}")
-                    return "{}"
-
-            async def run_shodan_scan():
-                """Execute Shodan scan workflow with comprehensive monitoring"""
-                if not is_admin:
-                    return "Error: Administrative access required"

-

-
-                    f"• {instance_count} total instances discovered\n"
-                    f"• {accessible_count} instances with accessible models\n"
-                    f"• {len(unique_values['families'])} unique model families\n"
-                    f"• {len(unique_values['parameter_sizes'])} parameter size variants"
                 )
-
-                logger.info(f"Scan {scan_id} completed successfully")
-                return report
-
-            except Exception as scan_error:
-                logger.error(f"Scan {scan_id} failed: {scan_error}")
-
-                # Generate actionable error message
-                if isinstance(scan_error, ValueError) and "API key" in str(scan_error):
-                    return "Error: Invalid Shodan API key. Please check your SHODAN_API_KEY environment variable."
-                elif isinstance(scan_error, ConnectionError):
-                    return "Error: Network connectivity issue. Please check your internet connection."
-                else:
-                    return f"Error: Scan operation failed - {str(scan_error)}"
-
-            # Connect event handlers to UI components
-            search_button.click(
-                fn=on_search_click,
-                inputs=[family_dropdown, parameter_size_dropdown, name_search],
-                outputs=[results_table]
-            )

-

-            shodan_scan_button.click(
-                fn=run_shodan_scan,
-                inputs=[],
-                outputs=[scan_status]
-            )

-    except Exception as interface_error:
-        logger.critical(f"Interface initialization failed: {interface_error}")
-        raise ValueError(f"Failed to create application interface: {interface_error}") from interface_error
-
-def validate_env_variables():
-    """
-    Centralized validation of critical environment variables with precise error messaging.
-
-    Raises:
-        ValueError: When any required environment variable is missing
-    """
-    required_vars = ["SHODAN_API_KEY", "HF_TOKEN"]
-    missing_vars = [var for var in required_vars if not os.getenv(var)]
-
-    if missing_vars:
-        error_msg = f"Missing critical environment variables: {', '.join(missing_vars)}"
-        logger.critical(error_msg)
-        raise ValueError(error_msg)
-
-    # Validate token quality
-    hf_token = os.getenv("HF_TOKEN")
-    if len(hf_token) < 8:  # Minimum length for plausible token
-        logger.warning("HF_TOKEN appears malformed (insufficient length)")

-
-def main():
-    """
-    Application entry point with centralized error handling and environment validation.
-    """
-    try:
-        # Validate environment once at startup
-        validate_env_variables()
-
-        # Initialize and launch interface
-        logger.info("Initializing Gradio interface")
-        interface = create_interface()
-
-        if interface:
-            logger.info("Starting Gradio server")
-            interface.launch()
-        else:
-            logger.critical("Interface initialization failed")
-            sys.exit(1)
-
-    except ValueError as config_error:
-        # Handle configuration errors
-        logger.critical(f"Configuration error: {config_error}")
-        sys.exit(1)
-
-    except Exception as fatal_error:
-        # Handle unexpected errors
-        logger.critical(f"Fatal application error: {fatal_error}")
-        sys.exit(1)

 if __name__ == "__main__":
-    main()
+"""
+Ollama Instance & Model Scanner for Hugging Face Space
+
+This application scans for publicly accessible Ollama instances, retrieves model information,
+and provides a secure interface for browsing discovered models.
+
+Security Architecture:
+- Server-side authorization based on environment variables
+- Strict input sanitization
+- Comprehensive error handling
+- Asynchronous endpoint checking
+- Efficient dataset management
+"""
+
 import os
+import re
+import json
+import asyncio
 import logging
+import gradio as gr
 import shodan
 import aiohttp
+from datasets import load_dataset, Dataset
+from typing import Dict, List, Optional, Any, Tuple, Union
+from datetime import datetime
+from functools import wraps

+# Configure logging
 logging.basicConfig(
     level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler()]
 )
 logger = logging.getLogger(__name__)
+# Security layer - Authorization functions
+
+def authorization_required(func):
     """
+    Decorator that enforces server-side authorization for protected functions.
+    Authorization is determined by environment variables, not client parameters.
+
+    Args:
+        func: The function to protect with authorization

     Returns:
+        A wrapped function that performs the authorization check
+    """
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if not verify_admin_authorization():
+            logger.warning(f"Unauthorized access attempt to {func.__name__}")
+            return {"error": "Unauthorized access"} if kwargs.get("return_error", False) else None
+        return func(*args, **kwargs)
+    return wrapper
+
+def verify_admin_authorization() -> bool:
+    """
+    Perform server-side verification of admin authorization.
+    Authorization is based on environment variables, not client data.
+
+    Returns:
+        bool: True if valid admin credentials exist
+    """
+    try:
+        # Check for the existence of the Shodan API key and HF token
+        api_key = os.getenv("SHODAN_API_KEY")
+        hf_token = os.getenv("HF_TOKEN")
+
+        return (api_key is not None and
+                len(api_key.strip()) > 10 and
+                hf_token is not None and
+                len(hf_token.strip()) > 10)
+    except Exception as e:
+        logger.error(f"Error verifying admin authorization: {str(e)}")
+        return False
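A quick illustration of how the decorator behaves; the function name below is hypothetical and not part of app.py, and the sketch assumes no valid SHODAN_API_KEY/HF_TOKEN is set:

@authorization_required
def wipe_dataset(return_error: bool = False):
    # hypothetical protected operation
    return "dataset wiped"

wipe_dataset()                   # -> None: the call is silently refused
wipe_dataset(return_error=True)  # -> {'error': 'Unauthorized access'}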
+# Security layer - Input validation
+
+def sanitize_input(input_string: str) -> str:
+    """
+    Sanitize user input to prevent injection attacks.
+
+    Args:
+        input_string: User input string to sanitize
+
+    Returns:
+        str: Sanitized string
+    """
+    if not isinstance(input_string, str):
+        return ""
+
+    # Remove potentially harmful characters
+    sanitized = re.sub(r'[^\w\s\-\.]', '', input_string)
+    # Limit length to prevent DoS
+    return sanitized[:100]
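For reference, the sanitizer keeps only word characters, whitespace, hyphens, and dots, and truncates to 100 characters, so the expected behavior is:

sanitize_input("llama3.1 <script>alert(1)</script>")  # -> 'llama3.1 scriptalert1script'
sanitize_input(12345)                                  # -> '' (non-strings are rejected)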
+def get_env_variables() -> Dict[str, str]:
+    """
+    Get all required environment variables.
+
+    Returns:
+        Dict[str, str]: Dictionary containing environment variables

     Raises:
+        ValueError: If any required environment variable is missing
     """
+    env_vars = {
+        "SHODAN_API_KEY": os.getenv("SHODAN_API_KEY"),
+        "SHODAN_QUERY": os.getenv("SHODAN_QUERY", "product:Ollama port:11434"),
+        "HF_TOKEN": os.getenv("HF_TOKEN")
+    }
+
+    missing_vars = [name for name, value in env_vars.items() if not value]
+    if missing_vars:
+        error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
+        logger.error(error_msg)
+        raise ValueError(error_msg)

+    return env_vars
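Because get_env_variables() raises rather than returning partial config, callers fail fast; a minimal sketch of that behavior (assuming HF_TOKEN is unset):

import os
os.environ.pop("HF_TOKEN", None)  # simulate the missing secret
try:
    get_env_variables()
except ValueError as e:
    print(e)  # Missing required environment variables: HF_TOKEN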
+# Data access layer
+
+def load_or_create_dataset() -> Dataset:
+    """
+    Load the dataset from Hugging Face Hub or create it if it doesn't exist.
+
+    Returns:
+        Dataset: Loaded or created dataset
+
+    Raises:
+        Exception: If dataset loading or creation fails
+    """
     try:
+        # Attempt to get environment variables - this will raise ValueError if missing
+        env_vars = get_env_variables()

+        logger.info("Attempting to load dataset from Hugging Face Hub")
+        dataset = load_dataset("latterworks/llama_checker_results", use_auth_token=env_vars["HF_TOKEN"])
+        dataset = dataset['train']
+        logger.info(f"Successfully loaded dataset with {len(dataset)} entries")
+        return dataset
+    except ValueError:
+        # Re-raise environment variable errors
+        raise
     except FileNotFoundError:
+        # Only create dataset if admin authorization is verified
+        if not verify_admin_authorization():
+            logger.error("Unauthorized attempt to create dataset")
+            raise ValueError("Unauthorized: Only admins can create the dataset")
+
+        logger.info("Dataset not found, creating a new one")
+        env_vars = get_env_variables()
+        dataset = Dataset.from_dict({
             "ip": [],
             "port": [],
             "country": [],
+            "region": [],
             "org": [],
             "models": []
         })
+        dataset.push_to_hub("latterworks/llama_checker_results", token=env_vars["HF_TOKEN"])
+        logger.info("Created and pushed empty dataset to Hugging Face Hub")

+        # Reload the dataset to ensure consistency
+        dataset = load_dataset("latterworks/llama_checker_results", use_auth_token=env_vars["HF_TOKEN"])['train']
+        return dataset
+    except Exception as e:
+        error_msg = f"Failed to load or create dataset: {str(e)}"
+        logger.error(error_msg)
+        raise
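The Hub round trip above can be exercised on its own; a minimal sketch assuming a valid write token (newer releases of datasets prefer token= over the deprecated use_auth_token=):

from datasets import Dataset, load_dataset

empty = Dataset.from_dict({"ip": [], "port": [], "country": [], "region": [], "org": [], "models": []})
empty.push_to_hub("latterworks/llama_checker_results", token="hf_xxx")  # placeholder token
ds = load_dataset("latterworks/llama_checker_results", split="train")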
+async def check_single_endpoint(ip: str, port: int, timeout: int = 5) -> Optional[List[Dict[str, Any]]]:
     """
+    Check a single Ollama endpoint for available models.

     Args:
+        ip: IP address of the Ollama instance
+        port: Port number of the Ollama instance
+        timeout: Timeout in seconds for the HTTP request

     Returns:
+        Optional[List[Dict[str, Any]]]: List of model information dictionaries, or None if the endpoint check fails
     """
     url = f"http://{ip}:{port}/api/tags"

     try:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, timeout=timeout) as response:
+                if response.status == 200:
+                    data = await response.json()
+                    if "models" in data and isinstance(data["models"], list):
+                        logger.info(f"Successfully retrieved {len(data['models'])} models from {ip}:{port}")
+                        return data["models"]
+                    else:
+                        logger.warning(f"Unexpected response format from {ip}:{port}")
+                else:
+                    logger.warning(f"Received status code {response.status} from {ip}:{port}")
+    except aiohttp.ClientError as e:
+        logger.warning(f"Connection error for {ip}:{port}: {str(e)}")
     except asyncio.TimeoutError:
+        logger.warning(f"Connection timeout for {ip}:{port}")
     except Exception as e:
+        logger.warning(f"Unexpected error checking {ip}:{port}: {str(e)}")
+
+    return None
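Since check_single_endpoint is a plain coroutine, it can be smoke-tested in isolation; a sketch assuming a reachable local Ollama instance:

import asyncio

models = asyncio.run(check_single_endpoint("127.0.0.1", 11434))
if models:
    print([m.get("name") for m in models])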
+@authorization_required
+async def check_ollama_endpoints(dataset: Dataset, progress: Optional[gr.Progress] = None) -> Dataset:
     """
+    Check all Ollama endpoints in the dataset for available models.
+    Requires admin authorization.

     Args:
+        dataset: Dataset containing Ollama endpoints
+        progress: Optional Gradio progress bar

     Returns:
+        Dataset: Updated dataset with model information
     """
+    if progress:
+        progress(0, desc="Preparing to check endpoints...")
+
+    # Build a list of tasks to execute
+    total_endpoints = len(dataset)
+    tasks = []
+
+    for item in dataset:
+        tasks.append(check_single_endpoint(item["ip"], item["port"]))
+
+    # Execute tasks in batches to avoid overwhelming resources,
+    # writing results back into a mutable copy of the dataset rows
+    batch_size = 10
+    rows = dataset.to_list()
+
+    for i in range(0, len(tasks), batch_size):
+        if progress:
+            progress(i / len(tasks), desc=f"Checking endpoints {i+1}-{min(i+batch_size, len(tasks))} of {len(tasks)}...")
+
+        batch_results = await asyncio.gather(*tasks[i:i+batch_size])
+
+        for j, result in enumerate(batch_results):
+            idx = i + j
+            if idx < len(rows) and result:
+                rows[idx]["models"] = result
+
+    updated_dataset = Dataset.from_list(rows) if rows else dataset

+    if progress:
+        progress(1.0, desc="Endpoint checking complete!")

+    logger.info(f"Checked {total_endpoints} endpoints, found models on {sum(1 for item in updated_dataset if item['models'])} endpoints")

+    # Push updated dataset to Hugging Face Hub
+    env_vars = get_env_variables()
+    updated_dataset.push_to_hub("latterworks/llama_checker_results", token=env_vars["HF_TOKEN"])
+    logger.info("Successfully pushed updated dataset to Hugging Face Hub")
+
+    return updated_dataset
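The slicing loop above is a generic way to bound concurrency without extra machinery; the same pattern in standalone form:

import asyncio

async def run_in_batches(coros, batch_size=10):
    # At most batch_size coroutines are in flight at any one time
    results = []
    for i in range(0, len(coros), batch_size):
        results.extend(await asyncio.gather(*coros[i:i + batch_size]))
    return results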
+@authorization_required
+def scan_shodan(progress: Optional[gr.Progress] = None) -> str:
     """
+    Scan Shodan for Ollama instances and update the dataset.
+    Requires admin authorization.

     Args:
+        progress: Optional Gradio progress bar

     Returns:
+        str: Status message
     """
+    try:
+        # Get environment variables
+        env_vars = get_env_variables()

+        # Load dataset
+        dataset = load_or_create_dataset()

+        # Initialize Shodan API client
+        api = shodan.Shodan(env_vars["SHODAN_API_KEY"])
+        query = env_vars["SHODAN_QUERY"]

+        if progress:
+            progress(0, desc="Starting Shodan search...")

+        # Get total results count
+        count_result = api.count(query)
+        total_results = count_result.get('total', 0)

+        if total_results == 0:
+            return "No Ollama instances found on Shodan."

+        logger.info(f"Found {total_results} potential Ollama instances on Shodan")

+        # Search Shodan
+        new_instances = []
+        results_processed = 0
+
+        for result in api.search_cursor(query):
+            results_processed += 1
+
+            if progress:
+                progress(results_processed / total_results,
+                         desc=f"Processing Shodan result {results_processed}/{total_results}")
+
+            ip = result.get('ip_str')
+            port = result.get('port', 11434)
+
+            # Skip if instance already exists in dataset
+            if any(item["ip"] == ip and item["port"] == port for item in dataset):
+                continue
+
+            # Extract location information
+            country = result.get('location', {}).get('country_name', '')
+            region = result.get('location', {}).get('region_name', '')
+            org = result.get('org', '')
+
+            new_instances.append({
+                "ip": ip,
+                "port": port,
+                "country": country,
+                "region": region,
+                "org": org,
+                "models": []
+            })
+
+        if progress:
+            progress(1.0, desc="Shodan search complete!")
+
+        # Add new instances to dataset (add_item returns a new Dataset)
+        updated_dataset = dataset
+        for instance in new_instances:
+            updated_dataset = updated_dataset.add_item(instance)
+
+        logger.info(f"Added {len(new_instances)} new instances to dataset")
+
+        # Check Ollama endpoints asynchronously
+        if new_instances:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+            updated_dataset = loop.run_until_complete(check_ollama_endpoints(updated_dataset, progress))
+            loop.close()
+
+        status_message = f"Scan complete! Found {len(new_instances)} new Ollama instances."
+        return status_message
+
+    except shodan.APIError as e:
+        error_msg = f"Shodan API error: {str(e)}"
+        logger.error(error_msg)
+        return error_msg
+    except Exception as e:
+        error_msg = f"Error during Shodan scan: {str(e)}"
+        logger.error(error_msg)
+        return error_msg
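search_cursor pages through results transparently, so no page arithmetic is needed; a minimal sketch with a placeholder key:

import shodan

api = shodan.Shodan("YOUR_SHODAN_API_KEY")  # placeholder key
print(api.count("product:Ollama port:11434").get("total", 0))
for i, banner in enumerate(api.search_cursor("product:Ollama port:11434")):
    print(banner.get("ip_str"), banner.get("port"))
    if i >= 4:  # stop after a few results in a demo
        break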
+def get_unique_values(dataset: Dataset, field: str) -> List[str]:
     """
+    Get unique values for a specific field in the dataset.

     Args:
+        dataset: Dataset to extract values from
+        field: Field name to extract values from

     Returns:
+        List[str]: List of unique values
     """
+    unique_values = set()

+    if field in ("family", "parameter_size", "quantization_level"):
+        for item in dataset:
+            models = item.get("models", [])
+            if not models:
+                continue
+
+            for model in models:
+                details = model.get("details", {})
+                if details and field in details:
+                    value = details.get(field)
+                    if value:
+                        unique_values.add(value)
+
+    return sorted(list(unique_values))
+def search_models(dataset: Dataset, name_search: str = "", family: str = "", parameter_size: str = "") -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
     """
     Search for models in the dataset based on filters.
+    Authorization is determined server-side.

     Args:
+        dataset: Dataset to search
+        name_search: Model name search string
+        family: Model family filter
+        parameter_size: Parameter size filter

     Returns:
+        Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: Filtered model list and detailed model list
     """
+    # Server-side authorization check
+    is_admin = verify_admin_authorization()
+
+    name_search = sanitize_input(name_search).lower()
+    family = sanitize_input(family)
+    parameter_size = sanitize_input(parameter_size)
+
+    filtered_models = []
+    detailed_models = []
+
+    for item in dataset:
+        models = item.get("models", [])
+        if not models:
+            continue

+        ip = item.get("ip", "")
+        port = item.get("port", 0)
+        country = item.get("country", "")
+        region = item.get("region", "")
+        org = item.get("org", "")
+
+        for model in models:
+            model_name = model.get("name", "").lower()
+            details = model.get("details", {})
+            model_family = details.get("family", "")
+            model_parameter_size = details.get("parameter_size", "")
+            model_quantization = details.get("quantization_level", "")
+            model_size = model.get("size", 0)
+            model_size_gb = round(model_size / (1024**3), 2) if model_size else 0

             # Apply filters
+            if name_search and name_search not in model_name:
                 continue
+            if family and family != model_family:
                 continue
+            if parameter_size and parameter_size != model_parameter_size:
                 continue

+            # Prepare filtered model entry
+            filtered_model = {
+                "name": model.get("name", ""),
+                "family": model_family,
+                "parameter_size": model_parameter_size,
+                "quantization_level": model_quantization,
+                "size_gb": model_size_gb
             }

+            # Add IP and port information only for admins - server-side check
+            if is_admin:
+                filtered_model["ip"] = ip
+                filtered_model["port"] = port
+
+            filtered_models.append(filtered_model)

+            # Prepare detailed model entry
+            detailed_model = {
+                "name": model.get("name", ""),
+                "family": model_family,
+                "parameter_size": model_parameter_size,
+                "quantization_level": model_quantization,
+                "size_gb": model_size_gb,
+                "digest": model.get("digest", ""),
+                "modified_at": model.get("modified_at", ""),
+                "country": country,
+                "region": region,
+                "org": org
+            }
+
+            # Add IP and port information only for admins - server-side check
             if is_admin:
+                detailed_model["ip"] = ip
+                detailed_model["port"] = port

+            detailed_models.append(detailed_model)

+    return filtered_models, detailed_models
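A quick usage sketch for the search function; admin-only fields (ip, port) appear only when the server-side check passes:

results, details = search_models(dataset, name_search="llama")
for row in results:
    print(row["name"], row["parameter_size"], row["size_gb"])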
+def create_ui() -> gr.Blocks:
     """
+    Create the Gradio user interface with server-side authorization.

     Returns:
+        gr.Blocks: Gradio interface
     """
+    # Load dataset
     try:
         dataset = load_or_create_dataset()
+    except Exception as e:
+        # Fall back to an empty dataset if loading fails
+        logger.error(f"Failed to load dataset: {str(e)}")
+        dataset = Dataset.from_dict({
+            "ip": [],
+            "port": [],
+            "country": [],
+            "region": [],
+            "org": [],
+            "models": []
+        })
+
+    # Server-side authorization check
+    is_admin = verify_admin_authorization()
+
+    # Get unique values for dropdowns
+    families = [""] + get_unique_values(dataset, "family")
+    parameter_sizes = [""] + get_unique_values(dataset, "parameter_size")
+
+    # Initial search results
+    initial_results, initial_details = search_models(dataset)
+
+    with gr.Blocks(title="Ollama Instance & Model Browser") as app:
+        gr.Markdown("# Ollama Instance & Model Browser")

+        with gr.Tabs() as tabs:
+            with gr.Tab("Browse Models"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        name_search = gr.Textbox(label="Model Name Search")
+                        family_dropdown = gr.Dropdown(
+                            choices=families,
+                            label="Model Family",
+                            value=""
                         )
+                        parameter_size_dropdown = gr.Dropdown(
+                            choices=parameter_sizes,
+                            label="Parameter Size",
+                            value=""
                         )
+                        search_button = gr.Button("Search Models")

+                with gr.Row():
+                    model_results = gr.DataFrame(
+                        value=initial_results,
+                        label="Model Results",
+                        interactive=False
                     )

+                with gr.Row():
+                    model_details = gr.JSON(label="Model Details")

+                def search_callback(name, family, parameter_size):
+                    results, details = search_models(dataset, name, family, parameter_size)
+                    return results, None
+
+                def select_model(evt: gr.SelectData, name, family, parameter_size):
+                    # Re-run the search with the current filter values so the
+                    # selected row index lines up with the detailed list
+                    results, details = search_models(dataset, name, family, parameter_size)
+                    if evt.index[0] < len(details):
+                        return details[evt.index[0]]
+                    return None
+
+                search_button.click(
+                    search_callback,
+                    inputs=[name_search, family_dropdown, parameter_size_dropdown],
+                    outputs=[model_results, model_details]
+                )
+
+                model_results.select(
+                    select_model,
+                    inputs=[name_search, family_dropdown, parameter_size_dropdown],
+                    outputs=model_details
+                )
+
+            # Only show Shodan Scan tab for admins - server-side check
+            if is_admin:
+                with gr.Tab("Shodan Scan"):
+                    gr.Markdown("## Scan for Ollama Instances")
+                    gr.Markdown("**Note:** This scan will update the dataset with new Ollama instances.")
+                    scan_button = gr.Button("Start Scan")
+                    scan_output = gr.Textbox(label="Scan Status")

+                    scan_button.click(
+                        lambda progress=gr.Progress(): scan_shodan(progress),
+                        outputs=scan_output
                     )

+        # Refresh dataset when the app starts
+        def refresh_data():
+            nonlocal dataset
+            try:
+                dataset = load_or_create_dataset()
+            except Exception as e:
+                logger.error(f"Failed to refresh dataset: {str(e)}")
+                # Continue with the existing dataset

+            results, details = search_models(dataset)
+            return results

+        app.load(
+            fn=refresh_data,
+            outputs=model_results
+        )

+    return app
+# Main entry point
 if __name__ == "__main__":
+    try:
+        ui = create_ui()
+        ui.launch()
+    except Exception as e:
+        logger.critical(f"Failed to start application: {str(e)}")