Spaces:

vertify
/

biomass-prediction-app

Running

App Files Files Community

pokkiri committed on May 19

Commit

a49479d

verified ·

1 Parent(s): 0d67e1c

Update feature_engineering.py

Browse files

Files changed (1) hide show

feature_engineering.py +377 -670

feature_engineering.py CHANGED Viewed

@@ -1,709 +1,416 @@
-def create_interface(self):
-        """Create Gradio interface with sample image thumbnails"""
-        # Generate thumbnails for sample images
-        sample_thumbnails = {}
-        for name, path in self.sample_images.items():
-            if os.path.exists(path):
-                thumbnail = self.create_thumbnail(path)
-                if thumbnail:
-                    sample_thumbnails[name] = Image.open(thumbnail)
-            else:
-                logger.warning(f"Sample image not found: {path}")
-        with gr.Blocks(title="Biomass Prediction Model") as interface:
-            gr.Markdown("# Above-Ground Biomass Prediction")
-            gr.Markdown("""
-            Upload a multi-band satellite image to predict above-ground biomass (AGB) across the landscape.
-            **Requirements:**
-            - Image must be a GeoTIFF with spectral bands
-            - For best results, image should contain at least 3 bands
-            """)
-            with gr.Row():
-                with gr.Column(scale=1):
-                    input_image = gr.File(
-                        label="Upload Satellite Image (GeoTIFF)",
-                        file_types=[".tif", ".tiff"]
-                    )
-                    # Sample images section
-                    gr.Markdown("### Sample Images")
-                    # Sample buttons container
-                    sample_buttons = []
-                    # First row - sample thumbnails side by side horizontally
-                    with gr.Row():
-                        for name, thumbnail in sample_thumbnails.items():
-                            with gr.Column():
-                                gr.Image(
-                                    value=thumbnail,
-                                    label=name.replace("input_", "Input ").replace("chip_", "Chip "),
-                                    show_download_button=False,
-                                    height=180
-                                )
-                    # Second row - buttons side by side horizontally, matching the thumbnails above
-                    with gr.Row():
-                        for name, _ in sample_thumbnails.items():
-                            with gr.Column():
-                                sample_btn = gr.Button(
-                                    f"Use {name.replace('input_', 'Input ').replace('chip_', 'Chip ')}",
-                                    variant="secondary",
-                                    size="lg"
-                                )
-                                sample_buttons.append((sample_btn, name))
-                    # Generate button at the bottom
-                    generate_btn = gr.Button("Generate Biomass Prediction", variant="primary", size="lg")
-                with gr.Column(scale=2):
-                    output_image = gr.Image(
-                        label="Biomass Prediction Map",
-                        type="pil"
-                    )
-                    output_stats = gr.Markdown(
-                        label="Statistics"
-                    )
-            with gr.Accordion("About", open=False):
-                gr.Markdown("""
-                ## About This Model
-                This biomass prediction model uses the StableResNet architecture to predict above-ground biomass from satellite imagery.
-                ### Model Details
-                - Architecture: StableResNet
-                - Input: Multi-spectral satellite imagery
-                - Output: Above-ground biomass (Mg/ha)
-                - Creator: vertify.earth
-                - Date: 2025-05-19
-                ### Improvements in This Version
-                - Added calibration factor to match full-tile inference values
-                - Improved chunk processing with overlap to reduce edge artifacts
-                - Enhanced feature calculation for better results
-                - Optimized visualization to show the full range of biomass values
-                """)
-            # Add a warning if model failed to load
-            if self.model is None:
-                gr.Warning("⚠️ Model failed to load. The app may not work correctly. Check logs for details.")
-            # Connect the process button
-            generate_btn.click(
-                fn=self.predict_biomass,
-                inputs=[input_image],
-                outputs=[output_image, output_stats]
-            )
-            # Connect the sample buttons
-            for button, name in sample_buttons:
-                button.click(
-                    fn=lambda path=self.sample_images[name]: self.predict_biomass(path),
-                    inputs=[],
-                    outputs=[output_image, output_stats]
-                )
-        return interface
-def launch_app():
-    """Launch the Gradio app"""
-    try:
-        # Create app instance
-        app = BiomassPredictorApp()
-        # Create interface
-        interface = app.create_interface()
-        # Launch interface
-        interface.launch()
-    except Exception as e:
-        logger.error(f"Error launching app: {e}")
-        logger.error(traceback.format_exc())
-if __name__ == "__main__":
-    launch_app()"""
-Biomass Prediction Gradio App with Two Sample Images and RGB Comparison
 Author: najahpokkiri
 Date: 2025-05-19
-Updated with sample image thumbnails and always-on RGB comparison.
 """
-import os
-import sys
-import torch
 import numpy as np
-import gradio as gr
-import joblib
-import tempfile
-import matplotlib.pyplot as plt
-import matplotlib.colors as colors
-from PIL import Image
-import io
 import logging
-from huggingface_hub import hf_hub_download
-import rasterio
 # Configure logger
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Import model architecture
-from model import StableResNet
-# Define a placeholder for feature engineering if not available
-def extract_all_features(image):
-    """
-    Extract all 99 features from satellite bands.
-    Placeholder function - in production, use the actual feature_engineering module.
-    """
-    # Get image dimensions
-    n_bands, height, width = image.shape
-    # Create a valid mask (non-NaN pixels)
-    valid_mask = np.all(np.isfinite(image), axis=0)
-    # Get valid pixel coordinates
-    valid_y, valid_x = np.where(valid_mask)
-    n_valid = len(valid_y)
-    # Create a feature matrix (placeholder)
-    # In a real scenario, these would be spectral indices, texture features, etc.
-    # For now, we'll just use the original bands and pad to 99 features
-    # Original bands for each valid pixel
-    feature_matrix = np.zeros((n_valid, 99), dtype=np.float32)
-    # Fill in the available band values
-    for i in range(n_valid):
-        y, x = valid_y[i], valid_x[i]
-        # Copy available bands
-        for b in range(min(n_bands, 99)):
-            feature_matrix[i, b] = image[b, y, x]
-    # Create feature names
-    generated_features = [f"Band_{i+1}" for i in range(99)]
-    return feature_matrix, valid_mask, generated_features
-class BiomassPredictorApp:
-    """Gradio app for biomass prediction from satellite imagery"""
-    def __init__(self, model_repo="pokkiri/biomass-model"):
-        """Initialize the app with model repository information"""
-        self.model = None
-        self.package = None
-        self.feature_names = []
-        self.model_repo = model_repo
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        # Sample image paths
-        self.sample_images = {
-            "input_chip_1": "input_chip_1.tif",
-            "input_chip_2": "input_chip_2.tif"
-        }
-        # Cache for storing temporary files
-        self.temp_files = []
-        # Load the model
-        self.load_model()
-    def load_model(self):
-        """Load the model and preprocessing pipeline"""
-        try:
-            logger.info(f"Loading model from {self.model_repo}")
-            # Download model files from HuggingFace or use local files
-            try:
-                model_path = hf_hub_download(repo_id=self.model_repo, filename="model.pt")
-                package_path = hf_hub_download(repo_id=self.model_repo, filename="model_package.pkl")
-            except Exception as e:
-                logger.warning(f"Failed to download from HuggingFace: {e}")
-                # Fallback to local files
-                model_path = "model.pt"
-                package_path = "model_package.pkl"
-            # Try to load package with metadata
-            try:
-                logger.info(f"Loading package from {package_path}")
-                self.package = joblib.load(package_path)
-                logger.info("Successfully loaded model package")
-                # Extract information from package
-                n_features = self.package['n_features']
-                self.feature_names = self.package.get('feature_names', [f"feature_{i}" for i in range(n_features)])
-                logger.info(f"Package keys: {list(self.package.keys())}")
-                logger.info(f"Model expects {n_features} features")
-            except Exception as e:
-                logger.error(f"Error loading package file: {e}")
-                # Fallback to default values
-                n_features = 99  # We know there are 99 features
-                self.feature_names = [f"feature_{i}" for i in range(n_features)]
-                # Create a minimal package with essential components
-                self.package = {
-                    'n_features': n_features,
-                    'use_log_transform': True,
-                    'epsilon': 1.0,
-                    'scaler': None  # Will handle the None case in prediction
-                }
-            # Initialize model
-            self.model = StableResNet(n_features=n_features)
-            self.model.load_state_dict(torch.load(model_path, map_location=self.device))
-            self.model.to(self.device)
-            self.model.eval()
-            logger.info(f"Model loaded successfully")
-            logger.info(f"Number of features: {n_features}")
-            logger.info(f"Using device: {self.device}")
-            return True
-        except Exception as e:
-            logger.error(f"Error loading model: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-            return False
-    def cleanup(self):
-        """Clean up temporary files"""
-        for tmp_path in self.temp_files:
-            try:
-                if os.path.exists(tmp_path):
-                    os.unlink(tmp_path)
-            except Exception as e:
-                logger.warning(f"Failed to remove temporary file {tmp_path}: {e}")
-        self.temp_files = []
-    def create_thumbnail(self, image_path, max_size=(200, 200), output_format="PNG"):
-        """Create a thumbnail image from a GeoTIFF"""
-        try:
-            if not os.path.exists(image_path):
-                logger.warning(f"Image file not found: {image_path}")
-                return None
-            # Open the GeoTIFF
-            with rasterio.open(image_path) as src:
-                # Read data with RGB bands if available
-                if src.count >= 3:
-                    # Use first three bands as RGB
-                    rgb_data = src.read([1, 2, 3])
-                    # Transpose from (bands, height, width) to (height, width, bands)
-                    rgb_data = np.transpose(rgb_data, (1, 2, 0))
-                    # Normalize to 0-255 range
-                    rgb_data = np.clip(rgb_data, 0, None)  # Clip negative values
-                    for i in range(3):
-                        p2 = np.percentile(rgb_data[:,:,i], 2)
-                        p98 = np.percentile(rgb_data[:,:,i], 98)
-                        if p98 > p2:
-                            rgb_data[:,:,i] = np.clip((rgb_data[:,:,i] - p2) / (p98 - p2) * 255, 0, 255)
-                        else:
-                            rgb_data[:,:,i] = np.clip(rgb_data[:,:,i] / (rgb_data[:,:,i].max() or 1) * 255, 0, 255)
-                    # Convert to uint8
-                    rgb_data = rgb_data.astype(np.uint8)
-                    # Create PIL image
-                    img = Image.fromarray(rgb_data)
-                else:
-                    # Use first band as grayscale
-                    gray_data = src.read(1)
-                    # Normalize to 0-255 range
-                    p2 = np.percentile(gray_data, 2)
-                    p98 = np.percentile(gray_data, 98)
-                    if p98 > p2:
-                        gray_data = np.clip((gray_data - p2) / (p98 - p2) * 255, 0, 255)
-                    else:
-                        gray_data = np.clip(gray_data / (gray_data.max() or 1) * 255, 0, 255)
-                    # Convert to uint8
-                    gray_data = gray_data.astype(np.uint8)
-                    # Create PIL image
-                    img = Image.fromarray(gray_data, mode='L')
-            # Resize to thumbnail
-            img.thumbnail(max_size)
-            # Save to bytes buffer
-            buf = io.BytesIO()
-            img.save(buf, format=output_format)
-            buf.seek(0)
-            return buf
-        except Exception as e:
-            logger.error(f"Error creating thumbnail: {e}")
-            return None
-    def predict_biomass(self, image_file):
-        """Predict biomass from a satellite image with RGB comparison"""
-        if self.model is None:
-            return None, "Error: Model not loaded. Please check logs for details."
-        if image_file is None:
-            return None, "Error: No file uploaded. Please upload a GeoTIFF file or use one of the sample images."
-        try:
-            # Check if we're using a sample image (string path) or an uploaded file
-            if isinstance(image_file, str):
-                logger.info(f"Using sample image: {image_file}")
-                tmp_path = image_file  # Use the sample path directly
-                cleanup_tmp = False  # Don't delete the sample file
-            else:
-                # Create a temporary file to save the uploaded file
-                with tempfile.NamedTemporaryFile(suffix='.tif', delete=False) as tmp_file:
-                    tmp_path = tmp_file.name
-                    with open(image_file.name, 'rb') as f:
-                        tmp_file.write(f.read())
-                # Add to list for cleanup later
-                self.temp_files.append(tmp_path)
-                cleanup_tmp = True
-            # Open the image file
-            with rasterio.open(tmp_path) as src:
-                image = src.read()
-                height, width = image.shape[1], image.shape[2]
-                transform = src.transform
-                crs = src.crs
-                logger.info(f"Processing image: {height}x{width} pixels, {image.shape[0]} bands")
-                # Validate minimum band count
-                if image.shape[0] < 3:
-                    return None, f"Error: Image has only {image.shape[0]} bands. At least 3 bands are required for RGB visualization."
-                # Generate all features using feature engineering
-                logger.info("Generating all 99 features from bands...")
-                feature_matrix, valid_mask, generated_features = extract_all_features(image)
-                # Verify we have exactly 99 features
-                if feature_matrix.shape[1] != 99:
-                    logger.error(f"Error: Generated {feature_matrix.shape[1]} features, but model expects 99.")
-                    return None, f"Error: Generated {feature_matrix.shape[1]} features, but model expects 99."
-                # Apply feature scaling if available
                 try:
-                    if 'scaler' in self.package and self.package['scaler'] is not None:
-                        logger.info("Applying feature scaling...")
-                        feature_matrix = self.package['scaler'].transform(feature_matrix)
-                except Exception as e:
-                    logger.warning(f"Error applying scaler: {e}. Using original features.")
-                # Initialize predictions array
-                predictions = np.zeros((height, width), dtype=np.float32)
-                # Get valid pixel coordinates
-                valid_y, valid_x = np.where(valid_mask)
-                # Make predictions
-                logger.info(f"Running model inference on {len(valid_y)} valid pixels...")
-                with torch.no_grad():
-                    # Process in batches to avoid memory issues
-                    batch_size = 10000
-                    for i in range(0, len(valid_y), batch_size):
-                        end_idx = min(i + batch_size, len(valid_y))
-                        batch = feature_matrix[i:end_idx]
-                        # Convert to tensor
-                        batch_tensor = torch.tensor(batch, dtype=torch.float32).to(self.device)
-                        # Get predictions
-                        batch_predictions = self.model(batch_tensor).cpu().numpy()
-                        # Handle scalar case for single-item batches
-                        if batch_predictions.ndim == 0:
-                            batch_predictions = np.array([batch_predictions])
-                        # Convert from log scale if needed
-                        if self.package.get('use_log_transform', True):
-                            epsilon = self.package.get('epsilon', 1.0)
-                            batch_predictions = np.exp(batch_predictions) - epsilon
-                            batch_predictions = np.maximum(batch_predictions, 0)  # Ensure non-negative
-                        # Map predictions back to image
-                        for j, pred in enumerate(batch_predictions):
-                            y_idx = valid_y[i + j]
-                            x_idx = valid_x[i + j]
-                            predictions[y_idx, x_idx] = pred
-                        # Log progress
-                        if (i // batch_size) % 5 == 0 or end_idx == len(valid_y):
-                            logger.info(f"Processed {end_idx}/{len(valid_y)} pixels")
-                # Create visualization - always RGB+Biomass side-by-side
-                logger.info("Creating RGB + Biomass visualization...")
-                # Create side-by-side comparison (RGB and Biomass)
-                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
-                # Prepare RGB image - try different band combinations if needed
-                rgb_bands = [3, 2, 1]  # Common RGB combination (R,G,B)
-                # Check if we have enough bands for RGB
-                if image.shape[0] < 3:
-                    logger.warning(f"Image has only {image.shape[0]} bands, using available bands for display")
-                    rgb_bands = list(range(min(3, image.shape[0])))
-                    while len(rgb_bands) < 3:
-                        rgb_bands.append(0)  # Pad with zeros if needed
-                # Create RGB image
-                rgb = np.zeros((height, width, 3), dtype=np.float32)
-                for i, band_idx in enumerate(rgb_bands):
-                    if band_idx < image.shape[0]:
-                        rgb[:, :, i] = image[band_idx]
-                # Handle potential NaN values
-                rgb = np.nan_to_num(rgb)
-                # Enhance contrast with percentile-based normalization
-                for i in range(3):
-                    p2 = np.percentile(rgb[:,:,i], 2)
-                    p98 = np.percentile(rgb[:,:,i], 98)
-                    if p98 > p2:
-                        rgb[:,:,i] = np.clip((rgb[:,:,i] - p2) / (p98 - p2), 0, 1)
-                # Display RGB image
-                ax1.imshow(rgb)
-                ax1.set_title('RGB Image')
-                ax1.axis('off')
-                # Display biomass prediction
-                masked_predictions = np.ma.masked_where(~valid_mask, predictions)
-                vmin = np.percentile(predictions[valid_mask], 1)
-                vmax = np.percentile(predictions[valid_mask], 99)
-                im = ax2.imshow(masked_predictions, cmap='viridis', vmin=vmin, vmax=vmax)
-                fig.colorbar(im, ax=ax2, label='Biomass (Mg/ha)')
-                ax2.set_title('Predicted Biomass')
-                ax2.axis('off')
-                # Add super title
-                plt.suptitle('RGB Image and Biomass Prediction', fontsize=16)
-                plt.tight_layout()
-                # Save figure to bytes buffer
-                buf = io.BytesIO()
-                fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
-                buf.seek(0)
-                plt.close(fig)
-                # Calculate summary statistics
-                valid_predictions = predictions[valid_mask]
-                stats = {
-                    'Mean Biomass': f"{np.mean(valid_predictions):.2f} Mg/ha",
-                    'Median Biomass': f"{np.median(valid_predictions):.2f} Mg/ha",
-                    'Min Biomass': f"{np.min(valid_predictions):.2f} Mg/ha",
-                    'Max Biomass': f"{np.max(valid_predictions):.2f} Mg/ha"
-                }
-                # Add area and total biomass if transform is available
-                if transform is not None:
-                    pixel_area_m2 = abs(transform[0] * transform[4])  # Assuming square pixels
-                    total_biomass = np.sum(valid_predictions) * (pixel_area_m2 / 10000)  # Convert to hectares
-                    area_hectares = np.sum(valid_mask) * (pixel_area_m2 / 10000)
-                    stats['Total Biomass'] = f"{total_biomass:.2f} Mg"
-                    stats['Area'] = f"{area_hectares:.2f} hectares"
-                # Format statistics as markdown
-                stats_md = "### Biomass Statistics\n\n"
-                stats_md += "| Metric | Value |\n|--------|-------|\n"
-                for k, v in stats.items():
-                    stats_md += f"| {k} | {v} |\n"
-                # Add processing info
-                stats_md += f"\n\n*Processed {np.sum(valid_mask):,} valid pixels with {feature_matrix.shape[1]} features*"
-                # Cleanup temporary files if needed
-                if cleanup_tmp:
-                    self.cleanup()
-                # Return visualization and statistics
-                return Image.open(buf), stats_md
-        except Exception as e:
-            # Ensure cleanup even on error
-            self.cleanup()
-            import traceback
-            logger.error(f"Error predicting biomass: {e}")
-            logger.error(traceback.format_exc())
-            return None, f"Error predicting biomass: {str(e)}\n\nPlease check logs for details."
-    def create_interface(self):
-        """Create Gradio interface with sample image thumbnails"""
-        # Generate thumbnails for sample images
-        sample_thumbnails = {}
-        for name, path in self.sample_images.items():
-            if os.path.exists(path):
-                thumbnail = self.create_thumbnail(path)
-                if thumbnail:
-                    sample_thumbnails[name] = Image.open(thumbnail)
             else:
-                logger.warning(f"Sample image not found: {path}")
-        with gr.Blocks(title="Biomass Prediction Model") as interface:
-            gr.Markdown("# Above-Ground Biomass Prediction")
-            gr.Markdown("""
-            Upload a multi-band satellite image to predict above-ground biomass (AGB) across the landscape.
-            **Requirements:**
-            - Image must be a GeoTIFF with spectral bands
-            - For best results, image should contain at least 3 bands
-            """)
-            with gr.Row():
-                with gr.Column(scale=1):
-                    input_image = gr.File(
-                        label="Upload Satellite Image (GeoTIFF)",
-                        file_types=[".tif", ".tiff"]
-                    )
-                    # Sample images section
-                    gr.Markdown("### Sample Images")
-                    # Sample buttons container
-                    sample_buttons = []
-                    # First row - sample thumbnails side by side horizontally
-                    with gr.Row():
-                        for name, thumbnail in sample_thumbnails.items():
-                            with gr.Column():
-                                gr.Image(
-                                    value=thumbnail,
-                                    label=name.replace("input_", "Input ").replace("chip_", "Chip "),
-                                    show_download_button=False,
-                                    height=180
-                                )
-                    # Second row - buttons side by side horizontally, matching the thumbnails above
-                    with gr.Row():
-                        for name, _ in sample_thumbnails.items():
-                            with gr.Column():
-                                sample_btn = gr.Button(
-                                    f"Use {name.replace('input_', 'Input ').replace('chip_', 'Chip ')}",
-                                    variant="secondary",
-                                    size="lg"
-                                )
-                                sample_buttons.append((sample_btn, name))
-                    # Generate button at the bottom
-                    generate_btn = gr.Button("Generate Biomass Prediction", variant="primary", size="lg")
-                with gr.Column(scale=2):
-                    output_image = gr.Image(
-                        label="Biomass Prediction Map",
-                        type="pil"
-                    )
-                    output_stats = gr.Markdown(
-                        label="Statistics"
-                    )_image = gr.Image(
-                        label="Biomass Prediction Map",
-                        type="pil"
-                    )
-                    output_stats = gr.Markdown(
-                        label="Statistics"
-                    )
-            # Sample images section with thumbnails in a separate row
-            gr.Markdown("### Sample Images")
-            with gr.Row():
-                # Only show thumbnails for images that were found
-                sample_buttons = []
-                # Create a column for each sample image
-                for name, thumbnail in sample_thumbnails.items():
-                    with gr.Column():
-                        gr.Image(value=thumbnail, label=name.replace("input_", "Input ").replace("chip_", "Chip "),
-                                show_download_button=False, show_label=True, height=200)
-                        sample_btn = gr.Button(f"Use {name.replace('input_', 'Input ').replace('chip_', 'Chip ')}",
-                                            size="lg", variant="secondary")
-                        sample_buttons.append((sample_btn, name))
-                with gr.Column(scale=2):
-                    output_image = gr.Image(
-                        label="Biomass Prediction Map",
-                        type="pil"
-                    )
-                    output_stats = gr.Markdown(
-                        label="Statistics"
-                    )
-            with gr.Accordion("About", open=False):
-                gr.Markdown("""
-                ## About This Model
-                This biomass prediction model uses the StableResNet architecture to predict above-ground biomass from satellite imagery.
-                ### Model Details
-                - Architecture: StableResNet
-                - Input: Multi-spectral satellite imagery
-                - Output: Above-ground biomass (Mg/ha)
-                - Creator: vertify.earth for GIZ Forest Forward
-                - Date: 2025-05-19
-                ### How It Works
-                1. The model extracts features from each pixel in the satellite image
-                2. These features include spectral bands, vegetation indices, texture metrics, and more
-                3. The model outputs a biomass prediction for each pixel
-                4. Results are visualized as RGB and biomass prediction side-by-side
-                """)
-            # Add a warning if model failed to load
-            if self.model is None:
-                gr.Warning("⚠️ Model failed to load. The app may not work correctly. Check logs for details.")
-            # Connect the process button
-            process_btn.click(
-                fn=self.predict_biomass,
-                inputs=[input_image],
-                outputs=[output_image, output_stats]
-            )
-            # Connect the sample buttons
-            for button, name in sample_buttons:
-                button.click(
-                    fn=lambda path=self.sample_images[name]: self.predict_biomass(path),
-                    inputs=[],
-                    outputs=[output_image, output_stats]
-                )
-        return interface
-def launch_app():
-    """Launch the Gradio app"""
     try:
-        # Create app instance
-        app = BiomassPredictorApp()
-        # Create interface
-        interface = app.create_interface()
-        # Launch interface
-        interface.launch()
     except Exception as e:
-        logger.error(f"Error launching app: {e}")
         import traceback
-        logger.error(traceback.format_exc())
 if __name__ == "__main__":
-    launch_app()

+"""
+Feature engineering module for biomass prediction.
+This module extracts the 99 features needed by the StableResNet model.
 Author: najahpokkiri
 Date: 2025-05-19
 """
 import numpy as np
 import logging
+from datetime import datetime
 # Configure logger
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Try to import optional dependencies but don't fail if not available
+try:
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.decomposition import PCA
+    SKLEARN_AVAILABLE = True
+except ImportError:
+    SKLEARN_AVAILABLE = False
+    logger.warning("scikit-learn not available. PCA features will be approximated.")
+try:
+    from skimage.filters import sobel
+    from skimage.feature import local_binary_pattern, graycomatrix, graycoprops
+    SKIMAGE_AVAILABLE = True
+except ImportError:
+    SKIMAGE_AVAILABLE = False
+    logger.warning("scikit-image not available. Texture features will be approximated.")
+def safe_divide(a, b, fill_value=0.0):  # element-wise a / b; near-zero denominators yield fill_value
+    """Safe division that handles zeros in the denominator"""
+    a = np.asarray(a, dtype=np.float32)  # coerce numerator to a float32 array
+    b = np.asarray(b, dtype=np.float32)  # coerce denominator to a float32 array
+    # Handle NaN/Inf in inputs
+    a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0)  # non-finite numerators become 0
+    b = np.nan_to_num(b, nan=1e-10, posinf=1e10, neginf=-1e10)  # NaN denominators become 1e-10; infinities are clamped to +/-1e10
+    mask = np.abs(b) < 1e-10  # True where the denominator is treated as zero
+    result = np.full_like(a, fill_value, dtype=np.float32)  # start from fill_value everywhere
+    if np.any(~mask):  # skip the division entirely when no denominator is usable
+        result[~mask] = a[~mask] / b[~mask]  # NOTE(review): boolean indexing assumes a and b share a shape - confirm callers
+    return np.nan_to_num(result, nan=fill_value, posinf=fill_value, neginf=fill_value)  # scrub any non-finite results back to fill_value
def calculate_spectral_indices(satellite_data):
    """Calculate vegetation, water and burn indices from satellite bands.

    Bands are read from fixed zero-based positions along the first axis:
    1 (blue), 2 (green), 3 (red), 7 (nir), 9 (swir1) and 10 (swir2).
    NOTE(review): this mapping assumes a Sentinel-2 style band order;
    confirm against the actual input rasters.

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width).

    Returns:
        dict: Maps each of 'NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI'
        and 'NBR' to a (height, width) array. Indices that could not be
        computed (missing bands or an error) are all-zero float32 arrays.
    """
    indices = {}
    n_bands = satellite_data.shape[0]

    def safe_get_band(idx):
        """Return band ``idx`` as a floating-point array, or None if absent."""
        if idx >= n_bands:
            return None
        band = satellite_data[idx]
        # Integer rasters (e.g. uint16) must be promoted before arithmetic:
        # otherwise ``nir - red`` wraps around wherever red > nir.
        if not np.issubdtype(band.dtype, np.floating):
            band = band.astype(np.float32)
        return band

    try:
        blue = safe_get_band(1)
        green = safe_get_band(2)
        red = safe_get_band(3)
        nir = safe_get_band(7)
        swir1 = safe_get_band(9)
        swir2 = safe_get_band(10)

        if red is not None and nir is not None:
            # NDVI (Normalized Difference Vegetation Index)
            indices['NDVI'] = safe_divide(nir - red, nir + red)

            if blue is not None and green is not None:
                # EVI (Enhanced Vegetation Index)
                indices['EVI'] = 2.5 * safe_divide(nir - red, nir + 6*red - 7.5*blue + 1)
                # SAVI (Soil Adjusted Vegetation Index)
                indices['SAVI'] = 1.5 * safe_divide(nir - red, nir + red + 0.5)
                # MSAVI2 (Modified Soil Adjusted Vegetation Index).
                # A negative discriminant yields NaN, which is zeroed below;
                # silence the accompanying RuntimeWarning.
                with np.errstate(invalid='ignore'):
                    indices['MSAVI2'] = 0.5 * (2 * nir + 1 - np.sqrt((2 * nir + 1)**2 - 8 * (nir - red)))
                # NDWI (Normalized Difference Water Index)
                indices['NDWI'] = safe_divide(green - nir, green + nir)

        if swir1 is not None and nir is not None:
            # NDMI (Normalized Difference Moisture Index)
            indices['NDMI'] = safe_divide(nir - swir1, nir + swir1)

        if swir2 is not None and nir is not None:
            # NBR (Normalized Burn Ratio)
            indices['NBR'] = safe_divide(nir - swir2, nir + swir2)
    except Exception as e:
        # Best effort: whatever was computed so far is kept, the rest
        # falls back to zeros below.
        logger.warning(f"Error calculating spectral indices: {e}")

    # Replace any remaining NaNs with zeros
    indices = {name: np.nan_to_num(values, nan=0.0) for name, values in indices.items()}

    # Guarantee that every expected index is present
    required_indices = ['NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI', 'NBR']
    height, width = satellite_data.shape[1], satellite_data.shape[2]
    for name in required_indices:
        if name not in indices:
            indices[name] = np.zeros((height, width), dtype=np.float32)

    return indices
def extract_texture_features(satellite_data):
    """Extract texture features from a single band of the satellite data.

    The band at zero-based index 7 is used, or the last band when fewer
    than eight bands are present.

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width).

    Returns:
        dict: Maps 'Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7',
        'GLCM_dissimilarity_B7', 'GLCM_homogeneity_B7' and 'GLCM_energy_B7'
        to (height, width) arrays. Each GLCM feature is one scalar, computed
        on a central patch of at most 128x128 pixels, repeated over the
        image. Features that cannot be computed are all-zero float32 arrays.
    """
    texture_names = ['Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7', 'GLCM_dissimilarity_B7',
                     'GLCM_homogeneity_B7', 'GLCM_energy_B7']
    glcm_props = ['contrast', 'dissimilarity', 'homogeneity', 'energy']
    height, width = satellite_data.shape[1], satellite_data.shape[2]

    def placeholder():
        """Return a fresh all-zero feature map."""
        return np.zeros((height, width), dtype=np.float32)

    # If scikit-image is not available, return placeholders
    if not SKIMAGE_AVAILABLE:
        return {name: placeholder() for name in texture_names}

    texture_features = {}
    try:
        b7_idx = min(7, satellite_data.shape[0] - 1)
        band = np.nan_to_num(satellite_data[b7_idx], nan=0.0)

        # 1. Sobel filter for edge detection
        texture_features['Sobel_B7'] = sobel(band)

        # 2. Local Binary Pattern on the band rescaled to 0-255.
        # Start from an all-zero uint8 image so that a constant band
        # (1st percentile == 99th percentile) still yields an integer image
        # for the LBP and GLCM steps.
        band_norm = np.zeros(band.shape, dtype=np.uint8)
        if band.size > 0:
            band_min, band_max = np.nanpercentile(band, [1, 99])
            if band_max > band_min:
                band_norm = np.clip(
                    (band - band_min) / (band_max - band_min + 1e-8) * 255, 0, 255
                ).astype(np.uint8)

        texture_features['LBP_B7'] = local_binary_pattern(band_norm, 8, 1, method='uniform')

        # 3. GLCM properties, computed on a patch around the image centre
        sample_size = min(128, height, width)
        center_y, center_x = height // 2, width // 2
        offset = sample_size // 2
        y_start = max(0, center_y - offset)
        y_end = min(height, center_y + offset)
        x_start = max(0, center_x - offset)
        x_end = min(width, center_x + offset)
        patch = band_norm[y_start:y_end, x_start:x_end]

        if patch.size > 0:
            glcm = graycomatrix(patch, [1], [0], levels=256, symmetric=True, normed=True)
            for prop in glcm_props:
                try:
                    value = float(graycoprops(glcm, prop)[0, 0])
                    texture_features[f'GLCM_{prop}_B7'] = np.full(
                        (height, width), value, dtype=np.float32
                    )
                except Exception as e:
                    logger.warning(f"Error calculating GLCM {prop}: {e}")
                    texture_features[f'GLCM_{prop}_B7'] = placeholder()
        else:
            # Create placeholder GLCM features if patch is invalid
            for prop in glcm_props:
                texture_features[f'GLCM_{prop}_B7'] = placeholder()
    except Exception as e:
        logger.error(f"Error in texture feature extraction: {e}")
        # Keep whatever was computed before the failure and fill only the
        # missing features with placeholders.
        for name in texture_names:
            texture_features.setdefault(name, placeholder())

    return texture_features
def calculate_spatial_features(satellite_data, indices):
    """Compute gradient-magnitude maps used as spatial context features.

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width).
        indices (dict): Spectral indices; the 'NDVI' entry is used when
            present, otherwise an all-zero map is substituted.

    Returns:
        dict: 'Gradient_B7' (gradient magnitude of the band at zero-based
        index 7, or of the last band when fewer bands exist) and
        'NDVI_gradient' (gradient magnitude of NDVI). A feature whose
        computation fails is replaced by an all-zero float32 map.
    """
    n_rows, n_cols = satellite_data.shape[1], satellite_data.shape[2]

    def magnitude(image):
        """Return the per-pixel gradient magnitude of a 2-D array."""
        d_rows, d_cols = np.gradient(image)
        return np.sqrt(d_cols**2 + d_rows**2)

    features = {}

    # 1. Gradient of the selected band, with NaNs zeroed beforehand
    band_index = min(7, satellite_data.shape[0] - 1)
    selected = np.nan_to_num(satellite_data[band_index], nan=0.0)
    try:
        features['Gradient_B7'] = magnitude(selected)
    except Exception as e:
        logger.warning(f"Error calculating band gradient: {e}")
        features['Gradient_B7'] = np.zeros((n_rows, n_cols), dtype=np.float32)

    # 2. Gradient of NDVI
    try:
        ndvi_map = indices.get('NDVI', np.zeros((n_rows, n_cols), dtype=np.float32))
        features['NDVI_gradient'] = magnitude(np.nan_to_num(ndvi_map, nan=0.0))
    except Exception as e:
        logger.warning(f"Error calculating NDVI gradient: {e}")
        features['NDVI_gradient'] = np.zeros((n_rows, n_cols), dtype=np.float32)

    return features
def calculate_pca_features(satellite_data, n_components=25):
    """Calculate per-pixel PCA components from the satellite bands.

    Bands are standardized and projected with PCA, fitted only on pixels
    whose values are finite in every band. Pixels excluded from the fit get
    zeros in every component. When fewer than ``n_components`` components
    can be computed, the remaining ones are zero-padded.

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width).
        n_components (int): Number of PCA features to return.

    Returns:
        dict: Maps 'PCA_01' ... 'PCA_<n_components>' to (height, width)
        arrays. Without scikit-learn the first components are the raw bands
        and the rest are zeros. On any error all components are zeros.
    """
    height, width = satellite_data.shape[1], satellite_data.shape[2]
    n_bands = satellite_data.shape[0]
    names = [f'PCA_{i:02d}' for i in range(1, n_components + 1)]

    def zero_features():
        """Return an all-zero placeholder for every component."""
        return {name: np.zeros((height, width), dtype=np.float32) for name in names}

    # If scikit-learn is not available, approximate with the raw bands
    if not SKLEARN_AVAILABLE:
        pca_features = zero_features()
        for band_idx, name in enumerate(names[:n_bands]):
            pca_features[name] = satellite_data[band_idx]
        return pca_features

    try:
        # Reshape for PCA (pixels x bands)
        bands_reshaped = satellite_data.reshape(n_bands, -1).T

        # Drop pixels with NaN or +/-Inf in any band. Checking NaN alone
        # would let infinities reach the scaler and abort the whole PCA step.
        valid_mask = np.all(np.isfinite(bands_reshaped), axis=1)
        bands_clean = bands_reshaped[valid_mask]

        if len(bands_clean) == 0:
            logger.warning("No valid data for PCA calculation")
            return zero_features()

        # Standardize valid data
        bands_scaled = StandardScaler().fit_transform(bands_clean)

        # PCA cannot return more components than there are bands or samples
        pca = PCA(n_components=min(n_components, bands_scaled.shape[1], bands_scaled.shape[0]))
        pca_result = pca.fit_transform(bands_scaled)

        # Zero-pad up to the requested number of components if needed
        actual_components = pca_result.shape[1]
        if actual_components < n_components:
            logger.warning(f"Only {actual_components} PCA components calculated, padding to {n_components}")
            padding = np.zeros((pca_result.shape[0], n_components - actual_components))
            pca_result = np.hstack([pca_result, padding])

        # Map back to the original pixel grid; excluded pixels stay at zero
        pca_all = np.zeros((bands_reshaped.shape[0], n_components))
        pca_all[valid_mask] = pca_result
        pca_spatial = pca_all.reshape(height, width, n_components)

        pca_features = {name: pca_spatial[:, :, k] for k, name in enumerate(names)}

        # Log PCA explained variance
        if hasattr(pca, 'explained_variance_ratio_'):
            logger.info(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.3f}")
    except Exception as e:
        logger.error(f"Error calculating PCA features: {e}")
        pca_features = zero_features()

    return pca_features
def extract_all_features(satellite_data):
    """
    Extract exactly 99 features needed by the model:
    - 59 original bands
    - 7 spectral indices
    - 6 texture features
    - 2 spatial features
    - 25 PCA components

    Inputs with fewer than 59 bands are zero-padded; bands beyond the 59th
    are not emitted as band features (a warning is logged).

    Parameters:
        satellite_data (ndarray): Array of shape (bands, height, width)
    Returns:
        features_array (ndarray): float32 array of shape (valid_pixels, 99)
        valid_mask (ndarray): Boolean (height, width) mask of pixels that are
            finite in every band
        feature_names (list): List of 99 feature names, in column order
    """
    n_model_bands = 59
    n_pca_components = 25

    start_time = datetime.now()
    logger.info("Extracting features for biomass prediction...")

    n_bands = satellite_data.shape[0]
    height, width = satellite_data.shape[1], satellite_data.shape[2]
    if n_bands > n_model_bands:
        # Make the truncation visible instead of dropping data silently
        logger.warning(
            f"Input has {n_bands} bands; only the first {n_model_bands} are used as band features"
        )

    # Create valid pixel mask (no NaN or Inf values)
    valid_mask = np.all(np.isfinite(satellite_data), axis=0)
    valid_y, valid_x = np.where(valid_mask)
    n_valid = len(valid_y)
    logger.info(f"Found {n_valid} valid pixels out of {height*width}")

    # Generate all feature categories
    logger.info("Calculating spectral indices...")
    indices = calculate_spectral_indices(satellite_data)
    logger.info("Extracting texture features...")
    texture_features = extract_texture_features(satellite_data)
    logger.info("Calculating spatial features...")
    spatial_features = calculate_spatial_features(satellite_data, indices)
    logger.info("Computing PCA components...")
    pca_features = calculate_pca_features(satellite_data, n_components=n_pca_components)

    # Ordered feature names; this order defines the matrix columns
    band_names = [f'Band_{i:02d}' for i in range(1, n_model_bands + 1)]
    spectral_indices = ['NDVI', 'EVI', 'SAVI', 'MSAVI2', 'NDWI', 'NDMI', 'NBR']
    texture_names = ['Sobel_B7', 'LBP_B7', 'GLCM_contrast_B7', 'GLCM_dissimilarity_B7',
                     'GLCM_homogeneity_B7', 'GLCM_energy_B7']
    spatial_names = ['Gradient_B7', 'NDVI_gradient']
    pca_names = [f'PCA_{i:02d}' for i in range(1, n_pca_components + 1)]
    feature_names = band_names + spectral_indices + texture_names + spatial_names + pca_names

    # Create feature dictionary with all features
    all_features = {}
    for band_idx, name in enumerate(band_names):
        if band_idx < n_bands:
            all_features[name] = satellite_data[band_idx]
        else:
            # Pad with zeros if we have fewer than 59 bands
            all_features[name] = np.zeros((height, width), dtype=np.float32)
    all_features.update(indices)
    all_features.update(texture_features)
    all_features.update(spatial_features)
    all_features.update(pca_features)

    # Verify we have exactly 99 features
    assert len(feature_names) == 99, f"Expected 99 features, but got {len(feature_names)}"

    # Extract feature values for valid pixels
    feature_matrix = np.zeros((n_valid, len(feature_names)), dtype=np.float32)
    for col, name in enumerate(feature_names):
        if name not in all_features:
            # The column is already zero-initialised
            logger.warning(f"Feature '{name}' not found, using zeros")
            continue
        feature_data = all_features[name]
        if feature_data.ndim == 2:
            feature_values = feature_data[valid_y, valid_x]
        else:
            feature_values = np.full(n_valid, feature_data)
        feature_matrix[:, col] = np.nan_to_num(feature_values, nan=0.0)

    processing_time = (datetime.now() - start_time).total_seconds()
    logger.info(f"Successfully extracted {len(feature_names)} features for {n_valid} pixels in {processing_time:.2f} seconds")
    return feature_matrix, valid_mask, feature_names
# Simple test function
def test_feature_extraction():
    """Run the feature extraction pipeline on synthetic data as a smoke test.

    Builds a seeded random 5-band, 100x100 float32 image, extracts features
    and checks that the feature matrix has one row per valid pixel, one
    column per feature name, and contains only finite values.

    Returns:
        bool: True when extraction succeeds and the checks pass, False
        otherwise (the failure is printed, exceptions with a traceback).
    """
    try:
        # Seeded so that repeated runs exercise identical data
        rng = np.random.default_rng(0)
        satellite_data = rng.random((5, 100, 100), dtype=np.float32)

        # Extract features
        feature_matrix, valid_mask, feature_names = extract_all_features(satellite_data)

        # Print summary
        print(f"Sample data shape: {satellite_data.shape}")
        print(f"Feature matrix shape: {feature_matrix.shape}")
        print(f"Number of feature names: {len(feature_names)}")
        print(f"Valid pixels: {np.sum(valid_mask)}")

        # Verify the output instead of only checking that nothing raised
        expected_shape = (int(np.sum(valid_mask)), len(feature_names))
        if feature_matrix.shape != expected_shape:
            print(f"Feature extraction test failed: expected shape {expected_shape}, got {feature_matrix.shape}")
            return False
        if not np.all(np.isfinite(feature_matrix)):
            print("Feature extraction test failed: feature matrix contains non-finite values")
            return False
        return True
    except Exception as e:
        print(f"Feature extraction test failed: {e}")
        import traceback
        traceback.print_exc()
        return False
 if __name__ == "__main__":
+    # Run a simple test if this script is executed directly
+    test_feature_extraction()