Spaces:

neuralworm
/

SWCK

Running

App Files Files Community

neuralworm committed on May 30

Commit

ce4931d

verified ·

1 Parent(s): 40376ef

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -81

app.py CHANGED Viewed

@@ -11,10 +11,10 @@ from model import SWCKModel, SeedParser, EntropyEstimator # Assuming model.py is
 # --- Vocabulary and Tokenizer Setup ---
 PAD_TOKEN_STR = "<pad>"; SOS_TOKEN_STR = "<sos>"; EOS_TOKEN_STR = "<eos>"; UNK_TOKEN_STR = "<unk>"
 PAD_TOKEN = 0; SOS_TOKEN = 1; EOS_TOKEN = 2; UNK_TOKEN = 3
-SEQ_LEN_APP = 64 # Max sequence length for training samples in app & generation context
 # --- Model Configuration ---
-VOCAB_SIZE_APP = 189 # Placeholder, will be updated by vocab loading/building
 D_MODEL_APP = 64
 N_HEADS_APP = 2
 D_FF_APP = 128
@@ -38,7 +38,7 @@ This is a stream of consciousness, a digital mindscape.
 The target is not just prediction, but a form of self-understanding, however metaphorical.
 Let the adaptive blocks find their balance. Let the entropy guide the wiring.
 A painter paints. A scientist explores. A writer writes. The machine... becomes.
-""" # Re-added for in-app training data
 # Global model variables
 swck_model_global = None
@@ -48,14 +48,13 @@ idx_to_word_global = None
 device_global = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model_load_status_global = "Model not loaded."
-CHECKPOINT_FILENAME = "swck_model_conceptual_app.pth.tar" # App specific checkpoint
-# Loss Weights (should match train.py for consistency if loading that checkpoint)
 MAIN_LOSS_WEIGHT_APP = 1.0
 BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP = 0.02
 OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP = 0.01
 GATE_SPARSITY_LOSS_WEIGHT_APP = 0.001
-WIRING_PHASE_EPOCHS_APP = 1 # Very short wiring phase for in-app training demo
 def build_vocab_from_corpus_text_app(corpus_text):
@@ -94,12 +93,11 @@ def initialize_or_load_model_app():
     }
     swck_model_global = SWCKModel(**model_args).to(device_global)
-    # Enable all debug prints for console view
-    swck_model_global.debug_prints_enabled = True
     if hasattr(swck_model_global, 'seed_parser'): swck_model_global.seed_parser.debug_prints_enabled = True
     for i,block in enumerate(swck_model_global.adaptive_blocks):
-        block.debug_prints_enabled = True
-        print(f"App: Debug prints enabled for AdaptiveBlock {i}")
     if os.path.exists(CHECKPOINT_FILENAME):
@@ -108,27 +106,29 @@ def initialize_or_load_model_app():
             checkpoint = torch.load(CHECKPOINT_FILENAME, map_location=device_global)
             swck_model_global.load_state_dict(checkpoint['model_state_dict'])
-            # Re-initialize optimizer for the loaded model
-            optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001) # Use app's LR
-            if 'optimizer_state_dict' in checkpoint: # Load optimizer state if you want to continue training
                  optimizer_global.load_state_dict(checkpoint['optimizer_state_dict'])
-            # Vocab should ideally be part of checkpoint for consistency, but we rebuilt it
-            if 'word_to_idx' in checkpoint: # Overwrite with checkpoint vocab if present
                 loaded_w2i = checkpoint['word_to_idx']
-                if len(loaded_w2i) == VOCAB_SIZE_APP: # Basic sanity check
                     word_to_idx_global = loaded_w2i
                     idx_to_word_global = {v: k for k,v in loaded_w2i.items()}
-                    print("App: Overwrote vocab with checkpoint's vocab.")
                 else:
-                    print("App: Checkpoint vocab size mismatch, using app's rebuilt vocab.")
             model_load_status_global = f"Model loaded successfully from {CHECKPOINT_FILENAME}."
             print(model_load_status_global)
         except Exception as e:
             print(f"App: Error loading model from checkpoint: {e}. Initializing new model.")
-            # Re-initialize model if loading failed to ensure it's fresh
-            swck_model_global = SWCKModel(**model_args).to(device_global)
             optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001)
             model_load_status_global = "Error loading checkpoint. Using new (untrained) model."
     else:
@@ -136,11 +136,10 @@ def initialize_or_load_model_app():
         optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001)
         model_load_status_global = "Initialized a new (untrained) model."
-    swck_model_global.eval() # Default to eval mode
     return model_load_status_global
-# --- Dataset for in-app training ---
 class AppSWCKDataset(Dataset):
     def __init__(self, text_corpus_str, w2i_map, seq_len, sos_id, eos_id, pad_id):
         tokens = re.sub(r'\s+', ' ', text_corpus_str.lower()).strip().split()
@@ -149,9 +148,11 @@ class AppSWCKDataset(Dataset):
         self.seq_len = seq_len
         self.sos_id, self.eos_id, self.pad_id = sos_id, eos_id, pad_id
         self.samples = []
-        for i in range(len(token_ids) - seq_len):
-            input_seq = [self.sos_id] + token_ids[i : i + seq_len]
-            target_seq = token_ids[i + 1 : i + seq_len + 1] + [self.eos_id]
             self.samples.append((input_seq, target_seq))
         print(f"AppSWCKDataset: Created {len(self.samples)} training samples for in-app training.")
@@ -166,7 +167,6 @@ def app_swck_collate_fn(batch):
     padded_tgt = nn.utils.rnn.pad_sequence(tgt_list, batch_first=True, padding_value=PAD_TOKEN)
     return padded_src, padded_tgt
-# --- In-app Training Function (Simplified) ---
 def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app, progress=gr.Progress(track_tqdm=True)):
     global swck_model_global, optimizer_global, word_to_idx_global, model_load_status_global
@@ -176,56 +176,80 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
     print("\n--- App: Starting Short Training Session ---")
     progress(0, desc="Preparing training data...")
-    # Use the extended text for training
     training_corpus = SEED_PHRASE_APP + " " + EXTENDED_TEXT_FOR_TRAINING_APP
     app_dataset = AppSWCKDataset(training_corpus, word_to_idx_global, SEQ_LEN_APP, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN)
     if not app_dataset.samples:
         return "App Training Error: No samples created from the corpus."
-    app_dataloader = DataLoader(app_dataset, batch_size=batch_size_app, shuffle=True, collate_fn=app_swck_collate_fn)
-    # Re-initialize optimizer or update LR
     if optimizer_global is None:
         optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=learning_rate_app)
-    else: # Update LR if optimizer exists
         for param_group in optimizer_global.param_groups:
             param_group['lr'] = learning_rate_app
     criterion_main_app = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
-    training_log_output = ""
-    swck_model_global.train() # Set model to training mode
-    for epoch in progress.tqdm(range(num_epochs_app), desc="Training Epochs"):
-        swck_model_global.set_wiring_phase(epoch < WIRING_PHASE_EPOCHS_APP) # wiring phase for first few
         epoch_loss = 0.0
         for batch_idx, (src_batch, tgt_batch) in enumerate(app_dataloader):
             src_batch, tgt_batch = src_batch.to(device_global), tgt_batch.to(device_global)
-            decoder_input_tokens = src_batch
-            gold_standard_for_loss = tgt_batch
             src_key_padding_mask = (decoder_input_tokens == PAD_TOKEN)
             optimizer_global.zero_grad()
             logits, entropy_report = swck_model_global(decoder_input_tokens, src_key_padding_mask=src_key_padding_mask)
-            main_loss = criterion_main_app(logits.view(-1, logits.size(-1)), gold_standard_for_loss.view(-1))
             block_entropy_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["block_output_entropies"]:
-                for i, block_entropy in enumerate(entropy_report["block_output_entropies"]):
-                    target_entropy = swck_model_global.seed_parser.get_block_config(i)["target_entropy"]
-                    block_entropy_loss += F.mse_loss(block_entropy, torch.tensor(target_entropy, device=device_global))
-                if entropy_report["block_output_entropies"]:
                     block_entropy_loss = block_entropy_loss / len(entropy_report["block_output_entropies"])
             overall_entropy_loss = entropy_report["overall_output_entropy"]
             gate_sparsity_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["block_gate_weights"]:
-                for gates_softmax in entropy_report["block_gate_weights"]:
-                    gate_sparsity_loss += torch.mean(gates_softmax * torch.log(gates_softmax + 1e-9))
-                if entropy_report["block_gate_weights"]:
                      gate_sparsity_loss = - (gate_sparsity_loss / len(entropy_report["block_gate_weights"]))
             combined_loss = (MAIN_LOSS_WEIGHT_APP * main_loss +
                              BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP * block_entropy_loss +
                              OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP * overall_entropy_loss +
@@ -236,33 +260,38 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
             optimizer_global.step()
             epoch_loss += combined_loss.item()
-            if batch_idx % 1 == 0: # Log every batch for small dataset
-                log_line = f"  Epoch {epoch+1}, Batch {batch_idx+1}/{len(app_dataloader)}, Loss: {combined_loss.item():.4f}"
-                print(log_line) # To Space console logs
-                # training_log_output += log_line + "\n" # Accumulate for Gradio output (can get long)
-        avg_epoch_loss = epoch_loss / len(app_dataloader)
         epoch_summary = f"Epoch {epoch+1}/{num_epochs_app} - Avg Loss: {avg_epoch_loss:.4f}\n"
         print(epoch_summary)
         training_log_output += epoch_summary
-        # progress.update() # Not needed with track_tqdm
-    swck_model_global.eval() # Set back to eval mode
-    # Save the updated model state
     try:
         torch.save({
             'model_state_dict': swck_model_global.state_dict(),
-            'optimizer_state_dict': optimizer_global.state_dict(), # Save optimizer too
             'word_to_idx': word_to_idx_global,
             'idx_to_word': idx_to_word_global,
-            # Include other necessary metadata for consistent loading
-            'model_hyperparameters': { # Example of saving model construction args
                 'vocab_size': VOCAB_SIZE_APP, 'd_model': D_MODEL_APP, 'n_heads': N_HEADS_APP,
                 'd_ff': D_FF_APP, 'num_adaptive_blocks': NUM_ADAPTIVE_BLOCKS_APP, 'dropout': DROPOUT_APP
             }
         }, CHECKPOINT_FILENAME)
-        save_msg = f"Training finished. Model checkpoint saved to {CHECKPOINT_FILENAME} in Space."
         print(save_msg)
         training_log_output += save_msg
         model_load_status_global = f"Model trained in-app & saved. Last status: {save_msg}"
@@ -274,14 +303,16 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
     return training_log_output
-# --- Text Generation Function (adapted from train.py) ---
 def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
-    global model_load_status_global # To update if model isn't ready
     if swck_model_global is None or word_to_idx_global is None or idx_to_word_global is None:
         return "Model not loaded. Please check server logs or try training.", "Model not available."
     swck_model_global.eval()
     swck_model_global.set_wiring_phase(False)
     print(f"App: Generating for prompt: '{prompt_str}', max_len: {max_len_gen}, temp: {temperature_gen}")
@@ -290,8 +321,12 @@ def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
     debug_info_lines = [f"Prompt tokens: {generated_ids_app}"]
     with torch.no_grad():
-        for i in range(max_len_gen):
-            current_context_ids = generated_ids_app[-SEQ_LEN_APP:]
             input_tensor = torch.tensor([current_context_ids], dtype=torch.long).to(device_global)
             padding_mask = (input_tensor == PAD_TOKEN)
@@ -302,9 +337,9 @@ def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
                 next_token_id = torch.argmax(next_token_logits).item()
             else:
                 probs = F.softmax(next_token_logits / temperature_gen, dim=-1)
-                if probs.isnan().any() or probs.isinf().any() or torch.sum(probs).item() < 1e-9 : # Check for bad probs
                     print(f"Warning: Invalid probabilities at step {i}. Using uniform.")
-                    probs = torch.ones_like(next_token_logits) / next_token_logits.size(-1) # Fallback
                 next_token_id = torch.multinomial(probs, 1).item()
             if next_token_id == EOS_TOKEN:
@@ -315,12 +350,15 @@ def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
             if i < 10 :
                 current_word = idx_to_word_global.get(next_token_id, UNK_TOKEN_STR)
                 overall_ent = entropy_report_infer['overall_output_entropy'].item()
-                if entropy_report_infer['block_output_entropies']: # Check if list is not empty
                     b0_ent = entropy_report_infer['block_output_entropies'][0].item()
-                    b0_gates_str = ", ".join([f"{g.item():.2f}" for g in entropy_report_infer['block_gate_weights'][0]])
-                    debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, B0Ent={b0_ent:.3f}, B0Gates=[{b0_gates_str}]")
                 else:
-                    debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, No block entropy report.")
     generated_text_list = [idx_to_word_global.get(idx, UNK_TOKEN_STR) for idx in generated_ids_app[1:]]
@@ -331,12 +369,14 @@ def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
     final_text = re.sub(r'\s+', ' ', final_text).strip()
     debug_output_str = "\n".join(debug_info_lines)
     return final_text, debug_output_str
 # --- Gradio Interface ---
-# Load model on app startup
-initial_load_status = initialize_or_load_model_app()
 with gr.Blocks(title="SWCK Conceptual Demo") as demo:
     gr.Markdown(f"""
@@ -364,12 +404,18 @@ with gr.Blocks(title="SWCK Conceptual Demo") as demo:
             with gr.Row():
                 train_epochs_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Number of Training Epochs")
                 train_batch_size_slider = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Training Batch Size")
-                train_lr_slider = gr.Slider(minimum=1e-5, maximum=1e-3, value=5e-4, step=1e-5, label="Learning Rate", format="%.1e")
             start_training_button = gr.Button("Start Short Training Session")
-            training_status_output = gr.Textbox(label="Training Log / Status:", lines=10, interactive=False)
-    # Define actions
     generate_button.click(
         fn=generate_text_for_app,
         inputs=[prompt_input, max_len_slider, temp_slider],
@@ -380,12 +426,11 @@ with gr.Blocks(title="SWCK Conceptual Demo") as demo:
         fn=run_short_training_session,
         inputs=[train_epochs_slider, train_batch_size_slider, train_lr_slider],
         outputs=[training_status_output]
-    ).then(fn=lambda: model_load_status_global, inputs=None, outputs=gr.Markdown(elem_id="model_status_display"))
-    # The .then part to update status might need JavaScript if Markdown elem_id doesn't work directly for dynamic updates.
-    # For simplicity, the training function itself prints to console and returns a string.
-    # A more robust status update would use gr.HTML or JS.
 if __name__ == "__main__":
-    # When running locally, ensure debug=True for Gradio's own debug mode if needed.
-    # On Spaces, console logs are primary.
-    demo.launch(debug=True) # Enable Gradio debug for local run

 # --- Vocabulary and Tokenizer Setup ---
 PAD_TOKEN_STR = "<pad>"; SOS_TOKEN_STR = "<sos>"; EOS_TOKEN_STR = "<eos>"; UNK_TOKEN_STR = "<unk>"
 PAD_TOKEN = 0; SOS_TOKEN = 1; EOS_TOKEN = 2; UNK_TOKEN = 3
+SEQ_LEN_APP = 64
 # --- Model Configuration ---
+VOCAB_SIZE_APP = 189
 D_MODEL_APP = 64
 N_HEADS_APP = 2
 D_FF_APP = 128
 The target is not just prediction, but a form of self-understanding, however metaphorical.
 Let the adaptive blocks find their balance. Let the entropy guide the wiring.
 A painter paints. A scientist explores. A writer writes. The machine... becomes.
+"""
 # Global model variables
 swck_model_global = None
 device_global = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model_load_status_global = "Model not loaded."
+CHECKPOINT_FILENAME = "swck_model_conceptual_app.pth.tar"
 MAIN_LOSS_WEIGHT_APP = 1.0
 BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP = 0.02
 OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP = 0.01
 GATE_SPARSITY_LOSS_WEIGHT_APP = 0.001
+WIRING_PHASE_EPOCHS_APP = 1
 def build_vocab_from_corpus_text_app(corpus_text):
     }
     swck_model_global = SWCKModel(**model_args).to(device_global)
+    swck_model_global.debug_prints_enabled = True # Top-level model debug
     if hasattr(swck_model_global, 'seed_parser'): swck_model_global.seed_parser.debug_prints_enabled = True
     for i,block in enumerate(swck_model_global.adaptive_blocks):
+        block.debug_prints_enabled = True # Block-level debug
+        # print(f"App: Debug prints explicitly enabled for AdaptiveBlock {i}")
     if os.path.exists(CHECKPOINT_FILENAME):
             checkpoint = torch.load(CHECKPOINT_FILENAME, map_location=device_global)
             swck_model_global.load_state_dict(checkpoint['model_state_dict'])
+            optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001)
+            if 'optimizer_state_dict' in checkpoint:
                  optimizer_global.load_state_dict(checkpoint['optimizer_state_dict'])
+            if 'word_to_idx' in checkpoint:
                 loaded_w2i = checkpoint['word_to_idx']
+                # Basic check, could be more robust
+                if isinstance(loaded_w2i, dict) and len(loaded_w2i) > 4:
                     word_to_idx_global = loaded_w2i
                     idx_to_word_global = {v: k for k,v in loaded_w2i.items()}
+                    VOCAB_SIZE_APP = len(word_to_idx_global) # Ensure vocab size reflects loaded
+                    print(f"App: Overwrote vocab with checkpoint's vocab. New size: {VOCAB_SIZE_APP}")
                 else:
+                    print("App: Checkpoint vocab seems invalid, using app's rebuilt vocab.")
+            else:
+                print("App: word_to_idx not in checkpoint, using app's rebuilt vocab.")
             model_load_status_global = f"Model loaded successfully from {CHECKPOINT_FILENAME}."
             print(model_load_status_global)
         except Exception as e:
             print(f"App: Error loading model from checkpoint: {e}. Initializing new model.")
+            swck_model_global = SWCKModel(**model_args).to(device_global) # Re-init
             optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001)
             model_load_status_global = "Error loading checkpoint. Using new (untrained) model."
     else:
         optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=0.001)
         model_load_status_global = "Initialized a new (untrained) model."
+    swck_model_global.eval()
     return model_load_status_global
 class AppSWCKDataset(Dataset):
     def __init__(self, text_corpus_str, w2i_map, seq_len, sos_id, eos_id, pad_id):
         tokens = re.sub(r'\s+', ' ', text_corpus_str.lower()).strip().split()
         self.seq_len = seq_len
         self.sos_id, self.eos_id, self.pad_id = sos_id, eos_id, pad_id
         self.samples = []
+        # Create overlapping sequences for language modeling
+        # Ensure target is seq_len for consistency with input to model.
+        for i in range(len(token_ids) - seq_len -1): # -1 to ensure target has full seq_len
+            input_seq = [self.sos_id] + token_ids[i : i + seq_len] # length seq_len + 1
+            target_seq = token_ids[i + 1 : i + seq_len + 1] + [self.eos_id] # length seq_len + 1
             self.samples.append((input_seq, target_seq))
         print(f"AppSWCKDataset: Created {len(self.samples)} training samples for in-app training.")
     padded_tgt = nn.utils.rnn.pad_sequence(tgt_list, batch_first=True, padding_value=PAD_TOKEN)
     return padded_src, padded_tgt
 def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app, progress=gr.Progress(track_tqdm=True)):
     global swck_model_global, optimizer_global, word_to_idx_global, model_load_status_global
     print("\n--- App: Starting Short Training Session ---")
     progress(0, desc="Preparing training data...")
     training_corpus = SEED_PHRASE_APP + " " + EXTENDED_TEXT_FOR_TRAINING_APP
     app_dataset = AppSWCKDataset(training_corpus, word_to_idx_global, SEQ_LEN_APP, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN)
     if not app_dataset.samples:
         return "App Training Error: No samples created from the corpus."
+    app_dataloader = DataLoader(app_dataset, batch_size=int(batch_size_app), shuffle=True, collate_fn=app_swck_collate_fn)
     if optimizer_global is None:
         optimizer_global = optim.AdamW(swck_model_global.parameters(), lr=learning_rate_app)
+    else:
         for param_group in optimizer_global.param_groups:
             param_group['lr'] = learning_rate_app
     criterion_main_app = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
+    training_log_output = f"Starting training for {num_epochs_app} epochs...\n"
+    swck_model_global.train()
+    for epoch in progress.tqdm(range(int(num_epochs_app)), desc="Training Epochs"):
+        swck_model_global.set_wiring_phase(epoch < WIRING_PHASE_EPOCHS_APP)
         epoch_loss = 0.0
+        # Enable debug for first batch of first epoch
+        first_batch_debug = (epoch == 0)
         for batch_idx, (src_batch, tgt_batch) in enumerate(app_dataloader):
+            if first_batch_debug and batch_idx == 0:
+                swck_model_global.debug_prints_enabled = True
+                for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = True
+            elif not (first_batch_debug and batch_idx == 0) : # Disable after first batch for speed
+                swck_model_global.debug_prints_enabled = False
+                for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = False
             src_batch, tgt_batch = src_batch.to(device_global), tgt_batch.to(device_global)
+            decoder_input_tokens = src_batch[:, :-1] # Remove EOS from input
+            gold_standard_for_loss = tgt_batch[:, 1:] # Remove SOS from target
             src_key_padding_mask = (decoder_input_tokens == PAD_TOKEN)
             optimizer_global.zero_grad()
             logits, entropy_report = swck_model_global(decoder_input_tokens, src_key_padding_mask=src_key_padding_mask)
+            # Ensure logits and gold_standard_for_loss are aligned for CrossEntropyLoss
+            # Logits: (B, S_len_in, VocabSize)
+            # Gold: (B, S_len_target)
+            # If S_len_in == S_len_target, it's fine.
+            if logits.size(1) != gold_standard_for_loss.size(1):
+                # This can happen if seq len handling differs slightly, adjust shorter one
+                min_len = min(logits.size(1), gold_standard_for_loss.size(1))
+                logits_for_loss = logits[:, :min_len, :].contiguous()
+                gold_for_loss_aligned = gold_standard_for_loss[:, :min_len].contiguous()
+            else:
+                logits_for_loss = logits
+                gold_for_loss_aligned = gold_standard_for_loss
+            main_loss = criterion_main_app(logits_for_loss.view(-1, logits_for_loss.size(-1)), gold_for_loss_aligned.view(-1))
             block_entropy_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["block_output_entropies"]:
+                for i, block_entropy_tensor in enumerate(entropy_report["block_output_entropies"]):
+                    target_entropy_val = swck_model_global.seed_parser.get_block_config(i)["target_entropy"]
+                    block_entropy_loss += F.mse_loss(block_entropy_tensor, torch.tensor(target_entropy_val, device=device_global))
+                if entropy_report["block_output_entropies"]: # Avoid division by zero
                     block_entropy_loss = block_entropy_loss / len(entropy_report["block_output_entropies"])
             overall_entropy_loss = entropy_report["overall_output_entropy"]
             gate_sparsity_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["block_gate_weights"]:
+                for gates_softmax_tensor in entropy_report["block_gate_weights"]:
+                    gate_sparsity_loss += torch.mean(gates_softmax_tensor * torch.log(gates_softmax_tensor + 1e-9))
+                if entropy_report["block_gate_weights"]: # Avoid division by zero
                      gate_sparsity_loss = - (gate_sparsity_loss / len(entropy_report["block_gate_weights"]))
             combined_loss = (MAIN_LOSS_WEIGHT_APP * main_loss +
                              BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP * block_entropy_loss +
                              OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP * overall_entropy_loss +
             optimizer_global.step()
             epoch_loss += combined_loss.item()
+            log_line = f"  Epoch {epoch+1}, Batch {batch_idx+1}/{len(app_dataloader)}, Loss: {combined_loss.item():.4f}"
+            if batch_idx % max(1, len(app_dataloader)//2) == 0 or batch_idx == len(app_dataloader)-1 : # Log less frequently to UI
+                print(log_line)
+                training_log_output += log_line + "\n"
+        # Disable debug prints at the end of every epoch (they are only enabled for the first batch of the first epoch)
+        swck_model_global.debug_prints_enabled = False
+        for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = False
+        avg_epoch_loss = epoch_loss / len(app_dataloader) if len(app_dataloader) > 0 else epoch_loss
         epoch_summary = f"Epoch {epoch+1}/{num_epochs_app} - Avg Loss: {avg_epoch_loss:.4f}\n"
         print(epoch_summary)
         training_log_output += epoch_summary
+    # Ensure debug prints are off after training session
+    swck_model_global.debug_prints_enabled = False
+    for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = False
+    swck_model_global.eval()
     try:
         torch.save({
             'model_state_dict': swck_model_global.state_dict(),
+            'optimizer_state_dict': optimizer_global.state_dict(),
             'word_to_idx': word_to_idx_global,
             'idx_to_word': idx_to_word_global,
+            'model_hyperparameters': {
                 'vocab_size': VOCAB_SIZE_APP, 'd_model': D_MODEL_APP, 'n_heads': N_HEADS_APP,
                 'd_ff': D_FF_APP, 'num_adaptive_blocks': NUM_ADAPTIVE_BLOCKS_APP, 'dropout': DROPOUT_APP
             }
         }, CHECKPOINT_FILENAME)
+        save_msg = f"Training finished. Model checkpoint saved to {CHECKPOINT_FILENAME} in Space's ephemeral storage."
         print(save_msg)
         training_log_output += save_msg
         model_load_status_global = f"Model trained in-app & saved. Last status: {save_msg}"
     return training_log_output
 def generate_text_for_app(prompt_str, max_len_gen, temperature_gen):
+    global model_load_status_global
     if swck_model_global is None or word_to_idx_global is None or idx_to_word_global is None:
         return "Model not loaded. Please check server logs or try training.", "Model not available."
     swck_model_global.eval()
     swck_model_global.set_wiring_phase(False)
+    # Temporarily enable debug for generation if needed, then disable
+    # swck_model_global.debug_prints_enabled = True # For generation debug
+    # for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = True
     print(f"App: Generating for prompt: '{prompt_str}', max_len: {max_len_gen}, temp: {temperature_gen}")
     debug_info_lines = [f"Prompt tokens: {generated_ids_app}"]
     with torch.no_grad():
+        for i in range(int(max_len_gen)): # Ensure max_len_gen is int
+            # Context windowing for input_tensor
+            # Take up to SEQ_LEN_APP tokens from the end of generated_ids_app
+            context_start_idx = max(0, len(generated_ids_app) - SEQ_LEN_APP)
+            current_context_ids = generated_ids_app[context_start_idx:]
             input_tensor = torch.tensor([current_context_ids], dtype=torch.long).to(device_global)
             padding_mask = (input_tensor == PAD_TOKEN)
                 next_token_id = torch.argmax(next_token_logits).item()
             else:
                 probs = F.softmax(next_token_logits / temperature_gen, dim=-1)
+                if probs.isnan().any() or probs.isinf().any() or torch.sum(probs).item() < 1e-9 :
                     print(f"Warning: Invalid probabilities at step {i}. Using uniform.")
+                    probs = torch.ones_like(next_token_logits) / next_token_logits.size(-1)
                 next_token_id = torch.multinomial(probs, 1).item()
             if next_token_id == EOS_TOKEN:
             if i < 10 :
                 current_word = idx_to_word_global.get(next_token_id, UNK_TOKEN_STR)
                 overall_ent = entropy_report_infer['overall_output_entropy'].item()
+                if entropy_report_infer['block_output_entropies'] and len(entropy_report_infer['block_output_entropies']) > 0:
                     b0_ent = entropy_report_infer['block_output_entropies'][0].item()
+                    if entropy_report_infer['block_gate_weights'] and len(entropy_report_infer['block_gate_weights']) > 0:
+                         b0_gates_str = ", ".join([f"{g.item():.2f}" for g in entropy_report_infer['block_gate_weights'][0]])
+                         debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, B0Ent={b0_ent:.3f}, B0Gates=[{b0_gates_str}]")
+                    else:
+                         debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, B0Ent={b0_ent:.3f}, No B0 gates.")
                 else:
+                    debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, No block entropy/gate report.")
     generated_text_list = [idx_to_word_global.get(idx, UNK_TOKEN_STR) for idx in generated_ids_app[1:]]
     final_text = re.sub(r'\s+', ' ', final_text).strip()
     debug_output_str = "\n".join(debug_info_lines)
+    # Disable debug prints after generation
+    # swck_model_global.debug_prints_enabled = False
+    # for blk in swck_model_global.adaptive_blocks: blk.debug_prints_enabled = False
     return final_text, debug_output_str
 # --- Gradio Interface ---
+initial_load_status = initialize_or_load_model_app() # Load model on app startup
 with gr.Blocks(title="SWCK Conceptual Demo") as demo:
     gr.Markdown(f"""
             with gr.Row():
                 train_epochs_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Number of Training Epochs")
                 train_batch_size_slider = gr.Slider(minimum=1, maximum=8, value=2, step=1, label="Training Batch Size")
+                # REMOVED format="%.1e"
+                train_lr_slider = gr.Slider(minimum=1e-5, maximum=1e-3, value=5e-4, step=1e-5, label="Learning Rate")
             start_training_button = gr.Button("Start Short Training Session")
+            training_status_output = gr.Textbox(label="Training Log / Status:", lines=10, interactive=False,show_label=True )
+    model_status_md = gr.Markdown(value=f"**Model Status:** {model_load_status_global}")
+    def update_status_text(): # Helper to refresh status after training
+        return f"**Model Status:** {model_load_status_global}"
     generate_button.click(
         fn=generate_text_for_app,
         inputs=[prompt_input, max_len_slider, temp_slider],
         fn=run_short_training_session,
         inputs=[train_epochs_slider, train_batch_size_slider, train_lr_slider],
         outputs=[training_status_output]
+    ).then(fn=update_status_text, inputs=None, outputs=model_status_md)
 if __name__ == "__main__":
+    # The Gradio app launch options (like debug=True) are for local execution.
+    # On Hugging Face Spaces, these are typically controlled by the environment.
+    # The `print()` statements will go to the Space's console logs.
+    demo.launch(debug=True)