Commit b8efd7e · overhaul by Gemini
Parent(s): d82b2bb

app.py CHANGED
@@ -68,7 +68,7 @@ BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP = 0.02
 OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP = 0.01
 GATE_SPARSITY_LOSS_WEIGHT_APP = 0.001
 GATE_ALIGNMENT_LOSS_WEIGHT_APP = 0.005 # For ObserverTime Sync during wiring phase
-WIRING_PHASE_EPOCHS_APP = 5
+WIRING_PHASE_EPOCHS_APP = 5

 def set_model_debug_prints(model, seed_parser_debug, block_debug, model_debug):
     if model:
@@ -228,7 +228,6 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
         swck_model_global.set_wiring_phase(epoch < WIRING_PHASE_EPOCHS_APP)
         epoch_loss = 0.0; print(f"\n>>> EPOCH {epoch+1} <<<")
         for batch_idx, (src_batch, tgt_batch) in enumerate(app_dataloader):
-            # print(f"\n--- Training Batch {batch_idx+1}/{len(app_dataloader)} (Epoch {epoch+1}) ---") # Verbose
             src_batch, tgt_batch = src_batch.to(device_global), tgt_batch.to(device_global)
             src_key_padding_mask = (src_batch == PAD_TOKEN)
             optimizer_global.zero_grad()
@@ -248,11 +247,11 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
             gate_sparsity_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["current_block_gate_softmaxes"]:
                 num_valid_gates_sparsity = 0
-                for gates_tensor in entropy_report["current_block_gate_softmaxes"]:
+                for gates_tensor in entropy_report["current_block_gate_softmaxes"]:
                     if torch.is_tensor(gates_tensor) and gates_tensor.numel() > 0:
-                        gate_sparsity_loss += torch.mean(gates_tensor * torch.log(gates_tensor + 1e-9))
+                        gate_sparsity_loss += torch.mean(gates_tensor * torch.log(gates_tensor + 1e-9))
                         num_valid_gates_sparsity +=1
-                if num_valid_gates_sparsity > 0 : gate_sparsity_loss = -(gate_sparsity_loss / num_valid_gates_sparsity)
+                if num_valid_gates_sparsity > 0 : gate_sparsity_loss = -(gate_sparsity_loss / num_valid_gates_sparsity)

             gate_alignment_loss = torch.tensor(0.0, device=device_global)
             if entropy_report["current_block_gate_softmaxes"] and entropy_report["initial_block_gate_targets"]:
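The hunk above accumulates mean(p · log p) over each block's gate softmax and then flips the sign, so the term it adds to the objective is an entropy-style penalty: evenly spread gates cost more than peaked ones. A minimal standalone sketch of that computation, assuming each entry of the report list is a 1-D tensor of gate probabilities; only the 1e-9 epsilon and the averaging follow the diff, the function name and example tensors are illustrative:

```python
import torch

def gate_sparsity_loss(gate_softmaxes, eps: float = 1e-9) -> torch.Tensor:
    """Entropy-style sparsity penalty over per-block gate softmaxes.

    The loop accumulates mean(p * log p) (a negative quantity) per block;
    the final sign flip turns it into a positive entropy-like value, so
    minimizing the result pushes each block's gates toward a peaked
    (sparse) distribution over its sub-modules.
    """
    loss = torch.tensor(0.0)
    num_valid = 0
    for gates in gate_softmaxes:
        if torch.is_tensor(gates) and gates.numel() > 0:
            loss = loss + torch.mean(gates * torch.log(gates + eps))
            num_valid += 1
    return -(loss / num_valid) if num_valid > 0 else loss

# A peaked gate distribution incurs a smaller penalty than a uniform one.
peaked  = torch.tensor([0.97, 0.01, 0.01, 0.01])
uniform = torch.tensor([0.25, 0.25, 0.25, 0.25])
assert gate_sparsity_loss([peaked]) < gate_sparsity_loss([uniform])
```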
@@ -265,7 +264,8 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
                         num_valid_align_gates +=1
                 if num_valid_align_gates > 0: gate_alignment_loss /= num_valid_align_gates

-
+            # CORRECTED VARIABLE NAME HERE
+            current_gate_alignment_weight = GATE_ALIGNMENT_LOSS_WEIGHT_APP if epoch < WIRING_PHASE_EPOCHS_APP else GATE_ALIGNMENT_LOSS_WEIGHT_APP * 0.1

             combined_loss = (MAIN_LOSS_WEIGHT_APP * main_loss + BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP * block_entropy_loss +
                              OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP * overall_entropy_loss + GATE_SPARSITY_LOSS_WEIGHT_APP * gate_sparsity_loss +
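The corrected `current_gate_alignment_weight` keeps the gate-alignment term at full strength only during the wiring phase and drops it to 10% afterwards; `combined_loss` then sums the weighted objectives (the alignment term itself is outside the visible part of this hunk). A hedged sketch of that schedule and weighting, using the constants shown in this diff; `MAIN_LOSS_WEIGHT_APP = 1.0` is an assumption, since its value does not appear here:

```python
# Weights as they appear in app.py (MAIN_LOSS_WEIGHT_APP assumed to be 1.0).
MAIN_LOSS_WEIGHT_APP = 1.0
BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP = 0.02
OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP = 0.01
GATE_SPARSITY_LOSS_WEIGHT_APP = 0.001
GATE_ALIGNMENT_LOSS_WEIGHT_APP = 0.005
WIRING_PHASE_EPOCHS_APP = 5

def alignment_weight(epoch: int) -> float:
    """Full alignment weight during the wiring phase, 10% of it afterwards."""
    return (GATE_ALIGNMENT_LOSS_WEIGHT_APP
            if epoch < WIRING_PHASE_EPOCHS_APP
            else GATE_ALIGNMENT_LOSS_WEIGHT_APP * 0.1)

def combine_losses(main_loss, block_entropy_loss, overall_entropy_loss,
                   gate_sparsity_loss, gate_alignment_loss, epoch: int):
    """Weighted sum of the individual objectives, mirroring combined_loss."""
    return (MAIN_LOSS_WEIGHT_APP * main_loss
            + BLOCK_TARGET_ENTROPY_LOSS_WEIGHT_APP * block_entropy_loss
            + OVERALL_OUTPUT_ENTROPY_REG_WEIGHT_APP * overall_entropy_loss
            + GATE_SPARSITY_LOSS_WEIGHT_APP * gate_sparsity_loss
            + alignment_weight(epoch) * gate_alignment_loss)

print(alignment_weight(0))  # 0.005 during the wiring phase
print(alignment_weight(7))  # 10% of the full weight afterwards
```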
@@ -285,7 +285,7 @@ def run_short_training_session(num_epochs_app, batch_size_app, learning_rate_app
         'num_adaptive_blocks': len(swck_model_global.adaptive_blocks), 'dropout': current_dropout,
         'seed_phrase': seed_phrase_ui, 'seed_number_str': seed_number_ui,
         'num_sub_modules_per_block': swck_model_global.adaptive_blocks[0].num_sub_modules if swck_model_global.adaptive_blocks else current_num_sub_modules_pb,
-        'seq_len_trained_on': SEQ_LEN_APP
+        'seq_len_trained_on': SEQ_LEN_APP
     }
     torch.save({'model_state_dict': swck_model_global.state_dict(), 'optimizer_state_dict': optimizer_global.state_dict(),
                 'word_to_idx': word_to_idx_global, 'idx_to_word': idx_to_word_global, 'model_hyperparameters': hyperparams
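This hunk (and the matching one in `prepare_model_for_download` further down) persists the model together with its vocabulary maps and hyperparameters, now including `seq_len_trained_on`. A minimal sketch of writing and re-reading a checkpoint with this layout; the dictionary keys follow the diff, while the helper names and reload logic are illustrative:

```python
import torch

def save_checkpoint(path, model, optimizer, word_to_idx, idx_to_word, hyperparams):
    """Bundle weights, optimizer state, vocab and hyperparameters in one file."""
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'word_to_idx': word_to_idx,
        'idx_to_word': idx_to_word,
        'model_hyperparameters': hyperparams,  # e.g. includes 'seq_len_trained_on'
    }, path)

def load_checkpoint(path, model, optimizer=None, device='cpu'):
    """Restore weights (and optionally optimizer state); return vocab and hyperparameters."""
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model_state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return ckpt['word_to_idx'], ckpt['idx_to_word'], ckpt['model_hyperparameters']
```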
@@ -312,9 +312,7 @@ def generate_text_for_app(current_interaction_text, max_len_gen, temperature_gen
     newly_generated_tokens_list = []
     with torch.no_grad():
         for i in range(int(max_len_gen)):
-            # print(f"\n--- Gen Step {i+1}/{max_len_gen} ---") # Verbose
             context_for_model = generated_ids_app[-SEQ_LEN_APP:]
-            # print(f" Context for model (len {len(context_for_model)}): {[idx_to_word_global.get(t, UNK_TOKEN_STR) for t in context_for_model[-20:]]}...") # Verbose
             if not context_for_model: print("Warning: Empty context_for_model!"); break
             input_tensor = torch.tensor([context_for_model], dtype=torch.long).to(device_global)
             padding_mask = (input_tensor == PAD_TOKEN)
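During generation the app trims the running id list to the last SEQ_LEN_APP tokens before each forward pass, so the model never sees a context longer than the window it was trained on. A hedged sketch of such a sliding-window sampling loop with temperature; the model call signature and output shape are assumptions (the real SWCK model also returns an entropy report, which this sketch ignores):

```python
import torch
import torch.nn.functional as F

def generate(model, prompt_ids, max_len, seq_len, pad_token, eos_token,
             temperature=1.0, device='cpu'):
    """Autoregressive sampling over a sliding context window of `seq_len` tokens."""
    generated = list(prompt_ids)
    model.eval()
    with torch.no_grad():
        for _ in range(int(max_len)):
            context = generated[-seq_len:]            # keep only the trained-on window
            if not context:
                break
            inp = torch.tensor([context], dtype=torch.long, device=device)
            padding_mask = (inp == pad_token)         # True where the input is padding
            # Assumed interface: logits of shape (batch, seq_len, vocab_size).
            logits = model(inp, src_key_padding_mask=padding_mask)
            next_logits = logits[0, -1, :] / max(temperature, 1e-6)
            probs = F.softmax(next_logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1).item()
            generated.append(next_id)
            if next_id == eos_token:
                break
    return generated
```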
@@ -344,13 +342,12 @@ def generate_text_for_app(current_interaction_text, max_len_gen, temperature_gen
             generated_ids_app.append(next_token_id)
             current_word = idx_to_word_global.get(next_token_id, UNK_TOKEN_STR)
             newly_generated_tokens_list.append(current_word)
-            # print(f" ==> Generated token {i+1}: '{current_word}' (ID: {next_token_id})") # Verbose
             if i < 10:
                 overall_ent = entropy_report_infer['overall_output_entropy'].item() if torch.is_tensor(entropy_report_infer['overall_output_entropy']) else 0.0
                 b0_ent_str, b0_gates_str = "N/A", "N/A"
                 if entropy_report_infer['block_output_entropies'] and len(entropy_report_infer['block_output_entropies']) > 0 and torch.is_tensor(entropy_report_infer['block_output_entropies'][0]):
                     b0_ent_str = f"{entropy_report_infer['block_output_entropies'][0].item():.3f}"
-                if entropy_report_infer['current_block_gate_softmaxes'] and len(entropy_report_infer['current_block_gate_softmaxes']) > 0 and torch.is_tensor(entropy_report_infer['current_block_gate_softmaxes'][0]):
+                if entropy_report_infer['current_block_gate_softmaxes'] and len(entropy_report_infer['current_block_gate_softmaxes']) > 0 and torch.is_tensor(entropy_report_infer['current_block_gate_softmaxes'][0]):
                     b0_gates_str = ", ".join([f"{g.item():.2f}" for g in entropy_report_infer['current_block_gate_softmaxes'][0]])
                 debug_info_lines.append(f"Gen {i+1}: '{current_word}', OvrlEnt={overall_ent:.3f}, B0Ent={b0_ent_str}, B0Gates=[{b0_gates_str}]")

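For the first ten generated tokens the app condenses the per-step entropy report into a single debug line. A small sketch of that formatting with a hand-built report dictionary; the keys match the diff, the numeric values are made up for illustration:

```python
import torch

def format_debug_line(step, word, report):
    """One-line summary of overall entropy, block-0 entropy and block-0 gates."""
    overall = report['overall_output_entropy'].item() if torch.is_tensor(report['overall_output_entropy']) else 0.0
    b0_ent, b0_gates = "N/A", "N/A"
    ents = report['block_output_entropies']
    if ents and torch.is_tensor(ents[0]):
        b0_ent = f"{ents[0].item():.3f}"
    gates = report['current_block_gate_softmaxes']
    if gates and torch.is_tensor(gates[0]):
        b0_gates = ", ".join(f"{g.item():.2f}" for g in gates[0])
    return f"Gen {step}: '{word}', OvrlEnt={overall:.3f}, B0Ent={b0_ent}, B0Gates=[{b0_gates}]"

# Illustrative values only:
report = {
    'overall_output_entropy': torch.tensor(2.314),
    'block_output_entropies': [torch.tensor(1.871)],
    'current_block_gate_softmaxes': [torch.tensor([0.70, 0.20, 0.10])],
}
print(format_debug_line(1, "kernel", report))
```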
@@ -382,7 +379,7 @@ def prepare_model_for_download():
         'num_adaptive_blocks': len(swck_model_global.adaptive_blocks), 'dropout': current_dropout,
         'seed_phrase': swck_model_global.seed_parser.seed_phrase, 'seed_number_str': swck_model_global.seed_parser.seed_number_str,
         'num_sub_modules_per_block': swck_model_global.adaptive_blocks[0].num_sub_modules if swck_model_global.adaptive_blocks else current_num_sub_modules_pb,
-        'seq_len_trained_on': SEQ_LEN_APP
+        'seq_len_trained_on': SEQ_LEN_APP
     }
     torch.save({'model_state_dict': swck_model_global.state_dict(), 'optimizer_state_dict': optimizer_global.state_dict(),
                 'word_to_idx': word_to_idx_global, 'idx_to_word': idx_to_word_global, 'model_hyperparameters': hyperparams