import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import hashlib

# --- Future Entropy/State Predictor (FEP V6) --- (No changes from V6.1/V6.2)
class FutureEntropyStatePredictor(nn.Module):
    def __init__(self, ssr_dim, input_scalar_dim=2, hidden_dim=32, name=""):
        super().__init__()
        self.ssr_dim = ssr_dim; self.name = name; self.debug_prints_enabled = False
        fep_input_dim = ssr_dim + input_scalar_dim
        self.fc_ssr1 = nn.Linear(fep_input_dim, hidden_dim * 2); self.fc_ssr2 = nn.Linear(hidden_dim * 2, hidden_dim); self.fc_ssr_out = nn.Linear(hidden_dim, ssr_dim)
        self.fc_ent1 = nn.Linear(fep_input_dim, hidden_dim); self.fc_ent_out = nn.Linear(hidden_dim, 1)

    def forward(self, current_ssr_detached, current_block_entropy_detached, current_static_target_diff_detached):
        if current_ssr_detached.dim() == 1:
            current_ssr_expanded = current_ssr_detached.unsqueeze(0)
        else:
            current_ssr_expanded = current_ssr_detached
        current_block_entropy_exp = current_block_entropy_detached.view(current_ssr_expanded.size(0), -1)
        current_static_target_diff_exp = current_static_target_diff_detached.view(current_ssr_expanded.size(0), -1)
        fep_input = torch.cat((current_ssr_expanded, current_block_entropy_exp, current_static_target_diff_exp), dim=1)
        h_ssr = F.relu(self.fc_ssr1(fep_input)); h_ssr = F.relu(self.fc_ssr2(h_ssr)); delta_ssr_proposal = torch.tanh(self.fc_ssr_out(h_ssr))
        h_ent = F.relu(self.fc_ent1(fep_input)); entropy_adj_factor_raw = self.fc_ent_out(h_ent)
        if current_ssr_detached.dim() == 1:
            delta_ssr_proposal = delta_ssr_proposal.squeeze(0); entropy_adj_factor_raw = entropy_adj_factor_raw.squeeze(0)
        return delta_ssr_proposal, entropy_adj_factor_raw.squeeze(-1)


# --- Entropy Estimator --- (No change from V6.1/V6.2)
class EntropyEstimator(nn.Module):
    def __init__(self, input_dim, hidden_dim=32, name=""):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim); self.fc2 = nn.Linear(hidden_dim, 1)
        self.name = name; self.debug_prints_enabled = False

    def forward(self, x, active_mask=None):
        if x.numel() == 0: return torch.tensor(0.0, device=x.device)
        if active_mask is not None:
            if active_mask.dtype != torch.bool: active_mask = active_mask.bool()
            if x.dim() == 3 and active_mask.dim() == 2 and x.shape[0] == active_mask.shape[0] and x.shape[1] == active_mask.shape[1]:
                x_masked = x[active_mask]
            elif x.dim() == 2 and active_mask.dim() == 1 and x.shape[0] == active_mask.shape[0]:
                x_masked = x[active_mask]
            else:
                x_masked = x.reshape(-1, x.size(-1))
        else:
            x_masked = x.reshape(-1, x.size(-1))
        if x_masked.numel() == 0: return torch.tensor(0.0, device=x.device)
        h = F.relu(self.fc1(x_masked)); return torch.sigmoid(self.fc2(h)).mean()
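
# Note: EntropyEstimator is a small learned proxy for activation "entropy", not a
# Shannon-entropy computation; the sigmoid + mean bounds its output to a scalar in (0, 1).
# Illustrative call (the shapes below are placeholder assumptions, not project config values):
#   est = EntropyEstimator(input_dim=64, name="demo")
#   e = est(torch.randn(4, 10, 64), active_mask=torch.ones(4, 10, dtype=torch.bool))
#   # e is a 0-dim tensor strictly between 0 and 1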

# --- Seed Parser (V6) --- (No changes from V6.1/V6.2)
class SeedParser:
    def __init__(self, seed_phrase, seed_number_str, d_model, ssr_dim, num_adaptive_blocks, num_sub_modules_per_block):
        self.seed_phrase = seed_phrase; self.seed_number_str = seed_number_str; self.d_model = d_model
        self.ssr_dim = ssr_dim
        self.num_adaptive_blocks = num_adaptive_blocks; self.num_sub_modules_per_block = num_sub_modules_per_block
        self.debug_prints_enabled = True
        if self.debug_prints_enabled:
            print(f"--- SeedParser Initialization (V6) ---\n  Seed Phrase (start): '{self.seed_phrase[:50]}...'\n  Seed Number: {self.seed_number_str}")
        phrase_hash = hashlib.sha256(seed_phrase.encode()).hexdigest(); self.phrase_base_val = int(phrase_hash[:16], 16)
        if self.debug_prints_enabled:
            print(f"  Phrase Base Value (from hash): {self.phrase_base_val}")
        self.num_sequence = [int(d) for d in seed_number_str if d.isdigit()]
        if not self.num_sequence:
            self.num_sequence = [sum(bytearray(seed_number_str.encode())) % 10]
        if self.debug_prints_enabled:
            print(f"  Numerical Sequence (from seed number): {self.num_sequence}")
        self.init_map = self._generate_init_map()
        if self.debug_prints_enabled:
            print(f"  SeedParser: Generated InitMap:")
            for i, block_config in enumerate(self.init_map["block_configs"]):
                raw_gate_scores_str = [f'{g:.3f}' for g in block_config['raw_gate_scores_for_param_init']]
                initial_ssr_str = [f'{s:.3f}' for s in block_config['initial_ssr_values'][:min(3, self.ssr_dim)]] + (["..."] if self.ssr_dim > 3 else [])
                print(f"    Block {i}: StaticTgtEnt: {block_config['static_target_entropy']:.4f}, RawGateScores: {raw_gate_scores_str}, InitialSSR (sample): {initial_ssr_str}")
        if self.debug_prints_enabled:
            print(f"--- SeedParser Initialized ---")

    def _get_deterministic_float_list(self, key_name_prefix, num_values, min_val=-1.0, max_val=1.0, sequence_idx_offset=0):
        values = []
        for i in range(num_values):
            values.append(self._get_deterministic_float(f"{key_name_prefix}_{i}", min_val, max_val, sequence_idx_offset + i))
        return values

    def _get_deterministic_float(self, key_name, min_val=0.0, max_val=1.0, sequence_idx_offset=0):
        key_specific_hash = int(hashlib.sha256(key_name.encode() + self.seed_phrase.encode()).hexdigest()[:8], 16); num_seq_val = 0
        if self.num_sequence:
            for i_digit, digit in enumerate(self.num_sequence):
                num_seq_val = (num_seq_val * 10 + digit + i_digit) % 1000003
        combined_seed_val = self.phrase_base_val + key_specific_hash + num_seq_val + sequence_idx_offset
        norm_float = (math.sin(float(combined_seed_val) * 0.12345) + 1.0) / 2.0
        return min_val + norm_float * (max_val - min_val)

    def _generate_init_map(self):
        init_map = {"block_configs": []}
        for i in range(self.num_adaptive_blocks):
            gate_raw_scores = self._get_deterministic_float_list(f"block_{i}_gate_raw_score", self.num_sub_modules_per_block, -1.5, 1.5, sequence_idx_offset=i*30)
            initial_ssr_values = self._get_deterministic_float_list(f"block_{i}_initial_ssr", self.ssr_dim, -0.1, 0.1, sequence_idx_offset=i*30 + self.num_sub_modules_per_block)
            static_target_entropy = self._get_deterministic_float(f"block_{i}_static_target_entropy", 0.15, 0.45, sequence_idx_offset=i*30 + self.num_sub_modules_per_block + self.ssr_dim)
            init_map["block_configs"].append({"raw_gate_scores_for_param_init": gate_raw_scores, "initial_ssr_values": initial_ssr_values, "static_target_entropy": static_target_entropy})
        return init_map

    def get_block_config(self, block_idx):
        if 0 <= block_idx < len(self.init_map["block_configs"]):
            return self.init_map["block_configs"][block_idx]
        return None
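
# Illustrative determinism check (the seed strings and dims below are placeholders,
# not the project's actual configuration): two SeedParser instances built from the
# same seed phrase and seed number produce identical init maps, because every value
# is derived from SHA-256 digests plus a fixed sine transform, with no RNG involved.
#   p1 = SeedParser("demo seed phrase", "12345", d_model=64, ssr_dim=16,
#                   num_adaptive_blocks=2, num_sub_modules_per_block=3)
#   p2 = SeedParser("demo seed phrase", "12345", d_model=64, ssr_dim=16,
#                   num_adaptive_blocks=2, num_sub_modules_per_block=3)
#   assert p1.init_map == p2.init_map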

# --- Adaptive Block (V6.3) ---
class AdaptiveBlock(nn.Module):
    MAX_DYNAMIC_ENTROPY_ADJUSTMENT_RANGE = 0.05
    INITIAL_HEURISTIC_STRENGTH = 0.025
    FINAL_HEURISTIC_STRENGTH = 0.005
    # V6.3: Increased initial SSR proposal scale
    INITIAL_SSR_PROPOSAL_SCALE = 0.25  # Was 0.2
    FINAL_SSR_PROPOSAL_SCALE = 0.05

    def __init__(self, d_model, ssr_dim, n_heads, d_ff, dropout, seed_parser_config_for_block, block_idx, num_sub_modules=3):
        super().__init__()
        self.d_model = d_model; self.ssr_dim = ssr_dim; self.block_idx = block_idx; self.num_sub_modules = num_sub_modules
        self.config_from_seed = seed_parser_config_for_block; self.debug_prints_enabled = True
        initial_ssr_vals = self.config_from_seed.get("initial_ssr_values", [0.0] * self.ssr_dim)
        if len(initial_ssr_vals) != self.ssr_dim: initial_ssr_vals = [0.0] * self.ssr_dim
        self.ssr = nn.Parameter(torch.tensor(initial_ssr_vals, dtype=torch.float32))
        self.register_buffer('initial_ssr_buffer', torch.tensor(initial_ssr_vals, dtype=torch.float32))
        raw_gate_param_inits_list = self.config_from_seed.get("raw_gate_scores_for_param_init", [0.0] * self.num_sub_modules)
        if len(raw_gate_param_inits_list) != self.num_sub_modules: raw_gate_param_inits_list = [0.0] * self.num_sub_modules
        self.gates_params = nn.Parameter(torch.tensor(raw_gate_param_inits_list, dtype=torch.float32))
        self.register_buffer('initial_raw_gate_scores_buffer', torch.tensor(raw_gate_param_inits_list, dtype=torch.float32))
        if self.debug_prints_enabled:
            raw_gate_scores_str = [f'{g:.3f}' for g in raw_gate_param_inits_list]
            ssr_sample_str = [f'{s:.3f}' for s in initial_ssr_vals[:min(3, self.ssr_dim)]] + (["..."] if self.ssr_dim > 3 else [])
            print(f"  Initializing AdaptiveBlock {self.block_idx} (V6.3): StaticSeedTgtEnt={self.config_from_seed['static_target_entropy']:.3f}, InitialRawGateScores={raw_gate_scores_str}, InitialSSR (sample): {ssr_sample_str}")
        # Sub-modules operate on the SSR-conditioned width (d_model + ssr_dim).
        self.d_model_effective = self.d_model + self.ssr_dim
        self.sub_module_0 = nn.MultiheadAttention(self.d_model_effective, n_heads, dropout=dropout, batch_first=True)
        self.sub_module_1 = nn.Sequential(nn.Linear(self.d_model_effective, d_ff), nn.GELU(), nn.Dropout(dropout), nn.Linear(d_ff, self.d_model_effective))
        self.sub_module_2 = nn.Sequential(nn.Linear(self.d_model_effective, self.d_model_effective), nn.GELU(), nn.Dropout(dropout))
        self.sub_modules = nn.ModuleList([self.sub_module_0, self.sub_module_1, self.sub_module_2])
        if self.num_sub_modules > len(self.sub_modules):
            self.num_sub_modules = len(self.sub_modules)
        elif self.num_sub_modules <= 0:
            raise ValueError(f"AdaptiveBlock {self.block_idx} must have at least one sub_module.")
        self.norm_input_x = nn.LayerNorm(self.d_model)
        self.norm_ssr_input = nn.LayerNorm(self.ssr_dim)
        self.norm_after_gates = nn.LayerNorm(self.d_model_effective)
        # Updates the block's SSR from (previous SSR, aggregated block output, FEP proposal).
        self.ssr_update_net = nn.Sequential(
            nn.Linear(self.ssr_dim + self.d_model_effective + self.ssr_dim, self.ssr_dim * 2), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(self.ssr_dim * 2, self.ssr_dim)
        )
        self.norm_ssr_output = nn.LayerNorm(self.ssr_dim)
        self.dropout_layer = nn.Dropout(dropout)
        self.output_entropy_estimator = EntropyEstimator(self.d_model_effective, name=f"Block{block_idx}_ProcessedOutEntropy")
        self.x_output_entropy_estimator = EntropyEstimator(self.d_model, name=f"Block{block_idx}_X_OutEntropy")  # V6.3
        self.fep = FutureEntropyStatePredictor(ssr_dim=self.ssr_dim, input_scalar_dim=2, name=f"Block{block_idx}_FEP")
        self.wiring_phase_active = False
        self.static_seed_target_entropy = self.config_from_seed.get("static_target_entropy", 0.25)
        self.current_epoch_in_wiring = 0
        self.total_wiring_epochs = 1

    def set_wiring_phase(self, active, current_epoch_num=0, total_wiring_epochs=1):
        self.wiring_phase_active = active
        if active:
            self.current_epoch_in_wiring = current_epoch_num
            self.total_wiring_epochs = total_wiring_epochs if total_wiring_epochs > 0 else 1

    def _get_current_decaying_factor(self, initial_val, final_val):
        if not self.wiring_phase_active or self.total_wiring_epochs <= 1: return initial_val
        progress = min(self.current_epoch_in_wiring / max(1, (self.total_wiring_epochs - 1)), 1.0)
        return initial_val - progress * (initial_val - final_val)

    def _get_current_heuristic_strength(self):
        return self._get_current_decaying_factor(self.INITIAL_HEURISTIC_STRENGTH, self.FINAL_HEURISTIC_STRENGTH)

    def _get_current_ssr_proposal_scale(self):  # V6.1
        return self._get_current_decaying_factor(self.INITIAL_SSR_PROPOSAL_SCALE, self.FINAL_SSR_PROPOSAL_SCALE)
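
    # Worked example of the decay schedule above (illustrative numbers, assuming a
    # 5-epoch wiring phase): progress = epoch / (total_wiring_epochs - 1), so with
    # INITIAL_HEURISTIC_STRENGTH = 0.025 and FINAL_HEURISTIC_STRENGTH = 0.005,
    # epoch 0 -> 0.025, epoch 2 -> 0.015, epoch 4 -> 0.005. The SSR proposal scale
    # decays the same way from 0.25 to 0.05.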
    def forward(self, x, key_padding_mask=None, attn_mask=None):
        batch_size, seq_len, _ = x.shape
        ssr_before_update_for_loss = self.ssr.data.clone().detach()
        current_ssr_expanded = self.ssr.unsqueeze(0).unsqueeze(0).expand(batch_size, seq_len, -1).to(x.device)
        normed_x = self.norm_input_x(x)
        normed_ssr_expanded = self.norm_ssr_input(current_ssr_expanded)
        x_conditioned = torch.cat((normed_x, normed_ssr_expanded), dim=-1)
        current_gates_activations = torch.sigmoid(self.gates_params)
        if self.debug_prints_enabled and (self.wiring_phase_active or not self.training):
            ssr_print_val = self.ssr.data.detach().clone()
            ssr_sample_str = [f'{s.item():.3f}' for s in ssr_print_val[:min(3, self.ssr_dim)]] + (["..."] if self.ssr_dim > 3 else [])
            print(f"    AdaptiveBlock {self.block_idx} (Wiring: {'ON' if self.wiring_phase_active else 'OFF'}, Epoch {self.current_epoch_in_wiring+1}/{self.total_wiring_epochs if self.wiring_phase_active else 'N/A'})")
            print(f"      Input x: {x.shape}, CurrentSSR (sample): {ssr_sample_str}, RawG: {[f'{g.item():.3f}' for g in self.gates_params.data]}, SigmoidG: {[f'{s.item():.3f}' for s in current_gates_activations.data]}")
        outputs_from_submodules = []
        for i, module_instance in enumerate(self.sub_modules):
            if i >= self.num_sub_modules: break
            if i == 0:
                module_out, _ = module_instance(x_conditioned, x_conditioned, x_conditioned, key_padding_mask=key_padding_mask, attn_mask=attn_mask, need_weights=False)
            else:
                module_out = module_instance(x_conditioned)
            outputs_from_submodules.append(module_out * current_gates_activations[i])
        gated_sum_output = torch.sum(torch.stack(outputs_from_submodules, dim=0), dim=0) if outputs_from_submodules else torch.zeros_like(x_conditioned)
        block_processed_output_unnorm = x_conditioned + self.dropout_layer(gated_sum_output)
        block_processed_output = self.norm_after_gates(block_processed_output_unnorm)
        x_output_for_next_block = block_processed_output[:, :, :self.d_model]
        # V6.2: Get entropy of d_model part for loss
        x_output_part_entropy = self.x_output_entropy_estimator(x_output_for_next_block.detach(), active_mask=~key_padding_mask if key_padding_mask is not None else None)
        block_processed_output_entropy = self.output_entropy_estimator(block_processed_output.detach(), active_mask=~key_padding_mask if key_padding_mask is not None else None)
        current_static_target_diff = block_processed_output_entropy - self.static_seed_target_entropy
        dynamic_target_entropy_for_heuristic = self.static_seed_target_entropy
        fep_delta_ssr_proposal_scaled = torch.zeros_like(self.ssr.data, device=x.device)
        fep_entropy_adj_factor_for_report = torch.tensor(0.0, device=x.device)
        # Wiring phase only: the FEP proposes an SSR delta and a dynamic entropy target,
        # then a no-grad heuristic nudges the raw gate parameters toward that target.
        if self.wiring_phase_active and self.training:
            fep_delta_ssr_proposal_raw, fep_entropy_adj_factor_raw = self.fep(self.ssr.data.detach(), block_processed_output_entropy.detach(), current_static_target_diff.detach())
            current_ssr_scale = self._get_current_ssr_proposal_scale()  # V6.1
            fep_delta_ssr_proposal_scaled = fep_delta_ssr_proposal_raw * current_ssr_scale
            fep_entropy_adj_factor_tanh = torch.tanh(fep_entropy_adj_factor_raw)
            dynamic_adjustment = fep_entropy_adj_factor_tanh * self.MAX_DYNAMIC_ENTROPY_ADJUSTMENT_RANGE
            dynamic_target_entropy_for_heuristic = self.static_seed_target_entropy + dynamic_adjustment.item()
            dynamic_target_entropy_for_heuristic = max(0.01, min(0.99, dynamic_target_entropy_for_heuristic))
            fep_entropy_adj_factor_for_report = fep_entropy_adj_factor_tanh
            with torch.no_grad():
                entropy_diff_for_heuristic = block_processed_output_entropy - dynamic_target_entropy_for_heuristic
                base_adj_strength = self._get_current_heuristic_strength()
                adaptive_strength_factor = min(max(abs(entropy_diff_for_heuristic.item()) * 7.0, 0.3), 2.5)
                adj_strength = base_adj_strength * adaptive_strength_factor
                if self.debug_prints_enabled:
                    print(f"      AdaptiveBlock {self.block_idx} WIRING HEURISTIC: RawG={[f'{g.item():.3f}' for g in self.gates_params.data]}, SigmoidG={[f'{s.item():.3f}' for s in current_gates_activations.data]}")
                    print(f"        BlockProcOutEnt={block_processed_output_entropy.item():.4f}, X_OutEnt={x_output_part_entropy.item():.4f}, StaticTgtEnt={self.static_seed_target_entropy:.4f}, FEP_EntAdjFactor={fep_entropy_adj_factor_tanh.item():.4f}, DynTgtEnt={dynamic_target_entropy_for_heuristic:.4f}, ED_Dyn={entropy_diff_for_heuristic.item():.4f}, BaseHeurStr={base_adj_strength:.4f} AdjStr={adj_strength:.4f}, SSR_PropScale={current_ssr_scale:.4f}")
                if entropy_diff_for_heuristic.item() > 1e-4:
                    self.gates_params.data[0] -= adj_strength; self.gates_params.data[1] += adj_strength * 0.6
                    if self.num_sub_modules > 2: self.gates_params.data[2] += adj_strength * 0.4
                elif entropy_diff_for_heuristic.item() < -1e-4:
                    self.gates_params.data[0] += adj_strength; self.gates_params.data[1] -= adj_strength * 0.6
                    if self.num_sub_modules > 2: self.gates_params.data[2] -= adj_strength * 0.4
                self.gates_params.data.clamp_(-3.5, 3.5)
                if self.debug_prints_enabled:
                    print(f"      AdaptiveBlock {self.block_idx} WIRING HEURISTIC POST: RawG={[f'{g.item():.3f}' for g in self.gates_params.data]}, SigmoidG={[f'{s.item():.3f}' for s in torch.sigmoid(self.gates_params.data)]}")
        block_output_aggregated = torch.mean(block_processed_output, dim=1)
        ssr_update_input_list = []
        for b_idx in range(batch_size):
            current_fep_delta_ssr_for_update = fep_delta_ssr_proposal_scaled[b_idx] if fep_delta_ssr_proposal_scaled.dim() > 1 and fep_delta_ssr_proposal_scaled.size(0) == batch_size else fep_delta_ssr_proposal_scaled
            # V6.2 EXPERIMENT: block_output_aggregated is NOT detached to allow gradients to flow back
            ssr_update_input_list.append(torch.cat((
                self.ssr.data.detach().clone(),             # Previous SSR state (context for update)
                block_output_aggregated[b_idx],             # Current block's processed output (NOT detached)
                current_fep_delta_ssr_for_update.detach()   # FEP proposal (context for update)
            )))
        ssr_update_input_batched = torch.stack(ssr_update_input_list, dim=0)
        new_ssr_values_batched = self.ssr_update_net(ssr_update_input_batched)
        if self.training:
            self.ssr.data = self.norm_ssr_output(torch.mean(new_ssr_values_batched, dim=0))
        elif batch_size == 1:
            self.ssr.data = self.norm_ssr_output(new_ssr_values_batched.squeeze(0))
        ssr_after_update_for_report = self.ssr.data.clone()
        return x_output_for_next_block, block_processed_output_entropy, x_output_part_entropy, \
               current_gates_activations, self.gates_params.data.clone(), \
               fep_entropy_adj_factor_for_report, torch.tensor(dynamic_target_entropy_for_heuristic, device=x.device), \
               ssr_before_update_for_loss, ssr_after_update_for_report, fep_delta_ssr_proposal_scaled


# --- Positional Encoding ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__(); self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
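
# The PositionalEncoding above is the standard fixed sinusoidal table (a restatement,
# not new behavior): PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)), precomputed up to max_len,
# registered as a buffer, added to the scaled embeddings, then passed through dropout.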

# --- Main SWCK Model (V6.2) ---
class SWCKModel(nn.Module):
    def __init__(self, vocab_size, d_model, ssr_dim, n_heads, d_ff, num_adaptive_blocks, dropout, seed_phrase, seed_number_str, num_sub_modules_per_block=3):
        super().__init__()
        self.d_model = d_model; self.ssr_dim = ssr_dim; self.seed_phrase = seed_phrase; self.seed_number_str = seed_number_str
        self.num_adaptive_blocks = num_adaptive_blocks
        self.debug_prints_enabled = True
        if self.debug_prints_enabled: print(f"--- Initializing SWCKModel (V6.2) ---")
        self.seed_parser = SeedParser(seed_phrase, seed_number_str, d_model, ssr_dim, num_adaptive_blocks, num_sub_modules_per_block)
        self.seed_parser.debug_prints_enabled = self.debug_prints_enabled
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.adaptive_blocks = nn.ModuleList()
        for i in range(num_adaptive_blocks):
            block_config = self.seed_parser.get_block_config(i)
            if block_config is None: raise ValueError(f"SWCKModel Error: Could not get seed config for block {i}")
            new_block = AdaptiveBlock(d_model, ssr_dim, n_heads, d_ff, dropout, block_config, block_idx=i, num_sub_modules=num_sub_modules_per_block)
            new_block.debug_prints_enabled = self.debug_prints_enabled
            self.adaptive_blocks.append(new_block)
            if self.debug_prints_enabled: print(f"  SWCKModel: Added AdaptiveBlock {i} (V6.2)")
        self.fc_out = nn.Linear(d_model, vocab_size)
        # V6.2: Renamed for clarity
        self.final_d_model_entropy_estimator = EntropyEstimator(d_model, name="Final_DMODEL_OutEntropy")
        self.final_d_model_entropy_estimator.debug_prints_enabled = False
        self._init_weights()
        if self.debug_prints_enabled: print(f"--- SWCKModel V6.2 Initialized (Vocab: {vocab_size}, d_model: {d_model}, SSR_dim: {ssr_dim}, Blocks: {num_adaptive_blocks}x{num_sub_modules_per_block}sub) ---")

    def _init_weights(self):
        initrange = 0.1; self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc_out.bias.data.zero_(); self.fc_out.weight.data.uniform_(-initrange, initrange)

    def set_wiring_phase(self, active, current_epoch_num=0, total_wiring_epochs=1):
        if self.debug_prints_enabled:
            print(f"SWCKModel: Setting wiring phase to {active} for all blocks (Epoch {current_epoch_num+1}/{total_wiring_epochs} of wiring if active).")
        for block in self.adaptive_blocks:
            block.set_wiring_phase(active, current_epoch_num, total_wiring_epochs)

    def forward(self, src_tokens, src_key_padding_mask=None):
        if self.debug_prints_enabled:
            print(f"\n--- SWCKModel V6.2 Forward Pass (Training: {self.training}) ---")
            print(f"  Input src_tokens: {src_tokens.shape}")
        x = self.embedding(src_tokens) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        if self.debug_prints_enabled: print(f"  After Embedding & PosEnc, x: {x.shape}")
        block_processed_output_entropies = []
        block_x_output_entropies = []  # V6.2
        current_block_gate_activations = []; current_block_gate_raw_params = []
        fep_entropy_adj_factors = []; dynamic_target_entropies_used = []
        ssr_befores_for_loss = []; ssr_afters_for_report = []; fep_delta_ssr_proposals_report = []
        for i, block in enumerate(self.adaptive_blocks):
            if self.debug_prints_enabled: print(f"  Processing AdaptiveBlock {i}...")
            x, blk_proc_out_ent, x_out_ent, current_gate_acts, raw_gate_params, fep_ent_adj_factor, dyn_target_ent, ssr_before, ssr_after, fep_delta_ssr = block(x, key_padding_mask=src_key_padding_mask, attn_mask=None)
            block_processed_output_entropies.append(blk_proc_out_ent)
            block_x_output_entropies.append(x_out_ent)
            current_block_gate_activations.append(current_gate_acts)
            current_block_gate_raw_params.append(raw_gate_params)
            fep_entropy_adj_factors.append(fep_ent_adj_factor)
            dynamic_target_entropies_used.append(dyn_target_ent)
            ssr_befores_for_loss.append(ssr_before)
            ssr_afters_for_report.append(ssr_after)
            fep_delta_ssr_proposals_report.append(fep_delta_ssr)
            if self.debug_prints_enabled:
                acts_str = [f'{act.item():.3f}' for act in current_gate_acts]
                raw_str = [f'{rp.item():.3f}' for rp in raw_gate_params]
                ssr_after_str = [f'{srp.item():.3f}' for srp in ssr_after[:min(3, self.ssr_dim)]] + (["..."] if self.ssr_dim > 3 else [])
                fep_ds_str_report_inner = "N/A"
                if torch.is_tensor(fep_delta_ssr) and fep_delta_ssr.numel() > 0:
                    fep_ds_str_report_inner = [f'{ds.item():.3f}' for ds in fep_delta_ssr[:min(3, self.ssr_dim)]] + (["..."] if self.ssr_dim > 3 else [])
                fep_ent_adj_factor_str = f"{fep_ent_adj_factor.item():.3f}" if torch.is_tensor(fep_ent_adj_factor) else "N/A_Scalar"
                dyn_target_str = f"{dyn_target_ent.item():.3f}" if torch.is_tensor(dyn_target_ent) else "N/A_Scalar"
                print(f"    Output x from Block {i}: {x.shape}, BlkProcOutEnt: {blk_proc_out_ent.item():.4f}, X_OutEnt: {x_out_ent.item():.4f}, SigmoidG: {acts_str}, RawG: {raw_str}")
                print(f"    Block {i} SSR_After (sample): {ssr_after_str}, FEP_DeltaSSR_Proposal (sample): {fep_ds_str_report_inner}, FEP_EntAdjFactor: {fep_ent_adj_factor_str}, DynTgtEnt: {dyn_target_str}")
        logits = self.fc_out(x)
        if self.debug_prints_enabled: print(f"  Output logits: {logits.shape}")
        final_active_mask = ~src_key_padding_mask if src_key_padding_mask is not None else None
        overall_d_model_output_entropy = self.final_d_model_entropy_estimator(x.detach(), active_mask=final_active_mask)  # Use renamed estimator
        if self.debug_prints_enabled: print(f"  Overall Final d_model Output Entropy (before fc_out): {overall_d_model_output_entropy.item():.4f}")
        entropy_report = {
            "block_processed_output_entropies": block_processed_output_entropies,
            "block_x_output_entropies": block_x_output_entropies,  # V6.2
            "overall_d_model_output_entropy": overall_d_model_output_entropy,  # V6.2
            "current_block_gate_activations": current_block_gate_activations,
            "current_block_gate_params": current_block_gate_raw_params,
            "fep_entropy_adj_factors": fep_entropy_adj_factors,
            "dynamic_target_entropies_used": dynamic_target_entropies_used,
            "ssr_befores_for_loss": ssr_befores_for_loss,
            "ssr_afters_for_report": ssr_afters_for_report,
            "fep_delta_ssr_proposals": fep_delta_ssr_proposals_report
        }
        if self.debug_prints_enabled: print(f"--- SWCKModel V6.2 Forward Pass Complete ---")
        return logits, entropy_report
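

# Minimal smoke-test sketch (not part of the original training pipeline): the
# hyperparameters and seed strings below are illustrative assumptions chosen small
# enough to run on CPU; the real project config may differ. It only checks that a
# forward pass produces logits of the expected shape and an entropy report.
if __name__ == "__main__":
    torch.manual_seed(0)
    vocab_size, d_model, ssr_dim, n_heads, d_ff = 50, 32, 8, 2, 64  # assumed demo values
    model = SWCKModel(vocab_size=vocab_size, d_model=d_model, ssr_dim=ssr_dim,
                      n_heads=n_heads, d_ff=d_ff, num_adaptive_blocks=2, dropout=0.1,
                      seed_phrase="demo seed phrase", seed_number_str="12345",
                      num_sub_modules_per_block=3)
    # Silence per-step debug prints for the smoke test.
    model.debug_prints_enabled = False
    for blk in model.adaptive_blocks:
        blk.debug_prints_enabled = False
    model.set_wiring_phase(True, current_epoch_num=0, total_wiring_epochs=2)
    src = torch.randint(0, vocab_size, (4, 16))          # (batch, seq_len)
    pad_mask = torch.zeros(4, 16, dtype=torch.bool)      # True marks padding positions
    logits, report = model(src, src_key_padding_mask=pad_mask)
    print("logits:", logits.shape)                       # expected: (4, 16, vocab_size)
    print("block entropies:", [e.item() for e in report["block_processed_output_entropies"]])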