RWKV
/

rwkv-5-world-1b5

Text Generation

Model card Files Files and versions Community

KaleiNeely committed on Dec 3, 2023

Commit

af0c7e3

•

1 Parent(s): ad95cec

Update modeling_rwkv5.py

Files changed (1) hide show

modeling_rwkv5.py +0 -20

modeling_rwkv5.py CHANGED Viewed

@@ -92,7 +92,6 @@ def rwkv_linear_attention_v5_2(B, H, S, T, n_head, hidden, time_decay, time_firs
     time_first = time_first.float().reshape(-1,1,1).reshape(n_head, -1, 1)
     lxw = lxw.float()
     lxb = lxb.float()
-    # if seq_mode:
     out = torch.empty((B, T, H, S), dtype=receptance.dtype, device=receptance.device)
     for t in range(T):
         rt = receptance[:,:,t:t+1,:]
@@ -106,25 +105,6 @@ def rwkv_linear_attention_v5_2(B, H, S, T, n_head, hidden, time_decay, time_firs
     out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H*S)
     out = out.to(dtype=hidden.dtype) * gate
     out = out @ ow
-    # else:
-    #     a = key @ value
-    #     # print('key.shape: ', key.shape)
-    #     # print('value.shape: ', value.shape)
-    #     # print('receptance.shape: ', receptance.shape)
-    #     # print('a.shape: ', a.shape)
-    #     # print('time_first.shape: ', time_first.shape)
-    #     # print('(time_first * a).shape: ', (time_first * a).shape)
-    #     # print('time_decay.shape: ', time_decay.shape)
-    #     # print('state.shape: ', state.shape)
-    #     out = receptance @ (time_first * a + state)
-    #     # print('out.shape: ', out.shape)
-    #     state = a + time_decay * state
-    #     # print('state.shape: ', state.shape)
-    #     out = out.reshape(B, H*S)
-    #     out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, 1, H*S)
-    #     out = out.to(dtype=hidden.dtype) * gate
-    #     out = out @ ow
     return out, state

     time_first = time_first.float().reshape(-1,1,1).reshape(n_head, -1, 1)
     lxw = lxw.float()
     lxb = lxb.float()
     out = torch.empty((B, T, H, S), dtype=receptance.dtype, device=receptance.device)
     for t in range(T):
         rt = receptance[:,:,t:t+1,:]
     out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H*S)
     out = out.to(dtype=hidden.dtype) * gate
     out = out @ ow
     return out, state