Spaces:

harmdevries
/

transformer_inference

Runtime error

App Files Community

harmdevries committed on Oct 23, 2022

Commit

a275f69

1 Parent(s): ea57214

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -11

app.py CHANGED Viewed

@@ -35,51 +35,54 @@ h = number_field('Num heads', value=16)
 d = number_field('Dimension', value=768)
 n_start = number_field('Start seq', value=1)
 n = number_field('End seq', value=1024)
-l = number_field('Num layers', value=2)
-st.header('Query, Key, Value projection')
-st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*1*d*3*d
 mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.subheader("Multi-Query Attention")
 mqa_flop = 2*bs*1*d*(1+2/h)*d
 mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.header('Attention scores: query-key gemm')
-st.write("Calculation depends on sequence length (n). Take end of sequence.")
-st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*h*(d/h)*n
 mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
 att1_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.subheader("Multi-Query Attention")
 mqa_flop = 2*bs*h*(d/h)*n
 mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
 att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.header('Attention scores: attention-value gemm')
 st.write("Calculation depends on sequence length. We show numbers for maximum sequence length n.")
-st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*h*n*(d/h)
 mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
 c1, c2 = st.columns([2, 3])
 att_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.subheader("Multi-Query Attention")
 mqa_flop = 2*bs*h*n*(d/h)
 mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
 c1, c2 = st.columns([2, 3])
 att_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
 st.header('MLP')
 st.subheader('First Linear')
 mlp1_flop = 2*bs*1*d*4*d

 d = number_field('Dimension', value=768)
 n_start = number_field('Start seq', value=1)
 n = number_field('End seq', value=1024)
+l = number_field('Num layers', value=24)
+st.header('Attention layer')
+st.subheader('QKV projection')
+st.caption("Multi-Head Attention")
 mha_flop = 2*bs*1*d*3*d
 mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+st.caption("Multi-Query Attention")
 mqa_flop = 2*bs*1*d*(1+2/h)*d
 mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+st.subheader('QK gemm')
+st.write("Note that calculation depends on sequence length (n)")
+st.caption("Multi-Head Attention")
 mha_flop = 2*bs*h*(d/h)*n
 mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
 att1_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+st.caption("Multi-Query Attention")
 mqa_flop = 2*bs*h*(d/h)*n
 mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
 att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+st.header('Attention-value gemm')
 st.write("Calculation depends on sequence length. We show numbers for maximum sequence length n.")
+st.caption("Multi-Head Attention")
 mha_flop = 2*bs*h*n*(d/h)
 mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
 c1, c2 = st.columns([2, 3])
 att_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+st.caption("Multi-Query Attention")
 mqa_flop = 2*bs*h*n*(d/h)
 mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
 c1, c2 = st.columns([2, 3])
 att_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+st.subheader('Output projection')
 st.header('MLP')
 st.subheader('First Linear')
 mlp1_flop = 2*bs*1*d*4*d