Spaces:

harmdevries
/

transformer_inference

Runtime error

App Files Files Community

harmdevries committed on Oct 22, 2022

Commit

8cbefab

1 Parent(s): 54cd0e6

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -7

app.py CHANGED Viewed

@@ -33,12 +33,12 @@ col1, col2 = st.columns([2, 4])
 bs = number_field('Batch size', value=10)
 h = number_field('Num heads', value=16)
 d = number_field('Dimension', value=768)
-n = number_field('Seq length', value=1024)
 st.header('Query, Key, Value projection')
 st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*1*d*3*d
 mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
@@ -51,16 +51,28 @@ mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.header('Attention')
 st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*h*(d/h)*n
 mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
-att_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
 st.subheader("Multi-Query Attention")
 mqa_flop = 2*bs*h*(d/h)*n
@@ -68,7 +80,6 @@ mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
 att_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
 st.header('MLP')
 st.subheader('First Linear')
 mlp1_flop = 2*bs*1*d*4*d

 bs = number_field('Batch size', value=10)
 h = number_field('Num heads', value=16)
 d = number_field('Dimension', value=768)
+n_start = number_field('Start seq', value=1)
+n = number_field('End seq', value=1024)
+l = number_field('Num layers', value=2)
 st.header('Query, Key, Value projection')
 st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*1*d*3*d
 mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
 c1, c2 = st.columns([2, 3])
 qkv_mha_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+st.header('Attention scores: query-key gemm')
+st.write("Calculation depends on sequence length (n). Take end of sequence.")
 st.subheader("Multi-Head Attention")
 mha_flop = 2*bs*h*(d/h)*n
 mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
 c1, c2 = st.columns([2, 3])
+att1_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+st.subheader("Multi-Query Attention")
+mqa_flop = 2*bs*h*(d/h)*n
+mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
+c1, c2 = st.columns([2, 3])
+att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+st.header('Attention scores: ')
+st.write("Calculation depends on sequence length. We show numbers for maximum sequence length n.")
+st.subheader("Multi-Head Attention")
+mha_flop = 2*bs*h*(d/h)*n
+mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
+c1, c2 = st.columns([2, 3])
+att_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
 st.subheader("Multi-Query Attention")
 mqa_flop = 2*bs*h*(d/h)*n
 c1, c2 = st.columns([2, 3])
 att_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
 st.header('MLP')
 st.subheader('First Linear')
 mlp1_flop = 2*bs*1*d*4*d