Spaces:
Runtime error
Commit 67aca21
Parent(s): 1c3236f
Update app.py
app.py CHANGED
@@ -31,8 +31,9 @@ THREAD_OVERHEAD = 0.005
 # in ms
 def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
     exec_time = comp_flop/TFLOPS + mem_bytes/GB_S
+    exec_time *= 1000
     if include_overhead:
-        exec_time = max(exec_time
+        exec_time = max(exec_time, THREAD_OVERHEAD)
     return exec_time
 
 def qkv_mha_exec(bs, h, n, d):
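The fix closes the unbalanced max( call left by the parent commit (a syntax error, consistent with the Space's "Runtime error" status) and converts the estimate to milliseconds before applying the launch-overhead floor. Below is a minimal runnable sketch of the repaired helper; the hardware constants are assumed A100-class values, not necessarily the ones defined elsewhere in app.py.

TFLOPS = 312e12          # assumed peak compute, FLOP/s
GB_S = 1.555e12          # assumed memory bandwidth, bytes/s
THREAD_OVERHEAD = 0.005  # kernel-launch floor in ms (value from the diff context)

def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
    # compute time plus memory-transfer time, in seconds...
    exec_time = comp_flop/TFLOPS + mem_bytes/GB_S
    exec_time *= 1000  # ...converted to milliseconds
    if include_overhead:
        # a kernel never finishes faster than its launch overhead
        exec_time = max(exec_time, THREAD_OVERHEAD)
    return exec_time

# 1 TFLOP of compute and 1 GB of traffic:
# 1e12/312e12*1e3 + 1e9/1.555e12*1e3 ≈ 3.21 + 0.64 ≈ 3.85 ms
print(calc_exec_time(1e12, 1e9))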
@@ -109,9 +110,6 @@ def print_kernel_execution(flop, nbytes):
     c2.write(str(exec_time))
     c1.write("Overhead (ms):")
     c2.write(str(THREAD_OVERHEAD))
-
-    return exec_time
-
 
 st.header("Execution time (ms)")
 
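print_kernel_execution no longer returns the latency; call sites now get it from the *_exec helpers instead (see the next hunk). Only the tail of the function is visible in the diff, so the body below is a hypothetical reconstruction of its (flop, nbytes) form: the column setup and the FLOP/bytes rows are assumptions.

import streamlit as st

TFLOPS, GB_S, THREAD_OVERHEAD = 312e12, 1.555e12, 0.005  # assumed, as above

def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
    exec_time = (comp_flop/TFLOPS + mem_bytes/GB_S) * 1000
    return max(exec_time, THREAD_OVERHEAD) if include_overhead else exec_time

def print_kernel_execution(flop, nbytes):
    # hypothetical body: only the last three write calls appear in the diff
    c1, c2 = st.columns([2, 3])
    c1.write("FLOP:")
    c2.write(str(flop))
    c1.write("Memory (bytes):")
    c2.write(str(nbytes))
    exec_time = calc_exec_time(flop, nbytes)
    c1.write("Execution time (ms):")
    c2.write(str(exec_time))
    c1.write("Overhead (ms):")
    c2.write(str(THREAD_OVERHEAD))
    # no return: the old `return exec_time` was dropped in this commit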
@@ -185,44 +183,34 @@ if breakdown:
     st.subheader('Attention-value gemm')
     st.write("Showing calculation for the maximum sequence length (n)")
     st.caption("Multi-Head Attention")
-
-
-    c1, c2 = st.columns([2, 3])
-    att2_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+    flop, nbytes, exec_time = att2_mha_exec(bs, h, n, d)
+    print_kernel_execution(flop, nbytes)
 
     st.caption("Multi-Query Attention")
-
-
-    c1, c2 = st.columns([2, 3])
-    att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+    flop, nbytes, exec_time = att2_mqa_exec(bs, h, n, d)
+    print_kernel_execution(flop, nbytes)
 
     st.subheader('Output projection')
-
-
-    c1, c2 = st.columns([2, 3])
-    out_time = print_kernel_execution(c1, c2, out_flop, out_bytes)
+    flop, nbytes, exec_time = out_exec(bs, h, n, d)
+    print_kernel_execution(flop, nbytes)
 
     st.subheader('Element-wise ops')
     st.write("We also need to take into the softmax layer, layer norm, and residual connection. We assume that these operations are memory bound. ")
 
     st.caption("Softmax")
-
-
-    softmax_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
+    flop, nbytes, exec_time = softmax_exec(bs, h, n, d)
+    print_kernel_execution(flop, nbytes)
 
     st.caption("Layer norm/residual connection")
-
-
-    ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)
+    flop, nbytes, exec_time = ln_exec(bs, h, n, d)
+    print_kernel_execution(flop, nbytes)
 
     st.header('MLP layer')
     st.subheader('First and Second Linear Layer')
     flop, nbytes, exec_time = mlp_exec(bs, h, n, d)
-
-    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)
+    print_kernel_execution(flop, nbytes)
 
     st.subheader('Element-wise ops')
     st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
     flop, nbytes, exec_time = ln_exec(bs, h, n, d)
-
-    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)
+    print_kernel_execution(flop, nbytes)
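Every call site in the breakdown now follows the same two-line pattern: a <kernel>_exec helper returns (flop, nbytes, exec_time), and print_kernel_execution renders it. As an illustration of that contract (together with calc_exec_time from the first sketch), here is a hypothetical att2_mha_exec for the attention-weights times values GEMM; the fp16 byte accounting is an assumption, not copied from app.py.

def att2_mha_exec(bs, h, n, d):
    # (bs, h, n, n) attention weights @ (bs, h, n, d) values, 2 FLOP per MAC
    flop = 2 * bs * h * n * n * d
    # read weights, read V, write output, assuming 2-byte (fp16) elements
    nbytes = 2 * bs * h * (n * n + n * d + n * d)
    exec_time = calc_exec_time(flop, nbytes)  # from the first sketch
    return flop, nbytes, exec_time

# bs=1, h=16, n=2048, d=64 with the assumed A100 constants:
# ~8.6 GFLOP, ~143 MB, ≈ 0.12 ms (memory traffic dominates)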