Spaces:

harmdevries
/

transformer_inference

Runtime error

App Files Files Community

harmdevries committed on Oct 23, 2022

Commit

87bf3c7

1 Parent(s): bc2a18b

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -31

app.py CHANGED Viewed

@@ -1,16 +1,39 @@
 import streamlit as st
-# A100 specs
-TFLOPS = 312e12
-GB_S = 1935e9
 # in ms
 THREAD_OVERHEAD = 0.005
 # in ms
-def calc_exec_time(comp_flop, mem_bytes):
   exec_time = comp_flop/TFLOPS + mem_bytes/GB_S
-  return max(exec_time*1000, THREAD_OVERHEAD)
 def qkv_mha_exec(bs, h, n, d):
   flop = 2*bs*1*d*3*d
@@ -72,10 +95,11 @@ def mlp_exec(bs, h, n, d):
   exec_time = calc_exec_time(flop, nbytes)
   return flop, nbytes, exec_time
-def print_kernel_execution(c1, c2, comp_flop, mem_bytes):
-  exec_time = calc_exec_time(comp_flop, mem_bytes)
-  comp_flop = round(comp_flop/1e9, 2)
-  mem_bytes = round(mem_bytes/1e6, 2)
   c1.write("GFLOP:")
   c2.write(str(comp_flop))
@@ -83,22 +107,11 @@ def print_kernel_execution(c1, c2, comp_flop, mem_bytes):
   c2.write(str(mem_bytes))
   c1.write("Time (ms):")
   c2.write(str(exec_time))
   return exec_time
-st.sidebar.header("Transformer parameters")
-col1, col2 = st.sidebar.columns([2, 4])
-bs = st.sidebar.number_input('Batch size', value=10)
-h = st.sidebar.number_input('Num heads',value=16)
-d = st.sidebar.number_input('Dimension', value=768)
-l = st.sidebar.number_input('Num layers', value=24)
-n_start = st.sidebar.number_input('Start seq', value=1)
-n = st.sidebar.number_input('End seq', value=1024)
-st.sidebar.header("GPU parameters")
 st.header("Execution time (ms)")
@@ -151,16 +164,13 @@ if breakdown:
   st.subheader('QKV projection')
   st.caption("Multi-Head Attention")
-  mha_flop = 2*bs*1*d*3*d
-  mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
-  c1, c2 = st.columns([2, 3])
-  qkv_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
   st.caption("Multi-Query Attention")
-  mqa_flop = 2*bs*1*d*(1+2/h)*d
-  mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
-  c1, c2 = st.columns([2, 3])
-  qkv_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
   st.subheader('QK gemm')
   st.write("Showing calculation for the maximum sequence length (n)")
@@ -187,7 +197,7 @@ if breakdown:
   st.caption("Multi-Query Attention")
   mqa_flop = 2*bs*h*n*(d/h)
-  mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
   c1, c2 = st.columns([2, 3])
   att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)

 import streamlit as st
+st.sidebar.header("Transformer parameters")
+col1, col2 = st.sidebar.columns([2, 4])
+bs = st.sidebar.number_input('Batch size', value=10)
+h = st.sidebar.number_input('Num heads',value=16)
+d = st.sidebar.number_input('Dimension', value=768)
+l = st.sidebar.number_input('Num layers', value=24)
+n_start = st.sidebar.number_input('Start seq', value=1)
+n = st.sidebar.number_input('End seq', value=1024)
+st.sidebar.header("GPU parameters")
+# Render the selector in the sidebar so it sits under the "GPU parameters"
+# header instead of appearing in the main page body.
+GPU = st.sidebar.selectbox('GPU', ('A100', 'V100'))
+# TFLOPS is used as FLOP per second and GB_S as bytes per second by
+# calc_exec_time (it divides by them and then scales the result to ms).
+if GPU == 'A100':
+  # A100 specs
+  TFLOPS = 312e12
+  GB_S = 1935e9
+elif GPU == 'V100':
+  # V100 specs
+  TFLOPS = 112e12
+  GB_S = 900e9
+else:
+  raise ValueError('Unknown GPU')
 # in ms
 THREAD_OVERHEAD = 0.005
 # in ms
+def calc_exec_time(comp_flop, mem_bytes, include_overhead=True):
+  """Estimate the execution time of a kernel in milliseconds.
+
+  The estimate is the sum of the compute time (comp_flop/TFLOPS) and the
+  memory time (mem_bytes/GB_S), converted from seconds to milliseconds.
+
+  Args:
+    comp_flop: number of floating point operations performed.
+    mem_bytes: number of bytes read and written.
+    include_overhead: when true, the result is never smaller than
+      THREAD_OVERHEAD (which is expressed in ms).
+
+  Returns:
+    The estimated time in milliseconds.
+  """
   exec_time = comp_flop/TFLOPS + mem_bytes/GB_S
+  # Convert to ms on both paths. Previously the conversion only happened
+  # when include_overhead was true, so include_overhead=False returned
+  # seconds even though callers display the value as "Time (ms)".
+  exec_time *= 1000
+  if include_overhead:
+    exec_time = max(exec_time, THREAD_OVERHEAD)
+  return exec_time
 def qkv_mha_exec(bs, h, n, d):
   flop = 2*bs*1*d*3*d
   exec_time = calc_exec_time(flop, nbytes)
   return flop, nbytes, exec_time
+def print_kernel_execution(flop, mem_bytes):
+  c1, c2 = st.columns([2, 3])
+  exec_time = calc_exec_time(comp_flop, mem_bytes, include_overhead=False)
+  flop = round(flop/1e9, 2)
+  nbytes = round(nbytes/1e6, 2)
   c1.write("GFLOP:")
   c2.write(str(comp_flop))
   c2.write(str(mem_bytes))
   c1.write("Time (ms):")
   c2.write(str(exec_time))
+  c1.write("Overhead (ms):")
+  c2.write(str(THREAD_OVERHEAD))
   return exec_time
 st.header("Execution time (ms)")
   st.subheader('QKV projection')
   st.caption("Multi-Head Attention")
+  flop, nbytes, exec_time = qkv_mha_exec(bs, h, n, d)
+  print_kernel_execution(flop, nbytes)
   st.caption("Multi-Query Attention")
+  flop, nbytes, exec_time = qkv_mqa_exec(bs, h, n, d)
+  print_kernel_execution(flop, nbytes)
   st.subheader('QK gemm')
   st.write("Showing calculation for the maximum sequence length (n)")
   st.caption("Multi-Query Attention")
   mqa_flop = 2*bs*h*n*(d/h)
+  mqa_bytes = 2*bs*h*n + 2*bs*n*(d/h) + 2*bs*h*(d/h)
   c1, c2 = st.columns([2, 3])
   att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)