harmdevries committed on
Commit
064a5f0
·
1 Parent(s): 32aafee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -160,7 +160,7 @@ c1.write("Cached keys and values (GB)")
160
  acts = round(2*bs*l*(d/h)*2*n/1e9, 2)
161
  c2.write(str(acts))
162
 
163
- st.subheader("Approximations")
164
  st.markdown("We use the [following crude approximation](https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#understand-perf) to estimate the execution time for each matrix multiplication.")
165
 
166
  st.latex("C = A \cdot B")
@@ -183,9 +183,13 @@ st.latex("T_{math}(A \cdot B) = 2*M*K*N / BW_{math}")
183
  st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
184
 
185
  st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
186
- st.latex("max(T_math, T_mem)")
187
 
188
- breakdown = st.checkbox("Show breakdown per operation")
 
 
 
 
189
  if breakdown:
190
  st.header('Attention layer')
191
 
 
160
  acts = round(2*bs*l*(d/h)*2*n/1e9, 2)
161
  c2.write(str(acts))
162
 
163
+ st.subheader("Estimating execution time")
164
  st.markdown("We use the [following crude approximation](https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#understand-perf) to estimate the execution time for each matrix multiplication.")
165
 
166
  st.latex("C = A \cdot B")
 
183
  st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
184
 
185
  st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
186
+ st.latex("max(T_{math}, T_{mem})")
187
 
188
+ st.markdown("We also a minimum time for executing the operation due to [kernel launch overhead](https://forums.developer.nvidia.com/t/any-way-to-measure-the-latency-of-a-kernel-launch/221413/2)")
189
+
190
+ st.subheader("Operations in MHA and MQA")
191
+
192
+ breakdown = st.checkbox("Show inference time for each operation")
193
  if breakdown:
194
  st.header('Attention layer')
195