Felix Marty committed
Commit 6e19ff8 · 1 Parent(s): 0325bda

turn off aws instances
- app.py +6 -0
- defaults.py +12 -12
app.py
CHANGED
```diff
@@ -80,6 +80,12 @@ with gr.Blocks() as demo:
         "## Speed up inference and support more workload with PyTorch's BetterTransformer 🤗"
     )
 
+    gr.Markdown(
+        """
+        **The two AWS instances powering this Space are offline (to save us the $$$). Feel free to reproduce using [this backend code](https://github.com/fxmarty/bettertransformer_demo). The example results are from an AWS EC2 g4dn.xlarge instance with a single NVIDIA T4 GPU.**
+        """
+    )
+
     gr.Markdown(
         """
         Let's try out [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) + [TorchServe](https://pytorch.org/serve/)!
```
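The new notice is simply a second `gr.Markdown` call inside the existing `gr.Blocks` layout. For context, a minimal, self-contained sketch of that pattern might look as follows; the launch call and the trimmed notice text are illustrative assumptions, not part of this diff:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown(
        "## Speed up inference and support more workload with PyTorch's BetterTransformer 🤗"
    )

    # Static notice rendered while the backend instances are off (assumed wording).
    gr.Markdown(
        """
        **The two AWS instances powering this Space are offline (to save us the $$$).**
        """
    )

if __name__ == "__main__":
    demo.launch()  # assumed entry point; the real app.py may differ
```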
defaults.py
CHANGED
```diff
@@ -1,35 +1,35 @@
 defaults_vanilla_single = {
     "status": 200,
     "prediction": "Positive",
-    "inf_latency":
+    "inf_latency": 6.25,
     "peak_gpu_memory": 2706.21,
-    "end_to_end_latency":
+    "end_to_end_latency": 81.95,
 }
 
 defaults_bt_single = {
     "status": 200,
     "prediction": "Positive",
-    "inf_latency":
+    "inf_latency": 4.96,
     "peak_gpu_memory": 2706.22,
-    "end_to_end_latency":
+    "end_to_end_latency": 78.69,
 }
 
 defaults_vanilla_spam = {
-    "throughput":
-    "mean_inference_latency":
-    "mean_peak_gpu_memory":
+    "throughput": 184.58,
+    "mean_inference_latency": 32.2,
+    "mean_peak_gpu_memory": 3046.26,
     "mean_padding_ratio": 69.53,
     "mean_sequence_length": 128.0,
-    "effective_batch_size":
+    "effective_batch_size": 8.0,
 }
 
 defaults_bt_spam = {
-    "throughput":
-    "mean_inference_latency":
-    "mean_peak_gpu_memory":
+    "throughput": 312.21,
+    "mean_inference_latency": 14.42,
+    "mean_peak_gpu_memory": 2798.78,
     "mean_padding_ratio": 69.53,
     "mean_sequence_length": 128.0,
-    "effective_batch_size":
+    "effective_batch_size": 8.0,
 }
 
 BATCH_SIZE = 8  # fixed!
```
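With the AWS instances turned off, these hardcoded values presumably stand in for live measurements in the Space's UI. A hypothetical sketch of that kind of fallback is shown below; the function name, endpoint URL, and error handling are assumptions for illustration, not code from this commit:

```python
# Hypothetical fallback sketch: names, URL, and error handling are assumptions,
# not code from this Space.
import requests

from defaults import defaults_bt_single, defaults_vanilla_single

TORCHSERVE_URL = "http://localhost:8080/predictions/model"  # assumed endpoint


def run_single_inference(text: str, use_bettertransformer: bool) -> dict:
    """Query the TorchServe backend; fall back to the cached T4 results if it is down."""
    try:
        response = requests.post(TORCHSERVE_URL, data=text, timeout=2)
        response.raise_for_status()
        return response.json()
    except requests.RequestException:
        # Backend instances are turned off: return the precomputed example results.
        return defaults_bt_single if use_bettertransformer else defaults_vanilla_single
```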