jadehardouin
committed on
Commit
·
42b592e
1
Parent(s):
0e893b5
Update models.py
Browse files
models.py
CHANGED
@@ -145,7 +145,7 @@ class DIYLlama2Model(BaseTCOModel):
|
|
145 |
r = maxed_out / 100
|
146 |
return input_tokens_cost_per_token * 0.65 / r, output_tokens_cost_per_token * 0.65/ r
|
147 |
|
148 |
-
self.source = gr.Markdown("""<span style="font-size: 16px; font-weight: 600; color: #212529;">Source</span>""")
|
149 |
self.info = gr.Markdown("The cost per input and output tokens values below are from [these benchmark results](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper) that were obtained using the following initial configurations.",
|
150 |
interactive=False,
|
151 |
visible=False)
|
@@ -156,7 +156,7 @@ class DIYLlama2Model(BaseTCOModel):
|
|
156 |
self.vm_cost_per_hour = gr.Number(4.42, label="Instance cost ($) per hour",
|
157 |
interactive=False, visible=False)
|
158 |
self.info_vm = gr.Markdown("This price above is from [CoreWeave's pricing web page](https://www.coreweave.com/gpu-cloud-pricing)", interactive=False, visible=False)
|
159 |
-
self.maxed_out = gr.Slider(minimum=1, maximum=100, value=65, step=1, label="Maxed out", info="Estimated average percentage of total GPU memory that is used. The instantaneous value can go from very high when many users are using the service to very low when no one does.")
|
160 |
self.info_maxed_out = gr.Markdown(r"""This percentage influences the input and output cost/token values, and more precisely the number of token/s. Here is the formula used:<br>
|
161 |
$CT = \frac{VM_C}{TS}$ where $TS = TS_{max} * \frac{MO}{100}$ <br>
|
162 |
with: <br>
|
|
|
145 |
r = maxed_out / 100
|
146 |
return input_tokens_cost_per_token * 0.65 / r, output_tokens_cost_per_token * 0.65/ r
|
147 |
|
148 |
+
self.source = gr.Markdown("""<span style="font-size: 16px; font-weight: 600; color: #212529;">Source</span>""", visible=False)
|
149 |
self.info = gr.Markdown("The cost per input and output tokens values below are from [these benchmark results](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper) that were obtained using the following initial configurations.",
|
150 |
interactive=False,
|
151 |
visible=False)
|
|
|
156 |
self.vm_cost_per_hour = gr.Number(4.42, label="Instance cost ($) per hour",
|
157 |
interactive=False, visible=False)
|
158 |
self.info_vm = gr.Markdown("This price above is from [CoreWeave's pricing web page](https://www.coreweave.com/gpu-cloud-pricing)", interactive=False, visible=False)
|
159 |
+
self.maxed_out = gr.Slider(minimum=1, maximum=100, value=65, step=1, label="Maxed out", info="Estimated average percentage of total GPU memory that is used. The instantaneous value can go from very high when many users are using the service to very low when no one does.", visible=False)
|
160 |
self.info_maxed_out = gr.Markdown(r"""This percentage influences the input and output cost/token values, and more precisely the number of token/s. Here is the formula used:<br>
|
161 |
$CT = \frac{VM_C}{TS}$ where $TS = TS_{max} * \frac{MO}{100}$ <br>
|
162 |
with: <br>
|