Spaces:

adaptiveaiventures
/

llama2-interference

Runtime error

adaptiveaiventures committed on Jan 18

Commit

1b4a88d

verified ·

1 Parent(s): b6df890

Update Dockerfile

Files changed (1) hide show

Dockerfile CHANGED Viewed

@@ -1,7 +1,7 @@
 FROM ghcr.io/huggingface/text-generation-inference:latest
-# Define the model to use
 ENV MODEL_ID="adaptiveaiventures/Llama-2-7b-chat-finetune"
-# Set the number of GPU shards (1 if using CPU, 2+ if using multiple GPUs)
-CMD ["--model-id", "adaptiveaiventures/Llama-2-7b-chat-finetune", "--port", "8080", "--num-shard", "1"]

 FROM ghcr.io/huggingface/text-generation-inference:latest
+# Define Model: the TGI launcher takes its --model-id value from the MODEL_ID environment variable
 ENV MODEL_ID="adaptiveaiventures/Llama-2-7b-chat-finetune"
+# Run the TGI server. Exec-form CMD performs no ${VAR} expansion, so --model-id is not repeated here (a literal "${MODEL_ID}" would be passed); MODEL_ID above selects the model
+CMD ["--port", "8080", "--num-shard", "1", "--dtype", "bfloat16", "--max-batch-prefill-tokens", "1024", "--disable-custom-kernels"]