envs:
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
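  # Meta-Llama-3 is a gated repo on Hugging Face; the token must belong to
  # an account that has been granted access to the model.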
  HF_TOKEN: <REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN>

service:
  replica_policy:
    # By not specifying max_replicas and target_qps_per_replica, the service
    # will always run on exactly one replica, with no autoscaling.
    min_replicas: 1
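    # Optional autoscaling sketch (example values, not part of the original
    # config): uncomment to let SkyServe scale between min_replicas and
    # max_replicas based on per-replica load.
    # max_replicas: 3
    # target_qps_per_replica: 2.5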
  readiness_probe:
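    # vLLM's OpenAI-compatible server exposes a /health endpoint once the
    # model is loaded; the long initial delay leaves time to download and
    # load the 8B weights on first start.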
    initial_delay_seconds: 1800
    path: /health

resources:
  accelerators: A100:1
  ports: 8000

setup: |
  pip install vllm vllm-flash-attn

run: |
  vllm serve $MODEL_NAME --tensor-parallel-size 1 --port 8000
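
# To deploy (assuming this file is saved as service.yaml; any filename works):
#   sky serve up service.yaml
# Then check replica status and the service endpoint with:
#   sky serve status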