# Serving config for Meta-Llama-3-8B-Instruct via vLLM on one A100 GPU.
# NOTE(review): the envs/service/resources/setup/run layout matches a SkyPilot
# serving task — confirm against the SkyPilot YAML schema of the version in use.
envs:
  # Model repo ID. NOTE(review): defined here but the `run` command below
  # hardcodes the same string — consider `vllm serve $MODEL_NAME` so the two
  # cannot drift apart (verify env propagation with the consuming tool).
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  # Hugging Face access token; the Llama-3 repo is gated, so a valid token is
  # needed to download weights. Replace the placeholder before launching and
  # never commit a real token to version control.
  HF_TOKEN: <REPLACE_WITH_YOUR_HUGGINGFACE_TOKEN>
service:
  replica_policy:
    # By not specifying max_replicas and target_qps_per_second, the service
    # will always run on exactly 1 replica, with no autoscaling.
    min_replicas: 1
  readiness_probe:
    # 30-minute grace period before health checks count: the first start must
    # download the model weights and load them onto the GPU, which can take a
    # long time. Probe hits /health (served by `vllm serve`).
    initial_delay_seconds: 1800
    path: /health
resources:
  # One NVIDIA A100 per replica; matches --tensor-parallel-size 1 in `run`.
  gpus: A100:1
  # Exposed port — must stay in sync with the --port flag in `run` below.
  ports: 8000
setup: |
  pip install vllm vllm-flash-attn
run: |
  vllm serve meta-llama/Meta-Llama-3-8B-Instruct --tensor-parallel-size 1 --port 8000