Komodo provides an intuitive YAML interface to specify a job/machine/service based on the SkyPilot YAML specification. A task YAML is used to specify resource requirements, setup commands, run commands, file mounts, storage mounts, and so on.

Available fields:

# Working directory (optional), synced to ~/sky_workdir on the remote cluster
# each time launch or exec is run with the YAML file.
#
# Commands in "setup" and "run" will be executed under this directory.
#
# If a relative path is used, it's evaluated relative to the location from
# which `komo <cmd> launch` is called.
#
# The workdir must be less than 1GB; otherwise your task will fail to launch.
# For larger files, store them in a cloud storage bucket and see the
# file_mounts section below.
workdir: ~/my-task-code

# Number of nodes to launch (optional; defaults to 1).
num_nodes: 1

# Per-node resource requirements (optional).
resources:
  # The region to use (optional). Specifying a region disables
  # auto-failover.
  region: us-east-1

  # Accelerator name and count per node (optional).
  #
  # There are three valid ways to specify accelerators for a cluster:
  #
  #   1. A single type of accelerator:
  #        Format: <name>:<count> (or simply <name>, short for a count of 1).
  #        accelerators: V100:4
  #
  #   2. An ordered list of accelerators (the accelerators are tried in
  #      the specified order):
  #        Format: [<name>:<count>, ...]
  #        accelerators: ['K80:1', 'V100:1', 'T4:1']
  #
  #   3. An unordered set of accelerators (all specified accelerators are
  #      optimized together, and the lowest-cost accelerator is tried first):
  #        Format: {<name>:<count>, ...}
  #        accelerators: {'K80:1', 'V100:1', 'T4:1'}
  accelerators: V100:4

  # Ports to expose (optional).
  #
  # All ports specified here will be exposed to the public Internet.
  #
  # Can be an integer, a range, or a list of integers and ranges:
  #   To specify a single port:
  #     ports: 8081
  #   To specify a port range:
  #     ports: 10052-10100
  #   To specify multiple ports / port ranges:
  #     ports:
  #       - 8080
  #       - 10022-10040
  ports: 8081

  # Number of vCPUs per node (optional).
  #
  # Format:
  #   <count>: exactly <count> vCPUs
  #   <count>+: at least <count> vCPUs
  #
  # E.g., 4+ means: first try to find an instance type with >= 4 vCPUs. If
  # not found, use the next cheapest instance with more than 4 vCPUs.
  cpus: 4+

  # Memory in GiB per node (optional).
  #
  # Format:
  #   <num>: exactly <num> GiB
  #   <num>+: at least <num> GiB
  #
  # E.g., 32+ means: first try to find an instance type with >= 32 GiB. If
  # not found, use the next cheapest instance with more than 32 GiB.
  memory: 32+

  # Instance type to use (optional). If 'accelerators' is specified,
  # the corresponding instance type is inferred automatically.
  instance_type: p3.8xlarge

  ###########################################################################
  ### ALL FIELDS BELOW IN THIS RESOURCES SECTION ARE ONLY APPLICABLE WHEN ###
  ### CONNECTING YOUR OWN CLOUD                                           ###
  ###########################################################################

  # The cloud to use (optional). If not specified, Komodo Cloud will be used.
  cloud: aws

  # The zone to use (optional). Specifying a zone disables
  # auto-failover.
  zone: us-east-1a

  # Whether the cluster should use spot instances (optional).
  # If unspecified, defaults to false (on-demand instances).
  # NOTE: spot instances are not currently supported, so this must be set to
  # false.
  use_spot: false

  # Disk size in GB to allocate for the OS disk (mounted at /). Increase this
  # if you have a large working directory or tasks that write out large outputs.
  disk_size: 256

  # Disk tier to use for OS (optional).
  # Can be one of 'low', 'medium', 'high' or 'best' (default: 'medium').
  # If 'best' is specified, the best disk tier enabled is used.
  # Rough performance estimate:
  #   low: 500 IOPS; read 20 MB/s; write 40 MB/s
  #   medium: 3000 IOPS; read 220 MB/s; write 200 MB/s
  #   high: 6000 IOPS; read 340 MB/s; write 250 MB/s
  disk_tier: medium

  # Additional accelerator metadata (optional); only used for TPU node
  # and TPU VM.
  # Example usage:
  #
  #   To request a TPU VM:
  #     accelerator_args:
  #       tpu_vm: True  # optional; default: True
  #
  #   To request a TPU node:
  #     accelerator_args:
  #       tpu_name: ...
  #       tpu_vm: False
  #
  # By default, the value for "runtime_version" is decided based on which of
  # the two (TPU VM or TPU node) is requested and should work for either case.
  # If an incompatible version is passed in, GCP will throw an error during
  # provisioning.
  accelerator_args:
    # Default is "tpu-vm-base" for TPU VM and "2.12.0" for TPU node.
    runtime_version: tpu-vm-base
    # Further keys of accelerator_args, shown commented out:
    # tpu_name: mytpu
    # tpu_vm: True  # True to use TPU VM (the default); False to use TPU node.

  # Custom image id (optional, advanced). The image id used to boot the
  # instances. Only supported for AWS and GCP (for non-docker images). If not
  # specified, Komodo will use the default debian-based image suitable for
  # machine learning tasks.
  #
  # Docker support
  # You can specify a docker image to use by setting the image_id to
  # `docker:<image name>`; this is only supported on Azure, AWS and GCP.
  # For example,
  #   image_id: docker:ubuntu:latest
  # Currently, only debian and ubuntu images are supported.
  # If you want to use a docker image in a private registry, you can specify your
  # username, password, and registry server as task environment variables. For
  # details, please refer to the `envs` section below.
  #
  # AWS
  # To find AWS AMI ids: https://leaherb.com/how-to-find-an-aws-marketplace-ami-image-id
  # You can also change the default OS version by choosing from the
  # following image tags provided by SkyPilot:
  #   image_id: skypilot:gpu-ubuntu-2004
  #   image_id: skypilot:k80-ubuntu-2004
  #   image_id: skypilot:gpu-ubuntu-1804
  #   image_id: skypilot:k80-ubuntu-1804
  #
  # It is also possible to specify a per-region image id (failover will only
  # go through the regions specified as keys; useful when you have
  # custom images in multiple regions):
  #   image_id:
  #     us-east-1: ami-0729d913a335efca7
  #     us-west-2: ami-050814f384259894c
  image_id: ami-0868a20f5a3bf9702
  #
  # GCP
  # To find GCP images: https://cloud.google.com/compute/docs/images
  # image_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310
  # Or use a machine image: https://cloud.google.com/compute/docs/machine-images
  # image_id: projects/my-project/global/machineImages/my-machine-image

  # Labels to apply to the instances (optional).
  #
  # If specified, these labels will be applied to the VMs or pods created
  # by Komodo Control Plane. These are useful for assigning metadata that may be
  # used by external tools. The implementation depends on the chosen cloud:
  # on AWS, labels map to instance tags; on GCP, labels map to instance
  # labels; on Kubernetes, labels map to pod labels. On other clouds,
  # labels are not supported and will be ignored.
  labels:
    my-label: my-value

# Environment variables (optional). These values can be accessed in the
# `file_mounts`, `setup`, and `run` sections below, e.g. as ${MY_BUCKET}.
envs:
  MY_BUCKET: s3://my-s3-bucket
  MY_LOCAL_PATH: tmp-workdir
  MODEL_SIZE: 13b

# File mounts. Each key is a path on the remote machine; its `source` is the
# cloud storage bucket made available at that path.
file_mounts:
  # Mounts an S3 bucket at /datasets-storage
  /datasets-storage:
    source: s3://my-s3-bucket
    mode: MOUNT  # Either MOUNT or COPY. Defaults to MOUNT. Optional.

  # Demoing env var usage.
  /mydir:
    source: ${MY_BUCKET}  # The bucket, taken from MY_BUCKET in `envs` above.
    mode: MOUNT

# Setup script (optional) to execute on every `komo <cmd> launch`.
# This is executed before the 'run' commands.
#
# The '|' indicator starts a multiline string. To specify a single command:
#   setup: pip install -r requirements.txt
setup: |
  echo "Begin setup."
  pip install -r requirements.txt
  echo "Setup complete."

# Start a Jupyter notebook on the machine (default is False). It will be
# started on port 8888.
# NOTE: this only applies to machines, and will be ignored for jobs/services.
notebook: true

#############################################################################
# The following sections ONLY apply to jobs and services
#############################################################################

# Main program. The '|' indicator starts a multiline string, so every line
# below (including the '#' line) is passed through as part of the script.
run: |
  echo "Beginning task."
  python train.py

  # Demoing env var usage.
  echo Env var MODEL_SIZE has value: ${MODEL_SIZE}

#############################################################################
# The following sections ONLY apply to services
#############################################################################

service:

  # Readiness probe (required). Used by Komodo Load Balancer to check whether
  # your service replicas are ready to accept traffic. If the readiness probe
  # returns a 200, Komodo Load Balancer will start routing traffic to that
  # replica.
  readiness_probe:
    # Path to probe (required).
    path: /v1/models
    # Post data (optional). If this is specified, the readiness probe will use
    # POST instead of GET, and the post data will be sent as the request body.
    post_data: {'model_name': 'model'}
    # Initial delay in seconds (optional). Defaults to 1200 seconds (20 minutes).
    # Any readiness probe failures during this period will be ignored. The right
    # value depends heavily on your service, so it is recommended to set it
    # based on your service's startup time.
    initial_delay_seconds: 1200

  # Replica autoscaling policy. This describes how Komodo Load Balancer autoscales
  # your service based on the QPS (queries per second) of your service.
  replica_policy:
    # Minimum number of replicas (required).
    min_replicas: 1
    # Maximum number of replicas (optional). If not specified, Komodo Load Balancer will
    # use a fixed number of replicas (the same as min_replicas) and ignore
    # any QPS threshold specified below.
    max_replicas: 3
    # The following specs describe the autoscaling policy.
    # Target queries per second per replica (optional). Komodo Load Balancer will scale
    # your service so that, ultimately, each replica handles approximately
    # target_qps_per_replica queries per second.
    target_qps_per_replica: 5
    # Upscale and downscale delay in seconds (optional). Defaults to 300 seconds
    # (5 minutes) and 1200 seconds (20 minutes) respectively. To avoid aggressive
    # autoscaling, Komodo Load Balancer will only upscale or downscale your service if the
    # QPS of your service is higher or lower than the target QPS for a period
    # of time. This period of time is controlled by upscale_delay_seconds and
    # downscale_delay_seconds. The default values should work in most cases.
    # If you want to scale your service more aggressively, you can set
    # these values to a smaller number.
    upscale_delay_seconds: 300
    downscale_delay_seconds: 1200