/
kube_job_hmax.yaml
45 lines (45 loc) · 1.65 KB
/
kube_job_hmax.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
apiVersion: batch/v1
kind: Job
metadata:
  name: selfsup-tpu
spec:
  # parallelism: 8  # Matches the number of preemptible TPUs; this is the queue size.
  template:
    metadata:
      annotations:
        # The Cloud TPUs that will be created for this Job will support
        # TensorFlow 1.15.2. This version MUST match the TensorFlow
        # version that your model is built on.
        tf-version.cloud-tpus.google.com: "1.15.2"
    spec:
      securityContext:
        # Run as root so the container can clone the repo and pip-install
        # into system paths at startup.
        runAsUser: 0
      restartPolicy: OnFailure
      containers:
        - name: resnet-tpu
          # Kubeflow TensorFlow 1.15.2 notebook image (GPU variant).
          # The official TensorFlow image would also work:
          # https://hub.docker.com/r/tensorflow/tensorflow
          # image: tensorflow/tensorflow:1.15.2
          image: gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
          command: ["/bin/sh"]
          args:
            - -c
            # Folded block scalar (>-): the && chain below is passed to the
            # shell as a single command line.
            - >-
              CHANNELS=32 &&
              TD_LOSS=ar &&
              BU_LOSS=ar &&
              MASK=0 &&
              git clone https://github.com/serre-lab/prj_selfsup.git &&
              cd prj_selfsup &&
              pip3 install --upgrade pip &&
              pip3 install -r requirements.txt &&
              pip3 install -U tpunicorn &&
              export PATH="$HOME/.local/bin:$PATH" &&
              pu list &&
              bash jobs/hmax_supervised.sh $CHANNELS $TD_LOSS $BU_LOSS $MASK $(KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS)
          resources:
            limits:
              # Request a preemptible v3 Cloud TPU slice with 256 cores.
              # NOTE(review): the previous comment described a single v2-8
              # device (8 cores), which does not match this request —
              # confirm the intended slice size before deploying.
              cloud-tpus.google.com/preemptible-v3: 256