/
kube_job_hmax.yaml
45 lines (45 loc) · 1.65 KB
/
kube_job_hmax.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
apiVersion: batch/v1
kind: Job
metadata:
  name: selfsup-tpu
spec:
  # parallelism: 8  # Matches the number of preemptible TPUs; this is the queue size.
  template:
    metadata:
      annotations:
        # The Cloud TPUs that will be created for this Job will support
        # TensorFlow 1.15.2. This version MUST match the TensorFlow
        # version that your model is built on.
        tf-version.cloud-tpus.google.com: "1.15.2"
    spec:
      securityContext:
        # Run as root so the container can clone the repo and pip-install
        # into system paths at startup.
        runAsUser: 0
      restartPolicy: OnFailure
      containers:
        - name: resnet-tpu
          # Kubeflow TensorFlow 1.15.2 notebook image (GPU variant).
          # The official TensorFlow image would also work:
          # https://hub.docker.com/r/tensorflow/tensorflow
          # image: tensorflow/tensorflow:1.15.2
          image: gcr.io/kubeflow-images-public/tensorflow-1.15.2-notebook-gpu:1.0.0
          command: ["/bin/sh"]
          args:
            - -c
            # Folded block scalar (>-): the && chain below is passed to the
            # shell as a single command line.
            - >-
              CHANNELS=32 &&
              TD_LOSS=ar &&
              BU_LOSS=ar &&
              MASK=0 &&
              git clone https://github.com/serre-lab/prj_selfsup.git &&
              cd prj_selfsup &&
              pip3 install --upgrade pip &&
              pip3 install -r requirements.txt &&
              pip3 install -U tpunicorn &&
              export PATH="$HOME/.local/bin:$PATH" &&
              pu list &&
              bash jobs/hmax_supervised.sh $CHANNELS $TD_LOSS $BU_LOSS $MASK $(KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS)
          resources:
            limits:
              # Request a preemptible v3 Cloud TPU slice with 256 cores.
              # NOTE(review): the previous comment described a single v2-8
              # device (8 cores), which does not match this request —
              # confirm the intended slice size before deploying.
              cloud-tpus.google.com/preemptible-v3: 256