Skip to content

Commit

Permalink
Create multi-arch (amd64,arm64) images. (#41)
Browse files Browse the repository at this point in the history
* Create multi-arch (amd64,arm64) images.

* Create 'local' build target for easy 'docker images' integration

Signed-off-by: Douglas Wightman <dwightman@nvidia.com>
  • Loading branch information
glowkey committed Jan 30, 2022
1 parent 8c01dfc commit 6bfdd1a
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 33 deletions.
27 changes: 20 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

DOCKER ?= docker
MKDIR ?= mkdir
REGISTRY ?= nvidia

DCGM_VERSION := 2.3.2
GOLANG_VERSION := 1.17
VERSION := 2.6.2
VERSION := 2.6.3
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/tmp/dcgm-exporter.tar
PLATFORMS := linux/amd64,linux/arm64
DOCKERCMD := docker buildx build

NON_TEST_FILES := pkg/dcgmexporter/dcgm.go pkg/dcgmexporter/gpu_collector.go pkg/dcgmexporter/parser.go
NON_TEST_FILES += pkg/dcgmexporter/pipeline.go pkg/dcgmexporter/server.go pkg/dcgmexporter/system_info.go
NON_TEST_FILES += pkg/dcgmexporter/types.go pkg/dcgmexporter/utils.go pkg/dcgmexporter/kubernetes.go
NON_TEST_FILES += cmd/dcgm-exporter/main.go
MAIN_TEST_FILES := pkg/dcgmexporter/system_info_test.go

.PHONY: all binary install check-format
.PHONY: all binary install check-format local
all: ubuntu20.04 ubi8

binary:
Expand All @@ -46,18 +48,29 @@ check-format:
test $$(gofmt -l cmd | tee /dev/stderr | wc -l) -eq 0

push:
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04"
$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8"
$(MAKE) ubuntu20.04 OUTPUT=type=registry
$(MAKE) ubi8 OUTPUT=type=registry

# Build the default image set for the host's own architecture only and load
# the result into the local Docker image store (visible in 'docker images').
#
# The host architecture is read with `uname -m` rather than `uname -p`:
# `-p` is non-portable and prints "unknown" on many Linux distributions,
# which would silently select linux/amd64 on an arm64 host. Both spellings
# of 64-bit ARM ("aarch64" and "arm64") map to linux/arm64; every other
# value falls back to linux/amd64.
#
# DOCKERCMD is switched to plain 'docker build' and OUTPUT to type=docker for
# the recursive invocation, which runs the default goal with those overrides.
local:
ifneq ($(filter aarch64 arm64,$(shell uname -m)),)
	$(MAKE) PLATFORMS=linux/arm64 OUTPUT=type=docker DOCKERCMD='docker build'
else
	$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
endif

ubuntu20.04:
$(DOCKER) build --pull \
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--platform $(PLATFORMS) \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \
--file docker/Dockerfile.ubuntu20.04 .

ubi8:
$(DOCKER) build --pull \
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--platform $(PLATFORMS) \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--build-arg "VERSION=$(FULL_VERSION)" \
Expand Down
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https

To gather metrics on a GPU node, simply start the `dcgm-exporter` container:
```
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.2-ubuntu20.04
$ docker run -d --gpus all --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.3-ubuntu20.04
$ curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down Expand Up @@ -135,6 +135,26 @@ You will also find the `json` file on this repo under `grafana/dcgm-exporter-das

Pull requests are accepted!


### Building the containers

This project uses [docker buildx](https://docs.docker.com/buildx/working-with-buildx/) for multi-arch image creation. Follow the instructions on that page to get a working builder instance for creating these containers. Some other useful build options follow.

Build images for the host machine's architecture only and make them available in `docker images`:
```
make local
```

Build only the Ubuntu 20.04 image for `linux/amd64` and export it to `docker images`:
```
make ubuntu20.04 PLATFORMS=linux/amd64 OUTPUT=type=docker
```

Build the images and push them to a different registry (replace `<private_registry>` with the registry's address):
```
make REGISTRY=<private_registry> push
```

## Issues and Contributing

[Check out the Contributing document!](CONTRIBUTING.md)
Expand Down
12 changes: 6 additions & 6 deletions dcgm-exporter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
spec:
updateStrategy:
type: RollingUpdate
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
template:
metadata:
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
name: "dcgm-exporter"
spec:
containers:
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.2-ubuntu20.04"
- image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.3-ubuntu20.04"
env:
- name: "DCGM_EXPORTER_LISTEN"
value: ":9400"
Expand Down Expand Up @@ -64,11 +64,11 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
spec:
selector:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
ports:
- name: "metrics"
port: 9400
4 changes: 2 additions & 2 deletions deployment/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
apiVersion: v2
name: dcgm-exporter
description: A Helm chart for DCGM exporter
version: "2.6.2"
version: "2.6.3"
kubeVersion: ">= 1.13.0-0"
appVersion: "2.6.2"
appVersion: "2.6.3"
sources:
- https://github.com/nvidia/dcgm-exporter
home: https://github.com/nvidia/dcgm-exporter/
Expand Down
2 changes: 1 addition & 1 deletion deployment/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ image:
pullPolicy: IfNotPresent
# Image tag defaults to AppVersion, but you can use the tag key
# for the image tag, e.g:
tag: 2.3.2-2.6.2-ubuntu20.04
tag: 2.3.2-2.6.3-ubuntu20.04

# Comment the following line to stop profiling metrics from DCGM
arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
Expand Down
7 changes: 3 additions & 4 deletions docker/Dockerfile.ubi8
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@ COPY . .

RUN make binary check-format

FROM nvcr.io/nvidia/cuda:11.4.2-base-ubi8
FROM nvcr.io/nvidia/cuda:11.6.0-base-ubi8
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"

ARG DCGM_VERSION
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
dnf clean expire-cache
RUN dnf install -y datacenter-gpu-manager-${DCGM_VERSION}

RUN dnf clean expire-cache && dnf install -y datacenter-gpu-manager-${DCGM_VERSION} libcap

COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
COPY etc /etc/dcgm-exporter
Expand Down
12 changes: 2 additions & 10 deletions docker/Dockerfile.ubuntu20.04
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,15 @@ COPY . .

RUN make binary check-format

FROM nvcr.io/nvidia/cuda:11.4.2-base-ubuntu20.04
FROM nvcr.io/nvidia/cuda:11.6.0-base-ubuntu20.04
LABEL io.k8s.display-name="NVIDIA DCGM Exporter"

COPY --from=builder /go/src/github.com/NVIDIA/dcgm-exporter/cmd/dcgm-exporter/dcgm-exporter /usr/bin/
COPY etc /etc/dcgm-exporter

ARG DCGM_VERSION
RUN apt-get update && apt-get install -y --no-install-recommends \
libcap2-bin gnupg2 curl ca-certificates && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get purge --autoremove -y curl \
&& rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install -y --no-install-recommends \
datacenter-gpu-manager=1:${DCGM_VERSION} && apt-get purge --autoremove -y openssl
datacenter-gpu-manager=1:${DCGM_VERSION} libcap2-bin && apt-get purge --autoremove -y openssl

# Required for DCP metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
Expand Down
4 changes: 2 additions & 2 deletions service-monitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ metadata:
name: "dcgm-exporter"
labels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "dcgm-exporter"
app.kubernetes.io/version: "2.6.2"
app.kubernetes.io/version: "2.6.3"
endpoints:
- port: "metrics"
path: "/metrics"

0 comments on commit 6bfdd1a

Please sign in to comment.