Skip to content

Commit

Permalink
Merge pull request #54 from jmhardison/add_grafana_controller
Browse files Browse the repository at this point in the history
Add grafana controller
  • Loading branch information
jmhardison committed Nov 11, 2023
2 parents 8892e1c + e09056a commit d9ca728
Show file tree
Hide file tree
Showing 15 changed files with 2,440 additions and 289 deletions.
2 changes: 1 addition & 1 deletion bootstrap/01_certmanager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: Kustomization
#namespace: cert-manager

bases:
- https://github.com/cert-manager/cert-manager/releases/download/v1.12.2/cert-manager.yaml
- https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml

resources:
- hhouse-cloudflare-api-token-secret_secrets.yaml
Expand Down
1,967 changes: 1,924 additions & 43 deletions bootstrap/03_argocd/base/ha-install.yaml

Large diffs are not rendered by default.

76 changes: 76 additions & 0 deletions core/grafana/agent.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
---
# Identity for the Grafana Agent. The same name/namespace pair is used as the
# ClusterRoleBinding subject and as the GrafanaAgent's serviceAccountName
# further down in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: grafana-agent
  namespace: grafana-agents
---
# Cluster-wide, read-only role for the Grafana Agent: get/list/watch on the
# resources listed below, plus GET on the two metrics URLs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: grafana-agent
rules:
  # Core ("") API group resources.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
      - events
    verbs:
      - get
      - list
      - watch
  # Ingress objects from the networking.k8s.io group.
  - apiGroups:
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Non-resource HTTP paths; only GET is granted.
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
    verbs:
      - get
---
# Binds the grafana-agent ClusterRole to the grafana-agent ServiceAccount in
# the grafana-agents namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: grafana-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: grafana-agent
subjects:
  - kind: ServiceAccount
    name: grafana-agent
    namespace: grafana-agents
---
# Root GrafanaAgent resource. Integrations, metrics instances and logs
# instances are all selected by the same label, `agent: grafana-agent`.
apiVersion: monitoring.grafana.com/v1alpha1
kind: GrafanaAgent
metadata:
  name: grafana-agent
  namespace: grafana-agents
spec:
  image: grafana/agent:v0.35.3
  integrations:
    selector:
      matchLabels:
        agent: grafana-agent
  metrics:
    # Attached as an external label (cluster=hhk8s) on the metrics subsystem.
    externalLabels:
      cluster: hhk8s
    instanceSelector:
      matchLabels:
        agent: grafana-agent
    scrapeInterval: 60s
  logs:
    instanceSelector:
      matchLabels:
        agent: grafana-agent
  serviceAccountName: grafana-agent
132 changes: 132 additions & 0 deletions core/grafana/customcollect.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# node_exporter integration for the Grafana Agent. Host /, /sys and /proc are
# mounted under /host/* and the exporter is pointed at those mount paths.
apiVersion: monitoring.grafana.com/v1alpha1
kind: Integration
metadata:
  labels:
    # Matches the GrafanaAgent integrations selector (agent: grafana-agent).
    agent: grafana-agent
  name: node-exporter
  namespace: grafana-agents
spec:
  config:
    autoscrape:
      enable: true
      # <namespace>/<name> of the MetricsInstance that receives the samples.
      metrics_instance: grafana-agents/grafana-agent-metrics
    # Absolute path, matching the procfs volumeMount below. The previous
    # relative value `host/proc` only resolved correctly if the process
    # working directory happened to be `/`.
    procfs_path: /host/proc
    rootfs_path: /host/root
    sysfs_path: /host/sys
  name: node_exporter
  type:
    allNodes: true
    unique: true
  volumeMounts:
    - mountPath: /host/root
      name: rootfs
    - mountPath: /host/sys
      name: sysfs
    - mountPath: /host/proc
      name: procfs
  volumes:
    - hostPath:
        path: /
      name: rootfs
    - hostPath:
        path: /sys
      name: sysfs
    - hostPath:
        path: /proc
      name: procfs
---
# Metrics pipeline: picks up PodMonitors/ServiceMonitors labelled
# `instance: primary` from any namespace (empty namespace selectors) and
# remote-writes to the Grafana Cloud Prometheus endpoint below.
apiVersion: monitoring.grafana.com/v1alpha1
kind: MetricsInstance
metadata:
  labels:
    # Matches the GrafanaAgent metrics.instanceSelector.
    agent: grafana-agent
  name: grafana-agent-metrics
  namespace: grafana-agents
spec:
  podMonitorNamespaceSelector: {}
  podMonitorSelector:
    matchLabels:
      instance: primary
  remoteWrite:
    # Basic-auth credentials are read from the `metrics-secret` Secret
    # (keys `username` and `password`).
    # NOTE(review): presumably materialised by the OnePasswordItem of the
    # same name; confirm the 1Password item exposes both keys.
    - basicAuth:
        password:
          key: password
          name: metrics-secret
        username:
          key: username
          name: metrics-secret
      url: https://prometheus-prod-10-prod-us-central-0.grafana.net/api/prom/push
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector:
    matchLabels:
      instance: primary
---
# Scrapes /metrics/cadvisor over HTTPS from Services labelled
# `app.kubernetes.io/name: kubelet` in any namespace, keeps only the metric
# names in the allow-list regex, and sets job=integrations/kubernetes/cadvisor.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    # Matches the MetricsInstance serviceMonitorSelector.
    instance: primary
  name: cadvisor-monitor
  namespace: grafana-agents
spec:
  endpoints:
    - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      honorLabels: true
      interval: 60s
      metricRelabelings:
        # Allow-list of metric names. Kept as ONE quoted scalar on a single
        # line: splitting a plain scalar across lines makes YAML fold the
        # break into a space, which corrupts the metric name at the split.
        - action: keep
          regex: 'kube_pod_container_status_waiting_reason|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|kube_daemonset_status_desired_number_scheduled|kubelet_certificate_manager_client_ttl_seconds|kube_node_status_condition|kubelet_node_config_error|rest_client_requests_total|kube_horizontalpodautoscaler_status_current_replicas|node_namespace_pod_container:container_memory_cache|kubelet_cgroup_manager_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_pod_info|kubelet_pod_worker_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kube_job_failed|kube_replicaset_owner|namespace_workload_pod:kube_pod_owner:relabel|kubelet_runtime_operations_errors_total|volume_manager_total_volumes|kubelet_server_expiration_renew_errors|container_memory_rss|container_memory_working_set_bytes|kubelet_running_container_count|container_fs_writes_bytes_total|namespace_cpu:kube_pod_container_resource_requests:sum|kubelet_running_pod_count|kube_statefulset_status_replicas_updated|kube_job_status_active|kube_node_status_capacity|kubelet_volume_stats_inodes|kube_statefulset_status_replicas|kube_deployment_status_replicas_updated|kube_node_status_allocatable|kube_statefulset_status_replicas_ready|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_pod_worker_duration_seconds_bucket|kubelet_runtime_operations_total|kube_horizontalpodautoscaler_spec_max_replicas|kube_statefulset_status_current_revision|node_namespace_pod_container:container_memory_rss|kubelet_pleg_relist_duration_seconds_count|kube_daemonset_status_updated_number_scheduled|kube_horizontalpodautoscaler_spec_min_replicas|container_cpu_usage_seconds_total|kubelet_node_name|kubelet_certificate_manager_client_expiration_renew_errors|kube_pod_owner|container_network_transmit_packets_dropped_total|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_persistentvolumeclaim_resource_requests_storage_bytes|storage_operation_errors_total|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_bucket|go_goroutines|kube_statefulset_status_observed_generation|container_fs_reads_total|container_cpu_cfs_periods_total|kubelet_running_containers|kube_daemonset_status_number_misscheduled|container_network_receive_packets_total|kube_node_info|kube_namespace_status_phase|process_resident_memory_bytes|kube_pod_status_phase|container_network_transmit_packets_total|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_memory_cache|kubelet_running_pods|kube_job_status_start_time|kube_node_spec_taint|container_network_receive_packets_dropped_total|kube_pod_container_resource_requests|kubelet_volume_stats_available_bytes|node_filesystem_avail_bytes|kube_statefulset_metadata_generation|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_daemonset_status_number_available|namespace_cpu:kube_pod_container_resource_limits:sum|container_fs_reads_bytes_total|kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|process_cpu_seconds_total|container_network_receive_bytes_total|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_inodes_used|kube_statefulset_replicas|kube_statefulset_status_update_revision|kube_deployment_status_replicas_available|kube_deployment_metadata_generation|kubernetes_build_info|namespace_memory:kube_pod_container_resource_limits:sum|namespace_workload_pod|kube_pod_status_reason|kube_deployment_status_observed_generation|container_memory_swap|kube_deployment_spec_replicas|node_filesystem_size_bytes|kubelet_pod_start_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|machine_memory_bytes|kube_resourcequota|container_cpu_cfs_throttled_periods_total|container_network_transmit_bytes_total|storage_operation_duration_seconds_count|kubelet_pod_start_duration_seconds_count|kubelet_certificate_manager_server_ttl_seconds|kube_namespace_status_phase|container_cpu_usage_seconds_total|kube_pod_status_phase|kube_pod_start_time|kube_pod_container_status_restarts_total|kube_pod_container_info|kube_pod_container_status_waiting_reason|kube_daemonset.*|kube_replicaset.*|kube_statefulset.*|kube_job.*|kube_node.*|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|namespace_cpu:kube_pod_container_resource_requests:sum|node_cpu.*|node_memory.*|node_filesystem.*|node_network_transmit_bytes_total'
          sourceLabels:
            - __name__
      path: /metrics/cadvisor
      port: https-metrics
      relabelings:
        # Copy the scrape path into a regular `metrics_path` label.
        - sourceLabels:
            - __metrics_path__
          targetLabel: metrics_path
        # Force a fixed job label for these series.
        - action: replace
          replacement: integrations/kubernetes/cadvisor
          targetLabel: job
      scheme: https
      tlsConfig:
        # Server certificate verification is disabled for this endpoint.
        insecureSkipVerify: true
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
---
# Scrapes /metrics over HTTPS from Services labelled
# `app.kubernetes.io/name: kubelet` in any namespace, keeps only the metric
# names in the allow-list regex, and sets job=integrations/kubernetes/kubelet.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    # Matches the MetricsInstance serviceMonitorSelector.
    instance: primary
  name: kubelet-monitor
  namespace: grafana-agents
spec:
  endpoints:
    - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      honorLabels: true
      interval: 60s
      metricRelabelings:
        # Allow-list of metric names. Kept as ONE quoted scalar on a single
        # line: splitting a plain scalar across lines makes YAML fold the
        # break into a space, which corrupts the metric name at the split.
        - action: keep
          regex: 'kube_pod_container_status_waiting_reason|cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits|kube_daemonset_status_desired_number_scheduled|kubelet_certificate_manager_client_ttl_seconds|kube_node_status_condition|kubelet_node_config_error|rest_client_requests_total|kube_horizontalpodautoscaler_status_current_replicas|node_namespace_pod_container:container_memory_cache|kubelet_cgroup_manager_duration_seconds_count|kube_horizontalpodautoscaler_status_desired_replicas|container_fs_writes_total|kube_daemonset_status_current_number_scheduled|kube_pod_info|kubelet_pod_worker_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kube_job_failed|kube_replicaset_owner|namespace_workload_pod:kube_pod_owner:relabel|kubelet_runtime_operations_errors_total|volume_manager_total_volumes|kubelet_server_expiration_renew_errors|container_memory_rss|container_memory_working_set_bytes|kubelet_running_container_count|container_fs_writes_bytes_total|namespace_cpu:kube_pod_container_resource_requests:sum|kubelet_running_pod_count|kube_statefulset_status_replicas_updated|kube_job_status_active|kube_node_status_capacity|kubelet_volume_stats_inodes|kube_statefulset_status_replicas|kube_deployment_status_replicas_updated|kube_node_status_allocatable|kube_statefulset_status_replicas_ready|node_namespace_pod_container:container_memory_working_set_bytes|kubelet_pod_worker_duration_seconds_bucket|kubelet_runtime_operations_total|kube_horizontalpodautoscaler_spec_max_replicas|kube_statefulset_status_current_revision|node_namespace_pod_container:container_memory_rss|kubelet_pleg_relist_duration_seconds_count|kube_daemonset_status_updated_number_scheduled|kube_horizontalpodautoscaler_spec_min_replicas|container_cpu_usage_seconds_total|kubelet_node_name|kubelet_certificate_manager_client_expiration_renew_errors|kube_pod_owner|container_network_transmit_packets_dropped_total|node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile|kube_persistentvolumeclaim_resource_requests_storage_bytes|storage_operation_errors_total|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_bucket|go_goroutines|kube_statefulset_status_observed_generation|container_fs_reads_total|container_cpu_cfs_periods_total|kubelet_running_containers|kube_daemonset_status_number_misscheduled|container_network_receive_packets_total|kube_node_info|kube_namespace_status_phase|process_resident_memory_bytes|kube_pod_status_phase|container_network_transmit_packets_total|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|container_memory_cache|kubelet_running_pods|kube_job_status_start_time|kube_node_spec_taint|container_network_receive_packets_dropped_total|kube_pod_container_resource_requests|kubelet_volume_stats_available_bytes|node_filesystem_avail_bytes|kube_statefulset_metadata_generation|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|cluster:namespace:pod_memory:active:kube_pod_container_resource_requests|cluster:namespace:pod_memory:active:kube_pod_container_resource_limits|kube_daemonset_status_number_available|namespace_cpu:kube_pod_container_resource_limits:sum|container_fs_reads_bytes_total|kube_pod_container_resource_limits|node_namespace_pod_container:container_memory_swap|process_cpu_seconds_total|container_network_receive_bytes_total|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_inodes_used|kube_statefulset_replicas|kube_statefulset_status_update_revision|kube_deployment_status_replicas_available|kube_deployment_metadata_generation|kubernetes_build_info|namespace_memory:kube_pod_container_resource_limits:sum|namespace_workload_pod|kube_pod_status_reason|kube_deployment_status_observed_generation|container_memory_swap|kube_deployment_spec_replicas|node_filesystem_size_bytes|kubelet_pod_start_duration_seconds_bucket|namespace_memory:kube_pod_container_resource_requests:sum|machine_memory_bytes|kube_resourcequota|container_cpu_cfs_throttled_periods_total|container_network_transmit_bytes_total|storage_operation_duration_seconds_count|kubelet_pod_start_duration_seconds_count|kubelet_certificate_manager_server_ttl_seconds|kube_namespace_status_phase|container_cpu_usage_seconds_total|kube_pod_status_phase|kube_pod_start_time|kube_pod_container_status_restarts_total|kube_pod_container_info|kube_pod_container_status_waiting_reason|kube_daemonset.*|kube_replicaset.*|kube_statefulset.*|kube_job.*|kube_node.*|node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate|cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests|namespace_cpu:kube_pod_container_resource_requests:sum|node_cpu.*|node_memory.*|node_filesystem.*|node_network_transmit_bytes_total'
          sourceLabels:
            - __name__
      path: /metrics
      port: https-metrics
      relabelings:
        # Copy the scrape path into a regular `metrics_path` label.
        - sourceLabels:
            - __metrics_path__
          targetLabel: metrics_path
        # Force a fixed job label for these series.
        - action: replace
          replacement: integrations/kubernetes/kubelet
          targetLabel: job
      scheme: https
      tlsConfig:
        # Server certificate verification is disabled for this endpoint.
        insecureSkipVerify: true
  namespaceSelector:
    any: true
  selector:
    matchLabels:
      app.kubernetes.io/name: kubelet
34 changes: 34 additions & 0 deletions core/grafana/eventcollect.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
---
# 1Gi ReadWriteOnce claim; mounted by the agent-eventhandler Integration in
# this file at /etc/eventhandler. No storageClassName is set here.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: agent-eventhandler
  namespace: grafana-agents
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
# eventhandler integration for the Grafana Agent. Its cache file lives on the
# agent-eventhandler PVC mounted at /etc/eventhandler.
apiVersion: monitoring.grafana.com/v1alpha1
kind: Integration
metadata:
  labels:
    # Matches the GrafanaAgent integrations selector (agent: grafana-agent).
    agent: grafana-agent
  name: agent-eventhandler
  namespace: grafana-agents
spec:
  config:
    cache_path: /etc/eventhandler/eventhandler.cache
    # <namespace>/<name> of the logs instance that receives the events.
    # NOTE(review): no LogsInstance named grafana-agent-logs appears among the
    # resources listed in this directory's kustomization; confirm it is
    # defined elsewhere.
    logs_instance: grafana-agents/grafana-agent-logs
  name: eventhandler
  type:
    unique: true
  volumeMounts:
    - mountPath: /etc/eventhandler
      name: agent-eventhandler
  volumes:
    - name: agent-eventhandler
      persistentVolumeClaim:
        claimName: agent-eventhandler
16 changes: 16 additions & 0 deletions core/grafana/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Kustomization for the Grafana Agent stack: local manifests plus the
# grafana-agent-operator Helm chart (with its CRDs), all placed in the
# grafana-agents namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: grafana-agents

resources:
  - namespace.yaml
  - agent.yaml
  - customcollect.yaml
  - op-secret.yaml
  - eventcollect.yaml

helmCharts:
  # NOTE(review): no `version` is set, so the chart version is not pinned.
  # Consider pinning it for reproducible builds.
  - name: grafana-agent-operator
    repo: https://grafana.github.io/helm-charts
    namespace: grafana-agents
    includeCRDs: true
8 changes: 8 additions & 0 deletions core/grafana/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Namespace that holds every Grafana Agent resource in this directory.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    kubernetes.io/metadata.name: grafana-agents
  annotations:
    # Quoted so the annotation value stays the string "true", not a boolean.
    operator.1password.io/auto-restart: "true"
  name: grafana-agents
7 changes: 7 additions & 0 deletions core/grafana/op-secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# 1Password item reference named metrics-secret; the MetricsInstance
# remoteWrite basicAuth reads a Secret of the same name.
# NOTE(review): presumably the 1Password operator syncs this item into that
# Secret; confirm the item carries `username` and `password` fields.
apiVersion: onepassword.com/v1
kind: OnePasswordItem
metadata:
  name: metrics-secret
  namespace: grafana-agents
spec:
  itemPath: "vaults/k8s-prod/items/grafanapass"
16 changes: 0 additions & 16 deletions core/kube-state-metrics/cluster-role-binding.yaml

This file was deleted.

0 comments on commit d9ca728

Please sign in to comment.