Merge pull request #2 from anunarapureddy/main
Section 5: Sampling content
anunarapureddy committed Mar 11, 2024
2 parents 899d294 + e241490 commit d4b917c
Showing 4 changed files with 192 additions and 27 deletions.
32 changes: 18 additions & 14 deletions 05-sampling.md
@@ -8,15 +8,6 @@ This tutorial step covers the basic usage of the OpenTelemetry Collector on Kubernetes.

[excalidraw](https://excalidraw.com/#json=15BrdSOMEkc9RA5cxeqwz,urTmfk01mbx7V-PpQI7KgA)

### OpenTelemetry Collector on k8s

After installing the OpenTelemetry Operator, the `v1alpha1.OpenTelemetryCollector` custom resource simplifies operating the OpenTelemetry Collector on Kubernetes: it offers several deployment modes, migrates breaking configuration changes automatically, integrates with Prometheus (including operating on Prometheus Operator CRs), and simplifies sidecar injection.

TODO: update collector
```yaml

```
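
The block above is still a TODO in this commit. As a rough placeholder, a minimal `OpenTelemetryCollector` resource could look like the sketch below (the name and namespace are illustrative; the full manifests used in this tutorial appear further down in this commit):

```yaml
apiVersion: opentelemetry.io/v1alpha1
kind: OpenTelemetryCollector
metadata:
  name: otel              # illustrative
  namespace: observability-backend
spec:
  mode: deployment        # other modes: daemonset, statefulset, sidecar
  replicas: 1
  config: |
    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
    exporters:
      debug:
        verbosity: detailed
    service:
      pipelines:
        traces:
          receivers: [otlp]
          exporters: [debug]
```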

## Sampling: what does it mean and why is it important?

Sampling refers to the practice of selectively capturing and recording traces of requests flowing through a distributed system, rather than capturing every single request. It is crucial in distributed tracing systems because modern distributed applications often generate a massive volume of requests and transactions, which can overwhelm the tracing infrastructure or lead to excessive storage costs if every request is traced.
@@ -64,9 +55,16 @@ https://opentelemetry.io/docs/languages/sdk-configuration/general/#otel_traces_s
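
The OTEL_TRACES_SAMPLER documentation linked above also lets you configure head sampling purely through SDK environment variables. As an illustrative reference, the following is equivalent to the `parentbased_traceidratio` sampler with a 0.5 ratio used in `app/instrumentation-head-sampling.yaml` below:

```shell
export OTEL_TRACES_SAMPLER=parentbased_traceidratio
export OTEL_TRACES_SAMPLER_ARG=0.5
```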

Tail sampling is where the decision to sample a trace is made by considering all or most of the spans within that trace. Tail sampling gives you the option to sample traces based on specific criteria derived from different parts of a trace, which isn't possible with head sampling.

Use case: sample 100% of the traces that contain an erroring span, as well as traces that take longer than 500 ms.
Deploy the OpenTelemetry Collector with the `tail_sampling` processor enabled:

```shell
kubectl apply -f https://raw.githubusercontent.com/pavolloffay/kubecon-eu-2024-opentelemetry-kubernetes-tracing-tutorial/backend/05-tail-sampling-collector.yaml
kubectl get pods -n observability-backend -w
```

```yaml
# Sample 100% of traces with ERROR-ing spans (omit traces with all OK spans)
# and traces which have a duration longer than 500ms
processors:
tail_sampling:
    decision_wait: 10s # time to wait before a sampling decision is made
    num_traces: 100 # number of traces to be kept in memory
    expected_new_traces_per_sec: 10 # expected rate of new traces per second
    policies:
      [
        {
          name: keep-errors,
          type: status_code,
          status_code: {status_codes: [ERROR]}
        },
        {
          name: keep-slow-traces,
          type: latency,
          latency: {threshold_ms: 500}
        }
      ]
```
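
Policies are evaluated independently, and a trace is kept if any of them decides to sample it. If you also want a small baseline of ordinary traces, the `tail_sampling` processor offers a `probabilistic` policy; the snippet below is an illustrative extension (using the dash-style policy list from the backend manifests in this commit), not part of this tutorial's configuration:

```yaml
    policies:
      - name: keep-errors
        type: status_code
        status_code: {status_codes: [ERROR]}
      - name: keep-slow-traces
        type: latency
        latency: {threshold_ms: 500}
      # illustrative addition: keep ~10% of all remaining traces as a baseline
      - name: keep-10-percent-baseline
        type: probabilistic
        probabilistic: {sampling_percentage: 10}
```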

Applying this manifest will start a new Collector with the `tail_sampling` processor.
<TODO: Add screenshot>

```shell
kubectl apply -f https://raw.githubusercontent.com/pavolloffay/kubecon-eu-2024-opentelemetry-kubernetes-tracing-tutorial/backend/03-tail-sampling-config.yaml
```

-----
### Advanced Topic: Sampling at scale with OpenTelemetry
@@ -102,4 +97,13 @@ Requires two deployments of the Collector: the first layer routes all the spans of a given trace to the same Collector instance in the second layer, which performs the tail sampling.

[excalidraw](https://excalidraw.com/#room=6a15d65ba4615c535a40,xcZD6DG977owHRoxpYY4Ag)

Apply the YAML below to deploy a layer of Collectors containing the load-balancing exporter in front of collectors performing tail-sampling:

```shell
kubectl apply -f https://raw.githubusercontent.com/pavolloffay/kubecon-eu-2024-opentelemetry-kubernetes-tracing-tutorial/backend/05-scale-otel-collectors.yaml
kubectl get pods -n observability-backend -w
```
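
To double-check that both tiers are running, you can also list the collector custom resources themselves (a quick sanity check, assuming the operator CRDs from the earlier steps are installed):

```shell
kubectl get opentelemetrycollectors -n observability-backend
# expected: "otel" (load-balancing layer) and "otel-gateway" (tail-sampling layer)
```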

<TODO: Add screenshot>

[Next steps](./06-RED-metrics.md)
28 changes: 28 additions & 0 deletions app/instrumentation-head-sampling.yaml
@@ -0,0 +1,28 @@
apiVersion: opentelemetry.io/v1alpha1
kind: Instrumentation
metadata:
name: my-instrumentation
namespace: tutorial-application
spec:
exporter:
endpoint: http://otel-collector.observability-backend.svc.cluster.local:4317
propagators:
- tracecontext
- baggage
- b3
sampler:
type: parentbased_traceidratio
argument: "0.5"
resource:
addK8sUIDAttributes: false
python:
env:
# Required if endpoint is set to 4317.
# Python autoinstrumentation uses http/proto by default
# so data must be sent to 4318 instead of 4317.
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: http://otel-collector.observability-backend.svc.cluster.local:4318
java:
env:
- name: OTEL_LOGS_EXPORTER
value: otlp
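
For this `Instrumentation` to take effect, the application pods need the operator's injection annotation on their pod template. A hedged example (the deployment name `backend1` is illustrative; the annotation assumes a Python service):

```shell
kubectl patch deployment backend1 -n tutorial-application --type merge \
  -p '{"spec":{"template":{"metadata":{"annotations":{"instrumentation.opentelemetry.io/inject-python":"true"}}}}}'
# for a Java service, use the instrumentation.opentelemetry.io/inject-java annotation instead
```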
137 changes: 137 additions & 0 deletions backend/05-scale-otel-collectors.yaml
@@ -0,0 +1,137 @@
apiVersion: opentelemetry.io/v1alpha1
kind: OpenTelemetryCollector
metadata:
name: otel
namespace: observability-backend
spec:
image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.94.0
mode: deployment
replicas: 1
ports:
- port: 8888
protocol: TCP
name: metrics
config: |
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
# Sample 100% of traces with ERROR-ing spans (omit traces with all OK spans)
# and traces which have a duration longer than 500ms
tail_sampling:
        decision_wait: 10s # time to wait before a sampling decision is made
num_traces: 100 # number of traces to be kept in memory
expected_new_traces_per_sec: 10 # expected rate of new traces per second
policies:
- name: keep-errors
type: status_code
status_code: {status_codes: [ERROR]}
- name: keep-slow-traces
type: latency
latency: {threshold_ms: 500}
exporters:
debug:
verbosity: detailed
loadbalancing:
protocol:
otlp:
timeout: 1s
tls:
insecure: true
resolver:
k8s:
service: otel-gateway.observability-backend
ports:
- 4317
otlphttp/metrics:
endpoint: http://prometheus.observability-backend.svc.cluster.local:80/api/v1/otlp/
tls:
insecure: true
service:
pipelines:
traces:
receivers: [otlp]
          exporters: [loadbalancing]
metrics:
receivers: [otlp]
exporters: [otlphttp/metrics]
logs:
receivers: [otlp]
exporters: [debug]
---
apiVersion: opentelemetry.io/v1alpha1
kind: OpenTelemetryCollector
metadata:
name: otel-gateway
namespace: observability-backend
spec:
image: ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:0.94.0
mode: deployment
replicas: 2
ports:
- port: 8888
protocol: TCP
name: metrics
config: |
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
# Sample 100% of traces with ERROR-ing spans (omit traces with all OK spans)
# and traces which have a duration longer than 500ms
tail_sampling:
        decision_wait: 10s # time to wait before a sampling decision is made
num_traces: 100 # number of traces to be kept in memory
expected_new_traces_per_sec: 10 # expected rate of new traces per second
policies:
- name: keep-errors
type: status_code
status_code: {status_codes: [ERROR]}
- name: keep-slow-traces
type: latency
latency: {threshold_ms: 500}
exporters:
otlp/traces:
endpoint: jaeger-collector:4317
tls:
insecure: true
otlphttp/metrics:
endpoint: http://prometheus.observability-backend.svc.cluster.local:80/api/v1/otlp/
tls:
insecure: true
debug:
verbosity: detailed
service:
pipelines:
traces:
receivers: [otlp]
processors: [tail_sampling]
exporters: [otlp/traces]
metrics:
receivers: [otlp]
exporters: [otlphttp/metrics]
logs:
receivers: [otlp]
exporters: [debug]
---
@@ -20,24 +20,20 @@ spec:
http:
endpoint: 0.0.0.0:4318
    processors:
# Sample 100% of traces with ERROR-ing spans (omit traces with all OK spans)
# and traces which have a duration longer than 500ms
tail_sampling:
        decision_wait: 10s # time to wait before a sampling decision is made
num_traces: 100 # number of traces to be kept in memory
expected_new_traces_per_sec: 10 # expected rate of new traces per second
policies:
[
{
name: keep-errors,
type: status_code,
status_code: {status_codes: [ERROR]}
},
{
name: keep-slow-traces,
type: latency,
latency: {threshold_ms: 500}
}
]
- name: keep-errors
type: status_code
status_code: {status_codes: [ERROR]}
- name: keep-slow-traces
type: latency
latency: {threshold_ms: 500}
exporters:
otlp/traces:
