Initial cost metrics (#4990)

This PR adds selectors that will be used for displaying cost metrics on the router. A separate PR will be raised for query depth and height under a different category.

```yaml
telemetry:
  instrumentation:
    instruments:
      supergraph:
        cost.actual: true
        cost.delta: true
        cost.estimated: true
```

In addition, the following selectors are introduced:

* cost.estimated
* cost.actual
* cost.delta
* cost.result

This allows you to attach cost-related data to spans, instruments, or events.

There is a lifecycle issue present in this PR which prevents cost.actual and cost.delta from functioning correctly; it will be tackled separately. This is because the supergraph response happens before the response payload has been returned.

---

**Checklist**

Complete the checklist (and note appropriate exceptions) before the PR is marked ready-for-review.

- [ ] Changes are compatible[^1]
- [ ] Documentation[^2] completed
- [ ] Performance impact assessed and acceptable
- Tests added and passing[^3]
    - [ ] Unit Tests
    - [ ] Integration Tests
    - [ ] Manual Tests

**Exceptions**

*Note any exceptions here*

**Notes**

[^1]: It may be appropriate to bring upcoming changes to the attention of other (impacted) groups. Please endeavour to do this before seeking PR approval. The mechanism for doing this will vary considerably, so use your judgement as to how and when to do this.
[^2]: Configuration is an important part of many changes. Where applicable please try to document configuration examples.
[^3]: Tick whichever testing boxes are applicable. If you are adding Manual Tests, please document the manual testing (extensively) in the Exceptions.

---------

Signed-off-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com>
Co-authored-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com>
Co-authored-by: bryn <bryn@apollographql.com>
apollographql · Apr 29, 2024 · dc6c05e · dc6c05e
1 parent 87095d5
commit dc6c05e
Show file tree

Hide file tree

Showing 23 changed files with 8,911 additions and 3,885 deletions.
diff --git a/...r/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/...r/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap
diff --git a/apollo-router/src/context/extensions/mod.rs b/apollo-router/src/context/extensions/mod.rs
@@ -85,6 +85,18 @@ impl Extensions {
             .and_then(|boxed| (&mut **boxed as &mut (dyn Any + 'static)).downcast_mut())
     }
 
+    /// Get a mutable reference to the stored value of type `T`, inserting `T::default()` first if none exists
+    pub fn get_or_default_mut<T: Default + Send + Sync + 'static>(&mut self) -> &mut T {
+        let map = self.map.get_or_insert_with(Box::default);
+        let value = map
+            .entry(TypeId::of::<T>())
+            .or_insert_with(|| Box::<T>::default());
+        // It should be impossible for the entry to be the wrong type as we don't allow direct access to the map.
+        value
+            .downcast_mut()
+            .expect("default value should be inserted and we should be able to downcast it")
+    }
+
     /// Returns `true` if a value of type `T` has been stored in `Extensions`.
     pub fn contains_key<T: Send + Sync + 'static>(&self) -> bool {
         self.map

diff --git a/apollo-router/src/plugins/demand_control/mod.rs b/apollo-router/src/plugins/demand_control/mod.rs
@@ -22,6 +22,7 @@ use tower::ServiceExt;
 use crate::error::Error;
 use crate::graphql;
 use crate::graphql::IntoGraphQLErrors;
+use crate::json_ext::Object;
 use crate::layers::ServiceBuilderExt;
 use crate::plugin::Plugin;
 use crate::plugin::PluginInit;
@@ -35,6 +36,34 @@ use crate::services::subgraph;
 pub(crate) mod cost_calculator;
 pub(crate) mod strategy;
 
+/// The cost calculation information stored in context for use in telemetry and other plugins that need to know what cost was calculated.
+pub(crate) struct CostContext {
+    pub(crate) estimated: f64,
+    pub(crate) actual: f64,
+    pub(crate) result: &'static str,
+}
+
+impl Default for CostContext {
+    fn default() -> Self {
+        Self {
+            estimated: 0.0,
+            actual: 0.0,
+            result: "COST_OK",
+        }
+    }
+}
+
+impl CostContext {
+    pub(crate) fn delta(&self) -> f64 {
+        self.estimated - self.actual
+    }
+
+    pub(crate) fn result(&mut self, error: DemandControlError) -> DemandControlError {
+        self.result = error.code();
+        error
+    }
+}
+
 /// Algorithm for calculating the cost of an incoming query.
 #[derive(Clone, Debug, Deserialize, JsonSchema)]
 #[serde(deny_unknown_fields, rename_all = "snake_case")]
@@ -89,11 +118,21 @@ pub(crate) struct DemandControlConfig {
 
 #[derive(Debug, Display, Error)]
 pub(crate) enum DemandControlError {
-    /// Query estimated cost exceeded configured maximum
-    EstimatedCostTooExpensive,
-    /// Query actual cost exceeded configured maximum
+    /// query estimated cost {estimated_cost} exceeded configured maximum {max_cost}
+    EstimatedCostTooExpensive {
+        /// The estimated cost of the query
+        estimated_cost: f64,
+        /// The maximum cost of the query
+        max_cost: f64,
+    },
+    /// query actual cost {actual_cost} exceeded configured maximum {max_cost}
     #[allow(dead_code)]
-    ActualCostTooExpensive,
+    ActualCostTooExpensive {
+        /// The actual cost of the query
+        actual_cost: f64,
+        /// The maximum cost of the query
+        max_cost: f64,
+    },
     /// Query could not be parsed: {0}
     QueryParseFailure(String),
     /// The response body could not be properly matched with its query's structure: {0}
@@ -103,26 +142,55 @@ pub(crate) enum DemandControlError {
 impl IntoGraphQLErrors for DemandControlError {
     fn into_graphql_errors(self) -> Result<Vec<Error>, Self> {
         match self {
-            DemandControlError::EstimatedCostTooExpensive => Ok(vec![graphql::Error::builder()
-                .extension_code("COST_ESTIMATED_TOO_EXPENSIVE")
-                .message(self.to_string())
-                .build()]),
-            DemandControlError::ActualCostTooExpensive => Ok(vec![graphql::Error::builder()
-                .extension_code("COST_ACTUAL_TOO_EXPENSIVE")
-                .message(self.to_string())
-                .build()]),
+            DemandControlError::EstimatedCostTooExpensive {
+                estimated_cost,
+                max_cost,
+            } => {
+                let mut extensions = Object::new();
+                extensions.insert("cost.estimated", estimated_cost.into());
+                extensions.insert("cost.max", max_cost.into());
+                Ok(vec![graphql::Error::builder()
+                    .extension_code(self.code())
+                    .extensions(extensions)
+                    .message(self.to_string())
+                    .build()])
+            }
+            DemandControlError::ActualCostTooExpensive {
+                actual_cost,
+                max_cost,
+            } => {
+                let mut extensions = Object::new();
+                extensions.insert("cost.actual", actual_cost.into());
+                extensions.insert("cost.max", max_cost.into());
+                Ok(vec![graphql::Error::builder()
+                    .extension_code(self.code())
+                    .extensions(extensions)
+                    .message(self.to_string())
+                    .build()])
+            }
             DemandControlError::QueryParseFailure(_) => Ok(vec![graphql::Error::builder()
-                .extension_code("COST_QUERY_PARSE_FAILURE")
+                .extension_code(self.code())
                 .message(self.to_string())
                 .build()]),
             DemandControlError::ResponseTypingFailure(_) => Ok(vec![graphql::Error::builder()
-                .extension_code("COST_RESPONSE_TYPING_FAILURE")
+                .extension_code(self.code())
                 .message(self.to_string())
                 .build()]),
         }
     }
 }
 
+impl DemandControlError {
+    fn code(&self) -> &'static str {
+        match self {
+            DemandControlError::EstimatedCostTooExpensive { .. } => "COST_ESTIMATED_TOO_EXPENSIVE",
+            DemandControlError::ActualCostTooExpensive { .. } => "COST_ACTUAL_TOO_EXPENSIVE",
+            DemandControlError::QueryParseFailure(_) => "COST_QUERY_PARSE_FAILURE",
+            DemandControlError::ResponseTypingFailure(_) => "COST_RESPONSE_TYPING_FAILURE",
+        }
+    }
+}
+
 impl<T> From<WithErrors<T>> for DemandControlError {
     fn from(value: WithErrors<T>) -> Self {
         DemandControlError::QueryParseFailure(format!("{}", value))
@@ -184,12 +252,13 @@ impl Plugin for DemandControl {
                         .get::<Strategy>()
                         .expect("must have strategy")
                         .clone();
+                    let context = resp.context.clone();
                     resp.response = resp.response.map(move |resp| {
                         // Here we are going to abort the stream if the cost is too high
                         // First we map based on cost, then we use take while to abort the stream if an error is emitted.
                         // When we terminate the stream we still want to emit a graphql error, so the error response is emitted first before a termination error.
                         resp.flat_map(move |resp| {
-                            match strategy.on_execution_response(req.as_ref(), &resp) {
+                            match strategy.on_execution_response(&context, req.as_ref(), &resp) {
                                 Ok(_) => Either::Left(stream::once(future::ready(Ok(resp)))),
                                 Err(err) => Either::Right(stream::iter(vec![
                                     // This is the error we are returning to the user
@@ -253,7 +322,10 @@ impl Plugin for DemandControl {
                 })
                 .map_future_with_request_data(
                     |req: &subgraph::Request| {
-                        req.executable_document.clone().expect("must have document")
+                        //TODO convert this to expect
+                        req.executable_document.clone().unwrap_or_else(|| {
+                            Arc::new(Valid::assume_valid(ExecutableDocument::new()))
+                        })
                     },
                     |req: Arc<Valid<ExecutableDocument>>, fut| async move {
                         let resp: subgraph::Response = fut.await?;
@@ -272,7 +344,7 @@ impl Plugin for DemandControl {
                                         .expect("must be able to convert to graphql error"),
                                 )
                                 .context(resp.context.clone())
-                                .extensions(crate::json_ext::Object::new())
+                                .extensions(Object::new())
                                 .build(),
                         })
                     },
@@ -466,10 +538,16 @@ mod test {
         fn from(value: &TestError) -> Self {
             match value {
                 TestError::EstimatedCostTooExpensive => {
-                    DemandControlError::EstimatedCostTooExpensive
+                    DemandControlError::EstimatedCostTooExpensive {
+                        max_cost: 1.0,
+                        estimated_cost: 2.0,
+                    }
                 }
 
-                TestError::ActualCostTooExpensive => DemandControlError::ActualCostTooExpensive,
+                TestError::ActualCostTooExpensive => DemandControlError::ActualCostTooExpensive {
+                    actual_cost: 1.0,
+                    max_cost: 2.0,
+                },
             }
         }
     }

diff --git a/...snapshots/apollo_router__plugins__demand_control__test__enforce_on_execution_request.snap b/...snapshots/apollo_router__plugins__demand_control__test__enforce_on_execution_request.snap
@@ -3,6 +3,8 @@ source: apollo-router/src/plugins/demand_control/mod.rs
 expression: body
 ---
 - errors:
-    - message: Query estimated cost exceeded configured maximum
+    - message: query estimated cost 2 exceeded configured maximum 1
       extensions:
+        cost.estimated: 2
+        cost.max: 1
         code: COST_ESTIMATED_TOO_EXPENSIVE
diff --git a/...napshots/apollo_router__plugins__demand_control__test__enforce_on_execution_response.snap b/...napshots/apollo_router__plugins__demand_control__test__enforce_on_execution_response.snap
@@ -3,6 +3,8 @@ source: apollo-router/src/plugins/demand_control/mod.rs
 expression: body
 ---
 - errors:
-    - message: Query estimated cost exceeded configured maximum
+    - message: query estimated cost 2 exceeded configured maximum 1
       extensions:
+        cost.estimated: 2
+        cost.max: 1
         code: COST_ESTIMATED_TOO_EXPENSIVE
diff --git a/.../snapshots/apollo_router__plugins__demand_control__test__enforce_on_subgraph_request.snap b/.../snapshots/apollo_router__plugins__demand_control__test__enforce_on_subgraph_request.snap
@@ -4,6 +4,8 @@ expression: body
 ---
 data: ~
 errors:
-  - message: Query estimated cost exceeded configured maximum
+  - message: query estimated cost 2 exceeded configured maximum 1
     extensions:
+      cost.estimated: 2
+      cost.max: 1
       code: COST_ESTIMATED_TOO_EXPENSIVE
diff --git a/...snapshots/apollo_router__plugins__demand_control__test__enforce_on_subgraph_response.snap b/...snapshots/apollo_router__plugins__demand_control__test__enforce_on_subgraph_response.snap
@@ -4,6 +4,8 @@ expression: body
 ---
 data: ~
 errors:
-  - message: Query estimated cost exceeded configured maximum
+  - message: query estimated cost 2 exceeded configured maximum 1
     extensions:
+      cost.estimated: 2
+      cost.max: 1
       code: COST_ESTIMATED_TOO_EXPENSIVE
diff --git a/apollo-router/src/plugins/demand_control/strategy/mod.rs b/apollo-router/src/plugins/demand_control/strategy/mod.rs
@@ -14,6 +14,7 @@ use crate::plugins::demand_control::Mode;
 use crate::plugins::demand_control::StrategyConfig;
 use crate::services::execution;
 use crate::services::subgraph;
+use crate::Context;
 
 mod static_estimated;
 #[cfg(test)]
@@ -59,10 +60,11 @@ impl Strategy {
     }
     pub(crate) fn on_execution_response(
         &self,
+        context: &Context,
         request: &ExecutableDocument,
         response: &graphql::Response,
     ) -> Result<(), DemandControlError> {
-        match self.inner.on_execution_response(request, response) {
+        match self.inner.on_execution_response(context, request, response) {
             Err(e) if self.mode == Mode::Enforce => Err(e),
             _ => Ok(()),
         }
@@ -122,6 +124,7 @@ pub(crate) trait StrategyImpl: Send + Sync {
     ) -> Result<(), DemandControlError>;
     fn on_execution_response(
         &self,
+        context: &Context,
         request: &ExecutableDocument,
         response: &graphql::Response,
     ) -> Result<(), DemandControlError>;

diff --git a/apollo-router/src/plugins/demand_control/strategy/static_estimated.rs b/apollo-router/src/plugins/demand_control/strategy/static_estimated.rs
@@ -3,6 +3,7 @@ use apollo_compiler::ExecutableDocument;
 use crate::graphql;
 use crate::plugins::demand_control::cost_calculator::static_cost::StaticCostCalculator;
 use crate::plugins::demand_control::strategy::StrategyImpl;
+use crate::plugins::demand_control::CostContext;
 use crate::plugins::demand_control::DemandControlError;
 use crate::services::execution;
 use crate::services::subgraph;
@@ -19,8 +20,16 @@ impl StrategyImpl for StaticEstimated {
         self.cost_calculator
             .planned(&request.query_plan)
             .and_then(|cost| {
+                let mut extensions = request.context.extensions().lock();
+                let cost_result = extensions.get_or_default_mut::<CostContext>();
+                cost_result.estimated = cost;
                 if cost > self.max {
-                    Err(DemandControlError::EstimatedCostTooExpensive)
+                    Err(
+                        cost_result.result(DemandControlError::EstimatedCostTooExpensive {
+                            estimated_cost: cost,
+                            max_cost: self.max,
+                        }),
+                    )
                 } else {
                     Ok(())
                 }
@@ -41,12 +50,15 @@ impl StrategyImpl for StaticEstimated {
 
     fn on_execution_response(
         &self,
+        context: &crate::Context,
         request: &ExecutableDocument,
         response: &graphql::Response,
     ) -> Result<(), DemandControlError> {
         if response.data.is_some() {
-            let _cost = self.cost_calculator.actual(request, response)?;
-            // Todo metrics
+            let cost = self.cost_calculator.actual(request, response)?;
+            let mut extensions = context.extensions().lock();
+            let cost_result = extensions.get_or_default_mut::<CostContext>();
+            cost_result.actual = cost;
         }
         Ok(())
     }

diff --git a/apollo-router/src/plugins/demand_control/strategy/test.rs b/apollo-router/src/plugins/demand_control/strategy/test.rs
@@ -54,6 +54,7 @@ impl StrategyImpl for Test {
 
     fn on_execution_response(
         &self,
+        _context: &crate::Context,
         _request: &ExecutableDocument,
         _response: &crate::graphql::Response,
     ) -> Result<(), DemandControlError> {

diff --git a/apollo-router/src/plugins/telemetry/config_new/attributes.rs b/apollo-router/src/plugins/telemetry/config_new/attributes.rs
@@ -31,14 +31,13 @@ use opentelemetry_semantic_conventions::trace::URL_SCHEME;
 use opentelemetry_semantic_conventions::trace::USER_AGENT_ORIGINAL;
 use schemars::JsonSchema;
 use serde::Deserialize;
-#[cfg(test)]
-use serde::Serialize;
 use tower::BoxError;
 use tracing::Span;
 
 use crate::axum_factory::utils::ConnectionInfo;
 use crate::context::OPERATION_KIND;
 use crate::context::OPERATION_NAME;
+use crate::plugins::telemetry::config_new::cost::SupergraphCostAttributes;
 use crate::plugins::telemetry::config_new::trace_id;
 use crate::plugins::telemetry::config_new::DatadogId;
 use crate::plugins::telemetry::config_new::DefaultForLevel;
@@ -114,7 +113,7 @@ impl DefaultForLevel for RouterAttributes {
 }
 
 #[derive(Deserialize, JsonSchema, Clone, Default, Debug)]
-#[cfg_attr(test, derive(Serialize, PartialEq))]
+#[cfg_attr(test, derive(PartialEq))]
 #[serde(deny_unknown_fields, default)]
 pub(crate) struct SupergraphAttributes {
     /// The GraphQL document being executed.
@@ -137,6 +136,10 @@ pub(crate) struct SupergraphAttributes {
     /// Requirement level: Recommended
     #[serde(rename = "graphql.operation.type")]
     pub(crate) graphql_operation_type: Option<bool>,
+
+    /// Cost attributes for the operation being executed
+    #[serde(flatten)]
+    pub(crate) cost: SupergraphCostAttributes,
 }
 
 impl DefaultForLevel for SupergraphAttributes {
@@ -890,8 +893,10 @@ impl Selectors for SupergraphAttributes {
         attrs
     }
 
-    fn on_response(&self, _response: &supergraph::Response) -> Vec<KeyValue> {
-        Vec::default()
+    fn on_response(&self, response: &supergraph::Response) -> Vec<KeyValue> {
+        let mut attrs = Vec::new();
+        attrs.append(&mut self.cost.on_response(response));
+        attrs
     }
 
     fn on_error(&self, _error: &BoxError) -> Vec<KeyValue> {

diff --git a/apollo-router/src/plugins/telemetry/config_new/cost/fixtures/cost_actual.router.yaml b/apollo-router/src/plugins/telemetry/config_new/cost/fixtures/cost_actual.router.yaml
@@ -0,0 +1,5 @@
+telemetry:
+  instrumentation:
+    instruments:
+      supergraph:
+        cost.actual: true
diff --git a/...er/src/plugins/telemetry/config_new/cost/fixtures/cost_actual_with_attributes.router.yaml b/...er/src/plugins/telemetry/config_new/cost/fixtures/cost_actual_with_attributes.router.yaml
@@ -0,0 +1,7 @@
+telemetry:
+  instrumentation:
+    instruments:
+      supergraph:
+        cost.actual:
+          attributes:
+            cost.result: true