Skip to content

Commit

Permalink
Initial cost metrics (#4990)
Browse files Browse the repository at this point in the history
This PR adds selectors that will be used for displaying cost metrics on
the router.

A separate PR will be raised for query depth and height under a
different category.
```yaml
telemetry:
  instrumentation:
    instruments:
      supergraph:
        cost.actual: true
        cost.delta: true
        cost.estimated: true
```

In addition the following selectors are introduced:
* cost.estimated
* cost.actual
* cost.delta
* cost.result

This allows you to attach cost-related data to spans, instruments, or
events.


There is a lifecycle issue present in this PR which prevents cost.actual
and cost.delta from functioning correctly; it will be tackled separately.
This is because the supergraph response happens before the response payload
has been returned.


<!-- start metadata -->
---

**Checklist**

Complete the checklist (and note appropriate exceptions) before the PR
is marked ready-for-review.

- [ ] Changes are compatible[^1]
- [ ] Documentation[^2] completed
- [ ] Performance impact assessed and acceptable
- Tests added and passing[^3]
    - [ ] Unit Tests
    - [ ] Integration Tests
    - [ ] Manual Tests

**Exceptions**

*Note any exceptions here*

**Notes**

[^1]: It may be appropriate to bring upcoming changes to the attention
of other (impacted) groups. Please endeavour to do this before seeking
PR approval. The mechanism for doing this will vary considerably, so use
your judgement as to how and when to do this.
[^2]: Configuration is an important part of many changes. Where
applicable please try to document configuration examples.
[^3]: Tick whichever testing boxes are applicable. If you are adding
Manual Tests, please document the manual testing (extensively) in the
Exceptions.

---------

Signed-off-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com>
Co-authored-by: Benjamin Coenen <5719034+bnjjj@users.noreply.github.com>
Co-authored-by: bryn <bryn@apollographql.com>
  • Loading branch information
3 people committed Apr 29, 2024
1 parent 87095d5 commit dc6c05e
Show file tree
Hide file tree
Showing 23 changed files with 8,911 additions and 3,885 deletions.

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions apollo-router/src/context/extensions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,18 @@ impl Extensions {
.and_then(|boxed| (&mut **boxed as &mut (dyn Any + 'static)).downcast_mut())
}

/// Returns a mutable reference to the stored value of type `T`, first inserting
/// `T::default()` if no value of that type is present.
///
/// The backing map is created on demand when it has not been allocated yet.
///
/// # Panics
///
/// Panics if the entry stored under `TypeId::of::<T>()` cannot be downcast to `T`.
pub fn get_or_default_mut<T: Default + Send + Sync + 'static>(&mut self) -> &mut T {
    self.map
        .get_or_insert_with(Box::default)
        .entry(TypeId::of::<T>())
        .or_insert_with(|| Box::<T>::default())
        // A mismatched type should be impossible here: callers never get direct access to
        // the map, so the value keyed by `TypeId::of::<T>()` is expected to be a `T`.
        .downcast_mut()
        .expect("default value should be inserted and we should be able to downcast it")
}

/// Returns `true` if the type has been stored in `Extensions`.
pub fn contains_key<T: Send + Sync + 'static>(&self) -> bool {
self.map
Expand Down
116 changes: 97 additions & 19 deletions apollo-router/src/plugins/demand_control/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use tower::ServiceExt;
use crate::error::Error;
use crate::graphql;
use crate::graphql::IntoGraphQLErrors;
use crate::json_ext::Object;
use crate::layers::ServiceBuilderExt;
use crate::plugin::Plugin;
use crate::plugin::PluginInit;
Expand All @@ -35,6 +36,34 @@ use crate::services::subgraph;
pub(crate) mod cost_calculator;
pub(crate) mod strategy;

/// The cost calculation information stored in context for use in telemetry and other plugins that need to know what cost was calculated.
///
/// Values are stored in the request context's extensions and are created with their defaults
/// (zero costs, `COST_OK` result) the first time a strategy records a cost.
pub(crate) struct CostContext {
/// The cost computed from the query plan before execution.
pub(crate) estimated: f64,
/// The cost computed from the execution response, recorded when the response carries data.
pub(crate) actual: f64,
/// The outcome code of the cost check: `COST_OK` by default, otherwise the code of the
/// demand control error that was recorded.
pub(crate) result: &'static str,
}

impl Default for CostContext {
fn default() -> Self {
Self {
estimated: 0.0,
actual: 0.0,
result: "COST_OK",
}
}
}

impl CostContext {
    /// Returns the estimated cost minus the actual cost.
    ///
    /// The value is positive when the estimate was higher than the actual cost and negative
    /// when it was lower.
    pub(crate) fn delta(&self) -> f64 {
        let Self {
            estimated, actual, ..
        } = self;
        *estimated - *actual
    }

    /// Stores the code of `error` as the cost result and hands the error back unchanged,
    /// so the call can wrap an error expression that is being returned.
    pub(crate) fn result(&mut self, error: DemandControlError) -> DemandControlError {
        let code = error.code();
        self.result = code;
        error
    }
}

/// Algorithm for calculating the cost of an incoming query.
#[derive(Clone, Debug, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
Expand Down Expand Up @@ -89,11 +118,21 @@ pub(crate) struct DemandControlConfig {

#[derive(Debug, Display, Error)]
pub(crate) enum DemandControlError {
/// Query estimated cost exceeded configured maximum
EstimatedCostTooExpensive,
/// Query actual cost exceeded configured maximum
/// query estimated cost {estimated_cost} exceeded configured maximum {max_cost}
EstimatedCostTooExpensive {
/// The estimated cost of the query
estimated_cost: f64,
/// The maximum cost of the query
max_cost: f64,
},
/// query actual cost {actual_cost} exceeded configured maximum {max_cost}
#[allow(dead_code)]
ActualCostTooExpensive,
ActualCostTooExpensive {
/// The actual cost of the query
actual_cost: f64,
/// The maximum cost of the query
max_cost: f64,
},
/// Query could not be parsed: {0}
QueryParseFailure(String),
/// The response body could not be properly matched with its query's structure: {0}
Expand All @@ -103,26 +142,55 @@ pub(crate) enum DemandControlError {
impl IntoGraphQLErrors for DemandControlError {
fn into_graphql_errors(self) -> Result<Vec<Error>, Self> {
match self {
DemandControlError::EstimatedCostTooExpensive => Ok(vec![graphql::Error::builder()
.extension_code("COST_ESTIMATED_TOO_EXPENSIVE")
.message(self.to_string())
.build()]),
DemandControlError::ActualCostTooExpensive => Ok(vec![graphql::Error::builder()
.extension_code("COST_ACTUAL_TOO_EXPENSIVE")
.message(self.to_string())
.build()]),
DemandControlError::EstimatedCostTooExpensive {
estimated_cost,
max_cost,
} => {
let mut extensions = Object::new();
extensions.insert("cost.estimated", estimated_cost.into());
extensions.insert("cost.max", max_cost.into());
Ok(vec![graphql::Error::builder()
.extension_code(self.code())
.extensions(extensions)
.message(self.to_string())
.build()])
}
DemandControlError::ActualCostTooExpensive {
actual_cost,
max_cost,
} => {
let mut extensions = Object::new();
extensions.insert("cost.actual", actual_cost.into());
extensions.insert("cost.max", max_cost.into());
Ok(vec![graphql::Error::builder()
.extension_code(self.code())
.extensions(extensions)
.message(self.to_string())
.build()])
}
DemandControlError::QueryParseFailure(_) => Ok(vec![graphql::Error::builder()
.extension_code("COST_QUERY_PARSE_FAILURE")
.extension_code(self.code())
.message(self.to_string())
.build()]),
DemandControlError::ResponseTypingFailure(_) => Ok(vec![graphql::Error::builder()
.extension_code("COST_RESPONSE_TYPING_FAILURE")
.extension_code(self.code())
.message(self.to_string())
.build()]),
}
}
}

impl DemandControlError {
fn code(&self) -> &'static str {
match self {
DemandControlError::EstimatedCostTooExpensive { .. } => "COST_ESTIMATED_TOO_EXPENSIVE",
DemandControlError::ActualCostTooExpensive { .. } => "COST_ACTUAL_TOO_EXPENSIVE",
DemandControlError::QueryParseFailure(_) => "COST_QUERY_PARSE_FAILURE",
DemandControlError::ResponseTypingFailure(_) => "COST_RESPONSE_TYPING_FAILURE",
}
}
}

impl<T> From<WithErrors<T>> for DemandControlError {
fn from(value: WithErrors<T>) -> Self {
DemandControlError::QueryParseFailure(format!("{}", value))
Expand Down Expand Up @@ -184,12 +252,13 @@ impl Plugin for DemandControl {
.get::<Strategy>()
.expect("must have strategy")
.clone();
let context = resp.context.clone();
resp.response = resp.response.map(move |resp| {
// Here we are going to abort the stream if the cost is too high
// First we map based on cost, then we use take while to abort the stream if an error is emitted.
// When we terminate the stream we still want to emit a graphql error, so the error response is emitted first before a termination error.
resp.flat_map(move |resp| {
match strategy.on_execution_response(req.as_ref(), &resp) {
match strategy.on_execution_response(&context, req.as_ref(), &resp) {
Ok(_) => Either::Left(stream::once(future::ready(Ok(resp)))),
Err(err) => Either::Right(stream::iter(vec![
// This is the error we are returning to the user
Expand Down Expand Up @@ -253,7 +322,10 @@ impl Plugin for DemandControl {
})
.map_future_with_request_data(
|req: &subgraph::Request| {
req.executable_document.clone().expect("must have document")
//TODO convert this to expect
req.executable_document.clone().unwrap_or_else(|| {
Arc::new(Valid::assume_valid(ExecutableDocument::new()))
})
},
|req: Arc<Valid<ExecutableDocument>>, fut| async move {
let resp: subgraph::Response = fut.await?;
Expand All @@ -272,7 +344,7 @@ impl Plugin for DemandControl {
.expect("must be able to convert to graphql error"),
)
.context(resp.context.clone())
.extensions(crate::json_ext::Object::new())
.extensions(Object::new())
.build(),
})
},
Expand Down Expand Up @@ -466,10 +538,16 @@ mod test {
fn from(value: &TestError) -> Self {
match value {
TestError::EstimatedCostTooExpensive => {
DemandControlError::EstimatedCostTooExpensive
DemandControlError::EstimatedCostTooExpensive {
max_cost: 1.0,
estimated_cost: 2.0,
}
}

TestError::ActualCostTooExpensive => DemandControlError::ActualCostTooExpensive,
TestError::ActualCostTooExpensive => DemandControlError::ActualCostTooExpensive {
actual_cost: 1.0,
max_cost: 2.0,
},
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ source: apollo-router/src/plugins/demand_control/mod.rs
expression: body
---
- errors:
- message: Query estimated cost exceeded configured maximum
- message: query estimated cost 2 exceeded configured maximum 1
extensions:
cost.estimated: 2
cost.max: 1
code: COST_ESTIMATED_TOO_EXPENSIVE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ source: apollo-router/src/plugins/demand_control/mod.rs
expression: body
---
- errors:
- message: Query estimated cost exceeded configured maximum
- message: query estimated cost 2 exceeded configured maximum 1
extensions:
cost.estimated: 2
cost.max: 1
code: COST_ESTIMATED_TOO_EXPENSIVE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ expression: body
---
data: ~
errors:
- message: Query estimated cost exceeded configured maximum
- message: query estimated cost 2 exceeded configured maximum 1
extensions:
cost.estimated: 2
cost.max: 1
code: COST_ESTIMATED_TOO_EXPENSIVE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ expression: body
---
data: ~
errors:
- message: Query estimated cost exceeded configured maximum
- message: query estimated cost 2 exceeded configured maximum 1
extensions:
cost.estimated: 2
cost.max: 1
code: COST_ESTIMATED_TOO_EXPENSIVE
5 changes: 4 additions & 1 deletion apollo-router/src/plugins/demand_control/strategy/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::plugins::demand_control::Mode;
use crate::plugins::demand_control::StrategyConfig;
use crate::services::execution;
use crate::services::subgraph;
use crate::Context;

mod static_estimated;
#[cfg(test)]
Expand Down Expand Up @@ -59,10 +60,11 @@ impl Strategy {
}
pub(crate) fn on_execution_response(
&self,
context: &Context,
request: &ExecutableDocument,
response: &graphql::Response,
) -> Result<(), DemandControlError> {
match self.inner.on_execution_response(request, response) {
match self.inner.on_execution_response(context, request, response) {
Err(e) if self.mode == Mode::Enforce => Err(e),
_ => Ok(()),
}
Expand Down Expand Up @@ -122,6 +124,7 @@ pub(crate) trait StrategyImpl: Send + Sync {
) -> Result<(), DemandControlError>;
fn on_execution_response(
&self,
context: &Context,
request: &ExecutableDocument,
response: &graphql::Response,
) -> Result<(), DemandControlError>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use apollo_compiler::ExecutableDocument;
use crate::graphql;
use crate::plugins::demand_control::cost_calculator::static_cost::StaticCostCalculator;
use crate::plugins::demand_control::strategy::StrategyImpl;
use crate::plugins::demand_control::CostContext;
use crate::plugins::demand_control::DemandControlError;
use crate::services::execution;
use crate::services::subgraph;
Expand All @@ -19,8 +20,16 @@ impl StrategyImpl for StaticEstimated {
self.cost_calculator
.planned(&request.query_plan)
.and_then(|cost| {
let mut extensions = request.context.extensions().lock();
let cost_result = extensions.get_or_default_mut::<CostContext>();
cost_result.estimated = cost;
if cost > self.max {
Err(DemandControlError::EstimatedCostTooExpensive)
Err(
cost_result.result(DemandControlError::EstimatedCostTooExpensive {
estimated_cost: cost,
max_cost: self.max,
}),
)
} else {
Ok(())
}
Expand All @@ -41,12 +50,15 @@ impl StrategyImpl for StaticEstimated {

fn on_execution_response(
&self,
context: &crate::Context,
request: &ExecutableDocument,
response: &graphql::Response,
) -> Result<(), DemandControlError> {
if response.data.is_some() {
let _cost = self.cost_calculator.actual(request, response)?;
// Todo metrics
let cost = self.cost_calculator.actual(request, response)?;
let mut extensions = context.extensions().lock();
let cost_result = extensions.get_or_default_mut::<CostContext>();
cost_result.actual = cost;
}
Ok(())
}
Expand Down
1 change: 1 addition & 0 deletions apollo-router/src/plugins/demand_control/strategy/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ impl StrategyImpl for Test {

fn on_execution_response(
&self,
_context: &crate::Context,
_request: &ExecutableDocument,
_response: &crate::graphql::Response,
) -> Result<(), DemandControlError> {
Expand Down
15 changes: 10 additions & 5 deletions apollo-router/src/plugins/telemetry/config_new/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ use opentelemetry_semantic_conventions::trace::URL_SCHEME;
use opentelemetry_semantic_conventions::trace::USER_AGENT_ORIGINAL;
use schemars::JsonSchema;
use serde::Deserialize;
#[cfg(test)]
use serde::Serialize;
use tower::BoxError;
use tracing::Span;

use crate::axum_factory::utils::ConnectionInfo;
use crate::context::OPERATION_KIND;
use crate::context::OPERATION_NAME;
use crate::plugins::telemetry::config_new::cost::SupergraphCostAttributes;
use crate::plugins::telemetry::config_new::trace_id;
use crate::plugins::telemetry::config_new::DatadogId;
use crate::plugins::telemetry::config_new::DefaultForLevel;
Expand Down Expand Up @@ -114,7 +113,7 @@ impl DefaultForLevel for RouterAttributes {
}

#[derive(Deserialize, JsonSchema, Clone, Default, Debug)]
#[cfg_attr(test, derive(Serialize, PartialEq))]
#[cfg_attr(test, derive(PartialEq))]
#[serde(deny_unknown_fields, default)]
pub(crate) struct SupergraphAttributes {
/// The GraphQL document being executed.
Expand All @@ -137,6 +136,10 @@ pub(crate) struct SupergraphAttributes {
/// Requirement level: Recommended
#[serde(rename = "graphql.operation.type")]
pub(crate) graphql_operation_type: Option<bool>,

/// Cost attributes for the operation being executed
#[serde(flatten)]
pub(crate) cost: SupergraphCostAttributes,
}

impl DefaultForLevel for SupergraphAttributes {
Expand Down Expand Up @@ -890,8 +893,10 @@ impl Selectors for SupergraphAttributes {
attrs
}

fn on_response(&self, _response: &supergraph::Response) -> Vec<KeyValue> {
Vec::default()
fn on_response(&self, response: &supergraph::Response) -> Vec<KeyValue> {
let mut attrs = Vec::new();
attrs.append(&mut self.cost.on_response(response));
attrs
}

fn on_error(&self, _error: &BoxError) -> Vec<KeyValue> {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
telemetry:
instrumentation:
instruments:
supergraph:
cost.actual: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
telemetry:
instrumentation:
instruments:
supergraph:
cost.actual:
attributes:
cost.result: true

0 comments on commit dc6c05e

Please sign in to comment.