Skip to content

Commit

Permalink
Merge pull request #18 from airbnb/jimmyngo/interferon_updates
Browse files Browse the repository at this point in the history
Interferon updates to v0.0.19
  • Loading branch information
jimmyngo committed Jan 5, 2017
2 parents b0cc2a5 + e38a1ef commit 53a33db
Show file tree
Hide file tree
Showing 13 changed files with 622 additions and 147 deletions.
4 changes: 4 additions & 0 deletions interferon.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ Gem::Specification.new do |gem|
gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"
gem.add_runtime_dependency "diffy", "~> 3.1.0", ">= 3.1.0"
gem.add_runtime_dependency "parallel", "~> 1.9", ">= 1.9.0"
gem.add_runtime_dependency "nokogiri", "< 1.7.0"
gem.add_runtime_dependency "tzinfo", "~> 1.2.2", ">= 1.2.2"

gem.add_development_dependency "rspec", "~> 3.2"
gem.add_development_dependency "pry", "~> 0.10"
Expand Down
179 changes: 134 additions & 45 deletions lib/interferon.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#require 'pry' #uncomment if you're debugging
require 'erb'
require 'ostruct'
require 'parallel'
require 'set'
require 'yaml'

Expand All @@ -23,11 +24,14 @@ class Interferon
# groups_sources is a hash from type => options for each group source
# host_sources is a hash from type => options for each host source
destinations is a similar hash from type => options for each alerter
# alerts_repo_path: path to the checkout of the alerts repository
# groups_sources: hash from type => options for each group source
# host_sources: hash from type => options for each host source
# destinations: similar hash from type => options for each alerter
# dry_run: when true, alerts are created under a dry-run prefix and
#          cleaned up instead of touching real alerts
# processes: worker process count for parallel alert evaluation
#            (nil lets the Parallel gem pick its default)
def initialize(alerts_repo_path, groups_sources, host_sources, destinations,
               dry_run = false, processes = nil)
  @alerts_repo_path = alerts_repo_path
  @groups_sources = groups_sources
  @host_sources = host_sources
  @destinations = destinations
  @dry_run = dry_run
  @processes = processes
  # flipped to true by the SIGTERM handler in #run for graceful shutdown
  @request_shutdown = false
end

Expand All @@ -36,7 +40,8 @@ def run(dry_run = false)
log.info "SIGTERM received. shutting down gracefully..."
@request_shutdown = true
end
run_desc = dry_run ? 'dry run' : 'run'
@dry_run = dry_run
run_desc = @dry_run ? 'dry run' : 'run'
log.info "beginning alerts #{run_desc}"

alerts = read_alerts
Expand All @@ -45,9 +50,12 @@ def run(dry_run = false)

@destinations.each do |dest|
dest['options'] ||= {}
if @dry_run
dest['options']['dry_run'] = true
end
end

update_alerts(@destinations, hosts, alerts, groups, dry_run)
update_alerts(@destinations, hosts, alerts, groups)

if @request_shutdown
log.info "interferon #{run_desc} shut down by SIGTERM"
Expand Down Expand Up @@ -133,23 +141,23 @@ def read_hosts(sources)
return hosts
end

# Pushes the generated alerts out to every configured destination.
# Dry-run behavior is carried in @dry_run / dest options rather than a
# positional flag, so the old trailing dry_run parameter is gone.
def update_alerts(destinations, hosts, alerts, groups)
  loader = DestinationsLoader.new([@alerts_repo_path])
  loader.get_all(destinations).each do |dest|
    # honor graceful shutdown requested via SIGTERM
    break if @request_shutdown
    log.info "updating alerts on #{dest.class.name}"
    update_alerts_on_destination(dest, hosts, alerts, groups)
  end
end

def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
def update_alerts_on_destination(dest, hosts, alerts, groups)
# track some counters/stats per destination
start_time = Time.new.to_f

# get already-defined alerts
existing_alerts = dest.existing_alerts.dup
existing_alerts = dest.existing_alerts

if dry_run
if @dry_run
do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
else
do_regular_update(dest, hosts, alerts, existing_alerts, groups)
Expand All @@ -159,7 +167,7 @@ def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
# run time summary
run_time = Time.new.to_f - start_time
statsd.histogram(
dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
@dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
run_time,
:tags => ["destination:#{dest.class.name}"])
log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
Expand All @@ -168,46 +176,103 @@ def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
dest.report_stats
end

if dry_run && !dest.api_errors.empty?
if @dry_run && !dest.api_errors.empty?
raise dest.api_errors.to_s
end
end

# Simulates an update against the destination without touching real alerts:
# every alert is created under DRY_RUN_ALERTS_NAME_PREFIX, silenced, and
# removed again at the end of the run.
def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
  # Track these to clean up dry-run alerts from previous runs
  existing_dry_run_alerts = []
  existing_alerts.each do |name, alert|
    if name.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)
      existing_dry_run_alerts << [alert['name'], [alert['id']]]
      existing_alerts.delete(name)
    end
  end

  alerts_queue = build_alerts_queue(hosts, alerts, groups)
  # only alerts that are new or differ from the API copy need to be created
  updates_queue = alerts_queue.reject do |name, alert_people_pair|
    !Interferon::need_update(dest, alert_people_pair, existing_alerts)
  end

  # Add dry-run prefix to alerts and delete id to avoid impacting real alerts
  existing_alerts.keys.each do |name|
    existing_alert = existing_alerts[name]
    dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + name
    existing_alert['name'] = dry_run_alert_name
    existing_alert['id'] = [nil]
    existing_alerts[dry_run_alert_name] = existing_alerts.delete(name)
  end

  # Build new queue with dry-run prefixes and ensure they are silenced
  alerts_queue.each do |name, alert_people_pair|
    alert = alert_people_pair[0]
    dry_run_alert_name = DRY_RUN_ALERTS_NAME_PREFIX + alert['name']
    alert.change_name(dry_run_alert_name)
    alert.silence
  end

  # Create alerts in destination
  created_alerts = create_alerts(dest, updates_queue)

  # Existing alerts are pruned until all that remains are alerts that
  # aren't being generated anymore
  to_remove = existing_alerts.dup
  alerts_queue.each do |name, alert_people_pair|
    alert = alert_people_pair[0]
    old_alerts = to_remove[alert['name']]

    if !old_alerts.nil?
      if old_alerts['id'].length == 1
        to_remove.delete(alert['name'])
      else
        old_alerts['id'] = old_alerts['id'].drop(1)
      end
    end
  end

  # Clean up alerts no longer being generated
  to_remove.each do |name, alert|
    break if @request_shutdown
    dest.remove_alert(alert)
  end

  # Clean up dry-run created alerts
  (created_alerts + existing_dry_run_alerts).each do |alert_id_pair|
    alert_ids = alert_id_pair[1]
    alert_ids.each do |alert_id|
      dest.remove_alert_by_id(alert_id)
    end
  end
end

def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
existing_alerts.each{ |key, existing_alert| existing_alert['still_exists'] = false }

alerts_queue = build_alerts_queue(hosts, alerts, groups)
updates_queue = alerts_queue.reject do |name, alert_people_pair|
!Interferon::need_update(dest, alert_people_pair, existing_alerts)
end

# flush queue
created_alerts_keys = create_alerts(dest, alerts_queue).map{|a| a[0]}
created_alerts_keys.each do |alert_key|
# don't delete alerts we still have defined
existing_alerts[alert_key]['still_exists'] = true if existing_alerts.include?(alert_key)
# Create alerts in destination
create_alerts(dest, updates_queue)

# Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
to_remove = existing_alerts.dup
alerts_queue.each do |name, alert_people_pair|
alert = alert_people_pair[0]
old_alerts = to_remove[alert['name']]

if !old_alerts.nil?
if old_alerts['id'].length == 1
to_remove.delete(alert['name'])
else
old_alerts['id'] = old_alerts['id'].drop(1)
end
end
end

# remove existing alerts that shouldn't exist
to_delete = existing_alerts.reject{ |key, existing_alert| existing_alert['still_exists'] }
to_delete.each do |key, alert|
# Clean up alerts no longer being generated
to_remove.each do |name, alert|
break if @request_shutdown
dest.remove_alert(alert)
end
Expand Down Expand Up @@ -237,10 +302,11 @@ def create_alerts(dest, alerts_queue)
end

def build_alerts_queue(hosts, alerts, groups)
alerts_queue = {}
# create or update alerts; mark when we've done that
alerts_queue = Hash.new
alerts.each do |alert|
result = Parallel.map(alerts, in_processes: @processes) do |alert|
break if @request_shutdown
alerts_generated = {}
counters = {
:errors => 0,
:evals => 0,
Expand Down Expand Up @@ -268,7 +334,7 @@ def build_alerts_queue(hosts, alerts, groups)

counters[:applies] += 1
# don't define alerts twice
next if alerts_queue.key?(alert[:name])
next if alerts_generated.key?(alert[:name])

# figure out who to notify
people = Set.new(alert[:notify][:people])
Expand All @@ -277,7 +343,7 @@ def build_alerts_queue(hosts, alerts, groups)
end

# queue the alert up for creation; we clone the alert to save the current state
alerts_queue[alert[:name]] ||= [alert.clone, people]
alerts_generated[alert[:name]] = [alert.clone, people]
end

# log some of the counters
Expand All @@ -289,7 +355,7 @@ def build_alerts_queue(hosts, alerts, groups)
end

# did the alert fail to evaluate on all hosts?
if counters[:errors] == counters[:hosts]
if counters[:errors] == counters[:hosts] && !last_eval_error.nil?
log.error "alert #{alert} failed to evaluate in the context of all hosts!"
log.error "last error on alert #{alert}: #{last_eval_error}"

Expand All @@ -306,25 +372,48 @@ def build_alerts_queue(hosts, alerts, groups)
else
statsd.gauge('alerts.evaluate.never_applies', 0, :tags => ["alert:#{alert}"])
end
alerts_generated
end

result.each do |alerts_generated|
alerts_queue.merge! alerts_generated
end
alerts_queue
end

# Returns true when the locally-defined alert must be (re)created on the
# destination: either no alert with that name exists in the API snapshot,
# or the existing one differs from what interferon would generate.
def self.need_update(dest, alert_people_pair, existing_alerts_from_api)
  alert = alert_people_pair[0]
  existing = existing_alerts_from_api[alert['name']]
  existing.nil? || !same_alerts(dest, alert_people_pair, existing)
end

# Compares a locally-generated alert against the copy returned by the
# destination API. Only the fields interferon manages are compared; the
# message is normalized through dest.generate_message so people/notify
# boilerplate matches what would actually be pushed.
def self.same_alerts(dest, alert_people_pair, alert_api_json)
  alert, people = alert_people_pair

  prev_alert = {
    :query => alert_api_json['query'].strip,
    :message => alert_api_json['message'].strip,
    :notify_no_data => alert_api_json['notify_no_data'],
    :silenced => alert_api_json['silenced'],
    :timeout => alert_api_json['timeout_h'],
    :no_data_timeframe => alert_api_json['no_data_timeframe']
  }

  new_alert = {
    :query => alert['metric']['datadog_query'].strip,
    :message => dest.generate_message(alert['message'], people).strip,
    :notify_no_data => alert['notify_no_data'],
    # an alert is effectively silenced if flagged or silenced into the future
    :silenced => alert['silenced'] || alert['silenced_until'] > Time.now,
    # API reports timeout in hours; local value is seconds (minimum 1h)
    :timeout => alert['timeout'] ? [1, alert['timeout'].to_i / 3600].max : nil,
    :no_data_timeframe => alert['no_data_timeframe'] || nil
  }

  prev_alert == new_alert
end

end
end
8 changes: 8 additions & 0 deletions lib/interferon/alert.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def change_name(name)
@dsl.name(name)
end

# Marks this alert as silenced by pushing silenced(true) into the DSL.
# Raises if the alert has not been evaluated yet (no @dsl available).
def silence
  raise "This alert has not yet been evaluated" unless @dsl

  @dsl.silenced(true)
end

def [](attr)
unless @dsl
raise "This alert has not yet been evaluated"
Expand Down
10 changes: 10 additions & 0 deletions lib/interferon/alert_dsl.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
require 'interferon/work_hours_helper'

module Interferon
module DSLMixin
Expand Down Expand Up @@ -47,6 +48,15 @@ def silenced_until(v = nil, &block)
get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
end

# True when the current time falls inside configured work hours.
# Supported args:
#   :hours    => range of work hours (0 to 23h), e.g. (9..16)
#   :days     => range of week days (0 = Sunday), e.g. (1..5) for Mon-Fri
#   :timezone => e.g. 'America/Los_Angeles'
# Defaults (per WorkHoursHelper) are 9-to-5, Monday-Friday, PST.
def is_work_hour?(args = {})
  current_utc_time = Time.now.utc
  WorkHoursHelper.is_work_hour?(current_utc_time, args)
end

# DSL attribute for the alert's notify-no-data flag; appears to follow the
# mixin's get_or_set getter/setter pattern (stores v or the block, returns
# the current value, default false) — semantics defined by get_or_set,
# which is outside this view.
def notify_no_data(v = nil, &block)
  get_or_set(:@notify_no_data, v, block, false)
end
Expand Down

0 comments on commit 53a33db

Please sign in to comment.