Skip to content

Commit

Permalink
[interferon] Have dry-run output diff format
Browse files Browse the repository at this point in the history
* Add Diffy
* Dry-run will now output a diff format for alerts
* Moved dry-run to the destination module as it simplifies the logic
  and the output becomes cleaner
* Remove dry-run alerts from known existing alerts during dry-run
* Bump version to 0.0.13
  • Loading branch information
Jimmy Ngo authored and Jimmy Ngo committed Jan 3, 2017
1 parent 98a8fdb commit 28d7511
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 58 deletions.
1 change: 1 addition & 0 deletions interferon.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Gem::Specification.new do |gem|
gem.add_runtime_dependency "dogapi", "~> 1.11", ">= 1.11.1"
gem.add_runtime_dependency "aws-sdk", "~> 1.35", ">= 1.35.1"
gem.add_runtime_dependency "dogstatsd-ruby", "~> 1.4", ">= 1.4.1"
gem.add_runtime_dependency "diffy", "~> 3.1.0", ">= 3.1.0"

gem.add_development_dependency "rspec", "~> 3.2"
gem.add_development_dependency "pry", "~> 0.10"
Expand Down
76 changes: 50 additions & 26 deletions lib/interferon.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ class Interferon
# groups_sources is a hash from type => options for each group source
# host_sources is a hash from type => options for each host source
# destinations is a similar hash from type => options for each alerter
def initialize(alerts_repo_path, groups_sources, host_sources, destinations)
def initialize(alerts_repo_path, groups_sources, host_sources, destinations, dry_run=false)
@alerts_repo_path = alerts_repo_path
@groups_sources = groups_sources
@host_sources = host_sources
@destinations = destinations
@dry_run = dry_run
@request_shutdown = false
end

Expand All @@ -36,7 +37,8 @@ def run(dry_run = false)
log.info "SIGTERM received. shutting down gracefully..."
@request_shutdown = true
end
run_desc = dry_run ? 'dry run' : 'run'
@dry_run = dry_run
run_desc = @dry_run ? 'dry run' : 'run'
log.info "beginning alerts #{run_desc}"

alerts = read_alerts
Expand All @@ -45,9 +47,12 @@ def run(dry_run = false)

@destinations.each do |dest|
dest['options'] ||= {}
if @dry_run
dest['options']['dry_run'] = true
end
end

update_alerts(@destinations, hosts, alerts, groups, dry_run)
update_alerts(@destinations, hosts, alerts, groups)

if @request_shutdown
log.info "interferon #{run_desc} shut down by SIGTERM"
Expand Down Expand Up @@ -133,23 +138,23 @@ def read_hosts(sources)
return hosts
end

def update_alerts(destinations, hosts, alerts, groups, dry_run)
def update_alerts(destinations, hosts, alerts, groups)
loader = DestinationsLoader.new([@alerts_repo_path])
loader.get_all(destinations).each do |dest|
break if @request_shutdown
log.info "updating alerts on #{dest.class.name}"
update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
update_alerts_on_destination(dest, hosts, alerts, groups)
end
end

def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
def update_alerts_on_destination(dest, hosts, alerts, groups)
# track some counters/stats per destination
start_time = Time.new.to_f

# get already-defined alerts
existing_alerts = dest.existing_alerts

if dry_run
if @dry_run
do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
else
do_regular_update(dest, hosts, alerts, existing_alerts, groups)
Expand All @@ -159,7 +164,7 @@ def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
# run time summary
run_time = Time.new.to_f - start_time
statsd.histogram(
dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
@dry_run ? 'destinations.run_time.dry_run' : 'destinations.run_time',
run_time,
:tags => ["destination:#{dest.class.name}"])
log.info "#{dest.class.name} : run completed in %.2f seconds" % (run_time)
Expand All @@ -168,7 +173,7 @@ def update_alerts_on_destination(dest, hosts, alerts, groups, dry_run)
dest.report_stats
end

if dry_run && !dest.api_errors.empty?
if @dry_run && !dest.api_errors.empty?
raise dest.api_errors.to_s
end
end
Expand All @@ -179,6 +184,7 @@ def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
existing_alerts.each do |name, alert|
if name.start_with?(DRY_RUN_ALERTS_NAME_PREFIX)
existing_dry_run_alerts << [alert['name'], alert['id']]
existing_alerts.remove(name)
end
end

Expand All @@ -199,8 +205,12 @@ def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
alert.change_name(dry_run_alert_name)
end

updates_queue = alerts_queue.reject{
|name, alert_people_pair| !Interferon::need_update(dest, alert_people_pair, existing_alerts)}
updates_queue = alerts_queue.reject do |name, alert_people_pair|
!Interferon::need_update(dest, alert_people_pair, existing_alerts)
end

# Create alerts in destination
created_alerts = create_alerts(dest, updates_queue)

# Existing alerts are pruned until all that remains are alerts that aren't being generated anymore
to_remove = existing_alerts.dup
Expand All @@ -209,27 +219,25 @@ def do_dry_run_update(dest, hosts, alerts, existing_alerts, groups)
to_remove.delete(alert['name'])
end

# Create alerts in destination
created_alerts = create_alerts(dest, updates_queue)
# Clean up alerts no longer being generated
to_remove.each do |name, alert|
break if @request_shutdown
dest.remove_alert(alert)
end

# Clean up dry-run created alerts
(created_alerts + existing_dry_run_alerts).each do |alert_id_pair|
alert_id = alert_id_pair[1]
dest.remove_alert_by_id(alert_id)
end

# Log dry-run creations that are actually updates
dry_run_updates = updates_queue.reject{
|name, alert_people_pair| existing_alerts[alert_people_pair[0]['name']].nil?}
log.info "datadog: [dry-run] #{dry_run_updates.length} alerts pending update: #{dry_run_updates.keys}"
# Log clean up of alerts no longer being generated
log.info "datadog: [dry-run] #{to_remove.length} alerts pending deletion: #{to_remove.keys}"
end

def do_regular_update(dest, hosts, alerts, existing_alerts, groups)
alerts_queue = build_alerts_queue(hosts, alerts, groups)
updates_queue = alerts_queue.reject{
|name, alert_people_pair| !Interferon::need_update(dest, alert_people_pair, existing_alerts)}
updates_queue = alerts_queue.reject do |name, alert_people_pair|
!Interferon::need_update(dest, alert_people_pair, existing_alerts)
end

# Create alerts in destination
create_alerts(dest, updates_queue)
Expand Down Expand Up @@ -357,11 +365,27 @@ def self.need_update(dest, alert_people_pair, existing_alerts_from_api)

# Compares a locally generated alert against the alert JSON returned by the API.
#
# dest              - object that responds to generate_message(message, people)
# alert_people_pair - two-element pair unpacked as [alert, people]
# alert_api_json    - hash-like API representation of the existing alert
#
# Returns the result of prev_alert == new_alert, i.e. true only when query,
# message, notify_no_data, silenced, timeout and no_data_timeframe all match.
def self.same_alerts(dest, alert_people_pair, alert_api_json)
alert, people = alert_people_pair
# NOTE(review): the four locals and the `and` comparison below are not used in
# the value this method returns; they look like the previous implementation
# left visible by the diff rendering — confirm before relying on them.
query1 = alert['metric']['datadog_query'].strip
message1 = dest.generate_message(alert['message'], people).strip
query2 = alert_api_json['query'].strip
message2 = alert_api_json['message'].strip
query1 == query2 and message1 == message2

# Fields of the alert as it currently exists on the API side.
prev_alert = {
:query => alert_api_json['query'].strip,
:message => alert_api_json['message'].strip,
:notify_no_data => alert_api_json['notify_no_data'],
:silenced => alert_api_json['silenced'],
# The API-side key is 'timeout_h', while the local alert is read with 'timeout'.
:timeout => alert_api_json['timeout_h'],
:no_data_timeframe => alert_api_json['no_data_timeframe']
}

# The same fields as they would be generated from the local alert definition.
new_alert = {
:query => alert['metric']['datadog_query'].strip,
:message => dest.generate_message(alert['message'], people).strip,
:notify_no_data => alert['notify_no_data'],
# NOTE(review): when alert['silenced'] is nil/false the right-hand side is
# evaluated; if alert['silenced_until'] is nil at that point, `>` raises
# NoMethodError — confirm the alert always supplies silenced_until.
:silenced => alert['silenced'] || alert['silenced_until'] > Time.now,
# `|| nil` turns a false value into nil so it compares equal to an absent API field.
:timeout => alert['timeout'] || nil,
:no_data_timeframe => alert['no_data_timeframe'] || nil
}

prev_alert == new_alert
end

end
end
58 changes: 37 additions & 21 deletions lib/interferon/destinations/datadog.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
require 'diffy'
require 'dogapi'
require 'set'

Diffy::Diff.default_format = :text

module Interferon::Destinations
class Datadog
include ::Interferon::Logging
Expand Down Expand Up @@ -32,6 +35,7 @@ def initialize(options)
@dog = Dogapi::Client.new(*args)

@existing_alerts = nil
@dry_run = options['dry_run']

# create datadog alerts 10 at a time
@concurrency = 10
Expand Down Expand Up @@ -108,11 +112,13 @@ def create_alert(alert, people)

datadog_query = alert['metric']['datadog_query'].strip
existing_alert = existing_alerts[alert['name']]

# new alert, create it
if existing_alert.nil? or existing_alert['id'].nil?
if existing_alert.nil?
action = :creating
@stats[:alerts_to_be_created] += 1
log.info("creating new alert #{alert['name']}: #{datadog_query} #{message}")
new_alert_text = "Query: #{datadog_query} Message: #{message.split().join(' ')}"
log.info("creating new alert #{alert['name']}: #{new_alert_text}")

resp = @dog.alert(
alert['metric']['datadog_query'].strip,
Expand All @@ -125,19 +131,23 @@ def create_alert(alert, people)
@stats[:alerts_to_be_updated] += 1
id = existing_alert['id']

if datadog_query != existing_alert['query'] and message != existing_alert['message']
log.info("updating existing alert #{id} (#{alert['name']}) with new message and query: #{datadog_query} #{message}")
elsif message != existing_alert['message']
log.info("updating existing alert #{id} (#{alert['name']}) with new message: #{message}")
new_alert_text = "Query:\n#{datadog_query}\nMessage:\n#{message}"
existing_alert_text = "Query:\n#{existing_alert['query']}\nMessage:\n#{existing_alert['message']}\n"
diff = Diffy::Diff.new(existing_alert_text, new_alert_text, :context=>1)
log.info("updating existing alert #{id} (#{alert['name']}): #{diff}")

if @dry_run
resp = @dog.alert(
alert['metric']['datadog_query'].strip,
alert_opts,
)
else
log.info("updating existing alert #{id} (#{alert['name']}) with query: #{datadog_query}")
resp = @dog.update_alert(
id,
alert['metric']['datadog_query'].strip,
alert_opts
)
end

resp = @dog.update_alert(
id,
alert['metric']['datadog_query'].strip,
alert_opts
)
end

# log whenever we've encountered errors
Expand All @@ -159,7 +169,19 @@ def create_alert(alert, people)

def remove_alert(alert)
if alert['message'].include?(ALERT_KEY)
remove_alert_by_id(alert['id'])
@stats[:alerts_to_be_deleted] += 1
log.info("deleting alert: #{alert['name']}")

if not @dry_run
resp = @dog.delete_alert(alert['id'])
code = resp[0].to_i
log_datadog_response_code(resp, code, :deleting)

if !(code >= 300 || code == -1)
# assume this was a success
@stats[:alerts_deleted] += 1
end
end
else
log.warn("not deleting manually-created alert #{alert['id']} (#{alert['name']})")
end
Expand All @@ -181,17 +203,11 @@ def report_stats
end

def remove_alert_by_id(alert_id)
# This should only be used by dry-run to clean up created dry-run alerts
log.debug("deleting alert, id: #{alert_id}")
@stats[:alerts_to_be_deleted] += 1

resp = @dog.delete_alert(alert_id)
code = resp[0].to_i
log_datadog_response_code(resp, code, :deleting)

if !(code >= 300 || code == -1)
# assume this was a success
@stats[:alerts_deleted] += 1
end
end

def log_datadog_response_code(resp, code, action, alert=nil)
Expand Down
1 change: 1 addition & 0 deletions lib/interferon/group_sources/filesystem.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
include ::Interferon::Logging

module Interferon::GroupSources
class Filesystem
Expand Down
2 changes: 1 addition & 1 deletion lib/interferon/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module Interferon
VERSION = "0.0.12"
VERSION = "0.0.13"
end
11 changes: 10 additions & 1 deletion spec/helpers/dsl_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ def initialize
def get_or_set(field, val, block, default)
@hash ||= Hash.new
if val.nil?
return @hash[field]
f = @hash[field]
f.nil? ? default : f
else
@hash[field] = val
end
Expand All @@ -27,6 +28,14 @@ def metric(v = nil)
# Accessor for the :@id field: forwards v, the given block and a default of ''
# to get_or_set.
def id(v = nil, &block)
get_or_set(:@id, v, block, '')
end

# Accessor for the :@silenced field: forwards v, the given block and a default
# of false to get_or_set.
def silenced(v = nil, &block)
get_or_set(:@silenced, v, block, false)
end

# Accessor for the :@silenced_until field. A truthy v is converted with
# Time.parse before being forwarded to get_or_set; a nil v is forwarded as nil.
# The default handed to get_or_set is Time.at(0).
# NOTE(review): Time.parse comes from the stdlib 'time' library — confirm it is
# required somewhere before this helper is used.
def silenced_until(v = nil, &block)
get_or_set(:@silenced_until, v && Time.parse(v), block, Time.at(0))
end
end

class MockNotifyDSL < NotifyDSL
Expand Down

0 comments on commit 28d7511

Please sign in to comment.