# Patch summary (preamble text before the first "diff --git" header).
#
# This patch touches three files of an Ansible role:
#
# 1. defaults/main.yml
#    Changes netdata_version from 'v1.14.0' to 'v1.15.0'.
#
# 2. tasks/install.yml
#    Adds 'httpcheck' to the with_items list of a task whose dest is
#    '/etc/netdata/health.d/{{ item }}.conf' and which notifies
#    "Restart NetData". The task's module and src are outside the hunk.
#    NOTE(review): presumably this is a template task reading from
#    templates/health/ - confirm against the full task.
#
# 3. templates/health/httpcheck.conf (new file, 95 lines)
#    Defines nine alarm templates. All are attached to httpcheck.status
#    except the last two, which are attached to httpcheck.responsetime:
#      - httpcheck_last_collected_secs: $now - $last_collected_t, checked
#        every 10s, sent to sysadmin.
#      - web_service_up: 1-minute average percentage of "success", forced
#        to 0 when below 75; recipient is "silent".
#      - web_service_bad_content / web_service_bad_status: 5-minute average
#        percentage; warn at >= 10 and < 40, crit at >= 40; to webmaster.
#      - web_service_timeouts / no_web_service_connections: 5-minute
#        averages with no warn/crit lines and no recipient; their values
#        are referenced by name in web_service_unreachable.
#      - web_service_unreachable: calc takes the larger of the two values
#        above; warn when either is >= 10, crit when either is >= 40.
#      - 1h_web_service_response_time: 1-hour average of "time"; referenced
#        by web_service_slow.
#      - web_service_slow: 5-minute average of "time"; warn above 4x and
#        crit above 6x the 1-hour average.
#
# NOTE(review): the warn expression of web_service_unreachable bounds the
# upper side with OR (< 40 OR < 40), so it stays true when only one of the
# two values has reached 40. crit is also true in that case; presumably
# crit takes precedence over warn in Netdata - confirm.
#
# NOTE(review): this copy of the patch was recovered from a
# whitespace-collapsed form. Indentation of YAML context lines and the
# alignment of keys in httpcheck.conf are reconstructed by convention -
# confirm against the repository before applying.
diff --git a/defaults/main.yml b/defaults/main.yml
index e36482055d4764bed98260844630d40f632d3128..f41a6401b097252352f403ac626796d3497d2ed7 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -1,4 +1,4 @@
-netdata_version: 'v1.14.0'
+netdata_version: 'v1.15.0'
 netdata_force_reset: false
 netdata_fluentd_buffer:
   green: 75
diff --git a/tasks/install.yml b/tasks/install.yml
index 689dca5203bfcfb333f3c866d0b8528e704dc825..a2c6f2fa25fd0598d9d0ef19181d85c7ce1c47f2 100644
--- a/tasks/install.yml
+++ b/tasks/install.yml
@@ -44,5 +44,6 @@
     dest: '/etc/netdata/health.d/{{ item }}.conf'
   with_items:
     - 'fluentd_buffer'
+    - 'httpcheck'
   notify:
     - "Restart NetData"
diff --git a/templates/health/httpcheck.conf b/templates/health/httpcheck.conf
new file mode 100644
index 0000000000000000000000000000000000000000..4bb012e7bf76ec18401c320b21c57f83b4b7839d
--- /dev/null
+++ b/templates/health/httpcheck.conf
@@ -0,0 +1,95 @@
+template: httpcheck_last_collected_secs
+families: *
+      on: httpcheck.status
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+template: web_service_up
+families: *
+      on: httpcheck.status
+  lookup: average -1m unaligned percentage of success
+    calc: ($this < 75) ? (0) : ($this)
+   every: 5s
+   units: up/down
+    info: at least 75% verified responses during last 60 seconds, ideal for badges
+      to: silent
+
+template: web_service_bad_content
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of bad_content
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of unexpected http response content during the last 5 minutes
+      to: webmaster
+
+template: web_service_bad_status
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of bad_status
+   every: 10s
+   units: %
+    warn: $this >= 10 AND $this < 40
+    crit: $this >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of unexpected http status during the last 5 minutes
+      to: webmaster
+
+template: web_service_timeouts
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of timeout
+   every: 10s
+   units: %
+    info: average of timeouts during the last 5 minutes
+
+template: no_web_service_connections
+families: *
+      on: httpcheck.status
+  lookup: average -5m unaligned percentage of no_connection
+   every: 10s
+   units: %
+    info: average of failed requests during the last 5 minutes
+
+# combined timeout & no connection alarm
+template: web_service_unreachable
+families: *
+      on: httpcheck.status
+    calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+   units: %
+   every: 10s
+    warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
+    crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+   delay: down 5m multiplier 1.5 max 1h
+    info: average of failed requests either due to timeouts or no connection during the last 5 minutes
+      to: webmaster
+
+template: 1h_web_service_response_time
+families: *
+      on: httpcheck.responsetime
+  lookup: average -1h unaligned of time
+   every: 30s
+   units: ms
+    info: average response time over the last hour
+
+template: web_service_slow
+families: *
+      on: httpcheck.responsetime
+  lookup: average -5m unaligned of time
+   units: ms
+   every: 10s
+    warn: ($this > ($1h_web_service_response_time * 4) )
+    crit: ($this > ($1h_web_service_response_time * 6) )
+    info: average response time over the last 5 minutes, compared to the average over the last hour
+   delay: down 5m multiplier 1.5 max 1h
+      to: webmaster