Commit 1b12d939 authored by Jürgen Haas's avatar Jürgen Haas
Browse files

ansible-inventories/arocom#2969 Rearchitect netdata role and config

parent b9806b82
......@@ -10,7 +10,7 @@
- name: Apache Status Site
template:
src: apache.conf
src: etc/apache2/sites-available/status.conf
dest: /etc/apache2/sites-available/status{{ apache_conf_ext|default(".conf") }}
owner: root
group: root
......@@ -27,7 +27,7 @@
- name: Weblog Module Configuration
template:
src: web_log.conf
src: go.d/web_log.conf
dest: /etc/netdata/go.d/web_log.conf
owner: netdata
group: netdata
......@@ -35,7 +35,7 @@
- name: Install Custom Alerts
template:
src: health/web_log.conf
dest: /etc/netdata/health.d/web_log.conf
src: health.d/web_log.conf
dest: /etc/netdata/health.d/my_web_log.conf
notify:
- Restart NetData
......@@ -17,8 +17,6 @@
backup: yes
with_items:
- health_alarm_notify.conf
- python.d.conf
- go.d.conf
- ebpf.conf
- exporting.conf
notify:
......@@ -26,7 +24,7 @@
- name: Configure Python Plugins
template:
src: '{{ item }}'
src: python.d/{{ item }}
dest: /etc/netdata/python.d/{{ item }}
owner: netdata
group: netdata
......@@ -41,7 +39,7 @@
- name: Configure Go Plugins
template:
src: '{{ item }}'
src: 'go.d/{{ item }}'
dest: /etc/netdata/go.d/{{ item }}
owner: netdata
group: netdata
......
......@@ -3,7 +3,7 @@
- name: Configure ElasticSearch Plugin
template:
src: elasticsearch.conf
src: go.d/elasticsearch.conf
dest: /etc/netdata/go.d/elasticsearch.conf
owner: netdata
group: netdata
......
......@@ -8,8 +8,8 @@
append: yes
- name: Configure HaProxy Plugin
copy:
src: haproxy.conf
template:
src: python.d/haproxy.conf
dest: /etc/netdata/python.d/haproxy.conf
owner: netdata
group: netdata
......
......@@ -29,13 +29,14 @@
chdir: /opt/{{ netdata_local_archive|default('netdata') }}
when: netdata_clone.changed or netdata_extract.changed
- name: Install Custom Alerts
- name: Install Custom Alert Templates
template:
src: health/{{ item }}.conf
dest: /etc/netdata/health.d/{{ item }}.conf
src: health.d/{{ item }}.conf
dest: /etc/netdata/health.d/my_{{ item }}.conf
with_items:
- fluentd_buffer
- httpcheck
- oracle
notify:
- Restart NetData
......
# TODO: Review whether these settings are still valid
[global]
ebpf load mode = return
disable apps = no
......
# configure the netdata ports
# (this variable is what makes the "server netdata" match in the web_server
# class further down resolve to tcp/19999)
server_netdata_ports="tcp/19999"
# Traffic classes for eth0. Each "class" line opens a class and the
# match/server/client lines that follow it select the traffic it contains.
interface eth0 world bidirectional ethernet balanced rate 100Mbit
class arp
match arp
class icmp
match icmp
class dns commit 1Mbit
server dns
client dns
class ntp
server ntp
client ntp
class ssh commit 2Mbit
server ssh
client ssh
class rsync commit 2Mbit max 10Mbit
server rsync
client rsync
# web traffic and the netdata dashboard share one class
class web_server commit 40Mbit
server http
server netdata
class client
client surfing
#!/bin/bash
# chkconfig: 345 99 01
# description: startup script

# Load the LSB helper functions (start_daemon, killproc, status_of_proc).
. /lib/lsb/init-functions

# Daemon name and the locations derived from it.
DAEMON=netdata
DAEMON_PATH="/usr/sbin"
PIDFILE="/var/run/${DAEMON}.pid"
# Stored as a single string; it is expanded unquoted where the daemon is
# started so that it splits into separate arguments.
DAEMONOPTS="-pidfile ${PIDFILE}"
# NOTE(review): not referenced anywhere in the visible part of this script.
STOP_TIMEOUT=10
# Start the daemon through the LSB start_daemon helper.
# Returns start_daemon's exit status, so a failed start is reported to the
# caller instead of being masked by the trailing echo.
service_start()
{
    local rc=0

    printf "%-50s" "Starting $DAEMON..."
    # DAEMONOPTS is deliberately unquoted: it carries several
    # space-separated arguments that must be split into words.
    start_daemon "$DAEMON_PATH/$DAEMON" $DAEMONOPTS || rc=$?
    echo
    return "$rc"
}
# Stop the daemon through the LSB killproc helper and remove its pid file.
# Returns killproc's exit status, so a failed stop is reported to the caller
# instead of being masked by the rm/echo that follow it.
service_stop()
{
    local rc=0

    printf "%-50s" "Stopping $DAEMON..."
    killproc -p "${PIDFILE}" "$DAEMON_PATH/$DAEMON" || rc=$?
    # The pid file is removed unconditionally, exactly as before.
    rm -f "${PIDFILE}"
    echo
    return "$rc"
}
# Report whether the daemon is running; the exit status is the one returned
# by status_of_proc.
# NOTE(review): assumes the Debian flavour of /lib/lsb/init-functions, where
# status_of_proc takes "<daemon path> <display name>". Without the display
# name the printed message starts with an empty name (" is running").
service_status()
{
    status_of_proc -p "${PIDFILE}" "$DAEMON_PATH/$DAEMON" "$DAEMON"
}
# Dispatch on the requested action (first command-line argument).
case "$1" in
    start)
        service_start ;;
    status)
        service_status ;;
    stop)
        service_stop ;;
    restart)
        # restart is simply stop followed by start
        service_stop
        service_start
        ;;
    *)
        # anything else: print usage and signal an error
        echo "Usage: $0 {status|start|stop|restart}"
        exit 1
        ;;
esac
# Rotation policy for every file matching /var/log/netdata/*.log
/var/log/netdata/*.log {
# rotate once a day and keep 7 rotated files
daily
rotate 7
# do not complain when no log file matches the pattern
missingok
# compress rotated files, postponing compression of the newest one by a cycle
compress
delaycompress
# leave empty log files alone
notifempty
# run the postrotate script once for the whole pattern, not once per file
sharedscripts
# after rotation, send HUP to the running netdata process(es); any failure
# (for example netdata not running) is ignored because of the "|| true"
postrotate
/bin/kill -HUP `pidof netdata 2>/dev/null` 2>/dev/null || true
endscript
}
# netdata go.d.plugin configuration
#
# This file is in YAML format.
# Enable/disable the whole go.d.plugin.
enabled: yes
# Enable/disable default value for all modules.
default_run: yes
# Maximum number of used CPUs. Zero means no limit.
max_procs: 0
# Enable/disable specific go.d.plugin modules.
# Commented-out entries are not set here.
modules:
# activemq: yes
# apache: yes
# bind: yes
# consul: yes
# coredns: yes
# dns_query: yes
# docker_engine: yes
# dockerhub: yes
# example: no
# freeradius: yes
# httpcheck: yes
# k8s_kubelet: yes
# k8s_kubeproxy: yes
# lighttpd: yes
# lighttpd2: yes
# logstash: yes
# mysql: yes
# nginx: yes
# openvpn: yes
# portcheck: yes
# rabbitmq: yes
# scaleio: yes
# solr: yes
# springboot2: yes
# tengine: yes
# unbound: yes
# vsphere: yes
# web_log is rendered by the template engine: "yes" only when the inventory
# defines a "webserver" group and the current host belongs to it, else "no".
web_log: {{ (groups['webserver'] is defined and inventory_hostname in groups['webserver'])|ternary("yes", "no") }}
# wmi: yes
# x509check: yes
# Change: this is our own template, it doesn't overwrite anything from stock
alarm: fluentd_local.buffer_total_queued_size
on: fluentd_local.buffer_total_queued_size
os: linux
......
# Change: use 5 instead of 3 minutes and multiply with 4/6 instead of 2/3
template: web_service_slow
families: *
on: httpcheck.responsetime
lookup: average -5m unaligned of time
units: ms
every: 10s
warn: ($this > ($1h_web_service_response_time * 4) )
crit: ($this > ($1h_web_service_response_time * 6) )
info: average response time over the last 5 minutes, compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: webmaster
# Change: this is our own template, it doesn't overwrite anything from stock
template: oracle_space_usage
on: tablespace_usage_in_percent
os: linux freebsd
......
# Change: make successful rate silent
template: 1m_successful
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned of successful_requests
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
to: silent
template: httpcheck_last_collected_secs
families: *
on: httpcheck.status
calc: $now - $last_collected_t
every: 10s
units: seconds ago
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
template: web_service_up
families: *
on: httpcheck.status
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
units: up/down
info: at least 75% verified responses during last 60 seconds, ideal for badges
to: silent
template: web_service_bad_content
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_content
every: 10s
units: %
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of unexpected http response content during the last 5 minutes
to: webmaster
template: web_service_bad_status
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_status
every: 10s
units: %
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of unexpected http status during the last 5 minutes
to: webmaster
template: web_service_timeouts
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
info: average of timeouts during the last 5 minutes
template: no_web_service_connections
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of no_connection
every: 10s
units: %
info: average of failed requests during the last 5 minutes
# combined timeout & no connection alarm
template: web_service_unreachable
families: *
on: httpcheck.status
calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
units: %
every: 10s
warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
delay: down 5m multiplier 1.5 max 1h
info: average of failed requests either due to timeouts or no connection during the last 5 minutes
to: webmaster
template: 1h_web_service_response_time
families: *
on: httpcheck.responsetime
lookup: average -1h unaligned of time
every: 30s
units: ms
info: average response time over the last hour
template: web_service_slow
families: *
on: httpcheck.responsetime
lookup: average -5m unaligned of time
units: ms
every: 10s
warn: ($this > ($1h_web_service_response_time * 4) )
crit: ($this > ($1h_web_service_response_time * 6) )
info: average response time over the last 5 minutes, compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: webmaster
# make sure we can collect web log data
template: last_collected_secs
on: web_log.response_codes
families: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
# -----------------------------------------------------------------------------
# high level response code alarms
# the following alarms trigger only when there is enough data.
# we assume there is enough data when:
#
# $1m_requests > 120
#
# i.e. when there are at least 120 requests during the last minute
template: 1m_requests
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
info: the sum of all HTTP requests over the last minute
template: 1m_successful
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned of successful_requests
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
to: silent
template: 1m_redirects
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned of redirects
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: the ratio of HTTP redirects (3xx except 304) over the last minute
to: webmaster
template: 1m_bad_requests
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned of bad_requests
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: the ratio of HTTP bad requests (4xx) over the last minute
to: webmaster
template: 1m_internal_errors
on: web_log.response_statuses
families: *
lookup: sum -1m unaligned of server_errors
calc: $this * 100 / $1m_requests
units: %
every: 10s
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
info: the ratio of HTTP internal server errors (5xx), over the last minute
to: webmaster
# unmatched lines
# the following alarms trigger only when there is enough data.
# we assume there is enough data when:
#
# $1m_total_requests > 120
#
# i.e. when there are at least 120 requests during the last minute
template: 1m_total_requests
on: web_log.response_codes
families: *
lookup: sum -1m unaligned
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
info: the sum of all HTTP requests over the last minute
template: 1m_unmatched
on: web_log.response_codes
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $1m_total_requests
units: %
every: 10s
warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
crit: ($1m_total_requests > 120) ? ($this > 5) : ( 0 )
delay: up 1m down 5m multiplier 1.5 max 1h
info: the ratio of unmatched lines, over the last minute
to: webmaster
# -----------------------------------------------------------------------------
# web slow
# the following alarms trigger only when there is enough data.
# we assume there is enough data when:
#
# $1m_requests > 120
#
# i.e. when there are at least 120 requests during the last minute
template: 10m_response_time
on: web_log.response_time
families: *
lookup: average -10m unaligned of avg
units: ms
every: 30s
info: the average time to respond to HTTP requests, over the last 10 minutes
template: web_slow
on: web_log.response_time
families: *
lookup: average -1m unaligned of avg
units: ms
every: 10s
green: 500
red: 1000
warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
info: the average time to respond to HTTP requests, over the last 1 minute
to: webmaster
# -----------------------------------------------------------------------------
# web too many or too few requests
# the following alarms trigger only when there is enough data.
# we assume there is enough data when:
#
# $5m_successful_old > 120
#
# i.e. when there were at least 120 requests during the 5 minutes starting
# at -10m and ending at -5m
template: 5m_successful_old
on: web_log.response_statuses
families: *
lookup: average -5m at -5m unaligned of successful_requests
units: requests/s
every: 30s
info: average rate of successful HTTP requests over the last 5 minutes
template: 5m_successful
on: web_log.response_statuses
families: *
lookup: average -5m unaligned of successful_requests
units: requests/s
every: 30s
info: average successful HTTP requests over the last 5 minutes
template: 5m_requests_ratio
on: web_log.response_codes
families: *
calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
units: %
every: 30s
warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
delay: down 15m multiplier 1.5 max 1h
info: the percentage of successful web requests over the last 5 minutes, \
compared with the previous 5 minutes \
(clear notification for this alarm will not be sent)
to: webmaster
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment