diff options
Diffstat (limited to 'lass/2configs')
-rw-r--r-- | lass/2configs/default.nix | 1 | ||||
-rw-r--r-- | lass/2configs/monitoring/node-exporter.nix | 15 | ||||
-rw-r--r-- | lass/2configs/monitoring/prometheus-server.nix | 217 |
3 files changed, 0 insertions, 233 deletions
diff --git a/lass/2configs/default.nix b/lass/2configs/default.nix index 9dc2eed21..7ee83c174 100644 --- a/lass/2configs/default.nix +++ b/lass/2configs/default.nix @@ -8,7 +8,6 @@ with import <stockholm/lib>; ./gc.nix ./mc.nix ./vim.nix - ./monitoring/node-exporter.nix ./zsh.nix ./htop.nix ./security-workarounds.nix diff --git a/lass/2configs/monitoring/node-exporter.nix b/lass/2configs/monitoring/node-exporter.nix deleted file mode 100644 index 561e3a25c..000000000 --- a/lass/2configs/monitoring/node-exporter.nix +++ /dev/null @@ -1,15 +0,0 @@ -{ config, lib, pkgs, ... }: -{ - krebs.iptables.tables.filter.INPUT.rules = [ - { predicate = "-i retiolum -p tcp --dport 9100 -s ${config.krebs.hosts.prism.nets.retiolum.ip4.addr}"; target = "ACCEPT"; v6 = false; } - { predicate = "-i retiolum -p tcp --dport 9100 -s ${config.krebs.hosts.prism.nets.retiolum.ip6.addr}"; target = "ACCEPT"; v4 = false; } - ]; - services.prometheus.exporters = { - node = { - enable = true; - enabledCollectors = [ - "systemd" - ]; - }; - }; -} diff --git a/lass/2configs/monitoring/prometheus-server.nix b/lass/2configs/monitoring/prometheus-server.nix deleted file mode 100644 index b7083c776..000000000 --- a/lass/2configs/monitoring/prometheus-server.nix +++ /dev/null @@ -1,217 +0,0 @@ -{ pkgs, lib, config, ... }: -{ - #networking = { - # firewall.allowedTCPPorts = [ - # 3000 # grafana - # 9090 # prometheus - # 9093 # alertmanager - # ]; - # useDHCP = true; - #}; - - krebs.iptables.tables.filter.INPUT.rules = [ - { predicate = "-i retiolum -p tcp --dport 3000"; target = "ACCEPT"; } - { predicate = "-i retiolum -p tcp --dport 9090"; target = "ACCEPT"; } - { predicate = "-i retiolum -p tcp --dport 9093"; target = "ACCEPT"; } - ]; - - services = { - prometheus = { - enable = true; - extraFlags = [ - "-storage.local.retention 8760h" - "-storage.local.series-file-shrink-ratio 0.3" - "-storage.local.memory-chunks 2097152" - "-storage.local.max-chunks-to-persist 1048576" - "-storage.local.index-cache-size.fingerprint-to-metric 2097152" - "-storage.local.index-cache-size.fingerprint-to-timerange 1048576" - "-storage.local.index-cache-size.label-name-to-label-values 2097152" - "-storage.local.index-cache-size.label-pair-to-fingerprints 41943040" - ]; - alertmanagerURL = [ "http://localhost:9093" ]; - rules = [ - '' - ALERT node_down - IF up == 0 - FOR 5m - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: Node is down.", - description = "{{$labels.alias}} has been down for more than 5 minutes." - } - ALERT node_systemd_service_failed - IF node_systemd_unit_state{state="failed"} == 1 - FOR 4m - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.", - description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}." - } - ALERT node_filesystem_full_90percent - IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3 - FOR 5m - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: Filesystem is running out of space soon.", - description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem." - } - ALERT node_filesystem_full_in_4h - IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0 - FOR 5m - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.", - description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours" - } - ALERT node_filedescriptors_full_in_3h - IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum - FOR 20m - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.", - description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours" - } - ALERT node_load1_90percent - IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9 - FOR 1h - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: Running on high load.", - description = "{{$labels.alias}} is running with > 90% total load for at least 1h." - } - ALERT node_cpu_util_90percent - IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90 - FOR 1h - LABELS { - severity="page" - } - ANNOTATIONS { - summary = "{{$labels.alias}}: High CPU utilization.", - description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h." - } - ALERT node_ram_using_90percent - IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1 - FOR 30m - LABELS { - severity="page" - } - ANNOTATIONS { - summary="{{$labels.alias}}: Using lots of RAM.", - description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.", - } - ALERT node_swap_using_80percent - IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8 - FOR 10m - LABELS { - severity="page" - } - ANNOTATIONS { - summary="{{$labels.alias}}: Running out of swap soon.", - description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now." - } - '' - ]; - scrapeConfigs = [ - { - job_name = "node"; - scrape_interval = "10s"; - static_configs = [ - { - targets = [ - ] ++ map (host: "${host}:9100") (lib.attrNames (lib.filterAttrs (_: host: host.owner.name == "lass" && host.monitoring) config.krebs.hosts)); - #labels = { - # alias = "prometheus.example.com"; - #}; - } - ]; - } - ]; - alertmanager = { - enable = true; - listenAddress = "0.0.0.0"; - configuration = { - "global" = { - "smtp_smarthost" = "smtp.example.com:587"; - "smtp_from" = "alertmanager@example.com"; - }; - "route" = { - "group_by" = [ "alertname" "alias" ]; - "group_wait" = "30s"; - "group_interval" = "2m"; - "repeat_interval" = "4h"; - "receiver" = "team-admins"; - }; - "receivers" = [ - { - "name" = "team-admins"; - "email_configs" = [ - { - "to" = "devnull@example.com"; - } - ]; - "webhook_configs" = [ - { - "url" = "http://127.0.0.1:14813/prometheus-alerts"; - "send_resolved" = true; - } - ]; - } - ]; - }; - }; - }; - grafana = { - enable = true; - addr = "0.0.0.0"; - domain = "grafana.example.com"; - rootUrl = "https://grafana.example.com/"; - auth.anonymous.enable = true; - auth.anonymous.org_role = "Admin"; - }; - }; - services.logstash = { - enable = true; - inputConfig = '' - http { - port => 14813 - host => "127.0.0.1" - } - ''; - filterConfig = '' - if ([alerts]) { - ruby { - code => ' - lines = [] - event["alerts"].each {|p| - lines << "#{p["labels"]["instance"]}#{p["annotations"]["summary"]} #{p["status"]}" - } - event["output"] = lines.join("\n") - ' - } - } - ''; - outputConfig = '' - file { path => "/tmp/logs.json" codec => "json_lines" } - irc { - channels => [ "#noise" ] - host => "irc.r" - nick => "alarm" - codec => "json_lines" - format => "%{output}" - } - ''; - #plugins = [ ]; - }; -} |