From 726238d8717a03f6c80da31d4923632fed27e15a Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Tue, 1 Sep 2020 23:25:17 +0200
Subject: glados/wasser: giesszeit 10 -> 20

---
 krebs/2configs/shack/glados/multi/wasser.nix | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'krebs')

diff --git a/krebs/2configs/shack/glados/multi/wasser.nix b/krebs/2configs/shack/glados/multi/wasser.nix
index 6f3dc98a..e3ba2838 100644
--- a/krebs/2configs/shack/glados/multi/wasser.nix
+++ b/krebs/2configs/shack/glados/multi/wasser.nix
@@ -2,7 +2,7 @@
 #  switch.crafting_giesskanne_relay
 let
   glados = import ../lib;
-  seconds = 10;
+  seconds = 20;
   wasser = "switch.crafting_giesskanne_relay";
 in
 {
-- 
cgit v1.2.3


From 440d1cc6642a22f4155fa616c5647cbd2bfbfb77 Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Wed, 9 Sep 2020 00:32:10 +0200
Subject: shack/glados: re-enable influxdb output

---
 krebs/2configs/shack/glados/default.nix | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'krebs')

diff --git a/krebs/2configs/shack/glados/default.nix b/krebs/2configs/shack/glados/default.nix
index 9bf90bca..e4f03295 100644
--- a/krebs/2configs/shack/glados/default.nix
+++ b/krebs/2configs/shack/glados/default.nix
@@ -62,13 +62,14 @@ in {
         ];
       };
       # https://www.home-assistant.io/components/influxdb/
-      #influxdb = {
-      #  database = "hass";
-      #  tags = {
-      #    instance = "wolf";
-      #    source = "hass";
-      #  };
-      #};
+      influxdb = {
+        database = "glados";
+        host = "influx.shack";
+        tags = {
+          instance = "wolf";
+          source = "glados";
+        };
+      };
       esphome = {};
       api = {};
       mqtt = {
-- 
cgit v1.2.3


From 306653db954b41759fccfbd8b9d42ff712fe64f0 Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Sun, 13 Sep 2020 13:56:58 +0200
Subject: wolf.r: more relevant nginx data

---
 krebs/1systems/wolf/config.nix      | 16 ++++++++++++++++
 krebs/2configs/shack/influx.nix     |  5 +++++
 krebs/2configs/shack/node-light.nix |  3 +++
 3 files changed, 24 insertions(+)

(limited to 'krebs')

diff --git a/krebs/1systems/wolf/config.nix b/krebs/1systems/wolf/config.nix
index 029644ca..a9e8aafe 100644
--- a/krebs/1systems/wolf/config.nix
+++ b/krebs/1systems/wolf/config.nix
@@ -34,6 +34,22 @@ in
     # powerraw usb serial to mqtt and raw socket
     <stockholm/krebs/2configs/shack/powerraw.nix>
 
+    { # keep nginx logs out of on-disk log files: errors go to stderr, access logs to the local syslog socket; also defines the $loggable map that vhost configs reference via `access_log ... if=$loggable`
+      services.nginx.appendHttpConfig = ''
+          map $request_method $loggable {
+            default 1;
+            GET 0;
+          }
+          log_format vhost '$host $remote_addr - $remote_user '
+                     '[$time_local] "$request" $status '
+                     '$body_bytes_sent "$http_referer" '
+                     '"$http_user_agent"';
+          error_log stderr;
+          access_log syslog:server=unix:/dev/log vhost;
+      '';
+      services.journald.rateLimitBurst = 10000; # NOTE(review): presumably raised so bursts of nginx log lines are not dropped by journald rate limiting — confirm
+    }
+
     # create samba share for anonymous usage with the laser and 3d printer pc
     <stockholm/krebs/2configs/shack/share.nix>
 
diff --git a/krebs/2configs/shack/influx.nix b/krebs/2configs/shack/influx.nix
index 92cb24bf..93d83a59 100644
--- a/krebs/2configs/shack/influx.nix
+++ b/krebs/2configs/shack/influx.nix
@@ -8,6 +8,11 @@ in
   networking.firewall.allowedTCPPorts = [ port ]; # for legacy applications
   networking.firewall.allowedUDPPorts = [ collectd-port ];
   services.nginx.virtualHosts."influx.shack" = {
+    # Do not log GET requests; they arrive constantly and would flood the log.
+    # The $loggable map is defined in krebs/1systems/wolf/config.nix.
+    extraConfig = ''
+      access_log syslog:server=unix:/dev/log combined if=$loggable;
+    '';
     locations."/" = {
       proxyPass = "http://localhost:${toString port}/";
     };
diff --git a/krebs/2configs/shack/node-light.nix b/krebs/2configs/shack/node-light.nix
index b471f2af..4a981ea8 100644
--- a/krebs/2configs/shack/node-light.nix
+++ b/krebs/2configs/shack/node-light.nix
@@ -28,6 +28,9 @@ in {
   };
 
   services.nginx.virtualHosts."openhab.shack" = {
+    extraConfig = ''
+      access_log syslog:server=unix:/dev/log combined if=$loggable;
+    '';
     serverAliases = [ "lightapi.shack" ];
     locations."/power/".proxyPass = "http://localhost:${port}/power/";
     locations."/lounge/".proxyPass = "http://localhost:${port}/lounge/";
-- 
cgit v1.2.3


From eaf4ed0b24dcc89191d018fa4ebd9331e22ea324 Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Sun, 13 Sep 2020 23:52:22 +0200
Subject: shack/prometheus: activate alerting to telegram

---
 .../2configs/shack/prometheus/alertmanager-telegram.nix | 17 +++++++++++++++++
 krebs/2configs/shack/prometheus/server.nix              | 11 +++--------
 2 files changed, 20 insertions(+), 8 deletions(-)
 create mode 100644 krebs/2configs/shack/prometheus/alertmanager-telegram.nix

(limited to 'krebs')

diff --git a/krebs/2configs/shack/prometheus/alertmanager-telegram.nix b/krebs/2configs/shack/prometheus/alertmanager-telegram.nix
new file mode 100644
index 00000000..9d0ef45e
--- /dev/null
+++ b/krebs/2configs/shack/prometheus/alertmanager-telegram.nix
@@ -0,0 +1,17 @@
+{ pkgs, ...}: { # NixOS module: runs alertmanager-bot (Telegram bridge for Alertmanager) as a systemd service
+  systemd.services.alertmanager-bot-telegram = {
+    wantedBy = [ "multi-user.target" ];
+    wants = [ "network-online.target" ]; # pull the target in explicitly so the ordering below is effective
+    after = [ "network-online.target" ]; # was ip-up.target, which NixOS no longer provides, so the ordering had no effect
+    serviceConfig = {
+      EnvironmentFile = toString <secrets/shack/telegram_bot.env>; # NOTE(review): presumably carries the bot credentials — confirm against the secrets repo
+      DynamicUser = true;
+      StateDirectory = "alertbot"; # systemd provides /var/lib/alertbot, which --bolt.path below points into
+      ExecStart = ''${pkgs.alertmanager-bot-telegram}/bin/alertmanager-bot \
+        --alertmanager.url=http://alert.prometheus.shack --log.level=info \
+        --store=bolt --bolt.path=/var/lib/alertbot/bot.db \
+        --listen.addr="0.0.0.0:16320" \
+        --template.paths=${pkgs.alertmanager-bot-telegram}/templates/default.tmpl'';
+    };
+  };
+}
diff --git a/krebs/2configs/shack/prometheus/server.nix b/krebs/2configs/shack/prometheus/server.nix
index c088a3b0..8f37f447 100644
--- a/krebs/2configs/shack/prometheus/server.nix
+++ b/krebs/2configs/shack/prometheus/server.nix
@@ -118,7 +118,7 @@
       ];
       alertmanager = {
         enable = true;
-        listenAddress = "0.0.0.0";
+        listenAddress = "127.0.0.1";
         configuration = {
           "global" = {
             "smtp_smarthost" = "smtp.example.com:587";
@@ -134,15 +134,10 @@
           "receivers" = [
             {
               "name" = "team-admins";
-              "email_configs" = [
-                {
-                  "to" = "devnull@example.com";
-                  "send_resolved" = true;
-                }
-              ];
+              "email_configs" = [ ];
               "webhook_configs" = [
                 {
-                  "url" = "https://example.com/prometheus-alerts";
+                  "url" = "http://localhost:8080";
                   "send_resolved" = true;
                 }
               ];
-- 
cgit v1.2.3


From 156339f63adcddcd8b1eb6d17bd4f76f72086920 Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Mon, 14 Sep 2020 00:26:21 +0200
Subject: shack/prometheus: alertmanager-bot on a different port now

---
 krebs/2configs/shack/prometheus/alert-rules.nix | 5 -----
 krebs/2configs/shack/prometheus/server.nix      | 5 ++++-
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'krebs')

diff --git a/krebs/2configs/shack/prometheus/alert-rules.nix b/krebs/2configs/shack/prometheus/alert-rules.nix
index 096c551b..afef5b8a 100644
--- a/krebs/2configs/shack/prometheus/alert-rules.nix
+++ b/krebs/2configs/shack/prometheus/alert-rules.nix
@@ -71,11 +71,6 @@ in mapAttrsToList (name: opts: {
     summary = "{{$labels.alias}}: Using more than 30% of its swap.";
     description = "{{$labels.alias}} is using 30% of its swap space for at least 30 minutes.";
   };
-  node_visible_confluence_space = {
-    condition = "node_visible_confluence_space != 0";
-    summary = "crowd prometheus cann see the {{$labels.space_name}} confluence space!";
-    description = "crowd user `prometheus` can see the `{{$labels.space_name}}` confluence space.";
-  };
   node_hwmon_temp = {
     condition = "node_hwmon_temp_celsius > node_hwmon_temp_crit_celsius*0.9 OR node_hwmon_temp_celsius > node_hwmon_temp_max_celsius*0.95";
     time = "5m";
diff --git a/krebs/2configs/shack/prometheus/server.nix b/krebs/2configs/shack/prometheus/server.nix
index 8f37f447..a10a43af 100644
--- a/krebs/2configs/shack/prometheus/server.nix
+++ b/krebs/2configs/shack/prometheus/server.nix
@@ -119,6 +119,9 @@
       alertmanager = {
         enable = true;
         listenAddress = "127.0.0.1";
+        webExternalUrl = "http://alert.prometheus.shack";
+        logLevel = "debug";
+
         configuration = {
           "global" = {
             "smtp_smarthost" = "smtp.example.com:587";
@@ -137,7 +140,7 @@
               "email_configs" = [ ];
               "webhook_configs" = [
                 {
-                  "url" = "http://localhost:8080";
+                  "url" = "http://localhost:16320";
                   "send_resolved" = true;
                 }
               ];
-- 
cgit v1.2.3


From fc836fc0cb2aa2a9e65b2314ab361788d8e73186 Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Mon, 14 Sep 2020 11:11:58 +0200
Subject: shack/prometheus: strip down number of alerts to 3

---
 krebs/2configs/shack/prometheus/alert-rules.nix | 135 +++++++-----------------
 krebs/2configs/shack/prometheus/server.nix      |   9 +-
 2 files changed, 43 insertions(+), 101 deletions(-)

(limited to 'krebs')

diff --git a/krebs/2configs/shack/prometheus/alert-rules.nix b/krebs/2configs/shack/prometheus/alert-rules.nix
index afef5b8a..730921be 100644
--- a/krebs/2configs/shack/prometheus/alert-rules.nix
+++ b/krebs/2configs/shack/prometheus/alert-rules.nix
@@ -1,97 +1,42 @@
-{ lib }:
-with lib;
-
+{ lib,... }:
 let
-  deviceFilter = ''device!="ramfs",device!="rpc_pipefs",device!="lxcfs",device!="nsfs",device!="borgfs"'';
-in mapAttrsToList (name: opts: {
-  alert = name;
-  expr = opts.condition;
-  for = opts.time or "2m";
-  labels = if (opts.page or true) then { severity = "page"; } else {};
-  annotations = {
-    summary = opts.summary;
-    description = opts.description;
-  };
-}) {
-  node_down = {
-    condition = ''up{job="node"} == 0'';
-    summary = "{{$labels.alias}}: Node is down.";
-    description = "{{$labels.alias}} has been down for more than 2 minutes.";
-  };
-  node_systemd_service_failed = {
-    condition = ''node_systemd_unit_state{state="failed"} == 1'';
-    summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.";
-    description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}.";
-  };
-  node_filesystem_full_80percent = {
-    condition = ''sort(node_filesystem_free_bytes{${deviceFilter}} < node_filesystem_size_bytes{${deviceFilter}} * 0.2) / 1024^3'';
-    time = "10m";
-    summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
-    description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 20% space left on its filesystem.";
-  };
-  node_filesystem_full_in_7d = {
-    condition = ''predict_linear(node_filesystem_free_bytes{${deviceFilter}}[2d], 7*24*3600) <= 0'';
-    time = "1h";
-    summary = "{{$labels.alias}}: Filesystem is running out of space in 7 days.";
-    description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 7 days";
-  };
-  node_filesystem_full_in_30d = {
-    condition = ''predict_linear(node_filesystem_free_bytes{${deviceFilter}}[30d], 30*24*3600) <= 0'';
-    time = "1h";
-    summary = "{{$labels.alias}}: Filesystem is running out of space in 30 days.";
-    description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 30 days";
-  };
-  node_filedescriptors_full_in_3h = {
-    condition = ''predict_linear(node_filefd_allocated[3h], 3*3600) >= node_filefd_maximum'';
-    time = "20m";
-    summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.";
-    description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours";
-  };
-  node_filedescriptors_full_in_7d = {
-    condition = ''predict_linear(node_filefd_allocated[7d], 7*24*3600) >= node_filefd_maximum'';
-    time = "1h";
-    summary = "{{$labels.alias}} is running out of available file descriptors in 7 days.";
-    description = "{{$labels.alias}} is running out of available file descriptors in approx. 7 days";
-  };
-  node_load15 = {
-    condition = ''node_load15 / on(alias) count(node_cpu_seconds_total{mode="system"}) by (alias) >= 1.0'';
-    time = "10m";
-    summary = "{{$labels.alias}}: Running on high load: {{$value}}";
-    description = "{{$labels.alias}} is running with load15 > 1 for at least 5 minutes: {{$value}}";
-  };
-  node_ram_using_90percent = {
-    condition =  "node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1";
-    time = "1h";
-    summary = "{{$labels.alias}}: Using lots of RAM.";
-    description = "{{$labels.alias}} is using at least 90% of its RAM for at least 1 hour.";
-  };
-  node_swap_using_30percent = {
-    condition = "node_memory_SwapTotal_bytes - (node_memory_SwapFree_bytes + node_memory_SwapCached_bytes) > node_memory_SwapTotal_bytes * 0.3";
-    time = "30m";
-    summary = "{{$labels.alias}}: Using more than 30% of its swap.";
-    description = "{{$labels.alias}} is using 30% of its swap space for at least 30 minutes.";
-  };
-  node_hwmon_temp = {
-    condition = "node_hwmon_temp_celsius > node_hwmon_temp_crit_celsius*0.9 OR node_hwmon_temp_celsius > node_hwmon_temp_max_celsius*0.95";
-    time = "5m";
-    summary = "{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}} ";
-    description = "{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}";
-  };
-  node_conntrack_limit = {
-    condition  = "node_nf_conntrack_entries_limit - node_nf_conntrack_entries < 1000";
-    time = "5m";
-    summary = "{{$labels.alias}}: Number of tracked connections high";
-    description = "{{$labels.alias}} has only {{$value}} free slots for connection tracking available.";
-  };
-  node_reboot = {
-    condition = "time() - node_boot_time_seconds < 300";
-    summary = "{{$labels.alias}}: Reboot";
-    description = "{{$labels.alias}} just rebooted.";
-  };
-  node_uptime = {
-    condition = "time() - node_boot_time_seconds > 2592000";
-    page = false;
-    summary = "{{$labels.alias}}: Uptime monster";
-    description = "{{$labels.alias}} has been up for more than 30 days.";
-  };
+  disk_free_threshold = "10"; # at least this much free disk percentage; kept as a string because it is spliced into the PromQL expressions and alert texts below
+in {
+  services.prometheus.rules = [(builtins.toJSON # the rule file content is handed over as one JSON-encoded string
+    {
+      groups = [
+        { name = "shack-env";
+          rules = [
+            { # root filesystem of wolf is below the free-space threshold
+              alert = "RootPartitionFull";
+              for = "30m";
+              expr = ''(node_filesystem_avail_bytes{alias="wolf",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf",mountpoint="/"} < ${disk_free_threshold}'';
+              labels.severity = "warning";
+              annotations.summary = "{{ $labels.alias }} root disk full";
+              annotations.url = "http://grafana.shack/";
+              annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). A vast number of shackspace services will stop working. CI for deploying new configuration will also cease working. Log in to the system and run `nix-collect-garbage -d` and clean up the shack share folder in `/home/share`. If this does not help you can check `du -hs /var/* | sort -h`, run `docker system prune` or if you are really desperate run `du -hs /* | sort -h` and go through the folders recursively until you've found something to delete'';
+            }
+            { # same check for puyak; same alert name, different host and description
+              alert = "RootPartitionFull";
+              for = "30m";
+              expr = ''(node_filesystem_avail_bytes{alias="puyak",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="puyak",mountpoint="/"} < ${disk_free_threshold}'';
+              labels.severity = "warning";
+              annotations.summary = "{{ $labels.alias }} root disk full";
+              annotations.url = "http://grafana.shack/";
+              annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). Prometheus will not be able to create new alerts and CI for deploying new configuration will also cease working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/* | sort -h`, run `docker system prune` or if you are really desperate run `du -hs /* | sort -h` and go through the folders recursively until you've found something to delete'';
+            }
+            { # wolf has not been scrapeable for 5 minutes
+              alert = "HostDown";
+              expr = ''up{alias="wolf"} == 0'';
+              for = "5m";
+              labels.severity = "page";
+              annotations.summary = "Instance {{ $labels.alias }} down for 5 minutes";
+              annotations.url = "http://grafana.shack/";
+              annotations.description = ''Host {{ $labels.alias }} went down and has not been reconnected after 5 minutes. This is probably bad news, try to restart the host via naproxen ( http://naproxen.shack:8006 ). Wolf being down means that CI,glados automation, light management and a couple of other services will not work anymore.'';
+            }
+          ];
+        }
+      ];
+    }
+  )];
 }
diff --git a/krebs/2configs/shack/prometheus/server.nix b/krebs/2configs/shack/prometheus/server.nix
index a10a43af..9e4b4d1a 100644
--- a/krebs/2configs/shack/prometheus/server.nix
+++ b/krebs/2configs/shack/prometheus/server.nix
@@ -1,6 +1,9 @@
 { pkgs, lib, config, ... }:
 # from https://gist.github.com/globin/02496fd10a96a36f092a8e7ea0e6c7dd
 {
+  imports = [
+    ./alert-rules.nix
+  ];
   networking = {
     firewall.allowedTCPPorts = [
       9090  # prometheus
@@ -18,12 +21,6 @@
     };
     prometheus = {
       enable = true;
-      ruleFiles = lib.singleton (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
-            groups = lib.singleton {
-              name = "mf-alerting-rules";
-              rules = import ./alert-rules.nix { inherit lib; };
-            };
-          }));
       scrapeConfigs = [
         {
           job_name = "node";
-- 
cgit v1.2.3


From 57e6296ac4ca6b600f9e57b2c347f413096d09ed Mon Sep 17 00:00:00 2001
From: makefu <github@syntax-fehler.de>
Date: Mon, 14 Sep 2020 11:13:03 +0200
Subject: puyak.r: enable alertmanager-bot

---
 krebs/1systems/puyak/config.nix | 1 +
 1 file changed, 1 insertion(+)

(limited to 'krebs')

diff --git a/krebs/1systems/puyak/config.nix b/krebs/1systems/puyak/config.nix
index a50d2eab..8b30dfef 100644
--- a/krebs/1systems/puyak/config.nix
+++ b/krebs/1systems/puyak/config.nix
@@ -18,6 +18,7 @@
     <stockholm/krebs/2configs/shack/prometheus/server.nix>
     <stockholm/krebs/2configs/shack/prometheus/blackbox.nix>
     <stockholm/krebs/2configs/shack/prometheus/unifi.nix>
+    <stockholm/krebs/2configs/shack/prometheus/alertmanager-telegram.nix>
     <stockholm/krebs/2configs/shack/gitlab-runner.nix>
 
     ## Collect local statistics via collectd and send to collectd
-- 
cgit v1.2.3