diff options
Diffstat (limited to 'krebs/2configs/shack/prometheus')
-rw-r--r-- | krebs/2configs/shack/prometheus/alert-rules.nix | 21 | ||||
-rw-r--r-- | krebs/2configs/shack/prometheus/irc-alerts.py | 207 | ||||
-rw-r--r-- | krebs/2configs/shack/prometheus/irc-hooks.nix | 59 | ||||
-rw-r--r-- | krebs/2configs/shack/prometheus/server.nix | 5 |
4 files changed, 271 insertions, 21 deletions
diff --git a/krebs/2configs/shack/prometheus/alert-rules.nix b/krebs/2configs/shack/prometheus/alert-rules.nix index 5ba49ede6..4cefdc3e5 100644 --- a/krebs/2configs/shack/prometheus/alert-rules.nix +++ b/krebs/2configs/shack/prometheus/alert-rules.nix @@ -1,6 +1,6 @@ { lib,... }: let - disk_free_threshold = "10"; # at least this much free disk percentage + disk_free_threshold = "5"; # at least this much free disk percentage in { services.prometheus.rules = [(builtins.toJSON { @@ -8,22 +8,6 @@ in { { name = "shack-env"; rules = [ { - alert = "Wolf RootPartitionFull"; - for = "30m"; - expr = ''(node_filesystem_avail_bytes{alias="wolf.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="wolf.shack",mountpoint="/"} < ${disk_free_threshold}''; - labels.severity = "warning"; - annotations.summary = "{{ $labels.alias }} root disk full"; - annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=wolf"; - annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%). CI for deploying new configuration will seize working. Log in to the system and try to clean up the obsolete files on the machine. There are a couple of things you can do: -1. `nix-collect-garbage -d` -2. clean up the shack share folder in `/home/share` -3. check `du -hs /var/ | sort -h`. -4. run `docker system prune` -5. `find /var/lib/containers/news/var/lib/htgen-go/items -mtime +7 -delete;` to clean up the link shortener data -5. If you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete -6. as a last resort the root disk can be expanded via `lvresize -L +10G /dev/pool/root && btrfs filesystem resize max /` ''; - } - { alert = "Puyak RootPartitionFull"; for = "30m"; expr = ''(node_filesystem_avail_bytes{alias="puyak.shack",mountpoint="/"} * 100) / node_filesystem_size_bytes{alias="puyak.shack",mountpoint="/"} < ${disk_free_threshold}''; @@ -32,9 +16,8 @@ in { annotations.url = "http://grafana.shack/d/hb7fSE0Zz/shack-system-dashboard?orgId=1&var-job=node&var-hostname=All&var-node=wolf.shack:9100&var-device=All&var-maxmount=%2F&var-show_hostname=puyak"; annotations.description = ''The root disk of {{ $labels.alias }} has {{ $value | printf "%.2f" }}% free disk space (Threshold at ${disk_free_threshold}%).Prometheus will not be able to create new alerts and CI for deploying new configuration will also seize working. Log in to the system and run `nix-collect-garbage -d` and if this does not help you can check `du -hs /var/ | sort -h`, run `docker system prune` or if you are really desperate run `du -hs / | sort -h` and go through the folders recursively until you've found something to delete''; } - # wolf.shack is not worth supervising anymore { - alert = "HostDown"; + alert = "Infra01 down"; expr = ''up{alias="infra01.shack"} == 0''; for = "5m"; labels.severity = "page"; diff --git a/krebs/2configs/shack/prometheus/irc-alerts.py b/krebs/2configs/shack/prometheus/irc-alerts.py new file mode 100644 index 000000000..005a2013b --- /dev/null +++ b/krebs/2configs/shack/prometheus/irc-alerts.py @@ -0,0 +1,207 @@ +import base64 +import cgi +import json +import os +import re +import socket +import ssl +import sys +from http.server import BaseHTTPRequestHandler +from typing import List, Optional, Tuple +from urllib.parse import urlparse + +DEBUG = os.environ.get("DEBUG") is not None + + +def _irc_send( + server: str, + nick: str, + channel: str, + sasl_password: Optional[str] = None, + server_password: Optional[str] = None, + tls: bool = True, + port: int = 6697, + messages: List[str] = [], +) -> None: + if not messages: + return + + sock = socket.socket() + if tls: + sock = ssl.wrap_socket( + sock, cert_reqs=ssl.CERT_NONE, ssl_version=ssl.PROTOCOL_TLSv1_2 + ) + + def _send(command: str) -> int: + if DEBUG: + print(command) + return sock.send((f"{command}\r\n").encode()) + + def _pong(ping: str): + if ping.startswith("PING"): + sock.send(ping.replace("PING", "PONG").encode("ascii")) + + recv_file = sock.makefile(mode="r") + + print(f"connect {server}:{port}") + sock.connect((server, port)) + if server_password: + _send(f"PASS {server_password}") + _send(f"USER {nick} 0 * :{nick}") + _send(f"NICK {nick}") + for line in recv_file.readline(): + if re.match(r"^:[^ ]* (MODE|221|376|422) ", line): + break + else: + _pong(line) + + if sasl_password: + _send("CAP REQ :sasl") + _send("AUTHENTICATE PLAIN") + auth = base64.encodebytes(f"{nick}\0{nick}\0{sasl_password}".encode("utf-8")) + _send(f"AUTHENTICATE {auth.decode('ascii')}") + _send("CAP END") + _send(f"JOIN :{channel}") + + for m in messages: + _send(f"PRIVMSG {channel} :{m}") + + _send("INFO") + for line in recv_file: + if DEBUG: + print(line, end="") + # Assume INFO reply means we are done + if "End of /INFO" in line: + break + else: + _pong(line) + + sock.send(b"QUIT") + print("disconnect") + sock.close() + + +def irc_send( + url: str, notifications: List[str], password: Optional[str] = None +) -> None: + parsed = urlparse(f"{url}") + username = parsed.username or "prometheus" + server = parsed.hostname or "chat.freenode.net" + if parsed.fragment != "": + channel = f"#{parsed.fragment}" + else: + channel = "#krebs-announce" + port = parsed.port or 6697 + if not password: + password = parsed.password + if len(notifications) == 0: + return + _irc_send( + server=server, + nick=username, + sasl_password=password, + channel=channel, + port=port, + messages=notifications, + tls=parsed.scheme == "irc+tls", + ) + + +class PrometheusWebHook(BaseHTTPRequestHandler): + def __init__( + self, + irc_url: str, + conn: socket.socket, + addr: Tuple[str, int], + password: Optional[str] = None, + ) -> None: + self.irc_url = irc_url + self.password = password + self.rfile = conn.makefile("rb") + self.wfile = conn.makefile("wb") + self.client_address = addr + self.handle() + + # for testing + def do_GET(self) -> None: + if DEBUG: + print("GET: Request Received") + self.send_response(200) + self.send_header("Content-type", "text/plain") + self.end_headers() + self.wfile.write(b"ok") + + def do_POST(self) -> None: + if DEBUG: + print("POST: Request Received") + content_type, _ = cgi.parse_header(self.headers.get("content-type")) + + # refuse to receive non-json content + if content_type != "application/json": + if DEBUG: + print(f"POST: wrong content type {content_type}") + self.send_response(400) + self.end_headers() + return + + length = int(self.headers.get("content-length")) + payload = json.loads(self.rfile.read(length)) + messages = [] + for alert in payload["alerts"]: + description = alert["annotations"]["description"] + messages.append(f"{alert['status']}: {description}") + irc_send(self.irc_url, messages, password=self.password) + + self.do_GET() + + +def systemd_socket_response() -> None: + irc_url = os.environ.get("IRC_URL", None) + if irc_url is None: + print( + "IRC_URL environment variable not set: i.e. IRC_URL=irc+tls://mic92-prometheus@chat.freenode.net/#krebs-announce", + file=sys.stderr, + ) + sys.exit(1) + + password = None + irc_password_file = os.environ.get("IRC_PASSWORD_FILE", None) + if irc_password_file: + with open(irc_password_file) as f: + password = f.read() + + msgs = sys.argv[1:] + + if msgs != []: + irc_send(irc_url, msgs, password=password) + return + + nfds = os.environ.get("LISTEN_FDS", None) + if nfds is None: + print( + "LISTEN_FDS not set. Run me with systemd(TM) socket activation?", + file=sys.stderr, + ) + sys.exit(1) + fds = range(3, 3 + int(nfds)) + + for fd in fds: + sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(0) + + try: + while True: + PrometheusWebHook(irc_url, *sock.accept(), password=password) + except BlockingIOError: + # no more connections + pass + + +if __name__ == "__main__": + if DEBUG: + print("Starting in DEBUG mode") + if len(sys.argv) == 3: + print(f"{sys.argv[1]} {sys.argv[2]}") + irc_send(sys.argv[1], [sys.argv[2]]) + else: + systemd_socket_response() diff --git a/krebs/2configs/shack/prometheus/irc-hooks.nix b/krebs/2configs/shack/prometheus/irc-hooks.nix new file mode 100644 index 000000000..07bb2423b --- /dev/null +++ b/krebs/2configs/shack/prometheus/irc-hooks.nix @@ -0,0 +1,59 @@ +{ config +, lib +, pkgs +, ... +}: +let + irc-alerts = pkgs.writers.writePython3 "irc-alerts" { + flakeIgnore = [ "E501" ]; + } (builtins.readFile ./irc-alerts.py); + endpoints = { + binaergewitter = { + url = "irc+tls://puyak-alerts@irc.libera.chat:6697/#binaergewitter-alerts"; + port = 9223; + }; + }; +in +{ + systemd.sockets = + lib.mapAttrs' + (name: opts: + lib.nameValuePair "irc-alerts-${name}" { + description = "Receive http hook and send irc message for ${name}"; + wantedBy = [ "sockets.target" ]; + listenStreams = [ "[::]:${builtins.toString opts.port}" ]; + }) endpoints; + + systemd.services = + lib.mapAttrs' + (name: opts: + let + serviceName = "irc-alerts-${name}"; + hasPassword = opts.passwordFile or null != null; + in + lib.nameValuePair serviceName { + description = "Receive http hook and send irc message for ${name}"; + requires = [ "irc-alerts-${name}.socket" ]; + serviceConfig = + { + Environment = + [ + "IRC_URL=${opts.url}" + "DEBUG=y" + ] + ++ lib.optional hasPassword "IRC_PASSWORD_FILE=/run/${serviceName}/password"; + DynamicUser = true; + User = serviceName; + ExecStart = irc-alerts; + } + // lib.optionalAttrs hasPassword { + PermissionsStartOnly = true; + ExecStartPre = + "${pkgs.coreutils}/bin/install -m400 " + + "-o ${serviceName} -g ${serviceName} " + + "${config.sops.secrets.prometheus-irc-password.path} " + + "/run/${serviceName}/password"; + RuntimeDirectory = serviceName; + }; + }) endpoints; +} diff --git a/krebs/2configs/shack/prometheus/server.nix b/krebs/2configs/shack/prometheus/server.nix index 9e4b4d1a7..7a5532027 100644 --- a/krebs/2configs/shack/prometheus/server.nix +++ b/krebs/2configs/shack/prometheus/server.nix @@ -3,6 +3,7 @@ { imports = [ ./alert-rules.nix + ./irc-hooks.nix ]; networking = { firewall.allowedTCPPorts = [ @@ -129,11 +130,11 @@ "group_wait" = "30s"; "group_interval" = "2m"; "repeat_interval" = "4h"; - "receiver" = "team-admins"; + "receiver" = "shack-admins"; }; "receivers" = [ { - "name" = "team-admins"; + "name" = "shack-admins"; "email_configs" = [ ]; "webhook_configs" = [ { |