{ config, lib, pkgs, ... }: let cfg = config.krebs.sync-containers3; slib = pkgs.stockholm.lib; in { options.krebs.sync-containers3 = { inContainer = { enable = lib.mkEnableOption "container config for syncing"; pubkey = lib.mkOption { type = lib.types.str; # TODO ssh key }; }; containers = lib.mkOption { default = {}; type = lib.types.attrsOf (lib.types.submodule ({ config, ... }: { options = { name = lib.mkOption { type = lib.types.str; default = config._module.args.name; }; sshKey = lib.mkOption { type = slib.types.absolute-pathname; }; luksKey = lib.mkOption { type = slib.types.absolute-pathname; default = config.sshKey; }; ephemeral = lib.mkOption { type = lib.types.bool; default = false; }; runContainer = lib.mkOption { type = lib.types.bool; default = true; }; }; })); }; }; config = lib.mkMerge [ (lib.mkIf (cfg.containers != {}) { containers = lib.mapAttrs' (n: ctr: lib.nameValuePair ctr.name { config = { environment.systemPackages = [ pkgs.dhcpcd pkgs.git pkgs.jq ]; networking.useDHCP = lib.mkForce true; systemd.services.autoswitch = { environment = { NIX_REMOTE = "daemon"; }; wantedBy = [ "multi-user.target" ]; serviceConfig.ExecStart = pkgs.writers.writeDash "autoswitch" '' set -efu mkdir -p /var/state/var_src ln -Tfrs /var/state/var_src /var/src if test -e /var/src/nixos-config; then /run/current-system/sw/bin/nixos-rebuild -I /var/src switch || : fi ''; unitConfig.X-StopOnRemoval = false; }; }; autoStart = false; enableTun = true; ephemeral = ctr.ephemeral; privateNetwork = true; hostBridge = "ctr0"; bindMounts = { "/var/lib/self/disk" = { hostPath = "/var/lib/sync-containers3/${ctr.name}/disk"; isReadOnly = false; }; "/var/state" = { hostPath = "/var/lib/sync-containers3/${ctr.name}/state"; isReadOnly = false; }; }; }) (lib.filterAttrs (_: ctr: ctr.runContainer) cfg.containers); systemd.services = lib.foldr lib.recursiveUpdate {} (lib.flatten (map (ctr: [ { "${ctr.name}_syncer" = { path = with pkgs; [ coreutils consul rsync openssh systemd ]; startAt = "*:0/1"; serviceConfig = { User = "${ctr.name}_container"; LoadCredential = [ "ssh_key:${ctr.sshKey}" ]; ExecCondition = pkgs.writers.writeDash "${ctr.name}_checker" '' set -efu ! systemctl is-active --quiet container@${ctr.name}.service ''; ExecStart = pkgs.writers.writeDash "${ctr.name}_syncer" '' set -efux consul lock sync_${ctr.name} ${pkgs.writers.writeDash "${ctr.name}-sync" '' set -efux if /run/wrappers/bin/ping -c 1 ${ctr.name}.r; then nice --adjustment=30 rsync -a -e "ssh -i $CREDENTIALS_DIRECTORY/ssh_key" --timeout=30 container_sync@${ctr.name}.r:disk "$HOME"/disk rm -f "$HOME"/incomplete fi ''} ''; }; }; } { "${ctr.name}_watcher" = lib.mkIf ctr.runContainer { path = with pkgs; [ coreutils consul cryptsetup curl mount util-linux jq retry ]; serviceConfig = { ExecStart = pkgs.writers.writeDash "${ctr.name}_watcher" '' set -efux while sleep 5; do # get the payload # check if the host reacted recently case $(curl -s -o /dev/null --retry 10 --retry-delay 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in 404) echo 'got 404 from kv, should kill the container' break ;; 500) echo 'got 500 from kv, will kill container' break ;; 200) # echo 'got 200 from kv, will check payload' payload=$(consul kv get containers/${ctr.name}) || continue export payload if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then # echo 'we are the host, trying to reach container' if $(retry -t 10 -d 10 -- /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null); then # echo 'container is reachable, continueing' continue else # echo 'container seems dead, killing' break fi else echo 'we are not host, killing container' break fi ;; *) echo 'unknown state, continuing' continue ;; esac done /run/current-system/sw/bin/nixos-container stop ${ctr.name} || : umount /var/lib/sync-containers3/${ctr.name}/state || : cryptsetup luksClose ${ctr.name} || : ''; }; }; } { "${ctr.name}_scheduler" = lib.mkIf ctr.runContainer { wantedBy = [ "multi-user.target" ]; path = with pkgs; [ coreutils consul cryptsetup mount util-linux curl systemd jq retry bc ]; serviceConfig = { Restart = "always"; RestartSec = "30s"; ExecStart = pkgs.writers.writeDash "${ctr.name}_scheduler" '' set -efux # get the payload # check if the host reacted recently case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in 404) # echo 'got 404 from kv, will create container' ;; 500) # echo 'got 500 from kv, retrying again' exit 0 ;; 200) # echo 'got 200 from kv, will check payload' export payload=$(consul kv get containers/${ctr.name}) if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then echo 'we are the host, starting container' else # echo 'we are not host, checking timestamp' # if [ $(echo "$(date +%s) - $(jq -rn 'env.payload | fromjson.time') > 100" | bc) -eq 1 ]; then if [ "$(jq -rn 'env.payload | fromjson.time | now - tonumber > 100')" = 'true' ]; then echo 'last beacon is more than 100s ago, taking over' else # echo 'last beacon was recent. trying again' exit 0 fi fi ;; *) echo 'unknown state, bailing out' exit 0 ;; esac consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null consul lock -verbose -monitor-retry=100 -timeout 30s -name container_${ctr.name} container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" '' set -efu cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} || : mkdir -p /var/lib/sync-containers3/${ctr.name}/state mountpoint /var/lib/sync-containers3/${ctr.name}/state || mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state /run/current-system/sw/bin/nixos-container start ${ctr.name} # wait for system to become reachable for the first time systemctl start ${ctr.name}_watcher.service retry -t 10 -d 10 -- /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null while systemctl is-active container@${ctr.name}.service >/devnull && /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r >/dev/null; do consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null sleep 10 done ''} ''; }; }; } { "container@${ctr.name}" = lib.mkIf ctr.runContainer { serviceConfig = { ExecStop = pkgs.writers.writeDash "remove_interface" '' ${pkgs.iproute2}/bin/ip link del vb-${ctr.name} ''; }; }; } ]) (lib.attrValues cfg.containers))); systemd.timers = lib.mapAttrs' (n: ctr: lib.nameValuePair "${ctr.name}_syncer" { timerConfig = { RandomizedDelaySec = 100; }; }) cfg.containers; users.groups = lib.mapAttrs' (_: ctr: lib.nameValuePair "${ctr.name}_container" { }) cfg.containers; users.users = lib.mapAttrs' (_: ctr: lib.nameValuePair "${ctr.name}_container" ({ group = "${ctr.name}_container"; isNormalUser = true; uid = slib.genid_uint31 "container_${ctr.name}"; home = "/var/lib/sync-containers3/${ctr.name}"; createHome = true; homeMode = "705"; })) cfg.containers; environment.systemPackages = lib.mapAttrsToList (_: ctr: (pkgs.writers.writeDashBin "${ctr.name}_init" '' set -efux export PATH=${lib.makeBinPath [ pkgs.coreutils pkgs.cryptsetup pkgs.libxfs.bin ]}:$PATH truncate -s 5G /var/lib/sync-containers3/${ctr.name}/disk cryptsetup luksFormat /var/lib/sync-containers3/${ctr.name}/disk ${ctr.luksKey} cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} mkfs.xfs /dev/mapper/${ctr.name} mkdir -p /var/lib/sync-containers3/${ctr.name}/state mountpoint /var/lib/sync-containers3/${ctr.name}/state || mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state /run/current-system/sw/bin/nixos-container start ${ctr.name} /run/current-system/sw/bin/nixos-container run ${ctr.name} -- ${pkgs.writeDash "init" '' mkdir -p /var/state ''} '')) cfg.containers; }) (lib.mkIf (cfg.containers != {}) { # networking # needed because otherwise we lose local dns environment.etc."resolv.conf".source = lib.mkForce "/run/systemd/resolve/resolv.conf"; boot.kernel.sysctl."net.ipv4.ip_forward" = lib.mkForce 1; systemd.network.networks.ctr0 = { name = "ctr0"; address = [ "10.233.0.1/24" ]; networkConfig = { # IPForward = "yes"; # IPMasquerade = "both"; ConfigureWithoutCarrier = true; DHCPServer = "yes"; }; }; systemd.network.netdevs.ctr0.netdevConfig = { Kind = "bridge"; Name = "ctr0"; }; networking.networkmanager.unmanaged = [ "ctr0" ]; krebs.iptables.tables.filter.INPUT.rules = [ { predicate = "-i ctr0"; target = "ACCEPT"; } ]; krebs.iptables.tables.filter.FORWARD.rules = [ { predicate = "-i ctr0"; target = "ACCEPT"; } { predicate = "-o ctr0"; target = "ACCEPT"; } ]; krebs.iptables.tables.nat.POSTROUTING.rules = [ { v6 = false; predicate = "-s 10.233.0.0/24"; target = "MASQUERADE"; } ]; }) (lib.mkIf cfg.inContainer.enable { users.groups.container_sync = {}; users.users.container_sync = { group = "container_sync"; uid = slib.genid_uint31 "container_sync"; isNormalUser = true; home = "/var/lib/self"; createHome = true; openssh.authorizedKeys.keys = [ cfg.inContainer.pubkey ]; }; networking.useHostResolvConf = false; networking.useNetworkd = true; systemd.network = { enable = true; networks.eth0 = { matchConfig.Name = "eth0"; DHCP = "yes"; dhcpV4Config.UseDNS = true; }; }; }) ]; }