nixpkgs-immich/nixos/tests/pacemaker.nix
2024-09-17 10:23:40 -06:00

111 lines
3.5 KiB
Nix

import ./make-test-python.nix ({ pkgs, lib, ... }: rec {
name = "pacemaker";
meta = with pkgs.lib.maintainers; {
maintainers = [ astro ];
};
nodes =
let
node = i: {
networking.interfaces.eth1.ipv4.addresses = [ {
address = "192.168.0.${toString i}";
prefixLength = 24;
} ];
services.corosync = {
enable = true;
clusterName = "zentralwerk-network";
nodelist = lib.imap (i: name: {
nodeid = i;
inherit name;
ring_addrs = [
(builtins.head nodes.${name}.networking.interfaces.eth1.ipv4.addresses).address
];
}) (builtins.attrNames nodes);
};
environment.etc."corosync/authkey" = {
source = builtins.toFile "authkey"
# minimum length: 128 bytes
"testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
mode = "0400";
};
services.pacemaker.enable = true;
# used for pacemaker resource
systemd.services.ha-cat = {
description = "Highly available netcat";
serviceConfig.ExecStart = "${pkgs.netcat}/bin/nc -l discard";
};
};
in {
node1 = node 1;
node2 = node 2;
node3 = node 3;
};
# sets up pacemaker with resources configuration, then crashes a
# node and waits for service restart on another node
testScript =
let
resources = builtins.toFile "cib-resources.xml" ''
<resources>
<primitive id="cat" class="systemd" type="ha-cat">
<operations>
<op id="stop-cat" name="start" interval="0" timeout="1s"/>
<op id="start-cat" name="start" interval="0" timeout="1s"/>
<op id="monitor-cat" name="monitor" interval="1s" timeout="1s"/>
</operations>
</primitive>
</resources>
'';
in ''
import re
import time
start_all()
${lib.concatMapStrings (node: ''
${node}.wait_until_succeeds("corosync-quorumtool")
${node}.wait_for_unit("pacemaker.service")
'') (builtins.attrNames nodes)}
# No STONITH device
node1.succeed("crm_attribute -t crm_config -n stonith-enabled -v false")
# Configure the cat resource
node1.succeed("cibadmin --replace --scope resources --xml-file ${resources}")
# wait until the service is started
while True:
output = node1.succeed("crm_resource -r cat --locate")
match = re.search("is running on: (.+)", output)
if match:
for machine in machines:
if machine.name == match.group(1):
current_node = machine
break
time.sleep(1)
current_node.log("Service running here!")
current_node.crash()
# pick another node that's still up
for machine in machines:
if machine.booted:
check_node = machine
# find where the service has been started next
while True:
output = check_node.succeed("crm_resource -r cat --locate")
match = re.search("is running on: (.+)", output)
# output will remain the old current_node until the crash is detected by pacemaker
if match and match.group(1) != current_node.name:
for machine in machines:
if machine.name == match.group(1):
next_node = machine
break
time.sleep(1)
next_node.log("Service migrated here!")
'';
})