author    Micah Anderson <micah@riseup.net>  2022-02-02 20:43:39 -0500
committer Micah Anderson <micah@riseup.net>  2022-02-02 20:43:39 -0500
commit    f916b8c08f136e38bccf15840ff967756b8962d2 (patch)
tree      6abf0100bcba508c785e343cbb823af74488f16e
parent    34bd962095ab5e81282f01c829fb6311b5643153 (diff)
git subrepo pull float
subrepo:
  subdir:   "float"
  merged:   "f1ee5fa4"
upstream:
  origin:   "https://git.autistici.org/ai3/float.git"
  branch:   "master"
  commit:   "f1ee5fa4"
git-subrepo:
  version:  "0.4.1"
  origin:   "https://github.com/ingydotnet/git-subrepo"
  commit:   "a04d8c2"
-rw-r--r--  float/.gitlab-ci.yml | 17
-rw-r--r--  float/.gitrepo | 4
-rw-r--r--  float/docs/reference.md | 247
-rw-r--r--  float/docs/reference.pdf | bin 487445 -> 499864 bytes
-rwxr-xr-x  float/float | 3
-rw-r--r--  float/playbooks/all.yml | 3
-rw-r--r--  float/playbooks/frontend.yml | 6
-rw-r--r--  float/plugins/action/sshca_sign.py | 2
-rw-r--r--  float/plugins/action/tinc_host_conf.py | 3
-rw-r--r--  float/plugins/inventory/float.py | 54
-rw-r--r--  float/roles/float-base-auth-server/templates/auth-server.yml.j2 | 19
-rw-r--r--  float/roles/float-base-auth-server/vars/main.yml | 5
-rw-r--r--  float/roles/float-base-backup-metadata/tasks/main.yml | 3
-rwxr-xr-x  float/roles/float-base-datasets/files/float-dataset-restore | 30
-rw-r--r--  float/roles/float-base-datasets/tasks/dataset.yml | 95
-rw-r--r--  float/roles/float-base-datasets/tasks/dataset_litestream.yml | 83
-rw-r--r--  float/roles/float-base-datasets/tasks/dataset_tabacco.yml | 47
-rw-r--r--  float/roles/float-base-datasets/tasks/main.yml | 26
-rw-r--r--  float/roles/float-base-datasets/templates/litestream-env.j2 | 3
-rw-r--r--  float/roles/float-base-datasets/templates/litestream-replicate-service.j2 | 24
-rw-r--r--  float/roles/float-base-datasets/templates/litestream-restore-script.j2 | 16
-rw-r--r--  float/roles/float-base-datasets/templates/restore-service.j2 | 6
-rw-r--r--  float/roles/float-base-datasets/templates/tabacco-restore-script.j2 (renamed from float/roles/float-base-datasets/templates/restore-script.j2) | 27
-rw-r--r--  float/roles/float-base-docker/tasks/main.yml | 10
-rw-r--r--  float/roles/float-base-docker/tasks/podman_debian.yml | 4
-rw-r--r--  float/roles/float-base-docker/templates/assetmon.default.j2 | 1
-rwxr-xr-x  float/roles/float-base-docker/templates/float-pull-image.j2 | 10
-rw-r--r--  float/roles/float-base-docker/templates/run.sh.j2 | 24
-rw-r--r--  float/roles/float-base-docker/templates/systemd.j2 | 30
-rw-r--r--  float/roles/float-base-net-overlay/templates/firewall/11net-overlay-raw.j2 | 14
-rw-r--r--  float/roles/float-base/defaults/main.yml | 19
-rw-r--r--  float/roles/float-base/files/disable-kmod-load.service | 10
-rw-r--r--  float/roles/float-base/files/float-lockdown.target | 7
-rw-r--r--  float/roles/float-base/files/mtail.service (renamed from float/roles/float-base/files/mtail.service.bullseye) | 0
-rw-r--r--  float/roles/float-base/files/mtail.service.buster | 17
l---------  float/roles/float-base/files/mtail.service.stretch | 1
-rw-r--r--  float/roles/float-base/files/node-exporter.default (renamed from float/roles/float-base/files/node-exporter.default.bullseye) | 0
-rw-r--r--  float/roles/float-base/files/node-exporter.default.buster | 1
l---------  float/roles/float-base/files/node-exporter.default.stretch | 1
-rw-r--r--  float/roles/float-base/handlers/main.yml | 2
-rw-r--r--  float/roles/float-base/tasks/apt.yml | 21
-rw-r--r--  float/roles/float-base/tasks/firewall.yml | 4
-rw-r--r--  float/roles/float-base/tasks/harden.yml | 40
-rw-r--r--  float/roles/float-base/tasks/main.yml | 3
-rw-r--r--  float/roles/float-base/tasks/prometheus.yml | 14
-rw-r--r--  float/roles/float-base/tasks/serial.yml | 20
-rw-r--r--  float/roles/float-base/tasks/service_discovery.yml | 2
-rw-r--r--  float/roles/float-base/tasks/ssh.yml | 2
-rw-r--r--  float/roles/float-base/tasks/syslog.yml | 20
-rw-r--r--  float/roles/float-base/templates/firewall/10float.j2 | 10
-rw-r--r--  float/roles/float-base/templates/grub-serial.j2 | 4
-rw-r--r--  float/roles/float-base/templates/jail.local.j2 | 2
-rw-r--r--  float/roles/float-base/templates/resolv.conf.j2 | 2
-rw-r--r--  float/roles/float-base/templates/rsyslog.conf.j2 | 31
-rw-r--r--  float/roles/float-base/templates/sources.list.j2 | 5
-rw-r--r--  float/roles/float-base/templates/sysctl.conf.j2 | 2
-rw-r--r--  float/roles/float-infra-acme/templates/config.yml.j2 | 4
-rw-r--r--  float/roles/float-infra-assetmon/handlers/main.yml | 6
-rw-r--r--  float/roles/float-infra-assetmon/tasks/main.yml | 18
-rw-r--r--  float/roles/float-infra-assetmon/templates/server.yml.j2 | 13
-rw-r--r--  float/roles/float-infra-dns/defaults/main.yml | 1
-rw-r--r--  float/roles/float-infra-dns/templates/bind/named.conf.local | 7
-rw-r--r--  float/roles/float-infra-dns/templates/bind/named.conf.options | 13
-rw-r--r--  float/roles/float-infra-dns/templates/dns/infra.yml | 38
-rw-r--r--  float/roles/float-infra-dns/templates/zonetool.yml | 15
-rw-r--r--  float/roles/float-infra-haproxy/templates/firewall/20haproxy.j2 | 4
-rw-r--r--  float/roles/float-infra-haproxy/templates/haproxy.cfg.j2 | 10
-rwxr-xr-x  float/roles/float-infra-log-collector/files/es_init.py | 58
-rwxr-xr-x  float/roles/float-infra-log-collector/files/kibana_importer.py | 106
-rw-r--r--  float/roles/float-infra-log-collector/templates/log-collector.logrotate.j2 | 6
-rw-r--r--  float/roles/float-infra-log-collector/templates/rsyslog-collector.conf.j2 | 27
-rw-r--r--  float/roles/float-infra-nginx/templates/config/snippets/proxy.conf | 2
-rw-r--r--  float/roles/float-infra-nginx/templates/nginx-upstream.j2 | 2
-rw-r--r--  float/roles/float-infra-prometheus/templates/alertmanager.yml.j2 | 6
-rw-r--r--  float/roles/float-infra-prometheus/templates/blackbox.yml.j2 | 50
-rw-r--r--  float/roles/float-infra-prometheus/templates/prometheus.yml.j2 | 40
-rw-r--r--  float/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml | 9
-rw-r--r--  float/roles/float-infra-prometheus/templates/rules/alerts_elasticsearch.conf.yml | 1
-rw-r--r--  float/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml | 22
-rw-r--r--  float/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml | 11
-rw-r--r--  float/roles/float-infra-sso-server/templates/server.yml.j2 | 10
-rw-r--r--  float/roles/float-util-geoip-dataset/tasks/main.yml | 2
-rwxr-xr-x  float/scripts/floatup.py | 2
-rw-r--r--  float/services.prometheus-lts.yml | 7
-rw-r--r--  float/services.yml.default | 257
-rw-r--r--  float/services.yml.no-elasticsearch | 109
-rw-r--r--  float/test/backup.ref/config-backup.yml | 9
-rw-r--r--  float/test/backup.ref/passwords.yml | 2
-rw-r--r--  float/test/backup.ref/services.yml | 43
-rw-r--r--  float/test/backup.ref/site.yml | 10
-rw-r--r--  float/test/base.ref/services.yml | 10
-rw-r--r--  float/test/float_integration_test/__init__.py | 47
-rw-r--r--  float/test/float_integration_test/http.py | 3
-rw-r--r--  float/test/float_integration_test/test_system.py | 77
-rw-r--r--  float/test/full.ref/services.yml | 17
95 files changed, 1299 insertions, 831 deletions
diff --git a/float/.gitlab-ci.yml b/float/.gitlab-ci.yml
index 5a218f9..eaca15b 100644
--- a/float/.gitlab-ci.yml
+++ b/float/.gitlab-ci.yml
@@ -24,7 +24,7 @@ variables:
${APT_PROXY:+-e config.apt_proxy=${APT_PROXY}}
$CREATE_ENV_VARS $BUILD_DIR
- - with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-buster} up
+ - with-ssh-key ./scripts/floatup.py ${LIBVIRT:+--ssh $LIBVIRT} --inventory $BUILD_DIR/hosts.yml --ram 2048 --cpu 2 --image ${VM_IMAGE:-bullseye} up
- with-ssh-key ./test-driver init --no-vagrant $BUILD_DIR
- with-ssh-key ./test-driver run $BUILD_DIR
after_script:
@@ -45,12 +45,6 @@ variables:
base_test:
<<: *base_test
variables:
- CREATE_ENV_VARS: "-e config.float_debian_dist=buster"
- TEST_DIR: "test/base.ref"
-
-base_bullseye_test:
- <<: *base_test
- variables:
VM_IMAGE: "bullseye"
CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3"
TEST_DIR: "test/base.ref"
@@ -58,15 +52,16 @@ base_bullseye_test:
full_test:
<<: *base_test
variables:
- CREATE_ENV_VARS: "-e config.float_debian_dist=buster"
+ VM_IMAGE: "bullseye"
+ CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3"
TEST_DIR: "test/full.ref"
-full_bullseye_test:
+backup_test:
<<: *base_test
variables:
VM_IMAGE: "bullseye"
- CREATE_ENV_VARS: "-e config.float_debian_dist=bullseye -e inventory.group_vars.vagrant.ansible_python_interpreter=/usr/bin/python3"
- TEST_DIR: "test/full.ref"
+ CREATE_ENV_VARS: "--additional-config test/backup.ref/config-backup.yml --playbook test/backup.ref/site.yml"
+ TEST_DIR: "test/backup.ref"
docker_build_and_release_tests:
stage: docker_build
diff --git a/float/.gitrepo b/float/.gitrepo
index 1fcfa32..5588de9 100644
--- a/float/.gitrepo
+++ b/float/.gitrepo
@@ -6,7 +6,7 @@
[subrepo]
remote = https://git.autistici.org/ai3/float.git
branch = master
- commit = 7406f7aa6f959d89b1d71cad7e2202e5b39e1668
- parent = f736f38035d8f838477a620715f165e62be75de6
+ commit = f1ee5fa49479e505815abd3cc9216a69800c1031
+ parent = 34bd962095ab5e81282f01c829fb6311b5643153
cmdver = 0.4.1
method = merge
diff --git a/float/docs/reference.md b/float/docs/reference.md
index e7af3fb..72c45fb 100644
--- a/float/docs/reference.md
+++ b/float/docs/reference.md
@@ -311,23 +311,42 @@ datasets only once (on the service master host).
### Backups
If provided with credentials for an external data repository, float
-will automatically make backups of your configured datasets. Float
-runs its own backup management system
-([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of
-Restic, which adds additional metadata to Restic snapshots to map
-float datasets.
-
-When a service is scheduled on a new host, for instance as a result of
-a re-scheduling, float will attempt to restore the associated datasets
-from their backups. While this is not a practical failover solution
-for complex services, we've found it works pretty well for a category
-of services with "important, but small - can afford to lose one day of
-changes" datasets that is quite common and useful in itself. For these
-services, running with num_instances=1 and counting on the
-backup/restore data move mechanism might provide sufficient
-availability and reliability.
-
-Restores can of course also be triggered manually whenever necessary.
+will automatically make backups of your configured datasets. These
+aren't just used for disaster recovery, but are an integral part of
+float's service management approach: when a service is scheduled on a
+new host, for instance as a result of a re-scheduling, float will
+attempt to automatically restore the associated datasets from their
+backups. Restores can of course also be triggered manually whenever
+necessary.
+
+Float offers two backup mechanisms for datasets:
+
+* For bulk data, it can use its own backup management system
+ ([tabacco](https://git.autistici.org/ai3/tools/tabacco)) on top of
+ Restic, which adds additional metadata to Restic snapshots to map
+ float datasets. This can be used as a primitive failover solution
+ for services that aren't "important" enough to afford their own
+ distributed storage abstractions, and where losing up to one day of
+  changes is tolerable. An alternative "live" solution, which would
+  favor correctness over availability, is also in the works. This
+ backup mechanism is *extensible* to understand the structure and
+ metadata of specific services' entities and accounts, if necessary.
+
+* There are a number of instances in float of a specific category of
+  service: single-hosted, small API services that run off a simple
+  SQLite database, some of which are critical to float's operation
+ (for example the backup metadata service itself). For this
+ particular use case, float supports backups with
+ [Litestream](https://litestream.io), an asynchronous replication
+ solution for SQLite, that offers point-in-time restore capabilities
+ (less than 1 second of data loss window) in case of disaster or when
+ the service is rescheduled.
+
+  Litestream requires an S3-compatible backend (Minio, AWS, etc.).
+
+Note that float does not, in its default configuration, provide the
+data storage services used by its backup mechanisms. These are treated
+as third-party (external) resources.
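
For illustration, a dataset opting into the Litestream mechanism might be declared as follows in *services.yml*. This is a hypothetical sketch (service name, image and paths are invented), but the `type` and `filename` dataset attributes correspond to the new litestream dataset driver introduced by this change, with `type` defaulting to `tabacco`:

```yaml
# Hypothetical services.yml fragment: a single-instance API service
# whose SQLite database is replicated with Litestream.
myservice:
  num_instances: 1
  containers:
    - name: api
      image: registry.example.com/myservice:latest
  datasets:
    - name: db
      type: litestream          # selects the Litestream driver
      path: /var/lib/myservice  # directory holding the database
      filename: myservice.db    # SQLite file, used by the driver
      owner: myservice
```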
### Volumes
@@ -1006,6 +1025,27 @@ in as root in order to run Ansible, so you'll most likely want to set
Keys used for login will be logged in the audit log, so you can still
tell admins apart.
+### SSH Host Certificates
+
+SSH host certificates contain a list of one or more *principals*, or
+names. For SSH CA validation to work correctly, that list should
+include the name used to connect to the host. But float doesn't really
+have an opinion on what public DNS names your hosts have: it only
+knows about the Ansible inventory! So in order to control the
+generation of SSH host certificate principals, it is possible to set
+the *ssh_host_key_principal* variable to a pattern that makes sense
+for your production environment. By default this is:
+
+```
+{{ inventory_hostname }}.{{ domain }}
+```
+
+which generates fully-qualified names on the *internal* zone. These
+won't be generated by float, and are likely not to exist, so you'll
+want to change this to something that matches your environment. The
+*ssh_host_key_principal* variable can of course also be set on a
+host-by-host basis, in the inventory.
+
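As a sketch, assuming a host that admins reach as *host1.example.com* (a hypothetical name), the per-host override could look like this in the inventory:

```yaml
# Hypothetical hosts.yml fragment: override the SSH host certificate
# principal for a single host.
hosts:
  host1:
    ips:
      - 10.0.0.10
    ssh_host_key_principal: "host1.example.com"
```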
### SSH Client Setup
You will find the public key for this CA in the
@@ -1017,7 +1057,7 @@ use a wildcard), you should add the following entry to
*~/.ssh/known_hosts*:
```
-@cert_authority *.example.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAA....
+@cert-authority *.example.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAA....
```
Since all logins happen as root, it may be convenient to also add a
@@ -1787,19 +1827,21 @@ Variables can be Ansible variables: SSH parameters, etc., usually with
an *ansible_* prefix. But some host variables have special meaning for
float:
-`ip` (mandatory) is the IPv4 address of this host that other hosts
-(i.e. internal services) should use to reach it
-
-`ip6` (optional) is the IPv6 version of the above
-
-`public_ip` (optional) is the IPv4 address that will be advertised in
-the public-facing DNS zones, if unset it defaults to `ip`
+`ips` (mandatory) is the list of IP addresses of this host that other
+hosts (i.e. internal services) should use to reach it. You can specify
+one or more IP addresses, IPv4 or IPv6. Note that this is a **list**.
+For legacy reasons, float still also understands the `ip` (singular)
+attribute, which is expected to be a single IPv4 address, but this
+support will eventually be retired, so on new inventories you should
+use the `ips` list attribute.
-`public_ip6` (optional) is the IPv6 version of the above (if unset,
-it will default to `ip6`)
+`public_ips` (optional) is the list of IP addresses for this host that
+will be advertised in the public-facing DNS zones. If unset it
+defaults to `ips`.
`ip_<name>` (optional) defines the IPv4 address for this host on the
-overlay network called *name*
+overlay network called *name*. Note that as opposed to `ips` this is
+not a list but a single IPv4 address.
`groups` (optional) is a list of Ansible groups that this host should
be a member of
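
Putting these together, a hypothetical inventory entry using the new list-valued attributes might look like this (addresses and group names are illustrative):

```yaml
# Hypothetical host entry: "ips" can mix IPv4 and IPv6 addresses,
# "public_ips" is what gets advertised in public DNS, and "ip_vpn0"
# is a single IPv4 address on the "vpn0" overlay network.
hosts:
  web1:
    ansible_host: 10.10.0.2
    ips:
      - 10.10.0.2
      - fd42:1234::2
    public_ips:
      - 203.0.113.7
    ip_vpn0: 192.168.13.2
    groups: [frontend]
```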
@@ -1958,21 +2000,21 @@ service.
Each entry in the *monitoring_endpoints* list can have the following
attributes:
-`job_name`: Job name in Prometheus, defaults to the service name.
-
-`type` (deprecated): Selects the service discovery mechanism used by
-Prometheus to find the service endpoints. This can only have the value
-*static*, which is also the default.
-
`port`: Port where the `/metrics` endpoint is exported.
`scheme`: HTTP scheme for the service endpoint. The default is *https*.
+`healthcheck_http_method`: HTTP method to use for checking job
+status. The default is *HEAD*, to query the endpoint without
+transferring all the metric data. Not all endpoints support this
+method, so if the probe fails, set it to a method that the endpoint
+does support (worst case: *GET*).
+
`metrics_path`: Path for metrics if different from the default of `/metrics`.
`labels`: An optional dictionary of key/value labels to set for this
target (they will be added to all metrics scraped from it).
+The Prometheus *job* labels for service targets will be automatically
+generated by *float* to include the service name and the endpoint
+port.
+
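A hypothetical entry combining these attributes (port, labels and probe method chosen purely for illustration):

```yaml
# Hypothetical monitoring endpoint: plain-HTTP metrics on port 9090,
# probed with GET because this exporter does not answer HEAD.
monitoring_endpoints:
  - port: 9090
    scheme: http
    healthcheck_http_method: GET
    metrics_path: /metrics
    labels:
      tier: backend
```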
### Traffic routing
Services can define *public* HTTP and TCP endpoints, that will be
@@ -2046,10 +2088,10 @@ attributes, all required:
`name`: Name of the endpoint.
-`port`: Port where the service is running. Also the port that will be
-publicly exported (at least in the current implementation), which
-unfortunately means that the service itself shouldn't be running on
-*frontend* nodes.
+`port`: Port where the service is running.
+
+`public_port`: Port that should be exposed to the Internet. Defaults
+to `port` if unset.
`use_proxy_protocol`: When true, enable the HAProxy proxy protocol for
the service, to propagate the original client IP to the backends.
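
A sketch of a single endpoint entry using the new attribute (names and ports are illustrative, and the enclosing list is omitted):

```yaml
# Hypothetical endpoint: the daemon listens on 8080 internally,
# while 443 is the port exposed to the Internet (public_port
# defaults to port when unset).
- name: web
  port: 8080
  public_port: 443
  use_proxy_protocol: true
```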
@@ -2105,6 +2147,19 @@ option automatically sets *drop_capabilities* to false.
drop all capabilities for this container. Otherwise, the capability
set will be controlled by systemd.
+`egress_policy` (default: *allow-all*): selects the network egress
+policy for this container. This allows broad control over network
+connections made by the process running in the container, and it can
+take one of the following values:
+
+* *allow-all*, allows all traffic
+* *internal*, only allows traffic to float's internal private networks
+ (necessary for containers serving public_endpoints, of course)
+* *none*, only allows traffic to localhost
+
+These policies are implemented using BPF filters, which at the moment
+are quite simplistic, hence the limited configurability.
+
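As an example, a hypothetical container locked down to the internal networks would set the attribute like this:

```yaml
# Hypothetical container definition: egress restricted to float's
# internal private networks (other values: allow-all, none).
containers:
  - name: worker
    image: registry.example.com/worker:latest
    egress_policy: internal
```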
### Non-container services
`systemd_services`: List of systemd service units that are associated
@@ -2191,6 +2246,37 @@ The LVs are created in the volume specified by the `volumes_vg` global
configuration variable, which by default is *vg0*. The VG must already
exist, float will not attempt to create it.
+### Annotations
+
+`annotations`: Dictionary with service-specific annotations
+
+Annotations are manually curated metadata associated with the service,
+intended for debugging purposes. This is data meant for humans to
+consume, with the idea of helping the operators understand and debug
+your services and their interconnections.
+
+Annotations are for now only displayed on the float admin dashboard.
+
+`summary`: A short summary (description) of the service.
+
+#### Dependency graphs
+
+`dependencies`: A list of additional service dependencies.
+
+Float can automatically compute part of the dependency graph between
+your services, at least insofar as the structure of *public_endpoints*
+is concerned. Since this data can be quite useful in understanding the
+structure of a service, it is possible to extend the dependency graph
+manually by specifying additional edges (representing the dependencies
+between services).
+
+Edges of the dependency graphs are specified as objects with `client`
+and `server` attributes, identifying a specific container or systemd
+unit in either the current service or a different one. If you're
+referring to an entity within the same service, you can just use its
+name, while for external services the syntax is
+*service-name*/*entity-name* (e.g. "log-collector/elasticsearch").
+
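A hypothetical annotations block, assuming `summary` and `dependencies` nest under `annotations` as the section structure suggests:

```yaml
# Hypothetical service metadata: one manually declared dependency
# edge from the local "api" container to the "elasticsearch"
# container of the external log-collector service.
annotations:
  summary: "Thumbnail generation API"
  dependencies:
    - client: api
      server: log-collector/elasticsearch
```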
### Examples
Let's look at some example *services.yml* files:
@@ -2340,8 +2426,11 @@ each a dictionary with the following attributes:
documentation](https://git.autistici.org/id/auth/blob/master/README.md#password-encoding).
* `totp_secret` - TOTP secret for 2FA, base32-encoded
* `ssh_keys` - a list of strings representing SSH public keys
-* `u2f_registrations` - a list of objects representing U2F token
- registrations
+* `webauthn_registrations` - a list of objects representing
+  WebAuthN (U2F) token registrations
+* `u2f_registrations` - a list of objects representing legacy U2F
+ token registrations, only supported for old registrations created
+ before the switch to WebAuthN. Don't add new entries to this list.
### Authentication and SSO
@@ -2522,16 +2611,53 @@ but it will still be active and functional (via *amtool*).
#### Backups
To configure the backup system, you're going to need credentials for
-an external repository. The backup system
-uses [restic](https://restic.net), so check its documentation for the
-URI syntax.
+the third-party (external) data storage services. While it is possible
+to run a production service *without* backups configured, note that
+the cluster's functionality will be incomplete unless at least a
+Litestream backend is configured.
-`backup_repository_uri` - URI of the global (shared) restic repository
+##### Bulk backup (Restic)
-`backup_repository_restic_password` - the password used to encrypt
-the restic repository.
+`backup_repository_uri` - URI of the global (shared) restic
+repository. Though Restic supports [numerous
+backends](https://restic.readthedocs.io/en/stable/030_preparing_a_new_repo.html),
+float works best with Restic's own [REST
+Server](https://github.com/restic/rest-server).
+`backup_repository_restic_password` - password used to encrypt the
+restic repository.
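
A minimal sketch, assuming a Restic REST Server reachable at an invented URL (the `rest:` URI scheme is Restic's own):

```yaml
# Hypothetical Ansible configuration values for the bulk backup
# mechanism; both the endpoint and the password are placeholders.
backup_repository_uri: "rest:https://backup.example.com:8000/float"
backup_repository_restic_password: "use-a-long-random-secret"
```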
+##### Asynchronous SQLite replication (Litestream)
+
+Litestream requires an S3-compatible API to store its SQLite WAL
+snapshots.
+
+`backup_litestream_config` is the object that configures the
+Litestream replica target, and it corresponds to the "replica" field
+of the Litestream configuration, so you can check the [Litestream
+documentation](https://litestream.io/reference/config/#replica-settings)
+for reference. The most important fields to set are `endpoint` (the
+URL of the storage service API), and `bucket` (the name of the bucket
+to use). The *path* attribute will be automatically set by float,
+based on the dataset name.
+
+`backup_litestream_credentials` is a dictionary of environment
+variables to configure credentials for access to the backend storage
+service. Keys will depend on which type of API is being used, but for
+the default *s3* type they should be `LITESTREAM_ACCESS_KEY_ID` and
+`LITESTREAM_SECRET_ACCESS_KEY`.
+
+An example of a (fictional) litestream configuration:
+
+```yaml
+backup_litestream_config:
+ type: s3
+ endpoint: "https://backup.service:9000/"
+ bucket: "mybackups"
+backup_litestream_credentials:
+ LITESTREAM_ACCESS_KEY_ID: "minio"
+ LITESTREAM_SECRET_ACCESS_KEY: "miniopassword"
+```
# Operations
@@ -3102,21 +3228,23 @@ using a hardware token (U2F) is preferred.
### Registering a U2F hardware token for an admin account
In the *group_vars/all/admins.yml* file, you can add the
-*u2f_registrations* attribute to accounts, which is a list of the
-allowed U2F device registrations.
+*webauthn_registrations* attribute to accounts, which is a list of the
+allowed WebAuthN/U2F device registrations.
-To register a new device, you are going to need the *pamu2fcfg* tool
-(part of the *pamu2fcfg* Debian package). The following snippet should
-produce the two YAML attributes that you need to set:
+To register a new device, you are going to need to install another
+small custom tool:
+[webauthn-cred](https://git.autistici.org/ai3/tools/webauthn-cred). Follow
+its installation instructions to obtain the *webauthn-cred* binary,
+then invoke it to make a new registration:
```shell
-$ pamu2fcfg --nouser --appid https://accounts.example.com \
- | tr -d : \
- | awk -F, '{print "key_handle: \"" $1 "\"\npublic_key: \"" $2 "\""}'
+$ webauthn-cred --rpid accounts.example.com
```
-press enter, touch the key, copy the output and insert it in
-*group_vars/all/admins.yml*, the final results should look like:
+follow the instructions, copy the output and insert it in
+*group_vars/all/admins.yml* as a new item in the
+*webauthn_registrations* attribute of your user. The final results
+should look like:
```yaml
---
@@ -3126,14 +3254,11 @@ admins:
password: "$a2$3$32768$4$abcdef...."
ssh_keys:
- "ssh-ed25519 AAAAC3Nza..."
- u2f_registrations:
+ webauthn_registrations:
- key_handle: "r4wWRHgzJjl..."
- public_key: "04803e4aff4..."
+ public_key: "ajgh73-31bc..."
```
-**NOTE**: the above will work with *pam_u2f* version 1.0.7, but it will *not*
-work with pam_u2f version 1.1.0 due to changes in the output format!
-
### Upgrading Debian version on target hosts
Float generally targets the current Debian *stable* distribution, but
diff --git a/float/docs/reference.pdf b/float/docs/reference.pdf
index 0a9f8e9..d6dcce0 100644
--- a/float/docs/reference.pdf
+++ b/float/docs/reference.pdf
Binary files differ
diff --git a/float/float b/float/float
index e27b6bb..b315fd9 100755
--- a/float/float
+++ b/float/float
@@ -177,6 +177,7 @@ DEFAULT_VARS = {
'callback_plugins': '{{ srcdir | relpath(targetdir) }}/plugins/callback',
'force_handlers': True,
'log_path': 'ansible.log',
+ 'retry_files_enabled': False,
'nocows': 1,
'display_skipped_hosts': False,
@@ -296,7 +297,7 @@ def _random_hosts(num_hosts, extra_memberships):
hostvars = {
'name': hostname,
'ansible_host': f'{net}.{i+10}',
- 'ip': f'{net}.{i+10}',
+ 'ips': [f'{net}.{i+10}'],
'ip_vpn0': f'192.168.13.{i+10}',
}
hostgroups = ['vagrant']
diff --git a/float/playbooks/all.yml b/float/playbooks/all.yml
index 27bc4c5..ce74359 100644
--- a/float/playbooks/all.yml
+++ b/float/playbooks/all.yml
@@ -43,3 +43,6 @@
roles:
- float-infra-sso-server
+- hosts: assets
+ roles:
+ - float-infra-assetmon
diff --git a/float/playbooks/frontend.yml b/float/playbooks/frontend.yml
index 6c59341..27fe8e8 100644
--- a/float/playbooks/frontend.yml
+++ b/float/playbooks/frontend.yml
@@ -6,9 +6,13 @@
gather_facts: no
roles:
- float-infra-nginx
- - float-infra-dns
- float-infra-haproxy
+- hosts: dns
+ gather_facts: no
+ roles:
+ - float-infra-dns
+
- hosts: admin_dashboard
gather_facts: no
roles:
diff --git a/float/plugins/action/sshca_sign.py b/float/plugins/action/sshca_sign.py
index 6bcab8b..d9e307d 100644
--- a/float/plugins/action/sshca_sign.py
+++ b/float/plugins/action/sshca_sign.py
@@ -50,6 +50,7 @@ class ActionModule(ActionBase):
ca_private_key_path = self._task.args['ca']
pubkey_path = self._task.args['pubkey']
+ principals = self._task.args.get('principals', [fqdn])
validity = self._task.args.get('validity', '52w')
renew_days = int(self._task.args.get('renew_days', '60'))
cert_path = re.sub(r'\.pub$', '-cert.pub', pubkey_path)
@@ -74,7 +75,6 @@ class ActionModule(ActionBase):
tmp_cert_path = os.path.join(tmpdir, 'host-cert.pub')
self._fetch_pubkey(task_vars, pubkey_path, tmp_pubkey_path)
- principals = [hostname, fqdn]
subprocess.check_call(
['ssh-keygen', '-h', '-s', tmp_ca_private_key_path,
'-I', 'host-' + hostname, '-n', ','.join(principals),
diff --git a/float/plugins/action/tinc_host_conf.py b/float/plugins/action/tinc_host_conf.py
index 0eb14f3..5b83883 100644
--- a/float/plugins/action/tinc_host_conf.py
+++ b/float/plugins/action/tinc_host_conf.py
@@ -7,8 +7,9 @@ from ansible.module_utils._text import to_text
HOST_TEMPLATE = '''
+{% for ip in ips %}
Address = {{ ip }}
-{% if ip6 is defined %}Address = {{ ip6 }}{% endif %}
+{% endfor %}
Port = {{ tinc_config.port | default('655') }}
Cipher = {{ tinc_config.cipher | default('aes-128-cbc') }}
Digest = {{ tinc_config.digest | default('sha256') }}
diff --git a/float/plugins/inventory/float.py b/float/plugins/inventory/float.py
index 1e29228..e9fa22b 100644
--- a/float/plugins/inventory/float.py
+++ b/float/plugins/inventory/float.py
@@ -38,6 +38,9 @@ DEFAULT_SERVICE_CREDENTIALS = [
{
'name': 'auth-server',
},
+ {
+ 'name': 'assetmon-client',
+ },
]
@@ -172,30 +175,31 @@ def _host_groups(name, inventory, assignments=None):
# Return all host IP addresses for the specified overlay.
def _host_net_overlay_addrs(name, inventory, overlay):
if overlay == 'public':
- keys = ('ip', 'ip6')
- else:
- keys = ('ip_' + overlay,)
+ return inventory['hosts'][name]['public_ips']
+
addrs = []
- for k in keys:
- v = inventory['hosts'][name].get(k)
- if v:
- addrs.append(v)
+ key = 'ip_' + overlay
+ if key in inventory['hosts'][name]:
+ addrs.append(inventory['hosts'][name][key])
return addrs
# Return all host IP addresses, on all interfaces.
def _host_addrs(name, inventory):
- return [
- v for k, v in inventory['hosts'][name].items()
- if k == 'ip' or k == 'ip6' or k.startswith('ip_')]
+ addrs = []
+ for ip in inventory['hosts'][name]['ips']:
+ addrs.append(ip)
+ for k, v in inventory['hosts'][name].items():
+ if k.startswith('ip_'):
+ addrs.append(v)
+ return addrs
def _host_dns_map(name, inventory):
dns = {}
+ dns[name] = inventory['hosts'][name]['ips']
for k, v in inventory['hosts'][name].items():
- if k == 'ip' or k == 'ip6':
- dns.setdefault(name, []).append(v)
- elif k.startswith('ip_'):
+ if k.startswith('ip_'):
dns.setdefault(name + '.' + k[3:], []).append(v)
return dns
@@ -551,10 +555,34 @@ def _any_attribute_set(services, attr):
return False
+# Pre-process inventory entries, to normalize host variables and
+# provide defaults (thus simplifying the jinja template logic).
+def _preprocess_inventory(inventory):
+ for host in inventory['hosts'].values():
+ # Set 'ips' if the legacy variables are set.
+ if 'ips' not in host:
+ host['ips'] = []
+ if 'ip' in host:
+ host['ips'].append(host['ip'])
+ if 'ip6' in host:
+ host['ips'].append(host['ip6'])
+ # Same for 'public_ips'.
+ if 'public_ips' not in host:
+ host['public_ips'] = []
+ if 'public_ip' in host:
+ host['public_ips'].append(host['public_ip'])
+ if 'public_ip6' in host:
+ host['public_ips'].append(host['public_ip6'])
+ # Default public_ips to ips.
+ if not host['public_ips']:
+ host['public_ips'] = host['ips']
+
+
# Run the scheduler, and return inventory and groups for Ansible.
def run_scheduler(config):
services = config['services']
inventory = config['inventory']
+ _preprocess_inventory(inventory)
assignments = Assignments.schedule(services, inventory)
# Augment all data structures with autogenerated and
diff --git a/float/roles/float-base-auth-server/templates/auth-server.yml.j2 b/float/roles/float-base-auth-server/templates/auth-server.yml.j2
index cc8af1d..b19e22b 100644
--- a/float/roles/float-base-auth-server/templates/auth-server.yml.j2
+++ b/float/roles/float-base-auth-server/templates/auth-server.yml.j2
@@ -1,9 +1,16 @@
---
+{% set all_ips = hostvars.values() | rejectattr('ips', 'undefined') | map(attribute='ips') | flatten | sort %}
+
backends_dir: /etc/auth-server/backends.d
services_dir: /etc/auth-server/services.d
services: {}
+webauthn:
+ rp_id: "{{ webauthn_rp_id }}"
+ rp_origin: "https://{{ webauthn_rp_id }}"
+ rp_display_name: "{{ webauthn_rp_display_name | default(webauthn_rp_id) }}"
+
{% if 'user-meta-server' in services %}
user_meta_server:
url: "https://user-meta-server.{{ domain }}:5505"
@@ -25,9 +32,9 @@ rate_limits:
value: "127.0.0.1"
- key: ip
value: "::1"
-{% for h in groups['all'] | sort %}
+{% for ip in all_ips %}
- key: ip
- value: "{{ hostvars[h]['ip'] }}"
+ value: "{{ ip }}"
{% endfor %}
# Per-IP rate limiter specific to account recovery, with stricter limits.
@@ -40,9 +47,9 @@ rate_limits:
value: "127.0.0.1"
- key: ip
value: "::1"
-{% for h in groups['all'] | sort %}
+{% for ip in all_ips %}
- key: ip
- value: "{{ hostvars[h]['ip'] }}"
+ value: "{{ ip }}"
{% endfor %}
# Blacklist users with too many failed account recovery attempts.
@@ -73,9 +80,9 @@ rate_limits:
value: "127.0.0.1"
- key: ip
value: "::1"
-{% for h in groups['all'] | sort %}
+{% for ip in all_ips %}
- key: ip
- value: "{{ hostvars[h]['ip'] }}"
+ value: "{{ ip }}"
{% endfor %}
{% if 'auth-cache' in services %}
diff --git a/float/roles/float-base-auth-server/vars/main.yml b/float/roles/float-base-auth-server/vars/main.yml
new file mode 100644
index 0000000..0ebeaa5
--- /dev/null
+++ b/float/roles/float-base-auth-server/vars/main.yml
@@ -0,0 +1,5 @@
+---
+
+# Autodetect the default WebAuthN RP ID by looking at
+# the first public_endpoint of the sso-server service.
+webauthn_rp_id: "{{ services['sso-server'].public_endpoints[0].name if 'sso-server' in services else 'login' }}.{{ domain_public[0] }}"
diff --git a/float/roles/float-base-backup-metadata/tasks/main.yml b/float/roles/float-base-backup-metadata/tasks/main.yml
index 45cbacf..0eee23e 100644
--- a/float/roles/float-base-backup-metadata/tasks/main.yml
+++ b/float/roles/float-base-backup-metadata/tasks/main.yml
@@ -13,6 +13,9 @@
system: yes
state: present
+# The directory is already created by the dataset, but we need
+# to ensure the permissions are correct or the first ansible run
+# will fail (breaking tests).
- name: Create backup metadata server database dir
file:
path: /var/lib/tabacco-metadb
diff --git a/float/roles/float-base-datasets/files/float-dataset-restore b/float/roles/float-base-datasets/files/float-dataset-restore
new file mode 100755
index 0000000..2af8787
--- /dev/null
+++ b/float/roles/float-base-datasets/files/float-dataset-restore
@@ -0,0 +1,30 @@
+#!/bin/sh
+#
+# Restore a dataset (tag passed as command-line argument).
+#
+# Uses a guard file to ensure the restore runs only once
+# on a specific machine (or actually, once every time the
+# service is newly scheduled there).
+
+dataset_tag="$1"
+[ -z "${dataset_tag}" ] && exit 2
+
+umask 027
+
+guard_dir=/var/lib/float/datasets
+mkdir -p ${guard_dir}
+
+guard_file="${guard_dir}/${dataset_tag}.restore_guard"
+restore_script="/usr/lib/float/datasets/restore-${dataset_tag}"
+
+if [ -e "${guard_file}" ]; then
+ echo "restore already ran for this dataset, skipping..." >&2
+ exit 0
+fi
+
+${restore_script} && {
+ echo "restore was successful" >&2
+ touch "${guard_file}"
+}
+
+exit $?
diff --git a/float/roles/float-base-datasets/tasks/dataset.yml b/float/roles/float-base-datasets/tasks/dataset.yml
index b71cee2..80e0cee 100644
--- a/float/roles/float-base-datasets/tasks/dataset.yml
+++ b/float/roles/float-base-datasets/tasks/dataset.yml
@@ -6,19 +6,22 @@
- set_fact:
service: "{{ item.0 }}"
dataset: "{{ item.1 }}"
- dataset_name: "{{ item.0.name }}/{{ item.1.name }}"
- dataset_filename: "{{ item.0.name }}_{{ item.1.name }}"
- dataset_owner: "{{ item.1.get('owner', '') }}"
- dataset_group: "{{ item.1.get('group', 'root') }}"
- dataset_mode: "{{ item.1.get('mode', '0700') }}"
- dataset_path: "{{ item.1.get('path', '') }}"
- dataset_type: "{% if 'backup_command' in item.1 %}pipe{% else %}file{% endif %}"
- dataset_is_present: "{{ (item.0.name in float_enabled_services) }}"
- dataset_should_backup: "{{ (item.0.name in float_enabled_services) and ((not item.1.get('on_master_only', False)) or (item.0.get('master_host') == inventory_hostname)) }}"
- set_fact:
- dataset_should_restore: "{{ dataset_should_backup and not item.1.get('sharded', False) }}"
- dataset_restore_unit: "restore-{{ dataset_filename }}.service"
+ dataset_name: "{{ service.name }}/{{ dataset.name }}"
+ dataset_tag: "{{ service.name }}_{{ dataset.name }}"
+ dataset_desired_owner: "{{ dataset.get('owner', '') }}"
+ dataset_owner: "root"
+ dataset_group: "{{ dataset.get('group', 'root') }}"
+ dataset_mode: "{{ dataset.get('mode', '0700') }}"
+ dataset_path: "{{ dataset.get('path', '') }}"
+ dataset_driver: "{{ dataset.get('type', 'tabacco') }}"
+ dataset_is_present: "{{ (service.name in float_enabled_services) }}"
+ dataset_should_backup: "{{ (service.name in float_enabled_services) and ((not dataset.get('on_master_only', False)) or (service.get('master_host') == inventory_hostname)) }}"
+
+- set_fact:
+ dataset_should_restore: "{{ dataset_should_backup and not dataset.get('sharded', False) }}"
+ dataset_restore_unit: "restore-{{ dataset_tag }}.service"
- name: "Create path for dataset {{ dataset_name }}"
file:
@@ -33,59 +36,46 @@
- name: Check if the dataset owner exists
getent:
database: passwd
- key: "{{ dataset_owner }}"
+ key: "{{ dataset_desired_owner }}"
fail_key: false
- when: "(dataset_is_present) and (dataset_path) and (dataset_owner)"
+ when: "(dataset_is_present) and (dataset_path) and (dataset_desired_owner)"
+
+# Make it so that 'dataset_owner' is always safe to use.
+- set_fact:
+ dataset_owner: "{{ dataset_desired_owner }}"
+ when: "dataset_is_present and dataset_path and dataset_desired_owner and getent_passwd.get(dataset_desired_owner)"
- name: "Set permissions for dataset directory of {{ dataset_name }}"
file:
path: "{{ dataset_path }}"
state: directory
owner: "{{ dataset_owner }}"
- group: "{{ dataset_group | default('root') }}"
- when: "(dataset_is_present) and (dataset_path) and (dataset_owner) and (getent_passwd.get(dataset_owner))"
-
-- name: Set up configuration for dataset {{ dataset_name }} (source)
- template:
- src: "sources/source.yml.j2"
- dest: "/etc/tabacco/sources/{{ dataset_filename }}.yml"
- mode: 0600
- when: dataset_should_backup
- notify:
- - reload backup agent
-
-- name: Set up configuration for dataset {{ dataset_name }} (handler)
- template:
- src: "handlers/{{ dataset_type }}.yml.j2"
- dest: "/etc/tabacco/handlers/{{ dataset_filename }}.yml"
- mode: 0600
- when: dataset_should_backup and dataset_type == 'pipe'
- notify:
- - reload backup agent
+ group: "{{ dataset_group }}"
+ when: "dataset_is_present and dataset_path and dataset_desired_owner"
-- name: Clear configuration for dataset {{ dataset_name }}
- file:
- path: "/etc/tabacco/{{ diritem }}/{{ dataset_filename }}.yml"
- state: absent
- when: not dataset_should_backup
- with_items:
- - sources
- - handlers
- loop_control:
- loop_var: diritem
+- include_tasks: dataset_tabacco.yml
+ when: "dataset_driver == 'tabacco'"
-- name: Create restore script
- template:
- src: "restore-script.j2"
- dest: "/usr/lib/float/datasets/restore-{{ dataset_filename }}"
- mode: 0755
- when: dataset_should_restore
+- include_tasks: dataset_litestream.yml
+ when: "dataset_driver == 'litestream'"
+# Set up a restore unit that will need to run before the main service
+# units (via a Before= clause) to restore data from backups, if any.
+# These units run a driver-dependent restore script, that is protected
+# by a "guard file", to ensure that the restore script is only run
+# once, whenever the service is newly scheduled on a host.
- name: Create restore service unit
template:
src: "restore-service.j2"
dest: "/etc/systemd/system/{{ dataset_restore_unit }}"
- mode: 0444
+ mode: 0644
+ when: dataset_should_restore
+
+- name: Create restore script
+ template:
+ src: "{{ dataset_driver }}-restore-script.j2"
+ dest: "/usr/lib/float/datasets/restore-{{ dataset_tag }}"
+ mode: 0755
when: dataset_should_restore
- name: Enable restore service unit
@@ -95,12 +85,11 @@
daemon_reload: yes
when: dataset_should_restore
-# systemd disable is not idempotent, hence the ignore_errors.
- name: Disable restore service unit
systemd:
name: "{{ dataset_restore_unit }}"
enabled: no
- when: "(not dataset_should_restore) and (dataset_restore_unit in loaded_restore_systemd_units.stdout_lines)"
+ when: "(not dataset_should_restore) and (dataset_restore_unit in loaded_backup_systemd_units.stdout_lines)"
- name: Cleanup restore service unit
file:
@@ -111,6 +100,6 @@
- name: Wipe dataset restore guard file
file:
- path: "/var/lib/float/datasets/{{ dataset_filename }}.restore_guard"
+ path: "/var/lib/float/datasets/{{ dataset_tag }}.restore_guard"
state: absent
when: not dataset_should_backup
diff --git a/float/roles/float-base-datasets/tasks/dataset_litestream.yml b/float/roles/float-base-datasets/tasks/dataset_litestream.yml
new file mode 100644
index 0000000..7e31240
--- /dev/null
+++ b/float/roles/float-base-datasets/tasks/dataset_litestream.yml
@@ -0,0 +1,83 @@
+---
+
+- set_fact:
+ dataset_filename: "{{ dataset.filename }}"
+ dataset_replica_url: "{{ backup_litestream_url | default('') }}/{{ dataset_tag }}"
+ dataset_replication_unit: "replicate-{{ dataset_tag }}.service"
+ # Just don't backup at all if litestream is not configured.
+ dataset_should_backup: "{{ dataset_should_backup and (backup_litestream_config is defined) }}"
+
+# Automatically set the replication path for s3-type configs. Create a
+# copy of backup_litestream_config that is specific to this dataset.
+- set_fact:
+ dataset_litestream_config: "{{ backup_litestream_config }}"
+ when: dataset_should_backup
+
+- set_fact:
+ dataset_litestream_config: "{{ dataset_litestream_config | combine({'path': dataset_tag}) }}"
+ when: "dataset_should_backup and backup_litestream_config.get('type', 's3') == 's3'"
+
+- set_fact:
+ litestream_config:
+ dbs:
+ - path: "{{ dataset_path }}/{{ dataset_filename }}"
+ replicas: ["{{ dataset_litestream_config }}"]
+ when: dataset_should_backup
+
+- name: Create dataset litestream config
+ copy:
+ dest: "/etc/litestream/{{ dataset_tag }}.yml"
+ content: "{{ litestream_config | to_yaml }}\n"
+ owner: "{{ dataset_owner }}"
+ group: "{{ dataset_group }}"
+ mode: "{{ dataset_mode or '750' }}"
+ when: dataset_should_backup
+ register: ls_config
+
+- name: Create dataset litestream credentials config
+ template:
+ src: "litestream-env.j2"
+ dest: "/etc/litestream/{{ dataset_tag }}.env"
+ owner: "{{ dataset_owner }}"
+ group: "{{ dataset_group }}"
+ mode: "{{ dataset_mode or '750' }}"
+ when: dataset_should_backup
+ register: ls_env
+
+- name: Create dataset litestream replication systemd unit
+ template:
+ src: "litestream-replicate-service.j2"
+ dest: "/etc/systemd/system/{{ dataset_replication_unit }}"
+ when: dataset_should_backup
+ register: ls_unit
+
+# Since we can't parameterize handlers, we're forced to detect
+# needs-restart ourselves using the results from the previous tasks.
+- set_fact:
+ litestream_restart: "{{ ls_config.changed or ls_env.changed or ls_unit.changed }}"
+
+- name: Enable the litestream replication systemd unit
+ systemd:
+ name: "{{ dataset_replication_unit }}"
+ enabled: true
+ state: "{{ 'restarted' if litestream_restart else 'started' }}"
+ daemon_reload: true
+ when: dataset_should_backup
+
+- name: Disable the litestream replication systemd unit
+ systemd:
+    name: "{{ dataset_replication_unit }}"
+ enabled: false
+ when: "(not dataset_should_backup) and (dataset_replication_unit in loaded_backup_systemd_units.stdout_lines)"
+
+- name: Delete dataset litestream replication configs
+ file:
+ path: "{{ diritem }}"
+ state: absent
+ when: not dataset_should_backup
+ loop:
+ - "/etc/litestream/{{ dataset_tag }}.yml"
+ - "/etc/systemd/system/{{ dataset_replication_unit }}"
+ loop_control:
+ loop_var: diritem
+
diff --git a/float/roles/float-base-datasets/tasks/dataset_tabacco.yml b/float/roles/float-base-datasets/tasks/dataset_tabacco.yml
new file mode 100644
index 0000000..902e524
--- /dev/null
+++ b/float/roles/float-base-datasets/tasks/dataset_tabacco.yml
@@ -0,0 +1,47 @@
+---
+
+- set_fact:
+ dataset_type: "{{ 'pipe' if 'backup_command' in dataset else 'file' }}"
+ dataset_should_backup: "{{ dataset_should_backup and (backup_repository_uri is defined) }}"
+
+- name: Set up configuration for dataset {{ dataset_name }} (source)
+ template:
+ src: "sources/source.yml.j2"
+ dest: "/etc/tabacco/sources/{{ dataset_tag }}.yml"
+ mode: 0600
+ when: dataset_should_backup
+ notify:
+ - reload backup agent
+
+- name: Set up configuration for dataset {{ dataset_name }} (handler)
+ template:
+ src: "handlers/{{ dataset_type }}.yml.j2"
+ dest: "/etc/tabacco/handlers/{{ dataset_tag }}.yml"
+ mode: 0600
+ when: dataset_should_backup and dataset_type == 'pipe'
+ notify:
+ - reload backup agent
+
+- name: Clear configuration for dataset {{ dataset_name }}
+ file:
+ path: "/etc/tabacco/{{ diritem }}/{{ dataset_tag }}.yml"
+ state: absent
+ when: not dataset_should_backup
+ with_items:
+ - sources
+ - handlers
+ loop_control:
+ loop_var: diritem
+
+- name: Create restore script
+ template:
+ src: "tabacco-restore-script.j2"
+ dest: "/usr/lib/float/datasets/restore-{{ dataset_tag }}"
+ mode: 0755
+ when: dataset_should_restore
+
+- name: Delete restore script
+ file:
+ path: "/usr/lib/float/datasets/restore-{{ dataset_tag }}"
+ state: absent
+ when: not dataset_should_restore
diff --git a/float/roles/float-base-datasets/tasks/main.yml b/float/roles/float-base-datasets/tasks/main.yml
index a7a32a7..1e1b94e 100644
--- a/float/roles/float-base-datasets/tasks/main.yml
+++ b/float/roles/float-base-datasets/tasks/main.yml
@@ -35,15 +35,31 @@
enabled: yes
when: backup_repository_uri is defined
-- file:
- path: /usr/lib/float/datasets
+- name: Create backup-related directories
+ file:
+ path: "{{ item }}"
state: directory
+ owner: root
+ group: root
+ mode: 0755
+ loop:
+ - "/usr/lib/float/datasets"
+ - "/etc/litestream"
+
+- name: Create restore wrapper script
+ copy:
+ src: "float-dataset-restore"
+ dest: "/usr/lib/float/float-dataset-restore"
+ mode: 0755
-- name: Obtain list of restore service units
- shell: "systemctl list-units --no-legend --no-pager --full --type service restore-\\* | awk '{print $1}'"
+# The Ansible systemd module will fail when trying to disable a unit
+# that does not exist. To avoid such errors, we gather the list of
+# known service units and use it later to check for existence.
+- name: Obtain list of backup-related service units
+ shell: "systemctl list-units --no-legend --no-pager --full --type service restore-\\* replicate-\\* | awk '{print $1}'"
check_mode: no
changed_when: false
- register: loaded_restore_systemd_units
+ register: loaded_backup_systemd_units
- include_tasks: dataset.yml
loop: "{{ services | subelements('datasets', skip_missing=True) }}"
diff --git a/float/roles/float-base-datasets/templates/litestream-env.j2 b/float/roles/float-base-datasets/templates/litestream-env.j2
new file mode 100644
index 0000000..2fefe3b
--- /dev/null
+++ b/float/roles/float-base-datasets/templates/litestream-env.j2
@@ -0,0 +1,3 @@
+{% for var, value in backup_litestream_credentials | default({}) | dictsort %}
+{{ var }}="{{ value }}"
+{% endfor %}
diff --git a/float/roles/float-base-datasets/templates/litestream-replicate-service.j2 b/float/roles/float-base-datasets/templates/litestream-replicate-service.j2
new file mode 100644
index 0000000..053d4bf
--- /dev/null
+++ b/float/roles/float-base-datasets/templates/litestream-replicate-service.j2
@@ -0,0 +1,24 @@
+{% set required_by = service.systemd_services | default([]) %}
+
+[Unit]
+Description=Replicate dataset {{ dataset_name }}
+After={{ required_by | join(' ') }}
+PartOf={{ required_by | join(' ') }}
+
+[Service]
+Type=simple
+Restart=always
+RestartSec=3
+EnvironmentFile=/etc/litestream/{{ dataset_tag }}.env
+ExecStart=/usr/bin/litestream replicate --config=/etc/litestream/{{ dataset_tag }}.yml
+{% if dataset_owner %}
+User={{ dataset_owner }}
+{% endif %}
+Group={{ dataset_group }}
+
+NoNewPrivileges=true
+ReadOnlyDirectories=/
+ReadWriteDirectories={{ dataset_path }}
+
+[Install]
+RequiredBy={{ required_by | join(' ') }}
diff --git a/float/roles/float-base-datasets/templates/litestream-restore-script.j2 b/float/roles/float-base-datasets/templates/litestream-restore-script.j2
new file mode 100644
index 0000000..4d0d28a
--- /dev/null
+++ b/float/roles/float-base-datasets/templates/litestream-restore-script.j2
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+{% if backup_litestream_url is defined %}
+# Restore the dataset {{ dataset_name }} using litestream.
+
+/usr/bin/litestream restore --config=/etc/litestream/{{ dataset_tag }}.yml --if-replica-exists -v "{{ dataset_path }}/{{ dataset_filename }}"
+
+if [ $? -gt 0 ]; then
+ echo "ERROR: restore failed!" >&2
+ exit 1
+fi
+
+chown -R "{{ dataset_desired_owner }}":"{{ dataset_group }}" "{{ dataset_path }}"
+{% endif %}
+
+exit 0
diff --git a/float/roles/float-base-datasets/templates/restore-service.j2 b/float/roles/float-base-datasets/templates/restore-service.j2
index 592871a..07801ed 100644
--- a/float/roles/float-base-datasets/templates/restore-service.j2
+++ b/float/roles/float-base-datasets/templates/restore-service.j2
@@ -5,13 +5,9 @@ Description=Restore dataset {{ dataset_name }}
Before={{ required_by | join(' ') }}
[Service]
-{% if backup_repository_uri is defined %}
-ExecStart=/usr/lib/float/datasets/restore-{{ dataset_filename }}
-{% else %}
-ExecStart=/bin/true
-{% endif %}
Type=oneshot
RemainAfterExit=true
+ExecStart=/usr/lib/float/float-dataset-restore {{ dataset_tag }}
[Install]
RequiredBy={{ required_by | join(' ') }}
diff --git a/float/roles/float-base-datasets/templates/restore-script.j2 b/float/roles/float-base-datasets/templates/tabacco-restore-script.j2
index 3d05f23..4de8d75 100644
--- a/float/roles/float-base-datasets/templates/restore-script.j2
+++ b/float/roles/float-base-datasets/templates/tabacco-restore-script.j2
@@ -1,21 +1,7 @@
#!/bin/sh
-# Restore the dataset {{ dataset_name }}.
-
-# Uses a guard file to ensure the restore runs only once
-# on a specific machine (or actually, once every time the
-# service is newly scheduled there).
-
-umask 077
-
-guard_dir=/var/lib/float/datasets
-mkdir -p ${guard_dir}
-
-guard_file="${guard_dir}/{{ dataset_filename }}.restore_guard"
-if [ -e "${guard_file}" ]; then
- echo "restore already ran for this dataset, skipping..." >&2
- exit 0
-fi
+{% if backup_repository_uri is defined %}
+# Restore the dataset {{ dataset_name }} using tabacco.
# Use 'tabacco query' to detect if a backup of this dataset exists,
# otherwise there's nothing to restore (the service might be new
@@ -25,6 +11,7 @@ ds=$(tabacco query "${ds_pattern}" 2>/dev/null)
if [ "x${ds}" = "x[]" ]; then
echo "could not find any backups for ${ds_pattern}" >&2
echo "nothing to restore, skipping..." >&2
+ exit 0
else
echo "starting restore of ${ds_pattern}..." >&2
tabacco restore --target / "${ds_pattern}"
@@ -34,11 +21,9 @@ else
fi
fi
-{% if dataset_path and dataset_owner %}
-chown -R "{{ dataset_owner }}":"{{ dataset_group }}" "{{ dataset_path }}"
+{% if dataset_path and dataset_desired_owner %}
+chown -R "{{ dataset_desired_owner }}":"{{ dataset_group }}" "{{ dataset_path }}"
+{% endif %}
{% endif %}
-
-echo "marking restore successful" >&2
-touch "${guard_file}"
exit 0
diff --git a/float/roles/float-base-docker/tasks/main.yml b/float/roles/float-base-docker/tasks/main.yml
index 8b1d86f..459ac56 100644
--- a/float/roles/float-base-docker/tasks/main.yml
+++ b/float/roles/float-base-docker/tasks/main.yml
@@ -1,5 +1,10 @@
---
+- name: Configure asset tracking
+ template:
+ src: "assetmon.default.j2"
+ dest: "/etc/default/assetmon"
+
- include_tasks: docker.yml
when: "container_runtime == 'docker'"
@@ -8,7 +13,8 @@
- name: Login to the Docker registry
shell: 'echo -n "{{ docker_registry_password }}" | {{ container_runtime }} login -u "{{ docker_registry_username }}" --password-stdin "{{ docker_registry_url }}"'
- changed_when: False
+ changed_when: false
+ check_mode: no
when: "docker_registry_url != ''"
- name: Install docker-related scripts
@@ -75,7 +81,7 @@
register: etc_hosts_stat
- name: Find containers that need a restart
- shell: "{{ container_runtime }} ps --format={% raw %}'{{.Names}} {{.Created}}'{% endraw %} | awk '$2 < {{ etc_hosts_stat.stat.mtime | int }} {print $1}'"
+ shell: "{{ container_runtime }} ps --format={% raw %}'{{.Names}} {{.Created}}'{% endraw %} | sed -e 's,[-+][0-9]* [A-Z]*$,,' | while read name d t ; do s=$(date --date=\"$d $t\" +%s); echo \"$name $s\"; done | awk '$2 < {{ etc_hosts_stat.stat.mtime | int }} {print $1}'"
register: containers_to_restart
changed_when: false
check_mode: no
diff --git a/float/roles/float-base-docker/tasks/podman_debian.yml b/float/roles/float-base-docker/tasks/podman_debian.yml
index ba4e212..3ed64cc 100644
--- a/float/roles/float-base-docker/tasks/podman_debian.yml
+++ b/float/roles/float-base-docker/tasks/podman_debian.yml
@@ -3,10 +3,6 @@
# Install Podman using packages from the Debian distribution
# (available starting with Bullseye).
-- fail:
- msg: "Debian packages for Podman are only present in Bullseye"
- when: "float_debian_dist in ('stretch', 'buster')"
-
- name: Remove podman Kubic repository key
file:
path: "/etc/apt/trusted.gpg.d/kubic.gpg"
diff --git a/float/roles/float-base-docker/templates/assetmon.default.j2 b/float/roles/float-base-docker/templates/assetmon.default.j2
new file mode 100644
index 0000000..d9c0af4
--- /dev/null
+++ b/float/roles/float-base-docker/templates/assetmon.default.j2
@@ -0,0 +1 @@
+OPTIONS="--server=https://assets.{{ domain }}:3798 --tls-cert=/etc/credentials/x509/assetmon-client/client/cert.pem --tls-key=/etc/credentials/x509/assetmon-client/client/private_key.pem --tls-ca=/etc/credentials/x509/assetmon-client/ca.pem"
diff --git a/float/roles/float-base-docker/templates/float-pull-image.j2 b/float/roles/float-base-docker/templates/float-pull-image.j2
index 4e0517b..fae251f 100755
--- a/float/roles/float-base-docker/templates/float-pull-image.j2
+++ b/float/roles/float-base-docker/templates/float-pull-image.j2
@@ -17,8 +17,12 @@ get_main_auth_token() {
get_auth_token() {
local url="$1"
- local auth_hdr="$(curl -s -I -H "Accept: application/vnd.docker.distribution.manifest.v2+json" "$url" \
+ local auth_hdr="$(curl -fs -I -H "Accept: application/vnd.docker.distribution.manifest.v2+json" "$url" \
| awk 'BEGIN{IGNORECASE=1} /^www-authenticate:/ {print $3}')"
+ if [ -z "$auth_hdr" ]; then
+ echo "Could not obtain authentication token from $url" >&2
+ exit 1
+ fi
local scope=$(printf "%s" "${auth_hdr}" | sed -e 's/^.*scope="\([^"]*\)".*$/\1/')
local service=$(printf "%s" "${auth_hdr}" | sed -e 's/^.*service="\([^"]*\)".*$/\1/')
local realm=$(printf "%s" "${auth_hdr}" | sed -e 's/^.*realm="\([^"]*\)".*$/\1/')
@@ -27,13 +31,13 @@ get_auth_token() {
if [ -n "${main_auth_token}" ]; then
curl_opts="-H \"Authorization: Bearer ${main_auth_token}\""
fi
- curl ${curl_opts} -s "${realm}?service=${service}&scope=${scope}" | jq -r .token
+ curl ${curl_opts} -sf "${realm}?service=${service}&scope=${scope}" | jq -r .token
}
get_remote_image_version() {
local url="https://${registry_hostname}/v2/${image_path}/manifests/${image_tag}"
local token="$(get_auth_token "$url")"
- curl -s -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+ curl -sf -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
-H "Authorization: Bearer ${token}" \
"$url" \
| jq -r .config.digest
diff --git a/float/roles/float-base-docker/templates/run.sh.j2 b/float/roles/float-base-docker/templates/run.sh.j2
index 58d9dd7..4e08780 100644
--- a/float/roles/float-base-docker/templates/run.sh.j2
+++ b/float/roles/float-base-docker/templates/run.sh.j2
@@ -108,29 +108,25 @@ for gid in $(id -G {{ services[item.service].user }}); do
done
{% endif %}
+
+# TODO: move to --log-driver=passthrough once it is supported
+# by the Podman version in Debian stable, and then add the -d
+# option to get rid of the useless 'podman' process.
{% if container_runtime == 'podman' %}
exec /usr/bin/podman run \
-{% if float_debian_dist in ('stretch', 'buster') %}
- --cgroup-manager=cgroupfs \
- --cgroup-parent /system.slice/docker-{{ item.tag }}.service \
-{% else %}
--cgroups=disabled \
-{% endif %}
- --rm --name {{ item.service }}-{{ item.container.name }} \
- --no-healthcheck \
--replace \
- --log-driver=none \
- $opts \
-{% for opt in g.options %}
- {{ opt }} \
-{% endfor %}
- {{ item.container.image }} {{ item.container.get('args', '') }}
+ --sdnotify=conmon \
{% elif container_runtime == 'docker' %}
exec /usr/bin/systemd-docker --env run \
+{% endif %}
--rm --name {{ item.service }}-{{ item.container.name }} \
+ --pull=never \
+ --log-driver=none \
+ --no-healthcheck \
$opts \
{% for opt in g.options %}
{{ opt }} \
{% endfor %}
+ "$@" \
{{ item.container.image }} {{ item.container.get('args', '') }}
-{% endif %}
diff --git a/float/roles/float-base-docker/templates/systemd.j2 b/float/roles/float-base-docker/templates/systemd.j2
index 0fce97f..1d45fc1 100644
--- a/float/roles/float-base-docker/templates/systemd.j2
+++ b/float/roles/float-base-docker/templates/systemd.j2
@@ -6,30 +6,42 @@ Requires=docker.service
{% endif %}
[Service]
-ExecStart=/usr/lib/float/docker/run-{{ item.service }}-{{ item.container.name }}.sh
-ExecStop=/usr/bin/{{ container_runtime }} stop --time 20 {{ item.service }}-{{ item.container.name }}
-ExecStopPost=-/usr/bin/{{ container_runtime }} kill {{ item.service }}-{{ item.container.name }} 2>/dev/null
+ExecStartPre=-rm -f %t/%N.cid
+ExecStart=/usr/lib/float/docker/run-{{ item.service }}-{{ item.container.name }}.sh --cidfile=%t/%N.cid
+ExecStopPost=-/usr/bin/{{ container_runtime }} rm -f -i --cidfile=%t/%N.cid
+ExecStopPost=-rm -f %t/%N.cid
TimeoutStopSec=60
-KillMode=control-group
+KillMode=mixed
Restart=always
RestartSec=3s
-Type=simple
+Type=notify
+NotifyAccess=all
SyslogIdentifier={{ item.service }}-{{ item.container.name }}
{% if item.container.resources is defined %}
{% if item.container.resources.ram is defined %}
MemoryMax={{ item.container.resources.ram }}
-{% if float_debian_dist == 'buster' %}
-ExecStartPost=+/bin/sh -c "echo 0 > /sys/fs/cgroup/memory/system.slice/%n/memory.swappiness"
-{% else %}
MemorySwapMax=0
{% endif %}
-{% endif %}
{% if item.container.resources.cpu is defined %}
CPUQuota={{ 100 * item.container.resources.cpu }}%
{% endif %}
{% endif %}
LimitNOFILE=65535
+{% set egress_policy = item.container.get('egress_policy', 'allow-all') %}
+{% if egress_policy == 'internal' %}
+IPAddressDeny=any
+IPAddressAllow=localhost
+{# This is a terrible way to determine which private networks the host is on.
+ It would be a good candidate for pre-processing in the float plugin. #}
+{% for net_overlay in net_overlays | sort if ('ip_' + net_overlay.name) in hostvars[inventory_hostname] %}
+IPAddressAllow={{ net_overlay.network }}
+{% endfor %}
+{% elif egress_policy == 'none' %}
+IPAddressDeny=any
+IPAddressAllow=localhost
+{% endif %}
+
[Install]
WantedBy=multi-user.target {{ 'docker.service' if container_runtime == 'docker' else '' }}
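
Note: the egress_policy block added above builds a per-unit systemd IP firewall: 'internal' denies all egress except localhost and the overlay networks the host is attached to, while 'none' leaves only localhost. A minimal Python sketch of the allow-list computation; the inventory data below is made up for illustration.

    #!/usr/bin/python3
    # Sketch: reproduce the IPAddressAllow list that systemd.j2 emits.
    # The membership test is the same one the template uses: a host is
    # on an overlay iff it has an ip_<overlay-name> host variable.
    net_overlays = [
        {'name': 'vpn0', 'network': '10.0.0.0/8'},
        {'name': 'vpn1', 'network': '172.16.0.0/12'},
    ]
    hostvars = {'host1': {'ip_vpn0': '10.0.0.3'}}  # host1 is only on vpn0

    def egress_allow_list(host, policy):
        if policy not in ('internal', 'none'):
            return None  # allow-all: no IPAddressDeny/Allow lines emitted
        allow = ['localhost']
        if policy == 'internal':
            for ov in sorted(net_overlays, key=lambda o: o['name']):
                if ('ip_' + ov['name']) in hostvars[host]:
                    allow.append(ov['network'])
        return allow

    print(egress_allow_list('host1', 'internal'))
    # -> ['localhost', '10.0.0.0/8']: rendered as IPAddressDeny=any
    # plus one IPAddressAllow= line per entry.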
diff --git a/float/roles/float-base-net-overlay/templates/firewall/11net-overlay-raw.j2 b/float/roles/float-base-net-overlay/templates/firewall/11net-overlay-raw.j2
index 8d8d8bf..4526343 100644
--- a/float/roles/float-base-net-overlay/templates/firewall/11net-overlay-raw.j2
+++ b/float/roles/float-base-net-overlay/templates/firewall/11net-overlay-raw.j2
@@ -1,11 +1,17 @@
+{% macro allow_host_ips(h, chain) %}
+{% for ip in hostvars[h]['ips'] | ansible.netcommon.ipv4 | sort %}
+add_rule4 -A {{ chain }} -s {{ ip }} -j CT --notrack
+{% endfor %}
+{% for ip in hostvars[h]['ips'] | ansible.netcommon.ipv6 | sort %}
+add_rule6 -A {{ chain }} -s {{ ip }} -j CT --notrack
+{% endfor %}
+{% endmacro %}
+
# Allow peer nodes to communicate with our tinc daemon.
create_chain allow-vpn-{{ tinc_net }}
{% for h in groups['overlay_' + tinc_net]|sort %}
{% if h != inventory_hostname %}
-add_rule4 -A allow-vpn-{{ tinc_net }} -s {{ hostvars[h]['ip'] }} -j CT --notrack
-{% if hostvars[h].get('ip6') %}
-add_rule6 -A allow-vpn-{{ tinc_net }} -s {{ hostvars[h]['ip6'] }} -j CT --notrack
-{% endif %}
+{{ allow_host_ips(h, 'allow-vpn-' + tinc_net) }}
{% endif %}
{% endfor %}
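
Note: both this macro and the matching one in 10float.j2 below now read a single per-host 'ips' list and split it by address family with the ansible.netcommon.ipv4/ipv6 filters, replacing the old ip/ip6 pair of variables. The same split in plain Python, with example addresses:

    #!/usr/bin/python3
    # Sketch: split a mixed per-host address list by family, as the
    # ansible.netcommon.ipv4 / ansible.netcommon.ipv6 filters do here.
    import ipaddress

    ips = ['192.0.2.10', '2001:db8::10', '198.51.100.7']  # sample data

    v4 = sorted(ip for ip in ips if ipaddress.ip_address(ip).version == 4)
    v6 = sorted(ip for ip in ips if ipaddress.ip_address(ip).version == 6)

    for ip in v4:
        print(f'add_rule4 -A allow-vpn-net -s {ip} -j CT --notrack')
    for ip in v6:
        print(f'add_rule6 -A allow-vpn-net -s {ip} -j CT --notrack')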
diff --git a/float/roles/float-base/defaults/main.yml b/float/roles/float-base/defaults/main.yml
index da89650..eb35e9a 100644
--- a/float/roles/float-base/defaults/main.yml
+++ b/float/roles/float-base/defaults/main.yml
@@ -13,8 +13,12 @@ emergency_ssh_key: ""
# Port that SSH should listen on (best to keep it at the default).
ssh_port: 22
+# Principal for the SSH host key, i.e. the name used to reach the host
+# over SSH.
+ssh_host_key_principal: "{{ inventory_hostname }}.{{ domain }}"
+
# The Debian distribution that we are using as the basis.
-float_debian_dist: "buster"
+float_debian_dist: "bullseye"
# How to configure resolv.conf, one of the following options:
# 'ignore' - do nothing and leave resolv.conf alone
@@ -34,6 +38,17 @@ nf_conntrack_max: 524288
# Customize /etc/motd.
motd: |2
┏━╸╻ ┏━┓┏━┓╺┳╸
- ** ┣╸ ┃ ┃ ┃┣━┫ ┃
+ ** ┣╸ ┃ ┃ ┃┣━┫ ┃
╹ ┗━╸┗━┛╹ ╹ ╹ {{ inventory_hostname }}
+
+
+# Enable kernel lockdown measures (e.g. disable module loading post-boot).
+# Once enabled, this feature can be disabled only with a reboot.
+kernel_lockdown_enabled: false
+
+# When enabled, the kernel, GRUB and getty will be configured to use
+# serial port 0 (by default).
+enable_serial_port: false
+serial_port_unit: 0
+serial_port_speed: 115200
diff --git a/float/roles/float-base/files/disable-kmod-load.service b/float/roles/float-base/files/disable-kmod-load.service
new file mode 100644
index 0000000..b661e3f
--- /dev/null
+++ b/float/roles/float-base/files/disable-kmod-load.service
@@ -0,0 +1,10 @@
+[Unit]
+Description=Disable kernel module loading
+After=multi-user.target
+
+[Service]
+Type=oneshot
+ExecStart=/bin/sh -c "echo 1 > /proc/sys/kernel/modules_disabled"
+
+[Install]
+WantedBy=float-lockdown.target
diff --git a/float/roles/float-base/files/float-lockdown.target b/float/roles/float-base/files/float-lockdown.target
new file mode 100644
index 0000000..2aea3a4
--- /dev/null
+++ b/float/roles/float-base/files/float-lockdown.target
@@ -0,0 +1,7 @@
+[Unit]
+Description=float has been locked down
+Requires=multi-user.target
+After=multi-user.target
+
+[Install]
+WantedBy=multi-user.target
diff --git a/float/roles/float-base/files/mtail.service.bullseye b/float/roles/float-base/files/mtail.service
index a823ba4..a823ba4 100644
--- a/float/roles/float-base/files/mtail.service.bullseye
+++ b/float/roles/float-base/files/mtail.service
diff --git a/float/roles/float-base/files/mtail.service.buster b/float/roles/float-base/files/mtail.service.buster
deleted file mode 100644
index 4fc12dd..0000000
--- a/float/roles/float-base/files/mtail.service.buster
+++ /dev/null
@@ -1,17 +0,0 @@
-[Unit]
-Description=MTail
-Requires=mtail.socket
-
-[Service]
-Type=simple
-# Systemd will pass mtail.socket as FD 3.
-ExecStart=/usr/bin/mtail --progs /etc/mtail --logtostderr --port 3903 --poll_interval 0 --logs /dev/fd/3
-Restart=on-failure
-User=mtail
-
-# Limit memory leaks
-MemoryMax=1G
-ExecStartPost=+/bin/sh -c "echo 0 > /sys/fs/cgroup/memory/system.slice/%n/memory.swappiness"
-
-[Install]
-WantedBy=multi-user.target
diff --git a/float/roles/float-base/files/mtail.service.stretch b/float/roles/float-base/files/mtail.service.stretch
deleted file mode 120000
index b914cb3..0000000
--- a/float/roles/float-base/files/mtail.service.stretch
+++ /dev/null
@@ -1 +0,0 @@
-mtail.service.buster \ No newline at end of file
diff --git a/float/roles/float-base/files/node-exporter.default.bullseye b/float/roles/float-base/files/node-exporter.default
index 73e558e..73e558e 100644
--- a/float/roles/float-base/files/node-exporter.default.bullseye
+++ b/float/roles/float-base/files/node-exporter.default
diff --git a/float/roles/float-base/files/node-exporter.default.buster b/float/roles/float-base/files/node-exporter.default.buster
deleted file mode 100644
index 0509290..0000000
--- a/float/roles/float-base/files/node-exporter.default.buster
+++ /dev/null
@@ -1 +0,0 @@
-ARGS="--collector.ntp --collector.ntp.server-is-local --collector.systemd.unit-blacklist=.+(\.device|\.swap|\.mount|\.scope|\.slice|\.target)"
diff --git a/float/roles/float-base/files/node-exporter.default.stretch b/float/roles/float-base/files/node-exporter.default.stretch
deleted file mode 120000
index addd3e8..0000000
--- a/float/roles/float-base/files/node-exporter.default.stretch
+++ /dev/null
@@ -1 +0,0 @@
-node-exporter.default.buster \ No newline at end of file
diff --git a/float/roles/float-base/handlers/main.yml b/float/roles/float-base/handlers/main.yml
index 8aac477..ac47fa5 100644
--- a/float/roles/float-base/handlers/main.yml
+++ b/float/roles/float-base/handlers/main.yml
@@ -70,3 +70,5 @@
- name: update-ca-certificates
command: "/usr/sbin/update-ca-certificates"
+- name: update-grub
+ command: /usr/sbin/update-grub
diff --git a/float/roles/float-base/tasks/apt.yml b/float/roles/float-base/tasks/apt.yml
index 01ac223..5a0a5fb 100644
--- a/float/roles/float-base/tasks/apt.yml
+++ b/float/roles/float-base/tasks/apt.yml
@@ -83,6 +83,12 @@
state: present
when: "testing|default(True)"
+# Remove legacy stretch/buster mtail package pin.
+- name: Cleanup mtail package pin
+ file:
+ path: "/etc/apt/preferences.d/99float-syslog"
+ state: absent
+
- name: Install base packages
apt:
name: "{{ packages }}"
@@ -90,6 +96,7 @@
vars:
packages:
- unattended-upgrades
+ - systemd-coredump
- rsync
- git
- ntp
@@ -97,16 +104,26 @@
- curl
- lsof
- cgroups-exporter
- - rsyslog-exporter
- logcat
- tabacco
- restic
+ - litestream
- runcron
- acpid
- - lz4
+ - zstd
- man-db
- jq
- gpg
+ - firewall
+ - rsyslog
+ - rsyslog-relp
+ - rsyslog-exporter
+ - mtail
+ - auditd
+ - audisp-json
+ - prometheus-node-exporter
+ - prometheus-node-exporter-collectors
+ - assetmon
- name: Install extra packages
apt:
diff --git a/float/roles/float-base/tasks/firewall.yml b/float/roles/float-base/tasks/firewall.yml
index 849eee7..84b34d9 100644
--- a/float/roles/float-base/tasks/firewall.yml
+++ b/float/roles/float-base/tasks/firewall.yml
@@ -1,9 +1,5 @@
---
-- apt:
- name: firewall
- state: present
-
- template:
src: firewall/10float.j2
dest: /etc/firewall/filter.d/10float
diff --git a/float/roles/float-base/tasks/harden.yml b/float/roles/float-base/tasks/harden.yml
index b9e1958..3202889 100644
--- a/float/roles/float-base/tasks/harden.yml
+++ b/float/roles/float-base/tasks/harden.yml
@@ -39,18 +39,6 @@
- name: Restrict core dumps (PAM)
lineinfile: dest=/etc/security/limits.conf line="* hard core 0" state=present
-# Audit configuration on Debian stretch uses augenrules by default, so
-# we copy our rules in /etc/audit/rules.d.
-# TODO: evaluate whether we still need this.
-- name: Auditd installed
- apt:
- name: "{{ packages }}"
- state: present
- vars:
- packages:
- - auditd
- - audisp-json
-
- name: Auditd default config removed
file:
path: /etc/audit/rules.d/audit.rules
@@ -75,7 +63,7 @@
- name: Audispd plugins configured
copy:
src: "audit/plugins.d/{{ item }}"
- dest: "/etc/{{ 'audisp' if float_debian_dist in ('stretch', 'buster') else 'audit' }}/plugins.d/{{ item }}"
+ dest: "/etc/audit/plugins.d/{{ item }}"
with_items:
- syslog.conf
- json.conf
@@ -87,9 +75,31 @@
name: auditd.service
enabled: yes
-- name: Disable journald-auditd link
+- name: Mask unwanted systemd units
systemd:
- name: systemd-journald-audit.socket
+ name: "{{ item }}"
state: stopped
enabled: no
masked: yes
+ loop:
+ - "sys-fs-fuse-connections.mount"
+ - "systemd-journald-audit.socket"
+
+- name: Install lockdown systemd units
+ copy:
+ src: "{{ item }}"
+ dest: "/lib/systemd/system/{{ item }}"
+ loop:
+ - 'float-lockdown.target'
+ - 'disable-kmod-load.service'
+
+- name: Enable lockdown systemd units
+ systemd:
+ name: "{{ item }}"
+ enabled: "{{ kernel_lockdown_enabled }}"
+ daemon_reload: yes
+ loop:
+ - 'float-lockdown.target'
+ - 'disable-kmod-load.service'
+ ignore_errors: "{{ ansible_check_mode }}"
+
diff --git a/float/roles/float-base/tasks/main.yml b/float/roles/float-base/tasks/main.yml
index bf12d47..af7e332 100644
--- a/float/roles/float-base/tasks/main.yml
+++ b/float/roles/float-base/tasks/main.yml
@@ -83,6 +83,9 @@
- include_tasks: osquery.yml
when: enable_osquery|bool
+- include_tasks: serial.yml
+ when: enable_serial_port|bool
+
- name: Check ipmi availability
stat:
path: "/dev/ipmi0"
diff --git a/float/roles/float-base/tasks/prometheus.yml b/float/roles/float-base/tasks/prometheus.yml
index b713f22..9835d0a 100644
--- a/float/roles/float-base/tasks/prometheus.yml
+++ b/float/roles/float-base/tasks/prometheus.yml
@@ -2,23 +2,11 @@
- name: Install prometheus config files in /etc/default
copy:
- src: "node-exporter.default.{{ float_debian_dist }}"
+ src: "node-exporter.default"
dest: "/etc/default/prometheus-node-exporter"
notify:
- reload prometheus-node-exporter
-- name: Install prometheus node package
- apt:
- name: prometheus-node-exporter
- state: present
-
-- name: Install prometheus node extra package
- apt:
- name:
- - prometheus-node-exporter-collectors
- state: present
- when: "float_debian_dist not in ('stretch', 'buster')"
-
- name: Add static metrics
template:
src: "{{ item }}.j2"
diff --git a/float/roles/float-base/tasks/serial.yml b/float/roles/float-base/tasks/serial.yml
new file mode 100644
index 0000000..5bcc3e9
--- /dev/null
+++ b/float/roles/float-base/tasks/serial.yml
@@ -0,0 +1,20 @@
+# Enable serial output for logins and kernel/GRUB (if running GRUB)
+
+- name: Enable getty on serial port
+ systemd:
+ name: "getty@ttyS{{ serial_port_unit }}"
+ enabled: yes
+ state: started
+
+- name: Check GRUB availability
+ stat:
+ path: "/usr/sbin/update-grub"
+ register: grub_present
+
+- name: Configure kernel and GRUB to use the serial port
+ template:
+ src: grub-serial.j2
+ dest: /etc/default/grub.d/30-float-serial.cfg
+ notify:
+ - update-grub
+ when: grub_present.stat.exists
diff --git a/float/roles/float-base/tasks/service_discovery.yml b/float/roles/float-base/tasks/service_discovery.yml
index a39c90f..bd6e415 100644
--- a/float/roles/float-base/tasks/service_discovery.yml
+++ b/float/roles/float-base/tasks/service_discovery.yml
@@ -8,5 +8,5 @@
- name: Create /etc/host.conf
copy:
dest: /etc/host.conf
- content: "multi on"
+ content: "multi on\n"
diff --git a/float/roles/float-base/tasks/ssh.yml b/float/roles/float-base/tasks/ssh.yml
index 576d2b6..8b0fbee 100644
--- a/float/roles/float-base/tasks/ssh.yml
+++ b/float/roles/float-base/tasks/ssh.yml
@@ -28,6 +28,8 @@
sshca_sign:
ca: "{{ credentials_dir }}/ssh/key"
pubkey: "/etc/ssh/ssh_host_{{ item }}_key.pub"
+ principals:
+ - "{{ ssh_host_key_principal }}"
with_items: "{{ ssh_host_key_types }}"
notify:
- reload ssh
diff --git a/float/roles/float-base/tasks/syslog.yml b/float/roles/float-base/tasks/syslog.yml
index 3adc819..0bd78ee 100644
--- a/float/roles/float-base/tasks/syslog.yml
+++ b/float/roles/float-base/tasks/syslog.yml
@@ -1,23 +1,5 @@
---
-# mtail 3.0.0~rc19-2 on Buster is broken when reading from named pipes
-# Pin mtail to ai3 repo that ships mtail 3.0.0~rc5-1~bpo9+1
-- name: Force mtail version on buster
- copy:
- src: "mtail.apt-preferences"
- dest: "/etc/apt/preferences.d/99float-syslog"
- when: float_debian_dist == 'buster'
-
-- name: Install rsyslog packages
- apt:
- name: "{{ packages }}"
- state: present
- vars:
- packages:
- - rsyslog
- - rsyslog-gnutls
- - mtail
-
- name: Install mtail systemd socket unit
copy:
src: "mtail.socket"
@@ -26,7 +8,7 @@
- name: Install mtail systemd unit
copy:
- src: "mtail.service.{{ float_debian_dist }}"
+ src: "mtail.service"
dest: "/etc/systemd/system/mtail.service"
notify: restart mtail
diff --git a/float/roles/float-base/templates/firewall/10float.j2 b/float/roles/float-base/templates/firewall/10float.j2
index c14c507..e8888a4 100644
--- a/float/roles/float-base/templates/firewall/10float.j2
+++ b/float/roles/float-base/templates/firewall/10float.j2
@@ -2,10 +2,12 @@
# specific sets of hosts.
{% macro allow_host_ips(h, chain) %}
-add_rule4 -A {{ chain }} -s {{ hostvars[h]['ip'] }} -j ACCEPT
-{% if hostvars[h].get('ip6') %}
-add_rule6 -A {{ chain }} -s {{ hostvars[h]['ip6'] }} -j ACCEPT
-{% endif %}
+{% for ip in hostvars[h]['ips'] | ansible.netcommon.ipv4 | sort %}
+add_rule4 -A {{ chain }} -s {{ ip }} -j ACCEPT
+{% endfor %}
+{% for ip in hostvars[h]['ips'] | ansible.netcommon.ipv6 | sort %}
+add_rule6 -A {{ chain }} -s {{ ip }} -j ACCEPT
+{% endfor %}
{% endmacro %}
{% macro create_chain_from_host_group(chain, group) %}
diff --git a/float/roles/float-base/templates/grub-serial.j2 b/float/roles/float-base/templates/grub-serial.j2
new file mode 100644
index 0000000..3876a8a
--- /dev/null
+++ b/float/roles/float-base/templates/grub-serial.j2
@@ -0,0 +1,4 @@
+GRUB_CMDLINE_LINUX="$GRUB_CMDLINE_LINUX console=ttyS{{ serial_port_unit }},{{ serial_port_speed }}n8"
+
+GRUB_TERMINAL="$GRUB_TERMINAL serial"
+GRUB_SERIAL_COMMAND="serial --unit={{ serial_port_unit }} --speed={{ serial_port_speed }}"
diff --git a/float/roles/float-base/templates/jail.local.j2 b/float/roles/float-base/templates/jail.local.j2
index 2a55dbb..157c4f6 100644
--- a/float/roles/float-base/templates/jail.local.j2
+++ b/float/roles/float-base/templates/jail.local.j2
@@ -1,7 +1,7 @@
[DEFAULT]
# Avoid blacklisting any of our own IPs.
-ignoreip = 127.0.0.1/8 ::1 {{ hostvars | dictsort | map(attribute='1') | map(attribute='ip') | join(' ') }} {{ hostvars | dictsort | map(attribute='1') | map(attribute='ip6') | reject('undefined') | join(' ') }} {{ net_overlays | map(attribute='network') | sort | join(' ') }}
+ignoreip = 127.0.0.1/8 ::1 {{ hostvars.values() | rejectattr('ips', 'undefined') | map(attribute='ips') | flatten | sort | join(' ') }} {{ net_overlays | map(attribute='network') | sort | join(' ') }}
# Default to reading from the journal.
backend = systemd
diff --git a/float/roles/float-base/templates/resolv.conf.j2 b/float/roles/float-base/templates/resolv.conf.j2
index 26d85da..515fb0c 100644
--- a/float/roles/float-base/templates/resolv.conf.j2
+++ b/float/roles/float-base/templates/resolv.conf.j2
@@ -3,7 +3,7 @@ nameserver 127.0.0.1
options edns0
{% elif resolver_mode.startswith('internal:') %}
{% set dns_overlay_net = resolver_mode[9:] %}
-{% for h in groups['frontend'] | sort %}
+{% for h in services['dns'].hosts | sort %}
nameserver {{ hostvars[h]['ip_' + dns_overlay_net] }}
{% endfor %}
options edns0 rotate
diff --git a/float/roles/float-base/templates/rsyslog.conf.j2 b/float/roles/float-base/templates/rsyslog.conf.j2
index fbc831f..a9de870 100644
--- a/float/roles/float-base/templates/rsyslog.conf.j2
+++ b/float/roles/float-base/templates/rsyslog.conf.j2
@@ -1,9 +1,5 @@
global(
maxMessageSize="64k"
- defaultNetstreamDriver="gtls"
- defaultNetstreamDriverCAFile="/etc/credentials/x509/log-client/ca.pem"
- defaultNetstreamDriverCertFile="/etc/credentials/x509/log-client/client/cert.pem"
- defaultNetstreamDriverKeyFile="/etc/credentials/x509/log-client/client/private_key.pem"
)
module(load="imuxsock"
@@ -25,6 +21,11 @@ module(
load="omprog"
)
+module(
+ load="omrelp"
+ tls.tlslib="openssl"
+)
+
ruleset(name="process_stats") {
action(
type="omprog"
@@ -48,28 +49,32 @@ ruleset(name="incoming") {
# Protect the main queue from mtail pipe full: discard messages on
# ompipe action queue full.
queue.type="FixedArray"
- queue.size="4096"
+ queue.size="1024"
queue.timeoutEnqueue="0"
action.resumeRetryCount="-1"
action.resumeInterval="2"
action.resumeIntervalMax="30")
# Send everything to remote peer, do not write anything locally.
- action(type="omfwd"
- protocol="tcp"
+ action(type="omrelp"
target="log-collector.{{ domain }}"
port="6514"
- StreamDriver="gtls"
- StreamDriverMode="1"
- StreamDriverAuthMode="x509/name"
- StreamDriverPermittedPeers="log-collector.{{ domain }}"
- Keepalive="on"
+ tls="on"
+ tls.compression="on"
+ tls.authmode="certvalid"
+ tls.permittedpeer="log-collector.{{ domain }}"
+ tls.cacert="/etc/credentials/x509/log-client/ca.pem"
+ tls.mycert="/etc/credentials/x509/log-client/client/cert.pem"
+ tls.myprivkey="/etc/credentials/x509/log-client/client/private_key.pem"
action.resumeRetryCount="-1"
action.resumeInterval="2"
action.reportSuspension="on"
+ queue.workerthreads="4"
+ queue.size="50000"
queue.spoolDirectory="/var/spool/rsyslog"
queue.filename="remote"
- queue.maxdiskspace="1g"
+ queue.maxfilesize="64m"
+ queue.maxdiskspace="2g"
queue.type="LinkedList"
queue.saveonshutdown="on"
)
diff --git a/float/roles/float-base/templates/sources.list.j2 b/float/roles/float-base/templates/sources.list.j2
index de7b2e1..939e9ed 100644
--- a/float/roles/float-base/templates/sources.list.j2
+++ b/float/roles/float-base/templates/sources.list.j2
@@ -1,10 +1,5 @@
{% if apt_sources_list_override is defined %}{{ apt_sources_list_override }}{% else %}
deb http://deb.debian.org/debian {{ float_debian_dist }} main contrib non-free
deb http://deb.debian.org/debian {{ float_debian_dist }}-updates main contrib non-free
-{% if float_debian_dist in ('stretch', 'buster') %}
-deb http://deb.debian.org/debian {{ float_debian_dist }}-backports main
-deb http://security.debian.org/ {{ float_debian_dist }}/updates main contrib non-free
-{% else %}
deb http://security.debian.org/debian-security {{ float_debian_dist }}-security main contrib non-free
{% endif %}
-{% endif %}
diff --git a/float/roles/float-base/templates/sysctl.conf.j2 b/float/roles/float-base/templates/sysctl.conf.j2
index 7a0e0d7..2a443ea 100644
--- a/float/roles/float-base/templates/sysctl.conf.j2
+++ b/float/roles/float-base/templates/sysctl.conf.j2
@@ -119,7 +119,7 @@ kernel.unprivileged_bpf_disabled=1
# Disable unprivileged user namespaces
# https://lwn.net/Articles/673597
# (linux-hardened default)
-#kernel.unprivileged_userns_clone=0
+kernel.unprivileged_userns_clone=0
# Enable yama ptrace restrictions
# https://www.kernel.org/doc/Documentation/security/Yama.txt
diff --git a/float/roles/float-infra-acme/templates/config.yml.j2 b/float/roles/float-infra-acme/templates/config.yml.j2
index 0839814..8533272 100644
--- a/float/roles/float-infra-acme/templates/config.yml.j2
+++ b/float/roles/float-infra-acme/templates/config.yml.j2
@@ -14,8 +14,8 @@ dns:
tsig_key_algo: "{{ acme_tsig_key.algo }}"
tsig_key_secret: "{{ acme_tsig_key.private }}"
nameservers:
-{% for h in groups['frontend']|sort %}
- - {{ hostvars[h]['ip'] }}
+{% for h in services['dns'].hosts | sort %}
+ - {{ h }}.dns.{{ domain }}
{% endfor %}
output:
path: "/var/lib/replds/acme"
diff --git a/float/roles/float-infra-assetmon/handlers/main.yml b/float/roles/float-infra-assetmon/handlers/main.yml
new file mode 100644
index 0000000..acbc01c
--- /dev/null
+++ b/float/roles/float-infra-assetmon/handlers/main.yml
@@ -0,0 +1,6 @@
+---
+
+- listen: reload assetmon
+ systemd:
+ name: docker-assets-http.service
+ state: restarted
diff --git a/float/roles/float-infra-assetmon/tasks/main.yml b/float/roles/float-infra-assetmon/tasks/main.yml
new file mode 100644
index 0000000..0334b4e
--- /dev/null
+++ b/float/roles/float-infra-assetmon/tasks/main.yml
@@ -0,0 +1,18 @@
+---
+
+- name: Create /etc/assetmon
+ file:
+ path: "/etc/assetmon"
+ state: directory
+ owner: root
+ group: docker-assets
+ mode: 0750
+
+- name: Configure asset tracking server
+ template:
+ src: "server.yml.j2"
+ dest: "/etc/assetmon/server.yml"
+ owner: root
+ group: docker-assets
+ mode: 0640
+ notify: reload assetmon
diff --git a/float/roles/float-infra-assetmon/templates/server.yml.j2 b/float/roles/float-infra-assetmon/templates/server.yml.j2
new file mode 100644
index 0000000..3311665
--- /dev/null
+++ b/float/roles/float-infra-assetmon/templates/server.yml.j2
@@ -0,0 +1,13 @@
+db_uri: /var/lib/assetmon/assets.db
+http_server:
+ request_timeout: 30
+ tls:
+ cert: "/etc/credentials/x509/assetmon/server/cert.pem"
+ key: "/etc/credentials/x509/assetmon/server/private_key.pem"
+ ca: "/etc/credentials/x509/assetmon/ca.pem"
+ acl:
+ allow:
+ - path: "/api/v1/.*"
+ cn: "assetmon-client.{{ domain }}"
+ - path: "/(|image)$"
+ cn: ".*"
diff --git a/float/roles/float-infra-dns/defaults/main.yml b/float/roles/float-infra-dns/defaults/main.yml
index 34f1562..a66e91e 100644
--- a/float/roles/float-infra-dns/defaults/main.yml
+++ b/float/roles/float-infra-dns/defaults/main.yml
@@ -3,3 +3,4 @@
# The domain name used for NS and MX records for autogenerated zones.
# By default, this is the first public domain.
mx_ns_domain: "{{ domain_public[0] }}"
+
diff --git a/float/roles/float-infra-dns/templates/bind/named.conf.local b/float/roles/float-infra-dns/templates/bind/named.conf.local
index bd16de0..baa7da7 100644
--- a/float/roles/float-infra-dns/templates/bind/named.conf.local
+++ b/float/roles/float-infra-dns/templates/bind/named.conf.local
@@ -8,8 +8,6 @@ view "internal-in" in {
{% endfor %}
};
recursion yes;
- additional-from-auth yes;
- additional-from-cache yes;
zone-statistics no;
// Send minimal responses, to avoid problems with the Spamassassin
@@ -31,11 +29,6 @@ view "external-in" in {
recursion no;
zone-statistics yes;
- // Do not trust the cache when generating additional records
- // for our authoritative zones.
- additional-from-auth no;
- additional-from-cache no;
-
// Include manually-maintained zones.
include "/etc/bind/named.conf.external-custom-zones";
diff --git a/float/roles/float-infra-dns/templates/bind/named.conf.options b/float/roles/float-infra-dns/templates/bind/named.conf.options
index 5e22f5d..fb34501 100644
--- a/float/roles/float-infra-dns/templates/bind/named.conf.options
+++ b/float/roles/float-infra-dns/templates/bind/named.conf.options
@@ -10,17 +10,17 @@ options {
{% if float_limit_bind_to_known_interfaces | default(False) %}
listen-on {
127.0.0.1;
-{% for h in services['frontend'].hosts | sort %}
- {{ hostvars[h]['ip'] }};
-{% for n in net_overlays | sort if ('ip_' + n.name) in hostvars[h] %}
- {{ hostvars[h]['ip_' + n.name] }};
+{% for ip in ips | ansible.netcommon.ipv4 | sort %}
+ {{ ip }};
{% endfor %}
+{% for n in net_overlays | sort if ('ip_' + n.name) in hostvars[inventory_hostname] %}
+ {{ hostvars[inventory_hostname]['ip_' + n.name] }};
{% endfor %}
};
listen-on-v6 {
::1;
-{% for h in services['frontend'].hosts | sort if 'ip6' in hostvars[h] %}
- {{ hostvars[h]['ip6'] }};
+{% for ip in ips | ansible.netcommon.ipv6 | sort %}
+ {{ ip }};
{% endfor %}
};
{% else %}
@@ -28,7 +28,6 @@ options {
listen-on-v6 { any; };
{% endif %}
- dnssec-enable yes;
dnssec-validation auto;
notify no;
diff --git a/float/roles/float-infra-dns/templates/dns/infra.yml b/float/roles/float-infra-dns/templates/dns/infra.yml
index fa0cd60..b104d31 100644
--- a/float/roles/float-infra-dns/templates/dns/infra.yml
+++ b/float/roles/float-infra-dns/templates/dns/infra.yml
@@ -2,9 +2,13 @@
"@ns":
_:
-{% for h in services['frontend'].hosts|sort %}
+{% for h in services['dns'].hosts | sort %}
+{% set host_ip4 = hostvars[h]['public_ips'] | ansible.netcommon.ipv4 %}
+{% set host_ip6 = hostvars[h]['public_ips'] | ansible.netcommon.ipv6 %}
+{% if host_ip4 %}
- NS ns{{ loop.index }}.{{ mx_ns_domain }}.
-{% if 'ip6' in hostvars[h] %}
+{% endif %}
+{% if host_ip6 %}
- NS ns{{ loop.index }}-v6.{{ mx_ns_domain }}.
{% endif %}
{% endfor %}
@@ -27,22 +31,34 @@
www: CNAME www.l.{{ d }}.
# The explicit NS delegation for 'l' is necessary for dnssec-sign to work properly.
l:
-{% for h in services['frontend'].hosts|sort %}
+{% for h in services['dns'].hosts | sort %}
+{% set host_ip4 = hostvars[h]['public_ips'] | ansible.netcommon.ipv4 %}
+{% set host_ip6 = hostvars[h]['public_ips'] | ansible.netcommon.ipv6 %}
+{% if host_ip4 %}
- NS ns{{ loop.index }}.{{ mx_ns_domain }}.
+{% endif %}
+{% if host_ip6 %}
+ - NS ns{{ loop.index }}-v6.{{ mx_ns_domain }}.
+{% endif %}
{% endfor %}
-{% for h in services['frontend'].hosts|sort %}
- ns{{ loop.index }}: {{ hostvars[h]['public_ip'] | default(hostvars[h]['ip']) }}
-{% if 'ip6' in hostvars[h] or 'public_ip6' in hostvars[h] %}
- ns{{ loop.index }}-v6: AAAA {{ hostvars[h]['public_ip6'] | default(hostvars[h]['ip6']) }}
+
+{% if d == mx_ns_domain %}
+{# Only generate the nameservers' A records on the chosen zone #}
+{% for h in services['dns'].hosts | sort %}
+{% set host_ip4 = hostvars[h]['public_ips'] | ansible.netcommon.ipv4 %}
+{% set host_ip6 = hostvars[h]['public_ips'] | ansible.netcommon.ipv6 %}
+{% if host_ip4 %}
+ ns{{ loop.index }}: {{ host_ip4 | to_json }}
{% endif %}
- mx{{ loop.index }}:
- - {{ hostvars[h]['public_ip'] | default(hostvars[h]['ip']) }}
-{% if 'ip6' in hostvars[h] or 'public_ip6' in hostvars[h] %}
- - AAAA {{ hostvars[h]['public_ip6'] | default(hostvars[h]['ip6']) }}
+{% if host_ip6 %}
+ ns{{ loop.index }}-v6: {{ host_ip6 | map('regex_replace', '^', 'AAAA ') | list | to_json }}
{% endif %}
+ mx{{ loop.index }}: {{ (host_ip4 + (host_ip6 | map('regex_replace', '^', 'AAAA ') | list)) | to_json }}
{% endfor %}
{% endif %}
+{% endif %}
+
{% for service_name, s in services|dictsort %}
{# Iterate over the HTTP endpoints #}
{% for pe in s.get('public_endpoints', []) if pe.get('name') and not pe.get('skip_dns', False) %}
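
Note: the zone template above now derives NS records from each DNS host's public_ips, emitting an NS entry per address family the host actually has, and generating the ns<N>/ns<N>-v6 glue and mx<N> records only in the zone selected as mx_ns_domain. A rough Python sketch of that logic; hosts and addresses are invented.

    #!/usr/bin/python3
    # Sketch of the per-family NS/glue generation in dns/infra.yml,
    # with an invented two-host inventory.
    import ipaddress

    dns_hosts = {
        'host1': ['192.0.2.10', '2001:db8::10'],
        'host2': ['192.0.2.11'],
    }

    def zone_records(zone, mx_ns_domain='example.com'):
        out = {'NS': [], 'glue': {}}
        for i, h in enumerate(sorted(dns_hosts), start=1):
            ips = dns_hosts[h]
            v4 = [x for x in ips if ipaddress.ip_address(x).version == 4]
            v6 = [x for x in ips if ipaddress.ip_address(x).version == 6]
            if v4:
                out['NS'].append(f'ns{i}.{mx_ns_domain}.')
            if v6:
                out['NS'].append(f'ns{i}-v6.{mx_ns_domain}.')
            if zone == mx_ns_domain:
                # A/AAAA glue (and MX targets) live only in this zone.
                if v4:
                    out['glue'][f'ns{i}'] = v4
                if v6:
                    out['glue'][f'ns{i}-v6'] = [f'AAAA {x}' for x in v6]
                out['glue'][f'mx{i}'] = v4 + [f'AAAA {x}' for x in v6]
        return out

    print(zone_records('example.com'))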
diff --git a/float/roles/float-infra-dns/templates/zonetool.yml b/float/roles/float-infra-dns/templates/zonetool.yml
index a21721d..4f6fcbd 100644
--- a/float/roles/float-infra-dns/templates/zonetool.yml
+++ b/float/roles/float-infra-dns/templates/zonetool.yml
@@ -1,15 +1,6 @@
---
+{% set all_ips = services['frontend'].hosts | map('extract', hostvars) | rejectattr('traffic', 'false') | map(attribute='public_ips') | reject('undefined') | flatten %}
-FRONTENDS4:
-{% for h in services['frontend'].hosts|sort if hostvars[h].get('traffic', 1) %}
- - {{ hostvars[h]['public_ip'] | default(hostvars[h]['ip']) }}
-{% endfor %}
+FRONTENDS4: {{ all_ips | ansible.netcommon.ipv4 | list | to_json }}
-{% for h in services['frontend'].hosts|sort if (hostvars[h].get('traffic', 1) and (hostvars[h].get('public_ip6') or hostvars[h].get('ip6'))) %}
-{% if loop.first %}
-FRONTENDS6:
-{% endif %}
- - {{ hostvars[h].public_ip6 | default(hostvars[h].ip6) }}
-{% else %}
-FRONTENDS6: []
-{% endfor %}
+FRONTENDS6: {{ all_ips | ansible.netcommon.ipv6 | list | to_json }}
diff --git a/float/roles/float-infra-haproxy/templates/firewall/20haproxy.j2 b/float/roles/float-infra-haproxy/templates/firewall/20haproxy.j2
index ce02899..6460e7f 100644
--- a/float/roles/float-infra-haproxy/templates/firewall/20haproxy.j2
+++ b/float/roles/float-infra-haproxy/templates/firewall/20haproxy.j2
@@ -1,11 +1,11 @@
-{% for service_name, service in services|dictsort %}
+{% for service_name, service in services | dictsort %}
{% for ep in service.get('public_tcp_endpoints', []) %}
{% if ep.get('ports', []) %}
{% for port in ep.ports %}
allow_port tcp {{ port }}
{% endfor %}
{% else %}
-allow_port tcp {{ ep.port }}
+allow_port tcp {{ ep.public_port | default(ep.port) }}
{% endif %}
{% endfor %}
{% endfor %}
diff --git a/float/roles/float-infra-haproxy/templates/haproxy.cfg.j2 b/float/roles/float-infra-haproxy/templates/haproxy.cfg.j2
index 734895c..4fe5770 100644
--- a/float/roles/float-infra-haproxy/templates/haproxy.cfg.j2
+++ b/float/roles/float-infra-haproxy/templates/haproxy.cfg.j2
@@ -4,11 +4,9 @@ global
group haproxy
chroot /var/lib/haproxy
daemon
-{% if float_debian_dist != 'buster' %}
# use journald-compatible short format, and don't send 'emerg' level out
# http://cbonte.github.io/haproxy-dconv/2.2/configuration.html#3.1-log
log stdout format short local4 info alert
-{% endif %}
stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
defaults
@@ -41,22 +39,24 @@ backend be_{{ service_name }}_{{ ep.name }}_{{ port }}
option independent-streams
{% for s in services[service_name].hosts|sort %}
server task{{ loop.index -1 }} {{ s }}.{{ service_name }}.{{ domain }}:{{ port }} check fall 3 id {{ loop.index + 999 }} inter 5000 rise 3 slowstart 60000 weight 50{% if ep.get('use_proxy_protocol') %} send-proxy-v2{% endif %}
+
{% endfor %}
{% endfor %} # ep.ports
{% else %}
-frontend fe_{{ service_name }}_{{ ep.name }}_{{ ep.port }}
- bind :::{{ ep.port }}
+{% set public_port = ep.public_port | default(ep.port) %}
+frontend fe_{{ service_name }}_{{ ep.name }}_{{ public_port }}
+ bind :::{{ public_port }}
default_backend be_{{ service_name }}_{{ ep.name }}_{{ ep.port }}
-
backend be_{{ service_name }}_{{ ep.name }}_{{ ep.port }}
log global
balance leastconn
option independent-streams
{% for s in services[service_name].hosts|sort %}
server task{{ loop.index -1 }} {{ s }}.{{ service_name }}.{{ domain }}:{{ ep.port }} check fall 3 id {{ loop.index + 999 }} inter 5000 rise 3 slowstart 60000 weight 50{% if ep.get('use_proxy_protocol') %} send-proxy-v2{% endif %}
+
{% endfor %}
{% endif %} # ep.get('ports')
diff --git a/float/roles/float-infra-log-collector/files/es_init.py b/float/roles/float-infra-log-collector/files/es_init.py
index 31797f8..6a6b9bb 100755
--- a/float/roles/float-infra-log-collector/files/es_init.py
+++ b/float/roles/float-infra-log-collector/files/es_init.py
@@ -1,16 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# Initialize ES index templates, after having waited for ES to be ready.
-from __future__ import print_function
-
+import argparse
import glob
import json
-import optparse
import os
-import urllib2
import sys
import time
+import urllib.request
# Default index settings that are applied to indices that already
@@ -33,8 +31,8 @@ def wait_for_es(url, timeout):
while time.time() < deadline:
try:
resp = json.load(
- urllib2.urlopen('%s/_cluster/health?wait_for_status=yellow&timeout=%ds' % (
- url, timeout)))
+ urllib.request.urlopen(
+ f'{url}/_cluster/health?wait_for_status=yellow&timeout={timeout}s'))
if resp['status'] in ('yellow', 'green'):
return True
except Exception as e:
@@ -50,37 +48,38 @@ def load_index_template(url, tplfile):
with open(tplfile, 'r') as fd:
tpldata = fd.read()
name = os.path.splitext(os.path.basename(tplfile))[0]
- req = urllib2.Request(
+ req = urllib.request.Request(
'%s/_template/%s' % (url, name),
headers={'Content-Type': 'application/json'},
- data=tpldata)
+ data=tpldata.encode())
req.get_method = lambda: 'PUT'
try:
- urllib2.urlopen(req)
+ urllib.request.urlopen(req)
return True
- except urllib2.HTTPError as e:
+ except urllib.request.HTTPError as e:
print(e.read())
return False
def update_index_settings(url, index_name):
- req = urllib2.Request(
+ req = urllib.request.Request(
'%s/%s/_settings' % (url, index_name),
headers={'Content-Type': 'application/json'},
data=INDEX_SETTINGS)
req.get_method = lambda: 'PUT'
try:
- urllib2.urlopen(req)
+ urllib.request.urlopen(req)
return True
- except urllib2.HTTPError as e:
+ except urllib.request.HTTPError as e:
print(e.read())
return False
def update_existing_indices(url):
try:
- index_data = json.load(urllib2.urlopen('%s/_all' % url))
- except urllib2.HTTPError as e:
+ index_data = json.load(
+ urllib.request.urlopen(f'{url}/_all'))
+ except urllib.request.HTTPError as e:
print(e.read())
return False
for index_name in index_data.keys():
@@ -93,27 +92,25 @@ def update_existing_indices(url):
def main():
- parser = optparse.OptionParser()
- parser.add_option('--url', default='http://localhost:9200',
- help='Elasticsearch URL')
- parser.add_option('--dir', default='/etc/elasticsearch/templates',
- help='Directory containing JSON index templates')
- parser.add_option('--wait-timeout', dest='wait_timeout', type='int',
- default=1800)
- opts, args = parser.parse_args()
- if len(args) > 0:
- parser.error('too many arguments')
-
- if not wait_for_es(opts.url, opts.wait_timeout):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--url', default='http://localhost:9200',
+ help='Elasticsearch URL')
+ parser.add_argument('--dir', default='/etc/elasticsearch/templates',
+ help='Directory containing JSON index templates')
+ parser.add_argument('--wait-timeout', dest='wait_timeout', type=int,
+ default=1800)
+ args = parser.parse_args()
+
+ if not wait_for_es(args.url, args.wait_timeout):
return 1
ret = 0
- for tplfile in glob.glob(os.path.join(opts.dir, '*.json')):
+ for tplfile in glob.glob(os.path.join(args.dir, '*.json')):
print('Loading index template %s' % (tplfile,))
- if not load_index_template(opts.url, tplfile):
+ if not load_index_template(args.url, tplfile):
ret = 1
- if not update_existing_indices(opts.url):
+ if not update_existing_indices(args.url):
ret = 1
return ret
@@ -121,4 +118,3 @@ def main():
if __name__ == '__main__':
sys.exit(main())
-
diff --git a/float/roles/float-infra-log-collector/files/kibana_importer.py b/float/roles/float-infra-log-collector/files/kibana_importer.py
index a8827b3..407f973 100755
--- a/float/roles/float-infra-log-collector/files/kibana_importer.py
+++ b/float/roles/float-infra-log-collector/files/kibana_importer.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python3
# Imports Kibana dashboards by loading all files with a .json
# extension from a directory.
@@ -15,78 +15,78 @@ import json
import logging
import sys
import time
-import urllib2
+import urllib.request
DEFAULT_KIBANA_BASE_URL = 'http://localhost:5601'
def wait_for_green_status(kibana_base_url):
- while True:
- try:
- r = urllib2.urlopen(kibana_base_url + '/api/status')
- status = json.loads(r.read())['status']['overall']['state']
- if status == 'green':
- break
- logging.debug('Kibana status is not green (%s), retrying...', status)
- except Exception as e:
- logging.debug('Kibana is not reachable (%s), retrying...', e)
- time.sleep(1)
+ while True:
+ try:
+ r = json.load(urllib.request.urlopen(kibana_base_url + '/api/status'))
+ status = r['status']['overall']['state']
+ if status == 'green':
+ break
+ logging.debug('Kibana status is not green (%s), retrying...', status)
+ except Exception as e:
+ logging.debug('Kibana is not reachable (%s), retrying...', e)
+ time.sleep(1)
def upload_kibana_dashboards(kibana_base_url, jsonArray):
- for obj in jsonArray:
- headers = {
- 'kbn-xsrf': 'anything',
- 'Content-Type': 'application/json',
- }
- req = urllib2.Request(
- '%s/api/kibana/dashboards/import?force=true' % (kibana_base_url,),
- data=json.dumps(obj), headers=headers)
- urllib2.urlopen(req).read()
+ for obj in jsonArray:
+ headers = {
+ 'kbn-xsrf': 'anything',
+ 'Content-Type': 'application/json',
+ }
+ req = urllib.request.Request(
+ f'{kibana_base_url}/api/kibana/dashboards/import?force=true',
+ data=json.dumps(obj).encode(), headers=headers)
+ urllib.request.urlopen(req)
def set_default_index(kibana_base_url, index_id):
- req = urllib2.Request(
- '%s/api/kibana/settings/defaultIndex' % (kibana_base_url,),
- data=json.dumps({'value': index_id}),
- headers={
- 'kbn-xsrf': 'anything',
- 'Content-Type': 'application/json',
- })
- urllib2.urlopen(req).read()
+ req = urllib.request.Request(
+ f'{kibana_base_url}/api/kibana/settings/defaultIndex',
+ data=json.dumps({'value': index_id}).encode(),
+ headers={
+ 'kbn-xsrf': 'anything',
+ 'Content-Type': 'application/json',
+ })
+ urllib.request.urlopen(req)
def main():
- parser = argparse.ArgumentParser(
- description='Imports Kibana json dashboard files into Kibana via its REST API.\n\nExample:\n %s --dir . --kibana-url %s' % (sys.argv[0], DEFAULT_KIBANA_BASE_URL),
- epilog='This is Free Software under the MIT license.\nCopyright 2017 Niklas Hambuechen <mail@nh2.me>',
- formatter_class=argparse.RawDescriptionHelpFormatter)
- parser.add_argument('--dir', metavar='path', type=str, default='/etc/kibana/provisioning', help='Path to *.json files to import')
- parser.add_argument('--kibana-url', metavar='url', type=str, default=DEFAULT_KIBANA_BASE_URL, help='Kibana base URL (default ' + DEFAULT_KIBANA_BASE_URL + ')')
- parser.add_argument('--default-index', metavar='id', type=str, help='Default index ID')
- parser.add_argument('--wait', action='store_true', help='Wait indefinitely for Kibana port to up with status green')
- parser.add_argument('-v', '--verbose', action='store_true', help='increase output verbosity')
- args = parser.parse_args()
+ parser = argparse.ArgumentParser(
+ description='Imports Kibana json dashboard files into Kibana via its REST API.\n\nExample:\n %s --dir . --kibana-url %s' % (sys.argv[0], DEFAULT_KIBANA_BASE_URL),
+ epilog='This is Free Software under the MIT license.\nCopyright 2017 Niklas Hambuechen <mail@nh2.me>',
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+ parser.add_argument('--dir', metavar='path', type=str, default='/etc/kibana/provisioning', help='Path to *.json files to import')
+ parser.add_argument('--kibana-url', metavar='url', type=str, default=DEFAULT_KIBANA_BASE_URL, help='Kibana base URL (default ' + DEFAULT_KIBANA_BASE_URL + ')')
+ parser.add_argument('--default-index', metavar='id', type=str, help='Default index ID')
+ parser.add_argument('--wait', action='store_true', help='Wait indefinitely for Kibana to come up with status green')
+ parser.add_argument('-v', '--verbose', action='store_true', help='increase output verbosity')
+ args = parser.parse_args()
- if args.verbose:
- logging.basicConfig(level=logging.DEBUG)
+ if args.verbose:
+ logging.basicConfig(level=logging.DEBUG)
- # Load JSON file; it contains an array of objects, whose _type field
- # determines what it is and which endpoint we have to hit.
- dashboards = []
- for f in glob.glob(args.dir + '/*.json'):
- with open(f) as fd:
- dashboards.append(json.load(fd))
+ # Load JSON file; it contains an array of objects, whose _type field
+ # determines what it is and which endpoint we have to hit.
+ dashboards = []
+ for f in glob.glob(args.dir + '/*.json'):
+ with open(f) as fd:
+ dashboards.append(json.load(fd))
- if args.wait:
- wait_for_green_status(args.kibana_url)
+ if args.wait:
+ wait_for_green_status(args.kibana_url)
- upload_kibana_dashboards(args.kibana_url, dashboards)
+ upload_kibana_dashboards(args.kibana_url, dashboards)
- if args.default_index:
- set_default_index(args.kibana_url, args.default_index)
+ if args.default_index:
+ set_default_index(args.kibana_url, args.default_index)
if __name__ == '__main__':
- main()
+ main()
diff --git a/float/roles/float-infra-log-collector/templates/log-collector.logrotate.j2 b/float/roles/float-infra-log-collector/templates/log-collector.logrotate.j2
index 2c45786..7f1ccbe 100644
--- a/float/roles/float-infra-log-collector/templates/log-collector.logrotate.j2
+++ b/float/roles/float-infra-log-collector/templates/log-collector.logrotate.j2
@@ -3,9 +3,9 @@
missingok
rotate {{ log_collector_retention_days|default(15) }}
compress
- compresscmd /usr/bin/lz4c
- compressoptions -z
- compressext .lz4
+ compresscmd /usr/bin/zstd
+ compressoptions -9
+ compressext .zstd
notifempty
create 0600 docker-log-collector adm
sharedscripts
diff --git a/float/roles/float-infra-log-collector/templates/rsyslog-collector.conf.j2 b/float/roles/float-infra-log-collector/templates/rsyslog-collector.conf.j2
index 4282419..9adc7e1 100644
--- a/float/roles/float-infra-log-collector/templates/rsyslog-collector.conf.j2
+++ b/float/roles/float-infra-log-collector/templates/rsyslog-collector.conf.j2
@@ -1,10 +1,6 @@
global(
maxMessageSize="64k"
- defaultNetstreamDriver="gtls"
- defaultNetstreamDriverCAFile="/etc/credentials/x509/log-collector/ca.pem"
- defaultNetstreamDriverCertFile="/etc/credentials/x509/log-collector/server/cert.pem"
- defaultNetstreamDriverKeyFile="/etc/credentials/x509/log-collector/server/private_key.pem"
)
main_queue(
@@ -293,13 +289,20 @@ ruleset(name="incoming"){
}
module(
- load="imtcp"
- MaxSessions="500"
- StreamDriver.Name="gtls"
- StreamDriver.Mode="1"
- StreamDriver.AuthMode="x509/name"
- PermittedPeer="*.{{ domain }}"
+ load="imrelp"
+ tls.tlslib="openssl"
)
-input(type="imtcp" port="6514" ruleset="incoming")
-
+input(
+ type="imrelp"
+ port="6514"
+ maxDataSize="64k"
+ ruleset="incoming"
+ tls="on"
+ tls.compression="on"
+ tls.cacert="/etc/credentials/x509/log-collector/ca.pem"
+ tls.mycert="/etc/credentials/x509/log-collector/server/cert.pem"
+ tls.myprivkey="/etc/credentials/x509/log-collector/server/private_key.pem"
+ tls.permittedpeer="*.{{ domain }}"
+ tls.authmode="certvalid"
+)
diff --git a/float/roles/float-infra-nginx/templates/config/snippets/proxy.conf b/float/roles/float-infra-nginx/templates/config/snippets/proxy.conf
index 0f7bf32..4db06ec 100644
--- a/float/roles/float-infra-nginx/templates/config/snippets/proxy.conf
+++ b/float/roles/float-infra-nginx/templates/config/snippets/proxy.conf
@@ -2,7 +2,7 @@
proxy_set_header Host $http_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-Ssl on;
-proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+proxy_set_header X-Forwarded-For $remote_addr;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "";
diff --git a/float/roles/float-infra-nginx/templates/nginx-upstream.j2 b/float/roles/float-infra-nginx/templates/nginx-upstream.j2
index caae964..1b3e6bc 100644
--- a/float/roles/float-infra-nginx/templates/nginx-upstream.j2
+++ b/float/roles/float-infra-nginx/templates/nginx-upstream.j2
@@ -15,10 +15,8 @@ upstream {{ upstream.name }}{% if shard %}_{{ shard }}{% endif %} {
{% endif %}
keepalive 8;
-{% if float_debian_dist != 'buster' %}
keepalive_timeout 300s;
keepalive_requests 1000;
-{% endif %}
}
{% endmacro %}
diff --git a/float/roles/float-infra-prometheus/templates/alertmanager.yml.j2 b/float/roles/float-infra-prometheus/templates/alertmanager.yml.j2
index e9d133a..f784347 100644
--- a/float/roles/float-infra-prometheus/templates/alertmanager.yml.j2
+++ b/float/roles/float-infra-prometheus/templates/alertmanager.yml.j2
@@ -59,6 +59,12 @@ inhibit_rules:
target_match:
alertname: 'HostUnreachable'
equal: ['host']
+ # Inhibit job-level alerts if a service-level alert is available.
+ - source_match:
+ alertname: 'ProbeFailure'
+ target_match_re:
+ alertname: '(ServiceDegraded|ServiceAvailabilityTooLow)'
+ equal: ['float_service']
receivers:
- name: default
diff --git a/float/roles/float-infra-prometheus/templates/blackbox.yml.j2 b/float/roles/float-infra-prometheus/templates/blackbox.yml.j2
index 020533d..44ba6ed 100644
--- a/float/roles/float-infra-prometheus/templates/blackbox.yml.j2
+++ b/float/roles/float-infra-prometheus/templates/blackbox.yml.j2
@@ -1,16 +1,56 @@
modules:
- # Healthcheck probe to use on internal targets.
- http_health:
+ # Healthcheck probe to use on internal Prometheus targets. Only use
+ # this when the target supports neither of the HTTP methods that
+ # avoid transferring data (HEAD / OPTIONS).
+ http_health_get:
prober: http
timeout: 5s
http:
- valid_status_codes: []
+ valid_status_codes: [200]
method: GET
fail_if_ssl: false
fail_if_not_ssl: false
- fail_if_body_not_matches_regexp:
- - "OK"
+ preferred_ip_protocol: ip4
+ tls_config:
+ ca_file: /etc/credentials/x509/prometheus/ca.pem
+ cert_file: /etc/credentials/x509/prometheus/client/cert.pem
+ key_file: /etc/credentials/x509/prometheus/client/private_key.pem
+ insecure_skip_verify: false
+
+ # Healthcheck probe to use on internal Prometheus targets, makes
+ # a simple HEAD request to /metrics to avoid transferring the entire
+ # metrics dump (which might be huge). This probe also accepts a 405
+ # status (unsupported method), as that also indicates that the server
+ # is up and running and the chances it will serve metrics on a GET are
+ # extremely high.
+ http_health_head:
+ prober: http
+ timeout: 5s
+ http:
+ valid_status_codes: [200, 405]
+ method: HEAD
+ fail_if_ssl: false
+ fail_if_not_ssl: false
+ preferred_ip_protocol: ip4
+ tls_config:
+ ca_file: /etc/credentials/x509/prometheus/ca.pem
+ cert_file: /etc/credentials/x509/prometheus/client/cert.pem
+ key_file: /etc/credentials/x509/prometheus/client/private_key.pem
+ insecure_skip_verify: false
+
+ # Healthcheck probe to use on internal Prometheus targets, makes
+ # a simple OPTIONS request to /metrics to avoid transferring the entire
+ # metrics dump (which might be huge). In particular, Prometheus jobs
+ # themselves do not support the HEAD method, but do understand OPTIONS.
+ http_health_options:
+ prober: http
+ timeout: 5s
+ http:
+ valid_status_codes: [200]
+ method: OPTIONS
+ fail_if_ssl: false
+ fail_if_not_ssl: false
preferred_ip_protocol: ip4
tls_config:
ca_file: /etc/credentials/x509/prometheus/ca.pem
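
Note: the three http_health_* modules encode a single idea: a target is healthy if it answers 200 to a data-free request, and for HEAD a 405 "method not allowed" is accepted too, since it still proves the listener is up without transferring the metrics body. A standalone Python restatement of that decision; the URL is a placeholder and the mTLS client certificates of the real modules are omitted.

    #!/usr/bin/python3
    # Sketch of the http_health_* semantics: 200 is healthy for any
    # method; HEAD additionally accepts 405, which proves liveness
    # without fetching the (possibly huge) metrics dump.
    import urllib.error
    import urllib.request

    def probe(url, method='HEAD', timeout=5):
        ok = {200, 405} if method == 'HEAD' else {200}
        req = urllib.request.Request(url, method=method)
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.status in ok
        except urllib.error.HTTPError as e:
            return e.code in ok      # a 405 surfaces as an HTTPError
        except OSError:
            return False             # refused, reset, timed out, ...

    print(probe('http://localhost:9115/metrics', method='OPTIONS'))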
diff --git a/float/roles/float-infra-prometheus/templates/prometheus.yml.j2 b/float/roles/float-infra-prometheus/templates/prometheus.yml.j2
index 2719acb..5d86029 100644
--- a/float/roles/float-infra-prometheus/templates/prometheus.yml.j2
+++ b/float/roles/float-infra-prometheus/templates/prometheus.yml.j2
@@ -42,7 +42,7 @@
{# Job scrape config for a float service. #}
{% macro job_service_config(service_name, target_config) %}
- - job_name: "{{ target_config.get('job_name', service_name) }}"
+ - job_name: "{{ service_name }}_{{ target_config.port }}"
scheme: "{{ target_config.get('scheme', 'https') }}"
{% if target_config.get('metrics_path') %}
metrics_path: "{{ target_config['metrics_path'] }}"
@@ -104,6 +104,42 @@ scrape_configs:
{% for prober_host in services['prometheus'].hosts|sort %}
{% set prober_idx = loop.index %}
+{# Create all the health probes for all the monitored services #}
+{% for service_name, service in services | dictsort %}
+{% for target_config in service.get('monitoring_endpoints', []) %}
+ - job_name: "prober_health_{{ service_name | replace('-', '_') }}_{{ prober_idx }}_{{ loop.index }}"
+ metrics_path: "/probe"
+ params:
+ module:
+ - http_health_{{ target_config.healthcheck_http_method | default('HEAD') | lower }}
+ relabel_configs:
+ - source_labels: [__address__]
+ target_label: host
+ regex: "https?://([^.:/]*).*"
+ replacement: "${1}"
+ - source_labels: [__address__]
+ target_label: __param_target
+ - source_labels: [__param_target]
+ target_label: instance
+ - target_label: __address__
+ replacement: {{ prober_host }}.prometheus.{{ domain }}:9115
+ - target_label: prober_host
+ replacement: {{ prober_host }}
+ static_configs:
+ - targets:
+{% for host in service.hosts | sort %}
+ - "{{ target_config.get('scheme', 'http') }}://{{ host }}.{{ service_name }}.{{ domain }}:{{ target_config.port }}{{ target_config.metrics_path | default('/metrics') }}"
+{% endfor %}
+ labels:
+ zone: internal
+ probe: health
+ probeset: health
+ prober_float_service: prometheus
+ float_service: "{{ service_name }}"
+ float_job: "{{ service_name }}_{{ target_config.port }}"
+{% endfor %}
+{% endfor %}
+
- job_name: "prober_ping_{{ loop.index }}"
metrics_path: "/probe"
params:
@@ -181,7 +217,7 @@ scrape_configs:
replacement: {{ prober_host }}
static_configs:
- targets:
-{% for host in services['frontend'].hosts|sort %}
+{% for host in services['dns'].hosts|sort %}
- "{{ host }}"
{% endfor %}
labels:
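
Note: in the generated prober jobs above, each static target is the full probe URL; relabeling copies it into __param_target (the ?target= parameter the blackbox exporter expects), reuses it as the instance label, extracts the short hostname into host with the regex shown, and finally rewrites __address__ to point at the prober itself. Applied to one sample URL in Python:

    #!/usr/bin/python3
    # Sketch of the relabel_configs above applied to one static target.
    import re

    target = 'http://host1.log-collector.example.com:9105/metrics'
    labels = {
        'host': re.match(r'https?://([^.:/]*).*', target).group(1),
        '__param_target': target,   # becomes ?target=... on the prober
        'instance': target,
        '__address__': 'prober1.prometheus.example.com:9115',
    }
    print(labels)   # labels['host'] == 'host1'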
diff --git a/float/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/float/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
index 11110d6..c163552 100644
--- a/float/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
+++ b/float/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
@@ -4,8 +4,11 @@ groups:
# HostUnreachable is used as a gate for most other host-based pages
# (via inhibit rules in the alertmanager configuration).
+ # Thanks to min_over_time() the alert stays active for 10 minutes
+ # once the host becomes reachable again, so as to inhibit alerts that
+ # might fire immediately after the transition.
- alert: HostUnreachable
- expr: host_reachable == 0
+ expr: min_over_time(host_reachable[10m]) == 0
for: 1m
labels:
severity: warn
@@ -55,8 +58,8 @@ groups:
redundancy ({{ $value }}) and may eventually be at risk.'
- alert: JobDown
- expr: job:up:ratio < 0.5
- for: 5m
+ expr: job:up:ratio < 0.5 and job:up:count > 1
+ for: 10m
labels:
severity: page
scope: global
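
Note: min_over_time(host_reachable[10m]) keeps HostUnreachable firing for a full ten minutes after the last unreachable sample, so its inhibit rules keep suppressing dependent alerts through the noisy period right after a host comes back. The windowed minimum, sketched in Python with one sample per minute:

    #!/usr/bin/python3
    # Sketch of min_over_time over a 10-minute window: a single 0 sample
    # pins the expression to 0 for the next 10 minutes.
    samples = [1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # per minute
    WINDOW = 10

    for t, s in enumerate(samples):
        window = samples[max(0, t - WINDOW + 1):t + 1]
        print(f't={t:2d}m reachable={s} alert_input={min(window)}')
    # alert_input stays 0 from t=2m through t=11m even though the host
    # recovered at t=3m.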
diff --git a/float/roles/float-infra-prometheus/templates/rules/alerts_elasticsearch.conf.yml b/float/roles/float-infra-prometheus/templates/rules/alerts_elasticsearch.conf.yml
index a75b5c1..a97bd51 100644
--- a/float/roles/float-infra-prometheus/templates/rules/alerts_elasticsearch.conf.yml
+++ b/float/roles/float-infra-prometheus/templates/rules/alerts_elasticsearch.conf.yml
@@ -38,4 +38,5 @@ groups:
annotations:
summary: Logs are not being indexed
description: "The end-to-end log testing system has detected that logs are not reaching the Elasticsearch index. Something must be broken either with Elasticsearch itself, or with the log-collector service (rsyslog)."
+ runbook: '[[ alert_runbook_fmt | format("LogCollectionBroken") ]]'
diff --git a/float/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/float/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index 86fdc16..c7df069 100644
--- a/float/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/float/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -1,27 +1,33 @@
groups:
- name: roles/float-infra-prometheus/templates/rules/rules_base.conf
rules:
+ # Look at prober metrics to assess target state, rather than
+ # using Prometheus' "up" metric. This allows us to take
+ # advantage of a redundant blackbox prober setup and remove
+ # noise caused by prober failures.
- record: job:up:count
- expr: count(up) by (job)
+ expr: label_replace(count(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
- record: job:up:sum
- expr: sum(up) by (job)
+ expr: label_replace(sum(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
- record: job:up:ratio
expr: job:up:sum / job:up:count
- # Sum prober metrics over the probers (hosts), producing
- # an aggregation by target.
+ # Sum prober metrics over the probers (hosts), producing an
+ # aggregation by target. The following rules use 'without' in
+ # order to preserve additional probe_success labels that might
+ # be present.
- record: target:probe_success:count
- expr: count(probe_success) by (probe,probeset,zone,host,prober_float_service)
+ expr: count(probe_success) without (job,instance,prober_host)
- record: target:probe_success:sum
- expr: sum(probe_success) by (probe,probeset,zone,host,prober_float_service)
+ expr: sum(probe_success) without (job,instance,prober_host)
- record: target:probe_success:ratio
expr: target:probe_success:sum / target:probe_success:count
# Sum prober metrics over targets, aggregating by probe.
- record: probe:probe_success:count
- expr: count(probe_success) by (probe,probeset,prober_float_service,zone)
+ expr: count(probe_success) without (job,instance,prober_host,host)
- record: probe:probe_success:sum
- expr: sum(probe_success) by (probe,probeset,prober_float_service,zone)
+ expr: sum(probe_success) without (job,instance,prober_host,host)
- record: probe:probe_success:ratio
expr: probe:probe_success:sum / probe:probe_success:count
diff --git a/float/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml b/float/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml
index abe03a8..fd7f9b6 100644
--- a/float/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml
+++ b/float/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml
@@ -1,13 +1,7 @@
groups:
- name: roles/float-infra-prometheus/templates/rules/rules_services.conf
rules:
- - record: job:total:count
- expr: count(up) by (job)
- - record: job:up:count
- expr: sum(up) by (job)
- - record: job:up:fraction
- expr: job:up:count / job:total:count
-
+ # Service health is determined based on systemd unit state.
- record: node_systemd_unit_ok
expr: sum(node_systemd_unit_state{state="active"}) by (host,name)
@@ -18,7 +12,8 @@ groups:
# By joining the node_systemd_unit_state with svcmap we can find
# float_services that have at least one failing systemd unit.
- # The result metric has both float_service and service labels.
+ # The result metric has both float_service and service (systemd
+ # unit) labels.
- record: service:presence_by_host
expr: label_replace(max(node_systemd_unit_state) by (host,name),"service","$1","name","(.*)") * on (service,host) group_right global:svcmap
diff --git a/float/roles/float-infra-sso-server/templates/server.yml.j2 b/float/roles/float-infra-sso-server/templates/server.yml.j2
index 17b16a3..780611f 100644
--- a/float/roles/float-infra-sso-server/templates/server.yml.j2
+++ b/float/roles/float-infra-sso-server/templates/server.yml.j2
@@ -29,9 +29,8 @@ allowed_cors_origins: {{ sso_allowed_cors_origins | to_json }}
allowed_exchanges: {{ sso_allowed_exchanges | to_json }}
service_ttls: {{ sso_service_ttls | to_json }}
auth_session_lifetime: 43200
-session_secrets:
- - "{{ sso_session_auth_secret }}"
- - "{{ sso_session_enc_secret }}"
+session_auth_key: "{{ sso_session_auth_secret }}"
+session_enc_key: "{{ sso_session_enc_secret }}"
csrf_secret: "{{ sso_csrf_secret }}"
auth_service: sso
{% if enable_keystore %}
@@ -47,13 +46,16 @@ keystore_enable_groups:
{% endif %}
url_path_prefix: "{{ sso_server_url_path_prefix }}"
account_recovery_url: "{{ sso_server_account_recovery_url | default('') }}"
+default_signed_in_redirect: "{{ sso_server_default_signed_in_redirect | default('') }}"
device_manager:
auth_key: "{{ sso_device_manager_auth_secret }}"
trusted_forwarders:
- 127.0.0.1
- ::1
{% for h in services['frontend'].hosts|sort %}
- - {{ hostvars[h]['ip'] }}
+{% for ip in hostvars[h]['ips'] %}
+ - {{ ip }}
+{% endfor %}
{% for n in net_overlays %}{% if hostvars[h].get('ip_' + n.name) %}
- {{ hostvars[h]['ip_' + n.name] }}
{% endif %}{% endfor %}
diff --git a/float/roles/float-util-geoip-dataset/tasks/main.yml b/float/roles/float-util-geoip-dataset/tasks/main.yml
index fac8c66..7d6c87d 100644
--- a/float/roles/float-util-geoip-dataset/tasks/main.yml
+++ b/float/roles/float-util-geoip-dataset/tasks/main.yml
@@ -14,5 +14,5 @@
- name: Setup geoipupdate cron job
copy:
dest: "/etc/cron.d/geoipupdate_{{ item }}"
- content: "12 0 * * 5 root /usr/local/bin/splay 3600 && /usr/bin/geoipupdate -f /etc/GeoIP_{{ item }}.conf"
+ content: "12 0 * * 5 root /usr/local/bin/splay 3600 && /usr/bin/geoipupdate -f /etc/GeoIP_{{ item }}.conf\n"
with_items: "{{ geoip_dataset }}"
diff --git a/float/scripts/floatup.py b/float/scripts/floatup.py
index 5d1e890..adaa640 100755
--- a/float/scripts/floatup.py
+++ b/float/scripts/floatup.py
@@ -132,7 +132,7 @@ def main():
req = parse_inventory(args.inventory, host_attrs)
req['ttl'] = args.ttl
- print('creating VM group with attrs %r ...', host_attrs)
+ print('creating VM group with attrs %s ...' % host_attrs)
resp = do_request(args.url + '/api/create-group', args.ssh, req)
group_id = resp['group_id']
with open(args.state_file, 'w') as fd:
diff --git a/float/services.prometheus-lts.yml b/float/services.prometheus-lts.yml
index 49d4ef8..c488aee 100644
--- a/float/services.prometheus-lts.yml
+++ b/float/services.prometheus-lts.yml
@@ -32,11 +32,10 @@ prometheus-lts:
scheme: http
enable_sso_proxy: true
monitoring_endpoints:
- - job_name: prometheus-lts
- port: 9099
+ - port: 9099
scheme: http
- - job_name: thanos-sidecar-lts
- port: 10912
+ healthcheck_http_method: OPTIONS
+ - port: 10912
scheme: http
ports:
- 10911
diff --git a/float/services.yml.default b/float/services.yml.default
index fedfbe2..bb6f715 100644
--- a/float/services.yml.default
+++ b/float/services.yml.default
@@ -1,28 +1,7 @@
---
-frontend:
- scheduling_group: frontend
- service_credentials:
- - name: nginx
- enable_server: false
- - name: ssoproxy
- enable_server: false
- - name: replds-acme
- systemd_services:
- - nginx.service
- - sso-proxy.service
- - bind9.service
- - replds@acme.service
- ports:
- - 5005
- monitoring_endpoints:
- - name: bind
- port: 9119
- scheme: http
- volumes:
- - name: cache
- path: /var/cache/nginx
- size: 20g
+include:
+ - "services.yml.no-elasticsearch"
reports-collector:
scheduling_group: frontend
@@ -42,8 +21,7 @@ reports-collector:
port: 3995
scheme: http
monitoring_endpoints:
- - job_name: reports-collector
- port: 3995
+ - port: 3995
scheme: http
ports:
- 3996
@@ -55,11 +33,9 @@ log-collector:
- name: log-collector
enable_client: false
monitoring_endpoints:
- - job_name: rsyslog-collector
- port: 9105
+ - port: 9105
scheme: http
- - job_name: elasticsearch
- port: 9201
+ - port: 9201
scheme: http
public_endpoints:
- name: logs
@@ -80,7 +56,7 @@ log-collector:
- /var/log/remote: /var/log/remote
- name: kibana
image: registry.git.autistici.org/ai3/docker/kibana:master
- port: 5061
+ port: 5601
volumes:
- /etc/kibana: /etc/kibana
- /var/lib/kibana: /var/lib/kibana
@@ -106,6 +82,12 @@ log-collector:
owner: docker-log-collector
group: docker-log-collector
mode: "0700"
+ annotations:
+ dependencies:
+ - client: kibana
+ server: elasticsearch
+ - client: log-collector-e2e/prober
+ server: elasticsearch
log-collector-e2e:
scheduling_group: all
@@ -120,218 +102,3 @@ log-collector-e2e:
port: 7094
scheme: http
-prometheus:
- scheduling_group: backend
- num_instances: 1
- service_credentials:
- - { name: prometheus }
- containers:
- - name: prometheus
- image: registry.git.autistici.org/ai3/docker/prometheus:master
- port: 9090
- volumes:
- - /etc/prometheus: /etc/prometheus
- - /var/lib/prometheus/metrics2: /var/lib/prometheus/metrics2
- args: "--storage.tsdb.retention.time={{ prometheus_tsdb_retention | default('90d') }} --web.external-url=https://monitor.{{ domain_public[0] }} --web.enable-lifecycle --query.max-samples={{ prometheus_max_samples | default('5000000') }}"
- - name: alertmanager
- image: registry.git.autistici.org/ai3/docker/prometheus-alertmanager:master
- ports:
- - 9093
- - 9094
- volumes:
- - /etc/prometheus: /etc/prometheus
- - /var/lib/prometheus/alertmanager: /var/lib/prometheus/alertmanager
- args: "--web.external-url=https://alertmanager.{{ domain_public[0] }} --cluster.listen-address=:9094 --cluster.advertise-address={{ float_host_dns_map.get(inventory_hostname + '.prometheus', ['']) | list | first }}:9094{% for h in groups['prometheus']|sort if h != inventory_hostname %} --cluster.peer={{ h }}.prometheus.{{ domain }}:9094{% endfor %}"
- - name: blackbox
- image: registry.git.autistici.org/ai3/docker/prometheus-blackbox:master
- ports:
- - 9115
- volumes:
- - /etc/prometheus: /etc/prometheus
- args: "--config.file /etc/prometheus/blackbox.yml"
- docker_options: "--cap-add=NET_RAW"
- drop_capabilities: false
- - name: grafana
- image: registry.git.autistici.org/ai3/docker/grafana:master
- port: 2929
- volumes:
- - /etc/grafana: /etc/grafana
- - /var/lib/grafana: /var/lib/grafana
- - name: thanos
- image: registry.git.autistici.org/ai3/docker/thanos:master
- ports:
- - 10901 # sidecar grpc
- - 10902 # sidecar http
- - 10903 # query grpc
- - 10904 # query http
- - 10905 # query-frontend grpc
- - 10906 # query-frontend http
- resources:
- ram: "1G"
- env:
- QUERY_FLAGS: "--query.replica-label=monitor {% for h in groups['prometheus']|sort %} --store={{ h }}.prometheus.{{ domain }}:10901{% endfor %}"
- SIDECAR_FLAGS: ""
- QUERY_FRONTEND_FLAGS: "--query-range.response-cache-config-file=/etc/thanos/query-frontend-cache.yml"
- volumes:
- - /etc/thanos: /etc/thanos
- - name: karma
- image: registry.git.autistici.org/ai3/docker/karma:master
- ports:
- - 9193
- env:
- # https://github.com/prymitive/karma/blob/master/docs/CONFIGURATION.md#environment-variables
- CONFIG_FILE: "/etc/karma/float.yml"
- PORT: 9193
- volumes:
- - /etc/karma: /etc/karma
- public_endpoints:
- - name: monitor
- port: 9090
- scheme: http
- enable_sso_proxy: true
- - name: prober
- port: 9115
- scheme: http
- enable_sso_proxy: true
- - name: grafana
- port: 2929
- scheme: https
- enable_sso_proxy: true
- - name: thanos
- port: 10906
- scheme: http
- enable_sso_proxy: true
- - name: alerts
- port: 9193
- scheme: http
- enable_sso_proxy: true
- monitoring_endpoints:
- - job_name: prometheus
- port: 9090
- scheme: http
- - job_name: alertmanager
- port: 9093
- scheme: http
- - job_name: karma
- port: 9193
- scheme: http
- - job_name: grafana
- port: 2929
- scheme: https
- - job_name: thanos-query
- port: 10904
- scheme: http
- - job_name: thanos-sidecar
- port: 10902
- scheme: http
- - job_name: thanos-query-frontend
- port: 10906
- scheme: http
- ports:
- - 9094
- - 10901
- volumes:
- - name: metrics
- path: /var/lib/prometheus
- owner: docker-prometheus
- group: docker-prometheus
- mode: "0755"
-
-sso-server:
- num_instances: 1
- scheduling_group: backend
- service_credentials:
- - name: sso-server
- enable_server: false
- public_endpoints:
- - name: login
- port: 5002
- scheme: http
- monitoring_endpoints:
- - job_name: sso-server
- port: 5002
- scheme: http
-
-auth-cache:
- scheduling_group: backend
- containers:
- - name: memcache
- image: registry.git.autistici.org/ai3/docker/memcached:master
- port: 11212
- env:
- PORT: "11212"
- ports:
- - 11212
-
-user-meta-server:
- num_instances: 1
- scheduling_group: backend
- service_credentials:
- - name: user-meta-server
- monitoring_endpoints:
- - job_name: user-meta-server
- port: 5505
- scheme: https
- ports:
- - 5505
- systemd_services:
- - user-meta-server.service
- datasets:
- - name: db
- path: /var/lib/user-meta-server
- owner: user-meta-server
-
-admin-dashboard:
- scheduling_group: frontend
- service_credentials:
- - name: admin-dashboard
- containers:
- - name: http
- image: registry.git.autistici.org/ai3/tools/float-dashboard:refactor
- port: 8011
- volumes:
- - /etc/float: /etc/float
- env:
- ADDR: ":8011"
- DOMAIN: "{{ domain_public[0] }}"
- public_endpoints:
- - name: admin
- port: 8011
- scheme: http
- enable_sso_proxy: true
-
-backup-metadata:
- num_instances: 1
- scheduling_group: backend
- service_credentials:
- - name: backup-metadata
- enable_client: false
- monitoring_endpoints:
- - job_name: backup-metadata
- port: 5332
- scheme: https
- public_endpoints:
- - name: backups
- port: 5332
- scheme: https
- enable_sso_proxy: true
- ports:
- - 5332
- systemd_services:
- - tabacco-metadb.service
-
-acme:
- num_instances: 1
- scheduling_group: frontend
- service_credentials:
- - name: acme
- enable_server: false
- monitoring_endpoints:
- - job_name: acme
- port: 5004
- scheme: http
- ports:
- - 5004
- systemd_services:
- - acmeserver.service
-
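services.yml.default shrinks to an include of services.yml.no-elasticsearch plus only the Elasticsearch-specific definitions; frontend, prometheus, sso-server, auth-cache, user-meta-server, admin-dashboard, backup-metadata and acme now live solely in the included file. A minimal sketch of include semantics, assuming included files are loaded first and local definitions overlay them (the actual merge logic lives in float's config loader):

    import os
    import yaml

    def load_services(path):
        with open(path) as f:
            data = yaml.safe_load(f) or {}
        merged = {}
        for inc in data.pop('include', []):
            # Assumption: includes resolve relative to the including file.
            merged.update(load_services(os.path.join(os.path.dirname(path), inc)))
        merged.update(data)  # local service definitions win
        return merged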
diff --git a/float/services.yml.no-elasticsearch b/float/services.yml.no-elasticsearch
index 005f64b..229994f 100644
--- a/float/services.yml.no-elasticsearch
+++ b/float/services.yml.no-elasticsearch
@@ -11,19 +11,23 @@ frontend:
systemd_services:
- nginx.service
- sso-proxy.service
- - bind9.service
- replds@acme.service
ports:
- 5005
- monitoring_endpoints:
- - name: bind
- port: 9119
- scheme: http
volumes:
- name: cache
path: /var/cache/nginx
size: 20g
+dns:
+ scheduling_group: frontend
+ systemd_services:
+ - bind9.service
+ monitoring_endpoints:
+ - name: bind
+ port: 9119
+ scheme: http
+
log-collector:
scheduling_group: backend
num_instances: 1
@@ -31,8 +35,7 @@ log-collector:
- name: log-collector
enable_client: false
monitoring_endpoints:
- - job_name: rsyslog-collector
- port: 9105
+ - port: 9105
scheme: http
containers:
- name: rsyslog
@@ -70,6 +73,7 @@ prometheus:
- /etc/prometheus: /etc/prometheus
- /var/lib/prometheus/alertmanager: /var/lib/prometheus/alertmanager
args: "--web.external-url=https://alertmanager.{{ domain_public[0] }} --cluster.listen-address=:9094 --cluster.advertise-address={{ float_host_dns_map.get(inventory_hostname + '.prometheus', ['']) | list | first }}:9094{% for h in groups['prometheus']|sort if h != inventory_hostname %} --cluster.peer={{ h }}.prometheus.{{ domain }}:9094{% endfor %}"
+ egress_policy: internal
- name: blackbox
image: registry.git.autistici.org/ai3/docker/prometheus-blackbox:master
ports:
@@ -85,6 +89,7 @@ prometheus:
volumes:
- /etc/grafana: /etc/grafana
- /var/lib/grafana: /var/lib/grafana
+ egress_policy: internal
- name: thanos
image: registry.git.autistici.org/ai3/docker/thanos:master
ports:
@@ -102,6 +107,7 @@ prometheus:
QUERY_FRONTEND_FLAGS: "--query-range.response-cache-config-file=/etc/thanos/query-frontend-cache.yml"
volumes:
- /etc/thanos: /etc/thanos
+ egress_policy: internal
- name: karma
image: registry.git.autistici.org/ai3/docker/karma:master
ports:
@@ -112,6 +118,7 @@ prometheus:
PORT: 9193
volumes:
- /etc/karma: /etc/karma
+ egress_policy: internal
public_endpoints:
- name: monitor
port: 9090
@@ -134,26 +141,22 @@ prometheus:
scheme: http
enable_sso_proxy: true
monitoring_endpoints:
- - job_name: prometheus
- port: 9090
+ - port: 9090
scheme: http
- - job_name: alertmanager
- port: 9093
+ healthcheck_http_method: OPTIONS
+ - port: 9093
scheme: http
- - job_name: karma
- port: 9193
+ healthcheck_http_method: OPTIONS
+ - port: 9193
scheme: http
- - job_name: grafana
- port: 2929
+ healthcheck_http_method: GET
+ - port: 2929
scheme: https
- - job_name: thanos-query
- port: 10904
+ - port: 10904
scheme: http
- - job_name: thanos-sidecar
- port: 10902
+ - port: 10902
scheme: http
- - job_name: thanos-query-frontend
- port: 10906
+ - port: 10906
scheme: http
ports:
- 9094
@@ -164,6 +167,14 @@ prometheus:
owner: docker-prometheus
group: docker-prometheus
mode: "0755"
+ annotations:
+ dependencies:
+ - client: prometheus
+ server: alertmanager
+ - client: karma
+ server: alertmanager
+ - client: thanos
+ server: prometheus
sso-server:
num_instances: 1
@@ -176,9 +187,14 @@ sso-server:
port: 5002
scheme: http
monitoring_endpoints:
- - job_name: sso-server
- port: 5002
+ - port: 5002
scheme: http
+ systemd_services:
+ - sso-server.service
+ annotations:
+ dependencies:
+ - client: sso-server
+ server: user-meta-server/user-meta-server
auth-cache:
scheduling_group: backend
@@ -188,6 +204,7 @@ auth-cache:
port: 11212
env:
PORT: "11212"
+ egress_policy: internal
ports:
- 11212
@@ -197,8 +214,7 @@ user-meta-server:
service_credentials:
- name: user-meta-server
monitoring_endpoints:
- - job_name: user-meta-server
- port: 5505
+ - port: 5505
scheme: https
ports:
- 5505
@@ -206,7 +222,9 @@ user-meta-server:
- user-meta-server.service
datasets:
- name: db
+ type: litestream
path: /var/lib/user-meta-server
+ filename: usermeta.db
owner: user-meta-server
admin-dashboard:
@@ -215,13 +233,14 @@ admin-dashboard:
- name: admin-dashboard
containers:
- name: http
- image: registry.git.autistici.org/ai3/tools/float-dashboard:refactor
+ image: registry.git.autistici.org/ai3/tools/float-dashboard:master
port: 8011
volumes:
- /etc/float: /etc/float
env:
ADDR: ":8011"
DOMAIN: "{{ domain_public[0] }}"
+ egress_policy: internal
public_endpoints:
- name: admin
port: 8011
@@ -235,8 +254,7 @@ backup-metadata:
- name: backup-metadata
enable_client: false
monitoring_endpoints:
- - job_name: backup-metadata
- port: 5332
+ - port: 5332
scheme: https
public_endpoints:
- name: backups
@@ -247,6 +265,12 @@ backup-metadata:
- 5332
systemd_services:
- tabacco-metadb.service
+ datasets:
+ - name: db
+ type: litestream
+ path: /var/lib/tabacco-metadb
+ filename: meta.db
+ owner: backup-metadata
acme:
num_instances: 1
@@ -255,11 +279,36 @@ acme:
- name: acme
enable_server: false
monitoring_endpoints:
- - job_name: acme
- port: 5004
+ - port: 5004
scheme: http
ports:
- 5004
systemd_services:
- acmeserver.service
+assets:
+ num_instances: 1
+ scheduling_group: backend
+ service_credentials:
+ - name: assetmon
+ containers:
+ - name: http
+ image: registry.git.autistici.org/ai3/tools/assetmon:master
+ volumes:
+ - /etc/assetmon/server.yml: /etc/assetmon/server.yml
+ - /var/lib/assetmon: /var/lib/assetmon
+ ports:
+ - 3798
+ egress_policy: internal
+ monitoring_endpoints:
+ - port: 3798
+ scheme: https
+ public_endpoints:
+ - name: assets
+ port: 3798
+ scheme: https
+ enable_sso_proxy: true
+ datasets:
+ - name: db
+ path: /var/lib/assetmon
+ owner: docker-assets
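Besides the dns split and the job_name cleanup, this file gains three things: egress_policy: internal on containers with no need to reach beyond the overlay network, annotations.dependencies edges recording which client talks to which server, and litestream-typed datasets that name the SQLite file to replicate. A sketch of the new dataset shape, mirroring the user-meta-server entry above:

    import yaml

    ds = yaml.safe_load("""
    datasets:
      - name: db
        type: litestream
        path: /var/lib/user-meta-server
        filename: usermeta.db
        owner: user-meta-server
    """)['datasets'][0]
    # Only litestream datasets carry a filename; entries without a type
    # (like the new assets db) presumably keep the default backup path.
    assert ds['type'] == 'litestream' and ds['filename'].endswith('.db')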
diff --git a/float/test/backup.ref/config-backup.yml b/float/test/backup.ref/config-backup.yml
new file mode 100644
index 0000000..380578b
--- /dev/null
+++ b/float/test/backup.ref/config-backup.yml
@@ -0,0 +1,9 @@
+---
+backup_litestream_config:
+ type: s3
+ endpoint: "http://backup:9000/"
+ bucket: "backuptest"
+backup_litestream_credentials:
+ LITESTREAM_ACCESS_KEY_ID: "minio"
+ LITESTREAM_SECRET_ACCESS_KEY: "miniopassword"
+
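The new backup fixture points litestream at an in-cluster S3 endpoint; LITESTREAM_ACCESS_KEY_ID and LITESTREAM_SECRET_ACCESS_KEY are the environment variables litestream reads for S3 credentials. The endpoint host and bucket line up with the minio 'backup' service and the bucket-creation step defined a few files below; a sketch of the pieces, values copied from this diff:

    # The endpoint hostname is the float service name of the minio
    # container; the credentials match its MINIO_ROOT_* settings.
    config = {'type': 's3', 'endpoint': 'http://backup:9000/', 'bucket': 'backuptest'}
    env = {'LITESTREAM_ACCESS_KEY_ID': 'minio',
           'LITESTREAM_SECRET_ACCESS_KEY': 'miniopassword'}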
diff --git a/float/test/backup.ref/passwords.yml b/float/test/backup.ref/passwords.yml
new file mode 100644
index 0000000..7bdaf7c
--- /dev/null
+++ b/float/test/backup.ref/passwords.yml
@@ -0,0 +1,2 @@
+---
+- include: ../../passwords.yml.default
diff --git a/float/test/backup.ref/services.yml b/float/test/backup.ref/services.yml
new file mode 100644
index 0000000..8db1002
--- /dev/null
+++ b/float/test/backup.ref/services.yml
@@ -0,0 +1,43 @@
+---
+
+include:
+ - "../../services.yml.no-elasticsearch"
+
+ok:
+ scheduling_group: backend
+ containers:
+ - name: http
+ image: registry.git.autistici.org/ai3/docker/okserver:latest
+ port: 3100
+ env:
+ PORT: 3100
+ resources:
+ ram: 1g
+ cpu: 0.5
+ public_endpoints:
+ - name: ok
+ port: 3100
+ scheme: http
+
+backup:
+ scheduling_group: backend
+ num_instances: 1
+ containers:
+ - name: s3
+ image: quay.io/minio/minio
+ port: 9000
+ env:
+ HOME: /data
+ MINIO_ROOT_USER: minio
+ MINIO_ROOT_PASSWORD: miniopassword
+ args: "server /data --console-address :9001"
+ volumes:
+ - /var/lib/backup: /data
+ ports:
+ - 9000
+ volumes:
+ - name: backup
+ path: /var/lib/backup
+ owner: docker-backup
+ size: 2g
+
diff --git a/float/test/backup.ref/site.yml b/float/test/backup.ref/site.yml
new file mode 100644
index 0000000..601f945
--- /dev/null
+++ b/float/test/backup.ref/site.yml
@@ -0,0 +1,10 @@
+---
+
+- import_playbook: "../../playbooks/all.yml"
+
+- hosts: backup
+ tasks:
+ - name: Create the test bucket
+ run_once: true
+ command: "podman run --env MC_HOST_local=http://minio:miniopassword@backup:9000 --network host --rm quay.io/minio/mc mb local/backuptest"
+
diff --git a/float/test/base.ref/services.yml b/float/test/base.ref/services.yml
index afe062f..0ed283e 100644
--- a/float/test/base.ref/services.yml
+++ b/float/test/base.ref/services.yml
@@ -11,11 +11,19 @@ frontend:
systemd_services:
- nginx.service
- sso-proxy.service
- - bind9.service
- replds@acme.service
ports:
- 5005
+dns:
+ scheduling_group: frontend
+ systemd_services:
+ - bind9.service
+ monitoring_endpoints:
+ - name: bind
+ port: 9119
+ scheme: http
+
ok:
scheduling_group: all
num_instances: 1
diff --git a/float/test/float_integration_test/__init__.py b/float/test/float_integration_test/__init__.py
index f3ff2e5..feb09c0 100644
--- a/float/test/float_integration_test/__init__.py
+++ b/float/test/float_integration_test/__init__.py
@@ -1,6 +1,8 @@
+import json
import os
import random
import unittest
+from urllib.parse import urlencode
import yaml
import jinja2
@@ -50,11 +52,11 @@ class TestBase(unittest.TestCase):
def frontend_ip(self):
"""Return a random IP for the 'frontend' group."""
host = random.choice(hosts_in_group('frontend'))
- return ANSIBLE_VARS['hostvars'][host]['ip']
+ return ANSIBLE_VARS['hostvars'][host]['ips'][0]
def all_frontend_ips(self):
"""Return all IPs in the 'frontend' group."""
- return [ANSIBLE_VARS['hostvars'][x]['ip']
+ return [ANSIBLE_VARS['hostvars'][x]['ips'][0]
for x in hosts_in_group('frontend')]
def sso_conversation(self, sso_username=None, sso_password=None):
@@ -69,3 +71,44 @@ class TestBase(unittest.TestCase):
sso_password=sso_password,
login_server=url,
)
+
+
+class PrometheusTestBase(TestBase):
+
+ def setUp(self):
+ super().setUp()
+ if 'prometheus' not in ANSIBLE_VARS['services']:
+ self.skipTest('monitoring not enabled')
+ self.prometheus_url = 'https://monitor.%s' % (
+ ANSIBLE_VARS['domain_public'][0],)
+
+ def eval_prometheus_expr(self, expr):
+ c = self.sso_conversation()
+ uri = '%s/api/v1/query?%s' % (
+ self.prometheus_url, urlencode({'query': expr}))
+ resp = c.request(uri, self.frontend_ip())
+ self.assertFalse(
+ 'error' in resp,
+ 'Request failed with error: %s' % resp.get('error'))
+ self.assertEqual(200, resp['status'])
+ result = json.loads(resp['body'])
+ self.assertEqual('success', result['status'],
+ 'Prometheus error: %s' % json.dumps(result))
+ return result['data']['result']
+
+
+class URLTestBase(TestBase):
+
+ UNKNOWN_DOMAIN_MSG = b'You have reached this page because your request could not be properly identified'
+
+ def assert_endpoint_ok(self, public_endpoint_name, auth=False):
+ c = self.sso_conversation()
+ url = 'https://%s.%s/' % (
+ public_endpoint_name, ANSIBLE_VARS['domain_public'][0])
+ result = c.request(url, self.frontend_ip())
+ self.assertFalse(result.get('error'), f'url={url}')
+ self.assertEqual(200, result['status'], f'url={url}')
+ self.assertFalse(
+ self.UNKNOWN_DOMAIN_MSG in result['body'],
+ f'The server returned the generic "unknown domain" page for {url}')
+ self.assertEqual(auth, c.auth_requested)
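The URL and Prometheus helpers move out of test_system.py into the shared package so any test module can reuse them. A minimal sketch of a consumer, assuming a hypothetical test case; eval_prometheus_expr returns the decoded result list from the Prometheus query API:

    from float_integration_test import PrometheusTestBase

    class TestNoFailedUnits(PrometheusTestBase):  # hypothetical test case

        def test_no_failed_units(self):
            # An empty result means no systemd unit reports the
            # 'failed' state on any host.
            result = self.eval_prometheus_expr(
                'node_systemd_unit_state{state="failed"} > 0')
            self.assertEqual([], result)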
diff --git a/float/test/float_integration_test/http.py b/float/test/float_integration_test/http.py
index 0a7dd8e..261aceb 100644
--- a/float/test/float_integration_test/http.py
+++ b/float/test/float_integration_test/http.py
@@ -46,9 +46,6 @@ class SSOHandler(urllib.request.BaseHandler):
form[name] = value
return form
- def _extract_error(self, html):
- return self._error_pattern.search(html)
-
def https_response(self, req, resp):
request_url = req.get_full_url()
if resp.code == 200 and request_url.startswith(self._login_form_url):
diff --git a/float/test/float_integration_test/test_system.py b/float/test/float_integration_test/test_system.py
index 114eac3..d07823c 100644
--- a/float/test/float_integration_test/test_system.py
+++ b/float/test/float_integration_test/test_system.py
@@ -2,25 +2,9 @@ import json
import time
import unittest
from dns.resolver import Resolver
-from float_integration_test import TestBase, ANSIBLE_VARS
-
-UNKNOWN_DOMAIN_MSG = b'You have reached this page because your request could not be properly identified'
-
-
-class URLTestBase(TestBase):
-
- def _assert_endpoint_ok(self, public_endpoint_name, auth=False):
- c = self.sso_conversation()
- url = 'https://%s.%s/' % (
- public_endpoint_name, ANSIBLE_VARS['domain_public'][0])
- result = c.request(url, self.frontend_ip())
- self.assertFalse(result.get('error'), f'url={url}')
- self.assertEqual(200, result['status'], f'url={url}')
- self.assertFalse(
- UNKNOWN_DOMAIN_MSG in result['body'],
- f'The server returned the generic "unknown domain" page for {url}')
- self.assertEqual(auth, c.auth_requested)
+from float_integration_test import TestBase, PrometheusTestBase, URLTestBase, \
+ ANSIBLE_VARS
class TestHTTPRouter(URLTestBase):
@@ -32,7 +16,7 @@ class TestHTTPRouter(URLTestBase):
self.frontend_ip())
self.assertFalse(result.get('error'))
self.assertEqual(200, result['status'])
- self.assertTrue(UNKNOWN_DOMAIN_MSG in result['body'])
+ self.assertTrue(self.UNKNOWN_DOMAIN_MSG in result['body'])
class TestDNS(TestBase):
@@ -63,35 +47,35 @@ class TestBuiltinServiceURLs(URLTestBase):
"""
- def _assert_endpoint_ok_if_enabled(self, service_name,
- public_endpoint_name,
- auth=False):
+ def assert_endpoint_ok_if_enabled(self, service_name,
+ public_endpoint_name,
+ auth=False):
if service_name not in ANSIBLE_VARS['services']:
self.skipTest('service %s not enabled' % service_name)
- self._assert_endpoint_ok(public_endpoint_name, auth)
+ self.assert_endpoint_ok(public_endpoint_name, auth)
def test_okserver(self):
- self._assert_endpoint_ok_if_enabled('ok', 'ok')
+ self.assert_endpoint_ok_if_enabled('ok', 'ok')
def test_admin_dashboard(self):
- self._assert_endpoint_ok_if_enabled('admin-dashboard', 'admin', True)
+ self.assert_endpoint_ok_if_enabled('admin-dashboard', 'admin', True)
def test_monitor(self):
- self._assert_endpoint_ok_if_enabled('prometheus', 'monitor', True)
+ self.assert_endpoint_ok_if_enabled('prometheus', 'monitor', True)
def test_alertmanager(self):
- self._assert_endpoint_ok_if_enabled('prometheus', 'alerts', True)
+ self.assert_endpoint_ok_if_enabled('prometheus', 'alerts', True)
def test_grafana(self):
- self._assert_endpoint_ok_if_enabled('prometheus', 'grafana', True)
+ self.assert_endpoint_ok_if_enabled('prometheus', 'grafana', True)
def test_thanos(self):
- self._assert_endpoint_ok_if_enabled('prometheus', 'thanos', True)
+ self.assert_endpoint_ok_if_enabled('prometheus', 'thanos', True)
def test_kibana(self):
if not ANSIBLE_VARS.get('enable_elasticsearch', True):
self.skipTest('Elasticsearch is disabled')
- self._assert_endpoint_ok_if_enabled('log-collector', 'logs', True)
+ self.assert_endpoint_ok_if_enabled('log-collector', 'logs', True)
def _alert_to_string(metric):
@@ -114,15 +98,13 @@ def _alert_to_string(metric):
return o
-class TestSystem(TestBase):
+class TestSystem(PrometheusTestBase):
"""Check functionality at the system level."""
# Alerts that will be ignored.
WHITELISTED_ALERTS = ['DiskWillFillIn4Hours']
def test_no_firing_alerts(self):
- if 'prometheus' not in ANSIBLE_VARS['services']:
- self.skipTest('monitoring not enabled')
firing_alerts = None
for i in range(5):
try:
@@ -139,25 +121,26 @@ class TestSystem(TestBase):
', '.join(firing_alerts),))
def _get_firing_alerts(self):
- c = self.sso_conversation()
- alerts_uri = 'https://monitor.%s/api/v1/query?query=ALERTS' % (
- ANSIBLE_VARS['domain_public'][0],)
- result = c.request(alerts_uri, self.frontend_ip())
- self.assertFalse(
- 'error' in result,
- 'Request failed with error: %s' % result.get('error'))
- self.assertEqual(200, result['status'])
- response = json.loads(result['body'])
- self.assertEqual('success', response['status'])
- print(json.dumps(response['data'], indent=4))
- firing_alerts = [
+ alerts = self.eval_prometheus_expr('ALERTS')
+ print(json.dumps(alerts, indent=4))
+ return [
_alert_to_string(x['metric'])
- for x in response['data']['result']
+ for x in alerts
if (x['metric']['alertstate'] == 'firing' and
x['metric']['severity'] == 'page' and
x['metric']['alertname'] not in self.WHITELISTED_ALERTS)
]
- return firing_alerts
+
+
+class TestPrometheusMetrics(PrometheusTestBase):
+
+ def test_all_targets_are_reachable(self):
+ result = self.eval_prometheus_expr('up < 1')
+ self.assertEqual([], result)
+
+ def test_all_targets_are_being_scraped(self):
+ result = self.eval_prometheus_expr('scrape_samples_scraped == 0')
+ self.assertEqual([], result)
if __name__ == '__main__':
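The two new checks lean on PromQL comparison semantics: operators like < filter the instant vector instead of returning booleans, so 'up < 1' returns only the series for unreachable targets and an empty result means every scrape target is healthy. A toy analogue with hypothetical values:

    # 'up' is 1 for a reachable scrape target, 0 otherwise.
    up = {'front1:9100': 1, 'back1:9100': 0}        # hypothetical targets
    down = {t: v for t, v in up.items() if v < 1}   # analogue of 'up < 1'
    assert down == {'back1:9100': 0}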
diff --git a/float/test/full.ref/services.yml b/float/test/full.ref/services.yml
index 939a878..893079d 100644
--- a/float/test/full.ref/services.yml
+++ b/float/test/full.ref/services.yml
@@ -19,4 +19,19 @@ ok:
port: 3100
scheme: http
-
+ok-root:
+ scheduling_group: backend
+ containers:
+ - name: http
+ image: registry.git.autistici.org/ai3/docker/okserver:latest
+ root: true
+ port: 799
+ env:
+ PORT: 799
+ resources:
+ ram: 1g
+ cpu: 0.5
+ public_endpoints:
+ - name: ok-root
+ port: 799
+ scheme: http
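The new ok-root fixture appears to exist to exercise the root container path: with root: true the okserver binds port 799, below the 1024 threshold that unprivileged processes cannot normally bind. A sketch of the invariant the fixture implies, values taken from the diff:

    import yaml

    c = yaml.safe_load("""
    containers:
      - name: http
        root: true
        port: 799
    """)['containers'][0]
    # Ports below 1024 normally require privileges, hence root: true.
    assert c['root'] and c['port'] < 1024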