Documentation and playbook for recovering control plane from node failure (#4146)

This commit is contained in:
qvicksilver
2019-04-29 10:40:20 +02:00
committed by Kubernetes Prow Robot
parent 9335cdcebc
commit 48a182844c
9 changed files with 287 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
---
# True when the `etcd` and `kube-master` inventory groups contain exactly the
# same hosts (order-insensitive). Each side is parenthesised because Jinja2
# filters bind tighter than `==`: a trailing `| bool` after the right-hand
# `sort` would convert that list, not the comparison result.
control_plane_is_converged: "{{ (groups['etcd'] | sort) == (groups['kube-master'] | sort) }}"

View File

@@ -0,0 +1,36 @@
---
# Probe for etcdctl under bin_dir; only the exit status matters and it is
# read from `test_etcdctl` by the following task.
# `test -e` exits non-zero when the file is missing. That is an expected
# answer, not an error, so the task must neither fail the play nor report a
# change. It is read-only, so it is also allowed to run in check mode.
- name: Check for etcdctl binary
  raw: "test -e {{ bin_dir }}/etcdctl"
  register: test_etcdctl
  failed_when: false
  changed_when: false
  check_mode: false
# Turn the probe's exit status into a boolean fact.
# The comparison is parenthesised so that `| bool` applies to its result and
# not to the literal 0 (filters bind tighter than `==`). `rc` is defaulted to
# a non-zero value so a probe result without `rc` (e.g. a skipped task) is
# read as "etcdctl not present" instead of raising an undefined-variable error.
- name: Set has_etcdctl fact
  set_fact:
    has_etcdctl: "{{ ((test_etcdctl.rc | default(1)) == 0) | bool }}"
# Ask etcdctl for the cluster health and reduce the answer to an exit status:
# `grep -q` exits 0 only when the output contains "cluster is healthy".
# The command is read-only, so it runs in check mode too and never reports a
# change. A non-zero status is tolerated here because it is interpreted by the
# task that reads `etcd_cluster_health`.
- name: Check if etcd cluster is healthy
  when: has_etcdctl
  environment:
    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  shell: >-
    {{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health
    | grep -q 'cluster is healthy'
  register: etcd_cluster_health
  check_mode: false
  changed_when: false
  ignore_errors: true
# Derive a boolean fact from the health check's exit status.
# The health check is skipped when `has_etcdctl` is false, and a skipped task
# registers no `rc`. Defaulting to a non-zero value treats "could not check"
# as "not healthy" instead of failing on an undefined attribute. The
# comparison is parenthesised so `| bool` applies to its result, not to 0.
- name: Set etcd_cluster_is_healthy fact
  set_fact:
    etcd_cluster_is_healthy: "{{ ((etcd_cluster_health.rc | default(1)) == 0) | bool }}"
# When the cluster reports healthy, refuse to continue unless the operator
# has supplied `old_etcd_members`.
# `that` already evaluates its entries as Jinja2 expressions, so the condition
# is written bare; wrapping it in "{{ }}" would template it twice and makes
# Ansible emit a warning.
- name: Abort if etcd cluster is healthy and old_etcd_members is undefined
  assert:
    that:
      - old_etcd_members is defined
    msg: "'old_etcd_members' must be defined when the etcd cluster has quorum."
  when: etcd_cluster_is_healthy | bool
# Show the warning and give the operator 30 seconds to abort (Ctrl+C) before
# the play carries on by itself.
# `pause` is used because the `delay` keyword only takes effect together with
# `until`/`retries`; on a plain `debug` task it is ignored and no wait occurs.
- name: Warn for untested recovery
  pause:
    prompt: >-
      Control plane recovery of split control planes is UNTESTED!
      Abort or continue at your own risk.
    seconds: 30
  when: not control_plane_is_converged | bool