mirror of
https://github.com/kubernetes-sigs/kubespray.git
synced 2026-02-28 09:39:12 +03:00
Documentation and playbook for recovering control plane from node failure (#4146)
This commit is contained in:
committed by
Kubernetes Prow Robot
parent
9335cdcebc
commit
48a182844c
7
roles/recover_control_plane/etcd/tasks/main.yml
Normal file
7
roles/recover_control_plane/etcd/tasks/main.yml
Normal file
@@ -0,0 +1,7 @@
|
||||
---
|
||||
# Preparation always runs: it removes certificates of replaced hosts and,
# on a healthy cluster, drops stale etcd members.
- include_tasks: prepare.yml

# Snapshot-based recovery is attempted only when etcdctl is present on the
# host and the cluster was found to be unhealthy.
- include_tasks: recover_lost_quorum.yml
  when:
    - has_etcdctl
    - not etcd_cluster_is_healthy
|
||||
47
roles/recover_control_plane/etcd/tasks/prepare.yml
Normal file
47
roles/recover_control_plane/etcd/tasks/prepare.yml
Normal file
@@ -0,0 +1,47 @@
|
||||
---
|
||||
# Best-effort removal of certificate files whose names contain one of the
# hosts in the comma-separated `old_etcds` variable. Errors are ignored
# here; the per-item results are registered so a later task can decide
# whether a failure was real.
- name: Delete old certificates
  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
  loop: "{{ old_etcds.split(',') }}"
  register: delete_old_cerificates
  ignore_errors: true
  when: old_etcds is defined
|
||||
|
||||
# Inspect the registered deletion results: a non-zero exit code is fatal
# unless stderr shows the files were simply not there.
- name: Fail if unable to delete old certificates
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  loop: "{{ delete_old_cerificates.results }}"
  changed_when: false
  when:
    - old_etcds is defined
    - item.rc != 0
    - "'No such file or directory' not in item.stderr"
|
||||
|
||||
# List the current etcd members through the v3 API and register the output
# so the next task can match stale members by name. Runs only when etcdctl
# exists, the cluster is healthy and `old_etcd_members` was supplied.
- name: Get etcd cluster members
  command: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  changed_when: false
  check_mode: false
  environment:
    # Quoted so the value reaches the process as the string "3".
    ETCDCTL_API: "3"
    # With ETCDCTL_API=3 the CA bundle is read from ETCDCTL_CACERT;
    # ETCDCTL_CA_FILE is the v2 name and is not applied by the v3 client.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
|
||||
|
||||
# Remove every etcd member whose name appears in the comma-separated
# `old_etcd_members` variable. Each `member list` output line has its
# spaces removed and is split on commas: field 2 is compared with the
# requested name and field 0 is passed to `member remove`.
- name: Remove old cluster members
  command: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
  environment:
    # Quoted so the value reaches the process as the string "3".
    ETCDCTL_API: "3"
    # With ETCDCTL_API=3 the CA bundle is read from ETCDCTL_CACERT;
    # ETCDCTL_CA_FILE is the v2 name and is not applied by the v3 client.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  # Cartesian product of requested names and member-list lines; the last
  # `when` clause keeps only the pairs whose names match.
  with_nested:
    - "{{ old_etcd_members.split(',') }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
    - item[0] == item[1].replace(' ','').split(',')[2]
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
# Take a fresh snapshot from the local etcd member into /tmp/snapshot.db.
# Skipped when the operator supplies a snapshot file via `etcd_snapshot`.
- name: Save etcd snapshot
  command: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
  environment:
    # Quoted so the value reaches the process as the string "3".
    ETCDCTL_API: "3"
    # With ETCDCTL_API=3 the CA bundle is read from ETCDCTL_CACERT;
    # ETCDCTL_CA_FILE is the v2 name and is not applied by the v3 client.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
  when: etcd_snapshot is not defined
|
||||
|
||||
# Upload the operator-provided snapshot file to the path the restore step
# reads from. Only runs when `etcd_snapshot` is set.
- name: Transfer etcd snapshot to host
  copy:
    dest: /tmp/snapshot.db
    src: "{{ etcd_snapshot }}"
  when: etcd_snapshot is defined
|
||||
|
||||
# Stop the etcd unit before its data directory is replaced.
- name: Stop etcd
  systemd:
    state: stopped
    name: etcd
|
||||
|
||||
# Delete the existing data directory so the snapshot can be restored into a
# clean location. The file module removes the tree recursively without
# passing the path through a shell, so no word-splitting or globbing
# applies to the value of `etcd_data_dir`.
- name: Remove etcd data-dir
  file:
    path: "{{ etcd_data_dir }}"
    state: absent
|
||||
|
||||
# Restore /tmp/snapshot.db into `etcd_data_dir` as a single-member cluster
# made up of this host only. The folded scalar renders to one command line.
- name: Restore etcd snapshot
  command: >-
    {{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db
    --name {{ etcd_member_name }}
    --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }}
    --initial-cluster-token k8s_etcd
    --initial-advertise-peer-urls {{ etcd_peer_url }}
    --data-dir {{ etcd_data_dir }}
  environment:
    # Quoted so the value reaches the process as the string "3".
    ETCDCTL_API: "3"
    # v3 name for the CA bundle; ETCDCTL_CA_FILE is the v2 spelling and is
    # not a recognised variable when ETCDCTL_API=3.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
|
||||
|
||||
# Clean up the temporary snapshot file once it has been restored.
- name: Remove etcd snapshot
  file:
    state: absent
    path: /tmp/snapshot.db
|
||||
|
||||
# Hand the restored data directory, including everything below it, to the
# etcd user and group.
- name: Change etcd data-dir owner
  file:
    path: "{{ etcd_data_dir }}"
    recurse: true
    owner: etcd
    group: etcd
|
||||
|
||||
# Rewrite the ETCD_INITIAL_CLUSTER line in /etc/etcd.env so it lists only
# this member, matching the single-member cluster created by the restore.
- name: Reconfigure etcd
  replace:
    path: /etc/etcd.env
    # Group 1 keeps the variable name; everything after `=` is replaced.
    regexp: "^(ETCD_INITIAL_CLUSTER=).*"
    replace: '\1{{ etcd_member_name }}={{ etcd_peer_url }}'
|
||||
|
||||
# Bring the etcd unit back up on the restored data directory.
- name: Start etcd
  systemd:
    state: started
    name: etcd
|
||||
28
roles/recover_control_plane/master/tasks/main.yml
Normal file
28
roles/recover_control_plane/master/tasks/main.yml
Normal file
@@ -0,0 +1,28 @@
|
||||
---
|
||||
# Poll the API server with `kubectl get nodes` until it exits 0, retrying
# up to 6 times with 10 seconds between attempts.
- name: Wait for apiserver
  shell: "{{ bin_dir }}/kubectl get nodes"
  environment:
    KUBECONFIG: /root/.kube/config
  register: apiserver_is_ready
  until: apiserver_is_ready.rc == 0
  retries: 6
  delay: 10
  changed_when: false
|
||||
|
||||
# Delete each node named in the comma-separated `old_kube_masters` variable.
# The task itself never fails; the per-item results are registered and
# judged by the following task.
- name: Delete old kube-master nodes from cluster
  shell: "{{ bin_dir }}/kubectl delete node {{ item }}"
  environment:
    KUBECONFIG: /root/.kube/config
  loop: "{{ old_kube_masters.split(',') }}"
  register: delete_old_kube_masters
  failed_when: false
  when: old_kube_masters is defined
|
||||
|
||||
# Inspect the registered deletion results: a non-zero exit code is fatal
# unless stderr reports the node as NotFound.
- name: Fail if unable to delete old kube-master nodes from cluster
  fail:
    msg: "Unable to delete old kube-master node: {{ item.item }}"
  loop: "{{ delete_old_kube_masters.results }}"
  changed_when: false
  when:
    - old_kube_masters is defined
    - item.rc != 0
    - "'NotFound' not in item.stderr"
|
||||
19
roles/recover_control_plane/post-recover/tasks/main.yml
Normal file
19
roles/recover_control_plane/post-recover/tasks/main.yml
Normal file
@@ -0,0 +1,19 @@
|
||||
---
|
||||
# TODO: Figure out why kubeadm does not fix this
|
||||
# Build a comma-separated list of https://<address>:2379 URLs, one per host
# in the `etcd` group. The address is access_ip, falling back to ip and
# then to the default IPv4 address. Whitespace control keeps the rendered
# value free of spaces, and the comma is emitted after every entry except
# the last.
- name: Set etcd-servers fact
  set_fact:
    etcd_servers: >-
      {% for host in groups['etcd'] -%}
      https://{{ hostvars[host].access_ip | default(hostvars[host].ip | default(hostvars[host].ansible_default_ipv4['address'])) }}:2379{% if not loop.last %},{% endif %}
      {%- endfor -%}
|
||||
|
||||
# Replace the value of the etcd-servers= argument in the kube-apiserver
# static pod manifest with the freshly computed server list.
- name: Update apiserver etcd-servers list
  replace:
    path: /etc/kubernetes/manifests/kube-apiserver.yaml
    # Group 1 keeps the `etcd-servers=` prefix; the rest of the line is
    # replaced.
    regexp: '(etcd-servers=).*'
    replace: '\1{{ etcd_servers }}'
|
||||
@@ -0,0 +1,2 @@
|
||||
---
|
||||
# True when the `etcd` and `kube-master` groups contain exactly the same
# hosts. Both sides are parenthesised because Jinja2 filters bind tighter
# than `==`: without the parentheses the right-hand side is the sorted list
# passed through `bool`, which is false, so the comparison could never be
# true.
control_plane_is_converged: "{{ ((groups['etcd'] | sort) == (groups['kube-master'] | sort)) | bool }}"
|
||||
36
roles/recover_control_plane/pre-recover/tasks/main.yml
Normal file
36
roles/recover_control_plane/pre-recover/tasks/main.yml
Normal file
@@ -0,0 +1,36 @@
|
||||
---
|
||||
# Probe for the etcdctl binary and register the exit status. A missing
# binary makes `test -e` exit non-zero, which would normally fail the task
# and stop the host; `failed_when: false` lets the result be recorded so
# `has_etcdctl` can be derived from it. The probe changes nothing.
- name: Check for etcdctl binary
  raw: "test -e {{ bin_dir }}/etcdctl"
  register: test_etcdctl
  failed_when: false
  changed_when: false
|
||||
|
||||
# Record whether the etcdctl probe succeeded. The comparison is
# parenthesised because a Jinja2 filter binds tighter than `==`; without
# the parentheses `bool` is applied to the literal 0, not to the result.
- name: Set has_etcdctl fact
  set_fact:
    has_etcdctl: "{{ (test_etcdctl.rc == 0) | bool }}"
|
||||
|
||||
# Ask etcdctl for the cluster health and succeed only when the output
# contains 'cluster is healthy'. A failing check is ignored here; its exit
# status is registered for the next task. Runs in check mode too and is
# skipped when etcdctl is not available.
- name: Check if etcd cluster is healthy
  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
  environment:
    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  register: etcd_cluster_health
  changed_when: false
  ignore_errors: true
  check_mode: false
  when: has_etcdctl
|
||||
|
||||
# Record whether the health check exited 0. When the check was skipped
# (no etcdctl) the registered result has no `rc`, so a non-zero default is
# used and the cluster is reported as not healthy instead of failing on an
# undefined attribute. The comparison is parenthesised because a Jinja2
# filter binds tighter than `==`.
- name: Set etcd_cluster_is_healthy fact
  set_fact:
    etcd_cluster_is_healthy: "{{ ((etcd_cluster_health.rc | default(1)) == 0) | bool }}"
|
||||
|
||||
# On a healthy cluster the stale members must be named explicitly via
# `old_etcd_members`; stop here if they were not. `that` takes a bare
# conditional expression, so it is written without `{{ }}` to avoid
# templating it twice.
- name: Abort if etcd cluster is healthy and old_etcd_members is undefined
  assert:
    that:
      - old_etcd_members is defined
    msg: "'old_etcd_members' must be defined when the etcd cluster has quorum."
  when: etcd_cluster_is_healthy
|
||||
|
||||
# Show the warning and hold the play for 30 seconds so the operator can
# abort. `delay` only takes effect together with `until`/`retries`, so on a
# plain debug task it produced no wait at all; `pause` provides the
# intended window and continues automatically once the time is up.
- name: Warn for untested recovery
  pause:
    prompt: Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk.
    seconds: 30
  when: not control_plane_is_converged
|
||||
Reference in New Issue
Block a user