Documentation and playbook for recovering control plane from node failure (#4146)

This commit is contained in:
qvicksilver
2019-04-29 10:40:20 +02:00
committed by Kubernetes Prow Robot
parent 9335cdcebc
commit 48a182844c
9 changed files with 287 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
---
# prepare.yml is included unconditionally.
- include_tasks: prepare.yml

# recover_lost_quorum.yml is included only when `has_etcdctl` is truthy and
# `etcd_cluster_is_healthy` is falsy (both are set outside this file).
- include_tasks: recover_lost_quorum.yml
  when:
    - has_etcdctl
    - not etcd_cluster_is_healthy

View File

@@ -0,0 +1,47 @@
---
# Remove certificate files whose names contain each entry of the
# comma-separated `old_etcds` variable. A shell is needed because the paths
# rely on glob expansion.
- name: Delete old certificates
  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
  with_items: "{{ old_etcds.split(',') }}"
  register: delete_old_certificates
  # Errors are judged by the following task so that already-missing files
  # do not abort the play.
  ignore_errors: true
  when: old_etcds is defined

# Fail for every entry whose `rm` exited non-zero for a reason other than
# "No such file or directory".
- name: Fail if unable to delete old certificates
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  loop: "{{ delete_old_certificates.results }}"
  when:
    - old_etcds is defined
    - "item.rc != 0 and 'No such file or directory' not in item.stderr"
# Query the etcd member list with the v3 API and register the output as
# `member_list`. The task is read-only, so it never reports a change and also
# runs in check mode.
- name: Get etcd cluster members
  command: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  changed_when: false
  check_mode: false
  environment:
    ETCDCTL_API: "3"
    # etcdctl v3 takes the CA bundle from ETCDCTL_CACERT (the --cacert flag);
    # ETCDCTL_CA_FILE is the v2 variable name and is not used by the v3 API.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
# Remove every etcd member whose name appears in the comma-separated
# `old_etcd_members` variable.
#
# Each line of `member_list.stdout_lines` is stripped of spaces and split on
# commas; field 0 is used as the member ID and field 2 as the member name.
# The nested loop pairs every requested name with every listed member, and
# the last `when` clause keeps only the matching pairs.
- name: Remove old cluster members
  command: >-
    {{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }}
    member remove {{ item[1].replace(' ', '').split(',')[0] }}
  environment:
    ETCDCTL_API: "3"
    # etcdctl v3 takes the CA bundle from ETCDCTL_CACERT (the --cacert flag);
    # ETCDCTL_CA_FILE is the v2 variable name and is not used by the v3 API.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  with_nested:
    - "{{ old_etcd_members.split(',') }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
    # Skip output lines that do not carry at least three fields, so the
    # index lookup below cannot raise on unexpected output.
    - "item[1].replace(' ', '').split(',') | length > 2"
    - "item[0] == item[1].replace(' ', '').split(',')[2]"

View File

@@ -0,0 +1,54 @@
---
# Take a fresh snapshot into /tmp/snapshot.db, but only when the caller did
# not supply one through `etcd_snapshot`.
- name: Save etcd snapshot
  command: "{{ bin_dir }}/etcdctl snapshot save /tmp/snapshot.db"
  environment:
    ETCDCTL_API: "3"
    # etcdctl v3 takes the CA bundle from ETCDCTL_CACERT (the --cacert flag);
    # ETCDCTL_CA_FILE is the v2 variable name and is not used by the v3 API.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
  when: etcd_snapshot is not defined
# When `etcd_snapshot` is defined, copy that file to /tmp/snapshot.db on the
# target host so the restore task can read it from the fixed path.
- name: Transfer etcd snapshot to host
  when: etcd_snapshot is defined
  copy:
    dest: /tmp/snapshot.db
    src: "{{ etcd_snapshot }}"
# Stop the etcd systemd unit before the following tasks replace its data
# directory.
- name: Stop etcd
  systemd:
    state: stopped
    name: etcd
# Delete the existing data directory recursively. The `file` module is used
# instead of `rm -rf` in a shell so the path is never subject to shell word
# splitting or globbing, and the task only reports a change when something
# was actually removed.
- name: Remove etcd data-dir
  file:
    path: "{{ etcd_data_dir }}"
    state: absent
# Restore /tmp/snapshot.db into `etcd_data_dir` as a single-member cluster:
# --initial-cluster lists only this member (`etcd_member_name` at
# `etcd_peer_url`). The folded scalar joins the lines below with single
# spaces, producing one command line.
- name: Restore etcd snapshot
  command: >-
    {{ bin_dir }}/etcdctl snapshot restore /tmp/snapshot.db
    --name {{ etcd_member_name }}
    --initial-cluster {{ etcd_member_name }}={{ etcd_peer_url }}
    --initial-cluster-token k8s_etcd
    --initial-advertise-peer-urls {{ etcd_peer_url }}
    --data-dir {{ etcd_data_dir }}
  environment:
    ETCDCTL_API: "3"
    # etcdctl v3 takes the CA bundle from ETCDCTL_CACERT (the --cacert flag);
    # ETCDCTL_CA_FILE is the v2 variable name and is not used by the v3 API.
    ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    ETCDCTL_CERT: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "/etc/ssl/etcd/ssl/member-{{ inventory_hostname }}-key.pem"
# The temporary snapshot file is deleted once the restore task has run.
- name: Remove etcd snapshot
  file:
    state: absent
    path: /tmp/snapshot.db
# Recursively set owner and group of the data directory to etcd:etcd.
- name: Change etcd data-dir owner
  file:
    path: "{{ etcd_data_dir }}"
    recurse: true
    owner: etcd
    group: etcd
# Rewrite the ETCD_INITIAL_CLUSTER line of /etc/etcd.env so that it lists
# only this member. The pattern requires `=` directly after the key, so
# longer keys that merely start with ETCD_INITIAL_CLUSTER are left untouched.
- name: Reconfigure etcd
  replace:
    path: /etc/etcd.env
    # Group 1 captures the key and `=`; it is re-emitted via \1 below.
    regexp: "^(ETCD_INITIAL_CLUSTER=).*"
    replace: '\1{{ etcd_member_name }}={{ etcd_peer_url }}'
# Bring the etcd systemd unit back up after the restore and the
# /etc/etcd.env rewrite.
- name: Start etcd
  systemd:
    state: started
    name: etcd