Compare commits


447 Commits

Author SHA1 Message Date
Matthew Mosesohn
acae0fe4a3 Merge pull request #1205 from holser/resolv_updates
Refactoring resolv.conf
2017-04-05 14:22:52 +03:00
Sergii Golovatiuk
2670eefcd4 Refactoring resolv.conf
- Renaming templates for netchecker
- Add dnsPolicy: ClusterFirstWithHostNet to kube-proxy

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-04-05 09:28:01 +02:00
Matthew Mosesohn
c0cae9e8a0 Merge pull request #1204 from mattymo/resolvconf-nodes
Restart kubelet when updating /etc/resolv.conf on all k8s nodes
2017-04-04 22:03:44 +03:00
Matthew Mosesohn
f8cf6b4f7c Merge pull request #1186 from holser/resolv_conf
Set ClusterFirstWithHostNet for Pods with hostnetwork: true
2017-04-04 20:49:55 +03:00
Matthew Mosesohn
a29182a010 Restart kubelet when updating /etc/resolv.conf on all k8s nodes 2017-04-04 20:43:47 +03:00
Sergii Golovatiuk
1cfe0beac0 Set ClusterFirstWithHostNet for Pods with hostnetwork: true
In Kubernetes 1.6, ClusterFirstWithHostNet was added as an option. With
it, kubelet generates the pod's resolv.conf based on its own
resolv.conf. However, this doesn't create 'options', so the proper
solution requires some investigation.

This patch sets the same resolv.conf for kubelet as the host.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-04-04 16:34:13 +02:00
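For illustration of the option mentioned in the commit above, a minimal pod spec sketch (the pod name and image are placeholders, not taken from this patch):

```
apiVersion: v1
kind: Pod
metadata:
  name: hostnet-example   # placeholder name
spec:
  hostNetwork: true
  # Added in Kubernetes 1.6: resolve cluster DNS first even with host networking
  dnsPolicy: ClusterFirstWithHostNet
  containers:
    - name: app
      image: busybox
      command: ["sleep", "3600"]
```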
Matthew Mosesohn
798f90c4d5 Merge pull request #1153 from mattymo/graceful_drain
Move graceful upgrade test to Ubuntu canal HA, adjust drain
2017-04-04 17:33:53 +03:00
Matthew Mosesohn
fac4334950 Merge pull request #1201 from mattymo/configurable_failure
Make any_errors_fatal configurable
2017-04-04 15:51:56 +03:00
Matthew Mosesohn
f8d44a8a88 Merge pull request #1200 from mattymo/issue1190
Fix multiline condition for k8s check certs
2017-04-04 15:48:05 +03:00
Matthew Mosesohn
1136a94a6e Merge pull request #1191 from justindowning/patch-1
pin ansible to version 2.2.1.0
2017-04-04 13:42:32 +03:00
Matthew Mosesohn
fd20e0de90 Wait for container creation in check network test 2017-04-04 13:12:24 +03:00
Matthew Mosesohn
a1150dc334 Make any_errors_fatal configurable
Useful at scale when 1 or 2 nodes may fail: you can proceed with
the majority and go back and fix the others later.
2017-04-04 12:52:47 +03:00
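A minimal sketch of how a play can expose this as a tunable (the variable name is illustrative, not necessarily the one used in the PR):

```
# Operators can set any_errors_fatal=false at scale to tolerate a few
# failed nodes and continue with the majority.
- hosts: k8s-cluster
  any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
  roles:
    - { role: kubernetes/node }
```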
Matthew Mosesohn
b4d06ff8dd Add /var/lib/cni to kubelet
Necessary to persist this directory for host-local IPAM used by Canal
Add pre-upgrade task to copy /var/lib/cni out of old kubelet.
2017-04-03 19:38:24 +03:00
Matthew Mosesohn
7581705007 Merge pull request #1185 from intelsdi-x/hostname
Use hostname module to set hostname, and do it for all OSes not only Co…
2017-04-03 19:01:12 +03:00
Matthew Mosesohn
5a5707159a Fix multiline condition for k8s check certs
Fixes #1190
2017-04-03 17:44:55 +03:00
Matthew Mosesohn
742a1681ce Merge pull request #1166 from rogerwelin/master
add iptables --flush to reset role
2017-04-03 17:25:10 +03:00
Matthew Mosesohn
fba9b9cb65 Merge pull request #1182 from artem-panchenko/bumpCalicoPolicyControllerVersion
Bump calico policy controller version
2017-04-03 17:21:52 +03:00
Paweł Skrzyński
61b2d7548a Use hostname module to set hostname, and do it for all OSes, not only CoreOS 2017-04-03 15:09:33 +02:00
Matthew Mosesohn
80828a7c77 use etcd2 when upgrading unless forced 2017-04-03 15:07:42 +03:00
Matthew Mosesohn
f5af86c9d5 Merge pull request #1194 from adidenko/fix-sync_certs
Fix multiline when condition in sync_certs task
2017-03-31 17:39:40 +03:00
Aleksandr Didenko
58acbe7caf Fix multiline when condition in sync_certs task
Folded style in a multiline 'when' condition causes an error with an
unexpected indent. Changing it to literal style should fix
the issue.

Closes #1190
2017-03-30 22:21:04 +02:00
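To illustrate the YAML detail behind the fix above (the condition shown is a placeholder, not the actual task):

```
# Folded scalar (>), reported to break with an unexpected-indent error:
when: >-
  inventory_hostname == groups['kube-master'][0] and
  sync_certs | default(false)

# Literal scalar (|), the style this commit switches to:
when: |
  inventory_hostname == groups['kube-master'][0] and
  sync_certs | default(false)
```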
Spencer Smith
355b92d7ba Merge pull request #1170 from jlothian/atomic-docker-network
1169 - fix docker systemd unit
2017-03-30 13:13:28 -07:00
Matthew Mosesohn
d42e4f2344 Update .gitlab-ci.yml 2017-03-30 12:19:15 +04:00
Justin Downing
907e43b9d5 pin ansible to version 2.2.1.0
ansible 2.2.2.0 has an [issue]() that causes problems for kargo:

```
(env) kargo ᐅ env/bin/ansible-playbook upgrade-cluster.yml 
ERROR! Unexpected Exception: 'Host' object has no attribute 'remove_group'
```

Pinning ansible to 2.2.1.0 resolved this for me.
2017-03-29 21:40:34 -04:00
Matthew Mosesohn
fb467df47c fix etcd restart 2017-03-29 23:22:49 +04:00
Matthew Mosesohn
48beef25fa delete master containers forcefully 2017-03-27 19:08:22 +03:00
Matthew Mosesohn
a3f568fc64 restart scheduler and controller-manager too 2017-03-27 13:51:35 +03:00
Matthew Mosesohn
57ee304260 ensure post-upgrade purge ones only once 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
0794a866a7 switch debian8-canal-ha to ubuntu 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
49e4d344da move network plugins out of grouped upgrades 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
21a9dea99f move kubernetes-apps/network-plugin back to master role 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
6e505c0c3f Fix delegate tasks for kubectl and etcdctl 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
e9a294fd9c Significantly reduce memory requirements
Canal runs more pods and upgrades need a bit of extra
room to load new pods in and get the old ones out.
2017-03-27 13:28:37 +03:00
Matthew Mosesohn
44d851d5bb Only cordon Ready nodes 2017-03-27 13:28:37 +03:00
Matthew Mosesohn
5ed03ce7f0 Use checksum of dnsmasq config to trigger updates of dnsmasq
Allows config changes made by Ansible to restart dnsmasq deployment
2017-03-27 13:28:37 +03:00
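A hedged sketch of the checksum trick from the commit above, assuming a templated config and a deployment template (task and variable names are illustrative):

```
- name: Write dnsmasq config
  template:
    src: 01-kube-dns.conf.j2
    dest: /etc/dnsmasq.d/01-kube-dns.conf
  register: dnsmasq_config   # the template module returns a checksum

# In the deployment's pod template, embed the checksum so any config
# change produces a new pod spec and triggers a rolling update:
#   annotations:
#     kargo/config-checksum: "{{ dnsmasq_config.checksum }}"
```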
Matthew Mosesohn
c1b9660ec8 Move graceful upgrade test to debian canal HA, adjust drain
Graceful upgrades require 3 nodes
Drain now has a command timeout of 40s
2017-03-27 13:28:37 +03:00
Matthew Mosesohn
c2c334d22f Merge pull request #1181 from holser/refactor_etcd
Refactor etcd role
2017-03-27 13:05:35 +03:00
Antoine Legrand
ed5c848473 Merge pull request #1175 from zoidbergwill/patch-1
Fix markdown of heading in README
2017-03-27 09:33:43 +02:00
Sergii Golovatiuk
f144fd1ed3 Refactor etcd role
- Run docker run from script rather than directly from systemd target
- Refactoring styling/templates

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-03-24 12:34:15 +01:00
Artem Panchenko
e96557f410 Bump calico policy controller version
Latest released version of kube-policy-controller
contains important bug fixes and should be used
by default.
2017-03-24 12:13:09 +02:00
Antoine Legrand
ac96d5ccf0 Merge pull request #1176 from zoidbergwill/patch-2
Update roadmap.md
2017-03-23 12:05:35 +01:00
Matthew Mosesohn
b2af19471e Merge pull request #1177 from rutsky/replace-nbsp
replace non-breakable space with regular space
2017-03-23 12:59:45 +03:00
Matthew Mosesohn
6805d0ff2b Merge pull request #1179 from kubernetes-incubator/missing_defaults
Add missing defaults
2017-03-23 12:16:13 +03:00
Antoine Legrand
6e1de9d820 Add missing defaults 2017-03-23 10:05:34 +01:00
Matthew Mosesohn
d27ca7854f Merge pull request #1161 from VincentS/aws_deployment
Fixes for AWS Terraform Deployment
2017-03-23 11:59:39 +03:00
Vladimir Rutsky
c4e57477fb replace non-breakable space with regular space
A non-breakable space is the 0xc2 0xa0 byte sequence in UTF-8.

To find one:

    $ git grep -I -P '\xc2\xa0'

To replace with regular space:

    $ git grep -l -I -P '\xc2\xa0' | xargs sed -i 's/\xc2\xa0/ /g'

This commit doesn't include changes that will overlap with commit f1c59a91a1.
2017-03-23 00:25:01 +03:00
William Martin Stewart
f1c59a91a1 Update roadmap.md 2017-03-22 22:03:06 +02:00
William Martin Stewart
74c573ef04 Update README.md 2017-03-22 22:01:44 +02:00
Matthew Mosesohn
5f082bc0e5 Merge pull request #1172 from mattymo/dnsmasq_upgrade
Use checksum of dnsmasq config to trigger updates of dnsmasq
2017-03-22 18:00:10 +03:00
Matthew Mosesohn
0e3b7127b5 Merge pull request #1167 from mattymo/dnsmasq_when_deploying_master
Change wait for dnsmasq to skip if there are no kube-nodes in play
2017-03-22 17:59:56 +03:00
Roger Welin
f4638c7580 add iptables --flush to reset role 2017-03-22 11:10:24 +01:00
Matthew Mosesohn
8b0b500c89 Use checksum of dnsmasq config to trigger updates of dnsmasq
Allows config changes made by Ansible to restart dnsmasq deployment
2017-03-22 13:03:55 +03:00
Matthew Mosesohn
04746fc4d8 Merge pull request #1163 from mattymo/kvm_setup
Add KVM hypervisor playbook to contrib
2017-03-22 12:31:14 +03:00
Matthew Mosesohn
463ef3f8bc Merge pull request #1168 from mattymo/disable_download_delegate
Disable download_run_once and download_localhost for most CI scenarios
2017-03-22 12:19:24 +03:00
Josh Lothian
5e2f78424f 1169 - fix docker systemd unit
The docker-network environment file masks the new values
put into /etc/systemd/system/docker.service.d/flannel-options.conf
to renumber the docker0 to work correctly with flannel.
2017-03-21 15:22:14 -05:00
Matthew Mosesohn
3889c2e01c Add KVM hypervisor playbook to contrib
Optional Ansible playbook for preparing a host for running Kargo.
This includes creation of a user account, some basic packages,
and sysctl values required to allow CNI networking on a libvirt network.
2017-03-21 19:50:01 +03:00
Matthew Mosesohn
1887e984a0 Change wait for dnsmasq to skip if there are no kube-nodes in play
Also changed unnecessary delay to a max timeout (now defaulting to 1s sleep
between tries)

Also rename play_hosts to ansible_play_hosts
2017-03-21 18:55:22 +03:00
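The retry pattern described in the commit above, as a sketch (resource names and counts are illustrative):

```
- name: Wait for dnsmasq endpoints
  command: kubectl --namespace=kube-system get endpoints dnsmasq
  register: result
  until: result.rc == 0
  retries: 60      # max timeout = retries * delay
  delay: 1         # short sleep between tries instead of one long wait
  when: groups['kube-node'] | length > 0   # skip when no kube-nodes in play
```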
Matthew Mosesohn
a495bbc1db Disable download_run_once and download_localhost for most CI scenarios
This adds time to deployment, so we should only test it sparingly during
daily master.
2017-03-21 16:41:30 +03:00
Matthew Mosesohn
cd429d3654 Merge pull request #1159 from holser/etcd_backup_restore
Backup etcd
2017-03-21 13:07:44 +03:00
Matthew Mosesohn
771aef0b44 Merge pull request #1162 from holser/bump_coreos_ci
Bump CoreOS stable to latest version
2017-03-20 17:45:04 +03:00
Matthew Mosesohn
f7ef452d8a Merge pull request #1160 from mattymo/simpler_idempotency
Make reset check on idempotency check optional
2017-03-20 17:04:51 +03:00
Matthew Mosesohn
0f64f8db90 Merge pull request #1155 from mattymo/helm
Add helm deployment
2017-03-20 17:00:06 +03:00
Sergii Golovatiuk
c04a6254b9 Backup etcd data before restarting etcd
etcd is crucial part of kubernetes cluster. Ansible restarts etcd on
reconfiguration. Backup helps operator to restore cluster manually in
case of any issues.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-03-20 14:50:52 +01:00
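A minimal sketch of such a pre-restart backup, assuming the etcd v2 data layout (paths and the task name are illustrative):

```
- name: Backup etcd data before restart
  command: >-
    etcdctl backup
    --data-dir {{ etcd_data_dir | default('/var/lib/etcd') }}
    --backup-dir /var/backups/etcd-{{ ansible_date_time.epoch }}
```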
Sergii Golovatiuk
485e17d6ed Bump CoreOS stable to latest version
1298.6.0 fixes some sporadic network issues. It also includes docker
1.12.6 which includes several stability fixes for kubernetes.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-03-20 14:31:33 +01:00
Vincent Schwarzer
952ab03d2a Fixes for AWS Terraform Deployment 2017-03-20 12:08:17 +01:00
Matthew Mosesohn
bbb524018e Make reset check on idempotency check optional
By default we do not test reset.yml now.
2017-03-20 13:16:58 +03:00
Matthew Mosesohn
859c08620b Merge pull request #1105 from VincentS/aws_deployment
AWS Terraform for Kargo
2017-03-20 12:55:11 +03:00
Antoine Legrand
f6cd42e6e0 Merge pull request #1158 from rutsky/patch-6
limit jinja2 version to <2.9
2017-03-19 23:42:11 +01:00
Vladimir Rutsky
61ee67d612 limit jinja2 version to <2.9
Ansible 2.2.1 requires jinja2<2.9, see <https://github.com/ansible/ansible/blob/v2.2.1.0-1/setup.py#L25>,
but without explicitly limiting the upper jinja2 version here, pip ignores
Ansible's requirements and installs the latest available jinja2
(pip is not very smart here), which is incompatible with
Ansible 2.2.1.
With an incompatible jinja2 version, "ansible-vault create" (and probably other parts)
fails with:
  ERROR! Unexpected Exception: The 'jinja2<2.9' distribution was not found 
  and is required by ansible
This upper limit should be removed in 2.2.2 release, see:
<978311bf3f>
2017-03-20 01:33:08 +03:00
Matthew Mosesohn
939c1def5d Merge pull request #1152 from mattymo/redhat_weave
Fix weave on RHEL deployment
2017-03-19 16:45:20 +03:00
Matthew Mosesohn
b7ab80e8ea Merge pull request #1149 from mattymo/centos-retries
Retry yum/apt/rpm download commands
2017-03-18 11:12:36 +03:00
Matthew Mosesohn
b69d4b0ecc Add helm deployment 2017-03-17 20:24:41 +03:00
Matthew Mosesohn
2f437d7452 Merge pull request #1157 from rutsky/remove-change-k8s-version
remove obsolete script
2017-03-17 20:23:34 +03:00
Vladimir Rutsky
d761216ec1 remove obsolete script
Currently Kubernetes version can be selected using "kube_version" variable.
2017-03-17 20:09:36 +03:00
Matthew Mosesohn
088e9be931 Merge pull request #1156 from rutsky/patch-5
fix jinja package name
2017-03-17 20:08:36 +03:00
Vladimir Rutsky
32ecac6464 fix jinja package name
Jinja 2.* releases are published under `Jinja2` name.
2017-03-17 20:07:49 +03:00
Matthew Mosesohn
7760c3e4aa Retry yum/apt/rpm download commands, fix succeeded filter 2017-03-17 18:56:26 +03:00
Matthew Mosesohn
3cfb76e57f Merge pull request #1146 from mattymo/resolvconf_optimize
Condense resolvconf sources before starting loop
2017-03-17 18:42:32 +03:00
Matthew Mosesohn
e1faeb0f6c Fix weave on RHEL deployment
Reduce retry delay checking weave
Always load br_netfilter module
2017-03-17 18:17:47 +03:00
Matthew Mosesohn
25bff851dd Merge pull request #1136 from adidenko/fix-calico-policy-order
Move calico-policy-controller into separate role
2017-03-17 17:32:14 +03:00
Aleksandr Didenko
3a39904011 Move calico-policy-controller into separate role
By default Calico CNI does not create any network access policies
or profiles if 'policy' is enabled in the CNI config, and without any
policies/profiles network access to/from pods is blocked.

K8s related policies are created by calico-policy-controller in
such case. So we need to start it as soon as possible, before any
real workloads.

This patch also fixes kube-api port in calico-policy-controller
yaml template.

Closes #1132
2017-03-17 11:21:52 +01:00
Matthew Mosesohn
7e1fbfba64 Merge pull request #1147 from mattymo/calico-update
Update calico to 1.1.0-rc8
2017-03-17 13:17:41 +03:00
Matthew Mosesohn
a52064184e Condense resolvconf sources before starting loop 2017-03-17 13:06:56 +03:00
Matthew Mosesohn
b4a1ba828a Merge pull request #1148 from VincentS/patch-1
Fixed Formatting / Ansible-Playbook Command Upgrade Cluster
2017-03-16 19:55:59 +03:00
Vincent Schwarzer
c8c6105ee2 Fixed Formatting / Ansible-Playbook Command
- added -b and fixed typo in ansible-playbook command 
- fixed formatting issue
2017-03-16 17:53:48 +01:00
Matthew Mosesohn
0b49eeeba3 Update calico to 1.1.0-rc8
Fixes bug in CentOS/RHEL in felix related to overlayfs driver.
2017-03-16 19:23:36 +03:00
Matthew Mosesohn
b0830f0cd7 Merge pull request #1087 from bradbeam/openstack
Adding openstack domain id
2017-03-16 17:53:14 +03:00
Matthew Mosesohn
565d4a53b0 Merge pull request #1108 from idcrook/issue_1107-docker-versioning
Adding Docker CE 'stable' and 'edge' version packages
2017-03-16 16:32:13 +03:00
Matthew Mosesohn
9624662bf6 Merge pull request #1141 from mattymo/idempotency2
More idempotency fixes
2017-03-16 12:29:42 +03:00
Matthew Mosesohn
8195957461 Merge branch 'master' into idempotency2 2017-03-16 09:29:43 +03:00
Matthew Mosesohn
02fed4a082 Merge pull request #1138 from mattymo/idempotency-fixes
Idempotency fixes for etcd certs and resolvconf tasks
2017-03-16 09:20:28 +03:00
Bogdan Dobrelya
34ecf4ea51 Merge pull request #1109 from pcm32/feature/fixTerraformOS
Restores working order of contrib/terraform/openstack
2017-03-15 17:15:35 +01:00
Matthew Mosesohn
a422ad0d50 More idempotency fixes
Fixed sync_tokens fact
Fixed sync_certs for k8s tokens fact
Disabled register docker images changeability
Fixed CNI dir permission
Fix idempotency for etcd pre upgrade checks
2017-03-15 19:06:39 +03:00
Matthew Mosesohn
096d96e344 Merge pull request #1137 from holser/bug/1135
Turn on iptables for flannel
2017-03-15 17:06:42 +03:00
Bogdan Dobrelya
e61310bc89 Merge pull request #1140 from VincentS/jinja28
Added Jinja 2.8 to Docs
2017-03-15 13:18:53 +01:00
Vincent Schwarzer
111ca9584e Added Jinja 2.8 to Docs
Added Jinja 2.8 Requirements to docs and pip requirements file which
is needed to run the current Ansible Playbooks.
2017-03-15 13:11:09 +01:00
Matthew Mosesohn
7d35c4592c Merge pull request #1139 from VincentS/docu_fix
Fix for CoreOS Docu
2017-03-15 15:06:41 +03:00
Vincent Schwarzer
3e8386cbf3 Fixed CoreOS Docu
CoreOS docu was referencing outdated bootstrap playbook that
is now part of kargo itself.
2017-03-15 13:04:01 +01:00
Matthew Mosesohn
4354162067 Merge pull request #1080 from VincentS/Granular_Auth_Control
Granular authentication Control
2017-03-15 13:12:51 +03:00
Matthew Mosesohn
a62a444229 Merge pull request #1117 from mattymo/etcd3-upgrade
Migrate k8s data to etcd3 api store
2017-03-15 12:56:06 +03:00
Matthew Mosesohn
f6b72fa830 Make resolvconf preinstall idempotent 2017-03-15 01:20:13 +04:00
Sergii Golovatiuk
9667e8615f Turn on iptables for flannel
Closes: #1135
Closes: #1026
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-03-14 17:54:55 +01:00
Vincent Schwarzer
026da060f2 Granular authentication Control
It is now possible to deactivate selected authentication methods
(basic auth, token auth) inside the cluster by adding or
removing the required arguments to the Kube API Server and generating
the secrets accordingly.

The x509 authentication is currently not optional because disabling it
would affect the kubectl clients deployed on the master nodes.
2017-03-14 16:57:35 +01:00
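A hedged sketch of how such toggles can look in the apiserver manifest template (the variable names are assumptions, not necessarily the PR's):

```
# kube-apiserver args: each method is emitted only when enabled
{% if kube_basic_auth_enabled | default(true) %}
    - --basic-auth-file={{ kube_users_dir }}/known_users.csv
{% endif %}
{% if kube_token_auth_enabled | default(true) %}
    - --token-auth-file={{ kube_token_dir }}/known_tokens.csv
{% endif %}
```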
Matthew Mosesohn
3feab1cb2d Merge pull request #1134 from mattymo/1.6-support
Explicitly set cni-bin-dir
2017-03-14 17:53:08 +03:00
Matthew Mosesohn
804e9a09c0 Migrate k8s data to etcd3 api store
Default backend is now etcd3 (was etcd2).
The migration process consists of the following steps:
* check if migration is necessary
* stop etcd on first etcd server
* run migration script
* start etcd on first etcd server
* stop kube-apiserver until configuration is updated
* update kube-apiserver
* purge old etcdv2 data
2017-03-14 17:50:20 +03:00
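A rough sketch of the migration step from the list above, assuming etcd 3.x tooling (exact tasks and paths differ in the real play):

```
- name: Stop etcd before migration
  service: name=etcd state=stopped

- name: Migrate v2 keyspace to the v3 store
  command: etcdctl migrate --data-dir={{ etcd_data_dir | default('/var/lib/etcd') }}
  environment:
    ETCDCTL_API: "3"

- name: Start etcd again
  service: name=etcd state=started
```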
Matthew Mosesohn
4c6829513c Fix etcd idempotency 2017-03-14 17:23:29 +03:00
Matthew Mosesohn
4038954f96 Merge pull request #1078 from VincentS/oidc_support
Added Support for OpenID Connect Authentication
2017-03-14 12:07:21 +03:00
Matthew Mosesohn
52a6dd5427 Explicitly set cni-bin-dir 2017-03-13 20:13:21 +03:00
Matthew Mosesohn
c301dd5d94 Merge pull request #1118 from mattymo/noderolelabels
Add node labels in kubelet
2017-03-13 19:04:21 +03:00
Antoine Legrand
7cb7eee29d Merge pull request #1116 from kubernetes-incubator/contrib_docs
Reference external documentation sources
2017-03-07 13:33:25 +01:00
David Crook
a52e1069ce updated debian and ubuntu package names based on testing
docker-ce is not the .deb package name until the repositories are switched over to the new "downloads" docker webserver
2017-03-06 16:54:39 -07:00
David Crook
a8e5002aeb removed irrelevant comments 2017-03-06 16:02:53 -07:00
David Crook
c515a351c6 Merge branch 'master' into issue_1107-docker-versioning 2017-03-06 16:00:31 -07:00
Antoine Legrand
7777b30693 Merge pull request #1120 from bradbeam/fixtags
Removing cloud_provider tag to fix scenario where cloud_provider is n…
2017-03-06 19:00:41 +01:00
Brad Beam
d04fbf3f78 Removing cloud_provider tag to fix scenario where cloud_provider is not defined 2017-03-06 10:52:38 -06:00
Matthew Mosesohn
54207877bd Add node labels in kubelet
Related-issue: https://github.com/kubernetes/community/issues/300
Upgraded nodes do not obtain labels automatically.
See https://github.com/kubernetes/kubernetes/pull/29459 for more details.
2017-03-06 17:18:42 +03:00
Vincent Schwarzer
3c6b1480b8 Rewrote AWS Terraform for Kargo
Rewrote the AWS Terraform deployment for Kargo. It now supports
multiple Availability Zones, an AWS load balancer for the Kubernetes
API, a bastion host, ...

For more information see README
2017-03-06 12:52:02 +01:00
Vincent Schwarzer
b075960e3b Added Support for OpenID Connect Authentication
To use OpenID Connect Authentication, besides deploying an OpenID Connect
Identity Provider it is necessary to pass additional arguments to the Kube API Server.
These required arguments were added to the kube apiserver manifest.
2017-03-06 12:40:35 +01:00
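For reference, the kind of flags this adds to the manifest, sketched with placeholder values:

```
    # OIDC arguments for kube-apiserver (values are examples)
    - --oidc-issuer-url=https://accounts.example.com
    - --oidc-client-id=kubernetes
    - --oidc-username-claim=email
    - --oidc-groups-claim=groups
```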
Antoine Legrand
85596c2610 Merge pull request #1045 from bradbeam/vsphere
Adding vsphere cloud provider support
2017-03-06 12:34:05 +01:00
Antoine Legrand
0613e3c24d Reference external documentation sources 2017-03-06 12:25:54 +01:00
Antoine Legrand
ee5f009b95 Merge pull request #1112 from mattymo/skip_vault_if_disabled
Disable vault role properly on ansible 2.2.0
2017-03-06 11:27:53 +01:00
Antoine Legrand
d76816d043 Merge pull request #1115 from mattymo/etcd-phases
Remove standalone etcd specific play, cleanup host mode
2017-03-06 11:21:08 +01:00
Matthew Mosesohn
45274560ec Disable vault role properly on ansible 2.2.0
when condition does not seem to work correctly at playbook
level for ansible 2.2.0.
2017-03-05 00:43:01 +04:00
Matthew Mosesohn
02a8e78902 Remove standalone etcd specific play, cleanup host mode
The etcd role can now optionally disable etcd cluster setup for faster
deployment when it is combined with other roles.
2017-03-04 00:34:26 +04:00
Matthew Mosesohn
8f3d9e93ce Merge pull request #1111 from mattymo/use_find_for_certs
Use find module for checking for certificates
2017-03-03 20:08:33 +03:00
Matthew Mosesohn
a244aca6a4 Merge pull request #1113 from VincentS/AWS_IAM_PROFILES
Added Missing AWS IAM Profiles and Policies
2017-03-03 17:35:55 +03:00
Vincent Schwarzer
5ae85b9de5 Added Missing AWS IAM Profiles and Policies
The AWS IAM profiles and policies required to run Kargo on AWS
are no longer hosted in the kubernetes main repo since kube-up got
deprecated. Hence we have to move the files into the kargo repository.
2017-03-03 15:30:07 +01:00
Matthew Mosesohn
d176818c44 Use find module for checking for certificates
Also generate certs only when absent on master (rather than
when absent on target node)
2017-03-03 16:21:01 +03:00
Bogdan Dobrelya
aeec0f9a71 Merge pull request #1071 from vijaykatam/atomic_host
Add support for atomic host
2017-03-03 13:03:59 +01:00
Matthew Mosesohn
08a02af833 Merge pull request #1075 from VincentS/loadbalancer_aws
Possibility to add Loadbalancers without static IP (e.g. AWS ELB) #1074
2017-03-03 14:07:22 +03:00
Pablo Moreno
cf26585cff Restores working order of contrib/terraform/openstack, includes vault group and avoids group_vars/k8s-cluster.yml 2017-03-02 23:58:07 +00:00
David Crook
3f4a375ac4 first pass at adding 'stable' and 'edge' version packages
- Only have ubuntu to test on
  - fedora and redhat are placeholders/guesses
  - the "old" package repositories seem to have the "new" CE version which is `1.13.1` based
- `docker-ce` looks like it is named as a backported `docker-engine` package in some
  places

- Did not change the `defaults` version anywhere, so should work as before
- Did not point to new package repositories, as existing ones have the new packages.
2017-03-02 13:48:09 -07:00
Matthew Mosesohn
cc632f2713 Merge pull request #1099 from rutsky/patch-4
fix inline verbatim blocks formatting in markdown
2017-03-02 17:46:52 +03:00
Matthew Mosesohn
5ebc9a380c Merge pull request #1060 from holser/etcdv3
Allow to specify etcd backend for kube-api
2017-03-02 17:24:09 +03:00
Matthew Mosesohn
6453650895 Merge pull request #1093 from mattymo/scaledns
Add autoscalers for dnsmasq and kubedns
2017-03-02 16:58:56 +03:00
Matthew Mosesohn
9cb12cf250 Add autoscalers for dnsmasq and kubedns
By default kubedns and dnsmasq scale when installed.
Dnsmasq is no longer a daemonset; it is now a deployment.
Kubedns is no longer a replication controller; it is now a deployment.
Minimum replicas is two (to enable rolling updates).

Reduced memory requirements for dnsmasq and kubedns
2017-03-02 13:44:22 +03:00
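A sketch of the proportional-scaling approach described above: a small autoscaler deployment resizes kubedns by cluster size (image tag and parameters are illustrative, not necessarily what this commit ships):

```
containers:
  - name: autoscaler
    image: gcr.io/google_containers/cluster-proportional-autoscaler-amd64:1.0.0
    command:
      - /cluster-proportional-autoscaler
      - --namespace=kube-system
      - --configmap=kubedns-autoscaler
      - --target=Deployment/kube-dns
      # a minimum of 2 replicas keeps rolling updates possible
      - --default-params={"linear":{"nodesPerReplica":10,"min":2}}
```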
Vincent Schwarzer
68e8d74545 Changes based on feedback (additional ansible checks) 2017-03-02 11:04:10 +01:00
Vincent Schwarzer
fc054e21f6 Modified how adding LB for the Kube API is handled (AWS)
Until now it was not possible to add an API load balancer
without a static IP address, but certain load balancers
like the AWS Elastic Load Balancer don't have a fixed IP address.
With this commit it is possible to add these kinds of load balancers
to the Kargo deployment.
2017-03-02 11:04:10 +01:00
Matthew Mosesohn
3256f4bc0f Merge pull request #1103 from mattymo/upgradesyntax
Add upgrade-cluster and reset playbooks to syntax check
2017-03-02 12:41:10 +03:00
Matthew Mosesohn
0e9ad8f2c7 Merge pull request #1100 from retr0h/host-vars
Added host_vars to gitignore
2017-03-02 12:32:22 +03:00
Matthew Mosesohn
efbb5b2db3 Merge pull request #1101 from retr0h/docker-1.13.1
Use docker-engine 1.13.1
2017-03-02 12:31:58 +03:00
Matthew Mosesohn
85ed4157ff Add upgrade-cluster and reset playbooks to syntax check 2017-03-02 09:37:16 +04:00
John Dewey
a43569c8a5 Use docker-engine 1.13.1
The default version of Docker was switched to 1.13 in #1059.  This
change also bumped ubuntu from installing docker-engine 1.13.0 to
1.13.1.  This PR updates OS families which had 1.13 defined, but
were using 1.13.0.

The impetus for this change is an issue running tiller 1.2.3 on
docker 1.13.0.  See discussion [1][2].

[1] https://github.com/kubernetes/helm/issues/1838
[2] https://github.com/kubernetes-incubator/kargo/pull/1100
2017-03-01 12:53:39 -08:00
John Dewey
e771d0ea39 Updated gitignore pattern per review 2017-03-01 12:45:24 -08:00
John Dewey
9073eba405 Added host_vars to gitignore
Since inventory ships with kargo, the ability to change functionality
without having a dirty git index is nice.  An example we wish to change
is the version of docker deployed to our CentOS systems.  Due to an issue
with tiller and docker 1.13, we wish to deploy docker 1.12.  Since this
change does not belong in Kargo, we wish to locally override the docker
version until the issue is sorted.
2017-03-01 11:08:35 -08:00
Matthew Mosesohn
a5cd73d047 Merge pull request #959 from galthaus/host-mode-restart
Restart kube-controller for host_resolvconf mode
2017-03-01 20:54:21 +03:00
Vijay Katam
a0b1eda1d0 Add support for atomic host
Updates based on feedback

Simplify checks for file exists

remove invalid char

Review feedback. Use regular systemd file.

Add template for docker systemd atomic
2017-03-01 09:38:19 -08:00
Vladimir Rutsky
ad80e09ac5 fix inline verbatim blocks formatting in markdown 2017-03-01 17:50:28 +04:00
Antoine Legrand
77e5171679 Merge pull request #1076 from VincentS/etcd_openssl_count_fix
Fixed counter in ETCD Openssl.conf
2017-03-01 14:17:27 +01:00
Bogdan Dobrelya
0c66418dad Merge pull request #1090 from artem-panchenko/calicoAcceptHostEndpointConnections
Allow connections from pods to local endpoints
2017-03-01 13:37:05 +01:00
Bogdan Dobrelya
45a9eac7d2 Merge pull request #1097 from kubernetes-incubator/mattymo-patch-1
Fix vault role in upgrade-cluster.yml
2017-03-01 09:21:02 +01:00
Matthew Mosesohn
838adf7475 Fix vault role in upgrade-cluster.yml 2017-03-01 11:19:38 +03:00
Artem Panchenko
fa05d15093 Allow connections from pods to local endpoints
By default Calico blocks traffic from endpoints
to the host itself by using an iptables DROP
action. It could lead to a situation where a service
has one alive endpoint, but pods which run on
the same node cannot access it. Changed the action
to RETURN.
2017-03-01 09:21:02 +02:00
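In Calico this behaviour is governed by Felix's DefaultEndpointToHostAction; a sketch of the setting as a calico-node container env var (assuming the standard Felix env naming):

```
        - name: FELIX_DEFAULTENDPOINTTOHOSTACTION
          value: "RETURN"   # default is DROP; RETURN lets host rules decide
```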
Antoine Legrand
1122740bd7 Merge pull request #1094 from retr0h/vagrant-flannel
Ensure vagrant uses flannel
2017-03-01 00:07:24 +01:00
John Dewey
f877278075 Ensure vagrant uses flannel
The Vagrantfile is set up to use flannel.  The default network
was changed to Calico (#1031); however, the Vagrantfile was
not updated to reflect this.  This ensures the Vagrantfile remains
functional on master until someone decides to make it work
with Calico.
2017-02-28 13:31:28 -08:00
Matthew Mosesohn
cbaa6abdd0 Merge pull request #1066 from bradbeam/rkt-kubelet-cloudprovider
Adding KUBELET_CLOUDPROVIDER to kubelet.rkt.service
2017-02-28 20:02:56 +03:00
Matthew Mosesohn
76a4803292 Merge pull request #1084 from mattymo/fixubunturktjob
Remove upgrade from the ubuntu-rkt-sep CI job
2017-02-28 20:02:05 +03:00
Antoine Legrand
b286b2eb31 Merge pull request #1083 from holser/api_port
Change kube-api default port from 443 to 6443
2017-02-28 17:57:35 +01:00
Sergii Golovatiuk
295103adc0 Allow to specify etcd backend for kube-api
The Kubernetes project is about to make etcdv3 the default storage
engine in 1.6. This patch allows specifying a particular backend for
kube-apiserver. A user may force the option to etcdv3 for a new
environment. At the same time, if the environment uses v2 it will
continue to use it until the user decides to upgrade to v3.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-28 17:13:22 +01:00
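Expressed as an inventory variable, the choice might look like this sketch (the variable name is an assumption based on the commit description):

```
# Force the v3 store for new clusters; keep etcd2 for existing ones
kube_apiserver_storage_backend: etcd3
```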
Sergii Golovatiuk
d31c040dc0 Change kube-api default port from 443 to 6443
The operator can specify any port for kube-api (6443 by default). This
helps in cases where some pods, such as Ingress, require 443 exclusively.

Closes: 820
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-28 15:45:35 +01:00
Brad Beam
bfff06d402 Adding KUBELET_CLOUDPROVIDER to kubelet.rkt.service 2017-02-28 06:29:35 -06:00
Matthew Mosesohn
21d3d75827 Merge pull request #1086 from bradbeam/lowermem
Lower default memory requests
2017-02-28 13:37:28 +03:00
Matthew Mosesohn
2c3538981a Merge pull request #1077 from holser/bug/1073
Make etcd data dir configurable.
2017-02-28 13:19:20 +03:00
Brad Beam
30a9899262 Making openstack domain name optional 2017-02-27 21:19:27 -06:00
Xavier Lange
dd10b8a27c Bug fix: support kilo's keystone requirement for domain-name, extracts from ENV var 2017-02-27 21:18:30 -06:00
Brad Beam
dbf13290f5 Updating vsphere cloud provider support 2017-02-27 15:08:04 -06:00
Sergii Golovatiuk
f9ff93c606 Make etcd data dir configurable.
Closes: #1073
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-27 21:35:51 +01:00
Jan Jungnickel
df476b0088 Initial support for vsphere as cloud provider 2017-02-27 12:51:41 -06:00
Brad Beam
56664b34a6 Lower default memory requests
This is to address out of memory issues on CI as well as help
fit deployments for people starting out with kargo on smaller
machines
2017-02-27 10:53:43 -06:00
Matthew Mosesohn
efb45733de Remove upgrade from the ubuntu-rkt-sep CI job 2017-02-27 18:16:22 +03:00
Vincent Schwarzer
0cbc3d8df6 Fixed counter in ETCD Openssl.conf
When an apiserver_loadbalancer_domain_name is added to the openssl.conf,
the counter is not increased correctly. This didn't seem to have an
effect on the current kargo version.
2017-02-27 12:01:09 +01:00
Bogdan Dobrelya
27b4e61c9f Merge pull request #946 from neith00/master
Using the command module instead of raw
2017-02-27 10:59:53 +01:00
Bogdan Dobrelya
069606947c Merge pull request #1063 from bogdando/fix
Align LB defaults with the HA docs
2017-02-27 10:14:42 +01:00
Matthew Mosesohn
6ae6b7cfcd Merge pull request #1072 from gkopylov/fix_doc_issue
Fix cluster.yml file extension in docs
2017-02-26 15:12:45 +03:00
Kopylov German
d197ce230f Fix cluster.yml file extension in docs 2017-02-26 13:42:52 +03:00
Matthew Mosesohn
c6cb0d3984 Merge pull request #1069 from holser/increase_ssl_ttl
Increase SSL TTL to 3650 days
2017-02-25 10:47:30 +03:00
Sergii Golovatiuk
00cfead9bb Increase SSL TTL to 3650 days
In real scenarios 365 days is a short period of time; 3650 days is good
enough for long-running k8s environments.
2017-02-24 15:38:13 +01:00
Antoine Legrand
20b1e4db0b Merge pull request #1068 from holser/uncomment_all.yml
Uncomment one key/value in all.yml
2017-02-24 12:54:51 +01:00
Sergii Golovatiuk
a098a32f7d Uncomment one key/value in all.yml
all.yml shouldn't be empty, otherwise ansible won't be able to merge 2
dicts.

Related bug: ansible/issues/21889
2017-02-24 12:25:45 +01:00
Antoine Legrand
9ee9a1033f Merge pull request #1067 from kubernetes-incubator/ant31-patch-2
Uncommented group_vars variables
2017-02-24 11:45:17 +01:00
Antoine Legrand
eb904668b2 Uncommented group_vars variables 2017-02-24 10:54:25 +01:00
Bogdan Dobrelya
75b69876a3 Merge pull request #1064 from kubernetes-incubator/rework_vars
Add default var role
2017-02-23 21:48:23 +01:00
Antoine Legrand
08d9d24320 Add subnet var in tests 2017-02-23 15:14:28 +01:00
Antoine Legrand
c7d61af332 Comment all variables in group_vars 2017-02-23 14:02:57 +01:00
Antoine Legrand
5f7607412b Add default var role 2017-02-23 12:07:17 +01:00
Antoine Legrand
403fea39f7 Merge pull request #829 from bogdando/opts
Rework group/role vars
2017-02-23 10:39:43 +01:00
Bogdan Dobrelya
f2a4619c57 Align LB defaults with the HA docs
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-02-23 10:32:44 +01:00
Bogdan Dobrelya
712872efba Rework inventory all by real groups' vars
* Leave all.yml to keep only optional vars
* Store groups' specific vars by existing group names
* Fix optional vars casted as mandatory (add default())
* Fix missing defaults for an optional IP var
* Relink group_vars for terraform to reflect changes

Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-02-23 09:43:42 +01:00
Matthew Mosesohn
8cbf3fe5f8 Merge pull request #1020 from mattymo/synthscale
Add synthetic scale deployment mode
2017-02-22 19:15:46 +03:00
Matthew Mosesohn
02137f8cee Merge pull request #1059 from holser/docker_iptables
iptables switch for docker
2017-02-22 08:23:58 +03:00
Matthew Mosesohn
43ea281a7f Merge pull request #1061 from ivan4th/fix-shell-vars
Fix shell special vars
2017-02-22 08:23:44 +03:00
Ivan Shvedunov
0006e5ab45 Fix shell special vars 2017-02-21 22:22:40 +03:00
Matthew Mosesohn
d821448e2f Merge branch 'master' into synthscale 2017-02-21 22:17:43 +03:00
Sergii Golovatiuk
3bd46f7ac8 Switch docker to 1.13
- Remove variable dup for Ubuntu
- Update Docker to 1.13
2017-02-21 19:10:34 +01:00
Sergii Golovatiuk
ebf9daf73e Statically disable iptables management for docker
Docker 1.13 changes the behaviour of iptables defaults from allow
to drop. This patch disables docker's iptables management as it was
in Docker 1.12 [1]

[1] https://github.com/docker/docker/pull/28257

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-21 19:10:34 +01:00
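The effect, sketched as a dockerd option (the variable name is illustrative; --iptables=false is the documented daemon flag):

```
# Restore Docker 1.12 behaviour: do not let dockerd manage iptables
docker_options: "--iptables=false {{ docker_extra_opts | default('') }}"
```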
Matthew Mosesohn
2ba66f0b26 Change coreos-alpha dns mode to host_resolvconf 2017-02-21 18:14:42 +03:00
Matthew Mosesohn
0afadb9149 Merge pull request #1046 from skyscooby/pedantic-syntax-cleanup
Cleanup legacy syntax, spacing, files all to yml
2017-02-21 17:03:16 +03:00
Matthew Mosesohn
19d0159e33 Raise timeout for get netchecker agents 2017-02-21 14:48:25 +03:00
Matthew Mosesohn
d4f15ab402 Merge pull request #1055 from mattymo/etcd-preupgrade-speedup
speed up etcd preupgrade check
2017-02-21 12:51:42 +03:00
Matthew Mosesohn
527e030283 Merge pull request #1058 from holser/update_calico_cni
Update calico-cni to 1.5.6
2017-02-20 23:09:47 +03:00
Matthew Mosesohn
634e6a381c Merge pull request #1043 from rutsky/patch-3
fix typos in azure docs
2017-02-20 20:24:05 +03:00
Matthew Mosesohn
042d094ce7 Merge pull request #1034 from rutsky/fix-openssl-lb-index
fix load balancer DNS name index evaluation in openssl.conf
2017-02-20 20:23:26 +03:00
Matthew Mosesohn
3cc1491833 Merge branch 'master' into pedantic-syntax-cleanup 2017-02-20 20:19:38 +03:00
Matthew Mosesohn
d19e6dec7a speed up etcd preupgrade check 2017-02-20 20:18:10 +03:00
Matthew Mosesohn
6becfc52a8 Merge pull request #1056 from mattymo/k8s153
Update Kubernetes to v1.5.3
2017-02-20 20:13:40 +03:00
Sergii Golovatiuk
a2cbbc5c4f Update calico-cni to 1.5.6
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-20 17:14:45 +01:00
Matthew Mosesohn
10173525d8 Update Kubernetes to v1.5.3 2017-02-20 18:14:56 +03:00
Antoine Legrand
ccdb72a422 Merge pull request #1053 from hvnsweeting/master
Update Doc
2017-02-20 10:42:16 +01:00
Hung Nguyen Viet
df96617d3c Only 1 key needed 2017-02-20 14:54:20 +07:00
Antoine Legrand
09aa3e0e79 Merge pull request #1052 from hvnsweeting/master
Put Ansible requirements first
2017-02-20 08:44:16 +01:00
Hung Nguyen Viet
a673e97f02 Put Ansible requirements first
And re-phrase all sentences to passive tense
2017-02-20 14:39:51 +07:00
Matthew Mosesohn
43e86921e0 pin coreos-alpha to 1325 2017-02-19 16:23:35 +03:00
Antoine Legrand
ad58e08a41 Merge pull request #1049 from alop/selinux
Safe disable SELinux
2017-02-19 10:26:01 +01:00
Abel Lopez
0bfc2d0f2f Safe disable SELinux
Sometimes, a sysadmin might outright delete the SELinux rpms and
delete the configuration. This causes the selinux module to fail
with
```
IOError: [Errno 2] No such file or directory: '/etc/selinux/config'\n",
"module_stdout": "", "msg": "MODULE FAILURE"}
```

This simply checks that /etc/selinux/config exists before we try
to set it to Permissive.

Update from feedback
2017-02-18 11:54:25 -08:00
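A minimal sketch of that guard (task names are illustrative):

```
- name: Check if SELinux config exists
  stat:
    path: /etc/selinux/config
  register: selinux_config

- name: Set SELinux to permissive
  selinux:
    policy: targeted
    state: permissive
  when: selinux_config.stat.exists
```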
Matthew Mosesohn
475a42767a Suppress logging for download image
This generates too much output, and during upgrade scenarios
it can bring us over the 4MB limit.
2017-02-18 19:10:26 +04:00
Matthew Mosesohn
ce4eefff6a Use first kube-master to check results 2017-02-18 14:11:51 +04:00
Matthew Mosesohn
82b247d1a4 Adapt advanced network checker for scale
Skip nodes not in ansible play (via --limit)
2017-02-18 14:09:57 +04:00
Matthew Mosesohn
a21eb036ee Add no_log to cert tar tasks
This works around 4MB limit for gitlab CI runner.
2017-02-18 14:09:57 +04:00
Matthew Mosesohn
9c1701f2aa Add synthetic scale deployment mode
New deploy modes: scale, ha-scale, separate-scale
Creates 200 fake hosts for deployment with fake hostvars.

Useful for testing certificate generation and propagation to other
master nodes.

Updated test cases descriptions.
2017-02-18 14:09:55 +04:00
Andrew Greenwood
fd17c37feb Regex syntax changes in yml mode 2017-02-17 17:30:39 -05:00
Andrew Greenwood
cde5451e79 Syntax Bugfix 2017-02-17 17:08:44 -05:00
Andrew Greenwood
ca9ea097df Cleanup legacy syntax, spacing, files all to yml
Migrate older inline= syntax to pure yml syntax for module args, to be consistent with most of the rest of the tasks
Clean up some spacing in various files
Rename some files named yaml to yml for consistency
2017-02-17 16:22:34 -05:00
Antoine Legrand
b84cc14694 Merge pull request #1029 from mattymo/graceful
Add graceful upgrade process
2017-02-17 21:24:32 +01:00
Vladimir Rutsky
a84175b3b9 fix typo: "infrastructore" 2017-02-17 23:27:38 +04:00
Vladimir Rutsky
438b4e9625 fix typos in azure docs 2017-02-17 21:39:22 +04:00
Matthew Mosesohn
a510e7b8f3 Use gce hostname as inventory name
Calico does not allow renaming hosts
2017-02-17 20:21:58 +03:00
Antoine Legrand
e16ebcad6e Merge pull request #1042 from holser/fix_facts
Fix fact tags
2017-02-17 17:56:29 +01:00
Sergii Golovatiuk
e91e58aec9 Fix fact tags
The Ansible playbook fails when tags are limited to "facts,etcd" or to
"facts". This patch allows running ansible-playbook to gather only the
facts that don't require calico/flannel/weave components to be verified,
e.g. running ansible with 'facts,bootstrap-os' or just 'facts'.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-17 12:32:33 +01:00
Antoine Legrand
3629b9051d Merge pull request #1038 from rutsky/kubelet-mount-var-log
Mount host's /var/log into kubelet container
2017-02-17 10:26:12 +01:00
Antoine Legrand
ef919d963b Merge pull request #1040 from retr0h/vagrant-config
Better control instance sizing
2017-02-17 10:25:09 +01:00
Antoine Legrand
4545114408 Merge pull request #1037 from mattymo/coreos_fix
Fix references to CoreOS and Container Linux by CoreOS
2017-02-17 10:21:14 +01:00
Smaine Kahlouch
9ed32b9dd0 Merge pull request #1036 from rutsky/fix-kibana-default-base-url
fix typo in "kibana_base_url" variable name
2017-02-17 07:03:59 +01:00
John Dewey
45dbe6d542 Better control instance sizing
* Git ignore the user controlled config.rb.
* Ability to better control the number of instances running.
2017-02-16 13:09:34 -08:00
Vladimir Rutsky
bff955ff7e Mount host's /var/log into kubelet container
Kubelet is responsible for creating symlinks from /var/lib/docker to
/var/log to make the fluentd logging collector work.
However, without using the host's /var/log those links are invisible to
fluentd.

This is done on rkt configuration too.
2017-02-16 22:31:05 +03:00
Matthew Mosesohn
80c0e747a7 Fix references to CoreOS and Container Linux by CoreOS
Fixes #967
2017-02-16 19:25:17 +03:00
Matthew Mosesohn
617edda9ba Adjust weave daemonset for serial deployment 2017-02-16 18:24:30 +03:00
Vladimir Rutsky
7ab04b2e73 fix typo in "kibana_base_url" variable name
This typo led to kibana_base_url being undefined, so Kibana used the
default base URL ("/"), which is incorrect with the default proxy-based
access.
2017-02-16 18:17:06 +03:00
Antoine Legrand
e89056a614 Merge pull request #1033 from rutsky/reset-confirmation
ask confirmation before running reset.yml playbook
2017-02-16 16:10:58 +01:00
Matthew Mosesohn
97ebbb9672 Add graceful upgrade process
Based on #718 introduced by rsmitty.

Includes all roles and all options to support deployment of
new hosts in case they were added to inventory.

Main difference here is that master role is evaluated first
so that master components get upgraded first.

Fixes #694
2017-02-16 17:18:38 +03:00
Vladimir Rutsky
c02213e4af force reset confirmation in CI 2017-02-16 16:35:01 +03:00
Smaine Kahlouch
73e0aeb4ca Merge pull request #1031 from mattymo/defaultcalico
Change default network plugin to Calico
2017-02-16 14:04:12 +01:00
Vladimir Rutsky
a1ec6f401c fix load balancer DNS name index evaluation in openssl.conf
Looks like OpenSSL still properly handles it, even with duplicated
"DNS.X" items.
2017-02-16 00:16:13 +03:00
Vladimir Rutsky
5337d37a1c ask confirmation before running reset.yml playbook 2017-02-15 21:05:46 +03:00
Matthew Mosesohn
d92d955aeb Merge pull request #985 from rutsky/check-mode-for-shell-commands
set "check_mode: on" for read-only "shell" steps that registers result
2017-02-15 17:53:41 +03:00
Matthew Mosesohn
7ac84d386c Merge pull request #1030 from rutsky/remove-swp
remove temporary file
2017-02-15 17:44:41 +03:00
Vladimir Rutsky
8397baa700 remove temporary file 2017-02-15 17:40:05 +03:00
Matthew Mosesohn
2d65554cb9 Change default network plugin to Calico 2017-02-15 16:15:22 +03:00
Matthew Mosesohn
64e40d471c Merge pull request #1028 from holser/ansible.cfg
Add timings to RECAP output.
2017-02-15 12:41:49 +03:00
Sergii Golovatiuk
c5ea29649b Add timings to RECAP output.
- Starting from version 2.0 ansible has 'callback_whitelist =
  profile_tasks'. It allows analyzing CI runs to find time regressions.
- Add skippy to CI's ansible.cfg

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-14 18:47:02 +01:00
Antoine Legrand
410438a0e3 Merge pull request #1008 from bradbeam/rkt-proxy
Adding support for proxy w/ rkt kubelet
2017-02-14 17:52:21 +01:00
Spencer Smith
fbaef7e60f specify grace period for draining 2017-02-14 18:51:13 +03:00
Spencer Smith
017a813621 first cut of an upgrade process 2017-02-14 18:51:13 +03:00
Brad Beam
4c891b8bb0 Adding support for proxy w/ rkt kubelet 2017-02-14 08:09:49 -06:00
Matthew Mosesohn
948d9bdadb Merge pull request #1019 from mattymo/issue1011
Update calico to v1.0.2
2017-02-14 14:01:25 +03:00
Matthew Mosesohn
b7258ec3bb Merge pull request #1013 from mattymo/remove_masqerade_all
Disable kube_proxy_masquerade_all
2017-02-14 14:00:29 +03:00
Antoine Legrand
93cb5a5bd6 Merge pull request #1027 from hvnsweeting/master
Multiples doc fixes
2017-02-14 11:39:22 +01:00
Hung Nguyen Viet
d8f46c4410 Highlight important action 2017-02-14 17:18:25 +07:00
Hung Nguyen Viet
d0757ccc5e Fix typo 2017-02-14 17:18:22 +07:00
Antoine Legrand
f4f730bd8a Merge pull request #1025 from holser/bug/961
Install pip on Ubuntu
2017-02-14 10:31:42 +01:00
Matthew Mosesohn
f5e27f1a21 Merge pull request #1021 from holser/remove_deprecated
Replace always_run with check_mode
2017-02-14 11:25:58 +03:00
Matthew Mosesohn
bb6415ddc4 Merge pull request #1015 from holser/rkt_ssl_ca_dirs
Set ssl_ca_dirs for rkt based on fact
2017-02-14 11:25:17 +03:00
Sergii Golovatiuk
2b6179841b Install pip on Ubuntu
- Refactor 'Check if bootstrap is needed' as an ansible loop. This allows
  adding new elements easily without refactoring. Add pip to the list.
- Refactor the 'Install python 2.x' task to run once if any of the rc
  codes != 0. need_bootstrap is an array of hashes, so map gets a
  single array of rc statuses; the statuses are sorted, and the last
  (highest) element is taken and converted to bool.

Closes: #961
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-13 19:35:13 +01:00
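A rough reconstruction of the loop described above (the binary list and package names are examples, not the exact ones added):

```
- name: Check if bootstrap is needed
  raw: which {{ item }}
  register: need_bootstrap
  failed_when: false
  with_items:
    - python
    - pip

- name: Install python 2.x and pip
  raw: apt-get update && apt-get install -y python-minimal python-pip
  # need_bootstrap.results is a list of hashes; collect the rc of each
  # check, take the highest, and bootstrap if any was non-zero
  when: need_bootstrap.results | map(attribute='rc') | sort | last | bool
```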
Antoine Legrand
e877cd2874 Merge pull request #1024 from holser/bug/961
Install pip on Ubuntu
2017-02-13 17:53:57 +01:00
Matthew Mosesohn
203ddfcd43 Merge pull request #1023 from mattymo/fix_dnsmasq_cleanup
Clean up dnsmasq purge task
2017-02-13 19:50:01 +03:00
Vladimir Rutsky
09847567ae set "check_mode: no" for read-only "shell" steps that registers result
"shell" step doesn't support check mode, which currently leads to failures,
when Ansible is being run in check mode (because Ansible doesn't run command,
assuming that command might have effect, and no "rc" or "output" is registered).

Setting "check_mode: no" allows to run those "shell" commands in check mode
(which is safe, because those shell commands doesn't have side effects).
2017-02-13 18:53:41 +03:00
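The pattern, as a small sketch (the command is a placeholder):

```
- name: Gather currently deployed version          # read-only probe
  shell: cat /etc/kubernetes/deployed_version 2>/dev/null
  register: deployed_version
  check_mode: no        # safe: the command has no side effects
  changed_when: false
```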
Sergii Golovatiuk
732ae69d22 Install pip on Ubuntu
Closes: #961
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-13 16:27:09 +01:00
Greg Althaus
2b10376339 When resolv.conf changes during host_resolvconf mode, we need to
restart the controller to get the new file configuration.
I'm not fond of this form and would like a better way, but this
seems to "work".
2017-02-13 09:20:02 -06:00
Antoine Legrand
9667ac3baf Merge pull request #1022 from kubernetes-incubator/ant31-patch-1
Document gitlab-runner.sh
2017-02-13 15:40:34 +01:00
Matthew Mosesohn
b5be335db3 Clean up dnsmasq purge task 2017-02-13 17:30:15 +03:00
Antoine Legrand
d33945780d Document gitlab-runner.sh 2017-02-13 15:04:35 +01:00
Sergii Golovatiuk
5f4cc3e1de Replace always_run with check_mode
always_run was deprecated in Ansible 2.2 and will be removed in 2.4;
ansible logs contain "[DEPRECATION WARNING]: always_run is deprecated.
Use check_mode = no instead". This patch fixes the deprecation.
2017-02-13 15:00:56 +01:00
Matthew Mosesohn
ec567bd53c Update calico to v1.0.2
Also calico-cni to v1.5.6, calico-policy to v0.5.2

Fixes: #1011
2017-02-13 15:39:25 +03:00
Sergii Golovatiuk
aeadaa1184 Set ssl_ca_dirs for rkt based on fact
Since the systemd kubelet.service has {{ ssl_ca_dirs }}, the fact should
be gathered before writing kubelet.service.

Closes: #1007
Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-13 13:28:29 +01:00
Matthew Mosesohn
2f0f0006e3 Merge pull request #988 from mattymo/feat/rolling3
Add CI cases for testing upgrade from v2.0.1 release
2017-02-10 18:09:43 +03:00
Matthew Mosesohn
de047a2b8c Merge pull request #983 from vwfs/centos_kernel_upgrade
Add kernel upgrade for CentOS
2017-02-10 14:40:27 +03:00
Antoine Legrand
86a35652bb Merge pull request #1009 from mattymo/dnsmasq_updates
Enable reset of dnsmasq if manifest or config changes
2017-02-10 11:43:09 +01:00
Matthew Mosesohn
6ae70e03cb fixup upgrades for canal and weave 2017-02-10 13:27:41 +03:00
Matthew Mosesohn
2c532cb74d Disable kube_proxy_masquerade_all
Fixes #1012
2017-02-10 13:16:39 +03:00
Matthew Mosesohn
779f20d64e Merge pull request #1010 from bogdando/fixes
Fix misleading HA docs
2017-02-10 13:01:29 +03:00
Bogdan Dobrelya
89ae9f1f88 Merge pull request #1002 from code0x9/master
use ansible sysctl module for config ip forwarding
2017-02-10 10:40:18 +01:00
Bogdan Dobrelya
ed1ab11001 Fix misleading HA docs
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-02-10 10:28:27 +01:00
Alexander Block
d2e010cbe1 Add kernel upgrade for CentOS 2017-02-10 09:29:12 +01:00
Matthew Mosesohn
a44a0990f5 Enable reset of dnsmasq if manifest or config changes 2017-02-10 10:40:07 +04:00
Matthew Mosesohn
2f88c9eefe Merge pull request #989 from holser/kubelet_remedy
Kubernetes Reliability Improvements
2017-02-10 09:29:29 +03:00
Matthew Mosesohn
60f1936a62 Merge pull request #1004 from galthaus/kubelet-load-modules
Allow kubelet to load kernel modules
2017-02-10 09:28:16 +03:00
Matthew Mosesohn
ee15f99dd7 Add CI cases for testing upgrade from v2.0.1 release
These are manual trigger jobs, but should be run if any PR
impacts upgrades.
2017-02-10 10:20:58 +04:00
Matthew Mosesohn
b0ee27ba46 Merge pull request #1006 from mattymo/fix_weave_upgrade
Enable weave upgrade from previous versions
2017-02-10 09:03:49 +03:00
Antoine Legrand
067bbaa473 Merge pull request #1001 from idcrook/kargo-issue-1000-efk-enable
removed explicit role for efk in cluster.yml
2017-02-10 03:03:18 +01:00
Sergii Golovatiuk
c07d60bc90 Kubernetes Reliability Improvements
- Exclude kubelet CPU/RAM (kube-reserved) from the cgroup. It decreases
  the chance of overcommitment
- Add a possibility to modify the kubelet node-status-update-frequency
- Add a possibility to configure node-monitor-grace-period,
  node-monitor-period, pod-eviction-timeout for the Kubernetes controller
  manager
- Add Kubernetes Reliability Documentation with recommendations for
  various scenarios.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-09 23:54:08 +01:00
Matthew Mosesohn
29fd957352 Enable weave upgrade from previous versions
Raise readiness probe initial time to 60 (was 30)
2017-02-09 21:39:31 +03:00
Matthew Mosesohn
ef10ce04e2 Merge pull request #1005 from rutsky/patch-2
fix kube_apiserver_ip/kube_apiserver_port description
2017-02-09 21:08:15 +03:00
Vladimir Rutsky
f0269b28f4 fix kube_apiserver_ip/kube_apiserver_port description 2017-02-09 21:47:36 +04:00
Matthew Mosesohn
0a7c6eb9dc Merge pull request #998 from mattymo/fix_upgrade_daemonsets
Fix upgrade for all daemonset type resources
2017-02-09 20:02:21 +03:00
Greg Althaus
3f0c13af8a Make kubelet_load_modules always present but false.
Update code and docs for that assumption.
2017-02-09 10:25:44 -06:00
Greg Althaus
fcd78eb1f7 Due to the nsenter and other reworks, it appears that
kubelet lost the ability to load kernel modules.  This
puts that back by adding the lib/modules mount to kubelet.

The new variable kubelet_load_modules can be set to true
to enable this item.  It is OFF by default.
2017-02-09 10:02:26 -06:00
Matthew Mosesohn
17dfae6d4e Merge pull request #999 from holser/decrease_weave_ram_limits
Lower weave RAM settings.
2017-02-09 13:19:12 +03:00
Mark Lee
e414c25fd7 follow sysctl.conf file symlink if linked 2017-02-09 18:16:52 +09:00
Mark Lee
34a71554ae use ansible sysctl module for config ip forwarding 2017-02-09 17:28:44 +09:00
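What "use the sysctl module" looks like in practice, as an illustrative task (the sysctl_file variable is an assumption tied to the symlink fix above):

```
- name: Enable IP forwarding
  sysctl:
    name: net.ipv4.ip_forward
    value: 1
    sysctl_file: "{{ sysctl_file_path | default('/etc/sysctl.conf') }}"
    state: present
    reload: yes
```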
Bogdan Dobrelya
3b1a196c75 Merge pull request #902 from insequent/master
Adding vault role
2017-02-09 09:24:52 +01:00
Bogdan Dobrelya
105dbf471e Merge pull request #993 from code0x9/master
enable proxy support on docker repository
2017-02-09 09:21:01 +01:00
David Crook
d4d9f27a8d removed explicit role for efk in cluster.yml 2017-02-08 20:48:28 -07:00
Antoine Legrand
68df0d4909 Merge pull request #986 from vwfs/dnsmasq_system_nameservers
Also add the system nameservers to upstream servers in dnsmasq
2017-02-08 23:21:54 +01:00
Antoine Legrand
9c572fe54b Merge pull request #984 from rutsky/patch-2
fix typo: "explicetely"
2017-02-08 23:19:01 +01:00
Josh Conant
245e05ce61 Vault security hardening and role isolation 2017-02-08 21:41:36 +00:00
Josh Conant
f4ec2d18e5 Adding the Vault role 2017-02-08 21:31:28 +00:00
Sergii Golovatiuk
4124d84c00 Lower weave RAM settings.
- Since Weave 1.8.x was rewritten in Golang we may decrease RAM settings
  to continue using g1-small for CI
2017-02-08 18:50:36 +01:00
Matthew Mosesohn
3c713a3f53 Fix upgrade for all daemonset type resources
Daemonsets cannot be simply upgraded through a single API call,
regardless of any kubectl documentation. The resource must be
purged and then recreated in order to make any changes.
2017-02-08 18:16:00 +03:00
Alexander Block
89e570493a Also add the system nameservers to upstream servers in dnsmasq
Also make no-resolv unconditional again; otherwise we may end up in
a resolver loop. The resolver loop was the cause of the piling-up
parallel queries.
2017-02-08 14:38:55 +01:00
Matthew Mosesohn
16674774c7 Merge pull request #994 from mattymo/docker_save
Change docker save compress level to 1
2017-02-08 15:13:15 +03:00
Matthew Mosesohn
0180ad7f38 Merge pull request #990 from mattymo/fix_cert_upgrade
Fix check for node-NODEID certs existence
2017-02-08 14:44:09 +03:00
Matthew Mosesohn
bfd1ea1da1 Merge pull request #971 from bradbeam/efk
Adding EFK logging stack
2017-02-08 14:28:04 +03:00
Mark Lee
3eacd0c871 Update rh_docker.repo.j2 2017-02-08 20:03:51 +09:00
Matthew Mosesohn
d587270293 Merge pull request #992 from vwfs/host_mount_dev
Host mount /dev for kubelet
2017-02-08 13:45:22 +03:00
Matthew Mosesohn
3eb13e83cf Change docker save compress level to 1
Faster gzip improves CI deploy times by at least 2 mins.

Fixes #982
2017-02-08 13:25:11 +03:00
Mark Lee
df761713aa Merge branch 'master' of https://github.com/kubespray/kargo 2017-02-08 19:19:26 +09:00
Mark Lee
de50f37fea enable proxy support on docker repository 2017-02-08 19:19:08 +09:00
Matthew Mosesohn
bad6076905 Merge pull request #987 from mattymo/etcd-retune
Re-tune ETCD performance params
2017-02-08 13:00:25 +03:00
Bogdan Dobrelya
c2bd76a22e Merge pull request #956 from adidenko/update-netchecker
Update playbooks to support new netchecker
2017-02-08 10:09:46 +01:00
Alexander Block
010fe30b53 Host mount /dev for kubelet 2017-02-08 09:55:51 +01:00
Matthew Mosesohn
e5779ab786 Fix check for node-NODEID certs existence
Fixes upgrade from pre-individual node cert envs.
2017-02-07 21:06:48 +03:00
Matthew Mosesohn
71e14a13b4 Re-tune ETCD performance params
Reduce election timeout to 5000ms (was 10000ms)
Raise heartbeat interval to 250ms (was 100ms)
Remove etcd cpu share (was 300)
Make etcd_cpu_limit and etcd_memory_limit optional.
2017-02-07 20:15:14 +03:00
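The new values, sketched as inventory overrides (variable names are assumptions matching the description above):

```
etcd_election_timeout: 5000      # ms, lowered from 10000
etcd_heartbeat_interval: 250     # ms, raised from 100
# etcd_cpu_limit / etcd_memory_limit are now optional and may be unset
```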
Matthew Mosesohn
491074aab1 Merge pull request #969 from mattymo/port_reserve
Prevent dynamic port allocation in nodePort range
2017-02-07 18:24:57 +03:00
Aleksandr Didenko
54af533b31 Update playbooks to support new netchecker
Netchecker is rewritten in Go with some new args instead of
env variables. Also netchecker-server no longer requires a kubectl
container. Updating playbooks accordingly.
2017-02-07 15:20:34 +01:00
Matthew Mosesohn
4f13043d14 Merge pull request #976 from holser/bug/975
Improve Weave
2017-02-06 22:48:13 +03:00
Vladimir Rutsky
6a5df4d999 fix typo: "pubilcally" 2017-02-06 21:35:02 +04:00
Vladimir Rutsky
d41602088b fix typo: "explicetely" 2017-02-06 21:29:11 +04:00
Matthew Mosesohn
f3a0f73588 Prevent dynamic port allocation in nodePort range
kube_apiserver_node_port_range should be accessible only
to kube-proxy and not be taken by a dynamic port allocation.

Potentially temporary if https://github.com/kubernetes/kubernetes/issues/40920
gets fixed.
2017-02-06 20:01:16 +03:00
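One way to reserve that range from the ephemeral allocator, assuming the kernel's ip_local_reserved_ports knob (a sketch, not necessarily the PR's exact task):

```
- name: Reserve the nodePort range from dynamic port allocation
  sysctl:
    name: net.ipv4.ip_local_reserved_ports
    value: "{{ kube_apiserver_node_port_range | default('30000-32767') }}"
    state: present
```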
Matthew Mosesohn
be1e1b41bd Merge pull request #981 from kubernetes-incubator/revert-911-DROP_CAPS
Revert "Drop linux capabilities and rework users/groups"
2017-02-06 17:52:58 +03:00
Matthew Mosesohn
fd30131dc2 Revert "Drop linux capabilities and rework users/groups" 2017-02-06 15:58:54 +03:00
Sergii Golovatiuk
5122697f0b Improve Weave
- Remove weave CPU limits from .gitlab-ci.yml. Closes: #975
- Fix weave version in documentation

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-06 13:24:40 +01:00
Bogdan Dobrelya
b7bf502e02 Merge pull request #978 from rutsky/patch-1
remove extra `~`
2017-02-06 12:07:54 +01:00
Bogdan Dobrelya
3f70e3a843 Merge pull request #977 from holser/bug/973
Add .swp .swo .swn to .gitignore
2017-02-06 12:07:07 +01:00
Bogdan Dobrelya
cae2982d81 Merge pull request #911 from bogdando/DROP_CAPS
Drop linux capabilities and rework users/groups
2017-02-06 12:05:51 +01:00
Vladimir Rutsky
b638c89556 remove extra ~ 2017-02-06 15:05:24 +04:00
Bogdan Dobrelya
9bc51bd0e2 Merge pull request #972 from kubernetes-incubator/update-roadmap
Update roadmap.md
2017-02-06 12:03:09 +01:00
Sergii Golovatiuk
408b4f3f42 Add .swp .swo .swn to .gitignore
According to http://vimdoc.sourceforge.net/htmldoc/recover.html, vim
creates .swo, .swn, and .swp files. This patch adds them to .gitignore in
all directories recursively.

Closes: #973
2017-02-06 12:00:49 +01:00
Antoine Legrand
d818ac1d59 Update roadmap.md 2017-02-04 23:23:24 +01:00
Antoine Legrand
bd1c764a1a Merge pull request #963 from rutsky/bastion-ansible-host
handle both 'ansible_host' and 'ansible_ssh_host' in bastion configuration
2017-02-04 15:42:39 -05:00
Antoine Legrand
8f377ad8bd Merge pull request #968 from rutsky/remove-deprecated-ubuntu-bootstrap
remove deprecated ubuntu-bootstrap.yml script
2017-02-04 15:36:49 -05:00
Brad Beam
df3e11bdb8 Adding EFK logging stack 2017-02-03 16:27:08 -06:00
Vladimir Rutsky
97dabbe997 remove deprecated ubuntu-bootstrap.yml script
Signed-off-by: Vladimir Rutsky <rutsky.vladimir@gmail.com>
2017-02-03 15:02:17 +03:00
Bogdan Dobrelya
5a7a3f6d4a Merge pull request #949 from vmtyler/master
Fixes Support for OpenStack v3 credentials
2017-02-03 12:22:00 +01:00
Vladimir Rutsky
b4327fdc99 handle both 'ansible_host' and 'ansible_ssh_host' in bastion configuration
'ansible_ssh_host' is deprecated in Ansible 2.0 and at least
'contrib/inventory_builder/inventory.py' uses 'ansible_host' instead.
2017-02-02 18:34:53 +03:00
Matthew Mosesohn
10f924a617 Merge pull request #927 from holser/nsenter_fix
Remove nsenter workaround
2017-02-02 18:18:15 +03:00
Matthew Mosesohn
3dd6a01c8b Merge pull request #901 from galthaus/dns-tweak
DHCP Hook protections
2017-02-02 16:47:16 +03:00
Sergii Golovatiuk
585afef945 Remove nsenter workaround
- Docker 1.12 and later don't need the nsenter hack. This patch removes
  it. Also, it bumps the minimum version to 1.12.

Closes #776

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-02-02 14:38:11 +01:00
Matthew Mosesohn
bdc65990e1 Merge pull request #958 from holser/fix_weave_cpu
Fix CPU out of scope for Weave-net
2017-02-02 16:05:47 +03:00
Sergii Golovatiuk
f2e4ffcac2 Fix weave-net after upgrade to 1.8.2
- Set recommended CPU settings
- Cleans up upgrade to weave 1.8.2. The original WeaveWorks
daemonset definition uses the weave-net name.
- Limit DS creation to master
- Combined 2 tasks into one with better condition
2017-02-02 10:31:58 +01:00
Matthew Mosesohn
ae66b6e648 Merge pull request #957 from mattymo/weave-net-naming
Rename weave-kube to weave-net
2017-02-02 10:18:02 +03:00
Greg Althaus
923057c1a8 This continues the DHCP hook checks. Also protect the create side
if the system doesn't have any config files at all.
2017-01-31 09:56:27 -06:00
Matthew Mosesohn
0f6e08d34f Merge pull request #951 from mattymo/k8s-certs-scale
Fix cert distribution at scale
2017-01-31 18:49:26 +03:00
Matthew Mosesohn
4889a3e2e1 Merge pull request #954 from artem-panchenko/improve_dnsmasq
Explicitly set config path for DNSMasq
2017-01-31 18:48:46 +03:00
Matthew Mosesohn
39d87a96aa Rename weave-kube to weave-net
Cleans up upgrade to weave 1.8.2. The original WeaveWorks
daemonset definition uses the weave-net name.
2017-01-31 18:47:27 +03:00
Bogdan Dobrelya
e7c03ba66a Merge pull request #955 from mattymo/disable-idempotency-check
Disable idempotency for ubuntu-weave-sep
2017-01-31 14:55:27 +01:00
Matthew Mosesohn
08822ec684 Fix cert distribution at scale
Use stdin instead of bash args to pass node filenames and base64 data.
Use tempfile for master cert data
2017-01-31 16:27:45 +03:00
Matthew Mosesohn
6463a01e04 Merge pull request #880 from bradbeam/weave-kube
Weave kube
2017-01-31 13:31:09 +03:00
Matthew Mosesohn
0cf1850465 Disable idempotency for ubuntu-weave-sep
CI is failing 40% of the time due to errors in reset.
Let's disable idempotency check per-patch until we fix it.

Fixes #953
2017-01-31 13:23:27 +03:00
Artem Panchenko
1418fb394b Explicitly set config path for DNSMasq
When DNSMasq is configured to read its settings
from a folder ('-7' or '--conf-dir' option), it only
checks that the directory exists and doesn't fail if
it's empty. This can lead to a situation where DNSMasq
is running and handling requests but is not properly
configured, so some queries can't be resolved.
2017-01-31 12:14:57 +02:00
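A minimal sketch of the safer shape as a container spec fragment (image and file path invented for illustration): pointing dnsmasq at a concrete file makes it fail fast instead of silently serving with no config:

    containers:
      - name: dnsmasq
        image: andyshinn/dnsmasq:2.72    # image assumed for illustration
        args:
          - --no-resolv
          - --conf-file=/etc/dnsmasq.d/01-kube-dns.conf  # dnsmasq exits if this file is missing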
Matthew Mosesohn
e4eda88ca9 Merge pull request #944 from tureus/skip-cloud-config-on-etcd
Bugfix: skip cloud_config on etcd
2017-01-30 20:12:36 +03:00
Bogdan Dobrelya
71a3c97d6f Merge pull request #943 from bradbeam/cilint
Fixing lint check for ci
2017-01-30 09:19:44 +01:00
Antoine Legrand
1c3d2924ae Merge pull request #947 from bradbeam/libs
Consolidating kube.py module
2017-01-29 00:02:32 +01:00
Brad Beam
a11b9d28bd Upgrading weave to weave-kube 2017-01-27 17:05:25 -06:00
Brad Beam
b54eb609bf Consolidating kube.py module 2017-01-27 11:28:11 -06:00
Bogdan Dobrelya
dc8ff413f9 Merge pull request #948 from mattymo/update_coreos
Update coreos-stable image
2017-01-27 17:53:17 +01:00
Tyler Britten
f8ffa1601d Fixed for non-null output 2017-01-27 10:47:59 -05:00
Tyler Britten
da01bc1fbb Updated OpenStack vars to check for tenant_id (v2) and project_id (v3) 2017-01-27 10:26:20 -05:00
Matthew Mosesohn
a2079a9ca9 Update coreos-stable image
Our old coreos-stable image has docker 1.10
2017-01-27 16:20:40 +04:00
neith00
bbc8c09753 Using the command module instead of raw
Also fixed the syntax.
2017-01-26 16:28:48 +01:00
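The distinction, as a hedged sketch (tasks invented for illustration): raw ships a bare command string over SSH and needs no Python on the target, while command goes through the module subsystem and supports options such as creates:

    - name: Bootstrap python (no python required on the target yet)
      raw: test -e /usr/bin/python || apt-get -y install python

    - name: The same install via the command module (python must already exist)
      command: apt-get -y install python
      args:
        creates: /usr/bin/python   # makes the task idempotent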
Matthew Mosesohn
a627299468 Merge pull request #941 from adidenko/use_ansible_hostname_in_calico
Switch to ansible_hostname in calico
2017-01-26 13:06:35 +03:00
Xavier Lange
e5fdc63bdd Bugfix: skip cloud_config on etcd 2017-01-25 14:09:21 -08:00
Brad Beam
fe83e70074 Fixing lint check for ci 2017-01-25 09:54:32 -06:00
Aleksandr Didenko
46c177b982 Switch to ansible_hostname in calico
For consistency with kubernetes services we should use the same
hostname for nodes, which is 'ansible_hostname'.

Also fixing a missed 'kube-node' in templates: Calico is installed
on 'k8s-cluster' hosts, not only 'kube-node'.
2017-01-25 11:49:58 +01:00
Bogdan Dobrelya
1df50adc1c Merge pull request #933 from frozenice/hide-skipped-hosts
add skippy stdout callback
2017-01-25 10:33:20 +01:00
Bogdan Dobrelya
b6cd9a4c4b Merge pull request #938 from bradbeam/ci
Splitting out moderator check from syntax check
2017-01-25 10:12:11 +01:00
Brad Beam
2333ec4d1f Splitting out moderator check from syntax check
- Attempt to clarify CI runs from contributors
2017-01-24 23:05:12 -06:00
Bogdan Dobrelya
85a8a54d3e Merge pull request #935 from sc68cal/terraform_groupvars_update
Update the group_vars for Terraform
2017-01-24 11:33:17 +01:00
Bogdan Dobrelya
7294a22901 Merge pull request #934 from frozenice/use-api-pwd-for-root
also use kube_api_pwd for root account
2017-01-24 11:24:02 +01:00
Matthew Mosesohn
f4b7474ade Merge pull request #926 from adidenko/fix-calico-rr-for-masters
Fix calico-rr peering with k8s masters
2017-01-24 12:38:52 +03:00
Matthew Mosesohn
9428321607 Merge pull request #932 from vwfs/centos_pin_docker_version
Pin docker version on RedHat and CentOS to the desired version
2017-01-24 12:21:50 +03:00
Matthew Mosesohn
882544446a Merge pull request #928 from sc68cal/terraform_identity_version
Specify the version of the credentials to download from Horizon
2017-01-24 12:21:27 +03:00
Sean M. Collins
73160c9b90 Update terraform's group_vars to be a symlink
That way, it will not become stale.

Related bug #929
2017-01-23 16:08:37 -05:00
Sean M. Collins
2184d6a3ff Specify the version of the credentials to download from Horizon
More recent versions of OpenStack Horizon provide Identity v2 and
Identity v3 versions of the RC file.
2017-01-23 14:52:51 -05:00
David Kirstein
6e35895b44 also use kube_api_pwd for root account
This makes it a bit more secure. Also, the password can now be changed with an (inventory) variable (no need to edit all.yml).
2017-01-23 19:09:30 +01:00
David Kirstein
8009ff8537 add skippy stdout callback
It removes the teal lines when a host is skipped for a task. This makes the output less spammy and much easier to read. Empty TASK blocks are still included in the output, but that's ok.
2017-01-23 18:53:14 +01:00
Alexander Block
9bf792ce0b Pin docker version on RedHat and CentOS to the desired version 2017-01-23 12:39:54 +01:00
Aleksandr Didenko
f05aaeb329 Fix calico-rr peering with k8s masters
Calico-rr is broken for deployments with separate k8s-master and
k8s-node roles. In order to fix it we should peer k8s-cluster
nodes with calico-rr, not just k8s-node. The same applies to peering
with routers.

Closes #925
2017-01-23 10:19:09 +01:00
Bogdan Dobrelya
1bdf34e7dc Merge pull request #915 from bradbeam/ci
Sorting ansible args, fixed ci cluster_mode
2017-01-20 09:43:10 +01:00
Bogdan Dobrelya
cd25bfca91 Merge pull request #884 from mattymo/inventory_builder_scale
Add scale thresholds to split etcd and k8s-masters
2017-01-20 09:34:45 +01:00
Bogdan Dobrelya
1b621ab81c Merge pull request #873 from crodetsky/fix_test_cases
Genericize test cases and namespace create pod
2017-01-20 09:30:35 +01:00
Bogdan Dobrelya
cb2e5ac776 Drop linux capabilities and rework users/groups
* Drop linux capabilities for unprivileged containerized
  workloads Kargo configures for deployments.
* Configure required securityContext/user/group/groups for kube
  components' static manifests, etcd, calico-rr and k8s apps,
  like dnsmasq daemonset.
* Rework cloud-init (etcd) users creation for CoreOS.
* Fix nologin paths, adjust defaults for addusers role and ensure
  supplementary groups membership added for users.
* Add netplug user for network plugins (yet unused by privileged
  networking containers though).
* Grant the kube and netplug users read access for etcd certs via
  the etcd certs group.
* Grant group read access to kube certs via the kube cert group.
* Remove privileged mode for calico-rr and run it under its uid/gid
  and supplementary etcd_cert group.
* Adjust docs.
* Align cpu/memory limits and dropped caps with added rkt support
  for control plane.

Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-20 08:50:42 +01:00
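As a hedged Kubernetes manifest fragment (the uid is invented for illustration), dropping capabilities and pinning the user looks roughly like:

    containers:
      - name: dnsmasq
        securityContext:
          runAsUser: 999       # hypothetical dedicated kube uid
          capabilities:
            drop:
              - ALL            # unprivileged workloads keep no linux capabilities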
Matthew Mosesohn
8ce32eb3e1 Merge pull request #905 from galthaus/async-runs
Add tasks to ensure that the first nodes have their directories for cert gen
2017-01-19 18:32:27 +03:00
Matthew Mosesohn
aae0314bda Merge pull request #904 from galthaus/nginx-port-config
Add nginx local balancer port configuration variable
2017-01-19 18:31:57 +03:00
Matthew Mosesohn
35d5248d41 Merge pull request #913 from galthaus/apps-master-only
Ansible apps should only check for api-server running on the master.
2017-01-19 18:30:58 +03:00
Matthew Mosesohn
0ccc2555d3 Merge pull request #917 from mattymo/rkt_resolvconf
Fix setting resolvconf when using rkt deploy mode
2017-01-19 18:30:21 +03:00
Matthew Mosesohn
b26a711e96 Merge pull request #916 from mattymo/update_ansible
Update Ansible to 2.2.1
2017-01-19 18:13:45 +03:00
Matthew Mosesohn
2218a052b2 Merge pull request #921 from mattymo/docker113
Add docker 1.13, update 1.12 to 1.12.6
2017-01-19 18:13:21 +03:00
Matthew Mosesohn
40f419ca54 Merge pull request #922 from holser/dnsmasq_dns-forward-max
Allow to specify number of concurrent DNS queries
2017-01-19 18:08:04 +03:00
Matthew Mosesohn
f742fc3dd1 Add scale thresholds to split etcd and k8s-masters
Also adds calico-rr group if there are standalone etcd nodes.
Now if there are 50 or more nodes, 3 etcd nodes will be standalone.
If there are 200 or more nodes, 2 kube-masters will be standalone.
If the thresholds are exceeded, the kube-node group will not include nodes that
belong to the etcd or kube-master groups (per the statements above).
2017-01-19 17:30:56 +03:00
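For a hypothetical 60-node inventory (at or above 50, below 200), the resulting split would look roughly like this sketch:

    etcd: [node1, node2, node3]    # standalone; excluded from kube-node
    kube-master: [node4, node5]    # still doubles as kube-node below 200 nodes
    # kube-node: node4 .. node60 -- every host except the standalone etcd members
    # at 200+ nodes, node4/node5 would be excluded from kube-node as well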
Matthew Mosesohn
33fbcc56d6 Add docker 1.13, update 1.12 to 1.12.6
Fixes #903
2017-01-19 13:58:36 +03:00
Sergii Golovatiuk
61d05dea58 Allow to specify number of concurrent DNS queries
ndots creates overhead, as every pod opens 5 concurrent connections
that are forwarded to SkyDNS. Under some circumstances dnsmasq may
stop forwarding traffic, logging "Maximum number of concurrent DNS
queries reached".

This patch allows configuring the number of concurrent forwarded DNS
queries ("dns-forward-max") as well as "cache-size", leaving the default
values as they were before.

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-01-19 11:47:37 +01:00
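A sketch of the resulting knobs as group vars, assuming the variable names mirror the dnsmasq option names; the values shown are dnsmasq's own defaults:

    dns_forward_max: 150   # concurrent forwarded queries; raise for ndots-heavy clusters
    dns_cache_size: 150    # dnsmasq cache-size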
Matthew Mosesohn
8a821060a3 Update Ansible to 2.2.1 2017-01-19 13:46:46 +03:00
Greg Althaus
0d44599a63 Add explicit name printing in task names for delegated tasks during
cert creation
2017-01-18 14:06:50 -06:00
crodetsky
8e29b08070 Genericize test cases and namespace create pod
This change modifies 020_check-create-pod and 030_check-network test cases to
target `kube-master[0]` instead of `node1` as these tests can be useful in
deployments that do not use the same naming convention as the basic tests.

This change also modifies 020_check-create-pod to namespace into a `test`
namespace allowing the `get pods` command to get its expected number of
running containers.

Closes #866 and #867.
2017-01-18 14:52:35 -05:00
Matthew Mosesohn
b6c3e61603 Fix setting resolvconf when using rkt deploy mode
rkt deploy mode doesn't create {{ bin_dir }}/kubelet, so
let's rely on the kubelet.env file instead.
2017-01-18 19:18:47 +03:00
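A minimal sketch of the deployment-agnostic check, assuming kubelet.env lives under the kube config directory:

    - name: Check whether kubelet is configured (docker and rkt deploys alike)
      stat:
        path: "{{ kube_config_dir }}/kubelet.env"   # path assumed for illustration
      register: kubelet_configured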
Brad Beam
dc08b75c6a Sorting ansible args, fixed ci cluster_mode
- s/separated/separate/g for cluster_mode so it now generates the correct number of instances
2017-01-18 08:03:04 -06:00
Matthew Mosesohn
5420fa942e Merge pull request #897 from holser/flush_handlers_before_etcd
Flush handlers before etcd restart
2017-01-18 12:27:01 +03:00
Matthew Mosesohn
1ee33d3a8d Merge pull request #910 from mattymo/escape_curly
Fix ansible 2.2.1 handling of registered vars
2017-01-18 11:13:01 +03:00
Greg Althaus
61dab8dc0b Should only check for api-server running on the master.
If this runs on other nodes, it will fail the playbook.
2017-01-17 15:57:34 -06:00
Greg Althaus
0022a2b29e Add doc updates. 2017-01-17 13:15:48 -06:00
Matthew Mosesohn
b2a27ed089 Fix bash completion installation 2017-01-17 20:36:58 +03:00
Matthew Mosesohn
d8ae50800a Work around escaping curly braces for docker inspect 2017-01-17 20:35:38 +03:00
Sergii Golovatiuk
43fa72b7b7 Flush handlers before etcd restart
systemctl daemon-reload should be run when a task modifies/creates a
systemd unit for etcd. Otherwise etcd won't be able to start.

Closes #892

Signed-off-by: Sergii Golovatiuk <sgolovatiuk@mirantis.com>
2017-01-17 15:04:25 +01:00
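The standard Ansible idiom for this, sketched here with an invented follow-up task:

    # Run any queued handlers (e.g. systemctl daemon-reload) immediately,
    # instead of waiting for the end of the play.
    - meta: flush_handlers

    - name: Restart etcd against the freshly loaded unit file
      service:
        name: etcd
        state: restarted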
Bogdan Dobrelya
36b62b7270 Merge pull request #896 from bogdando/idempot_check
Add idempotency checks for CI
2017-01-17 14:21:32 +01:00
Matthew Mosesohn
73204c868d Merge pull request #909 from mattymo/docker-upgrade
Always trigger docker restart when docker package changes
2017-01-17 11:37:42 +03:00
Matthew Mosesohn
2ee889843a Merge pull request #900 from galthaus/cn-length
Cert fail if inventory names too long
2017-01-16 23:39:32 +03:00
Matthew Mosesohn
74b78e75a1 Always trigger docker restart when docker package changes
Docker upgrade doesn't auto-restart docker, causing failures
when trying to start another container
2017-01-16 17:52:28 +03:00
Greg Althaus
6905edbeb6 Add a variable that defaults to kube_apiserver_port and defines
which port the local nginx proxy should listen on for HA
local balancer configurations.
2017-01-14 23:38:07 -06:00
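Sketched as group vars, with the variable name assumed rather than taken from the role:

    # hypothetical override; defaults to the apiserver port
    nginx_kube_apiserver_port: "{{ kube_apiserver_port }}"
    # e.g. move the local proxy to 8443 on hosts where the apiserver
    # itself already binds the default port:
    #nginx_kube_apiserver_port: 8443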
Greg Althaus
6c69da1573 This PR adds or modifies a few tasks to allow the playbook to
be run with --limit on each node without regard for order.

The changes make sure that all of the directories needed to do
certificate management are on the master[0] or etcd[0] node regardless
of when the playbook gets run on each node.  This allows for separate
ansible playbook runs in parallel that don't have to be synchronized.
2017-01-14 23:24:34 -06:00
Bogdan Dobrelya
e776dfd800 Add idempotency checks for CI
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-13 17:16:03 +01:00
Greg Althaus
95bf380d07 If the inventory name of the host exceeds 63 characters,
the openssl tools will fail to create signing requests because
the CN is too long.  This is mainly a problem when FQDNs are used
in the inventory file.

This will truncate the hostname for the CN field only at the
first dot. This should handle the issue for most cases.
2017-01-13 10:02:23 -06:00
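In Jinja terms, truncating at the first dot is a one-liner; a hedged sketch, not the exact template from the patch:

    # node1.very.long.example.com -> CN=node1
    cn: "{{ inventory_hostname.split('.')[0] }}"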
Bogdan Dobrelya
2a61ad1b57 Merge pull request #895 from mattymo/same_apiserver_certs
Use only one certificate for all apiservers
2017-01-13 13:05:06 +01:00
Matthew Mosesohn
80703010bd Use only one certificate for all apiservers
https://github.com/kubernetes/kubernetes/issues/25063
2017-01-13 14:03:20 +03:00
Bogdan Dobrelya
e88c10670e Merge pull request #891 from galthaus/selinux-order
preinstall fails on AWS CentOS7 image
2017-01-13 11:51:18 +01:00
Bogdan Dobrelya
2a2953c674 Merge pull request #893 from kubernetes-incubator/undo_hostresolvconf
Don't try to delete kargo specific config from dhclient when file does not exist
2017-01-13 11:35:46 +01:00
Alexander Block
1054f37765 Don't try to delete kargo specific config from dhclient when file does not exist
Also remove the check for != "RedHat" when removing the dhclient hook,
as this also had to be done on other distros. Instead, check whether
the dhclienthookfile is defined.
2017-01-13 10:56:10 +01:00
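A hedged sketch of the resulting guard (task name and shape assumed):

    - name: Remove the kargo dhclient hook where this distro defines one
      file:
        path: "{{ dhclienthookfile }}"
        state: absent
      when: dhclienthookfile is defined   # replaces the old ansible_os_family != "RedHat" check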
Greg Althaus
f77257cf79 When running on a CentOS7 image in AWS with selinux on, the tasks
fail because of their order: selinux prevents setting ip-forwarding.

Moving the tasks around addresses two issues. It makes sure that
the correct python tools are in place before adjusting selinux,
and that ip-forwarding is toggled after the selinux adjustments.
2017-01-12 10:12:21 -06:00
Bogdan Dobrelya
f004cc07df Merge pull request #830 from mattymo/k8sperhost
Generate individual certificates for k8s hosts
2017-01-12 12:42:14 +01:00
Bogdan Dobrelya
065a4da72d Merge pull request #886 from kubernetes-incubator/undo_hostresolvconf
Add tasks to undo changes to hosts /etc/resolv.conf and dhclient configs
2017-01-12 12:27:22 +01:00
Bogdan Dobrelya
98c7f2eb13 Merge pull request #887 from bogdando/docs
Clarify major/minor/maintenance releases
2017-01-12 11:55:00 +01:00
Bogdan Dobrelya
d332502d3d Clarify major/minor/maintenance releases 2017-01-12 11:25:04 +01:00
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-12 11:25:04 +01:00
Alexander Block
a7bf7867d7 Add tasks to undo changes to hosts /etc/resolv.conf and dhclient configs 2017-01-11 16:56:16 +01:00
Bogdan Dobrelya
c63cda7c21 Merge pull request #883 from bogdando/docs
Docs updates
2017-01-11 15:40:41 +01:00
Bogdan Dobrelya
caab0cdf27 Docs updates
Fix mismatching inventory examples.
Add command examples.
Clarify groups use cases.

Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-11 15:39:35 +01:00
Bogdan Dobrelya
1191876ae8 Merge pull request #882 from bogdando/releases
Clarify release policy
2017-01-11 11:45:47 +01:00
Bogdan Dobrelya
fa51a589ef Clarify release policy
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-11 11:18:21 +01:00
Matthew Mosesohn
3f274115b0 Generate individual certificates for k8s hosts 2017-01-11 12:58:07 +03:00
Matthew Mosesohn
3b0918981e Merge pull request #878 from bradbeam/rkt-cni
Adding /opt/cni /etc/cni to rkt run kubelet
2017-01-11 12:22:04 +03:00
Bogdan Dobrelya
a327dfeed7 Merge pull request #881 from bogdando/docs
Fix inventory generator link
2017-01-10 17:09:35 +01:00
Bogdan Dobrelya
d8cef34d6c Merge pull request #872 from mattymo/bug868
Bind nginx localhost proxy to localhost
2017-01-10 17:09:25 +01:00
Bogdan Dobrelya
6fb6947feb Fix inventory generator link
Signed-off-by: Bogdan Dobrelya <bogdando@mail.ru>
2017-01-10 17:02:28 +01:00
Brad Beam
db8173da28 Adding /opt/cni /etc/cni to rkt run kubelet 2017-01-10 08:48:58 -06:00
Bogdan Dobrelya
bcdfb3cfb0 Merge pull request #793 from kubernetes-incubator/fix_dhclientconf_path
Fix wrong path of dhclient on CentOS+Azure
2017-01-10 13:23:55 +01:00
Bogdan Dobrelya
79aeb10431 Merge pull request #858 from bradbeam/calicoctl-canal
Misc updates for canal
2017-01-10 12:24:59 +01:00
Matthew Mosesohn
e22f938ae5 Bind nginx localhost proxy to localhost
This proxy should only be listening for local connections, not 0.0.0.0.

Fixes #868
2017-01-09 17:19:54 +03:00
Brad Beam
cf042b2a4c Create network policy directory for canal 2017-01-05 10:54:27 -06:00
Brad Beam
65c86377fc Adding calicoctl to canal deployment 2017-01-05 10:54:27 -06:00
Alexander Block
8e4e3998dd Fix wrong path of dhclient on CentOS+Azure
This was already fixed in #755 but had to be reverted. This PR should be
more intelligent about deciding which path to use.
2016-12-21 21:51:07 +01:00
312 changed files with 6829 additions and 2243 deletions

.gitignore vendored

@@ -1,6 +1,8 @@
.vagrant
*.retry
inventory/vagrant_ansible_inventory
inventory/group_vars/fake_hosts.yml
inventory/host_vars/
temp
.idea
.tox
@@ -10,4 +12,7 @@ temp
*.pyo
*.tfstate
*.tfstate.backup
**/*.sw[pon]
/ssh-bastion.conf
**/*.sw[pon]
vagrant/


@@ -1,4 +1,5 @@
stages:
- moderator
- unit-tests
- deploy-gce-part1
- deploy-gce-part2
@@ -17,7 +18,7 @@ variables:
# us-west1-a
before_script:
- pip install ansible
- pip install ansible==2.2.1.0
- pip install netaddr
- pip install apache-libcloud==0.20.1
- pip install boto==2.9.0
@@ -46,14 +47,23 @@ before_script:
PRIVATE_KEY: $GCE_PRIVATE_KEY
GS_ACCESS_KEY_ID: $GS_KEY
GS_SECRET_ACCESS_KEY: $GS_SECRET
CLOUD_MACHINE_TYPE: "g1-small"
ANSIBLE_KEEP_REMOTE_FILES: "1"
ANSIBLE_CONFIG: ./tests/ansible.cfg
BOOTSTRAP_OS: none
DOWNLOAD_LOCALHOST: "false"
DOWNLOAD_RUN_ONCE: "false"
IDEMPOT_CHECK: "false"
RESET_CHECK: "false"
UPGRADE_TEST: "false"
RESOLVCONF_MODE: docker_dns
LOG_LEVEL: "-vv"
ETCD_DEPLOYMENT: "docker"
KUBELET_DEPLOYMENT: "docker"
VAULT_DEPLOYMENT: "docker"
WEAVE_CPU_LIMIT: "100m"
MAGIC: "ci check this"
.gce: &gce
<<: *job
<<: *docker_service
@@ -62,65 +72,172 @@ before_script:
paths:
- downloads/
- $HOME/.cache
stage: deploy-gce
before_script:
- docker info
- pip install ansible==2.1.3.0
- pip install ansible==2.2.1.0
- pip install netaddr
- pip install apache-libcloud==0.20.1
- pip install boto==2.9.0
- mkdir -p /.ssh
- cp tests/ansible.cfg .
- mkdir -p $HOME/.ssh
- echo $PRIVATE_KEY | base64 -d > $HOME/.ssh/id_rsa
- echo $GCE_PEM_FILE | base64 -d > $HOME/.ssh/gce
- echo $GCE_CREDENTIALS > $HOME/.ssh/gce.json
- chmod 400 $HOME/.ssh/id_rsa
- ansible-playbook --version
- cp tests/ansible.cfg .
- export PYPATH=$([ $BOOTSTRAP_OS = none ] && echo /usr/bin/python || echo /opt/bin/python)
script:
- pwd
- ls
- echo ${PWD}
- >
ansible-playbook tests/cloud_playbooks/create-gce.yml -i tests/local_inventory/hosts.cfg -c local $LOG_LEVEL
-e mode=${CLUSTER_MODE}
-e test_id=${TEST_ID}
-e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
ansible-playbook tests/cloud_playbooks/create-gce.yml -i tests/local_inventory/hosts.cfg -c local
${LOG_LEVEL}
-e cloud_image=${CLOUD_IMAGE}
-e cloud_region=${CLOUD_REGION}
-e gce_credentials_file=${HOME}/.ssh/gce.json
-e gce_project_id=${GCE_PROJECT_ID}
-e gce_service_account_email=${GCE_ACCOUNT}
-e gce_credentials_file=${HOME}/.ssh/gce.json
-e cloud_image=${CLOUD_IMAGE}
-e cloud_machine_type=${CLOUD_MACHINE_TYPE}
-e inventory_path=${PWD}/inventory/inventory.ini
-e cloud_region=${CLOUD_REGION}
-e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
-e mode=${CLUSTER_MODE}
-e test_id=${TEST_ID}
# Check out latest tag if testing upgrade
# Uncomment when gitlab kargo repo has tags
#- test "${UPGRADE_TEST}" != "false" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
- test "${UPGRADE_TEST}" != "false" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0
# Create cluster
- >
ansible-playbook -i inventory/inventory.ini -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS
-b --become-user=root -e cloud_provider=gce $LOG_LEVEL -e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
--private-key=${HOME}/.ssh/id_rsa
-e bootstrap_os=${BOOTSTRAP_OS}
ansible-playbook -i inventory/inventory.ini -b --become-user=root --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER
${SSH_ARGS}
${LOG_LEVEL}
-e ansible_python_interpreter=${PYPATH}
-e download_run_once=true
-e download_localhost=true
-e ansible_ssh_user=${SSH_USER}
-e bootstrap_os=${BOOTSTRAP_OS}
-e cert_management=${CERT_MGMT:-script}
-e cloud_provider=gce
-e deploy_netchecker=true
-e resolvconf_mode=${RESOLVCONF_MODE}
-e local_release_dir=${PWD}/downloads
-e download_localhost=${DOWNLOAD_LOCALHOST}
-e download_run_once=${DOWNLOAD_RUN_ONCE}
-e etcd_deployment_type=${ETCD_DEPLOYMENT}
-e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
-e kubelet_deployment_type=${KUBELET_DEPLOYMENT}
-e local_release_dir=${PWD}/downloads
-e resolvconf_mode=${RESOLVCONF_MODE}
-e vault_deployment_type=${VAULT_DEPLOYMENT}
--limit "all:!fake_hosts"
cluster.yml
# Repeat deployment if testing upgrade
- >
if [ "${UPGRADE_TEST}" != "false" ]; then
test "${UPGRADE_TEST}" == "basic" && PLAYBOOK="cluster.yml";
test "${UPGRADE_TEST}" == "graceful" && PLAYBOOK="upgrade-cluster.yml";
pip install ansible==2.2.1.0;
git checkout "${CI_BUILD_REF}";
ansible-playbook -i inventory/inventory.ini -b --become-user=root --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER
${SSH_ARGS}
${LOG_LEVEL}
-e ansible_python_interpreter=${PYPATH}
-e ansible_ssh_user=${SSH_USER}
-e bootstrap_os=${BOOTSTRAP_OS}
-e cloud_provider=gce
-e deploy_netchecker=true
-e download_localhost=${DOWNLOAD_LOCALHOST}
-e download_run_once=${DOWNLOAD_RUN_ONCE}
-e etcd_deployment_type=${ETCD_DEPLOYMENT}
-e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
-e kubelet_deployment_type=${KUBELET_DEPLOYMENT}
-e local_release_dir=${PWD}/downloads
-e resolvconf_mode=${RESOLVCONF_MODE}
-e weave_cpu_requests=${WEAVE_CPU_LIMIT}
-e weave_cpu_limit=${WEAVE_CPU_LIMIT}
--limit "all:!fake_hosts"
$PLAYBOOK;
fi
# Test Cases
## Test Master API
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root tests/testcases/010_check-apiserver.yml $LOG_LEVEL
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root --limit "all:!fake_hosts" tests/testcases/010_check-apiserver.yml $LOG_LEVEL
## Ping between 2 pods
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root tests/testcases/030_check-network.yml $LOG_LEVEL
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root --limit "all:!fake_hosts" tests/testcases/030_check-network.yml $LOG_LEVEL
## Advanced DNS checks
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root tests/testcases/040_check-network-adv.yml $LOG_LEVEL
- ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH} -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root --limit "all:!fake_hosts" tests/testcases/040_check-network-adv.yml $LOG_LEVEL
## Idempotency checks 1/5 (repeat deployment)
- >
if [ "${IDEMPOT_CHECK}" = "true" ]; then
ansible-playbook -i inventory/inventory.ini -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS
-b --become-user=root -e cloud_provider=gce $LOG_LEVEL -e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
--private-key=${HOME}/.ssh/id_rsa
-e bootstrap_os=${BOOTSTRAP_OS}
-e ansible_python_interpreter=${PYPATH}
-e download_localhost=${DOWNLOAD_LOCALHOST}
-e download_run_once=${DOWNLOAD_RUN_ONCE}
-e deploy_netchecker=true
-e resolvconf_mode=${RESOLVCONF_MODE}
-e local_release_dir=${PWD}/downloads
-e etcd_deployment_type=${ETCD_DEPLOYMENT}
-e kubelet_deployment_type=${KUBELET_DEPLOYMENT}
--limit "all:!fake_hosts"
cluster.yml;
fi
## Idempotency checks 2/5 (Advanced DNS checks)
- >
if [ "${IDEMPOT_CHECK}" = "true" ]; then
ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH}
-u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root
--limit "all:!fake_hosts"
tests/testcases/040_check-network-adv.yml $LOG_LEVEL;
fi
## Idempotency checks 3/5 (reset deployment)
- >
if [ "${IDEMPOT_CHECK}" = "true" AND "${RESET_CHECK}" = "true" ]; then
ansible-playbook -i inventory/inventory.ini -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS
-b --become-user=root -e cloud_provider=gce $LOG_LEVEL -e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
--private-key=${HOME}/.ssh/id_rsa
-e bootstrap_os=${BOOTSTRAP_OS}
-e ansible_python_interpreter=${PYPATH}
-e reset_confirmation=yes
--limit "all:!fake_hosts"
reset.yml;
fi
## Idempotency checks 4/5 (redeploy after reset)
- >
if [ "${IDEMPOT_CHECK}" = "true" AND "${RESET_CHECK}" = "true" ]; then
ansible-playbook -i inventory/inventory.ini -u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS
-b --become-user=root -e cloud_provider=gce $LOG_LEVEL -e kube_network_plugin=${KUBE_NETWORK_PLUGIN}
--private-key=${HOME}/.ssh/id_rsa
-e bootstrap_os=${BOOTSTRAP_OS}
-e ansible_python_interpreter=${PYPATH}
-e download_localhost=${DOWNLOAD_LOCALHOST}
-e download_run_once=${DOWNLOAD_RUN_ONCE}
-e deploy_netchecker=true
-e resolvconf_mode=${RESOLVCONF_MODE}
-e local_release_dir=${PWD}/downloads
-e etcd_deployment_type=${ETCD_DEPLOYMENT}
-e kubelet_deployment_type=${KUBELET_DEPLOYMENT}
--limit "all:!fake_hosts"
cluster.yml;
fi
## Idempotency checks 5/5 (Advanced DNS checks)
- >
if [ "${IDEMPOT_CHECK}" = "true" AND "${RESET_CHECK}" = "true" ]; then
ansible-playbook -i inventory/inventory.ini -e ansible_python_interpreter=${PYPATH}
-u $SSH_USER -e ansible_ssh_user=$SSH_USER $SSH_ARGS -b --become-user=root
--limit "all:!fake_hosts"
tests/testcases/040_check-network-adv.yml $LOG_LEVEL;
fi
after_script:
- >
@@ -139,18 +256,21 @@ before_script:
.coreos_calico_sep_variables: &coreos_calico_sep_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: calico
CLOUD_IMAGE: coreos-stable
CLOUD_IMAGE: coreos-stable-1298-6-0-v20170315
CLOUD_REGION: us-west1-b
CLUSTER_MODE: separated
CLUSTER_MODE: separate
BOOTSTRAP_OS: coreos
RESOLVCONF_MODE: host_resolvconf # This is required as long as the CoreOS stable channel uses docker < 1.12
.debian8_canal_ha_variables: &debian8_canal_ha_variables
.ubuntu_canal_ha_variables: &ubuntu_canal_ha_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: canal
CLOUD_IMAGE: debian-8-kubespray
CLOUD_REGION: us-east1-b
CLOUD_IMAGE: ubuntu-1604-xenial
CLOUD_REGION: europe-west1-b
CLOUD_MACHINE_TYPE: "n1-standard-2"
UPGRADE_TEST: "basic"
CLUSTER_MODE: ha
UPGRADE_TEST: "graceful"
.rhel7_weave_variables: &rhel7_weave_variables
# stage: deploy-gce-part1
@@ -176,10 +296,11 @@ before_script:
.coreos_canal_variables: &coreos_canal_variables
# stage: deploy-gce-part2
KUBE_NETWORK_PLUGIN: canal
CLOUD_IMAGE: coreos-stable
CLOUD_IMAGE: coreos-stable-1298-6-0-v20170315
CLOUD_REGION: us-east1-b
CLUSTER_MODE: default
BOOTSTRAP_OS: coreos
IDEMPOT_CHECK: "true"
RESOLVCONF_MODE: host_resolvconf # This is required as long as the CoreOS stable channel uses docker < 1.12
.rhel7_canal_sep_variables: &rhel7_canal_sep_variables
@@ -187,39 +308,52 @@ before_script:
KUBE_NETWORK_PLUGIN: canal
CLOUD_IMAGE: rhel-7
CLOUD_REGION: us-east1-b
CLUSTER_MODE: separated
CLUSTER_MODE: separate
.ubuntu_weave_sep_variables: &ubuntu_weave_sep_variables
# stage: deploy-gce-special
KUBE_NETWORK_PLUGIN: weave
CLOUD_IMAGE: ubuntu-1604-xenial
CLOUD_REGION: us-central1-b
CLUSTER_MODE: separated
CLUSTER_MODE: separate
IDEMPOT_CHECK: "false"
.centos7_calico_ha_variables: &centos7_calico_ha_variables
# stage: deploy-gce-special
KUBE_NETWORK_PLUGIN: calico
DOWNLOAD_LOCALHOST: "true"
DOWNLOAD_RUN_ONCE: "true"
CLOUD_IMAGE: centos-7
CLOUD_REGION: europe-west1-b
CLUSTER_MODE: ha
CLUSTER_MODE: ha-scale
IDEMPOT_CHECK: "true"
.coreos_alpha_weave_ha_variables: &coreos_alpha_weave_ha_variables
# stage: deploy-gce-special
KUBE_NETWORK_PLUGIN: weave
CLOUD_IMAGE: coreos-alpha
CLOUD_IMAGE: coreos-alpha-1325-0-0-v20170216
CLOUD_REGION: us-west1-a
CLUSTER_MODE: ha
CLUSTER_MODE: ha-scale
BOOTSTRAP_OS: coreos
RESOLVCONF_MODE: host_resolvconf # This is required as long as the CoreOS stable channel uses docker < 1.12
.ubuntu_rkt_sep_variables: &ubuntu_rkt_sep_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: flannel
CLOUD_IMAGE: ubuntu-1604-xenial
CLOUD_REGION: us-central1-b
CLUSTER_MODE: separated
CLUSTER_MODE: separate
ETCD_DEPLOYMENT: rkt
KUBELET_DEPLOYMENT: rkt
.ubuntu_vault_sep_variables: &ubuntu_vault_sep_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: canal
CERT_MGMT: vault
CLOUD_IMAGE: ubuntu-1604-xenial
CLOUD_REGION: us-central1-b
CLUSTER_MODE: separate
# Builds for PRs only (premoderated by unit-tests step) and triggers (auto)
coreos-calico-sep:
stage: deploy-gce-part1
@@ -285,24 +419,24 @@ ubuntu-weave-sep-triggers:
only: ['triggers']
# More builds for PRs/merges (manual) and triggers (auto)
debian8-canal-ha:
ubuntu-canal-ha:
stage: deploy-gce-part1
<<: *job
<<: *gce
variables:
<<: *gce_variables
<<: *debian8_canal_ha_variables
<<: *ubuntu_canal_ha_variables
when: manual
except: ['triggers']
only: ['master', /^pr-.*$/]
debian8-canal-ha-triggers:
ubuntu-canal-ha-triggers:
stage: deploy-gce-part1
<<: *job
<<: *gce
variables:
<<: *gce_variables
<<: *debian8_canal_ha_variables
<<: *ubuntu_canal_ha_variables
when: on_success
only: ['triggers']
@@ -327,7 +461,7 @@ rhel7-weave-triggers:
when: on_success
only: ['triggers']
debian8-calico:
debian8-calico-upgrade:
stage: deploy-gce-part2
<<: *job
<<: *gce
@@ -434,16 +568,35 @@ ubuntu-rkt-sep:
except: ['triggers']
only: ['master', /^pr-.*$/]
# Premoderated with manual actions
syntax-check:
ubuntu-vault-sep:
stage: deploy-gce-part1
<<: *job
stage: unit-tests
<<: *gce
variables:
<<: *gce_variables
<<: *ubuntu_vault_sep_variables
when: manual
except: ['triggers']
only: ['master', /^pr-.*$/]
# Premoderated with manual actions
ci-authorized:
<<: *job
stage: moderator
before_script:
- apt-get -y install jq
script:
- ansible-playbook -i inventory/local-tests.cfg -u root -e ansible_ssh_user=root -b --become-user=root cluster.yml -vvv --syntax-check
- /bin/sh scripts/premoderator.sh
except: ['triggers', 'master']
syntax-check:
<<: *job
stage: unit-tests
script:
- ansible-playbook -i inventory/local-tests.cfg -u root -e ansible_ssh_user=root -b --become-user=root cluster.yml -vvv --syntax-check
- ansible-playbook -i inventory/local-tests.cfg -u root -e ansible_ssh_user=root -b --become-user=root upgrade-cluster.yml -vvv --syntax-check
- ansible-playbook -i inventory/local-tests.cfg -u root -e ansible_ssh_user=root -b --become-user=root reset.yml -vvv --syntax-check
except: ['triggers', 'master']
tox-inventory-builder:
stage: unit-tests


@@ -1,6 +1,6 @@
![Kubernetes Logo](https://s28.postimg.org/lf3q4ocpp/k8s.png)
##Deploy a production ready kubernetes cluster
## Deploy a production ready kubernetes cluster
If you have questions, join us on the [kubernetes slack](https://slack.k8s.io), channel **#kargo**.
@@ -55,7 +55,7 @@ Versions of supported components
[flanneld](https://github.com/coreos/flannel/releases) v0.6.2 <br>
[calicoctl](https://github.com/projectcalico/calico-docker/releases) v0.23.0 <br>
[canal](https://github.com/projectcalico/canal) (given calico/flannel versions) <br>
[weave](http://weave.works/) v1.6.1 <br>
[weave](http://weave.works/) v1.8.2 <br>
[docker](https://www.docker.com/) v1.12.5 <br>
[rkt](https://coreos.com/rkt/docs/latest/) v1.21.0 <br>
@@ -67,16 +67,18 @@ plugins can be deployed for a given single cluster.
Requirements
--------------
* **Ansible v2.2 (or newer) and python-netaddr is installed on the machine
that will run Ansible commands**
* **Jinja 2.8 (or newer) is required to run the Ansible Playbooks**
* The target servers must have **access to the Internet** in order to pull docker images.
* The target servers are configured to allow **IPv4 forwarding**.
* **Your ssh key must be copied** to all the servers part of your inventory.
* The **firewalls are not managed**: you'll need to implement your own rules the way you used to.
In order to avoid any issues during deployment, you should disable your firewall.
* The target servers are configured to allow **IPv4 forwarding**.
* **Copy your ssh keys** to all the servers part of your inventory.
* **Ansible v2.2 (or newer) and python-netaddr**
## Network plugins
You can choose between 4 network plugins. (default: `flannel` with vxlan backend)
You can choose between 4 network plugins. (default: `calico`)
* [**flannel**](docs/flannel.md): gre/vxlan (layer 2) networking.
@@ -91,11 +93,23 @@ The choice is defined with the variable `kube_network_plugin`. There is also an
option to leverage built-in cloud provider networking instead.
See also [Network checker](docs/netcheck.md).
## Community docs and resources
- [kubernetes.io/docs/getting-started-guides/kargo/](https://kubernetes.io/docs/getting-started-guides/kargo/)
- [kargo, monitoring and logging](https://github.com/gregbkr/kubernetes-kargo-logging-monitoring) by @gregbkr
- [Deploy Kubernetes w/ Ansible & Terraform](https://rsmitty.github.io/Terraform-Ansible-Kubernetes/) by @rsmitty
- [Deploy a Kubernetes Cluster with Kargo (video)](https://www.youtube.com/watch?v=N9q51JgbWu8)
## Tools and projects on top of Kargo
- [Digital Rebar](https://github.com/digitalrebar/digitalrebar)
- [Kargo-cli](https://github.com/kubespray/kargo-cli)
- [Fuel-ccp-installer](https://github.com/openstack/fuel-ccp-installer)
- [Terraform Contrib](https://github.com/kubernetes-incubator/kargo/tree/master/contrib/terraform)
## CI Tests
![Gitlab Logo](https://s27.postimg.org/wmtaig1wz/gitlabci.png)
[![Build graphs](https://gitlab.com/kargo-ci/kubernetes-incubator__kargo/badges/master/build.svg)](https://gitlab.com/kargo-ci/kubernetes-incubator__kargo/pipelines) </br>
CI/end-to-end tests sponsored by Google (GCE), and [teuto.net](https://teuto.net/) for OpenStack.
CI/end-to-end tests sponsored by Google (GCE), DigitalOcean, [teuto.net](https://teuto.net/) (openstack).
See the [test matrix](docs/test_cases.md) for details.


@@ -7,3 +7,34 @@ The Kargo Project is released on an as-needed basis. The process is as follows:
3. An OWNER runs `git tag -s $VERSION` and inserts the changelog and pushes the tag with `git push $VERSION`
4. The release issue is closed
5. An announcement email is sent to `kubernetes-dev@googlegroups.com` with the subject `[ANNOUNCE] kargo $VERSION is released`
## Major/minor releases, merge freezes and milestones
* Kargo does not maintain stable branches for releases. Releases are tags, not
branches, and there are no backports. Therefore, there is also no need for
merge freezes.
* Fixes for major releases (vX.x.0) and minor releases (vX.Y.x) are delivered
via maintenance releases (vX.Y.Z) and assigned to the corresponding open
milestone (vX.Y). That milestone remains open for the major/minor release's
support lifetime, which ends once the milestone is closed. Then only the next major
or minor release can be done.
* Kargo major and minor releases are bound to the given ``kube_version`` major/minor
version numbers and other components' arbitrary versions, like etcd or network plugins.
Older or newer versions are not supported and not tested for the given release.
* There are no unstable releases and no APIs, thus Kargo doesn't follow
[semver](http://semver.org/). Every version describes only a stable release.
Breaking changes, if any are introduced by changed defaults or non-contrib ansible roles'
playbooks, shall be described in the release notes. Other breaking changes, if any in
the contributed addons or bound versions of Kubernetes and other components, are
considered out of Kargo scope and are up to the components' teams to deal with and
document.
* Minor releases can change components' versions, but not the major ``kube_version``.
Greater ``kube_version`` requires a new major or minor release. For example, if Kargo v2.0.0
is bound to ``kube_version: 1.4.x``, ``calico_version: 0.22.0``, ``etcd_version: v3.0.6``,
then Kargo v2.1.0 may be bound to only minor changes to ``kube_version``, like v1.5.1
and *any* changes to other components, like etcd v4, or calico 1.2.3.
And Kargo v3.x.x shall be bound to ``kube_version: 2.x.x`` respectively.

Vagrantfile vendored

@@ -17,6 +17,13 @@ $shared_folders = {}
$forwarded_ports = {}
$subnet = "172.17.8"
$box = "bento/ubuntu-16.04"
# The first three nodes are etcd servers
$etcd_instances = $num_instances
# The first two nodes are masters
$kube_master_instances = $num_instances == 1 ? $num_instances : ($num_instances - 1)
# All nodes are kube nodes
$kube_node_instances = $num_instances
$local_release_dir = "/vagrant/temp"
host_vars = {}
@@ -88,12 +95,14 @@ Vagrant.configure("2") do |config|
ip = "#{$subnet}.#{i+100}"
host_vars[vm_name] = {
"ip" => ip,
#"access_ip" => ip,
"flannel_interface" => ip,
"flannel_backend_type" => "host-gw",
"local_release_dir" => "/vagrant/temp",
"download_run_once" => "False"
"ip": ip,
"flannel_interface": ip,
"flannel_backend_type": "host-gw",
"local_release_dir" => $local_release_dir,
"download_run_once": "False",
# Override the default 'calico' with flannel.
# inventory/group_vars/k8s-cluster.yml
"kube_network_plugin": "flannel",
}
config.vm.network :private_network, ip: ip
@@ -112,12 +121,9 @@ Vagrant.configure("2") do |config|
ansible.host_vars = host_vars
#ansible.tags = ['download']
ansible.groups = {
# The first three nodes should be etcd servers
"etcd" => ["#{$instance_name_prefix}-0[1:3]"],
# The first two nodes should be masters
"kube-master" => ["#{$instance_name_prefix}-0[1:2]"],
# all nodes should be kube nodes
"kube-node" => ["#{$instance_name_prefix}-0[1:#{$num_instances}]"],
"etcd" => ["#{$instance_name_prefix}-0[1:#{$etcd_instances}]"],
"kube-master" => ["#{$instance_name_prefix}-0[1:#{$kube_master_instances}]"],
"kube-node" => ["#{$instance_name_prefix}-0[1:#{$kube_node_instances}]"],
"k8s-cluster:children" => ["kube-master", "kube-node"],
}
end


@@ -7,3 +7,6 @@ host_key_checking=False
gathering = smart
fact_caching = jsonfile
fact_caching_connection = /tmp
stdout_callback = skippy
library = ./library
callback_whitelist = profile_tasks


@@ -2,66 +2,91 @@
- hosts: localhost
gather_facts: False
roles:
- bastion-ssh-config
tags: [localhost, bastion]
- { role: kargo-defaults}
- { role: bastion-ssh-config, tags: ["localhost", "bastion"]}
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
gather_facts: false
vars:
# Need to disable pipelining for bootstrap-os as some systems have requiretty in sudoers set, which makes pipelining
# fail. bootstrap-os fixes this on these systems, so in later plays it can be enabled.
ansible_ssh_pipelining: false
roles:
- bootstrap-os
tags:
- bootstrap-os
- { role: kargo-defaults}
- { role: bootstrap-os, tags: bootstrap-os}
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
vars:
ansible_ssh_pipelining: true
gather_facts: true
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: kernel-upgrade, tags: kernel-upgrade, when: kernel_upgrade is defined and kernel_upgrade }
- { role: kubernetes/preinstall, tags: preinstall }
- { role: docker, tags: docker }
- { role: rkt, tags: rkt, when: "'rkt' in [ etcd_deployment_type, kubelet_deployment_type ]" }
- role: rkt
tags: rkt
when: "'rkt' in [etcd_deployment_type, kubelet_deployment_type, vault_deployment_type]"
- hosts: etcd:!k8s-cluster
any_errors_fatal: true
- hosts: etcd:k8s-cluster:vault
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: etcd, tags: etcd }
- { role: kargo-defaults, when: "cert_management == 'vault'" }
- { role: vault, tags: vault, vault_bootstrap: true, when: "cert_management == 'vault'" }
- hosts: etcd
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: etcd, tags: etcd, etcd_cluster_setup: true }
- hosts: k8s-cluster
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: etcd, tags: etcd }
- { role: kargo-defaults}
- { role: etcd, tags: etcd, etcd_cluster_setup: false }
- hosts: etcd:k8s-cluster:vault
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: vault, tags: vault, when: "cert_management == 'vault'"}
- hosts: k8s-cluster
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: kubernetes/node, tags: node }
- { role: network_plugin, tags: network }
- hosts: kube-master
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: kubernetes/master, tags: master }
- { role: kubernetes-apps/lib, tags: apps }
- { role: kubernetes-apps/network_plugin, tags: network }
- { role: kubernetes-apps/policy_controller, tags: policy-controller }
- hosts: calico-rr
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: network_plugin/calico/rr, tags: network }
- hosts: k8s-cluster
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kargo-defaults}
- { role: dnsmasq, when: "dns_mode == 'dnsmasq_kubedns'", tags: dnsmasq }
- { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
- hosts: kube-master[0]
any_errors_fatal: true
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kubernetes-apps/lib, tags: apps }
- { role: kargo-defaults}
- { role: kubernetes-apps, tags: apps }


@@ -0,0 +1,27 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": ["ec2:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": ["elasticloadbalancing:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": ["route53:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"arn:aws:s3:::kubernetes-*"
]
}
]
}


@@ -0,0 +1,10 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": { "Service": "ec2.amazonaws.com"},
"Action": "sts:AssumeRole"
}
]
}


@@ -0,0 +1,45 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"arn:aws:s3:::kubernetes-*"
]
},
{
"Effect": "Allow",
"Action": "ec2:Describe*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:AttachVolume",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:DetachVolume",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": ["route53:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": [
"ecr:GetAuthorizationToken",
"ecr:BatchCheckLayerAvailability",
"ecr:GetDownloadUrlForLayer",
"ecr:GetRepositoryPolicy",
"ecr:DescribeRepositories",
"ecr:ListImages",
"ecr:BatchGetImage"
],
"Resource": "*"
}
]
}


@@ -0,0 +1,10 @@
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": { "Service": "ec2.amazonaws.com"},
"Action": "sts:AssumeRole"
}
]
}


@@ -40,7 +40,8 @@ import os
import re
import sys
ROLES = ['kube-master', 'all', 'k8s-cluster:children', 'kube-node', 'etcd']
ROLES = ['all', 'kube-master', 'kube-node', 'etcd', 'k8s-cluster:children',
'calico-rr']
PROTECTED_NAMES = ROLES
AVAILABLE_COMMANDS = ['help', 'print_cfg', 'print_ips', 'load']
_boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
@@ -51,10 +52,18 @@ def get_var_as_bool(name, default):
value = os.environ.get(name, '')
return _boolean_states.get(value.lower(), default)
# Configurable as shell vars start
CONFIG_FILE = os.environ.get("CONFIG_FILE", "./inventory.cfg")
# Reconfigures cluster distribution at scale
SCALE_THRESHOLD = int(os.environ.get("SCALE_THRESHOLD", 50))
MASSIVE_SCALE_THRESHOLD = int(os.environ.get("MASSIVE_SCALE_THRESHOLD", 200))
DEBUG = get_var_as_bool("DEBUG", True)
HOST_PREFIX = os.environ.get("HOST_PREFIX", "node")
# Configurable as shell vars end
class KargoInventory(object):
@@ -74,11 +83,16 @@ class KargoInventory(object):
if changed_hosts:
self.hosts = self.build_hostnames(changed_hosts)
self.purge_invalid_hosts(self.hosts.keys(), PROTECTED_NAMES)
self.set_kube_master(list(self.hosts.keys())[:2])
self.set_all(self.hosts)
self.set_k8s_cluster()
self.set_kube_node(self.hosts.keys())
self.set_etcd(list(self.hosts.keys())[:3])
if len(self.hosts) >= SCALE_THRESHOLD:
self.set_kube_master(list(self.hosts.keys())[3:5])
else:
self.set_kube_master(list(self.hosts.keys())[:2])
self.set_kube_node(self.hosts.keys())
if len(self.hosts) >= SCALE_THRESHOLD:
self.set_calico_rr(list(self.hosts.keys())[:3])
else: # Show help if no options
self.show_help()
sys.exit(0)
@@ -205,8 +219,32 @@ class KargoInventory(object):
self.add_host_to_group('k8s-cluster:children', 'kube-node')
self.add_host_to_group('k8s-cluster:children', 'kube-master')
def set_calico_rr(self, hosts):
for host in hosts:
if host in self.config.items('kube-master'):
self.debug("Not adding {0} to calico-rr group because it "
"conflicts with kube-master group".format(host))
continue
if host in self.config.items('kube-node'):
self.debug("Not adding {0} to calico-rr group because it "
"conflicts with kube-node group".format(host))
continue
self.add_host_to_group('calico-rr', host)
def set_kube_node(self, hosts):
for host in hosts:
if len(self.config['all']) >= SCALE_THRESHOLD:
if self.config.has_option('etcd', host):
self.debug("Not adding {0} to kube-node group because of "
"scale deployment and host is in etcd "
"group.".format(host))
continue
if len(self.config['all']) >= MASSIVE_SCALE_THRESHOLD:
if self.config.has_option('kube-master', host):
self.debug("Not adding {0} to kube-node group because of "
"scale deployment and host is in kube-master "
"group.".format(host))
continue
self.add_host_to_group('kube-node', host)
def set_etcd(self, hosts):
@@ -275,7 +313,15 @@ print_ips - Write a space-delimited list of IPs from "all" group
Advanced usage:
Add another host after initial creation: inventory.py 10.10.1.5
Delete a host: inventory.py -10.10.1.3
Delete a host by id: inventory.py -node1'''
Delete a host by id: inventory.py -node1
Configurable env vars:
DEBUG Enable debug printing. Default: True
CONFIG_FILE File to write config to Default: ./inventory.cfg
HOST_PREFIX Host prefix for generated hosts. Default: node
SCALE_THRESHOLD Separate ETCD role if # of nodes >= 50
MASSIVE_SCALE_THRESHOLD Separate K8s master and ETCD if # of nodes >= 200
'''
print(help_text)
def print_config(self):


@@ -1,48 +0,0 @@
---
- src: https://gitlab.com/kubespray-ansibl8s/k8s-common.git
path: roles/apps
scm: git
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-dashboard.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-kubedns.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-elasticsearch.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-redis.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-memcached.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-postgres.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-pgbouncer.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-heapster.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-influxdb.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-kubedash.git
# path: roles/apps
# scm: git
#
#- src: https://gitlab.com/kubespray-ansibl8s/k8s-kube-logstash.git
# path: roles/apps
# scm: git


@@ -210,3 +210,31 @@ class TestInventory(unittest.TestCase):
self.inv.set_etcd([host])
self.assertTrue(host in self.inv.config[group])
def test_scale_scenario_one(self):
num_nodes = 50
hosts = OrderedDict()
for hostid in range(1, num_nodes+1):
hosts["node" + str(hostid)] = ""
self.inv.set_all(hosts)
self.inv.set_etcd(hosts.keys()[0:3])
self.inv.set_kube_master(hosts.keys()[0:2])
self.inv.set_kube_node(hosts.keys())
for h in range(3):
self.assertFalse(hosts.keys()[h] in self.inv.config['kube-node'])
def test_scale_scenario_two(self):
num_nodes = 500
hosts = OrderedDict()
for hostid in range(1, num_nodes+1):
hosts["node" + str(hostid)] = ""
self.inv.set_all(hosts)
self.inv.set_etcd(hosts.keys()[0:3])
self.inv.set_kube_master(hosts.keys()[3:5])
self.inv.set_kube_node(hosts.keys())
for h in range(5):
self.assertFalse(hosts.keys()[h] in self.inv.config['kube-node'])


@@ -11,7 +11,7 @@ deps =
-r{toxinidir}/test-requirements.txt
setenv = VIRTUAL_ENV={envdir}
passenv = http_proxy HTTP_PROXY https_proxy HTTPS_PROXY no_proxy NO_PROXY
commands = py.test -vv #{posargs:./tests}
commands = pytest -vv #{posargs:./tests}
[testenv:pep8]
usedevelop = False


@@ -0,0 +1,11 @@
# Kargo on KVM Virtual Machines hypervisor preparation
A simple playbook to ensure your system has the right settings to enable Kargo
deployment on VMs.
This playbook does not create Virtual Machines, nor does it run Kargo itself.
### User creation
If you want to create a user for running Kargo deployment, you should specify
both `k8s_deployment_user` and `k8s_deployment_user_pkey_path`.


@@ -0,0 +1,3 @@
#k8s_deployment_user: kargo
#k8s_deployment_user_pkey_path: /tmp/ssh_rsa


@@ -0,0 +1,8 @@
---
- hosts: localhost
gather_facts: False
become: yes
vars:
- bootstrap_os: none
roles:
- kvm-setup


@@ -0,0 +1,46 @@
---
- name: Upgrade all packages to the latest version (yum)
yum:
name: '*'
state: latest
when: ansible_os_family == "RedHat"
- name: Install required packages
yum:
name: "{{ item }}"
state: latest
with_items:
- bind-utils
- ntp
when: ansible_os_family == "RedHat"
- name: Install required packages
apt:
upgrade: yes
update_cache: yes
cache_valid_time: 3600
name: "{{ item }}"
state: latest
install_recommends: no
with_items:
- dnsutils
- ntp
when: ansible_os_family == "Debian"
- name: Upgrade all packages to the latest version (apt)
shell: apt-get -o \
Dpkg::Options::=--force-confdef -o \
Dpkg::Options::=--force-confold -q -y \
dist-upgrade
environment:
DEBIAN_FRONTEND: noninteractive
when: ansible_os_family == "Debian"
# Create deployment user if required
- include: user.yml
when: k8s_deployment_user is defined
# Set proper sysctl values
- include: sysctl.yml


@@ -0,0 +1,46 @@
---
- name: Load br_netfilter module
modprobe:
name: br_netfilter
state: present
register: br_netfilter
- name: Add br_netfilter into /etc/modules
lineinfile:
dest: /etc/modules
state: present
line: 'br_netfilter'
when: br_netfilter is defined and ansible_os_family == 'Debian'
- name: Add br_netfilter into /etc/modules-load.d/kargo.conf
copy:
dest: /etc/modules-load.d/kargo.conf
content: |-
### This file is managed by Ansible
br-netfilter
owner: root
group: root
mode: 0644
when: br_netfilter is defined
- name: Enable net.ipv4.ip_forward in sysctl
sysctl:
name: net.ipv4.ip_forward
value: 1
sysctl_file: /etc/sysctl.d/ipv4-ip_forward.conf
state: present
reload: yes
- name: Set bridge-nf-call-{arptables,iptables} to 0
sysctl:
name: "{{ item }}"
state: present
value: 0
sysctl_file: /etc/sysctl.d/bridge-nf-call.conf
reload: yes
with_items:
- net.bridge.bridge-nf-call-arptables
- net.bridge.bridge-nf-call-ip6tables
- net.bridge.bridge-nf-call-iptables
when: br_netfilter is defined


@@ -0,0 +1,46 @@
---
- name: Create user {{ k8s_deployment_user }}
user:
name: "{{ k8s_deployment_user }}"
groups: adm
shell: /bin/bash
- name: Ensure that .ssh exists
file:
path: "/home/{{ k8s_deployment_user }}/.ssh"
state: directory
owner: "{{ k8s_deployment_user }}"
group: "{{ k8s_deployment_user }}"
- name: Configure sudo for deployment user
copy:
content: |
%{{ k8s_deployment_user }} ALL=(ALL) NOPASSWD: ALL
dest: "/etc/sudoers.d/55-k8s-deployment"
owner: root
group: root
mode: 0644
- name: Write private SSH key
copy:
src: "{{ k8s_deployment_user_pkey_path }}"
dest: "/home/{{ k8s_deployment_user }}/.ssh/id_rsa"
mode: 0400
owner: "{{ k8s_deployment_user }}"
group: "{{ k8s_deployment_user }}"
when: k8s_deployment_user_pkey_path is defined
- name: Write public SSH key
shell: "ssh-keygen -y -f /home/{{ k8s_deployment_user }}/.ssh/id_rsa \
> /home/{{ k8s_deployment_user }}/.ssh/authorized_keys"
args:
creates: "/home/{{ k8s_deployment_user }}/.ssh/authorized_keys"
when: k8s_deployment_user_pkey_path is defined
- name: Fix ssh-pub-key permissions
file:
path: "/home/{{ k8s_deployment_user }}/.ssh/authorized_keys"
mode: 0600
owner: "{{ k8s_deployment_user }}"
group: "{{ k8s_deployment_user }}"
when: k8s_deployment_user_pkey_path is defined


@@ -1 +0,0 @@
../../../../../roles/kubernetes-apps/lib


@@ -1,2 +1,2 @@
*.tfstate*
inventory
.terraform


@@ -1,261 +0,0 @@
variable "deploymentName" {
type = "string"
description = "The desired name of your deployment."
}
variable "numControllers"{
type = "string"
description = "Desired # of controllers."
}
variable "numEtcd" {
type = "string"
description = "Desired # of etcd nodes. Should be an odd number."
}
variable "numNodes" {
type = "string"
description = "Desired # of nodes."
}
variable "volSizeController" {
type = "string"
description = "Volume size for the controllers (GB)."
}
variable "volSizeEtcd" {
type = "string"
description = "Volume size for etcd (GB)."
}
variable "volSizeNodes" {
type = "string"
description = "Volume size for nodes (GB)."
}
variable "subnet" {
type = "string"
description = "The subnet in which to put your cluster."
}
variable "securityGroups" {
type = "string"
description = "The sec. groups in which to put your cluster."
}
variable "ami"{
type = "string"
description = "AMI to use for all VMs in cluster."
}
variable "SSHKey" {
type = "string"
description = "SSH key to use for VMs."
}
variable "master_instance_type" {
type = "string"
description = "Size of VM to use for masters."
}
variable "etcd_instance_type" {
type = "string"
description = "Size of VM to use for etcd."
}
variable "node_instance_type" {
type = "string"
description = "Size of VM to use for nodes."
}
variable "terminate_protect" {
type = "string"
default = "false"
}
variable "awsRegion" {
type = "string"
}
provider "aws" {
region = "${var.awsRegion}"
}
variable "iam_prefix" {
type = "string"
description = "Prefix name for IAM profiles"
}
resource "aws_iam_instance_profile" "kubernetes_master_profile" {
name = "${var.iam_prefix}_kubernetes_master_profile"
roles = ["${aws_iam_role.kubernetes_master_role.name}"]
}
resource "aws_iam_role" "kubernetes_master_role" {
name = "${var.iam_prefix}_kubernetes_master_role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": { "Service": "ec2.amazonaws.com"},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
resource "aws_iam_role_policy" "kubernetes_master_policy" {
name = "${var.iam_prefix}_kubernetes_master_policy"
role = "${aws_iam_role.kubernetes_master_role.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": ["ec2:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": ["elasticloadbalancing:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": "*"
}
]
}
EOF
}
resource "aws_iam_instance_profile" "kubernetes_node_profile" {
name = "${var.iam_prefix}_kubernetes_node_profile"
roles = ["${aws_iam_role.kubernetes_node_role.name}"]
}
resource "aws_iam_role" "kubernetes_node_role" {
name = "${var.iam_prefix}_kubernetes_node_role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": { "Service": "ec2.amazonaws.com"},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
resource "aws_iam_role_policy" "kubernetes_node_policy" {
name = "${var.iam_prefix}_kubernetes_node_policy"
role = "${aws_iam_role.kubernetes_node_role.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:Describe*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:AttachVolume",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:DetachVolume",
"Resource": "*"
}
]
}
EOF
}
resource "aws_instance" "master" {
count = "${var.numControllers}"
ami = "${var.ami}"
instance_type = "${var.master_instance_type}"
subnet_id = "${var.subnet}"
vpc_security_group_ids = ["${var.securityGroups}"]
key_name = "${var.SSHKey}"
disable_api_termination = "${var.terminate_protect}"
iam_instance_profile = "${aws_iam_instance_profile.kubernetes_master_profile.id}"
root_block_device {
volume_size = "${var.volSizeController}"
}
tags {
Name = "${var.deploymentName}-master-${count.index + 1}"
}
}
resource "aws_instance" "etcd" {
count = "${var.numEtcd}"
ami = "${var.ami}"
instance_type = "${var.etcd_instance_type}"
subnet_id = "${var.subnet}"
vpc_security_group_ids = ["${var.securityGroups}"]
key_name = "${var.SSHKey}"
disable_api_termination = "${var.terminate_protect}"
root_block_device {
volume_size = "${var.volSizeEtcd}"
}
tags {
Name = "${var.deploymentName}-etcd-${count.index + 1}"
}
}
resource "aws_instance" "minion" {
count = "${var.numNodes}"
ami = "${var.ami}"
instance_type = "${var.node_instance_type}"
subnet_id = "${var.subnet}"
vpc_security_group_ids = ["${var.securityGroups}"]
key_name = "${var.SSHKey}"
disable_api_termination = "${var.terminate_protect}"
iam_instance_profile = "${aws_iam_instance_profile.kubernetes_node_profile.id}"
root_block_device {
volume_size = "${var.volSizeNodes}"
}
tags {
Name = "${var.deploymentName}-minion-${count.index + 1}"
}
}
output "kubernetes_master_profile" {
value = "${aws_iam_instance_profile.kubernetes_master_profile.id}"
}
output "kubernetes_node_profile" {
value = "${aws_iam_instance_profile.kubernetes_node_profile.id}"
}
output "master-ip" {
value = "${join(", ", aws_instance.master.*.private_ip)}"
}
output "etcd-ip" {
value = "${join(", ", aws_instance.etcd.*.private_ip)}"
}
output "minion-ip" {
value = "${join(", ", aws_instance.minion.*.private_ip)}"
}

View File

@@ -1,37 +0,0 @@
variable "SSHUser" {
type = "string"
description = "SSH User for VMs."
}
resource "null_resource" "ansible-provision" {
depends_on = ["aws_instance.master","aws_instance.etcd","aws_instance.minion"]
##Create Master Inventory
provisioner "local-exec" {
command = "echo \"[kube-master]\" > inventory"
}
provisioner "local-exec" {
command = "echo \"${join("\n",formatlist("%s ansible_ssh_user=%s", aws_instance.master.*.private_ip, var.SSHUser))}\" >> inventory"
}
##Create ETCD Inventory
provisioner "local-exec" {
command = "echo \"\n[etcd]\" >> inventory"
}
provisioner "local-exec" {
command = "echo \"${join("\n",formatlist("%s ansible_ssh_user=%s", aws_instance.etcd.*.private_ip, var.SSHUser))}\" >> inventory"
}
##Create Nodes Inventory
provisioner "local-exec" {
command = "echo \"\n[kube-node]\" >> inventory"
}
provisioner "local-exec" {
command = "echo \"${join("\n",formatlist("%s ansible_ssh_user=%s", aws_instance.minion.*.private_ip, var.SSHUser))}\" >> inventory"
}
provisioner "local-exec" {
command = "echo \"\n[k8s-cluster:children]\nkube-node\nkube-master\" >> inventory"
}
}

View File

@@ -2,27 +2,34 @@
**Overview:**
- This will create nodes in a VPC inside of AWS
This project will create:
* VPC with Public and Private Subnets in # Availability Zones
* Bastion Hosts and NAT Gateways in the Public Subnet
* A dynamic number of masters, etcd, and worker nodes in the Private Subnet
* evenly distributed over the # of Availability Zones
* AWS ELB in the Public Subnet for accessing the Kubernetes API from the internet
- A dynamic number of masters, etcd, and nodes can be created
- These scripts currently expect private IP connectivity with the nodes that are created. This means that you may need a tunnel to your VPC, or to run these scripts from a VM inside the VPC. We will look into how to work around this later.
**Requirements**
- Terraform 0.8.7 or newer
**How to Use:**
- Export the variables for your Amazon credentials:
- Export the variables for your AWS credentials or edit credentials.tfvars:
```
export AWS_ACCESS_KEY_ID="xxx"
export AWS_SECRET_ACCESS_KEY="yyy"
export aws_access_key="xxx"
export aws_secret_key="yyy"
export aws_ssh_key_name="zzz"
```
- Update contrib/terraform/aws/terraform.tfvars with your data
- Run with `terraform apply`
- Run with `terraform apply -var-file="credentials.tfvars"` or `terraform apply` depending on whether you exported your AWS credentials
- Once the infrastructure is created, you can run the kubespray playbooks and supply contrib/terraform/aws/inventory with the `-i` flag.
- Once the infrastructure is created, you can run the kargo playbooks and supply inventory/hosts with the `-i` flag.
**Future Work:**
**Architecture**
- Update the inventory creation file to be something a little more reasonable. It's just a local-exec from Terraform now; using terraform.py or something similar may make sense in the future.
Pictured is an AWS Infrastructure created with this Terraform project distributed over two Availability Zones.
![AWS Infrastructure with Terraform ](docs/aws_kargo.png)

View File

@@ -0,0 +1,185 @@
terraform {
required_version = ">= 0.8.7"
}
provider "aws" {
access_key = "${var.AWS_ACCESS_KEY_ID}"
secret_key = "${var.AWS_SECRET_ACCESS_KEY}"
region = "${var.AWS_DEFAULT_REGION}"
}
/*
* Calling modules which create the initial AWS VPC / AWS ELB
* and AWS IAM Roles for Kubernetes Deployment
*/
module "aws-vpc" {
source = "modules/vpc"
aws_cluster_name = "${var.aws_cluster_name}"
aws_vpc_cidr_block = "${var.aws_vpc_cidr_block}"
aws_avail_zones="${var.aws_avail_zones}"
aws_cidr_subnets_private="${var.aws_cidr_subnets_private}"
aws_cidr_subnets_public="${var.aws_cidr_subnets_public}"
}
module "aws-elb" {
source = "modules/elb"
aws_cluster_name="${var.aws_cluster_name}"
aws_vpc_id="${module.aws-vpc.aws_vpc_id}"
aws_avail_zones="${var.aws_avail_zones}"
aws_subnet_ids_public="${module.aws-vpc.aws_subnet_ids_public}"
aws_elb_api_port = "${var.aws_elb_api_port}"
k8s_secure_api_port = "${var.k8s_secure_api_port}"
}
module "aws-iam" {
source = "modules/iam"
aws_cluster_name="${var.aws_cluster_name}"
}
/*
* Create Bastion Instances in AWS
*
*/
resource "aws_instance" "bastion-server" {
ami = "${var.aws_bastion_ami}"
instance_type = "${var.aws_bastion_size}"
count = "${length(var.aws_cidr_subnets_public)}"
associate_public_ip_address = true
availability_zone = "${element(var.aws_avail_zones,count.index)}"
subnet_id = "${element(module.aws-vpc.aws_subnet_ids_public,count.index)}"
vpc_security_group_ids = [ "${module.aws-vpc.aws_security_group}" ]
key_name = "${var.AWS_SSH_KEY_NAME}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-bastion-${count.index}"
Cluster = "${var.aws_cluster_name}"
Role = "bastion-${var.aws_cluster_name}-${count.index}"
}
}
/*
* Create K8s Master and worker nodes and etcd instances
*
*/
resource "aws_instance" "k8s-master" {
ami = "${var.aws_cluster_ami}"
instance_type = "${var.aws_kube_master_size}"
count = "${var.aws_kube_master_num}"
availability_zone = "${element(var.aws_avail_zones,count.index)}"
subnet_id = "${element(module.aws-vpc.aws_subnet_ids_private,count.index)}"
vpc_security_group_ids = [ "${module.aws-vpc.aws_security_group}" ]
iam_instance_profile = "${module.aws-iam.kube-master-profile}"
key_name = "${var.AWS_SSH_KEY_NAME}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-master${count.index}"
Cluster = "${var.aws_cluster_name}"
Role = "master"
}
}
resource "aws_elb_attachment" "attach_master_nodes" {
count = "${var.aws_kube_master_num}"
elb = "${module.aws-elb.aws_elb_api_id}"
instance = "${element(aws_instance.k8s-master.*.id,count.index)}"
}
resource "aws_instance" "k8s-etcd" {
ami = "${var.aws_cluster_ami}"
instance_type = "${var.aws_etcd_size}"
count = "${var.aws_etcd_num}"
availability_zone = "${element(var.aws_avail_zones,count.index)}"
subnet_id = "${element(module.aws-vpc.aws_subnet_ids_private,count.index)}"
vpc_security_group_ids = [ "${module.aws-vpc.aws_security_group}" ]
key_name = "${var.AWS_SSH_KEY_NAME}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-etcd${count.index}"
Cluster = "${var.aws_cluster_name}"
Role = "etcd"
}
}
resource "aws_instance" "k8s-worker" {
ami = "${var.aws_cluster_ami}"
instance_type = "${var.aws_kube_worker_size}"
count = "${var.aws_kube_worker_num}"
availability_zone = "${element(var.aws_avail_zones,count.index)}"
subnet_id = "${element(module.aws-vpc.aws_subnet_ids_private,count.index)}"
vpc_security_group_ids = [ "${module.aws-vpc.aws_security_group}" ]
iam_instance_profile = "${module.aws-iam.kube-worker-profile}"
key_name = "${var.AWS_SSH_KEY_NAME}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-worker${count.index}"
Cluster = "${var.aws_cluster_name}"
Role = "worker"
}
}
/*
* Create Kargo Inventory File
*
*/
data "template_file" "inventory" {
template = "${file("${path.module}/templates/inventory.tpl")}"
vars {
public_ip_address_bastion = "${join("\n",formatlist("bastion ansible_ssh_host=%s" , aws_instance.bastion-server.*.public_ip))}"
connection_strings_master = "${join("\n",formatlist("%s ansible_ssh_host=%s",aws_instance.k8s-master.*.tags.Name, aws_instance.k8s-master.*.private_ip))}"
connection_strings_node = "${join("\n", formatlist("%s ansible_ssh_host=%s", aws_instance.k8s-worker.*.tags.Name, aws_instance.k8s-worker.*.private_ip))}"
connection_strings_etcd = "${join("\n",formatlist("%s ansible_ssh_host=%s", aws_instance.k8s-etcd.*.tags.Name, aws_instance.k8s-etcd.*.private_ip))}"
list_master = "${join("\n",aws_instance.k8s-master.*.tags.Name)}"
list_node = "${join("\n",aws_instance.k8s-worker.*.tags.Name)}"
list_etcd = "${join("\n",aws_instance.k8s-etcd.*.tags.Name)}"
elb_api_fqdn = "apiserver_loadbalancer_domain_name=\"${module.aws-elb.aws_elb_api_fqdn}\""
elb_api_port = "loadbalancer_apiserver.port=${var.aws_elb_api_port}"
}
}
resource "null_resource" "inventories" {
provisioner "local-exec" {
command = "echo '${data.template_file.inventory.rendered}' > ../../../inventory/hosts"
}
}

View File

@@ -0,0 +1,8 @@
#AWS Access Key
AWS_ACCESS_KEY_ID = ""
#AWS Secret Key
AWS_SECRET_ACCESS_KEY = ""
#EC2 SSH Key Name
AWS_SSH_KEY_NAME = ""
#AWS Region
AWS_DEFAULT_REGION = "eu-central-1"

Binary file not shown.


View File

@@ -0,0 +1,58 @@
resource "aws_security_group" "aws-elb" {
name = "kubernetes-${var.aws_cluster_name}-securitygroup-elb"
vpc_id = "${var.aws_vpc_id}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-securitygroup-elb"
}
}
resource "aws_security_group_rule" "aws-allow-api-access" {
type = "ingress"
from_port = "${var.aws_elb_api_port}"
to_port = "${var.k8s_secure_api_port}"
protocol = "TCP"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = "${aws_security_group.aws-elb.id}"
}
resource "aws_security_group_rule" "aws-allow-api-egress" {
type = "egress"
from_port = 0
to_port = 65535
protocol = "TCP"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = "${aws_security_group.aws-elb.id}"
}
# Create a new AWS ELB for K8S API
resource "aws_elb" "aws-elb-api" {
name = "kubernetes-elb-${var.aws_cluster_name}"
subnets = ["${var.aws_subnet_ids_public}"]
security_groups = ["${aws_security_group.aws-elb.id}"]
listener {
instance_port = "${var.k8s_secure_api_port}"
instance_protocol = "tcp"
lb_port = "${var.aws_elb_api_port}"
lb_protocol = "tcp"
}
health_check {
healthy_threshold = 2
unhealthy_threshold = 2
timeout = 3
target = "HTTP:8080/"
interval = 30
}
cross_zone_load_balancing = true
idle_timeout = 400
connection_draining = true
connection_draining_timeout = 400
tags {
Name = "kubernetes-${var.aws_cluster_name}-elb-api"
}
}

View File

@@ -0,0 +1,7 @@
output "aws_elb_api_id" {
value = "${aws_elb.aws-elb-api.id}"
}
output "aws_elb_api_fqdn" {
value = "${aws_elb.aws-elb-api.dns_name}"
}

View File

@@ -0,0 +1,28 @@
variable "aws_cluster_name" {
description = "Name of Cluster"
}
variable "aws_vpc_id" {
description = "AWS VPC ID"
}
variable "aws_elb_api_port" {
description = "Port for AWS ELB"
}
variable "k8s_secure_api_port" {
description = "Secure Port of K8S API Server"
}
variable "aws_avail_zones" {
description = "Availability Zones Used"
type = "list"
}
variable "aws_subnet_ids_public" {
description = "IDs of Public Subnets"
type = "list"
}

View File

@@ -0,0 +1,138 @@
#Add AWS Roles for Kubernetes
resource "aws_iam_role" "kube-master" {
name = "kubernetes-${var.aws_cluster_name}-master"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ec2.amazonaws.com"
}
}
]
}
EOF
}
resource "aws_iam_role" "kube-worker" {
name = "kubernetes-${var.aws_cluster_name}-node"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ec2.amazonaws.com"
}
}
]
}
EOF
}
#Add AWS Policies for Kubernetes
resource "aws_iam_role_policy" "kube-master" {
name = "kubernetes-${var.aws_cluster_name}-master"
role = "${aws_iam_role.kube-master.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": ["ec2:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": ["elasticloadbalancing:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": ["route53:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"arn:aws:s3:::kubernetes-*"
]
}
]
}
EOF
}
resource "aws_iam_role_policy" "kube-worker" {
name = "kubernetes-${var.aws_cluster_name}-node"
role = "${aws_iam_role.kube-worker.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"arn:aws:s3:::kubernetes-*"
]
},
{
"Effect": "Allow",
"Action": "ec2:Describe*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:AttachVolume",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "ec2:DetachVolume",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": ["route53:*"],
"Resource": ["*"]
},
{
"Effect": "Allow",
"Action": [
"ecr:GetAuthorizationToken",
"ecr:BatchCheckLayerAvailability",
"ecr:GetDownloadUrlForLayer",
"ecr:GetRepositoryPolicy",
"ecr:DescribeRepositories",
"ecr:ListImages",
"ecr:BatchGetImage"
],
"Resource": "*"
}
]
}
EOF
}
#Create AWS Instance Profiles
resource "aws_iam_instance_profile" "kube-master" {
name = "kube_${var.aws_cluster_name}_master_profile"
roles = ["${aws_iam_role.kube-master.name}"]
}
resource "aws_iam_instance_profile" "kube-worker" {
name = "kube_${var.aws_cluster_name}_node_profile"
roles = ["${aws_iam_role.kube-worker.name}"]
}

View File

@@ -0,0 +1,7 @@
output "kube-master-profile" {
value = "${aws_iam_instance_profile.kube-master.name }"
}
output "kube-worker-profile" {
value = "${aws_iam_instance_profile.kube-worker.name }"
}

View File

@@ -0,0 +1,3 @@
variable "aws_cluster_name" {
description = "Name of Cluster"
}

View File

@@ -0,0 +1,138 @@
resource "aws_vpc" "cluster-vpc" {
cidr_block = "${var.aws_vpc_cidr_block}"
#DNS Related Entries
enable_dns_support = true
enable_dns_hostnames = true
tags {
Name = "kubernetes-${var.aws_cluster_name}-vpc"
}
}
resource "aws_eip" "cluster-nat-eip" {
count = "${length(var.aws_cidr_subnets_public)}"
vpc = true
}
resource "aws_internet_gateway" "cluster-vpc-internetgw" {
vpc_id = "${aws_vpc.cluster-vpc.id}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-internetgw"
}
}
resource "aws_subnet" "cluster-vpc-subnets-public" {
vpc_id = "${aws_vpc.cluster-vpc.id}"
count="${length(var.aws_avail_zones)}"
availability_zone = "${element(var.aws_avail_zones, count.index)}"
cidr_block = "${element(var.aws_cidr_subnets_public, count.index)}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-${element(var.aws_avail_zones, count.index)}-public"
}
}
resource "aws_nat_gateway" "cluster-nat-gateway" {
count = "${length(var.aws_cidr_subnets_public)}"
allocation_id = "${element(aws_eip.cluster-nat-eip.*.id, count.index)}"
subnet_id = "${element(aws_subnet.cluster-vpc-subnets-public.*.id, count.index)}"
}
resource "aws_subnet" "cluster-vpc-subnets-private" {
vpc_id = "${aws_vpc.cluster-vpc.id}"
count="${length(var.aws_avail_zones)}"
availability_zone = "${element(var.aws_avail_zones, count.index)}"
cidr_block = "${element(var.aws_cidr_subnets_private, count.index)}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-${element(var.aws_avail_zones, count.index)}-private"
}
}
#Routing in VPC
#TODO: Do we need two routing tables for each subnet for redundancy or is one enough?
resource "aws_route_table" "kubernetes-public" {
vpc_id = "${aws_vpc.cluster-vpc.id}"
route {
cidr_block = "0.0.0.0/0"
gateway_id = "${aws_internet_gateway.cluster-vpc-internetgw.id}"
}
tags {
Name = "kubernetes-${var.aws_cluster_name}-routetable-public"
}
}
resource "aws_route_table" "kubernetes-private" {
count = "${length(var.aws_cidr_subnets_private)}"
vpc_id = "${aws_vpc.cluster-vpc.id}"
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = "${element(aws_nat_gateway.cluster-nat-gateway.*.id, count.index)}"
}
tags {
Name = "kubernetes-${var.aws_cluster_name}-routetable-private-${count.index}"
}
}
resource "aws_route_table_association" "kubernetes-public" {
count = "${length(var.aws_cidr_subnets_public)}"
subnet_id = "${element(aws_subnet.cluster-vpc-subnets-public.*.id,count.index)}"
route_table_id = "${aws_route_table.kubernetes-public.id}"
}
resource "aws_route_table_association" "kubernetes-private" {
count = "${length(var.aws_cidr_subnets_private)}"
subnet_id = "${element(aws_subnet.cluster-vpc-subnets-private.*.id,count.index)}"
route_table_id = "${element(aws_route_table.kubernetes-private.*.id,count.index)}"
}
#Kubernetes Security Groups
resource "aws_security_group" "kubernetes" {
name = "kubernetes-${var.aws_cluster_name}-securitygroup"
vpc_id = "${aws_vpc.cluster-vpc.id}"
tags {
Name = "kubernetes-${var.aws_cluster_name}-securitygroup"
}
}
resource "aws_security_group_rule" "allow-all-ingress" {
type = "ingress"
from_port = 0
to_port = 65535
protocol = "-1"
cidr_blocks= ["${var.aws_vpc_cidr_block}"]
security_group_id = "${aws_security_group.kubernetes.id}"
}
resource "aws_security_group_rule" "allow-all-egress" {
type = "egress"
from_port = 0
to_port = 65535
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = "${aws_security_group.kubernetes.id}"
}
resource "aws_security_group_rule" "allow-ssh-connections" {
type = "ingress"
from_port = 22
to_port = 22
protocol = "TCP"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = "${aws_security_group.kubernetes.id}"
}

View File

@@ -0,0 +1,16 @@
output "aws_vpc_id" {
value = "${aws_vpc.cluster-vpc.id}"
}
output "aws_subnet_ids_private" {
value = ["${aws_subnet.cluster-vpc-subnets-private.*.id}"]
}
output "aws_subnet_ids_public" {
value = ["${aws_subnet.cluster-vpc-subnets-public.*.id}"]
}
output "aws_security_group" {
value = ["${aws_security_group.kubernetes.*.id}"]
}

View File

@@ -0,0 +1,24 @@
variable "aws_vpc_cidr_block" {
description = "CIDR Blocks for AWS VPC"
}
variable "aws_cluster_name" {
description = "Name of Cluster"
}
variable "aws_avail_zones" {
description = "AWS Availability Zones Used"
type = "list"
}
variable "aws_cidr_subnets_private" {
description = "CIDR Blocks for private subnets in Availability zones"
type = "list"
}
variable "aws_cidr_subnets_public" {
description = "CIDR Blocks for public subnets in Availability zones"
type = "list"
}

View File

@@ -0,0 +1,20 @@
output "bastion_ip" {
value = "${join("\n", aws_instance.bastion-server.*.public_ip)}"
}
output "masters" {
value = "${join("\n", aws_instance.k8s-master.*.private_ip)}"
}
output "workers" {
value = "${join("\n", aws_instance.k8s-worker.*.private_ip)}"
}
output "etcd" {
value = "${join("\n", aws_instance.k8s-etcd.*.private_ip)}"
}
output "aws_elb_api_fqdn" {
value = "${module.aws-elb.aws_elb_api_fqdn}:${var.aws_elb_api_port}"
}

View File

@@ -0,0 +1,27 @@
${connection_strings_master}
${connection_strings_node}
${connection_strings_etcd}
${public_ip_address_bastion}
[kube-master]
${list_master}
[kube-node]
${list_node}
[etcd]
${list_etcd}
[k8s-cluster:children]
kube-node
kube-master
[k8s-cluster:vars]
${elb_api_fqdn}
${elb_api_port}

View File

@@ -1,22 +1,31 @@
deploymentName="test-kube-deploy"
#Global Vars
aws_cluster_name = "devtest"
numControllers="2"
numEtcd="3"
numNodes="2"
#VPC Vars
aws_vpc_cidr_block = "10.250.192.0/18"
aws_cidr_subnets_private = ["10.250.192.0/20","10.250.208.0/20"]
aws_cidr_subnets_public = ["10.250.224.0/20","10.250.240.0/20"]
aws_avail_zones = ["eu-central-1a","eu-central-1b"]
volSizeController="20"
volSizeEtcd="20"
volSizeNodes="20"
#Bastion Host
aws_bastion_ami = "ami-5900cc36"
aws_bastion_size = "t2.small"
awsRegion="us-west-2"
subnet="subnet-xxxxx"
ami="ami-32a85152"
securityGroups="sg-xxxxx"
SSHUser="core"
SSHKey="my-key"
master_instance_type="m3.xlarge"
etcd_instance_type="m3.xlarge"
node_instance_type="m3.xlarge"
#Kubernetes Cluster
terminate_protect="false"
aws_kube_master_num = 3
aws_kube_master_size = "t2.medium"
aws_etcd_num = 3
aws_etcd_size = "t2.medium"
aws_kube_worker_num = 4
aws_kube_worker_size = "t2.medium"
aws_cluster_ami = "ami-903df7ff"
#Settings AWS ELB
aws_elb_api_port = 443
k8s_secure_api_port = 443

View File

@@ -0,0 +1,32 @@
#Global Vars
aws_cluster_name = "devtest"
aws_region = "eu-central-1"
#VPC Vars
aws_vpc_cidr_block = "10.250.192.0/18"
aws_cidr_subnets_private = ["10.250.192.0/20","10.250.208.0/20"]
aws_cidr_subnets_public = ["10.250.224.0/20","10.250.240.0/20"]
aws_avail_zones = ["eu-central-1a","eu-central-1b"]
#Bastion Host
aws_bastion_ami = "ami-5900cc36"
aws_bastion_size = "t2.small"
#Kubernetes Cluster
aws_kube_master_num = 3
aws_kube_master_size = "t2.medium"
aws_etcd_num = 3
aws_etcd_size = "t2.medium"
aws_kube_worker_num = 4
aws_kube_worker_size = "t2.medium"
aws_cluster_ami = "ami-903df7ff"
#Settings AWS ELB
aws_elb_api_port = 443
k8s_secure_api_port = 443

View File

@@ -0,0 +1,97 @@
variable "AWS_ACCESS_KEY_ID" {
description = "AWS Access Key"
}
variable "AWS_SECRET_ACCESS_KEY" {
description = "AWS Secret Key"
}
variable "AWS_SSH_KEY_NAME" {
description = "Name of the SSH keypair to use in AWS."
}
variable "AWS_DEFAULT_REGION" {
description = "AWS Region"
}
//General Cluster Settings
variable "aws_cluster_name" {
description = "Name of AWS Cluster"
}
//AWS VPC Variables
variable "aws_vpc_cidr_block" {
description = "CIDR Block for VPC"
}
variable "aws_avail_zones" {
description = "Availability Zones Used"
type = "list"
}
variable "aws_cidr_subnets_private" {
description = "CIDR Blocks for private subnets in Availability Zones"
type = "list"
}
variable "aws_cidr_subnets_public" {
description = "CIDR Blocks for public subnets in Availability Zones"
type = "list"
}
//AWS EC2 Settings
variable "aws_bastion_ami" {
description = "AMI ID for Bastion Host in chosen AWS Region"
}
variable "aws_bastion_size" {
description = "EC2 Instance Size of Bastion Host"
}
/*
* AWS EC2 Settings
* The number should be divisible by the number of
* AWS Availability Zones used, without a remainder.
*/
variable "aws_kube_master_num" {
description = "Number of Kubernetes Master Nodes"
}
variable "aws_kube_master_size" {
description = "Instance size of Kube Master Nodes"
}
variable "aws_etcd_num" {
description = "Number of etcd Nodes"
}
variable "aws_etcd_size" {
description = "Instance size of etcd Nodes"
}
variable "aws_kube_worker_num" {
description = "Number of Kubernetes Worker Nodes"
}
variable "aws_kube_worker_size" {
description = "Instance size of Kubernetes Worker Nodes"
}
variable "aws_cluster_ami" {
description = "AMI ID for Kubernetes Cluster"
}
/*
* AWS ELB Settings
*
*/
variable "aws_elb_api_port" {
description = "Port for AWS ELB"
}
variable "k8s_secure_api_port" {
description = "Secure Port of K8S API Server"
}

View File

@@ -0,0 +1 @@
../../inventory/group_vars

View File

@@ -30,7 +30,7 @@ requirements.
#### OpenStack
Ensure your OpenStack credentials are loaded in environment variables. This can be done by downloading a credentials .rc file from your OpenStack dashboard and sourcing it:
Ensure your OpenStack **Identity v2** credentials are loaded in environment variables. This can be done by downloading a credentials .rc file from your OpenStack dashboard and sourcing it:
```
$ source ~/.stackrc

View File

@@ -0,0 +1 @@
../../../inventory/group_vars

View File

@@ -1,165 +0,0 @@
# Valid bootstrap options (required): ubuntu, coreos, none
bootstrap_os: none
# Directory where the binaries will be installed
bin_dir: /usr/local/bin
# Where the binaries will be downloaded.
# Note: ensure that you've enough disk space (about 1G)
local_release_dir: "/tmp/releases"
# Random shifts for retrying failed ops like pushing/downloading
retry_stagger: 5
# Uncomment this line for Container Linux by CoreOS only.
# Directory where python binary is installed
# ansible_python_interpreter: "/opt/bin/python"
# This is the group that the cert creation scripts chgrp the
# cert files to. Not really changeable...
kube_cert_group: kube-cert
# Cluster Loglevel configuration
kube_log_level: 2
# Users to create for basic auth in Kubernetes API via HTTP
kube_api_pwd: "changeme"
kube_users:
kube:
pass: "{{kube_api_pwd}}"
role: admin
root:
pass: "changeme"
role: admin
# Kubernetes cluster name, also will be used as DNS domain
cluster_name: cluster.local
# Subdomains of DNS domain to be resolved via /etc/resolv.conf
ndots: 5
# Deploy netchecker app to verify DNS resolve as an HTTP service
deploy_netchecker: false
# For some environments, each node has a publicly accessible
# address and an address it should bind services to. These are
# really inventory level variables, but described here for consistency.
#
# When advertising access, the access_ip will be used, but will defer to
# ip and then the default ansible ip when unspecified.
#
# When binding to restrict access, the ip variable will be used, but will
# defer to the default ansible ip when unspecified.
#
# The ip variable is used for specific address binding, e.g. listen address
# for etcd. This is used to help with environments like Vagrant or multi-nic
# systems where one address should be preferred over another.
# ip: 10.2.2.2
#
# The access_ip variable is used to define how other nodes should access
# the node. This is used in flannel to allow other flannel nodes to see
# this node, for example. The access_ip is really useful in AWS and Google
# environments where the nodes are accessed remotely by the "public" ip,
# but don't know about that address themselves.
# access_ip: 1.1.1.1
# Etcd access modes:
# Enable multiaccess to configure clients to access all of the etcd members directly
# as the "http://hostX:port, http://hostY:port, ..." and ignore the proxy loadbalancers.
# This may be the case if clients support and loadbalance multiple etcd servers natively.
etcd_multiaccess: true
# Assume that no internal loadbalancers for apiservers exist and listen on
# kube_apiserver_port (default 443)
loadbalancer_apiserver_localhost: true
# Choose network plugin (calico, weave or flannel)
kube_network_plugin: flannel
# Kubernetes internal network for services, unused block of space.
kube_service_addresses: 10.233.0.0/18
# internal network. When used, it will assign IP
# addresses from this range to individual pods.
# This network must be unused in your network infrastructure!
kube_pods_subnet: 10.233.64.0/18
# internal network total size (optional). This is the prefix of the
# entire network. Must be unused in your environment.
# kube_network_prefix: 18
# internal network node size allocation (optional). This is the size allocated
# to each node on your network. With these defaults you should have
# room for 4096 nodes with 254 pods per node.
kube_network_node_prefix: 24
# With calico it is possible to distribute routes with border routers of the datacenter.
peer_with_router: false
# Warning : enabling router peering will disable calico's default behavior ('node mesh').
# The subnets of each node will be distributed by the datacenter router
# The port the API Server will be listening on.
kube_apiserver_ip: "{{ kube_service_addresses|ipaddr('net')|ipaddr(1)|ipaddr('address') }}"
kube_apiserver_port: 443 # (https)
kube_apiserver_insecure_port: 8080 # (http)
# Internal DNS configuration.
# Kubernetes can create and maintain its own DNS server to resolve service names
# into appropriate IP addresses. It's highly advisable to run such DNS server,
# as it greatly simplifies configuration of your applications - you can use
# service names instead of magic environment variables.
# Can be dnsmasq_kubedns, kubedns or none
dns_mode: dnsmasq_kubedns
# Can be docker_dns, host_resolvconf or none
resolvconf_mode: docker_dns
## Upstream dns servers used by dnsmasq
#upstream_dns_servers:
# - 8.8.8.8
# - 8.8.4.4
dns_domain: "{{ cluster_name }}"
# Ip address of the kubernetes skydns service
skydns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(3)|ipaddr('address') }}"
dns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(2)|ipaddr('address') }}"
# There are some changes specific to the cloud providers
# for instance we need to encapsulate packets with some network plugins
# If set the possible values are either 'gce', 'aws', 'azure' or 'openstack'
# When openstack is used make sure to source in the openstack credentials
# like you would do when using nova-client before starting the playbook.
# When azure is used, you need to also set the following variables.
# cloud_provider:
# see docs/azure.md for details on how to get these values
#azure_tenant_id:
#azure_subscription_id:
#azure_aad_client_id:
#azure_aad_client_secret:
#azure_resource_group:
#azure_location:
#azure_subnet_name:
#azure_security_group_name:
#azure_vnet_name:
## Set these proxy values in order to update docker daemon to use proxies
# http_proxy: ""
# https_proxy: ""
# no_proxy: ""
# Path used to store Docker data
docker_daemon_graph: "/var/lib/docker"
## A string of extra options to pass to the docker daemon.
## This string should be exactly as you wish it to appear.
## An obvious use case is allowing insecure-registry access
## to self hosted registries like so:
docker_options: "--insecure-registry={{ kube_service_addresses }} --graph={{ docker_daemon_graph }}"
# K8s image pull policy (imagePullPolicy)
k8s_image_pull_policy: IfNotPresent
# default packages to install within the cluster
kpm_packages: []
# - name: kube-system/grafana

View File

@@ -68,7 +68,7 @@ resource "openstack_compute_instance_v2" "k8s_master" {
floating_ip = "${element(openstack_networking_floatingip_v2.k8s_master.*.address, count.index)}"
metadata = {
ssh_user = "${var.ssh_user}"
kubespray_groups = "etcd,kube-master,kube-node,k8s-cluster"
kubespray_groups = "etcd,kube-master,kube-node,k8s-cluster,vault"
}
}
@@ -87,10 +87,10 @@ resource "openstack_compute_instance_v2" "k8s_master_no_floating_ip" {
"${openstack_compute_secgroup_v2.k8s.name}" ]
metadata = {
ssh_user = "${var.ssh_user}"
kubespray_groups = "etcd,kube-master,kube-node,k8s-cluster"
kubespray_groups = "etcd,kube-master,kube-node,k8s-cluster,vault,no-floating"
}
provisioner "local-exec" {
command = "sed s/USER/${var.ssh_user}/ contrib/terraform/openstack/ansible_bastion_template.txt | sed s/BASTION_ADDRESS/${element(openstack_networking_floatingip_v2.k8s_master.*.address, 0)}/ > contrib/terraform/openstack/group_vars/k8s-cluster.yml"
command = "sed s/USER/${var.ssh_user}/ contrib/terraform/openstack/ansible_bastion_template.txt | sed s/BASTION_ADDRESS/${element(openstack_networking_floatingip_v2.k8s_master.*.address, 0)}/ > contrib/terraform/openstack/group_vars/no-floating.yml"
}
}
@@ -107,7 +107,7 @@ resource "openstack_compute_instance_v2" "k8s_node" {
floating_ip = "${element(openstack_networking_floatingip_v2.k8s_node.*.address, count.index)}"
metadata = {
ssh_user = "${var.ssh_user}"
kubespray_groups = "kube-node,k8s-cluster"
kubespray_groups = "kube-node,k8s-cluster,vault"
}
}
@@ -123,10 +123,10 @@ resource "openstack_compute_instance_v2" "k8s_node_no_floating_ip" {
security_groups = ["${openstack_compute_secgroup_v2.k8s.name}" ]
metadata = {
ssh_user = "${var.ssh_user}"
kubespray_groups = "kube-node,k8s-cluster"
kubespray_groups = "kube-node,k8s-cluster,vault,no-floating"
}
provisioner "local-exec" {
command = "sed s/USER/${var.ssh_user}/ contrib/terraform/openstack/ansible_bastion_template.txt | sed s/BASTION_ADDRESS/${element(openstack_networking_floatingip_v2.k8s_master.*.address, 0)}/ > contrib/terraform/openstack/group_vars/k8s-cluster.yml"
command = "sed s/USER/${var.ssh_user}/ contrib/terraform/openstack/ansible_bastion_template.txt | sed s/BASTION_ADDRESS/${element(openstack_networking_floatingip_v2.k8s_master.*.address, 0)}/ > contrib/terraform/openstack/group_vars/no-floating.yml"
}
}

View File

@@ -8,20 +8,39 @@ The inventory is composed of 3 groups:
* **kube-node** : list of kubernetes nodes where the pods will run.
* **kube-master** : list of servers where kubernetes master components (apiserver, scheduler, controller) will run.
Note: if you want the server to act both as master and node the server must be defined on both groups _kube-master_ and _kube-node_
* **etcd**: list of server to compose the etcd server. you should have at least 3 servers for failover purposes.
* **etcd**: list of servers to compose the etcd server. You should have at least 3 servers for failover purposes.
Note: do not modify the children of _k8s-cluster_, like putting
the _etcd_ group into _k8s-cluster_, unless you are certain that is
what you want and you have it fully contained in the latter:
```
k8s-cluster ⊂ etcd => kube-node ∩ etcd = etcd
```
When _kube-node_ contains _etcd_, your etcd cluster is also schedulable for Kubernetes workloads.
If you want it standalone, make sure those groups do not intersect.
If you want the server to act both as master and node, the server must be defined
on both groups _kube-master_ and _kube-node_. If you want a standalone and
unschedulable master, the server must be defined only in the _kube-master_ and
not _kube-node_.
There are also two special groups:
* **calico-rr** : explained for [advanced Calico networking cases](docs/calico.md)
* **bastion** : configure a bastion host if your nodes are not directly reachable
Below is a complete inventory example:
```
## Configure 'ip' variable to bind kubernetes services on a
## different ip than the default iface
node1 ansible_ssh_host=95.54.0.12 # ip=10.3.0.1
node2 ansible_ssh_host=95.54.0.13 # ip=10.3.0.2
node3 ansible_ssh_host=95.54.0.14 # ip=10.3.0.3
node4 ansible_ssh_host=95.54.0.15 # ip=10.3.0.4
node5 ansible_ssh_host=95.54.0.16 # ip=10.3.0.5
node6 ansible_ssh_host=95.54.0.17 # ip=10.3.0.6
node1 ansible_ssh_host=95.54.0.12 ip=10.3.0.1
node2 ansible_ssh_host=95.54.0.13 ip=10.3.0.2
node3 ansible_ssh_host=95.54.0.14 ip=10.3.0.3
node4 ansible_ssh_host=95.54.0.15 ip=10.3.0.4
node5 ansible_ssh_host=95.54.0.16 ip=10.3.0.5
node6 ansible_ssh_host=95.54.0.17 ip=10.3.0.6
[kube-master]
node1
@@ -42,14 +61,15 @@ node6
[k8s-cluster:children]
kube-node
kube-master
etcd
```
Group vars and overriding variables precedence
----------------------------------------------
The group variables to control main deployment options are located in the directory ``inventory/group_vars``.
Optional variables are located in the `inventory/group_vars/all.yml`.
Mandatory variables that are common for at least one role (or a node group) can be found in the
`inventory/group_vars/k8s-cluster.yml`.
There are also role vars for docker, rkt, kubernetes preinstall and master roles.
According to the [ansible docs](http://docs.ansible.com/ansible/playbooks_variables.html#variable-precedence-where-should-i-put-a-variable),
those cannot be overridden from the group vars. In order to override, one should use
@@ -142,7 +162,7 @@ ansible-playbook -i inventory/inventory.ini -e dns_server='' cluster.yml --tags
And this prepares all container images locally (at the ansible runner node) without installing
or upgrading related stuff or trying to upload containers to K8s cluster nodes:
```
ansible-playbook -i inventory/inventory.ini cluster.yaml \
ansible-playbook -i inventory/inventory.ini cluster.yml \
-e download_run_once=true -e download_localhost=true \
--tags download --skip-tags upload,upgrade
```

22
docs/atomic.md Normal file
View File

@@ -0,0 +1,22 @@
Atomic host bootstrap
=====================
Atomic host testing has been done with the network plugin flannel. Change the inventory var `kube_network_plugin: flannel`.
Note: Flannel is the only plugin that has currently been tested with Atomic.
### Vagrant
* For bootstrapping with Vagrant, use box centos/atomic-host
* Update the Vagrantfile variable `local_release_dir` to `/var/vagrant/temp`.
* Update `vm_memory = 2048` and `vm_cpus = 2`
* Networking on vagrant hosts has to be brought up manually once they are booted.
```
vagrant ssh
sudo /sbin/ifup enp0s8
```
* For users of vagrant-libvirt, download the qcow2 image from https://wiki.centos.org/SpecialInterestGroup/Atomic/Download/
Then you can proceed to [cluster deployment](#run-deployment)

View File

@@ -3,7 +3,7 @@ AWS
To deploy kubespray on [AWS](https://aws.amazon.com/) uncomment the `cloud_provider` option in `group_vars/all.yml` and set it to `'aws'`.
Prior to creating your instances, you **must** ensure that you have created IAM roles and policies for both "kubernetes-master" and "kubernetes-node". You can find the IAM policies [here](https://github.com/kubernetes/kubernetes/tree/master/cluster/aws/templates/iam). See the [IAM Documentation](https://aws.amazon.com/documentation/iam/) if guidance is needed on how to set these up. When you bring your instances online, associate them with the respective IAM role. Nodes that are only to be used for Etcd do not need a role.
Prior to creating your instances, you **must** ensure that you have created IAM roles and policies for both "kubernetes-master" and "kubernetes-node". You can find the IAM policies [here](https://github.com/kubernetes-incubator/kargo/tree/master/contrib/aws_iam/). See the [IAM Documentation](https://aws.amazon.com/documentation/iam/) if guidance is needed on how to set these up. When you bring your instances online, associate them with the respective IAM role. Nodes that are only to be used for Etcd do not need a role.
The next step is to make sure the hostnames in your `inventory` file are identical to your internal hostnames in AWS. This may look something like `ip-111-222-333-444.us-west-2.compute.internal`. You can then specify how Ansible connects to these instances with `ansible_ssh_host` and `ansible_ssh_user`.
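For reference, enabling the provider boils down to a one-line change in your group vars; a minimal sketch of `inventory/group_vars/all.yml`, with everything else left at its defaults:
```
# inventory/group_vars/all.yml (excerpt)
# Uncommented so the cluster components integrate with AWS APIs
cloud_provider: 'aws'
```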

View File

@@ -1,7 +1,7 @@
Azure
===============
To deploy kubespray on [Azure](https://azure.microsoft.com) uncomment the `cloud_provider` option in `group_vars/all.yml` and set it to `'azure'`.
To deploy Kubernetes on [Azure](https://azure.microsoft.com) uncomment the `cloud_provider` option in `group_vars/all.yml` and set it to `'azure'`.
All your instances are required to run in a resource group and a routing table has to be attached to the subnet your instances are in.
@@ -49,8 +49,8 @@ This is the AppId from the last command
- Create the role assignment with:
`azure role assignment create --spn http://kubernetes -o "Owner" -c /subscriptions/SUBSCRIPTION_ID`
azure\_aad\_client\_id musst be set to the AppId, azure\_aad\_client\_secret is your choosen secret.
azure\_aad\_client\_id must be set to the AppId, azure\_aad\_client\_secret is your chosen secret.
## Provisioning Azure with Resource Group Templates
You'll find Resource Group Templates and scripts to provision the required infrastructore to Azure in [*contrib/azurerm*](../contrib/azurerm/README.md)
You'll find Resource Group Templates and scripts to provision the required infrastructure to Azure in [*contrib/azurerm*](../contrib/azurerm/README.md)
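As a rough sketch, those values end up in `group_vars/all.yml` alongside `cloud_provider`; every value below is a placeholder, not a working credential:
```
# inventory/group_vars/all.yml (excerpt, placeholder values)
cloud_provider: 'azure'
azure_tenant_id: "00000000-0000-0000-0000-000000000000"
azure_subscription_id: "00000000-0000-0000-0000-000000000000"
azure_aad_client_id: "11111111-1111-1111-1111-111111111111"   # the AppId from above
azure_aad_client_secret: "changeme"                           # the secret you chose
azure_resource_group: "my-resource-group"
azure_location: "westeurope"
azure_subnet_name: "my-subnet"
azure_security_group_name: "my-security-group"
azure_vnet_name: "my-vnet"
```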

View File

@@ -147,6 +147,16 @@ The inventory above will deploy the following topology assuming that calico's
![Image](figures/kargo-calico-rr.png?raw=true)
##### Optional : Define default endpoint to host action
By default Calico blocks traffic from endpoints to the host itself by using an iptables DROP action. When using it in kubernetes the action has to be changed to RETURN (default in kargo) or ACCEPT (see https://github.com/projectcalico/felix/issues/660 and https://github.com/projectcalico/calicoctl/issues/1389). Otherwise all network packets from pods (with hostNetwork=False) to service endpoints (with hostNetwork=True) within the same node are dropped.
To re-define default action please set the following variable in your inventory:
```
calico_endpoint_to_host_action: "ACCEPT"
```
Cloud providers configuration
=============================

View File

@@ -13,12 +13,4 @@ Before running the cluster playbook you must satisfy the following requirements:
* On each CoreOS node, a writable directory **/opt/bin** (~400M disk space)
* Uncomment the variable **ansible\_python\_interpreter** in the file `inventory/group_vars/all.yml`
* run the Python bootstrap playbook
```
ansible-playbook -u smana -e ansible_ssh_user=smana -b --become-user=root -i inventory/inventory.cfg coreos-bootstrap.yml
```
Then you can proceed to [cluster deployment](#run-deployment)

View File

@@ -1,10 +1,10 @@
Getting started
===============
The easiest way to run the deployment is to use the **kargo-cli** tool.
Complete documentation can be found in its [github repository](https://github.com/kubespray/kargo-cli).
Here is a simple example on AWS:
* Create instances and generate the inventory
@@ -12,21 +12,46 @@ Here is a simple example on AWS:
kargo aws --instances 3
```
* Run the deployment
```
kargo deploy --aws -u centos -n calico
```
Building your own inventory
^^^^^^^^^^^^^^^^^^^^^^^^^^^
---------------------------
Ansible inventory can be stored in 3 formats: YAML, JSON, or inifile. There is
Ansible inventory can be stored in 3 formats: YAML, JSON, or INI-like. There is
an example inventory located
[here](https://github.com/kubernetes-incubator/kargo/blob/master/inventory/inventory.example).
You can use an
[inventory generator](https://github.com/kubernetes-incubator/kargo/blob/master/contrib/inventory_generator/inventory_generator.py)
[inventory generator](https://github.com/kubernetes-incubator/kargo/blob/master/contrib/inventory_builder/inventory.py)
to create or modify an Ansible inventory. Currently, it is limited in
functionality and is only used for making a basic Kargo cluster, but it does
support creating large clusters.
support creating large clusters. It now supports
separating etcd and Kubernetes master roles from the node role if the size exceeds a
certain threshold. Run `inventory.py help` for more information.
Example inventory generator usage:
```
cp -r inventory my_inventory
declare -a IPS=(10.10.1.3 10.10.1.4 10.10.1.5)
CONFIG_FILE=my_inventory/inventory.cfg python3 contrib/inventory_builder/inventory.py ${IPS}
```
Starting custom deployment
--------------------------
Once you have an inventory, you may want to customize deployment data vars
and start the deployment:
**IMPORTANT: Edit my_inventory/group_vars/*.yaml to override data vars**
```
ansible-playbook -i my_inventory/inventory.cfg cluster.yml -b -v \
--private-key=~/.ssh/private_key
```
See more details in the [ansible guide](ansible.md).
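To illustrate overriding data vars, a hedged example of what such an edit could look like (the values are illustrative, not recommendations):
```
# my_inventory/group_vars/all.yml (example overrides)
bootstrap_os: coreos          # ubuntu, coreos or none
kube_network_plugin: calico   # calico, weave or flannel
deploy_netchecker: true       # deploy the network checker app
```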

View File

@@ -11,18 +11,11 @@ achieve the same goal.
Etcd
----
Etcd proxies are deployed on each node in the `k8s-cluster` group. A proxy is
a separate etcd process. It has a `localhost:2379` frontend and all of the etcd
cluster members as backends. Note that the `access_ip` is used as the backend
IP, if specified. Frontend endpoints cannot be accessed externally as they are
bound to localhost only.
The `etcd_access_endpoint` fact provides an access pattern for clients. And the
`etcd_multiaccess` (defaults to `false`) group var controlls that behavior.
When enabled, it makes deployed components to access the etcd cluster members
`etcd_multiaccess` (defaults to `True`) group var controls that behavior.
It makes deployed components access the etcd cluster members
directly: `http://ip1:2379, http://ip2:2379,...`. This mode assumes the clients
do a loadbalancing and handle HA for connections. Note, a pod definition of a
flannel networking plugin always uses a single `--etcd-server` endpoint!
do their own loadbalancing and handle HA for connections.
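The behavior is controlled from the inventory group vars; a minimal sketch:
```
# inventory/group_vars/all.yml (excerpt)
etcd_multiaccess: true   # clients access all etcd members directly
```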
Kube-apiserver
@@ -33,15 +26,20 @@ proxy. Kargo includes support for an nginx-based proxy that resides on each
non-master Kubernetes node. This is referred to as localhost loadbalancing. It
is less efficient than a dedicated load balancer because it creates extra
health checks on the Kubernetes apiserver, but is more practical for scenarios
where an external LB or virtual IP management is inconvenient.
where an external LB or virtual IP management is inconvenient. This option is
configured by the variable `loadbalancer_apiserver_localhost` (defaults to `True`).
You may also define the port the local internal loadbalancer uses by changing
`nginx_kube_apiserver_port`. This defaults to the value of `kube_apiserver_port`.
It is also important to note that Kargo will only configure kubelet and kube-proxy
on non-master nodes to use the local internal loadbalancer.
This option is configured by the variable `loadbalancer_apiserver_localhost`.
you will need to configure your own loadbalancer to achieve HA. Note that
deploying a loadbalancer is up to a user and is not covered by ansible roles
in Kargo. By default, it only configures a non-HA endpoint, which points to
the `access_ip` or IP address of the first server node in the `kube-master`
group. It can also configure clients to use endpoints for a given loadbalancer
type. The following diagram shows how traffic to the apiserver is directed.
If you choose to NOT use the local internal loadbalancer, you will need to configure
your own loadbalancer to achieve HA. Note that deploying a loadbalancer is up to
a user and is not covered by ansible roles in Kargo. By default, it only configures
a non-HA endpoint, which points to the `access_ip` or IP address of the first server
node in the `kube-master` group. It can also configure clients to use endpoints
for a given loadbalancer type. The following diagram shows how traffic to the
apiserver is directed.
![Image](figures/loadbalancer_localhost.png?raw=true)
@@ -63,8 +61,8 @@ listen kubernetes-apiserver-https
mode tcp
timeout client 3h
timeout server 3h
server master1 <IP1>:443
server master2 <IP2>:443
server master1 <IP1>:6443
server master2 <IP2>:6443
balance roundrobin
```
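To point Kargo's clients at such an external balancer, the inventory usually defines the loadbalancer endpoint vars; a sketch, with a placeholder FQDN:
```
# inventory/group_vars/all.yml (excerpt)
loadbalancer_apiserver_localhost: false
apiserver_loadbalancer_domain_name: "apiserver.example.com"   # placeholder FQDN of your LB
loadbalancer_apiserver:
  port: 443   # the 'lp' port in the table below
```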
@@ -90,16 +88,18 @@ Access endpoints are evaluated automagically, as the following:
| Endpoint type | kube-master | non-master |
|------------------------------|---------------|---------------------|
| Local LB | http://lc:p | https://lc:sp |
| Local LB (default) | http://lc:p | https://lc:nsp |
| External LB, no internal | https://lb:lp | https://lb:lp |
| No ext/int LB (default) | http://lc:p | https://m[0].aip:sp |
| No ext/int LB | http://lc:p | https://m[0].aip:sp |
Where:
* `m[0]` - the first node in the `kube-master` group;
* `lb` - LB FQDN, `apiserver_loadbalancer_domain_name`;
* `lc` - localhost;
* `p` - insecure port, `kube_apiserver_insecure_port`
* `nsp` - nginx secure port, `nginx_kube_apiserver_port`;
* `sp` - secure port, `kube_apiserver_port`;
* `lp` - LB port, `loadbalancer_apiserver.port`, defers to the secure port;
* `ip` - the node IP, defers to the ansible IP;
* `aip` - `access_ip`, defers to the ip.

View File

@@ -0,0 +1,103 @@
# Overview
Distributed systems such as Kubernetes are designed to be resilient to
failures. More details about Kubernetes High-Availability (HA) may be found at
[Building High-Availability Clusters](https://kubernetes.io/docs/admin/high-availability/)
To keep the view simple, most HA details are skipped below in order to describe
only the Kubelet<->Controller Manager communication.
By default the normal behavior looks like:
1. Kubelet updates its status to the apiserver periodically, as specified by
`--node-status-update-frequency`. The default value is **10s**.
2. Kubernetes controller manager checks the statuses of Kubelets every
`--node-monitor-period`. The default value is **5s**.
3. In case the status is updated within `--node-monitor-grace-period` of time,
Kubernetes controller manager considers healthy status of Kubelet. The
default value is **40s**.
> Kubernetes controller manager and Kubelets work asynchronously. It means that
> the delay may include any network latency, API Server latency, etcd latency,
> latency caused by load on one's master nodes and so on. So if
> `--node-status-update-frequency` is set to 5s in reality it may appear in
> etcd in 6-7 seconds or even longer when etcd cannot commit data to quorum
> nodes.
# Failure
Kubelet will try to make `nodeStatusUpdateRetry` post attempts. Currently
`nodeStatusUpdateRetry` is a constant set to 5 in
[kubelet.go](https://github.com/kubernetes/kubernetes/blob/release-1.5/pkg/kubelet/kubelet.go#L102).
Kubelet will try to update the status in
[tryUpdateNodeStatus](https://github.com/kubernetes/kubernetes/blob/release-1.5/pkg/kubelet/kubelet_node_status.go#L345)
function. Kubelet uses Golang's `http.Client()`, but specifies no
timeout. Thus there may be glitches when the API Server is overloaded while
the TCP connection is being established.
So, there will be `nodeStatusUpdateRetry` * `--node-status-update-frequency`
attempts to set the status of a node.
At the same time Kubernetes controller manager will try to check
`nodeStatusUpdateRetry` times every `--node-monitor-period` of time. After
`--node-monitor-grace-period` it will consider the node unhealthy. It will remove
its pods based on `--pod-eviction-timeout`.
Kube proxy has a watcher over the API. Once pods are evicted, Kube proxy will
notice and update the node's iptables. It will remove endpoints from
services, so pods from the failed node won't be accessible anymore.
# Recommendations for different cases
## Fast Update and Fast Reaction
If `--node-status-update-frequency` is set to **4s** (10s is default).
`--node-monitor-period` to **2s** (5s is default).
`--node-monitor-grace-period` to **20s** (40s is default).
`--pod-eviction-timeout` is set to **30s** (5m is default)
In such a scenario, pods will be evicted in **50s** because the node will be
considered down after **20s**, and `--pod-eviction-timeout` occurs after
**30s** more. However, this scenario creates an overhead on etcd as every node
will try to update its status every 2 seconds.
If the environment has 1000 nodes, there will be 15000 node updates per
minute which may require large etcd containers or even dedicated nodes for etcd.
> If we calculate the number of tries, the division will give 5, but in reality
> it will be from 3 to 5 with `nodeStatusUpdateRetry` attempts of each try. The
> total number of attempts will vary from 15 to 25 due to latency of all
> components.
## Medium Update and Average Reaction
Let's set `--node-status-update-frequency` to **20s**
`--node-monitor-grace-period` to **2m** and `--pod-eviction-timeout` to **1m**.
In that case, Kubelet will try to update the status every 20s. So, it will be 6 * 5
= 30 attempts before the Kubernetes controller manager considers the node
unhealthy. After 1m more it will evict all pods. The total time will be 3m
before the eviction process starts.
Such a scenario is good for medium environments as 1000 nodes will require 3000
etcd updates per minute.
> In reality, there will be from 4 to 6 node update tries. The total number
> of attempts will vary from 20 to 30.
## Low Update and Slow reaction
Let's set `--node-status-update-frequency` to **1m**,
`--node-monitor-grace-period` to **5m** and `--pod-eviction-timeout`
to **1m**. In this scenario, every kubelet will try to update the status every
minute. There will be 5 * 5 = 25 attempts before an unhealthy status. After 5m,
the Kubernetes controller manager will set the unhealthy status. Pods
will be evicted 1m after the node is marked unhealthy (6m in total).
> In reality, there will be from 3 to 5 tries. The total number of attempts will
> vary from 15 to 25.
There can be different combinations such as Fast Update with Slow reaction to
satisfy specific cases.
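In Kargo, these flags are usually driven through the inventory variables named in the large-deployments guide; below is a sketch of the "Medium Update and Average Reaction" profile, assuming those vars take the flag values verbatim (where exactly each var lives depends on your inventory layout):
```
# inventory/group_vars/k8s-cluster.yml (excerpt, 'medium' profile sketch)
kubelet_status_update_frequency: 20s
kube_controller_node_monitor_period: 5s
kube_controller_node_monitor_grace_period: 2m
kube_controller_pod_eviction_timeout: 1m
```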

View File

@@ -3,7 +3,8 @@ Large deployments of K8s
For large scale deployments, consider the following configuration changes:
* Tune [ansible settings](http://docs.ansible.com/ansible/intro_configuration.html)
for `forks` and `timeout` vars to fit large numbers of nodes being deployed.
* Override containers' `foo_image_repo` vars to point to intranet registry.
@@ -23,9 +24,25 @@ For a large scaled deployments, consider the following configuration changes:
* Tune CPU/memory limits and requests. Those are located in roles' defaults
and named like ``foo_memory_limit``, ``foo_memory_requests`` and
``foo_cpu_limit``, ``foo_cpu_requests``. Note that 'Mi' memory units for K8s
will be submitted as 'M', if applied for ``docker run``, and cpu K8s units
will end up with the 'm' skipped for docker as well. This is required as
docker does not understand k8s units well.
* Tune ``kubelet_status_update_frequency`` to increase reliability of kubelet.
``kube_controller_node_monitor_grace_period``,
``kube_controller_node_monitor_period``,
``kube_controller_pod_eviction_timeout`` for better Kubernetes reliability.
Check out [Kubernetes Reliability](kubernetes-reliability.md)
* Add calico-rr nodes if you are deploying with Calico or Canal. Nodes recover
from host/network interruption much quicker with calico-rr. Note that
calico-rr role must be on a host without kube-master or kube-node role (but
etcd role is okay).
* Check out the
[Inventory](getting-started.md#building-your-own-inventory)
section of the Getting started guide for tips on creating a large scale
Ansible inventory.
For example, when deploying 200 nodes, you may want to run ansible with
``--forks=50``, ``--timeout=600`` and define the ``retry_stagger: 60``.
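Putting some of this together for that 200-node run: `--forks=50` and `--timeout=600` go on the ansible-playbook command line, while the rest lands in group vars. A sketch follows; `etcd_image_repo` is only an illustrative stand-in for one of the `foo_image_repo` vars, so check the role defaults for the real names:
```
# inventory/group_vars/all.yml (excerpt)
retry_stagger: 60                                 # random shifts for retrying failed ops
etcd_image_repo: "registry.example.local/etcd"    # hypothetical intranet mirror
```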

View File

@@ -2,7 +2,7 @@ Network Checker Application
===========================
With the ``deploy_netchecker`` var enabled (defaults to false), Kargo deploys a
Network Checker Application from the 3rd party `l23network/mcp-netchecker` docker
Network Checker Application from the 3rd party `l23network/k8s-netchecker` docker
images. It consists of a server and agents that try to reach the server through
the usual means of network connectivity available to Kubernetes applications.
Therefore, this automagically verifies pod-to-pod connectivity via the cluster IP and checks
@@ -25,8 +25,8 @@ There are related application specific variables:
netchecker_port: 31081
agent_report_interval: 15
netcheck_namespace: default
agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1"
server_img: "quay.io/l23network/mcp-netchecker-server:v0.1"
agent_img: "quay.io/l23network/k8s-netchecker-agent:v1.0"
server_img: "quay.io/l23network/k8s-netchecker-server:v1.0"
```
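For instance, enabling the checker is a minimal group_vars change (a sketch
using the variables listed above; values shown are the documented defaults
apart from ``deploy_netchecker``):
```
deploy_netchecker: true   # defaults to false
netcheck_namespace: default
agent_report_interval: 15
```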
Note that the application verifies DNS resolve for FQDNs comprising only the

View File

@@ -5,7 +5,7 @@ Kargo's roadmap
- Propose kubeadm as an option in order to setup the kubernetes cluster.
That would probably improve deployment speed and certs management [#553](https://github.com/kubespray/kargo/issues/553)
### Self deployment (pull-mode) [#320](https://github.com/kubespray/kargo/issues/320)
- the playbook would install and configure docker/rkt and the etcd cluster
- the following data would be inserted into etcd: certs,tokens,users,inventory,group_vars.
- a "kubespray" container would be deployed (kargo-cli, ansible-playbook, kpm)
@@ -13,42 +13,43 @@ That would probably improve deployment speed and certs management [#553](https:/
- **self deployment** of the node from inside a container [#321](https://github.com/kubespray/kargo/issues/321)
### Provisioning and cloud providers
- Terraform to provision instances on **GCE, AWS, Openstack, Digital Ocean, Azure**
- On AWS autoscaling, multi AZ
- On Azure autoscaling, create loadbalancer [#297](https://github.com/kubespray/kargo/issues/297)
- On GCE be able to create a loadbalancer automatically (IAM ?) [#280](https://github.com/kubespray/kargo/issues/280)
- **TLS bootstrap** support for kubelet [#234](https://github.com/kubespray/kargo/issues/234)
(related issues: https://github.com/kubernetes/kubernetes/pull/20439 <br>
- [ ] Terraform to provision instances on **GCE, AWS, Openstack, Digital Ocean, Azure**
- [ ] On AWS autoscaling, multi AZ
- [ ] On Azure autoscaling, create loadbalancer [#297](https://github.com/kubespray/kargo/issues/297)
- [ ] On GCE be able to create a loadbalancer automatically (IAM ?) [#280](https://github.com/kubespray/kargo/issues/280)
- [x] **TLS bootstrap** support for kubelet [#234](https://github.com/kubespray/kargo/issues/234)
(related issues: https://github.com/kubernetes/kubernetes/pull/20439 <br>
https://github.com/kubernetes/kubernetes/issues/18112)
### Tests
- Run kubernetes e2e tests
- migrate to jenkins
- [x] Run kubernetes e2e tests
- [x] migrate to jenkins
(a test is currently a deployment on a 3 node cluster, testing k8s api, ping between 2 pods)
- Full tests on GCE per day (All OS's, all network plugins)
- trigger a single test per pull request
- single test with the Ansible version n-1 per day
- Test idempotency on a single OS but for all network plugins/container engines
- single test on AWS per day
- test different architectures:
- [x] Full tests on GCE per day (All OS's, all network plugins)
- [x] trigger a single test per pull request
- [ ] ~~single test with the Ansible version n-1 per day~~
- [x] Test idempotency on a single OS but for all network plugins/container engines
- [ ] single test on AWS per day
- [x] test different architectures:
- 3 instances, 3 are members of the etcd cluster, 2 of them acting as master and node, 1 as node
- 5 instances, 3 are etcd and nodes, 2 are masters only
- 7 instances, 3 etcd only, 2 masters, 2 nodes
- test scale up cluster: +1 etcd, +1 master, +1 node
- [ ] test scale up cluster: +1 etcd, +1 master, +1 node
### Lifecycle
- Adopt the kubeadm tool by delegating CM tasks it is capable of accomplishing well [#553](https://github.com/kubespray/kargo/issues/553)
- Drain worker node when upgrading k8s components in a worker node. [#154](https://github.com/kubespray/kargo/issues/154)
- Drain worker node when shutting down/deleting an instance
- [ ] Adopt the kubeadm tool by delegating CM tasks it is capable of accomplishing well [#553](https://github.com/kubespray/kargo/issues/553)
- [x] Drain worker node when upgrading k8s components in a worker node. [#154](https://github.com/kubespray/kargo/issues/154)
- [ ] Drain worker node when shutting down/deleting an instance
- [ ] Upgrade granularity: select components to upgrade and skip others
### Networking
- romana.io support [#160](https://github.com/kubespray/kargo/issues/160)
- Configure network policy for Calico. [#159](https://github.com/kubespray/kargo/issues/159)
- Opencontrail
- Canal
- Cloud Provider native networking (instead of our network plugins)
- [ ] romana.io support [#160](https://github.com/kubespray/kargo/issues/160)
- [ ] Configure network policy for Calico. [#159](https://github.com/kubespray/kargo/issues/159)
- [ ] Opencontrail
- [x] Canal
- [x] Cloud Provider native networking (instead of our network plugins)
### High availability
- (to be discussed) option to set a loadbalancer for the apiservers like ucarp/pacemaker/keepalived
While waiting for the issue [kubernetes/kubernetes#18174](https://github.com/kubernetes/kubernetes/issues/18174) to be fixed.
@@ -72,7 +73,7 @@ Include optional deployments to init the cluster:
##### Others
##### Dashboards:
- kubernetes-dashboard
- Fabric8
- Tectonic

View File

@@ -4,25 +4,40 @@ Travis CI test matrix
GCE instances
-------------
Here is the test matrix for the Travis CI gates:
Here is the test matrix for the CI gates:
| Network plugin| OS type| GCE region| Nodes layout|
|-------------------------|-------------------------|-------------------------|-------------------------|
| canal| debian-8-kubespray| asia-east1-a| ha|
| canal| debian-8-kubespray| asia-east1-a| ha-scale|
| calico| debian-8-kubespray| europe-west1-c| default|
| flannel| centos-7| asia-northeast1-c| default|
| calico| centos-7| us-central1-b| ha|
| weave| rhel-7| us-east1-c| default|
| canal| coreos-stable| us-west1-b| default|
| canal| coreos-stable| us-west1-b| ha-scale|
| canal| rhel-7| asia-northeast1-b| separate|
| weave| ubuntu-1604-xenial| europe-west1-d| separate|
| calico| coreos-stable| us-central1-f| separate|
Where the nodes layout `default` is a non-HA two nodes setup with the separate `kube-node`
and the `etcd` group merged with the `kube-master`. The `separate` layout is when
there is only node of each type, which is a kube master, compute and etcd cluster member.
And the `ha` layout stands for a two etcd nodes, two masters and a single worker node,
partially intersecting though.
Node Layouts
------------
There are four node layout types: `default`, `separate`, `ha`, and `scale`.
`default` is a non-HA two nodes setup with one separate `kube-node`
and the `etcd` group merged with the `kube-master`.
`separate` layout is when there is only one node of each type, which includes
a kube-master, kube-node, and etcd cluster member.
`ha` layout consists of two etcd nodes, two masters and a single worker node,
with role intersection.
`scale` layout can be combined with the above layouts. It includes 200 fake hosts
in the Ansible inventory. This helps test TLS certificate generation at scale
to prevent regressions and profile certain long-running tasks. These nodes are
never actually deployed, but certificates are generated for them.
Note that the canal network plugin deploys flannel as well as the calico policy controller.
@@ -40,15 +55,15 @@ GCE instances
| Stage| Network plugin| OS type| GCE region| Nodes layout
|--------------------|--------------------|--------------------|--------------------|--------------------|
| part1| calico| coreos-stable| us-west1-b| separated|
| part1| calico| coreos-stable| us-west1-b| separate|
| part1| canal| debian-8-kubespray| us-east1-b| ha|
| part1| weave| rhel-7| europe-west1-b| default|
| part2| flannel| centos-7| us-west1-a| default|
| part2| calico| debian-8-kubespray| us-central1-b| default|
| part2| canal| coreos-stable| us-east1-b| default|
| special| canal| rhel-7| us-east1-b| separated|
| special| weave| ubuntu-1604-xenial| us-central1-b| separated|
| special| calico| centos-7| europe-west1-b| ha|
| special| weave| coreos-alpha| us-west1-a| ha|
| special| canal| rhel-7| us-east1-b| separate|
| special| weave| ubuntu-1604-xenial| us-central1-b| default|
| special| calico| centos-7| europe-west1-b| ha-scale|
| special| weave| coreos-alpha| us-west1-a| ha-scale|
"Stage" refers to a step of the build pipeline. The steps are ordered as `part1->part2->special`.

View File

@@ -18,7 +18,7 @@ versions. Here are all version vars for each component:
* flannel_version
* kubedns_version
#### Example
#### Unsafe upgrade example
If you wanted to upgrade just kube_version from v1.4.3 to v1.4.6, you could
deploy as follows:
@@ -33,15 +33,29 @@ And then repeat with v1.4.6 as kube_version:
ansible-playbook cluster.yml -i inventory/inventory.cfg -e kube_version=v1.4.6
```
#### Graceful upgrade
Kargo also supports cordoning, draining and uncordoning of nodes when performing
a cluster upgrade. There is a separate playbook used for this purpose. It is
important to note that upgrade-cluster.yml can only be used for upgrading an
existing cluster; that means there must be at least one kube-master already
deployed.
```
git fetch origin
git checkout origin/master
ansible-playbook upgrade-cluster.yml -b -i inventory/inventory.cfg
```
#### Upgrade order
As mentioned above, components are upgraded in the order in which they were
installed in the Ansible playbook. The order of component installation is as
follows:
# Docker
# etcd
# kubelet and kube-proxy
# network_plugin (such as Calico or Weave)
# kube-apiserver, kube-scheduler, and kube-controller-manager
# Add-ons (such as KubeDNS)
* Docker
* etcd
* kubelet and kube-proxy
* network_plugin (such as Calico or Weave)
* kube-apiserver, kube-scheduler, and kube-controller-manager
* Add-ons (such as KubeDNS)

View File

@@ -23,7 +23,7 @@ Some variables of note include:
* *hyperkube_image_repo* - Specify the Docker repository where Hyperkube
resides
* *hyperkube_image_tag* - Specify the Docker tag where Hyperkube resides
* *kube_network_plugin* - Changes k8s plugin to Calico
* *kube_network_plugin* - Sets k8s network plugin (default Calico)
* *kube_proxy_mode* - Changes k8s proxy mode to iptables mode
* *kube_version* - Specify a given Kubernetes hyperkube version
* *searchdomains* - Array of DNS domains to search when looking up hostnames
@@ -41,8 +41,9 @@ Some variables of note include:
address instead of localhost for kube-masters and kube-master[0] for
kube-nodes. See more details in the
[HA guide](https://github.com/kubernetes-incubator/kargo/blob/master/docs/ha-mode.md).
* *loadbalancer_apiserver_localhost* - If enabled, all hosts will connect to
the apiserver internally load balanced endpoint. See more details in the
* *loadbalancer_apiserver_localhost* - makes all hosts connect to the
internally load balanced apiserver endpoint. Mutually exclusive with
`loadbalancer_apiserver`. See more details in the
[HA guide](https://github.com/kubernetes-incubator/kargo/blob/master/docs/ha-mode.md).
#### Cluster variables
@@ -92,9 +93,12 @@ Stack](https://github.com/kubernetes-incubator/kargo/blob/master/docs/dns-stack.
``--insecure-registry=myregistry.mydomain:5000``
* *http_proxy/https_proxy/no_proxy* - Proxy variables for deploying behind a
proxy
* *kubelet_load_modules* - For some setups, kubelet needs to load kernel
modules, e.g. dynamic kernel services needed for mounting persistent volumes
into containers (such as ceph and rbd backed volumes). These may not be loaded
by preinstall kubernetes processes. Set this variable to true to let kubelet
load kernel modules.
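As a hedged sketch, a few of the variables above might be set in an inventory
group_vars file like this (values are illustrative only):
```
kube_network_plugin: calico              # k8s network plugin (default)
kube_version: v1.5.3                     # hyperkube version to deploy
kubelet_load_modules: true               # let kubelet load kernel modules
loadbalancer_apiserver_localhost: true   # use the internal LB endpoint
```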
#### User accounts
Kargo sets up two Kubernetes accounts by default: ``root`` and ``kube``. Their
passwords default to ``changeme``. You can change them by setting ``kube_api_pwd``.

docs/vault.md Normal file
View File

@@ -0,0 +1,92 @@
Hashicorp Vault Role
====================
Overview
--------
The Vault role is a two-step process:
1. Bootstrap
You cannot start your certificate management service securely with SSL (and
the datastore behind it) without having the certificates in-hand already. This
presents an unfortunate chicken and egg scenario, with one requiring the other.
To solve this, the Bootstrap step was added.
This step spins up a temporary instance of Vault to issue certificates for
Vault itself. It then leaves the temporary instance running, so that the Etcd
role can generate certs for itself as well. Eventually, this may be improved
to allow alternate backends (such as Consul), but currently the tasks are
hardcoded to only create a Vault role for Etcd.
2. Cluster
This step is where the long-term Vault cluster is started and configured. Its
first task is to stop any temporary instances of Vault, to free the port for
the long-term cluster. At the end of this task, the entire Vault cluster should
be up and ready to go.
Keys to the Kingdom
-------------------
The two most important security pieces of Vault are the ``root_token``
and ``unsealing_keys``. Both of these values are given exactly once, during
the initialization of the Vault cluster. For convenience, they are saved
to the ``vault_secret_dir`` (default: /etc/vault/secrets) of every host in the
vault group.
It is *highly* recommended that these secrets are removed from the servers after
your cluster has been deployed, and kept in a safe location of your choosing.
Naturally, the seriousness of the situation depends on what you're doing with
your Kargo cluster, but with these secrets, an attacker will have the ability
to authenticate to almost everything in Kubernetes and decode all private
(HTTPS) traffic on your network signed by Vault certificates.
For even greater security, you may want to remove and store elsewhere any
CA keys generated as well (e.g. /etc/vault/ssl/ca-key.pem).
Vault by default encrypts all traffic to and from the datastore backend, all
resting data, and uses TLS for its TCP listener. It is recommended that you
do not change the Vault config to disable TLS, unless you absolutely have to.
Usage
-----
To get the Vault role running, you must do two things at a minimum:
1. Assign the ``vault`` group to at least 1 node in your inventory
2. Change ``cert_management`` to be ``vault`` instead of ``script``
Nothing else is required, but customization is possible. Check
``roles/vault/defaults/main.yml`` for the different variables that can be
overridden, most common being ``vault_config``, ``vault_port``, and
``vault_deployment_type``.
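For example, a minimal sketch of step 2 as a group_vars override
(``cert_management`` and ``vault_deployment_type`` are the documented
variables; this assumes step 1 has already placed at least one host in the
``vault`` inventory group):
```
cert_management: vault
vault_deployment_type: docker   # documented default, shown for illustration
```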
Also, if you intend to use a Root or Intermediate CA generated elsewhere,
you'll need to copy the certificate and key to the hosts in the vault group
prior to running the vault role. By default, they'll be located at
``/etc/vault/ssl/ca.pem`` and ``/etc/vault/ssl/ca-key.pem``, respectively.
Additional Notes:
- ``groups.vault|first`` is considered the source of truth for Vault variables
- ``vault_leader_url`` is used as a pointer to the currently running Vault
- Each service should have its own role and credentials. Currently those
credentials are saved to ``/etc/vault/roles/<role>/``. The service will
need to read in those credentials if it wants to interact with Vault.
Potential Work
--------------
- Change the Vault role to not run certain tasks when ``root_token`` and
``unseal_keys`` are not present. Alternatively, allow user input for these
values when missing.
- Add the ability to start temp Vault with Host, Rkt, or Docker
- Add a dynamic way to change out the backend role creation during Bootstrap,
so other services can be used (such as Consul)
- Segregate Server Cert generation from Auth Cert generation (separate CAs).
This work was partially started with the `auth_cert_backend` tasks, but would
need to be further applied to all roles (particularly Etcd and Kubernetes).

View File

@@ -1,165 +1,59 @@
# Valid bootstrap options (required): ubuntu, coreos, centos, none
bootstrap_os: none
## The access_ip variable is used to define how other nodes should access
## the node. This is used in flannel to allow other flannel nodes to see
## this node for example. The access_ip is really useful in AWS and Google
## environments where the nodes are accessed remotely by the "public" ip,
## but don't know about that address themselves.
#access_ip: 1.1.1.1
# Directory where the binaries will be installed
bin_dir: /usr/local/bin
### LOADBALANCING AND ACCESS MODES
## Enable multiaccess to configure etcd clients to access all of the etcd members directly
## as the "http://hostX:port, http://hostY:port, ..." and ignore the proxy loadbalancers.
## This may be the case if clients support and loadbalance multiple etcd servers natively.
#etcd_multiaccess: true
# Kubernetes configuration dirs and system namespace.
# Those are where all the additional config stuff goes
# the kubernetes normally puts in /srv/kubernetes.
# This puts them in a sane location and namespace.
# Editing those values will almost surely break something.
kube_config_dir: /etc/kubernetes
kube_script_dir: "{{ bin_dir }}/kubernetes-scripts"
kube_manifest_dir: "{{ kube_config_dir }}/manifests"
system_namespace: kube-system
## External LB example config
## apiserver_loadbalancer_domain_name: "elb.some.domain"
#loadbalancer_apiserver:
# address: 1.2.3.4
# port: 1234
# This is where all the cert scripts and certs will be located
kube_cert_dir: "{{ kube_config_dir }}/ssl"
## Internal loadbalancers for apiservers
#loadbalancer_apiserver_localhost: true
# This is where all of the bearer tokens will be stored
kube_token_dir: "{{ kube_config_dir }}/tokens"
## Local loadbalancer should use this port instead, if defined.
## Defaults to kube_apiserver_port (6443)
#nginx_kube_apiserver_port: 8443
# This is where to save basic auth file
kube_users_dir: "{{ kube_config_dir }}/users"
### OTHER OPTIONAL VARIABLES
## For some setups, kubelet needs to load kernel modules, e.g. dynamic kernel services needed
## for mounting persistent volumes into containers (such as ceph and rbd backed volumes).
## These may not be loaded by preinstall kubernetes processes. Set to true to allow kubelet
## to load kernel modules.
# kubelet_load_modules: false
## Change this to use another Kubernetes version, e.g. a current beta release
kube_version: v1.5.1
## Internal network total size. This is the prefix of the
## entire network. Must be unused in your environment.
#kube_network_prefix: 18
# Where the binaries will be downloaded.
# Note: ensure that you've enough disk space (about 1G)
local_release_dir: "/tmp/releases"
# Random shifts for retrying failed ops like pushing/downloading
retry_stagger: 5
# Uncomment this line for CoreOS only.
# Directory where python binary is installed
# ansible_python_interpreter: "/opt/bin/python"
# This is the group that the cert creation scripts chgrp the
# cert files to. Not really changeable...
kube_cert_group: kube-cert
# Cluster Loglevel configuration
kube_log_level: 2
# Kubernetes 1.5 added a new flag to the apiserver to disable anonymous auth. In previous versions, anonymous auth was
# not implemented. As the new flag defaults to true, we have to explicitly disable it. Change this line if you want the
# 1.5 default behavior. The flag is actually only added if the kubernetes version used is >= 1.5
kube_api_anonymous_auth: false
# Users to create for basic auth in Kubernetes API via HTTP
kube_api_pwd: "changeme"
kube_users:
  kube:
    pass: "{{kube_api_pwd}}"
    role: admin
  root:
    pass: "changeme"
    role: admin
# Kubernetes cluster name, also will be used as DNS domain
cluster_name: cluster.local
# Subdomains of DNS domain to be resolved via /etc/resolv.conf for hostnet pods
ndots: 2
# Deploy netchecker app to verify DNS resolve as an HTTP service
deploy_netchecker: false
# For some environments, each node has a publicly accessible
# address and an address it should bind services to. These are
# really inventory level variables, but described here for consistency.
#
# When advertising access, the access_ip will be used, but will defer to
# ip and then the default ansible ip when unspecified.
#
# When binding to restrict access, the ip variable will be used, but will
# defer to the default ansible ip when unspecified.
#
# The ip variable is used for specific address binding, e.g. listen address
# for etcd. This is used to help with environments like Vagrant or multi-nic
# systems where one address should be preferred over another.
# ip: 10.2.2.2
#
# The access_ip variable is used to define how other nodes should access
# the node. This is used in flannel to allow other flannel nodes to see
# this node for example. The access_ip is really useful in AWS and Google
# environments where the nodes are accessed remotely by the "public" ip,
# but don't know about that address themselves.
# access_ip: 1.1.1.1
# Etcd access modes:
# Enable multiaccess to configure clients to access all of the etcd members directly
# as the "http://hostX:port, http://hostY:port, ..." and ignore the proxy loadbalancers.
# This may be the case if clients support and loadbalance multiple etcd servers natively.
etcd_multiaccess: true
# Assume no internal loadbalancers for apiservers exist and listen on
# kube_apiserver_port (default 443)
loadbalancer_apiserver_localhost: true
# Choose network plugin (calico, weave or flannel)
# Can also be set to 'cloud', which lets the cloud provider setup appropriate routing
kube_network_plugin: flannel
# Kubernetes internal network for services, unused block of space.
kube_service_addresses: 10.233.0.0/18
# internal network. When used, it will assign IP
# addresses from this range to individual pods.
# This network must be unused in your network infrastructure!
kube_pods_subnet: 10.233.64.0/18
# internal network total size (optional). This is the prefix of the
# entire network. Must be unused in your environment.
# kube_network_prefix: 18
# internal network node size allocation (optional). This is the size allocated
# to each node on your network. With these defaults you should have
# room for 4096 nodes with 254 pods per node.
kube_network_node_prefix: 24
# With calico it is possible to distribute routes with border routers of the datacenter.
peer_with_router: false
# Warning : enabling router peering will disable calico's default behavior ('node mesh').
# The subnets of each node will be distributed by the datacenter router
# The port the API Server will be listening on.
kube_apiserver_ip: "{{ kube_service_addresses|ipaddr('net')|ipaddr(1)|ipaddr('address') }}"
kube_apiserver_port: 443 # (https)
kube_apiserver_insecure_port: 8080 # (http)
# Internal DNS configuration.
# Kubernetes can create and maintain its own DNS server to resolve service names
# into appropriate IP addresses. It's highly advisable to run such a DNS server,
# as it greatly simplifies configuration of your applications - you can use
# service names instead of magic environment variables.
# Can be dnsmasq_kubedns, kubedns or none
dns_mode: dnsmasq_kubedns
# Can be docker_dns, host_resolvconf or none
resolvconf_mode: docker_dns
## With calico it is possible to distribute routes with border routers of the datacenter.
## Warning : enabling router peering will disable calico's default behavior ('node mesh').
## The subnets of each node will be distributed by the datacenter router
#peer_with_router: false
## Upstream dns servers used by dnsmasq
#upstream_dns_servers:
# - 8.8.8.8
# - 8.8.4.4
dns_domain: "{{ cluster_name }}"
## There are some changes specific to the cloud providers
## for instance we need to encapsulate packets with some network plugins
## If set the possible values are either 'gce', 'aws', 'azure', 'openstack', or 'vsphere'
## When openstack is used make sure to source in the openstack credentials
## like you would do when using nova-client before starting the playbook.
#cloud_provider:
# Ip address of the kubernetes skydns service
skydns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(3)|ipaddr('address') }}"
dns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(2)|ipaddr('address') }}"
# There are some changes specific to the cloud providers
# for instance we need to encapsulate packets with some network plugins
# If set the possible values are either 'gce', 'aws', 'azure' or 'openstack'
# When openstack is used make sure to source in the openstack credentials
# like you would do when using nova-client before starting the playbook.
# When azure is used, you need to also set the following variables.
# cloud_provider:
# see docs/azure.md for details on how to get these values
## When azure is used, you need to also set the following variables.
## see docs/azure.md for details on how to get these values
#azure_tenant_id:
#azure_subscription_id:
#azure_aad_client_id:
@@ -171,34 +65,24 @@ dns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(2)|ipaddr('address')
#azure_vnet_name:
#azure_route_table_name:
## Set these proxy values in order to update docker daemon to use proxies
# http_proxy: ""
# https_proxy: ""
# no_proxy: ""
# Path used to store Docker data
docker_daemon_graph: "/var/lib/docker"
## A string of extra options to pass to the docker daemon.
## This string should be exactly as you wish it to appear.
## An obvious use case is allowing insecure-registry access
## to self hosted registries like so:
docker_options: "--insecure-registry={{ kube_service_addresses }} --graph={{ docker_daemon_graph }}"
docker_bin_dir: "/usr/bin"
#http_proxy: ""
#https_proxy: ""
#no_proxy: ""
## Uncomment this if you want to force overlay/overlay2 as docker storage driver
## Please note that overlay2 is only supported on newer kernels
#docker_storage_options: -s overlay2
# K8s image pull policy (imagePullPolicy)
k8s_image_pull_policy: IfNotPresent
# default packages to install within the cluster
kpm_packages: []
## Default packages to install within the cluster, e.g.:
#kpm_packages:
# - name: kube-system/grafana
# Settings for containerized control plane (etcd/kubelet)
rkt_version: 1.21.0
etcd_deployment_type: docker
kubelet_deployment_type: docker
## Certificate Management
## This setting determines whether certs are generated via scripts or whether a
## cluster of Hashicorp's Vault is started to issue certificates (using etcd
## as a backend). Options are "script" or "vault"
#cert_management: script
## Please specify true if you want to perform a kernel upgrade
kernel_upgrade: false

View File

@@ -0,0 +1,143 @@
# Valid bootstrap options (required): ubuntu, coreos, centos, none
bootstrap_os: none
# Directory where etcd data is stored
etcd_data_dir: /var/lib/etcd
# Directory where the binaries will be installed
bin_dir: /usr/local/bin
# Kubernetes configuration dirs and system namespace.
# Those are where all the additional config stuff goes
# the kubernetes normally puts in /srv/kubernetes.
# This puts them in a sane location and namespace.
# Editing those values will almost surely break something.
kube_config_dir: /etc/kubernetes
kube_script_dir: "{{ bin_dir }}/kubernetes-scripts"
kube_manifest_dir: "{{ kube_config_dir }}/manifests"
system_namespace: kube-system
# Logging directory (sysvinit systems)
kube_log_dir: "/var/log/kubernetes"
# This is where all the cert scripts and certs will be located
kube_cert_dir: "{{ kube_config_dir }}/ssl"
# This is where all of the bearer tokens will be stored
kube_token_dir: "{{ kube_config_dir }}/tokens"
# This is where to save basic auth file
kube_users_dir: "{{ kube_config_dir }}/users"
kube_api_anonymous_auth: false
## Change this to use another Kubernetes version, e.g. a current beta release
kube_version: v1.5.3
# Where the binaries will be downloaded.
# Note: ensure that you've enough disk space (about 1G)
local_release_dir: "/tmp/releases"
# Random shifts for retrying failed ops like pushing/downloading
retry_stagger: 5
# This is the group that the cert creation scripts chgrp the
# cert files to. Not really changeable...
kube_cert_group: kube-cert
# Cluster Loglevel configuration
kube_log_level: 2
# Users to create for basic auth in Kubernetes API via HTTP
kube_api_pwd: "changeme"
kube_users:
  kube:
    pass: "{{kube_api_pwd}}"
    role: admin
  root:
    pass: "{{kube_api_pwd}}"
    role: admin
## It is possible to activate / deactivate selected authentication methods (basic auth, static token auth)
#kube_oidc_auth: false
#kube_basic_auth: false
#kube_token_auth: false
## Variables for OpenID Connect Configuration https://kubernetes.io/docs/admin/authentication/
## To use OpenID you have to additionally deploy an OpenID Provider (e.g. Dex, Keycloak, ...)
# kube_oidc_url: https:// ...
# kube_oidc_client_id: kubernetes
## Optional settings for OIDC
# kube_oidc_ca_file: {{ kube_cert_dir }}/ca.pem
# kube_oidc_username_claim: sub
# kube_oidc_groups_claim: groups
# Choose network plugin (calico, weave or flannel)
# Can also be set to 'cloud', which lets the cloud provider setup appropriate routing
kube_network_plugin: calico
# Enable kubernetes network policies
enable_network_policy: false
# Kubernetes internal network for services, unused block of space.
kube_service_addresses: 10.233.0.0/18
# internal network. When used, it will assign IP
# addresses from this range to individual pods.
# This network must be unused in your network infrastructure!
kube_pods_subnet: 10.233.64.0/18
# internal network node size allocation (optional). This is the size allocated
# to each node on your network. With these defaults you should have
# room for 4096 nodes with 254 pods per node.
kube_network_node_prefix: 24
# The port the API Server will be listening on.
kube_apiserver_ip: "{{ kube_service_addresses|ipaddr('net')|ipaddr(1)|ipaddr('address') }}"
kube_apiserver_port: 6443 # (https)
kube_apiserver_insecure_port: 8080 # (http)
# DNS configuration.
# Kubernetes cluster name, also will be used as DNS domain
cluster_name: cluster.local
# Subdomains of DNS domain to be resolved via /etc/resolv.conf for hostnet pods
ndots: 2
# Can be dnsmasq_kubedns, kubedns or none
dns_mode: dnsmasq_kubedns
# Can be docker_dns, host_resolvconf or none
resolvconf_mode: docker_dns
# Deploy netchecker app to verify DNS resolve as an HTTP service
deploy_netchecker: false
# Ip address of the kubernetes skydns service
skydns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(3)|ipaddr('address') }}"
dns_server: "{{ kube_service_addresses|ipaddr('net')|ipaddr(2)|ipaddr('address') }}"
dns_domain: "{{ cluster_name }}"
# Path used to store Docker data
docker_daemon_graph: "/var/lib/docker"
## A string of extra options to pass to the docker daemon.
## This string should be exactly as you wish it to appear.
## An obvious use case is allowing insecure-registry access
## to self hosted registries like so:
docker_options: "--insecure-registry={{ kube_service_addresses }} --graph={{ docker_daemon_graph }}"
docker_bin_dir: "/usr/bin"
# Settings for containerized control plane (etcd/kubelet/secrets)
etcd_deployment_type: docker
kubelet_deployment_type: docker
cert_management: script
vault_deployment_type: docker
# K8s image pull policy (imagePullPolicy)
k8s_image_pull_policy: IfNotPresent
# Monitoring apps for k8s
efk_enabled: false
# Helm deployment
helm_enabled: false

View File

@@ -7,7 +7,7 @@
# node5 ansible_ssh_host=95.54.0.16 # ip=10.3.0.5
# node6 ansible_ssh_host=95.54.0.17 # ip=10.3.0.6
# ## configure a bastion host if your nodes are not publicly reachable
# ## configure a bastion host if your nodes are not directly reachable
# bastion ansible_ssh_host=x.x.x.x
# [kube-master]

View File

@@ -1,2 +1,14 @@
ansible
ansible==2.2.1.0
netaddr
# Ansible 2.2.1 requires jinja2<2.9, see <https://github.com/ansible/ansible/blob/v2.2.1.0-1/setup.py#L25>,
# but without explicitly limiting the upper jinja2 version here, pip ignores
# the Ansible requirements and installs the latest available jinja2
# (pip is not very smart here), which is incompatible with
# Ansible 2.2.1.
# With incompatible jinja2 version "ansible-vault create" (and probably other parts)
# fails with:
# ERROR! Unexpected Exception: The 'jinja2<2.9' distribution was not found
# and is required by ansible
# This upper limit should be removed in the 2.2.2 release, see:
# <https://github.com/ansible/ansible/commit/978311bf3f91dae5806ab72b665b0937adce38ad>
jinja2>=2.8,<2.9

View File

@@ -1,5 +1,18 @@
---
- hosts: all
  vars_prompt:
    name: "reset_confirmation"
    prompt: "Are you sure you want to reset cluster state? Type 'yes' to reset your cluster."
    default: "no"
    private: no
  pre_tasks:
    - name: check confirmation
      fail:
        msg: "Reset confirmation failed"
      when: reset_confirmation != "yes"
  roles:
    - { role: kargo-defaults}
    - { role: reset, tags: reset }

View File

@@ -1,10 +1,13 @@
---
kube_cert_group: kube-cert
etcd_data_dir: "/var/lib/etcd"
addusers:
  etcd:
    name: etcd
    comment: "Etcd user"
    createhome: yes
    home: "/var/lib/etcd"
    home: "{{ etcd_data_dir }}"
    system: yes
    shell: /bin/nologin
  kube:

View File

@@ -1,6 +1,8 @@
---
- name: User | Create User Group
  group: name={{user.group|default(user.name)}} system={{user.system|default(omit)}}
  group:
    name: "{{user.group|default(user.name)}}"
    system: "{{user.system|default(omit)}}"
- name: User | Create User
  user:
View File

@@ -3,7 +3,7 @@ addusers:
  - name: etcd
    comment: "Etcd user"
    createhome: yes
    home: "/var/lib/etcd"
    home: "{{ etcd_data_dir }}"
    system: yes
    shell: /bin/nologin

View File

@@ -3,7 +3,7 @@ addusers:
  - name: etcd
    comment: "Etcd user"
    createhome: yes
    home: "/var/lib/etcd"
    home: "{{ etcd_data_dir }}"
    system: yes
    shell: /bin/nologin

View File

@@ -15,4 +15,6 @@
- name: create ssh bastion conf
become: false
template: src=ssh-bastion.conf dest="{{ playbook_dir }}/ssh-bastion.conf"
template:
src: ssh-bastion.conf
dest: "{{ playbook_dir }}/ssh-bastion.conf"

View File

@@ -4,7 +4,7 @@
{% for h in groups['all'] %}
{% if h != 'bastion' %}
{% if vars.update({'hosts': vars['hosts'] + ' ' + hostvars[h]['ansible_ssh_host']}) %}{% endif %}
{% if vars.update({'hosts': vars['hosts'] + ' ' + (hostvars[h].get('ansible_ssh_host') or hostvars[h]['ansible_host'])}) %}{% endif %}
{% endif %}
{% endfor %}
@@ -18,4 +18,4 @@ Host {{ bastion_ip }}
Host {{ vars['hosts'] }}
ProxyCommand ssh -W %h:%p {{ real_user }}@{{ bastion_ip }}
StrictHostKeyChecking no
{% endif %}
{% endif %}

View File

@@ -1,7 +1,8 @@
---
- name: Check presence of fastestmirror.conf
stat: path=/etc/yum/pluginconf.d/fastestmirror.conf
stat:
path: /etc/yum/pluginconf.d/fastestmirror.conf
register: fastestmirror
# fastestmirror plugin actually slows down Ansible deployments

View File

@@ -18,11 +18,14 @@
register: need_pip
failed_when: false
changed_when: false
check_mode: no
when: (need_bootstrap | failed)
tags: facts
- name: Bootstrap | Copy get-pip.py
copy: src=get-pip.py dest=~/get-pip.py
copy:
src: get-pip.py
dest: ~/get-pip.py
when: (need_pip | failed)
- name: Bootstrap | Install pip
@@ -30,11 +33,16 @@
when: (need_pip | failed)
- name: Bootstrap | Remove get-pip.py
file: path=~/get-pip.py state=absent
file:
path: ~/get-pip.py
state: absent
when: (need_pip | failed)
- name: Bootstrap | Install pip launcher
copy: src=runner dest=/opt/bin/pip mode=0755
copy:
src: runner
dest: /opt/bin/pip
mode: 0755
when: (need_pip | failed)
- name: Install required python modules
@@ -42,10 +50,3 @@
name: "{{ item }}"
with_items: "{{pip_python_modules}}"
- name: Check configured hostname
shell: hostname
register: configured_hostname
- name: Assign inventory name to unconfigured hostnames
shell: sh -c "echo \"{{inventory_hostname}}\" > /etc/hostname; hostname \"{{inventory_hostname}}\""
when: (configured_hostname.stdout == 'localhost')

View File

@@ -2,14 +2,20 @@
# raw: cat /etc/issue.net | grep '{{ bootstrap_versions }}'
- name: Bootstrap | Check if bootstrap is needed
raw: which python
raw: which "{{ item }}"
register: need_bootstrap
failed_when: false
with_items:
- python
- pip
tags: facts
- name: Bootstrap | Install python 2.x
raw: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python-minimal
when: need_bootstrap | failed
- name: Bootstrap | Install python 2.x and pip
raw:
apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y python-minimal python-pip
when:
"{{ need_bootstrap.results | map(attribute='rc') | sort | last | bool }}"
- set_fact:
ansible_python_interpreter: "/usr/bin/python"

View File

@@ -8,4 +8,23 @@
- include: bootstrap-centos.yml
when: bootstrap_os == "centos"
- include: setup-pipelining.yml
- include: setup-pipelining.yml
- name: check if atomic host
  stat:
    path: /run/ostree-booted
  register: ostree
- set_fact:
    is_atomic: "{{ ostree.stat.exists }}"
- name: Gather nodes hostnames
  setup:
    gather_subset: '!all'
    filter: ansible_hostname
- name: Assign inventory name to unconfigured hostnames
  hostname:
    name: "{{inventory_hostname}}"
  when: ansible_hostname == 'localhost'

View File

@@ -2,5 +2,8 @@
# Remove requiretty to make ssh pipelining work
- name: Remove require tty
lineinfile: regexp="^\w+\s+requiretty" dest=/etc/sudoers state=absent
lineinfile:
regexp: '^\w+\s+requiretty'
dest: /etc/sudoers
state: absent

View File

@@ -11,6 +11,9 @@
#nameservers:
# - 127.0.0.1
dns_forward_max: 150
cache_size: 1000
# Versions
dnsmasq_version: 2.72
@@ -21,5 +24,9 @@ dnsmasq_image_tag: "{{ dnsmasq_version }}"
# Limits for dnsmasq/kubedns apps
dns_cpu_limit: 100m
dns_memory_limit: 170Mi
dns_cpu_requests: 70m
dns_memory_requests: 70Mi
dns_cpu_requests: 40m
dns_memory_requests: 50Mi
# Autoscaler parameters
dnsmasq_nodes_per_replica: 10
dnsmasq_min_replicas: 1

View File

@@ -1,4 +1,6 @@
---
- include: pre_upgrade.yml
- name: ensure dnsmasq.d directory exists
file:
path: /etc/dnsmasq.d
@@ -11,15 +13,36 @@
state: directory
tags: bootstrap-os
- name: check system nameservers
shell: awk '/^nameserver/ {print $NF}' /etc/resolv.conf
changed_when: False
register: system_nameservers
- name: init system_and_upstream_dns_servers
set_fact:
system_and_upstream_dns_servers: "{{ upstream_dns_servers|default([]) }}"
- name: combine upstream_dns_servers and system nameservers (only for docker_dns)
set_fact:
system_and_upstream_dns_servers: "{{ system_and_upstream_dns_servers | union(system_nameservers.stdout_lines) | unique }}"
when: system_nameservers.stdout != "" and resolvconf_mode != 'host_resolvconf'
- name: Write dnsmasq configuration
template:
src: 01-kube-dns.conf.j2
dest: /etc/dnsmasq.d-available/01-kube-dns.conf
mode: 0755
backup: yes
register: dnsmasq_config
- name: Stat dnsmasq configuration
stat: path=/etc/dnsmasq.d/01-kube-dns.conf
- name: Stat dnsmasq link
stat:
path: /etc/dnsmasq.d-available/01-kube-dns.conf
register: dnsmasq_stat
- name: Stat dnsmasq link
stat:
path: /etc/dnsmasq.d/01-kube-dns.conf
register: sym
- name: Move previous configuration
@@ -34,16 +57,19 @@
state: link
- name: Create dnsmasq manifests
template: src={{item.file}} dest={{kube_config_dir}}/{{item.file}}
template:
src: "{{item.file}}"
dest: "{{kube_config_dir}}/{{item.file}}"
with_items:
- {file: dnsmasq-ds.yml, type: ds}
- {file: dnsmasq-svc.yml, type: svc}
- {name: dnsmasq, file: dnsmasq-deploy.yml, type: deployment}
- {name: dnsmasq, file: dnsmasq-svc.yml, type: svc}
- {name: dnsmasq-autoscaler, file: dnsmasq-autoscaler.yml, type: deployment}
register: manifests
when: inventory_hostname == groups['kube-master'][0]
- name: Start Resources
kube:
name: dnsmasq
name: "{{item.item.name}}"
namespace: "{{system_namespace}}"
kubectl: "{{bin_dir}}/kubectl"
resource: "{{item.item.type}}"
@@ -56,6 +82,6 @@
wait_for:
host: "{{dns_server}}"
port: 53
delay: 5
when: inventory_hostname == groups['kube-node'][0]
tags: facts
timeout: 180
when: inventory_hostname == groups['kube-node'][0] and groups['kube-node'][0] in ansible_play_hosts

View File

@@ -0,0 +1,9 @@
---
- name: Delete legacy dnsmasq daemonset
  kube:
    name: dnsmasq
    namespace: "{{system_namespace}}"
    kubectl: "{{bin_dir}}/kubectl"
    resource: "ds"
    state: absent
  when: inventory_hostname == groups['kube-master'][0]

View File

@@ -11,23 +11,23 @@ server=/{{ dns_domain }}/{{ skydns_server }}
local=/{{ bogus_domains }}
#Set upstream dns servers
{% if upstream_dns_servers is defined %}
{% for srv in upstream_dns_servers %}
{% if system_and_upstream_dns_servers|length > 0 %}
{% for srv in system_and_upstream_dns_servers %}
server={{ srv }}
{% endfor %}
no-resolv
{% elif resolvconf_mode == 'host_resolvconf' %}
{# The default resolver is only needed when the host's resolv.conf was modified by us. If it was not modified, we can rely on dnsmasq to reuse the system's resolv.conf #}
server={{ default_resolver }}
no-resolv
{% endif %}
{% if kube_log_level == '4' %}
log-queries
{% endif %}
no-resolv
bogus-priv
no-negcache
cache-size=1000
cache-size={{ cache_size }}
dns-forward-max={{ dns_forward_max }}
max-cache-ttl=10
max-ttl=20
log-facility=-

View File

@@ -0,0 +1,50 @@
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: dnsmasq-autoscaler
  namespace: kube-system
  labels:
    k8s-app: dnsmasq-autoscaler
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  template:
    metadata:
      labels:
        k8s-app: dnsmasq-autoscaler
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
        scheduler.alpha.kubernetes.io/tolerations: '[{"key":"CriticalAddonsOnly", "operator":"Exists"}]'
    spec:
      containers:
      - name: autoscaler
        image: gcr.io/google_containers/cluster-proportional-autoscaler-amd64:1.1.1
        resources:
          requests:
            cpu: "20m"
            memory: "10Mi"
        command:
        - /cluster-proportional-autoscaler
        - --namespace=kube-system
        - --configmap=dnsmasq-autoscaler
        - --target=ReplicationController/dnsmasq
        # When the cluster is using large nodes (with more cores), "coresPerReplica" should dominate.
        # If using small nodes, "nodesPerReplica" should dominate.
        - --default-params={"linear":{"nodesPerReplica":{{ dnsmasq_nodes_per_replica }},"preventSinglePointFailure":true}}
        - --logtostderr=true
        - --v={{ kube_log_level }}

View File

@@ -1,16 +1,25 @@
---
apiVersion: extensions/v1beta1
kind: DaemonSet
kind: Deployment
metadata:
name: dnsmasq
namespace: "{{system_namespace}}"
labels:
k8s-app: dnsmasq
kubernetes.io/cluster-service: "true"
spec:
replicas: {{ dnsmasq_min_replicas }}
selector:
matchLabels:
k8s-app: dnsmasq
strategy:
type: "Recreate"
template:
metadata:
labels:
k8s-app: dnsmasq
kubernetes.io/cluster-service: "true"
kargo/dnsmasq-checksum: "{{ dnsmasq_stat.stat.checksum }}"
spec:
containers:
- name: dnsmasq
@@ -20,8 +29,8 @@ spec:
- dnsmasq
args:
- -k
- "-7"
- /etc/dnsmasq.d
- -C
- /etc/dnsmasq.d/01-kube-dns.conf
securityContext:
capabilities:
add:
@@ -55,3 +64,4 @@ spec:
hostPath:
path: /etc/dnsmasq.d-available
dnsPolicy: Default # Don't use cluster DNS.

View File

@@ -1,4 +1,4 @@
docker_version: '1.12'
docker_version: '1.13'
docker_package_info:
pkgs:

View File

@@ -23,7 +23,9 @@
state: restarted
- name: Docker | pause while Docker restarts
pause: seconds=10 prompt="Waiting for docker restart"
pause:
seconds: 10
prompt: "Waiting for docker restart"
- name: Docker | wait for docker
command: "{{ docker_bin_dir }}/docker images"

View File

@@ -34,11 +34,11 @@
keyserver: "{{docker_repo_key_info.keyserver}}"
state: present
register: keyserver_task_result
until: keyserver_task_result|success
until: keyserver_task_result|succeeded
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
with_items: "{{ docker_repo_key_info.repo_keys }}"
when: not ansible_os_family in ["CoreOS", "Container Linux by CoreOS"]
when: not (ansible_os_family in ["CoreOS", "Container Linux by CoreOS"] or is_atomic)
- name: ensure docker repository is enabled
action: "{{ docker_repo_info.pkg_repo }}"
@@ -46,13 +46,13 @@
repo: "{{item}}"
state: present
with_items: "{{ docker_repo_info.repos }}"
when: (not ansible_os_family in ["CoreOS", "Container Linux by CoreOS"]) and (docker_repo_info.repos|length > 0)
when: not (ansible_os_family in ["CoreOS", "Container Linux by CoreOS"] or is_atomic) and (docker_repo_info.repos|length > 0)
- name: Configure docker repository on RedHat/CentOS
copy:
src: "rh_docker.repo"
template:
src: "rh_docker.repo.j2"
dest: "/etc/yum.repos.d/docker.repo"
when: ansible_distribution in ["CentOS","RedHat"]
when: ansible_distribution in ["CentOS","RedHat"] and not is_atomic
- name: ensure docker packages are installed
action: "{{ docker_package_info.pkg_mgr }}"
@@ -61,14 +61,15 @@
force: "{{item.force|default(omit)}}"
state: present
register: docker_task_result
until: docker_task_result|success
until: docker_task_result|succeeded
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
with_items: "{{ docker_package_info.pkgs }}"
when: (not ansible_os_family in ["CoreOS", "Container Linux by CoreOS"]) and (docker_package_info.pkgs|length > 0)
notify: restart docker
when: not (ansible_os_family in ["CoreOS", "Container Linux by CoreOS"] or is_atomic) and (docker_package_info.pkgs|length > 0)
- name: check minimum docker version for docker_dns mode. You need at least docker version >= 1.12 for resolvconf_mode=docker_dns
shell: docker version -f "{{ '{{' }}.Client.Version{{ '}}' }}"
command: "docker version -f '{{ '{{' }}.Client.Version{{ '}}' }}'"
register: docker_version
failed_when: docker_version.stdout|version_compare('1.12', '<')
changed_when: false

View File

@@ -32,11 +32,13 @@
shell: grep "^nameserver" /etc/resolv.conf | sed 's/^nameserver\s*//'
changed_when: False
register: system_nameservers
check_mode: no
- name: check system search domains
shell: grep "^search" /etc/resolv.conf | sed 's/^search\s*//'
changed_when: False
register: system_search_domains
check_mode: no
- name: add system nameservers to docker options
set_fact:
@@ -49,13 +51,16 @@
when: system_search_domains.stdout != ""
- name: check number of nameservers
fail: msg="Too many nameservers"
fail:
msg: "Too many nameservers"
when: docker_dns_servers|length > 3
- name: check number of search domains
fail: msg="Too many search domains"
fail:
msg: "Too many search domains"
when: docker_dns_search_domains|length > 6
- name: check length of search domains
fail: msg="Search domains exceeded limit of 256 characters"
fail:
msg: "Search domains exceeded limit of 256 characters"
when: docker_dns_search_domains|join(' ')|length > 256

View File

@@ -1,6 +1,8 @@
---
- name: Create docker service systemd directory if it doesn't exist
file: path=/etc/systemd/system/docker.service.d state=directory
file:
path: /etc/systemd/system/docker.service.d
state: directory
- name: Write docker proxy drop-in
template:
@@ -13,7 +15,14 @@
src: docker.service.j2
dest: /etc/systemd/system/docker.service
register: docker_service_file
when: not ansible_os_family in ["CoreOS", "Container Linux by CoreOS"]
when: not (ansible_os_family in ["CoreOS", "Container Linux by CoreOS"] or is_atomic)
- name: Write docker.service systemd file for atomic
template:
src: docker_atomic.service.j2
dest: /etc/systemd/system/docker.service
notify: restart docker
when: is_atomic
- name: Write docker options systemd drop-in
template:

View File

@@ -3,4 +3,4 @@ Environment="DOCKER_DNS_OPTIONS=\
{% for d in docker_dns_servers %}--dns {{ d }} {% endfor %} \
{% for d in docker_dns_search_domains %}--dns-search {{ d }} {% endfor %} \
{% for o in docker_dns_options %}--dns-opt {{ o }} {% endfor %} \
"
"

View File

@@ -1,2 +1,2 @@
[Service]
Environment="DOCKER_OPTS={% if docker_options is defined %}{{ docker_options }}{% endif %}"
Environment="DOCKER_OPTS={% if docker_options is defined %}{{ docker_options }}{% endif %} --iptables={% if kube_network_plugin == 'flannel' %}true{% else %}false{% endif %}"

View File

@@ -0,0 +1,37 @@
[Unit]
Description=Docker Application Container Engine
Documentation=http://docs.docker.com
After=network.target
Wants=docker-storage-setup.service
[Service]
Type=notify
NotifyAccess=all
EnvironmentFile=-/etc/sysconfig/docker
EnvironmentFile=-/etc/sysconfig/docker-storage
Environment=GOTRACEBACK=crash
Environment=DOCKER_HTTP_HOST_COMPAT=1
Environment=PATH=/usr/libexec/docker:/usr/bin:/usr/sbin
ExecReload=/bin/kill -s HUP $MAINPID
Delegate=yes
KillMode=process
ExecStart=/usr/bin/dockerd-current \
--add-runtime docker-runc=/usr/libexec/docker/docker-runc-current \
--default-runtime=docker-runc \
--exec-opt native.cgroupdriver=systemd \
--userland-proxy-path=/usr/libexec/docker/docker-proxy-current \
$DOCKER_OPTS \
$DOCKER_STORAGE_OPTIONS \
$DOCKER_NETWORK_OPTIONS \
$DOCKER_DNS_OPTIONS \
$ADD_REGISTRY \
$BLOCK_REGISTRY \
$INSECURE_REGISTRY
LimitNOFILE=1048576
LimitNPROC=1048576
LimitCORE=infinity
TimeoutStartSec=1min
Restart=on-abnormal
[Install]
WantedBy=multi-user.target

View File

@@ -4,3 +4,4 @@ baseurl=https://yum.dockerproject.org/repo/main/centos/7
enabled=1
gpgcheck=1
gpgkey=https://yum.dockerproject.org/gpg
{% if http_proxy is defined %}proxy={{ http_proxy }}{% endif %}

View File

@@ -4,7 +4,10 @@ docker_kernel_min_version: '3.10'
docker_versioned_pkg:
  'latest': docker-engine
  '1.11': docker-engine=1.11.2-0~{{ ansible_distribution_release|lower }}
  '1.12': docker-engine=1.12.5-0~debian-{{ ansible_distribution_release|lower }}
  '1.12': docker-engine=1.12.6-0~debian-{{ ansible_distribution_release|lower }}
  '1.13': docker-engine=1.13.1-0~debian-{{ ansible_distribution_release|lower }}
  'stable': docker-engine=17.03.0~ce-0~debian-{{ ansible_distribution_release|lower }}
  'edge': docker-engine=17.03.0~ce-0~debian-{{ ansible_distribution_release|lower }}
docker_package_info:
  pkg_mgr: apt

View File

@@ -1,9 +1,15 @@
docker_kernel_min_version: '0'
# https://docs.docker.com/engine/installation/linux/fedora/#install-from-a-package
# https://download.docker.com/linux/fedora/7/x86_64/stable/
# the package names below are guesses;
# docs mention `sudo dnf config-manager --enable docker-ce-edge` for edge
docker_versioned_pkg:
  'latest': docker
  '1.11': docker-1:1.11.2
  '1.12': docker-1:1.12.5
  'stable': docker-ce
  'edge': docker-ce-edge
docker_package_info:
  pkg_mgr: dnf

Some files were not shown because too many files have changed in this diff.