Module tests.test_cluster_autoscaler
Functions
def annotate_safe_to_evict_to_namespace_pods(core_api, namespace)
def annotate_safe_to_evict_to_namespace_pods(core_api, namespace):  # NOQA
    payload = {
        "metadata": {
            "annotations": {
                K8S_CLUSTER_AUTOSCALER_EVICT_KEY: "true"
            }
        }
    }

    pods = core_api.list_namespaced_pod(namespace=namespace)
    for pod in pods.items:
        meta = pod.metadata
        try:
            key = K8S_CLUSTER_AUTOSCALER_EVICT_KEY
            if key in meta.annotations and \
                    meta.annotations[key] == "true":
                continue
        # We don't mind a TypeError here: annotations can be None on
        # different cloud providers.
        except TypeError:
            pass

        core_api.patch_namespaced_pod(
            meta.name, meta.namespace, payload
        )
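A usage sketch, mirroring how the wait helpers below call this for kube-system, assuming `K8S_CLUSTER_AUTOSCALER_EVICT_KEY` (a constant defined elsewhere in the suite) maps to the upstream `cluster-autoscaler.kubernetes.io/safe-to-evict` pod annotation:

from kubernetes import client as k8s_client, config

config.load_kube_config()
core_api = k8s_client.CoreV1Api()

# Mark every kube-system pod as safe to evict so system pods do not
# block the cluster autoscaler from removing a node.
annotate_safe_to_evict_to_namespace_pods(core_api, "kube-system")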
def configure_node_scale_down(core_api, nodes, disable)
def configure_node_scale_down(core_api, nodes, disable):  # NOQA
    payload = {
        "metadata": {
            "annotations": {
                K8S_CLUSTER_AUTOSCALER_SCALE_DOWN_DISABLED_KEY: disable,
            }
        }
    }

    for node in nodes:
        core_api.patch_node(node.id, body=payload)
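A usage sketch, mirroring how the tests below call this helper and assuming `K8S_CLUSTER_AUTOSCALER_SCALE_DOWN_DISABLED_KEY` maps to the upstream `cluster-autoscaler.kubernetes.io/scale-down-disabled` node annotation:

# Pin the host node so CA never considers it for scale-down, then
# clear the annotation on all nodes again during test cleanup.
host_node = client.by_id_node(get_self_host_id())
configure_node_scale_down(core_api, [host_node], disable="true")
# ... run the scaling scenario ...
configure_node_scale_down(core_api, client.list_node(), disable="")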
def get_new_nodes(client, old_nodes)
def get_new_nodes(client, old_nodes):  # NOQA
    old_nodes_name = [n.name for n in old_nodes]
    nodes = client.list_node()
    return [n for n in nodes if n.name not in old_nodes_name]
def get_replica_count_to_scale_up(core_api, node_number, cpu_request)
def get_replica_count_to_scale_up(core_api, node_number, cpu_request):  # NOQA
    host_id = get_self_host_id()
    host_kb_node = core_api.read_node(host_id)
    if host_kb_node.status.allocatable["cpu"].endswith('m'):
        allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"][:-1])
    else:
        allocatable_millicpu = \
            int(host_kb_node.status.allocatable["cpu"]) * 1000
    return 10 * math.ceil(allocatable_millicpu / cpu_request * node_number / 10)
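A worked example of the arithmetic, with assumed numbers (none of these values come from the module itself): a host node advertising "2000m" allocatable CPU, a per-replica request of 150 millicpu, and a target of 3 extra nodes:

import math

allocatable_millicpu = 2000  # assumed: host node reports "2000m"
cpu_request = 150            # assumed per-replica CPU request, in millicpu
node_number = 3              # roughly three nodes' worth of pods

# 2000 / 150 ≈ 13.3 replicas fill one node; times 3 nodes = 40;
# 10 * ceil(40 / 10) rounds up to the next multiple of 10 -> 40.
replicas = 10 * math.ceil(allocatable_millicpu / cpu_request * node_number / 10)
assert replicas == 40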
def test_cluster_autoscaler(client, core_api, apps_api, make_deployment_cpu_request, request)
@pytest.mark.cluster_autoscaler  # NOQA
def test_cluster_autoscaler(client, core_api, apps_api, make_deployment_cpu_request, request):  # NOQA
    """
    Scenario: Test CA

    Given Cluster with Kubernetes cluster-autoscaler.
    And Longhorn installed.
    And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
    And Create deployment with cpu request.

    When Trigger CA to scale-up by increasing deployment replicas.
         (double the node number, not including host node)
    Then Cluster should have double the node number.

    When Trigger CA to scale-down by decreasing deployment replicas.
    Then Cluster should scale-down to original node number.
    """
    # Cleanup
    def finalizer():
        configure_node_scale_down(core_api, client.list_node(), disable="")
    request.addfinalizer(finalizer)

    host_id = get_self_host_id()
    host_node = client.by_id_node(host_id)
    configure_node_scale_down(core_api, [host_node], disable="true")
    set_node_cordon(core_api, host_id, True)

    update_setting(client, SETTING_K8S_CLUSTER_AUTOSCALER_ENABLED, "true")

    nodes = client.list_node()
    scale_size = len(nodes) - 1

    scale_up_replica = get_replica_count_to_scale_up(
        core_api, scale_size, CPU_REQUEST
    )

    deployment_name = "ca-scaling-control"
    deployment = make_deployment_cpu_request(deployment_name, CPU_REQUEST)
    create_and_wait_deployment(apps_api, deployment)

    deployment["spec"]["replicas"] = scale_up_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    wait_cluster_autoscale_up(client, nodes, scale_size)

    scale_down_replica = 0
    deployment["spec"]["replicas"] = scale_down_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    nodes = client.list_node()
    client = wait_cluster_autoscale_down(client, core_api, nodes, scale_size)
Scenario: Test CA

Given Cluster with Kubernetes cluster-autoscaler.
And Longhorn installed.
And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
And Create deployment with cpu request.

When Trigger CA to scale-up by increasing deployment replicas.
     (double the node number, not including host node)
Then Cluster should have double the node number.

When Trigger CA to scale-down by decreasing deployment replicas.
Then Cluster should scale-down to original node number.
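The `make_deployment_cpu_request` fixture is defined elsewhere in the suite; a hypothetical sketch of the kind of manifest it presumably produces (the image and label names here are illustrative, not the fixture's actual values):

def make_deployment_cpu_request_sketch(name, cpu_millicpu):  # hypothetical
    # A single-container deployment whose only job is to reserve CPU,
    # so raising replicas exhausts schedulable capacity and CA must
    # add nodes.
    return {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": {"name": name},
        "spec": {
            "replicas": 1,
            "selector": {"matchLabels": {"app": name}},
            "template": {
                "metadata": {"labels": {"app": name}},
                "spec": {
                    "containers": [{
                        "name": name,
                        "image": "registry.k8s.io/pause:3.9",
                        "resources": {
                            "requests": {"cpu": f"{cpu_millicpu}m"},
                        },
                    }],
                },
            },
        },
    }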
def test_cluster_autoscaler_all_nodes_with_volume_replicas(client, core_api, apps_api, make_deployment_cpu_request, volume_name, pod_make, request)
@pytest.mark.cluster_autoscaler  # NOQA
def test_cluster_autoscaler_all_nodes_with_volume_replicas(client, core_api, apps_api, make_deployment_cpu_request, volume_name, pod_make, request):  # NOQA
    """
    Scenario: Test CA scale down all nodes with volume replicas

    Given Cluster with Kubernetes cluster-autoscaler.
    And Longhorn installed.
    And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
    And Create volume.
    And Attach the volume.
    And Write some data to volume.
    And Detach the volume.
    And Create deployment with cpu request.

    When Trigger CA to scale-up by increasing deployment replicas.
         (double the node number, not including host node)
    Then Cluster should have double the node number.

    When Annotate new nodes with
         `cluster-autoscaler.kubernetes.io/scale-down-disabled`.
         (this ensures scale-down removes only the old nodes)
    And Trigger CA to scale-down by decreasing deployment replicas.
    Then Cluster should have original node number + 1 blocked node.

    When Attach the volume to a new node. This triggers replica rebuild.
    And Volume data should be the same.
    And Detach the volume.
    Then Cluster should scale-down to original node number.
    And Volume data should be the same.
    """
    # Cleanup
    def finalizer():
        configure_node_scale_down(core_api, client.list_node(), disable="")
    request.addfinalizer(finalizer)

    host_id = get_self_host_id()
    host_node = client.by_id_node(host_id)
    configure_node_scale_down(core_api, [host_node], disable="true")
    set_node_cordon(core_api, host_id, True)

    update_setting(client, SETTING_REPLICA_REPLENISHMENT_WAIT_INTERVAL, "0")
    update_setting(client, SETTING_K8S_CLUSTER_AUTOSCALER_ENABLED, "true")

    nodes = client.list_node()
    scale_size = len(nodes) - 1

    volume = create_and_check_volume(client, volume_name,
                                     num_of_replicas=scale_size)
    create_pv_for_volume(client, core_api, volume, volume.name)
    create_pvc_for_volume(client, core_api, volume, volume.name)

    pod_manifest = pod_make()
    pod_manifest['spec']['volumes'] = [create_pvc_spec(volume.name)]
    pod_name = pod_manifest['metadata']['name']
    create_and_wait_pod(core_api, pod_manifest)

    data = generate_random_data(16 * Ki)
    write_pod_volume_data(core_api, pod_name, data)
    delete_and_wait_pod(core_api, pod_name)
    volume = wait_for_volume_detached(client, volume.name)

    scale_up_replica = get_replica_count_to_scale_up(
        core_api, scale_size, CPU_REQUEST
    )

    deployment_name = "autoscale-control"
    deployment = make_deployment_cpu_request(deployment_name, CPU_REQUEST)
    create_and_wait_deployment(apps_api, deployment)

    deployment["spec"]["replicas"] = scale_up_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    wait_cluster_autoscale_up(client, nodes, scale_size)

    new_nodes = get_new_nodes(client, old_nodes=nodes)
    configure_node_scale_down(core_api, new_nodes, disable="true")

    scale_down_replica = 0
    deployment["spec"]["replicas"] = scale_down_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    nodes = client.list_node()

    is_blocked = False
    try:
        client = wait_cluster_autoscale_down(client, core_api, nodes,
                                             scale_size)
    except AssertionError:
        client = wait_cluster_autoscale_down(client, core_api, nodes,
                                             scale_size - 1)
        is_blocked = True
    assert is_blocked

    configure_node_scale_down(core_api, [new_nodes[0]], disable="")

    volume = client.by_id_volume(volume.name)
    volume = volume.attach(hostId=new_nodes[0].id)
    volume = wait_for_volume_healthy(client, volume_name)
    volume.detach()

    client = wait_cluster_autoscale_down(client, core_api, nodes, scale_size)

    create_and_wait_pod(core_api, pod_manifest)
    resp = read_volume_data(core_api, pod_name)
    assert resp == data
Scenario: Test CA scale down all nodes with volume replicas

Given Cluster with Kubernetes cluster-autoscaler.
And Longhorn installed.
And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
And Create volume.
And Attach the volume.
And Write some data to volume.
And Detach the volume.
And Create deployment with cpu request.

When Trigger CA to scale-up by increasing deployment replicas.
     (double the node number, not including host node)
Then Cluster should have double the node number.

When Annotate new nodes with
     `cluster-autoscaler.kubernetes.io/scale-down-disabled`.
     (this ensures scale-down removes only the old nodes)
And Trigger CA to scale-down by decreasing deployment replicas.
Then Cluster should have original node number + 1 blocked node.

When Attach the volume to a new node. This triggers replica rebuild.
And Volume data should be the same.
And Detach the volume.
Then Cluster should scale-down to original node number.
And Volume data should be the same.
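The try/except sequence in the source above is the heart of this scenario and is worth reading in isolation (an annotated excerpt, not new logic): the full scale-down is expected to time out because one new node still holds a volume replica.

is_blocked = False
try:
    # Expected to fail: CA cannot remove all scale_size nodes while
    # one of them still hosts a Longhorn replica.
    client = wait_cluster_autoscale_down(client, core_api, nodes, scale_size)
except AssertionError:
    # Partial scale-down succeeds: every new node except the
    # replica-holding one is removed.
    client = wait_cluster_autoscale_down(client, core_api, nodes,
                                         scale_size - 1)
    is_blocked = True
assert is_blocked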
def test_cluster_autoscaler_backing_image(client, core_api, apps_api, make_deployment_cpu_request, request)
@pytest.mark.cluster_autoscaler  # NOQA
@pytest.mark.backing_image  # NOQA
def test_cluster_autoscaler_backing_image(client, core_api, apps_api, make_deployment_cpu_request, request):  # NOQA
    """
    Scenario: Test CA scale down with backing image

    Given Cluster with Kubernetes cluster-autoscaler.
    And Longhorn installed.
    And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
    And Create backing image.
    And Create deployment with cpu request.

    When Trigger CA to scale-up by increasing deployment replicas.
         (double the node number, not including host node)
    Then Cluster should have double the node number.

    When Annotate new nodes with
         `cluster-autoscaler.kubernetes.io/scale-down-disabled`.
         (this ensures scale-down removes only the old nodes)
    And Trigger CA to scale-down by decreasing deployment replicas.
    Then Cluster should have original node number + 1 blocked node.

    When Remove backing image.
    Then Cluster should scale-down to original node number.
    """
    # Cleanup
    def finalizer():
        configure_node_scale_down(core_api, client.list_node(), disable="")
    request.addfinalizer(finalizer)

    host_id = get_self_host_id()
    host_node = client.by_id_node(host_id)
    configure_node_scale_down(core_api, [host_node], disable="true")
    set_node_cordon(core_api, host_id, True)

    update_setting(client, SETTING_REPLICA_REPLENISHMENT_WAIT_INTERVAL, "0")
    update_setting(client, SETTING_K8S_CLUSTER_AUTOSCALER_ENABLED, "true")

    nodes = client.list_node()
    scale_size = len(nodes) - 1

    create_backing_image_with_matching_url(
        client, BACKING_IMAGE_NAME, BACKING_IMAGE_QCOW2_URL)

    scale_up_replica = get_replica_count_to_scale_up(
        core_api, scale_size, CPU_REQUEST
    )

    deployment_name = "autoscale-control"
    deployment = make_deployment_cpu_request(deployment_name, CPU_REQUEST)
    create_and_wait_deployment(apps_api, deployment)

    deployment["spec"]["replicas"] = scale_up_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    wait_cluster_autoscale_up(client, nodes, scale_size)

    new_nodes = get_new_nodes(client, old_nodes=nodes)
    configure_node_scale_down(core_api, new_nodes, disable="true")

    scale_down_replica = 0
    deployment["spec"]["replicas"] = scale_down_replica
    apps_api.patch_namespaced_deployment(body=deployment,
                                         namespace='default',
                                         name=deployment_name)
    nodes = client.list_node()

    is_blocked = False
    try:
        client = wait_cluster_autoscale_down(client, core_api, nodes,
                                             scale_size)
    except AssertionError:
        client = wait_cluster_autoscale_down(client, core_api, nodes,
                                             scale_size - 1)
        is_blocked = True
    assert is_blocked

    cleanup_all_backing_images(client)

    client = wait_cluster_autoscale_down(client, core_api, nodes, scale_size)
Scenario: Test CA scale down with backing image

Given Cluster with Kubernetes cluster-autoscaler.
And Longhorn installed.
And Set `kubernetes-cluster-autoscaler-enabled` to `true`.
And Create backing image.
And Create deployment with cpu request.

When Trigger CA to scale-up by increasing deployment replicas.
     (double the node number, not including host node)
Then Cluster should have double the node number.

When Annotate new nodes with
     `cluster-autoscaler.kubernetes.io/scale-down-disabled`.
     (this ensures scale-down removes only the old nodes)
And Trigger CA to scale-down by decreasing deployment replicas.
Then Cluster should have original node number + 1 blocked node.

When Remove backing image.
Then Cluster should scale-down to original node number.
def wait_cluster_autoscale_down(client, core_api, nodes, diff)
def wait_cluster_autoscale_down(client, core_api, nodes, diff):  # NOQA
    for _ in range(RETRY_AUTOSCALER_COUNTS):
        time.sleep(RETRY_AUTOSCALER_INTERVAL)

        # Sometimes CA gets blocked by kube-system components.
        annotate_safe_to_evict_to_namespace_pods(core_api, "kube-system")

        try:
            check_nodes = client.list_node()
        except requests.exceptions.ConnectionError:
            client = get_longhorn_api_client()
            continue

        removed = len(nodes) - len(check_nodes)
        if removed >= diff:
            return client
    assert False, \
        f"cluster autoscaler failed to scale down.\n" \
        f"Expected scale={diff}\n" \
        f"Got scale={removed}"
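Note that the helper returns the (possibly re-created) Longhorn client: during scale-down the Longhorn manager pod may be rescheduled, which surfaces as a ConnectionError. Callers must keep the returned client, as every test above does:

nodes = client.list_node()
# The reassignment matters: a stale client would keep failing with
# ConnectionError if the Longhorn API endpoint moved to another node.
client = wait_cluster_autoscale_down(client, core_api, nodes, scale_size)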
def wait_cluster_autoscale_up(client, nodes, diff)
def wait_cluster_autoscale_up(client, nodes, diff):  # NOQA
    for _ in range(RETRY_AUTOSCALER_COUNTS):
        time.sleep(RETRY_AUTOSCALER_INTERVAL)

        check_nodes = client.list_node()
        added = len(check_nodes) - len(nodes)
        if added >= diff:
            return
    assert False, \
        f"cluster autoscaler failed to scale up.\n" \
        f"Expected scale={diff}\n" \
        f"Got scale={added}"