Module tests.test_settings
Functions
def check_priority_class(pod, priority_class=None)
def check_priority_class(pod, priority_class=None):  # NOQA
    if priority_class:
        return pod.spec.priority == priority_class['value'] and \
            pod.spec.priority_class_name == \
            priority_class['metadata']['name']
    else:
        return pod.spec.priority == 0 and pod.spec.priority_class_name == ''
def check_tolerations_set(current_toleration_list, expected_tolerations, chk_removed_tolerations=[])
def check_tolerations_set(current_toleration_list, expected_tolerations,
                          chk_removed_tolerations=[]):
    found = 0
    unexpected = 0
    for t in current_toleration_list:
        current_toleration = {
            "key": t.key,
            "value": t.value,
            "operator": t.operator,
            "effect": t.effect
        }
        for expected in expected_tolerations:
            if current_toleration == expected:
                found += 1

        for removed in chk_removed_tolerations:
            if current_toleration == removed:
                unexpected += 1

    return len(expected_tolerations) == found and unexpected == 0
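As a usage sketch (not part of the module), the helper can be exercised with simple stand-ins for the Kubernetes client's toleration objects, which expose `key`, `value`, `operator`, and `effect` attributes; `SimpleNamespace` here is only an illustrative substitute for `V1Toleration`:

from types import SimpleNamespace

# Hypothetical stand-ins for the pod's current tolerations.
current = [
    SimpleNamespace(key="key1", value="value1",
                    operator="Equal", effect="NoSchedule"),
    SimpleNamespace(key="key2", value=None,
                    operator="Exists", effect="NoExecute"),
]
expected = [
    {"key": "key1", "value": "value1",
     "operator": "Equal", "effect": "NoSchedule"},
    {"key": "key2", "value": None,
     "operator": "Exists", "effect": "NoExecute"},
]
removed = [
    {"key": "old", "value": None, "operator": "Exists", "effect": "NoSchedule"},
]

# True: every expected toleration is present and no removed one remains.
assert check_tolerations_set(current, expected, removed)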
def check_workload_update(core_api, apps_api, count)
def check_workload_update(core_api, apps_api, count):  # NOQA
    da_list = apps_api.list_namespaced_daemon_set(LONGHORN_NAMESPACE).items
    for da in da_list:
        if da.status.updated_number_scheduled != count:
            return False

    dp_list = apps_api.list_namespaced_deployment(LONGHORN_NAMESPACE).items
    for dp in dp_list:
        if dp.status.updated_replicas != dp.spec.replicas:
            return False

    im_pod_list = core_api.list_namespaced_pod(
        LONGHORN_NAMESPACE,
        label_selector="longhorn.io/component=instance-manager, \
            longhorn.io/data-engine={}".format(DATA_ENGINE)).items
    if len(im_pod_list) != count:
        return False

    for p in im_pod_list:
        if p.status.phase != "Running":
            return False

    client = get_longhorn_api_client()  # NOQA
    images = client.list_engine_image()
    assert len(images) == 1
    ei_state = get_engine_image_status_value(client, images[0].name)
    if images[0].state != ei_state:
        return False

    return True
def config_map_with_value(configmap_name, setting_names, setting_values, data_yaml_name='default-setting.yaml')
def config_map_with_value(configmap_name, setting_names, setting_values,
                          data_yaml_name="default-setting.yaml"):  # NOQA
    setting = {}
    num_settings = len(setting_names)
    if num_settings > 0:
        for i in range(num_settings):
            setting.update({setting_names[i]: setting_values[i]})

    return {
        "apiVersion": "v1",
        "kind": "ConfigMap",
        "metadata": {
            "name": configmap_name,
        },
        "data": {
            data_yaml_name: yaml.dump(setting),
        }
    }
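As a usage sketch (the setting name and value below are only examples), the helper returns a ConfigMap manifest whose data key holds the YAML-serialized settings:

body = config_map_with_value("longhorn-default-setting",
                             ["backup-target"],
                             ["s3://backupbucket@us-east-1/backupstore"])
# body is roughly:
# {
#     "apiVersion": "v1",
#     "kind": "ConfigMap",
#     "metadata": {"name": "longhorn-default-setting"},
#     "data": {
#         "default-setting.yaml":
#             "backup-target: s3://backupbucket@us-east-1/backupstore\n"
#     }
# }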
def guaranteed_instance_manager_cpu_setting_check(client, core_api, instance_managers, state, desire, cpu_val)
def guaranteed_instance_manager_cpu_setting_check(  # NOQA
        client, core_api, instance_managers, state, desire, cpu_val):  # NOQA
    """
    Check whether the instance managers are in the desired state with the
    correct CPU setting.

    `desire` reflects the state we are looking for. If `desire` is True, the
    instance manager state must match `state`. Otherwise we only require the
    state to be different, e.g. 'Pending', 'OutofCPU', and 'Terminating' are
    all 'Not Running'.
    """
    # Give some time for k8s to update the instance manager status
    for im in instance_managers:
        wait_for_instance_manager_desire_state(client, core_api, im.name,
                                               state, desire)

    if desire:
        # Verify the guaranteed CPU is set correctly
        for im in instance_managers:
            pod = core_api.read_namespaced_pod(name=im.name,
                                               namespace=LONGHORN_NAMESPACE)
            if pod.metadata.labels["longhorn.io/data-engine"] == "v1":
                if cpu_val:
                    assert (pod.spec.containers[0].resources.requests['cpu']
                            == cpu_val)
                else:
                    assert (not pod.spec.containers[0].resources.requests)
Check whether the instance managers are in the desired state with the correct CPU setting. `desire` reflects the state we are looking for: if `desire` is True, the instance manager state must match `state`; otherwise the state only needs to be different, e.g. 'Pending', 'OutofCPU', and 'Terminating' are all 'Not Running'.
def init_longhorn_default_setting_configmap(core_api, client, configmap_name='longhorn-default-setting', data_yaml_name='default-setting.yaml')
def init_longhorn_default_setting_configmap(core_api, client,  # NOQA
        configmap_name=DEFAULT_SETTING_CONFIGMAP_NAME,  # NOQA
        data_yaml_name=DEFAULT_SETTING_YAML_NAME):  # NOQA
    core_api.delete_namespaced_config_map(name=configmap_name,
                                          namespace='longhorn-system')

    configmap_body = config_map_with_value(configmap_name, [], [],
                                           data_yaml_name)
    core_api.create_namespaced_config_map(body=configmap_body,
                                          namespace='longhorn-system')
def retry_setting_update(client, setting_name, setting_value)
def retry_setting_update(client, setting_name, setting_value):  # NOQA
    for i in range(RETRY_COUNTS):
        try:
            update_setting(client, setting_name, setting_value)
        except Exception as e:
            # Keep retrying until the last attempt, then re-raise.
            if i < RETRY_COUNTS - 1:
                time.sleep(RETRY_INTERVAL)
                continue
            print(e)
            raise
        else:
            break
def setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(client, volname, is_DR_volumes=False)
def setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(client, volname, is_DR_volumes=False):  # NOQA
    """
    Given Setting concurrent-volume-backup-restore-per-node-limit is 2.
      And Volume (for backup) created.
      And Volume (for backup) has backup with some data.

    When Create some volumes (num_node * setting value * 3) from backup.

    Then Number of restoring volumes per node should be expected based on
         if they are normal volumes or DR volumes.
    """
    update_setting(client, SETTING_DEGRADED_AVAILABILITY, "false")

    concurrent_limit = 2
    update_setting(client, SETTING_CONCURRENT_VOLUME_BACKUP_RESTORE,
                   str(concurrent_limit))

    _, backup = create_volume_and_backup(client, volname + "-with-backup",
                                         1000 * Mi, 600 * Mi)

    nodes = client.list_node()
    restore_volume_names = []
    for i in range(len(nodes) * concurrent_limit * 3):
        name = volname + "-restore-" + str(i)
        restore_volume_names.append(name)
        client.create_volume(name=name, numberOfReplicas=1,
                             fromBackup=backup.url, standby=is_DR_volumes,
                             dataEngine=DATA_ENGINE)

    is_case_tested = False
    for i in range(RETRY_COUNTS):
        time.sleep(RETRY_INTERVAL)

        restoring_volume = None
        for name in restore_volume_names:
            volume = client.by_id_volume(name)
            if volume.restoreStatus and volume.restoreStatus[0].progress != 0:
                restoring_volume = volume
                break
        if not restoring_volume:
            continue

        concurrent_count = 0
        restoring_status = restoring_volume.restoreStatus
        if len(restoring_status) != 0 and \
                restoring_status[0].progress != 100:
            restoring_host_id = get_engine_host_id(client,
                                                   restoring_volume.name)
            for restore_volume_name in restore_volume_names:
                if restore_volume_name == restoring_volume.name:
                    concurrent_count += 1
                    continue

                host_id = get_engine_host_id(client, restore_volume_name)
                if host_id != restoring_host_id:
                    continue

                volume = client.by_id_volume(restore_volume_name)
                restore_status = volume.restoreStatus
                if len(restore_status) == 0:
                    continue
                if not restore_status[0].progress or \
                        restore_status[0].progress == 0:
                    continue
                concurrent_count += 1

        if is_DR_volumes:
            if concurrent_count > concurrent_limit:
                is_case_tested = True
                break
        else:
            if concurrent_count == concurrent_limit:
                is_case_tested = True
                break

    assert is_case_tested, \
        f"Unexpected concurrent count: {concurrent_count}\n"

    for restore_volume_name in restore_volume_names:
        if is_DR_volumes:
            wait_for_backup_restore_completed(client, restore_volume_name,
                                              backup.name)
            continue

        wait_for_volume_restoration_completed(client, restore_volume_name)
Given the setting `concurrent-volume-backup-restore-per-node-limit` is 2, a volume (for backup) is created, and that volume has a backup with some data.
When some volumes (num_node * setting value * 3) are created from the backup.
Then the number of restoring volumes per node should match the expectation for normal volumes or DR volumes respectively (a rough sketch of this expectation follows below).
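A minimal sketch of the expectation this helper encodes; the numbers are illustrative only, the real values come from the cluster:

# Illustrative numbers only.
num_nodes = 3
concurrent_limit = 2
restore_volumes_created = num_nodes * concurrent_limit * 3   # 18 volumes

# Acceptance condition sampled on a single node while restores are running:
def case_tested(concurrent_count, is_DR_volumes):
    if is_DR_volumes:
        # DR volumes are exempt from the limit, so the count may exceed it.
        return concurrent_count > concurrent_limit
    # Normal volumes must be throttled to exactly the limit.
    return concurrent_count == concurrent_limit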
def test_instance_manager_cpu_reservation(client, core_api)
def test_instance_manager_cpu_reservation(client, core_api):  # NOQA
    """
    Test if the CPU requests of instance manager pods are controlled by
    the settings and the node specs correctly.

    1. On node 1, set `node.instanceManagerCPURequest` to 150.
       --> The IM pods on this node will be restarted, and the CPU requests
       of these IM pods match the above milli value.
    2. Change the new setting `Guaranteed Instance Manager CPU` to 10, then
       wait for all IM pods except the pods on node 1 to restart.
       --> The CPU requests of the restarted IM pods equal the new setting
       value multiplied by the kube node allocatable CPU.
    3. Set the new setting to 0.
       --> All IM pods except the pod on node 1 will be restarted without
       CPU requests.
    4. Set the field on node 1 to 0.
       --> The IM pods on node 1 will be restarted without CPU requests.
    5. Set the new setting to a value smaller than 40, then wait for all IM
       pods to restart.
       --> The CPU requests of all IM pods equal the new setting value
       multiplied by the kube node allocatable CPU.
    6. Set the new setting to a value greater than 40.
       --> The setting update should fail.
    7. Create a volume, verify everything works as normal.

    Note: use a fixture to restore the setting to its original state.
    """
    instance_managers = client.list_instance_manager()
    host_node_name = get_self_host_id()
    host_node = client.by_id_node(host_node_name)
    other_ims = []
    for im in instance_managers:
        if im.nodeID == host_node_name:
            im_on_host = im
        else:
            other_ims.append(im)
    assert im_on_host

    host_kb_node = core_api.read_node(host_node_name)
    if host_kb_node.status.allocatable["cpu"].endswith('m'):
        allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"][:-1])
    else:
        allocatable_millicpu = int(host_kb_node.status.allocatable["cpu"]) * 1000

    client.update(host_node, allowScheduling=True,
                  instanceManagerCPURequest=150)
    time.sleep(5)
    guaranteed_instance_manager_cpu_setting_check(
        client, core_api, [im_on_host], "Running", True, "150m")

    update_setting(client, SETTING_GUARANTEED_INSTANCE_MANAGER_CPU,
                   '{"v1":"10","v2":"10"}')
    time.sleep(5)
    guaranteed_instance_manager_cpu_setting_check(
        client, core_api, other_ims, "Running", True,
        str(int(allocatable_millicpu * 10 / 100)) + "m")

    update_setting(client, SETTING_GUARANTEED_INSTANCE_MANAGER_CPU,
                   '{"v1":"0","v2":"0"}')
    time.sleep(5)
    guaranteed_instance_manager_cpu_setting_check(
        client, core_api, other_ims, "Running", True, "")

    ims = other_ims
    ims.append(im_on_host)

    host_node = client.by_id_node(host_node_name)
    client.update(host_node, allowScheduling=True,
                  instanceManagerCPURequest=0)
    time.sleep(5)
    guaranteed_instance_manager_cpu_setting_check(
        client, core_api, ims, "Running", True, "")

    update_setting(client, SETTING_GUARANTEED_INSTANCE_MANAGER_CPU,
                   '{"v1":"20","v2":"20"}')
    time.sleep(5)
    guaranteed_instance_manager_cpu_setting_check(
        client, core_api, ims, "Running", True,
        str(int(allocatable_millicpu * 20 / 100)) + "m")

    with pytest.raises(Exception) as e:
        setting = client.by_id_setting(SETTING_GUARANTEED_INSTANCE_MANAGER_CPU)
        client.update(setting, value="41")
    assert "should be less than or equal to 40" in \
        str(e.value)

    # Create a volume to test
    vol_name = generate_volume_name()
    volume = create_and_check_volume(client, vol_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, vol_name)
    assert len(volume.replicas) == 3

    data = write_volume_random_data(volume)
    check_volume_data(volume, data)
    cleanup_volume(client, volume)
Test if the CPU requests of instance manager pods are controlled by the settings and the node specs correctly.
1. On node 1, set `node.instanceManagerCPURequest` to 150. --> The IM pods on this node will be restarted, and the CPU requests of these IM pods match the above milli value.
2. Change the new setting `Guaranteed Instance Manager CPU` to 10, then wait for all IM pods except the pods on node 1 to restart. --> The CPU requests of the restarted IM pods equal the new setting value multiplied by the kube node allocatable CPU (a worked example follows this list).
3. Set the new setting to 0. --> All IM pods except the pod on node 1 will be restarted without CPU requests.
4. Set the field on node 1 to 0. --> The IM pods on node 1 will be restarted without CPU requests.
5. Set the new setting to a value smaller than 40, then wait for all IM pods to restart. --> The CPU requests of all IM pods equal the new setting value multiplied by the kube node allocatable CPU.
6. Set the new setting to a value greater than 40. --> The setting update should fail.
7. Create a volume, verify everything works as normal.
Note: use a fixture to restore the setting to its original state.
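The expected CPU request follows the same arithmetic the test uses; a worked example with an assumed node that has 4 allocatable cores (4000m):

allocatable_millicpu = 4000          # assumed: node with 4 allocatable cores
guaranteed_im_cpu_percent = 10       # setting value from step 2

expected_request = str(int(allocatable_millicpu *
                           guaranteed_im_cpu_percent / 100)) + "m"
assert expected_request == "400m"    # CPU request the IM pod should carry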
def test_setting_backing_image_auto_cleanup(client, core_api, volume_name)
@pytest.mark.backing_image  # NOQA
def test_setting_backing_image_auto_cleanup(client, core_api, volume_name):  # NOQA
    """
    Test that the Backing Image Cleanup Wait Interval setting works correctly.

    The default value of setting `BackingImageCleanupWaitInterval` is 60.

    1. Clean up the backing image work directory so that the current case
       won't be intervened by previous tests.
    2. Create a backing image.
    3. Create multiple volumes using the backing image.
    4. Attach all volumes, then:
        1. Wait for all volumes to become running.
        2. Verify the content is correct in all volumes.
        3. Verify the backing image disk status map.
        4. Verify the only backing image file in each disk is reused by
           multiple replicas. The backing image file path is
           `<Data path>/<The backing image name>/backing`
    5. Unschedule the test node to guarantee that, when a replica is removed
       from the test node, no new replica can be rebuilt on the test node.
    6. Remove all replicas in one disk. Wait for 50 seconds. Then verify
       nothing changes in the backing image disk state map (before the
       cleanup wait interval is passed).
    7. Modify `BackingImageCleanupWaitInterval` to a small value. Then verify:
        1. The download state of the disk containing no replica becomes
           terminating first, and the entry is removed from the map later.
        2. The related backing image file is removed.
        3. The download state of the other disks stays unchanged. All volumes
           still work fine.
    8. Delete all volumes. Verify that only 1 entry remains in the backing
       image disk map.
    9. Delete the backing image.
    """
    # Step 1
    subprocess.check_call([
        "rm", "-rf", os.path.join(DEFAULT_DISK_PATH, 'backing-images')
    ])

    # Step 2
    create_backing_image_with_matching_url(
        client, BACKING_IMAGE_NAME, BACKING_IMAGE_QCOW2_URL)

    # Step 3
    volume_names = [
        volume_name + "-1",
        volume_name + "-2",
        volume_name + "-3"
    ]
    for volume_name in volume_names:
        create_and_check_volume(client, volume_name,
                                num_of_replicas=3,
                                size=str(BACKING_IMAGE_EXT4_SIZE),
                                backing_image=BACKING_IMAGE_NAME)

    # Step 4
    lht_host_id = get_self_host_id()
    for volume_name in volume_names:
        volume = client.by_id_volume(volume_name)
        volume.attach(hostId=lht_host_id)
    for volume_name in volume_names:
        volume = wait_for_volume_healthy(client, volume_name)
        assert volume.backingImage == BACKING_IMAGE_NAME

    backing_image = client.by_id_backing_image(BACKING_IMAGE_NAME)
    assert len(backing_image.diskFileStatusMap) == 3
    for disk_id, status in iter(backing_image.diskFileStatusMap.items()):
        assert status.state == "ready"

    backing_images_in_disk = os.listdir(os.path.join(DEFAULT_DISK_PATH,
                                                     'backing-images'))
    assert len(backing_images_in_disk) == 1
    assert os.path.exists(os.path.join(DEFAULT_DISK_PATH, "backing-images",
                                       backing_images_in_disk[0], "backing"))
    assert os.path.exists(os.path.join(DEFAULT_DISK_PATH, "backing-images",
                                       backing_images_in_disk[0],
                                       "backing.cfg"))

    # Step 5
    current_host = client.by_id_node(id=lht_host_id)
    client.update(current_host, allowScheduling=False)
    wait_for_node_update(client, lht_host_id, "allowScheduling", False)

    # Step 6
    for volume_name in volume_names:
        volume = client.by_id_volume(volume_name)
        for replica in volume.replicas:
            if replica.hostId == lht_host_id:
                replica_name = replica.name
        volume.replicaRemove(name=replica_name)

    # This wait interval should be smaller than the setting value.
    # Otherwise, the backing image files may be cleaned up.
    time.sleep(int(BACKING_IMAGE_CLEANUP_WAIT_INTERVAL))
    check_backing_image_disk_map_status(client, BACKING_IMAGE_NAME, 3, "ready")

    # Step 7
    update_setting(client, "backing-image-cleanup-wait-interval", "1")
    check_backing_image_disk_map_status(client, BACKING_IMAGE_NAME, 2, "ready")

    for i in range(RETRY_EXEC_COUNTS):
        try:
            backing_images_in_disk = os.listdir(
                os.path.join(DEFAULT_DISK_PATH, "backing-images"))
            assert len(backing_images_in_disk) == 0
        except Exception:
            time.sleep(RETRY_INTERVAL)

    # Step 8
    for volume_name in volume_names:
        volume = client.by_id_volume(volume_name)
        client.delete(volume)
        wait_for_volume_delete(client, volume_name)
    check_backing_image_disk_map_status(client, BACKING_IMAGE_NAME, 1, "ready")
Test that the Backing Image Cleanup Wait Interval setting works correctly.
The default value of setting `BackingImageCleanupWaitInterval` is 60.
1. Clean up the backing image work directory so that the current case won't be intervened by previous tests.
2. Create a backing image.
3. Create multiple volumes using the backing image.
4. Attach all volumes, then:
   1. Wait for all volumes to become running.
   2. Verify the content is correct in all volumes.
   3. Verify the backing image disk status map.
   4. Verify the only backing image file in each disk is reused by multiple replicas. The backing image file path is `<Data path>/<The backing image name>/backing`.
5. Unschedule the test node to guarantee that, when a replica is removed from the test node, no new replica can be rebuilt on the test node.
6. Remove all replicas in one disk. Wait for 50 seconds. Then verify nothing changes in the backing image disk state map (before the cleanup wait interval is passed).
7. Modify `BackingImageCleanupWaitInterval` to a small value. Then verify:
   1. The download state of the disk containing no replica becomes terminating first, and the entry is removed from the map later.
   2. The related backing image file is removed.
   3. The download state of the other disks stays unchanged. All volumes still work fine.
8. Delete all volumes. Verify that only 1 entry remains in the backing image disk map.
9. Delete the backing image.
def test_setting_backup_target_update_via_configmap(client, core_api, request)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_backup_target_update_via_configmap(client, core_api, request):  # NOQA
    """
    Test the backup target setting via configmap
    1. Initialize longhorn-default-setting configmap
    2. Update longhorn-default-setting configmap with a new backup-target
       value
    3. Verify the updated settings
    """
    # Check whether the config map `longhorn-default-resource` is created
    lh_cms = core_api.list_namespaced_config_map(namespace='longhorn-system')
    cm_names = [config_map.metadata.name for config_map in lh_cms.items]
    if DEFAULT_RESOURCE_CONFIGMAP_NAME in cm_names:
        config_map_name = DEFAULT_RESOURCE_CONFIGMAP_NAME
        data_yaml_name = DEFAULT_RESOURCE_YAML_NAME
    else:
        config_map_name = DEFAULT_SETTING_CONFIGMAP_NAME
        data_yaml_name = DEFAULT_SETTING_YAML_NAME

    # Step 1
    client = get_longhorn_api_client()  # NOQA
    init_longhorn_default_setting_configmap(core_api, client,
                                            configmap_name=config_map_name,
                                            data_yaml_name=data_yaml_name)

    # Step 2
    target = "s3://backupbucket-invalid@us-east-1/backupstore"
    update_settings_via_configmap(core_api, client,
                                  [SETTING_BACKUP_TARGET],
                                  [target],
                                  request,
                                  configmap_name=config_map_name,
                                  data_yaml_name=data_yaml_name)

    # Step 3
    try:
        validate_settings(core_api, client,
                          [SETTING_BACKUP_TARGET],
                          [target])
    except Exception as e:
        if SETTING_BACKUP_TARGET_NOT_SUPPORTED in str(e):
            wait_backup_target_url_updated(client, target)
        else:
            raise e
Test the backup target setting via configmap:
1. Initialize the longhorn-default-setting configmap.
2. Update the longhorn-default-setting configmap with a new backup-target value.
3. Verify the updated settings.
The resulting ConfigMap patch is sketched below.
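For reference, and assuming SETTING_BACKUP_TARGET resolves to the setting name `backup-target`, the patch applied in step 2 boils down to a manifest like the following (built via `config_map_with_value`; the name may instead be `longhorn-default-resource` with its own data key when that ConfigMap exists, as the code above selects):

# Sketch of the patched ConfigMap body; values mirror the test's
# intentionally invalid bucket URL.
configmap_body = {
    "apiVersion": "v1",
    "kind": "ConfigMap",
    "metadata": {"name": "longhorn-default-setting"},
    "data": {
        "default-setting.yaml":
            "backup-target: s3://backupbucket-invalid@us-east-1/backupstore\n"
    }
}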
def test_setting_concurrent_rebuild_limit(client, core_api, volume_name)
def test_setting_concurrent_rebuild_limit(client, core_api, volume_name):  # NOQA
    """
    Test if setting Concurrent Replica Rebuild Per Node Limit works correctly.

    The default setting value is 0, which means no limit.

    Case 1 - the setting will limit the rebuilding correctly:
    1. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1.
    2. Create 2 volumes then attach both volumes.
    3. Write a large amount of data into both volumes, so that the rebuilding
       will take a while.
    4. Delete one replica for volume 1 then the replica on the same node for
       volume 2 to trigger (concurrent) rebuilding.
    5. Verify the new replica of volume 2 won't be started until volume 1
       rebuilding complete. And the new replica of volume 2 will be started
       immediately once the 1st rebuilding is done.
    6. Wait for rebuilding complete then repeat step 4.
    7. Set `ConcurrentReplicaRebuildPerNodeLimit` to 0 or 2 while the volume 1
       rebuilding is still in progress. Then the new replica of volume 2 will
       be started immediately before the 1st rebuilding is done.
    8. Wait for rebuilding complete then repeat step 4.
    9. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1
    10. Crash the replica process of volume 1 while the rebuilding is in
        progress. Then the rebuilding of volume 2 will be started, and the
        rebuilding of volume 1 will wait for the volume 2 becoming healthy.

    (There is no need to clean up the above 2 volumes.)

    Case 2 - the setting won't intervene normal attachment:
    1. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1.
    2. Make volume 1 state attached and healthy while volume 2 is detached.
    3. Delete one replica for volume 1 to trigger the rebuilding.
    4. Attach then detach volume 2. The attachment/detachment should succeed
       even if the rebuilding in volume 1 is still in progress.
    """
    # Step 1-1
    update_setting(client, "concurrent-replica-rebuild-per-node-limit", "1")

    # Step 1-2
    volume1_name = "test-vol-1"  # NOQA
    volume1 = create_and_check_volume(client, volume1_name, size=str(4 * Gi))
    volume1.attach(hostId=get_self_host_id())
    volume1 = wait_for_volume_healthy(client, volume1_name)

    volume2_name = "test-vol-2"  # NOQA
    volume2 = create_and_check_volume(client, volume2_name, size=str(4 * Gi))
    volume2.attach(hostId=get_self_host_id())
    volume2 = wait_for_volume_healthy(client, volume2_name)

    # Step 1-3
    volume1_endpoint = get_volume_endpoint(volume1)
    volume2_endpoint = get_volume_endpoint(volume2)
    write_volume_dev_random_mb_data(volume1_endpoint, 1, 3500, 5)
    write_volume_dev_random_mb_data(volume2_endpoint, 1, 3500, 5)

    # Step 1-4, 1-5
    delete_replica_on_test_node(client, volume1_name)
    wait_for_rebuild_start(client, volume1_name)
    delete_replica_on_test_node(client, volume2_name)

    for i in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        volume2 = client.by_id_volume(volume2_name)
        if volume1.rebuildStatus == []:
            break
        assert volume1.rebuildStatus[0].state == "in_progress"
        assert volume2.rebuildStatus == []
        time.sleep(RETRY_INTERVAL)

    wait_for_rebuild_complete(client, volume1_name)
    wait_for_rebuild_start(client, volume2_name)
    wait_for_rebuild_complete(client, volume2_name)

    # Step 1-6
    wait_for_volume_healthy(client, volume1_name)
    wait_for_volume_healthy(client, volume2_name)

    # Step 1-7
    delete_replica_on_test_node(client, volume1_name)
    delete_replica_on_test_node(client, volume2_name)
    update_setting(client, "concurrent-replica-rebuild-per-node-limit", "2")

    # In a 2 minutes retry loop:
    # verify that volume 2 start rebuilding while volume 1 is still rebuilding
    concourent_build = False
    for i in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        volume2 = client.by_id_volume(volume2_name)
        try:
            if volume1.rebuildStatus[0].state == "in_progress" and \
                    volume2.rebuildStatus[0].state == "in_progress":
                concourent_build = True
                break
        except:  # NOQA
            pass
        time.sleep(RETRY_SNAPSHOT_INTERVAL)
    assert concourent_build is True

    # Step 1-8
    wait_for_rebuild_complete(client, volume1_name)
    wait_for_rebuild_complete(client, volume2_name)

    # Step 1-9
    update_setting(client, "concurrent-replica-rebuild-per-node-limit", "1")

    # Step 1-10
    delete_replica_on_test_node(client, volume1_name)
    wait_for_rebuild_start(client, volume1_name)
    volume1 = client.by_id_volume(volume1_name)

    current_node = get_self_host_id()
    replicas = []
    for r in volume1.replicas:
        if r["hostId"] == current_node:
            replicas.append(r)
    assert len(replicas) > 0

    crash_replica_processes(client, core_api, volume1_name, replicas)
    delete_replica_on_test_node(client, volume2_name)

    # While one volume is rebuilding, verify another volume is not
    # rebuilding and stuck in degrading state
    rebuild_started = False
    for i in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        volume2 = client.by_id_volume(volume2_name)
        if volume1.rebuildStatus == [] and \
                volume2.rebuildStatus == [] and \
                rebuild_started is False:
            continue
        elif volume1.rebuildStatus == [] and \
                volume2.rebuildStatus == [] and \
                rebuild_started is True:
            break
        elif volume2.rebuildStatus == []:
            assert volume1.rebuildStatus[0].state == "in_progress"
            rebuild_started = True
        elif volume1.rebuildStatus == []:
            assert volume2.rebuildStatus[0].state == "in_progress"
            rebuild_started = True
        time.sleep(RETRY_INTERVAL)

    wait_for_rebuild_complete(client, volume2_name)
    wait_for_rebuild_complete(client, volume1_name)

    # Step 2-1
    # Step 2-2
    wait_for_volume_healthy(client, volume1_name)
    wait_for_volume_healthy(client, volume2_name)

    volume2 = client.by_id_volume(volume2_name)
    lht_host_id = get_self_host_id()
    volume2.detach(hostId=lht_host_id)

    # Step 2-2
    delete_replica_on_test_node(client, volume1_name)
    wait_for_rebuild_start(client, volume1_name)

    # Step 2-3
    volume2 = client.by_id_volume(volume2_name)
    volume2.attach(hostId=lht_host_id)

    # In a 2 minutes retry loop:
    # verify that we can see the case: volume2 becomes healthy while
    # volume1 is rebuilding
    expect_case = False
    for i in range(RETRY_COUNTS):
        volume1 = client.by_id_volume(volume1_name)
        volume2 = client.by_id_volume(volume2_name)
        try:
            if volume1.rebuildStatus[0].state == "in_progress" and \
                    volume2["robustness"] == "healthy":
                expect_case = True
                break
        except:  # NOQA
            pass
        time.sleep(RETRY_INTERVAL)
    assert expect_case is True

    wait_for_volume_healthy(client, volume1_name)

    volume2.detach(hostId=lht_host_id)
    wait_for_volume_detached(client, volume2_name)
    volume2.attach(hostId=lht_host_id)
    wait_for_volume_healthy(client, volume2_name)
Test if setting Concurrent Replica Rebuild Per Node Limit works correctly.
The default setting value is 0, which means no limit.
Case 1 - the setting will limit the rebuilding correctly:
1. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1.
2. Create 2 volumes then attach both volumes.
3. Write a large amount of data into both volumes, so that the rebuilding will take a while.
4. Delete one replica for volume 1, then the replica on the same node for volume 2, to trigger (concurrent) rebuilding.
5. Verify the new replica of volume 2 won't be started until volume 1's rebuilding completes, and that the new replica of volume 2 is started immediately once the 1st rebuilding is done.
6. Wait for the rebuilding to complete, then repeat step 4.
7. Set `ConcurrentReplicaRebuildPerNodeLimit` to 0 or 2 while the volume 1 rebuilding is still in progress. Then the new replica of volume 2 will be started immediately, before the 1st rebuilding is done.
8. Wait for the rebuilding to complete, then repeat step 4.
9. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1.
10. Crash the replica process of volume 1 while the rebuilding is in progress. Then the rebuilding of volume 2 will be started, and the rebuilding of volume 1 will wait for volume 2 to become healthy. (There is no need to clean up the above 2 volumes.)
Case 2 - the setting won't intervene in normal attachment:
1. Set `ConcurrentReplicaRebuildPerNodeLimit` to 1.
2. Make volume 1 attached and healthy while volume 2 is detached.
3. Delete one replica for volume 1 to trigger the rebuilding.
4. Attach then detach volume 2. The attachment/detachment should succeed even if the rebuilding in volume 1 is still in progress.
def test_setting_concurrent_volume_backup_restore_limit(set_random_backupstore, client, volume_name)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_concurrent_volume_backup_restore_limit(set_random_backupstore, client, volume_name):  # NOQA
    """
    Scenario: setting Concurrent Volume Backup Restore Limit should limit
              the concurrent volume backup restoring

    Issue: https://github.com/longhorn/longhorn/issues/4558

    Given/When see:
      setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test

    Then Number of restoring volumes per node should not exceed the
         setting value.
    """
    setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(
        client, volume_name
    )
Scenario: setting Concurrent Volume Backup Restore Limit should limit the concurrent volume backup restoring
Issue: https://github.com/longhorn/longhorn/issues/4558
Given/When see: setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test
Then the number of restoring volumes per node should not exceed the setting value.
def test_setting_concurrent_volume_backup_restore_limit_should_not_effect_dr_volumes(set_random_backupstore, client, volume_name)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_concurrent_volume_backup_restore_limit_should_not_effect_dr_volumes(set_random_backupstore, client, volume_name):  # NOQA
    """
    Scenario: setting Concurrent Volume Backup Restore Limit should not
              affect DR volumes

    Issue: https://github.com/longhorn/longhorn/issues/4558

    Given/When see:
      setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test

    Then Number of restoring volumes can exceed the setting value.
    """
    setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test(
        client, volume_name, is_DR_volumes=True
    )
Scenario: setting Concurrent Volume Backup Restore Limit should not affect DR volumes
Issue: https://github.com/longhorn/longhorn/issues/4558
Given/When see: setting_concurrent_volume_backup_restore_limit_concurrent_restoring_test
Then the number of restoring volumes can exceed the setting value.
def test_setting_data_engine(client, request)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_data_engine(client, request):  # NOQA
    """
    Test that the v1 data engine setting works correctly.
    1. Create a volume and attach it.
    2. Set v1 data engine setting to false. The setting should be rejected.
    3. Detach the volume.
    4. Set v1 data engine setting to false again. The setting should be
       accepted. Then, attach the volume. The volume is unable to attach.
    5. Set v1 data engine setting to true. The setting should be accepted.
    6. Attach the volume.
    """
    if DATA_ENGINE == "v1":
        setting_data_engine = SETTING_V1_DATA_ENGINE
    elif DATA_ENGINE == "v2":
        setting_data_engine = SETTING_V2_DATA_ENGINE
    setting = client.by_id_setting(setting_data_engine)

    # Step 1
    volume_name = "test-{0}-vol".format(DATA_ENGINE)  # NOQA
    volume = create_and_check_volume(client, volume_name)

    def finalizer():
        cleanup_volume(client, volume)
        update_setting(client, setting_data_engine, "true")

    request.addfinalizer(finalizer)

    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    # Step 2
    with pytest.raises(Exception) as e:
        client.update(setting, value="false")
    assert 'cannot apply {0}-data-engine setting to Longhorn workloads when ' \
           'there are attached {0} volumes'.format(DATA_ENGINE) in str(e.value)

    # Step 3
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    # Step 4
    update_setting(client, setting_data_engine, "false")
    count = wait_for_instance_manager_count(client, 0)
    assert count == 0

    volume.attach(hostId=get_self_host_id())
    with pytest.raises(Exception) as e:
        wait_for_volume_healthy(client, volume_name)
    assert 'volume[key]=detached' in str(e.value)

    # Step 5
    update_setting(client, setting_data_engine, "true")
    nodes = client.list_node()
    count = wait_for_instance_manager_count(client, len(nodes))
    assert count == len(nodes)

    # Step 6
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)
Test that the v1 data engine setting works correctly.
1. Create a volume and attach it.
2. Set the v1 data engine setting to false. The setting should be rejected.
3. Detach the volume.
4. Set the v1 data engine setting to false again. The setting should be accepted. Then attach the volume. The volume is unable to attach.
5. Set the v1 data engine setting to true. The setting should be accepted.
6. Attach the volume.
def test_setting_priority_class(client, core_api, apps_api, scheduling_api, priority_class, volume_name)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_priority_class(client, core_api, apps_api, scheduling_api, priority_class, volume_name):  # NOQA
    """
    Test that the Priority Class setting is validated and utilized correctly.

    1. Verify that the name of a non-existent Priority Class cannot be used
       for the Setting.
    2. Create a new Priority Class in Kubernetes.
    3. Create and attach a Volume.
    4. Verify that the Priority Class Setting can be updated with an attached
       volume.
    5. Generate and write `data1`.
    6. Detach the Volume.
    7. Update the Priority Class Setting to the new Priority Class.
    8. Wait for all the Longhorn system components to restart with the new
       Priority Class.
    9. Verify that UI, manager, and driver deployer don't have the Priority
       Class.
    10. Attach the Volume and verify `data1`.
    11. Generate and write `data2`.
    12. Unset the Priority Class Setting.
    13. Wait for all the Longhorn system components to restart without the
        Priority Class.
    14. Verify that UI, manager, and driver deployer don't have the Priority
        Class.
    15. Attach the Volume and verify `data2`.
    16. Generate and write `data3`.

    Note: system components are workloads other than UI, manager, driver
          deployer
    """
    client = get_longhorn_api_client()  # NOQA
    count = len(client.list_node())
    name = priority_class['metadata']['name']
    setting = client.by_id_setting(SETTING_PRIORITY_CLASS)

    with pytest.raises(Exception) as e:
        client.update(setting, value=name)
    assert 'failed to get priority class ' in str(e.value)

    scheduling_api.create_priority_class(priority_class)

    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    update_setting(client, SETTING_PRIORITY_CLASS, name)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach()
    wait_for_volume_detached(client, volume_name)

    wait_for_priority_class_update(core_api, apps_api, count, priority_class)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)

    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)

    volume.detach()
    wait_for_volume_detached(client, volume_name)

    update_setting(client, SETTING_PRIORITY_CLASS, '')
    wait_for_priority_class_update(core_api, apps_api, count)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)

    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
Test that the Priority Class setting is validated and utilized correctly.
1. Verify that the name of a non-existent Priority Class cannot be used for the Setting.
2. Create a new Priority Class in Kubernetes.
3. Create and attach a Volume.
4. Verify that the Priority Class Setting can be updated with an attached volume.
5. Generate and write `data1`.
6. Detach the Volume.
7. Update the Priority Class Setting to the new Priority Class.
8. Wait for all the Longhorn system components to restart with the new Priority Class.
9. Verify that UI, manager, and driver deployer don't have the Priority Class.
10. Attach the Volume and verify `data1`.
11. Generate and write `data2`.
12. Unset the Priority Class Setting.
13. Wait for all the Longhorn system components to restart without the Priority Class.
14. Verify that UI, manager, and driver deployer don't have the Priority Class.
15. Attach the Volume and verify `data2`.
16. Generate and write `data3`.
Note: system components are workloads other than UI, manager, and driver deployer. The fixture shape assumed by `check_priority_class` is sketched after this list.
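As a sketch only (the real fixture lives in the test fixtures and may carry more fields such as apiVersion, kind, or a description), `check_priority_class` above assumes the `priority_class` object exposes at least a name and an integer value:

# Minimal assumed shape; "example-priority-class" and 10000 are illustrative.
priority_class = {
    "metadata": {"name": "example-priority-class"},
    "value": 10000,
}

# After the setting is applied, a system-component pod should satisfy:
#   pod.spec.priority == priority_class["value"]
#   pod.spec.priority_class_name == priority_class["metadata"]["name"]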
def test_setting_replica_count_update_via_configmap(client, core_api, request)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_replica_count_update_via_configmap(client, core_api, request):  # NOQA
    """
    Test the default-replica-count setting via configmap
    1. Get default-replica-count value
    2. Initialize longhorn-default-setting configmap
    3. Verify default-replica-count is not changed
    4. Update longhorn-default-setting configmap with a new
       default-replica-count value
    5. Verify the updated settings
    6. Update default-replica-count setting CR with the old value
    """
    # Step 1
    client = get_longhorn_api_client()  # NOQA
    old_setting = client.by_id_setting(SETTING_DEFAULT_REPLICA_COUNT)

    # Step 2
    init_longhorn_default_setting_configmap(core_api, client)

    # Step 3
    assert wait_for_setting_updated(client, SETTING_DEFAULT_REPLICA_COUNT,
                                    old_setting.value)

    # Step 4
    replica_count = "1"
    update_settings_via_configmap(core_api, client,
                                  [SETTING_DEFAULT_REPLICA_COUNT],
                                  [json.dumps({
                                      "v1": str(replica_count),
                                      "v2": str(replica_count)
                                  }, separators=(',', ':'))],
                                  request)

    # Step 5
    validate_settings(core_api, client,
                      [SETTING_DEFAULT_REPLICA_COUNT],
                      [json.dumps({
                          "v1": str(replica_count),
                          "v2": str(replica_count)
                      }, separators=(',', ':'))])

    # Step 6
    retry_setting_update(client, SETTING_DEFAULT_REPLICA_COUNT,
                         old_setting.definition.default)
Test the default-replica-count setting via configmap:
1. Get the default-replica-count value.
2. Initialize the longhorn-default-setting configmap.
3. Verify default-replica-count is not changed.
4. Update the longhorn-default-setting configmap with a new default-replica-count value.
5. Verify the updated settings.
6. Update the default-replica-count setting CR with the old value.
The JSON value format used in step 4 is shown below.
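The value written in step 4 is the per-data-engine JSON form produced by `json.dumps`; for a replica count of 1 it is simply:

import json

replica_count = "1"
value = json.dumps({"v1": replica_count, "v2": replica_count},
                   separators=(',', ':'))
assert value == '{"v1":"1","v2":"1"}'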
def test_setting_toleration(client)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_toleration(client):  # NOQA
    """
    Test toleration setting

    1.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
    2.  Verify the request fails.
    3.  Create a volume and attach it.
    4.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    5.  Verify that the toleration setting can be updated while a volume is
        attached.
    6.  Generate and write `data1` into the volume.
    7.  Detach the volume.
    8.  Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
    9.  Wait for all the Longhorn system components to restart with the new
        toleration.
    10. Verify that UI, manager, and driver deployer don't restart and don't
        have the new toleration.
    11. Attach the volume again and verify the volume `data1`.
    12. Generate and write `data2` to the volume.
    13. Detach the volume.
    14. Clear the `taint-toleration` setting.
    15. Wait for all the Longhorn system components to restart with no
        toleration.
    16. Attach the volume and validate `data2`.
    17. Generate and write `data3` to the volume.
    """
    client = get_longhorn_api_client()  # NOQA
    apps_api = get_apps_api_client()  # NOQA
    core_api = get_core_api_client()  # NOQA
    count = len(client.list_node())

    setting = client.by_id_setting(SETTING_TAINT_TOLERATION)

    with pytest.raises(Exception) as e:
        client.update(setting,
                      value="key1=value1:NoSchedule; key2:InvalidEffect")
    assert 'invalid effect' in str(e.value)

    volume_name = "test-toleration-vol"  # NOQA
    volume = create_and_check_volume(client, volume_name)
    volume.attach(hostId=get_self_host_id())
    volume = wait_for_volume_healthy(client, volume_name)

    setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"
    setting_value_dicts = [
        {
            "key": "key1",
            "value": "value1",
            "operator": "Equal",
            "effect": "NoSchedule"
        },
        {
            "key": "key2",
            "value": None,
            "operator": "Exists",
            "effect": "NoExecute"
        },
    ]
    update_setting(client, SETTING_TAINT_TOLERATION, setting_value_str)

    data1 = write_volume_random_data(volume)
    check_volume_data(volume, data1)

    volume.detach()
    wait_for_volume_detached(client, volume_name)

    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data1)
    data2 = write_volume_random_data(volume)
    check_volume_data(volume, data2)
    volume.detach()
    wait_for_volume_detached(client, volume_name)

    # cleanup
    setting_value_str = ""
    setting_value_dicts = []
    update_setting(client, SETTING_TAINT_TOLERATION, setting_value_str)

    wait_for_toleration_update(core_api, apps_api, count, setting_value_dicts)

    client, node = wait_for_longhorn_node_ready()

    volume = client.by_id_volume(volume_name)
    volume.attach(hostId=node)
    volume = wait_for_volume_healthy(client, volume_name)
    check_volume_data(volume, data2)
    data3 = write_volume_random_data(volume)
    check_volume_data(volume, data3)

    cleanup_volume(client, volume)
Test toleration setting
1. Set `taint-toleration` to "key1=value1:NoSchedule; key2:InvalidEffect".
2. Verify the request fails.
3. Create a volume and attach it.
4. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
5. Verify that the toleration setting can be updated while a volume is attached.
6. Generate and write `data1` into the volume.
7. Detach the volume.
8. Set `taint-toleration` to "key1=value1:NoSchedule; key2:NoExecute".
9. Wait for all the Longhorn system components to restart with the new toleration.
10. Verify that UI, manager, and driver deployer don't restart and don't have the new toleration.
11. Attach the volume again and verify the volume `data1`.
12. Generate and write `data2` to the volume.
13. Detach the volume.
14. Clear the `taint-toleration` setting.
15. Wait for all the Longhorn system components to restart with no toleration.
16. Attach the volume and validate `data2`.
17. Generate and write `data3` to the volume.
The mapping from the toleration string to Kubernetes tolerations is sketched after this list.
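The toleration string used in steps 4 and 8 corresponds to Kubernetes tolerations exactly as the test's own expectation list states; restated here as a small sketch:

setting_value_str = "key1=value1:NoSchedule; key2:NoExecute"

# Equivalent tolerations the Longhorn system components should receive:
expected_tolerations = [
    # "key1=value1:NoSchedule" -> operator Equal with an explicit value
    {"key": "key1", "value": "value1",
     "operator": "Equal", "effect": "NoSchedule"},
    # "key2:NoExecute" -> no value, so operator Exists
    {"key": "key2", "value": None,
     "operator": "Exists", "effect": "NoExecute"},
]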
def test_setting_toleration_extra(client, core_api, apps_api)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_toleration_extra(client, core_api, apps_api):  # NOQA
    """
    Steps:
    1. Set Kubernetes Taint Toleration to:
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
    2. Verify that all system components have the 2 tolerations
       `ex.com/foobar:NoExecute; ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and don't
       have the toleration.
    3. Set Kubernetes Taint Toleration to:
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
    4. Verify that all system components have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`,
       and don't have the 2 tolerations
       `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and don't
       have the toleration.
    5. Set Kubernetes Taint Toleration to the special value: `:`.
    6. Verify that all system components have the toleration with
       `operator: Exists` and the other fields of the toleration empty.
       Verify that all system components don't have the toleration
       `node-role.kubernetes.io/controlplane=true:NoSchedule`.
       Verify that UI, manager, and driver deployer don't restart and don't
       have the toleration.
    7. Clear Kubernetes Taint Toleration

    Note: system components are workloads other than UI, manager, driver
          deployer
    """
    settings = [
        {
            "value": "ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule",
            "expect": [
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoExecute"
                },
                {
                    "key": "ex.com/foobar",
                    "value": None,
                    "operator": "Exists",
                    "effect": "NoSchedule"
                },
            ],
        },
        {
            "value": "node-role.kubernetes.io/controlplane=true:NoSchedule",
            "expect": [
                {
                    "key": "node-role.kubernetes.io/controlplane",
                    "value": "true",
                    "operator": "Equal",
                    "effect": "NoSchedule"
                },
            ],
        },
        # Skip this special toleration for now because it makes
        # Longhorn deploy manager pods on control/etcd nodes
        # and the control/etcd nodes become "down" after the test
        # clears this toleration.
        # We will enable this test once we implement logic for
        # deleting failed nodes.
        # {
        #     "value": ":",
        #     "expect": [
        #         {
        #             "key": None,
        #             "value": None,
        #             "operator": "Exists",
        #             "effect": None,
        #         },
        #     ]
        # },
        {
            "value": "",
            "expect": [],
        },
    ]

    chk_removed_tolerations = []
    for setting in settings:
        update_setting(get_longhorn_api_client(), SETTING_TAINT_TOLERATION,
                       setting["value"])

        node_count = len(get_longhorn_api_client().list_node())
        wait_for_toleration_update(core_api, apps_api, node_count,
                                   setting["expect"],
                                   chk_removed_tolerations)
        chk_removed_tolerations = setting["expect"]
Steps:
1. Set Kubernetes Taint Toleration to: `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`.
2. Verify that all system components have the 2 tolerations `ex.com/foobar:NoExecute; ex.com/foobar:NoSchedule`. Verify that UI, manager, and driver deployer don't restart and don't have the toleration.
3. Set Kubernetes Taint Toleration to: `node-role.kubernetes.io/controlplane=true:NoSchedule`.
4. Verify that all system components have the toleration `node-role.kubernetes.io/controlplane=true:NoSchedule`, and don't have the 2 tolerations `ex.com/foobar:NoExecute;ex.com/foobar:NoSchedule`. Verify that UI, manager, and driver deployer don't restart and don't have the toleration.
5. Set Kubernetes Taint Toleration to the special value: `:`.
6. Verify that all system components have the toleration with `operator: Exists` and the other fields of the toleration empty. Verify that all system components don't have the toleration `node-role.kubernetes.io/controlplane=true:NoSchedule`. Verify that UI, manager, and driver deployer don't restart and don't have the toleration.
7. Clear Kubernetes Taint Toleration.
Note: system components are workloads other than UI, manager, and driver deployer.
def test_setting_update_with_invalid_value_via_configmap(client, core_api, apps_api, request)
@pytest.mark.v2_volume_test  # NOQA
def test_setting_update_with_invalid_value_via_configmap(client, core_api, apps_api, request):  # NOQA
    """
    Test the default settings update with invalid value via configmap
    1. Create an attached volume
    2. Initialize longhorn-default-setting configmap containing
       valid and invalid settings
    3. Update longhorn-default-setting configmap with invalid settings.
       The invalid settings SETTING_TAINT_TOLERATION will be updated
    4. The changes will be applied once the volumes are detached. (To Do)
    5. Validate the default settings values.
    """
    # Check whether the config map `longhorn-default-resource` is created
    backup_cm_created = False
    lh_cms = core_api.list_namespaced_config_map(namespace='longhorn-system')
    cm_names = [config_map.metadata.name for config_map in lh_cms.items]
    if DEFAULT_RESOURCE_CONFIGMAP_NAME in cm_names:
        backup_cm_created = True
        bt_config_map_name = DEFAULT_RESOURCE_CONFIGMAP_NAME
        bt_data_yaml_name = DEFAULT_RESOURCE_YAML_NAME

    # Step 1
    client = get_longhorn_api_client()  # NOQA
    lht_hostId = get_self_host_id()
    vol_name = generate_volume_name()
    volume = create_volume(client, vol_name, str(Gi), lht_hostId, 3)
    volume.attach(hostId=lht_hostId)
    volume = wait_for_volume_healthy(client, vol_name)

    # Step 2
    init_longhorn_default_setting_configmap(core_api, client)

    # Step 3
    target = "s3://backupbucket-invalid@us-east-1/backupstore"
    update_settings_via_configmap(core_api, client,
                                  [SETTING_TAINT_TOLERATION],
                                  ["key1=value1:NoSchedule"],
                                  request)

    # Step 4
    validate_settings(core_api, client,
                      [SETTING_TAINT_TOLERATION],
                      ["key1=value1:NoSchedule"])

    if backup_cm_created:
        init_longhorn_default_setting_configmap(
            core_api, client,
            configmap_name=bt_config_map_name,
            data_yaml_name=bt_data_yaml_name)
        update_settings_via_configmap(core_api, client,
                                      [SETTING_BACKUP_TARGET],
                                      [target],
                                      request,
                                      configmap_name=bt_config_map_name,
                                      data_yaml_name=bt_data_yaml_name)
        wait_backup_target_url_updated(client, target)

    cleanup_volume_by_name(client, vol_name)

    # cleanup the ConfigMap `longhorn-default-setting`
    init_longhorn_default_setting_configmap(core_api, client)

    # reset the toleration setting
    setting_value_str = ""
    setting_value_dicts = []
    update_setting(client, SETTING_TAINT_TOLERATION, setting_value_str)
    wait_for_toleration_update(core_api, apps_api,
                               len(client.list_node()),
                               setting_value_dicts)
    wait_for_longhorn_node_ready()
Test the default settings update with an invalid value via configmap:
1. Create an attached volume.
2. Initialize the longhorn-default-setting configmap containing valid and invalid settings.
3. Update the longhorn-default-setting configmap with invalid settings. The invalid setting SETTING_TAINT_TOLERATION will be updated.
4. The changes will be applied once the volumes are detached. (To Do)
5. Validate the default settings values.
def update_settings_via_configmap(core_api, client, setting_names, setting_values, request, configmap_name='longhorn-default-setting', data_yaml_name='default-setting.yaml')
def update_settings_via_configmap(core_api, client, setting_names, setting_values, request,  # NOQA
        configmap_name=DEFAULT_SETTING_CONFIGMAP_NAME,  # NOQA
        data_yaml_name=DEFAULT_SETTING_YAML_NAME):  # NOQA
    configmap_body = config_map_with_value(configmap_name,
                                           setting_names,
                                           setting_values,
                                           data_yaml_name)
    core_api.patch_namespaced_config_map(name=configmap_name,
                                         namespace='longhorn-system',
                                         body=configmap_body)

    def reset_default_settings():
        # Directly clean up the data of the ConfigMaps
        # `longhorn-default-resource` and `longhorn-default-setting`.
        # This resets the default backup target, replica count,
        # and toleration settings
        # if the ConfigMap contains the data 'backup-target': "", ... .
        init_longhorn_default_setting_configmap(core_api, client,
                                                configmap_name=configmap_name,
                                                data_yaml_name=data_yaml_name)
        if configmap_name == DEFAULT_RESOURCE_CONFIGMAP_NAME:
            reset_backupstore_setting(client)

    request.addfinalizer(reset_default_settings)
def validate_settings(core_api, client, setting_names, setting_values)
def validate_settings(core_api, client, setting_names, setting_values):  # NOQA
    num_settings = len(setting_names)
    for i in range(num_settings):
        name = setting_names[i]
        value = setting_values[i]
        assert wait_for_setting_updated(client, name, value)
def wait_backup_target_url_updated(client, target)
def wait_backup_target_url_updated(client, target):  # NOQA
    updated = False
    for _ in range(RETRY_COUNTS_SHORT):
        backup_target_url = backupstore_get_backup_target(client)
        if backup_target_url == target:
            updated = True
            break
        time.sleep(RETRY_INTERVAL)
    assert updated
def wait_for_longhorn_node_ready()
def wait_for_longhorn_node_ready():
    client = get_longhorn_api_client()  # NOQA

    ei = get_default_engine_image(client)
    ei_name = ei["name"]
    ei_state = get_engine_image_status_value(client, ei_name)
    wait_for_engine_image_state(client, ei_name, ei_state)

    node = get_self_host_id()
    wait_for_node_up_longhorn(node, client)

    return client, node
def wait_for_priority_class_update(core_api, apps_api, count, priority_class=None)
def wait_for_priority_class_update(core_api, apps_api, count, priority_class=None):  # NOQA
    updated = False

    for i in range(RETRY_COUNTS):
        time.sleep(RETRY_INTERVAL_LONG)
        updated = True

        if not check_workload_update(core_api, apps_api, count):
            updated = False
            continue

        pod_list = core_api.list_namespaced_pod(LONGHORN_NAMESPACE).items
        for p in pod_list:
            if p.status.phase != "Running" and \
                    not check_priority_class(p, priority_class):
                updated = False
                break
        if not updated:
            continue

        if updated:
            break

    assert updated
def wait_for_setting_updated(client, name, expected_value)
def wait_for_setting_updated(client, name, expected_value):  # NOQA
    for _ in range(RETRY_COUNTS):
        setting = client.by_id_setting(name)
        if setting.value == expected_value:
            return True
        time.sleep(RETRY_INTERVAL)
    return False
def wait_for_toleration_update(core_api, apps_api, count, expected_tolerations, chk_removed_tolerations=[])
def wait_for_toleration_update(core_api, apps_api, count,  # NOQA
                               expected_tolerations,
                               chk_removed_tolerations=[]):
    not_managed_apps = [
        "csi-attacher",
        "csi-provisioner",
        "csi-resizer",
        "csi-snapshotter",
        "longhorn-csi-plugin",
        "longhorn-driver-deployer",
        "longhorn-manager",
        "longhorn-ui",
    ]

    updated = False
    for _ in range(RETRY_COUNTS):
        time.sleep(RETRY_INTERVAL_LONG)
        updated = True

        if not check_workload_update(core_api, apps_api, count):
            updated = False
            continue

        pod_list = core_api.list_namespaced_pod(LONGHORN_NAMESPACE).items
        for p in pod_list:
            managed_by = p.metadata.labels.get('longhorn.io/managed-by', '')
            if str(managed_by) != "longhorn-manager":
                continue
            else:
                app_name = str(p.metadata.labels.get('app', ''))
                assert app_name not in not_managed_apps

            if p.status.phase != "Running" \
                    or not check_tolerations_set(p.spec.tolerations,
                                                 expected_tolerations,
                                                 chk_removed_tolerations):
                updated = False
                break

        if updated:
            break

    assert updated