client = <longhorn.Client object at 0x7f57ba1eae50>
core_api = <kubernetes.client.api.core_v1_api.CoreV1Api object at 0x7f57c78d0b80>
set_random_backupstore = None
def test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume(client, core_api, set_random_backupstore):  # NOQA
    """
    Test DR, restoring, expanding volumes when the engine image DaemonSet
    is not fully deployed
    Prerequisite:
    Prepare the system for the test by calling the method
    prepare_engine_not_fully_deployed_environment to get a
    tainted node and a not fully deployed engine image.
    1. Create volume vol-1 with 2 replicas
    2. Attach vol-1 to node-2, write data and create backup
    3. Create a DR volume (vol-dr) of 2 replicas.
    4. Verify that 2 replicas are on node-2 and node-3 and the DR volume
       is attached to either node-2 or node-3.
       Let's say it is attached to node-x
    5. Taint node-x with the taint `key=value:NoSchedule`
    6. Delete the pod of the engine image DaemonSet on node-x. Now, the
       engine image is missing on node-1 and node-x
    7. Verify that vol-dr is auto-attached to node-y.
    8. Restore a volume named vol-rs from the backupstore with replica
       count set to 1
    9. Verify that the replica is on node-y and the volume is
       successfully restored.
    10. Wait for vol-rs to finish restoring
    11. Expand vol-rs.
    12. Verify that the expansion is ok
    13. Set `Replica Replenishment Wait Interval` setting to 600
    14. Crash the replica of vol-1 on node-x. Wait for the replica to fail
    15. In a 2-min retry verify that Longhorn doesn't create a new replica
        for vol-1 and doesn't reuse the failed replica on node-x
    """
    update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false")
    tainted_node_id = \
        prepare_engine_not_fully_deployed_environment(client, core_api)
    # step 1
    volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2,
                                      size=str(1 * Gi))
    # node1: tainted node, node2: self-host node, node3: the last one
    nodes = client.list_node()
    for node in nodes:
        if node.id == get_self_host_id():
            node2 = node
        elif node.id != tainted_node_id and node.id != get_self_host_id():
            node3 = node
    # step 2
    volume1 = volume1.attach(hostId=node2.id)
    volume1 = wait_for_volume_healthy(client, volume1.name)
    volume_endpoint = get_volume_endpoint(volume1)
    snap1_offset = 1
    snap_data_size_in_mb = 4
    write_volume_dev_random_mb_data(volume_endpoint,
                                    snap1_offset, snap_data_size_in_mb)
    snap = create_snapshot(client, volume1.name)
    volume1.snapshotBackup(name=snap.name)
    wait_for_backup_completion(client,
                               volume1.name,
                               snap.name,
                               retry_count=600)
    bv, b1 = find_backup(client, volume1.name, snap.name)
    dr_volume_name = volume1.name + "-dr"
    client.create_volume(name=dr_volume_name, size=str(1 * Gi),
                         numberOfReplicas=2, fromBackup=b1.url,
                         frontend="", standby=True)
    wait_for_volume_creation(client, dr_volume_name)
    wait_for_backup_restore_completed(client, dr_volume_name, b1.name)
    dr_volume = client.by_id_volume(dr_volume_name)
    # step 4
    on_node2 = False
    on_node3 = False
    for replica in dr_volume.replicas:
        if replica.hostId == node2.id:
            on_node2 = True
        if replica.hostId == node3.id:
            on_node3 = True
    assert on_node2
    assert on_node3
    # step 5
    node_x = dr_volume.controllers[0].hostId
    core_api.patch_node(
        node_x, {
            "spec": {
                "taints":
                    [{"effect": "NoSchedule",
                      "key": "key",
                      "value": "value"}]
            }
        })
    # step 6
    restart_and_wait_ready_engine_count(client, 1)
    # step 7
    dr_volume = wait_for_volume_degraded(client, dr_volume.name)
    assert dr_volume.controllers[0].hostId != tainted_node_id
    assert dr_volume.controllers[0].hostId != node_x
    node_running_latest_engine = dr_volume.controllers[0].hostId
    # step 8, 9, 10
    res_vol_name = "vol-rs"
    client.create_volume(name=res_vol_name, numberOfReplicas=1,
                         fromBackup=b1.url)
    wait_for_volume_condition_restore(client, res_vol_name,
                                      "status", "True")
    wait_for_volume_condition_restore(client, res_vol_name,
                                      "reason", "RestoreInProgress")
    res_volume = wait_for_volume_detached(client, res_vol_name)
    res_volume = client.by_id_volume(res_vol_name)
    assert res_volume.ready is True
    assert len(res_volume.replicas) == 1
    assert res_volume.replicas[0].hostId == node_running_latest_engine
    # step 11, 12
    expand_size = str(2 * Gi)
    res_volume.expand(size=expand_size)
    wait_for_volume_expansion(client, res_volume.name)
    res_volume = wait_for_volume_detached(client, res_volume.name)
    res_volume.attach(hostId=node_running_latest_engine,
                      disableFrontend=False)
    res_volume = wait_for_volume_healthy(client, res_volume.name)
    assert res_volume.size == expand_size
    # step 13
    replenish_wait_setting = \
        client.by_id_setting(SETTING_REPLICA_REPLENISHMENT_WAIT_INTERVAL)
    client.update(replenish_wait_setting, value="600")
    # step 14
    volume1 = client.by_id_volume(volume1.name)
    for replica in volume1.replicas:
        if replica.hostId == node_x:
            crash_replica_processes(client, core_api, volume1.name,
                                    replicas=[replica],
                                    wait_to_fail=True)
    # step 15
>   volume1 = wait_for_volume_degraded(client, volume1.name)
test_ha.py:3085:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
common.py:1958: in wait_for_volume_degraded
    return wait_for_volume_status(client, name,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
client = <longhorn.Client object at 0x7f57ba1eae50>, name = 'vol-1'
key = 'robustness', value = 'degraded', retry_count = 360
def wait_for_volume_status(client, name, key, value,
                           retry_count=RETRY_COUNTS_LONG):
    wait_for_volume_creation(client, name)
    for i in range(retry_count):
        volume = client.by_id_volume(name)
        if volume[key] == value:
            break
        time.sleep(RETRY_INTERVAL)
>   assert volume[key] == value, f" value={value}\n. \
        volume[key]={volume[key]}\n. volume={volume}"
E AssertionError: value=degraded
E . volume[key]=healthy
E . volume={'accessMode': 'rwo', 'backingImage': '', 'backupCompressionMethod': 'lz4', 'backupStatus': [{'backupURL': 's3://backupbucket@us-east-1/backupstore?backup=backup-e0aa9bf01f4f4726&volume=vol-1', 'error': '', 'progress': 100, 'replica': 'tcp://10.42.3.9:10000', 'size': '6291456', 'snapshot': '657008a8-d35e-49e9-b442-f90d9b2b34e3', 'state': 'Completed'}], 'cloneStatus': {'snapshot': '', 'sourceVolume': '', 'state': ''}, 'conditions': {'Restore': {'lastProbeTime': '', 'lastTransitionTime': '2024-05-06T18:38:53Z', 'message': '', 'reason': '', 'status': 'False'}, 'Scheduled': {'lastProbeTime': '', 'lastTransitionTime': '2024-05-06T18:39:04Z', 'message': '', 'reason': '', 'status': 'True'}, 'TooManySnapshots': {'lastProbeTime': '', 'lastTransitionTime': '2024-05-06T18:38:53Z', 'message': '', 'reason': '', 'status': 'False'}, 'WaitForBackingImage': {'lastProbeTime': '', 'lastTransitionTime': '2024-05-06T18:38:53Z', 'message': '', 'reason': '', 'status': 'False'}}, 'controllers': [{'actualSize': '4194304', 'address': '10.42.3.9', 'currentImage': 'longhornio/longhorn-engine:v1.6.x-head', 'endpoint': '/dev/longhorn/vol-1', 'hostId': 'ip-10-0-2-237', 'image': 'longhornio/longhorn-engine:v1.6.x-head', 'instanceManagerName': 'instance-manager-a884087b76edd3fa09f0fde0121ae877', 'isExpanding': False, 'lastExpansionError': '', 'lastExpansionFailedAt': '', 'lastRestoredBackup': '', 'name': 'vol-1-e-0', 'requestedBackupRestore': '', 'running': True, 'size': '1073741824', 'unmapMarkSnapChainRemovedEnabled': False}], 'created': '2024-05-06 18:38:53 +0000 UTC', 'currentImage': 'longhornio/longhorn-engine:v1.6.x-head', 'dataEngine': 'v1', 'dataLocality': 'disabled', 'dataSource': '', 'disableFrontend': False, 'diskSelector': [], 'encrypted': False, 'fromBackup': '', 'frontend': 'blockdev', 'image': 'longhornio/longhorn-engine:v1.6.x-head', 'kubernetesStatus': {'lastPVCRefAt': '', 'lastPodRefAt': '', 'namespace': '', 'pvName': '', 'pvStatus': '', 'pvcName': '', 'workloadsStatus': None}, 'lastAttachedBy': '', 'lastBackup': 'backup-e0aa9bf01f4f4726', 'lastBackupAt': '2024-05-06T18:39:05Z', 'migratable': False, 'name': 'vol-1', 'nodeSelector': [], 'numberOfReplicas': 2, 'offlineReplicaRebuilding': 'disabled', 'offlineReplicaRebuildingRequired': False, 'purgeStatus': [{'error': '', 'isPurging': False, 'progress': 0, 'replica': 'vol-1-r-332c7049', 'state': ''}, {'error': '', 'isPurging': False, 'progress': 0, 'replica': 'vol-1-r-aa4677d7', 'state': ''}], 'ready': True, 'rebuildStatus': [], 'recurringJobSelector': None, 'replicaAutoBalance': 'ignored', 'replicaDiskSoftAntiAffinity': 'ignored', 'replicaSoftAntiAffinity': 'ignored', 'replicaZoneSoftAntiAffinity': 'ignored', 'replicas': [{'address': '10.42.2.204', 'currentImage': 'longhornio/longhorn-engine:v1.6.x-head', 'dataEngine': 'v1', 'dataPath': '/var/lib/longhorn/replicas/vol-1-612a663e', 'diskID': '93acd9c6-9851-417e-b558-8d9026276e73', 'diskPath': '/var/lib/longhorn/', 'failedAt': '', 'hostId': 'ip-10-0-2-72', 'image': 'longhornio/longhorn-engine:v1.6.x-head', 'instanceManagerName': 'instance-manager-bcf7b0edbf6c14c34b67a3099635ceb3', 'mode': 'RW', 'name': 'vol-1-r-332c7049', 'running': True}, {'address': '', 'currentImage': '', 'dataEngine': 'v1', 'dataPath': '/var/lib/longhorn/replicas/vol-1-c3b16f4a', 'diskID': '20739329-0ed6-4905-94f6-cfa15cf0c376', 'diskPath': '/var/lib/longhorn/', 'failedAt': '2024-05-06T18:40:45Z', 'hostId': 'ip-10-0-2-237', 'image': 'longhornio/longhorn-engine:v1.6.x-head', 'instanceManagerName': '', 'mode': '', 'name': 
'vol-1-r-5f342f7d', 'running': False}, {'address': '10.42.2.204', 'currentImage': 'longhornio/longhorn-engine:v1.6.x-head', 'dataEngine': 'v1', 'dataPath': '/var/lib/longhorn/replicas/vol-1-09f2c38b', 'diskID': '93acd9c6-9851-417e-b558-8d9026276e73', 'diskPath': '/var/lib/longhorn/', 'failedAt': '', 'hostId': 'ip-10-0-2-72', 'image': 'longhornio/longhorn-engine:v1.6.x-head', 'instanceManagerName': 'instance-manager-bcf7b0edbf6c14c34b67a3099635ceb3', 'mode': 'RW', 'name': 'vol-1-r-aa4677d7', 'running': True}], 'restoreInitiated': False, 'restoreRequired': False, 'restoreStatus': [{'backupURL': '', 'error': '', 'filename': '', 'isRestoring': False, 'lastRestored': '', 'progress': 0, 'replica': 'vol-1-r-332c7049', 'state': ''}, {'backupURL': '', 'error': '', 'filename': '', 'isRestoring': False, 'lastRestored': '', 'progress': 0, 'replica': 'vol-1-r-aa4677d7', 'state': ''}], 'restoreVolumeRecurringJob': 'ignored', 'revisionCounterDisabled': False, 'robustness': 'healthy', 'shareEndpoint': '', 'shareState': '', 'size': '1073741824', 'snapshotDataIntegrity': 'ignored', 'snapshotMaxCount': 250, 'snapshotMaxSize': '0', 'staleReplicaTimeout': 0, 'standby': False, 'state': 'attached', 'unmapMarkSnapChainRemoved': 'ignored', 'volumeAttachment': {'attachments': {'': {'attachmentID': '', 'attachmentType': 'longhorn-api', 'conditions': [{'lastProbeTime': '', 'lastTransitionTime': '2024-05-06T18:38:59Z', 'message': '', 'reason': '', 'status': 'True'}], 'nodeID': 'ip-10-0-2-237', 'parameters': {'disableFrontend': 'false', 'lastAttachedBy': ''}, 'satisfied': True}}, 'volume': 'vol-1'}}
common.py:1982: AssertionError
It's because the replica (instance manager) process to be crashed is not deterministic: it may or may not be on the volume-attached node. If it's not on the volume-attached node, the volume enters the attached/degraded state as expected, but if it's on the volume-attached node, the volume ends up in the unknown/healthy state instead of degraded.
So wait_for_volume_degraded is not guaranteed to succeed. One way to make the outcome deterministic is sketched below.
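A minimal sketch, assuming the same helpers and variables used in the test above; the selection logic is illustrative, not the agreed fix:

    # Crash vol-1's replica on node_x only when node_x is not the
    # volume-attached node; per the analysis above, crashing the replica
    # that shares a node with the engine leaves the volume in the
    # unknown/healthy state rather than degraded.
    volume1 = client.by_id_volume(volume1.name)
    attached_node = volume1.controllers[0].hostId
    replicas_to_crash = [r for r in volume1.replicas
                         if r.hostId == node_x and r.hostId != attached_node]
    if replicas_to_crash:
        crash_replica_processes(client, core_api, volume1.name,
                                replicas=replicas_to_crash,
                                wait_to_fail=True)
        volume1 = wait_for_volume_degraded(client, volume1.name)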
In the second failure, attaching the volume failed because the engine image is not deployed on a node:
client = <longhorn.Client object at 0x7f6734dff370>
core_api = <kubernetes.client.api.core_v1_api.CoreV1Api object at 0x7f6734e55c10>
set_random_backupstore = None
def test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume(client, core_api, set_random_backupstore):  # NOQA
    """
    Test DR, restoring, expanding volumes when the engine image DaemonSet
    is not fully deployed
    Prerequisite:
    Prepare the system for the test by calling the method
    prepare_engine_not_fully_deployed_environment to get a
    tainted node and a not fully deployed engine image.
    1. Create volume vol-1 with 2 replicas
    2. Attach vol-1 to node-2, write data and create backup
    3. Create a DR volume (vol-dr) of 2 replicas.
    4. Verify that 2 replicas are on node-2 and node-3 and the DR volume
       is attached to either node-2 or node-3.
       Let's say it is attached to node-x
    5. Taint node-x with the taint `key=value:NoSchedule`
    6. Delete the pod of the engine image DaemonSet on node-x. Now, the
       engine image is missing on node-1 and node-x
    7. Verify that vol-dr is auto-attached to node-y.
    8. Restore a volume named vol-rs from the backupstore with replica
       count set to 1
    9. Verify that the replica is on node-y and the volume is
       successfully restored.
    10. Wait for vol-rs to finish restoring
    11. Expand vol-rs.
    12. Verify that the expansion is ok
    13. Set `Replica Replenishment Wait Interval` setting to 600
    14. Crash the replica of vol-1 on node-x. Wait for the replica to fail
    15. In a 2-min retry verify that Longhorn doesn't create a new replica
        for vol-1 and doesn't reuse the failed replica on node-x
    """
    update_setting(client, common.SETTING_DEGRADED_AVAILABILITY, "false")
    tainted_node_id = \
        prepare_engine_not_fully_deployed_environment(client, core_api)
    # step 1
    volume1 = create_and_check_volume(client, "vol-1", num_of_replicas=2,
                                      size=str(1 * Gi))
    # node1: tainted node, node2: self-host node, node3: the last one
    nodes = client.list_node()
    for node in nodes:
        if node.id == get_self_host_id():
            node2 = node
        elif node.id != tainted_node_id and node.id != get_self_host_id():
            node3 = node
    # step 2
>   volume1 = volume1.attach(hostId=node2.id)
test_ha.py:2986:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
longhorn.py:262: in cb
    return self.action(_result, _link_name,
longhorn.py:457: in action
    return self._post_and_retry(url, *args, **kw)
longhorn.py:415: in _post_and_retry
    raise e
longhorn.py:409: in _post_and_retry
    return self._post(url, data=self._to_dict(*args, **kw))
longhorn.py:74: in wrapped
    return fn(*args, **kw)
longhorn.py:303: in _post
    self._error(r.text, r.status_code)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <longhorn.Client object at 0x7f6734dff370>
text = '{"actions":{},"code":"Internal Server Error","detail":"","links":{"self":"http://10.42.1.8:9500/v1/volumes/vol-1"},"m...east one of the the replicas\' nodes or the node that the volume is going to attach to","status":500,"type":"error"}\n'
status_code = 500
def _error(self, text, status_code):
>   raise ApiError(self._unmarshall(text), status_code)
E longhorn.ApiError: (ApiError(...), '500 : unable to attach volume vol-1 to ip-10-0-2-35: cannot attach volume vol-1 because the data engine image longhornio/longhorn-engine:v1.6.x-head is not deployed on at least one of the the replicas\' nodes or the node that the volume is going to attach to\n{\'code\': 500, \'detail\': \'\', \'message\': "unable to attach volume vol-1 to ip-10-0-2-35: cannot attach volume vol-1 because the data engine image longhornio/longhorn-engine:v1.6.x-head is not deployed on at least one of the the replicas\' nodes or the node that the volume is going to attach to", \'status\': 500}')
longhorn.py:283: ApiError
It's because prepare_engine_not_fully_deployed_environment returns once the engine image pods are running, but that doesn't mean the engine image has actually been deployed on those nodes.
An extra wait_for_deployed_engine_image_count should be added to wait until the engine image is deployed; a sketch follows.
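A minimal sketch of such a wait, assuming the engine image object returned by the Longhorn client exposes a dict-like nodeDeploymentMap; the helper name comes from this issue, the body is illustrative:

    import time

    RETRY_COUNTS = 150      # assumed, mirroring the defaults in common.py
    RETRY_INTERVAL = 0.5

    def wait_for_deployed_engine_image_count(client, image_name, expected_count):
        # Poll the engine image until its per-node deployment map reports
        # the expected number of nodes with the image fully deployed.
        deployed = 0
        for _ in range(RETRY_COUNTS):
            image = client.by_id_engine_image(image_name)
            node_map = dict(image.nodeDeploymentMap or {})
            deployed = sum(1 for ok in node_map.values() if ok)
            if deployed == expected_count:
                return image
            time.sleep(RETRY_INTERVAL)
        assert deployed == expected_count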
What's the test to develop? Please describe
Test case test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume is flaky. There are two types of errors:
1. https://ci.longhorn.io/job/public/job/v1.6.x/job/v1.6.x-longhorn-tests-sles-amd64/130/testReport/junit/tests/test_ha/test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume_s3_/
The replica (instance manager) process to be crashed is not deterministic. If it is not on the volume-attached node, the volume enters the attached/degraded state as expected; if it is on the volume-attached node, the volume reports unknown/healthy instead of degraded, so wait_for_volume_degraded is not guaranteed to succeed.
2. https://ci.longhorn.io/job/public/job/v1.6.x/job/v1.6.x-longhorn-tests-sles-amd64/131/testReport/junit/tests/test_ha/test_engine_image_not_fully_deployed_perform_dr_restoring_expanding_volume_nfs_/
prepare_engine_not_fully_deployed_environment returns once the engine image pods are running, but that does not mean the engine image has been deployed on those nodes. An extra wait_for_deployed_engine_image_count should be added to wait until the engine image is deployed.
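For context, the extra wait would land right after the environment preparation in the test, before any volume is created or attached. A sketch, assuming a 3-node cluster with one tainted node as in the test above:

    tainted_node_id = \
        prepare_engine_not_fully_deployed_environment(client, core_api)
    # Wait until the default engine image is reported as deployed on the
    # two untainted nodes before creating or attaching any volume.
    images = client.list_engine_image()
    default_img = next(img for img in images if img.default)
    wait_for_deployed_engine_image_count(client, default_img.name, 2)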