|
|
7ed5e3 |
From 419aa23f0c6338b01b644094c3af2c024470e9b2 Mon Sep 17 00:00:00 2001
|
|
|
7ed5e3 |
From: Nir Soffer <nsoffer@redhat.com>
|
|
|
7ed5e3 |
Date: Sat, 10 Jul 2021 02:35:27 +0300
|
|
|
7ed5e3 |
Subject: [PATCH] v2v: rhv-upload-plugin: Fix waiting for finalize
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
Waiting for image transfer finalize is complex. In the past we tried to
|
|
|
7ed5e3 |
simplify the process by waiting on the disk status, but turns out that
|
|
|
7ed5e3 |
due to the way oVirt lock the disk, this is not reliable.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
These is finalize success flow:
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
1. User asks to finalize the transfer
|
|
|
7ed5e3 |
2. oVirt sets transfer phase to FINALIZING_SUCCESS
|
|
|
7ed5e3 |
3. oVirt finalizes the transfer
|
|
|
7ed5e3 |
4. oVirt sets disk status to OK
|
|
|
7ed5e3 |
5. oVirt unlocks the disk and changes transfer phase to FINISHED_SUCCESS
|
|
|
7ed5e3 |
6. oVirt removes the transfer
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
In oVirt logs we can see that disk status changes to OK about 3 seconds
|
|
|
7ed5e3 |
before the disk is actually unlocked. This is a very old problem that is
|
|
|
7ed5e3 |
unlikely to be fixed.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
The only way to wait for transfer completion is to poll the transfer
|
|
|
7ed5e3 |
phase, but oVirt makes this hard by removing the transfer shortly after
|
|
|
7ed5e3 |
it completes, so we may not be able to get the FINISHED_SUCCESS phase.
|
|
|
7ed5e3 |
If the transfer was removed before we got one of the final phases, we
|
|
|
7ed5e3 |
need to check the disk status to understand the result of transfer.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
oVirt 4.4.7 made polling transfer phase easier by keeping the transfer
|
|
|
7ed5e3 |
after completion, but we need to support older versions so we must have
|
|
|
7ed5e3 |
generic code that work with any version.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
To make debugging easier, we log the transfer phase during polling. Here
|
|
|
7ed5e3 |
is a typical transfer log when finalizing transfer:
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
finalizing transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finalizing_success
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa is finished_success
|
|
|
7ed5e3 |
transfer 59e545f3-db1f-4a6b-90b1-80ac66572faa finalized in 5.153 seconds
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
|
|
|
7ed5e3 |
Fixes: https://bugzilla.redhat.com/1976024
|
|
|
7ed5e3 |
(cherry picked from commit 79702b28329d15a7485801ed7e915d486fcc0cf4)
|
|
|
7ed5e3 |
---
|
|
|
7ed5e3 |
v2v/rhv-upload-plugin.py | 98 ++++++++++++++++++++++++++++------------
|
|
|
7ed5e3 |
1 file changed, 69 insertions(+), 29 deletions(-)
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
diff --git a/v2v/rhv-upload-plugin.py b/v2v/rhv-upload-plugin.py
|
|
|
7ed5e3 |
index 7cd6dea6..61ade1a8 100644
|
|
|
7ed5e3 |
--- a/v2v/rhv-upload-plugin.py
|
|
|
7ed5e3 |
+++ b/v2v/rhv-upload-plugin.py
|
|
|
7ed5e3 |
@@ -601,17 +601,29 @@ def finalize_transfer(connection, transfer, disk_id):
|
|
|
7ed5e3 |
"""
|
|
|
7ed5e3 |
Finalize a transfer, making the transfer disk available.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- If finalizing succeeds, transfer's phase will change to FINISHED_SUCCESS
|
|
|
7ed5e3 |
- and the transer's disk status will change to OK. On errors, the transfer's
|
|
|
7ed5e3 |
- phase will change to FINISHED_FAILURE and the disk status will change to
|
|
|
7ed5e3 |
- ILLEGAL and it will be removed. In both cases the transfer entity will be
|
|
|
7ed5e3 |
- removed shortly after.
|
|
|
7ed5e3 |
+ If finalizing succeeds, the transfer's disk status will change to OK
|
|
|
7ed5e3 |
+ and transfer's phase will change to FINISHED_SUCCESS. Unfortunately,
|
|
|
7ed5e3 |
+ the disk status is modified before the transfer finishes, and oVirt
|
|
|
7ed5e3 |
+ may still hold a lock on the disk at this point.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- If oVirt fails to finalize the transfer, transfer's phase will change to
|
|
|
7ed5e3 |
- PAUSED_SYSTEM. In this case the disk's status will change to ILLEGAL and it
|
|
|
7ed5e3 |
- will not be removed.
|
|
|
7ed5e3 |
+ The only way to make sure that the disk is unlocked, is to wait
|
|
|
7ed5e3 |
+ until the transfer phase switches FINISHED_SUCCESS. Unfortunately
|
|
|
7ed5e3 |
+ oVirt makes this hard to use because the transfer is removed shortly
|
|
|
7ed5e3 |
+ after switching the phase to the final phase. However if the
|
|
|
7ed5e3 |
+ transfer was removed, we can be sure that the disk is not locked,
|
|
|
7ed5e3 |
+ since oVirt releases the locks before removing the transfer.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- For simplicity, we track only disk's status changes.
|
|
|
7ed5e3 |
+ On errors, the transfer's phase will change to FINISHED_FAILURE and
|
|
|
7ed5e3 |
+ the disk status will change to ILLEGAL and it will be removed. Again
|
|
|
7ed5e3 |
+ the transfer will be removed shortly after that.
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ If oVirt fails to finalize the transfer, transfer's phase will
|
|
|
7ed5e3 |
+ change to PAUSED_SYSTEM. In this case the disk's status will change
|
|
|
7ed5e3 |
+ to ILLEGAL and it will not be removed.
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ oVirt 4.4.7 made waiting for transfer easier by keeping transfers
|
|
|
7ed5e3 |
+ after they complete, but we must support older versions so we have
|
|
|
7ed5e3 |
+ generic code that work with any version.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
For more info see:
|
|
|
7ed5e3 |
- http://ovirt.github.io/ovirt-engine-api-model/4.4/#services/image_transfer
|
|
|
7ed5e3 |
@@ -626,34 +638,62 @@ def finalize_transfer(connection, transfer, disk_id):
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
transfer_service.finalize()
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- disk_service = (connection.system_service()
|
|
|
7ed5e3 |
- .disks_service()
|
|
|
7ed5e3 |
- .disk_service(disk_id))
|
|
|
7ed5e3 |
-
|
|
|
7ed5e3 |
while True:
|
|
|
7ed5e3 |
time.sleep(1)
|
|
|
7ed5e3 |
try:
|
|
|
7ed5e3 |
- disk = disk_service.get()
|
|
|
7ed5e3 |
+ transfer = transfer_service.get()
|
|
|
7ed5e3 |
except sdk.NotFoundError:
|
|
|
7ed5e3 |
- # Disk verification failed and the system removed the disk.
|
|
|
7ed5e3 |
- raise RuntimeError(
|
|
|
7ed5e3 |
- "transfer %s failed: disk %s was removed"
|
|
|
7ed5e3 |
- % (transfer.id, disk_id))
|
|
|
7ed5e3 |
+ # Transfer was removed (ovirt < 4.4.7). We need to check the
|
|
|
7ed5e3 |
+ # disk status to understand if the transfer was successful.
|
|
|
7ed5e3 |
+ # Due to the way oVirt does locking, we know that the disk
|
|
|
7ed5e3 |
+ # is unlocked at this point so we can check only once.
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- if disk.status == types.DiskStatus.ILLEGAL:
|
|
|
7ed5e3 |
- # Disk verification failed or transfer was paused by the system.
|
|
|
7ed5e3 |
- raise RuntimeError(
|
|
|
7ed5e3 |
- "transfer %s failed: disk is ILLEGAL" % transfer.id)
|
|
|
7ed5e3 |
+ debug("transfer %s was removed, checking disk %s status"
|
|
|
7ed5e3 |
+ % (transfer.id, disk_id))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ disk_service = (connection.system_service()
|
|
|
7ed5e3 |
+ .disks_service()
|
|
|
7ed5e3 |
+ .disk_service(disk_id))
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- if disk.status == types.DiskStatus.OK:
|
|
|
7ed5e3 |
- debug("transfer %s finalized in %.3f seconds"
|
|
|
7ed5e3 |
- % (transfer.id, time.time() - start))
|
|
|
7ed5e3 |
- break
|
|
|
7ed5e3 |
+ try:
|
|
|
7ed5e3 |
+ disk = disk_service.get()
|
|
|
7ed5e3 |
+ except sdk.NotFoundError:
|
|
|
7ed5e3 |
+ raise RuntimeError(
|
|
|
7ed5e3 |
+ "transfer %s failed: disk %s was removed"
|
|
|
7ed5e3 |
+ % (transfer.id, disk_id))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ debug("disk %s is %s" % (disk_id, disk.status))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ if disk.status == types.DiskStatus.OK:
|
|
|
7ed5e3 |
+ break
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
- if time.time() > start + timeout:
|
|
|
7ed5e3 |
raise RuntimeError(
|
|
|
7ed5e3 |
- "timed out waiting for transfer %s to finalize"
|
|
|
7ed5e3 |
- % transfer.id)
|
|
|
7ed5e3 |
+ "transfer %s failed: disk is %s" % (transfer.id, disk.status))
|
|
|
7ed5e3 |
+ else:
|
|
|
7ed5e3 |
+ # Transfer exists, check if it reached one of the final
|
|
|
7ed5e3 |
+ # phases, or we timed out.
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ debug("transfer %s is %s" % (transfer.id, transfer.phase))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ if transfer.phase == types.ImageTransferPhase.FINISHED_SUCCESS:
|
|
|
7ed5e3 |
+ break
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ if transfer.phase == types.ImageTransferPhase.FINISHED_FAILURE:
|
|
|
7ed5e3 |
+ raise RuntimeError(
|
|
|
7ed5e3 |
+ "transfer %s has failed" % (transfer.id,))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ if transfer.phase == types.ImageTransferPhase.PAUSED_SYSTEM:
|
|
|
7ed5e3 |
+ raise RuntimeError(
|
|
|
7ed5e3 |
+ "transfer %s was paused by system" % (transfer.id,))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ if time.time() > start + timeout:
|
|
|
7ed5e3 |
+ raise RuntimeError(
|
|
|
7ed5e3 |
+ "timed out waiting for transfer %s to finalize, "
|
|
|
7ed5e3 |
+ "transfer is %s"
|
|
|
7ed5e3 |
+ % (transfer.id, transfer.phase))
|
|
|
7ed5e3 |
+
|
|
|
7ed5e3 |
+ debug("transfer %s finalized in %.3f seconds"
|
|
|
7ed5e3 |
+ % (transfer.id, time.time() - start))
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
|
|
|
7ed5e3 |
def transfer_supports_format():
|
|
|
7ed5e3 |
--
|
|
|
7ed5e3 |
2.18.4
|
|
|
7ed5e3 |
|