From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Stefan Reiter Date: Mon, 28 Sep 2020 13:40:51 +0200 Subject: [PATCH] PVE-Backup: Don't block on finishing and cleanup create_backup_jobs proxmox_backup_co_finish is already async, but previously we would wait for the coroutine using block_on_coroutine_fn(). Avoid this by scheduling pvebackup_co_complete_stream (and thus pvebackup_co_cleanup) as a real coroutine when calling from pvebackup_complete_cb. This is ok, since complete_stream uses the backup_mutex internally to synchronize, and other streams can happily continue writing in the meantime anyway. To accommodate, backup_mutex is converted to a CoMutex. This means converting every user to a coroutine. This is not just useful here, but will come in handy once this series[0] is merged, and QMP calls can be yield-able coroutines too. Then we can also finally get rid of block_on_coroutine_fn. Cases of aio_context_acquire/release from within what is now a coroutine are changed to aio_co_reschedule_self, which works since a running coroutine always holds the aio lock for the context it is running in. job_cancel_sync is called from a BH since it can't be run from a coroutine (uses AIO_WAIT_WHILE internally). Same thing for create_backup_jobs, which is converted to a BH too. To communicate the finishing state, a new property is introduced to query-backup: 'finishing'. A new state is explicitly not used, since that would break compatibility with older qemu-server versions. Also fix create_backup_jobs: No more weird bool returns, just the standard "errp" format used everywhere else too. With this, if backup_job_create fails, the error message is actually returned over QMP and can be shown to the user. To facilitate correct cleanup on such an error, we call create_backup_jobs as a bottom half directly from pvebackup_co_prepare. This additionally allows us to actually hold the backup_mutex during operation. 
Also add a job_cancel_sync before job_unref, since a job must be in STATUS_NULL to be deleted by unref, which could trigger an assert before. [0] https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg03515.html Signed-off-by: Stefan Reiter Signed-off-by: Thomas Lamprecht --- pve-backup.c | 217 ++++++++++++++++++++++++++++--------------- qapi/block-core.json | 5 +- 2 files changed, 144 insertions(+), 78 deletions(-) diff --git a/pve-backup.c b/pve-backup.c index 2e628d68e4..9c20ef3a5e 100644 --- a/pve-backup.c +++ b/pve-backup.c @@ -33,7 +33,9 @@ const char *PBS_BITMAP_NAME = "pbs-incremental-dirty-bitmap"; static struct PVEBackupState { struct { - // Everithing accessed from qmp_backup_query command is protected using lock + // Everything accessed from qmp_backup_query command is protected using + // this lock. Do NOT hold this lock for long times, as it is sometimes + // acquired from coroutines, and thus any wait time may block the guest. QemuMutex lock; Error *error; time_t start_time; @@ -47,20 +49,22 @@ static struct PVEBackupState { size_t reused; size_t zero_bytes; GList *bitmap_list; + bool finishing; + bool starting; } stat; int64_t speed; VmaWriter *vmaw; ProxmoxBackupHandle *pbs; GList *di_list; JobTxn *txn; - QemuMutex backup_mutex; + CoMutex backup_mutex; CoMutex dump_callback_mutex; } backup_state; static void pvebackup_init(void) { qemu_mutex_init(&backup_state.stat.lock); - qemu_mutex_init(&backup_state.backup_mutex); + qemu_co_mutex_init(&backup_state.backup_mutex); qemu_co_mutex_init(&backup_state.dump_callback_mutex); } @@ -72,6 +76,7 @@ typedef struct PVEBackupDevInfo { size_t size; uint64_t block_size; uint8_t dev_id; + int completed_ret; // INT_MAX if not completed char targetfile[PATH_MAX]; BdrvDirtyBitmap *bitmap; BlockDriverState *target; @@ -227,12 +232,12 @@ pvebackup_co_dump_vma_cb( } // assumes the caller holds backup_mutex -static void coroutine_fn pvebackup_co_cleanup(void *unused) +static void coroutine_fn 
pvebackup_co_cleanup(void) { assert(qemu_in_coroutine()); qemu_mutex_lock(&backup_state.stat.lock); - backup_state.stat.end_time = time(NULL); + backup_state.stat.finishing = true; qemu_mutex_unlock(&backup_state.stat.lock); if (backup_state.vmaw) { @@ -261,35 +266,29 @@ static void coroutine_fn pvebackup_co_cleanup(void *unused) g_list_free(backup_state.di_list); backup_state.di_list = NULL; + + qemu_mutex_lock(&backup_state.stat.lock); + backup_state.stat.end_time = time(NULL); + backup_state.stat.finishing = false; + qemu_mutex_unlock(&backup_state.stat.lock); } -// assumes the caller holds backup_mutex -static void coroutine_fn pvebackup_complete_stream(void *opaque) +static void coroutine_fn pvebackup_co_complete_stream(void *opaque) { PVEBackupDevInfo *di = opaque; + int ret = di->completed_ret; - bool error_or_canceled = pvebackup_error_or_canceled(); - - if (backup_state.vmaw) { - vma_writer_close_stream(backup_state.vmaw, di->dev_id); + qemu_mutex_lock(&backup_state.stat.lock); + bool starting = backup_state.stat.starting; + qemu_mutex_unlock(&backup_state.stat.lock); + if (starting) { + /* in 'starting' state, no tasks have been run yet, meaning we can (and + * must) skip all cleanup, as we don't know what has and hasn't been + * initialized yet. 
*/ + return; } - if (backup_state.pbs && !error_or_canceled) { - Error *local_err = NULL; - proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err); - if (local_err != NULL) { - pvebackup_propagate_error(local_err); - } - } -} - -static void pvebackup_complete_cb(void *opaque, int ret) -{ - assert(!qemu_in_coroutine()); - - PVEBackupDevInfo *di = opaque; - - qemu_mutex_lock(&backup_state.backup_mutex); + qemu_co_mutex_lock(&backup_state.backup_mutex); if (ret < 0) { Error *local_err = NULL; @@ -301,7 +300,19 @@ static void pvebackup_complete_cb(void *opaque, int ret) assert(di->target == NULL); - block_on_coroutine_fn(pvebackup_complete_stream, di); + bool error_or_canceled = pvebackup_error_or_canceled(); + + if (backup_state.vmaw) { + vma_writer_close_stream(backup_state.vmaw, di->dev_id); + } + + if (backup_state.pbs && !error_or_canceled) { + Error *local_err = NULL; + proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err); + if (local_err != NULL) { + pvebackup_propagate_error(local_err); + } + } // remove self from job list backup_state.di_list = g_list_remove(backup_state.di_list, di); @@ -310,21 +321,49 @@ static void pvebackup_complete_cb(void *opaque, int ret) /* call cleanup if we're the last job */ if (!g_list_first(backup_state.di_list)) { - block_on_coroutine_fn(pvebackup_co_cleanup, NULL); + pvebackup_co_cleanup(); } - qemu_mutex_unlock(&backup_state.backup_mutex); + qemu_co_mutex_unlock(&backup_state.backup_mutex); } -static void pvebackup_cancel(void) +static void pvebackup_complete_cb(void *opaque, int ret) { - assert(!qemu_in_coroutine()); + PVEBackupDevInfo *di = opaque; + di->completed_ret = ret; + /* + * Schedule stream cleanup in async coroutine. close_image and finish might + * take a while, so we can't block on them here. This way it also doesn't + * matter if we're already running in a coroutine or not. + * Note: di is a pointer to an entry in the global backup_state struct, so + * it stays valid. 
+ */ + Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, di); + aio_co_enter(qemu_get_aio_context(), co); +} + +/* + * job_cancel(_sync) does not like to be called from coroutines, so defer to + * main loop processing via a bottom half. + */ +static void job_cancel_bh(void *opaque) { + CoCtxData *data = (CoCtxData*)opaque; + Job *job = (Job*)data->data; + AioContext *job_ctx = job->aio_context; + aio_context_acquire(job_ctx); + job_cancel_sync(job); + aio_context_release(job_ctx); + aio_co_enter(data->ctx, data->co); +} + +static void coroutine_fn pvebackup_co_cancel(void *opaque) +{ Error *cancel_err = NULL; error_setg(&cancel_err, "backup canceled"); pvebackup_propagate_error(cancel_err); - qemu_mutex_lock(&backup_state.backup_mutex); + qemu_co_mutex_lock(&backup_state.backup_mutex); if (backup_state.vmaw) { /* make sure vma writer does not block anymore */ @@ -342,27 +381,22 @@ static void pvebackup_cancel(void) ((PVEBackupDevInfo *)bdi->data)->job : NULL; - /* ref the job before releasing the mutex, just to be safe */ if (cancel_job) { - job_ref(&cancel_job->job); + CoCtxData data = { + .ctx = qemu_get_current_aio_context(), + .co = qemu_coroutine_self(), + .data = &cancel_job->job, + }; + aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data); + qemu_coroutine_yield(); } - /* job_cancel_sync may enter the job, so we need to release the - * backup_mutex to avoid deadlock */ - qemu_mutex_unlock(&backup_state.backup_mutex); - - if (cancel_job) { - AioContext *aio_context = cancel_job->job.aio_context; - aio_context_acquire(aio_context); - job_cancel_sync(&cancel_job->job); - job_unref(&cancel_job->job); - aio_context_release(aio_context); - } + qemu_co_mutex_unlock(&backup_state.backup_mutex); } void qmp_backup_cancel(Error **errp) { - pvebackup_cancel(); + block_on_coroutine_fn(pvebackup_co_cancel, NULL); } // assumes the caller holds backup_mutex @@ -415,10 +449,18 @@ static int coroutine_fn pvebackup_co_add_config( goto out; } -static bool 
create_backup_jobs(void) { +/* + * backup_job_create can *not* be run from a coroutine (and requires an + * acquired AioContext), so this can't either. + * The caller is responsible that backup_mutex is held nonetheless. + */ +static void create_backup_jobs_bh(void *opaque) { assert(!qemu_in_coroutine()); + CoCtxData *data = (CoCtxData*)opaque; + Error **errp = (Error**)data->data; + Error *local_err = NULL; /* create job transaction to synchronize bitmap commit and cancel all @@ -454,24 +496,19 @@ static bool create_backup_jobs(void) { aio_context_release(aio_context); - if (!job || local_err != NULL) { - Error *create_job_err = NULL; - error_setg(&create_job_err, "backup_job_create failed: %s", + di->job = job; + + if (!job || local_err) { + error_setg(errp, "backup_job_create failed: %s", local_err ? error_get_pretty(local_err) : "null"); - - pvebackup_propagate_error(create_job_err); break; } - di->job = job; - bdrv_unref(di->target); di->target = NULL; } - bool errors = pvebackup_error_or_canceled(); - - if (errors) { + if (*errp) { l = backup_state.di_list; while (l) { PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data; @@ -483,12 +520,17 @@ static bool create_backup_jobs(void) { } if (di->job) { + AioContext *ctx = di->job->job.aio_context; + aio_context_acquire(ctx); + job_cancel_sync(&di->job->job); job_unref(&di->job->job); + aio_context_release(ctx); } } } - return errors; + /* return */ + aio_co_enter(data->ctx, data->co); } typedef struct QmpBackupTask { @@ -525,11 +567,12 @@ typedef struct QmpBackupTask { UuidInfo *result; } QmpBackupTask; -// assumes the caller holds backup_mutex static void coroutine_fn pvebackup_co_prepare(void *opaque) { assert(qemu_in_coroutine()); + qemu_co_mutex_lock(&backup_state.backup_mutex); + QmpBackupTask *task = opaque; task->result = NULL; // just to be sure @@ -550,8 +593,9 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) const char *firewall_name = "qemu-server.fw"; if (backup_state.di_list) { - 
error_set(task->errp, ERROR_CLASS_GENERIC_ERROR, + error_set(task->errp, ERROR_CLASS_GENERIC_ERROR, "previous backup not finished"); + qemu_co_mutex_unlock(&backup_state.backup_mutex); return; } @@ -618,6 +662,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) } di->size = size; total += size; + + di->completed_ret = INT_MAX; } uuid_generate(uuid); @@ -849,6 +895,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) backup_state.stat.dirty = total - backup_state.stat.reused; backup_state.stat.transferred = 0; backup_state.stat.zero_bytes = 0; + backup_state.stat.finishing = false; + backup_state.stat.starting = true; qemu_mutex_unlock(&backup_state.stat.lock); @@ -863,6 +911,33 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) uuid_info->UUID = uuid_str; task->result = uuid_info; + + /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep + * backup_mutex locked. This is fine, a CoMutex can be held across yield + * points, and we'll release it as soon as the BH reschedules us. 
+ */ + CoCtxData waker = { + .co = qemu_coroutine_self(), + .ctx = qemu_get_current_aio_context(), + .data = &local_err, + }; + aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker); + qemu_coroutine_yield(); + + if (local_err) { + error_propagate(task->errp, local_err); + goto err; + } + + qemu_co_mutex_unlock(&backup_state.backup_mutex); + + qemu_mutex_lock(&backup_state.stat.lock); + backup_state.stat.starting = false; + qemu_mutex_unlock(&backup_state.stat.lock); + + /* start the first job in the transaction */ + job_txn_start_seq(backup_state.txn); + return; err_mutex: @@ -885,6 +960,7 @@ err: g_free(di); } g_list_free(di_list); + backup_state.di_list = NULL; if (devs) { g_strfreev(devs); @@ -905,6 +981,8 @@ err: } task->result = NULL; + + qemu_co_mutex_unlock(&backup_state.backup_mutex); return; } @@ -958,24 +1036,8 @@ UuidInfo *qmp_backup( .errp = errp, }; - qemu_mutex_lock(&backup_state.backup_mutex); - block_on_coroutine_fn(pvebackup_co_prepare, &task); - if (*errp == NULL) { - bool errors = create_backup_jobs(); - qemu_mutex_unlock(&backup_state.backup_mutex); - - if (!errors) { - /* start the first job in the transaction - * note: this might directly enter the job, so we need to do this - * after unlocking the backup_mutex */ - job_txn_start_seq(backup_state.txn); - } - } else { - qemu_mutex_unlock(&backup_state.backup_mutex); - } - return task.result; } @@ -1027,6 +1089,7 @@ BackupStatus *qmp_query_backup(Error **errp) info->transferred = backup_state.stat.transferred; info->has_reused = true; info->reused = backup_state.stat.reused; + info->finishing = backup_state.stat.finishing; qemu_mutex_unlock(&backup_state.stat.lock); diff --git a/qapi/block-core.json b/qapi/block-core.json index 170c13984d..a0d1d278e9 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -729,12 +729,15 @@ # # @uuid: uuid for this backup job # +# @finishing: if status='active' and finishing=true, then the backup process is +# waiting for the target to 
finish. +# ## { 'struct': 'BackupStatus', 'data': {'*status': 'str', '*errmsg': 'str', '*total': 'int', '*dirty': 'int', '*transferred': 'int', '*zero-bytes': 'int', '*reused': 'int', '*start-time': 'int', '*end-time': 'int', - '*backup-file': 'str', '*uuid': 'str' } } + '*backup-file': 'str', '*uuid': 'str', 'finishing': 'bool' } } ## # @BackupFormat: