[pve-devel] [PATCH qemu 2/2] PVE: Don't call job_cancel in coroutines

Wolfgang Bumiller w.bumiller at proxmox.com
Tue Oct 27 15:17:34 CET 2020


On Thu, Oct 22, 2020 at 02:11:18PM +0200, Stefan Reiter wrote:
> ...because it hangs on cancelling other jobs in the txn if you do.
> 
> Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
> ---
>  pve-backup.c | 26 +++++++++++++++++++++++++-
>  1 file changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/pve-backup.c b/pve-backup.c
> index 9179754dcb..af2db0d4b9 100644
> --- a/pve-backup.c
> +++ b/pve-backup.c
> @@ -82,6 +82,12 @@ typedef struct PVEBackupDevInfo {
>      BlockJob *job;
>  } PVEBackupDevInfo;
>  
> +typedef struct JobCancelData {
> +    AioContext *ctx;
> +    Coroutine *co;
> +    Job *job;
> +} JobCancelData;
> +
>  static void pvebackup_propagate_error(Error *err)
>  {
>      qemu_mutex_lock(&backup_state.stat.lock);
> @@ -332,6 +338,18 @@ static void pvebackup_complete_cb(void *opaque, int ret)
>      aio_co_enter(qemu_get_aio_context(), co);
>  }
>  
> +/*
> + * job_cancel(_sync) does not like to be called from coroutines, so defer to
> + * main loop processing via a bottom half.
> + */
> +static void job_cancel_bh(void *opaque) {
> +    JobCancelData *data = (JobCancelData*)opaque;
> +    aio_context_acquire(data->job->aio_context);
> +    job_cancel_sync(data->job);
> +    aio_context_release(data->job->aio_context);
> +    aio_co_schedule(data->ctx, data->co);
> +}
> +
>  static void coroutine_fn pvebackup_co_cancel(void *opaque)
>  {
>      Error *cancel_err = NULL;
> @@ -357,7 +375,13 @@ static void coroutine_fn pvebackup_co_cancel(void *opaque)
>          NULL;
>  
>      if (cancel_job) {
> -        job_cancel(&cancel_job->job, false);
> +        JobCancelData data = {
> +            .ctx = qemu_get_current_aio_context(),
> +            .co = qemu_coroutine_self(),
> +            .job = &cancel_job->job,
> +        };
> +        aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
> +        qemu_coroutine_yield();

Don't we need some kind of synchronization here? The yield alone does not
guarantee that this coroutine isn't resumed before the bottom half has run,
or does it? Maybe use a condvar to wake the coroutine only after the
job-cancel bottom half has finished?

>      }
>  
>      qemu_co_mutex_unlock(&backup_state.backup_mutex);
> -- 
> 2.20.1





More information about the pve-devel mailing list