[pve-devel] [PATCH qemu 2/2] PVE: Don't call job_cancel in coroutines

Tue Oct 27 15:57:06 CET 2020

On 10/27/20 3:17 PM, Wolfgang Bumiller wrote:
> On Thu, Oct 22, 2020 at 02:11:18PM +0200, Stefan Reiter wrote:
>> ...because it hangs on cancelling other jobs in the txn if you do.
>>
>> Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
>> ---
>>   pve-backup.c | 26 +++++++++++++++++++++++++-
>>   1 file changed, 25 insertions(+), 1 deletion(-)
>>
>> diff --git a/pve-backup.c b/pve-backup.c
>> index 9179754dcb..af2db0d4b9 100644
>> --- a/pve-backup.c
>> +++ b/pve-backup.c
>> @@ -82,6 +82,12 @@ typedef struct PVEBackupDevInfo {
>>       BlockJob *job;
>>   } PVEBackupDevInfo;
>>   
>> +typedef struct JobCancelData {
>> +    AioContext *ctx;
>> +    Coroutine *co;
>> +    Job *job;
>> +} JobCancelData;
>> +
>>   static void pvebackup_propagate_error(Error *err)
>>   {
>>       qemu_mutex_lock(&backup_state.stat.lock);
>> @@ -332,6 +338,18 @@ static void pvebackup_complete_cb(void *opaque, int ret)
>>       aio_co_enter(qemu_get_aio_context(), co);
>>   }
>>   
>> +/*
>> + * job_cancel(_sync) does not like to be called from coroutines, so defer to
>> + * main loop processing via a bottom half.
>> + */
>> +static void job_cancel_bh(void *opaque) {
>> +    JobCancelData *data = (JobCancelData*)opaque;
>> +    aio_context_acquire(data->job->aio_context);
>> +    job_cancel_sync(data->job);
>> +    aio_context_release(data->job->aio_context);
>> +    aio_co_schedule(data->ctx, data->co);
>> +}
>> +
>>   static void coroutine_fn pvebackup_co_cancel(void *opaque)
>>   {
>>       Error *cancel_err = NULL;
>> @@ -357,7 +375,13 @@ static void coroutine_fn pvebackup_co_cancel(void *opaque)
>>           NULL;
>>   
>>       if (cancel_job) {
>> -        job_cancel(&cancel_job->job, false);
>> +        JobCancelData data = {
>> +            .ctx = qemu_get_current_aio_context(),
>> +            .co = qemu_coroutine_self(),
>> +            .job = &cancel_job->job,
>> +        };
>> +        aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
>> +        qemu_coroutine_yield();
> 
> Don't we need some kind of synchronization here? The yield does not
> guarantee we don't run before the bh is run, or does it? Maybe a condvar
> to trigger the coro after the job cancel bh?
> 

No, it cannot race: the BH is scheduled in the same AioContext the coroutine
is currently running in (qemu_get_current_aio_context()), and BHs are only
dispatched from that context's event loop. The coroutine therefore blocks
execution of the BH until it yields.

See also the code and comment in aio_co_reschedule_self() in 'util/async.c'.

>>       }
>>   
>>       qemu_co_mutex_unlock(&backup_state.backup_mutex);
>> -- 
>> 2.20.1