[pve-devel] [PATCH access-control v2 2/2] add realmsync plugin for jobs and CRUD api for domainsync-jobs

Wed Dec 14 10:53:52 CET 2022

On 12/13/22 14:43, Thomas Lamprecht wrote:
> Some mostly API and slightly higher level or cosmetical comments, but also some issue with
> how you use the cfs domain locks at the end.
> 
> Also, a patch-wide s/realmsync/realm-sync/ would be great.

sure, no problem

> 
> Am 06/12/2022 um 12:06 schrieb Dominik Csapak:
>> to be able to define automated jobs that sync ldap/ad
>>
>> The jobs plugin contains special handling when no node is given, since
>> we only want it to run on a single node when that triggers. For that,
>> we save a statefile in /etc/pve/priv/jobs/ which contains the
>> node/time/upid of the node that runs the job. The first node that
>> is able to lock the realm (via cfs_lock_domain) "wins" and may
>> sync from the ldap.
>>
>> in case a specific node was selected, this is omitted and the Jobs
>> handling will not let it run on other nodes anyway
>>
>> the API part is our usual sectionconfig CRUD api, but specialized
>> for the specific type of job.
>>
>> Signed-off-by: Dominik Csapak <d.csapak at proxmox.com>
>> ---
>>   src/PVE/API2/AccessControl.pm               |   6 +
>>   src/PVE/API2/AccessControl/Job.pm           |  47 +++
>>   src/PVE/API2/AccessControl/Job/Makefile     |   6 +
>>   src/PVE/API2/AccessControl/Job/RealmSync.pm | 324 ++++++++++++++++++++
>>   src/PVE/API2/AccessControl/Makefile         |   9 +
>>   src/PVE/API2/Makefile                       |   4 +
>>   src/PVE/Jobs/Makefile                       |   6 +
>>   src/PVE/Jobs/RealmSync.pm                   | 193 ++++++++++++
>>   src/PVE/Makefile                            |   1 +
>>   9 files changed, 596 insertions(+)
>>   create mode 100644 src/PVE/API2/AccessControl/Job.pm
>>   create mode 100644 src/PVE/API2/AccessControl/Job/Makefile
>>   create mode 100644 src/PVE/API2/AccessControl/Job/RealmSync.pm
>>   create mode 100644 src/PVE/API2/AccessControl/Makefile
>>   create mode 100644 src/PVE/Jobs/Makefile
>>   create mode 100644 src/PVE/Jobs/RealmSync.pm
>>
>> diff --git a/src/PVE/API2/AccessControl.pm b/src/PVE/API2/AccessControl.pm
>> index 104e7e3..221ecb2 100644
>> --- a/src/PVE/API2/AccessControl.pm
>> +++ b/src/PVE/API2/AccessControl.pm
>> @@ -14,6 +14,7 @@ use PVE::DataCenterConfig;
>>   use PVE::RESTHandler;
>>   use PVE::AccessControl;
>>   use PVE::JSONSchema qw(get_standard_option);
>> +use PVE::API2::AccessControl::Job;
>>   use PVE::API2::Domains;
>>   use PVE::API2::User;
>>   use PVE::API2::Group;
>> @@ -57,6 +58,11 @@ __PACKAGE__->register_method ({
>>       path => 'domains',
>>   });
>>   
>> +__PACKAGE__->register_method ({
>> +    subclass => "PVE::API2::AccessControl::Job",
>> +    path => 'jobs',
>> +});
>> +
>>   __PACKAGE__->register_method ({
>>       subclass => "PVE::API2::OpenId",
>>       path => 'openid',
>> diff --git a/src/PVE/API2/AccessControl/Job.pm b/src/PVE/API2/AccessControl/Job.pm
>> new file mode 100644
>> index 0000000..cf2d0a9
>> --- /dev/null
>> +++ b/src/PVE/API2/AccessControl/Job.pm
>> @@ -0,0 +1,47 @@
>> +package PVE::API2::AccessControl::Job;
>> +
>> +use strict;
>> +use warnings;
>> +
>> +use PVE::RESTHandler;
>> +
>> +use PVE::API2::AccessControl::Job::RealmSync;
>> +
>> +use base qw(PVE::RESTHandler);
>> +
>> +__PACKAGE__->register_method ({
>> +    subclass => "PVE::API2::AccessControl::Job::RealmSync",
>> +    path => 'realm-sync',
>> +});
>> +
>> +__PACKAGE__->register_method ({
>> +    name => 'index',
>> +    path => '',
>> +    method => 'GET',
>> +    description => "Directory index.",
>> +    permissions => {
>> +	user => 'all',
>> +    },
>> +    parameters => {
>> +	additionalProperties => 0,
>> +	properties => {},
>> +    },
>> +    returns => {
>> +	type => 'array',
>> +	items => {
>> +	    type => "object",
>> +	    properties => {
>> +		subdir => { type => 'string' },
>> +	    },
>> +	},
>> +	links => [ { rel => 'child', href => "{subdir}" } ],
>> +    },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	return {
>> +	    subdir => 'realm-sync',
>> +	};
> 
> as from the return type schema this should be an array:
> 
> return [
>      { subdir => 'realm-sync' },
> ];

oops

> 
>> +    }});
>> +
>> +1;
>> diff --git a/src/PVE/API2/AccessControl/Job/Makefile b/src/PVE/API2/AccessControl/Job/Makefile
>> new file mode 100644
>> index 0000000..5feae4b
>> --- /dev/null
>> +++ b/src/PVE/API2/AccessControl/Job/Makefile
>> @@ -0,0 +1,6 @@
>> +API2_SOURCES= 		\
>> +	RealmSync.pm	\
>> +
>> +.PHONY: install
>> +install:
>> +	for i in ${API2_SOURCES}; do install -D -m 0644 $$i ${DESTDIR}${PERLDIR}/PVE/API2/AccessControl/Job/$$i; done
>> diff --git a/src/PVE/API2/AccessControl/Job/RealmSync.pm b/src/PVE/API2/AccessControl/Job/RealmSync.pm
>> new file mode 100644
>> index 0000000..f14bb2b
>> --- /dev/null
>> +++ b/src/PVE/API2/AccessControl/Job/RealmSync.pm
>> @@ -0,0 +1,324 @@
>> +package PVE::API2::AccessControl::Job::RealmSync;
>> +
>> +use strict;
>> +use warnings;
>> +
>> +use JSON;
>> +
>> +use PVE::Cluster qw (cfs_read_file cfs_write_file cfs_lock_file);
>> +use PVE::Exception qw(raise_param_exc);
>> +use PVE::JSONSchema qw(get_standard_option);
>> +use PVE::Job::Registry;
>> +use PVE::RESTHandler;
>> +use PVE::SafeSyslog;
>> +use PVE::Tools qw(extract_param);
>> +
>> +use PVE::AccessControl;
>> +use PVE::Auth::Plugin;
>> +use PVE::Jobs::RealmSync;
>> +
>> +
>> +use base qw(PVE::RESTHandler);
>> +
>> +my $get_cluster_last_run = sub {
>> +    my ($jobid) = @_;
>> +
>> +    my $fn = "/etc/pve/priv/jobs/realmsync-$jobid.json";
>> +    my $raw = PVE::Tools::file_get_contents($fn);
>> +    my $state = eval { decode_json($raw) };
>> +    die "json decode error on $fn: $@\n" if $@;
>> +
>> +    if (my $upid = $state->{upid}) {
>> +	if (my $decoded = PVE::Tools::upid_decode($upid)) {
>> +	    return $decoded->{starttime};
>> +	}
>> +    } else {
>> +	return $state->{time};
>> +    }
>> +
>> +    return undef;
>> +};
>> +
>> +__PACKAGE__->register_method ({
>> +    name => 'syncjob_index',
>> +    path => '',
>> +    method => 'GET',
>> +    description => "List configured realm-sync-jobs.",
>> +    permissions => {
>> +	check => ['perm', '/', ['Sys.Audit']],
>> +    },
>> +    parameters => {
>> +	additionalProperties => 0,
>> +	properties => {},
>> +    },
>> +    returns => {
>> +	type => 'array',
>> +	items => {
>> +	    type => "object",
>> +	    properties => {
>> +		id => {
>> +		    description => "The ID of the entry.",
>> +		    type => 'string'
>> +		},
>> +		enabled => {
>> +		    description => "If the job is enalbed or not.",
> 
> typo: enabled
> 
>> +		    type => 'boolean',
>> +		},
>> +		comment => {
>> +		    description => "A comment for the job.",
>> +		    type => 'string',
>> +		    optional => 1,
>> +		},
>> +		schedule => {
>> +		    description => "The configured sync schedule.",
>> +		    type => 'string',
>> +		    optional => 1,
> 
> do we want to allow a empty schedule if there's a enable boolean switch anyway?
> 
> Or is this for making the "only used to run manually" use case slightly less confusing
> to configure (as enabled=false + bogus schedule might be weird for some)?
> 
>> +		},
>> +		realm => {
>> +		    description => "The realm to sync.",
>> +		    type => 'string',
>> +		    optional => 1,
> 
> is this really optional?

no you're right, was too fast with c&p

> 
>> +		},
>> +		node => {
>> +		    description => "The node to run the job on, if any.",
> 
> what if this is empty? might want to communicate that over the description, either with
> extra sentence or something like "The node the job is limited too run on"
> 
> Albeit it's IMO a bit of a odd thing in general, as especially realm syncs will always
> have effects on the whole cluster, so why expose that and not just auto select one
> (e.h., jitter sleep + node lock + check if recently run); we can always add such things
> if there's an actual need for it.

sure makes sense

> 
>> +		    type => 'string',
>> +		    optional => 1,
>> +		},
>> +		scope => {
>> +		    description => "The Selection of what to sync.",
> 
> might want to define the enum here? or better said, should this be a (registered) format
> we can reuse here?
> 
>> +		    type => 'string',
>> +		    optional => 1,
>> +		},
>> +		'remove-vanished' => {
>> +		    description => "A list of things to remove when the entity vanished during a sync.",
> 
> same here
> 
>> +		    type => 'string',
>> +		    optional => 1,
>> +		},
>> +		'last-run' => {
>> +		    description => "UNIX epoch when the job last ran.",
> 
> "Last execution time of the job in seconds since the beginning of the UNIX epoch."
> 
>> +		    type => 'integer',
>> +		    optional => 1,
>> +		},
>> +		'next-run' => {
>> +		    description => "UNIX epoch when the job is planned to run next.",
> 
> "Next planned execution time of the job in seconds since the beginning of the UNIX epoch."
> 
>> +		    type => 'integer',
>> +		    optional => 1,
>> +		},
>> +	    },
>> +	},
>> +	links => [ { rel => 'child', href => "{id}" } ],
>> +    },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	my $rpcenv = PVE::RPCEnvironment::get();
>> +	my $user = $rpcenv->get_user();
>> +
>> +	my $jobs_data = cfs_read_file('jobs.cfg');
>> +	my $order = $jobs_data->{order};
>> +	my $jobs = $jobs_data->{ids};
>> +
>> +	my $res = [];
>> +	for my $jobid (sort { $order->{$a} <=> $order->{$b} } keys %$jobs) {
>> +	    my $job = $jobs->{$jobid};
>> +	    next if $job->{type} ne 'realmsync';
> 
> Should the "type" be "realm-sync" for consistency?
> 
>> +
>> +	    $job->{id} = $jobid;
>> +	    if (my $schedule = $job->{schedule}) {
>> +		$job->{'last-run'} = eval { $get_cluster_last_run->($jobid) };
>> +		my $last_run = $job->{'last-run'} // time(); # current time as fallback
>> +		my $calendar_event = Proxmox::RS::CalendarEvent->new($schedule);
>> +		my $next_run = $calendar_event->compute_next_event($last_run);
>> +		$job->{'next-run'} = $next_run if defined($next_run);
>> +	    }
>> +
>> +	    push @$res, $job;
>> +	}
>> +
>> +	return $res;
>> +    }});
>> +
>> +__PACKAGE__->register_method({
>> +    name => 'read_job',
>> +    path => '{id}',
>> +    method => 'GET',
>> +    description => "Read realm-sync job definition.",
>> +    permissions => {
>> +	check => ['perm', '/', ['Sys.Audit']],
>> +    },
>> +    parameters => {
>> +	additionalProperties => 0,
>> +	properties => {
>> +	    id => {
>> +		type => 'string',
>> +		format => 'pve-configid',
>> +	    },
>> +	},
>> +    },
>> +    returns => {
>> +	type => 'object',
>> +    },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	my $jobs = cfs_read_file('jobs.cfg');
>> +	my $id = $param->{id};
>> +	my $job = $jobs->{ids}->{$id};
>> +	return $job if $job && $job->{type} eq 'realmsync';
>> +
>> +	raise_param_exc({ id => "No such job '$id'" });
>> +
>> +    }});
>> +
>> +__PACKAGE__->register_method({
>> +    name => 'create_job',
>> +    path => '{id}',
>> +    method => 'POST',
>> +    protected => 1,
>> +    description => "Create new realm-sync job.",
>> +    permissions => {
>> +	description => "'Realm.AllocateUser' on '/access/realm/<realm>' and "
>> +	    ."'User.Modify' permissions to '/access/groups/'.",
>> +	check => [ 'and',
>> +	    ['perm', '/access/realm/{realm}', ['Realm.AllocateUser']],
>> +	    ['perm', '/access/groups', ['User.Modify']],
>> +	],
>> +    },
>> +    parameters => PVE::Jobs::RealmSync->createSchema(),
>> +    returns => { type => 'null' },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	my $id = extract_param($param, 'id');
>> +
>> +	cfs_lock_file('jobs.cfg', undef, sub {
>> +	    my $data = cfs_read_file('jobs.cfg');
>> +
>> +	    die "Job '$id' already exists\n"
>> +		if $data->{ids}->{$id};
>> +
>> +	    my $plugin = PVE::Job::Registry->lookup('realmsync');
>> +	    my $opts = $plugin->check_config($id, $param, 1, 1);
>> +
>> +	    my $realm = $opts->{realm};
>> +	    my $cfg = cfs_read_file('domains.cfg');
>> +
>> +	    raise_param_exc({ realm => "No such realm '$realm'" })
>> +		if !defined($cfg->{ids}->{$realm});
>> +
>> +	    my $realm_type = $cfg->{ids}->{$realm}->{type};
>> +	    raise_param_exc({ realm => "Only LDAP/AD realms can be synced." })
>> +		if $realm_type ne 'ldap' && $realm_type ne 'ad';
>> +
>> +	    $data->{ids}->{$id} = $opts;
>> +
>> +	    PVE::Jobs::create_job($id, 'realmsync');
>> +
>> +	    cfs_write_file('jobs.cfg', $data);
>> +	});
>> +	die "$@" if ($@);
>> +
>> +	return undef;
>> +    }});
>> +
>> +__PACKAGE__->register_method({
>> +    name => 'update_job',
>> +    path => '{id}',
>> +    method => 'PUT',
>> +    protected => 1,
>> +    description => "Update realm-sync job definition.",
>> +    permissions => {
>> +	description => "'Realm.AllocateUser' on '/access/realm/<realm>' and "
>> +	    ." 'User.Modify' permissions to '/access/groups/'.",
>> +	check => [ 'and',
>> +	    ['perm', '/access/realm/{realm}', ['Realm.AllocateUser']],
>> +	    ['perm', '/access/groups', ['User.Modify']],
>> +	],
>> +    },
>> +    parameters => PVE::Jobs::RealmSync->updateSchema(),
>> +    returns => { type => 'null' },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	my $id = extract_param($param, 'id');
>> +	my $delete = extract_param($param, 'delete');
>> +	if ($delete) {
>> +	    $delete = [PVE::Tools::split_list($delete)];
>> +	}
>> +
>> +	cfs_lock_file('jobs.cfg', undef, sub {
>> +	    my $jobs = cfs_read_file('jobs.cfg');
>> +
>> +	    die "no options specified\n" if !scalar(keys %$param);
>> +
>> +	    my $plugin = PVE::Job::Registry->lookup('realmsync');
>> +	    my $opts = $plugin->check_config($id, $param, 0, 1);
>> +
>> +	    my $job = $jobs->{ids}->{$id};
>> +	    die "no such realmsync job\n" if !$job || $job->{type} ne 'realmsync';
>> +
>> +	    my $options = $plugin->options();
>> +	    for my $k (@$delete) {
>> +		my $d = $options->{$k} || die "no such option '$k'\n";
>> +		die "unable to delete required option '$k'\n" if !$d->{optional};
>> +		die "unable to delete fixed option '$k'\n" if $d->{fixed};
>> +		die "cannot set and delete property '$k' at the same time!\n"
>> +			if defined($opts->{$k});
>> +		delete $job->{$k};
>> +	    }
> 
> this seems something that we have a few times already in some form, might want to add this
> as helper somehere, possibly to section config.

yes, i'll see that i add a helper there for that

> 
>> +
>> +	    $job->{$_} = $param->{$_} for keys $param->%*;
>> +
>> +	    cfs_write_file('jobs.cfg', $jobs);
>> +
>> +	    PVE::Jobs::detect_changed_runtime_props($id, 'realmsync', $job);
>> +
>> +	    return;
>> +	});
>> +	die "$@" if ($@);
>> +    }});
>> +
>> +
>> +__PACKAGE__->register_method({
>> +    name => 'delete_job',
>> +    path => '{id}',
>> +    method => 'DELETE',
>> +    description => "Delete realm-sync job definition.",
>> +    permissions => {
>> +	check => ['perm', '/', ['Sys.Modify']],
>> +    },
>> +    protected => 1,
>> +    parameters => {
>> +	additionalProperties => 0,
>> +	properties => {
>> +	    id => {
>> +		type => 'string',
>> +		format => 'pve-configid',
>> +	    },
>> +	},
>> +    },
>> +    returns => { type => 'null' },
>> +    code => sub {
>> +	my ($param) = @_;
>> +
>> +	my $id = $param->{id};
>> +
>> +	cfs_lock_file('jobs.cfg', undef, sub {
>> +	    my $jobs = cfs_read_file('jobs.cfg');
>> +
>> +	    if (!defined($jobs->{ids}->{$id}) || $jobs->{ids}->{$id}->{type} ne 'realmsync') {
>> +		raise_param_exc({ id => "No such job '$id'" });
>> +	    }
>> +	    delete $jobs->{ids}->{$id};
>> +
>> +	    PVE::Jobs::remove_job($id, 'realmsync');> +
>> +	    cfs_write_file('jobs.cfg', $jobs);
>> +	    unlink "/etc/pve/priv/jobs/realmsync-$id.json";
> 
> should be contained in a sub, either save_state($id, undef) or explicit delete_state one (names
> are just examples)
> 
>> +	});
>> +	die "$@" if $@;
>> +
>> +	return undef;
>> +    }});
>> +
>> +1;
>> diff --git a/src/PVE/API2/AccessControl/Makefile b/src/PVE/API2/AccessControl/Makefile
>> new file mode 100644
>> index 0000000..8d3fb58
>> --- /dev/null
>> +++ b/src/PVE/API2/AccessControl/Makefile
>> @@ -0,0 +1,9 @@
>> +API2_SOURCES= 	\
>> +	Job.pm	\
>> +
>> +SUBDIRS=Job
>> +
>> +.PHONY: install
>> +install:
>> +	for i in ${API2_SOURCES}; do install -D -m 0644 $$i ${DESTDIR}${PERLDIR}/PVE/API2/AccessControl/$$i; done
>> +	set -e && for i in ${SUBDIRS}; do ${MAKE} -C $$i $@; done
>> diff --git a/src/PVE/API2/Makefile b/src/PVE/API2/Makefile
>> index 2817f48..7d2be93 100644
>> --- a/src/PVE/API2/Makefile
>> +++ b/src/PVE/API2/Makefile
>> @@ -2,6 +2,7 @@
>>   API2_SOURCES= 		 	\
>>   	AccessControl.pm 	\
>>   	Domains.pm	 	\
>> +	RealmSync.pm	 	\
>>   	ACL.pm		 	\
>>   	Role.pm		 	\
>>   	Group.pm	 	\
>> @@ -9,6 +10,9 @@ API2_SOURCES= 		 	\
>>   	TFA.pm			\
>>   	OpenId.pm
>>   
>> +SUBDIRS=AccessControl
>> +
>>   .PHONY: install
>>   install:
>>   	for i in ${API2_SOURCES}; do install -D -m 0644 $$i ${DESTDIR}${PERLDIR}/PVE/API2/$$i; done
>> +	set -e && for i in ${SUBDIRS}; do ${MAKE} -C $$i $@; done
>> diff --git a/src/PVE/Jobs/Makefile b/src/PVE/Jobs/Makefile
>> new file mode 100644
>> index 0000000..9eed1b2
>> --- /dev/null
>> +++ b/src/PVE/Jobs/Makefile
>> @@ -0,0 +1,6 @@
>> +SOURCES=RealmSync.pm
>> +
>> +.PHONY: install
>> +install: ${SOURCES}
>> +	install -d -m 0755 ${DESTDIR}${PERLDIR}/PVE/Jobs
>> +	for i in ${SOURCES}; do install -D -m 0644 $$i ${DESTDIR}${PERLDIR}/PVE/Jobs/$$i; done
>> diff --git a/src/PVE/Jobs/RealmSync.pm b/src/PVE/Jobs/RealmSync.pm
>> new file mode 100644
>> index 0000000..e8a86e3
>> --- /dev/null
>> +++ b/src/PVE/Jobs/RealmSync.pm
>> @@ -0,0 +1,193 @@
>> +package PVE::Jobs::RealmSync;
>> +
>> +use strict;
>> +use warnings;
>> +
>> +use JSON;
>> +
>> +use PVE::JSONSchema qw(get_standard_option);
>> +use PVE::Cluster;
>> +use PVE::CalendarEvent;
>> +use PVE::Tools;
>> +
>> +use PVE::API2::Domains;
>> +
>> +use base qw(PVE::Job::Registry);
>> +
>> +sub type {
>> +    return 'realmsync';
>> +}
>> +
>> +my $props = get_standard_option('realm-sync-options', {
>> +    realm => get_standard_option('realm'),
>> +});
>> +
>> +sub properties {
>> +    return $props;
>> +}
>> +
>> +sub options {
>> +    my $options = {
>> +	enabled => { optional => 1 },
>> +	schedule => {},
>> +	comment => { optional => 1 },
>> +	node => { optional => 1 },
> 
> might want to drop that for now (as commented above in the api return schema)
> 
>> +	scope => {},
>> +    };
>> +    for my $opt (keys %$props) {
>> +	next if defined($options->{$opt});
>> +	if ($props->{$opt}->{optional}) {
>> +	    $options->{$opt} = { optional => 1 };
>> +	} else {
>> +	    $options->{$opt} = {};
>> +	}
>> +    }
>> +    $options->{realm}->{fixed} = 1;
>> +
>> +    return $options;
>> +}
>> +
>> +sub decode_value {
>> +    my ($class, $type, $key, $value) = @_;
>> +    return $value;
>> +}
>> +
>> +sub encode_value {
>> +    my ($class, $type, $key, $value) = @_;
>> +    return $value;
>> +}
>> +
>> +sub createSchema {
>> +    my ($class, $skip_type) = @_;
> 
> add extra newline here
> 
>> +    my $schema = $class->SUPER::createSchema($skip_type);
>> +
>> +    my $opts = $class->options();
>> +    for my $opt (keys $schema->{properties}->%*) {
>> +	next if defined($opts->{$opt}) || $opt eq 'id';
>> +	delete $schema->{properties}->{$opt};
>> +    }
>> +
>> +    # remove legacy props
> 
> bit confused, we never had sync jobs, so why should it matter?

we reuse the schema from the existing realm sync (where these
legacy options exist), but better would be to filter that out
in the 'options' sub, so that it never reaches it here

> 
>> +    delete $schema->{properties}->{full};
>> +    delete $schema->{properties}->{purge};
>> +
>> +    return $schema;
>> +}
>> +
>> +sub updateSchema {
>> +    my ($class, $skip_type) = @_;
>> +    my $schema = $class->SUPER::updateSchema($skip_type);
>> +
>> +    my $opts = $class->options();
>> +    for my $opt (keys $schema->{properties}->%*) {
>> +	next if defined($opts->{$opt});
>> +	next if $opt eq 'id' || $opt eq 'delete';
>> +	delete $schema->{properties}->{$opt};
>> +    }
>> +
>> +    # remove legacy props
> 
> same here
> 
>> +    delete $schema->{properties}->{full};
>> +    delete $schema->{properties}->{purge};
>> +
>> +    return $schema;
>> +}
>> +
>> +sub run {
>> +    my ($class, $conf, $id, $schedule) = @_;
>> +
>> +    my $node = $conf->{node};
>> +    for my $opt (keys %$conf) {
>> +	delete $conf->{$opt} if !defined($props->{$opt});
>> +    }
>> +
>> +    my $realm = $conf->{realm};
>> +    my $statedir = "/etc/pve/priv/jobs";
>> +    my $statefile = "$statedir/realmsync-$id.json";
> 
> s/realmsync/realm-sync/
> 
>> +
>> +    if (!defined($node)) {
>> +	# cluster synced
>> +	my $now = time();
>> +	my $nodename = PVE::INotify::nodename();
>> +
>> +	# check statefile in pmxcfs if we should start
>> +	my $shouldrun = PVE::Cluster::cfs_lock_domain($realm, undef, sub {
> 
> erm, using just the $realm as lock ID can go quite bad, e.g., if a realm is named
> "ha" it would interfere with the HA domain locks. Can we prefix this, or just use
> the general, non-realm specific "realm-sync"? Not much downside in practice and
> adding things like groups might be better, even if just for auditability, if not done
> in parallel anyway?
> 
> In anyway, a domain lock must never be (just) an $id.

ok sorry, i obviously misinterpreted the sub name, i thought it was for locking
'domains' (which we often use instead of 'realms') because it was
right below the 'cfs_lock_storage' sub in pve-cluster

sure, just using 'realm-sync' probably works (conflicting sync jobs must wait
for the the others, but that shouln't take too long)

> 
> 
>> +	    mkdir $statedir;
>> +	    my $raw = eval { PVE::Tools::file_get_contents($statefile) } // '';
> 
> 
> a non-node specific kv-store might be really nice, would avoid some DB/disk write
> churn (note that this specific one is adding that much extra churn, but in sum
> we start to have now quite a few places where it would help, but can me  moved to
> such a thing later on too)

i can look into that

> 
>> +	    my $members = PVE::Cluster::get_members();
>> +
>> +	    my $state = ($raw =~ m/^(\{.*\})$/) ? decode_json($1) : {};
>> +	    my $lastnode = $state->{node} // $nodename;
>> +	    my $lastupid = $state->{upid};
>> +	    my $lasttime = $state->{time};
> 
> nit: please stay consistent with variable name style (snake_case vs noneatall - prefer the
> former, at least if not for things where we used nostyle since pretty much always, e.g.,
> nodename, albeit just $node is often fitting for that anyway)
> 
>> +
>> +	    my $node_online = 0;
> 
> should this be $last_node_online?

yeah probably better

> 
>> +	    if ($lastnode ne $nodename) {
>> +		if (defined(my $member = $members->{$lastnode})) {
>> +		    $node_online = $member->{online};
>> +		}
>> +	    } else {
>> +		$node_online = 1;
>> +	    }
> 
> could be way shorter (untested though):
>   
> my $node_online = $last_node eq $nodename || ($members->{$last_node} // {})->{online};
> 
>> +
>> +	    if (defined($lastupid)) {
>> +		# first check if the next run is scheduled
>> +		if (my $parsed = PVE::Tools::upid_decode($lastupid, 1)) {
>> +		    my $calspec = PVE::CalendarEvent::parse_calendar_event($schedule);
> 
> same here w.r.t. variable naming style
> 
>> +		    my $next_sync = PVE::CalendarEvent::compute_next_event($calspec, $parsed->{starttime});
>> +		    return 0 if !defined($next_sync) || $now < $next_sync; # not yet its (next) turn
>> +		}
>> +		# check if still running and node is online
>> +		my $tasks = PVE::Cluster::get_tasklist();
>> +		for my $task (@$tasks) {
>> +		    next if $task->{upid} ne $lastupid;
>> +		    last if defined($task->{endtime}); # it's already finished
>> +		    last if !$node_online; # it's not finished but the node is offline
> 
> s/but/and/ ?

does not make really a difference in semantics here? but sure

> 
>> +		    return 0; # not finished and online
>> +		}
>> +	    } elsif (defined($lasttime) && ($lasttime+60) > $now && $node_online) {
>> +		# another node started in the last 60 seconds and is online
> 
> "another node started this job in the last 60 seconds and is still online"
> 
>> +		return 0;
>> +	    }
>> +
>> +	    # any of the following conditions should be true here:
>> +	    # * it was started on another node but that node is offline now
>> +	    # * it was started but either too long ago, or with an error
>> +	    # * the started task finished
>> +
>> +	    PVE::Tools::file_set_contents($statefile, encode_json({
>> +		node => $nodename,
>> +		time => $now,
>> +	    }));
> 
> I would slightly prefer to move this into a (just an example name) save_state method,
> even if really small it's IMO better to contain it there, same for getting the state above.
> 
>> +	    return 1;
>> +	});
>> +	die $@ if $@;
>> +
>> +	if ($shouldrun) {
>> +	    my $upid = eval { PVE::API2::Domains->sync($conf) };
>> +	    my $err = $@;
>> +	    PVE::Cluster::cfs_lock_domain($realm, undef, sub {
> 
> same here w.r.t. wrongly used domain locking
> 
>> +		if ($err && !$upid) {
>> +		    PVE::Tools::file_set_contents($statefile, encode_json({
>> +			node => $nodename,
>> +			time => $now,
>> +			error => $err,
>> +		    }));
>> +		    die "$err\n";
>> +		}
>> +
>> +		PVE::Tools::file_set_contents($statefile, encode_json({
>> +		    node => $nodename,
>> +		    upid => $upid,
>> +		}));
> 
> same here with state saving
> 
>> +	    });
>> +	    die $@ if $@;
>> +	    return $upid;
>> +	}
>> +
>> +	return "OK"; # all other cases should not run the sync on this node
> 
> possibly stupid question, but why return a string here?

thats currently the interface of the Jobs handler, was derived from
starting vzdump backups where the api returns that if the call
did not actually run when e.g. the vmid filter only selected
vms on a different node

we can change it there of course, but then we have to introduce some logic
there to filter out the OK on job run (not a problem, it just moves
the handling into the plugin)

> 
>> +    }
>> +
>> +    return PVE::API2::Domains->sync($conf);
>> +}
>> +
>> +1;
>> diff --git a/src/PVE/Makefile b/src/PVE/Makefile
>> index c839d8f..dfc5314 100644
>> --- a/src/PVE/Makefile
>> +++ b/src/PVE/Makefile
>> @@ -8,3 +8,4 @@ install:
>>   	install -D -m 0644 TokenConfig.pm ${DESTDIR}${PERLDIR}/PVE/TokenConfig.pm
>>   	make -C API2 install
>>   	make -C CLI install
>> +	make -C Jobs install
>