[pmg-devel] [PATCH pmg-api 2/2] ruledb: content-type: add flags for source of matching
Dominik Csapak
d.csapak at proxmox.com
Mon Feb 17 14:03:55 CET 2025
code looks mostly fine (some comments inline)
but one higher level question:
does it really make sense to expose 3 settings for this?
e.g. wouldn't it be much simpler to have a 'default' and a 'strict' mode,
where 'strict' only matches on the file type detected from the content?
both filename & header can be set arbitrarily by the sender, so doing
filtering based on them seems dangerous?
Or is there some specific use case that makes it necessary
to specify all 3 sources separately ?
On 2/12/25 16:12, Stoiko Ivanov wrote:
> our current content-type matching is sensibly quite cautious in
> matching if any available information indicates a potential match:
> * mime-type detection based on file contents
> * mime-type detection based on file suffix
> * content-type header
>
> Sometimes this can lead to surprises (e.g. when a MUA sets the
> filetype of a pdf to application/octet-stream (the default type if no
> information is available)).
>
> This change gives users the option to rely only on some of the sources
> for matching.
>
> This is a fix for the initial request in #2691 and addresses the
> suggestion from Friedrich from:
> https://bugzilla.proxmox.com/show_bug.cgi?id=5618#c2
>
> inspired by the changes for disclaimer released with PMG 8.1:
> 51d1507 ("fix #2430: ruledb disclaimer: make separator configurable")
>
> Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
> ---
> src/PMG/RuleDB/ContentTypeFilter.pm | 91 +++++++++++++++++++++++++++--
> 1 file changed, 86 insertions(+), 5 deletions(-)
>
> diff --git a/src/PMG/RuleDB/ContentTypeFilter.pm b/src/PMG/RuleDB/ContentTypeFilter.pm
> index 0199311..0dafa64 100644
> --- a/src/PMG/RuleDB/ContentTypeFilter.pm
> +++ b/src/PMG/RuleDB/ContentTypeFilter.pm
> @@ -26,7 +26,7 @@ sub otype_text {
> }
>
> sub new {
> - my ($type, $fvalue, $ogroup) = @_;
> + my ($type, $fvalue, $ogroup, $header, $magic, $glob) = @_;
>
> my $class = ref($type) || $type;
>
> @@ -36,6 +36,9 @@ sub new {
> }
>
> my $self = $class->SUPER::new('content-type', $fvalue, $ogroup);
> + $self->{header} = $header;
> + $self->{magic} = $magic;
> + $self->{glob} = $glob;
>
> return $self;
> }
> @@ -52,9 +55,53 @@ sub load_attr {
> $obj->{field_value} = $nt;
> }
>
> + my $sth = $ruledb->{dbh}->prepare(
> + "SELECT * FROM Attribut WHERE Object_ID = ?");
> +
> + $sth->execute($id);
> +
> + $obj->{header} = $obj->{magic} = $obj->{glob} = 1;
> +
> + while (my $ref = $sth->fetchrow_hashref()) {
> + if ($ref->{name} =~ /^(header|magic|glob)$/) {
> + $obj->{$1} = $ref->{value};
> + }
> + }
> +
> + $sth->finish();
> +
> + $obj->{id} = $id;
> +
> + $obj->{digest} = Digest::SHA::sha1_hex(
> + $id, $value, $ogroup, $obj->{header} // 1, $obj->{magic} //1 , $obj->{glob} // 1);
missing whitespace: '//1' -> '// 1'
> +
> return $obj;
> }
>
> +sub save {
> + my ($self, $ruledb) = @_;
> +
> + if (defined($self->{id})) {
> + #update - clean old attribut entries
> + $ruledb->{dbh}->do(
> + "DELETE FROM Attribut WHERE Object_ID = ?",
> + undef, $self->{id});
> + }
> +
> + $self->{id} = $self->SUPER::save($ruledb);
> +
> + for my $prop (qw(header magic glob)) {
> + if (defined($self->{$prop})) {
> + $ruledb->{dbh}->do(
> + "INSERT INTO Attribut (Value, Name, Object_ID) VALUES (?, ?, ?) ".
> + "ON CONFLICT(Object_ID, Name) DO UPDATE SET Value = Excluded.Value ",
> + undef, $self->{$prop}, $prop, $self->{id});
> + }
> + }
> +
> + return $self->{id};
> +}
> +
> sub parse_entity {
> my ($self, $entity) = @_;
>
> @@ -78,11 +125,14 @@ sub parse_entity {
>
> my $glob_ct = $entity->{PMX_glob_ct};
>
> - if ($header_ct && $header_ct =~ m|$self->{field_value}|) {
> + my $check_header = !defined($self->{header}) || ${self}->{header};
> + my $check_magic = !defined($self->{magic}) || ${self}->{magic};
> + my $check_glob = !defined($self->{glob}) || ${self}->{glob};
IMHO this would be more readable using the same syntax as above:
my $check_header = $self->{header} // 1;
what do you think?
> + if ($header_ct && $check_header && $header_ct =~ m|$self->{field_value}|) {
> push @$res, $id;
> - } elsif ($magic_ct && $magic_ct =~ m|$self->{field_value}|) {
> + } elsif ($magic_ct && $check_magic && $magic_ct =~ m|$self->{field_value}|) {
> push @$res, $id;
> - } elsif ($glob_ct && $glob_ct =~ m|$self->{field_value}|) {
> + } elsif ($glob_ct && $check_glob && $glob_ct =~ m|$self->{field_value}|) {
> push @$res, $id;
> }
> }
> @@ -112,19 +162,50 @@ sub properties {
> pattern => '[0-9a-zA-Z\/\\\[\]\+\-\.\*\_]+',
> maxLength => 1024,
> },
> + header => {
> + description => "use content-type from mail-header for matching",
is that really the 'mail-header' or the "mime-part header" ?
(not sure how to express that in a way it's clear to all)
> + type => 'boolean',
> + optional => 1,
> + default => 1,
> + },
> + magic => {
> + description => "use content-type from scanning the content for matching",
> + type => 'boolean',
> + optional => 1,
> + default => 1,
> + },
> + glob => {
> + description => "use content-type based on file-name for matching",
> + type => 'boolean',
> + optional => 1,
> + default => 1,
> + },
> };
> }
>
> sub get {
> my ($self) = @_;
>
> - return { contenttype => $self->{field_value} };
> + return {
> + contenttype => $self->{field_value},
> + header => $self->{header},
> + magic => $self->{magic},
> + glob => $self->{glob},
> + };
> }
>
> sub update {
> my ($self, $param) = @_;
>
> $self->{field_value} = $param->{contenttype};
> +
> + for my $prop (qw(header magic glob)) {
> + if (defined($param->{$prop}) && $param->{$prop} == 0) {
> + $self->{$prop} = 0;
> + } else {
> + delete $self->{$prop};
> + }
> + }
> }
>
> 1;
More information about the pmg-devel
mailing list