[pmg-devel] [PATCH pmg-api 5/7] config: add spam option for extract_text
Stoiko Ivanov
s.ivanov at proxmox.com
Mon Mar 13 22:23:48 CET 2023
Add a spam option for toggling the configuration of the ExtractText
SpamAssassin plugin (see [0]).
The config is copied from the module itself. The informational headers
were not added, as I don't see much gain from them apart from verifying
that the plugin is working.
The external dependencies the plugin needs in order to work are added as
Recommends, since it is a valid configuration to not have them installed
and simply disable the option.
[0] https://metacpan.org/pod/Mail::SpamAssassin::Plugin::ExtractText
Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
debian/control | 9 ++++++++-
src/PMG/Config.pm | 6 ++++++
src/templates/v400.pre.in | 34 ++++++++++++++++++++++++++++++----
3 files changed, 44 insertions(+), 5 deletions(-)
diff --git a/debian/control b/debian/control
index 93ad72c..d2ed7da 100644
--- a/debian/control
+++ b/debian/control
@@ -98,7 +98,14 @@ Depends: apt (>= 2~),
ucf,
${misc:Depends},
${perl:Depends},
-Recommends: ifupdown2, proxmox-offline-mirror-helper
+Recommends: antiword,
+ docx2txt,
+ ifupdown2,
+ odt2txt,
+ poppler-utils,
+ proxmox-offline-mirror-helper,
+ tesseract-ocr,
+ unrtf
Suggests: zfsutils-linux
Description: Proxmox Mailgateway API Server Implementation
This implements a REST API to configure Proxmox Mailgateway.
diff --git a/src/PMG/Config.pm b/src/PMG/Config.pm
index 5dcffb7..699a622 100755
--- a/src/PMG/Config.pm
+++ b/src/PMG/Config.pm
@@ -211,6 +211,11 @@ sub properties {
minimum => 64,
default => 256*1024,
},
+ extract_text => {
+ description => "Extract text from attachments (doc, pdf, rtf, images) and scan for spam.",
+ type => 'boolean',
+ default => 0,
+ },
};
}
@@ -225,6 +230,7 @@ sub options {
bounce_score => { optional => 1 },
rbl_checks => { optional => 1 },
maxspamsize => { optional => 1 },
+ extract_text => { optional => 1 },
};
}
diff --git a/src/templates/v400.pre.in b/src/templates/v400.pre.in
index 052e73e..4d68d6c 100644
--- a/src/templates/v400.pre.in
+++ b/src/templates/v400.pre.in
@@ -16,11 +16,37 @@
# added to new files, named according to the release they're added in.
###########################################################################
+
+[% IF pmg.spam.extract_text %]
# ExtractText - Extract text from documents or images for matching
-#
-# Requires manual configuration, see plugin documentation.
-#
-# loadplugin Mail::SpamAssassin::Plugin::ExtractText
+# informational headers and hits not configured
+loadplugin Mail::SpamAssassin::Plugin::ExtractText
+
+ifplugin Mail::SpamAssassin::Plugin::ExtractText
+
+ extracttext_external pdftotext /usr/bin/pdftotext -nopgbrk -layout -enc UTF-8 {} -
+ extracttext_use pdftotext .pdf application/pdf
+
+ # http://docx2txt.sourceforge.net
+ extracttext_external docx2txt /usr/bin/docx2txt {} -
+ extracttext_use docx2txt .docx application/docx
+
+ extracttext_external antiword /usr/bin/antiword -t -w 0 -m UTF-8.txt {}
+ extracttext_use antiword .doc application/(?:vnd\.?)?ms-?word.*
+
+ extracttext_external unrtf /usr/bin/unrtf --nopict {}
+ extracttext_use unrtf .doc .rtf application/rtf text/rtf
+
+ extracttext_external odt2txt /usr/bin/odt2txt --encoding=UTF-8 {}
+ extracttext_use odt2txt .odt .ott application/.*?opendocument.*text
+ extracttext_use odt2txt .sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter
+
+ extracttext_external tesseract {OMP_THREAD_LIMIT=1} /usr/bin/tesseract -c page_separator= {} -
+ extracttext_use tesseract .jpg .png .bmp .tif .tiff image/(?:jpeg|png|x-ms-bmp|tiff)
+
+endif
+
+[% END %]
# DecodeShortUrl - Check for shortened URLs
#
--
2.30.2
More information about the pmg-devel
mailing list