diff options
author | adamw <adamw@FreeBSD.org> | 2014-03-12 05:49:40 +0800 |
---|---|---|
committer | adamw <adamw@FreeBSD.org> | 2014-03-12 05:49:40 +0800 |
commit | 4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0 (patch) | |
tree | 7c7c408a52b143121035d7bda8f7d6d99104c54e /japanese/p5-Mail-SpamAssassin | |
parent | 5d2367ae581b0263527bd9fbaf6cfce5897cf270 (diff) | |
download | freebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.tar.gz freebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.tar.zst freebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.zip |
Move {mail,japanese}/p5-Mail-SpamAssassin to &/spamassassin, in the name of
improving accessibility.
I think people who want to just find the port/package and install it are
more likely to look for "spamassassin the program" than "spamassassin the
perl module collection."
Diffstat (limited to 'japanese/p5-Mail-SpamAssassin')
-rw-r--r-- | japanese/p5-Mail-SpamAssassin/Makefile | 35 | ||||
-rw-r--r-- | japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch | 1143 | ||||
-rw-r--r-- | japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist | 12 | ||||
-rw-r--r-- | japanese/p5-Mail-SpamAssassin/files/tokenizer.pre | 8 | ||||
-rw-r--r-- | japanese/p5-Mail-SpamAssassin/pkg-message | 11 |
5 files changed, 0 insertions, 1209 deletions
diff --git a/japanese/p5-Mail-SpamAssassin/Makefile b/japanese/p5-Mail-SpamAssassin/Makefile deleted file mode 100644 index 5c00e75e17ea..000000000000 --- a/japanese/p5-Mail-SpamAssassin/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -# Created by: TAOKA Fumiyoshi -# $FreeBSD$ - -PORTREVISION= 2 -CATEGORIES= japanese mail perl5 -PKGNAMEPREFIX= ja-p5- - -MAINTAINER= masaki@club.kyutech.ac.jp -COMMENT= SpamAssassin with patches to handle multibyte character - -LICENSE= APACHE20 - -MASTERDIR= ${.CURDIR}/../../mail/p5-Mail-SpamAssassin - -BUILD_DEPENDS= ja-p5-MeCab>=0.98:${PORTSDIR}/japanese/p5-MeCab - -CONFLICTS= p5-Mail-SpamAssassin-[0-9]* - -EXTRA_PATCHES= ${.CURDIR}/files/spamassassin-ja.patch - -PKGMESSAGE= ${.CURDIR}/pkg-message -PLIST= ${WRKDIR}/pkg-plist - -TOKENIZER_PRE= tokenizer.pre - -PLIST_SUB+= TOKENIZER_PRE=${TOKENIZER_PRE} - -pre-install: - @${CAT} ${EXTRA_PATCHES:S/.patch/.plist/} > ${PLIST} - @${CAT} ${PKGDIR}/pkg-plist >> ${PLIST} - -post-install:: - ${INSTALL_DATA} ${.CURDIR}/files/${TOKENIZER_PRE} ${STAGEDIR}${ETCDIR}/${TOKENIZER_PRE}.sample - -.include "${MASTERDIR}/Makefile" diff --git a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch b/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch deleted file mode 100644 index 3544abe3555d..000000000000 --- a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch +++ /dev/null @@ -1,1143 +0,0 @@ ---- lib/Mail/SpamAssassin/HTML.pm.orig 2014-02-07 17:36:28.000000000 +0900 -+++ lib/Mail/SpamAssassin/HTML.pm 2014-03-04 11:18:44.000000000 +0900 -@@ -86,7 +86,7 @@ - $ok_attributes{div}{$_} = 1 for qw( style ); - - sub new { -- my ($class) = @_; -+ my ($class, $opts) = @_; - my $self = $class->SUPER::new( - api_version => 3, - handlers => [ -@@ -99,6 +99,7 @@ - declaration => ["html_declaration", "self,text"], - ], - marked_sections => 1); -+ $self->{normalize} = $opts->{'normalize'} || 0; - - $self; - } -@@ -681,7 +682,14 @@ - } - } - else { -- $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; -+ if ($self->{normalize}) { -+ $text =~ s/\xc2\xa0/ /g; # no-break space -+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace -+ $text =~ s/[ \t\n\r\f\x0b]+/ /g; -+ } -+ else { -+ $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; -+ } - # trim leading whitespace if previous element was whitespace - # and current element is not invisible - if (@{ $self->{text} } && !$display{invisible} && ---- lib/Mail/SpamAssassin/Message/Node.pm.orig 2014-02-07 17:36:23.000000000 +0900 -+++ lib/Mail/SpamAssassin/Message/Node.pm 2014-03-04 11:22:38.000000000 +0900 -@@ -42,6 +42,7 @@ - use Mail::SpamAssassin::Constants qw(:sa); - use Mail::SpamAssassin::HTML; - use Mail::SpamAssassin::Logger; -+use Mail::SpamAssassin::Util::Charset; - - =item new() - -@@ -385,27 +386,10 @@ - - sub _normalize { - my ($self, $data, $charset) = @_; -- return $data unless $self->{normalize}; -+ return wantarray ? ($data, $charset) : $data unless $self->{normalize}; - -- my $detected = Encode::Detect::Detector::detect($data); -- -- my $converter; -- -- if ($charset && $charset !~ /^us-ascii$/i && -- ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) { -- dbg("message: Using labeled charset $charset"); -- $converter = Encode::find_encoding($charset); -- } -- -- $converter = Encode::find_encoding($detected) unless $converter || !defined($detected); -- -- return $data unless $converter; -- -- dbg("message: Converting..."); -- -- my $rv = $converter->decode($data, 0); -- utf8::downgrade($rv, 1); -- return $rv -+ my ($decoded_data, $detected_charset) = normalize_charset($data, $charset); -+ return wantarray ? ($decoded_data, $detected_charset) : $decoded_data; - } - - =item rendered() -@@ -428,8 +412,12 @@ - # text/x-aol is ignored here, but looks like text/html ... - return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i ); - -- my $text = $self->_normalize($self->decode(), $self->{charset}); -+ my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset}); - my $raw = length($text); -+ if ($self->{normalize}) { -+ $self->{charset} = $charset; -+ $self->{language} = get_language($text, $charset); -+ } - - # render text/html always, or any other text|text/plain part as text/html - # based on a heuristic which simulates a certain common mail client -@@ -439,7 +427,7 @@ - { - $self->{rendered_type} = 'text/html'; - -- my $html = Mail::SpamAssassin::HTML->new(); # object -+ my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}}); # object - $html->parse($text); # parse+render text - $self->{rendered} = $html->get_rendered_text(); - $self->{visible_rendered} = $html->get_rendered_text(invisible => 0); ---- lib/Mail/SpamAssassin/Message.pm.orig 2014-02-07 17:36:28.000000000 +0900 -+++ lib/Mail/SpamAssassin/Message.pm 2014-03-04 11:27:31.000000000 +0900 -@@ -604,6 +604,8 @@ - delete $self->{'pristine_headers'}; - delete $self->{'line_ending'}; - delete $self->{'missing_head_body_separator'}; -+ delete $self->{'charset'}; -+ delete $self->{'language'}; - - my @toclean = ( $self ); - -@@ -630,6 +632,8 @@ - delete $part->{'invisible_rendered'}; - delete $part->{'type'}; - delete $part->{'rendered_type'}; -+ delete $self->{'charset'}; -+ delete $self->{'language'}; - - # if there are children nodes, add them to the queue of nodes to clean up - if (exists $part->{'body_parts'}) { -@@ -1085,7 +1089,14 @@ - - # whitespace handling (warning: small changes have large effects!) - $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed -- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ if ($self->{normalize}) { -+ $text =~ s/\xc2\xa0/ /g; # no-break space => space -+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space -+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space -+ } -+ else { -+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ } - $text =~ tr/\f/\n/; # form feeds => newline - - # warn "message: $text"; -@@ -1142,7 +1153,14 @@ - - # whitespace handling (warning: small changes have large effects!) - $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed -- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ if ($self->{normalize}) { -+ $text =~ s/\xc2\xa0/ /g; # no-break space => space -+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space -+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space -+ } -+ else { -+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ } - $text =~ tr/\f/\n/; # form feeds => newline - - my @textary = split_into_array_of_short_lines ($text); -@@ -1193,7 +1211,14 @@ - - # whitespace handling (warning: small changes have large effects!) - $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed -- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ if ($self->{normalize}) { -+ $text =~ s/\xc2\xa0/ /g; # no-break space => space -+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space -+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space -+ } -+ else { -+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space -+ } - $text =~ tr/\f/\n/; # form feeds => newline - - my @textary = split_into_array_of_short_lines ($text); -@@ -1269,6 +1294,28 @@ - - # --------------------------------------------------------------------------- - -+sub get_language { -+ my ($self) = @_; -+ -+ if (defined $self->{language}) { return $self->{language}; } -+ my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1); -+ return '' unless @parts; -+ -+ # Go through each part -+ my @langs; -+ for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) { -+ my $p = $parts[$pt]; -+ my $lang = $p->{language}; -+ next unless ($lang); -+ push(@langs, $lang) unless (grep(/^$lang$/, @langs)) -+ } -+ $self->{language} = scalar(@langs) ? join(' ', @langs) : ''; -+ return $self->{language}; -+} -+ -+# --------------------------------------------------------------------------- -+ -+ - 1; - - =back ---- lib/Mail/SpamAssassin/PerMsgStatus.pm.orig 2014-02-07 17:36:28.000000000 +0900 -+++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2014-03-04 11:30:25.000000000 +0900 -@@ -53,6 +53,7 @@ - use warnings; - use re 'taint'; - -+use Encode; - use Errno qw(ENOENT); - use Time::HiRes qw(time); - -@@ -996,19 +997,41 @@ - - # the report charset - my $report_charset = "; charset=iso-8859-1"; -- if ($self->{conf}->{report_charset}) { -- $report_charset = "; charset=" . $self->{conf}->{report_charset}; -- } - - # the SpamAssassin report - my $report = $self->get_report(); -+ if ($self->{conf}->{report_charset}) { -+ $report_charset = "; charset=" . $self->{conf}->{report_charset}; -+ } - - # If there are any wide characters, need to MIME-encode in UTF-8 - # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then - # we could try converting to that charset if possible -- unless ($] < 5.008 || utf8::downgrade($report, 1)) { -+ my $is_utf8 = 0; -+ if ($self->{conf}->{normalize_charset}) { -+ $report = Encode::decode_utf8($report); -+ $is_utf8 = 1; -+ } -+ else { -+ if ($self->{msg}->{charset}) { -+ eval { -+ my $scratch = $report; -+ $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK); -+ $is_utf8 = 1; -+ }; -+ } -+ } -+ if ($is_utf8) { -+ $is_utf8 = 1; -+ eval { -+ my $scratch = $report; -+ $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK); -+ $is_utf8 = 0; -+ }; -+ if ($is_utf8) { -+ $report = Encode::encode_utf8($report); - $report_charset = "; charset=utf-8"; -- utf8::encode($report); -+ } - } - - # get original headers, "pristine" if we can do it ---- lib/Mail/SpamAssassin/Plugin/Bayes.pm.orig 2014-02-07 17:36:27.000000000 +0900 -+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2014-03-04 11:34:46.000000000 +0900 -@@ -223,6 +223,15 @@ - # will require a longer token than English ones.) - use constant MAX_TOKEN_LENGTH => 15; - -+# Skip if a token is too short. -+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?: -+ [\x00-\x7F] # 1 byte -+ | [\xC0-\xDF][\x80-\xBF] # 2 bytes -+ | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes -+ | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes -+ | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana -+)}x; -+ - ########################################################################### - - sub new { -@@ -1039,9 +1048,28 @@ - $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); - $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); - @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); -+ if ($self->{conf}->{normalize_charset}) { -+ my $tokenizer = $self->get_tokenizer($msg); -+ if (ref($tokenizer)) { -+ $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body}); -+ $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz}); -+ } -+ } - return $msgdata; - } - -+sub get_tokenizer { -+ my ($self, $msg) = @_; -+ -+ my $tokenizer; -+ my @languages = split(/\s+/, $msg->{msg}->get_language()); -+ foreach my $lang (@languages) { -+ $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang}; -+ last if (ref($tokenizer)); -+ } -+ return $tokenizer; -+} -+ - ########################################################################### - - # The calling functions expect a uniq'ed array of tokens ... -@@ -1095,7 +1123,7 @@ - # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, - # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. - # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" -- tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; -+ tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs; - - # DO split on "..." or "--" or "---"; common formatting error resulting in - # hapaxes. Keep the separator itself as a token, though, as long ones can -@@ -1124,6 +1152,11 @@ - # - next if ( defined $magic_re && $token =~ /$magic_re/ ); - -+ # Skip short UTF-8 tokens. -+ if ($self->{conf}->{normalize_charset}) { -+ next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o); -+ } -+ - # *do* keep 3-byte tokens; there's some solid signs in there - my $len = length($token); - -@@ -1152,14 +1185,16 @@ - # the domain ".net" appeared in the To header. - # - if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { -- if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { -- # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, -- # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan -- # to me! (jm) -- while ($token =~ s/^(..?)//) { -- push (@rettokens, "8:$1"); -- } -- next; -+ unless ($self->{conf}->{normalize_charset}) { -+ if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { -+ # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, -+ # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan -+ # to me! (jm) -+ while ($token =~ s/^(..?)//) { -+ push (@rettokens, "8:$1"); -+ } -+ next; -+ } - } - - if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) -diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm ---- /dev/null 1970-01-01 09:00:00.000000000 +0900 -+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2011-07-14 22:29:19.000000000 +0900 -@@ -0,0 +1,84 @@ -+# <@LICENSE> -+# Copyright 2004 Apache Software Foundation -+# -+# Licensed under the Apache License, Version 2.0 (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. -+# </@LICENSE> -+ -+=head1 NAME -+ -+Tokenizer::MeCab - Japanese tokenizer with MeCab -+ -+=head1 SYNOPSIS -+ -+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab -+ -+=head1 DESCRIPTION -+ -+This plugin tokenizes a Japanese string with MeCab that is -+the morphological analysis engine. -+ -+Text::MeCab 0.12 or over is required. -+ -+=cut -+ -+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab; -+ -+use strict; -+use warnings; -+use Mail::SpamAssassin::Plugin::Tokenizer; -+ -+use vars qw(@ISA); -+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); -+ -+# Have to do this so that RPM doesn't find these as required perl modules -+BEGIN { require MeCab; } -+our $language = 'ja'; -+our $mecab = new MeCab::Tagger(-Ochasen); -+ -+sub new { -+ my $class = shift; -+ my $mailsaobject = shift; -+ -+ $class = ref($class) || $class; -+ my $self = $class->SUPER::new($mailsaobject, $language); -+ bless ($self, $class); -+ -+ return $self; -+} -+ -+sub tokenize { -+ my $self = shift; -+ my $text_array = shift; -+ -+ my @tokenized_array; -+ foreach my $text (@$text_array) { -+ next unless ($text); -+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; -+ push(@tokenized_array, $text); -+ } -+ return \@tokenized_array; -+} -+ -+sub _tokenize { -+ my $text = shift; -+ -+ my @buf; -+ for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) { -+ push(@buf, $node->{surface}); -+ } -+ my $tokenized = join(' ', @buf) . ' '; -+ return $tokenized; -+} -+ -+1; -+ -diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm ---- /dev/null 1970-01-01 09:00:00.000000000 +0900 -+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2011-07-14 22:29:19.000000000 +0900 -@@ -0,0 +1,111 @@ -+# <@LICENSE> -+# Copyright 2004 Apache Software Foundation -+# -+# Licensed under the Apache License, Version 2.0 (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. -+# </@LICENSE> -+ -+=head1 NAME -+ -+Tokenizer::SimpleJA - simple Japanese tokenizer -+ -+=head1 SYNOPSIS -+ -+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA -+ -+=head1 DESCRIPTION -+ -+This plugin simply tokenizes a Japanese string by characters other than -+the alphabet, the Chinese character, and the katakana. -+ -+=cut -+ -+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; -+ -+use strict; -+use warnings; -+use Mail::SpamAssassin::Plugin::Tokenizer; -+ -+use vars qw(@ISA); -+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); -+ -+our $language = 'ja'; -+ -+our $RE = qr{( -+ # Hiragana -+ (?: -+ \xE3\x81[\x80-\xBF] -+ | \xE3\x82[\x80-\x9F] -+ )+ -+ # Katakana -+ | (?: -+ \xE3\x82[\xA0-\xBF] -+ | \xE3\x83[\x80-\xBF] -+ )+ -+ # Kanji -+ | (?: -+ \xE3[\x90-\xBF][\x80-\xBF] -+ | [\xE4-\xE9][\x80-\xBF]{2} -+ | \xEF[\xA4-\xAB][\x80-\xBF] -+ )+ -+ # Fullwidth -+ | (?: -+ \xEF\xBC[\x80-\xBF] -+ | \xEF\xBD[\x80-\x9F] -+ )+ -+ # Others -+ | [\xC0-\xDF][\x80-\xBF] -+ | [\xE0-\xE2][\x80-\xBF]{2} -+ | \xE3\x80[\x80-\xBF] -+ | \xE3[\x84-\x8F][\x80-\xBF] -+ | [\xEA-\xEE][\x80-\xBF]{2} -+ | \xEF[\x80-\xA3][\x80-\xBF] -+ | \xEF[\xAC-\xBB][\x80-\xBF] -+ | \xEF\xBD[\xA0-\xBF] -+ | \xEF[\xBE-\xBF][\x80-\xBF] -+ | [\xF0-\xF7][\x80-\xBF]{3} -+)}x; -+ -+sub new { -+ my $class = shift; -+ my $mailsaobject = shift; -+ -+ $class = ref($class) || $class; -+ my $self = $class->SUPER::new($mailsaobject, $language); -+ bless ($self, $class); -+ -+ return $self; -+} -+ -+sub tokenize { -+ my $self = shift; -+ my $text_array = shift; -+ -+ my @tokenized_array; -+ foreach my $text (@$text_array) { -+ next unless ($text); -+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg; -+ push(@tokenized_array, $text); -+ } -+ return \@tokenized_array; -+} -+ -+sub _tokenize { -+ my $text = shift; -+ -+ $text =~ s/$RE/$1 /og; -+ $text = ' ' . $text; -+ return $text; -+} -+ -+1; -+ -diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer.pm ---- /dev/null 1970-01-01 09:00:00.000000000 +0900 -+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2011-07-14 22:35:46.000000000 +0900 -@@ -0,0 +1,115 @@ -+# <@LICENSE> -+# Copyright 2004 Apache Software Foundation -+# -+# Licensed under the Apache License, Version 2.0 (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. -+# </@LICENSE> -+ -+=head1 NAME -+ -+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class -+ -+=head1 SYNOPSIS -+ -+=head2 SpamAssassin configuration: -+ -+ loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm -+ -+=head2 Perl code: -+ -+ use Mail::SpamAssassin::Plugin::Tokenizer; -+ use vars qw(@ISA); -+ @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); -+ # language to use this plugin -+ our $language = 'ja'; -+ -+ # constructor: register language -+ sub new { -+ my $class = shift; -+ my $mailsaobject = shift; -+ -+ # some boilerplate... -+ $class = ref($class) || $class; -+ my $self = $class->SUPER::new($mailsaobject, $language); -+ bless ($self, $class); -+ -+ return $self; -+ } -+ -+ # tokenize function -+ sub tokenize { -+ my $self = shift; -+ my $text_array_ref = shift; -+ -+ ...... -+ -+ return $tokenized_array_ref; -+ } -+ -+ -+=head1 DESCRIPTION -+ -+This plugin is the base class of tokenizer plugin. -+You must define tokenize() and $language -+ -+=head1 INTERFACE -+ -+ sub tokenize { -+ my $self = shift; -+ my $text_array_ref = shift; -+ -+ ...... -+ -+ return $tokenized_array_ref; -+ } -+ -+=cut -+ -+package Mail::SpamAssassin::Plugin::Tokenizer; -+ -+use Mail::SpamAssassin::Plugin; -+use Mail::SpamAssassin::Logger; -+use strict; -+use warnings; -+use bytes; -+ -+use vars qw(@ISA); -+@ISA = qw(Mail::SpamAssassin::Plugin); -+ -+sub new { -+ my $class = shift; -+ my $mailsaobject = shift; -+ my $language = shift; -+ -+ # some boilerplate... -+ $class = ref($class) || $class; -+ my $self = $class->SUPER::new($mailsaobject); -+ bless ($self, $class); -+ -+ if ($language) { -+ $self->{main}->{conf}->{tokenizer}->{$language} = $self; -+ } -+ else { -+ dbg("plugin: $self: \$language is not defined"); -+ } -+ -+ return $self; -+} -+ -+sub tokenize { -+ my ($self, $ref) = @_; -+ -+ return $ref; -+} -+ -+1; -+ -diff -uNr /dev/null lib/Mail/SpamAssassin/Util/Charset.pm ---- /dev/null 1970-01-01 09:00:00.000000000 +0900 -+++ lib/Mail/SpamAssassin/Util/Charset.pm 2011-07-14 22:29:19.000000000 +0900 -@@ -0,0 +1,471 @@ -+# <@LICENSE> -+# Copyright 2006 Apache Software Foundation -+# -+# Licensed under the Apache License, Version 2.0 (the "License"); -+# you may not use this file except in compliance with the License. -+# You may obtain a copy of the License at -+# -+# http://www.apache.org/licenses/LICENSE-2.0 -+# -+# Unless required by applicable law or agreed to in writing, software -+# distributed under the License is distributed on an "AS IS" BASIS, -+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+# See the License for the specific language governing permissions and -+# limitations under the License. -+# </@LICENSE> -+ -+ -+=head1 NAME -+ -+ Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language -+ -+=head1 SYNOPSIS -+ -+ my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset); -+ my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset); -+ -+=head1 DESCRIPTION -+ -+This module implements utility methods for charset and language. -+ -+=cut -+ -+package Mail::SpamAssassin::Util::Charset; -+ -+use strict; -+use warnings; -+use Encode; -+use Encode::Guess; -+use Encode::Alias; -+ -+use vars qw ( -+ @ISA @EXPORT -+); -+ -+require Exporter; -+ -+@ISA = qw(Exporter); -+@EXPORT = qw(normalize_charset get_language); -+ -+########################################################################### -+ -+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; -+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; -+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; }; -+ -+########################################################################### -+ -+our $KANA_HAN_RE = qr{ -+ # Hiragana and Katakana -+ \xE3[\x81-\x83][\x80-\xBF] -+ # Han -+ | \xE3[\x90-\xBF][\x80-\xBF] -+ | [\xE4-\xE9][\x80-\xBF]{2} -+ | \xEF[\xA4-\xAB][\x80-\xBF] -+}x; -+ -+our %enc2lang; -+our %lang2enc; -+our %scr2lang; -+our %cjkscr2lang; -+our @scrorder; -+ -+BEGIN { -+ -+ # See the following URL about this map: -+ # http://czyborra.com/charsets/iso8859.html -+ # http://czyborra.com/charsets/codepages.html -+ # http://czyborra.com/charsets/cyrillic.html -+ # http://en.wikipedia.org/wiki/ISO_8859 -+ # http://www.w3.org/International/O-charset-lang.html -+ %enc2lang = ( -+ # buint-in Encodings and Encode::Byte -+ # N. America -+ 'ascii' => 'en', -+ 'cp437' => 'en', -+ 'cp863' => 'weurope', -+ -+ # W. Europe (Latin1, Latin9) -+ # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af -+ 'iso-8859-1' => 'weurope', -+ 'iso-8859-15' => 'weurope', -+ 'cp850' => 'weurope', -+ 'cp860' => 'weurope', -+ 'cp1252' => 'weurope', -+ 'MacRoman' => 'weurope', -+ -+ # Cntrl. Europe / Latin2 / Latin10 -+ # hr cs hu pl sr sk sl -+ 'iso-8859-2' => 'ceurope', -+ 'cp852' => 'ceurope', -+ 'cp1250' => 'ceurope', -+ 'MacCentralEurRoman' => 'ceurope', -+ 'MacCroatian' => 'ceurope', -+ 'iso-8859-16' => 'ceurope', -+ 'MacRomanian' => 'ceurope', -+ -+ # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) -+ # eo mt -+ 'iso-8859-3' => 'seurope', -+ -+ # Baltics (Latin4, Latin7) -+ # lv lt -+ 'iso-8859-4' => 'neurope', -+ 'iso-8859-13' => 'baltic', -+ 'cp1257' => 'baltic', -+ -+ # Nordics (Latin6) -+ # et kl iu se -+ 'iso-8859-10' => 'nordic', -+ -+ # Cyrillics -+ # bg be uk sr mk ru -+ 'iso-8859-5' => 'ru', -+ 'cp855' => 'ru', -+ 'cp1251' => 'ru', -+ 'cp866' => 'ru', -+ 'MacCyrillic' => 'ru', -+ 'koi8-r' => 'ru', -+ 'MacUkrainian' => 'uk', -+ 'koi8-u' => 'uk', -+ -+ # Arabic -+ 'iso-8859-6' => 'ar', -+ 'cp864' => 'ar', -+ 'cp1256' => 'ar', -+ 'MacArabic' => 'ar', -+ 'cp1006' => 'fa', -+ 'MacFarsi' => 'fa', -+ -+ # Greek -+ 'iso-8859-7' => 'el', -+ 'cp1253' => 'el', -+ 'MacGreek' => 'el', -+ -+ # Hebrew -+ # he yi -+ 'iso-8859-8' => 'he', -+ 'cp862' => 'he', -+ 'cp1255' => 'he', -+ 'MacHebrew' => 'he', -+ -+ # Turkish -+ 'iso-8859-9' => 'tr', -+ 'cp857' => 'tr', -+ 'cp1254' => 'tr', -+ 'MacTurkish' => 'tr', -+ -+ # Thai -+ 'iso-8859-11' => 'th', -+ 'cp874' => 'th', -+ -+ # Celtics (Latin8) -+ # gd cy br -+ 'iso-8859-14' => 'celtic', -+ -+ # Vietnamese -+ 'viscii' => 'vi', -+ 'cp1258' => 'vi', -+ -+ # Encode::CN -+ 'euc-cn' => 'zh', -+ 'cp936' => 'zh', -+ 'hz' => 'zh', -+ -+ # Encode::TW -+ 'big5-eten' => 'zh', -+ 'big5-hkscs' => 'zh', -+ 'cp950' => 'zh', -+ -+ # Encode::JP -+ 'euc-jp' => 'ja', -+ 'shiftjis' => 'ja', -+ '7bit-jis' => 'ja', -+ 'iso-2022-jp' => 'ja', -+ 'iso-2022-jp-1' => 'ja', -+ 'cp932' => 'ja', -+ -+ # Encode::KR -+ 'euc-kr' => 'ko', -+ 'cp949' => 'ko', -+ 'johab' => 'ko', -+ 'iso-2022-kr' => 'ko', -+ -+ # Encode::HanExtra -+ 'euc-tw' => 'zh', -+ 'gb18030' => 'zh', -+ -+ # Encode::JIS2K -+ 'euc-jisx0213' => 'ja', -+ 'shiftjisx0123' => 'ja', -+ 'iso-2022-jp-3' => 'ja', -+ -+ # Encode::EUCJPMS -+ 'eucJP-ms' => 'ja', -+ 'cp51932' => 'ja', -+ 'cp50220' => 'ja', -+ 'cp50221' => 'ja', -+ -+ ); -+ -+ %lang2enc = ( -+ # Latin1 -+ 'en' => ['ascii'], -+ 'weurope' => ['cp1252'], -+ -+ # Latin2 -+ 'ceurope' => ['cp1250'], -+ -+ # Latin3 -+ 'seurope' => ['iso-8859-3'], -+ -+ # Latin4 -+ 'neurope' => ['iso-8859-4'], -+ -+ # Latin5 -+ 'tr' => ['cp1254'], -+ -+ # Latin6 -+ 'nordic' => ['iso-8859-10'], -+ -+ # Latin7 -+ 'baltic' => ['cp1257'], -+ -+ # Latin8 -+ 'celtic' => ['iso-8859-14'], -+ -+ # Non Latin -+ 'ru' => ['koi8-r', 'cp1251'], -+ 'uk' => ['koi8-u'], -+ -+ 'ar' => ['cp1256'], -+ 'el' => ['cp1253'], -+ 'he' => ['cp1255'], -+ 'th' => ['cp874'], -+ 'vi' => ['viscii', 'cp1258'], -+ 'zh' => ['euc-cn', 'cp950'], -+ 'ja' => ['euc-jp', 'cp932'], -+ 'ko' => ['euc-kr', 'cp949'], -+ -+ ); -+ -+ %scr2lang = ( -+ 'InLatin1Supplement' => ['weurope'], -+ 'InLatinExtendedA' => [ -+ 'ceurope', -+ 'seurope', -+ 'tr', -+ 'vi' -+ ], -+ 'InLatinExtendedB' => [ -+ 'nordic', -+ 'baltic', -+ 'celtic' -+ ], -+ 'Thai' => ['th'], -+ 'Cyrillic' => ['ru', 'uk'], -+ 'Arabic' => ['ar'], -+ 'Greek' => ['el'], -+ 'Hebrew' => ['he'], -+ ); -+ -+ # better detection for CJK -+ @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); -+ %cjkscr2lang = ( -+ 'Hiragana' => ['ja'], -+ 'Katakana' => ['ja'], -+ 'Hangul' => ['ko'], -+ 'Han' => ['zh', 'ja', 'ko'], -+ ); -+ -+ unless (HAS_ENCODE_HANEXTRA) { -+ Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); -+ } -+ Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' ); -+ Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); -+ Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); -+ Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' ); -+ if (HAS_ENCODE_EUCJPMS) { -+ Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' ); -+ } -+} -+ -+sub get_language { -+ my $str = shift; # $str must be UTF-8 encoding -+ my $charset = shift; -+ -+ return 'en' unless $charset; -+ if ($charset !~ /^utf/i) { -+ return $enc2lang{$charset}; -+ } elsif (defined($str)) { -+ $str =~ s/[\x00-\x7F]//g; # remove ASCII characters -+ return 'en' if ($str eq ''); -+ -+ my %handled; -+ $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); -+ foreach my $scr (@scrorder) { -+ next if ($str !~ /\p{$scr}/); -+ my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; -+ foreach my $lang (@$scrlangs) { -+ next if (exists($handled{$lang})); -+ foreach my $enc (@{$lang2enc{$lang}}) { -+ my $scratch = $str; -+ Encode::encode($enc, $scratch, Encode::FB_QUIET); -+ return $lang if ($scratch eq ''); -+ } -+ $handled{$lang} = 1; -+ } -+ } -+ } -+ return 'en'; -+} -+ -+# TEST 1: try conversion to use the specified charset. -+# TEST 2: try conversion to use Encode::Detect. -+# TEST 3: try conversion to use Encode::Guess. -+sub normalize_charset { -+ my $str = shift; -+ my $charset = shift; -+ -+ return wantarray ? ($str, 'ascii') : $str unless ($str); -+ -+ my $decoded; -+ my $detected; -+ -+ if ($charset) { -+ ($decoded, $detected) = _specified_encoding($str, $charset); -+ } -+ unless ($detected) { -+ ($decoded, $detected) = _encode_detect($str); -+ } -+ unless ($detected) { -+ ($decoded, $detected) = _encode_guess($str); -+ } -+ unless ($detected) { -+ return ($str, undef); -+ } -+ $decoded =~ s/^\x{feff}//g; -+ $decoded = Encode::encode_utf8($decoded); -+ -+ # unfold hiragana, katakana and han -+ if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) { -+ $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og; -+ } -+ return wantarray ? ($decoded, $detected) : $decoded; -+} -+ -+sub _specified_encoding { -+ my $str = shift; -+ my $encoding = shift; -+ -+ my $detected; -+ my $decoded; -+ -+ return (undef, undef) unless ($encoding); -+ -+ # note: ISO-2022-* is not deistinguish from US-ASCII -+ return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i); -+ -+ # UTF-16|32 encoding without BOM cannot be trusted. -+ return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); -+ return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); -+ -+ #$encoding = _get_alias($encoding); -+ my $encoder = Encode::find_encoding($encoding); -+ if (ref($encoder)) { -+ $decoded = $encoder->decode($str,Encode::FB_QUIET); -+ $detected = $encoder->name if ($str eq ''); -+ } -+ return ($decoded, $detected); -+} -+ -+sub _encode_detect { -+ return undef unless HAS_ENCODE_DETECT; -+ my $str = shift; -+ -+ # UTF-16|32 encoding without BOM cannot be trusted. -+ return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/); -+ return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/); -+ -+ my $decoded; -+ my $detected = Encode::Detect::Detector::detect($str); -+ if ($detected) { -+ $detected = _get_alias($detected); -+ my $encoder = Encode::find_encoding($detected); -+ if (ref($encoder)) { -+ $decoded = $encoder->decode($str); -+ $detected = $decoded ? $encoder->name : undef; -+ } -+ else { -+ $detected = undef; -+ } -+ } -+ return ($decoded, $detected); -+} -+ -+sub _encode_guess { -+ my $str = shift; -+ -+ my $detected; -+ my $decoded; -+ my $encoder; -+ -+ # Step 1: Examine ISO-2022-*. -+ if ($str =~ /\e/) { -+ $Encode::Guess::NoUTFAutoGuess = 1; -+ $encoder = Encode::Guess::guess_encoding($str, -+ qw/cp50221 7bit-jis iso-2022-kr/); -+ $Encode::Guess::NoUTFAutoGuess = 0; -+ } -+ -+ # Step 2: Examine US-ASCII/UTF-(8|16|32) -+ unless (ref($encoder)) { -+ $Encode::Guess::NoUTFAutoGuess = 0; -+ $encoder = Encode::Guess::guess_encoding($str); -+ } -+ -+ # Step 3: Examine other encodings -+ unless (ref($encoder)) { -+ $Encode::Guess::NoUTFAutoGuess = 1; -+ eval { -+ if ($str =~ /[\x80-\xFF]{4}/) { -+ $encoder = Encode::Guess::guess_encoding($str, -+ qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); -+ } -+ else { -+ $encoder = Encode::Guess::guess_encoding($str, -+ qw/iso-8859-1 cp1252/); -+ } -+ }; -+ $Encode::Guess::NoUTFAutoGuess = 0; -+ } -+ if (ref($encoder)) { -+ $detected = $encoder->name; -+ if ($detected) { -+ $decoded = $encoder->decode($str); -+ } -+ } -+ return ($decoded, $detected); -+} -+ -+sub _get_alias { -+ my $encoding = shift; -+ -+ unless (HAS_ENCODE_HANEXTRA) { -+ $encoding =~ s/^gb18030$/euc-cn/i; -+ } -+ $encoding =~ s/^unicode-1-1-(.+)$/$1/i; -+ $encoding =~ s/^TIS-620$/iso-8859-11/i; -+ $encoding =~ s/x-mac-(.+)$/Mac$1/i; -+ $encoding =~ s/^Shift_JIS$/cp932/i; -+ if (HAS_ENCODE_EUCJPMS) { -+ $encoding =~ s/^iso-2022-jp$/cp50221/i; -+ $encoding =~ s/^euc-jp$/cp51932/i; -+ } -+ -+ return $encoding; -+} -+ -+ -+1; -+ diff --git a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist b/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist deleted file mode 100644 index 1292bbb2d733..000000000000 --- a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist +++ /dev/null @@ -1,12 +0,0 @@ -%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm -%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm -@dirrm %%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer -%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer.pm -%%SITE_PERL%%/Mail/SpamAssassin/Util/Charset.pm -%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer.3.gz -%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::MeCab.3.gz -%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA.3.gz -%%PERL5_MAN3%%/Mail::SpamAssassin::Util::Charset.3.gz -@unexec if cmp -s %D/%%ETCDIR%%/%%TOKENIZER_PRE%%.sample %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; then rm -f %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; fi -%%ETCDIR%%/%%TOKENIZER_PRE%%.sample -@exec if [ ! -f %B/%%TOKENIZER_PRE%% ]; then cp -p %B/%f %B/%%TOKENIZER_PRE%%; fi diff --git a/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre b/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre deleted file mode 100644 index d21410bbadc9..000000000000 --- a/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre +++ /dev/null @@ -1,8 +0,0 @@ - -# Tokenizer::SimpleJA -# -loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA - -# Tokenizer::MeCab -# -#loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab diff --git a/japanese/p5-Mail-SpamAssassin/pkg-message b/japanese/p5-Mail-SpamAssassin/pkg-message deleted file mode 100644 index 49cea24cb5a8..000000000000 --- a/japanese/p5-Mail-SpamAssassin/pkg-message +++ /dev/null @@ -1,11 +0,0 @@ - -************************************************************************ -For Japanese users, see documents in -http://emaillab.jp/spamassassin/ja-patch/ - -Tokenizer::MeCab uses UTF-8 encoding. You may have to manually -(re)install the following ports with the build options for UTF-8: - japanese/mecab WITH_CHARSET=utf-8 - japanese/mecab-ipadic WITH_CHARSET=utf-8 - japanese/p5-MeCab -************************************************************************ |