aboutsummaryrefslogtreecommitdiffstats
path: root/japanese/p5-Mail-SpamAssassin
diff options
context:
space:
mode:
authoradamw <adamw@FreeBSD.org>2014-03-12 05:49:40 +0800
committeradamw <adamw@FreeBSD.org>2014-03-12 05:49:40 +0800
commit4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0 (patch)
tree7c7c408a52b143121035d7bda8f7d6d99104c54e /japanese/p5-Mail-SpamAssassin
parent5d2367ae581b0263527bd9fbaf6cfce5897cf270 (diff)
downloadfreebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.tar.gz
freebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.tar.zst
freebsd-ports-gnome-4b6420ac1604ec91e0adc0d109cd3ae8ab4cf9b0.zip
Move {mail,japanese}/p5-Mail-SpamAssassin to &/spamassassin, in the name of
improving accessibility. I think people who want to just find the port/package and install it are more likely to look for "spamassassin the program" than "spamassassin the perl module collection."
Diffstat (limited to 'japanese/p5-Mail-SpamAssassin')
-rw-r--r--japanese/p5-Mail-SpamAssassin/Makefile35
-rw-r--r--japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch1143
-rw-r--r--japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist12
-rw-r--r--japanese/p5-Mail-SpamAssassin/files/tokenizer.pre8
-rw-r--r--japanese/p5-Mail-SpamAssassin/pkg-message11
5 files changed, 0 insertions, 1209 deletions
diff --git a/japanese/p5-Mail-SpamAssassin/Makefile b/japanese/p5-Mail-SpamAssassin/Makefile
deleted file mode 100644
index 5c00e75e17ea..000000000000
--- a/japanese/p5-Mail-SpamAssassin/Makefile
+++ /dev/null
@@ -1,35 +0,0 @@
-# Created by: TAOKA Fumiyoshi
-# $FreeBSD$
-
-PORTREVISION= 2
-CATEGORIES= japanese mail perl5
-PKGNAMEPREFIX= ja-p5-
-
-MAINTAINER= masaki@club.kyutech.ac.jp
-COMMENT= SpamAssassin with patches to handle multibyte character
-
-LICENSE= APACHE20
-
-MASTERDIR= ${.CURDIR}/../../mail/p5-Mail-SpamAssassin
-
-BUILD_DEPENDS= ja-p5-MeCab>=0.98:${PORTSDIR}/japanese/p5-MeCab
-
-CONFLICTS= p5-Mail-SpamAssassin-[0-9]*
-
-EXTRA_PATCHES= ${.CURDIR}/files/spamassassin-ja.patch
-
-PKGMESSAGE= ${.CURDIR}/pkg-message
-PLIST= ${WRKDIR}/pkg-plist
-
-TOKENIZER_PRE= tokenizer.pre
-
-PLIST_SUB+= TOKENIZER_PRE=${TOKENIZER_PRE}
-
-pre-install:
- @${CAT} ${EXTRA_PATCHES:S/.patch/.plist/} > ${PLIST}
- @${CAT} ${PKGDIR}/pkg-plist >> ${PLIST}
-
-post-install::
- ${INSTALL_DATA} ${.CURDIR}/files/${TOKENIZER_PRE} ${STAGEDIR}${ETCDIR}/${TOKENIZER_PRE}.sample
-
-.include "${MASTERDIR}/Makefile"
diff --git a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch b/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch
deleted file mode 100644
index 3544abe3555d..000000000000
--- a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.patch
+++ /dev/null
@@ -1,1143 +0,0 @@
---- lib/Mail/SpamAssassin/HTML.pm.orig 2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/HTML.pm 2014-03-04 11:18:44.000000000 +0900
-@@ -86,7 +86,7 @@
- $ok_attributes{div}{$_} = 1 for qw( style );
-
- sub new {
-- my ($class) = @_;
-+ my ($class, $opts) = @_;
- my $self = $class->SUPER::new(
- api_version => 3,
- handlers => [
-@@ -99,6 +99,7 @@
- declaration => ["html_declaration", "self,text"],
- ],
- marked_sections => 1);
-+ $self->{normalize} = $opts->{'normalize'} || 0;
-
- $self;
- }
-@@ -681,7 +682,14 @@
- }
- }
- else {
-- $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
-+ if ($self->{normalize}) {
-+ $text =~ s/\xc2\xa0/ /g; # no-break space
-+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace
-+ $text =~ s/[ \t\n\r\f\x0b]+/ /g;
-+ }
-+ else {
-+ $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
-+ }
- # trim leading whitespace if previous element was whitespace
- # and current element is not invisible
- if (@{ $self->{text} } && !$display{invisible} &&
---- lib/Mail/SpamAssassin/Message/Node.pm.orig 2014-02-07 17:36:23.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message/Node.pm 2014-03-04 11:22:38.000000000 +0900
-@@ -42,6 +42,7 @@
- use Mail::SpamAssassin::Constants qw(:sa);
- use Mail::SpamAssassin::HTML;
- use Mail::SpamAssassin::Logger;
-+use Mail::SpamAssassin::Util::Charset;
-
- =item new()
-
-@@ -385,27 +386,10 @@
-
- sub _normalize {
- my ($self, $data, $charset) = @_;
-- return $data unless $self->{normalize};
-+ return wantarray ? ($data, $charset) : $data unless $self->{normalize};
-
-- my $detected = Encode::Detect::Detector::detect($data);
--
-- my $converter;
--
-- if ($charset && $charset !~ /^us-ascii$/i &&
-- ($detected || 'none') !~ /^(?:UTF|EUC|ISO-2022|Shift_JIS|Big5|GB)/i) {
-- dbg("message: Using labeled charset $charset");
-- $converter = Encode::find_encoding($charset);
-- }
--
-- $converter = Encode::find_encoding($detected) unless $converter || !defined($detected);
--
-- return $data unless $converter;
--
-- dbg("message: Converting...");
--
-- my $rv = $converter->decode($data, 0);
-- utf8::downgrade($rv, 1);
-- return $rv
-+ my ($decoded_data, $detected_charset) = normalize_charset($data, $charset);
-+ return wantarray ? ($decoded_data, $detected_charset) : $decoded_data;
- }
-
- =item rendered()
-@@ -428,8 +412,12 @@
- # text/x-aol is ignored here, but looks like text/html ...
- return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
-
-- my $text = $self->_normalize($self->decode(), $self->{charset});
-+ my ($text, $charset) = $self->_normalize($self->decode(), $self->{charset});
- my $raw = length($text);
-+ if ($self->{normalize}) {
-+ $self->{charset} = $charset;
-+ $self->{language} = get_language($text, $charset);
-+ }
-
- # render text/html always, or any other text|text/plain part as text/html
- # based on a heuristic which simulates a certain common mail client
-@@ -439,7 +427,7 @@
- {
- $self->{rendered_type} = 'text/html';
-
-- my $html = Mail::SpamAssassin::HTML->new(); # object
-+ my $html = Mail::SpamAssassin::HTML->new({normalize=>$self->{normalize}}); # object
- $html->parse($text); # parse+render text
- $self->{rendered} = $html->get_rendered_text();
- $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
---- lib/Mail/SpamAssassin/Message.pm.orig 2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message.pm 2014-03-04 11:27:31.000000000 +0900
-@@ -604,6 +604,8 @@
- delete $self->{'pristine_headers'};
- delete $self->{'line_ending'};
- delete $self->{'missing_head_body_separator'};
-+ delete $self->{'charset'};
-+ delete $self->{'language'};
-
- my @toclean = ( $self );
-
-@@ -630,6 +632,8 @@
- delete $part->{'invisible_rendered'};
- delete $part->{'type'};
- delete $part->{'rendered_type'};
-+ delete $self->{'charset'};
-+ delete $self->{'language'};
-
- # if there are children nodes, add them to the queue of nodes to clean up
- if (exists $part->{'body_parts'}) {
-@@ -1085,7 +1089,14 @@
-
- # whitespace handling (warning: small changes have large effects!)
- $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
-- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ if ($self->{normalize}) {
-+ $text =~ s/\xc2\xa0/ /g; # no-break space => space
-+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space
-+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space
-+ }
-+ else {
-+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ }
- $text =~ tr/\f/\n/; # form feeds => newline
-
- # warn "message: $text";
-@@ -1142,7 +1153,14 @@
-
- # whitespace handling (warning: small changes have large effects!)
- $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
-- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ if ($self->{normalize}) {
-+ $text =~ s/\xc2\xa0/ /g; # no-break space => space
-+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space
-+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space
-+ }
-+ else {
-+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ }
- $text =~ tr/\f/\n/; # form feeds => newline
-
- my @textary = split_into_array_of_short_lines ($text);
-@@ -1193,7 +1211,14 @@
-
- # whitespace handling (warning: small changes have large effects!)
- $text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
-- $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ if ($self->{normalize}) {
-+ $text =~ s/\xc2\xa0/ /g; # no-break space => space
-+ $text =~ s/\xe3\x80\x80/ /g; # ideographicspace => space
-+ $text =~ tr/ \t\n\r\x0b/ /s; # whitespace => space
-+ }
-+ else {
-+ $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace => space
-+ }
- $text =~ tr/\f/\n/; # form feeds => newline
-
- my @textary = split_into_array_of_short_lines ($text);
-@@ -1269,6 +1294,28 @@
-
- # ---------------------------------------------------------------------------
-
-+sub get_language {
-+ my ($self) = @_;
-+
-+ if (defined $self->{language}) { return $self->{language}; }
-+ my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
-+ return '' unless @parts;
-+
-+ # Go through each part
-+ my @langs;
-+ for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
-+ my $p = $parts[$pt];
-+ my $lang = $p->{language};
-+ next unless ($lang);
-+ push(@langs, $lang) unless (grep(/^$lang$/, @langs))
-+ }
-+ $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
-+ return $self->{language};
-+}
-+
-+# ---------------------------------------------------------------------------
-+
-+
- 1;
-
- =back
---- lib/Mail/SpamAssassin/PerMsgStatus.pm.orig 2014-02-07 17:36:28.000000000 +0900
-+++ lib/Mail/SpamAssassin/PerMsgStatus.pm 2014-03-04 11:30:25.000000000 +0900
-@@ -53,6 +53,7 @@
- use warnings;
- use re 'taint';
-
-+use Encode;
- use Errno qw(ENOENT);
- use Time::HiRes qw(time);
-
-@@ -996,19 +997,41 @@
-
- # the report charset
- my $report_charset = "; charset=iso-8859-1";
-- if ($self->{conf}->{report_charset}) {
-- $report_charset = "; charset=" . $self->{conf}->{report_charset};
-- }
-
- # the SpamAssassin report
- my $report = $self->get_report();
-+ if ($self->{conf}->{report_charset}) {
-+ $report_charset = "; charset=" . $self->{conf}->{report_charset};
-+ }
-
- # If there are any wide characters, need to MIME-encode in UTF-8
- # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
- # we could try converting to that charset if possible
-- unless ($] < 5.008 || utf8::downgrade($report, 1)) {
-+ my $is_utf8 = 0;
-+ if ($self->{conf}->{normalize_charset}) {
-+ $report = Encode::decode_utf8($report);
-+ $is_utf8 = 1;
-+ }
-+ else {
-+ if ($self->{msg}->{charset}) {
-+ eval {
-+ my $scratch = $report;
-+ $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK);
-+ $is_utf8 = 1;
-+ };
-+ }
-+ }
-+ if ($is_utf8) {
-+ $is_utf8 = 1;
-+ eval {
-+ my $scratch = $report;
-+ $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
-+ $is_utf8 = 0;
-+ };
-+ if ($is_utf8) {
-+ $report = Encode::encode_utf8($report);
- $report_charset = "; charset=utf-8";
-- utf8::encode($report);
-+ }
- }
-
- # get original headers, "pristine" if we can do it
---- lib/Mail/SpamAssassin/Plugin/Bayes.pm.orig 2014-02-07 17:36:27.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm 2014-03-04 11:34:46.000000000 +0900
-@@ -223,6 +223,15 @@
- # will require a longer token than English ones.)
- use constant MAX_TOKEN_LENGTH => 15;
-
-+# Skip if a token is too short.
-+our $SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
-+ [\x00-\x7F] # 1 byte
-+ | [\xC0-\xDF][\x80-\xBF] # 2 bytes
-+ | [\xE0-\xEF][\x80-\xBF]{2} # 3 bytes
-+ | [\xF0-\xF7][\x80-\xBF]{3} # 4 bytes
-+ | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana
-+)}x;
-+
- ###########################################################################
-
- sub new {
-@@ -1039,9 +1048,28 @@
- $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
- $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
- @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
-+ if ($self->{conf}->{normalize_charset}) {
-+ my $tokenizer = $self->get_tokenizer($msg);
-+ if (ref($tokenizer)) {
-+ $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
-+ $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
-+ }
-+ }
- return $msgdata;
- }
-
-+sub get_tokenizer {
-+ my ($self, $msg) = @_;
-+
-+ my $tokenizer;
-+ my @languages = split(/\s+/, $msg->{msg}->get_language());
-+ foreach my $lang (@languages) {
-+ $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang};
-+ last if (ref($tokenizer));
-+ }
-+ return $tokenizer;
-+}
-+
- ###########################################################################
-
- # The calling functions expect a uniq'ed array of tokens ...
-@@ -1095,7 +1123,7 @@
- # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
- # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
- # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
-- tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
-+ tr/-A-Za-z0-9,\@\*\!_'"\$.\200-\377 / /cs;
-
- # DO split on "..." or "--" or "---"; common formatting error resulting in
- # hapaxes. Keep the separator itself as a token, though, as long ones can
-@@ -1124,6 +1152,11 @@
- #
- next if ( defined $magic_re && $token =~ /$magic_re/ );
-
-+ # Skip short UTF-8 tokens.
-+ if ($self->{conf}->{normalize_charset}) {
-+ next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
-+ }
-+
- # *do* keep 3-byte tokens; there's some solid signs in there
- my $len = length($token);
-
-@@ -1152,14 +1185,16 @@
- # the domain ".net" appeared in the To header.
- #
- if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
-- if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
-- # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
-- # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
-- # to me! (jm)
-- while ($token =~ s/^(..?)//) {
-- push (@rettokens, "8:$1");
-- }
-- next;
-+ unless ($self->{conf}->{normalize_charset}) {
-+ if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
-+ # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
-+ # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
-+ # to me! (jm)
-+ while ($token =~ s/^(..?)//) {
-+ push (@rettokens, "8:$1");
-+ }
-+ next;
-+ }
- }
-
- if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
---- /dev/null 1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm 2011-07-14 22:29:19.000000000 +0900
-@@ -0,0 +1,84 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Tokenizer::MeCab - Japanese tokenizer with MeCab
-+
-+=head1 SYNOPSIS
-+
-+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab
-+
-+=head1 DESCRIPTION
-+
-+This plugin tokenizes a Japanese string with MeCab that is
-+the morphological analysis engine.
-+
-+Text::MeCab 0.12 or over is required.
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;
-+
-+use strict;
-+use warnings;
-+use Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+
-+# Have to do this so that RPM doesn't find these as required perl modules
-+BEGIN { require MeCab; }
-+our $language = 'ja';
-+our $mecab = new MeCab::Tagger(-Ochasen);
-+
-+sub new {
-+ my $class = shift;
-+ my $mailsaobject = shift;
-+
-+ $class = ref($class) || $class;
-+ my $self = $class->SUPER::new($mailsaobject, $language);
-+ bless ($self, $class);
-+
-+ return $self;
-+}
-+
-+sub tokenize {
-+ my $self = shift;
-+ my $text_array = shift;
-+
-+ my @tokenized_array;
-+ foreach my $text (@$text_array) {
-+ next unless ($text);
-+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
-+ push(@tokenized_array, $text);
-+ }
-+ return \@tokenized_array;
-+}
-+
-+sub _tokenize {
-+ my $text = shift;
-+
-+ my @buf;
-+ for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
-+ push(@buf, $node->{surface});
-+ }
-+ my $tokenized = join(' ', @buf) . ' ';
-+ return $tokenized;
-+}
-+
-+1;
-+
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
---- /dev/null 1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm 2011-07-14 22:29:19.000000000 +0900
-@@ -0,0 +1,111 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Tokenizer::SimpleJA - simple Japanese tokenizer
-+
-+=head1 SYNOPSIS
-+
-+loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
-+
-+=head1 DESCRIPTION
-+
-+This plugin simply tokenizes a Japanese string by characters other than
-+the alphabet, the Chinese character, and the katakana.
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;
-+
-+use strict;
-+use warnings;
-+use Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+
-+our $language = 'ja';
-+
-+our $RE = qr{(
-+ # Hiragana
-+ (?:
-+ \xE3\x81[\x80-\xBF]
-+ | \xE3\x82[\x80-\x9F]
-+ )+
-+ # Katakana
-+ | (?:
-+ \xE3\x82[\xA0-\xBF]
-+ | \xE3\x83[\x80-\xBF]
-+ )+
-+ # Kanji
-+ | (?:
-+ \xE3[\x90-\xBF][\x80-\xBF]
-+ | [\xE4-\xE9][\x80-\xBF]{2}
-+ | \xEF[\xA4-\xAB][\x80-\xBF]
-+ )+
-+ # Fullwidth
-+ | (?:
-+ \xEF\xBC[\x80-\xBF]
-+ | \xEF\xBD[\x80-\x9F]
-+ )+
-+ # Others
-+ | [\xC0-\xDF][\x80-\xBF]
-+ | [\xE0-\xE2][\x80-\xBF]{2}
-+ | \xE3\x80[\x80-\xBF]
-+ | \xE3[\x84-\x8F][\x80-\xBF]
-+ | [\xEA-\xEE][\x80-\xBF]{2}
-+ | \xEF[\x80-\xA3][\x80-\xBF]
-+ | \xEF[\xAC-\xBB][\x80-\xBF]
-+ | \xEF\xBD[\xA0-\xBF]
-+ | \xEF[\xBE-\xBF][\x80-\xBF]
-+ | [\xF0-\xF7][\x80-\xBF]{3}
-+)}x;
-+
-+sub new {
-+ my $class = shift;
-+ my $mailsaobject = shift;
-+
-+ $class = ref($class) || $class;
-+ my $self = $class->SUPER::new($mailsaobject, $language);
-+ bless ($self, $class);
-+
-+ return $self;
-+}
-+
-+sub tokenize {
-+ my $self = shift;
-+ my $text_array = shift;
-+
-+ my @tokenized_array;
-+ foreach my $text (@$text_array) {
-+ next unless ($text);
-+ $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
-+ push(@tokenized_array, $text);
-+ }
-+ return \@tokenized_array;
-+}
-+
-+sub _tokenize {
-+ my $text = shift;
-+
-+ $text =~ s/$RE/$1 /og;
-+ $text = ' ' . $text;
-+ return $text;
-+}
-+
-+1;
-+
-diff -uNr /dev/null lib/Mail/SpamAssassin/Plugin/Tokenizer.pm
---- /dev/null 1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm 2011-07-14 22:35:46.000000000 +0900
-@@ -0,0 +1,115 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class
-+
-+=head1 SYNOPSIS
-+
-+=head2 SpamAssassin configuration:
-+
-+ loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm
-+
-+=head2 Perl code:
-+
-+ use Mail::SpamAssassin::Plugin::Tokenizer;
-+ use vars qw(@ISA);
-+ @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+ # language to use this plugin
-+ our $language = 'ja';
-+
-+ # constructor: register language
-+ sub new {
-+ my $class = shift;
-+ my $mailsaobject = shift;
-+
-+ # some boilerplate...
-+ $class = ref($class) || $class;
-+ my $self = $class->SUPER::new($mailsaobject, $language);
-+ bless ($self, $class);
-+
-+ return $self;
-+ }
-+
-+ # tokenize function
-+ sub tokenize {
-+ my $self = shift;
-+ my $text_array_ref = shift;
-+
-+ ......
-+
-+ return $tokenized_array_ref;
-+ }
-+
-+
-+=head1 DESCRIPTION
-+
-+This plugin is the base class of tokenizer plugin.
-+You must define tokenize() and $language
-+
-+=head1 INTERFACE
-+
-+ sub tokenize {
-+ my $self = shift;
-+ my $text_array_ref = shift;
-+
-+ ......
-+
-+ return $tokenized_array_ref;
-+ }
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use Mail::SpamAssassin::Plugin;
-+use Mail::SpamAssassin::Logger;
-+use strict;
-+use warnings;
-+use bytes;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin);
-+
-+sub new {
-+ my $class = shift;
-+ my $mailsaobject = shift;
-+ my $language = shift;
-+
-+ # some boilerplate...
-+ $class = ref($class) || $class;
-+ my $self = $class->SUPER::new($mailsaobject);
-+ bless ($self, $class);
-+
-+ if ($language) {
-+ $self->{main}->{conf}->{tokenizer}->{$language} = $self;
-+ }
-+ else {
-+ dbg("plugin: $self: \$language is not defined");
-+ }
-+
-+ return $self;
-+}
-+
-+sub tokenize {
-+ my ($self, $ref) = @_;
-+
-+ return $ref;
-+}
-+
-+1;
-+
-diff -uNr /dev/null lib/Mail/SpamAssassin/Util/Charset.pm
---- /dev/null 1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Util/Charset.pm 2011-07-14 22:29:19.000000000 +0900
-@@ -0,0 +1,471 @@
-+# <@LICENSE>
-+# Copyright 2006 Apache Software Foundation
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+
-+=head1 NAME
-+
-+ Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language
-+
-+=head1 SYNOPSIS
-+
-+ my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
-+ my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);
-+
-+=head1 DESCRIPTION
-+
-+This module implements utility methods for charset and language.
-+
-+=cut
-+
-+package Mail::SpamAssassin::Util::Charset;
-+
-+use strict;
-+use warnings;
-+use Encode;
-+use Encode::Guess;
-+use Encode::Alias;
-+
-+use vars qw (
-+ @ISA @EXPORT
-+);
-+
-+require Exporter;
-+
-+@ISA = qw(Exporter);
-+@EXPORT = qw(normalize_charset get_language);
-+
-+###########################################################################
-+
-+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
-+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
-+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };
-+
-+###########################################################################
-+
-+our $KANA_HAN_RE = qr{
-+ # Hiragana and Katakana
-+ \xE3[\x81-\x83][\x80-\xBF]
-+ # Han
-+ | \xE3[\x90-\xBF][\x80-\xBF]
-+ | [\xE4-\xE9][\x80-\xBF]{2}
-+ | \xEF[\xA4-\xAB][\x80-\xBF]
-+}x;
-+
-+our %enc2lang;
-+our %lang2enc;
-+our %scr2lang;
-+our %cjkscr2lang;
-+our @scrorder;
-+
-+BEGIN {
-+
-+ # See the following URL about this map:
-+ # http://czyborra.com/charsets/iso8859.html
-+ # http://czyborra.com/charsets/codepages.html
-+ # http://czyborra.com/charsets/cyrillic.html
-+ # http://en.wikipedia.org/wiki/ISO_8859
-+ # http://www.w3.org/International/O-charset-lang.html
-+ %enc2lang = (
-+ # buint-in Encodings and Encode::Byte
-+ # N. America
-+ 'ascii' => 'en',
-+ 'cp437' => 'en',
-+ 'cp863' => 'weurope',
-+
-+ # W. Europe (Latin1, Latin9)
-+ # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
-+ 'iso-8859-1' => 'weurope',
-+ 'iso-8859-15' => 'weurope',
-+ 'cp850' => 'weurope',
-+ 'cp860' => 'weurope',
-+ 'cp1252' => 'weurope',
-+ 'MacRoman' => 'weurope',
-+
-+ # Cntrl. Europe / Latin2 / Latin10
-+ # hr cs hu pl sr sk sl
-+ 'iso-8859-2' => 'ceurope',
-+ 'cp852' => 'ceurope',
-+ 'cp1250' => 'ceurope',
-+ 'MacCentralEurRoman' => 'ceurope',
-+ 'MacCroatian' => 'ceurope',
-+ 'iso-8859-16' => 'ceurope',
-+ 'MacRomanian' => 'ceurope',
-+
-+ # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
-+ # eo mt
-+ 'iso-8859-3' => 'seurope',
-+
-+ # Baltics (Latin4, Latin7)
-+ # lv lt
-+ 'iso-8859-4' => 'neurope',
-+ 'iso-8859-13' => 'baltic',
-+ 'cp1257' => 'baltic',
-+
-+ # Nordics (Latin6)
-+ # et kl iu se
-+ 'iso-8859-10' => 'nordic',
-+
-+ # Cyrillics
-+ # bg be uk sr mk ru
-+ 'iso-8859-5' => 'ru',
-+ 'cp855' => 'ru',
-+ 'cp1251' => 'ru',
-+ 'cp866' => 'ru',
-+ 'MacCyrillic' => 'ru',
-+ 'koi8-r' => 'ru',
-+ 'MacUkrainian' => 'uk',
-+ 'koi8-u' => 'uk',
-+
-+ # Arabic
-+ 'iso-8859-6' => 'ar',
-+ 'cp864' => 'ar',
-+ 'cp1256' => 'ar',
-+ 'MacArabic' => 'ar',
-+ 'cp1006' => 'fa',
-+ 'MacFarsi' => 'fa',
-+
-+ # Greek
-+ 'iso-8859-7' => 'el',
-+ 'cp1253' => 'el',
-+ 'MacGreek' => 'el',
-+
-+ # Hebrew
-+ # he yi
-+ 'iso-8859-8' => 'he',
-+ 'cp862' => 'he',
-+ 'cp1255' => 'he',
-+ 'MacHebrew' => 'he',
-+
-+ # Turkish
-+ 'iso-8859-9' => 'tr',
-+ 'cp857' => 'tr',
-+ 'cp1254' => 'tr',
-+ 'MacTurkish' => 'tr',
-+
-+ # Thai
-+ 'iso-8859-11' => 'th',
-+ 'cp874' => 'th',
-+
-+ # Celtics (Latin8)
-+ # gd cy br
-+ 'iso-8859-14' => 'celtic',
-+
-+ # Vietnamese
-+ 'viscii' => 'vi',
-+ 'cp1258' => 'vi',
-+
-+ # Encode::CN
-+ 'euc-cn' => 'zh',
-+ 'cp936' => 'zh',
-+ 'hz' => 'zh',
-+
-+ # Encode::TW
-+ 'big5-eten' => 'zh',
-+ 'big5-hkscs' => 'zh',
-+ 'cp950' => 'zh',
-+
-+ # Encode::JP
-+ 'euc-jp' => 'ja',
-+ 'shiftjis' => 'ja',
-+ '7bit-jis' => 'ja',
-+ 'iso-2022-jp' => 'ja',
-+ 'iso-2022-jp-1' => 'ja',
-+ 'cp932' => 'ja',
-+
-+ # Encode::KR
-+ 'euc-kr' => 'ko',
-+ 'cp949' => 'ko',
-+ 'johab' => 'ko',
-+ 'iso-2022-kr' => 'ko',
-+
-+ # Encode::HanExtra
-+ 'euc-tw' => 'zh',
-+ 'gb18030' => 'zh',
-+
-+ # Encode::JIS2K
-+ 'euc-jisx0213' => 'ja',
-+ 'shiftjisx0123' => 'ja',
-+ 'iso-2022-jp-3' => 'ja',
-+
-+ # Encode::EUCJPMS
-+ 'eucJP-ms' => 'ja',
-+ 'cp51932' => 'ja',
-+ 'cp50220' => 'ja',
-+ 'cp50221' => 'ja',
-+
-+ );
-+
-+ %lang2enc = (
-+ # Latin1
-+ 'en' => ['ascii'],
-+ 'weurope' => ['cp1252'],
-+
-+ # Latin2
-+ 'ceurope' => ['cp1250'],
-+
-+ # Latin3
-+ 'seurope' => ['iso-8859-3'],
-+
-+ # Latin4
-+ 'neurope' => ['iso-8859-4'],
-+
-+ # Latin5
-+ 'tr' => ['cp1254'],
-+
-+ # Latin6
-+ 'nordic' => ['iso-8859-10'],
-+
-+ # Latin7
-+ 'baltic' => ['cp1257'],
-+
-+ # Latin8
-+ 'celtic' => ['iso-8859-14'],
-+
-+ # Non Latin
-+ 'ru' => ['koi8-r', 'cp1251'],
-+ 'uk' => ['koi8-u'],
-+
-+ 'ar' => ['cp1256'],
-+ 'el' => ['cp1253'],
-+ 'he' => ['cp1255'],
-+ 'th' => ['cp874'],
-+ 'vi' => ['viscii', 'cp1258'],
-+ 'zh' => ['euc-cn', 'cp950'],
-+ 'ja' => ['euc-jp', 'cp932'],
-+ 'ko' => ['euc-kr', 'cp949'],
-+
-+ );
-+
-+ %scr2lang = (
-+ 'InLatin1Supplement' => ['weurope'],
-+ 'InLatinExtendedA' => [
-+ 'ceurope',
-+ 'seurope',
-+ 'tr',
-+ 'vi'
-+ ],
-+ 'InLatinExtendedB' => [
-+ 'nordic',
-+ 'baltic',
-+ 'celtic'
-+ ],
-+ 'Thai' => ['th'],
-+ 'Cyrillic' => ['ru', 'uk'],
-+ 'Arabic' => ['ar'],
-+ 'Greek' => ['el'],
-+ 'Hebrew' => ['he'],
-+ );
-+
-+ # better detection for CJK
-+ @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang));
-+ %cjkscr2lang = (
-+ 'Hiragana' => ['ja'],
-+ 'Katakana' => ['ja'],
-+ 'Hangul' => ['ko'],
-+ 'Han' => ['zh', 'ja', 'ko'],
-+ );
-+
-+ unless (HAS_ENCODE_HANEXTRA) {
-+ Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
-+ }
-+ Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
-+ Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
-+ Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
-+ Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
-+ if (HAS_ENCODE_EUCJPMS) {
-+ Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
-+ }
-+}
-+
-+sub get_language {
-+ my $str = shift; # $str must be UTF-8 encoding
-+ my $charset = shift;
-+
-+ return 'en' unless $charset;
-+ if ($charset !~ /^utf/i) {
-+ return $enc2lang{$charset};
-+ } elsif (defined($str)) {
-+ $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
-+ return 'en' if ($str eq '');
-+
-+ my %handled;
-+ $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
-+ foreach my $scr (@scrorder) {
-+ next if ($str !~ /\p{$scr}/);
-+ my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
-+ foreach my $lang (@$scrlangs) {
-+ next if (exists($handled{$lang}));
-+ foreach my $enc (@{$lang2enc{$lang}}) {
-+ my $scratch = $str;
-+ Encode::encode($enc, $scratch, Encode::FB_QUIET);
-+ return $lang if ($scratch eq '');
-+ }
-+ $handled{$lang} = 1;
-+ }
-+ }
-+ }
-+ return 'en';
-+}
-+
-+# TEST 1: try conversion to use the specified charset.
-+# TEST 2: try conversion to use Encode::Detect.
-+# TEST 3: try conversion to use Encode::Guess.
-+sub normalize_charset {
-+ my $str = shift;
-+ my $charset = shift;
-+
-+ return wantarray ? ($str, 'ascii') : $str unless ($str);
-+
-+ my $decoded;
-+ my $detected;
-+
-+ if ($charset) {
-+ ($decoded, $detected) = _specified_encoding($str, $charset);
-+ }
-+ unless ($detected) {
-+ ($decoded, $detected) = _encode_detect($str);
-+ }
-+ unless ($detected) {
-+ ($decoded, $detected) = _encode_guess($str);
-+ }
-+ unless ($detected) {
-+ return ($str, undef);
-+ }
-+ $decoded =~ s/^\x{feff}//g;
-+ $decoded = Encode::encode_utf8($decoded);
-+
-+ # unfold hiragana, katakana and han
-+ if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) {
-+ $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
-+ }
-+ return wantarray ? ($decoded, $detected) : $decoded;
-+}
-+
-+sub _specified_encoding {
-+ my $str = shift;
-+ my $encoding = shift;
-+
-+ my $detected;
-+ my $decoded;
-+
-+ return (undef, undef) unless ($encoding);
-+
-+ # note: ISO-2022-* is not deistinguish from US-ASCII
-+ return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);
-+
-+ # UTF-16|32 encoding without BOM cannot be trusted.
-+ return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
-+ return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
-+
-+ #$encoding = _get_alias($encoding);
-+ my $encoder = Encode::find_encoding($encoding);
-+ if (ref($encoder)) {
-+ $decoded = $encoder->decode($str,Encode::FB_QUIET);
-+ $detected = $encoder->name if ($str eq '');
-+ }
-+ return ($decoded, $detected);
-+}
-+
-+sub _encode_detect {
-+ return undef unless HAS_ENCODE_DETECT;
-+ my $str = shift;
-+
-+ # UTF-16|32 encoding without BOM cannot be trusted.
-+ return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
-+ return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
-+
-+ my $decoded;
-+ my $detected = Encode::Detect::Detector::detect($str);
-+ if ($detected) {
-+ $detected = _get_alias($detected);
-+ my $encoder = Encode::find_encoding($detected);
-+ if (ref($encoder)) {
-+ $decoded = $encoder->decode($str);
-+ $detected = $decoded ? $encoder->name : undef;
-+ }
-+ else {
-+ $detected = undef;
-+ }
-+ }
-+ return ($decoded, $detected);
-+}
-+
-+sub _encode_guess {
-+ my $str = shift;
-+
-+ my $detected;
-+ my $decoded;
-+ my $encoder;
-+
-+ # Step 1: Examine ISO-2022-*.
-+ if ($str =~ /\e/) {
-+ $Encode::Guess::NoUTFAutoGuess = 1;
-+ $encoder = Encode::Guess::guess_encoding($str,
-+ qw/cp50221 7bit-jis iso-2022-kr/);
-+ $Encode::Guess::NoUTFAutoGuess = 0;
-+ }
-+
-+ # Step 2: Examine US-ASCII/UTF-(8|16|32)
-+ unless (ref($encoder)) {
-+ $Encode::Guess::NoUTFAutoGuess = 0;
-+ $encoder = Encode::Guess::guess_encoding($str);
-+ }
-+
-+ # Step 3: Examine other encodings
-+ unless (ref($encoder)) {
-+ $Encode::Guess::NoUTFAutoGuess = 1;
-+ eval {
-+ if ($str =~ /[\x80-\xFF]{4}/) {
-+ $encoder = Encode::Guess::guess_encoding($str,
-+ qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
-+ }
-+ else {
-+ $encoder = Encode::Guess::guess_encoding($str,
-+ qw/iso-8859-1 cp1252/);
-+ }
-+ };
-+ $Encode::Guess::NoUTFAutoGuess = 0;
-+ }
-+ if (ref($encoder)) {
-+ $detected = $encoder->name;
-+ if ($detected) {
-+ $decoded = $encoder->decode($str);
-+ }
-+ }
-+ return ($decoded, $detected);
-+}
-+
-+sub _get_alias {
-+ my $encoding = shift;
-+
-+ unless (HAS_ENCODE_HANEXTRA) {
-+ $encoding =~ s/^gb18030$/euc-cn/i;
-+ }
-+ $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
-+ $encoding =~ s/^TIS-620$/iso-8859-11/i;
-+ $encoding =~ s/x-mac-(.+)$/Mac$1/i;
-+ $encoding =~ s/^Shift_JIS$/cp932/i;
-+ if (HAS_ENCODE_EUCJPMS) {
-+ $encoding =~ s/^iso-2022-jp$/cp50221/i;
-+ $encoding =~ s/^euc-jp$/cp51932/i;
-+ }
-+
-+ return $encoding;
-+}
-+
-+
-+1;
-+
diff --git a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist b/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist
deleted file mode 100644
index 1292bbb2d733..000000000000
--- a/japanese/p5-Mail-SpamAssassin/files/spamassassin-ja.plist
+++ /dev/null
@@ -1,12 +0,0 @@
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
-@dirrm %%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer.pm
-%%SITE_PERL%%/Mail/SpamAssassin/Util/Charset.pm
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::MeCab.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Util::Charset.3.gz
-@unexec if cmp -s %D/%%ETCDIR%%/%%TOKENIZER_PRE%%.sample %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; then rm -f %D/%%ETCDIR%%/%%TOKENIZER_PRE%%; fi
-%%ETCDIR%%/%%TOKENIZER_PRE%%.sample
-@exec if [ ! -f %B/%%TOKENIZER_PRE%% ]; then cp -p %B/%f %B/%%TOKENIZER_PRE%%; fi
diff --git a/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre b/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre
deleted file mode 100644
index d21410bbadc9..000000000000
--- a/japanese/p5-Mail-SpamAssassin/files/tokenizer.pre
+++ /dev/null
@@ -1,8 +0,0 @@
-
-# Tokenizer::SimpleJA
-#
-loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
-
-# Tokenizer::MeCab
-#
-#loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab
diff --git a/japanese/p5-Mail-SpamAssassin/pkg-message b/japanese/p5-Mail-SpamAssassin/pkg-message
deleted file mode 100644
index 49cea24cb5a8..000000000000
--- a/japanese/p5-Mail-SpamAssassin/pkg-message
+++ /dev/null
@@ -1,11 +0,0 @@
-
-************************************************************************
-For Japanese users, see documents in
-http://emaillab.jp/spamassassin/ja-patch/
-
-Tokenizer::MeCab uses UTF-8 encoding. You may have to manually
-(re)install the following ports with the build options for UTF-8:
- japanese/mecab WITH_CHARSET=utf-8
- japanese/mecab-ipadic WITH_CHARSET=utf-8
- japanese/p5-MeCab
-************************************************************************