author: Rene Ladan <rene@FreeBSD.org> 2018-12-31 13:37:58 +0000
committer: Rene Ladan <rene@FreeBSD.org> 2018-12-31 13:37:58 +0000
commit: d1a5353b9cccb568b52fb525175941ad0b8a0345 (patch)
tree: 25abb8f5b9cdfd31fb9d446791a87606bf77ff0b /japanese
parent: d460c1a711a6e1319160cba021ecc281f1e4fc70 (diff)
download: ports-d1a5353b9cccb568b52fb525175941ad0b8a0345.tar.gz
ports-d1a5353b9cccb568b52fb525175941ad0b8a0345.zip
6 files changed, 0 insertions, 1133 deletions
diff --git a/japanese/Makefile b/japanese/Makefile
index 660ca8210f51..6f7da3bbc21d 100644
--- a/japanese/Makefile
+++ b/japanese/Makefile
@@ -259,7 +259,6 @@
     SUBDIR += skkserv
     SUBDIR += slimeforest
     SUBDIR += slrn
-    SUBDIR += spamassassin
     SUBDIR += srd-fpw-package
     SUBDIR += suicavalue
     SUBDIR += t2ps
diff --git a/japanese/spamassassin/Makefile b/japanese/spamassassin/Makefile
deleted file mode 100644
index 9e755ca624a5..000000000000
--- a/japanese/spamassassin/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-# Created by: TAOKA Fumiyoshi
-# $FreeBSD$
-
-PORTREVISION=	0
-CATEGORIES=	japanese mail perl5
-PKGNAMEPREFIX=	ja-
-
-MAINTAINER=	ports@FreeBSD.org
-COMMENT=	SpamAssassin with patches to handle multibyte character
-
-LICENSE=	APACHE20
-
-BROKEN=		patch does not apply
-DEPRECATED=	does not build
-EXPIRATION_DATE=2018-12-31
-
-MASTERDIR=	${.CURDIR}/../../mail/spamassassin
-
-BUILD_DEPENDS=	ja-p5-MeCab>=0.98:japanese/p5-MeCab
-
-CONFLICTS=	spamassassin-[0-9]*
-
-EXTRA_PATCHES=	${.CURDIR}/files/spamassassin-ja.patch
-
-PKGMESSAGE=	${.CURDIR}/pkg-message
-PLIST=		${WRKDIR}/pkg-plist
-
-TOKENIZER_PRE=	tokenizer.pre
-
-PLIST_SUB+=	TOKENIZER_PRE=${TOKENIZER_PRE}
-
-pre-install:
-	@${CAT} ${EXTRA_PATCHES:S/.patch/.plist/} > ${PLIST}
-	@${CAT} ${PKGDIR}/pkg-plist >> ${PLIST}
-
-post-install::
-	${INSTALL_DATA} ${.CURDIR}/files/${TOKENIZER_PRE} ${STAGEDIR}${ETCDIR}/${TOKENIZER_PRE}.sample
-
-.include "${MASTERDIR}/Makefile"
diff --git a/japanese/spamassassin/files/spamassassin-ja.patch b/japanese/spamassassin/files/spamassassin-ja.patch
deleted file mode 100644
index 968e1b986aee..000000000000
--- a/japanese/spamassassin/files/spamassassin-ja.patch
+++ /dev/null
@@ -1,1065 +0,0 @@
---- lib/Mail/SpamAssassin/HTML.pm	2015-04-29 04:56:49.000000000 +0900
-+++ lib/Mail/SpamAssassin/HTML.pm	2015-08-30 00:46:40.902000000 +0900
-@@ -695,7 +695,8 @@
-   }
-   else {
-     # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
--    $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
-+    # Ideographic Space: UTF-8: E3 80 80
-+    $text =~ s/[ \t\n\r\f\x0b]+|(?:\xc2\xa0)+|(?:\xe3\x80\x80)+/ /gs;
-     # trim leading whitespace if previous element was whitespace 
-     # and current element is not invisible
-     if (@{ $self->{text} } && !$display{invisible} &&
-@@ -742,7 +743,8 @@
-   my $invisible_for_bayes = 0;
- 
-   # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
--  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) {
-+  # Ideographic Space: UTF-8: E3 80 80
-+  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0|\xe3\x80\x80)*\z/s) {
-     $invisible_for_bayes = $self->html_font_invisible($text);
-   }
- 
---- lib/Mail/SpamAssassin/Message/Node.pm	2015-04-29 04:56:48.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message/Node.pm	2015-08-30 00:25:32.534000000 +0900
-@@ -44,6 +44,7 @@
- use Mail::SpamAssassin::Constants qw(:sa);
- use Mail::SpamAssassin::HTML;
- use Mail::SpamAssassin::Logger;
-+use Mail::SpamAssassin::Util::Charset;
- 
- our($enc_utf8, $enc_w1252, $have_encode_detector);
- BEGIN {
-@@ -407,6 +408,10 @@
- 
-   return $_[1]  unless $self->{normalize} && $enc_utf8;
- 
-+  # FIXME: to be merged.
-+  my ($decoded_data, $charset_detected) = normalize_charset($_[1], $charset_declared, $return_decoded);
-+  return wantarray ? ($decoded_data, $charset_detected) : $decoded_data;
-+
-   warn "message: _normalize() was given characters, expected bytes: $_[1]\n"
-     if utf8::is_utf8($_[1]);
- 
-@@ -603,6 +608,7 @@
- 
-     my $text = $self->decode;  # QP and Base64 decoding, bytes
-     my $text_len = length($text);  # num of bytes in original charset encoding
-+    my $charset;
- 
-     # render text/html always, or any other text|text/plain part as text/html
-     # based on a heuristic which simulates a certain common mail client
-@@ -622,7 +628,9 @@
-         # subroutine _normalize() to return Unicode text.  See Bug 7133
-         #
-         $character_semantics = 1;  # $text will be in characters
--        $text = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
-+        ($text, $charset) = $self->_normalize($text, $self->{charset}, 1); # bytes to chars
-+        $self->{charset} = $charset;
-+        $self->{language} = get_language($text, $charset);
-       } elsif (!defined $self->{charset} ||
-                $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
-         # With some luck input can be interpreted as UTF-8, do not warn.
-@@ -657,7 +665,9 @@
-     else {  # plain text
-       if ($self->{normalize} && $enc_utf8) {
-         # request transcoded result as UTF-8 octets!
--        $text = $self->_normalize($text, $self->{charset}, 0);
-+        ($text, $charset) = $self->_normalize($text, $self->{charset}, 0);
-+        $self->{charset} = $charset;
-+        $self->{language} = get_language($text, $charset);
-       }
-       $self->{rendered_type} = $self->{type};
-       $self->{rendered} = $self->{'visible_rendered'} = $text;
---- lib/Mail/SpamAssassin/Message.pm	2015-04-29 04:56:49.000000000 +0900
-+++ lib/Mail/SpamAssassin/Message.pm	2015-08-30 00:52:32.210000000 +0900
-@@ -627,6 +627,8 @@
-   delete $self->{'pristine_headers'};
-   delete $self->{'line_ending'};
-   delete $self->{'missing_head_body_separator'};
-+  delete $self->{'charset'};
-+  delete $self->{'language'};
- 
-   my @toclean = ( $self );
- 
-@@ -653,6 +655,8 @@
-     delete $part->{'invisible_rendered'};
-     delete $part->{'type'};
-     delete $part->{'rendered_type'};
-+    delete $self->{'charset'};
-+    delete $self->{'language'};
- 
-     # if there are children nodes, add them to the queue of nodes to clean up
-     if (exists $part->{'body_parts'}) {
-@@ -1143,6 +1147,9 @@
-   # whitespace handling (warning: small changes have large effects!)
-   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
- # $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
-+  if ($self->{normalize}) {
-+    $text =~ s/\xc2\xa0|\xe3\x80\x80/ /g; # whitespace (NBSP, ideographic space) => space
-+  }
-   $text =~ tr/ \t\n\r\x0b/ /s;		# whitespace (incl. VT) => space
-   $text =~ tr/\f/\n/;			# form feeds => newline
- 
-@@ -1235,6 +1242,27 @@
- }
- 
- # ---------------------------------------------------------------------------
-+
-+sub get_language {
-+  my ($self) = @_;
-+
-+  if (defined $self->{language}) { return $self->{language}; }
-+  my @parts = $self->find_parts(qr/^(?:text|message)\b/i,1);
-+  return '' unless @parts;
-+
-+  # Go through each part
-+  my @langs;
-+  for(my $pt = 0 ; $pt <= $#parts ; $pt++ ) {
-+    my $p = $parts[$pt];
-+    my $lang = $p->{language};
-+    next unless ($lang);
-+    push(@langs, $lang) unless (grep(/^$lang$/, @langs))
-+  }
-+  $self->{language} = scalar(@langs) ? join(' ', @langs) : '';
-+  return $self->{language};
-+}
-+
-+# ---------------------------------------------------------------------------
- 
- 1;
- 
---- lib/Mail/SpamAssassin/PerMsgStatus.pm	2015-04-29 04:56:49.000000000 +0900
-+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	2015-08-30 00:55:35.583000000 +0900
-@@ -55,6 +55,7 @@
- 
- use Errno qw(ENOENT);
- use Time::HiRes qw(time);
-+use Encode;
- 
- use Mail::SpamAssassin::Constants qw(:sa);
- use Mail::SpamAssassin::AsyncLoop;
-@@ -1053,12 +1054,32 @@
-   # the SpamAssassin report
-   my $report = $self->get_report();
- 
--  # If there are any wide characters, need to MIME-encode in UTF-8
--  # TODO: If $report_charset is something other than iso-8859-1/us-ascii, then
--  # we could try converting to that charset if possible
--  unless ($] < 5.008 || utf8::downgrade($report, 1)) {
-+  # decode to utf-8.
-+  my $is_utf8 = 0;
-+  if ($self->{conf}->{normalize_charset}) {
-+    $report = Encode::decode_utf8($report);
-+    $is_utf8 = 1;
-+  }
-+  else {
-+    if ($self->{msg}->{charset}) {
-+      eval {
-+        my $scratch = $report;
-+        $report = Encode::decode($self->{msg}->{charset},$scratch,Encode::FB_CROAK);
-+        $is_utf8 = 1;
-+      };
-+    }
-+  }
-+  # encode to report_charset. encode to utf-8 if charset conversion fail.
-+  if ($is_utf8) {
-+    eval {
-+      my $scratch = $report;
-+      $report = Encode::encode($self->{conf}->{report_charset},$scratch,Encode::FB_CROAK);
-+      $is_utf8 = 0;
-+    };
-+    if ($is_utf8) {
-+      $report = Encode::encode_utf8($report);
-       $report_charset = "; charset=utf-8";
--      utf8::encode($report);
-+    }
-   }
- 
-   # get original headers, "pristine" if we can do it
---- lib/Mail/SpamAssassin/Plugin/Bayes.pm	2015-04-29 04:56:47.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	2015-08-30 00:25:43.443000000 +0900
-@@ -70,6 +70,7 @@
-   $MARK_PRESENCE_ONLY_HDRS
-   %HEADER_NAME_COMPRESSION
-   $OPPORTUNISTIC_LOCK_VALID
-+  $SKIP_UTF8_SHORT_TOKENS_RE
- };
- 
- # Which headers should we scan for tokens?  Don't use all of them, as it's easy
-@@ -226,6 +227,15 @@
- # will require a longer token than English ones.)
- use constant MAX_TOKEN_LENGTH => 15;
- 
-+# Skip if a token is too short.
-+$SKIP_UTF8_SHORT_TOKENS_RE = qr{(?:
-+    [\x00-\x7F]                # 1 byte
-+  | [\xC0-\xDF][\x80-\xBF]     # 2 bytes
-+  | [\xE0-\xEF][\x80-\xBF]{2}  # 3 bytes
-+  | [\xF0-\xF7][\x80-\xBF]{3}  # 4 bytes
-+  | (?:\xE3[\x81-\x83][\x80-\xBF]){2} # 2 characters of Hiragana and Katakana
-+)}x;
-+
- ###########################################################################
- 
- sub new {
-@@ -1048,9 +1058,28 @@
-     $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
-   @{$msgdata->{bayes_token_uris}} =
-     $pms->get_uri_list() if $t_src->{uri};
-+  if ($self->{conf}->{normalize_charset}) {
-+    my $tokenizer = $self->get_tokenizer($pms);
-+    if (ref($tokenizer)) {
-+      $msgdata->{bayes_token_body} = $tokenizer->tokenize($msgdata->{bayes_token_body});
-+      $msgdata->{bayes_token_inviz} = $tokenizer->tokenize($msgdata->{bayes_token_inviz});
-+    }
-+  }
-   return $msgdata;
- }
- 
-+sub get_tokenizer {
-+  my ($self, $msg) = @_;
-+
-+  my $tokenizer;
-+  my @languages = split(/\s+/, $msg->{msg}->get_language());
-+  foreach my $lang (@languages) {
-+    $tokenizer = $self->{'conf'}->{'tokenizer'}->{$lang};
-+    last if (ref($tokenizer));
-+  }
-+  return $tokenizer;
-+}
-+
- ###########################################################################
- 
- # The calling functions expect a uniq'ed array of tokens ...
-@@ -1192,6 +1221,11 @@
-     next if $len < 3 ||
- 	($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
- 
-+    # Skip short UTF-8 tokens.
-+    if ($self->{conf}->{normalize_charset}) {
-+      next if ($token =~ /^$SKIP_UTF8_SHORT_TOKENS_RE$/o);
-+    }
-+
-     # are we in the body?  If so, apply some body-specific breakouts
-     if ($region == 1 || $region == 2) {
-       if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
-@@ -1222,14 +1256,16 @@
- 	}
-       }
- 
--      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
--	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
--	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
--	# to me! (jm)
--	while ($token =~ s/^(..?)//) {
--	  push (@rettokens, "8:$1");
-+      unless ($self->{conf}->{normalize_charset}) {
-+	if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
-+	  # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
-+	  # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
-+	  # to me! (jm)
-+	  while ($token =~ s/^(..?)//) {
-+	    push (@rettokens, "8:$1");
-+	  }
-+	  next;
- 	}
--	next;
-       }
- 
-       if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
---- lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm	2015-08-30 00:25:32.537000000 +0900
-@@ -0,0 +1,84 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+# 
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+# 
-+#     http://www.apache.org/licenses/LICENSE-2.0
-+# 
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Tokenizer::MeCab - Japanese tokenizer with MeCab
-+
-+=head1 SYNOPSIS
-+
-+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab
-+
-+=head1 DESCRIPTION
-+
-+This plugin tokenizes a Japanese string with MeCab that is 
-+the morphological analysis engine. 
-+
-+Text::MeCab 0.12 or over is required.
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;
-+
-+use strict;
-+use warnings;
-+use Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+
-+# Have to do this so that RPM doesn't find these as required perl modules
-+BEGIN { require MeCab; }
-+our $language = 'ja';
-+our $mecab = new MeCab::Tagger(-Ochasen);
-+
-+sub new {
-+  my $class = shift;
-+  my $mailsaobject = shift;
-+
-+  $class = ref($class) || $class;
-+  my $self = $class->SUPER::new($mailsaobject, $language);
-+  bless ($self, $class);
-+
-+  return $self;
-+}
-+
-+sub tokenize {
-+  my $self = shift;
-+  my $text_array = shift;
-+
-+  my @tokenized_array;
-+  foreach my $text (@$text_array) {
-+    next unless ($text);
-+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
-+    push(@tokenized_array, $text);
-+  }
-+  return \@tokenized_array;
-+}
-+
-+sub _tokenize {
-+  my $text = shift;
-+
-+  my @buf;
-+  for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
-+    push(@buf, $node->{surface});
-+  }
-+  my $tokenized = join(' ', @buf) . ' ';
-+  return $tokenized;
-+}
-+
-+1;
-+
---- lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm	2015-08-30 00:25:32.538000000 +0900
-@@ -0,0 +1,111 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+# 
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+# 
-+#     http://www.apache.org/licenses/LICENSE-2.0
-+# 
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Tokenizer::SimpleJA - simple Japanese tokenizer
-+
-+=head1 SYNOPSIS
-+
-+loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
-+
-+=head1 DESCRIPTION
-+
-+This plugin simply tokenizes a Japanese string by characters other than 
-+the alphabet, the Chinese character, and the katakana. 
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;
-+
-+use strict;
-+use warnings;
-+use Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+
-+our $language = 'ja';
-+
-+our $RE = qr{(
-+  # Hiragana
-+    (?:
-+        \xE3\x81[\x80-\xBF]
-+      | \xE3\x82[\x80-\x9F]
-+    )+
-+  # Katakana
-+  | (?:
-+        \xE3\x82[\xA0-\xBF]
-+      | \xE3\x83[\x80-\xBF]
-+    )+
-+  # Kanji
-+  | (?:
-+        \xE3[\x90-\xBF][\x80-\xBF]
-+      | [\xE4-\xE9][\x80-\xBF]{2}
-+      | \xEF[\xA4-\xAB][\x80-\xBF]
-+    )+
-+  # Fullwidth
-+  | (?:
-+        \xEF\xBC[\x80-\xBF]
-+      | \xEF\xBD[\x80-\x9F]
-+    )+
-+  # Others
-+  | [\xC0-\xDF][\x80-\xBF]
-+  | [\xE0-\xE2][\x80-\xBF]{2}
-+  | \xE3\x80[\x80-\xBF]
-+  | \xE3[\x84-\x8F][\x80-\xBF]
-+  | [\xEA-\xEE][\x80-\xBF]{2}
-+  | \xEF[\x80-\xA3][\x80-\xBF]
-+  | \xEF[\xAC-\xBB][\x80-\xBF]
-+  | \xEF\xBD[\xA0-\xBF]
-+  | \xEF[\xBE-\xBF][\x80-\xBF]
-+  | [\xF0-\xF7][\x80-\xBF]{3}
-+)}x;
-+
-+sub new {
-+  my $class = shift;
-+  my $mailsaobject = shift;
-+
-+  $class = ref($class) || $class;
-+  my $self = $class->SUPER::new($mailsaobject, $language);
-+  bless ($self, $class);
-+
-+  return $self;
-+}
-+
-+sub tokenize {
-+  my $self = shift;
-+  my $text_array = shift;
-+
-+  my @tokenized_array;
-+  foreach my $text (@$text_array) {
-+    next unless ($text);
-+    $text =~ s/([\x80-\xFF]{3,})/&_tokenize($1)/eg;
-+    push(@tokenized_array, $text);
-+  }
-+  return \@tokenized_array;
-+}
-+
-+sub _tokenize {
-+  my $text = shift;
-+
-+  $text =~ s/$RE/$1 /og;
-+  $text = ' ' . $text;
-+  return $text;
-+}
-+
-+1;
-+
---- lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Plugin/Tokenizer.pm	2015-08-30 00:25:32.538000000 +0900
-@@ -0,0 +1,115 @@
-+# <@LICENSE>
-+# Copyright 2004 Apache Software Foundation
-+# 
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+# 
-+#     http://www.apache.org/licenses/LICENSE-2.0
-+# 
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+=head1 NAME
-+
-+Mail::SpamAssassin::Plugin::Tokenizer - Tokenizer plugin base class
-+
-+=head1 SYNOPSIS
-+
-+=head2 SpamAssassin configuration:
-+
-+  loadplugin MyTokenizerPlugin /path/to/MyTokenizerPlugin.pm
-+
-+=head2 Perl code:
-+
-+  use Mail::SpamAssassin::Plugin::Tokenizer;
-+  use vars qw(@ISA);
-+  @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);
-+  # language to use this plugin
-+  our $language = 'ja';
-+
-+  # constructor: register language
-+  sub new {
-+    my $class = shift;
-+    my $mailsaobject = shift;
-+
-+    # some boilerplate...
-+    $class = ref($class) || $class;
-+    my $self = $class->SUPER::new($mailsaobject, $language);
-+    bless ($self, $class);
-+
-+    return $self;
-+  }
-+
-+  # tokenize function
-+  sub tokenize {
-+    my $self = shift;
-+    my $text_array_ref = shift;
-+
-+    ......
-+
-+    return $tokenized_array_ref;
-+  }
-+
-+
-+=head1 DESCRIPTION
-+
-+This plugin is the base class of tokenizer plugin.
-+You must define tokenize() and $language
-+
-+=head1 INTERFACE
-+
-+  sub tokenize {
-+    my $self = shift;
-+    my $text_array_ref = shift;
-+
-+    ......
-+ 
-+    return $tokenized_array_ref;
-+  }
-+
-+=cut
-+
-+package Mail::SpamAssassin::Plugin::Tokenizer;
-+
-+use Mail::SpamAssassin::Plugin;
-+use Mail::SpamAssassin::Logger;
-+use strict;
-+use warnings;
-+use bytes;
-+
-+use vars qw(@ISA);
-+@ISA = qw(Mail::SpamAssassin::Plugin);
-+
-+sub new {
-+  my $class = shift;
-+  my $mailsaobject = shift;
-+  my $language = shift;
-+
-+  # some boilerplate...
-+  $class = ref($class) || $class;
-+  my $self = $class->SUPER::new($mailsaobject);
-+  bless ($self, $class);
-+
-+  if ($language) {
-+    $self->{main}->{conf}->{tokenizer}->{$language} = $self;
-+  }
-+  else {
-+    dbg("plugin: $self: \$language is not defined");
-+  }
-+
-+  return $self;
-+}
-+
-+sub tokenize {
-+  my ($self, $ref) = @_;
-+
-+  return $ref;
-+}
-+
-+1;
-+
---- lib/Mail/SpamAssassin/Util/Charset.pm	1970-01-01 09:00:00.000000000 +0900
-+++ lib/Mail/SpamAssassin/Util/Charset.pm	2015-08-30 00:25:32.539000000 +0900
-@@ -0,0 +1,473 @@
-+# <@LICENSE>
-+# Copyright 2006 Apache Software Foundation
-+#
-+# Licensed under the Apache License, Version 2.0 (the "License");
-+# you may not use this file except in compliance with the License.
-+# You may obtain a copy of the License at
-+#
-+#     http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing, software
-+# distributed under the License is distributed on an "AS IS" BASIS,
-+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+# See the License for the specific language governing permissions and
-+# limitations under the License.
-+# </@LICENSE>
-+
-+
-+=head1 NAME
-+
-+  Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language
-+
-+=head1 SYNOPSIS
-+
-+  my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
-+  my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);
-+
-+=head1 DESCRIPTION
-+
-+This module implements utility methods for charset and language.
-+
-+=cut
-+
-+package Mail::SpamAssassin::Util::Charset;
-+
-+use strict;
-+use warnings;
-+use Encode;
-+use Encode::Guess;
-+use Encode::Alias;
-+
-+use vars qw (
-+  @ISA @EXPORT
-+);
-+
-+require Exporter;
-+
-+@ISA = qw(Exporter);
-+@EXPORT = qw(normalize_charset get_language);
-+
-+###########################################################################
-+
-+use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
-+use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
-+use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };
-+
-+###########################################################################
-+
-+our $KANA_HAN_RE = qr{
-+  # Hiragana and Katakana
-+    \xE3[\x81-\x83][\x80-\xBF]
-+  # Han
-+  | \xE3[\x90-\xBF][\x80-\xBF]
-+  | [\xE4-\xE9][\x80-\xBF]{2}
-+  | \xEF[\xA4-\xAB][\x80-\xBF]
-+}x;
-+
-+our %enc2lang;
-+our %lang2enc;
-+our %scr2lang;
-+our %cjkscr2lang;
-+our @scrorder;
-+
-+BEGIN {
-+
-+  # See the following URL about this map:
-+  #   http://czyborra.com/charsets/iso8859.html
-+  #   http://czyborra.com/charsets/codepages.html
-+  #   http://czyborra.com/charsets/cyrillic.html
-+  #   http://en.wikipedia.org/wiki/ISO_8859
-+  #   http://www.w3.org/International/O-charset-lang.html
-+  %enc2lang = (
-+    # buint-in Encodings and Encode::Byte
-+    #   N. America
-+    'ascii'         => 'en',
-+    'cp437'         => 'en',
-+    'cp863'         => 'weurope',
-+
-+    #   W. Europe (Latin1, Latin9)
-+    #       fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
-+    'iso-8859-1'    => 'weurope',
-+    'iso-8859-15'   => 'weurope',
-+    'cp850'         => 'weurope',
-+    'cp860'         => 'weurope',
-+    'cp1252'        => 'weurope',
-+    'MacRoman'      => 'weurope',
-+
-+    #   Cntrl. Europe / Latin2 / Latin10
-+    #       hr cs hu pl sr sk sl
-+    'iso-8859-2'    => 'ceurope',
-+    'cp852'         => 'ceurope',
-+    'cp1250'        => 'ceurope',
-+    'MacCentralEurRoman' => 'ceurope',
-+    'MacCroatian'   => 'ceurope',
-+    'iso-8859-16'   => 'ceurope',
-+    'MacRomanian'   => 'ceurope',
-+
-+    #   Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
-+    #       eo mt
-+    'iso-8859-3'    => 'seurope',
-+
-+    #   Baltics (Latin4, Latin7)
-+    #       lv lt
-+    'iso-8859-4'    => 'neurope',
-+    'iso-8859-13'   => 'baltic',
-+    'cp1257'        => 'baltic',
-+
-+    #   Nordics (Latin6)
-+    #       et kl iu se
-+    'iso-8859-10'   => 'nordic',
-+
-+    #   Cyrillics
-+    #       bg be uk sr mk ru
-+    'iso-8859-5'    => 'ru',
-+    'cp855'         => 'ru',
-+    'cp1251'        => 'ru',
-+    'cp866'         => 'ru',
-+    'MacCyrillic'   => 'ru',
-+    'koi8-r'        => 'ru',
-+    'MacUkrainian'  => 'uk',
-+    'koi8-u'        => 'uk',
-+
-+    #   Arabic
-+    'iso-8859-6'    => 'ar',
-+    'cp864'         => 'ar',
-+    'cp1256'        => 'ar',
-+    'MacArabic'     => 'ar',
-+    'cp1006'        => 'fa',
-+    'MacFarsi'      => 'fa',
-+
-+    #   Greek
-+    'iso-8859-7'    => 'el',
-+    'cp1253'        => 'el',
-+    'MacGreek'      => 'el',
-+
-+    #   Hebrew
-+    #       he yi
-+    'iso-8859-8'    => 'he',
-+    'cp862'         => 'he',
-+    'cp1255'        => 'he',
-+    'MacHebrew'     => 'he',
-+
-+    #   Turkish
-+    'iso-8859-9'    => 'tr',
-+    'cp857'         => 'tr',
-+    'cp1254'        => 'tr',
-+    'MacTurkish'    => 'tr',
-+
-+    #   Thai
-+    'iso-8859-11'   => 'th',
-+    'cp874'         => 'th',
-+
-+    #   Celtics (Latin8)
-+    #       gd cy br
-+    'iso-8859-14'   => 'celtic',
-+
-+    #   Vietnamese
-+    'viscii'        => 'vi',
-+    'cp1258'        => 'vi',
-+
-+    # Encode::CN
-+    'euc-cn'        => 'zh',
-+    'cp936'         => 'zh',
-+    'hz'            => 'zh',
-+
-+    # Encode::TW
-+    'big5-eten'     => 'zh',
-+    'big5-hkscs'    => 'zh',
-+    'cp950'         => 'zh',
-+
-+    # Encode::JP
-+    'euc-jp'        => 'ja',
-+    'shiftjis'      => 'ja',
-+    '7bit-jis'      => 'ja',
-+    'iso-2022-jp'   => 'ja',
-+    'iso-2022-jp-1' => 'ja',
-+    'cp932'         => 'ja',
-+
-+    # Encode::KR
-+    'euc-kr'        => 'ko',
-+    'cp949'         => 'ko',
-+    'johab'         => 'ko',
-+    'iso-2022-kr'   => 'ko',
-+
-+    # Encode::HanExtra
-+    'euc-tw'        => 'zh',
-+    'gb18030'       => 'zh',
-+
-+    # Encode::JIS2K
-+    'euc-jisx0213'  => 'ja',
-+    'shiftjisx0123' => 'ja',
-+    'iso-2022-jp-3' => 'ja',
-+
-+    # Encode::EUCJPMS
-+    'eucJP-ms'      => 'ja',
-+    'cp51932'       => 'ja',
-+    'cp50220'       => 'ja',
-+    'cp50221'       => 'ja',
-+
-+  );
-+
-+  %lang2enc = (
-+    # Latin1
-+    'en' => ['ascii'],
-+    'weurope' => ['cp1252'],
-+
-+    # Latin2
-+    'ceurope' => ['cp1250'],
-+
-+    # Latin3
-+    'seurope' => ['iso-8859-3'],
-+
-+    # Latin4
-+    'neurope' => ['iso-8859-4'],
-+
-+    # Latin5
-+    'tr' => ['cp1254'],
-+
-+    # Latin6
-+    'nordic' => ['iso-8859-10'],
-+
-+    # Latin7
-+    'baltic' => ['cp1257'],
-+
-+    # Latin8
-+    'celtic' => ['iso-8859-14'],
-+
-+    # Non Latin
-+    'ru' => ['koi8-r', 'cp1251'],
-+    'uk' => ['koi8-u'],
-+
-+    'ar' => ['cp1256'],
-+    'el' => ['cp1253'],
-+    'he' => ['cp1255'],
-+    'th' => ['cp874'],
-+    'vi' => ['viscii', 'cp1258'],
-+    'zh' => ['euc-cn', 'cp950'],
-+    'ja' => ['euc-jp', 'cp932'],
-+    'ko' => ['euc-kr', 'cp949'],
-+
-+  );
-+
-+  %scr2lang = (
-+    'InLatin1Supplement' => ['weurope'],
-+    'InLatinExtendedA' => [
-+      'ceurope',
-+      'seurope',
-+      'tr',
-+      'vi'
-+    ],
-+    'InLatinExtendedB' => [
-+      'nordic',
-+      'baltic',
-+      'celtic'
-+    ],
-+    'Thai'   => ['th'],
-+    'Cyrillic' => ['ru', 'uk'],
-+    'Arabic' => ['ar'],
-+    'Greek'  => ['el'],
-+    'Hebrew' => ['he'],
-+  );
-+
-+  # better detection for CJK
-+  @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang));
-+  %cjkscr2lang = (
-+    'Hiragana' => ['ja'],
-+    'Katakana' => ['ja'],
-+    'Hangul' => ['ko'],
-+    'Han'    => ['zh', 'ja', 'ko'],
-+  );
-+
-+  unless (HAS_ENCODE_HANEXTRA) {
-+    Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
-+  }
-+  Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
-+  Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
-+  Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
-+  Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
-+  if (HAS_ENCODE_EUCJPMS) {
-+    Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
-+    Encode::Alias::define_alias( qr/^euc-jp$/i => ' "cp51932"' );
-+  }
-+}
-+
-+sub get_language {
-+  my $str = shift; # $str must be UTF-8 encoding
-+  my $charset = shift;
-+
-+  return 'en' unless $charset;
-+  if ($charset !~ /^utf/i) {
-+    return $enc2lang{$charset};
-+  } elsif (defined($str)) {
-+    $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
-+    return 'en' if ($str eq '');
-+
-+    my %handled;
-+    $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
-+    foreach my $scr (@scrorder) {
-+      next if ($str !~ /\p{$scr}/);
-+      my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
-+      foreach my $lang (@$scrlangs) {
-+        next if (exists($handled{$lang}));
-+        foreach my $enc (@{$lang2enc{$lang}}) {
-+          my $scratch = $str;
-+          Encode::encode($enc, $scratch, Encode::FB_QUIET);
-+          return $lang if ($scratch eq '');
-+        }
-+        $handled{$lang} = 1;
-+      }
-+    }
-+  } 
-+  return 'en';
-+}
-+
-+# TEST 1: try conversion to use the specified charset. 
-+# TEST 2: try conversion to use Encode::Detect.
-+# TEST 3: try conversion to use Encode::Guess.
-+sub normalize_charset {
-+  my $str = shift;
-+  my $charset = shift;
-+  my $return_decoded = shift;
-+
-+  return wantarray ? ($str, 'ascii')  : $str unless ($str);
-+
-+  my $decoded;
-+  my $detected;
-+
-+  if ($charset) {
-+    ($decoded, $detected) = _specified_encoding($str, $charset);
-+  }
-+  unless ($detected) {
-+    ($decoded, $detected) = _encode_detect($str);
-+  }
-+  unless ($detected) {
-+    ($decoded, $detected) = _encode_guess($str);
-+  }
-+  unless ($detected) {
-+    return ($str, undef);
-+  }
-+  $decoded =~ s/^\x{feff}//g;
-+  $decoded = Encode::encode_utf8($decoded) if $return_decoded;
-+
-+  # unfold hiragana, katakana and han
-+  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$|CP50220|CP50221$)/i) {
-+    $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
-+  }
-+  return wantarray ? ($decoded, $detected) : $decoded;
-+}
-+
-+sub _specified_encoding {
-+  my $str = shift;
-+  my $encoding = shift;
-+
-+  my $detected;
-+  my $decoded;
-+
-+  return (undef, undef) unless ($encoding);
-+
-+  # note: ISO-2022-* is not deistinguish from US-ASCII
-+  return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);
-+
-+  # UTF-16|32 encoding without BOM cannot be trusted.
-+  return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
-+  return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
-+
-+  $encoding = _get_alias($encoding);
-+  my $encoder = Encode::find_encoding($encoding);
-+  if (ref($encoder)) {
-+    $decoded = $encoder->decode($str,Encode::FB_QUIET);
-+    $detected = $encoder->name if ($str eq '');
-+  }
-+  return ($decoded, $detected);
-+}
-+
-+sub _encode_detect {
-+  return undef unless HAS_ENCODE_DETECT;
-+  my $str = shift;
-+
-+  # UTF-16|32 encoding without BOM cannot be trusted.
-+  return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
-+  return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);
-+
-+  my $decoded;
-+  my $detected = Encode::Detect::Detector::detect($str);
-+  if ($detected) {
-+    $detected = _get_alias($detected);
-+    my $encoder = Encode::find_encoding($detected);
-+    if (ref($encoder)) {
-+      $decoded = $encoder->decode($str);
-+      $detected = $decoded ? $encoder->name : undef;
-+    }
-+    else {
-+      $detected = undef;
-+    }
-+  }
-+  return ($decoded, $detected);
-+}
-+
-+sub _encode_guess {
-+  my $str = shift;
-+
-+  my $detected;
-+  my $decoded;
-+  my $encoder;
-+
-+  # Step 1: Examine ISO-2022-*.
-+  if ($str =~ /\e/) {
-+    $Encode::Guess::NoUTFAutoGuess = 1;
-+    $encoder = Encode::Guess::guess_encoding($str,
-+        qw/cp50221 7bit-jis iso-2022-kr/);
-+    $Encode::Guess::NoUTFAutoGuess = 0;
-+  }
-+
-+  # Step 2: Examine US-ASCII/UTF-(8|16|32)
-+  unless (ref($encoder)) {
-+    $Encode::Guess::NoUTFAutoGuess = 0;
-+    $encoder = Encode::Guess::guess_encoding($str);
-+  }
-+
-+  # Step 3: Examine other encodings
-+  unless (ref($encoder)) {
-+    $Encode::Guess::NoUTFAutoGuess = 1;
-+    eval {
-+      if ($str =~ /[\x80-\xFF]{4}/) {
-+        $encoder = Encode::Guess::guess_encoding($str,
-+          qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
-+      }
-+      else {
-+        $encoder = Encode::Guess::guess_encoding($str,
-+          qw/iso-8859-1 cp1252/);
-+      }
-+    };
-+    $Encode::Guess::NoUTFAutoGuess = 0;
-+  }
-+  if (ref($encoder)) {
-+    $detected = $encoder->name;
-+    if ($detected) {
-+      $decoded = $encoder->decode($str);
-+    }
-+  }
-+  return ($decoded, $detected);
-+}
-+
-+sub _get_alias {
-+  my $encoding = shift;
-+
-+  unless (HAS_ENCODE_HANEXTRA) {
-+    $encoding =~ s/^gb18030$/euc-cn/i;
-+  }
-+  $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
-+  $encoding =~ s/^TIS-620$/iso-8859-11/i;
-+  $encoding =~ s/x-mac-(.+)$/Mac$1/i;
-+  $encoding =~ s/^Shift_JIS$/cp932/i;
-+  if (HAS_ENCODE_EUCJPMS) {
-+    $encoding =~ s/^iso-2022-jp$/cp50221/i;
-+    $encoding =~ s/^euc-jp$/cp51932/i;
-+  }
-+
-+  return $encoding;
-+}
-+
-+
-+1;
-+
diff --git a/japanese/spamassassin/files/spamassassin-ja.plist b/japanese/spamassassin/files/spamassassin-ja.plist
deleted file mode 100644
index acfde3df93a1..000000000000
--- a/japanese/spamassassin/files/spamassassin-ja.plist
+++ /dev/null
@@ -1,9 +0,0 @@
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
-%%SITE_PERL%%/Mail/SpamAssassin/Plugin/Tokenizer.pm
-%%SITE_PERL%%/Mail/SpamAssassin/Util/Charset.pm
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::MeCab.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA.3.gz
-%%PERL5_MAN3%%/Mail::SpamAssassin::Util::Charset.3.gz
-@sample %%ETCDIR%%/%%TOKENIZER_PRE%%.sample
diff --git a/japanese/spamassassin/files/tokenizer.pre b/japanese/spamassassin/files/tokenizer.pre
deleted file mode 100644
index d21410bbadc9..000000000000
--- a/japanese/spamassassin/files/tokenizer.pre
+++ /dev/null
@@ -1,8 +0,0 @@
-
-# Tokenizer::SimpleJA
-#
-loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA
-
-# Tokenizer::MeCab
-#
-#loadplugin Mail::SpamAssassin::Plugin::Tokenizer::MeCab
diff --git a/japanese/spamassassin/pkg-message b/japanese/spamassassin/pkg-message
deleted file mode 100644
index 49cea24cb5a8..000000000000
--- a/japanese/spamassassin/pkg-message
+++ /dev/null
@@ -1,11 +0,0 @@
-
-************************************************************************
-For Japanese users, see documents in
-http://emaillab.jp/spamassassin/ja-patch/
-
-Tokenizer::MeCab uses UTF-8 encoding.  You may have to manually
-(re)install the following ports with the build options for UTF-8:
-	japanese/mecab		WITH_CHARSET=utf-8
-	japanese/mecab-ipadic	WITH_CHARSET=utf-8
-	japanese/p5-MeCab
-************************************************************************
author	Rene Ladan <rene@FreeBSD.org>	2018-12-31 13:37:58 +0000
committer	Rene Ladan <rene@FreeBSD.org>	2018-12-31 13:37:58 +0000
commit	d1a5353b9cccb568b52fb525175941ad0b8a0345 (patch)
tree	25abb8f5b9cdfd31fb9d446791a87606bf77ff0b /japanese
parent	d460c1a711a6e1319160cba021ecc281f1e4fc70 (diff)
download	ports-d1a5353b9cccb568b52fb525175941ad0b8a0345.tar.gz ports-d1a5353b9cccb568b52fb525175941ad0b8a0345.zip