# <@LICENSE> # Copyright 2006 Apache Software Foundation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # =head1 NAME Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language =head1 SYNOPSIS my ($detected, $decoded) = Mail::SpamAssassin::Util::Charset::normalize_charset($charset, $str); my $language = Mail::SpamAssassin::Util::Charset::get_language($charset, $str); =head1 DESCRIPTION This module implements utility methods for charset and language. =cut #package Mail::SpamAssassin::Util::Charset; package Charset; use strict; use warnings; use vars qw ( @ISA @EXPORT ); require Exporter; @ISA = qw(Exporter); @EXPORT = qw(normalize_charset get_language); sub dbg { my $message = shift; print STDERR "$message\n"; } ########################################################################### use constant HAS_ENCODE => eval { require Encode; require Encode::Guess; require Encode::Alias; }; use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; }; use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; }; ########################################################################### our %enc2lang; our %lang2enc; our %scr2lang; our %cjkscr2lang; our @scrorder; BEGIN { # See the following URL about this map: # http://czyborra.com/charsets/iso8859.html # http://czyborra.com/charsets/codepages.html # http://czyborra.com/charsets/cyrillic.html # http://en.wikipedia.org/wiki/ISO_8859 # http://www.w3.org/International/O-charset-lang.html %enc2lang = ( # buint-in Encodings and Encode::Byte # N. America 'ascii' => 'en', 'cp437' => 'en', 'cp863' => 'weurope', # W. Europe (Latin1, Latin9) # fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af 'iso-8859-1' => 'weurope', 'iso-8859-15' => 'weurope', 'cp850' => 'weurope', 'cp860' => 'weurope', 'cp1252' => 'weurope', 'MacRoman' => 'weurope', # Cntrl. Europe / Latin2 / Latin10 # hr cs hu pl sr sk sl 'iso-8859-2' => 'ceurope', 'cp852' => 'ceurope', 'cp1250' => 'ceurope', 'MacCentralEurRoman' => 'ceurope', 'MacCroatian' => 'ceurope', 'iso-8859-16' => 'ceurope', 'MacRomanian' => 'ceurope', # Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.) # eo mt 'iso-8859-3' => 'seurope', # Baltics (Latin4, Latin7) # lv lt 'iso-8859-4' => 'neurope', 'iso-8859-13' => 'baltic', 'cp1257' => 'baltic', # Nordics (Latin6) # et kl iu se 'iso-8859-10' => 'nordic', # Cyrillics # bg be uk sr mk ru 'iso-8859-5' => 'ru', 'cp855' => 'ru', 'cp1251' => 'ru', 'cp866' => 'ru', 'MacCyrillic' => 'ru', 'koi8-r' => 'ru', 'MacUkrainian' => 'uk', 'koi8-u' => 'uk', # Arabic 'iso-8859-6' => 'ar', 'cp864' => 'ar', 'cp1256' => 'ar', 'MacArabic' => 'ar', 'cp1006' => 'fa', 'MacFarsi' => 'fa', # Greek 'iso-8859-7' => 'el', 'cp1253' => 'el', 'MacGreek' => 'el', # Hebrew # he yi 'iso-8859-8' => 'he', 'cp862' => 'he', 'cp1255' => 'he', 'MacHebrew' => 'he', # Turkish 'iso-8859-9' => 'tr', 'cp857' => 'tr', 'cp1254' => 'tr', 'MacTurkish' => 'tr', # Thai 'iso-8859-11' => 'th', 'cp874' => 'th', # Celtics (Latin8) # gd cy br 'iso-8859-14' => 'celtic', # Vietnamese 'viscii' => 'vi', 'cp1258' => 'vi', # Encode::CN 'euc-cn' => 'zh', 'cp936' => 'zh', 'hz' => 'zh', # Encode::TW 'big5-eten' => 'zh', 'big5-hkscs' => 'zh', 'cp950' => 'zh', # Encode::JP 'euc-jp' => 'ja', 'shiftjis' => 'ja', '7bit-jis' => 'ja', 'iso-2022-jp' => 'ja', 'iso-2022-jp-1' => 'ja', 'cp932' => 'ja', # Encode::KR 'euc-kr' => 'ko', 'cp949' => 'ko', 'johab' => 'ko', 'iso-2022-kr' => 'ko', # Encode::HanExtra 'euc-tw' => 'zh', 'gb18030' => 'zh', # Encode::JIS2K 'euc-jisx0213' => 'ja', 'shiftjisx0123' => 'ja', 'iso-2022-jp-3' => 'ja', ); %lang2enc = ( # Latin1 'en' => ['ascii'], 'weurope' => ['cp1252'], # Latin2 'ceurope' => ['cp1250'], # Latin3 'seurope' => ['iso-8859-3'], # Latin4 'neurope' => ['iso-8859-4'], # Latin5 'tr' => ['cp1254'], # Latin6 'nordic' => ['iso-8859-10'], # Latin7 'baltic' => ['cp1257'], # Latin8 'celtic' => ['iso-8859-14'], # Non Latin 'ru' => ['koi8-r', 'cp1251'], 'uk' => ['koi8-u'], 'ar' => ['cp1256'], 'el' => ['cp1253'], 'he' => ['cp1255'], 'th' => ['cp874'], 'vi' => ['viscii', 'cp1258'], 'zh' => ['euc-cn', 'cp950'], 'ja' => ['euc-jp', 'cp932'], 'ko' => ['euc-kr', 'cp949'], ); %scr2lang = ( 'InLatin1Supplement' => ['weurope'], 'InLatinExtendedA' => [ 'ceurope', 'seurope', 'tr', 'vi' ], 'InLatinExtendedB' => [ 'nordic', 'baltic', 'celtic' ], 'Thai' => ['th'], 'Cyrillic' => ['ru', 'uk'], 'Arabic' => ['ar'], 'Greek' => ['el'], 'Hebrew' => ['he'], ); # better detection for CJK @scrorder = ('Hiragana','Katakana','Hangul','Han',keys(%scr2lang)); %cjkscr2lang = ( 'Hiragana' => ['ja'], 'Katakana' => ['ja'], 'Hangul' => ['ko'], 'Han' => ['zh', 'ja', 'ko'], ); if (HAS_ENCODE) { unless (HAS_ENCODE_HANEXTRA) { Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' ); } Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' ); Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' ); Encode::Alias::define_alias( qr/^Shift_JIS$/ => ' "cp932"' ); } } sub get_language { my $charset = shift; my $str = shift; # $str must be UTF-8 encoding return 'en' unless HAS_ENCODE; return 'en' unless $charset; if ($charset !~ /^utf/i) { return $enc2lang{$charset}; } elsif (defined($str)) { $str =~ s/[\x00-\x7F]//g; # remove ASCII characters return 'en' if ($str eq ''); my %handled; $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str)); foreach my $scr (@scrorder) { next if ($str !~ /\p{$scr}/); my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr}; foreach my $lang (@$scrlangs) { next if (exists($handled{$lang})); foreach my $enc (@{$lang2enc{$lang}}) { my $scratch = $str; Encode::encode($enc, $scratch, Encode::FB_QUIET); return $lang if ($scratch eq ''); } $handled{$lang} = 1; } } } return 'en'; } # TEST 1: try conversion to use the specified charset. # TEST 2: try conversion to use Encode::Detect. # TEST 3: try conversion to use Encode::Guess. sub normalize_charset { my $charset = shift; my $str = shift; return ($charset, $str) unless HAS_ENCODE; return ('ascii', $str) unless ($str); my $decoded; my $detected; if ($charset) { ($detected, $decoded) = _specified_encoding($charset, $str); } unless ($detected) { ($detected, $decoded) = _encode_detect($str); } unless ($detected) { ($detected, $decoded) = _encode_guess($charset, $str); } unless ($detected) { return (undef, $str); } # unfold hiragana, katakana and han if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949$)/i) { $decoded =~ s/([\x{3040}-\x{30FF}\x{3400}-\x{9FFF}\x{F900}-\x{FAFF}])\r?\n([\x{3040}-\x{30FF}\x{3400}-\x{9FFF}\x{F900}-\x{FAFF}])/$1$2/g; } return ($detected, Encode::encode_utf8($decoded)); } sub _specified_encoding { my $encoding = shift; my $str = shift; my $detected; my $decoded; # note: ISO-2022-* is not deistinguish from US-ASCII if ($encoding and ($str !~ /\e/ or $encoding =~ /^ISO-2022/i)) { my $encoder = Encode::find_encoding($encoding); if (ref($encoder)) { $decoded = $encoder->decode($str,Encode::FB_QUIET); $detected = $encoder->name if ($str eq ''); dbg("charset: the specified charset $encoding detected $detected") if ($str eq ''); } } return ($detected, $decoded); } sub _encode_detect { return undef unless HAS_ENCODE_DETECT; my $str = shift; my $decoded; my $detected = Encode::Detect::Detector::detect($str); if ($detected) { dbg("charset: Encode::Detect::Detector::detect detected $detected"); my $encoder = Encode::find_encoding($detected); if (ref($encoder)) { $decoded = $encoder->decode($str); $detected = $decoded ? $encoder->name : undef; } else { $detected = undef; } } return ($detected, $decoded); } sub _encode_guess { my $encoding = shift; my $str = shift; my $detected; my $decoded; my $encoder; if ($encoding) { $encoding = Encode::resolve_alias($encoding); if ($encoding) { $encoder = Encode::Guess::guess_encoding($str, $encoding); } } unless (ref($encoder)) { if ($str =~ /\e/) { $encoder = Encode::Guess::guess_encoding($str, qw/7bit-jis iso-2022-kr/); } elsif ($str =~ /[\x80-\xFF]{4}/) { $encoder = Encode::Guess::guess_encoding($str, qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/); } else { $encoder = Encode::Guess::guess_encoding($str, qw/iso-8859-1 cp1252/); } } if (ref($encoder)) { $detected = $encoder->name; if ($detected =~ /^UTF-(?:16|32)[BL]E$/i) { # The guessed UTF-16|32 encoding without BOM cannot be trusted. $detected = undef; } if ($detected) { $decoded = $encoder->decode($str); dbg("charset: Encode::Guess::guess_encoding detected $detected"); } } return ($detected, $decoded); } 1;