# Japanese dictionary-order sort
#
# 0. ignore non-kana (including the punctuation that lives inside the
#    Katakana block: U+30A0 double hyphen and U+30FB middle dot)
# 1. fold katakana to hiragana (if identical, hiragana comes first)
# 2. replace tsu characters with base + N,
#    where N is 0 for standard, 1 for small, 2 for voiced
# 3. replace voice-able chars with base + N
#    (0 = plain, 1 = voiced, 2 = semi-voiced).
#
# bydict is a sort function for arrays of hashes, each of which
# is assumed to contain a key SORTKEY, consisting of a [jstring,num]
# pair.  All non-hiragana, non-katakana characters in jstring are
# ignored, the above modifications are applied, and a key named
# _sortkey is created, which contains the processed output.  During
# comparison, num is used as the tie-breaker, so that there's some
# consistent way to order common words like "ai".
#
# bydict reads its operands from $main::a and $main::b, so it is meant
# to be named in a sort() call made from package main:
#
#     my @sorted = sort JDictSort::bydict @entries;
#
use strict;
use warnings;

package JDictSort;

# Maps a hiragana character to [base character, mark digit].  Only
# characters that take part in voicing (plus the tsu family) are listed;
# every other kana is its own base and contributes no digit.
my %BASE_AND_MARK = (
    "\x{304b}" => ["\x{304b}", "0"],    # ka
    "\x{304c}" => ["\x{304b}", "1"],
    "\x{304d}" => ["\x{304d}", "0"],    # ki
    "\x{304e}" => ["\x{304d}", "1"],
    "\x{304f}" => ["\x{304f}", "0"],    # ku
    "\x{3050}" => ["\x{304f}", "1"],
    "\x{3051}" => ["\x{3051}", "0"],    # ke
    "\x{3052}" => ["\x{3051}", "1"],
    "\x{3053}" => ["\x{3053}", "0"],    # ko
    "\x{3054}" => ["\x{3053}", "1"],
    "\x{3055}" => ["\x{3055}", "0"],    # sa
    "\x{3056}" => ["\x{3055}", "1"],
    "\x{3057}" => ["\x{3057}", "0"],    # shi
    "\x{3058}" => ["\x{3057}", "1"],
    "\x{3059}" => ["\x{3059}", "0"],    # su
    "\x{305a}" => ["\x{3059}", "1"],
    "\x{305b}" => ["\x{305b}", "0"],    # se
    "\x{305c}" => ["\x{305b}", "1"],
    "\x{305d}" => ["\x{305d}", "0"],    # so
    "\x{305e}" => ["\x{305d}", "1"],
    "\x{305f}" => ["\x{305f}", "0"],    # ta
    "\x{3060}" => ["\x{305f}", "1"],
    "\x{3061}" => ["\x{3061}", "0"],    # chi
    "\x{3062}" => ["\x{3061}", "1"],
    "\x{3066}" => ["\x{3066}", "0"],    # te
    "\x{3067}" => ["\x{3066}", "1"],
    "\x{3068}" => ["\x{3068}", "0"],    # to
    "\x{3069}" => ["\x{3068}", "1"],
    "\x{306f}" => ["\x{306f}", "0"],    # ha
    "\x{3070}" => ["\x{306f}", "1"],
    "\x{3071}" => ["\x{306f}", "2"],
    "\x{3072}" => ["\x{3072}", "0"],    # hi
    "\x{3073}" => ["\x{3072}", "1"],
    "\x{3074}" => ["\x{3072}", "2"],
    "\x{3075}" => ["\x{3075}", "0"],    # fu
    "\x{3076}" => ["\x{3075}", "1"],
    "\x{3077}" => ["\x{3075}", "2"],
    "\x{3078}" => ["\x{3078}", "0"],    # he
    "\x{3079}" => ["\x{3078}", "1"],
    "\x{307a}" => ["\x{3078}", "2"],
    "\x{307b}" => ["\x{307b}", "0"],    # ho
    "\x{307c}" => ["\x{307b}", "1"],
    "\x{307d}" => ["\x{307b}", "2"],
    "\x{3064}" => ["\x{3064}", "0"],    # tsu
    "\x{3063}" => ["\x{3064}", "1"],    # small-tsu
    "\x{3065}" => ["\x{3064}", "2"],    # dzu
);

# Code points that sit inside the Katakana block but are punctuation, not
# kana.  They must be dropped to honour rule 0 ("ignore non-kana").
my %NON_KANA = map { $_ => 1 } (
    "\x{30a0}",                         # KATAKANA-HIRAGANA DOUBLE HYPHEN
    "\x{30fb}",                         # KATAKANA MIDDLE DOT
);

# Sort comparator.  Takes no arguments: the two hashrefs being compared are
# read from $main::a and $main::b.  Each must hold SORTKEY => [jstring, num].
# The processed key is cached in the hash under _sortkey on first use, so a
# given record is only analysed once per sort.
#
# Returns -1, 0 or 1, ordering by base kana string first, then by the
# script/voicing marks, then numerically by num.
sub bydict {
    # Plain lexical names: declaring "my $a"/"my $b" would mask the special
    # sort variables and is a well-known source of confusion.
    my ($left, $right) = ($main::a, $main::b);

    my $lkey = $left->{_sortkey}  ||= _getkey($left->{SORTKEY});
    my $rkey = $right->{_sortkey} ||= _getkey($right->{SORTKEY});

    # A missing num compares as 0, exactly as undef would numify.
    my $lnum = defined $lkey->[2] ? $lkey->[2] : 0;
    my $rnum = defined $rkey->[2] ? $rkey->[2] : 0;

    return $lkey->[0] cmp $rkey->[0]
        || $lkey->[1] cmp $rkey->[1]
        || $lnum <=> $rnum;
}

# Builds the comparison key for one [jstring, num] pair.
#
# Returns [base, marks, num] where
#   base  is the kana of jstring folded to unvoiced hiragana,
#   marks has, per kept character, "h" or "k" (source was hiragana or
#         katakana) followed by the mark digit for characters listed in
#         %BASE_AND_MARK,
#   num   is passed through untouched.
sub _getkey {
    my ($aref) = @_;
    my ($word, $num) = @$aref;
    $word = '' unless defined $word;

    # If the first tab-separated field contains a kanji, discard that field
    # (and its tab) so the key is built from what follows it.
    $word =~ s/^[^\t]*\p{InCJKUnifiedIdeographs}[^\t]*\t//;

    my $base  = '';
    my $marks = '';

    for my $char (split //, $word) {
        # Only the text up to the first whitespace character is considered.
        last if $char =~ /^\s/;
        next unless $char =~ /^(?:\p{InHiragana}|\p{InKatakana})$/;
        next if $NON_KANA{$char};

        my $kana = $char;
        if ($char =~ /^\p{InKatakana}$/) {
            # The Katakana block is laid out 0x60 above the Hiragana block.
            # NOTE: code points without a hiragana twin (e.g. U+30FC, the
            # prolonged sound mark) are shifted too; they land above every
            # ordinary kana, so such characters sort last at their position.
            $kana = chr(ord($char) - 96);
            $marks .= 'k';
        }
        else {
            $marks .= 'h';
        }

        if (my $entry = $BASE_AND_MARK{$kana}) {
            $base  .= $entry->[0];
            $marks .= $entry->[1];
        }
        else {
            $base .= $kana;
        }
    }

    return [$base, $marks, $num];
}

1;