package lorm; require Exporter; @ISA = qw(Exporter); @EXPORT_OK = qw/build_syllables find_lao_syllable romanize/; use warnings; use strict; use utf8; my %BGN_PCGN = ( #consonants 'ກ' => 'k', 'ຂ' => 'kh', 'ຄ' => 'kh', 'ງ' => 'ng', 'ຈ' => 'ch', 'ສ' => 's', 'ຊ' => 'x', 'ຕ' => 't', 'ຖ' => 'th', 'ທ' => 'th', 'ນ' => 'n', 'ໜ' => 'n', 'ປ' => 'p', 'ຜ' => 'ph', 'ຝ' => 'f', 'ພ' => 'ph', 'ຟ' => 'f', 'ມ' => 'm', 'ໝ' => 'm', 'ຢ' => 'y', 'ຣ' => 'r', 'ຣ໌' => 'r', 'ລ' => 'l', 'ຼ' => 'l', 'ຫ' => 'h', 'ຮ' => 'h', #ຍ, ຽ, ດ, ບ, ວ, ອ special cases #vowels 'ະ' => 'a', 'ັ' => 'a', 'າ' => 'a', 'ິ' => 'i', 'ີ' => 'i', 'ຶ' => 'u', 'ື' => 'u', 'ຸ' => 'ou', 'ູ' => 'ou', 'ເະ' => 'é', 'ເັ' => 'é', 'ເ' => 'é', 'ແະ' => 'è', 'ແັ' => 'è', 'ແ' => 'è', 'ໂະ' => 'ô', 'ົ' => 'ô', 'ໂ' => 'ô', 'ເາະ' => 'o', 'ັອ' => 'o', 'ໍ' => 'o', 'ອ' => 'o', 'ເັຽະ' => 'ia', 'ັຽ' => 'ia', 'ເັຽ' => 'ia', 'ຽ' => 'ia', 'ເັຍະ' => 'ia', #? 'ັຍ' => 'ia', #? 'ເັຍ' => 'ia', #? 'ຍ' => 'ia', #? 'ເຍ' => 'ia', 'ເຶອະ' => 'ua', 'ເຶອ' => 'ua', 'ເືອ' => 'ua', 'ເິະ' => 'eu', 'ເິ' => 'eu', 'ເີ' => 'eu', 'ເື' => 'eu', 'ໄ' => 'ai', 'ໃ' => 'ai', 'ເົາ' => 'ao', 'ຳ' => 'am', #not needed? 'ໍາ' => 'am', #not needed? 'ິວ' => 'iou', 'ີວ' => 'iou', # special case: ົວະ, ັວ, ົວ, ວ ); sub romanize { my ($text, $hyphen) = @_; my $syllables = build_syllables ($text); my @romanized; for my $syllable (@$syllables) { my $romanized = romanize_syllable ($syllable); push @romanized, $romanized; } my $romanized_str; my $join_str = ''; $join_str = '-' if $hyphen; $romanized_str = join $join_str, @romanized; # print "$romanized_str\n"; $romanized_str =~ s/^-//; # print "$romanized_str\n"; $romanized_str =~ s/--/-/g if $hyphen; # print "$romanized_str\n"; return $romanized_str; } sub romanize_syllable { my ($syllable) = @_; my $romanized_str; $syllable =~ s/[່-໋]//; return '...' if $syllable =~ /^ຯ$/; if ($syllable =~ /^[໐-໙]+$/) { foreach (split //, $syllable) { $romanized_str .= (ord($_) - 3792) if /^[໐-໙]$/; } return $romanized_str; } $syllable =~ s/^ຫ([ເ-ໄ]?[ນມ])/$1/; return $syllable unless $syllable =~ /^[ເ-ໄ]?([ກຂຄງຈສຊຍຽດຕຖທນບປຜຝພຟມຢຣລຼວຫອຮໜໝ])/; my $consonant = $1; #ຍ, ຽ, ດ, ບ, ວ, ອ if ($consonant =~ /^[ຍຽ]$/) { $romanized_str = 'gn'; } elsif ($consonant =~ /^ດ$/) { $romanized_str = 'd'; } elsif ($consonant =~ /^ບ$/) { $romanized_str = 'b'; } elsif ($consonant =~ /^ວ$/) { $romanized_str = 'v'; } elsif ($consonant =~ /^ອ$/) { $romanized_str = '-'; } elsif (defined ($BGN_PCGN{$consonant})) { $romanized_str = $BGN_PCGN{$consonant}; } if ($consonant =~ /^ຫ$/ && $syllable =~ /^[ເ-ໄ]?ຫ([ຍຣລຼວ])/) { my $sec_consonant = $1; $consonant .= $1; if ($sec_consonant =~ /ຍ/) { $romanized_str = 'gn'; } elsif ($sec_consonant =~ /ວ/) { $romanized_str = 'v'; } elsif (defined ($BGN_PCGN{$sec_consonant})) { $romanized_str = $BGN_PCGN{$sec_consonant}; } } elsif ($syllable =~ /^[ເ-ໄ]?$consonant(ວ)./) { $consonant .= $1; $romanized_str .= 'o'; } elsif ($syllable =~ /^[ເ-ໄ]?$consonant([ຣລຼ])/) { # ວ, ຣ, or ລ (also ຼ) can be used in combination with another consonant my $sec_consonant = $1; $consonant .= $sec_consonant; if (defined ($BGN_PCGN{$sec_consonant})) { $romanized_str .= $BGN_PCGN{$sec_consonant}; } } #vowel my $vowel = ''; my $final_consonant; if ($syllable =~ /^([ເ-ໄ]?)$consonant/) { $vowel .= $1 if $1; } if ($syllable =~ /^[ເ-ໄ]?$consonant([ະັາິີຶືຸູະັົອໍວຽຍຳ]*)/) { $vowel .= $1 if $1; } if ($syllable =~ /^[ເ-ໄ]?$consonant(?:[ະັາິີຶືຸູະັົອໍວຽຍຳ]*)([ກງຍຽດນບມຣວ]|ຣ໌)?$/) { $final_consonant = $1 if $1; } return $romanized_str . 'am' if ($vowel =~ /^(?:ໍາ|ຳ)/); if (defined ($BGN_PCGN{$vowel})) { $romanized_str .= $BGN_PCGN{$vowel}; } elsif ($vowel =~ /^ົວະ$/ || $vowel =~ /^ັວ$/ || $vowel =~ /^ົວ$/ || $vowel =~ /^ວ$/) { $romanized_str .= 'oua'; } elsif ($vowel =~ s/([ອວຽຍ])$// && defined ($BGN_PCGN{$vowel})) { $final_consonant = $1; $romanized_str .= $BGN_PCGN{$vowel}; } # last character if ($final_consonant) { if ($final_consonant =~ /ວ/) { $romanized_str .= 'o'; } elsif ($final_consonant =~ /ດ/) { $romanized_str .= 't'; } elsif ($final_consonant =~ /ບ/) { $romanized_str .= 'p'; } elsif ($final_consonant =~ /[ຍຽ]/) { $romanized_str .= 'y'; } elsif (defined ($BGN_PCGN{$final_consonant})) { $romanized_str .= $BGN_PCGN{$final_consonant}; } } return $romanized_str; } sub build_syllables { my ($word) = @_; my @syllables; my $count = 0; while ($word) { $count++; # print "$count\n"; if ($word =~ s/^[^ກ-ໝ]+//s) { push @syllables, $&; } elsif ($word =~ s/^[໐-໙]+//s) { push @syllables, $&; } elsif ($word =~ s/^ໆ//) { if (scalar(@syllables)) { my $prev_syllable = $syllables[-1]; push @syllables, $prev_syllable; } } elsif ($word =~ s/^ຯ//) { #... or perhaps it should be together with a lao syllable push @syllables, $&; } elsif (my $syllable = find_lao_syllable($word)) { $word =~ s/^$syllable// or $word =~ s/^.//; #just so that we don't loop forever push @syllables, $syllable; } else { #just so we don't loop forever $word =~ s/.//; } } return \@syllables; } sub find_lao_syllable { my ($word) = @_; my $syllable; if ($word !~ /^[ເ-ໄ]?([ກຂຄງຈສຊຍຽດຕຖທນບປຜຝພຟມຢຣລຼວຫອຮໜໝ])/) { return; } my $consonant = $1; if ($word =~ /^[ເ-ໄ]?$consonant[ເ-ໄ]?([ວຣລຼ])/) { # ວ, ຣ, or ລ (also ຼ) can be used in combination with another consonant my $extra = $1; unless ($extra eq 'ວ' && $word =~ /^$consonant(?:ວ)[^ະັາິີຶືຸູະັົອໍວຽຍຳ]/) { $consonant .= $extra; } } my $vowels = ''; if ($consonant =~ /^ຫ$/ && $word =~ /^ຫ[ເ-ໄ]?([ຍນມ])/) { my $extra = $1; #fetch the surounding vowels and tone mark if any $word =~ /^$consonant([ເ-ໄ])?$extra([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/; $consonant .= $extra; $vowels .= $1 if $1; $vowels .= $2 if $2; } else { #fetch the surounding vowels and tone mark if any $word =~ /^([ເ-ໄ])?$consonant([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/; $vowels .= $1 if $1; $vowels .= $2 if $2; } my $tone; if ($vowels =~ s/([່-໋])//) { $tone = $1; } #find first vowel if ($vowels =~ /^(?:ໍາ|ຳ)/) { #'sala am' is always the end of a syllable my $found = $&; $syllable = $consonant; $found =~ s/^ໍ// and $syllable .= 'ໍ'; $syllable .= $tone if defined ($tone); $syllable .= $found; return $syllable; } elsif ($vowels =~ /^(?:ເັຍະ|ເຶອະ|ເັຽະ)/ || $vowels =~ /^(?:ເາະ|ົວະ|ເັຽ|ເັຍ|ເືອ|ເິະ|ເົາ)/) { # trying to match the largest vowel first, then go to shorter (less characters) # doing 4 and 3 character vowels my $found = $&; $found =~ /^ເ/ and $syllable = 'ເ'; $syllable .= $consonant; $found =~ /^ເ?([ົັືິຶ])/ and $syllable .= $1; $syllable .= $tone if defined ($tone); if ($found =~ /(ຍະ|ອະ|ຽະ|າະ|ວະ)$/ || $found =~/([ຽຍອະາ])$/) { $syllable .= $1; } } elsif ($vowels =~ /^(?:ເະ|ເັ|ແະ|ແັ|ໂະ|ັອ|ັວ|ົວ|ັຽ|ັຍ|ເິ|ເຍ|ເຽ|ເີ|ເື|ີວ|ິວ)/) { # doing 2 character vowels my $found = $&; $found =~ /^([ເແໂ])/ and $syllable .= $1; $syllable .= $consonant; $found =~ /^[ເແ]?([ັົິີື])/ and $syllable .= $1; $syllable .= $tone if defined ($tone); $found =~ /([ະອວຽຍ])$/ and $syllable .= $1; } elsif ($vowels =~ /^[ະັາິີຶືຸູເແົໂໍອວຽຍໄໃ]/) { # doing single character vowels my $found = $&; if ($found =~ /^([ເ-ໄ])$/) { $syllable = $1 . $consonant; $syllable .= $tone if defined ($tone); } elsif ($found =~ /^([ັິີຶືຸູົໍ])$/) { $syllable = $consonant; $syllable .= $found; $syllable .= $tone if defined ($tone); } else { $syllable = $consonant; $syllable .= $tone if defined ($tone); $syllable .= $found; } } else { #lonely constant, just return it (with possible tone) $syllable = $consonant; $syllable .= $tone if defined ($tone); return $syllable; } my $regexp = qr{$syllable([ກງຍຽດນບມຣວ]໌?)(?:([^່້໊໋ະັາິີຶືຸູະັົໍຳ])(.?)|$)}; # checking for a possible closing consonant if ($word =~ /^$regexp/) { my $last_consonant = $1; my $possible_vowel = $2 if $2; my $continued_vowel = $3 if defined $3; if (!(defined $2) || # end of string $possible_vowel =~ /[^ຽວອຍ]/ || #for sure a consonant (defined $continued_vowel && $continued_vowel =~/[ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]/)) {#post vowels and tones $syllable .= $last_consonant; } } return $syllable; } 1; __END__ =head1 COPYRIGHT & LICENSE Copyright 2009 Joakim Lagerqvist, all rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut