package lorm;
require Exporter;
@ISA = qw(Exporter);
@EXPORT_OK = qw/build_syllables find_lao_syllable romanize/;
use warnings;
use strict;
use utf8;

my %BGN_PCGN = (
        #consonants
        'ກ'   => 'k',
        'ຂ'   => 'kh',
        'ຄ'   => 'kh',
        'ງ'   => 'ng',
        'ຈ'   => 'ch',
        'ສ'   => 's',
        'ຊ'   => 'x',
        'ຕ'   => 't',
        'ຖ'   => 'th',
        'ທ'   => 'th',
        'ນ'   => 'n',
        'ໜ'   => 'n',
        'ປ'   => 'p',
        'ຜ'   => 'ph',
        'ຝ'   => 'f',
        'ພ'   => 'ph',
        'ຟ'   => 'f',
        'ມ'   => 'm',
        'ໝ'   => 'm',
        'ຢ'   => 'y',
        'ຣ'   => 'r',
        'ຣ໌'  => 'r',
        'ລ'   => 'l',
        'ຼ'   => 'l',
        'ຫ'   => 'h',
        'ຮ'   => 'h',
        #ຍ, ຽ, ດ, ບ, ວ, ອ special cases
        #vowels
        'ະ'       => 'a',
        'ັ'       => 'a',
        'າ'       => 'a',
        'ິ'       => 'i',
        'ີ'       => 'i',
        'ຶ'       => 'u',
        'ື'       => 'u',
        'ຸ'       => 'ou',
        'ູ'       => 'ou',
        'ເະ'      => 'é',
        'ເັ'      => 'é',
        'ເ'       => 'é',
        'ແະ'      => 'è',
        'ແັ'      => 'è',
        'ແ'       => 'è',
        'ໂະ'      => 'ô',
        'ົ'       => 'ô',
        'ໂ'       => 'ô',
        'ເາະ'     => 'o',
        'ັອ'      => 'o',
        'ໍ'       => 'o',
        'ອ'       => 'o',
        'ເັຽະ'    => 'ia',
        'ັຽ'      => 'ia',
        'ເັຽ'     => 'ia',
        'ຽ'       => 'ia',
        'ເັຍະ'    => 'ia', #?
        'ັຍ'      => 'ia', #?
        'ເັຍ'     => 'ia', #?
        'ຍ'       => 'ia', #?
        'ເຍ'      => 'ia',
        'ເຶອະ'    => 'ua',
        'ເຶອ'     => 'ua',
        'ເືອ'     => 'ua',
        'ເິະ'     => 'eu',
        'ເິ'      => 'eu',
        'ເີ'      => 'eu',
        'ເື'      => 'eu',
        'ໄ'       => 'ai',
        'ໃ'       => 'ai',
        'ເົາ'     => 'ao',
        'ຳ'       => 'am', #not needed?
        'ໍາ'      => 'am', #not needed?
        'ິວ'      => 'iou',
        'ີວ'      => 'iou',
        # special case: ົວະ, ັວ, ົວ, ວ
);

sub romanize
{
    my ($text, $hyphen) = @_;
    my $syllables = build_syllables ($text);
    my @romanized;
    for my $syllable (@$syllables) {
        my $romanized = romanize_syllable ($syllable);
        push @romanized, $romanized;
    }
    my $romanized_str;
    my $join_str = '';
    
    $join_str = '-' if $hyphen;
    
    $romanized_str = join $join_str, @romanized;
#    print "$romanized_str\n";
    $romanized_str =~ s/^-//; 
#    print "$romanized_str\n";
   
    $romanized_str =~ s/--/-/g if $hyphen;
#    print "$romanized_str\n";
    
    return $romanized_str;
}

sub romanize_syllable
{
    my ($syllable) = @_;
    my $romanized_str;
    
    $syllable =~ s/[່-໋]//;
    
    return '...' if $syllable =~ /^ຯ$/;
    
    if ($syllable =~ /^[໐-໙]+$/) {
        foreach (split //, $syllable) {
            $romanized_str .= (ord($_) - 3792) if /^[໐-໙]$/;
        }
        return $romanized_str;
    }
    
    $syllable =~ s/^ຫ([ເ-ໄ]?[ນມ])/$1/;
    
    return $syllable
        unless $syllable =~ /^[ເ-ໄ]?([ກຂຄງຈສຊຍຽດຕຖທນບປຜຝພຟມຢຣລຼວຫອຮໜໝ])/;
    
    my $consonant = $1;
    
    #ຍ, ຽ, ດ, ບ, ວ, ອ
    if ($consonant =~ /^[ຍຽ]$/) {
        $romanized_str = 'gn';
    } elsif ($consonant =~ /^ດ$/) {
        $romanized_str = 'd';
    } elsif ($consonant =~ /^ບ$/) {
        $romanized_str = 'b';
    } elsif ($consonant =~ /^ວ$/) {
        $romanized_str = 'v';
    } elsif ($consonant =~ /^ອ$/) {
        $romanized_str = '-';
    } elsif (defined ($BGN_PCGN{$consonant})) {
        $romanized_str = $BGN_PCGN{$consonant};
    }
    
    if ($consonant =~ /^ຫ$/ && $syllable =~ /^[ເ-ໄ]?ຫ([ຍຣລຼວ])/) {
        my $sec_consonant = $1;
        $consonant .= $1;
        if ($sec_consonant =~ /ຍ/) {
            $romanized_str = 'gn';
        } elsif ($sec_consonant =~ /ວ/) {
            $romanized_str = 'v';
        } elsif (defined ($BGN_PCGN{$sec_consonant})) {
            $romanized_str = $BGN_PCGN{$sec_consonant};
        }
    } elsif ($syllable =~ /^[ເ-ໄ]?$consonant(ວ)./) {
        $consonant .= $1;
        $romanized_str .= 'o';
    } 
    elsif ($syllable =~ /^[ເ-ໄ]?$consonant([ຣລຼ])/) { # ວ, ຣ, or ລ (also ຼ) can be used in combination with another consonant
        my $sec_consonant = $1;
        $consonant .= $sec_consonant;
        if (defined ($BGN_PCGN{$sec_consonant})) {
            $romanized_str .= $BGN_PCGN{$sec_consonant};
        }
    }
    
    #vowel
    my $vowel = '';
    my $final_consonant;
    if ($syllable =~ /^([ເ-ໄ]?)$consonant/) {
        $vowel .= $1 if $1;
    }
    if ($syllable =~ /^[ເ-ໄ]?$consonant([ະັາິີຶືຸູະັົອໍວຽຍຳ]*)/) {
        $vowel .= $1 if $1;
    }
    if ($syllable =~ /^[ເ-ໄ]?$consonant(?:[ະັາິີຶືຸູະັົອໍວຽຍຳ]*)([ກງຍຽດນບມຣວ]|ຣ໌)?$/) {
        $final_consonant = $1 if $1;
    }
        
    return $romanized_str . 'am' if ($vowel =~ /^(?:ໍາ|ຳ)/);
    
    if (defined ($BGN_PCGN{$vowel})) {
        $romanized_str .= $BGN_PCGN{$vowel};
    } elsif ($vowel =~ /^ົວະ$/ || $vowel =~ /^ັວ$/ || $vowel =~ /^ົວ$/ || $vowel =~ /^ວ$/) {
        $romanized_str .= 'oua';
    } elsif ($vowel =~ s/([ອວຽຍ])$// && defined ($BGN_PCGN{$vowel})) {
        $final_consonant = $1;
        $romanized_str .= $BGN_PCGN{$vowel};
    } 
    
    # last character
    if ($final_consonant) {
        if ($final_consonant =~ /ວ/) {
            $romanized_str .= 'o';
        } elsif ($final_consonant =~ /ດ/) {
            $romanized_str .= 't';
        } elsif ($final_consonant =~ /ບ/) {
            $romanized_str .= 'p';
        } elsif ($final_consonant =~ /[ຍຽ]/) {
            $romanized_str .= 'y';
        } elsif (defined ($BGN_PCGN{$final_consonant})) {
            $romanized_str .= $BGN_PCGN{$final_consonant};
        }
    }
    
    return $romanized_str;
}

sub build_syllables 
{
    my ($word) = @_;
    my @syllables;
    my $count = 0;
    while ($word) {
        $count++;
#        print "$count\n";
        if ($word =~ s/^[^ກ-ໝ]+//s) {
            push @syllables, $&;
        } elsif ($word =~ s/^[໐-໙]+//s) {
            push @syllables, $&;
        } elsif ($word =~ s/^ໆ//) {
            if (scalar(@syllables)) {
                my $prev_syllable = $syllables[-1];
                push @syllables, $prev_syllable;
            }
        } elsif ($word =~ s/^ຯ//) { #... or perhaps it should be together with a lao syllable
            push @syllables, $&;
        } elsif (my $syllable = find_lao_syllable($word)) {
            $word =~ s/^$syllable// or $word =~ s/^.//; #just so that we don't loop forever
            push @syllables, $syllable;
        } else { #just so we don't loop forever
            $word =~ s/.//;
        }
    }
    return \@syllables;
}

sub find_lao_syllable
{
    my ($word) = @_;
    my $syllable;
    if ($word !~ /^[ເ-ໄ]?([ກຂຄງຈສຊຍຽດຕຖທນບປຜຝພຟມຢຣລຼວຫອຮໜໝ])/) {
        return;
    }
    my $consonant = $1;
    if ($word =~ /^[ເ-ໄ]?$consonant[ເ-ໄ]?([ວຣລຼ])/) { # ວ, ຣ, or ລ (also ຼ) can be used in combination with another consonant
        my $extra = $1;
        unless ($extra eq 'ວ' && $word =~ /^$consonant(?:ວ)[^ະັາິີຶືຸູະັົອໍວຽຍຳ]/) {
            $consonant .= $extra;
        }
    }
    my $vowels = '';
    
    if ($consonant =~ /^ຫ$/ && $word =~ /^ຫ[ເ-ໄ]?([ຍນມ])/) {
        my $extra = $1;
        #fetch the surounding vowels and tone mark if any
        $word =~ /^$consonant([ເ-ໄ])?$extra([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/;
        $consonant .= $extra;
        $vowels .= $1 if $1;
        $vowels .= $2 if $2;
    } else {
        #fetch the surounding vowels and tone mark if any
        $word =~ /^([ເ-ໄ])?$consonant([ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]*)/;
        
        $vowels .= $1 if $1;
        $vowels .= $2 if $2;
    }
    
    my $tone;
    if ($vowels =~ s/([່-໋])//) {
        $tone = $1;
    }
    
    #find first vowel
    if ($vowels =~ /^(?:ໍາ|ຳ)/) { #'sala am' is always the end of a syllable
        my $found = $&;
        $syllable = $consonant;
        $found =~ s/^ໍ// and $syllable .= 'ໍ';
        $syllable .= $tone if defined ($tone);
        $syllable .= $found;
        return $syllable;
    } elsif ($vowels =~ /^(?:ເັຍະ|ເຶອະ|ເັຽະ)/ || $vowels =~ /^(?:ເາະ|ົວະ|ເັຽ|ເັຍ|ເືອ|ເິະ|ເົາ)/) {
        # trying to match the largest vowel first, then go to shorter (less characters)
        # doing 4 and 3 character vowels
        my $found = $&;
        $found =~ /^ເ/ and $syllable = 'ເ';
        $syllable .= $consonant;
        $found =~ /^ເ?([ົັືິຶ])/ and $syllable .= $1;
        $syllable .= $tone if defined ($tone);
        if ($found =~ /(ຍະ|ອະ|ຽະ|າະ|ວະ)$/ || $found =~/([ຽຍອະາ])$/) {
            $syllable .= $1;
        }
    } elsif ($vowels =~ /^(?:ເະ|ເັ|ແະ|ແັ|ໂະ|ັອ|ັວ|ົວ|ັຽ|ັຍ|ເິ|ເຍ|ເຽ|ເີ|ເື|ີວ|ິວ)/) {
        # doing 2 character vowels
        my $found = $&;
        $found =~ /^([ເແໂ])/ and $syllable .= $1;
        $syllable .= $consonant;
        $found =~ /^[ເແ]?([ັົິີື])/ and $syllable .= $1;
        $syllable .= $tone if defined ($tone);
        $found =~ /([ະອວຽຍ])$/ and $syllable .= $1;
    } elsif ($vowels =~ /^[ະັາິີຶືຸູເແົໂໍອວຽຍໄໃ]/) {
        # doing single character vowels
        my $found = $&;
        if ($found =~ /^([ເ-ໄ])$/) {
            $syllable = $1 . $consonant;
            $syllable .= $tone if defined ($tone);
        } elsif ($found =~ /^([ັິີຶືຸູົໍ])$/) {
            $syllable = $consonant;
            $syllable .= $found;
            $syllable .= $tone if defined ($tone);
        } else {
            $syllable = $consonant;
            $syllable .= $tone if defined ($tone);
            $syllable .= $found;
        }
    } else { #lonely constant, just return it (with possible tone)
        $syllable = $consonant;
        $syllable .= $tone if defined ($tone);
        return $syllable;
    }
    
    my $regexp = qr{$syllable([ກງຍຽດນບມຣວ]໌?)(?:([^່້໊໋ະັາິີຶືຸູະັົໍຳ])(.?)|$)};
    
    # checking for a possible closing consonant
    if ($word =~ /^$regexp/) {
        my $last_consonant = $1;
        my $possible_vowel = $2 if $2;
        my $continued_vowel = $3 if defined $3;
        if (!(defined $2) || # end of string
            $possible_vowel =~ /[^ຽວອຍ]/ || #for sure a consonant
            (defined $continued_vowel && $continued_vowel =~/[ະັາິີຶືຸູະັົອໍວຽຍຳ່້໊໋]/)) {#post vowels and tones
            $syllable .= $last_consonant;
        }
    }
    return $syllable;
}

1;

__END__

=head1 COPYRIGHT & LICENSE

Copyright 2009 Joakim Lagerqvist, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.


=cut