Match a dictionary against a string
This page compares various Perl modules for matching a list of words against a piece of text.
| Algorithm::AhoCorasick | 
| Algorithm::AhoCorasick::XS | 
| Convert::Moji | 
| Data::Munge | 
| Search::WuManber | 
| Text::Match::FastAlternatives | 
| Text::Prefix::XS | 
| Text::Scan | 
#!/home/ben/software/install/bin/perl

# Benchmark several CPAN modules which match a list of words (a
# dictionary) against a piece of text. For each module tested, the
# elapsed wall-clock time and the number of matches it reported are
# printed on one line.

use warnings;
use strict;
use utf8;
use FindBin '$Bin';
use File::Slurper 'read_text';
use List::Util 'shuffle';

# The modules which we compare

use Algorithm::AhoCorasick 'find_all';
use Algorithm::AhoCorasick::XS;
use Data::Munge;
use Time::HiRes;
use Convert::Moji 'make_regex';
use Text::Scan;
use Search::WuManber;

# The words to search for, shared by all of the tests below.

my @words = read_dic ();

# This file is the Project Gutenberg complete works of Shakespeare,
# http://gutenberg.org/ebooks/100

my $text = read_shakespeare ();
test_modules ();
exit;

# Test various modules. The commented-out tests are disabled in the
# original script and are left disabled here.

sub test_modules
{
    data_munge ();
    # algo_aho ();
    algo_aho_xs ();
    text_scan ();
    # search_wumanber ();
}

# Print one line of results. $label is the name of the thing tested,
# $elapsed is the time taken in seconds, and $count is the number of
# matches it reported.

sub _report
{
    my ($label, $elapsed, $count) = @_;
    print "$label time: ";
    print $elapsed;
    print " #matches: ";
    print $count, "\n";
}

# Test Data::Munge

sub data_munge
{
    # NOTE(review): the regex is built before the timer starts, so
    # construction time is excluded here, whereas the other tests
    # start the timer before building their matcher. Confirm whether
    # that is intended before comparing the numbers.
    my $re = list2re (@words);
    my $start = Time::HiRes::time ();
    my @matches;
    while ($text =~ /($re)/g) {
        push @matches, $1;
    }
    my $elapsed = Time::HiRes::time () - $start;
    _report ('Data::Munge::list2re', $elapsed, scalar (@matches));
}

# Test Algorithm::AhoCorasick

sub algo_aho
{
    my $start = Time::HiRes::time ();
    my $all = find_all ($text, @words);
    my $elapsed = Time::HiRes::time () - $start;
    _report ('Algorithm::AhoCorasick', $elapsed, scalar (keys %$all));
}

# Test Algorithm::AhoCorasick::XS

sub algo_aho_xs
{
    my $start = Time::HiRes::time ();
    my $ac = Algorithm::AhoCorasick::XS->new (\@words);
    my @matches = $ac->match_details ($text);
    my $elapsed = Time::HiRes::time () - $start;
    _report ('Algorithm::AhoCorasick::XS', $elapsed, scalar (@matches));
}

# Test Search::WuManber

sub search_wumanber
{
    my $start = Time::HiRes::time ();
    my $search = Search::WuManber->new (\@words);
    my $matches = $search->all ($text);
    my $elapsed = Time::HiRes::time () - $start;
    _report ('Search::WuManber', $elapsed, scalar (@$matches));
}

# Test Text::Scan

sub text_scan
{
    my $start = Time::HiRes::time ();
    # Direct method call rather than the indirect "new Text::Scan"
    # form, which perl can misparse.
    my $dict = Text::Scan->new ();
    for my $word (@words) {
        $dict->insert ($word, 1);
    }
    # The result is assigned to a hash, so the number reported is the
    # number of distinct keys in it, not the number of occurrences.
    my %found = $dict->scan ($text);
    my $elapsed = Time::HiRes::time () - $start;
    _report ('Text::Scan', $elapsed, scalar (keys %found));
}

# Read the data in

sub read_shakespeare
{
    my $s = '/home/ben/data/shakespeare/100.txt';
    return read_text ($s);
}

# Read the system dictionary and pick 1000 random words from it to
# search for. Returns a list of $max distinct, non-empty words. Dies
# if the dictionary cannot be read or contains fewer than $max
# distinct words.

sub read_dic
{
    my $max = 1000;
    my $dic = '/usr/share/dict/words';
    # Decode the dictionary as UTF-8 so that its words are character
    # strings, like the text returned by read_text.
    open my $in, "<:encoding(UTF-8)", $dic or die "Cannot open $dic: $!";
    my %seen;
    while (my $line = <$in>) {
        $line =~ s/\s//g;
        # A blank line would otherwise become an empty search word.
        next if $line eq '';
        $seen{$line} = 1;
    }
    close $in or die "Cannot close $dic: $!";
    # Count distinct words, not lines. Checking the raw line count
    # could let a random-pick loop run forever on a dictionary with
    # many duplicates.
    my @unique = keys %seen;
    die "not enough words" unless scalar (@unique) >= $max;
    return (shuffle (@unique))[0 .. $max - 1];
}
List of modules in machine-readable format. This is the list of modules in Table::Readable format.
Copyright © Ben Bullock 2009-2025. All
rights reserved. 
For comments, questions, and corrections, please email
Ben Bullock
(benkasminbullock@gmail.com).
 /
Privacy / 
Disclaimer