Match a dictionary against a string

This page compares various Perl modules for matching a list of words against a piece of text.

Algorithm::AhoCorasick Rating
Algorithm::AhoCorasick::XS Rating
Convert::Moji
Data::Munge
Search::WuManber Rating
Text::Match::FastAlternatives Rating
Text::Prefix::XS Rating
Text::Scan Rating
#!/home/ben/software/install/bin/perl
use warnings;
use strict;
use utf8;
use FindBin '$Bin';
use File::Slurper 'read_text';

# The modules which we compare

use Algorithm::AhoCorasick 'find_all';
use Algorithm::AhoCorasick::XS;
use Data::Munge;
use Time::HiRes;
use Convert::Moji 'make_regex';
use Text::Scan;
use Search::WuManber;

# Pick the random search words once, at start-up.  @words is a
# file-scoped lexical; the benchmark subs below read it directly
# rather than taking it as an argument.

my @words = read_dic ();

# This file is the Project Gutenberg complete works of Shakespeare,
# http://gutenberg.org/ebooks/100
# $text is file-scoped too, and is the text every benchmark sub
# searches.

my $text = read_shakespeare ();
# Run the enabled benchmarks, then end the program explicitly; only
# sub definitions follow this point.
test_modules ();
exit;

# Run each enabled benchmark in turn.  Every benchmark prints its own
# timing and match count.  Disabled benchmarks stay in the list,
# commented out, so they can be switched back on easily.

sub test_modules
{
    my @benchmarks = (
        \&data_munge,
#        \&algo_aho,
        \&algo_aho_xs,
        \&text_scan,
#        \&search_wumanber,
    );
    for my $benchmark (@benchmarks) {
        $benchmark->();
    }
}

# Test Data::Munge
#
# Build a single regex from @words with list2re, then walk $text with
# a global match, collecting each matched string.  Because the match
# uses /g, the matches counted are the successive non-overlapping ones
# found from left to right.  Prints the elapsed wall-clock time and
# the number of matches.
#
# The timer is started before list2re is called, so that the cost of
# building the matcher is part of the measurement.  algo_aho_xs and
# text_scan also build their matchers inside the timed region, and
# the printed figures are only comparable if all of them do.

sub data_munge
{
    my $start = Time::HiRes::time ();
    my $re = list2re (@words);
    my @matches;
    while ($text =~ /($re)/g) {
        push @matches, $1;
    }
    print "Data::Munge::list2re time: ";
    print Time::HiRes::time () - $start;
    print "   #matches: ";
    print scalar (@matches), "\n";
}

# Test Algorithm::AhoCorasick
#
# Time a single find_all call over $text with all of @words, then
# print the elapsed wall-clock time and the number of keys in the
# hash reference it returned.
# NOTE(review): the count is the number of hash keys, which
# presumably are match positions rather than individual matches --
# confirm against the module's documentation.

sub algo_aho
{
    my $began = Time::HiRes::time ();
    my $found = find_all ($text, @words);
    print "Algorithm::AhoCorasick time: ", Time::HiRes::time () - $began;
    print "   #matches: ", scalar (keys %$found), "\n";
}

# Test Algorithm::AhoCorasick::XS
#
# Time both the construction of the matcher from @words and one
# match_details call over $text.  Prints the elapsed wall-clock time
# and the number of elements match_details returned in list context.

sub algo_aho_xs
{
    my $began = Time::HiRes::time ();
    my $matcher = Algorithm::AhoCorasick::XS->new(\@words);
    my @details = $matcher->match_details($text);
    print "Algorithm::AhoCorasick::XS time: ", Time::HiRes::time () - $began;
    print "   #matches: ", scalar (@details), "\n";
}

# Test Search::WuManber
#
# Time both the construction of the searcher from @words and one
# "all" call over $text.  Prints the elapsed wall-clock time and the
# number of elements in the array reference that "all" returned.

sub search_wumanber
{
    my $began = Time::HiRes::time ();
    my $searcher = Search::WuManber->new (\@words);
    my $hits = $searcher->all ($text);
    print "Search::WuManber time: ", Time::HiRes::time () - $began;
    print "   #matches: ", scalar (@$hits), "\n";
}

# Test Text::Scan
#
# Time the construction of a Text::Scan dictionary holding every
# entry of @words (each inserted with the value 1) plus one scan of
# $text.  Prints the elapsed wall-clock time and the number of keys
# in the hash built from what scan returned.
# NOTE(review): since the result is stored in a hash, the count is
# presumably the number of distinct words found, not the number of
# occurrences -- confirm against the Text::Scan documentation.

sub text_scan
{
    my $start = Time::HiRes::time ();
    # Use the arrow form of the constructor call.  The indirect object
    # form "new Text::Scan" depends on how the parser resolves the bare
    # words and is discouraged by perlobj.
    my $dict = Text::Scan->new ();
    for my $word (@words) {
        $dict->insert ($word, 1);
    }
    my %found = $dict->scan ($text);
    print "Text::Scan time: ";
    print Time::HiRes::time () - $start;
    print "   #matches: ";
    print scalar (keys %found), "\n";
}

# Read the data in
#
# Returns the whole of the Shakespeare text file as a single string,
# as read by read_text.

sub read_shakespeare
{
    return read_text ('/home/ben/data/shakespeare/100.txt');
}

# Read a word list and pick random distinct words from it to search
# for.
#
# Arguments (both optional, so the existing call "read_dic ()" behaves
# as before):
#
#   $dic - path of the word list, one word per line.  Defaults to the
#          system dictionary, /usr/share/dict/words.
#   $max - how many distinct words to pick.  Defaults to 1000.
#
# Returns the chosen words as a list, in random order.  Dies if the
# file cannot be opened or closed, or if it contains fewer than $max
# distinct non-empty words.

sub read_dic
{
    my ($dic, $max) = @_;
    $dic = '/usr/share/dict/words' unless defined $dic;
    $max = 1000 unless defined $max;
    open my $in, "<", $dic or die "Cannot open $dic: $!";
    # Keep each word once only.  The size check below has to count
    # distinct words, not lines: otherwise a list containing duplicates
    # or blank lines could pass the check without holding $max
    # different words to choose from.
    my %seen;
    my @distinct;
    while (my $line = <$in>) {
        # Remove all whitespace, including the line ending.
        $line =~ s/\s//g;
        # A whitespace-only line leaves an empty string, which is not a
        # usable search word.
        next if $line eq '';
        next if $seen{$line}++;
        push @distinct, $line;
    }
    close $in or die "Cannot close $dic: $!";
    my $available = scalar (@distinct);
    die "not enough words in $dic: need $max distinct words, found $available"
        unless $available >= $max;
    # Partial Fisher-Yates shuffle: move a randomly chosen remaining
    # word into each of the first $max positions.  This always finishes
    # after exactly $max swaps, unlike drawing at random and retrying
    # whenever a word has already been drawn.
    for my $i (0 .. $max - 1) {
        my $j = $i + int (rand ($available - $i));
        @distinct[$i, $j] = @distinct[$j, $i];
    }
    # Copy into an array so that a scalar-context caller gets the count.
    my @chosen = @distinct[0 .. $max - 1];
    return @chosen;
}

(download)

List of modules in machine-readable format. This is the list of modules in Table::Readable format.


Copyright © Ben Bullock 2009-2019. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com) or use the discussion group at Google Groups. / Privacy / Disclaimer