Example implementation of the Knuth-Morris-Pratt algorithm in C
This is an example C program demonstrating the Knuth-Morris-Pratt
algorithm. This example assumes that the search string does not
contain a newline (\n
) character.
#include <stdio.h> #include <stdlib.h> #include "kmp.h" #ifdef HEADER /* One chain in a linked list of results of a search. */ typedef struct kmp_result kmp_result_t; struct kmp_result { /* Offset in bytes of the start of the match from the start of the string. */ unsigned int offset; /* The line number of the match. */ unsigned int line; /* The next match. */ kmp_result_t * next; }; /* The table for the Knuth-Morris-Pratt matching algorithm. */ typedef struct kmp { /* The thing we are matching. */ const unsigned char * needle; /* The number of bytes in "needle". */ int length; /* A table of length "length" containing a decomposition of "needle" according to the Knuth-Morris-Pratt algorithm. */ int * table; } kmp_t; /* Holder for file contents. */ typedef struct fc { /* File name for error messages. */ const char * name; /* File contents. */ unsigned char * contents; /* Length of file. */ size_t len; } fc_t; /* Return statuses of the functions in this module. */ typedef enum { kmp_status_ok, kmp_status_memory_failure, kmp_status_too_short, kmp_status_overflow, kmp_status_empty_input, } kmp_status_t; #endif /* def HEADER */ //#define DEBUG #ifdef DEBUG #define MESSAGE(x, ...) { \ printf (x, ## __VA_ARGS__); \ } #else #define MESSAGE(x, ...) #endif /* Match the contents of "fc" against "k", the table for the Knuth-Morris-Pratt matching algorithm. The results go into "* results_ptr". */ kmp_status_t kmp_search (const fc_t * fc, const kmp_t * k, kmp_result_t ** result_ptr) { // Offset into "fc->contents" int m = 0; // Offset into "k->needle". int i = 0; // Table of similarities const int * t = k->table; // Search string. const unsigned char * w = k->needle; /* For printing line numbers. */ int line = 1; // Haystack const unsigned char * s = fc->contents; size_t len = fc->len; /* Linked list of search results. */ kmp_result_t * result = 0; kmp_result_t * last = 0; if (len < k->length) { * result_ptr = 0; return kmp_status_too_short; } // We are going to increment "i" and "m" by more than one if there // is a similarity, so use a while rather than a for loop. while (m + i < len) { if (w[i] == s[m + i]) { // The two bytes at offset "i" in "w" and "m+i" in "s" // match. if (i == k->length - 1) { kmp_result_t * r; r = calloc (1, sizeof (kmp_result_t)); if (! r) { return kmp_status_memory_failure; } r->offset = m; r->line = line; r->next = 0; if (! result) { result = r; last = r; } else { last->next = r; last = r; } /* Put "m" after the match, and set "i" to zero. */ MESSAGE("matched at %d %d %d\n", line, m, i); m += i + 1; i = 0; } else { /* Continue comparing bytes. */ i++; } } else { if (i >= k->length) { return kmp_status_overflow; } // Increment the line counter and start point when '\n' is seen. if (s[m + i] == '\n') { line++; m = m + i + 1; i = 0; continue; } if (t[i] > -1) { // If the kmp table contains a similarity for the i'th // byte, we can jump over several bytes of input. m = m + i - t[i]; i = t[i]; } else { // There were no similarities, start from the // beginning of w again. i = 0; m++; } } } * result_ptr = result; return kmp_status_ok; } /* Make the table for the Knuth-Morris-Pratt matching algorithm. */ kmp_status_t kmp_make_table (kmp_t * k) { int * t; const unsigned char * w; // Position in t. int pos; // Next candidate substring. int cnd; if (! k->needle || k->length == 0) { return kmp_status_empty_input; } t = k->table = calloc (sizeof (int), k->length); if (! t) { return kmp_status_memory_failure; } /* If we are at the start of the string, there is nothing to go back to. */ t[0] = -1; /* Check we have enough bytes. */ if (k->length < 2) { return kmp_status_ok; } /* If we are at the second byte of the string, we can only go back to the first byte. */ t[1] = 0; w = k->needle; pos = 2; cnd = 0; // Loop to the end of w. while (pos < k->length) { if (w[pos - 1] == w[cnd]) { // We are continuing a string of matches. The byte at "pos // - 1" is also similar to the previous byte at "cnd", so // record that here. cnd++; t[pos] = cnd; pos++; } else if (cnd > 0) { // If there was a previous match, fall back to previous // match. cnd = t[cnd]; } else { // cnd == 0, so no similarities. This table entry goes to // the beginning of the string. t[pos] = 0; // Go on with the next byte. pos++; } } return kmp_status_ok; } kmp_status_t kmp_free (kmp_t * kmp) { free (kmp->table); return kmp_status_ok; } /* Free the memory associated with a set of results. */ kmp_status_t kmp_result_free (kmp_result_t * r) { while (r) { kmp_result_t * next; next = r->next; free (r); r = next; } return kmp_status_ok; }
Here is a test script which uses this simple C tap test to test it using the Test Anything Protocol
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <errno.h> #include <sys/stat.h> #include "kmp.h" #include "c-tap-test.h" /* Consistently initialise a content type. */ #define FC(x,s) fc_t x = { \ .contents = (unsigned char *) s, \ .len = strlen (s), \ } /* Test it doesn't blow up on one-byte search strings. */ static void short_test () { kmp_result_t * results; kmp_result_t * r; kmp_t sho = { .needle = (unsigned char *) "a", .length = 1, }; FC(fc, "I wandered lonely as a cloud"); printf ("# Test with one-byte search.\n"); TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok); r = results; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 3); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 18); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 21); TAP_TEST_EQUAL (r->next, 0); TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok); } /* Test that it can match subsequent identical strings with no gap. */ static void close_test () { kmp_result_t * results; kmp_result_t * r; kmp_t sho = { .needle = (unsigned char *) "ill", .length = 3, }; FC(fc, "illillillbillfillill"); printf ("# Test with repetitive text.\n"); TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok); r = results; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 0); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 3); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 6); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 10); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 14); r = r->next; TAP_TEST_EQUAL (r->line, 1); TAP_TEST_EQUAL (r->offset, 17); TAP_TEST_EQUAL (r->next, 0); TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok); } /* Test the generation of line numbers. */ static void line_test () { kmp_result_t * results; kmp_result_t * r; kmp_t sho = { .needle = (unsigned char *) "il", .length = 2, }; FC(fc, "I wandered lonely as a cloud\n" "That floats on high o'er vales and hills,\n" "When all at once I saw a crowd,\n" "A host, of golden daffodils;\n" "Beside the lake, beneath the trees,\n" "Fluttering and dancing in the breeze.\n"); printf ("# Test line number counting.\n"); TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok); r = results; TAP_TEST_EQUAL (r->line, 2); TAP_TEST_EQUAL (r->offset, 65); r = r->next; TAP_TEST_EQUAL (r->line, 4); TAP_TEST_EQUAL (r->offset, 127); TAP_TEST_EQUAL (r->next, 0); TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok); } /* Test what happens when the search string does not match. */ static void no_match_test () { kmp_result_t * results; kmp_t sho = { .needle = (unsigned char *) "monkey", .length = strlen ("monkey"), }; FC(fc, "I wandered lonely as a cloud\n"); printf ("# Test matching failure.\n"); TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok); TAP_TEST_EQUAL (results, 0); TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok); } /* Test what happens when the search string is longer than the thing to search. */ static void too_short_test () { kmp_result_t * results; kmp_t sho = { .needle = (unsigned char *) "monkey", .length = strlen ("monkey"), }; FC(fc, "monke"); printf ("# Test mismatched string lengths.\n"); TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok); TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_too_short); TAP_TEST_EQUAL (results, 0); TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok); } /* Test the table comes out the same as in the Wikipedia example. */ static void table_test () { int i; kmp_t x = { .needle = (unsigned char *) "PARTICIPATE IN PARACHUTE", .length = strlen ("PARTICIPATE IN PARACHUTE"), }; int table[] = { -1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0 }; printf ("# Test table construction from KMP's paper.\n"); TAP_TEST_EQUAL (kmp_make_table (& x), kmp_status_ok); for (i = 0; i < sizeof (table) / sizeof (int); i++) { TAP_TEST_EQUAL (x.table[i], table[i]); } TAP_TEST_EQUAL (kmp_free (& x), kmp_status_ok); } /* Run the tests then print a plan. */ int main () { short_test (); close_test (); line_test (); no_match_test (); too_short_test (); table_test (); TAP_PLAN; return 0; }
The Makefile is as follows.
C=kmp.c CFLAGS=-Wall -g -O TESTOBJS=kmp.o kmp-test.o all: test test: kmp-test prove --nocolor ./kmp-test kmp-test: kmp.o kmp-test.o $(CC) $(CFLAGS) -o $@ $(TESTOBJS) kmp.o: kmp.c kmp.h kmp-test.o: kmp-test.c kmp.h c-tap-test.h kmp.h: kmp.c cfunctions kmp.c CTT=/home/ben/projects/c-tap-test c-tap-test.h: $(CTT)/$@ if [ -f $@ ]; then chmod 0644 $@; fi cp -f $(CTT)/$@ $@ chmod 0444 $@ clean: -rm -f kmp kmp.o kmp-test.o kmp-test kmp.h c-tap-test.h kmp.txt
The .h file, which is automatically generated, looks like this:
/* This file was generated by the following command: cfunctions kmp.c */ #ifndef CFH_KMP_H #define CFH_KMP_H #line 4 "kmp.c" typedef struct kmp_result kmp_result_t; struct kmp_result{ unsigned int offset; unsigned int line; kmp_result_t * next; } #line 17 "kmp.c" ; typedef struct kmp { /* The thing we are matching. */ const unsigned char * needle; /* The number of bytes in "needle". */ int length; /* A table of length "length" containing a decomposition of "needle" according to the Knuth-Morris-Pratt algorithm. */ int * table; } kmp_t; typedef struct fc { /* File name for error messages. */ const char * name; /* File contents. */ unsigned char * contents; /* Length of file. */ size_t len; } fc_t; typedef enum { kmp_status_ok, kmp_status_memory_failure, kmp_status_too_short, kmp_status_overflow, kmp_status_empty_input, } kmp_status_t; #line 47 "kmp.c" kmp_status_t kmp_search (const fc_t* fc, const kmp_t* k, kmp_result_t** result_ptr); #line 135 "kmp.c" kmp_status_t kmp_make_table (kmp_t* k); #line 189 "kmp.c" kmp_status_t kmp_free (kmp_t* kmp); #line 198 "kmp.c" kmp_status_t kmp_result_free (kmp_result_t* r); #endif /* CFH_KMP_H */
Copyright © Ben Bullock 2009-2024. All
rights reserved.
For comments, questions, and corrections, please email
Ben Bullock
(benkasminbullock@gmail.com) or use the discussion group at Google Groups.
/
Privacy /
Disclaimer