Example implementation of the Knuth-Morris-Pratt algorithm in C

This is an example C program demonstrating the Knuth-Morris-Pratt algorithm.

#include <stdio.h>
#include <stdlib.h>
#include "kmp.h"

#ifdef HEADER

/* One chain in a linked list of results of a search. */

typedef struct kmp_result kmp_result_t;

struct kmp_result
{
    /* Offset in bytes of the start of the match from the start of the
       string. */
    unsigned int offset;
    /* The line number of the match. */
    unsigned int line;
    /* The next match. */
    kmp_result_t * next;
};

/* The table for the Knuth-Morris-Pratt matching algorithm. */

typedef struct kmp
{
    /* The thing we are matching. */
    const unsigned char * needle;
    /* The number of bytes in "needle". */
    int length;
    /* A table of length "length" containing a decomposition of
       "needle" according to the Knuth-Morris-Pratt algorithm. */
    int * table;
}
kmp_t;

/* Holder for file contents. */

typedef struct fc
{
    /* File name for error messages. */
    const char * name;
    /* File contents. */
    unsigned char * contents;
    /* Length of file. */
    size_t len;
}
fc_t;

/* Return statuses of the functions in this module. */

typedef enum
{
    kmp_status_ok,
    kmp_status_memory_failure,
    kmp_status_too_short,
    kmp_status_overflow,
    kmp_status_empty_input,
}
kmp_status_t;

#endif /* def HEADER */

//#define DEBUG

#ifdef DEBUG
#define MESSAGE(x, ...) {			\
	printf (x, ## __VA_ARGS__);		\
    }						
#else
#define MESSAGE(x, ...)
#endif

/* Match the contents of "fc" against "k", the table for the
   Knuth-Morris-Pratt matching algorithm. The results go into 
   "* results_ptr". */

kmp_status_t 
kmp_search (const fc_t * fc, const kmp_t * k, kmp_result_t ** result_ptr)
{
    // Offset into "fc->contents"
    int m = 0;
    // Offset into "k->needle".
    int i = 0;
    // Table of similarities
    const int * t = k->table;
    // Search string.
    const unsigned char * w = k->needle;
    /* For printing line numbers. */
    int line = 1;
    /* Records the start of the current text line. */
    int start = 0;
    // Haystack
    const unsigned char * s = fc->contents;
    size_t len = fc->len;
    /* Linked list of search results. */
    kmp_result_t * result = 0;
    kmp_result_t * last = 0;

    if (len < k->length) {
	* result_ptr = 0;
	return kmp_status_too_short;
    }
    // We are going to increment "i" and "m" by more than one if there
    // is a similarity, so use a while rather than a for loop.
    while (m + i < len) {
	if (w[i] == s[m + i]) {
	    // The two bytes at offset "i" in "w" and "m+i" in "s"
	    // match.
	    if (i == k->length - 1) {
		kmp_result_t * r;
		r = calloc (1, sizeof (kmp_result_t));
		if (! r) {
		    return kmp_status_memory_failure;
		}
		r->offset = m;
		r->line = line;
		r->next = 0;
		if (! result) {
		    result = r;
		    last = r;
		}
		else {
		    last->next = r;
		    last = r;
		}
		/* Put "m" after the match, and set "i" to zero. */
		MESSAGE("matched at %d %d %d\n", line, m, i);
		m += i + 1;
		i = 0;
	    }
	    else {
		/* Continue comparing bytes. */
		i++;
	    }
	}
	else {
	    if (i >= k->length) {
		return kmp_status_overflow;
	    }
	    // Increment the line counter and start point when '\n' is seen.
	    if (s[m + i] == '\n') {
		line++;
		start = m + i + 1;
	    }
	    if (t[i] > -1) {
		// If the kmp table contains a similarity for the i'th
		// byte, we can jump over several bytes of input.
		m = m + i - t[i];
		i = t[i];
	    }
	    else {
		// There were no similarities, start from the
		// beginning of w again.
		i = 0;
		m++;
	    }
	}
    }
    * result_ptr = result;
    return kmp_status_ok;
}

/* Make the table for the Knuth-Morris-Pratt matching algorithm. */

kmp_status_t 
kmp_make_table (kmp_t * k)
{
    int * t;
    const unsigned char * w;
    // Position in t.
    int pos;
    // Next candidate substring.
    int cnd;

    if (! k->needle || k->length == 0) {
	return kmp_status_empty_input;
    }
    t = k->table = calloc (sizeof (int), k->length);
    if (! t) {
	return kmp_status_memory_failure;
    }
    /* If we are at the start of the string, there is nothing to go
       back to. */
    t[0] = -1;
    /* Check we have enough bytes. */
    if (k->length < 2) {
	return kmp_status_ok;
    }
    /* If we are at the second byte of the string, we can only go back
       to the first byte. */
    t[1] = 0;
    w = k->needle;
    pos = 2;
    cnd = 0;
    // Loop to the end of w.
    while (pos < k->length) {
	if (w[pos - 1] == w[cnd]) {
	    // We are continuing a string of matches. The byte at "pos
	    // - 1" is also similar to the previous byte at "cnd", so
	    // record that here.
	    cnd++;
	    t[pos] = cnd;
	    pos++;
	}
	else if (cnd > 0) {
	    // If there was a previous match, fall back to previous
	    // match.
	    cnd = t[cnd];
	}
	else {
	    // cnd == 0, so no similarities. This table entry goes to
	    // the beginning of the string.
	    t[pos] = 0;
	    // Go on with the next byte.
	    pos++;
	}
    }
    return kmp_status_ok;
}

kmp_status_t
kmp_free (kmp_t * kmp)
{
    free (kmp->table);
    return kmp_status_ok;
}

/* Free the memory associated with a set of results. */

kmp_status_t
kmp_result_free (kmp_result_t * r)
{
    while (r) {
	kmp_result_t * next;
	next = r->next;
	free (r);
	r = next;
    }
    return kmp_status_ok;
}

(download)

Here is a test script which uses this simple C tap test to test it using the Test Anything Protocol

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>

#include "kmp.h"
#include "c-tap-test.h"

/* Consistently initialise a content type. */

#define FC(x,s) fc_t x = {			\
	.contents = (unsigned char *) s,	\
	.len = strlen (s),			\
    }

/* Test it doesn't blow up on one-byte search strings. */

static void
short_test ()
{
    kmp_result_t * results;
    kmp_result_t * r;
    kmp_t sho = {
	.needle = (unsigned char *) "a",
	.length = 1,
    };
    FC(fc, "I wandered lonely as a cloud");
    TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok);
    r = results;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 3);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 18);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 21);
    TAP_TEST_EQUAL (r->next, 0);
    TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok);
}

/* Test that it can match subsequent identical strings with no gap. */

static void
close_test ()
{
    kmp_result_t * results;
    kmp_result_t * r;
    kmp_t sho = {
	.needle = (unsigned char *) "ill",
	.length = 3,
    };
    FC(fc, "illillillbillfillill");
    TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok);
    r = results;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 0);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 3);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 6);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 10);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 14);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 1);
    TAP_TEST_EQUAL (r->offset, 17);
    TAP_TEST_EQUAL (r->next, 0);
    TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok);
}

/* Test the generation of line numbers. */

static void
line_test ()
{
    kmp_result_t * results;
    kmp_result_t * r;
    kmp_t sho = {
	.needle = (unsigned char *) "il",
	.length = 2,
    };
    FC(fc, 
       "I wandered lonely as a cloud\n"
       "That floats on high o'er vales and hills,\n"
       "When all at once I saw a crowd,\n"
       "A host, of golden daffodils;\n"
       "Beside the lake, beneath the trees,\n"
       "Fluttering and dancing in the breeze.\n");
    TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok);
    r = results;
    TAP_TEST_EQUAL (r->line, 2);
    TAP_TEST_EQUAL (r->offset, 65);
    r = r->next;
    TAP_TEST_EQUAL (r->line, 4);
    TAP_TEST_EQUAL (r->offset, 127);
    TAP_TEST_EQUAL (r->next, 0);
    TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok);
}

/* Test what happens when the search string does not match. */

static void
no_match_test ()
{
    kmp_result_t * results;
    kmp_t sho = {
	.needle = (unsigned char *) "monkey",
	.length = strlen ("monkey"),
    };
    FC(fc, "I wandered lonely as a cloud\n");
    TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_ok);
    TAP_TEST_EQUAL (results, 0);
    TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_result_free (results), kmp_status_ok);
}

/* Test what happens when the search string is longer than the thing
   to search. */

static void
too_short_test ()
{
    kmp_result_t * results;
    kmp_t sho = {
	.needle = (unsigned char *) "monkey",
	.length = strlen ("monkey"),
    };
    FC(fc, "monke");
    TAP_TEST_EQUAL (kmp_make_table (& sho), kmp_status_ok);
    TAP_TEST_EQUAL (kmp_search (& fc, & sho, & results), kmp_status_too_short);
    TAP_TEST_EQUAL (results, 0);
    TAP_TEST_EQUAL (kmp_free (& sho), kmp_status_ok);
}

/* Test the table comes out the same as in the Wikipedia example. */

static void
table_test ()
{
    int i;
    kmp_t x = {
	.needle = (unsigned char *) "PARTICIPATE IN PARACHUTE",
	.length = strlen ("PARTICIPATE IN PARACHUTE"),
    };
    int table[] = {
	-1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0
    };
    TAP_TEST_EQUAL (kmp_make_table (& x), kmp_status_ok);
    for (i = 0; i < sizeof (table) / sizeof (int); i++) {
	TAP_TEST_EQUAL (x.table[i], table[i]);
    }
    TAP_TEST_EQUAL (kmp_free (& x), kmp_status_ok);
}

/* Run the tests then print a plan. */

int main ()
{
    short_test ();
    close_test ();
    line_test ();
    no_match_test ();
    too_short_test ();
    table_test ();
    TAP_PLAN;
    return 0;
}

(download)

The Makefile is as follows.

C=kmp.c
CFLAGS=-Wall -g -O
TESTOBJS=kmp.o kmp-test.o

all: test

test: kmp-test
        prove --nocolor ./kmp-test

kmp-test: kmp.o kmp-test.o
        $(CC) $(CFLAGS) -o $@ $(TESTOBJS)

kmp.o: kmp.c kmp.h

kmp-test.o: kmp-test.c kmp.h c-tap-test.h

kmp.h: kmp.c
        cfunctions kmp.c

CTT=/home/ben/projects/c-tap-test

c-tap-test.h: $(CTT)/$@
        if [ -f $@ ]; then chmod 0644 $@; fi
        cp -f $(CTT)/$@ $@
        chmod 0444 $@

clean:
        -rm -f kmp kmp.o kmp-test.o kmp-test kmp.h c-tap-test.h kmp.txt




(download)

The .h file, which is automatically generated, looks like this:

/*
    This file was generated by the command

       cfunctions kmp.c

    at 17:14:02 on Tuesday 31 January 2017, local time.
*/
#ifndef CFH_KMP_H
#define CFH_KMP_H

/* From 'kmp.c': */

#line 5 "kmp.c"
typedef struct kmp_result kmp_result_t;
struct kmp_result {
    
    unsigned int offset;
    
    unsigned int line;
    
    kmp_result_t * next;
} 

#line 20 "kmp.c"
;
typedef struct kmp
{
    /* The thing we are matching. */
    const unsigned char * needle;
    /* The number of bytes in "needle". */
    int length;
    /* A table of length "length" containing a decomposition of
       "needle" according to the Knuth-Morris-Pratt algorithm. */
    int * table;
}
kmp_t;
typedef struct fc
{
    /* File name for error messages. */
    const char * name;
    /* File contents. */
    unsigned char * contents;
    /* Length of file. */
    size_t len;
}
fc_t;
typedef enum
{
    kmp_status_ok,
    kmp_status_memory_failure,
    kmp_status_too_short,
    kmp_status_overflow,
    kmp_status_empty_input,
}
kmp_status_t;

#line 79 "kmp.c"
kmp_status_t kmp_search (const fc_t * fc , const kmp_t * k , kmp_result_t ** result_ptr );

#line 167 "kmp.c"
kmp_status_t kmp_make_table (kmp_t * k );

#line 223 "kmp.c"
kmp_status_t kmp_free (kmp_t * kmp );

#line 232 "kmp.c"
kmp_status_t kmp_result_free (kmp_result_t * r );

#endif /* CFH_KMP_H */

(download)


Copyright © Ben Bullock 2009-2017. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com) or use the discussion group at Google Groups. / Privacy / Disclaimer