The Boyer-Moore string matching algorithm in C

This C program implements the Boyer-Moore string searching algorithm.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdarg.h>
#include "bm.h"

#ifdef HEADER

typedef struct bm
{
    /* The string to search for. */
    const unsigned char * needle;
    /* The length of "needle" in bytes. */
    int len;
    /* The offset of the first byte of where "needle" was found in the
       searched string. */
    int offset;
    /* True if "needle" was found within the searched string, false
       otherwise. */
    unsigned int found : 1;
    /* True if initialised correctly. */
    unsigned int init : 1;
    /* The length into "needle" where each byte is last found. */
    int delta1[0x100];
    int * delta2;
}
bm_t;

typedef enum
{
    bm_status_ok,
    bm_status_memory_failure,
    /* Over or under flow. */
    bm_status_bounds,
    /* Uninitialised structure. */
    bm_status_uninit,
}
bm_status_t;

#endif /* def HEADER */

/* https://www.lemoda.net/c/die/index.html */

static void
error (const char * format, ...)
{
    va_list vargs;
    va_start (vargs, format);
    vfprintf (stderr, format, vargs);
    fprintf (stderr, ".\n");
    va_end (vargs);
    exit (EXIT_FAILURE);
}

/* Print messages if necessary to debug. */

#if 0 || defined(TEST)
#define MESSAGE(...) {				\
	fprintf (stderr, "# %s:%d: ",		\
		 __FILE__, __LINE__);		\
	fprintf (stderr, __VA_ARGS__);		\
	fprintf (stderr, ".\n");		\
    }
#else
#define MESSAGE(...)
#endif /* 0 */

/* Given "word" of length "wordlen", is word[0..pos-1] the same as
   word[wordlen - pos - 1 .. wordlen - 1], in other words is the end
   of the string from pos onwards a prefix of the string. */

static int
is_prefix (const unsigned char *word, int wordlen, int pos)
{
    int i;
    int suffixlen = wordlen - pos;
    for (i = 0; i < suffixlen; i++) {
        if (word[i] != word[pos + i]) {
	    MESSAGE ("%d: %c != %c", i, word[i], word[pos + i]);
            return 0;
        }
    }
    MESSAGE ("%s %d %.*s It's a prefix", word, suffixlen, suffixlen, word);
    return 1;
}

/* Find the longest suffix of "bm->needle" which is the same as
   "bm->needle[pos]". */

static bm_status_t
suffix_length (const bm_t * bm, int pos, int * length_ptr)
{
    int i;
    int end;
    if (pos >= bm->len) {
	error ("bad position %d >= %d", pos, bm->len);
	return bm_status_bounds;
    }
    if (pos <= 0) {
	error ("meaningless position %d <= 0", pos);
	return bm_status_bounds;
    }
    MESSAGE ("suffix_length (%s, %d, %d)", bm->needle, bm->len, pos);
    end = bm->len - 1;
    for (i = 0; i < pos; i++) {
	if (bm->needle[pos - i] != bm->needle[end - i]) {
	    MESSAGE ("%d %c != %d %c", 
		     pos - i, bm->needle[pos - i], end - i,
		     bm->needle[end - i]);
	    break;
	}
    }
    MESSAGE ("%d %.*s suffix %.*s", 
	     i, i, bm->needle + pos - i, i, bm->needle + bm->len - 1 - i);
    * length_ptr = i;
    return bm_status_ok;
}

/* Fill in the delta2 table of "bm" using the given search string. */

static bm_status_t
make_delta2 (bm_t * bm)
{
    int p;
    int last_prefix_index;

    /* Allocate memory for bm->delta2. */

    bm->delta2 = calloc (bm->len, sizeof (* bm->delta2));
    if (! bm->delta2) {
	error ("No memory");
	return bm_status_memory_failure;
    }
    /* "bm->needle" is always a prefix of itself, so
       "last_prefix_index" starts at "bm->len". */

    last_prefix_index = bm->len;

    for (p = bm->len - 2; p >= 0; p--) {
	/* If the first p+1 characters of bm->needle are the same as
	   the final p+1 characters of bm->needle, the last prefix
	   index is p+1, in other words we can shift the string down
	   by p+1 characters. */
        if (is_prefix (bm->needle, bm->len, p + 1)) {
            last_prefix_index = p + 1;
        }
	/* In the case of a mismatch at character "p", we can jump
	   over this many characters to find the next match. */
        bm->delta2[p] = last_prefix_index + bm->len - 1 - p;
    }

    /* Search for common substrings which are not prefixes. */

    for (p = 1; p < bm->len - 1; p++) {
        int slen;
	int offset;
	bm_status_t status;

	status = suffix_length (bm, p, & slen);
	if (status != bm_status_ok) {
	    error ("Bad status %d getting suffix length (%s, %d, %d)",
		   status, bm->needle, bm->len, p);
	    return status;
	}
	offset = bm->len - 1 - slen;
        if (bm->needle[p - slen] != bm->needle[offset]) {
	    MESSAGE ("%s: Mismatch %c, %c at %d %d",
		     bm->needle, bm->needle[p - slen],
		     bm->needle[offset], p-slen, offset);
            bm->delta2[offset] = bm->len - 1 - p + slen;
        }
    }
    bm->init = 1;
    return bm_status_ok;
}

/* Quiet some compiler warnings. */

static int ustrlen (const unsigned char * u)
{
    return strlen ((const char *) u);
}

/* Given "needle", allocate the values of "bm". */

bm_status_t 
bm_prepare (bm_t * bm, const unsigned char * needle)
{
    int i;

    bm->found = 0;
    bm->init = 0;
    bm->needle = needle;
    bm->len = ustrlen (needle);
    for (i = 0; i < 0x100; i++) {
	/* 'If "char" does not occur in "pat", "delta_1" is "patlen".'
	   [BM p.763] */
	bm->delta1[i] = bm->len;
    }
    for (i = 0; i < bm->len; i++) {
	/* 'If "char" does occur in "pat", "delta_1" is the difference
	   between "patlen" and the position of the rightmost
	   occurrence of "char" in "pat".' [BM p.763] */
	bm->delta1[needle[i]] = bm->len - 1 - i;
    }
    return make_delta2 (bm);
}

bm_status_t
bm_free (bm_t * bm)
{
    free (bm->delta2);
    bm->delta2 = 0;
    return bm_status_ok;
}

static int
delta2 (bm_t * bm, int j)
{
    if (j >= bm->len) {
	error ("index %d >= pattern length %d", j, bm->len);
    }
    return bm->delta2[j];
}

/* Search for bm->needle in "haystack". */

bm_status_t
bm_search (bm_t * bm, const unsigned char * haystack)
{
    int stringlen;
    int i;
    int j;

    if (! bm->init) {
	error ("Uninitialised structure");
	return bm_status_uninit;
    }
    stringlen = ustrlen (haystack);
    i = bm->len - 1;
    while (1) {
	int d1;
	int d2;

	if (i > stringlen) {
	    bm->found = 0;
	    return bm_status_ok;
	}
	MESSAGE ("pointer = %d, comparing %.*s",
		 i, bm->len, haystack + i - (bm->len - 1));
	j = bm->len - 1;
	while (bm->needle[j] == haystack[i]) {
	    /* 'we have matched all of "pat" (and thus have succeeded
	       in finding a match)' [BM p.763] */
	    if (j == 0) {
		bm->found = 1;
		bm->offset = i;
		return bm_status_ok;
	    }
	    j--;
	    i--;
	}
	/* 'we come to a mismatch at some new "char" after matching
	   the last "m" characters of "pat".' [BM p.763] */
	d1 = bm->delta1[haystack[i]];
	d2 = delta2 (bm, j);
	if (d1 > d2) {
	    /* Move the pointer forward to the next possible place
	       which could match, disregarding suffix/prefix. */
	    MESSAGE ("delta_1 > delta_2: jumping by %d for char %c", d1,
		     haystack[i]);
	    i += d1;
	}
	else {
	    MESSAGE ("delta_2 >= delta_1: jumping ahead by %d", d2);
	    i += d2;
	}
    }
    return bm_status_ok;
}

#ifdef TEST

/*  _____         _   _             
   |_   _|__  ___| |_(_)_ __   __ _ 
     | |/ _ \/ __| __| | '_ \ / _` |
     | |  __/\__ \ |_| | | | | (_| |
     |_|\___||___/\__|_|_| |_|\__, |
                              |___/  */


#include "c-tap-test.h"

#define CALL_OK(X) {						\
	bm_status_t status;					\
	status = X;						\
	TAP_TEST_EQUAL (status, bm_status_ok);			\
    }

static void
is_prefix_test ()
{
    const unsigned char * test;
    test = (const unsigned char *) "ABCDABC";
    TAP_TEST_EQUAL (is_prefix (test, 7, 7), 1);
    TAP_TEST_EQUAL (is_prefix (test, 7, 3), 0);
    TAP_TEST_EQUAL (is_prefix (test, 7, 4), 1);
}

static void
suffix_length_test ()
{
    const unsigned char * test;
    int slen;
    bm_t bm = {0};
    test = (const unsigned char *) "ABCDABC";
    bm.len = ustrlen (test);
    bm.needle = test;
    MESSAGE ("suffix length test for %s", test);
    CALL_OK (suffix_length (& bm, 6, & slen));
    TAP_TEST_EQUAL (slen, 6);
    CALL_OK (suffix_length (& bm, 2, & slen));
    TAP_TEST_EQUAL (slen, 2);
    CALL_OK (suffix_length (& bm, 4, & slen));
    /* DABC is not the same as ABCD. */
    TAP_TEST_EQUAL (slen, 0);
}

/* This tests whether delta_2 contains the same values as those of [BM
   p. 765]. */

static void
bm_prepare_test ()
{
    bm_t bm;
    int d2[] = {14, 13, 12, 11, 10, 9, 11, 10, 1};
    int d3[] = {17, 16, 15, 14, 13, 12, 7, 10, 1};
    int i;
    bm_prepare (& bm, (unsigned char *) "ABCXXXABC");
    for (i = 0; i < sizeof (d2) / sizeof (int); i++) {
	TAP_TEST_EQUAL (bm.delta2[i], d2[i]);
    }
    bm_free (& bm);
    bm_prepare (& bm, (unsigned char *) "ABYXCDEYX");
    for (i = 0; i < sizeof (d3) / sizeof (int); i++) {
	TAP_TEST_EQUAL (bm.delta2[i], d3[i]);
    }
    bm_free (& bm);
    /* This very simple case is not in Boyer-Moore's original
       paper. */
    bm_prepare (& bm, (unsigned char *) "ABCDEFGHIJK");
    for (i = 0; i < bm.len - 1; i++) {
	TAP_TEST_EQUAL (bm.delta2[i], 2 * bm.len - i - 1);
    }
    bm_free (& bm);
}


int main ()
{
    bm_t bm;

    /* [BM p.764]. */

    const unsigned char * needle = (unsigned char *) "AT-THAT";
    const unsigned char * haystack = (unsigned char *) "WHICH-FINALLY-HALTS.--AT-THAT-POINT";

    const unsigned char * needle1 = (unsigned char *) "needle";
    const unsigned char * haystack1 = (unsigned char *)
       	"For your own ladies and pale-visag'd maids,\n"
       	"Like Amazons, come tripping after drums,\n"
       	"Their thimbles into armed gauntlets change,\n"
       	"Their needles to lances, and their gentle hearts\n"
       	"To fierce and bloody inclination.\n";

    is_prefix_test ();
    suffix_length_test ();
    exit (0);
    bm_prepare_test ();

    bm_prepare (& bm, needle1);
    bm_search (& bm, haystack1);
    TAP_TEST (bm.found);
    bm_free (& bm);
    bm_prepare (& bm, needle);
    bm_search (& bm, haystack);
    TAP_TEST (bm.found);
    bm_free (& bm);
    TAP_PLAN;
    return 0;
}

#endif /* def TEST */

(download)

The header file looks like this.

/*
This file was generated by the following command:

cfunctions bm.c

*/
#ifndef CFH_BM_H
#define CFH_BM_H

#line 7 "bm.c"
typedef struct bm
{
    /* The string to search for. */
    const unsigned char * needle;
    /* The length of "needle" in bytes. */
    int len;
    /* The offset of the first byte of where "needle" was found in the
       searched string. */
    int offset;
    /* True if "needle" was found within the searched string, false
       otherwise. */
    unsigned int found : 1;
    /* True if initialised correctly. */
    unsigned int init : 1;
    /* The length into "needle" where each byte is last found. */
    int delta1[0x100];
    int * delta2;
}
bm_t;
typedef enum
{
    bm_status_ok,
    bm_status_memory_failure,
    /* Over or under flow. */
    bm_status_bounds,
    /* Uninitialised structure. */
    bm_status_uninit,
}
bm_status_t;

#line 153 "bm.c"
bm_status_t bm_prepare (bm_t* bm, const unsigned char* needle);

#line 173 "bm.c"
bm_status_t bm_free (bm_t* bm);

#line 192 "bm.c"
bm_status_t bm_search (bm_t* bm, const unsigned char* haystack);

#endif /* CFH_BM_H */

(download)

The makefile looks like this.

CFLAGS=-Wall -g -O

all: bm.txt

bm.txt: bm-example
        ./bm-example > $@

bm-example: bm-example.c bm.c bm.h
        $(CC) $(CFLAGS) -o $@ bm.c bm-example.c

test: bm-test
        prove ./bm-test

bm-test: bm.c bm.h c-tap-test.h
        $(CC) -DTEST $(CFLAGS) -o $@ bm.c

bm.h: bm.c
        cfunctions bm.c

CTT=/home/ben/projects/c-tap-test

c-tap-test.h: $(CTT)/$@
        cp $(CTT)/$@ $@

clean:
        -rm -f bm-test bm.txt c-tap-test.h bm-example *.o bm.h
        purge

(download)

Here is a simple example of its use.

#include <stdio.h>
#include "bm.h"

#define CALL(X) {						\
	bm_status_t status;					\
	status = X;						\
	if (status != bm_status_ok) {				\
	    fprintf (stderr, "Bad status %d from %s.\n",	\
		     status, #X);				\
	    return -1;						\
	}							\
    }

int main ()
{
    bm_t bm;
    const unsigned char * needle1 = (unsigned char *) "needle";
    const unsigned char * haystack1 = (unsigned char *)
       	"For your own ladies and pale-visag'd maids,\n"
       	"Like Amazons, come tripping after drums,\n"
       	"Their thimbles into armed gauntlets change,\n"
       	"Their needles to lances, and their gentle hearts\n"
       	"To fierce and bloody inclination.\n";
    CALL (bm_prepare (& bm, needle1));
    CALL (bm_search (& bm, haystack1));
    if (bm.found) {
	printf ("Found at byte %d.\n", bm.offset);
    }
    else {
	printf ("Not found.\n");
    }
    CALL (bm_free (& bm));
    return 0;
}

(download)

The output of the example looks like this:

Found at byte 135.

The [BM p.123] quotes in the source code above refer to the location of the quote in Boyer and Moore's paper, The Boyer-Moore Fast String Searching Algorithm.


Copyright © Ben Bullock 2009-2018. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com) or use the discussion group at Google Groups. / Privacy / Disclaimer