#include #include #include #include #include #include "bm.h" #ifdef HEADER typedef struct bm { /* The string to search for. */ const unsigned char * needle; /* The length of "needle" in bytes. */ int len; /* The offset of the first byte of where "needle" was found in the searched string. */ int offset; /* True if "needle" was found within the searched string, false otherwise. */ unsigned int found : 1; /* True if initialised correctly. */ unsigned int init : 1; /* The length into "needle" where each byte is last found. */ int delta1[0x100]; int * delta2; } bm_t; typedef enum { bm_status_ok, bm_status_memory_failure, /* Over or under flow. */ bm_status_bounds, /* Uninitialised structure. */ bm_status_uninit, } bm_status_t; #endif /* def HEADER */ /* https://www.lemoda.net/c/die/index.html */ static void error (const char * format, ...) { va_list vargs; va_start (vargs, format); vfprintf (stderr, format, vargs); fprintf (stderr, ".\n"); va_end (vargs); exit (EXIT_FAILURE); } /* Print messages if necessary to debug. */ #if 0 || defined(TEST) #define MESSAGE(...) { \ fprintf (stderr, "# %s:%d: ", \ __FILE__, __LINE__); \ fprintf (stderr, __VA_ARGS__); \ fprintf (stderr, ".\n"); \ } #else #define MESSAGE(...) #endif /* 0 */ /* Given "word" of length "wordlen", is word[0..pos-1] the same as word[wordlen - pos - 1 .. wordlen - 1], in other words is the end of the string from pos onwards a prefix of the string. */ static int is_prefix (const unsigned char *word, int wordlen, int pos) { int i; int suffixlen = wordlen - pos; for (i = 0; i < suffixlen; i++) { if (word[i] != word[pos + i]) { MESSAGE ("%d: %c != %c", i, word[i], word[pos + i]); return 0; } } MESSAGE ("%s %d %.*s It's a prefix", word, suffixlen, suffixlen, word); return 1; } /* Find the longest suffix of "bm->needle" which is the same as "bm->needle[pos]". */ static bm_status_t suffix_length (const bm_t * bm, int pos, int * length_ptr) { int i; int end; if (pos >= bm->len) { error ("bad position %d >= %d", pos, bm->len); return bm_status_bounds; } if (pos <= 0) { error ("meaningless position %d <= 0", pos); return bm_status_bounds; } MESSAGE ("suffix_length (%s, %d, %d)", bm->needle, bm->len, pos); end = bm->len - 1; for (i = 0; i < pos; i++) { if (bm->needle[pos - i] != bm->needle[end - i]) { MESSAGE ("%d %c != %d %c", pos - i, bm->needle[pos - i], end - i, bm->needle[end - i]); break; } } MESSAGE ("%d %.*s suffix %.*s", i, i, bm->needle + pos - i, i, bm->needle + bm->len - 1 - i); * length_ptr = i; return bm_status_ok; } /* Fill in the delta2 table of "bm" using the given search string. */ static bm_status_t make_delta2 (bm_t * bm) { int p; int last_prefix_index; /* Allocate memory for bm->delta2. */ bm->delta2 = calloc (bm->len, sizeof (* bm->delta2)); if (! bm->delta2) { error ("No memory"); return bm_status_memory_failure; } /* "bm->needle" is always a prefix of itself, so "last_prefix_index" starts at "bm->len". */ last_prefix_index = bm->len; for (p = bm->len - 2; p >= 0; p--) { /* If the first p+1 characters of bm->needle are the same as the final p+1 characters of bm->needle, the last prefix index is p+1, in other words we can shift the string down by p+1 characters. */ if (is_prefix (bm->needle, bm->len, p + 1)) { last_prefix_index = p + 1; } /* In the case of a mismatch at character "p", we can jump over this many characters to find the next match. */ bm->delta2[p] = last_prefix_index + bm->len - 1 - p; } /* Search for common substrings which are not prefixes. */ for (p = 1; p < bm->len - 1; p++) { int slen; int offset; bm_status_t status; status = suffix_length (bm, p, & slen); if (status != bm_status_ok) { error ("Bad status %d getting suffix length (%s, %d, %d)", status, bm->needle, bm->len, p); return status; } offset = bm->len - 1 - slen; if (bm->needle[p - slen] != bm->needle[offset]) { MESSAGE ("%s: Mismatch %c, %c at %d %d", bm->needle, bm->needle[p - slen], bm->needle[offset], p-slen, offset); bm->delta2[offset] = bm->len - 1 - p + slen; } } bm->init = 1; return bm_status_ok; } /* Quiet some compiler warnings. */ static int ustrlen (const unsigned char * u) { return strlen ((const char *) u); } /* Given "needle", allocate the values of "bm". */ bm_status_t bm_prepare (bm_t * bm, const unsigned char * needle) { int i; bm->found = 0; bm->init = 0; bm->needle = needle; bm->len = ustrlen (needle); for (i = 0; i < 0x100; i++) { /* 'If "char" does not occur in "pat", "delta_1" is "patlen".' [BM p.763] */ bm->delta1[i] = bm->len; } for (i = 0; i < bm->len; i++) { /* 'If "char" does occur in "pat", "delta_1" is the difference between "patlen" and the position of the rightmost occurrence of "char" in "pat".' [BM p.763] */ bm->delta1[needle[i]] = bm->len - 1 - i; } return make_delta2 (bm); } bm_status_t bm_free (bm_t * bm) { free (bm->delta2); bm->delta2 = 0; return bm_status_ok; } static int delta2 (bm_t * bm, int j) { if (j >= bm->len) { error ("index %d >= pattern length %d", j, bm->len); } return bm->delta2[j]; } /* Search for bm->needle in "haystack". */ bm_status_t bm_search (bm_t * bm, const unsigned char * haystack) { int stringlen; int i; int j; if (! bm->init) { error ("Uninitialised structure"); return bm_status_uninit; } stringlen = ustrlen (haystack); i = bm->len - 1; while (1) { int d1; int d2; if (i > stringlen) { bm->found = 0; return bm_status_ok; } MESSAGE ("pointer = %d, comparing %.*s", i, bm->len, haystack + i - (bm->len - 1)); j = bm->len - 1; while (bm->needle[j] == haystack[i]) { /* 'we have matched all of "pat" (and thus have succeeded in finding a match)' [BM p.763] */ if (j == 0) { bm->found = 1; bm->offset = i; return bm_status_ok; } j--; i--; } /* 'we come to a mismatch at some new "char" after matching the last "m" characters of "pat".' [BM p.763] */ d1 = bm->delta1[haystack[i]]; d2 = delta2 (bm, j); if (d1 > d2) { /* Move the pointer forward to the next possible place which could match, disregarding suffix/prefix. */ MESSAGE ("delta_1 > delta_2: jumping by %d for char %c", d1, haystack[i]); i += d1; } else { MESSAGE ("delta_2 >= delta_1: jumping ahead by %d", d2); i += d2; } } return bm_status_ok; } #ifdef TEST /* _____ _ _ |_ _|__ ___| |_(_)_ __ __ _ | |/ _ \/ __| __| | '_ \ / _` | | | __/\__ \ |_| | | | | (_| | |_|\___||___/\__|_|_| |_|\__, | |___/ */ #include "c-tap-test.h" #define CALL_OK(X) { \ bm_status_t status; \ status = X; \ TAP_TEST_EQUAL (status, bm_status_ok); \ } static void is_prefix_test () { const unsigned char * test; test = (const unsigned char *) "ABCDABC"; TAP_TEST_EQUAL (is_prefix (test, 7, 7), 1); TAP_TEST_EQUAL (is_prefix (test, 7, 3), 0); TAP_TEST_EQUAL (is_prefix (test, 7, 4), 1); } static void suffix_length_test () { const unsigned char * test; int slen; bm_t bm = {0}; test = (const unsigned char *) "ABCDABC"; bm.len = ustrlen (test); bm.needle = test; MESSAGE ("suffix length test for %s", test); CALL_OK (suffix_length (& bm, 6, & slen)); TAP_TEST_EQUAL (slen, 6); CALL_OK (suffix_length (& bm, 2, & slen)); TAP_TEST_EQUAL (slen, 2); CALL_OK (suffix_length (& bm, 4, & slen)); /* DABC is not the same as ABCD. */ TAP_TEST_EQUAL (slen, 0); } /* This tests whether delta_2 contains the same values as those of [BM p. 765]. */ static void bm_prepare_test () { bm_t bm; int d2[] = {14, 13, 12, 11, 10, 9, 11, 10, 1}; int d3[] = {17, 16, 15, 14, 13, 12, 7, 10, 1}; int i; bm_prepare (& bm, (unsigned char *) "ABCXXXABC"); for (i = 0; i < sizeof (d2) / sizeof (int); i++) { TAP_TEST_EQUAL (bm.delta2[i], d2[i]); } bm_free (& bm); bm_prepare (& bm, (unsigned char *) "ABYXCDEYX"); for (i = 0; i < sizeof (d3) / sizeof (int); i++) { TAP_TEST_EQUAL (bm.delta2[i], d3[i]); } bm_free (& bm); /* This very simple case is not in Boyer-Moore's original paper. */ bm_prepare (& bm, (unsigned char *) "ABCDEFGHIJK"); for (i = 0; i < bm.len - 1; i++) { TAP_TEST_EQUAL (bm.delta2[i], 2 * bm.len - i - 1); } bm_free (& bm); } int main () { bm_t bm; /* [BM p.764]. */ const unsigned char * needle = (unsigned char *) "AT-THAT"; const unsigned char * haystack = (unsigned char *) "WHICH-FINALLY-HALTS.--AT-THAT-POINT"; const unsigned char * needle1 = (unsigned char *) "needle"; const unsigned char * haystack1 = (unsigned char *) "For your own ladies and pale-visag'd maids,\n" "Like Amazons, come tripping after drums,\n" "Their thimbles into armed gauntlets change,\n" "Their needles to lances, and their gentle hearts\n" "To fierce and bloody inclination.\n"; is_prefix_test (); suffix_length_test (); exit (0); bm_prepare_test (); bm_prepare (& bm, needle1); bm_search (& bm, haystack1); TAP_TEST (bm.found); bm_free (& bm); bm_prepare (& bm, needle); bm_search (& bm, haystack); TAP_TEST (bm.found); bm_free (& bm); TAP_PLAN; return 0; } #endif /* def TEST */