#include #include #include "kmp.h" #ifdef HEADER /* One chain in a linked list of results of a search. */ typedef struct kmp_result kmp_result_t; struct kmp_result { /* Offset in bytes of the start of the match from the start of the string. */ unsigned int offset; /* The line number of the match. */ unsigned int line; /* The next match. */ kmp_result_t * next; }; /* The table for the Knuth-Morris-Pratt matching algorithm. */ typedef struct kmp { /* The thing we are matching. */ const unsigned char * needle; /* The number of bytes in "needle". */ int length; /* A table of length "length" containing a decomposition of "needle" according to the Knuth-Morris-Pratt algorithm. */ int * table; } kmp_t; /* Holder for file contents. */ typedef struct fc { /* File name for error messages. */ const char * name; /* File contents. */ unsigned char * contents; /* Length of file. */ size_t len; } fc_t; /* Return statuses of the functions in this module. */ typedef enum { kmp_status_ok, kmp_status_memory_failure, kmp_status_too_short, kmp_status_overflow, kmp_status_empty_input, } kmp_status_t; #endif /* def HEADER */ //#define DEBUG #ifdef DEBUG #define MESSAGE(x, ...) { \ printf (x, ## __VA_ARGS__); \ } #else #define MESSAGE(x, ...) #endif /* Match the contents of "fc" against "k", the table for the Knuth-Morris-Pratt matching algorithm. The results go into "* results_ptr". */ kmp_status_t kmp_search (const fc_t * fc, const kmp_t * k, kmp_result_t ** result_ptr) { // Offset into "fc->contents" int m = 0; // Offset into "k->needle". int i = 0; // Table of similarities const int * t = k->table; // Search string. const unsigned char * w = k->needle; /* For printing line numbers. */ int line = 1; // Haystack const unsigned char * s = fc->contents; size_t len = fc->len; /* Linked list of search results. */ kmp_result_t * result = 0; kmp_result_t * last = 0; if (len < k->length) { * result_ptr = 0; return kmp_status_too_short; } // We are going to increment "i" and "m" by more than one if there // is a similarity, so use a while rather than a for loop. while (m + i < len) { if (w[i] == s[m + i]) { // The two bytes at offset "i" in "w" and "m+i" in "s" // match. if (i == k->length - 1) { kmp_result_t * r; r = calloc (1, sizeof (kmp_result_t)); if (! r) { return kmp_status_memory_failure; } r->offset = m; r->line = line; r->next = 0; if (! result) { result = r; last = r; } else { last->next = r; last = r; } /* Put "m" after the match, and set "i" to zero. */ MESSAGE("matched at %d %d %d\n", line, m, i); m += i + 1; i = 0; } else { /* Continue comparing bytes. */ i++; } } else { if (i >= k->length) { return kmp_status_overflow; } // Increment the line counter and start point when '\n' is seen. if (s[m + i] == '\n') { line++; m = m + i + 1; i = 0; continue; } if (t[i] > -1) { // If the kmp table contains a similarity for the i'th // byte, we can jump over several bytes of input. m = m + i - t[i]; i = t[i]; } else { // There were no similarities, start from the // beginning of w again. i = 0; m++; } } } * result_ptr = result; return kmp_status_ok; } /* Make the table for the Knuth-Morris-Pratt matching algorithm. */ kmp_status_t kmp_make_table (kmp_t * k) { int * t; const unsigned char * w; // Position in t. int pos; // Next candidate substring. int cnd; if (! k->needle || k->length == 0) { return kmp_status_empty_input; } t = k->table = calloc (sizeof (int), k->length); if (! t) { return kmp_status_memory_failure; } /* If we are at the start of the string, there is nothing to go back to. */ t[0] = -1; /* Check we have enough bytes. */ if (k->length < 2) { return kmp_status_ok; } /* If we are at the second byte of the string, we can only go back to the first byte. */ t[1] = 0; w = k->needle; pos = 2; cnd = 0; // Loop to the end of w. while (pos < k->length) { if (w[pos - 1] == w[cnd]) { // We are continuing a string of matches. The byte at "pos // - 1" is also similar to the previous byte at "cnd", so // record that here. cnd++; t[pos] = cnd; pos++; } else if (cnd > 0) { // If there was a previous match, fall back to previous // match. cnd = t[cnd]; } else { // cnd == 0, so no similarities. This table entry goes to // the beginning of the string. t[pos] = 0; // Go on with the next byte. pos++; } } return kmp_status_ok; } kmp_status_t kmp_free (kmp_t * kmp) { free (kmp->table); return kmp_status_ok; } /* Free the memory associated with a set of results. */ kmp_status_t kmp_result_free (kmp_result_t * r) { while (r) { kmp_result_t * next; next = r->next; free (r); r = next; } return kmp_status_ok; }