Levenshtein edit distance with a maximum

This is an example C program which demonstrates the Levenshtein edit distance with a cutoff or maximum. This increases the speed of the algorithm by reducing the number of unnecessary calculations.

#include <string.h>
#include <stdio.h>
/* For INT_MAX */
#include <limits.h>
#include "edit-distance.h"

int distance (const char * word1,
              int len1,
              const char * word2,
              int len2,
              int max)
{
    int matrix[2][len2 + 1];
    int i;
    int j;

    /*
      Initialize the 0 row of "matrix".

        0  
        1  
        2  
        3  

     */

    for (j = 0; j <= len2; j++) {
        matrix[0][j] = j;
    }

    /* Loop over column. */
    for (i = 1; i <= len1; i++) {
        char c1;
        /* The first value to consider of the ith column. */
        int min_j;
        /* The last value to consider of the ith column. */
        int max_j;
        /* The smallest value of the matrix in the ith column. */
        int col_min;
        /* The next column of the matrix to fill in. */
        int next;
        /* The previously-filled-in column of the matrix. */
        int prev;

        c1 = word1[i-1];
        min_j = 1;
        if (i > max) {
            min_j = i - max;
        }
        max_j = len2;
        if (len2 > max + i) {
            max_j = max + i;
        }
        col_min = INT_MAX;
        next = i % 2;
        if (next == 1) {
            prev = 0;
        }
        else {
            prev = 1;
        }
        matrix[next][0] = i;
        /* Loop over rows. */
        for (j = 1; j <= len2; j++) {
            if (j < min_j || j > max_j) {
                /* Put a large value in there. */
                matrix[next][j] = max + 1;
            }
            else {
                char c2;

                c2 = word2[j-1];
                if (c1 == c2) {
                    /* The character at position i in word1 is the same as
                       the character at position j in word2. */
                    matrix[next][j] = matrix[prev][j-1];
                }
                else {
                    /* The character at position i in word1 is not the
                       same as the character at position j in word2, so
                       work out what the minimum cost for getting to cell
                       i, j is. */
                    int delete;
                    int insert;
                    int substitute;
                    int minimum;

                    delete = matrix[prev][j] + 1;
                    insert = matrix[next][j-1] + 1;
                    substitute = matrix[prev][j-1] + 1;
                    minimum = delete;
                    if (insert < minimum) {
                        minimum = insert;
                    }
                    if (substitute < minimum) {
                        minimum = substitute;
                    }
                    matrix[next][j] = minimum;
                }
            }
            /* Find the minimum value in the ith column. */
            if (matrix[next][j] < col_min) {
                col_min = matrix[next][j];
            }
        }
        if (col_min > max) {
            /* All the elements of the ith column are greater than the
               maximum, so no match less than or equal to max can be
               found by looking at succeeding columns. */
            return max + 1;
        }
    }
    return matrix[len1 % 2][len2];
}

(download)

A test program as follows:

#include <string.h>
#include <stdio.h>
#include "edit-distance.h"


struct pair {
    char * left;
    char * right;
    int distance;
}
pairs [] = {
    {"cat", "cut", 1},
    {"Miyagi", "Miyagawa", 3},
    {"Monster::Baby", "Manitor:;Bobi", 6},
};

int n_pairs = sizeof (pairs) / sizeof (struct pair);

static void run (int max)
{
    int i;
    printf ("Setting maximum allowed distance to %d.\n", max);
    for (i = 0; i < n_pairs; i++) {
        int left_len;
        int right_len;
        char * left;
        char * right;
        int d;

        left = pairs[i].left;
        right = pairs[i].right;
        left_len = strlen (left);
        right_len = strlen (right);
        d = distance (left, left_len, right, right_len, max);
        if (d != pairs[i].distance && pairs[i].distance <= max) {
            fprintf (stderr, "Error with distance for %s/%s: "
                     "expect %d got %d\n",
                     left, right, pairs[i].distance, d);
        }
        else {
            printf ("OK %s/%s: %d\n", left, right, d);
        }
    }
}

int main ()
{
    int max;

    max = 10;
    run (max);
    max = 6;
    run (max);
    max = 3;
    run (max);
    max = 1;
    run (max);
    return 0;
}

(download)

The output of the example looks like this:

Setting maximum allowed distance to 10.
OK cat/cut: 1
OK Miyagi/Miyagawa: 3
OK Monster::Baby/Manitor:;Bobi: 6
Setting maximum allowed distance to 6.
OK cat/cut: 1
OK Miyagi/Miyagawa: 3
OK Monster::Baby/Manitor:;Bobi: 6
Setting maximum allowed distance to 3.
OK cat/cut: 1
OK Miyagi/Miyagawa: 3
OK Monster::Baby/Manitor:;Bobi: 4
Setting maximum allowed distance to 1.
OK cat/cut: 1
OK Miyagi/Miyagawa: 2
OK Monster::Baby/Manitor:;Bobi: 2


Copyright © Ben Bullock 2009-2024. All rights reserved. For comments, questions, and corrections, please email Ben Bullock (benkasminbullock@gmail.com). / Privacy / Disclaimer