1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include "levenshtein.h" |
3 | #include <errno.h> |
4 | #include <stdlib.h> |
5 | #include <string.h> |
6 | |
7 | /* |
8 | * This function implements the Damerau-Levenshtein algorithm to |
9 | * calculate a distance between strings. |
10 | * |
11 | * Basically, it says how many letters need to be swapped, substituted, |
12 | * deleted from, or added to string1, at least, to get string2. |
13 | * |
14 | * The idea is to build a distance matrix for the substrings of both |
15 | * strings. To avoid a large space complexity, only the last three rows |
16 | * are kept in memory (if swaps had the same or higher cost as one deletion |
17 | * plus one insertion, only two rows would be needed). |
18 | * |
19 | * At any stage, "i + 1" denotes the length of the current substring of |
20 | * string1 that the distance is calculated for. |
21 | * |
22 | * row2 holds the current row, row1 the previous row (i.e. for the substring |
23 | * of string1 of length "i"), and row0 the row before that. |
24 | * |
25 | * In other words, at the start of the big loop, row2[j + 1] contains the |
26 | * Damerau-Levenshtein distance between the substring of string1 of length |
27 | * "i" and the substring of string2 of length "j + 1". |
28 | * |
29 | * All the big loop does is determine the partial minimum-cost paths. |
30 | * |
31 | * It does so by calculating the costs of the path ending in characters |
32 | * i (in string1) and j (in string2), respectively, given that the last |
33 | * operation is a substitution, a swap, a deletion, or an insertion. |
34 | * |
35 | * This implementation allows the costs to be weighted: |
36 | * |
37 | * - w (as in "sWap") |
38 | * - s (as in "Substitution") |
39 | * - a (for insertion, AKA "Add") |
40 | * - d (as in "Deletion") |
41 | * |
42 | * Note that this algorithm calculates a distance _iff_ d == a. |
43 | */ |
44 | int levenshtein(const char *string1, const char *string2, |
45 | int w, int s, int a, int d) |
46 | { |
47 | int len1 = strlen(string1), len2 = strlen(string2); |
48 | int *row0 = malloc(sizeof(int) * (len2 + 1)); |
49 | int *row1 = malloc(sizeof(int) * (len2 + 1)); |
50 | int *row2 = malloc(sizeof(int) * (len2 + 1)); |
51 | int i, j; |
52 | |
53 | for (j = 0; j <= len2; j++) |
54 | row1[j] = j * a; |
55 | for (i = 0; i < len1; i++) { |
56 | int *dummy; |
57 | |
58 | row2[0] = (i + 1) * d; |
59 | for (j = 0; j < len2; j++) { |
60 | /* substitution */ |
61 | row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); |
62 | /* swap */ |
63 | if (i > 0 && j > 0 && string1[i - 1] == string2[j] && |
64 | string1[i] == string2[j - 1] && |
65 | row2[j + 1] > row0[j - 1] + w) |
66 | row2[j + 1] = row0[j - 1] + w; |
67 | /* deletion */ |
68 | if (row2[j + 1] > row1[j + 1] + d) |
69 | row2[j + 1] = row1[j + 1] + d; |
70 | /* insertion */ |
71 | if (row2[j + 1] > row2[j] + a) |
72 | row2[j + 1] = row2[j] + a; |
73 | } |
74 | |
75 | dummy = row0; |
76 | row0 = row1; |
77 | row1 = row2; |
78 | row2 = dummy; |
79 | } |
80 | |
81 | i = row1[len2]; |
82 | free(row0); |
83 | free(row1); |
84 | free(row2); |
85 | |
86 | return i; |
87 | } |
88 | |