1 | //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// |
9 | /// \file |
10 | /// This file defines a Levenshtein distance function that works for any two |
11 | /// sequences, with each element of each sequence being analogous to a character |
12 | /// in a string. |
13 | /// |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #ifndef LLVM_ADT_EDIT_DISTANCE_H |
17 | #define LLVM_ADT_EDIT_DISTANCE_H |
18 | |
19 | #include "llvm/ADT/ArrayRef.h" |
20 | #include <algorithm> |
21 | |
22 | namespace llvm { |
23 | |
24 | /// Determine the edit distance between two sequences. |
25 | /// |
26 | /// \param FromArray the first sequence to compare. |
27 | /// |
28 | /// \param ToArray the second sequence to compare. |
29 | /// |
30 | /// \param Map A Functor to apply to each item of the sequences before |
31 | /// comparison. |
32 | /// |
33 | /// \param AllowReplacements whether to allow element replacements (change one |
34 | /// element into another) as a single operation, rather than as two operations |
35 | /// (an insertion and a removal). |
36 | /// |
37 | /// \param MaxEditDistance If non-zero, the maximum edit distance that this |
38 | /// routine is allowed to compute. If the edit distance will exceed that |
39 | /// maximum, returns \c MaxEditDistance+1. |
40 | /// |
41 | /// \returns the minimum number of element insertions, removals, or (if |
42 | /// \p AllowReplacements is \c true) replacements needed to transform one of |
43 | /// the given sequences into the other. If zero, the sequences are identical. |
44 | template <typename T, typename Functor> |
45 | unsigned ComputeMappedEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
46 | Functor Map, bool AllowReplacements = true, |
47 | unsigned MaxEditDistance = 0) { |
48 | // The algorithm implemented below is the "classic" |
49 | // dynamic-programming algorithm for computing the Levenshtein |
50 | // distance, which is described here: |
51 | // |
52 | // http://en.wikipedia.org/wiki/Levenshtein_distance |
53 | // |
54 | // Although the algorithm is typically described using an m x n |
55 | // array, only one row plus one element are used at a time, so this |
56 | // implementation just keeps one vector for the row. To update one entry, |
57 | // only the entries to the left, top, and top-left are needed. The left |
58 | // entry is in Row[x-1], the top entry is what's in Row[x] from the last |
59 | // iteration, and the top-left entry is stored in Previous. |
60 | typename ArrayRef<T>::size_type m = FromArray.size(); |
61 | typename ArrayRef<T>::size_type n = ToArray.size(); |
62 | |
63 | if (MaxEditDistance) { |
64 | // If the difference in size between the 2 arrays is larger than the max |
65 | // distance allowed, we can bail out as we will always need at least |
66 | // MaxEditDistance insertions or removals. |
67 | typename ArrayRef<T>::size_type AbsDiff = m > n ? m - n : n - m; |
68 | if (AbsDiff > MaxEditDistance) |
69 | return MaxEditDistance + 1; |
70 | } |
71 | |
72 | SmallVector<unsigned, 64> Row(n + 1); |
73 | for (unsigned i = 1; i < Row.size(); ++i) |
74 | Row[i] = i; |
75 | |
76 | for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { |
77 | Row[0] = y; |
78 | unsigned BestThisRow = Row[0]; |
79 | |
80 | unsigned Previous = y - 1; |
81 | const auto &CurItem = Map(FromArray[y - 1]); |
82 | for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { |
83 | int OldRow = Row[x]; |
84 | if (AllowReplacements) { |
85 | Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), |
86 | std::min(Row[x - 1], Row[x]) + 1); |
87 | } |
88 | else { |
89 | if (CurItem == Map(ToArray[x - 1])) |
90 | Row[x] = Previous; |
91 | else Row[x] = std::min(Row[x-1], Row[x]) + 1; |
92 | } |
93 | Previous = OldRow; |
94 | BestThisRow = std::min(BestThisRow, Row[x]); |
95 | } |
96 | |
97 | if (MaxEditDistance && BestThisRow > MaxEditDistance) |
98 | return MaxEditDistance + 1; |
99 | } |
100 | |
101 | unsigned Result = Row[n]; |
102 | return Result; |
103 | } |
104 | |
105 | template <typename T> |
106 | unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
107 | bool AllowReplacements = true, |
108 | unsigned MaxEditDistance = 0) { |
109 | return ComputeMappedEditDistance( |
110 | FromArray, ToArray, [](const T &X) -> const T & { return X; }, |
111 | AllowReplacements, MaxEditDistance); |
112 | } |
113 | |
114 | } // End llvm namespace |
115 | |
116 | #endif |
117 | |