1/*
2 Copyright (c) Marshall Clow 2010-2012.
3
4 Distributed under the Boost Software License, Version 1.0. (See accompanying
5 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6
7 For more information, see http://www.boost.org
8*/
9
10#ifndef BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
11#define BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
12
13#include <iterator> // for std::iterator_traits
14
15#include <boost/config.hpp>
16#include <boost/assert.hpp>
17#include <boost/static_assert.hpp>
18
19#include <boost/range/begin.hpp>
20#include <boost/range/end.hpp>
21
22#include <boost/core/enable_if.hpp>
23#include <boost/type_traits/is_same.hpp>
24
25#include <boost/algorithm/searching/detail/bm_traits.hpp>
26#include <boost/algorithm/searching/detail/debugging.hpp>
27
28namespace boost { namespace algorithm {
29
30/*
31 A templated version of the boyer-moore searching algorithm.
32
33References:
34 http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
35 http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
36
37Explanations:
38 http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
39 http://www.movsd.com/bm.htm
40 http://www.cs.ucdavis.edu/~gusfield/cs224f09/bnotes.pdf
41
42The Boyer-Moore search algorithm uses two tables, a "bad character" table
43to tell how far to skip ahead when it hits a character that is not in the pattern,
44and a "good character" table to tell how far to skip ahead when it hits a
45mismatch on a character that _is_ in the pattern.
46
47Requirements:
48 * Random access iterators
49 * The two iterator types (patIter and corpusIter) must
50 "point to" the same underlying type and be comparable.
51 * Additional requirements may be imposed but the skip table, such as:
52 ** Numeric type (array-based skip table)
53 ** Hashable type (map-based skip table)
54*/
55
56 template <typename patIter, typename traits = detail::BM_traits<patIter> >
57 class boyer_moore {
58 typedef typename std::iterator_traits<patIter>::difference_type difference_type;
59 public:
60 boyer_moore ( patIter first, patIter last )
61 : pat_first ( first ), pat_last ( last ),
62 k_pattern_length ( std::distance ( pat_first, pat_last )),
63 skip_ ( k_pattern_length, -1 ),
64 suffix_ ( k_pattern_length + 1 )
65 {
66 this->build_skip_table ( first, last );
67 this->build_suffix_table ( first, last );
68 }
69
70 ~boyer_moore () {}
71
72 /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last )
73 /// \brief Searches the corpus for the pattern that was passed into the constructor
74 ///
75 /// \param corpus_first The start of the data to search (Random Access Iterator)
76 /// \param corpus_last One past the end of the data to search
77 ///
78 template <typename corpusIter>
79 std::pair<corpusIter, corpusIter>
80 operator () ( corpusIter corpus_first, corpusIter corpus_last ) const {
81 BOOST_STATIC_ASSERT (( boost::is_same<
82 typename std::iterator_traits<patIter>::value_type,
83 typename std::iterator_traits<corpusIter>::value_type>::value ));
84
85 if ( corpus_first == corpus_last ) return std::make_pair(corpus_last, corpus_last); // if nothing to search, we didn't find it!
86 if ( pat_first == pat_last ) return std::make_pair(corpus_first, corpus_first); // empty pattern matches at start
87
88 const difference_type k_corpus_length = std::distance ( corpus_first, corpus_last );
89 // If the pattern is larger than the corpus, we can't find it!
90 if ( k_corpus_length < k_pattern_length )
91 return std::make_pair(corpus_last, corpus_last);
92
93 // Do the search
94 return this->do_search ( corpus_first, corpus_last );
95 }
96
97 template <typename Range>
98 std::pair<typename boost::range_iterator<Range>::type, typename boost::range_iterator<Range>::type>
99 operator () ( Range &r ) const {
100 return (*this) (boost::begin(r), boost::end(r));
101 }
102
103 private:
104/// \cond DOXYGEN_HIDE
105 patIter pat_first, pat_last;
106 const difference_type k_pattern_length;
107 typename traits::skip_table_t skip_;
108 std::vector <difference_type> suffix_;
109
110 /// \fn operator ( corpusIter corpus_first, corpusIter corpus_last, Pred p )
111 /// \brief Searches the corpus for the pattern that was passed into the constructor
112 ///
113 /// \param corpus_first The start of the data to search (Random Access Iterator)
114 /// \param corpus_last One past the end of the data to search
115 /// \param p A predicate used for the search comparisons.
116 ///
117 template <typename corpusIter>
118 std::pair<corpusIter, corpusIter>
119 do_search ( corpusIter corpus_first, corpusIter corpus_last ) const {
120 /* ---- Do the matching ---- */
121 corpusIter curPos = corpus_first;
122 const corpusIter lastPos = corpus_last - k_pattern_length;
123 difference_type j, k, m;
124
125 while ( curPos <= lastPos ) {
126 /* while ( std::distance ( curPos, corpus_last ) >= k_pattern_length ) { */
127 // Do we match right where we are?
128 j = k_pattern_length;
129 while ( pat_first [j-1] == curPos [j-1] ) {
130 j--;
131 // We matched - we're done!
132 if ( j == 0 )
133 return std::make_pair(curPos, curPos + k_pattern_length);
134 }
135
136 // Since we didn't match, figure out how far to skip forward
137 k = skip_ [ curPos [ j - 1 ]];
138 m = j - k - 1;
139 if ( k < j && m > suffix_ [ j ] )
140 curPos += m;
141 else
142 curPos += suffix_ [ j ];
143 }
144
145 return std::make_pair(corpus_last, corpus_last); // We didn't find anything
146 }
147
148
149 void build_skip_table ( patIter first, patIter last ) {
150 for ( std::size_t i = 0; first != last; ++first, ++i )
151 skip_.insert ( *first, i );
152 }
153
154
155 template<typename Iter, typename Container>
156 void compute_bm_prefix ( Iter first, Iter last, Container &prefix ) {
157 const std::size_t count = std::distance ( first, last );
158 BOOST_ASSERT ( count > 0 );
159 BOOST_ASSERT ( prefix.size () == count );
160
161 prefix[0] = 0;
162 std::size_t k = 0;
163 for ( std::size_t i = 1; i < count; ++i ) {
164 BOOST_ASSERT ( k < count );
165 while ( k > 0 && ( first[k] != first[i] )) {
166 BOOST_ASSERT ( k < count );
167 k = prefix [ k - 1 ];
168 }
169
170 if ( first[k] == first[i] )
171 k++;
172 prefix [ i ] = k;
173 }
174 }
175
176 void build_suffix_table ( patIter first, patIter last ) {
177 const std::size_t count = (std::size_t) std::distance ( first, last );
178
179 if ( count > 0 ) { // empty pattern
180 std::vector<typename std::iterator_traits<patIter>::value_type> reversed(count);
181 (void) std::reverse_copy ( first, last, reversed.begin ());
182
183 std::vector<difference_type> prefix (count);
184 compute_bm_prefix ( first, last, prefix );
185
186 std::vector<difference_type> prefix_reversed (count);
187 compute_bm_prefix ( reversed.begin (), reversed.end (), prefix_reversed );
188
189 for ( std::size_t i = 0; i <= count; i++ )
190 suffix_[i] = count - prefix [count-1];
191
192 for ( std::size_t i = 0; i < count; i++ ) {
193 const std::size_t j = count - prefix_reversed[i];
194 const difference_type k = i - prefix_reversed[i] + 1;
195
196 if (suffix_[j] > k)
197 suffix_[j] = k;
198 }
199 }
200 }
201/// \endcond
202 };
203
204
205/* Two ranges as inputs gives us four possibilities; with 2,3,3,4 parameters
206 Use a bit of TMP to disambiguate the 3-argument templates */
207
208/// \fn boyer_moore_search ( corpusIter corpus_first, corpusIter corpus_last,
209/// patIter pat_first, patIter pat_last )
210/// \brief Searches the corpus for the pattern.
211///
212/// \param corpus_first The start of the data to search (Random Access Iterator)
213/// \param corpus_last One past the end of the data to search
214/// \param pat_first The start of the pattern to search for (Random Access Iterator)
215/// \param pat_last One past the end of the data to search for
216///
217 template <typename patIter, typename corpusIter>
218 std::pair<corpusIter, corpusIter> boyer_moore_search (
219 corpusIter corpus_first, corpusIter corpus_last,
220 patIter pat_first, patIter pat_last )
221 {
222 boyer_moore<patIter> bm ( pat_first, pat_last );
223 return bm ( corpus_first, corpus_last );
224 }
225
226 template <typename PatternRange, typename corpusIter>
227 std::pair<corpusIter, corpusIter> boyer_moore_search (
228 corpusIter corpus_first, corpusIter corpus_last, const PatternRange &pattern )
229 {
230 typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
231 boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
232 return bm ( corpus_first, corpus_last );
233 }
234
235 template <typename patIter, typename CorpusRange>
236 typename boost::disable_if_c<
237 boost::is_same<CorpusRange, patIter>::value,
238 std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type> >
239 ::type
240 boyer_moore_search ( CorpusRange &corpus, patIter pat_first, patIter pat_last )
241 {
242 boyer_moore<patIter> bm ( pat_first, pat_last );
243 return bm (boost::begin (corpus), boost::end (corpus));
244 }
245
246 template <typename PatternRange, typename CorpusRange>
247 std::pair<typename boost::range_iterator<CorpusRange>::type, typename boost::range_iterator<CorpusRange>::type>
248 boyer_moore_search ( CorpusRange &corpus, const PatternRange &pattern )
249 {
250 typedef typename boost::range_iterator<const PatternRange>::type pattern_iterator;
251 boyer_moore<pattern_iterator> bm ( boost::begin(pattern), boost::end (pattern));
252 return bm (boost::begin (corpus), boost::end (corpus));
253 }
254
255
256 // Creator functions -- take a pattern range, return an object
257 template <typename Range>
258 boost::algorithm::boyer_moore<typename boost::range_iterator<const Range>::type>
259 make_boyer_moore ( const Range &r ) {
260 return boost::algorithm::boyer_moore
261 <typename boost::range_iterator<const Range>::type> (boost::begin(r), boost::end(r));
262 }
263
264 template <typename Range>
265 boost::algorithm::boyer_moore<typename boost::range_iterator<Range>::type>
266 make_boyer_moore ( Range &r ) {
267 return boost::algorithm::boyer_moore
268 <typename boost::range_iterator<Range>::type> (boost::begin(r), boost::end(r));
269 }
270
271}}
272
273#endif // BOOST_ALGORITHM_BOYER_MOORE_SEARCH_HPP
274

source code of boost/libs/algorithm/include/boost/algorithm/searching/boyer_moore.hpp