Update Gnulib to the latest git version
[deliverable/binutils-gdb.git] / gnulib / import / str-two-way.h
CommitLineData
/* Byte-wise substring search, using the Two-Way algorithm.
   Copyright (C) 2008-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Eric Blake <ebb9@byu.net>, 2008.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, see <https://www.gnu.org/licenses/>.  */
f6ea5628
DJ
18
/* Before including this file, you need to include <config.h> and
   <string.h>, and define:
     RETURN_TYPE             A macro that expands to the return type.
     AVAILABLE(h, h_l, j, n_l)
                             A macro that returns nonzero if there are
                             at least N_L bytes left starting at H[J].
                             H is 'unsigned char *', H_L, J, and N_L
                             are 'size_t'; H_L is an lvalue.  For
                             NUL-terminated searches, H_L can be
                             modified each iteration to avoid having
                             to compute the end of H up front.

   For case-insensitivity, you may optionally define:
     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
                             characters of P1 and P2 are equal.
     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
                             it has been fetched from one of the two strings.
                             The argument is an 'unsigned char'; the result
                             must be an 'unsigned char' as well.

   This file undefines the macros documented above, and defines
   LONG_NEEDLE_THRESHOLD.
*/
42
43#include <limits.h>
44#include <stdint.h>
45
770d76d7
PA
/* We use the Two-Way string matching algorithm (also known as
   Crochemore-Perrin), which guarantees linear complexity with
   constant space.  Additionally, for long needles, we also use a bad
   character shift table similar to the Boyer-Moore algorithm to
   achieve improved (potentially sub-linear) performance.

   See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
   https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
   https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
*/
56
/* Point at which computing a bad-byte shift table is likely to be
   worthwhile.  Small needles should not compute a table, since it
   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
   speedup no greater than a factor of NEEDLE_LEN.  The larger the
   needle, the better the potential performance gain.  On the other
   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
   memory required for the table is prohibitive; there the threshold
   is SIZE_MAX, which no needle length can exceed.  */
#if CHAR_BIT < 10
# define LONG_NEEDLE_THRESHOLD 32U
#else
# define LONG_NEEDLE_THRESHOLD SIZE_MAX
#endif
69
f434ba03
PA
/* Larger of A and B.  Each argument is parenthesized so that operands
   containing low-precedence operators compare correctly; note that
   the selected argument is still evaluated twice.  */
#ifndef MAX
# define MAX(a, b) (((a) < (b)) ? (b) : (a))
#endif
f6ea5628
DJ
73
/* Defaults for a case-sensitive search: elements are compared as-is,
   and prefixes are compared with memcmp.  The includer may predefine
   either macro to override it.  */
#ifndef CANON_ELEMENT
# define CANON_ELEMENT(c) (c)
#endif
#ifndef CMP_FUNC
# define CMP_FUNC memcmp
#endif
80
/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
   Return the index of the first byte in the right half, and set
   *PERIOD to the global period of the right half.

   The global period of a string is the smallest index (possibly its
   length) at which all remaining bytes in the string are repetitions
   of the prefix (the last repetition may be a subset of the prefix).

   When NEEDLE is factored into two halves, a local period is the
   length of the smallest word that shares a suffix with the left half
   and shares a prefix with the right half.  All factorizations of a
   non-empty NEEDLE have a local period of at least 1 and no greater
   than NEEDLE_LEN.

   A critical factorization has the property that the local period
   equals the global period.  All strings have at least one critical
   factorization with the left half smaller than the global period.
   And while some strings have more than one critical factorization,
   it is provable that with an ordered alphabet, at least one of the
   critical factorizations corresponds to a maximal suffix.

   Given an ordered alphabet, a critical factorization can be computed
   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
   shorter of two ordered maximal suffixes.  The ordered maximal
   suffixes are determined by lexicographic comparison while tracking
   periodicity.  */
static size_t
critical_factorization (const unsigned char *needle, size_t needle_len,
                        size_t *period)
{
  /* Index of last byte of left half, or SIZE_MAX.  */
  size_t max_suffix, max_suffix_rev;
  size_t j; /* Index into NEEDLE for current candidate suffix.  */
  size_t k; /* Offset into current period.  */
  size_t p; /* Intermediate period.  */
  unsigned char a, b; /* Current comparison bytes.  */

  /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
     out 0-length needles).  */
  if (needle_len < 3)
    {
      *period = 1;
      return needle_len - 1;
    }

  /* Invariants:
     0 <= j < NEEDLE_LEN - 1
     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
     min(max_suffix, max_suffix_rev) < global period of NEEDLE
     1 <= p <= global period of NEEDLE
     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
     1 <= k <= p
  */

  /* Perform lexicographic search.  While max_suffix is SIZE_MAX, the
     unsigned sum max_suffix + k wraps around to k - 1, so the index
     below stays inside NEEDLE.  */
  max_suffix = SIZE_MAX;
  j = 0;
  k = p = 1;
  while (j + k < needle_len)
    {
      a = CANON_ELEMENT (needle[j + k]);
      b = CANON_ELEMENT (needle[max_suffix + k]);
      if (a < b)
        {
          /* Suffix is smaller, period is entire prefix so far.  */
          j += k;
          k = 1;
          p = j - max_suffix;
        }
      else if (a == b)
        {
          /* Advance through repetition of the current period.  */
          if (k != p)
            ++k;
          else
            {
              j += p;
              k = 1;
            }
        }
      else /* b < a */
        {
          /* Suffix is larger, start over from current location.  */
          max_suffix = j++;
          k = p = 1;
        }
    }
  /* Provisional answer; overwritten below if the reverse-order suffix
     turns out to be the one returned.  */
  *period = p;

  /* Perform reverse lexicographic search.  Same loop as above with
     the byte ordering inverted.  */
  max_suffix_rev = SIZE_MAX;
  j = 0;
  k = p = 1;
  while (j + k < needle_len)
    {
      a = CANON_ELEMENT (needle[j + k]);
      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
      if (b < a)
        {
          /* Suffix is smaller, period is entire prefix so far.  */
          j += k;
          k = 1;
          p = j - max_suffix_rev;
        }
      else if (a == b)
        {
          /* Advance through repetition of the current period.  */
          if (k != p)
            ++k;
          else
            {
              j += p;
              k = 1;
            }
        }
      else /* a < b */
        {
          /* Suffix is larger, start over from current location.  */
          max_suffix_rev = j++;
          k = p = 1;
        }
    }

  /* Choose the shorter suffix.  Return the index of the first byte of
     the right half, rather than the last byte of the left half.

     For some examples, 'banana' has two critical factorizations, both
     exposed by the two lexicographic extreme suffixes of 'anana' and
     'nana', where both suffixes have a period of 2.  On the other
     hand, with 'aab' and 'bba', both strings have a single critical
     factorization of the last byte, with the suffix having a period
     of 1.  While the maximal lexicographic suffix of 'aab' is 'b',
     the maximal lexicographic suffix of 'bba' is 'ba', which is not a
     critical factorization.  Conversely, the maximal reverse
     lexicographic suffix of 'a' works for 'bba', but not 'ab' for
     'aab'.  The shorter suffix of the two will always be a critical
     factorization.

     Adding 1 to both sides maps SIZE_MAX (no left half) to 0, so the
     unsigned comparison orders the two candidates as if they were
     signed.  */
  if (max_suffix_rev + 1 < max_suffix + 1)
    return max_suffix + 1;
  *period = p;
  return max_suffix_rev + 1;
}
223
224/* Return the first location of non-empty NEEDLE within HAYSTACK, or
225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227 Performance is guaranteed to be linear, with an initialization cost
228 of 2 * NEEDLE_LEN comparisons.
229
230 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
234static RETURN_TYPE
235two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
f434ba03 236 const unsigned char *needle, size_t needle_len)
f6ea5628
DJ
237{
238 size_t i; /* Index into current byte of NEEDLE. */
239 size_t j; /* Index into current window of HAYSTACK. */
240 size_t period; /* The period of the right half of needle. */
241 size_t suffix; /* The index of the right half of needle. */
242
243 /* Factor the needle into two halves, such that the left half is
244 smaller than the global period, and the right half is
245 periodic (with a period as large as NEEDLE_LEN - suffix). */
246 suffix = critical_factorization (needle, needle_len, &period);
247
248 /* Perform the search. Each iteration compares the right half
249 first. */
250 if (CMP_FUNC (needle, needle + period, suffix) == 0)
251 {
770d76d7
PA
252 /* Entire needle is periodic; a mismatch in the left half can
253 only advance by the period, so use memory to avoid rescanning
254 known occurrences of the period in the right half. */
f6ea5628
DJ
255 size_t memory = 0;
256 j = 0;
257 while (AVAILABLE (haystack, haystack_len, j, needle_len))
f434ba03
PA
258 {
259 /* Scan for matches in right half. */
260 i = MAX (suffix, memory);
261 while (i < needle_len && (CANON_ELEMENT (needle[i])
262 == CANON_ELEMENT (haystack[i + j])))
263 ++i;
264 if (needle_len <= i)
265 {
266 /* Scan for matches in left half. */
267 i = suffix - 1;
268 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
269 == CANON_ELEMENT (haystack[i + j])))
270 --i;
271 if (i + 1 < memory + 1)
272 return (RETURN_TYPE) (haystack + j);
273 /* No match, so remember how many repetitions of period
274 on the right half were scanned. */
275 j += period;
276 memory = needle_len - period;
277 }
278 else
279 {
280 j += i - suffix + 1;
281 memory = 0;
282 }
283 }
f6ea5628
DJ
284 }
285 else
286 {
287 /* The two halves of needle are distinct; no extra memory is
f434ba03 288 required, and any mismatch results in a maximal shift. */
f6ea5628
DJ
289 period = MAX (suffix, needle_len - suffix) + 1;
290 j = 0;
291 while (AVAILABLE (haystack, haystack_len, j, needle_len))
f434ba03
PA
292 {
293 /* Scan for matches in right half. */
294 i = suffix;
295 while (i < needle_len && (CANON_ELEMENT (needle[i])
296 == CANON_ELEMENT (haystack[i + j])))
297 ++i;
298 if (needle_len <= i)
299 {
300 /* Scan for matches in left half. */
301 i = suffix - 1;
302 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
303 == CANON_ELEMENT (haystack[i + j])))
304 --i;
305 if (i == SIZE_MAX)
306 return (RETURN_TYPE) (haystack + j);
307 j += period;
308 }
309 else
310 j += i - suffix + 1;
311 }
f6ea5628
DJ
312 }
313 return NULL;
314}
315
316/* Return the first location of non-empty NEEDLE within HAYSTACK, or
317 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
318 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319 Performance is guaranteed to be linear, with an initialization cost
320 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
321
322 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327 sublinear performance is not possible. */
328static RETURN_TYPE
329two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
f434ba03 330 const unsigned char *needle, size_t needle_len)
f6ea5628
DJ
331{
332 size_t i; /* Index into current byte of NEEDLE. */
333 size_t j; /* Index into current window of HAYSTACK. */
334 size_t period; /* The period of the right half of needle. */
335 size_t suffix; /* The index of the right half of needle. */
336 size_t shift_table[1U << CHAR_BIT]; /* See below. */
337
338 /* Factor the needle into two halves, such that the left half is
339 smaller than the global period, and the right half is
340 periodic (with a period as large as NEEDLE_LEN - suffix). */
341 suffix = critical_factorization (needle, needle_len, &period);
342
343 /* Populate shift_table. For each possible byte value c,
344 shift_table[c] is the distance from the last occurrence of c to
345 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
347 for (i = 0; i < 1U << CHAR_BIT; i++)
348 shift_table[i] = needle_len;
349 for (i = 0; i < needle_len; i++)
350 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
351
352 /* Perform the search. Each iteration compares the right half
353 first. */
354 if (CMP_FUNC (needle, needle + period, suffix) == 0)
355 {
770d76d7
PA
356 /* Entire needle is periodic; a mismatch in the left half can
357 only advance by the period, so use memory to avoid rescanning
358 known occurrences of the period in the right half. */
f6ea5628
DJ
359 size_t memory = 0;
360 size_t shift;
361 j = 0;
362 while (AVAILABLE (haystack, haystack_len, j, needle_len))
f434ba03
PA
363 {
364 /* Check the last byte first; if it does not match, then
365 shift to the next possible match location. */
366 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
367 if (0 < shift)
368 {
369 if (memory && shift < period)
370 {
371 /* Since needle is periodic, but the last period has
372 a byte out of place, there can be no match until
373 after the mismatch. */
374 shift = needle_len - period;
f434ba03 375 }
770d76d7 376 memory = 0;
f434ba03
PA
377 j += shift;
378 continue;
379 }
380 /* Scan for matches in right half. The last byte has
381 already been matched, by virtue of the shift table. */
382 i = MAX (suffix, memory);
383 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
384 == CANON_ELEMENT (haystack[i + j])))
385 ++i;
386 if (needle_len - 1 <= i)
387 {
388 /* Scan for matches in left half. */
389 i = suffix - 1;
390 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
391 == CANON_ELEMENT (haystack[i + j])))
392 --i;
393 if (i + 1 < memory + 1)
394 return (RETURN_TYPE) (haystack + j);
395 /* No match, so remember how many repetitions of period
396 on the right half were scanned. */
397 j += period;
398 memory = needle_len - period;
399 }
400 else
401 {
402 j += i - suffix + 1;
403 memory = 0;
404 }
405 }
f6ea5628
DJ
406 }
407 else
408 {
409 /* The two halves of needle are distinct; no extra memory is
f434ba03 410 required, and any mismatch results in a maximal shift. */
f6ea5628
DJ
411 size_t shift;
412 period = MAX (suffix, needle_len - suffix) + 1;
413 j = 0;
414 while (AVAILABLE (haystack, haystack_len, j, needle_len))
f434ba03
PA
415 {
416 /* Check the last byte first; if it does not match, then
417 shift to the next possible match location. */
418 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
419 if (0 < shift)
420 {
421 j += shift;
422 continue;
423 }
424 /* Scan for matches in right half. The last byte has
425 already been matched, by virtue of the shift table. */
426 i = suffix;
427 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
428 == CANON_ELEMENT (haystack[i + j])))
429 ++i;
430 if (needle_len - 1 <= i)
431 {
432 /* Scan for matches in left half. */
433 i = suffix - 1;
434 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
435 == CANON_ELEMENT (haystack[i + j])))
436 --i;
437 if (i == SIZE_MAX)
438 return (RETURN_TYPE) (haystack + j);
439 j += period;
440 }
441 else
442 j += i - suffix + 1;
443 }
f6ea5628
DJ
444 }
445 return NULL;
446}
447
/* Undefine the configuration and helper macros so that this file can
   be included again with different definitions.
   LONG_NEEDLE_THRESHOLD is intentionally left defined.  */
#undef AVAILABLE
#undef CANON_ELEMENT
#undef CMP_FUNC
#undef MAX
#undef RETURN_TYPE
This page took 0.960706 seconds and 4 git commands to generate.