1 /**
2  * Fast, non-allocating string functions.
3  *
4  * Authors:
5  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
6  *
7  * Copyright:
8  *   © 2017 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
9  *
10  * License:
11  *   $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0)
12  */
13 module fast..string;
14 
15 import core.bitop;
16 import core.simd;
17 import core.stdc.stdlib;
18 
19 version (GNU) import gcc.attribute;
20 
21 import std.algorithm;
22 import std.range;
23 import std.stdio;
24 import std..string;
25 import std.traits;
26 
27 import fast.buffer;
28 
29 
30 /**
31  * Splits a string in two around one or more compile-time known code units.
32  *
33  * Params:
34  *   match = An expression that matches all characters around which a split should occur.
35  *   str = The string to scan.
36  *   before = The part before the split is stored here. If no character in $(D match) is found, the original string is returned here.
37  *   after = The part after the split is stored here. If no character in $(D match) is found, $(D null) is returned here.
38  *   splitter = If not $(D null), this pointer will receive a copy of the splitting char.
39  *
40  * Returns:
41  *   $(D true), iff a split occured.
42  */
43 bool split(string match)(scope inout(char[]) str, ref inout(char)[] before, ref inout(char)[] after, char* splitter = null)
44 {
45 	immutable pos = min(str.length, SimdMatcher!match.find(str.ptr, str.ptr + str.length));
46 	before = str[0 .. pos];
47 	if (pos < str.length) {
48 		after = str[pos+1 .. $];
49 		if (splitter) *splitter = str[pos];
50 		return true;
51 	}
52 	after = null;
53 	return false;
54 }
55 
56 /**
57  * Similar to the overload for strings, this function works a little faster as it lacks boundary checks.
58  * It assumes that one of the characters in $(D match) is actually contained in the string.
59  *
60  * Params:
61  *   match = An expression that matches all characters around which a split should occur.
62  *   ptr = The string to scan.
63  *   before = The part before the split is stored here. If no character in $(D match) is found, the original string is returned here.
64  *   after = The pointer to the part after the split is stored here.
65  * 
66  * Returns:
67  *   The char that caused the split. (From $(D match).)
68  */
69 char split(string match)(scope inout(char*) ptr, ref inout(char)[] before, ref inout(char)* after)
70 {
71 	immutable pos = SimdMatcher!match.find(str.ptr);
72 	before = ptr[0 .. pos];
73 	after = ptr + pos + 1;
74 	return ptr[pos];
75 }
76 
77 
78 /*******************************************************************************
79  * 
80  * Finds the first occurrence of a set of compile-time known code units in a
81  * string. While the algorithm is `O(n)` in relation to the count of given code
82  * units, the overhead when using it on short strings weights more for only 1 or
83  * 2 code units.
84  *
85  * Params:
86  *   match = An expression that matches all characters around which a split
87  *           should occur.
88  *   str = The string to search for a code unit.
89  *
90  * Returns:
91  *   If a match is found, the index into the string is returned.
92  *   Otherwise an invalid index is returned. Check with
93  *   `if (result &lt; str.length)`.
94  *
95  * See_Also:
96  *   split,
97  *   $(LINK2 http://mischasan.wordpress.com/2011/11/09/the-generic-sse2-loop/,
98  *           The Generic SSE2 Loop)
99  *
100  * Example:
101  * ---
102  * // Check if there is a '/' or '\' in the string
103  * auto pos = str.find!(`or(=/,=\)`);
104  * if (pos < str.length) { }
105  * ---
106  **************************************/
107 size_t find(string match)(in char[] str) pure nothrow
108 {
109 	return SimdMatcher!match.find(str.ptr, str.ptr + str.length);
110 }
111 
112 /*******************************************************************************
113  * 
114  * Same as the overload for strings, but with only a char*, making it faster as
115  * it cannot do a boundary check.
116  *
117  * Sometimes when looking for a character it is helpful to append it as a
118  * sentinel to the char buffer and then use this function instead of the slower
119  * one that checks the boundary constantly.
120  *
121  * Example:
122  * ---
123  * // Find a ']' in a buffer of 1024 bytes using an additional sentinel.
124  * size_t length = 1024;
125  * char[] buffer = new char[](length+1);
126  * buffer[length] = ']';
127  * auto pos = buffer.ptr.find!("=]");
128  * if (pos < length) { // was an actual find before the sentinel }
129  * ---
130  **************************************/
131 inout(char)* find(string match)(inout(char*) ptr) pure nothrow
132 {
133 	return SimdMatcher!match.find(ptr);
134 }
135 
136 
137 bool keyword1(string key)(in char[] str,
138 	scope bool function(ref immutable(char)* key, ref const(char)* str) mismatcher = null)
139 {
140 	auto strPtr = str.ptr;
141 	auto keyPtr = key.ptr;
142 	auto keyEnd = keyPtr + key.length;
143 
144 	while (keyPtr !is keyEnd)
145 	{
146 		while (*strPtr == '\\')
147 			if (!mismatcher(keyPtr, strPtr))
148 				return false;
149 
150 		if (*strPtr == '"' || *strPtr != *keyPtr)
151 			return false;
152 
153 		strPtr++;
154 		keyPtr++;
155 	}
156 	return true;
157 }
158 
159 
160 bool keyword2(string key)(in char[] str,
161 	scope bool function(ref immutable(char)* key, ref const(char)* str) mismatcher = null)
162 {
163 	version (LDC) import ldc.gccbuiltins_x86;
164 	
165 	/* Since SIMD typically works with word aligned data, we duplicate 'key' for every possible start of 'str' when
166 	 * loaded from an aligned memory address where the first character appears 0 to Word.sizeof bytes into the SIMD
167 	 * register.
168 	 * For 16-byte SIMD we could just create an array of 16 strings with 0 to 15 padding bytes in front and some after,
169 	 * but we can be more compact with at most 16 wasted padding bytes. Since machine registers are powers of 2, if we
170 	 * pad all keys to an odd length and repeat them 16 times we get a sequence with the following properties:
171 	 * - It consists of as many SIMD words as the key is long.
172 	 * - All 16 shift offsets of the key are contained in the SIMD words due to the periodicity introduced by using
173 	 *   disjunct prime factors for the key length and the SIMD word size.
174 	 * Interpreted as an array of SIMD words, it can be indexed with the desired shift multiplied by a constant factor
175 	 * and taken modulo the SIMD array length to use the periodicity. The constant factor is the smallest value that
176 	 * when multiplied with the key length ends up at a SIMD word boundary + 1 (the first shift).
177 	 */
178 	
179 	// 'key' length rounded up to next odd value is the number of SIMD words we need.
180 	enum keyLenOdd = uint(key.length | 1); // TODO: uint or implicit type ?
181 	align(16) static immutable char[keyLenOdd * Word.sizeof] keyData = key.representation
182 		.chain(ubyte(0x20).repeat(keyLenOdd - key.length)).cycle.take(keyLenOdd * Word.sizeof).array;
183 	align(16) static immutable char[Word.sizeof] dquote = '"';
184 	align(16) static immutable char[Word.sizeof] bslash = '\\';
185 	enum mul = { uint result = 0; while ((++result * Word.sizeof + 1) % keyLenOdd) {} return result; }();
186 	
187 	const(char)* strPtr = str.ptr;
188 	immutable(char)* keyPtr = keyData.ptr;
189 	auto bsWord = *cast(immutable Word*) &bslash;
190 	auto dqWord = *cast(immutable Word*) &dquote;
191 	
192 	do
193 	{
194 		//		writeln("enter loop");
195 		// Calculate SSE word boundary before 'str'
196 		size_t strOff = cast(size_t) strPtr % Word.sizeof;
197 		Word strWord = *cast(Word*) (strPtr - strOff);
198 		size_t keyPos = keyPtr - keyData.ptr;
199 		size_t keyOff = (strOff - keyPos) % Word.sizeof;
200 		Word keyWord = (cast(Word*) keyData.ptr)[keyOff * mul % keyLenOdd + (keyOff + keyPos) / Word.sizeof];
201 		
202 		// Escape seqences have priority. 'key' may contain backslashes as part of the text, but in 'str' a backslash
203 		// at the same position is actually the begin of the escape sequence "\\".
204 		Word bsMask = strWord.maskEqual(bsWord);
205 		// If after processing backslashes there is a double-quote in 'str' we must not match it with a double-quote in
206 		// 'key', since it is the delimiter of 'str'.
207 		Word dqMask = strWord.maskEqual(dqWord);
208 		// How many bytes of 'key' and 'str' match in our 'Word' ?
209 		Word missMask = strWord.maskNotEqual(keyWord);
210 		// Merge mismatch, backslash and double-quote masks and move them into a non-SSE register.
211 		Word allMasks = or(missMask, or(bsMask, dqMask));
212 		uint skip = bsf((__builtin_ia32_pmovmskb128(allMasks) | 1 << Word.sizeof) >> strOff);
213 		//		writeln(keyPtr[0 .. 5]);
214 		//		writeln(strPtr[0 .. 5]);
215 		//		writeln(skip);
216 		strPtr += skip;
217 		keyPtr += skip;
218 		
219 		// Have we matched enough bytes to reach the end of 'key' ?
220 		if (keyPtr - keyData.ptr >= key.length)
221 			return true;
222 		
223 		// When we find a mismatch between 'key' and 'str', we try to call a provided helper function.
224 		// It may decode escape sequences in 'str' and recover from the state.
225 		// If that fails we accept the mismatch and return 'false'.
226 		//		writefln("Key: %s, Str %s", *keyPtr, *strPtr);
227 		//		const(char*) strPtrOld = strPtr;
228 		//		immutable(char*) keyPtrOld = keyPtr;
229 		if (strOff + skip < Word.sizeof && !(mismatcher && mismatcher(keyPtr, strPtr)))
230 		{
231 			//			writefln("Key: %s, Str %s", *keyPtr, *strPtr);
232 			return false;
233 		}
234 		//		writefln("Key: %s, Str %s", *keyPtr, *strPtr);
235 	}
236 	while (keyPtr - keyData.ptr < key.length);
237 	
238 	return true;
239 }
240 
241 
242 bool keyword3(string key)(in char[] str, bool function(ref immutable(char)*, ref const(char)*) mismatcher = null)
243 {
244 	version (LDC) import ldc.gccbuiltins_x86;
245 	version (GNU) import gcc.builtins;
246 
247 	/* Since SIMD typically works with word aligned data, we duplicate 'key' for every possible start of 'str' when
248 	 * loaded from an aligned memory address where the first character appears 0 to Word.sizeof bytes into the SIMD
249 	 * register.
250 	 * For 16-byte SIMD we could just create an array of 16 strings with 0 to 15 padding bytes in front and some after,
251 	 * but we can be more compact with at most 16 wasted padding bytes. Since machine registers are powers of 2, if we
252 	 * pad all keys to an odd length and repeat them 16 times we get a sequence with the following properties:
253 	 * - It consists of as many SIMD words as the key is long.
254 	 * - All 16 shift offsets of the key are contained in the SIMD words due to the periodicity introduced by using
255 	 *   disjunct prime factors for the key length and the SIMD word size.
256 	 * Interpreted as an array of SIMD words, it can be indexed with the desired shift multiplied by a constant factor
257 	 * and taken modulo the SIMD array length to use the periodicity. The constant factor is the smallest value that
258 	 * when multiplied with the key length ends up at a SIMD word boundary + 1 (the first shift).
259 	 */
260 	
261 	// 'key' length rounded up to next odd value is the number of SIMD words we need.
262 	enum keyLenOdd = uint(key.length | 1); // TODO: uint or implicit type ?
263 	align(16) static immutable char[keyLenOdd * Word.sizeof] keyData = key.representation
264 		.chain(ubyte(0x20).repeat(keyLenOdd - key.length)).cycle.take(keyLenOdd * Word.sizeof).array;
265 	align(16) static immutable char[Word.sizeof] dqbs = `\"""""""""""""""`;
266 	enum mul = { uint result = 0; while ((++result * Word.sizeof + 1) % keyLenOdd) {} return result; }();
267 
268 	// Calculate SSE word boundary before 'str'
269 	uint off = cast(uint) str.ptr % Word.sizeof;
270 	// SSE aligned pointer <= 'str.ptr'.
271 	auto strPtr = cast(const(Word)*) (str.ptr - off);
272 	auto keyPtr = cast(immutable(Word)*) keyData.ptr + off * mul % keyLenOdd;
273 	auto keyStart = cast(immutable(char)*) keyPtr + off;
274 	Word strWord = *strPtr;
275 
276 LoadKey:
277 	auto keyEnd = keyStart + key.length;
278 
279 Compare:
280 	// Get bitmask of special characters in 'str'.
281 	uint escMask = getScalar(cast(int4) __builtin_ia32_pcmpistrm128(*cast(Word*) &dqbs, strWord, 0b_0_00_00_00));
282 //	writeln("Called a");
283 	// Get bitmask of characters from 'key' and 'str' that don't match.
284 	uint missMask = getScalar(cast(int4) __builtin_ia32_pcmpistrm128(*keyPtr, strWord, 0b_0_01_10_00));
285 //	writeln("Called b");
286 	// Create a merged mask for both and an additional bit at position 16, serving as a delimiter for 'bsf'.
287 	uint mask = (escMask | missMask) & (uint.max << off);
288 
289 	// No bit set means all 16 bytes are equal and there are no escape characters. That's as good as it gets.
290 	if (!mask)
291 	{
292 		// Jump forward by a word size and see if we successfully compared all bytes to the end of our 'key'.
293 		keyPtr += 16;
294 		if (cast(immutable(char)*) keyPtr >= keyEnd)
295 			return true;
296 		// Otherwise continue with the next set of 16 bytes.
297 		strPtr += 16;
298 		off = 0;
299 		goto Compare;
300 	}
301 
302 	// One of two cases ...
303 	off = bsf(mask);
304 
305 	// 1) Did the mismatch occur past the end of 'key' ? Then we compared succesfully.
306 	if (cast(immutable(char)*) keyPtr + off >= keyEnd)
307 		return true;
308 
309 	// 2) It must be a special character or actual mismatch, let 'mismatcher' decide.
310 //	writefln("Skipping: %s", (cast(const(char)*) strPtr)[0 .. off]);
311 	auto strChP = cast(const(char)*) strPtr + off;
312 	auto strChPOld = strChP;
313 	auto keyChP = cast(immutable(char)*) keyPtr + off;
314 	bool goodToGo = mismatcher(keyChP, strChP);
315 
316 //	writefln("Mismatcher used %s key chars, %s str chars and returned: %s", keyAdd, strAdd, goodToGo);
317 	if (keyChP >= keyEnd)
318 		return true;
319 	if (!goodToGo)
320 		return false;
321 
322 	// Arriving here we just decoded an escape sequence and have to adjust our pointers.
323 	auto keyPos = keyChP - keyStart;
324 	off += strChP - strChPOld;
325 	if (off >= 16)
326 	{
327 		strPtr += off / 16;
328 		strWord = *strPtr;
329 		off %= 16;
330 	}
331 	auto baseOff = (off - keyPos) & 15;
332 	keyPtr = cast(immutable(Word)*) keyData.ptr + baseOff * mul % keyLenOdd;
333 	keyStart = cast(immutable(char)*) keyPtr + baseOff;
334 	keyPtr += (baseOff + keyPos) / 16;
335 	goto LoadKey;
336 }
337 
338 
339 size_t equalLength(scope inout(char[]) a, scope inout(char[]) b)
340 {
341 	return 0;
342 }
343 
344 
345 /*******************************************************************************
346  * 
347  * Concatenates a series of strings.
348  *
349  * Params:
350  *   Strs = a series of string symbols or literals to be concatenated
351  *   buffer = optional buffer, implicitly allocated
352  *
353  * Returns:
354  *   A $(D TempBuffer!char) containing the concatenated string. It is kept alive
355  *   for as long as it is in scope.
356  *
357  **************************************/
358 nothrow @nogc
359 template concat(Strs...)
360 {
361 	import core.stdc..string : memcpy;
362 	import fast.internal.helpers;
363 
364 	enum allocExpr = ctfeJoin!(Strs.length)("Strs[%s].length", "+") ~ "+1";
365 
366 	auto concat(void* buffer = (mixin(allocExpr) <= allocaLimit) ? alloca(mixin(allocExpr)) : null)
367 	{
368 		immutable length = mixin(allocExpr);
369 		auto result = TempBuffer!char(
370 			(cast(char*) (buffer is null ? malloc(length) : buffer))[0 .. length - 1],
371 			buffer is null);
372 
373 		char* p = result.ptr;
374 		foreach (const(char[]) str; Strs)
375 		{
376 			memcpy (p, str.ptr, str.length);
377 			p += str.length;
378 		}
379 		*p = '\0';
380 
381 		return result;
382 	}
383 }
384 
385 
386 
387 private:
388 
389 template SimdMatcher(string match)
390 {
391 	import core.simd;
392 	import std..string;
393 	import fast.internal.sysdef;
394 	
395 	static if (match != strip(match)) {
396 		// Reinstanciate the template with any whitespace stripped from the match string.
397 		alias SimdMatcher = SimdMatcher!(strip(match));
398 	} else {
399 		/* For SSE in DMD I am blocked by:
400 		 * https://d.puremagic.com/issues/show_bug.cgi?id=8047
401 		 * https://d.puremagic.com/issues/show_bug.cgi?id=11585
402 		 */
403 		enum isUsingSSE = hasSSE2 && (isLDC || isGDC);
404 		enum isSingleChar = match.length == 2 && match[0] == '=';
405 		static if (isSingleChar) enum singleChar = match[1];
406 		static if (isUsingSSE) {
407 			// Using MOVMSKB we get one boolean per bit in a 16-bit value.
408 			alias Word = ubyte16;
409 			alias Mask = uint;
410 			enum sparseness = 1;
411 		} else {
412 			// The fallback is to work with machine words and tricky bit-twiddling algorithms.
413 			// As a result we get machine words where matching bytes have the high bit set.
414 			alias Word = size_t;
415 			alias Mask = size_t;
416 			enum sparseness = 8;
417 		}
418 		enum matchCode = genMatchCode!isUsingSSE("*wp");
419 		// Used in generic comparison code
420 		enum lows = size_t.max / 0xFF;
421 		enum highs = lows * 0x80;
422 		
423 		enum betterUseTables = (isDMD && matchCode.complexity >= 4)
424 			|| (isGDC && matchCode.complexity >= 18)
425 			|| (isLDC && matchCode.complexity >= 18);
426 
427 		static if (betterUseTables)
428 		{
429 			immutable matchTable = genMatchTable();
430 			
431 			size_t find(scope inout(char*) b, scope inout(char*) e) pure nothrow @nogc
432 			{
433 				import core.stdc..string;
434 				import fast.internal.helpers;
435 
436 				// catch "strlen" and "memchr" like calls, that are highly optimized compiler built-ins.
437 				static if (isSingleChar) {
438 					return memchr(b, singleChar, e - b) - b;
439 				} else {
440 					if (b >= e) return 0;
441 					
442 					size_t off = cast(size_t) b % ushort.sizeof;
443 					ushort* wp = cast(ushort*) (b - off);
444 					ushort* we = cast(ushort*) alignPtrNext(e, ushort.sizeof);
445 					if (off) {
446 						// Throw away bytes from before start of the string
447 						if (auto mask = matchTable[*wp] >> off)
448 							return bsf(mask);
449 						if (++wp is we) return size_t.max;
450 					}
451 					
452 					do {
453 						if (auto mask = matchTable[*wp])
454 							return bsf(mask) + (cast(char*) wp - b);
455 					} while (++wp !is we);
456 					return size_t.max;
457 				}
458 			}
459 			
460 			inout(char)* find(scope inout(char*) b) pure nothrow @nogc
461 			{
462 				import core.stdc..string;
463 				// catch "strlen" and "memchr" like calls, that are highly optimized compiler built-ins.
464 				static if (isSingleChar && singleChar == '\0') {
465 					return strlen(b) + b;
466 				} else static if (isSingleChar && isDMD) { // DMD is better off using optimized C library code.
467 					return memchr(b, singleChar, e - b) - b;
468 				} else {
469 					size_t off = cast(size_t) b % ushort.sizeof;
470 					ushort* wp = cast(ushort*) (b - off);
471 					if (off) {
472 						// Throw away bytes from before start of the string
473 						if (auto mask = matchTable[*wp] >> off)
474 							return b + bsf(mask);
475 					}
476 					
477 					do {
478 						if (auto mask = matchTable[*wp])
479 							return cast(inout(char)*) wp + bsf(mask);
480 					} while (true);
481 				}
482 			}
483 		}
484 		else
485 		{
486 			import core.stdc..string, core.simd;
487 			import std.simd;
488 			import fast.internal.helpers;
489 			
490 			version (LDC) {
491 				import ldc.gccbuiltins_x86;
492 			} else version (GNU) {
493 				import gcc.builtins;
494 			}
495 			
496 			size_t find(scope inout(char*) b, scope inout(char*) e) pure nothrow
497 			{
498 				// catch "strlen" and "memchr" like calls, that are highly optimized compiler built-ins.
499 				static if (isSingleChar) {
500 					return memchr(b, singleChar, e - b) - b;
501 				} else {
502 					if (b >= e) return 0;
503 					
504 					size_t off = cast(size_t) b % Word.sizeof;
505 					Word* wp = cast(Word*) (b - off);
506 					Word* we = cast(Word*) alignPtrNext(e, Word.sizeof);
507 					if (off) {
508 						// Throw away bytes from before start of the string
509 						if (auto mask = (mixin(matchCode.code)) >> (off * sparseness))
510 							return bsf(mask) / sparseness;
511 						if (++wp is we) return size_t.max;
512 					}
513 					
514 					do {
515 						if (auto mask = mixin(matchCode.code))
516 							return bsf(mask) / sparseness + (cast(char*) wp - b);
517 					} while (++wp !is we);
518 					return size_t.max;
519 				}
520 			}
521 			
522 			inout(char)* find(scope inout(char*) b) pure nothrow
523 			{
524 				// catch "strlen" and "memchr" like calls, that are highly optimized compiler built-ins.
525 				static if (isSingleChar && singleChar == '\0') {
526 					return strlen(b) + b;
527 				} else static if (isSingleChar && isDMD) { // DMD is better off using optimized C library code.
528 					return cast(inout(char*)) memchr(b, singleChar, size_t.max);
529 				} else {
530 					size_t off = cast(size_t) b % Word.sizeof;
531 					Word* wp = cast(Word*) (b - off);
532 					if (off) {
533 						// Throw away bytes from before start of the string
534 						if (auto mask = (mixin(matchCode.code)) >> (off * sparseness))
535 							return b + bsf(mask) / sparseness;
536 						++wp;
537 					}
538 					
539 					do {
540 						if (auto mask = mixin(matchCode.code))
541 							return cast(inout(char)*) wp + bsf(mask) / sparseness;
542 						++wp;
543 					} while (true);
544 				}
545 			}
546 		}
547 		
548 		enum genMatchCode(bool sse)(string var)
549 		{
550 			import std.ascii, std.exception;
551 			
552 			struct Code {
553 				string code;
554 				size_t complexity = 1;
555 			}
556 			Code result;
557 			string[] nesting;
558 			
559 			with (result) {
560 				for (size_t i = 0; i < match.length;) {
561 					string handleChar() {
562 						char c = match[i+1];
563 						switch (c) {
564 							case 0:
565 								return `'\0'`;
566 							case '\\':
567 								return `'\\'`;
568 							case "'"[0]:
569 								return `'\''`;
570 							case '\t':
571 								return `'\t'`;
572 							case '\r':
573 								return `'\r'`;
574 							case '\n':
575 								return `'\n'`;
576 							default:
577 								return `'` ~ c ~ `'`;
578 						}
579 					}
580 					
581 					if (match[i] == '=') {
582 						static if (sse) {
583 							code ~= "maskEqual(" ~ var ~ ", SIMDFromScalar!(ubyte16, " ~ handleChar() ~ "))";
584 						} else if (match[i+1] == 0) {
585 							code ~= "" ~ var ~ " - lows & ~" ~ var;
586 						} else {
587 							code ~= "(" ~ var ~ " ^ lows * " ~ handleChar() ~ ") - lows & ~(" ~ var ~ " ^ lows * " ~ handleChar() ~ ")";
588 						}
589 						i += 2;
590 					} else if (match[i] == '!') {
591 						static if (sse) {
592 							code ~= "maskNotEqual(" ~ var ~ ", SIMDFromScalar!(ubyte16, " ~ handleChar() ~ "))";
593 						} else if (match[i+1] == 0) {
594 							code ~= "(~(" ~ var ~ " - lows) | " ~ var ~ ")";
595 						} else {
596 							code ~= "(~((" ~ var ~ " ^ lows * " ~ handleChar() ~ ") - lows) | (" ~ var ~ " ^ lows * " ~ handleChar() ~ "))";
597 						}
598 						i += 2;
599 					} else if (match[i] == '<') {
600 						static if (sse)
601 							code ~= "maskGreater(SIMDFromScalar!(ubyte16, " ~ handleChar() ~ "), " ~ var ~ ")";
602 						else
603 							code ~= "maskLessGeneric!" ~ handleChar() ~ "(" ~ var ~ ")";
604 						i += 2;
605 					} else if (match[i] == '>') {
606 						static if (sse)
607 							code ~= "maskGreater(" ~ var ~ ", SIMDFromScalar!(ubyte16, " ~ handleChar() ~ "))";
608 						else
609 							code ~= "maskGreaterGeneric!" ~ handleChar() ~ "(" ~ var ~ ")";
610 						i += 2;
611 					} else if (match[i .. $].startsWith("or(")) {
612 						static if (sse) {
613 							nesting ~= ", ";
614 							code ~= "or(";
615 						} else {
616 							nesting ~= " | ";
617 						}
618 						complexity++;
619 						i += 3;
620 					} else if (match[i .. $].startsWith("and(")) {
621 						static if (sse) {
622 							nesting ~= ", ";
623 							code ~= "and(";
624 						} else {
625 							nesting ~= " & ";
626 						}
627 						complexity++;
628 						i += 4;
629 					} else if (match[i] == ',') {
630 						enforce(nesting.length, "',' on top level");
631 						code ~= nesting[$-1];
632 						i++;
633 					} else if (match[i] == ')') {
634 						enforce(nesting.length, "Unbalanced closing parenthesis");
635 						nesting.length--;
636 						static if (sse) {
637 							code ~= ")";
638 						}
639 						i++;
640 					} else if (match[i].isWhite) {
641 						i++;
642 					} else {
643 						throw new Exception(format("Unexpected character at index %s: 0x%02x", i, match[i]));
644 					}
645 				}
646 				static if (sse) {
647 					code = "__builtin_ia32_pmovmskb128(" ~ code ~ ")";
648 				} else {
649 					code = "(" ~ code ~ ") & highs";
650 				}
651 			}
652 			return result;
653 		}
654 		
655 		enum genMatchTable()
656 		{
657 			ubyte[1 << 16] table;
658 			ubyte[256] lut;
659 			foreach (uint i; 0 .. 256) {
660 				lut[i] = (mixin(genMatchCode!false("i").code) >> 7) & 1;
661 			}
662 			foreach (i; 0 .. 256) foreach (k; 0 .. 256) {
663 				table[i * 256 + k] = cast(ubyte) (lut[i] << 1 | lut[k]);
664 			}
665 			return table;
666 		}
667 	}
668 }
669 
670 /**
671  * Template for searching a fixed value in a word sized memory block (i.e. 1, 2, 4 or 8 bytes).
672  *
673  * Params:
674  *   value = The value you are looking for.
675  *   word = The data word to search for the value.
676  *
677  * Returns:
678  *   non-zero, iff the value is contained in the data word.
679  *   Specifically it returns 0x80 for every byte of the word that was a match and 0x00 for others.
680  *
681  * See_Also:
682  *   http://graphics.stanford.edu/~seander/bithacks.html#ValueInWord
683  */
684 T maskEqualGeneric(ubyte value, T)(T word) @safe pure nothrow
685 	if (isUnsigned!T)
686 {
687 	// This value results in 0x01 for each byte of a T value.
688 	enum lows = T.max / 0xFF;
689 	static if (value == 0) {
690 		enum highs = lows * 0x80;
691 		return (word - lows) & ~word & highs;
692 	} else {
693 		enum xor = lows * value;
694 		return maskEqualGeneric!0(word ^ xor);
695 	}
696 }
697 
698 T maskLessGeneric(ubyte value, T)(T word) @safe pure nothrow
699 	if (isUnsigned!T && value <= 128)
700 {
701 	enum lows = T.max / 0xFF;
702 	enum highs = lows * 0x80;
703 	return (word - lows * value) & ~word & highs;
704 }
705 
706 T maskGreaterGeneric(ubyte value, T)(T word) @safe pure nothrow
707 	if (isUnsigned!T && value <= 127)
708 {
709 	enum lows = T.max / 0xFF;
710 	enum highs = lows * 0x80;
711 	return (word + lows * (127 - value) | word) & highs;
712 }
713 
714 T orGeneric(T)(T a, T b) @safe pure nothrow
715 	if (isUnsigned!T)
716 {
717 	return a | b;
718 }