1 /*************************************************************************************************** 2 * 3 * Text parsing functionality. 4 * 5 * Authors: 6 * $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 7 * 8 * Copyright: 9 * © 2015 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 10 * 11 * License: 12 * $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0) 13 * 14 **************************************************************************************************/ 15 module fast.parsing; 16 17 import std.traits; 18 import fast.helpers; 19 20 21 /+ 22 ╔══════════════════════════════════════════════════════════════════════════════ 23 ║ ⚑ Hexadecimal 24 ╚══════════════════════════════════════════════════════════════════════════════ 25 +/ 26 27 /******************************************************************************* 28 * 29 * Decodes a single hexadecimal character. 30 * 31 * Params: 32 * c = The hexadecimal digit. 33 * 34 * Returns: 35 * `c` converted to an integer. 36 * 37 **************************************/ 38 @safe @nogc pure nothrow 39 uint hexDecode(char c) 40 { 41 return c + 9 * (c >> 6) & 15; 42 } 43 44 45 @nogc pure nothrow 46 uint hexDecode4(ref const(char)* hex) 47 { 48 uint x = *cast(uint*) &hex; 49 hex += 4; 50 x = (x & 0x0F0F0F0F) + 9 * (x >> 6 & 0x01010101); 51 version (LittleEndian) 52 { 53 return x >> 24 | x >> 12 & 0xF0 | x & 0xF00 | x << 12 & 0xF000; 54 } 55 else 56 { 57 x = (x | x >> 4) & 0x00FF00FF; 58 return (x | x >> 8) & 0x0000FFFF; 59 } 60 } 61 62 63 @nogc pure nothrow 64 inout(char)* hexDecode4(ref inout(char)* hex, out uint result) 65 { 66 foreach (i; 0 .. 4) 67 { 68 char ch = hex[i]; 69 result *= 16; 70 if (ch >= '0' && ch <= '9') 71 { 72 result += ch - '0'; 73 } 74 else 75 { 76 ch |= 0x20; 77 if (ch >= 'a' && ch <= 'f') 78 result += ch - 'a' + 10; 79 else 80 return hex + i; 81 } 82 } 83 hex += 4; 84 return null; 85 } 86 87 88 /+ 89 ╔══════════════════════════════════════════════════════════════════════════════ 90 ║ ⚑ Numbers 91 ╚══════════════════════════════════════════════════════════════════════════════ 92 +/ 93 94 95 /// Options for `parseNumber`. 96 struct NumberOptions 97 { 98 /// Allows the minus sign as the first character and thus negative numbers. 99 bool minus; 100 } 101 102 103 /******************************************************************************* 104 * 105 * Parse a number from a character read pointer. 106 * 107 * On success, the read pointer is set behind the number. 108 * 109 * Params: 110 * opt = Selects features for the implementation. Less features make the 111 * parser faster. 112 * str = The read pointer. 113 * n = A reference to a number to be overwritten with the result. 114 * 115 * Returns: 116 * An indication of success. Typically the function fails when a number cannot 117 * be stored in an integer of the given size or invalid characters are 118 * encountered. 119 * 120 **************************************/ 121 @nogc pure nothrow 122 bool parseNumber(NumberOptions opt, N)(ref const(char)* str, ref N n) if (isNumeric!N) 123 { 124 import core.bitop; 125 import std.range; 126 import fast.helpers; 127 128 // Integer types larger than the mantissa of N. 129 static if (N.sizeof <= size_t.sizeof) 130 { 131 alias U = size_t; 132 alias I = ptrdiff_t; 133 } 134 else 135 { 136 alias U = ulong; 137 alias I = long; 138 } 139 140 // Largest value of type U that can be multiplied by 10 and have a digit added without overflow. 141 enum canHoldOneMoreDigit = (U.max - 9) / 10; 142 static if (isFloatingPoint!N) 143 { 144 enum significandRightShift = 8 * U.sizeof - N.mant_dig + 1; 145 enum lastSignificandBit = U(2) << 8 * U.sizeof - N.mant_dig; 146 enum firstFractionBit = U(1) << 8 * U.sizeof - N.mant_dig; 147 enum remainderBits = U.max - N.mant_dig + 1; 148 enum expShift = N.mant_dig - 1; 149 enum expBias = N.max_exp - 1; 150 } 151 152 static if (isFloatingPoint!N) 153 { 154 alias pow5Max = PowData!(U, 5).powMax; 155 alias pow5 = PowData!(U, 5).pows; 156 157 // Largest power of 10 that fits into a float of type N. The factor 5 here is correct, as the 2s 158 // go in as an increment in the exponent, that is neglectable here. 159 enum pow10MaxF = { 160 U v = 1; uint exp; 161 while (v <= ((U(1) << N.mant_dig) - 1) / 5) { v *= 5; exp++; } 162 return exp; 163 }(); 164 165 static immutable N[pow10MaxF] pow10F = N(10).recurrence!((a, n) => 10 * a[n-1]).take(pow10MaxF).array; 166 } 167 else 168 { 169 alias pow10Max = PowData!(U, 10).powMax; 170 alias pow10 = PowData!(U, 10).pows; 171 } 172 173 const(char)* p = str; 174 const(char)* point = null; 175 U significand = 0; 176 size_t exponent = 0; 177 size_t expAdjust = void; 178 bool expSign = void; 179 static if (isFloatingPoint!N) 180 { 181 U exp2 = void; 182 bool roundUp = false; 183 } 184 185 /////////////////// SIGN BIT HANDLING /////////////////// 186 187 // Check for the sign. 188 static if (opt.minus) 189 { 190 bool sign = (*p == '-'); 191 if (sign) 192 p++; 193 } 194 195 /////////////////// INTEGRAL PART OF SIGNIFICAND /////////////////// 196 197 uint digit = *p - '0'; 198 if (digit == 0) 199 { 200 // We have a single zero. 201 p++; 202 } 203 else if (digit <= 9) 204 { 205 // Regular case of one or more digits. 206 do 207 { 208 if (significand > canHoldOneMoreDigit) 209 goto BigMantissa; 210 BigMantissaNotSoMuch: 211 significand = 10 * significand + digit; 212 digit = *++p - '0'; 213 } 214 while (digit <= 9); 215 } 216 else return false; 217 218 /////////////////// FRACTIONAL PART OF SIGNIFICAND /////////////////// 219 220 if (*p == '.') 221 { 222 point = ++p; 223 digit = *p - '0'; 224 if (digit > 9) 225 return false; 226 do 227 { 228 if (significand > canHoldOneMoreDigit) 229 goto BigMantissa; 230 significand = 10 * significand + digit; 231 digit = *++p - '0'; 232 } 233 while (digit <= 9); 234 } 235 236 /////////////////// EXPONENT HANDLING /////////////////// 237 238 expAdjust = (point is null) ? 0 : p - point; 239 if ((*p | 0x20) == 'e') 240 { 241 p++; 242 expSign = (*p == '-'); 243 if (expSign || *p == '+') 244 p++; 245 digit = *p - '0'; 246 if (digit > 9) 247 return false; 248 do 249 { 250 if (exponent > canHoldOneMoreDigit) 251 goto BigExponent; 252 exponent = 10 * exponent + digit; 253 digit = *++p - '0'; 254 } 255 while (digit <= 9); 256 } 257 258 if (expAdjust) 259 { 260 if (expSign) 261 { 262 if (exponent > size_t.max - expAdjust) 263 goto BigExponentAdjustForDecimalPoint; 264 exponent += expAdjust; 265 } 266 else if (exponent >= expAdjust) 267 { 268 exponent -= expAdjust; 269 } 270 else 271 { 272 // Amount of fraction digits turns exponent from positive to negative. 273 expAdjust -= exponent; 274 exponent = expAdjust; 275 expSign = true; 276 } 277 } 278 279 /////////////////// RESULT ASSEMBLY /////////////////// 280 281 static if (isFloatingPoint!N) 282 { 283 if (significand == 0 || exponent == 0) 284 { 285 // The significand is the unsigned result. 286 static if (opt.minus) 287 if (sign) 288 n = -N(significand); 289 n = +N(significand); 290 str = p; 291 return true; 292 } 293 294 // Try the floating-point fast path: The significand's bits, as well as the 10^x exponent can be expressed 295 // accurately as a float of type N. We just need to divide or multiply them based on the signedness of the 296 // exponent. 297 exp2 = bsr(significand); 298 if (exp2 - bsf(significand) < N.mant_dig && exponent <= pow10MaxF) 299 { 300 N b = pow10F[exponent - 1]; 301 static if (opt.minus) 302 if (sign) 303 b = -b; 304 n = expSign ? significand / b : significand * b; 305 str = p; 306 return true; 307 } 308 else if (exponent <= pow5Max) 309 { 310 // Special case, mostly to handle the little bit of extra precision that comes from 311 // converting a double to its string representation. The last base-10 digit doesn't quite 312 // fit back into a double, but we don't need to resort to arbitrary precision math just yet. 313 if (expSign) 314 { 315 U divisor = pow5[exponent - 1]; 316 static if (isAMD64 && (isLDC || isGDC)) 317 { 318 // AMD64 can divide 128-bit numbers by 64-bit numbers directly. 319 ubyte expDivisor = clz(divisor); 320 divisor <<= expDivisor; 321 exp2 = expDivisor - exponent - bigDiv(significand, divisor); 322 significand <<= 1; 323 } 324 else 325 { 326 // We perform an iterative division. 327 U dividend = significand << 8 * U.sizeof - 1 - exp2; 328 U quotient = dividend / divisor; 329 dividend %= divisor; 330 331 ubyte lzs = clz(quotient); 332 exp2 -= exponent + lzs; 333 significand = quotient << ++lzs; 334 size_t accuracy = 8 * U.sizeof - lzs; 335 while (accuracy < N.mant_dig) 336 { 337 lzs = clz(dividend); 338 dividend <<= lzs; 339 quotient = dividend / divisor; 340 dividend %= divisor; 341 significand |= quotient << (8 * U.sizeof - lzs) >> accuracy; 342 accuracy += lzs; 343 } 344 } 345 346 // Assemble floating point value from bits. 347 roundUp = (significand & firstFractionBit) != 0; 348 significand >>= significandRightShift; 349 if (roundUp) 350 { 351 significand++; 352 significand &= ~(U(1) << N.mant_dig - 1); 353 if (significand == 0) 354 ++exp2; 355 } 356 357 U* result = cast(U*) &n; 358 *result = exp2 + expBias << expShift | significand; 359 static if (opt.minus) 360 *result |= U(sign) << U.sizeof * 8 - 1; 361 str = p; 362 return true; 363 } 364 else assert(0, "Not implemented"); 365 } 366 else assert(0, "Not implemented"); 367 } 368 else 369 { 370 import fast.intmath; 371 372 if (exponent && significand) 373 { 374 // We need to account for the exponent. 375 U pow = pow10[exponent - 1]; 376 if (expSign) 377 { 378 // Negative exponent, if we get a fractional result, abort. 379 if (significand % pow) 380 return false; 381 significand /= pow; 382 } 383 else static if (U.sizeof < ulong.sizeof) 384 { 385 // Multiply using a bigger result type 386 ulong prod = ulong(significand) * pow; 387 if (prod > U.max) 388 return false; 389 significand = cast(U) prod; 390 } 391 else 392 { 393 // If the multiply will overflow, abort. 394 bool overflowed; 395 significand = mulu(significand, pow, overflowed); 396 if (overflowed) 397 return false; 398 } 399 } 400 401 n = cast(N) significand; 402 static if (isSigned!N && opt.minus) 403 { 404 if (significand > U(N.max) + sign) 405 return false; 406 if (sign) 407 n = -n; 408 } 409 else if (significand > N.max) 410 return false; 411 str = p; 412 return true; 413 } 414 415 BigMantissa: 416 if (significand <= (significand.max - digit) / 10) 417 goto BigMantissaNotSoMuch; 418 // assert(0, "Not implemented"); 419 420 BigExponent: 421 // assert(0, "Not implemented"); 422 423 BigExponentAdjustForDecimalPoint: 424 // assert(0, "Not implemented"); 425 return false; 426 } 427 428 429 private template PowData(U, U base) 430 { 431 import std.range; 432 433 // Largest power of `base` that fits into an integer of type U. 434 enum powMax = { U v = 1; uint exp; while (v <= U.max / base) { v *= base; exp++; } return exp; }(); 435 436 // Table of powers of `base`. (We skip base^0) 437 static immutable U[powMax] pows = base.recurrence!((a, n) => base * a[n-1]).take(powMax).array; 438 } 439 440 441 static if (isAMD64 && (isLDC || isGDC)) 442 { 443 @nogc pure nothrow 444 private ubyte bigDiv(ref size_t a, size_t b) 445 in 446 { 447 assert(b > size_t.max / 2, "High bit of divisor must be set."); 448 } 449 body 450 { 451 // Make sure that the division will yield exactly 32 or 64 significant bits. 452 ubyte lza = clz(a); 453 version (LDC) 454 { 455 import ldc.llvmasm; 456 a <<= lza; 457 if (a >= b) { a >>= 1; lza--; } 458 a = __asm!ulong(" 459 xor %rax, %rax 460 divq $2 461 ", "={rax},{rdx},rm", a, b); 462 } 463 else version (GNU) 464 { 465 size_t dividend = a << lza; 466 if (dividend >= b) { dividend >>= 1; lza--; } 467 asm { " 468 xor %%rax, %%rax 469 divq %3 470 " : "=&a" a, "=d" dividend : "d" dividend, "rm" b; } 471 } 472 return ++lza; 473 } 474 475 unittest 476 { 477 size_t a = size_t.max / 11; 478 size_t b = size_t.max / 5; 479 version (X86_64) 480 { 481 int exp = clz(b); // Positive base-2 exponent 482 b <<= exp; 483 exp -= bigDiv(a, b); 484 assert(a == 0xE8BA2E8BA2E8BA2AUL); 485 assert(exp == -1); 486 } 487 } 488 } 489 490 491 /+ 492 ╔══════════════════════════════════════════════════════════════════════════════ 493 ║ ⚑ String Scanning and Comparison 494 ╚══════════════════════════════════════════════════════════════════════════════ 495 +/ 496 497 /******************************************************************************* 498 * 499 * Compares a string of unknown length against a statically known key. 500 * 501 * This function also handles escapes and requires one or more terminator chars. 502 * 503 * Params: 504 * C = Character with. 505 * key = The static key string. 506 * terminators = A list of code units that terminate the string. 507 * special = A list of code units that are handled by the user callback. Use 508 * this for escape string handling. Default is `null`. 509 * p_str = Pointer to the string for the comparison. 510 * callback = User callback to handle special escape characters if `special` 511 * is non-empty. 512 * 513 * Returns: 514 * A code with following meanings: -1 = not equal, terminator character hit, 515 * 0 = not equal, but string not exhausted, 1 = string equals key. 516 * 517 **************************************/ 518 int fixedTermStrCmp(C, immutable C[] key, immutable C[] terminators, immutable C[] special = null) 519 (ref const(C)* p_str, scope bool delegate(ref immutable(char)*, ref const(char)*) callback = null) 520 in 521 { 522 assert(special.length == 0 || callback !is null); 523 } 524 body 525 { 526 import std.algorithm, std.range; 527 528 static immutable byte[256] classify = 529 iota(256).map!(c => terminators.canFind(c) ? byte(-1) : special.canFind(c) ? 1 : 0).array; 530 531 immutable(C)* p_key = key.ptr; 532 immutable C* e_key = p_key + key.length; 533 534 while (p_key !is e_key) 535 { 536 int clazz = *p_str <= 0xFF ? classify[*p_str] : 0; 537 538 if (clazz < 0) 539 { 540 return clazz; 541 } 542 else if (clazz == 0) 543 { 544 if (*p_str != *p_key) 545 return clazz; 546 547 p_str++; 548 p_key++; 549 } 550 else if (clazz > 0) 551 { 552 if (!callback(p_key, p_str)) 553 return 0; 554 } 555 } 556 557 return classify[*p_str & 0xFF] < 0; 558 } 559 560 561 /* 562 @nogc nothrow 563 void fixedStringCompareSSE4() 564 { 565 enum words = key.length / 16; 566 enum remainder = key.length % 16; 567 enum contains0 = key.canFind('\0'); // For SSE4.2 string search. 568 static assert(!contains0, "Not implemented"); 569 570 size_t remaining = e - b; 571 auto p = b; 572 573 foreach (i; staticIota!(0, words)) 574 { 575 auto backup = p; 576 p.vpcmpistri!(char, key[16 * i .. 16 * i + 16], Operation.equalElem, Polarity.negateValid); 577 p = backup; 578 p.vpcmpistri!(char, key[16 * i .. 16 * i + 16], Operation.equalElem, Polarity.negateValid); 579 } 580 } 581 */ 582 583 584 @forceinline @nogc nothrow pure 585 void seekToAnyOf(string cs)(ref const(char)* p) 586 { 587 p.vpcmpistri!(char, sanitizeChars(cs), Operation.equalAnyElem); 588 } 589 590 591 @forceinline @nogc nothrow pure 592 void seekToRanges(string cs)(ref const(char)* p) 593 { 594 p.vpcmpistri!(char, sanitizeRanges(cs), Operation.inRanges); 595 } 596 597 598 /******************************************************************************* 599 * 600 * Searches for a specific character known to appear in the stream and skips the 601 * read pointer over it. 602 * 603 * Params: 604 * c = the character 605 * p = the read pointer 606 * 607 **************************************/ 608 @forceinline @nogc nothrow pure 609 void seekPast(char c)(ref const(char)* p) 610 { 611 p.vpcmpistri!(char, c.repeat(16).to!string, Operation.equalElem); 612 p++; 613 } 614 615 616 /******************************************************************************* 617 * 618 * Skips the read pointer over characters that fall into any of up to 8 ranges 619 * of characters. The first character in `cs` is the start of the first range, 620 * the second character is the end. This is repeated for any other character 621 * pair. A character falls into a range from `a` to `b` if `a <= *p <= b`. 622 * 623 * Params: 624 * cs = the character ranges 625 * p = the read pointer 626 * 627 **************************************/ 628 @forceinline @nogc nothrow pure 629 void skipCharRanges(string cs)(ref const(char)* p) 630 { 631 p.vpcmpistri!(char, cs, Operation.inRanges, Polarity.negate); 632 } 633 634 635 /******************************************************************************* 636 * 637 * Skips the read pointer over all and any of the given characters. 638 * 639 * Params: 640 * cs = the characters to skip over 641 * p = the read pointer 642 * 643 **************************************/ 644 @forceinline @nogc nothrow pure 645 void skipAllOf(string cs)(ref const(char)* p) 646 { 647 p.vpcmpistri!(char, cs, Operation.equalAnyElem, Polarity.negate); 648 } 649 650 651 /******************************************************************************* 652 * 653 * Skips the read pointer over ASCII white-space including comprising '\t', 654 * '\r', '\n' and ' '. 655 * 656 * Params: 657 * p = the read pointer 658 * 659 **************************************/ 660 @forceinline @nogc nothrow pure 661 void skipAsciiWhitespace(ref const(char)* p) 662 { 663 if (*p == ' ') 664 p++; 665 if (*p > ' ') 666 return; 667 p.skipAllOf!" \t\r\n"; 668 } 669 670 671 /******************************************************************************* 672 * 673 * Sets the read pointer to the start of the next line. 674 * 675 * Params: 676 * p = the read pointer 677 * 678 **************************************/ 679 @forceinline @nogc nothrow pure 680 void skipToNextLine(ref const(char)* p) 681 { 682 p.vpcmpistri!(char, "\r\n", Operation.equalAnyElem); 683 if (p[0] == '\r' && p[1] == '\n') 684 p++; 685 p++; 686 } 687 688 689 private enum sanitizeChars(string cs) 690 { 691 import std.exception; 692 693 bool has0 = false; 694 foreach (c; cs) if (!c) { has0 = true; break; } 695 assert(has0, "Parsers are required to also check for \0 when looking for specific chars."); 696 697 char[] result; 698 foreach (i; 1 .. 256) foreach (c; cs) if (i == c) 699 result ~= c; 700 return result.assumeUnique; 701 } 702 703 704 private enum sanitizeRanges(string cs) 705 { 706 import std.exception; 707 708 bool has0 = false; 709 foreach (i; 0 .. cs.length / 2) if (!cs[2*i]) { has0 = true; break; } 710 assert(has0, "Parsers are required to also check for \0 when looking for specific chars."); 711 712 char[] result; 713 foreach (i; 0 .. cs.length / 2) 714 { 715 if (cs[2*i]) 716 result ~= cs[2*i .. 2*i+2]; 717 else if (cs[2*i+1]) 718 result ~= ['\x01', cs[2*i+1]]; 719 } 720 return result.assumeUnique; 721 } 722 723 724 private enum Operation 725 { 726 equalAnyElem = 0b0_00_00_00, 727 inRanges = 0b0_00_01_00, 728 equalElem = 0b0_00_10_00, 729 substrPos = 0b0_00_11_00, 730 } 731 732 733 private enum Polarity 734 { 735 keep = 0b0_00_00_00, 736 negate = 0b0_01_00_00, 737 negateValid = 0b0_11_00_00, 738 } 739 740 741 @forceinline @nogc nothrow pure 742 private void vpcmpistri(C, immutable(C[]) cs, Operation op, Polarity pol = Polarity.keep, bool lastIndex = false) 743 (ref const(char)* p) 744 if (is(C == char) || is(C == ubyte) || is(C == wchar) || is(C == ushort) || is(C == byte) || is(C == short)) 745 { 746 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53712 747 static if (is(C == char) || is(C == ubyte)) 748 enum ct = 0b00; 749 else static if (is(C == wchar) || is(C == ushort)) 750 enum ct = 0b01; 751 else static if (is(C == byte)) 752 enum ct = 0b10; 753 else 754 enum ct = 0b11; 755 756 enum mode = ct | op | pol | (!!lastIndex << 6); 757 758 version (X86_64) 759 enum creg = "rcx"; 760 else version (X86) 761 enum creg = "ecx"; 762 else static assert(0, "Not implemented"); 763 764 version (LDC) 765 { 766 import ldc.llvmasm; 767 768 p = __asm!(const(char*))(" 769 1: 770 pcmpistri $2, ($1), $3 771 add $$16, $1 772 cmp $$16, %ecx 773 je 1b 774 sub $$16, $1 775 add %" ~ creg ~ ", $1 776 ", "=r,0,K,x,~{ecx}", p, mode, SIMDFromString!cs); 777 } 778 else version (GNU) 779 { 780 asm { " 781 1: 782 pcmpistri %2, (%1), %3 783 add $16, %1 784 cmp $16, %%ecx 785 je 1b 786 sub $16, %1 787 add %%" ~ creg ~ ", %1 788 " : "=r" p : "0" p, "K" mode, "x" SIMDFromString!cs : "ecx"; } 789 } 790 else 791 { 792 alias csXMM = SIMDFromString!cs; 793 version (D_InlineAsm_X86_64) 794 { 795 version (Posix) 796 { 797 asm @nogc pure nothrow 798 { 799 naked; 800 movdqa XMM0, csXMM; 801 mov RAX, [RDI]; 802 L1: 803 vpcmpistri XMM0, [RAX], mode; 804 add RAX, 16; 805 cmp ECX, 16; 806 je L1; 807 sub RAX, 16; 808 add RAX, RCX; 809 mov [RDI], RAX; 810 ret; 811 } 812 } 813 else static assert(0, "Not implemented"); 814 } 815 else version (D_InlineAsm_X86) 816 { 817 version (Posix) 818 { 819 asm @nogc pure nothrow 820 { 821 naked; 822 movdqa XMM0, csXMM; 823 mov EDX, [EAX]; 824 L1: 825 vpcmpistri XMM0, [EDX], mode; 826 add EDX, 16; 827 cmp ECX, 16; 828 je L1; 829 sub EDX, 16; 830 add EDX, ECX; 831 mov [EAX], EDX; 832 ret; 833 } 834 } 835 else static assert(0, "Not implemented"); 836 } 837 else static assert(0, "Not implemented"); 838 } 839 }