1 /*************************************************************************************************** 2 * 3 * Functions to work with the Unicode Transformation Format. 4 * 5 * Grapheme clusters: 6 * A grapheme cluster is roughly speaking what the user would perceive as the smallest unit in a 7 * writing system. Their count can be thought of as a caret position in a text editor. In 8 * particular at grapheme cluster level, different normalization forms (NFC, NFD) become 9 * transparent. The default definition used here is independent of the user's locale. 10 * 11 * Authors: 12 * $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 13 * 14 * Copyright: 15 * © 2017 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 16 * 17 * License: 18 * $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0) 19 * 20 **************************************************************************************************/ 21 module fast.unicode; 22 23 import fast.internal.unicode_tables; 24 import fast.internal.sysdef; 25 import std.simd; 26 27 28 /******************************************************************************* 29 * 30 * Enumeration for the Unicode "General Category" used to roughly classify 31 * codepoints into letters, punctuation etc. 32 * 33 **************************************/ 34 alias GeneralCategory = DerivedGeneralCategory.Enum; 35 36 37 /******************************************************************************* 38 * 39 * A customizable structure providing information on a code point. It consists 40 * of a Unicode `property` in the form of an `enum` (e.g. `GeneralCategory`) and 41 * a `length` in bytes of the code point in UTF-8. 42 * 43 **************************************/ 44 struct CodePointInfo(Enum) 45 { 46 alias property this; 47 size_t length; 48 Enum property; 49 } 50 51 52 /******************************************************************************* 53 * 54 * Counts the number of grapheme clusters (character count) in a UTF string. 55 * 56 * This function uses "extended grapheme clusters" as defined in Unicode: 57 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries 58 * 59 * When invalid byte sequences are encountered, each byte that does not make up 60 * a code point will be counted as one grapheme as visual representations of 61 * such broken strings will often show a square with the hexadecimal byte value 62 * in them. 63 * 64 * Params: 65 * str = the UTF-8 string 66 * 67 * Returns: 68 * the number of grapheme clusters 69 * 70 **************************************/ 71 @nogc @trusted pure nothrow size_t 72 countGraphemes(scope const(char)[] str) 73 { 74 enum numValues = GraphemeBreakProperty.Enum.max + 1; 75 static immutable graphemeBreakRules = 76 { 77 // GB999 78 byte[numValues][numValues] graphemeBreaks = true; 79 with (GraphemeBreakProperty.Enum) 80 { 81 // GB12 + GB13 (special handling) 82 foreach (i; 0 .. numValues) 83 graphemeBreaks[i][Regional_Indicator] = -1; 84 // GB11 85 graphemeBreaks[ZWJ][Glue_After_Zwj] = false; 86 graphemeBreaks[ZWJ][E_Base_GAZ] = false; 87 // GB10 (special handling) 88 graphemeBreaks[E_Base] [E_Modifier] = false; 89 graphemeBreaks[E_Base_GAZ][E_Modifier] = false; 90 graphemeBreaks[Extend] [E_Modifier] = -1; 91 // GB9b 92 foreach (i; 0 .. numValues) 93 graphemeBreaks[Prepend][i] = false; 94 // GB9a 95 foreach (i; 0 .. numValues) 96 graphemeBreaks[i][SpacingMark] = false; 97 // GB9 98 foreach (i; 0 .. numValues) 99 { 100 graphemeBreaks[i][Extend] = false; 101 graphemeBreaks[i][ZWJ] = false; 102 } 103 graphemeBreaks[E_Base] [Extend] = -1; 104 graphemeBreaks[E_Base_GAZ][Extend] = -1; 105 // GB8 106 graphemeBreaks[LVT][T] = false; 107 graphemeBreaks[T] [T] = false; 108 // GB7 109 graphemeBreaks[LV][V] = false; 110 graphemeBreaks[LV][T] = false; 111 graphemeBreaks[V] [V] = false; 112 graphemeBreaks[V] [T] = false; 113 // GB6 114 graphemeBreaks[L][L] = false; 115 graphemeBreaks[L][V] = false; 116 graphemeBreaks[L][LV] = false; 117 graphemeBreaks[L][LVT] = false; 118 // GB5 119 foreach (i; 0 .. numValues) 120 { 121 graphemeBreaks[i][Control] = true; 122 graphemeBreaks[i][CR] = true; 123 graphemeBreaks[i][LF] = true; 124 } 125 // GB4 126 foreach (i; 0 .. numValues) 127 { 128 graphemeBreaks[Control][i] = true; 129 graphemeBreaks[CR] [i] = true; 130 graphemeBreaks[LF] [i] = true; 131 } 132 // GB3 133 graphemeBreaks[CR][LF] = false; 134 // Additional homebrew top level rule to break before and after invalid characters 135 foreach (i; 0 .. numValues) 136 { 137 graphemeBreaks[i][__] = true; 138 graphemeBreaks[__][i] = true; 139 } 140 } 141 return graphemeBreaks; 142 }(); 143 144 size_t graphemeCount = 0; 145 auto p = str.ptr; 146 auto graphemeStart = p; 147 GraphemeBreakProperty.Enum last, next; 148 bool riEven, inEmojiBaseExtension; 149 150 @noinline @safe @nogc pure nothrow bool 151 complexRules() 152 { 153 pragma(inline, false); 154 with (GraphemeBreakProperty.Enum) 155 { 156 if (next == Regional_Indicator) 157 { 158 // For GB12 + GB13 we need break only after a complete country code (2 indicators). 159 if (last == Regional_Indicator) 160 return riEven = !riEven; 161 riEven = true; 162 return false; 163 } 164 else if (next == Extend) 165 { 166 inEmojiBaseExtension = true; 167 return false; 168 } 169 else if (inEmojiBaseExtension) 170 { 171 return inEmojiBaseExtension = false; 172 } 173 return true; 174 } 175 } 176 177 @forceinline void 178 graphemeCountImpl(S)(ref S str) 179 { 180 version (LDC) pragma(inline, true); 181 auto cpi = getProperty!GraphemeBreakProperty(str); 182 auto next = cpi.property; 183 byte isBoundary = graphemeBreakRules[last][next]; 184 if (isBoundary < 0 ? complexRules() : isBoundary) 185 { 186 graphemeCount++; 187 static if (is(S == const(char)*)) 188 graphemeStart = str; 189 else 190 graphemeStart = str.ptr; 191 inEmojiBaseExtension = false; 192 } 193 static if (is(S == const(char)*)) 194 str += cpi.length; 195 else 196 str = str[cpi.length..$]; 197 last = next; 198 } 199 200 if (str.length >= 4) 201 { 202 const e = str.ptr + str.length - 4; 203 do 204 graphemeCountImpl(p); 205 while (p <= e); 206 str = str[p - str.ptr..$]; 207 } 208 while (str.length) 209 graphemeCountImpl(str); 210 return graphemeCount; 211 } 212 213 214 /******************************************************************************* 215 * 216 * Retrieves the "General Category" of the first code point in some UTF-8 217 * string. For broken UTF-8, the property is set to `GeneralCategory.__` (`0`). 218 * 219 * Params: 220 * str = the UTF-8 encoded text, which must not be empty 221 * 222 * Returns: 223 * a code point information struct consisting of a the fields `property`, 224 * containing the `GeneralCategory` enumeration and the `length` of the code 225 * point in bytes. 226 * 227 **************************************/ 228 @property @safe @nogc pure nothrow CodePointInfo!GeneralCategory 229 generalCategory(scope const(char)[] str) 230 { 231 return getProperty!DerivedGeneralCategory(str); 232 } 233 unittest 234 { 235 assert("क".generalCategory == GeneralCategory.Other_Letter); 236 assert("̸".generalCategory == GeneralCategory.Nonspacing_Mark); 237 assert("\xFF".generalCategory == GeneralCategory.__); 238 } 239 240 241 242 private: 243 244 @forceinline pure @nogc nothrow auto 245 getProperty(Property, S)(scope S str) if (is(S == const(char)*) || is(S == const(char)[])) 246 in 247 { 248 static if (is(S == const(char)[])) 249 assert(str.length != 0, "No code units passed in."); 250 } 251 out 252 { 253 assert(__result <= Property.Enum.max); 254 } 255 body 256 { 257 version (LDC) pragma(inline, true); 258 import fast.internal.helpers; 259 260 alias Enum = Property.Enum; 261 alias CPI = CodePointInfo!Enum; 262 // Fast path for ASCII. 263 size_t idx = Property.level0[0][str[0]]; 264 if (byte(str[0]) >= 0) return CPI(1, cast(Enum)idx); 265 // On multi-byte sequences, set the length to 1 for invalid sequences (idx == 0). 266 size_t length = clz(str[0] ^ 0xFFu) - 24; 267 // Safely return invalid code point of 1 byte length if string exhausted. 268 static if (is(S == const(char)[])) 269 if (length > str.length) 270 return CPI(1, cast(Enum)0); 271 // Otherwise use lookup table hierarchy to determine if code units form a valid code point 272 if (idx > Enum.max) { 273 idx = Property.level1[idx - Enum.max - 1][str[1]]; 274 if (idx > Enum.max) { 275 idx = Property.level2[idx - Enum.max - 1][str[2]]; 276 if (idx > Enum.max) 277 idx = Property.level3[idx - Enum.max - 1][str[3]]; 278 } 279 } 280 if (idx) 281 return CPI(length, cast(Enum)idx); 282 else 283 return CPI(1, cast(Enum)0); 284 }