fast.unicode source code

1 /***************************************************************************************************
2  * 
3  * Functions to work with the Unicode Transformation Format.
4  * 
5  * Grapheme clusters:
6  *   A grapheme cluster is roughly speaking what the user would perceive as the smallest unit in a
7  *   writing system. Their count can be thought of as a caret position in a text editor. In
8  *   particular at grapheme cluster level, different normalization forms (NFC, NFD) become
9  *   transparent. The default definition used here is independent of the user's locale.
10  * 
11  * Authors:
12  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
13  * 
14  * Copyright:
15  *   © 2017 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
16  * 
17  * License:
18  *   $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0)
19  * 
20  **************************************************************************************************/
21 module fast.unicode;
22 
23 import fast.internal.unicode_tables;
24 import fast.internal.sysdef;
25 import std.simd;
26 
27 
/*******************************************************************************
 * 
 * Enumeration for the Unicode "General Category" used to roughly classify
 * code points into letters, punctuation etc.
 *
 * This is an alias for the `Enum` member of `DerivedGeneralCategory`. It is the
 * property type carried by the result of `generalCategory`.
 *
 **************************************/
alias GeneralCategory = DerivedGeneralCategory.Enum;
35 
36 
/*******************************************************************************
 * 
 * Result of a property lookup on a single code point.
 *
 * The struct pairs the looked-up Unicode `property` (a value of the `enum`
 * type given as template argument, e.g. `GeneralCategory`) with the `length`
 * of the code point, measured in UTF-8 code units (bytes).
 *
 * Through `alias this` an instance can be used wherever the bare property
 * value is expected, for example in comparisons against an enum member.
 *
 **************************************/
struct CodePointInfo(Enum)
{
	/// Number of bytes the code point occupies in UTF-8.
	size_t length;

	/// The Unicode property value found for the code point.
	Enum property;

	// Let the whole struct implicitly decay to the property value.
	alias property this;
}
50 
51 
/*******************************************************************************
 * 
 * Counts the number of grapheme clusters (character count) in a UTF string.
 * 
 * This function uses "extended grapheme clusters" as defined in Unicode:
 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 * 
 * When invalid byte sequences are encountered, each byte that does not make up
 * a code point will be counted as one grapheme as visual representations of
 * such broken strings will often show a square with the hexadecimal byte value
 * in them.
 *
 * Most boundary decisions are a single lookup in a table indexed by the
 * properties of the previous and the current code point. The rules that need
 * more context (regional indicator pairs, emoji modifiers following an emoji
 * base with extenders) are marked in that table and resolved by a small state
 * machine.
 *
 * Params:
 *   str = the UTF-8 string; may be empty
 *
 * Returns:
 *   the number of grapheme clusters, `0` for an empty string
 *
 **************************************/
@nogc @trusted pure nothrow size_t
countGraphemes(scope const(char)[] str)
{
	enum numValues = GraphemeBreakProperty.Enum.max + 1;
	// Table indexed as [previous property][current property]:
	//   true (1) = boundary, false (0) = no boundary, -1 = decided by complexRules().
	// Rules are written from lowest to highest priority, so later assignments override
	// earlier ones.
	static immutable graphemeBreakRules =
	{
		// GB999
		byte[numValues][numValues] graphemeBreaks = true;
		with (GraphemeBreakProperty.Enum)
		{
			// GB11
			graphemeBreaks[ZWJ][Glue_After_Zwj] = false;
			graphemeBreaks[ZWJ][E_Base_GAZ] = false;
			// GB10 (special handling)
			graphemeBreaks[E_Base]    [E_Modifier] = false;
			graphemeBreaks[E_Base_GAZ][E_Modifier] = false;
			graphemeBreaks[Extend]    [E_Modifier] = -1;
			// GB9b
			foreach (i; 0 .. numValues)
				graphemeBreaks[Prepend][i] = false;
			// GB9a
			foreach (i; 0 .. numValues)
				graphemeBreaks[i][SpacingMark] = false;
			// GB9
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][Extend] = false;
				graphemeBreaks[i][ZWJ] = false;
			}
			// GB10 bookkeeping: entering an `Extend` run after an emoji base must be
			// remembered, and leaving such a run without a break must be noticed so the
			// remembered state cannot leak to a later `Extend E_Modifier` pair.
			graphemeBreaks[E_Base]    [Extend] = -1;
			graphemeBreaks[E_Base_GAZ][Extend] = -1;
			graphemeBreaks[Extend]    [ZWJ] = -1;
			graphemeBreaks[Extend]    [SpacingMark] = -1;
			// GB8
			graphemeBreaks[LVT][T] = false;
			graphemeBreaks[T]  [T] = false;
			// GB7
			graphemeBreaks[LV][V] = false;
			graphemeBreaks[LV][T] = false;
			graphemeBreaks[V] [V] = false;
			graphemeBreaks[V] [T] = false;
			// GB6
			graphemeBreaks[L][L] = false;
			graphemeBreaks[L][V] = false;
			graphemeBreaks[L][LV] = false;
			graphemeBreaks[L][LVT] = false;
			// GB5
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][Control] = true;
				graphemeBreaks[i][CR] = true;
				graphemeBreaks[i][LF] = true;
			}
			// GB4
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[Control][i] = true;
				graphemeBreaks[CR]     [i] = true;
				graphemeBreaks[LF]     [i] = true;
			}
			// GB3
			graphemeBreaks[CR][LF] = false;
			// Additional homebrew top level rule to break before and after invalid characters
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][__] = true;
				graphemeBreaks[__][i] = true;
			}
			// GB12 + GB13 (special handling). This is written last on purpose: every
			// regional indicator has to pass through complexRules(), including the first
			// one in the text or after a control/invalid character, so that the pairing
			// state is always initialized before it is read.
			foreach (i; 0 .. numValues)
				graphemeBreaks[i][Regional_Indicator] = -1;
		}
		return graphemeBreaks;
	}();

	size_t graphemeCount = 0;
	auto p = str.ptr;
	// `last` starts out as the enum's initial value, so the first code point is always
	// looked up against that row of the table.
	GraphemeBreakProperty.Enum last, next;
	// riUnpaired: the current cluster ends in a single regional indicator that still
	//             waits for its partner.
	// inEmojiBaseExtension: we are inside `(E_Base | E_Base_GAZ) Extend+`.
	bool riUnpaired, inEmojiBaseExtension;

	// Resolves the table entries marked with -1. Reads `last` and `next` of the
	// enclosing function and returns `true` if there is a boundary between them.
	@noinline @safe @nogc pure nothrow bool
	complexRules()
	{
		pragma(inline, false);
		with (GraphemeBreakProperty.Enum)
		{
			if (next == Regional_Indicator)
			{
				// For GB12 + GB13 we need break only after a complete country code (2 indicators).
				if (last == Regional_Indicator)
					return riUnpaired = !riUnpaired;
				// First indicator of a run: it opens a new pair and starts a new cluster,
				// unless it is glued to a preceding Prepend character (GB9b).
				riUnpaired = true;
				return last != Prepend;
			}
			else if (next == Extend)
			{
				// Only reached with an emoji base in `last`. GB9: never break before Extend.
				inEmojiBaseExtension = true;
				return false;
			}
			else if (next == E_Modifier)
			{
				// `last` is Extend. GB10 joins only if the Extend run directly follows an
				// emoji base; otherwise GB999 applies.
				const attached = inEmojiBaseExtension;
				inEmojiBaseExtension = false;
				return !attached;
			}
			// `last` is Extend and `next` is ZWJ or SpacingMark: no break (GB9, GB9a), but
			// the Extend run after the emoji base is over.
			inEmojiBaseExtension = false;
			return false;
		}
	}

	// Processes one code point at the front of `str` (a slice or a raw pointer), counts
	// a grapheme if a boundary precedes it and advances `str` past it.
	@forceinline void
	graphemeCountImpl(S)(ref S str)
	{
		version (LDC) pragma(inline, true);
		auto cpi = getProperty!GraphemeBreakProperty(str);
		// Assign to the enclosing function's `next`; complexRules() reads that variable.
		next = cpi.property;
		const byte rule = graphemeBreakRules[last][next];
		const bool isBoundary = rule < 0 ? complexRules() : rule != 0;
		if (isBoundary)
		{
			graphemeCount++;
			inEmojiBaseExtension = false;
		}
		static if (is(S == const(char)*))
			str += cpi.length;
		else
			str = str[cpi.length..$];
		last = next;
	}

	// Fast loop: as long as at least 4 bytes remain, a code point can be decoded through
	// the raw pointer without checking the remaining length.
	if (str.length >= 4) 
	{
		const e = str.ptr + str.length - 4;
		do
			graphemeCountImpl(p);
		while (p <= e);
		str = str[p - str.ptr..$];
	}
	// Tail (fewer than 4 bytes left): use the length checked slice variant.
	while (str.length)
		graphemeCountImpl(str);
	return graphemeCount;
}
212 
213 
/*******************************************************************************
 * 
 * Looks up the "General Category" of the code point at the start of a UTF-8
 * string. If the string does not start with a valid code point, the property
 * is `GeneralCategory.__` (`0`).
 *
 * Params:
 *   str = the UTF-8 encoded text, which must not be empty
 *
 * Returns:
 *   a code point information struct with the two fields `property`, holding
 *   the `GeneralCategory` value, and `length`, holding the size of the code
 *   point in bytes.
 * 
 **************************************/
@property @safe @nogc pure nothrow CodePointInfo!GeneralCategory
generalCategory(scope const(char)[] str)
{
	auto info = getProperty!DerivedGeneralCategory(str);
	return info;
}
unittest
{
	// A letter, a combining mark and a byte that can never occur in UTF-8.
	assert("क".generalCategory == GeneralCategory.Other_Letter);
	assert("̸".generalCategory == GeneralCategory.Nonspacing_Mark);
	assert("\xFF".generalCategory == GeneralCategory.__);
}
239 
240 
241 
242 private:
243 
/*
 * Looks up a Unicode property for the code point at the start of `str`.
 *
 * `Property` supplies the enum type `Property.Enum` and the lookup tables
 * `level0` .. `level3`. A table entry no greater than `Enum.max` is a final
 * property value; a larger entry selects the sub-table of the next level,
 * which is then indexed with the next byte of the string. A final value of
 * `0` marks an invalid sequence.
 *
 * `str` is either a slice, which must not be empty, or a raw pointer. Only
 * the slice variant checks the remaining length before reading follow-up
 * bytes; the pointer variant reads up to 4 bytes unchecked.
 *
 * Returns a `CodePointInfo!(Property.Enum)`. For invalid or truncated
 * sequences the property is `0` and the length is 1.
 */
@forceinline pure @nogc nothrow auto
getProperty(Property, S)(scope S str) if (is(S == const(char)*) || is(S == const(char)[]))
in
{
	static if (is(S == const(char)[]))
		assert(str.length != 0, "No code units passed in.");
}
out
{
	assert(__result <= Property.Enum.max);
}
body
{
	version (LDC) pragma(inline, true);
	import fast.internal.helpers;

	alias Enum = Property.Enum;
	alias CPI = CodePointInfo!Enum;

	const lead = str[0];
	size_t node = Property.level0[0][lead];

	// ASCII needs no further decoding: the first table already holds the result.
	if (lead < 0x80)
		return CPI(1, cast(Enum)node);

	// The number of leading 1 bits in the first byte is the sequence length.
	size_t length = clz(lead ^ 0xFFu) - 24;

	// A slice may end in the middle of a sequence; report one invalid byte then.
	static if (is(S == const(char)[]))
	{
		if (length > str.length)
			return CPI(1, cast(Enum)0);
	}

	// Walk down the table hierarchy, one level per follow-up byte, until a final
	// property value (or 0 for an invalid sequence) is reached.
	if (node > Enum.max)
	{
		node = Property.level1[node - Enum.max - 1][str[1]];
		if (node > Enum.max)
		{
			node = Property.level2[node - Enum.max - 1][str[2]];
			if (node > Enum.max)
				node = Property.level3[node - Enum.max - 1][str[3]];
		}
	}

	// Invalid sequences are consumed one byte at a time.
	return node ? CPI(length, cast(Enum)node) : CPI(1, cast(Enum)0);
}