1 /***************************************************************************************************
2  * 
3  * Helper program to generate the lookup tables required for certain Unicode algorithms.
4  * This code is conforming with Unicode 10.0.0.
5  * 
6  * Authors:
7  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
8  * 
9  * Copyright:
10  *   © 2017 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
11  * 
12  * License:
13  *   $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0)
14  * 
15  **************************************************************************************************/
16 module unicode.generator;
17 import std.conv;
18 import std.exception;
19 import core.bitop;
20 import std.stdio;
21 import std.string;
22 import std.algorithm;
23 import std.meta;
24 import std.path;
25 
26 enum PropertyType
27 {
28 	catalog, enumeration, binary, string, numeric, miscellaneous
29 }
30 
31 struct Property
32 {
33 	string name;
34 	string value;
35 }
36 
37 struct Entry
38 {
39 	bool isSet = false;
40 	Property[] properties;
41 }
42 
43 struct Line
44 {
45 	uint rangeStart;
46 	uint rangeEnd;
47 	string[] properties;
48 }
49 
50 struct UnicodeCharacterDatabase
51 {
52 	PropertyType type;
53 	Entry[] entries;
54 	size_t[string] enumerationValues;
55 	string varName;
56 
57 	this(string filename, PropertyType type)
58 	{
59 		import std.algorithm;
60 		import std.stdio;
61 		import std.uni;
62 
63 		this.type = type;
64 		this.entries = new Entry[](0x110000);
65 		this.enumerationValues[null] = 0;
66 		this.varName = baseName(filename, ".txt");
67 		Line[] defaults;
68 		Line[] actuals;
69 		bool abbreviates = false;
70 		string enumOverridePrefix;
71 		string enumOverride;
72 
73 		foreach (line; File(filename).byLine())
74 		{
75 			bool isDefault = false;
76 			char[] code;
77 			Line data;
78 
79 			// Special @missing line syntax ?
80 			static immutable isMissingStr = "# @missing: ";
81 			static immutable propNameStr = "# Property:	";
82 			if (line.startsWith(isMissingStr))
83 			{
84 				isDefault = true;
85 				code = line[isMissingStr.length..$];
86 			}
87 			else if (line.startsWith(propNameStr))
88 			{
89 				abbreviates = true;
90 				enumOverridePrefix = "# "~line[propNameStr.length..$].idup~"=";
91 			}
92 			else if (abbreviates && line.startsWith(enumOverridePrefix))
93 			{
94 				enumOverride = line[enumOverridePrefix.length..$].idup;
95 			}
96 			else
97 			{
98 				// Split between code and comment section
99 				auto commentSplit = findSplit(line, "#");
100 				code = commentSplit[0];
101 			}
102 			code = strip!isWhite(code);
103 			if (code.length == 0)
104 				continue;
105 
106 			uint fieldIdx = 0;
107 			foreach (field; splitter(code, ';'))
108 			{
109 				field = strip!isWhite(field);
110 				switch (fieldIdx)
111 				{
112 					case 0: // Code point(s)
113 						auto range = findSplit(field, "..");
114 						data.rangeStart = to!uint(range[0], 16);
115 						data.rangeEnd = range[1] == ".." ? to!uint(range[2], 16) : data.rangeStart;
116 						enforce(data.rangeEnd <= 0x10FFFF);
117 						enforce(data.rangeStart <= data.rangeEnd);
118 						data.rangeEnd++;
119 						break;
120 					default:
121 						string ifield = enumOverride ? enumOverride : field.idup;
122 						data.properties ~= ifield;
123 						if (type == PropertyType.enumeration)
124 						{
125 							if (ifield !in enumerationValues)
126 								enumerationValues[ifield] = enumerationValues.length;
127 						}
128 				}
129 				fieldIdx++;
130 			}
131 			if (type == PropertyType.enumeration)
132 				enforce(fieldIdx >= 2);
133 			else assert(0, "Not implemented");
134 
135 			if (isDefault)
136 				defaults ~= data;
137 			else
138 				actuals ~= data;
139 		}
140 
141 		foreach (set; [defaults, actuals])
142 		{
143 			foreach (ref definition; set)
144 			{
145 				foreach (cp; definition.rangeStart .. definition.rangeEnd)
146 				{
147 					final switch (type) with (PropertyType)
148 					{
149 						case catalog:
150 							assert(0, "Not implemented");
151 						case enumeration:
152 							enforce(definition.properties.length == 1);
153 							entries[cp].properties = [Property(null, definition.properties[0])];
154 							entries[cp].isSet = true;
155 							break;
156 						case binary:
157 						case string:
158 						case numeric:
159 						case miscellaneous:
160 							assert(0, "Not implemented");
161 					}
162 				}
163 			}
164 		}
165 
166 		foreach (cp; 0 .. 0x110000)
167 			enforce(entries[cp].isSet);
168 	}
169 
170 	struct TableEntry
171 	{
172 		ubyte[][] byteSeqs;
173 		string enumerationValue;
174 		Table* subEntries;
175 		
176 		string toString()
177 		{
178 			if (subEntries)
179 				return subEntries.to!string();
180 			else
181 				return enumerationValue;
182 		}
183 	}
184 	
185 	struct Table
186 	{
187 		uint level, idx;
188 		TableEntry[256] entries;
189 		
190 		size_t toHash() const nothrow
191 		{
192 			size_t result;
193 			foreach (i; 0 .. 256)
194 			{
195 				if (entries[i].subEntries)
196 					result = hashOf(entries[i].subEntries.idx, result);
197 				else
198 					result = hashOf(entries[i].enumerationValue, result);
199 			}
200 			return hashOf(level, result);
201 		}
202 		
203 		bool opEquals(ref const Table key) const
204 		{
205 			foreach (i; 0 .. 256)
206 			{
207 				if ((this.entries[i].subEntries is null) != (key.entries[i].subEntries is null))
208 					return false;
209 				if (this.entries[i].subEntries)
210 				{
211 					if (this.entries[i].subEntries.idx != key.entries[i].subEntries.idx)
212 						return false;
213 				}
214 				else if (this.entries[i].enumerationValue != key.entries[i].enumerationValue)
215 				{
216 					return false;
217 				}
218 			}
219 			return this.level == key.level;
220 		}
221 	}
222 
223 	string generateEnumerationCode()
224 	{
225 		auto lookup = new Table;
226 		uint[4] levelAssignments;
227 		foreach (dchar cp; 0 .. 0x110000)
228 		{
229 			ubyte[] byteSeq;
230 			if (cp < 128)
231 			{
232 				byteSeq ~= cast(char)cp;
233 			}
234 			else
235 			{
236 				uint topBit = 6;
237 				uint bits = cp;
238 				do
239 				{
240 					byteSeq = char(bits & 0x3F | 0x80) ~ byteSeq;
241 					bits >>= 6;
242 					topBit--;
243 				}
244 				while (bits && bsr(bits) >= topBit);
245 				byteSeq = cast(char)(0xFE << topBit | bits) ~ byteSeq;
246 			}
247 			auto table = lookup;
248 			foreach (uint i, cu; byteSeq)
249 			{
250 				auto entry = &table.entries[cu];
251 				if (entry.subEntries)
252 				{
253 					table = entry.subEntries;
254 				}
255 				else if (entry.enumerationValue is null)
256 				{
257 					entry.byteSeqs = [byteSeq];
258 					entry.enumerationValue = entries[cp].properties[0].value;
259 					break;
260 				}
261 				else if (entry.enumerationValue == entries[cp].properties[0].value)
262 				{
263 					entry.byteSeqs ~= byteSeq;
264 					break;
265 				}
266 				else
267 				{
268 					auto subTable = new Table(i+1);
269 					foreach (byteSeq2; entry.byteSeqs)
270 					{
271 						subTable.entries[byteSeq2[i+1]].enumerationValue = entry.enumerationValue;
272 						subTable.entries[byteSeq2[i+1]].byteSeqs = [byteSeq2];
273 					}
274 					entry.byteSeqs = null;
275 					entry.enumerationValue = null;
276 					entry.subEntries = subTable;
277 				}
278 				table = entry.subEntries;
279 			}
280 		}
281 
282 		Table*[Table] tableSet;
283 		Table*[uint][4] tableByIdx;
284 		tableByIdx[0][0] = lookup;
285 
286 		void assignIndices(Table* table, uint level = 0)
287 		{
288 			foreach (i, entry; table.entries)
289 			{
290 				if (entry.subEntries)
291 				{
292 					assignIndices(entry.subEntries, level + 1);
293 					if (auto dup = *entry.subEntries in tableSet)
294 					{
295 						entry.subEntries = *dup;
296 					}
297 					else
298 					{
299 						entry.subEntries.idx = levelAssignments[level + 1]++;
300 						tableByIdx[level + 1][entry.subEntries.idx] = entry.subEntries;
301 						tableSet[*entry.subEntries] = entry.subEntries;
302 					}
303 				}
304 			}
305 		}
306 		assignIndices(lookup);
307 		levelAssignments[0] = 1;
308 
309 		writefln("%s: Using %s tables with a total size: %s KiB",
310 			varName, sum(levelAssignments[]), sum(levelAssignments[]) / 4f);
311 		stdout.flush(); // in case we are buffered
312 
313 		auto level0 = new ubyte[256][](levelAssignments[0]);
314 		auto level1 = new ubyte[256][](levelAssignments[1]);
315 		auto level2 = new ubyte[256][](levelAssignments[2]);
316 		auto level3 = new ubyte[256][](levelAssignments[3]);
317 
318 		foreach (level, bin; AliasSeq!(level0, level1, level2, level3))
319 		{
320 			foreach (idx; 0 .. levelAssignments[level])
321 			{
322 				Table* table = tableByIdx[level][idx];
323 				enforce(table.idx   == idx);
324 				enforce(table.level == level);
325 				enforce(levelAssignments[level] + enumerationValues.length <= 256,
326 					format("Sum of tables and enumarations at level %s exceeds ubyte storage capacity", level));
327 				foreach (i, ref entry; table.entries)
328 				{
329 					if (entry.subEntries)
330 						bin[idx][i] = cast(ubyte)(entry.subEntries.idx + enumerationValues.length);
331 					else
332 						bin[idx][i] = cast(ubyte)enumerationValues[entry.enumerationValue];
333 				}
334 			}
335 		}
336 
337 		// Write struct with enum
338 		string code = "struct " ~ varName ~ "\n{\n";
339 		auto sortedEnum = new string[](enumerationValues.length);
340 		foreach (key, value; enumerationValues)
341 			sortedEnum[value] = key;
342 		code ~= "\tenum Enum : size_t\n\t{\n\t\t";
343 		foreach (key, value; sortedEnum)
344 			code ~= (value ? value : "__") ~ ", ";
345 		code ~= "\n\t}\n\n";
346 		foreach (k, bin; AliasSeq!(level0, level1, level2, level3))
347 		{
348 			code ~= "\tstatic immutable ubyte[256][" ~ to!string(bin.length) ~ "] level" ~ to!string(k) ~ " = [\n";
349 			foreach (i; 0 .. bin.length)
350 				code ~= "\t\t[" ~ format("%(%s,%)", bin[i]) ~ "],\n";
351 			code ~= "\t];\n";
352 		}
353 		code ~= "}\n\n";
354 		return code;
355 	}
356 }
357 
358 alias UCD = UnicodeCharacterDatabase;
359 
360 void main()
361 {
362 	string code = "module fast.internal.unicode_tables;\n\n";
363 	UCD ucd;
364 
365 	ucd = UCD("../ucd/auxiliary/GraphemeBreakProperty.txt", PropertyType.enumeration);
366 	code ~= ucd.generateEnumerationCode();
367 	ucd = UCD("../ucd/extracted/DerivedGeneralCategory.txt", PropertyType.enumeration);
368 	code ~= ucd.generateEnumerationCode();
369 	ucd = UCD("../ucd/extracted/DerivedLineBreak.txt", PropertyType.enumeration);
370 	code ~= ucd.generateEnumerationCode();
371 
372 	auto tableFile = File("../source/fast/internal/unicode_tables.d", "w");
373 	tableFile.write(code);
374 }