1 /*************************************************************************************************** 2 * 3 * Helper program to generate the lookup tables required for certain Unicode algorithms. 4 * This code is conforming with Unicode 10.0.0. 5 * 6 * Authors: 7 * $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 8 * 9 * Copyright: 10 * © 2017 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 11 * 12 * License: 13 * $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0) 14 * 15 **************************************************************************************************/ 16 module unicode.generator; 17 import std.conv; 18 import std.exception; 19 import core.bitop; 20 import std.stdio; 21 import std.string; 22 import std.algorithm; 23 import std.meta; 24 import std.path; 25 26 enum PropertyType 27 { 28 catalog, enumeration, binary, string, numeric, miscellaneous 29 } 30 31 struct Property 32 { 33 string name; 34 string value; 35 } 36 37 struct Entry 38 { 39 bool isSet = false; 40 Property[] properties; 41 } 42 43 struct Line 44 { 45 uint rangeStart; 46 uint rangeEnd; 47 string[] properties; 48 } 49 50 struct UnicodeCharacterDatabase 51 { 52 PropertyType type; 53 Entry[] entries; 54 size_t[string] enumerationValues; 55 string varName; 56 57 this(string filename, PropertyType type) 58 { 59 import std.algorithm; 60 import std.stdio; 61 import std.uni; 62 63 this.type = type; 64 this.entries = new Entry[](0x110000); 65 this.enumerationValues[null] = 0; 66 this.varName = baseName(filename, ".txt"); 67 Line[] defaults; 68 Line[] actuals; 69 bool abbreviates = false; 70 string enumOverridePrefix; 71 string enumOverride; 72 73 foreach (line; File(filename).byLine()) 74 { 75 bool isDefault = false; 76 char[] code; 77 Line data; 78 79 // Special @missing line syntax ? 80 static immutable isMissingStr = "# @missing: "; 81 static immutable propNameStr = "# Property: "; 82 if (line.startsWith(isMissingStr)) 83 { 84 isDefault = true; 85 code = line[isMissingStr.length..$]; 86 } 87 else if (line.startsWith(propNameStr)) 88 { 89 abbreviates = true; 90 enumOverridePrefix = "# "~line[propNameStr.length..$].idup~"="; 91 } 92 else if (abbreviates && line.startsWith(enumOverridePrefix)) 93 { 94 enumOverride = line[enumOverridePrefix.length..$].idup; 95 } 96 else 97 { 98 // Split between code and comment section 99 auto commentSplit = findSplit(line, "#"); 100 code = commentSplit[0]; 101 } 102 code = strip!isWhite(code); 103 if (code.length == 0) 104 continue; 105 106 uint fieldIdx = 0; 107 foreach (field; splitter(code, ';')) 108 { 109 field = strip!isWhite(field); 110 switch (fieldIdx) 111 { 112 case 0: // Code point(s) 113 auto range = findSplit(field, ".."); 114 data.rangeStart = to!uint(range[0], 16); 115 data.rangeEnd = range[1] == ".." ? to!uint(range[2], 16) : data.rangeStart; 116 enforce(data.rangeEnd <= 0x10FFFF); 117 enforce(data.rangeStart <= data.rangeEnd); 118 data.rangeEnd++; 119 break; 120 default: 121 string ifield = enumOverride ? enumOverride : field.idup; 122 data.properties ~= ifield; 123 if (type == PropertyType.enumeration) 124 { 125 if (ifield !in enumerationValues) 126 enumerationValues[ifield] = enumerationValues.length; 127 } 128 } 129 fieldIdx++; 130 } 131 if (type == PropertyType.enumeration) 132 enforce(fieldIdx >= 2); 133 else assert(0, "Not implemented"); 134 135 if (isDefault) 136 defaults ~= data; 137 else 138 actuals ~= data; 139 } 140 141 foreach (set; [defaults, actuals]) 142 { 143 foreach (ref definition; set) 144 { 145 foreach (cp; definition.rangeStart .. definition.rangeEnd) 146 { 147 final switch (type) with (PropertyType) 148 { 149 case catalog: 150 assert(0, "Not implemented"); 151 case enumeration: 152 enforce(definition.properties.length == 1); 153 entries[cp].properties = [Property(null, definition.properties[0])]; 154 entries[cp].isSet = true; 155 break; 156 case binary: 157 case string: 158 case numeric: 159 case miscellaneous: 160 assert(0, "Not implemented"); 161 } 162 } 163 } 164 } 165 166 foreach (cp; 0 .. 0x110000) 167 enforce(entries[cp].isSet); 168 } 169 170 struct TableEntry 171 { 172 ubyte[][] byteSeqs; 173 string enumerationValue; 174 Table* subEntries; 175 176 string toString() 177 { 178 if (subEntries) 179 return subEntries.to!string(); 180 else 181 return enumerationValue; 182 } 183 } 184 185 struct Table 186 { 187 uint level, idx; 188 TableEntry[256] entries; 189 190 size_t toHash() const nothrow 191 { 192 size_t result; 193 foreach (i; 0 .. 256) 194 { 195 if (entries[i].subEntries) 196 result = hashOf(entries[i].subEntries.idx, result); 197 else 198 result = hashOf(entries[i].enumerationValue, result); 199 } 200 return hashOf(level, result); 201 } 202 203 bool opEquals(ref const Table key) const 204 { 205 foreach (i; 0 .. 256) 206 { 207 if ((this.entries[i].subEntries is null) != (key.entries[i].subEntries is null)) 208 return false; 209 if (this.entries[i].subEntries) 210 { 211 if (this.entries[i].subEntries.idx != key.entries[i].subEntries.idx) 212 return false; 213 } 214 else if (this.entries[i].enumerationValue != key.entries[i].enumerationValue) 215 { 216 return false; 217 } 218 } 219 return this.level == key.level; 220 } 221 } 222 223 string generateEnumerationCode() 224 { 225 auto lookup = new Table; 226 uint[4] levelAssignments; 227 foreach (dchar cp; 0 .. 0x110000) 228 { 229 ubyte[] byteSeq; 230 if (cp < 128) 231 { 232 byteSeq ~= cast(char)cp; 233 } 234 else 235 { 236 uint topBit = 6; 237 uint bits = cp; 238 do 239 { 240 byteSeq = char(bits & 0x3F | 0x80) ~ byteSeq; 241 bits >>= 6; 242 topBit--; 243 } 244 while (bits && bsr(bits) >= topBit); 245 byteSeq = cast(char)(0xFE << topBit | bits) ~ byteSeq; 246 } 247 auto table = lookup; 248 foreach (uint i, cu; byteSeq) 249 { 250 auto entry = &table.entries[cu]; 251 if (entry.subEntries) 252 { 253 table = entry.subEntries; 254 } 255 else if (entry.enumerationValue is null) 256 { 257 entry.byteSeqs = [byteSeq]; 258 entry.enumerationValue = entries[cp].properties[0].value; 259 break; 260 } 261 else if (entry.enumerationValue == entries[cp].properties[0].value) 262 { 263 entry.byteSeqs ~= byteSeq; 264 break; 265 } 266 else 267 { 268 auto subTable = new Table(i+1); 269 foreach (byteSeq2; entry.byteSeqs) 270 { 271 subTable.entries[byteSeq2[i+1]].enumerationValue = entry.enumerationValue; 272 subTable.entries[byteSeq2[i+1]].byteSeqs = [byteSeq2]; 273 } 274 entry.byteSeqs = null; 275 entry.enumerationValue = null; 276 entry.subEntries = subTable; 277 } 278 table = entry.subEntries; 279 } 280 } 281 282 Table*[Table] tableSet; 283 Table*[uint][4] tableByIdx; 284 tableByIdx[0][0] = lookup; 285 286 void assignIndices(Table* table, uint level = 0) 287 { 288 foreach (i, entry; table.entries) 289 { 290 if (entry.subEntries) 291 { 292 assignIndices(entry.subEntries, level + 1); 293 if (auto dup = *entry.subEntries in tableSet) 294 { 295 entry.subEntries = *dup; 296 } 297 else 298 { 299 entry.subEntries.idx = levelAssignments[level + 1]++; 300 tableByIdx[level + 1][entry.subEntries.idx] = entry.subEntries; 301 tableSet[*entry.subEntries] = entry.subEntries; 302 } 303 } 304 } 305 } 306 assignIndices(lookup); 307 levelAssignments[0] = 1; 308 309 writefln("%s: Using %s tables with a total size: %s KiB", 310 varName, sum(levelAssignments[]), sum(levelAssignments[]) / 4f); 311 stdout.flush(); // in case we are buffered 312 313 auto level0 = new ubyte[256][](levelAssignments[0]); 314 auto level1 = new ubyte[256][](levelAssignments[1]); 315 auto level2 = new ubyte[256][](levelAssignments[2]); 316 auto level3 = new ubyte[256][](levelAssignments[3]); 317 318 foreach (level, bin; AliasSeq!(level0, level1, level2, level3)) 319 { 320 foreach (idx; 0 .. levelAssignments[level]) 321 { 322 Table* table = tableByIdx[level][idx]; 323 enforce(table.idx == idx); 324 enforce(table.level == level); 325 enforce(levelAssignments[level] + enumerationValues.length <= 256, 326 format("Sum of tables and enumarations at level %s exceeds ubyte storage capacity", level)); 327 foreach (i, ref entry; table.entries) 328 { 329 if (entry.subEntries) 330 bin[idx][i] = cast(ubyte)(entry.subEntries.idx + enumerationValues.length); 331 else 332 bin[idx][i] = cast(ubyte)enumerationValues[entry.enumerationValue]; 333 } 334 } 335 } 336 337 // Write struct with enum 338 string code = "struct " ~ varName ~ "\n{\n"; 339 auto sortedEnum = new string[](enumerationValues.length); 340 foreach (key, value; enumerationValues) 341 sortedEnum[value] = key; 342 code ~= "\tenum Enum : size_t\n\t{\n\t\t"; 343 foreach (key, value; sortedEnum) 344 code ~= (value ? value : "__") ~ ", "; 345 code ~= "\n\t}\n\n"; 346 foreach (k, bin; AliasSeq!(level0, level1, level2, level3)) 347 { 348 code ~= "\tstatic immutable ubyte[256][" ~ to!string(bin.length) ~ "] level" ~ to!string(k) ~ " = [\n"; 349 foreach (i; 0 .. bin.length) 350 code ~= "\t\t[" ~ format("%(%s,%)", bin[i]) ~ "],\n"; 351 code ~= "\t];\n"; 352 } 353 code ~= "}\n\n"; 354 return code; 355 } 356 } 357 358 alias UCD = UnicodeCharacterDatabase; 359 360 void main() 361 { 362 string code = "module fast.internal.unicode_tables;\n\n"; 363 UCD ucd; 364 365 ucd = UCD("../ucd/auxiliary/GraphemeBreakProperty.txt", PropertyType.enumeration); 366 code ~= ucd.generateEnumerationCode(); 367 ucd = UCD("../ucd/extracted/DerivedGeneralCategory.txt", PropertyType.enumeration); 368 code ~= ucd.generateEnumerationCode(); 369 ucd = UCD("../ucd/extracted/DerivedLineBreak.txt", PropertyType.enumeration); 370 code ~= ucd.generateEnumerationCode(); 371 372 auto tableFile = File("../source/fast/internal/unicode_tables.d", "w"); 373 tableFile.write(code); 374 }