/*******************************************************************************
 *
 * Converts between UTF-8 and UTF-16.
 *
 * Authors:
 *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
 *
 * Copyright:
 *   © 2013 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
 *
 * License:
 *   $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0)
 *
 **************************************/
module fast.cstring; @nogc nothrow:

import core.stdc.stdlib;
import core.stdc.string;
//import std.traits;
import fast.buffer;


/**
 * Converts a string to a wstring using a buffer provided by the user.
 * To get the buffer requirements call $(D string2wstringSize) on your source buffer.
 *
 * The input is assumed to be well-formed UTF-8. Malformed input is not diagnosed and
 * yields unspecified code units, but the function never reads past the end of $(D src).
 *
 * Params:
 *   src = The UTF-8 string to convert.
 *   dst = The destination buffer for the conversion. It must provide at least
 *         $(D string2wstringSize(src)) bytes.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D wchar[]).
 *   A terminating zero is appended, so the result.ptr can be passed into Windows APIs.
 */
pure
wchar[] string2wstring(in char[] src, wchar* dst)
{
    const char* srcEnd = src.ptr + src.length;
    const(char)* srcIt = src.ptr;
    wchar* dstIt = dst;

    while (srcIt !is srcEnd)
    {
        // The number of leading 1-bits in the first byte is the length of the sequence.
        int len = 0;
        uint mask = 0b1000_0000;
        while (*srcIt & mask)
        {
            mask >>= 1;
            len++;
        }

        // Get the payload of the first byte: everything below the terminating 0-bit.
        dchar ch = *srcIt++ & (mask - 1);

        // Append the continuation bytes; the bounds check keeps a truncated
        // sequence at the end of the input from reading past the source buffer.
        while (--len > 0 && srcIt !is srcEnd)
        {
            // make space for 6 more bits
            ch <<= 6;
            ch |= *srcIt++ & 0b0011_1111;
        }

        // Do we need to store a surrogate pair?
        if (ch > wchar.max)
        {
            // UTF-16 splits (code point - 0x10000) into two 10-bit halves.
            const uint offset = ch - 0x1_0000;
            *dstIt++ = cast(wchar) (0xD800 | ((offset >> 10) & 0b11_1111_1111));
            *dstIt++ = cast(wchar) (0xDC00 | (offset & 0b11_1111_1111));
        }
        else
        {
            *dstIt++ = cast(wchar) ch;
        }
    }
    *dstIt = 0;

    return dst[0 .. dstIt - dst];
}

/**
 * Calculates the required buffer size in bytes for a string to wchar[] conversion.
 * Room for a terminating '\0' is included.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The maximum byte count the source string could require, including the terminating '\0'.
 *   If that count is not representable, $(D size_t.max) is returned.
 *
 * See_Also:
 *   string2wstring
 *
 */
@safe pure
size_t string2wstringSize(in char[] src)
{
    // Every UTF-8 code unit produces at most one UTF-16 code unit.
    enum limit = size_t.max / wchar.sizeof - 1;
    return src.length <= limit ? wchar.sizeof * (src.length + 1) : size_t.max;
}


/**
 * Converts a wstring to a string using a buffer provided by the user.
 * To get the buffer requirements call $(D wstring2stringSize) on your source buffer.
 *
 * A high surrogate is assumed to be followed by its low surrogate. Unpaired low
 * surrogates and a high surrogate at the very end of the input are encoded as
 * 3-byte sequences, so the function never reads past the end of $(D src).
 *
 * Params:
 *   src = The UTF-16 string to convert.
 *   dst = The destination buffer for the conversion. It must provide at least
 *         $(D wstring2stringSize(src)) bytes.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D char[]).
 *   A terminating zero is appended, so the result.ptr can be passed into C APIs.
 */
pure
char[] wstring2string(in wchar[] src, char* dst)
{
    const wchar* srcEnd = src.ptr + src.length;
    const(wchar)* srcIt = src.ptr;
    char* dstIt = dst;

    while (srcIt !is srcEnd)
    {
        const uint unit = *srcIt++;

        if (unit < 0x80)
        {
            // ASCII
            *dstIt++ = cast(char) unit;
        }
        else if (unit < 0x800)
        {
            // two byte sequence
            *dstIt++ = cast(char) (0b1100_0000 | (unit >> 6));
            *dstIt++ = cast(char) (0b1000_0000 | (unit & 0b0011_1111));
        }
        else if (unit < 0xD800 || unit > 0xDBFF || srcIt is srcEnd)
        {
            // anything else within the BMP (<= 0xFFFF), but not a high surrogate
            // that still has a code unit following it
            *dstIt++ = cast(char) (0b1110_0000 | (unit >> 12));
            *dstIt++ = cast(char) (0b1000_0000 | ((unit >> 6) & 0b0011_1111));
            *dstIt++ = cast(char) (0b1000_0000 | (unit & 0b0011_1111));
        }
        else
        {
            // high surrogate, assume correct encoding and that the next wchar is the low surrogate
            const uint low = *srcIt++ & 0b11_1111_1111;
            // The two 10-bit halves hold (code point - 0x10000).
            const uint decoded = 0x1_0000 + (((unit & 0b11_1111_1111) << 10) | low);
            *dstIt++ = cast(char) (0b1111_0000 | (decoded >> 18));
            *dstIt++ = cast(char) (0b1000_0000 | ((decoded >> 12) & 0b0011_1111));
            *dstIt++ = cast(char) (0b1000_0000 | ((decoded >> 6) & 0b0011_1111));
            *dstIt++ = cast(char) (0b1000_0000 | (decoded & 0b0011_1111));
        }
    }
    *dstIt = 0;

    return dst[0 .. dstIt - dst];
}

/**
 * Calculates the required buffer size in bytes for a wstring to char[] conversion.
 * Room for a terminating '\0' is included.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The maximum byte count the source string could require, including the terminating '\0'.
 *   If that count is not representable, $(D size_t.max) is returned.
 *
 * See_Also:
 *   wstring2string
 *
 */
@safe pure
size_t wstring2stringSize(in wchar[] src)
{
    // A single UTF-16 code unit expands to at most 3 bytes; a surrogate pair
    // (2 code units) expands to 4 bytes, which is still within that bound.
    enum limit = (size_t.max / char.sizeof - 1) / 3;
    return src.length <= limit ? char.sizeof * (3 * src.length + 1) : size_t.max;
}


/**
 * Replaces $(D std.utf.toUTFz) with a version that uses the stack as long as the required bytes for the output
 * do not exceed $(D allocaLimit) (1k according to the original documentation; the constant is defined outside
 * this module). Longer strings use $(D malloc) to create a buffer for the conversion. It is freed at the latest
 * at the end of the scope.
 *
 * The $(D alloca) call sits in the default argument rather than in the function body.
 * The result of $(D malloc) is not checked for $(D null).
 *
 * Params:
 *   str = The source string to convert.
 *   buffer = Memory for the converted string, or $(D null) to have the function $(D malloc) it.
 *
 * See_Also:
 *   string2wstring
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * WinApiW(wcharPtr!text);
 * ---
 */
auto wcharPtr(alias str)(void* buffer = string2wstringSize(str) <= allocaLimit ? alloca(string2wstringSize(str)) : null)
{
    // In any case we have to return a proper TempBuffer, so that free() is called in the dtor at some point.
    return TempBuffer!wchar(
        string2wstring(str, cast(wchar*) (buffer ? buffer : malloc(string2wstringSize(str)))),
        buffer is null);
}

/// ditto
immutable(wchar)* wcharPtr(alias wstr)()
    if (is(typeof(wstr) == wstring) && __traits(compiles, { enum wstring e = wstr; }))
{
    // D string literals (known at compile time) are always \0-terminated.
    return wstr.ptr;
}

/**
 * $(D char*) version of $(D wcharPtr). Basically it appends a \0 to the input.
 * The function uses $(D malloc) when the string plus its terminator does not fit
 * into $(D allocaLimit) bytes, and the stack otherwise.
 *
 * The result of $(D malloc) is not checked for $(D null).
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   buffer = Memory for the terminated copy, or $(D null) to have the function $(D malloc) it.
 *
 * Note:
 *   Do not use this to call Windows ANSI functions! Always use wide-char
 *   functions on this operating system unless you want to deal with codepages.
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * linuxApi(charPtr!text);
 * ---
 */
auto charPtr(alias str)(void* buffer = str.length + 1 <= allocaLimit ? alloca(str.length + 1) : null)
    if (is(typeof(str) : const(char)[]) || is(typeof(str) : const(ubyte)[]))
{
    char* dst = cast(char*) memcpy(buffer ? buffer : malloc(str.length + 1), str.ptr, str.length);
    dst[str.length] = '\0';
    // The second argument tells TempBuffer whether the memory came from malloc.
    return TempBuffer!char(dst[0 .. str.length], buffer is null);
}

/// ditto
immutable(char)* charPtr(alias str)()
    if (__traits(compiles, { enum string e = str; }))
{
    // D string literals (known at compile time) are always \0-terminated.
    return str.ptr;
}

/**
 * This overload allocates the required memory from an existing stack buffer.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   sb = The stack buffer to allocate from
 *
 * Returns:
 *   The stack buffer entry holding a copy of $(D str) followed by a '\0'.
 *
 * Note:
 *   Always assign the result to an auto variable first for RAII to work correctly.
 */
StackBufferEntry!char charPtr(SB)(const(char)[] str, ref SB sb)
    if (is(SB == StackBuffer!bytes, bytes...))
{
    auto buffer = sb.alloc!char(str.length + 1);
    memcpy(buffer.ptr, str.ptr, str.length);
    buffer[str.length] = '\0';
    return buffer;
}

/**
 * Returns the given $(D ptr) up to but not including the \0 as a $(D char[]).
 * A $(D null) pointer yields a $(D null) slice.
 */
inout(char)[] asString(inout(char*) ptr) @trusted pure
{
    if (ptr is null) return null;
    return ptr[0 .. strlen(ptr)];
}