1 /*******************************************************************************
2  * 
3  * Converts between UTF-8 and UTF-16.
4  * 
5  * Authors:
6  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
7  * 
8  * Copyright:
9  *   © 2013 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
10  * 
11  * License:
12  *   $(LINK2 http://www.gnu.org/licenses/gpl-3.0, GNU General Public License 3.0)
13  * 
14  **************************************/
15 module fast.cstring; @nogc nothrow:
16 
import core.stdc.stdlib;
import core.stdc.string;
//import std.traits;
import fast.buffer;
21 
22 
/**
 * Converts a UTF-8 string to UTF-16 using a buffer provided by the user.
 * To get the buffer requirements call $(D string2wstringSize) on your source string.
 *
 * The input is assumed to be well-formed UTF-8 and is not validated. As a safety net,
 * a multi-byte sequence that is cut off by the end of $(D src) stops at the end of the
 * input instead of reading beyond it; the value decoded so far is still emitted.
 *
 * Params:
 *   src = The UTF-8 string to convert.
 *   dst = The destination buffer for the conversion. It must provide room for at least
 *         $(D src.length + 1) UTF-16 code units.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D wchar[]).
 *   A terminating zero is written right behind the returned slice, so the result.ptr
 *   can be passed into Windows APIs.
 */
pure
wchar[] string2wstring(in char[] src, wchar* dst)
{
	const(char)* srcIt = src.ptr;
	const(char)* srcEnd = src.ptr + src.length;
	wchar* dstIt = dst;

	while (srcIt !is srcEnd)
	{
		// The number of leading 1-bits in the first byte tells how long the sequence is.
		// The mask reaches 0 after eight shifts, which terminates the loop for 0xFF.
		int len = 0;
		uint mask = 0b1000_0000;
		while (*srcIt & mask)
		{
			mask >>= 1;
			len++;
		}

		// get payload of first byte (everything below the first 0-bit)
		uint cp = *srcIt++ & (mask - 1);

		// Append 6 payload bits per continuation byte, but never step over the end of
		// the input, which would also make the outer loop miss its exit condition.
		while (--len > 0 && srcIt !is srcEnd)
		{
			cp <<= 6;
			cp |= *srcIt++ & 0b0011_1111;
		}

		if (cp > wchar.max)
		{
			// Outside the BMP: store a surrogate pair. UTF-16 encodes the offset from
			// 0x10000, split into two 10 bit halves.
			cp -= 0x1_0000;
			*dstIt++ = cast(wchar) (0xD800 | ((cp >> 10) & 0b11_1111_1111));
			*dstIt++ = cast(wchar) (0xDC00 | (cp & 0b11_1111_1111));
		}
		else
		{
			*dstIt++ = cast(wchar) cp;
		}
	}
	*dstIt = 0;

	return dst[0 .. dstIt - dst];
}
82 
/**
 * Computes how many bytes a destination buffer needs for a UTF-8 to UTF-16 conversion.
 * One UTF-16 code unit is reserved per source byte plus one for the terminating '\0'.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The buffer size in bytes, terminating '\0' included. If that size cannot be
 *   represented in a $(D size_t), $(D size_t.max) is returned instead.
 *
 * See_Also:
 *   string2wstring
 */
@safe pure
size_t string2wstringSize(in char[] src)
{
	// Largest source length for which the byte count below does not overflow.
	enum maxLength = size_t.max / wchar.sizeof - 1;

	if (src.length > maxLength)
		return size_t.max;
	return (src.length + 1) * wchar.sizeof;
}
103 
104 
/**
 * Converts a UTF-16 string to UTF-8 using a buffer provided by the user.
 * To get the buffer requirements call $(D wstring2stringSize) on your source string.
 *
 * The input is assumed to be well-formed UTF-16. A high surrogate that is not followed
 * by a low surrogate (including one at the very end of $(D src)) is not combined with
 * its neighbour; it is written as a 3 byte sequence like any other code unit, so the
 * function never reads beyond the end of $(D src).
 *
 * Params:
 *   src = The UTF-16 string to convert.
 *   dst = The destination buffer for the conversion. It must provide room for at least
 *         $(D 3 * src.length + 1) bytes.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D char[]).
 *   A terminating zero is written right behind the returned slice, so the result.ptr
 *   can be passed to functions expecting a C string.
 */
pure
char[] wstring2string(in wchar[] src, char* dst)
{
	const(wchar)* srcIt = src.ptr;
	const(wchar)* srcEnd = src.ptr + src.length;
	char* dstIt = dst;

	while (srcIt !is srcEnd)
	{
		const uint unit = *srcIt++;

		if (unit < 0x80)
		{
			// ASCII, 1 byte
			*dstIt++ = cast(char) unit;
		}
		else if (unit < 0x800)
		{
			// 2 bytes
			*dstIt++ = cast(char) (0b1100_0000 | (unit >> 6));
			*dstIt++ = cast(char) (0b1000_0000 | (unit & 0b0011_1111));
		}
		else if (unit >= 0xD800 && unit <= 0xDBFF
			&& srcIt !is srcEnd && (*srcIt & 0xFC00) == 0xDC00)
		{
			// High surrogate followed by a low surrogate: both carry 10 bits of the
			// code point's offset from 0x10000. The result needs 4 bytes.
			const uint low = *srcIt++;
			const uint cp = 0x1_0000 + (((unit & 0b11_1111_1111) << 10) | (low & 0b11_1111_1111));
			*dstIt++ = cast(char) (0b1111_0000 | (cp >> 18));
			*dstIt++ = cast(char) (0b1000_0000 | ((cp >> 12) & 0b0011_1111));
			*dstIt++ = cast(char) (0b1000_0000 | ((cp >> 6) & 0b0011_1111));
			*dstIt++ = cast(char) (0b1000_0000 | (cp & 0b0011_1111));
		}
		else
		{
			// anything else within the BMP (<= 0xFFFF) or an unpaired surrogate, 3 bytes
			*dstIt++ = cast(char) (0b1110_0000 | (unit >> 12));
			*dstIt++ = cast(char) (0b1000_0000 | ((unit >> 6) & 0b0011_1111));
			*dstIt++ = cast(char) (0b1000_0000 | (unit & 0b0011_1111));
		}
	}
	*dstIt = 0;

	return dst[0 .. dstIt - dst];
}
158 
/**
 * Computes how many bytes a destination buffer needs for a UTF-16 to UTF-8 conversion.
 * Three bytes are reserved per source code unit plus one for the terminating '\0'.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The buffer size in bytes, terminating '\0' included. If that size cannot be
 *   represented in a $(D size_t), $(D size_t.max) is returned instead.
 *
 * See_Also:
 *   wstring2string
 */
@safe pure
size_t wstring2stringSize(in wchar[] src)
{
	// Largest source length for which the byte count below does not overflow.
	enum maxLength = (size_t.max / char.sizeof - 1) / 3;

	if (src.length > maxLength)
		return size_t.max;
	return (3 * src.length + 1) * char.sizeof;
}
179 
180 
/**
 * Converts a UTF-8 string into a zero-terminated UTF-16 string held by a
 * $(D TempBuffer!wchar).
 *
 * The default argument runs in the caller's context: if the size reported by
 * $(D string2wstringSize) is at most $(D allocaLimit), the conversion buffer is obtained
 * with $(D alloca), otherwise $(D null) is passed and the buffer is obtained with
 * $(D malloc) inside the function.
 *
 * Params:
 *   str = The source string to convert.
 *   buffer = Memory for the conversion of at least $(D string2wstringSize(str)) bytes,
 *            or $(D null) to request a $(D malloc)'ed buffer. Normally left at its default.
 *
 * Returns:
 *   A $(D TempBuffer!wchar) constructed from the converted slice and a flag that is
 *   $(D true) when the memory came from $(D malloc).
 *   NOTE(review): presumably $(D TempBuffer) frees the memory in its destructor when
 *   that flag is set; confirm in $(D fast.buffer).
 *
 * See_Also:
 *   string2wstring, string2wstringSize
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * WinApiW(wcharPtr!text);
 * ---
 */
auto wcharPtr(alias str)(void* buffer = string2wstringSize(str) <= allocaLimit ? alloca(string2wstringSize(str)) : null)
{
	// No caller supplied memory means we have to fall back to the C heap.
	immutable bool onHeap = buffer is null;
	wchar* dst = cast(wchar*) (onHeap ? malloc(string2wstringSize(str)) : buffer);

	// A TempBuffer is returned in either case; the flag tells it where the memory came from.
	return TempBuffer!wchar(string2wstring(str, dst), onHeap);
}
205 
/**
 * Overload of $(D wcharPtr) for $(D wstring)s whose value is known at compile time.
 * No conversion or copy takes place; the pointer to the string data is returned as is.
 *
 * Params:
 *   wstr = A $(D wstring) that can be used to initialize an $(D enum).
 *
 * Returns:
 *   $(D wstr.ptr)
 */
immutable(wchar)* wcharPtr(alias wstr)()
	if (is(typeof(wstr) == wstring) && __traits(compiles, { enum wstring e = wstr; }))
{
	// D string literals (known at compile time) are always \0-terminated.
	immutable(wchar)* literal = wstr.ptr;
	return literal;
}
213 
/**
 * $(D char*) version of $(D wcharPtr). Basically it appends a \0 to the input.
 *
 * The default argument runs in the caller's context: as long as the string including
 * its terminating \0 fits into $(D allocaLimit) bytes, the copy is placed in memory
 * obtained with $(D alloca). Longer strings pass $(D null) and the copy is placed in
 * memory obtained with $(D malloc), the same policy $(D wcharPtr) applies.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   buffer = Memory for the copy of at least $(D str.length + 1) bytes, or $(D null) to
 *            request a $(D malloc)'ed buffer. Normally left at its default.
 *
 * Returns:
 *   A $(D TempBuffer!char) constructed from the copied characters (without the \0) and
 *   a flag that is $(D true) when the memory came from $(D malloc).
 *   NOTE(review): presumably $(D TempBuffer) frees the memory in its destructor when
 *   that flag is set; confirm in $(D fast.buffer).
 *
 * Note:
 *   Do not use this to call Windows ANSI functions! Always use wide-char
 *   functions on this operating system unless you want to deal with codepages.
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * linuxApi(charPtr!text);
 * ---
 */
auto charPtr(alias str)(void* buffer = str.length < allocaLimit ? alloca(str.length + 1) : null)
	if (is(typeof(str) : const(char)[]) || is(typeof(str) : const(ubyte)[]))
{
	// No caller supplied memory means we have to fall back to the C heap.
	immutable bool onHeap = buffer is null;
	char* dst = cast(char*) memcpy(onHeap ? malloc(str.length + 1) : buffer, str.ptr, str.length);
	dst[str.length] = '\0';
	return TempBuffer!char(dst[0 .. str.length], onHeap);
}
238 
/**
 * Overload of $(D charPtr) for strings whose value is known at compile time.
 * No copy takes place; the pointer to the string data is returned as is.
 *
 * Params:
 *   str = A value that can be used to initialize an $(D enum string).
 *
 * Returns:
 *   $(D str.ptr)
 */
immutable(char)* charPtr(alias str)()
	if (__traits(compiles, { enum string e = str; }))
{
	// D string literals (known at compile time) are always \0-terminated.
	immutable(char)* literal = str.ptr;
	return literal;
}
246 
/**
 * This overload takes the memory for the zero-terminated copy from an existing
 * stack buffer.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   sb = The stack buffer to allocate $(D str.length + 1) chars from
 *
 * Returns:
 *   The entry handed out by $(D sb.alloc), filled with the characters of $(D str)
 *   followed by a \0.
 *
 * Note:
 *   Always assign the result to an auto variable first for RAII to work correctly.
 */
StackBufferEntry!char charPtr(SB)(const(char)[] str, ref SB sb)
	if (is(SB == StackBuffer!bytes, bytes...))
{
	immutable size_t len = str.length;

	// one extra char for the terminator
	auto entry = sb.alloc!char(len + 1);
	memcpy(entry.ptr, str.ptr, len);
	entry[len] = '\0';
	return entry;
}
265 
/**
 * Views a zero-terminated C string as a D slice.
 *
 * Params:
 *   ptr = Pointer to a \0-terminated string or $(D null).
 *
 * Returns:
 *   The chars starting at $(D ptr) up to but not including the \0 as a $(D char[]),
 *   or $(D null) if $(D ptr) is $(D null). The length is determined with $(D strlen);
 *   no copy is made.
 */
inout(char)[] asString(inout(char*) ptr) @trusted pure
{
	return ptr is null ? null : ptr[0 .. strlen(ptr)];
}