[mw-devel] [Git][arthur/mw][master] Convert duktape's cesu-8 strings to utf-8 in js_print

Andrew Price welshbyte at sucs.org
Fri Jul 28 01:16:21 BST 2017


Andrew Price pushed to branch master at Justin Mitchell / mw


Commits:
95c3f705 by Andrew Price at 2017-07-28T01:10:35+01:00
Convert duktape's cesu-8 strings to utf-8 in js_print

🔥🔥🤷🔥🔥

- - - - -


1 changed file:

- src/client/js-duk.c


Changes:

=====================================
src/client/js-duk.c
=====================================
--- a/src/client/js-duk.c
+++ b/src/client/js-duk.c
@@ -79,14 +79,76 @@ static void start_timeout(void)
 	timeout_event = alarm_after(3, 0, NULL, &timeout);
 }
 
+/* Duktape uses a CESU-8 encoding, which allows UTF-16 surrogate pairs
+   (themselves encoded in UTF-8), in order to be kinda-sorta compatible with
+   ecmascript's UTF-16 requirements. This function just copies the cesu8 string,
+   converting any surrogate pairs it finds to UTF-8. */
+static char *cesu8_to_utf8(const char *cesu8)
+{
+	char *utf8 = calloc(1, strlen(cesu8) + 1);
+	const unsigned char *cc = (void *)cesu8;
+	char *cu = utf8;
+	uint32_t hs = 0;
+
+	while (*cc != '\0') {
+		uint32_t c = 0;
+		uint32_t u;
+
+		if (cc[0] <= 0x7F) {
+			*cu++ = *cc++;
+			continue;
+		} else if (cc[0] <= 0xDF) {
+			*cu++ = *cc++;
+			*cu++ = *cc++;
+			continue;
+		} else if (cc[0] <= 0xEF) {
+			/* Surrogates are encoded in 3 chars so convert
+			   back to a single UTF-16 value */
+			c = ((uint32_t)cc[0] & 0xF) << 12 |
+			    ((uint32_t)cc[1] & 0x3F) << 6 |
+			    ((uint32_t)cc[2] & 0x3F);
+		} else {
+			*cu++ = *cc++;
+			*cu++ = *cc++;
+			*cu++ = *cc++;
+			*cu++ = *cc++;
+			continue;
+		}
+		if (hs == 0 && c >= 0xD800 && c <= 0xDBFF)
+			hs = c;
+		else if (hs != 0 && c >= 0xDC00 && c <= 0xDFFF) {
+			/* Have high and low surrogates - convert to code point then
+			   back to UTF-8 */
+			u = 0x10000 + ((((uint32_t)hs & 0x3FF) << 10) | (c & 0x3FF));
+			*cu++ = 0xF0 |  u >> 18;
+			*cu++ = 0x80 | (u >> 12 & 0x3F);
+			*cu++ = 0x80 | (u >> 6 & 0x3F);
+			*cu++ = 0x80 | (u & 0x3F);
+			hs = 0;
+		} else {
+			*cu++ = cc[0];
+			*cu++ = cc[1];
+			*cu++ = cc[2];
+			hs = 0;
+		}
+		cc += 3;
+	}
+	*cu = '\0';
+	return utf8;
+}
+
 static duk_ret_t js_print(duk_context *cx)
 {
 	int argc = duk_get_top(cx);
 
 	if (argc < 1)
 		return 0;
-	for (int i = 0; i < argc; i++)
-		display_message(duk_to_string(cx, i - argc), 0, 1);
+	for (int i = 0; i < argc; i++) {
+		const char *cesu8 = duk_to_string(cx, i - argc);
+		char *utf8 = cesu8_to_utf8(cesu8);
+		display_message(utf8, 0, 1);
+		free(utf8);
+	}
 	return 0;
 }
 



View it on GitLab: https://projects.sucs.org/arthur/mw/commit/95c3f70527013693c0860357c871f74b53786c9b

---
View it on GitLab: https://projects.sucs.org/arthur/mw/commit/95c3f70527013693c0860357c871f74b53786c9b
You're receiving this email because of your account on projects.sucs.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.sucs.org/pipermail/mw-devel/attachments/20170728/8a797591/attachment-0001.html>


More information about the mw-devel mailing list