From 46f64b5c7cc6b318da9d4cb697eed5a6715b2540 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 25 Jan 2019 10:09:40 -0500 Subject: [PATCH 1/2] See #301: Speed-up js2python string conversion --- src/hiwire.c | 2 +- src/js2python.c | 53 ++++++++++++++++++++++++++++++++++++++++----- test/test_python.py | 16 +++++++++++--- 3 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/hiwire.c b/src/hiwire.c index ea6684f63..7e7e1238e 100644 --- a/src/hiwire.c +++ b/src/hiwire.c @@ -60,7 +60,7 @@ EM_JS(int, hiwire_string_ucs4, (int ptr, int len), { var jsstr = ""; var idx = ptr / 4; for (var i = 0; i < len; ++i) { - jsstr += String.fromCharCode(Module.HEAPU32[idx + i]); + jsstr += String.fromCodePoint(Module.HEAPU32[idx + i]); } return Module.hiwire_new_value(jsstr); }); diff --git a/src/js2python.c b/src/js2python.c index 606d383b4..859f7a5ff 100644 --- a/src/js2python.c +++ b/src/js2python.c @@ -9,9 +9,15 @@ // bubble out to Python int -_js2python_string(char* val) +_js2python_allocate_string(int size, int max_code_point) { - return (int)PyUnicode_FromString(val); + return (int)PyUnicode_New(size, max_code_point); +} + +int +_js2python_get_ptr(int obj) +{ + return (int)PyUnicode_DATA((PyObject*)obj); } int @@ -68,9 +74,46 @@ EM_JS(int, __js2python, (int id), { var value = Module.hiwire_get_value(id); var type = typeof value; if (type === 'string') { - var charptr = allocate(intArrayFromString(value), 'i8', ALLOC_NORMAL); - var result = __js2python_string(charptr); - _free(charptr); + // The general idea here is to allocate a Python string and then + // have JavaScript write directly into its buffer. We first need + // to determine if it needs to be a 1-, 2- or 4-byte string, since + // Python handles all 3. 
+ var max_code_point = 0; + for (var i = 0; i < value.length; i++) { + code_point = value.codePointAt(i); + max_code_point = Math.max(max_code_point, code_point); + if (max_code_point > 0xffff) { + // If we're dealing with UTF-16 surrogate pairs, convert the string + // to an array of each of its characters, so we correctly count the + // number of characters. + value = Array.from(value[Symbol.iterator]()); + // We can short circuit here -- we already know we need a 4-byte output. + break; + } + } + + var result = __js2python_allocate_string(value.length, max_code_point); + if (result == 0) { + return 0; + } + + var ptr = __js2python_get_ptr(result); + if (max_code_point > 0xffff) { + ptr = ptr / 4; + for (var i = 0; i < value.length; i++) { + Module.HEAPU32[ptr + i] = value[i].codePointAt(0); + } + } else if (max_code_point > 0xff) { + ptr = ptr / 2; + for (var i = 0; i < value.length; i++) { + Module.HEAPU16[ptr + i] = value.codePointAt(i); + } + } else { + for (var i = 0; i < value.length; i++) { + Module.HEAPU8[ptr + i] = value.codePointAt(i); + } + } + return result; } else if (type === 'number') { return __js2python_number(value); diff --git a/test/test_python.py b/test/test_python.py index c0fc64cfa..2dc1dadb5 100644 --- a/test/test_python.py +++ b/test/test_python.py @@ -34,6 +34,8 @@ def test_python2js(selenium): 'return pyodide.runPython("\'ιωδιούχο\'") === "ιωδιούχο"') assert selenium.run_js( 'return pyodide.runPython("\'碘化物\'") === "碘化物"') + assert selenium.run_js( + 'return pyodide.runPython("\'🐍\'") === "🐍"') assert selenium.run_js( 'let x = pyodide.runPython("b\'bytes\'");\n' 'return (x instanceof window.Uint8ClampedArray) && ' @@ -156,7 +158,9 @@ def test_pythonexc2js(selenium): def test_js2python(selenium): selenium.run_js( """ - window.jsstring = "碘化物"; + window.jsstring_ucs1 = "pyodidé"; + window.jsstring_ucs2 = "碘化物"; + window.jsstring_ucs4 = "🐍"; window.jsnumber0 = 42; window.jsnumber1 = 42.5; window.jsundefined = undefined; @@ -170,8 +174,14 
@@ def test_js2python(selenium): """ ) assert selenium.run( - 'from js import jsstring\n' - 'jsstring == "碘化物"') + 'from js import jsstring_ucs1\n' + 'jsstring_ucs1 == "pyodidé"') + assert selenium.run( + 'from js import jsstring_ucs2\n' + 'jsstring_ucs2 == "碘化物"') + assert selenium.run( + 'from js import jsstring_ucs4\n' + 'jsstring_ucs4 == "🐍"') assert selenium.run( 'from js import jsnumber0\n' 'jsnumber0 == 42') From 1761a222dce7061115fec3e75c2e0b5ddd4096c0 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 28 Jan 2019 11:48:46 -0500 Subject: [PATCH 2/2] Simplify handling of non-BMP characters --- src/js2python.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/js2python.c b/src/js2python.c index 859f7a5ff..bb81a0f67 100644 --- a/src/js2python.c +++ b/src/js2python.c @@ -79,20 +79,20 @@ EM_JS(int, __js2python, (int id), { // to determine if it needs to be a 1-, 2- or 4-byte string, since // Python handles all 3. var max_code_point = 0; + var length = value.length; for (var i = 0; i < value.length; i++) { code_point = value.codePointAt(i); max_code_point = Math.max(max_code_point, code_point); - if (max_code_point > 0xffff) { - // If we're dealing with UTF-16 surrogate pairs, convert the string - // to an array of each of its characters, so we correctly count the - // number of characters. - value = Array.from(value[Symbol.iterator]()); - // We can short circuit here -- we already know we need a 4-byte output. - break; + if (code_point > 0xffff) { + // If we have a code point requiring UTF-16 surrogate pairs, the + // number of characters (codePoints) is less than value.length, + // so skip the next charCode and subtract 1 from the length. 
+ i++; + length--; } } - var result = __js2python_allocate_string(value.length, max_code_point); + var result = __js2python_allocate_string(length, max_code_point); if (result == 0) { return 0; } @@ -100,16 +100,20 @@ EM_JS(int, __js2python, (int id), { var ptr = __js2python_get_ptr(result); if (max_code_point > 0xffff) { ptr = ptr / 4; - for (var i = 0; i < value.length; i++) { - Module.HEAPU32[ptr + i] = value[i].codePointAt(0); + for (var i = 0, j = 0; j < length; i++, j++) { + var code_point = value.codePointAt(i); + Module.HEAPU32[ptr + j] = code_point; + if (code_point > 0xffff) { + i++; + } } } else if (max_code_point > 0xff) { ptr = ptr / 2; - for (var i = 0; i < value.length; i++) { + for (var i = 0; i < length; i++) { Module.HEAPU16[ptr + i] = value.codePointAt(i); } } else { - for (var i = 0; i < value.length; i++) { + for (var i = 0; i < length; i++) { Module.HEAPU8[ptr + i] = value.codePointAt(i); } }