What you are running into is an issue with surrogate pairs. JavaScript strings are sequences of UTF-16 code units, and some Unicode characters are encoded as two code units (a surrogate pair) instead of one; if the two halves are separated, the character will not be displayed correctly.
If ES6 is available to you, iterating over a string with the spread operator or the for...of
syntax takes surrogate pairs into account, which makes it easier to get correct results. Other answers in this thread demonstrate how to do this.
If you cannot use ES6, MDN provides an example of how to handle surrogate pairs using the charAt
method, which can be found here. The code snippet below demonstrates this approach.
/**
 * Returns the complete character that starts at UTF-16 index `i` of `str`,
 * keeping surrogate pairs together.
 *
 * @param {string} str - The string to read from.
 * @param {number} i - Index of the UTF-16 code unit to inspect.
 * @returns {string|boolean} The character at `i` (one code unit, or two for
 *   a surrogate pair); `''` when `i` is outside the string; `false` when `i`
 *   points at the low half of a valid pair, i.e. a character that was
 *   already returned for index `i - 1`.
 * @throws {string} When a high surrogate is not followed by a low surrogate,
 *   or a low surrogate is not preceded by a high surrogate.
 */
function getWholeChar(str, i) {
  var HIGH_WITHOUT_LOW = 'High surrogate without following low surrogate';
  var LOW_WITHOUT_HIGH = 'Low surrogate without preceding high surrogate';

  var unit = str.charCodeAt(i);

  // charCodeAt yields NaN when the index is out of range.
  if (Number.isNaN(unit)) {
    return '';
  }

  // Anything outside 0xD800-0xDFFF is an ordinary, single-unit character.
  var isSurrogate = unit >= 0xD800 && unit <= 0xDFFF;
  if (!isSurrogate) {
    return str.charAt(i);
  }

  // 0xD800-0xDBFF: high (leading) surrogate, must be followed by a low one.
  if (unit <= 0xDBFF) {
    if (str.length <= i + 1) {
      throw HIGH_WITHOUT_LOW;
    }
    var following = str.charCodeAt(i + 1);
    if (following < 0xDC00 || following > 0xDFFF) {
      throw HIGH_WITHOUT_LOW;
    }
    return str.charAt(i) + str.charAt(i + 1);
  }

  // 0xDC00-0xDFFF: low (trailing) surrogate, must be preceded by a high one.
  if (i === 0) {
    throw LOW_WITHOUT_HIGH;
  }
  var preceding = str.charCodeAt(i - 1);
  if (preceding < 0xD800 || preceding > 0xDBFF) {
    throw LOW_WITHOUT_HIGH;
  }

  // The pair was already returned when the caller visited index i - 1.
  return false;
}
/**
 * Click handler for the "convert" button.
 *
 * Splits the text in the `before` textarea into whole characters (surrogate
 * pairs kept together via getWholeChar) and writes them to the `after`
 * textarea as a list of single-quoted items, each followed by ", ".
 *
 * The output replaces whatever `after` held before, so clicking the button
 * again does not duplicate earlier results. If getWholeChar throws on a
 * malformed surrogate, the exception propagates and `after` is left
 * untouched because the value is only assigned once the loop has finished.
 */
convert.onclick = function () {
  // Read the input once instead of re-querying the element every iteration.
  var text = before.value;
  var quoted = '';

  for (var i = 0; i < text.length; i++) {
    var chr = getWholeChar(text, i);
    // Falsy results carry no character: '' means the index was out of range,
    // false means this is the second half of a pair that was already emitted.
    if (!chr) continue;
    quoted += "'" + chr + "', ";
  }

  // Single assignment: avoids rewriting the textarea on every character.
  after.value = quoted;
};
<!-- Input box: the script reads the text to split from this element via `before.value`. -->
<!-- NOTE(review): the sample text below looks mis-encoded; the original sample text could not be recovered - confirm against the original source. -->
<textarea id="before" type="text" name="input" style="width:100%;">*π‘(π)-_=+π’βπ¨πππΌπ£βπ₯ππͺππ¦π</textarea><br />
<!-- Output box: the click handler writes the quoted, comma-separated characters into `after.value`. -->
<textarea id="after" cols="50" rows="10" name="output" style="width:100%;"></textarea>
<!-- Trigger: the script assigns its handler to this button's `onclick`. -->
<button id="convert" name="convert" type="button">convert</button>