Replace surrogate chars with U+FFFD

Also add some more Unicode input tests.
2026-03-27 05:45:12 +08:00 · 2016-01-19 23:11:20 +01:00
parent 38dd99e827
commit 54857d5fd4
2 changed files with 47 additions and 2 deletions
--- a/app/src/main/java/com/termux/terminal/TerminalEmulator.java
+++ b/app/src/main/java/com/termux/terminal/TerminalEmulator.java
@@ -218,7 +218,7 @@ public final class TerminalEmulator {
 	 */
 	private int mScrollCounter = 0;

-	private int mUtf8ToFollow, mUtf8Index;
+	private byte mUtf8ToFollow, mUtf8Index;
 	private final byte[] mUtf8InputBuffer = new byte[4];

 	public final TerminalColors mColors = new TerminalColors();
@@ -424,7 +424,11 @@ public final class TerminalEmulator {
 						processCodePoint(/* escape (hexadecimal=0x1B, octal=033): */27);
 						processCodePoint((codePoint & 0x7F) + 0x40);
 					} else {
-						if (Character.UNASSIGNED == Character.getType(codePoint)) codePoint = UNICODE_REPLACEMENT_CHAR;
+						switch (Character.getType(codePoint)) {
+							case Character.UNASSIGNED:
+							case Character.SURROGATE:
+								codePoint = UNICODE_REPLACEMENT_CHAR;
+						}
 						processCodePoint(codePoint);
 					}
 				}
--- a/app/src/test/java/com/termux/terminal/UnicodeInputTest.java
+++ b/app/src/test/java/com/termux/terminal/UnicodeInputTest.java
@@ -12,6 +12,47 @@ public class UnicodeInputTest extends TerminalTestCase {
 		withTerminalSized(5, 5);
 		mTerminal.append(new byte[]{(byte) 0b11101111, (byte) 'a'}, 2);
 		assertLineIs(0, ((char) TerminalEmulator.UNICODE_REPLACEMENT_CHAR) + "a   ");
+
+		// https://code.google.com/p/chromium/issues/detail?id=212704
+		byte[] input = new byte[]{
+				(byte) 0x61, (byte) 0xF1,
+				(byte) 0x80, (byte) 0x80,
+				(byte) 0xe1, (byte) 0x80,
+				(byte) 0xc2, (byte) 0x62,
+				(byte) 0x80, (byte) 0x63,
+				(byte) 0x80, (byte) 0xbf,
+				(byte) 0x64
+		};
+		withTerminalSized(10, 2);
+		mTerminal.append(input, input.length);
+		assertLinesAre("a\uFFFD\uFFFD\uFFFDb\uFFFDc\uFFFD\uFFFDd", "          ");
+
+		// Surrogate pairs.
+		withTerminalSized(5, 2);
+		input = new byte[]{
+				(byte) 0xed, (byte) 0xa0,
+				(byte) 0x80, (byte) 0xed,
+				(byte) 0xad, (byte) 0xbf,
+				(byte) 0xed, (byte) 0xae,
+				(byte) 0x80, (byte) 0xed,
+				(byte) 0xbf, (byte) 0xbf
+		};
+		mTerminal.append(input, input.length);
+		assertLinesAre("\uFFFD\uFFFD\uFFFD\uFFFD ", "     ");
+
+		// https://bugzilla.mozilla.org/show_bug.cgi?id=746900: "with this patch 0xe0 0x80 is decoded as two U+FFFDs,
+		// but 0xe0 0xa0 is decoded as a single U+FFFD, and this is correct according to the "Best Practices", but IE
+		// and Chrome (Version 22.0.1229.94) decode both of them as two U+FFFDs. Opera 12.11 decodes both of them as
+		// one U+FFFD".
+		withTerminalSized(5, 2);
+		input = new byte[]{(byte) 0xe0, (byte) 0xa0, ' '};
+		mTerminal.append(input, input.length);
+		assertLinesAre("\uFFFD    ", "     ");
+
+		// withTerminalSized(5, 2);
+		// input = new byte[]{(byte) 0xe0, (byte) 0x80, 'a'};
+		// mTerminal.append(input, input.length);
+		// assertLinesAre("\uFFFD\uFFFDa  ", "     ");
 	}

 	public void testUnassignedCodePoint() throws UnsupportedEncodingException {