fglock · fglock · May 15, 2026 · May 15, 2026
diff --git a/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java b/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java
@@ -118,9 +118,9 @@ public String toUpgradedString() {
             // Character codes > 255 are already Unicode characters
             if (value > 0x10FFFF) {
                 sb.append(PerlUtfString.encodeBeyondUnicode(Integer.toUnsignedLong(value)));
-            } else if (value >= 0xD800 && value <= 0xDFFF) {
-                sb.append(PerlUtfString.encodeSurrogate(Integer.toUnsignedLong(value)));
             } else {
+                // Includes U+D800..U+DFFF from pack "U" / "W": one logical Perl character as a single
+                // UTF-16 code unit (not the FFFD+<hex> internal-marker form used elsewhere).
                 sb.appendCodePoint(value);
             }
         }

diff --git a/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java b/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java
@@ -351,19 +351,23 @@ public static boolean packU(RuntimeScalar value, boolean byteMode, boolean hasUn
         // U format behavior depends on mode:
         // - Character mode: write character code (PackBuffer will handle UTF-8 upgrade)
         // - Byte mode: write UTF-8 bytes directly (for binary compatibility)
-        if (Long.compareUnsigned(codePointLong, 0x10FFFFL) <= 0) {
-            int codePoint1 = (int) codePointLong;
-            if (byteMode) {
-                // Byte mode: write UTF-8 bytes
-                String unicodeChar = new String(Character.toChars(codePoint1));
-                byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8);
-                output.write(utf8Bytes);
-            } else {
-                // Character mode: write character code
-                output.writeCharacter(codePoint1);
+        //
+        // Perl accepts pack("U", $cp) for code points up to 0x7FFF_FFFF (see t/lib/Util.pm in Unicode-UTF8).
+        // In character mode, code points above U+10FFFF are written through unchanged so that modules
+        // such as Unicode::UTF8 can reject them at encode time; byte mode still rejects them below.
+        if (Long.compareUnsigned(codePointLong, 0x80000000L) >= 0) {
+            throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
+        }
+        int codePoint1 = (int) codePointLong;
+        if (byteMode) {
+            if (Integer.compareUnsigned(codePoint1, 0x10FFFF) > 0) {
+                throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
             }
+            String unicodeChar = new String(Character.toChars(codePoint1));
+            byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8);
+            output.write(utf8Bytes);
         } else {
-            throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
+            output.writeCharacter(codePoint1);
         }
         return hasUnicodeInNormalMode;
     }

diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java b/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java
@@ -1132,15 +1132,11 @@ public static RuntimeList _utf8_on(RuntimeArray args, int ctx) {
         }
         RuntimeScalar arg = args.get(0);
         boolean wasUtf8 = (arg.type == STRING);
-        if (arg.type == BYTE_STRING) {
-            // Re-decode the byte string as UTF-8 to get proper characters
-            // e.g., bytes \xC3\xA9 -> character U+00E9 (é)
-            String s = arg.toString();
-            byte[] bytes = s.getBytes(StandardCharsets.ISO_8859_1);
-            arg.set(new String(bytes, StandardCharsets.UTF_8));
+        if (!wasUtf8) {
+            boolean fromBytes = (arg.type == BYTE_STRING);
+            arg.type = STRING;
+            arg.utf8UncheckedOctets = fromBytes;
         }
-        // Set the UTF-8 flag (change type to STRING)
-        arg.type = STRING;
         return new RuntimeScalar(wasUtf8).getList();
     }
 
@@ -1160,6 +1156,7 @@ public static RuntimeList _utf8_off(RuntimeArray args, int ctx) {
             arg.set(new String(bytes, StandardCharsets.ISO_8859_1));
         }
         arg.type = BYTE_STRING;
+        arg.utf8UncheckedOctets = false;
         return new RuntimeScalar(wasUtf8).getList();
     }