Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ public String toUpgradedString() {
// Character codes > 255 are already Unicode characters
if (value > 0x10FFFF) {
sb.append(PerlUtfString.encodeBeyondUnicode(Integer.toUnsignedLong(value)));
} else if (value >= 0xD800 && value <= 0xDFFF) {
sb.append(PerlUtfString.encodeSurrogate(Integer.toUnsignedLong(value)));
} else {
// Includes U+D800..U+DFFF from pack "U" / "W": one logical Perl character as a single
// UTF-16 code unit (not the FFFD+<hex> internal-marker form used elsewhere).
sb.appendCodePoint(value);
}
}
Expand Down
26 changes: 15 additions & 11 deletions src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -351,19 +351,23 @@ public static boolean packU(RuntimeScalar value, boolean byteMode, boolean hasUn
// U format behavior depends on mode:
// - Character mode: write character code (PackBuffer will handle UTF-8 upgrade)
// - Byte mode: write UTF-8 bytes directly (for binary compatibility)
if (Long.compareUnsigned(codePointLong, 0x10FFFFL) <= 0) {
int codePoint1 = (int) codePointLong;
if (byteMode) {
// Byte mode: write UTF-8 bytes
String unicodeChar = new String(Character.toChars(codePoint1));
byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8);
output.write(utf8Bytes);
} else {
// Character mode: write character code
output.writeCharacter(codePoint1);
//
// Perl accepts pack("U", $cp) up to 0x7FFF_FFFF (see t/lib/Util.pm in Unicode-UTF8).
// Character mode may represent code points above U+10FFFF for modules such as Unicode::UTF8
// that reject them at encode time.
if (Long.compareUnsigned(codePointLong, 0x80000000L) >= 0) {
throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
}
int codePoint1 = (int) codePointLong;
if (byteMode) {
if (Integer.compareUnsigned(codePoint1, 0x10FFFF) > 0) {
throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
}
String unicodeChar = new String(Character.toChars(codePoint1));
byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8);
output.write(utf8Bytes);
} else {
throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong);
output.writeCharacter(codePoint1);
}
return hasUnicodeInNormalMode;
}
Expand Down
13 changes: 5 additions & 8 deletions src/main/java/org/perlonjava/runtime/perlmodule/Encode.java
Original file line number Diff line number Diff line change
Expand Up @@ -1132,15 +1132,11 @@ public static RuntimeList _utf8_on(RuntimeArray args, int ctx) {
}
RuntimeScalar arg = args.get(0);
boolean wasUtf8 = (arg.type == STRING);
if (arg.type == BYTE_STRING) {
// Re-decode the byte string as UTF-8 to get proper characters
// e.g., bytes \xC3\xA9 -> character U+00E9 (é)
String s = arg.toString();
byte[] bytes = s.getBytes(StandardCharsets.ISO_8859_1);
arg.set(new String(bytes, StandardCharsets.UTF_8));
if (!wasUtf8) {
boolean fromBytes = (arg.type == BYTE_STRING);
arg.type = STRING;
arg.utf8UncheckedOctets = fromBytes;
}
// Set the UTF-8 flag (change type to STRING)
arg.type = STRING;
return new RuntimeScalar(wasUtf8).getList();
}

Expand All @@ -1160,6 +1156,7 @@ public static RuntimeList _utf8_off(RuntimeArray args, int ctx) {
arg.set(new String(bytes, StandardCharsets.ISO_8859_1));
}
arg.type = BYTE_STRING;
arg.utf8UncheckedOctets = false;
return new RuntimeScalar(wasUtf8).getList();
}

Expand Down
Loading
Loading