diff --git a/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java b/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java index 090e24fac..d8ae4c32d 100644 --- a/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java +++ b/src/main/java/org/perlonjava/runtime/operators/pack/PackBuffer.java @@ -118,9 +118,9 @@ public String toUpgradedString() { // Character codes > 255 are already Unicode characters if (value > 0x10FFFF) { sb.append(PerlUtfString.encodeBeyondUnicode(Integer.toUnsignedLong(value))); - } else if (value >= 0xD800 && value <= 0xDFFF) { - sb.append(PerlUtfString.encodeSurrogate(Integer.toUnsignedLong(value))); } else { + // Includes U+D800..U+DFFF from pack "U" / "W": one logical Perl character as a single + // UTF-16 code unit (not the FFFD+ internal-marker form used elsewhere). sb.appendCodePoint(value); } } diff --git a/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java b/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java index 0edfa228b..6a7bb2a8a 100644 --- a/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java +++ b/src/main/java/org/perlonjava/runtime/operators/pack/PackHelper.java @@ -351,19 +351,23 @@ public static boolean packU(RuntimeScalar value, boolean byteMode, boolean hasUn // U format behavior depends on mode: // - Character mode: write character code (PackBuffer will handle UTF-8 upgrade) // - Byte mode: write UTF-8 bytes directly (for binary compatibility) - if (Long.compareUnsigned(codePointLong, 0x10FFFFL) <= 0) { - int codePoint1 = (int) codePointLong; - if (byteMode) { - // Byte mode: write UTF-8 bytes - String unicodeChar = new String(Character.toChars(codePoint1)); - byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8); - output.write(utf8Bytes); - } else { - // Character mode: write character code - output.writeCharacter(codePoint1); + // + // Perl accepts pack("U", $cp) up to 0x7FFF_FFFF (see t/lib/Util.pm in Unicode-UTF8). + // Character mode may represent code points above U+10FFFF for modules such as Unicode::UTF8 + // that reject them at encode time. + if (Long.compareUnsigned(codePointLong, 0x80000000L) >= 0) { + throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong); + } + int codePoint1 = (int) codePointLong; + if (byteMode) { + if (Integer.compareUnsigned(codePoint1, 0x10FFFF) > 0) { + throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong); } + String unicodeChar = new String(Character.toChars(codePoint1)); + byte[] utf8Bytes = unicodeChar.getBytes(StandardCharsets.UTF_8); + output.write(utf8Bytes); } else { - throw new PerlCompilerException("pack: invalid Unicode code point: " + codePointLong); + output.writeCharacter(codePoint1); } return hasUnicodeInNormalMode; } diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java b/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java index d08168d51..8b7019371 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Encode.java @@ -1132,15 +1132,11 @@ public static RuntimeList _utf8_on(RuntimeArray args, int ctx) { } RuntimeScalar arg = args.get(0); boolean wasUtf8 = (arg.type == STRING); - if (arg.type == BYTE_STRING) { - // Re-decode the byte string as UTF-8 to get proper characters - // e.g., bytes \xC3\xA9 -> character U+00E9 (é) - String s = arg.toString(); - byte[] bytes = s.getBytes(StandardCharsets.ISO_8859_1); - arg.set(new String(bytes, StandardCharsets.UTF_8)); + if (!wasUtf8) { + boolean fromBytes = (arg.type == BYTE_STRING); + arg.type = STRING; + arg.utf8UncheckedOctets = fromBytes; } - // Set the UTF-8 flag (change type to STRING) - arg.type = STRING; return new RuntimeScalar(wasUtf8).getList(); } @@ -1160,6 +1156,7 @@ public static RuntimeList _utf8_off(RuntimeArray args, int ctx) { arg.set(new String(bytes, StandardCharsets.ISO_8859_1)); } arg.type = BYTE_STRING; + arg.utf8UncheckedOctets = false; return new RuntimeScalar(wasUtf8).getList(); } diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/UnicodeUTF8.java b/src/main/java/org/perlonjava/runtime/perlmodule/UnicodeUTF8.java new file mode 100644 index 000000000..7641653ff --- /dev/null +++ b/src/main/java/org/perlonjava/runtime/perlmodule/UnicodeUTF8.java @@ -0,0 +1,451 @@ +package org.perlonjava.runtime.perlmodule; + +import org.perlonjava.runtime.operators.PerlUtfString; +import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.runtimetypes.*; + +import java.io.ByteArrayOutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Locale; + +import static org.perlonjava.runtime.runtimetypes.RuntimeContextType.SCALAR; +import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.BYTE_STRING; +import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.CODE; +import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.STRING; + +/** + * Java-backed implementation of CPAN {@code Unicode::UTF8} (XSLoader). + * + *

Validation, replacement, and maximal-subpart behaviour follow the logic shipped with + * Unicode-UTF8 0.70 (c-utf8 DFA), not ICU4J — plain UTF-8 is entirely handled in Java.

+ */ +public class UnicodeUTF8 extends PerlModuleBase { + + public static final String XS_VERSION = "0.70"; + + private static final int S_ERROR = 0; + private static final int S_ACCEPT = 6; + + /* + * Table from Unicode-UTF8-0.70 utf8_dfa32.h (BSD-licensed, Christian Hansen). + * utf8_dfa_step(state, b) == (UTF8_DFA[b] >> state) & 31 + */ + private static final int[] UTF8_DFA = { + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 384, 384, 384, 384, 384, 384, 384, 384, + 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, + 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, 554041376, + 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, + 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, 537266208, + 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, + 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, + 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, + 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, 8783904, + 0, 0, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 1216, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 1600, 64, 64, + 704, 1152, 1152, 1152, 1536, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; + + public UnicodeUTF8() { + super("Unicode::UTF8", false); + } + + public static void initialize() { + UnicodeUTF8 m = new UnicodeUTF8(); + m.initializeExporter(); + GlobalVariable.getGlobalVariable("Unicode::UTF8::VERSION").set(new RuntimeScalar(XS_VERSION)); + m.defineExport("EXPORT_OK", "decode_utf8", "encode_utf8", "valid_utf8"); + m.defineExportTag("all", "decode_utf8", "encode_utf8", "valid_utf8"); + try { + m.registerMethod("decode_utf8", null); + m.registerMethod("encode_utf8", null); + m.registerMethod("valid_utf8", null); + } catch (NoSuchMethodException e) { + System.err.println("Warning: Missing Unicode::UTF8 method: " + e.getMessage()); + } + } + + private static int dfaStep(int state, int b) { + return (UTF8_DFA[b] >> state) & 31; + } + + private static boolean utf8BufferValid(byte[] b, int pos, int len) { + int state = S_ACCEPT; + for (int i = pos; i < pos + len; i++) { + state = dfaStep(state, b[i] & 0xFF); + if (state == S_ERROR) { + return false; + } + } + return state == S_ACCEPT; + } + + private static int utf8MaximalSubpart(byte[] b, int pos, int rem) { + int state = S_ACCEPT; + for (int i = 0; i < rem; i++) { + state = dfaStep(state, b[pos + i] & 0xFF); + if (state == S_ACCEPT) { + return i + 1; + } + if (state == S_ERROR) { + return i > 0 ? i : 1; + } + } + return rem; + } + + private static int utf8MaximalPrefix(byte[] b, int pos, int len) { + int state = S_ACCEPT; + int prefix = 0; + for (int i = pos; i < pos + len; i++) { + state = dfaStep(state, b[i] & 0xFF); + if (state == S_ACCEPT) { + prefix = i - pos + 1; + } else if (state == S_ERROR) { + break; + } + } + return prefix; + } + + private static String hexByteSeq(byte[] b, int pos, int n) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < n; i++) { + if (i > 0) sb.append(' '); + sb.append(String.format("%02X", b[pos + i] & 0xFF)); + } + return sb.toString(); + } + + /** Perl xs_utf8_downgrade over UTF-8 bytes of a UTF-8-flagged string (C2/C3 only non-ASCII). */ + private static byte[] perlUtf8DowngradeOctets(RuntimeScalar in, String wideMessage) { + byte[] u8 = in.toString().getBytes(StandardCharsets.UTF_8); + ByteArrayOutputStream out = new ByteArrayOutputStream(u8.length); + int i = 0; + while (i < u8.length) { + int c = u8[i] & 0xFF; + if (c < 0x80) { + out.write(c); + i++; + } else { + if (i + 1 >= u8.length || (c & 0xFE) != 0xC2) { + dieWideChar(wideMessage); + } + int c2 = u8[i + 1] & 0xFF; + if ((c2 & 0xC0) != 0x80) { + dieWideChar(wideMessage); + } + out.write((c & 0x1F) << 6 | (c2 & 0x3F)); + i += 2; + } + } + return out.toByteArray(); + } + + private static void dieWideChar(String message) { + WarnDie.die(new RuntimeScalar(message), new RuntimeScalar("\n")); + } + + private static byte[] inputOctetsDecode(RuntimeScalar in) { + if (in.type == STRING) { + return perlUtf8DowngradeOctets(in, "Can't decode a wide character string"); + } + String s = in.toString(); + byte[] b = new byte[s.length()]; + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch > 0xFF) { + dieWideChar("Can't decode a wide character string"); + } + b[i] = (byte) ch; + } + return b; + } + + private static byte[] inputOctetsValid(RuntimeScalar in) { + if (in.type == STRING) { + return perlUtf8DowngradeOctets(in, "Can't validate a wide character string"); + } + String s = in.toString(); + byte[] b = new byte[s.length()]; + for (int i = 0; i < s.length(); i++) { + char ch = s.charAt(i); + if (ch > 0xFF) { + dieWideChar("Can't validate a wide character string"); + } + b[i] = (byte) ch; + } + return b; + } + + /** + * UTF-8 byte triple that encodes a surrogate scalar (U+D800..U+DFFF) — always illegal in UTF-8. + * Matches XS tests for {@code decode_utf8} under {@code use warnings FATAL => 'utf8'}. + */ + private static boolean isSurrogateIllegalUtf83At(byte[] b, int pos) { + if (pos + 3 > b.length) { + return false; + } + int b0 = b[pos] & 0xFF; + int b1 = b[pos + 1] & 0xFF; + int b2 = b[pos + 2] & 0xFF; + if (b0 != 0xED || b1 < 0xA0 || b1 > 0xBF || b2 < 0x80 || b2 > 0xBF) { + return false; + } + int cp = 0xD800 + ((b1 - 0xA0) << 6) + (b2 - 0x80); + return cp >= 0xD800 && cp <= 0xDFFF; + } + + private static void emitUtf8WarnOrDie(String message) { + if (WarningFlags.isWarningSuppressedAtRuntime("utf8")) { + return; + } + // Do not route through warnings::warnif from Java: warnIf's caller(0) is wrong here. + // Use the bits saved for this XS call site (same source as caller()[9] in Perl). + String bits = Warnings.getJavaNativeXsCallSiteWarningBits(); + boolean categoryEnabled = bits != null && WarningFlags.isEnabledInBits(bits, "utf8"); + if (!categoryEnabled) { + if (Warnings.isWarnFlagSet()) { + WarnDie.warn(new RuntimeScalar(message), new RuntimeScalar("")); + } + return; + } + if (WarningFlags.isFatalInBits(bits, "utf8")) { + WarnDie.die(new RuntimeScalar(message), new RuntimeScalar("\n")); + } else { + WarnDie.warn(new RuntimeScalar(message), new RuntimeScalar("")); + } + } + + public static RuntimeList decode_utf8(RuntimeArray args, int ctx) { + if (args.isEmpty()) { + throw new IllegalStateException("decode_utf8: need octets"); + } + RuntimeScalar oct = args.get(0); + RuntimeScalar fallback = + args.size() > 1 && args.get(1).type == CODE ? args.get(1) : null; + + byte[] b = inputOctetsDecode(oct); + if (utf8BufferValid(b, 0, b.length)) { + RuntimeScalar r = new RuntimeScalar(new String(b, StandardCharsets.UTF_8)); + r.type = STRING; + return new RuntimeList(r); + } + + StringBuilder out = new StringBuilder(); + int pos = 0; + while (pos < b.length) { + int good = utf8MaximalPrefix(b, pos, b.length - pos); + if (good > 0) { + out.append(new String(b, pos, good, StandardCharsets.UTF_8)); + pos += good; + } + if (pos >= b.length) { + break; + } + int skip = utf8MaximalSubpart(b, pos, b.length - pos); + if (!WarningFlags.isWarningSuppressedAtRuntime("utf8") + && pos + 3 <= b.length + && isSurrogateIllegalUtf83At(b, pos)) { + String hex3 = hexByteSeq(b, pos, 3); + WarnDie.die( + new RuntimeScalar("Can't decode ill-formed UTF-8 octet sequence <" + hex3 + "> in position " + pos), + new RuntimeScalar("\n")); + } + String hex = hexByteSeq(b, pos, skip); + emitUtf8WarnOrDie("Can't decode ill-formed UTF-8 octet sequence <" + hex + "> in position " + pos); + if (fallback != null) { + byte[] slice = new byte[skip]; + System.arraycopy(b, pos, slice, 0, skip); + RuntimeScalar octSlice = byteArrayToByteString(slice); + RuntimeList fr = RuntimeCode.apply( + fallback, + new RuntimeArray(octSlice, new RuntimeScalar(0), new RuntimeScalar(pos)), + SCALAR); + appendDecodeFallbackChunk(out, fr); + } else { + out.append('\uFFFD'); + } + pos += skip; + } + RuntimeScalar r = new RuntimeScalar(out.toString()); + r.type = STRING; + return new RuntimeList(r); + } + + private static void appendDecodeFallbackChunk(StringBuilder out, RuntimeList fr) { + if (fr.isEmpty()) { + return; + } + RuntimeScalar v = fr.scalar(); + if (v.type == STRING) { + out.append(v.toString()); + } else { + String s = v.toString(); + for (int i = 0; i < s.length(); i++) { + int c = s.charAt(i) & 0xFF; + out.append((char) c); + } + } + } + + private static RuntimeScalar byteArrayToByteString(byte[] raw) { + RuntimeScalar r = new RuntimeScalar(new String(raw, StandardCharsets.ISO_8859_1)); + r.type = BYTE_STRING; + return r; + } + + public static RuntimeList encode_utf8(RuntimeArray args, int ctx) { + if (args.isEmpty()) { + throw new IllegalStateException("encode_utf8: need string"); + } + RuntimeScalar s = args.get(0); + RuntimeScalar fallback = + args.size() > 1 && args.get(1).type == CODE ? args.get(1) : null; + + String str = s.toString(); + /* + * SvUTF8 (STRING) with one ISO-8859-1 code unit per underlying octet — including + * ill-formed UTF-8 left by Encode::_utf8_on (t/090_non_shortest_form encode tests). + */ + if (s.type == STRING + && s.utf8UncheckedOctets + && str.length() == str.codePoints().count() + && str.chars().allMatch(ch -> ch >= 0 && ch < 0x100)) { + byte[] raw = new byte[str.length()]; + for (int i = 0; i < str.length(); i++) { + raw[i] = (byte) str.charAt(i); + } + if (!utf8BufferValid(raw, 0, raw.length)) { + WarnDie.die( + new RuntimeScalar("Can't decode ill-formed UTF-X octet sequence"), + new RuntimeScalar("\n")); + } + str = new String(raw, StandardCharsets.UTF_8); + return new RuntimeList(encodeUtf8FromUnicodeString(str, fallback)); + } + + /* + * Downgraded / native character strings: each character is U+00..U+FF (not SvUTF8). + * Wider scalars (e.g. pack("U", ...) without SvUTF8) must take the Unicode path. + */ + if (s.type != STRING && str.codePoints().allMatch(cp -> cp >= 0 && cp < 0x100)) { + return new RuntimeList(encodeNativeUpgradable(str)); + } + + return new RuntimeList(encodeUtf8FromUnicodeString(str, fallback)); + } + + private static RuntimeScalar encodeUtf8FromUnicodeString(String str, RuntimeScalar fallback) { + ByteArrayOutputStream bo = new ByteArrayOutputStream(str.length() * 2); + int charPos = 0; + for (int offset = 0; offset < str.length(); ) { + PerlUtfString.PerlStep step = PerlUtfString.readOnePerlLogical(str, offset); + long cp = step.codePoint(); + offset = step.nextJavaIndex(); + if (cp >= 0xD800 && cp <= 0xDFFF) { + if (!WarningFlags.isWarningSuppressedAtRuntime("utf8")) { + WarnDie.die( + new RuntimeScalar("Can't represent surrogate code point U+" + + String.format("%04X", cp).toUpperCase(Locale.ROOT) + + " in position " + charPos), + new RuntimeScalar("\n")); + } + encodeUnmappableReplace(charPos, (int) cp, fallback, bo); + } else if (cp > 0x10FFFFL) { + if (!WarningFlags.isWarningSuppressedAtRuntime("utf8")) { + WarnDie.die( + new RuntimeScalar( + "Can't represent super code point \\x{" + + Long.toUnsignedString(cp, 16).toUpperCase(Locale.ROOT) + + "} in position " + charPos), + new RuntimeScalar("\n")); + } + encodeUnmappableReplace(charPos, (int) cp, fallback, bo); + } else { + bo.writeBytes(new String(Character.toChars((int) cp)).getBytes(StandardCharsets.UTF_8)); + } + charPos++; + } + return byteArrayToByteString(bo.toByteArray()); + } + + private static void encodeUnmappableReplace( + int charPos, + int cp, + RuntimeScalar fallback, + ByteArrayOutputStream bo) { + if (fallback != null) { + int usv = (cp <= 0x10FFFF && (cp & 0xF800) != 0xD800) ? cp : 0; + RuntimeList fr = RuntimeCode.apply( + fallback, + new RuntimeArray(new RuntimeScalar(cp), new RuntimeScalar(usv), new RuntimeScalar(charPos)), + SCALAR); + if (!fr.isEmpty()) { + appendEncodeFallbackBytes(bo, fr.scalar()); + } + } else { + bo.writeBytes("\uFFFD".getBytes(StandardCharsets.UTF_8)); + } + } + + private static void appendEncodeFallbackBytes(ByteArrayOutputStream bo, RuntimeScalar v) { + if (v.type == STRING) { + bo.writeBytes(v.toString().getBytes(StandardCharsets.UTF_8)); + } else { + String s = v.toString(); + for (int i = 0; i < s.length(); i++) { + int c = s.charAt(i) & 0xFF; + if (c < 0x80) { + bo.write(c); + } else { + bo.write(0xC0 | ((c >> 6) & 0x1F)); + bo.write(0x80 | (c & 0x3F)); + } + } + } + } + + private static RuntimeScalar encodeNativeUpgradable(String latin1Chars) { + ByteArrayOutputStream bo = new ByteArrayOutputStream(latin1Chars.length() * 2); + for (int i = 0; i < latin1Chars.length(); i++) { + int c = latin1Chars.charAt(i) & 0xFF; + if (c < 0x80) { + bo.write(c); + } else { + bo.write(0xC0 | ((c >> 6) & 0x1F)); + bo.write(0x80 | (c & 0x3F)); + } + } + return byteArrayToByteString(bo.toByteArray()); + } + + public static RuntimeList valid_utf8(RuntimeArray args, int ctx) { + if (args.isEmpty()) { + throw new IllegalStateException("valid_utf8: need octets"); + } + RuntimeScalar in = args.get(0); + byte[] b = inputOctetsValid(in); + boolean ok = utf8BufferValid(b, 0, b.length); + return new RuntimeList(new RuntimeScalar(ok)); + } +} diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Warnings.java b/src/main/java/org/perlonjava/runtime/perlmodule/Warnings.java index e3e486407..ea8b010c7 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Warnings.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Warnings.java @@ -1,6 +1,7 @@ package org.perlonjava.runtime.perlmodule; import org.perlonjava.frontend.semantic.ScopedSymbolTable; +import org.perlonjava.runtime.WarningBitsRegistry; import org.perlonjava.runtime.operators.WarnDie; import org.perlonjava.runtime.runtimetypes.*; @@ -114,12 +115,100 @@ private static RuntimeScalar getCallerLocation(int level) { return new RuntimeScalar(""); } + /** + * Walks {@code caller()} until warning bits are found. Required when XS is implemented + * in Java: several stack frames may be native with no ${^WARNING_BITS}. + */ + private static String getWarningBitsForJavaNativeXs() { + for (int level = 0; level < 25; level++) { + String bits = getWarningBitsAtLevel(level); + if (bits != null && !bits.isEmpty()) { + return bits; + } + } + return null; + } + + /** + * Lexical ${^WARNING_BITS} at the Perl call site of a Java-implemented XS routine. + * Prefer the value saved by {@link RuntimeCode#apply} on {@link WarningBitsRegistry#callerBitsStack}; + * fall back to walking {@code caller()} like {@link #warnIf}. + */ + public static String getJavaNativeXsCallSiteWarningBits() { + String fromStack = WarningBitsRegistry.getCallerBitsAtFrame(0); + if (fromStack != null && !fromStack.isEmpty()) { + return fromStack; + } + return getWarningBitsForJavaNativeXs(); + } + + /** + * True if {@code category} is FATAL on any {@code caller()} level (used by Java XS where + * inner blocks use {@code use warnings FATAL => 'utf8'}). + */ + public static boolean isCategoryFatalOnAnyCallerLevel(String category) { + if (category == null) { + return false; + } + if (WarningFlags.isWarningSuppressedAtRuntime(category)) { + return false; + } + if (WarningFlags.isCustomCategory(category)) { + String bits = findExternalCallerBits(); + return bits != null && WarningFlags.isFatalInBits(bits, category); + } + for (int level = 0; level < 25; level++) { + String bits = getWarningBitsAtLevel(level); + if (bits != null && WarningFlags.isFatalInBits(bits, category)) { + return true; + } + } + return false; + } + + /** + * True if {@code category} is FATAL in the first Perl caller's lexical warning bits + * (used by Java-implemented XS, e.g. {@code Unicode::UTF8}). + */ + public static boolean isCategoryFatalAtPerlXsCaller(String category) { + if (category == null) { + return false; + } + if (WarningFlags.isWarningSuppressedAtRuntime(category)) { + return false; + } + String bits = WarningFlags.isCustomCategory(category) + ? findExternalCallerBits() + : getWarningBitsForJavaNativeXs(); + return bits != null && WarningFlags.isFatalInBits(bits, category); + } + + /** + * True if {@code category} is enabled in the first Perl caller's lexical warning bits + * (or only via {@code $^W} when bits do not mention the category). + */ + public static boolean isCategoryEnabledAtPerlXsCaller(String category) { + if (category == null) { + return false; + } + if (WarningFlags.isWarningSuppressedAtRuntime(category)) { + return false; + } + String bits = WarningFlags.isCustomCategory(category) + ? findExternalCallerBits() + : getWarningBitsForJavaNativeXs(); + if (bits != null && WarningFlags.isEnabledInBits(bits, category)) { + return true; + } + return isWarnFlagSet(); + } + /** * Walks up the call stack past frames in warnings-registered packages to find * the "external caller" whose warning bits should be checked. This implements * Perl 5's _error_loc() behavior: skip frames in any package that has used * warnings::register (i.e., any custom warning category package). - * + * * @return The warning bits string from the first caller outside registered packages, * or null if not found */ diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java index 9412dc495..85d846c27 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeScalar.java @@ -78,6 +78,13 @@ private static boolean mightBeInteger(String s) { */ public boolean ioOwner; + /** + * When {@link #type} is {@link RuntimeScalarType#STRING}, true if this value was produced by + * {@code Encode::_utf8_on} on a {@link RuntimeScalarType#BYTE_STRING} without decoding octets. + * Character ordinals then match raw UTF-8 bytes (possibly ill-formed), as in Perl. + */ + public boolean utf8UncheckedOctets; + /** * When this scalar is installed in {@link GlobalVariable#globalCodeRefs}, the map key * (fully-qualified name such as {@code My::Pkg::foo}). Used to invalidate @@ -197,6 +204,7 @@ public RuntimeScalar(RuntimeScalar scalar) { } this.type = scalar.type; this.value = scalar.value; + this.utf8UncheckedOctets = scalar.utf8UncheckedOctets; } public RuntimeScalar(RuntimeCode value) { @@ -1049,10 +1057,12 @@ public RuntimeScalar set(RuntimeScalar value) { if (this != value) { this.type = value.type; this.value = value.value; + this.utf8UncheckedOctets = value.utf8UncheckedOctets; RuntimePosLvalue.invalidatePos(this); } else { this.type = value.type; this.value = value.value; + this.utf8UncheckedOctets = value.utf8UncheckedOctets; } return this; } @@ -1139,6 +1149,7 @@ private RuntimeScalar setLarge(RuntimeScalar value) { // so no refCount was incremented/decremented, and no mortal entries were added. this.type = value.type; this.value = value.value; + this.utf8UncheckedOctets = value.utf8UncheckedOctets; return this; } @@ -1178,6 +1189,7 @@ private RuntimeScalar setLargeRefCounted(RuntimeScalar value) { } else { this.type = value.type; this.value = value.value; + this.utf8UncheckedOctets = value.utf8UncheckedOctets; return this; } } @@ -1294,6 +1306,7 @@ private RuntimeScalar setLargeRefCounted(RuntimeScalar value) { // Do the assignment this.type = value.type; this.value = value.value; + this.utf8UncheckedOctets = value.utf8UncheckedOctets; // DESTROY rescue detection for reference types. // Only trigger when the OLD value was a reference to the DESTROY target @@ -1547,6 +1560,7 @@ public RuntimeScalar set(String value) { this.type = RuntimeScalarType.STRING; } this.value = value; + this.utf8UncheckedOctets = false; return this; } diff --git a/src/main/perl/lib/Unicode/UTF8.pm b/src/main/perl/lib/Unicode/UTF8.pm new file mode 100644 index 000000000..b70598b90 --- /dev/null +++ b/src/main/perl/lib/Unicode/UTF8.pm @@ -0,0 +1,20 @@ +package Unicode::UTF8; + +use strict; +use warnings; + +BEGIN { + our $VERSION = '0.70'; + our @EXPORT_OK = qw[ decode_utf8 encode_utf8 valid_utf8 ]; + our %EXPORT_TAGS = ( + all => [@EXPORT_OK], + ); + + require XSLoader; + XSLoader::load(__PACKAGE__, $VERSION); + + require Exporter; + *import = \&Exporter::import; +} + +1;