diff --git a/docs/cli_commands.md b/docs/cli_commands.md index 9accb2998d..374899e2c2 100644 --- a/docs/cli_commands.md +++ b/docs/cli_commands.md @@ -19,6 +19,7 @@ This document provides an overview of CLI commands that can be sent to MeshCore - [GPS](#gps-when-gps-support-is-compiled-in) - [Sensors](#sensors-when-sensor-support-is-compiled-in) - [Bridge](#bridge-when-bridge-support-is-compiled-in) + - [Channel Content Filter](#channel-content-filter-when-channel-filtering-is-compiled-in) --- @@ -1100,3 +1101,80 @@ region save **Note:** Returns an error on boards without power management support. --- + +### Channel Content Filter (When channel filtering is compiled in) + +Repeater only. Lets a repeater decrypt channels it holds the key for, inspect the plaintext, and refuse to retransmit messages that match a blocked keyword or sender name. With nothing configured, behaviour is identical to a stock repeater. + +**How it works:** the repeater only decrypts channels you explicitly load a key for (see `filter channel`). For those channels it reads the sender name and message text; any message matching a blocked keyword (text, case-insensitive substring) or blocked sender (case-insensitive substring of the sender name) is dropped instead of forwarded. All other channels — and direct messages — are never decrypted and forward exactly as before. + +**Unicode handling:** before matching, both the message and your blocked terms are Unicode-folded so common evasion tricks don't slip through — look-alike characters (fullwidth, mathematical bold/italic, circled/squared letters, regional-indicator "flag" letters, and common Cyrillic/Greek homoglyphs) are mapped to the plain ASCII letter they imitate, accents are stripped, and zero-width / combining / variation-selector characters are removed. + +**Limitations:** +- This only stops **this** repeater from forwarding the message. 
Other repeaters running stock firmware still forward it, so this thins coverage at your node rather than removing the message from the mesh. +- Only works for channels whose key the repeater holds (the built-in public channel, plus any channel PSK you add). +- Sender names are self-declared in the channel payload and easily spoofed, so `filter sender` is a weak control on its own. + +**Config storage:** persisted to `/channel_filter` on the node's filesystem. + +--- + +#### Show the current filter configuration +**Usage:** +- `filter` +- `filter list` + +**Note:** Reports channel/keyword/sender counts, the number of messages filtered since boot (or since the last `filter stats reset`), and the configured keyword and sender terms. + +--- + +#### View or reset the filtered-message counter +**Usage:** +- `filter stats` +- `filter stats reset` + +**Note:** `filter stats` reports `filtered:<n> channels:<n> keywords:<n> senders:<n>`. The counter is a runtime value and is not persisted, so it also resets on reboot. + +--- + +#### Add or remove a channel to decrypt +**Usage:** +- `filter channel <psk>` +- `filter channel public` +- `filter channel clear` + +**Parameters:** +- `psk`: Channel pre-shared key in Base64 (16 or 32 bytes when decoded), e.g. the value shared by a MeshCore client. The literal `public` is a shortcut for the well-known public channel key. + +**Note:** Adding a channel only enables decryption/inspection of that channel; messages still forward normally unless they match a blocked keyword or sender. `filter channel clear` removes all channel keys (the repeater stops decrypting and forwards everything blind again). + +--- + +#### Block a keyword +**Usage:** +- `filter block <keyword>` + +**Parameters:** +- `keyword`: Text to match (case-insensitive substring) against the whole message. Max 23 bytes of UTF-8 (23 characters for plain ASCII). + +**Note:** Matches anywhere in the message — both the body and the sender name — so a blocked word can't be hidden in a self-declared sender name. 
+ +--- + +#### Block a sender name +**Usage:** +- `filter sender ` + +**Parameters:** +- `name`: Text to match (case-insensitive substring) against the sender's display name. Max 23 characters. + +--- + +#### Clear or reset filter terms +**Usage:** +- `filter clear` +- `filter reset` + +**Note:** `filter clear` empties the keyword and sender lists but keeps the loaded channel keys. `filter reset` wipes everything — channel keys, keywords and senders. + +--- diff --git a/examples/simple_repeater/MyMesh.cpp b/examples/simple_repeater/MyMesh.cpp index 096907494b..9e2fb6d89d 100644 --- a/examples/simple_repeater/MyMesh.cpp +++ b/examples/simple_repeater/MyMesh.cpp @@ -1,6 +1,44 @@ #include "MyMesh.h" #include +#ifdef WITH_CHANNEL_FILTER +#include "UnicodeFold.h" + +/* --------------------- public-channel content filter ------------------ */ + +// The well-known MeshCore public channel PSK ("izOH6cXN6mrJ5e26oRXNcg==") +static const uint8_t PUBLIC_CHANNEL_SECRET[16] = { + 0x8b, 0x33, 0x87, 0xe9, 0xc5, 0xcd, 0xea, 0x6a, + 0xc9, 0xe5, 0xed, 0xba, 0xa1, 0x15, 0xcd, 0x72 +}; + +static int b64Val(char c) { + if (c >= 'A' && c <= 'Z') return c - 'A'; + if (c >= 'a' && c <= 'z') return c - 'a' + 26; + if (c >= '0' && c <= '9') return c - '0' + 52; + if (c == '+') return 62; + if (c == '/') return 63; + return -1; +} + +static int decodeBase64(const char* in, uint8_t* out, int max_out) { + int bits = 0, nbits = 0, n = 0; + for (const char* p = in; *p && *p != '='; p++) { + int v = b64Val(*p); + if (v < 0) continue; // skip whitespace and other non-alphabet chars + bits = (bits << 6) | v; + nbits += 6; + if (nbits >= 8) { + nbits -= 8; + if (n >= max_out) return -1; + out[n++] = (bits >> nbits) & 0xFF; + } + } + return n; +} +#endif // WITH_CHANNEL_FILTER + + /* ------------------------------ Config -------------------------------- */ #ifndef LORA_FREQ @@ -627,6 +665,213 @@ void MyMesh::getPeerSharedSecret(uint8_t *dest_secret, int peer_idx) { } } +#ifdef WITH_CHANNEL_FILTER 
+bool MyMesh::addFilterChannel(const char *psk_b64) { + if (num_filter_channels >= MAX_FILTER_CHANNELS) return false; + + auto ch = &filter_channels[num_filter_channels]; + memset(ch->secret, 0, sizeof(ch->secret)); + + int len; + if (strcmp(psk_b64, "public") == 0) { + memcpy(ch->secret, PUBLIC_CHANNEL_SECRET, sizeof(PUBLIC_CHANNEL_SECRET)); + len = sizeof(PUBLIC_CHANNEL_SECRET); + } else { + len = decodeBase64(psk_b64, ch->secret, sizeof(ch->secret)); + } + if (len != 16 && len != 32) return false; + + mesh::Utils::sha256(ch->hash, sizeof(ch->hash), ch->secret, len); + StrHelper::strncpy(filter_channel_psk[num_filter_channels], psk_b64, FILTER_PSK_B64_LEN); + num_filter_channels++; + return true; +} + +int MyMesh::searchChannelsByHash(const uint8_t *hash, mesh::GroupChannel channels[], int max_matches) { + int n = 0; + for (int i = 0; i < num_filter_channels && n < max_matches; i++) { + if (filter_channels[i].hash[0] == hash[0]) { + channels[n++] = filter_channels[i]; + } + } + return n; +} + +void MyMesh::onGroupDataRecv(mesh::Packet *packet, uint8_t type, const mesh::GroupChannel &channel, + uint8_t *data, size_t len) { + if (type != PAYLOAD_TYPE_GRP_TXT) return; // only inspect channel text messages + if (len < 6) return; + if ((data[4] >> 2) != 0) return; // not a plain-text message + if (len >= MAX_PACKET_PAYLOAD) return; // crafted over-long payload; avoid OOB on data[len] + + data[len] = 0; // make a C string: "sender_name: text" + const char *msg = (const char *)&data[5]; + + const char *sep = strstr(msg, ": "); + + // Unicode-fold so homoglyph / zero-width tricks can't evade the blocklist (see + // UnicodeFold.h). Keywords match the whole message (sender + text) so a blocked + // word can't be hidden in the self-declared sender name. Terms are folded the + // same way at match time. 
+ char folded_msg[MAX_PACKET_PAYLOAD]; + ufold::foldUtf8(msg, folded_msg, sizeof(folded_msg)); + + char folded_sender[40]; + folded_sender[0] = 0; + if (sep) { + char sender[40]; + int slen = sep - msg; + if (slen >= (int)sizeof(sender)) slen = sizeof(sender) - 1; + memcpy(sender, msg, slen); + sender[slen] = 0; + ufold::foldUtf8(sender, folded_sender, sizeof(folded_sender)); + } + + bool blocked = false; + const char *reason = "keyword"; + char fterm[FILTER_TERM_LEN]; + + for (int i = 0; i < num_block_senders && !blocked; i++) { + ufold::foldUtf8(block_senders[i], fterm, sizeof(fterm)); + if (fterm[0] && strstr(folded_sender, fterm)) { blocked = true; reason = "sender"; } + } + for (int i = 0; i < num_block_keywords && !blocked; i++) { + ufold::foldUtf8(block_keywords[i], fterm, sizeof(fterm)); + if (fterm[0] && strstr(folded_msg, fterm)) blocked = true; + } + + if (blocked) { + packet->markDoNotRetransmit(); // routeRecvPacket() will now release instead of forwarding + n_filtered++; + MESH_DEBUG_PRINTLN("filter: dropping channel msg (%s): %s", reason, msg); + if (_logging) { + File f = openAppend(PACKET_LOG_FILE); + if (f) { + f.print(getLogDateTime()); + f.printf(": FILTERED (%s): %s\n", reason, msg); + f.close(); + } + } + } +} + +void MyMesh::loadChannelFilter() { + num_filter_channels = 0; + num_block_keywords = 0; + num_block_senders = 0; + +#if defined(RP2040_PLATFORM) + File f = _fs->open(CHANNEL_FILTER_FILE, "r"); +#else + File f = _fs->open(CHANNEL_FILTER_FILE); +#endif + if (!f) return; + + char line[FILTER_PSK_B64_LEN + 8]; + while (f.available()) { + int n = f.readBytesUntil('\n', (uint8_t *)line, sizeof(line) - 1); + line[n] = 0; + while (n > 0 && (line[n - 1] == '\r' || line[n - 1] == ' ')) line[--n] = 0; + if (n < 3 || line[1] != ' ') continue; + + const char *val = &line[2]; + if (line[0] == 'C') { + addFilterChannel(val); + } else if (line[0] == 'K' && num_block_keywords < MAX_FILTER_TERMS) { + 
StrHelper::strncpy(block_keywords[num_block_keywords++], val, FILTER_TERM_LEN); + } else if (line[0] == 'S' && num_block_senders < MAX_FILTER_TERMS) { + StrHelper::strncpy(block_senders[num_block_senders++], val, FILTER_TERM_LEN); + } + } + f.close(); +} + +void MyMesh::saveChannelFilter() { + _fs->remove(CHANNEL_FILTER_FILE); + File f = openAppend(CHANNEL_FILTER_FILE); + if (!f) return; + for (int i = 0; i < num_filter_channels; i++) f.printf("C %s\n", filter_channel_psk[i]); + for (int i = 0; i < num_block_keywords; i++) f.printf("K %s\n", block_keywords[i]); + for (int i = 0; i < num_block_senders; i++) f.printf("S %s\n", block_senders[i]); + f.close(); +} + +void MyMesh::handleFilterCommand(char *command, char *reply) { + char *arg = command + 6; // skip "filter" + while (*arg == ' ') arg++; + + if (*arg == 0 || strcmp(arg, "list") == 0) { + char *dp = reply; + dp += sprintf(dp, "channels:%d keywords:%d senders:%d filtered:%u", num_filter_channels, + num_block_keywords, num_block_senders, (unsigned)n_filtered); + for (int i = 0; i < num_block_keywords && dp - reply < 120; i++) dp += sprintf(dp, "\nK:%s", block_keywords[i]); + for (int i = 0; i < num_block_senders && dp - reply < 120; i++) dp += sprintf(dp, "\nS:%s", block_senders[i]); + return; + } + if (memcmp(arg, "stats", 5) == 0 && (arg[5] == 0 || arg[5] == ' ')) { + char *sub = arg + 5; + while (*sub == ' ') sub++; + if (strcmp(sub, "reset") == 0) { + n_filtered = 0; + strcpy(reply, "OK - stats reset"); + } else { + sprintf(reply, "filtered:%u channels:%d keywords:%d senders:%d", (unsigned)n_filtered, + num_filter_channels, num_block_keywords, num_block_senders); + } + return; + } + if (memcmp(arg, "channel ", 8) == 0) { + char *val = arg + 8; + while (*val == ' ') val++; + if (strcmp(val, "clear") == 0) { + num_filter_channels = 0; + saveChannelFilter(); + strcpy(reply, "OK - channels cleared"); + } else if (addFilterChannel(val)) { + saveChannelFilter(); + sprintf(reply, "OK - %d channel(s)", 
num_filter_channels); + } else { + strcpy(reply, "Err - bad PSK or list full"); + } + return; + } + if (memcmp(arg, "block ", 6) == 0) { + char *val = arg + 6; + while (*val == ' ') val++; + if (*val == 0) { strcpy(reply, "Err - empty keyword"); return; } + if (num_block_keywords >= MAX_FILTER_TERMS) { strcpy(reply, "Err - keyword list full"); return; } + StrHelper::strncpy(block_keywords[num_block_keywords++], val, FILTER_TERM_LEN); + saveChannelFilter(); + sprintf(reply, "OK - %d keyword(s)", num_block_keywords); + return; + } + if (memcmp(arg, "sender ", 7) == 0) { + char *val = arg + 7; + while (*val == ' ') val++; + if (*val == 0) { strcpy(reply, "Err - empty sender"); return; } + if (num_block_senders >= MAX_FILTER_TERMS) { strcpy(reply, "Err - sender list full"); return; } + StrHelper::strncpy(block_senders[num_block_senders++], val, FILTER_TERM_LEN); + saveChannelFilter(); + sprintf(reply, "OK - %d sender(s)", num_block_senders); + return; + } + if (strcmp(arg, "clear") == 0) { + num_block_keywords = 0; + num_block_senders = 0; + saveChannelFilter(); + strcpy(reply, "OK - blocks cleared"); + return; + } + if (strcmp(arg, "reset") == 0) { + num_filter_channels = num_block_keywords = num_block_senders = 0; + saveChannelFilter(); + strcpy(reply, "OK - filter reset"); + return; + } + strcpy(reply, "Err - usage: filter [list|stats [reset]|channel |block |sender |clear|reset]"); +} +#endif // WITH_CHANNEL_FILTER + static bool isShare(const mesh::Packet *packet) { if (packet->hasTransportCodes()) { return packet->transport_codes[0] == 0 && packet->transport_codes[1] == 0; // codes { 0, 0 } means 'send to nowhere' @@ -862,6 +1107,12 @@ MyMesh::MyMesh(mesh::MainBoard &board, mesh::Radio &radio, mesh::MillisecondCloc { last_millis = 0; uptime_millis = 0; +#ifdef WITH_CHANNEL_FILTER + num_filter_channels = 0; + num_block_keywords = 0; + num_block_senders = 0; + n_filtered = 0; +#endif next_local_advert = next_flood_advert = 0; dirty_contacts_expiry = 0; set_radio_at = 
revert_radio_at = 0; @@ -930,6 +1181,9 @@ void MyMesh::begin(FILESYSTEM *fs) { // load persisted prefs _cli.loadPrefs(_fs); acl.load(_fs, self_id); +#ifdef WITH_CHANNEL_FILTER + loadChannelFilter(); +#endif // TODO: key_store.begin(); region_map.load(_fs); @@ -1257,6 +1511,10 @@ void MyMesh::handleCommand(uint32_t sender_timestamp, char *command, char *reply sendNodeDiscoverReq(); strcpy(reply, "OK - Discover sent"); } +#ifdef WITH_CHANNEL_FILTER + } else if (memcmp(command, "filter", 6) == 0 && (command[6] == 0 || command[6] == ' ')) { + handleFilterCommand(command, reply); +#endif } else{ _cli.handleCommand(sender_timestamp, command, reply); // common CLI commands } diff --git a/examples/simple_repeater/MyMesh.h b/examples/simple_repeater/MyMesh.h index 7597c6c6f6..12d3357d86 100644 --- a/examples/simple_repeater/MyMesh.h +++ b/examples/simple_repeater/MyMesh.h @@ -80,6 +80,19 @@ struct NeighbourInfo { #define PACKET_LOG_FILE "/packet_log" +#ifdef WITH_CHANNEL_FILTER +#define CHANNEL_FILTER_FILE "/channel_filter" + +#ifndef MAX_FILTER_CHANNELS + #define MAX_FILTER_CHANNELS 4 +#endif +#ifndef MAX_FILTER_TERMS + #define MAX_FILTER_TERMS 8 +#endif +#define FILTER_TERM_LEN 24 +#define FILTER_PSK_B64_LEN 48 +#endif + class MyMesh : public mesh::Mesh, public CommonCLICallbacks { FILESYSTEM* _fs; uint32_t last_millis; @@ -113,6 +126,17 @@ class MyMesh : public mesh::Mesh, public CommonCLICallbacks { uint8_t pending_sf; uint8_t pending_cr; int matching_peer_indexes[MAX_CLIENTS]; + +#ifdef WITH_CHANNEL_FILTER + mesh::GroupChannel filter_channels[MAX_FILTER_CHANNELS]; + char filter_channel_psk[MAX_FILTER_CHANNELS][FILTER_PSK_B64_LEN]; + uint8_t num_filter_channels; + char block_keywords[MAX_FILTER_TERMS][FILTER_TERM_LEN]; + uint8_t num_block_keywords; + char block_senders[MAX_FILTER_TERMS][FILTER_TERM_LEN]; + uint8_t num_block_senders; + uint32_t n_filtered; +#endif #if defined(WITH_RS232_BRIDGE) RS232Bridge bridge; #elif defined(WITH_ESPNOW_BRIDGE) @@ -130,6 +154,13 @@ 
class MyMesh : public mesh::Mesh, public CommonCLICallbacks { File openAppend(const char* fname); bool isLooped(const mesh::Packet* packet, const uint8_t max_counters[]); +#ifdef WITH_CHANNEL_FILTER + bool addFilterChannel(const char* psk_b64); + void loadChannelFilter(); + void saveChannelFilter(); + void handleFilterCommand(char* command, char* reply); +#endif + protected: float getAirtimeBudgetFactor() const override { return _prefs.airtime_factor; @@ -167,6 +198,10 @@ class MyMesh : public mesh::Mesh, public CommonCLICallbacks { void onAnonDataRecv(mesh::Packet* packet, const uint8_t* secret, const mesh::Identity& sender, uint8_t* data, size_t len) override; int searchPeersByHash(const uint8_t* hash) override; +#ifdef WITH_CHANNEL_FILTER + int searchChannelsByHash(const uint8_t* hash, mesh::GroupChannel channels[], int max_matches) override; + void onGroupDataRecv(mesh::Packet* packet, uint8_t type, const mesh::GroupChannel& channel, uint8_t* data, size_t len) override; +#endif void getPeerSharedSecret(uint8_t* dest_secret, int peer_idx) override; void onAdvertRecv(mesh::Packet* packet, const mesh::Identity& id, uint32_t timestamp, const uint8_t* app_data, size_t app_data_len); void onPeerDataRecv(mesh::Packet* packet, uint8_t type, int sender_idx, const uint8_t* secret, uint8_t* data, size_t len) override; diff --git a/examples/simple_repeater/UnicodeFold.h b/examples/simple_repeater/UnicodeFold.h new file mode 100644 index 0000000000..b70f59d0f9 --- /dev/null +++ b/examples/simple_repeater/UnicodeFold.h @@ -0,0 +1,251 @@ +#pragma once + +#include +#include +#include +#include + +// Best-effort Unicode confusable folding for the channel content filter. +// +// foldUtf8() normalises a UTF-8 string so that homoglyph / look-alike tricks +// don't slip past a keyword or sender blocklist. 
// It:
//   - folds confusable codepoints (fullwidth, mathematical alphanumerics,
//     circled / squared / parenthesized letters, regional-indicator "flag"
//     letters, common Cyrillic & Greek look-alikes, accented Latin) to the
//     plain ASCII letter they imitate,
//   - drops invisible codepoints (zero-width spaces/joiners, combining marks,
//     variation selectors, bidi controls, emoji skin-tone modifiers, ...),
//   - lowercases ASCII,
//   - passes any other codepoint through unchanged (so non-Latin blocklist
//     terms still match, and visible symbols still act as word separators).
//
// This is not a full UTS#39 skeleton — it covers the abuse vectors that show
// up in practice without shipping the entire Unicode confusables table.

namespace ufold {

// Decode one codepoint from the NUL-terminated byte string at `s`.
//
// On return *len is the number of bytes consumed (always >= 1). A multi-byte
// sequence is accepted only when it is well-formed UTF-8: valid continuation
// bytes, shortest form (no overlong encodings), not a UTF-16 surrogate, and
// not above U+10FFFF. Anything else is "malformed": *len is 1 and the lead
// byte itself is returned, so the caller always makes progress.
//
// Rejecting overlong forms matters for the filter: C0 80, E0 80 80 and
// F0 80 80 80 would otherwise decode to U+0000, which foldUtf8() would emit
// as a string terminator and thereby hide the rest of the message from the
// blocklist match.
//
// The continuation checks short-circuit on the first non-continuation byte,
// so a sequence cut short by the terminating NUL is never read past it.
static inline uint32_t utf8Next(const uint8_t* s, int* len) {
  const uint8_t lead = s[0];
  if (lead < 0x80) {
    *len = 1;
    return lead;
  }

  uint32_t cp = 0;
  int used = 0;  // stays 0 unless a well-formed sequence is recognised
  if ((lead & 0xE0) == 0xC0) {
    if ((s[1] & 0xC0) == 0x80) {
      cp = (static_cast<uint32_t>(lead & 0x1F) << 6) | (s[1] & 0x3F);
      if (cp >= 0x80) used = 2;  // below 0x80 would be an overlong form
    }
  } else if ((lead & 0xF0) == 0xE0) {
    if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80) {
      cp = (static_cast<uint32_t>(lead & 0x0F) << 12) |
           (static_cast<uint32_t>(s[1] & 0x3F) << 6) | (s[2] & 0x3F);
      const bool surrogate = (cp >= 0xD800 && cp <= 0xDFFF);
      if (cp >= 0x800 && !surrogate) used = 3;
    }
  } else if ((lead & 0xF8) == 0xF0) {
    if ((s[1] & 0xC0) == 0x80 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
      cp = (static_cast<uint32_t>(lead & 0x07) << 18) |
           (static_cast<uint32_t>(s[1] & 0x3F) << 12) |
           (static_cast<uint32_t>(s[2] & 0x3F) << 6) | (s[3] & 0x3F);
      if (cp >= 0x10000 && cp <= 0x10FFFF) used = 4;
    }
  }

  if (used == 0) {  // malformed: hand back the raw byte
    *len = 1;
    return lead;
  }
  *len = used;
  return cp;
}

// True for invisible / formatting / combining codepoints that are stripped
// before matching, so they cannot be used to split a blocked word.
static inline bool isDrop(uint32_t cp) {
  struct Range {
    uint32_t lo;
    uint32_t hi;
  };
  static const Range kDropped[] = {
      {0x00AD, 0x00AD},    // soft hyphen
      {0x0300, 0x036F},    // combining diacritical marks
      {0x061C, 0x061C},    // Arabic letter mark
      {0x180B, 0x180F},    // Mongolian free variation selectors, vowel separator
      {0x1AB0, 0x1AFF},    // combining marks extended
      {0x1DC0, 0x1DFF},    // combining marks supplement
      {0x200B, 0x200F},    // ZW space/joiners, LRM/RLM
      {0x202A, 0x202E},    // bidi embeddings/overrides
      {0x2060, 0x206F},    // word joiner, invisible ops, bidi isolates, deprecated format chars
      {0x20D0, 0x20FF},    // combining marks for symbols
      {0xFE00, 0xFE0F},    // variation selectors
      {0xFE20, 0xFE2F},    // combining half marks
      {0xFEFF, 0xFEFF},    // BOM / ZW no-break space
      {0x1F3FB, 0x1F3FF},  // emoji skin-tone modifiers
      {0xE0000, 0xE0FFF},  // tags, variation selectors supplement (default-ignorable block)
  };
  for (const Range& r : kDropped) {
    if (cp >= r.lo && cp <= r.hi) return true;
  }
  return false;
}

// Base letter for U+0100..U+017F (Latin Extended-A); 0 outside that block.
static inline char foldLatinExtA(uint32_t cp) {
  // One entry per codepoint, already lowercase.
  static const char kBase[] =
      "aaaaaa"        // 0100-0105
      "cccccccc"      // 0106-010D
      "dddd"          // 010E-0111
      "eeeeeeeeee"    // 0112-011B
      "gggggggg"      // 011C-0123
      "hhhh"          // 0124-0127
      "iiiiiiiiii"    // 0128-0131
      "ii"            // 0132-0133
      "jj"            // 0134-0135
      "kkk"           // 0136-0138
      "llllllllll"    // 0139-0142
      "nnnnnnnnn"     // 0143-014B
      "oooooo"        // 014C-0151
      "oo"            // 0152-0153
      "rrrrrr"        // 0154-0159
      "ssssssss"      // 015A-0161
      "tttttt"        // 0162-0167
      "uuuuuuuuuuuu"  // 0168-0173
      "ww"            // 0174-0175
      "yyy"           // 0176-0178
      "zzzzzz"        // 0179-017E
      "s";            // 017F
  static_assert(sizeof(kBase) == 0x80 + 1, "Latin Extended-A table must cover U+0100..U+017F");
  if (cp < 0x0100 || cp > 0x017F) return 0;
  return kBase[cp - 0x0100];
}

// Base letter for the accented letters in U+00C0..U+00FF (Latin-1 Supplement);
// 0 for the two non-letters in that range (× and ÷) and for anything outside it.
static inline char foldLatin1(uint32_t cp) {
  if (cp < 0xC0 || cp > 0xFF) return 0;
  if (cp == 0xDF) return 's';  // ß: no uppercase counterpart in this block
  if (cp == 0xFF) return 'y';  // ÿ: its uppercase form is outside this block

  // The lowercase row E0..FE mirrors the uppercase row C0..DE at +0x20.
  const uint32_t u = cp & ~0x20u;
  if (u <= 0xC6) return 'a';
  if (u == 0xC7) return 'c';
  if (u <= 0xCB) return 'e';
  if (u <= 0xCF) return 'i';
  if (u == 0xD0) return 'd';
  if (u == 0xD1) return 'n';
  if (u <= 0xD6) return 'o';
  if (u == 0xD7) return 0;  // × / ÷
  if (u == 0xD8) return 'o';
  if (u <= 0xDC) return 'u';
  if (u == 0xDD) return 'y';
  return 't';  // u == 0xDE
}

// Mathematical Alphanumeric Symbols: styled Latin letters and digits.
static inline char foldMathAlnum(uint32_t cp) {
  // Five styles of 0-9, back to back.
  if (cp >= 0x1D7CE && cp <= 0x1D7FF) return static_cast<char>('0' + ((cp - 0x1D7CE) % 10));
  // Thirteen letter styles, each 52 wide (A-Z then a-z), laid out contiguously
  // from U+1D400 through U+1D6A3.
  if (cp >= 0x1D400 && cp <= 0x1D6A3) {
    const uint32_t within_style = (cp - 0x1D400) % 52;
    return static_cast<char>('a' + (within_style % 26));
  }
  if (cp == 0x1D6A4) return 'i';  // italic dotless i
  if (cp == 0x1D6A5) return 'j';  // italic dotless j
  return 0;
}

// Enclosed Alphanumerics (U+2460 block): circled / parenthesized letters and digits.
static inline char foldEnclosed(uint32_t cp) {
  if (cp >= 0x249C && cp <= 0x24B5) return static_cast<char>('a' + (cp - 0x249C));  // parenthesized small
  if (cp >= 0x24B6 && cp <= 0x24CF) return static_cast<char>('a' + (cp - 0x24B6));  // circled capital
  if (cp >= 0x24D0 && cp <= 0x24E9) return static_cast<char>('a' + (cp - 0x24D0));  // circled small
  if (cp >= 0x2460 && cp <= 0x2468) return static_cast<char>('1' + (cp - 0x2460));  // circled 1-9
  if (cp == 0x24EA) return '0';                                                     // circled 0
  return 0;
}

// Enclosed Alphanumeric Supplement (U+1F100 block), including regional indicators.
static inline char foldEnclosedSupp(uint32_t cp) {
  if (cp >= 0x1F110 && cp <= 0x1F129) return static_cast<char>('a' + (cp - 0x1F110));  // parenthesized
  if (cp >= 0x1F130 && cp <= 0x1F149) return static_cast<char>('a' + (cp - 0x1F130));  // squared
  if (cp >= 0x1F150 && cp <= 0x1F169) return static_cast<char>('a' + (cp - 0x1F150));  // negative circled
  if (cp >= 0x1F170 && cp <= 0x1F189) return static_cast<char>('a' + (cp - 0x1F170));  // negative squared
  if (cp >= 0x1F1E6 && cp <= 0x1F1FF) return static_cast<char>('a' + (cp - 0x1F1E6));  // regional indicators
  return 0;
}

// Letterlike Symbols (U+2100 block): script / fraktur / double-struck letters.
static inline char foldLetterlike(uint32_t cp) {
  switch (cp) {
    case 0x2102: case 0x212D: return 'c';
    case 0x210A: return 'g';
    case 0x210B: case 0x210C: case 0x210D: case 0x210E: case 0x210F: return 'h';
    case 0x2110: case 0x2111: return 'i';
    case 0x2112: case 0x2113: return 'l';
    case 0x2115: return 'n';
    case 0x2118: case 0x2119: return 'p';
    case 0x211A: return 'q';
    case 0x211B: case 0x211C: case 0x211D: return 'r';
    case 0x2124: return 'z';
    case 0x212C: return 'b';
    case 0x212F: case 0x2130: return 'e';
    case 0x2131: return 'f';
    case 0x2133: return 'm';
    case 0x2134: return 'o';
    default: return 0;
  }
}

// Cyrillic letters commonly used as Latin look-alikes (capital, small).
static inline char foldCyrillic(uint32_t cp) {
  switch (cp) {
    case 0x0410: case 0x0430: return 'a';  // А а
    case 0x0412: case 0x0432: return 'b';  // В в
    case 0x0415: case 0x0435: return 'e';  // Е е
    case 0x0405: case 0x0455: return 's';  // Ѕ ѕ
    case 0x0406: case 0x0456: return 'i';  // І і
    case 0x0408: case 0x0458: return 'j';  // Ј ј
    case 0x041A: case 0x043A: return 'k';  // К к
    case 0x041C: case 0x043C: return 'm';  // М м
    case 0x041D: case 0x043D: return 'h';  // Н н
    case 0x041E: case 0x043E: return 'o';  // О о
    case 0x0420: case 0x0440: return 'p';  // Р р
    case 0x0421: case 0x0441: return 'c';  // С с
    case 0x0422: case 0x0442: return 't';  // Т т
    case 0x0423: case 0x0443: return 'y';  // У у
    case 0x0425: case 0x0445: return 'x';  // Х х
    default: return 0;
  }
}

// Greek letters commonly used as Latin look-alikes.
static inline char foldGreek(uint32_t cp) {
  switch (cp) {
    case 0x0391: return 'a';
    case 0x0392: return 'b';
    case 0x0395: return 'e';
    case 0x0396: return 'z';
    case 0x0397: return 'h';
    case 0x0399: return 'i';
    case 0x039A: return 'k';
    case 0x039C: return 'm';
    case 0x039D: return 'n';
    case 0x039F: return 'o';
    case 0x03A1: return 'p';
    case 0x03A4: return 't';
    case 0x03A5: return 'y';
    case 0x03A7: return 'x';
    case 0x03B9: return 'i';
    case 0x03BF: return 'o';
    case 0x03C1: return 'p';
    default: return 0;
  }
}

// Fold one codepoint to a lowercase ASCII letter/digit, or 0 if not foldable.
static inline char foldLetter(uint32_t cp) {
  if (cp >= 0xFF21 && cp <= 0xFF3A) return static_cast<char>('a' + (cp - 0xFF21));  // fullwidth A-Z
  if (cp >= 0xFF41 && cp <= 0xFF5A) return static_cast<char>('a' + (cp - 0xFF41));  // fullwidth a-z
  if (cp >= 0xFF10 && cp <= 0xFF19) return static_cast<char>('0' + (cp - 0xFF10));  // fullwidth 0-9
  if (cp >= 0x00C0 && cp <= 0x00FF) return foldLatin1(cp);
  if (cp >= 0x0100 && cp <= 0x017F) return foldLatinExtA(cp);
  if (cp >= 0x0391 && cp <= 0x03C9) return foldGreek(cp);
  if (cp >= 0x0400 && cp <= 0x04FF) return foldCyrillic(cp);
  if (cp >= 0x2100 && cp <= 0x214F) return foldLetterlike(cp);
  if (cp >= 0x2460 && cp <= 0x24FF) return foldEnclosed(cp);
  if (cp >= 0x1D400 && cp <= 0x1D7FF) return foldMathAlnum(cp);
  if (cp >= 0x1F100 && cp <= 0x1F1FF) return foldEnclosedSupp(cp);
  return 0;
}

// Fold a NUL-terminated UTF-8 string into a normalized lowercase buffer.
//
//   in        source string; a null pointer is treated as the empty string.
//   out       destination; always NUL-terminated on return when out_size > 0.
//   out_size  capacity of `out` in bytes, including the terminator. Output that
//             does not fit is truncated; a passed-through multi-byte codepoint
//             is written whole or not at all.
//
// A malformed byte (see utf8Next) is handled as the codepoint with the same
// numeric value, i.e. it goes through the same drop / fold / pass-through
// decisions as U+0080..U+00FF.
static inline void foldUtf8(const char* in, char* out, size_t out_size) {
  if (out == nullptr || out_size == 0) return;

  size_t o = 0;
  if (in != nullptr) {
    const uint8_t* s = reinterpret_cast<const uint8_t*>(in);
    while (*s != 0 && o + 1 < out_size) {
      int len = 1;
      const uint32_t cp = utf8Next(s, &len);
      if (cp < 0x80) {
        // Explicit A-Z test keeps the result independent of the C locale.
        const bool upper = (cp >= 'A' && cp <= 'Z');
        out[o++] = static_cast<char>(upper ? cp + ('a' - 'A') : cp);
      } else if (!isDrop(cp)) {
        const char folded = foldLetter(cp);
        if (folded != 0) {
          out[o++] = folded;
        } else {
          // Pass through unchanged, but never emit a partial sequence.
          if (o + static_cast<size_t>(len) >= out_size) break;
          for (int k = 0; k < len; k++) out[o++] = static_cast<char>(s[k]);
        }
      }
      s += len;
    }
  }
  out[o] = 0;
}

} // namespace ufold