在JavaScript中使用UTF-8 Fancytext转换为字母数字

rta7y2nd  于 2023-01-16  发布在  Java
关注(0)|答案(1)|浏览(81)

我有一个接受UTF-8的HTML用户名输入。用户可以添加“fancytext”到他们的名字,如(🅰𐌀等),使其更具风格。但是,我想做一些解析与他们的名字客户端(例如,搜索和查找单词的名称)。
在JavaScript中,有没有一种方法可以在不使用大数据集的情况下将花哨的文本转换为它们的字母数字值?"🅰""Ⓐ""𐌀""ᗩ"都将导致"a"?(不可转换的值返回false或其他)。
我的一些测试表明,字符和toCharCode之间存在模式。

var str = "ⒶⒷⒸⒹⒺⒻⒼABCDEFG";
for (var i = 0; i < str.length; i++)
    console.log(str.charCodeAt(i))

// Prints 9398 9399 9400 9401 9402 9403 9404 65313 65314 65315 65316 65317 65318 65319

但是,我无法确认这是否适用于所有字符,因为我无法获得多字节字符的charCode。

var encodings = "🅰🆉ⒶⓏⓐⓩ𐌀乙ΛẒαzAZazᗩƵΔŽคzαzάžÃŻĄȤ𝔞𝔷𝓪𝔃𝒶𝓏𝓐𝓩𝐀𝐙𝐚𝐳𝔸ℤ𝕒𝕫ᵃᶻɐz"

for (var i = 0; i < encodings.length; i++) {

    console.log(encodings.charCodeAt(i), encodings.charAt(i))
}
/*
    Prints:

    55356 � 56688 � 55356 � 56713 � 9398 Ⓐ 9423 Ⓩ 9424 ⓐ 9449 ⓩ 55296 � 57088 � 20057 乙 923 Λ 7826 Ẓ 945 α 122 z 65313 A 65338 Z 65345 a 65370 z 5609 ᗩ 437 Ƶ 916 Δ 381 Ž 3588 ค 122 z 945 α 122 z 940 ά 382 ž 195 Ã 379 Ż 260 Ą 548 Ȥ 55349 � 56606 � 55349 � 56631 � 55349 � 56554 � 55349 � 56579 � 55349 � 56502 � 55349 � 56527 � 55349 � 56528 � 55349 � 56553 � 55349 � 56320 � 55349 � 56345 � 55349 � 56346 � 55349 � 56371 � 55349 � 56632 � 8484 ℤ 55349 � 56658 � 55349 � 56683 � 7491 ᵃ 7611 ᶻ 592 ɐ 122 z
*/

如果可以正确读取多字节字符,则这是一个可行的解决方案:

function getAlphanumericalFromFancytext(char) {
    var encodings = "🅰🆉ⒶⓏⓐⓩ𐌀乙ΛẒαzAZazᗩƵΔŽคzαzάžÃŻĄȤ𝔞𝔷𝓪𝔃𝒶𝓏𝓐𝓩𝐀𝐙𝐚𝐳𝔸ℤ𝕒𝕫ᵃᶻɐz"

    var code = getCharCodeMultibyte(char, 0); // getCharCodeMultibyte(string, index) gets charCode for utf8 multibyte characters

    for (var i = 0; i < encodings.length; i += 2) {
        var startCode = getCharCodeMultibyte(encodings, i),
            endCode = getCharCodeMultibyte(encodings, i+1);

        if (code >= startCode && code <= endCode) {
            return String.fromCharCode(code - startCode + "a".charCodeAt(0));
        }
    }
    return false;
}

我从Mozilla文档中找到了一种读取多字节字符的方法,但是它仍然返回不正确的结果。

function getCharCodeMultibyte(str, idx) {
    // For example, fixedCharCodeAt('\uD800\uDC00', 0); // 65536
    // For example, fixedCharCodeAt('\uD800\uDC00', 1); // false
    idx = idx || 0;
    var code = str.charCodeAt(idx);
    var hi, low;

    // High surrogate (could change last hex to 0xDB7F
    // to treat high private surrogates
    // as single characters)
    if (0xD800 <= code && code <= 0xDBFF) {
        hi = code;
        low = str.charCodeAt(idx + 1);
        if (isNaN(low)) {
            throw 'High surrogate not followed by ' +
                'low surrogate in fixedCharCodeAt()';
        }
        return ((hi - 0xD800) * 0x400) +
                (low - 0xDC00) + 0x10000;
    }
    if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate
        // We return false to allow loops to skip
        // this iteration since should have already handled
        // high surrogate above in the previous iteration
        return false;
        // hi = str.charCodeAt(idx - 1);
        // low = code;
        // return ((hi - 0xD800) * 0x400) +
        //   (low - 0xDC00) + 0x10000;
    }
    return code;
}

function getAlphanumericalFromFancytext(char) {
    var encodings = ["🅰", "🆉", "Ⓐ", "Ⓩ", "ⓐ", "ⓩ", "𐌀", "乙", "Λ", "Ẓ", "α", "z", "A", "Z", "a", "z", "ᗩ", "Ƶ", "Δ", "Ž", "ค", "z", "α", "z", "ά", "ž", "Ã", "Ż", "Ą", "Ȥ", "𝔞", "𝔷", "𝓪", "𝔃", "𝒶", "𝓏", "𝓐", "𝓩", "𝐀", "𝐙", "𝐚", "𝐳", "𝔸", "ℤ", "𝕒", "𝕫", "ᵃ", "ᶻ", "ɐ", "z"]

    var code = getCharCodeMultibyte(char, 0); // getCharCodeMultibyte(string, index) gets charCode for utf8 multibyte characters

    for (var i = 0; i < encodings.length; i += 2) {
        var startCode = getCharCodeMultibyte(encodings[i], 0),
            endCode = getCharCodeMultibyte(encodings[i + 1], 0);

        if (code >= startCode && code <= endCode) {
            //console.log(i + "|" + String.fromCharCode(code - startCode + "a".charCodeAt(0)) + "|" + code + "|" + startCode + "|" + endCode)
            return String.fromCharCode(code - startCode + "a".charCodeAt(0));
        }
    }
    return false;
}

var text = ["🅷", "Ⓔ", "ᒪ", "ᒪ", "𝓞"]
var output = [];

text.forEach((char) => {
    output.push(getAlphanumericalFromFancytext(char))
})
console.log(output.join("")) // Returns heᅰᅰo. "Λ", "Ẓ" is where it messes up.
bis0qfac

bis0qfac1#

基于您提供的结构,我能够构造这个简短的helper对象,基于无限数量的数据集将花哨的文本转换为普通的字母表,前提是要比较的字符总是1:1
就这样用吧

const ftp = new FancyTextParser();
const parsed = ftp.parse("คⓑⓒd");
console.log(parsed); //abcd

您必须在this.charset对象上手动添加自己的字符集,以确保每个数据集的字符位置与base字符集匹配。
当前代码的局限性是我不能考虑字母大小写,因为我的基本字符只能是小写或大写。
x一个一个一个一个x一个一个二个x

相关问题