如何更改 java 中的 UnicodeCategory?

How to change UnicodeCategory in java?

我正在尝试将此 C# 代码迁移到 Java。

能否在 Java 中用正则表达式来实现这段基于 UnicodeCategory 的判断,或者 Java 能否直接获取字符的 Unicode 类别?

// Classify every UTF-16 code unit of the input by its Unicode general category and
// either collapse it into a single space, drop it, or copy it to the output.
// Category reference:
// https://msdn.microsoft.com/query/dev14.query?appId=Dev14IDEF1&l=EN-US&k=k(System.Globalization.UnicodeCategory.LowercaseLetter);k(TargetFrameworkMoniker-.NETFramework,Version%3Dv4.6);k(DevLang-csharp)&rd=true
foreach (var ch in preNormalizedString)
{
    switch (CharUnicodeInfo.GetUnicodeCategory(ch))
    {
        // Ignored characters: contribute nothing and leave the space-run state untouched.
        case UnicodeCategory.Control:
        case UnicodeCategory.Format:
        case UnicodeCategory.Surrogate:
        case UnicodeCategory.PrivateUse:
        case UnicodeCategory.OtherNotAssigned:
        case UnicodeCategory.CurrencySymbol:
        case UnicodeCategory.ModifierLetter:
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.SpacingCombiningMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.InitialQuotePunctuation:
        case UnicodeCategory.FinalQuotePunctuation:
            break;

        // Separators, punctuation and symbols: a whole run of them yields one space.
        case UnicodeCategory.SpaceSeparator:
        case UnicodeCategory.LineSeparator:
        case UnicodeCategory.ParagraphSeparator:
        case UnicodeCategory.DashPunctuation:
        case UnicodeCategory.ConnectorPunctuation:
        case UnicodeCategory.OpenPunctuation:
        case UnicodeCategory.ClosePunctuation:
        case UnicodeCategory.OtherPunctuation:
        case UnicodeCategory.MathSymbol:
        case UnicodeCategory.ModifierSymbol:
        case UnicodeCategory.OtherSymbol:
            if (!isPreviousSpaceChar)
            {
                builder.Append(" ");
                isPreviousSpaceChar = true;
            }
            break;

        // Everything else (letters, digits, letter/other numbers) is kept verbatim
        // and ends the current run of separators.
        default:
            builder.Append(ch);
            isPreviousSpaceChar = false;
            break;
    }
}

// StringBuilder.ToString() never returns null, so no fallback value is required.
// NOTE(review): ToUpper() uses the current culture — confirm that is intended for this normalization.
return builder.ToString().ToUpper().Trim();

Java 提供了 Character.getType(char) 方法,它返回一个 int,您可以将该返回值与 Java 文档中列出的 Character 类别常量进行比较。

请注意,C# 代码和 getType(char) 都是“错误的”,因为它们不支持非 BMP 字符(即需要用两个 char 组成代理对来表示的字符)。不过在 .NET Core 3.0 之前,要在 C# 中把字符串拆分为“符文”(Rune)会稍微复杂一些。

/**
 * Normalizes a string by Unicode general category, mirroring the C# routine it was ported from.
 *
 * <p>Each UTF-16 {@code char} of the input is classified with {@link Character#getType(char)}:
 * <ul>
 *   <li>separators, punctuation and symbols are replaced by a space, with consecutive
 *       occurrences collapsed into a single space;</li>
 *   <li>control, format, currency, combining-mark, quote, modifier-letter, unassigned,
 *       private-use and surrogate characters are dropped;</li>
 *   <li>everything else (letters and numbers) is copied unchanged.</li>
 * </ul>
 * The result is then upper-cased and trimmed.
 *
 * <p>Because the input is processed per {@code char}, both halves of a surrogate pair are
 * classified as {@code SURROGATE} and dropped, so supplementary characters never reach the output.
 *
 * @param preNormalizedString the text to normalize; must not be {@code null}
 * @return the normalized, upper-cased and trimmed text (possibly empty)
 * @throws NullPointerException if {@code preNormalizedString} is {@code null}
 */
public static String convert(String preNormalizedString) {
    StringBuilder builder = new StringBuilder(preNormalizedString.length());
    boolean isPreviousSpaceChar = false;

    for (int i = 0; i < preNormalizedString.length(); i++) {
        char currentChar = preNormalizedString.charAt(i);

        switch (Character.getType(currentChar)) {
            // Separator-like characters: emit at most one space per run.
            case Character.DASH_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.LINE_SEPARATOR:
            case Character.MATH_SYMBOL:
            case Character.MODIFIER_SYMBOL:
            case Character.OTHER_SYMBOL:
            case Character.SPACE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
                if (!isPreviousSpaceChar) {
                    builder.append(' ');
                }
                isPreviousSpaceChar = true;
                break;

            // Ignored characters: dropped without affecting the current space run.
            case Character.CONTROL:
            case Character.CURRENCY_SYMBOL:
            case Character.ENCLOSING_MARK:
            case Character.NON_SPACING_MARK:
            case Character.COMBINING_SPACING_MARK:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.FORMAT:
            case Character.MODIFIER_LETTER:
            case Character.UNASSIGNED:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
                break;

            // Letters and numbers are kept as-is.
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.LETTER_NUMBER:
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.OTHER_LETTER:
            case Character.OTHER_NUMBER:
            case Character.TITLECASE_LETTER:
            default:
                builder.append(currentChar);
                // A kept character ends the separator run, so the next separator emits a space again.
                isPreviousSpaceChar = false;
                break;
        }
    }

    return builder.toString().toUpperCase().trim();
}