如何更改 java 中的 UnicodeCategory?
How to change UnicodeCategory in java?
我正在尝试将此 C# 代码迁移到 Java。
是否可以用 Java 的正则表达式来实现这里对 UnicodeCategory 的判断,或者能否直接在 Java 中获取字符的 Unicode 类别?
// Normalization pass over preNormalizedString: every run of punctuation/symbol/separator
// characters becomes a single space, characters in the "ignored" categories are dropped,
// and everything else is copied. The result is then upper-cased and trimmed.
// NOTE(review): `builder` and `isPreviousSpaceChar` are declared outside this fragment
// (presumably a StringBuilder and a bool that starts as false) -- confirm against the full method.
foreach (var currentChar in preNormalizedString)
{
// Classification is done per UTF-16 char, so characters outside the BMP are seen as two
// separate surrogate chars rather than as one code point.
var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(currentChar);
//https://msdn.microsoft.com/query/dev14.query?appId=Dev14IDEF1&l=EN-US&k=k(System.Globalization.UnicodeCategory.LowercaseLetter);k(TargetFrameworkMoniker-.NETFramework,Version%3Dv4.6);k(DevLang-csharp)&rd=true
switch (unicodeCategory)
{
// Disabled here: these three mark categories are handled by the "ignored" group further down.
//case UnicodeCategory.NonSpacingMark:
//case UnicodeCategory.SpacingCombiningMark:
//case UnicodeCategory.EnclosingMark:
// Separator group: punctuation, symbols and space/line/paragraph separators.
case UnicodeCategory.DashPunctuation:
case UnicodeCategory.ConnectorPunctuation:
case UnicodeCategory.OpenPunctuation:
case UnicodeCategory.ClosePunctuation:
case UnicodeCategory.OtherPunctuation:
case UnicodeCategory.LineSeparator:
case UnicodeCategory.MathSymbol:
case UnicodeCategory.ModifierSymbol:
case UnicodeCategory.OtherSymbol:
case UnicodeCategory.SpaceSeparator:
case UnicodeCategory.ParagraphSeparator:
// Emit one space for the first separator of a run; the flag suppresses the rest of the run.
if (!isPreviousSpaceChar)
builder.Append(" ");
isPreviousSpaceChar = true;
break;
// Ignored group: nothing is appended and the separator flag is left untouched.
case UnicodeCategory.Control:
case UnicodeCategory.CurrencySymbol:
case UnicodeCategory.EnclosingMark:
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.SpacingCombiningMark:
case UnicodeCategory.InitialQuotePunctuation:
case UnicodeCategory.FinalQuotePunctuation:
case UnicodeCategory.Format:
case UnicodeCategory.ModifierLetter:
case UnicodeCategory.OtherNotAssigned:
case UnicodeCategory.PrivateUse:
case UnicodeCategory.Surrogate:
// Ignored characters.
break;
// Kept group: the letter/number cases are listed for readability only; they share the
// default branch, so any category not named above is kept as well.
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
case UnicodeCategory.LetterNumber:
case UnicodeCategory.DecimalDigitNumber:
case UnicodeCategory.OtherLetter:
case UnicodeCategory.OtherNumber:
case UnicodeCategory.TitlecaseLetter:
default:
builder.Append(currentChar);
// A kept character ends the current separator run, so the next separator emits a space again.
isPreviousSpaceChar = false;
break;
}
}
// NOTE(review): if builder is a StringBuilder, ToString() never returns null and the
// ?? fallback is never taken -- confirm the declared type.
var normalizedString = builder.ToString() ?? string.Empty;
// ToUpper() without arguments upper-cases using the current culture.
normalizedString = normalizedString.ToUpper();
// Removes the leading/trailing space left by separators at either end of the input.
normalizedString = normalizedString.Trim();
return normalizedString;
Java 中有一个 Character.getType(char) 方法,它会返回一个 int,然后您可以将其与 Java 文档中列出的常量(例如 Character.DASH_PUNCTUATION)进行比较。
请注意,C# 代码和 getType(char) 都是“错误的”,因为它们不支持非 BMP 字符(即需要用两个 char 表示的字符)。不过在 .NET Core 3.0 之前,在 C# 中把字符串拆分成“符文”(Rune)要稍微复杂一些。
/**
 * Normalizes a string by Unicode general category: every run of punctuation, symbol and
 * separator characters is collapsed into a single space, control/format/mark/currency and
 * similar characters are dropped, and all remaining characters (letters, digits, ...) are kept.
 * The result is upper-cased and trimmed.
 *
 * <p>Classification is done per UTF-16 {@code char} via {@link Character#getType(char)}, so
 * characters outside the BMP are seen as two surrogate chars and are dropped.
 *
 * @param preNormalizedString the text to normalize; must not be {@code null}
 * @return the normalized, upper-cased, trimmed text (possibly empty)
 * @throws NullPointerException if {@code preNormalizedString} is {@code null}
 */
public static String convert(String preNormalizedString) {
    StringBuilder builder = new StringBuilder();
    boolean isPreviousSpaceChar = false;
    for (int i = 0; i < preNormalizedString.length(); i++) {
        char currentChar = preNormalizedString.charAt(i);
        int unicodeCategory = Character.getType(currentChar);
        switch (unicodeCategory) {
            // Separator group: a whole run of these yields exactly one space.
            case Character.DASH_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.LINE_SEPARATOR:
            case Character.MATH_SYMBOL:
            case Character.MODIFIER_SYMBOL:
            case Character.OTHER_SYMBOL:
            case Character.SPACE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
                if (!isPreviousSpaceChar) {
                    builder.append(" ");
                }
                isPreviousSpaceChar = true;
                break;
            // Ignored group: dropped without touching the separator flag.
            case Character.CONTROL:
            case Character.CURRENCY_SYMBOL:
            case Character.ENCLOSING_MARK:
            case Character.NON_SPACING_MARK:
            case Character.COMBINING_SPACING_MARK:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.FORMAT:
            case Character.MODIFIER_LETTER:
            case Character.UNASSIGNED:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
                // Ignored characters.
                break;
            // Kept group: listed for readability; any category not named above is kept too.
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.LETTER_NUMBER:
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.OTHER_LETTER:
            case Character.OTHER_NUMBER:
            case Character.TITLECASE_LETTER:
            default:
                builder.append(currentChar);
                // A kept character ends the separator run; without this reset only the
                // first separator run in the whole string would ever produce a space.
                isPreviousSpaceChar = false;
                break;
        }
    }
    String normalizedString = builder.toString();
    normalizedString = normalizedString.toUpperCase();
    // Only plain spaces can be at the ends here, so trim() is sufficient.
    normalizedString = normalizedString.trim();
    return normalizedString;
}
我正在尝试将此 C# 代码迁移到 Java。
是否可以用 Java 的正则表达式来实现这里对 UnicodeCategory 的判断,或者能否直接在 Java 中获取字符的 Unicode 类别?
// Normalization pass over preNormalizedString: every run of punctuation/symbol/separator
// characters becomes a single space, characters in the "ignored" categories are dropped,
// and everything else is copied. The result is then upper-cased and trimmed.
// NOTE(review): `builder` and `isPreviousSpaceChar` are declared outside this fragment
// (presumably a StringBuilder and a bool that starts as false) -- confirm against the full method.
foreach (var currentChar in preNormalizedString)
{
// Classification is done per UTF-16 char, so characters outside the BMP are seen as two
// separate surrogate chars rather than as one code point.
var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(currentChar);
//https://msdn.microsoft.com/query/dev14.query?appId=Dev14IDEF1&l=EN-US&k=k(System.Globalization.UnicodeCategory.LowercaseLetter);k(TargetFrameworkMoniker-.NETFramework,Version%3Dv4.6);k(DevLang-csharp)&rd=true
switch (unicodeCategory)
{
// Disabled here: these three mark categories are handled by the "ignored" group further down.
//case UnicodeCategory.NonSpacingMark:
//case UnicodeCategory.SpacingCombiningMark:
//case UnicodeCategory.EnclosingMark:
// Separator group: punctuation, symbols and space/line/paragraph separators.
case UnicodeCategory.DashPunctuation:
case UnicodeCategory.ConnectorPunctuation:
case UnicodeCategory.OpenPunctuation:
case UnicodeCategory.ClosePunctuation:
case UnicodeCategory.OtherPunctuation:
case UnicodeCategory.LineSeparator:
case UnicodeCategory.MathSymbol:
case UnicodeCategory.ModifierSymbol:
case UnicodeCategory.OtherSymbol:
case UnicodeCategory.SpaceSeparator:
case UnicodeCategory.ParagraphSeparator:
// Emit one space for the first separator of a run; the flag suppresses the rest of the run.
if (!isPreviousSpaceChar)
builder.Append(" ");
isPreviousSpaceChar = true;
break;
// Ignored group: nothing is appended and the separator flag is left untouched.
case UnicodeCategory.Control:
case UnicodeCategory.CurrencySymbol:
case UnicodeCategory.EnclosingMark:
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.SpacingCombiningMark:
case UnicodeCategory.InitialQuotePunctuation:
case UnicodeCategory.FinalQuotePunctuation:
case UnicodeCategory.Format:
case UnicodeCategory.ModifierLetter:
case UnicodeCategory.OtherNotAssigned:
case UnicodeCategory.PrivateUse:
case UnicodeCategory.Surrogate:
// Ignored characters.
break;
// Kept group: the letter/number cases are listed for readability only; they share the
// default branch, so any category not named above is kept as well.
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
case UnicodeCategory.LetterNumber:
case UnicodeCategory.DecimalDigitNumber:
case UnicodeCategory.OtherLetter:
case UnicodeCategory.OtherNumber:
case UnicodeCategory.TitlecaseLetter:
default:
builder.Append(currentChar);
// A kept character ends the current separator run, so the next separator emits a space again.
isPreviousSpaceChar = false;
break;
}
}
// NOTE(review): if builder is a StringBuilder, ToString() never returns null and the
// ?? fallback is never taken -- confirm the declared type.
var normalizedString = builder.ToString() ?? string.Empty;
// ToUpper() without arguments upper-cases using the current culture.
normalizedString = normalizedString.ToUpper();
// Removes the leading/trailing space left by separators at either end of the input.
normalizedString = normalizedString.Trim();
return normalizedString;
Java 中有一个 Character.getType(char) 方法,它会返回一个 int,然后您可以将其与 Java 文档中列出的常量(例如 Character.DASH_PUNCTUATION)进行比较。
请注意,C# 代码和 getType(char) 都是“错误的”,因为它们不支持非 BMP 字符(即需要用两个 char 表示的字符)。不过在 .NET Core 3.0 之前,在 C# 中把字符串拆分成“符文”(Rune)要稍微复杂一些。
/**
 * Normalizes a string by Unicode general category: every run of punctuation, symbol and
 * separator characters is collapsed into a single space, control/format/mark/currency and
 * similar characters are dropped, and all remaining characters (letters, digits, ...) are kept.
 * The result is upper-cased and trimmed.
 *
 * <p>Classification is done per UTF-16 {@code char} via {@link Character#getType(char)}, so
 * characters outside the BMP are seen as two surrogate chars and are dropped.
 *
 * @param preNormalizedString the text to normalize; must not be {@code null}
 * @return the normalized, upper-cased, trimmed text (possibly empty)
 * @throws NullPointerException if {@code preNormalizedString} is {@code null}
 */
public static String convert(String preNormalizedString) {
    StringBuilder builder = new StringBuilder();
    boolean isPreviousSpaceChar = false;
    for (int i = 0; i < preNormalizedString.length(); i++) {
        char currentChar = preNormalizedString.charAt(i);
        int unicodeCategory = Character.getType(currentChar);
        switch (unicodeCategory) {
            // Separator group: a whole run of these yields exactly one space.
            case Character.DASH_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.LINE_SEPARATOR:
            case Character.MATH_SYMBOL:
            case Character.MODIFIER_SYMBOL:
            case Character.OTHER_SYMBOL:
            case Character.SPACE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
                if (!isPreviousSpaceChar) {
                    builder.append(" ");
                }
                isPreviousSpaceChar = true;
                break;
            // Ignored group: dropped without touching the separator flag.
            case Character.CONTROL:
            case Character.CURRENCY_SYMBOL:
            case Character.ENCLOSING_MARK:
            case Character.NON_SPACING_MARK:
            case Character.COMBINING_SPACING_MARK:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
            case Character.FORMAT:
            case Character.MODIFIER_LETTER:
            case Character.UNASSIGNED:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
                // Ignored characters.
                break;
            // Kept group: listed for readability; any category not named above is kept too.
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.LETTER_NUMBER:
            case Character.DECIMAL_DIGIT_NUMBER:
            case Character.OTHER_LETTER:
            case Character.OTHER_NUMBER:
            case Character.TITLECASE_LETTER:
            default:
                builder.append(currentChar);
                // A kept character ends the separator run; without this reset only the
                // first separator run in the whole string would ever produce a space.
                isPreviousSpaceChar = false;
                break;
        }
    }
    String normalizedString = builder.toString();
    normalizedString = normalizedString.toUpperCase();
    // Only plain spaces can be at the ends here, so trim() is sufficient.
    normalizedString = normalizedString.trim();
    return normalizedString;
}