用于匹配希伯来语单词的 Perl 5.34.0 正则表达式
Perl 5.34.0 regular expression for matching Hebrew-language words
我正在使用 Perl 5.34.0,我想知道输入是否只是希伯来字母和问号等一些符号。
尽管我找到了一个有很多开销的解决方案,但我想学习如何使用正则表达式更简单地完成它。
这是我没有正则表达式的解决方案。
首先,我在 Perl 模块中的常量哈希中定义了希伯来语字符。
#!perl
package Enums::Nikudletters;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();
our $VERSION = 1.0;
# Character categories used in the TYPE field of every NIKUDLETTERS entry.
# The (misspelled) string values are runtime data and are kept as they are.
use constant LETTER            => "LETTER";
use constant LOWER_PUNKTATION  => "LOWER_PUNKTATION";
use constant UPPER_PUNKTATION  => "UPPER_PUNKTATION";
use constant MIDDLE_PUNKTATION => "MIDDLE_PUNKTATION";

# NIKUDLETTERS: table of the Hebrew letters, vowel points and related marks
# this package knows about, keyed by a transliterated name.
#
# Every entry is a hash reference with these fields:
#   UTF8        - the character itself, built from its code point
#   CODE        - the code point as four upper-case hex digits, prefixed
#                 with a single space (the space acts as a separator when
#                 the codes are concatenated)
#   NAME        - transliterated display name
#   TYPE        - one of the four type constants above
#   WIDTH       - integer; 0 for every point/mark entry
#                 (NOTE(review): unit not visible here - confirm with the consumer)
#   HANDWRITING - 0/1 flag (NOTE(review): meaning not visible here - confirm)
use constant {
    NIKUDLETTERS => {
        AIN => {
            UTF8 => charnames::string_vianame("U+05E2"),
            CODE => " 05E2",
            NAME => "ain",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        ALEF => {
            UTF8 => charnames::string_vianame("U+05D0"),
            CODE => " 05D0",
            NAME => "alef",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        CHET => {
            UTF8 => charnames::string_vianame("U+05D7"),
            CODE => " 05D7",
            NAME => "chet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        DALET => {
            UTF8 => charnames::string_vianame("U+05D3"),
            CODE => " 05D3",
            NAME => "dalet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        GIMEL => {
            UTF8 => charnames::string_vianame("U+05D2"),
            CODE => " 05D2",
            NAME => "gimel",
            TYPE => LETTER,
            WIDTH => 11,
            HANDWRITING => 1,
        },
        GERESCH => {
            UTF8 => charnames::string_vianame("U+05F3"),
            CODE => " 05F3",
            NAME => "geresch",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 0,
        },
        GERSCHAYIM => {
            UTF8 => charnames::string_vianame("U+05F4"),
            CODE => " 05F4",
            NAME => "gerschayim",
            TYPE => LETTER,
            WIDTH => 14,
            HANDWRITING => 0,
        },
        HAEI => {
            UTF8 => charnames::string_vianame("U+05D4"),
            CODE => " 05D4",
            NAME => "häi",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        JOD => {
            UTF8 => charnames::string_vianame("U+05D9"),
            CODE => " 05D9",
            NAME => "jod",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 1,
        },
        KUF => {
            UTF8 => charnames::string_vianame("U+05E7"),
            CODE => " 05E7",
            NAME => "kuf",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        LAMED => {
            UTF8 => charnames::string_vianame("U+05DC"),
            CODE => " 05DC",
            NAME => "lamed",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 0,
        },
        RESCH => {
            UTF8 => charnames::string_vianame("U+05E8"),
            CODE => " 05E8",
            NAME => "resch",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        SSAIN => {
            UTF8 => charnames::string_vianame("U+05D6"),
            CODE => " 05D6",
            NAME => "ssain",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        SCHIN => {
            UTF8 => charnames::string_vianame("U+05E9"),
            CODE => " 05E9",
            NAME => "schin",
            TYPE => LETTER,
            WIDTH => 19,
            HANDWRITING => 1,
        },
        SSAMECH => {
            UTF8 => charnames::string_vianame("U+05E1"),
            CODE => " 05E1",
            NAME => "ssamech",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        SPACE => {
            UTF8 => charnames::string_vianame("U+0020"),
            CODE => " 0020",
            NAME => "space",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        NEWSPACE => {
            UTF8 => charnames::string_vianame("U+00A0"),
            CODE => " 00A0",
            NAME => "newspace",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        TAW => {
            UTF8 => charnames::string_vianame("U+05EA"),
            CODE => " 05EA",
            NAME => "taw",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        TET => {
            UTF8 => charnames::string_vianame("U+05D8"),
            CODE => " 05D8",
            NAME => "tet",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        BET => {
            UTF8 => charnames::string_vianame("U+05D1"),
            CODE => " 05D1",
            NAME => "bet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        WAW => {
            UTF8 => charnames::string_vianame("U+05D5"),
            CODE => " 05D5",
            NAME => "waw",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        ZADI => {
            UTF8 => charnames::string_vianame("U+05E6"),
            CODE => " 05E6",
            NAME => "zadi",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        ZADISSOFIT => {
            UTF8 => charnames::string_vianame("U+05E5"),
            CODE => " 05E5",
            NAME => "zadissofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        KAF => {
            UTF8 => charnames::string_vianame("U+05DB"),
            CODE => " 05DB",
            NAME => "kaf",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        CHAFSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DA"),
            CODE => " 05DA",
            NAME => "chafssofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        PAEI => {
            UTF8 => charnames::string_vianame("U+05E4"),
            CODE => " 05E4",
            NAME => "päi",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        FAEISSOFIT => {
            UTF8 => charnames::string_vianame("U+05E3"),
            CODE => " 05E3",
            NAME => "fäissofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        MEM => {
            UTF8 => charnames::string_vianame("U+05DE"),
            CODE => " 05DE",
            NAME => "mem",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        MEMSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DD"),
            CODE => " 05DD",
            NAME => "memssofit",
            TYPE => LETTER,
            WIDTH => 16,
            HANDWRITING => 1,
        },
        NUN => {
            UTF8 => charnames::string_vianame("U+05E0"),
            CODE => " 05E0",
            NAME => "nun",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 1,
        },
        NUNSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DF"),
            CODE => " 05DF",
            NAME => "nunssofit",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        SHEVA => {
            UTF8 => charnames::string_vianame("U+05B0"),
            CODE => " 05B0",
            NAME => "schwa = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_SEGOL => {
            UTF8 => charnames::string_vianame("U+05B1"),
            CODE => " 05B1",
            NAME => "chataf szegol = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_PATAH => {
            UTF8 => charnames::string_vianame("U+05B2"),
            CODE => " 05B2",
            NAME => "chataf patach = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_QAMATS => {
            UTF8 => charnames::string_vianame("U+05B3"),
            CODE => " 05B3",
            NAME => "chataf kamatz = o",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HIRIQ => {
            UTF8 => charnames::string_vianame("U+05B4"),
            CODE => " 05B4",
            NAME => "chirik = i",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        TSERE => {
            UTF8 => charnames::string_vianame("U+05B5"),
            CODE => " 05B5",
            NAME => "zeré = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SEGOL => {
            UTF8 => charnames::string_vianame("U+05B6"),
            CODE => " 05B6",
            NAME => "szegol = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        PATAH => {
            UTF8 => charnames::string_vianame("U+05B7"),
            CODE => " 05B7",
            NAME => "patach = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        QAMATS => {
            UTF8 => charnames::string_vianame("U+05B8"),
            CODE => " 05B8",
            NAME => "kamatz = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HOLAM => {
            UTF8 => charnames::string_vianame("U+05B9"),
            CODE => " 05B9",
            NAME => "cholam = o",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HOLAM_HASER => {
            UTF8 => charnames::string_vianame("U+05BA"),
            CODE => " 05BA",
            NAME => "cholam chaser",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        QUBUTS => {
            UTF8 => charnames::string_vianame("U+05BB"),
            CODE => " 05BB",
            NAME => "kubutz = u",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        DAGESH => {
            UTF8 => charnames::string_vianame("U+05BC"),
            CODE => " 05BC",
            NAME => "dagesch / schuruk",
            TYPE => MIDDLE_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        METEG => {
            UTF8 => charnames::string_vianame("U+05BD"),
            CODE => " 05BD",
            NAME => "meteg",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        MAQAF => {
            UTF8 => charnames::string_vianame("U+05BE"),
            CODE => " 05BE",
            NAME => "makaf",
            TYPE => LETTER,
            WIDTH => 14,
            HANDWRITING => 0,
        },
        RAFE => {
            UTF8 => charnames::string_vianame("U+05BF"),
            CODE => " 05BF",
            NAME => "rafi",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        PASEQ => {
            UTF8 => charnames::string_vianame("U+05C0"),
            CODE => " 05C0",
            NAME => "pasek",
            TYPE => LETTER,
            WIDTH => 4,
            HANDWRITING => 0,
        },
        SHIN_DOT => {
            UTF8 => charnames::string_vianame("U+05C1"),
            CODE => " 05C1",
            NAME => "schin Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SIN_DOT => {
            UTF8 => charnames::string_vianame("U+05C2"),
            CODE => " 05C2",
            NAME => "ssin Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SOF_PASUQ => {
            UTF8 => charnames::string_vianame("U+05C3"),
            CODE => " 05C3",
            NAME => "sof pasuk",
            TYPE => LETTER,
            WIDTH => 8,
            HANDWRITING => 0,
        },
        UPPER_DOT => {
            UTF8 => charnames::string_vianame("U+05C4"),
            CODE => " 05C4",
            NAME => "oberer Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        LOWER_DOT => {
            UTF8 => charnames::string_vianame("U+05C5"),
            CODE => " 05C5",
            NAME => "unterer Punkt",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HAFUKAH => {
            UTF8 => charnames::string_vianame("U+05C6"),
            CODE => " 05C6",
            NAME => "chafukach",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        QAMATS_QATAN => {
            UTF8 => charnames::string_vianame("U+05C7"),
            CODE => " 05C7",
            NAME => "kamatz katan",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        JIDDISH_DOUBLE_WAW => {
            UTF8 => charnames::string_vianame("U+05F0"),
            CODE => " 05F0",
            NAME => "waw waw",
            TYPE => LETTER,
            WIDTH => 19,
            HANDWRITING => 0,
        },
        JIDDISH_WAW_JOD => {
            UTF8 => charnames::string_vianame("U+05F1"),
            CODE => " 05F1",
            NAME => "waw jod",
            TYPE => LETTER,
            WIDTH => 20,
            HANDWRITING => 0,
        },
        JIDDISH_DOUBLE_JOD => {
            UTF8 => charnames::string_vianame("U+05F2"),
            CODE => " 05F2",
            NAME => "jod jod",
            TYPE => LETTER,
            WIDTH => 21,
            HANDWRITING => 0,
        },
    },
};
# Accessor for the NIKUDLETTERS table.
# Returns the hash reference held by the NIKUDLETTERS constant (not a copy).
sub get {
    my $table = NIKUDLETTERS;
    return $table;
}
# Builds one string containing the CODE field of every NIKUDLETTERS entry,
# concatenated in ascending code order (e.g. " 0020 00A0 05B0 ...").
#
# Takes no meaningful arguments; when called as a class method the invocant
# is ignored.
# Returns: the concatenated code string. Each CODE already starts with a
# space, so the codes stay separated in the result.
sub get_regular_expression_string
{
    my $letters = Enums::Nikudletters->NIKUDLETTERS;
    my @codes   = map { $_->{CODE} } values %{$letters};

    # The codes are fixed-width, upper-case hex strings, so a string sort
    # gives numeric order. A numeric sort (<=>) would mis-order them and
    # emit "isn't numeric" warnings, because " 05D0" is not a decimal number.
    return join('', sort { $a cmp $b } @codes);
}
1;
我也用同样的方式定义了一些符号:
#!perl
package Enums::Signletters;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();
our $VERSION = 1.0;
# SIGNLETTERS: punctuation and symbol characters accepted in addition to the
# Hebrew characters, keyed by their Unicode character name.
# Every entry holds one field, CODE: the code point as four upper-case hex
# digits prefixed with a single space (the space acts as a separator when the
# codes are concatenated).
use constant SIGNLETTERS => {
    QUESTION_MARK        => { CODE => " 003F" },    # ?
    EXCLAMATION_MARK     => { CODE => " 0021" },    # !
    FULL_STOP            => { CODE => " 002E" },    # .
    APOSTROPHE           => { CODE => " 0027" },    # '
    LEFT_PARENTHESIS     => { CODE => " 0028" },    # (
    RIGHT_PARENTHESIS    => { CODE => " 0029" },    # )
    COMMA                => { CODE => " 002C" },    # ,
    HYPHEN_MINUS         => { CODE => " 002D" },    # -
    QUOTATION_MARK       => { CODE => " 0022" },    # "
    SECTION_SIGN         => { CODE => " 00A7" },    # §
    DOLLAR_SIGN          => { CODE => " 0024" },    # $
    EURO_SIGN            => { CODE => " 20AC" },    # €
    PERCENT_SIGN         => { CODE => " 0025" },    # %
    SOLIDUS              => { CODE => " 002F" },    # /
    LEFT_SQUARE_BRACKET  => { CODE => " 005B" },    # [
    RIGHT_SQUARE_BRACKET => { CODE => " 005D" },    # ]
    LEFT_CURLY_BRACKET   => { CODE => " 007B" },    # {
    RIGHT_CURLY_BRACKET  => { CODE => " 007D" },    # }
    EQUALS_SIGN          => { CODE => " 003D" },    # =
    REVERSE_SOLIDUS      => { CODE => " 005C" },    # backslash
    ASTERISK             => { CODE => " 002A" },    # *
    PLUS_SIGN            => { CODE => " 002B" },    # +
    NUMBER_SIGN          => { CODE => " 0023" },    # #
    # U+003B; this entry previously carried 0023, a duplicate of NUMBER_SIGN.
    SEMICOLON            => { CODE => " 003B" },    # ;
    COLON                => { CODE => " 003A" },    # :
    LOW_LINE             => { CODE => " 005F" },    # _
    DEGREE_SIGN          => { CODE => " 00B0" },    # °
    CIRCUMFLEX_ACCENT    => { CODE => " 005E" },    # ^
    ACUTE_ACCENT         => { CODE => " 00B4" },    # ´
    GRAVE_ACCENT         => { CODE => " 0060" },    # `
    COMMERCIAL_AT        => { CODE => " 0040" },    # @
    MICRO_SIGN           => { CODE => " 00B5" },    # µ
    LESS_THAN_SIGN       => { CODE => " 003C" },    # <
    GREATER_THAN_SIGN    => { CODE => " 003E" },    # >
    VERTICAL_LINE        => { CODE => " 007C" },    # |
    AMPERSAND            => { CODE => " 0026" },    # &
};
# Builds one string containing the CODE field of every SIGNLETTERS entry
# except REVERSE_SOLIDUS, concatenated in ascending code order
# (e.g. " 0021 0022 0023 ...").
#
# Takes no meaningful arguments; when called as a class method the invocant
# is ignored.
# Returns: the concatenated code string. Each CODE already starts with a
# space, so the codes stay separated in the result.
sub get_regular_expression_string
{
    my $signs = Enums::Signletters->SIGNLETTERS;

    # NOTE(review): the backslash entry is deliberately left out, as in the
    # original code; the reason is not visible here - confirm before changing.
    my @codes = map  { $signs->{$_}{CODE} }
                grep { $_ ne "REVERSE_SOLIDUS" }
                keys %{$signs};

    # The codes are fixed-width, upper-case hex strings, so a string sort
    # gives numeric order. A numeric sort (<=>) would mis-order them and
    # emit "isn't numeric" warnings, because " 003F" is not a decimal number.
    return join('', sort { $a cmp $b } @codes);
}
1;
然后我可以将代码连接到我想要的任何表达式字符串中。像这样:
#!perl
package Helpers::UnicodeChecker;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use Enums::Nikudletters;
use Enums::Signletters;
our $VERSION = 1.0;
# Checks that every character of $hebrew is one of the characters listed in
# Enums::Nikudletters or Enums::Signletters.
#
# Parameters:
#   $self   - invocant (unused)
#   $hebrew - the character string to validate
# Returns: the string "true" when all characters are allowed (also for an
# empty string), otherwise the string "false". Both values are true in
# boolean context, so callers must compare with eq.
#
# Each character is looked up in the union of both tables. Previously
# characters up to U+00FF were checked only against the sign table and all
# others only against the Hebrew table, so the space and no-break space
# (listed in Nikudletters) and the euro sign (listed in Signletters) were
# always rejected.
sub is_valid_hebrew ($self, $hebrew)
{
    my $code_list = Enums::Nikudletters->get_regular_expression_string()
                  . Enums::Signletters->get_regular_expression_string();

    # The code list is a run of space-prefixed four-digit hex codes;
    # turn it into a lookup set.
    my %is_allowed = map { ($_ => 1) } split(' ', $code_list);

    foreach my $letter (split('', $hebrew))
    {
        my $code = sprintf('%04X', ord($letter));
        return "false" unless $is_allowed{$code};
    }
    return "true";
}
# Formats a code point as upper-case hexadecimal, zero-padded to at least
# four digits (0x05D0 -> "05D0").
#
# Parameters: $_[0] - the code point as an integer.
# Returns: the hex string; code points above U+FFFF yield more than four
# digits instead of being truncated.
#
# The former format string "\x{%04X}" sat in double quotes, so Perl parsed
# \x{...} as a hex escape and the format was destroyed before sprintf ran.
sub get_wide_code
{
    return sprintf('%04X', $_[0]);
}
# Formats a code point in the range 0..255 as four upper-case hex digits
# with leading zeros (0x3F -> "003F").
#
# Parameters: $_[0] - the code point as an integer.
# Returns: the hex string.
#
# The former format string "\x%02X" sat in double quotes, so Perl parsed \x
# as a hex escape and the result was malformed (e.g. "00F" for 0x3F).
sub get_ascii_code
{
    return sprintf('%04X', $_[0]);
}
1;
这可行,但涉及大量代码。有一个简短的正则表达式来完成相同的任务会很好。谁能提供正则表达式?
我在谷歌上搜索了很多,阅读了很多,尝试了很多,但我找不到适用于 Perl 5.34.0 的正则表达式。谢谢您的帮助。我刚学Perl。
您可以使用相对简单的模式匹配来完成此操作。
这里有趣的一点是 \p{Hebrew},它允许您匹配具有特定 Unicode 属性的每个字符。剩下的只是字符串的开头 ^ 和结尾 $,以及表示“一个或多个”的量词 +。
use strict;
use warnings;
use utf8; # for the Hebrew glyphs in my example input

# Sample input: two Hebrew words, a space and an exclamation mark.
my $string = qq{שלום עולם!};

# Prints 1 when the whole string consists only of Hebrew-script characters,
# spaces and the listed punctuation marks; prints nothing otherwise.
# \A and \z are the strict forms of ^ and $: unlike $, \z does not accept a
# trailing newline, so "text\n" is rejected as well.
print $string =~ m/\A[\p{Hebrew}?!.' ]+\z/;
这将匹配任何希伯来字母、空格和几个标点字符。除非您想在源代码中包含实际的希伯来语文本(例如在注释中),否则不需要 utf8 pragma。
您可以根据需要使用任何其他字符来扩展字符组(在 [] 中)。
在我的评论中我使用了 re pragma,这对调试正则表达式很有用。
我正在使用 Perl 5.34.0,我想知道输入是否只是希伯来字母和问号等一些符号。 尽管我找到了一个有很多开销的解决方案,但我想学习如何使用正则表达式更简单地完成它。
这是我没有正则表达式的解决方案。
首先,我在 Perl 模块中的常量哈希中定义了希伯来语字符。
#!perl
package Enums::Nikudletters;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();
our $VERSION = 1.0;
# Character categories used in the TYPE field of every NIKUDLETTERS entry.
# The (misspelled) string values are runtime data and are kept as they are.
use constant LETTER            => "LETTER";
use constant LOWER_PUNKTATION  => "LOWER_PUNKTATION";
use constant UPPER_PUNKTATION  => "UPPER_PUNKTATION";
use constant MIDDLE_PUNKTATION => "MIDDLE_PUNKTATION";

# NIKUDLETTERS: table of the Hebrew letters, vowel points and related marks
# this package knows about, keyed by a transliterated name.
#
# Every entry is a hash reference with these fields:
#   UTF8        - the character itself, built from its code point
#   CODE        - the code point as four upper-case hex digits, prefixed
#                 with a single space (the space acts as a separator when
#                 the codes are concatenated)
#   NAME        - transliterated display name
#   TYPE        - one of the four type constants above
#   WIDTH       - integer; 0 for every point/mark entry
#                 (NOTE(review): unit not visible here - confirm with the consumer)
#   HANDWRITING - 0/1 flag (NOTE(review): meaning not visible here - confirm)
use constant {
    NIKUDLETTERS => {
        AIN => {
            UTF8 => charnames::string_vianame("U+05E2"),
            CODE => " 05E2",
            NAME => "ain",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        ALEF => {
            UTF8 => charnames::string_vianame("U+05D0"),
            CODE => " 05D0",
            NAME => "alef",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        CHET => {
            UTF8 => charnames::string_vianame("U+05D7"),
            CODE => " 05D7",
            NAME => "chet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        DALET => {
            UTF8 => charnames::string_vianame("U+05D3"),
            CODE => " 05D3",
            NAME => "dalet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        GIMEL => {
            UTF8 => charnames::string_vianame("U+05D2"),
            CODE => " 05D2",
            NAME => "gimel",
            TYPE => LETTER,
            WIDTH => 11,
            HANDWRITING => 1,
        },
        GERESCH => {
            UTF8 => charnames::string_vianame("U+05F3"),
            CODE => " 05F3",
            NAME => "geresch",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 0,
        },
        GERSCHAYIM => {
            UTF8 => charnames::string_vianame("U+05F4"),
            CODE => " 05F4",
            NAME => "gerschayim",
            TYPE => LETTER,
            WIDTH => 14,
            HANDWRITING => 0,
        },
        HAEI => {
            UTF8 => charnames::string_vianame("U+05D4"),
            CODE => " 05D4",
            NAME => "häi",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        JOD => {
            UTF8 => charnames::string_vianame("U+05D9"),
            CODE => " 05D9",
            NAME => "jod",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 1,
        },
        KUF => {
            UTF8 => charnames::string_vianame("U+05E7"),
            CODE => " 05E7",
            NAME => "kuf",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        LAMED => {
            UTF8 => charnames::string_vianame("U+05DC"),
            CODE => " 05DC",
            NAME => "lamed",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 0,
        },
        RESCH => {
            UTF8 => charnames::string_vianame("U+05E8"),
            CODE => " 05E8",
            NAME => "resch",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        SSAIN => {
            UTF8 => charnames::string_vianame("U+05D6"),
            CODE => " 05D6",
            NAME => "ssain",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        SCHIN => {
            UTF8 => charnames::string_vianame("U+05E9"),
            CODE => " 05E9",
            NAME => "schin",
            TYPE => LETTER,
            WIDTH => 19,
            HANDWRITING => 1,
        },
        SSAMECH => {
            UTF8 => charnames::string_vianame("U+05E1"),
            CODE => " 05E1",
            NAME => "ssamech",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        SPACE => {
            UTF8 => charnames::string_vianame("U+0020"),
            CODE => " 0020",
            NAME => "space",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        NEWSPACE => {
            UTF8 => charnames::string_vianame("U+00A0"),
            CODE => " 00A0",
            NAME => "newspace",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        TAW => {
            UTF8 => charnames::string_vianame("U+05EA"),
            CODE => " 05EA",
            NAME => "taw",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        TET => {
            UTF8 => charnames::string_vianame("U+05D8"),
            CODE => " 05D8",
            NAME => "tet",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        BET => {
            UTF8 => charnames::string_vianame("U+05D1"),
            CODE => " 05D1",
            NAME => "bet",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        WAW => {
            UTF8 => charnames::string_vianame("U+05D5"),
            CODE => " 05D5",
            NAME => "waw",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        ZADI => {
            UTF8 => charnames::string_vianame("U+05E6"),
            CODE => " 05E6",
            NAME => "zadi",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        ZADISSOFIT => {
            UTF8 => charnames::string_vianame("U+05E5"),
            CODE => " 05E5",
            NAME => "zadissofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        KAF => {
            UTF8 => charnames::string_vianame("U+05DB"),
            CODE => " 05DB",
            NAME => "kaf",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        CHAFSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DA"),
            CODE => " 05DA",
            NAME => "chafssofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        PAEI => {
            UTF8 => charnames::string_vianame("U+05E4"),
            CODE => " 05E4",
            NAME => "päi",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        FAEISSOFIT => {
            UTF8 => charnames::string_vianame("U+05E3"),
            CODE => " 05E3",
            NAME => "fäissofit",
            TYPE => LETTER,
            WIDTH => 18,
            HANDWRITING => 1,
        },
        MEM => {
            UTF8 => charnames::string_vianame("U+05DE"),
            CODE => " 05DE",
            NAME => "mem",
            TYPE => LETTER,
            WIDTH => 17,
            HANDWRITING => 1,
        },
        MEMSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DD"),
            CODE => " 05DD",
            NAME => "memssofit",
            TYPE => LETTER,
            WIDTH => 16,
            HANDWRITING => 1,
        },
        NUN => {
            UTF8 => charnames::string_vianame("U+05E0"),
            CODE => " 05E0",
            NAME => "nun",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 1,
        },
        NUNSSOFIT => {
            UTF8 => charnames::string_vianame("U+05DF"),
            CODE => " 05DF",
            NAME => "nunssofit",
            TYPE => LETTER,
            WIDTH => 9,
            HANDWRITING => 1,
        },
        SHEVA => {
            UTF8 => charnames::string_vianame("U+05B0"),
            CODE => " 05B0",
            NAME => "schwa = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_SEGOL => {
            UTF8 => charnames::string_vianame("U+05B1"),
            CODE => " 05B1",
            NAME => "chataf szegol = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_PATAH => {
            UTF8 => charnames::string_vianame("U+05B2"),
            CODE => " 05B2",
            NAME => "chataf patach = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HATAF_QAMATS => {
            UTF8 => charnames::string_vianame("U+05B3"),
            CODE => " 05B3",
            NAME => "chataf kamatz = o",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HIRIQ => {
            UTF8 => charnames::string_vianame("U+05B4"),
            CODE => " 05B4",
            NAME => "chirik = i",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        TSERE => {
            UTF8 => charnames::string_vianame("U+05B5"),
            CODE => " 05B5",
            NAME => "zeré = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SEGOL => {
            UTF8 => charnames::string_vianame("U+05B6"),
            CODE => " 05B6",
            NAME => "szegol = e",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        PATAH => {
            UTF8 => charnames::string_vianame("U+05B7"),
            CODE => " 05B7",
            NAME => "patach = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        QAMATS => {
            UTF8 => charnames::string_vianame("U+05B8"),
            CODE => " 05B8",
            NAME => "kamatz = a",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HOLAM => {
            UTF8 => charnames::string_vianame("U+05B9"),
            CODE => " 05B9",
            NAME => "cholam = o",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HOLAM_HASER => {
            UTF8 => charnames::string_vianame("U+05BA"),
            CODE => " 05BA",
            NAME => "cholam chaser",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        QUBUTS => {
            UTF8 => charnames::string_vianame("U+05BB"),
            CODE => " 05BB",
            NAME => "kubutz = u",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        DAGESH => {
            UTF8 => charnames::string_vianame("U+05BC"),
            CODE => " 05BC",
            NAME => "dagesch / schuruk",
            TYPE => MIDDLE_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        METEG => {
            UTF8 => charnames::string_vianame("U+05BD"),
            CODE => " 05BD",
            NAME => "meteg",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        MAQAF => {
            UTF8 => charnames::string_vianame("U+05BE"),
            CODE => " 05BE",
            NAME => "makaf",
            TYPE => LETTER,
            WIDTH => 14,
            HANDWRITING => 0,
        },
        RAFE => {
            UTF8 => charnames::string_vianame("U+05BF"),
            CODE => " 05BF",
            NAME => "rafi",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        PASEQ => {
            UTF8 => charnames::string_vianame("U+05C0"),
            CODE => " 05C0",
            NAME => "pasek",
            TYPE => LETTER,
            WIDTH => 4,
            HANDWRITING => 0,
        },
        SHIN_DOT => {
            UTF8 => charnames::string_vianame("U+05C1"),
            CODE => " 05C1",
            NAME => "schin Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SIN_DOT => {
            UTF8 => charnames::string_vianame("U+05C2"),
            CODE => " 05C2",
            NAME => "ssin Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        SOF_PASUQ => {
            UTF8 => charnames::string_vianame("U+05C3"),
            CODE => " 05C3",
            NAME => "sof pasuk",
            TYPE => LETTER,
            WIDTH => 8,
            HANDWRITING => 0,
        },
        UPPER_DOT => {
            UTF8 => charnames::string_vianame("U+05C4"),
            CODE => " 05C4",
            NAME => "oberer Punkt",
            TYPE => UPPER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        LOWER_DOT => {
            UTF8 => charnames::string_vianame("U+05C5"),
            CODE => " 05C5",
            NAME => "unterer Punkt",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        HAFUKAH => {
            UTF8 => charnames::string_vianame("U+05C6"),
            CODE => " 05C6",
            NAME => "chafukach",
            TYPE => LETTER,
            WIDTH => 10,
            HANDWRITING => 0,
        },
        QAMATS_QATAN => {
            UTF8 => charnames::string_vianame("U+05C7"),
            CODE => " 05C7",
            NAME => "kamatz katan",
            TYPE => LOWER_PUNKTATION,
            WIDTH => 0,
            HANDWRITING => 0,
        },
        JIDDISH_DOUBLE_WAW => {
            UTF8 => charnames::string_vianame("U+05F0"),
            CODE => " 05F0",
            NAME => "waw waw",
            TYPE => LETTER,
            WIDTH => 19,
            HANDWRITING => 0,
        },
        JIDDISH_WAW_JOD => {
            UTF8 => charnames::string_vianame("U+05F1"),
            CODE => " 05F1",
            NAME => "waw jod",
            TYPE => LETTER,
            WIDTH => 20,
            HANDWRITING => 0,
        },
        JIDDISH_DOUBLE_JOD => {
            UTF8 => charnames::string_vianame("U+05F2"),
            CODE => " 05F2",
            NAME => "jod jod",
            TYPE => LETTER,
            WIDTH => 21,
            HANDWRITING => 0,
        },
    },
};
# Accessor for the NIKUDLETTERS table.
# Returns the hash reference held by the NIKUDLETTERS constant (not a copy).
sub get {
    my $table = NIKUDLETTERS;
    return $table;
}
# Builds one string containing the CODE field of every NIKUDLETTERS entry,
# concatenated in ascending code order (e.g. " 0020 00A0 05B0 ...").
#
# Takes no meaningful arguments; when called as a class method the invocant
# is ignored.
# Returns: the concatenated code string. Each CODE already starts with a
# space, so the codes stay separated in the result.
sub get_regular_expression_string
{
    my $letters = Enums::Nikudletters->NIKUDLETTERS;
    my @codes   = map { $_->{CODE} } values %{$letters};

    # The codes are fixed-width, upper-case hex strings, so a string sort
    # gives numeric order. A numeric sort (<=>) would mis-order them and
    # emit "isn't numeric" warnings, because " 05D0" is not a decimal number.
    return join('', sort { $a cmp $b } @codes);
}
1;
我也用同样的方式定义了一些符号:
#!perl
package Enums::Signletters;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();
our $VERSION = 1.0;
# SIGNLETTERS: punctuation and symbol characters accepted in addition to the
# Hebrew characters, keyed by their Unicode character name.
# Every entry holds one field, CODE: the code point as four upper-case hex
# digits prefixed with a single space (the space acts as a separator when the
# codes are concatenated).
use constant SIGNLETTERS => {
    QUESTION_MARK        => { CODE => " 003F" },    # ?
    EXCLAMATION_MARK     => { CODE => " 0021" },    # !
    FULL_STOP            => { CODE => " 002E" },    # .
    APOSTROPHE           => { CODE => " 0027" },    # '
    LEFT_PARENTHESIS     => { CODE => " 0028" },    # (
    RIGHT_PARENTHESIS    => { CODE => " 0029" },    # )
    COMMA                => { CODE => " 002C" },    # ,
    HYPHEN_MINUS         => { CODE => " 002D" },    # -
    QUOTATION_MARK       => { CODE => " 0022" },    # "
    SECTION_SIGN         => { CODE => " 00A7" },    # §
    DOLLAR_SIGN          => { CODE => " 0024" },    # $
    EURO_SIGN            => { CODE => " 20AC" },    # €
    PERCENT_SIGN         => { CODE => " 0025" },    # %
    SOLIDUS              => { CODE => " 002F" },    # /
    LEFT_SQUARE_BRACKET  => { CODE => " 005B" },    # [
    RIGHT_SQUARE_BRACKET => { CODE => " 005D" },    # ]
    LEFT_CURLY_BRACKET   => { CODE => " 007B" },    # {
    RIGHT_CURLY_BRACKET  => { CODE => " 007D" },    # }
    EQUALS_SIGN          => { CODE => " 003D" },    # =
    REVERSE_SOLIDUS      => { CODE => " 005C" },    # backslash
    ASTERISK             => { CODE => " 002A" },    # *
    PLUS_SIGN            => { CODE => " 002B" },    # +
    NUMBER_SIGN          => { CODE => " 0023" },    # #
    # U+003B; this entry previously carried 0023, a duplicate of NUMBER_SIGN.
    SEMICOLON            => { CODE => " 003B" },    # ;
    COLON                => { CODE => " 003A" },    # :
    LOW_LINE             => { CODE => " 005F" },    # _
    DEGREE_SIGN          => { CODE => " 00B0" },    # °
    CIRCUMFLEX_ACCENT    => { CODE => " 005E" },    # ^
    ACUTE_ACCENT         => { CODE => " 00B4" },    # ´
    GRAVE_ACCENT         => { CODE => " 0060" },    # `
    COMMERCIAL_AT        => { CODE => " 0040" },    # @
    MICRO_SIGN           => { CODE => " 00B5" },    # µ
    LESS_THAN_SIGN       => { CODE => " 003C" },    # <
    GREATER_THAN_SIGN    => { CODE => " 003E" },    # >
    VERTICAL_LINE        => { CODE => " 007C" },    # |
    AMPERSAND            => { CODE => " 0026" },    # &
};
# Builds one string containing the CODE field of every SIGNLETTERS entry
# except REVERSE_SOLIDUS, concatenated in ascending code order
# (e.g. " 0021 0022 0023 ...").
#
# Takes no meaningful arguments; when called as a class method the invocant
# is ignored.
# Returns: the concatenated code string. Each CODE already starts with a
# space, so the codes stay separated in the result.
sub get_regular_expression_string
{
    my $signs = Enums::Signletters->SIGNLETTERS;

    # NOTE(review): the backslash entry is deliberately left out, as in the
    # original code; the reason is not visible here - confirm before changing.
    my @codes = map  { $signs->{$_}{CODE} }
                grep { $_ ne "REVERSE_SOLIDUS" }
                keys %{$signs};

    # The codes are fixed-width, upper-case hex strings, so a string sort
    # gives numeric order. A numeric sort (<=>) would mis-order them and
    # emit "isn't numeric" warnings, because " 003F" is not a decimal number.
    return join('', sort { $a cmp $b } @codes);
}
1;
然后我可以将代码连接到我想要的任何表达式字符串中。像这样:
#!perl
package Helpers::UnicodeChecker;
use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use Enums::Nikudletters;
use Enums::Signletters;
our $VERSION = 1.0;
# Checks that every character of $hebrew is one of the characters listed in
# Enums::Nikudletters or Enums::Signletters.
#
# Parameters:
#   $self   - invocant (unused)
#   $hebrew - the character string to validate
# Returns: the string "true" when all characters are allowed (also for an
# empty string), otherwise the string "false". Both values are true in
# boolean context, so callers must compare with eq.
#
# Each character is looked up in the union of both tables. Previously
# characters up to U+00FF were checked only against the sign table and all
# others only against the Hebrew table, so the space and no-break space
# (listed in Nikudletters) and the euro sign (listed in Signletters) were
# always rejected.
sub is_valid_hebrew ($self, $hebrew)
{
    my $code_list = Enums::Nikudletters->get_regular_expression_string()
                  . Enums::Signletters->get_regular_expression_string();

    # The code list is a run of space-prefixed four-digit hex codes;
    # turn it into a lookup set.
    my %is_allowed = map { ($_ => 1) } split(' ', $code_list);

    foreach my $letter (split('', $hebrew))
    {
        my $code = sprintf('%04X', ord($letter));
        return "false" unless $is_allowed{$code};
    }
    return "true";
}
# Formats a code point as upper-case hexadecimal, zero-padded to at least
# four digits (0x05D0 -> "05D0").
#
# Parameters: $_[0] - the code point as an integer.
# Returns: the hex string; code points above U+FFFF yield more than four
# digits instead of being truncated.
#
# The former format string "\x{%04X}" sat in double quotes, so Perl parsed
# \x{...} as a hex escape and the format was destroyed before sprintf ran.
sub get_wide_code
{
    return sprintf('%04X', $_[0]);
}
# Formats a code point in the range 0..255 as four upper-case hex digits
# with leading zeros (0x3F -> "003F").
#
# Parameters: $_[0] - the code point as an integer.
# Returns: the hex string.
#
# The former format string "\x%02X" sat in double quotes, so Perl parsed \x
# as a hex escape and the result was malformed (e.g. "00F" for 0x3F).
sub get_ascii_code
{
    return sprintf('%04X', $_[0]);
}
1;
这可行,但涉及大量代码。有一个简短的正则表达式来完成相同的任务会很好。谁能提供正则表达式?
我在谷歌上搜索了很多,阅读了很多,尝试了很多,但我找不到适用于 Perl 5.34.0 的正则表达式。谢谢您的帮助。我刚学Perl。
您可以使用相对简单的模式匹配来完成此操作。
这里有趣的一点是 \p{Hebrew},它允许您匹配具有特定 Unicode 属性的每个字符。剩下的只是字符串的开头 ^ 和结尾 $,以及表示“一个或多个”的量词 +。
use strict;
use warnings;
use utf8; # for the Hebrew glyphs in my example input

# Sample input: two Hebrew words, a space and an exclamation mark.
my $string = qq{שלום עולם!};

# Prints 1 when the whole string consists only of Hebrew-script characters,
# spaces and the listed punctuation marks; prints nothing otherwise.
# \A and \z are the strict forms of ^ and $: unlike $, \z does not accept a
# trailing newline, so "text\n" is rejected as well.
print $string =~ m/\A[\p{Hebrew}?!.' ]+\z/;
这将匹配任何希伯来字母、空格和几个标点字符。除非您想在源代码中包含实际的希伯来语文本(例如在注释中),否则不需要 utf8 pragma。
您可以根据需要使用任何其他字符来扩展字符组(在 [] 中)。
在我的评论中我使用了 re pragma,这对调试正则表达式很有用。