用于匹配希伯来语单词的 Perl 5.34.0 正则表达式

Perl 5.34.0 regular expression for matching words in Hebrew

我正在使用 Perl 5.34.0,我想知道输入是否只是希伯来字母和问号等一些符号。 尽管我找到了一个有很多开销的解决方案,但我想学习如何使用正则表达式更简单地完成它。

这是我没有正则表达式的解决方案。

首先,我在 Perl 模块中的常量哈希中定义了希伯来语字符。

#!perl
package  Enums::Nikudletters;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();

our $VERSION = 1.0;

# Types
# Classification tags stored in the TYPE field of every table entry.
use constant {
    LETTER            => "LETTER",
    LOWER_PUNKTATION  => "LOWER_PUNKTATION",
    UPPER_PUNKTATION  => "UPPER_PUNKTATION",
    MIDDLE_PUNKTATION => "MIDDLE_PUNKTATION",
};

# NIKUDLETTERS maps a symbolic name to a record with these fields:
#   UTF8        - the character itself, resolved from its code point
#   CODE        - the code point as four hex digits preceded by one space
#   NAME        - descriptive name of the character
#   TYPE        - one of the classification tags declared above
#   WIDTH       - numeric width (unit is not stated in this file)
#   HANDWRITING - 0 or 1 flag (meaning is not stated in this file)
#
# The table is written as compact rows
#   [ KEY => hex code point, name, type, width, handwriting ]
# and expanded into the records at compile time.
use constant {
    NIKUDLETTERS => {
        map {
            my ($key, $hex, $name, $type, $width, $handwriting) = @{$_};
            (
                $key => {
                    UTF8        => charnames::string_vianame("U+$hex"),
                    CODE        => " $hex",
                    NAME        => $name,
                    TYPE        => $type,
                    WIDTH       => $width,
                    HANDWRITING => $handwriting,
                }
            )
        } (
            [ AIN                => '05E2', "ain",               LETTER,            18, 1 ],
            [ ALEF               => '05D0', "alef",              LETTER,            18, 1 ],
            [ CHET               => '05D7', "chet",              LETTER,            18, 1 ],
            [ DALET              => '05D3', "dalet",             LETTER,            18, 1 ],
            [ GIMEL              => '05D2', "gimel",             LETTER,            11, 1 ],
            [ GERESCH            => '05F3', "geresch",           LETTER,             9, 0 ],
            [ GERSCHAYIM         => '05F4', "gerschayim",        LETTER,            14, 0 ],
            [ HAEI               => '05D4', "häi",               LETTER,            18, 1 ],
            [ JOD                => '05D9', "jod",               LETTER,            10, 1 ],
            [ KUF                => '05E7', "kuf",               LETTER,            18, 1 ],
            [ LAMED              => '05DC', "lamed",             LETTER,            17, 0 ],
            [ RESCH              => '05E8', "resch",             LETTER,            17, 1 ],
            [ SSAIN              => '05D6', "ssain",             LETTER,             9, 1 ],
            [ SCHIN              => '05E9', "schin",             LETTER,            19, 1 ],
            [ SSAMECH            => '05E1', "ssamech",           LETTER,            17, 1 ],
            [ SPACE              => '0020', "space",             LETTER,            10, 0 ],
            [ NEWSPACE           => '00A0', "newspace",          LETTER,            10, 0 ],
            [ TAW                => '05EA', "taw",               LETTER,            17, 1 ],
            [ TET                => '05D8', "tet",               LETTER,            17, 1 ],
            [ BET                => '05D1', "bet",               LETTER,            18, 1 ],
            [ WAW                => '05D5', "waw",               LETTER,             9, 1 ],
            [ ZADI               => '05E6', "zadi",              LETTER,            18, 1 ],
            [ ZADISSOFIT         => '05E5', "zadissofit",        LETTER,            18, 1 ],
            [ KAF                => '05DB', "kaf",               LETTER,            17, 1 ],
            [ CHAFSSOFIT         => '05DA', "chafssofit",        LETTER,            18, 1 ],
            [ PAEI               => '05E4', "päi",               LETTER,            17, 1 ],
            [ FAEISSOFIT         => '05E3', "fäissofit",         LETTER,            18, 1 ],
            [ MEM                => '05DE', "mem",               LETTER,            17, 1 ],
            [ MEMSSOFIT          => '05DD', "memssofit",         LETTER,            16, 1 ],
            [ NUN                => '05E0', "nun",               LETTER,            10, 1 ],
            [ NUNSSOFIT          => '05DF', "nunssofit",         LETTER,             9, 1 ],
            [ SHEVA              => '05B0', "schwa = e",         LOWER_PUNKTATION,   0, 0 ],
            [ HATAF_SEGOL        => '05B1', "chataf szegol = e", LOWER_PUNKTATION,   0, 0 ],
            [ HATAF_PATAH        => '05B2', "chataf patach = a", LOWER_PUNKTATION,   0, 0 ],
            [ HATAF_QAMATS       => '05B3', "chataf kamatz = o", LOWER_PUNKTATION,   0, 0 ],
            [ HIRIQ              => '05B4', "chirik = i",        LOWER_PUNKTATION,   0, 0 ],
            [ TSERE              => '05B5', "zeré = e",          LOWER_PUNKTATION,   0, 0 ],
            [ SEGOL              => '05B6', "szegol = e",        LOWER_PUNKTATION,   0, 0 ],
            [ PATAH              => '05B7', "patach = a",        LOWER_PUNKTATION,   0, 0 ],
            [ QAMATS             => '05B8', "kamatz = a",        LOWER_PUNKTATION,   0, 0 ],
            [ HOLAM              => '05B9', "cholam = o",        UPPER_PUNKTATION,   0, 0 ],
            [ HOLAM_HASER        => '05BA', "cholam chaser",     UPPER_PUNKTATION,   0, 0 ],
            [ QUBUTS             => '05BB', "kubutz = u",        LOWER_PUNKTATION,   0, 0 ],
            [ DAGESH             => '05BC', "dagesch / schuruk", MIDDLE_PUNKTATION,  0, 0 ],
            [ METEG              => '05BD', "meteg",             LOWER_PUNKTATION,   0, 0 ],
            [ MAQAF              => '05BE', "makaf",             LETTER,            14, 0 ],
            [ RAFE               => '05BF', "rafi",              UPPER_PUNKTATION,   0, 0 ],
            [ PASEQ              => '05C0', "pasek",             LETTER,             4, 0 ],
            [ SHIN_DOT           => '05C1', "schin Punkt",       UPPER_PUNKTATION,   0, 0 ],
            [ SIN_DOT            => '05C2', "ssin Punkt",        UPPER_PUNKTATION,   0, 0 ],
            [ SOF_PASUQ          => '05C3', "sof pasuk",         LETTER,             8, 0 ],
            [ UPPER_DOT          => '05C4', "oberer Punkt",      UPPER_PUNKTATION,   0, 0 ],
            [ LOWER_DOT          => '05C5', "unterer Punkt",     LOWER_PUNKTATION,   0, 0 ],
            [ HAFUKAH            => '05C6', "chafukach",         LETTER,            10, 0 ],
            [ QAMATS_QATAN       => '05C7', "kamatz katan",      LOWER_PUNKTATION,   0, 0 ],
            [ JIDDISH_DOUBLE_WAW => '05F0', "waw waw",           LETTER,            19, 0 ],
            [ JIDDISH_WAW_JOD    => '05F1', "waw jod",           LETTER,            20, 0 ],
            [ JIDDISH_DOUBLE_JOD => '05F2', "jod jod",           LETTER,            21, 0 ],
        )
    },
};


# Accessor for the complete letter table.
# Returns the NIKUDLETTERS hash reference.
sub get {
    my $table = NIKUDLETTERS;
    return $table;
}

# Build the string of all letter and nikud code points used for membership
# checks.
#
# Returns every CODE value of NIKUDLETTERS (each already carrying its leading
# space, e.g. " 05D0") concatenated in ascending code point order.
sub get_regular_expression_string
{
    my $letters = Enums::Nikudletters->NIKUDLETTERS;

    # CODE values are fixed-width hex text, not numbers: a numeric comparison
    # reads " 05E2" as 5e2 and " 05D0" as 5 (with a "not numeric" warning).
    # For equal-width upper-case hex, string order is the ascending order.
    my @codes = sort { $a cmp $b } map { $_->{CODE} } values %{$letters};

    return join('', @codes);
}

1;

我也用同样的方式定义了一些符号:

#!perl
package  Enums::Signletters;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;
use charnames ();

our $VERSION = 1.0;

# SIGNLETTERS maps the Unicode name of each permitted sign to a record whose
# CODE field is the code point as four hex digits preceded by one space.
# The trailing comment on each line shows the character itself.
use constant SIGNLETTERS => {
    QUESTION_MARK        => { CODE => " 003F" },    # ?
    EXCLAMATION_MARK     => { CODE => " 0021" },    # !
    FULL_STOP            => { CODE => " 002E" },    # .
    APOSTROPHE           => { CODE => " 0027" },    # '
    LEFT_PARENTHESIS     => { CODE => " 0028" },    # (
    RIGHT_PARENTHESIS    => { CODE => " 0029" },    # )
    COMMA                => { CODE => " 002C" },    # ,
    HYPHEN_MINUS         => { CODE => " 002D" },    # -
    QUOTATION_MARK       => { CODE => " 0022" },    # "
    SECTION_SIGN         => { CODE => " 00A7" },    # §
    DOLLAR_SIGN          => { CODE => " 0024" },    # $
    EURO_SIGN            => { CODE => " 20AC" },    # €
    PERCENT_SIGN         => { CODE => " 0025" },    # %
    SOLIDUS              => { CODE => " 002F" },    # /
    LEFT_SQUARE_BRACKET  => { CODE => " 005B" },    # [
    RIGHT_SQUARE_BRACKET => { CODE => " 005D" },    # ]
    LEFT_CURLY_BRACKET   => { CODE => " 007B" },    # {
    RIGHT_CURLY_BRACKET  => { CODE => " 007D" },    # }
    EQUALS_SIGN          => { CODE => " 003D" },    # =
    REVERSE_SOLIDUS      => { CODE => " 005C" },    # \ (backslash)
    ASTERISK             => { CODE => " 002A" },    # *
    PLUS_SIGN            => { CODE => " 002B" },    # +
    NUMBER_SIGN          => { CODE => " 0023" },    # #
    # U+003B; this entry previously repeated NUMBER_SIGN's code 0023.
    SEMICOLON            => { CODE => " 003B" },    # ;
    COLON                => { CODE => " 003A" },    # :
    LOW_LINE             => { CODE => " 005F" },    # _
    DEGREE_SIGN          => { CODE => " 00B0" },    # °
    CIRCUMFLEX_ACCENT    => { CODE => " 005E" },    # ^
    ACUTE_ACCENT         => { CODE => " 00B4" },    # ´
    GRAVE_ACCENT         => { CODE => " 0060" },    # `
    COMMERCIAL_AT        => { CODE => " 0040" },    # @
    MICRO_SIGN           => { CODE => " 00B5" },    # µ
    LESS_THAN_SIGN       => { CODE => " 003C" },    # <
    GREATER_THAN_SIGN    => { CODE => " 003E" },    # >
    VERTICAL_LINE        => { CODE => " 007C" },    # |
    AMPERSAND            => { CODE => " 0026" },    # &
};

# Build the string of all permitted sign code points used for membership
# checks.
#
# Returns the CODE value of every SIGNLETTERS entry except REVERSE_SOLIDUS
# (each already carrying its leading space, e.g. " 003F") concatenated in
# ascending code point order.
sub get_regular_expression_string
{
    my $signs = Enums::Signletters->SIGNLETTERS;

    # The backslash entry is deliberately left out, as before.
    # NOTE(review): the reason for excluding it is not stated in this file.
    my @codes = map  { $signs->{$_}{CODE} }
                grep { $_ ne "REVERSE_SOLIDUS" }
                keys %{$signs};

    # CODE values are fixed-width hex text, not numbers: a numeric comparison
    # misreads them (and warns), so order them as strings instead.
    return join('', sort { $a cmp $b } @codes);
}



1;

然后我可以将代码连接到我想要的任何表达式字符串中。像这样:

#!perl
package Helpers::UnicodeChecker;

use strict;
use warnings;
use diagnostics;
use experimental 'signatures';
use utf8;

use Enums::Nikudletters;
use Enums::Signletters;

our $VERSION = 1.0;

 # Check that every character of $hebrew is a known Hebrew letter, nikud mark
 # or permitted sign.
 #
 # Parameters:
 #   $self   - invocant (unused)
 #   $hebrew - the character string to validate
 #
 # Returns the string "true" when the four-digit hex code point of every
 # character appears in the code list of Enums::Nikudletters or
 # Enums::Signletters, and the string "false" at the first character that
 # does not. Both results are non-empty strings, so callers must compare them
 # with eq instead of testing truth. An empty string yields "true".
 sub is_valid_hebrew ($self, $hebrew)
 {
     # Each list is a run of " XXXX" items, so splitting on whitespace
     # recovers the individual codes regardless of their order.
     my %is_allowed = map { $_ => 1 } split ' ',
           Enums::Nikudletters->get_regular_expression_string()
         . Enums::Signletters->get_regular_expression_string();

     # One combined lookup is used on purpose: SPACE and NEWSPACE (at or below
     # U+00FF) are defined in the nikud table and EURO_SIGN (above U+00FF) in
     # the sign table, so choosing the table by code point size rejected
     # characters that the tables explicitly allow. An exact hash lookup also
     # avoids treating the code as a regex pattern.
     foreach my $letter (split //, $hebrew)
     {
         return "false" unless $is_allowed{ sprintf('%04X', ord $letter) };
     }
     return "true";
 }

# Format a code point as upper-case hexadecimal, zero-padded to at least four
# digits (e.g. 0x05D0 -> "05D0").
#
# Parameters:
#   $_[0] - the numeric code point
#
# Returns the hex string. Code points above U+FFFF produce more than four
# digits.
sub get_wide_code
{
    # Format the number directly: inside a double-quoted string "\x{...}" is
    # a character escape, not text that sprintf could fill in.
    return sprintf('%04X', $_[0]);
}

# Format a code point in the range 0..255 as four upper-case hex digits
# (e.g. 0x3F -> "003F").
#
# Parameters:
#   $_[0] - the numeric code point; intended for values up to 255
#
# Returns the zero-padded hex string.
sub get_ascii_code
{
    # Format the number directly: inside a double-quoted string "\x" starts
    # a character escape, not text that sprintf could fill in.
    return sprintf('%04X', $_[0]);
}

1;

这可行,但涉及大量代码。有一个简短的正则表达式来完成相同的任务会很好。谁能提供正则表达式?

我在谷歌上搜索了很多,阅读了很多,尝试了很多,但我找不到适用于 Perl 5.34.0 的正则表达式。谢谢您的帮助。我刚学Perl。

您可以使用相对简单的模式匹配来完成此操作。

这里有趣的一点是 \p{Hebrew},它允许您匹配具有特定 Unicode 属性的每个字符。其余部分只是字符串开头 ^ 和结尾 $ 的锚点,以及表示“一个或多个”的量词 +。

# Example: test whether a string consists only of Hebrew characters and a
# few permitted punctuation characters.
use strict;
use warnings;
use utf8;       # for the Hebrew glyphs in my example input

# Sample input: Hebrew text containing a space and an exclamation mark.
my $string = qq{שלום עולم!}; 
# Print the match result. The pattern succeeds only when everything between
# the anchors ^ and $ is one or more characters from the class: a character
# with the Hebrew Unicode property (\p{Hebrew}), '?', '!', '.', an apostrophe
# or a space.
print $string =~ m/^[\p{Hebrew}?!.' ]+$/;

这将匹配任何希伯来字母、空格和几个标点字符。除非您想在源代码中包含实际的希伯来语文本(例如在注释中),否则不需要 utf8 编译指示(pragma)。

您可以根据需要使用任何其他字符来扩展字符组(在 [] 中)。

在我的评论中我使用了 re 编译指示(pragma),这对调试正则表达式很有用。