比较包含来自不同字母表的符号的字符串
Comparing strings with symbols from different alphabets
我想比较两个包含来自不同字母表(例如俄语和英语)的符号的字符串。我希望看起来相似的符号被认为彼此相等。
例如,单词 "Mom" 中的字母 "o" 来自英文字母表(Unicode 代码 006F),而单词 "Mоm" 中的字母 "о" 来自俄语字母表(Unicode 代码 043E)。所以 ("Mom" = "Mоm")
=> false,但我希望它为 true。是否有标准的 SAS 函数可以做到这一点,还是我应该编写一个宏来完成?
谢谢!
我会这样做:
首先我会建立一个映射表,即俄语中的哪个字母对应英语中的哪个字母。示例:
б = b
в = v
...
我会将此映射表存储在单独的表(table)中,或存储为宏变量(macro variables)。
然后我会创建一个使用 tranwrd 函数的宏循环,遍历所建立的映射表。
这里的例子可能是这样的。
/* Illustration of the mapping idea: every Cyrillic letter is replaced by */
/* its Latin counterpart, one TRANWRD call per letter.                     */
data _null_;
stringBefore = "без";
/* First call reads the original value; 2nd argument is the text to find, */
/* 3rd argument is its replacement.                                        */
stringAfter = tranwrd(stringBefore,"а","a");
/* Subsequent calls read stringAfter so earlier replacements are kept. */
stringAfter = tranwrd(stringAfter,"б","b");
stringAfter = tranwrd(stringAfter,"в","v");
/* The "..." below is an elision in the snippet standing for the remaining */
/* letters of the mapping; it is not SAS syntax.                           */
...
run;
在这个转换之后我认为你可以比较你的字符串。
我还编写了一些函数来处理键盘布局错误。这是代码:
/***************************************************************************/
/* FUNCTION count_rus_letters RETURNS NUMBER OF CYRILLIC LETTERS IN STRING */
/*                                                                         */
/* string  : character value to scan                                       */
/* returns : numeric count of characters in STRING that are one of the     */
/*           Russian letters listed below (both cases, including Ё/ё)      */
/***************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function count_rus_letters(string $);
    /* LETTER holds a single character. The LENGTH statement previously   */
    /* had no "$ n" specification, which is a syntax error. $ 4 is wide   */
    /* enough for one character in UTF-8 (Cyrillic letters take 2 bytes). */
    length letter $ 4;
    rus_count = 0;
    /* KLENGTH/KSUBSTR work in characters rather than bytes. */
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
    end;
    return(rus_count);
  endsub;
run;
/**************************************************************************/
/* FUNCTION count_eng_letters RETURNS NUMBER OF ENGLISH LETTERS IN STRING */
/*                                                                        */
/* string  : character value to scan                                      */
/* returns : numeric count of characters in STRING that fall in A-Z or    */
/*           a-z                                                          */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function count_eng_letters(string $);
    /* LETTER holds a single character. The LENGTH statement previously  */
    /* had no "$ n" specification, which is a syntax error.              */
    length letter $ 4;
    eng_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      /* RANK returns the collating position of the first byte. */
      code = rank(letter);
      /* Test upper and lower case ranges separately: a single A..z      */
      /* range also matches the punctuation [ \ ] ^ _ ` that sits        */
      /* between "Z" and "a" in ASCII.                                   */
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    return(eng_count);
  endsub;
run;
/**************************************************************************/
/* FUNCTION is_string_russian RETURNS 1 IF NUMBER OF RUSSIAN SYMBOLS IN   */
/* STRING >= NUMBER OF ENGLISH SYMBOLS                                    */
/*                                                                        */
/* string  : character value to classify                                  */
/* returns : 1 when the count of Russian letters is greater than or equal */
/*           to the count of English letters (ties, including a string    */
/*           with no letters at all, give 1); otherwise 0                 */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function is_string_russian(string $);
    /* LETTER must be character: the previous statement declared it as   */
    /* numeric (length 8) together with RESULT.                          */
    length letter $ 4 result 8;
    eng_count = 0;
    rus_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
      /* Upper and lower case ranges are tested separately so that the   */
      /* ASCII punctuation between "Z" and "a" is not counted as English. */
      code = rank(letter);
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    if rus_count >= eng_count
      then result = 1;
      else result = 0;
    return(result);
  endsub;
run;
/**************************************************************************/
/* FUNCTION fix_layout_misprints REPLACES MISPRINTED SYMBOLS BY ANALYSING */
/* LANGUAGE OF THE STRING (FOR ENGLISH STRING RUSSIAN SYMBOLS ARE         */
/* REPLACED BY ENGLISH COPIES AND FOR RUSSIAN STRING SYMBOLS ARE          */
/* REPLACED BY RUSSIAN COPIES)                                            */
/*                                                                        */
/* string  : character value to repair                                    */
/* returns : STRING (up to 1000 bytes) with look-alike letters replaced   */
/*           by their counterparts from the dominant alphabet; the string */
/*           is treated as Russian when Russian letters >= English ones   */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function fix_layout_misprints(string $) $ 1000;
    /* The previous LENGTH statement was garbled ("length letter result  */
    /* 00;"). RESULT is sized to match the declared $ 1000 return value. */
    length letter $ 4 result $ 1000;
    eng_count = 0;
    rus_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
      /* Upper and lower case ranges are tested separately so that the   */
      /* ASCII punctuation between "Z" and "a" is not counted as English. */
      code = rank(letter);
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    /* KTRANSLATE(source, to, from): the 2nd argument lists the          */
    /* replacement characters, the 3rd the characters to be replaced.    */
    if rus_count >= eng_count
      /* Russian string: Latin look-alikes become Cyrillic. */
      then result = ktranslate(string,"АаВЕеКкМОоРрСсТХх","AaBEeKkMOoPpCcTXx");
      /* English string: Cyrillic look-alikes become Latin. */
      else result = ktranslate(string,"AaBEeKkMOoPpCcTXx","АаВЕеКкМОоРрСсТХх");
    return(result);
  endsub;
run;
/***********/
/* EXAMPLE */
/***********/
/* Makes the compiled functions visible to the DATA step, repairs a       */
/* surname that contains one Latin letter, and writes the results to the  */
/* log.                                                                   */
options cmplib=sasuser.userfuncs;
data _null_;
  /* reference spelling, all Cyrillic */
  reference = "Иванов";
  /* same surname with a Latin "a" as the third character */
  misprinted = "Ивaнов";

  /* compute everything first, report afterwards */
  repaired     = fix_layout_misprints(misprinted);
  n_cyrillic   = count_rus_letters(misprinted);
  n_english    = count_eng_letters(misprinted);
  russian_flag = is_string_russian(misprinted);

  put "Good string=" reference;
  put "Error string=" misprinted;
  put "Fixed string=" repaired;
  put "Count or Cyrillic symbols in error string=" n_cyrillic;
  put "Count or English symbols in error string=" n_english;
  put "Is error string language Russian=" russian_flag;

  if not (reference = misprinted) then
    put "Before clearing - strings are not equal to each other";
  if reference = repaired then
    put "After clearing - strings are equal to each other";
run;
我想比较两个包含来自不同字母表(例如俄语和英语)的符号的字符串。我希望看起来相似的符号被认为彼此相等。
例如,单词 "Mom" 中的字母 "o" 来自英文字母表(Unicode 代码 006F),而单词 "Mоm" 中的字母 "о" 来自俄语字母表(Unicode 代码 043E)。所以 ("Mom" = "Mоm")
=> false,但我希望它为 true。是否有标准的 SAS 函数可以做到这一点,还是我应该编写一个宏来完成?
谢谢!
我会这样做:
首先我会建立一个映射表,即俄语中的哪个字母对应英语中的哪个字母。示例:
б = b
в = v
...
我会将此映射表存储在单独的表(table)中,或存储为宏变量(macro variables)。 然后我会创建一个使用 tranwrd 函数的宏循环,遍历所建立的映射表。
这里的例子可能是这样的。
/* Illustration of the mapping idea: every Cyrillic letter is replaced by */
/* its Latin counterpart, one TRANWRD call per letter.                     */
data _null_;
stringBefore = "без";
/* First call reads the original value; 2nd argument is the text to find, */
/* 3rd argument is its replacement.                                        */
stringAfter = tranwrd(stringBefore,"а","a");
/* Subsequent calls read stringAfter so earlier replacements are kept. */
stringAfter = tranwrd(stringAfter,"б","b");
stringAfter = tranwrd(stringAfter,"в","v");
/* The "..." below is an elision in the snippet standing for the remaining */
/* letters of the mapping; it is not SAS syntax.                           */
...
run;
在这个转换之后我认为你可以比较你的字符串。
我还编写了一些函数来处理键盘布局错误。这是代码:
/***************************************************************************/
/* FUNCTION count_rus_letters RETURNS NUMBER OF CYRILLIC LETTERS IN STRING */
/*                                                                         */
/* string  : character value to scan                                       */
/* returns : numeric count of characters in STRING that are one of the     */
/*           Russian letters listed below (both cases, including Ё/ё)      */
/***************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function count_rus_letters(string $);
    /* LETTER holds a single character. The LENGTH statement previously   */
    /* had no "$ n" specification, which is a syntax error. $ 4 is wide   */
    /* enough for one character in UTF-8 (Cyrillic letters take 2 bytes). */
    length letter $ 4;
    rus_count = 0;
    /* KLENGTH/KSUBSTR work in characters rather than bytes. */
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
    end;
    return(rus_count);
  endsub;
run;
/**************************************************************************/
/* FUNCTION count_eng_letters RETURNS NUMBER OF ENGLISH LETTERS IN STRING */
/*                                                                        */
/* string  : character value to scan                                      */
/* returns : numeric count of characters in STRING that fall in A-Z or    */
/*           a-z                                                          */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function count_eng_letters(string $);
    /* LETTER holds a single character. The LENGTH statement previously  */
    /* had no "$ n" specification, which is a syntax error.              */
    length letter $ 4;
    eng_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      /* RANK returns the collating position of the first byte. */
      code = rank(letter);
      /* Test upper and lower case ranges separately: a single A..z      */
      /* range also matches the punctuation [ \ ] ^ _ ` that sits        */
      /* between "Z" and "a" in ASCII.                                   */
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    return(eng_count);
  endsub;
run;
/**************************************************************************/
/* FUNCTION is_string_russian RETURNS 1 IF NUMBER OF RUSSIAN SYMBOLS IN   */
/* STRING >= NUMBER OF ENGLISH SYMBOLS                                    */
/*                                                                        */
/* string  : character value to classify                                  */
/* returns : 1 when the count of Russian letters is greater than or equal */
/*           to the count of English letters (ties, including a string    */
/*           with no letters at all, give 1); otherwise 0                 */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function is_string_russian(string $);
    /* LETTER must be character: the previous statement declared it as   */
    /* numeric (length 8) together with RESULT.                          */
    length letter $ 4 result 8;
    eng_count = 0;
    rus_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
      /* Upper and lower case ranges are tested separately so that the   */
      /* ASCII punctuation between "Z" and "a" is not counted as English. */
      code = rank(letter);
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    if rus_count >= eng_count
      then result = 1;
      else result = 0;
    return(result);
  endsub;
run;
/**************************************************************************/
/* FUNCTION fix_layout_misprints REPLACES MISPRINTED SYMBOLS BY ANALYSING */
/* LANGUAGE OF THE STRING (FOR ENGLISH STRING RUSSIAN SYMBOLS ARE         */
/* REPLACED BY ENGLISH COPIES AND FOR RUSSIAN STRING SYMBOLS ARE          */
/* REPLACED BY RUSSIAN COPIES)                                            */
/*                                                                        */
/* string  : character value to repair                                    */
/* returns : STRING (up to 1000 bytes) with look-alike letters replaced   */
/*           by their counterparts from the dominant alphabet; the string */
/*           is treated as Russian when Russian letters >= English ones   */
/**************************************************************************/
proc fcmp outlib=sasuser.userfuncs.mystring;
  function fix_layout_misprints(string $) $ 1000;
    /* The previous LENGTH statement was garbled ("length letter result  */
    /* 00;"). RESULT is sized to match the declared $ 1000 return value. */
    length letter $ 4 result $ 1000;
    eng_count = 0;
    rus_count = 0;
    len = klength(string);
    do i = 1 to len;
      letter = ksubstr(string, i, 1);
      if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                    "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                    "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                    "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
        then rus_count = rus_count + 1;
      /* Upper and lower case ranges are tested separately so that the   */
      /* ASCII punctuation between "Z" and "a" is not counted as English. */
      code = rank(letter);
      if (rank('A') <= code and code <= rank('Z'))
         or (rank('a') <= code and code <= rank('z'))
        then eng_count = eng_count + 1;
    end;
    /* KTRANSLATE(source, to, from): the 2nd argument lists the          */
    /* replacement characters, the 3rd the characters to be replaced.    */
    if rus_count >= eng_count
      /* Russian string: Latin look-alikes become Cyrillic. */
      then result = ktranslate(string,"АаВЕеКкМОоРрСсТХх","AaBEeKkMOoPpCcTXx");
      /* English string: Cyrillic look-alikes become Latin. */
      else result = ktranslate(string,"AaBEeKkMOoPpCcTXx","АаВЕеКкМОоРрСсТХх");
    return(result);
  endsub;
run;
/***********/
/* EXAMPLE */
/***********/
/* Makes the compiled functions visible to the DATA step, repairs a       */
/* surname that contains one Latin letter, and writes the results to the  */
/* log.                                                                   */
options cmplib=sasuser.userfuncs;
data _null_;
  /* reference spelling, all Cyrillic */
  reference = "Иванов";
  /* same surname with a Latin "a" as the third character */
  misprinted = "Ивaнов";

  /* compute everything first, report afterwards */
  repaired     = fix_layout_misprints(misprinted);
  n_cyrillic   = count_rus_letters(misprinted);
  n_english    = count_eng_letters(misprinted);
  russian_flag = is_string_russian(misprinted);

  put "Good string=" reference;
  put "Error string=" misprinted;
  put "Fixed string=" repaired;
  put "Count or Cyrillic symbols in error string=" n_cyrillic;
  put "Count or English symbols in error string=" n_english;
  put "Is error string language Russian=" russian_flag;

  if not (reference = misprinted) then
    put "Before clearing - strings are not equal to each other";
  if reference = repaired then
    put "After clearing - strings are equal to each other";
run;