Soundex 算法实现的输出对于案例是错误的 - "Tymczak" 和 "Pfister"
Output of Soundex algorithm implementation is wrong for cases - "Tymczak" and "Pfister"
当我根据Wikipedia article on Soundex测试算法Soundex
时,我发现Tymczak返回T520 ,不是 T522 和 Pfister 返回 P123,不是 P236 .
我不知道为什么输出不正确。
我的代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace ConsoleApplication4
{
class Program
{
static void Main(string[] args)
{
string s = "Tymczak";
string result = SoundexByWord(s);
Console.WriteLine(result);
}
private static string Soundex(string data)
{
string first = "pv";
string second = "xz";
string third = "dt";
string forth = "mn";
string fifth = "bf";
string sixth = "cgj";
string seventh = "kqs";
//ana ast5dtmt string builder 34an 3aml zy al list fy apend
// 34an a apend mn al a5r al string
StringBuilder result = new StringBuilder();
if (data != null && data.Length > 0)
{
string previousCode = "", currentCode = "", currentLetter = "";
// append on the string from the last
// get the first characheter of the string data
// append it on the result
//according to algorithm first charachter stays the same
result.Append(data.Substring(0, 1));
RemoveUnwantedChar(data);
for (int i = 1; i < data.Length; i++)
{
// nb2d al algorithm first take the second characheter in data
//n7wlo la lower
currentLetter = data.Substring(i, 1).ToLower();
currentCode = "";
// No string for zero because we will remove it
if (first.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "1";
else if (fifth.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "1";
else if (sixth.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "2";
else if (seventh.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "2";
else if (second.IndexOf(currentLetter) > -1)
currentCode = "2";
else if (third.IndexOf(currentLetter) > -1)
currentCode = "3";
else if (currentLetter == "l")
currentCode = "4";
else if (forth.IndexOf(currentLetter) > -1)
currentCode = "5";
else if (currentLetter == "r")
currentCode = "6";
if (currentCode != previousCode)
result.Append(currentCode);
if (result.Length == 4) break;
if (currentCode != "")
previousCode = currentCode;
}
}
if (result.Length < 4)
result.Append(new String('0', 4 - result.Length));
return result.ToString().ToUpper();
}
public static string RemoveUnwantedChar(string input)
{
return Regex.Replace(input, "aeiouyhw", "");
}
private static string SoundexByWord(string data)
{
var soundexes = new List<string>();
foreach (var str in data.Split(' '))
{
soundexes.Add(Soundex(str));
}
#if Net35OrLower
// string.Join in .Net 3.5 and
//before require the second parameter to be an array.
return string.Join(" ", soundexes.ToArray());
#endif
// string.Join in .Net 4 has an overload
//that takes IEnumerable<string>
return string.Join(" ", soundexes);
}
}
}
这不会告诉您代码哪里出了问题,甚至可能不是最快的解决方案,但它似乎使示例正确而且只有几行代码..
它实现了second version of the algorithm的六个步骤。
string Soundex(string input)
{
// character groups: the 1st one are vowels to remove
// the other groups are characters to replace by the group index
List<string> groups = new List<string>()
{ "aeiouy", "bfpv", "cgjkqsxz", "dt", "l", "mn", "r" };
// save the 1st character (1)
string first = input.Substring(0, 1);
string s = input.ToLower();
// remove unconditionally (1)
s = s.Replace("h", "").Replace("w", "");
// replace characters in all replacement groups (2)
for (int g = 1; g < groups.Count; g++)
for (int i = 0; i < groups[g].Length; i++)
s = s.Replace(groups[g][i], ((char)(g + (byte)'0')));
// replace repeating digits (3)
// NOTE: this step actually should be repeated until the length no longer changes!!!
for (int i = 1; i < 10; i++) s = s.Replace(i + "" + i, i + "");
// now remove characters from group 0: (4)
for (int i = 0; i < groups[0].Length; i++) s = s.Replace(groups[0][i].ToString(), "");
// remove the first if it is a digit (5)
if ( (s[0] >= '0') && (s[0] <= '9') ) s = s.Substring(1);
// add saved first to max 3 digits and pad if needed (6)
return (first + s.Substring(0, Math.Min(3, s.Length))).PadRight(4, '0');
}
当我根据Wikipedia article on Soundex测试算法Soundex
时,我发现Tymczak返回T520 ,不是 T522 和 Pfister 返回 P123,不是 P236 .
我不知道为什么输出不正确。
我的代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace ConsoleApplication4
{
class Program
{
static void Main(string[] args)
{
string s = "Tymczak";
string result = SoundexByWord(s);
Console.WriteLine(result);
}
private static string Soundex(string data)
{
string first = "pv";
string second = "xz";
string third = "dt";
string forth = "mn";
string fifth = "bf";
string sixth = "cgj";
string seventh = "kqs";
//ana ast5dtmt string builder 34an 3aml zy al list fy apend
// 34an a apend mn al a5r al string
StringBuilder result = new StringBuilder();
if (data != null && data.Length > 0)
{
string previousCode = "", currentCode = "", currentLetter = "";
// append on the string from the last
// get the first characheter of the string data
// append it on the result
//according to algorithm first charachter stays the same
result.Append(data.Substring(0, 1));
RemoveUnwantedChar(data);
for (int i = 1; i < data.Length; i++)
{
// nb2d al algorithm first take the second characheter in data
//n7wlo la lower
currentLetter = data.Substring(i, 1).ToLower();
currentCode = "";
// No string for zero because we will remove it
if (first.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "1";
else if (fifth.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "1";
else if (sixth.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "2";
else if (seventh.IndexOf(currentLetter) > -1)
//search for bfpv in the current letter return number
// -1 is out of string index
currentCode = "2";
else if (second.IndexOf(currentLetter) > -1)
currentCode = "2";
else if (third.IndexOf(currentLetter) > -1)
currentCode = "3";
else if (currentLetter == "l")
currentCode = "4";
else if (forth.IndexOf(currentLetter) > -1)
currentCode = "5";
else if (currentLetter == "r")
currentCode = "6";
if (currentCode != previousCode)
result.Append(currentCode);
if (result.Length == 4) break;
if (currentCode != "")
previousCode = currentCode;
}
}
if (result.Length < 4)
result.Append(new String('0', 4 - result.Length));
return result.ToString().ToUpper();
}
public static string RemoveUnwantedChar(string input)
{
return Regex.Replace(input, "aeiouyhw", "");
}
private static string SoundexByWord(string data)
{
var soundexes = new List<string>();
foreach (var str in data.Split(' '))
{
soundexes.Add(Soundex(str));
}
#if Net35OrLower
// string.Join in .Net 3.5 and
//before require the second parameter to be an array.
return string.Join(" ", soundexes.ToArray());
#endif
// string.Join in .Net 4 has an overload
//that takes IEnumerable<string>
return string.Join(" ", soundexes);
}
}
}
这不会告诉您代码哪里出了问题,甚至可能不是最快的解决方案,但它似乎使示例正确而且只有几行代码..
它实现了second version of the algorithm的六个步骤。
string Soundex(string input)
{
// character groups: the 1st one are vowels to remove
// the other groups are characters to replace by the group index
List<string> groups = new List<string>()
{ "aeiouy", "bfpv", "cgjkqsxz", "dt", "l", "mn", "r" };
// save the 1st character (1)
string first = input.Substring(0, 1);
string s = input.ToLower();
// remove unconditionally (1)
s = s.Replace("h", "").Replace("w", "");
// replace characters in all replacement groups (2)
for (int g = 1; g < groups.Count; g++)
for (int i = 0; i < groups[g].Length; i++)
s = s.Replace(groups[g][i], ((char)(g + (byte)'0')));
// replace repeating digits (3)
// NOTE: this step actually should be repeated until the length no longer changes!!!
for (int i = 1; i < 10; i++) s = s.Replace(i + "" + i, i + "");
// now remove characters from group 0: (4)
for (int i = 0; i < groups[0].Length; i++) s = s.Replace(groups[0][i].ToString(), "");
// remove the first if it is a digit (5)
if ( (s[0] >= '0') && (s[0] <= '9') ) s = s.Substring(1);
// add saved first to max 3 digits and pad if needed (6)
return (first + s.Substring(0, Math.Min(3, s.Length))).PadRight(4, '0');
}