如何根据 C# 运算符从源代码中标记语法

How to tokenize syntax from source code based on operators in C#

我正在读取 TextBox 中的所有行,我正在尝试删除列表中的所有空格。

我需要能够标记以下表达式:

if(x==0)
{
    cout<<x;
} 

拆分为以下标记:

if
(
x
==
0
)
{
cout
<<
x
;
} 

我的代码:

/// <summary>
/// Breaks <paramref name="sourceCode"/> into lexical tokens: identifiers and
/// numbers, multi-character operators (such as <c>==</c>, <c>&lt;&lt;</c>,
/// <c>&amp;&amp;</c>), and single punctuation characters. Whitespace is never
/// returned as a token.
/// </summary>
/// <param name="sourceCode">The text to tokenize.</param>
/// <returns>
/// The tokens in source order. The array is empty when the input contains
/// only whitespace or is empty.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="sourceCode"/> is <c>null</c>.
/// </exception>
/// <remarks>
/// Each token is also written to the console on its own line, as the
/// original implementation did.
/// </remarks>
public static string[] Tokenize(string sourceCode)
{
    // Match the tokens themselves rather than splitting on delimiters:
    // Regex.Split with a character class can only see one character at a
    // time, so "==" became "=", "", "=" and whitespace/empty entries leaked
    // into the result. Alternatives are ordered longest-first so "<<=" wins
    // over "<<", which wins over the single-character fallback \S.
    const string TokenPattern =
        @"\w+|<<=|>>=|==|!=|<=|>=|<<|>>|&&|\|\||\+\+|--|[+\-*/%&|^]=|\S";

    string[] tokens = Regex.Matches(sourceCode, TokenPattern)
        .Cast<Match>()
        .Select(m => m.Value)
        .ToArray();

    foreach (string token in tokens)
    {
        Console.WriteLine(token);
    }

    // Return exactly what was printed (the original re-ran Split here and
    // threw the cleaned-up list away).
    return tokens;
}

我的输出:

if(x
=

=
0)






{








 

 

 
cout
<

<
x
;







}

如何让 ==、<<、&& 这类多字符运算符作为一个整体被拆分出来?又该如何从列表中删除空白项?有没有更好的方法来实现我想要的效果?

我同意@juharr 的评论。 但是如果你真的想使用正则表达式,最好使用 Match 方法而不是 Split 因为它允许你指定要查找的标记而不是标记边界:

 // Describe the tokens themselves (not the boundaries between them) and
 // walk the successive matches, printing each one on its own line.
 var tokenPattern = new Regex(@"\w+|\(|\)|\++|-+|\*|%|,|;|&+|\|+|<+|>+|=+|!|\{|\}");
 for (Match token = tokenPattern.Match(sourceCode); token.Success; token = token.NextMatch())
 {
  Console.WriteLine(token.Value);
 }

结果:

if
(
x
==
0
)
{
cout
<<
x
;
}

你可以这样做:

// A single match whose repeated group 1 records every piece it consumed;
// the pieces are read back from that group's capture list, in order.
var tokenRegex = new Regex(@"([\p{L}_][\p{L}\p{N}_]*|[+-]?[0-9]+|==|!=|>=|<=|<<|>>|\|\||&&|[!=+\-*/%{}();]|\s+)*");
CaptureCollection pieces = tokenRegex.Match(str).Groups[1].Captures;
for (int i = 0; i < pieces.Count; i++)
{
    // Passing the Capture itself prints its matched text.
    Console.WriteLine(pieces[i]);
}

(与您的示例相比,我包含了许多其他运算符)。这仍然是个坏主意。

现在......这仍然是个坏主意,但你可以让它变得更复杂:

// Sample input: exercises keywords, operators, and char/string literals that
// contain backslash escapes.
string str = @"if(x==0)
{
    cout<<x;
    var x1 = '\a';
    var x2 = '\'';
    var x3 = 'X';
    var x4 = ""He\""llo\n"";
}";

// One entry per token kind. Name becomes the regex group name; Pattern is
// either a ready-made regex (Escape = false) or literal text that is passed
// through Regex.Escape (Escape = true).
var fragments = new[]
{
    // The order of these pattern is important! Longer patterns should go first (so += before + for example)
    new { Name = "Keyword", Pattern = @"(?:if|for|while|var|int|long|string|char|return)\b", Escape = false },
    new { Name = "Symbol", Pattern = @"[\p{L}_][\p{L}\p{N}_]*\b", Escape = false },
    // NOTE: because Number is tried before the operators, a leading sign is
    // absorbed into the number ("a+1" lexes as "a", "+1").
    new { Name = "Number", Pattern = @"[+-]?[0-9]+(?:\.[0-9]+)?\b", Escape = false },
    new { Name = "OperatorAssign", Pattern = @"<<=|>>=|&&=|\|\|=|[+\-*/%&|^]=", Escape = false },
    // Two-character operators come before the single-character class so that
    // "<<" and ">>" are not split into two "<" / ">" tokens.
    new { Name = "Operator", Pattern = @"==|!=|>=|<=|<<|>>|&&|\|\||[+\-*/%&|^!<>]", Escape = false },
    new { Name = "Space", Pattern = @"\s+", Escape = false },
    new { Name = "Assign", Pattern = @"=", Escape = true },
    new { Name = "OpenBrace", Pattern = @"{", Escape = true },
    new { Name = "CloseBrace", Pattern = @"}", Escape = true },
    new { Name = "Semicolon", Pattern = @";", Escape = true },
    new { Name = "OpenRoundParenthesis", Pattern = @"(", Escape = true },
    new { Name = "CloseRoundParenthesis", Pattern = @")", Escape = true },
    new { Name = "OpenSquareParenthesis", Pattern = @"[", Escape = true },
    new { Name = "CloseSquareParenthesis", Pattern = @"]", Escape = true },
    // A char literal is either a backslash escape (\\. = backslash + any
    // character) or one character that is neither a quote nor a backslash.
    new { Name = "Char", Pattern = @"'(?:\\.|[^'\\])'", Escape = false },
    // Same idea for strings: an escape pair is consumed as a unit, so an
    // escaped quote (\") does not terminate the literal.
    new { Name = "String", Pattern = @"""(?:\\.|[^""\\])*""", Escape = false },
};

// Combine every fragment into one alternation of named groups. \G anchors
// the match at the start index handed to Match, so no input can be skipped.
string allPatterns = string.Join("|", fragments.Select(x => $"(?<{x.Name}>{(x.Escape ? Regex.Escape(x.Pattern) : x.Pattern)})"));
var rx = new Regex(@"\G(?:" + allPatterns + ")");

int ix = 0;

while (ix < str.Length)
{
    var match = rx.Match(str, ix);

    if (!match.Success)
    {
        // Nothing matches at this position: report the remaining input and stop.
        Console.WriteLine($"Error starting at: {str.Substring(ix)}");
        break;
    }

    // Group 0 is the whole match; exactly one of the named groups after it
    // succeeded, and its name identifies the token kind.
    var group = match.Groups.OfType<Group>().Skip(1).Single(x => x.Success);

    string name = group.Name;
    string value = match.Value;

    if (name != "Space")
    {
        Console.WriteLine($"Match: {name}: {value}");
    }
    else
    {
        Console.WriteLine("Skipping some space");
    }

    // Every fragment consumes at least one character, so this always advances.
    ix += value.Length;
}