在 C# 中,解析此 WIKI 标记的最佳方法是什么?
In C#, what is the best way to parse this WIKI markup?
我需要从 WIKI 标记页面获取正在读取的数据并将其存储为 table 结构。我想弄清楚如何将以下标记语法正确解析为 C#
中的某些 table 数据结构
这是一个例子table:
|| Owner || Action || Status || Comments ||
| Bill | Fix the lobby | In Progress | This is easy |
| Joe | Fix the bathroom | In Progress | Plumbing \
\
Electric \
\
Painting \
\
\ |
| Scott | Fix the roof | Complete | This is expensive |
下面是它的直接输入方式:
|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|
如您所见:
- 表头(header)行的各列以“||”作为分隔符
- 数据行的各列以“|”作为分隔符
- 一行可能跨越多行(如上面的第二个数据行示例),所以我必须继续阅读,直到我遇到相同数量的“|” (cols) 我在 header 行中。
我试着逐行阅读,然后将中间有“\”的行连接起来,但这看起来有点老套。
我也试过简单地读入一个完整的字符串,然后只用“||”解析首先,然后继续阅读,直到我点击相同数量的“|”然后转到下一行。这似乎可行,但感觉可能有更优雅的方式使用正则表达式或类似的东西。
任何人都可以建议解析此数据的正确方法吗?
这做出了一些假设,但似乎适用于您的样本数据。我敢肯定,如果再多花些功夫,还可以把这些表达式合并并加以整理,但您应该能明白大致思路。
它还允许数据行的单元格数量与表头(header)不同,我认为 Confluence 是支持这种情况的。
// Parses Confluence-style wiki table markup held in `raw` into `table`:
// a list of rows, each row being a list of raw (untrimmed) cell strings.
// When a header line is found it becomes the first entry of `table`.
List<List<string>> table = new List<List<string>>();

// Header line: every "||" opens a header cell. The outer group is optional,
// so the match also succeeds (with zero captures) when there is no header.
var match = Regex.Match(raw, @"(?:(?:\|\|([^|]*))*\n)?");
if (match.Success)
{
    // Materialize the captures once; the original re-ran the deferred query
    // for both Count() and Take().
    var headersWithExtra = match.Groups[1].Captures.Cast<Capture>().Select(c => c.Value).ToList();
    // The capture that follows the closing "||" is only the remainder of the
    // line, not a header cell, so it is dropped.
    List<String> headerRow = headersWithExtra.Take(headersWithExtra.Count - 1).ToList();
    if (headerRow.Count > 0)
    {
        table.Add(headerRow);
    }
}

// Data rows: skip the first line, then capture the text after every "|".
// A line break is appended so the last row is always followed by one.
match = Regex.Match(raw + "\r\n", @"[^\n]*\n" + @"(?:\|([^|]*))*");
var cellsWithExtra = match.Groups[1].Captures.Cast<Capture>().Select(c => c.Value);
List<string> row = new List<string>();
foreach (string cell in cellsWithExtra)
{
    // The capture after a row's closing "|" contains nothing but the line
    // break(s) leading to the next row. Accept "\n" as well as "\r\n", and
    // several breaks in a row, so that LF-only input and input that already
    // ends with a newline no longer lose rows.
    bool endsRow = cell.IndexOf('\n') >= 0
        && cell.Trim(' ', '\t', '\r', '\n').Length == 0;
    if (endsRow)
    {
        // `row` is always a freshly created list here, so the original
        // reference-based table.Contains(row) check could never be true.
        if (row.Count > 0)
        {
            table.Add(row);
        }
        row = new List<string>();
    }
    else
    {
        row.Add(cell);
    }
}
由于您编辑后输入的格式与之前发布的格式大不相同,因此我已在很大程度上替换了之前的答案。这导致了一个稍微不同的解决方案。
因为每一行之后不再有任何换行符,所以确定行结束位置的唯一方法,是要求每一行的列数与表头(header)的列数相同。至少在您不想依赖某种可能很脆弱的空白(whitespace)约定时是这样——该约定仅存在于您提供的唯一一个示例字符串中(即行分隔符是唯一一个
前面没有空格的“|”)。至少您的问题并没有把这一点规定为行分隔符的规范。
下面的 "parser" 至少提供了可以从您的格式规范和示例字符串派生的错误处理有效性检查,并且还允许 table 没有行。评论解释了它在基本步骤中所做的事情。
/// <summary>
/// Parses a Confluence-style wiki table given as one string: header cells are
/// delimited by "||", row cells by "|", and a "\" inside a cell separates the
/// lines of a multi-line cell.
/// </summary>
public class TableParser
{
    const StringSplitOptions SplitOpts = StringSplitOptions.None;
    const string RowColSep = "|";
    static readonly string[] HeaderColSplit = { "||" };
    static readonly string[] RowColSplit = { RowColSep };
    static readonly string[] MLColSplit = { @"\" };

    /// <summary>One data row; each cell is the array of its trimmed lines.</summary>
    public class TableRow
    {
        public List<string[]> Cells;
    }

    /// <summary>Parsed table: trimmed header names plus the data rows.</summary>
    public class Table
    {
        public string[] Header;
        public TableRow[] Rows;
    }

    /// <summary>
    /// Parses <paramref name="text"/> into a <see cref="Table"/>.
    /// </summary>
    /// <param name="text">The wiki markup, starting with the "||"-delimited header.</param>
    /// <returns>The header and rows; <c>Rows</c> is empty when the input has no rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">
    /// No "||" is present, or the row cells cannot be grouped into rows that
    /// each have as many cells as the header.
    /// </exception>
    public static Table Parse(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Isolate the header columns and the rows remainder.
        var headerSplit = text.Split(HeaderColSplit, SplitOpts);
        Ensure(headerSplit.Length > 1, "At least 1 header column is required in the input");

        // The last segment is whatever follows the final "||".
        var remainder = headerSplit[headerSplit.Length - 1];
        // Need to check whether there are any rows.
        var hasRows = remainder.IndexOf(RowColSep, StringComparison.Ordinal) >= 0;

        // With rows, the last segment is the rows remainder and is not a header.
        // Without rows, a blank last segment is just the text after the closing
        // "||"; keeping it used to produce a phantom empty header column for
        // input such as "|| A || B ||". A non-blank last segment (unterminated
        // header such as "|| A || B") is still kept as a header name.
        var dropLast = hasRows || (headerSplit.Length > 2 && remainder.Trim().Length == 0);
        var header = headerSplit.Skip(1)
                                .Take(headerSplit.Length - (dropLast ? 2 : 1))
                                .Select(c => c.Trim())
                                .ToArray();

        if (!hasRows) // If no rows for this table, we are done.
        {
            return new Table() { Header = header, Rows = new TableRow[0] };
        }

        // Get all row columns from the remainder.
        var rowsCols = remainder.Split(RowColSplit, SplitOpts);

        // Each row contributes its cells plus one separator segment, and one
        // extra segment precedes the first "|"; hence the "+ 1" and "== 1".
        Ensure((rowsCols.Length % (header.Length + 1)) == 1,
            "The number of row colums does not match the number of header columns");
        var rows = new TableRow[(rowsCols.Length - 1) / (header.Length + 1)];

        // Fill rows by sequentially taking # header column cells; `start`
        // skips the separator segment in front of every row.
        for (int ri = 0, start = 1; ri < rows.Length; ri++, start += header.Length + 1)
        {
            rows[ri] = new TableRow()
            {
                Cells = rowsCols.Skip(start).Take(header.Length)
                                .Select(c => c.Split(MLColSplit, SplitOpts).Select(p => p.Trim()).ToArray())
                                .ToList()
            };
        }

        return new Table { Header = header, Rows = rows };
    }

    /// <summary>Throws <see cref="InvalidDataException"/> with the message when the check fails.</summary>
    private static void Ensure(bool check, string errorMsg)
    {
        if (!check)
        {
            throw new InvalidDataException(errorMsg);
        }
    }
}
像这样使用时:
/// <summary>
/// Demo: parses the sample wiki line and writes the header followed by one
/// console line per row.
/// </summary>
public static void Main(params string[] args)
{
    var wikiLine = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";
    var table = TableParser.Parse(wikiLine);

    var headerLine = string.Join(", ", table.Header);
    Console.WriteLine(headerLine);

    // The lines of a multi-line cell are joined with a line break followed by
    // this marker, so every "\" of the input shows up as a new "# " line.
    var lineJoiner = Environment.NewLine + "\t# ";
    foreach (var r in table.Rows)
    {
        var renderedCells = r.Cells.Select(c => string.Join(lineJoiner, c));
        Console.WriteLine(string.Join(", ", renderedCells));
    }
}
它将产生以下输出:
其中 "\t# "
表示输入中存在 \
引起的换行符。
这是一个填充数据表的解决方案。它确实需要一点数据处理 (Trim),但主要的解析是 Splits 和 Linq。
var str = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

// The last "||" closes the header; everything before it is header text.
var headerStop = str.LastIndexOf("||");
string headerText = str.Substring(0, headerStop);
// Skip the 4 characters "|| |" that separate the header from the first row,
// then strip the trailing "|" / spaces of the last row.
string recordText = str.Substring(headerStop + 4).TrimEnd(' ', '|');

string[] headerDelimiter = { "||" };
string[] recordDelimiter = { "| |" };
// The segment before the first "||" is empty, hence Skip(1).
var headers = headerText.Split(headerDelimiter, StringSplitOptions.None).Skip(1).ToList();
var records = recordText.Split(recordDelimiter, StringSplitOptions.None).ToList();

var tbl = new DataTable();
foreach (var h in headers)
{
    tbl.Columns.Add(h.Trim());
}
// Cell values are stored exactly as split (not trimmed).
foreach (var r in records)
{
    tbl.Rows.Add(r.Split('|'));
}
一次读取一个字符的输入字符串,并使用状态机来决定应该对每个输入字符执行什么操作。这种方法可能需要更多的代码,但它比正则表达式更容易维护和扩展。
这与 Jon Tirjan 的回答非常相似,尽管它将 LINQ 削减为单个语句(替换最后一个语句的代码非常丑陋)并且可扩展性更强一些。例如,它会将 Confluence 换行符 \
替换为您选择的字符串,您可以选择 trim 或不 trim 元素周围的空格等
/// <summary>
/// Parses a single-line Confluence-style wiki table and prints the header and
/// every row through <c>TestPrint</c>.
/// </summary>
/// <param name="input">The markup: a "||"-delimited header followed by rows separated by "| |".</param>
/// <param name="newLineReplacement">
/// Replacement for every run of two or more characters that are spaces or
/// backslashes (this is how the "\" line-break markers are removed).
/// </param>
/// <exception cref="ArgumentException">The input contains no "||" header delimiter.</exception>
private void ParseWikiTable(string input, string newLineReplacement = " ")
{
    string separatorHeader = "||";
    string separatorRow = "| |";
    string separatorElement = "|";

    // The backslash has to be escaped inside the character class. The former
    // pattern "[ \]{2,}" escaped the closing bracket instead, which left the
    // class unterminated and made Regex.Replace throw on every call.
    input = Regex.Replace(input, @"[ \\]{2,}", newLineReplacement);

    // The last "||" closes the header row; locate it once.
    int headerEnd = input.LastIndexOf(separatorHeader, StringComparison.Ordinal);
    if (headerEnd < 0)
    {
        throw new ArgumentException("Input does not contain a \"||\"-delimited header row.", "input");
    }

    string inputHeader = input.Substring(0, headerEnd);
    string inputContent = input.Substring(headerEnd + separatorHeader.Length);

    string[] headerArray = SimpleSplit(inputHeader, separatorHeader);
    // Split into rows first, then split each row into its cells.
    string[][] rowArray = SimpleSplit(inputContent, separatorRow)
        .Select(r => SimpleSplit(r, separatorElement))
        .ToArray();

    // do something with output data
    TestPrint(headerArray);
    foreach (var r in rowArray)
    {
        TestPrint(r);
    }
}
/// <summary>
/// Splits <paramref name="input"/> on <paramref name="separator"/> after
/// trimming it and removing one leading and one trailing separator.
/// </summary>
/// <param name="input">Text to split.</param>
/// <param name="separator">Delimiter between segments.</param>
/// <param name="trimWhitespace">When true, every returned segment is trimmed.</param>
/// <returns>The segments in input order; empty segments are kept.</returns>
private string[] SimpleSplit(string input, string separator, bool trimWhitespace = true)
{
    input = input.Trim();

    // Ordinal comparison keeps these checks consistent with String.Split,
    // which always matches separators ordinally. The culture-sensitive
    // default may ignore certain characters, so it could report a match
    // whose length differs from separator.Length.
    if (input.StartsWith(separator, StringComparison.Ordinal))
    {
        input = input.Substring(separator.Length);
    }
    if (input.EndsWith(separator, StringComparison.Ordinal))
    {
        input = input.Substring(0, input.Length - separator.Length);
    }

    string[] segments = input.Split(new string[] { separator }, StringSplitOptions.None);
    if (trimWhitespace)
    {
        for (int i = 0; i < segments.Length; i++)
        {
            segments[i] = segments[i].Trim();
        }
    }
    return segments;
}
/// <summary>
/// Writes the items to the console on one line, joined with "::" and wrapped
/// in square brackets.
/// </summary>
private void TestPrint(string[] lst)
{
    Console.WriteLine(string.Concat("[", String.Join("::", lst), "]"));
}
直接输入字符串的控制台输出:
[Owner::Action::Status::Comments]
[Bill::fix the lobby::In Progress::This is eary]
[Joe::fix the bathroom::In progress::plumbing Electric Painting]
[Scott::fix the roof::Complete::this is expensive]
填充数据表的通用正则表达式解决方案,语法有点灵活。
var text = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

// Header cells: "||", optional whitespace, then text that starts with a word
// character and runs up to the next "|".
var headerPattern = new Regex(@"\|\|\s*(\w[^\|]+)", RegexOptions.Compiled);
var headers = headerPattern.Matches(text);

// A row is exactly as many consecutive "|cell" groups as there are headers;
// {{ and }} are String.Format escapes for the literal quantifier braces.
var rowPattern = new Regex(String.Format(@"(?:\|\s*(\w[^\|]+)){{{0}}}", headers.Count));
var rows = rowPattern.Matches(text);

var tbl = new DataTable();
for (int h = 0; h < headers.Count; h++)
{
    tbl.Columns.Add(headers[h].Groups[1].Value);
}
for (int r = 0; r < rows.Count; r++)
{
    // Group 1 was matched once per cell, so its captures are the row's cells.
    CaptureCollection cellCaptures = rows[r].Groups[1].Captures;
    var values = new string[cellCaptures.Count];
    for (int c = 0; c < values.Length; c++)
    {
        values[c] = cellCaptures[c].Value;
    }
    tbl.Rows.Add(values);
}
这是一个涉及正则表达式的解决方案。它采用单个字符串作为输入,并返回一个表头(headers)列表和一个由行/列组成的列表(List<List<string>>)。它还会修剪空白(whitespace),这可能是也可能不是所需的行为,因此请注意这一点。它甚至可以把结果整齐地打印出来 :)
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace parseWiki
{
/// <summary>
/// Console demo that splits a one-line wiki table into header and body cells
/// with regular expressions and prints them in fixed-width columns.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        string content = @"|| Owner || Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

        // Remove every "\" line-break marker before parsing.
        content = content.Replace(@"\", "");

        // Everything up to and including the last "||" is the header part;
        // the remainder holds the row cells.
        int headerEnd = content.LastIndexOf("||") + 2;
        string headerContent = content.Substring(0, headerEnd);
        string cellContent = content.Substring(headerEnd);

        // The lookaheads leave the closing delimiter unconsumed so that it
        // can open the next cell.
        Regex headerPattern = new Regex(@"\|\|([^|]*)(?=\|\|)", RegexOptions.Singleline);
        Regex cellPattern = new Regex(@"\|([^|]*)(?=\|)", RegexOptions.Singleline);
        MatchCollection headerMatches = headerPattern.Matches(headerContent);
        MatchCollection cellMatches = cellPattern.Matches(cellContent);

        List<string> headers = new List<string>();
        foreach (Match headerMatch in headerMatches)
        {
            if (headerMatch.Groups.Count <= 1)
            {
                continue;
            }
            headers.Add(headerMatch.Groups[1].Value.Trim());
        }

        List<List<string>> body = new List<List<string>>();
        List<string> newRow = new List<string>();
        foreach (Match cellMatch in cellMatches)
        {
            bool rowIsFull = newRow.Count > 0 && newRow.Count % headers.Count == 0;
            if (!rowIsFull)
            {
                newRow.Add(cellMatch.Groups[1].Value.Trim());
                continue;
            }

            // The match seen while the row is full is not stored: it only
            // closes the current row and starts a new one.
            body.Add(newRow);
            newRow = new List<string>();
        }
        // The last row is still pending when the loop ends.
        body.Add(newRow);

        PrintTable(headers, body);
    }

    /// <summary>
    /// Writes the headers, a dashed rule and every body row to the console,
    /// truncating and padding each value to a fixed cell width, then waits
    /// for a key press.
    /// </summary>
    static void PrintTable(List<string> headers, List<List<string>> body)
    {
        const int cellWidth = 20;

        foreach (string header in headers)
        {
            Console.Write(header.Truncate(cellWidth).PadRight(cellWidth) + " ");
        }
        Console.WriteLine("\n" + new string('-', (cellWidth + 2) * headers.Count));

        foreach (List<string> row in body)
        {
            foreach (string cell in row)
            {
                Console.Write(cell.Truncate(cellWidth).PadRight(cellWidth) + " ");
            }
            Console.WriteLine();
        }

        Console.WriteLine("\n\n\n");
        Console.ReadKey(false);
    }
}
/// <summary>String helpers used when printing table cells.</summary>
public static class StringExt
{
    /// <summary>
    /// Shortens <paramref name="value"/> to at most <paramref name="maxLength"/>
    /// characters, ending the result with "..." when text was cut off.
    /// </summary>
    /// <param name="value">Text to shorten; null or empty is returned unchanged.</param>
    /// <param name="maxLength">Maximum length of the result.</param>
    /// <returns>
    /// <paramref name="value"/> itself when it already fits. Otherwise the
    /// first <c>maxLength - 3</c> characters followed by "...", or a plain cut
    /// to <paramref name="maxLength"/> characters when there is no room for
    /// the ellipsis.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxLength"/> is negative and <paramref name="value"/> is not empty.
    /// </exception>
    public static string Truncate(this string value, int maxLength)
    {
        if (string.IsNullOrEmpty(value) || value.Length <= maxLength)
        {
            return value;
        }
        if (maxLength < 0)
        {
            throw new ArgumentOutOfRangeException("maxLength", "maxLength must not be negative.");
        }

        const string ellipsis = "...";
        // With fewer than 3 characters available, "maxLength - 3" is negative
        // and Substring used to throw; cut without an ellipsis instead.
        if (maxLength < ellipsis.Length)
        {
            return value.Substring(0, maxLength);
        }
        return value.Substring(0, maxLength - ellipsis.Length) + ellipsis;
    }
}
}
我需要从 WIKI 标记页面获取正在读取的数据并将其存储为 table 结构。我想弄清楚如何将以下标记语法正确解析为 C#
中的某些 table 数据结构这是一个例子table:
|| Owner || Action || Status || Comments ||
| Bill | Fix the lobby | In Progress | This is easy |
| Joe | Fix the bathroom | In Progress | Plumbing \
\
Electric \
\
Painting \
\
\ |
| Scott | Fix the roof | Complete | This is expensive |
下面是它的直接输入方式:
|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|
如您所见:
- 表头(header)行的各列以“||”作为分隔符
- 数据行的各列以“|”作为分隔符
- 一行可能跨越多行(如上面的第二个数据行示例),所以我必须继续阅读,直到我遇到相同数量的“|” (cols) 我在 header 行中。
我试着逐行阅读,然后将中间有“\”的行连接起来,但这看起来有点老套。
我也试过简单地读入一个完整的字符串,然后只用“||”解析首先,然后继续阅读,直到我点击相同数量的“|”然后转到下一行。这似乎可行,但感觉可能有更优雅的方式使用正则表达式或类似的东西。
任何人都可以建议解析此数据的正确方法吗?
这做出了一些假设,但似乎适用于您的样本数据。我敢肯定,如果再多花些功夫,还可以把这些表达式合并并加以整理,但您应该能明白大致思路。它还允许数据行的单元格数量与表头(header)不同,我认为 Confluence 是支持这种情况的。
// Parses Confluence-style wiki table markup held in `raw` into `table`:
// a list of rows, each row being a list of raw (untrimmed) cell strings.
// When a header line is found it becomes the first entry of `table`.
List<List<string>> table = new List<List<string>>();

// Header line: every "||" opens a header cell. The outer group is optional,
// so the match also succeeds (with zero captures) when there is no header.
var match = Regex.Match(raw, @"(?:(?:\|\|([^|]*))*\n)?");
if (match.Success)
{
    // Materialize the captures once; the original re-ran the deferred query
    // for both Count() and Take().
    var headersWithExtra = match.Groups[1].Captures.Cast<Capture>().Select(c => c.Value).ToList();
    // The capture that follows the closing "||" is only the remainder of the
    // line, not a header cell, so it is dropped.
    List<String> headerRow = headersWithExtra.Take(headersWithExtra.Count - 1).ToList();
    if (headerRow.Count > 0)
    {
        table.Add(headerRow);
    }
}

// Data rows: skip the first line, then capture the text after every "|".
// A line break is appended so the last row is always followed by one.
match = Regex.Match(raw + "\r\n", @"[^\n]*\n" + @"(?:\|([^|]*))*");
var cellsWithExtra = match.Groups[1].Captures.Cast<Capture>().Select(c => c.Value);
List<string> row = new List<string>();
foreach (string cell in cellsWithExtra)
{
    // The capture after a row's closing "|" contains nothing but the line
    // break(s) leading to the next row. Accept "\n" as well as "\r\n", and
    // several breaks in a row, so that LF-only input and input that already
    // ends with a newline no longer lose rows.
    bool endsRow = cell.IndexOf('\n') >= 0
        && cell.Trim(' ', '\t', '\r', '\n').Length == 0;
    if (endsRow)
    {
        // `row` is always a freshly created list here, so the original
        // reference-based table.Contains(row) check could never be true.
        if (row.Count > 0)
        {
            table.Add(row);
        }
        row = new List<string>();
    }
    else
    {
        row.Add(cell);
    }
}
由于您编辑后输入的格式与之前发布的格式大不相同,因此我已在很大程度上替换了之前的答案。这导致了一个稍微不同的解决方案。
因为每一行之后不再有任何换行符,所以确定行结束位置的唯一方法,是要求每一行的列数与表头(header)的列数相同。至少在您不想依赖某种可能很脆弱的空白(whitespace)约定时是这样——该约定仅存在于您提供的唯一一个示例字符串中(即行分隔符是唯一一个
前面没有空格的“|”)。至少您的问题并没有把这一点规定为行分隔符的规范。
下面的 "parser" 至少提供了可以从您的格式规范和示例字符串派生的错误处理有效性检查,并且还允许 table 没有行。评论解释了它在基本步骤中所做的事情。
/// <summary>
/// Parses a Confluence-style wiki table given as one string: header cells are
/// delimited by "||", row cells by "|", and a "\" inside a cell separates the
/// lines of a multi-line cell.
/// </summary>
public class TableParser
{
    const StringSplitOptions SplitOpts = StringSplitOptions.None;
    const string RowColSep = "|";
    static readonly string[] HeaderColSplit = { "||" };
    static readonly string[] RowColSplit = { RowColSep };
    static readonly string[] MLColSplit = { @"\" };

    /// <summary>One data row; each cell is the array of its trimmed lines.</summary>
    public class TableRow
    {
        public List<string[]> Cells;
    }

    /// <summary>Parsed table: trimmed header names plus the data rows.</summary>
    public class Table
    {
        public string[] Header;
        public TableRow[] Rows;
    }

    /// <summary>
    /// Parses <paramref name="text"/> into a <see cref="Table"/>.
    /// </summary>
    /// <param name="text">The wiki markup, starting with the "||"-delimited header.</param>
    /// <returns>The header and rows; <c>Rows</c> is empty when the input has no rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="InvalidDataException">
    /// No "||" is present, or the row cells cannot be grouped into rows that
    /// each have as many cells as the header.
    /// </exception>
    public static Table Parse(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException("text");
        }

        // Isolate the header columns and the rows remainder.
        var headerSplit = text.Split(HeaderColSplit, SplitOpts);
        Ensure(headerSplit.Length > 1, "At least 1 header column is required in the input");

        // The last segment is whatever follows the final "||".
        var remainder = headerSplit[headerSplit.Length - 1];
        // Need to check whether there are any rows.
        var hasRows = remainder.IndexOf(RowColSep, StringComparison.Ordinal) >= 0;

        // With rows, the last segment is the rows remainder and is not a header.
        // Without rows, a blank last segment is just the text after the closing
        // "||"; keeping it used to produce a phantom empty header column for
        // input such as "|| A || B ||". A non-blank last segment (unterminated
        // header such as "|| A || B") is still kept as a header name.
        var dropLast = hasRows || (headerSplit.Length > 2 && remainder.Trim().Length == 0);
        var header = headerSplit.Skip(1)
                                .Take(headerSplit.Length - (dropLast ? 2 : 1))
                                .Select(c => c.Trim())
                                .ToArray();

        if (!hasRows) // If no rows for this table, we are done.
        {
            return new Table() { Header = header, Rows = new TableRow[0] };
        }

        // Get all row columns from the remainder.
        var rowsCols = remainder.Split(RowColSplit, SplitOpts);

        // Each row contributes its cells plus one separator segment, and one
        // extra segment precedes the first "|"; hence the "+ 1" and "== 1".
        Ensure((rowsCols.Length % (header.Length + 1)) == 1,
            "The number of row colums does not match the number of header columns");
        var rows = new TableRow[(rowsCols.Length - 1) / (header.Length + 1)];

        // Fill rows by sequentially taking # header column cells; `start`
        // skips the separator segment in front of every row.
        for (int ri = 0, start = 1; ri < rows.Length; ri++, start += header.Length + 1)
        {
            rows[ri] = new TableRow()
            {
                Cells = rowsCols.Skip(start).Take(header.Length)
                                .Select(c => c.Split(MLColSplit, SplitOpts).Select(p => p.Trim()).ToArray())
                                .ToList()
            };
        }

        return new Table { Header = header, Rows = rows };
    }

    /// <summary>Throws <see cref="InvalidDataException"/> with the message when the check fails.</summary>
    private static void Ensure(bool check, string errorMsg)
    {
        if (!check)
        {
            throw new InvalidDataException(errorMsg);
        }
    }
}
像这样使用时:
/// <summary>
/// Demo: parses the sample wiki line and writes the header followed by one
/// console line per row.
/// </summary>
public static void Main(params string[] args)
{
    var wikiLine = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";
    var table = TableParser.Parse(wikiLine);

    var headerLine = string.Join(", ", table.Header);
    Console.WriteLine(headerLine);

    // The lines of a multi-line cell are joined with a line break followed by
    // this marker, so every "\" of the input shows up as a new "# " line.
    var lineJoiner = Environment.NewLine + "\t# ";
    foreach (var r in table.Rows)
    {
        var renderedCells = r.Cells.Select(c => string.Join(lineJoiner, c));
        Console.WriteLine(string.Join(", ", renderedCells));
    }
}
它将产生以下输出:
其中 "\t# "
表示输入中存在 \
引起的换行符。
这是一个填充数据表的解决方案。它确实需要一点数据处理 (Trim),但主要的解析是 Splits 和 Linq。
var str = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

// The last "||" closes the header; everything before it is header text.
var headerStop = str.LastIndexOf("||");
string headerText = str.Substring(0, headerStop);
// Skip the 4 characters "|| |" that separate the header from the first row,
// then strip the trailing "|" / spaces of the last row.
string recordText = str.Substring(headerStop + 4).TrimEnd(' ', '|');

string[] headerDelimiter = { "||" };
string[] recordDelimiter = { "| |" };
// The segment before the first "||" is empty, hence Skip(1).
var headers = headerText.Split(headerDelimiter, StringSplitOptions.None).Skip(1).ToList();
var records = recordText.Split(recordDelimiter, StringSplitOptions.None).ToList();

var tbl = new DataTable();
foreach (var h in headers)
{
    tbl.Columns.Add(h.Trim());
}
// Cell values are stored exactly as split (not trimmed).
foreach (var r in records)
{
    tbl.Rows.Add(r.Split('|'));
}
一次读取一个字符的输入字符串,并使用状态机来决定应该对每个输入字符执行什么操作。这种方法可能需要更多的代码,但它比正则表达式更容易维护和扩展。
这与 Jon Tirjan 的回答非常相似,尽管它将 LINQ 削减为单个语句(替换最后一个语句的代码非常丑陋)并且可扩展性更强一些。例如,它会将 Confluence 换行符 \
替换为您选择的字符串,您可以选择 trim 或不 trim 元素周围的空格等
/// <summary>
/// Parses a single-line Confluence-style wiki table and prints the header and
/// every row through <c>TestPrint</c>.
/// </summary>
/// <param name="input">The markup: a "||"-delimited header followed by rows separated by "| |".</param>
/// <param name="newLineReplacement">
/// Replacement for every run of two or more characters that are spaces or
/// backslashes (this is how the "\" line-break markers are removed).
/// </param>
/// <exception cref="ArgumentException">The input contains no "||" header delimiter.</exception>
private void ParseWikiTable(string input, string newLineReplacement = " ")
{
    string separatorHeader = "||";
    string separatorRow = "| |";
    string separatorElement = "|";

    // The backslash has to be escaped inside the character class. The former
    // pattern "[ \]{2,}" escaped the closing bracket instead, which left the
    // class unterminated and made Regex.Replace throw on every call.
    input = Regex.Replace(input, @"[ \\]{2,}", newLineReplacement);

    // The last "||" closes the header row; locate it once.
    int headerEnd = input.LastIndexOf(separatorHeader, StringComparison.Ordinal);
    if (headerEnd < 0)
    {
        throw new ArgumentException("Input does not contain a \"||\"-delimited header row.", "input");
    }

    string inputHeader = input.Substring(0, headerEnd);
    string inputContent = input.Substring(headerEnd + separatorHeader.Length);

    string[] headerArray = SimpleSplit(inputHeader, separatorHeader);
    // Split into rows first, then split each row into its cells.
    string[][] rowArray = SimpleSplit(inputContent, separatorRow)
        .Select(r => SimpleSplit(r, separatorElement))
        .ToArray();

    // do something with output data
    TestPrint(headerArray);
    foreach (var r in rowArray)
    {
        TestPrint(r);
    }
}
/// <summary>
/// Splits <paramref name="input"/> on <paramref name="separator"/> after
/// trimming it and removing one leading and one trailing separator.
/// </summary>
/// <param name="input">Text to split.</param>
/// <param name="separator">Delimiter between segments.</param>
/// <param name="trimWhitespace">When true, every returned segment is trimmed.</param>
/// <returns>The segments in input order; empty segments are kept.</returns>
private string[] SimpleSplit(string input, string separator, bool trimWhitespace = true)
{
    input = input.Trim();

    // Ordinal comparison keeps these checks consistent with String.Split,
    // which always matches separators ordinally. The culture-sensitive
    // default may ignore certain characters, so it could report a match
    // whose length differs from separator.Length.
    if (input.StartsWith(separator, StringComparison.Ordinal))
    {
        input = input.Substring(separator.Length);
    }
    if (input.EndsWith(separator, StringComparison.Ordinal))
    {
        input = input.Substring(0, input.Length - separator.Length);
    }

    string[] segments = input.Split(new string[] { separator }, StringSplitOptions.None);
    if (trimWhitespace)
    {
        for (int i = 0; i < segments.Length; i++)
        {
            segments[i] = segments[i].Trim();
        }
    }
    return segments;
}
/// <summary>
/// Writes the items to the console on one line, joined with "::" and wrapped
/// in square brackets.
/// </summary>
private void TestPrint(string[] lst)
{
    Console.WriteLine(string.Concat("[", String.Join("::", lst), "]"));
}
直接输入字符串的控制台输出:
[Owner::Action::Status::Comments]
[Bill::fix the lobby::In Progress::This is eary]
[Joe::fix the bathroom::In progress::plumbing Electric Painting]
[Scott::fix the roof::Complete::this is expensive]
填充数据表的通用正则表达式解决方案,语法有点灵活。
var text = @"|| Owner|| Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

// Header cells: "||", optional whitespace, then text that starts with a word
// character and runs up to the next "|".
var headerPattern = new Regex(@"\|\|\s*(\w[^\|]+)", RegexOptions.Compiled);
var headers = headerPattern.Matches(text);

// A row is exactly as many consecutive "|cell" groups as there are headers;
// {{ and }} are String.Format escapes for the literal quantifier braces.
var rowPattern = new Regex(String.Format(@"(?:\|\s*(\w[^\|]+)){{{0}}}", headers.Count));
var rows = rowPattern.Matches(text);

var tbl = new DataTable();
for (int h = 0; h < headers.Count; h++)
{
    tbl.Columns.Add(headers[h].Groups[1].Value);
}
for (int r = 0; r < rows.Count; r++)
{
    // Group 1 was matched once per cell, so its captures are the row's cells.
    CaptureCollection cellCaptures = rows[r].Groups[1].Captures;
    var values = new string[cellCaptures.Count];
    for (int c = 0; c < values.Length; c++)
    {
        values[c] = cellCaptures[c].Value;
    }
    tbl.Rows.Add(values);
}
这是一个涉及正则表达式的解决方案。它采用单个字符串作为输入,并返回一个表头(headers)列表和一个由行/列组成的列表(List<List<string>>)。它还会修剪空白(whitespace),这可能是也可能不是所需的行为,因此请注意这一点。它甚至可以把结果整齐地打印出来 :)
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace parseWiki
{
/// <summary>
/// Console demo that splits a one-line wiki table into header and body cells
/// with regular expressions and prints them in fixed-width columns.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        string content = @"|| Owner || Action || Status || Comments || | Bill\ | fix the lobby |In Progress | This is eary| | Joe\ |fix the bathroom\ | In progress| plumbing \Electric \Painting \ \ | | Scott \ | fix the roof \ | Complete | this is expensive|";

        // Remove every "\" line-break marker before parsing.
        content = content.Replace(@"\", "");

        // Everything up to and including the last "||" is the header part;
        // the remainder holds the row cells.
        int headerEnd = content.LastIndexOf("||") + 2;
        string headerContent = content.Substring(0, headerEnd);
        string cellContent = content.Substring(headerEnd);

        // The lookaheads leave the closing delimiter unconsumed so that it
        // can open the next cell.
        Regex headerPattern = new Regex(@"\|\|([^|]*)(?=\|\|)", RegexOptions.Singleline);
        Regex cellPattern = new Regex(@"\|([^|]*)(?=\|)", RegexOptions.Singleline);
        MatchCollection headerMatches = headerPattern.Matches(headerContent);
        MatchCollection cellMatches = cellPattern.Matches(cellContent);

        List<string> headers = new List<string>();
        foreach (Match headerMatch in headerMatches)
        {
            if (headerMatch.Groups.Count <= 1)
            {
                continue;
            }
            headers.Add(headerMatch.Groups[1].Value.Trim());
        }

        List<List<string>> body = new List<List<string>>();
        List<string> newRow = new List<string>();
        foreach (Match cellMatch in cellMatches)
        {
            bool rowIsFull = newRow.Count > 0 && newRow.Count % headers.Count == 0;
            if (!rowIsFull)
            {
                newRow.Add(cellMatch.Groups[1].Value.Trim());
                continue;
            }

            // The match seen while the row is full is not stored: it only
            // closes the current row and starts a new one.
            body.Add(newRow);
            newRow = new List<string>();
        }
        // The last row is still pending when the loop ends.
        body.Add(newRow);

        PrintTable(headers, body);
    }

    /// <summary>
    /// Writes the headers, a dashed rule and every body row to the console,
    /// truncating and padding each value to a fixed cell width, then waits
    /// for a key press.
    /// </summary>
    static void PrintTable(List<string> headers, List<List<string>> body)
    {
        const int cellWidth = 20;

        foreach (string header in headers)
        {
            Console.Write(header.Truncate(cellWidth).PadRight(cellWidth) + " ");
        }
        Console.WriteLine("\n" + new string('-', (cellWidth + 2) * headers.Count));

        foreach (List<string> row in body)
        {
            foreach (string cell in row)
            {
                Console.Write(cell.Truncate(cellWidth).PadRight(cellWidth) + " ");
            }
            Console.WriteLine();
        }

        Console.WriteLine("\n\n\n");
        Console.ReadKey(false);
    }
}
/// <summary>String helpers used when printing table cells.</summary>
public static class StringExt
{
    /// <summary>
    /// Shortens <paramref name="value"/> to at most <paramref name="maxLength"/>
    /// characters, ending the result with "..." when text was cut off.
    /// </summary>
    /// <param name="value">Text to shorten; null or empty is returned unchanged.</param>
    /// <param name="maxLength">Maximum length of the result.</param>
    /// <returns>
    /// <paramref name="value"/> itself when it already fits. Otherwise the
    /// first <c>maxLength - 3</c> characters followed by "...", or a plain cut
    /// to <paramref name="maxLength"/> characters when there is no room for
    /// the ellipsis.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxLength"/> is negative and <paramref name="value"/> is not empty.
    /// </exception>
    public static string Truncate(this string value, int maxLength)
    {
        if (string.IsNullOrEmpty(value) || value.Length <= maxLength)
        {
            return value;
        }
        if (maxLength < 0)
        {
            throw new ArgumentOutOfRangeException("maxLength", "maxLength must not be negative.");
        }

        const string ellipsis = "...";
        // With fewer than 3 characters available, "maxLength - 3" is negative
        // and Substring used to throw; cut without an ellipsis instead.
        if (maxLength < ellipsis.Length)
        {
            return value.Substring(0, maxLength);
        }
        return value.Substring(0, maxLength - ellipsis.Length) + ellipsis;
    }
}
}