将日志文件转换为 CSV
Converting log file to CSV
我需要将 Squid Web Proxy Server 的日志文件转换为 CSV 文件,以便将其加载到 PowerPivot 中进行查询分析。
我应该从哪里开始?如能提供任何帮助,我将不胜感激。
我必须使用 C# 语言来完成此任务,日志如下所示:
格式:时间戳 耗时 客户端 Action/Code 大小 方法 URI 标识 Hierarchy/From 内容类型
1473546438.145 917 5.45.107.68 TCP_DENIED/403 4114 GET http://atlantis.pennergame.de/pet/ - NONE/- text/html
1473546439.111 3 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
1473546439.865 358 212.83.168.7 TCP_DENIED/403 3955 GET http://www.theshadehouse.com/left-sidebar-post/ - NONE/- text/html
1473546439.985 218 185.5.97.68 TCP_DENIED/403 3600 GET http://www.google.pl/search? - NONE/- text/html
1473546440.341 2 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
1473546440.840 403 115.29.46.240 TCP_DENIED/403 4430 POST http://et.airchina.com.cn/fhx/consumeRecord/getCardConsumeRecordList.htm - NONE/- text/html
1473546441.486 2 52.41.27.39 TCP_DENIED/403 3813 POST http://www.deezer.com/ajax/action.php - NONE/- text/html
1473546441.596 2 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
CSV 是一个分隔文件,其字段分隔符是 ,。几乎所有程序都允许您指定不同的字段和记录分隔符,默认使用 , 和 \n。
如果您的文件没有使用多个空格来对齐各列,它本身就可以视为以空格分隔的文件。您可以使用正则表达式 \s{2,}
将连续的多个空格替换为一个,例如:
// Collapse every run of two or more spaces/tabs into a single space.
// [ \t] is used instead of \s because \s also matches \r and \n: a "\r\n" line ending is
// itself two whitespace characters, so \s{2,} would replace it with a space and merge all
// log records into one line when applied to the whole file text.
var regex = new Regex(@"[ \t]{2,}");
var original = File.ReadAllText(somePath);
var delimited = regex.Replace(original, " ");
// Overwrites the source file in place with the normalized text.
File.WriteAllText(somePath, delimited);
Power BI Desktop 本身已经允许您使用空格作为分隔符。即使不支持,您也可以将模式改为 \s+
,把所有连续的空格替换为逗号,即:
// Matches every run of one or more whitespace characters.
// NOTE(review): \s also matches \r and \n, so if `original` still holds the whole file text
// the line breaks are turned into commas as well - apply this per line, or use [ \t]+,
// when each record must stay on its own line.
var regex=new Regex(@"\s+");
...
// Each matched whitespace run becomes a single comma (the CSV field separator).
var delimited=regex.Replace(original,",");
...
日志文件通常很大,因此减少内存占用是个好主意。如果使用 ReadLines
一次读取一行,就可以避免把整个文件读入内存,而是逐行进行替换并写出:
// Stream the log through one line at a time: each input line is normalized with `regex`
// and written straight to the target file, so the whole log is never held in memory.
using (var output = File.CreateText(targetPath))
{
    foreach (var rawLine in File.ReadLines(somePath))
    {
        output.WriteLine(regex.Replace(rawLine, " "));
    }
}
与一次性把所有行加载到数组中的 ReadAllLines
不同,ReadLines
是一个迭代器,每次只读取并返回一行。
日志本身已经很接近 CSV 了,所以只需逐行读取并清理每一行:
...
// Normalize the column padding first, then turn the remaining single spaces into '|'.
// The multi-space literals matter: replacing " " with " " would be a no-op.
line = line
    .Replace("   ", " ")   // compress 3 spaces to 1
    .Replace("  ", " ")    // compress 2 spaces to 1
    .Replace("  ", " ")    // compress 2 spaces to 1, again
    .Replace(" ", "|");    // replace space by '|'
// A trailing .Replace(" - ", "|") was dropped: once every space is a '|' it can never match.
// The "-" identity placeholder therefore stays as its own field, keeping the column count constant.
您可能需要针对 TCP_DENIED/403 这类字段做一些调整。
这样就得到一行以 '|'
分隔的文本,可以很容易地转换成您需要的任何分隔符,或者直接拆分:
// write it out or process it further
// String.Split is PascalCase in C#; the lower-case 'split' does not compile.
string[] parts = line.Split('|');
/// <summary>
/// Writes parsed Squid access-log entries to a comma-separated (CSV) file, one entry per line.
/// </summary>
public static class SquidWebProxyServerCommaSeparatedWriter
{
    private const string FieldSeparator = ",";

    // A field containing any of these characters must be quoted, otherwise it would be
    // split into several columns (or several records) by the CSV reader.
    private static readonly char[] CharactersRequiringQuotes = { ',', '"', '\r', '\n' };

    /// <summary>
    /// Writes <paramref name="serverLogEntries"/> to the file at <paramref name="destination"/>,
    /// creating or overwriting it.
    /// </summary>
    /// <param name="destination">Path of the CSV file to write.</param>
    /// <param name="serverLogEntries">Entries to write; enumerated once, lazily.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="destination"/> or <paramref name="serverLogEntries"/> is null.
    /// </exception>
    public static void WriteToCSV(string destination, IEnumerable<SquidWebProxyServerLogEntry> serverLogEntries)
    {
        if (destination == null)
        {
            throw new System.ArgumentNullException(nameof(destination));
        }

        if (serverLogEntries == null)
        {
            throw new System.ArgumentNullException(nameof(serverLogEntries));
        }

        // Select is deferred, so entries are converted one at a time while the file is written.
        var lines = serverLogEntries.Select(ConvertToLine);
        File.WriteAllLines(destination, lines);
    }

    /// <summary>
    /// Formats a single entry as one CSV record, with fields in the same order as the log columns.
    /// </summary>
    private static string ConvertToLine(SquidWebProxyServerLogEntry serverLogEntry)
    {
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return string.Join(FieldSeparator,
            Escape(serverLogEntry.Timestamp),
            serverLogEntry.Elapsed.ToString(invariant),
            Escape(serverLogEntry.ClientIPAddress),
            Escape(serverLogEntry.ActionCode),
            serverLogEntry.Size.ToString(invariant),
            serverLogEntry.Method.ToString(),
            Escape(serverLogEntry.Uri),
            Escape(serverLogEntry.Identity),
            Escape(serverLogEntry.HierarchyFrom),
            Escape(serverLogEntry.MimeType));
    }

    /// <summary>
    /// Quotes a field when it contains a separator, quote or line break, doubling any embedded
    /// quotes. Other values (including null and empty) are returned unchanged.
    /// </summary>
    private static string Escape(string field)
    {
        if (string.IsNullOrEmpty(field) || field.IndexOfAny(CharactersRequiringQuotes) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}
/// <summary>
/// Parses Squid access-log rows (whitespace-separated columns) into
/// <see cref="SquidWebProxyServerLogEntry"/> objects.
/// </summary>
public static class SquidWebProxyServerLogParser
{
    // Columns read from each row: timestamp, elapsed, client, action/code, size, method,
    // URI, identity, hierarchy/from, content type.
    private const int ExpectedFieldCount = 10;

    private static readonly char[] FieldSeparators = { '\t', ' ' };

    /// <summary>
    /// Lazily parses the log file one line at a time; the file stays open only while the
    /// returned sequence is being enumerated.
    /// </summary>
    /// <param name="fileInfo">The log file to read.</param>
    /// <returns>One entry per line of the file.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileInfo"/> is null.</exception>
    public static IEnumerable<SquidWebProxyServerLogEntry> Parse(FileInfo fileInfo)
    {
        // Validate here rather than in the iterator so the error surfaces at the call site
        // instead of on the first MoveNext.
        if (fileInfo == null)
        {
            throw new ArgumentNullException(nameof(fileInfo));
        }

        return ParseFile(fileInfo);
    }

    /// <summary>
    /// Lazily parses each string in <paramref name="rows"/> as one log row.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="rows"/> is null.</exception>
    public static IEnumerable<SquidWebProxyServerLogEntry> Parse(IEnumerable<string> rows)
    {
        if (rows == null)
        {
            throw new ArgumentNullException(nameof(rows));
        }

        return rows.Select(ParseRow);
    }

    // Iterator behind Parse(FileInfo): streams the file line by line.
    private static IEnumerable<SquidWebProxyServerLogEntry> ParseFile(FileInfo fileInfo)
    {
        using (var streamReader = fileInfo.OpenText())
        {
            string row;
            while ((row = streamReader.ReadLine()) != null)
            {
                yield return ParseRow(row);
            }
        }
    }

    /// <summary>
    /// Converts a single log row into an entry.
    /// </summary>
    /// <exception cref="FormatException">
    /// The row has fewer than ten columns, or the elapsed/size column is not an integer.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// The method column does not name a member of <see cref="SquidWebProxyServerLogEntry.MethodType"/>.
    /// </exception>
    private static SquidWebProxyServerLogEntry ParseRow(string row)
    {
        if (row == null)
        {
            throw new ArgumentNullException(nameof(row));
        }

        // Columns may be padded with several spaces; RemoveEmptyEntries keeps the padding
        // from producing empty fields that would shift every following column.
        var fields = row.Split(FieldSeparators, StringSplitOptions.RemoveEmptyEntries);
        if (fields.Length < ExpectedFieldCount)
        {
            throw new FormatException(
                "Expected at least " + ExpectedFieldCount + " fields but found " + fields.Length +
                " in log row: '" + row + "'");
        }

        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return new SquidWebProxyServerLogEntry
        {
            Timestamp = fields[0],
            Elapsed = int.Parse(fields[1], invariant),
            ClientIPAddress = fields[2],
            ActionCode = fields[3],
            Size = int.Parse(fields[4], invariant),
            // The log writes methods in upper case ("GET") while the enum members are
            // PascalCase ("Get"), so the lookup has to ignore case.
            Method =
                (SquidWebProxyServerLogEntry.MethodType)
                Enum.Parse(typeof(SquidWebProxyServerLogEntry.MethodType), fields[5], true),
            Uri = fields[6],
            Identity = fields[7],
            HierarchyFrom = fields[8],
            MimeType = fields[9]
        };
    }
}
/// <summary>
/// One row of a Squid access log, with one property per log column.
/// </summary>
public sealed class SquidWebProxyServerLogEntry
{
    /// <summary>
    /// HTTP request method of the logged request. The original values (Get, Post, Put) keep
    /// their numbers; further common methods are appended so rows using them can be represented.
    /// </summary>
    public enum MethodType
    {
        Get = 0,
        Post = 1,
        Put = 2,
        Head = 3,
        Delete = 4,
        Options = 5,
        Connect = 6,
        Trace = 7,
        Patch = 8
    }

    /// <summary>Timestamp column exactly as written in the log, e.g. "1473546438.145".</summary>
    public string Timestamp { get; set; }

    /// <summary>Elapsed column (second column of the row).</summary>
    public int Elapsed { get; set; }

    /// <summary>Client address column, e.g. "5.45.107.68".</summary>
    public string ClientIPAddress { get; set; }

    /// <summary>Action/code column, e.g. "TCP_DENIED/403".</summary>
    public string ActionCode { get; set; }

    /// <summary>Size column.</summary>
    public int Size { get; set; }

    /// <summary>Request method column.</summary>
    public MethodType Method { get; set; }

    /// <summary>Requested URI.</summary>
    public string Uri { get; set; }

    /// <summary>Identity column; "-" in the sample log.</summary>
    public string Identity { get; set; }

    /// <summary>Hierarchy/from column, e.g. "NONE/-".</summary>
    public string HierarchyFrom { get; set; }

    /// <summary>Content type column, e.g. "text/html".</summary>
    public string MimeType { get; set; }
}
我需要将 Squid Web Proxy Server 的日志文件转换为 CSV 文件,以便将其加载到 PowerPivot 中进行查询分析。我应该从哪里开始?如能提供任何帮助,我将不胜感激。我必须使用 C# 语言来完成此任务,日志如下所示:
格式:时间戳 耗时 客户端 Action/Code 大小 方法 URI 标识 Hierarchy/From 内容类型
1473546438.145 917 5.45.107.68 TCP_DENIED/403 4114 GET http://atlantis.pennergame.de/pet/ - NONE/- text/html
1473546439.111 3 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
1473546439.865 358 212.83.168.7 TCP_DENIED/403 3955 GET http://www.theshadehouse.com/left-sidebar-post/ - NONE/- text/html
1473546439.985 218 185.5.97.68 TCP_DENIED/403 3600 GET http://www.google.pl/search? - NONE/- text/html
1473546440.341 2 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
1473546440.840 403 115.29.46.240 TCP_DENIED/403 4430 POST http://et.airchina.com.cn/fhx/consumeRecord/getCardConsumeRecordList.htm - NONE/- text/html
1473546441.486 2 52.41.27.39 TCP_DENIED/403 3813 POST http://www.deezer.com/ajax/action.php - NONE/- text/html
1473546441.596 2 146.148.96.13 TCP_DENIED/403 4604 POST http://mobiuas.ebay.com/services/mobile/v1/UserAuthenticationService - NONE/- text/html
CSV 是一个分隔文件,其字段分隔符是 ,。几乎所有程序都允许您指定不同的字段和记录分隔符,默认使用 , 和 \n。
如果您的文件没有使用多个空格来对齐各列,它本身就可以视为以空格分隔的文件。您可以使用正则表达式 \s{2,}
将连续的多个空格替换为一个,例如:
// Collapse every run of two or more spaces/tabs into a single space.
// [ \t] is used instead of \s because \s also matches \r and \n: a "\r\n" line ending is
// itself two whitespace characters, so \s{2,} would replace it with a space and merge all
// log records into one line when applied to the whole file text.
var regex = new Regex(@"[ \t]{2,}");
var original = File.ReadAllText(somePath);
var delimited = regex.Replace(original, " ");
// Overwrites the source file in place with the normalized text.
File.WriteAllText(somePath, delimited);
Power BI Desktop 本身已经允许您使用空格作为分隔符。即使不支持,您也可以将模式改为 \s+
,把所有连续的空格替换为逗号,即:
// Matches every run of one or more whitespace characters.
// NOTE(review): \s also matches \r and \n, so if `original` still holds the whole file text
// the line breaks are turned into commas as well - apply this per line, or use [ \t]+,
// when each record must stay on its own line.
var regex=new Regex(@"\s+");
...
// Each matched whitespace run becomes a single comma (the CSV field separator).
var delimited=regex.Replace(original,",");
...
日志文件通常很大,因此减少内存占用是个好主意。如果使用 ReadLines
一次读取一行,就可以避免把整个文件读入内存,而是逐行进行替换并写出:
// Stream the log through one line at a time: each input line is normalized with `regex`
// and written straight to the target file, so the whole log is never held in memory.
using (var output = File.CreateText(targetPath))
{
    foreach (var rawLine in File.ReadLines(somePath))
    {
        output.WriteLine(regex.Replace(rawLine, " "));
    }
}
与一次性把所有行加载到数组中的 ReadAllLines
不同,ReadLines
是一个迭代器,每次只读取并返回一行。
日志本身已经很接近 CSV 了,所以只需逐行读取并清理每一行:
...
// Normalize the column padding first, then turn the remaining single spaces into '|'.
// The multi-space literals matter: replacing " " with " " would be a no-op.
line = line
    .Replace("   ", " ")   // compress 3 spaces to 1
    .Replace("  ", " ")    // compress 2 spaces to 1
    .Replace("  ", " ")    // compress 2 spaces to 1, again
    .Replace(" ", "|");    // replace space by '|'
// A trailing .Replace(" - ", "|") was dropped: once every space is a '|' it can never match.
// The "-" identity placeholder therefore stays as its own field, keeping the column count constant.
您可能需要针对 TCP_DENIED/403 这类字段做一些调整。
这样就得到一行以 '|'
分隔的文本,可以很容易地转换成您需要的任何分隔符,或者直接拆分:
// write it out or process it further
// String.Split is PascalCase in C#; the lower-case 'split' does not compile.
string[] parts = line.Split('|');
/// <summary>
/// Writes parsed Squid access-log entries to a comma-separated (CSV) file, one entry per line.
/// </summary>
public static class SquidWebProxyServerCommaSeparatedWriter
{
    private const string FieldSeparator = ",";

    // A field containing any of these characters must be quoted, otherwise it would be
    // split into several columns (or several records) by the CSV reader.
    private static readonly char[] CharactersRequiringQuotes = { ',', '"', '\r', '\n' };

    /// <summary>
    /// Writes <paramref name="serverLogEntries"/> to the file at <paramref name="destination"/>,
    /// creating or overwriting it.
    /// </summary>
    /// <param name="destination">Path of the CSV file to write.</param>
    /// <param name="serverLogEntries">Entries to write; enumerated once, lazily.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="destination"/> or <paramref name="serverLogEntries"/> is null.
    /// </exception>
    public static void WriteToCSV(string destination, IEnumerable<SquidWebProxyServerLogEntry> serverLogEntries)
    {
        if (destination == null)
        {
            throw new System.ArgumentNullException(nameof(destination));
        }

        if (serverLogEntries == null)
        {
            throw new System.ArgumentNullException(nameof(serverLogEntries));
        }

        // Select is deferred, so entries are converted one at a time while the file is written.
        var lines = serverLogEntries.Select(ConvertToLine);
        File.WriteAllLines(destination, lines);
    }

    /// <summary>
    /// Formats a single entry as one CSV record, with fields in the same order as the log columns.
    /// </summary>
    private static string ConvertToLine(SquidWebProxyServerLogEntry serverLogEntry)
    {
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return string.Join(FieldSeparator,
            Escape(serverLogEntry.Timestamp),
            serverLogEntry.Elapsed.ToString(invariant),
            Escape(serverLogEntry.ClientIPAddress),
            Escape(serverLogEntry.ActionCode),
            serverLogEntry.Size.ToString(invariant),
            serverLogEntry.Method.ToString(),
            Escape(serverLogEntry.Uri),
            Escape(serverLogEntry.Identity),
            Escape(serverLogEntry.HierarchyFrom),
            Escape(serverLogEntry.MimeType));
    }

    /// <summary>
    /// Quotes a field when it contains a separator, quote or line break, doubling any embedded
    /// quotes. Other values (including null and empty) are returned unchanged.
    /// </summary>
    private static string Escape(string field)
    {
        if (string.IsNullOrEmpty(field) || field.IndexOfAny(CharactersRequiringQuotes) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}
/// <summary>
/// Parses Squid access-log rows (whitespace-separated columns) into
/// <see cref="SquidWebProxyServerLogEntry"/> objects.
/// </summary>
public static class SquidWebProxyServerLogParser
{
    // Columns read from each row: timestamp, elapsed, client, action/code, size, method,
    // URI, identity, hierarchy/from, content type.
    private const int ExpectedFieldCount = 10;

    private static readonly char[] FieldSeparators = { '\t', ' ' };

    /// <summary>
    /// Lazily parses the log file one line at a time; the file stays open only while the
    /// returned sequence is being enumerated.
    /// </summary>
    /// <param name="fileInfo">The log file to read.</param>
    /// <returns>One entry per line of the file.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileInfo"/> is null.</exception>
    public static IEnumerable<SquidWebProxyServerLogEntry> Parse(FileInfo fileInfo)
    {
        // Validate here rather than in the iterator so the error surfaces at the call site
        // instead of on the first MoveNext.
        if (fileInfo == null)
        {
            throw new ArgumentNullException(nameof(fileInfo));
        }

        return ParseFile(fileInfo);
    }

    /// <summary>
    /// Lazily parses each string in <paramref name="rows"/> as one log row.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="rows"/> is null.</exception>
    public static IEnumerable<SquidWebProxyServerLogEntry> Parse(IEnumerable<string> rows)
    {
        if (rows == null)
        {
            throw new ArgumentNullException(nameof(rows));
        }

        return rows.Select(ParseRow);
    }

    // Iterator behind Parse(FileInfo): streams the file line by line.
    private static IEnumerable<SquidWebProxyServerLogEntry> ParseFile(FileInfo fileInfo)
    {
        using (var streamReader = fileInfo.OpenText())
        {
            string row;
            while ((row = streamReader.ReadLine()) != null)
            {
                yield return ParseRow(row);
            }
        }
    }

    /// <summary>
    /// Converts a single log row into an entry.
    /// </summary>
    /// <exception cref="FormatException">
    /// The row has fewer than ten columns, or the elapsed/size column is not an integer.
    /// </exception>
    /// <exception cref="ArgumentException">
    /// The method column does not name a member of <see cref="SquidWebProxyServerLogEntry.MethodType"/>.
    /// </exception>
    private static SquidWebProxyServerLogEntry ParseRow(string row)
    {
        if (row == null)
        {
            throw new ArgumentNullException(nameof(row));
        }

        // Columns may be padded with several spaces; RemoveEmptyEntries keeps the padding
        // from producing empty fields that would shift every following column.
        var fields = row.Split(FieldSeparators, StringSplitOptions.RemoveEmptyEntries);
        if (fields.Length < ExpectedFieldCount)
        {
            throw new FormatException(
                "Expected at least " + ExpectedFieldCount + " fields but found " + fields.Length +
                " in log row: '" + row + "'");
        }

        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return new SquidWebProxyServerLogEntry
        {
            Timestamp = fields[0],
            Elapsed = int.Parse(fields[1], invariant),
            ClientIPAddress = fields[2],
            ActionCode = fields[3],
            Size = int.Parse(fields[4], invariant),
            // The log writes methods in upper case ("GET") while the enum members are
            // PascalCase ("Get"), so the lookup has to ignore case.
            Method =
                (SquidWebProxyServerLogEntry.MethodType)
                Enum.Parse(typeof(SquidWebProxyServerLogEntry.MethodType), fields[5], true),
            Uri = fields[6],
            Identity = fields[7],
            HierarchyFrom = fields[8],
            MimeType = fields[9]
        };
    }
}
/// <summary>
/// One row of a Squid access log, with one property per log column.
/// </summary>
public sealed class SquidWebProxyServerLogEntry
{
    /// <summary>
    /// HTTP request method of the logged request. The original values (Get, Post, Put) keep
    /// their numbers; further common methods are appended so rows using them can be represented.
    /// </summary>
    public enum MethodType
    {
        Get = 0,
        Post = 1,
        Put = 2,
        Head = 3,
        Delete = 4,
        Options = 5,
        Connect = 6,
        Trace = 7,
        Patch = 8
    }

    /// <summary>Timestamp column exactly as written in the log, e.g. "1473546438.145".</summary>
    public string Timestamp { get; set; }

    /// <summary>Elapsed column (second column of the row).</summary>
    public int Elapsed { get; set; }

    /// <summary>Client address column, e.g. "5.45.107.68".</summary>
    public string ClientIPAddress { get; set; }

    /// <summary>Action/code column, e.g. "TCP_DENIED/403".</summary>
    public string ActionCode { get; set; }

    /// <summary>Size column.</summary>
    public int Size { get; set; }

    /// <summary>Request method column.</summary>
    public MethodType Method { get; set; }

    /// <summary>Requested URI.</summary>
    public string Uri { get; set; }

    /// <summary>Identity column; "-" in the sample log.</summary>
    public string Identity { get; set; }

    /// <summary>Hierarchy/from column, e.g. "NONE/-".</summary>
    public string HierarchyFrom { get; set; }

    /// <summary>Content type column, e.g. "text/html".</summary>
    public string MimeType { get; set; }
}