CSV 分隔为 XML - 文件夹层次结构
CSV Delimited to XML - Folder Hierarchy
这是我第一次发帖,如果有不合规范之处,或者示例给得不够充分,还请见谅。
我有一个要创建的控制台应用程序项目,我在其中获得了相当多的 CSV 文件,我需要从它们中创建某种 Parent/Child/Grandchild 关系(XML?也许吧?-然后我可以用它来上传并以最少的调用写入 DMS - 我不想一遍又一遍地查询文件夹是否存在)
我对这个有点不知所措
我想知道在不依赖第三方库、只使用纯 C# 的前提下实现这一点的最佳方法;很可能需要使用 OLEDB JET 提供程序,因为它可以处理所需的解析。CSV 文件中的记录并未按日期排序:较早年份的记录可能出现在较晚年份的记录之后,反之亦然。
这是 CSV 输出的示例
"DESCRIPTION1","8, 5/8\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"DESCRIPTION2","12, 3/4\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"DESCRIPTION3","12, 3/4\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"another description 20# gw","1","388015","Scan123.pdf","2015-10-24","1"
"another description 20# gw","3","385902","Scan456.pdf","2015-04-14","1"
"STRINGVAL1","273.10 X 9.27 X 6000","45032-01","KHJDWNEJWKFD9101529.pdf","2012-02-03","1"
"STRINGVAL2","273.10 X 21.44 X 6000","7-09372","DJSWH68767681540.pdf","2017-02-03","1"
最终输出将是(YEAR/MONTH/FILENAME
+(每个文件的属性 - 这些用于最终更新 DMS 中的列))
年份和月份取自日期列。
如果 YEAR 已经存在则不会再次创建
如果该 YEAR 下的 MONTH 已经存在,则不会再次创建。
如果该文件名已存在于 YEAR/MONTH 下,则不会再次创建该文件名,但该文件名的附加属性将添加到属性中 - "line seperated?"
所需输出:
我已经尝试使用 Linq 查询来开始输出可能需要的 XML 让我继续前进,但它输出每一行并且不进行分组,我目前对 Linq 不熟悉。
我还遇到了 .Split(',') 的基本转义问题(请对比上面的原始 CSV 示例与下面的示例——我在测试文件中改用了 TAB 分隔),这就是为什么我希望由 OLEDB 提供程序来处理解析。
// Load the tab-delimited mock file and emit one <Year> element per input line.
// Lines are not grouped: every row yields its own Year/Month pair.
string[] source = File.ReadAllLines(@"C:\Processing\In\mockCsv.csv");
XElement item = new XElement("Root",
    source
        .Select(line => line.Split('\t'))
        .Select(columns =>
        {
            // Column 4 is the date text; the sample rows use yyyy-MM-dd, so the year is
            // characters 0-3 and the month is characters 5-6.
            string dateText = columns[4];
            string yearText = dateText.Substring(0, 4);
            string monthText = dateText.Substring(5, 2);

            // The month carries its text plus one child element per column.
            // "FileName" is emitted twice, exactly as in the original attempt.
            XElement monthNode = new XElement("Month", monthText,
                new XElement("FileName", columns[3]),
                new XElement("Description", columns[0]),
                new XElement("Length", columns[1]),
                new XElement("Type", columns[2]),
                new XElement("FileName", columns[3]),
                new XElement("Date", columns[4]),
                new XElement("Authorised", columns[5]));

            return new XElement("Year", yearText, monthNode);
        }));
我还需要记录流程的每一步,所以我设置了一个记录器 class
/// <summary>
/// Minimal logger: every message is echoed to the console and, when a log file could be
/// resolved from configuration, appended to that file as well.
/// </summary>
private class Logger
{
    // Full path of the log file. Stays null when it cannot be resolved; Write then logs to
    // the console only.
    private static string LogFile = null;

    /// <summary>Label written into the prefix of each new log line.</summary>
    internal enum MsgType
    {
        Info,
        Debug,
        Error
    }

    /// <summary>
    /// Resolves the log file path from the processing-details configuration section and
    /// removes an existing log file with the same name.
    /// </summary>
    static Logger()
    {
        // An exception escaping a static constructor surfaces as TypeInitializationException
        // and makes the type unusable, so every failure here degrades to console-only logging.
        try
        {
            var processingDetails = ConfigurationManager.GetSection(SECTION_PROCESSINGDETAILS) as NameValueCollection;
            string workingFolder = processingDetails == null ? null : processingDetails[KEY_WORKINGFOLDER];
            if (String.IsNullOrEmpty(workingFolder))
            {
                Console.WriteLine("Logger: working folder is not configured; logging to console only.");
                return;
            }

            LogFile = Path.Combine(workingFolder,
                String.Format("Log_{0}.txt", StartTime.ToString("MMMyyyy")));

            if (File.Exists(LogFile))
            {
                File.Delete(LogFile);
            }
        }
        catch (Exception ex)
        {
            // If the path was already resolved, Write still tries to append to it and
            // reports its own errors; otherwise LogFile is null and only the console is used.
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Writes a message to the console and, when a log file is available, appends it there.
    /// </summary>
    /// <param name="msg">Text to log.</param>
    /// <param name="msgType">Label included in the prefix when <paramref name="isNewLine"/> is true.</param>
    /// <param name="isNewLine">
    /// True to prefix the message with the current local time and <paramref name="msgType"/>.
    /// </param>
    /// <param name="closeLine">
    /// True to terminate the line after the message; false to leave it open so that a later
    /// call can continue on the same line.
    /// </param>
    internal static void Write(string msg, MsgType msgType, bool isNewLine, bool closeLine)
    {
        if (isNewLine)
        {
            msg = String.Format("{0} - {1} : {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), msgType, msg);
        }

        if (closeLine)
        {
            Console.WriteLine(msg);
        }
        else
        {
            Console.Write(msg);
        }

        if (String.IsNullOrEmpty(LogFile))
        {
            return;
        }

        try
        {
            // Open in append mode for each call so nothing is held open between messages.
            using (StreamWriter sw = new StreamWriter(LogFile, true))
            {
                if (closeLine)
                {
                    sw.WriteLine(msg);
                }
                else
                {
                    sw.Write(msg);
                }
            }
        }
        catch (Exception ex)
        {
            // Logging is best-effort: report the problem and keep the application running.
            Console.WriteLine(ex.Message);
        }
    }
}
这样使用
// Starts a timestamped Info entry and leaves the line open (closeLine: false) so that a
// later call can append to the same line.
Logger.Write(
    msg: String.Format("Reading records from csv file ({0})... ", csvFile),
    msgType: Logger.MsgType.Info,
    isNewLine: true,
    closeLine: false);
试试下面的代码。如果您是从文件中读取数据,请使用 StreamReader 代替 StringReader:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using System.IO;
using System.Text.RegularExpressions;
namespace ConsoleApplication74
{
    /// <summary>
    /// Demo: parses quoted, comma-separated rows and arranges them in an XML tree shaped
    /// Root / _YEAR / _MONTH / FILENAME.
    /// </summary>
    class Program
    {
        // Field separator: closing quote, comma (optionally surrounded by whitespace), opening
        // quote. A comma inside a quoted field is not wrapped in quotes and so does not split.
        private const string FieldSeparatorPattern = "\"\\s*,\\s*\"";

        // description, length, type, filename, date, authorized
        private const int ExpectedFieldCount = 6;

        /// <summary>
        /// Parses the sample rows into <see cref="Item.items"/> and builds the XML tree from them.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string input =
                "\"DESCRIPTION1\",\"8, 5/8 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"DESCRIPTION2\",\"12, 3/4 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"DESCRIPTION3\",\"12, 3/4 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"another description 20# gw\",\"1\",\"388015\",\"Scan123.pdf\",\"2015-10-24\",\"1\"\n" +
                "\"another description 20# gw\",\"3\",\"385902\",\"Scan456.pdf\",\"2015-04-14\",\"1\"\n" +
                "\"STRINGVAL1\",\"273.10 X 9.27 X 6000\",\"45032-01\",\"KHJDWNEJWKFD9101529.pdf\",\"2012-02-03\",\"1\"\n" +
                "\"STRINGVAL2\",\"273.10 X 21.44 X 6000\",\"7-09372\",\"DJSWH68767681540.pdf\",\"2017-02-03\",\"1\"\n";

            string inputline;
            using (StringReader reader = new StringReader(input))
            {
                while ((inputline = reader.ReadLine()) != null)
                {
                    // Blank lines (for example a trailing newline in a real file) carry no record.
                    if (inputline.Trim().Length == 0)
                    {
                        continue;
                    }

                    Item.items.Add(ParseLine(inputline));
                }
            }

            XElement root = BuildTree(Item.items);
        }

        /// <summary>Converts one quoted, comma-separated line into an <see cref="Item"/>.</summary>
        /// <param name="line">A non-blank input line.</param>
        /// <returns>The parsed item.</returns>
        /// <exception cref="FormatException">
        /// The line has fewer than six fields, or the date field cannot be parsed.
        /// </exception>
        private static Item ParseLine(string line)
        {
            string[] splitLine = Regex.Split(line, FieldSeparatorPattern);
            if (splitLine.Length < ExpectedFieldCount)
            {
                throw new FormatException(String.Format(
                    "Expected at least {0} quoted fields but found {1}: {2}",
                    ExpectedFieldCount, splitLine.Length, line));
            }

            return new Item()
            {
                // The separator consumes the quotes between fields, so only the first field
                // keeps its opening quote and the last field its closing quote.
                description = splitLine[0].Replace("\"", ""),
                length = splitLine[1],
                type = splitLine[2],
                filename = splitLine[3],
                // Invariant culture keeps parsing independent of the machine's regional settings.
                date = DateTime.Parse(splitLine[4], System.Globalization.CultureInfo.InvariantCulture),
                authorized = splitLine[5].Replace("\"", "") == "1"
            };
        }

        /// <summary>
        /// Builds the tree: one element per year, one per month inside it, and one element per
        /// item named after its file, with the item's attributes as line-separated text.
        /// </summary>
        /// <param name="items">Items to arrange; they may be in any order.</param>
        /// <returns>The root element of the tree.</returns>
        private static XElement BuildTree(IEnumerable<Item> items)
        {
            XElement root = new XElement("Root");

            foreach (var year in items.GroupBy(x => x.date.Year).OrderBy(x => x.Key))
            {
                // An XML name cannot start with a digit, hence the underscore prefix.
                XElement newYear = new XElement("_" + year.Key.ToString());
                root.Add(newYear);

                foreach (var month in year.GroupBy(x => x.date.Month).OrderBy(x => x.Key))
                {
                    XElement newMonth = new XElement("_" + month.Key.ToString());
                    newYear.Add(newMonth);

                    // Items that share a file name within a month become sibling elements
                    // with the same name.
                    newMonth.Add(
                        month.OrderBy(x => x.date).Select(x => new XElement(
                            // Escapes characters that are not legal in an XML name (spaces,
                            // a leading digit, ...); valid names pass through unchanged.
                            XmlConvert.EncodeLocalName(x.filename),
                            string.Join("\r\n", new string[] {
                                x.description,
                                x.length,
                                x.type,
                                x.date.ToString(),
                                x.authorized.ToString()
                            }))));
                }
            }

            return root;
        }
    }

    /// <summary>One parsed input row.</summary>
    public class Item
    {
        /// <summary>All items parsed so far; filled by Program.Main.</summary>
        public static List<Item> items = new List<Item>();

        /// <summary>First field, with quote characters removed.</summary>
        public string description { get; set; }

        /// <summary>Second field, kept as text.</summary>
        public string length { get; set; }

        /// <summary>Third field, kept as text.</summary>
        public string type { get; set; }

        /// <summary>Fourth field; used as the element name in the tree.</summary>
        public string filename { get; set; }

        /// <summary>Fifth field parsed as a date; its year and month drive the grouping.</summary>
        public DateTime date { get; set; }

        /// <summary>True when the sixth field is "1".</summary>
        public Boolean authorized { get; set; }
    }
}
这是我第一次发帖,如果有不合规范之处,或者示例给得不够充分,还请见谅。
我有一个要创建的控制台应用程序项目,我在其中获得了相当多的 CSV 文件,我需要从它们中创建某种 Parent/Child/Grandchild 关系(XML?也许吧?-然后我可以用它来上传并以最少的调用写入 DMS - 我不想一遍又一遍地查询文件夹是否存在)
我对这个有点不知所措
我想知道在不依赖第三方库、只使用纯 C# 的前提下实现这一点的最佳方法;很可能需要使用 OLEDB JET 提供程序,因为它可以处理所需的解析。CSV 文件中的记录并未按日期排序:较早年份的记录可能出现在较晚年份的记录之后,反之亦然。
这是 CSV 输出的示例
"DESCRIPTION1","8, 5/8\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"DESCRIPTION2","12, 3/4\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"DESCRIPTION3","12, 3/4\" X 6.4MM","STRING","filename001.pdf","2016-09-19","1"
"another description 20# gw","1","388015","Scan123.pdf","2015-10-24","1"
"another description 20# gw","3","385902","Scan456.pdf","2015-04-14","1"
"STRINGVAL1","273.10 X 9.27 X 6000","45032-01","KHJDWNEJWKFD9101529.pdf","2012-02-03","1"
"STRINGVAL2","273.10 X 21.44 X 6000","7-09372","DJSWH68767681540.pdf","2017-02-03","1"
最终输出将是(YEAR/MONTH/FILENAME
+(每个文件的属性 - 这些用于最终更新 DMS 中的列))
年份和月份取自日期列。
如果 YEAR 已经存在,则不会再次创建。
如果该 YEAR 下的 MONTH 已经存在,则不会再次创建。
如果该文件名已存在于 YEAR/MONTH 下,则不会再次创建该文件名,但该文件名的附加属性将添加到属性中 - "line seperated?"
所需输出:
我已经尝试使用 Linq 查询来开始输出可能需要的 XML 让我继续前进,但它输出每一行并且不进行分组,我目前对 Linq 不熟悉。
我还遇到了 .Split(',') 的基本转义问题(请对比上面的原始 CSV 示例与下面的示例——我在测试文件中改用了 TAB 分隔),这就是为什么我希望由 OLEDB 提供程序来处理解析。
// Load the tab-delimited mock file and emit one <Year> element per input line.
// Lines are not grouped: every row yields its own Year/Month pair.
string[] source = File.ReadAllLines(@"C:\Processing\In\mockCsv.csv");
XElement item = new XElement("Root",
    source
        .Select(line => line.Split('\t'))
        .Select(columns =>
        {
            // Column 4 is the date text; the sample rows use yyyy-MM-dd, so the year is
            // characters 0-3 and the month is characters 5-6.
            string dateText = columns[4];
            string yearText = dateText.Substring(0, 4);
            string monthText = dateText.Substring(5, 2);

            // The month carries its text plus one child element per column.
            // "FileName" is emitted twice, exactly as in the original attempt.
            XElement monthNode = new XElement("Month", monthText,
                new XElement("FileName", columns[3]),
                new XElement("Description", columns[0]),
                new XElement("Length", columns[1]),
                new XElement("Type", columns[2]),
                new XElement("FileName", columns[3]),
                new XElement("Date", columns[4]),
                new XElement("Authorised", columns[5]));

            return new XElement("Year", yearText, monthNode);
        }));
我还需要记录流程的每一步,所以我设置了一个记录器 class
/// <summary>
/// Minimal logger: every message is echoed to the console and, when a log file could be
/// resolved from configuration, appended to that file as well.
/// </summary>
private class Logger
{
    // Full path of the log file. Stays null when it cannot be resolved; Write then logs to
    // the console only.
    private static string LogFile = null;

    /// <summary>Label written into the prefix of each new log line.</summary>
    internal enum MsgType
    {
        Info,
        Debug,
        Error
    }

    /// <summary>
    /// Resolves the log file path from the processing-details configuration section and
    /// removes an existing log file with the same name.
    /// </summary>
    static Logger()
    {
        // An exception escaping a static constructor surfaces as TypeInitializationException
        // and makes the type unusable, so every failure here degrades to console-only logging.
        try
        {
            var processingDetails = ConfigurationManager.GetSection(SECTION_PROCESSINGDETAILS) as NameValueCollection;
            string workingFolder = processingDetails == null ? null : processingDetails[KEY_WORKINGFOLDER];
            if (String.IsNullOrEmpty(workingFolder))
            {
                Console.WriteLine("Logger: working folder is not configured; logging to console only.");
                return;
            }

            LogFile = Path.Combine(workingFolder,
                String.Format("Log_{0}.txt", StartTime.ToString("MMMyyyy")));

            if (File.Exists(LogFile))
            {
                File.Delete(LogFile);
            }
        }
        catch (Exception ex)
        {
            // If the path was already resolved, Write still tries to append to it and
            // reports its own errors; otherwise LogFile is null and only the console is used.
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Writes a message to the console and, when a log file is available, appends it there.
    /// </summary>
    /// <param name="msg">Text to log.</param>
    /// <param name="msgType">Label included in the prefix when <paramref name="isNewLine"/> is true.</param>
    /// <param name="isNewLine">
    /// True to prefix the message with the current local time and <paramref name="msgType"/>.
    /// </param>
    /// <param name="closeLine">
    /// True to terminate the line after the message; false to leave it open so that a later
    /// call can continue on the same line.
    /// </param>
    internal static void Write(string msg, MsgType msgType, bool isNewLine, bool closeLine)
    {
        if (isNewLine)
        {
            msg = String.Format("{0} - {1} : {2}", DateTime.Now.ToString("dd/MM/yyyy HH:mm:ss"), msgType, msg);
        }

        if (closeLine)
        {
            Console.WriteLine(msg);
        }
        else
        {
            Console.Write(msg);
        }

        if (String.IsNullOrEmpty(LogFile))
        {
            return;
        }

        try
        {
            // Open in append mode for each call so nothing is held open between messages.
            using (StreamWriter sw = new StreamWriter(LogFile, true))
            {
                if (closeLine)
                {
                    sw.WriteLine(msg);
                }
                else
                {
                    sw.Write(msg);
                }
            }
        }
        catch (Exception ex)
        {
            // Logging is best-effort: report the problem and keep the application running.
            Console.WriteLine(ex.Message);
        }
    }
}
这样使用
// Starts a timestamped Info entry and leaves the line open (closeLine: false) so that a
// later call can append to the same line.
Logger.Write(
    msg: String.Format("Reading records from csv file ({0})... ", csvFile),
    msgType: Logger.MsgType.Info,
    isNewLine: true,
    closeLine: false);
试试下面的代码。如果您是从文件中读取数据,请使用 StreamReader 代替 StringReader:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using System.IO;
using System.Text.RegularExpressions;
namespace ConsoleApplication74
{
    /// <summary>
    /// Demo: parses quoted, comma-separated rows and arranges them in an XML tree shaped
    /// Root / _YEAR / _MONTH / FILENAME.
    /// </summary>
    class Program
    {
        // Field separator: closing quote, comma (optionally surrounded by whitespace), opening
        // quote. A comma inside a quoted field is not wrapped in quotes and so does not split.
        private const string FieldSeparatorPattern = "\"\\s*,\\s*\"";

        // description, length, type, filename, date, authorized
        private const int ExpectedFieldCount = 6;

        /// <summary>
        /// Parses the sample rows into <see cref="Item.items"/> and builds the XML tree from them.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string input =
                "\"DESCRIPTION1\",\"8, 5/8 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"DESCRIPTION2\",\"12, 3/4 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"DESCRIPTION3\",\"12, 3/4 X 6.4MM\",\"STRING\",\"filename001.pdf\",\"2016-09-19\",\"1\"\n" +
                "\"another description 20# gw\",\"1\",\"388015\",\"Scan123.pdf\",\"2015-10-24\",\"1\"\n" +
                "\"another description 20# gw\",\"3\",\"385902\",\"Scan456.pdf\",\"2015-04-14\",\"1\"\n" +
                "\"STRINGVAL1\",\"273.10 X 9.27 X 6000\",\"45032-01\",\"KHJDWNEJWKFD9101529.pdf\",\"2012-02-03\",\"1\"\n" +
                "\"STRINGVAL2\",\"273.10 X 21.44 X 6000\",\"7-09372\",\"DJSWH68767681540.pdf\",\"2017-02-03\",\"1\"\n";

            string inputline;
            using (StringReader reader = new StringReader(input))
            {
                while ((inputline = reader.ReadLine()) != null)
                {
                    // Blank lines (for example a trailing newline in a real file) carry no record.
                    if (inputline.Trim().Length == 0)
                    {
                        continue;
                    }

                    Item.items.Add(ParseLine(inputline));
                }
            }

            XElement root = BuildTree(Item.items);
        }

        /// <summary>Converts one quoted, comma-separated line into an <see cref="Item"/>.</summary>
        /// <param name="line">A non-blank input line.</param>
        /// <returns>The parsed item.</returns>
        /// <exception cref="FormatException">
        /// The line has fewer than six fields, or the date field cannot be parsed.
        /// </exception>
        private static Item ParseLine(string line)
        {
            string[] splitLine = Regex.Split(line, FieldSeparatorPattern);
            if (splitLine.Length < ExpectedFieldCount)
            {
                throw new FormatException(String.Format(
                    "Expected at least {0} quoted fields but found {1}: {2}",
                    ExpectedFieldCount, splitLine.Length, line));
            }

            return new Item()
            {
                // The separator consumes the quotes between fields, so only the first field
                // keeps its opening quote and the last field its closing quote.
                description = splitLine[0].Replace("\"", ""),
                length = splitLine[1],
                type = splitLine[2],
                filename = splitLine[3],
                // Invariant culture keeps parsing independent of the machine's regional settings.
                date = DateTime.Parse(splitLine[4], System.Globalization.CultureInfo.InvariantCulture),
                authorized = splitLine[5].Replace("\"", "") == "1"
            };
        }

        /// <summary>
        /// Builds the tree: one element per year, one per month inside it, and one element per
        /// item named after its file, with the item's attributes as line-separated text.
        /// </summary>
        /// <param name="items">Items to arrange; they may be in any order.</param>
        /// <returns>The root element of the tree.</returns>
        private static XElement BuildTree(IEnumerable<Item> items)
        {
            XElement root = new XElement("Root");

            foreach (var year in items.GroupBy(x => x.date.Year).OrderBy(x => x.Key))
            {
                // An XML name cannot start with a digit, hence the underscore prefix.
                XElement newYear = new XElement("_" + year.Key.ToString());
                root.Add(newYear);

                foreach (var month in year.GroupBy(x => x.date.Month).OrderBy(x => x.Key))
                {
                    XElement newMonth = new XElement("_" + month.Key.ToString());
                    newYear.Add(newMonth);

                    // Items that share a file name within a month become sibling elements
                    // with the same name.
                    newMonth.Add(
                        month.OrderBy(x => x.date).Select(x => new XElement(
                            // Escapes characters that are not legal in an XML name (spaces,
                            // a leading digit, ...); valid names pass through unchanged.
                            XmlConvert.EncodeLocalName(x.filename),
                            string.Join("\r\n", new string[] {
                                x.description,
                                x.length,
                                x.type,
                                x.date.ToString(),
                                x.authorized.ToString()
                            }))));
                }
            }

            return root;
        }
    }

    /// <summary>One parsed input row.</summary>
    public class Item
    {
        /// <summary>All items parsed so far; filled by Program.Main.</summary>
        public static List<Item> items = new List<Item>();

        /// <summary>First field, with quote characters removed.</summary>
        public string description { get; set; }

        /// <summary>Second field, kept as text.</summary>
        public string length { get; set; }

        /// <summary>Third field, kept as text.</summary>
        public string type { get; set; }

        /// <summary>Fourth field; used as the element name in the tree.</summary>
        public string filename { get; set; }

        /// <summary>Fifth field parsed as a date; its year and month drive the grouping.</summary>
        public DateTime date { get; set; }

        /// <summary>True when the sixth field is "1".</summary>
        public Boolean authorized { get; set; }
    }
}