拆分 CSV 行时处理连续引号
Handling consecutive quotes when splitting CSV lines
由于字段中存在两个连续的双引号 "",我在解析 CSV 文件中的值时遇到了困难。
这是我从维基百科中提取的 CSV 字段示例:1997,Ford,E350,"Super, ""luxurious"" truck"
我试图找到不同的方法来解释它。
我一直得到的结果是:
"1997"
"Ford"
"E350"
"Super,"
""Super"
" ""luxurious"" truck""
这是我的 VB.Net 函数。
''' <summary>
''' Attempts to split one CSV line into fields by locating commas and
''' quotation marks with IndexOf and cutting substrings between them.
''' </summary>
''' <param name="sLine">The CSV line to split.</param>
''' <returns>
''' The list of extracted fields. When the line contains no comma, or a
''' closing quotation mark cannot be found, False is added to the list and
''' the list is returned immediately.
''' </returns>
''' <remarks>
''' NOTE(review): the output reported alongside this function shows that it
''' does not split a field containing doubled quotation marks correctly.
''' </remarks>
Private Function splitCSV(ByVal sLine As String) As List(Of String)
' comA/comB: index of the current and the following comma.
' quotA/quotB: index of the current and the following quotation mark.
' pos: current scan position in sLine.
Dim comA As Integer = -1, comB = -1, quotA = -1, quotB = -1, pos = 0
Dim parsed As New List(Of String)
Dim quote As String = """"
Dim comma As String = ","
Dim len As Integer = sLine.Length
Dim first As Boolean = True
comA = sLine.IndexOf(comma, pos) ' Find the next comma.
quotA = sLine.IndexOf(quote, pos) ' Find the next quotation mark.
' If the line contains no comma at all, record False and stop.
' NOTE(review): parsed is a List(Of String), so Add(False) presumably relies on an
' implicit Boolean-to-String conversion (Option Strict Off) - confirm.
If comA < 0 Then
parsed.Add(False)
Return parsed
End If
While pos < len ' While not at end of the string
comB = sLine.IndexOf(comma, comA + 1) ' Find the comma after comA (-1 if none)
quotB = sLine.IndexOf(quote, quotA + 1) ' Find the quotation mark after quotA (-1 if none)
If quotA > -1 And quotA < comB Then ' If a quotation mark was found and it lies before the comma at comB
' The two quotation marks are adjacent (indices differ by 1).
If Math.Abs(quotA - quotB).Equals(1) Then
Dim tempA As Integer = quotA
Dim tempB As Integer = quotB
' Looking for the actual second quote mark
' Skips over the double quotation marks.
While (Math.Abs(tempA - tempB).Equals(1))
tempA = tempB
' If tempA is not the last quotation mark in the line, move tempB to the next one.
If Not tempA.Equals(sLine.LastIndexOf(quote)) Then
tempB = sLine.IndexOf(quote, tempA + 1)
Else
' At the last quotation mark: make the distance 2 so the loop condition fails.
tempA = tempB - 2
End If
End While
quotB = tempB
End If
If quotB < 0 Then ' If second quotation mark does not exist
parsed.Add(False) ' End the function and Return False
Return parsed
End If
parsed.Add(sLine.Substring(quotA + 1, quotB - quotA - 1)) ' Otherwise, add the text strictly between the two quotation marks.
quotA = quotB ' Give quotA the position of quotB
pos = quotB ' Mark the current position
ElseIf comA < comB Then ' Otherwise, if another comma follows comA
If first Then ' If it is the first comma in the line,
parsed.Add(sLine.Substring(pos, comA)) ' Parse the first field (comA characters starting at pos)
first = False ' The future commas will not be considered as the first one.
End If
comB = sLine.IndexOf(comma, comA + 1) ' Find the second comma
If comB > comA Then ' If the second comma exists
parsed.Add(sLine.Substring(comA + 1, comB - comA - 1)) ' Add the text strictly between the two commas.
comA = comB ' Give comA the position of comB
pos = comB ' Mark the current position
End If
ElseIf len > 0 Then ' Neither case above applied (no comma after comA); as long as sLine has something,
parsed.Add(sLine.Substring(pos + 1, len - pos - 1)) ' Add the substring from just after pos to the end of the string.
pos = len ' Mark the position at the end to exit out of while loop
End If
End While
Return parsed ' Return parsed list of string
End Function
我以前也不得不解析这类文件。下面是我最终写出的代码。基本思路是:逐个字符地扫描输入文本。如果遇到引号,就记下它,除非上一个字符也是引号。如果当前处于引号括起的文本之内,分隔符将被忽略。
''' <summary>
''' Splits a single delimited line into its fields by scanning it one
''' character at a time. Delimiters inside a quoted field are kept as data,
''' the enclosing quotes are dropped, and a doubled quote inside a quoted
''' field is collapsed to one literal quote character.
''' </summary>
''' <param name="incoming">The line to split.</param>
''' <param name="fieldDelimiter">
''' Field separator; only its first character is used. Defaults to "," when
''' Nothing or empty.
''' </param>
''' <param name="quoteDelimiter">
''' Quote character; only its first character is used. Defaults to a double
''' quote when Nothing or empty.
''' </param>
''' <returns>The fields of the line, in order. Always contains at least one element.</returns>
''' <exception cref="ArgumentNullException"><paramref name="incoming"/> is Nothing.</exception>
''' <exception cref="IndexOutOfRangeException">The line ends while still inside a quoted field.</exception>
Protected Function FlexSplitLine(incoming As String, fieldDelimiter As String, quoteDelimiter As String) As String()
    If incoming Is Nothing Then
        Throw New ArgumentNullException("incoming")
    End If
    If quoteDelimiter Is Nothing OrElse quoteDelimiter.Length = 0 Then
        quoteDelimiter = """"
    End If
    If fieldDelimiter Is Nothing OrElse fieldDelimiter.Length = 0 Then
        fieldDelimiter = ","
    End If

    Dim QuoteChar As Char = quoteDelimiter(0)
    Dim CommaChar As Char = fieldDelimiter(0)
    Dim rval As New List(Of String)
    Dim Word As New System.Text.StringBuilder
    Dim inQuote As Boolean = False
    Dim index As Integer = 0

    Do While index < incoming.Length
        Dim current As Char = incoming(index)

        If current = QuoteChar Then
            ' A doubled quote is an escaped literal quote ONLY while inside a
            ' quoted field. Outside one, the first quote opens the field, so
            ' an empty quoted field ("") yields an empty string rather than
            ' a stray quote character.
            If inQuote AndAlso index < incoming.Length - 1 AndAlso incoming(index + 1) = QuoteChar Then
                Word.Append(QuoteChar)
                index += 1 ' Skip the second quote of the escaped pair.
            Else
                inQuote = Not inQuote
            End If
        ElseIf current = CommaChar AndAlso Not inQuote Then
            ' Unquoted delimiter: the current field is complete.
            rval.Add(Word.ToString)
            Word.Length = 0
        Else
            Word.Append(current)
        End If

        index += 1
    Loop

    If inQuote Then
        Throw New IndexOutOfRangeException("Ran past the end of the line while looking for the ending quote character.")
    End If

    ' The final field has no trailing delimiter, so flush it here.
    rval.Add(Word.ToString)
    Return rval.ToArray
End Function
TextFieldParser 非常适合处理这类事情,而且肯定比自己动手写解析器更容易。测试起来也很简单:我把你的示例复制到一个文件中,然后:
Imports Microsoft.VisualBasic.FileIO
...
Using parser = New TextFieldParser("C:\Temp\CSVPARSER.TXT")
    ' Comma-delimited input whose fields may be wrapped in double quotes.
    parser.TextFieldType = FieldType.Delimited
    parser.SetDelimiters(",")
    parser.HasFieldsEnclosedInQuotes = True

    Do Until parser.EndOfData
        data = parser.ReadFields()
        ' Join with a pipe so the column breaks are visible in the output.
        Console.WriteLine(String.Join("|", data))
    Loop
End Using
HasFieldsEnclosedInQuotes = True 在这种情况下很重要。结果:
1997|Ford|E350|Super, "luxurious" truck
"Super" 后面的逗号看起来不太合适(很可能确实如此),但在原文中它位于引号之内:1997,Ford,E350,"Super, ""luxurious"" truck"
还有其他 libraries/packages 也适用于各种 CSV 布局和格式。
由于字段中存在两个连续的双引号 "",我在解析 CSV 文件中的值时遇到了困难。
这是我从维基百科中提取的 CSV 字段示例:1997,Ford,E350,"Super, ""luxurious"" truck"
我试图找到不同的方法来解释它。
我一直得到的结果是:
"1997"
"Ford"
"E350"
"Super,"
""Super"
" ""luxurious"" truck""
这是我的 VB.Net 函数。
''' <summary>
''' Attempts to split one CSV line into fields by locating commas and
''' quotation marks with IndexOf and cutting substrings between them.
''' </summary>
''' <param name="sLine">The CSV line to split.</param>
''' <returns>
''' The list of extracted fields. When the line contains no comma, or a
''' closing quotation mark cannot be found, False is added to the list and
''' the list is returned immediately.
''' </returns>
''' <remarks>
''' NOTE(review): the output reported alongside this function shows that it
''' does not split a field containing doubled quotation marks correctly.
''' </remarks>
Private Function splitCSV(ByVal sLine As String) As List(Of String)
' comA/comB: index of the current and the following comma.
' quotA/quotB: index of the current and the following quotation mark.
' pos: current scan position in sLine.
Dim comA As Integer = -1, comB = -1, quotA = -1, quotB = -1, pos = 0
Dim parsed As New List(Of String)
Dim quote As String = """"
Dim comma As String = ","
Dim len As Integer = sLine.Length
Dim first As Boolean = True
comA = sLine.IndexOf(comma, pos) ' Find the next comma.
quotA = sLine.IndexOf(quote, pos) ' Find the next quotation mark.
' If the line contains no comma at all, record False and stop.
' NOTE(review): parsed is a List(Of String), so Add(False) presumably relies on an
' implicit Boolean-to-String conversion (Option Strict Off) - confirm.
If comA < 0 Then
parsed.Add(False)
Return parsed
End If
While pos < len ' While not at end of the string
comB = sLine.IndexOf(comma, comA + 1) ' Find the comma after comA (-1 if none)
quotB = sLine.IndexOf(quote, quotA + 1) ' Find the quotation mark after quotA (-1 if none)
If quotA > -1 And quotA < comB Then ' If a quotation mark was found and it lies before the comma at comB
' The two quotation marks are adjacent (indices differ by 1).
If Math.Abs(quotA - quotB).Equals(1) Then
Dim tempA As Integer = quotA
Dim tempB As Integer = quotB
' Looking for the actual second quote mark
' Skips over the double quotation marks.
While (Math.Abs(tempA - tempB).Equals(1))
tempA = tempB
' If tempA is not the last quotation mark in the line, move tempB to the next one.
If Not tempA.Equals(sLine.LastIndexOf(quote)) Then
tempB = sLine.IndexOf(quote, tempA + 1)
Else
' At the last quotation mark: make the distance 2 so the loop condition fails.
tempA = tempB - 2
End If
End While
quotB = tempB
End If
If quotB < 0 Then ' If second quotation mark does not exist
parsed.Add(False) ' End the function and Return False
Return parsed
End If
parsed.Add(sLine.Substring(quotA + 1, quotB - quotA - 1)) ' Otherwise, add the text strictly between the two quotation marks.
quotA = quotB ' Give quotA the position of quotB
pos = quotB ' Mark the current position
ElseIf comA < comB Then ' Otherwise, if another comma follows comA
If first Then ' If it is the first comma in the line,
parsed.Add(sLine.Substring(pos, comA)) ' Parse the first field (comA characters starting at pos)
first = False ' The future commas will not be considered as the first one.
End If
comB = sLine.IndexOf(comma, comA + 1) ' Find the second comma
If comB > comA Then ' If the second comma exists
parsed.Add(sLine.Substring(comA + 1, comB - comA - 1)) ' Add the text strictly between the two commas.
comA = comB ' Give comA the position of comB
pos = comB ' Mark the current position
End If
ElseIf len > 0 Then ' Neither case above applied (no comma after comA); as long as sLine has something,
parsed.Add(sLine.Substring(pos + 1, len - pos - 1)) ' Add the substring from just after pos to the end of the string.
pos = len ' Mark the position at the end to exit out of while loop
End If
End While
Return parsed ' Return parsed list of string
End Function
我以前也不得不解析这类文件。下面是我最终写出的代码。基本思路是:逐个字符地扫描输入文本。如果遇到引号,就记下它,除非上一个字符也是引号。如果当前处于引号括起的文本之内,分隔符将被忽略。
''' <summary>
''' Splits a single delimited line into its fields by scanning it one
''' character at a time. Delimiters inside a quoted field are kept as data,
''' the enclosing quotes are dropped, and a doubled quote inside a quoted
''' field is collapsed to one literal quote character.
''' </summary>
''' <param name="incoming">The line to split.</param>
''' <param name="fieldDelimiter">
''' Field separator; only its first character is used. Defaults to "," when
''' Nothing or empty.
''' </param>
''' <param name="quoteDelimiter">
''' Quote character; only its first character is used. Defaults to a double
''' quote when Nothing or empty.
''' </param>
''' <returns>The fields of the line, in order. Always contains at least one element.</returns>
''' <exception cref="ArgumentNullException"><paramref name="incoming"/> is Nothing.</exception>
''' <exception cref="IndexOutOfRangeException">The line ends while still inside a quoted field.</exception>
Protected Function FlexSplitLine(incoming As String, fieldDelimiter As String, quoteDelimiter As String) As String()
    If incoming Is Nothing Then
        Throw New ArgumentNullException("incoming")
    End If
    If quoteDelimiter Is Nothing OrElse quoteDelimiter.Length = 0 Then
        quoteDelimiter = """"
    End If
    If fieldDelimiter Is Nothing OrElse fieldDelimiter.Length = 0 Then
        fieldDelimiter = ","
    End If

    Dim QuoteChar As Char = quoteDelimiter(0)
    Dim CommaChar As Char = fieldDelimiter(0)
    Dim rval As New List(Of String)
    Dim Word As New System.Text.StringBuilder
    Dim inQuote As Boolean = False
    Dim index As Integer = 0

    Do While index < incoming.Length
        Dim current As Char = incoming(index)

        If current = QuoteChar Then
            ' A doubled quote is an escaped literal quote ONLY while inside a
            ' quoted field. Outside one, the first quote opens the field, so
            ' an empty quoted field ("") yields an empty string rather than
            ' a stray quote character.
            If inQuote AndAlso index < incoming.Length - 1 AndAlso incoming(index + 1) = QuoteChar Then
                Word.Append(QuoteChar)
                index += 1 ' Skip the second quote of the escaped pair.
            Else
                inQuote = Not inQuote
            End If
        ElseIf current = CommaChar AndAlso Not inQuote Then
            ' Unquoted delimiter: the current field is complete.
            rval.Add(Word.ToString)
            Word.Length = 0
        Else
            Word.Append(current)
        End If

        index += 1
    Loop

    If inQuote Then
        Throw New IndexOutOfRangeException("Ran past the end of the line while looking for the ending quote character.")
    End If

    ' The final field has no trailing delimiter, so flush it here.
    rval.Add(Word.ToString)
    Return rval.ToArray
End Function
TextFieldParser 非常适合处理这类事情,而且肯定比自己动手写解析器更容易。测试起来也很简单:我把你的示例复制到一个文件中,然后:
Imports Microsoft.VisualBasic.FileIO
...
Using parser = New TextFieldParser("C:\Temp\CSVPARSER.TXT")
    ' Comma-delimited input whose fields may be wrapped in double quotes.
    parser.TextFieldType = FieldType.Delimited
    parser.SetDelimiters(",")
    parser.HasFieldsEnclosedInQuotes = True

    Do Until parser.EndOfData
        data = parser.ReadFields()
        ' Join with a pipe so the column breaks are visible in the output.
        Console.WriteLine(String.Join("|", data))
    Loop
End Using
HasFieldsEnclosedInQuotes = True 在这种情况下很重要。结果:
1997|Ford|E350|Super, "luxurious" truck
"Super" 后面的逗号看起来不太合适(很可能确实如此),但在原文中它位于引号之内:1997,Ford,E350,"Super, ""luxurious"" truck"
还有其他 libraries/packages 也适用于各种 CSV 布局和格式。