使用 SSIS OR T-SQL 将一列带引号和不带引号的逗号分隔值拆分为多列
Using SSIS OR T-SQL Split a column of quoted & unquoted comma separated values into multiple columns
我在名为 C0 的列中有逗号分隔的数据。
C0 中的数据如下所示:
C0
"Pacey LLC.",213830ZZ,11/1/2017,11/1/2017,"297,311.74","2,371.40",0.00,"1,325.18",0.00,42.22,"123,986.56"
Mike The Miker,9814140VCD,12/1/2018,12/1/2018,"3,917,751.99","419,743.54","36,642.66","344,090.43",0.00,10.00,"2,434,671.06"
我希望它最终是这样的:
F1
F2
F3
F4
F5
F6
F7
F8
F9
F10
F11
"Pacey LLC."
213830ZZ
11/1/2017
11/1/2017
297,311.74
2,371.40
0.00
1,325.18
0.00
42.22
123,986.56
Mike The Miker
9814140VCD
12/1/2018
12/1/2018
3,917,751.99
419,743.54
36,642.66
344,090.43
0.00
10.00
2,434,671.06
我试过嵌套 REPLACE,但 T-SQL 没有正则表达式,我找不到可靠的搜索模式。我也试过这位作者介绍的 SSIS TOKEN 方法,但两种方法都没有成功。
嵌套替换方法卡在小于 1,000(如 0.00)的货币字段上,SSIS TOKEN 方法假定所有字段都以引号分隔,而在我的示例中它们不是。
如前所述,TSQL 是错误的工具。尽管如此,这是可以做到的(至少对于给定的集合)。如果这是一次性操作,您可以尝试一下。如果这是现实场景中重复出现的任务,我会尝试以适当的格式获取数据。
但是,这适用于给定的行:
-- Splits one CSV line per row into typed columns, honoring double-quoted
-- fields that themselves contain commas (e.g. "297,311.74").
-- Approach: walk each string character by character, track whether we are
-- inside a quoted region, swap every delimiter comma (one outside quotes) for
-- the marker '$%&', then turn the marked string into a JSON array and let
-- OPENJSON ... WITH read the fields by position.
-- Requires SQL Server 2016+ (OPENJSON, STRING_ESCAPE).

-- Sample data: ID identifies the line, YourString holds the raw CSV text.
DECLARE @t1 TABLE(ID INT IDENTITY, YourString NVARCHAR(1000));
INSERT INTO @t1(YourString) VALUES(N'"Pacey LLC.",213830ZZ,11/1/2017,11/1/2017,"297,311.74","2,371.40",0.00,"1,325.18",0.00,42.22,"123,986.56"')
,(N'Mike The Miker,9814140VCD,12/1/2018,12/1/2018,"3,917,751.99","419,743.54","36,642.66","344,090.43",0.00,10.00,"2,434,671.06"');

-- The data carries dates in a culture-specific format (a bad idea);
-- switching the source to ISO 8601 would be the better fix.
-- Setting the date format helps here, but it is not a recommended practice.
SET DATEFORMAT dmy;

-- First CTE: APPLY plus a computed TOP() yields one row per character
-- position, so every character can be inspected individually.
WITH singleChars AS
(
    SELECT t.ID
          ,A.Pos
          ,SUBSTRING(t.YourString,A.Pos,1) AS CharOnPos
    FROM @t1 t
    CROSS APPLY(SELECT TOP (LEN(t.YourString)) ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) FROM master..spt_values) A(Pos) --master..spt_values can be any table with sufficient rows
)
-- Recursive CTE: runs through the string one position at a time, remembering
-- whether the current position lies inside a quoted region.
,recCTE AS
(
    -- Anchor: position 1. A leading comma is already a delimiter (empty first
    -- field), so it must become the marker just like any later unquoted comma.
    SELECT sc.ID
          ,sc.Pos
          ,sc.CharOnPos
          ,CASE WHEN sc.CharOnPos=N'"' THEN 1 ELSE 0 END AS QuoteIsOpen
          ,CAST(CASE WHEN sc.CharOnPos=N',' THEN N'$%&' ELSE sc.CharOnPos END AS NVARCHAR(MAX)) AS GrowingString
    FROM singleChars sc
    WHERE sc.Pos=1

    UNION ALL

    -- Step: append the next character; a comma seen while no quote is open is
    -- replaced by the marker.
    SELECT sc.ID
          ,sc.Pos
          ,sc.CharOnPos
          ,A.QuoteIsStillOpen
          ,CONCAT(r.GrowingString,CASE WHEN sc.CharOnPos=N',' AND A.QuoteIsStillOpen=0 THEN N'$%&' ELSE sc.CharOnPos END)
    FROM singleChars sc
    INNER JOIN recCTE r ON sc.ID = r.ID AND sc.Pos=r.Pos+1
    -- Every quote character toggles the open/closed state; anything else keeps it.
    CROSS APPLY(VALUES(CASE WHEN sc.CharOnPos=N'"' THEN CASE WHEN r.QuoteIsOpen=1 THEN 0 ELSE 1 END ELSE r.QuoteIsOpen END )) A(QuoteIsStillOpen)
)
-- TOP 1 WITH TIES ordered by a partitioned ROW_NUMBER() keeps, per ID, only
-- the row with the highest Pos, i.e. the fully built string of the recursion.
,newlySeparated AS
(
    SELECT TOP 1 WITH TIES r.ID
                          ,r.GrowingString
    FROM recCTE r
    ORDER BY ROW_NUMBER() OVER(PARTITION BY r.ID ORDER BY r.Pos DESC)
)
-- Final SELECT: strip the quotes, JSON-escape the remaining text (so a
-- backslash or control character in a field cannot break the JSON), turn the
-- markers into "," and read the resulting array position- and type-safe.
SELECT A.Company
      ,A.Code1
      ,A.Date1
      ,A.Date2
      ,A.Decimal1
      ,A.Decimal2
      ,A.Decimal3
      ,A.Decimal4
      ,A.Decimal5
      ,A.Decimal6
      ,A.Decimal7
FROM newlySeparated ns
CROSS APPLY OPENJSON(CONCAT(N'[["',REPLACE(STRING_ESCAPE(REPLACE(ns.GrowingString,N'"',N''),'json'),N'$%&',N'","'),N'"]]'))
WITH(Company  NVARCHAR(100) '$[0]'
    ,Code1    NVARCHAR(100) '$[1]'
    ,Date1    DATE          '$[2]'
    ,Date2    DATE          '$[3]'
    ,Decimal1 NVARCHAR(100) '$[4]' --Using a numbers type might work here, this depends on your machine
    ,Decimal2 NVARCHAR(100) '$[5]'
    ,Decimal3 NVARCHAR(100) '$[6]'
    ,Decimal4 NVARCHAR(100) '$[7]'
    ,Decimal5 NVARCHAR(100) '$[8]'
    ,Decimal6 NVARCHAR(100) '$[9]'
    ,Decimal7 NVARCHAR(100) '$[10]') A
-- One recursion level per character: lift the default limit of 100 levels.
OPTION(MAXRECURSION 0);
结果
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+
| Pacey LLC. | 213830ZZ | 2017-01-11 | 2017-01-11 | 297,311.74 | 2,371.40 | 0.00 | 1,325.18 | 0.00 | 42.22 | 123,986.56 |
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+
| Mike The Miker | 9814140VCD | 2018-01-12 | 2018-01-12 | 3,917,751.99 | 419,743.54 | 36,642.66 | 344,090.43 | 0.00 | 10.00 | 2,434,671.06 |
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+
我在名为 C0 的列中有逗号分隔的数据。
C0 中的数据如下所示:
C0 |
---|
"Pacey LLC.",213830ZZ,11/1/2017,11/1/2017,"297,311.74","2,371.40",0.00,"1,325.18",0.00,42.22,"123,986.56" |
Mike The Miker,9814140VCD,12/1/2018,12/1/2018,"3,917,751.99","419,743.54","36,642.66","344,090.43",0.00,10.00,"2,434,671.06" |
我希望它最终是这样的:
F1 | F2 | F3 | F4 | F5 | F6 | F7 | F8 | F9 | F10 | F11 |
---|---|---|---|---|---|---|---|---|---|---|
"Pacey LLC." | 213830ZZ | 11/1/2017 | 11/1/2017 | 297,311.74 | 2,371.40 | 0.00 | 1,325.18 | 0.00 | 42.22 | 123,986.56 |
Mike The Miker | 9814140VCD | 12/1/2018 | 12/1/2018 | 3,917,751.99 | 419,743.54 | 36,642.66 | 344,090.43 | 0.00 | 10.00 | 2,434,671.06 |
我试过嵌套 REPLACE,但 T-SQL 没有正则表达式,我找不到可靠的搜索模式。我也试过这位作者介绍的 SSIS TOKEN 方法,但两种方法都没有成功。
嵌套替换方法卡在小于 1,000(如 0.00)的货币字段上,SSIS TOKEN 方法假定所有字段都以引号分隔,而在我的示例中它们不是。
如前所述,TSQL 是错误的工具。尽管如此,这是可以做到的(至少对于给定的集合)。如果这是一次性操作,您可以尝试一下。如果这是现实场景中重复出现的任务,我会尝试以适当的格式获取数据。
但是,这适用于给定的行:
-- Splits one CSV line per row into typed columns, honoring double-quoted
-- fields that themselves contain commas (e.g. "297,311.74").
-- Approach: walk each string character by character, track whether we are
-- inside a quoted region, swap every delimiter comma (one outside quotes) for
-- the marker '$%&', then turn the marked string into a JSON array and let
-- OPENJSON ... WITH read the fields by position.
-- Requires SQL Server 2016+ (OPENJSON, STRING_ESCAPE).

-- Sample data: ID identifies the line, YourString holds the raw CSV text.
DECLARE @t1 TABLE(ID INT IDENTITY, YourString NVARCHAR(1000));
INSERT INTO @t1(YourString) VALUES(N'"Pacey LLC.",213830ZZ,11/1/2017,11/1/2017,"297,311.74","2,371.40",0.00,"1,325.18",0.00,42.22,"123,986.56"')
,(N'Mike The Miker,9814140VCD,12/1/2018,12/1/2018,"3,917,751.99","419,743.54","36,642.66","344,090.43",0.00,10.00,"2,434,671.06"');

-- The data carries dates in a culture-specific format (a bad idea);
-- switching the source to ISO 8601 would be the better fix.
-- Setting the date format helps here, but it is not a recommended practice.
SET DATEFORMAT dmy;

-- First CTE: APPLY plus a computed TOP() yields one row per character
-- position, so every character can be inspected individually.
WITH singleChars AS
(
    SELECT t.ID
          ,A.Pos
          ,SUBSTRING(t.YourString,A.Pos,1) AS CharOnPos
    FROM @t1 t
    CROSS APPLY(SELECT TOP (LEN(t.YourString)) ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) FROM master..spt_values) A(Pos) --master..spt_values can be any table with sufficient rows
)
-- Recursive CTE: runs through the string one position at a time, remembering
-- whether the current position lies inside a quoted region.
,recCTE AS
(
    -- Anchor: position 1. A leading comma is already a delimiter (empty first
    -- field), so it must become the marker just like any later unquoted comma.
    SELECT sc.ID
          ,sc.Pos
          ,sc.CharOnPos
          ,CASE WHEN sc.CharOnPos=N'"' THEN 1 ELSE 0 END AS QuoteIsOpen
          ,CAST(CASE WHEN sc.CharOnPos=N',' THEN N'$%&' ELSE sc.CharOnPos END AS NVARCHAR(MAX)) AS GrowingString
    FROM singleChars sc
    WHERE sc.Pos=1

    UNION ALL

    -- Step: append the next character; a comma seen while no quote is open is
    -- replaced by the marker.
    SELECT sc.ID
          ,sc.Pos
          ,sc.CharOnPos
          ,A.QuoteIsStillOpen
          ,CONCAT(r.GrowingString,CASE WHEN sc.CharOnPos=N',' AND A.QuoteIsStillOpen=0 THEN N'$%&' ELSE sc.CharOnPos END)
    FROM singleChars sc
    INNER JOIN recCTE r ON sc.ID = r.ID AND sc.Pos=r.Pos+1
    -- Every quote character toggles the open/closed state; anything else keeps it.
    CROSS APPLY(VALUES(CASE WHEN sc.CharOnPos=N'"' THEN CASE WHEN r.QuoteIsOpen=1 THEN 0 ELSE 1 END ELSE r.QuoteIsOpen END )) A(QuoteIsStillOpen)
)
-- TOP 1 WITH TIES ordered by a partitioned ROW_NUMBER() keeps, per ID, only
-- the row with the highest Pos, i.e. the fully built string of the recursion.
,newlySeparated AS
(
    SELECT TOP 1 WITH TIES r.ID
                          ,r.GrowingString
    FROM recCTE r
    ORDER BY ROW_NUMBER() OVER(PARTITION BY r.ID ORDER BY r.Pos DESC)
)
-- Final SELECT: strip the quotes, JSON-escape the remaining text (so a
-- backslash or control character in a field cannot break the JSON), turn the
-- markers into "," and read the resulting array position- and type-safe.
SELECT A.Company
      ,A.Code1
      ,A.Date1
      ,A.Date2
      ,A.Decimal1
      ,A.Decimal2
      ,A.Decimal3
      ,A.Decimal4
      ,A.Decimal5
      ,A.Decimal6
      ,A.Decimal7
FROM newlySeparated ns
CROSS APPLY OPENJSON(CONCAT(N'[["',REPLACE(STRING_ESCAPE(REPLACE(ns.GrowingString,N'"',N''),'json'),N'$%&',N'","'),N'"]]'))
WITH(Company  NVARCHAR(100) '$[0]'
    ,Code1    NVARCHAR(100) '$[1]'
    ,Date1    DATE          '$[2]'
    ,Date2    DATE          '$[3]'
    ,Decimal1 NVARCHAR(100) '$[4]' --Using a numbers type might work here, this depends on your machine
    ,Decimal2 NVARCHAR(100) '$[5]'
    ,Decimal3 NVARCHAR(100) '$[6]'
    ,Decimal4 NVARCHAR(100) '$[7]'
    ,Decimal5 NVARCHAR(100) '$[8]'
    ,Decimal6 NVARCHAR(100) '$[9]'
    ,Decimal7 NVARCHAR(100) '$[10]') A
-- One recursion level per character: lift the default limit of 100 levels.
OPTION(MAXRECURSION 0);
结果
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+
| Pacey LLC. | 213830ZZ | 2017-01-11 | 2017-01-11 | 297,311.74 | 2,371.40 | 0.00 | 1,325.18 | 0.00 | 42.22 | 123,986.56 |
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+
| Mike The Miker | 9814140VCD | 2018-01-12 | 2018-01-12 | 3,917,751.99 | 419,743.54 | 36,642.66 | 344,090.43 | 0.00 | 10.00 | 2,434,671.06 |
+----------------+------------+------------+------------+--------------+------------+-----------+------------+------+-------+--------------+