如何在 SQL Server 中复现 R 的 dcast()
How to replicate dcast() from R in SQL Server
目前我正在使用 R 转换我的表数据,其形式如下:
ID Code Condition WT
104 KEENTRAN CON4 .30577
. . . .
. . . .
此链接应可供任何想要下载我的数据框的人使用;否则,以下是其中的一个子集:
>dput(head(df))
structure(list(ID = c(104L, 368L, 10632L, 20385L, 24361L, 34378L
), Code = c("KEENTRAN", "ALEXEXPR", "MINNEXPMN", "JACKMOVWI",
"FREICOIN", "JBXEXPGA"), Condition = c("CON4", "CON4", "CON2",
"CON2", "CON6", "CON5"), WT = c(0.3057717456, 0.7909870604, 1,
1, 0.4301040524, 0.5977268575)), .Names = c("ID", "Code", "Condition",
"WT"), class = c("tbl_df", "data.frame"), row.names = c(NA, -6L
))
背景
我的示例数据是长格式,其中 Condition 变量的取值范围为 "CON1" 到 "CON6"。我想将数据重塑为宽格式:以 ID 和 Code 的值作为主键,Condition 的各个级别成为列,每列的值取该 ID、Code、Condition 分组中 WT 的最大值(如果不存在这样的组合,则为零)。这可以在 R 中使用 reshape2 包中的 dcast() 函数轻松完成:
# reshape2 supplies dcast(); dplyr supplies %>%, group_by() and summarise(),
# which the pipeline below relies on.
library(reshape2)
library(dplyr)

# Collapse to the maximum WT per (ID, Condition) pair, then spread the
# Condition levels into columns with one row per ID.
Result <- df %>%
  group_by(ID, Condition) %>%
  summarise(value = max(as.numeric(WT))) %>%
  dcast(ID ~ Condition, value.var = "value")

# ID/Condition pairs that never occur come out of dcast() as NA; report them as 0.
Result[is.na(Result)] <- 0
我想在 SQL Server 中复现这一数据处理过程,但不确定怎样做最好。非常感谢任何帮助或见解。
我能够通过在 SQL Server 2016 中执行以下操作来回答我的问题:
首先,我将上面发布的数据(它是一个存储过程的结果)转储到一个临时表中:
-- Temp table that receives the long-format input rows (ID, Code, Condition, WT).
DROP TABLE IF EXISTS #InputDataFrame;
CREATE TABLE #InputDataFrame ( ID int, Code varchar(25), Condition varchar(25), WT float );

-- Name the target columns explicitly so the load does not silently depend on
-- the table's column order. The procedure's result set must return its four
-- columns in this same order.
INSERT INTO #InputDataFrame ( ID, Code, Condition, WT )
EXEC dbo.[Stored_Proc_to_Create_Sample_Data]; -- stored proc to create my posted data
然后我能够重新创建所需的转换如下:
-- Row keys of the wide result: one row per distinct (ID, Code) pair.
DROP TABLE IF EXISTS #DistinctIDs;
CREATE TABLE #DistinctIDs ( ID int, Code varchar(25) );

INSERT INTO #DistinctIDs ( ID, Code )
SELECT DISTINCT
    I.ID,
    I.Code
FROM #InputDataFrame AS I;

-- Wide result: for every (ID, Code) key, the maximum WT recorded for that ID
-- under each condition CON1..CON6, or 0 when the ID has no row for it.
SELECT
    D.ID,
    D.Code,
    COALESCE(W.CON1, 0) AS CON1,
    COALESCE(W.CON2, 0) AS CON2,
    COALESCE(W.CON3, 0) AS CON3,
    COALESCE(W.CON4, 0) AS CON4,
    COALESCE(W.CON5, 0) AS CON5,
    COALESCE(W.CON6, 0) AS CON6
FROM #DistinctIDs AS D
LEFT JOIN (
    -- Conditional aggregation: a single pass over the input computes all six
    -- per-ID maxima. A CASE without ELSE yields NULL for rows of other
    -- conditions, and MAX ignores NULLs, so a column stays NULL when the ID
    -- never has that condition.
    SELECT
        I.ID,
        MAX(CASE WHEN I.Condition = 'CON1' THEN I.WT END) AS CON1,
        MAX(CASE WHEN I.Condition = 'CON2' THEN I.WT END) AS CON2,
        MAX(CASE WHEN I.Condition = 'CON3' THEN I.WT END) AS CON3,
        MAX(CASE WHEN I.Condition = 'CON4' THEN I.WT END) AS CON4,
        MAX(CASE WHEN I.Condition = 'CON5' THEN I.WT END) AS CON5,
        MAX(CASE WHEN I.Condition = 'CON6' THEN I.WT END) AS CON6
    FROM #InputDataFrame AS I
    GROUP BY I.ID
) AS W
    ON W.ID = D.ID;
这恐怕不是最优雅的解决方案,但它满足了我的需要,希望它对其他试图在 SQL 环境中模仿 R 的 dcast() 功能的人也有用。
可以考虑使用 SQL Server 的 PIVOT 操作:
-- Pivot the long (ID, Condition, WT) rows into one row per ID with one column
-- per condition CON1..CON6, holding the maximum WT for that ID/condition pair,
-- or 0 when the pair does not occur.
SELECT
    t.ID,
    ISNULL(t.[CON1], 0) AS [CON1],
    ISNULL(t.[CON2], 0) AS [CON2],
    ISNULL(t.[CON3], 0) AS [CON3],
    ISNULL(t.[CON4], 0) AS [CON4],
    ISNULL(t.[CON5], 0) AS [CON5],
    ISNULL(t.[CON6], 0) AS [CON6]
FROM (
    -- PIVOT implicitly groups by every source column that is neither the
    -- aggregated column nor the pivot column. Only ID, Condition and WT are
    -- passed in so that any other column of RDataFrame (such as Code, if the
    -- table carries it) cannot act as an extra grouping key and produce
    -- several rows for the same ID.
    SELECT
        r.ID,
        r.Condition,
        r.WT
    FROM RDataFrame AS r
) AS src
PIVOT
(
    MAX(WT)
    FOR Condition IN ([CON1], [CON2], [CON3], [CON4], [CON5], [CON6])
) AS t
-- Return the rows in a deterministic order.
ORDER BY t.ID;
-- ID CON1 CON2 CON3 CON4 CON5 CON6
-- 8 0 0 0 0.4394051665 0 0
-- 10 0 0 0 0.6013843825 0 0
-- 15 0 0 0 0.07231002554 0 0
-- 21 0 0 0 0.6013843825 0 0
-- 23 0 0 0 0.7720454793 0 0
-- 80 0 1 0 0 0 0
-- 104 0 0 0 0.3057717456 0 0
-- 144 0 0 0 0.1430937996 0 0.2646439667
-- 145 0 0 0 0.8276574 0 0
-- 155 0 1 0 0.8977280575 0 0
-- 156 0 0 0 0.8453629338 0 0
-- 158 0 0 0 0.5221399019 0 0
目前我正在使用 R 转换我的表数据,其形式如下:
ID Code Condition WT
104 KEENTRAN CON4 .30577
. . . .
. . . .
此链接应可供任何想要下载我的数据框的人使用;否则,以下是其中的一个子集:
>dput(head(df))
structure(list(ID = c(104L, 368L, 10632L, 20385L, 24361L, 34378L
), Code = c("KEENTRAN", "ALEXEXPR", "MINNEXPMN", "JACKMOVWI",
"FREICOIN", "JBXEXPGA"), Condition = c("CON4", "CON4", "CON2",
"CON2", "CON6", "CON5"), WT = c(0.3057717456, 0.7909870604, 1,
1, 0.4301040524, 0.5977268575)), .Names = c("ID", "Code", "Condition",
"WT"), class = c("tbl_df", "data.frame"), row.names = c(NA, -6L
))
背景
我的示例数据是长格式,其中 Condition 变量的取值范围为 "CON1" 到 "CON6"。我想将数据重塑为宽格式:以 ID 和 Code 的值作为主键,Condition 的各个级别成为列,每列的值取该 ID、Code、Condition 分组中 WT 的最大值(如果不存在这样的组合,则为零)。这可以在 R 中使用 reshape2 包中的 dcast() 函数轻松完成:
# reshape2 supplies dcast(); dplyr supplies %>%, group_by() and summarise(),
# which the pipeline below relies on.
library(reshape2)
library(dplyr)

# Collapse to the maximum WT per (ID, Condition) pair, then spread the
# Condition levels into columns with one row per ID.
Result <- df %>%
  group_by(ID, Condition) %>%
  summarise(value = max(as.numeric(WT))) %>%
  dcast(ID ~ Condition, value.var = "value")

# ID/Condition pairs that never occur come out of dcast() as NA; report them as 0.
Result[is.na(Result)] <- 0
我想在 SQL Server 中复现这一数据处理过程,但不确定怎样做最好。非常感谢任何帮助或见解。
我能够通过在 SQL Server 2016 中执行以下操作来回答我的问题:
首先,我将上面发布的数据(它是一个存储过程的结果)转储到一个临时表中:
-- Temp table that receives the long-format input rows (ID, Code, Condition, WT).
DROP TABLE IF EXISTS #InputDataFrame;
CREATE TABLE #InputDataFrame ( ID int, Code varchar(25), Condition varchar(25), WT float );

-- Name the target columns explicitly so the load does not silently depend on
-- the table's column order. The procedure's result set must return its four
-- columns in this same order.
INSERT INTO #InputDataFrame ( ID, Code, Condition, WT )
EXEC dbo.[Stored_Proc_to_Create_Sample_Data]; -- stored proc to create my posted data
然后我能够重新创建所需的转换如下:
-- Row keys of the wide result: one row per distinct (ID, Code) pair.
DROP TABLE IF EXISTS #DistinctIDs;
CREATE TABLE #DistinctIDs ( ID int, Code varchar(25) );

INSERT INTO #DistinctIDs ( ID, Code )
SELECT DISTINCT
    I.ID,
    I.Code
FROM #InputDataFrame AS I;

-- Wide result: for every (ID, Code) key, the maximum WT recorded for that ID
-- under each condition CON1..CON6, or 0 when the ID has no row for it.
SELECT
    D.ID,
    D.Code,
    COALESCE(W.CON1, 0) AS CON1,
    COALESCE(W.CON2, 0) AS CON2,
    COALESCE(W.CON3, 0) AS CON3,
    COALESCE(W.CON4, 0) AS CON4,
    COALESCE(W.CON5, 0) AS CON5,
    COALESCE(W.CON6, 0) AS CON6
FROM #DistinctIDs AS D
LEFT JOIN (
    -- Conditional aggregation: a single pass over the input computes all six
    -- per-ID maxima. A CASE without ELSE yields NULL for rows of other
    -- conditions, and MAX ignores NULLs, so a column stays NULL when the ID
    -- never has that condition.
    SELECT
        I.ID,
        MAX(CASE WHEN I.Condition = 'CON1' THEN I.WT END) AS CON1,
        MAX(CASE WHEN I.Condition = 'CON2' THEN I.WT END) AS CON2,
        MAX(CASE WHEN I.Condition = 'CON3' THEN I.WT END) AS CON3,
        MAX(CASE WHEN I.Condition = 'CON4' THEN I.WT END) AS CON4,
        MAX(CASE WHEN I.Condition = 'CON5' THEN I.WT END) AS CON5,
        MAX(CASE WHEN I.Condition = 'CON6' THEN I.WT END) AS CON6
    FROM #InputDataFrame AS I
    GROUP BY I.ID
) AS W
    ON W.ID = D.ID;
这恐怕不是最优雅的解决方案,但它满足了我的需要,希望它对其他试图在 SQL 环境中模仿 R 的 dcast() 功能的人也有用。
可以考虑使用 SQL Server 的 PIVOT 操作:
-- Pivot the long (ID, Condition, WT) rows into one row per ID with one column
-- per condition CON1..CON6, holding the maximum WT for that ID/condition pair,
-- or 0 when the pair does not occur.
SELECT
    t.ID,
    ISNULL(t.[CON1], 0) AS [CON1],
    ISNULL(t.[CON2], 0) AS [CON2],
    ISNULL(t.[CON3], 0) AS [CON3],
    ISNULL(t.[CON4], 0) AS [CON4],
    ISNULL(t.[CON5], 0) AS [CON5],
    ISNULL(t.[CON6], 0) AS [CON6]
FROM (
    -- PIVOT implicitly groups by every source column that is neither the
    -- aggregated column nor the pivot column. Only ID, Condition and WT are
    -- passed in so that any other column of RDataFrame (such as Code, if the
    -- table carries it) cannot act as an extra grouping key and produce
    -- several rows for the same ID.
    SELECT
        r.ID,
        r.Condition,
        r.WT
    FROM RDataFrame AS r
) AS src
PIVOT
(
    MAX(WT)
    FOR Condition IN ([CON1], [CON2], [CON3], [CON4], [CON5], [CON6])
) AS t
-- Return the rows in a deterministic order.
ORDER BY t.ID;
-- ID CON1 CON2 CON3 CON4 CON5 CON6
-- 8 0 0 0 0.4394051665 0 0
-- 10 0 0 0 0.6013843825 0 0
-- 15 0 0 0 0.07231002554 0 0
-- 21 0 0 0 0.6013843825 0 0
-- 23 0 0 0 0.7720454793 0 0
-- 80 0 1 0 0 0 0
-- 104 0 0 0 0.3057717456 0 0
-- 144 0 0 0 0.1430937996 0 0.2646439667
-- 145 0 0 0 0.8276574 0 0
-- 155 0 1 0 0.8977280575 0 0
-- 156 0 0 0 0.8453629338 0 0
-- 158 0 0 0 0.5221399019 0 0