在某些列含有非唯一值的情况下展开 (spread) 数据框
Spread a data frame with non unique values in some columns
这是我正在处理的数据:
> data
Segment Product Value Key
1 non-domestic S1 517.50760307564053 Actuals Sales
2 non-domestic S2 1235.3088913918129 Actuals Sales
3 non-domestic S3 2141.6841816176966 Actuals Sales
4 domestic S1 -958.38836859580044 Actuals Sales
5 domestic S2 -1129.5593769492507 Actuals Sales
6 domestic S3 -137.68477107274975 Actuals Sales
7 non-domestic S1 -296.07559218703756 Quarter Sales
8 non-domestic S2 1092.0390648120747 Quarter Sales
9 non-domestic S3 1156.2866848179935 Quarter Sales
10 domestic S1 -1975.0222255105061 Quarter Sales
11 domestic S2 -2549.8125184965966 Quarter Sales
12 domestic S3 -2608.2434152116011 Quarter Sales
我正在尝试展开它以获得具有 6 行和 4 列 (Segment, Product, Actuals Sales, Quarter Sales
) 且没有缺失值的 table
# Attempted long-to-wide conversion; it yields the NA-padded 12-row output shown below.
spread(data=data, key=Key, value=Value)
不幸的是,我得到的是这个。我知道这是因为 Segment
和 Product
列中有非唯一值。
Segment Product Actuals Sales Quarter Sales
1 domestic S1 -958.38836859580044 <NA>
2 domestic S2 -1129.5593769492507 <NA>
3 domestic S3 -137.68477107274975 <NA>
4 domestic S1 <NA> -1975.0222255105061
5 domestic S2 <NA> -2549.8125184965966
6 domestic S3 <NA> -2608.2434152116011
7 non-domestic S1 517.50760307564053 <NA>
8 non-domestic S2 1235.3088913918129 <NA>
9 non-domestic S3 2141.6841816176966 <NA>
10 non-domestic S1 <NA> -296.07559218703756
11 non-domestic S2 <NA> 1092.0390648120747
12 non-domestic S3 <NA> 1156.2866848179935
你能帮我吗,我如何删除缺失值并创建一个 table,其中前两列中的值不重复?
这是可重现的例子:
> dput(data)
structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic ", "non-domestic ",
"non-domestic ", "domestic ", "domestic ", "domestic "), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
我会使用 data.table 包来生成 2 个表,然后合并它们。
希望这段代码对您有所帮助。
library(data.table)
# "test" is your data frame input (columns: Segment, Product, Value, Key).
test <- data.table(test)
# Some Segment values carry trailing whitespace ("domestic " vs "domestic");
# trim them so both subsets share identical join keys.
test[, Segment := trimws(Segment)]
# Split by Key. The labels contain a space ("Actuals Sales", not "ActualsSales"),
# otherwise both subsets would be empty.
a <- test[Key == "Actuals Sales",
          .(Segment = Segment, Product = Product, ActualsSales = Value)]
b <- test[Key == "Quarter Sales",
          .(Segment = Segment, Product = Product, QuarterSales = Value)]
# Join the two subsets on the id columns: one row per Segment/Product pair.
output <- merge(a, b, by = c("Segment", "Product"))
print(output)
删除不需要的空格 (trimws()
) 并转换为宽格式
library(data.table)
# Convert mydata to a data.table by reference, then cast long -> wide.
# trimws() inside the formula strips the stray whitespace from Segment so the
# Actuals and Quarter rows of the same Segment/Product land on one row.
dcast(
  data = setDT(mydata),
  formula = trimws(Segment) + Product ~ Key,
  value.var = "Value",
  fill = NA
)
# Segment Product Actuals Sales Quarter Sales
# 1: domestic S1 -958.38836859580044 -1975.0222255105061
# 2: domestic S2 -1129.5593769492507 -2549.8125184965966
# 3: domestic S3 -137.68477107274975 -2608.2434152116011
# 4: non-domestic S1 517.50760307564053 -296.07559218703756
# 5: non-domestic S2 1235.3088913918129 1092.0390648120747
# 6: non-domestic S3 2141.6841816176966 1156.2866848179935
使用 reshape
的基础 R 选项
# Base R: trim the stray whitespace from Segment, then reshape long -> wide,
# giving one row per Segment/Product and one Value.<Key> column per Key.
reshape(
  within(data, Segment <- trimws(Segment)),
  timevar = "Key",
  idvar = c("Segment", "Product"),
  direction = "wide"
)
结果为
Segment Product Value.Actuals Sales Value.Quarter Sales
1 non-domestic S1 517.50760307564053 -296.07559218703756
2 non-domestic S2 1235.3088913918129 1092.0390648120747
3 non-domestic S3 2141.6841816176966 1156.2866848179935
4 domestic S1 -958.38836859580044 -1975.0222255105061
5 domestic S2 -1129.5593769492507 -2549.8125184965966
6 domestic S3 -137.68477107274975 -2608.2434152116011
# "df" is the long-format data frame (columns: Segment, Product, Value, Key).
# Trim stray whitespace so "domestic " and "domestic" identify the same row.
df$Segment <- trimws(df$Segment)
# Composite id used to pair values by Segment/Product instead of by row position,
# so the result does not depend on how the rows happen to be ordered.
row_id <- paste(df$Segment, df$Product, sep = "\t")
is_quarter <- df$Key == "Quarter Sales"
is_actuals <- df$Key == "Actuals Sales"
# For every row, look up the Quarter/Actuals value that belongs to its id
# (NA when that id has no such Key).
df$QS <- df$Value[is_quarter][match(row_id, row_id[is_quarter])]
df$AS <- df$Value[is_actuals][match(row_id, row_id[is_actuals])]
df$Key <- NULL
# Keep one row per Segment/Product, then drop ids that lack either value.
# The original Value column is kept; use `df$Value <- NULL` to drop it.
df <- df[!duplicated(row_id), ]
df <- df[complete.cases(df), ]
您的示例数据实际上包含一些空格,在删除这些空格后 pivot_wider
及其参数 id_cols
就能完美地工作
data <- structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic", "non-domestic",
"non-domestic", "domestic", "domestic", "domestic"), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
library(tidyr)
# Long -> wide: one row per Segment/Product, one column per distinct Key,
# filled with the matching Value.
pivot_wider(
  data,
  id_cols = c(Segment, Product),
  names_from = Key,
  values_from = Value
)
#> # A tibble: 6 x 4
#> Segment Product `Actuals Sales` `Quarter Sales`
#> <chr> <chr> <chr> <chr>
#> 1 non-domestic S1 517.50760307564053 -296.07559218703756
#> 2 non-domestic S2 1235.3088913918129 1092.0390648120747
#> 3 non-domestic S3 2141.6841816176966 1156.2866848179935
#> 4 domestic S1 -958.38836859580044 -1975.0222255105061
#> 5 domestic S2 -1129.5593769492507 -2549.8125184965966
#> 6 domestic S3 -137.68477107274975 -2608.2434152116011
不过,如果您的实际数据还包含空格,您可以在转换为宽格式 (pivot) 之前使用 stringr::str_trim()
。
data <- structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic ", "non-domestic ",
"non-domestic ", "domestic ", "domestic ", "domestic "), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
library(tidyverse)
# Strip the stray whitespace from Segment first, so that "domestic " and
# "domestic" collapse into the same id, then pivot long -> wide.
data %>%
  mutate(Segment = str_trim(Segment)) %>%
  pivot_wider(
    id_cols = c(Segment, Product),
    names_from = Key,
    values_from = Value
  )
#> # A tibble: 6 x 4
#> Segment Product `Actuals Sales` `Quarter Sales`
#> <chr> <chr> <chr> <chr>
#> 1 non-domestic S1 517.50760307564053 -296.07559218703756
#> 2 non-domestic S2 1235.3088913918129 1092.0390648120747
#> 3 non-domestic S3 2141.6841816176966 1156.2866848179935
#> 4 domestic S1 -958.38836859580044 -1975.0222255105061
#> 5 domestic S2 -1129.5593769492507 -2549.8125184965966
#> 6 domestic S3 -137.68477107274975 -2608.2434152116011
由 reprex package (v2.0.0)
于 2021-06-11 创建
这是我正在处理的数据:
> data
Segment Product Value Key
1 non-domestic S1 517.50760307564053 Actuals Sales
2 non-domestic S2 1235.3088913918129 Actuals Sales
3 non-domestic S3 2141.6841816176966 Actuals Sales
4 domestic S1 -958.38836859580044 Actuals Sales
5 domestic S2 -1129.5593769492507 Actuals Sales
6 domestic S3 -137.68477107274975 Actuals Sales
7 non-domestic S1 -296.07559218703756 Quarter Sales
8 non-domestic S2 1092.0390648120747 Quarter Sales
9 non-domestic S3 1156.2866848179935 Quarter Sales
10 domestic S1 -1975.0222255105061 Quarter Sales
11 domestic S2 -2549.8125184965966 Quarter Sales
12 domestic S3 -2608.2434152116011 Quarter Sales
我正在尝试展开它以获得具有 6 行和 4 列 (Segment, Product, Actuals Sales, Quarter Sales
) 且没有缺失值的 table
# Attempted long-to-wide conversion; it yields the NA-padded 12-row output shown below.
spread(data=data, key=Key, value=Value)
不幸的是,我得到的是这个。我知道这是因为 Segment
和 Product
列中有非唯一值。
Segment Product Actuals Sales Quarter Sales
1 domestic S1 -958.38836859580044 <NA>
2 domestic S2 -1129.5593769492507 <NA>
3 domestic S3 -137.68477107274975 <NA>
4 domestic S1 <NA> -1975.0222255105061
5 domestic S2 <NA> -2549.8125184965966
6 domestic S3 <NA> -2608.2434152116011
7 non-domestic S1 517.50760307564053 <NA>
8 non-domestic S2 1235.3088913918129 <NA>
9 non-domestic S3 2141.6841816176966 <NA>
10 non-domestic S1 <NA> -296.07559218703756
11 non-domestic S2 <NA> 1092.0390648120747
12 non-domestic S3 <NA> 1156.2866848179935
你能帮我吗,我如何删除缺失值并创建一个 table,其中前两列中的值不重复?
这是可重现的例子:
> dput(data)
structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic ", "non-domestic ",
"non-domestic ", "domestic ", "domestic ", "domestic "), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
我会使用 data.table 包来生成 2 个表,然后合并它们。
希望这段代码对您有所帮助。
library(data.table)
# "test" is your data frame input (columns: Segment, Product, Value, Key).
test <- data.table(test)
# Some Segment values carry trailing whitespace ("domestic " vs "domestic");
# trim them so both subsets share identical join keys.
test[, Segment := trimws(Segment)]
# Split by Key. The labels contain a space ("Actuals Sales", not "ActualsSales"),
# otherwise both subsets would be empty.
a <- test[Key == "Actuals Sales",
          .(Segment = Segment, Product = Product, ActualsSales = Value)]
b <- test[Key == "Quarter Sales",
          .(Segment = Segment, Product = Product, QuarterSales = Value)]
# Join the two subsets on the id columns: one row per Segment/Product pair.
output <- merge(a, b, by = c("Segment", "Product"))
print(output)
删除不需要的空格 (trimws()
) 并转换为宽格式
library(data.table)
# Convert mydata to a data.table by reference, then cast long -> wide.
# trimws() inside the formula strips the stray whitespace from Segment so the
# Actuals and Quarter rows of the same Segment/Product land on one row.
dcast(
  data = setDT(mydata),
  formula = trimws(Segment) + Product ~ Key,
  value.var = "Value",
  fill = NA
)
# Segment Product Actuals Sales Quarter Sales
# 1: domestic S1 -958.38836859580044 -1975.0222255105061
# 2: domestic S2 -1129.5593769492507 -2549.8125184965966
# 3: domestic S3 -137.68477107274975 -2608.2434152116011
# 4: non-domestic S1 517.50760307564053 -296.07559218703756
# 5: non-domestic S2 1235.3088913918129 1092.0390648120747
# 6: non-domestic S3 2141.6841816176966 1156.2866848179935
使用 reshape 的基础 R 选项
# Base R: trim the stray whitespace from Segment, then reshape long -> wide,
# giving one row per Segment/Product and one Value.<Key> column per Key.
reshape(
  within(data, Segment <- trimws(Segment)),
  timevar = "Key",
  idvar = c("Segment", "Product"),
  direction = "wide"
)
结果为
Segment Product Value.Actuals Sales Value.Quarter Sales
1 non-domestic S1 517.50760307564053 -296.07559218703756
2 non-domestic S2 1235.3088913918129 1092.0390648120747
3 non-domestic S3 2141.6841816176966 1156.2866848179935
4 domestic S1 -958.38836859580044 -1975.0222255105061
5 domestic S2 -1129.5593769492507 -2549.8125184965966
6 domestic S3 -137.68477107274975 -2608.2434152116011
# "df" is the long-format data frame (columns: Segment, Product, Value, Key).
# Trim stray whitespace so "domestic " and "domestic" identify the same row.
df$Segment <- trimws(df$Segment)
# Composite id used to pair values by Segment/Product instead of by row position,
# so the result does not depend on how the rows happen to be ordered.
row_id <- paste(df$Segment, df$Product, sep = "\t")
is_quarter <- df$Key == "Quarter Sales"
is_actuals <- df$Key == "Actuals Sales"
# For every row, look up the Quarter/Actuals value that belongs to its id
# (NA when that id has no such Key).
df$QS <- df$Value[is_quarter][match(row_id, row_id[is_quarter])]
df$AS <- df$Value[is_actuals][match(row_id, row_id[is_actuals])]
df$Key <- NULL
# Keep one row per Segment/Product, then drop ids that lack either value.
# The original Value column is kept; use `df$Value <- NULL` to drop it.
df <- df[!duplicated(row_id), ]
df <- df[complete.cases(df), ]
您的示例数据实际上包含一些空格,在删除这些空格后 pivot_wider
及其参数 id_cols
就能完美地工作
data <- structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic", "non-domestic",
"non-domestic", "domestic", "domestic", "domestic"), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
library(tidyr)
# Long -> wide: one row per Segment/Product, one column per distinct Key,
# filled with the matching Value.
pivot_wider(
  data,
  id_cols = c(Segment, Product),
  names_from = Key,
  values_from = Value
)
#> # A tibble: 6 x 4
#> Segment Product `Actuals Sales` `Quarter Sales`
#> <chr> <chr> <chr> <chr>
#> 1 non-domestic S1 517.50760307564053 -296.07559218703756
#> 2 non-domestic S2 1235.3088913918129 1092.0390648120747
#> 3 non-domestic S3 2141.6841816176966 1156.2866848179935
#> 4 domestic S1 -958.38836859580044 -1975.0222255105061
#> 5 domestic S2 -1129.5593769492507 -2549.8125184965966
#> 6 domestic S3 -137.68477107274975 -2608.2434152116011
不过,如果您的实际数据还包含空格,您可以在转换为宽格式 (pivot) 之前使用 stringr::str_trim()
。
data <- structure(list(Segment = c("non-domestic", "non-domestic", "non-domestic",
"domestic", "domestic", "domestic", "non-domestic ", "non-domestic ",
"non-domestic ", "domestic ", "domestic ", "domestic "), Product = c("S1",
"S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3", "S1", "S2", "S3"
), Value = c("517.50760307564053", "1235.3088913918129", "2141.6841816176966",
"-958.38836859580044", "-1129.5593769492507", "-137.68477107274975",
"-296.07559218703756", "1092.0390648120747", "1156.2866848179935",
"-1975.0222255105061", "-2549.8125184965966", "-2608.2434152116011"
), Key = c("Actuals Sales", "Actuals Sales", "Actuals Sales",
"Actuals Sales", "Actuals Sales", "Actuals Sales", "Quarter Sales",
"Quarter Sales", "Quarter Sales", "Quarter Sales", "Quarter Sales",
"Quarter Sales")), .Names = c("Segment", "Product", "Value",
"Key"), row.names = c(NA, -12L), class = "data.frame")
library(tidyverse)
# Strip the stray whitespace from Segment first, so that "domestic " and
# "domestic" collapse into the same id, then pivot long -> wide.
data %>%
  mutate(Segment = str_trim(Segment)) %>%
  pivot_wider(
    id_cols = c(Segment, Product),
    names_from = Key,
    values_from = Value
  )
#> # A tibble: 6 x 4
#> Segment Product `Actuals Sales` `Quarter Sales`
#> <chr> <chr> <chr> <chr>
#> 1 non-domestic S1 517.50760307564053 -296.07559218703756
#> 2 non-domestic S2 1235.3088913918129 1092.0390648120747
#> 3 non-domestic S3 2141.6841816176966 1156.2866848179935
#> 4 domestic S1 -958.38836859580044 -1975.0222255105061
#> 5 domestic S2 -1129.5593769492507 -2549.8125184965966
#> 6 domestic S3 -137.68477107274975 -2608.2434152116011
由 reprex package (v2.0.0)
于 2021-06-11 创建