如何在R中将数据从一列移动到另一列
How to move data from one column to another in R
由于基本表格填写不正确,我正在尝试将数据从一列移动到另一列。
表格要求提供家庭信息,并询问每位成员的年龄 (AGE) 和性别 (SEX),每个家庭最多可填写 5 人。然而,有些用户只填写了第 1、3、4 人的信息,第 2 人一栏却是空的:他们把第 2 人填错后划掉了内容,转而把第 2 人的信息填进了第 3 人的栏位,依此类推。
数据如下所示(其中 ref 为 1 和 5 的行是正确的,其余各行都存在错位):
# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
# NA marks a slot that was left empty on the form.
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  # Ages, one column per member slot
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  # Sexes, one column per member slot
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)
这是 table 目前的样子
(我用 - 替换了 NA 以便于阅读)
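ref | AGE1 | AGE2 | AGE3 | AGE4 | AGE5 | SEX1 | SEX2 | SEX3 | SEX4 | SEX5 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 45 | - | - | - | - | M | - | - | - | - |
2 | 36 | 24 | - | - | 15 | F | M | - | - | F |
3 | 26 | - | 35 | 15 | - | M | - | M | F | - |
4 | 47 | 13 | - | 11 | - | M | F | - | F | - |
5 | 24 | 57 | - | - | - | M | F | - | - | - |
6 | - | 28 | 26 | - | - | - | F | M | - | - |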
但我希望它看起来像这样
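ref | AGE1 | AGE2 | AGE3 | AGE4 | AGE5 | SEX1 | SEX2 | SEX3 | SEX4 | SEX5 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 45 | - | - | - | - | M | - | - | - | - |
2 | 36 | 24 | 15 | - | - | F | M | F | - | - |
3 | 26 | 35 | 15 | - | - | M | M | F | - | - |
4 | 47 | 13 | 11 | - | - | M | F | F | - | - |
5 | 24 | 57 | - | - | - | M | F | - | - | - |
6 | 28 | 26 | - | - | - | F | M | - | - | - |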
有没有办法使用 dplyr 更正此问题?如果没有,R 中是否还有其他方法可以更正这些数据?
这是使用 dplyr 和 tidyr 的方法。该方法先将数据转换为长格式,把 NA 值排到末尾,对列名重新编号,然后再转换回宽格式。
library(dplyr)
library(tidyr)

# `df` is the data frame defined in the question. The original snippet rebuilt
# it with data.frame(ref, AGE1, ...) from loose vectors that are never defined
# on their own, which fails with "object 'ref' not found"; that step is dropped.
df %>%
  # AGE (numeric) and SEX (character) must share one type to sit together in
  # the single `value` column produced by pivot_longer().
  mutate(across(starts_with("AGE"), as.character)) %>%
  # Long format: one row per (ref, original column name, value).
  pivot_longer(-ref) %>%
  # Split after the 3rd character: "AGE1" -> cat = "AGE", num = "1".
  separate(name, into = c("cat", "num"), sep = 3) %>%
  # Push NA values to the end; tied rows keep their existing order, so the
  # filled-in members stay in their original sequence.
  arrange(is.na(value)) %>%
  group_by(ref, cat) %>%
  # Renumber the slots 1..5 within each household and category.
  mutate(num = seq_along(value)) %>%
  ungroup() %>%
  # Put all AGE rows before SEX rows so the widened columns come out grouped.
  arrange(cat) %>%
  unite(name, cat, num, sep = "") %>%
  pivot_wider(id_cols = ref, names_from = name, values_from = value) %>%
  # Restore the numeric type of the age columns.
  mutate(across(starts_with("AGE"), as.numeric))
# A tibble: 6 x 11
ref AGE1 AGE2 AGE3 AGE4 AGE5 SEX1 SEX2 SEX3 SEX4 SEX5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 1 45 NA NA NA NA M NA NA NA NA
2 2 36 24 15 NA NA F M F NA NA
3 3 26 35 15 NA NA M M F NA NA
4 4 47 13 11 NA NA M F F NA NA
5 5 24 57 NA NA NA M F NA NA NA
6 6 28 26 NA NA NA F M NA NA NA
试试下面的 base R 代码
# Base R solution: reshape to long, reorder so empty slots come last, renumber
# the slots within each household, then reshape back to wide.
#
# Note: inside an R string literal a backslash must itself be escaped, so the
# regex \d is written "\\d" and the back-reference \1 is written "\\1".
# The single-backslash forms ("\d", "\1", "\.") are parse errors.

# Step 1: wide -> long. reshape() splits names at "." by default, so a dot is
# inserted before the first digit ("AGE1" -> "AGE.1"). `varying = -1` treats
# every column except the first (`ref`) as a repeated measurement.
u1 <- reshape(
  setNames(df, sub("(\\d)", ".\\1", names(df))),
  direction = "long",
  idvar = "ref",
  varying = -1
)

# Step 2: sort rows so those with a missing AGE (then a missing SEX) go last,
# renumber `time` 1..n within each `ref`, and reshape back to wide.
u2 <- reshape(
  transform(
    u1[with(u1, order(is.na(AGE), is.na(SEX))), ],
    time = ave(time, ref, FUN = seq_along)
  ),
  direction = "wide",
  idvar = "ref"
)

# Step 3: put the columns back in the order of `df` by matching the names with
# the dot removed ("AGE.1" -> "AGE1").
out <- u2[match(names(df), sub("\\.", "", names(u2)))]
你会得到
> out
ref AGE.1 AGE.2 AGE.3 AGE.4 AGE.5 SEX.1 SEX.2 SEX.3 SEX.4 SEX.5
1.1 1 45 NA NA NA NA M <NA> <NA> <NA> <NA>
2.1 2 36 24 15 NA NA F M F <NA> <NA>
3.1 3 26 35 15 NA NA M M F <NA> <NA>
4.1 4 47 13 11 NA NA M F F <NA> <NA>
5.1 5 24 57 NA NA NA M F <NA> <NA> <NA>
6.2 6 28 26 NA NA NA F M <NA> <NA> <NA>
数据
# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
# NA marks a slot that was left empty on the form.
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  # Ages, one column per member slot
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  # Sexes, one column per member slot
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)
这是使用 dplyr 和 tidyr 库的方法。
library(dplyr)
library(tidyr)

# Reshape so each household member is one row with its own AGE and SEX, move
# the empty members to the end, renumber, and widen again.
df %>%
  # ".value" keeps the letter prefix (AGE / SEX) as column names, while the
  # digits become `num`. The regex \d must be written "\\d" inside an R
  # string; a single backslash is a parse error.
  pivot_longer(
    cols = -ref,
    names_to = c(".value", "num"),
    names_pattern = "([A-Z]+)(\\d+)"
  ) %>%
  # Only push completely empty members to the end and otherwise keep the
  # original slot order. Sorting by the AGE/SEX values themselves would
  # reorder the members by age (15, 24, 36 instead of 36, 24, 15).
  arrange(ref, is.na(AGE) & is.na(SEX), as.integer(num)) %>%
  group_by(ref) %>%
  # Renumber the member slots 1..5 within each household.
  mutate(num = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = num, values_from = c(AGE, SEX))
#    ref AGE_1 AGE_2 AGE_3 AGE_4 AGE_5 SEX_1 SEX_2 SEX_3 SEX_4 SEX_5
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
#1     1    45    NA    NA    NA    NA M     NA    NA    NA    NA
#2     2    36    24    15    NA    NA F     M     F     NA    NA
#3     3    26    35    15    NA    NA M     M     F     NA    NA
#4     4    47    13    11    NA    NA M     F     F     NA    NA
#5     5    24    57    NA    NA    NA M     F     NA    NA    NA
#6     6    28    26    NA    NA    NA F     M     NA    NA    NA
这是一个使用 dedupewider 包的解决方案:
library(dedupewider)

# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)

# Select the AGE1..AGE5 and SEX1..SEX5 columns by name. The regex \d must be
# written "\\d" inside an R string; a single backslash is a parse error.
# na_move() moves NAs to the right by default, first across the AGE columns
# and then across the SEX columns.
age_moved <- na_move(df, cols = grep("^AGE\\d$", names(df), value = TRUE))
sex_moved <- na_move(age_moved, cols = grep("^SEX\\d$", names(df), value = TRUE))
sex_moved
#> ref AGE1 AGE2 AGE3 AGE4 AGE5 SEX1 SEX2 SEX3 SEX4 SEX5
#> 1 1 45 NA NA NA NA M <NA> <NA> NA NA
#> 2 2 36 24 15 NA NA F M F NA NA
#> 3 3 26 35 15 NA NA M M F NA NA
#> 4 4 47 13 11 NA NA M F F NA NA
#> 5 5 24 57 NA NA NA M F <NA> NA NA
#> 6 6 28 26 NA NA NA F M <NA> NA NA
由于基本表格填写不正确,我正在尝试将数据从一列移动到另一列。
表格要求提供家庭信息,并询问每位成员的年龄 (AGE) 和性别 (SEX),每个家庭最多可填写 5 人。然而,有些用户只填写了第 1、3、4 人的信息,第 2 人一栏却是空的:他们把第 2 人填错后划掉了内容,转而把第 2 人的信息填进了第 3 人的栏位,依此类推。
数据如下所示(其中 ref 为 1 和 5 的行是正确的,其余各行都存在错位):
# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
# NA marks a slot that was left empty on the form.
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  # Ages, one column per member slot
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  # Sexes, one column per member slot
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)
这是 table 目前的样子 (我用 - 替换了 NA 以便于阅读)
ref | AGE1 | AGE2 | AGE3 | AGE4 | AGE5 | SEX1 | SEX2 | SEX3 | SEX4 | SEX5 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 45 | - | - | - | - | M | - | - | - | - |
2 | 36 | 24 | - | - | 15 | F | M | - | - | F |
3 | 26 | - | 35 | 15 | - | M | - | M | F | - |
4 | 47 | 13 | - | 11 | - | M | F | - | F | - |
5 | 24 | 57 | - | - | - | M | F | - | - | - |
6 | - | 28 | 26 | - | - | - | F | M | - | - |
但我希望它看起来像这样
ref | AGE1 | AGE2 | AGE3 | AGE4 | AGE5 | SEX1 | SEX2 | SEX3 | SEX4 | SEX5 |
---|---|---|---|---|---|---|---|---|---|---|
1 | 45 | - | - | - | - | M | - | - | - | - |
2 | 36 | 24 | 15 | - | - | F | M | F | - | - |
3 | 26 | 35 | 15 | - | - | M | M | F | - | - |
4 | 47 | 13 | 11 | - | - | M | F | F | - | - |
5 | 24 | 57 | - | - | - | M | F | - | - | - |
6 | 28 | 26 | - | - | - | F | M | - | - | - |
有没有办法使用 dplyr 更正此问题?如果没有,R 中是否还有其他方法可以更正这些数据?
这是使用 dplyr 和 tidyr 的方法。该方法先将数据转换为长格式,把 NA 值排到末尾,对列名重新编号,然后再转换回宽格式。
library(dplyr)
library(tidyr)

# `df` is the data frame defined in the question. The original snippet rebuilt
# it with data.frame(ref, AGE1, ...) from loose vectors that are never defined
# on their own, which fails with "object 'ref' not found"; that step is dropped.
df %>%
  # AGE (numeric) and SEX (character) must share one type to sit together in
  # the single `value` column produced by pivot_longer().
  mutate(across(starts_with("AGE"), as.character)) %>%
  # Long format: one row per (ref, original column name, value).
  pivot_longer(-ref) %>%
  # Split after the 3rd character: "AGE1" -> cat = "AGE", num = "1".
  separate(name, into = c("cat", "num"), sep = 3) %>%
  # Push NA values to the end; tied rows keep their existing order, so the
  # filled-in members stay in their original sequence.
  arrange(is.na(value)) %>%
  group_by(ref, cat) %>%
  # Renumber the slots 1..5 within each household and category.
  mutate(num = seq_along(value)) %>%
  ungroup() %>%
  # Put all AGE rows before SEX rows so the widened columns come out grouped.
  arrange(cat) %>%
  unite(name, cat, num, sep = "") %>%
  pivot_wider(id_cols = ref, names_from = name, values_from = value) %>%
  # Restore the numeric type of the age columns.
  mutate(across(starts_with("AGE"), as.numeric))
# A tibble: 6 x 11
ref AGE1 AGE2 AGE3 AGE4 AGE5 SEX1 SEX2 SEX3 SEX4 SEX5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 1 45 NA NA NA NA M NA NA NA NA
2 2 36 24 15 NA NA F M F NA NA
3 3 26 35 15 NA NA M M F NA NA
4 4 47 13 11 NA NA M F F NA NA
5 5 24 57 NA NA NA M F NA NA NA
6 6 28 26 NA NA NA F M NA NA NA
试试下面的 base R 代码
# Base R solution: reshape to long, reorder so empty slots come last, renumber
# the slots within each household, then reshape back to wide.
#
# Note: inside an R string literal a backslash must itself be escaped, so the
# regex \d is written "\\d" and the back-reference \1 is written "\\1".
# The single-backslash forms ("\d", "\1", "\.") are parse errors.

# Step 1: wide -> long. reshape() splits names at "." by default, so a dot is
# inserted before the first digit ("AGE1" -> "AGE.1"). `varying = -1` treats
# every column except the first (`ref`) as a repeated measurement.
u1 <- reshape(
  setNames(df, sub("(\\d)", ".\\1", names(df))),
  direction = "long",
  idvar = "ref",
  varying = -1
)

# Step 2: sort rows so those with a missing AGE (then a missing SEX) go last,
# renumber `time` 1..n within each `ref`, and reshape back to wide.
u2 <- reshape(
  transform(
    u1[with(u1, order(is.na(AGE), is.na(SEX))), ],
    time = ave(time, ref, FUN = seq_along)
  ),
  direction = "wide",
  idvar = "ref"
)

# Step 3: put the columns back in the order of `df` by matching the names with
# the dot removed ("AGE.1" -> "AGE1").
out <- u2[match(names(df), sub("\\.", "", names(u2)))]
你会得到
> out
ref AGE.1 AGE.2 AGE.3 AGE.4 AGE.5 SEX.1 SEX.2 SEX.3 SEX.4 SEX.5
1.1 1 45 NA NA NA NA M <NA> <NA> <NA> <NA>
2.1 2 36 24 15 NA NA F M F <NA> <NA>
3.1 3 26 35 15 NA NA M M F <NA> <NA>
4.1 4 47 13 11 NA NA M F F <NA> <NA>
5.1 5 24 57 NA NA NA M F <NA> <NA> <NA>
6.2 6 28 26 NA NA NA F M <NA> <NA> <NA>
数据
# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
# NA marks a slot that was left empty on the form.
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  # Ages, one column per member slot
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  # Sexes, one column per member slot
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)
这是使用 dplyr 和 tidyr 库的方法。
library(dplyr)
library(tidyr)

# Reshape so each household member is one row with its own AGE and SEX, move
# the empty members to the end, renumber, and widen again.
df %>%
  # ".value" keeps the letter prefix (AGE / SEX) as column names, while the
  # digits become `num`. The regex \d must be written "\\d" inside an R
  # string; a single backslash is a parse error.
  pivot_longer(
    cols = -ref,
    names_to = c(".value", "num"),
    names_pattern = "([A-Z]+)(\\d+)"
  ) %>%
  # Only push completely empty members to the end and otherwise keep the
  # original slot order. Sorting by the AGE/SEX values themselves would
  # reorder the members by age (15, 24, 36 instead of 36, 24, 15).
  arrange(ref, is.na(AGE) & is.na(SEX), as.integer(num)) %>%
  group_by(ref) %>%
  # Renumber the member slots 1..5 within each household.
  mutate(num = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = num, values_from = c(AGE, SEX))
#    ref AGE_1 AGE_2 AGE_3 AGE_4 AGE_5 SEX_1 SEX_2 SEX_3 SEX_4 SEX_5
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
#1     1    45    NA    NA    NA    NA M     NA    NA    NA    NA
#2     2    36    24    15    NA    NA F     M     F     NA    NA
#3     3    26    35    15    NA    NA M     M     F     NA    NA
#4     4    47    13    11    NA    NA M     F     F     NA    NA
#5     5    24    57    NA    NA    NA M     F     NA    NA    NA
#6     6    28    26    NA    NA    NA F     M     NA    NA    NA
这是一个使用 dedupewider 包的解决方案:
library(dedupewider)

# Household survey responses: one row per household (`ref`), with slots for up
# to five members' ages (AGE1..AGE5) and sexes (SEX1..SEX5).
df <- data.frame(
  ref  = c(1, 2, 3, 4, 5, 6),
  AGE1 = c(45, 36, 26, 47, 24, NA),
  AGE2 = c(NA, 24, NA, 13, 57, 28),
  AGE3 = c(NA, NA, 35, NA, NA, 26),
  AGE4 = c(NA, NA, 15, 11, NA, NA),
  AGE5 = c(NA, 15, NA, NA, NA, NA),
  SEX1 = c("M", "F", "M", "M", "M", NA),
  SEX2 = c(NA, "M", NA, "F", "F", "F"),
  SEX3 = c(NA, NA, "M", NA, NA, "M"),
  SEX4 = c(NA, NA, "F", "F", NA, NA),
  SEX5 = c(NA, "F", NA, NA, NA, NA)
)

# Select the AGE1..AGE5 and SEX1..SEX5 columns by name. The regex \d must be
# written "\\d" inside an R string; a single backslash is a parse error.
# na_move() moves NAs to the right by default, first across the AGE columns
# and then across the SEX columns.
age_moved <- na_move(df, cols = grep("^AGE\\d$", names(df), value = TRUE))
sex_moved <- na_move(age_moved, cols = grep("^SEX\\d$", names(df), value = TRUE))
sex_moved
#> ref AGE1 AGE2 AGE3 AGE4 AGE5 SEX1 SEX2 SEX3 SEX4 SEX5
#> 1 1 45 NA NA NA NA M <NA> <NA> NA NA
#> 2 2 36 24 15 NA NA F M F NA NA
#> 3 3 26 35 15 NA NA M M F NA NA
#> 4 4 47 13 11 NA NA M F F NA NA
#> 5 5 24 57 NA NA NA M F <NA> NA NA
#> 6 6 28 26 NA NA NA F M <NA> NA NA