R:使用 gather 清理数据集
R: using gather to clean up a dataset
我有一个来自美国农业部的 csv 数据集,其中包含 1970 年、1980 年、1990 年和 2000 年美国各个县的成年人获得的教育水平。
我已经使用 read_csv 函数导入了这个 csv,然后我像这样清理数据集:
# Shorten the USDA education column headers to "<measure>, <year>" labels.
#
# The same measure gets exactly the same short label in every year (the "% "
# prefix always carries its trailing space), so the measure part of the names
# lines up across 1970-2000 when the table is reshaped later.
# Columns whose header is not listed below are left untouched.

# Build "<measure>, <year>" labels: every measure for the first year, then
# every measure for the next year, and so on.
label_columns <- function(measures, years) {
  paste0(
    rep(measures, times = length(years)),
    ", ",
    rep(years, each = length(measures))
  )
}

# Short labels, identical for every year.
short_measures <- c(
  "Less Than Diploma",
  "Diploma",
  "AA or more",
  "BA or more",
  "% Less Than Diploma",
  "% Diploma",
  "% AA or more",
  "% BA or more"
)

# Original header wording used for the 1970 and 1980 columns.
early_measures <- c(
  "Less than a high school diploma",
  "High school diploma only",
  "Some college (1-3 years)",
  "Four years of college or higher",
  "Percent of adults with less than a high school diploma",
  "Percent of adults with a high school diploma only",
  "Percent of adults completing some college (1-3 years)",
  "Percent of adults completing four years of college or higher"
)

# Original header wording used for the 1990 and 2000 columns.
late_measures <- c(
  "Less than a high school diploma",
  "High school diploma only",
  "Some college or associate's degree",
  "Bachelor's degree or higher",
  "Percent of adults with less than a high school diploma",
  "Percent of adults with a high school diploma only",
  "Percent of adults completing some college or associate's degree",
  "Percent of adults with a bachelor's degree or higher"
)

# old_names[i] is renamed to new_names[i]; both vectors are ordered by year
# (1970, 1980, 1990, 2000) and, within a year, by measure.
old_names <- c(
  "State",
  "Area name",
  label_columns(early_measures, c("1970", "1980")),
  label_columns(late_measures, c("1990", "2000"))
)
new_names <- c(
  "state",
  "area_name",
  label_columns(short_measures, c("1970", "1980", "1990", "2000"))
)

# Look up every current column name once; unmatched columns give NA and are
# skipped so that their names stay as they are.
renamed <- match(colnames(eduLevelsbyCounty), old_names)
found <- !is.na(renamed)
colnames(eduLevelsbyCounty)[found] <- new_names[renamed[found]]
所以现在我得到了一个列数非常多的表头,但问题是我还想进一步整理它:把年份单独拆分到一列中,并让各个教育水平的名称分别作为相应的列。我知道 gather() 可以完成我想做的事情,但问题是我的数据集包含多个年份:1970、1980、1990 和 2000。
我希望我已经说清楚了,如果没有,我可以根据需要添加信息。任何帮助将不胜感激。
我觉得你命名变量的方式让它变得不必要的复杂。除此之外,pivot_longer(替代 gather 的更新函数)可以解决此问题。我稍微改了你原来的名字:
使用 pivot_longer 将数据从宽格式转为长格式:
library(tidyverse)
# Reshape `df` from wide to long: one row per state / area / year.
# Every column except the two identifier columns is named
# "<measure>_<year>"; splitting on "_" keeps the measure as an output column
# (".value") and moves the year into a new `year` column.
long <- df %>%
  pivot_longer(
    cols = -c("state", "area_name"),
    names_to = c(".value", "year"),
    names_sep = "_",
    values_drop_na = TRUE
  )
> long
# A tibble: 4 x 11
state area_name year Less.Than.Diploma Diploma AA.or.more BA.or.more percent.Less.Than.D~ percent.Diploma percent.AA.or.m~ percent.BA.or.m~
<dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 1970 71 72 73 74 75 76 77 78
2 1 2 1980 81 82 83 84 85 86 87 88
3 1 2 1990 91 92 93 94 95 96 97 98
4 1 2 2000 21 22 23 24 25 26 27 28
>
数据
# One-row example table: two identifier columns followed by eight education
# measures for each of the four census years, named "<measure>_<year>".
# data.frame() is called with its default check.names = TRUE, so the spaces in
# the measure names end up as dots (e.g. "Less.Than.Diploma_1970").
df <- local({
  measures <- c(
    "Less Than Diploma",
    "Diploma",
    "AA or more",
    "BA or more",
    "percent Less Than Diploma",
    "percent Diploma",
    "percent AA or more",
    "percent BA or more"
  )
  years <- c("1970", "1980", "1990", "2000")
  # Dummy cell values: 71-78 for 1970, 81-88 for 1980, 91-98 for 1990 and
  # 21-28 for 2000, in the same order as `measures`.
  year_base <- c(70, 80, 90, 20)

  column_labels <- c(
    "state",
    "area_name",
    paste0(
      rep(measures, times = length(years)),
      "_",
      rep(years, each = length(measures))
    )
  )
  cell_values <- c(
    1,
    2,
    rep(year_base, each = length(measures)) +
      rep(seq_along(measures), times = length(years))
  )

  do.call(data.frame, as.list(stats::setNames(cell_values, column_labels)))
})
我有一个来自美国农业部的 csv 数据集,其中包含 1970 年、1980 年、1990 年和 2000 年美国各个县的成年人获得的教育水平。 我已经使用 read_csv 函数导入了这个 csv,然后我像这样清理数据集:
# Shorten the USDA education column headers to "<measure>, <year>" labels.
#
# The same measure gets exactly the same short label in every year (the "% "
# prefix always carries its trailing space), so the measure part of the names
# lines up across 1970-2000 when the table is reshaped later.
# Columns whose header is not listed below are left untouched.

# Build "<measure>, <year>" labels: every measure for the first year, then
# every measure for the next year, and so on.
label_columns <- function(measures, years) {
  paste0(
    rep(measures, times = length(years)),
    ", ",
    rep(years, each = length(measures))
  )
}

# Short labels, identical for every year.
short_measures <- c(
  "Less Than Diploma",
  "Diploma",
  "AA or more",
  "BA or more",
  "% Less Than Diploma",
  "% Diploma",
  "% AA or more",
  "% BA or more"
)

# Original header wording used for the 1970 and 1980 columns.
early_measures <- c(
  "Less than a high school diploma",
  "High school diploma only",
  "Some college (1-3 years)",
  "Four years of college or higher",
  "Percent of adults with less than a high school diploma",
  "Percent of adults with a high school diploma only",
  "Percent of adults completing some college (1-3 years)",
  "Percent of adults completing four years of college or higher"
)

# Original header wording used for the 1990 and 2000 columns.
late_measures <- c(
  "Less than a high school diploma",
  "High school diploma only",
  "Some college or associate's degree",
  "Bachelor's degree or higher",
  "Percent of adults with less than a high school diploma",
  "Percent of adults with a high school diploma only",
  "Percent of adults completing some college or associate's degree",
  "Percent of adults with a bachelor's degree or higher"
)

# old_names[i] is renamed to new_names[i]; both vectors are ordered by year
# (1970, 1980, 1990, 2000) and, within a year, by measure.
old_names <- c(
  "State",
  "Area name",
  label_columns(early_measures, c("1970", "1980")),
  label_columns(late_measures, c("1990", "2000"))
)
new_names <- c(
  "state",
  "area_name",
  label_columns(short_measures, c("1970", "1980", "1990", "2000"))
)

# Look up every current column name once; unmatched columns give NA and are
# skipped so that their names stay as they are.
renamed <- match(colnames(eduLevelsbyCounty), old_names)
found <- !is.na(renamed)
colnames(eduLevelsbyCounty)[found] <- new_names[renamed[found]]
所以现在我得到了一个列数非常多的表头,但问题是我还想进一步整理它:把年份单独拆分到一列中,并让各个教育水平的名称分别作为相应的列。我知道 gather() 可以完成我想做的事情,但问题是我的数据集包含多个年份:1970、1980、1990 和 2000。
我希望我已经说清楚了,如果没有,我可以根据需要添加信息。任何帮助将不胜感激。
我觉得你命名变量的方式让它变得不必要的复杂。除此之外,pivot_longer(替代 gather 的更新函数)可以解决此问题。我稍微改了你原来的名字:
使用 pivot_longer 将数据从宽格式转为长格式:
library(tidyverse)
# Reshape `df` from wide to long: one row per state / area / year.
# Every column except the two identifier columns is named
# "<measure>_<year>"; splitting on "_" keeps the measure as an output column
# (".value") and moves the year into a new `year` column.
long <- df %>%
  pivot_longer(
    cols = -c("state", "area_name"),
    names_to = c(".value", "year"),
    names_sep = "_",
    values_drop_na = TRUE
  )
> long
# A tibble: 4 x 11
state area_name year Less.Than.Diploma Diploma AA.or.more BA.or.more percent.Less.Than.D~ percent.Diploma percent.AA.or.m~ percent.BA.or.m~
<dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2 1970 71 72 73 74 75 76 77 78
2 1 2 1980 81 82 83 84 85 86 87 88
3 1 2 1990 91 92 93 94 95 96 97 98
4 1 2 2000 21 22 23 24 25 26 27 28
>
数据
# One-row example table: two identifier columns followed by eight education
# measures for each of the four census years, named "<measure>_<year>".
# data.frame() is called with its default check.names = TRUE, so the spaces in
# the measure names end up as dots (e.g. "Less.Than.Diploma_1970").
df <- local({
  measures <- c(
    "Less Than Diploma",
    "Diploma",
    "AA or more",
    "BA or more",
    "percent Less Than Diploma",
    "percent Diploma",
    "percent AA or more",
    "percent BA or more"
  )
  years <- c("1970", "1980", "1990", "2000")
  # Dummy cell values: 71-78 for 1970, 81-88 for 1980, 91-98 for 1990 and
  # 21-28 for 2000, in the same order as `measures`.
  year_base <- c(70, 80, 90, 20)

  column_labels <- c(
    "state",
    "area_name",
    paste0(
      rep(measures, times = length(years)),
      "_",
      rep(years, each = length(measures))
    )
  )
  cell_values <- c(
    1,
    2,
    rep(year_base, each = length(measures)) +
      rep(seq_along(measures), times = length(years))
  )

  do.call(data.frame, as.list(stats::setNames(cell_values, column_labels)))
})