如何在 R 中创建计算列
How to create a calculated column in R
下面是示例数据集和所需的操作。到目前为止,一切正常。正在尝试创建新的计算列。在某些情况下,smb 代表小企业。 1、2、3、4 代表被认为是小的不同阈值。例如,对于给定区域,所需的列是 smb = 1 占总就业的百分比。例如,对于区域 001,这将是 46/1927。我可以弄清楚如何让它出现一次而不是作为一个完整的专栏。我该怎么做呢?想要的结果在底部。
library(readxl)
library(dplyr)
library(data.table)
library(DBI)
library(stringr)
library(tidyverse)
library(gt)
employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
smbtest <- data.frame(employment,small,area,year,qtr)
smbtest$smb <-0
smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))
smbsummary2<-smbtest %>%
mutate(period = paste0(year,"q",qtr)) %>%
group_by(area,period,smb) %>%
summarise(employment = sum(employment), worksites = n(),
.groups = 'drop_last') %>%
mutate(employment = cumsum(employment),
worksites = cumsum(worksites))
smbsummary2<- smbsummary2%>%
group_by(area,smb)%>%
mutate(empprevyear=lag(employment),
empprevyearpp=employment-empprevyear,
empprevyearpct=((employment/empprevyear)-1),
empprevyearpct=scales::percent(empprevyearpct,accuracy = 0.01)
)
smblonger2<-smbsummary2 %>%
dplyr::select(area,period,employment,worksites,smb) %>%
ungroup() %>%
pivot_longer(cols = employment:worksites, names_to = "measure", values_to = "value") %>%
group_by(area,measure) %>%
pivot_wider(names_from = period, values_from = value)%>%filter(smb %in%
c("1","2","3","4"))%>%gt()%>%cols_label(
smb = md("**Category**"))
smblonger2
area period smb employment worksites pcttotal
1 2020q1 1 46 2 46/1927 (total employment)
2 2020q2 2 301 4 301/1927
3 2020q3 3 466 5 466/1927
4 2020q4 4 726 6 726/1927
schema
smb employment range
1 0 to 100
2 0 to 150
3 0 to 250
4 0 to 500
好的,这就是我的解决方案(现在有人会提供 1 行函数!)
library(dplyr)
library(tidyr)
employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
smbtest <- data.frame(employment,small,area,year,qtr)
smbtest$smb <-0 # I think this line is redundent
smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))
smbsummary2<-smbtest %>%
mutate(period = paste0(year,"q",qtr)) %>%
group_by(area,period,smb) %>%
summarise(employment = sum(employment), worksites = n(),
.groups = 'drop_last') %>%
mutate(employment = cumsum(employment),
worksites = cumsum(worksites))
smbsummary2 %>%
# Make the data wider (a column for each smb)
pivot_wider(
id_cols=c("area", "period"),
names_from = "smb",
values_from = c("employment", "worksites"),
names_prefix = "SMB"
) %>%
# calculate the %
mutate(across(starts_with("employment_SMB"),
~(100*(.x/employment_SMBNA)),
.names = "pcttotal_{.col}")) %>%
# Now make the data longer
pivot_longer(
cols = contains("SMB")
) %>%
# rework the data names so the smb is a value
separate(name, into=c("name", "smb"), sep="_SMB") %>%
# Make the date wider again to match the shape requested
pivot_wider(
id_cols=c("area", "period", "smb"),
names_from = "name",
values_from = "value"
) -> smbsummary3
下面是示例数据集和所需的操作。到目前为止,一切正常。正在尝试创建新的计算列。在某些情况下,smb 代表小企业。 1、2、3、4 代表被认为是小的不同阈值。例如,对于给定区域,所需的列是 smb = 1 占总就业的百分比。例如,对于区域 001,这将是 46/1927。我可以弄清楚如何让它出现一次而不是作为一个完整的专栏。我该怎么做呢?想要的结果在底部。
library(readxl)
library(dplyr)
library(data.table)
library(DBI)
library(stringr)
library(tidyverse)
library(gt)
employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
smbtest <- data.frame(employment,small,area,year,qtr)
smbtest$smb <-0
smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))
smbsummary2<-smbtest %>%
mutate(period = paste0(year,"q",qtr)) %>%
group_by(area,period,smb) %>%
summarise(employment = sum(employment), worksites = n(),
.groups = 'drop_last') %>%
mutate(employment = cumsum(employment),
worksites = cumsum(worksites))
smbsummary2<- smbsummary2%>%
group_by(area,smb)%>%
mutate(empprevyear=lag(employment),
empprevyearpp=employment-empprevyear,
empprevyearpct=((employment/empprevyear)-1),
empprevyearpct=scales::percent(empprevyearpct,accuracy = 0.01)
)
smblonger2<-smbsummary2 %>%
dplyr::select(area,period,employment,worksites,smb) %>%
ungroup() %>%
pivot_longer(cols = employment:worksites, names_to = "measure", values_to = "value") %>%
group_by(area,measure) %>%
pivot_wider(names_from = period, values_from = value)%>%filter(smb %in%
c("1","2","3","4"))%>%gt()%>%cols_label(
smb = md("**Category**"))
smblonger2
area period smb employment worksites pcttotal
1 2020q1 1 46 2 46/1927 (total employment)
2 2020q2 2 301 4 301/1927
3 2020q3 3 466 5 466/1927
4 2020q4 4 726 6 726/1927
schema
smb employment range
1 0 to 100
2 0 to 150
3 0 to 250
4 0 to 500
好的,这就是我的解决方案(现在有人会提供 1 行函数!)
library(dplyr)
library(tidyr)
employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)
smbtest <- data.frame(employment,small,area,year,qtr)
smbtest$smb <-0 # I think this line is redundent
smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))
smbsummary2<-smbtest %>%
mutate(period = paste0(year,"q",qtr)) %>%
group_by(area,period,smb) %>%
summarise(employment = sum(employment), worksites = n(),
.groups = 'drop_last') %>%
mutate(employment = cumsum(employment),
worksites = cumsum(worksites))
smbsummary2 %>%
# Make the data wider (a column for each smb)
pivot_wider(
id_cols=c("area", "period"),
names_from = "smb",
values_from = c("employment", "worksites"),
names_prefix = "SMB"
) %>%
# calculate the %
mutate(across(starts_with("employment_SMB"),
~(100*(.x/employment_SMBNA)),
.names = "pcttotal_{.col}")) %>%
# Now make the data longer
pivot_longer(
cols = contains("SMB")
) %>%
# rework the data names so the smb is a value
separate(name, into=c("name", "smb"), sep="_SMB") %>%
# Make the date wider again to match the shape requested
pivot_wider(
id_cols=c("area", "period", "smb"),
names_from = "name",
values_from = "value"
) -> smbsummary3