基于第 75 百分位数创建虚拟变量
Creating a dummy variable based on the 75th percentile
我想构建一个自然灾害虚拟变量:当 Damage-to-GDP 比率高于第 75 个百分位数时取值为 1;或者,当该数据不可用时,当 Affected-to-total-population 比率高于第 75 个百分位数时取值为 1。有人可以提示我如何在 R 中实现吗?
数据集示例:
Year
Country
Damage-to-GDP
Affected-people-to-total
1990
Tonga
0.0102282455
NA
1990
Samao
0.5102296844
NA
1990
Vanuatu
NA
1.364508e-05
> dput(data)
structure(list(Year = c(1990L, 1990L, 1990L), Country = c("Tonga",
"Samao", "Vanuatu"), Damage_to_GDP = c(0.0102282455, 0.5102296844,
NA), Affected_people_to_total = c(NA, NA, 1.364508e-05)), class = "data.frame", row.names = c(NA,
-3L))
也许这有点麻烦,但应该可行,并且是 base R 解决方案。
由于您没有提供可重现的示例,我模拟了一些数据:
# Base R solution on simulated data: build a 0/1 dummy that is 1 when gdp is
# above its 75th percentile or, where gdp is missing, when pop is above its
# 75th percentile. Rows where both gdp and pop are missing are dropped.
set.seed(85)
df <- data.frame(
  gdp = sample(c(NA, runif(10)), 100, replace = TRUE),
  pop = sample(c(NA, runif(10)), 100, replace = TRUE)
)
summary(df)

# How many rows have both gdp and pop missing
sum(is.na(df$gdp) & is.na(df$pop))

# 3rd quartile of gdp
quart3_gdp <- quantile(df$gdp, 0.75, na.rm = TRUE)
# 3rd quartile of pop
quart3_pop <- quantile(df$pop, 0.75, na.rm = TRUE)

# First condition: gdp above its 3rd quartile gives 1, otherwise 0.
# The comparison yields NA where gdp is missing, so dummy is NA there.
df$dummy <- ifelse(df$gdp > quart3_gdp, 1, 0)

# Second condition: only for rows still NA (gdp missing), fall back to pop.
gdp_missing <- is.na(df$dummy)
df$dummy[gdp_missing] <- ifelse(df$pop[gdp_missing] > quart3_pop, 1, 0)

# Remove rows where the dummy is still NA (both gdp and pop missing)
df <- df[!is.na(df$dummy), ]
数据集有点小,而且包含NA,所以很难测试,但可以试试这样:
# Build the natural-disaster dummy on the example data from the question.
# `disaster` is 1 when Damage_to_GDP is above its 75th percentile or, only
# when Damage_to_GDP is missing, when Affected_people_to_total is above its
# 75th percentile. It is 0 otherwise. `data` itself is left unmodified.
data <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

# Estimate the 75th percentiles, ignoring missing values
GDP75 <- quantile(data$Damage_to_GDP, 0.75, na.rm = TRUE)
Aff75 <- quantile(data$Affected_people_to_total, 0.75, na.rm = TRUE)

# Choose the indicator row by row instead of overwriting NAs with zero:
# the Affected ratio is consulted only where the Damage ratio is missing.
damage_known <- !is.na(data$Damage_to_GDP)
above_p75 <- ifelse(
  damage_known,
  data$Damage_to_GDP > GDP75,
  data$Affected_people_to_total > Aff75
)

# Rows with no usable indicator compare as NA; they are coded 0, which is
# what the zero-replacement approach produced for them.
disaster <- as.numeric(!is.na(above_p75) & above_p75)
PS:使用dplyr也是个好主意,但我们不知道这是否会打开另一个线程。
你的例子是稀疏的。因此,以下结果可能不包含您要查找的所有变体。
您所描述的是基于 2 个数据列的第 75 个百分位数的比较。如前所述,quantile() 函数允许您计算向量的第 x 个百分位数。
使用 {dplyr}(和 {tidyverse})包,您可以在 mutate() 调用中执行此操作。然后构造规则的条件并分配虚拟值 1。
为避免嵌套多个 ifelse() 调用,您可以使用 dplyr 的 case_when()。
祝你好运。
library(dplyr)

# Data frame of test data, copied from the question
df <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
df

# For demonstration purposes the 75th percentiles of Damage_to_GDP and
# Affected_people_to_total are kept as helper columns.
df <- df %>%
  # quantile() excludes NAs from the calculation when na.rm = TRUE;
  # probs sets the quantile level, so 0.75 is the 75th percentile.
  mutate(
    D_GDP_75 = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    A_PPL_75 = quantile(Affected_people_to_total, probs = 0.75, na.rm = TRUE)
  ) %>%
  # Natural disaster dummy takes the value 1
  #   when the Damage-to-GDP ratio is above its 75th percentile, or,
  #   when that ratio is not available, when the
  #   Affected-to-total-population ratio is above its 75th percentile.
  mutate(
    NAT_DIS_DUMMY = case_when(
      Damage_to_GDP > D_GDP_75 ~ 1,
      is.na(Damage_to_GDP) & Affected_people_to_total > A_PPL_75 ~ 1,
      # Default for rows matching neither rule is a missing double (not 0);
      # the type must match the other right-hand sides.
      TRUE ~ NA_real_
    )
  )
- 使用 mutate 计算第 75 个百分位数
- 使用 case_when 应用条件生成 newvar
- 删除辅助列
library(dplyr)
# Print small numbers in fixed notation instead of scientific notation
options(scipen = 999)

# Add `newvar`: 1 when Damage_to_GDP is above its 75th percentile, or when
# Damage_to_GDP is missing and Affected_people_to_total is above its 75th
# percentile. Rows matching neither condition get NA (case_when default).
df1 %>%
  mutate(
    qnt_75_Damage = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    qnt_75_Affected = quantile(
      Affected_people_to_total,
      probs = 0.75,
      na.rm = TRUE
    )
  ) %>%
  mutate(
    newvar = case_when(
      Damage_to_GDP > qnt_75_Damage ~ 1,
      # Missingness must be tested with is.na(); `x == NA` is always NA,
      # so a comparison against NA would never select this branch.
      is.na(Damage_to_GDP) & Affected_people_to_total > qnt_75_Affected ~ 1
    )
  ) %>%
  # Drop the helper percentile columns
  select(-qnt_75_Damage, -qnt_75_Affected)
输出:
Year Country Damage_to_GDP Affected_people_to_total newvar
1 1990 Tonga 0.01022825 NA NA
2 1990 Samao 0.51022968 NA 1
3 1990 Vanuatu NA 0.00001364508 NA
数据:
# Example data from the question: three country-year rows, each with only one
# of the two ratios available.
df1 <- data.frame(
  Year = c(1990L, 1990L, 1990L),
  Country = c("Tonga", "Samao", "Vanuatu"),
  Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
  Affected_people_to_total = c(NA, NA, 1.364508e-05),
  stringsAsFactors = FALSE
)
我想构建一个自然灾害虚拟变量:当 Damage-to-GDP 比率高于第 75 个百分位数时取值为 1;或者,当该数据不可用时,当 Affected-to-total-population 比率高于第 75 个百分位数时取值为 1。有人可以提示我如何在 R 中实现吗?
数据集示例:
Year | Country | Damage-to-GDP | Affected-people-to-total |
---|---|---|---|
1990 | Tonga | 0.0102282455 | NA |
1990 | Samao | 0.5102296844 | NA |
1990 | Vanuatu | NA | 1.364508e-05 |
> dput(data)
structure(list(Year = c(1990L, 1990L, 1990L), Country = c("Tonga",
"Samao", "Vanuatu"), Damage_to_GDP = c(0.0102282455, 0.5102296844,
NA), Affected_people_to_total = c(NA, NA, 1.364508e-05)), class = "data.frame", row.names = c(NA,
-3L))
也许这有点麻烦,但应该可行,并且是 base R 解决方案。
由于您没有提供可重现的示例,我模拟了一些数据:
# Base R solution on simulated data: build a 0/1 dummy that is 1 when gdp is
# above its 75th percentile or, where gdp is missing, when pop is above its
# 75th percentile. Rows where both gdp and pop are missing are dropped.
set.seed(85)
df <- data.frame(
  gdp = sample(c(NA, runif(10)), 100, replace = TRUE),
  pop = sample(c(NA, runif(10)), 100, replace = TRUE)
)
summary(df)

# How many rows have both gdp and pop missing
sum(is.na(df$gdp) & is.na(df$pop))

# 3rd quartile of gdp
quart3_gdp <- quantile(df$gdp, 0.75, na.rm = TRUE)
# 3rd quartile of pop
quart3_pop <- quantile(df$pop, 0.75, na.rm = TRUE)

# First condition: gdp above its 3rd quartile gives 1, otherwise 0.
# The comparison yields NA where gdp is missing, so dummy is NA there.
df$dummy <- ifelse(df$gdp > quart3_gdp, 1, 0)

# Second condition: only for rows still NA (gdp missing), fall back to pop.
gdp_missing <- is.na(df$dummy)
df$dummy[gdp_missing] <- ifelse(df$pop[gdp_missing] > quart3_pop, 1, 0)

# Remove rows where the dummy is still NA (both gdp and pop missing)
df <- df[!is.na(df$dummy), ]
数据集有点小,而且包含NA,所以很难测试,但可以试试这样:
# Build the natural-disaster dummy on the example data from the question.
# `disaster` is 1 when Damage_to_GDP is above its 75th percentile or, only
# when Damage_to_GDP is missing, when Affected_people_to_total is above its
# 75th percentile. It is 0 otherwise. `data` itself is left unmodified.
data <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

# Estimate the 75th percentiles, ignoring missing values
GDP75 <- quantile(data$Damage_to_GDP, 0.75, na.rm = TRUE)
Aff75 <- quantile(data$Affected_people_to_total, 0.75, na.rm = TRUE)

# Choose the indicator row by row instead of overwriting NAs with zero:
# the Affected ratio is consulted only where the Damage ratio is missing.
damage_known <- !is.na(data$Damage_to_GDP)
above_p75 <- ifelse(
  damage_known,
  data$Damage_to_GDP > GDP75,
  data$Affected_people_to_total > Aff75
)

# Rows with no usable indicator compare as NA; they are coded 0, which is
# what the zero-replacement approach produced for them.
disaster <- as.numeric(!is.na(above_p75) & above_p75)
PS:使用dplyr也是个好主意,但我们不知道这是否会打开另一个线程。
你的例子是稀疏的。因此,以下结果可能不包含您要查找的所有变体。
您所描述的是基于 2 个数据列的第 75 个百分位数的比较。如前所述,quantile() 函数允许您计算向量的第 x 个百分位数。
使用 {dplyr}(和 {tidyverse})包,您可以在 mutate() 调用中执行此操作。然后构造规则的条件并分配虚拟值 1。
为避免嵌套多个 ifelse() 调用,您可以使用 dplyr 的 case_when()。
祝你好运。
library(dplyr)

# Data frame of test data, copied from the question
df <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
df

# For demonstration purposes the 75th percentiles of Damage_to_GDP and
# Affected_people_to_total are kept as helper columns.
df <- df %>%
  # quantile() excludes NAs from the calculation when na.rm = TRUE;
  # probs sets the quantile level, so 0.75 is the 75th percentile.
  mutate(
    D_GDP_75 = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    A_PPL_75 = quantile(Affected_people_to_total, probs = 0.75, na.rm = TRUE)
  ) %>%
  # Natural disaster dummy takes the value 1
  #   when the Damage-to-GDP ratio is above its 75th percentile, or,
  #   when that ratio is not available, when the
  #   Affected-to-total-population ratio is above its 75th percentile.
  mutate(
    NAT_DIS_DUMMY = case_when(
      Damage_to_GDP > D_GDP_75 ~ 1,
      is.na(Damage_to_GDP) & Affected_people_to_total > A_PPL_75 ~ 1,
      # Default for rows matching neither rule is a missing double (not 0);
      # the type must match the other right-hand sides.
      TRUE ~ NA_real_
    )
  )
- 使用 mutate 计算第 75 个百分位数
- 使用 case_when 应用条件生成 newvar
- 删除辅助列
library(dplyr)
# Print small numbers in fixed notation instead of scientific notation
options(scipen = 999)

# Add `newvar`: 1 when Damage_to_GDP is above its 75th percentile, or when
# Damage_to_GDP is missing and Affected_people_to_total is above its 75th
# percentile. Rows matching neither condition get NA (case_when default).
df1 %>%
  mutate(
    qnt_75_Damage = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    qnt_75_Affected = quantile(
      Affected_people_to_total,
      probs = 0.75,
      na.rm = TRUE
    )
  ) %>%
  mutate(
    newvar = case_when(
      Damage_to_GDP > qnt_75_Damage ~ 1,
      # Missingness must be tested with is.na(); `x == NA` is always NA,
      # so a comparison against NA would never select this branch.
      is.na(Damage_to_GDP) & Affected_people_to_total > qnt_75_Affected ~ 1
    )
  ) %>%
  # Drop the helper percentile columns
  select(-qnt_75_Damage, -qnt_75_Affected)
输出:
Year Country Damage_to_GDP Affected_people_to_total newvar
1 1990 Tonga 0.01022825 NA NA
2 1990 Samao 0.51022968 NA 1
3 1990 Vanuatu NA 0.00001364508 NA
数据:
# Example data from the question: three country-year rows, each with only one
# of the two ratios available.
df1 <- data.frame(
  Year = c(1990L, 1990L, 1990L),
  Country = c("Tonga", "Samao", "Vanuatu"),
  Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
  Affected_people_to_total = c(NA, NA, 1.364508e-05),
  stringsAsFactors = FALSE
)