基于第 75 个百分位数创建虚拟变量

Creating a dummy variable based on the 75th percentile

我想构建一个自然灾害虚拟变量:当 Damage-to-GDP 比率高于第 75 个百分位数时取值为 1;如果该数据不可用,则当 Affected-to-total-population 比率高于第 75 个百分位数时取值为 1。有人可以提示我如何在 R 中构建它吗?

数据集示例:

Year Country Damage-to-GDP Affected-people-to-total
1990 Tonga 0.0102282455 NA
1990 Samao 0.5102296844 NA
1990 Vanuatu NA 1.364508e-05
> dput(data)

# dput() output: a 3-row data.frame with the columns Year, Country,
# Damage_to_GDP and Affected_people_to_total; some ratio values are NA.
structure(list(Year = c(1990L, 1990L, 1990L), Country = c("Tonga", 
"Samao", "Vanuatu"), Damage_to_GDP = c(0.0102282455, 0.5102296844, 
NA), Affected_people_to_total = c(NA, NA, 1.364508e-05)), class = "data.frame", row.names = c(NA, 
-3L))

也许这有点麻烦,但应该可行,并且是 base 解决方案。由于您没有提供可重现的示例,我模拟了一些数据:

# Simulate 100 rows for two ratio columns. Each column is drawn with
# replacement from ten uniform values plus NA, so missing values occur.
set.seed(85)
df <- data.frame(
  gdp = sample(c(NA, runif(10)), 100, replace = TRUE),
  pop = sample(c(NA, runif(10)), 100, replace = TRUE)
)
summary(df)

# Number of rows where both gdp and pop are missing.
sum(is.na(df$gdp) & is.na(df$pop))

# 75th percentile (3rd quartile) of each column, ignoring NAs.
quart3_gdp <- quantile(df$gdp, 0.75, na.rm = TRUE)
quart3_pop <- quantile(df$pop, 0.75, na.rm = TRUE)

# First condition: 1 if gdp is above its 3rd quartile, else 0.
# Rows with missing gdp stay NA at this point.
df$dummy <- ifelse(df$gdp > quart3_gdp, 1, 0)

# Second condition: only where the dummy is still NA (gdp missing), fall
# back to pop: 1 if pop is above its 3rd quartile, else 0. The mask is
# computed once so both sides of the assignment use the same rows.
gdp_missing <- is.na(df$dummy)
df$dummy[gdp_missing] <- ifelse(df$pop[gdp_missing] > quart3_pop, 1, 0)

# Drop rows where both gdp and pop are missing (dummy is still NA).
df <- df[!is.na(df$dummy), ]

数据集有点小,而且包含NA,所以很难测试,但可以试试这样:


# Sample data from the question; NA marks a ratio that is not available.
data <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame", row.names = c(NA, -3L)
)

# 75th percentiles, computed on the non-missing values only.
GDP75 <- quantile(data$Damage_to_GDP, 0.75, na.rm = TRUE)
Aff75 <- quantile(data$Affected_people_to_total, 0.75, na.rm = TRUE)

# Build the dummy without overwriting NAs in `data`: replacing them with 0
# would make "missing" indistinguishable from a true zero ratio.
# - Damage ratio available: 1 if it is above its 75th percentile, else 0.
# - Damage ratio missing:   fall back to the affected-population ratio.
# - Both ratios missing:    the dummy stays NA.
damage <- data$Damage_to_GDP
affected <- data$Affected_people_to_total
disaster <- ifelse(
  is.na(damage),
  as.numeric(affected > Aff75),
  as.numeric(damage > GDP75)
)

PS:使用dplyr也是个好主意,但我们不知道这是否会打开另一个线程。

你的例子是稀疏的。因此,以下结果可能不包含您要查找的所有变体。

您所描述的是基于两个数据列的第 75 个百分位数进行比较。如前所述,quantile() 函数可以计算向量的第 x 个百分位数。使用 {dplyr}(属于 {tidyverse})包,您可以在 mutate() 调用中完成这一步,然后构造规则的条件并赋予虚拟变量值 1。为了避免嵌套多个 ifelse() 调用,可以使用 dplyr 的 case_when()。

祝你好运。

library(dplyr)

# Test data from the question; NA marks a ratio that is not available.
df <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame", row.names = c(NA, -3L)
)

df

# For demonstration purposes the 75th percentiles of Damage_to_GDP and
# Affected_people_to_total are kept as helper columns.
df <- df %>%
  # quantile() excludes NAs from the calculation when na.rm = TRUE;
  # probs sets the percentile level. Both thresholds use 0.75.
  mutate(
    D_GDP_75 = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    A_PPL_75 = quantile(Affected_people_to_total, probs = 0.75, na.rm = TRUE)
  ) %>%
  # Natural disaster dummy takes the value 1
  #   when the Damage-to-GDP ratio is above its 75th percentile, or,
  #   when that ratio is not available and the Affected-to-total-population
  #   ratio is above its 75th percentile.
  mutate(
    NAT_DIS_DUMMY = case_when(
      Damage_to_GDP > D_GDP_75 ~ 1,
      is.na(Damage_to_GDP) & Affected_people_to_total > A_PPL_75 ~ 1,
      # Default for every other row; NA_real_ keeps the result type double.
      TRUE ~ NA_real_
    )
  )
  1. 用 mutate 计算第 75 个百分位数
  2. 用 case_when 应用条件生成 newvar
  3. 删除辅助列
library(dplyr)
# Print small numbers in fixed notation instead of scientific notation.
options(scipen = 999)

# df1 is the sample data frame given in the data section of this answer.
df1 %>%
  # Step 1: 75th percentile of each ratio, ignoring NAs (helper columns).
  mutate(
    qnt_75_Damage = quantile(Damage_to_GDP, probs = 0.75, na.rm = TRUE),
    qnt_75_Affected = quantile(
      Affected_people_to_total,
      probs = 0.75, na.rm = TRUE
    )
  ) %>%
  # Step 2: newvar is 1 when the damage ratio is above its threshold, or
  # when the damage ratio is missing and the affected ratio is above its
  # threshold. Missingness must be tested with is.na(): `x == NA` is always
  # NA, so that condition would never match. Unmatched rows get NA.
  mutate(
    newvar = case_when(
      Damage_to_GDP > qnt_75_Damage ~ 1,
      is.na(Damage_to_GDP) & Affected_people_to_total > qnt_75_Affected ~ 1
    )
  ) %>%
  # Step 3: drop the helper columns.
  select(-qnt_75_Damage, -qnt_75_Affected)

输出:

  Year Country Damage_to_GDP Affected_people_to_total newvar
1 1990   Tonga    0.01022825                       NA     NA
2 1990   Samao    0.51022968                       NA      1
3 1990 Vanuatu            NA            0.00001364508     NA

数据:

# Sample data: three country rows for 1990; NA marks a missing ratio.
df1 <- structure(
  list(
    Year = c(1990L, 1990L, 1990L),
    Country = c("Tonga", "Samao", "Vanuatu"),
    Damage_to_GDP = c(0.0102282455, 0.5102296844, NA),
    Affected_people_to_total = c(NA, NA, 1.364508e-05)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)