如何按条件将代码应用于数据框?

How to apply code to dataframe by condition?

我有以下数据框:

library(dplyr)
library(tidyverse)
library(concordance)
# Example data: one row per product code, tagged with the HS revision
# ("Pf") it is expressed in and the number of digits wanted ("Slen").
Year <- c(rep(2016, 2), 2017, 2019, rep(2020, 3), 2013, rep(2010, 2))
Pf <- c(rep("HS4", 3), rep("HS5", 4), "HS4", rep("HS3", 2))
Code <- c(
  "391890", "440929", "851660", "732399", "720839",
  "050510", "830241", "321590", "010210", "010210"
)
Slen <- rep("6", 10)
df <- data.frame(Year = Year, Pf = Pf, Code = Code, Slen = Slen)

'Pf' 列包含 3 种不同的取值:“HS3”、“HS4”和“HS5”。我想执行矢量化操作,将 concord() 函数应用于 'Code' 列,但要做到这一点,每次调用时 'Pf' 的取值必须唯一,这就是为什么我事先对数据框取子集,使每个子集的 'Pf' 列只含一个值

# Split the data into one data frame per Pf value, so that Pf is
# constant within each piece. `%in%` never yields NA, matching how
# subset() drops rows whose condition is NA.
df.H5 <- df[df$Pf %in% "HS5", , drop = FALSE]
df.H4 <- df[df$Pf %in% "HS4", , drop = FALSE]
df.H3 <- df[df$Pf %in% "HS3", , drop = FALSE]

现在我对每个数据框应用一个函数。这里 concord() 函数应用于 'Code' 列,并将这些代码转换为另一种分类下的代码。但是,如果 destination 参数与 'Pf' 列中的值相同,它就不起作用,例如当 Pf = "HS3"(在 df 中)且 destination = "HS3" 时,代码无法运行,这就是为什么我不将代码应用于 df.H3

# Concord the HS5 codes to HS3. Grouping by Pf and Slen makes
# unique(Pf) / unique(Slen) evaluate to a single value per group.
df.H5 <- df.H5 |>
  group_by(Pf, Slen) |>
  mutate(
    Code2 = concord(
      Code,
      origin = unique(Pf),
      destination = "HS3",
      dest.digit = unique(Slen),
      all = FALSE
    )
  ) |>
  ungroup()

# Concord the HS4 codes to HS3. Grouping by Pf and Slen makes
# unique(Pf) / unique(Slen) evaluate to a single value per group.
df.H4 <- df.H4 |>
  group_by(Pf, Slen) |>
  mutate(
    Code2 = concord(
      Code,
      origin = unique(Pf),
      destination = "HS3",
      dest.digit = unique(Slen),
      all = FALSE
    )
  ) |>
  ungroup()

# Add a Code2 column to df.H3 (a plain copy of Code, since these rows
# are not passed through concord()) so the three data frames can be
# merged.
df.H3[["Code2"]] <- df.H3[["Code"]]

# Stack the three per-classification pieces back into one data frame.
df2 <- do.call(rbind, list(df.H4, df.H5, df.H3))

我的目标是以某种方式使流程自动化。例如,如果 destination = "HS3",代码应直接应用于整个数据而无需预先取子集;如果 destination 参数与 'Pf' 中某些行的值相同,则不对这些行进行转换,而是直接复制 'Code' 列的值来生成 'Code2' 列

您可以将逻辑放在一个函数中,然后用 by() 拆分数据并对每个子集应用该函数。在该函数中可以做分支处理:Pf == 'HS3' 的子集不做转换。最后用 unsplit() 合并。

#' Add a `Code2` column holding `Code` concorded to `destination`.
#'
#' Intended to be called once per classification, e.g.
#' `by(df, df$Pf, cf)`, so that every row of `x` shares one `Pf` value.
#'
#' @param x A data frame with columns `Pf` (classification the codes are
#'   expressed in), `Code` (the codes to convert) and `Slen` (number of
#'   digits requested for the output codes).
#' @param destination Classification to convert to; forwarded to
#'   `concordance::concord()`. Defaults to `"HS3"`, the value that was
#'   previously hard-coded, so existing calls behave as before.
#' @return `x` with an additional `Code2` column.
cf <- function(x, destination = "HS3") {
  # Copy the codes through unchanged when there is nothing to convert:
  # either no rows at all, or the rows are already in the destination
  # classification (concord() is not called with origin == destination).
  Code2 <- if (nrow(x) == 0L || any(x$Pf == destination)) {
    x$Code
  } else {
    concordance::concord(
      x$Code,
      origin = x$Pf[1],
      destination = destination,
      dest.digit = x$Slen[1],
      all = FALSE
    )
  }
  cbind(x, Code2)
}

# Apply cf() to each Pf group, then reassemble the rows in their
# original order.
unsplit(by(data = df, INDICES = df$Pf, FUN = cf), f = df$Pf)
#    Year  Pf   Code Slen  Code2
# 1  2016 HS4 391890    6 391890
# 2  2016 HS4 440929    6 440929
# 3  2017 HS4 851660    6 851660
# 4  2019 HS5 732399    6 732399
# 5  2020 HS5 720839    6 720839
# 6  2020 HS5 050510    6 050510
# 7  2020 HS5 830241    6 830241
# 8  2013 HS4 321590    6 321590
# 9  2010 HS3 010210    6 010210
# 10 2010 HS3 010210    6 010210

数据:

# Reproducible copy of the example data used above.
df <- data.frame(
  Year = c(2016, 2016, 2017, 2019, 2020, 2020, 2020, 2013, 2010, 2010),
  Pf = c(
    "HS4", "HS4", "HS4", "HS5", "HS5",
    "HS5", "HS5", "HS4", "HS3", "HS3"
  ),
  Code = c(
    "391890", "440929", "851660", "732399", "720839",
    "050510", "830241", "321590", "010210", "010210"
  ),
  Slen = rep("6", 10L),
  stringsAsFactors = FALSE
)