创建新变量的字符位置标识
Character-location identity to create a new variable
让我们先取一些随机数据
A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)
df1 <- data.frame(A, score_one, score_two, score_three, score_four)
library(dplyr)
library(tidyr)
df2 <- df1 %>%
group_by(A) %>%
mutate_each(funs(substr(.,1,1))) %>%
ungroup %>%
gather(variable, type, -c(A)) %>%
select(-variable) %>%
mutate(type = paste0("type_",type),
value = 1) %>%
group_by(A,type) %>%
summarise(value = sum(value)) %>%
ungroup %>%
spread(type, value, fill=0) %>%
inner_join(df1, by=c("A")) %>%
select(A, starts_with("score_"), starts_with("type_"))
这为每个 score_
引入了一个摘要变量
并计算每个唯一 首位数字
的频率
因此我们在第一行看到,type_1 == 2。因为在相应的 score_
列中,我们有 2 次出现,其中数字 1 是第一个数字
问题陈述
现在我们要引入一个调用 type_n
列的变量。
- 它检查值是否 > 0。
- 遇到这种情况,我们要检查相应的
score_
column/s
- 这里分析小数点后位位是否>=比值2
- 现在如果一个或所有相应行的小数位后的值 >= 2,我们要分配一个值 1
- 如果小数点后所有相应行的值都小于 2,我们要分配一个值 0
- 因此,如果
type_n == 0
,我们想分配一个 0
- 假设我们将此变量命名为
$type_n_G2
这样所需的输出应该看起来像1
以type_1_G2
为例
- 我们有
type_1 == 2
- 我们在
score_one
和score_four
有相应的身份
- 小数点后的两个值都>=2,所以我们赋值
type_1_G2==1
这是一种尝试,将您的数据转换为 long
格式,以便为每个值保留 type
变量。这样就可以更容易地计算第二步中有多少小数 >=2
。
library(tidyr)
#transform df1 to the long format
df <- df1 %>% gather(key, value, -A)
#calculate the type for each line
#this can be done by extracting the first digit and pasting
# "_type" in front of it
df$type <- as.factor(paste("type",sapply(strsplit(as.character(df$value),""),function(x) x[[1]]),sep="_"))
#expand the levels to add missing types
levels(df$type) <- c(levels(df$type),setdiff(paste("type",1:9,sep="_"),levels(df$type)))
#create a new column that holds the first decimal
#I assumed there was only one decimal for each number
#but you can adapt this
df$first_decimal <- as.numeric(sapply(strsplit(as.character(df$value),"[.]"),function(x) x[[2]]))
#group by A and type, if any first_decimal is bigger than 2
#G2 will be set to one for that group
df <- df %>% group_by(A,type) %>% mutate(G2=any(first_decimal>=2)*1)
#create a type_G2 column to hold the final column labels
df$type_G2 <- paste0(df$type,"_G2")
#this cbind creates the final result
cbind(df1,as.data.frame.matrix(table(df[,c("A","type")])),spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1])
最后一个 cbind 的分解:
df1
是原始数据框
as.data.frame.matrix(table(df[,c("A","type")]))
是一个数据框,其中包含每个 type
的数量
spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1]
保存 type_G2
信息。我将子集 df 唯一化,因为有一些冗余信息(例如 type_1_G2
对于第一行的值 123.5 和 123.1 是相同的)。
这是一个矢量化尝试,首先 melt
然后 dcast
使用 data.table
包的数据。它需要一些润色,但我现在没有时间
library(data.table) # v >= 1.9.6
# melt and order by "A"
temp <- setorder(melt(df2, id = 1:5), A)
# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")
# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]
# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\1", as.matrix(indx2))
# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)
# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)
# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
# A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1 1 0 0 0 0 0 0
# 2 2 0 1 0 1 1 0 0
# 3 3 1 1 0 1 0 0 0
# 4 4 0 0 1 0 0 0 0
# 5 5 0 1 1 0 0 0 1
现在您可以 cbind
结果为 df2
(这与您的结果不完全匹配,因为您提供的数据也不匹配)
df2
的复杂构造在我看来是没有必要的。将 df1
重塑为长格式是一个更好的起点,可以用更少的步骤获得所需的最终结果。
使用 data.table
包的方法:
library(data.table)
# melting the original dataframe 'df1' to a long format datatable
dt <- melt(setDT(df1), "A")
# creating two type variables & a logical vector indicating whether
# the decimal for a specific type is equal or above .2
dt[, `:=` (type1=paste0("type_",substr(value,1,1)),
type2=paste0("type_",substr(value,1,1),"_g2"))
][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)]
# creating separate wide datatables for the variable & two type columns
dt1 <- dcast(dt, A ~ variable)
dt2 <- dcast(dt, A ~ type1)
dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A]
# two options for merging the wide datatables together into one
dtres <- dt1[dt2[dt3, on = "A"], on = "A"]
dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3))
# or in one go without creating intermediate datatables
dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"]
这导致:
> dtres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2: 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3: 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4: 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5: 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
这种方法可以转化为 dplyr
/tidyr
实现,如下所示:
library(dplyr)
library(tidyr)
df <- df1 %>% gather(variable, value,-A) %>%
mutate(type1 = paste0("type_",substr(value,1,1)),
type2 = paste0("type_",substr(value,1,1),"_g2")) %>%
group_by(A,type1) %>%
mutate(g2 = +(+(value - floor(value) >= 0.2)==1),
type1n = n()) %>%
ungroup()
d1 <- df %>% select(1:3) %>% spread(variable, value)
d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0)
d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0)
dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A")
结果相同:
> dfres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
免责声明:再次阅读问题后,我的答案是错误的(至少结果过于复杂),以防您希望将十进制值与每个第一个出现的次数进行比较数字.
如果您愿意将分数小数与此行中的每个 type_N 值进行比较,这里有一种方法,希望这里有聪明的人能够改进它:
decimalscores <- (df2[grepl("score_*",colnames(df2))] - floor(df2[grepl("score_*",colnames(df2))]))*10 # Get the decimal, as per the sample only one digit
typesindex <- as.numeric(sub("type_","",colnames(df2[grepl("type_*",colnames(df2))]))) # get the type_"n" columns names to reuse later
res <- t(sapply(1:nrow(df2),function(x) { # loop over the dataframe rows
sapply(typesindex,function(y) { # For each type index
colname <- paste0("type_",y)
cmptype <- unlist(unname(df2[x,colname]))
# create the result if type_n is above 0
ifelse(cmptype > 0,
any(unlist(unname(decimalscores[x,])) >= cmptype)+0L, # If one score is above the value return 1
0) # Else return 0
})
}))
colnames(res) <- paste0("type_",typesindex,"_G2") # Name the resulting columns by adding _G2 to ouptut
res <- as.data.frame(res) # turn matrix into dataframe
df3 <- cbind(df2,res) # bind them to get expected output
我希望评论已经足够解释了,如果有什么不清楚的地方,请告诉我。
让我们先取一些随机数据
A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)
df1 <- data.frame(A, score_one, score_two, score_three, score_four)
library(dplyr)
library(tidyr)
df2 <- df1 %>%
group_by(A) %>%
mutate_each(funs(substr(.,1,1))) %>%
ungroup %>%
gather(variable, type, -c(A)) %>%
select(-variable) %>%
mutate(type = paste0("type_",type),
value = 1) %>%
group_by(A,type) %>%
summarise(value = sum(value)) %>%
ungroup %>%
spread(type, value, fill=0) %>%
inner_join(df1, by=c("A")) %>%
select(A, starts_with("score_"), starts_with("type_"))
这为每个 score_
引入了一个摘要变量
并计算每个唯一 首位数字
因此我们在第一行看到,type_1 == 2。因为在相应的 score_
列中,我们有 2 次出现,其中数字 1 是第一个数字
问题陈述
现在我们要引入一个调用 type_n
列的变量。
- 它检查值是否 > 0。
- 遇到这种情况,我们要检查相应的
score_
column/s - 这里分析小数点后位位是否>=比值2
- 现在如果一个或所有相应行的小数位后的值 >= 2,我们要分配一个值 1
- 如果小数点后所有相应行的值都小于 2,我们要分配一个值 0
- 因此,如果
type_n == 0
,我们想分配一个 0 - 假设我们将此变量命名为
$type_n_G2
这样所需的输出应该看起来像1
以type_1_G2
- 我们有
type_1 == 2
- 我们在
score_one
和score_four
有相应的身份
- 小数点后的两个值都>=2,所以我们赋值
type_1_G2==1
这是一种尝试,将您的数据转换为 long
格式,以便为每个值保留 type
变量。这样就可以更容易地计算第二步中有多少小数 >=2
。
library(tidyr)
#transform df1 to the long format
df <- df1 %>% gather(key, value, -A)
#calculate the type for each line
#this can be done by extracting the first digit and pasting
# "_type" in front of it
df$type <- as.factor(paste("type",sapply(strsplit(as.character(df$value),""),function(x) x[[1]]),sep="_"))
#expand the levels to add missing types
levels(df$type) <- c(levels(df$type),setdiff(paste("type",1:9,sep="_"),levels(df$type)))
#create a new column that holds the first decimal
#I assumed there was only one decimal for each number
#but you can adapt this
df$first_decimal <- as.numeric(sapply(strsplit(as.character(df$value),"[.]"),function(x) x[[2]]))
#group by A and type, if any first_decimal is bigger than 2
#G2 will be set to one for that group
df <- df %>% group_by(A,type) %>% mutate(G2=any(first_decimal>=2)*1)
#create a type_G2 column to hold the final column labels
df$type_G2 <- paste0(df$type,"_G2")
#this cbind creates the final result
cbind(df1,as.data.frame.matrix(table(df[,c("A","type")])),spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1])
最后一个 cbind 的分解:
df1
是原始数据框
as.data.frame.matrix(table(df[,c("A","type")]))
是一个数据框,其中包含每个 type
spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1]
保存 type_G2
信息。我将子集 df 唯一化,因为有一些冗余信息(例如 type_1_G2
对于第一行的值 123.5 和 123.1 是相同的)。
这是一个矢量化尝试,首先 melt
然后 dcast
使用 data.table
包的数据。它需要一些润色,但我现在没有时间
library(data.table) # v >= 1.9.6
# melt and order by "A"
temp <- setorder(melt(df2, id = 1:5), A)
# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")
# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]
# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\1", as.matrix(indx2))
# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)
# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)
# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
# A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1 1 0 0 0 0 0 0
# 2 2 0 1 0 1 1 0 0
# 3 3 1 1 0 1 0 0 0
# 4 4 0 0 1 0 0 0 0
# 5 5 0 1 1 0 0 0 1
现在您可以 cbind
结果为 df2
(这与您的结果不完全匹配,因为您提供的数据也不匹配)
df2
的复杂构造在我看来是没有必要的。将 df1
重塑为长格式是一个更好的起点,可以用更少的步骤获得所需的最终结果。
使用 data.table
包的方法:
library(data.table)
# melting the original dataframe 'df1' to a long format datatable
dt <- melt(setDT(df1), "A")
# creating two type variables & a logical vector indicating whether
# the decimal for a specific type is equal or above .2
dt[, `:=` (type1=paste0("type_",substr(value,1,1)),
type2=paste0("type_",substr(value,1,1),"_g2"))
][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)]
# creating separate wide datatables for the variable & two type columns
dt1 <- dcast(dt, A ~ variable)
dt2 <- dcast(dt, A ~ type1)
dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A]
# two options for merging the wide datatables together into one
dtres <- dt1[dt2[dt3, on = "A"], on = "A"]
dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3))
# or in one go without creating intermediate datatables
dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"]
这导致:
> dtres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2: 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3: 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4: 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5: 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
这种方法可以转化为 dplyr
/tidyr
实现,如下所示:
library(dplyr)
library(tidyr)
df <- df1 %>% gather(variable, value,-A) %>%
mutate(type1 = paste0("type_",substr(value,1,1)),
type2 = paste0("type_",substr(value,1,1),"_g2")) %>%
group_by(A,type1) %>%
mutate(g2 = +(+(value - floor(value) >= 0.2)==1),
type1n = n()) %>%
ungroup()
d1 <- df %>% select(1:3) %>% spread(variable, value)
d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0)
d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0)
dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A")
结果相同:
> dfres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
免责声明:再次阅读问题后,我的答案是错误的(至少结果过于复杂),以防您希望将十进制值与每个第一个出现的次数进行比较数字.
如果您愿意将分数小数与此行中的每个 type_N 值进行比较,这里有一种方法,希望这里有聪明的人能够改进它:
decimalscores <- (df2[grepl("score_*",colnames(df2))] - floor(df2[grepl("score_*",colnames(df2))]))*10 # Get the decimal, as per the sample only one digit
typesindex <- as.numeric(sub("type_","",colnames(df2[grepl("type_*",colnames(df2))]))) # get the type_"n" columns names to reuse later
res <- t(sapply(1:nrow(df2),function(x) { # loop over the dataframe rows
sapply(typesindex,function(y) { # For each type index
colname <- paste0("type_",y)
cmptype <- unlist(unname(df2[x,colname]))
# create the result if type_n is above 0
ifelse(cmptype > 0,
any(unlist(unname(decimalscores[x,])) >= cmptype)+0L, # If one score is above the value return 1
0) # Else return 0
})
}))
colnames(res) <- paste0("type_",typesindex,"_G2") # Name the resulting columns by adding _G2 to ouptut
res <- as.data.frame(res) # turn matrix into dataframe
df3 <- cbind(df2,res) # bind them to get expected output
我希望评论已经足够解释了,如果有什么不清楚的地方,请告诉我。