根据前一行中的值填充缺失的列值
Fill missing column values depending on values in preceding row
我有一个巨大的 table,基本上看起来像这样:
A B C D E F
A B     &
A B C D   $
A B C @
处理后的版本应该是这样的:
A B C D E F
A B B& B& B& B&
A B C D D$ D$
A B C C@ C@ C@
任务是将最后一个非空单元格的值与前一个非空单元格(同一行)的值连接起来,并使用新值填充同一行中的空单元格。
关于如何在 R 中执行此操作的任何建议?
这是一个逐行遍历数据集的方案。对每一行,先取出非空元素('x1'),用 paste 将 'x1' 中最后两个非空元素拼接在一起('x2'),然后把除最后一个之外的所有值(head(x1,-1))与重复若干次的 'x2' 连接起来,重复次数由 'df1' 的列数和 'x1' 的 length 决定。结果可以转置(t)并转换为 data.frame。
# Row-wise fill. For a row that contains blanks, keep every non-blank value
# except the last, then repeat the paste of the last two non-blank values
# until the row is back to its full width. Rows without blanks, and rows
# that are entirely blank, are returned unchanged.
m1 <- t(apply(df1, 1, function(x) {
  is_blank <- !is.na(x) & x == ""
  kept <- x[!is_blank]                     # non-blank cells, in order
  if (!any(is_blank) || length(kept) == 0L) {
    # Nothing to fill, or nothing to fill with: returning x keeps every
    # row the same length so apply() still produces a matrix.
    return(x)
  }
  filler <- paste(tail(kept, 2), collapse = "")
  # length(x) is the row width, so the closure does not depend on df1.
  c(head(kept, -1), rep(filler, length(x) - length(kept) + 1L))
}))
as.data.frame(m1, stringsAsFactors = FALSE)
# V1 V2 V3 V4 V5 V6
#1 A B C D E F
#2 A B B& B& B& B&
#3 A B C D D$ D$
#4 A B C C@ C@ C@
数据
# Example input: 4 rows x 6 character columns; "" marks a blank cell.
df1 <- data.frame(
  v1 = c("A", "A", "A", "A"),
  v2 = c("B", "B", "B", "B"),
  v3 = c("C", "", "C", "C"),
  v4 = c("D", "", "D", "@"),
  v5 = c("E", "&", "", ""),
  v6 = c("F", "", "$", ""),
  stringsAsFactors = FALSE
)
这个问题看起来很有趣。我把数据框中的空白单元格设为 "",并将数据框命名为 df。
# Fill value for each row: its last two non-blank cells pasted together.
cells <- as.matrix(df)
is_blank <- cells == ""
fill <- apply(cells, 1, function(x) {
  kept <- x[x != ""]
  paste(tail(kept, 2), collapse = "")
})
# Column of the last non-blank cell in each row (0 when the row is all blank).
last_kept <- apply(!is_blank, 1, function(k) if (any(k)) max(which(k)) else 0L)
# Overwrite the blanks and, in rows that contain blanks, the last non-blank
# cell as well; otherwise the bare symbol would stay behind ("&" rather than
# "B&"). Rows without blanks are left untouched.
has_blank <- rowSums(is_blank) > 0
target <- is_blank | (has_blank & col(is_blank) == last_kept)
# fill is recycled down the columns, so row i of the matrix holds fill[i].
df[target] <- matrix(fill, nrow = nrow(df), ncol = ncol(df))[target]
为每一行求出唯一的填充值,用这些填充值构造一个与原始数据结构相同的矩阵,然后只取出需要替换的那些位置的值。
# Example input: 3 rows x 6 character columns; "" marks a blank cell.
df <- data.frame(
  A = c("A", "A", "A"),
  B = c("B", "B", "B"),
  C = c("", "C", "C"),
  D = c("", "D", "@"),
  E = c("&", "", ""),
  F = c("", "$", ""),
  stringsAsFactors = FALSE
)
这个问题非常适合使用 zoo 包中的 na.locf:
首先,将""
替换为NA
# Mark blank cells ("") as missing so that na.locf() can fill them.
x[x == ""] <- NA
去除符号:
# Copy of x in which every cell that is not one of the capital letters in
# LETTERS is set to NA.
x.no.sym <- x
x.no.sym[!sapply(x.no.sym, function(cells) cells %in% LETTERS)] <- NA
填写字母:
# Carry each letter forward along its row. The zoo:: prefix makes the call
# work without library(zoo) attached; na.rm = FALSE keeps leading NAs so that
# every row keeps its full length and apply() still returns a matrix.
x.no.sym.fill <- t(apply(x.no.sym, 1, zoo::na.locf, na.rm = FALSE))
V1 V2 V3 V4 V5 V6
[1,] "A" "B" "C" "D" "E" "F"
[2,] "A" "B" "B" "B" "B" "B"
[3,] "A" "B" "C" "D" "D" "D"
[4,] "A" "B" "C" "C" "C" "C"
现在填写符号并删除字母:
# For each row, carry values backward over the NAs and then forward over the
# NAs that remain; na.rm = FALSE keeps every row at its full length.
x.sym.fill <- t(apply(x, 1, function(y) {
  backward <- zoo::na.locf(y, fromLast = TRUE, na.rm = FALSE)
  zoo::na.locf(backward, na.rm = FALSE)
}))
# Blank out the letters so that only the symbols remain.
x.sym.fill[x.sym.fill %in% LETTERS] <- ""
V1 V2 V3 V4 V5 V6
[1,] "" "" "" "" "" ""
[2,] "" "" "&" "&" "&" "&"
[3,] "" "" "" "" "$" "$"
[4,] "" "" "" "@" "@" "@"
现在连接:
# Paste the letter matrix and the symbol matrix together cell by cell;
# matrix() then lays the result out in ncol(x) columns.
> matrix(paste0(x.no.sym.fill,x.sym.fill),ncol=ncol(x))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "A" "B" "C" "D" "E" "F"
[2,] "A" "B" "B&" "B&" "B&" "B&"
[3,] "A" "B" "C" "D" "D$" "D$"
[4,] "A" "B" "C" "C@" "C@" "C@"
我有一个巨大的 table,基本上看起来像这样:
A B C D E F
A B     &
A B C D   $
A B C @
处理后的版本应该是这样的:
A B C D E F
A B B& B& B& B&
A B C D D$ D$
A B C C@ C@ C@
任务是将最后一个非空单元格的值与前一个非空单元格(同一行)的值连接起来,并使用新值填充同一行中的空单元格。
关于如何在 R 中执行此操作的任何建议?
这是一个逐行遍历数据集的方案。对每一行,先取出非空元素('x1'),用 paste 将 'x1' 中最后两个非空元素拼接在一起('x2'),然后把除最后一个之外的所有值(head(x1,-1))与重复若干次的 'x2' 连接起来,重复次数由 'df1' 的列数和 'x1' 的 length 决定。结果可以转置(t)并转换为 data.frame。
# Row-wise fill. For a row that contains blanks, keep every non-blank value
# except the last, then repeat the paste of the last two non-blank values
# until the row is back to its full width. Rows without blanks, and rows
# that are entirely blank, are returned unchanged.
m1 <- t(apply(df1, 1, function(x) {
  is_blank <- !is.na(x) & x == ""
  kept <- x[!is_blank]                     # non-blank cells, in order
  if (!any(is_blank) || length(kept) == 0L) {
    # Nothing to fill, or nothing to fill with: returning x keeps every
    # row the same length so apply() still produces a matrix.
    return(x)
  }
  filler <- paste(tail(kept, 2), collapse = "")
  # length(x) is the row width, so the closure does not depend on df1.
  c(head(kept, -1), rep(filler, length(x) - length(kept) + 1L))
}))
as.data.frame(m1, stringsAsFactors = FALSE)
# V1 V2 V3 V4 V5 V6
#1 A B C D E F
#2 A B B& B& B& B&
#3 A B C D D$ D$
#4 A B C C@ C@ C@
数据
# Example input: 4 rows x 6 character columns; "" marks a blank cell.
df1 <- data.frame(
  v1 = c("A", "A", "A", "A"),
  v2 = c("B", "B", "B", "B"),
  v3 = c("C", "", "C", "C"),
  v4 = c("D", "", "D", "@"),
  v5 = c("E", "&", "", ""),
  v6 = c("F", "", "$", ""),
  stringsAsFactors = FALSE
)
这个问题看起来很有趣。我把数据框中的空白单元格设为 "",并将数据框命名为 df。
# Fill value for each row: its last two non-blank cells pasted together.
cells <- as.matrix(df)
is_blank <- cells == ""
fill <- apply(cells, 1, function(x) {
  kept <- x[x != ""]
  paste(tail(kept, 2), collapse = "")
})
# Column of the last non-blank cell in each row (0 when the row is all blank).
last_kept <- apply(!is_blank, 1, function(k) if (any(k)) max(which(k)) else 0L)
# Overwrite the blanks and, in rows that contain blanks, the last non-blank
# cell as well; otherwise the bare symbol would stay behind ("&" rather than
# "B&"). Rows without blanks are left untouched.
has_blank <- rowSums(is_blank) > 0
target <- is_blank | (has_blank & col(is_blank) == last_kept)
# fill is recycled down the columns, so row i of the matrix holds fill[i].
df[target] <- matrix(fill, nrow = nrow(df), ncol = ncol(df))[target]
为每一行求出唯一的填充值,用这些填充值构造一个与原始数据结构相同的矩阵,然后只取出需要替换的那些位置的值。
# Example input: 3 rows x 6 character columns; "" marks a blank cell.
df <- data.frame(
  A = c("A", "A", "A"),
  B = c("B", "B", "B"),
  C = c("", "C", "C"),
  D = c("", "D", "@"),
  E = c("&", "", ""),
  F = c("", "$", ""),
  stringsAsFactors = FALSE
)
这个问题非常适合使用 zoo 包中的 na.locf:
首先,将""
替换为NA
# Mark blank cells ("") as missing so that na.locf() can fill them.
x[x == ""] <- NA
去除符号:
# Copy of x in which every cell that is not one of the capital letters in
# LETTERS is set to NA.
x.no.sym <- x
x.no.sym[!sapply(x.no.sym, function(cells) cells %in% LETTERS)] <- NA
填写字母:
# Carry each letter forward along its row. The zoo:: prefix makes the call
# work without library(zoo) attached; na.rm = FALSE keeps leading NAs so that
# every row keeps its full length and apply() still returns a matrix.
x.no.sym.fill <- t(apply(x.no.sym, 1, zoo::na.locf, na.rm = FALSE))
V1 V2 V3 V4 V5 V6
[1,] "A" "B" "C" "D" "E" "F"
[2,] "A" "B" "B" "B" "B" "B"
[3,] "A" "B" "C" "D" "D" "D"
[4,] "A" "B" "C" "C" "C" "C"
现在填写符号并删除字母:
# For each row, carry values backward over the NAs and then forward over the
# NAs that remain; na.rm = FALSE keeps every row at its full length.
x.sym.fill <- t(apply(x, 1, function(y) {
  backward <- zoo::na.locf(y, fromLast = TRUE, na.rm = FALSE)
  zoo::na.locf(backward, na.rm = FALSE)
}))
# Blank out the letters so that only the symbols remain.
x.sym.fill[x.sym.fill %in% LETTERS] <- ""
V1 V2 V3 V4 V5 V6
[1,] "" "" "" "" "" ""
[2,] "" "" "&" "&" "&" "&"
[3,] "" "" "" "" "$" "$"
[4,] "" "" "" "@" "@" "@"
现在连接:
# Paste the letter matrix and the symbol matrix together cell by cell;
# matrix() then lays the result out in ncol(x) columns.
> matrix(paste0(x.no.sym.fill,x.sym.fill),ncol=ncol(x))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "A" "B" "C" "D" "E" "F"
[2,] "A" "B" "B&" "B&" "B&" "B&"
[3,] "A" "B" "C" "D" "D$" "D$"
[4,] "A" "B" "C" "C@" "C@" "C@"