筛选数据框,提取每行中首次出现的非零值
filter dataframes for the first occurrence of any non-zero value per row
这些是我拥有的数据框类型,有两个例子说明它们有何不同:
- - - - - - - - - - - - - - - - - - - - - - - - Type1 992.0 4461.0 1.2 38476.0 :1..4473
第二个数据框:
- - - - - - - - - - - - - - - - - - - - - - - - Type2 1.0 5131.0 0.4 44433.0 -:1998..7151
- - - - - - - - - - - - - - - - - - - - - - - - Type2 5331.0 845.0 1.3 6672.0 -:1164..2016
Type3 1945.0 91.0 18.7 426.0 -:501..597 Type3 1912.0 91.0 18.7 426.0 -:501..597 - - - - - - - - - - - - - - - - - -
Type3 2071.0 196.0 18.9 468.0 -:10..236 Type3 2038.0 196.0 18.9 468.0 -:10..236 Type3 2049.0 141.0 16.3 441.0 -:10..196 Type3 2049.0 141.0 16.3 441.0 -:10..196 Type3 8294.0 151.0 17.2 580.0 -:10..196
- - - - - - - - - - - - - - - - - - - - - - - - Type4 8604.0 1473.0 0.5 13042.0 :1..1471
- - - - - - - - - - - - - - - - - - - - - - - - Type5 9795.0 2114.0 32.0 1971.0 :1296..3439
- - - - - - - - - - - - - - - - - - - - - - - - Type6 10131.0 5684.0 0.3 49063.0 :1455..7113
我想要的代码能够提取每行中第一个不是“-”的值,并且不依赖于 'Type*' 这样的模式,只要不是“-”即可。因此输出应如下所示:
Type1
或
Type2
Type3
Type4
Type5
Type6
我当然可以取出所有不等于“-”的内容,但我不知道如何只获取第一次出现的值,因为我希望输出具有相同的维度。我看到过很多在整个数据框中查找某个词首次出现位置的解决方案,但我需要按行处理,一直没能让它们正常工作。
第一个输入 data.frame:
dput(x)
structure(list(V1 = structure(1L, .Label = "-", class = "factor"),
V2 = structure(1L, .Label = "-", class = "factor"), V3 = structure(1L, .Label = "-", class = "factor"),
V4 = structure(1L, .Label = "-", class = "factor"), V5 = structure(1L, .Label = "-", class = "factor"),
V6 = structure(1L, .Label = "-", class = "factor"), V7 = structure(1L, .Label = "-", class = "factor"),
V8 = structure(1L, .Label = "-", class = "factor"), V9 = structure(1L, .Label = "-", class = "factor"),
V10 = structure(1L, .Label = "-", class = "factor"), V11 = structure(1L, .Label = "-", class = "factor"),
V12 = structure(1L, .Label = "-", class = "factor"), V13 = structure(1L, .Label = "-", class = "factor"),
V14 = structure(1L, .Label = "-", class = "factor"), V15 = structure(1L, .Label = "-", class = "factor"),
V16 = structure(1L, .Label = "-", class = "factor"), V17 = structure(1L, .Label = "-", class = "factor"),
V18 = structure(1L, .Label = "-", class = "factor"), V19 = structure(1L, .Label = "-", class = "factor"),
V20 = structure(1L, .Label = "-", class = "factor"), V21 = structure(1L, .Label = "-", class = "factor"),
V22 = structure(1L, .Label = "-", class = "factor"), V23 = structure(1L, .Label = "-", class = "factor"),
V24 = structure(1L, .Label = "-", class = "factor"), V25 = structure(1L, .Label = "Type1", class = "factor"),
V26 = 992, V27 = 4461, V28 = 1.2, V29 = 38476, V30 = structure(1L, .Label = ":1..4473", class = "factor")), class = "data.frame", row.names = c(NA, -1L))
第二个数据框示例:
structure(list(V1 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V2 = structure(c(1L, 1L, 2L, 3L,
1L, 1L, 1L), .Label = c("-", "1945.0", "2071.0"), class = "factor"),
V3 = structure(c(1L, 1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-",
"196.0", "91.0"), class = "factor"), V4 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "18.7", "18.9"), class = "factor"),
V5 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-",
"426.0", "468.0"), class = "factor"), V6 = structure(c(1L,
1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-", "-:10..236", "-:501..597"
), class = "factor"), V7 = structure(c(1L, 1L, 2L, 2L, 1L,
1L, 1L), .Label = c("-", "Type2"), class = "factor"), V8 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "1912.0", "2038.0"
), class = "factor"), V9 = structure(c(1L, 1L, 3L, 2L, 1L,
1L, 1L), .Label = c("-", "196.0", "91.0"), class = "factor"),
V10 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-",
"18.7", "18.9"), class = "factor"), V11 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "426.0", "468.0"
), class = "factor"), V12 = structure(c(1L, 1L, 3L, 2L, 1L,
1L, 1L), .Label = c("-", "-:10..236", "-:501..597"), class = "factor"),
V13 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V14 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"),
V15 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"141.0"), class = "factor"), V16 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"),
V17 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"441.0"), class = "factor"), V18 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"),
V19 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V20 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"),
V21 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"141.0"), class = "factor"), V22 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"),
V23 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"441.0"), class = "factor"), V24 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"),
V25 = structure(c(4L, 4L, 1L, 3L, 5L, 2L, 5L), .Label = c("-",
"Type3", "Type2", "Type4", "Type5"), class = "factor"),
V26 = structure(c(2L, 4L, 1L, 5L, 6L, 7L, 3L), .Label = c("-",
"1.0", "10131.0", "5331.0", "8294.0", "8604.0", "9795.0"), class = "factor"),
V27 = structure(c(5L, 7L, 1L, 3L, 2L, 4L, 6L), .Label = c("-",
"1473.0", "151.0", "2114.0", "5131.0", "5684.0", "845.0"), class = "factor"),
V28 = structure(c(3L, 5L, 1L, 6L, 4L, 7L, 2L), .Label = c("-",
"0.3", "0.4", "0.5", "1.3", "17.2", "32.0"), class = "factor"),
V29 = structure(c(4L, 7L, 1L, 6L, 2L, 3L, 5L), .Label = c("-",
"13042.0", "1971.0", "44433.0", "49063.0", "580.0", "6672.0"
), class = "factor"), V30 = structure(c(4L, 3L, 1L, 2L, 5L,
6L, 7L), .Label = c("-", "-:10..196", "-:1164..2016", "-:1998..7151",
":1..1471", ":1296..3439", ":1455..7113"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
以下方法有效:
apply(df, MARGIN = 1, FUN = function(row) row[!grepl("-", row)][1])
[1] "Type2" "Type2" "Type3" "Type3" "Type4" "Type5" "Type6"
`apply` 配合 `MARGIN = 1` 按行操作。`FUN` 中的函数使用 `grepl` 选出该行中所有不含 `-` 的元素,再用 `[1]` 返回其中的第一个元素。
我们也可以使用向量化的方案:用 `max.col`(ties.method 设为 "first")找出每行中第一个非 `-` 值所在的列位置,再用 `cbind` 把行序号与这些列位置组合成行/列索引,据此提取值。
df1[cbind(seq_len(nrow(df1)), max.col(df1 != "-", "first"))]
[1] "Type4" "Type4" "Type2" "Type2" "Type5" "Type3" "Type5"
x[cbind(seq_len(nrow(x)), max.col(x != "-", "first"))]
[1] "Type1"
这些是我拥有的数据框类型,有两个例子说明它们有何不同:
- - - - - - - - - - - - - - - - - - - - - - - - Type1 992.0 4461.0 1.2 38476.0 :1..4473
第二个数据框:
- - - - - - - - - - - - - - - - - - - - - - - - Type2 1.0 5131.0 0.4 44433.0 -:1998..7151
- - - - - - - - - - - - - - - - - - - - - - - - Type2 5331.0 845.0 1.3 6672.0 -:1164..2016
Type3 1945.0 91.0 18.7 426.0 -:501..597 Type3 1912.0 91.0 18.7 426.0 -:501..597 - - - - - - - - - - - - - - - - - -
Type3 2071.0 196.0 18.9 468.0 -:10..236 Type3 2038.0 196.0 18.9 468.0 -:10..236 Type3 2049.0 141.0 16.3 441.0 -:10..196 Type3 2049.0 141.0 16.3 441.0 -:10..196 Type3 8294.0 151.0 17.2 580.0 -:10..196
- - - - - - - - - - - - - - - - - - - - - - - - Type4 8604.0 1473.0 0.5 13042.0 :1..1471
- - - - - - - - - - - - - - - - - - - - - - - - Type5 9795.0 2114.0 32.0 1971.0 :1296..3439
- - - - - - - - - - - - - - - - - - - - - - - - Type6 10131.0 5684.0 0.3 49063.0 :1455..7113
我想要的代码能够提取每行中第一个不是“-”的值,并且不依赖于 'Type*' 这样的模式,只要不是“-”即可。因此输出应如下所示:
Type1
或
Type2
Type3
Type4
Type5
Type6
我当然可以取出所有不等于“-”的内容,但我不知道如何只获取第一次出现的值,因为我希望输出具有相同的维度。我看到过很多在整个数据框中查找某个词首次出现位置的解决方案,但我需要按行处理,一直没能让它们正常工作。
第一个输入 data.frame:
dput(x)
structure(list(V1 = structure(1L, .Label = "-", class = "factor"),
V2 = structure(1L, .Label = "-", class = "factor"), V3 = structure(1L, .Label = "-", class = "factor"),
V4 = structure(1L, .Label = "-", class = "factor"), V5 = structure(1L, .Label = "-", class = "factor"),
V6 = structure(1L, .Label = "-", class = "factor"), V7 = structure(1L, .Label = "-", class = "factor"),
V8 = structure(1L, .Label = "-", class = "factor"), V9 = structure(1L, .Label = "-", class = "factor"),
V10 = structure(1L, .Label = "-", class = "factor"), V11 = structure(1L, .Label = "-", class = "factor"),
V12 = structure(1L, .Label = "-", class = "factor"), V13 = structure(1L, .Label = "-", class = "factor"),
V14 = structure(1L, .Label = "-", class = "factor"), V15 = structure(1L, .Label = "-", class = "factor"),
V16 = structure(1L, .Label = "-", class = "factor"), V17 = structure(1L, .Label = "-", class = "factor"),
V18 = structure(1L, .Label = "-", class = "factor"), V19 = structure(1L, .Label = "-", class = "factor"),
V20 = structure(1L, .Label = "-", class = "factor"), V21 = structure(1L, .Label = "-", class = "factor"),
V22 = structure(1L, .Label = "-", class = "factor"), V23 = structure(1L, .Label = "-", class = "factor"),
V24 = structure(1L, .Label = "-", class = "factor"), V25 = structure(1L, .Label = "Type1", class = "factor"),
V26 = 992, V27 = 4461, V28 = 1.2, V29 = 38476, V30 = structure(1L, .Label = ":1..4473", class = "factor")), class = "data.frame", row.names = c(NA, -1L))
第二个数据框示例:
structure(list(V1 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V2 = structure(c(1L, 1L, 2L, 3L,
1L, 1L, 1L), .Label = c("-", "1945.0", "2071.0"), class = "factor"),
V3 = structure(c(1L, 1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-",
"196.0", "91.0"), class = "factor"), V4 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "18.7", "18.9"), class = "factor"),
V5 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-",
"426.0", "468.0"), class = "factor"), V6 = structure(c(1L,
1L, 3L, 2L, 1L, 1L, 1L), .Label = c("-", "-:10..236", "-:501..597"
), class = "factor"), V7 = structure(c(1L, 1L, 2L, 2L, 1L,
1L, 1L), .Label = c("-", "Type2"), class = "factor"), V8 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "1912.0", "2038.0"
), class = "factor"), V9 = structure(c(1L, 1L, 3L, 2L, 1L,
1L, 1L), .Label = c("-", "196.0", "91.0"), class = "factor"),
V10 = structure(c(1L, 1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-",
"18.7", "18.9"), class = "factor"), V11 = structure(c(1L,
1L, 2L, 3L, 1L, 1L, 1L), .Label = c("-", "426.0", "468.0"
), class = "factor"), V12 = structure(c(1L, 1L, 3L, 2L, 1L,
1L, 1L), .Label = c("-", "-:10..236", "-:501..597"), class = "factor"),
V13 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V14 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"),
V15 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"141.0"), class = "factor"), V16 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"),
V17 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"441.0"), class = "factor"), V18 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"),
V19 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"Type2"), class = "factor"), V20 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "2049.0"), class = "factor"),
V21 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"141.0"), class = "factor"), V22 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "16.3"), class = "factor"),
V23 = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("-",
"441.0"), class = "factor"), V24 = structure(c(1L, 1L, 1L,
2L, 1L, 1L, 1L), .Label = c("-", "-:10..196"), class = "factor"),
V25 = structure(c(4L, 4L, 1L, 3L, 5L, 2L, 5L), .Label = c("-",
"Type3", "Type2", "Type4", "Type5"), class = "factor"),
V26 = structure(c(2L, 4L, 1L, 5L, 6L, 7L, 3L), .Label = c("-",
"1.0", "10131.0", "5331.0", "8294.0", "8604.0", "9795.0"), class = "factor"),
V27 = structure(c(5L, 7L, 1L, 3L, 2L, 4L, 6L), .Label = c("-",
"1473.0", "151.0", "2114.0", "5131.0", "5684.0", "845.0"), class = "factor"),
V28 = structure(c(3L, 5L, 1L, 6L, 4L, 7L, 2L), .Label = c("-",
"0.3", "0.4", "0.5", "1.3", "17.2", "32.0"), class = "factor"),
V29 = structure(c(4L, 7L, 1L, 6L, 2L, 3L, 5L), .Label = c("-",
"13042.0", "1971.0", "44433.0", "49063.0", "580.0", "6672.0"
), class = "factor"), V30 = structure(c(4L, 3L, 1L, 2L, 5L,
6L, 7L), .Label = c("-", "-:10..196", "-:1164..2016", "-:1998..7151",
":1..1471", ":1296..3439", ":1455..7113"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
以下方法有效:
apply(df, MARGIN = 1, FUN = function(row) row[!grepl("-", row)][1])
[1] "Type2" "Type2" "Type3" "Type3" "Type4" "Type5" "Type6"
`apply` 配合 `MARGIN = 1` 按行操作。`FUN` 中的函数使用 `grepl` 选出该行中所有不含 `-` 的元素,再用 `[1]` 返回其中的第一个元素。
我们也可以使用向量化的方案:用 `max.col`(ties.method 设为 "first")找出每行中第一个非 `-` 值所在的列位置,再用 `cbind` 把行序号与这些列位置组合成行/列索引,据此提取值。
df1[cbind(seq_len(nrow(df1)), max.col(df1 != "-", "first"))]
[1] "Type4" "Type4" "Type2" "Type2" "Type5" "Type3" "Type5"
x[cbind(seq_len(nrow(x)), max.col(x != "-", "first"))]
[1] "Type1"