如何删除序列中有两个或多个单词(一个接一个)的行?
How can I delete rows which have two or more words (after each other) in a sequence?
我想删除那些同一个单词连续出现两次或更多次(即在序列中相邻重复)的行。这是为了做序列模式挖掘分析。
I already tried the distinct() and duplicated() functions, but these only remove entire duplicated rows.
# Drop every row that is an exact duplicate of an earlier row.
is_dup <- duplicated(r_seq_5)
r_seq_5 <- r_seq_5[!is_dup, ]
# Su Score result ROI next_roi third_roi four_roi five_roi
# 1 1 90 high Elsewhere Elsewhere Teacher Teacher Teacher
# 2 1 90 high Elsewhere Teacher Teacher Teacher Teacher
# 3 1 90 high Teacher Pen Teacher Elsewhere Smartboard
这是数据表。Teacher 在同一行中出现两三次没有关系,只要不是紧挨着连续出现就可以。
期望的结果是:
# 1 1 90 high Teacher Pen Teacher Elsewhere Smartboard
您可以使用 gather() 把各个 ROI 列转换成长格式,然后构建一个循环来标记与前一个值相同的值。
最后,使用 spread() 还原成最初的宽格式结构。
library(dplyr)
library(tidyr)

# Long format: one record per (row, ROI position). pivot_longer() emits the
# positions of each input row together and in their original column order, so
# lag() below always refers to the immediately preceding ROI of the same row.
# pivot_longer()/pivot_wider() are the current replacements for the superseded
# gather()/spread(); unlike spread(), pivot_wider() keeps the columns in order
# of first appearance instead of sorting them by name, which matters because
# the column order *is* the sequence.
df <- data.frame(
  row = 1:4,
  Su = 1,
  Score = 90,
  result = "high",
  ROI = c("A", "A", "B", "A"),
  ROI2 = c("A", "B", "C", "B"),
  ROI3 = c("B", "B", "A", "C")
) %>%
  pivot_longer(-(row:result), names_to = "roi", values_to = "value") %>%
  group_by(row) %>%
  # repeated is 1 when a value equals the one just before it in the same row.
  # The first position has no predecessor (lag() yields NA); coalesce() turns
  # that NA comparison into "not repeated".
  mutate(repeated = as.numeric(coalesce(value == lag(value), FALSE))) %>%
  ungroup()

# Keep only the rows that contain no immediate repetition at all, then go
# back to the wide layout.
df_clean <- df %>%
  group_by(row) %>%
  filter(sum(repeated) == 0) %>%
  ungroup() %>%
  select(-repeated) %>%
  pivot_wider(names_from = roi, values_from = value)
df_clean
# # A tibble: 2 x 7
#     row    Su Score result ROI   ROI2  ROI3
#   <int> <dbl> <dbl> <chr>  <chr> <chr> <chr>
# 1     3     1    90 high   B     C     A
# 2     4     1    90 high   A     B     C
为此,我发现先把因子(类别标签)转换成数字会比较方便。这是我的第一步,因为这样一来逐列比较(match)就没那么麻烦了。
为此我使用了 for 循环和 qdap 包:先把匹配到的值替换为 NA,再用 qdap::NAer() 把这些 NA 替换成对应的数字。
library(dplyr)
library(qdap)
# Toy data: three sequences of five ROI labels each, preceded by two constant
# columns (Su and Score).
df <- data.frame(
  Su = c(1, 1, 1),
  Score = c(90, 90, 90),
  ROI = c("A", "A", "B"),
  NETX_ROI = c("A", "B", "C"),
  third_roi = c("B", "B", "B"),
  four_roi = c("B", "B", "A"),
  five_roi = c("B", "B", "D")
)
df
> df
Su Score ROI NETX_ROI third_roi four_roi five_roi
1 1 90 A A B B B
2 1 90 A B B B B
3 1 90 B C B A D
df2 <- df
roi <- c("A", "B", "C", "D")
# Legend for the short labels used in this example:
# A = Elsewhere
# B = Teacher
# C = Pen
# D = Smartboard
n <- seq_along(roi) # numeric code of each label, in the order of `roi`

# Only label columns (character/factor) are recoded; numeric columns such as
# Su and Score are left untouched.
is_label <- vapply(
  df2,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)

# Fail loudly on a label that has no code instead of silently mixing raw
# labels and numeric codes in the same table.
labels_seen <- unlist(lapply(df2[is_label], as.character), use.names = FALSE)
unknown <- setdiff(labels_seen[!is.na(labels_seen)], roi)
if (length(unknown) > 0) {
  stop(
    "Labels missing from `roi`: ", paste(unknown, collapse = ", "),
    call. = FALSE
  )
}

# match() looks every label up in `roi` in one vectorised step. Values that
# are genuinely missing stay NA rather than being mistaken for a label, which
# is what happens when NA is used as a temporary placeholder and then filled
# in with a code.
df2[is_label] <- lapply(df2[is_label], function(col) n[match(col, roi)])
> df2
Su Score ROI NETX_ROI third_roi four_roi five_roi
1 1 90 1 1 2 2 2
2 1 90 1 2 2 2 2
3 1 90 2 3 2 1 4
# Keep only the ROI sequence columns, as a matrix, so that neighbouring
# columns can be compared in a single vectorised step.
df2 <- df2 %>%
  dplyr::select(-c(Su, Score)) %>%
  as.matrix()
nn <- ncol(df2)

# left holds columns 1..(nn - 1) and right holds columns 2..nn, so
# left[r, j] == right[r, j] compares position j with position j + 1 of row r.
# drop = FALSE keeps both as matrices even when only one column remains, and
# a single-column input simply yields a matrix with zero columns.
left <- df2[, -nn, drop = FALSE]
right <- df2[, -1, drop = FALSE]

# x[r, j] is NA when row r repeats a value immediately (positions j and
# j + 1 are equal) and 0 otherwise. unname() strips the column names that the
# comparison inherits from `left`, so x stays a plain unnamed matrix.
x <- unname(ifelse(left == right, NA, 0))
> x
[,1] [,2] [,3] [,4]
[1,] NA 0 NA NA
[2,] 0 NA NA NA
[3,] 0 0 0 0
最后,我删除了带有 NA 的行。
# A row of x contains NA exactly when that row of df has at least one pair of
# identical neighbouring values. Filtering df directly with this flag avoids
# binding the helper columns on and selecting them away again, and it does not
# discard rows merely because df itself has a missing value in an unrelated
# column.
has_repeat <- rowSums(is.na(x)) > 0
df_test <- df[!has_repeat, , drop = FALSE]
df_test
> df_test
Su Score ROI NETX_ROI third_roi four_roi five_roi
3 1 90 B C B A D
我想删除那些同一个单词连续出现两次或更多次(即在序列中相邻重复)的行。这是为了做序列模式挖掘分析。
I already tried the distinct() and duplicated() functions, but these only remove entire duplicated rows.
# Drop every row that is an exact duplicate of an earlier row.
is_dup <- duplicated(r_seq_5)
r_seq_5 <- r_seq_5[!is_dup, ]
# Su Score result ROI next_roi third_roi four_roi five_roi
# 1 1 90 high Elsewhere Elsewhere Teacher Teacher Teacher
# 2 1 90 high Elsewhere Teacher Teacher Teacher Teacher
# 3 1 90 high Teacher Pen Teacher Elsewhere Smartboard
这是数据表。Teacher 在同一行中出现两三次没有关系,只要不是紧挨着连续出现就可以。
期望的结果是:
# 1 1 90 high Teacher Pen Teacher Elsewhere Smartboard
您可以使用 gather() 把各个 ROI 列转换成长格式,然后构建一个循环来标记与前一个值相同的值。
最后,使用 spread() 还原成最初的宽格式结构。
library(dplyr)
library(tidyr)

# Long format: one record per (row, ROI position). pivot_longer() emits the
# positions of each input row together and in their original column order, so
# lag() below always refers to the immediately preceding ROI of the same row.
# pivot_longer()/pivot_wider() are the current replacements for the superseded
# gather()/spread(); unlike spread(), pivot_wider() keeps the columns in order
# of first appearance instead of sorting them by name, which matters because
# the column order *is* the sequence.
df <- data.frame(
  row = 1:4,
  Su = 1,
  Score = 90,
  result = "high",
  ROI = c("A", "A", "B", "A"),
  ROI2 = c("A", "B", "C", "B"),
  ROI3 = c("B", "B", "A", "C")
) %>%
  pivot_longer(-(row:result), names_to = "roi", values_to = "value") %>%
  group_by(row) %>%
  # repeated is 1 when a value equals the one just before it in the same row.
  # The first position has no predecessor (lag() yields NA); coalesce() turns
  # that NA comparison into "not repeated".
  mutate(repeated = as.numeric(coalesce(value == lag(value), FALSE))) %>%
  ungroup()

# Keep only the rows that contain no immediate repetition at all, then go
# back to the wide layout.
df_clean <- df %>%
  group_by(row) %>%
  filter(sum(repeated) == 0) %>%
  ungroup() %>%
  select(-repeated) %>%
  pivot_wider(names_from = roi, values_from = value)
df_clean
# # A tibble: 2 x 7
#     row    Su Score result ROI   ROI2  ROI3
#   <int> <dbl> <dbl> <chr>  <chr> <chr> <chr>
# 1     3     1    90 high   B     C     A
# 2     4     1    90 high   A     B     C
为此,我发现先把因子(类别标签)转换成数字会比较方便。这是我的第一步,因为这样一来逐列比较(match)就没那么麻烦了。
为此我使用了 for 循环和 qdap 包:先把匹配到的值替换为 NA,再用 qdap::NAer() 把这些 NA 替换成对应的数字。
library(dplyr)
library(qdap)
# Toy data: three sequences of five ROI labels each, preceded by two constant
# columns (Su and Score).
df <- data.frame(
  Su = c(1, 1, 1),
  Score = c(90, 90, 90),
  ROI = c("A", "A", "B"),
  NETX_ROI = c("A", "B", "C"),
  third_roi = c("B", "B", "B"),
  four_roi = c("B", "B", "A"),
  five_roi = c("B", "B", "D")
)
df
> df
Su Score ROI NETX_ROI third_roi four_roi five_roi
1 1 90 A A B B B
2 1 90 A B B B B
3 1 90 B C B A D
df2 <- df
roi <- c("A", "B", "C", "D")
# Legend for the short labels used in this example:
# A = Elsewhere
# B = Teacher
# C = Pen
# D = Smartboard
n <- seq_along(roi) # numeric code of each label, in the order of `roi`

# Only label columns (character/factor) are recoded; numeric columns such as
# Su and Score are left untouched.
is_label <- vapply(
  df2,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)

# Fail loudly on a label that has no code instead of silently mixing raw
# labels and numeric codes in the same table.
labels_seen <- unlist(lapply(df2[is_label], as.character), use.names = FALSE)
unknown <- setdiff(labels_seen[!is.na(labels_seen)], roi)
if (length(unknown) > 0) {
  stop(
    "Labels missing from `roi`: ", paste(unknown, collapse = ", "),
    call. = FALSE
  )
}

# match() looks every label up in `roi` in one vectorised step. Values that
# are genuinely missing stay NA rather than being mistaken for a label, which
# is what happens when NA is used as a temporary placeholder and then filled
# in with a code.
df2[is_label] <- lapply(df2[is_label], function(col) n[match(col, roi)])
> df2
Su Score ROI NETX_ROI third_roi four_roi five_roi
1 1 90 1 1 2 2 2
2 1 90 1 2 2 2 2
3 1 90 2 3 2 1 4
# Keep only the ROI sequence columns, as a matrix, so that neighbouring
# columns can be compared in a single vectorised step.
df2 <- df2 %>%
  dplyr::select(-c(Su, Score)) %>%
  as.matrix()
nn <- ncol(df2)

# left holds columns 1..(nn - 1) and right holds columns 2..nn, so
# left[r, j] == right[r, j] compares position j with position j + 1 of row r.
# drop = FALSE keeps both as matrices even when only one column remains, and
# a single-column input simply yields a matrix with zero columns.
left <- df2[, -nn, drop = FALSE]
right <- df2[, -1, drop = FALSE]

# x[r, j] is NA when row r repeats a value immediately (positions j and
# j + 1 are equal) and 0 otherwise. unname() strips the column names that the
# comparison inherits from `left`, so x stays a plain unnamed matrix.
x <- unname(ifelse(left == right, NA, 0))
> x
[,1] [,2] [,3] [,4]
[1,] NA 0 NA NA
[2,] 0 NA NA NA
[3,] 0 0 0 0
最后,我删除了带有 NA 的行。
# A row of x contains NA exactly when that row of df has at least one pair of
# identical neighbouring values. Filtering df directly with this flag avoids
# binding the helper columns on and selecting them away again, and it does not
# discard rows merely because df itself has a missing value in an unrelated
# column.
has_repeat <- rowSums(is.na(x)) > 0
df_test <- df[!has_repeat, , drop = FALSE]
df_test
> df_test
Su Score ROI NETX_ROI third_roi four_roi five_roi
3 1 90 B C B A D