使用 rstatix 识别多个变量异常值
Identifying several variable outliers with rstatix
这是我所拥有数据的 dput 输出。我只包含了数据的头部,因为这是一个非常庞大的数据集,但我认为就我的问题而言应该足够了:
# dput() output for the head of the data set: six rows, twelve integer
# columns, with a single NA in Interests.or.hobbies.
structure(
  list(
    Prioritising.workload = c(2L, 2L, 2L, 4L, 1L, 2L),
    Writing.notes = c(5L, 4L, 5L, 4L, 2L, 3L),
    Workaholism = c(4L, 5L, 3L, 5L, 3L, 3L),
    Reliability = c(4L, 4L, 4L, 3L, 5L, 3L),
    Self.criticism = c(1L, 4L, 4L, 5L, 5L, 4L),
    Loneliness = c(3L, 2L, 5L, 5L, 3L, 2L),
    Changing.the.past = c(1L, 4L, 5L, 5L, 4L, 3L),
    Number.of.friends = c(3L, 3L, 3L, 1L, 3L, 3L),
    Mood.swings = c(3L, 4L, 4L, 5L, 2L, 3L),
    Socializing = c(3L, 4L, 5L, 1L, 3L, 4L),
    Energy.levels = c(5L, 3L, 4L, 2L, 5L, 4L),
    Interests.or.hobbies = c(3L, 3L, 5L, NA, 3L, 5L)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
我试图找出所有这些变量的离群值。如果我单独执行此操作,我将得到以下与尼罗河一样长的代码:
# EFA personality data check ----
# One pipeline per column: identify_outliers() keeps the rows it flags as
# outliers for that column, and select() shows only their `is.extreme` flag.

ef.personality %>%
  identify_outliers(Prioritising.workload) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Writing.notes) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Workaholism) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Reliability) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Self.criticism) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Loneliness) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Changing.the.past) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Number.of.friends) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Mood.swings) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Socializing) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Energy.levels) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Interests.or.hobbies) %>%
  select(is.extreme)
我可以使用一些命令来简化这一切吗?我在想,也许可以用某种循环依次检查每个变量并返回每个变量的异常值,但我不确定如何实现。我也对不依赖 rstatix 的解决方案持开放态度。
rstatix 的美妙之处在于它对管道友好,因此您可以将它与 tidyverse 框架一起使用。tidyverse 需要长格式(long-form)的数据。您可以使用以下代码:
library(tidyverse)
library(rstatix)
# Flag outliers for every column in one pass: reshape the data to long form
# (one row per observation/variable pair) and run identify_outliers() once
# per variable.
ef.personality %>%
  # Row identifier taken from the piped data. row_number() also copes with a
  # zero-row data frame, where seq(1, nrow(ef.personality), 1) would error.
  mutate(id = row_number()) %>%
  # Stack every column except `id` into `name` (column name) / `value` pairs.
  pivot_longer(-id) %>%
  # One group per original column, so outliers are judged within a variable.
  group_by(name) %>%
  identify_outliers(value) %>%
  # Keep the variable name and the extreme-outlier flag of each flagged row.
  select(name, is.extreme)
这是我所拥有数据的 dput 输出。我只包含了数据的头部,因为这是一个非常庞大的数据集,但我认为就我的问题而言应该足够了:
# dput() output for the head of the data set: six rows, twelve integer
# columns, with a single NA in Interests.or.hobbies.
structure(
  list(
    Prioritising.workload = c(2L, 2L, 2L, 4L, 1L, 2L),
    Writing.notes = c(5L, 4L, 5L, 4L, 2L, 3L),
    Workaholism = c(4L, 5L, 3L, 5L, 3L, 3L),
    Reliability = c(4L, 4L, 4L, 3L, 5L, 3L),
    Self.criticism = c(1L, 4L, 4L, 5L, 5L, 4L),
    Loneliness = c(3L, 2L, 5L, 5L, 3L, 2L),
    Changing.the.past = c(1L, 4L, 5L, 5L, 4L, 3L),
    Number.of.friends = c(3L, 3L, 3L, 1L, 3L, 3L),
    Mood.swings = c(3L, 4L, 4L, 5L, 2L, 3L),
    Socializing = c(3L, 4L, 5L, 1L, 3L, 4L),
    Energy.levels = c(5L, 3L, 4L, 2L, 5L, 4L),
    Interests.or.hobbies = c(3L, 3L, 5L, NA, 3L, 5L)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
我试图找出所有这些变量的离群值。如果我单独执行此操作,我将得到以下与尼罗河一样长的代码:
# EFA personality data check ----
# One pipeline per column: identify_outliers() keeps the rows it flags as
# outliers for that column, and select() shows only their `is.extreme` flag.

ef.personality %>%
  identify_outliers(Prioritising.workload) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Writing.notes) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Workaholism) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Reliability) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Self.criticism) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Loneliness) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Changing.the.past) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Number.of.friends) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Mood.swings) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Socializing) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Energy.levels) %>%
  select(is.extreme)

ef.personality %>%
  identify_outliers(Interests.or.hobbies) %>%
  select(is.extreme)
我可以使用一些命令来简化这一切吗?我在想,也许可以用某种循环依次检查每个变量并返回每个变量的异常值,但我不确定如何实现。我也对不依赖 rstatix 的解决方案持开放态度。
rstatix 的美妙之处在于它对管道友好,因此您可以将它与 tidyverse 框架一起使用。tidyverse 需要长格式(long-form)的数据。您可以使用以下代码:
library(tidyverse)
library(rstatix)
# Flag outliers for every column in one pass: reshape the data to long form
# (one row per observation/variable pair) and run identify_outliers() once
# per variable.
ef.personality %>%
  # Row identifier taken from the piped data. row_number() also copes with a
  # zero-row data frame, where seq(1, nrow(ef.personality), 1) would error.
  mutate(id = row_number()) %>%
  # Stack every column except `id` into `name` (column name) / `value` pairs.
  pivot_longer(-id) %>%
  # One group per original column, so outliers are judged within a variable.
  group_by(name) %>%
  identify_outliers(value) %>%
  # Keep the variable name and the extreme-outlier flag of each flagged row.
  select(name, is.extreme)