规范化 R 中的数据框
Normalise a dataframe in R
我有一个名为 examples 的数据框,其中记录了若干语义特征的出现次数:
> str(examples)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 50 obs. of 12 variables:
$ filename : chr "Text01" "Text02" "Text03" "Text04" ...
$ Control : num 1 3 0 0 0 6 0 1 0 1 ...
$ Economic : num 1 3 0 0 0 0 1 0 1 2 ...
$ ExternalVoices: num 1 2 1 1 1 2 1 4 0 1 ...
$ JobsSkills : num 0 0 0 0 0 2 0 3 0 0 ...
$ LegalStatus : num 0 3 4 0 5 0 1 0 4 0 ...
$ Modals : num 4 6 1 5 4 4 2 6 2 2 ...
$ Orign : num 2 6 8 6 3 5 3 3 2 6 ...
$ Sanctions : num 1 3 0 3 0 3 2 1 1 0 ...
$ Subjectivisms : num 2 3 4 4 3 2 1 1 2 4 ...
$ Verbs : num 3 7 3 11 6 2 7 7 4 5 ...
$ LineTotal : num 130 274 258 419 268 210 379 244 172 199 ...
- attr(*, "spec")=
.. cols(
.. filename = col_character(),
.. Control = col_double(),
.. Economic = col_double(),
.. ExternalVoices = col_double(),
.. JobsSkills = col_double(),
.. LegalStatus = col_double(),
.. Modals = col_double(),
.. Orign = col_double(),
.. Sanctions = col_double(),
.. Subjectivisms = col_double(),
.. Verbs = col_double(),
.. LineTotal = col_double()
.. )
head(examples)
A tibble: 6 x 12
filename Control Economic ExternalVoices JobsSkills LegalStatus Modals Orign Sanctions Subjectivisms Verbs LineTotal
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Text01 1 1 1 0 0 4 2 1 2 3 130
2 Text02 3 3 2 0 3 6 6 3 3 7 274
3 Text03 0 0 1 0 4 1 8 0 4 3 258
4 Text04 0 0 1 0 0 5 6 3 4 11 419
5 Text05 0 0 1 0 5 4 3 0 3 6 268
6 Text06 6 0 2 2 0 4 5 3 2 2 210
我需要应用一个公式:将每个单元格的值乘以 1000,然后除以该行的总数,也就是逐行除以 "LineTotal" 列中对应的值:
(cell × 1000) / LineTotal
LineTotal 的值每行各不相同。
我什至想不出办法来完成它。任何帮助都会很棒!
谢谢!
编辑
下面提供 dput(df) 的输出,以便复现(注意:该输出中的合计列名为 Total,而非上文的 LineTotal):
> dput(df)
structure(list(filename = c("Text01", "Text02", "Text03", "Text04",
"Text05", "Text06", "Text07", "Text08", "Text09", "Text10", "Text11",
"Text12", "Text13", "Text14", "Text15", "Text16", "Text17", "Text18",
"Text19", "Text20", "Text21", "Text22", "Text23", "Text24", "Text25",
"Text26", "Text27", "Text28", "Text29", "Text30", "Text31", "Text32",
"Text33", "Text34", "Text35", "Text36", "Text37", "Text38", "Text39",
"Text40", "Text41", "Text42", "Text43", "Text44", "Text45", "Text46",
"Text47", "Text48", "Text49", "Text50"), Control = c(1, 3, 0,
0, 0, 6, 0, 1, 0, 1, 1, 4, 0, 2, 0, 3, 3, 1, 1, 0, 0, 2, 1, 5,
5, 2, 0, 0, 1, 3, 3, 0, 0, 0, 4, 1, 1, 2, 0, 0, 0, 0, 4, 0, 3,
0, 2, 1, 0, 0), Economic = c(1, 3, 0, 0, 0, 0, 1, 0, 1, 2, 0,
1, 2, 2, 0, 1, 1, 4, 0, 1, 0, 0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 2,
1, 3, 7, 1, 2, 3, 0, 4, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2), ExternalVoices = c(1,
2, 1, 1, 1, 2, 1, 4, 0, 1, 0, 8, 6, 2, 0, 6, 1, 2, 3, 0, 1, 4,
2, 1, 2, 0, 0, 3, 1, 2, 1, 1, 4, 7, 5, 2, 1, 3, 0, 0, 2, 0, 3,
0, 4, 1, 1, 2, 0, 1), JobsSkills = c(0, 0, 0, 0, 0, 2, 0, 3,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), LegalStatus = c(0, 3, 4, 0, 5, 0, 1, 0, 4, 0, 1, 3, 2, 0,
0, 3, 0, 0, 1, 1, 0, 1, 0, 2, 3, 0, 3, 0, 2, 0, 2, 2, 12, 2,
0, 0, 3, 0, 1, 1, 5, 2, 0, 0, 5, 1, 7, 3, 1, 0), Modals = c(4,
6, 1, 5, 4, 4, 2, 6, 2, 2, 0, 5, 2, 1, 7, 5, 6, 1, 0, 0, 1, 2,
6, 0, 2, 8, 0, 3, 8, 0, 1, 2, 5, 13, 2, 7, 1, 2, 4, 0, 2, 4,
5, 8, 5, 0, 2, 7, 1, 3), Orign = c(2, 6, 8, 6, 3, 5, 3, 3, 2,
6, 1, 8, 2, 7, 8, 8, 12, 7, 6, 2, 3, 5, 5, 2, 2, 4, 2, 1, 7,
6, 5, 5, 11, 5, 7, 12, 6, 8, 5, 12, 12, 1, 4, 7, 7, 3, 6, 2,
3, 5), Sanctions = c(1, 3, 0, 3, 0, 3, 2, 1, 1, 0, 0, 3, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 5, 4, 3, 4, 2, 2, 0, 6, 6, 6, 0,
2, 4, 1, 4, 9, 1, 3, 3, 4, 2, 1, 4, 1, 0, 1), Subjectivisms = c(2,
3, 4, 4, 3, 2, 1, 1, 2, 4, 0, 5, 4, 2, 2, 2, 2, 5, 4, 0, 0, 4,
3, 2, 6, 5, 0, 4, 10, 4, 2, 0, 7, 3, 6, 3, 2, 6, 4, 7, 6, 1,
4, 3, 2, 0, 4, 5, 2, 4), Verbs = c(3, 7, 3, 11, 6, 2, 7, 7, 4,
5, 1, 9, 6, 7, 10, 6, 11, 7, 7, 2, 2, 8, 5, 8, 7, 8, 2, 6, 6,
6, 7, 4, 12, 10, 7, 11, 9, 10, 7, 21, 11, 3, 4, 9, 7, 3, 4, 6,
2, 10), Total = c(130, 274, 258, 419, 268, 210, 379, 244, 172,
199, 87, 462, 211, 251, 382, 313, 509, 287, 253, 123, 92, 269,
292, 313, 311, 361, 200, 261, 387, 261, 263, 293, 554, 587, 325,
562, 434, 315, 521, 660, 661, 202, 204, 297, 549, 161, 368, 288,
71, 341)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -50L), spec = structure(list(cols = list(
filename = structure(list(), class = c("collector_character",
"collector")), Control = structure(list(), class = c("collector_double",
"collector")), Economic = structure(list(), class = c("collector_double",
"collector")), ExternalVoices = structure(list(), class = c("collector_double",
"collector")), JobsSkills = structure(list(), class = c("collector_double",
"collector")), LegalStatus = structure(list(), class = c("collector_double",
"collector")), Modals = structure(list(), class = c("collector_double",
"collector")), Orign = structure(list(), class = c("collector_double",
"collector")), Sanctions = structure(list(), class = c("collector_double",
"collector")), Subjectivisms = structure(list(), class = c("collector_double",
"collector")), Verbs = structure(list(), class = c("collector_double",
"collector")), Total = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
谢谢!
我认为这就是您要找的内容(其中 example 是您的数据框):
example[,2:ncol(example)] <- lapply(2:ncol(example), function(X) example[,X]*1000/sum(example[,X]))
编辑:之前的问题在于数据框的第一列是字符型。这段新代码适用于您提供的示例数据。
这是使用 tidyverse 的一种可能的解决方案:
library(tidyverse)
examples %>%
mutate_at(
vars(-matches("filename|LineTotal")),
~ .x * 1000 / LineTotal
)
我想这就是您要找的东西
1) 确定合计列 "LineTotal" 的列索引(减 1 是因为下一步会先去掉第一列)
colname_tot = which(colnames(examples) == "LineTotal") -1
2) 使用 apply 对每一行执行此操作
examples = rbind(examples[, 1], as.data.frame(t(apply(examples[, -1], 1, function(x) {x = x*1000/x[colname_tot]})))
希望这对您有所帮助。
我有一个名为 examples 的数据框,其中记录了若干语义特征的出现次数:
> str(examples)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 50 obs. of 12 variables:
$ filename : chr "Text01" "Text02" "Text03" "Text04" ...
$ Control : num 1 3 0 0 0 6 0 1 0 1 ...
$ Economic : num 1 3 0 0 0 0 1 0 1 2 ...
$ ExternalVoices: num 1 2 1 1 1 2 1 4 0 1 ...
$ JobsSkills : num 0 0 0 0 0 2 0 3 0 0 ...
$ LegalStatus : num 0 3 4 0 5 0 1 0 4 0 ...
$ Modals : num 4 6 1 5 4 4 2 6 2 2 ...
$ Orign : num 2 6 8 6 3 5 3 3 2 6 ...
$ Sanctions : num 1 3 0 3 0 3 2 1 1 0 ...
$ Subjectivisms : num 2 3 4 4 3 2 1 1 2 4 ...
$ Verbs : num 3 7 3 11 6 2 7 7 4 5 ...
$ LineTotal : num 130 274 258 419 268 210 379 244 172 199 ...
- attr(*, "spec")=
.. cols(
.. filename = col_character(),
.. Control = col_double(),
.. Economic = col_double(),
.. ExternalVoices = col_double(),
.. JobsSkills = col_double(),
.. LegalStatus = col_double(),
.. Modals = col_double(),
.. Orign = col_double(),
.. Sanctions = col_double(),
.. Subjectivisms = col_double(),
.. Verbs = col_double(),
.. LineTotal = col_double()
.. )
head(examples)
A tibble: 6 x 12
filename Control Economic ExternalVoices JobsSkills LegalStatus Modals Orign Sanctions Subjectivisms Verbs LineTotal
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Text01 1 1 1 0 0 4 2 1 2 3 130
2 Text02 3 3 2 0 3 6 6 3 3 7 274
3 Text03 0 0 1 0 4 1 8 0 4 3 258
4 Text04 0 0 1 0 0 5 6 3 4 11 419
5 Text05 0 0 1 0 5 4 3 0 3 6 268
6 Text06 6 0 2 2 0 4 5 3 2 2 210
我需要应用一个公式:将每个单元格的值乘以 1000,然后除以该行的总数,也就是逐行除以 "LineTotal" 列中对应的值:
(cell × 1000) / LineTotal
LineTotal 的值每行各不相同。
我什至想不出办法来完成它。任何帮助都会很棒!
谢谢!
编辑
下面提供 dput(df) 的输出,以便复现(注意:该输出中的合计列名为 Total,而非上文的 LineTotal):
> dput(df)
structure(list(filename = c("Text01", "Text02", "Text03", "Text04",
"Text05", "Text06", "Text07", "Text08", "Text09", "Text10", "Text11",
"Text12", "Text13", "Text14", "Text15", "Text16", "Text17", "Text18",
"Text19", "Text20", "Text21", "Text22", "Text23", "Text24", "Text25",
"Text26", "Text27", "Text28", "Text29", "Text30", "Text31", "Text32",
"Text33", "Text34", "Text35", "Text36", "Text37", "Text38", "Text39",
"Text40", "Text41", "Text42", "Text43", "Text44", "Text45", "Text46",
"Text47", "Text48", "Text49", "Text50"), Control = c(1, 3, 0,
0, 0, 6, 0, 1, 0, 1, 1, 4, 0, 2, 0, 3, 3, 1, 1, 0, 0, 2, 1, 5,
5, 2, 0, 0, 1, 3, 3, 0, 0, 0, 4, 1, 1, 2, 0, 0, 0, 0, 4, 0, 3,
0, 2, 1, 0, 0), Economic = c(1, 3, 0, 0, 0, 0, 1, 0, 1, 2, 0,
1, 2, 2, 0, 1, 1, 4, 0, 1, 0, 0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 2,
1, 3, 7, 1, 2, 3, 0, 4, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2), ExternalVoices = c(1,
2, 1, 1, 1, 2, 1, 4, 0, 1, 0, 8, 6, 2, 0, 6, 1, 2, 3, 0, 1, 4,
2, 1, 2, 0, 0, 3, 1, 2, 1, 1, 4, 7, 5, 2, 1, 3, 0, 0, 2, 0, 3,
0, 4, 1, 1, 2, 0, 1), JobsSkills = c(0, 0, 0, 0, 0, 2, 0, 3,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), LegalStatus = c(0, 3, 4, 0, 5, 0, 1, 0, 4, 0, 1, 3, 2, 0,
0, 3, 0, 0, 1, 1, 0, 1, 0, 2, 3, 0, 3, 0, 2, 0, 2, 2, 12, 2,
0, 0, 3, 0, 1, 1, 5, 2, 0, 0, 5, 1, 7, 3, 1, 0), Modals = c(4,
6, 1, 5, 4, 4, 2, 6, 2, 2, 0, 5, 2, 1, 7, 5, 6, 1, 0, 0, 1, 2,
6, 0, 2, 8, 0, 3, 8, 0, 1, 2, 5, 13, 2, 7, 1, 2, 4, 0, 2, 4,
5, 8, 5, 0, 2, 7, 1, 3), Orign = c(2, 6, 8, 6, 3, 5, 3, 3, 2,
6, 1, 8, 2, 7, 8, 8, 12, 7, 6, 2, 3, 5, 5, 2, 2, 4, 2, 1, 7,
6, 5, 5, 11, 5, 7, 12, 6, 8, 5, 12, 12, 1, 4, 7, 7, 3, 6, 2,
3, 5), Sanctions = c(1, 3, 0, 3, 0, 3, 2, 1, 1, 0, 0, 3, 1, 2,
1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 5, 4, 3, 4, 2, 2, 0, 6, 6, 6, 0,
2, 4, 1, 4, 9, 1, 3, 3, 4, 2, 1, 4, 1, 0, 1), Subjectivisms = c(2,
3, 4, 4, 3, 2, 1, 1, 2, 4, 0, 5, 4, 2, 2, 2, 2, 5, 4, 0, 0, 4,
3, 2, 6, 5, 0, 4, 10, 4, 2, 0, 7, 3, 6, 3, 2, 6, 4, 7, 6, 1,
4, 3, 2, 0, 4, 5, 2, 4), Verbs = c(3, 7, 3, 11, 6, 2, 7, 7, 4,
5, 1, 9, 6, 7, 10, 6, 11, 7, 7, 2, 2, 8, 5, 8, 7, 8, 2, 6, 6,
6, 7, 4, 12, 10, 7, 11, 9, 10, 7, 21, 11, 3, 4, 9, 7, 3, 4, 6,
2, 10), Total = c(130, 274, 258, 419, 268, 210, 379, 244, 172,
199, 87, 462, 211, 251, 382, 313, 509, 287, 253, 123, 92, 269,
292, 313, 311, 361, 200, 261, 387, 261, 263, 293, 554, 587, 325,
562, 434, 315, 521, 660, 661, 202, 204, 297, 549, 161, 368, 288,
71, 341)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -50L), spec = structure(list(cols = list(
filename = structure(list(), class = c("collector_character",
"collector")), Control = structure(list(), class = c("collector_double",
"collector")), Economic = structure(list(), class = c("collector_double",
"collector")), ExternalVoices = structure(list(), class = c("collector_double",
"collector")), JobsSkills = structure(list(), class = c("collector_double",
"collector")), LegalStatus = structure(list(), class = c("collector_double",
"collector")), Modals = structure(list(), class = c("collector_double",
"collector")), Orign = structure(list(), class = c("collector_double",
"collector")), Sanctions = structure(list(), class = c("collector_double",
"collector")), Subjectivisms = structure(list(), class = c("collector_double",
"collector")), Verbs = structure(list(), class = c("collector_double",
"collector")), Total = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
谢谢!
我认为这就是您要找的内容(其中 example 是您的数据框):
example[,2:ncol(example)] <- lapply(2:ncol(example), function(X) example[,X]*1000/sum(example[,X]))
编辑:之前的问题在于数据框的第一列是字符型。这段新代码适用于您提供的示例数据。
这是使用 tidyverse 的一种可能的解决方案:
library(tidyverse)
examples %>%
mutate_at(
vars(-matches("filename|LineTotal")),
~ .x * 1000 / LineTotal
)
我想这就是您要找的东西
1) 确定合计列 "LineTotal" 的列索引(减 1 是因为下一步会先去掉第一列)
colname_tot = which(colnames(examples) == "LineTotal") -1
2) 使用 apply 对每一行执行此操作
examples = rbind(examples[, 1], as.data.frame(t(apply(examples[, -1], 1, function(x) {x = x*1000/x[colname_tot]})))
希望这对您有所帮助。