如何解析化学式以获得原子成分?
How to parse a chemical formula to get its atomic composition?
我的数据集存储在一个名为 `Formula` 的单列数据框中,如下所示:
row.identity..main.ID.
C5H6O2N3
C10H12N
C5H6O2N3S
我想扩展当前的表:为每个元素字母新增一列,并在每一行中填入该元素对应的原子数。基本上我想要这样的结果:
row.identity..main.ID. C H O N S X
C5H6O2N3 5 6 2 3 0 0
C10H12N 10 12 0 1 0 0
C5H6O2N3S 5 6 2 3 1 0
如果代码能够灵活处理具有不同字母的更长数据集,那就太好了。到目前为止,我尝试实施 Onyambu 的解决方案。
library(tidyverse)
library(stringr)
# Asker's attempt, adapted from another answer; it does NOT produce the desired
# table. `elements` and `value` are extracted independently, so a formula that
# contains a symbol without an explicit count (e.g. the trailing "N" in
# "C10H12N") yields list-columns of different lengths, and `unnest()` is called
# without naming the columns to unnest.
# Regex backslashes are doubled because they sit inside R string literals.
Formula %>%
  mutate(
    row.identity..main.ID. = gsub(
      "\\b([A-Za-z]+)\\b", "\\30", row.identity..main.ID.
    ),
    elements = str_extract_all(row.identity..main.ID., "[A-Za-z]+"),
    value = str_extract_all(row.identity..main.ID., "\\d+")
  ) %>%
  unnest() %>%
  pivot_wider(elements, value, fill = 0)
然而,这会导致一些错误,例如“Incompatible lengths: 4, 3”(长度不兼容),和/或提示现在使用 unnest() 时必须提供 `cols` 参数。
您可以试试下面的代码
# Count the atoms of each element in every formula of `df$ID` and append one
# integer column per element to `df`.
#
# Each formula is split into tokens made of an element symbol (one upper-case
# letter, optionally followed by one lower-case letter) and an optional count.
# A missing count means one atom, wherever the symbol appears in the formula.
# Repeated symbols are summed; symbols not listed in `levels` are ignored.
df <- cbind(
  df,
  do.call(
    rbind,
    lapply(as.character(df$ID), function(formula) {
      # Backslash-free pattern: "[A-z]" would also match "[", "\", "]", "^",
      # "_" and "`", and "[A-z]+" would glue adjacent symbols ("CH4") together.
      tokens <- regmatches(
        formula,
        gregexpr("[A-Z][a-z]?[0-9]*", formula)
      )[[1L]]
      symbol <- sub("[0-9]+$", "", tokens)
      n <- sub("^[A-Za-z]+", "", tokens)
      n[n == ""] <- "1"  # implicit count, e.g. the "N" in "C10H12N"
      # Repeat each symbol `n` times and tabulate against a fixed set of
      # levels so every row has the same columns, with 0 for absent elements.
      table(
        factor(
          rep(symbol, as.integer(n)),
          levels = c("C", "H", "O", "N", "S", "X")
        )
      )
    })
  )
)
这给出了
> df
ID C H O N S X
1 C5H6O2N3 5 6 2 3 0 0
2 C10H12N 10 12 0 1 0 0
3 C5H6O2N3S 5 6 2 3 1 0
数据
> dput(df)
structure(list(ID = c("C5H6O2N3", "C10H12N", "C5H6O2N3S"), C = c(5L,
10L, 5L), H = c(6L, 12L, 6L), O = c(2L, 0L, 2L), N = c(3L, 1L,
3L), S = c(0L, 0L, 1L), X = c(0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
下面是使用 purrr 的方法:
library(purrr); library(dplyr); library(stringr)
# For every row of `Formula`, compute one count per upper-case letter A-Z
# (single-letter symbols only) and bind the per-row results into one table.
Formula %>%
# Outer lambda: `c(...)` collects the values of the current row.
# Inner lambda: `.x` is a row value, `.y` is one letter from LETTERS.
pmap_dfr(~map2_dbl(c(...),LETTERS,
# Digits directly following the first match of the letter, as integer
# (NA when the letter is absent or has no explicit count).
~max((str_extract(.x,paste0("(?<=",.y,")[0-9]+"))%>% as.integer),
# TRUE (treated as 1 by max) when the letter occurs at all, NA otherwise,
# so a letter without an explicit count is reported as 1.
(str_extract(.x,.y) == .y),
na.rm = TRUE)) %>%
# max() over only-NA input yields -Inf; report such letters as 0.
replace(is.infinite(.),0) %>%
set_names(LETTERS)) %>%
# Keep only the letters that occur in at least one formula.
select_if(~sum(.) > 0)
# A tibble: 3 x 5
C H N O S
<dbl> <dbl> <dbl> <dbl> <dbl>
1 5 6 3 2 0
2 10 12 1 0 0
3 5 6 3 2 1
数据
# Sample input: a one-column data frame holding three chemical formulas.
Formula <- data.frame(
  row.identity..main.ID. = c("C5H6O2N3", "C10H12N", "C5H6O2N3S"),
  stringsAsFactors = FALSE
)
你也可以这样做:
# Turn each formula in the first column of `df` into "Symbol:count" lines,
# parse those lines as DCF records, and bind the per-formula records into one
# table of atom counts appended to `df`.
#
# Note: only a trailing single-letter symbol receives an implicit count of 1;
# an uncounted symbol in the middle of a formula is not handled here.
# Regex backslashes are doubled because they sit inside R string literals.
a <- sub("([A-Z]$)", "\\1:1", gsub("(\\D+)(\\d+)", "\\1:\\2\n", df[, 1]))
# lapply() always returns a list, which is what rbind.fill() expects; sapply()
# may simplify the result when every formula has the same number of elements.
e <- lapply(a, function(x) {
  con <- textConnection(x)
  on.exit(close(con))  # do not leak the connection
  counts <- read.dcf(con)
  storage.mode(counts) <- "integer"  # read.dcf() returns character fields
  data.frame(counts)
})
f <- cbind(df, plyr::rbind.fill(e))
f[is.na(f)] <- 0L  # elements absent from a formula
f
row.identity..main.ID. C H O N S
1 C5H6O2N3 5 6 2 3 0
2 C10H12N 10 12 0 1 0
3 C5H6O2N3S 5 6 2 3 1
另一种选择是先将文本转换为 JSON,然后再将其读入 R:
# Rewrite each formula as the body of a JSON object ("C":5,"H":6,...), join
# the objects into one JSON array, and let jsonlite turn the array into a data
# frame with one column per element (NA where a formula lacks that element).
# Regex backslashes are doubled because they sit inside R string literals.
a <- gsub("(\\D)(\\d+)", '"\\1":\\2,', df[, 1])
# Drop the trailing comma, then give a trailing uncounted symbol a count of 1.
b <- gsub("([A-Z])$", '"\\1":1', trimws(a, whitespace = ","))
# Store the combined table so the NA replacement below works on this result
# rather than on an `f` left over from earlier code.
f <- cbind(
  df,
  jsonlite::fromJSON(sprintf("[{%s}]", paste(b, collapse = "}, {")))
)
replace(f, is.na(f), 0)
row.identity..main.ID. C H O N S
1 C5H6O2N3 5 6 2 3 0
2 C10H12N 10 12 0 1 0
3 C5H6O2N3S 5 6 2 3 1
我的数据集存储在一个名为 `Formula` 的单列数据框中,如下所示:
row.identity..main.ID.
C5H6O2N3
C10H12N
C5H6O2N3S
我想扩展当前的表:为每个元素字母新增一列,并在每一行中填入该元素对应的原子数。基本上我想要这样的结果:
row.identity..main.ID. C H O N S X
C5H6O2N3 5 6 2 3 0 0
C10H12N 10 12 0 1 0 0
C5H6O2N3S 5 6 2 3 1 0
如果代码能够灵活处理具有不同字母的更长数据集,那就太好了。到目前为止,我尝试实施 Onyambu 的解决方案。
library(tidyverse)
library(stringr)
# Asker's attempt, adapted from another answer; it does NOT produce the desired
# table. `elements` and `value` are extracted independently, so a formula that
# contains a symbol without an explicit count (e.g. the trailing "N" in
# "C10H12N") yields list-columns of different lengths, and `unnest()` is called
# without naming the columns to unnest.
# Regex backslashes are doubled because they sit inside R string literals.
Formula %>%
  mutate(
    row.identity..main.ID. = gsub(
      "\\b([A-Za-z]+)\\b", "\\30", row.identity..main.ID.
    ),
    elements = str_extract_all(row.identity..main.ID., "[A-Za-z]+"),
    value = str_extract_all(row.identity..main.ID., "\\d+")
  ) %>%
  unnest() %>%
  pivot_wider(elements, value, fill = 0)
然而,这会导致一些错误,例如“Incompatible lengths: 4, 3”(长度不兼容),和/或提示现在使用 unnest() 时必须提供 `cols` 参数。
您可以试试下面的代码
# Count the atoms of each element in every formula of `df$ID` and append one
# integer column per element to `df`.
#
# Each formula is split into tokens made of an element symbol (one upper-case
# letter, optionally followed by one lower-case letter) and an optional count.
# A missing count means one atom, wherever the symbol appears in the formula.
# Repeated symbols are summed; symbols not listed in `levels` are ignored.
df <- cbind(
  df,
  do.call(
    rbind,
    lapply(as.character(df$ID), function(formula) {
      # Backslash-free pattern: "[A-z]" would also match "[", "\", "]", "^",
      # "_" and "`", and "[A-z]+" would glue adjacent symbols ("CH4") together.
      tokens <- regmatches(
        formula,
        gregexpr("[A-Z][a-z]?[0-9]*", formula)
      )[[1L]]
      symbol <- sub("[0-9]+$", "", tokens)
      n <- sub("^[A-Za-z]+", "", tokens)
      n[n == ""] <- "1"  # implicit count, e.g. the "N" in "C10H12N"
      # Repeat each symbol `n` times and tabulate against a fixed set of
      # levels so every row has the same columns, with 0 for absent elements.
      table(
        factor(
          rep(symbol, as.integer(n)),
          levels = c("C", "H", "O", "N", "S", "X")
        )
      )
    })
  )
)
这给出了
> df
ID C H O N S X
1 C5H6O2N3 5 6 2 3 0 0
2 C10H12N 10 12 0 1 0 0
3 C5H6O2N3S 5 6 2 3 1 0
数据
> dput(df)
structure(list(ID = c("C5H6O2N3", "C10H12N", "C5H6O2N3S"), C = c(5L,
10L, 5L), H = c(6L, 12L, 6L), O = c(2L, 0L, 2L), N = c(3L, 1L,
3L), S = c(0L, 0L, 1L), X = c(0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
下面是使用 purrr 的方法:
library(purrr); library(dplyr); library(stringr)
# For every row of `Formula`, compute one count per upper-case letter A-Z
# (single-letter symbols only) and bind the per-row results into one table.
Formula %>%
# Outer lambda: `c(...)` collects the values of the current row.
# Inner lambda: `.x` is a row value, `.y` is one letter from LETTERS.
pmap_dfr(~map2_dbl(c(...),LETTERS,
# Digits directly following the first match of the letter, as integer
# (NA when the letter is absent or has no explicit count).
~max((str_extract(.x,paste0("(?<=",.y,")[0-9]+"))%>% as.integer),
# TRUE (treated as 1 by max) when the letter occurs at all, NA otherwise,
# so a letter without an explicit count is reported as 1.
(str_extract(.x,.y) == .y),
na.rm = TRUE)) %>%
# max() over only-NA input yields -Inf; report such letters as 0.
replace(is.infinite(.),0) %>%
set_names(LETTERS)) %>%
# Keep only the letters that occur in at least one formula.
select_if(~sum(.) > 0)
# A tibble: 3 x 5
C H N O S
<dbl> <dbl> <dbl> <dbl> <dbl>
1 5 6 3 2 0
2 10 12 1 0 0
3 5 6 3 2 1
数据
# Sample input: a one-column data frame holding three chemical formulas.
Formula <- data.frame(
  row.identity..main.ID. = c("C5H6O2N3", "C10H12N", "C5H6O2N3S"),
  stringsAsFactors = FALSE
)
你也可以这样做:
# Turn each formula in the first column of `df` into "Symbol:count" lines,
# parse those lines as DCF records, and bind the per-formula records into one
# table of atom counts appended to `df`.
#
# Note: only a trailing single-letter symbol receives an implicit count of 1;
# an uncounted symbol in the middle of a formula is not handled here.
# Regex backslashes are doubled because they sit inside R string literals.
a <- sub("([A-Z]$)", "\\1:1", gsub("(\\D+)(\\d+)", "\\1:\\2\n", df[, 1]))
# lapply() always returns a list, which is what rbind.fill() expects; sapply()
# may simplify the result when every formula has the same number of elements.
e <- lapply(a, function(x) {
  con <- textConnection(x)
  on.exit(close(con))  # do not leak the connection
  counts <- read.dcf(con)
  storage.mode(counts) <- "integer"  # read.dcf() returns character fields
  data.frame(counts)
})
f <- cbind(df, plyr::rbind.fill(e))
f[is.na(f)] <- 0L  # elements absent from a formula
f
row.identity..main.ID. C H O N S
1 C5H6O2N3 5 6 2 3 0
2 C10H12N 10 12 0 1 0
3 C5H6O2N3S 5 6 2 3 1
另一种选择是先将文本转换为 JSON,然后再将其读入 R:
# Rewrite each formula as the body of a JSON object ("C":5,"H":6,...), join
# the objects into one JSON array, and let jsonlite turn the array into a data
# frame with one column per element (NA where a formula lacks that element).
# Regex backslashes are doubled because they sit inside R string literals.
a <- gsub("(\\D)(\\d+)", '"\\1":\\2,', df[, 1])
# Drop the trailing comma, then give a trailing uncounted symbol a count of 1.
b <- gsub("([A-Z])$", '"\\1":1', trimws(a, whitespace = ","))
# Store the combined table so the NA replacement below works on this result
# rather than on an `f` left over from earlier code.
f <- cbind(
  df,
  jsonlite::fromJSON(sprintf("[{%s}]", paste(b, collapse = "}, {")))
)
replace(f, is.na(f), 0)
row.identity..main.ID. C H O N S
1 C5H6O2N3 5 6 2 3 0
2 C10H12N 10 12 0 1 0
3 C5H6O2N3S 5 6 2 3 1