我遇到了一些 R / 海洋学问题......如何比较长度不同、小数位数也不同的两列
I got some R / Oceanographic problems...how to compare two columns of different length and digits
我有一个我无法解决的小问题,我有几个包含多个列的冗长数据集,两个 data.frames 的子集如下所示:
# First example data frame from the question (df1): three parallel numeric
# vectors of 45 values each, combined column-wise into PP14.
Temp <- c(
  12.9423, 12.9446, 12.9412, 12.9617, 12.9742,
  12.9652, 12.9463, 12.9847, 12.9778, 12.9589,
  12.9305, 12.9275, 12.8569, 12.8531, 12.9092,
  12.9471, 12.9298, 12.9266, 12.9374, 12.9385,
  12.9505, 12.9510, 12.9632, 12.9621, 12.9571,
  12.9492, 12.8988, 12.8895, 12.8777, 12.8956,
  12.8748, 12.7850, 12.7323, 12.7546, 12.7375,
  12.7020, 12.7172, 12.7015, 12.6960, 12.6944,
  12.6963, 12.6928, 12.6930, 12.6883, 12.6913
)

# Density values for PP14; this is the column later matched against
# MS14$density.
Density <- c(
  26.38635, 26.38531, 26.38429, 26.38336, 26.38268,
  26.38242, 26.38265, 26.38343, 26.38486, 26.38697,
  26.38945, 26.39188, 26.39365, 26.39424, 26.39376,
  26.39250, 26.39084, 26.38912, 26.38744, 26.38587,
  26.38456, 26.38367, 26.38341, 26.38398, 26.38547,
  26.38793, 26.39120, 26.39509, 26.39955, 26.40455,
  26.41002, 26.41578, 26.42126, 26.42593, 26.42968,
  26.43255, 26.43463, 26.43603, 26.43693, 26.43750,
  26.43787, 26.43815, 26.43841, 26.43871, 26.43904
)

po4 <- c(
  0.4239840, 0.4351156, 0.4456128, 0.4542392, 0.4608510,
  0.4656445, 0.4690847, 0.4717291, 0.4742391, 0.4774904,
  0.4831152, 0.4922122, 0.5029904, 0.5128720, 0.5190209,
  0.5191368, 0.5133212, 0.5027542, 0.4905301, 0.4796467,
  0.4708035, 0.4638879, 0.4578364, 0.4519745, 0.4481336,
  0.4483697, 0.4531310, 0.4622930, 0.4750474, 0.4905152,
  0.5082183, 0.5278212, 0.5491580, 0.5720519, 0.5961127,
  0.6207716, 0.6449603, 0.6675704, 0.6878331, 0.7051851,
  0.7195461, 0.7305200, 0.7359634, 0.7343541, 0.7283988
)

# df1: one row per position in the three vectors above.
PP14 <- data.frame(Temp = Temp, Density = Density, po4 = po4)
# Second example data frame from the question (df2): three parallel numeric
# vectors of 40 values each, combined column-wise into MS14.
temp <- c(
  13.13875, 13.13477, 13.12337, 13.10662, 13.09798,
  13.09542, 13.08734, 13.07616, 13.06671, 13.05899,
  13.05890, 13.05293, 13.03322, 13.01515, 13.02552,
  13.01668, 12.99829, 12.97075, 12.95572, 12.95045,
  12.94541, 12.94365, 12.94609, 12.94256, 12.93565,
  12.93258, 12.93489, 12.93209, 12.92219, 12.90730,
  12.90416, 12.89974, 12.89749, 12.89626, 12.89395,
  12.89315, 12.89274, 12.89276, 12.89293, 12.89302
)

# Density values for MS14; this is the column later matched against
# PP14$Density.
density <- c(
  26.35897, 26.36274, 26.36173, 26.36401, 26.36507,
  26.36662, 26.36838, 26.36996, 26.37286, 26.37452,
  26.37402, 26.37571, 26.37776, 26.38008, 26.37959,
  26.38178, 26.38642, 26.39158, 26.39350, 26.39467,
  26.39601, 26.39601, 26.39596, 26.39517, 26.39728,
  26.39766, 26.39774, 26.39699, 26.40081, 26.40328,
  26.40416, 26.40486, 26.40513, 26.40474, 26.40552,
  26.40584, 26.40613, 26.40602, 26.40595, 26.40498
)

# NOTE(review): the first value (-9.999999e+06) is several orders of
# magnitude away from the rest and looks like a missing-value sentinel;
# confirm with the data owner before using it in calculations.
krho <- c(
  -9.999999e+06, -1.786843e+00, -9.142976e-01, -9.650734e-01,
  -2.532397e+00, -3.760537e+00, -2.622484e+00, -1.776506e+00,
  -2.028391e+00, -2.225910e+00, -3.486826e+00, -2.062341e-01,
  -3.010643e+00, -3.878437e+00, -3.796426e+00, -3.227138e+00,
  -3.335446e+00, -3.738037e+00, -4.577778e+00, -3.818099e+00,
  -3.891467e+00, -4.585045e+00, -3.150283e+00, -4.371089e+00,
  -3.902601e+00, -4.546019e+00, -3.932538e+00, -4.331247e+00,
  -4.508137e+00, -4.789201e+00, -4.383820e+00, -4.423486e+00,
  -4.334641e+00, -4.330544e+00, -4.838604e+00, -4.729123e+00,
  -4.381797e+00, -4.207365e+00, -4.276804e+00, -4.001305e+00
)

# df2: one row per position in the three vectors above.
MS14 <- data.frame(temp = temp, density = density, krho = krho)
我需要将 MS14 的密度与 PP14 的密度在 ±0.01 的容差范围内进行比较。如果使用 ==,两者永远不会相等,因为它们都有 5 位小数,没有任何值会完全相同...
为了解决这个问题,我认为应该在 ±0.01 的容差范围内比较 MS14$density 和 PP14$Density:只要两者相近(比较结果为真),就把 MS14 和 PP14 中对应行的所有列的原始值一起保存下来,这样最终得到一个包含所有列的 df3。之后我需要计算 F = -krho * dPO4/dz,因此必须让这些值保持“按密度排序”...
有什么想法和建议吗?
非常感谢!
好吧,您遇到的一个问题是列的长度不同。
但假设您将它们切成相同的长度,这可能会有所帮助:
library(tidyverse)
# Flag rows whose densities agree within +/- 0.01, comparing row i of MS14
# with row i of PP14.
# PP14 has 45 rows while MS14 has only 40, so the comparison is restricted
# to the rows both frames share. Comparing the full-length vectors gives an
# ill-defined element-wise result that cannot be assigned back into the
# 40-row MS14.
n_common <- min(nrow(MS14), nrow(PP14))
common <- seq_len(n_common)
similar <- dplyr::near(MS14$density[common], PP14$Density[common], tol = 0.01)

# Rows beyond the common length have no partner row, so they are flagged
# FALSE.
MS14$similar <- c(similar, rep(FALSE, nrow(MS14) - n_common))
PP14$similar <- c(similar, rep(FALSE, nrow(PP14) - n_common))

# Show the flagged rows of each frame.
MS14[MS14$similar, ]
PP14[PP14$similar, ]

# Both subsets contain one row per TRUE in `similar`, so they have the same
# number of rows and can be bound side by side.
df3 <- na.omit(cbind.data.frame(MS14[MS14$similar, ], PP14[PP14$similar, ]))
temp density krho similar Temp Density po4 similar
17 12.99829 26.38642 -3.335446 TRUE 12.9298 26.39084 0.5133212 TRUE
18 12.97075 26.39158 -3.738037 TRUE 12.9266 26.38912 0.5027542 TRUE
19 12.95572 26.39350 -4.577778 TRUE 12.9374 26.38744 0.4905301 TRUE
20 12.95045 26.39467 -3.818099 TRUE 12.9385 26.38587 0.4796467 TRUE
26 12.93258 26.39766 -4.546019 TRUE 12.9492 26.38793 0.4483697 TRUE
27 12.93489 26.39774 -3.932538 TRUE 12.8988 26.39120 0.4531310 TRUE
28 12.93209 26.39699 -4.331247 TRUE 12.8895 26.39509 0.4622930 TRUE
29 12.92219 26.40081 -4.508137 TRUE 12.8777 26.39955 0.4750474 TRUE
30 12.90730 26.40328 -4.789201 TRUE 12.8956 26.40455 0.4905152 TRUE
31 12.90416 26.40416 -4.383820 TRUE 12.8748 26.41002 0.5082183 TRUE
好的,根据您的回复:“无需在 df3 中获取 45 行,只需所有相互匹配的密度即可。”
我想你可以试试这个:
library(fuzzyjoin)
library(dplyr)
# Pair every PP14 row with every MS14 row whose density lies within
# +/- 0.01 of it, then order the pairs by the MS14 density.
# An inner join is used so that PP14 rows with no MS14 partner are dropped
# instead of being kept with NA in temp/density/krho: only mutually
# matching densities are wanted, and an NA krho would carry through into
# the later F = -krho * dPO4/dz calculation.
# One PP14 row can match several MS14 rows (and vice versa), so df3 may
# contain more rows than either input.
df3 <- difference_inner_join(
  PP14, MS14,
  by = c("Density" = "density"),
  max_dist = 0.01
) %>%
  arrange(density)
head(df3)
Temp Density po4 temp density krho
1 12.9742 26.38268 0.4608510 13.06671 26.37286 -2.028391
2 12.9652 26.38242 0.4656445 13.06671 26.37286 -2.028391
3 12.9463 26.38265 0.4690847 13.06671 26.37286 -2.028391
4 12.9617 26.38336 0.4542392 13.05890 26.37402 -3.486826
5 12.9742 26.38268 0.4608510 13.05890 26.37402 -3.486826
6 12.9652 26.38242 0.4656445 13.05890 26.37402 -3.486826
我有一个我无法解决的小问题,我有几个包含多个列的冗长数据集,两个 data.frames 的子集如下所示:
# First example data frame from the question (df1): three parallel numeric
# vectors of 45 values each, combined column-wise into PP14.
Temp <- c(
  12.9423, 12.9446, 12.9412, 12.9617, 12.9742,
  12.9652, 12.9463, 12.9847, 12.9778, 12.9589,
  12.9305, 12.9275, 12.8569, 12.8531, 12.9092,
  12.9471, 12.9298, 12.9266, 12.9374, 12.9385,
  12.9505, 12.9510, 12.9632, 12.9621, 12.9571,
  12.9492, 12.8988, 12.8895, 12.8777, 12.8956,
  12.8748, 12.7850, 12.7323, 12.7546, 12.7375,
  12.7020, 12.7172, 12.7015, 12.6960, 12.6944,
  12.6963, 12.6928, 12.6930, 12.6883, 12.6913
)

# Density values for PP14; this is the column later matched against
# MS14$density.
Density <- c(
  26.38635, 26.38531, 26.38429, 26.38336, 26.38268,
  26.38242, 26.38265, 26.38343, 26.38486, 26.38697,
  26.38945, 26.39188, 26.39365, 26.39424, 26.39376,
  26.39250, 26.39084, 26.38912, 26.38744, 26.38587,
  26.38456, 26.38367, 26.38341, 26.38398, 26.38547,
  26.38793, 26.39120, 26.39509, 26.39955, 26.40455,
  26.41002, 26.41578, 26.42126, 26.42593, 26.42968,
  26.43255, 26.43463, 26.43603, 26.43693, 26.43750,
  26.43787, 26.43815, 26.43841, 26.43871, 26.43904
)

po4 <- c(
  0.4239840, 0.4351156, 0.4456128, 0.4542392, 0.4608510,
  0.4656445, 0.4690847, 0.4717291, 0.4742391, 0.4774904,
  0.4831152, 0.4922122, 0.5029904, 0.5128720, 0.5190209,
  0.5191368, 0.5133212, 0.5027542, 0.4905301, 0.4796467,
  0.4708035, 0.4638879, 0.4578364, 0.4519745, 0.4481336,
  0.4483697, 0.4531310, 0.4622930, 0.4750474, 0.4905152,
  0.5082183, 0.5278212, 0.5491580, 0.5720519, 0.5961127,
  0.6207716, 0.6449603, 0.6675704, 0.6878331, 0.7051851,
  0.7195461, 0.7305200, 0.7359634, 0.7343541, 0.7283988
)

# df1: one row per position in the three vectors above.
PP14 <- data.frame(Temp = Temp, Density = Density, po4 = po4)
# Second example data frame from the question (df2): three parallel numeric
# vectors of 40 values each, combined column-wise into MS14.
temp <- c(
  13.13875, 13.13477, 13.12337, 13.10662, 13.09798,
  13.09542, 13.08734, 13.07616, 13.06671, 13.05899,
  13.05890, 13.05293, 13.03322, 13.01515, 13.02552,
  13.01668, 12.99829, 12.97075, 12.95572, 12.95045,
  12.94541, 12.94365, 12.94609, 12.94256, 12.93565,
  12.93258, 12.93489, 12.93209, 12.92219, 12.90730,
  12.90416, 12.89974, 12.89749, 12.89626, 12.89395,
  12.89315, 12.89274, 12.89276, 12.89293, 12.89302
)

# Density values for MS14; this is the column later matched against
# PP14$Density.
density <- c(
  26.35897, 26.36274, 26.36173, 26.36401, 26.36507,
  26.36662, 26.36838, 26.36996, 26.37286, 26.37452,
  26.37402, 26.37571, 26.37776, 26.38008, 26.37959,
  26.38178, 26.38642, 26.39158, 26.39350, 26.39467,
  26.39601, 26.39601, 26.39596, 26.39517, 26.39728,
  26.39766, 26.39774, 26.39699, 26.40081, 26.40328,
  26.40416, 26.40486, 26.40513, 26.40474, 26.40552,
  26.40584, 26.40613, 26.40602, 26.40595, 26.40498
)

# NOTE(review): the first value (-9.999999e+06) is several orders of
# magnitude away from the rest and looks like a missing-value sentinel;
# confirm with the data owner before using it in calculations.
krho <- c(
  -9.999999e+06, -1.786843e+00, -9.142976e-01, -9.650734e-01,
  -2.532397e+00, -3.760537e+00, -2.622484e+00, -1.776506e+00,
  -2.028391e+00, -2.225910e+00, -3.486826e+00, -2.062341e-01,
  -3.010643e+00, -3.878437e+00, -3.796426e+00, -3.227138e+00,
  -3.335446e+00, -3.738037e+00, -4.577778e+00, -3.818099e+00,
  -3.891467e+00, -4.585045e+00, -3.150283e+00, -4.371089e+00,
  -3.902601e+00, -4.546019e+00, -3.932538e+00, -4.331247e+00,
  -4.508137e+00, -4.789201e+00, -4.383820e+00, -4.423486e+00,
  -4.334641e+00, -4.330544e+00, -4.838604e+00, -4.729123e+00,
  -4.381797e+00, -4.207365e+00, -4.276804e+00, -4.001305e+00
)

# df2: one row per position in the three vectors above.
MS14 <- data.frame(temp = temp, density = density, krho = krho)
我需要将 MS14 的密度与 PP14 的密度在 ±0.01 的容差范围内进行比较。如果使用 ==,两者永远不会相等,因为它们都有 5 位小数,没有任何值会完全相同...
为了解决这个问题,我认为应该在 ±0.01 的容差范围内比较 MS14$density 和 PP14$Density:只要两者相近(比较结果为真),就把 MS14 和 PP14 中对应行的所有列的原始值一起保存下来,这样最终得到一个包含所有列的 df3。之后我需要计算 F = -krho * dPO4/dz,因此必须让这些值保持“按密度排序”...
有什么想法和建议吗? 非常感谢!
好吧,您遇到的一个问题是列的长度不同。
但假设您将它们切成相同的长度,这可能会有所帮助:
library(tidyverse)
# Flag rows whose densities agree within +/- 0.01, comparing row i of MS14
# with row i of PP14.
# PP14 has 45 rows while MS14 has only 40, so the comparison is restricted
# to the rows both frames share. Comparing the full-length vectors gives an
# ill-defined element-wise result that cannot be assigned back into the
# 40-row MS14.
n_common <- min(nrow(MS14), nrow(PP14))
common <- seq_len(n_common)
similar <- dplyr::near(MS14$density[common], PP14$Density[common], tol = 0.01)

# Rows beyond the common length have no partner row, so they are flagged
# FALSE.
MS14$similar <- c(similar, rep(FALSE, nrow(MS14) - n_common))
PP14$similar <- c(similar, rep(FALSE, nrow(PP14) - n_common))

# Show the flagged rows of each frame.
MS14[MS14$similar, ]
PP14[PP14$similar, ]

# Both subsets contain one row per TRUE in `similar`, so they have the same
# number of rows and can be bound side by side.
df3 <- na.omit(cbind.data.frame(MS14[MS14$similar, ], PP14[PP14$similar, ]))
temp density krho similar Temp Density po4 similar
17 12.99829 26.38642 -3.335446 TRUE 12.9298 26.39084 0.5133212 TRUE
18 12.97075 26.39158 -3.738037 TRUE 12.9266 26.38912 0.5027542 TRUE
19 12.95572 26.39350 -4.577778 TRUE 12.9374 26.38744 0.4905301 TRUE
20 12.95045 26.39467 -3.818099 TRUE 12.9385 26.38587 0.4796467 TRUE
26 12.93258 26.39766 -4.546019 TRUE 12.9492 26.38793 0.4483697 TRUE
27 12.93489 26.39774 -3.932538 TRUE 12.8988 26.39120 0.4531310 TRUE
28 12.93209 26.39699 -4.331247 TRUE 12.8895 26.39509 0.4622930 TRUE
29 12.92219 26.40081 -4.508137 TRUE 12.8777 26.39955 0.4750474 TRUE
30 12.90730 26.40328 -4.789201 TRUE 12.8956 26.40455 0.4905152 TRUE
31 12.90416 26.40416 -4.383820 TRUE 12.8748 26.41002 0.5082183 TRUE
好的,根据您的回复:“无需在 df3 中获取 45 行,只需所有相互匹配的密度即可。” 我想你可以试试这个:
library(fuzzyjoin)
library(dplyr)
# Pair every PP14 row with every MS14 row whose density lies within
# +/- 0.01 of it, then order the pairs by the MS14 density.
# An inner join is used so that PP14 rows with no MS14 partner are dropped
# instead of being kept with NA in temp/density/krho: only mutually
# matching densities are wanted, and an NA krho would carry through into
# the later F = -krho * dPO4/dz calculation.
# One PP14 row can match several MS14 rows (and vice versa), so df3 may
# contain more rows than either input.
df3 <- difference_inner_join(
  PP14, MS14,
  by = c("Density" = "density"),
  max_dist = 0.01
) %>%
  arrange(density)
head(df3)
Temp Density po4 temp density krho
1 12.9742 26.38268 0.4608510 13.06671 26.37286 -2.028391
2 12.9652 26.38242 0.4656445 13.06671 26.37286 -2.028391
3 12.9463 26.38265 0.4690847 13.06671 26.37286 -2.028391
4 12.9617 26.38336 0.4542392 13.05890 26.37402 -3.486826
5 12.9742 26.38268 0.4608510 13.05890 26.37402 -3.486826
6 12.9652 26.38242 0.4656445 13.05890 26.37402 -3.486826