我遇到了一个 R / 海洋学问题……如何比较长度和小数位数都不同的两列

I got some R / Oceanographic problems...how to compare two columns of different length and digits

我有一个无法解决的小问题:我有几个包含多列的很长的数据集,其中两个 data.frame 的子集如下所示:

# Sample subset of the first data set (df1): three parallel numeric vectors
# of 45 values each, combined column-wise into the data frame `PP14`.

# Values for the `Temp` column.
Temp <- c(
  12.9423, 12.9446, 12.9412, 12.9617, 12.9742,
  12.9652, 12.9463, 12.9847, 12.9778, 12.9589,
  12.9305, 12.9275, 12.8569, 12.8531, 12.9092,
  12.9471, 12.9298, 12.9266, 12.9374, 12.9385,
  12.9505, 12.9510, 12.9632, 12.9621, 12.9571,
  12.9492, 12.8988, 12.8895, 12.8777, 12.8956,
  12.8748, 12.7850, 12.7323, 12.7546, 12.7375,
  12.7020, 12.7172, 12.7015, 12.6960, 12.6944,
  12.6963, 12.6928, 12.6930, 12.6883, 12.6913
)

# Values for the `Density` column (the key later compared with MS14$density).
Density <- c(
  26.38635, 26.38531, 26.38429, 26.38336, 26.38268,
  26.38242, 26.38265, 26.38343, 26.38486, 26.38697,
  26.38945, 26.39188, 26.39365, 26.39424, 26.39376,
  26.39250, 26.39084, 26.38912, 26.38744, 26.38587,
  26.38456, 26.38367, 26.38341, 26.38398, 26.38547,
  26.38793, 26.39120, 26.39509, 26.39955, 26.40455,
  26.41002, 26.41578, 26.42126, 26.42593, 26.42968,
  26.43255, 26.43463, 26.43603, 26.43693, 26.43750,
  26.43787, 26.43815, 26.43841, 26.43871, 26.43904
)

# Values for the `po4` column.
po4 <- c(
  0.4239840, 0.4351156, 0.4456128, 0.4542392, 0.4608510,
  0.4656445, 0.4690847, 0.4717291, 0.4742391, 0.4774904,
  0.4831152, 0.4922122, 0.5029904, 0.5128720, 0.5190209,
  0.5191368, 0.5133212, 0.5027542, 0.4905301, 0.4796467,
  0.4708035, 0.4638879, 0.4578364, 0.4519745, 0.4481336,
  0.4483697, 0.4531310, 0.4622930, 0.4750474, 0.4905152,
  0.5082183, 0.5278212, 0.5491580, 0.5720519, 0.5961127,
  0.6207716, 0.6449603, 0.6675704, 0.6878331, 0.7051851,
  0.7195461, 0.7305200, 0.7359634, 0.7343541, 0.7283988
)

# df1: 45 rows x 3 columns (Temp, Density, po4).
PP14 <- data.frame(Temp = Temp, Density = Density, po4 = po4)

# Sample subset of the second data set (df2): three parallel numeric vectors
# of 40 values each, combined column-wise into the data frame `MS14`.

# Values for the `temp` column.
temp <- c(
  13.13875, 13.13477, 13.12337, 13.10662, 13.09798,
  13.09542, 13.08734, 13.07616, 13.06671, 13.05899,
  13.05890, 13.05293, 13.03322, 13.01515, 13.02552,
  13.01668, 12.99829, 12.97075, 12.95572, 12.95045,
  12.94541, 12.94365, 12.94609, 12.94256, 12.93565,
  12.93258, 12.93489, 12.93209, 12.92219, 12.90730,
  12.90416, 12.89974, 12.89749, 12.89626, 12.89395,
  12.89315, 12.89274, 12.89276, 12.89293, 12.89302
)

# Values for the `density` column (the key later compared with PP14$Density).
density <- c(
  26.35897, 26.36274, 26.36173, 26.36401, 26.36507,
  26.36662, 26.36838, 26.36996, 26.37286, 26.37452,
  26.37402, 26.37571, 26.37776, 26.38008, 26.37959,
  26.38178, 26.38642, 26.39158, 26.39350, 26.39467,
  26.39601, 26.39601, 26.39596, 26.39517, 26.39728,
  26.39766, 26.39774, 26.39699, 26.40081, 26.40328,
  26.40416, 26.40486, 26.40513, 26.40474, 26.40552,
  26.40584, 26.40613, 26.40602, 26.40595, 26.40498
)

# Values for the `krho` column.
# NOTE(review): the first entry (-9.999999e+06) is orders of magnitude away
# from the rest and looks like a fill/missing-value sentinel -- confirm.
krho <- c(
  -9.999999e+06, -1.786843e+00, -9.142976e-01, -9.650734e-01,
  -2.532397e+00, -3.760537e+00, -2.622484e+00, -1.776506e+00,
  -2.028391e+00, -2.225910e+00, -3.486826e+00, -2.062341e-01,
  -3.010643e+00, -3.878437e+00, -3.796426e+00, -3.227138e+00,
  -3.335446e+00, -3.738037e+00, -4.577778e+00, -3.818099e+00,
  -3.891467e+00, -4.585045e+00, -3.150283e+00, -4.371089e+00,
  -3.902601e+00, -4.546019e+00, -3.932538e+00, -4.331247e+00,
  -4.508137e+00, -4.789201e+00, -4.383820e+00, -4.423486e+00,
  -4.334641e+00, -4.330544e+00, -4.838604e+00, -4.729123e+00,
  -4.381797e+00, -4.207365e+00, -4.276804e+00, -4.001305e+00
)

# df2: 40 rows x 3 columns (temp, density, krho).
MS14 <- data.frame(temp = temp, density = density, krho = krho)

我需要将 MS14 的密度与 PP14 的密度在 ±0.01 的容差范围内进行比较。如果直接使用 `==`,它们永远不会相等,因为两者都有 5 位小数,没有任何一个值会完全相同……

为了解决这个问题,我认为合理的做法是在 ±0.01 的容差范围内比较 MS14$density 和 PP14$Density:只要两者相近(比较结果为真),就把匹配到的原始值连同 MS14 和 PP14 中该行所有其他列的值一起保存下来,这样最终可以得到一个包含所有列的 df3。之后我需要计算 F = -krho * dPO4/dz,所以这些值必须保持“按密度排序”……

有什么想法和建议吗? 非常感谢!

好吧,您遇到的一个问题是列的长度不同。

但假设您将它们切成相同的长度,这可能会有所帮助:

library(tidyverse)

# Row-wise (position i vs. position i) density comparison within +/- 0.01.
#
# PP14 has 45 rows while MS14 has only 40. Comparing the full columns would
# recycle the shorter vector (with a warning) and the 45-element result could
# not be assigned to the 40-row MS14. Only the rows present in both data
# frames are therefore compared.
n_common <- min(nrow(MS14), nrow(PP14))
paired <- seq_len(n_common)

# TRUE where the paired densities differ by less than the tolerance.
similar <- dplyr::near(MS14$density[paired], PP14$Density[paired], tol = 0.01)

# Store the flag in both data frames; rows without a partner in the other
# data frame cannot match and are flagged FALSE.
MS14$similar <- FALSE
MS14$similar[paired] <- similar
PP14$similar <- FALSE
PP14$similar[paired] <- similar

# Show the matching rows of each data frame.
MS14[MS14$similar, ]
PP14[PP14$similar, ]

# Matching rows of both data frames side by side (all columns of MS14 followed
# by all columns of PP14); rows containing NA are dropped.
df3 <- na.omit(cbind.data.frame(MS14[MS14$similar, ], PP14[PP14$similar, ]))


       temp  density      krho similar    Temp  Density       po4 similar
17 12.99829 26.38642 -3.335446    TRUE 12.9298 26.39084 0.5133212    TRUE
18 12.97075 26.39158 -3.738037    TRUE 12.9266 26.38912 0.5027542    TRUE
19 12.95572 26.39350 -4.577778    TRUE 12.9374 26.38744 0.4905301    TRUE
20 12.95045 26.39467 -3.818099    TRUE 12.9385 26.38587 0.4796467    TRUE
26 12.93258 26.39766 -4.546019    TRUE 12.9492 26.38793 0.4483697    TRUE
27 12.93489 26.39774 -3.932538    TRUE 12.8988 26.39120 0.4531310    TRUE
28 12.93209 26.39699 -4.331247    TRUE 12.8895 26.39509 0.4622930    TRUE
29 12.92219 26.40081 -4.508137    TRUE 12.8777 26.39955 0.4750474    TRUE
30 12.90730 26.40328 -4.789201    TRUE 12.8956 26.40455 0.4905152    TRUE
31 12.90416 26.40416 -4.383820    TRUE 12.8748 26.41002 0.5082183    TRUE

好的,根据您的回复:“无需在 df3 中获取 45 行,只需所有相互匹配的密度即可。” 我想你可以试试这个:

library(fuzzyjoin)
library(dplyr)

# Pair every PP14 row with every MS14 row whose density lies within 0.01 of
# PP14$Density, keeping all columns of both data frames. One row can match
# several partners, so rows may appear more than once in the result.
#
# An inner join is used so that only densities that actually match are kept:
# a left join would also return PP14 rows without any MS14 partner, with NA in
# temp/density/krho, which would then propagate into later calculations.
df3 <- difference_inner_join(
  PP14, MS14,
  by = c("Density" = "density"),
  max_dist = 0.01
) %>%
  arrange(density) # keep the result ordered by MS14 density

# Preview the first rows.
head(df3)

     Temp  Density       po4     temp  density      krho
1 12.9742 26.38268 0.4608510 13.06671 26.37286 -2.028391
2 12.9652 26.38242 0.4656445 13.06671 26.37286 -2.028391
3 12.9463 26.38265 0.4690847 13.06671 26.37286 -2.028391
4 12.9617 26.38336 0.4542392 13.05890 26.37402 -3.486826
5 12.9742 26.38268 0.4608510 13.05890 26.37402 -3.486826
6 12.9652 26.38242 0.4656445 13.05890 26.37402 -3.486826