如何使用 imputeTS 包中的 ggplot_na_imputations() 或 ggplot_na_distribution()
How to use ggplot_na_imputations() or ggplot_na_distribution() from the package imputeTS
我有一个数据框(表格,涵盖 2020 年到 2022 年,共 100 行/国家和 28 列/月份)。我使用了 imputeTS 包,并用函数 na_kalman() 将若干 NA 值替换为估计值。到这里一切顺利。之后,当我尝试使用 ggplot_na_imputations() 或 ggplot_na_distribution() 绘图时,出现错误:“输入 x_with_na 不是数字”。我认为解决办法是把数据框转换为时间序列 'ts'。有什么建议吗?
这是我的代码:
# Replace the NA values in the table with estimates via na_kalman().
total_tests_imp <- na_kalman(total_tests_md)
# These are the plotting calls reported to fail with an
# "x_with_na is not numeric" error: total_tests_md is passed as a whole
# tibble (see the class() output below), not as a numeric vector or ts.
ggplot_na_imputations(x_with_na = total_tests_md, x_with_imputations = total_tests_imp)
ggplot_na_distribution(total_tests_md)
(附注)当我运行 `class(total_tests_md)` 时,
输出为:[1] "tbl_df" "tbl" "data.frame"
当我运行 `head(total_tests_md)` 时,输出为:
# A tibble: 6 x 29
countries jan_20 fev_20 mar_20 abr_20 mai_20 jun_20 jul_20 ago_20 set_20 out_20 nov_20 dez_20 jan_21 fev_21 mar_21 abr_21
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 Albania NA 0.009 0.54 2.83 5.08 8.19 12.9 20.3 29.1 42.0 61.7 86.2 119. 155. 187. 214.
3 Algeria NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
4 Andorra NA NA NA NA NA NA NA NA 691. 1033. 1405. 1613. 1819. 2003. 2175. 2335.
5 Angola NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 Argentina 0.013 0.015 0.162 1.55 4.44 9.91 19.7 34.3 52.3 74.3 92.3 112. 143. 172. 204. 257.
# ... with 12 more variables: mai_21 <dbl>, jun_21 <dbl>, jul_21 <dbl>, ago_21 <dbl>, set_21 <dbl>, out_21 <dbl>,
# nov_21 <dbl>, dez_21 <dbl>, jan_22 <dbl>, fev_22 <dbl>, mar_22 <dbl>, abr_22 <dbl>
dput(head(total_tests_md))
structure(list(countries = c("Afghanistan", "Albania", "Algeria",
"Andorra", "Angola", "Argentina"), jan_20 = c(NA, NA, NA, NA,
NA, 0.013), fev_20 = c(NA, 0.009, NA, NA, NA, 0.015), mar_20 = c(NA,
0.54, NA, NA, NA, 0.162), abr_20 = c(NA, 2.831, NA, NA, NA, 1.546
), mai_20 = c(NA, 5.083, NA, NA, NA, 4.445), jun_20 = c(NA, 8.192,
NA, NA, NA, 9.913), jul_20 = c(NA, 12.852, NA, NA, NA, 19.719
), ago_20 = c(NA, 20.317, NA, NA, NA, 34.32), set_20 = c(NA,
29.089, NA, 691.095, NA, 52.255), out_20 = c(NA, 42.031, NA,
1033.495, NA, 74.307), nov_20 = c(NA, 61.658, NA, 1404.711, NA,
92.271), dez_20 = c(NA, 86.158, NA, 1613.414, NA, 112.404), jan_21 = c(NA,
119.428, NA, 1819.053, NA, 143.415), fev_21 = c(NA, 154.702,
NA, 2003.284, NA, 171.576), mar_21 = c(NA, 186.772, NA, 2174.988,
NA, 203.784), abr_21 = c(NA, 214.329, NA, 2335.148, NA, 257.398
), mai_21 = c(NA, 243.676, NA, 2480.234, NA, 317.92), jun_21 = c(NA,
271.086, NA, 2543.915, NA, 375.2), jul_21 = c(NA, 299.727, NA,
2621.83, NA, 433.25), ago_21 = c(NA, 352.728, NA, 2709.918, NA,
492.053), set_21 = c(NA, 404.621, NA, 2767.717, NA, 528.764),
out_21 = c(NA, 439.925, NA, 2850.247, NA, 556.29), nov_21 = c(NA,
467.614, NA, 3006.839, NA, 580.944), dez_21 = c(NA, 495.44,
NA, 3449.208, NA, 627.339), jan_22 = c(21.413, 543.967, NA,
3840.758, 40.321, 730.777), fev_22 = c(22.328, 552.997, NA,
3882.243, 41.965, 756.948), mar_22 = c(22.695, 556.666, 5.167,
NA, 43.944, 777.078), abr_22 = c(NA, 558.412, NA, NA, 44.198,
783.816)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
当你使用 ggplot_na_imputations 或 ggplot_na_distribution 时,应当按照函数文档的说明,提供一维的 vector 或 ts 对象:
https://www.rdocumentation.org/packages/imputeTS/versions/3.2/topics/ggplot_na_imputations
所以你必须把包含所有国家的 data.frame 按国家逐个转换为向量。此外,要把向量转换为时间序列,请参阅:
https://stat.ethz.ch/R-manual/R-devel/library/stats/html/ts.html
您的数据
# Reduced reproducible sample: six countries, first four months of 2020.
# Start from a plain named list of columns ...
total_tests_md <- list(
  countries = c(
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Argentina"
  ),
  jan_20 = c(NA, NA, NA, NA, NA, 0.013),
  fev_20 = c(NA, 0.009, NA, NA, NA, 0.015),
  mar_20 = c(NA, 0.54, NA, NA, NA, 0.162),
  abr_20 = c(NA, 2.831, NA, 0.3, NA, 1.546)
)
# ... then attach compact row names for six rows and the tibble classes.
attr(total_tests_md, "row.names") <- c(NA, -6L)
class(total_tests_md) <- c("tbl_df", "tbl", "data.frame")
导入您的库
library(zoo)
library(imputeTS)
将您的 data.frame 转换为向量
# Pick one country's row by name (rather than by position, which silently
# selects the wrong country if the rows are reordered) and drop the
# country-name column so only the monthly values remain.
Albania <- total_tests_md[total_tests_md$countries == "Albania", -1]
# Flatten the one-row data frame into a plain numeric vector; the imputeTS
# plotting functions expect a numeric vector or time series, not a data frame.
Albania <- as.numeric(Albania)
# One date per monthly column, starting in January 2020. Deriving the length
# from the data keeps the dates aligned with the values when the table has
# more than the four months of this reduced sample.
month <- seq(as.Date("2020-01-01"), by = "month", length.out = length(Albania))
当您使用时间序列时
# Time-series variant: wrap the values in a zoo series indexed by month.
Albaniats <- zoo(x = Albania, order.by = month)
# Stand-in for an imputed series: a copy with the first value set to 0.5.
AlbaniatsInput <- replace(Albaniats, 1, 0.5)
# Plot original vs. filled-in values, then the distribution of the NAs,
# labelling the x axis with the series' own index.
ggplot_na_imputations(
  x_with_na = Albaniats,
  x_with_imputations = AlbaniatsInput,
  x_axis_labels = index(Albaniats)
)
ggplot_na_distribution(Albaniats, x_axis_labels = index(Albaniats))
仅使用向量时
# Plain-vector variant: the same two plots without a time-series object.
# Stand-in for an imputed vector: a copy with the first value set to 0.5.
AlbaniaInput <- replace(Albania, 1, 0.5)
# The month vector supplies the x-axis labels directly.
ggplot_na_imputations(
  x_with_na = Albania,
  x_with_imputations = AlbaniaInput,
  x_axis_labels = month
)
ggplot_na_distribution(Albania, x_axis_labels = month)
我有一个数据框(表格,涵盖 2020 年到 2022 年,共 100 行/国家和 28 列/月份)。我使用了 imputeTS 包,并用函数 na_kalman() 将若干 NA 值替换为估计值。到这里一切顺利。之后,当我尝试使用 ggplot_na_imputations() 或 ggplot_na_distribution() 绘图时,出现错误:“输入 x_with_na 不是数字”。我认为解决办法是把数据框转换为时间序列 'ts'。有什么建议吗?
这是我的代码:
# Replace the NA values in the table with estimates via na_kalman().
total_tests_imp <- na_kalman(total_tests_md)
# These are the plotting calls reported to fail with an
# "x_with_na is not numeric" error: total_tests_md is passed as a whole
# tibble (see the class() output below), not as a numeric vector or ts.
ggplot_na_imputations(x_with_na = total_tests_md, x_with_imputations = total_tests_imp)
ggplot_na_distribution(total_tests_md)
(附注)当我运行 `class(total_tests_md)` 时,
输出为:[1] "tbl_df" "tbl" "data.frame"
当我运行 `head(total_tests_md)` 时,输出为:
# A tibble: 6 x 29
countries jan_20 fev_20 mar_20 abr_20 mai_20 jun_20 jul_20 ago_20 set_20 out_20 nov_20 dez_20 jan_21 fev_21 mar_21 abr_21
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Afghanistan NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2 Albania NA 0.009 0.54 2.83 5.08 8.19 12.9 20.3 29.1 42.0 61.7 86.2 119. 155. 187. 214.
3 Algeria NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
4 Andorra NA NA NA NA NA NA NA NA 691. 1033. 1405. 1613. 1819. 2003. 2175. 2335.
5 Angola NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 Argentina 0.013 0.015 0.162 1.55 4.44 9.91 19.7 34.3 52.3 74.3 92.3 112. 143. 172. 204. 257.
# ... with 12 more variables: mai_21 <dbl>, jun_21 <dbl>, jul_21 <dbl>, ago_21 <dbl>, set_21 <dbl>, out_21 <dbl>,
# nov_21 <dbl>, dez_21 <dbl>, jan_22 <dbl>, fev_22 <dbl>, mar_22 <dbl>, abr_22 <dbl>
dput(head(total_tests_md))
structure(list(countries = c("Afghanistan", "Albania", "Algeria",
"Andorra", "Angola", "Argentina"), jan_20 = c(NA, NA, NA, NA,
NA, 0.013), fev_20 = c(NA, 0.009, NA, NA, NA, 0.015), mar_20 = c(NA,
0.54, NA, NA, NA, 0.162), abr_20 = c(NA, 2.831, NA, NA, NA, 1.546
), mai_20 = c(NA, 5.083, NA, NA, NA, 4.445), jun_20 = c(NA, 8.192,
NA, NA, NA, 9.913), jul_20 = c(NA, 12.852, NA, NA, NA, 19.719
), ago_20 = c(NA, 20.317, NA, NA, NA, 34.32), set_20 = c(NA,
29.089, NA, 691.095, NA, 52.255), out_20 = c(NA, 42.031, NA,
1033.495, NA, 74.307), nov_20 = c(NA, 61.658, NA, 1404.711, NA,
92.271), dez_20 = c(NA, 86.158, NA, 1613.414, NA, 112.404), jan_21 = c(NA,
119.428, NA, 1819.053, NA, 143.415), fev_21 = c(NA, 154.702,
NA, 2003.284, NA, 171.576), mar_21 = c(NA, 186.772, NA, 2174.988,
NA, 203.784), abr_21 = c(NA, 214.329, NA, 2335.148, NA, 257.398
), mai_21 = c(NA, 243.676, NA, 2480.234, NA, 317.92), jun_21 = c(NA,
271.086, NA, 2543.915, NA, 375.2), jul_21 = c(NA, 299.727, NA,
2621.83, NA, 433.25), ago_21 = c(NA, 352.728, NA, 2709.918, NA,
492.053), set_21 = c(NA, 404.621, NA, 2767.717, NA, 528.764),
out_21 = c(NA, 439.925, NA, 2850.247, NA, 556.29), nov_21 = c(NA,
467.614, NA, 3006.839, NA, 580.944), dez_21 = c(NA, 495.44,
NA, 3449.208, NA, 627.339), jan_22 = c(21.413, 543.967, NA,
3840.758, 40.321, 730.777), fev_22 = c(22.328, 552.997, NA,
3882.243, 41.965, 756.948), mar_22 = c(22.695, 556.666, 5.167,
NA, 43.944, 777.078), abr_22 = c(NA, 558.412, NA, NA, 44.198,
783.816)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
当你使用 ggplot_na_imputations 或 ggplot_na_distribution 时,应当按照函数文档的说明,提供一维的 vector 或 ts 对象:
https://www.rdocumentation.org/packages/imputeTS/versions/3.2/topics/ggplot_na_imputations
所以你必须把包含所有国家的 data.frame 按国家逐个转换为向量。此外,要把向量转换为时间序列,请参阅:
https://stat.ethz.ch/R-manual/R-devel/library/stats/html/ts.html
您的数据
# Reduced reproducible sample: six countries, first four months of 2020.
# Start from a plain named list of columns ...
total_tests_md <- list(
  countries = c(
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Argentina"
  ),
  jan_20 = c(NA, NA, NA, NA, NA, 0.013),
  fev_20 = c(NA, 0.009, NA, NA, NA, 0.015),
  mar_20 = c(NA, 0.54, NA, NA, NA, 0.162),
  abr_20 = c(NA, 2.831, NA, 0.3, NA, 1.546)
)
# ... then attach compact row names for six rows and the tibble classes.
attr(total_tests_md, "row.names") <- c(NA, -6L)
class(total_tests_md) <- c("tbl_df", "tbl", "data.frame")
导入您的库
library(zoo)
library(imputeTS)
将您的 data.frame 转换为向量
# Pick one country's row by name (rather than by position, which silently
# selects the wrong country if the rows are reordered) and drop the
# country-name column so only the monthly values remain.
Albania <- total_tests_md[total_tests_md$countries == "Albania", -1]
# Flatten the one-row data frame into a plain numeric vector; the imputeTS
# plotting functions expect a numeric vector or time series, not a data frame.
Albania <- as.numeric(Albania)
# One date per monthly column, starting in January 2020. Deriving the length
# from the data keeps the dates aligned with the values when the table has
# more than the four months of this reduced sample.
month <- seq(as.Date("2020-01-01"), by = "month", length.out = length(Albania))
当您使用时间序列时
# Time-series variant: wrap the values in a zoo series indexed by month.
Albaniats <- zoo(x = Albania, order.by = month)
# Stand-in for an imputed series: a copy with the first value set to 0.5.
AlbaniatsInput <- replace(Albaniats, 1, 0.5)
# Plot original vs. filled-in values, then the distribution of the NAs,
# labelling the x axis with the series' own index.
ggplot_na_imputations(
  x_with_na = Albaniats,
  x_with_imputations = AlbaniatsInput,
  x_axis_labels = index(Albaniats)
)
ggplot_na_distribution(Albaniats, x_axis_labels = index(Albaniats))
仅使用向量时
# Plain-vector variant: the same two plots without a time-series object.
# Stand-in for an imputed vector: a copy with the first value set to 0.5.
AlbaniaInput <- replace(Albania, 1, 0.5)
# The month vector supplies the x-axis labels directly.
ggplot_na_imputations(
  x_with_na = Albania,
  x_with_imputations = AlbaniaInput,
  x_axis_labels = month
)
ggplot_na_distribution(Albania, x_axis_labels = month)