转换为 FEATHER 文件会创建巨大的文件
Conversion to FEATHER file creates huge file
我正在尝试将 .rds 文件转换为 .feather 文件,以便在 Python 中使用 Pandas 读取。
# Question script: two attempts at writing one year of data to a feather file.
library(feather)
# Set working directory
# Load the multi-year object and pull out the element named "1986".
data = readRDS("file.rds")
data_year = data[["1986"]]
# Try 1
# Pass the extracted object to write_feather() as-is.
# Reported result: Error: 'x' must be a data frame
write_feather(
data_year,
"data_year.feather"
)
# Try 2
# Densify with as.matrix() and wrap in a data frame before writing.
# Reported result: the file is written but is 4.5GB for a single year.
write_feather(
as.data.frame(as.matrix(data_year)),
"data_year.feather"
)
尝试 1 返回错误:Error: 'x' must be a data frame;
尝试 2 确实写出了一个 *.feather 文件,但仅一年的数据文件就有 4.5GB,而包含若干年数据的原始 *.rds 文件只有 0.055GB。
如何将每一年的数据转换为 *.feather 文件(单独存放或合并存放均可),同时保持合理的文件大小?
data
看起来像这样:
data_year
看起来像这样:
*更新
我愿意接受任何关于在 NumPy/Pandas 中使用数据同时保持适度文件大小的建议!
也许像下面的函数这样的东西能帮上忙。
该函数将稀疏矩阵重塑为长格式,从中消除零。这将减少最终 data.frame 大小和磁盘文件大小。
library(Matrix)
library(feather)
#' Reshape a sparse matrix into a long table of its non-zero entries
#'
#' Processes the matrix one row at a time so that only a single dense row is
#' materialised at once, pivots that row to long format and keeps the cells
#' whose value is not zero.
#'
#' @param x A matrix-like object that supports `nrow()`,
#'   `x[i, , drop = FALSE]` and `as.matrix()` (e.g. a `dgCMatrix`).
#' @return A tibble with columns `row` (integer row index), `col` (column
#'   name) and `value`; `NULL` when `x` has zero rows.
dgcMatrix_to_long_df <- function(x) {
  n_rows <- nrow(x)
  # Collect the per-row pieces in a preallocated list and bind them once at
  # the end: calling rbind() inside the loop re-copies the accumulated result
  # on every iteration.
  pieces <- vector("list", n_rows)
  # seq_len() yields an empty sequence for a zero-row input, so no explicit
  # guard is needed around the loop.
  for (i in seq_len(n_rows)) {
    d <- as.data.frame(as.matrix(x[i, , drop = FALSE]))
    d$row <- i
    d <- tidyr::pivot_longer(d, cols = -row, names_to = "col")
    pieces[[i]] <- d[d$value != 0, ]
  }
  # do.call(rbind, list()) is NULL, matching the zero-row return value.
  do.call(rbind, pieces)
}
# Convert the example matrix to long format and inspect the first rows.
y <- dgcMatrix_to_long_df(data_year)
head(y)
## A tibble: 6 x 3
# row col value
# <int> <chr> <dbl>
#1 1 Col_0103 51
#2 1 Col_0149 6
#3 1 Col_0188 5
#4 1 Col_0238 89
#5 1 Col_0545 14
#6 1 Col_0547 58
# Round-trip through a feather file and check that nothing changed.
path <- "my_data.feather"
write_feather(y, path)
z <- read_feather(path)
identical(y, z)
#[1] TRUE
# The file size is 232 KB though the initial matrix
# had 1 million elements stored as doubles,
# for a total of 8 MB, a saving of around 97%
# file.size() returns bytes; dividing by 1024 reports KB.
file.size(path)/1024
#[1] 232.0234
编辑
下面的函数要快得多。
#' Faster long-format conversion of a sparse matrix's non-zero entries
#'
#' Processes the matrix one row at a time, locating the non-zero cells of the
#' densified row with `which(..., arr.ind = TRUE)` instead of pivoting.
#'
#' @param x A matrix-like object that supports `nrow()`,
#'   `x[i, , drop = FALSE]` and `as.matrix()` (e.g. a `dgCMatrix`).
#' @return A data frame with columns `row` (row index), `col` (column
#'   *index*, not name) and `value`; a data frame without columns when `x`
#'   has zero rows.
dgcMatrix_to_long_df2 <- function(x) {
  n_rows <- nrow(x)
  # Collect the per-row matrices in a preallocated list and bind them once at
  # the end: calling rbind() inside the loop re-copies the accumulated result
  # on every iteration.
  pieces <- vector("list", n_rows)
  # seq_len() yields an empty sequence for a zero-row input, so no explicit
  # guard is needed around the loop.
  for (i in seq_len(n_rows)) {
    d <- as.matrix(x[i, , drop = FALSE])
    # Compute the non-zero mask once and reuse it for positions and values.
    nz <- d != 0
    inx <- which(nz, arr.ind = TRUE)
    d <- cbind(inx, value = c(d[nz]))
    # The slice has a single row, so which() reports row 1 everywhere;
    # overwrite it with the row's position in x.
    d[, "row"] <- i
    pieces[[i]] <- d
  }
  # do.call(rbind, list()) is NULL, and as.data.frame(NULL) is an empty
  # data frame, matching the zero-row return value.
  as.data.frame(do.call(rbind, pieces))
}
# Time both converters on the same input; the figures in the comments are
# the author's reported output (seconds).
system.time(y <- dgcMatrix_to_long_df(data_year))
# user system elapsed
# 7.89 0.04 7.92
system.time(y <- dgcMatrix_to_long_df2(data_year))
# user system elapsed
# 0.14 0.00 0.14
测试数据
# Build a reproducible n x n example matrix in which most cells are zero.
# Fix the RNG seed so the same matrix is generated on every run.
set.seed(2022)
n <- 1e3
# Start from n*n integer zeros.
x <- rep(0L, n*n)
# Flag each cell as non-zero with probability 0.01.
inx <- sample(c(FALSE, TRUE), n*n, replace = TRUE, prob = c(0.99, 0.01))
# Fill the flagged cells with draws (with replacement) from 1..100.
x[inx] <- sample(100, sum(inx), replace = TRUE)
# Columns are named Col_0001, Col_0002, ...; rows are left unnamed.
data_year <- Matrix(x, n, n, dimnames = list(NULL, sprintf("Col_%04d", 1:n)))
使用 scipy 和 rpy2,您可以将每个 dgCMatrix 对象作为 scipy.sparse.csc_matrix 对象直接读入 Python。两者都使用压缩稀疏列(compressed sparse column,CSC)格式,因此实际上不需要任何预处理。您只需将 dgCMatrix 对象的各个槽(slot)作为参数传递给 csc_matrix 构造函数即可。
为了进行测试,我使用 R 创建了一个 RDS 文件,其中存储了一个由 dgCMatrix 对象组成的列表:
# Create a small, reproducible list of sparse matrices and save it as RDS.
library("Matrix")
set.seed(1L)
# d: dimension of each square matrix; n: number of matrices in the list.
d <- 6L
n <- 10L
# Each matrix gets d entries whose row and column indices are independent
# permutations of 1..d; repr = "C" requests column-compressed storage.
l <- replicate(n, sparseMatrix(i = sample(d), j = sample(d), x = sample(d), repr = "C"), simplify = FALSE)
# Name the elements by consecutive years starting at 1986.
names(l) <- as.character(seq(1986L, length.out = n))
l[["1986"]]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] . . 5 . . .
## [2,] 3 . . . . .
## [3,] . . . . . 6
## [4,] . 2 . . . .
## [5,] . . . . 1 .
## [6,] . . . 4 . .
saveRDS(l, file = "list_of_dgCMatrix.rds")
然后,在Python中:
# Read the RDS file through rpy2 and rebuild one matrix as a scipy CSC matrix.
from scipy import sparse
from rpy2 import robjects
# Look up R's readRDS function so it can be called from Python.
readRDS = robjects.r['readRDS']
l = readRDS('list_of_dgCMatrix.rds')
x = l.rx2('1986') # in R: l[["1986"]]
x
## <rpy2.robjects.methods.RS4 object at 0x120db7b00> [RTYPES.S4SXP]
## R classes: ('dgCMatrix',)
print(x)
## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] . . 5 . . .
## [2,] 3 . . . . .
## [3,] . . . . . 6
## [4,] . 2 . . . .
## [5,] . . . . 1 .
## [6,] . . . 4 . .
# Pull the S4 slots of the dgCMatrix; they are handed to csc_matrix below as
# the (data, indices, indptr) triple plus the shape.
data = x.do_slot('x') # in R: x@x
indices = x.do_slot('i') # in R: x@i
indptr = x.do_slot('p') # in R: x@p
shape = x.do_slot('Dim') # in R: x@Dim or dim(x)
y = sparse.csc_matrix((data, indices, indptr), tuple(shape))
y
## <6x6 sparse matrix of type '<class 'numpy.float64'>'
## with 6 stored elements in Compressed Sparse Column format>
print(y)
## (1, 0) 3.0
## (3, 1) 2.0
## (0, 2) 5.0
## (5, 3) 4.0
## (4, 4) 1.0
## (2, 5) 6.0
这里,y 是 scipy.sparse.csc_matrix 类的一个对象。您应该不需要使用 toarray 方法将 y 强制转换为密集存储的数组。scipy.sparse 实现了我能想到的所有矩阵运算。例如,下面是 y 的行和与列和:
# Sum along axis 1 (one total per row), computed on the sparse matrix.
y.sum(1) # in R: as.matrix(rowSums(x))
## matrix([[5.],
## [3.],
## [6.],
## [2.],
## [1.],
## [4.]])
# Sum along axis 0 (one total per column).
y.sum(0) # in R: t(as.matrix(colSums(x)))
## matrix([[3., 2., 5., 4., 1., 6.]])
我正在尝试将 .rds 文件转换为 .feather 文件,以便在 Python 中使用 Pandas 读取。
# Question script: two attempts at writing one year of data to a feather file.
library(feather)
# Set working directory
# Load the multi-year object and pull out the element named "1986".
data = readRDS("file.rds")
data_year = data[["1986"]]
# Try 1
# Pass the extracted object to write_feather() as-is.
# Reported result: Error: 'x' must be a data frame
write_feather(
data_year,
"data_year.feather"
)
# Try 2
# Densify with as.matrix() and wrap in a data frame before writing.
# Reported result: the file is written but is 4.5GB for a single year.
write_feather(
as.data.frame(as.matrix(data_year)),
"data_year.feather"
)
尝试 1 返回错误:Error: 'x' must be a data frame;
尝试 2 确实写出了一个 *.feather 文件,但仅一年的数据文件就有 4.5GB,而包含若干年数据的原始 *.rds 文件只有 0.055GB。
如何将每一年的数据转换为 *.feather 文件(单独存放或合并存放均可),同时保持合理的文件大小?
data
看起来像这样:
data_year
看起来像这样:
*更新
我愿意接受任何关于在 NumPy/Pandas 中使用数据同时保持适度文件大小的建议!
也许像下面的函数这样的东西能帮上忙。
该函数将稀疏矩阵重塑为长格式,从中消除零。这将减少最终 data.frame 大小和磁盘文件大小。
library(Matrix)
library(feather)
#' Reshape a sparse matrix into a long table of its non-zero entries
#'
#' Processes the matrix one row at a time so that only a single dense row is
#' materialised at once, pivots that row to long format and keeps the cells
#' whose value is not zero.
#'
#' @param x A matrix-like object that supports `nrow()`,
#'   `x[i, , drop = FALSE]` and `as.matrix()` (e.g. a `dgCMatrix`).
#' @return A tibble with columns `row` (integer row index), `col` (column
#'   name) and `value`; `NULL` when `x` has zero rows.
dgcMatrix_to_long_df <- function(x) {
  n_rows <- nrow(x)
  # Collect the per-row pieces in a preallocated list and bind them once at
  # the end: calling rbind() inside the loop re-copies the accumulated result
  # on every iteration.
  pieces <- vector("list", n_rows)
  # seq_len() yields an empty sequence for a zero-row input, so no explicit
  # guard is needed around the loop.
  for (i in seq_len(n_rows)) {
    d <- as.data.frame(as.matrix(x[i, , drop = FALSE]))
    d$row <- i
    d <- tidyr::pivot_longer(d, cols = -row, names_to = "col")
    pieces[[i]] <- d[d$value != 0, ]
  }
  # do.call(rbind, list()) is NULL, matching the zero-row return value.
  do.call(rbind, pieces)
}
# Convert the example matrix to long format and inspect the first rows.
y <- dgcMatrix_to_long_df(data_year)
head(y)
## A tibble: 6 x 3
# row col value
# <int> <chr> <dbl>
#1 1 Col_0103 51
#2 1 Col_0149 6
#3 1 Col_0188 5
#4 1 Col_0238 89
#5 1 Col_0545 14
#6 1 Col_0547 58
# Round-trip through a feather file and check that nothing changed.
path <- "my_data.feather"
write_feather(y, path)
z <- read_feather(path)
identical(y, z)
#[1] TRUE
# The file size is 232 KB though the initial matrix
# had 1 million elements stored as doubles,
# for a total of 8 MB, a saving of around 97%
# file.size() returns bytes; dividing by 1024 reports KB.
file.size(path)/1024
#[1] 232.0234
编辑
下面的函数要快得多。
#' Faster long-format conversion of a sparse matrix's non-zero entries
#'
#' Processes the matrix one row at a time, locating the non-zero cells of the
#' densified row with `which(..., arr.ind = TRUE)` instead of pivoting.
#'
#' @param x A matrix-like object that supports `nrow()`,
#'   `x[i, , drop = FALSE]` and `as.matrix()` (e.g. a `dgCMatrix`).
#' @return A data frame with columns `row` (row index), `col` (column
#'   *index*, not name) and `value`; a data frame without columns when `x`
#'   has zero rows.
dgcMatrix_to_long_df2 <- function(x) {
  n_rows <- nrow(x)
  # Collect the per-row matrices in a preallocated list and bind them once at
  # the end: calling rbind() inside the loop re-copies the accumulated result
  # on every iteration.
  pieces <- vector("list", n_rows)
  # seq_len() yields an empty sequence for a zero-row input, so no explicit
  # guard is needed around the loop.
  for (i in seq_len(n_rows)) {
    d <- as.matrix(x[i, , drop = FALSE])
    # Compute the non-zero mask once and reuse it for positions and values.
    nz <- d != 0
    inx <- which(nz, arr.ind = TRUE)
    d <- cbind(inx, value = c(d[nz]))
    # The slice has a single row, so which() reports row 1 everywhere;
    # overwrite it with the row's position in x.
    d[, "row"] <- i
    pieces[[i]] <- d
  }
  # do.call(rbind, list()) is NULL, and as.data.frame(NULL) is an empty
  # data frame, matching the zero-row return value.
  as.data.frame(do.call(rbind, pieces))
}
# Time both converters on the same input; the figures in the comments are
# the author's reported output (seconds).
system.time(y <- dgcMatrix_to_long_df(data_year))
# user system elapsed
# 7.89 0.04 7.92
system.time(y <- dgcMatrix_to_long_df2(data_year))
# user system elapsed
# 0.14 0.00 0.14
测试数据
# Build a reproducible n x n example matrix in which most cells are zero.
# Fix the RNG seed so the same matrix is generated on every run.
set.seed(2022)
n <- 1e3
# Start from n*n integer zeros.
x <- rep(0L, n*n)
# Flag each cell as non-zero with probability 0.01.
inx <- sample(c(FALSE, TRUE), n*n, replace = TRUE, prob = c(0.99, 0.01))
# Fill the flagged cells with draws (with replacement) from 1..100.
x[inx] <- sample(100, sum(inx), replace = TRUE)
# Columns are named Col_0001, Col_0002, ...; rows are left unnamed.
data_year <- Matrix(x, n, n, dimnames = list(NULL, sprintf("Col_%04d", 1:n)))
使用 scipy 和 rpy2,您可以将每个 dgCMatrix 对象作为 scipy.sparse.csc_matrix 对象直接读入 Python。两者都使用压缩稀疏列(compressed sparse column,CSC)格式,因此实际上不需要任何预处理。您只需将 dgCMatrix 对象的各个槽(slot)作为参数传递给 csc_matrix 构造函数即可。
为了进行测试,我使用 R 创建了一个 RDS 文件,其中存储了一个由 dgCMatrix 对象组成的列表:
# Create a small, reproducible list of sparse matrices and save it as RDS.
library("Matrix")
set.seed(1L)
# d: dimension of each square matrix; n: number of matrices in the list.
d <- 6L
n <- 10L
# Each matrix gets d entries whose row and column indices are independent
# permutations of 1..d; repr = "C" requests column-compressed storage.
l <- replicate(n, sparseMatrix(i = sample(d), j = sample(d), x = sample(d), repr = "C"), simplify = FALSE)
# Name the elements by consecutive years starting at 1986.
names(l) <- as.character(seq(1986L, length.out = n))
l[["1986"]]
## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] . . 5 . . .
## [2,] 3 . . . . .
## [3,] . . . . . 6
## [4,] . 2 . . . .
## [5,] . . . . 1 .
## [6,] . . . 4 . .
saveRDS(l, file = "list_of_dgCMatrix.rds")
然后,在Python中:
# Read the RDS file through rpy2 and rebuild one matrix as a scipy CSC matrix.
from scipy import sparse
from rpy2 import robjects
# Look up R's readRDS function so it can be called from Python.
readRDS = robjects.r['readRDS']
l = readRDS('list_of_dgCMatrix.rds')
x = l.rx2('1986') # in R: l[["1986"]]
x
## <rpy2.robjects.methods.RS4 object at 0x120db7b00> [RTYPES.S4SXP]
## R classes: ('dgCMatrix',)
print(x)
## 6 x 6 sparse Matrix of class "dgCMatrix"
##
## [1,] . . 5 . . .
## [2,] 3 . . . . .
## [3,] . . . . . 6
## [4,] . 2 . . . .
## [5,] . . . . 1 .
## [6,] . . . 4 . .
# Pull the S4 slots of the dgCMatrix; they are handed to csc_matrix below as
# the (data, indices, indptr) triple plus the shape.
data = x.do_slot('x') # in R: x@x
indices = x.do_slot('i') # in R: x@i
indptr = x.do_slot('p') # in R: x@p
shape = x.do_slot('Dim') # in R: x@Dim or dim(x)
y = sparse.csc_matrix((data, indices, indptr), tuple(shape))
y
## <6x6 sparse matrix of type '<class 'numpy.float64'>'
## with 6 stored elements in Compressed Sparse Column format>
print(y)
## (1, 0) 3.0
## (3, 1) 2.0
## (0, 2) 5.0
## (5, 3) 4.0
## (4, 4) 1.0
## (2, 5) 6.0
这里,y 是 scipy.sparse.csc_matrix 类的一个对象。您应该不需要使用 toarray 方法将 y 强制转换为密集存储的数组。scipy.sparse 实现了我能想到的所有矩阵运算。例如,下面是 y 的行和与列和:
# Sum along axis 1 (one total per row), computed on the sparse matrix.
y.sum(1) # in R: as.matrix(rowSums(x))
## matrix([[5.],
## [3.],
## [6.],
## [2.],
## [1.],
## [4.]])
# Sum along axis 0 (one total per column).
y.sum(0) # in R: t(as.matrix(colSums(x)))
## matrix([[3., 2., 5., 4., 1., 6.]])