将 SF Dataframe 写入 Microsoft SQL 服务器导致 R 崩溃并且无法写入数据

Writing SF Dataframe to Microsoft SQL Server Crashes R and Fails to Write Data

使用下面提供的示例数据,我正在尝试按照此处(原帖链接)的说明将 sf 数据框写入 Microsoft SQL Server 的表。但是,每当我这样做时,R 会话就会崩溃,并给出下面屏幕截图中显示的错误。

我想知道是否还有其他人看到过这个问题,他们是如何解决的?

一如既往,提前谢谢你。

-nate

来自终端的错误:

Note: method with signature ‘DBIObject#sf’ chosen for function ‘dbDataType’,
 target signature ‘Microsoft SQL Server#sf’.
 "OdbcConnection#ANY" would also be valid

 *** caught segfault ***
address 0x21, cause 'memory not mapped'

Traceback:
 1: result_insert_dataframe(rs@ptr, values, batch_rows)
 2: tryCatchList(expr, classes, parentenv, handlers)
 3: tryCatch(result_insert_dataframe(rs@ptr, values, batch_rows),     finally = dbClearResult(rs))
 4: .local(conn, name, value, ...)
 5: dbWriteTable(conn, name, to_postgis(conn, value, binary), ...,     row.names = row.names, overwrite = overwrite, append = append,     field.types = field.types)
 6: dbWriteTable(conn, name, to_postgis(conn, value, binary), ...,     row.names = row.names, overwrite = overwrite, append = append,     field.types = field.types)
 7: .local(conn, name, value, ...)
 8: DBI::dbWriteTable(conn = con, name = the_table_name, value = polygon_db,     row.names = FALSE, overwrite = TRUE, binary = TRUE)
 9: DBI::dbWriteTable(conn = con, name = the_table_name, value = polygon_db,     row.names = FALSE, overwrite = TRUE, binary = TRUE)
An irrecoverable exception occurred. R is aborting now ...
Segmentation fault: 11

Session 信息:

R version 3.6.3 (2020-02-29)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Catalina 10.15.4

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] odbc_1.2.2   DBI_1.1.0    magrittr_1.5 sf_0.9-3    

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6       class_7.3-16       packrat_0.5.0      grid_3.6.3         e1071_1.7-3        units_0.6-6        KernSmooth_2.23-16 rlang_0.4.6        blob_1.2.1         vctrs_0.2.4        tools_3.6.3        bit64_0.9-7       
[13] bit_1.1-15.2       hms_0.5.3          compiler_3.6.3     pkgconfig_2.0.3    classInt_0.4-3  

示例数据(抱歉,代码行很长):

# Example `sf` data frame in dput() form: two POLYGON features with the
# attribute columns leaf_id, db_label and division, plus a `geometry` column
# (the active sf column) whose CRS input is "EPSG:4326".
polygon_sf_df <- structure(
  list(leaf_id = c(1234, 2222),
       db_label = c("Middle of Nothing", "Somewhere in Florida"),
       division = c("Mountain", "South Atlantic"),
       geometry = structure(list(
         structure(list(structure(c(-114.806444, -114.326789, -114.08946, -114.898877, -114.806444, 39.236959, 39.219554, 38.961837, 38.829708, 39.236959), .Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg")),
         structure(list(structure(c(-81.806444, -81.326789, -81.08946, -81.898877, -81.806444, 29.236959, 29.219554, 28.961837, 28.829708, 29.236959), .Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg"))),
         class = c("sfc_POLYGON", "sfc"), precision = 0,
         bbox = structure(c(xmin = -114.898877, ymin = 28.829708, xmax = -81.08946, ymax = 39.236959), class = "bbox"),
         crs = structure(list(input = "EPSG:4326", wkt = "GEOGCS[\"WGS 84\",\n    DATUM[\"WGS_1984\",\n        SPHEROID[\"WGS 84\",6378137,298.257223563,\n            AUTHORITY[\"EPSG\",\"7030\"]],\n        AUTHORITY[\"EPSG\",\"6326\"]],\n    PRIMEM[\"Greenwich\",0,\n        AUTHORITY[\"EPSG\",\"8901\"]],\n    UNIT[\"degree\",0.0174532925199433,\n        AUTHORITY[\"EPSG\",\"9122\"]],\n    AUTHORITY[\"EPSG\",\"4326\"]]"), class = "crs"), n_empty = 0L)),
  sf_column = "geometry", agr = structure(c(leaf_id = NA_integer_, db_label = NA_integer_, division = NA_integer_), .Label = c("constant", "aggregate", "identity"), class = "factor"),
  row.names = 1:2, class = c("sf", "data.frame"))

快速安装包:

# Set the `scipen` option to 999 so numbers print in fixed rather than
# scientific notation.
options(scipen = 999)
#' Install any missing packages, then attach every requested package.
#'
#' @param pkg Character vector of package names.
#' @return A logical vector named by `pkg`; each element is the result of
#'   `require()` for that package (`TRUE` if it was attached, `FALSE` if not).
ipak <- function(pkg) {
  # Packages that are requested but not yet present in the library paths.
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # require() is used on purpose: its TRUE/FALSE result is the return value.
  # vapply() guarantees a logical vector even for empty `pkg`, where sapply()
  # would return an empty list instead.
  vapply(pkg, require, logical(1), character.only = TRUE)
}
# Packages used by the example; ipak() installs whichever are missing and
# then attaches all of them.
packages <- c("sf", "magrittr", "DBI", "odbc")
ipak(pkg = packages)

连接和写入方法:

# Open an ODBC connection to the SQL Server instance (server name and
# credentials are placeholders).
con <- DBI::dbConnect(
  drv = odbc::odbc(),
  Driver = "ODBC Driver 17 for SQL Server",
  Server = "some_database_server.windows.net",
  Database = "dev_db",
  UID = "<User ID Here>",
  PWD = "<PW HERE>",
  Port = 1433,
  maxvarcharsize = 0
)
# Fails Here: writing the sf data frame directly is the call that crashes R.
DBI::dbWriteTable(
  conn = con,
  name = "the_sf_polygon_table_name",
  value = polygon_sf_df,
  row.names = FALSE,
  overwrite = TRUE,
  binary = TRUE
)

表的架构(根据下面评论的要求提供):

# dput()-style dump of the column metadata for dbo.test_polygon_table: one row
# per column (leaf_id, db_label, division, geometry) giving its data type,
# nullability, character length, numeric precision and collation. Note that the
# `geometry` column has DATA_TYPE "geometry".
structure(list(TABLE_CATALOG = c("analytics_dev", "analytics_dev", 
"analytics_dev", "analytics_dev"), TABLE_SCHEMA = c("dbo", "dbo", 
"dbo", "dbo"), TABLE_NAME = c("test_polygon_table", "test_polygon_table", 
"test_polygon_table", "test_polygon_table"), COLUMN_NAME = c("leaf_id", 
"db_label", "division", "geometry"), ORDINAL_POSITION = 1:4, 
    COLUMN_DEFAULT = c(NA_character_, NA_character_, NA_character_, 
    NA_character_), IS_NULLABLE = c("YES", "YES", "YES", "YES"
    ), DATA_TYPE = c("float", "varchar", "varchar", "geometry"
    ), CHARACTER_MAXIMUM_LENGTH = c(NA, 255L, 255L, -1L), CHARACTER_OCTET_LENGTH = c(NA, 
    255L, 255L, -1L), NUMERIC_PRECISION = c(53L, NA, NA, NA), 
    NUMERIC_PRECISION_RADIX = c(2L, NA, NA, NA), NUMERIC_SCALE = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), DATETIME_PRECISION = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_), CHARACTER_SET_CATALOG = c(NA_character_, 
    NA_character_, NA_character_, NA_character_), CHARACTER_SET_SCHEMA = c(NA_character_, 
    NA_character_, NA_character_, NA_character_), CHARACTER_SET_NAME = c(NA, 
    "iso_1", "iso_1", NA), COLLATION_CATALOG = c(NA_character_, 
    NA_character_, NA_character_, NA_character_), COLLATION_SCHEMA = c(NA_character_, 
    NA_character_, NA_character_, NA_character_), COLLATION_NAME = c(NA, 
    "SQL_Latin1_General_CP1_CI_AS", "SQL_Latin1_General_CP1_CI_AS", 
    NA), DOMAIN_CATALOG = c(NA_character_, NA_character_, NA_character_, 
    NA_character_), DOMAIN_SCHEMA = c(NA_character_, NA_character_, 
    NA_character_, NA_character_), DOMAIN_NAME = c(NA_character_, 
    NA_character_, NA_character_, NA_character_)), class = "data.frame", row.names = c(NA, 
-4L))

老实说,这只是一个变通办法(hack),因为我并不熟悉、也不精通 sf(R 包)或 geometry(SQL Server 数据类型)。先说明一点:我不确定 geography(SQL Server 数据类型)是否更适合你,因为它声称适用于类似 GPS 的坐标(基准面等)……但我认为这个过程同样适用于它。

无论哪种方式,我先展示 R 和 SQL Server 如何就形状的表示达成一致(剧透:用字符串,即 character),然后展示如何以正确的数据类型读取这些形状并将其推送到数据库。

查询

如果您按照 https://docs.microsoft.com/en-us/sql/t-sql/spatial-geometry/spatial-types-geometry-transact-sql 中的示例逐步操作,然后运行查询,请注意它返回的内容:

# Requires an open SQL Server connection, e.g.
# con <- DBI::dbConnect(...) # sql server
# Create a demo table with an identity key, a geometry column, and a computed
# column that exposes the geometry as text via STAsText().
DBI::dbExecute(conn = con, statement = "
CREATE TABLE SpatialTable   
    ( id int IDENTITY (1,1),  
    GeomCol1 geometry,   
    GeomCol2 AS GeomCol1.STAsText() )")

# Insert two sample shapes (a line string and a square polygon), both parsed
# from WKT text with SRID 0.
DBI::dbExecute(conn = con, statement = "
INSERT INTO SpatialTable (GeomCol1)  
VALUES (geometry::STGeomFromText('LINESTRING (100 100, 20 180, 180 180)', 0));  
INSERT INTO SpatialTable (GeomCol1)  
VALUES (geometry::STGeomFromText('POLYGON ((0 0, 150 0, 150 150, 0 150, 0 0))', 0));  ")

# Fetch every column to inspect how the geometry data comes back through DBI.
ret <- DBI::dbGetQuery(conn = con, statement = "select * from SpatialTable")
utils::str(ret)
# 'data.frame': 2 obs. of  3 variables:
#  $ id      : int  1 2
#  $ GeomCol1: chr  "" ""
#  $ GeomCol2: chr  "LINESTRING (100 100, 20 180, 180 180)" "POLYGON ((0 0, 150 0, 150 150, 0 150, 0 0))"

不出所料(对我来说),DBI 将其视为字符串。但请注意,它返回了 GeomCol2 的内容,也就是实际的类 blob 字段 GeomCol1 的文本形式;GeomCol1 本身什么都没有返回。好的,让我们只检索需要的列;另外,由于 "id" 是数字列,我们先把它转换为字符串,然后确认可以绘图:

# Query only the key and the WKT text column.
ret <- DBI::dbGetQuery(conn = con, statement = "select id, GeomCol2 from SpatialTable")
# Store the integer key as character before building the sf object.
ret[["id"]] <- as.character(ret[["id"]])
# Parse the WKT column into geometries and plot the result.
plot(sf::st_as_sf(ret, wkt = "GeomCol2"))

上传

为了将您的数据上传到 SQL Server,我们需要先把几何转换为字符(character)形式。

# Keep only the id and geometry columns, as a plain data.frame.
tempdat <- as.data.frame(polygon_sf_df)[, c("leaf_id", "geometry")]
# Serialize each geometry to WKT text. `digits = 15` keeps the coordinates at
# full precision; with the default number of digits a value such as
# -114.806444 is written as -114.8064, silently moving the vertices.
tempdat$geometry <- sf::st_as_text(tempdat$geometry, digits = 15)
names(tempdat)[1] <- "id"
str(tempdat)
# Expected shape of the result (WKT strings abbreviated here):
# 'data.frame': 2 obs. of  2 variables:
#  $ id      : num  1234 2222
#  $ geometry: chr  "POLYGON ((-114.806444 39.236959, ..." "POLYGON ((-81.806444 29.236959, ..."

现在我们可以将其上传到一个临时表:

### write *character* shapes to a temp table
# `overwrite = TRUE` lets this be re-run even if an earlier, aborted run left
# `temptable` behind; dbWriteTable() otherwise errors when the table exists.
DBI::dbWriteTable(con, "temptable", tempdat, overwrite = TRUE)
### convert those *character* shapes to real *geometries*
# STGeomFromText() parses the WKT text on the server; its second argument (0)
# is the SRID. Only the shape is inserted: the `id` values in temptable are not
# copied into SpatialTable.
DBI::dbExecute(con, "
insert into SpatialTable (GeomCol1)
select geometry::STGeomFromText(geometry, 0) as GeomCol1
from temptable")
# Remove the scratch table once the shapes have been converted.
DBI::dbExecute(con, "drop table temptable")

现在我们可以查询那些形状:

# Fetch the rows added by the upload; ids 1 and 2 are the two sample shapes
# inserted earlier.
ret2 <- DBI::dbGetQuery(conn = con, statement = "select id, GeomCol2 from SpatialTable where id > 2")
ret2[["id"]] <- as.character(ret2[["id"]])
# Parse the WKT text column back into an sf object.
ret2 <- sf::st_as_sf(ret2, wkt = "GeomCol2")
ret2
# Simple feature collection with 2 features and 1 field
# geometry type:  POLYGON
# dimension:      XY
# bbox:           xmin: -114.8989 ymin: 28.82971 xmax: -81.08946 ymax: 39.23696
# CRS:            NA
#   id                       GeomCol2
# 1  3 POLYGON ((-114.8064 39.2369...
# 2  4 POLYGON ((-81.80644 29.2369...

plot(ret2)

注意数据大小

未经大量测试,我的猜测是:该文本字段(在查询时)在 SQL Server 的意义上属于"大字段"(large);这是一个已知问题,因为当"大字段"不是所选列中的最后一列时,Microsoft 的 ODBC 驱动程序会出错(这是其有意的设计)。因此,请确保几何文本字段位于查询所选列的末尾(不要排在任何非大字段之前)。

参考文献:

这是我想出的解决方法...它基本上与@r2evans 相同。我非常感谢他的帮助。

步骤:

  1. 将实际多边形(geometry 列)转换为 line/polygon 字符串
  2. 从现有数据框中删除旧的几何列
  3. 用 string/character 列(我称之为 geo)将新数据写入数据库
  4. 从数据库读取 (select * from whatever)
  5. 将其转换回 geometry,以便在 R 中使用

代码:

# Converting To String/Character, Writing To DB:
# Remember the CRS first: WKT text does not carry it, so it has to be
# re-attached after the data is read back.
polygon_crs <- sf::st_crs(polygon_sf_df)
# `digits = 15` keeps the coordinates at full precision; the default number of
# digits would shorten e.g. -114.806444 to -114.8064.
polygon_sf_df$geo <- sf::st_as_text(polygon_sf_df$geometry, digits = 15)
# Drop the sf geometry column so a plain data.frame is written to the database.
polygon_sf_df <- sf::st_set_geometry(polygon_sf_df, NULL)

the_table_name <- "test_polygon_table"
DBI::dbWriteTable(
  conn = con,
  name = the_table_name,
  value = polygon_sf_df,
  row.names = FALSE,
  overwrite = TRUE,
  binary = TRUE
)

## Reading back IN:
# dbGetQuery() sends the query, fetches all rows and clears the result set;
# a bare dbSendQuery()/dbFetch() pair leaves the result open until
# dbClearResult() is called.
polygon_df_ret <- DBI::dbGetQuery(
  conn = con,
  statement = paste0("SELECT * FROM dbo.", the_table_name)
)
# Rebuild the geometry column from the WKT text and restore the original CRS.
polygon_df_ret$geometry <- sf::st_as_sfc(polygon_df_ret$geo, crs = polygon_crs)
polygon_df_ret$geo <- NULL
polygon_df_ret <- sf::st_as_sf(polygon_df_ret)