R h2o gsub 空指针

R h2o gsub null pointer

我在 R 中使用 h2o 包并尝试进行一些数据操作,但 sub/gsub 函数存在一些问题。

这是我的代码:

library(h2o)

# Start the H2O cluster with two threads and keep the connection handle
localH2O <- h2o.init(nthreads = 2)

# Build the 10-row sample data set; every column is stored as character
dat1.mini <- structure(
  list(
    id = c(
      "7927751403363142656", "18236986451472797696", "5654946373641778176",
      "14195690822403907584", "1693303484298446848", "1.1362181921561e+19",
      "11694645532962195456", "1221431312630614784", "1987127670789791488",
      "379819848497418688"
    ),
    click = c("0", "0", "0", "0", "0", "0", "0", "1", "0", "0"),
    hour = c(
      "14102118", "14102217", "14102812", "14102912", "14102820",
      "14102401", "14102117", "14102312", "14102301", "14102414"
    ),
    C1 = c(
      "1005", "1005", "1005", "1002", "1005",
      "1005", "1005", "1005", "1005", "1005"
    ),
    banner_pos = c("1", "1", "0", "0", "0", "0", "1", "1", "0", "0"),
    site_id = c(
      "b7e9786d", "e151e245", "85f751fd", "ee4c822c", "85f751fd",
      "85f751fd", "e5c60a05", "e151e245", "1fbe01fe", "1fbe01fe"
    ),
    site_domain = c(
      "b12b9f85", "7e091613", "c4e18dd6", "c4e18dd6", "c4e18dd6",
      "c4e18dd6", "7256c623", "7e091613", "f3845767", "f3845767"
    ),
    site_category = c(
      "f028772b", "f028772b", "50e219e0", "50e219e0", "50e219e0",
      "50e219e0", "f028772b", "f028772b", "28905ebd", "28905ebd"
    ),
    app_id = c(
      "ecad2386", "ecad2386", "685d1c4c", "ecad2386", "92f5800b",
      "f02cb7ab", "ecad2386", "ecad2386", "ecad2386", "ecad2386"
    ),
    app_domain = c(
      "7801e8d9", "7801e8d9", "2347f47a", "7801e8d9", "ae637522",
      "2347f47a", "7801e8d9", "7801e8d9", "7801e8d9", "7801e8d9"
    ),
    app_category = c(
      "07d7df22", "07d7df22", "8ded1f7a", "07d7df22", "0f2161f8",
      "f95efa07", "07d7df22", "07d7df22", "07d7df22", "07d7df22"
    ),
    device_id = c(
      "a99f214a", "a99f214a", "a99f214a", "8374cacf", "a99f214a",
      "8a5908a5", "a99f214a", "a99f214a", "a99f214a", "a99f214a"
    ),
    device_ip = c(
      "3214d61e", "d5623936", "419e166e", "698846d6", "c2d9c2f2",
      "40817190", "edd10fc1", "e4c6e857", "05d3adbe", "6929d972"
    ),
    device_model = c(
      "a0f5f879", "69f9dd0e", "46a414f4", "12edfe21", "4ffd3a7e",
      "04f5b394", "779d90c2", "1f0bc64f", "293291c1", "d787e91b"
    ),
    device_type = c("1", "1", "1", "0", "1", "1", "1", "1", "1", "1"),
    device_conn_type = c("0", "0", "3", "0", "3", "0", "0", "0", "0", "0"),
    C14 = c(
      "16208", "20277", "23224", "17566", "21189",
      "20633", "19771", "17264", "15703", "20108"
    ),
    C15 = rep("320", 10L),
    C16 = rep("50", 10L),
    C17 = c(
      "1800", "2281", "2676", "479", "2424",
      "2374", "2227", "1872", "1722", "2299"
    ),
    C18 = c("3", "3", "0", "3", "1", "3", "0", "3", "0", "2"),
    C19 = c("167", "47", "35", "39", "161", "39", "679", "39", "35", "1327"),
    C20 = c(
      "100077", "100181", "100176", "100074", "100189",
      "-1", "100074", "-1", "-1", "-1"
    ),
    C21 = c("23", "42", "221", "23", "71", "23", "48", "23", "79", "52")
  ),
  # Compact row-name form for rows 1..10, as in the dput() output
  row.names = c(NA, 10L),
  class = "data.frame"
)

# Load data to cluster
dat.mini.hex <- as.h2o(localH2O, dat1.mini)

# Attempt to grab substring of first 6 characters from hour column.
# The backreference has to be written '\\1' in R source: a bare '\1' is parsed
# as the octal escape for the control character U+0001, not as backslash-1.
dat.mini.hex$hr <- h2o.sub('^(.{6}).*$', '\\1', dat.mini.hex$hour)
dat.mini.hex$hr <- h2o.gsub('(.+)..', '\\1', dat.mini.hex$hour)

所有这些尝试都会导致以下错误:

Error in .h2o.__remoteSend(client, .h2o.__PAGE_EXEC2, str = expr) : 
  http://127.0.0.1:54321/2/Exec2.json  returned the following error:
   class java.lang.NullPointerException

错误发生是因为 hour 是数字列。函数 h2o.sub 和 h2o.gsub 不适用于数字数据。

命令 str(dat.mini.hex$hour) 将显示 hour 是一个数字列。

# Display the structure of the hour column of the H2O frame
str(dat.mini.hex$hour)

您可以将 hour 转换为因子并将结果保存在新列 hour2 中。

# Convert hour to a factor and store the result in the new column hour2
dat.mini.hex$hour2 <- as.factor(dat.mini.hex$hour)

现在,您可以使用 h2o.sub。但是,我想你不会喜欢这个结果...

# The replacement is written '\\1' so that R passes a real backslash-1 to
# h2o.sub; a bare '\1' in R source is the control character U+0001.
h2o.sub('^(.{6}).*$', '\\1', dat.mini.hex$hour2)
#   hour2
# 1   \1
# 2   \1
# 3   \1
# 4   \1
# 5   \1
# 6   \1

如您所见,h2o.sub 按字面意思使用 \1,而不是将其视为对第一个匹配组的引用。此行为与基础 R 的 sub 形成对比。

您可以更改正则表达式并将前六个字符之后的字符替换为空字符串。

# Replace everything that follows the first six characters with an empty
# string; the lookbehind (?<=^.{6}) anchors the match right after them
h2o.sub('(?<=^.{6}).*$','', dat.mini.hex$hour2)
#    hour2
# 1 141021
# 2 141022
# 3 141028
# 4 141029
# 5 141028
# 6 141024

在这里,(?<=^.{6}) 是一个肯定的后行断言(positive lookbehind)。它匹配这样一个位置:该位置之前紧邻着字符串开头和前 6 个字符,也就是前 6 个字符之后的位置。