Parse Error: "Trailing Garbage" while trying to parse JSON column in data frame
Parse Error: "Trailing Garbage" while trying to parse JSON column in data frame
我有一个类似下面这样的日志文件。它是一个文本文档,内容如下:
Id,Date,Level,Message
35054,2016-06-17 19:29:43 +0000,INFO,"{
""id"": -2,
""ipAddress"": ""100.100.100.100"",
""howYouHearAboutUs"": null,
""isInterestedInOffer"": true,
""incomeRange"": 60000,
""isEmailConfirmed"": false
}"
35055,2016-06-17 19:36:38 +0000,INFO,"{
""id"": -1,
""firstName"": ""John"",
""lastName"": ""Smith"",
""email"": ""john.smith@gmail.com"",
""city"": ""Smalltown"",
""incomeRange"": 1,
""birthDate"": ""1999-12-10T05:00:00Z"",
""password"": ""*********"",
""agreeToTermsOfUse"": true,
""howYouHearAboutUs"": ""Radio"",
""isInterestedInOffer"": false
}"
35059,2016-07-19 19:52:08 +0000,INFO,"{
""id"": -3,
""visitUrl"": ""https://www.website.com/?purpose=X"",
""ipAddress"": ""100.200.300.400"",
""howYouHearAboutUs"": null,
""isInterestedInOffer"": true,
""incomeRange"": 100000,
""isEmailConfirmed"": true,
""isIdentityConfirmed"": false,
""agreeToTermsOfUse"": true,
""validationResults"": null
}"
我正在尝试通过以下方式解析 Message
列中的 JSON:
library(readr)
library(jsonlite)
df <- read_csv("log_file_from_above.csv")
fromJSON(as.character(df$Message))
但是,我遇到了以下错误:
Error: parse error: trailing garbage
"isEmailConfirmed": false } { "id": -1, "firstName":
(right here) ------^
我怎样才能消除这个 “trailing garbage”(尾部多余内容)错误?
fromJSON()
并不会对字符向量中的每个元素逐一“应用”(apply)解析,而是试图把整个向量一次性转换为数据框。你可以试试
purrr::map(df$Message, jsonlite::fromJSON)
也可以使用 @Abdou 提供的方法,或者
jsonlite::stream_in(textConnection(gsub("\n", "", df$Message)))
后两种方法会生成数据框;第一种方法会生成一个列表,您可以把它作为新的一列添加到数据框中。
您可以将最后一种方法与 dplyr::bind_cols
一起使用,以创建包含所有数据的新数据框:
dplyr::bind_cols(df[,1:3],
jsonlite::stream_in(textConnection(gsub("\n", "", df$Message))))
@Abdou 还建议了一个几乎完全使用 base R 的解决方案:
cbind(df, do.call(plyr::rbind.fill, lapply(paste0("[",df$Message,"]"), function(x) jsonlite::fromJSON(x))))
完整的工作流程:
library(dplyr)
library(jsonlite)
df <- read.table("http://pastebin.com/raw/MMPMwNZv",
quote='"', sep=",", stringsAsFactors=FALSE, header=TRUE)
bind_cols(df[,1:3], stream_in(textConnection(gsub("\n", "", df$Message)))) %>%
glimpse()
##
Found 3 records...
Imported 3 records. Simplifying into dataframe...
## Observations: 3
## Variables: 19
## $ Id <int> 35054, 35055, 35059
## $ Date <chr> "2016-06-17 19:29:43 +0000", "2016-06-17 1...
## $ Level <chr> "INFO", "INFO", "INFO"
## $ id <int> -2, -1, -3
## $ ipAddress <chr> "100.100.100.100", NA, "100.200.300.400"
## $ howYouHearAboutUs <chr> NA, "Radio", NA
## $ isInterestedInOffer <lgl> TRUE, FALSE, TRUE
## $ incomeRange <int> 60000, 1, 100000
## $ isEmailConfirmed <lgl> FALSE, NA, TRUE
## $ firstName <chr> NA, "John", NA
## $ lastName <chr> NA, "Smith", NA
## $ email <chr> NA, "john.smith@gmail.com", NA
## $ city <chr> NA, "Smalltown", NA
## $ birthDate <chr> NA, "1999-12-10T05:00:00Z", NA
## $ password <chr> NA, "*********", NA
## $ agreeToTermsOfUse <lgl> NA, TRUE, TRUE
## $ visitUrl <chr> NA, NA, "https://www.website.com/?purpose=X"
## $ isIdentityConfirmed <lgl> NA, NA, FALSE
## $ validationResults <lgl> NA, NA, NA
我有一个类似下面这样的日志文件。它是一个文本文档,内容如下:
Id,Date,Level,Message
35054,2016-06-17 19:29:43 +0000,INFO,"{
""id"": -2,
""ipAddress"": ""100.100.100.100"",
""howYouHearAboutUs"": null,
""isInterestedInOffer"": true,
""incomeRange"": 60000,
""isEmailConfirmed"": false
}"
35055,2016-06-17 19:36:38 +0000,INFO,"{
""id"": -1,
""firstName"": ""John"",
""lastName"": ""Smith"",
""email"": ""john.smith@gmail.com"",
""city"": ""Smalltown"",
""incomeRange"": 1,
""birthDate"": ""1999-12-10T05:00:00Z"",
""password"": ""*********"",
""agreeToTermsOfUse"": true,
""howYouHearAboutUs"": ""Radio"",
""isInterestedInOffer"": false
}"
35059,2016-07-19 19:52:08 +0000,INFO,"{
""id"": -3,
""visitUrl"": ""https://www.website.com/?purpose=X"",
""ipAddress"": ""100.200.300.400"",
""howYouHearAboutUs"": null,
""isInterestedInOffer"": true,
""incomeRange"": 100000,
""isEmailConfirmed"": true,
""isIdentityConfirmed"": false,
""agreeToTermsOfUse"": true,
""validationResults"": null
}"
我正在尝试通过以下方式解析 Message
列中的 JSON:
library(readr)
library(jsonlite)
df <- read_csv("log_file_from_above.csv")
fromJSON(as.character(df$Message))
但是,我遇到了以下错误:
Error: parse error: trailing garbage
"isEmailConfirmed": false } { "id": -1, "firstName":
(right here) ------^
我怎样才能消除这个 “trailing garbage”(尾部多余内容)错误?
fromJSON()
并不会对字符向量中的每个元素逐一“应用”(apply)解析,而是试图把整个向量一次性转换为数据框。你可以试试
purrr::map(df$Message, jsonlite::fromJSON)
也可以使用 @Abdou 提供的方法,或者
jsonlite::stream_in(textConnection(gsub("\n", "", df$Message)))
后两种方法会生成数据框;第一种方法会生成一个列表,您可以把它作为新的一列添加到数据框中。
您可以将最后一种方法与 dplyr::bind_cols
一起使用,以创建包含所有数据的新数据框:
dplyr::bind_cols(df[,1:3],
jsonlite::stream_in(textConnection(gsub("\n", "", df$Message))))
@Abdou 还建议了一个几乎完全使用 base R 的解决方案:
cbind(df, do.call(plyr::rbind.fill, lapply(paste0("[",df$Message,"]"), function(x) jsonlite::fromJSON(x))))
完整的工作流程:
library(dplyr)
library(jsonlite)
df <- read.table("http://pastebin.com/raw/MMPMwNZv",
quote='"', sep=",", stringsAsFactors=FALSE, header=TRUE)
bind_cols(df[,1:3], stream_in(textConnection(gsub("\n", "", df$Message)))) %>%
glimpse()
##
Found 3 records...
Imported 3 records. Simplifying into dataframe...
## Observations: 3
## Variables: 19
## $ Id <int> 35054, 35055, 35059
## $ Date <chr> "2016-06-17 19:29:43 +0000", "2016-06-17 1...
## $ Level <chr> "INFO", "INFO", "INFO"
## $ id <int> -2, -1, -3
## $ ipAddress <chr> "100.100.100.100", NA, "100.200.300.400"
## $ howYouHearAboutUs <chr> NA, "Radio", NA
## $ isInterestedInOffer <lgl> TRUE, FALSE, TRUE
## $ incomeRange <int> 60000, 1, 100000
## $ isEmailConfirmed <lgl> FALSE, NA, TRUE
## $ firstName <chr> NA, "John", NA
## $ lastName <chr> NA, "Smith", NA
## $ email <chr> NA, "john.smith@gmail.com", NA
## $ city <chr> NA, "Smalltown", NA
## $ birthDate <chr> NA, "1999-12-10T05:00:00Z", NA
## $ password <chr> NA, "*********", NA
## $ agreeToTermsOfUse <lgl> NA, TRUE, TRUE
## $ visitUrl <chr> NA, NA, "https://www.website.com/?purpose=X"
## $ isIdentityConfirmed <lgl> NA, NA, FALSE
## $ validationResults <lgl> NA, NA, NA