如何使用 SparkR 或 SparklyR 解析日志?
How can I parse logs with SparkR or SparklyR?
我正在尝试用 SparkR/SparklyR 尽可能地解析 NASA-HTTP 日志。我无法让它工作。
NASA-HTTP 日志如下所示
ix-stp-fl2-19.ix.netcom.com - - [03/Aug/1995:23:03:09 -0400] "GET /images/faq.gif HTTP/1.0" 200 263
slip183-1.kw.jp.ibm.net - - [04/Aug/1995:18:42:17 -0400] "GET /shuttle/missions/sts-70/images/DSC-95EC-0001.gif HTTP/1.0" 200 107133
piweba4y.prodigy.com - - [05/Aug/1995:19:17:41 -0400] "GET /icons/menu.xbm HTTP/1.0" 200 527
我已经能够使用 Pyspark 使用 regexp_extract 来做到这一点,如下所示
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
regexp_extract('value', r'^.*\[(\d\d\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
....
我正在尝试用 SparkR/SparklyR 做这件事,但没有取得任何进展
# Initiate a SparkR session
sparkR.session()
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext, "/FileStore/tables/NASA_access_log*.gz")
尝试了一些东西但没有用
sparkR.session()
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext, "/FileStore/tables/NASA_access_log*.gz")
dim(df)
df %>% select(df,regexp_extract('\\S'),1)
请告诉我如何在数据框上使用正则表达式。
在互联网上搜索并尝试了一天半后,我能够解析 SparkR 和 sparklyR 中的日志
SparkR
# Initiate a SparkR session
sparkR.session()
sc <- sparkR.session()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext,"/FileStore/tables/NASA_access_log*.gz")
a=df %>%
withColumn('regex1', regexp_extract(df$value, '^(\\S+)', 1)) %>%
withColumn('regex2', regexp_extract(df$value, "((\\S+ -\\d{4}))", 2)) %>%
withColumn('regex3', regexp_extract(df$value, '("\\w+\\s+([^\\s]+)\\s+HTTP.*")', 2)) %>%
withColumn('regex4', regexp_extract(df$value, '(^.*"\\s+([^\\s]+))', 2)) %>%
withColumn('regex5', regexp_extract(df$value, '(^.*\\s+(\\d+)$)', 2))
head(SparkR::collect(a))
regex1 regex2
1 199.72.81.55 [01/Jul/1995:00:00:01 -0400
2 unicomp6.unicomp.net [01/Jul/1995:00:00:06 -0400
3 199.120.110.21 [01/Jul/1995:00:00:09 -0400
4 burger.letters.com [01/Jul/1995:00:00:11 -0400
5 199.120.110.21 [01/Jul/1995:00:00:11 -0400
6 burger.letters.com [01/Jul/1995:00:00:12 -0400
regex3 regex4 regex5
1 /history/apollo/ 200 6245
2 /shuttle/countdown/ 200 3985
3 /shuttle/missions/sts-73/mission-sts-73.html 200 4085
4 /shuttle/countdown/liftoff.html 304 0
5 /shuttle/missions/sts-73/sts-73-patch-small.gif 200 4179
6 /images/NASA-logosmall.gif 304 0
SparklyR
library(sparklyr)
library(dplyr)
library(stringr)
#sc <- spark_connect(master = "local", version = "2.1.0")
sc <- spark_connect(method = "databricks")
sdf <-spark_read_text(sc, name="df", path = "/FileStore/tables/NASA_access_log*.gz")
sdf <- sdf %>% mutate(regex = regexp_extract(line, '^(\\S+)',1)) %>%
mutate(regex1 = regexp_extract(line, '((\\S+ -\\d{4}))',2)) %>%
mutate(regex2 = regexp_extract(line, '(\\"\\w+\\s+([^\\s]+)\\s+HTTP.*")',2)) %>%
mutate(regex3 = regexp_extract(line, '(^.*"\\s+([^\\s]+))',2)) %>%
mutate(regex4 = regexp_extract(line, '(^.*\\s+(\\d+)$)',2))
sdf
line regex regex1 regex2 regex3 regex4
1 "199.72.81.55 - - [01/J… 199.72.8… [01/Jul/19… /history/apollo/ 200 6245
2 "unicomp6.unicomp.net -… unicomp6… [01/Jul/19… /shuttle/countd… 200 3985
3 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4085
4 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 304 0
5 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4179
6 "burger.letters.com - -… burger.l… [01/Jul/19… /images/NASA-lo… 304 0
7 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 200 0
我正在尝试用 SparkR/SparklyR 尽可能地解析 NASA-HTTP 日志。我无法让它工作。
NASA-HTTP 日志如下所示
ix-stp-fl2-19.ix.netcom.com - - [03/Aug/1995:23:03:09 -0400] "GET /images/faq.gif HTTP/1.0" 200 263
slip183-1.kw.jp.ibm.net - - [04/Aug/1995:18:42:17 -0400] "GET /shuttle/missions/sts-70/images/DSC-95EC-0001.gif HTTP/1.0" 200 107133
piweba4y.prodigy.com - - [05/Aug/1995:19:17:41 -0400] "GET /icons/menu.xbm HTTP/1.0" 200 527
我已经能够使用 Pyspark 使用 regexp_extract 来做到这一点,如下所示
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
regexp_extract('value', r'^.*\[(\d\d\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
....
我正在尝试用 SparkR/SparklyR 做这件事,但没有取得任何进展
# Initiate a SparkR session
sparkR.session()
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext, "/FileStore/tables/NASA_access_log*.gz")
尝试了一些东西但没有用
sparkR.session()
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext, "/FileStore/tables/NASA_access_log*.gz")
dim(df)
df %>% select(df,regexp_extract('\\S'),1)
请告诉我如何在数据框上使用正则表达式。
在互联网上搜索并尝试了一天半后,我能够解析 SparkR 和 sparklyR 中的日志
SparkR
# Initiate a SparkR session
sparkR.session()
sc <- sparkR.session()
sqlContext <- sparkRSQL.init(sc)
df <- read.text(sqlContext,"/FileStore/tables/NASA_access_log*.gz")
a=df %>%
withColumn('regex1', regexp_extract(df$value, '^(\\S+)', 1)) %>%
withColumn('regex2', regexp_extract(df$value, "((\\S+ -\\d{4}))", 2)) %>%
withColumn('regex3', regexp_extract(df$value, '("\\w+\\s+([^\\s]+)\\s+HTTP.*")', 2)) %>%
withColumn('regex4', regexp_extract(df$value, '(^.*"\\s+([^\\s]+))', 2)) %>%
withColumn('regex5', regexp_extract(df$value, '(^.*\\s+(\\d+)$)', 2))
head(SparkR::collect(a))
regex1 regex2
1 199.72.81.55 [01/Jul/1995:00:00:01 -0400
2 unicomp6.unicomp.net [01/Jul/1995:00:00:06 -0400
3 199.120.110.21 [01/Jul/1995:00:00:09 -0400
4 burger.letters.com [01/Jul/1995:00:00:11 -0400
5 199.120.110.21 [01/Jul/1995:00:00:11 -0400
6 burger.letters.com [01/Jul/1995:00:00:12 -0400
regex3 regex4 regex5
1 /history/apollo/ 200 6245
2 /shuttle/countdown/ 200 3985
3 /shuttle/missions/sts-73/mission-sts-73.html 200 4085
4 /shuttle/countdown/liftoff.html 304 0
5 /shuttle/missions/sts-73/sts-73-patch-small.gif 200 4179
6 /images/NASA-logosmall.gif 304 0
SparklyR
library(sparklyr)
library(dplyr)
library(stringr)
#sc <- spark_connect(master = "local", version = "2.1.0")
sc <- spark_connect(method = "databricks")
sdf <-spark_read_text(sc, name="df", path = "/FileStore/tables/NASA_access_log*.gz")
sdf <- sdf %>% mutate(regex = regexp_extract(line, '^(\\S+)',1)) %>%
mutate(regex1 = regexp_extract(line, '((\\S+ -\\d{4}))',2)) %>%
mutate(regex2 = regexp_extract(line, '(\\"\\w+\\s+([^\\s]+)\\s+HTTP.*")',2)) %>%
mutate(regex3 = regexp_extract(line, '(^.*"\\s+([^\\s]+))',2)) %>%
mutate(regex4 = regexp_extract(line, '(^.*\\s+(\\d+)$)',2))
sdf
line regex regex1 regex2 regex3 regex4
1 "199.72.81.55 - - [01/J… 199.72.8… [01/Jul/19… /history/apollo/ 200 6245
2 "unicomp6.unicomp.net -… unicomp6… [01/Jul/19… /shuttle/countd… 200 3985
3 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4085
4 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 304 0
5 "199.120.110.21 - - [01… 199.120.… [01/Jul/19… /shuttle/missio… 200 4179
6 "burger.letters.com - -… burger.l… [01/Jul/19… /images/NASA-lo… 304 0
7 "burger.letters.com - -… burger.l… [01/Jul/19… /shuttle/countd… 200 0