R:构建文本分类器
R: building text Classifier
我的内容集必须根据一些规则进行分类。
示例数据:
1 chin jeffrey hong kong wednesday october global business reporting cc subramanian raghuveer kumar m santhosh antoo ramesh subject request obtain global icis data dear team appreciate can distribute monthly basis latest global icis data ramesh antoo upon availability regards jeffrey chin associate business risk strategy efficiency brse asia international institutional banking australia new zealand banking group limited f three exchange square connaught place central hong kong phone voice net email jeffreychinanzcom brse key business risk interface team within iib providing global support strategic perspectives policy procedures reporting includes risk appetite credit process quality assurance interlock key stakeholders well iib support key projects project glue cacheorion working groups efficiency initiatives business risk forums p please consider environment printing email
2 beren stuart vanuatu monday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors name lim tek kon vanuatu address lini high way port vila efate title relationship manager emerging corporates employee id lan id limtk bsbcc authorising manager beren stuart vanuatu yes read use gpa dashboard business technical reason na
3 kumar m santhosh behalf relationshipbankingfinancesupport friday october g s venkatesh cc global business reporting subject fw please approve qlikview gpa access regards santhosh faunt daniel png wednesday october relationshipbankingfinancesupport cc amet sova subject fw please approve qlikview gpa access hi unable approve excel due macro issues please process amet sova monday october faunt daniel png subject please approve qlikview gpa access hello can please review attached form click line manager approval approve
4 thomson owen tonga thursday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors name hia viliami address head office fakafanua centre maufanga vuna road nukualofa tongatapu tonga nukualofa tongatapu title nfc amu manager employee id lan id hiav bsbcc authorising manager thomson owen tonga yes read use gpa dashboard business technical reason
5 kumar rajesh fiji tuesday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors
以上只是示例行的值;实际上我必须实时处理 500 到 10000 行甚至更多的数据。下面是我从中提取出的、将要使用的词:
> O
$text
$text[[1]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access" "gpa"
$text[[2]]
[1] "report" "qlikview" "gpa" "access" "qlikview" "gpa" "access" "qlikview" "gpa"
[10] "access"
$text[[3]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access"
[10] "gpa"
$text[[4]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access"
[10] "gpa"
$text[[5]]
[1] "report" "qlikview" "gpa" "access" "access" "gpa" "access" "qlikview" "gpa" "access" "access" "gpa" "qlikview" "gpa" "access" "qlikview" "gpa" "access"
$text[[6]]
[1] "report" "qlikview" "access" "access" "report" "qlikview" "access" "access" "gpa"
[10] "qlikview" "access" "access" "qlikview" "access" "access"
$text[[7]]
[1] "report" "report" "access" "access" "report" "report" "report" "report" "report" "report" "data" "data" "report" "access" "report" "report"
$text[[8]]
[1] "report" "qlikview" "gpa" "access" "gpa" "access"
$text[[9]]
[1] "report" "gpa" "access" "access" "gpa" "gpa" "gpa" "gpa" "gpa" "access" "gpa" "gpa" "gpa" "report"
$text[[10]]
[1] "report" "gpa" "gpa" "access" "gpa" "access" "gpa" "access" "gpa" "gpa" "report" "gpa" "gpa" "access" "gpa" "gpa" "gpa" "gpa" "gpa"
现在我需要用 if 条件在此基础上建立规则:如何遍历列表中的每条文本,先检查 "access" 是否存在,再检查 "gpa" 或 "qlikview" 是否存在,若满足则把该行的结果返回为 ACCESS(见示例数据);
否则,如果 "report" 存在,再检查 "pfi" 或 "Regional" 是否存在,若满足则返回 REPORT。
我已将上面的列表转换为如下所示的数据框
代码:
# Convert the list O into a data frame.
# Length of the longest element of O.
maxl <- max(sapply(O,length))
# Take positions 1..maxl of every element of O and bind the pieces column-wise.
# NOTE(review): O seems to hold a single `text` element that is itself a list
# (see the printout of O above), so the result is one list-valued column,
# hence the c(...) entries in the printed data frame; confirm.
out <- do.call(cbind, lapply(O,function(x) x[1:maxl]))
out <- as.data.frame(out)
text
1 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
2 c("report", "qlikview", "gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
3 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
4 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
5 c("report", "qlikview", "gpa", "access", "access", "gpa", "access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
6 c("report", "qlikview", "access", "access", "report", "qlikview", "access", "access", "gpa", "qlikview", "access", "access", "qlikview", "access", "access")
如何去掉此数据框中显示的列表类型 c()?
尝试了一些代码:
#Rule Classifier-----
# First attempt at a rule-based classifier, kept exactly as posted; calling it
# fails with "invalid subscript type 'list'".
# Intended behaviour, per the description above: label a row "Access" when it
# contains "access" together with "gpa" or "qlikview", and "Report" when it
# contains "report" together with "pfi" or "data".
# Problems visible in this version:
#   * `for (i in out)` yields the elements of `out`, not integer positions,
#     so `out[i]` is subscripted with a list.
#   * `x` is never defined inside the function.
#   * The extracted tokens are lower-case ("access", "report"), but the tests
#     compare against "Access" / "Report".
#   * Each inner `if` re-tests the same element `x[j]` that was just matched,
#     so it cannot also equal "gpa"/"qlikview" (or "pfi"/"data").
#   * `return()` leaves the function at the first match instead of producing
#     one label per row.
rule <- function(out)
{
# `i` is an element (column) of `out`, not an index.
for(i in out)
{
# Subscripting `out` with the list `i` is where the reported error arises.
for(j in out[i])
{
if(x[j]=="Access")
{
if(x[j]=="gpa" | x[j]=="qlikview")
{
return("Access")
}
}
else if(x[j]=="Report")
{
if(x[j]=="pfi" | x[j]=="data" )
{
return("Report")
}
}
}
}
}
预期输出:
1 Access
2 Access
3 Access
4 Access
5 Access
6 Access
7 Report/Data
8 Access
9 Access
10 Access
11 Report/Data
12 Report/Data
13 Report/Data
14 Report/Data
dput(O$text[1:10])
list(c("qlikview", "gpa", "access", "gpa", "access", "access",
"qlikview", "gpa", "access", "gpa"), c("report", "qlikview",
"gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa",
"access"), c("qlikview", "gpa", "access", "gpa", "access", "access",
"qlikview", "gpa", "access", "gpa"), c("qlikview", "gpa", "access",
"gpa", "access", "access", "qlikview", "gpa", "access", "gpa"
), c("report", "qlikview", "gpa", "access", "access", "gpa",
"access", "qlikview", "gpa", "access", "access", "gpa", "qlikview",
"gpa", "access", "qlikview", "gpa", "access"), c("report", "qlikview",
"access", "access", "report", "qlikview", "access", "access",
"gpa", "qlikview", "access", "access", "qlikview", "access",
"access"), c("report", "report", "access", "access", "report",
"report", "report", "report", "report", "report", "data", "data",
"report", "access", "report", "report"), c("report", "qlikview",
"gpa", "access", "gpa", "access"), c("report", "gpa", "access",
"access", "gpa", "gpa", "gpa", "gpa", "gpa", "access", "gpa",
"gpa", "gpa", "report"), c("report", "gpa", "gpa", "access",
"gpa", "access", "gpa", "access", "gpa", "gpa", "report", "gpa",
"gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa"))
# Run the attempted classifier on the converted data frame.
rule(out)
# This call fails with: Error in `[.default`(out, i) : invalid subscript type 'list'
我知道这很天真,但我是新手,如果我哪里出错了请纠正我。
您似乎假设 for ... in 循环会用整数索引进行迭代。实际上它直接遍历对象中的各个元素,因此当 out 的列是列表时,i 本身就是一个列表,而您却把它当作下标使用(out[i]),于是出现了上面的错误。查看 ?lapply 可以了解处理列表的更好方法。
# Sample input: one character vector of extracted keywords per message.
text <- list(
  # 1
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 2
  c("report", "qlikview", "gpa", "access", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"),
  # 3
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 4
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 5
  c("report", "qlikview", "gpa", "access", "access", "gpa", "access",
    "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"),
  # 6
  c("report", "qlikview", "access", "access", "report", "qlikview", "access",
    "access", "gpa", "qlikview", "access", "access", "qlikview", "access",
    "access"),
  # 7
  c("report", "report", "access", "access", "report", "report", "report",
    "report", "report", "report", "data", "data", "report", "access", "report",
    "report"),
  # 8
  c("report", "qlikview", "gpa", "access", "gpa", "access"),
  # 9
  c("report", "gpa", "access", "access", "gpa", "gpa", "gpa", "gpa", "gpa",
    "access", "gpa", "gpa", "gpa", "report"),
  # 10
  c("report", "gpa", "gpa", "access", "gpa", "access", "gpa", "access", "gpa",
    "gpa", "report", "gpa", "gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa")
)

# Wrap the list in a data frame whose single column `text` holds the vectors.
O <- as.data.frame(cbind(text))
# Label every entry of `out$text` by keyword rules.
#
# out: an object whose `text` element holds one character vector of tokens
#      per row (for example the data frame O built above).
# Returns a character vector with one label per entry of `out$text`:
#   "Access"      - "access" is present together with "gpa" or "qlikview"
#   "Report/Data" - otherwise, "report" is present together with "pfi" or "data"
#   "NA"          - neither rule matches (the literal string, not a missing value)
rule <- function(out) {
  classify <- function(tokens) {
    # TRUE when at least one of `words` occurs among the row's tokens.
    has_any <- function(words) any(words %in% tokens)

    # The access rule is checked first, so it wins when both rules match.
    if (has_any("access") && has_any(c("gpa", "qlikview"))) {
      return("Access")
    }
    if (has_any("report") && has_any(c("pfi", "data"))) {
      return("Report/Data")
    }
    "NA"
  }

  vapply(out$text, classify, character(1))
}
# Classify every row of O; the result holds one label per row.
rule(O)
# [1] "Access" "Access" "Access" "Access" "Access" "Access" "Report/Data" "Access"
# [9] "Access" "Access"
我的内容集必须根据一些规则进行分类。
示例数据:
1 chin jeffrey hong kong wednesday october global business reporting cc subramanian raghuveer kumar m santhosh antoo ramesh subject request obtain global icis data dear team appreciate can distribute monthly basis latest global icis data ramesh antoo upon availability regards jeffrey chin associate business risk strategy efficiency brse asia international institutional banking australia new zealand banking group limited f three exchange square connaught place central hong kong phone voice net email jeffreychinanzcom brse key business risk interface team within iib providing global support strategic perspectives policy procedures reporting includes risk appetite credit process quality assurance interlock key stakeholders well iib support key projects project glue cacheorion working groups efficiency initiatives business risk forums p please consider environment printing email
2 beren stuart vanuatu monday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors name lim tek kon vanuatu address lini high way port vila efate title relationship manager emerging corporates employee id lan id limtk bsbcc authorising manager beren stuart vanuatu yes read use gpa dashboard business technical reason na
3 kumar m santhosh behalf relationshipbankingfinancesupport friday october g s venkatesh cc global business reporting subject fw please approve qlikview gpa access regards santhosh faunt daniel png wednesday october relationshipbankingfinancesupport cc amet sova subject fw please approve qlikview gpa access hi unable approve excel due macro issues please process amet sova monday october faunt daniel png subject please approve qlikview gpa access hello can please review attached form click line manager approval approve
4 thomson owen tonga thursday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors name hia viliami address head office fakafanua centre maufanga vuna road nukualofa tongatapu tonga nukualofa tongatapu title nfc amu manager employee id lan id hiav bsbcc authorising manager thomson owen tonga yes read use gpa dashboard business technical reason
5 kumar rajesh fiji tuesday october g s venkatesh ramesh sandeep talanki h r nagaraj subject please approve qlikview gpa access hi please process following form gpa access please email requestor line manager access granted raj can please add given user qlikview workgroup gpa access form requestors
以上只是示例行的值;实际上我必须实时处理 500 到 10000 行甚至更多的数据。下面是我从中提取出的、将要使用的词:
> O
$text
$text[[1]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access" "gpa"
$text[[2]]
[1] "report" "qlikview" "gpa" "access" "qlikview" "gpa" "access" "qlikview" "gpa"
[10] "access"
$text[[3]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access"
[10] "gpa"
$text[[4]]
[1] "qlikview" "gpa" "access" "gpa" "access" "access" "qlikview" "gpa" "access"
[10] "gpa"
$text[[5]]
[1] "report" "qlikview" "gpa" "access" "access" "gpa" "access" "qlikview" "gpa" "access" "access" "gpa" "qlikview" "gpa" "access" "qlikview" "gpa" "access"
$text[[6]]
[1] "report" "qlikview" "access" "access" "report" "qlikview" "access" "access" "gpa"
[10] "qlikview" "access" "access" "qlikview" "access" "access"
$text[[7]]
[1] "report" "report" "access" "access" "report" "report" "report" "report" "report" "report" "data" "data" "report" "access" "report" "report"
$text[[8]]
[1] "report" "qlikview" "gpa" "access" "gpa" "access"
$text[[9]]
[1] "report" "gpa" "access" "access" "gpa" "gpa" "gpa" "gpa" "gpa" "access" "gpa" "gpa" "gpa" "report"
$text[[10]]
[1] "report" "gpa" "gpa" "access" "gpa" "access" "gpa" "access" "gpa" "gpa" "report" "gpa" "gpa" "access" "gpa" "gpa" "gpa" "gpa" "gpa"
现在我需要用 if 条件在此基础上建立规则:如何遍历列表中的每条文本,先检查 "access" 是否存在,再检查 "gpa" 或 "qlikview" 是否存在,若满足则把该行的结果返回为 ACCESS(见示例数据);否则,如果 "report" 存在,再检查 "pfi" 或 "Regional" 是否存在,若满足则返回 REPORT。
我已将上面的列表转换为如下所示的数据框
代码:
# Convert the list O into a data frame.
# Length of the longest element of O.
maxl <- max(sapply(O,length))
# Take positions 1..maxl of every element of O and bind the pieces column-wise.
# NOTE(review): O seems to hold a single `text` element that is itself a list
# (see the printout of O above), so the result is one list-valued column,
# hence the c(...) entries in the printed data frame; confirm.
out <- do.call(cbind, lapply(O,function(x) x[1:maxl]))
out <- as.data.frame(out)
text
1 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
2 c("report", "qlikview", "gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
3 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
4 c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa", "access", "gpa")
5 c("report", "qlikview", "gpa", "access", "access", "gpa", "access", "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa", "access", "qlikview", "gpa", "access")
6 c("report", "qlikview", "access", "access", "report", "qlikview", "access", "access", "gpa", "qlikview", "access", "access", "qlikview", "access", "access")
如何去掉此数据框中显示的列表类型 c()?
尝试了一些代码:
#Rule Classifier-----
# First attempt at a rule-based classifier, kept exactly as posted; calling it
# fails with "invalid subscript type 'list'".
# Intended behaviour, per the description above: label a row "Access" when it
# contains "access" together with "gpa" or "qlikview", and "Report" when it
# contains "report" together with "pfi" or "data".
# Problems visible in this version:
#   * `for (i in out)` yields the elements of `out`, not integer positions,
#     so `out[i]` is subscripted with a list.
#   * `x` is never defined inside the function.
#   * The extracted tokens are lower-case ("access", "report"), but the tests
#     compare against "Access" / "Report".
#   * Each inner `if` re-tests the same element `x[j]` that was just matched,
#     so it cannot also equal "gpa"/"qlikview" (or "pfi"/"data").
#   * `return()` leaves the function at the first match instead of producing
#     one label per row.
rule <- function(out)
{
# `i` is an element (column) of `out`, not an index.
for(i in out)
{
# Subscripting `out` with the list `i` is where the reported error arises.
for(j in out[i])
{
if(x[j]=="Access")
{
if(x[j]=="gpa" | x[j]=="qlikview")
{
return("Access")
}
}
else if(x[j]=="Report")
{
if(x[j]=="pfi" | x[j]=="data" )
{
return("Report")
}
}
}
}
}
预期输出:
1 Access
2 Access
3 Access
4 Access
5 Access
6 Access
7 Report/Data
8 Access
9 Access
10 Access
11 Report/Data
12 Report/Data
13 Report/Data
14 Report/Data
dput(O$text[1:10])
list(c("qlikview", "gpa", "access", "gpa", "access", "access",
"qlikview", "gpa", "access", "gpa"), c("report", "qlikview",
"gpa", "access", "qlikview", "gpa", "access", "qlikview", "gpa",
"access"), c("qlikview", "gpa", "access", "gpa", "access", "access",
"qlikview", "gpa", "access", "gpa"), c("qlikview", "gpa", "access",
"gpa", "access", "access", "qlikview", "gpa", "access", "gpa"
), c("report", "qlikview", "gpa", "access", "access", "gpa",
"access", "qlikview", "gpa", "access", "access", "gpa", "qlikview",
"gpa", "access", "qlikview", "gpa", "access"), c("report", "qlikview",
"access", "access", "report", "qlikview", "access", "access",
"gpa", "qlikview", "access", "access", "qlikview", "access",
"access"), c("report", "report", "access", "access", "report",
"report", "report", "report", "report", "report", "data", "data",
"report", "access", "report", "report"), c("report", "qlikview",
"gpa", "access", "gpa", "access"), c("report", "gpa", "access",
"access", "gpa", "gpa", "gpa", "gpa", "gpa", "access", "gpa",
"gpa", "gpa", "report"), c("report", "gpa", "gpa", "access",
"gpa", "access", "gpa", "access", "gpa", "gpa", "report", "gpa",
"gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa"))
# Run the attempted classifier on the converted data frame.
rule(out)
# This call fails with: Error in `[.default`(out, i) : invalid subscript type 'list'
我知道这很天真,但我是新手,如果我哪里出错了请纠正我。
您似乎假设 for ... in 循环会用整数索引进行迭代。实际上它直接遍历对象中的各个元素,因此当 out 的列是列表时,i 本身就是一个列表,而您却把它当作下标使用(out[i]),于是出现了上面的错误。查看 ?lapply 可以了解处理列表的更好方法。
# Sample input: one character vector of extracted keywords per message.
text <- list(
  # 1
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 2
  c("report", "qlikview", "gpa", "access", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"),
  # 3
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 4
  c("qlikview", "gpa", "access", "gpa", "access", "access", "qlikview", "gpa",
    "access", "gpa"),
  # 5
  c("report", "qlikview", "gpa", "access", "access", "gpa", "access",
    "qlikview", "gpa", "access", "access", "gpa", "qlikview", "gpa", "access",
    "qlikview", "gpa", "access"),
  # 6
  c("report", "qlikview", "access", "access", "report", "qlikview", "access",
    "access", "gpa", "qlikview", "access", "access", "qlikview", "access",
    "access"),
  # 7
  c("report", "report", "access", "access", "report", "report", "report",
    "report", "report", "report", "data", "data", "report", "access", "report",
    "report"),
  # 8
  c("report", "qlikview", "gpa", "access", "gpa", "access"),
  # 9
  c("report", "gpa", "access", "access", "gpa", "gpa", "gpa", "gpa", "gpa",
    "access", "gpa", "gpa", "gpa", "report"),
  # 10
  c("report", "gpa", "gpa", "access", "gpa", "access", "gpa", "access", "gpa",
    "gpa", "report", "gpa", "gpa", "access", "gpa", "gpa", "gpa", "gpa", "gpa")
)

# Wrap the list in a data frame whose single column `text` holds the vectors.
O <- as.data.frame(cbind(text))
# Label every entry of `out$text` by keyword rules.
#
# out: an object whose `text` element holds one character vector of tokens
#      per row (for example the data frame O built above).
# Returns a character vector with one label per entry of `out$text`:
#   "Access"      - "access" is present together with "gpa" or "qlikview"
#   "Report/Data" - otherwise, "report" is present together with "pfi" or "data"
#   "NA"          - neither rule matches (the literal string, not a missing value)
rule <- function(out) {
  classify <- function(tokens) {
    # TRUE when at least one of `words` occurs among the row's tokens.
    has_any <- function(words) any(words %in% tokens)

    # The access rule is checked first, so it wins when both rules match.
    if (has_any("access") && has_any(c("gpa", "qlikview"))) {
      return("Access")
    }
    if (has_any("report") && has_any(c("pfi", "data"))) {
      return("Report/Data")
    }
    "NA"
  }

  vapply(out$text, classify, character(1))
}
# Classify every row of O; the result holds one label per row.
rule(O)
# [1] "Access" "Access" "Access" "Access" "Access" "Access" "Report/Data" "Access"
# [9] "Access" "Access"