如何使用tapply匹配特定条件
How to use tapply to match specific condition
我有一个事故数据集,其中包含已报告的事故记录。我想用 tapply 函数得到在 "Thursday"(星期四)报告的事故总数。但是,它并没有返回该特定日期报告的事故数量,而是显示了数据集的总行数。我使用的 tapply 调用如下:
tapply(myfinal$VEHICLE_COUNT,myfinal$DAY_OF_WEEK=='THURSDAY',length)
我的示例数据集如下:
> dput(tail(myfinal,5))
structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")
对于如何解决这个问题,有什么建议吗?提前致谢!
这里用tapply
真的没有意义!
方法一
使用dplyr
:
# Load the tidyverse (provides the dplyr verbs and the %>% pipe). library()
# stops with an error if the package is missing, whereas require() only
# returns FALSE and lets the next statement fail with a less clear message.
library(tidyverse)

# Trim the padded weekday strings, keep the "MONDAY" rows, and count them.
df %>%
  filter(trimws(DAY_OF_WEEK) == "MONDAY") %>%
  summarise(count = n())
# count
#1 5
方法二
在基础 R 中,使用 subset
和 table
# Keep the rows whose trimmed weekday is "MONDAY", pull out the weekday
# column as a plain vector, and tabulate it.
table(
  subset(df, trimws(DAY_OF_WEEK) == "MONDAY", select = DAY_OF_WEEK)[[1L]]
)
#MONDAY
# 5
我在这里使用了 "MONDAY",因为您没有包含 DAY_OF_WEEK = "THURSDAY"
的条目。
示例数据
# Sample data: the five-row data frame from the question's
# dput(tail(myfinal, 5)) output, stored as `df`.
# Note that the DAY_OF_WEEK values carry trailing whitespace ("MONDAY "),
# so they must be trimmed before an exact string comparison.
df <- structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")
基础 R 的解决方案是:筛选出 DAY_OF_WEEK 为 "THURSDAY" 的子集,然后返回其行数:
# Count the rows whose weekday is "THURSDAY". trimws() strips the trailing
# padding present in DAY_OF_WEEK (e.g. "MONDAY "), which would otherwise make
# the exact comparison fail; which() drops NA comparisons so rows with a
# missing weekday are not counted as matches.
nrow(df[which(trimws(df$DAY_OF_WEEK) == "THURSDAY"), ])
如果出于某种原因只能使用 tapply,那么根据 Maurits 的回答,您应该可以这样做:
# Split VEHICLE_COUNT by whether the trimmed weekday equals "THURSDAY" and
# report the number of entries in each group (FALSE / TRUE).
tapply(
  X = myfinal$VEHICLE_COUNT,
  INDEX = trimws(myfinal$DAY_OF_WEEK) == "THURSDAY",
  FUN = length
)
或类似的写法。DAY_OF_WEEK 变量中的字符串末尾似乎有很多多余的空格。您要么需要先把它们删除(使用 trimws),要么修改用于比较的字符串,使其包含这些空格(例如 myfinal$DAY_OF_WEEK=="THURSDAY ")。使用比较运算符时,只有两个字符串逐个字符完全相同,R 才会认为它们相等,因此任一字符串中多出的空格都会导致匹配失败。
我有一个事故数据集,其中包含已报告的事故记录。我想用 tapply 函数得到在 "Thursday"(星期四)报告的事故总数。但是,它并没有返回该特定日期报告的事故数量,而是显示了数据集的总行数。我使用的 tapply 调用如下:
tapply(myfinal$VEHICLE_COUNT,myfinal$DAY_OF_WEEK=='THURSDAY',length)
我的示例数据集如下:
> dput(tail(myfinal,5))
structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")
对于如何解决这个问题,有什么建议吗?提前致谢!
这里用tapply
真的没有意义!
方法一
使用dplyr
:
# Load the tidyverse (provides the dplyr verbs and the %>% pipe). library()
# stops with an error if the package is missing, whereas require() only
# returns FALSE and lets the next statement fail with a less clear message.
library(tidyverse)

# Trim the padded weekday strings, keep the "MONDAY" rows, and count them.
df %>%
  filter(trimws(DAY_OF_WEEK) == "MONDAY") %>%
  summarise(count = n())
# count
#1 5
方法二
在基础 R 中,使用 subset
和 table
# Keep the rows whose trimmed weekday is "MONDAY", pull out the weekday
# column as a plain vector, and tabulate it.
table(
  subset(df, trimws(DAY_OF_WEEK) == "MONDAY", select = DAY_OF_WEEK)[[1L]]
)
#MONDAY
# 5
我在这里使用了 "MONDAY",因为您没有包含 DAY_OF_WEEK = "THURSDAY"
的条目。
示例数据
# Sample data: the five-row data frame from the question's
# dput(tail(myfinal, 5)) output, stored as `df`.
# Note that the DAY_OF_WEEK values carry trailing whitespace ("MONDAY "),
# so they must be trimmed before an exact string comparison.
df <- structure(list(CASE_NUMBER = c("1251045636", "1251045630", "1251045591",
"1251045574", "1250010434"), BARRACK = c("Frederick", "Frederick",
"Frederick", "Frederick", "Jessup"), ACC_DATE = c("2012-12-31T00:00:00",
"2012-12-31T00:00:00", "2012-12-31T00:00:00", "2012-12-31T00:00:00",
"2012-12-31T00:00:00"), ACC_TIME = c("18:12", "18:12", "12:12",
"9:12", "11:12"), ACC_TIME_CODE = c("5", "5", "4", "3", "3"),
DAY_OF_WEEK = c("MONDAY ", "MONDAY ", "MONDAY ", "MONDAY ",
"MONDAY "), ROAD = c("IS 00070 EISENHOWER MEMOR HWY", "MD 00077 ROCKY RIDGE RD",
"MD 00085 BUCKEYSTOWN PIKE", "MD 00017 MYERSVILLE RD", "IS 00070 No Name"
), INTERSECT_ROAD = c("CO 00248 MONUMENT RD", "MD 00076 MOTTERS STATION RD",
"CO 00308 MANOR WOODS RD", "CO 00941 DAWN CT", "US 00029 Columbia Pike"
), DIST_FROM_INTERSECT = c("300", "0", "400", "500", "0.25"
), DIST_DIRECTION = c("E", "U", "S", "S", "E"), CITY_NAME = c("Not Applicable",
"Not Applicable", "Not Applicable", "Not Applicable", NA),
COUNTY_CODE = c("10", "10", "10", "10", "13"), COUNTY_NAME = c("Frederick",
"Frederick", "Frederick", "Frederick", "Howard"), VEHICLE_COUNT = c(1,
2, 2, 1, 2), PROP_DEST = c("NO", "YES", "YES", "NO", "NO"
), INJURY = c("YES", "NO", "NO", "YES", "YES"), COLLISION_WITH_1 = c("FIXED OBJ",
"VEH", "VEH", "NON-COLLISION", "VEH"), COLLISION_WITH_2 = c("OTHER-COLLISION",
"OTHER-COLLISION", "OTHER-COLLISION", "OTHER-COLLISION",
"OTHER-COLLISION")), .Names = c("CASE_NUMBER", "BARRACK",
"ACC_DATE", "ACC_TIME", "ACC_TIME_CODE", "DAY_OF_WEEK", "ROAD",
"INTERSECT_ROAD", "DIST_FROM_INTERSECT", "DIST_DIRECTION", "CITY_NAME",
"COUNTY_CODE", "COUNTY_NAME", "VEHICLE_COUNT", "PROP_DEST", "INJURY",
"COLLISION_WITH_1", "COLLISION_WITH_2"), row.names = 18634:18638, class = "data.frame")
基础 R 的解决方案是:筛选出 DAY_OF_WEEK 为 "THURSDAY" 的子集,然后返回其行数:
# Count the rows whose weekday is "THURSDAY". trimws() strips the trailing
# padding present in DAY_OF_WEEK (e.g. "MONDAY "), which would otherwise make
# the exact comparison fail; which() drops NA comparisons so rows with a
# missing weekday are not counted as matches.
nrow(df[which(trimws(df$DAY_OF_WEEK) == "THURSDAY"), ])
如果出于某种原因只能使用 tapply,那么根据 Maurits 的回答,您应该可以这样做:
# Split VEHICLE_COUNT by whether the trimmed weekday equals "THURSDAY" and
# report the number of entries in each group (FALSE / TRUE).
tapply(
  X = myfinal$VEHICLE_COUNT,
  INDEX = trimws(myfinal$DAY_OF_WEEK) == "THURSDAY",
  FUN = length
)
或类似的写法。DAY_OF_WEEK 变量中的字符串末尾似乎有很多多余的空格。您要么需要先把它们删除(使用 trimws),要么修改用于比较的字符串,使其包含这些空格(例如 myfinal$DAY_OF_WEEK=="THURSDAY ")。使用比较运算符时,只有两个字符串逐个字符完全相同,R 才会认为它们相等,因此任一字符串中多出的空格都会导致匹配失败。