逐行比较两个 pandas 数据框
Row-wise comparison of two pandas DataFrames
我有两个 pandas 数据框:
flows:
------
sourceIPAddress destinationIPAddress flowStartMicroseconds flowEndMicroseconds
163.193.204.92 40.8.121.226 2021-05-01 07:00:00.113 2021-05-01 07:00:00.113962
104.247.103.181 163.193.124.92 2021-05-01 07:00:00.074 2021-05-01 07:00:00.083874
17.254.170.53 163.193.124.133 2021-05-01 07:00:00.077 2021-05-01 07:00:00.098296
18.179.96.152 203.179.250.96 2021-05-01 07:00:00.112 2021-05-01 07:00:00.112013
133.103.144.34 13.154.212.11 2021-05-01 07:00:00.101 2021-05-01 07:00:00.101026
attacks:
--------
datetime srcIP dstIP
2021-05-01 07:00:00.055210 188.67.130.72 133.92.239.153
2021-05-01 07:00:00.055500 45.100.34.74 203.179.180.153
2021-05-01 07:00:00.055351 103.113.29.26 163.193.242.75
2021-05-01 07:00:00.056209 128.215.229.101 163.193.94.194
2021-05-01 07:00:00.055258 45.111.22.11 163.193.138.139
我想检查 flows 的每一行是否与 attacks 的任意一行匹配,匹配条件为:
(attacks[srcIP] == flows[sourceIPAddress] || attacks[srcIP] == flows[destinationIPAddress])
&&
(attacks[dstIP] == flows[sourceIPAddress] || attacks[dstIP] == flows[destinationIPAddress])
&&
attacks[datetime] between flows[flowStartMicroseconds] and flows[flowEndMicroseconds]
有没有比逐行遍历更高效的方法?
编辑:
数据框非常大,所以这里只给出每个数据框的 head() 输出。
# head() of the flows DataFrame as a nested dict: column name -> {row index -> value}.
flows = {
    'sourceIPAddress': {
        510: '163.193.204.92',
        564: '104.247.103.181',
        590: '17.254.170.53',
        599: '18.179.96.152',
        1149: '133.103.144.34',
    },
    'destinationIPAddress': {
        510: '40.8.121.226',
        564: '163.193.124.92',
        590: '163.193.124.133',
        599: '203.179.250.96',
        1149: '13.154.212.11',
    },
    'flowStartMicroseconds': {
        510: Timestamp('2021-05-01 07:00:00.113000'),
        564: Timestamp('2021-05-01 07:00:00.074000'),
        590: Timestamp('2021-05-01 07:00:00.077000'),
        599: Timestamp('2021-05-01 07:00:00.112000'),
        1149: Timestamp('2021-05-01 07:00:00.101000'),
    },
    'flowEndMicroseconds': {
        510: Timestamp('2021-05-01 07:00:00.113962'),
        564: Timestamp('2021-05-01 07:00:00.083874'),
        590: Timestamp('2021-05-01 07:00:00.098296'),
        599: Timestamp('2021-05-01 07:00:00.112013'),
        1149: Timestamp('2021-05-01 07:00:00.101026'),
    },
}
# head() of the attacks DataFrame as a nested dict: column name -> {row index -> value}.
attacks = {
    'datetime': {
        0: Timestamp('2021-05-01 07:00:00.055210'),
        1: Timestamp('2021-05-01 07:00:00.055500'),
        2: Timestamp('2021-05-01 07:00:00.055351'),
        3: Timestamp('2021-05-01 07:00:00.056209'),
        4: Timestamp('2021-05-01 07:00:00.055258'),
    },
    'srcIP': {
        0: '188.67.130.72',
        1: '45.100.34.74',
        2: '103.113.29.26',
        3: '128.215.229.101',
        4: '45.111.22.11',
    },
    'dstIP': {
        0: '133.92.239.153',
        1: '203.179.180.153',
        2: '163.193.242.75',
        3: '163.193.94.194',
        4: '163.193.138.139',
    },
}
在两个数据框之间使用左连接合并,然后查找数据的交集。
我不确定性能,但我会按以下方式进行。
这里只涉及两类 IP:攻击 IP 和流量 IP。因此我会先把两个 DataFrame 重整为以下格式:
flow_df : (flow_IPAddress, flowStartMicroseconds, flowEndMicroseconds)
attack_df: (attack_IP, datetime)
然后用内连接(inner join)合并它们 (left_on = "flow_IPAddress", right_on = "attack_IP")
接着对合并结果进行筛选,只保留时间戳有效的行(例如,使用您在上面写的 between 条件)。
那么生成的 df 将如下所示(以下两行仅示意列的布局,并非真实的匹配结果):
flowIPAddress attack_IP flowStartMicroseconds flowEndMicroseconds datetime
163.193.204.92 40.8.121.226 2021-05-01 07:00:00.113 2021-05-01 07:00:00.113962 2021-05-01 07:00:00.055210
104.247.103.181 163.193.124.92 2021-05-01 07:00:00.074 2021-05-01 07:00:00.101026 2021-05-01 07:00:00.055210
注意:如果您想保留 src 和 dst IP,您可以继续上述方法,但要单独考虑每一对。
解决方案:数据库
我的解决方案是将这两个数据帧导入 PostgreSQL 并创建两个用于正向和反向 IP 匹配的新表,然后将它们 UNION ALL 在一起。
两次简单连接(正向、反向各一次)比一次同时处理两个方向的大连接要快得多。
-- Forward direction: attach the attack label/sublabel to every flow whose
-- source/destination pair equals the attack's srcIP/dstIP and whose time
-- window contains the attack timestamp.
CREATE TABLE attacks_forward AS
SELECT
    f.*,
    a."label",
    a."sublabel"
FROM flows AS f
INNER JOIN attacks AS a
    ON  a."srcIP" = f."sourceIPAddress"
    AND a."dstIP" = f."destinationIPAddress"
    -- Inclusive on both ends, i.e. the same range BETWEEN would test.
    AND a."datetime" >= f."flowStartMicroseconds"
    AND a."datetime" <= f."flowEndMicroseconds";
-- Reverse direction: attach the attack label/sublabel to every flow whose
-- source is the attack's dstIP and whose destination is the attack's srcIP,
-- and whose time window contains the attack timestamp.
--
-- Flows whose source and destination address are identical are skipped here:
-- for such a flow the reverse match is the very same flow/attack pair that
-- attacks_forward already holds, so keeping it would make the pair appear
-- twice once the two tables are combined with UNION ALL.
CREATE TABLE attacks_backward AS
SELECT
    flows.*,
    attacks."label",
    attacks."sublabel"
FROM flows
INNER JOIN attacks
    ON  flows."sourceIPAddress" = attacks."dstIP"
    AND flows."destinationIPAddress" = attacks."srcIP"
    AND flows."sourceIPAddress" <> flows."destinationIPAddress"
    AND attacks."datetime" BETWEEN flows."flowStartMicroseconds" AND flows."flowEndMicroseconds";
-- Stack the forward and reverse matches into one table.
-- UNION ALL keeps every row from both inputs (no de-duplication pass).
CREATE TABLE attacks_flows AS
    SELECT fwd.*
    FROM attacks_forward AS fwd
UNION ALL
    SELECT bwd.*
    FROM attacks_backward AS bwd;
我有两个 pandas 数据框:
flows:
------
sourceIPAddress destinationIPAddress flowStartMicroseconds flowEndMicroseconds
163.193.204.92 40.8.121.226 2021-05-01 07:00:00.113 2021-05-01 07:00:00.113962
104.247.103.181 163.193.124.92 2021-05-01 07:00:00.074 2021-05-01 07:00:00.083874
17.254.170.53 163.193.124.133 2021-05-01 07:00:00.077 2021-05-01 07:00:00.098296
18.179.96.152 203.179.250.96 2021-05-01 07:00:00.112 2021-05-01 07:00:00.112013
133.103.144.34 13.154.212.11 2021-05-01 07:00:00.101 2021-05-01 07:00:00.101026
attacks:
--------
datetime srcIP dstIP
2021-05-01 07:00:00.055210 188.67.130.72 133.92.239.153
2021-05-01 07:00:00.055500 45.100.34.74 203.179.180.153
2021-05-01 07:00:00.055351 103.113.29.26 163.193.242.75
2021-05-01 07:00:00.056209 128.215.229.101 163.193.94.194
2021-05-01 07:00:00.055258 45.111.22.11 163.193.138.139
我想检查 flows 的每一行是否与 attacks 的任意一行匹配,匹配条件为:
(attacks[srcIP] == flows[sourceIPAddress] || attacks[srcIP] == flows[destinationIPAddress])
&&
(attacks[dstIP] == flows[sourceIPAddress] || attacks[dstIP] == flows[destinationIPAddress])
&&
attacks[datetime] between flows[flowStartMicroseconds] and flows[flowEndMicroseconds]
有没有比逐行遍历更高效的方法?
编辑:数据框非常大,所以这里只给出每个数据框的 head() 输出。
# head() of the flows DataFrame as a nested dict: column name -> {row index -> value}.
flows = {
    'sourceIPAddress': {
        510: '163.193.204.92',
        564: '104.247.103.181',
        590: '17.254.170.53',
        599: '18.179.96.152',
        1149: '133.103.144.34',
    },
    'destinationIPAddress': {
        510: '40.8.121.226',
        564: '163.193.124.92',
        590: '163.193.124.133',
        599: '203.179.250.96',
        1149: '13.154.212.11',
    },
    'flowStartMicroseconds': {
        510: Timestamp('2021-05-01 07:00:00.113000'),
        564: Timestamp('2021-05-01 07:00:00.074000'),
        590: Timestamp('2021-05-01 07:00:00.077000'),
        599: Timestamp('2021-05-01 07:00:00.112000'),
        1149: Timestamp('2021-05-01 07:00:00.101000'),
    },
    'flowEndMicroseconds': {
        510: Timestamp('2021-05-01 07:00:00.113962'),
        564: Timestamp('2021-05-01 07:00:00.083874'),
        590: Timestamp('2021-05-01 07:00:00.098296'),
        599: Timestamp('2021-05-01 07:00:00.112013'),
        1149: Timestamp('2021-05-01 07:00:00.101026'),
    },
}
# head() of the attacks DataFrame as a nested dict: column name -> {row index -> value}.
attacks = {
    'datetime': {
        0: Timestamp('2021-05-01 07:00:00.055210'),
        1: Timestamp('2021-05-01 07:00:00.055500'),
        2: Timestamp('2021-05-01 07:00:00.055351'),
        3: Timestamp('2021-05-01 07:00:00.056209'),
        4: Timestamp('2021-05-01 07:00:00.055258'),
    },
    'srcIP': {
        0: '188.67.130.72',
        1: '45.100.34.74',
        2: '103.113.29.26',
        3: '128.215.229.101',
        4: '45.111.22.11',
    },
    'dstIP': {
        0: '133.92.239.153',
        1: '203.179.180.153',
        2: '163.193.242.75',
        3: '163.193.94.194',
        4: '163.193.138.139',
    },
}
在两个数据框之间使用左连接合并,然后查找数据的交集。
我不确定性能,但我会按以下方式进行。
这里只涉及两类 IP:攻击 IP 和流量 IP。因此我会先把两个 DataFrame 重整为以下格式:
flow_df : (flow_IPAddress, flowStartMicroseconds, flowEndMicroseconds)
attack_df: (attack_IP, datetime)
然后用内连接(inner join)合并它们 (left_on = "flow_IPAddress", right_on = "attack_IP")
接着对合并结果进行筛选,只保留时间戳有效的行(例如,使用您在上面写的 between 条件)。
那么生成的 df 将如下所示(以下两行仅示意列的布局,并非真实的匹配结果):
flowIPAddress attack_IP flowStartMicroseconds flowEndMicroseconds datetime
163.193.204.92 40.8.121.226 2021-05-01 07:00:00.113 2021-05-01 07:00:00.113962 2021-05-01 07:00:00.055210
104.247.103.181 163.193.124.92 2021-05-01 07:00:00.074 2021-05-01 07:00:00.101026 2021-05-01 07:00:00.055210
注意:如果您想保留 src 和 dst IP,您可以继续上述方法,但要单独考虑每一对。
解决方案:数据库
我的解决方案是将这两个数据帧导入 PostgreSQL 并创建两个用于正向和反向 IP 匹配的新表,然后将它们 UNION ALL 在一起。
两次简单连接(正向、反向各一次)比一次同时处理两个方向的大连接要快得多。
-- Forward direction: attach the attack label/sublabel to every flow whose
-- source/destination pair equals the attack's srcIP/dstIP and whose time
-- window contains the attack timestamp.
CREATE TABLE attacks_forward AS
SELECT
    f.*,
    a."label",
    a."sublabel"
FROM flows AS f
INNER JOIN attacks AS a
    ON  a."srcIP" = f."sourceIPAddress"
    AND a."dstIP" = f."destinationIPAddress"
    -- Inclusive on both ends, i.e. the same range BETWEEN would test.
    AND a."datetime" >= f."flowStartMicroseconds"
    AND a."datetime" <= f."flowEndMicroseconds";
-- Reverse direction: attach the attack label/sublabel to every flow whose
-- source is the attack's dstIP and whose destination is the attack's srcIP,
-- and whose time window contains the attack timestamp.
--
-- Flows whose source and destination address are identical are skipped here:
-- for such a flow the reverse match is the very same flow/attack pair that
-- attacks_forward already holds, so keeping it would make the pair appear
-- twice once the two tables are combined with UNION ALL.
CREATE TABLE attacks_backward AS
SELECT
    flows.*,
    attacks."label",
    attacks."sublabel"
FROM flows
INNER JOIN attacks
    ON  flows."sourceIPAddress" = attacks."dstIP"
    AND flows."destinationIPAddress" = attacks."srcIP"
    AND flows."sourceIPAddress" <> flows."destinationIPAddress"
    AND attacks."datetime" BETWEEN flows."flowStartMicroseconds" AND flows."flowEndMicroseconds";
-- Stack the forward and reverse matches into one table.
-- UNION ALL keeps every row from both inputs (no de-duplication pass).
CREATE TABLE attacks_flows AS
    SELECT fwd.*
    FROM attacks_forward AS fwd
UNION ALL
    SELECT bwd.*
    FROM attacks_backward AS bwd;