Python 在数据框中扩展网络地址
Python expand network address in a dataframe
我有一个类似于以下的数据框:
df:
ip last_active
192.167.0.9 01/02/2012
226.90.2.12 05/06/2013
10.90.2.09 05/06/2014
12.60.2.80
192.168.2.11-17 05/06/2016
有没有办法扩展 df 中最后一行的 ip 地址?
一个完美的解决方案是:
df:
ip last_active
192.167.0.9 01/02/2012
226.90.2.12 05/06/2013
10.90.2.09 05/06/2014
12.60.2.80
192.168.2.11 05/06/2016
192.168.2.12 05/06/2016
192.168.2.13 05/06/2016
192.168.2.14 05/06/2016
192.168.2.15 05/06/2016
192.168.2.16 05/06/2016
192.168.2.17 05/06/2016
感谢任何指导!
这是我的方法:
# Split every entry into the "a.b.c." prefix, the first last-octet value and
# the optional end of the range, e.g. "192.168.2.11-17" -> "192.168.2.", "11", "17".
# The pattern is a raw string (no invalid-escape warnings) and the third dot
# is escaped so that it only matches a literal ".".
# ffill(axis=1) copies the start value into the empty "end" column for plain
# addresses, so they behave like a one-element range; `axis` is passed by
# keyword because recent pandas releases no longer accept it positionally.
s = df['ip'].str.extract(r'(\d+\.\d+\.\d+\.)(\d+)-?(\d+)?').ffill(axis=1)

# convert last parts to int for easy manipulation
# (this is why a leading zero such as "09" comes out as "9" in the result)
s[1] = s[1].astype(int)
s[2] = s[2].astype(int)

# masking the IP range: one row per input entry, one column per possible
# last-octet value 0..255, True where start <= value <= end
octets = np.arange(256)
lowers = s[1].values[:, None] <= octets
uppers = octets <= s[2].values[:, None]

# create new dataframe for the new IPs, indexed by (last_active, prefix)
u = pd.DataFrame(lowers & uppers,
                 index=pd.MultiIndex.from_arrays([df['last_active'], s[0]])
                 )

# final data: where(u) blanks out the False cells so that stack() keeps only
# the selected octets (NOTE(review): relies on stack() dropping NaN, which is
# the default in the pandas versions this was written for -- confirm on
# upgrade). 'level_2' is the stacked column label, i.e. the octet value.
(u.where(u).stack().reset_index(name='dummy')
  .assign(ip=lambda x: x[0] + x['level_2'].astype(str))
  .drop([0, 'level_2', 'dummy'], axis=1)
)
输出:
last_active ip
0 01/02/2012 192.167.0.9
1 05/06/2013 226.90.2.12
2 05/06/2014 10.90.2.9
3 NaN 12.60.2.80
4 05/06/2016 192.168.2.11
5 05/06/2016 192.168.2.12
6 05/06/2016 192.168.2.13
7 05/06/2016 192.168.2.14
8 05/06/2016 192.168.2.15
9 05/06/2016 192.168.2.16
10 05/06/2016 192.168.2.17
您可以先应用一个函数,把范围形式的条目转换成 IP 列表,然后使用 explode() 将列表展开成多行
(explode() 需要 pandas 0.25 或更高版本):
def ip_splitter(ip):
    """Expand a last-octet range such as ``"192.168.2.11-17"``.

    Parameters
    ----------
    ip : object
        A cell of the ``ip`` column. Strings containing ``-`` are treated as
        ``a.b.c.N-M`` ranges; anything else is passed through.

    Returns
    -------
    list of str or object
        For a range, the list of addresses ``a.b.c.N`` .. ``a.b.c.M``
        (inclusive; empty when ``N > M``). Otherwise ``ip`` itself, unchanged,
        including non-string values such as ``None`` or ``NaN``.
    """
    # Missing cells are not strings; `'-' in ip` would raise TypeError on
    # them, so hand them back untouched.
    if not isinstance(ip, str) or '-' not in ip:
        return ip

    # Split once instead of once per generated address.
    octets = ip.split('.')
    bounds = [int(part) for part in octets[3].split('-')]
    first, last = bounds[0], bounds[1]
    prefix = '.'.join(octets[:3])
    return [prefix + '.' + str(n) for n in range(first, last + 1)]
# Run ip_splitter on every cell of the 'ip' column and store the results back
# in that column (range entries become lists, ready for explode()).
df['ip']=df['ip'].apply(ip_splitter)
df
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 [192.168.2.11, 192.168.2.12, 192.168.2.13, 192... 05/06/2016
df.explode('ip')
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 192.168.2.11 05/06/2016
4 192.168.2.12 05/06/2016
4 192.168.2.13 05/06/2016
4 192.168.2.14 05/06/2016
4 192.168.2.15 05/06/2016
4 192.168.2.16 05/06/2016
4 192.168.2.17 05/06/2016
基于命名捕获组的可能解决方案之一:
# "a.b.c.N-M": g1 captures the "a.b.c." prefix, g2 and g3 the range bounds.
pat = re.compile(r'(?P<g1>(?:\d+\.){3})(?P<g2>\d+)-(?P<g3>\d+)')

outRows = []
for _, row in df.iterrows():
    ip = row.ip
    mtch = pat.match(ip)
    if mtch is None:
        # Not a range: carry the row over as it is.
        outRows.append([ip, row.last_active])
        continue

    prefix = mtch.group('g1')
    first = int(mtch.group('g2'))
    last = int(mtch.group('g3'))
    # One output row per address in the inclusive range, same last_active.
    outRows.extend(
        [prefix + str(octet), row.last_active]
        for octet in range(first, last + 1)
    )

result = pd.DataFrame(outRows, columns=df.columns)
我喜欢各种不同的解决方案。这是改编自此处的另一个方案:
# Collect the expanded rows in a plain list and build the frame once at the
# end: enlarging a DataFrame one `.loc` row at a time redoes work on every
# assignment and makes the loop needlessly slow on larger inputs.
rows = []
for idx, r in df.iterrows():
    data = r['ip'].split("-")
    if len(data) > 1:
        # "a.b.c.N-M": data[0] is "a.b.c.N", data[1] is the upper bound M.
        start = int(data[0].split('.')[-1])
        end = int(data[1]) + 1  # +1 so that M itself is included
        prefix = data[0][:data[0].rfind('.') + 1]  # "a.b.c." with trailing dot
        rows.extend((prefix + str(i), r['last_active'])
                    for i in range(start, end))
    else:
        # Plain address: copy the row through unchanged.
        rows.append((r['ip'], r['last_active']))

df2 = pd.DataFrame(rows, columns=df.columns)
# Kept for compatibility with the original script: number of rows written.
count = len(rows)
结果:
In [40]: df2
Out[40]:
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 192.168.2.11 05/06/2016
5 192.168.2.12 05/06/2016
6 192.168.2.13 05/06/2016
7 192.168.2.14 05/06/2016
8 192.168.2.15 05/06/2016
9 192.168.2.16 05/06/2016
10 192.168.2.17 05/06/2016
我有一个类似于以下的数据框:
df:
ip last_active
192.167.0.9 01/02/2012
226.90.2.12 05/06/2013
10.90.2.09 05/06/2014
12.60.2.80
192.168.2.11-17 05/06/2016
有没有办法扩展 df 中最后一行的 ip 地址? 一个完美的解决方案是:
df:
ip last_active
192.167.0.9 01/02/2012
226.90.2.12 05/06/2013
10.90.2.09 05/06/2014
12.60.2.80
192.168.2.11 05/06/2016
192.168.2.12 05/06/2016
192.168.2.13 05/06/2016
192.168.2.14 05/06/2016
192.168.2.15 05/06/2016
192.168.2.16 05/06/2016
192.168.2.17 05/06/2016
感谢任何指导!
这是我的方法:
# Split every entry into the "a.b.c." prefix, the first last-octet value and
# the optional end of the range, e.g. "192.168.2.11-17" -> "192.168.2.", "11", "17".
# The pattern is a raw string (no invalid-escape warnings) and the third dot
# is escaped so that it only matches a literal ".".
# ffill(axis=1) copies the start value into the empty "end" column for plain
# addresses, so they behave like a one-element range; `axis` is passed by
# keyword because recent pandas releases no longer accept it positionally.
s = df['ip'].str.extract(r'(\d+\.\d+\.\d+\.)(\d+)-?(\d+)?').ffill(axis=1)

# convert last parts to int for easy manipulation
# (this is why a leading zero such as "09" comes out as "9" in the result)
s[1] = s[1].astype(int)
s[2] = s[2].astype(int)

# masking the IP range: one row per input entry, one column per possible
# last-octet value 0..255, True where start <= value <= end
octets = np.arange(256)
lowers = s[1].values[:, None] <= octets
uppers = octets <= s[2].values[:, None]

# create new dataframe for the new IPs, indexed by (last_active, prefix)
u = pd.DataFrame(lowers & uppers,
                 index=pd.MultiIndex.from_arrays([df['last_active'], s[0]])
                 )

# final data: where(u) blanks out the False cells so that stack() keeps only
# the selected octets (NOTE(review): relies on stack() dropping NaN, which is
# the default in the pandas versions this was written for -- confirm on
# upgrade). 'level_2' is the stacked column label, i.e. the octet value.
(u.where(u).stack().reset_index(name='dummy')
  .assign(ip=lambda x: x[0] + x['level_2'].astype(str))
  .drop([0, 'level_2', 'dummy'], axis=1)
)
输出:
last_active ip
0 01/02/2012 192.167.0.9
1 05/06/2013 226.90.2.12
2 05/06/2014 10.90.2.9
3 NaN 12.60.2.80
4 05/06/2016 192.168.2.11
5 05/06/2016 192.168.2.12
6 05/06/2016 192.168.2.13
7 05/06/2016 192.168.2.14
8 05/06/2016 192.168.2.15
9 05/06/2016 192.168.2.16
10 05/06/2016 192.168.2.17
您可以先应用一个函数,把范围形式的条目转换成 IP 列表,然后使用 explode() 将列表展开成多行
(explode() 需要 pandas 0.25 或更高版本):
def ip_splitter(ip):
    """Expand a last-octet range such as ``"192.168.2.11-17"``.

    Parameters
    ----------
    ip : object
        A cell of the ``ip`` column. Strings containing ``-`` are treated as
        ``a.b.c.N-M`` ranges; anything else is passed through.

    Returns
    -------
    list of str or object
        For a range, the list of addresses ``a.b.c.N`` .. ``a.b.c.M``
        (inclusive; empty when ``N > M``). Otherwise ``ip`` itself, unchanged,
        including non-string values such as ``None`` or ``NaN``.
    """
    # Missing cells are not strings; `'-' in ip` would raise TypeError on
    # them, so hand them back untouched.
    if not isinstance(ip, str) or '-' not in ip:
        return ip

    # Split once instead of once per generated address.
    octets = ip.split('.')
    bounds = [int(part) for part in octets[3].split('-')]
    first, last = bounds[0], bounds[1]
    prefix = '.'.join(octets[:3])
    return [prefix + '.' + str(n) for n in range(first, last + 1)]
# Run ip_splitter on every cell of the 'ip' column and store the results back
# in that column (range entries become lists, ready for explode()).
df['ip']=df['ip'].apply(ip_splitter)
df
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 [192.168.2.11, 192.168.2.12, 192.168.2.13, 192... 05/06/2016
df.explode('ip')
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 192.168.2.11 05/06/2016
4 192.168.2.12 05/06/2016
4 192.168.2.13 05/06/2016
4 192.168.2.14 05/06/2016
4 192.168.2.15 05/06/2016
4 192.168.2.16 05/06/2016
4 192.168.2.17 05/06/2016
基于命名捕获组的可能解决方案之一:
# "a.b.c.N-M": g1 captures the "a.b.c." prefix, g2 and g3 the range bounds.
pat = re.compile(r'(?P<g1>(?:\d+\.){3})(?P<g2>\d+)-(?P<g3>\d+)')

outRows = []
for _, row in df.iterrows():
    ip = row.ip
    mtch = pat.match(ip)
    if mtch is None:
        # Not a range: carry the row over as it is.
        outRows.append([ip, row.last_active])
        continue

    prefix = mtch.group('g1')
    first = int(mtch.group('g2'))
    last = int(mtch.group('g3'))
    # One output row per address in the inclusive range, same last_active.
    outRows.extend(
        [prefix + str(octet), row.last_active]
        for octet in range(first, last + 1)
    )

result = pd.DataFrame(outRows, columns=df.columns)
我喜欢各种不同的解决方案。这是改编自此处的另一个方案:
# Collect the expanded rows in a plain list and build the frame once at the
# end: enlarging a DataFrame one `.loc` row at a time redoes work on every
# assignment and makes the loop needlessly slow on larger inputs.
rows = []
for idx, r in df.iterrows():
    data = r['ip'].split("-")
    if len(data) > 1:
        # "a.b.c.N-M": data[0] is "a.b.c.N", data[1] is the upper bound M.
        start = int(data[0].split('.')[-1])
        end = int(data[1]) + 1  # +1 so that M itself is included
        prefix = data[0][:data[0].rfind('.') + 1]  # "a.b.c." with trailing dot
        rows.extend((prefix + str(i), r['last_active'])
                    for i in range(start, end))
    else:
        # Plain address: copy the row through unchanged.
        rows.append((r['ip'], r['last_active']))

df2 = pd.DataFrame(rows, columns=df.columns)
# Kept for compatibility with the original script: number of rows written.
count = len(rows)
结果:
In [40]: df2
Out[40]:
ip last_active
0 192.167.0.9 01/02/2012
1 226.90.2.12 05/06/2013
2 10.90.2.09 05/06/2014
3 12.60.2.80 None
4 192.168.2.11 05/06/2016
5 192.168.2.12 05/06/2016
6 192.168.2.13 05/06/2016
7 192.168.2.14 05/06/2016
8 192.168.2.15 05/06/2016
9 192.168.2.16 05/06/2016
10 192.168.2.17 05/06/2016