Python & BS4:排除空白行和总计行
Python & BS4: Exclude blank and Grand Total Row
下面是我从 HTML 文档中提取数据并将其放入变量的代码。我需要排除空白行以及 "grand total" 行。我在代码下方附上了相关部分的 HTML 输入。我不确定如何让它工作。我不能使用 len()
因为长度是可变的。能帮帮我吗?
from bs4 import BeautifulSoup
import urllib
import re
import HTMLParser
# Python 2 script: it uses print statements, urllib.urlopen and the HTMLParser module.
# NOTE(review): indentation was lost in this listing; the nesting described in
# the comments below is the presumed original structure -- confirm against the source.
# Read the whole report in one go; urlopen is given a bare file name here.
html = urllib.urlopen('RanpakAllocations.html').read()
parser = HTMLParser.HTMLParser()
#unescape doesn't seem to work
output = parser.unescape(html)
soup1 = BeautifulSoup(output, "html.parser")
# Customer_No, Serial_No and fields are initialised but never used in this listing.
Customer_No = []
Serial_No = []
# data starts as a list but is rebound to a cell's text inside the loop below.
data = []
#for hit in soup.findAll(attrs={'class' : 'MYCLASS'}):
# Every <tr> in the document: the first is taken as the title row, the second
# as the header row, and everything after that as data rows.
rows = soup1.find_all("tr")
title = rows[0]
headers = rows[1]
datarows = rows[2:]
fields = []
try :
for row in datarows :
# Elements of this row that carry the face attribute; in the sample input
# these are the <font> tags of populated cells (blank spacer cells have none).
find_data = row.find_all(attrs={'face' : 'Arial,Helvetica,sans-serif'})
# count is the 1-based position of the current element within the row.
count = 0
for hit in find_data:
data = hit.text
count = count + 1
# The 3rd matching element is stored in CSNO and the 9th in ITNO.
if count == 3 :
CSNO = data
if count == 9 :
ITNO = data
else :
continue
# NOTE(review): CSNO and ITNO are never reset, so a row with fewer than 3 / 9
# matching elements leaves the values of an earlier row in place. Presumably
# these prints run once per row, which would explain the repeated output for
# the blank and "Grand Total" rows -- confirm the nesting.
print CSNO, ITNO
print "new row"
# The bare except silently discards any error raised in the block above.
except:
pass
这是输入。第一个 <tr>
是我的最后一行数据,但是我的循环正在重复它下面的空白行和总计行。
<tr>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">12</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">F5684</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">20182</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">VELOCITY SOLUTIONS INC.</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">EQPRAN77717</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">RANPAK FILLPAK TT 2</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">W/UNIVERSAL STAND S/N 51345563</font></td>
<td nowrap="nowrap" align="right"><font size="3" face="Arial,Helvetica,sans-serif">1</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">51345563</font></td>
</tr>
<tr>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td align="left" colspan="5"><font size="1"> </font></td>
</tr>
<tr>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif"> </font></td>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif">Grand Total</font></td>
<td align="left" colspan="7"><font size="1"> </font></td>
</tr>
<tr>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
我会这样做:
from bs4 import BeautifulSoup
content = '''
<root>
<tr>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">12</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">F5684</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">20182</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">VELOCITY SOLUTIONS INC.</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">EQPRAN77717</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">RANPAK FILLPAK TT 2</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">W/UNIVERSAL STAND S/N 51345563</font></td>
<td nowrap="nowrap" align="right"><font size="3" face="Arial,Helvetica,sans-serif">1</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">51345563</font></td>
</tr>
<tr>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td align="left" colspan="5"><font size="1"> </font></td>
</tr>
<tr>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif"> </font></td>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif">Grand Total</font></td>
<td align="left" colspan="7"><font size="1"> </font></td>
</tr>
<tr>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</root>'''
# Name the parser explicitly. The bare 'html' feature lets Beautiful Soup pick
# whichever HTML parser happens to be installed, and html5lib discards
# <tr>/<td> tags that are not inside a <table> (as is the case in `content`),
# which would leave no rows to process. 'html.parser' ships with Python and
# keeps the tags exactly as written.
soup = BeautifulSoup(content, 'html.parser')

# One list of cell texts per non-blank row.
answer = []
rows = soup.find_all('tr')
for row in rows:
    # A row whose combined text is only whitespace is a blank spacer row.
    if not row.text.strip():
        continue
    # Keep only the cells that actually contain text (text is stored unstripped).
    row_text = [cell.text for cell in row.find_all('td') if cell.text.strip()]
    answer.append(row_text)

# NOTE: the "Grand Total" row contains text, so the blank-row check does not
# skip it and it still ends up in `answer`. To drop it as well, also skip rows
# for which `'Grand Total' in row.text` is true.
print(answer)
输出
[[u'12', u'F5684', u'20182', u'VELOCITY SOLUTIONS INC.', u'EQPRAN77717', u'RANPAK FILLPAK TT 2', u'W/UNIVERSAL STAND S/N 51345563', u'1', u'51345563'], [u'Grand Total']]
您可以使用 if not row.text.strip(): continue
跳过整个空行(此时 row.text.strip()
返回一个空字符串,其布尔值为 False
)。
对于您迭代的行,您可以在保存相关文本之前使用 if cell.text.strip()
检查每个单元格是否为空。
下面是我从 HTML 文档中提取数据并将其放入变量的代码。我需要排除空白行以及 "grand total" 行。我在代码下方附上了相关部分的 HTML 输入。我不确定如何让它工作。我不能使用 len()
因为长度是可变的。能帮帮我吗?
from bs4 import BeautifulSoup
import urllib
import re
import HTMLParser
# Python 2 script: it uses print statements, urllib.urlopen and the HTMLParser module.
# NOTE(review): indentation was lost in this listing; the nesting described in
# the comments below is the presumed original structure -- confirm against the source.
# Read the whole report in one go; urlopen is given a bare file name here.
html = urllib.urlopen('RanpakAllocations.html').read()
parser = HTMLParser.HTMLParser()
#unescape doesn't seem to work
output = parser.unescape(html)
soup1 = BeautifulSoup(output, "html.parser")
# Customer_No, Serial_No and fields are initialised but never used in this listing.
Customer_No = []
Serial_No = []
# data starts as a list but is rebound to a cell's text inside the loop below.
data = []
#for hit in soup.findAll(attrs={'class' : 'MYCLASS'}):
# Every <tr> in the document: the first is taken as the title row, the second
# as the header row, and everything after that as data rows.
rows = soup1.find_all("tr")
title = rows[0]
headers = rows[1]
datarows = rows[2:]
fields = []
try :
for row in datarows :
# Elements of this row that carry the face attribute; in the sample input
# these are the <font> tags of populated cells (blank spacer cells have none).
find_data = row.find_all(attrs={'face' : 'Arial,Helvetica,sans-serif'})
# count is the 1-based position of the current element within the row.
count = 0
for hit in find_data:
data = hit.text
count = count + 1
# The 3rd matching element is stored in CSNO and the 9th in ITNO.
if count == 3 :
CSNO = data
if count == 9 :
ITNO = data
else :
continue
# NOTE(review): CSNO and ITNO are never reset, so a row with fewer than 3 / 9
# matching elements leaves the values of an earlier row in place. Presumably
# these prints run once per row, which would explain the repeated output for
# the blank and "Grand Total" rows -- confirm the nesting.
print CSNO, ITNO
print "new row"
# The bare except silently discards any error raised in the block above.
except:
pass
这是输入。第一个 <tr>
是我的最后一行数据,但是我的循环正在重复它下面的空白行和总计行。
<tr>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">12</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">F5684</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">20182</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">VELOCITY SOLUTIONS INC.</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">EQPRAN77717</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">RANPAK FILLPAK TT 2</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">W/UNIVERSAL STAND S/N 51345563</font></td>
<td nowrap="nowrap" align="right"><font size="3" face="Arial,Helvetica,sans-serif">1</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">51345563</font></td>
</tr>
<tr>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td align="left" colspan="5"><font size="1"> </font></td>
</tr>
<tr>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif"> </font></td>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif">Grand Total</font></td>
<td align="left" colspan="7"><font size="1"> </font></td>
</tr>
<tr>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
我会这样做:
from bs4 import BeautifulSoup
content = '''
<root>
<tr>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">12</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">F5684</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">20182</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">VELOCITY SOLUTIONS INC.</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">EQPRAN77717</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">RANPAK FILLPAK TT 2</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">W/UNIVERSAL STAND S/N 51345563</font></td>
<td nowrap="nowrap" align="right"><font size="3" face="Arial,Helvetica,sans-serif">1</font></td>
<td nowrap="nowrap" align="left"><font size="3" face="Arial,Helvetica,sans-serif">51345563</font></td>
</tr>
<tr>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td nowrap="nowrap" align="left"><font size="1"> </font></td>
<td align="left" colspan="5"><font size="1"> </font></td>
</tr>
<tr>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif"> </font></td>
<td align="left"><font size="3" face="Arial,Helvetica,sans-serif">Grand Total</font></td>
<td align="left" colspan="7"><font size="1"> </font></td>
</tr>
<tr>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</root>'''
# Name the parser explicitly. The bare 'html' feature lets Beautiful Soup pick
# whichever HTML parser happens to be installed, and html5lib discards
# <tr>/<td> tags that are not inside a <table> (as is the case in `content`),
# which would leave no rows to process. 'html.parser' ships with Python and
# keeps the tags exactly as written.
soup = BeautifulSoup(content, 'html.parser')

# One list of cell texts per non-blank row.
answer = []
rows = soup.find_all('tr')
for row in rows:
    # A row whose combined text is only whitespace is a blank spacer row.
    if not row.text.strip():
        continue
    # Keep only the cells that actually contain text (text is stored unstripped).
    row_text = [cell.text for cell in row.find_all('td') if cell.text.strip()]
    answer.append(row_text)

# NOTE: the "Grand Total" row contains text, so the blank-row check does not
# skip it and it still ends up in `answer`. To drop it as well, also skip rows
# for which `'Grand Total' in row.text` is true.
print(answer)
输出
[[u'12', u'F5684', u'20182', u'VELOCITY SOLUTIONS INC.', u'EQPRAN77717', u'RANPAK FILLPAK TT 2', u'W/UNIVERSAL STAND S/N 51345563', u'1', u'51345563'], [u'Grand Total']]
您可以使用 if not row.text.strip(): continue
跳过整个空行(此时 row.text.strip()
返回一个空字符串,其布尔值为 False
)。
对于您迭代的行,您可以在保存相关文本之前使用 if cell.text.strip()
检查每个单元格是否为空。