PYTHON - BEAUTIFULSOUP 如何将空的 TD(table 数据) 抓取为空值而不是跳过它
PYTHON - BEAUTIFULSOUP how to scrape empty TD(table data) as an empty value instead of skipping it
我想将一个网页抓取到一个 4 列的 csv 文件中,其中一些 table 数据单元不包含数据,我想将其写为一个空单元格值而不是跳过它。我尝试使用 .text,也尝试使用 .string,但它给了我 TypeError: can only concatenate str (not "NoneType") to str。
我还想设置一个动态查找:如果 <td> 中有 <a href>,则追加 <a> 标签的文本;如果没有,则追加 <td> 中的内容;如果 <td> 没有数据,则写入空值(或文本“None”)。
您可以在下面查看 HTML 示例。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.example.com'

# Open the connection and grab the page.
uClient = uReq(my_url)
page_soup = soup(uClient.read(), "lxml")
uClient.close()

containers = page_soup.find_all("tr", {"class": "typetable"})

out_filename = "output.csv"
# No trailing space before the newline, so the last header is exactly "Description".
headers = "Parameter,Type_Value,Cardinality,Description\n"


def cell_text(td):
    """Return the text for one <td>.

    If the cell contains an <a> tag, use the link text; otherwise use the
    cell's own text. An empty cell yields '' (an empty CSV value) instead
    of being skipped or raising on a None .string.
    """
    if td.a:
        return td.a.get_text(strip=True)
    return td.get_text(strip=True)


# 'with' guarantees the file is closed even if a row raises mid-loop.
with open(out_filename, "w") as f:
    f.write(headers)
    for container in containers:
        # Read the four cells up front; empty <td> becomes "" rather than
        # crashing on string concatenation with None.
        cells = container.find_all('td')
        parameter = cell_text(cells[0])
        type_value = cell_text(cells[1])
        cardinality = cell_text(cells[2])
        description = cell_text(cells[3])
        print("parameter: " + parameter + "\n")
        print("type_value: " + type_value + "\n")
        print("cardinality: " + cardinality + "\n")
        print("description: " + description + "\n")
        f.write(f'{parameter},{type_value},{cardinality},"{description}"\n')
这是一个例子html:
<tr class="typetable">
<td>Data 1 </td>
<td>Data 2 </td>
<td> </td>
<td>Data 4 </td>
</tr>
<tr class="typetable">
<td>Data 10 </td>
<td>
<a href="#2ndPage">2ndPage</a>" "
</td>
<td>Data 3 </td>
<td> </td>
</tr>
我希望输出显示
Parameter,Type_Value,Cardinality,Description
Data 1,Data 2,,"Data 4"
Data 10,2ndPage,Data 3,
几周来我一直在 Stack Overflow 上测试和查找示例 :(,请帮忙。提前致谢!
您可以使用此脚本从 table 中提取数据:
import csv
from bs4 import BeautifulSoup

txt = '''<tr class="typetable">
<td>Data 1 </td>
<td>Data 2 </td>
<td> </td>
<td>Data 4 </td>
</tr>
<tr class="typetable">
<td>Data 10 </td>
<td>
<a href="#2ndPage">2ndPage</a>" "
</td>
<td>Data 3 </td>
<td> </td>
</tr>'''

soup = BeautifulSoup(txt, 'html.parser')

all_data = []
for row in soup.select('tr.typetable'):
    # Prefer the <a> text when the cell contains a link; an empty <td>
    # yields '' so the column is kept instead of being skipped.
    tds = [td.a.get_text(strip=True) if td.a else td.get_text(strip=True)
           for td in row.select('td')]
    all_data.append(tds)

# newline='' lets the csv module control line endings; the writer also
# quotes values only when needed (QUOTE_MINIMAL).
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Parameter','Type_Value','Cardinality','Description'])
    for row in all_data:
        writer.writerow(row)
它会写出这个 data.csv:
Parameter,Type_Value,Cardinality,Description
Data 1,Data 2,,Data 4
Data 10,2ndPage,Data 3,
我想将一个网页抓取到一个 4 列的 csv 文件中,其中一些 table 数据单元不包含数据,我想将其写为一个空单元格值而不是跳过它。我尝试使用 .text,也尝试使用 .string,但它给了我 TypeError: can only concatenate str (not "NoneType") to str。
我还想设置一个动态查找:如果 <td> 中有 <a href>,则追加 <a> 标签的文本;如果没有,则追加 <td> 中的内容;如果 <td> 没有数据,则写入空值(或文本“None”)。
您可以在下面查看 HTML 示例。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.example.com'

# Open the connection and grab the page.
uClient = uReq(my_url)
page_soup = soup(uClient.read(), "lxml")
uClient.close()

containers = page_soup.find_all("tr", {"class": "typetable"})

out_filename = "output.csv"
# No trailing space before the newline, so the last header is exactly "Description".
headers = "Parameter,Type_Value,Cardinality,Description\n"


def cell_text(td):
    """Return the text for one <td>.

    If the cell contains an <a> tag, use the link text; otherwise use the
    cell's own text. An empty cell yields '' (an empty CSV value) instead
    of being skipped or raising on a None .string.
    """
    if td.a:
        return td.a.get_text(strip=True)
    return td.get_text(strip=True)


# 'with' guarantees the file is closed even if a row raises mid-loop.
with open(out_filename, "w") as f:
    f.write(headers)
    for container in containers:
        # Read the four cells up front; empty <td> becomes "" rather than
        # crashing on string concatenation with None.
        cells = container.find_all('td')
        parameter = cell_text(cells[0])
        type_value = cell_text(cells[1])
        cardinality = cell_text(cells[2])
        description = cell_text(cells[3])
        print("parameter: " + parameter + "\n")
        print("type_value: " + type_value + "\n")
        print("cardinality: " + cardinality + "\n")
        print("description: " + description + "\n")
        f.write(f'{parameter},{type_value},{cardinality},"{description}"\n')
这是一个例子html:
<tr class="typetable">
<td>Data 1 </td>
<td>Data 2 </td>
<td> </td>
<td>Data 4 </td>
</tr>
<tr class="typetable">
<td>Data 10 </td>
<td>
<a href="#2ndPage">2ndPage</a>" "
</td>
<td>Data 3 </td>
<td> </td>
</tr>
我希望输出显示
Parameter,Type_Value,Cardinality,Description
Data 1,Data 2,,"Data 4"
Data 10,2ndPage,Data 3,
几周来我一直在 Stack Overflow 上测试和查找示例 :(,请帮忙。提前致谢!
您可以使用此脚本从 table 中提取数据:
import csv
from bs4 import BeautifulSoup

txt = '''<tr class="typetable">
<td>Data 1 </td>
<td>Data 2 </td>
<td> </td>
<td>Data 4 </td>
</tr>
<tr class="typetable">
<td>Data 10 </td>
<td>
<a href="#2ndPage">2ndPage</a>" "
</td>
<td>Data 3 </td>
<td> </td>
</tr>'''

soup = BeautifulSoup(txt, 'html.parser')

all_data = []
for row in soup.select('tr.typetable'):
    # Prefer the <a> text when the cell contains a link; an empty <td>
    # yields '' so the column is kept instead of being skipped.
    tds = [td.a.get_text(strip=True) if td.a else td.get_text(strip=True)
           for td in row.select('td')]
    all_data.append(tds)

# newline='' lets the csv module control line endings; the writer also
# quotes values only when needed (QUOTE_MINIMAL).
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Parameter','Type_Value','Cardinality','Description'])
    for row in all_data:
        writer.writerow(row)
它会写出这个 data.csv:
Parameter,Type_Value,Cardinality,Description
Data 1,Data 2,,Data 4
Data 10,2ndPage,Data 3,