Scraping sitemap index URLs for status codes with Beautiful Soup
I am trying to write a script following these instructions:
Scrape all the URLs into a sitemap index and store the information into an Excel file, specifying for each URL the corresponding status code.
I managed to scrape all the URLs and store them in a file, but I am still struggling to find a way to get the status codes.
import requests
from bs4 import BeautifulSoup

url = 'https://www.example-domain.it/sitemap_index.xml'
page = requests.get(url)
print('Loaded page with: %s' % page)
sitemap_index = BeautifulSoup(page.content, 'html.parser')
print('Created %s object' % type(sitemap_index))
urls = [element.text for element in sitemap_index.findAll('loc')]
print(urls)

def extract_links(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = [element.text for element in soup.findAll('loc')]
    return links

sitemap_urls = []
for url in urls:
    links = extract_links(url)
    r = requests.get(url)
    code = r.status_code
    sitemap_urls += links
    sitemap_urls += code

print('Found {:,} URLs in the sitemap'.format(len(sitemap_urls)))

import pandas as pd
df = pd.DataFrame(sitemap_urls)
df.to_excel("Export_link.xlsx")
Can anyone help me fix this script?
I get this error:
TypeError: 'int' object is not iterable
The problem is in the following lines:
sitemap_urls = []
for url in urls:
    links = extract_links(url)
    r = requests.get(url)
    code = r.status_code
    sitemap_urls += links
    sitemap_urls += code
If I write only:
sitemap_urls = []
for url in urls:
    links = extract_links(url)
    sitemap_urls += links
the script correctly exports all the URLs inside the sitemap_index, but I need to go a step further: for each URL contained in the sitemaps, I want to get its respective status code.
How can I arrange this iteration to achieve that?
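The error comes from the line sitemap_urls += code: augmented assignment on a list expects an iterable on the right-hand side, and code is an int, hence the TypeError. One way to fix it, keeping the requests/pandas approach from the question, is to store each URL together with its status code as a (url, status_code) pair and build the DataFrame from those rows. The following is a minimal sketch under that assumption (the index URL and column names are illustrative):

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_links(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return [element.text for element in soup.findAll('loc')]

# illustrative sitemap index URL
index_url = 'https://www.example-domain.it/sitemap_index.xml'
sitemaps = extract_links(index_url)

rows = []
for sitemap_url in sitemaps:
    for link in extract_links(sitemap_url):
        r = requests.get(link)
        # keep each URL and its status code together as one row
        rows.append((link, r.status_code))

df = pd.DataFrame(rows, columns=["Url", "Status Code"])
df.to_excel("Export_link.xlsx", index=False)

The answer below takes a similar route but reuses a single requests.Session and writes each row to a CSV as it goes: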
import requests
from bs4 import BeautifulSoup
import csv

def main(url):
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        # collect the child sitemap URLs listed in the index
        links = [item.text for item in soup.select("loc")]
        with open("data.csv", 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Url", "Status Code"])
            for link in links:
                # fetch each child sitemap and record its status code
                r = req.get(link)
                print(link, r.status_code)
                writer.writerow([link, r.status_code])
                # parse the page URLs contained in this sitemap
                soup = BeautifulSoup(r.content, 'html.parser')
                end = [item.text for item in soup.select("loc")]
                for a in end:
                    # a HEAD request is enough to get each page's status code
                    r = req.head(a)
                    print(a, r.status_code)
                    writer.writerow([a, r.status_code])

main("https://www.nemora.it/sitemap_index.xml")