使用 BeautifulSoup 进行网页抓取时出现索引错误(IndexError)
Webscraping using BeautifulSoup Index Error
使用 BeautifulSoup 获取数据时出现索引错误。我可以提取大量数据,但它在某处中断了。我该如何解决?
import requests
from bs4 import BeautifulSoup
# Walk AutoScout24 search-result pages 3 through 6, open every listing and print
# its make, model, year, colour and body type, followed by the listing count.
# This is the script the question is about: it indexes the fourth detail link
# unconditionally, so a listing with fewer links raises IndexError.
SITE_ROOT = "https://www.autoscout24.com"
DETAILS_ATTRS = {"class": "cldt-categorized-data cldt-data-section sc-pull-right"}

totalCar = 0
for pageNumber in range(3, 7):
    search_url = (
        "https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="
        + str(pageNumber)
        + "&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&"
    )
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content, "lxml")

    # One <div> per listing on the result page; its first <a> links to the offer.
    for detail in soup.find_all("div", attrs={"class": "cl-list-element cl-list-element-gap"}):
        car_link = SITE_ROOT + detail.a.get("href")
        car_soup = BeautifulSoup(requests.get(car_link).content, "lxml")

        # Look the details section up once and reuse it for every field.
        details = car_soup.find("div", attrs=DETAILS_ATTRS)
        car_make = details.select("dl > dd:nth-of-type(1)")[0].text
        links = details.select("dl > dd > a")
        car_model = links[0].text
        car_year = links[1].text
        car_color = links[2].text
        car_body = links[3].text  # fails here when the body-type link is absent

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make, car_model, car_year, car_color, car_body))
        print("-" * 20)
        totalCar += 1

print(totalCar)
有些车辆详情页没有车身类型(Body)信息,因此在按索引取值之前需要先检查该项是否存在:
import requests
from bs4 import BeautifulSoup
# Walk AutoScout24 search-result pages 3 through 6, open every listing and print
# its link, make, model, year, colour and body type, then print the number of
# listings processed. Any field that is missing from a listing page is printed
# as '-' instead of raising IndexError / AttributeError.
totalCar = 0

# Class attribute of the <div> on a listing page that holds the detail fields.
DETAILS_CLASS = "cldt-categorized-data cldt-data-section sc-pull-right"
# Seconds to wait for the server. requests applies no timeout by default, so a
# stalled connection would otherwise hang the script indefinitely.
REQUEST_TIMEOUT = 30
# Placeholder printed for a field that is not present on the page.
MISSING = "-"

for pageNumber in range(3, 7):
    r = requests.get(
        "https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="
        + str(pageNumber) + "&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&",
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(r.content, "lxml")
    car_details = soup.find_all("div", attrs={"class": "cl-list-element cl-list-element-gap"})

    for detail in car_details:
        # A result element without a usable link cannot be followed; skip it.
        anchor = detail.a
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue

        car_link = "https://www.autoscout24.com" + href
        car_r = requests.get(car_link, timeout=REQUEST_TIMEOUT)
        print(car_link)
        car_soup = BeautifulSoup(car_r.content, "lxml")

        # Locate the details section once; find() returns None when it is absent.
        section = car_soup.find("div", attrs={"class": DETAILS_CLASS})
        if section is None:
            make_cells, links = [], []
        else:
            make_cells = section.select("dl > dd:nth-of-type(1)")
            links = section.select("dl > dd > a")

        # .text is left unstripped so the printed output keeps its current form.
        car_make = make_cells[0].text if make_cells else MISSING

        # Pad the link texts to four entries so that a missing trailing field
        # (the body type, per the original check) becomes MISSING.
        # NOTE(review): fields are mapped by position; this assumes that when
        # links are missing it is the trailing ones -- confirm against the page.
        texts = [link.text for link in links]
        car_model, car_year, car_color, car_body = (texts + [MISSING] * 4)[:4]

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make, car_model, car_year, car_color, car_body))
        print("-" * 20)
        totalCar += 1

print(totalCar)
打印:
...
--------------------
https://www.autoscout24.com/offers/mercedes-benz-a-180-blueefficiency-limousine-5tuerig-gasoline-grey-73cbbad4-ab1c-4163-a7cf-76037408fcb8
Make:
Mercedes-Benz
Model:A 180 Year:2009 Color:Grey Body:Sedans
--------------------
https://www.autoscout24.com/offers/audi-a4-ambiente-1-8-ahk-xenon-sitzh-pdc-tempom-8fach-gasoline-black-f6517012-9dfb-4d93-a7dd-d0b9b9bdbbc6
Make:
Audi
Model:A4 Year:2008 Color:Black Body:Sedans
--------------------
80
使用 BeautifulSoup 获取数据时出现索引错误。我可以提取大量数据,但它在某处中断了。我该如何解决?
import requests
from bs4 import BeautifulSoup
# Walk AutoScout24 search-result pages 3 through 6, open every listing and print
# its make, model, year, colour and body type, followed by the listing count.
# This is the script the question is about: it indexes the fourth detail link
# unconditionally, so a listing with fewer links raises IndexError.
SITE_ROOT = "https://www.autoscout24.com"
DETAILS_ATTRS = {"class": "cldt-categorized-data cldt-data-section sc-pull-right"}

totalCar = 0
for pageNumber in range(3, 7):
    search_url = (
        "https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="
        + str(pageNumber)
        + "&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&"
    )
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content, "lxml")

    # One <div> per listing on the result page; its first <a> links to the offer.
    for detail in soup.find_all("div", attrs={"class": "cl-list-element cl-list-element-gap"}):
        car_link = SITE_ROOT + detail.a.get("href")
        car_soup = BeautifulSoup(requests.get(car_link).content, "lxml")

        # Look the details section up once and reuse it for every field.
        details = car_soup.find("div", attrs=DETAILS_ATTRS)
        car_make = details.select("dl > dd:nth-of-type(1)")[0].text
        links = details.select("dl > dd > a")
        car_model = links[0].text
        car_year = links[1].text
        car_color = links[2].text
        car_body = links[3].text  # fails here when the body-type link is absent

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make, car_model, car_year, car_color, car_body))
        print("-" * 20)
        totalCar += 1

print(totalCar)
有些车辆详情页没有车身类型(Body)信息,因此在按索引取值之前需要先检查该项是否存在:
import requests
from bs4 import BeautifulSoup
# Walk AutoScout24 search-result pages 3 through 6, open every listing and print
# its link, make, model, year, colour and body type, then print the number of
# listings processed. Any field that is missing from a listing page is printed
# as '-' instead of raising IndexError / AttributeError.
totalCar = 0

# Class attribute of the <div> on a listing page that holds the detail fields.
DETAILS_CLASS = "cldt-categorized-data cldt-data-section sc-pull-right"
# Seconds to wait for the server. requests applies no timeout by default, so a
# stalled connection would otherwise hang the script indefinitely.
REQUEST_TIMEOUT = 30
# Placeholder printed for a field that is not present on the page.
MISSING = "-"

for pageNumber in range(3, 7):
    r = requests.get(
        "https://www.autoscout24.com/lst/bmw?sort=standard&desc=0&offer=U&ustate=N%2CU&size=20&page="
        + str(pageNumber) + "&cy=D&mmm=47%7C%7C&mmm=9%7C%7C&atype=C&",
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(r.content, "lxml")
    car_details = soup.find_all("div", attrs={"class": "cl-list-element cl-list-element-gap"})

    for detail in car_details:
        # A result element without a usable link cannot be followed; skip it.
        anchor = detail.a
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue

        car_link = "https://www.autoscout24.com" + href
        car_r = requests.get(car_link, timeout=REQUEST_TIMEOUT)
        print(car_link)
        car_soup = BeautifulSoup(car_r.content, "lxml")

        # Locate the details section once; find() returns None when it is absent.
        section = car_soup.find("div", attrs={"class": DETAILS_CLASS})
        if section is None:
            make_cells, links = [], []
        else:
            make_cells = section.select("dl > dd:nth-of-type(1)")
            links = section.select("dl > dd > a")

        # .text is left unstripped so the printed output keeps its current form.
        car_make = make_cells[0].text if make_cells else MISSING

        # Pad the link texts to four entries so that a missing trailing field
        # (the body type, per the original check) becomes MISSING.
        # NOTE(review): fields are mapped by position; this assumes that when
        # links are missing it is the trailing ones -- confirm against the page.
        texts = [link.text for link in links]
        car_model, car_year, car_color, car_body = (texts + [MISSING] * 4)[:4]

        print("Make:{} Model:{} Year:{} Color:{} Body:{}".format(car_make, car_model, car_year, car_color, car_body))
        print("-" * 20)
        totalCar += 1

print(totalCar)
打印:
...
--------------------
https://www.autoscout24.com/offers/mercedes-benz-a-180-blueefficiency-limousine-5tuerig-gasoline-grey-73cbbad4-ab1c-4163-a7cf-76037408fcb8
Make:
Mercedes-Benz
Model:A 180 Year:2009 Color:Grey Body:Sedans
--------------------
https://www.autoscout24.com/offers/audi-a4-ambiente-1-8-ahk-xenon-sitzh-pdc-tempom-8fach-gasoline-black-f6517012-9dfb-4d93-a7dd-d0b9b9bdbbc6
Make:
Audi
Model:A4 Year:2008 Color:Black Body:Sedans
--------------------
80