无法通过 BeautifulSoup 获取 "href" 属性
Cannot get the "href" attributes via BeautifulSoup
简而言之,我无法从这个链接(一家土耳其的在线图书及相关商品销售商)获取带有“href”属性的链接。
这是我的代码(我知道它写得不算好,我在网上自学 Python 才几个月,所以也欢迎任何关于最佳实践的建议)
我想获取每本书的书名、作者、价格、出版商和链接;除了链接之外,其余部分都能按预期工作。
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
# Accumulators for the scraped values; each list receives one entry per
# matching element found on the fetched pages.
yazar = []  # writer names
fiyat = []  # prices
yayın = []  # publishers
isim = []  # book names

for page_number in range(1, 10):
    # NOTE(review): everything after "#" is a URL fragment; HTTP clients do
    # not send fragments to the server, so each iteration most likely
    # downloads the same document -- confirm against the site.
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(page_number)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")

    # book names
    for name_link in soup.find_all("a", {"class": "prd-name"}):
        isim.append(name_link.text)

    # writer names
    for writer_link in soup.find_all("a", {"class": "who text-overflow"}):
        yazar.append(writer_link.text)

    # prices: keep only the first whitespace-separated token of the text
    for price_box in soup.find_all("div", {"class": "prd-price"}):
        fiyat.append(price_box.text.split()[0])

    # publishers: read from the anchor's "title" attribute
    for publisher_link in soup.find_all("a", {"class": "prd-publisher"}):
        yayın.append(publisher_link.get("title"))

    # Random pause between page requests.
    sleep(randint(2, 4))
但是当我尝试用下面的方式获取链接时:
# NOTE: find_all() returns a ResultSet (a list of tags), not a single tag, so
# calling .get() on it raises AttributeError instead of returning a value.
soup.find_all("a", {"class":"prd-name"}).get("href")
结果却是 None,无论我怎么尝试都无法成功。
提前谢谢大家,抱歉这篇帖子比平常长了一些。
我认为你得到的并不是 None,
而是会得到:
AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all() 返回的是一个 ResultSet,所以你必须遍历它才能获得所有 href:
# find_all() returns a ResultSet, so visit each matching <a> tag in turn.
# href=True keeps only anchors that actually carry an href attribute, so the
# string concatenation below never receives None.
for a in soup.find_all("a", {"class": "prd-name"}, href=True):
    # The href values are site-relative paths; prefix the site root.
    print('https://www.dr.com.tr' + a.get("href"))
输出
https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...
您在页面上看到的数据是从外部位置加载的,因此您需要其他 URL 才能获得正确的数据:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Endpoint that is POSTed to for the product listing (instead of the
# browser-facing page URL).
url = "https://www.dr.com.tr/Catalog/CatalogProducts"

# Form fields sent with every request; "page" is overwritten per iteration.
data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")
    data["page"] = page
    response = requests.post(url, data=data, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    for p in soup.select(".prd-content"):
        # Join the element's stripped text pieces with "|", split them back
        # apart, and keep the first five as one row.
        all_data.append(p.get_text(strip=True, separator="|").split("|")[:5])

df = pd.DataFrame(
    all_data, columns=["name", "autor", "price", "type", "publisher"]
)
print(df)
df.to_csv("data.csv", index=False)
打印:
name autor price type publisher
0 Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli Esra Ezmeci 155,45 TL İnce Kapak Destek Yayınları
1 Şimdi Onlar Düşünsün Bircan Yıldırım 36,20 TL İnce Kapak Destek Yayınları
2 İz Bıraktığın Kadar Varsın Esra Ezmeci 36,20 TL İnce Kapak Destek Yayınları
...
并保存为 data.csv(以下是 LibreOffice 中的屏幕截图):
简而言之,我无法从这个链接(一家土耳其的在线图书及相关商品销售商)获取带有“href”属性的链接。
这是我的代码(我知道它写得不算好,我在网上自学 Python 才几个月,所以也欢迎任何关于最佳实践的建议)。我想获取每本书的书名、作者、价格、出版商和链接;除了链接之外,其余部分都能按预期工作。
import requests
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
# Accumulators for the scraped values; each list receives one entry per
# matching element found on the fetched pages.
yazar = []  # writer names
fiyat = []  # prices
yayın = []  # publishers
isim = []  # book names

for page_number in range(1, 10):
    # NOTE(review): everything after "#" is a URL fragment; HTTP clients do
    # not send fragments to the server, so each iteration most likely
    # downloads the same document -- confirm against the site.
    url = "https://www.dr.com.tr/CokSatanlar/Kitap#/page=" + str(page_number)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "lxml")

    # book names
    for name_link in soup.find_all("a", {"class": "prd-name"}):
        isim.append(name_link.text)

    # writer names
    for writer_link in soup.find_all("a", {"class": "who text-overflow"}):
        yazar.append(writer_link.text)

    # prices: keep only the first whitespace-separated token of the text
    for price_box in soup.find_all("div", {"class": "prd-price"}):
        fiyat.append(price_box.text.split()[0])

    # publishers: read from the anchor's "title" attribute
    for publisher_link in soup.find_all("a", {"class": "prd-publisher"}):
        yayın.append(publisher_link.get("title"))

    # Random pause between page requests.
    sleep(randint(2, 4))
但是当我尝试用下面的方式获取链接时:
# NOTE: find_all() returns a ResultSet (a list of tags), not a single tag, so
# calling .get() on it raises AttributeError instead of returning a value.
soup.find_all("a", {"class":"prd-name"}).get("href")
结果却是 None,无论我怎么尝试都无法成功。提前谢谢大家,抱歉这篇帖子比平常长了一些。
我认为你得到的并不是 None,
而是会得到:
AttributeError: ResultSet object has no attribute 'get'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
find_all() 返回的是一个 ResultSet,所以你必须遍历它才能获得所有 href:
# find_all() returns a ResultSet, so visit each matching <a> tag in turn.
# href=True keeps only anchors that actually carry an href attribute, so the
# string concatenation below never receives None.
for a in soup.find_all("a", {"class": "prd-name"}, href=True):
    # The href values are site-relative paths; prefix the site root.
    print('https://www.dr.com.tr' + a.get("href"))
输出
https://www.dr.com.tr/kitap/daha-adil-bir-dunya-mumkun/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001934858001
https://www.dr.com.tr/kitap/burasi-cok-onemli-enerjiden-ekonomiye-tam-bagimsiz-turkiye/arastirma-tarih/politika-arastirma/turkiye-politika-/urunno=0001966362001
https://www.dr.com.tr/kitap/iz-biraktigin-kadar-varsin/egitim-basvuru/psikoloji-bilimi/urunno=0001947472001
https://www.dr.com.tr/kitap/simdi-onlar-dusunsun/bircan-yildirim/egitim-basvuru/kisisel-gelisim/urunno=0001964436001
https://www.dr.com.tr/kitap/kadinlar-sicak-erkekler-soguk-sever/esra-ezmeci/egitim-basvuru/psikoloji-bilimi/urunno=0001904239001
https://www.dr.com.tr/kitap/dustugunde-kalkarsan-hayat-guzeldir/egitim-basvuru/psikoloji-bilimi/urunno=0001816754001
...
您在页面上看到的数据是从外部位置加载的,因此您需要其他 URL 才能获得正确的数据:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Endpoint that is POSTed to for the product listing (instead of the
# browser-facing page URL).
url = "https://www.dr.com.tr/Catalog/CatalogProducts"

# Form fields sent with every request; "page" is overwritten per iteration.
data = {
    "catalogId": "4020",
    "page": "1",
    "sortfield": "soldcount",
    "sortorder": "desc",
    "size": "60",
    "categoryid": "0",
    "parentId": "0",
    "mediatypes": "",
    "HideNotForSale": "true",
    "minPrice": "-1",
    "maxPrice": "-1",
    "writer": "",
    "minDiscount": "-1",
    "maxdiscount": "-1",
    "language": "",
}

all_data = []
for page in range(1, 3):  # <-- increase number of pages here
    print(f"Getting page {page}")
    data["page"] = page
    response = requests.post(url, data=data, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    for p in soup.select(".prd-content"):
        # Join the element's stripped text pieces with "|", split them back
        # apart, and keep the first five as one row.
        all_data.append(p.get_text(strip=True, separator="|").split("|")[:5])

df = pd.DataFrame(
    all_data, columns=["name", "autor", "price", "type", "publisher"]
)
print(df)
df.to_csv("data.csv", index=False)
打印:
name autor price type publisher
0 Esra Ezmeci Seti 5 Kitap Takım - Defter Hediyeli Esra Ezmeci 155,45 TL İnce Kapak Destek Yayınları
1 Şimdi Onlar Düşünsün Bircan Yıldırım 36,20 TL İnce Kapak Destek Yayınları
2 İz Bıraktığın Kadar Varsın Esra Ezmeci 36,20 TL İnce Kapak Destek Yayınları
...
并保存为 data.csv(以下是 LibreOffice 中的屏幕截图):