使用 Python 抓取 Spotify
Scrape Spotify using Python
我收到了 AttributeError: 'NoneType' object has no attribute 'find'
错误。根据我做的一些调查,可能的原因是 Cloudflare 阻止了我访问 Spotify。这个问题该如何解决?
部分代码如下所示:
# --- Original (broken) script quoted from the question ---
# NOTE(review): indentation was lost in the paste — the def bodies below are
# not indented, so this snippet is not runnable exactly as shown.
dates=[]
urls=[]
final=[]
# BUG: missing trailing '/' — url + date yields '...dailyYYYY-MM-DD'.
url = 'https://spotifycharts.com/regional/us/daily'
# NOTE(review): `date` is called here but no import is visible in this
# snippet — presumably `from datetime import date, timedelta`; confirm
# against the full script.
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date
# print(delta.days+1)
# Build one 'YYYY-MM-DD' string per day in [start_date, end_date].
for i in range(delta.days+1):
day= start_date +timedelta(days=i)
day_string =day.strftime('%Y-%m-%d')
dates.append(day_string)
# Append one chart URL per collected date (mutates the global `urls`).
def add_url():
for date in dates:
c_string=url+date
urls.append(c_string)
add_url()
# Scrape one chart page; x is the page URL.
# BUG: reads the global `songs` assigned in the loop below. When the request
# is blocked (403 — no browser User-Agent is sent), the page has no
# .chart-table, soup.find(...) returns None, and songs.find("tbody") raises
# AttributeError: 'NoneType' object has no attribute 'find' — the reported error.
def song_scrape(x):
pg = x
for tr in songs.find("tbody").findAll("tr"):
artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
artist = artist.replace("by ", "").strip()
title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
songid = songid.split("track/")[1]
url_date = x.split("daily/")[1]
final.append([title, artist, songid, url_date])
for u in urls:
# BUG: requests.get without headers — the site (per the question, via
# Cloudflare) answers 403, so the parse below finds no chart table.
read_pg= requests.get(u)
sleep(2)
# return read_pg.status_code
soup= BeautifulSoup(read_pg.content, "html.parser")
songs = soup.find("table", {"class": "chart-table"})
song_scrape(u)
# Collect every scraped row into a DataFrame and write it out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
final_df.to_csv(f, header= True, index=False)
为了解决 403 Forbidden 错误,你需要在请求中添加特定的代码(例如浏览器请求头)。
对您的代码进行更多更改后,我能够获取数据。
这是您修改后的工作代码:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd
# Variables:
dates = []
urls = []
final = []
url = 'https://spotifycharts.com/regional/us/daily/'
start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date
# print(delta.days+1)

# One 'YYYY-MM-DD' string for every day from start_date through end_date.
dates = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]


def add_url():
    """Append one daily-chart URL per collected date (extends the global `urls`)."""
    urls.extend(url + day for day in dates)


add_url()
def song_scrape(x, songs):
    """Extract every chart row from one daily-chart table.

    x     -- the page URL; its 'daily/<date>' suffix supplies the chart date
    songs -- the BeautifulSoup <table> element holding the chart rows

    Appends [title, artist, song_id, chart_date] rows to the global `final`.
    """
    # Hoisted out of the loop: the chart date is the same for every row.
    url_date = x.split("daily/")[1]
    for tr in songs.find("tbody").find_all("tr"):  # find_all: current bs4 name for findAll
        # Single lookup of the track cell (was looked up twice per row).
        track_cell = tr.find("td", {"class": "chart-table-track"})
        artist = track_cell.find("span").text
        artist = artist.replace("by ", "").strip()
        title = track_cell.find("strong").text
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        songid = songid.split("track/")[1]
        final.append([title, artist, songid, url_date])
# Avoid HTTP 403 Forbidden: send a browser-like User-Agent so
# Cloudflare/Spotify serves the real chart page instead of blocking us.
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

for u in urls:
    read_pg = requests.get(u, headers=header)
    sleep(2)  # be polite: throttle between requests
    # Skip pages that were still blocked or otherwise failed; parsing them
    # would find no table and crash downstream.
    if read_pg.status_code != 200:
        print(f"Skipping {u}: HTTP {read_pg.status_code}")
        continue
    # Using BeautifulSoup, we're getting the specific data from the HTML:
    soup = BeautifulSoup(read_pg.text, "html.parser")
    # There is only 1 table on the page — the one with the data to extract.
    # find() returns None (instead of findAll(...)[0] raising IndexError)
    # when the table is missing, so we can guard explicitly.
    songs = soup.find("table")
    if songs is None:
        print(f"Skipping {u}: no chart table found")
        continue
    # Call "song_scrape" to pull every row of this day's chart into `final`.
    song_scrape(u, songs)

final_df = pd.DataFrame(final, columns=["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want
# newline='' prevents blank lines between CSV rows on Windows when passing
# an open file object to to_csv.
with open('spmooddata.csv', 'w', newline='') as f:
    final_df.to_csv(f, header=True, index=False)
我收到了 AttributeError: 'NoneType' object has no attribute 'find'
错误。根据我做的一些调查,可能的原因是 Cloudflare 阻止了我访问 Spotify。这个问题该如何解决?
部分代码如下所示:
# --- Original (broken) script quoted from the question ---
# NOTE(review): indentation was lost in the paste — the def bodies below are
# not indented, so this snippet is not runnable exactly as shown.
dates=[]
urls=[]
final=[]
# BUG: missing trailing '/' — url + date yields '...dailyYYYY-MM-DD'.
url = 'https://spotifycharts.com/regional/us/daily'
# NOTE(review): `date` is called here but no import is visible in this
# snippet — presumably `from datetime import date, timedelta`; confirm
# against the full script.
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date
# print(delta.days+1)
# Build one 'YYYY-MM-DD' string per day in [start_date, end_date].
for i in range(delta.days+1):
day= start_date +timedelta(days=i)
day_string =day.strftime('%Y-%m-%d')
dates.append(day_string)
# Append one chart URL per collected date (mutates the global `urls`).
def add_url():
for date in dates:
c_string=url+date
urls.append(c_string)
add_url()
# Scrape one chart page; x is the page URL.
# BUG: reads the global `songs` assigned in the loop below. When the request
# is blocked (403 — no browser User-Agent is sent), the page has no
# .chart-table, soup.find(...) returns None, and songs.find("tbody") raises
# AttributeError: 'NoneType' object has no attribute 'find' — the reported error.
def song_scrape(x):
pg = x
for tr in songs.find("tbody").findAll("tr"):
artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
artist = artist.replace("by ", "").strip()
title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
songid = songid.split("track/")[1]
url_date = x.split("daily/")[1]
final.append([title, artist, songid, url_date])
for u in urls:
# BUG: requests.get without headers — the site (per the question, via
# Cloudflare) answers 403, so the parse below finds no chart table.
read_pg= requests.get(u)
sleep(2)
# return read_pg.status_code
soup= BeautifulSoup(read_pg.content, "html.parser")
songs = soup.find("table", {"class": "chart-table"})
song_scrape(u)
# Collect every scraped row into a DataFrame and write it out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
final_df.to_csv(f, header= True, index=False)
对您的代码进行更多更改后,我能够获取数据。
这是您修改后的工作代码:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd
# Variables:
dates = []
urls = []
final = []
url = 'https://spotifycharts.com/regional/us/daily/'
start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date
# print(delta.days+1)

# One 'YYYY-MM-DD' string for every day from start_date through end_date.
dates = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]


def add_url():
    """Append one daily-chart URL per collected date (extends the global `urls`)."""
    urls.extend(url + day for day in dates)


add_url()
def song_scrape(x, songs):
    """Extract every chart row from one daily-chart table.

    x     -- the page URL; its 'daily/<date>' suffix supplies the chart date
    songs -- the BeautifulSoup <table> element holding the chart rows

    Appends [title, artist, song_id, chart_date] rows to the global `final`.
    """
    # Hoisted out of the loop: the chart date is the same for every row.
    url_date = x.split("daily/")[1]
    for tr in songs.find("tbody").find_all("tr"):  # find_all: current bs4 name for findAll
        # Single lookup of the track cell (was looked up twice per row).
        track_cell = tr.find("td", {"class": "chart-table-track"})
        artist = track_cell.find("span").text
        artist = artist.replace("by ", "").strip()
        title = track_cell.find("strong").text
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        songid = songid.split("track/")[1]
        final.append([title, artist, songid, url_date])
# Avoid HTTP 403 Forbidden: send a browser-like User-Agent so
# Cloudflare/Spotify serves the real chart page instead of blocking us.
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

for u in urls:
    read_pg = requests.get(u, headers=header)
    sleep(2)  # be polite: throttle between requests
    # Skip pages that were still blocked or otherwise failed; parsing them
    # would find no table and crash downstream.
    if read_pg.status_code != 200:
        print(f"Skipping {u}: HTTP {read_pg.status_code}")
        continue
    # Using BeautifulSoup, we're getting the specific data from the HTML:
    soup = BeautifulSoup(read_pg.text, "html.parser")
    # There is only 1 table on the page — the one with the data to extract.
    # find() returns None (instead of findAll(...)[0] raising IndexError)
    # when the table is missing, so we can guard explicitly.
    songs = soup.find("table")
    if songs is None:
        print(f"Skipping {u}: no chart table found")
        continue
    # Call "song_scrape" to pull every row of this day's chart into `final`.
    song_scrape(u, songs)

final_df = pd.DataFrame(final, columns=["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want
# newline='' prevents blank lines between CSV rows on Windows when passing
# an open file object to to_csv.
with open('spmooddata.csv', 'w', newline='') as f:
    final_df.to_csv(f, header=True, index=False)