使用 Python 抓取 Spotify
Scrape Spotify using Python
我收到了 AttributeError: 'NoneType' object has no attribute 'find'
错误。根据我做的一些调查,可能的原因是 Cloudflare 阻止了我访问 Spotify。这个问题该如何解决?
部分代码如下所示:
# --- Original (broken) script quoted from the question ---
# NOTE(review): indentation was lost in the paste — the def bodies below are
# not indented, so this snippet is not runnable exactly as shown.
dates=[]
urls=[]
final=[]
# BUG: missing trailing '/' — url + date yields '...dailyYYYY-MM-DD'.
url = 'https://spotifycharts.com/regional/us/daily'
# NOTE(review): `date` is called here but no import is visible in this
# snippet — presumably `from datetime import date, timedelta`; confirm
# against the full script.
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date
# print(delta.days+1)
# Build one 'YYYY-MM-DD' string per day in [start_date, end_date].
for i in range(delta.days+1):
day= start_date +timedelta(days=i)
day_string =day.strftime('%Y-%m-%d')
dates.append(day_string)
# Append one chart URL per collected date (mutates the global `urls`).
def add_url():
for date in dates:
c_string=url+date
urls.append(c_string)
add_url()
# Scrape one chart page; x is the page URL.
# BUG: reads the global `songs` assigned in the loop below. When the request
# is blocked (403 — no browser User-Agent is sent), the page has no
# .chart-table, soup.find(...) returns None, and songs.find("tbody") raises
# AttributeError: 'NoneType' object has no attribute 'find' — the reported error.
def song_scrape(x):
pg = x
for tr in songs.find("tbody").findAll("tr"):
artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
artist = artist.replace("by ", "").strip()
title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
songid = songid.split("track/")[1]
url_date = x.split("daily/")[1]
final.append([title, artist, songid, url_date])
for u in urls:
# BUG: requests.get without headers — the site (per the question, via
# Cloudflare) answers 403, so the parse below finds no chart table.
read_pg= requests.get(u)
sleep(2)
# return read_pg.status_code
soup= BeautifulSoup(read_pg.content, "html.parser")
songs = soup.find("table", {"class": "chart-table"})
song_scrape(u)
# Collect every scraped row into a DataFrame and write it out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
final_df.to_csv(f, header= True, index=False)
为了解决 403 Forbidden 错误,你需要在请求中添加特定的代码(例如浏览器请求头)。
对您的代码进行更多更改后,我能够获取数据。
这是您修改后的工作代码:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd
# Variables:
dates = []
urls = []
final = []
url = 'https://spotifycharts.com/regional/us/daily/'
start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date
# print(delta.days+1)

# One 'YYYY-MM-DD' string for every day from start_date through end_date.
dates = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]


def add_url():
    """Append one daily-chart URL per collected date (extends the global `urls`)."""
    urls.extend(url + day for day in dates)


add_url()
def song_scrape(x, songs):
    """Extract every chart row from one daily-chart table.

    x     -- the page URL; its 'daily/<date>' suffix supplies the chart date
    songs -- the BeautifulSoup <table> element holding the chart rows

    Appends [title, artist, song_id, chart_date] rows to the global `final`.
    """
    # Hoisted out of the loop: the chart date is the same for every row.
    url_date = x.split("daily/")[1]
    for tr in songs.find("tbody").find_all("tr"):  # find_all: current bs4 name for findAll
        # Single lookup of the track cell (was looked up twice per row).
        track_cell = tr.find("td", {"class": "chart-table-track"})
        artist = track_cell.find("span").text
        artist = artist.replace("by ", "").strip()
        title = track_cell.find("strong").text
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        songid = songid.split("track/")[1]
        final.append([title, artist, songid, url_date])
# Avoid HTTP 403 Forbidden: send a browser-like User-Agent so
# Cloudflare/Spotify serves the real chart page instead of blocking us.
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

for u in urls:
    read_pg = requests.get(u, headers=header)
    sleep(2)  # be polite: throttle between requests
    # Skip pages that were still blocked or otherwise failed; parsing them
    # would find no table and crash downstream.
    if read_pg.status_code != 200:
        print(f"Skipping {u}: HTTP {read_pg.status_code}")
        continue
    # Using BeautifulSoup, we're getting the specific data from the HTML:
    soup = BeautifulSoup(read_pg.text, "html.parser")
    # There is only 1 table on the page — the one with the data to extract.
    # find() returns None (instead of findAll(...)[0] raising IndexError)
    # when the table is missing, so we can guard explicitly.
    songs = soup.find("table")
    if songs is None:
        print(f"Skipping {u}: no chart table found")
        continue
    # Call "song_scrape" to pull every row of this day's chart into `final`.
    song_scrape(u, songs)

final_df = pd.DataFrame(final, columns=["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want
# newline='' prevents blank lines between CSV rows on Windows when passing
# an open file object to to_csv.
with open('spmooddata.csv', 'w', newline='') as f:
    final_df.to_csv(f, header=True, index=False)
我收到了 AttributeError: 'NoneType' object has no attribute 'find'
错误。根据我做的一些调查,可能的原因是 Cloudflare 阻止了我访问 Spotify。这个问题该如何解决?
部分代码如下所示:
# --- Original (broken) script quoted from the question ---
# NOTE(review): indentation was lost in the paste — the def bodies below are
# not indented, so this snippet is not runnable exactly as shown.
dates=[]
urls=[]
final=[]
# BUG: missing trailing '/' — url + date yields '...dailyYYYY-MM-DD'.
url = 'https://spotifycharts.com/regional/us/daily'
# NOTE(review): `date` is called here but no import is visible in this
# snippet — presumably `from datetime import date, timedelta`; confirm
# against the full script.
start_date = date(2022,3,1)
end_date = date(2022,4,30)
delta = end_date - start_date
# print(delta.days+1)
# Build one 'YYYY-MM-DD' string per day in [start_date, end_date].
for i in range(delta.days+1):
day= start_date +timedelta(days=i)
day_string =day.strftime('%Y-%m-%d')
dates.append(day_string)
# Append one chart URL per collected date (mutates the global `urls`).
def add_url():
for date in dates:
c_string=url+date
urls.append(c_string)
add_url()
# Scrape one chart page; x is the page URL.
# BUG: reads the global `songs` assigned in the loop below. When the request
# is blocked (403 — no browser User-Agent is sent), the page has no
# .chart-table, soup.find(...) returns None, and songs.find("tbody") raises
# AttributeError: 'NoneType' object has no attribute 'find' — the reported error.
def song_scrape(x):
pg = x
for tr in songs.find("tbody").findAll("tr"):
artist = tr.find("td", {"class": "chart-table-track"}).find("span").text
artist = artist.replace("by ", "").strip()
title = tr.find("td", {"class": "chart-table-track"}).find("strong").text
songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
songid = songid.split("track/")[1]
url_date = x.split("daily/")[1]
final.append([title, artist, songid, url_date])
for u in urls:
# BUG: requests.get without headers — the site (per the question, via
# Cloudflare) answers 403, so the parse below finds no chart table.
read_pg= requests.get(u)
sleep(2)
# return read_pg.status_code
soup= BeautifulSoup(read_pg.content, "html.parser")
songs = soup.find("table", {"class": "chart-table"})
song_scrape(u)
# Collect every scraped row into a DataFrame and write it out as CSV.
final_df = pd.DataFrame(final, columns= ["Title", "Artist", "Song ID", "Chart Date"])
with open('spmooddata.csv', 'w') as f:
final_df.to_csv(f, header= True, index=False)
对您的代码进行更多更改后,我能够获取数据。
这是您修改后的工作代码:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd
# Variables:
dates = []
urls = []
final = []
url = 'https://spotifycharts.com/regional/us/daily/'
start_date = datetime(2022, 3, 1)
end_date = datetime(2022, 3, 5)
delta = end_date - start_date
# print(delta.days+1)

# One 'YYYY-MM-DD' string for every day from start_date through end_date.
dates = [
    (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
    for offset in range(delta.days + 1)
]


def add_url():
    """Append one daily-chart URL per collected date (extends the global `urls`)."""
    urls.extend(url + day for day in dates)


add_url()
def song_scrape(x, songs):
    """Extract every chart row from one daily-chart table.

    x     -- the page URL; its 'daily/<date>' suffix supplies the chart date
    songs -- the BeautifulSoup <table> element holding the chart rows

    Appends [title, artist, song_id, chart_date] rows to the global `final`.
    """
    # Hoisted out of the loop: the chart date is the same for every row.
    url_date = x.split("daily/")[1]
    for tr in songs.find("tbody").find_all("tr"):  # find_all: current bs4 name for findAll
        # Single lookup of the track cell (was looked up twice per row).
        track_cell = tr.find("td", {"class": "chart-table-track"})
        artist = track_cell.find("span").text
        artist = artist.replace("by ", "").strip()
        title = track_cell.find("strong").text
        songid = tr.find("td", {"class": "chart-table-image"}).find("a").get("href")
        songid = songid.split("track/")[1]
        final.append([title, artist, songid, url_date])
# Avoid HTTP 403 Forbidden: send a browser-like User-Agent so
# Cloudflare/Spotify serves the real chart page instead of blocking us.
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

for u in urls:
    read_pg = requests.get(u, headers=header)
    sleep(2)  # be polite: throttle between requests
    # Skip pages that were still blocked or otherwise failed; parsing them
    # would find no table and crash downstream.
    if read_pg.status_code != 200:
        print(f"Skipping {u}: HTTP {read_pg.status_code}")
        continue
    # Using BeautifulSoup, we're getting the specific data from the HTML:
    soup = BeautifulSoup(read_pg.text, "html.parser")
    # There is only 1 table on the page — the one with the data to extract.
    # find() returns None (instead of findAll(...)[0] raising IndexError)
    # when the table is missing, so we can guard explicitly.
    songs = soup.find("table")
    if songs is None:
        print(f"Skipping {u}: no chart table found")
        continue
    # Call "song_scrape" to pull every row of this day's chart into `final`.
    song_scrape(u, songs)

final_df = pd.DataFrame(final, columns=["Title", "Artist", "Song ID", "Chart Date"])
# print(final_df) # Print the dataframe, if you want
# newline='' prevents blank lines between CSV rows on Windows when passing
# an open file object to to_csv.
with open('spmooddata.csv', 'w', newline='') as f:
    final_df.to_csv(f, header=True, index=False)