抓取问题。数据未检索
Scraping issue. data not retrieved
我正在尝试从转会市场上收集一些足球数据。
我要提取:
- 联盟名称
- 联赛中的俱乐部
- 每个玩家的信息
我运行的代码没有问题,但它没有检索到任何信息。
我是数据抓取的新手。我不确定为什么它不起作用。请帮忙
'leauge.py'
from bs4 import BeautifulSoup
import csv
from team import team
import requests
headers = {'user-agent': '>> put my user agent<< '}
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/saison_id/2019"
numTeams = 20
teamcount = 0
result = requests.get(url, headers=headers)
src = result.content
soup = BeautifulSoup(src, 'lxml')
f = open('database.csv', 'w')
wr = csv.writer(f, quoting=csv.QUOTE_ALL)
for td_tag in soup.find_all("td"):
if td_tag.get('class') == ['zentriert']:
a_tag = td_tag.find('a')
if a_tag != None and teamcount < numTeams:
teamcount += 1
url = 'https://www.transfermarkt.com' + a_tag.get('href') + '/plus/1'
print(url)
t1 = team(url)
wr.writerow([a_tag.get('title')])
wr.writerow(['Name', 'Club', 'Position', 'Nationality', 'DOB (Age)', 'Height', 'Foot', 'Date Joined', 'Contract Expires'])
names = t1.getNames()
bdays = t1.getBirth()
pos = t1.getPos()
nats = t1.getNat()
for x in range(0,len(names)):
playerdata = []
playerdata.append(names[x])
playerdata.append(a_tag.get('title'))
playerdata.append(pos[x])
playerdata.append("N/A")
playerdata.append(bdays[5*x])
playerdata.append(bdays[5*x+1])
playerdata.append(bdays[5*x+2])
playerdata.append(bdays[5*x+3])
playerdata.append(bdays[5*x+4])
wr.writerow(playerdata)
f.close()
'team.py'
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
if td_tag.get('class') == ['zentriert']:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
if td_tag.string == 'Second Striker' or td_tag.string == 'Right Midfield' or td_tag.string == 'Left Midfield' or td_tag.string == 'Goalkeeper' or td_tag.string == 'Left-Back' or td_tag.string == 'Centre-Back' or td_tag.string == 'Right-Back' or td_tag.string == 'Defensive Midfield' or td_tag.string == 'Central Midfield' or td_tag.string == 'Attacking Midfield' or td_tag.string == 'Left Winger' or td_tag.string == 'Right Winger' or td_tag.string == 'Centre-Forward':
pos.append(td_tag.string)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
if td_tag.get('class') == ['flaggenrahmen'] and td_tag.string == None:
nat.append(td_tag.get('title'))
return nat
'output'
"Manchester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Liverpool FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Tottenham Hotspur"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Chelsea FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Manchester United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Arsenal FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Everton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Leicester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Wolverhampton Wanderers"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"West Ham United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"AFC Bournemouth"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Newcastle United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Aston Villa"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Southampton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Brighton & Hove Albion"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Watford FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Crystal Palace"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Burnley FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Norwich City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Sheffield United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
您的抓取代码存在一些问题。您确实需要打印出 HTML 并确保您所要求的是您想要的。中间调试打印总是有帮助的。
一方面,td_tag.string
值 returns 是一个包含标签内容组合的长字符串。它可能包含字符串“Second Striker”,但它也有很多其他废话。此外,span_tag.get('class') == ['hide-for-small']
仅在跨度仅包含 class 时才匹配。大多数跨度还有其他 classes。
这似乎有效。这是 team.py
:
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
classes = td_tag.get('class')
if classes and 'zentriert' in classes:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
positions = ('Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper', 'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield', 'Central Midfield', 'Attacking Midfield', 'Left Winger', 'Right Winger', 'Centre-Forward')
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
for p in self.positions:
if td_tag.string and p in td_tag.string:
pos.append(p)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
classes = td_tag.get('class')
if classes and 'flaggenrahmen' in classes and not td_tag.string:
nat.append(td_tag.get('title'))
return nat
我正在尝试从转会市场上收集一些足球数据。 我要提取:
- 联盟名称
- 联赛中的俱乐部
- 每个玩家的信息
我运行的代码没有问题,但它没有检索到任何信息。 我是数据抓取的新手。我不确定为什么它不起作用。请帮忙
'leauge.py'
from bs4 import BeautifulSoup
import csv
from team import team
import requests
headers = {'user-agent': '>> put my user agent<< '}
url = "https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/saison_id/2019"
numTeams = 20
teamcount = 0
result = requests.get(url, headers=headers)
src = result.content
soup = BeautifulSoup(src, 'lxml')
f = open('database.csv', 'w')
wr = csv.writer(f, quoting=csv.QUOTE_ALL)
for td_tag in soup.find_all("td"):
if td_tag.get('class') == ['zentriert']:
a_tag = td_tag.find('a')
if a_tag != None and teamcount < numTeams:
teamcount += 1
url = 'https://www.transfermarkt.com' + a_tag.get('href') + '/plus/1'
print(url)
t1 = team(url)
wr.writerow([a_tag.get('title')])
wr.writerow(['Name', 'Club', 'Position', 'Nationality', 'DOB (Age)', 'Height', 'Foot', 'Date Joined', 'Contract Expires'])
names = t1.getNames()
bdays = t1.getBirth()
pos = t1.getPos()
nats = t1.getNat()
for x in range(0,len(names)):
playerdata = []
playerdata.append(names[x])
playerdata.append(a_tag.get('title'))
playerdata.append(pos[x])
playerdata.append("N/A")
playerdata.append(bdays[5*x])
playerdata.append(bdays[5*x+1])
playerdata.append(bdays[5*x+2])
playerdata.append(bdays[5*x+3])
playerdata.append(bdays[5*x+4])
wr.writerow(playerdata)
f.close()
'team.py'
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
a_tag = span_tag.find('a')
if a_tag != None:
if span_tag.get('class') == ['hide-for-small']:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
if td_tag.get('class') == ['zentriert']:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
if td_tag.string == 'Second Striker' or td_tag.string == 'Right Midfield' or td_tag.string == 'Left Midfield' or td_tag.string == 'Goalkeeper' or td_tag.string == 'Left-Back' or td_tag.string == 'Centre-Back' or td_tag.string == 'Right-Back' or td_tag.string == 'Defensive Midfield' or td_tag.string == 'Central Midfield' or td_tag.string == 'Attacking Midfield' or td_tag.string == 'Left Winger' or td_tag.string == 'Right Winger' or td_tag.string == 'Centre-Forward':
pos.append(td_tag.string)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
if td_tag.get('class') == ['flaggenrahmen'] and td_tag.string == None:
nat.append(td_tag.get('title'))
return nat
'output'
"Manchester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Liverpool FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Tottenham Hotspur"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Chelsea FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Manchester United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Arsenal FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Everton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Leicester City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Wolverhampton Wanderers"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"West Ham United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"AFC Bournemouth"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Newcastle United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Aston Villa"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Southampton FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Brighton & Hove Albion"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Watford FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Crystal Palace"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Burnley FC"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Norwich City"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
"Sheffield United"
"Name","Club","Position","Nationality","DOB (Age)","Height","Foot","Date Joined","Contract Expires"
您的抓取代码存在一些问题。您确实需要打印出 HTML 并确保您所要求的是您想要的。中间调试打印总是有帮助的。
一方面,td_tag.string
值 returns 是一个包含标签内容组合的长字符串。它可能包含字符串“Second Striker”,但它也有很多其他废话。此外,span_tag.get('class') == ['hide-for-small']
仅在跨度仅包含 class 时才匹配。大多数跨度还有其他 classes。
这似乎有效。这是 team.py
:
import requests
from bs4 import BeautifulSoup
class team:
def __init__(self, url):
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
result = requests.get(url, headers=headers)
src = result.content
self.soup = BeautifulSoup(src, 'lxml')
def getNames(self):
names = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
names.append(a_tag.get('title'))
return names
def getID(self):
ids = []
for span_tag in self.soup.find_all("span"):
classes = span_tag.get("class")
if not (classes and 'hide-for-small' in classes):
continue
a_tag = span_tag.find('a')
if a_tag:
ids.append(a_tag.get('id'))
return ids
def getBirth(self):
bday = []
for td_tag in self.soup.find_all('td'):
classes = td_tag.get('class')
if classes and 'zentriert' in classes:
if td_tag.string != None:
bday.append(td_tag.string)
return bday
positions = ('Second Striker', 'Right Midfield', 'Left Midfield', 'Goalkeeper', 'Left-Back', 'Centre-Back', 'Right-Back', 'Defensive Midfield', 'Central Midfield', 'Attacking Midfield', 'Left Winger', 'Right Winger', 'Centre-Forward')
def getPos(self):
pos = []
for td_tag in self.soup.find_all('td'):
for p in self.positions:
if td_tag.string and p in td_tag.string:
pos.append(p)
return pos
def getNat(self):
nat = []
for td_tag in self.soup.find_all('img'):
classes = td_tag.get('class')
if classes and 'flaggenrahmen' in classes and not td_tag.string:
nat.append(td_tag.get('title'))
return nat