使用 python 2.7 从表中抓取数据
Scraping data from tables using python 2.7
我已经尝试了两种不同的方法,但都无法正常工作。我正在尝试抓取此网页 http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/ 中 "TEAM STATS" 部分的统计信息。我需要的是指定统计类别(例如 "NET YARDS RUSHING")后面的数字。以下是我尝试过但没有成功的方法。
第一种方式:
import pickle
import math
import os
import urllib2
from lxml import etree
from bs4 import BeautifulSoup
from urllib import urlopen
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell
#Last Two Game info Home [H] or Away [A]
# First attempt: parse the box score page with lxml and look up the
# "Net Yards Rushing" row with an XPath query.
# NOTE(review): indentation was lost in this listing; the nesting of the
# if/elif/else bodies below has to be inferred.
favLastGM = 'H' #Higher week number 2
favLastGM2 = 'A' #Lower week number 1
#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week)
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Fetch the page and parse the response into an lxml HTML tree.
response8 = urllib2.urlopen(favPrevGMInfoUrl)
htmlparser8 = etree.HTMLParser()
tree8 = etree.parse(response8,htmlparser8)
#FAVORITE
if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards
# NOTE(review): `parent::td` only matches when the label cell's parent is
# itself a <td>; if the parent is a <tr> this query returns an empty list.
text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[1]/text()')
# Convert the first text node to an int only when the query matched something.
if text:
favDef_rushYards_L2_1 = int(text[0].strip())
# Python 2 print statement; the trailing comma keeps the next print on the same line.
print("test"),
print favDef_rushYards_L2_1
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards
# NOTE(review): XPath positions start at 1, so the predicate `[0]` never
# matches; combined with `parent::td` this query presumably returns nothing.
text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[0]/text()')
if text:
favDef_rushYards_L2_1 = int(text[0].strip())
print("test"),
print favDef_rushYards_L2_1
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
else:
# Any value other than 'A' or 'H' is reported as invalid.
print("***************************************************")
print("NOT A VALID ENTRY - favLastGM !")
print("***************************************************")
第二种方式:
import pickle
import math
import os
import urllib2
from lxml import etree
from bs4 import BeautifulSoup
from urllib import urlopen
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell
# Second attempt: parse the page with BeautifulSoup and walk the rows of
# the "team-stats" table looking for the "Net Yards Rushing" row.
# NOTE(review): indentation was lost in this listing; the nesting of the
# loops and branches below has to be inferred.
#Last Two Game info Home [H] or Away [A]
favLastGM = 'H' #Higher week number 2
favLastGM2 = 'A' #Lower week number 1
#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week)
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Download the raw HTML and build the soup.
favPrevGMhtml2 = urlopen(favPrevGMInfoUrl).read()
favPrevGMsoup2 = BeautifulSoup(favPrevGMhtml2)
# First <table> carrying class "team-stats".
# NOTE(review): find() yields None when no such table exists, which would
# make the findAll calls below fail - confirm the class name against the page.
favPrevGM2Reg = favPrevGMsoup2.find("table", { "class" : "team-stats" })
# NOTE(review): this list is never used below; the loops append to
# favPrevGM2Reg (the table) instead - presumably favPrevGM2Reg2 was intended.
favPrevGM2Reg2 = []
if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards
rush = 'Net Yards Rushing'
for row in favPrevGM2Reg.findAll("tr"):
# NOTE(review): `in` is applied to the element returned by findNext('td'),
# not to its text - verify this matches the label as intended.
if rush in row.findNext('td'): #Change Year for every new season
for item in row.findAll("td"):
favPrevGM2Reg.append(item.text)
# Reads index 1 of favPrevGM2Reg and converts it to float.
favDef_rushYards_L2_1 = float(favPrevGM2Reg[1])
# Python 2 print statement; the trailing comma keeps the next print on the same line.
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards
rush = 'Net Yards Rushing'
for row in favPrevGM2Reg.findAll("tr"):
if rush in row.findNext('td'): #Change Year for every new season
for item in row.findAll("td"):
favPrevGM2Reg.append(item.text)
# Same lookup as the 'A' branch, but reads index 0.
favDef_rushYards_L2_1 = float(favPrevGM2Reg[0])
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
else:
# Any value other than 'A' or 'H' is reported as invalid.
print("***************************************************")
print("NOT A VALID ENTRY - favLastGM !")
print("***************************************************")
您正在寻找的 xpath:
//td[contains(text(),"Net Yards Rushing")]/following-sibling::td
它的作用是先选中起始的 td,这一步你做对了;但你想要的是它的兄弟节点,而不是它父节点的兄弟节点,所以需要把 following-sibling::td 直接加在 td 之后。这将按照在 table 中出现的顺序为您提供 2 个结果。
我没有写出完整的代码,但是下面代码里的 away 和 home 这两行会给出客队和主队的冲球码数。
import urllib2
from lxml import etree
# Box score page to scrape.
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Fetch the page and parse the response into an lxml HTML tree.
response8 = urllib2.urlopen(favPrevGMInfoUrl)
htmlparser8 = etree.HTMLParser()
tree8 = etree.parse(response8,htmlparser8)
# Select the text of the value cells inside the row whose data-category is
# "rushing_yards": one query for the cell classed "stat-value away", one for
# "stat-value home". Each xpath() call returns a list of text nodes, so an
# empty list means the row or cell was not found.
away = tree8.xpath('//tr[@data-category="rushing_yards"]//td[@class="stat-value away"]/text()')
home = tree8.xpath('//tr[@data-category="rushing_yards"]//td[@class="stat-value home"]/text()')
我已经尝试了两种不同的方法,但都无法正常工作。我正在尝试抓取此网页 http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/ 中 "TEAM STATS" 部分的统计信息。我需要的是指定统计类别(例如 "NET YARDS RUSHING")后面的数字。以下是我尝试过但没有成功的方法。
第一种方式:
import pickle
import math
import os
import urllib2
from lxml import etree
from bs4 import BeautifulSoup
from urllib import urlopen
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell
#Last Two Game info Home [H] or Away [A]
# First attempt: parse the box score page with lxml and look up the
# "Net Yards Rushing" row with an XPath query.
# NOTE(review): indentation was lost in this listing; the nesting of the
# if/elif/else bodies below has to be inferred.
favLastGM = 'H' #Higher week number 2
favLastGM2 = 'A' #Lower week number 1
#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week)
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Fetch the page and parse the response into an lxml HTML tree.
response8 = urllib2.urlopen(favPrevGMInfoUrl)
htmlparser8 = etree.HTMLParser()
tree8 = etree.parse(response8,htmlparser8)
#FAVORITE
if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards
# NOTE(review): `parent::td` only matches when the label cell's parent is
# itself a <td>; if the parent is a <tr> this query returns an empty list.
text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[1]/text()')
# Convert the first text node to an int only when the query matched something.
if text:
favDef_rushYards_L2_1 = int(text[0].strip())
# Python 2 print statement; the trailing comma keeps the next print on the same line.
print("test"),
print favDef_rushYards_L2_1
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards
# NOTE(review): XPath positions start at 1, so the predicate `[0]` never
# matches; combined with `parent::td` this query presumably returns nothing.
text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[0]/text()')
if text:
favDef_rushYards_L2_1 = int(text[0].strip())
print("test"),
print favDef_rushYards_L2_1
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
else:
# Any value other than 'A' or 'H' is reported as invalid.
print("***************************************************")
print("NOT A VALID ENTRY - favLastGM !")
print("***************************************************")
第二种方式:
import pickle
import math
import os
import urllib2
from lxml import etree
from bs4 import BeautifulSoup
from urllib import urlopen
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Font, Border
from openpyxl.styles import colors
from openpyxl.cell import Cell
# Second attempt: parse the page with BeautifulSoup and walk the rows of
# the "team-stats" table looking for the "Net Yards Rushing" row.
# NOTE(review): indentation was lost in this listing; the nesting of the
# loops and branches below has to be inferred.
#Last Two Game info Home [H] or Away [A]
favLastGM = 'H' #Higher week number 2
favLastGM2 = 'A' #Lower week number 1
#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week)
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Download the raw HTML and build the soup.
favPrevGMhtml2 = urlopen(favPrevGMInfoUrl).read()
favPrevGMsoup2 = BeautifulSoup(favPrevGMhtml2)
# First <table> carrying class "team-stats".
# NOTE(review): find() yields None when no such table exists, which would
# make the findAll calls below fail - confirm the class name against the page.
favPrevGM2Reg = favPrevGMsoup2.find("table", { "class" : "team-stats" })
# NOTE(review): this list is never used below; the loops append to
# favPrevGM2Reg (the table) instead - presumably favPrevGM2Reg2 was intended.
favPrevGM2Reg2 = []
if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards
rush = 'Net Yards Rushing'
for row in favPrevGM2Reg.findAll("tr"):
# NOTE(review): `in` is applied to the element returned by findNext('td'),
# not to its text - verify this matches the label as intended.
if rush in row.findNext('td'): #Change Year for every new season
for item in row.findAll("td"):
favPrevGM2Reg.append(item.text)
# Reads index 1 of favPrevGM2Reg and converts it to float.
favDef_rushYards_L2_1 = float(favPrevGM2Reg[1])
# Python 2 print statement; the trailing comma keeps the next print on the same line.
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards
rush = 'Net Yards Rushing'
for row in favPrevGM2Reg.findAll("tr"):
if rush in row.findNext('td'): #Change Year for every new season
for item in row.findAll("td"):
favPrevGM2Reg.append(item.text)
# Same lookup as the 'A' branch, but reads index 0.
favDef_rushYards_L2_1 = float(favPrevGM2Reg[0])
print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "),
print favDef_rushYards_L2_1
else:
# Any value other than 'A' or 'H' is reported as invalid.
print("***************************************************")
print("NOT A VALID ENTRY - favLastGM !")
print("***************************************************")
您正在寻找的 xpath:
//td[contains(text(),"Net Yards Rushing")]/following-sibling::td
它的作用是先选中起始的 td,这一步你做对了;但你想要的是它的兄弟节点,而不是它父节点的兄弟节点,所以需要把 following-sibling::td 直接加在 td 之后。这将按照在 table 中出现的顺序为您提供 2 个结果。
我没有写出完整的代码,但是下面代码里的 away 和 home 这两行会给出客队和主队的冲球码数。
import urllib2
from lxml import etree
# Box score page to scrape.
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/NFL_20161120_JAC@DET/'
# Fetch the page and parse the response into an lxml HTML tree.
response8 = urllib2.urlopen(favPrevGMInfoUrl)
htmlparser8 = etree.HTMLParser()
tree8 = etree.parse(response8,htmlparser8)
# Select the text of the value cells inside the row whose data-category is
# "rushing_yards": one query for the cell classed "stat-value away", one for
# "stat-value home". Each xpath() call returns a list of text nodes, so an
# empty list means the row or cell was not found.
away = tree8.xpath('//tr[@data-category="rushing_yards"]//td[@class="stat-value away"]/text()')
home = tree8.xpath('//tr[@data-category="rushing_yards"]//td[@class="stat-value home"]/text()')