Converting scrapy to lxml
I have scrapy code similar to this:
for row in response.css("div#flexBox_flex_calendar_mainCal table tr.calendar_row"):
    print "================"
    print row.xpath(".//td[@class='time']/text()").extract()
    print row.xpath(".//td[@class='currency']/text()").extract()
    print row.xpath(".//td[@class='impact']/span/@title").extract()
    print row.xpath(".//td[@class='event']/span/text()").extract()
    print row.xpath(".//td[@class='actual']/text()").extract()
    print row.xpath(".//td[@class='forecast']/text()").extract()
    print row.xpath(".//td[@class='previous']/text()").extract()
    print "================"
I can get the same thing with plain Python like this:
from lxml import html
import requests
page = requests.get('http://www.forexfactory.com/calendar.php?day=dec1.2011')
tree = html.fromstring(page.text)
print tree.xpath(".//td[@class='time']/text()")
print tree.xpath(".//td[@class='currency']/text()")
print tree.xpath(".//td[@class='impact']/span/@title")
print tree.xpath(".//td[@class='event']/span/text()")
print tree.xpath(".//td[@class='actual']/text()")
print tree.xpath(".//td[@class='forecast']/text()")
print tree.xpath(".//td[@class='previous']/text()")
But I need to do this row by row. My first attempt at porting to lxml was unsuccessful:
from lxml import html
import requests
page = requests.get('http://www.forexfactory.com/calendar.php?day=dec1.2011')
tree = html.fromstring(page.text)
for row in tree.css("div#flexBox_flex_calendar_mainCal table tr.calendar_row"):
    print row.xpath(".//td[@class='time']/text()")
    print row.xpath(".//td[@class='currency']/text()")
    print row.xpath(".//td[@class='impact']/span/@title")
    print row.xpath(".//td[@class='event']/span/text()")
    print row.xpath(".//td[@class='actual']/text()")
    print row.xpath(".//td[@class='forecast']/text()")
    print row.xpath(".//td[@class='previous']/text()")
What is the correct way to port this scrapy code to pure lxml?
Edit: I've gotten a little closer. I can see a table{} object; I just don't know how to walk it.
import urllib2
from lxml import etree
#import requests
def wgetUrl(target):
    try:
        req = urllib2.Request(target)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3')
        response = urllib2.urlopen(req)
        outtxt = response.read()
        response.close()
    except:
        return ''
    return outtxt
url = 'http://www.forexfactory.com/calendar.php?day='
date = 'dec1.2011'
data = wgetUrl(url + date)
parser = etree.HTMLParser()
tree = etree.fromstring(data, parser)
for elem in tree.xpath("//div[@id='flexBox_flex_calendar_mainCal']"):
    print elem[0].tag, elem[0].attrib, elem[0].text
    # elem[1] is where the table is
    print elem[1].tag, elem[1].attrib, elem[1].text
    print elem[1]
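lxml elements act as sequences of their children, so the table found above can at least be enumerated. A minimal sketch (elem[1] is the table, per the comment in the code above):
table = elem[1]
# list the table's direct children (its rows) and their classes
for child in table.iterchildren():
    print child.tag, child.attrib.get('class')
# or visit every descendant <tr>, however deeply nested
for tr in table.iter('tr'):
    print tr.attrib.get('class')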
I like to use lxml for scraping. I usually don't use its xpath functionality, though, opting instead for its ElementPath library, which is syntactically very similar. Here is how I would port your scrapy code.
Line by line:
Initialization:
from lxml import etree

# analogous to xpath(".../text()").extract() for lxml etree nodes
def extract_text(elem):
    if elem is None:
        return None
    else:
        return ''.join(i for i in elem.itertext())

data = wgetUrl(url + date)  # wgetUrl, url, date as you defined them in your question
tree = etree.HTML(data)
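To see what extract_text produces, here is a quick sketch on a throwaway fragment (the HTML snippet is invented purely for illustration):
# minimal check of extract_text; the fragment below is made up
demo = etree.HTML('<table><tr><td class="time">2:30<span>am</span></td></tr></table>')
print extract_text(demo.find(r'.//td[@class="time"]'))   # prints 2:30am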
Line 1
# original
for row in response.css("div#flexBox_flex_calendar_mainCal table tr.calendar_row"):
# ported
for row in tree.findall(r'.//div[@id="flexBox_flex_calendar_mainCal"]//table/tr[@class="calendar_row"]'):
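As an aside, the css() call from your failed attempt does not exist on lxml elements; the closest equivalent is cssselect(). A minimal sketch, assuming the cssselect package is available:
# same row selection via a CSS selector; requires cssselect
from lxml import html
doc = html.fromstring(data)
for row in doc.cssselect("div#flexBox_flex_calendar_mainCal table tr.calendar_row"):
    print extract_text(row.find(r'.//td[@class="time"]'))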
Line 2
print "================"
Line 3
# original
print row.xpath(".//td[@class='time']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="time"]'))
Line 4
# original
print row.xpath(".//td[@class='currency']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="currency"]'))
Line 5
# original
print row.xpath(".//td[@class='impact']/span/@title").extract()
# ported
td = row.find(r'.//td[@class="impact"]/span')
if td is not None and 'title' in td.attrib:
    print td.attrib['title']
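Equivalently, Element.get() can replace the attrib lookup; it returns None when the attribute is absent:
# same check via Element.get(); get() yields None for a missing attribute
span = row.find(r'.//td[@class="impact"]/span')
if span is not None and span.get('title') is not None:
    print span.get('title')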
Line 6
# original
print row.xpath(".//td[@class='event']/span/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="event"]/span'))
Line 7
# original
print row.xpath(".//td[@class='actual']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="actual"]'))
Line 8
# original
print row.xpath(".//td[@class='forecast']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="forecast"]'))
Line 9
# original
print row.xpath(".//td[@class='previous']/text()").extract()
# ported
print extract_text(row.find(r'.//td[@class="previous"]'))
Line 10
print "================"
Now all together:
from lxml import etree

def wgetUrl(target):
    # same as you defined it (body omitted here)
    pass

# analogous to xpath(".../text()").extract() for lxml etree nodes
def extract_text(elem):
    if elem is None:
        return None
    else:
        return ''.join(i for i in elem.itertext())

content = wgetUrl(your_url)  # wgetUrl as the function you defined in your question
node = etree.HTML(content)

for row in node.findall(r'.//div[@id="flexBox_flex_calendar_mainCal"]//table/tr[@class="calendar_row"]'):
    print "================"
    print extract_text(row.find(r'.//td[@class="time"]'))
    print extract_text(row.find(r'.//td[@class="currency"]'))
    td = row.find(r'.//td[@class="impact"]/span')
    if td is not None and 'title' in td.attrib:
        print td.attrib['title']
    print extract_text(row.find(r'.//td[@class="event"]/span'))
    print extract_text(row.find(r'.//td[@class="actual"]'))
    print extract_text(row.find(r'.//td[@class="forecast"]'))
    print extract_text(row.find(r'.//td[@class="previous"]'))
    print "================"