Extract content from blog
import scrapy
from scrapy.http import Request


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['https://davestruestories.medium.com']

    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    # custom settings
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1
    }

    def parse(self, response):
        links = response.xpath("//h1/a/@href").extract()
        for link in links:
            url = response.urljoin(link)
            yield Request(url, callback=self.parse_book, headers=self.headers)

    def parse_book(self, response):
        title = response.xpath("//h1/text()").get()
        content = response.xpath("//section/text()").getall()
        yield {
            'title': title,
            'article': content
        }
I want to extract the article content from this blog, but my spider does not return it. Here is one of the pages: https://davestruestories.medium.com/?p=169d7850744a. As you can see below, this is the content I want to extract.
Your content xpath does not point to the correct element:
content = response.xpath("//section/text()").getall()
Based on my testing, the data is available at the xpath below. It sits in <p> tags that are children of <section>:
content = response.xpath("//section/p/text()").getall()
You can also verify xpaths in the Scrapy shell.
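For example, comparing both xpaths in the shell makes the difference easy to see. A quick sketch, assuming the raw response contains the <section>/<p> markup (the URL is the post from the question):

scrapy shell "https://davestruestories.medium.com/?p=169d7850744a"
>>> response.xpath("//section/text()").getall()    # little or no article text
>>> response.xpath("//section/p/text()").getall()  # the paragraph text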
If you look at the HTML you actually get back in the response, you will find that this information (the links) is empty. You need to make some changes:
import json

....

    def parse(self, response):
        raw_data = response.xpath('//script[contains(text(),"__APOLLO_STATE__")]/text()').get()
        data = json.loads(raw_data[raw_data.index("{"):])
        links = [data[el].get("mediumUrl") for el in data if "mediumUrl" in data[el].keys()]
        for link in links:
            url = response.urljoin(link)
            yield Request(url, callback=self.parse_book, headers=self.headers)

....
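The raw_data.index("{") slice strips the JavaScript assignment prefix so that only the JSON object remains for json.loads. A minimal illustration with a made-up, heavily simplified payload (the real __APOLLO_STATE__ object is much larger):

import json

# hypothetical, simplified version of what the <script> tag contains
raw_data = 'window.__APOLLO_STATE__ = {"Post:abc": {"mediumUrl": "https://davestruestories.medium.com/example"}}'

data = json.loads(raw_data[raw_data.index("{"):])
links = [v["mediumUrl"] for v in data.values() if "mediumUrl" in v]
print(links)  # ['https://davestruestories.medium.com/example']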
It looks like your xpath is wrong:
import scrapy


class PushpaSpider(scrapy.Spider):
    name = 'pushpa'
    start_urls = ['https://davestruestories.medium.com']

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        links = response.xpath("//h1/a/@href").getall()
        for link in links:
            yield response.follow(link, callback=self.parse_book, headers=response.request.headers)

    def parse_book(self, response):
        title = response.xpath("//h1/text()").get()
        content = response.xpath("//section//p/text()").getall()
        # if you want a string instead of a list:
        # content = ''.join(content)
        # test which of these works better:
        # date = response.xpath('//div[section]//p/span/text()').get()
        date = response.xpath('//div//p/span/text()').get()
        yield {
            'title': title,
            'date': date,
            'article': content
        }
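Since this spider is self-contained, you can run it directly and export the items, e.g. (the file name here is hypothetical):

scrapy runspider pushpa_spider.py -o articles.json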
You can also add the user-agent to the settings (as above), or create a start_requests method so that the user-agent is applied to the very first request as well.
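A minimal sketch of the start_requests variant, reusing the same user-agent string from the question:

    def start_requests(self):
        # send the first requests with an explicit user-agent header
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'},
                callback=self.parse,
            )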