remove the unicode from the output of JSON using scrapy
import scrapy
from ex.items import ExItem

class reddit(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = ["http://www.reddit.com/"]
    """docstring for reddit"""

    def parse(self, response):
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
        return item
I can convert this into JSON, but in the output I get the u'' markers on the items in my JSON. How do I remove them and still keep the JSON format?
There are hidden elements on the page that you cannot see in the browser, but Scrapy does see them. You just need to restrict the search to the relevant part of the page (the div with id="siteTable"):
def parse(self, response):
    # make a selector and search the fields inside it
    sel = response.xpath('//div[@id="siteTable"]')

    item = ExItem()
    item["title"] = sel.xpath('.//p[contains(@class,"title")]/a/text()').extract()
    item["rank"] = sel.xpath('.//span[contains(@class,"rank")]/text()').extract()
    item["votes_dislike"] = sel.xpath('.//div[contains(@class,"score dislikes")]/text()').extract()
    item["votes_unvoted"] = sel.xpath('.//div[contains(@class,"score unvoted")]/text()').extract()
    item["votes_likes"] = sel.xpath('.//div[contains(@class,"score likes")]/text()').extract()
    item["video_reference"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/@href').extract()
    item["image"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
    return item
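As a side note, getting JSON out of this spider doesn't need any extra code: Scrapy's built-in feed export serializes the returned items for you. Assuming the project layout implied by the import of ex.items above, a typical run would be:

scrapy crawl dmoz -o items.json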
Tested; this is what I get for, e.g., votes_likes:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],
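As for the u'' in the output above: that prefix is only Python 2's console representation of unicode strings, not part of the data, so it never reaches the serialized JSON. A minimal sketch with the standard json module to confirm (the sample values are taken from the output above):

import json

# u'' is just how Python 2 prints unicode strings; json.dumps
# writes the string values themselves, so no prefix appears.
votes_likes = [u'5340', u'4041', u'4080']
print(json.dumps({'votes_likes': votes_likes}))
# -> {"votes_likes": ["5340", "4041", "4080"]}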