Can't get a correct response when the HTML has emoji, using Scrapy (parsing a Baidu Tieba post's double-deck comments (lzl))
I am a beginner with Scrapy.
While parsing Baidu Tieba posts, I found that if a post's double-deck comments (Chinese name 'Lou Zhong Lou' (楼中楼), abbreviated 'lzl', which I will use below) contain emoji, Scrapy does not return a correct response to me.
Here is my core code:
# coding=utf-8
# filename: tieba_post_spider.py
# path: D:\WORK\PythonProject\ScrapyLearn\ScrapyTest\tutorial\tutorial\spiders\tieba_post_spider.py
import scrapy
from bs4 import BeautifulSoup


class TiebaPostSpider(scrapy.spiders.Spider):
    name = "tiebaPost"
    allowed_domains = ["tieba.baidu.com"]
    start_urls = [
        # 1. Has lzl, no emoji
        # "https://tieba.baidu.com/p/comment?tid=3886007864&pid=71342182567&pn=3"
        # 2. Has lzl, has emoji
        "https://tieba.baidu.com/p/comment?tid=5301206923&pid=111389280437&pn=1"
        # 3. No lzl
        # "https://tieba.baidu.com/p/comment?tid=5301206923&pid=111390028140&pn=1"
    ]
    # I tried to change the headers to fix it, but I failed :(
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/48.0.2564.116 Safari/537.36',
        }
    }

    # Parse the lzl: if there is none, print nothing; otherwise print each
    # username and his/her words
    def parse(self, response):
        # Use BeautifulSoup's CSS selectors to get the tags
        soup = BeautifulSoup(response.body, "lxml")
        # Collect all users' words on this page and print them
        lzl_content = soup.select("li[class^='lzl_single_post j_lzl_s_p']")
        if len(lzl_content) != 0:
            for single_post in lzl_content:
                content = single_post.select("div.lzl_cnt")[0]
                username = content.select("a")[0].attrs['username']
                words = content.select("span")[0].get_text()
                print username + u": " + words
        # If the lzl has a next page, request and parse it as well
        lzl_next = soup.select("li.lzl_li_pager.j_lzl_l_p.lzl_li_pager_s p a")
        if len(lzl_next) != 0:
            for h in lzl_next:
                href = h.attrs['href']
                text = h.get_text().encode('GB18030')
                # '下一页' means 'next page'; both sides are encoded to GB18030 first
                if text == u'下一页'.encode('GB18030'):
                    index = response.url.find("&pn=")
                    next_url = response.url[:index + 4] + href[1:]
                    yield scrapy.Request(next_url, callback=self.parse)
As you can see, there are 3 URLs to parse, and the results are as follows:
- Has lzl, no emoji: parsed successfully; all users' words are displayed in cmd.
- Has lzl, has emoji: cannot be parsed, because an error occurs (observed by adding `print soup` to the parse function).
- No lzl: nothing is printed and the spider finishes without any error.
PS: If an emoji appears in the post itself rather than in the lzl, it is parsed as "??" in the response (one way that can happen is sketched below).
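One plausible explanation for the "??": Tieba pages have historically been served in a GBK-family encoding, which has no code points for emoji, so a lossy encode replaces each UTF-16 code unit of the emoji with '?'. A minimal sketch under that assumption (an illustration, not a confirmed diagnosis):

# coding=utf-8
# Python 2, narrow build: one emoji is stored as two UTF-16 surrogate
# code units, and neither of them can be represented in GBK.
smile = u"\ud83d\ude04"                # the emoji U+1F604 as a surrogate pair
print smile.encode('gbk', 'replace')   # each unencodable unit becomes '?' -> '??'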
So why is a wrong response returned when Scrapy encounters emoji in the lzl?
Any advice would be appreciated.
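Update: to narrow things down, here is a debugging sketch that could be dropped into parse() to see which encoding Scrapy inferred for the response before BeautifulSoup re-decodes the raw bytes (these print lines are hypothetical additions, not part of the spider above):

    def parse(self, response):
        # Compare the server-declared charset with the encoding Scrapy
        # actually used to decode response.body
        print response.url
        print response.headers.get('Content-Type')  # e.g. text/html; charset=...
        print response.encoding                     # the encoding Scrapy settled on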
First of all, you shouldn't mix Scrapy and BS4. There is no need to, and you will find yourself in a mess like the one you are in now. Below is a corrected version of your code in pure Scrapy. This works fine:
import re

import scrapy


class TiebaPostSpider(scrapy.spiders.Spider):
    name = "tiebaPost"
    allowed_domains = ["tieba.baidu.com"]
    start_urls = [
        # 1. Has lzl, no emoji
        "https://tieba.baidu.com/p/comment?tid=3886007864&pid=71342182567&pn=3",
        # 2. Has lzl, has emoji
        "https://tieba.baidu.com/p/comment?tid=5301206923&pid=111389280437&pn=1",
        # 3. No lzl
        "https://tieba.baidu.com/p/comment?tid=5301206923&pid=111390028140&pn=1",
    ]
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/48.0.2564.116 Safari/537.36',
        }
    }

    # Parse the lzl: if there is none, print nothing; otherwise print each
    # username and his/her words
    def parse(self, response):
        # Emoji are stored as UTF-16 surrogate pairs on Python 2 narrow
        # builds, so the pattern matches the pairs directly
        emoji_pattern = re.compile(
            u"(\ud83d[\ude00-\ude4f])|"  # emoticons
            u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
            u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
            u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
            u"(\ud83c[\udde0-\uddff])"   # flags (iOS)
            "+", flags=re.UNICODE)
        # Collect all users' words on this page and print them
        lzl_content = response.css("li[class^='lzl_single_post j_lzl_s_p']")
        if len(lzl_content) != 0:
            for single_post in lzl_content:
                content = single_post.css("div.lzl_cnt")
                username = content.css("a::attr(username)").extract_first().strip()
                words = "".join(content.css("span::text").extract())
                # Escape emoji so printing doesn't blow up on consoles that
                # cannot render them
                if emoji_pattern.search(words):
                    words = words.encode('unicode-escape')
                print username + ":" + words
        # If the lzl has a next page, request and parse it as well
        lzl_next = response.css("li.lzl_li_pager.j_lzl_l_p.lzl_li_pager_s p a")
        if len(lzl_next) != 0:
            for h in lzl_next:
                href = h.xpath("@href").extract_first().strip()
                text = h.xpath("./text()").extract_first().strip()
                if text == u'下一页':  # '下一页' means 'next page'
                    index = response.url.find("&pn=")
                    next_url = response.url[:index + 4] + href[1:]
                    yield scrapy.Request(next_url, callback=self.parse)
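A note on the emoji handling above: on a Python 2 narrow build, non-BMP characters such as emoji are stored as UTF-16 surrogate pairs, which is why emoji_pattern matches a lead surrogate such as \ud83d followed by a trail-surrogate range. A quick sanity check (the sample string is a made-up example):

# coding=utf-8
import re

# The first alternative from emoji_pattern above
emoticons = re.compile(u"(\ud83d[\ude00-\ude4f])", flags=re.UNICODE)

sample = u"hello \ud83d\ude00"               # u"\U0001f600" on a narrow build
print emoticons.search(sample) is not None   # True
print sample.encode('unicode-escape')        # hello \ud83d\ude00

With this escaping in place, running `scrapy crawl tiebaPost` prints emoji-bearing comments in their unicode-escape form instead of tripping over the console encoding.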