Scrapy:使用管道替换不需要的非 ASCII 字符
Scrapy: use pipeline to replace unwanted non-ASCII code
根据 Scrapy 的抓取结果,标题中出现了一个不需要的非 ASCII 字符 \u2013(又名 character(150) 或 en dash),例如 u'Director/Senior Director \u2013 Pathology'。
我正在尝试使用管道把 \u2013 替换为普通的逗号 ","。
但是下面的代码不起作用,也没有报错。
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
    """Replace unwanted characters in an item's title with a comma.

    Every string listed in ``ascii_to_filter`` that occurs in
    ``item['title']`` is replaced by ``","``.
    """

    # Substrings to replace. NOTE: on Python 2 a plain "\u2013" literal is a
    # six-character byte string, not the en dash, so it never matches a
    # unicode title; it needs the ``u`` prefix there. On Python 3 the
    # literal already denotes the en dash.
    ascii_to_filter = ["\u2013",]

    def process_item(self, item, spider):
        """Replace each filtered string found in the title; return the item.

        Items whose title is missing, ``None`` or empty pass through
        unchanged. ``spider.log`` is called once for every filter string
        that was found, with the title as it was before that replacement.
        """
        title = item.get('title')
        if not title:
            return item
        # Check every filter string; returning inside the loop would stop
        # after the first entry and yield None for an empty filter list.
        for word in self.ascii_to_filter:
            if word in title:
                spider.log("%s in '%s' was replace" % (word, title))
                # Replace the matched filter entry itself so the list above
                # is the single place that defines what gets replaced.
                title = title.replace(word, ",")
                item['title'] = title
        return item
"\u2013"
应该是 unicode 字符串(Python 2 中需要加 u 前缀),所以只需把:
ascii_to_filter = ["\u2013",]
替换为:
ascii_to_filter = [u"\u2013",]
在阅读了这个 Stack Overflow 帖子 Replace non-ASCII characters... 之后,我想到了这段代码,它会过滤掉标题中的所有非 ASCII 字符。就我的情况而言,不需要任何非 ASCII 字符,因此它非常适合我。
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
    """Strip every non-ASCII character from an item's title."""

    def process_item(self, item, spider):
        """Remove non-ASCII characters from ``item['title']``; return the item.

        An item whose title is missing or ``None`` is returned untouched.
        When at least one character was removed, the cleaned title is
        stored back on the item and reported through ``spider.log``.
        """
        orig_title = item.get('title')
        if orig_title is None:
            # Nothing to clean; iterating over None would raise TypeError.
            return item
        # Keep only code points below 128, i.e. the ASCII range.
        cleaned = ''.join(ch for ch in orig_title if ord(ch) < 128)
        if cleaned != orig_title:
            item['title'] = cleaned
            spider.log("Non-ASCII character(s) was removed in '%s'" % (cleaned))
        return item
根据 Scrapy 的抓取结果,标题中出现了一个不需要的非 ASCII 字符 \u2013(又名 character(150) 或 en dash),例如 u'Director/Senior Director \u2013 Pathology'。
我正在尝试使用管道把 \u2013 替换为普通的逗号 ","。
但是下面的代码不起作用,也没有报错。
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
    """Replace unwanted characters in an item's title with a comma.

    Every string listed in ``ascii_to_filter`` that occurs in
    ``item['title']`` is replaced by ``","``.
    """

    # Substrings to replace. NOTE: on Python 2 a plain "\u2013" literal is a
    # six-character byte string, not the en dash, so it never matches a
    # unicode title; it needs the ``u`` prefix there. On Python 3 the
    # literal already denotes the en dash.
    ascii_to_filter = ["\u2013",]

    def process_item(self, item, spider):
        """Replace each filtered string found in the title; return the item.

        Items whose title is missing, ``None`` or empty pass through
        unchanged. ``spider.log`` is called once for every filter string
        that was found, with the title as it was before that replacement.
        """
        title = item.get('title')
        if not title:
            return item
        # Check every filter string; returning inside the loop would stop
        # after the first entry and yield None for an empty filter list.
        for word in self.ascii_to_filter:
            if word in title:
                spider.log("%s in '%s' was replace" % (word, title))
                # Replace the matched filter entry itself so the list above
                # is the single place that defines what gets replaced.
                title = title.replace(word, ",")
                item['title'] = title
        return item
"\u2013"
应该是 unicode 字符串(Python 2 中需要加 u 前缀),所以只需把:
ascii_to_filter = ["\u2013",]
替换为:
ascii_to_filter = [u"\u2013",]
在阅读了这个 Stack Overflow 帖子 Replace non-ASCII characters... 之后,我想到了这段代码,它会过滤掉标题中的所有非 ASCII 字符。就我的情况而言,不需要任何非 ASCII 字符,因此它非常适合我。
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
    """Strip every non-ASCII character from an item's title."""

    def process_item(self, item, spider):
        """Remove non-ASCII characters from ``item['title']``; return the item.

        An item whose title is missing or ``None`` is returned untouched.
        When at least one character was removed, the cleaned title is
        stored back on the item and reported through ``spider.log``.
        """
        orig_title = item.get('title')
        if orig_title is None:
            # Nothing to clean; iterating over None would raise TypeError.
            return item
        # Keep only code points below 128, i.e. the ASCII range.
        cleaned = ''.join(ch for ch in orig_title if ord(ch) < 128)
        if cleaned != orig_title:
            item['title'] = cleaned
            spider.log("Non-ASCII character(s) was removed in '%s'" % (cleaned))
        return item