替换一系列 Unicode 字符 / Python / Twitter
Replace series of Unicode characters / Python / Twitter
我正在使用 Twitter API 和 Python 3.3 从一条推文中提取文本,我正在 运行 进入推文中高音喇叭放了三个符号的部分推文。它们如下所示。
这两个标志和竖起大拇指似乎是导致问题的原因。以下为纯文字推文
RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
以下是我使用的代码。
import json
import mysql.connector
import sys
from datetime import datetime
from MySQLCL import MySQLCL
class Functions(object):
"""This is a class for Python functions"""
@staticmethod
def Clean(string):
temp = str(string)
temp = temp.replace("'", "").replace("(", "").replace(")", "").replace(",", "").strip()
return temp
@staticmethod
def ParseTweet(string):
for x in range(0, len(string)):
tweetid = string[x]["id_str"]
tweetcreated = string[x]["created_at"]
tweettext = string[x]["text"]
tweetsource = string[x]["source"]
tweetsource = tweetsource
truncated = string[x]["truncated"]
inreplytostatusid = string[x]["in_reply_to_status_id"]
inreplytouserid = string[x]["in_reply_to_user_id"]
inreplytoscreenname = string[x]["in_reply_to_screen_name"]
geo = string[x]["geo"]
coordinates = string[x]["coordinates"]
place = string[x]["place"]
contributors = string[x]["contributors"]
isquotestatus = string[x]["is_quote_status"]
retweetcount = string[x]["retweet_count"]
favoritecount = string[x]["favorite_count"]
favorited = string[x]["favorited"]
retweeted = string[x]["retweeted"]
if "possibly_sensitive" in string[x]:
possiblysensitive = string[x]["possibly_sensitive"]
else:
possiblysensitive = ""
language = string[x]["lang"]
#print(possiblysensitive)
print(Functions.UnicodeFilter(tweettext))
#print(inreplytouserid)
#print("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + Functions.UnicodeFilter(tweettext) + "', " + str(truncated) + ", " + Functions.CheckNull(inreplytostatusid) + ", " + Functions.CheckNull(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + str(language) + "', '" + Functions.ToSQL(tweetcreated) + "', '" + Functions.ToSQL(tweetsource) + "', " + str(possiblysensitive) + ")")
#MySQLCL.Set("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + tweettext + "', " + str(truncated) + ", " + Functions.CheckNullNum(inreplytostatusid) + ", " + Functions.CheckNullNum(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + language + "', '" + str(Functions.FormatDate(tweetcreated)) + "', '" + str(Functions.UnicodeFilter(tweetsource)) + "', " + str(possiblysensitive) + ")")
@staticmethod
def ToBool(variable):
if variable.lower() == 'true':
return True
elif variable.lower() == 'false':
return False
@staticmethod
def CheckNullNum(var):
if var == None:
return "0"
else:
return str(var)
@staticmethod
def CheckNull(var):
if var == None:
return ""
else:
return var
@staticmethod
def ToSQL(var):
temp = var
temp = temp.replace("'", "")
return str(temp)
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.replace(chr(0x2019), "")
temp = temp.replace(chr(0x003c), "(lessthan)")
temp = temp.replace(chr(0x003e), "(greaterthan)")
temp = temp.replace(chr(0xd83c), "")
temp = temp.replace(chr(0xddfa), "")
temp = temp.replace(chr(0xddf8), "")
temp = temp.replace(chr(0xd83d), "")
temp = temp.replace(chr(0xdc4d), "")
temp = Functions.ToSQL(temp)
return temp
@staticmethod
def FormatDate(var):
temp = var
dt = datetime.strptime(temp, "%a %b %d %H:%M:%S %z %Y")
retdt = str(dt.year) + "-" + str(dt.month) + "-" + str(dt.day) + "T" + str(dt.hour) + ":" + str(dt.minute) + ":" + str(dt.second)
return retdt
如您所见,我一直在使用函数 UnicodeFilter 来尝试过滤掉十六进制的 unicode 字符。该函数在处理单个 unicode 字符时有效,但当遇到多个 unicode 字符放在一起时,此方法失败并给出以下错误:
'charmap' codec can't encode characters in position 107-111: character maps to 'undefined'
你们对如何解决这个问题有什么想法吗?
更新:我已经尝试了 解决方案,但我仍然 运行 遇到同样的问题。但是,我决定查看是否有任何特定字符导致了问题,因此我决定逐个字符地将这些字符打印到控制台。这给了我如下错误:
'charmap' codec can't encode character '\U0001f1fa' in position 0: character maps to 'undefined'
看到这里,我把这个加到UnicodeFilter函数中继续测试。在逐字符打印推文时,我遇到了 运行 多个同类错误。但是,我不想一直做这些例外。例如查看修改后的UnicodeFilter函数:
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.encode(errors='ignore').decode('utf-8')
temp = temp.replace(chr(0x2019), "")
temp = temp.replace(chr(0x003c), "(lessthan)")
temp = temp.replace(chr(0x003e), "(greaterthan)")
temp = temp.replace(chr(0xd83c), "")
temp = temp.replace(chr(0xddfa), "")
temp = temp.replace(chr(0xddf8), "")
temp = temp.replace(chr(0xd83d), "")
temp = temp.replace(chr(0xdc4d), "")
temp = temp.replace(chr(0x2026), "")
temp = temp.replace(u"\U0001F1FA", "")
temp = temp.replace(u"\U0001F1F8", "")
temp = temp.replace(u"\U0001F44D", "")
temp = temp.replace(u"\U00014F18", "")
temp = temp.replace(u"\U0001F418", "")
temp = temp.replace(u"\U0001F918", "")
temp = temp.replace(u"\U0001F3FD", "")
temp = temp.replace(u"\U0001F195", "")
temp = Functions.ToSQL(temp)
return str(temp)
我不想为每个导致问题的字符都添加一个新行。通过这种方法,我已经能够传递多条推文,但是这个问题在每条包含不同符号的推文中再次出现。有没有解决方案可以过滤掉所有这些字符?是否可以过滤掉所有不属于utf-8字符集的字符?
尝试内置的 unicode encode/decode 错误处理功能:str.encode(errors='ignore')
例如:
problem_string = """\
RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
"""
print(problem_string.encode(errors='ignore').decode('utf-8'))
忽略错误会删除有问题的字符。
> RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary
您可能对其他错误处理选项感兴趣。
xmlcharrefreplace
例如会产生:
> RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary ����������
如果您需要 UnicodeFilter
函数暗示的自定义过滤,请参阅 Python documentation on registering an error handler。
Python 提供了有用的堆栈跟踪,因此您可以判断错误的来源。
使用它,您会发现您的 print
导致了异常。
print()
失败是因为您是 Windows 控制台的 运行 Python,默认情况下仅支持本地 8 位字符映射。您可以通过以下方式添加支持:https://github.com/Drekin/win-unicode-console
您也可以直接将数据写入文本文件。使用以下命令打开文件:
open('output.txt', 'w', encoding='utf-8')
找到答案了。问题是推文中有一系列字符导致了问题。一旦我找到了字符的正确 Unicode 范围,我就实现了 for 循环来替换该范围内出现的任何 Unicode 字符。实施之后,我能够在没有任何格式或 MySQL 错误的情况下提取数千条推文。
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.replace(chr(0x2019), "'")
temp = temp.replace(chr(0x2026), "")
for x in range(127381, 129305):
temp = temp.replace(chr(x), "")
temp = MySQLCL.ToSQL(temp)
return str(temp)
我正在使用 Twitter API 和 Python 3.3 从一条推文中提取文本,我正在 运行 进入推文中高音喇叭放了三个符号的部分推文。它们如下所示。
这两个标志和竖起大拇指似乎是导致问题的原因。以下为纯文字推文
RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
以下是我使用的代码。
import json
import mysql.connector
import sys
from datetime import datetime
from MySQLCL import MySQLCL
class Functions(object):
"""This is a class for Python functions"""
@staticmethod
def Clean(string):
temp = str(string)
temp = temp.replace("'", "").replace("(", "").replace(")", "").replace(",", "").strip()
return temp
@staticmethod
def ParseTweet(string):
for x in range(0, len(string)):
tweetid = string[x]["id_str"]
tweetcreated = string[x]["created_at"]
tweettext = string[x]["text"]
tweetsource = string[x]["source"]
tweetsource = tweetsource
truncated = string[x]["truncated"]
inreplytostatusid = string[x]["in_reply_to_status_id"]
inreplytouserid = string[x]["in_reply_to_user_id"]
inreplytoscreenname = string[x]["in_reply_to_screen_name"]
geo = string[x]["geo"]
coordinates = string[x]["coordinates"]
place = string[x]["place"]
contributors = string[x]["contributors"]
isquotestatus = string[x]["is_quote_status"]
retweetcount = string[x]["retweet_count"]
favoritecount = string[x]["favorite_count"]
favorited = string[x]["favorited"]
retweeted = string[x]["retweeted"]
if "possibly_sensitive" in string[x]:
possiblysensitive = string[x]["possibly_sensitive"]
else:
possiblysensitive = ""
language = string[x]["lang"]
#print(possiblysensitive)
print(Functions.UnicodeFilter(tweettext))
#print(inreplytouserid)
#print("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + Functions.UnicodeFilter(tweettext) + "', " + str(truncated) + ", " + Functions.CheckNull(inreplytostatusid) + ", " + Functions.CheckNull(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + str(language) + "', '" + Functions.ToSQL(tweetcreated) + "', '" + Functions.ToSQL(tweetsource) + "', " + str(possiblysensitive) + ")")
#MySQLCL.Set("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + tweettext + "', " + str(truncated) + ", " + Functions.CheckNullNum(inreplytostatusid) + ", " + Functions.CheckNullNum(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + language + "', '" + str(Functions.FormatDate(tweetcreated)) + "', '" + str(Functions.UnicodeFilter(tweetsource)) + "', " + str(possiblysensitive) + ")")
@staticmethod
def ToBool(variable):
if variable.lower() == 'true':
return True
elif variable.lower() == 'false':
return False
@staticmethod
def CheckNullNum(var):
if var == None:
return "0"
else:
return str(var)
@staticmethod
def CheckNull(var):
if var == None:
return ""
else:
return var
@staticmethod
def ToSQL(var):
temp = var
temp = temp.replace("'", "")
return str(temp)
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.replace(chr(0x2019), "")
temp = temp.replace(chr(0x003c), "(lessthan)")
temp = temp.replace(chr(0x003e), "(greaterthan)")
temp = temp.replace(chr(0xd83c), "")
temp = temp.replace(chr(0xddfa), "")
temp = temp.replace(chr(0xddf8), "")
temp = temp.replace(chr(0xd83d), "")
temp = temp.replace(chr(0xdc4d), "")
temp = Functions.ToSQL(temp)
return temp
@staticmethod
def FormatDate(var):
temp = var
dt = datetime.strptime(temp, "%a %b %d %H:%M:%S %z %Y")
retdt = str(dt.year) + "-" + str(dt.month) + "-" + str(dt.day) + "T" + str(dt.hour) + ":" + str(dt.minute) + ":" + str(dt.second)
return retdt
如您所见,我一直在使用函数 UnicodeFilter 来尝试过滤掉十六进制的 unicode 字符。该函数在处理单个 unicode 字符时有效,但当遇到多个 unicode 字符放在一起时,此方法失败并给出以下错误:
'charmap' codec can't encode characters in position 107-111: character maps to 'undefined'
你们对如何解决这个问题有什么想法吗?
更新:我已经尝试了
'charmap' codec can't encode character '\U0001f1fa' in position 0: character maps to 'undefined'
看到这里,我把这个加到UnicodeFilter函数中继续测试。在逐字符打印推文时,我遇到了 运行 多个同类错误。但是,我不想一直做这些例外。例如查看修改后的UnicodeFilter函数:
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.encode(errors='ignore').decode('utf-8')
temp = temp.replace(chr(0x2019), "")
temp = temp.replace(chr(0x003c), "(lessthan)")
temp = temp.replace(chr(0x003e), "(greaterthan)")
temp = temp.replace(chr(0xd83c), "")
temp = temp.replace(chr(0xddfa), "")
temp = temp.replace(chr(0xddf8), "")
temp = temp.replace(chr(0xd83d), "")
temp = temp.replace(chr(0xdc4d), "")
temp = temp.replace(chr(0x2026), "")
temp = temp.replace(u"\U0001F1FA", "")
temp = temp.replace(u"\U0001F1F8", "")
temp = temp.replace(u"\U0001F44D", "")
temp = temp.replace(u"\U00014F18", "")
temp = temp.replace(u"\U0001F418", "")
temp = temp.replace(u"\U0001F918", "")
temp = temp.replace(u"\U0001F3FD", "")
temp = temp.replace(u"\U0001F195", "")
temp = Functions.ToSQL(temp)
return str(temp)
我不想为每个导致问题的字符都添加一个新行。通过这种方法,我已经能够传递多条推文,但是这个问题在每条包含不同符号的推文中再次出现。有没有解决方案可以过滤掉所有这些字符?是否可以过滤掉所有不属于utf-8字符集的字符?
尝试内置的 unicode encode/decode 错误处理功能:str.encode(errors='ignore')
例如:
problem_string = """\
RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
"""
print(problem_string.encode(errors='ignore').decode('utf-8'))
忽略错误会删除有问题的字符。
> RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary
您可能对其他错误处理选项感兴趣。
xmlcharrefreplace
例如会产生:
> RT @John_Hunt07: Just voted for @marcorubio is Florida! I am ready for a New American Century!! #FLPrimary ����������
如果您需要 UnicodeFilter
函数暗示的自定义过滤,请参阅 Python documentation on registering an error handler。
Python 提供了有用的堆栈跟踪,因此您可以判断错误的来源。
使用它,您会发现您的 print
导致了异常。
print()
失败是因为您是 Windows 控制台的 运行 Python,默认情况下仅支持本地 8 位字符映射。您可以通过以下方式添加支持:https://github.com/Drekin/win-unicode-console
您也可以直接将数据写入文本文件。使用以下命令打开文件:
open('output.txt', 'w', encoding='utf-8')
找到答案了。问题是推文中有一系列字符导致了问题。一旦我找到了字符的正确 Unicode 范围,我就实现了 for 循环来替换该范围内出现的任何 Unicode 字符。实施之后,我能够在没有任何格式或 MySQL 错误的情况下提取数千条推文。
@staticmethod
def UnicodeFilter(var):
temp = var
temp = temp.replace(chr(0x2019), "'")
temp = temp.replace(chr(0x2026), "")
for x in range(127381, 129305):
temp = temp.replace(chr(x), "")
temp = MySQLCL.ToSQL(temp)
return str(temp)