如何删除 python 中的非 Ascii 字符
How to remove nonAscii characters in python
这是我的代码:
#!C:/Python27/python
# -*- coding: utf-8 -*-
# Fetch one article page and save its main content element to a local HTML file.
# Python 2 script: urllib2 and urlparse exist only on Python 2.
import requests
from bs4 import BeautifulSoup
import urllib2
import sys
import urlparse
import io

url = "http://www.dlib.org/dlib/november14/beel/11beel.html"
#url = "http://eqa.unibo.it/article/view/4554"
#r = requests.get(url)
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
#soup = BeautifulSoup(r.text,'lxml')

# dlib.org keeps the article body in a top-aligned <td>;
# the other site uses <div id="content">.
if url.find("http://www.dlib.org") != -1:
    div = soup.find('td', valign='top')
else:
    div = soup.find('div', id='content')

# 'with' guarantees the file is closed even if write() raises
# (the original leaked the handle on any exception before close()).
with open('path/file_name.html', 'w') as f:
    f.write(str(div))
抓取这些网页时,我发现一些非 ASCII 字符进入了此脚本写出的 html 文件中,我需要将其删除或解析为可读字符。
有什么建议吗?谢谢
从文本中删除非 ASCII 字符。
import string

# Keep only ASCII characters (code point < 128).
# The original condition was inverted: "word not in string.ascii_letters"
# KEEPS exactly the non-ASCII characters it was meant to remove (and also
# strips digits, spaces and punctuation). Iterating a string yields single
# characters, so this filters per character and returns a list of them.
text = [ch for ch in text if ord(ch) < 128]
字符为 8 位 (0-255),ASCII 字符为 7 位 (0-127),因此您可以简单地只保留 ord 值低于 128 的字符,删除其余字符
chr 将整数转换为字符,ord 将字符转换为整数。
# Strip every non-ASCII character (ord >= 128) from the rendered HTML.
# The original line was missing the closing ')' — a SyntaxError. join()
# accepts the generator expression directly, so one pair of parens suffices.
text = ''.join(c for c in str(div) if ord(c) < 128)
这应该是您的最终代码
#!C:/Python27/python
# -*- coding: utf-8 -*-
# Fetch one article page, strip non-ASCII characters from the extracted
# content element, and save it to a local HTML file.
# Python 2 script: urllib2 and urlparse exist only on Python 2.
import requests
from bs4 import BeautifulSoup
import urllib2
import sys
import urlparse
import io

url = "http://www.dlib.org/dlib/november14/beel/11beel.html"
#url = "http://eqa.unibo.it/article/view/4554"
#r = requests.get(url)
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
#soup = BeautifulSoup(r.text,'lxml')

# dlib.org keeps the article body in a top-aligned <td>;
# the other site uses <div id="content">.
if url.find("http://www.dlib.org") != -1:
    div = soup.find('td', valign='top')
else:
    div = soup.find('div', id='content')

# 'with' closes the file even on error; the original also had a missing
# closing ')' on the join() line, which made the script a SyntaxError.
with open('path/file_name.html', 'w') as f:
    # Drop every non-ASCII character (ord >= 128) before writing.
    f.write(''.join(c for c in str(div) if ord(c) < 128))
尝试先规范化字符串,然后以 ASCII 编码并忽略无法编码的字符。
# -*- coding: utf-8 -*-
from unicodedata import normalize
string = 'úäô§'
if isinstance(string, str):
string = string.decode('utf-8')
print normalize('NFKD', string).encode('ASCII', 'ignore')
>>> uao
这是我的代码:
#!C:/Python27/python
# -*- coding: utf-8 -*-
# Fetch one article page and save its main content element to a local HTML file.
# Python 2 script: urllib2 and urlparse exist only on Python 2.
import requests
from bs4 import BeautifulSoup
import urllib2
import sys
import urlparse
import io

url = "http://www.dlib.org/dlib/november14/beel/11beel.html"
#url = "http://eqa.unibo.it/article/view/4554"
#r = requests.get(url)
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
#soup = BeautifulSoup(r.text,'lxml')

# dlib.org keeps the article body in a top-aligned <td>;
# the other site uses <div id="content">.
if url.find("http://www.dlib.org") != -1:
    div = soup.find('td', valign='top')
else:
    div = soup.find('div', id='content')

# 'with' guarantees the file is closed even if write() raises
# (the original leaked the handle on any exception before close()).
with open('path/file_name.html', 'w') as f:
    f.write(str(div))
抓取这些网页时,我发现一些非 ASCII 字符进入了此脚本写出的 html 文件中,我需要将其删除或解析为可读字符。 有什么建议吗?谢谢
从文本中删除非 ASCII 字符。
import string

# Keep only ASCII characters (code point < 128).
# The original condition was inverted: "word not in string.ascii_letters"
# KEEPS exactly the non-ASCII characters it was meant to remove (and also
# strips digits, spaces and punctuation). Iterating a string yields single
# characters, so this filters per character and returns a list of them.
text = [ch for ch in text if ord(ch) < 128]
字符为 8 位 (0-255),ASCII 字符为 7 位 (0-127),因此您可以简单地只保留 ord 值低于 128 的字符,删除其余字符
chr 将整数转换为字符,ord 将字符转换为整数。
# Strip every non-ASCII character (ord >= 128) from the rendered HTML.
# The original line was missing the closing ')' — a SyntaxError. join()
# accepts the generator expression directly, so one pair of parens suffices.
text = ''.join(c for c in str(div) if ord(c) < 128)
这应该是您的最终代码
#!C:/Python27/python
# -*- coding: utf-8 -*-
# Fetch one article page, strip non-ASCII characters from the extracted
# content element, and save it to a local HTML file.
# Python 2 script: urllib2 and urlparse exist only on Python 2.
import requests
from bs4 import BeautifulSoup
import urllib2
import sys
import urlparse
import io

url = "http://www.dlib.org/dlib/november14/beel/11beel.html"
#url = "http://eqa.unibo.it/article/view/4554"
#r = requests.get(url)
html = urllib2.urlopen(url)
soup = BeautifulSoup(html, "html.parser")
#soup = BeautifulSoup(r.text,'lxml')

# dlib.org keeps the article body in a top-aligned <td>;
# the other site uses <div id="content">.
if url.find("http://www.dlib.org") != -1:
    div = soup.find('td', valign='top')
else:
    div = soup.find('div', id='content')

# 'with' closes the file even on error; the original also had a missing
# closing ')' on the join() line, which made the script a SyntaxError.
with open('path/file_name.html', 'w') as f:
    # Drop every non-ASCII character (ord >= 128) before writing.
    f.write(''.join(c for c in str(div) if ord(c) < 128))
尝试先规范化字符串,然后以 ASCII 编码并忽略无法编码的字符。
# -*- coding: utf-8 -*-
from unicodedata import normalize
string = 'úäô§'
if isinstance(string, str):
string = string.decode('utf-8')
print normalize('NFKD', string).encode('ASCII', 'ignore')
>>> uao