将不一致的编码转换为utf-8 Python 3.4 BS 4.3
Converting inconsistent encoding to utf-8 Python 3.4 BS 4.3
有没有办法将编码不一致的文档转换为utf-8?
我的项目涉及从 MS SQL 2000 读取文本(通常是 text 或 varchar 类型),对文本进行“清理”(剥离样式属性,将部分内容包装在 div 中),并将清理后的记录插入 MySQL 表。
我经常会发现这样的文字:
重要道路包括城市西北侧的费萨尔国王高速公路、东侧的法提赫高速公路和南岸的伊萨宾萨勒曼高速公路。在隔水相望的附近岛屿 المحرق(穆哈拉格)上,20 号和 21 号高速公路环绕着机场。
但是处理后得到的却是 ???
我的代码:
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup as B_S, UnicodeDammit as U_D
import pymysql as db
import time
def mod_content():
    """Clean the HTML of the selected ``content_city`` rows and append it to a text file.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, parses each row's ``html_content`` with
    BeautifulSoup, and appends the prettified, whitespace-compacted markup
    to ``raw_182_mod.txt``.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.168.0.131', port=3306, user='USER', passwd='PASS', db='GRW', charset='utf8' )
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        for rec in c:
            contents = rec[2]  # html_content column
            # detwingle() is a byte-string API (mixed UTF-8 / Windows-1252 input).
            # NOTE(review): with charset declared on the connection the column
            # presumably arrives already decoded as str - confirm; in that case
            # there is nothing to detwingle and the call is skipped.
            if isinstance( contents, bytes ):
                contents = U_D.detwingle( contents )
            soup = B_S( contents )
            rs = soup.find_all( 'div', { 'class':'node_content' } )
            for r in rs:
                # ... do clean up stuff ...
                pass
            contents = soup.prettify( formatter='html' ) # B_S function
            # Collapse every run of whitespace (including line breaks) to one space.
            contents = ' '.join( contents.split() )
            ##### writing to a txt file here, but would want to do a MySQL INSERT
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-Latin text such as Arabic.
            with open( 'raw_182_mod.txt', 'a', 4, encoding='utf-8' ) as raw:
                raw.write( contents )
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()
有没有办法将所有内容都转换为 utf-8?
更新 3/24
因此,根据此 post ( How to make unicode string with python3 ) Python2 的 unicode 是 Python3 中的 str()。 contents = str( contents, 'utf-8' ) 给我 TypeErrors,而 contents = contents.decode( 'utf-8' ) 给我 AttributeError: 'str' object has no attribute 'decode'。那么,我该如何将其整合到我的工作流程中呢?
def mod_content():
    """Diagnostic variant: clean the selected rows' HTML and report the type at each stage.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, parses each row's ``html_content`` with
    BeautifulSoup, and appends the prettified, whitespace-compacted markup
    to ``raw_182_mod.txt``. The type of the cursor, the raw column value,
    the soup and the final string is printed along the way.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.168.0.131', port=3306, user='wtp', passwd='wtp', db='GRW', charset='utf8' )
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        print( 'type(c) is', type( c ) ) ## type(c) is <class 'pymysql.cursors.Cursor'>
        for rec in c:
            contents = rec[2]  # html_content column
            print( 'type(contents) is', type( contents ) ) ## type(contents) is <class 'str'>
            # Observed while debugging:
            #   print( contents )                   -> shows ?????
            #   U_D.detwingle( contents )           -> disabled
            #   str( contents, 'utf-8' )            -> TypeError: decoding str is not supported
            # The value is already a str here, so there is nothing left to decode.
            soup = B_S( contents )
            print( 'type(soup) is', type( soup ) ) ## type(soup) is <class 'bs4.BeautifulSoup'>
            rs = soup.find_all( 'div', { 'class':'node_content' } )
            for r in rs:
                # ... do clean up stuff ...
                pass
            contents = soup.prettify( formatter='html' ) # B_S function
            # Collapse every run of whitespace (including line breaks) to one space.
            contents = ' '.join( contents.split() )
            print( 'type(contents) AFTER prettify is', type( contents ) ) ## type(contents) AFTER prettify is <class 'str'>
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-Latin text such as Arabic.
            with open( 'raw_182_mod.txt', 'a', 4, encoding='utf-8' ) as raw:
                raw.write( contents )
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()
str1 = "Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of المحرق (Muharraq), highways 20 and 21 encircle the airport."
要将文本转换为 unicode(以下为 Python 2 写法),请使用
unicode(str1,"utf-8")
u'Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of \u0627\u0644\u0645\u062d\u0631\u0642 (Muharraq), highways 20 and 21 encircle the airport.'
要从字符串中删除非 ASCII(Unicode)字符,请使用
import unicodedata
unicodedata.normalize('NFKD', unicode(str1,"utf-8")).encode('ascii','ignore')
'Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of (Muharraq), highways 20 and 21 encircle the airport.'
更新 3/31
这是我解决这个问题的方法。如果有更好的方法,请告诉我
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup as B_S
import pymysql as db
import time
def mod_content():
    """Clean the selected rows' HTML, then write it back to MySQL and/or to a file.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, takes the ``<body>`` of each row's parsed
    ``html_content``, serialises it as ASCII and compacts the whitespace.
    Depending on the ``updateSQL`` / ``printToFile`` switches the result is
    written back to ``content_city`` and/or appended to an ``*_MOD.html`` file.

    NOTE(review): ``updateSQL``, ``printToFile``, ``notes_2str``, ``recID``,
    ``nid``, ``tempRoot``, ``NIDs`` and ``key`` are not defined in this
    excerpt; they are presumably set by surrounding code - confirm.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.xxx.x.xxx', port=3306, user='USER', passwd='PASSWORD', db='GRW', charset='utf8' ) ## declare charset
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        # fetchall() materialises the result set first, so the same cursor can
        # be reused for the UPDATE inside the loop.
        for rec in c.fetchall():
            contents = rec[2]  # html_content column
            temp = B_S( contents )
            soup = temp.body
            allDivs = soup.find_all( 'div', { 'class':'picright' } )
            for div in allDivs:
                print( str( div )[ :80 ] )
                # ... do clean up stuff ...
            # now, output the data. Encoding to ASCII and decoding again yields
            # a str holding only ASCII characters (bs4 is expected to emit
            # character references for anything outside ASCII - see bs4 docs).
            contents = soup.encode( 'ascii' )
            content_2str = contents.decode( 'utf-8' )
            # The original single-quote replacement was a no-op as written and
            # is not needed: the driver quotes the bound parameters below.
            content_2str = ' '.join( content_2str.split() ) ## removes extra spaces and line breaks - now compacted
            ## I can now print it to file or update MySQL
            if updateSQL == 'yes':
                # Bound parameters instead of string concatenation: quotes in
                # the HTML can no longer break (or inject into) the statement.
                sql = ( "UPDATE content_city SET html_content = %s, notes = %s "
                        "WHERE city_id = %s AND nid = %s" )
                c.execute( sql, ( content_2str, notes_2str, recID, nid ) )
                conn.commit()
            if printToFile == 'yes':
                file2 = tempRoot + NIDs[ key ]+'_MOD.html'
                # Explicit UTF-8 so the output does not depend on the platform
                # default encoding.
                with open( file2, 'a', 4, encoding='utf-8' ) as mod:
                    mod.write( '\n' + str( nid ) + '\n' + str( recID ) + '\n' + \
                               content_2str + '\n' + notes_2str + '\n\n' )
                    # NOTE(review): purpose of this pause before closing is unclear - confirm.
                    time.sleep(1)
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()
有没有办法将编码不一致的文档转换为utf-8?
我的项目涉及从 MS SQL 2000 读取文本(通常是 text 或 varchar 类型),对文本进行“清理”(剥离样式属性,将部分内容包装在 div 中),并将清理后的记录插入 MySQL 表。
我经常会发现这样的文字:
重要道路包括城市西北侧的费萨尔国王高速公路、东侧的法提赫高速公路和南岸的伊萨宾萨勒曼高速公路。在隔水相望的附近岛屿 المحرق(穆哈拉格)上,20 号和 21 号高速公路环绕着机场。
但是处理后得到的却是 ???
我的代码:
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup as B_S, UnicodeDammit as U_D
import pymysql as db
import time
def mod_content():
    """Clean the HTML of the selected ``content_city`` rows and append it to a text file.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, parses each row's ``html_content`` with
    BeautifulSoup, and appends the prettified, whitespace-compacted markup
    to ``raw_182_mod.txt``.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.168.0.131', port=3306, user='USER', passwd='PASS', db='GRW', charset='utf8' )
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        for rec in c:
            contents = rec[2]  # html_content column
            # detwingle() is a byte-string API (mixed UTF-8 / Windows-1252 input).
            # NOTE(review): with charset declared on the connection the column
            # presumably arrives already decoded as str - confirm; in that case
            # there is nothing to detwingle and the call is skipped.
            if isinstance( contents, bytes ):
                contents = U_D.detwingle( contents )
            soup = B_S( contents )
            rs = soup.find_all( 'div', { 'class':'node_content' } )
            for r in rs:
                # ... do clean up stuff ...
                pass
            contents = soup.prettify( formatter='html' ) # B_S function
            # Collapse every run of whitespace (including line breaks) to one space.
            contents = ' '.join( contents.split() )
            ##### writing to a txt file here, but would want to do a MySQL INSERT
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-Latin text such as Arabic.
            with open( 'raw_182_mod.txt', 'a', 4, encoding='utf-8' ) as raw:
                raw.write( contents )
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()
有没有办法将所有内容都转换为 utf-8?
更新 3/24 因此,根据此 post ( How to make unicode string with python3 ) Python2 的 unicode 是 Python3 中的 str()。 contents = str( contents, 'utf-8' ) 给我 TypeErrors,而 contents = contents.decode( 'utf-8' ) 给我 AttributeError: 'str' object has no attribute 'decode'。那么,我该如何将其整合到我的工作流程中呢?
def mod_content():
    """Diagnostic variant: clean the selected rows' HTML and report the type at each stage.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, parses each row's ``html_content`` with
    BeautifulSoup, and appends the prettified, whitespace-compacted markup
    to ``raw_182_mod.txt``. The type of the cursor, the raw column value,
    the soup and the final string is printed along the way.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.168.0.131', port=3306, user='wtp', passwd='wtp', db='GRW', charset='utf8' )
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        print( 'type(c) is', type( c ) ) ## type(c) is <class 'pymysql.cursors.Cursor'>
        for rec in c:
            contents = rec[2]  # html_content column
            print( 'type(contents) is', type( contents ) ) ## type(contents) is <class 'str'>
            # Observed while debugging:
            #   print( contents )                   -> shows ?????
            #   U_D.detwingle( contents )           -> disabled
            #   str( contents, 'utf-8' )            -> TypeError: decoding str is not supported
            # The value is already a str here, so there is nothing left to decode.
            soup = B_S( contents )
            print( 'type(soup) is', type( soup ) ) ## type(soup) is <class 'bs4.BeautifulSoup'>
            rs = soup.find_all( 'div', { 'class':'node_content' } )
            for r in rs:
                # ... do clean up stuff ...
                pass
            contents = soup.prettify( formatter='html' ) # B_S function
            # Collapse every run of whitespace (including line breaks) to one space.
            contents = ' '.join( contents.split() )
            print( 'type(contents) AFTER prettify is', type( contents ) ) ## type(contents) AFTER prettify is <class 'str'>
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-Latin text such as Arabic.
            with open( 'raw_182_mod.txt', 'a', 4, encoding='utf-8' ) as raw:
                raw.write( contents )
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()
str1 = "Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of المحرق (Muharraq), highways 20 and 21 encircle the airport."
要将文本转换为 unicode(以下为 Python 2 写法),请使用
unicode(str1,"utf-8")
u'Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of \u0627\u0644\u0645\u062d\u0631\u0642 (Muharraq), highways 20 and 21 encircle the airport.'
要从字符串中删除非 ASCII(Unicode)字符,请使用
import unicodedata
unicodedata.normalize('NFKD', unicode(str1,"utf-8")).encode('ascii','ignore')
'Important roads include King Faisal Highway on the northwestern side of the city, Al Fatih Highway on the eastern side, and Sh Isa Bin Salman Highway along the southern shore. Across the water on the nearby island of (Muharraq), highways 20 and 21 encircle the airport.'
更新 3/31 这是我解决这个问题的方法。如果有更好的方法,请告诉我
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup as B_S
import pymysql as db
import time
def mod_content():
    """Clean the selected rows' HTML, then write it back to MySQL and/or to a file.

    Connects to MySQL, selects the rows matching the hard-coded
    ``nid``/``city_id`` filter, takes the ``<body>`` of each row's parsed
    ``html_content``, serialises it as ASCII and compacts the whitespace.
    Depending on the ``updateSQL`` / ``printToFile`` switches the result is
    written back to ``content_city`` and/or appended to an ``*_MOD.html`` file.

    NOTE(review): ``updateSQL``, ``printToFile``, ``notes_2str``, ``recID``,
    ``nid``, ``tempRoot``, ``NIDs`` and ``key`` are not defined in this
    excerpt; they are presumably set by surrounding code - confirm.

    Takes no arguments and returns ``None``. Database and file errors
    propagate to the caller; the connection and the file are closed either way.
    """
    conn = db.connect( host='192.xxx.x.xxx', port=3306, user='USER', passwd='PASSWORD', db='GRW', charset='utf8' ) ## declare charset
    try:
        c = conn.cursor()
        sql = "SELECT city_id,nid,html_content,notes FROM content_city WHERE nid = 13 AND city_id = 182 ORDER BY city_id"
        c.execute( sql )
        # fetchall() materialises the result set first, so the same cursor can
        # be reused for the UPDATE inside the loop.
        for rec in c.fetchall():
            contents = rec[2]  # html_content column
            temp = B_S( contents )
            soup = temp.body
            allDivs = soup.find_all( 'div', { 'class':'picright' } )
            for div in allDivs:
                print( str( div )[ :80 ] )
                # ... do clean up stuff ...
            # now, output the data. Encoding to ASCII and decoding again yields
            # a str holding only ASCII characters (bs4 is expected to emit
            # character references for anything outside ASCII - see bs4 docs).
            contents = soup.encode( 'ascii' )
            content_2str = contents.decode( 'utf-8' )
            # The original single-quote replacement was a no-op as written and
            # is not needed: the driver quotes the bound parameters below.
            content_2str = ' '.join( content_2str.split() ) ## removes extra spaces and line breaks - now compacted
            ## I can now print it to file or update MySQL
            if updateSQL == 'yes':
                # Bound parameters instead of string concatenation: quotes in
                # the HTML can no longer break (or inject into) the statement.
                sql = ( "UPDATE content_city SET html_content = %s, notes = %s "
                        "WHERE city_id = %s AND nid = %s" )
                c.execute( sql, ( content_2str, notes_2str, recID, nid ) )
                conn.commit()
            if printToFile == 'yes':
                file2 = tempRoot + NIDs[ key ]+'_MOD.html'
                # Explicit UTF-8 so the output does not depend on the platform
                # default encoding.
                with open( file2, 'a', 4, encoding='utf-8' ) as mod:
                    mod.write( '\n' + str( nid ) + '\n' + str( recID ) + '\n' + \
                               content_2str + '\n' + notes_2str + '\n\n' )
                    # NOTE(review): purpose of this pause before closing is unclear - confirm.
                    time.sleep(1)
    finally:
        conn.close()
    print( 'mod_content Complete' )
mod_content()