Scraping Traditional Chinese with BeautifulSoup4: 输出文件无法显示汉字
Scraping Traditional Chinese with BeautifulSoup4: output file cannot display Chinese characters
这是我要抓取的页面:https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701
该页面以 UTF-8 编码。
这是我的代码:
# NOTE(review): this is the asker's original (buggy) script, quoted verbatim.
# Indentation of the loop body was lost when the code was pasted.
import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request
#make a list of all web pages' urls
webpages=['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']
#start looping through all pages
for item in webpages:
# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}
data = r.get(item, headers=headers)
# Force requests to decode the response body as UTF-8.
data.encoding = 'utf-8'
page_soup = soup(data.text, 'html5lib')
# BUG: mode 'w' with the same filename overwrites the file on every
# iteration, so only the last page is kept.
with open(r'sample_srape.txt', 'w') as file:
# BUG: encode() yields bytes; str() on bytes produces the literal
# repr "b'\xe7\x9a\x84...'" — this is exactly why the output file
# shows escape sequences instead of Chinese characters.
file.write(str(page_soup.encode('utf-8')))
# Redundant: the `with` statement already closes the file.
file.close()
输出的txt文件根本不显示汉字。字符显示如下:“\xe7\x9a\x84\xe5\x9c\x96\xe6\x9b\xb8\xe9\xa4\xa8”。
如何让汉字显示出来?
写入文件时使用 decode("unicode-escape")
你会看到所有的汉字。
import requests as r
from bs4 import BeautifulSoup as soup

# All web pages' URLs to scrape.
webpages = ['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']

# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}

# Open the output file ONCE, in UTF-8, so Chinese characters are written
# as real text (not byte escapes) and pages are not overwritten per loop.
with open(r'sample_srape.txt', 'w', encoding='utf-8') as file:
    for item in webpages:
        data = r.get(item, headers=headers)
        # The pages declare UTF-8; force requests to decode accordingly.
        data.encoding = 'utf-8'
        page_soup = soup(data.text, 'html5lib')
        # str(page_soup) is already a Unicode string — no encode()/decode()
        # round-trip is needed. decode("unicode-escape") only "worked" by
        # accident (BeautifulSoup.decode's first argument is pretty_print).
        file.write(str(page_soup))
最终工作代码:
import requests as r
from bs4 import BeautifulSoup as soup

# All web pages' URLs to scrape.
webpages = ['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']

# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}

# Open the output file ONCE, in UTF-8, so Chinese characters are written
# as real text and each page is appended instead of overwriting the file.
with open(r'sample_srape.txt', 'w', encoding='utf-8') as file:
    for item in webpages:
        data = r.get(item, headers=headers)
        # The pages declare UTF-8; force requests to decode accordingly.
        data.encoding = 'utf-8'
        page_soup = soup(data.text, 'html5lib')
        # str(page_soup) is a Unicode string; writing it through a file
        # opened with encoding='utf-8' displays the characters correctly.
        # (decode("unicode-escape") was a misuse of BeautifulSoup.decode,
        # whose first positional parameter is pretty_print.)
        file.write(str(page_soup))
这是我要抓取的页面:https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701
该页面以 UTF-8 编码。
这是我的代码:
# NOTE(review): this is the asker's original (buggy) script, quoted verbatim.
# Indentation of the loop body was lost when the code was pasted.
import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request
#make a list of all web pages' urls
webpages=['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']
#start looping through all pages
for item in webpages:
# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}
data = r.get(item, headers=headers)
# Force requests to decode the response body as UTF-8.
data.encoding = 'utf-8'
page_soup = soup(data.text, 'html5lib')
# BUG: mode 'w' with the same filename overwrites the file on every
# iteration, so only the last page is kept.
with open(r'sample_srape.txt', 'w') as file:
# BUG: encode() yields bytes; str() on bytes produces the literal
# repr "b'\xe7\x9a\x84...'" — this is exactly why the output file
# shows escape sequences instead of Chinese characters.
file.write(str(page_soup.encode('utf-8')))
# Redundant: the `with` statement already closes the file.
file.close()
输出的txt文件根本不显示汉字。字符显示如下:“\xe7\x9a\x84\xe5\x9c\x96\xe6\x9b\xb8\xe9\xa4\xa8”。
如何让汉字显示出来?
写入文件时使用 decode("unicode-escape")
你会看到所有的汉字。
import requests as r
from bs4 import BeautifulSoup as soup

# All web pages' URLs to scrape.
webpages = ['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']

# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}

# Open the output file ONCE, in UTF-8, so Chinese characters are written
# as real text (not byte escapes) and pages are not overwritten per loop.
with open(r'sample_srape.txt', 'w', encoding='utf-8') as file:
    for item in webpages:
        data = r.get(item, headers=headers)
        # The pages declare UTF-8; force requests to decode accordingly.
        data.encoding = 'utf-8'
        page_soup = soup(data.text, 'html5lib')
        # str(page_soup) is already a Unicode string — no encode()/decode()
        # round-trip is needed. decode("unicode-escape") only "worked" by
        # accident (BeautifulSoup.decode's first argument is pretty_print).
        file.write(str(page_soup))
最终工作代码:
import requests as r
from bs4 import BeautifulSoup as soup

# All web pages' URLs to scrape.
webpages = ['https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B701', 'https://zh.wikisource.org/wiki/%E8%AE%80%E9%80%9A%E9%91%92%E8%AB%96/%E5%8D%B702']

# Identify as a browser so the wiki does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0'}

# Open the output file ONCE, in UTF-8, so Chinese characters are written
# as real text and each page is appended instead of overwriting the file.
with open(r'sample_srape.txt', 'w', encoding='utf-8') as file:
    for item in webpages:
        data = r.get(item, headers=headers)
        # The pages declare UTF-8; force requests to decode accordingly.
        data.encoding = 'utf-8'
        page_soup = soup(data.text, 'html5lib')
        # str(page_soup) is a Unicode string; writing it through a file
        # opened with encoding='utf-8' displays the characters correctly.
        # (decode("unicode-escape") was a misuse of BeautifulSoup.decode,
        # whose first positional parameter is pretty_print.)
        file.write(str(page_soup))