Web scraper script - How can I make it run quicker?
I just started with Python 3 and I like reading light novels, so my first Python project is a script that scrapes my favourite light novels from the web and downloads them.
So far everything works, but it is really slow, especially the step that checks whether a chapter is already in the folder, and the chapter downloads themselves.
At the moment the script needs 17.8 minutes to check and download 694 chapters.
Is there any way to speed up at least the checking process, since each chapter only needs to be downloaded once?
https://github.com/alpenmilch411/LN_scrape/blob/master/LN_scraper.py
import requests
from bs4 import BeautifulSoup
import os
import getpass


# Gets chapter links
def get_chapter_links(index_url):
    r = requests.get(index_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    links = soup.find_all('a')

    url_list = []
    for url in links:
        # Keep only anchors that point at chapter pages of this book
        if 'http://www.wuxiaworld.com/cdindex-html/book' in str(url):
            url_list.append(url.get('href'))
    return url_list
# Gets chapter content
def get_chapters(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    chapter_text = soup.find_all('div', {'class': 'entry-content'})

    # Puts chapter text into the 'chapter' variable
    chapter = ''
    for c in chapter_text:
        content = c.text.strip()
        # Removing 'Previous Next Chapter'. Note that str.strip() removes a
        # *set of characters* from the ends, not a substring, so replace()
        # is the correct call here.
        chapter += content.replace('Previous Next Chapter', '')
    return chapter
# Gets title of chapter
def get_title(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    title = soup.find_all('h1', {'class': 'entry-title'})

    chapter_title = ''
    for l in title:
        chapter_title += l.text
    return chapter_title


# Gets title of story
def get_story_title(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    story = soup.find_all('h1', {'class': 'entry-title'})

    story_title = ''
    for content in story:
        story_title += content.text
    return story_title
# url on which links can be found
links = 'http://www.wuxiaworld.com/cdindex-html/'

# Checks whether a directory already exists and creates a new one if necessary
story_title = get_story_title(links)
path = '/users/{}/documents/{}'.format(getpass.getuser(), story_title)
if not os.path.isdir(path):
    os.mkdir(path)

link_list = get_chapter_links(links)

# Copies chapters into text files
for x in link_list:
    # Checks whether chapter already exists
    # TODO Make checking process quicker
    chapter_title = get_title(str(x)).replace(',', '') + '.txt'
    if not os.path.isfile(path + '/' + chapter_title):
        story_title = get_story_title(links)
        chapter_text = get_chapters(str(x))
        with open(path + '/' + chapter_title, 'w') as f:
            f.write(chapter_text)
        print('{} saved.'.format(chapter_title))

print('All chapters are up to date.')
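One note on where the time goes: every call to get_title() downloads the full chapter page just to build a filename, so the existence check alone costs one HTTP request per chapter. Below is a minimal sketch of one way to make that check free of network traffic, assuming the last URL path segment (the chapter slug) is stable and unique per chapter; that is an assumption about the site, not something the script above guarantees:

import os
from urllib.parse import urlparse

def chapter_filename(url):
    # Hypothetical helper, not part of the original script: derive the
    # filename from the URL slug instead of fetching the page for its
    # <h1> title. Assumes the slug is stable and unique per chapter.
    slug = urlparse(url).path.rstrip('/').split('/')[-1]
    return slug + '.txt'

# The existence check then needs no request at all:
# for x in link_list:
#     if not os.path.isfile(os.path.join(path, chapter_filename(x))):
#         ...fetch and save the chapter...

With that change only new chapters trigger any network traffic, and reusing a single requests.Session for the remaining downloads keeps the TCP connection alive between requests, which usually speeds up sequential fetches further.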
Use lxml with BeautifulSoup. It is faster than html.parser.
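A minimal sketch of that change, assuming lxml is installed (for example via pip install lxml); only the parser argument passed to BeautifulSoup changes:

import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.wuxiaworld.com/cdindex-html/')
soup = BeautifulSoup(r.content, 'lxml')  # drop-in replacement for 'html.parser'

This pays off on every parse: in the script above each chapter page is fetched and parsed twice (once in get_title() and once in get_chapters()), so 694 chapters mean roughly 1,400 parses per full run.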