在 Python 中,如何实现(多处理池)
in Python, how to implement (multiprocessing pool)
在抓取代码的这一部分,我从 (url.xml) 文件中存储的 URL 中获取了很多 URL,并且需要很长时间才能完成,如何实现(多处理池)
有什么简单的代码可以解决这个问题吗?谢谢
from bs4 import BeautifulSoup as soup
import requests
from multiprocessing import Pool
page_url = "url.xml"
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice \n"


def _scrape_one(url):
    """Fetch one product page and return its three fields as strings.

    Returns empty strings for any field whose element is missing, so one
    malformed page cannot abort the whole run.
    """
    # timeout keeps one stuck server from hanging a pool worker forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    # find() returns Tag objects (or None) — extract plain text before
    # writing; concatenating Tags with str raised TypeError in the original.
    offers_tag = page_soup.find("input", {"id": "availableOffers"})
    other_tag = page_soup.find("span", {"class": "price"})
    current_tag = page_soup.find("div", {"class": "is"})
    return (
        offers_tag["value"] if offers_tag else "",
        other_tag.text.strip() if other_tag else "",
        current_tag.text.strip() if current_tag else "",
    )


if __name__ == "__main__":
    with open(page_url, "r") as fr:
        urls = [line.strip() for line in fr if line.strip()]
    # The Pool must live under the __main__ guard so child processes do not
    # recursively spawn pools when they import this module (Windows/spawn).
    with Pool(10) as p:  # 10 URLs are fetched at the same time
        rows = p.map(_scrape_one, urls)
    with open(out_filename, "w") as fw:
        fw.write(headers)
        for row in rows:
            fw.write(", ".join(row) + "\n")
一定是这种形式的东西。请进行更改,以便传递给 p.map 的网址是一个网址列表:
from bs4 import BeautifulSoup as soup
import requests
from multiprocessing import Pool
import csv
def parse(url):
    """Scrape one product page.

    Returns an (availableOffers, otherpricess, currentprice) tuple of
    strings, with "$" stripped from the two price fields.
    Raises TypeError/AttributeError if an expected element is missing.
    """
    # timeout keeps a stuck server from blocking a pool worker forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
    otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
    currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
    return availableOffers, otherpricess, currentprice
if __name__ == '__main__':
    urls = [ ... ]  # List of urls to fetch from
    p = Pool(10)  # 10 URLs are processed at the same time
    try:
        # map() blocks until every URL has been parsed, preserving order
        records = p.map(parse, urls)
    finally:
        # close() lets workers exit cleanly once their queue is drained;
        # terminate() (used originally) kills them outright and is meant
        # only for abnormal shutdown.
        p.close()
        p.join()
    # newline="" is required by the csv module so rows are not doubled
    # with blank lines on Windows.
    with open("outfile.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(records)
您可以在 python 中使用 concurrent.futures 标准包进行多处理和多线程处理。
在您的情况下,您不需要多处理,多线程就会有所帮助。因为你的函数并不是计算密集型的——瓶颈在于等待网络 I/O。
通过使用多线程,您可以同时发送多个请求;number_of_threads 参数控制一次并发发送的请求数量。
我创建了一个函数 extract_data_from_url_func,用于从单个 URL 中提取数据;然后把这个函数和 URL 列表一起交给 concurrent.futures 的多线程执行器去运行。
from bs4 import BeautifulSoup as soup
from concurrent.futures import ThreadPoolExecutor
import requests
page_url = "url.xml"  # NOTE(review): unused below — the reader opens the literal "url.xml"; confirm and drop one of the two
number_of_threads = 6  # how many requests are in flight at once
out_filename = "prices.csv"  # scraped rows are written here
headers = "availableOffers,otherpricess,currentprice \n"  # CSV header line
def extract_data_from_url_func(url):
    """Scrape one product page and return its data as one CSV-formatted line.

    The returned string is "availableOffers,otherpricess,currentprice" with
    "$" stripped from the two price fields; it carries no trailing newline,
    so the writer must append one per record.
    Raises TypeError/AttributeError if an expected element is missing.
    """
    print(url)
    # timeout keeps one stuck server from hanging a thread forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
    otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
    currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
    output = ",".join([availableOffers, otherpricess, currentprice])
    print(output)
    return output
with open("url.xml", "r") as fr:
    URLS = [line.strip() for line in fr]

with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
    # executor.map preserves input order, so output rows line up with URLS
    responses = list(executor.map(extract_data_from_url_func, URLS))

with open(out_filename, "w") as fw:
    fw.write(headers)
    for response in responses:
        # BUG FIX: the worker returns a line without a trailing newline;
        # writing it verbatim put every record on one giant line.
        fw.write(response + "\n")
参考:https://docs.python.org/3/library/concurrent.futures.html
在抓取代码的这一部分,我从 (url.xml) 文件中存储的 URL 中获取了很多 URL,并且需要很长时间才能完成,如何实现(多处理池)
有什么简单的代码可以解决这个问题吗?谢谢
from bs4 import BeautifulSoup as soup
import requests
from multiprocessing import Pool
page_url = "url.xml"
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice \n"


def _scrape_one(url):
    """Fetch one product page and return its three fields as strings.

    Returns empty strings for any field whose element is missing, so one
    malformed page cannot abort the whole run.
    """
    # timeout keeps one stuck server from hanging a pool worker forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    # find() returns Tag objects (or None) — extract plain text before
    # writing; concatenating Tags with str raised TypeError in the original.
    offers_tag = page_soup.find("input", {"id": "availableOffers"})
    other_tag = page_soup.find("span", {"class": "price"})
    current_tag = page_soup.find("div", {"class": "is"})
    return (
        offers_tag["value"] if offers_tag else "",
        other_tag.text.strip() if other_tag else "",
        current_tag.text.strip() if current_tag else "",
    )


if __name__ == "__main__":
    with open(page_url, "r") as fr:
        urls = [line.strip() for line in fr if line.strip()]
    # The Pool must live under the __main__ guard so child processes do not
    # recursively spawn pools when they import this module (Windows/spawn).
    with Pool(10) as p:  # 10 URLs are fetched at the same time
        rows = p.map(_scrape_one, urls)
    with open(out_filename, "w") as fw:
        fw.write(headers)
        for row in rows:
            fw.write(", ".join(row) + "\n")
一定是这种形式的东西。请进行更改,以便传递给 p.map 的网址是一个网址列表:
from bs4 import BeautifulSoup as soup
import requests
from multiprocessing import Pool
import csv
def parse(url):
    """Scrape one product page.

    Returns an (availableOffers, otherpricess, currentprice) tuple of
    strings, with "$" stripped from the two price fields.
    Raises TypeError/AttributeError if an expected element is missing.
    """
    # timeout keeps a stuck server from blocking a pool worker forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
    otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
    currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
    return availableOffers, otherpricess, currentprice
if __name__ == '__main__':
    urls = [ ... ]  # List of urls to fetch from
    p = Pool(10)  # 10 URLs are processed at the same time
    try:
        # map() blocks until every URL has been parsed, preserving order
        records = p.map(parse, urls)
    finally:
        # close() lets workers exit cleanly once their queue is drained;
        # terminate() (used originally) kills them outright and is meant
        # only for abnormal shutdown.
        p.close()
        p.join()
    # newline="" is required by the csv module so rows are not doubled
    # with blank lines on Windows.
    with open("outfile.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(records)
您可以在 python 中使用 concurrent.futures 标准包进行多处理和多线程处理。
在您的情况下,您不需要多处理,多线程就会有所帮助。因为你的函数并不是计算密集型的——瓶颈在于等待网络 I/O。
通过使用多线程,您可以同时发送多个请求;number_of_threads 参数控制一次并发发送的请求数量。
我创建了一个函数 extract_data_from_url_func,用于从单个 URL 中提取数据;然后把这个函数和 URL 列表一起交给 concurrent.futures 的多线程执行器去运行。
from bs4 import BeautifulSoup as soup
from concurrent.futures import ThreadPoolExecutor
import requests
page_url = "url.xml"  # NOTE(review): unused below — the reader opens the literal "url.xml"; confirm and drop one of the two
number_of_threads = 6  # how many requests are in flight at once
out_filename = "prices.csv"  # scraped rows are written here
headers = "availableOffers,otherpricess,currentprice \n"  # CSV header line
def extract_data_from_url_func(url):
    """Scrape one product page and return its data as one CSV-formatted line.

    The returned string is "availableOffers,otherpricess,currentprice" with
    "$" stripped from the two price fields; it carries no trailing newline,
    so the writer must append one per record.
    Raises TypeError/AttributeError if an expected element is missing.
    """
    print(url)
    # timeout keeps one stuck server from hanging a thread forever
    response = requests.get(url, timeout=30)
    page_soup = soup(response.text, "html.parser")
    availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
    otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
    currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
    output = ",".join([availableOffers, otherpricess, currentprice])
    print(output)
    return output
with open("url.xml", "r") as fr:
    URLS = [line.strip() for line in fr]

with ThreadPoolExecutor(max_workers=number_of_threads) as executor:
    # executor.map preserves input order, so output rows line up with URLS
    responses = list(executor.map(extract_data_from_url_func, URLS))

with open(out_filename, "w") as fw:
    fw.write(headers)
    for response in responses:
        # BUG FIX: the worker returns a line without a trailing newline;
        # writing it verbatim put every record on one giant line.
        fw.write(response + "\n")
参考:https://docs.python.org/3/library/concurrent.futures.html