无法使用两个线程在脚本中执行两个函数
Unable to use two Threads to execute two functions within a script
我结合使用 python 和 Thread
创建了一个抓取程序,以加快执行速度。爬虫应该解析网页中以不同字母结尾的所有可用链接。它确实解析了它们。
但是,我希望再次使用 Thread
解析来自这些单独链接的所有 names
和 phone
号码。第一部分我已经设法使用 Thread
运行起来,但我不知道如何创建另一个 Thread
来执行脚本的后半部分。
我本可以将它们包装在一个 Thread
中,但我的目的是知道如何使用两个 Threads
来执行两个功能。
对于第一部分:我尝试了如下所示并且有效
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch one letter-index page and return the hrefs of its listing links.

    Args:
        mainurl: URL of the page to download.

    Returns:
        A list holding the ``href`` attribute of every element matched by
        the CSS selector ``.proSitemapLink a``.

    Note:
        When this function is used as a ``threading.Thread`` target, the
        returned list is discarded by the thread.
    """
    # Use the argument rather than the module-level ``link`` loop variable:
    # that global is rebound by the main loop while worker threads are still
    # running, and it does not exist at all when the module is imported.
    response = requests.get(mainurl).text
    tree = html.fromstring(response)
    return [container.attrib['href'] for container in tree.cssselect(".proSitemapLink a")]
if __name__ == '__main__':
    # Start one worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every worker has finished.
    for worker in workers:
        worker.join()
我的问题是:如何在另一个 Thread
中使用 sub_links()
函数?
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch one letter-index page and return the hrefs of its listing links.

    Args:
        mainurl: URL of the page to download.

    Returns:
        A list holding the ``href`` attribute of every element matched by
        the CSS selector ``.proSitemapLink a``.

    Note:
        When this function is used as a ``threading.Thread`` target, the
        returned list is discarded by the thread.
    """
    # Use the argument rather than the module-level ``link`` loop variable:
    # that global is rebound by the main loop while worker threads are still
    # running, and it does not exist at all when the module is imported.
    response = requests.get(mainurl).text
    tree = html.fromstring(response)
    return [container.attrib['href'] for container in tree.cssselect(".proSitemapLink a")]
def sub_links(process_links):
    """Download a listing page and print a name/phone pair per listing.

    Args:
        process_links: URL of the page to fetch.

    For every element matching ``.proListing`` this prints the text of its
    first ``h2 a`` match and its first ``.proListingPhone`` match. An empty
    string is printed in place of a value whose lookup raises.
    """
    page_source = requests.get(process_links).text
    document = html.fromstring(page_source)
    for listing in document.cssselect(".proListing"):
        try:
            name = listing.cssselect("h2 a")[0].text
        except Exception:
            name = ""
        try:
            phone = listing.cssselect(".proListingPhone")[0].text
        except Exception:
            phone = ""
        print(name, phone)
if __name__ == '__main__':
    # Start one worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every worker has finished.
    for worker in workers:
        worker.join()
您可以像启动第一个线程一样启动更多线程
from threading import Thread
# First thread: run alphabetical_links with ``link`` passed as ``mainurl``.
t1 = Thread(target=alphabetical_links, kwargs=dict(mainurl=link))
t1.start()
# Second thread: run sub_links with ``link`` passed as ``process_links``.
t2 = Thread(target=sub_links, kwargs=dict(process_links=link))
t2.start()
尝试用自己的线程更新 alphabetical_links
:
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch a letter-index page and scrape each linked page in its own thread.

    Args:
        mainurl: URL of the page to download.

    Collects the ``href`` of every ``.proSitemapLink a`` element, starts one
    ``threading.Thread`` running ``sub_links`` per href, and returns only
    after all of those threads have been joined.
    """
    page_source = requests.get(mainurl).text
    document = html.fromstring(page_source)
    hrefs = [anchor.attrib['href'] for anchor in document.cssselect(".proSitemapLink a")]
    workers = []
    for href in hrefs:
        worker = threading.Thread(target=sub_links, args=(href,))
        worker.start()
        workers.append(worker)
    # Wait for every inner thread before this (outer) thread finishes.
    for worker in workers:
        worker.join()
def sub_links(process_links):
    """Download a listing page and print a name/phone pair per listing.

    Args:
        process_links: URL of the page to fetch.

    For every element matching ``.proListing`` this prints the text of its
    first ``h2 a`` match and its first ``.proListingPhone`` match. An empty
    string is printed in place of a value whose lookup raises.
    """
    page_source = requests.get(process_links).text
    document = html.fromstring(page_source)
    for listing in document.cssselect(".proListing"):
        try:
            name = listing.cssselect("h2 a")[0].text
        except Exception:
            name = ""
        try:
            phone = listing.cssselect(".proListingPhone")[0].text
        except Exception:
            phone = ""
        print(name, phone)
if __name__ == '__main__':
    # Start one outer worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every outer worker has finished.
    for worker in workers:
        worker.join()
请注意,这只是如何管理 "inner Threads" 的示例。由于许多线程同时启动,您的系统可能会由于资源不足而无法启动其中的一些线程,您将得到 RuntimeError: can't start new thread
异常。在这种情况下,您应该尝试实施 ThreadPool
我结合使用 python 和 Thread
创建了一个抓取程序,以加快执行速度。爬虫应该解析网页中以不同字母结尾的所有可用链接。它确实解析了它们。
但是,我希望再次使用 Thread
解析来自这些单独链接的所有 names
和 phone
号码。第一部分我已经设法使用 Thread
运行起来,但我不知道如何创建另一个 Thread
来执行脚本的后半部分。
我本可以将它们包装在一个 Thread
中,但我的目的是知道如何使用两个 Threads
来执行两个功能。
对于第一部分:我尝试了如下所示并且有效
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch one letter-index page and return the hrefs of its listing links.

    Args:
        mainurl: URL of the page to download.

    Returns:
        A list holding the ``href`` attribute of every element matched by
        the CSS selector ``.proSitemapLink a``.

    Note:
        When this function is used as a ``threading.Thread`` target, the
        returned list is discarded by the thread.
    """
    # Use the argument rather than the module-level ``link`` loop variable:
    # that global is rebound by the main loop while worker threads are still
    # running, and it does not exist at all when the module is imported.
    response = requests.get(mainurl).text
    tree = html.fromstring(response)
    return [container.attrib['href'] for container in tree.cssselect(".proSitemapLink a")]
if __name__ == '__main__':
    # Start one worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every worker has finished.
    for worker in workers:
        worker.join()
我的问题是:如何在另一个 Thread
中使用 sub_links()
函数?
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch one letter-index page and return the hrefs of its listing links.

    Args:
        mainurl: URL of the page to download.

    Returns:
        A list holding the ``href`` attribute of every element matched by
        the CSS selector ``.proSitemapLink a``.

    Note:
        When this function is used as a ``threading.Thread`` target, the
        returned list is discarded by the thread.
    """
    # Use the argument rather than the module-level ``link`` loop variable:
    # that global is rebound by the main loop while worker threads are still
    # running, and it does not exist at all when the module is imported.
    response = requests.get(mainurl).text
    tree = html.fromstring(response)
    return [container.attrib['href'] for container in tree.cssselect(".proSitemapLink a")]
def sub_links(process_links):
    """Download a listing page and print a name/phone pair per listing.

    Args:
        process_links: URL of the page to fetch.

    For every element matching ``.proListing`` this prints the text of its
    first ``h2 a`` match and its first ``.proListingPhone`` match. An empty
    string is printed in place of a value whose lookup raises.
    """
    page_source = requests.get(process_links).text
    document = html.fromstring(page_source)
    for listing in document.cssselect(".proListing"):
        try:
            name = listing.cssselect("h2 a")[0].text
        except Exception:
            name = ""
        try:
            phone = listing.cssselect(".proListingPhone")[0].text
        except Exception:
            phone = ""
        print(name, phone)
if __name__ == '__main__':
    # Start one worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every worker has finished.
    for worker in workers:
        worker.join()
您可以像启动第一个线程一样启动更多线程
from threading import Thread
# First thread: run alphabetical_links with ``link`` passed as ``mainurl``.
t1 = Thread(target=alphabetical_links, kwargs=dict(mainurl=link))
t1.start()
# Second thread: run sub_links with ``link`` passed as ``process_links``.
t2 = Thread(target=sub_links, kwargs=dict(process_links=link))
t2.start()
尝试用自己的线程更新 alphabetical_links
:
import requests
import threading
from lxml import html
main_url = "https://www.houzz.com/proListings/letter/{}"
def alphabetical_links(mainurl):
    """Fetch a letter-index page and scrape each linked page in its own thread.

    Args:
        mainurl: URL of the page to download.

    Collects the ``href`` of every ``.proSitemapLink a`` element, starts one
    ``threading.Thread`` running ``sub_links`` per href, and returns only
    after all of those threads have been joined.
    """
    page_source = requests.get(mainurl).text
    document = html.fromstring(page_source)
    hrefs = [anchor.attrib['href'] for anchor in document.cssselect(".proSitemapLink a")]
    workers = []
    for href in hrefs:
        worker = threading.Thread(target=sub_links, args=(href,))
        worker.start()
        workers.append(worker)
    # Wait for every inner thread before this (outer) thread finishes.
    for worker in workers:
        worker.join()
def sub_links(process_links):
    """Download a listing page and print a name/phone pair per listing.

    Args:
        process_links: URL of the page to fetch.

    For every element matching ``.proListing`` this prints the text of its
    first ``h2 a`` match and its first ``.proListingPhone`` match. An empty
    string is printed in place of a value whose lookup raises.
    """
    page_source = requests.get(process_links).text
    document = html.fromstring(page_source)
    for listing in document.cssselect(".proListing"):
        try:
            name = listing.cssselect("h2 a")[0].text
        except Exception:
            name = ""
        try:
            phone = listing.cssselect(".proListingPhone")[0].text
        except Exception:
            phone = ""
        print(name, phone)
if __name__ == '__main__':
    # Start one outer worker per letter page; chr(97)..chr(122) is 'a'..'z'.
    workers = []
    for code in range(97, 123):
        link = main_url.format(chr(code))
        worker = threading.Thread(target=alphabetical_links, args=(link,))
        worker.start()
        workers.append(worker)
    # Block until every outer worker has finished.
    for worker in workers:
        worker.join()
请注意,这只是如何管理 "inner Threads" 的示例。由于许多线程同时启动,您的系统可能会由于资源不足而无法启动其中的一些线程,您将得到 RuntimeError: can't start new thread
异常。在这种情况下,您应该尝试实施 ThreadPool