使用 Multithreading/Multiprocessing 加速抓取
Speed up scraping with Multithreading/Multiprocessing
我不知道如何在 Python 中使用 multithreading/multiprocessing 来加速下面这个抓取过程:它从 Instagram 的标签 'cats' 下获取所有发帖用户的用户名。
我的目标是让它尽可能快,因为目前这个过程有点慢。
from instaloader import Instaloader
# Hashtag whose posts are scanned for owner usernames.
HASHTAG = 'cats'

loader = Instaloader(sleep=False)

# `users` keeps usernames in first-seen order; `seen` mirrors it as a set so
# the duplicate check is a constant-time lookup instead of a list scan.
users = []
seen = set()
for post in loader.get_hashtag_posts(HASHTAG):
    username = post.owner_username
    if username not in seen:
        seen.add(username)
        users.append(username)
        # NOTE(review): the source lost its indentation; printing only
        # newly seen usernames is assumed here -- confirm.
        print(username)
下面的 LockedIterator 的灵感来自这里(原文 "here" 处为链接)。
import threading
from instaloader import Instaloader
class LockedIterator(object):
    """Iterator wrapper that guards every ``next()`` call with a lock.

    Several threads can pull items from one underlying iterator through
    this wrapper without two of them advancing it at the same time.
    """

    def __init__(self, it):
        """Wrap the iterable *it*; its iterator is obtained once, here."""
        self.lock = threading.Lock()
        self.it = iter(it)

    def __iter__(self):
        """Return self so the wrapper can be used directly in a ``for`` loop."""
        return self

    def __next__(self):
        """Return the next item; exceptions from the wrapped iterator
        (including ``StopIteration``) propagate to the caller."""
        # ``with`` releases the lock even when the wrapped iterator raises.
        with self.lock:
            return next(self.it)
# Hashtag whose posts are scanned for owner usernames.
HASHTAG = 'cats'

# State shared by the worker threads: the set of collected usernames and a
# single LockedIterator wrapped around the hashtag's post iterator.
users = set()
posts = LockedIterator(Instaloader(sleep=False).get_hashtag_posts(HASHTAG))
def worker():
    """Drain the shared ``posts`` iterator, recording each post's owner.

    Each owner username is printed and added to the module-level ``users``
    set. Any exception is printed and then re-raised.
    """
    try:
        for post in posts:
            print(post.owner_username)
            users.add(post.owner_username)
    except Exception as e:
        print(e)
        raise
# Start four threads that all run `worker`.
threads = []
for _ in range(4):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()

# Block until every worker thread has finished.
for t in threads:
    t.join()
我的目标是读取一个输入文件,并为其中的每个标签生成各自单独的输出 txt 文件,希望你能在这方面帮帮我。
需要改的应该是第 45 行附近的内容。
另外,我的水平不高,所以我的尝试里可能包含一些我自己没有发现的错误代码。
作为 input.txt 中标签的示例,我使用了:
wqddt & d2deltas
from instaloader import Instaloader
import threading
import io
import time
import sys
class LockedIterator(object):
    """Iterator wrapper that guards every ``next()`` call with a lock.

    Several threads can pull items from one underlying iterator through
    this wrapper without two of them advancing it at the same time.
    """

    def __init__(self, it):
        """Wrap the iterable *it*; its iterator is obtained once, here."""
        self.lock = threading.Lock()
        self.it = iter(it)

    def __iter__(self):
        """Return self so the wrapper can be used directly in a ``for`` loop."""
        return self

    def __next__(self):
        """Return the next item; exceptions from the wrapped iterator
        (including ``StopIteration``) propagate to the caller."""
        # ``with`` releases the lock even when the wrapped iterator raises.
        with self.lock:
            return next(self.it)
import os  # imported here because only this script section needs it

# Number of threads that share one hashtag's post iterator.
THREADS_PER_HASHTAG = 4  # Input Threads

# One hashtag per line. Blank lines (e.g. a trailing newline) are skipped so
# they are never passed to Instaloader or turned into 'downloads/.txt'.
with open('input.txt', 'r', encoding='utf-8') as f:
    PROFILE = [line.strip() for line in f if line.strip()]

# The output files are opened in append mode, which does not create the
# directory itself.
os.makedirs('downloads', exist_ok=True)

start_time = time.time()


def worker(pro, posts, fil, write_lock):
    """Write the owner username of every post pulled from *posts* to *fil*.

    pro        -- hashtag being processed (used only in the skip message)
    posts      -- LockedIterator shared by all workers of this hashtag
    fil        -- open text file receiving one username per line
    write_lock -- lock serialising writes, since all workers share *fil*

    Errors are reported and end this worker only; the script carries on.
    """
    try:
        for post in posts:
            line = str(post.owner_username) + '\n'
            with write_lock:
                fil.write(line)
    except Exception as e:
        print('Skipping', pro, e)


# Hashtags are processed one after another. Each gets its own post iterator
# and its own output file, so usernames can no longer end up in the file of
# a different hashtag.
for pro in PROFILE:
    try:
        posts = LockedIterator(Instaloader(sleep=False).get_hashtag_posts(pro))
        write_lock = threading.Lock()
        filename = 'downloads/' + pro + '.txt'
        with open(filename, 'a', newline='', encoding="utf-8") as fil:
            threads = [
                threading.Thread(target=worker, args=(pro, posts, fil, write_lock))
                for _ in range(THREADS_PER_HASHTAG)
            ]
            for t in threads:
                t.start()
            # Join inside the `with` so the file stays open until all
            # workers for this hashtag are done.
            for t in threads:
                t.join()
    except Exception as e:
        print('Skipping', pro, e)

end_time = time.time()
print("Done")
print("Time taken : " + str(end_time - start_time) + "sec")
我不知道如何在 Python 中使用 multithreading/multiprocessing 来加速下面这个抓取过程:它从 Instagram 的标签 'cats' 下获取所有发帖用户的用户名。
我的目标是让它尽可能快,因为目前这个过程有点慢。
from instaloader import Instaloader
# Hashtag whose posts are scanned for owner usernames.
HASHTAG = 'cats'

loader = Instaloader(sleep=False)

# `users` keeps usernames in first-seen order; `seen` mirrors it as a set so
# the duplicate check is a constant-time lookup instead of a list scan.
users = []
seen = set()
for post in loader.get_hashtag_posts(HASHTAG):
    username = post.owner_username
    if username not in seen:
        seen.add(username)
        users.append(username)
        # NOTE(review): the source lost its indentation; printing only
        # newly seen usernames is assumed here -- confirm.
        print(username)
下面的 LockedIterator 的灵感来自这里(原文 "here" 处为链接)。
import threading
from instaloader import Instaloader
class LockedIterator(object):
    """Iterator wrapper that guards every ``next()`` call with a lock.

    Several threads can pull items from one underlying iterator through
    this wrapper without two of them advancing it at the same time.
    """

    def __init__(self, it):
        """Wrap the iterable *it*; its iterator is obtained once, here."""
        self.lock = threading.Lock()
        self.it = iter(it)

    def __iter__(self):
        """Return self so the wrapper can be used directly in a ``for`` loop."""
        return self

    def __next__(self):
        """Return the next item; exceptions from the wrapped iterator
        (including ``StopIteration``) propagate to the caller."""
        # ``with`` releases the lock even when the wrapped iterator raises.
        with self.lock:
            return next(self.it)
# Hashtag whose posts are scanned for owner usernames.
HASHTAG = 'cats'

# State shared by the worker threads: the set of collected usernames and a
# single LockedIterator wrapped around the hashtag's post iterator.
users = set()
posts = LockedIterator(Instaloader(sleep=False).get_hashtag_posts(HASHTAG))
def worker():
    """Drain the shared ``posts`` iterator, recording each post's owner.

    Each owner username is printed and added to the module-level ``users``
    set. Any exception is printed and then re-raised.
    """
    try:
        for post in posts:
            print(post.owner_username)
            users.add(post.owner_username)
    except Exception as e:
        print(e)
        raise
# Start four threads that all run `worker`.
threads = []
for _ in range(4):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()

# Block until every worker thread has finished.
for t in threads:
    t.join()
我的目标是读取一个输入文件,并为其中的每个标签生成各自单独的输出 txt 文件,希望你能在这方面帮帮我。
需要改的应该是第 45 行附近的内容。
另外,我的水平不高,所以我的尝试里可能包含一些我自己没有发现的错误代码。
作为 input.txt 中标签的示例,我使用了:wqddt & d2deltas
from instaloader import Instaloader
import threading
import io
import time
import sys
class LockedIterator(object):
    """Iterator wrapper that guards every ``next()`` call with a lock.

    Several threads can pull items from one underlying iterator through
    this wrapper without two of them advancing it at the same time.
    """

    def __init__(self, it):
        """Wrap the iterable *it*; its iterator is obtained once, here."""
        self.lock = threading.Lock()
        self.it = iter(it)

    def __iter__(self):
        """Return self so the wrapper can be used directly in a ``for`` loop."""
        return self

    def __next__(self):
        """Return the next item; exceptions from the wrapped iterator
        (including ``StopIteration``) propagate to the caller."""
        # ``with`` releases the lock even when the wrapped iterator raises.
        with self.lock:
            return next(self.it)
import os  # imported here because only this script section needs it

# Number of threads that share one hashtag's post iterator.
THREADS_PER_HASHTAG = 4  # Input Threads

# One hashtag per line. Blank lines (e.g. a trailing newline) are skipped so
# they are never passed to Instaloader or turned into 'downloads/.txt'.
with open('input.txt', 'r', encoding='utf-8') as f:
    PROFILE = [line.strip() for line in f if line.strip()]

# The output files are opened in append mode, which does not create the
# directory itself.
os.makedirs('downloads', exist_ok=True)

start_time = time.time()


def worker(pro, posts, fil, write_lock):
    """Write the owner username of every post pulled from *posts* to *fil*.

    pro        -- hashtag being processed (used only in the skip message)
    posts      -- LockedIterator shared by all workers of this hashtag
    fil        -- open text file receiving one username per line
    write_lock -- lock serialising writes, since all workers share *fil*

    Errors are reported and end this worker only; the script carries on.
    """
    try:
        for post in posts:
            line = str(post.owner_username) + '\n'
            with write_lock:
                fil.write(line)
    except Exception as e:
        print('Skipping', pro, e)


# Hashtags are processed one after another. Each gets its own post iterator
# and its own output file, so usernames can no longer end up in the file of
# a different hashtag.
for pro in PROFILE:
    try:
        posts = LockedIterator(Instaloader(sleep=False).get_hashtag_posts(pro))
        write_lock = threading.Lock()
        filename = 'downloads/' + pro + '.txt'
        with open(filename, 'a', newline='', encoding="utf-8") as fil:
            threads = [
                threading.Thread(target=worker, args=(pro, posts, fil, write_lock))
                for _ in range(THREADS_PER_HASHTAG)
            ]
            for t in threads:
                t.start()
            # Join inside the `with` so the file stays open until all
            # workers for this hashtag are done.
            for t in threads:
                t.join()
    except Exception as e:
        print('Skipping', pro, e)

end_time = time.time()
print("Done")
print("Time taken : " + str(end_time - start_time) + "sec")