tweepy user_timeline 分页返回每个 Twitter 用户最多 3200 条推文

tweepy user_timeline with pagination returning max of 3200 tweets per twitter user

我正在使用此处的代码抓取一些用户的推文并导出为 .csv:https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25

我想理想地获取每个用户的所有推文,但它似乎仅限于最近的 3200 条推文。这是我以特朗普为例的确切代码:

ids = ['realDonaldTrump']


def extract_hashtags(hashtag_list):
    """Return the hashtag texts from a list of Twitter hashtag entities.

    Parameters
    ----------
    hashtag_list : list of dict
        Hashtag entities as returned in a status's ``entities['hashtags']``,
        each containing a ``'text'`` key.

    Returns
    -------
    str
        The hashtag texts joined by single spaces; '' for an empty list.
    """
    # str.join replaces the original quadratic `+=` concatenation loop and
    # needs no trailing .strip().
    return ' '.join(hashtag['text'] for hashtag in hashtag_list)
        
#from https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25
#from https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25
class TweetMiner(object):
    """Collect tweet metadata from a user's timeline via the tweepy API.

    NOTE: Twitter's standard user_timeline endpoint is hard-capped at the
    most recent ~3200 tweets per user, regardless of how large
    ``result_limit * max_pages`` is.
    """

    # Class-level defaults kept for backward compatibility with the
    # original interface.
    result_limit    =   20
    data            =   []
    api             =   False

    twitter_keys = {}  # redacted — fill in with real credentials.
    # BUG FIX: the original `{ #redacted   }` hid the closing brace inside
    # the comment, leaving the dict literal unclosed (a syntax error).

    def __init__(self, keys_dict=twitter_keys, api=api, result_limit = 20):
        """Authenticate against Twitter and store the per-page page size.

        Parameters
        ----------
        keys_dict : dict
            Credentials: 'consumer_key', 'consumer_secret',
            'access_token_key', 'access_token_secret'.
        api : unused
            Kept only for signature compatibility; the tweepy API object is
            always rebuilt from ``keys_dict``.
        result_limit : int
            Tweets requested per page (Twitter caps a single page at 200).
        """
        self.twitter_keys = keys_dict

        auth = tw.OAuthHandler(keys_dict['consumer_key'], keys_dict['consumer_secret'])
        auth.set_access_token(keys_dict['access_token_key'], keys_dict['access_token_secret'])

        self.api = tw.API(auth)
        # (dropped the duplicate self.twitter_keys assignment)
        self.result_limit = result_limit

    def mine_user_tweets(self, user,
                         mine_rewteets=False,  # unused; name kept (incl. typo) for compatibility
                         max_pages=5):
        """Page backwards through *user*'s timeline and return mined tweets.

        Parameters
        ----------
        user : str
            Screen name of the account to mine.
        mine_rewteets : bool
            Unused; retained so existing callers keep working.
        max_pages : int
            Maximum number of pages of ``self.result_limit`` tweets to fetch.

        Returns
        -------
        list of dict
            One dict per tweet with id, author, text, hashtag, retweet and
            quote information.
        """
        data = []
        last_tweet_id = False
        page = 1

        while page <= max_pages:
            # max_id pagination: request tweets strictly older than the
            # last tweet seen on the previous page.
            if last_tweet_id:
                statuses = self.api.user_timeline(screen_name=user,
                                                  count=self.result_limit,
                                                  max_id=last_tweet_id - 1,
                                                  tweet_mode='extended',
                                                  include_retweets=True)
            else:
                statuses = self.api.user_timeline(screen_name=user,
                                                  count=self.result_limit,
                                                  tweet_mode='extended',
                                                  include_retweets=True)

            # Stop once the timeline is exhausted (the ~3200-tweet API cap,
            # or simply no more tweets) instead of issuing empty requests.
            if not statuses:
                break

            for item in statuses:
                mined = {
                    'tweet_id':        item.id,
                    'name':            item.user.name,
                    'screen_name':     item.user.screen_name,
                    'retweet_count':   item.retweet_count,
                    'text':            item.full_text,
                    'mined_at':        datetime.datetime.now(),
                    'created_at':      item.created_at,
                    #'time_zone':        item._json['time_zone'],
                    'favourite_count': item.favorite_count,
                    'hashtags':        extract_hashtags(item.entities['hashtags']),
                    #'links':           extract_
                    'status_count':    item.user.statuses_count,
                    'location':        item.place,
                    'source_device':   item.source
                }

                # Only the absence of the attribute is expected here, so
                # catch AttributeError instead of a bare except.
                try:
                    mined['retweet_text'] = item.retweeted_status.full_text
                except AttributeError:
                    mined['retweet_text'] = 'None'
                try:
                    mined['quote_text'] = item.quoted_status.full_text
                    # BUG FIX: the original referenced the undefined name
                    # `status`, raising NameError (swallowed by the bare
                    # except) so quote fields were ALWAYS 'None'.
                    mined['quote_screen_name'] = item.quoted_status.user.screen_name
                except AttributeError:
                    mined['quote_text'] = 'None'
                    mined['quote_screen_name'] = 'None'

                last_tweet_id = item.id
                data.append(mined)

            page += 1

        return data
    
# result_limit * max_pages is the requested tweet count per id, but the
# standard user_timeline API is hard-capped at ~3200 recent tweets per user.
miner = TweetMiner(result_limit=460)  # 200
fetched = 0    # successful fetches; decides concat vs first assignment
attempts = 0   # ids attempted; drives the periodic rate-limit pause
final_df = pd.DataFrame()  # safe default if every fetch fails
for id in ids:
    attempts += 1
    try:
        print("Fetching tweets of " + id + " now...")
        mined_tweets = miner.mine_user_tweets(user=id, max_pages=460)  # 100
        mined_tweets_df = pd.DataFrame(mined_tweets)
    except Exception:
        # Narrowed from a bare except; invalid/protected accounts land here.
        print(id, 'is invalid or locked')
        # BUG FIX: the original fell through after a failure and either
        # crashed (mined_tweets_df unbound on the first id) or re-appended
        # the PREVIOUS user's dataframe, duplicating its rows.
        mined_tweets_df = None
    if mined_tweets_df is not None:
        if fetched > 0:
            final_df = pd.concat([final_df, mined_tweets_df], ignore_index=True)
        else:
            final_df = mined_tweets_df
        print("Fetched and added!")
        fetched += 1
    # Pause every 40 attempted ids (success or not) to respect rate limits;
    # the original only counted successes and printed a misleading message.
    if attempts % 40 == 0:  # 5
        print("Sleeping for 15 mins to respect rate limits")
        time.sleep(900)  # 15 minute sleep time

print(final_df)
final_df.to_csv('tweets.csv', encoding='UTF-8')

ids 中每个用户理论上应返回 460*460 = 211,600 条推文,但实际上每个 ID 总共只返回约 3200 条推文。这个限制是 API 内置的硬性限制吗?如果是,有没有办法为每个用户获取超过 3200 条推文?

这是 Twitter API 内置的限制。user_timeline 端点最多只能返回每个用户最近的 3200 条推文(每"页"最多 200 条)。要检索更早的推文,您需要使用高级(Premium)或企业(Enterprise)级别的完整档案搜索 API(full-archive search)。