IndexError: list index out of range (on Reddit data crawler)

I expected the following to run without any problems.

My solution for the Reddit data:

    import requests
    import re
    import praw
    from datetime import date
    import csv
    import pandas as pd
    import time
    import sys

    class Crawler(object):
        '''
            basic_url is the reddit site.
            headers is for requests.get method
            REX is to find submission ids.
        '''
        def __init__(self, subreddit="apple"):
            '''
                Initialize a Crawler object.
                    subreddit is the topic you want to parse. default is r"apple"
                basic_url is the reddit site.
                headers is for requests.get method
                REX is to find submission ids.
                submission_ids save all the ids of submission you will parse.
                reddit is an object created using praw API. Please check it before you use.
            '''
            self.basic_url = "https://www.reddit.com"
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
            self.REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
            self.subreddit = subreddit
            self.submission_ids = []
            self.reddit = praw.Reddit(client_id="your_id", client_secret="your_secret", user_agent="subreddit_comments_crawler")

        def get_submission_ids(self, pages=2):
            '''
                Collect all ids of submissions.
                One page has 25 submissions.
                page url: https://www.reddit.com/r/subreddit/?count=25&after=t3_id
                    id (after) is the last submission from the last page.
            '''
    #         This is page url.
            url = self.basic_url + "/r/" + self.subreddit

            if pages <= 0:
                return []

            text = requests.get(url, headers=self.headers).text
            ids = self.REX.findall(text)
            ids = list(map(lambda x: x[-6:], ids))
            if pages == 1:
                self.submission_ids = ids
                return ids

            count = 0
            after = ids[-1]
            for i in range(1, pages):
                count += 25
                temp_url = self.basic_url + "/r/" + self.subreddit + "?count=" + str(count) + "&after=t3_" + ids[-1]
                text = requests.get(temp_url, headers=self.headers).text
                temp_list = self.REX.findall(text)
                temp_list = list(map(lambda x: x[-6:], temp_list))
                ids += temp_list
                if count % 100 == 0:
                    time.sleep(60)
            self.submission_ids = ids
            return ids

        def get_comments(self, submission):
            '''
                Submission is an object created using praw API.
            '''
    #         Remove all "more comments".
            submission.comments.replace_more(limit=None)
            comments = []
            for each in submission.comments.list():
                try:
                    comments.append((each.id, each.link_id[3:], each.author.name, date.fromtimestamp(each.created_utc).isoformat(), each.score, each.body) )
                except AttributeError as e: # Some comments are deleted, we cannot access them.
    #                 print(each.link_id, e)
                    continue
            return comments

        def save_comments_submissions(self, pages):
            '''
                1. Save all the ids of submissions.
                2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
                3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
                4. Separately, save them to two csv files.
                Note: You can link them with submission_id.
                Warning: According to the rules of the Reddit API, the get action should not be too frequent. To be safe, use the default time span in this crawler.
            '''

            print("Start to collect all submission ids...")
            self.get_submission_ids(pages)
            print("Start to collect comments...This may cost a long time depending on # of pages.")
            submission_url = self.basic_url + "/r/" + self.subreddit + "/comments/"
            comments = []
            submissions = []
            count = 0
            for idx in self.submission_ids:
                temp_url = submission_url + idx
                submission = self.reddit.submission(url=temp_url)
                submissions.append((submission.name[3:], submission.num_comments, submission.score, submission.subreddit_name_prefixed, date.fromtimestamp(submission.created_utc).isoformat(), submission.title, submission.selftext))
                temp_comments = self.get_comments(submission)
                comments += temp_comments
                count += 1
                print(str(count) + " submissions have got...")
                if count % 50 == 0:
                    time.sleep(60)
            comments_fieldnames = ["comment_id", "submission_id", "author_name", "post_time", "comment_score", "text"]
            df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
            df_comments.to_csv("comments.csv")
            submissions_fieldnames = ["submission_id", "num_of_comments", "submission_score", "submission_subreddit", "post_date", "submission_title", "text"]
            df_submission = pd.DataFrame(submissions, columns=submissions_fieldnames)
            df_submission.to_csv("submissions.csv")
            return df_comments


    if __name__ == "__main__":
        args = sys.argv[1:]
        if len(args) != 2:
            print("Wrong number of args...")
            exit()

        subreddit, pages = args
        c = Crawler(subreddit)
        c.save_comments_submissions(int(pages))

But I got:

    (base) UserAir:scrape_reddit user$ python reddit_crawler.py apple 2
    Start to collect all submission ids...
    Traceback (most recent call last):
      File "reddit_crawler.py", line 127, in <module>
        c.save_comments_submissions(int(pages))
      File "reddit_crawler.py", line 94, in save_comments_submissions
        self.get_submission_ids(pages)
      File "reddit_crawler.py", line 54, in get_submission_ids
        after = ids[-1]
    IndexError: list index out of range

When my_list[-1] throws an IndexError, it means that my_list is empty:

>>> ids = []
>>> ids[-1]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: list index out of range
>>> ids = ['1']
>>> ids[-1]
'1'
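
In your crawler, that means self.REX.findall(text) matched nothing in the page you fetched, so ids was empty by the time ids[-1] ran. If you want to confirm that, a quick standalone check along these lines will show whether the request succeeded and whether the markup matched at all (a sketch reusing the REX pattern and headers from your code; fetch_ids_or_explain is just an illustrative name):

import re

import requests

REX = re.compile(r"<div class=\" thing id-t3_[\w]+")
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}


def fetch_ids_or_explain(url):
    """Return the matched submission ids, printing a hint when nothing matches."""
    response = requests.get(url, headers=HEADERS)
    ids = [match[-6:] for match in REX.findall(response.text)]
    if not ids:
        # An empty result usually means the request was blocked or redirected,
        # or the page no longer contains the markup the regex expects.
        print("No ids matched; status:", response.status_code,
              "body length:", len(response.text))
    return ids


print(fetch_ids_or_explain("https://www.reddit.com/r/apple"))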

I'll leave it to you to diagnose the specific cause of this error, but more broadly, I think it comes from not taking full advantage of PRAW. Your script imports requests and makes a lot of manual requests for things PRAW already has methods for. The whole point of PRAW is to save you from having to write these requests to do things like paginate a listing, so I'd recommend taking advantage of it.

For example, your get_submission_ids function (which scrapes the web version of Reddit and handles paging) could be replaced by

def get_submission_ids(self, pages=2):
    return [
        submission.id
        for submission in self.reddit.subreddit(self.subreddit).hot(
            limit=25 * pages
        )
    ]

because the .hot() function does everything you were trying to do by hand.
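
The same subreddit object exposes Reddit's other listings as well, in case you ever want something other than the hot page; a minimal sketch, assuming PRAW 7's listing methods and the same placeholder credentials as in your Crawler.__init__:

import praw

reddit = praw.Reddit(
    client_id="your_id",
    client_secret="your_secret",
    user_agent="subreddit_comments_crawler",
)

subreddit = reddit.subreddit("apple")
# Each listing is a lazy generator of Submission objects; `limit` caps how
# many PRAW fetches, and PRAW does the pagination behind the scenes.
newest = list(subreddit.new(limit=50))
weekly_top = list(subreddit.top(time_filter="week", limit=50))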

I'd go a step further here and have the function just return a list of Submission objects, since the rest of your code ends up doing things that are better done by interacting with the PRAW Submission object. Here's that code (I've renamed the function to reflect its updated purpose):

def get_submissions(self, pages=2):
    return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

(I've updated this function to just return its result, since your version both returned the value and set it on self.submission_ids, unless pages was 0. That felt inconsistent, so I made it just return the value.)

Your get_comments function looks good.
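
One optional tweak, if you'd rather not lean on the AttributeError: PRAW sets a comment's author to None when the account has been deleted, so you could skip those with an explicit check instead; a sketch of the same method written that way:

def get_comments(self, submission):
    """Collect (id, submission_id, author, date, score, body) for each comment."""
    # Relies on `from datetime import date` at module level, as in your script.
    # Remove all "more comments" placeholders first.
    submission.comments.replace_more(limit=None)
    comments = []
    for each in submission.comments.list():
        if each.author is None:  # deleted accounts have no author to read
            continue
        comments.append(
            (
                each.id,
                each.link_id[3:],
                each.author.name,
                date.fromtimestamp(each.created_utc).isoformat(),
                each.score,
                each.body,
            )
        )
    return comments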

The save_comments_submissions function, like get_submission_ids, does a lot of manual work that PRAW can handle. You construct a temp_url with the full URL of a post and then use it to make a PRAW Submission object, but we can replace that by just using the ones returned by get_submissions. You also had some calls to time.sleep(), which I removed because PRAW automatically sleeps the appropriate amount for you (there's a small logging sketch after this function if you want to watch it do that). Finally, I removed the return value of this function, because the point of the function is to save data to disk, not to return it to anything else, and the rest of the script didn't use the return value. Here's the updated version of that function:

def save_comments_submissions(self, pages):
    """
        1. Save all the ids of submissions.
        2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
        3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
        4. Separately, save them to two csv files.
        Note: You can link them with submission_id.
        Warning: According to the rules of the Reddit API, the get action should not be too frequent. To be safe, use the default time span in this crawler.
    """

    print("Start to collect all submission ids...")
    submissions = self.get_submissions(pages)
    print(
        "Start to collect comments...This may cost a long time depending on # of pages."
    )
    comments = []
    pandas_submissions = []
    for count, submission in enumerate(submissions):
        pandas_submissions.append(
            (
                submission.name[3:],
                submission.num_comments,
                submission.score,
                submission.subreddit_name_prefixed,
                date.fromtimestamp(submission.created_utc).isoformat(),
                submission.title,
                submission.selftext,
            )
        )
        temp_comments = self.get_comments(submission)
        comments += temp_comments
        print(str(count) + " submissions have got...")

    comments_fieldnames = [
        "comment_id",
        "submission_id",
        "author_name",
        "post_time",
        "comment_score",
        "text",
    ]
    df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
    df_comments.to_csv("comments.csv")
    submissions_fieldnames = [
        "submission_id",
        "num_of_comments",
        "submission_score",
        "submission_subreddit",
        "post_date",
        "submission_title",
        "text",
    ]
    df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
    df_submission.to_csv("submissions.csv")
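
About the rate limiting: if you want to watch PRAW pace itself, its documented logging setup will print every request it makes (and any waits it inserts) to stderr. A minimal sketch you could drop at the top of the script:

import logging

# PRAW and its transport layer log to the "praw" and "prawcore" loggers;
# turning them up to DEBUG shows each request and any rate-limit sleeps.
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)
for name in ("praw", "prawcore"):
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)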

Here's an updated version of the whole script that uses PRAW fully:

from datetime import date
import sys


import pandas as pd
import praw


class Crawler:
    """
        basic_url is the reddit site.
        headers is for requests.get method
        REX is to find submission ids.
    """

    def __init__(self, subreddit="apple"):
        """
            Initialize a Crawler object.
                subreddit is the topic you want to parse. default is r"apple"
            basic_url is the reddit site.
            headers is for requests.get method
            REX is to find submission ids.
            submission_ids save all the ids of submission you will parse.
            reddit is an object created using praw API. Please check it before you use.
        """
        self.subreddit = subreddit
        self.submission_ids = []
        self.reddit = praw.Reddit(
            client_id="your_id",
            client_secret="your_secret",
            user_agent="subreddit_comments_crawler",
        )

    def get_submissions(self, pages=2):
        """
            Collect all submissions.
            One page has 25 submissions.
            page url: https://www.reddit.com/r/subreddit/?count=25&after=t3_id
                id (after) is the last submission from the last page.
        """
        return list(self.reddit.subreddit(self.subreddit).hot(limit=25 * pages))

    def get_comments(self, submission):
        """
            Submission is an object created using praw API.
        """
        #         Remove all "more comments".
        submission.comments.replace_more(limit=None)
        comments = []
        for each in submission.comments.list():
            try:
                comments.append(
                    (
                        each.id,
                        each.link_id[3:],
                        each.author.name,
                        date.fromtimestamp(each.created_utc).isoformat(),
                        each.score,
                        each.body,
                    )
                )
            except AttributeError as e:  # Some comments are deleted, we cannot access them.
                #                 print(each.link_id, e)
                continue
        return comments

    def save_comments_submissions(self, pages):
        """
            1. Save all the ids of submissions.
            2. For each submission, save information of this submission. (submission_id, #comments, score, subreddit, date, title, body_text)
            3. Save comments in this submission. (comment_id, submission_id, author, date, score, body_text)
            4. Separately, save them to two csv files.
            Note: You can link them with submission_id.
            Warning: According to the rules of the Reddit API, the get action should not be too frequent. To be safe, use the default time span in this crawler.
        """

        print("Start to collect all submission ids...")
        submissions = self.get_submissions(pages)
        print(
            "Start to collect comments...This may cost a long time depending on # of pages."
        )
        comments = []
        pandas_submissions = []
        for count, submission in enumerate(submissions):
            pandas_submissions.append(
                (
                    submission.name[3:],
                    submission.num_comments,
                    submission.score,
                    submission.subreddit_name_prefixed,
                    date.fromtimestamp(submission.created_utc).isoformat(),
                    submission.title,
                    submission.selftext,
                )
            )
            temp_comments = self.get_comments(submission)
            comments += temp_comments
            print(str(count) + " submissions have got...")

        comments_fieldnames = [
            "comment_id",
            "submission_id",
            "author_name",
            "post_time",
            "comment_score",
            "text",
        ]
        df_comments = pd.DataFrame(comments, columns=comments_fieldnames)
        df_comments.to_csv("comments.csv")
        submissions_fieldnames = [
            "submission_id",
            "num_of_comments",
            "submission_score",
            "submission_subreddit",
            "post_date",
            "submission_title",
            "text",
        ]
        df_submission = pd.DataFrame(pandas_submissions, columns=submissions_fieldnames)
        df_submission.to_csv("submissions.csv")


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 2:
        print("Wrong number of args...")
        exit()

    subreddit, pages = args
    c = Crawler(subreddit)
    c.save_comments_submissions(int(pages))
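
Once the script has run, the docstring's note about linking the two files by submission_id comes down to a single pandas merge; a sketch, using the file names the script writes:

import pandas as pd

# Join each comment to its parent submission on the shared submission_id column.
comments = pd.read_csv("comments.csv")
submissions = pd.read_csv("submissions.csv")
joined = comments.merge(submissions, on="submission_id", suffixes=("_comment", "_submission"))
print(joined.head())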

I realize my answer here strays into Code Review territory, but I hope it helps show some of what PRAW can do. Using the pre-existing library code avoids your "list index out of range" error entirely, so I'd consider that the solution to your problem.