在循环中抓取许多 url 时的抓取行为

Question

我正在抓取一个网站并从一个页面中提取 url 而不是从提取的页面中提取 url 但是当我运行循环 1 个日期即只有一次并打破它时数据被有效地提取并且是正确的但是当有一个 8 个月的日期循环，我注意到日期正在连续移动一个数字，但尚未提取数据。我的问题是 scrapy 本身的这种行为或者我的代码有问题。我这是一种草率的做事方式，我的数据是否正确和完整？如果我的代码中有错误，那是什么。密码是

import scrapy, os, time#, json
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MySpider(scrapy.Spider):
    name = "timeline"

    def start_requests(self):
        dates = ['2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17', '2021-02-18', '2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23', '2021-02-24', '2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28', '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05', '2021-03-06', '2021-03-07', '2021-03-08', '2021-03-09', '2021-03-10', '2021-03-11', '2021-03-12', '2021-03-13', '2021-03-14', '2021-03-15', '2021-03-16', '2021-03-17', '2021-03-18', '2021-03-19', '2021-03-20', '2021-03-21', '2021-03-22', '2021-03-23', '2021-03-24', '2021-03-25', '2021-03-26', '2021-03-27', '2021-03-28', '2021-03-29', '2021-03-30', '2021-03-31', '2021-04-01', '2021-04-02', '2021-04-03', '2021-04-04', '2021-04-05', '2021-04-06', '2021-04-07', '2021-04-08', '2021-04-09', '2021-04-10', '2021-04-11', '2021-04-12', '2021-04-13', 
        '2021-04-14', '2021-04-15', '2021-04-16', '2021-04-17', '2021-04-18', '2021-04-19', '2021-04-20', '2021-04-21', '2021-04-22', '2021-04-23', '2021-04-24', '2021-04-25', '2021-04-26', '2021-04-27', '2021-04-28', '2021-04-29', '2021-04-30', '2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-05-06', '2021-05-07', '2021-05-08', '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12', '2021-05-13', '2021-05-14', '2021-05-15', '2021-05-16', '2021-05-17', '2021-05-18', '2021-05-19', '2021-05-20', '2021-05-21', '2021-05-22', '2021-05-23', '2021-05-24', '2021-05-25', '2021-05-26', '2021-05-27', '2021-05-28', '2021-05-29', '2021-05-30', '2021-05-31', '2021-06-01', '2021-06-02', '2021-06-03', '2021-06-04', '2021-06-05', '2021-06-06', '2021-06-07', '2021-06-08', '2021-06-09', '2021-06-10', '2021-06-11', '2021-06-12', '2021-06-13', '2021-06-14', '2021-06-15', '2021-06-16', '2021-06-17', '2021-06-18', '2021-06-19', '2021-06-20', '2021-06-21', '2021-06-22', '2021-06-23', '2021-06-24', '2021-06-25', '2021-06-26', '2021-06-27', '2021-06-28', '2021-06-29', '2021-06-30', '2021-07-01', '2021-07-02', '2021-07-03', '2021-07-04', '2021-07-05', '2021-07-06', '2021-07-07', '2021-07-08', '2021-07-09', '2021-07-10', '2021-07-11', '2021-07-12', '2021-07-13', '2021-07-14', '2021-07-15', '2021-07-16', '2021-07-17', '2021-07-18', '2021-07-19', '2021-07-20', '2021-07-21', '2021-07-22', '2021-07-23', '2021-07-24', '2021-07-25', '2021-07-26', '2021-07-27', '2021-07-28', '2021-07-29', '2021-07-30', '2021-07-31', '2021-08-01', '2021-08-02', 
        '2021-08-03', '2021-08-04', '2021-08-05', '2021-08-06', '2021-08-07', '2021-08-08', '2021-08-09', '2021-08-10', '2021-08-11', '2021-08-12', '2021-08-13', '2021-08-14', '2021-08-15', '2021-08-16', '2021-08-17', '2021-08-18', '2021-08-19', '2021-08-20', '2021-08-21', '2021-08-22', '2021-08-23', '2021-08-24', '2021-08-25', '2021-08-26', '2021-08-27', '2021-08-28', '2021-08-29', '2021-08-30', '2021-08-31', '2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04', '2021-09-05', '2021-09-06', '2021-09-07', '2021-09-08', '2021-09-09', '2021-09-10', '2021-09-11', '2021-09-12', '2021-09-13', '2021-09-14', '2021-09-15', '2021-09-16', '2021-09-17', '2021-09-18', '2021-09-19', '2021-09-20', '2021-09-21', '2021-09-22', '2021-09-23', '2021-09-24', '2021-09-25', '2021-09-26', '2021-09-27', '2021-09-28', '2021-09-29', '2021-09-30']
        a_file = open("C:\Users\Roshaan\Desktop\scrapy\bookstoscrape\bookstoscrape\spiders\file.txt", "r")
        list_of_lines = a_file.readlines()
        global done_date, done_player, done_times
        done_date = eval(list_of_lines[0])#empty lists
        done_times = eval(list_of_lines[1])
        done_player = eval(list_of_lines[2])
        a_file.close()
        global player_urls
        player_urls = set()
        while set(dates) != set(done_date):
            try:
                for date in dates:
                    if date in done_date:
                        continue
                    global dateG
                    dateG = date
                    url = 'https://greyhoundbet.racingpost.com/results/blocks.sd?r_date=' + date + '&blocks=header%2Cmeetings'
                    yield scrapy.Request(url=url, callback=self.parse)
                    done_date.append(date)
                    a_file = open("C:\Users\Roshaan\Desktop\scrapy\bookstoscrape\bookstoscrape\spiders\file.txt", "r")
                    list_of_lines = a_file.readlines()
                    list_of_lines[0] = str(done_date) + "\n"
                    a_file = open("C:\Users\Roshaan\Desktop\scrapy\bookstoscrape\bookstoscrape\spiders\file.txt", "w")
                    a_file.writelines(list_of_lines)
                    a_file.close()
                print(player_urls)
                with open('links.txt', 'w') as f:
                    f.write(player_urls)

            except Exception as e:
                print(player_urls)
                time.sleep(30)

    def parse(self, response):
        print('parsing')
        json_data = response.json()
        global timelines_url, raceids
        timelines_url = []
        raceids = []
        
        for data in json_data['meetings']['tracks']['2']['races']:
            a = 1
            for i in data['races']:
                if int(i['trackId']) == 4:
                    if a == 1:
                        lstt = i['rTime']
                        time = lstt.split(' ')
                        time = time[1].replace(":","%3A")
                    a = a + 1
                    timelines_url.append("https://greyhoundbet.racingpost.com/results/blocks.sd?race_id=" + i['raceId'] + "&track_id=4&r_date="+ dateG +"&r_time="+ time +"&blocks=meetingHeader%2Cresults-meeting-pager%2Clist")
                    raceids.append(i['raceId'])
            if  a == 1:
                for data in json_data['meetings']['tracks']['1']['races']:
                    a = 1
                    for i in data['races']:
                        if int(i['trackId']) == 4:
                            if a == 1:
                                lstt = i['rTime']
                                time = lstt.split(' ')
                                time = time[1].replace(":","%3A")
                            a = a + 1
                            timelines_url.append("https://greyhoundbet.racingpost.com/results/blocks.sd?race_id=" + i['raceId'] + "&track_id=4&r_date="+ dateG +"&r_time="+ time +"&blocks=meetingHeader%2Cresults-meeting-pager%2Clist")
                            raceids.append(i['raceId'])

        for t in timelines_url:
            yield scrapy.Request(t, self.parse2)
    def parse2(self, response):
        print("parsing2")
        jsn_data = response.json()
        for i in range(len(raceids)):
            try:
                for datas in jsn_data['list']['track']['results'][raceids[i]]:
                    dog = datas['dogId']
                    if dog in done_player:
                        continue
                    tm = datas['msgTimeOff']                
                    tm = tm.split(' ')
                    player_urls.add("https://greyhoundbet.racingpost.com/#results-dog/race_id="+ raceids[i] +"&dog_id="+ dog +"&r_date="+ dateG +"&track_id=4&r_time=" + tm[1])

                    done_player.append(dog)
                    a_file = open("C:\Users\Roshaan\Desktop\scrapy\bookstoscrape\bookstoscrape\spiders\file.txt", "r")
                    list_of_lines = a_file.readlines()
                    list_of_lines[2] = str(done_player)
                    a_file = open("C:\Users\Roshaan\Desktop\scrapy\bookstoscrape\bookstoscrape\spiders\file.txt", "w")
                    a_file.writelines(list_of_lines)
                    a_file.close()
            except Exception as e:
                if e == "KeyError":
                    continue
                else:
                    i = i-1
                          
                

if __name__ == "__main__":
    spider = 'timeline'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()

我是 scrapy 的新手。提前致谢注意：我注意到一个新事物，日期是无序的，有时请求月份，例如第 4 个月正在处理，下一个时刻是第 3 个月

Answer 1

这是正常行为，因为 scrapy 默认是异步的。尝试将 concurrent_requests 设置为 1:

class MySpider(scrapy.Spider):
    name = "norgren"

    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }
...
...
...

在循环中抓取许多 url 时的抓取行为

Scrapy behaviour while scraping many urls in loop

python

scrapy