从网站抓取的评论出现重复

Reviews from site duplicate

我正在从网站上抓取评论,但抓取到的评论总是重复。我想去掉这些重复项;我怀疑问题出在我的 xpath 上,但一直没能解决。

这是我尝试过的方法:

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd


class CruisesItems(scrapy.Item):
    """Fields collected for a single cruise review.

    ``TakeFirst`` keeps only the first non-empty extracted value of a field,
    while ``Join`` concatenates every extracted text fragment into one string.
    """

    user_rating = Field(output_processor=TakeFirst())
    user = Field(output_processor=TakeFirst())
    location = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    reviews = Field(output_processor=Join())


class CruisesSpider(scrapy.Spider):
    """Spider that requests one TripAdvisor cruise page and yields review items."""

    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for page_url in self.start_urls:
            yield scrapy.Request(page_url, callback=self.parse)

    def parse(self, response):
        """Yield one loaded ``CruisesItems`` per node matched by the container XPath."""
        # (field name, XPath) pairs, applied to each loader in this order.
        field_queries = (
            ('user_rating', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='emWez F1']/span/@class"),
            ('user', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]"),
            ('location', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]"),
            ('title', ".//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='fpMxB MC _S b S6 H5 _a']//text()"),
            ('reviews', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='pIRBV _T']//span//text()"),
        )
        review_nodes = response.xpath("//div[@class='ui_column is-12-desktop is-12-tablet is-12-mobile cEMcR']/div[2]//div")
        for node in review_nodes:
            loader = ItemLoader(CruisesItems(), selector=node)
            for field_name, query in field_queries:
                loader.add_xpath(field_name, query)
            yield loader.load_item()
        
# Run the spider from this script; scraped items are exported to
# cruise_reviews.jl in JSON Lines format via the FEEDS setting.
process = CrawlerProcess(
    settings={'FEEDS': {'cruise_reviews.jl': {'format': 'jsonlines'}}},
)
process.crawl(CruisesSpider)
process.start()

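您需要使用相对 xpath。以 // 开头的表达式总是从整个文档的根节点开始查找,即使是在某个子选择器上调用也一样,所以每次循环都会匹配到页面上的所有评论,从而产生重复。在表达式前加一个点(例如 .//div[...])才会只在当前节点内部查找。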

import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd


class CruisesItems(scrapy.Item):
    """Item holding the data extracted from one cruise review.

    All fields except ``reviews`` use ``TakeFirst`` (first non-empty value);
    ``reviews`` uses ``Join`` to merge its text fragments into one string.
    """

    user_rating = Field(output_processor=TakeFirst())
    user = Field(output_processor=TakeFirst())
    location = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    reviews = Field(output_processor=Join())


class CruisesSpider(scrapy.Spider):
    """Spider that scrapes the reviews of one TripAdvisor cruise page.

    Each review card is selected once as a container node, and every field is
    then extracted with an XPath relative to that node so that values from
    other reviews cannot leak into the item.
    """

    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        """Yield one ``CruisesItems`` per review card found in the response."""
        # The review-card div is the container itself, so the per-field
        # expressions below do not need to repeat it.
        container = response.xpath('//div[@id="ship_reviews"]//div[@class="eVykL Gi z cPeBe MD cwpFC"]')
        for reviews in container:
            loader = ItemLoader(CruisesItems(), selector=reviews)
            # Every expression starts with "." so it is evaluated relative to
            # the current review card; a bare "//" would search the whole
            # document and return the same values for every item.
            loader.add_xpath('user_rating', ".//div[@class='emWez F1']/span/@class")
            loader.add_xpath('user', "(.//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]")
            loader.add_xpath('location', "(.//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]")
            loader.add_xpath('title', ".//div[@class='fpMxB MC _S b S6 H5 _a']//text()")
            loader.add_xpath('reviews', ".//div[@class='pIRBV _T']//span//text()")
            yield loader.load_item()


# Launch the crawl from this script; the FEEDS setting writes every scraped
# item to cruise_reviews.jl as JSON Lines.
process = CrawlerProcess(
    settings={'FEEDS': {'cruise_reviews.jl': {'format': 'jsonlines'}}},
)
process.crawl(CruisesSpider)
process.start()

请注意,我把 //div[@class="eVykL Gi z cPeBe MD cwpFC"] 移到了容器(container)的 xpath 中,因为原来每个字段的 xpath 都以它开头;这样每个容器节点正好对应一条评论。