获取页面中的所有文本且未抓取任何页面
Getting all text from a page and no pages crawled
我创建了一个抓取网页特定元素的抓取工具。该网站提供了进入网页中所有artists
的选项,因此我可以直接从该页面获取所有artists
,因为该网站没有提供'next-page' href。我的问题是,当我将所有网站加载到请求中时,它什么也不抓取,但是当我减少网页列表时,它将开始抓取页面。关于导致此问题的原因有什么想法吗?
此外,我想从歌曲页面中获取所有歌词。但是,有些歌词在 a
标签之间隔开,而其他歌词是单个字符串。但是,有时即使我直接单击 url 网页上有歌词,我也看不到歌词。我怎样才能抓取所有文本并获取所有歌曲的歌词?如果我包括以下内容:
.//pre[@id='lyric-body-text']//a//text()
它仍然只抓取 a
标签下的第一行文本。
这是我的抓取工具的示例:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One scraped song; every field keeps only the first collected value."""
    # TakeFirst is stateless, so a single shared instance serves all fields.
    _first = TakeFirst()
    artists = Field(output_processor=_first)
    songs = Field(output_processor=_first)
    duration = Field(output_processor=_first)
    album = Field(output_processor=_first)
    year = Field(output_processor=_first)
    lyrics = Field(output_processor=_first)
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com artist indexes, then each artist page, then song lyrics."""
    name = 'lyrical'

    # '0' is the index page for artists whose names start with a digit/symbol.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are exact; 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    # NOTE: the original start_requests() only replicated Scrapy's default
    # behavior (request each start_url with self.parse), so it was removed.
    def parse(self, response):
        """Follow every artist link on an index page, carrying a loader along."""
        for row in response.xpath("//table[@class='tdata']//tbody//tr"):
            loader = ItemLoader(LyricalItem(), selector=row)
            loader.add_xpath('artists', '(.//a)[position() mod 2 = 1]//text()')
            link = row.xpath("(.//a)[position() mod 2 = 1]//@href").get()
            if not link:
                # Row without an artist link; response.follow(None) would raise.
                continue
            yield response.follow(
                url=response.urljoin(link),
                callback=self.parse_artists,
                cb_kwargs={'loader': loader},
            )

    def parse_artists(self, response, loader):
        """Collect album/year/song metadata, then follow the lyrics link.

        BUG FIX: ItemLoader.add_value() returns None, so the original
        `if loader.add_value(...)` condition was always falsy and the
        fallback strings were emitted unconditionally. The XPath value is
        now extracted first and the fallback used only when it is missing.
        """
        album = response.xpath('(//h3[@class="artist-album-label"])//a//text()').get()
        loader.add_value('album', album if album else 'Unknown Album')  # typo "Unkown" fixed
        year = response.xpath('(//h3[@class="artist-album-label"])//span//text()').get()
        loader.add_value('year', year if year else 'Unknown Year')
        loader.add_value('songs', response.xpath("(.//td)[position() mod 2=1]//text()").get())
        loader.add_value('duration', response.xpath("(.//td)[position() mod 2=0]/text()").get())

        lyrics_link = response.xpath("(.//td)[position() mod 2=1]//@href").get()
        if lyrics_link:
            # BUG FIX: the original yielded the (lyric-less) item here AND
            # again in get_lyrical, producing two records per song. Defer
            # the yield until the lyrics are attached.
            yield response.follow(
                url=response.urljoin(lyrics_link),
                callback=self.get_lyrical,
                cb_kwargs={'loader': loader},
            )
        else:
            yield loader.load_item()

    def get_lyrical(self, response, loader):
        """Attach the lyric text and emit the finished item."""
        loader.add_value('lyrics', response.xpath(".//pre[@id='lyric-body-text']//text()").get())
        yield loader.load_item()
# Run the spider in-process and export one JSON-Lines record per item.
# FIX: FEED_URI / FEED_FORMAT were deprecated in Scrapy 2.1; the FEEDS
# mapping is the supported replacement and expresses the same export.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {
            'artists.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(LyricalSpider)
process.start()
您的代码有很多冗余片段。我已经删除了冗余代码,并实现了您提出的抓取全部歌词的需求。此外,所有信息都可以在歌词页面上找到,因此无需在请求之间传递 loader(加载器),您可以直接从歌词页面抓取所有信息。
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, Join
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One song scraped from a lyric page.

    Only `lyrics` declares an output processor: the individual stripped
    lines are joined back into a single space-separated string.
    """
    artist = Field()
    song = Field()
    duration = Field()
    album = Field()
    year = Field()
    lyrics = Field(output_processor=Join(" "))
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com: artist indexes -> artist pages -> per-song lyric pages.

    All item data is available on the lyric page itself, so no loader is
    passed between callbacks.
    """
    name = 'lyrical'

    # '0' is the index page for artists whose names start with a digit/symbol.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are exact; 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    def parse(self, response):
        """Follow every artist link on an index page."""
        # BUG FIX: removed the leftover [:2] debug slice that limited the
        # crawl to the first two artists of each index page.
        for artist in response.xpath("//table[@class='tdata']//tr/td/strong/a"):
            yield response.follow(artist, callback=self.parse_artists)

    def parse_artists(self, response):
        """Follow every song link on an artist page."""
        for song in response.xpath("//table[@class='tdata']/tbody/tr/td/strong/a"):
            yield response.follow(song, callback=self.get_lyrical)

    def get_lyrical(self, response):
        """Build one complete item from a lyric page."""
        loader = ItemLoader(LyricalItem(), response=response)
        loader.default_output_processor = TakeFirst()
        # TakeFirst keeps the first non-empty value, so each literal below
        # acts as a fallback when the preceding XPath matched nothing.
        loader.add_xpath("album", "//h4[contains(text(),'more tracks from the album')]/following-sibling::h3/a/text()")
        loader.add_value("album", "Unknown album")   # BUG FIX: typo "Uknown"
        loader.add_value("artist", response.xpath("normalize-space(//h3[@class='lyric-artist'])").get())
        loader.add_value("artist", "Unknown artist")  # BUG FIX: typo "Uknown"
        loader.add_xpath("song", "//h1[@id='lyric-title-text']/text()")
        loader.add_xpath("year", "//dt[contains(text(),'Year:')]/following-sibling::dd/a/text()")
        loader.add_xpath("duration", "//dt[i[@title='Duration']]/following-sibling::dd/text()")
        # Collect every text node inside the lyric body (lyrics may be split
        # across <a> tags); Join(" ") on the field reassembles them.
        for line in response.xpath("//*[@id='lyric-body-text']/descendant-or-self::*/text()").getall():
            loader.add_value('lyrics', line.strip())
        yield loader.load_item()
# Run the spider in-process, exporting one JSON-Lines record per song.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {'songs.jl': {'format': 'jsonlines'}},
    },
)
process.crawl(LyricalSpider)
process.start()
我创建了一个抓取网页特定元素的抓取工具。该网站提供了进入网页中所有artists
的选项,因此我可以直接从该页面获取所有artists
,因为该网站没有提供'next-page' href。我的问题是,当我将所有网站加载到请求中时,它什么也不抓取,但是当我减少网页列表时,它将开始抓取页面。关于导致此问题的原因有什么想法吗?
此外,我想从歌曲页面中获取所有歌词。但是,有些歌词在 a
标签之间隔开,而其他歌词是单个字符串。但是,有时即使我直接单击 url 网页上有歌词,我也看不到歌词。我怎样才能抓取所有文本并获取所有歌曲的歌词?如果我包括以下内容:
.//pre[@id='lyric-body-text']//a//text()
它仍然只抓取 a
标签下的第一行文本。
这是我的抓取工具的示例:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One scraped song; every field keeps only the first collected value."""
    # TakeFirst is stateless, so a single shared instance serves all fields.
    _first = TakeFirst()
    artists = Field(output_processor=_first)
    songs = Field(output_processor=_first)
    duration = Field(output_processor=_first)
    album = Field(output_processor=_first)
    year = Field(output_processor=_first)
    lyrics = Field(output_processor=_first)
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com artist indexes, then each artist page, then song lyrics."""
    name = 'lyrical'

    # '0' is the index page for artists whose names start with a digit/symbol.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are exact; 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    # NOTE: the original start_requests() only replicated Scrapy's default
    # behavior (request each start_url with self.parse), so it was removed.
    def parse(self, response):
        """Follow every artist link on an index page, carrying a loader along."""
        for row in response.xpath("//table[@class='tdata']//tbody//tr"):
            loader = ItemLoader(LyricalItem(), selector=row)
            loader.add_xpath('artists', '(.//a)[position() mod 2 = 1]//text()')
            link = row.xpath("(.//a)[position() mod 2 = 1]//@href").get()
            if not link:
                # Row without an artist link; response.follow(None) would raise.
                continue
            yield response.follow(
                url=response.urljoin(link),
                callback=self.parse_artists,
                cb_kwargs={'loader': loader},
            )

    def parse_artists(self, response, loader):
        """Collect album/year/song metadata, then follow the lyrics link.

        BUG FIX: ItemLoader.add_value() returns None, so the original
        `if loader.add_value(...)` condition was always falsy and the
        fallback strings were emitted unconditionally. The XPath value is
        now extracted first and the fallback used only when it is missing.
        """
        album = response.xpath('(//h3[@class="artist-album-label"])//a//text()').get()
        loader.add_value('album', album if album else 'Unknown Album')  # typo "Unkown" fixed
        year = response.xpath('(//h3[@class="artist-album-label"])//span//text()').get()
        loader.add_value('year', year if year else 'Unknown Year')
        loader.add_value('songs', response.xpath("(.//td)[position() mod 2=1]//text()").get())
        loader.add_value('duration', response.xpath("(.//td)[position() mod 2=0]/text()").get())

        lyrics_link = response.xpath("(.//td)[position() mod 2=1]//@href").get()
        if lyrics_link:
            # BUG FIX: the original yielded the (lyric-less) item here AND
            # again in get_lyrical, producing two records per song. Defer
            # the yield until the lyrics are attached.
            yield response.follow(
                url=response.urljoin(lyrics_link),
                callback=self.get_lyrical,
                cb_kwargs={'loader': loader},
            )
        else:
            yield loader.load_item()

    def get_lyrical(self, response, loader):
        """Attach the lyric text and emit the finished item."""
        loader.add_value('lyrics', response.xpath(".//pre[@id='lyric-body-text']//text()").get())
        yield loader.load_item()
# Run the spider in-process and export one JSON-Lines record per item.
# FIX: FEED_URI / FEED_FORMAT were deprecated in Scrapy 2.1; the FEEDS
# mapping is the supported replacement and expresses the same export.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {
            'artists.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(LyricalSpider)
process.start()
您的代码有很多冗余片段。我已经删除了冗余代码,并实现了您提出的抓取全部歌词的需求。此外,所有信息都可以在歌词页面上找到,因此无需在请求之间传递 loader(加载器),您可以直接从歌词页面抓取所有信息。
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, Join
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
class LyricalItem(scrapy.Item):
    """One song scraped from a lyric page.

    Only `lyrics` declares an output processor: the individual stripped
    lines are joined back into a single space-separated string.
    """
    artist = Field()
    song = Field()
    duration = Field()
    album = Field()
    year = Field()
    lyrics = Field(output_processor=Join(" "))
class LyricalSpider(scrapy.Spider):
    """Crawl lyrics.com: artist indexes -> artist pages -> per-song lyric pages.

    All item data is available on the lyric page itself, so no loader is
    passed between callbacks.
    """
    name = 'lyrical'

    # '0' is the index page for artists whose names start with a digit/symbol.
    artists = [0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
               'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    start_urls = [f'https://www.lyrics.com/artists/{art}/99999' for art in artists]

    custom_settings = {
        # BUG FIX: Scrapy setting names are exact; 'User_Agent' was silently ignored.
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
    }

    def parse(self, response):
        """Follow every artist link on an index page."""
        # BUG FIX: removed the leftover [:2] debug slice that limited the
        # crawl to the first two artists of each index page.
        for artist in response.xpath("//table[@class='tdata']//tr/td/strong/a"):
            yield response.follow(artist, callback=self.parse_artists)

    def parse_artists(self, response):
        """Follow every song link on an artist page."""
        for song in response.xpath("//table[@class='tdata']/tbody/tr/td/strong/a"):
            yield response.follow(song, callback=self.get_lyrical)

    def get_lyrical(self, response):
        """Build one complete item from a lyric page."""
        loader = ItemLoader(LyricalItem(), response=response)
        loader.default_output_processor = TakeFirst()
        # TakeFirst keeps the first non-empty value, so each literal below
        # acts as a fallback when the preceding XPath matched nothing.
        loader.add_xpath("album", "//h4[contains(text(),'more tracks from the album')]/following-sibling::h3/a/text()")
        loader.add_value("album", "Unknown album")   # BUG FIX: typo "Uknown"
        loader.add_value("artist", response.xpath("normalize-space(//h3[@class='lyric-artist'])").get())
        loader.add_value("artist", "Unknown artist")  # BUG FIX: typo "Uknown"
        loader.add_xpath("song", "//h1[@id='lyric-title-text']/text()")
        loader.add_xpath("year", "//dt[contains(text(),'Year:')]/following-sibling::dd/a/text()")
        loader.add_xpath("duration", "//dt[i[@title='Duration']]/following-sibling::dd/text()")
        # Collect every text node inside the lyric body (lyrics may be split
        # across <a> tags); Join(" ") on the field reassembles them.
        for line in response.xpath("//*[@id='lyric-body-text']/descendant-or-self::*/text()").getall():
            loader.add_value('lyrics', line.strip())
        yield loader.load_item()
# Run the spider in-process, exporting one JSON-Lines record per song.
process = CrawlerProcess(
    settings={
        # 'CONCURRENT_REQUESTS': 64,
        'FEEDS': {'songs.jl': {'format': 'jsonlines'}},
    },
)
process.crawl(LyricalSpider)
process.start()