如何在抓取网页时从输出中删除 \n?
How to strip \n from output while scraping webpage?
我正在抓取一个网页,得到的结果基本都正常,只有 card_name(卡名)这一列例外:每个卡名前面都多了一个 \n。我该如何避免把它输出出来?
# Scraping
def parse(self, response):
    """Yield the shared item once per ``tr[class^=deckdbbody]`` row.

    A row without ``a.card_popup`` text reuses the card name of the most
    recent row that had one. The name is stored exactly as extracted, so a
    leading newline in the link text ends up in the output.
    """
    item = GameItem()
    item["Category"] = response.css("span.titletext::text").extract()
    for row in response.css("tr[class^=deckdbbody]"):
        name = row.css("a.card_popup::text").extract_first()
        if name is None:
            # No card link in this row: fall back to the last name seen.
            name = saved_name
        else:
            saved_name = name
        item["card_name"] = name
        item["Condition"] = row.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = row.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = row.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item
示例输出
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAether Membrane", "Condition": "NM/M", "stock": "93", "Price": "$0.59"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAether Membrane", "Condition": "PL", "stock": "59", "Price": "$0.49"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAngelic Shield", "Condition": "NM/M", "stock": "35", "Price": "$0.25"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAnger", "Condition": "NM/M", "stock": "9", "Price": "$1.49"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAnger", "Condition": "PL", "stock": "49", "Price": "$1.19"},
内置的字符串方法 strip()(即 str.strip())会删除字符串开头和结尾的空白字符,其中包括换行符 \n。
这应该可以做到。
# Scraping
def parse(self, response):
    """Yield one item per ``tr[class^=deckdbbody]`` row with a clean card name.

    A row without ``a.card_popup`` text reuses the card name of the most
    recent row that had one. The name is stripped of leading and trailing
    whitespace before it is stored, so the newline in front of the link
    text no longer reaches the output.
    """
    category = response.css("span.titletext::text").extract()
    # Start from an empty name so a first row without a card link yields ""
    # instead of raising UnboundLocalError.
    saved_name = ""
    for game in response.css("tr[class^=deckdbbody]"):
        card_name = game.css("a.card_popup::text").extract_first()
        # strip added here -> the stripped value is what gets stored on the
        # item, both for this row and for later rows that reuse the name.
        if card_name is not None:
            saved_name = card_name.strip()
        # Build a fresh item per row: re-yielding one mutated object would
        # make every reference a consumer holds show the last row's values.
        item = GameItem()
        item["Category"] = category
        item["card_name"] = saved_name
        item["Condition"] = game.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = game.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = game.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item
另外,顺便给出一个与本问题无关的代码简化建议:
# Scraping
def parse(self, response):
    """Yield one item per ``tr[class^=deckdbbody]`` row with a clean card name.

    A row whose ``a.card_popup`` text is missing or empty reuses the name
    of the most recent row that had one. The stored name is stripped of
    leading and trailing whitespace.
    """
    category = response.css("span.titletext::text").extract()
    # Declare saved_name beforehand to avoid a NameError when the first row
    # has no card link.
    saved_name = ""
    for game in response.css("tr[class^=deckdbbody]"):
        # `or` keeps the previous name when extract_first() gives None or "".
        saved_name = game.css("a.card_popup::text").extract_first() or saved_name
        # Build a fresh item per row: re-yielding one mutated object would
        # make every reference a consumer holds show the last row's values.
        item = GameItem()
        item["Category"] = category
        item["card_name"] = saved_name.strip()
        item["Condition"] = game.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = game.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = game.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item
我正在抓取一个网页,得到的结果基本都正常,只有 card_name(卡名)这一列例外:每个卡名前面都多了一个 \n。我该如何避免把它输出出来?
# Scraping
def parse(self, response):
    """Yield the shared item once per ``tr[class^=deckdbbody]`` row.

    A row without ``a.card_popup`` text reuses the card name of the most
    recent row that had one. The name is stored exactly as extracted, so a
    leading newline in the link text ends up in the output.
    """
    item = GameItem()
    item["Category"] = response.css("span.titletext::text").extract()
    for row in response.css("tr[class^=deckdbbody]"):
        name = row.css("a.card_popup::text").extract_first()
        if name is None:
            # No card link in this row: fall back to the last name seen.
            name = saved_name
        else:
            saved_name = name
        item["card_name"] = name
        item["Condition"] = row.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = row.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = row.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item
示例输出
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAether Membrane", "Condition": "NM/M", "stock": "93", "Price": "$0.59"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAether Membrane", "Condition": "PL", "stock": "59", "Price": "$0.49"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAngelic Shield", "Condition": "NM/M", "stock": "35", "Price": "$0.25"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAnger", "Condition": "NM/M", "stock": "9", "Price": "$1.49"},
{"Category": ["Duel Decks: Venser vs. Koth"], "card_name": "\nAnger", "Condition": "PL", "stock": "49", "Price": "$1.19"},
内置的字符串方法 strip()(即 str.strip())会删除字符串开头和结尾的空白字符,其中包括换行符 \n。
这应该可以做到。
# Scraping
def parse(self, response):
    """Yield one item per ``tr[class^=deckdbbody]`` row with a clean card name.

    A row without ``a.card_popup`` text reuses the card name of the most
    recent row that had one. The name is stripped of leading and trailing
    whitespace before it is stored, so the newline in front of the link
    text no longer reaches the output.
    """
    category = response.css("span.titletext::text").extract()
    # Start from an empty name so a first row without a card link yields ""
    # instead of raising UnboundLocalError.
    saved_name = ""
    for game in response.css("tr[class^=deckdbbody]"):
        card_name = game.css("a.card_popup::text").extract_first()
        # strip added here -> the stripped value is what gets stored on the
        # item, both for this row and for later rows that reuse the name.
        if card_name is not None:
            saved_name = card_name.strip()
        # Build a fresh item per row: re-yielding one mutated object would
        # make every reference a consumer holds show the last row's values.
        item = GameItem()
        item["Category"] = category
        item["card_name"] = saved_name
        item["Condition"] = game.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = game.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = game.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item
另外,顺便给出一个与本问题无关的代码简化建议:
# Scraping
def parse(self, response):
    """Yield one item per ``tr[class^=deckdbbody]`` row with a clean card name.

    A row whose ``a.card_popup`` text is missing or empty reuses the name
    of the most recent row that had one. The stored name is stripped of
    leading and trailing whitespace.
    """
    category = response.css("span.titletext::text").extract()
    # Declare saved_name beforehand to avoid a NameError when the first row
    # has no card link.
    saved_name = ""
    for game in response.css("tr[class^=deckdbbody]"):
        # `or` keeps the previous name when extract_first() gives None or "".
        saved_name = game.css("a.card_popup::text").extract_first() or saved_name
        # Build a fresh item per row: re-yielding one mutated object would
        # make every reference a consumer holds show the last row's values.
        item = GameItem()
        item["Category"] = category
        item["card_name"] = saved_name.strip()
        item["Condition"] = game.css("td[class^=deckdbbody].search_results_7 a::text").get()
        item["stock"] = game.css("td[class^=deckdbbody].search_results_8::text").extract_first()
        item["Price"] = game.css("td[class^=deckdbbody].search_results_9::text").extract_first()
        yield item