无法使用 xpath 从亚马逊找到或打印 link,但我可以使用 beautifulsoup
not able to find or print link form amazon using xpath but I am able to do with beautifulsoup
这是 python 脚本,我尝试了很多方法,但它不起作用,因为我是 xpath 的新手
from lxml import html
import csv, os, json
import requests
# `from exceptions import ValueError` only exists on Python 2; on Python 3
# `ValueError` is a builtin, so guard the import instead of crashing at startup.
try:
    from exceptions import ValueError  # Python 2
except ImportError:
    pass  # Python 3: ValueError is already a builtin
from time import sleep
def AmzonParser(url):
    """Scrape a single Amazon product page and return the extracted fields.

    Parameters:
        url: full product URL, e.g. "http://www.amazon.com/dp/<ASIN>".

    Returns:
        dict with keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY, URL, DESCRIPTION, IMAGE (values may be None when the
        corresponding element is absent), or None if every retry fails.
    """
    # Browser-like User-Agent reduces the chance of Amazon serving a captcha page.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(url, headers=headers)
    # Bounded retry: the original `while True` spun forever when the page could
    # never be parsed (e.g. a persistent captcha response).
    for _attempt in range(5):
        sleep(3)
        try:
            # Detect captcha/blocked responses BEFORE parsing; the original
            # checked status_code only after all the extraction work.
            if page.status_code != 200:
                raise ValueError('captha')
            doc = html.fromstring(page.content)
            XPATH_NAME = '//h1[@id="title"]//text()'
            XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
            XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
            XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
            XPATH_AVAILABILITY = '//div[@id="availability"]/span/text()'
            # Fixed: '///*' was a typo for '//*'.
            XPATH_DESCRIPTION = '//*[@id="productDescription"]/p/text()'
            # Fixed: '/img/src' selected a (nonexistent) child ELEMENT named
            # "src"; '/img/@src' selects the src ATTRIBUTE — this was why no
            # image link was ever returned.
            XPATH_IMAGE = '//*[@id="main-image-container"]/ul/li[5]/span/span/div/img/@src'
            RAW_NAME = doc.xpath(XPATH_NAME)
            RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
            RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
            RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
            # Fixed the 'RAw_' capitalization typo.
            RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
            RAW_DESCRIPTION = doc.xpath(XPATH_DESCRIPTION)
            RAW_IMAGE = doc.xpath(XPATH_IMAGE)
            # Collapse runs of whitespace and join the text fragments.
            NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
            CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
            ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
            AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
            DESCRIPTION = ''.join(RAW_DESCRIPTION).strip() if RAW_DESCRIPTION else None
            IMAGE = ''.join(RAW_IMAGE) if RAW_IMAGE else None
            # Fall back to the sale price when no list/original price is shown.
            if not ORIGINAL_PRICE:
                ORIGINAL_PRICE = SALE_PRICE
            data = {
                'NAME': NAME,
                'SALE_PRICE': SALE_PRICE,
                'CATEGORY': CATEGORY,
                'ORIGINAL_PRICE': ORIGINAL_PRICE,
                'AVAILABILITY': AVAILABILITY,
                'URL': url,
                'DESCRIPTION': DESCRIPTION,
                'IMAGE': IMAGE,
            }
            return data
        except Exception as e:
            # Python 3 print(); log the error and retry.
            print(e)
    return None
def ReadAsin():
    """Scrape every ASIN in AsinList and dump all results to data.json."""
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__), "Asinfeed.csv")))
    AsinList = ['B008HDREZ6', ]
    extracted_data = []
    for asin in AsinList:
        url = "http://www.amazon.com/dp/" + asin
        # Python 3 print() — the original Python 2 `print "..."` is a syntax
        # error on Python 3.
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        # Pause between requests to avoid hammering the server.
        sleep(5)
    # `with` guarantees the file is flushed and closed; the original leaked
    # the open handle.
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)

if __name__ == "__main__":
    ReadAsin()
我无法获取图像 link
这是html
<div class="imgTagWrapper" style="height: 296px;">
<img src="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg" class="a-dynamic-image a-stretch-vertical" id="" style="max-height: 296px; max-width: 204.282px;" data-old-hires="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg" data-a-manual-replacement="true">
</div>
页面使用 JavaScript 将大图像放入此标签中,但是 lxml 和 beautifulsoup 都不能运行 JavaScript。
因此使用 lxml/beautifulsoup 时,您只能通过 '//div[@id="altImages"]//img/@src' 获得页面左侧的小缩略图。
不过,您可以在其中一个 <script> 标签里找到这些图像的网址。
下面的代码查找包含 data["colorImages"] = 的 <script> 标签,把其中的 JSON 字符串转换为 Python 字典——之后就很容易取得各种尺寸的图像 url。
import requests
from lxml import html
import json

url = "http://www.amazon.com/dp/B008HDREZ6"
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
}
response = requests.get(url, headers=headers)
doc = html.fromstring(response.content)

# Small thumbnails are the only images present in the static HTML.
print('--- small ---')
XPATH_IMAGE = '//div[@id="altImages"]//img/@src'
RAW_IMAGE = doc.xpath(XPATH_IMAGE)
# [:-1] drops the last match (presumably a non-product placeholder image —
# verify against the live page).
print('\n'.join(RAW_IMAGE[:-1]))

# The full-size image URLs live inside one of the inline <script> tags.
print('--- scripts ---')
XPATH_SCRIPTS = '//script'
RAW_SCRIPTS = doc.xpath(XPATH_SCRIPTS)
data = ''
for script in RAW_SCRIPTS:
    text = script.text
    # Guard against None: <script src="..."> elements have text == None,
    # and `'...' in None` raises TypeError.
    if text and 'data["colorImages"]' in text:
        for line in text.splitlines():
            if 'data["colorImages"]' in line:
                #print(line)
                data = line

print('--- data ---')
# Strip the `data["colorImages"] = ` assignment prefix and the trailing ';'
# so only the bare JSON object remains.
data = data[24:-1]
data = json.loads(data)
# NOTE(review): 'Silver'/'White' are the color variants of THIS product —
# other ASINs will have different keys.
print('keys:', data.keys())
print('keys:', data['Silver'][0].keys())
print('keys:', data['White'][0].keys())
for item in data['Silver']:
    print('variant:', item['variant'])
    print('main:', item['main'])
    print('large:', item['large'])
    print('hiRes:', item['hiRes'])
    print('thumb:', item['thumb'])
    print('-----')
缩略图:
--- small ---
https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
JavaScript中的数据:
--- data ---
keys: dict_keys(['Silver', 'White'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
variant: MAIN
main: {'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX355_.jpg': ['219', '355'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX522_.jpg': ['323', '522'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX450_.jpg': ['278', '450'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX466_.jpg': ['288', '466'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX425_.jpg': ['263', '425']}
large: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
-----
variant: PT01
main: {'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY550_.jpg': ['550', '380'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY355_.jpg': ['355', '245'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY679_.jpg': ['679', '469'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg': ['450', '311'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY606_.jpg': ['606', '419']}
large: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
-----
variant: PT02
main: {'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX466_.jpg': ['311', '466'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX522_.jpg': ['348', '522'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX450_.jpg': ['300', '450'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX425_.jpg': ['283', '425'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX355_.jpg': ['237', '355']}
large: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
-----
这是 python 脚本,我尝试了很多方法,但它不起作用,因为我是 xpath 的新手
from lxml import html
import csv, os, json
import requests
# `from exceptions import ValueError` only exists on Python 2; on Python 3
# `ValueError` is a builtin, so guard the import instead of crashing at startup.
try:
    from exceptions import ValueError  # Python 2
except ImportError:
    pass  # Python 3: ValueError is already a builtin
from time import sleep
def AmzonParser(url):
    """Scrape a single Amazon product page and return the extracted fields.

    Parameters:
        url: full product URL, e.g. "http://www.amazon.com/dp/<ASIN>".

    Returns:
        dict with keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY, URL, DESCRIPTION, IMAGE (values may be None when the
        corresponding element is absent), or None if every retry fails.
    """
    # Browser-like User-Agent reduces the chance of Amazon serving a captcha page.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    page = requests.get(url, headers=headers)
    # Bounded retry: the original `while True` spun forever when the page could
    # never be parsed (e.g. a persistent captcha response).
    for _attempt in range(5):
        sleep(3)
        try:
            # Detect captcha/blocked responses BEFORE parsing; the original
            # checked status_code only after all the extraction work.
            if page.status_code != 200:
                raise ValueError('captha')
            doc = html.fromstring(page.content)
            XPATH_NAME = '//h1[@id="title"]//text()'
            XPATH_SALE_PRICE = '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
            XPATH_ORIGINAL_PRICE = '//td[contains(text(),"List Price") or contains(text(),"M.R.P") or contains(text(),"Price")]/following-sibling::td/text()'
            XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
            XPATH_AVAILABILITY = '//div[@id="availability"]/span/text()'
            # Fixed: '///*' was a typo for '//*'.
            XPATH_DESCRIPTION = '//*[@id="productDescription"]/p/text()'
            # Fixed: '/img/src' selected a (nonexistent) child ELEMENT named
            # "src"; '/img/@src' selects the src ATTRIBUTE — this was why no
            # image link was ever returned.
            XPATH_IMAGE = '//*[@id="main-image-container"]/ul/li[5]/span/span/div/img/@src'
            RAW_NAME = doc.xpath(XPATH_NAME)
            RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
            RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
            RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
            # Fixed the 'RAw_' capitalization typo.
            RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
            RAW_DESCRIPTION = doc.xpath(XPATH_DESCRIPTION)
            RAW_IMAGE = doc.xpath(XPATH_IMAGE)
            # Collapse runs of whitespace and join the text fragments.
            NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            SALE_PRICE = ' '.join(''.join(RAW_SALE_PRICE).split()).strip() if RAW_SALE_PRICE else None
            CATEGORY = ' > '.join([i.strip() for i in RAW_CATEGORY]) if RAW_CATEGORY else None
            ORIGINAL_PRICE = ''.join(RAW_ORIGINAL_PRICE).strip() if RAW_ORIGINAL_PRICE else None
            AVAILABILITY = ''.join(RAW_AVAILABILITY).strip() if RAW_AVAILABILITY else None
            DESCRIPTION = ''.join(RAW_DESCRIPTION).strip() if RAW_DESCRIPTION else None
            IMAGE = ''.join(RAW_IMAGE) if RAW_IMAGE else None
            # Fall back to the sale price when no list/original price is shown.
            if not ORIGINAL_PRICE:
                ORIGINAL_PRICE = SALE_PRICE
            data = {
                'NAME': NAME,
                'SALE_PRICE': SALE_PRICE,
                'CATEGORY': CATEGORY,
                'ORIGINAL_PRICE': ORIGINAL_PRICE,
                'AVAILABILITY': AVAILABILITY,
                'URL': url,
                'DESCRIPTION': DESCRIPTION,
                'IMAGE': IMAGE,
            }
            return data
        except Exception as e:
            # Python 3 print(); log the error and retry.
            print(e)
    return None
def ReadAsin():
    """Scrape every ASIN in AsinList and dump all results to data.json."""
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__), "Asinfeed.csv")))
    AsinList = ['B008HDREZ6', ]
    extracted_data = []
    for asin in AsinList:
        url = "http://www.amazon.com/dp/" + asin
        # Python 3 print() — the original Python 2 `print "..."` is a syntax
        # error on Python 3.
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        # Pause between requests to avoid hammering the server.
        sleep(5)
    # `with` guarantees the file is flushed and closed; the original leaked
    # the open handle.
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)

if __name__ == "__main__":
    ReadAsin()
我无法获取图像 link
这是html
<div class="imgTagWrapper" style="height: 296px;">
<img src="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg" class="a-dynamic-image a-stretch-vertical" id="" style="max-height: 296px; max-width: 204.282px;" data-old-hires="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg" data-a-manual-replacement="true">
</div>
页面使用 JavaScript 将大图像放入此标签中,但是 lxml 和 beautifulsoup 都不能运行 JavaScript。
因此使用 lxml/beautifulsoup 时,您只能通过 '//div[@id="altImages"]//img/@src' 获得页面左侧的小缩略图。
不过,您可以在其中一个 <script> 标签里找到这些图像的网址。
下面的代码查找包含 data["colorImages"] = 的 <script> 标签,把其中的 JSON 字符串转换为 Python 字典——之后就很容易取得各种尺寸的图像 url。
import requests
from lxml import html
import json

url = "http://www.amazon.com/dp/B008HDREZ6"
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
}
response = requests.get(url, headers=headers)
doc = html.fromstring(response.content)

# Small thumbnails are the only images present in the static HTML.
print('--- small ---')
XPATH_IMAGE = '//div[@id="altImages"]//img/@src'
RAW_IMAGE = doc.xpath(XPATH_IMAGE)
# [:-1] drops the last match (presumably a non-product placeholder image —
# verify against the live page).
print('\n'.join(RAW_IMAGE[:-1]))

# The full-size image URLs live inside one of the inline <script> tags.
print('--- scripts ---')
XPATH_SCRIPTS = '//script'
RAW_SCRIPTS = doc.xpath(XPATH_SCRIPTS)
data = ''
for script in RAW_SCRIPTS:
    text = script.text
    # Guard against None: <script src="..."> elements have text == None,
    # and `'...' in None` raises TypeError.
    if text and 'data["colorImages"]' in text:
        for line in text.splitlines():
            if 'data["colorImages"]' in line:
                #print(line)
                data = line

print('--- data ---')
# Strip the `data["colorImages"] = ` assignment prefix and the trailing ';'
# so only the bare JSON object remains.
data = data[24:-1]
data = json.loads(data)
# NOTE(review): 'Silver'/'White' are the color variants of THIS product —
# other ASINs will have different keys.
print('keys:', data.keys())
print('keys:', data['Silver'][0].keys())
print('keys:', data['White'][0].keys())
for item in data['Silver']:
    print('variant:', item['variant'])
    print('main:', item['main'])
    print('large:', item['large'])
    print('hiRes:', item['hiRes'])
    print('thumb:', item['thumb'])
    print('-----')
缩略图:
--- small ---
https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
JavaScript中的数据:
--- data ---
keys: dict_keys(['Silver', 'White'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
keys: dict_keys(['large', 'variant', 'hiRes', 'thumb', 'main'])
variant: MAIN
main: {'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX355_.jpg': ['219', '355'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX522_.jpg': ['323', '522'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX450_.jpg': ['278', '450'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX466_.jpg': ['288', '466'], 'https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SX425_.jpg': ['263', '425']}
large: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/7152gMAICdL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/31bDT3JCmML._SS40_.jpg
-----
variant: PT01
main: {'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY550_.jpg': ['550', '380'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY355_.jpg': ['355', '245'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY679_.jpg': ['679', '469'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg': ['450', '311'], 'https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY606_.jpg': ['606', '419']}
large: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/51OGAiwApNL._SS40_.jpg
-----
variant: PT02
main: {'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX466_.jpg': ['311', '466'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX522_.jpg': ['348', '522'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX450_.jpg': ['300', '450'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX425_.jpg': ['283', '425'], 'https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SX355_.jpg': ['237', '355']}
large: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL.jpg
hiRes: https://images-na.ssl-images-amazon.com/images/I/91UsOEFbYJL._SL1500_.jpg
thumb: https://images-na.ssl-images-amazon.com/images/I/519%2B3tR1ObL._SS40_.jpg
-----