Web scraping an ASPX website using Python
I am able to get the HTTP headers and parameters, but I cannot get a useful response object back. The site is https://www.sacmembership.ca/Search/Search.aspx and I am looking for the details of each practitioner. This is the code I have so far:
import cookielib
import urllib
import urllib2

url = 'https://www.sacmembership.ca/Search/Search.aspx'

# Request headers copied from the browser. Host and Content-Length are
# omitted on purpose: urllib2 fills them in itself, and a stale hardcoded
# Content-Length makes the server wait for a body that never arrives.
# Accept-Encoding is omitted too, since urllib2 will not gunzip a gzipped
# response body.
http_header = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Origin": "https://www.sacmembership.ca",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "https://www.sacmembership.ca/Search/Search.aspx",
    "Accept-Language": "en-US,en;q=0.8"
}
# Form fields for an empty search (all practitioners), sorted by last name.
params = {
    'ctl00$ContentPlaceHolder1$ddlProfession': "",
    'ctl00$ContentPlaceHolder1$ddlFacility': "",
    'ctl00$ContentPlaceHolder1$txtCity': "",
    'ctl00$ContentPlaceHolder1$ddlProvince': "",
    'ctl00$ContentPlaceHolder1$ddlSortBy': "LastName",
    'ctl00$ContentPlaceHolder1$ddlLanguageOfPractice': "",
    'ctl00$ContentPlaceHolder1$txtEmployerCompanyName': "",
    'ctl00$ContentPlaceHolder1$txtFirstName': "",
    'ctl00$ContentPlaceHolder1$txtLastName': "",
    'ctl00$ContentPlaceHolder1$btnSearch': "Search"
}
cookie_jar = cookielib.LWPCookieJar()
cookie = urllib2.HTTPCookieProcessor(cookie_jar)
opener = urllib2.build_opener(cookie)
req = urllib2.Request(url, urllib.urlencode(params), http_header)
res = opener.open(req)
html = res.read()
print html
"""
open("tmp.html", "w").write(html)
body = html
"""
Please help me fix this.
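A note on the likely cause: ASP.NET WebForms pages validate the hidden __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION fields on every postback, and a POST that omits them is usually bounced back to the empty search page. Below is a minimal sketch of fetching those fields first and echoing them back. The hidden-field names are the standard WebForms ones and the search fields are copied from the question, so verify both against the live page source.

import requests
from scrapy import Selector

url = 'https://www.sacmembership.ca/Search/Search.aspx'
session = requests.Session()

# First GET: pick up the ASP.NET cookies and the hidden state fields.
sel = Selector(text=session.get(url).text)
params = {
    name: sel.xpath('//input[@name="%s"]/@value' % name).extract_first('')
    for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
}
params.update({
    'ctl00$ContentPlaceHolder1$ddlSortBy': 'LastName',
    'ctl00$ContentPlaceHolder1$btnSearch': 'Search',
})

# POST back to the same URL; the session re-sends the cookies automatically.
res = session.post(url, data=params, headers={'Referer': url})
print res.status_code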
I was able to achieve what I was looking for using Selenium.
from selenium import webdriver
from scrapy import Selector
import time

driver = webdriver.Firefox()

# Listing URLs to crawl (left blank here on purpose).
links = ['', '', '', '', '']

for link in links:
    driver.get(link)
    time.sleep(2)
    driver.find_element_by_id("showAll").click()
    time.sleep(4)

    # Collect the detail-page links from the first page of results.
    sel = Selector(text=driver.page_source, type="html")
    listing = sel.xpath('//section[@class="placardHeader"]//a[@class="placardTitle"]/@href').extract()
    with open(r"C:\Users\ssamant\Desktop\Client\Anida\Phase_II\Apartments\apartment_listing.csv", "ab") as export:
        for href in listing:
            export.write('{}\n'.format(href))

    # Page through the remaining result pages, re-parsing each new page.
    for _ in range(21):
        driver.find_element_by_class_name('next').click()
        time.sleep(2)
        sel = Selector(text=driver.page_source, type="html")
        listing = sel.xpath('//section[@class="placardHeader"]//a[@class="placardTitle"]/@href').extract()
        with open(r"C:\Users\ssamant\Desktop\Client\Anida\Phase_II\Apartments\apartment_listing.csv", "ab") as export:
            for href in listing:
                export.write('{}\n'.format(href))
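The fixed time.sleep() calls make the loop fragile: too short and the page has not rendered, too long and the crawl drags. Since Selenium ships WebDriverWait for exactly this, each sleep can become an explicit wait. A minimal sketch, assuming the same locators as the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)

# Block until the "next" link is clickable, then click it.
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next'))).click()

# Block until the listing anchors are present before reading page_source.
wait.until(EC.presence_of_all_elements_located(
    (By.CSS_SELECTOR, 'section.placardHeader a.placardTitle')))
source = driver.page_source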