Can't get full Scrapy JavaScript response
I can't seem to get the full rendered HTML response from this link:
I'm using the scrapy-splash plugin because nothing I tried with standard Scrapy practices worked, but this isn't working either.
Here is my Python code (I'm running the Splash Docker container with docker run -p 8050:8050 scrapinghub/splash):
import scrapy
from scrapy.utils.log import configure_logging
import scrapy_splash
from scrapy_splash import SplashRequest


class Covid_Spider(scrapy.Spider):
    name = "covid_spider"

    # settings required by scrapy-splash (middlewares, dupefilter, cache storage)
    custom_settings = {
        'SPLASH_URL': 'http://127.0.0.1:8050',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage'
    }

    def start_requests(self):
        link = 'http://gabgoh.github.io/COVID/?CFR=0.02&D_hospital_lag=5&D_incbation=5.2&D_infectious=2.9&D_recovery_mild=11.1&D_recovery_severe=28.6&I0=10&InterventionAmt=0.09&InterventionTime=0&P_SEVERE=0.2&R0=2.2&Time_to_death=32&logN=14.1'
        print(link)
        splash_args = {
            'html': 1,
            'wait': 1,
            'render_all': 1
        }
        yield SplashRequest(url=link, callback=self.parse_covid, endpoint='render.html', args=splash_args)

    def parse_covid(self, response):
        print(response.css('body').getall()[0])


from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(Covid_Spider)
    process.start()
Here is my terminal output:
<body>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-65931696-1"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'UA-65931696-1');
</script>
</body>
I'm trying to get the full page, and I'd appreciate any help. This is my first post, by the way.
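One variation that often gets suggested for cases where render.html returns an empty body is Splash's execute endpoint, which lets the waiting happen inside a Lua script instead of through the wait argument. Below is a minimal sketch of that idea; the 5-second wait and the private-mode toggle are guesses for this page, not documented requirements, and I haven't verified that it renders the charts here:

script = """
function main(splash, args)
    splash.private_mode_enabled = false   -- some pages need localStorage, which private mode blocks
    assert(splash:go(args.url))
    assert(splash:wait(5))                -- 5s is a guess; tune to how long the page needs
    return splash:html()
end
"""

class Covid_Spider(scrapy.Spider):
    # ... name and custom_settings exactly as above ...

    def start_requests(self):
        yield SplashRequest(
            url='http://gabgoh.github.io/COVID/',  # or the full parameterised URL from above
            callback=self.parse_covid,
            endpoint='execute',                    # run the Lua script instead of render.html
            args={'lua_source': script},
        )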
Solved with Selenium
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# short version
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('http://gabgoh.github.io/COVID')
body = driver.find_element_by_css_selector("body")
print(body.get_attribute("innerHTML"))  # the fully rendered page body
driver.close()
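A caveat: the snippet above uses APIs that newer Selenium 4.x releases have removed (the find_element_by_* helpers and the chrome_options= keyword), so if you get an AttributeError or TypeError on a current install, the rough equivalent is:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)           # Selenium 4: pass options=, not chrome_options=
driver.get('http://gabgoh.github.io/COVID')
body = driver.find_element(By.CSS_SELECTOR, "body")   # Selenium 4: find_element(By.*, ...)
print(body.get_attribute("innerHTML"))
driver.quit()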