无法进行网页抓取:找不到表单元素

Cannot web-scrape because the form element cannot be found

正在尝试抓取以下网站: https://israeldrugs.health.gov.il/#!/byDrug

您需要在表单中输入搜索词,然后按左侧的蓝色按钮。

但是,bs4 失败,因为它找不到表单元素。

感谢您的帮助。

此站点上的数据是通过 JavaScript 动态加载的。如果您在浏览器的开发者工具(Network 选项卡)中查看 XHR 请求,就能看到这些信息是如何加载到页面中的。顺便说一句,下面的代码假设您使用的是 Python;如果不是,您需要在所用语言中找到等价的写法。

import requests
import json

# Search term sent to the site's search endpoint (an arbitrary drug from their list).
target = 'ATORVASTATIN AS CALCIUM'

# Build the request body with json.dumps rather than string concatenation, so
# quotes or backslashes in the search term are escaped instead of producing
# malformed JSON. json.dumps emits ASCII-only output by default, so non-Latin
# search terms are carried as \uXXXX escapes.
data = json.dumps({
    'val': target,
    'prescription': False,
    'healthServices': False,
    'pageIndex': 1,
    'orderBy': 0,
})

# Headers copied from the browser's own XHR request to the search endpoint.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Content-Type': 'application/json',
    'Origin': 'https://israeldrugs.health.gov.il',
    'Connection': 'keep-alive',
    'Referer': 'https://israeldrugs.health.gov.il/',
}

response = requests.post(
    'https://israeldrugs.health.gov.il/GovServiceList/IDRServer/SearchByName',
    headers=headers,
    data=data,
    timeout=30,  # do not hang forever if the server never answers
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# load the json response
meds = response.json()
# A random item from the 8th (random, again) drug in the response.
# In a REPL/notebook this bare expression echoes the value; wrap it in print()
# when running the code as a script.
meds['results'][7]['dragHebName']

输出:

'טורבה 10'

不知道它是否仍然相关,但我能够使用 node js 和 puppeteer(个人项目需要它)抓取整个数据库(或至少其中的大部分)。

// after installing puppeteer with `npm i puppeteer`
const puppeteer = require("puppeteer");

// using the filesystem module to save scraped data to a JSON file
const fs = require('fs');

/**
 * Scrapes drug search results from israeldrugs.health.gov.il into a JSON file.
 *
 * Opens the "by drug" search page in a Puppeteer-controlled browser, searches
 * for the letter 'a', reads the reported number of results, then walks the
 * result pages and stores one record per result row.
 *
 * The output file always ends up as a syntactically valid JSON array: if the
 * scrape fails part-way, the records collected so far are kept and the array
 * is still closed before the error propagates.
 *
 * @param {string} [outputFile='israMeds.json'] Path of the JSON file to (over)write.
 * @returns {Promise<void>} Resolves once the file is complete and the browser is closed.
 * @throws {Error} If the number of results cannot be read from the page.
 */
const scrape = async (outputFile = 'israMeds.json') => {
    // The site lists 10 results per page; used to derive the number of pages.
    const RESULTS_PER_PAGE = 10;

    const searchInputSelector = '#homeCtrl > div.display-wrapper > ui-view > div > div > div.search-textbox-container > idr-search-textbox > div:nth-child(1) > div > form > input';
    const searchButtonSelector = '#homeCtrl > div.display-wrapper > ui-view > div > div > div.search-textbox-container > idr-search-textbox > div:nth-child(1) > div > div';
    const resultCountSelector = '#homeCtrl > div.display-wrapper > ui-view > div > search-list > div > div > div.compareAndSortBarWrap > div > div.checkbox.selectAll > span';
    const nextPageSelector = '#homeCtrl > div.display-wrapper > ui-view > div > search-list > div > div > div.text-center > ul > li:nth-child(8) > a';

    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();

        // Go to the url and wait until it loads.
        await page.goto('https://israeldrugs.health.gov.il/#!/byDrug', {
            waitUntil: 'networkidle2',
            timeout: 0,
        });
        await page.waitForSelector(searchInputSelector);

        // Typing 'a' fetches most of the database, because the search matches
        // a character anywhere in the commercial and generic names.
        await page.type(searchInputSelector, 'a');
        await page.click(searchButtonSelector);

        // Read the number of results shown next to the "select all" checkbox.
        await page.waitForSelector(resultCountSelector);
        const countText = await page.evaluate(
            (selector) => document.querySelector(selector).textContent,
            resultCountSelector
        );
        // Keep only the digits. NOTE(review): assumes the label contains a
        // single number (the total) - confirm against the live page.
        const totalResults = Number.parseInt(countText.replace(/\D/g, ''), 10);
        if (Number.isNaN(totalResults)) {
            throw new Error(`Could not read the number of results from "${countText.trim()}"`);
        }
        // Round up so a partially filled last page is included, without
        // adding an extra page when the total is a multiple of the page size.
        const pageCount = Math.ceil(totalResults / RESULTS_PER_PAGE);

        // Stream records to disk page by page so a late failure keeps earlier data.
        fs.writeFileSync(outputFile, '[');
        try {
            let hasWrittenRecord = false;
            for (let pageIndex = 0; pageIndex < pageCount; pageIndex++) {
                // waitForNetworkIdle takes an options object; timeout 0 disables the timeout.
                await page.waitForNetworkIdle({ idleTime: 500, timeout: 0 });

                // This callback runs inside the browser page, so it must be
                // self-contained and cannot use variables from this scope.
                const records = await page.evaluate(() => {
                    const resultSelector = '#homeCtrl > div.display-wrapper > ui-view > div > search-list > div > div > div.search_wrap.ng-scope > div';
                    const hebTitleSelector = 'div.infoText > div > div > div.firstRowTitle.ng-binding';
                    const engTitleSelector = 'div.infoText > div > div > div > span';
                    const activeIngredientSelector = 'div.infoText > div > div > div.secondRowTitle.moreInfo.ng-binding.ng-scope';

                    // Text of the first match under `row`, or null when the node is absent.
                    const textOf = (row, selector) => {
                        const node = row.querySelector(selector);
                        return node ? node.textContent : null;
                    };

                    // Choose the desired data here by adding the field's selector.
                    return [...document.querySelectorAll(resultSelector)].map((row) => {
                        const ingredientText = textOf(row, activeIngredientSelector);
                        const ingredientWords = ingredientText === null ? [] : ingredientText.trim().split(' ');
                        return {
                            drugHebTitle: textOf(row, hebTitleSelector),
                            drugEngTitle: textOf(row, engTitleSelector),
                            // Third space-separated word of the ingredient line.
                            // NOTE(review): presumably skips a two-word label - confirm.
                            activeIngredient: ingredientWords.length > 2 ? ingredientWords[2] : null,
                        };
                    });
                });

                // Skip empty pages so no stray comma ends up in the file.
                if (records.length > 0) {
                    const chunk = records.map((record) => JSON.stringify(record)).join(',');
                    fs.appendFileSync(outputFile, `${hasWrittenRecord ? ',' : ''}${chunk}`);
                    hasWrittenRecord = true;
                }

                // Move to the next result page, except after the last one.
                if (pageIndex < pageCount - 1) {
                    await page.click(nextPageSelector);
                }
            }
        } finally {
            // Close the array even on failure so the file stays parseable.
            fs.appendFileSync(outputFile, ']');
        }
    } finally {
        // Always release the browser process, even when scraping throws.
        await browser.close();
    }
};