有没有办法在运行 puppeteer 时将 table 数据转换为 json?

Is there a way to transform table data into json while running puppeteer?

我在尝试使用 puppeteer 和 tabletojson 将 html table 转换为 json 时遇到了一些问题。

我正在做的事情非常简单:puppeteer 启动 chrome,创建一个新页面,然后输入一些单词并点击搜索按钮。puppeteer 完成这些操作之后,屏幕上才会出现 table,我认为这就是 tabletojson 一直返回空数组的原因:它并不是在 puppeteer 的页面中运行,而是自己重新访问同一个 url,没有经过 puppeteer 之前的任何交互,因此没有任何 table 可读。

我要问的是:有没有办法在运行 puppeteer 的同时把这个 table 转换成 json?或者有没有办法在抓取时异步运行 tabletojson?

const puppteer = require('puppeteer')
const tabletojson = require('tabletojson')

// Question snippet: launches a visible browser with Puppeteer, opens a new page,
// then asks tabletojson to convert the tables at a URL and logs the result.
// The "some puppeteer ...." and "some tabletojson ...." lines are placeholders for
// omitted code; they are not valid JavaScript.
async function letscrap()
{
some puppeteer ....
  const browser = await puppteer.launch(
        {
            headless: false,
            defaultViewport: null
        }
    )
    
  const page = await browser.newPage()
  const url = "https://someurl.com/..."
  
 
some tabletojson ....

// The idea here is to get JSON from the tables on the page.
  // At this point nothing useful is returned (an empty array, per the question).
  // NOTE(review): convertUrl receives only the URL string, never the Puppeteer `page`,
  // so presumably it loads the URL on its own without the search interaction - confirm.
  
        await tabletojson.convertUrl(
            'https://someurl.com/...',
            // NOTE(review): `stripHtmlFromCells` is given twice; in an object literal the last value (true) wins.
            {stripHtmlFromCells: false, stripHtmlFromCells: true },
            // Callback that receives the converted tables and prints them.
            function(tablesAsJson) {
              console.log(tablesAsJson);
            }
        );
        
}

letscrap()

HTML来源:

<!-- Search form from the question (abridged): its submit button is the "Search" button the script clicks. -->
<form method="post" action="/ConsultaPlanosConsumidor/pages/home.xhtml;jsessionid=bSbNLIiZ7pWWf7mgDl_MIw-F9QPRpVbNlo8johjO.ansprjboss01a:consulta-planos-consumidor-01a" enctype="application/x-www-form-urlencoded">

<!-- 

Some inputs go here (omitted),
and...

-->
<button id="formHome:tabOperadora:j_idt99" name="formHome:tabOperadora:j_idt99" class="ui-button ui-widget ui-state-default ui-corner-all ui-button-text-only" onclick="" type="submit" role="button" aria-disabled="false">
<span class="ui-button-text ui-c">Search</span>
</button>


<!-- 

When you hit "Search", it "appends" a table to its div.

-->

</form>

感谢您的关注!

我认为您抓取的 table 不是正确的那个。我发现结果页面中至少有 5 个 table,所以您必须找到包含数据的那个 table 的正确选择器。正确的选择器是 table[role="grid"]。而且您可以在不使用 tabletojson 的情况下抓取搜索结果的所有页面。我写了下面这个脚本,把搜索结果 table 的数据写入 CSV 文件。

const puppeteer = require ('puppeteer')
const fs = require ('fs-extra')

// Element IDs of the two select widgets (div elements) on the search form that the
// script focuses and operates with the keyboard.
const selectElementID_first = 'formHome:tabOperadora:solbSituacoesPrincipais'
const selectElementID_second = 'formHome:tabOperadora:solbAbrangenciasGeograficas'

// Option text to choose in each widget; compared against the highlighted item's innerText.
const selectedOption_first = 'Liberada'
const selectedOption_second = 'Nacional'

// Path of the CSV file that receives the header line and the scraped rows.
const saveFileCSV = 'porpiano.csv'

// Paginator "current page" text recorded after the most recently scraped page.
// Starts as the number 0, which never strictly equals the paginator's string value,
// so the first page is scraped without waiting for a page change.
let lastPage = 0

;(async () => {
    /*
     * Scrapes every page of the search results and writes them to `saveFileCSV`.
     *
     * Flow: open the site, switch to the "por plano" tab, choose one option in each
     * of the two select widgets via the keyboard, submit the form, then walk the
     * paginated results grid, appending each page's rows to the CSV file.
     * The browser is always closed, even when a step fails.
     */

    // Number of grid columns to read; matches the seven titles in the CSV header line.
    const columnCount = 7

    // Resolves after `ms` milliseconds. Used instead of page.waitFor(ms), which is
    // deprecated and absent from recent Puppeteer releases.
    const sleep = ms => new Promise( resolve => setTimeout(resolve, ms) )

    // Formats one value as a CSV field: values containing a quote, comma or line
    // break are wrapped in double quotes with embedded quotes doubled.
    const toCsvField = value => {
        const text = value == null ? '' : String(value)
        return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
    }

    const browser = await puppeteer.launch ({
        headless : false,
        devtools : false
    })

    try {
        const [page] = await browser.pages ()

        // Returns the paginator text split on ' de ': [current page, last page].
        const readPaginator = () => page.evaluate ( () => document.querySelector('span.ui-paginator-current').innerText.split(' de ') )

        // Picks `optionText` in the select widget whose element id is `elementId`,
        // using only the keyboard: Tab to focus it, Space to open its panel,
        // ArrowDown until the highlighted item matches, Enter to confirm.
        // Each loop polls until its condition holds; there is no upper bound.
        const chooseOption = async (elementId, optionText) => {
            while ( !await page.evaluate( id => document.querySelector(`div[id="${id}"]`).classList.contains('ui-state-focus'), elementId ) ) {
                await page.keyboard.press('Tab')
                await sleep(300)
            }

            await page.keyboard.press('Space')

            while ( await page.evaluate( id => document.querySelector(`div[id="${id}_panel"]`).style.display === 'none', elementId ) ) {
                await sleep(300)
            }

            while ( await page.evaluate( (id, text) => document.querySelector(`div[id="${id}_panel"] ul > li.ui-state-highlight`).innerText !== text, elementId, optionText ) ) {
                await page.keyboard.press('ArrowDown')
                await sleep(300)
            }

            await page.keyboard.press('Enter')

            // Pause before touching the next control, as the original script did.
            await sleep(1000)
        }

        // Reads the grid cells of the current results page, column by column, and
        // returns them as an array of rows (each row is an array of cell texts).
        const scrapeCurrentPage = () => page.evaluate ( columns => {
            const textsByColumn = []

            for ( let column = 1; column <= columns; column++ ) {
                const cells = document.querySelectorAll(`td[role="gridcell"]:nth-of-type(${column})`)
                textsByColumn.push( Array.from( cells, cell => cell.innerText ) )
            }

            // The first column decides how many rows there are.
            return textsByColumn[0].map( (_, rowIndex) => textsByColumn.map( texts => texts[rowIndex] ) )
        }, columnCount )

        await page.goto ('http://www.ans.gov.br/ConsultaPlanosConsumidor/', { waitUntil: 'networkidle0', timeout: 0 })

        await page.evaluate ( () => document.querySelector('a[href="#formHome:tabOperadora:panelPorPlano"]').click() )

        // FIRST AND SECOND SELECT OPTION

        await chooseOption (selectElementID_first, selectedOption_first)
        await chooseOption (selectElementID_second, selectedOption_second)

        await page.evaluate( () => document.querySelector('div[id="formHome:tabOperadora:panelPorPlano"] button[type="submit"]').click() )

        await fs.writeFile ( saveFileCSV, 'Número do Registro / Código do Plano, Nome Comercial do Plano, Segmentação Assistencial, Tipo de Contratação, Abrangência Geográfica, Tipo de Plano, Comercialização\n' )

        for (;;) {
            // WAIT FOR PAGE LOADING: the paginator must exist and show a page
            // different from the one scraped last.

            await page.waitForSelector('span.ui-paginator-current', {timeout: 0})

            while ( lastPage === (await readPaginator ())[0] ) {
                await sleep(250)
            }

            await page.waitForSelector('table[role="grid"]', {timeout: 0})

            // SCRAPE DATA AND WRITE IT IN CSV FORMAT (one append per results page)

            const rows = await scrapeCurrentPage ()

            if ( rows.length > 0 ) {
                const csvChunk = rows.map( row => `${row.map(toCsvField).join(',')}\n` ).join('')
                await fs.appendFile ( saveFileCSV, csvChunk )
            }

            const [currentPage, totalPages] = await readPaginator ()

            lastPage = currentPage

            if ( currentPage === totalPages ) {
                break
            }

            await page.evaluate ( () => document.querySelector('a[aria-label="Next Page"]').click() )
        }

        console.log ('SCRAPE ALL TABLE DATA FINISHED\nCLOSING PUPPETEER BROWSER!')
    } finally {
        // Close the browser on success and on failure so no Chrome process is left behind.
        await browser.close ()
    }

})().catch( error => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error (error)
    process.exitCode = 1
})