Android 使用 Jsoup 解析嵌套表格
Android parse nested tables with Jsoup
我正在尝试使用 Jsoup 在线解析 HTML 页面,从表格(table)中提取数据。我要解析的页面包含多个表格。
我该怎么做?
这是我要解析的示例页面:
https://www.cpu-world.com/info/AMD/AMD_A4-Series.html
我要提取的数据是Model Name和详情页的URL。
编辑:
这是我用来从详细信息页面提取数据的一些代码。
// Fetches one CPU detail page and iterates over the rows of its spec table.
try {
/**
* Works to iterate through the items at the following website
* https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
*/
URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");
// Fetch and parse the page with a 3000 ms timeout.
Document doc = Jsoup.parse(url, 3000);
// spec_table is the name of the class associated with the table
Elements table = doc.select("table.spec_table");
// All <tr> rows anywhere under the spec table.
Elements rows = table.select("tr");
Iterator<Element> rowIterator = rows.iterator();
// Skip the first (header) row.
// NOTE(review): throws NoSuchElementException if the table was not found.
rowIterator.next();
// NOTE(review): wasMatch is never updated or read below.
boolean wasMatch = false;
// Loop through all items in list
while (rowIterator.hasNext()) {
Element row = rowIterator.next();
Elements cols = row.select("td");
// First <td> holds the row label; NOTE(review): rowName is currently unused.
String rowName = cols.get(0).text();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
我一直在阅读一些教程和文档,但我似乎无法弄清楚如何浏览网页以提取我正在寻找的数据。我了解 HTML 和 CSS,但我只是在了解 Jsoup。
(我将其标记为 Android 因为那是我使用 Java 代码的地方。我猜没必要那么具体。)
这看起来像你想要的:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.URL;
/**
 * Scrapes a cpu-world.com CPU detail page with Jsoup and prints the model
 * name together with the link found in the "Model number" row.
 */
public class CpuWorld {

    public static void main(String[] args) throws IOException {
        /*
         * Works to iterate through the items at the following website
         * https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
         *
         * Fix: the original caught the IOException from the URL constructor,
         * printed the trace and fell through with url == null, which then
         * caused an NPE inside Jsoup.parse(). Since main already declares
         * IOException (MalformedURLException is a subclass), the try/catch
         * is unnecessary -- just let the exception propagate.
         */
        URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");

        // Fetch and parse the page with a 3000 ms timeout.
        Document doc = Jsoup.parse(url, 3000);

        // The anchor inside the "Model number" row carries both the model
        // name (text) and the detail-page URL (href). Build the selector
        // once instead of repeating the literal.
        String modelLinkSelector = "table tr:has(td:contains(Model number)) td b a";
        String modelNumber = doc.select(modelLinkSelector).text();
        String modelUrl = doc.select(modelLinkSelector).attr("href");
        System.out.println(modelNumber + " : " + modelUrl);
    }
}
如果这不是您想要的,请告诉我
编辑:结果:
A4-3300 : https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
Process finished with exit code 0
编辑:
这个办法相当离谱,但我们开始吧……剩下的就请您自己举一反三:遍历这些 URL,即可抓取您想要的各个型号的详情数据:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestTemplate;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
// Scrapes the AMD A4-Series listing page. The model table on that page is
// delivered scrambled inside a <script> tag and rebuilt client-side by the
// JSC_Process() JavaScript; this class slices the scrambled payload out of
// that script, descrambles it in Java (see Unscrambler) and parses the
// result with Jsoup.
public class CpuWorld {
// Listing page to scrape; also sent back as the Referer header below.
public static final String CPU_WORLD_COM_URL = "https://www.cpu-world.com/info/AMD/AMD_A4-Series.html";
// Fixed prefix/suffix of the scrambled script payload. Only their lengths
// are used (to cut the payload out of the script text).
// NOTE(review): the FOOTER literal contains "\w" and a bare "\n", which are
// not valid Java string escapes -- this looks garbled in transcription and
// will not compile as-is; the original presumably had "\\w" / "\\n".
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "//- qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// NOTE(review): unused field (and drags in a Spring dependency) -- candidate for removal.
private static RestTemplate restTemplate = new RestTemplate();
// Prints "model : url" for every model link found in the descrambled table.
public static void main(String[] args) throws IOException {
Document tableData = getTableData(CPU_WORLD_COM_URL);
// Absolute detail-page URLs, in document order.
List<String> fullUrls = tableData.select("table tr td:contains(a) a").stream()
.map(e -> "https://www.cpu-world.com/" + e.attr("href"))
.collect(Collectors.toList());
// Model names; same selector, so indexes line up with fullUrls.
List<String> fullModels = tableData.select("table tr td:contains(a) a").stream()
.map(e -> e.text())
.collect(Collectors.toList());
for (int i=0; i< fullUrls.size(); i++) {
System.out.println(fullModels.get(i) + " : " + fullUrls.get(i));
}
}
// Downloads the listing page, finds the <script> defining JSC_Process,
// slices out the scrambled payload between the known header/footer, and
// returns the descrambled HTML parsed into a Document.
private static Document getTableData(String url) {
Connection.Response response = null;
try {
response = Jsoup
.connect(url)
.headers(getHeaders())
.method(Connection.Method.GET)
.data()
.execute();
} catch (IOException e) {
// Hard-exit on network failure; nothing to recover here.
e.printStackTrace();
System.exit(1);
}
Elements script = Jsoup.parse(response.body()).select("script");
// take substring of the child node from after the header and before the footer (- 6 more chars which seem dynamic)
// The script tag containing JSC_Process is the one with the data in (but all mangled).
Optional<String> scrambledData = script.stream()
.filter(element -> element.data().contains("JSC_Process"))
.map(node -> node.data().substring(SCRAMBLED_DATA_HEADER.length(), (node.data().length() - SCRAMBLED_DATA_FOOTER.length()-6)))
.findFirst();
String tableData = Unscrambler.unscramble(scrambledData.orElseThrow(() -> new RuntimeException("scrambled data not found in relevant script tag")));
Document doc = Jsoup.parse(tableData);
return doc;
}
// NOTE(review): unused helper -- nothing in this class calls it.
private static boolean isNotEmptyString(Element node) {
return node.data() != null && !node.data().equals("");
}
/**
* trick server into thinking we're not a bot
* by telling the server we were referred by the server itself
* and give tell it we're using a Mozilla/Safari browser
**/
private static Map<String, String> getHeaders() {
Map<String, String> headersMap = new HashMap<>();
headersMap.put("User-Agent", "Mozilla/5.0 Safari/537.36");
headersMap.put("Referer", CPU_WORLD_COM_URL);
return headersMap;
}
}
// Reverses the character-pair scrambling cpu-world.com applies to its table
// data: the payload is a long string in which every adjacent pair of
// characters has been swapped.
class Unscrambler {
// NOTE(review): HEADER/FOOTER duplicate the constants in CpuWorld and are
// unused inside this class. The FOOTER literal contains "\w" and a bare
// "\n", which are not valid Java string escapes -- almost certainly garbled
// in transcription ("\\w" / "\\n" intended).
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// Undoes the scrambling: first strips the JavaScript string-escaping and
// line-continuation gunk, then swaps each adjacent character pair back.
// NOTE(review): the first two replace() literals are garbled in this copy
// ("\") is an unterminated string) and will not compile as-is --
// reconstruct them from the original source before use.
public static String unscramble(String data) {
String a=data.replace("\\"","'")
.replace("\\", "\")
.replace("\r", "")
.replace("\n", "")
.replace("\"+\r\n\"", ""); // remove gunk that mucks up processing in java
StringBuffer buffer = new StringBuffer();
int e = 0;
int ei = 2;
// This is effectively what the code in the footer is doing. Heavily un-obfuscated below.
// swap two chars around - through
for (int aj=0; aj < a.length()-2; aj+=2) {
// Take the pair a[e..ei) and append it reversed (second char first).
String xg = a.substring(e, ei);
buffer.append(xg.substring(1,2) + xg.substring(0,1));
e+=2;
ei+=2;
}
return buffer.toString().replace("\n","");
}
}
我正在尝试使用 Jsoup 在线解析 HTML 页面,从表格(table)中提取数据。我要解析的页面包含多个表格。
我该怎么做?
这是我要解析的示例页面:
https://www.cpu-world.com/info/AMD/AMD_A4-Series.html
我要提取的数据是Model Name和详情页的URL。
编辑:
这是我用来从详细信息页面提取数据的一些代码。
// Fetches one CPU detail page and iterates over the rows of its spec table.
try {
/**
* Works to iterate through the items at the following website
* https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
*/
URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");
// Fetch and parse the page with a 3000 ms timeout.
Document doc = Jsoup.parse(url, 3000);
// spec_table is the name of the class associated with the table
Elements table = doc.select("table.spec_table");
// All <tr> rows anywhere under the spec table.
Elements rows = table.select("tr");
Iterator<Element> rowIterator = rows.iterator();
// Skip the first (header) row.
// NOTE(review): throws NoSuchElementException if the table was not found.
rowIterator.next();
// NOTE(review): wasMatch is never updated or read below.
boolean wasMatch = false;
// Loop through all items in list
while (rowIterator.hasNext()) {
Element row = rowIterator.next();
Elements cols = row.select("td");
// First <td> holds the row label; NOTE(review): rowName is currently unused.
String rowName = cols.get(0).text();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
我一直在阅读一些教程和文档,但我似乎无法弄清楚如何浏览网页以提取我正在寻找的数据。我了解 HTML 和 CSS,但我只是在了解 Jsoup。
(我将其标记为 Android 因为那是我使用 Java 代码的地方。我猜没必要那么具体。)
这看起来像你想要的:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.URL;
/**
 * Scrapes a cpu-world.com CPU detail page with Jsoup and prints the model
 * name together with the link found in the "Model number" row.
 */
public class CpuWorld {

    public static void main(String[] args) throws IOException {
        /*
         * Works to iterate through the items at the following website
         * https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
         *
         * Fix: the original caught the IOException from the URL constructor,
         * printed the trace and fell through with url == null, which then
         * caused an NPE inside Jsoup.parse(). Since main already declares
         * IOException (MalformedURLException is a subclass), the try/catch
         * is unnecessary -- just let the exception propagate.
         */
        URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");

        // Fetch and parse the page with a 3000 ms timeout.
        Document doc = Jsoup.parse(url, 3000);

        // The anchor inside the "Model number" row carries both the model
        // name (text) and the detail-page URL (href). Build the selector
        // once instead of repeating the literal.
        String modelLinkSelector = "table tr:has(td:contains(Model number)) td b a";
        String modelNumber = doc.select(modelLinkSelector).text();
        String modelUrl = doc.select(modelLinkSelector).attr("href");
        System.out.println(modelNumber + " : " + modelUrl);
    }
}
如果这不是您想要的,请告诉我
编辑:结果:
A4-3300 : https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
Process finished with exit code 0
编辑:
这个办法相当离谱,但我们开始吧……剩下的就请您自己举一反三:遍历这些 URL,即可抓取您想要的各个型号的详情数据:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestTemplate;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
// Scrapes the AMD A4-Series listing page. The model table on that page is
// delivered scrambled inside a <script> tag and rebuilt client-side by the
// JSC_Process() JavaScript; this class slices the scrambled payload out of
// that script, descrambles it in Java (see Unscrambler) and parses the
// result with Jsoup.
public class CpuWorld {
// Listing page to scrape; also sent back as the Referer header below.
public static final String CPU_WORLD_COM_URL = "https://www.cpu-world.com/info/AMD/AMD_A4-Series.html";
// Fixed prefix/suffix of the scrambled script payload. Only their lengths
// are used (to cut the payload out of the script text).
// NOTE(review): the FOOTER literal contains "\w" and a bare "\n", which are
// not valid Java string escapes -- this looks garbled in transcription and
// will not compile as-is; the original presumably had "\\w" / "\\n".
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "//- qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// NOTE(review): unused field (and drags in a Spring dependency) -- candidate for removal.
private static RestTemplate restTemplate = new RestTemplate();
// Prints "model : url" for every model link found in the descrambled table.
public static void main(String[] args) throws IOException {
Document tableData = getTableData(CPU_WORLD_COM_URL);
// Absolute detail-page URLs, in document order.
List<String> fullUrls = tableData.select("table tr td:contains(a) a").stream()
.map(e -> "https://www.cpu-world.com/" + e.attr("href"))
.collect(Collectors.toList());
// Model names; same selector, so indexes line up with fullUrls.
List<String> fullModels = tableData.select("table tr td:contains(a) a").stream()
.map(e -> e.text())
.collect(Collectors.toList());
for (int i=0; i< fullUrls.size(); i++) {
System.out.println(fullModels.get(i) + " : " + fullUrls.get(i));
}
}
// Downloads the listing page, finds the <script> defining JSC_Process,
// slices out the scrambled payload between the known header/footer, and
// returns the descrambled HTML parsed into a Document.
private static Document getTableData(String url) {
Connection.Response response = null;
try {
response = Jsoup
.connect(url)
.headers(getHeaders())
.method(Connection.Method.GET)
.data()
.execute();
} catch (IOException e) {
// Hard-exit on network failure; nothing to recover here.
e.printStackTrace();
System.exit(1);
}
Elements script = Jsoup.parse(response.body()).select("script");
// take substring of the child node from after the header and before the footer (- 6 more chars which seem dynamic)
// The script tag containing JSC_Process is the one with the data in (but all mangled).
Optional<String> scrambledData = script.stream()
.filter(element -> element.data().contains("JSC_Process"))
.map(node -> node.data().substring(SCRAMBLED_DATA_HEADER.length(), (node.data().length() - SCRAMBLED_DATA_FOOTER.length()-6)))
.findFirst();
String tableData = Unscrambler.unscramble(scrambledData.orElseThrow(() -> new RuntimeException("scrambled data not found in relevant script tag")));
Document doc = Jsoup.parse(tableData);
return doc;
}
// NOTE(review): unused helper -- nothing in this class calls it.
private static boolean isNotEmptyString(Element node) {
return node.data() != null && !node.data().equals("");
}
/**
* trick server into thinking we're not a bot
* by telling the server we were referred by the server itself
* and give tell it we're using a Mozilla/Safari browser
**/
private static Map<String, String> getHeaders() {
Map<String, String> headersMap = new HashMap<>();
headersMap.put("User-Agent", "Mozilla/5.0 Safari/537.36");
headersMap.put("Referer", CPU_WORLD_COM_URL);
return headersMap;
}
}
// Reverses the character-pair scrambling cpu-world.com applies to its table
// data: the payload is a long string in which every adjacent pair of
// characters has been swapped.
class Unscrambler {
// NOTE(review): HEADER/FOOTER duplicate the constants in CpuWorld and are
// unused inside this class. The FOOTER literal contains "\w" and a bare
// "\n", which are not valid Java string escapes -- almost certainly garbled
// in transcription ("\\w" / "\\n" intended).
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// Undoes the scrambling: first strips the JavaScript string-escaping and
// line-continuation gunk, then swaps each adjacent character pair back.
// NOTE(review): the first two replace() literals are garbled in this copy
// ("\") is an unterminated string) and will not compile as-is --
// reconstruct them from the original source before use.
public static String unscramble(String data) {
String a=data.replace("\\"","'")
.replace("\\", "\")
.replace("\r", "")
.replace("\n", "")
.replace("\"+\r\n\"", ""); // remove gunk that mucks up processing in java
StringBuffer buffer = new StringBuffer();
int e = 0;
int ei = 2;
// This is effectively what the code in the footer is doing. Heavily un-obfuscated below.
// swap two chars around - through
for (int aj=0; aj < a.length()-2; aj+=2) {
// Take the pair a[e..ei) and append it reversed (second char first).
String xg = a.substring(e, ei);
buffer.append(xg.substring(1,2) + xg.substring(0,1));
e+=2;
ei+=2;
}
return buffer.toString().replace("\n","");
}
}