Android 使用 Jsoup 解析嵌套表格
Android parse nested tables with Jsoup
我正在尝试使用 Jsoup 在线解析 HTML 页面,从表格(table)中提取数据。我要解析的页面包含多个表格。
我该怎么做?
这是我要解析的示例页面:
https://www.cpu-world.com/info/AMD/AMD_A4-Series.html
我要提取的数据是Model Name和详情页的URL。
编辑:
这是我用来从详细信息页面提取数据的一些代码。
// Fetches one CPU detail page and iterates over the rows of its spec table.
try {
/**
* Works to iterate through the items at the following website
* https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
*/
URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");
// Fetch and parse the page with a 3000 ms timeout.
Document doc = Jsoup.parse(url, 3000);
// spec_table is the name of the class associated with the table
Elements table = doc.select("table.spec_table");
// All <tr> rows anywhere under the spec table.
Elements rows = table.select("tr");
Iterator<Element> rowIterator = rows.iterator();
// Skip the first (header) row.
// NOTE(review): throws NoSuchElementException if the table was not found.
rowIterator.next();
// NOTE(review): wasMatch is never updated or read below.
boolean wasMatch = false;
// Loop through all items in list
while (rowIterator.hasNext()) {
Element row = rowIterator.next();
Elements cols = row.select("td");
// First <td> holds the row label; NOTE(review): rowName is currently unused.
String rowName = cols.get(0).text();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
我一直在阅读一些教程和文档,但我似乎无法弄清楚如何浏览网页以提取我正在寻找的数据。我了解 HTML 和 CSS,但我只是在了解 Jsoup。
(我将其标记为 Android 因为那是我使用 Java 代码的地方。我猜没必要那么具体。)
这看起来像你想要的:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.URL;
/**
 * Scrapes a cpu-world.com CPU detail page with Jsoup and prints the model
 * name together with the link found in the "Model number" row.
 */
public class CpuWorld {

    public static void main(String[] args) throws IOException {
        /*
         * Works to iterate through the items at the following website
         * https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
         *
         * Fix: the original caught the IOException from the URL constructor,
         * printed the trace and fell through with url == null, which then
         * caused an NPE inside Jsoup.parse(). Since main already declares
         * IOException (MalformedURLException is a subclass), the try/catch
         * is unnecessary -- just let the exception propagate.
         */
        URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");

        // Fetch and parse the page with a 3000 ms timeout.
        Document doc = Jsoup.parse(url, 3000);

        // The anchor inside the "Model number" row carries both the model
        // name (text) and the detail-page URL (href). Build the selector
        // once instead of repeating the literal.
        String modelLinkSelector = "table tr:has(td:contains(Model number)) td b a";
        String modelNumber = doc.select(modelLinkSelector).text();
        String modelUrl = doc.select(modelLinkSelector).attr("href");
        System.out.println(modelNumber + " : " + modelUrl);
    }
}
如果这不是您想要的,请告诉我
编辑:结果:
A4-3300 : https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
Process finished with exit code 0
编辑:
这个办法相当离谱,但我们开始吧……剩下的就请您自己举一反三:遍历这些 URL,即可抓取您想要的各个型号的详情数据:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestTemplate;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
// Scrapes the AMD A4-Series listing page. The model table on that page is
// delivered scrambled inside a <script> tag and rebuilt client-side by the
// JSC_Process() JavaScript; this class slices the scrambled payload out of
// that script, descrambles it in Java (see Unscrambler) and parses the
// result with Jsoup.
public class CpuWorld {
// Listing page to scrape; also sent back as the Referer header below.
public static final String CPU_WORLD_COM_URL = "https://www.cpu-world.com/info/AMD/AMD_A4-Series.html";
// Fixed prefix/suffix of the scrambled script payload. Only their lengths
// are used (to cut the payload out of the script text).
// NOTE(review): the FOOTER literal contains "\w" and a bare "\n", which are
// not valid Java string escapes -- this looks garbled in transcription and
// will not compile as-is; the original presumably had "\\w" / "\\n".
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "//- qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// NOTE(review): unused field (and drags in a Spring dependency) -- candidate for removal.
private static RestTemplate restTemplate = new RestTemplate();
// Prints "model : url" for every model link found in the descrambled table.
public static void main(String[] args) throws IOException {
Document tableData = getTableData(CPU_WORLD_COM_URL);
// Absolute detail-page URLs, in document order.
List<String> fullUrls = tableData.select("table tr td:contains(a) a").stream()
.map(e -> "https://www.cpu-world.com/" + e.attr("href"))
.collect(Collectors.toList());
// Model names; same selector, so indexes line up with fullUrls.
List<String> fullModels = tableData.select("table tr td:contains(a) a").stream()
.map(e -> e.text())
.collect(Collectors.toList());
for (int i=0; i< fullUrls.size(); i++) {
System.out.println(fullModels.get(i) + " : " + fullUrls.get(i));
}
}
// Downloads the listing page, finds the <script> defining JSC_Process,
// slices out the scrambled payload between the known header/footer, and
// returns the descrambled HTML parsed into a Document.
private static Document getTableData(String url) {
Connection.Response response = null;
try {
response = Jsoup
.connect(url)
.headers(getHeaders())
.method(Connection.Method.GET)
.data()
.execute();
} catch (IOException e) {
// Hard-exit on network failure; nothing to recover here.
e.printStackTrace();
System.exit(1);
}
Elements script = Jsoup.parse(response.body()).select("script");
// take substring of the child node from after the header and before the footer (- 6 more chars which seem dynamic)
// The script tag containing JSC_Process is the one with the data in (but all mangled).
Optional<String> scrambledData = script.stream()
.filter(element -> element.data().contains("JSC_Process"))
.map(node -> node.data().substring(SCRAMBLED_DATA_HEADER.length(), (node.data().length() - SCRAMBLED_DATA_FOOTER.length()-6)))
.findFirst();
String tableData = Unscrambler.unscramble(scrambledData.orElseThrow(() -> new RuntimeException("scrambled data not found in relevant script tag")));
Document doc = Jsoup.parse(tableData);
return doc;
}
// NOTE(review): unused helper -- nothing in this class calls it.
private static boolean isNotEmptyString(Element node) {
return node.data() != null && !node.data().equals("");
}
/**
* trick server into thinking we're not a bot
* by telling the server we were referred by the server itself
* and give tell it we're using a Mozilla/Safari browser
**/
private static Map<String, String> getHeaders() {
Map<String, String> headersMap = new HashMap<>();
headersMap.put("User-Agent", "Mozilla/5.0 Safari/537.36");
headersMap.put("Referer", CPU_WORLD_COM_URL);
return headersMap;
}
}
// Reverses the character-pair scrambling cpu-world.com applies to its table
// data: the payload is a long string in which every adjacent pair of
// characters has been swapped.
class Unscrambler {
// NOTE(review): HEADER/FOOTER duplicate the constants in CpuWorld and are
// unused inside this class. The FOOTER literal contains "\w" and a bare
// "\n", which are not valid Java string escapes -- almost certainly garbled
// in transcription ("\\w" / "\\n" intended).
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// Undoes the scrambling: first strips the JavaScript string-escaping and
// line-continuation gunk, then swaps each adjacent character pair back.
// NOTE(review): the first two replace() literals are garbled in this copy
// ("\") is an unterminated string) and will not compile as-is --
// reconstruct them from the original source before use.
public static String unscramble(String data) {
String a=data.replace("\\"","'")
.replace("\\", "\")
.replace("\r", "")
.replace("\n", "")
.replace("\"+\r\n\"", ""); // remove gunk that mucks up processing in java
StringBuffer buffer = new StringBuffer();
int e = 0;
int ei = 2;
// This is effectively what the code in the footer is doing. Heavily un-obfuscated below.
// swap two chars around - through
for (int aj=0; aj < a.length()-2; aj+=2) {
// Take the pair a[e..ei) and append it reversed (second char first).
String xg = a.substring(e, ei);
buffer.append(xg.substring(1,2) + xg.substring(0,1));
e+=2;
ei+=2;
}
return buffer.toString().replace("\n","");
}
}
我正在尝试使用 Jsoup 在线解析 HTML 页面,从表格(table)中提取数据。我要解析的页面包含多个表格。
我该怎么做?
这是我要解析的示例页面:
https://www.cpu-world.com/info/AMD/AMD_A4-Series.html
我要提取的数据是Model Name和详情页的URL。
编辑:
这是我用来从详细信息页面提取数据的一些代码。
// Fetches one CPU detail page and iterates over the rows of its spec table.
try {
/**
* Works to iterate through the items at the following website
* https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
*/
URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");
// Fetch and parse the page with a 3000 ms timeout.
Document doc = Jsoup.parse(url, 3000);
// spec_table is the name of the class associated with the table
Elements table = doc.select("table.spec_table");
// All <tr> rows anywhere under the spec table.
Elements rows = table.select("tr");
Iterator<Element> rowIterator = rows.iterator();
// Skip the first (header) row.
// NOTE(review): throws NoSuchElementException if the table was not found.
rowIterator.next();
// NOTE(review): wasMatch is never updated or read below.
boolean wasMatch = false;
// Loop through all items in list
while (rowIterator.hasNext()) {
Element row = rowIterator.next();
Elements cols = row.select("td");
// First <td> holds the row label; NOTE(review): rowName is currently unused.
String rowName = cols.get(0).text();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
我一直在阅读一些教程和文档,但我似乎无法弄清楚如何浏览网页以提取我正在寻找的数据。我了解 HTML 和 CSS,但我只是在了解 Jsoup。
(我将其标记为 Android 因为那是我使用 Java 代码的地方。我猜没必要那么具体。)
这看起来像你想要的:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.net.URL;
/**
 * Scrapes a cpu-world.com CPU detail page with Jsoup and prints the model
 * name together with the link found in the "Model number" row.
 */
public class CpuWorld {

    public static void main(String[] args) throws IOException {
        /*
         * Works to iterate through the items at the following website
         * https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
         *
         * Fix: the original caught the IOException from the URL constructor,
         * printed the trace and fell through with url == null, which then
         * caused an NPE inside Jsoup.parse(). Since main already declares
         * IOException (MalformedURLException is a subclass), the try/catch
         * is unnecessary -- just let the exception propagate.
         */
        URL url = new URL("https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html");

        // Fetch and parse the page with a 3000 ms timeout.
        Document doc = Jsoup.parse(url, 3000);

        // The anchor inside the "Model number" row carries both the model
        // name (text) and the detail-page URL (href). Build the selector
        // once instead of repeating the literal.
        String modelLinkSelector = "table tr:has(td:contains(Model number)) td b a";
        String modelNumber = doc.select(modelLinkSelector).text();
        String modelUrl = doc.select(modelLinkSelector).attr("href");
        System.out.println(modelNumber + " : " + modelUrl);
    }
}
如果这不是您想要的,请告诉我
编辑:结果:
A4-3300 : https://www.cpu-world.com/CPUs/K10/AMD-A4-Series%20A4-3300.html
Process finished with exit code 0
编辑:
这个办法相当离谱,但我们开始吧……剩下的就请您自己举一反三:遍历这些 URL,即可抓取您想要的各个型号的详情数据:
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.client.RestTemplate;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
// Scrapes the AMD A4-Series listing page. The model table on that page is
// delivered scrambled inside a <script> tag and rebuilt client-side by the
// JSC_Process() JavaScript; this class slices the scrambled payload out of
// that script, descrambles it in Java (see Unscrambler) and parses the
// result with Jsoup.
public class CpuWorld {
// Listing page to scrape; also sent back as the Referer header below.
public static final String CPU_WORLD_COM_URL = "https://www.cpu-world.com/info/AMD/AMD_A4-Series.html";
// Fixed prefix/suffix of the scrambled script payload. Only their lengths
// are used (to cut the payload out of the script text).
// NOTE(review): the FOOTER literal contains "\w" and a bare "\n", which are
// not valid Java string escapes -- this looks garbled in transcription and
// will not compile as-is; the original presumably had "\\w" / "\\n".
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "//- qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// NOTE(review): unused field (and drags in a Spring dependency) -- candidate for removal.
private static RestTemplate restTemplate = new RestTemplate();
// Prints "model : url" for every model link found in the descrambled table.
public static void main(String[] args) throws IOException {
Document tableData = getTableData(CPU_WORLD_COM_URL);
// Absolute detail-page URLs, in document order.
List<String> fullUrls = tableData.select("table tr td:contains(a) a").stream()
.map(e -> "https://www.cpu-world.com/" + e.attr("href"))
.collect(Collectors.toList());
// Model names; same selector, so indexes line up with fullUrls.
List<String> fullModels = tableData.select("table tr td:contains(a) a").stream()
.map(e -> e.text())
.collect(Collectors.toList());
for (int i=0; i< fullUrls.size(); i++) {
System.out.println(fullModels.get(i) + " : " + fullUrls.get(i));
}
}
// Downloads the listing page, finds the <script> defining JSC_Process,
// slices out the scrambled payload between the known header/footer, and
// returns the descrambled HTML parsed into a Document.
private static Document getTableData(String url) {
Connection.Response response = null;
try {
response = Jsoup
.connect(url)
.headers(getHeaders())
.method(Connection.Method.GET)
.data()
.execute();
} catch (IOException e) {
// Hard-exit on network failure; nothing to recover here.
e.printStackTrace();
System.exit(1);
}
Elements script = Jsoup.parse(response.body()).select("script");
// take substring of the child node from after the header and before the footer (- 6 more chars which seem dynamic)
// The script tag containing JSC_Process is the one with the data in (but all mangled).
Optional<String> scrambledData = script.stream()
.filter(element -> element.data().contains("JSC_Process"))
.map(node -> node.data().substring(SCRAMBLED_DATA_HEADER.length(), (node.data().length() - SCRAMBLED_DATA_FOOTER.length()-6)))
.findFirst();
String tableData = Unscrambler.unscramble(scrambledData.orElseThrow(() -> new RuntimeException("scrambled data not found in relevant script tag")));
Document doc = Jsoup.parse(tableData);
return doc;
}
// NOTE(review): unused helper -- nothing in this class calls it.
private static boolean isNotEmptyString(Element node) {
return node.data() != null && !node.data().equals("");
}
/**
* trick server into thinking we're not a bot
* by telling the server we were referred by the server itself
* and give tell it we're using a Mozilla/Safari browser
**/
private static Map<String, String> getHeaders() {
Map<String, String> headersMap = new HashMap<>();
headersMap.put("User-Agent", "Mozilla/5.0 Safari/537.36");
headersMap.put("Referer", CPU_WORLD_COM_URL);
return headersMap;
}
}
// Reverses the character-pair scrambling cpu-world.com applies to its table
// data: the payload is a long string in which every adjacent pair of
// characters has been swapped.
class Unscrambler {
// NOTE(review): HEADER/FOOTER duplicate the constants in CpuWorld and are
// unused inside this class. The FOOTER literal contains "\w" and a bare
// "\n", which are not valid Java string escapes -- almost certainly garbled
// in transcription ("\\w" / "\\n" intended).
public static final String SCRAMBLED_DATA_HEADER = "<!--\r\nfunction JSC_Process () {var bk,xg,qh,k,aj,y,e,cq,u,a,ei;\r\na=\"";
public static final String SCRAMBLED_DATA_FOOTER = "qh=[\"\"];k=[2];cq=[7600];if (CW_AB){if\t((AB_v!='0')&&(AB_v!='X')&&(AB_Gl((AB_v=='')?99:3)==3)){y=1;AB_cb=function(){JSC_Process();};}else{y=2;}}for(aj=e=0;aj<k.length;aj++){ei=cq[aj];bk=qh[aj];if (!bk) bk=\"JSc_\"+aj;u=CW_E(bk);if (u){bk=\" jsc_a\";if (y>=k[aj]){xg=a.substr(e,ei);xg=xg.replace(/(.)(.)/g,\"\");u.innerHTML=xg.replace(/\\n/g,\"\n\");bk='';}u.className=u.className.replace(/(^| )jsc_\w+$/,bk);}e+=ei;}}JSC_Process();";
// Undoes the scrambling: first strips the JavaScript string-escaping and
// line-continuation gunk, then swaps each adjacent character pair back.
// NOTE(review): the first two replace() literals are garbled in this copy
// ("\") is an unterminated string) and will not compile as-is --
// reconstruct them from the original source before use.
public static String unscramble(String data) {
String a=data.replace("\\"","'")
.replace("\\", "\")
.replace("\r", "")
.replace("\n", "")
.replace("\"+\r\n\"", ""); // remove gunk that mucks up processing in java
StringBuffer buffer = new StringBuffer();
int e = 0;
int ei = 2;
// This is effectively what the code in the footer is doing. Heavily un-obfuscated below.
// swap two chars around - through
for (int aj=0; aj < a.length()-2; aj+=2) {
// Take the pair a[e..ei) and append it reversed (second char first).
String xg = a.substring(e, ei);
buffer.append(xg.substring(1,2) + xg.substring(0,1));
e+=2;
ei+=2;
}
return buffer.toString().replace("\n","");
}
}