需要帮助:使用 jsoup 从 html 中解析 mp3 文件的 url
Need help parsing mp3 files' urls from html with jsoup
我有一个异步方法,用于填充 POJO 类的字段,这些字段的数据是通过 jsoup 解析得到的。我正在尝试通过 foreach 从该页面解析本书各个章节的 mp3 文件的 url,但我尝试过的所有查询都失败了。
http://www.loyalbooks.com/book/adventures-of-huckleberry-finn-by-mark-twain
单个元素在页面代码中看起来像这样,并且 id 号在章节之间变化
<div class="jp-free-media" style="font-size:xx-small;">(<a id="jp_playlist_1_item_0_mp3" href="http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3" tabindex="1">download</a>)</div>
我的AsyncTask,在mLines2中搜索mp3网址:
public class FillBook extends AsyncTask<Void, Void, SingleBook> {
private String link;
private String imgLink;
private String title;
ArrayList<String> tmpChapters = new ArrayList<>();
private SingleBook book;
/**
 * Creates a task for one book page; the arguments are only stored here.
 *
 * @param link    URL of the book page; later passed to {@code Jsoup.connect} in doInBackground
 * @param imgLink cover image link; later handed to the SingleBook constructor
 * @param title   book title; later handed to the SingleBook constructor
 */
public FillBook(String link, String imgLink, String title) {
this.link = link;
this.imgLink = imgLink;
this.title = title;
}
/**
 * Fetches the page at {@code link} with jsoup, copies the text of the
 * "book-description" elements into the book and tries to collect chapter
 * URLs into {@code tmpChapters}.
 *
 * @param params unused
 * @return the SingleBook built here, with whatever chapters were collected
 */
@Override
protected SingleBook doInBackground(Void... params) {
Document doc = null;
book = new SingleBook(imgLink, title, false, false, null, new ArrayList<String>());
Elements mLines;
Elements mLines2;
try {
doc = Jsoup.connect(link).get();
} catch (IOException | RuntimeException e) {
// The failure is only printed; doc stays null and the "ERROR" branch below runs.
e.printStackTrace();
}
if (doc != null) {
mLines = doc.getElementsByClass("book-description");
// If several elements match, the last one's text wins.
for (Element mLine : mLines) {
String description= mLine.text();
book.setDescription(description);
}
// NOTE(review): ".jp-free-media" matches the wrapper <div>; in the sample markup
// the href attribute sits on the nested <a>, so attr("href") is read from an
// element that does not carry it. The surrounding write-up also reports that
// these elements are generated by JavaScript, so they are presumably absent
// from the HTML jsoup downloads — verify against the raw page source.
mLines2 = doc.select(".jp-free-media");
for (Element mLine2 : mLines2) {
tmpChapters.add(mLine2.attr("href"));
}
}else
System.out.println("ERROR");
// Runs on both paths: the else above covers only the println.
book.setChapters(tmpChapters);
return book;
}
protected void onPostExecute(SingleBook book) {
super.onPostExecute(book);
Toast.makeText(BookActivity.this, book.getChapters().get(0), Toast.LENGTH_LONG).show();
Picasso.get().load(book.getImgUrl()).into(bookCover);
nameAndAuthor.setText(book.getTitleAndAuthor());
bookDescription.setText(book.getDescription());
最后我得到了空的 ArrayList。
如何获得 http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3 考虑到下一章将是 id="jp_playlist_1_item_1_mp3" 的字符串?
来自俄语版 Stack Overflow 的 Tiarait 帮助找到了解决方案。重点是上面提到的元素是由 js 创建的。我需要获取文档正文,然后通过拆分获取以下数组。
var audioPlaylist = new Playlist("1", [
{name:"Chapter 01", free:true, mp3:"http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3"},
{name:"Chapter 02", free:true, mp3:"http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_02_twain_64kb.mp3"},
...
doInBackground 方法应更改为:
@Override
protected SingleBook doInBackground(Void... params) {
Document doc = null;
book = new SingleBook(imgLink, title, false, false, null, new ArrayList<String>());
Elements mLines;
try {
doc = Jsoup.connect(link).get();
} catch (IOException | RuntimeException e) {
e.printStackTrace();
}
if (doc != null) {
mLines = doc.getElementsByClass("book-description");
for (Element mLine : mLines) {
String description= mLine.text();
book.setDescription(description);
}
String arr = "";
String html = doc.body().html();
if (html.contains("var audioPlaylist = new Playlist(\"1\", ["))
arr = html.split("var audioPlaylist = new Playlist\(\"1\", \[")[1];
if (arr.contains("]"))
arr = arr.split("\]")[0];
//-----------------------------------------
if (arr.contains("},{")) {
for (String mLine2 : arr.split("\},\{")) {
if (mLine2.contains("mp3:\""))
tmpChapters.add(mLine2.split("mp3:\"")[1].split("\"")[0]);
}
} else if (arr.contains("mp3:\""))
tmpChapters.add(arr.split("mp3:\"")[1].split("\"")[0]);
}else
System.out.println("ERROR");
book.setChapters(tmpChapters);
return book;
}
我有一个异步方法,用于填充 POJO 类的字段,这些字段的数据是通过 jsoup 解析得到的。我正在尝试通过 foreach 从该页面解析本书各个章节的 mp3 文件的 url,但我尝试过的所有查询都失败了。
http://www.loyalbooks.com/book/adventures-of-huckleberry-finn-by-mark-twain
单个元素在页面代码中看起来像这样,并且 id 号在章节之间变化
<div class="jp-free-media" style="font-size:xx-small;">(<a id="jp_playlist_1_item_0_mp3" href="http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3" tabindex="1">download</a>)</div>
我的AsyncTask,在mLines2中搜索mp3网址:
public class FillBook extends AsyncTask<Void, Void, SingleBook> {
private String link;
private String imgLink;
private String title;
ArrayList<String> tmpChapters = new ArrayList<>();
private SingleBook book;
/**
 * Creates a task for one book page; the arguments are only stored here.
 *
 * @param link    URL of the book page; later passed to {@code Jsoup.connect} in doInBackground
 * @param imgLink cover image link; later handed to the SingleBook constructor
 * @param title   book title; later handed to the SingleBook constructor
 */
public FillBook(String link, String imgLink, String title) {
this.link = link;
this.imgLink = imgLink;
this.title = title;
}
/**
 * Fetches the page at {@code link} with jsoup, copies the text of the
 * "book-description" elements into the book and tries to collect chapter
 * URLs into {@code tmpChapters}.
 *
 * @param params unused
 * @return the SingleBook built here, with whatever chapters were collected
 */
@Override
protected SingleBook doInBackground(Void... params) {
Document doc = null;
book = new SingleBook(imgLink, title, false, false, null, new ArrayList<String>());
Elements mLines;
Elements mLines2;
try {
doc = Jsoup.connect(link).get();
} catch (IOException | RuntimeException e) {
// The failure is only printed; doc stays null and the "ERROR" branch below runs.
e.printStackTrace();
}
if (doc != null) {
mLines = doc.getElementsByClass("book-description");
// If several elements match, the last one's text wins.
for (Element mLine : mLines) {
String description= mLine.text();
book.setDescription(description);
}
// NOTE(review): ".jp-free-media" matches the wrapper <div>; in the sample markup
// the href attribute sits on the nested <a>, so attr("href") is read from an
// element that does not carry it. The surrounding write-up also reports that
// these elements are generated by JavaScript, so they are presumably absent
// from the HTML jsoup downloads — verify against the raw page source.
mLines2 = doc.select(".jp-free-media");
for (Element mLine2 : mLines2) {
tmpChapters.add(mLine2.attr("href"));
}
}else
System.out.println("ERROR");
// Runs on both paths: the else above covers only the println.
book.setChapters(tmpChapters);
return book;
}
protected void onPostExecute(SingleBook book) {
super.onPostExecute(book);
Toast.makeText(BookActivity.this, book.getChapters().get(0), Toast.LENGTH_LONG).show();
Picasso.get().load(book.getImgUrl()).into(bookCover);
nameAndAuthor.setText(book.getTitleAndAuthor());
bookDescription.setText(book.getDescription());
最后我得到了空的 ArrayList。 如何获得 http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3 考虑到下一章将是 id="jp_playlist_1_item_1_mp3" 的字符串?
来自俄语版 Stack Overflow 的 Tiarait 帮助找到了解决方案。重点是上面提到的元素是由 js 创建的。我需要获取文档正文,然后通过拆分获取以下数组。
var audioPlaylist = new Playlist("1", [ {name:"Chapter 01", free:true, mp3:"http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_01_twain_64kb.mp3"}, {name:"Chapter 02", free:true, mp3:"http://www.archive.org/download/huckleberry_mfs_librivox/huckleberry_finn_02_twain_64kb.mp3"}, ...
doInBackground 方法应更改为:
@Override
protected SingleBook doInBackground(Void... params) {
Document doc = null;
book = new SingleBook(imgLink, title, false, false, null, new ArrayList<String>());
Elements mLines;
try {
doc = Jsoup.connect(link).get();
} catch (IOException | RuntimeException e) {
e.printStackTrace();
}
if (doc != null) {
mLines = doc.getElementsByClass("book-description");
for (Element mLine : mLines) {
String description= mLine.text();
book.setDescription(description);
}
String arr = "";
String html = doc.body().html();
if (html.contains("var audioPlaylist = new Playlist(\"1\", ["))
arr = html.split("var audioPlaylist = new Playlist\(\"1\", \[")[1];
if (arr.contains("]"))
arr = arr.split("\]")[0];
//-----------------------------------------
if (arr.contains("},{")) {
for (String mLine2 : arr.split("\},\{")) {
if (mLine2.contains("mp3:\""))
tmpChapters.add(mLine2.split("mp3:\"")[1].split("\"")[0]);
}
} else if (arr.contains("mp3:\""))
tmpChapters.add(arr.split("mp3:\"")[1].split("\"")[0]);
}else
System.out.println("ERROR");
book.setChapters(tmpChapters);
return book;
}