jsoup 中与 DOM Range 操作(如 extractContents() 等)等价的功能
jsoup equivalent of DOM Range operations like extractContents() etc
我正在尝试通过 jsoup DOM 模型提取和替换 JavaScript DocumentFragments 的等价物。
有人有现成的代码来模拟 DOM Range 的选区(selection)及其相关操作吗?我想选中一段文本,它可能跨越多个内联节点(例如 <a>、<span> 等),也可能在这类内联节点的中间开始或结束。在 JavaScript 中,Range 操作很容易:可以从中提取 DocumentFragment、用元素把它包裹起来等等。我猜 JavaScript 的 Range 会根据需要拆分内部节点,以正确处理此类提取和插入操作。我如何在 Java 中使用 jsoup 实现这一点?
编辑: 只是把思路说出来——可能需要先在范围内找到层级最高的 "peak" 元素,然后分别处理范围的起点和终点,把它们 "elevate"(提升)到 "peak level":如果起点是其父元素的第 0 个子节点,就直接跳到父元素;否则就在范围起始元素之前把父元素的 children 列表拆开……如果已有现成的代码,我宁愿复用(re-use)它,否则只能从头开始编写。
2015 年 12 月 18 日更新: 用我开发的工作代码发布了我的答案,见下文。
两点:
- JSoup 提供了一些将文本节点作为
String
对象进行操作的方法。
- Java 及其生态系统为操作
String
对象提供了强大的 API。
在从头开始编写 DOM 范围操作之前,您可以尝试使用上述两个选项找到自己的方法。
以下是 JSoup 中的一些方法 API:
Element#text()
获取此元素的组合未编码文本作为 String。
摘自API:
Given HTML <p>Hello <b>there</b> now! </p>
, p.text() returns "Hello there now!"
Element#text(String)
用传递的未编码文本替换此元素的当前文本。
Element#ownText
仅获取此元素的未编码文本,没有所有子元素的文本。
摘自API:
For example, given HTML <p>Hello <b>there</b> now!</p>
, p.ownText() returns "Hello now!", whereas p.text() returns "Hello there now!". Note that the text within the b element is not returned, as it is not a direct child of the p element.
您可能会发现这两个示例(recipes)也很有用:
这是我承诺的代码,用于将任意范围的 DOM 主体包装到任意 html 标记中,以便于提取、移动、替换、copy/paste 等操作等
2015 年 12 月 19 日更新 新增了一个带偏移量参数的 wrapRange() 方法变体,可以在文本中间拆分 TextNode:通过可选的偏移量指定范围在起始文本节点中从哪里开始、在结束文本节点中到哪里结束。现在可以在 jsoup DOM 模型中任意 copy/paste/移动。
待办事项:(为我自己或其他善良的人)
- 编写一个示例项目来演示这一点,加上一些测试用例,然后 post 到 GitHub。现在没时间,但似乎在我的应用程序中运行良好(处理来自网页和电子书的 HTML 代码以使用 TTS 朗读 - 参见 @Voice Aloud Reader app in Google Play)
RangeWrapper.java模块:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.parser.Tag;
import java.util.ArrayList;
/**
* Created by greg on 12/18/2015.
*/
public class RangeWrapper {
/**
* Wrap the supplied HTML around the "range" from startEl to endEl.*
* @param startEl the first element to be included into the range
* @param endEl the last element to be included into the range
* @param html HTML to wrap around this element, e.g.
* {@code <span class="head"></span>}. Can be arbitrarily deep.
* @return the wrapping element
*/
public static Element wrapRange(Node startEl, Node endEl, String html) {
if (startEl == endEl) { // special case
return (Element) startEl.wrap(html).parentNode();
}
int startDepth = NodeWalker.getNodeDepth(startEl);
int endDepth = NodeWalker.getNodeDepth(endEl);
int minDepth = getRangeMinDepth(startEl, endEl);
int n;
while (startDepth > minDepth) {
Element parent = (Element)startEl.parentNode();
if ((n = startEl.siblingIndex()) > 0) {
// splitting the parent
ArrayList<Node> children = new ArrayList<Node>(parent.childNodes());
Element parent2 = new Element(Tag.valueOf(parent.tagName()), parent.baseUri(), parent.attributes());
parent.after(parent2);
for (int i = n; i < children.size(); i++)
parent2.appendChild(children.get(i));
startEl = parent2;
} else {
startEl = parent;
}
startDepth--;
}
while (endDepth > minDepth) {
Element parent = (Element)endEl.parentNode();
if ((n = endEl.siblingIndex()) < parent.children().size()-1) {
// splitting the parent
ArrayList<Node> children = new ArrayList<Node>(parent.childNodes());
Element parent2 = new Element(Tag.valueOf(parent.tagName()), parent.baseUri(), parent.attributes());
parent.before(parent2);
for (int i = 0; i <= n; i++)
parent2.appendChild(children.get(i));
endEl = parent2;
} else {
endEl = parent;
}
endDepth--;
}
// Now startEl and endEl are on the same depth == minDepth.
// Wrap the range with our html string
Element range = (Element) startEl.wrap(html).parentNode();
Node nextToAppend;
do {
nextToAppend = range.nextSibling();
// If nextToAppend is null, something is really wrong...
// Commented out to let it crash and investigate,
// so far it did not happen.
//if (nextToAppend == null)
// break;
range.appendChild(nextToAppend);
} while (nextToAppend != endEl);
return range;
}
/**
* Wrap the supplied HTML around the "range" from startEl to endEl.*
* @param startEl the first element to be included into the range
* @param stOffset if startEl is TextNode, split at this offset
* and include only the tail. Otherwise ignored.
* @param endEl the last element to be included into the range
* @param endOffset if endEl is a Text node, split at this offset
* and include only the head. Otherwise ignored.
* @param html HTML to wrap around this element, e.g. {@code <span class="head"></span>}. Can be arbitrarily deep.
* @return the wrapping element
*/
public static Element wrapRange(Node startEl, int stOffset, Node endEl, int endOffset, String html) {
if (stOffset > 0 && startEl instanceof TextNode) {
TextNode tn = (TextNode) startEl;
if (endOffset < tn.getWholeText().length()-1) {
startEl = tn.splitText(stOffset); // Splits tn and adds tail to DOM, returns tail
}
}
if (endOffset > 0 && endEl instanceof TextNode) {
TextNode tn = (TextNode) endEl;
if (endOffset < tn.getWholeText().length()-1) {
tn.splitText(stOffset); // Splits tn and adds tail to DOM, we take head == original endEl
}
}
return wrapRange(startEl, endEl, html);
}
/**
* Calculate the depth of the range between the two given nodes, relative to body.
* The body has depth 0.
* @param startNode the first element to be included into the range
* @param endNode the last element to be included into the range
* @return minimum depth found in the range
*/
public static int getRangeMinDepth(final Node startNode, final Node endNode) {
class DepthVisitor implements NodeWalker.NodeWalkVisitor {
private int _minDepth = Integer.MAX_VALUE;
public boolean head(Node node, int depth) {
if (depth < _minDepth)
_minDepth = depth;
return true;
}
public boolean tail(Node node, int depth) {return true;}
int getMinDepth() { return _minDepth; }
};
DepthVisitor visitor = new DepthVisitor();
NodeWalker nw = new NodeWalker(visitor);
nw.walk(startNode, endNode);
return visitor.getMinDepth();
}
}
...以及上述代码使用的 NodeWalker.java,改编自 jsoup 包中的 NodeTraversor 和 NodeVisitor 类:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
/**
 * Depth-first node walker. Starting at an arbitrary node, it visits that node, its
 * descendants and then the nodes that follow it in document order, until an optional
 * end node has been completed or there is nothing left to visit.
 * <p>
 * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
 * </p>
 */
public class NodeWalker {
    private final NodeWalkVisitor visitor;

    /**
     * Create a new walker.
     *
     * @param visitor a class implementing the {@link NodeWalkVisitor} interface, to be called when visiting each node.
     */
    public NodeWalker(NodeWalkVisitor visitor) {
        this.visitor = visitor;
    }

    /**
     * Start a depth-first traverse from startNode onwards.
     *
     * @param startNode the arbitrary start point node point within body to traverse from.
     * @param endNode the arbitrary end point node point within body where we stop traverse
     * (after its descendants have been visited and its tail callback has run).
     * Can be null, in which case we walk until the end of the body.
     */
    public void walk(Node startNode, Node endNode) {
        Node node = startNode;
        int depth = getNodeDepth(startNode); // depth is relative to body, body is depth 0

        while (node != null) {
            if (!visitor.head(node, depth)) {
                break;
            }
            if (node.childNodeSize() > 0) {
                // Descend first.
                node = node.childNode(0);
                depth++;
            } else {
                // No children: close this node and every ancestor that has no further sibling.
                // The parentNode() check stops the climb at the root of trees that have no
                // body ancestor, where depth alone would let us step onto a null parent.
                while (node.nextSibling() == null && depth > 0 && node.parentNode() != null) {
                    if (!visitor.tail(node, depth) || node == endNode) {
                        return;
                    }
                    node = node.parentNode();
                    depth--;
                }
                if (!visitor.tail(node, depth) || node == endNode) {
                    break;
                }
                node = node.nextSibling();
            }
        }
    }

    /**
     * Calculate the depth of the given node relative to body. The body has depth 0.
     * If the node has no body ancestor, the number of nodes on the path from the node up to
     * the root (inclusive) is returned.
     *
     * @param givenNode the node within the body to calculate depth for.
     * @return the depth of the givenNode; 0 for null
     */
    public static int getNodeDepth(Node givenNode) {
        int depth = 0;
        Node node = givenNode;
        while (node != null && !isBody(node)) {
            depth++;
            node = node.parentNode();
        }
        return depth;
    }

    /** Tells whether the node is an element whose tag name is exactly "body". */
    private static boolean isBody(Node node) {
        return node instanceof Element && "body".equals(((Element) node).tagName());
    }

    /** Callbacks invoked by {@link NodeWalker#walk(Node, Node)}. */
    public interface NodeWalkVisitor {
        /**
         * Callback for when a node is first visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as computed by the walker, i.e. relative to body:
         * body has depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean head(Node node, int depth);

        /**
         * Callback for when a node is last visited, after all of its descendants have been visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as computed by the walker, i.e. relative to body:
         * body has depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean tail(Node node, int depth);
    }
}
格雷格
我正在尝试通过 jsoup DOM 模型提取和替换 JavaScript DocumentFragments 的等价物。
有人有现成的代码来模拟 DOM Range 的选区(selection)及其相关操作吗?我想选中一段文本,它可能跨越多个内联节点(例如 <a>、<span> 等),也可能在这类内联节点的中间开始或结束。在 JavaScript 中,Range 操作很容易:可以从中提取 DocumentFragment、用元素把它包裹起来等等。我猜 JavaScript 的 Range 会根据需要拆分内部节点,以正确处理此类提取和插入操作。我如何在 Java 中使用 jsoup 实现这一点?
编辑: 只是把思路说出来——可能需要先在范围内找到层级最高的 "peak" 元素,然后分别处理范围的起点和终点,把它们 "elevate"(提升)到 "peak level":如果起点是其父元素的第 0 个子节点,就直接跳到父元素;否则就在范围起始元素之前把父元素的 children 列表拆开……如果已有现成的代码,我宁愿复用(re-use)它,否则只能从头开始编写。
2015 年 12 月 18 日更新: 用我开发的工作代码发布了我的答案,见下文。
两点:
- JSoup 提供了一些将文本节点作为
String
对象进行操作的方法。 - Java 及其生态系统为操作
String
对象提供了强大的 API。
在从头开始编写 DOM 范围操作之前,您可以尝试使用上述两个选项找到自己的方法。
以下是 JSoup 中的一些方法 API:
Element#text()
获取此元素的组合未编码文本作为 String。
摘自API:Given HTML
<p>Hello <b>there</b> now! </p>
, p.text() returns "Hello there now!"
Element#text(String)
用传递的未编码文本替换此元素的当前文本。
Element#ownText
仅获取此元素的未编码文本,没有所有子元素的文本。
摘自API:For example, given HTML
<p>Hello <b>there</b> now!</p>
, p.ownText() returns "Hello now!", whereas p.text() returns "Hello there now!". Note that the text within the b element is not returned, as it is not a direct child of the p element.
您可能会发现这两个示例(recipes)也很有用:
这是我承诺的代码,用于将任意范围的 DOM 主体包装到任意 html 标记中,以便于提取、移动、替换、copy/paste 等操作等
2015 年 12 月 19 日更新 新增了一个带偏移量参数的 wrapRange() 方法变体,可以在文本中间拆分 TextNode:通过可选的偏移量指定范围在起始文本节点中从哪里开始、在结束文本节点中到哪里结束。现在可以在 jsoup DOM 模型中任意 copy/paste/移动。
待办事项:(为我自己或其他善良的人)
- 编写一个示例项目来演示这一点,加上一些测试用例,然后 post 到 GitHub。现在没时间,但似乎在我的应用程序中运行良好(处理来自网页和电子书的 HTML 代码以使用 TTS 朗读 - 参见 @Voice Aloud Reader app in Google Play)
RangeWrapper.java模块:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.parser.Tag;
import java.util.ArrayList;
/**
* Created by greg on 12/18/2015.
*/
public class RangeWrapper {
/**
* Wrap the supplied HTML around the "range" from startEl to endEl.*
* @param startEl the first element to be included into the range
* @param endEl the last element to be included into the range
* @param html HTML to wrap around this element, e.g.
* {@code <span class="head"></span>}. Can be arbitrarily deep.
* @return the wrapping element
*/
public static Element wrapRange(Node startEl, Node endEl, String html) {
if (startEl == endEl) { // special case
return (Element) startEl.wrap(html).parentNode();
}
int startDepth = NodeWalker.getNodeDepth(startEl);
int endDepth = NodeWalker.getNodeDepth(endEl);
int minDepth = getRangeMinDepth(startEl, endEl);
int n;
while (startDepth > minDepth) {
Element parent = (Element)startEl.parentNode();
if ((n = startEl.siblingIndex()) > 0) {
// splitting the parent
ArrayList<Node> children = new ArrayList<Node>(parent.childNodes());
Element parent2 = new Element(Tag.valueOf(parent.tagName()), parent.baseUri(), parent.attributes());
parent.after(parent2);
for (int i = n; i < children.size(); i++)
parent2.appendChild(children.get(i));
startEl = parent2;
} else {
startEl = parent;
}
startDepth--;
}
while (endDepth > minDepth) {
Element parent = (Element)endEl.parentNode();
if ((n = endEl.siblingIndex()) < parent.children().size()-1) {
// splitting the parent
ArrayList<Node> children = new ArrayList<Node>(parent.childNodes());
Element parent2 = new Element(Tag.valueOf(parent.tagName()), parent.baseUri(), parent.attributes());
parent.before(parent2);
for (int i = 0; i <= n; i++)
parent2.appendChild(children.get(i));
endEl = parent2;
} else {
endEl = parent;
}
endDepth--;
}
// Now startEl and endEl are on the same depth == minDepth.
// Wrap the range with our html string
Element range = (Element) startEl.wrap(html).parentNode();
Node nextToAppend;
do {
nextToAppend = range.nextSibling();
// If nextToAppend is null, something is really wrong...
// Commented out to let it crash and investigate,
// so far it did not happen.
//if (nextToAppend == null)
// break;
range.appendChild(nextToAppend);
} while (nextToAppend != endEl);
return range;
}
/**
* Wrap the supplied HTML around the "range" from startEl to endEl.*
* @param startEl the first element to be included into the range
* @param stOffset if startEl is TextNode, split at this offset
* and include only the tail. Otherwise ignored.
* @param endEl the last element to be included into the range
* @param endOffset if endEl is a Text node, split at this offset
* and include only the head. Otherwise ignored.
* @param html HTML to wrap around this element, e.g. {@code <span class="head"></span>}. Can be arbitrarily deep.
* @return the wrapping element
*/
public static Element wrapRange(Node startEl, int stOffset, Node endEl, int endOffset, String html) {
if (stOffset > 0 && startEl instanceof TextNode) {
TextNode tn = (TextNode) startEl;
if (endOffset < tn.getWholeText().length()-1) {
startEl = tn.splitText(stOffset); // Splits tn and adds tail to DOM, returns tail
}
}
if (endOffset > 0 && endEl instanceof TextNode) {
TextNode tn = (TextNode) endEl;
if (endOffset < tn.getWholeText().length()-1) {
tn.splitText(stOffset); // Splits tn and adds tail to DOM, we take head == original endEl
}
}
return wrapRange(startEl, endEl, html);
}
/**
* Calculate the depth of the range between the two given nodes, relative to body.
* The body has depth 0.
* @param startNode the first element to be included into the range
* @param endNode the last element to be included into the range
* @return minimum depth found in the range
*/
public static int getRangeMinDepth(final Node startNode, final Node endNode) {
class DepthVisitor implements NodeWalker.NodeWalkVisitor {
private int _minDepth = Integer.MAX_VALUE;
public boolean head(Node node, int depth) {
if (depth < _minDepth)
_minDepth = depth;
return true;
}
public boolean tail(Node node, int depth) {return true;}
int getMinDepth() { return _minDepth; }
};
DepthVisitor visitor = new DepthVisitor();
NodeWalker nw = new NodeWalker(visitor);
nw.walk(startNode, endNode);
return visitor.getMinDepth();
}
}
...以及上述代码使用的 NodeWalker.java,改编自 jsoup 包中的 NodeTraversor 和 NodeVisitor 类:
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
/**
 * Depth-first node walker. Starting at an arbitrary node, it visits that node, its
 * descendants and then the nodes that follow it in document order, until an optional
 * end node has been completed or there is nothing left to visit.
 * <p>
 * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
 * </p>
 */
public class NodeWalker {
    private final NodeWalkVisitor visitor;

    /**
     * Create a new walker.
     *
     * @param visitor a class implementing the {@link NodeWalkVisitor} interface, to be called when visiting each node.
     */
    public NodeWalker(NodeWalkVisitor visitor) {
        this.visitor = visitor;
    }

    /**
     * Start a depth-first traverse from startNode onwards.
     *
     * @param startNode the arbitrary start point node point within body to traverse from.
     * @param endNode the arbitrary end point node point within body where we stop traverse
     * (after its descendants have been visited and its tail callback has run).
     * Can be null, in which case we walk until the end of the body.
     */
    public void walk(Node startNode, Node endNode) {
        Node node = startNode;
        int depth = getNodeDepth(startNode); // depth is relative to body, body is depth 0

        while (node != null) {
            if (!visitor.head(node, depth)) {
                break;
            }
            if (node.childNodeSize() > 0) {
                // Descend first.
                node = node.childNode(0);
                depth++;
            } else {
                // No children: close this node and every ancestor that has no further sibling.
                // The parentNode() check stops the climb at the root of trees that have no
                // body ancestor, where depth alone would let us step onto a null parent.
                while (node.nextSibling() == null && depth > 0 && node.parentNode() != null) {
                    if (!visitor.tail(node, depth) || node == endNode) {
                        return;
                    }
                    node = node.parentNode();
                    depth--;
                }
                if (!visitor.tail(node, depth) || node == endNode) {
                    break;
                }
                node = node.nextSibling();
            }
        }
    }

    /**
     * Calculate the depth of the given node relative to body. The body has depth 0.
     * If the node has no body ancestor, the number of nodes on the path from the node up to
     * the root (inclusive) is returned.
     *
     * @param givenNode the node within the body to calculate depth for.
     * @return the depth of the givenNode; 0 for null
     */
    public static int getNodeDepth(Node givenNode) {
        int depth = 0;
        Node node = givenNode;
        while (node != null && !isBody(node)) {
            depth++;
            node = node.parentNode();
        }
        return depth;
    }

    /** Tells whether the node is an element whose tag name is exactly "body". */
    private static boolean isBody(Node node) {
        return node instanceof Element && "body".equals(((Element) node).tagName());
    }

    /** Callbacks invoked by {@link NodeWalker#walk(Node, Node)}. */
    public interface NodeWalkVisitor {
        /**
         * Callback for when a node is first visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as computed by the walker, i.e. relative to body:
         * body has depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean head(Node node, int depth);

        /**
         * Callback for when a node is last visited, after all of its descendants have been visited.
         *
         * @param node the node being visited.
         * @param depth the depth of the node as computed by the walker, i.e. relative to body:
         * body has depth 0, and a child node of that will have depth 1.
         * @return true to continue walk, false to abort
         */
        boolean tail(Node node, int depth);
    }
}
格雷格