jsoup:对文本的操作
jsoup: manipulation on text
我有一个html文件,例如:
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>
我用 Java 编写了一个方法,可以把文本中的字符从拉丁字母转换为西里尔字母,例如:
/**
 * Maps a Latin letter to its Cyrillic counterpart.
 *
 * Only 'A', 'a', 'B' and 'b' are translated; any other character is
 * returned unchanged.
 *
 * @param charSent the character to convert
 * @return the Cyrillic counterpart, or {@code charSent} itself when no mapping exists
 */
public static char changeLetterLatCyr(char charSent) {
    // Parallel alphabets: the character at index i of "latin" maps to index i of "cyrillic".
    final String latin = "AaBb";
    final String cyrillic = "АаБб";
    int position = latin.indexOf(charSent);
    return position >= 0 ? cyrillic.charAt(position) : charSent;
}
如何在保留所有标签结构的前提下,对文档中的所有文本应用我的这个方法?该方法会把每个字符替换为对应的特定字符。
我需要这个结果:
<!DOCTYPE html>
<html>
<body>
<h1>some manipulation on text</h1>
<p>some manipulation on text</p>
</body>
</html>
可以按下面的方式实现,关键在于使用 NodeVisitor 遍历所有节点。
Java
package com.github.davidepastore.Whosebug33463949;
import java.io.IOException;
import java.io.InputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
/**
* Whosebug 33463949 question.
*
*/
public class App {
/**
* Starts the app here.
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
ClassLoader classloader = Thread.currentThread()
.getContextClassLoader();
InputStream is = classloader.getResourceAsStream("file.html");
Document document = Jsoup.parse(is, "UTF-8", "");
Elements elements = document.select("body");
manipulateElements(elements);
System.out.println("Result: " + document.toString());
}
/**
* Manipulate the {@link Elements}.
* @param elements The {@link Elements} to manipulate.
*/
private static void manipulateElements(Elements elements) {
elements.traverse(new NodeVisitor() {
public void tail(Node node, int depth) {
}
public void head(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String text = textNode.text().trim();
if (!text.isEmpty()) {
char[] newChars = new char[text.length()];
for (int i = 0; i < text.length(); i++) {
newChars[i] = changeLetterLatCyr(text.charAt(i));
}
textNode.text(new String(newChars));
}
}
}
});
}
/**
* Your own custom change letter method.
* @param charSent The char to convert.
* @return Returns the converted char.
*/
public static char changeLetterLatCyr(char charSent) {
char l_A = 'A',
l_a = 'a',
l_B = 'B',
l_b = 'b',
r_A = 'А',
r_a = 'а',
r_B = 'Б',
r_b = 'б',
result = ' ';
if (charSent == l_A) {
result = r_A;
} else if (charSent == l_a) {
result = r_a;
} else if (charSent == l_B) {
result = r_B;
} else if (charSent == l_b) {
result = r_b;
} else {
result = charSent;
}
return result;
}
}
HTML
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<div>
<p>A a B b Complex structure</p>
</div>
</body>
</html>
输出
Result: <!doctype html>
<html>
<head></head>
<body>
<h1>My First Heаding</h1>
<p>My first pаrаgrаph.</p>
<div>
<p>А а Б б Complex structure</p>
</div>
</body>
</html>
我有一个html文件,例如:
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>
我用 Java 编写了一个方法,可以把文本中的字符从拉丁字母转换为西里尔字母,例如:
/**
 * Maps a Latin letter to its Cyrillic counterpart.
 *
 * Only 'A', 'a', 'B' and 'b' are translated; any other character is
 * returned unchanged.
 *
 * @param charSent the character to convert
 * @return the Cyrillic counterpart, or {@code charSent} itself when no mapping exists
 */
public static char changeLetterLatCyr(char charSent) {
    // Parallel alphabets: the character at index i of "latin" maps to index i of "cyrillic".
    final String latin = "AaBb";
    final String cyrillic = "АаБб";
    int position = latin.indexOf(charSent);
    return position >= 0 ? cyrillic.charAt(position) : charSent;
}
如何在保留所有标签结构的前提下,对文档中的所有文本应用我的这个方法?该方法会把每个字符替换为对应的特定字符。
我需要这个结果:
<!DOCTYPE html>
<html>
<body>
<h1>some manipulation on text</h1>
<p>some manipulation on text</p>
</body>
</html>
可以按下面的方式实现,关键在于使用 NodeVisitor 遍历所有节点。
Java
package com.github.davidepastore.Whosebug33463949;
import java.io.IOException;
import java.io.InputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
/**
* Whosebug 33463949 question.
*
*/
public class App {
/**
* Starts the app here.
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException {
ClassLoader classloader = Thread.currentThread()
.getContextClassLoader();
InputStream is = classloader.getResourceAsStream("file.html");
Document document = Jsoup.parse(is, "UTF-8", "");
Elements elements = document.select("body");
manipulateElements(elements);
System.out.println("Result: " + document.toString());
}
/**
* Manipulate the {@link Elements}.
* @param elements The {@link Elements} to manipulate.
*/
private static void manipulateElements(Elements elements) {
elements.traverse(new NodeVisitor() {
public void tail(Node node, int depth) {
}
public void head(Node node, int depth) {
if (node instanceof TextNode) {
TextNode textNode = (TextNode) node;
String text = textNode.text().trim();
if (!text.isEmpty()) {
char[] newChars = new char[text.length()];
for (int i = 0; i < text.length(); i++) {
newChars[i] = changeLetterLatCyr(text.charAt(i));
}
textNode.text(new String(newChars));
}
}
}
});
}
/**
* Your own custom change letter method.
* @param charSent The char to convert.
* @return Returns the converted char.
*/
public static char changeLetterLatCyr(char charSent) {
char l_A = 'A',
l_a = 'a',
l_B = 'B',
l_b = 'b',
r_A = 'А',
r_a = 'а',
r_B = 'Б',
r_b = 'б',
result = ' ';
if (charSent == l_A) {
result = r_A;
} else if (charSent == l_a) {
result = r_a;
} else if (charSent == l_B) {
result = r_B;
} else if (charSent == l_b) {
result = r_b;
} else {
result = charSent;
}
return result;
}
}
HTML
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<div>
<p>A a B b Complex structure</p>
</div>
</body>
</html>
输出
Result: <!doctype html>
<html>
<head></head>
<body>
<h1>My First Heаding</h1>
<p>My first pаrаgrаph.</p>
<div>
<p>А а Б б Complex structure</p>
</div>
</body>
</html>