jsoup:对文本的操作

jsoup: manipulation on text

我有一个html文件,例如:

<!DOCTYPE html>
<html>
<body>

<h1>My First Heading</h1>

<p>My first paragraph.</p>

</body>
</html>

我在 java 中编写了一个方法,可以将文本符号从拉丁文转换为西里尔文,例如:

/**
 * Converts a single Latin letter to its Cyrillic counterpart.
 *
 * Only 'A', 'a', 'B' and 'b' are mapped; any other character is
 * returned as it was received.
 *
 * @param charSent the character to convert
 * @return the mapped Cyrillic character, or {@code charSent} itself when
 *         no mapping is defined for it
 */
public static char changeLetterLatCyr(char charSent) {
    switch (charSent) {
        case 'A':
            return 'А';
        case 'a':
            return 'а';
        case 'B':
            return 'Б';
        case 'b':
            return 'б';
        default:
            // No mapping defined: hand the character back untouched.
            return charSent;
    }
}

如何在保留所有标签结构的前提下,将我的函数应用到文档中的所有文本上?该函数会把每个字符转换为对应的特定字符。

我需要这个结果:

<!DOCTYPE html>
<html>
<body>

<h1>some manipulation on text</h1>

<p>some manipulation on text</p>

</body>
</html>

下面是具体的实现方法,其中起关键作用的是 NodeVisitor。

Java

package com.github.davidepastore.Whosebug33463949;

import java.io.IOException;
import java.io.InputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;

/**
 * Whosebug 33463949 question.
 *
 */
public class App {

    /**
     * Starts the app here.
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        ClassLoader classloader = Thread.currentThread()
                .getContextClassLoader();
        InputStream is = classloader.getResourceAsStream("file.html");
        Document document = Jsoup.parse(is, "UTF-8", "");
        Elements elements = document.select("body");
        manipulateElements(elements);

        System.out.println("Result: " + document.toString());
    }

    /**
     * Manipulate the {@link Elements}.
     * @param elements The {@link Elements} to manipulate.
     */
    private static void manipulateElements(Elements elements) {
        elements.traverse(new NodeVisitor() {

            public void tail(Node node, int depth) {
            }

            public void head(Node node, int depth) {
                if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    String text = textNode.text().trim();
                    if (!text.isEmpty()) {
                        char[] newChars = new char[text.length()];
                        for (int i = 0; i < text.length(); i++) {
                            newChars[i] = changeLetterLatCyr(text.charAt(i));
                        }
                        textNode.text(new String(newChars));
                    }
                }
            }
        });
    }

    /**
     * Your own custom change letter method.
     * @param charSent The char to convert.
     * @return Returns the converted char.
     */
    public static char changeLetterLatCyr(char charSent) {
        char l_A = 'A',
            l_a = 'a',
            l_B = 'B',
            l_b = 'b',
            r_A = 'А',
            r_a = 'а',
            r_B = 'Б',
            r_b = 'б',
            result = ' ';

        if (charSent == l_A) {
            result = r_A;
        } else if (charSent == l_a) {
            result = r_a;
        } else if (charSent == l_B) {
            result = r_B;
        } else if (charSent == l_b) {
            result = r_b;
        } else {
            result = charSent;
        }
        return result;
    }
}

HTML

<!DOCTYPE html>
<html>
<body>

<h1>My First Heading</h1>

<p>My first paragraph.</p>
<div>
    <p>A a B b Complex structure</p>
</div>

</body>
</html>

输出

Result: <!doctype html>
<html>
 <head></head>
 <body> 
  <h1>My First Heаding</h1> 
  <p>My first pаrаgrаph.</p> 
  <div> 
   <p>А а Б б Complex structure</p> 
  </div>  
 </body>
</html>