用短语标记文本

Tokenizing Text With a Phrase

我在一列中有文本,想根据文本中存在的词将其分成块。

假设某条记录中的文本如下:

 Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please

我想把每位说话者(例如 Alice)所说的全部内容分别拆分出来,并作为新属性添加到同一条记录中,结果大致如下所示。

意味着与 "Alice:" 关联的所有文本将在一个属性中,而与 "Bob:" 关联的所有文本将在另一个属性中。 这可以在 Rapidminer 中完成吗?

是的,它可以在 RapidMiner 中完成。您可以使用或不使用文本处理扩展(即使用剪切文档、标记化等)以多种方式执行此操作。但如果它已经在一列中,我将只使用 Split 和像 Alice.* 这样的 RegEx 表达式,然后使用 Pivot:

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.000">
  <!--
    Demo process: builds a two-line sample chat document, breaks it into
    "Name: text" pieces, and concatenates the text pieces per name.
    The operators are wired as one linear chain by the connect elements at the
    end of the inner process element.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.0.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample input: two chat lines separated by a line-feed character reference. -->
      <operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="136">
        <parameter key="text" value=" Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please&#10; Alice: Awesome Bob: Thanks! Alice: How's life? Bob: not bad"/>
      </operator>
      <!-- Turns the document into an example set; the text attribute is named "text" and meta information is switched off. -->
      <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="179" y="136">
        <parameter key="text_attribute" value="text"/>
        <parameter key="add_meta_information" value="false"/>
      </operator>
      <!-- Splits "text" on line feeds. The Transpose and Rename that follow presumably turn each chat line into its own example with the text back in attribute "text" (verify in RapidMiner). -->
      <operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (3)" width="90" x="313" y="187">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="text"/>
        <parameter key="split_pattern" value="\n"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (3)" width="90" x="447" y="187"/>
      <operator activated="true" class="rename" compatibility="8.0.000" expanded="true" height="82" name="Rename (2)" width="90" x="581" y="187">
        <parameter key="old_name" value="att_1"/>
        <parameter key="new_name" value="text"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Splits "text" at any whitespace character that is immediately followed by ASCII letters and a colon, i.e. right before each "Name:" marker. -->
      <operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="text"/>
        <parameter key="split_pattern" value="\s(?=[A-Za-z]+:)"/>
      </operator>
      <!-- Transpose followed by De-Pivot: gathers every attribute matching text.* into one attribute "bar", with "foo" as the index attribute. -->
      <operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose" width="90" x="380" y="34"/>
      <operator activated="true" class="de_pivot" compatibility="8.0.000" expanded="true" height="82" name="De-Pivot" width="90" x="514" y="34">
        <list key="attribute_name">
          <parameter key="bar" value="text.*"/>
        </list>
        <parameter key="index_attribute" value="foo"/>
      </operator>
      <!-- Splits "bar" at a colon followed by whitespace. Later operators refer to the parts as bar_1 and bar_2 (presumably the name and the spoken text). -->
      <operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (2)" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="bar"/>
        <parameter key="split_pattern" value="[:]\s"/>
      </operator>
      <!-- Trim has no parameters set here, so it runs with its defaults. -->
      <operator activated="true" class="trim" compatibility="8.0.000" expanded="true" height="82" name="Trim" width="90" x="782" y="34"/>
      <!-- Keeps only the attributes whose names match the regular expression bar.* -->
      <operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="916" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="bar.*"/>
      </operator>
      <!-- Keeps only examples where bar_1 is not missing. -->
      <operator activated="true" class="filter_examples" compatibility="8.0.000" expanded="true" height="103" name="Filter Examples" width="90" x="1050" y="34">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="bar_1.is_not_missing."/>
        </list>
      </operator>
      <!-- Concatenates the bar_2 values, grouped by bar_1. -->
      <operator activated="true" class="aggregate" compatibility="8.0.000" expanded="true" height="82" name="Aggregate (2)" width="90" x="1184" y="34">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="attribute" value="bar_2"/>
        <parameter key="default_aggregation_function" value="concatenation"/>
        <list key="aggregation_attributes">
          <parameter key="bar_2" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="bar_1"/>
      </operator>
      <!-- Transpose, Rename by Example Values and Filter Example Range (examples 1 to 1): presumably promote the bar_1 values to attribute names and keep a single result row; confirm in RapidMiner. -->
      <operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (2)" width="90" x="1318" y="34"/>
      <operator activated="true" class="rename_by_example_values" compatibility="8.0.000" expanded="true" height="82" name="Rename by Example Values" width="90" x="1452" y="34"/>
      <operator activated="true" class="filter_example_range" compatibility="8.0.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1586" y="34">
        <parameter key="first_example" value="1"/>
        <parameter key="last_example" value="1"/>
      </operator>
      <!-- Inverted single-attribute selection: removes bar_1 and keeps everything else, special attributes included. -->
      <operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes" width="90" x="1787" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="bar_1"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Replaces every "|" with a space; presumably "|" is the separator inserted by the concatenation aggregate (verify). -->
      <operator activated="true" class="replace" compatibility="8.0.000" expanded="true" height="82" name="Replace" width="90" x="1921" y="34">
        <parameter key="replace_what" value="[|]"/>
        <parameter key="replace_by" value=" "/>
      </operator>
      <!-- Wiring: each operator's output feeds the next operator in the chain; the output of Replace is delivered to result 1. -->
      <connect from_op="Create Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Split (3)" to_port="example set input"/>
      <connect from_op="Split (3)" from_port="example set output" to_op="Transpose (3)" to_port="example set input"/>
      <connect from_op="Transpose (3)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
      <connect from_op="Rename (2)" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
      <connect from_op="De-Pivot" from_port="example set output" to_op="Split (2)" to_port="example set input"/>
      <connect from_op="Split (2)" from_port="example set output" to_op="Trim" to_port="example set input"/>
      <connect from_op="Trim" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
      <connect from_op="Aggregate (2)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
      <connect from_op="Transpose (2)" from_port="example set output" to_op="Rename by Example Values" to_port="example set input"/>
      <connect from_op="Rename by Example Values" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
      <connect from_op="Filter Example Range" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

我注意到您是 Java 开发人员。如果您想走这条路,下面是解析这段文本最简单的方法。当然,如果名称中包含空格,您可能会遇到问题。

我额外添加了一些文本,用来演示如何处理未知的发言者。

import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;

public class ChatMiner {
    private static final Pattern NAME_TOKEN  = Pattern.compile("^(?<name>\w+):$");
    private static final String NAME_GROUP   = "name";
    private static final String UNKNOWN_NAME = "UNKNOWN";
    private static final String WHITE_SPACE  = "\s+";

    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        String[] names = { "Alice", "Bob" };
        Map<String, List<String>> map = parseChat(text, names);

        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-8s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    public static Map<String, List<String>> parseChat(String input, String... names) {
        Map<String, List<String>> result = new HashMap<String, List<String>>();

        List<String> nameList = new ArrayList<String>(Arrays.asList(names));
        nameList.add(UNKNOWN_NAME);
        nameList.forEach(name -> result.put(name, new ArrayList<String>()));

        String[] tokens = input.split(WHITE_SPACE);
        String currentName = null;

        for (String token : tokens) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                if (nameList.contains(m.group(NAME_GROUP))) {
                    currentName = m.group(NAME_GROUP);
                } else {
                    currentName = UNKNOWN_NAME;
                }
                continue;
            }

            if (currentName != null) {
                List<String> words = result.get(currentName);
                words.add(token);
                result.put(currentName, words);
            }
        }

        return result;
    }

    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}

输出

Bob     : "Hi", "Tea,please"
Alice   : "Hello", "Coffee?"
UNKNOWN : "Hello", "guys!"

动态方法

如果您愿意,可以不预先声明姓名,而是在遇到新的发言者时动态生成姓名映射。

import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;

public class ChatMiner {
    private static final Pattern NAME_TOKEN  = Pattern.compile("^(?<name>\w+):$");
    private static final String NAME_GROUP   = "name";
    private static final String WHITE_SPACE  = "\s+";

    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        Map<String, List<String>> map = parseChat(text);

        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-6s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    public static Map<String, List<String>> parseChat(String input) {
        Map<String, List<String>> result = new HashMap<String, List<String>>();

        String[] tokens = input.split(WHITE_SPACE);
        String currentName = null;

        for (String token : tokens) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                String name = m.group(NAME_GROUP);
                if (!result.containsKey(name)) {
                    result.put(name, new ArrayList<String>());
                }
                currentName = name;
                continue;
            }

            if (currentName != null) {
                List<String> words = result.get(currentName);
                words.add(token);
                result.put(currentName, words);
            }
        }

        return result;
    }

    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}

输出

Bob   : "Hi", "Tea,please"
James : "Hello", "guys!"
Alice : "Hello", "Coffee?"