用短语标记文本
Tokenizing Text With a Phrase
我在一列中有文本,想根据文本中存在的词将其分成块。
在一条记录中说这是我的文字:
Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please
我想把它拆分为 Alice 说过的所有内容和 Bob 说过的所有内容,并将它们作为新属性添加到同一条记录中。结果应类似于:Alice 属性为 "Hello Coffee?",Bob 属性为 "Hi Tea,please"。
意味着与 "Alice:" 关联的所有文本将在一个属性中,而与 "Bob:" 关联的所有文本将在另一个属性中。
这可以在 Rapidminer 中完成吗?
是的,它可以在 RapidMiner 中完成。您可以使用或不使用文本处理扩展(即使用剪切文档、标记化等)以多种方式执行此操作。但如果它已经在一列中,我将只使用 Split 和像 Alice.* 这样的 RegEx 表达式,然后使用 Pivot:
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.000">
<!-- RapidMiner process that takes a text of "Name: utterance" pairs and regroups the utterances by speaker. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.000" expanded="true" name="Process">
<process expanded="true">
<!-- Sample input: a single document with alternating "Alice:" and "Bob:" utterances. -->
<operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="136">
<parameter key="text" value=" Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please Alice: Awesome Bob: Thanks! Alice: How's life? Bob: not bad"/>
</operator>
<!-- Convert the document into an example set whose text attribute is named "text". -->
<operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="179" y="136">
<parameter key="text_attribute" value="text"/>
<parameter key="add_meta_information" value="false"/>
</operator>
<!-- Split "text" on the pattern \n; the sample text above has no line breaks (presumably kept for multi-line input, confirm). -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (3)" width="90" x="313" y="187">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
<parameter key="split_pattern" value="\n"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (3)" width="90" x="447" y="187"/>
<!-- After transposing, rename att_1 back to "text" so the next Split can address it by name. -->
<operator activated="true" class="rename" compatibility="8.0.000" expanded="true" height="82" name="Rename (2)" width="90" x="581" y="187">
<parameter key="old_name" value="att_1"/>
<parameter key="new_name" value="text"/>
<list key="rename_additional_attributes"/>
</operator>
<!-- Split on a whitespace character that is immediately followed by a "Name:" label; the lookahead leaves the label attached to its utterance. -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split" width="90" x="246" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
<parameter key="split_pattern" value="\s(?=[A-Za-z]+:)"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose" width="90" x="380" y="34"/>
<!-- Stack all attributes matching text.* into one attribute "bar", using "foo" as the index attribute. -->
<operator activated="true" class="de_pivot" compatibility="8.0.000" expanded="true" height="82" name="De-Pivot" width="90" x="514" y="34">
<list key="attribute_name">
<parameter key="bar" value="text.*"/>
</list>
<parameter key="index_attribute" value="foo"/>
</operator>
<!-- Split "bar" at a colon followed by whitespace; the operators below refer to the parts as bar_1 and bar_2 (presumably speaker and utterance). -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (2)" width="90" x="648" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="bar"/>
<parameter key="split_pattern" value="[:]\s"/>
</operator>
<operator activated="true" class="trim" compatibility="8.0.000" expanded="true" height="82" name="Trim" width="90" x="782" y="34"/>
<!-- Keep only the attributes whose names match bar.* -->
<operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="916" y="34">
<parameter key="attribute_filter_type" value="regular_expression"/>
<parameter key="regular_expression" value="bar.*"/>
</operator>
<!-- Drop rows where bar_1 is missing. -->
<operator activated="true" class="filter_examples" compatibility="8.0.000" expanded="true" height="103" name="Filter Examples" width="90" x="1050" y="34">
<list key="filters_list">
<parameter key="filters_entry_key" value="bar_1.is_not_missing."/>
</list>
</operator>
<!-- Concatenate the bar_2 values within each bar_1 group. -->
<operator activated="true" class="aggregate" compatibility="8.0.000" expanded="true" height="82" name="Aggregate (2)" width="90" x="1184" y="34">
<parameter key="use_default_aggregation" value="true"/>
<parameter key="attribute" value="bar_2"/>
<parameter key="default_aggregation_function" value="concatenation"/>
<list key="aggregation_attributes">
<parameter key="bar_2" value="concatenation"/>
</list>
<parameter key="group_by_attributes" value="bar_1"/>
</operator>
<!-- Transpose and rename by example values, presumably so that each bar_1 value becomes an attribute name (confirm against the operator documentation). -->
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (2)" width="90" x="1318" y="34"/>
<operator activated="true" class="rename_by_example_values" compatibility="8.0.000" expanded="true" height="82" name="Rename by Example Values" width="90" x="1452" y="34"/>
<!-- Keep only the first remaining example. -->
<operator activated="true" class="filter_example_range" compatibility="8.0.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1586" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="1"/>
</operator>
<!-- Inverted selection: remove the attribute named bar_1 and keep everything else. -->
<operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes" width="90" x="1787" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="bar_1"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Replace every "|" with a space; presumably "|" is the separator emitted by the concatenation aggregate (confirm). -->
<operator activated="true" class="replace" compatibility="8.0.000" expanded="true" height="82" name="Replace" width="90" x="1921" y="34">
<parameter key="replace_what" value="[|]"/>
<parameter key="replace_by" value=" "/>
</operator>
<!-- Wiring: the operators are chained linearly in the order they are declared above, ending at result 1. -->
<connect from_op="Create Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
<connect from_op="Documents to Data" from_port="example set" to_op="Split (3)" to_port="example set input"/>
<connect from_op="Split (3)" from_port="example set output" to_op="Transpose (3)" to_port="example set input"/>
<connect from_op="Transpose (3)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Split" to_port="example set input"/>
<connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
<connect from_op="De-Pivot" from_port="example set output" to_op="Split (2)" to_port="example set input"/>
<connect from_op="Split (2)" from_port="example set output" to_op="Trim" to_port="example set input"/>
<connect from_op="Trim" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
<connect from_op="Aggregate (2)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
<connect from_op="Transpose (2)" from_port="example set output" to_op="Rename by Example Values" to_port="example set input"/>
<connect from_op="Rename by Example Values" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
<connect from_op="Replace" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
我注意到您是 Java 开发人员。如果您想走这条路,下面是解析该文本的最简单方法。当然,如果名称中包含空格,您可能会遇到问题。
我添加了一些额外的文本,用来演示如何处理未声明(未知)的发言者。
import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;
/**
 * Groups the words of a flat chat transcript by speaker, for a declared list of speakers.
 *
 * <p>The transcript is split on whitespace. A token of the form {@code Name:} switches the
 * current speaker; every other token is appended to the current speaker's word list. Labels
 * that are not among the declared names are collected under {@code "UNKNOWN"}. Because tokens
 * are whitespace-delimited, speaker names that contain spaces are not recognised.
 */
public class ChatMiner {
    /** Matches a token that consists solely of a speaker label such as {@code Alice:}. */
    private static final Pattern NAME_TOKEN = Pattern.compile("^(?<name>\\w+):$");
    /** Name of the capturing group in {@link #NAME_TOKEN} that holds the speaker name. */
    private static final String NAME_GROUP = "name";
    /** Bucket for words spoken by anyone who was not declared up front. */
    private static final String UNKNOWN_NAME = "UNKNOWN";
    /** Token separator: one or more whitespace characters, compiled once and reused. */
    private static final Pattern WHITE_SPACE = Pattern.compile("\\s+");

    /** Demo: parses a sample chat in which James is not a declared speaker. */
    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        String[] names = { "Alice", "Bob" };
        Map<String, List<String>> map = parseChat(text, names);
        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-8s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    /**
     * Splits {@code input} into words and assigns each word to the speaker whose label
     * most recently preceded it.
     *
     * @param input the transcript, e.g. {@code "Alice: Hello Bob: Hi"}
     * @param names the speakers to track; any other label is filed under {@code "UNKNOWN"}
     * @return a map with one entry per declared name plus {@code "UNKNOWN"}; speakers that
     *         never spoke map to an empty list. Words that appear before the first label
     *         are dropped.
     */
    public static Map<String, List<String>> parseChat(String input, String... names) {
        Map<String, List<String>> result = new HashMap<>();
        for (String name : names) {
            result.put(name, new ArrayList<>());
        }
        result.put(UNKNOWN_NAME, new ArrayList<>());
        List<String> unknownWords = result.get(UNKNOWN_NAME);

        // Word list of the speaker currently talking; null until the first label is seen.
        List<String> currentWords = null;
        for (String token : WHITE_SPACE.split(input)) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                // The map's keys are exactly the declared names plus UNKNOWN, so a single
                // map lookup replaces the linear scan over a separate name list.
                currentWords = result.getOrDefault(m.group(NAME_GROUP), unknownWords);
            } else if (currentWords != null) {
                // The list is already stored in the map; adding to it is sufficient.
                currentWords.add(token);
            }
        }
        return result;
    }

    /** Wraps every item in double quotes and joins the items with {@code ", "}. */
    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}
输出
Bob     : "Hi", "Tea,please"
Alice   : "Hello", "Coffee?"
UNKNOWN : "Hello", "guys!"
动态方法
如果您愿意,可以不预先声明姓名,而是在遇到新的发言者时动态地把该姓名加入映射。
import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;
/**
 * Groups the words of a flat chat transcript by speaker, discovering speakers on the fly.
 *
 * <p>The transcript is split on whitespace. A token of the form {@code Name:} switches the
 * current speaker (registering the name the first time it is seen); every other token is
 * appended to the current speaker's word list. Because tokens are whitespace-delimited,
 * speaker names that contain spaces are not recognised.
 */
public class ChatMiner {
    /** Matches a token that consists solely of a speaker label such as {@code Alice:}. */
    private static final Pattern NAME_TOKEN = Pattern.compile("^(?<name>\\w+):$");
    /** Name of the capturing group in {@link #NAME_TOKEN} that holds the speaker name. */
    private static final String NAME_GROUP = "name";
    /** Token separator: one or more whitespace characters, compiled once and reused. */
    private static final Pattern WHITE_SPACE = Pattern.compile("\\s+");

    /** Demo: parses a sample chat with three speakers and prints each speaker's words. */
    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        Map<String, List<String>> map = parseChat(text);
        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-6s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    /**
     * Splits {@code input} into words and assigns each word to the speaker whose label
     * most recently preceded it.
     *
     * @param input the transcript, e.g. {@code "Alice: Hello Bob: Hi"}
     * @return a map with one entry per speaker label found in the input; words that appear
     *         before the first label are dropped
     */
    public static Map<String, List<String>> parseChat(String input) {
        Map<String, List<String>> result = new HashMap<>();

        // Word list of the speaker currently talking; null until the first label is seen.
        List<String> currentWords = null;
        for (String token : WHITE_SPACE.split(input)) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                // Register the speaker on first sight and switch to their word list.
                currentWords = result.computeIfAbsent(m.group(NAME_GROUP), k -> new ArrayList<>());
            } else if (currentWords != null) {
                // The list is already stored in the map; adding to it is sufficient.
                currentWords.add(token);
            }
        }
        return result;
    }

    /** Wraps every item in double quotes and joins the items with {@code ", "}. */
    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}
输出
Bob   : "Hi", "Tea,please"
James : "Hello", "guys!"
Alice : "Hello", "Coffee?"
我在一列中有文本,想根据文本中存在的词将其分成块。
在一条记录中说这是我的文字:
Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please
我想把它拆分为 Alice 说过的所有内容和 Bob 说过的所有内容,并将它们作为新属性添加到同一条记录中。结果应类似于:Alice 属性为 "Hello Coffee?",Bob 属性为 "Hi Tea,please"。
意味着与 "Alice:" 关联的所有文本将在一个属性中,而与 "Bob:" 关联的所有文本将在另一个属性中。 这可以在 Rapidminer 中完成吗?
是的,它可以在 RapidMiner 中完成。您可以使用或不使用文本处理扩展(即使用剪切文档、标记化等)以多种方式执行此操作。但如果它已经在一列中,我将只使用 Split 和像 Alice.* 这样的 RegEx 表达式,然后使用 Pivot:
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.000">
<!-- RapidMiner process that takes a text of "Name: utterance" pairs and regroups the utterances by speaker. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.000" expanded="true" name="Process">
<process expanded="true">
<!-- Sample input: a single document with alternating "Alice:" and "Bob:" utterances. -->
<operator activated="true" class="text:create_document" compatibility="7.5.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="136">
<parameter key="text" value=" Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please Alice: Awesome Bob: Thanks! Alice: How's life? Bob: not bad"/>
</operator>
<!-- Convert the document into an example set whose text attribute is named "text". -->
<operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="179" y="136">
<parameter key="text_attribute" value="text"/>
<parameter key="add_meta_information" value="false"/>
</operator>
<!-- Split "text" on the pattern \n; the sample text above has no line breaks (presumably kept for multi-line input, confirm). -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (3)" width="90" x="313" y="187">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
<parameter key="split_pattern" value="\n"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (3)" width="90" x="447" y="187"/>
<!-- After transposing, rename att_1 back to "text" so the next Split can address it by name. -->
<operator activated="true" class="rename" compatibility="8.0.000" expanded="true" height="82" name="Rename (2)" width="90" x="581" y="187">
<parameter key="old_name" value="att_1"/>
<parameter key="new_name" value="text"/>
<list key="rename_additional_attributes"/>
</operator>
<!-- Split on a whitespace character that is immediately followed by a "Name:" label; the lookahead leaves the label attached to its utterance. -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split" width="90" x="246" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
<parameter key="split_pattern" value="\s(?=[A-Za-z]+:)"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose" width="90" x="380" y="34"/>
<!-- Stack all attributes matching text.* into one attribute "bar", using "foo" as the index attribute. -->
<operator activated="true" class="de_pivot" compatibility="8.0.000" expanded="true" height="82" name="De-Pivot" width="90" x="514" y="34">
<list key="attribute_name">
<parameter key="bar" value="text.*"/>
</list>
<parameter key="index_attribute" value="foo"/>
</operator>
<!-- Split "bar" at a colon followed by whitespace; the operators below refer to the parts as bar_1 and bar_2 (presumably speaker and utterance). -->
<operator activated="true" class="split" compatibility="8.0.000" expanded="true" height="82" name="Split (2)" width="90" x="648" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="bar"/>
<parameter key="split_pattern" value="[:]\s"/>
</operator>
<operator activated="true" class="trim" compatibility="8.0.000" expanded="true" height="82" name="Trim" width="90" x="782" y="34"/>
<!-- Keep only the attributes whose names match bar.* -->
<operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="916" y="34">
<parameter key="attribute_filter_type" value="regular_expression"/>
<parameter key="regular_expression" value="bar.*"/>
</operator>
<!-- Drop rows where bar_1 is missing. -->
<operator activated="true" class="filter_examples" compatibility="8.0.000" expanded="true" height="103" name="Filter Examples" width="90" x="1050" y="34">
<list key="filters_list">
<parameter key="filters_entry_key" value="bar_1.is_not_missing."/>
</list>
</operator>
<!-- Concatenate the bar_2 values within each bar_1 group. -->
<operator activated="true" class="aggregate" compatibility="8.0.000" expanded="true" height="82" name="Aggregate (2)" width="90" x="1184" y="34">
<parameter key="use_default_aggregation" value="true"/>
<parameter key="attribute" value="bar_2"/>
<parameter key="default_aggregation_function" value="concatenation"/>
<list key="aggregation_attributes">
<parameter key="bar_2" value="concatenation"/>
</list>
<parameter key="group_by_attributes" value="bar_1"/>
</operator>
<!-- Transpose and rename by example values, presumably so that each bar_1 value becomes an attribute name (confirm against the operator documentation). -->
<operator activated="true" class="transpose" compatibility="8.0.000" expanded="true" height="82" name="Transpose (2)" width="90" x="1318" y="34"/>
<operator activated="true" class="rename_by_example_values" compatibility="8.0.000" expanded="true" height="82" name="Rename by Example Values" width="90" x="1452" y="34"/>
<!-- Keep only the first remaining example. -->
<operator activated="true" class="filter_example_range" compatibility="8.0.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1586" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="1"/>
</operator>
<!-- Inverted selection: remove the attribute named bar_1 and keep everything else. -->
<operator activated="true" class="select_attributes" compatibility="8.0.000" expanded="true" height="82" name="Select Attributes" width="90" x="1787" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="bar_1"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Replace every "|" with a space; presumably "|" is the separator emitted by the concatenation aggregate (confirm). -->
<operator activated="true" class="replace" compatibility="8.0.000" expanded="true" height="82" name="Replace" width="90" x="1921" y="34">
<parameter key="replace_what" value="[|]"/>
<parameter key="replace_by" value=" "/>
</operator>
<!-- Wiring: the operators are chained linearly in the order they are declared above, ending at result 1. -->
<connect from_op="Create Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
<connect from_op="Documents to Data" from_port="example set" to_op="Split (3)" to_port="example set input"/>
<connect from_op="Split (3)" from_port="example set output" to_op="Transpose (3)" to_port="example set input"/>
<connect from_op="Transpose (3)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Split" to_port="example set input"/>
<connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
<connect from_op="De-Pivot" from_port="example set output" to_op="Split (2)" to_port="example set input"/>
<connect from_op="Split (2)" from_port="example set output" to_op="Trim" to_port="example set input"/>
<connect from_op="Trim" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
<connect from_op="Aggregate (2)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
<connect from_op="Transpose (2)" from_port="example set output" to_op="Rename by Example Values" to_port="example set input"/>
<connect from_op="Rename by Example Values" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
<connect from_op="Replace" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
我注意到您是 Java 开发人员。如果您想走这条路,下面是解析该文本的最简单方法。当然,如果名称中包含空格,您可能会遇到问题。
我添加了一些额外的文本,用来演示如何处理未声明(未知)的发言者。
import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;
/**
 * Groups the words of a flat chat transcript by speaker, for a declared list of speakers.
 *
 * <p>The transcript is split on whitespace. A token of the form {@code Name:} switches the
 * current speaker; every other token is appended to the current speaker's word list. Labels
 * that are not among the declared names are collected under {@code "UNKNOWN"}. Because tokens
 * are whitespace-delimited, speaker names that contain spaces are not recognised.
 */
public class ChatMiner {
    /** Matches a token that consists solely of a speaker label such as {@code Alice:}. */
    private static final Pattern NAME_TOKEN = Pattern.compile("^(?<name>\\w+):$");
    /** Name of the capturing group in {@link #NAME_TOKEN} that holds the speaker name. */
    private static final String NAME_GROUP = "name";
    /** Bucket for words spoken by anyone who was not declared up front. */
    private static final String UNKNOWN_NAME = "UNKNOWN";
    /** Token separator: one or more whitespace characters, compiled once and reused. */
    private static final Pattern WHITE_SPACE = Pattern.compile("\\s+");

    /** Demo: parses a sample chat in which James is not a declared speaker. */
    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        String[] names = { "Alice", "Bob" };
        Map<String, List<String>> map = parseChat(text, names);
        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-8s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    /**
     * Splits {@code input} into words and assigns each word to the speaker whose label
     * most recently preceded it.
     *
     * @param input the transcript, e.g. {@code "Alice: Hello Bob: Hi"}
     * @param names the speakers to track; any other label is filed under {@code "UNKNOWN"}
     * @return a map with one entry per declared name plus {@code "UNKNOWN"}; speakers that
     *         never spoke map to an empty list. Words that appear before the first label
     *         are dropped.
     */
    public static Map<String, List<String>> parseChat(String input, String... names) {
        Map<String, List<String>> result = new HashMap<>();
        for (String name : names) {
            result.put(name, new ArrayList<>());
        }
        result.put(UNKNOWN_NAME, new ArrayList<>());
        List<String> unknownWords = result.get(UNKNOWN_NAME);

        // Word list of the speaker currently talking; null until the first label is seen.
        List<String> currentWords = null;
        for (String token : WHITE_SPACE.split(input)) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                // The map's keys are exactly the declared names plus UNKNOWN, so a single
                // map lookup replaces the linear scan over a separate name list.
                currentWords = result.getOrDefault(m.group(NAME_GROUP), unknownWords);
            } else if (currentWords != null) {
                // The list is already stored in the map; adding to it is sufficient.
                currentWords.add(token);
            }
        }
        return result;
    }

    /** Wraps every item in double quotes and joins the items with {@code ", "}. */
    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}
输出
Bob     : "Hi", "Tea,please"
Alice   : "Hello", "Coffee?"
UNKNOWN : "Hello", "guys!"
动态方法
如果您愿意,可以不预先声明姓名,而是在遇到新的发言者时动态地把该姓名加入映射。
import java.util.*;
import java.util.regex.*;
import java.util.stream.Collectors;
/**
 * Groups the words of a flat chat transcript by speaker, discovering speakers on the fly.
 *
 * <p>The transcript is split on whitespace. A token of the form {@code Name:} switches the
 * current speaker (registering the name the first time it is seen); every other token is
 * appended to the current speaker's word list. Because tokens are whitespace-delimited,
 * speaker names that contain spaces are not recognised.
 */
public class ChatMiner {
    /** Matches a token that consists solely of a speaker label such as {@code Alice:}. */
    private static final Pattern NAME_TOKEN = Pattern.compile("^(?<name>\\w+):$");
    /** Name of the capturing group in {@link #NAME_TOKEN} that holds the speaker name. */
    private static final String NAME_GROUP = "name";
    /** Token separator: one or more whitespace characters, compiled once and reused. */
    private static final Pattern WHITE_SPACE = Pattern.compile("\\s+");

    /** Demo: parses a sample chat with three speakers and prints each speaker's words. */
    public static void main(String[] args) {
        String text = "Alice: Hello Bob: Hi Alice: Coffee? Bob: Tea,please James: Hello guys!";
        Map<String, List<String>> map = parseChat(text);
        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            System.out.printf("%-6s: %s%n", entry.getKey(), quoteItems(entry.getValue()));
        }
    }

    /**
     * Splits {@code input} into words and assigns each word to the speaker whose label
     * most recently preceded it.
     *
     * @param input the transcript, e.g. {@code "Alice: Hello Bob: Hi"}
     * @return a map with one entry per speaker label found in the input; words that appear
     *         before the first label are dropped
     */
    public static Map<String, List<String>> parseChat(String input) {
        Map<String, List<String>> result = new HashMap<>();

        // Word list of the speaker currently talking; null until the first label is seen.
        List<String> currentWords = null;
        for (String token : WHITE_SPACE.split(input)) {
            Matcher m = NAME_TOKEN.matcher(token);
            if (m.matches()) {
                // Register the speaker on first sight and switch to their word list.
                currentWords = result.computeIfAbsent(m.group(NAME_GROUP), k -> new ArrayList<>());
            } else if (currentWords != null) {
                // The list is already stored in the map; adding to it is sufficient.
                currentWords.add(token);
            }
        }
        return result;
    }

    /** Wraps every item in double quotes and joins the items with {@code ", "}. */
    private static String quoteItems(List<String> list) {
        return list.stream().map(s -> String.format("\"%s\"", s)).collect(Collectors.joining(", "));
    }
}
输出
Bob   : "Hi", "Tea,please"
James : "Hello", "guys!"
Alice : "Hello", "Coffee?"