在 like 运算符上应用字符串列表
Applying list of strings on like operator
问题陈述:我需要传递一个字符串列表,并在 Spark Java 中通过过滤器函数应用 like 运算符。
原因:like 运算符一次只接受单个字符串模式,因此现有代码必须对每个模式分别执行一次过滤;在更大的数据集上这会带来严重的性能问题,需要加以克服。
我需要传递一个字符串列表,并在过滤器函数中一次性地应用 like 运算符。
请告诉我如何将 like 运算符应用于一个列表,因为我需要匹配 ID 的相似模式。
// Original code from the question: builds a small sample dataset of
// (label1, sentence1) rows and tries to keep only the rows whose "label1"
// contains any of the substrings in listStrings.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
SQLContext sqlContext = new SQLContext(sc);
SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
List<Row> data = Arrays.asList(
RowFactory.create("J40504", "CRC Industries"),
RowFactory.create("K630-0746777","Dixon value"),
RowFactory.create("K444-4444","3M INdustries"),
RowFactory.create("4333444","3M INdustries"),
RowFactory.create("566-655","3M INdustries"),
RowFactory.create("4444888","3M INdustries"),
RowFactory.create("P477-7444","3M INdustries"),
RowFactory.create("566655","Dixon coupling valve"));
// In real time we have large dataset
// Two string columns, both non-nullable.
StructType schema = new StructType(new StructField[] {new StructField("label1", DataTypes.StringType, false,Metadata.empty()),
new StructField("sentence1", DataTypes.StringType, false,Metadata.empty()) });
Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);
List<String> listStrings = new ArrayList<String>();
listStrings.add("40504");
listStrings.add("630-0746");
listStrings.add("477-7444");
listStrings.add("444-4444");
// In real time we have large list of string to be compared with
sentenceDataFrame.show();
System.out.println("Array list :"+listStrings);
// BUG (the subject of the question): each filter() call narrows the result of
// the previous one, so the N patterns are effectively AND-ed together — a row
// would have to contain EVERY substring to survive — instead of OR-ed.
// It also adds one filter transformation per pattern rather than a single one,
// which is the performance concern raised above.
for(int i=0;i<listStrings.size();i++){
sentenceDataFrame=sentenceDataFrame.filter(col("label1").like("%"+listStrings.get(i)+"%"));
}
sentenceDataFrame.show();
第一个解决方案
无需迭代数据集 N(其中 N 是 listStrings 的大小)次数,您可以构建表达式并仅过滤一次数据集:
// Solution 1: build one SQL expression of the form
//   label1 LIKE '%a%' OR label1 LIKE '%b%' OR ...
// so the dataset is filtered exactly once instead of once per pattern.
StringBuilder expressionBuilder = new StringBuilder();
String separator = "";
for (String s : listStrings) {
    // Escape embedded single quotes so a pattern such as "O'Neil" cannot
    // break (or inject into) the generated SQL expression.
    String escaped = s.replace("'", "''");
    // Chain append() calls instead of concatenating with '+' inside append(),
    // which would build throwaway intermediate strings.
    expressionBuilder.append(separator)
            .append("label1 LIKE '%")
            .append(escaped)
            .append("%'");
    separator = " OR ";
}
String expression = expressionBuilder.toString();
// Guard against an empty pattern list: filter("") would fail to parse.
if (!expression.isEmpty()) {
    sentenceDataFrame = sentenceDataFrame.filter(expression);
}
第二种解法
我们可以将 listStrings 加载到数据集中:
// Solution 2: load the patterns into their own single-column dataset so the
// matching can be expressed as a join instead of N LIKE filters.
StructType schemaList = new StructType(new StructField[]{new StructField("labelToFind", DataTypes.StringType, false, Metadata.empty())});
// Named labelRows (not listStrings): reusing "listStrings" would clash with
// the List<String> of the same name declared earlier in this scope.
List<Row> labelRows = Arrays.asList(
        RowFactory.create("40504"),
        RowFactory.create("630-0746"),
        RowFactory.create("477-7444"),
        RowFactory.create("444-4444"));
// Use the SparkSession — consistent with how sentenceDataFrame was created —
// rather than the legacy SQLContext.
Dataset<Row> listDataset = spark.createDataFrame(labelRows, schemaList);
然后我们可以加入两个数据集以过滤行:
sentenceDataFrame = sentenceDataFrame.join(listDataset ,sentenceDataFrame.col("label1").contains(listDataset.col("labelToFind"))).select("label1","sentence1");
问题陈述:我需要传递一个字符串列表,并在 Spark Java 中通过过滤器函数应用 like 运算符。
原因:like 运算符一次只接受单个字符串模式,因此现有代码必须对每个模式分别执行一次过滤;在更大的数据集上这会带来严重的性能问题,需要加以克服。我需要传递一个字符串列表,并在过滤器函数中一次性地应用 like 运算符。
请告诉我如何将 like 运算符应用于一个列表,因为我需要匹配 ID 的相似模式。
// Original code from the question: builds a small sample dataset of
// (label1, sentence1) rows and tries to keep only the rows whose "label1"
// contains any of the substrings in listStrings.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("SparkJdbcDs").setMaster("local[*]"));
SQLContext sqlContext = new SQLContext(sc);
SparkSession spark = SparkSession.builder().appName("JavaTokenizerExample").getOrCreate();
List<Row> data = Arrays.asList(
RowFactory.create("J40504", "CRC Industries"),
RowFactory.create("K630-0746777","Dixon value"),
RowFactory.create("K444-4444","3M INdustries"),
RowFactory.create("4333444","3M INdustries"),
RowFactory.create("566-655","3M INdustries"),
RowFactory.create("4444888","3M INdustries"),
RowFactory.create("P477-7444","3M INdustries"),
RowFactory.create("566655","Dixon coupling valve"));
// In real time we have large dataset
// Two string columns, both non-nullable.
StructType schema = new StructType(new StructField[] {new StructField("label1", DataTypes.StringType, false,Metadata.empty()),
new StructField("sentence1", DataTypes.StringType, false,Metadata.empty()) });
Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);
List<String> listStrings = new ArrayList<String>();
listStrings.add("40504");
listStrings.add("630-0746");
listStrings.add("477-7444");
listStrings.add("444-4444");
// In real time we have large list of string to be compared with
sentenceDataFrame.show();
System.out.println("Array list :"+listStrings);
// BUG (the subject of the question): each filter() call narrows the result of
// the previous one, so the N patterns are effectively AND-ed together — a row
// would have to contain EVERY substring to survive — instead of OR-ed.
// It also adds one filter transformation per pattern rather than a single one,
// which is the performance concern raised above.
for(int i=0;i<listStrings.size();i++){
sentenceDataFrame=sentenceDataFrame.filter(col("label1").like("%"+listStrings.get(i)+"%"));
}
sentenceDataFrame.show();
第一个解决方案
无需迭代数据集 N(其中 N 是 listStrings 的大小)次数,您可以构建表达式并仅过滤一次数据集:
// Solution 1: build one SQL expression of the form
//   label1 LIKE '%a%' OR label1 LIKE '%b%' OR ...
// so the dataset is filtered exactly once instead of once per pattern.
StringBuilder expressionBuilder = new StringBuilder();
String separator = "";
for (String s : listStrings) {
    // Escape embedded single quotes so a pattern such as "O'Neil" cannot
    // break (or inject into) the generated SQL expression.
    String escaped = s.replace("'", "''");
    // Chain append() calls instead of concatenating with '+' inside append(),
    // which would build throwaway intermediate strings.
    expressionBuilder.append(separator)
            .append("label1 LIKE '%")
            .append(escaped)
            .append("%'");
    separator = " OR ";
}
String expression = expressionBuilder.toString();
// Guard against an empty pattern list: filter("") would fail to parse.
if (!expression.isEmpty()) {
    sentenceDataFrame = sentenceDataFrame.filter(expression);
}
第二种解法
我们可以将 listStrings 加载到数据集中:
// Solution 2: load the patterns into their own single-column dataset so the
// matching can be expressed as a join instead of N LIKE filters.
StructType schemaList = new StructType(new StructField[]{new StructField("labelToFind", DataTypes.StringType, false, Metadata.empty())});
// Named labelRows (not listStrings): reusing "listStrings" would clash with
// the List<String> of the same name declared earlier in this scope.
List<Row> labelRows = Arrays.asList(
        RowFactory.create("40504"),
        RowFactory.create("630-0746"),
        RowFactory.create("477-7444"),
        RowFactory.create("444-4444"));
// Use the SparkSession — consistent with how sentenceDataFrame was created —
// rather than the legacy SQLContext.
Dataset<Row> listDataset = spark.createDataFrame(labelRows, schemaList);
然后我们可以加入两个数据集以过滤行:
sentenceDataFrame = sentenceDataFrame.join(listDataset ,sentenceDataFrame.col("label1").contains(listDataset.col("labelToFind"))).select("label1","sentence1");