在 reader 中为 spring 批处理作业实施键集分页
Implement Keyset paging in a reader for a spring batch job
我有一个 spring 批处理作业,它从 Postgres 数据库中获取数据并在处理后将其写入 excel 工作表。但是我想在 reader 中为 spring 批处理作业实现键集分页。目前,我正在使用 JpaPagingItemReader,它使用限制偏移量分页,但是因为我正在处理大量数据,所以 JpaPagingItemReader 用来获取数据的查询随着偏移量的增加而变得低效。键集分页可用于避免限制偏移分页的限制,但我不知道如何使用键集分页实现 reader。我该如何实施?
编辑:
键集分页不包括 offsetting/skipping 记录,相反,我们将在结果中排序和跟踪数字唯一标识符,并请求大于最后一个唯一条目的条目。在此方法中,SQL 如下所示(假设 customer_id 是记录的唯一自动生成标识符)
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 0 order by customer_id asc limit 100;
-- Second iteration ( size = 100, lastCustomerId = 100 )
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 100 order by customer_id asc limit 100;
-- Third iteration ( size = 100, lastCustomerId = 200 )
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 200 order by customer_id asc limit 100;
在实现键集分页时需要牢记的要点是:
- 每条记录都应该有一个数字类型的唯一标识符(最好是主键)
- 结果集应该是有序的
- 我们应该有排序的逻辑并在检索到的列表中找到最大的 id。
- 您用于查找(seek)的标识符字段上应该建有索引。
public class CustomerProcessorService {
public void processCustomers() {
List<Customer> customers = new ArrayList();
long lastCusId = 0;
int size = 100;
while ( true ) {
// Create a PageRequest object that will be passed as Pageable interface to repo
// Note that here we are setting 0 as the offset
PageRequest pageRequest = new PageRequest(0,size);
// Get the lastCusId
lastCusId = getLastCusId(customers);
// Get the data from the database
customers = customerRepository.findByStatusAndCustomerIdGreaterThanOrderByCustomerIdAsc('ACTIVE',lastCusId,pageRequest);
// Check if data is there
if ( customers == null || customers.isEmpty()) {
break;
}
// Do the processing
}
}
/**
 * Returns the customer id of the last element of the given page.
 *
 * @param customers the most recently fetched page; may be {@code null} or empty
 * @return {@code 0} when there is no page content (first-iteration case),
 *         otherwise the id of the final list element
 */
public Long getLastCusId(List<Customer> customers) {
    boolean hasRows = customers != null && !customers.isEmpty();
    if (!hasRows) {
        return 0L;
    }
    // The list is used in the order given; its last element supplies the cursor value.
    int lastIndex = customers.size() - 1;
    return customers.get(lastIndex).getCustomerId();
}
您应该能够通过扩展 AbstractPaginatedDataItemReader
class 来实现您的分页逻辑。此基础 class 处理大部分分页样板,并允许您在 doPageRead
中指定分页逻辑。这是一个简单的例子,我会让你相应地调整它:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.springframework.batch.item.data.AbstractPaginatedDataItemReader;
import org.springframework.data.domain.PageRequest;
public class KeySetPagingItemReader extends AbstractPaginatedDataItemReader<Customer> {
long lastCusId = 0;
int size = 100;
private CustomerRepository customerRepository;
List<Customer> customers = new ArrayList();
public KeySetPagingItemReader(CustomerRepository customerRepository) {
this.customerRepository = customerRepository;
}
@Override
protected Iterator<Customer> doPageRead() {
PageRequest pageRequest = PageRequest.of(0,size);
// Get the lastCusId
lastCusId = getLastCusId(customers);
// Get the data from the database
customers = customerRepository.findByStatusAndCustomerIdGreaterThanOrderByCustomerIdAsc('ACTIVE',lastCusId,pageRequest);
return customers.iterator();
}
public Long getLastCusId(List<Customer> customers) {
// If passed entry is null or empty, return 0 ( This handles the first iteration case )
if ( customers == null || customers.isEmpty())
return 0l;
// Do the logic to sort the customers list by customer_id of each
// Customer object
// Return the last entry
return customers.get(customers.size() -1).getCustomerId();
}
}
我有一个 spring 批处理作业,它从 Postgres 数据库中获取数据并在处理后将其写入 excel 工作表。但是我想在 reader 中为 spring 批处理作业实现键集分页。目前,我正在使用 JpaPagingItemReader,它使用限制偏移量分页,但是因为我正在处理大量数据,所以 JpaPagingItemReader 用来获取数据的查询随着偏移量的增加而变得低效。键集分页可用于避免限制偏移分页的限制,但我不知道如何使用键集分页实现 reader。我该如何实施?
编辑: 键集分页不包括 offsetting/skipping 记录,相反,我们将在结果中排序和跟踪数字唯一标识符,并请求大于最后一个唯一条目的条目。在此方法中,SQL 如下所示(假设 customer_id 是记录的唯一自动生成标识符)
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 0 order by customer_id asc limit 100;
-- Second iteration ( size = 100, lastCustomerId = 100 )
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 100 order by customer_id asc limit 100;
-- Third iteration ( size = 100, lastCustomerId = 200 )
select * from CUSTOMERS where status = 'ACTIVE' and customer_id > 200 order by customer_id asc limit 100;
在实现键集分页时需要牢记的要点是:
- 每条记录都应该有一个数字唯一标识符( 最好是主键 )
- 结果集应该是有序的
- 我们应该有排序的逻辑并在检索到的列表中找到最大的 id。
- 您用于查找(seek)的标识符字段上应该建有索引。
public class CustomerProcessorService {
public void processCustomers() {
List<Customer> customers = new ArrayList();
long lastCusId = 0;
int size = 100;
while ( true ) {
// Create a PageRequest object that will be passed as Pageable interface to repo
// Note that here we are setting 0 as the offset
PageRequest pageRequest = new PageRequest(0,size);
// Get the lastCusId
lastCusId = getLastCusId(customers);
// Get the data from the database
customers = customerRepository.findByStatusAndCustomerIdGreaterThanOrderByCustomerIdAsc('ACTIVE',lastCusId,pageRequest);
// Check if data is there
if ( customers == null || customers.isEmpty()) {
break;
}
// Do the processing
}
}
/**
 * Returns the customer id of the last element of the given page.
 *
 * @param customers the most recently fetched page; may be {@code null} or empty
 * @return {@code 0} when there is no page content (first-iteration case),
 *         otherwise the id of the final list element
 */
public Long getLastCusId(List<Customer> customers) {
    boolean hasRows = customers != null && !customers.isEmpty();
    if (!hasRows) {
        return 0L;
    }
    // The list is used in the order given; its last element supplies the cursor value.
    int lastIndex = customers.size() - 1;
    return customers.get(lastIndex).getCustomerId();
}
您应该能够通过扩展 AbstractPaginatedDataItemReader
class 来实现您的分页逻辑。此基础 class 处理大部分分页样板,并允许您在 doPageRead
中指定分页逻辑。这是一个简单的例子,我会让你相应地调整它:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.springframework.batch.item.data.AbstractPaginatedDataItemReader;
import org.springframework.data.domain.PageRequest;
public class KeySetPagingItemReader extends AbstractPaginatedDataItemReader<Customer> {
long lastCusId = 0;
int size = 100;
private CustomerRepository customerRepository;
List<Customer> customers = new ArrayList();
public KeySetPagingItemReader(CustomerRepository customerRepository) {
this.customerRepository = customerRepository;
}
@Override
protected Iterator<Customer> doPageRead() {
PageRequest pageRequest = PageRequest.of(0,size);
// Get the lastCusId
lastCusId = getLastCusId(customers);
// Get the data from the database
customers = customerRepository.findByStatusAndCustomerIdGreaterThanOrderByCustomerIdAsc('ACTIVE',lastCusId,pageRequest);
return customers.iterator();
}
public Long getLastCusId(List<Customer> customers) {
// If passed entry is null or empty, return 0 ( This handles the first iteration case )
if ( customers == null || customers.isEmpty())
return 0l;
// Do the logic to sort the customers list by customer_id of each
// Customer object
// Return the last entry
return customers.get(customers.size() -1).getCustomerId();
}
}