多渠道联系人问题:根据信息查找相关工单
Multi-Channel Contacts Problem: Find related tickets based on their information
问题描述
以下问题描述摘自Shopee Code League 2021
对于每张工单,如果每个用户具有相同的联系信息,则识别他们的所有联系人。这些工单中的每一个都通过 Email
、Phone
或 Order ID
直接或间接关联,因此每张工单都属于同一用户。例如:
Ticket
ID
Email
Order ID
Phone
Contacts
A
0
john@gmail.com
12345678
NA
5
B
1
NA
12345678
682212345
2
C
34567
wick@gmail.com
NA
682212345
4
D
78999
wick@gmail.com
NA
NA
3
- 工单 A 和工单 B 通过
Order ID
链接
- 工单 B 和 C 通过
Phone
链接
- 工单 C 和 D 通过
Email
链接
- 工单 A 和工单 D 间接链接 通过工单
A
> B
> C
> D
在这个例子中,这个用户共有14个Contact
。该用户的 ticket_trace/contact 对将是 0-1-34567-78999, 14
.
对于每张票,确定属于同一用户的所有其他 ID
,按升序排序,以及该用户拥有的总数 Contact
。生成一个包含 2 列的 csv
文件,格式如下:
ID
ticket_trace and Contact
0
0-1-34567-78999, 14
1
0-1-34567-78999, 14
⋮
⋮
请注意,有 500,000 行数据。在短时间内解决此问题的最有效方法是最小时间复杂度和内存使用[=64] =]?
问题数据集
输入文件样本,contacts.json
:
[
{
"Id":0,
"Email":"gkzAbIy@qq.com",
"Phone":"",
"Contacts":1,
"OrderId":""
},
{
"Id":1,
"Email":"",
"Phone":"329442681752",
"Contacts":4,
"OrderId":"vDDJJcxfLtSfkooPhbYnJdxov"
}, // more data
]
Click here 问题数据集和更详细的描述。
这是我尝试过的解决方案。虽然我没有足够的时间来处理所有 500,000 行。基本上,我遍历数据数组并按顺序比较它们的属性。是啊,很慢的方式。
import com.google.gson.Gson;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.*;
public class Main {
public static void main(String[] args) throws Exception {
Gson gson = new Gson();
Ticket[] tickets = gson.fromJson(new FileReader("contacts.json"), Ticket[].class);
HashMap<Integer, LinkedList<Integer>> ticketTraceMap = new HashMap<>();
HashMap<Integer, Integer> sumOfContactsMap = new HashMap<>();
int commonId = 1; // tickets with same commonId's are related
for (int i = 0; i < tickets.length; ++i) {
// assign new commonId and increment it if a particular ticket
// is not related to other tickets yet
if (tickets[i].commonId == 0) {
tickets[i].commonId = commonId;
++commonId;
}
int sumOfContacts = 0;
LinkedList<Integer> ticketTraceList = new LinkedList<>();
ticketTraceList.add(Integer.parseInt(tickets[i].getId()));
// get sum of contacts and ticket trace if present
if (sumOfContactsMap.get(tickets[i].commonId) != null) {
sumOfContacts = sumOfContactsMap.get(tickets[i].commonId);
}
if (ticketTraceMap.get((tickets[i].commonId)) != null) {
ticketTraceList = ticketTraceMap.get((tickets[i].commonId));
}
for (int j = i + 1; j < tickets.length; ++j) {
if (tickets[i].equals(tickets[j])) {
tickets[j].commonId = tickets[i].commonId; // assign commonId to link related tickets
sumOfContacts += Integer.parseInt(tickets[j].getContacts());
ticketTraceList.add(Integer.parseInt(tickets[j].getId()));
}
}
ticketTraceMap.put(tickets[i].commonId, ticketTraceList);
sumOfContactsMap.put(tickets[i].commonId, sumOfContacts);
}
// store ticket trace string
HashMap<Integer, String> ticketTraceStringMap = new HashMap<>();
for (Map.Entry<Integer, LinkedList<Integer>> entry : ticketTraceMap.entrySet()) {
Collections.sort(entry.getValue()); // sort ticket trace in ascending order
StringBuilder sb = new StringBuilder();
// concatenate ticket trace into a string
for (Integer integer : entry.getValue()) {
sb.append(integer).append("-");
}
sb.setLength(sb.length() - 1);
ticketTraceStringMap.put(entry.getKey(), sb.toString());
}
// print result to a csv file
PrintWriter pw = new PrintWriter("result.csv");
StringBuilder builder = new StringBuilder();
String columnNamesList = "ticket_id,ticket_trace/contact\n";
builder.append(columnNamesList);
for (int i = 0; i < tickets.length; ++i) {
builder.append(tickets[i].getId()).append(",").append("\"").
append(ticketTraceStringMap.get(tickets[i].commonId)).append(", ").
append(sumOfContactsMap.get((tickets[i].commonId))).append("\"").
append('\n');
pw.write(builder.toString());
builder.setLength(0);
}
pw.close();
}
}
Java 存储工单信息的对象:
class Ticket {
private String Id;
private String Email;
private String Phone;
private String OrderId;
private String Contacts;
public int commonId;
// compare 2 tickets based on email, phone no. and order ID
@Override
public boolean equals(Object o) {
Ticket ticket = (Ticket) o;
return (!getEmail().equals("") && Objects.equals(getEmail(), ticket.getEmail())) ||
(!getPhone().equals("") && Objects.equals(getPhone(), ticket.getPhone())) ||
(!getOrderId().equals("") && Objects.equals(getOrderId(), ticket.getOrderId()));
}
// getters and setters
}
这是我使用 Python3 的解决方案。我用了 2 个字典来存储票证连接。第一个字典按值存储连接的票证。第二个字典存储每个 id 连接(跟踪)。最后,使用第二个字典中的跟踪计算联系人。是的,也是一种非常缓慢的方式。它需要 3 个循环。计算 Kaggle 内核中的所有 500.000 行最多需要 15 秒。
# Import tools
import numpy as np
import pandas as pd
# Load data
df = pd.read_json('/kaggle/input/scl-2021-da/contacts.json')
npdata = df.values
# Initialize memory
memory = {}
connections = {}
# Store connected tickets by value
def add_to_memory(ticket_id, value):
if value != "":
if value in memory:
memory[value].add(ticket_id)
return
memory[value] = {ticket_id}
for row in npdata:
ticket_id = row[0]
# Order Id
add_to_memory(ticket_id, row[4])
# Email
add_to_memory(ticket_id, row[1])
# Phone
add_to_memory(ticket_id, row[2])
# Calculate Trace
for ids in memory.values():
current_connection = set(ids)
for uid in ids:
if uid in connections:
current_connection.update(connections[uid])
for uid in current_connection:
connections[uid] = current_connection
# Calculate contacts and add to list
output = []
for ticket_id, trace in sorted(connections.items()):
contacts = np.sum(npdata[list(trace), 3])
trace = "-".join([str(_id) for _id in sorted(trace)])
answer = "{}, {}".format(trace, contacts)
output.append({"ticket_id": ticket_id, "ticket_trace/contact": answer})
# Convert to pandas DataFrame & save to csv
output_df = pd.DataFrame(output)
filename = "output.csv"
output_df.to_csv(filename, index=False)
问题描述
以下问题描述摘自Shopee Code League 2021
对于每张工单,如果每个用户具有相同的联系信息,则识别他们的所有联系人。这些工单中的每一个都通过 Email
、Phone
或 Order ID
直接或间接关联,因此每张工单都属于同一用户。例如:
Ticket | ID | Order ID | Phone | Contacts | |
---|---|---|---|---|---|
A | 0 | john@gmail.com | 12345678 | NA | 5 |
B | 1 | NA | 12345678 | 682212345 | 2 |
C | 34567 | wick@gmail.com | NA | 682212345 | 4 |
D | 78999 | wick@gmail.com | NA | NA | 3 |
- 工单 A 和工单 B 通过
Order ID
链接
- 工单 B 和 C 通过
Phone
链接
- 工单 C 和 D 通过
Email
链接
- 工单 A 和工单 D 间接链接 通过工单
A
>B
>C
>D
在这个例子中,这个用户共有14个Contact
。该用户的 ticket_trace/contact 对将是 0-1-34567-78999, 14
.
对于每张票,确定属于同一用户的所有其他 ID
,按升序排序,以及该用户拥有的总数 Contact
。生成一个包含 2 列的 csv
文件,格式如下:
ID | ticket_trace and Contact |
---|---|
0 | 0-1-34567-78999, 14 |
1 | 0-1-34567-78999, 14 |
⋮ | ⋮ |
请注意,有 500,000 行数据。在短时间内解决此问题的最有效方法是最小时间复杂度和内存使用[=64] =]?
问题数据集
输入文件样本,contacts.json
:
[
{
"Id":0,
"Email":"gkzAbIy@qq.com",
"Phone":"",
"Contacts":1,
"OrderId":""
},
{
"Id":1,
"Email":"",
"Phone":"329442681752",
"Contacts":4,
"OrderId":"vDDJJcxfLtSfkooPhbYnJdxov"
}, // more data
]
Click here 问题数据集和更详细的描述。
这是我尝试过的解决方案。虽然我没有足够的时间来处理所有 500,000 行。基本上,我遍历数据数组并按顺序比较它们的属性。是啊,很慢的方式。
import com.google.gson.Gson;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.*;
public class Main {
public static void main(String[] args) throws Exception {
Gson gson = new Gson();
Ticket[] tickets = gson.fromJson(new FileReader("contacts.json"), Ticket[].class);
HashMap<Integer, LinkedList<Integer>> ticketTraceMap = new HashMap<>();
HashMap<Integer, Integer> sumOfContactsMap = new HashMap<>();
int commonId = 1; // tickets with same commonId's are related
for (int i = 0; i < tickets.length; ++i) {
// assign new commonId and increment it if a particular ticket
// is not related to other tickets yet
if (tickets[i].commonId == 0) {
tickets[i].commonId = commonId;
++commonId;
}
int sumOfContacts = 0;
LinkedList<Integer> ticketTraceList = new LinkedList<>();
ticketTraceList.add(Integer.parseInt(tickets[i].getId()));
// get sum of contacts and ticket trace if present
if (sumOfContactsMap.get(tickets[i].commonId) != null) {
sumOfContacts = sumOfContactsMap.get(tickets[i].commonId);
}
if (ticketTraceMap.get((tickets[i].commonId)) != null) {
ticketTraceList = ticketTraceMap.get((tickets[i].commonId));
}
for (int j = i + 1; j < tickets.length; ++j) {
if (tickets[i].equals(tickets[j])) {
tickets[j].commonId = tickets[i].commonId; // assign commonId to link related tickets
sumOfContacts += Integer.parseInt(tickets[j].getContacts());
ticketTraceList.add(Integer.parseInt(tickets[j].getId()));
}
}
ticketTraceMap.put(tickets[i].commonId, ticketTraceList);
sumOfContactsMap.put(tickets[i].commonId, sumOfContacts);
}
// store ticket trace string
HashMap<Integer, String> ticketTraceStringMap = new HashMap<>();
for (Map.Entry<Integer, LinkedList<Integer>> entry : ticketTraceMap.entrySet()) {
Collections.sort(entry.getValue()); // sort ticket trace in ascending order
StringBuilder sb = new StringBuilder();
// concatenate ticket trace into a string
for (Integer integer : entry.getValue()) {
sb.append(integer).append("-");
}
sb.setLength(sb.length() - 1);
ticketTraceStringMap.put(entry.getKey(), sb.toString());
}
// print result to a csv file
PrintWriter pw = new PrintWriter("result.csv");
StringBuilder builder = new StringBuilder();
String columnNamesList = "ticket_id,ticket_trace/contact\n";
builder.append(columnNamesList);
for (int i = 0; i < tickets.length; ++i) {
builder.append(tickets[i].getId()).append(",").append("\"").
append(ticketTraceStringMap.get(tickets[i].commonId)).append(", ").
append(sumOfContactsMap.get((tickets[i].commonId))).append("\"").
append('\n');
pw.write(builder.toString());
builder.setLength(0);
}
pw.close();
}
}
Java 存储工单信息的对象:
class Ticket {
private String Id;
private String Email;
private String Phone;
private String OrderId;
private String Contacts;
public int commonId;
// compare 2 tickets based on email, phone no. and order ID
@Override
public boolean equals(Object o) {
Ticket ticket = (Ticket) o;
return (!getEmail().equals("") && Objects.equals(getEmail(), ticket.getEmail())) ||
(!getPhone().equals("") && Objects.equals(getPhone(), ticket.getPhone())) ||
(!getOrderId().equals("") && Objects.equals(getOrderId(), ticket.getOrderId()));
}
// getters and setters
}
这是我使用 Python3 的解决方案。我用了 2 个字典来存储票证连接。第一个字典按值存储连接的票证。第二个字典存储每个 id 连接(跟踪)。最后,使用第二个字典中的跟踪计算联系人。是的,也是一种非常缓慢的方式。它需要 3 个循环。计算 Kaggle 内核中的所有 500.000 行最多需要 15 秒。
# Import tools
import numpy as np
import pandas as pd
# Load data
df = pd.read_json('/kaggle/input/scl-2021-da/contacts.json')
npdata = df.values
# Initialize memory
memory = {}
connections = {}
# Store connected tickets by value
def add_to_memory(ticket_id, value):
if value != "":
if value in memory:
memory[value].add(ticket_id)
return
memory[value] = {ticket_id}
for row in npdata:
ticket_id = row[0]
# Order Id
add_to_memory(ticket_id, row[4])
# Email
add_to_memory(ticket_id, row[1])
# Phone
add_to_memory(ticket_id, row[2])
# Calculate Trace
for ids in memory.values():
current_connection = set(ids)
for uid in ids:
if uid in connections:
current_connection.update(connections[uid])
for uid in current_connection:
connections[uid] = current_connection
# Calculate contacts and add to list
output = []
for ticket_id, trace in sorted(connections.items()):
contacts = np.sum(npdata[list(trace), 3])
trace = "-".join([str(_id) for _id in sorted(trace)])
answer = "{}, {}".format(trace, contacts)
output.append({"ticket_id": ticket_id, "ticket_trace/contact": answer})
# Convert to pandas DataFrame & save to csv
output_df = pd.DataFrame(output)
filename = "output.csv"
output_df.to_csv(filename, index=False)