SLURM - 如何确定作业正在使用哪些特定 CPU?
SLURM - How can I determine what specific CPUs a job is using?
我正在开发一种工具,用于监控集群(19 个节点,40 个核心)上当前运行的作业。有什么方法可以确定 Slurm 队列中的每个作业正在使用哪些特定的 CPU 吗?我正在使用 'pidstat'、'mpstat' 和 'ps -eFj' 获取数据,这些数据告诉我哪些进程在特定核心上运行,但我无法将这些进程 ID 关联到 Slurm 使用的作业 ID。'scontrol show job' 给出了很多信息,但其中没有具体的 CPU 分配情况。有什么办法吗?
这是收集数据的代码:
#!/usr/bin/env python
import subprocess
import threading
import time
def scan(num_nodes=19, run_command=None):
    """Collect mpstat, pidstat and ps output from every node in parallel.

    One thread is started per (node, command) pair. Node 1 runs the command
    directly; every other node N is reached through ``ssh node<N>``
    (presumably node 1 is the host this script runs on -- confirm for your
    cluster).

    Args:
        num_nodes: Number of nodes to query, numbered 1..num_nodes.
        run_command: Callable taking an argv list and returning the command
            output. Defaults to ``subprocess.check_output``.

    Returns:
        A list with one ``[mpstat, pidstat, ps]`` entry per node, indexed by
        ``node - 1``. A slot stays ``None`` when its command raised, because
        the exception ends only that worker thread.
    """
    if run_command is None:
        run_command = subprocess.check_output

    # Position in this tuple is the slot index inside each node's entry.
    commands = (
        ['mpstat', '-P', 'ALL', '1', '1'],
        ['pidstat', '1', '1'],
        ['ps', '-eFj'],
    )
    data = [[None] * len(commands) for _ in range(num_nodes)]

    def collect(node, slot, command):
        if node != 1:
            command = ['ssh', 'node' + str(node)] + command
        # Each thread writes to its own (node, slot) cell, so no lock is needed.
        data[node - 1][slot] = run_command(command)

    workers = []
    for node in range(1, num_nodes + 1):
        for slot, command in enumerate(commands):
            worker = threading.Thread(target=collect,
                                      args=(node, slot, command))
            worker.start()
            workers.append(worker)

    # join() replaces the former isAlive() polling loop: Thread.isAlive() was
    # removed in Python 3.9, and joining returns as soon as the work is done
    # instead of waking up once per second.
    for worker in workers:
        worker.join()
    return data
通过使用 -d 标志,您可以获得作业在每个节点上所使用的 CPU_IDs,如下所示。
$ scontrol show job -d $SLURM_JOBID
JobId=1 JobName=bash
UserId=USER(UID) GroupId=GROUP(GID) MCS_label=N/A
Priority=56117 Nice=0 Account=account QOS=interactive
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
DerivedExitCode=0:0
RunTime=00:00:10 TimeLimit=02:00:00 TimeMin=N/A
SubmitTime=2019-04-12T17:34:11 EligibleTime=2019-04-12T17:34:11
StartTime=2019-04-12T17:34:12 EndTime=2019-04-12T19:34:12 Deadline=N/A
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=defq AllocNode:Sid=node2:25638
ReqNodeList=(null) ExcNodeList=(null)
NodeList=node1
BatchHost=node2
NumNodes=1 NumCPUs=2 NumTasks=1 CPUs/Task=2 ReqB:S:C:T=0:0:*:*
TRES=cpu=2,mem=17600M,node=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
Nodes=node1 CPU_IDs=12-13 Mem=17600 GRES_IDX=
MinCPUsNode=2 MinMemoryCPU=8800M MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
Gres=(null) Reservation=(null)
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=bash
WorkDir=/home/USER
Power=
如果这些信息还不够,scontrol pidinfo PID 的输出可能也会对您有用:
$ scontrol pidinfo 43734
Slurm job id 21757758 ends at Fri Apr 12 20:15:49 2019
slurm_get_rem_time is 6647
我正在开发一种工具,用于监控集群(19 个节点,40 个核心)上当前运行的作业。有什么方法可以确定 Slurm 队列中的每个作业正在使用哪些特定的 CPU 吗?我正在使用 'pidstat'、'mpstat' 和 'ps -eFj' 获取数据,这些数据告诉我哪些进程在特定核心上运行,但我无法将这些进程 ID 关联到 Slurm 使用的作业 ID。'scontrol show job' 给出了很多信息,但其中没有具体的 CPU 分配情况。有什么办法吗?
这是收集数据的代码:
#!/usr/bin/env python
import subprocess
import threading
import time
def scan(num_nodes=19, run_command=None):
    """Collect mpstat, pidstat and ps output from every node in parallel.

    One thread is started per (node, command) pair. Node 1 runs the command
    directly; every other node N is reached through ``ssh node<N>``
    (presumably node 1 is the host this script runs on -- confirm for your
    cluster).

    Args:
        num_nodes: Number of nodes to query, numbered 1..num_nodes.
        run_command: Callable taking an argv list and returning the command
            output. Defaults to ``subprocess.check_output``.

    Returns:
        A list with one ``[mpstat, pidstat, ps]`` entry per node, indexed by
        ``node - 1``. A slot stays ``None`` when its command raised, because
        the exception ends only that worker thread.
    """
    if run_command is None:
        run_command = subprocess.check_output

    # Position in this tuple is the slot index inside each node's entry.
    commands = (
        ['mpstat', '-P', 'ALL', '1', '1'],
        ['pidstat', '1', '1'],
        ['ps', '-eFj'],
    )
    data = [[None] * len(commands) for _ in range(num_nodes)]

    def collect(node, slot, command):
        if node != 1:
            command = ['ssh', 'node' + str(node)] + command
        # Each thread writes to its own (node, slot) cell, so no lock is needed.
        data[node - 1][slot] = run_command(command)

    workers = []
    for node in range(1, num_nodes + 1):
        for slot, command in enumerate(commands):
            worker = threading.Thread(target=collect,
                                      args=(node, slot, command))
            worker.start()
            workers.append(worker)

    # join() replaces the former isAlive() polling loop: Thread.isAlive() was
    # removed in Python 3.9, and joining returns as soon as the work is done
    # instead of waking up once per second.
    for worker in workers:
        worker.join()
    return data
通过使用 -d 标志,您可以获得作业在每个节点上所使用的 CPU_IDs,如下所示。
$ scontrol show job -d $SLURM_JOBID
JobId=1 JobName=bash
UserId=USER(UID) GroupId=GROUP(GID) MCS_label=N/A
Priority=56117 Nice=0 Account=account QOS=interactive
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=0 Reboot=0 ExitCode=0:0
DerivedExitCode=0:0
RunTime=00:00:10 TimeLimit=02:00:00 TimeMin=N/A
SubmitTime=2019-04-12T17:34:11 EligibleTime=2019-04-12T17:34:11
StartTime=2019-04-12T17:34:12 EndTime=2019-04-12T19:34:12 Deadline=N/A
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=defq AllocNode:Sid=node2:25638
ReqNodeList=(null) ExcNodeList=(null)
NodeList=node1
BatchHost=node2
NumNodes=1 NumCPUs=2 NumTasks=1 CPUs/Task=2 ReqB:S:C:T=0:0:*:*
TRES=cpu=2,mem=17600M,node=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
Nodes=node1 CPU_IDs=12-13 Mem=17600 GRES_IDX=
MinCPUsNode=2 MinMemoryCPU=8800M MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
Gres=(null) Reservation=(null)
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=bash
WorkDir=/home/USER
Power=
如果这些信息还不够,scontrol pidinfo PID 的输出可能也会对您有用:
$ scontrol pidinfo 43734
Slurm job id 21757758 ends at Fri Apr 12 20:15:49 2019
slurm_get_rem_time is 6647