如何使用 fabric8 kubernetes Java 客户端 API 在容器上设置 GPU 资源要求
How to set GPU resource requirements on a container with the fabric8 kubernetes Java client API
我用 fabric8 kubernetes Java 客户端 API 编写了一个示例来设置容器上的 GPU 资源要求。我遇到以下运行时错误:
spec.containers[0].resources.requests[gpu]: Invalid value: "gpu": must be a standard resource type or fully qualified,
spec.containers[0].resources.requests[gpu]: Invalid value: "gpu": must be a standard resource for containers.
所用 fabric8 jar 版本为 4.3.0(最新版)。看起来 fabric8 目前还不支持 GPU 资源请求;只要把 addToRequests("gpu", new Quantity("1")) 这一行去掉,程序就可以正常工作。
如何在 Java/Scala 应用程序中启用 GPU 资源需求?
示例的全部源码如下:
/**
* Copyright (C) 2015 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.exam.docker.kubernetes.examples;
import io.fabric8.kubernetes.api.model.*;
import io.fabric8.kubernetes.client.*;
import io.fabric8.kubernetes.client.Config;
import io.fabric8.kubernetes.client.ConfigBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
/**
 * Example that creates a one-shot CUDA pod with CPU, memory and GPU resource requirements
 * through the fabric8 Kubernetes client, watches it until it succeeds (or 30 seconds elapse),
 * prints its log and finally deletes the namespace it ran in.
 */
public class PodResExamples {
  private static final Logger logger = LoggerFactory.getLogger(PodResExamples.class);

  /**
   * Runs the example.
   *
   * @param args optional single argument: the Kubernetes master URL
   *             (defaults to {@code http://127.0.0.1:8080/})
   */
  public static void main(String[] args) {
    String master = "http://127.0.0.1:8080/";
    if (args.length == 1) {
      master = args[0];
    }
    String ns = "thisisatest";
    String serviceName = "cuda-vector-add-" + UUID.randomUUID();
    Config config = new ConfigBuilder().withMasterUrl(master).build();
    try (KubernetesClient client = new DefaultKubernetesClient(config)) {
      try {
        if (client.namespaces().withName(ns).get() == null) {
          log("Create namespace:", client.namespaces().create(
              new NamespaceBuilder().withNewMetadata().withName(ns).endMetadata().build()));
        }
        String imageStr = "k8s.gcr.io/cuda-vector-add:v0.1";
        // NOTE(review): an empty string is passed as the container's only argument below;
        // confirm this is intended.
        String cmd = "";
        // The bare name "gpu" is rejected by the API server ("must be a standard resource type
        // or fully qualified"). GPUs are extended resources: they need a vendor-qualified name
        // such as "nvidia.com/gpu" and must be declared under limits; the request then
        // defaults to the limit.
        final ResourceRequirements resources = new ResourceRequirementsBuilder()
            .addToRequests("cpu", new Quantity("2"))
            .addToRequests("memory", new Quantity("10Gi"))
            .addToLimits("nvidia.com/gpu", new Quantity("1"))
            .build();
        Container container = new ContainerBuilder().withName(serviceName)
            .withImage(imageStr).withImagePullPolicy("IfNotPresent")
            .withArgs(cmd)
            .withResources(resources)
            .build();
        Pod createdPod = client.pods().inNamespace(ns).createNew()
            .withNewMetadata()
            .withName(serviceName)
            .addToLabels("podres", "cuda-vector")
            .endMetadata()
            .withNewSpec()
            .addToContainers(container)
            .withRestartPolicy("Never")
            .endSpec().done();
        log("Created pod cuda-vector-add:", createdPod);

        final CountDownLatch watchLatch = new CountDownLatch(1);
        try (final Watch ignored = client.pods().inNamespace(ns).withLabel("podres")
            .watch(new Watcher<Pod>() {
              @Override
              public void eventReceived(final Action action, Pod pod) {
                // An event may carry a pod without a status or phase; compare null-safely.
                final PodStatus status = pod.getStatus();
                final String phase = status == null ? null : status.getPhase();
                if ("Succeeded".equals(phase)) {
                  logger.info("Pod cuda-vector is completed!");
                  logger.info(client.pods().inNamespace(ns)
                      .withName(pod.getMetadata().getName()).getLog());
                  watchLatch.countDown();
                } else if ("Pending".equals(phase)) {
                  logger.info("Pod cuda-vector is Pending!");
                }
              }

              @Override
              public void onClose(final KubernetesClientException e) {
                logger.info("Cleaning up pod.");
              }
            })) {
          if (!watchLatch.await(30, TimeUnit.SECONDS)) {
            logger.warn("Pod cuda-vector did not complete within 30 seconds");
          }
        } catch (final KubernetesClientException e) {
          logger.error("Could not watch pod", e);
        } catch (final InterruptedException e) {
          // Restore the interrupt flag so callers further up can still observe it.
          Thread.currentThread().interrupt();
          logger.error("Could not watch pod", e);
        }
      } catch (KubernetesClientException e) {
        logger.error(e.getMessage(), e);
      } finally {
        try {
          log("Pod cuda-vector log: \n",
              client.pods().inNamespace(ns).withName(serviceName).getLog());
        } catch (KubernetesClientException e) {
          // Fetching the log can fail (for instance when the pod was never created); that
          // must not prevent the namespace cleanup below.
          logger.warn("Could not fetch log of pod {}", serviceName, e);
        }
        client.namespaces().withName(ns).delete();
      }
    }
  }

  /** Logs {@code action} followed by the string form of {@code obj} at INFO level. */
  private static void log(String action, Object obj) {
    logger.info("{}: {}", action, obj);
  }

  /** Logs {@code action} at INFO level. */
  private static void log(String action) {
    logger.info(action);
  }
}
参考 Kubernetes 文档,您可以尝试使用 nvidia.com/gpu 而不是 gpu:
# Pod that runs the cuda-vector-add image and asks for one NVIDIA GPU via resource limits.
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  containers:
    - name: cuda-vector-add
      image: "k8s.gcr.io/cuda-vector-add:v0.1"
      resources:
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
如果您的应用程序使用的是 AMD GPU,请改用 amd.com/gpu。
重要说明:不能只设置 GPU 的 requests 而不设置 limits;如果两者都设置,它们的值必须相等。
我用 fabric8 kubernetes Java 客户端 API 编写了一个示例来设置容器上的 GPU 资源要求。我遇到以下运行时错误:
spec.containers[0].resources.requests[gpu]: Invalid value: "gpu": must be a standard resource type or fully qualified,
spec.containers[0].resources.requests[gpu]: Invalid value: "gpu": must be a standard resource for containers.
所用 fabric8 jar 版本为 4.3.0(最新版)。看起来 fabric8 目前还不支持 GPU 资源请求;只要把 addToRequests("gpu", new Quantity("1")) 这一行去掉,程序就可以正常工作。
如何在 Java/Scala 应用程序中启用 GPU 资源需求?
示例的全部源码如下:
/**
* Copyright (C) 2015 Red Hat, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.exam.docker.kubernetes.examples;
import io.fabric8.kubernetes.api.model.*;
import io.fabric8.kubernetes.client.*;
import io.fabric8.kubernetes.client.Config;
import io.fabric8.kubernetes.client.ConfigBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
/**
 * Example that creates a one-shot CUDA pod with CPU, memory and GPU resource requirements
 * through the fabric8 Kubernetes client, watches it until it succeeds (or 30 seconds elapse),
 * prints its log and finally deletes the namespace it ran in.
 */
public class PodResExamples {
  private static final Logger logger = LoggerFactory.getLogger(PodResExamples.class);

  /**
   * Runs the example.
   *
   * @param args optional single argument: the Kubernetes master URL
   *             (defaults to {@code http://127.0.0.1:8080/})
   */
  public static void main(String[] args) {
    String master = "http://127.0.0.1:8080/";
    if (args.length == 1) {
      master = args[0];
    }
    String ns = "thisisatest";
    String serviceName = "cuda-vector-add-" + UUID.randomUUID();
    Config config = new ConfigBuilder().withMasterUrl(master).build();
    try (KubernetesClient client = new DefaultKubernetesClient(config)) {
      try {
        if (client.namespaces().withName(ns).get() == null) {
          log("Create namespace:", client.namespaces().create(
              new NamespaceBuilder().withNewMetadata().withName(ns).endMetadata().build()));
        }
        String imageStr = "k8s.gcr.io/cuda-vector-add:v0.1";
        // NOTE(review): an empty string is passed as the container's only argument below;
        // confirm this is intended.
        String cmd = "";
        // The bare name "gpu" is rejected by the API server ("must be a standard resource type
        // or fully qualified"). GPUs are extended resources: they need a vendor-qualified name
        // such as "nvidia.com/gpu" and must be declared under limits; the request then
        // defaults to the limit.
        final ResourceRequirements resources = new ResourceRequirementsBuilder()
            .addToRequests("cpu", new Quantity("2"))
            .addToRequests("memory", new Quantity("10Gi"))
            .addToLimits("nvidia.com/gpu", new Quantity("1"))
            .build();
        Container container = new ContainerBuilder().withName(serviceName)
            .withImage(imageStr).withImagePullPolicy("IfNotPresent")
            .withArgs(cmd)
            .withResources(resources)
            .build();
        Pod createdPod = client.pods().inNamespace(ns).createNew()
            .withNewMetadata()
            .withName(serviceName)
            .addToLabels("podres", "cuda-vector")
            .endMetadata()
            .withNewSpec()
            .addToContainers(container)
            .withRestartPolicy("Never")
            .endSpec().done();
        log("Created pod cuda-vector-add:", createdPod);

        final CountDownLatch watchLatch = new CountDownLatch(1);
        try (final Watch ignored = client.pods().inNamespace(ns).withLabel("podres")
            .watch(new Watcher<Pod>() {
              @Override
              public void eventReceived(final Action action, Pod pod) {
                // An event may carry a pod without a status or phase; compare null-safely.
                final PodStatus status = pod.getStatus();
                final String phase = status == null ? null : status.getPhase();
                if ("Succeeded".equals(phase)) {
                  logger.info("Pod cuda-vector is completed!");
                  logger.info(client.pods().inNamespace(ns)
                      .withName(pod.getMetadata().getName()).getLog());
                  watchLatch.countDown();
                } else if ("Pending".equals(phase)) {
                  logger.info("Pod cuda-vector is Pending!");
                }
              }

              @Override
              public void onClose(final KubernetesClientException e) {
                logger.info("Cleaning up pod.");
              }
            })) {
          if (!watchLatch.await(30, TimeUnit.SECONDS)) {
            logger.warn("Pod cuda-vector did not complete within 30 seconds");
          }
        } catch (final KubernetesClientException e) {
          logger.error("Could not watch pod", e);
        } catch (final InterruptedException e) {
          // Restore the interrupt flag so callers further up can still observe it.
          Thread.currentThread().interrupt();
          logger.error("Could not watch pod", e);
        }
      } catch (KubernetesClientException e) {
        logger.error(e.getMessage(), e);
      } finally {
        try {
          log("Pod cuda-vector log: \n",
              client.pods().inNamespace(ns).withName(serviceName).getLog());
        } catch (KubernetesClientException e) {
          // Fetching the log can fail (for instance when the pod was never created); that
          // must not prevent the namespace cleanup below.
          logger.warn("Could not fetch log of pod {}", serviceName, e);
        }
        client.namespaces().withName(ns).delete();
      }
    }
  }

  /** Logs {@code action} followed by the string form of {@code obj} at INFO level. */
  private static void log(String action, Object obj) {
    logger.info("{}: {}", action, obj);
  }

  /** Logs {@code action} at INFO level. */
  private static void log(String action) {
    logger.info(action);
  }
}
参考 Kubernetes 文档,您可以尝试使用 nvidia.com/gpu 而不是 gpu:
# Pod that runs the cuda-vector-add image and asks for one NVIDIA GPU via resource limits.
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
spec:
  containers:
    - name: cuda-vector-add
      image: "k8s.gcr.io/cuda-vector-add:v0.1"
      resources:
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
如果您的应用程序使用的是 AMD GPU,请改用 amd.com/gpu。
重要说明:不能只设置 GPU 的 requests 而不设置 limits;如果两者都设置,它们的值必须相等。