AWS ECS Fargate 运行任务:任务中的必要容器(Essential container)已退出

AWS ECS Fargate run task: Essential container in task exited

目标:

在 ECS Fargate 容器中创建交互式 shell

问题:

ECS 服务内的任务启动后,任务状态在 PENDING 之后立即变为 STOPPED,并给出以下停止原因:Essential container in task exited。由于任务已停止,因此无法使用 aws ecs execute-command 创建交互式 shell。

背景:

Dockerfile:

# Build stage: runs install.sh on top of the python:3.9-alpine image.
FROM python:3.9-alpine AS build

# Pinned tool versions exposed as build arguments for this stage
# (presumably consumed by install.sh, which is not shown here - confirm).
ARG TERRAFORM_VERSION=1.0.2
ARG TERRAGRUNT_VERSION=0.31.0
ARG TFLINT_VERSION=0.23.0
ARG TFSEC_VERSION=0.36.11
ARG TFDOCS_VERSION=0.10.1
ARG GIT_CHGLOG_VERSION=0.14.2
ARG SEMTAG_VERSION=0.1.1
ARG GH_VERSION=2.2.0
ARG TFENV_VERSION=2.2.2

# Virtualenv location; its bin/ directory is placed first on PATH.
ENV VIRTUAL_ENV=/opt/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

WORKDIR /src/

# Only the installer and the Python requirements are needed in this stage.
COPY install.sh ./install.sh
COPY requirements.txt ./requirements.txt

# Make the installer executable for its owner, then run it with sh.
RUN chmod u+x ./install.sh \
    && sh ./install.sh

# Final stage: a fresh python:3.9-alpine image that receives the artifacts
# produced by the build stage.
FROM python:3.9-alpine

ENV VIRTUAL_ENV=/opt/venv
# Python: do not write .pyc files, and keep stdout/stderr unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
ENV PATH="/usr/local/.tfenv/bin:$PATH"

WORKDIR /src/

# Bring over everything under /usr/local and the virtualenv from the build stage.
COPY --from=build /usr/local /usr/local
COPY --from=build $VIRTUAL_ENV $VIRTUAL_ENV

# NOTE(review): site-packages is a library directory rather than a directory
# of executables - confirm it really needs to be on PATH.
ENV PATH="$VIRTUAL_ENV/bin:$VIRTUAL_ENV/lib/python3.9/site-packages:$PATH"

# Install the runtime packages under the virtual package name ".runtime",
# point /usr/local/bin/python at python3, and set global git options/identity.
RUN apk update \
    && apk add --virtual .runtime \
    bash \
    git \
    curl \
    jq \
    # needed for bats --pretty formatter
    ncurses \
    openssl \
    grep \
    # needed for pcregrep
    pcre-tools \
    coreutils \
    postgresql-client \
    libgcc \
    libstdc++ \
    ncurses-libs \
    docker \
&& ln -sf python3 /usr/local/bin/python \
&& git config --global advice.detachedHead false \
&& git config --global user.email testing_user@users.noreply.github.com \
&& git config --global user.name testing_user

COPY entrypoint.sh ./entrypoint.sh

# With an exec-form ENTRYPOINT, CMD is appended as arguments, so the
# container's main process is `bash entrypoint.sh /bin/bash`. The
# entrypoint.sh shown in this document never references its arguments, so
# /bin/bash is not started and the container ends when the script ends.
ENTRYPOINT ["bash", "entrypoint.sh"]
CMD ["/bin/bash"]

entrypoint.sh:

# Container entrypoint: optionally extends PATH, activates the virtualenv,
# installs the project in /src in editable mode, then prints "done".
# Arguments passed to this script are never used and no long-running command
# is started, so the script (and the process running it) ends right after
# the final echo.

# Prepend ADDITIONAL_PATH to PATH when it is set and non-empty.
if [ -n "$ADDITIONAL_PATH" ]; then
    echo "Adding to PATH: $ADDITIONAL_PATH"
    export PATH="$ADDITIONAL_PATH:$PATH"
fi

# Activate the virtualenv located at $VIRTUAL_ENV.
source $VIRTUAL_ENV/bin/activate
# Editable install of the package found in /src.
pip install -e /src

echo "done"

ECS 的 Terraform 配置(参考了这篇 AWS 博客文章):

# Identity and region of the configured AWS provider; used below for the KMS
# admin ARN and for the awslogs region.
data "aws_caller_identity" "current" {}
data "aws_region" "current" {}

# VPC built with the terraform-aws-modules/vpc/aws module. The public,
# private and database subnet CIDR lists come from locals defined elsewhere.
module "vpc" {
  source = "terraform-aws-modules/vpc/aws"

  name = local.mut_id
  cidr = "10.0.0.0/16"
  azs  = ["us-west-2a", "us-west-2b", "us-west-2c", "us-west-2d"]
  enable_dns_hostnames = true

  public_subnets = local.public_subnets

  # Database subnets get a dedicated network ACL whose only inbound rule
  # allows TCP 5432 from the first private subnet.
  create_database_subnet_group   = true
  database_dedicated_network_acl = true
  database_inbound_acl_rules = [
    {
      rule_number = 1
      rule_action = "allow"
      from_port   = 5432
      to_port     = 5432
      protocol    = "tcp"
      cidr_block  = local.private_subnets[0]
    }
  ]
  database_subnet_group_name = "metadb"
  database_subnets           = local.database_subnets

  # Private subnets get a dedicated network ACL whose only outbound rule
  # allows TCP 5432 to the first database subnet.
  private_subnets               = local.private_subnets
  private_dedicated_network_acl = true
  private_outbound_acl_rules = [
    {
      rule_number = 1
      rule_action = "allow"
      from_port   = 5432
      to_port     = 5432
      protocol    = "tcp"
      cidr_block  = local.database_subnets[0]
    }
  ]

  # NAT is enabled with a single gateway rather than one per availability zone.
  enable_nat_gateway     = true
  single_nat_gateway     = true
  one_nat_gateway_per_az = false
}

# ECR repository and Docker image for the testing container. Its
# full_image_url output is used as the container image in the task definition.
# NOTE(review): presumably the module builds the image from source_path and
# rebuilds when any file in trigger_build_paths changes - confirm against the
# module's documentation.
module "ecr_testing_img" {
  source = "github.com/marshall7m/terraform-aws-ecr/modules//ecr-docker-img"

  create_repo = true
  source_path = "${path.module}/../.."
  repo_name   = "${local.mut_id}-integration-testing"
  tag         = "latest"
  trigger_build_paths = [
    "${path.module}/../../Dockerfile",
    "${path.module}/../../entrypoint.sh",
    "${path.module}/../../install.sh"
  ]
}

# KMS key from the "cmk" module. Its ARN is referenced by the cluster's
# execute-command configuration and by the task role's kms:Decrypt statement.
# The current caller is listed as admin and ecs-tasks.amazonaws.com as a
# trusted usage principal.
module "testing_kms" {
  source                           = "github.com/marshall7m/terraform-aws-kms/modules//cmk"
  trusted_admin_arns               = [data.aws_caller_identity.current.arn]
  trusted_service_usage_principals = ["ecs-tasks.amazonaws.com"]
}

# Task role (used as task_role_arn in the task definition), trusted by
# ecs-tasks.amazonaws.com.
module "testing_ecs_task_role" {
  source           = "github.com/marshall7m/terraform-aws-iam/modules//iam-role"
  role_name        = "${local.mut_id}-task"
  trusted_services = ["ecs-tasks.amazonaws.com"]
  statements = [
    # Decrypt with the KMS key configured for execute-command on the cluster.
    {
      effect    = "Allow"
      actions   = ["kms:Decrypt"]
      resources = [module.testing_kms.arn]
    },
    # SSM Messages channel permissions, which ECS Exec needs on the task role.
    {
      effect = "Allow"
      actions = [
        "ssmmessages:CreateControlChannel",
        "ssmmessages:CreateDataChannel",
        "ssmmessages:OpenControlChannel",
        "ssmmessages:OpenDataChannel"
      ]
      resources = ["*"]
    }
  ]
}

# Task execution role (used as execution_role_arn in the task definition):
# trusted by ecs-tasks.amazonaws.com and given the AWS-managed
# AmazonECSTaskExecutionRolePolicy.
module "testing_ecs_execution_role" {
  source                  = "github.com/marshall7m/terraform-aws-iam/modules//iam-role"
  role_name               = "${local.mut_id}-exec"
  trusted_services        = ["ecs-tasks.amazonaws.com"]
  custom_role_policy_arns = ["arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"]
}

# ECS cluster for the integration tests. The execute-command configuration
# uses the testing KMS key and the "DEFAULT" logging mode.
resource "aws_ecs_cluster" "testing" {
  name = "${local.mut_id}-integration-testing"

  configuration {
    execute_command_configuration {
      kms_key_id = module.testing_kms.arn
            logging = "DEFAULT"
    }
  }
}

# Fargate service for the testing task definition with ECS Exec enabled.
# desired_count is 0, so the service itself keeps no tasks running.
resource "aws_ecs_service" "testing" {
  name                   = "${local.mut_id}-integration-testing"
  task_definition        = aws_ecs_task_definition.testing.arn
  cluster                = aws_ecs_cluster.testing.id
  desired_count          = 0
  enable_execute_command = true
  launch_type            = "FARGATE"
  platform_version       = "1.4.0"
  # Tasks are placed in the first public subnet and receive a public IP.
  network_configuration {
    subnets         = [module.vpc.public_subnets[0]]
    security_groups = [aws_security_group.testing.id]
    assign_public_ip = true
  }
  wait_for_steady_state = true
}

# CloudWatch log group that the container's awslogs driver writes to.
resource "aws_cloudwatch_log_group" "testing" {
  name = "${local.mut_id}-ecs"
}

# Fargate task definition with a single container named "testing".
# The container definition sets neither `command` nor `entryPoint`, so the
# image's own ENTRYPOINT/CMD decide what runs. `essential` is not set either
# (ECS treats a container as essential unless told otherwise), so when this
# container's process exits the whole task stops.
resource "aws_ecs_task_definition" "testing" {
  family                   = "integration-testing"
  requires_compatibilities = ["FARGATE"]
  task_role_arn            = module.testing_ecs_task_role.role_arn
  execution_role_arn       = module.testing_ecs_execution_role.role_arn
  network_mode             = "awsvpc"
  cpu                      = 256
  memory                   = 512
  container_definitions = jsonencode([{
    name  = "testing"
    image = module.ecr_testing_img.full_image_url
    # Run an init process inside the container.
    linuxParameters = {
      initProcessEnabled = true
    }
        # Ship container output to the CloudWatch log group defined above.
        logConfiguration = {
            logDriver = "awslogs",
            options = {
                awslogs-group = aws_cloudwatch_log_group.testing.name
                awslogs-region = data.aws_region.current.name
                awslogs-stream-prefix = "testing"
            }
        }
    cpu    = 256
    memory = 512
  }])
  runtime_platform {
    operating_system_family = "LINUX"
    cpu_architecture        = "X86_64"
  }
}

# Security group for the testing task: no ingress rules are declared, and
# egress is limited to TCP 80 and TCP 443 to any IPv4 address.
resource "aws_security_group" "testing" {
  name        = "${local.mut_id}-integration-testing-ecs"
  description = "Allows internet access request from testing container"
  vpc_id      = module.vpc.vpc_id

  egress {
    description = "Allows outbound HTTP access for installing packages within container"
    from_port   = 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }
  egress {
    description = "Allows outbound HTTPS access for installing packages within container"
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

运行 ECS 任务并在容器内执行命令的 Bash 脚本片段:

# Launch a one-off Fargate task with ECS Exec enabled, wait until its
# ExecuteCommandAgent reports RUNNING, then open an interactive shell in it.
#
# Variables read (set by the caller): cluster_arn, task_arn, subnet_id, sg_id,
# AWS_REGION, and optionally run_ecs_exec_check ("true" runs the exec checker).
# Exits 1 when the task cannot be started or stops before the agent is running.
task_id=$(aws ecs run-task \
    --cluster "$cluster_arn" \
    --task-definition "$task_arn" \
    --launch-type FARGATE \
    --platform-version '1.4.0' \
    --enable-execute-command \
    --network-configuration awsvpcConfiguration="{subnets=[$subnet_id],securityGroups=[$sg_id],assignPublicIp=ENABLED}" \
    --region "$AWS_REGION" \
    | jq -r '.tasks[0].taskArn // empty | split("/") | .[-1]')

# run-task can fail outright or return an empty "tasks" list; without a task
# ID the polling loop below could never finish.
if [[ -z "$task_id" ]]; then
    echo "Failed to start ECS task: run-task returned no task ARN" >&2
    exit 1
fi

echo "Task ID: $task_id"

if [[ "$run_ecs_exec_check" == true ]]; then
    # NOTE: this downloads and executes the checker straight from the main
    # branch; pin a commit if reproducibility matters.
    bash <(curl -Ls https://raw.githubusercontent.com/aws-containers/amazon-ecs-exec-checker/main/check-ecs-exec.sh) "$cluster_arn" "$task_id"
fi

sleep_time=10
status=""
echo ""
echo "Waiting for task to be running"
while [[ "$status" != "RUNNING" ]]; do
    echo "Checking status in $sleep_time seconds..."
    sleep "$sleep_time"

    # One describe-tasks call per iteration; both statuses are read from it.
    task_json=$(aws ecs describe-tasks \
        --cluster "$cluster_arn" \
        --region "$AWS_REGION" \
        --tasks "$task_id")

    # `[]?` keeps jq quiet while managedAgents is still null.
    status=$(jq -r '.tasks[0].containers[0].managedAgents[]? | select(.name == "ExecuteCommandAgent") | .lastStatus' <<<"$task_json")
    task_status=$(jq -r '.tasks[0].lastStatus // empty' <<<"$task_json")

    echo "Status: $status"

    # Give up as soon as either the agent or the task itself has stopped, and
    # print the full description (it includes the stop reason) for debugging.
    if [[ "$status" == "STOPPED" || "$task_status" == "STOPPED" ]]; then
        printf '%s\n' "$task_json"
        exit 1
    fi

    # Optional exponential backoff (disabled):
    # sleep_time=$(( sleep_time * 2 ))
done

echo "Running interactive shell within container"

aws ecs execute-command \
    --region "$AWS_REGION" \
    --cluster "$cluster_arn" \
    --task "$task_id" \
    --command "/bin/bash" \
    --interactive

一旦 entrypoint.sh 中的最后一条命令执行完毕,docker 容器就会退出,这与在本地运行该 docker 容器时的行为完全一样。我建议先确保容器在本地运行时不会自行退出,然后再将其部署到 ECS。

如果您只是想让容器保持运行而不做任何事情,可以使用 tail -f /dev/null 这样的命令(例如把它作为 entrypoint.sh 的最后一条命令)。