在 EKS Fargate Pod 上,以 emptyDir{} 类型挂载的公共目录中的文件夹被删除/未被创建

Folder deleted/not created inside the common dir mounted with emptyDir{} type on EKS Fargate pod

我们在 EKS Fargate Pod 上遇到了一个奇怪的问题。我们想使用 sidecar fluent-bit 容器将日志推送到 CloudWatch,为此我们把单独创建的 /logs/boot 和 /logs/access 文件夹以 emptyDir: {} 类型的卷挂载到两个容器上。但是不知何故,access 文件夹被删除了。当我们在本地 Docker 中测试此设置时,它产生了预期的结果并且一切正常,但部署到 EKS Fargate 后却不行。下面是我们的清单文件。

Dockerfile

# Builds the application image on top of an Alpine-based Java 8 server JRE:
# installs base packages, pre-creates the log directories, sets the time zone,
# copies the build context plus a custom trust store and standalone.xml, and
# starts ./erctl from the JBoss bin directory.
FROM anapsix/alpine-java:8u201b09_server-jre_nashorn

# Root of the log directories created below; overridable with --build-arg.
ARG LOG_DIR=/logs

# Install base packages
RUN apk update
RUN apk upgrade
# RUN apk add ca-certificates && update-ca-certificates

# Dynamically set the JAVA_HOME path
# NOTE(review): an `export` inside RUN only lives for that build step's shell;
# the value that ends up in the image is the one set by `ENV JAVA_HOME` below.
RUN export JAVA_HOME="$(dirname $(dirname $(readlink -f $(which java))))" && echo $JAVA_HOME

# Add Curl
RUN apk --no-cache add curl

# Create the boot and access log directories inside the image and open up
# their permissions.
# NOTE(review): these directories exist only in the image filesystem; a volume
# mounted at $LOG_DIR at runtime hides them.
RUN mkdir -p $LOG_DIR/boot $LOG_DIR/access
RUN chmod -R 0777 $LOG_DIR/*

# Add metadata to the image to describe which port the container is listening on at runtime.
# (The corresponding EXPOSE instruction is near the end of this file.)

# Change TimeZone
RUN apk add --update tzdata
ENV TZ="Asia/Kolkata"

# Clean APK cache
# NOTE(review): this runs in its own layer, so it does not shrink the layers
# created by the earlier apk commands.
RUN rm -rf /var/cache/apk/*

# Setting JAVA HOME
ENV JAVA_HOME=/opt/jdk

# Copy all files and folders
# The build context is copied into the image's current working directory.
COPY . .
# Replace the JDK's default trust store with the cacerts file from the build context.
RUN rm -rf /opt/jdk/jre/lib/security/cacerts
COPY cacerts /opt/jdk/jre/lib/security/cacerts
# Install the JBoss standalone configuration.
COPY standalone.xml /jboss-eap-6.4-integration/standalone/configuration/

# Set the working directory.
WORKDIR /jboss-eap-6.4-integration/bin

# Port the application listens on.
EXPOSE 8177

# Resolved relative to WORKDIR, i.e. /jboss-eap-6.4-integration/bin/erctl.
CMD ["./erctl"]

部署

# Deployment for the vinintegrator application with a Fluent Bit sidecar.
# Both containers mount the shared "logs" emptyDir volume at /logs; the
# sidecar mounts it read-only and tails the files the application writes.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vinintegrator
  namespace: eretail
  labels:
    app: vinintegrator
    pod: fargate
spec:
  selector:
    matchLabels:
      app: vinintegrator
      pod: fargate
  replicas: 2
  template:
    metadata:
      labels:
        app: vinintegrator
        pod: fargate
    spec:
      securityContext:
        # Mounted volumes become group-owned by GID 0 (setgid bit on the dir).
        fsGroup: 0
      # serviceAccountName replaces the deprecated `serviceAccount` alias.
      serviceAccountName: eretail
      containers:
        - name: vinintegrator
          imagePullPolicy: IfNotPresent
          image: 653580443710.dkr.ecr.ap-southeast-1.amazonaws.com/vinintegrator-service:latest
          resources:
            limits:
              memory: "7629Mi"
              cpu: "1.5"
            requests:
              memory: "5435Mi"
              cpu: "750m"
          ports:
            - containerPort: 8177
              protocol: TCP
          # securityContext:
            # runAsUser: 506
            # runAsGroup: 506
          volumeMounts:
            - mountPath: /jboss-eap-6.4-integration/bin
              name: bin
            # NOTE: an emptyDir volume starts out empty, so directories baked
            # into the image under /logs are not visible once it is mounted
            # here; they have to be created at runtime.
            - mountPath: /logs
              name: logs
        - name: fluent-bit
          image: 657281243710.dkr.ecr.ap-southeast-1.amazonaws.com/fluent-bit:latest
          imagePullPolicy: IfNotPresent
          env:
            # Pod metadata exposed to the sidecar through the downward API.
            - name: HOST_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          resources:
            limits:
              memory: 200Mi
            requests:
              cpu: 200m
              memory: 100Mi
          volumeMounts:
            - name: fluent-bit-config
              mountPath: /fluent-bit/etc/
            # The sidecar only reads the log files, hence the read-only mount.
            - name: logs
              mountPath: /logs
              readOnly: true
      volumes:
        - name: fluent-bit-config
          configMap:
            name: fluent-bit-config
        # Pod-scoped scratch volume shared by both containers.
        - name: logs
          emptyDir: {}
        - name: bin
          persistentVolumeClaim:
            claimName: vinintegrator-pvc

以下是 /logs 文件夹的所有权和权限。请注意 drwxrwsrwx 中的 's'。
drwxrwsrwx    3 root     root          4096 Oct  1 11:50 logs

以下是 /logs 文件夹中的内容。请注意,access 文件夹未被创建或已被删除。

/logs # ls -lrt
total 4
drwxr-sr-x    2 root     root          4096 Oct  1 11:50 boot
/logs #

下面是 Fluent Bit 的 ConfigMap

# ConfigMap holding the Fluent Bit configuration files. Each key under `data`
# is one file; the block-scalar contents are consumed verbatim by Fluent Bit.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: eretail
  labels:
    k8s-app: fluent-bit
data:
  # Main configuration: service settings (5 s flush interval, info log level,
  # HTTP server on 0.0.0.0:2020) plus an include of application-log.conf.
  fluent-bit.conf: |
    [SERVICE]
        Flush                     5
        Log_Level                 info
        Daemon                    off
        Parsers_File              parsers.conf
        HTTP_Server               On
        HTTP_Listen               0.0.0.0
        HTTP_Port                 2020
        
    @INCLUDE application-log.conf
  
  # Pipeline: two tail inputs (/logs/boot/*.log tagged "boot" and
  # /logs/access/*.log tagged "access") and two cloudwatch_logs outputs that
  # match on those tags and write to log group eks-fluent-bit in
  # ap-southeast-1 with a per-source stream prefix.
  application-log.conf: |
    [INPUT]
        Name                tail
        Path                /logs/boot/*.log
        Tag                 boot
        
    [INPUT]
        Name                tail
        Path                /logs/access/*.log
        Tag                 access
        
    [OUTPUT]
        Name                cloudwatch_logs
        Match               *boot*
        region              ap-southeast-1
        log_group_name      eks-fluent-bit
        log_stream_prefix   boot-log-
        auto_create_group   On
        
    [OUTPUT]
        Name                cloudwatch_logs
        Match               *access*
        region              ap-southeast-1
        log_group_name      eks-fluent-bit
        log_stream_prefix   access-log-
        auto_create_group   On
        
  # Parser definitions loaded through Parsers_File.
  # NOTE(review): neither [INPUT] section above references the "docker" parser.
  parsers.conf: |
    [PARSER]
        Name                docker
        Format              json
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ

Fluent-bit容器错误日志如下

AWS for Fluent Bit Container Image Version 2.14.0
Fluent Bit v1.7.4
* Copyright (C) 2019-2021 The Fluent Bit Authors
* Copyright (C) 2015-2018 Treasure Data
* Fluent Bit is a CNCF sub-project under the umbrella of Fluentd
* https://fluentbit.io

[2021/10/01 06:20:33] [ info] [engine] started (pid=1)
[2021/10/01 06:20:33] [ info] [storage] version=1.1.1, initializing...
[2021/10/01 06:20:33] [ info] [storage] in-memory
[2021/10/01 06:20:33] [ info] [storage] normal synchronization mode, checksum disabled, max_chunks_up=128
[2021/10/01 06:20:33] [error] [input:tail:tail.1] read error, check permissions: /logs/access/*.log
[2021/10/01 06:20:33] [ warn] [input:tail:tail.1] error scanning path: /logs/access/*.log
[2021/10/01 06:20:38] [error] [net] connection #33 timeout after 5 seconds to: 169.254.169.254:80
[2021/10/01 06:20:38] [error] [net] socket #33 could not connect to 169.254.169.254:80

建议从您的 Dockerfile 中删除以下内容:

RUN mkdir -p $LOG_DIR/boot $LOG_DIR/access
RUN chmod -R 0777 $LOG_DIR/*

使用以下方法设置日志目录和权限:

# Example Pod (the same pod spec can be used in a Deployment template).
# An init container creates the log directories inside the shared emptyDir
# volume before the application and logger containers start.
apiVersion: v1
kind: Pod
metadata:
  name: busy
  labels:
    app: busy
spec:
  volumes:
  - name: logs  # Shared folder with ephemeral storage
    emptyDir: {}

  initContainers:    # Setup your log directory here
  - name: setup
    image: busybox
    # Absolute shell path: a relative "bin/ash" only resolves when the
    # container's working directory happens to be "/".
    command: ["/bin/sh", "-c"]
    args:
    - >
      mkdir -p /logs/boot /logs/access;
      chmod -R 777 /logs
    volumeMounts:
    - name: logs
      mountPath: /logs

  containers:
  - name: app    # Run your application and logs to the directories
    image: busybox
    command: ["/bin/sh", "-c"]
    args:
    # Stand-in for the real application: appends one line per second to
    # both log files.
    - >
      while :; do echo "$(date): $(uname -r)" | tee -a /logs/boot/boot.log /logs/access/access.log; sleep 1; done
    volumeMounts:
    - name: logs
      mountPath: /logs

  - name: logger    # Any logger that you like
    image: busybox
    command: ["/bin/sh", "-c"]
    args:           # tail the app logs, forward to CW etc...
    # The initial sleep gives the app container time to create the files.
    - >
      sleep 5;
      tail -f /logs/boot/boot.log /logs/access/access.log
    volumeMounts:
    - name: logs
      mountPath: /logs

上面的示例可以在 Fargate 上运行;运行 kubectl logs -f busy -c logger 即可查看 tail 的输出。在实际场景中,“app”容器就是您的 Java 应用程序,“logger”容器可以是您选用的任何日志代理。注意:Fargate 的原生日志功能(native logging capability)本身就使用 AWS Fluent Bit,因此您不需要把 AWS Fluent Bit 作为 sidecar 运行。