无法在 Prometheus Operator 中获取 Spring Boot 应用程序的指标

Question

我正在尝试在 Prometheus Operator 中获取我的 Spring Boot 应用程序的指标。环境如下：EKS 版本 1.18；kube-prometheus-stack 版本 12.12.1，应用版本 0.44.0。

我检查过，应用程序确实通过以下端点暴露了指标：

http://myloadbalancer/internal-gateway/actuator/prometheus

# HELP system_cpu_usage The "recent cpu usage" for the whole system
# TYPE system_cpu_usage gauge
system_cpu_usage 0.013852972596312008
# HELP process_cpu_usage The "recent cpu usage" for the Java Virtual Machine process
# TYPE process_cpu_usage gauge
process_cpu_usage 0.0
# HELP jvm_gc_pause_seconds Time spent in GC pause
# TYPE jvm_gc_pause_seconds summary
jvm_gc_pause_seconds_count{action="end of major GC",cause="Allocation Failure",} 4.0
jvm_gc_pause_seconds_sum{action="end of major GC",cause="Allocation Failure",} 0.922
jvm_gc_pause_seconds_count{action="end of minor GC",cause="Allocation Failure",} 235.0
jvm_gc_pause_seconds_sum{action="end of minor GC",cause="Allocation Failure",} 2.584
# HELP jvm_gc_pause_seconds_max Time spent in GC pause
# TYPE jvm_gc_pause_seconds_max gauge
jvm_gc_pause_seconds_max{action="end of major GC",cause="Allocation Failure",} 0.0
jvm_gc_pause_seconds_max{action="end of minor GC",cause="Allocation Failure",} 0.0
# HELP jvm_gc_memory_allocated_bytes_total Incremented for an increase in the size of the young generation memory pool after one GC to before the next
# TYPE jvm_gc_memory_allocated_bytes_total counter
jvm_gc_memory_allocated_bytes_total 8.888016704E9
# HELP tomcat_sessions_active_current_sessions  
# TYPE tomcat_sessions_active_current_sessions gauge
tomcat_sessions_active_current_sessions 0.0
# HELP tomcat_sessions_alive_max_seconds  
# TYPE tomcat_sessions_alive_max_seconds gauge
tomcat_sessions_alive_max_seconds 0.0
# HELP jvm_gc_memory_promoted_bytes_total Count of positive increases in the size of the old generation memory pool before GC to after GC
# TYPE jvm_gc_memory_promoted_bytes_total counter
jvm_gc_memory_promoted_bytes_total 1.13497864E8
# HELP jvm_buffer_memory_used_bytes An estimate of the memory that the Java virtual machine is using for this buffer pool
# TYPE jvm_buffer_memory_used_bytes gauge
jvm_buffer_memory_used_bytes{id="mapped",} 0.0
jvm_buffer_memory_used_bytes{id="direct",} 509649.0
# HELP system_cpu_count The number of processors available to the Java virtual machine
# TYPE system_cpu_count gauge
system_cpu_count 1.0
# HELP tomcat_sessions_created_sessions_total  
# TYPE tomcat_sessions_created_sessions_total counter
tomcat_sessions_created_sessions_total 0.0
# HELP jvm_gc_live_data_size_bytes Size of old generation memory pool after a full GC
# TYPE jvm_gc_live_data_size_bytes gauge
jvm_gc_live_data_size_bytes 8.5375192E7
# HELP jvm_classes_unloaded_classes_total The total number of classes unloaded since the Java virtual machine has started execution
# TYPE jvm_classes_unloaded_classes_total counter
jvm_classes_unloaded_classes_total 199.0
# HELP tomcat_sessions_active_max_sessions  
# TYPE tomcat_sessions_active_max_sessions gauge
tomcat_sessions_active_max_sessions 0.0
# HELP process_files_open_files The open file descriptor count
# TYPE process_files_open_files gauge
process_files_open_files 66.0
# HELP logback_events_total Number of error level events that made it to the logs
# TYPE logback_events_total counter
logback_events_total{level="warn",} 2.0
logback_events_total{level="debug",} 0.0
logback_events_total{level="error",} 0.0
logback_events_total{level="trace",} 0.0
logback_events_total{level="info",} 443.0
# HELP jvm_gc_max_data_size_bytes Max size of old generation memory pool
# TYPE jvm_gc_max_data_size_bytes gauge
jvm_gc_max_data_size_bytes 5.36870912E8
# HELP jvm_buffer_count_buffers An estimate of the number of buffers in the pool
# TYPE jvm_buffer_count_buffers gauge
jvm_buffer_count_buffers{id="mapped",} 0.0
jvm_buffer_count_buffers{id="direct",} 18.0
# HELP jvm_buffer_total_capacity_bytes An estimate of the total capacity of the buffers in this pool
# TYPE jvm_buffer_total_capacity_bytes gauge
jvm_buffer_total_capacity_bytes{id="mapped",} 0.0
jvm_buffer_total_capacity_bytes{id="direct",} 509649.0
# HELP jvm_memory_committed_bytes The amount of memory in bytes that is committed for the Java virtual machine to use
# TYPE jvm_memory_committed_bytes gauge
jvm_memory_committed_bytes{area="heap",id="Tenured Gen",} 1.4229504E8
jvm_memory_committed_bytes{area="nonheap",id="CodeHeap 'profiled nmethods'",} 2.9229056E7
jvm_memory_committed_bytes{area="heap",id="Eden Space",} 5.7081856E7
jvm_memory_committed_bytes{area="nonheap",id="Metaspace",} 1.01359616E8
jvm_memory_committed_bytes{area="nonheap",id="CodeHeap 'non-nmethods'",} 2555904.0
jvm_memory_committed_bytes{area="heap",id="Survivor Space",} 7077888.0
jvm_memory_committed_bytes{area="nonheap",id="Compressed Class Space",} 1.31072E7
jvm_memory_committed_bytes{area="nonheap",id="CodeHeap 'non-profiled nmethods'",} 1.1599872E7
# HELP spring_kafka_listener_seconds_max Kafka Listener Timer
# TYPE spring_kafka_listener_seconds_max gauge
spring_kafka_listener_seconds_max{exception="ListenerExecutionFailedException",name="fgMessageConsumer-0",result="failure",} 0.0
spring_kafka_listener_seconds_max{exception="none",name="fgMessageConsumer-0",result="success",} 0.0
# HELP spring_kafka_listener_seconds Kafka Listener Timer
# TYPE spring_kafka_listener_seconds summary
spring_kafka_listener_seconds_count{exception="ListenerExecutionFailedException",name="fgMessageConsumer-0",result="failure",} 0.0
spring_kafka_listener_seconds_sum{exception="ListenerExecutionFailedException",name="fgMessageConsumer-0",result="failure",} 0.0
spring_kafka_listener_seconds_count{exception="none",name="fgMessageConsumer-0",result="success",} 9.0
spring_kafka_listener_seconds_sum{exception="none",name="fgMessageConsumer-0",result="success",} 16.017111464
# HELP jvm_memory_max_bytes The maximum amount of memory in bytes that can be used for memory management
# TYPE jvm_memory_max_bytes gauge
jvm_memory_max_bytes{area="heap",id="Tenured Gen",} 5.36870912E8
jvm_memory_max_bytes{area="nonheap",id="CodeHeap 'profiled nmethods'",} 1.22912768E8
jvm_memory_max_bytes{area="heap",id="Eden Space",} 2.14827008E8
jvm_memory_max_bytes{area="nonheap",id="Metaspace",} -1.0
jvm_memory_max_bytes{area="nonheap",id="CodeHeap 'non-nmethods'",} 5828608.0
jvm_memory_max_bytes{area="heap",id="Survivor Space",} 2.6804224E7
jvm_memory_max_bytes{area="nonheap",id="Compressed Class Space",} 1.073741824E9
jvm_memory_max_bytes{area="nonheap",id="CodeHeap 'non-profiled nmethods'",} 1.22916864E8
# HELP jvm_memory_used_bytes The amount of used memory
# TYPE jvm_memory_used_bytes gauge
jvm_memory_used_bytes{area="heap",id="Tenured Gen",} 8.6654784E7
jvm_memory_used_bytes{area="nonheap",id="CodeHeap 'profiled nmethods'",} 2.382144E7
jvm_memory_used_bytes{area="heap",id="Eden Space",} 7444976.0
jvm_memory_used_bytes{area="nonheap",id="Metaspace",} 9.7431448E7
jvm_memory_used_bytes{area="nonheap",id="CodeHeap 'non-nmethods'",} 1346432.0
jvm_memory_used_bytes{area="heap",id="Survivor Space",} 571600.0
jvm_memory_used_bytes{area="nonheap",id="Compressed Class Space",} 1.1687056E7
jvm_memory_used_bytes{area="nonheap",id="CodeHeap 'non-profiled nmethods'",} 1.1500544E7
# HELP jvm_classes_loaded_classes The number of classes that are currently loaded in the Java virtual machine
# TYPE jvm_classes_loaded_classes gauge
jvm_classes_loaded_classes 16917.0
# HELP tomcat_sessions_rejected_sessions_total  
# TYPE tomcat_sessions_rejected_sessions_total counter
tomcat_sessions_rejected_sessions_total 0.0
# HELP process_start_time_seconds Start time of the process since unix epoch.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.616689221264E9
# HELP jvm_threads_peak_threads The peak live thread count since the Java virtual machine started or peak was reset
# TYPE jvm_threads_peak_threads gauge
jvm_threads_peak_threads 37.0
# HELP jvm_threads_live_threads The current number of live threads including both daemon and non-daemon threads
# TYPE jvm_threads_live_threads gauge
jvm_threads_live_threads 36.0
# HELP system_load_average_1m The sum of the number of runnable entities queued to available processors and the number of runnable entities running on the available processors averaged over a period of time
# TYPE system_load_average_1m gauge
system_load_average_1m 0.0
# HELP jvm_threads_daemon_threads The current number of live daemon threads
# TYPE jvm_threads_daemon_threads gauge
jvm_threads_daemon_threads 30.0
# HELP tomcat_sessions_expired_sessions_total  
# TYPE tomcat_sessions_expired_sessions_total counter
tomcat_sessions_expired_sessions_total 0.0
# HELP jvm_threads_states_threads The current number of threads having NEW state
# TYPE jvm_threads_states_threads gauge
jvm_threads_states_threads{state="runnable",} 10.0
jvm_threads_states_threads{state="blocked",} 0.0
jvm_threads_states_threads{state="waiting",} 17.0
jvm_threads_states_threads{state="timed-waiting",} 9.0
jvm_threads_states_threads{state="new",} 0.0
jvm_threads_states_threads{state="terminated",} 0.0
# HELP process_uptime_seconds The uptime of the Java virtual machine
# TYPE process_uptime_seconds gauge
process_uptime_seconds 45380.981
# HELP http_server_requests_seconds  
# TYPE http_server_requests_seconds summary
http_server_requests_seconds_count{exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/health",} 6032.0
http_server_requests_seconds_sum{exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/health",} 5.492759869
# HELP http_server_requests_seconds_max  
# TYPE http_server_requests_seconds_max gauge
http_server_requests_seconds_max{exception="None",method="GET",outcome="SUCCESS",status="200",uri="/actuator/health",} 7.97605E-4
# HELP process_files_max_files The maximum file descriptor count
# TYPE process_files_max_files gauge
process_files_max_files 1048576.0

所以这一切都很好。

这是我的 ServiceMonitor：

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: internal-gateway-service-monitor
  labels:
    release: kube-prometheus-stack
spec:
  selector:
    matchLabels:
      app: internal-gateway
  endpoints:
  - port: http
    path: '/actuator/prometheus'
    interval: 10s
    honorLabels: true

这是我的服务：

apiVersion: v1
kind: Service
metadata:
  annotations:
    meta.helm.sh/release-name: perf4-backend
    meta.helm.sh/release-namespace: perf4
  creationTimestamp: "2021-03-23T13:00:47Z"
  labels:
    app.kubernetes.io/managed-by: Helm
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .: {}
          f:meta.helm.sh/release-name: {}
          f:meta.helm.sh/release-namespace: {}
        f:labels:
          .: {}
          f:app.kubernetes.io/managed-by: {}
      f:spec:
        f:externalTrafficPolicy: {}
        f:ports:
          .: {}
          k:{"port":80,"protocol":"TCP"}:
            .: {}
            f:name: {}
            f:port: {}
            f:protocol: {}
            f:targetPort: {}
        f:selector:
          .: {}
          f:app: {}
        f:sessionAffinity: {}
        f:type: {}
    manager: Go-http-client
    operation: Update
    time: "2021-03-23T13:00:47Z"
  name: internal-gateway
  namespace: perf4
  resourceVersion: "18659"
  selfLink: /api/v1/namespaces/perf4/services/internal-gateway
  uid: 75f89f23-d76e-4701-80f9-a029ce0f1153
spec:
  clusterIP: 172.20.105.66
  externalTrafficPolicy: Cluster
  ports:
  - name: http
    nodePort: 31500
    port: 80
    protocol: TCP
    targetPort: 8070
  selector:
    app: internal-gateway
  sessionAffinity: None
  type: NodePort
status:
  loadBalancer: {}

这是我的 pod yaml: （删除了不必要的字段）

apiVersion: v1
kind: Pod
metadata:
    cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
    kubernetes.io/psp: eks.privileged
  generateName: fg-internal-gateway-deployment-76cd98ccd8-
  labels:
    app: internal-gateway
    pod-template-hash: 76cd98ccd8
    version: "92095"
  
  name: fg-internal-gateway-deployment-76cd98ccd8-ksmgt
  namespace: perf4
  ownerReferences:
  - apiVersion: apps/v1
    blockOwnerDeletion: true
    controller: true
    kind: ReplicaSet
    name: fg-internal-gateway-deployment-76cd98ccd8
    uid: 69301225-d013-47e4-a126-b525f39ce608
  resourceVersion: "801092"
  selfLink: /api/v1/namespaces/perf4/pods/fg-internal-gateway-deployment-76cd98ccd8-ksmgt
  uid: 5fedee50-b572-4949-8055-9e58a7053b6a
    image: 
    imagePullPolicy: Always
    livenessProbe:
      failureThreshold: 3
      httpGet:
        path: /actuator/health
        port: 8070
        scheme: HTTP
      initialDelaySeconds: 140
      periodSeconds: 15
      successThreshold: 1
      timeoutSeconds: 1
    name: internal-gateway
    ports:
    - containerPort: 8070
      protocol: TCP
    readinessProbe:
      failureThreshold: 3
      httpGet:
        path: /actuator/health
        port: 8070
        scheme: HTTP
      initialDelaySeconds: 140
      periodSeconds: 15
      successThreshold: 1
      timeoutSeconds: 1
    resources:
      limits:
        cpu: "1"
        memory: 3Gi
      requests:
        cpu: "1"
        memory: 3Gi
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-vcnjm
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  nodeName: 
  nodeSelector:
    role: fgworkers
  priority: 0
  restartPolicy: Always
  schedulerName: default-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - key: gated
    operator: Equal
    value: "true"
  - key: preprod
    operator: Equal
    value: "true"
  - key: staging
    operator: Equal
    value: "true"
  - key: fgworkers
    operator: Equal
    value: "true"
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: default-token-vcnjm
    secret:
      defaultMode: 420
      secretName: default-token-vcnjm
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2021-03-25T14:42:35Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2021-03-25T14:45:14Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2021-03-25T14:45:14Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2021-03-25T14:42:35Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: 
    image: 
    imageID: 
    lastState: {}
    name: internal-gateway
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2021-03-25T14:42:41Z"
  hostIP: 
  phase: Running
  podIP:
  podIPs:
  - ip: 
  qosClass: Guaranteed
  startTime: "2021-03-25T14:42:35Z"

另外，我的 pod 规格中也带有相同的标签 app: internal-gateway。

这是我在 Prometheus 中得到的结果：

可能是什么问题？

Answer 1

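问题在于 ServiceMonitor 找不到您的 Service。

原因是 ServiceMonitor 定义中的 selector 匹配的是 Service 自身的标签（metadata.labels），而不是 Service 的 spec.selector 或 pod 的标签；您的 Service 只有 app.kubernetes.io/managed-by: Helm 这一个标签，没有 app: internal-gateway，因此无法被选中。

解决方法：为 Service 添加与 ServiceMonitor 的 matchLabels 相同的标签，如下所示：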

apiVersion: v1
kind: Service
metadata:
  annotations:
    meta.helm.sh/release-name: perf4-backend
    meta.helm.sh/release-namespace: perf4
  creationTimestamp: "2021-03-23T13:00:47Z"
  labels:
    app: internal-gateway

Answer 2

请务必检查 Service 和 ServiceMonitor 中定义的端口名称是否一致。我也遇到过同样的问题，把两者的端口名称改为相同之后，指标就开始带着正确的应用标签显示出来了。

无法在 Prometheus Operator 中获取 Spring Boot 应用程序的指标

Unable to get the metrics of a Spring Boot application in Prometheus Operator

monitoring

spring-boot

kubernetes

prometheus

prometheus-operator