Commit ad17036d by Frank Mai Committed by Alena Prokharchyk

Upgrade to 0.0.3 version

- Embed the operator as a sub-chart
  + Support configuring the operator like the other charts
  + Adjust the operator's default limit
- Add permission to the kube-state exporter
- Replace localhost with 127.0.0.1 in prometheus-auth
- Increase the Nginx proxy buffers
- Configure the PVC name of Prometheus or Alertmanager
  + Allow configuring the PVC name of Prometheus or Alertmanager via `prometheus.persistence.name` or `alertmanager.persistence.name`
- Adjust the Cluster Monitoring scrape logic
  + Don't scrape the Monitoring namespace in the `prometheus-io-scrape` job
  + Scrape at the global interval, which defaults to 60s
  + Remove useless Prometheus recording rules

**Issue:**

- https://github.com/rancher/rancher/issues/19693
- https://github.com/rancher/rancher/issues/18830
- https://github.com/rancher/rancher/issues/19243
- https://github.com/rancher/rancher/issues/19689
- https://github.com/rancher/rancher/issues/19410
- https://github.com/rancher/rancher/issues/19248
parent 89fb3032
......@@ -7,8 +7,8 @@ maintainers:
name: rancher-monitoring
sources:
- https://github.com/coreos/prometheus-operator
version: 0.0.2
appVersion: 0.0.2
version: 0.0.3
appVersion: 0.0.3
home: https://github.com/coreos/prometheus-operator
keywords:
- operator
......
......@@ -46,6 +46,10 @@ spec:
{{- if or .Values.storageSpec .Values.persistence.enabled }}
storage:
volumeClaimTemplate:
{{- if .Values.persistence.name }}
metadata:
name: {{ .Values.persistence.name }}
{{- end }}
spec:
{{- if .Values.storageSpec }}
{{ toYaml .Values.storageSpec | indent 8 }}
......
......@@ -20,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: http
interval: 30s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......
......@@ -28,7 +28,6 @@ spec:
{{- end }}
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
{{- if eq .Values.ports.metrics.scheme "https" }}
......
......@@ -24,7 +24,6 @@ spec:
- cattle-logging
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
honorLabels: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
......
......@@ -28,7 +28,6 @@ spec:
{{- end }}
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
{{- if eq .Values.ports.metrics.scheme "https" }}
......
......@@ -28,7 +28,6 @@ spec:
{{- end }}
endpoints:
- port: {{ .Values.ports.metrics.dnsmasq.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.dnsmasq.scheme }}
{{- if eq .Values.ports.metrics.dnsmasq.scheme "https" }}
......@@ -52,7 +51,6 @@ spec:
regex: (.+)
replacement: $1
- port: {{ .Values.ports.metrics.skydns.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.skydns.scheme }}
{{- if eq .Values.ports.metrics.skydns.scheme "https" }}
......
......@@ -28,7 +28,6 @@ spec:
{{- end }}
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
{{- if eq .Values.ports.metrics.scheme "https" }}
......
# PrometheusRule with recording rules for kube-scheduler latency.
#
# Each rule computes one quantile (0.99, 0.9 or 0.5) over a
# *_microseconds_bucket histogram summed by (le, cluster), divides the result
# by 1e+06, and records it under a *_seconds name with the quantile attached
# as a label.
apiVersion: {{ template "operator_api_version" . }}
kind: PrometheusRule
metadata:
  name: {{ template "app.fullname" . }}
  labels:
    app: {{ template "app.name" . }}
    chart: {{ template "app.version" . }}
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
    source: rancher-monitoring
spec:
  groups:
    - name: kube-scheduler.rules
      rules:
        # End-to-end scheduling latency.
        - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
          labels:
            quantile: "0.99"
          expr: >-
            histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
          labels:
            quantile: "0.9"
          expr: >-
            histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
          labels:
            quantile: "0.5"
          expr: >-
            histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06

        # Scheduling-algorithm latency.
        - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
          labels:
            quantile: "0.99"
          expr: >-
            histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
          labels:
            quantile: "0.9"
          expr: >-
            histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
          labels:
            quantile: "0.5"
          expr: >-
            histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06

        # Binding latency.
        - record: cluster:scheduler_binding_latency_seconds:quantile
          labels:
            quantile: "0.99"
          expr: >-
            histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_binding_latency_seconds:quantile
          labels:
            quantile: "0.9"
          expr: >-
            histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
        - record: cluster:scheduler_binding_latency_seconds:quantile
          labels:
            quantile: "0.5"
          expr: >-
            histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
            BY (le, cluster)) / 1e+06
\ No newline at end of file
......@@ -28,7 +28,6 @@ spec:
{{- end }}
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
{{- if eq .Values.ports.metrics.scheme "https" }}
......
......@@ -59,6 +59,13 @@ rules:
verbs:
- "list"
- "watch"
- apiGroups:
- "policy"
resources:
- "poddisruptionbudgets"
verbs:
- "list"
- "watch"
---
apiVersion: v1
......
......@@ -20,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: http
interval: 15s
honorLabels: true
relabelings:
- sourceLabels:
......@@ -36,7 +35,6 @@ spec:
regex: (.+)
replacement: $1
- port: http-metrics
interval: 30s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......
......@@ -26,7 +26,6 @@ spec:
{{- if .Values.https }}
- port: https-metrics
scheme: https
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
......@@ -49,7 +48,6 @@ spec:
- port: https-metrics
scheme: https
path: /metrics/cadvisor
interval: 30s
honorLabels: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
tlsConfig:
......@@ -72,7 +70,6 @@ spec:
replacement: $1
{{- else }}
- port: http-metrics
interval: 15s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......@@ -88,7 +85,6 @@ spec:
replacement: $1
- port: http-metrics
path: /metrics/cadvisor
interval: 30s
honorLabels: true
relabelings:
- sourceLabels:
......
# PrometheusRule with recording rules that pre-aggregate container metrics
# per pod, per namespace and per cluster, plus API-server request-latency
# quantiles.
apiVersion: {{ template "operator_api_version" . }}
kind: PrometheusRule
metadata:
  labels:
    app: {{ template "app.name" . }}
    chart: {{ template "app.version" . }}
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
    source: rancher-monitoring
  name: {{ template "app.fullname" . }}
spec:
  groups:
  - name: kubernetes.rules
    rules:
    # --- Per-pod aggregates (series with container_name="POD" or an empty
    # pod_name are excluded by the selectors below) ---
    - record: pod_name:container_memory_usage_bytes:sum
      expr: >-
        sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
        (pod_name)
    - record: pod_name:container_spec_cpu_shares:sum
      expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
    - record: pod_name:container_cpu_usage:sum
      expr: >-
        sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
        BY (pod_name)
    - record: pod_name:container_fs_usage_bytes:sum
      expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)

    # --- Per-namespace aggregates ---
    - record: namespace:container_memory_usage_bytes:sum
      expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
    - record: namespace:container_spec_cpu_shares:sum
      expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
    - record: namespace:container_cpu_usage:sum
      expr: >-
        sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
        BY (namespace)

    # --- Cluster-wide ratios against machine capacity ---
    - record: cluster:memory_usage:ratio
      expr: >-
        sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
        (cluster) / sum(machine_memory_bytes) BY (cluster)
    - record: cluster:container_spec_cpu_shares:ratio
      expr: >-
        sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
        / sum(machine_cpu_cores)
    - record: cluster:container_cpu_usage:ratio
      expr: >-
        sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
        / sum(machine_cpu_cores)

    # --- API-server request latency quantiles (histogram value divided by
    # 1e+06). All three rules record under the same name and differ only in
    # the quantile label, so they form one series family. ---
    - record: apiserver_latency_seconds:quantile
      expr: >-
        histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
        1e+06
      labels:
        quantile: "0.99"
    # Previously recorded as apiserver_latency:quantile_seconds, which put the
    # 0.9 quantile under a different metric name than its 0.99/0.5 siblings.
    - record: apiserver_latency_seconds:quantile
      expr: >-
        histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
        1e+06
      labels:
        quantile: "0.9"
    - record: apiserver_latency_seconds:quantile
      expr: >-
        histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
        1e+06
      labels:
        quantile: "0.5"
\ No newline at end of file
......@@ -24,7 +24,6 @@ spec:
- "default"
endpoints:
- port: {{ .Values.ports.metrics.name }}
interval: 15s
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
scheme: {{ .Values.ports.metrics.scheme }}
{{- if eq .Values.ports.metrics.scheme "https" }}
......
# PrometheusRule with recording rules that aggregate node-exporter CPU,
# root-filesystem and network metrics per instance and for the whole cluster.
apiVersion: {{ template "operator_api_version" . }}
kind: PrometheusRule
metadata:
  labels:
    app: {{ template "app.name" . }}
    chart: {{ template "app.version" . }}
    heritage: {{ .Release.Service }}
    release: {{ .Release.Name }}
    source: rancher-monitoring
  name: {{ template "app.fullname" . }}
spec:
  groups:
  - name: node.rules
    rules:
    # Non-idle, non-iowait CPU seconds per second, per instance (3m window).
    - record: instance:node_cpu:rate:sum
      expr: >-
        sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m]))
        BY (instance)
    # Used bytes on the root filesystem (size minus free), per instance.
    # Uses node_filesystem_free_bytes so the metric name matches the *_bytes /
    # *_seconds_total naming used by every other node metric in this file;
    # the previous un-suffixed node_filesystem_free did not.
    - record: instance:node_filesystem_usage:sum
      expr: >-
        sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
        BY (instance)
    # Network throughput per instance (3m window).
    - record: instance:node_network_receive_bytes:rate:sum
      expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
    - record: instance:node_network_transmit_bytes:rate:sum
      expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
    # Busy CPU rate per instance divided by that instance's number of CPUs.
    - record: instance:node_cpu:ratio
      expr: >-
        sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance)
        GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
    # Busy CPU rate summed over the whole cluster (5m window).
    - record: cluster:node_cpu:sum_rate5m
      expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
    # Cluster busy CPU rate divided by the total number of CPUs. The numerator
    # is the series recorded by the rule directly above; the name referenced
    # before (cluster:node_cpu_seconds_total:rate5m) is not recorded by any
    # rule in this group.
    - record: cluster:node_cpu:ratio
      expr: >-
        cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
\ No newline at end of file
......@@ -20,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: metrics
interval: 15s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......
......@@ -60,7 +60,7 @@ data:
proxy_read_timeout 180;
proxy_send_timeout 5;
proxy_buffer_size 16k;
proxy_buffers 4 32k;
proxy_buffers 16 16k;
proxy_busy_buffers_size 64k;
proxy_temp_file_write_size 64k;
proxy_temp_path /tmp/temp_dir;
......
......@@ -20,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: metrics
interval: 30s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......
# Helm chart metadata for the "operator" chart.
name: operator
version: 0.0.1
apiVersion: v1
engine: gotpl
description: Creates Prometheus Operator for Kubernetes.
maintainers:
  - name: thxCode
    email: frank@rancher.com
\ No newline at end of file
{{- if .Values.enabled }}
{{- $kubeletSvcName := "expose-kubelets-metrics" -}}
apiVersion: v1
kind: Service
......@@ -74,4 +73,3 @@ spec:
tolerations:
{{ toYaml .Values.tolerations | indent 8 }}
{{- end }}
{{- end }}
{{- if .Values.enabled }}
apiVersion: v1
kind: Service
metadata:
......@@ -20,4 +19,3 @@ spec:
- name: metrics
port: 8080
targetPort: http
{{- end }}
\ No newline at end of file
{{- if .Values.enabled }}
{{- if and .Values.enabledRBAC (not .Values.serviceAccountName) }}
apiVersion: {{ template "rbac_api_version" . }}
kind: ClusterRole
......@@ -90,4 +89,3 @@ subjects:
name: {{ template "app.fullname" . }}
namespace: {{ .Release.Namespace }}
{{- end }}
{{- end }}
{{- if .Values.enabled }}
apiVersion: {{ template "operator_api_version" . }}
kind: ServiceMonitor
metadata:
......@@ -21,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: metrics
interval: 30s
honorLabels: true
relabelings:
- sourceLabels:
......@@ -36,4 +34,3 @@ spec:
targetLabel: node
regex: (.+)
replacement: $1
{{- end }}
......@@ -3,7 +3,6 @@
- role: pod
namespaces:
names:
- {{ .Release.Namespace }}
- ingress-nginx
- ingress-controller
- kube-system
......
{{- if and (eq .Values.level "project") (eq .Values.sync.mode "federate") }}
- job_name: 'federate'
scrape_interval: 15s
honor_labels: true
metrics_path: {{ .Values.sync.path }}
params:
......
......@@ -42,7 +42,7 @@ data:
proxy_read_timeout 180;
proxy_send_timeout 5;
proxy_buffer_size 16k;
proxy_buffers 4 32k;
proxy_buffers 16 16k;
proxy_busy_buffers_size 64k;
proxy_temp_file_write_size 64k;
proxy_temp_path /tmp/temp_dir;
......
......@@ -113,6 +113,9 @@ spec:
paused: {{ .Values.paused }}
replicas: {{ .Values.replicaCount }}
logLevel: {{ .Values.logLevel }}
logFormat: {{ .Values.logFormat }}
scrapeInterval: {{ .Values.scrapeInterval }}
evaluationInterval: {{ .Values.evaluationInterval }}
{{- if and .Values.resources .Values.resources.core }}
resources:
{{ toYaml .Values.resources.core | indent 4 }}
......@@ -151,6 +154,10 @@ spec:
{{- if or .Values.storageSpec .Values.persistence.enabled }}
storage:
volumeClaimTemplate:
{{- if .Values.persistence.name }}
metadata:
name: {{ .Values.persistence.name }}
{{- end }}
spec:
{{- if .Values.storageSpec }}
{{ toYaml .Values.storageSpec | indent 8 }}
......
......@@ -20,7 +20,6 @@ spec:
- {{ .Release.Namespace | quote }}
endpoints:
- port: metrics
interval: 30s
relabelings:
- sourceLabels:
- __meta_kubernetes_pod_host_ip
......@@ -36,7 +35,6 @@ spec:
replacement: $1
{{- if eq .Values.level "cluster" }}
- port: metrics
interval: 15s
path: /_/metrics
metricRelabelings:
- sourceLabels:
......
......@@ -40,6 +40,9 @@ remoteWrite: []
retention: 24h
logLevel: "info"
logFormat: ""
scrapeInterval: "60s"
evaluationInterval: "60s"
## Prometheus StorageSpec for persistent data
## Ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
......
......@@ -62,4 +62,9 @@ dependencies:
- name: exporter-fluentd
version: 0.0.1
condition: exporter-fluentd.enabled
repository: "file://./charts/exporter-fluentd/"
\ No newline at end of file
repository: "file://./charts/exporter-fluentd/"
- name: operator
version: 0.0.1
condition: operator.enabled
repository: "file://./charts/operator/"
enabled: false
nameOverride: "prometheus-operator"
enabledRBAC: true
apiGroup: "monitoring.coreos.com"
image:
repository: rancher/coreos-prometheus-operator
tag: v0.29.0
prometheusConfigReloader:
repository: rancher/coreos-prometheus-config-reloader
operator:
nameOverride: "prometheus-operator"
enabledRBAC: true
apiGroup: "monitoring.coreos.com"
image:
repository: rancher/coreos-prometheus-operator
tag: v0.29.0
configmapReload:
repository: rancher/coreos-configmap-reload
tag: v0.0.1
resources:
limits:
cpu: 200m
memory: 100Mi
requests:
cpu: 100m
memory: 50Mi
nodeSelectors: []
tolerations: []
logFormat: "logfmt"
logLevel: "info"
manageCRDs: false
withValidation: true
## Already exist ServiceAccount
##
serviceAccountName: ""
prometheusConfigReloader:
repository: rancher/coreos-prometheus-config-reloader
tag: v0.29.0
configmapReload:
repository: rancher/coreos-configmap-reload
tag: v0.0.1
resources:
limits:
cpu: 200m
memory: 500Mi
requests:
cpu: 100m
memory: 50Mi
nodeSelectors: []
tolerations: []
logFormat: "logfmt"
logLevel: "info"
manageCRDs: false
withValidation: true
## Already exist ServiceAccount
##
serviceAccountName: ""
exporter-coredns:
enabled: false
......@@ -263,6 +263,7 @@ grafana:
memory: 50Mi
cpu: 50m
persistence:
name: ""
enabled: false
storageClass: ""
accessMode: "ReadWriteOnce"
......@@ -277,7 +278,7 @@ prometheus:
level: cluster
auth:
args:
- --proxy-url=http://localhost:9090
- --proxy-url=http://127.0.0.1:9090
- --listen-address=$(POD_IP):9090
- --filter-reader-labels=prometheus
- --filter-reader-labels=prometheus_replica
......@@ -320,6 +321,7 @@ prometheus:
memory: 100Mi
cpu: 100m
persistence:
name: ""
enabled: false
storageClass: ""
accessMode: "ReadWriteOnce"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment