Add Prometheus alerting functionality (#1424)
* Code adaptation k8s: service discovery and registration adaptation, configuration adaptation * Initial submission of the help charts script for openim API * change the help charts script * change the help charts script * change helm chart codes * change dockerfiles script * change chart script:add configmap mounts * change chart script:change repository * change chart script:msggateway add one service * change config.yaml * roll back some config values * change chart script:change Ingress rule with a rewrite annotation * add mysql charts scrible * change chart script:add mysql.config.yaml * add nfs provisioner charts * change chart script:add nfs.config.yaml * add ingress-nginx charts * change chart script:add ingress-nginx.config.yaml * add redis &mongodb charts * add kafka&minio charts * change chart script:change redis.values.yaml * change chart script:add redis.config.yaml * change chart script:change redis.config.yaml * change chart script:change mongodb.value.yaml * change chart script:change mongodb.value.yaml * change chart script:add mongodb.config.yaml * change chart script:change minio.values.yaml * change chart script:add minio.config.yaml * change chart script:change kafka.values.yaml * change chart script:add kafka.config.yaml * change chart script:change services.config.yaml * bug fix:Delete websocket's Port restrictions * bug fix:change port value * change chart script:Submit a stable version script * fix bug:Implement option interface * fix bug:change K8sDR.Register * change config.yaml * change chats script:minio service add ingress * change chats script:minio service add ingress * change chats script:kafka.replicaCount=3& change minio.api ingress * delete change chats script * change config.yaml * change openim.yaml * merge go.sum * Add monitoring function and struct for Prometheus on gin and GRPC * Add GRPC and gin server monitoring logic * Add GRPC and gin server monitoring logic2 * Add GRPC and gin server monitoring logic3 * Add GRPC and gin 
server monitoring logic4 * Add GRPC and gin server monitoring logic5 * Add GRPC and gin server monitoring logic6 * Add GRPC and gin server monitoring logic7 * delete:old monitoring code * add for test * fix bug:change packname * fix bug:delete getPromPort funciton * fix bug:delete getPromPort funciton * fix bug:change logs * fix bug:change registerName logic in GetGrpcCusMetrics function * add getPrometheus url api * fix:config path logic * fix:prometheus enable function * fix:prometheus enable function * fix:transfer Multi process monitoring logic * del:del not using manifest * fix:openim-msgtransfer.sh * fix:openim-msgtransfer.sh * cicd: robot automated Change * delete not using files * add prometheus docker-compose for monitor * fix prometheus.yaml * fix environment.sh * fix init-config.sh * fix init-config.sh * fix env_template.yaml * fix docker-compose.yml * fix docker-compose.yml * add openim_admin_front service * change openim-admin-front * del not using files * add node-exporter-dashaboard.yaml * cicd: robot automated Change * cicd: robot automated Change * feature: add alertmanager function * feature: add alertmanager function * feature: add alertmanager function * feature: add alertmanager function * feature: add alertmanager function * del:delete not using files * del:delete not using files * change:change to personal email info * feat: deployment and design of management backend and monitoring Signed-off-by: Xinwei Xiong(cubxxw) <3293172751nss@gmail.com> * feat: deployment and design of management backend and monitoring Signed-off-by: Xinwei Xiong(cubxxw) <3293172751nss@gmail.com> * feat: deployment and design of management backend and monitoring Signed-off-by: Xinwei Xiong(cubxxw) <3293172751nss@gmail.com> --------- Signed-off-by: Xinwei Xiong(cubxxw) <3293172751nss@gmail.com> Co-authored-by: lin.huang <lin.huang@apulis.com> Co-authored-by: xuexihuang <1339326187@qq.com> Co-authored-by: xuexihuang <xuexihuang@users.noreply.github.com> Co-authored-by: 
cubxxw <cubxxw@users.noreply.github.com>pull/1425/head
parent
82d238afbe
commit
fb74453c18
@ -0,0 +1,32 @@
|
|||||||
|
###################### AlertManager Configuration ######################
|
||||||
|
# AlertManager configuration using environment variables
|
||||||
|
#
|
||||||
|
# Resolve timeout
|
||||||
|
# SMTP configuration for sending alerts
|
||||||
|
# Templates for email notifications
|
||||||
|
# Routing configurations for alerts
|
||||||
|
# Receiver configurations
|
||||||
|
global:
|
||||||
|
resolve_timeout: 5m
|
||||||
|
smtp_from: alert@openim.io
|
||||||
|
smtp_smarthost: smtp.163.com:465
|
||||||
|
smtp_auth_username: alert@openim.io
|
||||||
|
smtp_auth_password: YOURAUTHPASSWORD
|
||||||
|
smtp_require_tls: false
|
||||||
|
# Hostname announced to the SMTP server in HELO/EHLO. It must be a host name,
# not a display label; "localhost" is Alertmanager's documented default.
smtp_hello: localhost
|
||||||
|
|
||||||
|
templates:
|
||||||
|
- /etc/alertmanager/email.tmpl
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_wait: 5s
|
||||||
|
group_interval: 5s
|
||||||
|
repeat_interval: 5m
|
||||||
|
receiver: email
|
||||||
|
receivers:
|
||||||
|
- name: email
|
||||||
|
email_configs:
|
||||||
|
# Recipient of alert emails. This must be a literal address: a leading "{" is
# parsed by YAML as a flow mapping, and Alertmanager does not expand
# shell-style "${VAR:-default}" expressions in its configuration file.
- to: 'alert@example.com'
|
||||||
|
html: '{{ template "email.to.html" . }}'
|
||||||
|
headers: { Subject: "[OPENIM-SERVER]Alarm" }
|
||||||
|
send_resolved: true
|
@ -0,0 +1,16 @@
|
|||||||
|
{{ define "email.to.html" }}
|
||||||
|
{{ range .Alerts }}
|
||||||
|
<!-- Begin of OpenIM Alert -->
|
||||||
|
<div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
|
||||||
|
<h3>OpenIM Alert</h3>
|
||||||
|
<p><strong>Alert Program:</strong> Prometheus Alert</p>
|
||||||
|
<p><strong>Severity Level:</strong> {{ .Labels.severity }}</p>
|
||||||
|
<p><strong>Alert Type:</strong> {{ .Labels.alertname }}</p>
|
||||||
|
<p><strong>Affected Host:</strong> {{ .Labels.instance }}</p>
|
||||||
|
<p><strong>Affected Service:</strong> {{ .Labels.job }}</p>
|
||||||
|
<p><strong>Alert Subject:</strong> {{ .Annotations.summary }}</p>
|
||||||
|
<p><strong>Trigger Time:</strong> {{ .StartsAt.Format "2006-01-02 15:04:05" }}</p>
|
||||||
|
</div>
|
||||||
|
<!-- End of OpenIM Alert -->
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}
|
@ -0,0 +1,11 @@
|
|||||||
|
groups:
|
||||||
|
- name: instance_down
|
||||||
|
rules:
|
||||||
|
- alert: InstanceDown
|
||||||
|
expr: up == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Instance {{ $labels.instance }} down"
|
||||||
|
# Human-readable alert text; keep the stated duration in sync with the rule's "for: 1m".
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
|
@ -0,0 +1,32 @@
|
|||||||
|
###################### AlertManager Configuration ######################
|
||||||
|
# AlertManager configuration using environment variables
|
||||||
|
#
|
||||||
|
# Resolve timeout
|
||||||
|
# SMTP configuration for sending alerts
|
||||||
|
# Templates for email notifications
|
||||||
|
# Routing configurations for alerts
|
||||||
|
# Receiver configurations
|
||||||
|
global:
|
||||||
|
resolve_timeout: ${ALERTMANAGER_RESOLVE_TIMEOUT}
|
||||||
|
smtp_from: ${ALERTMANAGER_SMTP_FROM}
|
||||||
|
smtp_smarthost: ${ALERTMANAGER_SMTP_SMARTHOST}
|
||||||
|
smtp_auth_username: ${ALERTMANAGER_SMTP_AUTH_USERNAME}
|
||||||
|
smtp_auth_password: ${ALERTMANAGER_SMTP_AUTH_PASSWORD}
|
||||||
|
smtp_require_tls: ${ALERTMANAGER_SMTP_REQUIRE_TLS}
|
||||||
|
smtp_hello: ${ALERTMANAGER_SMTP_HELLO}
|
||||||
|
|
||||||
|
templates:
|
||||||
|
- /etc/alertmanager/email.tmpl
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_wait: 5s
|
||||||
|
group_interval: 5s
|
||||||
|
repeat_interval: 5m
|
||||||
|
receiver: email
|
||||||
|
receivers:
|
||||||
|
- name: email
|
||||||
|
email_configs:
|
||||||
|
- to: ${ALERTMANAGER_EMAIL_TO}
|
||||||
|
html: '{{ template "email.to.html" . }}'
|
||||||
|
headers: { Subject: "[OPENIM-SERVER]Alarm" }
|
||||||
|
send_resolved: true
|
Loading…
Reference in new issue