parent
2052e1c7d5
commit
7f4d6def53
@ -0,0 +1,25 @@
|
||||
# Alertmanager configuration: routes every alert to a single e-mail receiver.

global:
  # How long Alertmanager waits before treating an un-refreshed alert as resolved.
  resolve_timeout: 5m

  # SMTP defaults used by the e-mail receiver defined below.
  smtp_from: alert@openim.io
  # NOTE(review): port 465 is conventionally implicit-TLS SMTP; confirm that
  # smtp_require_tls: false is intended for this provider.
  smtp_smarthost: 'smtp.163.com:465'
  smtp_auth_username: alert@openim.io
  # Placeholder: supply the real credential at deploy time; never commit it.
  smtp_auth_password: YOURAUTHPASSWORD
  smtp_require_tls: false
  # NOTE(review): smtp_hello is the hostname announced to the SMTP server
  # (HELO/EHLO). This value is a placeholder, not a hostname; replace it
  # before deploying.
  smtp_hello: 'xxx监控告警'

# Notification templates; must define "email.to.html" (used by the receiver).
templates:
  - /etc/alertmanager/email.tmpl

route:
  # Alerts sharing the same alertname are batched into one notification.
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: email

receivers:
  - name: email
    email_configs:
      - to: 'alert@example.com'
        # Body is rendered by the "email.to.html" template.
        html: '{{ template "email.to.html" . }}'
        headers:
          Subject: "[OPENIM-SERVER]Alarm"
        # Also notify when an alert clears.
        send_resolved: true
|
@ -0,0 +1,16 @@
|
||||
{{/*
  email.to.html: HTML body for Alertmanager e-mail notifications.
  Renders one bordered card per alert in .Alerts, showing the alert's status,
  selected labels (severity, alertname, instance, job), the summary annotation
  and the start time.
*/}}
{{ define "email.to.html" }}
{{ range .Alerts }}
<!-- Begin of OpenIM Alert -->
<div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
  <h3>OpenIM Alert</h3>
  <p><strong>Alert Program:</strong> Prometheus Alert</p>
  {{/* Status distinguishes firing from resolved mails (send_resolved is on). */}}
  <p><strong>Status:</strong> {{ .Status }}</p>
  <p><strong>Severity Level:</strong> {{ .Labels.severity }}</p>
  <p><strong>Alert Type:</strong> {{ .Labels.alertname }}</p>
  <p><strong>Affected Host:</strong> {{ .Labels.instance }}</p>
  <p><strong>Affected Service:</strong> {{ .Labels.job }}</p>
  <p><strong>Alert Subject:</strong> {{ .Annotations.summary }}</p>
  <p><strong>Trigger Time:</strong> {{ .StartsAt.Format "2006-01-02 15:04:05" }}</p>
</div>
<!-- End of OpenIM Alert -->
{{ end }}
{{ end }}
|
@ -0,0 +1,22 @@
|
||||
# Prometheus alerting rules for OpenIM.
groups:
  # Fires when any scrape target has been unreachable for one minute.
  - name: instance_down
    rules:
      - alert: InstanceDown
        # "up" is 0 for a target whose last scrape failed.
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."

  # Fires when either message-insert failure counter grew in the last 5 minutes
  # and the condition has held for one minute.
  - name: database_insert_failure_alerts
    rules:
      - alert: DatabaseInsertFailed
        expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
          description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
|
@ -0,0 +1,83 @@
|
||||
# Prometheus server configuration for an OpenIM deployment.
# NOTE(review): every target below is hard-coded to 192.168.2.22; adjust the
# addresses to match the host(s) actually running each component.

# Global defaults
global:
  scrape_interval: 15s # Scrape targets every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. Default is every 1 minute.
  # scrape_timeout is left at the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.2.22:19093']

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "instance-down-rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: one job for the node exporter and one per OpenIM
# service endpoint.
scrape_configs:
  # The job name is added as a label "job='job_name'" to any timeseries scraped from this config.

  # Host-level metrics
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['192.168.2.22:20114']

  # OpenIM application services
  - job_name: 'openimserver-openim-api'
    static_configs:
      - targets: ['192.168.2.22:20113']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-msggateway'
    static_configs:
      - targets: ['192.168.2.22:20112']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-msgtransfer'
    static_configs:
      # Four endpoints are scraped under this one job.
      - targets:
          - '192.168.2.22:20111'
          - '192.168.2.22:20110'
          - '192.168.2.22:20109'
          - '192.168.2.22:20108'
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-push'
    static_configs:
      - targets: ['192.168.2.22:20107']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-auth'
    static_configs:
      - targets: ['192.168.2.22:20106']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-conversation'
    static_configs:
      - targets: ['192.168.2.22:20105']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-friend'
    static_configs:
      - targets: ['192.168.2.22:20104']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-group'
    static_configs:
      - targets: ['192.168.2.22:20103']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-msg'
    static_configs:
      - targets: ['192.168.2.22:20102']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-third'
    static_configs:
      - targets: ['192.168.2.22:20101']
        labels:
          namespace: 'default'
  - job_name: 'openimserver-openim-rpc-user'
    static_configs:
      - targets: ['192.168.2.22:20100']
        labels:
          namespace: 'default'
|
Loading…
Reference in new issue