# Default Prometheus alert groups for OpenIM.
---
groups:
  # Fires when a monitored target remains unreachable.
  - name: instance_down
    rules:
      - alert: InstanceDown
        # The built-in "up" metric is 0 when the latest scrape of a target fails.
        expr: up == 0
        # Fire only once the condition has held continuously for 1 minute.
        for: 1m
        labels:
          # Used by Alertmanager for routing and notification priority.
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

  # Detects failures when persisting messages to Redis or MongoDB.
  - name: database_insert_failure_alerts
    rules:
      - alert: DatabaseInsertFailed
        # Any positive increase in either failure counter means write failures
        # occurred within the last 5 minutes. The folded scalar below is joined
        # with single spaces into one PromQL expression.
        expr: >-
          (increase(msg_insert_redis_failed_total[5m]) > 0)
          or
          (increase(msg_insert_mongo_failed_total[5m]) > 0)
        # Avoid firing on very short spikes.
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
          description: >-
            Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter
            increased in the last 5 minutes, indicating message insert failures
            to Redis or MongoDB and a possible backend outage.

  # Operational early-warning rule for unusually low login/registration activity.
  - name: registrations_few
    rules:
      - alert: RegistrationsFew
        # True when user_login_total has not increased over the last hour.
        # NOTE(review): the expression watches user_login_total, while the alert
        # name and annotations speak of registrations - confirm the intended
        # metric.
        expr: increase(user_login_total[1h]) == 0
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Too few registrations within the time frame"
          description: "The number of registrations in the last hour is 0. There might be some issues."

  # Operational early-warning rule for unusually low messaging activity.
  - name: messages_few
    rules:
      - alert: MessagesFew
        # True when the combined one-hour increase of the single-chat and
        # group-chat success counters is zero. The folded scalar below is joined
        # with a single space into one PromQL expression.
        expr: >-
          (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h]))
          == 0
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Too few messages within the time frame"
          description: "The number of messages sent in the last hour is 0. There might be some issues."