Merge pull request #3707 from icey-yu/feat-comment-383

feat: enhance configuration files with detailed comments for clarity
3.8.3-patch
icey-yu 5 days ago committed by GitHub
commit 5211d43d9d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -1,34 +1,36 @@
# Global Alertmanager runtime and SMTP settings.
global:
resolve_timeout: 5m
smtp_from: alert@openim.io
smtp_smarthost: smtp.163.com:465
smtp_auth_username: alert@openim.io
smtp_auth_password: YOURAUTHPASSWORD
smtp_require_tls: false
smtp_hello: xxx
resolve_timeout: 5m # Wait time before an alert is considered resolved when no further updates are received.
smtp_from: alert@openim.io # Sender address displayed in alert emails.
smtp_smarthost: smtp.163.com:465 # SMTP relay endpoint in host:port format.
smtp_auth_username: alert@openim.io # SMTP authentication username (commonly the same as smtp_from).
smtp_auth_password: YOURAUTHPASSWORD # SMTP authorization token or app password.
smtp_require_tls: false # Require STARTTLS before sending (Alertmanager defaults to true); set false here because port-465 smarthosts typically use implicit TLS instead.
smtp_hello: xxx # HELO/EHLO identity presented to the SMTP server.
templates:
- /etc/alertmanager/email.tmpl
- /etc/alertmanager/email.tmpl # Go template file used to render HTML email content.
# Root routing tree for all incoming alerts.
route:
group_by: [ 'alertname' ]
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: email
group_by: [ 'alertname' ] # Alerts sharing this label value are batched into one notification.
group_wait: 5s # Initial delay before sending the first notification for a new alert group.
group_interval: 5s # Wait before sending a notification about new alerts added to a group that has already been notified.
repeat_interval: 5m # Reminder interval while an alert group remains firing.
receiver: email # Default receiver when no child route matches.
routes:
- matchers:
- alertname = "XXX"
group_by: [ 'instance' ]
- alertname = "XXX" # Example matcher; replace with a real alert name or remove this route.
group_by: [ 'instance' ] # Override grouping for this specific route.
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: email
receivers:
- name: email
- name: email # Receiver name referenced by route.receiver.
email_configs:
- to: 'alert@example.com'
html: '{{ template "email.to.html" . }}'
headers: { Subject: "[OPENIM-SERVER]Alarm" }
send_resolved: true
- to: 'alert@example.com' # Recipient mailbox for alert notifications.
html: '{{ template "email.to.html" . }}' # Rendered with the template declared in email.tmpl.
headers: { Subject: "[OPENIM-SERVER]Alarm" } # Custom email subject line.
send_resolved: true # Also send a notification when the alert recovers.

@ -1,3 +1,6 @@
{{/* OpenIM Alertmanager email template.
This template renders both firing and resolved alerts.
Each alert entry reads labels and annotations from Prometheus rule definitions. */}}
{{ define "email.to.html" }}
{{ if eq .Status "firing" }}
{{ range .Alerts }}

@ -1,30 +1,31 @@
# Default Prometheus alert groups for OpenIM.
groups:
- name: instance_down
- name: instance_down # Fires when a monitored target remains unreachable.
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails.
for: 1m # Trigger only if the condition remains true for more than 1 minute.
labels:
severity: critical
severity: critical # Used by Alertmanager for routing and notification priority.
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- name: database_insert_failure_alerts
- name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB.
rules:
- alert: DatabaseInsertFailed
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
for: 1m
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes.
for: 1m # Avoid firing on very short spikes.
labels:
severity: critical
annotations:
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage."
- name: registrations_few
- name: registrations_few # Operational early-warning rule for unusually low login/registration activity.
rules:
- alert: RegistrationsFew
expr: increase(user_login_total[1h]) == 0
expr: increase(user_login_total[1h]) == 0 # No successful login/registration events observed in 1 hour.
for: 1m
labels:
severity: info
@ -32,10 +33,10 @@ groups:
summary: "Too few registrations within the time frame"
description: "The number of registrations in the last hour is 0; there may be an issue with the login/registration flow."
- name: messages_few
- name: messages_few # Operational early-warning rule for unusually low messaging activity.
rules:
- alert: MessagesFew
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour.
for: 1m
labels:
severity: info

@ -8,7 +8,7 @@ api:
prometheus:
# Whether to enable prometheus
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# autoSetPorts indicates whether to automatically set the ports
autoSetPorts: true

@ -8,7 +8,7 @@ rpc:
ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -1,5 +1,5 @@
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# autoSetPorts indicates whether to automatically set the ports
autoSetPorts: true

@ -10,7 +10,7 @@ rpc:
ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10200 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10220 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10240 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10260 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10280 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10300 ]
prometheus:
# Enable or disable Prometheus monitoring
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
# It will only take effect when autoSetPorts is set to false.

@ -10,7 +10,7 @@ rpc:
ports: [ 10320 ]
prometheus:
# Whether to enable prometheus
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
enable: true
# Prometheus listening ports, must be consistent with the number of rpc.ports
# It will only take effect when autoSetPorts is set to false.

@ -1,35 +1,34 @@
# my global config
# Global Prometheus runtime settings.
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# scrape_timeout defaults to 10s unless overridden in a specific scrape job.
# Alertmanager configuration
# Alertmanager endpoints that receive alert events from Prometheus.
alerting:
alertmanagers:
- static_configs:
- targets: [127.0.0.1:19093]
- targets: [127.0.0.1:19093] # Alertmanager address in host:port format.
# Load rules once and periodically evaluate them according to the global evaluation_interval.
# Rule files loaded by Prometheus.
rule_files:
- instance-down-rules.yml
- instance-down-rules.yml # Default OpenIM alert rules; add more files here if needed.
# - first_rules.yml
# - second_rules.yml
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
# Scrape jobs used to collect infrastructure and OpenIM service metrics.
scrape_configs:
# The job name is added as a label "job=job_name" to any timeseries scraped from this config.
# Monitored information captured by prometheus
# prometheus fetches application services
# The job_name value is attached as the "job" label in collected time series.
- job_name: node_exporter
static_configs:
- targets: [ 127.0.0.1:19100 ]
- targets: [ 127.0.0.1:19100 ] # node_exporter endpoint for host CPU, memory, disk, and network metrics.
# OpenIM services are discovered dynamically from the admin API.
# For multi-host deployments, replace 127.0.0.1 with a reachable internal address.
- job_name: openimserver-openim-api
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/api"
- url: "http://127.0.0.1:10002/prometheus_discovery/api" # Service discovery endpoint for OpenIM API instances.
# static_configs:
# - targets: [ 127.0.0.1:12002 ]
# labels:
@ -37,7 +36,7 @@ scrape_configs:
- job_name: openimserver-openim-msggateway
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway"
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" # Service discovery endpoint for msggateway instances.
# static_configs:
# - targets: [ 127.0.0.1:12140 ]
# # - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ]
@ -46,7 +45,7 @@ scrape_configs:
- job_name: openimserver-openim-msgtransfer
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer"
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" # Service discovery endpoint for msgtransfer instances.
# static_configs:
# - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ]
# # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ]
@ -55,7 +54,7 @@ scrape_configs:
- job_name: openimserver-openim-push
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/push"
- url: "http://127.0.0.1:10002/prometheus_discovery/push" # Service discovery endpoint for push service instances.
# static_configs:
# - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ]
## - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180, 127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185, 127.0.0.1:12186 ]
@ -64,7 +63,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-auth
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/auth"
- url: "http://127.0.0.1:10002/prometheus_discovery/auth" # Service discovery endpoint for auth RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12200 ]
# labels:
@ -72,7 +71,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-conversation
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/conversation"
- url: "http://127.0.0.1:10002/prometheus_discovery/conversation" # Service discovery endpoint for conversation RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12220 ]
# labels:
@ -80,7 +79,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-friend
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/friend"
- url: "http://127.0.0.1:10002/prometheus_discovery/friend" # Service discovery endpoint for friend RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12240 ]
# labels:
@ -88,7 +87,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-group
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/group"
- url: "http://127.0.0.1:10002/prometheus_discovery/group" # Service discovery endpoint for group RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12260 ]
# labels:
@ -96,7 +95,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-msg
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/msg"
- url: "http://127.0.0.1:10002/prometheus_discovery/msg" # Service discovery endpoint for msg RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12280 ]
# labels:
@ -104,7 +103,7 @@ scrape_configs:
- job_name: openimserver-openim-rpc-third
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/third"
- url: "http://127.0.0.1:10002/prometheus_discovery/third" # Service discovery endpoint for third-party RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12300 ]
# labels:
@ -112,8 +111,8 @@ scrape_configs:
- job_name: openimserver-openim-rpc-user
http_sd_configs:
- url: "http://127.0.0.1:10002/prometheus_discovery/user"
- url: "http://127.0.0.1:10002/prometheus_discovery/user" # Service discovery endpoint for user RPC instances.
# static_configs:
# - targets: [ 127.0.0.1:12320 ]
# labels:
# namespace: default
# namespace: default

Loading…
Cancel
Save