From 801ac740b719ebe1984cc2c55e0bdf1e21f9fc75 Mon Sep 17 00:00:00 2001 From: icey-yu <1186114839@qq.com> Date: Fri, 20 Mar 2026 18:24:51 +0800 Subject: [PATCH] feat: enhance configuration files with detailed comments for clarity --- config/alertmanager.yml | 42 +++++++++++++------------ config/email.tmpl | 3 ++ config/instance-down-rules.yml | 27 ++++++++-------- config/openim-api.yml | 2 +- config/openim-msggateway.yml | 2 +- config/openim-msgtransfer.yml | 2 +- config/openim-push.yml | 2 +- config/openim-rpc-auth.yml | 2 +- config/openim-rpc-conversation.yml | 2 +- config/openim-rpc-friend.yml | 2 +- config/openim-rpc-group.yml | 2 +- config/openim-rpc-msg.yml | 2 +- config/openim-rpc-third.yml | 2 +- config/openim-rpc-user.yml | 2 +- config/prometheus.yml | 49 +++++++++++++++--------------- 15 files changed, 74 insertions(+), 69 deletions(-) diff --git a/config/alertmanager.yml b/config/alertmanager.yml index 6c675ab6f..0f4d9875f 100644 --- a/config/alertmanager.yml +++ b/config/alertmanager.yml @@ -1,34 +1,36 @@ +# Global Alertmanager runtime and SMTP settings. global: - resolve_timeout: 5m - smtp_from: alert@openim.io - smtp_smarthost: smtp.163.com:465 - smtp_auth_username: alert@openim.io - smtp_auth_password: YOURAUTHPASSWORD - smtp_require_tls: false - smtp_hello: xxx + resolve_timeout: 5m # Wait time before an alert is considered resolved when no further updates are received. + smtp_from: alert@openim.io # Sender address displayed in alert emails. + smtp_smarthost: smtp.163.com:465 # SMTP relay endpoint in host:port format. + smtp_auth_username: alert@openim.io # SMTP authentication username (commonly the same as smtp_from). + smtp_auth_password: YOURAUTHPASSWORD # SMTP authorization token or app password. + smtp_require_tls: false # Set to true when your SMTP provider requires STARTTLS. + smtp_hello: xxx # HELO/EHLO identity presented to the SMTP server. 
templates: - - /etc/alertmanager/email.tmpl + - /etc/alertmanager/email.tmpl # Go template file used to render HTML email content. +# Root routing tree for all incoming alerts. route: - group_by: [ 'alertname' ] - group_wait: 5s - group_interval: 5s - repeat_interval: 5m - receiver: email + group_by: [ 'alertname' ] # Alerts sharing this label value are batched into one notification. + group_wait: 5s # Initial delay before sending the first notification for a new alert group. + group_interval: 5s # Minimum interval between notifications for the same alert group. + repeat_interval: 5m # Reminder interval while an alert group remains firing. + receiver: email # Default receiver when no child route matches. routes: - matchers: - - alertname = "XXX" - group_by: [ 'instance' ] + - alertname = "XXX" # Example matcher; replace with a real alert name or remove this route. + group_by: [ 'instance' ] # Override grouping for this specific route. group_wait: 5s group_interval: 5s repeat_interval: 5m receiver: email receivers: - - name: email + - name: email # Receiver name referenced by route.receiver. email_configs: - - to: 'alert@example.com' - html: '{{ template "email.to.html" . }}' - headers: { Subject: "[OPENIM-SERVER]Alarm" } - send_resolved: true + - to: 'alert@example.com' # Recipient mailbox for alert notifications. + html: '{{ template "email.to.html" . }}' # Rendered with the template declared in email.tmpl. + headers: { Subject: "[OPENIM-SERVER]Alarm" } # Custom email subject line. + send_resolved: true # Also send a notification when the alert recovers. diff --git a/config/email.tmpl b/config/email.tmpl index 824144e9d..ab9642e85 100644 --- a/config/email.tmpl +++ b/config/email.tmpl @@ -1,3 +1,6 @@ +{{/* OpenIM Alertmanager email template. +This template renders both firing and resolved alerts. +Each alert entry reads labels and annotations from Prometheus rule definitions. 
*/}} {{ define "email.to.html" }} {{ if eq .Status "firing" }} {{ range .Alerts }} diff --git a/config/instance-down-rules.yml b/config/instance-down-rules.yml index bcac7ba60..60b26e33f 100644 --- a/config/instance-down-rules.yml +++ b/config/instance-down-rules.yml @@ -1,30 +1,31 @@ +# Default Prometheus alert groups for OpenIM. groups: - - name: instance_down + - name: instance_down # Fires when a monitored target remains unreachable. rules: - alert: InstanceDown - expr: up == 0 - for: 1m + expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails. + for: 1m # Trigger only if the condition remains true for more than 1 minute. labels: - severity: critical + severity: critical # Used by Alertmanager for routing and notification priority. annotations: summary: "Instance {{ $labels.instance }} down" - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes." + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute." - - name: database_insert_failure_alerts + - name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB. rules: - alert: DatabaseInsertFailed - expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) - for: 1m + expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes. + for: 1m # Avoid firing on very short spikes. labels: severity: critical annotations: summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected" - description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash." 
+ description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage." - - name: registrations_few + - name: registrations_few # Operational early-warning rule for unusually low login/registration activity. rules: - alert: RegistrationsFew - expr: increase(user_login_total[1h]) == 0 + expr: increase(user_login_total[1h]) == 0 # No successful login/registration events observed in 1 hour. for: 1m labels: severity: info @@ -32,10 +33,10 @@ groups: summary: "Too few registrations within the time frame" description: "The number of registrations in the last hour is 0. There might be some issues." - - name: messages_few + - name: messages_few # Operational early-warning rule for unusually low messaging activity. rules: - alert: MessagesFew - expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 + expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour. for: 1m labels: severity: info diff --git a/config/openim-api.yml b/config/openim-api.yml index a23b5fb31..7f9aa419c 100644 --- a/config/openim-api.yml +++ b/config/openim-api.yml @@ -8,7 +8,7 @@ api: prometheus: - # Whether to enable prometheus + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. 
enable: true # autoSetPorts indicates whether to automatically set the ports autoSetPorts: true diff --git a/config/openim-msggateway.yml b/config/openim-msggateway.yml index d374ce3c7..b082760be 100644 --- a/config/openim-msggateway.yml +++ b/config/openim-msggateway.yml @@ -8,7 +8,7 @@ rpc: ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-msgtransfer.yml b/config/openim-msgtransfer.yml index 39b23b222..ca2af8d1d 100644 --- a/config/openim-msgtransfer.yml +++ b/config/openim-msgtransfer.yml @@ -1,5 +1,5 @@ prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # autoSetPorts indicates whether to automatically set the ports autoSetPorts: true diff --git a/config/openim-push.yml b/config/openim-push.yml index 1bb84a172..b039b695e 100644 --- a/config/openim-push.yml +++ b/config/openim-push.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. 
diff --git a/config/openim-rpc-auth.yml b/config/openim-rpc-auth.yml index d6e234b63..cc5c674ef 100644 --- a/config/openim-rpc-auth.yml +++ b/config/openim-rpc-auth.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10200 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-rpc-conversation.yml b/config/openim-rpc-conversation.yml index 0636a76e3..825c35b12 100644 --- a/config/openim-rpc-conversation.yml +++ b/config/openim-rpc-conversation.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10220 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-rpc-friend.yml b/config/openim-rpc-friend.yml index e2b150cec..d3966d169 100644 --- a/config/openim-rpc-friend.yml +++ b/config/openim-rpc-friend.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10240 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. 
diff --git a/config/openim-rpc-group.yml b/config/openim-rpc-group.yml index a8c2d5ec1..3b7e769b3 100644 --- a/config/openim-rpc-group.yml +++ b/config/openim-rpc-group.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10260 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-rpc-msg.yml b/config/openim-rpc-msg.yml index fdb6d8035..f8a301874 100644 --- a/config/openim-rpc-msg.yml +++ b/config/openim-rpc-msg.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10280 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-rpc-third.yml b/config/openim-rpc-third.yml index 50088fc03..da5f58ae0 100644 --- a/config/openim-rpc-third.yml +++ b/config/openim-rpc-third.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10300 ] prometheus: - # Enable or disable Prometheus monitoring + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. enable: true # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup # It will only take effect when autoSetPorts is set to false. diff --git a/config/openim-rpc-user.yml b/config/openim-rpc-user.yml index 7da94ca0d..1965eb808 100644 --- a/config/openim-rpc-user.yml +++ b/config/openim-rpc-user.yml @@ -10,7 +10,7 @@ rpc: ports: [ 10320 ] prometheus: - # Whether to enable prometheus + # Enable Prometheus metrics exposure for this service; set to true to allow scraping. 
enable: true # Prometheus listening ports, must be consistent with the number of rpc.ports # It will only take effect when autoSetPorts is set to false. diff --git a/config/prometheus.yml b/config/prometheus.yml index 0b13326d1..68a9e7c45 100644 --- a/config/prometheus.yml +++ b/config/prometheus.yml @@ -1,35 +1,34 @@ -# my global config +# Global Prometheus runtime settings. global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. - # scrape_timeout is set to the global default (10s). + # scrape_timeout defaults to 10s unless overridden in a specific scrape job. -# Alertmanager configuration +# Alertmanager endpoints that receive alert events from Prometheus. alerting: alertmanagers: - static_configs: - - targets: [127.0.0.1:19093] + - targets: [127.0.0.1:19093] # Alertmanager address in host:port format. -# Load rules once and periodically evaluate them according to the global evaluation_interval. +# Rule files loaded by Prometheus. rule_files: - - instance-down-rules.yml + - instance-down-rules.yml # Default OpenIM alert rules; add more files here if needed. # - first_rules.yml # - second_rules.yml -# A scrape configuration containing exactly one endpoint to scrape: -# Here it's Prometheus itself. +# Scrape jobs used to collect infrastructure and OpenIM service metrics. scrape_configs: - # The job name is added as a label "job=job_name" to any timeseries scraped from this config. - # Monitored information captured by prometheus - - # prometheus fetches application services + # The job_name value is attached as the "job" label in collected time series. - job_name: node_exporter static_configs: - - targets: [ 127.0.0.1:19100 ] + - targets: [ 127.0.0.1:19100 ] # node_exporter endpoint for host CPU, memory, disk, and network metrics. + + # OpenIM services are discovered dynamically via the HTTP service-discovery (http_sd_configs) endpoints under /prometheus_discovery.
+ # For multi-host deployments, replace 127.0.0.1 with a reachable internal address. - job_name: openimserver-openim-api http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/api" + - url: "http://127.0.0.1:10002/prometheus_discovery/api" # Service discovery endpoint for OpenIM API instances. # static_configs: # - targets: [ 127.0.0.1:12002 ] # labels: @@ -37,7 +36,7 @@ scrape_configs: - job_name: openimserver-openim-msggateway http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" + - url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" # Service discovery endpoint for msggateway instances. # static_configs: # - targets: [ 127.0.0.1:12140 ] # # - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ] @@ -46,7 +45,7 @@ scrape_configs: - job_name: openimserver-openim-msgtransfer http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" + - url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" # Service discovery endpoint for msgtransfer instances. 
# static_configs: # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ] # # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ] @@ -55,7 +54,7 @@ scrape_configs: - job_name: openimserver-openim-push http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/push" + - url: "http://127.0.0.1:10002/prometheus_discovery/push" # Service discovery endpoint for push service instances. # static_configs: # - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ] ## - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180, 127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185, 127.0.0.1:12186 ] @@ -64,7 +63,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-auth http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/auth" + - url: "http://127.0.0.1:10002/prometheus_discovery/auth" # Service discovery endpoint for auth RPC instances. # static_configs: # - targets: [ 127.0.0.1:12200 ] # labels: @@ -72,7 +71,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-conversation http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/conversation" + - url: "http://127.0.0.1:10002/prometheus_discovery/conversation" # Service discovery endpoint for conversation RPC instances. 
# static_configs: # - targets: [ 127.0.0.1:12220 ] # labels: @@ -80,7 +79,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-friend http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/friend" + - url: "http://127.0.0.1:10002/prometheus_discovery/friend" # Service discovery endpoint for friend RPC instances. # static_configs: # - targets: [ 127.0.0.1:12240 ] # labels: @@ -88,7 +87,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-group http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/group" + - url: "http://127.0.0.1:10002/prometheus_discovery/group" # Service discovery endpoint for group RPC instances. # static_configs: # - targets: [ 127.0.0.1:12260 ] # labels: @@ -96,7 +95,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-msg http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/msg" + - url: "http://127.0.0.1:10002/prometheus_discovery/msg" # Service discovery endpoint for msg RPC instances. # static_configs: # - targets: [ 127.0.0.1:12280 ] # labels: @@ -104,7 +103,7 @@ scrape_configs: - job_name: openimserver-openim-rpc-third http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/third" + - url: "http://127.0.0.1:10002/prometheus_discovery/third" # Service discovery endpoint for third-party RPC instances. # static_configs: # - targets: [ 127.0.0.1:12300 ] # labels: @@ -112,8 +111,8 @@ scrape_configs: - job_name: openimserver-openim-rpc-user http_sd_configs: - - url: "http://127.0.0.1:10002/prometheus_discovery/user" + - url: "http://127.0.0.1:10002/prometheus_discovery/user" # Service discovery endpoint for user RPC instances. # static_configs: # - targets: [ 127.0.0.1:12320 ] # labels: -# namespace: default \ No newline at end of file +# namespace: default