Image of alerts that are in the firing state. Image of alerts not showing up in the Alertmanager UI even though they are firing. What else do I need to change in alertmanager.yml to get notified in Outlook? I'm not running this setup in containers; I installed node-exporter, Prometheus, and Alertmanager individually. alertmanager.yml
# Alertmanager configuration: routes every alert to a single e-mail receiver
# that delivers through the Office 365 SMTP relay.
global:
  # SMTP relay and credentials shared by all email_configs below.
  smtp_smarthost: 'smtp.office365.com:587'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'secret'
  # true is already Alertmanager's default; stated explicitly because port 587
  # is the STARTTLS submission port.
  smtp_require_tls: true

route:
  # Default (and only) receiver; there are no child routes.
  receiver: 'team-X-mails'
  # group_wait and group_interval are the Alertmanager defaults, written out so
  # the relationship with repeat_interval is visible.
  group_wait: 30s
  group_interval: 5m
  # Was 1m, which is shorter than group_interval. repeat_interval should be a
  # multiple of group_interval, and re-sending the same e-mail every minute
  # floods the mailbox. 4h is the Alertmanager default.
  repeat_interval: 4h

receivers:
  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'
prometheus.yml
# Prometheus server configuration.
global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s

# Alertmanager instance(s) that firing alerts are pushed to.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'localhost:9093'

# Rule files loaded by this server.
rule_files:
  - 'prometheus.rules.yml'
  - 'alerting.rules.yml'

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Targets discovered from files matching targets/*.yml, re-read every 30s.
  - job_name: 'prometheus_master'
    file_sd_configs:
      - files:
          - 'targets/*.yml'
        refresh_interval: 30s
alerting.rules.yml
# Prometheus alerting rules. Every alert is labelled severity=critical.
groups:
  - name: server_is_down
    rules:
      # Fires once `up == 0` has held continuously for 1 minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance - {{ $labels.instance }} is down"
          description: "This is the instance - {{ $labels.instance }} of job {{ $labels.job }} has been down."

  - name: cpu_utilized
    rules:
      # 100 minus the per-instance average idle percentage, above 80.
      # NOTE(review): there is no `for:` clause, so this fires on the first
      # evaluation where the expression is true - confirm that is intended.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{job="prometheus_master",mode="idle"}[5m])) * 100) > 80
        labels:
          severity: critical
        annotations:
          summary: "High CPU Usage on this instance - {{ $labels.instance }}"
          description: "CPU usage on this instance - {{ $labels.instance }} is above 80%"

  - name: memory_utilized
    rules:
      # Percentage of memory not reported as available, above 50.
      - alert: HighMemoryUsage
        expr: 100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes))) > 50
        labels:
          severity: critical
        annotations:
          summary: "High Memory Usage on this instance - {{ $labels.instance }}"
          description: "Memory usage on this instance - {{ $labels.instance }} is above 50%"

  - name: disk_utilized_scratch_tmp
    rules:
      # Used-space percentage of the /scratch or /tmp mountpoint, above 80.
      - alert: DiskSpaceExhausted
        expr: |
          (
            (node_filesystem_size_bytes{mountpoint="/scratch"} - node_filesystem_free_bytes{mountpoint="/scratch"})
              / node_filesystem_size_bytes{mountpoint="/scratch"} * 100 > 80
          )
          or
          (
            (node_filesystem_size_bytes{mountpoint="/tmp"} - node_filesystem_free_bytes{mountpoint="/tmp"})
              / node_filesystem_size_bytes{mountpoint="/tmp"} * 100 > 80
          )
        labels:
          severity: critical
        annotations:
          summary: "Disk Space Exhausted on this instance - {{ $labels.instance }}"
          # Text previously said 90%, which contradicted the 80 threshold in expr.
          description: "Disk space utilized on this instance - {{ $labels.instance }} is above 80%"
prometheus.service
# systemd unit that runs the Prometheus server as the unprivileged
# prometheus user.
[Unit]
Description=Prometheus
# Start only after the network is reported online.
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Type=simple
# All flags use the --flag=value form; the config file and TSDB path are the
# same as before.
# NOTE(review): the external URL carries no port - confirm something serves
# Prometheus on port 80 of this host, otherwise links in alerts will not resolve.
ExecStart=/usr/local/bin/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus_data \
    --web.console.templates=/etc/prometheus/consoles \
    --web.console.libraries=/etc/prometheus/console_libraries \
    --web.external-url=http://hostname

[Install]
WantedBy=multi-user.target
alertmanager.service
# systemd unit that runs Alertmanager as the unprivileged alertmanager user.
[Unit]
Description=Alertmanager
# Start only after the network is reported online.
Wants=network-online.target
After=network-online.target

[Service]
User=alertmanager
Group=alertmanager
Type=simple
# The process starts with /etc/alertmanager/ as its working directory.
WorkingDirectory=/etc/alertmanager/
ExecStart=/usr/local/bin/alertmanager \
    --config.file=/etc/alertmanager/alertmanager.yml \
    --web.external-url=http://hostname:9093

[Install]
WantedBy=multi-user.target