Image of alerts that are in the firing state. Image of alerts not showing up in the Alertmanager UI even though they are firing. What else do I need to change in alertmanager.yml to get notified in Outlook? I'm not running this setup in containers; I installed node-exporter, Prometheus and Alertmanager individually. alertmanager.yml

# SMTP settings used by the email receiver defined below.
global:
  smtp_from: '[email protected]'
  smtp_smarthost: 'smtp.office365.com:587'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'secret'

# Root route: no child routes are defined, so every alert is handed to the
# single receiver named here.
route:
  receiver: 'team-X-mails'
  # Interval before a notification that was already sent is sent again.
  repeat_interval: 1m

receivers:
  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'

prometheus.yml

global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s

# Alertmanager endpoint(s) that alerts are pushed to.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'localhost:9093'

# Recording/alerting rule files to load.
rule_files:
  - 'prometheus.rules.yml'
  - 'alerting.rules.yml'

scrape_configs:
  # Single statically configured target.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Targets are read from the files matching targets/*.yml; the file list is
  # refreshed every 30 seconds.
  - job_name: 'prometheus_master'
    file_sd_configs:
      - files:
          - 'targets/*.yml'
        refresh_interval: 30s

alerting.rules.yml

groups:
# Fires when `up == 0` has held for a target for one minute.
- name: server_is_down
  rules:
    - alert: InstanceDown
      expr: up == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Instance - {{ $labels.instance }} is down'
        description: 'This is the instance - {{ $labels.instance }} of job {{ $labels.job }} has been down.'

- name: cpu_utilized
  rules:
  # CPU busy percentage per instance: 100 minus the idle percentage, averaged
  # over all series of the instance, compared against an 80% threshold.
  # rate() over the 5m window is used instead of irate(): irate() only looks
  # at the last two samples in the window, so a single momentary spike would
  # fire this alert (there is no `for:` clause to absorb it).
  - alert: HighCPUUsage
    expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{job="prometheus_master",mode="idle"}[5m])) * 100) > 80
    labels:
      severity: critical
    annotations:
      summary: "High CPU Usage on this instance - {{ $labels.instance }}"
      description: "CPU usage on this instance - {{ $labels.instance }} is above 80%"

- name: memory_utilized
  rules:
    # Used-memory percentage: 100 * (1 - MemAvailable / MemTotal), alerting
    # above 50%. There is no `for:` clause.
    - alert: HighMemoryUsage
      expr: 100 * (1 - ((node_memory_MemAvailable_bytes) / (node_memory_MemTotal_bytes))) > 50
      annotations:
        summary: 'High Memory Usage on this instance - {{ $labels.instance }}'
        description: 'Memory usage on this instance - {{ $labels.instance }} is above 50%'
      labels:
        severity: critical

- name: disk_utilized_scratch_tmp
  rules:
  # Used-space percentage, (size - free) / size * 100, checked separately for
  # the /scratch and /tmp mountpoints; either one exceeding 80% fires the alert.
  - alert: DiskSpaceExhausted
    expr: ((node_filesystem_size_bytes{mountpoint="/scratch"} - node_filesystem_free_bytes{mountpoint="/scratch"}) / node_filesystem_size_bytes{mountpoint="/scratch"} * 100 > 80) or ((node_filesystem_size_bytes{mountpoint="/tmp"} - node_filesystem_free_bytes{mountpoint="/tmp"}) / node_filesystem_size_bytes{mountpoint="/tmp"} * 100 > 80)
    labels:
      severity: critical
    annotations:
      summary: "Disk Space Exhausted on this instance - {{ $labels.instance }}"
      # The text must state the same threshold as the expression above (80).
      description: "Disk space utilized on this instance - {{ $labels.instance }} is above 80%"


prometheus.service


[Unit]
Description=Prometheus
# Pull in network-online.target and start only after it has been reached.
Wants=network-online.target
After=network-online.target

[Service]
# Run the process as user/group prometheus.
User=prometheus
Group=prometheus
Type=simple
# Every flag uses the --flag=value form so the command line is uniform.
ExecStart=/usr/local/bin/prometheus \
    --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/var/lib/prometheus_data \
    --web.console.templates=/etc/prometheus/consoles \
    --web.console.libraries=/etc/prometheus/console_libraries \
    --web.external-url=http://hostname

[Install]
WantedBy=multi-user.target

alertmanager.service


[Unit]
Description=Alertmanager
# Ordered after, and pulling in, network-online.target.
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
# Run the process as user/group alertmanager from /etc/alertmanager/.
User=alertmanager
Group=alertmanager
WorkingDirectory=/etc/alertmanager/
ExecStart=/usr/local/bin/alertmanager \
    --config.file=/etc/alertmanager/alertmanager.yml \
    --web.external-url=http://hostname:9093

[Install]
WantedBy=multi-user.target

0

There are 0 best solutions below