I am currently testing a high-availability (HA) setup using repmgr with PostgreSQL. Automatic failover works fine when a server or database goes down and repmgr detects it. However, we have a problem with NIC failures: when communication between the nodes is disrupted, the standby is promoted while the original primary remains active, resulting in a split-brain scenario (primary-primary). To address this, I am attempting to use a script, referenced from repmgr.conf, that forces the primary database to shut down if it cannot ping the standby for 10 seconds. However, in a NIC failure the standby is still promoted to primary while the original primary does not shut down, which again leads to a primary-primary scenario. Is there a way to resolve this?
Database: PostgreSQL 15.3; repmgr: 5.3.3
# NOTE(review): election_rerun_interval is set here and again, with the same
# value, in the "Failover and monitoring settings" section below; one of the
# two occurrences is redundant.
election_rerun_interval=10
# =============================================================================
# Required configuration items
# =============================================================================
# This file describes node 2, named 'postgresdb192'.
node_id=2
node_name='postgresdb192'
# Connection string for this node; connect_timeout is 2 seconds.
conninfo='host=postgresdb192 user=repmgr dbname=postgres connect_timeout=2'
data_directory='/postgres15/data'
#------------------------------------------------------------------------------
# Replication settings
#------------------------------------------------------------------------------
use_replication_slots=yes
#------------------------------------------------------------------------------
# Logging settings
#------------------------------------------------------------------------------
log_level=INFO
log_facility=STDERR
# The failover script referenced below appends its own messages to this file.
log_file='/pglog/repmgrd.log'
#------------------------------------------------------------------------------
# Environment/command settings
#------------------------------------------------------------------------------
pg_bindir='/postgres15/app/postgres/bin'
#------------------------------------------------------------------------------
# external command options
#------------------------------------------------------------------------------
pg_ctl_options='-s -l /dev/null'
ssh_options='-q -o ConnectTimeout=10'
#------------------------------------------------------------------------------
# Standby follow settings
#------------------------------------------------------------------------------
primary_follow_timeout=60
#------------------------------------------------------------------------------
# Failover and monitoring settings (repmgrd)
#------------------------------------------------------------------------------
failover=automatic
priority=100
# 3 attempts at 5-second intervals before the upstream is treated as failed
# (presumably about 15 seconds in total - confirm against the repmgr docs).
reconnect_attempts=3
reconnect_interval=5
promote_command='repmgr standby promote -f /postgres15/app/postgres/etc/repmgr.conf --log-to-file'
follow_command='repmgr standby follow -f /postgres15/app/postgres/etc/repmgr.conf -W --upstream-node-id=%n --log-to-file'
monitoring_history=true
# NOTE(review): per the repmgr documentation this command is run by repmgrd on
# the promotion candidate (the standby), and a non-zero exit status vetoes the
# promotion - confirm. If so, it is not a hook that runs on the primary, and a
# script that always exits 0 never vetoes anything.
# NOTE(review): for making the primary fence itself when its standby
# disappears, repmgrd documents child_nodes_disconnect_command together with
# child_nodes_disconnect_timeout - confirm availability in repmgr 5.3.3.
failover_validation_command='/postgres15/app/postgres/etc/failover.sh'
# Same setting and value as the first line of this configuration.
election_rerun_interval=10
#degraded_monitoring_timeout=-1
#!/bin/bash
# Addresses and paths shared by the rest of this script.
PRIMARY_IP='10.12.30.191'   # target of the reachability ping
STANDBY_IP='10.12.30.192'   # not referenced anywhere else in the visible script
REPMGR_CONFIG='/postgres15/app/postgres/etc/repmgr.conf'   # handed to "repmgr -f"
PGLOG='/pglog/repmgrd.log'  # file the script appends its log lines to
# Print the current local time as "[YYYY-MM-DD HH:MM:SS]" on stdout.
echodate() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s]\n' "$stamp"
}
# Log a timestamped "[FAILOVER]" line to $PGLOG, then ask repmgr to stop the
# PostgreSQL service, using the configuration file in $REPMGR_CONFIG.
# Returns the exit status of the repmgr command.
# NOTE(review): "repmgr node service" presumably acts on the node this script
# runs on, not on a remote primary - confirm against the repmgr docs.
stop_primary_db() {
  local stamp
  stamp="$(echodate)"
  printf '%s\n' "${stamp} [FAILOVER] Stopping primary PostgreSQL database" >> "$PGLOG"
  repmgr -f "$REPMGR_CONFIG" node service --action=stop
}
# Self-fencing check, intended to run on the primary: if the standby cannot be
# reached for 10 seconds, stop the PostgreSQL instance managed through
# $REPMGR_CONFIG so that a promoted standby does not end up next to a
# still-active primary (split brain).
#
# The probe targets STANDBY_IP. Pinging PRIMARY_IP from the primary only tests
# the local address and says nothing about whether the peer is reachable.
#
# NOTE(review): repmgrd is documented to run failover_validation_command on the
# promotion candidate (the standby), so this script does not execute on the
# primary if it is only wired in there. For fencing the primary, look at
# repmgrd's child_nodes_disconnect_command - confirm against the repmgr docs.
#
# Exit status: 0 when the standby answered or the database was stopped,
#              1 when the stop command failed.

# "-c 1 -w 10": keep probing for up to 10 seconds and return as soon as one
# reply arrives (iputils ping semantics), so a single dropped packet does not
# trigger a shutdown.
if ping -c 1 -w 10 "$STANDBY_IP" > /dev/null 2>&1; then
  # Standby reachable: nothing to do.
  echo "$(echodate) [FAILOVER] No failover condition met, continuing normal operation" >> "$PGLOG"
  exit 0
fi

# No reply from the standby within the deadline: fence this node.
echo "$(echodate) [FAILOVER] Standby $STANDBY_IP unreachable for 10 seconds" >> "$PGLOG"
if ! stop_primary_db; then
  # Do not report success when the database could not be stopped.
  echo "$(echodate) [FAILOVER] ERROR: stopping PostgreSQL failed" >> "$PGLOG"
  exit 1
fi
exit 0