/*
 * check-replication - Check the replication status of a single master and
 * "N" slave Postgres 9.x+ servers running in streaming replication mode.
 * The program takes two arguments - a configuration file path and the
 * maximum permissible offset (in bytes) between the two WAL locations.
 * The program steps through the file, and checks the master and slave
 * locations, plus the proper status (in recovery for slaves, not for
 * the master!) and if there are problems they are reported to stdout.
 * If all is well the output is silent.
 *
 * The format of the parameter file is:
 * name connection-string (e.g. "host=xxx.com dbname=any-db user=user")
 * 
 * The first line is the master server, all others are slaves.
 *
 * No claim is made that this thing is particularly clean - just that it
 * works.
 *
 * compile with:
 * 	cc -L /usr/local/pgsql/lib -I/usr/local/pgsql/include \
 *		-o check-replication -trigraphs check-replication.c -lpq
 *
 * Released to the public by Karl Denninger (karl@denninger.net) 10/3/2010 
 * under the FreeBSD license.  All may use without encumbrance (for any
 * purpose, commercial or not) so long as this notice remains.
 * 
 */


#include	<stdio.h>
#include	<stdlib.h>
#include	<string.h>
#include	<syslog.h>
#include	<unistd.h>
#include	<sys/types.h>
#include	<pwd.h>
#include	<signal.h>
#include	<libpq-fe.h>

#include	"defs.h"

/* Connection to the master; opened once in main() and reused for every slave. */
PGconn	*conn;
/* Connection to the slave currently being checked; reopened for each entry. */
PGconn	*conn_slave;

/*
 * convert_long - turn a textual WAL location of the form "HI/LO" (both
 * fields hexadecimal) into a single number so that two locations can be
 * compared and subtracted.
 *
 * str	- NUL-terminated location text; NULL is treated like an empty string.
 *
 * Returns HI * (16MB * 255) + LO.  A field that cannot be parsed counts as
 * zero, so an empty or malformed string yields 0 rather than a value built
 * from uninitialised locals.
 *
 * NOTE(review): the 16MB * 255 multiplier looks like the pre-9.3 WAL layout
 * (255 usable 16MB segments per log id).  Newer servers report a plain
 * 64-bit position split into high/low 32-bit halves - confirm against the
 * server versions this is run against before relying on the byte count.
 */
unsigned	long	convert_long(char *str)
{
	unsigned long	logid = 0;
	unsigned long	recoff = 0;

	if (str == NULL) {
		return(0);
	}

	/*
	 * sscanf() leaves unmatched targets untouched, so when only the
	 * first field parses the second keeps its zero default.
	 */
	if (sscanf(str, "%lx/%lx", &logid, &recoff) < 1) {
		return(0);
	}

	return((logid * 16 * 1024 * 1024 * 255) + recoff);
}


/*
 * notify - report one problem to the operator by printing it, followed by
 * a newline, on stdout.
 *
 * string	- NUL-terminated message text (must not be NULL).
 *
 * The return type is spelled out as int, which is what the original
 * implicit-int definition declared; the function always returns 0.
 */
int	notify(char *string)
{
	printf("%s\n", string);
	return(0);
}

int	main(argc, argv)
int	argc;
char	*argv[];

{
	
	PGresult	*result;
	PGresult	*result2;

	FILE	*fid;


	struct	passwd	*pwd;
	char	tmp[512];
	char	tmp2[512];
	char	master[512];
	char	slave[512];

	unsigned	long	posmaster;
	unsigned	long	posclient;

	if (argc != 3) {
		printf("Usage: %s file-containing-nodes-to-check max-offset\n", argv[0]);
		exit(1);
	}
	fid = fopen(argv[1], "r");
	if (!fid) {
		printf("Error: Cannot open %s\n", argv[1]);
		exit(1);
	}

	
	fscanf(fid, "%[!-~] %[!-~ ]", master, tmp);	/* Get master */
	
	conn = PQconnectdb(tmp);
	if (PQstatus(conn) == CONNECTION_BAD) {
		sprintf(tmp, "%s - %s", master, PQerrorMessage(conn));
		notify(tmp);
		PQfinish(conn);
		exit(0);
	}
	result = PQexec(conn, "select pg_is_in_recovery()");
	if (!PQntuples(result)) {
		sprintf(tmp, "ERROR: Master [%s] fails to give us recovery status - cannot continue", master);
		notify(tmp);
		exit(1);
	}
	strcpy(tmp, PQgetvalue(result, 0, 0));
	PQclear(result);
	if (tmp[0] == 't')  {
		sprintf(tmp, "ERROR: Master [%s] is in recovery - cannot continue", master);
		notify(tmp);
		exit(1);
	}
	
	while (fscanf(fid, " %[!-~] %[!-~ ]", slave, tmp) == 2) {
		conn_slave = PQconnectdb(tmp);
		if (PQstatus(conn_slave) == CONNECTION_BAD) {
			sprintf(tmp, "%s - %s", slave, PQerrorMessage(conn_slave));
			notify(tmp);
			continue;
		}
		result = PQexec(conn_slave, "select pg_is_in_recovery()");
		if (!PQntuples(result)) {
			sprintf(tmp, "ERROR: Slave [%s] fails to give us recovery status - skip", slave);
			notify(tmp);
			PQfinish(conn_slave);
			continue;
		}
		strcpy(tmp, PQgetvalue(result, 0, 0));
		PQclear(result);
		if (tmp[0] != 't')  {
			sprintf(tmp, "ERROR: Slave [%s] is NOT in recovery - skip", slave);
			notify(tmp);
			PQfinish(conn_slave);
			continue;
			
		}
	
		sprintf(tmp2, "select pg_last_xlog_replay_location()");
		result = PQexec(conn_slave, tmp2);
		if (!PQntuples(result))  {
			sprintf(tmp, "ERROR: Slave [%s] returned no result on replay location - skip", slave);
			notify(tmp);
			PQfinish(conn_slave);
			continue;
		}
		strcpy(tmp, PQgetvalue(result, 0, 0));
		PQclear(result);
		posclient = convert_long(tmp);
		
		sprintf(tmp2, "select pg_current_xlog_location()");
		result = PQexec(conn, tmp2);
		if (!PQntuples(result))  {
			sprintf(tmp, "ERROR: Master [%s] returned no result on replay location - EXIT", master);
			notify(tmp);
			PQfinish(conn_slave);
			PQfinish(conn);
			exit(1);
		}
		strcpy(tmp, PQgetvalue(result, 0, 0));
		PQclear(result);
		posmaster = convert_long(tmp);
		if (posmaster > (posclient + atoi(argv[2]))) {
			sprintf(tmp, "ERROR: Client [%s] lags master by %ld", slave, posmaster - posclient);
			notify(tmp);
		}
		PQfinish(conn_slave);
	}
	PQfinish(conn);
	fclose(fid);
	exit(0);
}

