Monitoring Solaris Volume Manager With a cron Job

How to Automate Checking for Errors in Volumes

  • To automatically check your Solaris Volume Manager configuration for errors, create a script that the cron utility can periodically run.

    The following example shows a script that you can adapt and modify for your needs.

    Note

    This script serves as a starting point for automating error checking for Solaris Volume Manager. You probably need to modify this script for your own configuration.

    #!/bin/ksh
    #
    #ident "@(#)metacheck.sh   1.3     96/06/21 SMI"
    # ident='%Z%%M%   %I%     %E% SMI'
    #
    # Copyright (c) 1999 by Sun Microsystems, Inc.
    #
    # metacheck
    #
    # Check on the status of the metadevice configuration.  If there is a problem
    # return a non zero exit code.  Depending on options, send email notification.
    #
    # -h
    #	help
    # -s setname
    #	Specify the set to check.  By default, the 'local' set will be checked.
    # -m recipient [recipient...]
    #	Send email notification to the specified recipients.  This
    #	must be the last argument. The notification shows up as a short 
    #	email message with a subject of 
    #		"Solaris Volume Manager Problem: metacheck.who.nodename.setname"
    #	which summarizes the problem(s) and tells how to obtain detailed 
    #	information. The "setname" is from the -s option, "who" is from 
    #	the -w option, and "nodename" is reported by uname(1).
    #	Email notification is further affected by the following options:
    #		-f	to suppress additional messages after a problem 
    #			has been found. 
    #		-d	to control the suppression.
    #		-w	to identify who generated the email.
    #		-t	to force email even when there is no problem.
    # -w who
    #	indicate who is running the command. By default, this is the
    #	user-name as reported by id(1M). This is used when sending
    #	email notification (-m).
    # -f 
    #	Enable filtering.  Filtering applies to email notification (-m).
    #	Filtering requires root permission.  When sending email notification
    #	the file /etc/lvm/metacheck.setname.pending is used to 
    #	control the filter.  The following matrix specifies the behavior
    #	of the filter:
    #
    #	problem_found	file_exists
    #	  yes		  no		Create file, send notification
    #	  yes		  yes		Resend notification if the current date 
    #					(as specified by -d datefmt) is 
    #					different than the file date.
    #	  no		  yes		Delete file, send notification 
    #					that the problem is resolved.
    #	  no		  no		Send notification if -t specified.
    #	
    # -d datefmt
    #	Specify the format of the date for filtering (-f).  This option 
    #	controls how often re-notification via email occurs. If the
    #	current date according to the specified format (strftime(3C)) is 
    #	identical to the date contained in the 
    #	/etc/lvm/metacheck.setname.pending file then the message is 
    #	suppressed. The default date format is "%D", which will send one 
    #	re-notification per day.
    # -t
    #	Test mode.  Enable email generation even when there is no problem.
    #	Used for end-to-end verification of the mechanism and email addresses.
    #	
    #
    # These options are designed to allow integration of metacheck
    # into crontab.  For example, a root crontab entry of:
    #
    # 0,15,30,45 * * * * /usr/sbin/metacheck -f -w SVMcron \
    #   -d '\%D \%H' -m notice@example.com pager@example.com
    #
    # would check for problems every 15 minutes, and generate an email to
    # notice@example.com (and send to an email pager service) every hour when
    # there is a problem.  Note the \ prior to the '%' characters for a 
    # crontab entry.  Bounced email would come back to root@nodename.
    # The subject line for email generated by the above line would be
    # Solaris Volume Manager Problem: metacheck.SVMcron.nodename.local
    #
    
    # decho text...
    #	Emit a "DEBUG: text" trace line, but only when $debug is "yes".
    #	The line is written to the controlling terminal (/dev/tty) instead
    #	of stdout, so it remains visible when the caller sits inside a pipe
    #	or a command substitution.
    decho()
    {
        case "$debug" in
        yes)
            echo "DEBUG: $*" < /dev/null > /dev/tty 2>&1
            ;;
        esac
    }
    
    # strstr needle [word...]
    #	Print needle when it is identical to one of the words that follow
    #	it; otherwise print an empty line.  Whole words are compared, not
    #	substrings.
    strstr()
    {
        typeset	needle="$1"
        typeset	found=""
        typeset	word

        shift
        for word in "$@" ; do
            if [ "$word" = "$needle" ] ; then
                found="$needle"
                break
            fi
        done
        echo "$found"
    }
    
    # strdstr unwanted [word...]
    #	Print the words that follow the first argument, leaving out every
    #	word identical to it.  Each word kept is preceded by one blank, so
    #	a non-empty result starts with a space.
    strdstr()
    {
        typeset	unwanted="$1"
        typeset	kept=""
        typeset	word

        shift
        for word in "$@" ; do
            if [ "$word" = "$unwanted" ] ; then
                continue
            fi
            kept="$kept $word"
        done
        echo "$kept"
    }
    
    # merge_continued_lines
    #	Filter (stdin to stdout) that joins backslash-continued lines.
    #	A line whose last whitespace-separated field is a lone "\" has that
    #	field blanked and is held back; the next line that does not end in
    #	"\" is appended to the held text and the result is printed as one
    #	line.  Lines that are not part of a continuation pass through
    #	unchanged.
    merge_continued_lines()
    {
        # The program is given as awk's first operand.  The former "-e"
        # flag is not a POSIX awk option and is rejected by several awk
        # implementations.
        awk '
            BEGIN { held = "" }
            $NF == "\\" {
                $NF = ""
                held = held $0
                next
            }
            {
                print held $0
                held = ""
            }'
    }
    
    # find_meta_devices [word...]
    #	Print the arguments that look like metadevice names, i.e. the
    #	letter "d" followed by one or more digits (d0, d10, ...).  All
    #	other words (options, slices, counts) are dropped.  Each name kept
    #	is preceded by one blank.
    find_meta_devices()
    {
        typeset	devices=""
        typeset	word

    #   decho "find_meta_devices .$*."
        for word in "$@" ; do
            # Portable equivalent of the ksh-only pattern d+([0-9]).
            # Anything with a non-digit after the leading "d" is rejected
            # first, so the second pattern only sees "d" plus digits.
            case $word in
            d*[!0-9]*)	# not purely digits after the "d"
                ;;
            d[0-9]*)	# metadevice name
                devices="$devices $word"
                ;;
            esac
        done
        echo "$devices"
    }
    
    # toplevel
    #	Print the list of top level metadevices: those that are defined in
    #	the "metastat -p" output but never appear as a component of another
    #	metadevice.  Reads the global $setarg to select the set and relies
    #	on merge_continued_lines, find_meta_devices, strstr and strdstr.
    toplevel()
    {
        typeset	comp_meta_devices=""
        typeset	top_meta_devices=""
        typeset	devices=""
        typeset	device=""
        typeset	comp=""
        typeset	line=""

        # The loop and the final echo share one brace group so the result
        # is produced by the same process that built it.  Only ksh runs the
        # last stage of a pipeline in the current shell; echoing after the
        # pipeline would print an empty list in other shells.
        metastat$setarg -p | merge_continued_lines | {
            while read -r line ; do
                # first metadevice on a line is the one being defined,
                # any others are its components
                devices=`find_meta_devices $line`
                set -- $devices
                if [ $# -eq 0 ] ; then
                    continue
                fi
                device=$1
                shift
                # check to see if device already referred to as component
                comp=`strstr $device $comp_meta_devices`
                if [ -z "$comp" ] ; then
                    top_meta_devices="$top_meta_devices $device"
                fi
                # add components to component list, remove from top list
                for comp in "$@" ; do
                    comp_meta_devices="$comp_meta_devices $comp"
                    top_meta_devices=`strdstr $comp $top_meta_devices`
                done
            done > /dev/null 2>&1
            echo $top_meta_devices
        }
    }
    
    #
    # - MAIN
    #
    METAPATH=/usr/sbin
    PATH=/usr/bin:$METAPATH
    USAGE="usage: metacheck [-s setname] [-h] [[-t] [-f [-d datefmt]] \
        [-w who] -m recipient [recipient...]]"

    # option defaults
    datefmt="%D"
    debug="no"
    filter="no"
    mflag="no"
    set="local"
    setarg=""
    testarg="no"
    # default "who" is the user name between the parentheses of id's uid=N(name)
    who=`id | sed -e 's/^uid=[0-9][0-9]*(//' -e 's/).*//'`

    while getopts d:Dfhms:tw: flag
    do
        case $flag in
        d)	datefmt=$OPTARG;
            ;;
        D)	debug="yes"
            ;;
        f)	filter="yes"
            ;;
        h)	# documented help option: usage is not an error
            echo $USAGE
            exit 0
            ;;
        m)	mflag="yes"
            ;;
        s)	set=$OPTARG;
            # the leading blank lets $setarg be glued onto a command name
            if [ "$set" != "local" ] ; then
                setarg=" -s $set";
            fi
            ;;
        t)	testarg="yes";
            ;;
        w)	who=$OPTARG;
            ;;
        \?)	echo $USAGE
            exit 1
            ;;
        esac
    done

    # if mflag specified then everything else part of recipient;
    # without -m no operands are allowed, with -m at least one is required
    shift `expr $OPTIND - 1`
    if [ $mflag = "no" ] ; then
        if [ $# -ne 0 ] ; then
            echo $USAGE
            exit 1
        fi
    else
        if [ $# -eq 0 ] ; then
            echo $USAGE
            exit 1
        fi
    fi
    recipients="$*"

    # The format is quoted so that a -d argument containing blanks
    # (for example '%D %H') reaches date as a single operand.
    curdate_filter=`date "+$datefmt"`
    curdate=`date`
    node=`uname -n`

    # establish files
    # NOTE(review): the /tmp names are predictable (pid based); consider
    # mktemp(1) where it is available.
    msg_f=/tmp/metacheck.msg.$$
    msgs_f=/tmp/metacheck.msgs.$$
    metastat_f=/tmp/metacheck.metastat.$$
    metadb_f=/tmp/metacheck.metadb.$$
    metahs_f=/tmp/metacheck.metahs.$$
    pending_f=/etc/lvm/metacheck.$set.pending
    files="$metastat_f $metadb_f $metahs_f $msg_f $msgs_f"

    rm -f $files > /dev/null 2>&1
    # remove the scratch files if interrupted (HUP, INT, QUIT, TERM)
    trap "rm -f $files > /dev/null 2>&1; exit 1" 1 2 3 15

    # Check to see if metadb is capable of running
    have_metadb="yes"
    metadb$setarg > $metadb_f 2>&1
    if [ $? -ne 0 ] ; then
        have_metadb="no"
    fi
    grep "there are no existing databases" < $metadb_f > /dev/null 2>&1
    if [ $? -eq 0 ] ; then
        have_metadb="no"
    fi
    grep "/dev/md/admin" < $metadb_f > /dev/null 2>&1
    if [ $? -eq 0 ] ; then
        have_metadb="no"
    fi

    #
    # retval accumulates one value per kind of problem and becomes the
    # exit status:
    #	  1	metadb could not be run
    #	 32	a hot spare is in use
    #	 64	a state database replica has an error
    #	128	a metadevice is not Okay/Resyncing
    # Every value fits in the 8 bits of an exit status.  The hot spare
    # code used to be 256, which wraps to 0 when passed to exit and so
    # reported success.
    #

    # check for problems accessing metadbs
    retval=0
    if [ "$have_metadb" = "no" ] ; then
        retval=1
        echo "metacheck: metadb problem, can't run '$METAPATH/metadb$setarg'" \
            >> $msgs_f
    else
        # snapshot the state
        metadb$setarg 2>&1 | sed -e '1d' | merge_continued_lines > $metadb_f
        metastat$setarg 2>&1 | merge_continued_lines > $metastat_f
        metahs$setarg -i 2>&1 | merge_continued_lines > $metahs_f

        #
        # Check replicas for problems, capital letters in the flags
        # indicate an error, fields are separated by tabs.
        # (the shell strips the backslash, so awk receives -Ft)
        #
        problem=`awk < $metadb_f -F\t '{if ($1 ~ /[A-Z]/) print $1;}'`
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 64`
            echo "\
    metacheck: metadb problem, for more detail run:\n\t$METAPATH/metadb$setarg -i" \
                >> $msgs_f
        fi

        #
        # Check the metadevice state
        #
        problem=`awk < $metastat_f \
            '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 128`
            echo "\
    metacheck: metadevice problem, for more detail run:" \
                >> $msgs_f

            # refine the message to toplevel metadevices that have a problem
            top=`toplevel`
            set -- $top
            while [ $# -ne 0 ] ; do
                device=$1
                # $setarg is required here so that the device is looked up
                # in the set named by -s, matching the command suggested
                # in the report below.
                problem=`metastat$setarg $device | awk \
                    '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
                if [ -n "$problem" ] ; then
                    echo "\t$METAPATH/metastat$setarg $device" >> $msgs_f
                    # find out what is mounted on the device
                    mp=`mount|awk '/\/dev\/md\/dsk\/'$device'[ \t]/{print $1;}'`
                    if [ -n "$mp" ] ; then
                        echo "\t\t$mp mounted on $device" >> $msgs_f
                    fi
                fi
                shift
            done
        fi

        #
        # Check the hotspares to see if any have been used.
        #
        problem=""
        grep "no hotspare pools found" < $metahs_f > /dev/null 2>&1
        if [ $? -ne 0 ] ; then
            problem=`awk < $metahs_f \
                '/blocks/ { if ( $2 != "Available" ) print $0;}'`
        fi
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 32`
            echo "\
    metacheck: hot spare in use, for more detail run:\n\t$METAPATH/metahs$setarg -i" \
                >> $msgs_f
        fi
    fi

    # If any errors occurred, then mail the report
    if [ $retval -ne 0 ] ; then
        if [ -n "$recipients" ] ; then
            re=""
            if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
                re="Re: "
                # we have a pending notification, check date to see if we resend
                penddate_filter=`head -1 $pending_f`
                if [ "$curdate_filter" != "$penddate_filter" ] ; then
                    rm -f $pending_f > /dev/null 2>&1
                else
                    if [ "$debug" = "yes" ] ; then
                        echo "metacheck: email problem notification still pending"
                        cat $pending_f
                    fi
                fi
            fi
            # Without -f the pending file must not suppress anything: a
            # file left behind by an earlier filtered run used to silence
            # every later unfiltered notification.
            if [ "$filter" != "yes" ] || [ ! -f $pending_f ] ; then
                if [ "$filter" = "yes" ] ; then
                    echo "$curdate_filter\n\tDate:$curdate\n\tTo:$recipients" \
                        > $pending_f
                fi
                echo "\
    Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate"		>> $msg_f
                echo "\
    --------------------------------------------------------------" >> $msg_f
                # subject is metacheck.who.nodename.setname, the same
                # order as the "resolved" and test messages below
                cat $msg_f $msgs_f | mailx -s \
                    "${re}Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            fi
        else
            cat $msgs_f
        fi
    else
        # no problems detected,
        if [ -n "$recipients" ] ; then
            # default is to not send any mail, or print anything.
            echo "\
    Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate"		>> $msg_f
            echo "\
    --------------------------------------------------------------" >> $msg_f
            if [ -f $pending_f ] && [ "$filter" = "yes" ] ; then
                # pending filter exists, remove it and send OK
                rm -f $pending_f > /dev/null 2>&1
                echo "Problem resolved" >> $msg_f
                cat $msg_f | mailx -s \
                    "Re: Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            elif [ "$testarg" = "yes" ] ; then
                # for testing, send mail every time even though there is no problem
                echo "Messaging test, no problems detected" >> $msg_f
                cat $msg_f | mailx -s \
                    "Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            fi
        else
            echo "metacheck: Okay"
        fi
    fi

    rm -f $files > /dev/null 2>&1
    exit $retval

    For information on invoking scripts by using the cron utility, see the cron(1M) man page.