Package ZenStatus :: Module zenping
[hide private]
[frames | no frames]

Source Code for Module ZenStatus.zenping

  1  ########################################################################### 
  2  # 
  3  # This program is part of Zenoss Core, an open source monitoring platform. 
  4  # Copyright (C) 2007, Zenoss Inc. 
  5  # 
  6  # This program is free software; you can redistribute it and/or modify it 
  7  # under the terms of the GNU General Public License version 2 as published by 
  8  # the Free Software Foundation. 
  9  # 
 10  # For complete information please visit: http://www.zenoss.com/oss/ 
 11  # 
 12  ########################################################################### 
 13   
 14   
 15  __doc__=''' ZenPing 
 16   
 17  Determines the availability of an IP address using ping. 
 18   
 19  $Id$''' 
 20   
 21  __version__ = "$Revision$"[11:-2] 
 22   
 23  from socket import gethostbyname, getfqdn, gaierror 
 24   
 25  import time 
 26  import sys 
 27   
 28  import Globals # make zope imports work 
 29   
 30  from AsyncPing import Ping 
 31  from TestPing import Ping as TestPing 
 32  import pingtree 
 33   
 34   
 35  from Products.ZenEvents.ZenEventClasses import App_Start, App_Stop 
 36  from Products.ZenEvents.ZenEventClasses import Status_Ping 
 37  from Products.ZenEvents.Event import Event, EventHeartbeat 
 38  from Products.ZenUtils.ZCmdBase import ZCmdBase 
 39   
 40  from twisted.internet import reactor, defer 
 41   
class ZenPing(ZCmdBase):
    """Ping daemon: walks a tree of ping jobs and reports results as events.

    Configuration is read from the monitor object found at
    ``options.configpath`` in the dmd.  Ping jobs are taken from
    ``self.pingtree`` and handed to ``self.pinger``; successes and
    failures come back through the jobs' deferreds and are turned into
    ``Status_Ping`` events.
    """

    agent = "ZenPing"
    eventGroup = "Ping"

    # Defaults; the six values listed in loadConfig() are overwritten
    # from the monitor configuration object on every (re)load.
    pathcheckthresh = 10            # not referenced by this class
    timeOut = 1.5                   # handed to the pinger constructor
    tries = 2                       # handed to the pinger constructor
    chunk = 75                      # max jobs in flight started by pingCycle
    cycleInterval = 60              # delay between pingCycle() runs
    configCycleInterval = 20*60     # delay between loadConfig() runs
    maxFailures = 2                 # jobs at/over this status are skipped
                                    # unless self.reconfigured is set
    pinger = None                   # created in start()
    pingTreeIter = None             # iterator over the current cycle's jobs
    startTime = None                # time.time() when the cycle began
    jobs = 0                        # jobs completed in the current cycle
    reconfigured = True             # force pinging every job this cycle

    def __init__(self):
        """Connect to the dmd, remember our names and announce startup."""
        ZCmdBase.__init__(self, keeproot=True)
        self.hostname = getfqdn()
        self.configpath = self.options.configpath
        # getObjByPath is given a path without the leading slash
        if self.configpath.startswith("/"):
            self.configpath = self.configpath[1:]

        self.zem = self.dmd.ZenEventManager
        self.sendEvent(Event(device=getfqdn(),
                             eventClass=App_Start,
                             summary="zenping started",
                             severity=0,
                             component="zenping"))
        self.log.info("started")

    def sendEvent(self, evt):
        """Wrapper for sending an event to the event manager."""
        self.zem.sendEvent(evt)

    def sendPingEvent(self, pj):
        """Send an event based on a ping job to the event backend.

        If the job carries an ``eventState`` attribute it is copied onto
        the event (used by this class to mark events as suppressed).
        """
        evt = Event(device=pj.hostname,
                    ipAddress=pj.ipaddr,
                    summary=pj.message,
                    severity=pj.severity,
                    eventClass=Status_Ping,
                    eventGroup=self.eventGroup,
                    agent=self.agent,
                    component='',
                    manager=self.hostname)
        evstate = getattr(pj, 'eventState', None)
        if evstate is not None:
            evt.eventState = evstate
        self.sendEvent(evt)

    def loadConfig(self):
        """Load settings and the device list, and rebuild the ping tree.

        Reschedules itself to run again after ``configCycleInterval``.
        """
        self.dmd._p_jar.sync()
        smc = self.dmd.getObjByPath(self.configpath)
        for att in ("timeOut", "tries", "chunk",
                    "cycleInterval", "configCycleInterval",
                    "maxFailures",):
            setattr(self, att, getattr(smc, att))
        # The freshly loaded value is scaled by 60 before being used as
        # a callLater delay (presumably it is configured in minutes).
        self.configCycleInterval *= 60
        self.reconfigured = True

        reactor.callLater(self.configCycleInterval, self.loadConfig)

        # Locate our own device record: try --name first and fall back
        # to the fully qualified hostname when that lookup fails.
        me = None
        if self.options.name:
            me = self.dmd.Devices.findDevice(self.options.name)
            if not me:
                self.log.info("device %s not found trying %s",
                              self.options.name, self.hostname)
        if not me:
            me = self.dmd.Devices.findDevice(self.hostname)
        if me:
            self.log.info("building pingtree from %s", me.id)
            self.pingtree = pingtree.buildTree(me)
        else:
            self.log.critical("ZenPing '%s' not found, "
                              "ignoring network topology.", self.hostname)
            self.pingtree = pingtree.Rnode(findIp(), self.hostname, 0)
        devices = smc.getPingDevices()
        self.prepDevices(devices)

    def prepDevices(self, devices):
        """Add every device not already in the ping tree to it."""
        for device in devices:
            if not self.pingtree.hasDev(device):
                self.pingtree.addDevice(device)
        self.reconfigured = True

    def buildOptions(self):
        """Add the zenping command line options to the parser."""
        ZCmdBase.buildOptions(self)
        self.parser.add_option('--configpath',
                               dest='configpath',
                               default="Monitors/StatusMonitors/localhost",
                               help="path to our monitor config ie: "
                               "/Monitors/StatusMonitors/localhost")
        self.parser.add_option('--name',
                               dest='name',
                               help=("name to use when looking up our "
                                     "record in the dmd "
                                     "defaults to our fqdn as returned "
                                     "by getfqdn"))
        self.parser.add_option('--test',
                               dest='test',
                               default=False,
                               action="store_true",
                               help="Run in test mode: doesn't really ping,"
                               " but reads the list of IP Addresses that "
                               " are up from /tmp/testping")
        self.parser.add_option('--useFileDescriptor',
                               dest='useFileDescriptor',
                               default=None,
                               help="use the given (privileged) file descriptor")

    def pingCycle(self):
        """Start a new run against the ping job tree.

        Reschedules itself after ``cycleInterval``.  A new pass over the
        tree is only begun when the previous one has finished
        (``pingTreeIter`` is None); otherwise the in-flight job count is
        simply topped up to ``chunk``.
        """
        reactor.callLater(self.cycleInterval, self.pingCycle)

        if self.pingTreeIter is None:
            # Kept in startTime so the start() method is not shadowed
            # by an instance attribute of the same name.
            self.startTime = time.time()
            self.jobs = 0
            self.pingTreeIter = self.pingtree.pjgen()
        while self.pinger.jobCount() < self.chunk and self.startOne():
            pass

    def startOne(self):
        """Initiate the next eligible ping job.

        Returns True when a job was started, False when there is no
        active iterator or it has been exhausted (in which case
        ``pingTreeIter`` is reset to None).
        """
        if not self.pingTreeIter:
            return False
        try:
            while True:
                pj = self.pingTreeIter.next()
                # Jobs that already failed maxFailures times are skipped
                # unless a reconfiguration forces a full pass.
                if pj.status < self.maxFailures or self.reconfigured:
                    self.ping(pj)
                    return True
        except StopIteration:
            self.pingTreeIter = None
            return False

    def ping(self, pj):
        """Reset the job, send its packet and hook up the callbacks."""
        self.log.debug("starting %s", pj.ipaddr)
        pj.reset()
        self.pinger.sendPacket(pj)
        pj.deferred.addCallbacks(self.pingSuccess, self.pingFailed)

    def next(self):
        """Count a finished job, start another one, and end the cycle
        once the pinger has no jobs left."""
        self.jobs += 1
        self.startOne()
        if self.pinger.jobCount() == 0:
            self.endCycle()

    def endCycle(self, *unused):
        """Log the cycle statistics, then stop the reactor or heartbeat.

        The reactor is stopped when ``options.cycle`` is false; otherwise
        a heartbeat event is sent.
        """
        runtime = time.time() - self.startTime
        self.log.info("Finished pinging %d jobs in %.2f seconds",
                      self.jobs, runtime)
        self.reconfigured = False
        if not self.options.cycle:
            reactor.stop()
        else:
            self.sendHeartbeat()

    def sendHeartbeat(self):
        """Send a heartbeat event for this monitor.

        The heartbeat timeout is three times ``cycleInterval``.
        """
        timeout = self.cycleInterval*3
        evt = EventHeartbeat(getfqdn(), "zenping", timeout)
        self.sendEvent(evt)

    def pingSuccess(self, pj):
        """Callback for a good ping response.

        Resets the job's failure count; when the previous count was
        above one, a severity 0 event is sent for the job.
        """
        pj.deferred = None
        status = pj.status
        pj.status = 0
        if status > 1:
            pj.severity = 0
            self.sendPingEvent(pj)
        self.log.debug("Success %s", pj.ipaddr)
        self.next()

    def pingFailed(self, err):
        """Errback for a failed ping.

        Delegates to doPingFailed() and logs, rather than propagates,
        any exception raised while handling the failure.
        """
        try:
            self.doPingFailed(err)
        except Exception:
            import traceback
            self.log.error("Exception: %s", traceback.format_exc())

    def doPingFailed(self, err):
        """Handle a bad (no) ping response.

        ``err.value`` is the ping job that failed.
        """
        pj = err.value
        pj.deferred = None
        pj.status += 1
        self.log.debug("Failed %s %s", pj.ipaddr, pj.status)
        if pj.status == 1:
            self.log.debug("first failure '%s'", pj.hostname)
            # if our path back is currently clear add our parent
            # to the ping list again to see if path is really clear
            # and then re-ping ourself.
            if not pj.checkpath():
                routerpj = pj.routerpj()
                if routerpj:
                    self.ping(routerpj)
                self.ping(pj)
        else:
            failname = pj.checkpath()
            if failname:
                pj.eventState = 2 # suppressed FIXME
                pj.message += (", failed at %s" % failname)
            self.log.warn(pj.message)
            self.sendPingEvent(pj)
            self.markChildrenDown(pj)

        self.next()

    def sigTerm(self, *unused):
        """Controlled shutdown of the main loop on interrupt."""
        try:
            ZCmdBase.sigTerm(self, *unused)
        except SystemExit:
            # stop the reactor instead of exiting from inside the handler
            reactor.stop()

    def start(self):
        """Load the configuration, create the pinger, start the first cycle."""
        self.loadConfig()
        if self.options.test:
            self.pinger = TestPing(self.tries, self.timeOut)
        else:
            fd = None
            if self.options.useFileDescriptor is not None:
                fd = int(self.options.useFileDescriptor)
            self.pinger = Ping(self.tries, self.timeOut, fd)

        self.pingCycle()

    def markChildrenDown(self, pj):
        """If this is a router PingJob, mark all Nodes
        away from the ping monitor as down"""

        # unfortunately there's no mapping from pj to router, so find it
        # with a depth-first search of the ping tree
        def findNode(node):
            if node.pj == pj:
                return node
            for child in node.children:
                found = findNode(child)
                if found is not None:
                    return found
            return None

        router = findNode(self.pingtree)
        if router is None:
            return
        children = router.pjgen()
        children.next()         # skip self
        for childpj in children:
            childpj.eventState = 2  # suppress
            self.sendPingEvent(childpj)


def findIp():
    """Return an IP address for this host as a dotted-quad string.

    The fully qualified hostname is resolved first.  If that raises
    gaierror, the output of ``ifconfig -a`` is scanned and the first
    address found on an interface that is not a loopback (127.x)
    address is returned.  Falls back to '127.0.0.1' when no ifconfig
    binary exists or no other address is found.
    """
    try:
        return gethostbyname(getfqdn())
    except gaierror:
        pass

    # find the first non-loopback interface address
    import os
    import re
    ifconfig = None
    for candidate in ('/sbin/ifconfig',
                      '/usr/sbin/ifconfig',
                      '/usr/bin/ifconfig',
                      '/bin/ifconfig'):
        if os.path.exists(candidate):
            ifconfig = candidate
            break
    if ifconfig is None:
        # nothing to query; behave as if no address was found
        return '127.0.0.1'

    fp = os.popen(ifconfig + ' -a')
    try:
        output = fp.read()
    finally:
        fp.close()

    digits = r'[0-9]{1,3}'
    pat = r'(addr:|inet) *(%s\.%s\.%s\.%s)[^0-9]' % ((digits,)*4)
    parse = re.compile(pat)
    # blank lines are used as the separator between interface sections;
    # only the first address of each section is considered
    for section in output.split('\n\n'):
        addr = parse.search(section)
        if addr and not addr.group(2).startswith('127.'):
            return addr.group(2)
    return '127.0.0.1'

if __name__=='__main__':
    # Script entry point: run the daemon until the reactor stops, then
    # report the shutdown as an App_Stop event.
    import logging
    if sys.platform == 'win32':
        # use time.clock as the time source on Windows
        time.time = time.clock
    pm = ZenPing()
    pm.start()
    eventsLog = logging.getLogger('zen.Events')
    eventsLog.setLevel(logging.INFO)
    # no Twisted signal handlers are installed here
    reactor.run(installSignalHandlers=False)
    pm.log.info("stopping...")
    stopEvent = Event(device=getfqdn(),
                      eventClass=App_Stop,
                      summary="zenping stopped",
                      severity=4,
                      component="zenping")
    pm.sendEvent(stopEvent)
    pm.log.info("stopped")