Package Products :: Package ZenStatus :: Module zenping
[hide private]
[frames] | no frames]

Source Code for Module Products.ZenStatus.zenping

  1  ########################################################################### 
  2  # 
  3  # This program is part of Zenoss Core, an open source monitoring platform. 
  4  # Copyright (C) 2007, Zenoss Inc. 
  5  # 
  6  # This program is free software; you can redistribute it and/or modify it 
  7  # under the terms of the GNU General Public License version 2 as published by 
  8  # the Free Software Foundation. 
  9  # 
 10  # For complete information please visit: http://www.zenoss.com/oss/ 
 11  # 
 12  ########################################################################### 
 13   
 14   
 15  __doc__=''' ZenPing 
 16   
 17  Determines the availability of an IP address using ping. 
 18   
 19  $Id$''' 
 20   
 21  from socket import gethostbyname, getfqdn, gaierror 
 22   
 23  import time 
 24   
 25  import Globals # make zope imports work 
 26   
 27  from Products.ZenStatus.AsyncPing import Ping 
 28  from Products.ZenStatus.TestPing import Ping as TestPing 
 29  from Products.ZenStatus import pingtree 
 30  from Products.ZenUtils.Utils import unused 
 31  unused(pingtree)                        # needed for pb 
 32   
 33  from Products.ZenEvents.ZenEventClasses import Status_Ping, Clear 
 34  from Products.ZenHub.PBDaemon import FakeRemote, PBDaemon 
 35  from Products.ZenUtils.DaemonStats import DaemonStats 
 36  from Products.ZenUtils.Driver import drive, driveLater 
 37   
 38  from twisted.internet import reactor 
 39  from twisted.python import failure 
 40   
class ZenPing(PBDaemon):
    """Daemon that determines the availability of IP addresses using ping.

    The daemon fetches its settings and a ping tree from the 'PingConfig'
    hub service, walks the jobs produced by the tree's pjgen() generator,
    and sends Status_Ping events when a job fails or recovers.
    """

    name = agent = "zenping"
    eventGroup = "Ping"
    initialServices = PBDaemon.initialServices + ['PingConfig']

    # Defaults; copyItems() replaces them with the monitor's properties.
    pingTimeOut = 1.5
    pingTries = 2
    # startOne() is called only while pinger.jobCount() is below this value.
    pingChunk = 75
    # Passed to reactor.callLater(), hence seconds.
    pingCycleInterval = 60
    # Passed to driveLater(); kept in seconds locally (see copyItems()).
    configCycleInterval = 20*60
    # Jobs whose status reached this count are skipped by startOne() unless
    # the daemon was just reconfigured.
    maxPingFailures = 2

    pinger = None
    pingTreeIter = None
    # NOTE(review): the cycle start time is actually stored in self.start by
    # pingCycle(); startTime is not read anywhere in this file.
    startTime = None
    jobs = 0
    reconfigured = True
    # Timestamp of the configuration load in progress, or None when idle.
    loadingConfig = None
    # Timestamp of the last completed configuration load.
    lastConfig = None

    def __init__(self):
        """Set up the daemon, the ping socket and the statistics helper."""
        self.pingtree = None
        PBDaemon.__init__(self, keeproot=True)
        # Without an inherited descriptor we must open the socket ourselves.
        if not self.options.useFileDescriptor:
            self.openPrivilegedPort('--ping')
        self.rrdStats = DaemonStats()
        # Back-date lastConfig so the very first loadConfig() is not delayed
        # by the --minConfigWait check.
        self.lastConfig = time.time() - self.options.minconfigwait
        self.log.info("started")

    def getPinger(self):
        """Create the pinger on first use, otherwise reconfigure it.

        Despite its name this method returns nothing; the pinger is stored
        in self.pinger.  With --test a TestPing is created instead of a
        real Ping.
        """
        if self.pinger:
            self.pinger.reconfigure(self.pingTries, self.pingTimeOut)
            return
        if self.options.test:
            self.pinger = TestPing(self.pingTries, self.pingTimeOut)
            return
        fd = None
        if self.options.useFileDescriptor is not None:
            fd = int(self.options.useFileDescriptor)
        self.pinger = Ping(self.pingTries, self.pingTimeOut, fd)

    def config(self):
        """Return the 'PingConfig' service, or a FakeRemote if it is absent."""
        return self.services.get('PingConfig', FakeRemote())

    def stopOnError(self, error):
        """Errback: log the error, stop the daemon and pass the error on."""
        self.log.exception(error)
        self.stop()
        return error

    def connected(self):
        """Load the configuration, then start the first ping cycle.

        Presumably invoked by PBDaemon once the hub connection is up
        (TODO confirm).  Any failure in the chain stops the daemon.
        """
        self.log.debug("Connected, getting config")
        d = drive(self.loadConfig)
        d.addCallback(self.pingCycle)
        d.addErrback(self.stopOnError)

    def sendPingEvent(self, pj):
        """Send an event based on a ping job to the event backend.

        The job's optional 'eventState' attribute is copied into the event
        when it is set.
        """
        evt = dict(device=pj.hostname,
                   ipAddress=pj.ipaddr,
                   summary=pj.message,
                   severity=pj.severity,
                   eventClass=Status_Ping,
                   eventGroup=self.eventGroup,
                   agent=self.agent,
                   component='',
                   manager=self.options.monitor)
        evstate = getattr(pj, 'eventState', None)
        if evstate is not None:
            evt['eventState'] = evstate
        self.sendEvent(evt)

    def loadConfig(self, driver):
        """Get the configuration for zenping.

        Generator meant to be run through drive()/driveLater(): each yield
        hands over a remote call and driver.next() retrieves its result.
        Nothing is fetched while another load is in progress or when the
        last load finished less than --minConfigWait seconds ago; in the
        latter case a retry is scheduled instead.

        Errors are logged, not raised.  On error the in-progress marker is
        cleared so that later loads are not blocked forever.
        """
        # True once this invocation has claimed the loadingConfig marker.
        ownsLoad = False
        try:
            if self.loadingConfig:
                self.log.warning("Configuration still loading. Started at %s",
                                 time.asctime(
                                     time.localtime(self.loadingConfig)))
                return

            if self.lastConfig:
                configwait = time.time() - self.lastConfig
                delay = self.options.minconfigwait - configwait
                if delay > 0:
                    reactor.callLater(delay, self.remote_updateConfig)
                    self.log.debug("Config recently updated: not fetching")
                    return

            self.loadingConfig = time.time()
            ownsLoad = True

            self.log.info('fetching monitor properties')
            yield self.config().callRemote('propertyItems')
            self.copyItems(driver.next())

            # Schedule the next periodic reload using the fresh interval.
            driveLater(self.configCycleInterval, self.loadConfig)

            self.log.info("fetching default RRDCreateCommand")
            yield self.config().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.config().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.config().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)

            self.log.info("getting ping tree")
            yield self.config().callRemote('getPingTree',
                                           self.options.name,
                                           findIp())
            oldtree, self.pingtree = self.pingtree, driver.next()
            self.clearDeletedDevices(oldtree)

            self.rrdStats.gauge('configTime',
                                self.configCycleInterval,
                                time.time() - self.loadingConfig)
            self.loadingConfig = None
            self.lastConfig = time.time()
        except Exception:
            self.log.exception("Error loading configuration")
            # Without this reset every later call would bail out with
            # "Configuration still loading" after a single failed fetch.
            if ownsLoad:
                self.loadingConfig = None

    def buildOptions(self):
        """Add the zenping specific command line options."""
        PBDaemon.buildOptions(self)
        self.parser.add_option('--name',
                               dest='name',
                               default=getfqdn(),
                               help=("host that roots the ping dependency "
                                     "tree: typically the collecting hosts' "
                                     "name; defaults to our fully qualified "
                                     "domain name (%s)" % getfqdn()))
        self.parser.add_option('--test',
                               dest='test',
                               default=False,
                               action="store_true",
                               help="Run in test mode: doesn't really ping,"
                               " but reads the list of IP Addresses that "
                               " are up from /tmp/testping")
        self.parser.add_option('--useFileDescriptor',
                               dest='useFileDescriptor',
                               default=None,
                               help=
                               "use the given (privileged) file descriptor")
        self.parser.add_option('--minConfigWait',
                               dest='minconfigwait',
                               default=300,
                               type='int',
                               help=
                               "the minimal time, in seconds, "
                               "between refreshes of the config")

    def pingCycle(self, unused=None):
        """Start a new run against the ping job tree.

        When cycling, the next run is scheduled first.  A new iterator over
        the tree is only created if the previous run has finished; the
        argument is ignored so the method can serve as a deferred callback.
        """
        if self.options.cycle:
            reactor.callLater(self.pingCycleInterval, self.pingCycle)

        if self.pingTreeIter is None:
            self.start = time.time()
            self.jobs = 0
            self.pingTreeIter = self.pingtree.pjgen()
        # Fill the pinger up to pingChunk jobs; next() feeds it afterwards.
        while self.pinger.jobCount() < self.pingChunk and self.startOne():
            pass

    def startOne(self):
        """Initiate the next ping job.

        Jobs that already failed maxPingFailures times are skipped unless
        the daemon has just been reconfigured.  Returns True if a ping was
        started and False when there is nothing left to iterate.
        """
        if not self.pingTreeIter:
            return False
        while 1:
            try:
                pj = self.pingTreeIter.next()
            except StopIteration:
                self.pingTreeIter = None
                return False
            if pj.status < self.maxPingFailures or self.reconfigured:
                self.ping(pj)
                return True

    def ping(self, pj):
        """Reset the job, send its packet and hook up the result handlers."""
        self.log.debug("starting %s", pj.ipaddr)
        pj.reset()
        self.pinger.sendPacket(pj)
        pj.deferred.addCallbacks(self.pingSuccess, self.pingFailed)

    def next(self):
        """Account for a finished job and start the next one.

        Ends the cycle once the pinger reports no remaining jobs.
        """
        self.jobs += 1
        self.startOne()
        if self.pinger.jobCount() == 0:
            self.endCycle()

    def endCycle(self, *unused):
        """Log the end of a run; stop the reactor unless we are cycling."""
        runtime = time.time() - self.start
        self.log.info("Finished pinging %d jobs in %.2f seconds",
                      self.jobs, runtime)
        self.reconfigured = False
        if not self.options.cycle:
            reactor.stop()
        else:
            self.heartbeat()

    def heartbeat(self):
        """Send a heartbeat event for this monitor.

        Also records the 'cycleTime' and 'devices' statistics and sends
        whatever events those gauges return.
        """
        PBDaemon.heartbeat(self)
        events = (self.rrdStats.gauge('cycleTime',
                                      self.pingCycleInterval,
                                      time.time() - self.start) +
                  self.rrdStats.gauge('devices',
                                      self.pingCycleInterval,
                                      self.jobs))
        for ev in events:
            self.sendEvent(ev)

    def pingSuccess(self, pj):
        """Callback for a good ping response.

        A severity 0 event is sent only if the job had failed more than
        once, i.e. if a failure event had been sent for it.
        """
        pj.deferred = None
        if pj.status > 1:
            pj.severity = 0
            self.sendPingEvent(pj)
        self.log.debug("Success %s", pj.ipaddr)
        pj.status = 0
        self.next()

    def pingFailed(self, err):
        """Errback for a bad (no) ping response.

        Delegates to doPingFailed() and logs, rather than propagates, any
        exception raised while handling the failure.
        """
        try:
            self.doPingFailed(err)
        except Exception:
            import traceback
            self.log.error("Exception: %s", traceback.format_exc())

    def doPingFailed(self, err):
        """Handle a failed ping; err.value is the ping job that failed.

        The first failure only triggers a re-ping (of the job, and of its
        router when the path is not clear).  Later failures send the event,
        marked as suppressed when a node on the path is down.
        """
        pj = err.value
        pj.deferred = None
        pj.status += 1
        self.log.debug("Failed %s %s", pj.ipaddr, pj.status)
        if pj.status == 1:
            self.log.debug("first failure '%s'", pj.hostname)
            # if our path back is currently clear add our parent
            # to the ping list again to see if path is really clear
            if not pj.checkpath():
                routerpj = pj.routerpj()
                if routerpj:
                    self.ping(routerpj)
            # We must now re-run this ping job to actually generate a ping
            # down event. If there is a problem in the path, it will be
            # suppressed.
            self.ping(pj)
        else:
            # walk up the ping tree and find router node with failure
            failname = pj.checkpath()
            if failname:
                pj.eventState = 2 # suppressed FIXME
                pj.message += (", failed at %s" % failname)
            self.log.warning(pj.message)
            self.sendPingEvent(pj)
            # Children are deliberately not marked down here: doing so
            # would make suppressed ping events show up twice, and the
            # marking never took effect anyway because of the status check
            # made while iterating.

        self.next()

    def remote_setPropertyItems(self, items):
        """The config has changed, maybe the device list is different."""
        self.copyItems(items)
        self.remote_updateConfig()

    def remote_updateConfig(self):
        """Reload the configuration asynchronously.

        If the reload ends in a Failure, the in-progress marker is cleared
        and another attempt is scheduled.
        """
        self.log.debug("Asynch update config")
        d = drive(self.loadConfig)

        def logResults(v):
            if isinstance(v, failure.Failure):
                self.log.error("Unable to reload config for async update")

                # Reset loadingConfig so we don't get stuck in a mode where
                # all asynchronous updates are blocked.
                self.loadingConfig = None

                # Try loading the config again in 30 seconds to give zenhub
                # time to restart.
                driveLater(30, self.loadConfig)

        d.addBoth(logResults)

    def copyItems(self, items):
        """Copy the known settings from items onto this daemon.

        items is anything dict() accepts.  Settings that are missing keep
        their current value.  Afterwards the daemon is flagged as
        reconfigured and the pinger is (re)configured.
        """
        items = dict(items)
        for att in ("pingTimeOut",
                    "pingTries",
                    "pingChunk",
                    "pingCycleInterval",
                    "maxPingFailures",
                    ):
            setattr(self, att, items.get(att, getattr(self, att)))
        # The supplied interval is scaled by 60 (presumably minutes to
        # seconds - TODO confirm).  Only scale a freshly supplied value:
        # scaling the stored one would multiply it again on every call.
        if "configCycleInterval" in items:
            self.configCycleInterval = items["configCycleInterval"] * 60
        else:
            self.configCycleInterval = self.configCycleInterval
        self.reconfigured = True
        self.getPinger()

    def clearDevice(self, device):
        """Send a Clear ping-status event for the given device name."""
        self.sendEvent(dict(device=device,
                            eventClass=Status_Ping,
                            summary="No longer testing device",
                            severity=Clear))

    def clearDeletedDevices(self, oldtree):
        """Send clears for any device we stop pinging.

        Only devices that had a non-zero status in oldtree and are missing
        from the current tree are cleared.
        """
        down = set()
        if oldtree:
            down = set(pj.hostname for pj in oldtree.pjgen() if pj.status)
        current = set(pj.hostname for pj in self.pingtree.pjgen())
        for device in down - current:
            self.clearDevice(device)

    def remote_deleteDevice(self, device):
        """Clear the events of a deleted device and reload the config."""
        self.log.debug("Asynch delete device %s", device)
        self.clearDevice(device)
        self.remote_updateConfig()
385 386
def findIp():
    """Return an IPv4 address for this host as a dotted-quad string.

    The address the fully qualified host name resolves to is preferred.
    If that lookup fails with gaierror, the output of 'ifconfig -a' is
    searched for the first non-loopback address.  '127.0.0.1' is returned
    when no ifconfig binary exists or no such address is found.
    """
    try:
        return gethostbyname(getfqdn())
    except gaierror:
        pass

    # find the first non-loopback interface address
    import os
    ifconfigs = ['/sbin/ifconfig',
                 '/usr/sbin/ifconfig',
                 '/usr/bin/ifconfig',
                 '/bin/ifconfig']
    installed = [path for path in ifconfigs if os.path.exists(path)]
    if not installed:
        # No ifconfig on this system: nothing to parse.
        return '127.0.0.1'
    fp = os.popen(installed[0] + ' -a')
    try:
        output = fp.read()
    finally:
        fp.close()
    for address in _ifconfigAddresses(output):
        # Skip every loopback entry, not just the first one.
        if not address.startswith('127.'):
            return address
    return '127.0.0.1'


def _ifconfigAddresses(output):
    """Return the first IPv4 address of each blank-line separated stanza."""
    import re
    digits = r'[0-9]{1,3}'
    pat = r'(addr:|inet) *(%s\.%s\.%s\.%s)[^0-9]' % ((digits,)*4)
    parse = re.compile(pat)
    addresses = []
    for stanza in output.split('\n\n'):
        found = parse.search(stanza)
        if found:
            addresses.append(found.group(2))
    return addresses
if __name__=='__main__':
    # Script entry point: build the daemon, set the 'zen.Events' logger to
    # INFO (numeric level 20) and hand control to the daemon's run loop.
    import logging
    pm = ZenPing()
    eventLog = logging.getLogger('zen.Events')
    eventLog.setLevel(logging.INFO)
    pm.run()