Package ZenStatus :: Module zenping
[hide private]
[frames] | no frames]

Source Code for Module ZenStatus.zenping

  1  ########################################################################### 
  2  # 
  3  # This program is part of Zenoss Core, an open source monitoring platform. 
  4  # Copyright (C) 2007, Zenoss Inc. 
  5  # 
  6  # This program is free software; you can redistribute it and/or modify it 
  7  # under the terms of the GNU General Public License version 2 as published by 
  8  # the Free Software Foundation. 
  9  # 
 10  # For complete information please visit: http://www.zenoss.com/oss/ 
 11  # 
 12  ########################################################################### 
 13   
 14   
 15  __doc__=''' ZenPing 
 16   
 17  Determines the availability of an IP address using ping. 
 18   
 19  $Id$''' 
 20   
 21  from socket import gethostbyname, getfqdn, gaierror 
 22   
 23  import time 
 24   
 25  import Globals # make zope imports work 
 26   
 27  from Products.ZenStatus.AsyncPing import Ping 
 28  from Products.ZenStatus.TestPing import Ping as TestPing 
 29  from Products.ZenStatus import pingtree 
 30  from Products.ZenUtils.Utils import unused 
 31  unused(pingtree)                        # needed for pb 
 32   
 33  from Products.ZenEvents.ZenEventClasses import Status_Ping, Clear 
 34  from Products.ZenHub.PBDaemon import FakeRemote, PBDaemon 
 35  from Products.ZenUtils.DaemonStats import DaemonStats 
 36  from Products.ZenUtils.Driver import drive, driveLater 
 37   
 38  from twisted.internet import reactor 
 39  from twisted.python import failure 
 40   
class ZenPing(PBDaemon):
    """Daemon that pings the jobs of a ping tree and reports the results.

    The ping tree and the tunable properties are fetched from the
    'PingConfig' hub service.  Each ping job is sent through self.pinger
    and, depending on the result, a Status_Ping event is sent.
    """

    name = agent = "zenping"
    eventGroup = "Ping"
    initialServices = PBDaemon.initialServices + ['PingConfig']

    # Tunables; copyItems() overwrites them with values sent by the hub.
    pingTimeOut = 1.5
    pingTries = 2
    pingChunk = 75                  # upper bound on jobs in flight (pingCycle)
    pingCycleInterval = 60          # passed to reactor.callLater
    configCycleInterval = 20*60     # passed to driveLater
    maxPingFailures = 2

    # Run-time state.
    pinger = None
    pingTreeIter = None             # iterator over the current cycle's jobs
    startTime = None
    jobs = 0                        # jobs completed in the current cycle
    reconfigured = True             # True until a full cycle has finished
    loadingConfig = None            # start time of an in-progress config load
    lastConfig = None               # completion time of the last config load

    def __init__(self):
        """Start the daemon, open the ping socket and create the pinger."""
        self.pingtree = None
        PBDaemon.__init__(self, keeproot=True)
        if not self.options.useFileDescriptor:
            self.openPrivilegedPort('--ping')
        self.rrdStats = DaemonStats()
        if self.options.test:
            self.pinger = TestPing(self.pingTries, self.pingTimeOut)
        else:
            fd = None
            if self.options.useFileDescriptor is not None:
                fd = int(self.options.useFileDescriptor)
            self.pinger = Ping(self.pingTries, self.pingTimeOut, fd)
        # Back-date lastConfig so the first loadConfig is not postponed by
        # the minconfigwait check.
        self.lastConfig = time.time() - self.options.minconfigwait
        self.log.info("started")

    def config(self):
        """Return the 'PingConfig' service, or a FakeRemote if it is absent."""
        return self.services.get('PingConfig', FakeRemote())

    def stopOnError(self, error):
        """Errback: log the error, stop the daemon and pass the error on."""
        self.log.exception(error)
        self.stop()
        return error

    def connected(self):
        """Load the configuration, then start the first ping cycle."""
        self.log.debug("Connected, getting config")
        d = drive(self.loadConfig)
        d.addCallback(self.pingCycle)
        d.addErrback(self.stopOnError)

    def sendPingEvent(self, pj):
        """Send an event based on a ping job to the event backend.

        The event carries an 'eventState' only when the job has a non-None
        eventState attribute.
        """
        evt = dict(device=pj.hostname,
                   ipAddress=pj.ipaddr,
                   summary=pj.message,
                   severity=pj.severity,
                   eventClass=Status_Ping,
                   eventGroup=self.eventGroup,
                   agent=self.agent,
                   component='',
                   manager=self.options.monitor)
        evstate = getattr(pj, 'eventState', None)
        if evstate is not None:
            evt['eventState'] = evstate
        self.sendEvent(evt)

    def loadConfig(self, driver):
        """Get the configuration for zenping.

        Generator meant to be run through drive()/driveLater(): each yield
        hands a remote call to the driver and driver.next() returns its
        result.  The load is skipped when another one is still in progress
        and postponed when the last one finished less than minconfigwait
        seconds ago.  Errors are logged, not raised.
        """
        started = False
        try:
            if self.loadingConfig:
                self.log.warning("Configuration still loading. Started at %s" %
                                 time.asctime(time.localtime(self.loadingConfig)))
                return

            if self.lastConfig:
                configwait = time.time() - self.lastConfig
                delay = self.options.minconfigwait - configwait
                if delay > 0:
                    reactor.callLater(delay, self.remote_updateConfig)
                    self.log.debug("Config recently updated: not fetching")
                    return

            self.loadingConfig = time.time()
            started = True

            self.log.info('fetching monitor properties')
            yield self.config().callRemote('propertyItems')
            self.copyItems(driver.next())

            driveLater(self.configCycleInterval, self.loadConfig)

            self.log.info("fetching default RRDCreateCommand")
            yield self.config().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.config().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.config().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)

            self.log.info("getting ping tree")
            yield self.config().callRemote('getPingTree',
                                           self.options.name,
                                           findIp())
            oldtree, self.pingtree = self.pingtree, driver.next()
            self.clearDeletedDevices(oldtree)

            self.rrdStats.gauge('configTime',
                                self.configCycleInterval,
                                time.time() - self.loadingConfig)
            self.loadingConfig = None
            self.lastConfig = time.time()
        except Exception:
            # Release the "load in progress" marker set by this run;
            # otherwise every later reload would bail out at the
            # "still loading" check above.
            if started:
                self.loadingConfig = None
            self.log.exception("Unable to load zenping configuration")

    def buildOptions(self):
        """Add the zenping command line options to the PBDaemon ones."""
        PBDaemon.buildOptions(self)
        fqdn = getfqdn()
        self.parser.add_option('--name',
                               dest='name',
                               default=fqdn,
                               help=("host that roots the ping dependency "
                                     "tree: typically the collecting hosts' "
                                     "name; defaults to our fully qualified "
                                     "domain name (%s)" % fqdn))
        self.parser.add_option('--test',
                               dest='test',
                               default=False,
                               action="store_true",
                               help="Run in test mode: doesn't really ping,"
                               " but reads the list of IP Addresses that "
                               " are up from /tmp/testping")
        self.parser.add_option('--useFileDescriptor',
                               dest='useFileDescriptor',
                               default=None,
                               help=
                               "use the given (privileged) file descriptor")
        self.parser.add_option('--minConfigWait',
                               dest='minconfigwait',
                               default=300,
                               type='int',
                               help=
                               "the minimal time, in seconds, "
                               "between refreshes of the config")

    def pingCycle(self, unused=None):
        """Start a new run against the ping job tree.

        The argument is ignored; it lets the method be used as a deferred
        callback.  When the previous run has not finished, its iterator is
        kept and only more jobs are started.
        """
        if self.options.cycle:
            reactor.callLater(self.pingCycleInterval, self.pingCycle)

        if self.pingTreeIter is None:
            self.start = time.time()
            self.jobs = 0
            self.pingTreeIter = self.pingtree.pjgen()
        # Fill the pinger up to pingChunk jobs in flight.
        while self.pinger.jobCount() < self.pingChunk and self.startOne():
            pass

    def startOne(self):
        """Initiate the next ping job.

        Jobs whose status has reached maxPingFailures are skipped unless
        the daemon was reconfigured since the last completed cycle.
        Returns True if a job was started, False when there is no iterator
        or it is exhausted.
        """
        if not self.pingTreeIter:
            return False
        while 1:
            try:
                pj = self.pingTreeIter.next()
                if pj.status < self.maxPingFailures or self.reconfigured:
                    self.ping(pj)
                    return True
            except StopIteration:
                self.pingTreeIter = None
                return False

    def ping(self, pj):
        """Perform a ping: reset the job, send it and hook up the callbacks."""
        self.log.debug("starting %s", pj.ipaddr)
        pj.reset()
        self.pinger.sendPacket(pj)
        pj.deferred.addCallbacks(self.pingSuccess, self.pingFailed)

    def next(self):
        """Count a finished job, start another and end the cycle when idle."""
        self.jobs += 1
        self.startOne()
        if self.pinger.jobCount() == 0:
            self.endCycle()

    def endCycle(self, *unused):
        """Note the end of the ping list with a successful status message.

        Stops the reactor when not cycling, otherwise sends a heartbeat.
        """
        runtime = time.time() - self.start
        self.log.info("Finished pinging %d jobs in %.2f seconds",
                      self.jobs, runtime)
        self.reconfigured = False
        if not self.options.cycle:
            reactor.stop()
        else:
            self.heartbeat()

    def heartbeat(self):
        """Send a heartbeat event for this monitor plus the cycle statistics."""
        PBDaemon.heartbeat(self)
        for ev in (self.rrdStats.gauge('cycleTime',
                                       self.pingCycleInterval,
                                       time.time() - self.start) +
                   self.rrdStats.gauge('devices',
                                       self.pingCycleInterval,
                                       self.jobs)):
            self.sendEvent(ev)

    def pingSuccess(self, pj):
        """Callback for a good ping response.

        A clearing event (severity 0) is sent only when the job had failed
        more than once before.
        """
        pj.deferred = None
        if pj.status > 1:
            pj.severity = 0
            self.sendPingEvent(pj)
        self.log.debug("Success %s", pj.ipaddr)
        pj.status = 0
        self.next()

    def pingFailed(self, err):
        """Errback for a failed ping; logs any error raised while handling it."""
        try:
            self.doPingFailed(err)
        except Exception:
            import traceback
            self.log.error("Exception: %s", traceback.format_exc())

    def doPingFailed(self, err):
        """Callback for a bad (no) ping response.

        err.value is the ping job that failed.
        """
        pj = err.value
        pj.deferred = None
        pj.status += 1
        self.log.debug("Failed %s %s", pj.ipaddr, pj.status)
        if pj.status == 1:
            self.log.debug("first failure '%s'", pj.hostname)
            # if our path back is currently clear add our parent
            # to the ping list again to see if path is really clear
            # and then re-ping ourself.
            if not pj.checkpath():
                routerpj = pj.routerpj()
                if routerpj:
                    self.ping(routerpj)
                self.ping(pj)
        else:
            failname = pj.checkpath()
            # walk up the ping tree and find router node with failure
            if failname:
                pj.eventState = 2 # suppressed FIXME
                pj.message += (", failed at %s" % failname)
            self.log.warn(pj.message)
            self.sendPingEvent(pj)
            # not needed since it will cause suppressed ping events
            # to show up twice, once from if failname: sections
            # and second from markChildrenDown
            # the "marking" of children never took place anyway
            # due to iterator status check
            # self.markChildrenDown(pj)

        self.next()

    def remote_setPropertyItems(self, items):
        """The config has changed, maybe the device list is different."""
        self.copyItems(items)
        self.remote_updateConfig()

    def remote_updateConfig(self):
        """Reload the configuration; a failure is logged and dropped."""
        self.log.debug("Asynch update config")
        d = drive(self.loadConfig)
        def logResults(v):
            if isinstance(v, failure.Failure):
                self.log.error("Unable to reload config for async update")
        d.addBoth(logResults)

    def copyItems(self, items):
        """Copy the known tunables from items onto this daemon.

        items is anything dict() accepts.  Names missing from items keep
        their current value.  A supplied configCycleInterval is multiplied
        by 60 before it is stored (presumably minutes to seconds -- confirm
        against the hub).
        """
        items = dict(items)
        for att in ("pingTimeOut",
                    "pingTries",
                    "pingChunk",
                    "pingCycleInterval",
                    "configCycleInterval",
                    "maxPingFailures",
                    ):
            if att in items:
                setattr(self, att, items[att])
        # Scale only a freshly supplied value; scaling unconditionally would
        # multiply the already stored value again on every call.
        if "configCycleInterval" in items:
            self.configCycleInterval *= 60
        self.reconfigured = True

    def clearDevice(self, device):
        """Send a Clear severity Status_Ping event for device."""
        self.sendEvent(dict(device=device,
                            eventClass=Status_Ping,
                            summary="No longer testing device",
                            severity=Clear))

    def clearDeletedDevices(self, oldtree):
        """Send clears for any device we stop pinging.

        Only devices with a non-zero status in oldtree that are missing
        from the current ping tree are cleared.
        """
        down = set()
        if oldtree:
            down = set([pj.hostname for pj in oldtree.pjgen() if pj.status])
        current = set([pj.hostname for pj in self.pingtree.pjgen()])
        for device in down - current:
            self.clearDevice(device)

    def remote_deleteDevice(self, device):
        """Clear the events of a deleted device and reload the config."""
        self.log.debug("Asynch delete device %s" % device)
        self.clearDevice(device)
        self.remote_updateConfig()
368 369
def findIp():
    """Return an IPv4 address of this host as a dotted-quad string.

    The address the fully qualified host name resolves to is preferred.
    When that lookup raises gaierror, the output of ``ifconfig -a`` is
    scanned and the first address other than 127.0.0.1 is returned.
    '127.0.0.1' is returned when no ifconfig binary is found at the known
    locations or when it reports no other address.
    """
    try:
        return gethostbyname(getfqdn())
    except gaierror:
        pass

    # find the first non-loopback interface address
    import os
    import re
    ifconfigs = ['/sbin/ifconfig',
                 '/usr/sbin/ifconfig',
                 '/usr/bin/ifconfig',
                 '/bin/ifconfig']
    available = [path for path in ifconfigs if os.path.exists(path)]
    if not available:
        # nothing to ask: fall back to loopback instead of failing
        return '127.0.0.1'

    fp = os.popen(available[0] + ' -a')
    try:
        # interface stanzas are separated by blank lines
        config = fp.read().split('\n\n')
    finally:
        fp.close()

    digits = r'[0-9]{1,3}'
    pat = r'(addr:|inet) *(%s\.%s\.%s\.%s)[^0-9]' % ((digits,)*4)
    parse = re.compile(pat)
    for stanza in config:
        found = parse.search(stanza)
        if found and found.group(2) != '127.0.0.1':
            return found.group(2)
    return '127.0.0.1'
# Script entry point: build the daemon, set the 'zen.Events' logger to
# INFO and run.
if __name__ == '__main__':
    daemon = ZenPing()
    import logging
    events_log = logging.getLogger('zen.Events')
    events_log.setLevel(logging.INFO)
    daemon.run()