Package ZenStatus :: Module zenping
[hide private]
[frames | no frames]

Source Code for Module ZenStatus.zenping

  1  ########################################################################### 
  2  # 
  3  # This program is part of Zenoss Core, an open source monitoring platform. 
  4  # Copyright (C) 2007, Zenoss Inc. 
  5  # 
  6  # This program is free software; you can redistribute it and/or modify it 
  7  # under the terms of the GNU General Public License version 2 as published by 
  8  # the Free Software Foundation. 
  9  # 
 10  # For complete information please visit: http://www.zenoss.com/oss/ 
 11  # 
 12  ########################################################################### 
 13   
 14   
 15  __doc__=''' ZenPing 
 16   
 17  Determines the availability of an IP address using ping. 
 18   
 19  $Id$''' 
 20   
 21  from socket import gethostbyname, getfqdn, gaierror 
 22   
 23  import time 
 24   
 25  import Globals # make zope imports work 
 26   
 27  from Products.ZenStatus.AsyncPing import Ping 
 28  from Products.ZenStatus.TestPing import Ping as TestPing 
 29  from Products.ZenStatus import pingtree 
 30  from Products.ZenUtils.Utils import unused 
 31  unused(pingtree)                        # needed for pb 
 32   
 33  from Products.ZenEvents.ZenEventClasses import Status_Ping, Clear 
 34  from Products.ZenHub.PBDaemon import FakeRemote, PBDaemon 
 35  from Products.ZenUtils.DaemonStats import DaemonStats 
 36  from Products.ZenUtils.Driver import drive, driveLater 
 37   
 38  from twisted.internet import reactor 
 39  from twisted.python import failure 
 40   
class ZenPing(PBDaemon):
    """Daemon that pings every job in a ping tree and reports the results.

    The ping tree and the tunable properties are fetched from the
    'PingConfig' hub service (see loadConfig).  Each cycle walks the tree,
    keeps at most ``pingChunk`` pings in flight, and sends Status_Ping
    events for jobs that fail repeatedly or recover.
    """

    name = agent = "zenping"
    eventGroup = "Ping"
    initialServices = PBDaemon.initialServices + ['PingConfig']

    # Defaults for the properties that copyItems() overwrites with the
    # values supplied by the hub.
    pingTimeOut = 1.5
    pingTries = 2
    pingChunk = 75              # upper bound on simultaneously running pings
    pingCycleInterval = 60      # delay handed to reactor.callLater
    configCycleInterval = 20*60 # delay handed to driveLater
    maxPingFailures = 2

    pinger = None
    pingTreeIter = None         # iterator over the current cycle's jobs
    startTime = None            # NOTE(review): unused here; cycles use self.start
    jobs = 0                    # ping jobs completed in the current cycle
    reconfigured = True         # True until a full cycle ends after a config change
    loadingConfig = None        # time.time() when the running load began
    lastConfig = None           # time.time() when the last load completed

    def __init__(self):
        """Set up the daemon and create the pinger selected by the options."""
        self.pingtree = None
        PBDaemon.__init__(self, keeproot=True)
        if not self.options.useFileDescriptor:
            self.openPrivilegedPort('--ping')
        self.rrdStats = DaemonStats()
        if self.options.test:
            self.pinger = TestPing(self.pingTries, self.pingTimeOut)
        else:
            fd = None
            if self.options.useFileDescriptor is not None:
                fd = int(self.options.useFileDescriptor)
            self.pinger = Ping(self.pingTries, self.pingTimeOut, fd)
        # Back-date lastConfig so the first loadConfig() call is not
        # postponed by the minconfigwait throttle.
        self.lastConfig = time.time() - self.options.minconfigwait
        self.log.info("started")

    def config(self):
        """Return the 'PingConfig' service, or a FakeRemote if it is absent."""
        return self.services.get('PingConfig', FakeRemote())

    def stopOnError(self, error):
        """Errback: log ``error``, stop the daemon and pass ``error`` on."""
        self.log.exception(error)
        self.stop()
        return error

    def connected(self):
        """Load the configuration, then start the first ping cycle.

        Any failure reaching the end of the chain stops the daemon.
        """
        self.log.debug("Connected, getting config")
        d = drive(self.loadConfig)
        d.addCallback(self.pingCycle)
        d.addErrback(self.stopOnError)

    def sendPingEvent(self, pj):
        """Send an event based on a ping job to the event backend.

        The event carries the job's ``eventState`` only when the job has
        that attribute set to something other than None.
        """
        evt = dict(device=pj.hostname,
                   ipAddress=pj.ipaddr,
                   summary=pj.message,
                   severity=pj.severity,
                   eventClass=Status_Ping,
                   eventGroup=self.eventGroup,
                   agent=self.agent,
                   component='',
                   manager=self.options.monitor)
        evstate = getattr(pj, 'eventState', None)
        if evstate is not None:
            evt['eventState'] = evstate
        self.sendEvent(evt)

    def loadConfig(self, driver):
        """Get the configuration for zenping.

        Generator meant to be run through drive()/driveLater(): each remote
        call is yielded and its result is read back with ``driver.next()``.

        The load is skipped when another load is still in progress, and it
        is postponed (through remote_updateConfig) when the previous load
        finished less than ``options.minconfigwait`` seconds ago.

        Exceptions are logged and swallowed, so the deferred returned by
        drive() does not fail because of them.  ``loadingConfig`` is reset
        in that case so that later loads are not refused as "still loading".
        """
        try:
            if self.loadingConfig:
                self.log.warning("Configuration still loading.  Started at %s" %
                                 time.asctime(time.localtime(self.loadingConfig)))
                return

            if self.lastConfig:
                configwait = time.time() - self.lastConfig
                delay = self.options.minconfigwait - configwait
                if delay > 0:
                    reactor.callLater(delay, self.remote_updateConfig)
                    self.log.debug("Config recently updated: not fetching")
                    return

            self.loadingConfig = time.time()

            self.log.info('fetching monitor properties')
            yield self.config().callRemote('propertyItems')
            self.copyItems(driver.next())

            # Schedule the next periodic reload using the interval that
            # copyItems() has just refreshed.
            driveLater(self.configCycleInterval, self.loadConfig)

            self.log.info("fetching default RRDCreateCommand")
            yield self.config().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.config().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.config().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)

            self.log.info("getting ping tree")
            yield self.config().callRemote('getPingTree',
                                           self.options.name,
                                           findIp())
            oldtree, self.pingtree = self.pingtree, driver.next()
            self.clearDeletedDevices(oldtree)

            self.rrdStats.gauge('configTime',
                                self.configCycleInterval,
                                time.time() - self.loadingConfig)
            self.loadingConfig = None
            self.lastConfig = time.time()
        except Exception:
            import sys
            # Clear the in-progress marker: leaving it set would make every
            # later call bail out with "Configuration still loading".
            self.loadingConfig = None
            self.log.exception(sys.exc_info()[1])

    def buildOptions(self):
        """Add the zenping command line options to the PBDaemon ones."""
        PBDaemon.buildOptions(self)
        fqdn = getfqdn()
        self.parser.add_option('--name',
                               dest='name',
                               default=fqdn,
                               help=("host that roots the ping dependency "
                                     "tree: typically the collecting hosts' "
                                     "name; defaults to our fully qualified "
                                     "domain name (%s)" % fqdn))
        self.parser.add_option('--test',
                               dest='test',
                               default=False,
                               action="store_true",
                               help="Run in test mode: doesn't really ping,"
                               " but reads the list of IP Addresses that "
                               " are up from /tmp/testping")
        self.parser.add_option('--useFileDescriptor',
                               dest='useFileDescriptor',
                               default=None,
                               help=
                               "use the given (privileged) file descriptor")
        self.parser.add_option('--minConfigWait',
                               dest='minconfigwait',
                               default=300,
                               type='int',
                               help=
                               "the minimal time, in seconds, "
                               "between refreshes of the config")

    def pingCycle(self, unused=None):
        """Start a new run against the ping job tree.

        When cycling, the next run is scheduled first.  A new iterator is
        created only if the previous run has been exhausted; otherwise the
        unfinished run is topped up to ``pingChunk`` jobs in flight.
        """
        if self.options.cycle:
            reactor.callLater(self.pingCycleInterval, self.pingCycle)

        if self.pingTreeIter is None:
            self.start = time.time()
            self.jobs = 0
            self.pingTreeIter = self.pingtree.pjgen()
        while self.pinger.jobCount() < self.pingChunk and self.startOne():
            pass

    def startOne(self):
        """Initiate the next ping job.

        Jobs that already reached ``maxPingFailures`` are skipped unless the
        daemon was reconfigured since the last completed cycle.

        Returns True when a ping was started, False when there is nothing
        left to start in this cycle.
        """
        if not self.pingTreeIter:
            return False
        while True:
            try:
                pj = self.pingTreeIter.next()
            except StopIteration:
                self.pingTreeIter = None
                return False
            if pj.status < self.maxPingFailures or self.reconfigured:
                self.ping(pj)
                return True

    def ping(self, pj):
        """Reset ``pj``, send its packet and hook up the result callbacks."""
        self.log.debug("starting %s", pj.ipaddr)
        pj.reset()
        self.pinger.sendPacket(pj)
        pj.deferred.addCallbacks(self.pingSuccess, self.pingFailed)

    def next(self):
        """Account for a finished job and start the next one.

        Ends the cycle once the pinger reports no jobs in flight.
        """
        self.jobs += 1
        self.startOne()
        if self.pinger.jobCount() == 0:
            self.endCycle()

    def endCycle(self, *unused):
        """Note the end of the ping list with a successful status message.

        Stops the reactor when not cycling, otherwise sends a heartbeat.
        """
        runtime = time.time() - self.start
        self.log.info("Finished pinging %d jobs in %.2f seconds",
                      self.jobs, runtime)
        self.reconfigured = False
        if not self.options.cycle:
            reactor.stop()
        else:
            self.heartbeat()

    def heartbeat(self):
        """Send a heartbeat event for this monitor.

        Also records the 'cycleTime' and 'devices' gauges and sends every
        event those gauges return.
        """
        PBDaemon.heartbeat(self)
        events = (self.rrdStats.gauge('cycleTime',
                                      self.pingCycleInterval,
                                      time.time() - self.start) +
                  self.rrdStats.gauge('devices',
                                      self.pingCycleInterval,
                                      self.jobs))
        for ev in events:
            self.sendEvent(ev)

    def pingSuccess(self, pj):
        """Callback for a good ping response.

        An event with severity 0 is sent only when the job had failed more
        than once, which is when doPingFailed() starts sending events.
        """
        pj.deferred = None
        if pj.status > 1:
            pj.severity = 0
            self.sendPingEvent(pj)
        self.log.debug("Success %s", pj.ipaddr)
        pj.status = 0
        self.next()

    def pingFailed(self, err):
        """Errback for a failed ping; never lets an exception escape.

        Any exception raised while handling the failure is logged together
        with its traceback.
        """
        try:
            self.doPingFailed(err)
        except Exception:
            import traceback
            # format_exc() returns the traceback of the exception being
            # handled; print_exc() takes (limit, file), not an exception.
            self.log.error("Exception: %s", traceback.format_exc())

    def doPingFailed(self, err):
        """Callback for a bad (no) ping response.

        ``err.value`` is the ping job that failed.
        """
        pj = err.value
        pj.deferred = None
        pj.status += 1
        self.log.debug("Failed %s %s", pj.ipaddr, pj.status)
        if pj.status == 1:
            self.log.debug("first failure '%s'", pj.hostname)
            # if our path back is currently clear add our parent
            # to the ping list again to see if path is really clear
            if not pj.checkpath():
                routerpj = pj.routerpj()
                if routerpj:
                    self.ping(routerpj)
            # We must now re-run this ping job to actually generate a ping
            # down event.  If there is a problem in the path, it will be
            # suppressed.
            self.ping(pj)
        else:
            # walk up the ping tree and find router node with failure
            failname = pj.checkpath()
            if failname:
                pj.eventState = 2 # suppressed FIXME
                pj.message += (", failed at %s" % failname)
            self.log.warn(pj.message)
            self.sendPingEvent(pj)
            # Children are deliberately not marked down here: doing so made
            # suppressed ping events show up twice, and the marking never
            # took effect anyway because of the iterator status check.

        self.next()

    def remote_setPropertyItems(self, items):
        """The config has changed, maybe the device list is different."""
        self.copyItems(items)
        self.remote_updateConfig()

    def remote_updateConfig(self):
        """Reload the configuration asynchronously.

        If the reload ends in a Failure, the in-progress marker is cleared
        and another load is attempted 30 seconds later.
        """
        self.log.debug("Asynch update config")
        d = drive(self.loadConfig)

        def logResults(v):
            if isinstance(v, failure.Failure):
                self.log.error("Unable to reload config for async update")

                # Reset loadingConfig so we don't get stuck in a mode where
                # all asynchronous updates are blocked.
                self.loadingConfig = None

                # Try loading the config again in 30 seconds to give zenhub
                # time to restart.
                driveLater(30, self.loadConfig)

        d.addBoth(logResults)

    def copyItems(self, items):
        """Copy the recognised monitor properties from ``items`` onto self.

        ``items`` is anything dict() accepts.  Properties missing from it
        keep their current value.  A supplied ``configCycleInterval`` is
        multiplied by 60 before it is stored (presumably the hub sends
        minutes - TODO confirm); a value that was not supplied is left
        alone so repeated calls cannot inflate it.
        """
        items = dict(items)
        for att in ("pingTimeOut",
                    "pingTries",
                    "pingChunk",
                    "pingCycleInterval",
                    "configCycleInterval",
                    "maxPingFailures",
                    ):
            if att not in items:
                continue
            value = items[att]
            if att == "configCycleInterval":
                value *= 60
            setattr(self, att, value)
        self.reconfigured = True

    def clearDevice(self, device):
        """Send a Clear Status_Ping event for ``device``."""
        self.sendEvent(dict(device=device,
                            eventClass=Status_Ping,
                            summary="No longer testing device",
                            severity=Clear))

    def clearDeletedDevices(self, oldtree):
        """Send clears for any device we stop pinging.

        Only devices that had a non-zero status in ``oldtree`` and are
        absent from the current ping tree are cleared.
        """
        down = set()
        if oldtree:
            down = set([pj.hostname for pj in oldtree.pjgen() if pj.status])
        current = set([pj.hostname for pj in self.pingtree.pjgen()])
        for device in down - current:
            self.clearDevice(device)

    def remote_deleteDevice(self, device):
        """Clear ``device`` and reload the configuration."""
        self.log.debug("Asynch delete device %s" % device)
        self.clearDevice(device)
        self.remote_updateConfig()

def findIp():
    """Return an IPv4 address for this host as a dotted-quad string.

    The fully qualified host name is resolved first.  If that fails with
    gaierror, the output of ``ifconfig -a`` is scanned and the first
    address other than 127.0.0.1 is returned.  '127.0.0.1' is returned
    when no ifconfig binary is found or no other address is reported.
    """
    try:
        return gethostbyname(getfqdn())
    except gaierror:
        pass

    # find the first non-loopback interface address
    import os
    import re
    loopback = '127.0.0.1'
    ifconfigs = ['/sbin/ifconfig',
                 '/usr/sbin/ifconfig',
                 '/usr/bin/ifconfig',
                 '/bin/ifconfig']
    available = [path for path in ifconfigs if os.path.exists(path)]
    if not available:
        # No ifconfig binary on this host: nothing to parse.
        return loopback

    fp = os.popen(available[0] + ' -a')
    try:
        # ifconfig separates interfaces with a blank line.
        stanzas = fp.read().split('\n\n')
    finally:
        fp.close()

    digits = r'[0-9]{1,3}'
    pat = r'(addr:|inet) *(%s\.%s\.%s\.%s)[^0-9]' % ((digits,)*4)
    parse = re.compile(pat)
    for stanza in stanzas:
        # Only the first address of each interface stanza is considered.
        addr = parse.search(stanza)
        if addr and addr.group(2) != loopback:
            return addr.group(2)
    return loopback

# Script entry point: create the daemon, set the 'zen.Events' logger to
# INFO (numeric level 20) and run the daemon.
if __name__ == '__main__':
    import logging
    pm = ZenPing()
    logging.getLogger('zen.Events').setLevel(logging.INFO)
    pm.run()