Trees | Indices | Help |
|
---|
|
1 #! /usr/bin/env python 2 ############################################################################## 3 # 4 # Copyright (C) Zenoss, Inc. 2007, 2010, 2011, all rights reserved. 5 # 6 # This content is made available according to terms specified in 7 # License.zenoss under the directory where your Zenoss product is installed. 8 # 9 ############################################################################## 10 11 12 __doc__="""zenperfsnmp 13 14 Gets SNMP performance data and stores it in RRD files. 15 16 """ 17 18 from datetime import datetime, timedelta 19 from collections import deque 20 import random 21 import logging 22 log = logging.getLogger("zen.zenperfsnmp") 23 24 import Globals 25 import zope.interface 26 27 from twisted.internet import defer, error 28 from twisted.python.failure import Failure 29 from pynetsnmp.twistedsnmp import AgentProxy, snmpprotocol, Snmpv3Error 30 31 from Products.ZenCollector.daemon import CollectorDaemon 32 from Products.ZenCollector.interfaces import ICollectorPreferences,\ 33 IDataService,\ 34 IEventService,\ 35 IScheduledTask 36 from Products.ZenCollector.tasks import SimpleTaskFactory,\ 37 SimpleTaskSplitter,\ 38 TaskStates, \ 39 BaseTask 40 41 from Products.ZenEvents.ZenEventClasses import Status_Snmp 42 from Products.ZenEvents import Event 43 44 # We retrieve our configuration data remotely via a Twisted PerspectiveBroker 45 # connection. To do so, we need to import the class that will be used by the 46 # configuration service to send the data over, i.e. SnmpDeviceProxy. 47 from Products.ZenUtils.Utils import unused 48 from Products.ZenHub.services.SnmpPerformanceConfig import SnmpDeviceProxy 49 unused(SnmpDeviceProxy) 50 from Products.ZenHub.services.PerformanceConfig import SnmpConnInfo 51 unused(SnmpConnInfo) 52 53 COLLECTOR_NAME = "zenperfsnmp" 54 MAX_BACK_OFF_MINUTES = 2058 zope.interface.implements(ICollectorPreferences) 59106 110 113 114 STATUS_EVENT = { 'eventClass' : Status_Snmp, 115 'eventGroup' : 'SnmpTest' }61 """ 62 Constructs a new SnmpPerformanceCollectionPreferences instance and 63 provides default values for needed attributes. 64 """ 65 self.collectorName = COLLECTOR_NAME 66 self.defaultRRDCreateCommand = None 67 self.configCycleInterval = 20 # minutes 68 self.cycleInterval = 5 * 60 # seconds 69 70 # The configurationService attribute is the fully qualified class-name 71 # of our configuration service that runs within ZenHub 72 self.configurationService = 'Products.ZenHub.services.SnmpPerformanceConfig' 73 74 # Will be filled in based on buildOptions 75 self.options = None7678 parser.add_option('--showrawresults', 79 dest='showrawresults', 80 action="store_true", 81 default=False, 82 help="Show the raw RRD values. For debugging purposes only.") 83 84 parser.add_option('--maxbackoffminutes', 85 dest='maxbackoffminutes', 86 default=MAX_BACK_OFF_MINUTES, 87 type='int', 88 help="Deprecated since 4.1.1. No longer used") 89 90 parser.add_option('--triespercycle', 91 dest='triesPerCycle', 92 default=2, 93 type='int', 94 help="How many attempts per cycle should be made to get data for an OID from a "\ 95 "non-responsive device. Minimum of 2") 96 97 parser.add_option('--maxtimeouts', 98 dest='maxTimeouts', 99 default=3, 100 type='int', 101 help="How many consecutive time outs per cycle before stopping attempts to collect")102 103118 """ 119 A task that performs periodic performance collection for devices providing 120 data via SNMP agents. 121 """ 122 zope.interface.implements(IScheduledTask) 123 124 STATE_CONNECTING = 'CONNECTING' 125 STATE_FETCH_PERF = 'FETCH_PERF_DATA' 126 STATE_STORE_PERF = 'STORE_PERF_DATA' 127589 590 591 if __name__ == '__main__': 592 myPreferences = SnmpPerformanceCollectionPreferences() 593 myTaskFactory = SimpleTaskFactory(SnmpPerformanceCollectionTask) 594 myTaskSplitter = SimpleTaskSplitter(myTaskFactory) 595 daemon = CollectorDaemon(myPreferences, myTaskSplitter) 596 daemon.run() 597133 """ 134 @param deviceId: the Zenoss deviceId to watch 135 @type deviceId: string 136 @param taskName: the unique identifier for this task 137 @type taskName: string 138 @param scheduleIntervalSeconds: the interval at which this task will be 139 collected 140 @type scheduleIntervalSeconds: int 141 @param taskConfig: the configuration for this task 142 """ 143 super(SnmpPerformanceCollectionTask, self).__init__( 144 deviceId, taskName, 145 taskConfig.cycleInterval, taskConfig 146 ) 147 148 # Needed for interface 149 self.name = taskName 150 self.configId = deviceId 151 self.state = TaskStates.STATE_IDLE 152 153 # The taskConfig corresponds to a DeviceProxy 154 self._device = taskConfig 155 self._devId = self._device.id 156 self._manageIp = self._device.snmpConnInfo.manageIp 157 self._maxOidsPerRequest = self._device.zMaxOIDPerRequest 158 log.debug("SnmpPerformanceCollectionTask.__init__: self._maxOidsPerRequest=%s" % self._maxOidsPerRequest) 159 self.interval = self._device.cycleInterval 160 self._collectedOids = set() 161 162 self._dataService = zope.component.queryUtility(IDataService) 163 self._eventService = zope.component.queryUtility(IEventService) 164 165 self._preferences = zope.component.queryUtility(ICollectorPreferences, 166 COLLECTOR_NAME) 167 168 self._snmpProxy = None 169 self._snmpConnInfo = self._device.snmpConnInfo 170 self._oids = self._device.oids 171 self._oidDeque = deque(self._oids.keys()) 172 self._good_oids = set() 173 #oids not returning data 174 self._bad_oids = set() 175 self._snmpPort = snmpprotocol.port() 176 self.triesPerCycle = max(2, self._preferences.options.triesPerCycle) 177 self._maxTimeouts = self._preferences.options.maxTimeouts 178 179 self._lastErrorMsg = '' 180 self._cycleExceededCount = 0 181 self._stoppedTaskCount = 0 182 self._snmpV3ErrorCount = 0 183 184 #whether or not we got a response during a collection interval 185 self._responseReceived = False186188 """ 189 Twisted errBack to log the exception for a single device. 190 191 @parameter reason: explanation of the failure 192 @type reason: Twisted error instance 193 """ 194 msg = reason.getErrorMessage() 195 if not msg: # Sometimes we get blank error messages 196 msg = reason.__class__ 197 msg = '%s %s' % (self._devId, msg) 198 199 # Leave 'reason' alone to generate a traceback 200 201 if self._lastErrorMsg != msg: 202 self._lastErrorMsg = msg 203 if msg: 204 log.error(msg) 205 206 return reason207209 """ 210 Callback called after a successful connect to the remote device. 211 """ 212 # If we want to model things first before doing collection, 213 # that code goes here. 214 log.debug("Connected to %s [%s] using SNMP %s", self._devId, self._manageIp, self._snmpConnInfo.zSnmpVer) 215 self._collectedOids.clear() 216 return result217219 elapsed = datetime.now() - self._doTask_start 220 221 if elapsed >= timedelta(seconds=self._device.cycleInterval): 222 raise CycleExceeded( 223 "Elapsed time %s seconds greater than %s seconds" % (elapsed.total_seconds(), self._device.cycleInterval)) 224 #check to to see if we are about to run out of time, if so stop task 225 if elapsed >= timedelta(seconds=self._device.cycleInterval*.99): 226 raise StopTask("Elapsed time %s sec" % elapsed.total_seconds())227229 return set(self._oids) - self._bad_oids - self._good_oids230232 return set(self._oids) - self._bad_oids - self._collectedOids233 234 @defer.inlineCallbacks236 """ 237 Get performance data for all the monitored components on a device 238 """ 239 log.debug("Retrieving OIDs from %s [%s]", self._devId, self._manageIp) 240 if not self._oids: 241 defer.returnValue(None) 242 243 # do known untested and good oids in chunks 244 # first run all oids will be unkown since they aren't in the good oid list or the bad oid list 245 oids_to_test = list(self._untestedOids()) 246 oids_to_test.extend(self._good_oids) 247 log.debug('%s [%s] collecting %s oids out of %s', self._devId, self._manageIp, len(oids_to_test), len(self._oids)) 248 chunk_size = self._maxOidsPerRequest 249 maxTries = self.triesPerCycle 250 try_count = 0 251 consecutiveTimeouts = 0 252 while oids_to_test and try_count < maxTries: 253 try_count += 1 254 if try_count > 1: 255 log.debug("%s [%s] some oids still uncollected after %s tries, trying again with chunk size %s", self._devId, 256 self._manageIp, try_count - 1, chunk_size) 257 oid_chunks = self.chunk(oids_to_test, chunk_size) 258 for oid_chunk in oid_chunks: 259 try: 260 self._checkTaskTime() 261 log.debug("Fetching OID chunk size %s from %s [%s] - %s", chunk_size, self._devId, self._manageIp, oid_chunk) 262 yield self._fetchPerfChunk(oid_chunk) 263 consecutiveTimeouts = 0 264 log.debug("Finished fetchPerfChunk call %s [%s]", self._devId, self._manageIp) 265 except error.TimeoutError as e: 266 log.debug("timeout for %s [%s] oids - %s", self._devId, self._manageIp, oid_chunk) 267 consecutiveTimeouts += 1 268 if consecutiveTimeouts >= self._maxTimeouts: 269 log.debug("%s consecutive timeouts, abandoning run for %s [%s]", consecutiveTimeouts, 270 self._devId, self._manageIp) 271 raise 272 # can still have untested oids from a chunk that failed to return data, one or more of those may be bad. 273 # run with a smaller chunk size to identify bad oid. Can also have uncollected good oids because of timeouts 274 oids_to_test = list(self._uncollectedOids()) 275 chunk_size = 1276 277 278 279 @defer.inlineCallbacks281 self.state = SnmpPerformanceCollectionTask.STATE_FETCH_PERF 282 update_x = {} 283 try: 284 update_x = yield self._snmpProxy.get(oid_chunk, self._snmpConnInfo.zSnmpTimeout, self._snmpConnInfo.zSnmpTries) 285 except error.TimeoutError, e: 286 raise 287 except Exception, e: 288 log.warning('Failed to collect on {0} ({1.__class__.__name__}: {1})'.format(self.configId, e)) 289 #something happened, not sure what. 290 raise 291 finally: 292 self.state = TaskStates.STATE_RUNNING 293 update = {} 294 295 # we got a response 296 self._responseReceived = True 297 #remove leading and trailing dots 298 for oid, value in dict(update_x).items(): 299 update[oid.strip('.')] = value 300 301 if not update: 302 # empty update is probably a bad OID in the request somewhere, remove them from good oids. These will run in 303 # single mode so we can figure out which ones are good or bad 304 if len(oid_chunk) == 1: 305 self.remove_from_good_oids(oid_chunk) 306 self._addBadOids(oid_chunk) 307 log.warn("No return result, marking as bad oid: {%s} {%s}" % (self.configId, oid_chunk)) 308 else: 309 log.warn("No return result, will run in separately to determine which oids are valid: {%s} {%s}" % ( 310 self.configId, oid_chunk)) 311 self.remove_from_good_oids(oid_chunk) 312 313 else: 314 for oid in oid_chunk: 315 if oid not in update: 316 log.error("SNMP get did not return result: {0} {1}".format(self.configId, oid)) 317 self.remove_from_good_oids([oid]) 318 self._addBadOids([oid]) 319 self.state=SnmpPerformanceCollectionTask.STATE_STORE_PERF 320 try: 321 for oid, value in update.items(): 322 323 if oid not in self._oids: 324 log.error("SNMP get returned unexpected OID: {0} {1}".format(self.configId, oid)) 325 continue 326 327 # We should always get something useful back 328 if value == '' or value is None: 329 log.error("SNMP get returned empty value: {0} {1}".format(self.configId, oid)) 330 self.remove_from_good_oids([oid]) 331 self._addBadOids([oid]) 332 continue 333 334 self._good_oids.add(oid) 335 self._bad_oids.discard(oid) 336 self._collectedOids.add(oid) 337 # An OID's data can be stored multiple times 338 for rrdMeta in self._oids[oid]: 339 try: 340 cname, path, rrdType, rrdCommand, rrdMin, rrdMax = rrdMeta 341 self._dataService.writeRRD( 342 path, value, rrdType, 343 rrdCommand=rrdCommand, 344 cycleTime=self._device.cycleInterval, 345 min=rrdMin, max=rrdMax) 346 except Exception, e: 347 log.error("Failed to write to RRD file: {0} {1.__class__.__name__} {1}".format(path, e)) 348 continue 349 finally: 350 self.state = TaskStates.STATE_RUNNING351 352 @defer.inlineCallbacks354 if previous_bad_oids: 355 log.debug("%s Re-checking %s bad oids", self.name, len(previous_bad_oids)) 356 oids_to_test = set(previous_bad_oids) 357 num_checked = 0 358 max_bad_check = max(10, self._maxOidsPerRequest) 359 while num_checked < max_bad_check and oids_to_test: 360 self._checkTaskTime() 361 # using deque as a rotating list so that next time we start where we left off 362 oid = self._oidDeque[0] # get the first one 363 self._oidDeque.rotate(1) # move it to the end 364 if oid in oids_to_test: # fetch if we care 365 oids_to_test.remove(oid) 366 num_checked += 1 367 try: 368 yield self._fetchPerfChunk([oid]) 369 except error.TimeoutError, e: 370 log.debug('%s timed out re-checking bad oid %s', self.name, oid)371373 if details is None: 374 details = {} 375 event = details.copy() 376 event.update(STATUS_EVENT) 377 self._eventService.sendEvent(event, 378 severity=severity, 379 device=self.configId, 380 eventKey=eventKey, 381 summary=summary)382 383 @defer.inlineCallbacks385 previous_bad_oids=list(self._bad_oids) 386 taskStopped = False 387 388 try: 389 try: 390 yield self._fetchPerf() 391 # we have time; try to collect previous bad oids: 392 yield self._processBadOids(previous_bad_oids) 393 except StopTask as e: 394 taskStopped = True 395 self._stoppedTaskCount += 1 396 log.warn("Device %s [%s] Task stopped collecting to avoid exceeding cycle interval - %s", 397 self._devId, self._manageIp, str(e)) 398 self._logOidsNotCollected("task was stopped so as not exceed cycle interval") 399 except error.TimeoutError as e: 400 log.debug("Device %s [%s] snmp timed out ", self._devId, self._manageIp) 401 402 if self._snmpConnInfo.zSnmpVer == 'v3': 403 self._sendStatusEvent('SNMP v3 error cleared', eventKey='snmp_v3_error', severity=Event.Clear) 404 405 # clear cycle exceeded event 406 self._sendStatusEvent('Collection run time restored below interval', eventKey='interval_exceeded', 407 severity=Event.Clear) 408 409 410 if self._responseReceived: 411 # clear down event 412 self._sendStatusEvent('SNMP agent up', eventKey='agent_down', 413 severity=Event.Clear) 414 if not self._collectedOids: 415 #send event if no oids collected - all oids seem to be bad 416 oidSample = self._oids.keys()[:self._maxOidsPerRequest] 417 oidDetails = {'oids_configured': "%s oids configured for device" % len(self._oids), 418 'oid_sample': "Subset of oids requested %s" % oidSample} 419 self._sendStatusEvent('No values returned for configured oids', eventKey='no_oid_results', 420 details=oidDetails) 421 else: 422 self._sendStatusEvent('oids collected', 423 eventKey='no_oid_results', severity=Event.Clear) 424 if len(self._collectedOids) == len(set(self._oids) - self._bad_oids): 425 # this should clear failed to collect some oids event 426 self._sendStatusEvent('Gathered all OIDs', eventKey='partial_oids_collected', 427 severity=Event.Clear) 428 else: 429 summary = 'Failed to collect some OIDs' 430 if taskStopped: 431 summary = '%s - was not able to collect all oids within collection interval' % summary 432 self._sendStatusEvent(summary, eventKey='partial_oids_collected', 433 severity=Event.Warning) 434 435 else: 436 #send event if no response received - all timeouts or other errors 437 self._sendStatusEvent('SNMP agent down - no response received', eventKey='agent_down') 438 439 440 except CycleExceeded as e: 441 self._cycleExceededCount += 1 442 log.warn("Device %s [%s] scan stopped because time exceeded cycle interval, %s", self._devId, self._manageIp 443 , str(e)) 444 self._logOidsNotCollected('cycle exceeded') 445 self._sendStatusEvent('Scan stopped; Collection time exceeded interval - %s' % str(e), 446 eventKey='interval_exceeded') 447 448 except Snmpv3Error as e: 449 self._logOidsNotCollected('of %s' % str(e)) 450 self._snmpV3ErrorCount += 1 451 summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format(self, str(e)) 452 453 log.error("{0} on {1}".format(summary, self.configId)) 454 self._sendStatusEvent(summary, eventKey='snmp_v3_error') 455 finally: 456 self._logTaskOidInfo(previous_bad_oids)457 460462 """ 463 Report any bad OIDs and then track the OID so we 464 don't generate any further errors. 465 """ 466 # make sure oids aren't in good set 467 self.remove_from_good_oids(oids) 468 for oid in oids: 469 if oid in self._oids: 470 self._bad_oids.add(oid) 471 names = [dp[0] for dp in self._oids[oid]] 472 summary = 'Error reading value for %s (%s) on %s' % ( 473 names, oid, self._devId) 474 log.warn(summary)475477 """ 478 Callback activated when the task is complete 479 480 @parameter result: results of SNMP gets 481 @type result: array of (boolean, dictionaries) 482 """ 483 484 try: 485 self._close() 486 except Exception, ex: 487 log.warn("Failed to close device %s: error %s" % 488 (self._devId, str(ex))) 489 490 doTask_end = datetime.now() 491 duration = doTask_end - self._doTask_start 492 if duration > timedelta(seconds=self._device.cycleInterval): 493 log.warn("Collection for %s took %s seconds; cycle interval is %s seconds." % ( 494 self.configId, duration.total_seconds(), self._device.cycleInterval)) 495 else: 496 log.debug("Collection time for %s was %s seconds; cycle interval is %s seconds." % ( 497 self.configId, duration.total_seconds(), self._device.cycleInterval)) 498 499 500 # Return the result so the framework can track success/failure 501 return result502504 return self._close()505507 """ 508 Contact to one device and return a deferred which gathers data from 509 the device. 510 511 @return: A task to scan the OIDs on a device. 512 @rtype: Twisted deferred object 513 """ 514 self._doTask_start = datetime.now() 515 self._responseReceived = False 516 # See if we need to connect first before doing any collection 517 d = defer.maybeDeferred(self._connect) 518 d.addCallbacks(self._connectCallback, self._failure) 519 520 521 d.addCallback(self._doCollectOids) 522 # Call _finished for both success and error scenarois 523 d.addBoth(self._finished) 524 525 # Wait until the Deferred actually completes 526 return d527 528530 if log.isEnabledFor(logging.DEBUG): 531 log.debug("Device %s [%s] %d of %d OIDs scanned successfully", 532 self._devId, self._manageIp, len(self._collectedOids), len(self._oids)) 533 untested_oids = self._untestedOids() 534 log.debug("Device %s [%s] has %d good oids, %d bad oids and %d untested oids out of %d configured", 535 self._devId, self._manageIp, len(self._good_oids), len(self._bad_oids), len(untested_oids), 536 len(self._oids)) 537 538 newBadOids = self._bad_oids - set(previous_bad_oids) 539 if newBadOids: 540 log.info("%s: Detected %s bad oids this cycle", self.name, len(newBadOids)) 541 log.debug("%s: Bad oids detected - %s", self.name, newBadOids)542544 oidsNotCollected = self._uncollectedOids() 545 if oidsNotCollected: 546 log.debug("%s Oids not collected because %s - %s" % (self.name, reason, str(oidsNotCollected)))547 548550 """ 551 Create a connection to the remote device 552 """ 553 self.state = SnmpPerformanceCollectionTask.STATE_CONNECTING 554 if (self._snmpProxy is None or 555 self._snmpProxy._snmpConnInfo != self._snmpConnInfo): 556 self._snmpProxy = self._snmpConnInfo.createSession( 557 protocol=self._snmpPort.protocol, 558 allowCache=True) 559 self._snmpProxy.open() 560 return self._snmpProxy561563 """ 564 Close down the connection to the remote device 565 """ 566 if self._snmpProxy: 567 self._snmpProxy.close() 568 self._snmpProxy = None569 570572 """ 573 Called by the collector framework scheduler, and allows us to 574 see how each task is doing. 575 """ 576 display = "%s using SNMP %s\n" % (self.name, self._snmpConnInfo.zSnmpVer) 577 display += "%s Cycles Exceeded: %s; V3 Error Count: %s; Stopped Task Count: %s\n" % ( 578 self.name, self._cycleExceededCount, self._snmpV3ErrorCount, self._stoppedTaskCount) 579 display += "%s OIDs configured: %d \n" % ( 580 self.name, len(self._oids.keys())) 581 display += "%s Good OIDs: %d - %s\n" % ( 582 self.name, len(self._good_oids), self._good_oids) 583 display += "%s Bad OIDs: %d - %s\n" % ( 584 self.name, len(self._bad_oids), self._bad_oids) 585 586 if self._lastErrorMsg: 587 display += "%s\n" % self._lastErrorMsg 588 return display
Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1.1812 on Mon Jul 30 17:11:31 2012 | http://epydoc.sourceforge.net |