Products.ZenRRD.zenperfsnmp

118 """ 119 A task that performs periodic performance collection for devices providing 120 data via SNMP agents. 121 """ 122 zope.interface.implements(IScheduledTask) 123 124 STATE_CONNECTING = 'CONNECTING' 125 STATE_FETCH_PERF = 'FETCH_PERF_DATA' 126 STATE_STORE_PERF = 'STORE_PERF_DATA' 127

def __init__(self,
             deviceId,
             taskName,
             scheduleIntervalSeconds,
             taskConfig):
    """
    Build the per-device collection state for this task.

    @param deviceId: the Zenoss deviceId to watch
    @type deviceId: string
    @param taskName: the unique identifier for this task
    @type taskName: string
    @param scheduleIntervalSeconds: the interval at which this task will be
           collected. It is not read here: the interval handed to the base
           class is taskConfig.cycleInterval.
    @type scheduleIntervalSeconds: int
    @param taskConfig: the configuration for this task
    """
    super(SnmpPerformanceCollectionTask, self).__init__(
        deviceId, taskName, taskConfig.cycleInterval, taskConfig)

    # Attributes required by the task interface
    self.name = taskName
    self.configId = deviceId
    self.state = TaskStates.STATE_IDLE

    # The taskConfig corresponds to a DeviceProxy
    device = taskConfig
    self._device = device
    self._devId = device.id
    self._manageIp = device.snmpConnInfo.manageIp
    self._maxOidsPerRequest = device.zMaxOIDPerRequest
    log.debug("SnmpPerformanceCollectionTask.__init__: self._maxOidsPerRequest=%s" % self._maxOidsPerRequest)
    self.interval = device.cycleInterval
    self._collectedOids = set()

    # Collector-wide services and preferences
    self._dataService = zope.component.queryUtility(IDataService)
    self._eventService = zope.component.queryUtility(IEventService)
    self._preferences = zope.component.queryUtility(ICollectorPreferences,
                                                    COLLECTOR_NAME)

    # SNMP session and OID bookkeeping
    self._snmpProxy = None
    self._snmpConnInfo = device.snmpConnInfo
    self._oids = device.oids
    self._oidDeque = deque(self._oids.keys())
    self._good_oids = set()
    # oids not returning data
    self._bad_oids = set()
    self._snmpPort = snmpprotocol.port()

    options = self._preferences.options
    # never fewer than two passes per cycle
    self.triesPerCycle = max(2, options.triesPerCycle)
    self._maxTimeouts = options.maxTimeouts

    # Counters and status shown by displayStatistics()
    self._lastErrorMsg = ''
    self._cycleExceededCount = 0
    self._stoppedTaskCount = 0
    self._snmpV3ErrorCount = 0

    # whether or not we got a response during a collection interval
    self._responseReceived = False

186

def _failure(self, reason):
    """
    Twisted errBack to log the exception for a single device.

    The message is prefixed with the device id and is logged only when it
    differs from the last message logged by this task, so a repeating
    failure does not flood the log.

    @parameter reason: explanation of the failure
    @type reason: Twisted error instance
    @return: reason, unchanged, so the error keeps propagating down the
             errback chain (and can still produce a traceback)
    """
    msg = reason.getErrorMessage()
    if not msg:
        # Sometimes we get blank error messages. Name the wrapped exception
        # type instead: for a Twisted Failure, reason.__class__ is the
        # Failure class itself for every error, which says nothing.
        excType = getattr(reason, 'type', None) or reason.__class__
        msg = getattr(excType, '__name__', excType)
    msg = '%s %s' % (self._devId, msg)

    # Leave 'reason' alone to generate a traceback

    if self._lastErrorMsg != msg:
        self._lastErrorMsg = msg
        log.error(msg)

    return reason

207

def _connectCallback(self, result):
    """
    Callback fired after a successful connect to the remote device.

    Forgets which OIDs were collected in the previous run and hands the
    connect result on to the next callback unchanged.
    """
    # Any "model the device before collecting" logic would be hooked in here.
    snmpVersion = self._snmpConnInfo.zSnmpVer
    log.debug("Connected to %s [%s] using SNMP %s", self._devId, self._manageIp, snmpVersion)
    self._collectedOids.clear()
    return result

217

def _checkTaskTime(self):
    """
    Abort the current run when it is using up its cycle interval.

    @raise CycleExceeded: the run has already lasted a full cycle interval
    @raise StopTask: the run has used at least 99% of the cycle interval
    """
    elapsed = datetime.now() - self._doTask_start
    hardLimit = timedelta(seconds=self._device.cycleInterval)

    if elapsed >= hardLimit:
        raise CycleExceeded(
            "Elapsed time %s seconds greater than %s seconds" % (elapsed.total_seconds(), self._device.cycleInterval))

    # check to see if we are about to run out of time, if so stop the task
    softLimit = timedelta(seconds=self._device.cycleInterval*.99)
    if elapsed >= softLimit:
        raise StopTask("Elapsed time %s sec" % elapsed.total_seconds())

227

def _untestedOids(self):
    """
    Return the configured OIDs that are in neither the bad set nor the
    good set, as a new set.
    """
    configured = set(self._oids)
    return configured.difference(self._bad_oids, self._good_oids)

230

def _uncollectedOids(self):
    """
    Return the configured OIDs that are not marked bad and have not been
    collected yet, as a new set.
    """
    configured = set(self._oids)
    return configured.difference(self._bad_oids, self._collectedOids)

@defer.inlineCallbacks
def _fetchPerf(self):
    """
    Get performance data for all the monitored components on a device.

    The first pass requests the untested and known-good OIDs in chunks of
    _maxOidsPerRequest. Every later pass (up to triesPerCycle in total)
    requests whatever is still uncollected one OID at a time, which lets
    _fetchPerfChunk pin down individual bad OIDs.

    @raise error.TimeoutError: after _maxTimeouts consecutive timed-out
           chunk requests
    """
    log.debug("Retrieving OIDs from %s [%s]", self._devId, self._manageIp)
    if not self._oids:
        defer.returnValue(None)

    # Untested OIDs first, then the known-good ones. On the very first run
    # every OID is untested, since both the good and bad sets are empty.
    pending = list(self._untestedOids()) + list(self._good_oids)
    log.debug('%s [%s] collecting %s oids out of %s', self._devId, self._manageIp, len(pending), len(self._oids))

    chunk_size = self._maxOidsPerRequest
    max_attempts = self.triesPerCycle
    attempts = 0
    timeouts_in_a_row = 0
    while pending and attempts < max_attempts:
        attempts += 1
        if attempts > 1:
            log.debug("%s [%s] some oids still uncollected after %s tries, trying again with chunk size %s", self._devId,
                      self._manageIp, attempts - 1, chunk_size)
        for oid_chunk in self.chunk(pending, chunk_size):
            try:
                self._checkTaskTime()
                log.debug("Fetching OID chunk size %s from %s [%s] - %s", chunk_size, self._devId, self._manageIp, oid_chunk)
                yield self._fetchPerfChunk(oid_chunk)
                timeouts_in_a_row = 0
                log.debug("Finished fetchPerfChunk call %s [%s]", self._devId, self._manageIp)
            except error.TimeoutError:
                log.debug("timeout for %s [%s] oids - %s", self._devId, self._manageIp, oid_chunk)
                timeouts_in_a_row += 1
                if timeouts_in_a_row >= self._maxTimeouts:
                    log.debug("%s consecutive timeouts, abandoning run for %s [%s]", timeouts_in_a_row,
                              self._devId, self._manageIp)
                    raise
        # A chunk that returned nothing may hide one or more bad OIDs, and
        # timeouts can leave good OIDs uncollected: go again over what is
        # left, one OID per request, to identify the bad ones.
        pending = list(self._uncollectedOids())
        chunk_size = 1

@defer.inlineCallbacks
def _fetchPerfChunk(self, oid_chunk):
    """
    Request one chunk of OIDs from the device and store the values returned.

    OIDs that come back with a usable value are added to the good and
    collected sets and written through the data service. OIDs missing from
    a non-empty response, or returned with an empty value, are marked bad.
    An empty response only marks the OID bad when the chunk held a single
    OID; a larger chunk is merely dropped from the good set so its members
    get retried one at a time.

    @param oid_chunk: the OIDs to request in a single SNMP get
    @type oid_chunk: list of strings
    @raise error.TimeoutError: re-raised untouched when the get times out
    @raise Exception: any other failure of the get, after logging a warning
    """
    self.state = SnmpPerformanceCollectionTask.STATE_FETCH_PERF
    try:
        update_x = yield self._snmpProxy.get(oid_chunk, self._snmpConnInfo.zSnmpTimeout, self._snmpConnInfo.zSnmpTries)
    except error.TimeoutError:
        # Kept ahead of the generic handler so timeouts propagate without
        # the warning below.
        raise
    except Exception as e:
        log.warning('Failed to collect on {0} ({1.__class__.__name__}: {1})'.format(self.configId, e))
        # something happened, not sure what.
        raise
    finally:
        self.state = TaskStates.STATE_RUNNING

    # we got a response
    self._responseReceived = True
    # remove leading and trailing dots
    update = {oid.strip('.'): value for oid, value in dict(update_x).items()}

    if not update:
        # An empty update is probably a bad OID somewhere in the request.
        if len(oid_chunk) == 1:
            # _addBadOids also drops the OID from the good set.
            self._addBadOids(oid_chunk)
            log.warn("No return result, marking as bad oid: {%s} {%s}" % (self.configId, oid_chunk))
        else:
            # Drop them from the good set; they will be re-run in single
            # mode so we can figure out which ones are good or bad.
            log.warn("No return result, will run in separately to determine which oids are valid: {%s} {%s}" % (
                self.configId, oid_chunk))
            self.remove_from_good_oids(oid_chunk)
        return

    for oid in oid_chunk:
        if oid not in update:
            log.error("SNMP get did not return result: {0} {1}".format(self.configId, oid))
            self._addBadOids([oid])

    self.state = SnmpPerformanceCollectionTask.STATE_STORE_PERF
    try:
        for oid, value in update.items():

            if oid not in self._oids:
                log.error("SNMP get returned unexpected OID: {0} {1}".format(self.configId, oid))
                continue

            # We should always get something useful back
            if value == '' or value is None:
                log.error("SNMP get returned empty value: {0} {1}".format(self.configId, oid))
                self._addBadOids([oid])
                continue

            self._good_oids.add(oid)
            self._bad_oids.discard(oid)
            self._collectedOids.add(oid)
            # An OID's data can be stored multiple times
            for rrdMeta in self._oids[oid]:
                # Reset per datapoint: if unpacking rrdMeta fails, the
                # handler must not hit an unbound name or report the path
                # of the previous datapoint.
                path = None
                try:
                    cname, path, rrdType, rrdCommand, rrdMin, rrdMax = rrdMeta
                    self._dataService.writeRRD(
                        path, value, rrdType,
                        rrdCommand=rrdCommand,
                        cycleTime=self._device.cycleInterval,
                        min=rrdMin, max=rrdMax)
                except Exception as e:
                    # Best effort: one failed write must not stop the
                    # remaining datapoints from being stored.
                    target = rrdMeta if path is None else path
                    log.error("Failed to write to RRD file: {0} {1.__class__.__name__} {1}".format(target, e))
                    continue
    finally:
        self.state = TaskStates.STATE_RUNNING

@defer.inlineCallbacks
def _processBadOids(self, previous_bad_oids):
    """
    Re-request some of the OIDs that were already bad before this cycle.

    At most max(10, _maxOidsPerRequest) of them are fetched per call, one
    OID per request. A timeout on a single re-check is logged and skipped.

    @param previous_bad_oids: OIDs that were in the bad set when the cycle
           started
    """
    if not previous_bad_oids:
        return
    log.debug("%s Re-checking %s bad oids", self.name, len(previous_bad_oids))

    pending = set(previous_bad_oids)
    limit = max(10, self._maxOidsPerRequest)
    checked = 0
    while checked < limit and pending:
        self._checkTaskTime()
        # The deque is a rotating cursor over every configured OID, kept
        # across calls so the next call resumes where this one stopped.
        # rotate(1) shifts all entries one slot to the right, so the last
        # entry becomes the new head.
        candidate = self._oidDeque[0]
        self._oidDeque.rotate(1)
        if candidate not in pending:
            continue
        pending.remove(candidate)
        checked += 1
        try:
            yield self._fetchPerfChunk([candidate])
        except error.TimeoutError:
            log.debug('%s timed out re-checking bad oid %s', self.name, candidate)

371

def _sendStatusEvent(self, summary, eventKey=None, severity=Event.Error, details=None):
    """
    Send a status event for this device through the event service.

    @param summary: event summary text
    @param eventKey: key identifying the condition being raised or cleared
    @param severity: event severity, Event.Error unless given
    @param details: optional dict of extra event fields; it is copied, not
           modified, and STATUS_EVENT entries override same-named keys
    """
    event = {} if details is None else details.copy()
    event.update(STATUS_EVENT)
    self._eventService.sendEvent(
        event,
        severity=severity,
        device=self.configId,
        eventKey=eventKey,
        summary=summary)

@defer.inlineCallbacks
def _doCollectOids(self, ignored):
    """
    Run one collection pass and report its outcome as status events.

    Fetches the OIDs, re-checks previously bad OIDs, then sends clear or
    problem events for the 'snmp_v3_error', 'interval_exceeded',
    'agent_down', 'no_oid_results' and 'partial_oids_collected' keys.

    @param ignored: result of the previous callback in the chain; unused
    """
    # Snapshot taken before collecting so new bad OIDs can be told apart
    # from old ones, and so only the old ones get re-checked.
    previous_bad_oids=list(self._bad_oids)
    taskStopped = False

    try:
        try:
            yield self._fetchPerf()
            # we have time; try to collect previous bad oids:
            yield self._processBadOids(previous_bad_oids)
        except StopTask as e:
            # Raised by _checkTaskTime when the run is about to use up the
            # cycle interval; the partial result is still reported below.
            taskStopped = True
            self._stoppedTaskCount += 1
            log.warn("Device %s [%s] Task stopped collecting to avoid exceeding cycle interval - %s",
                      self._devId, self._manageIp, str(e))
            self._logOidsNotCollected("task was stopped so as not exceed cycle interval")
        except error.TimeoutError as e:
            # Swallowed here; whether the agent counts as down is decided
            # from _responseReceived further down.
            log.debug("Device %s [%s] snmp timed out ", self._devId, self._manageIp)

        # Reaching this point means no Snmpv3Error was raised.
        if self._snmpConnInfo.zSnmpVer == 'v3':
            self._sendStatusEvent('SNMP v3 error cleared', eventKey='snmp_v3_error', severity=Event.Clear)

        # clear cycle exceeded event
        self._sendStatusEvent('Collection run time restored below interval', eventKey='interval_exceeded',
                              severity=Event.Clear)


        if self._responseReceived:
            # clear down event
            self._sendStatusEvent('SNMP agent up', eventKey='agent_down',
                                  severity=Event.Clear)
            if not self._collectedOids:
                #send event if no oids collected - all oids seem to be bad
                oidSample = self._oids.keys()[:self._maxOidsPerRequest]
                oidDetails = {'oids_configured': "%s oids configured for device" % len(self._oids),
                              'oid_sample': "Subset of oids requested %s" % oidSample}
                self._sendStatusEvent('No values returned for configured oids', eventKey='no_oid_results',
                                      details=oidDetails)
            else:
                self._sendStatusEvent('oids collected',
                                      eventKey='no_oid_results', severity=Event.Clear)
                # Complete when every OID not marked bad was collected.
                if len(self._collectedOids) == len(set(self._oids) - self._bad_oids):
                    # this should clear failed to collect some oids event
                    self._sendStatusEvent('Gathered all OIDs', eventKey='partial_oids_collected',
                                          severity=Event.Clear)
                else:
                    summary = 'Failed to collect some OIDs'
                    if taskStopped:
                        summary = '%s - was not able to collect all oids within collection interval' % summary
                    self._sendStatusEvent(summary, eventKey='partial_oids_collected',
                                          severity=Event.Warning)

        else:
            #send event if no response received - all timeouts or other errors
            self._sendStatusEvent('SNMP agent down - no response received', eventKey='agent_down')


    except CycleExceeded as e:
        # Raised by _checkTaskTime once a full cycle interval has elapsed.
        self._cycleExceededCount += 1
        log.warn("Device %s [%s] scan stopped because time exceeded cycle interval, %s", self._devId, self._manageIp
                  , str(e))
        self._logOidsNotCollected('cycle exceeded')
        self._sendStatusEvent('Scan stopped; Collection time exceeded interval - %s' % str(e),
                              eventKey='interval_exceeded')

    except Snmpv3Error as e:
        self._logOidsNotCollected('of %s' % str(e))
        self._snmpV3ErrorCount += 1
        summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format(self, str(e))

        log.error("{0} on {1}".format(summary, self.configId))
        self._sendStatusEvent(summary, eventKey='snmp_v3_error')
    finally:
        # Always log the per-cycle OID counts, whatever the outcome.
        self._logTaskOidInfo(previous_bad_oids)

457

def remove_from_good_oids(self, oids):
    """
    Drop the given OIDs from the good set, in place. OIDs that are not in
    the good set are ignored.

    @param oids: iterable of OIDs to drop
    """
    self._good_oids -= set(oids)

460

def _addBadOids(self, oids):
    """
    Record OIDs as bad and log a warning for each one, so they are tracked
    and stop generating further errors.

    Every given OID is dropped from the good set; only OIDs that are
    actually configured for the device are added to the bad set.

    @param oids: the OIDs that failed to return a value
    """
    # make sure oids aren't in good set
    self.remove_from_good_oids(oids)
    for oid in oids:
        if oid not in self._oids:
            continue
        self._bad_oids.add(oid)
        # first field of each datapoint tuple stored for this OID
        names = [datapoint[0] for datapoint in self._oids[oid]]
        log.warn('Error reading value for %s (%s) on %s' % (
            names, oid, self._devId))

475

def _finished(self, result):
    """
    Callback activated when the task is complete, for success and failure
    alike. Closes the connection and logs how long the run took.

    @parameter result: results of SNMP gets
    @type result: array of (boolean, dictionaries)
    @return: result, unchanged
    """
    try:
        self._close()
    except Exception as ex:
        log.warn("Failed to close device %s: error %s" %
                 (self._devId, str(ex)))

    duration = datetime.now() - self._doTask_start
    overran = duration > timedelta(seconds=self._device.cycleInterval)
    timing = (self.configId, duration.total_seconds(), self._device.cycleInterval)
    if overran:
        log.warn("Collection for %s took %s seconds; cycle interval is %s seconds." % timing)
    else:
        log.debug("Collection time for %s was %s seconds; cycle interval is %s seconds." % timing)

    # Return the result so the framework can track success/failure
    return result

502

def cleanup(self):
    """
    Release the connection held by this task.

    @return: whatever _close() returns
    """
    outcome = self._close()
    return outcome

505

def doTask(self):
    """
    Contact one device and return a deferred which gathers data from
    the device.

    @return: A task to scan the OIDs on a device.
    @rtype: Twisted deferred object
    """
    self._responseReceived = False
    self._doTask_start = datetime.now()

    # See if we need to connect first before doing any collection
    d = defer.maybeDeferred(self._connect)
    d.addCallbacks(self._connectCallback, errback=self._failure)
    d.addCallback(self._doCollectOids)
    # Call _finished for both success and error scenarios
    d.addBoth(self._finished)

    # The caller waits on this Deferred until the whole chain completes
    return d

527 528

def _logTaskOidInfo(self, previous_bad_oids):
    """
    Log the OID counts for the cycle that just ran, plus any OIDs that
    turned bad during it.

    @param previous_bad_oids: the bad OIDs as they were before the cycle
    """
    if log.isEnabledFor(logging.DEBUG):
        configured = len(self._oids)
        log.debug("Device %s [%s] %d of %d OIDs scanned successfully",
                  self._devId, self._manageIp, len(self._collectedOids), configured)
        log.debug("Device %s [%s] has %d good oids, %d bad oids and %d untested oids out of %d configured",
                  self._devId, self._manageIp, len(self._good_oids), len(self._bad_oids),
                  len(self._untestedOids()), configured)

    newBadOids = self._bad_oids.difference(previous_bad_oids)
    if not newBadOids:
        return
    log.info("%s: Detected %s bad oids this cycle", self.name, len(newBadOids))
    log.debug("%s: Bad oids detected - %s", self.name, newBadOids)

542

def _logOidsNotCollected(self, reason):
    """
    Debug-log the OIDs that are still uncollected, if there are any.

    @param reason: text inserted after "because" in the log message
    """
    missed = self._uncollectedOids()
    if not missed:
        return
    log.debug("%s Oids not collected because %s - %s" % (self.name, reason, str(missed)))

547 548

def _connect(self):
    """
    Create a connection to the remote device.

    A new session is created and opened when there is no proxy yet, or
    when the existing proxy was built from different connection info.

    @return: the SNMP proxy for this device
    """
    self.state = SnmpPerformanceCollectionTask.STATE_CONNECTING
    proxy = self._snmpProxy
    stale = proxy is None or proxy._snmpConnInfo != self._snmpConnInfo
    if stale:
        self._snmpProxy = self._snmpConnInfo.createSession(
            protocol=self._snmpPort.protocol,
            allowCache=True)
        self._snmpProxy.open()
    return self._snmpProxy

561

def _close(self):
    """
    Close down the connection to the remote device and forget the proxy.
    """
    proxy = self._snmpProxy
    if proxy:
        proxy.close()
    self._snmpProxy = None

569 570

def displayStatistics(self):
    """
    Called by the collector framework scheduler, and allows us to
    see how each task is doing.

    @return: multi-line text holding the SNMP version, the error counters,
             the OID counts and, when there is one, the last error message
    @rtype: string
    """
    name = self.name
    parts = [
        "%s using SNMP %s\n" % (name, self._snmpConnInfo.zSnmpVer),
        "%s Cycles Exceeded: %s; V3 Error Count: %s; Stopped Task Count: %s\n" % (
            name, self._cycleExceededCount, self._snmpV3ErrorCount, self._stoppedTaskCount),
        "%s OIDs configured: %d \n" % (
            name, len(self._oids.keys())),
        "%s Good OIDs: %d - %s\n" % (
            name, len(self._good_oids), self._good_oids),
        "%s Bad OIDs: %d - %s\n" % (
            name, len(self._bad_oids), self._bad_oids),
    ]
    if self._lastErrorMsg:
        parts.append("%s\n" % self._lastErrorMsg)
    return "".join(parts)

Source Code for Module Products.ZenRRD.zenperfsnmp