| Trees | Indices | Help |
|
|---|
|
|
1 #! /usr/bin/env python
2 ##############################################################################
3 #
4 # Copyright (C) Zenoss, Inc. 2007, 2010, 2011, all rights reserved.
5 #
6 # This content is made available according to terms specified in
7 # License.zenoss under the directory where your Zenoss product is installed.
8 #
9 ##############################################################################
10
11
12 __doc__="""zenperfsnmp
13
14 Gets SNMP performance data and stores it in RRD files.
15
16 """
17
18 from datetime import datetime, timedelta
19 from collections import deque
20 import random
21 import logging
22 log = logging.getLogger("zen.zenperfsnmp")
23
24 import Globals
25 import zope.interface
26
27 from twisted.internet import defer, error
28 from twisted.python.failure import Failure
29 from pynetsnmp.twistedsnmp import AgentProxy, snmpprotocol, Snmpv3Error
30
31 from Products.ZenCollector.daemon import CollectorDaemon
32 from Products.ZenCollector.interfaces import ICollectorPreferences,\
33 IDataService,\
34 IEventService,\
35 IScheduledTask
36 from Products.ZenCollector.tasks import SimpleTaskFactory,\
37 SimpleTaskSplitter,\
38 TaskStates, \
39 BaseTask
40
41 from Products.ZenEvents.ZenEventClasses import Status_Snmp
42 from Products.ZenEvents import Event
43
44 # We retrieve our configuration data remotely via a Twisted PerspectiveBroker
45 # connection. To do so, we need to import the class that will be used by the
46 # configuration service to send the data over, i.e. SnmpDeviceProxy.
47 from Products.ZenUtils.Utils import unused
48 from Products.ZenHub.services.SnmpPerformanceConfig import SnmpDeviceProxy
49 unused(SnmpDeviceProxy)
50 from Products.ZenHub.services.PerformanceConfig import SnmpConnInfo
51 unused(SnmpConnInfo)
52
53 COLLECTOR_NAME = "zenperfsnmp"
54 MAX_BACK_OFF_MINUTES = 20
58 zope.interface.implements(ICollectorPreferences)
59
61 """
62 Constructs a new SnmpPerformanceCollectionPreferences instance and
63 provides default values for needed attributes.
64 """
65 self.collectorName = COLLECTOR_NAME
66 self.defaultRRDCreateCommand = None
67 self.configCycleInterval = 20 # minutes
68 self.cycleInterval = 5 * 60 # seconds
69
70 # The configurationService attribute is the fully qualified class-name
71 # of our configuration service that runs within ZenHub
72 self.configurationService = 'Products.ZenHub.services.SnmpPerformanceConfig'
73
74 # Will be filled in based on buildOptions
75 self.options = None
76
78 parser.add_option('--showrawresults',
79 dest='showrawresults',
80 action="store_true",
81 default=False,
82 help="Show the raw RRD values. For debugging purposes only.")
83
84 parser.add_option('--maxbackoffminutes',
85 dest='maxbackoffminutes',
86 default=MAX_BACK_OFF_MINUTES,
87 type='int',
88 help="Deprecated since 4.1.1. No longer used")
89
90 parser.add_option('--triespercycle',
91 dest='triesPerCycle',
92 default=2,
93 type='int',
94 help="How many attempts per cycle should be made to get data for an OID from a "\
95 "non-responsive device. Minimum of 2")
96
97 parser.add_option('--maxtimeouts',
98 dest='maxTimeouts',
99 default=3,
100 type='int',
101 help="How many consecutive time outs per cycle before stopping attempts to collect")
102
103
106
110
113
114 STATUS_EVENT = { 'eventClass' : Status_Snmp,
115 'eventGroup' : 'SnmpTest' }
118 """
119 A task that performs periodic performance collection for devices providing
120 data via SNMP agents.
121 """
122 zope.interface.implements(IScheduledTask)
123
124 STATE_CONNECTING = 'CONNECTING'
125 STATE_FETCH_PERF = 'FETCH_PERF_DATA'
126 STATE_STORE_PERF = 'STORE_PERF_DATA'
127
133 """
134 @param deviceId: the Zenoss deviceId to watch
135 @type deviceId: string
136 @param taskName: the unique identifier for this task
137 @type taskName: string
138 @param scheduleIntervalSeconds: the interval at which this task will be
139 collected
140 @type scheduleIntervalSeconds: int
141 @param taskConfig: the configuration for this task
142 """
143 super(SnmpPerformanceCollectionTask, self).__init__(
144 deviceId, taskName,
145 taskConfig.cycleInterval, taskConfig
146 )
147
148 # Needed for interface
149 self.name = taskName
150 self.configId = deviceId
151 self.state = TaskStates.STATE_IDLE
152
153 # The taskConfig corresponds to a DeviceProxy
154 self._device = taskConfig
155 self._devId = self._device.id
156 self._manageIp = self._device.snmpConnInfo.manageIp
157 self._maxOidsPerRequest = self._device.zMaxOIDPerRequest
158 log.debug("SnmpPerformanceCollectionTask.__init__: self._maxOidsPerRequest=%s" % self._maxOidsPerRequest)
159 self.interval = self._device.cycleInterval
160 self._collectedOids = set()
161
162 self._dataService = zope.component.queryUtility(IDataService)
163 self._eventService = zope.component.queryUtility(IEventService)
164
165 self._preferences = zope.component.queryUtility(ICollectorPreferences,
166 COLLECTOR_NAME)
167
168 self._snmpProxy = None
169 self._snmpConnInfo = self._device.snmpConnInfo
170 self._oids = self._device.oids
171 self._oidDeque = deque(self._oids.keys())
172 self._good_oids = set()
173 #oids not returning data
174 self._bad_oids = set()
175 self._snmpPort = snmpprotocol.port()
176 self.triesPerCycle = max(2, self._preferences.options.triesPerCycle)
177 self._maxTimeouts = self._preferences.options.maxTimeouts
178
179 self._lastErrorMsg = ''
180 self._cycleExceededCount = 0
181 self._stoppedTaskCount = 0
182 self._snmpV3ErrorCount = 0
183
184 #whether or not we got a response during a collection interval
185 self._responseReceived = False
186
188 """
189 Twisted errBack to log the exception for a single device.
190
191 @parameter reason: explanation of the failure
192 @type reason: Twisted error instance
193 """
194 msg = reason.getErrorMessage()
195 if not msg: # Sometimes we get blank error messages
196 msg = reason.__class__
197 msg = '%s %s' % (self._devId, msg)
198
199 # Leave 'reason' alone to generate a traceback
200
201 if self._lastErrorMsg != msg:
202 self._lastErrorMsg = msg
203 if msg:
204 log.error(msg)
205
206 return reason
207
209 """
210 Callback called after a successful connect to the remote device.
211 """
212 # If we want to model things first before doing collection,
213 # that code goes here.
214 log.debug("Connected to %s [%s] using SNMP %s", self._devId, self._manageIp, self._snmpConnInfo.zSnmpVer)
215 self._collectedOids.clear()
216 return result
217
219 elapsed = datetime.now() - self._doTask_start  # time used so far, measured from self._doTask_start
220
221 if elapsed >= timedelta(seconds=self._device.cycleInterval):  # the whole cycle interval is already used up
222 raise CycleExceeded(
223 "Elapsed time %s seconds greater than %s seconds" % (elapsed.total_seconds(), self._device.cycleInterval))
224 # check whether we are about to run out of time (99% of the cycle interval used); if so, stop the task
225 if elapsed >= timedelta(seconds=self._device.cycleInterval*.99):
226 raise StopTask("Elapsed time %s sec" % elapsed.total_seconds())
227
229 return set(self._oids) - self._bad_oids - self._good_oids
230
232 return set(self._oids) - self._bad_oids - self._collectedOids
233
234 @defer.inlineCallbacks
236 """
237 Get performance data for all the monitored components on a device
238 """
239 log.debug("Retrieving OIDs from %s [%s]", self._devId, self._manageIp)
240 if not self._oids:
241 defer.returnValue(None)
242
243 # collect the untested oids and the known-good oids in chunks
244 # on the first run every oid is untested, since none is in the good oid set or the bad oid set yet
245 oids_to_test = list(self._untestedOids())
246 oids_to_test.extend(self._good_oids)
247 log.debug('%s [%s] collecting %s oids out of %s', self._devId, self._manageIp, len(oids_to_test), len(self._oids))
248 chunk_size = self._maxOidsPerRequest
249 maxTries = self.triesPerCycle
250 try_count = 0
251 consecutiveTimeouts = 0
252 while oids_to_test and try_count < maxTries:
253 try_count += 1
254 if try_count > 1:
255 log.debug("%s [%s] some oids still uncollected after %s tries, trying again with chunk size %s", self._devId,
256 self._manageIp, try_count - 1, chunk_size)
257 oid_chunks = self.chunk(oids_to_test, chunk_size)
258 for oid_chunk in oid_chunks:
259 try:
260 self._checkTaskTime()
261 log.debug("Fetching OID chunk size %s from %s [%s] - %s", chunk_size, self._devId, self._manageIp, oid_chunk)
262 yield self._fetchPerfChunk(oid_chunk)
263 consecutiveTimeouts = 0
264 log.debug("Finished fetchPerfChunk call %s [%s]", self._devId, self._manageIp)
265 except error.TimeoutError as e:
266 log.debug("timeout for %s [%s] oids - %s", self._devId, self._manageIp, oid_chunk)
267 consecutiveTimeouts += 1
268 if consecutiveTimeouts >= self._maxTimeouts:
269 log.debug("%s consecutive timeouts, abandoning run for %s [%s]", consecutiveTimeouts,
270 self._devId, self._manageIp)
271 raise  # re-raise the TimeoutError so the rest of this run is abandoned
272 # can still have untested oids from a chunk that failed to return data; one or more of those may be bad.
273 # retry them one oid per request to identify the bad oid(s). Good oids can also be uncollected because of timeouts
274 oids_to_test = list(self._uncollectedOids())
275 chunk_size = 1
276
277
278
279 @defer.inlineCallbacks
281 self.state = SnmpPerformanceCollectionTask.STATE_FETCH_PERF
282 update_x = {}
283 try:
284 update_x = yield self._snmpProxy.get(oid_chunk, self._snmpConnInfo.zSnmpTimeout, self._snmpConnInfo.zSnmpTries)
285 except error.TimeoutError, e:
286 raise
287 except Exception, e:
288 log.warning('Failed to collect on {0} ({1.__class__.__name__}: {1})'.format(self.configId, e))
289 #something happened, not sure what.
290 raise
291 finally:
292 self.state = TaskStates.STATE_RUNNING
293 update = {}
294
295 # we got a response
296 self._responseReceived = True
297 #remove leading and trailing dots
298 for oid, value in dict(update_x).items():
299 update[oid.strip('.')] = value
300
301 if not update:
302 # empty update is probably a bad OID in the request somewhere, remove them from good oids. These will run in
303 # single mode so we can figure out which ones are good or bad
304 if len(oid_chunk) == 1:
305 self.remove_from_good_oids(oid_chunk)
306 self._addBadOids(oid_chunk)
307 log.warn("No return result, marking as bad oid: {%s} {%s}" % (self.configId, oid_chunk))
308 else:
309 log.warn("No return result, will run in separately to determine which oids are valid: {%s} {%s}" % (
310 self.configId, oid_chunk))
311 self.remove_from_good_oids(oid_chunk)
312
313 else:
314 for oid in oid_chunk:
315 if oid not in update:
316 log.error("SNMP get did not return result: {0} {1}".format(self.configId, oid))
317 self.remove_from_good_oids([oid])
318 self._addBadOids([oid])
319 self.state=SnmpPerformanceCollectionTask.STATE_STORE_PERF
320 try:
321 for oid, value in update.items():
322
323 if oid not in self._oids:
324 log.error("SNMP get returned unexpected OID: {0} {1}".format(self.configId, oid))
325 continue
326
327 # We should always get something useful back
328 if value == '' or value is None:
329 log.error("SNMP get returned empty value: {0} {1}".format(self.configId, oid))
330 self.remove_from_good_oids([oid])
331 self._addBadOids([oid])
332 continue
333
334 self._good_oids.add(oid)
335 self._bad_oids.discard(oid)
336 self._collectedOids.add(oid)
337 # An OID's data can be stored multiple times
338 for rrdMeta in self._oids[oid]:
339 try:
340 cname, path, rrdType, rrdCommand, rrdMin, rrdMax = rrdMeta
341 self._dataService.writeRRD(
342 path, value, rrdType,
343 rrdCommand=rrdCommand,
344 cycleTime=self._device.cycleInterval,
345 min=rrdMin, max=rrdMax)
346 except Exception, e:
347 log.error("Failed to write to RRD file: {0} {1.__class__.__name__} {1}".format(path, e))
348 continue
349 finally:
350 self.state = TaskStates.STATE_RUNNING
351
352 @defer.inlineCallbacks
354 if previous_bad_oids:
355 log.debug("%s Re-checking %s bad oids", self.name, len(previous_bad_oids))
356 oids_to_test = set(previous_bad_oids)
357 num_checked = 0
358 max_bad_check = max(10, self._maxOidsPerRequest)
359 while num_checked < max_bad_check and oids_to_test:
360 self._checkTaskTime()
361 # using deque as a rotating list so that next time we start where we left off
362 oid = self._oidDeque[0] # get the first one
363 self._oidDeque.rotate(1) # rotate right by one: the last oid becomes the new head, so the deque is walked back to front
364 if oid in oids_to_test: # fetch if we care
365 oids_to_test.remove(oid)
366 num_checked += 1
367 try:
368 yield self._fetchPerfChunk([oid])
369 except error.TimeoutError, e:
370 log.debug('%s timed out re-checking bad oid %s', self.name, oid)  # a timeout on a re-check is only logged; the loop moves on
371
373 if details is None:
374 details = {}
375 event = details.copy()
376 event.update(STATUS_EVENT)
377 self._eventService.sendEvent(event,
378 severity=severity,
379 device=self.configId,
380 eventKey=eventKey,
381 summary=summary)
382
383 @defer.inlineCallbacks
385 previous_bad_oids=list(self._bad_oids)
386 taskStopped = False
387
388 try:
389 try:
390 yield self._fetchPerf()
391 # we have time; try to collect previous bad oids:
392 yield self._processBadOids(previous_bad_oids)
393 except StopTask as e:
394 taskStopped = True
395 self._stoppedTaskCount += 1
396 log.warn("Device %s [%s] Task stopped collecting to avoid exceeding cycle interval - %s",
397 self._devId, self._manageIp, str(e))
398 self._logOidsNotCollected("task was stopped so as not exceed cycle interval")
399 except error.TimeoutError as e:
400 log.debug("Device %s [%s] snmp timed out ", self._devId, self._manageIp)
401
402 if self._snmpConnInfo.zSnmpVer == 'v3':
403 self._sendStatusEvent('SNMP v3 error cleared', eventKey='snmp_v3_error', severity=Event.Clear)
404
405 # clear cycle exceeded event
406 self._sendStatusEvent('Collection run time restored below interval', eventKey='interval_exceeded',
407 severity=Event.Clear)
408
409
410 if self._responseReceived:
411 # clear down event
412 self._sendStatusEvent('SNMP agent up', eventKey='agent_down',
413 severity=Event.Clear)
414 if not self._collectedOids:
415 #send event if no oids collected - all oids seem to be bad
416 oidSample = self._oids.keys()[:self._maxOidsPerRequest]
417 oidDetails = {'oids_configured': "%s oids configured for device" % len(self._oids),
418 'oid_sample': "Subset of oids requested %s" % oidSample}
419 self._sendStatusEvent('No values returned for configured oids', eventKey='no_oid_results',
420 details=oidDetails)
421 else:
422 self._sendStatusEvent('oids collected',
423 eventKey='no_oid_results', severity=Event.Clear)
424 if len(self._collectedOids) == len(set(self._oids) - self._bad_oids):
425 # this should clear failed to collect some oids event
426 self._sendStatusEvent('Gathered all OIDs', eventKey='partial_oids_collected',
427 severity=Event.Clear)
428 else:
429 summary = 'Failed to collect some OIDs'
430 if taskStopped:
431 summary = '%s - was not able to collect all oids within collection interval' % summary
432 self._sendStatusEvent(summary, eventKey='partial_oids_collected',
433 severity=Event.Warning)
434
435 else:
436 #send event if no response received - all timeouts or other errors
437 self._sendStatusEvent('SNMP agent down - no response received', eventKey='agent_down')
438
439
440 except CycleExceeded as e:
441 self._cycleExceededCount += 1
442 log.warn("Device %s [%s] scan stopped because time exceeded cycle interval, %s", self._devId, self._manageIp
443 , str(e))
444 self._logOidsNotCollected('cycle exceeded')
445 self._sendStatusEvent('Scan stopped; Collection time exceeded interval - %s' % str(e),
446 eventKey='interval_exceeded')
447
448 except Snmpv3Error as e:
449 self._logOidsNotCollected('of %s' % str(e))
450 self._snmpV3ErrorCount += 1
451 summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format(self, str(e))
452
453 log.error("{0} on {1}".format(summary, self.configId))
454 self._sendStatusEvent(summary, eventKey='snmp_v3_error')
455 finally:
456 self._logTaskOidInfo(previous_bad_oids)
457
460
462 """
463 Report any bad OIDs and then track the OID so we
464 don't generate any further errors.
465 """
466 # make sure oids aren't in good set
467 self.remove_from_good_oids(oids)
468 for oid in oids:
469 if oid in self._oids:
470 self._bad_oids.add(oid)
471 names = [dp[0] for dp in self._oids[oid]]
472 summary = 'Error reading value for %s (%s) on %s' % (
473 names, oid, self._devId)
474 log.warn(summary)
475
477 """
478 Callback activated when the task is complete
479
480 @parameter result: results of SNMP gets
481 @type result: array of (boolean, dictionaries)
482 """
483
484 try:
485 self._close()
486 except Exception, ex:
487 log.warn("Failed to close device %s: error %s" %
488 (self._devId, str(ex)))
489
490 doTask_end = datetime.now()
491 duration = doTask_end - self._doTask_start
492 if duration > timedelta(seconds=self._device.cycleInterval):
493 log.warn("Collection for %s took %s seconds; cycle interval is %s seconds." % (
494 self.configId, duration.total_seconds(), self._device.cycleInterval))
495 else:
496 log.debug("Collection time for %s was %s seconds; cycle interval is %s seconds." % (
497 self.configId, duration.total_seconds(), self._device.cycleInterval))
498
499
500 # Return the result so the framework can track success/failure
501 return result
502
504 return self._close()
505
507 """
508 Contact to one device and return a deferred which gathers data from
509 the device.
510
511 @return: A task to scan the OIDs on a device.
512 @rtype: Twisted deferred object
513 """
514 self._doTask_start = datetime.now()
515 self._responseReceived = False
516 # See if we need to connect first before doing any collection
517 d = defer.maybeDeferred(self._connect)
518 d.addCallbacks(self._connectCallback, self._failure)
519
520
521 d.addCallback(self._doCollectOids)
522 # Call _finished for both success and error scenarios
523 d.addBoth(self._finished)
524
525 # Return the Deferred without waiting; it fires after the connect, collect and _finished steps have run
526 return d
527
528
530 if log.isEnabledFor(logging.DEBUG):
531 log.debug("Device %s [%s] %d of %d OIDs scanned successfully",
532 self._devId, self._manageIp, len(self._collectedOids), len(self._oids))
533 untested_oids = self._untestedOids()
534 log.debug("Device %s [%s] has %d good oids, %d bad oids and %d untested oids out of %d configured",
535 self._devId, self._manageIp, len(self._good_oids), len(self._bad_oids), len(untested_oids),
536 len(self._oids))
537
538 newBadOids = self._bad_oids - set(previous_bad_oids)
539 if newBadOids:
540 log.info("%s: Detected %s bad oids this cycle", self.name, len(newBadOids))
541 log.debug("%s: Bad oids detected - %s", self.name, newBadOids)
542
544 oidsNotCollected = self._uncollectedOids()
545 if oidsNotCollected:
546 log.debug("%s Oids not collected because %s - %s" % (self.name, reason, str(oidsNotCollected)))
547
548
550 """
551 Create a connection to the remote device
552 """
553 self.state = SnmpPerformanceCollectionTask.STATE_CONNECTING
554 if (self._snmpProxy is None or
555 self._snmpProxy._snmpConnInfo != self._snmpConnInfo):
556 self._snmpProxy = self._snmpConnInfo.createSession(
557 protocol=self._snmpPort.protocol,
558 allowCache=True)
559 self._snmpProxy.open()
560 return self._snmpProxy
561
563 """
564 Close down the connection to the remote device
565 """
566 if self._snmpProxy:
567 self._snmpProxy.close()
568 self._snmpProxy = None
569
570
572 """
573 Called by the collector framework scheduler, and allows us to
574 see how each task is doing.
575 """
576 display = "%s using SNMP %s\n" % (self.name, self._snmpConnInfo.zSnmpVer)
577 display += "%s Cycles Exceeded: %s; V3 Error Count: %s; Stopped Task Count: %s\n" % (
578 self.name, self._cycleExceededCount, self._snmpV3ErrorCount, self._stoppedTaskCount)
579 display += "%s OIDs configured: %d \n" % (
580 self.name, len(self._oids.keys()))
581 display += "%s Good OIDs: %d - %s\n" % (
582 self.name, len(self._good_oids), self._good_oids)
583 display += "%s Bad OIDs: %d - %s\n" % (
584 self.name, len(self._bad_oids), self._bad_oids)
585
586 if self._lastErrorMsg:
587 display += "%s\n" % self._lastErrorMsg
588 return display
589
590
591 if __name__ == '__main__':  # script entry point: wire preferences, task factory and task splitter into the collector daemon
592 myPreferences = SnmpPerformanceCollectionPreferences()
593 myTaskFactory = SimpleTaskFactory(SnmpPerformanceCollectionTask)
594 myTaskSplitter = SimpleTaskSplitter(myTaskFactory)
595 daemon = CollectorDaemon(myPreferences, myTaskSplitter)
596 daemon.run()
597
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1.1812 on Mon Jul 30 17:11:31 2012 | http://epydoc.sourceforge.net |