| Trees | Indices | Help |
|
|---|
|
|
1 #! /usr/bin/env python
2 ###########################################################################
3 #
4 # This program is part of Zenoss Core, an open source monitoring platform.
5 # Copyright (C) 2007, 2010 Zenoss Inc.
6 #
7 # This program is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License version 2 or (at your
9 # option) any later version as published by the Free Software Foundation.
10 #
11 # For complete information please visit: http://www.zenoss.com/oss/
12 #
13 ###########################################################################
14
15 __doc__="""zenperfsnmp
16
17 Gets SNMP performance data and stores it in RRD files.
18
19 """
20 import random
21 import logging
22 log = logging.getLogger("zen.zenperfsnmp")
23
24 import Globals
25 import zope.interface
26
27 from twisted.internet import defer, error
28 from twisted.python.failure import Failure
29 from pynetsnmp.twistedsnmp import AgentProxy, snmpprotocol, Snmpv3Error
30
31 from Products.ZenCollector.daemon import CollectorDaemon
32 from Products.ZenCollector.interfaces import ICollectorPreferences,\
33 IDataService,\
34 IEventService,\
35 IScheduledTask
36 from Products.ZenCollector.tasks import SimpleTaskFactory,\
37 SimpleTaskSplitter,\
38 TaskStates, \
39 BaseTask
40 from Products.ZenUtils.Utils import importClass, readable_time
41 from Products.ZenUtils.Chain import Chain
42 from Products.ZenEvents.ZenEventClasses import Perf_Snmp, Status_Snmp, Status_Perf
43 from Products.ZenEvents import Event
44
45 # We retrieve our configuration data remotely via a Twisted PerspectiveBroker
46 # connection. To do so, we need to import the class that will be used by the
47 # configuration service to send the data over, i.e. SnmpDeviceProxy.
48 from Products.ZenUtils.Utils import unused
49 from Products.ZenHub.services.SnmpPerformanceConfig import SnmpDeviceProxy
50 unused(SnmpDeviceProxy)
51 from Products.ZenHub.services.PerformanceConfig import SnmpConnInfo
52 unused(SnmpConnInfo)
53
54 COLLECTOR_NAME = "zenperfsnmp"
55 MAX_BACK_OFF_MINUTES = 20
56
57
59 zope.interface.implements(ICollectorPreferences)
60
62 """
63 Constructs a new SnmpPerformanceCollectionPreferences instance and
64 provides default values for needed attributes.
65 """
66 self.collectorName = COLLECTOR_NAME
67 self.defaultRRDCreateCommand = None
68 self.configCycleInterval = 20 # minutes
69 self.cycleInterval = 5 * 60 # seconds
70
71 # The configurationService attribute is the fully qualified class-name
72 # of our configuration service that runs within ZenHub
73 self.configurationService = 'Products.ZenHub.services.SnmpPerformanceConfig'
74
75 # Will be filled in based on buildOptions
76 self.options = None
77
79 parser.add_option('--showrawresults',
80 dest='showrawresults',
81 action="store_true",
82 default=False,
83 help="Show the raw RRD values. For debugging purposes only.")
84
85 parser.add_option('--maxbackoffminutes',
86 dest='maxbackoffminutes',
87 default=MAX_BACK_OFF_MINUTES,
88 help="When a device fails to respond, increase the time to" \
89 " check on the device until this limit.")
90
93
94
97
98 STATUS_EVENT = { 'eventClass' : Status_Snmp,
99 'component' : 'snmp',
100 'eventGroup' : 'SnmpTest' }
101
103 """
104 A task that performs periodic performance collection for devices providing
105 data via SNMP agents.
106 """
107 zope.interface.implements(IScheduledTask)
108
109 STATE_CONNECTING = 'CONNECTING'
110 STATE_FETCH_PERF = 'FETCH_PERF_DATA'
111 STATE_STORE_PERF = 'STORE_PERF_DATA'
112
118 """
119 @param deviceId: the Zenoss deviceId to watch
120 @type deviceId: string
121 @param taskName: the unique identifier for this task
122 @type taskName: string
123 @param scheduleIntervalSeconds: the interval at which this task will be
124 collected
125 @type scheduleIntervalSeconds: int
126 @param taskConfig: the configuration for this task
127 """
128 super(SnmpPerformanceCollectionTask, self).__init__(
129 deviceId, taskName,
130 taskConfig.cycleInterval, taskConfig
131 )
132
133 # Needed for interface
134 self.name = taskName
135 self.configId = deviceId
136 self.state = TaskStates.STATE_IDLE
137
138 # The taskConfig corresponds to a DeviceProxy
139 self._device = taskConfig
140 self._devId = self._device.id
141 self._manageIp = self._device.snmpConnInfo.manageIp
142 self._maxOidsPerRequest = self._device.zMaxOIDPerRequest
143 log.debug("SnmpPerformanceCollectionTask.__init__: self._maxOidsPerRequest=%s" % self._maxOidsPerRequest)
144 self.interval = self._device.cycleInterval
145 self._singleOidMode = False
146 self._collectedOids = 0
147
148 self._dataService = zope.component.queryUtility(IDataService)
149 self._eventService = zope.component.queryUtility(IEventService)
150
151 self._preferences = zope.component.queryUtility(ICollectorPreferences,
152 COLLECTOR_NAME)
153
154 self._snmpProxy = None
155 self._snmpConnInfo = self._device.snmpConnInfo
156 self._oids = self._device.oids
157 self._snmpStatusFailures = 0
158 self._snmpPort = snmpprotocol.port()
159 self._maxbackoffseconds = self._preferences.options.maxbackoffminutes * 60
160
161 self._lastErrorMsg = ''
162
164 """
165 Twisted errback that logs the failure for a single device.
166
167 @parameter reason: explanation of the failure
168 @type reason: Twisted error instance
169 """
170 self._snmpStatusFailures += 1
171 # Decode the exception
172 if isinstance(reason.value, error.TimeoutError):
173 msg = ('SNMP agent down (%s second timeout connecting to'
174 ' device %s)') % (self._snmpConnInfo.zSnmpTimeout, self._devId)
175 # Indicate that we've handled the error by
176 # not returning a result
177 reason = None
178
179 elif isinstance(reason.value, Snmpv3Error):
180 msg = ("Cannot connect to SNMP agent on {0._devId}: {1.value}").format(self, reason)
181 reason = None
182
183 elif isinstance(reason.value, SingleOidSwitchException):
184 return # Just wait for the next cycle
185
186 else:
187 msg = reason.getErrorMessage()
188 if not msg: # Sometimes we get blank error messages
189 msg = reason.__class__
190 msg = '%s %s' % (self._devId, msg)
191
192 # Leave 'reason' alone to generate a traceback
193
194 if self._lastErrorMsg != msg:
195 self._lastErrorMsg = msg
196 if msg:
197 log.error(msg)
198
199 self._eventService.sendEvent(STATUS_EVENT,
200 device=self._devId,
201 summary=msg,
202 severity=Event.Error)
203 self._delayNextCheck()
204
205 return reason
206
208 """
209 Callback called after a successful connect to the remote device.
210 """
211 # If we want to model things first before doing collection,
212 # that code goes here.
213 log.debug("Connected to %s [%s]", self._devId, self._manageIp)
214 self._collectedOids = 0
215 return result
216
218 """
219 Get performance data for all the monitored components on a device
220
221 @parameter ignored: required to keep Twisted's callback chain happy
222 @type ignored: result of previous callback
223 """
224 self.state = SnmpPerformanceCollectionTask.STATE_FETCH_PERF
225 if not self._oids:
226 return defer.succeed(([]))
227
228 # Either get as many OIDs as we can or one-by-one
229 oidsPerRequest = self._maxOidsPerRequest if not self._singleOidMode else 1
230 log.debug("Retrieving OIDs from %s [%s] oidsPerRequest=%s", self._devId, self._manageIp, oidsPerRequest)
231
232 d = Chain(self._get, iter(self.chunk(self._oids.keys(), oidsPerRequest))).run()
233 d.addCallback(self._checkOidResults)
234 d.addCallback(self._storeOidResults)
235 d.addCallback(self._updateStatus)
236 d.addErrback(self._failure)
237 return d
238
240 """
241 Decode responses from the device and sanity check the responses
242
243 @parameter results: results of SNMP gets
244 @type results: array of (boolean, dictionaries)
245 """
246 if not results:
247 summary = 'Unable to retrieve OIDs from device %s' % \
248 self._devId
249 self._eventService.sendEvent(STATUS_EVENT,
250 device=self._devId,
251 summary=summary,
252 severity=Event.Error)
253 log.info(summary)
254 return defer.fail(summary)
255
256 # Look for problems
257 for success, update in results:
258 # empty update is probably a bad OID in the request somewhere
259 if success and not update and not self._singleOidMode:
260 self._singleOidMode = True
261 msg = 'Error collecting data on %s -- retrying in single-OID mode' % \
262 self._devId
263 log.warn(msg)
264 return defer.fail(SingleOidSwitchException(msg)) # Wait for the next cycle
265
266 if not success:
267 if isinstance(update, Failure) and \
268 isinstance(update.value, (error.TimeoutError, Snmpv3Error)):
269 return defer.fail(update)
270 else:
271 log.warning('Failed to collect on %s (%s: %s)',
272 self._devId,
273 update.__class__,
274 update)
275 return results
276
278 """
279 Store the OID values in RRD files
280
281 @parameter results: results of SNMP gets
282 @type results: array of (boolean, dictionaries)
283 """
284 self.state = SnmpPerformanceCollectionTask.STATE_STORE_PERF
285 oidsReceived = set()
286 successCount = 0
287 for success, update in results:
288 if not success:
289 continue
290
291 successCount += 1
292
293 # Casting update to a dict here is unnecessary in all known cases.
294 # See ticket #7347 for a bug where update would be a tuple at this
295 # point instead of a dict. This cast fixes that problem.
296 for oid, value in dict(update).items():
297 oid = oid.strip('.')
298 if oid not in self._oids:
299 log.error("OID %s is not in %s", oid, self._oids.keys())
300 continue
301
302 # We should always get something useful back
303 if value == '' or value is None:
304 log.debug("Got bad value: oid=%s value=%s" % (oid, value))
305 self._badOid(oid)
306 continue
307
308 self._collectedOids += 1
309 oidsReceived.add(oid)
310 # An OID's data can be stored multiple times
311 for rrdMeta in self._oids[oid]:
312 cname, path, rrdType, rrdCommand, rrdMin, rrdMax = rrdMeta
313 self._dataService.writeRRD(path, value, rrdType,
314 rrdCommand=rrdCommand,
315 min=rrdMin, max=rrdMax)
316
317 if successCount == len(results) and self._singleOidMode:
318 # Remove any oids that didn't report
319 for doomed in set(self._oids.keys()) - oidsReceived:
320 log.debug("Removing OID %s (no response)" % doomed)
321 self._badOid(doomed)
322
323 success = True
324 if results:
325 success = successCount > 0
326
327 return success
328
330 """
331 Callback activated when the task is complete
332
333 @parameter result: results of SNMP gets
334 @type result: array of (boolean, dictionaries)
335 """
336 if not isinstance(result, Failure):
337 log.debug("Device %s [%s] %d of %d OIDs scanned successfully",
338 self._devId, self._manageIp, self._collectedOids,
339 len(self._oids.keys()))
340 self._returnToNormalSchedule()
341 else:
342 log.debug("Device %s [%s] scanned failed, %s",
343 self._devId, self._manageIp, result.getErrorMessage())
344
345 try:
346 self._close()
347 except Exception, ex:
348 log.warn("Failed to close device %s: error %s" %
349 (self._devId, str(ex)))
350
351 # Return the result so the framework can track success/failure
352 return result
353
355 return self._close()
356
358 """
359 Connect to one device and return a deferred that gathers data from
360 the device.
361
362 @return: A task to scan the OIDs on a device.
363 @rtype: Twisted deferred object
364 """
365 # See if we need to connect first before doing any collection
366 d = defer.maybeDeferred(self._connect)
367 d.addCallbacks(self._connectCallback, self._failure)
368 d.addCallback(self._fetchPerf)
369
370 # Call _finished for both success and error scenarios
371 d.addBoth(self._finished)
372
373 # Wait until the Deferred actually completes
374 return d
375
377 """
378 Perform SNMP get for specified OIDs
379
380 @parameter oids: OIDs to gather
381 @type oids: list of strings
382 @return: Twisted deferred
383 @rtype: Twisted deferred
384 """
385 return self._snmpProxy.get(oids,
386 self._snmpConnInfo.zSnmpTimeout,
387 self._snmpConnInfo.zSnmpTries)
388
390 """
391 Create a connection to the remote device
392 """
393 self.state = SnmpPerformanceCollectionTask.STATE_CONNECTING
394 if (self._snmpProxy is None or
395 self._snmpProxy._snmpConnInfo != self._snmpConnInfo):
396 self._snmpProxy = self._snmpConnInfo.createSession(
397 protocol=self._snmpPort.protocol,
398 allowCache=True)
399 self._snmpProxy.open()
400 log.debug("SnmpPerformanceCollectionTask._connect: Connected to %s" % self._snmpConnInfo.manageIp)
401 return self._snmpProxy
402
404 """
405 Close down the connection to the remote device
406 """
407 if self._snmpProxy:
408 self._snmpProxy.close()
409 self._snmpProxy = None
410
412 """
413 Send up/down events based on SNMP results
414
415 @parameter success: Did everything work?
416 @type success: boolean
417 """
418 if success:
419 # As we might not be the process that detected
420 # something was down, always send clear events.
421 # These are deduped out by the daemon code.
422 summary = 'Gathered all OIDs'
423 self._eventService.sendEvent(STATUS_EVENT,
424 device=self._devId, summary=summary,
425 severity=Event.Clear)
426 if self._snmpStatusFailures > 0:
427 log.info("%s %s", self._devId, summary)
428 self._snmpStatusFailures = 0
429
430 if not self._lastErrorMsg:
431 log.info("%s returned back to normal operations",
432 self._devId)
433 self._lastErrorMsg = ''
434 if self.interval != self._device.cycleInterval:
435 # Setting the value kicks off observers, so don't
436 # reset unless necessary
437 self.interval = self._device.cycleInterval
438
439 else:
440 summary = 'Failed to collect all OIDs'
441 self._eventService.sendEvent(STATUS_EVENT,
442 device=self._devId, summary=summary,
443 severity=Event.Warning)
444 log.debug("%s %s", self._devId, summary)
445 self._snmpStatusFailures += 1
446
447 return defer.succeed(self._snmpStatusFailures)
448
450 """
451 Report any bad OIDs and then remove the OID so we
452 don't generate any further errors.
453
454 @parameter oid: the OID that is not responding
455 @type oid: string
456 """
457 names = [dp[0] for dp in self._oids[oid]]
458 summary = 'Error reading value for %s (%s) on %s' % (
459 names, oid, self._devId)
460 log.warn(summary)
461
462 del self._oids[oid]
463
465 """
466 Called by the collector framework scheduler, and allows us to
467 see how each task is doing.
468 """
469 display = "%s OIDs: %d inSingleOidMode: %s\n" % (
470 self.name, len(self._oids.keys()), self._singleOidMode)
471 if self._lastErrorMsg:
472 display += "%s\n" % self._lastErrorMsg
473 return display
474
475
if __name__ == '__main__':
    # Script entry point: build the collector preferences, wrap the
    # per-device task class in a factory/splitter pair, and hand both to
    # the collector daemon, which then runs until shutdown.
    preferences = SnmpPerformanceCollectionPreferences()
    splitter = SimpleTaskSplitter(
        SimpleTaskFactory(SnmpPerformanceCollectionTask))
    daemon = CollectorDaemon(preferences, splitter)
    daemon.run()
482
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1.1812 on Tue Oct 11 12:51:57 2011 | http://epydoc.sourceforge.net |