#!/usr/bin/env python3
#
# Copyright (c) 2017-2020 Joe Clarke <jclarke@cisco.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

from __future__ import print_function

import argparse
import json
import os
import sys

import netsnmp

import CLEUCreds
from cleu.config import Config as C
from sparker import Sparker, MessageType

  35. CACHE_FILE = "/home/jclarke/errors_cache"
  36. THRESHOLD = 1
  37. WINDOW = 12
  38. REARM = 6
  39. IF_UP = 1
  40. prev_state = {}
  41. curr_state = {}
  42. if __name__ == "__main__":
  43. spark = Sparker(token=CLEUCreds.SPARK_TOKEN)
  44. parser = argparse.ArgumentParser(prog=sys.argv[0], description="Poll errors from network devices")
  45. parser.add_argument(
  46. "--name", "-n", metavar="<NAME>", help="Name of the poller", required=True,
  47. )
  48. parser.add_argument(
  49. "--device-file", "-f", metavar="<DEVICE_FILE>", help="Path to the JSON file containing the devices to poll", required=True,
  50. )
  51. parser.add_argument("--webex-room", "-r", metavar="<ROOM_NAME>", help="Name of Webex room to send alerts to", required=True)
  52. parser.add_argument(
  53. "--ignore-interfaces-file", "-i", metavar="<IGNORE_FILE>", help="Path to JSON file that maps devices and interfaces to ignore"
  54. )
  55. parser.add_argument("--no-discards", help="Poll ifIn/OutDiscards (default: discards are polled)", action="store_true")
  56. args = parser.parse_args()
  57. devices = None
  58. try:
  59. with open(args.device_file) as fd:
  60. devices = json.load(fd)
  61. except Exception as e:
  62. print("ERROR: Failed to load device file {}: {}".format(args.device_file, getattr(e, "message", repr(e))))
  63. sys.exit(1)
  64. ignore_interfaces = {}
  65. if args.ignore_interfaces_file:
  66. try:
  67. with open(args.ignore_interfaces_file) as fd:
  68. ignore_interfaces = json.load(fd)
  69. except Exception as e:
  70. print(
  71. "ERROR: Failed to load the ignore interfaces file {}: {}".format(
  72. args.ignore_interfaces_file, getattr(e, "message", repr(e))
  73. )
  74. )
  75. sys.exit(1)
  76. cache_file = CACHE_FILE + "_" + args.name + ".dat"
  77. if os.path.exists(cache_file):
  78. fd = open(cache_file, "r")
  79. prev_state = json.load(fd)
  80. fd.close()
  81. for device in devices:
  82. swent = {}
  83. if not args.no_discards:
  84. vars = netsnmp.VarList(
  85. netsnmp.Varbind("ifDescr"),
  86. netsnmp.Varbind("ifInErrors"),
  87. netsnmp.Varbind("ifOutErrors"),
  88. netsnmp.Varbind("ifInDiscards"),
  89. netsnmp.Varbind("ifOutDiscards"),
  90. netsnmp.Varbind("ifAlias"),
  91. netsnmp.Varbind("ifOperStatus"),
  92. )
  93. else:
  94. vars = netsnmp.VarList(
  95. netsnmp.Varbind("ifDescr"),
  96. netsnmp.Varbind("ifInErrors"),
  97. netsnmp.Varbind("ifOutErrors"),
  98. netsnmp.Varbind("ifAlias"),
  99. netsnmp.Varbind("ifOperStatus"),
  100. )
  101. netsnmp.snmpwalk(
  102. vars,
  103. Version=3,
  104. DestHost=device,
  105. SecLevel="authPriv",
  106. SecName="CLEUR",
  107. AuthProto="SHA",
  108. AuthPass=CLEUCreds.SNMP_AUTH_PASS,
  109. PrivProto="DES",
  110. PrivPass=CLEUCreds.SNMP_PRIV_PASS,
  111. )
  112. for var in vars:
  113. if var.iid not in swent:
  114. swent[var.iid] = {}
  115. swent[var.iid]["count"] = 0
  116. swent[var.iid]["suppressed"] = False
  117. swent[var.iid][var.tag] = var.val
  118. curr_state[device] = swent
  119. if not device in prev_state:
  120. continue
  121. for ins, vard in list(curr_state[device].items()):
  122. if not ins in prev_state[device]:
  123. continue
  124. if not "ifDescr" in vard:
  125. continue
  126. if "ifOperStatus" not in vard or int(vard["ifOperStatus"]) != IF_UP:
  127. continue
  128. if not "ifAlias" in vard:
  129. vard["ifAlias"] = ""
  130. if "count" in prev_state[device][ins]:
  131. curr_state[device][ins]["count"] = prev_state[device][ins]["count"]
  132. if "suppressed" in prev_state[device][ins]:
  133. curr_state[device][ins]["suppressed"] = prev_state[device][ins]["suppressed"]
  134. if_descr = vard["ifDescr"]
  135. if_alias = vard["ifAlias"]
  136. if device in ignore_interfaces and if_descr in ignore_interfaces[device]:
  137. continue
  138. found_error = False
  139. for k, v in list(vard.items()):
  140. if k == "ifDescr" or k == "ifAlias" or k == "count" or k == "suppressed":
  141. continue
  142. if k in prev_state[device][ins]:
  143. diff = int(v) - int(prev_state[device][ins][k])
  144. if diff >= THRESHOLD:
  145. found_error = True
  146. if curr_state[device][ins]["count"] < WINDOW and not curr_state[device][ins]["suppressed"]:
  147. spark.post_to_spark(
  148. C.WEBEX_TEAM,
  149. args.webex_room,
  150. "Interface **{}** ({}) on device _{}_ has seen an increase of **{}** {} since the last poll (previous: {}, current: {}).".format(
  151. if_descr, if_alias, device, diff, k, prev_state[device][ins][k], v
  152. ),
  153. MessageType.WARNING,
  154. )
  155. elif not curr_state[device][ins]["suppressed"]:
  156. curr_state[device][ins]["suppressed"] = True
  157. spark.post_to_spark(
  158. C.WEBEX_TEAM,
  159. args.webex_room,
  160. "Suppressing alarms for interface **{}** ({}) on device _{}_".format(if_descr, if_alias, device),
  161. )
  162. if not found_error:
  163. if curr_state[device][ins]["count"] > 0:
  164. curr_state[device][ins]["count"] -= 1
  165. if curr_state[device][ins]["count"] < REARM and curr_state[device][ins]["suppressed"]:
  166. spark.post_to_spark(
  167. C.WEBEX_TEAM,
  168. args.webex_room,
  169. "Interface **{}** ({}) on device _{}_ is no longer seeing an increase of errors".format(
  170. if_descr, if_alias, device
  171. ),
  172. MessageType.GOOD,
  173. )
  174. curr_state[device][ins]["suppressed"] = False
  175. else:
  176. curr_state[device][ins]["count"] += 1
  177. fd = open(cache_file, "w")
  178. json.dump(curr_state, fd, indent=4)
  179. fd.close()