poll_errors.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
#!/usr/bin/env python3
#
# Copyright (c) 2017-2020  Joe Clarke <jclarke@cisco.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
import argparse
import json
import os
import sys

import netsnmp

from sparker import Sparker, MessageType
import CLEUCreds
from cleu.config import Config as C
  34. CACHE_FILE = "/home/jclarke/errors_cache"
  35. THRESHOLD = 1
  36. WINDOW = 12
  37. REARM = 6
  38. IF_UP = 1
  39. prev_state = {}
  40. curr_state = {}
  41. if __name__ == "__main__":
  42. spark = Sparker(token=CLEUCreds.SPARK_TOKEN)
  43. parser = argparse.ArgumentParser(prog=sys.argv[0], description="Poll errors from network devices")
  44. parser.add_argument(
  45. "--name", "-n", metavar="<NAME>", help="Name of the poller", required=True,
  46. )
  47. parser.add_argument(
  48. "--device-file", "-f", metavar="<DEVICE_FILE>", help="Path to the JSON file containing the devices to poll", required=True,
  49. )
  50. parser.add_argument("--webex-room", "-r", metavar="<ROOM_NAME>", help="Name of Webex room to send alerts to", required=True)
  51. parser.add_argument(
  52. "--ignore-interfaces-file", "-i", metavar="<IGNORE_FILE>", help="Path to JSON file that maps devices and interfaces to ignore"
  53. )
  54. args = parser.parse_args()
  55. devices = None
  56. try:
  57. with open(args.device_file) as fd:
  58. devices = json.load(fd)
  59. except Exception as e:
  60. print("ERROR: Failed to load device file {}: {}".format(args.device_file, getattr(e, "message", repr(e))))
  61. sys.exit(1)
  62. ignore_interfaces = {}
  63. if args.ignore_interfaces_file:
  64. try:
  65. with open(args.ignore_interfaces_file) as fd:
  66. ignore_interfaces = json.load(fd)
  67. except Exception as e:
  68. print(
  69. "ERROR: Failed to load the ignore interfaces file {}: {}".format(
  70. args.ignore_interfaces_file, getattr(e, "message", repr(e))
  71. )
  72. )
  73. sys.exit(1)
  74. cache_file = CACHE_FILE + "_" + args.name + ".dat"
  75. if os.path.exists(cache_file):
  76. fd = open(cache_file, "r")
  77. prev_state = json.load(fd)
  78. fd.close()
  79. for device in devices:
  80. swent = {}
  81. vars = netsnmp.VarList(
  82. netsnmp.Varbind("ifDescr"),
  83. netsnmp.Varbind("ifInErrors"),
  84. netsnmp.Varbind("ifOutErrors"),
  85. netsnmp.Varbind("ifInDiscards"),
  86. netsnmp.Varbind("ifOutDiscards"),
  87. netsnmp.Varbind("ifAlias"),
  88. netsnmp.Varbind("ifOperStatus"),
  89. )
  90. netsnmp.snmpwalk(
  91. vars,
  92. Version=3,
  93. DestHost=device,
  94. SecLevel="authPriv",
  95. SecName="CLEUR",
  96. AuthProto="SHA",
  97. AuthPass=CLEUCreds.SNMP_AUTH_PASS,
  98. PrivProto="DES",
  99. PrivPass=CLEUCreds.SNMP_PRIV_PASS,
  100. )
  101. for var in vars:
  102. if var.iid not in swent:
  103. swent[var.iid] = {}
  104. swent[var.iid]["count"] = 0
  105. swent[var.iid]["suppressed"] = False
  106. swent[var.iid][var.tag] = var.val
  107. curr_state[device] = swent
  108. if not device in prev_state:
  109. continue
  110. for ins, vard in curr_state[device].items():
  111. if not ins in prev_state[device]:
  112. continue
  113. if not "ifDescr" in vard:
  114. continue
  115. if "ifOperStatus" not in vard or int(vard["ifOperStatus"]) != IF_UP:
  116. continue
  117. if not "ifAlias" in vard:
  118. vard["ifAlias"] = ""
  119. if "count" in prev_state[device][ins]:
  120. curr_state[device][ins]["count"] = prev_state[device][ins]["count"]
  121. if "suppressed" in prev_state[device][ins]:
  122. curr_state[device][ins]["suppressed"] = prev_state[device][ins]["suppressed"]
  123. if_descr = vard["ifDescr"]
  124. if_alias = vard["ifAlias"]
  125. if device in ignore_interfaces and if_descr in ignore_interfaces[device]:
  126. continue
  127. found_error = False
  128. for k, v in vard.items():
  129. if k == "ifDescr" or k == "ifAlias" or k == "count" or k == "suppressed":
  130. continue
  131. if k in prev_state[device][ins]:
  132. diff = int(v) - int(prev_state[device][ins][k])
  133. if diff >= THRESHOLD:
  134. found_error = True
  135. if curr_state[device][ins]["count"] < WINDOW and not curr_state[device][ins]["suppressed"]:
  136. spark.post_to_spark(
  137. C.WEBEX_TEAM,
  138. args.webex_room,
  139. "Interface **{}** ({}) on device _{}_ has seen an increase of **{}** {} since the last poll (previous: {}, current: {}).".format(
  140. if_descr, if_alias, device, diff, k, prev_state[device][ins][k], v
  141. ),
  142. MessageType.WARNING,
  143. )
  144. elif not curr_state[device][ins]["suppressed"]:
  145. curr_state[device][ins]["suppressed"] = True
  146. spark.post_to_spark(
  147. C.WEBEX_TEAM,
  148. args.webex_room,
  149. "Suppressing alarms for interface **{}** ({}) on device _{}_".format(if_descr, if_alias, device),
  150. MessageType.GOOD,
  151. )
  152. if not found_error:
  153. if curr_state[device][ins]["count"] > 0:
  154. curr_state[device][ins]["count"] -= 1
  155. if curr_state[device][ins]["count"] < REARM and curr_state[device][ins]["suppressed"]:
  156. spark.post_to_spark(
  157. C.WEBEX_TEAM,
  158. args.webex_room,
  159. "Interface **{}** ({}) on device _{}_ is no longer seeing an increase of errors".format(
  160. if_descr, if_alias, device
  161. ),
  162. MessageType.GOOD,
  163. )
  164. curr_state[device][ins]["suppressed"] = False
  165. else:
  166. curr_state[device][ins]["count"] += 1
  167. fd = open(cache_file, "w")
  168. json.dump(curr_state, fd, indent=4)
  169. fd.close()