poll_errors.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright (c) 2017-2019 Joe Clarke <jclarke@cisco.com>
  4. # All rights reserved.
  5. #
  6. # Redistribution and use in source and binary forms, with or without
  7. # modification, are permitted provided that the following conditions
  8. # are met:
  9. # 1. Redistributions of source code must retain the above copyright
  10. # notice, this list of conditions and the following disclaimer.
  11. # 2. Redistributions in binary form must reproduce the above copyright
  12. # notice, this list of conditions and the following disclaimer in the
  13. # documentation and/or other materials provided with the distribution.
  14. #
  15. # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23. # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24. # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25. # SUCH DAMAGE.
  26. import netsnmp
  27. import os
  28. import json
  29. from sparker import Sparker, MessageType
  30. import CLEUCreds
  31. from cleu.config import Config as C
  32. CACHE_FILE = "/home/jclarke/errors_cache.dat"
  33. THRESHOLD = 1
  34. WINDOW = 12
  35. REARM = 6
  36. WEBEX_ROOM = "Data Centre Alarms"
  37. devices = ["dc1-mccsw-1", "dc1-mccsw-2", "dc2-mccsw-1", "dc2-mccsw-2", "dc1-ethsw-1", "dc1-ethsw-2", "dc2-ethsw-1", "dc2-ethsw-2"]
  38. ignore_interfaces = {}
  39. prev_state = {}
  40. curr_state = {}
  41. if __name__ == "__main__":
  42. spark = Sparker(token=CLEUCreds.SPARK_TOKEN)
  43. if os.path.exists(CACHE_FILE):
  44. fd = open(CACHE_FILE, "r")
  45. prev_state = json.load(fd)
  46. fd.close()
  47. for device in devices:
  48. swent = {}
  49. vars = netsnmp.VarList(
  50. netsnmp.Varbind("ifDescr"),
  51. netsnmp.Varbind("ifInErrors"),
  52. netsnmp.Varbind("ifOutErrors"),
  53. netsnmp.Varbind("ifInDiscards"),
  54. netsnmp.Varbind("ifOutDiscards"),
  55. netsnmp.Varbind("ifAlias"),
  56. )
  57. netsnmp.snmpwalk(
  58. vars,
  59. Version=3,
  60. DestHost=device,
  61. SecLevel="authPriv",
  62. SecName="CLEUR",
  63. AuthProto="SHA",
  64. AuthPass=CLEUCreds.SNMP_AUTH_PASS,
  65. PrivProto="DES",
  66. PrivPass=CLEUCreds.SNMP_PRIV_PASS,
  67. )
  68. for var in vars:
  69. if var.iid not in swent:
  70. swent[var.iid] = {}
  71. swent[var.iid]["count"] = 0
  72. swent[var.iid]["suppressed"] = False
  73. swent[var.iid][var.tag] = var.val
  74. curr_state[device] = swent
  75. if not device in prev_state:
  76. continue
  77. for ins, vard in curr_state[device].items():
  78. if not ins in prev_state[device]:
  79. continue
  80. if not "ifDescr" in vard:
  81. continue
  82. if not "ifAlias" in vard:
  83. vard["ifAlias"] = ""
  84. if "count" in prev_state[device][ins]:
  85. curr_state[device][ins]["count"] = prev_state[device][ins]["count"]
  86. if "suppressed" in prev_state[device][ins]:
  87. curr_state[device][ins]["suppressed"] = prev_state[device][ins]["suppressed"]
  88. if_descr = vard["ifDescr"]
  89. if_alias = vard["ifAlias"]
  90. if device in ignore_interfaces and if_descr in ignore_interfaces[device]:
  91. continue
  92. found_error = False
  93. for k, v in vard.items():
  94. if k == "ifDescr" or k == "ifAlias" or k == "count" or k == "suppressed":
  95. continue
  96. if k in prev_state[device][ins]:
  97. diff = int(v) - int(prev_state[device][ins][k])
  98. if diff >= THRESHOLD:
  99. found_error = True
  100. if curr_state[device][ins]["count"] < WINDOW and not curr_state[device][ins]["suppressed"]:
  101. spark.post_to_spark(
  102. C.WEBEX_TEAM,
  103. WEBEX_ROOM,
  104. "Interface **{}** ({}) on device _{}_ has seen an increase of **{}** {} since the last poll (previous: {}, current: {}).".format(
  105. if_descr, if_alias, device, diff, k, prev_state[device][ins][k], v
  106. ),
  107. MessageType.WARNING,
  108. )
  109. elif not curr_state[device][ins]["suppressed"]:
  110. curr_state[device][ins]["suppressed"] = True
  111. spark.post_to_spark(
  112. C.WEBEX_TEAM,
  113. WEBEX_ROOM,
  114. "Suppressing alarms for interface **{}** ({}) on device _{}_".format(if_descr, if_alias, device),
  115. MessageType.GOOD,
  116. )
  117. if not found_error:
  118. if curr_state[device][ins]["count"] > 0:
  119. curr_state[device][ins]["count"] -= 1
  120. if curr_state[device][ins]["count"] < REARM and curr_state[device][ins]["suppressed"]:
  121. spark.post_to_spark(
  122. C.WEBEX_TEAM,
  123. WEBEX_ROOM,
  124. "Interface **{}** ({}) on device _{}_ is no longer seeing an increase of errors".format(
  125. if_descr, if_alias, device
  126. ),
  127. MessageType.GOOD,
  128. )
  129. curr_state[device][ins]["suppressed"] = False
  130. else:
  131. curr_state[device][ins]["count"] += 1
  132. fd = open(CACHE_FILE, "w")
  133. json.dump(curr_state, fd, indent=4)
  134. fd.close()