230 lines
7.9 KiB
Python
230 lines
7.9 KiB
Python
import datetime
|
|
from datetime import timezone
|
|
from statistics import mean
|
|
|
|
from djangormm.celery import app
|
|
from django.core.exceptions import ObjectDoesNotExist
|
|
|
|
from agents.models import Agent
|
|
from .models import (
|
|
DiskCheck,
|
|
DiskCheckEmail,
|
|
PingCheck,
|
|
PingCheckEmail,
|
|
CpuLoadCheck,
|
|
CpuLoadCheckEmail,
|
|
CpuHistory,
|
|
MemCheck,
|
|
MemoryHistory,
|
|
MemCheckEmail,
|
|
WinServiceCheck,
|
|
WinServiceCheckEmail
|
|
)
|
|
|
|
@app.task
def handle_check_email_alert_task(check_type, pk):
    """Send an alert email for a check, at most once every 24 hours.

    Args:
        check_type: One of "ping", "diskspace", "cpuload", "memory" or
            "winsvc"; selects which check model / email-log model pair is used.
        pk: Primary key of the check to alert on.

    Returns:
        "ok" once the check has been processed (whether or not an email was
        actually sent), or ``{"error": "no check"}`` for an unknown
        ``check_type``.

    Raises:
        The check model's ``DoesNotExist`` if no check with ``pk`` exists.
    """
    models_by_type = {
        "ping": (PingCheck, PingCheckEmail),
        "diskspace": (DiskCheck, DiskCheckEmail),
        "cpuload": (CpuLoadCheck, CpuLoadCheckEmail),
        "memory": (MemCheck, MemCheckEmail),
        "winsvc": (WinServiceCheck, WinServiceCheckEmail),
    }
    model_pair = models_by_type.get(check_type)
    if model_pair is None:
        return {"error": "no check"}

    check_model, email_model = model_pair
    check = check_model.objects.get(pk=pk)

    # first() returns None when no email has been logged for this check yet.
    # Unlike a blanket `except Exception`, genuine database errors propagate
    # instead of being mistaken for "first time sending email".
    latest_email = email_model.objects.filter(email=check).order_by("-sent").first()

    cutoff = datetime.datetime.now(timezone.utc) - datetime.timedelta(hours=24)

    # Send when this is the first email, or when the last one is older than
    # 24 hours.
    if latest_email is None or latest_email.sent < cutoff:
        # Log the send before mailing so the next run sees it.
        email_model(email=check).save()
        check.send_email()

    return "ok"
|
|
|
|
@app.task
def determine_agent_status():
    """Recompute and persist the status of every agent from ``last_seen``.

    An agent is:
      * "online"  when it was seen within the last 4 minutes,
      * "offline" when it was last seen more than 4 minutes ago but more
        recently than its own ``overdue_time`` (in minutes),
      * "overdue" otherwise.

    Returns:
        The string "ok".
    """
    # Sample the clock once so both cutoffs are measured from the same instant
    # for every agent.
    now = datetime.datetime.now(timezone.utc)
    offline_cutoff = now - datetime.timedelta(minutes=4)

    for agent in Agent.objects.all():
        overdue_cutoff = now - datetime.timedelta(minutes=agent.overdue_time)

        if agent.last_seen >= offline_cutoff:
            agent.status = "online"
        elif agent.last_seen > overdue_cutoff:
            agent.status = "offline"
        else:
            # Includes last_seen == overdue_cutoff, which a pair of strict
            # comparisons would let fall through to "online".
            agent.status = "overdue"

        agent.save(update_fields=["status"])

    return "ok"
|
|
|
|
@app.task
def disk_check_alert():
    """Evaluate every disk space check against its agent's reported disk data.

    For each ``DiskCheck`` the free-space percentage (``100 - percent``) of
    the checked disk is compared with the check's ``threshold``. The check is
    marked "failing" when free space is below the threshold and "passing"
    otherwise, and ``more_info`` is refreshed with the total/free figures.
    A failing check with ``email_alert`` set queues an email alert task.

    Returns:
        The string "ok".

    Raises:
        KeyError: If the agent's ``disks`` data has no entry for the checked
            disk, or the entry lacks "total", "free" or "percent".
    """
    for diskcheck in DiskCheck.objects.all():
        # Look the disk entry up once instead of once per value.
        disk = diskcheck.agent.disks[diskcheck.disk]
        total = disk["total"]
        free = disk["free"]
        percent_free = 100 - disk["percent"]

        failing = percent_free < diskcheck.threshold
        diskcheck.more_info = f"Total: {total}B, Free: {free}B"
        diskcheck.status = "failing" if failing else "passing"

        # One UPDATE per check; it must happen before the alert task is
        # queued so the task reads the fresh status and more_info.
        diskcheck.save(update_fields=["more_info", "status", "last_run"])

        if failing and diskcheck.email_alert:
            handle_check_email_alert_task.delay("diskspace", diskcheck.pk)

    return "ok"
|
|
|
|
|
|
@app.task
def cpu_load_check_alert():
    """Evaluate every CPU load check against the agent's CPU history.

    For each ``CpuLoadCheck`` the integer mean of the agent's
    ``CpuHistory.cpu_history`` samples is compared with the check's
    ``cpuload`` threshold. The check is marked "failing" when the mean is
    above the threshold and "passing" otherwise. A failing check with
    ``email_alert`` set queues an email alert task.

    Checks whose agent has no history row, or whose history is empty, are
    left at their current status.

    Returns:
        The string "ok".
    """
    for check in CpuLoadCheck.objects.all():
        try:
            cpuhistory = CpuHistory.objects.get(agent=check.agent)
        except ObjectDoesNotExist:
            # No history for this agent yet: nothing to evaluate, but the
            # visit is still recorded.
            check.save(update_fields=["last_run"])
            continue

        check.more_info = cpuhistory.format_nice()
        samples = cpuhistory.cpu_history

        if not samples:
            # mean() raises StatisticsError on empty data, which would abort
            # the run for every remaining check.
            check.save(update_fields=["more_info", "last_run"])
            continue

        failing = int(mean(samples)) > check.cpuload
        check.status = "failing" if failing else "passing"

        # Persist before queuing the alert so the task reads the new state.
        check.save(update_fields=["more_info", "status", "last_run"])

        if failing and check.email_alert:
            handle_check_email_alert_task.delay("cpuload", check.pk)

    return "ok"
|
|
|
|
@app.task
def restart_win_service_task(pk, svcname):
    """Ask an agent to restart a Windows service through its salt API command.

    Args:
        pk: Primary key of the ``Agent`` that runs the service.
        svcname: Name of the service to restart.

    Returns:
        "ok" when the response holds a truthy result for the agent's
        hostname, otherwise ``{"error": "..."}`` describing the failure.

    Raises:
        Agent.DoesNotExist: If no agent with ``pk`` exists.
    """
    agent = Agent.objects.get(pk=pk)
    resp = agent.salt_api_cmd(
        hostname=agent.hostname,
        timeout=60,
        func="service.restart",
        arg=svcname,
    )
    data = resp.json()

    # A response with no entry for this host (presumably an agent that did
    # not answer -- TODO confirm) is reported as a failed restart rather than
    # raising.
    try:
        restarted = data["return"][0][agent.hostname]
    except (KeyError, IndexError, TypeError):
        restarted = None

    if not restarted:
        return {"error": f"restart service {svcname} failed on {agent.hostname}"}
    return "ok"
|
|
|
|
@app.task
def win_service_check_task():
    """Evaluate every Windows service check against the agent's service list.

    A check passes when the service status is "running", or when it is
    "start_pending" and the check has ``pass_if_start_pending`` set. A passing
    check has its ``failure_count`` reset to 0. Any other status marks the
    check "failing" and increments ``failure_count``; once the count reaches
    ``failures``, an email alert task is queued if ``email_alert`` is set.
    A "stopped" service is restarted when ``restart_if_stopped`` is set.

    Checks whose service is not present in the agent's ``services`` list are
    skipped and left unchanged.

    Returns:
        The string "ok".
    """
    for check in WinServiceCheck.objects.all():
        agent = check.agent

        service = next(
            (svc for svc in agent.services if svc["name"] == check.svc_name),
            None,
        )
        if service is None:
            # The agent does not report this service. Indexing [0] on an
            # empty match list would raise IndexError and abort the run for
            # every remaining check, so skip just this one.
            continue

        status = service["status"]
        passing = status == "running" or (
            status == "start_pending" and check.pass_if_start_pending
        )

        alert = False
        if passing:
            check.status = "passing"
            check.failure_count = 0
        else:
            check.status = "failing"
            check.failure_count += 1
            # Alert only once the consecutive-failure threshold is reached.
            alert = check.failure_count >= check.failures

        if status == "stopped" and check.restart_if_stopped:
            restart_win_service_task.delay(agent.pk, check.svc_name)

        check.more_info = f"Status {status.upper()}"
        # One UPDATE per check, before the alert task is queued.
        check.save(
            update_fields=["status", "more_info", "last_run", "failure_count"]
        )

        if alert and check.email_alert:
            handle_check_email_alert_task.delay("winsvc", check.pk)

    return "ok"
|
|
|
|
|
|
|
|
|
|
@app.task
def mem_check_alert():
    """Evaluate every memory check against the agent's memory history.

    For each ``MemCheck`` the integer mean of the agent's
    ``MemoryHistory.mem_history`` samples is compared with the check's
    ``threshold``. The check is marked "failing" when the mean is above the
    threshold and "passing" otherwise. A failing check with ``email_alert``
    set queues an email alert task.

    Checks whose agent has no history row, or whose history is empty, are
    left at their current status.

    Returns:
        The string "ok".
    """
    for check in MemCheck.objects.all():
        try:
            memhistory = MemoryHistory.objects.get(agent=check.agent)
        except ObjectDoesNotExist:
            # No history for this agent yet: nothing to evaluate, but the
            # visit is still recorded.
            check.save(update_fields=["last_run"])
            continue

        check.more_info = memhistory.format_nice()
        samples = memhistory.mem_history

        if not samples:
            # mean() raises StatisticsError on empty data, which would abort
            # the run for every remaining check.
            check.save(update_fields=["more_info", "last_run"])
            continue

        failing = int(mean(samples)) > check.threshold
        check.status = "failing" if failing else "passing"

        # Persist before queuing the alert so the task reads the new state.
        check.save(update_fields=["more_info", "status", "last_run"])

        if failing and check.email_alert:
            handle_check_email_alert_task.delay("memory", check.pk)

    return "ok"