# tacticalrmm/api/djangormm/checks/tasks.py
#
# 230 lines
# 7.9 KiB
# Python

import datetime
from datetime import timezone
from statistics import mean
from djangormm.celery import app
from django.core.exceptions import ObjectDoesNotExist
from agents.models import Agent
from .models import (
DiskCheck,
DiskCheckEmail,
PingCheck,
PingCheckEmail,
CpuLoadCheck,
CpuLoadCheckEmail,
CpuHistory,
MemCheck,
MemoryHistory,
MemCheckEmail,
WinServiceCheck,
WinServiceCheckEmail
)
@app.task
def handle_check_email_alert_task(check_type, pk):
    """Email an alert for a check, at most once every 24 hours.

    Args:
        check_type: One of "ping", "diskspace", "cpuload", "memory" or
            "winsvc"; selects the check model and its email-log model.
        pk: Primary key of the check to alert on.

    Returns:
        "ok" once the check has been handled (whether or not an email was
        actually sent), or {"error": "no check"} when check_type is not one
        of the known types.

    Raises:
        The check model's DoesNotExist if no check with this pk exists.
    """
    # check type -> (check model, model that records each email sent for it)
    models_by_type = {
        "ping": (PingCheck, PingCheckEmail),
        "diskspace": (DiskCheck, DiskCheckEmail),
        "cpuload": (CpuLoadCheck, CpuLoadCheckEmail),
        "memory": (MemCheck, MemCheckEmail),
        "winsvc": (WinServiceCheck, WinServiceCheckEmail),
    }
    if check_type not in models_by_type:
        return {"error": "no check"}

    check_model, eml = models_by_type[check_type]
    check = check_model.objects.get(pk=pk)

    # .first() returns None when no email has been logged yet.  Unlike a
    # blanket `except Exception`, a real database error now propagates
    # instead of being mistaken for "first email ever" and triggering a send.
    latest_email = eml.objects.filter(email=check).order_by("-sent").first()

    cutoff = datetime.datetime.now(timezone.utc) - datetime.timedelta(hours=24)
    # Send on the first alert, or when the previous one is older than 24 hours.
    if latest_email is None or latest_email.sent < cutoff:
        eml(email=check).save()
        check.send_email()

    return "ok"
@app.task
def determine_agent_status():
    """Recompute and save the status of every agent from its last_seen time.

    An agent seen within the last 4 minutes is "online".  One seen earlier
    than that but more recently than its own overdue_time (in minutes) is
    "offline".  Anything older is "overdue".

    Returns:
        "ok" after all agents have been saved.
    """
    # Read the clock once so every agent is judged against the same instant.
    now = datetime.datetime.now(timezone.utc)
    offline_cutoff = now - datetime.timedelta(minutes=4)

    for agent in Agent.objects.all():
        overdue_cutoff = now - datetime.timedelta(minutes=agent.overdue_time)

        if agent.last_seen >= offline_cutoff:
            agent.status = "online"
        elif agent.last_seen > overdue_cutoff:
            agent.status = "offline"
        else:
            # Includes last_seen == overdue_cutoff, which the previous strict
            # comparisons on both sides let fall through to "online".
            agent.status = "overdue"

        agent.save(update_fields=["status"])

    return "ok"
@app.task
def disk_check_alert():
    """Evaluate every disk-space check and queue email alerts for failures.

    For each DiskCheck the matching entry in its agent's `disks` mapping is
    read; the check is "failing" when the free percentage drops below the
    check's threshold, otherwise "passing".  more_info, status and last_run
    are written back, and an email alert task is queued for failing checks
    that have email_alert set.

    Returns:
        "ok" after all checks have been processed.
    """
    # select_related loads each check's agent in the same query instead of
    # issuing a separate Agent lookup per check.
    for diskcheck in DiskCheck.objects.select_related("agent"):
        try:
            disk = diskcheck.agent.disks[diskcheck.disk]
            total = disk["total"]
            free = disk["free"]
            percent_used = disk["percent"]
        except (KeyError, TypeError):
            # The agent has no usable data for this disk.  Skip this check
            # rather than abort the run for every remaining check.
            continue

        diskcheck.more_info = f"Total: {total}B, Free: {free}B"
        percent_free = 100 - percent_used
        failing = percent_free < diskcheck.threshold
        diskcheck.status = "failing" if failing else "passing"

        # One UPDATE carrying everything that was previously written in up
        # to three separate saves.
        diskcheck.save(update_fields=["more_info", "status", "last_run"])

        if failing and diskcheck.email_alert:
            handle_check_email_alert_task.delay("diskspace", diskcheck.pk)

    return "ok"
@app.task
def cpu_load_check_alert():
    """Evaluate every CPU-load check against the agent's recorded history.

    The check is "failing" when the integer-truncated mean of the agent's
    cpu_history exceeds the check's cpuload threshold, otherwise "passing".
    Checks whose agent has no CpuHistory row are skipped.  An email alert
    task is queued for failing checks that have email_alert set.

    Returns:
        "ok" after all checks have been processed.
    """
    # select_related loads each check's agent in the same query instead of
    # issuing a separate Agent lookup per check.
    for check in CpuLoadCheck.objects.select_related("agent"):
        try:
            cpuhistory = CpuHistory.objects.get(agent=check.agent)
        except ObjectDoesNotExist:
            # No history row for this agent: nothing to evaluate.
            continue

        check.more_info = cpuhistory.format_nice()
        fields = ["more_info", "last_run"]

        samples = cpuhistory.cpu_history
        failing = False
        if samples:
            failing = int(mean(samples)) > check.cpuload
            check.status = "failing" if failing else "passing"
            fields.append("status")
        # With no samples, mean() would raise StatisticsError and abort the
        # whole run, so the status is left untouched in that case.

        check.save(update_fields=fields)

        if failing and check.email_alert:
            handle_check_email_alert_task.delay("cpuload", check.pk)

    return "ok"
@app.task
def restart_win_service_task(pk, svcname):
    """Ask an agent to restart a Windows service via its salt_api_cmd.

    Args:
        pk: Primary key of the Agent to run the command on.
        svcname: Name of the service, passed as the argument to
            "service.restart".

    Returns:
        "ok" when the response holds a truthy result for the agent's
        hostname, otherwise {"error": "..."} describing the failure.
    """
    agent = Agent.objects.get(pk=pk)
    resp = agent.salt_api_cmd(
        hostname=agent.hostname,
        timeout=60,
        func="service.restart",
        arg=svcname,
    )
    data = resp.json()

    # .get() so that a response with no entry for this hostname is reported
    # as a failed restart instead of raising KeyError.
    # NOTE(review): presumably an agent that does not answer is simply absent
    # from the returned mapping; confirm against the Salt API.
    if not data["return"][0].get(agent.hostname):
        return {"error": f"restart service {svcname} failed on {agent.hostname}"}
    return "ok"
@app.task
def win_service_check_task():
    """Evaluate every Windows-service check and react to failures.

    A check passes when its service is "running", or "start_pending" with
    pass_if_start_pending set.  Passing resets failure_count to 0.  Any
    other state marks the check "failing" and increments failure_count;
    once that reaches the check's `failures` limit, an email alert task is
    queued if email_alert is set.  A "stopped" service is restarted when
    restart_if_stopped is set.

    Returns:
        "ok" after all checks have been processed.
    """
    # select_related loads each check's agent in the same query instead of
    # issuing a separate Agent lookup per check.
    for check in WinServiceCheck.objects.select_related("agent"):
        agent = check.agent

        # First service entry with the configured name, or None.
        service = next(
            (svc for svc in agent.services if svc["name"] == check.svc_name),
            None,
        )
        if service is None:
            # The agent does not list this service.  Skip this check rather
            # than raise IndexError and abort the run for the others.
            continue

        status = service["status"]
        passing = status == "running" or (
            status == "start_pending" and check.pass_if_start_pending
        )

        alert = False
        if passing:
            check.status = "passing"
            check.failure_count = 0
        else:
            check.status = "failing"
            check.failure_count += 1
            alert = check.failure_count >= check.failures
            if status == "stopped" and check.restart_if_stopped:
                restart_win_service_task.delay(agent.pk, check.svc_name)

        check.more_info = f"Status {status.upper()}"
        # One UPDATE; failure_count was previously written in its own save.
        check.save(
            update_fields=["status", "failure_count", "more_info", "last_run"]
        )

        if alert and check.email_alert:
            handle_check_email_alert_task.delay("winsvc", check.pk)

    return "ok"
@app.task
def mem_check_alert():
    """Evaluate every memory check against the agent's recorded history.

    The check is "failing" when the integer-truncated mean of the agent's
    mem_history exceeds the check's threshold, otherwise "passing".  Checks
    whose agent has no MemoryHistory row are skipped.  An email alert task
    is queued for failing checks that have email_alert set.

    Returns:
        "ok" after all checks have been processed.
    """
    # select_related loads each check's agent in the same query instead of
    # issuing a separate Agent lookup per check.
    for check in MemCheck.objects.select_related("agent"):
        try:
            memhistory = MemoryHistory.objects.get(agent=check.agent)
        except ObjectDoesNotExist:
            # No history row for this agent: nothing to evaluate.
            continue

        check.more_info = memhistory.format_nice()
        fields = ["more_info", "last_run"]

        samples = memhistory.mem_history
        failing = False
        if samples:
            failing = int(mean(samples)) > check.threshold
            check.status = "failing" if failing else "passing"
            fields.append("status")
        # With no samples, mean() would raise StatisticsError and abort the
        # whole run, so the status is left untouched in that case.

        check.save(update_fields=fields)

        if failing and check.email_alert:
            handle_check_email_alert_task.delay("memory", check.pk)

    return "ok"