tacticalrmm/api/djangormm/checks/tasks.py

import datetime
from datetime import timezone
from statistics import mean

from djangormm.celery import app
from django.core.exceptions import ObjectDoesNotExist

from agents.models import Agent
from .models import (
    DiskCheck,
    DiskCheckEmail,
    PingCheck,
    PingCheckEmail,
    CpuLoadCheck,
    CpuLoadCheckEmail,
    CpuHistory,
    MemCheck,
    MemoryHistory,
    MemCheckEmail,
    WinServiceCheck,
    WinServiceCheckEmail
)

@app.task
def handle_check_email_alert_task(check_type, pk):
    if check_type == "ping":
        check = PingCheck.objects.get(pk=pk)
        eml = PingCheckEmail
    elif check_type == "diskspace":
        check = DiskCheck.objects.get(pk=pk)
        eml = DiskCheckEmail
    elif check_type == "cpuload":
        check = CpuLoadCheck.objects.get(pk=pk)
        eml = CpuLoadCheckEmail
    elif check_type == "memory":
        check = MemCheck.objects.get(pk=pk)
        eml = MemCheckEmail
    elif check_type == "winsvc":
        check = WinServiceCheck.objects.get(pk=pk)
        eml = WinServiceCheckEmail
    else:
        return {"error": "no check"}

    try:
        latest_email = (
            eml.objects.filter(email=check).order_by("-sent")[:1].get()
        )
    except Exception as e:
        # first time sending email
        eml(email=check).save()
        check.send_email()
    else:
        last_sent = latest_email.sent
        delta = datetime.datetime.now(
            timezone.utc
        ) - datetime.timedelta(hours=24)
        # send an email only if the last email sent is older than 24 hours
        if last_sent < delta:
            eml(email=check).save()
            check.send_email()

    return "ok"

@app.task
def determine_agent_status():
    agents = Agent.objects.all()
    offline = datetime.datetime.now(timezone.utc) - datetime.timedelta(minutes=4)

    for agent in agents:
        overdue = datetime.datetime.now(timezone.utc) - datetime.timedelta(minutes=agent.overdue_time)
        if (agent.last_seen < offline) and (agent.last_seen > overdue):
            agent.status = "offline"
        elif (agent.last_seen < offline) and (agent.last_seen < overdue):
            agent.status = "overdue"
        else:
            agent.status = "online"
        agent.save(update_fields=["status"])

    return "ok"

@app.task
def disk_check_alert():
    agents_with_checks = DiskCheck.objects.all()
    if agents_with_checks:
        for diskcheck in agents_with_checks:
            agent = Agent.objects.get(pk=diskcheck.agent.pk)
            total = agent.disks[diskcheck.disk]["total"]
            free = agent.disks[diskcheck.disk]["free"]
            diskcheck.more_info = f"Total: {total}B, Free: {free}B"
            diskcheck.save(update_fields=["more_info"])
            percent_used = agent.disks[diskcheck.disk]["percent"]
            percent_free = 100 - percent_used
            if percent_free < diskcheck.threshold:
                diskcheck.status = "failing"
                diskcheck.save(update_fields=["status", "last_run"])
                if diskcheck.email_alert:
                    handle_check_email_alert_task.delay("diskspace", diskcheck.pk)
            else:
                if diskcheck.status != "passing":
                    diskcheck.status = "passing"
                    diskcheck.save(update_fields=["status"])

                diskcheck.save(update_fields=["last_run"])
    return "ok"


@app.task
def cpu_load_check_alert():
    agents_with_checks = CpuLoadCheck.objects.all()
    if agents_with_checks:
        for check in agents_with_checks:
            threshold = check.cpuload
            agent = Agent.objects.get(pk=check.agent.pk)
            try:
                cpuhistory = CpuHistory.objects.get(agent=agent)
            except ObjectDoesNotExist:
                pass
            else:
                check.more_info = cpuhistory.format_nice()
                check.save(update_fields=["more_info", "last_run"])
                avg = int(mean(cpuhistory.cpu_history))
                if avg > threshold:
                    check.status = "failing"
                    check.save(update_fields=["status"])

                    if check.email_alert:
                        handle_check_email_alert_task.delay("cpuload", check.pk)

                else:
                    if check.status != "passing":
                        check.status = "passing"
                        check.save(update_fields=["status"])
                    check.save(update_fields=["last_run"])

    return "ok"

@app.task
def restart_win_service_task(pk, svcname):
    agent = Agent.objects.get(pk=pk)
    resp = agent.salt_api_cmd(
        hostname = agent.hostname,
        timeout = 60,
        func = f"service.restart",
        arg = svcname,
    )
    data = resp.json()
    if not data["return"][0][agent.hostname]:
        return {"error": f"restart service {svcname} failed on {agent.hostname}"}
    return "ok"

@app.task
def win_service_check_task():
    agents_with_checks = WinServiceCheck.objects.all()
    if agents_with_checks:
        for check in agents_with_checks:
            alert = False
            agent = Agent.objects.get(pk=check.agent.pk)
            status = list(filter(lambda x: x["name"] == check.svc_name, agent.services))[0]["status"]
            if status == "running":
                check.status = "passing"
                if check.failure_count != 0:
                    check.failure_count = 0
                    check.save(update_fields=["failure_count"])
            elif status == "start_pending":
                if check.pass_if_start_pending:
                    check.status = "passing"
                    if check.failure_count != 0:
                        check.failure_count = 0
                        check.save(update_fields=["failure_count"])
                else:
                    check.status = "failing"
                    new_count = check.failure_count + 1
                    check.failure_count = new_count
                    if new_count >= check.failures:
                        alert = True
                    check.save(update_fields=["failure_count"])

            else:
                check.status = "failing"
                new_count = check.failure_count + 1
                check.failure_count = new_count
                if new_count >= check.failures:
                    alert = True
                check.save(update_fields=["failure_count"])

            if check.restart_if_stopped:
                if status == "stopped":
                    restart_win_service_task.delay(agent.pk, check.svc_name)

            check.more_info = f"Status {status.upper()}"
            check.save(update_fields=["status", "more_info", "last_run"])

            if alert:
                if check.email_alert:
                    handle_check_email_alert_task.delay("winsvc", check.pk)

    return "ok"


@app.task
def mem_check_alert():
    agents_with_checks = MemCheck.objects.all()
    if agents_with_checks:
        for check in agents_with_checks:
            threshold = check.threshold
            agent = Agent.objects.get(pk=check.agent.pk)
            try:
                memhistory = MemoryHistory.objects.get(agent=agent)
            except ObjectDoesNotExist:
                pass
            else:
                check.more_info = memhistory.format_nice()
                check.save(update_fields=["more_info", "last_run"])
                avg = int(mean(memhistory.mem_history))
                if avg > threshold:
                    check.status = "failing"
                    check.save(update_fields=["status"])

                    if check.email_alert:
                        handle_check_email_alert_task.delay("memory", check.pk)

                else:
                    if check.status != "passing":
                        check.status = "passing"
                        check.save(update_fields=["status"])
                    check.save(update_fields=["last_run"])

    return "ok"