agent recovery improvements

2020-11-25 23:48:14 +00:00 · 2020-11-25 23:48:14 +00:00 · 6e048b2a12
parent f9657599c2
commit 6e048b2a12
5 changed files with 74 additions and 19 deletions
--- a/api/tacticalrmm/agents/migrations/0026_auto_20201125_2334.py
+++ b/api/tacticalrmm/agents/migrations/0026_auto_20201125_2334.py
@ -0,0 +1,18 @@
+# Generated by Django 3.1.3 on 2020-11-25 23:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agents', '0025_auto_20201122_0407'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recoveryaction',
+            name='mode',
+            field=models.CharField(choices=[('salt', 'Salt'), ('mesh', 'Mesh'), ('command', 'Command'), ('rpc', 'Nats RPC'), ('checkrunner', 'Checkrunner')], default='mesh', max_length=50),
+        ),
+    ]
--- a/api/tacticalrmm/agents/models.py
+++ b/api/tacticalrmm/agents/models.py
@ -753,6 +753,7 @@ RECOVERY_CHOICES = [
    ("mesh", "Mesh"),
    ("command", "Command"),
    ("rpc", "Nats RPC"),
+    ("checkrunner", "Checkrunner"),
 ]


--- a/api/tacticalrmm/agents/tests.py
+++ b/api/tacticalrmm/agents/tests.py
@ -640,14 +640,17 @@ class TestAgentViews(TacticalTestCase):
        self.check_not_authenticated("post", url)

    @patch("agents.models.Agent.nats_cmd")
-    def test_recover_mesh(self, mock_ret):
+    def test_recover_mesh(self, nats_cmd):
        url = f"/agents/{self.agent.pk}/recovermesh/"
-        mock_ret.return_value = True
+        nats_cmd.return_value = "ok"
        r = self.client.get(url)
        self.assertEqual(r.status_code, 200)
        self.assertIn(self.agent.hostname, r.data)
+        nats_cmd.assert_called_with(
+            {"func": "recover", "payload": {"mode": "mesh"}}, timeout=45
+        )

-        mock_ret.return_value = "timeout"
+        nats_cmd.return_value = "timeout"
        r = self.client.get(url)
        self.assertEqual(r.status_code, 400)

--- a/api/tacticalrmm/agents/views.py
+++ b/api/tacticalrmm/agents/views.py
@ -636,25 +636,51 @@ def install_agent(request):
@api_view(["POST"])
 def recover(request):
    agent = get_object_or_404(Agent, pk=request.data["pk"])
+    mode = request.data["mode"]

    if pyver.parse(agent.version) <= pyver.parse("0.9.5"):
        return notify_error("Only available in agent version greater than 0.9.5")

+    if not agent.has_nats:
+        if mode == "tacagent" or mode == "checkrunner":
+            return notify_error("Requires agent version 1.1.0 or greater")
+
+    # attempt a realtime recovery if supported, otherwise fall back to old recovery method
+    if agent.has_nats:
+        if (
+            mode == "tacagent"
+            or mode == "checkrunner"
+            or mode == "salt"
+            or mode == "mesh"
+        ):
+            data = {"func": "recover", "payload": {"mode": mode}}
+            r = asyncio.run(agent.nats_cmd(data, timeout=10))
+            if r == "ok":
+                return Response("Successfully completed recovery")
+
    if agent.recoveryactions.filter(last_run=None).exists():
        return notify_error(
            "A recovery action is currently pending. Please wait for the next agent check-in."
        )

-    if request.data["mode"] == "command" and not request.data["cmd"]:
+    if mode == "command" and not request.data["cmd"]:
        return notify_error("Command is required")

+    # if we've made it this far, realtime recovery was unavailable or didn't work.
+    # the tacagent service is itself the fallback recovery path, so it obviously can't be used to recover itself if it's down
+    if mode == "tacagent":
+        return notify_error(
+            "Requires RPC service to be functional. Please recover that first"
+        )
+
+    # queue the recovery for the agent's next check-in; we only get here when realtime recovery was not attempted or did not succeed
    RecoveryAction(
        agent=agent,
-        mode=request.data["mode"],
-        command=request.data["cmd"] if request.data["mode"] == "command" else None,
+        mode=mode,
+        command=request.data["cmd"] if mode == "command" else None,
    ).save()

-    return Response(f"Recovery will be attempted on the agent's next check-in")
+    return Response("Recovery will be attempted on the agent's next check-in")


@api_view(["POST"])
@ -695,8 +721,10 @@ def recover_mesh(request, pk):
    agent = get_object_or_404(Agent, pk=pk)
    if not agent.has_nats:
        return notify_error("Requires agent version 1.1.0 or greater")
-    r = asyncio.run(agent.nats_cmd({"func": "recovermesh"}, timeout=45))
-    if r == "timeout":
+
+    data = {"func": "recover", "payload": {"mode": "mesh"}}
+    r = asyncio.run(agent.nats_cmd(data, timeout=45))
+    if r != "ok":
        return notify_error("Unable to contact the agent")

    return Response(f"Repaired mesh agent on {agent.hostname}")
--- a/web/src/components/modals/agents/AgentRecovery.vue
+++ b/web/src/components/modals/agents/AgentRecovery.vue
@ -11,29 +11,31 @@
        <div class="q-gutter-sm">
          <q-radio dense v-model="mode" val="mesh" label="Mesh Agent" />
          <q-radio dense v-model="mode" val="rpc" label="Tactical RPC" />
+          <q-radio dense v-model="mode" val="tacagent" label="Tactical Agent" />
+          <q-radio dense v-model="mode" val="checkrunner" label="Tactical Checkrunner" />
          <q-radio dense v-model="mode" val="salt" label="Salt Minion" />
          <q-radio dense v-model="mode" val="command" label="Shell Command" />
        </div>
      </q-card-section>
      <q-card-section v-show="mode === 'mesh'">
-        <p>Fix issues with the Mesh Agent which handles take control, terminal and file browser.</p>
+        <p>Fix issues with the Mesh Agent which handles take control, live terminal and file browser.</p>
+      </q-card-section>
+      <q-card-section v-show="mode === 'tacagent'">
+        <p>Fix issues with the TacticalAgent windows service which handles agent check-in and os info.</p>
+      </q-card-section>
+      <q-card-section v-show="mode === 'checkrunner'">
+        <p>Fix issues with the Tactical Checkrunner windows service which handles running all checks.</p>
      </q-card-section>
      <q-card-section v-show="mode === 'salt'">
-        <p>
-          Fix issues with the salt-minion (do this if getting alot of errors about not being able to contact the agent
-          even if it's online).
-        </p>
+        <p>Fix issues with the salt-minion which handles windows updates, chocolatey and scheduled tasks.</p>
      </q-card-section>
      <q-card-section v-show="mode === 'rpc'">
-        <p>
-          Fix issues with the Tactical RPC service (do this if getting alot of errors about not being able to contact
-          the agent even if it's online).
-        </p>
+        <p>Fix issues with the Tactical RPC service which handles most of the agent's realtime functions.</p>
      </q-card-section>
      <q-card-section v-show="mode === 'command'">
        <p>Run a shell command on the agent.</p>
        <p>You should use the 'Send Command' feature from the agent's context menu for sending shell commands.</p>
-        <p>Only use this as a last resort if unable to recover the salt-minion.</p>
+        <p>Only use this as a last resort if unable to recover the Tactical RPC service.</p>
        <q-input
          ref="input"
          v-model="cmd"
@ -82,6 +84,7 @@ export default {
  },
  methods: {
    recover() {
+      this.$q.loading.show();
      const data = {
        pk: this.pk,
        cmd: this.cmd,
@ -90,10 +93,12 @@ export default {
      this.$axios
        .post("/agents/recover/", data)
        .then(r => {
+          this.$q.loading.hide();
          this.$emit("close");
          this.notifySuccess(r.data, 5000);
        })
        .catch(e => {
+          this.$q.loading.hide();
          this.notifyError(e.response.data, 5000);
        });
    },