Perf: Batch systemd queries in list_executions endpoints

odesenfans · nesitor · commit 4dbda19b7945 · 2026-01-27T16:46:03.000+01:00
Replace per-VM systemd D-Bus calls with a single batch query using
ListUnits(). This reduces the number of D-Bus calls from O(n) to O(1)
for persistent VMs, significantly improving response times on CRNs
with many instances.

- Add get_services_active_states() method to SystemDManager that
  queries all service states in one ListUnits() call
- Add _get_executions_running_states() helper in views to pre-fetch
  all running states efficiently
- Update list_executions and list_executions_v2 to use batch query
diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py
@@ -188,9 +188,43 @@ async def about_executions(request: web.Request) -> web.Response:
     )
 
 
+def _get_executions_running_states(pool: VmPool) -> dict[ItemHash, bool]:
+    """Compute the running flag of every execution in the pool.
+
+    Persistent executions that have a systemd manager are resolved together
+    through one ``get_services_active_states()`` call on the pool's manager;
+    every other execution is judged from its start/stop timestamps.
+    """
+    def _uses_systemd(vm) -> bool:
+        return bool(vm.persistent and vm.systemd_manager)
+
+    # Controller services to look up, de-duplicated in pool order.
+    candidates = (vm.controller_service for vm in pool.executions.values() if _uses_systemd(vm))
+    wanted = list(dict.fromkeys(candidates))
+
+    # NOTE(review): the lookup goes through pool.systemd_manager while the
+    # guard tests execution.systemd_manager -- confirm both are the same manager.
+    active_by_service: dict[str, bool] = (
+        pool.systemd_manager.get_services_active_states(wanted) if wanted else {}
+    )
+
+    return {
+        vm_hash: (
+            active_by_service.get(vm.controller_service, False)
+            if _uses_systemd(vm)
+            else bool(vm.times.starting_at and not vm.times.stopping_at)
+        )
+        for vm_hash, vm in pool.executions.items()
+    }
+
+
 @cors_allow_all
 async def list_executions(request: web.Request) -> web.Response:
     pool: VmPool = request.app["vm_pool"]
+
+    # Get running states efficiently using batch systemd query
+    running_states = _get_executions_running_states(pool)
+
     return web.json_response(
         {
             item_hash: {
@@ -200,7 +234,7 @@ async def list_executions(request: web.Request) -> web.Response:
                 },
             }
             for item_hash, execution in pool.executions.items()
-            if execution.is_running
+            if running_states.get(item_hash, False)
         },
         dumps=dumps_for_json,
     )
@@ -211,6 +245,9 @@ async def list_executions_v2(request: web.Request) -> web.Response:
     """List all executions. Returning their status and ip"""
     pool: VmPool = request.app["vm_pool"]
 
+    # Get running states efficiently using batch systemd query
+    running_states = _get_executions_running_states(pool)
+
     return web.json_response(
         {
             item_hash: {
@@ -227,7 +264,7 @@ async def list_executions_v2(request: web.Request) -> web.Response:
                     else {}
                 ),
                 "status": execution.times,
-                "running": execution.is_running,
+                "running": running_states.get(item_hash, False),
             }
             for item_hash, execution in pool.executions.items()
         },
diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py
@@ -130,6 +130,48 @@ def is_service_active(self, service: str) -> bool:
             logger.error(error)
             return False
 
+    def get_services_active_states(self, services: list[str]) -> dict[str, bool]:
+        """Report, for each requested service, whether its unit is listed as active.
+
+        A single ``ListUnits()`` call on the manager is used for the whole
+        batch, rather than one ``is_service_active()`` query per service.
+
+        Args:
+            services: Unit names to look up (e.g., ["aleph-vm-controller@hash.service"])
+
+        Returns:
+            Dictionary with one entry per requested name: True when the unit's
+            active state is exactly "active", False when it has another state,
+            is absent from the ListUnits() result, or the D-Bus call failed.
+        """
+        if not services:
+            return {}
+
+        try:
+            listed = self._get_manager().ListUnits()
+            wanted = set(services)
+
+            # Each ListUnits() entry is a tuple laid out as:
+            #   (name, description, load_state, active_state, sub_state,
+            #    following, unit_path, job_id, job_type, job_path)
+            # so index 0 is the unit name and index 3 its active state.
+            states: dict[str, bool] = {}
+            for entry in listed:
+                unit_name = str(entry[0])
+                if unit_name not in wanted:
+                    continue
+                states[unit_name] = str(entry[3]) == "active"
+
+            # Requested names missing from the listing default to inactive.
+            for requested in services:
+                states.setdefault(requested, False)
+        except DBusException as error:
+            logger.error(f"Failed to get services active states: {error}")
+            # On a D-Bus failure every requested service is reported inactive.
+            return dict.fromkeys(services, False)
+
+        return states
+
     async def enable_and_start(self, service: str) -> None:
         if not self.is_service_enabled(service):
             self.enable(service)