[pdm-devel] [PATCH datacenter-manager 5/7] server: don't try to connect to known-unreachable servers

Wolfgang Bumiller w.bumiller at proxmox.com
Tue Feb 4 10:55:52 CET 2025
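
Store the remote name and each node's hostname alongside the clients in
the MultiClient. When a request to a node fails, mark that host as
unreachable in the RemoteMappingCache; hosts marked unreachable are then
skipped when selecting the next client, both at client creation and on
failover.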


Signed-off-by: Wolfgang Bumiller <w.bumiller at proxmox.com>
---
 server/src/connection.rs       | 93 +++++++++++++++++++++++++---------
 server/src/remote_cache/mod.rs |  6 +++
 2 files changed, 74 insertions(+), 25 deletions(-)
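
For review, the node-selection logic below boils down to the following
standalone sketch (illustration only, not part of the patch: a plain
HashSet of unreachable hostnames stands in for the persistent
RemoteMappingCache, and all other names are made up for the example):

use std::collections::HashSet;

struct Entry {
    hostname: String,
}

struct State {
    /// Raw counter, only taken modulo `entries.len()` when indexing.
    current: usize,
    entries: Vec<Entry>,
}

impl State {
    fn index(&self) -> usize {
        self.current % self.entries.len()
    }

    fn next(&mut self) {
        // wrapping_add so the raw counter never overflows
        self.current = self.current.wrapping_add(1);
    }

    /// Advance past hosts currently marked unreachable, at most once per entry.
    fn skip_unreachable(&mut self, unreachable: &HashSet<String>) {
        for _ in 0..self.entries.len() {
            if unreachable.contains(&self.entries[self.index()].hostname) {
                self.next();
            } else {
                return;
            }
        }
    }

    /// A failed request marks the current host as unreachable and advances,
    /// unless another task already advanced the counter in the meantime.
    fn failed(&mut self, failed_index: usize, unreachable: &mut HashSet<String>) {
        if self.current == failed_index {
            unreachable.insert(self.entries[self.index()].hostname.clone());
            self.next();
            self.skip_unreachable(unreachable);
        }
    }
}

fn main() {
    let mut unreachable = HashSet::new();
    let mut state = State {
        current: 0,
        entries: ["node1", "node2", "node3"]
            .iter()
            .map(|h| Entry { hostname: h.to_string() })
            .collect(),
    };
    // Simulate a failed request against the current node:
    let tried = state.current;
    state.failed(tried, &mut unreachable);
    // node1 was marked unreachable, so we moved on to node2.
    assert_eq!(state.index(), 1);
}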

diff --git a/server/src/connection.rs b/server/src/connection.rs
index 24e2e44..7c1c237 100644
--- a/server/src/connection.rs
+++ b/server/src/connection.rs
@@ -130,14 +130,17 @@ fn prepare_connect_multi_client(remote: &Remote) -> Result<(MultiClient, Connect
     let mut clients = Vec::new();
 
     for node in &remote.nodes {
-        clients.push(Arc::new(prepare_connect_client_to_node(
-            node,
-            info.default_port,
-            info.pve_compat,
-        )?));
+        clients.push(MultiClientEntry {
+            client: Arc::new(prepare_connect_client_to_node(
+                node,
+                info.default_port,
+                info.pve_compat,
+            )?),
+            hostname: node.hostname.clone(),
+        });
     }
 
-    Ok((MultiClient::new(clients), info))
+    Ok((MultiClient::new(remote.id.clone(), clients), info))
 }
 
 /// Like [`connect()`], but with failover support for remotes which can have multiple nodes.
@@ -449,6 +452,14 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
     }
 }
 
+/// In order to allow the [`MultiClient`] to check the cached reachability state of a client, we
+/// need to know which remote it belongs to, so store the metadata alongside the actual `Client`
+/// struct.
+struct MultiClientEntry {
+    client: Arc<Client>,
+    hostname: String,
+}
+
 /// This is another wrapper around the actual HTTP client responsible for dealing with connection
 /// problems: if we cannot reach a node of a cluster, this will attempt to retry a request on
 /// another node.
@@ -456,19 +467,15 @@ pub fn init(instance: Box<dyn ClientFactory + Send + Sync>) {
 /// # Possible improvements
 ///
 /// - For `GET` requests we could also start a 2nd request after a shorter time out (eg. 10s).
-/// - We could use RRD data for a "best guess" where to start eg. if we know a node was offline on
-///   the last rrd polling we'd start with a different one.
-///   For this, we still need to include the node names in the clients here somehow so that we can
-///   actually manage/re-shuffle them from the outside after this struct is already created.
 struct MultiClient {
     state: StdMutex<MultiClientState>,
     timeout: Duration,
 }
 
 impl MultiClient {
-    fn new(clients: Vec<Arc<Client>>) -> Self {
+    fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
         Self {
-            state: StdMutex::new(MultiClientState::new(clients)),
+            state: StdMutex::new(MultiClientState::new(remote, entries)),
             timeout: Duration::from_secs(60),
         }
     }
@@ -477,8 +484,8 @@ impl MultiClient {
     where
         F: Fn(&Arc<Client>),
     {
-        for client in &self.state.lock().unwrap().clients {
-            func(client);
+        for entry in &self.state.lock().unwrap().entries {
+            func(&entry.client);
         }
     }
 }
@@ -488,15 +495,24 @@ impl MultiClient {
 struct MultiClientState {
     /// The current index *not* modulo the client count.
     current: usize,
-    clients: Vec<Arc<Client>>,
+    remote: String,
+    entries: Vec<MultiClientEntry>,
 }
 
 impl MultiClientState {
-    fn new(clients: Vec<Arc<Client>>) -> Self {
-        Self {
+    fn new(remote: String, entries: Vec<MultiClientEntry>) -> Self {
+        let mut this = Self {
             current: 0,
-            clients,
-        }
+            remote,
+            entries,
+        };
+        this.skip_unreachable();
+        this
+    }
+
+    /// Move to the next entry; the raw counter wraps instead of overflowing.
+    fn next(&mut self) {
+        self.current = self.current.wrapping_add(1);
     }
 
     /// Whenever a request fails with the *current* client we move the current entry forward.
@@ -507,20 +523,47 @@ impl MultiClientState {
     /// the strategy, we may want to change this to something like `1 + max(current, failed)`.
     fn failed(&mut self, failed_index: usize) {
         if self.current == failed_index {
-            self.current = self.current.wrapping_add(1);
+            let entry = self.get_entry();
+            log::error!("marking client {} as unreachable", entry.hostname);
+            if let Ok(mut cache) = crate::remote_cache::RemoteMappingCache::write() {
+                cache.mark_host_reachable(&self.remote, &entry.hostname, false);
+                let _ = cache.save();
+            }
+            self.next();
+            self.skip_unreachable();
+        }
+    }
+
+    /// Skip ahead as long as we're pointing to a host marked as unreachable.
+    fn skip_unreachable(&mut self) {
+        let cache = crate::remote_cache::RemoteMappingCache::get();
+        // loop at most as many times as we have entries...
+        for _ in 0..self.entries.len() {
+            let entry = self.get_entry();
+            if !cache.host_is_reachable(&self.remote, &entry.hostname) {
+                log::error!("skipping host {} - marked unreachable", entry.hostname);
+                self.next();
+            } else {
+                return;
+            }
         }
     }
 
-    /// Get `current` as an *index* (i.e. modulo `clients.len()`).
+    /// Get `current` as an *index* (i.e. modulo `entries.len()`).
     fn index(&self) -> usize {
-        self.current % self.clients.len()
+        self.current % self.entries.len()
+    }
+
+    /// Get the current entry.
+    fn get_entry(&self) -> &MultiClientEntry {
+        &self.entries[self.index()]
     }
 
     /// Get the current client and its index which can be passed to `failed()` if the client fails
     /// to connect.
     fn get(&self) -> (Arc<Client>, usize) {
         let index = self.index();
-        (Arc::clone(&self.clients[index]), self.current)
+        (Arc::clone(&self.entries[index].client), self.current)
     }
 
     /// Check if we already tried all clients since a specific starting index.
@@ -533,11 +576,11 @@ impl MultiClientState {
     /// request-retry-loop will fail as soon as the same *number* of clients since its starting
     /// point were marked as faulty without retrying them all.
     fn tried_all_since(&self, start: usize) -> bool {
-        self.tried_clients(start) >= self.clients.len()
+        self.tried_clients(start) >= self.entries.len()
     }
 
     /// We store the current index continuously without wrapping it modulo the client count (and
-    /// only do that when indexing the `clients` array), so that we can easily check if "all
+    /// only do that when indexing the `entries` array), so that we can easily check if "all
     /// currently running tasks taken together" have already tried all clients by comparing our
     /// starting point to the current index.
     fn tried_clients(&self, start: usize) -> usize {
diff --git a/server/src/remote_cache/mod.rs b/server/src/remote_cache/mod.rs
index 69e79f1..57e4bf3 100644
--- a/server/src/remote_cache/mod.rs
+++ b/server/src/remote_cache/mod.rs
@@ -218,6 +218,12 @@ impl RemoteMappingCache {
             remote.set_node_name(hostname, node_name);
         }
     }
+
+    /// Check if a host is reachable. Hosts not present in the cache count as reachable.
+    pub fn host_is_reachable(&self, remote: &str, hostname: &str) -> bool {
+        self.info_by_hostname(remote, hostname)
+            .is_none_or(|info| info.reachable)
+    }
 }
 
 /// An entry for a remote in a [`RemoteMappingCache`].
-- 
2.39.5