Skip to content

Commit c980483

Browse files
committed
Fix: servers with temporarily inflated RTT never recover
When noticeFailure() poisons a server's RTT EWMA during a brief outage, the server stops being selected and its metrics never improve. Add a generic recovery mechanism: gradually decay the RTT of dormant servers toward their initial measured value, replacing the old estimator-only recovery path.
1 parent 24ccca0 commit c980483

1 file changed

Lines changed: 58 additions & 18 deletions

File tree

dnscrypt-proxy/serversInfo.go

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ type ServerInfo struct {
6969
totalQueries uint64 // Total queries sent to this server
7070
failedQueries uint64 // Failed queries count
7171
lastUpdateTime time.Time // Last time metrics were updated
72+
lastDecayTS time.Time // Last time RTT was decayed for recovery
7273
}
7374

7475
type LBStrategy interface {
@@ -310,7 +311,6 @@ func (serversInfo *ServersInfo) estimatorUpdate(currentActive int) {
310311
serversInfo.inner[currentActive].rtt.Set(currentActiveRtt)
311312
return
312313
}
313-
partialSort := false
314314
if candidateRtt < currentActiveRtt {
315315
serversInfo.inner[candidate], serversInfo.inner[currentActive] = serversInfo.inner[currentActive], serversInfo.inner[candidate]
316316
dlog.Debugf(
@@ -319,27 +319,65 @@ func (serversInfo *ServersInfo) estimatorUpdate(currentActive int) {
319319
int(candidateRtt),
320320
int(currentActiveRtt),
321321
)
322-
partialSort = true
323-
} else if candidateRtt > 0 && candidateRtt >= (serversInfo.inner[0].rtt.Value()+serversInfo.inner[activeCount-1].rtt.Value())/2.0*4.0 {
324-
if time.Since(serversInfo.inner[candidate].lastActionTS) > time.Duration(1*time.Minute) {
325-
serversInfo.inner[candidate].rtt.Add(candidateRtt / 2.0)
326-
dlog.Debugf(
327-
"Giving a new chance to candidate [%s], lowering its RTT from %d to %d (best: %d)",
328-
serversInfo.inner[candidate].Name,
329-
int(candidateRtt),
330-
int(serversInfo.inner[candidate].rtt.Value()),
331-
int(serversInfo.inner[0].rtt.Value()),
332-
)
333-
partialSort = true
322+
serversInfo.sortByRtt()
323+
}
324+
}
325+
326+
func (serversInfo *ServersInfo) sortByRtt() {
327+
for i := 1; i < len(serversInfo.inner); i++ {
328+
if serversInfo.inner[i-1].rtt.Value() > serversInfo.inner[i].rtt.Value() {
329+
serversInfo.inner[i-1], serversInfo.inner[i] = serversInfo.inner[i], serversInfo.inner[i-1]
334330
}
335331
}
336-
if partialSort {
337-
for i := 1; i < serversCount; i++ {
338-
if serversInfo.inner[i-1].rtt.Value() > serversInfo.inner[i].rtt.Value() {
339-
serversInfo.inner[i-1], serversInfo.inner[i] = serversInfo.inner[i], serversInfo.inner[i-1]
340-
}
332+
}
333+
334+
func (serversInfo *ServersInfo) recoverDormantServers() {
335+
if len(serversInfo.inner) <= 1 {
336+
return
337+
}
338+
bestRtt := serversInfo.inner[0].rtt.Value()
339+
for _, server := range serversInfo.inner {
340+
if rtt := server.rtt.Value(); rtt > 0 && rtt < bestRtt {
341+
bestRtt = rtt
341342
}
342343
}
344+
if bestRtt <= 0 {
345+
return
346+
}
347+
now := time.Now()
348+
needsSort := false
349+
for _, server := range serversInfo.inner {
350+
currentRtt := server.rtt.Value()
351+
if currentRtt <= bestRtt*4.0 {
352+
continue
353+
}
354+
if now.Sub(server.lastActionTS) <= time.Minute {
355+
continue
356+
}
357+
if now.Sub(server.lastDecayTS) <= 10*time.Second {
358+
continue
359+
}
360+
targetRtt := float64(server.initialRtt)
361+
if targetRtt <= 0 {
362+
targetRtt = bestRtt
363+
}
364+
server.rtt.Add(targetRtt)
365+
newRtt := server.rtt.Value()
366+
server.totalQueries = 0
367+
server.failedQueries = 0
368+
server.lastDecayTS = now
369+
needsSort = true
370+
dlog.Debugf(
371+
"Giving a new chance to [%s], lowering its RTT from %d to %d (best: %d)",
372+
server.Name,
373+
int(currentRtt),
374+
int(newRtt),
375+
int(bestRtt),
376+
)
377+
}
378+
if needsSort {
379+
serversInfo.sortByRtt()
380+
}
343381
}
344382

345383
func (serversInfo *ServersInfo) getOne() *ServerInfo {
@@ -350,6 +388,8 @@ func (serversInfo *ServersInfo) getOne() *ServerInfo {
350388
return nil
351389
}
352390

391+
serversInfo.recoverDormantServers()
392+
353393
var candidate int
354394

355395
// Check if using WP2 strategy

0 commit comments

Comments
 (0)