From 2aefadd129c57fa8169bace240accb511790aa86 Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Thu, 15 Jun 2023 12:54:32 +0200 Subject: [PATCH] sources: delay source replacement Wait for four consecutive source selections giving a bad status (falseticker, bad distance or jittery) before triggering the source replacement. This should reduce the rate of unnecessary replacements and shorten the time needed to find a solution when unreplaceable falsetickers are preventing other sources from forming a majority due to switching back and forth to unreachable servers. --- sources.c | 18 +++++++++-- test/simulation/137-pool | 19 ----------- test/simulation/148-replacement | 56 +++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 22 deletions(-) create mode 100755 test/simulation/148-replacement diff --git a/sources.c b/sources.c index 4f6b41d..0d6b1c0 100644 --- a/sources.c +++ b/sources.c @@ -112,6 +112,9 @@ struct SRC_Instance_Record { /* Updates left before allowing combining */ int distant; + /* Updates with a status requiring source replacement */ + int bad; + /* Flag indicating the status of the source */ SRC_Status status; @@ -178,6 +181,9 @@ static int reported_no_majority; /* Flag to avoid repeated log message /* Number of updates needed to reset the distant status */ #define DISTANT_PENALTY 32 +/* Number of updates needed to trigger handling of bad sources */ +#define BAD_HANDLE_THRESHOLD 4 + static double max_distance; static double max_jitter; static double reselect_distance; @@ -340,6 +346,7 @@ SRC_ResetInstance(SRC_Instance instance) instance->reachability = 0; instance->reachability_size = 0; instance->distant = 0; + instance->bad = 0; instance->status = SRC_BAD_STATS; instance->sel_score = 1.0; instance->stratum = 0; @@ -679,15 +686,20 @@ mark_source(SRC_Instance inst, SRC_Status status) /* Try to replace NTP sources that are falsetickers, or have a root distance or jitter larger than the allowed maximums */ if (inst == last_updated_inst) { - if (status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY) + if (inst->bad < INT_MAX && + (status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY)) + inst->bad++; + else + inst->bad = 0; + if (inst->bad >= BAD_HANDLE_THRESHOLD) handle_bad_source(inst); } - DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d leap=%d vote=%d lo=%f hi=%f", + DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d bad=%d leap=%d vote=%d lo=%f hi=%f", source_to_string(inst), get_status_char(inst->status), (unsigned int)inst->sel_options, (unsigned int)inst->reachability, inst->reachability_size, inst->updates, - inst->distant, (int)inst->leap, inst->leap_vote, + inst->distant, inst->bad, (int)inst->leap, inst->leap_vote, inst->sel_info.lo_limit, inst->sel_info.hi_limit); if (logfileid == -1) diff --git a/test/simulation/137-pool b/test/simulation/137-pool index 2e9564e..de8d77d 100755 --- a/test/simulation/137-pool +++ b/test/simulation/137-pool @@ -46,23 +46,4 @@ check_sync || test_fail check_file_messages "20.*192.168.123.*" 15 17 measurements.log || test_fail rm -f tmp/measurements.log -servers=6 -falsetickers=2 -client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst" -wander=1e-12 -jitter=1e-6 -min_sync_time=7 - -run_test || test_fail -check_chronyd_exit || test_fail -check_source_selection || test_fail -check_packet_interval || test_fail -check_sync || test_fail - -check_log_messages "Detected falseticker" 2 10 || test_fail -check_log_messages "Source 192.168.123.. replaced with" 1 1 || test_fail -check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail -check_file_messages "20.*00:\(00:07\|01:..\) 192.168.123.* 11.1 6 6 " 1 1 measurements.log || test_fail -rm -f tmp/measurements.log - test_pass diff --git a/test/simulation/148-replacement b/test/simulation/148-replacement new file mode 100755 index 0000000..b4be53b --- /dev/null +++ b/test/simulation/148-replacement @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +. ./test.common + +test_start "source replacement" + +limit=5000 +client_conf="logdir tmp +log measurements" + +servers=6 +falsetickers=2 +client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst" +wander=1e-12 +jitter=1e-6 +min_sync_time=7 + +run_test || test_fail +check_chronyd_exit || test_fail +check_source_selection || test_fail +check_packet_interval || test_fail +check_sync || test_fail + +check_log_messages "Detected falseticker" 2 10 || test_fail +check_log_messages "Source 192.168.123.. replaced with" 1 3 || test_fail +check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail +check_file_messages "20.*00:[1-5].:.. 192.168.123.* 11.1 6 6 " 1 4 measurements.log || test_fail +rm -f tmp/measurements.log + +# 1 unreplaceable falseticker against 2 replaceable unreachable servers +servers=5 +falsetickers=1 +limit=200000 +base_delay="(+ 1e-4 (* -1 (equal 0.6 to 4.5)))" +client_conf+=" +minsources 2" +client_server_conf=" +server 192.168.123.1 +server nodes-2-4.net1.clk +server nodes-3-5.net1.clk" +max_sync_time=150000 + +run_test || test_fail +check_chronyd_exit || test_fail +check_source_selection && test_fail +check_packet_interval || test_fail +check_sync || test_fail + +check_log_messages "Detected falseticker" 2 10 || test_fail +check_log_messages "Source 192.168.123.. replaced with" 2 70 || test_fail +check_log_messages "2010-01-01T0[0-4]:.*Source 192.168.123.. replaced with" 2 15 || test_fail +check_log_messages "2010-01-01T0[5-9]:.*Source 192.168.123.. replaced with" 0 15 || test_fail +check_file_messages "20.*192.168.123.* 11.1 6 6 " 20 500 measurements.log || test_fail +rm -f tmp/measurements.log + +test_pass