sources: delay source replacement

Wait for four consecutive source selections giving a bad status
(falseticker, bad distance or jittery) before triggering the source
replacement. This should reduce the rate of unnecessary replacements
and shorten the time needed to find a solution when unreplaceable
falsetickers are preventing other sources from forming a majority due
to switching back and forth to unreachable servers.
This commit is contained in:
Miroslav Lichvar 2023-06-15 12:54:32 +02:00
parent 123cb497b9
commit 2aefadd129
3 changed files with 71 additions and 22 deletions

View file

@ -112,6 +112,9 @@ struct SRC_Instance_Record {
/* Updates left before allowing combining */
int distant;
/* Updates with a status requiring source replacement */
int bad;
/* Flag indicating the status of the source */
SRC_Status status;
@ -178,6 +181,9 @@ static int reported_no_majority; /* Flag to avoid repeated log message
/* Number of updates needed to reset the distant status */
#define DISTANT_PENALTY 32
/* Number of updates needed to trigger handling of bad sources */
#define BAD_HANDLE_THRESHOLD 4
static double max_distance;
static double max_jitter;
static double reselect_distance;
@ -340,6 +346,7 @@ SRC_ResetInstance(SRC_Instance instance)
instance->reachability = 0;
instance->reachability_size = 0;
instance->distant = 0;
instance->bad = 0;
instance->status = SRC_BAD_STATS;
instance->sel_score = 1.0;
instance->stratum = 0;
@ -679,15 +686,20 @@ mark_source(SRC_Instance inst, SRC_Status status)
/* Try to replace NTP sources that are falsetickers, or have a root
distance or jitter larger than the allowed maximums */
if (inst == last_updated_inst) {
if (status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY)
if (inst->bad < INT_MAX &&
(status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY))
inst->bad++;
else
inst->bad = 0;
if (inst->bad >= BAD_HANDLE_THRESHOLD)
handle_bad_source(inst);
}
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d leap=%d vote=%d lo=%f hi=%f",
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d bad=%d leap=%d vote=%d lo=%f hi=%f",
source_to_string(inst), get_status_char(inst->status),
(unsigned int)inst->sel_options, (unsigned int)inst->reachability,
inst->reachability_size, inst->updates,
inst->distant, (int)inst->leap, inst->leap_vote,
inst->distant, inst->bad, (int)inst->leap, inst->leap_vote,
inst->sel_info.lo_limit, inst->sel_info.hi_limit);
if (logfileid == -1)

View file

@ -46,23 +46,4 @@ check_sync || test_fail
check_file_messages "20.*192.168.123.*" 15 17 measurements.log || test_fail
rm -f tmp/measurements.log
servers=6
falsetickers=2
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
wander=1e-12
jitter=1e-6
min_sync_time=7
run_test || test_fail
check_chronyd_exit || test_fail
check_source_selection || test_fail
check_packet_interval || test_fail
check_sync || test_fail
check_log_messages "Detected falseticker" 2 10 || test_fail
check_log_messages "Source 192.168.123.. replaced with" 1 1 || test_fail
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
check_file_messages "20.*00:\(00:07\|01:..\) 192.168.123.* 11.1 6 6 " 1 1 measurements.log || test_fail
rm -f tmp/measurements.log
test_pass

56
test/simulation/148-replacement Executable file
View file

@ -0,0 +1,56 @@
#!/usr/bin/env bash
. ./test.common
test_start "source replacement"
limit=5000
client_conf="logdir tmp
log measurements"
servers=6
falsetickers=2
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
wander=1e-12
jitter=1e-6
min_sync_time=7
run_test || test_fail
check_chronyd_exit || test_fail
check_source_selection || test_fail
check_packet_interval || test_fail
check_sync || test_fail
check_log_messages "Detected falseticker" 2 10 || test_fail
check_log_messages "Source 192.168.123.. replaced with" 1 3 || test_fail
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
check_file_messages "20.*00:[1-5].:.. 192.168.123.* 11.1 6 6 " 1 4 measurements.log || test_fail
rm -f tmp/measurements.log
# 1 unreplaceable falseticker against 2 replaceable unreachable servers
servers=5
falsetickers=1
limit=200000
base_delay="(+ 1e-4 (* -1 (equal 0.6 to 4.5)))"
client_conf+="
minsources 2"
client_server_conf="
server 192.168.123.1
server nodes-2-4.net1.clk
server nodes-3-5.net1.clk"
max_sync_time=150000
run_test || test_fail
check_chronyd_exit || test_fail
check_source_selection && test_fail
check_packet_interval || test_fail
check_sync || test_fail
check_log_messages "Detected falseticker" 2 10 || test_fail
check_log_messages "Source 192.168.123.. replaced with" 2 70 || test_fail
check_log_messages "2010-01-01T0[0-4]:.*Source 192.168.123.. replaced with" 2 15 || test_fail
check_log_messages "2010-01-01T0[5-9]:.*Source 192.168.123.. replaced with" 0 15 || test_fail
check_file_messages "20.*192.168.123.* 11.1 6 6 " 20 500 measurements.log || test_fail
rm -f tmp/measurements.log
test_pass