sources: delay source replacement
Wait for four consecutive source selections giving a bad status (falseticker, bad distance or jittery) before triggering the source replacement. This should reduce the rate of unnecessary replacements and shorten the time needed to find a solution when unreplaceable falsetickers are preventing other sources from forming a majority due to switching back and forth to unreachable servers.
This commit is contained in:
parent
123cb497b9
commit
2aefadd129
3 changed files with 71 additions and 22 deletions
18
sources.c
18
sources.c
|
@ -112,6 +112,9 @@ struct SRC_Instance_Record {
|
|||
/* Updates left before allowing combining */
|
||||
int distant;
|
||||
|
||||
/* Updates with a status requiring source replacement */
|
||||
int bad;
|
||||
|
||||
/* Flag indicating the status of the source */
|
||||
SRC_Status status;
|
||||
|
||||
|
@ -178,6 +181,9 @@ static int reported_no_majority; /* Flag to avoid repeated log message
|
|||
/* Number of updates needed to reset the distant status */
|
||||
#define DISTANT_PENALTY 32
|
||||
|
||||
/* Number of updates needed to trigger handling of bad sources */
|
||||
#define BAD_HANDLE_THRESHOLD 4
|
||||
|
||||
static double max_distance;
|
||||
static double max_jitter;
|
||||
static double reselect_distance;
|
||||
|
@ -340,6 +346,7 @@ SRC_ResetInstance(SRC_Instance instance)
|
|||
instance->reachability = 0;
|
||||
instance->reachability_size = 0;
|
||||
instance->distant = 0;
|
||||
instance->bad = 0;
|
||||
instance->status = SRC_BAD_STATS;
|
||||
instance->sel_score = 1.0;
|
||||
instance->stratum = 0;
|
||||
|
@ -679,15 +686,20 @@ mark_source(SRC_Instance inst, SRC_Status status)
|
|||
/* Try to replace NTP sources that are falsetickers, or have a root
|
||||
distance or jitter larger than the allowed maximums */
|
||||
if (inst == last_updated_inst) {
|
||||
if (status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY)
|
||||
if (inst->bad < INT_MAX &&
|
||||
(status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY))
|
||||
inst->bad++;
|
||||
else
|
||||
inst->bad = 0;
|
||||
if (inst->bad >= BAD_HANDLE_THRESHOLD)
|
||||
handle_bad_source(inst);
|
||||
}
|
||||
|
||||
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d leap=%d vote=%d lo=%f hi=%f",
|
||||
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d bad=%d leap=%d vote=%d lo=%f hi=%f",
|
||||
source_to_string(inst), get_status_char(inst->status),
|
||||
(unsigned int)inst->sel_options, (unsigned int)inst->reachability,
|
||||
inst->reachability_size, inst->updates,
|
||||
inst->distant, (int)inst->leap, inst->leap_vote,
|
||||
inst->distant, inst->bad, (int)inst->leap, inst->leap_vote,
|
||||
inst->sel_info.lo_limit, inst->sel_info.hi_limit);
|
||||
|
||||
if (logfileid == -1)
|
||||
|
|
|
@ -46,23 +46,4 @@ check_sync || test_fail
|
|||
check_file_messages "20.*192.168.123.*" 15 17 measurements.log || test_fail
|
||||
rm -f tmp/measurements.log
|
||||
|
||||
servers=6
|
||||
falsetickers=2
|
||||
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
|
||||
wander=1e-12
|
||||
jitter=1e-6
|
||||
min_sync_time=7
|
||||
|
||||
run_test || test_fail
|
||||
check_chronyd_exit || test_fail
|
||||
check_source_selection || test_fail
|
||||
check_packet_interval || test_fail
|
||||
check_sync || test_fail
|
||||
|
||||
check_log_messages "Detected falseticker" 2 10 || test_fail
|
||||
check_log_messages "Source 192.168.123.. replaced with" 1 1 || test_fail
|
||||
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
|
||||
check_file_messages "20.*00:\(00:07\|01:..\) 192.168.123.* 11.1 6 6 " 1 1 measurements.log || test_fail
|
||||
rm -f tmp/measurements.log
|
||||
|
||||
test_pass
|
||||
|
|
56
test/simulation/148-replacement
Executable file
56
test/simulation/148-replacement
Executable file
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
. ./test.common
|
||||
|
||||
test_start "source replacement"
|
||||
|
||||
limit=5000
|
||||
client_conf="logdir tmp
|
||||
log measurements"
|
||||
|
||||
servers=6
|
||||
falsetickers=2
|
||||
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
|
||||
wander=1e-12
|
||||
jitter=1e-6
|
||||
min_sync_time=7
|
||||
|
||||
run_test || test_fail
|
||||
check_chronyd_exit || test_fail
|
||||
check_source_selection || test_fail
|
||||
check_packet_interval || test_fail
|
||||
check_sync || test_fail
|
||||
|
||||
check_log_messages "Detected falseticker" 2 10 || test_fail
|
||||
check_log_messages "Source 192.168.123.. replaced with" 1 3 || test_fail
|
||||
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
|
||||
check_file_messages "20.*00:[1-5].:.. 192.168.123.* 11.1 6 6 " 1 4 measurements.log || test_fail
|
||||
rm -f tmp/measurements.log
|
||||
|
||||
# 1 unreplaceable falseticker against 2 replaceable unreachable servers
|
||||
servers=5
|
||||
falsetickers=1
|
||||
limit=200000
|
||||
base_delay="(+ 1e-4 (* -1 (equal 0.6 to 4.5)))"
|
||||
client_conf+="
|
||||
minsources 2"
|
||||
client_server_conf="
|
||||
server 192.168.123.1
|
||||
server nodes-2-4.net1.clk
|
||||
server nodes-3-5.net1.clk"
|
||||
max_sync_time=150000
|
||||
|
||||
run_test || test_fail
|
||||
check_chronyd_exit || test_fail
|
||||
check_source_selection && test_fail
|
||||
check_packet_interval || test_fail
|
||||
check_sync || test_fail
|
||||
|
||||
check_log_messages "Detected falseticker" 2 10 || test_fail
|
||||
check_log_messages "Source 192.168.123.. replaced with" 2 70 || test_fail
|
||||
check_log_messages "2010-01-01T0[0-4]:.*Source 192.168.123.. replaced with" 2 15 || test_fail
|
||||
check_log_messages "2010-01-01T0[5-9]:.*Source 192.168.123.. replaced with" 0 15 || test_fail
|
||||
check_file_messages "20.*192.168.123.* 11.1 6 6 " 20 500 measurements.log || test_fail
|
||||
rm -f tmp/measurements.log
|
||||
|
||||
test_pass
|
Loading…
Reference in a new issue