sources: delay source replacement
Wait for four consecutive source selections giving a bad status (falseticker, bad distance or jittery) before triggering the source replacement. This should reduce the rate of unnecessary replacements and shorten the time needed to find a solution when unreplaceable falsetickers are preventing other sources from forming a majority due to switching back and forth to unreachable servers.
This commit is contained in:
parent
123cb497b9
commit
2aefadd129
3 changed files with 71 additions and 22 deletions
18
sources.c
18
sources.c
|
@ -112,6 +112,9 @@ struct SRC_Instance_Record {
|
||||||
/* Updates left before allowing combining */
|
/* Updates left before allowing combining */
|
||||||
int distant;
|
int distant;
|
||||||
|
|
||||||
|
/* Updates with a status requiring source replacement */
|
||||||
|
int bad;
|
||||||
|
|
||||||
/* Flag indicating the status of the source */
|
/* Flag indicating the status of the source */
|
||||||
SRC_Status status;
|
SRC_Status status;
|
||||||
|
|
||||||
|
@ -178,6 +181,9 @@ static int reported_no_majority; /* Flag to avoid repeated log message
|
||||||
/* Number of updates needed to reset the distant status */
|
/* Number of updates needed to reset the distant status */
|
||||||
#define DISTANT_PENALTY 32
|
#define DISTANT_PENALTY 32
|
||||||
|
|
||||||
|
/* Number of updates needed to trigger handling of bad sources */
|
||||||
|
#define BAD_HANDLE_THRESHOLD 4
|
||||||
|
|
||||||
static double max_distance;
|
static double max_distance;
|
||||||
static double max_jitter;
|
static double max_jitter;
|
||||||
static double reselect_distance;
|
static double reselect_distance;
|
||||||
|
@ -340,6 +346,7 @@ SRC_ResetInstance(SRC_Instance instance)
|
||||||
instance->reachability = 0;
|
instance->reachability = 0;
|
||||||
instance->reachability_size = 0;
|
instance->reachability_size = 0;
|
||||||
instance->distant = 0;
|
instance->distant = 0;
|
||||||
|
instance->bad = 0;
|
||||||
instance->status = SRC_BAD_STATS;
|
instance->status = SRC_BAD_STATS;
|
||||||
instance->sel_score = 1.0;
|
instance->sel_score = 1.0;
|
||||||
instance->stratum = 0;
|
instance->stratum = 0;
|
||||||
|
@ -679,15 +686,20 @@ mark_source(SRC_Instance inst, SRC_Status status)
|
||||||
/* Try to replace NTP sources that are falsetickers, or have a root
|
/* Try to replace NTP sources that are falsetickers, or have a root
|
||||||
distance or jitter larger than the allowed maximums */
|
distance or jitter larger than the allowed maximums */
|
||||||
if (inst == last_updated_inst) {
|
if (inst == last_updated_inst) {
|
||||||
if (status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY)
|
if (inst->bad < INT_MAX &&
|
||||||
|
(status == SRC_FALSETICKER || status == SRC_BAD_DISTANCE || status == SRC_JITTERY))
|
||||||
|
inst->bad++;
|
||||||
|
else
|
||||||
|
inst->bad = 0;
|
||||||
|
if (inst->bad >= BAD_HANDLE_THRESHOLD)
|
||||||
handle_bad_source(inst);
|
handle_bad_source(inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d leap=%d vote=%d lo=%f hi=%f",
|
DEBUG_LOG("%s status=%c options=%x reach=%o/%d updates=%d distant=%d bad=%d leap=%d vote=%d lo=%f hi=%f",
|
||||||
source_to_string(inst), get_status_char(inst->status),
|
source_to_string(inst), get_status_char(inst->status),
|
||||||
(unsigned int)inst->sel_options, (unsigned int)inst->reachability,
|
(unsigned int)inst->sel_options, (unsigned int)inst->reachability,
|
||||||
inst->reachability_size, inst->updates,
|
inst->reachability_size, inst->updates,
|
||||||
inst->distant, (int)inst->leap, inst->leap_vote,
|
inst->distant, inst->bad, (int)inst->leap, inst->leap_vote,
|
||||||
inst->sel_info.lo_limit, inst->sel_info.hi_limit);
|
inst->sel_info.lo_limit, inst->sel_info.hi_limit);
|
||||||
|
|
||||||
if (logfileid == -1)
|
if (logfileid == -1)
|
||||||
|
|
|
@ -46,23 +46,4 @@ check_sync || test_fail
|
||||||
check_file_messages "20.*192.168.123.*" 15 17 measurements.log || test_fail
|
check_file_messages "20.*192.168.123.*" 15 17 measurements.log || test_fail
|
||||||
rm -f tmp/measurements.log
|
rm -f tmp/measurements.log
|
||||||
|
|
||||||
servers=6
|
|
||||||
falsetickers=2
|
|
||||||
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
|
|
||||||
wander=1e-12
|
|
||||||
jitter=1e-6
|
|
||||||
min_sync_time=7
|
|
||||||
|
|
||||||
run_test || test_fail
|
|
||||||
check_chronyd_exit || test_fail
|
|
||||||
check_source_selection || test_fail
|
|
||||||
check_packet_interval || test_fail
|
|
||||||
check_sync || test_fail
|
|
||||||
|
|
||||||
check_log_messages "Detected falseticker" 2 10 || test_fail
|
|
||||||
check_log_messages "Source 192.168.123.. replaced with" 1 1 || test_fail
|
|
||||||
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
|
|
||||||
check_file_messages "20.*00:\(00:07\|01:..\) 192.168.123.* 11.1 6 6 " 1 1 measurements.log || test_fail
|
|
||||||
rm -f tmp/measurements.log
|
|
||||||
|
|
||||||
test_pass
|
test_pass
|
||||||
|
|
56
test/simulation/148-replacement
Executable file
56
test/simulation/148-replacement
Executable file
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
. ./test.common
|
||||||
|
|
||||||
|
test_start "source replacement"
|
||||||
|
|
||||||
|
limit=5000
|
||||||
|
client_conf="logdir tmp
|
||||||
|
log measurements"
|
||||||
|
|
||||||
|
servers=6
|
||||||
|
falsetickers=2
|
||||||
|
client_server_conf="pool nodes-1-2-3-4-5-6.net1.clk maxsources 5 polltarget 1 iburst"
|
||||||
|
wander=1e-12
|
||||||
|
jitter=1e-6
|
||||||
|
min_sync_time=7
|
||||||
|
|
||||||
|
run_test || test_fail
|
||||||
|
check_chronyd_exit || test_fail
|
||||||
|
check_source_selection || test_fail
|
||||||
|
check_packet_interval || test_fail
|
||||||
|
check_sync || test_fail
|
||||||
|
|
||||||
|
check_log_messages "Detected falseticker" 2 10 || test_fail
|
||||||
|
check_log_messages "Source 192.168.123.. replaced with" 1 3 || test_fail
|
||||||
|
check_file_messages "20.*192.168.123.* 11.1 6 6 " 15 17 measurements.log || test_fail
|
||||||
|
check_file_messages "20.*00:[1-5].:.. 192.168.123.* 11.1 6 6 " 1 4 measurements.log || test_fail
|
||||||
|
rm -f tmp/measurements.log
|
||||||
|
|
||||||
|
# 1 unreplaceable falseticker against 2 replaceable unreachable servers
|
||||||
|
servers=5
|
||||||
|
falsetickers=1
|
||||||
|
limit=200000
|
||||||
|
base_delay="(+ 1e-4 (* -1 (equal 0.6 to 4.5)))"
|
||||||
|
client_conf+="
|
||||||
|
minsources 2"
|
||||||
|
client_server_conf="
|
||||||
|
server 192.168.123.1
|
||||||
|
server nodes-2-4.net1.clk
|
||||||
|
server nodes-3-5.net1.clk"
|
||||||
|
max_sync_time=150000
|
||||||
|
|
||||||
|
run_test || test_fail
|
||||||
|
check_chronyd_exit || test_fail
|
||||||
|
check_source_selection && test_fail
|
||||||
|
check_packet_interval || test_fail
|
||||||
|
check_sync || test_fail
|
||||||
|
|
||||||
|
check_log_messages "Detected falseticker" 2 10 || test_fail
|
||||||
|
check_log_messages "Source 192.168.123.. replaced with" 2 70 || test_fail
|
||||||
|
check_log_messages "2010-01-01T0[0-4]:.*Source 192.168.123.. replaced with" 2 15 || test_fail
|
||||||
|
check_log_messages "2010-01-01T0[5-9]:.*Source 192.168.123.. replaced with" 0 15 || test_fail
|
||||||
|
check_file_messages "20.*192.168.123.* 11.1 6 6 " 20 500 measurements.log || test_fail
|
||||||
|
rm -f tmp/measurements.log
|
||||||
|
|
||||||
|
test_pass
|
Loading…
Reference in a new issue