chrony/sys_linux.c
Miroslav Lichvar 14687d003d sys: set tick_update_hz to 100 by default in Linux driver
We can't reliably detect the internal kernel HZ, it may not even be
fixed (CONFIG_NO_HZ). Use a fixed value of 100.
2014-05-23 16:15:28 +02:00

519 lines
15 KiB
C

/*
chronyd/chronyc - Programs for keeping computer clocks accurate.
**********************************************************************
* Copyright (C) Richard P. Curnow 1997-2003
* Copyright (C) John G. Hasler 2009
* Copyright (C) Miroslav Lichvar 2009-2012
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
**********************************************************************
=======================================================================
This is the module specific to the Linux operating system.
*/
#include "config.h"
#include "sysincl.h"
#include <sys/utsname.h>
#if defined(HAVE_SCHED_SETSCHEDULER)
# include <sched.h>
int SchedPriority = 0;
#endif
#if defined(HAVE_MLOCKALL)
# include <sys/mman.h>
#include <sys/resource.h>
int LockAll = 0;
#endif
#ifdef FEAT_LINUXCAPS
#include <sys/types.h>
#include <pwd.h>
#include <sys/prctl.h>
#include <sys/capability.h>
#include <grp.h>
#endif
#include "sys_generic.h"
#include "sys_linux.h"
#include "conf.h"
#include "logging.h"
#include "wrap_adjtimex.h"
/* This is the uncompensated system tick value */
static int nominal_tick;
/* The maximum amount by which 'tick' can be biased away from 'nominal_tick'
(sys_adjtimex() in the kernel bounds this to 10%) */
static int max_tick_bias;
/* This is the scaling required to go between absolute ppm and the
scaled ppm used as an argument to adjtimex. Because chronyd is to an extent
'closed loop' maybe it doesn't matter if this is wrongly determined, UNLESS
the system's ppm error is close to a multiple of HZ, in which case the
relationship between changing the frequency and changing the value of 'tick'
will be wrong. This would (I imagine) cause the system to thrash between
two states.
However..., if this effect was not corrected, and the system is left offline
for a long period, a substantial error would build up. e.g. with HZ==100,
the correction required is 128/128.125, giving a drift of about 84 seconds
per day). */
static double freq_scale;
/* The kernel HZ constant (USER_HZ in recent kernels). */
static int hz;
static double dhz; /* And dbl prec version of same for arithmetic */
/* Flag indicating whether adjtimex() can step the clock */
static int have_setoffset;
/* The assumed rate at which the effective frequency and tick values are
updated in the kernel */
static int tick_update_hz;
/* ================================================== */
inline static long
our_round(double x) {
long y;
if (x > 0.0)
y = x + 0.5;
else
y = x - 0.5;
return y;
}
/* ================================================== */
/* Positive means currently fast of true time, i.e. jump backwards */
static void
apply_step_offset(double offset)
{
if (TMX_ApplyStepOffset(-offset) < 0) {
LOG_FATAL(LOGF_SysLinux, "adjtimex() failed");
}
}
/* ================================================== */
/* This call sets the Linux kernel frequency to a given value in parts
per million relative to the nominal running frequency. Nominal is taken to
be tick=10000, freq=0 (for a HZ==100 system, other values otherwise). The
convention is that this is called with a positive argument if the local
clock runs fast when uncompensated. */
static double
set_frequency(double freq_ppm)
{
long required_tick;
double required_freq; /* what we use */
double scaled_freq; /* what adjtimex & the kernel use */
int required_delta_tick;
required_delta_tick = our_round(freq_ppm / dhz);
required_freq = -(freq_ppm - dhz * required_delta_tick);
required_tick = nominal_tick - required_delta_tick;
scaled_freq = freq_scale * required_freq;
if (TMX_SetFrequency(&scaled_freq, required_tick) < 0) {
LOG_FATAL(LOGF_SysLinux, "adjtimex failed for set_frequency, freq_ppm=%10.4e scaled_freq=%10.4e required_tick=%ld",
freq_ppm, scaled_freq, required_tick);
}
return dhz * (nominal_tick - required_tick) - scaled_freq / freq_scale;
}
/* ================================================== */
/* Read the ppm frequency from the kernel */
static double
read_frequency(void)
{
double tick_term;
double unscaled_freq;
double freq_term;
long tick;
if (TMX_GetFrequency(&unscaled_freq, &tick) < 0) {
LOG_FATAL(LOGF_SysLinux, "adjtimex() failed");
}
tick_term = dhz * (double)(nominal_tick - tick);
freq_term = unscaled_freq / freq_scale;
#if 0
LOG(LOGS_INFO, LOGF_SysLinux, "txc.tick=%ld txc.freq=%ld tick_term=%f freq_term=%f",
txc.tick, txc.freq, tick_term, freq_term);
#endif
return tick_term - freq_term;
}
/* ================================================== */
static void
set_leap(int leap)
{
if (TMX_SetLeap(leap) < 0) {
LOG_FATAL(LOGF_SysLinux, "adjtimex() failed in set_leap");
}
LOG(LOGS_INFO, LOGF_SysLinux, "System clock status set to %s leap second",
leap ? (leap > 0 ? "insert" : "delete") : "not insert/delete");
}
/* ================================================== */
/* Estimate the value of HZ given the value of txc.tick that chronyd finds when
* it starts. The only credible values are 100 (Linux/x86) or powers of 2.
* Also, the bounds checking inside the kernel's adjtimex system call enforces
* a +/- 10% movement of tick away from the nominal value 1e6/HZ. */
static void
guess_hz_and_shift_hz(int tick, int *hz, int *shift_hz)
{
int i, tick_lo, tick_hi, ihz;
double tick_nominal;
/* Pick off the hz=100 case first */
if (tick >= 9000 && tick <= 11000) {
*hz = 100;
*shift_hz = 7;
return;
}
for (i=4; i<16; i++) { /* surely 16 .. 32768 is a wide enough range? */
ihz = 1 << i;
tick_nominal = 1.0e6 / (double) ihz;
tick_lo = (int)(0.5 + tick_nominal*2.0/3.0);
tick_hi = (int)(0.5 + tick_nominal*4.0/3.0);
if (tick_lo < tick && tick <= tick_hi) {
*hz = ihz;
*shift_hz = i;
return;
}
}
/* oh dear. doomed. */
*hz = 0;
*shift_hz = 0;
}
/* ================================================== */
static int
get_hz_and_shift_hz(int *hz, int *shift_hz)
{
#ifdef _SC_CLK_TCK
if ((*hz = sysconf(_SC_CLK_TCK)) < 1) {
return 0;
}
if (*hz == 100) {
*shift_hz = 7;
return 1;
}
for (*shift_hz = 1; (*hz >> *shift_hz) > 1; (*shift_hz)++)
;
return 1;
#else
return 0;
#endif
}
/* ================================================== */
static int
kernelvercmp(int major1, int minor1, int patch1,
int major2, int minor2, int patch2)
{
if (major1 != major2)
return major1 - major2;
if (minor1 != minor2)
return minor1 - minor2;
return patch1 - patch2;
}
/* ================================================== */
/* Compute the scaling to use on any frequency we set, according to
the vintage of the Linux kernel being used. */
static void
get_version_specific_details(void)
{
int major, minor, patch;
int shift_hz;
double dshift_hz;
double basic_freq_scale; /* what to use if HZ!=100 */
int config_hz, set_config_hz; /* values of HZ from conf file */
int set_config_freq_scale;
double config_freq_scale;
struct tmx_params tmx_params;
struct utsname uts;
if (!get_hz_and_shift_hz(&hz, &shift_hz)) {
TMX_ReadCurrentParams(&tmx_params);
guess_hz_and_shift_hz(tmx_params.tick, &hz, &shift_hz);
if (!shift_hz) {
LOG_FATAL(LOGF_SysLinux, "Can't determine hz (txc.tick=%ld txc.freq=%ld (%.8f) txc.offset=%ld)",
tmx_params.tick, tmx_params.freq, tmx_params.dfreq, tmx_params.offset);
} else {
#if 0
LOG(LOGS_INFO, LOGF_SysLinux, "Initial txc.tick=%ld txc.freq=%ld (%.8f) txc.offset=%ld => hz=%d shift_hz=%d",
tmx_params.tick, tmx_params.freq, tmx_params.dfreq, tmx_params.offset, hz, shift_hz);
#endif
}
}
CNF_GetLinuxHz(&set_config_hz, &config_hz);
if (set_config_hz) hz = config_hz;
/* (If true, presumably freq_scale will be overridden anyway, making shift_hz
redundant too.) */
dhz = (double) hz;
dshift_hz = (double)(1UL << shift_hz);
basic_freq_scale = dshift_hz / dhz;
nominal_tick = (1000000L + (hz/2))/hz; /* Mirror declaration in kernel */
max_tick_bias = nominal_tick / 10;
/* We can't reliably detect the internal kernel HZ, it may not even be fixed
(CONFIG_NO_HZ aka tickless), assume the lowest commonly used fixed rate */
tick_update_hz = 100;
/* The basic_freq_scale comes from:
* the kernel increments the usec counter HZ times per second (if the timer
interrupt period were perfect)
* the following code in the kernel
time_adj (+/-)= ltemp >>
(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
causes the adjtimex 'freq' value to be divided down by 1<<SHIFT_HZ.
The net effect is that we have to scale up the value we want by the
reciprocal of all this, i.e. multiply by (1<<SHIFT_HZ)/HZ.
If HZ==100, this code in the kernel comes into play too:
#if HZ == 100
* Compensate for (HZ==100) != (1 << SHIFT_HZ).
* Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
*
if (time_adj < 0)
time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
else
time_adj += (time_adj >> 2) + (time_adj >> 5);
#endif
Special case that later.
*/
if (uname(&uts) < 0) {
LOG_FATAL(LOGF_SysLinux, "Cannot uname(2) to get kernel version, sorry.");
}
patch = 0;
if (sscanf(uts.release, "%d.%d.%d", &major, &minor, &patch) < 2) {
LOG_FATAL(LOGF_SysLinux, "Cannot read information from uname, sorry");
}
DEBUG_LOG(LOGF_SysLinux, "Linux kernel major=%d minor=%d patch=%d", major, minor, patch);
if (kernelvercmp(major, minor, patch, 2, 2, 0) < 0) {
LOG_FATAL(LOGF_SysLinux, "Kernel version not supported, sorry.");
}
if (kernelvercmp(major, minor, patch, 2, 6, 27) < 0) {
freq_scale = (hz == 100) ? (128.0 / 128.125) : basic_freq_scale;
} else {
/* These don't seem to need scaling */
freq_scale = 1.0;
if (kernelvercmp(major, minor, patch, 2, 6, 33) < 0) {
/* Tickless kernels before 2.6.33 accumulated ticks only in
half-second intervals */
tick_update_hz = 2;
}
}
/* ADJ_SETOFFSET support */
if (kernelvercmp(major, minor, patch, 2, 6, 39) < 0) {
have_setoffset = 0;
} else {
have_setoffset = 1;
}
/* Override freq_scale if it appears in conf file */
CNF_GetLinuxFreqScale(&set_config_freq_scale, &config_freq_scale);
if (set_config_freq_scale) {
freq_scale = config_freq_scale;
}
DEBUG_LOG(LOGF_SysLinux, "hz=%d shift_hz=%d freq_scale=%.8f nominal_tick=%d max_tick_bias=%d",
hz, shift_hz, freq_scale, nominal_tick, max_tick_bias);
}
/* ================================================== */
/* Initialisation code for this module */
void
SYS_Linux_Initialise(void)
{
get_version_specific_details();
if (TMX_ResetOffset() < 0) {
LOG_FATAL(LOGF_SysLinux, "adjtimex() failed");
}
if (have_setoffset && TMX_TestStepOffset() < 0) {
LOG(LOGS_INFO, LOGF_SysLinux, "adjtimex() doesn't support ADJ_SETOFFSET");
have_setoffset = 0;
}
TMX_SetSync(CNF_GetRtcSync());
SYS_Generic_CompleteFreqDriver(1.0e6 * max_tick_bias / nominal_tick,
1.0 / tick_update_hz,
read_frequency, set_frequency,
have_setoffset ? apply_step_offset : NULL,
set_leap);
}
/* ================================================== */
/* Finalisation code for this module */
void
SYS_Linux_Finalise(void)
{
SYS_Generic_Finalise();
}
/* ================================================== */
#ifdef FEAT_LINUXCAPS
void
SYS_Linux_DropRoot(char *user)
{
struct passwd *pw;
cap_t cap;
if (user == NULL)
return;
if ((pw = getpwnam(user)) == NULL) {
LOG_FATAL(LOGF_SysLinux, "getpwnam(%s) failed", user);
}
if (prctl(PR_SET_KEEPCAPS, 1)) {
LOG_FATAL(LOGF_SysLinux, "prcap() failed");
}
if (setgroups(0, NULL)) {
LOG_FATAL(LOGF_SysLinux, "setgroups() failed");
}
if (setgid(pw->pw_gid)) {
LOG_FATAL(LOGF_SysLinux, "setgid(%d) failed", pw->pw_gid);
}
if (setuid(pw->pw_uid)) {
LOG_FATAL(LOGF_SysLinux, "setuid(%d) failed", pw->pw_uid);
}
if ((cap = cap_from_text("cap_sys_time=ep")) == NULL) {
LOG_FATAL(LOGF_SysLinux, "cap_from_text() failed");
}
if (cap_set_proc(cap)) {
LOG_FATAL(LOGF_SysLinux, "cap_set_proc() failed");
}
cap_free(cap);
#if 0
LOG(LOGS_INFO, LOGF_SysLinux, "Privileges dropped to user %s", user);
#endif
}
#endif
/* ================================================== */
#if defined(HAVE_SCHED_SETSCHEDULER)
/* Install SCHED_FIFO real-time scheduler with specified priority */
void SYS_Linux_SetScheduler(int SchedPriority)
{
int pmax, pmin;
struct sched_param sched;
if (SchedPriority < 1 || SchedPriority > 99) {
LOG_FATAL(LOGF_SysLinux, "Bad scheduler priority: %d", SchedPriority);
} else {
sched.sched_priority = SchedPriority;
pmax = sched_get_priority_max(SCHED_FIFO);
pmin = sched_get_priority_min(SCHED_FIFO);
if ( SchedPriority > pmax ) {
sched.sched_priority = pmax;
}
else if ( SchedPriority < pmin ) {
sched.sched_priority = pmin;
}
if ( sched_setscheduler(0, SCHED_FIFO, &sched) == -1 ) {
LOG(LOGS_ERR, LOGF_SysLinux, "sched_setscheduler() failed");
}
else {
#if 0
LOG(LOGS_INFO, LOGF_SysLinux, "Enabled SCHED_FIFO with priority %d", sched.sched_priority);
#endif
}
}
}
#endif /* HAVE_SCHED_SETSCHEDULER */
#if defined(HAVE_MLOCKALL)
/* Lock the process into RAM so that it will never be swapped out */
void SYS_Linux_MemLockAll(int LockAll)
{
struct rlimit rlim;
if (LockAll == 1 ) {
/* Make sure that we will be able to lock all the memory we need */
/* even after dropping privileges. This does not actually reaerve any memory */
rlim.rlim_max = RLIM_INFINITY;
rlim.rlim_cur = RLIM_INFINITY;
if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
LOG(LOGS_ERR, LOGF_SysLinux, "setrlimit() failed: not locking into RAM");
}
else {
if (mlockall(MCL_CURRENT|MCL_FUTURE) < 0) {
LOG(LOGS_ERR, LOGF_SysLinux, "mlockall() failed");
}
else {
#if 0
LOG(LOGS_INFO, LOGF_SysLinux, "Successfully locked into RAM");
#endif
}
}
}
}
#endif /* HAVE_MLOCKALL */