#include "libcflat.h" #include "smp.h" #include "atomic.h" #include "processor.h" #include "kvmclock.h" #include "asm/barrier.h" #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) struct pvclock_vcpu_time_info __attribute__((aligned(4))) hv_clock[MAX_CPU]; struct pvclock_wall_clock wall_clock; static unsigned char valid_flags = 0; static atomic64_t last_value = ATOMIC64_INIT(0); /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) { u64 product; #ifdef __i386__ u32 tmp1, tmp2; #endif if (shift < 0) delta >>= -shift; else delta <<= shift; #ifdef __i386__ __asm__ ( "mul %5 ; " "mov %4,%%eax ; " "mov %%edx,%4 ; " "mul %5 ; " "xor %5,%5 ; " "add %4,%%eax ; " "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); #elif defined(__x86_64__) __asm__ ( "mul %%rdx ; shrd $32,%%rdx,%%rax" : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); #else #error implement me! #endif return product; } #ifdef __i386__ # define do_div(n,base) ({ \ u32 __base = (base); \ u32 __rem; \ __rem = ((u64)(n)) % __base; \ (n) = ((u64)(n)) / __base; \ __rem; \ }) #else u32 __attribute__((weak)) __div64_32(u64 *n, u32 base) { u64 rem = *n; u64 b = base; u64 res, d = 1; u32 high = rem >> 32; /* Reduce the thing a bit first */ res = 0; if (high >= base) { high /= base; res = (u64) high << 32; rem -= (u64) (high*base) << 32; } while ((s64)b > 0 && b < rem) { b = b+b; d = d+d; } do { if (rem >= b) { rem -= b; res += d; } b >>= 1; d >>= 1; } while (d); *n = res; return rem; } # define do_div(n,base) ({ \ u32 __base = (base); \ u32 __rem; \ (void)(((typeof((n)) *)0) == ((u64 *)0)); \ if (likely(((n) >> 32) == 0)) { \ __rem = (u32)(n) % __base; \ (n) = (u32)(n) / __base; \ } else \ __rem = __div64_32(&(n), __base); \ __rem; \ }) #endif /** * set_normalized_timespec - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set * @nsec: nanoseconds to set * * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec(struct timespec *ts, long sec, s64 nsec) { while (nsec >= NSEC_PER_SEC) { /* * The following asm() prevents the compiler from * optimising this loop into a modulo operation. See * also __iter_div_u64_rem() in include/linux/time.h */ asm("" : "+rm"(nsec)); nsec -= NSEC_PER_SEC; ++sec; } while (nsec < 0) { asm("" : "+rm"(nsec)); nsec += NSEC_PER_SEC; --sec; } ts->tv_sec = sec; ts->tv_nsec = nsec; } static inline unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src) { unsigned version = src->version & ~1; /* Make sure that the version is read before the data. */ smp_rmb(); return version; } static inline bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src, unsigned version) { /* Make sure that the version is re-read after the data. 
static inline u64 rdtsc_ordered(void)
{
	/*
	 * FIXME: on Intel CPUs rmb(), aka lfence, would be sufficient and
	 * brings up to a 2x speedup.
	 */
	mb();
	return rdtsc();
}

static inline
cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
{
	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
	cycle_t offset = scale_delta(delta, src->tsc_to_system_mul,
				     src->tsc_shift);
	return src->system_time + offset;
}

cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
	unsigned version;
	cycle_t ret;
	u64 last;
	u8 flags;

	do {
		version = pvclock_read_begin(src);
		ret = __pvclock_read_cycles(src);
		flags = src->flags;
	} while (pvclock_read_retry(src, version));

	if ((valid_flags & PVCLOCK_RAW_CYCLE_BIT) ||
	    ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
	     (flags & PVCLOCK_TSC_STABLE_BIT)))
		return ret;

	/*
	 * The assumption here is that last_value, a global accumulator,
	 * always goes forward.  If we are less than that, we should not be
	 * much smaller.  We assume there is an error margin we're inside,
	 * and then the correction does not sacrifice accuracy.
	 *
	 * For reads: the global value may have changed between the test and
	 * the return, but that only means someone else poked the clock at a
	 * later time.  We just need to make sure we are not seeing a
	 * backwards event.
	 *
	 * For updates: last_value = ret is not enough, since two vcpus could
	 * be updating at the same time, and one of them could be slightly
	 * behind, making the assumption that last_value always goes forward
	 * fail to hold.
	 */
	last = atomic64_read(&last_value);
	do {
		if (ret < last)
			return last;
		last = atomic64_cmpxchg(&last_value, last, ret);
	} while (unlikely(last != ret));

	return ret;
}

cycle_t kvm_clock_read(void)
{
	struct pvclock_vcpu_time_info *src;
	cycle_t ret;
	int index = smp_id();

	src = &hv_clock[index];
	ret = pvclock_clocksource_read(src);
	return ret;
}

/* Register this CPU's pvclock page with the hypervisor; bit 0 enables it. */
void kvm_clock_init(void *data)
{
	int index = smp_id();
	struct pvclock_vcpu_time_info *hvc = &hv_clock[index];

	printf("kvm-clock: cpu %d, msr %p\n", index, hvc);
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, (unsigned long)hvc | 1);
}

/* Tell the hypervisor to stop updating this CPU's pvclock page. */
void kvm_clock_clear(void *data)
{
	wrmsr(MSR_KVM_SYSTEM_TIME_NEW, 0LL);
}

void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
			    struct pvclock_vcpu_time_info *vcpu_time,
			    struct timespec *ts)
{
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = wall_clock->version;
		rmb();		/* fetch version before time */
		now.tv_sec  = wall_clock->sec;
		now.tv_nsec = wall_clock->nsec;
		rmb();		/* fetch time before checking version */
	} while ((wall_clock->version & 1) || (version != wall_clock->version));

	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}

void kvm_get_wallclock(struct timespec *ts)
{
	struct pvclock_vcpu_time_info *vcpu_time;
	int index = smp_id();

	wrmsr(MSR_KVM_WALL_CLOCK_NEW, (unsigned long)&wall_clock);

	vcpu_time = &hv_clock[index];
	pvclock_read_wallclock(&wall_clock, vcpu_time, ts);
}

void pvclock_set_flags(unsigned char flags)
{
	valid_flags = flags;
}
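
/*
 * Usage sketch (not part of the original file): enable the clock on every
 * CPU, read it once, then disable it again.  Assumes cpu_count() and
 * on_cpu() from kvm-unit-tests' smp.h; the printf formats follow libcflat.
 */
static inline void kvm_clock_example(void)
{
	struct timespec ts;
	int i, ncpus = cpu_count();

	for (i = 0; i < ncpus; i++)
		on_cpu(i, kvm_clock_init, NULL);

	printf("kvm-clock: %ld ns since boot\n", (long)kvm_clock_read());

	kvm_get_wallclock(&ts);
	printf("wallclock: %ld.%09ld\n", ts.tv_sec, ts.tv_nsec);

	for (i = 0; i < ncpus; i++)
		on_cpu(i, kvm_clock_clear, NULL);
}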