/* * Copyright 2008 Veselin Georgiev, * anrieffNOSPAM @ mgail_DOT.com (convert to gmail) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "libcpuid.h" #include "libcpuid_util.h" #include "asm-bits.h" #include "rdtsc.h" #ifdef _WIN32 #include void sys_precise_clock(uint64_t* result) { double c, f; LARGE_INTEGER freq, counter; QueryPerformanceCounter(&counter); QueryPerformanceFrequency(&freq); c = (double) counter.QuadPart; f = (double) freq.QuadPart; *result = (uint64_t)(c * 1000000.0 / f); } #else /* assuming Linux, Mac OS or other POSIX */ #include void sys_precise_clock(uint64_t* result) { struct timeval tv; gettimeofday(&tv, NULL); *result = (uint64_t) tv.tv_sec * (uint64_t) 1000000 + (uint64_t) tv.tv_usec; } #endif /* _WIN32 */ /* out = a - b */ static void mark_t_subtract(struct cpu_mark_t* a, struct cpu_mark_t* b, struct cpu_mark_t* out) { out->tsc = a->tsc - b->tsc; out->sys_clock = a->sys_clock - b->sys_clock; } void cpu_tsc_mark(struct cpu_mark_t* mark) { cpu_rdtsc(&mark->tsc); sys_precise_clock(&mark->sys_clock); } void cpu_tsc_unmark(struct cpu_mark_t* mark) { struct cpu_mark_t temp; cpu_tsc_mark(&temp); mark_t_subtract(&temp, mark, mark); } int cpu_clock_by_mark(struct cpu_mark_t* mark) { uint64_t result; /* Check if some subtraction resulted in a negative number: */ if((mark->tsc >> 63) != 0 || (mark->sys_clock >> 63) != 0) { return -1; } /* Divide-by-zero check: */ if(mark->sys_clock == 0) { return -1; } /* Check if the result fits in 32bits */ result = mark->tsc / mark->sys_clock; if(result > (uint64_t) 0x7fffffff) { return -1; } return (int) result; } #ifdef _WIN32 int cpu_clock_by_os(void) { HKEY key; DWORD result; DWORD size = 4; if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), 0, KEY_READ, &key) != ERROR_SUCCESS) { return -1; } if(RegQueryValueEx(key, TEXT("~MHz"), NULL, NULL, (LPBYTE) &result, (LPDWORD) &size) != ERROR_SUCCESS) { RegCloseKey(key); return -1; } RegCloseKey(key); return (int)result; } #else #ifdef __APPLE__ #include #include /* Assuming Mac OS X with hw.cpufrequency sysctl */ int cpu_clock_by_os(void) { long long result = -1; size_t size = sizeof(result); if(sysctlbyname("hw.cpufrequency", &result, &size, NULL, 0)) { return -1; } return (int)(result / (long long) 1000000); } #else /* Assuming Linux with /proc/cpuinfo */ int cpu_clock_by_os(void) { FILE* f; char line[1024], *s; int result; f = fopen("/proc/cpuinfo", "rt"); if(!f) { return -1; } while(fgets(line, sizeof(line), f)) { if(!strncmp(line, "cpu MHz", 7)) { s = strchr(line, ':'); if(s && 1 == sscanf(s, ":%d.", &result)) { fclose(f); return result; } } } fclose(f); return -1; } #endif /* __APPLE__ */ #endif /* _WIN32 */ /* Emulate doing useful CPU intensive work */ static int busy_loop(int amount) { int i, j, k, s = 0; static volatile int data[42] = {32, 12, -1, 5, 23, 0 }; for(i = 0; i < amount; i++) for(j = 0; j < 65536; j++) for(k = 0; k < 42; k++) { s += data[k]; } return s; } int busy_loop_delay(int milliseconds) { int cycles = 0, r = 0, first = 1; uint64_t a, b, c; sys_precise_clock(&a); while(1) { sys_precise_clock(&c); if((c - a) / 1000 > milliseconds) { return r; } r += busy_loop(cycles); if(first) { first = 0; } else { if(c - b < 1000) { cycles *= 2; } if(c - b > 10000) { cycles /= 2; } } b = c; } } int cpu_clock_measure(int millis, int quad_check) { struct cpu_mark_t begin[4], end[4], temp, temp2; int results[4], cycles, n, k, i, j, bi, bj, mdiff, diff, _zero = 0; uint64_t tl; if(millis < 1) { return -1; } tl = millis * (uint64_t) 1000; if(quad_check) { tl /= 4; } n = quad_check ? 4 : 1; cycles = 1; for(k = 0; k < n; k++) { cpu_tsc_mark(&begin[k]); end[k] = begin[k]; do { /* Run busy loop, and fool the compiler that we USE the garbishy value it calculates */ _zero |= (1 & busy_loop(cycles)); cpu_tsc_mark(&temp); mark_t_subtract(&temp, &end[k], &temp2); /* If busy loop is too short, increase it */ if(temp2.sys_clock < tl / 8) { cycles *= 2; } end[k] = temp; } while(end[k].sys_clock - begin[k].sys_clock < tl); mark_t_subtract(&end[k], &begin[k], &temp); results[k] = cpu_clock_by_mark(&temp); } if(n == 1) { return results[0]; } mdiff = 0x7fffffff; bi = bj = -1; for(i = 0; i < 4; i++) { for(j = i + 1; j < 4; j++) { diff = results[i] - results[j]; if(diff < 0) { diff = -diff; } if(diff < mdiff) { mdiff = diff; bi = i; bj = j; } } } if(results[bi] == -1) { return -1; } return (results[bi] + results[bj] + _zero) / 2; } static void adjust_march_ic_multiplier(const struct cpu_id_t* id, int* numerator, int* denom) { /* * for cpu_clock_by_ic: we need to know how many clocks does a typical ADDPS instruction * take, when issued in rapid succesion without dependencies. The whole idea of * cpu_clock_by_ic was that this is easy to determine, at least it was back in 2010. Now * it's getting progressively more hairy, but here are the current measurements: * * 1. For CPUs with 64-bit SSE units, ADDPS issue rate is 0.5 IPC (one insn in 2 clocks) * 2. For CPUs with 128-bit SSE units, issue rate is exactly 1.0 IPC * 3. For Bulldozer and later, it is 1.4 IPC (we multiply by 5/7) * 4. For Skylake and later, it is 1.6 IPC (we multiply by 5/8) */ // if(id->sse_size < 128) { debugf(1, "SSE execution path is 64-bit\n"); // on a CPU with half SSE unit length, SSE instructions execute at 0.5 IPC; // the resulting value must be multiplied by 2: *numerator = 2; } else { debugf(1, "SSE execution path is 128-bit\n"); } // // Bulldozer or later: assume 1.4 IPC if(id->vendor == VENDOR_AMD && id->ext_family >= 21) { debugf(1, "cpu_clock_by_ic: Bulldozer (or later) detected, dividing result by 1.4\n"); *numerator = 5; *denom = 7; // multiply by 5/7, to divide by 1.4 } // // Skylake or later: assume 1.6 IPC if(id->vendor == VENDOR_INTEL && id->ext_model >= 94) { debugf(1, "cpu_clock_by_ic: Skylake (or later) detected, dividing result by 1.6\n"); *numerator = 5; *denom = 8; // to divide by 1.6, multiply by 5/8 } } int cpu_clock_by_ic(int millis, int runs) { int max_value = 0, cur_value, i, ri, cycles_inner, cycles_outer, c; struct cpu_id_t* id; uint64_t t0, t1, tl, hz; int multiplier_numerator = 1, multiplier_denom = 1; if(millis <= 0 || runs <= 0) { return -2; } id = get_cached_cpuid(); // if there aren't SSE instructions - we can't run the test at all if(!id || !id->flags[CPU_FEATURE_SSE]) { return -1; } // adjust_march_ic_multiplier(id, &multiplier_numerator, &multiplier_denom); // tl = millis * 125; // (*1000 / 8) cycles_inner = 128; cycles_outer = 1; do { if(cycles_inner < 1000000000) { cycles_inner *= 2; } else { cycles_outer *= 2; } sys_precise_clock(&t0); for(i = 0; i < cycles_outer; i++) { busy_sse_loop(cycles_inner); } sys_precise_clock(&t1); } while(t1 - t0 < tl); debugf(2, "inner: %d, outer: %d\n", cycles_inner, cycles_outer); for(ri = 0; ri < runs; ri++) { sys_precise_clock(&t0); c = 0; do { c++; for(i = 0; i < cycles_outer; i++) { busy_sse_loop(cycles_inner); } sys_precise_clock(&t1); } while(t1 - t0 < tl * (uint64_t) 8); // cpu_Hz = cycles_inner * cycles_outer * 256 / (t1 - t0) * 1000000 debugf(2, "c = %d, td = %d\n", c, (int)(t1 - t0)); hz = ((uint64_t) cycles_inner * (uint64_t) 256 + 12) * (uint64_t) cycles_outer * (uint64_t) multiplier_numerator * (uint64_t) c * (uint64_t) 1000000 / ((t1 - t0) * (uint64_t) multiplier_denom); cur_value = (int)(hz / 1000000); if(cur_value > max_value) { max_value = cur_value; } } return max_value; } int cpu_clock(void) { int result; result = cpu_clock_by_os(); if(result <= 0) { result = cpu_clock_measure(200, 1); } return result; }