octopus/octopus.h
2022-01-28 16:33:17 +01:00

145 lines
3.3 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <getopt.h>
#include <string.h>
#include <x86intrin.h>
/* Only 32- and 64-bit x86 are accepted; both use a 64-byte cache line here. */
#if !(defined(__i386__) || defined(__amd64__))
#error "unsupported architecture"
#else
#define CACHELINE_SIZE 64
#endif

/* SSE present but SSE2 absent: flag the build as pre-SSE2. */
#if defined(__SSE__) && !defined(__SSE2__)
#define NOSSE2
#endif

/* A pre-SSE2 build switches off the rdtscp, mfence and clflush code paths. */
#if defined(NOSSE2)
#define NORDTSCP
#define NOMFENCE
#define NOCLFLUSH
#endif /* NOSSE2 */

/*
 * LATENCY: cycle constant consumed by timed_access() and
 * calibrate_threshold(); one value for the rdtscp path, another for the
 * mfence-fenced rdtsc path.
 *
 * NOTE(review): the expansion is deliberately left without enclosing
 * parentheses because existing users are tuned to it. As a consequence
 * `x - LATENCY` evaluates as `x - 42 + 42` (== x), not `x - 84`.
 * Parenthesising the macro would make timed_access() really subtract the
 * constant, which can wrap its unsigned result for fast accesses, so that
 * change needs an underflow guard at the use site first.
 *
 * NOTE(review): nothing defines LATENCY when both NORDTSCP and NOMFENCE are
 * set (e.g. via NOSSE2); code using it will not compile in that
 * configuration unless LATENCY is supplied from elsewhere.
 */
#if !defined(NORDTSCP)
#define LATENCY 42 + 42
#elif !defined(NOMFENCE)
#define LATENCY 18 + 18
#endif
#ifdef MASKING_MITIGATION
/* From https://github.com/torvalds/linux/blob/cb6416592bc2a8b731dabcec0d63cda270764fc6/arch/x86/include/asm/barrier.h#L27
*
* array_index_mask_nospec() - generate a mask that is ~0UL when the
* bounds check succeeds and 0 otherwise
* @index: array element index
* @size: number of elements in array
*
* The mask is produced from the CPU carry flag rather than from a
* conditional branch. How callers apply the mask is not shown in this
* part of the file (presumably it is ANDed with the index before use).
*
* Returns:
* 0 - (index < size), i.e. ~0UL when index < size and 0 otherwise
*/
static inline unsigned long
array_index_mask_nospec(unsigned long index, unsigned long size)
{
unsigned long mask;
/*
* With the compiler's default AT&T operand order, "cmp %1,%2" evaluates
* index - size and sets the carry flag on unsigned borrow, i.e. exactly
* when index < size. "sbb %0,%0" then computes mask - mask - CF, which is
* 0 when the carry is clear and all ones when it is set. The previous
* contents of mask cancel out, so it needs no initial value.
* "cc" tells the compiler that the flags are clobbered.
*/
__asm__ __volatile__ ("cmp %1,%2; sbb %0,%0;"
:"=r" (mask)
:"g"(size),"r" (index)
:"cc");
return mask;
}
#endif //MASKING_MITIGATION
#ifdef NOCLFLUSH
/*
* Dimensions of cache_flush_array, only available in NOCLFLUSH builds:
* CACHE_FLUSH_ITERATIONS slots of CACHE_FLUSH_STRIDE bytes each.
*/
#define CACHE_FLUSH_ITERATIONS 2048
#define CACHE_FLUSH_STRIDE 4096
/*
* 4096 * 2048 bytes = 8 MiB of zero-initialised static storage.
* Presumably walked in CACHE_FLUSH_STRIDE steps to push data out of the
* cache when clflush is unavailable; the consumer is not in this part of
* the file, so confirm against the caller.
*
* NOTE(review): this is a definition with external linkage inside a header;
* including the header from more than one translation unit may yield
* duplicate-definition link errors depending on the toolchain's handling
* of tentative definitions.
*/
uint8_t cache_flush_array[CACHE_FLUSH_STRIDE * CACHE_FLUSH_ITERATIONS];
/*
 * flush_memory_sse() - overwrite 256 bytes at addr with zeros using
 * non-temporal SSE stores.
 * @addr: start of the region; sixteen 16-byte stores are issued covering
 *        addr[0..255]. _mm_stream_ps requires every store address to be
 *        16-byte aligned, so addr itself must be 16-byte aligned.
 *
 * Used in place of clflush on builds where that instruction is unavailable.
 * The previous contents of the region are destroyed.
 */
void
flush_memory_sse(uint8_t * addr)
{
  float *p = (float *)addr;
  const __m128 zero = _mm_setzero_ps();
  int k, l;

  /*
   * Non-sequential memory addressing by looping through k by l: the inner
   * index advances in steps of 16 floats (64 bytes), the outer one in steps
   * of 4 floats (16 bytes), so all sixteen 16-byte slots are written once.
   */
  for (k = 0; k < 4; k++) {
    for (l = 0; l < 4; l++) {
      /* was `_mm_stderr_ps`, which is not an SSE intrinsic */
      _mm_stream_ps(&p[(l * 4 + k) * 4], zero);
    }
  }
}
#endif //NOCLFLUSH
/*
 * timed_access() - time a single read of *addr with the time-stamp counter.
 * @addr: byte to load; volatile-qualified so the load is always emitted.
 *
 * The measurement sequence is selected at build time:
 *   - default:             rdtscp / load / rdtscp
 *   - NORDTSCP:            rdtsc / load / rdtsc, each step fenced by mfence
 *   - NORDTSCP + NOMFENCE: rdtsc / load / rdtsc with no fences
 *
 * Returns: t1 - t0 - LATENCY, narrowed to unsigned.
 *
 * NOTE(review): LATENCY is defined in this header without enclosing
 * parentheses (e.g. `42 + 42`), so the return expression expands to
 * `t1 - t0 - 42 + 42`, i.e. the raw counter delta. The expression is kept
 * token-for-token because calibrate_threshold() and any absolute thresholds
 * are tuned to that result; really subtracting the constant would require
 * a guard against unsigned wrap-around for fast accesses.
 */
static inline unsigned
timed_access(volatile uint8_t *addr)
{
  uint64_t t0, t1;
  /*
   * Sink for the rdtscp aux output and the loaded byte. Its value is never
   * consumed; it is zero-initialised so no indeterminate value is read
   * (the old `junk = junk` self-initialisation needed a file-wide
   * -Wuninitialized suppression pragma, which is dropped here).
   */
  unsigned int junk = 0;

#if !defined(NORDTSCP)
  t0 = __rdtscp(&junk);
  junk |= *addr;
  t1 = __rdtscp(&junk);
#elif !defined(NOMFENCE)
  /*
   * Since the rdtsc instruction isn't serialized, newer processors will try
   * to reorder it, ruining its value as a timing mechanism.
   * To get around this, we use the mfence instruction to introduce a memory
   * barrier and force serialization. mfence is used because it is portable
   * across Intel and AMD.
   */
  _mm_mfence();
  t0 = __rdtsc();
  _mm_mfence();
  junk = *addr;
  _mm_mfence();
  t1 = __rdtsc();
  _mm_mfence();
#else
  /*
   * The mfence instruction was introduced with the SSE2 instruction set, so
   * we have to ifdef it out on pre-SSE2 processors.
   * Luckily, these older processors don't seem to reorder the rdtsc
   * instruction, so not having mfence on older processors is less of an
   * issue.
   */
  t0 = __rdtsc();
  junk |= *addr;
  t1 = __rdtsc();
#endif

  (void)junk; /* written only; the volatile load above is what matters */
  return (unsigned)(t1 - t0 - LATENCY);
}
/*
 * calibrate_threshold() - estimate a cache-hit timing threshold.
 * @threshold: out-parameter receiving the result; may be NULL, in which case
 *             the measurement is still performed but the result is dropped.
 *
 * Reads one cache-line-aligned byte of a local buffer once up front, then
 * averages 10000 timed_access() measurements of that same byte. The value
 * stored is that average plus LATENCY, narrowed to unsigned int.
 */
static void
calibrate_threshold(unsigned int *threshold)
{
  const int samples = 10000;
  volatile char buf[2 * CACHELINE_SIZE];
  volatile uint8_t *probe;
  uint64_t total = 0;
  __attribute__((unused))
  volatile int junk = 0;
  int i;

  /*
   * Round the buffer address up to the next cache-line boundary. buf spans
   * two lines, so the aligned byte always lies inside it. uintptr_t is used
   * for the pointer arithmetic because unsigned long is narrower than a
   * pointer on LLP64 targets and would truncate the address there.
   */
  probe = (volatile uint8_t *)(((uintptr_t)buf + CACHELINE_SIZE)
                               & ~(uintptr_t)(CACHELINE_SIZE - 1));

  /* Touch the line once before measuring so the timed reads hit in cache. */
  junk |= *probe;

  for (i = 0; i < samples; i++) {
    total += timed_access(probe);
  }

  if (threshold != NULL) {
    *threshold = (unsigned int)(total / samples + LATENCY);
  }
}