#include <setjmp.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/**
 * Determine whether pointer mangling (POINTER_GUARD) is needed:
 * Currently we assume this is necessary for GLIBC 2.4 and later
 */
#ifdef __GNU_LIBRARY__
#if __GLIBC_PREREQ(2, 4)
#define USE_POINTER_GUARD
#endif
#endif

/*
 * PTR_MANGLE(var) / PTR_DEMANGLE(var) are applied to values that are written
 * by hand into a jmp_buf (see utMakeContext).  Three configurations exist:
 *
 *  - pointer guard in use, _LINUX on i386: the macros XOR var in place with
 *    the word found at %gs:offsetof(tcbhead_t, pointer_guard);
 *  - pointer guard in use, any other target: compilation stops with #error;
 *  - pointer guard not in use: both macros expand to nothing.
 *
 * NOTE(review): _LINUX is presumably supplied by the build (e.g. -D_LINUX);
 * confirm against the build flags.
 * NOTE(review): the mangling here is a plain XOR; whether that matches the
 * scheme of the glibc version actually linked must be verified against the
 * glibc sources.
 */
#ifdef USE_POINTER_GUARD
#if defined(_LINUX) && defined(__i386__)
/**
 * Leading part of the thread control block layout, taken from glibc.
 * In this file only the offset of pointer_guard is used (by PTR_MANGLE).
 */
typedef struct
{
    void *tcb;
    void *dtv;
    void *self;
    int multiple_threads;
    unsigned int sysinfo;
    unsigned int stack_guard;
    unsigned int pointer_guard;
} tcbhead_t;

/* XOR var in place with the pointer_guard word addressed through %gs. */
#define PTR_MANGLE(var)	asm ("xorl %%gs:%c2, %0"		      \
			     : "=r" (var)			      \
			     : "0" (var),			      \
			       "i" (offsetof (tcbhead_t,	      \
					      pointer_guard)))
/* XOR is its own inverse, so demangling reuses the mangling macro. */
#define PTR_DEMANGLE(var)	PTR_MANGLE (var)
#else
#error POINTER_GUARD support not implemented for this architecture
#endif
#else
/* No pointer guard: values are stored into the jmp_buf unchanged. */
#define PTR_MANGLE(var)
#define PTR_DEMANGLE(var)
#endif




/**
 * Argument block handed to a user-thread body (utF1 / utF2) through the
 * opaque void * parameter.
 */
typedef struct {
    int utId;     /* numeric identifier shown in the trace output */
    char *data;   /* NUL-terminated label; the thread bodies strdup() it */
} utInputData_t;


/**
 * Per-thread bookkeeping for one cooperative user thread.
 */
typedef struct {
    jmp_buf env;           /* saved execution state used to resume the thread */
    int (*func)(void *);   /* thread body, called once by utWrapper() */
    void *arg;             /* opaque argument passed to func */
    int running;           /* non-zero while the scheduler may still resume it */
} utContext_t;


/* Capacity of the user-thread table. */
#define MAX_THREADS 100

/* Lap counts for the two demo threads; main() fills them in. */
int laps1;   /* laps performed by Thread1 (utF1) */
int laps2;   /* laps performed by Thread2 (utF2) */

/* User-thread table: utNum entries are in use, utCurrent indexes the one
 * the scheduler has selected. */
utContext_t utArray[MAX_THREADS];
int utNum = 0;
int utCurrent = 0;

/* Scheduler state saved by utSchedule(); threads longjmp() here to hand the
 * processor back. */
jmp_buf utSchedEnv;

/**
 * Give the processor back to the scheduler.
 *
 * The calling thread's state is saved in its own table entry
 * (utArray[utCurrent].env) and control jumps to utSchedEnv.  The function
 * returns to its caller only once the scheduler longjmp()s back into that
 * saved state.
 */
void utYield(void) {

    if ( setjmp(utArray[utCurrent].env) != 0 ) {
        /* Resumed by the scheduler: continue in the caller. */
        return;
    }

    longjmp(utSchedEnv, 1);
}

/**
 * Body of user thread 1: runs laps1 iterations and yields to the scheduler
 * after each one.  With _PRINT defined, every lap is traced on stdout.
 *
 * @param arg  pointer to a utInputData_t; its data string is duplicated for
 *             the duration of the call and released before returning.
 * @return 0 once all laps are done, -1 if the label could not be duplicated.
 */
int utF1(void *arg) {

    int i = 0;
    utInputData_t *u = (utInputData_t *)arg;
    int n = u->utId;
    char *c = strdup(u->data);

    /* Never hand a NULL string to printf("%s"). */
    if ( !c ) return -1;

    (void)n; /* only read when _PRINT is defined */

    while ( i++ < laps1 ) {
#ifdef _PRINT
        printf("Ut %d %s i = %d\n",n,c,i);
#endif
        utYield();
    }

    /* Release the private copy of the label made above. */
    free(c);

    return 0;

}


/**
 * Body of user thread 2: runs laps2 iterations and yields to the scheduler
 * after each one.  With _PRINT defined, every lap is traced on stdout.
 *
 * @param arg  pointer to a utInputData_t; its data string is duplicated for
 *             the duration of the call and released before returning.
 * @return 0 once all laps are done, -1 if the label could not be duplicated.
 */
int utF2(void *arg) {

    int i = 0;
    utInputData_t *u = (utInputData_t *)arg;
    int n = u->utId;
    char *c = strdup(u->data);

    /* Never hand a NULL string to printf("%s"). */
    if ( !c ) return -1;

    (void)n; /* only read when _PRINT is defined */

    while ( i++ < laps2 ) {
#ifdef _PRINT
        printf("Ut %d %s i = %d\n",n,c,i);
#endif
        utYield();
    }

    /* Release the private copy of the label made above. */
    free(c);

    return 0;

}


/**
 * Round-robin scheduler for the threads registered in utArray.
 *
 * Sweeps the table repeatedly; every entry still marked running is resumed
 * by longjmp()ing into its saved state after the scheduler's own state has
 * been stored in utSchedEnv.  Control comes back here when the thread jumps
 * to utSchedEnv.  The function returns after a complete sweep in which no
 * entry was running, or immediately when no thread is registered.
 */
void utSchedule(void) {

    int sawRunning;

    if ( utNum <= 0 ) {
        return;
    }

    do {

        sawRunning = 0;

        for ( utCurrent = 0; utCurrent < utNum; utCurrent++ ) {

            if ( !utArray[utCurrent].running ) {
                continue;
            }

            sawRunning = 1;

#ifdef _PRINT
            printf("Schedule %d\n",
                   ((utInputData_t *)(utArray[utCurrent].arg))->utId);
#endif

            /* setjmp() yields 0 when the state is stored, and non-zero
             * when the thread has jumped back to the scheduler. */
            if ( setjmp(utSchedEnv) == 0 ) {
                longjmp(utArray[utCurrent].env, 1);
            }

        }

    } while ( sawRunning );

}

/**
 * Entry point of every user thread; utMakeContext() stores its address as
 * the saved program counter of a new context.
 *
 * Calls the body of the thread selected by utCurrent, marks the context as
 * finished and hands control back to the scheduler.  Because utSchedule()
 * only resumes entries whose running flag is set, the state saved here is
 * not expected to be resumed.
 */
void utWrapper(void) {

    utContext_t *self = &utArray[utCurrent];

    /* The body's return value is not used. */
    self->func(self->arg);

    self->running = 0;

    if ( setjmp(self->env) == 0 ) {
        longjmp(utSchedEnv, 1);
    }

}

/**
 * Prepare a user-thread context so that the first longjmp() on uctx->env
 * starts executing utWrapper() on a freshly allocated, zero-filled stack.
 *
 * The saved program counter, frame pointer and stack pointer are written
 * directly into the jmp_buf; the slot numbers used below differ per
 * platform / C library and have to match that library's setjmp().
 *
 * @param uctx     context to initialise; the call is a no-op when NULL.
 * @param func     thread body later called by utWrapper(); the call is a
 *                 no-op when NULL.
 * @param sk_size  size in bytes of the stack to allocate.
 * @param arg      opaque argument stored for func.
 *
 * On success uctx->running is 1.  If sk_size is too small, the stack cannot
 * be allocated, or no jmp_buf layout is known for the build target, a
 * message is written to stderr and uctx->running is left at 0, so that
 * utSchedule() never jumps into a half-initialised jmp_buf.
 *
 * The stack is not released anywhere in this file.
 *
 * NOTE(review): JB_PC / JB_RBP / JB_RSP are not defined in this file and
 * the x86_64 branch applies no PTR_MANGLE; confirm both against the target
 * C library before relying on that branch.
 */
void utMakeContext(utContext_t *uctx, 
                   int (*func)(void *), 
                   int sk_size,
                   void *arg) {

    /* Largest initial frame any branch below carves out of the top of the
     * stack (the ppc32 one); smaller stacks would be written out of bounds. */
    const int minStack = (int)(32 * sizeof(int));
    void *stackPtr;

    if ( !uctx || !func ) return;

    uctx->func = func;
    uctx->arg = arg;
    uctx->running = 0; /* not schedulable until fully set up */

    if ( sk_size < minStack ) {
        fprintf(stderr, "utMakeContext: stack size %d is too small\n", sk_size);
        return;
    }

    stackPtr = calloc(1,sk_size);
    if ( !stackPtr ) {
        fprintf(stderr, "utMakeContext: cannot allocate %d byte stack\n", sk_size);
        return;
    }

    uctx->running = 1;

    /* Each branch points the saved stack pointer at the last word of the new
     * stack and stores a null "return address" there, so utWrapper() has no
     * caller to return to. */
#if defined(__CYGWIN__) && defined(__i386__)
    uctx->env[8] = (int)utWrapper; /* EIP */
    uctx->env[6] = 0; /* EBP */
    uctx->env[7] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[7]) = (void *)0; /* old EIP */
#elif defined(__MINGW__) && defined(__i386__)
    uctx->env[5] = (int)utWrapper; /* EIP */
    uctx->env[0] = 0; /* EBP */
    uctx->env[4] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[4]) = (void *)0; /* old EIP */
#elif defined(_LINUX) && defined(__i386__)
    uctx->env[0].__jmpbuf[5] = (int)utWrapper; /* EIP */
    PTR_MANGLE(uctx->env[0].__jmpbuf[5]);
    uctx->env[0].__jmpbuf[3] = 0; /* EBP */
    uctx->env[0].__jmpbuf[4] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    /* The slot must be dereferenced before it is mangled. */
    *((void**)uctx->env[0].__jmpbuf[4]) = (void *)0; /* old EIP */
    PTR_MANGLE(uctx->env[0].__jmpbuf[4]);
#elif defined(_LINUX) && defined(__x86_64__)
    uctx->env[0].__jmpbuf[JB_PC] = (long int)utWrapper; /* RIP */
    uctx->env[0].__jmpbuf[JB_RBP] = 0; /* RBP */
    uctx->env[0].__jmpbuf[JB_RSP] = (long int)stackPtr + sk_size - sizeof(int64_t); 
        /* RSP */
    *((void**)uctx->env[0].__jmpbuf[JB_RSP]) = (void *)0; /* old RIP */
#elif defined(__MACH__) && defined(__ppc__)   /* ppc32-osx */
    uctx->env[0] = ((int)stackPtr + sk_size - 32 * sizeof(int)); /* SP */
    *((int *)uctx->env[0]) = 0; /* old SP */
    *((int *)uctx->env[0] + sizeof(int)) = 0; /* old LR */
    uctx->env[21] = (int)utWrapper; /* LR */
#elif defined(__MACH__) && defined(__i386__)   /* i386-osx */
    uctx->env[5] = (int)utWrapper; /* EIP */
    uctx->env[2] = 0; /* EBP */
    uctx->env[1] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[1]) = (void *)0; /* old EIP */
#else
    /* No known jmp_buf layout: keep the context out of the scheduler rather
     * than let it longjmp() into an untouched buffer. */
    fprintf(stderr, "utMakeContext: unsupported platform\n");
    free(stackPtr);
    uctx->running = 0;
#endif 

}



/* Allocate and fill the argument block for one demo thread; terminates the
 * program when memory is exhausted. */
static utInputData_t *utNewInput(int id, char *data) {

    utInputData_t *u = malloc(sizeof *u);

    if ( !u ) {
        fprintf(stderr, "cannot allocate user thread input data\n");
        exit(EXIT_FAILURE);
    }

    u->utId = id;
    u->data = data;

    return u;
}

/**
 * Demo driver: registers two user threads (utF1 and utF2) and runs the
 * scheduler until both have finished.
 *
 * argv[1] gives the number of laps for Thread1 (default 10) and argv[2] the
 * number for Thread2 (default: same as Thread1).  Both are converted with
 * atoi(), so non-numeric text counts as 0 laps.
 *
 * Does not return; the process exits with status 0 once the scheduler is
 * done, or with EXIT_FAILURE if an argument block cannot be allocated.
 */
int main(int argc, char *argv[]) {

    int skSize = 4096;
    utInputData_t *u;

    // Parse input parameters: argv[1] defines
    // number of cycles to be performed by Thread1,
    // argv[2] the same for Thread2
    laps1 = ( argc < 2 ) ? 10 : atoi(argv[1]);
    laps2 = ( argc < 3 ) ? laps1 : atoi(argv[2]);

    u = utNewInput(1, "User Thread 1");
    utMakeContext(&(utArray[utNum++]),utF1,skSize,u);

    u = utNewInput(2, "User Thread 2");
    utMakeContext(&(utArray[utNum++]),utF2,skSize,u);

    utSchedule();

    exit(0);

}
