#include <errno.h>
#include <setjmp.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pthread.h>
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>

/**
 * Determine whether pointer mangling (POINTER_GUARD) is needed:
 * Currently we assume this is necessary for GLIBC 2.4 and later.
 *
 * USE_POINTER_GUARD ends up defined exactly when __GNU_LIBRARY__ is
 * defined and __GLIBC_PREREQ(2, 4) holds.
 */
#ifdef __GNU_LIBRARY__
#if __GLIBC_PREREQ(2, 4)
#define USE_POINTER_GUARD
#endif
#endif

/*
 * PTR_MANGLE(var) / PTR_DEMANGLE(var)
 *
 * Used by utMakeContext() on the hand-written jmp_buf entries.
 *
 *  - With USE_POINTER_GUARD on _LINUX + __i386__: XOR var, in place, with
 *    the 32-bit value found at offset offsetof(tcbhead_t, pointer_guard)
 *    from the %gs segment base.  XOR is its own inverse, so PTR_DEMANGLE
 *    is simply PTR_MANGLE again.
 *  - With USE_POINTER_GUARD on any other target: compilation stops at the
 *    #error below.
 *  - Without USE_POINTER_GUARD: both macros expand to nothing.
 *
 * offsetof() is provided by <stddef.h>.
 * NOTE(review): _LINUX is not defined anywhere in this file; presumably
 * it is supplied by the build (e.g. -D_LINUX) -- confirm.
 * NOTE(review): a plain XOR may not match what every glibc release does
 * when mangling jmp_buf entries -- verify against the target glibc.
 */
#ifdef USE_POINTER_GUARD
#if defined(_LINUX) && defined(__i386__)
/**
 * Layout of the thread control block header, taken from glibc.
 * Only the offset of pointer_guard is used here.
 */
typedef struct
{
    void *tcb;
    void *dtv;
    void *self;
    int multiple_threads;
    unsigned int sysinfo;
    unsigned int stack_guard;
    unsigned int pointer_guard;
} tcbhead_t;

#define PTR_MANGLE(var)	asm ("xorl %%gs:%c2, %0"		      \
			     : "=r" (var)			      \
			     : "0" (var),			      \
			       "i" (offsetof (tcbhead_t,	      \
					      pointer_guard)))
#define PTR_DEMANGLE(var)	PTR_MANGLE (var)
#else
#error POINTER_GUARD support not implemented for this architecture
#endif
#else
/* No pointer guard: mangling is a no-op. */
#define PTR_MANGLE(var)
#define PTR_DEMANGLE(var)
#endif

/**
 * Argument record for a user thread.
 *
 * The scheduler's _PRINT trace casts a context's arg pointer to this type
 * and prints utId.
 */
typedef struct {
    int utId;    /* identifier shown in the scheduler's _PRINT trace */
    char *data;  /* payload pointer; not referenced by the code in this file */
} utInputData_t;


/**
 * Bookkeeping for one user-level thread.
 */
typedef struct {
    jmp_buf env;          /* saved context; target of setjmp()/longjmp() */
    int (*func)(void *);  /* entry function, invoked by utWrapper() */
    void *arg;            /* argument handed to func */
    int running;          /* user-thread state: set to 1 by utMakeContext(),
                             cleared by utWrapper() once func has returned;
                             utSchedule() only resumes non-zero entries */
} utContext_t;



/* Dimensions of the static scheduler tables. */
#define MAX_LWPS 2
#define MAX_THREADS 100

/* User-thread contexts, indexed as utArray[lwpIdx][threadIdx]. */
utContext_t utArray[MAX_LWPS][MAX_THREADS];

/* Per LWP: upper bound of the slots utSchedule() iterates over. */
int utNum[MAX_LWPS] = { 0 };

/* Per LWP: slot index of the user thread currently selected. */
int utCurrent[MAX_LWPS] = { 0 };

/* Per LWP: pthread id registered by utSchedule(), looked up by lwpId2Idx(). */
pthread_t lwpIdx2Id[MAX_LWPS];

/* Per LWP: scheduler context that user threads longjmp() back to. */
jmp_buf utSchedEnv[MAX_LWPS];

/* Loop counts for utF1() and utF2(); main() sets them from the command line. */
int laps1;
int laps2;


/**
 * Map the calling pthread to its LWP index.
 *
 * Scans lwpIdx2Id[] for an entry equal to pthread_self().  Entries are
 * registered by utSchedule().
 *
 * @return The index i in [0, MAX_LWPS) whose lwpIdx2Id[i] matches the
 *         caller.  If no entry matches, a message is printed and the
 *         process exits with status 1, so the function does not return
 *         in that case.
 */
int lwpId2Idx(void) {

    pthread_t self = pthread_self();
    int i;

    // Find LWP index from pthread identification
    for (i = 0; i < MAX_LWPS; i++) {
        if ( pthread_equal(lwpIdx2Id[i], self) ) {
            // The original fell off the end of the function here without
            // returning the index it had just found.
            return i;
        }
    }

    printf("Problem with finding LWP index\n");
    exit(1);

}


void utYield(int lwpIdx) {

    if ( !setjmp(utArray[lwpIdx][utCurrent[lwpIdx]].env) ) 
            longjmp(utSchedEnv[lwpIdx],1);

}

/**
 * Thread body: pin the calling thread to one CPU, then call sched_yield()
 * laps1 times.
 *
 * @param arg  Pointer to an int holding the CPU index to pin to; the same
 *             value labels the _PRINT trace output.
 * @return     Always 0.
 */
int utF1(void *arg) {

    int i = 0;
    int u = *(int *)arg;
    // Affinity bitmask, one bit per CPU.  The object must really be as
    // large as the size passed to sched_setaffinity(); the original passed
    // sizeof(unsigned long) together with the address of an int.
    unsigned long myCpu = 0;

    // Shifting by a negative count or by >= the width of the type is
    // undefined, so reject such indices (8 = bits per byte).
    if ( u < 0 || (size_t)u >= 8 * sizeof(myCpu) ) {
        fprintf(stderr, "CPU index %d out of range\n", u);
    } else {
        myCpu = 1UL << u;
        // The mask is passed as raw bytes of the stated length; going
        // through void * avoids a pointer-type mismatch with cpu_set_t *.
        if ( sched_setaffinity(0, sizeof(myCpu), (void *)&myCpu) ) {
            perror("sched_setaffinity");
        }
    }

    while ( i++ < laps1 ) {
#ifdef _PRINT
        printf("Ut %d i = %d\n",u,i);
#endif
        sched_yield();
    }

    return 0;

}


/**
 * Thread body: pin the calling thread to one CPU, then call sched_yield()
 * laps2 times.
 *
 * @param arg  Pointer to an int holding the CPU index to pin to; the same
 *             value labels the _PRINT trace output.
 * @return     Always 0.
 */
int utF2(void *arg) {

    int i = 0;
    int u = *(int *)arg;
    // Affinity bitmask, one bit per CPU.  The object must really be as
    // large as the size passed to sched_setaffinity(); the original passed
    // sizeof(unsigned long) together with the address of an int.
    unsigned long myCpu = 0;

    // Shifting by a negative count or by >= the width of the type is
    // undefined, so reject such indices (8 = bits per byte).
    if ( u < 0 || (size_t)u >= 8 * sizeof(myCpu) ) {
        fprintf(stderr, "CPU index %d out of range\n", u);
    } else {
        myCpu = 1UL << u;
        // The mask is passed as raw bytes of the stated length; going
        // through void * avoids a pointer-type mismatch with cpu_set_t *.
        if ( sched_setaffinity(0, sizeof(myCpu), (void *)&myCpu) ) {
            perror("sched_setaffinity");
        }
    }

    while ( i++ < laps2 ) {
#ifdef _PRINT
        printf("Ut %d i = %d\n",u,i);
#endif
        sched_yield();
    }

    return 0;

}




void utSchedule(void *pData) {

    int i=0;
    utContext_t *c;
    int haveUt = 1;
    int lwpIdx = *(int *)pData;
    unsigned long myCpu;
    pid_t pid;
    pthread_t pthreadId = pthread_self();

    // Register pthread LWP identification unde 
    // this LWP's index:
    lwpIdx2Id[lwpIdx] = pthreadId;
    pid = getpid();

    printf("LWP %d starts with pthread-id 0x%08X, PID 0x%08X\n",
           lwpIdx,pthreadId,pid);

    myCpu = 1 << lwpIdx;
    if ( sched_setaffinity(0,sizeof(unsigned long),&myCpu) ) {
	perror("sched_setaffinity");
    }

    if ( utNum <= 0 ) return;

    while ( haveUt ) {

	haveUt = 0;

	for ( utCurrent[lwpIdx]=0; 
              utCurrent[lwpIdx] < utNum[lwpIdx]; 
              utCurrent[lwpIdx]++ ) {

	    c = &(utArray[lwpIdx][utCurrent[lwpIdx]]);

	    if ( c->running ) {

		haveUt = 1;

#ifdef _PRINT
		printf("Schedule %d\n",
                ((utInputData_t *)
                 (utArray[lwpIdx][utCurrent[lwpIdx]].arg))->utId);
#endif

		if ( !setjmp(utSchedEnv[lwpIdx]) ) 
                    longjmp(utArray[lwpIdx][utCurrent[lwpIdx]].env,1);

	    }
	}

    }

}


/**
 * Entry trampoline of every user thread.
 *
 * utMakeContext() stores this function's address as the program counter of
 * a new context.  It looks up the current slot of the calling LWP, runs the
 * slot's function with its argument (the return value is discarded), clears
 * the `running` flag and then jumps back to the LWP's scheduler context.
 */
void utWrapper(void) {

    const int lwp = lwpId2Idx();
    utContext_t *const ctx = &utArray[lwp][utCurrent[lwp]];

    (void)ctx->func(ctx->arg);

    /* utSchedule() only resumes slots whose running flag is non-zero. */
    ctx->running = 0;

    if ( setjmp(ctx->env) == 0 ) {
        longjmp(utSchedEnv[lwp], 1);
    }

}

/**
 * Prepare a user-thread context so that the first longjmp() to uctx->env
 * starts executing utWrapper() on a freshly allocated stack.
 *
 * The jmp_buf is not filled in by setjmp(); instead the slots for program
 * counter, frame pointer and stack pointer are written directly, using a
 * layout that is specific to each supported platform.  On a platform that
 * matches none of the branches below, env is left untouched.
 *
 * The context is marked runnable (running = 1) only after its stack has
 * been allocated.  If sk_size is too small or the allocation fails, a
 * message is written to stderr and the context is left with running = 0.
 * The stack is not freed by any code in this file.
 *
 * @param uctx     Context to initialise; nothing is done if NULL.
 * @param func     Function the user thread runs; nothing is done if NULL.
 * @param sk_size  Stack size in bytes; must be at least 32 * sizeof(int).
 * @param arg      Argument later passed to func.
 */
void utMakeContext(utContext_t *uctx, 
                   int (*func)(void *), 
                   int sk_size,
                   void *arg) {

    void *stackPtr;

    if ( !uctx || !func ) return;

    uctx->func = func;
    uctx->arg = arg;
    uctx->running = 0;

    /* The branches below place the initial stack pointer up to
     * 32 * sizeof(int) bytes below the top of the allocation and write
     * through it, so anything smaller would be written out of bounds. */
    if ( sk_size < (int)(32 * sizeof(int)) ) {
        fprintf(stderr, "utMakeContext: stack size %d too small\n", sk_size);
        return;
    }

    stackPtr = calloc(1,sk_size);
    if ( !stackPtr ) {
        perror("calloc");
        return;
    }

    uctx->running = 1;

#if defined(__CYGWIN__) && defined(__i386__)
    uctx->env[8] = (int)utWrapper; /* EIP */
    uctx->env[6] = 0; /* EBP */
    uctx->env[7] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[7]) = (void *)0; /* old EIP */
#elif defined(__MINGW__) && defined(__i386__)
    uctx->env[5] = (int)utWrapper; /* EIP */
    uctx->env[0] = 0; /* EBP */
    uctx->env[4] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[4]) = (void *)0; /* old EIP */
#elif defined(_LINUX) && defined(__i386__)
    uctx->env[0].__jmpbuf[5] = (int)utWrapper; /* EIP */
    PTR_MANGLE(uctx->env[0].__jmpbuf[5]);
    uctx->env[0].__jmpbuf[3] = 0; /* EBP */
    uctx->env[0].__jmpbuf[4] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    /* The return slot must be written before ESP is mangled. */
    *((void**)uctx->env[0].__jmpbuf[4]) = (void *)0; /* old EIP */
    PTR_MANGLE(uctx->env[0].__jmpbuf[4]);
#elif defined(_LINUX) && defined(__x86_64__)
    /* NOTE(review): JB_PC, JB_RBP and JB_RSP are not defined in this file;
     * presumably they have to come from the build or another header.
     * This branch also applies no PTR_MANGLE -- confirm both. */
    uctx->env[0].__jmpbuf[JB_PC] = (long int)utWrapper; /* RIP */
    uctx->env[0].__jmpbuf[JB_RBP] = 0; /* RBP */
    uctx->env[0].__jmpbuf[JB_RSP] = (long int)stackPtr + sk_size - sizeof(int64_t); 
        /* RSP */
    *((void**)uctx->env[0].__jmpbuf[JB_RSP]) = (void *)0; /* old RIP */
#elif defined(__MACH__) && defined(__ppc__)   /* ppc32-osx */
    uctx->env[0] = ((int)stackPtr + sk_size - 32 * sizeof(int)); /* SP */
    *((int *)uctx->env[0]) = 0; /* old SP */
    /* NOTE(review): this is int-pointer arithmetic, so the store lands
     * sizeof(int) ints (not bytes) above SP -- confirm that is intended. */
    *((int *)uctx->env[0] + sizeof(int)) = 0; /* old LR */
    uctx->env[21] = (int)utWrapper; /* LR */
#elif defined(__MACH__) && defined(__i386__)   /* i386-osx */
    uctx->env[5] = (int)utWrapper; /* EIP */
    uctx->env[2] = 0; /* EBP */
    uctx->env[1] = (int)stackPtr + sk_size - sizeof(int32_t); /* ESP */
    *((void**)uctx->env[1]) = (void *)0; /* old EIP */
#endif 

}



/**
 * pthread start routine that runs utF1().
 *
 * pthread_create() requires a `void *(*)(void *)` start routine, whereas
 * utF1() has type `int (*)(void *)`; this adapter bridges the two and
 * discards the int result.
 */
static void *utF1Thread(void *arg) {

    utF1(arg);
    return NULL;

}

/**
 * pthread start routine that runs utF2(); see utF1Thread().
 */
static void *utF2Thread(void *arg) {

    utF2(arg);
    return NULL;

}

/**
 * Program entry point.
 *
 * Usage: prog [laps1 [laps2]]
 *   laps1  number of cycles performed by the first thread (default 10)
 *   laps2  number of cycles performed by the second thread
 *          (default: same as laps1)
 *
 * Starts two pthreads running utF1() and utF2() with CPU indices 0 and 1
 * and waits for both to finish.
 *
 * @return Exits with status 0 on success, or 1 if a thread cannot be
 *         created.
 */
int main(int argc, char *argv[]) {

    // CPU indices handed to the threads.  They live in main's frame, which
    // outlives both threads because main joins them before exiting.
    int cpu0 = 0;
    int cpu1 = 1;
    pthread_t thread0;
    pthread_t thread1;
    int rc;

    // Parse input parameters: argv[1] defines
    // number of cycles to be performed by Thread1,
    // argv[2] the same for Thread2
    if ( argc < 2 )
        laps1 = 10;
    else
        laps1 = atoi(argv[1]);

    if ( argc < 3 )
        laps2 = laps1;
    else
        laps2 = atoi(argv[2]);

    // Activate 1st thread.  pthread_create() returns an error number
    // rather than setting errno.
    rc = pthread_create(&thread0, NULL, utF1Thread, &cpu0);
    if ( rc != 0 ) {
        fprintf(stderr, "pthread_create: %s\n", strerror(rc));
        exit(1);
    }

    // Activate 2nd thread
    rc = pthread_create(&thread1, NULL, utF2Thread, &cpu1);
    if ( rc != 0 ) {
        fprintf(stderr, "pthread_create: %s\n", strerror(rc));
        exit(1);
    }

    // Wait for all LWPs to finish
    pthread_join(thread0, NULL);
    pthread_join(thread1, NULL);

    exit(0);

}
