/*
gcc -O2 memspeed.c -o memspeed
*/

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Buffer size multiplier: each test buffer holds SIZEMB * 2^20 elements. */
#define SIZEMB (32)
/* Number of elements (not bytes) in each test buffer. */
#define SIZE (1024*1024*SIZEMB)
/* Number of timed measurements printed for each benchmark. */
#define ITER (20)
/* Passes over the double buffer inside one timed SSE measurement. */
#define ITER2 (10)

/*
 * Return the current wall-clock time in seconds as a double, with the
 * microsecond part of gettimeofday() folded into the fraction.
 *
 * Requires <sys/time.h> for struct timeval and gettimeofday().
 *
 * Returns 0.0 if gettimeofday() reports failure, so the caller never sees
 * a value computed from an uninitialized struct timeval.
 */
double this_gettime(void)
{
    struct timeval tv;

    /* The second (timezone) argument is unused; pass a real null pointer. */
    if (gettimeofday(&tv, NULL) != 0)
    {
        return 0.0;
    }

    return (double)tv.tv_sec + ((double)tv.tv_usec) / 1000000.0;
}


/*
 * Issue a read prefetch hint for the cache line containing x.
 *
 * x is only used as an address; nothing is read or written through it by
 * the C program.
 *
 * The function is `static inline` because a plain `inline` definition
 * provides no external definition under C99/C11 rules, so any call the
 * compiler chooses not to inline would fail to link.
 *
 * __builtin_prefetch(addr, 0, 3) asks for a read prefetch with the highest
 * temporal locality; on x86 GCC and Clang lower this to `prefetcht0`.
 */
static inline void prefetch(void *x)
{
    __builtin_prefetch(x, 0, 3);
}


/*
 * Memory read-bandwidth benchmark (x86-64 only: uses xmm0-xmm15).
 *
 * Allocates one buffer of SIZE floats and one of SIZE doubles, both rounded
 * up to a 64-byte boundary, zero-fills them, and then prints ITER timed
 * measurements for each of two read loops:
 *
 *   1. "c int"    - a C loop doing 16 volatile float reads per iteration
 *                   with a software prefetch 128 floats ahead.
 *   2. "asm xmms" - an inline-asm loop doing 32 aligned 16-byte SSE loads
 *                   (512 bytes) per iteration with prefetches 0x800 bytes
 *                   ahead, repeated ITER2 times per measurement.
 *
 * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    int iter, i, j;
    void *raw_a, *raw_da;
    float *a;
    double *da;
    volatile float sink;    /* volatile so the reads below are not optimized away */
    double time1, time2;

    (void)argc;
    (void)argv;

    /* Over-allocate by 64 bytes so the start can be rounded up to a 64-byte boundary. */
    raw_a = malloc(SIZE*sizeof(float) + 64);
    raw_da = malloc(SIZE*sizeof(double) + 64);
    if (raw_a == NULL || raw_da == NULL)
    {
        fprintf(stderr, "memspeed: out of memory\n");
        free(raw_a);
        free(raw_da);
        return EXIT_FAILURE;
    }

    /* uintptr_t is the integer type meant for pointer round-trips. */
    a = (float *) (((uintptr_t)raw_a + 63) & ~(uintptr_t)0x3f);
    da = (double *) (((uintptr_t)raw_da + 63) & ~(uintptr_t)0x3f);

    printf("readbuffer alignment: %d %d\n",
           (int)((uintptr_t)a & 0x3f), (int)((uintptr_t)da & 0x3f));

    /* Touch every element once so the pages are mapped before timing. */
    for (j = 0; j < SIZE; j++)
    {
        a[j] = 0.0;
        da[j] = 0.0;
    }

    /* Benchmark 1: plain C reads of the float buffer, 16 floats per step. */
    for (iter = 0; iter < ITER; iter++)
    {
        float *fp = a;

        time1 = this_gettime();
        for (j = 0; j < (SIZE/16); j++)
        {
            /* The hint runs ahead of the reads; near the end of the buffer
               it targets addresses past it, which is only a hint. */
            prefetch(fp+128);
            sink = fp[0];
            sink = fp[1];
            sink = fp[2];
            sink = fp[3];
            sink = fp[4];
            sink = fp[5];
            sink = fp[6];
            sink = fp[7];
            sink = fp[8];
            sink = fp[9];
            sink = fp[10];
            sink = fp[11];
            sink = fp[12];
            sink = fp[13];
            sink = fp[14];
            sink = fp[15];
            fp += 16;
        }
        time2 = this_gettime();
        /* SIZE floats * 4 bytes = SIZEMB*4 megabytes read per measurement. */
        printf("c int: %f mb/s\n", ((double)(SIZEMB*4))/(time2-time1));
    }

    /* Benchmark 2: SSE aligned loads of the double buffer, 512 bytes per step. */
    for (iter = 0; iter < ITER; iter++)
    {
        time1 = this_gettime();
        for (i = 0; i < ITER2; i++)
        {
            double *dp = da;

            for (j = 0; j < (SIZE/64); j++)
            {
                /*
                 * The clobber list tells the compiler that every xmm
                 * register is overwritten and that memory is accessed
                 * behind its back; without it the compiler may keep live
                 * values in those registers across the asm.
                 */
                __asm__ __volatile__(
                    "prefetcht0  0x800(%0)\n\t"
                    "movaps 0x00(%0), %%xmm0\n\t"
                    "movaps 0x10(%0), %%xmm1\n\t"
                    "movaps 0x20(%0), %%xmm2\n\t"
                    "movaps 0x30(%0), %%xmm3\n\t"
                    "prefetcht0  0x840(%0)\n\t"
                    "movaps 0x40(%0), %%xmm4\n\t"
                    "movaps 0x50(%0), %%xmm5\n\t"
                    "movaps 0x60(%0), %%xmm6\n\t"
                    "movaps 0x70(%0), %%xmm7\n\t"
                    "prefetcht0  0x880(%0)\n\t"
                    "movaps 0x80(%0), %%xmm8\n\t"
                    "movaps 0x90(%0), %%xmm9\n\t"
                    "movaps 0xa0(%0), %%xmm10\n\t"
                    "movaps 0xb0(%0), %%xmm11\n\t"
                    "prefetcht0  0x8c0(%0)\n\t"
                    "movaps 0xc0(%0), %%xmm12\n\t"
                    "movaps 0xd0(%0), %%xmm13\n\t"
                    "movaps 0xe0(%0), %%xmm14\n\t"
                    "movaps 0xf0(%0), %%xmm15\n\t"
                    "prefetcht0  0x900(%0)\n\t"
                    "movaps 0x100(%0), %%xmm0\n\t"
                    "movaps 0x110(%0), %%xmm1\n\t"
                    "movaps 0x120(%0), %%xmm2\n\t"
                    "movaps 0x130(%0), %%xmm3\n\t"
                    "prefetcht0  0x940(%0)\n\t"
                    "movaps 0x140(%0), %%xmm4\n\t"
                    "movaps 0x150(%0), %%xmm5\n\t"
                    "movaps 0x160(%0), %%xmm6\n\t"
                    "movaps 0x170(%0), %%xmm7\n\t"
                    "prefetcht0  0x980(%0)\n\t"
                    "movaps 0x180(%0), %%xmm8\n\t"
                    "movaps 0x190(%0), %%xmm9\n\t"
                    "movaps 0x1a0(%0), %%xmm10\n\t"
                    "movaps 0x1b0(%0), %%xmm11\n\t"
                    "prefetcht0  0x9c0(%0)\n\t"
                    "movaps 0x1c0(%0), %%xmm12\n\t"
                    "movaps 0x1d0(%0), %%xmm13\n\t"
                    "movaps 0x1e0(%0), %%xmm14\n\t"
                    "movaps 0x1f0(%0), %%xmm15\n\t"
                    : /* no outputs */
                    : "r" (dp)
                    : "xmm0", "xmm1", "xmm2", "xmm3",
                      "xmm4", "xmm5", "xmm6", "xmm7",
                      "xmm8", "xmm9", "xmm10", "xmm11",
                      "xmm12", "xmm13", "xmm14", "xmm15",
                      "memory");

                /* 0x200 bytes consumed per asm block = 64 doubles. */
                dp += 64;
            }
        }

        time2 = this_gettime();
        /* SIZE doubles * 8 bytes, ITER2 passes per measurement. */
        printf("asm xmms: %f mb/s\n", ((double)(SIZEMB*8*ITER2))/(time2-time1));
    }

    free(raw_a);
    free(raw_da);

    return 0;
}
