/*
 * Build-time switches consulted by main() (all three are enabled here):
 *   TILED - use the tiled PPU loop; only looked at when PPU is defined and
 *           SOLID is not.
 *   SPU   - start the spu_mm SPE program, wait for it, and print ppuC next
 *           to ppuD in the comparison listing.
 *   PPU   - run the matrix computation on the PPU into ppuD.
 */
#define TILED
#define SPU
#define PPU
//----------------------------------------------------------------------------------------------

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

#include <libspe.h>
#include <sched.h>

//----------------------------------------------------------------------------------------------

/*
 * Argument block whose address main() passes to spe_create_thread().
 * NOTE(review): the SPE program presumably reads this layout directly, so
 * field order and types should stay in sync with the SPU side -- confirm.
 */
struct args_t
{
    float *ppuA;            /* set by main() to the first input matrix (64x64 floats) */
    float *ppuB;            /* set by main() to the second input matrix (64x64 floats) */
    float *ppuC;            /* set by main() to the matrix compared against the PPU result */
    int decrementer_ticks;  /* read by main() after spe_wait() and printed as the tick
                               count; presumably filled in by the SPE program -- confirm */
};

//----------------------------------------------------------------------------------------------

/* Handle of the SPE program that main() starts with spe_create_thread().
 * Defined outside this file (presumably the embedded SPU image -- confirm in the build). */
extern spe_program_handle_t spu_mm;

/*
 * Test driver for the 64x64 single-precision update C -= A * B.
 *
 * Fills A and B with pseudo-random values in [-0.5, 0.5], fills C the same
 * way and keeps an identical copy of it in ppuD, then:
 *   - with SPU defined, starts the spu_mm SPE program with the address of the
 *     argument block and waits for it to finish (the SPE program is presumably
 *     what updates ppuC and decrementer_ticks -- it is not visible here);
 *   - with PPU defined, computes ppuD -= A * B on the PPU, using the plain
 *     triple loop when SOLID is defined and the tiled loop when only TILED is.
 * Finally prints one line per element comparing ppuC with ppuD, followed by
 * figures derived from decrementer_ticks.
 *
 * Returns 0 on success, EXIT_FAILURE if the SPE group or thread cannot be
 * created or waited for.
 */
int main(void)
{
    /* Matrix dimension and the tile shape walked by the tiled PPU loop. */
    enum {
        MAT_DIM  = 64,
        MAT_SIZE = MAT_DIM * MAT_DIM,
        TILE_M   = 4,   /* rows of A/C per tile */
        TILE_K   = 8,   /* columns of A / rows of B per tile */
        TILE_N   = 32   /* columns of B/C per tile */
    };

    /* Two results are reported as equal when they differ by less than this. */
    const double tolerance = 1.0e-5;

    int i, j;

    float ppuA[MAT_SIZE] __attribute__ ((aligned (16)));
    float ppuB[MAT_SIZE] __attribute__ ((aligned (16)));
    float ppuC[MAT_SIZE] __attribute__ ((aligned (16)));
    float ppuD[MAT_SIZE] __attribute__ ((aligned (16)));

    struct args_t ppu_args __attribute__ ((aligned (16)));

#ifdef SPU
    spe_gid_t   spu_group_id;
    speid_t     spu_id;
    /* Trailing tag on each comparison line: 'p' when the PPU computation is
     * compiled in, 'o' otherwise. */
#ifdef PPU
    const char  ref_tag = 'p';
#else
    const char  ref_tag = 'o';
#endif
#endif

    /* The fill order (all of A, then all of B, then C) fixes which rand()
     * value lands in which element, so keep the three passes separate. */
    for (i = 0; i < MAT_SIZE; i++) {
        ppuA[i] = (float)rand() / (float)RAND_MAX - 0.5;
    }

    for (i = 0; i < MAT_SIZE; i++) {
        ppuB[i] = (float)rand() / (float)RAND_MAX - 0.5;
    }

    /* ppuD starts as an exact copy of ppuC so both computations begin from
     * the same accumulator values. */
    for (i = 0; i < MAT_SIZE; i++) {
        ppuD[i] = ppuC[i] = (float)rand() / (float)RAND_MAX - 0.5;
    }

    ppu_args.ppuA = ppuA;
    ppu_args.ppuB = ppuB;
    ppu_args.ppuC = ppuC;
    /* Defined value even when no SPE program runs to fill it in. */
    ppu_args.decrementer_ticks = 0;

#ifdef SPU
    /* The libspe calls are deliberately NOT wrapped in assert(): with NDEBUG
     * the whole expression would vanish and the SPE program would never run. */
    spu_group_id = spe_create_group(SCHED_OTHER, 0, 1);
    if (!spu_group_id) {
        perror("spe_create_group");
        return EXIT_FAILURE;
    }

    spu_id = spe_create_thread(spu_group_id, &spu_mm, &ppu_args, 0, -1, 0);
    if (!spu_id) {
        perror("spe_create_thread");
        return EXIT_FAILURE;
    }

    if (spe_wait(spu_id, NULL, 0) < 0) {
        perror("spe_wait");
        return EXIT_FAILURE;
    }
#endif

#ifdef PPU
#ifdef SOLID
    {
        int k;

        for (i = 0; i < MAT_DIM; i++) {
            for (j = 0; j < MAT_DIM; j++) {
                for (k = 0; k < MAT_DIM; k++) {
                    ppuD[i * MAT_DIM + j] -= ppuA[i * MAT_DIM + k] * ppuB[k * MAT_DIM + j];
                }
            }
        }
    }
#else
#ifdef TILED
    {
        int k, m, n;        /* tile indices along K, M and N */
        int k2, m2, n2;     /* element indices inside the current tile */

        for (k = 0; k < MAT_DIM / TILE_K; k++) {
            for (m = 0; m < MAT_DIM / TILE_M; m++) {
                for (n = 0; n < MAT_DIM / TILE_N; n++) {
                    /* Top-left element of the current tile in each
                     * row-major matrix. */
                    const float *a = ppuA + (m * TILE_M * MAT_DIM + k * TILE_K);
                    const float *b = ppuB + (k * TILE_K * MAT_DIM + n * TILE_N);
                    float       *c = ppuD + (m * TILE_M * MAT_DIM + n * TILE_N);

                    for (k2 = 0; k2 < TILE_K; k2++) {
                        for (m2 = 0; m2 < TILE_M; m2++) {
                            for (n2 = 0; n2 < TILE_N; n2++) {
                                c[n2 + m2 * MAT_DIM] -= a[k2 + m2 * MAT_DIM] * b[k2 * MAT_DIM + n2];
                            }
                        }
                    }
                }
            }
        }
    }
#endif
#endif
#endif

    /* Per-element report: a '.'/'#' match marker followed by the values. */
    for (i = 0; i < MAT_DIM; i++) {
        for (j = 0; j < MAT_DIM; j++) {
            const int idx   = i * MAT_DIM + j;
            const int match = fabs(ppuC[idx] - ppuD[idx]) < tolerance;

            printf("%c", match ? '.' : '#');
#ifdef SPU
            /* NOTE(review): the 's' after the first %20.10f is printed
             * literally; kept as-is because the intended output is unclear. */
            printf("[%2d][%2d]: %20.10fs %s %20.10f%c\n", i, j, ppuC[idx],
                   match ? "==" : "!=", ppuD[idx], ref_tag);
#ifdef PRINTASSERT
            assert(match);
#endif
#else
            printf("[%2d][%2d]: %20.10f\n", i, j, ppuD[idx]);
#endif
        }
        printf("\n");
    }

    /* NOTE(review): the 1431.8 and 1431800.0 scale factors are taken over
     * unchanged; their units depend on the timebase and on what the SPE
     * program counts, neither of which is visible here. */
    if (ppu_args.decrementer_ticks != 0) {
        printf("%20.10f GFLOPS\n",
               (double)MAT_DIM * MAT_DIM * MAT_DIM * 2 / ppu_args.decrementer_ticks * 1431.8);
    } else {
        /* No tick count available: avoid dividing by zero. */
        printf("%20s GFLOPS\n", "n/a");
    }
    printf("%20.10f time\n", ppu_args.decrementer_ticks / 1431800.0);
    printf("%20d Ticks\n", ppu_args.decrementer_ticks);
    return 0;
}

//----------------------------------------------------------------------------------------------

