// MATMUL 8x8
// Optimized C: 773 cycles
// Naive code: 2852 cycles

/*
   e-gcc -Wall -O3 -std=c99 \
         -mlong-calls \
         -mfp-mode=round-nearest \
         -ffp-contract=fast \
         -ffast-math \
         -funroll-loops \
         -T ${EPIPHANY_HOME}/bsps/emek3/fast.ldf \
         -o ./matmul.elf \
         ./matmul.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <e_ctimers.h>
#include <e_regs.h>
#include <e_common.h>

/* Local prototype for e_ctimer_stop(), which both kernels call after timing.
   NOTE(review): presumably the e-lib headers included above do not declare it
   in the targeted SDK release -- confirm, and drop this line if redundant. */
e_ctimer_config_t e_ctimer_stop(e_ctimer_id_t timer);

/* N is the dimension of every (square) matrix in this file.
   K is the reduction (inner) dimension used by matmul(); it equals N. */
#define N (8)
#define K (N)

/* Operands and results, N x N floats each.
   a, b : inputs, filled by main(); placed in section ".data_bank2".
   c_o  : output of matmul()       (optimized); placed in ".data_bank3".
   c_n  : output of matmul_naive() (reference); placed in ".data_bank3".
   NOTE(review): ALIGN() and SECTION() are not defined in this file; presumably
   they come from e_common.h -- confirm. */
float a[N][N]   ALIGN(8) SECTION(".data_bank2");
float b[N][N]   ALIGN(8) SECTION(".data_bank2");
float c_o[N][N] ALIGN(8) SECTION(".data_bank3");
float c_n[N][N] ALIGN(8) SECTION(".data_bank3");

/*
 * Hand-structured matrix multiply: cc = aa * bb.
 *
 * All three pointers are indexed as flat row-major matrices: aa with row
 * stride K, bb and cc with row stride N. The restrict qualifiers promise
 * the three buffers do not overlap.
 *
 * The multiply is bracketed by reads of E_CTIMER_0; the return value is
 * (reading before) - (reading after).
 * NOTE(review): that sign presumes the timer counts down from
 * E_CTIMER_MAX -- confirm against the e-lib documentation.
 *
 * The column loops are hard-coded to a width of 8, so N must be a multiple
 * of 8 or rows of bb/cc are indexed past their end.
 */
unsigned matmul(float * restrict aa, float * restrict bb, float * restrict cc)
{
    int i = 0;

	unsigned time, time_s, time_e;

	/* Load the timer with E_CTIMER_MAX, start it, and take the first reading. */
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	/* One iteration per output row. */
	for (i=0; i<N; i++)
    {
        int j0, j1;

        /* Output columns are handled in strips of 8; with N == 8 this loop
           body runs exactly once per row. */
        for (j0=0; j0<N; j0+=8)
        {
            int x;
            /* Accumulators for the 8 output elements of this strip. */
            float tot[8];

            for (j1=0; j1<8; j1++)
            {
                tot[j1] = 0.0;
            }

            /* Walk the reduction dimension. */
            for (x=0; x<K; x++)
            {
                /* aa[i][x] is read once and reused for all 8 products. */
                float tmp = *(aa + (i * K) + x);

                /* Two groups of 4: each group first reads four elements of
                   row x of bb into temporaries, then does the four
                   multiply-accumulates.
                   NOTE(review): the loads are presumably grouped ahead of the
                   arithmetic to help instruction scheduling -- confirm. */
                for (j1=0; j1<8; j1+=4)
                {
                	float tmp2[4];

                    tmp2[0] = *(bb + (x * N) + j0 + j1 + 0);
                    tmp2[1] = *(bb + (x * N) + j0 + j1 + 1);
                    tmp2[2] = *(bb + (x * N) + j0 + j1 + 2);
                    tmp2[3] = *(bb + (x * N) + j0 + j1 + 3);
                    tot[j1 + 0] += tmp * tmp2[0];
                    tot[j1 + 1] += tmp * tmp2[1];
                    tot[j1 + 2] += tmp * tmp2[2];
                    tot[j1 + 3] += tmp * tmp2[3];
                }
            }

            /* Write the finished strip to row i of cc. */
            for (j1=0; j1<8; j1+=1)
            {
                *(cc + (i * N) + j0+j1) = tot[j1];
            }
        }
    }

	/* Second reading, then stop the timer. */
	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	time = time_s - time_e;

	return time;
}


/*
 * Reference matrix multiply: c = a * b, written as the textbook triple loop.
 *
 * a, b and c are flat row-major N x N float matrices that must not overlap
 * (restrict). Each output element is zeroed and then accumulated in place
 * in c.
 *
 * Returns (E_CTIMER_0 reading before the loops) - (reading after them).
 */
unsigned matmul_naive(float * restrict a, float * restrict b, float * restrict c)
{
	unsigned ticks_begin, ticks_end;

	/* Load the timer with E_CTIMER_MAX, start it, and sample it. */
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	ticks_begin = e_ctimer_get(E_CTIMER_0);

	for (int row = 0; row < N; row++)
	{
		for (int col = 0; col < N; col++)
		{
			/* Accumulate directly in the destination element. */
			float *cell = &c[row*N + col];

			*cell = 0;
			for (int inner = 0; inner < N; inner++)
			{
				*cell += a[row*N + inner] * b[inner*N + col];
			}
		}
	}

	/* Sample again, then stop the timer. */
	ticks_end = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	return ticks_begin - ticks_end;
}


/* Forward declaration; matprt() is defined after main(). */
void matprt(float *a, int NN);

/*
 * Benchmark driver.
 *
 * Fills the input matrices a and b with a deterministic pattern, runs the
 * optimized kernel into c_o and the naive kernel into c_n, and prints the
 * tick count each one returns.
 *
 * Returns EXIT_SUCCESS.
 */
int main(void)
{
	unsigned cycles;

	/* a[i][j] = i * (j - 1),  b[i][j] = (i + 1) * j */
	for (int i = 0; i < N; i++)
	{
		for (int j = 0; j < N; j++)
		{
			a[i][j] = (float) i * (float) (j-1);
			b[i][j] = (float) (i+1) * (float) j;
		}
	}

	printf("\n");

	/* The kernels return unsigned, so the conversion must be %u, not %d. */
	cycles = matmul((float *) a, (float *) b, (float *) c_o);
	printf("Optimized MATMUL time = %u cycles\n", cycles);

	cycles = matmul_naive((float *) a, (float *) b, (float *) c_n);
	printf("Naive MATMUL time     = %u cycles\n", cycles);

	/* Debug aid: uncomment to dump the operands and both results. */
//	matprt((float *) a, N);
//	matprt((float *) b, N);
//	matprt((float *) c_o, N);
//	matprt((float *) c_n, N);

	return EXIT_SUCCESS;
}


/*
 * Prints a square matrix to stdout.
 *
 * a  : flat row-major matrix of NN x NN floats (row stride NN).
 * NN : number of rows and columns.
 *
 * Output is two blank lines followed by one text line per row; every
 * element is formatted with "%9.0f " (width 9, no fractional digits).
 *
 * The row stride is the NN argument rather than the file-level N macro, so
 * the function is correct for any dimension the caller passes.
 */
void matprt(float *a, int NN)
{
	printf("\n");
	printf("\n");
	for (int i = 0; i < NN; i++)
	{
		for (int j = 0; j < NN; j++)
		{
			printf("%9.0f ", a[i*NN + j]);
		}
		printf("\n");
	}
}
