// DOTPROD of two 256-element float vectors:
// Optimized C: 557 cycles
// Naive code:  814 cycles

/*
   e-gcc -Wall -O3 -std=c99 \
         -mlong-calls \
         -mfp-mode=round-nearest \
         -ffp-contract=fast \
         -ffast-math \
         -funroll-loops \
         -T ${EPIPHANY_HOME}/bsps/emek3/fast.ldf \
         -o ./dotprod.elf \
         ./dotprod.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <e_ctimers.h>
#include <e_regs.h>
#include <e_common.h>

// Local prototype for e_ctimer_stop(), which both timing routines below call.
// NOTE(review): presumably the included e-lib headers do not declare it in
// the SDK version this was written for -- confirm before removing.
e_ctimer_config_t e_ctimer_stop(e_ctimer_id_t timer);

// Number of elements in each input vector.
#define N (256)

// Operands and results, each 8-byte aligned and pinned to a named linker
// section. a and b are assigned to different sections (.data_bank2 vs.
// .data_bank3).
// NOTE(review): presumably this keeps the two operand streams in separate
// memory banks -- confirm against the fast.ldf linker script.
float a[N] ALIGN(8) SECTION(".data_bank2");
float b[N] ALIGN(8) SECTION(".data_bank3");
float c_o  ALIGN(8) SECTION(".data_bank3");	// result written by dotprod()
float c_n  ALIGN(8) SECTION(".data_bank3");	// result written by dotprod_naive()

/*
 * Timed dot product of a[0..N-1] and b[0..N-1] using four independent
 * partial sums.
 *
 * The result is stored through c. The return value is the difference
 * between the E_CTIMER_0 readings taken before and after the computation
 * (start minus end), so the measured span covers the accumulator setup,
 * the multiply-add loop and the final reduction.
 *
 * The parameters a and b shadow the file-scope arrays of the same name.
 * The outer loop advances four elements at a time, so N must be a
 * multiple of 4; otherwise the last iteration would read past element N-1.
 *
 * The statement layout is deliberately left untouched: the returned cycle
 * count depends on the code the compiler generates from it.
 */
unsigned dotprod(float * restrict a, float * restrict b, float * restrict c)
{
	int j0, j1;

	unsigned time, time_s, time_e;

	// Load the counter with E_CTIMER_MAX, start it, and take the start reading.
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	// Partial-sum accumulators. Only tot[0..3] are ever initialised or used;
	// the remaining twelve slots are never touched.
	float tot[16];

	for (j1=0; j1<4; j1++)
	{
		tot[j1] = 0.0;
	}

	// Element i contributes to accumulator tot[i % 4]: each outer iteration
	// consumes four consecutive elements, two per inner iteration.
	for (j0=0; j0<N; j0+=4)
	{
		for (j1=0; j1<4; j1+=2)
		{
			// Read both operand pairs into temporaries before the
			// multiply-accumulates.
			float tmp1[2], tmp2[2];
			tmp1[0] = *(a + j0 + j1 + 0);
			tmp1[1] = *(a + j0 + j1 + 1);
			tmp2[0] = *(b + j0 + j1 + 0);
			tmp2[1] = *(b + j0 + j1 + 1);
			tot[j1 + 0] += tmp1[0] * tmp2[0];
			tot[j1 + 1] += tmp1[1] * tmp2[1];
		}
	}

	// Pairwise reduction of the four partial sums into the final result.
	tot[0] += tot[1];
	tot[2] += tot[3];

	*c = tot[0] + tot[2];

	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	// Start minus end: the counter was loaded with E_CTIMER_MAX above and
	// the code treats it as counting down from there.
	time = time_s - time_e;

	return time;
}


/*
 * Timed reference dot product of a[0..N-1] and b[0..N-1]: one running sum,
 * accumulated directly through the output pointer, one element per
 * iteration.
 *
 * The result is stored through c. The return value is the difference
 * between the E_CTIMER_0 readings taken before and after the loop
 * (start minus end).
 *
 * The parameters a and b shadow the file-scope arrays of the same name.
 * This routine is the baseline that dotprod() is compared against, so its
 * statement layout is deliberately left untouched.
 */
unsigned dotprod_naive(float * restrict a, float * restrict b, float * restrict c)
{
	int i;

	unsigned time, time_s, time_e;

	// Load the counter with E_CTIMER_MAX, start it, and take the start reading.
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	// Single accumulator kept in *c; elements are summed in index order.
	*c = 0;
	for (i=0; i<N; i++)
	{
		*c += a[i] * b[i];
	}

	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	// Start minus end: the counter was loaded with E_CTIMER_MAX above and
	// the code treats it as counting down from there.
	time = time_s - time_e;

	return time;
}


// Debug helper, defined at the end of the file: prints the first NN elements of a.
void vecprt(float *a, int NN);

/*
 * Benchmark driver.
 *
 * Fills the input vectors with a[i] = i and b[i] = i + 2, runs the
 * optimized and the naive dot product once each, and prints the timer
 * difference each routine reports. The results are left in c_o and c_n.
 *
 * Returns EXIT_SUCCESS.
 */
int main(void)
{
	unsigned time;

	for (int i = 0; i < N; i++)
	{
		a[i] = (float) i;
		b[i] = (float) (i + 2);
	}

	printf("\n");

	// The timings are unsigned, so they are printed with %u; passing an
	// unsigned argument to %d is a format/argument type mismatch.
	time = dotprod(a, b, &c_o);
	printf("Optimized DOTPROD time = %u cycles\n", time);

	time = dotprod_naive(a, b, &c_n);
	printf("Naive DOTPROD time     = %u cycles\n", time);

	// Debug aid: uncomment to dump the inputs and compare both results.
//	vecprt(a, N);
//	vecprt(b, N);
//	printf("\n%6.0f %6.0f\n", c_o, c_n);

	return EXIT_SUCCESS;
}


/*
 * Print the first NN elements of a on a single line, framed by a newline
 * before and after. Each value is written with the "%9.0f " format, i.e.
 * no fractional digits, padded to a width of nine, followed by a space.
 * Nothing but the two newlines is printed when NN is zero or negative.
 */
void vecprt(float *a, int NN)
{
	int idx = 0;

	putchar('\n');

	while (idx < NN)
	{
		printf("%9.0f ", a[idx]);
		idx++;
	}

	putchar('\n');
}
