// FIR-16
// Optimized C: 620 cycles
// Naive code: 1562 cycles

/*
   e-gcc -Wall -O3 -std=c99 \
         -mlong-calls \
         -mfp-mode=round-nearest \
         -ffp-contract=fast \
         -funroll-loops \
         -T ${EPIPHANY_HOME}/bsps/emek3/fast.ldf \
         -o ./fir.elf \
         ./fir.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <e_ctimers.h>
#include <e_regs.h>
#include <e_common.h>

// Explicit prototype for e_ctimer_stop(), which fir() and fir_naive() call
// once they have read the final timer value.
// NOTE(review): presumably repeated here because the installed e_ctimers.h
// does not declare it -- confirm, and drop this line if the header does.
e_ctimer_config_t e_ctimer_stop(e_ctimer_id_t timer);

// _Ntaps: number of filter taps (length of coeffs[]).
// _Ndata: number of samples filtered per run (two tap-lengths' worth).
#define _Ntaps 16
#define _Ndata (_Ntaps * 2)

// FIR coefficients, stored in descending order: coeffs[0] holds Coeff_(N-1)
// and coeffs[_Ntaps-1] holds Coeff_0.
// The values below are symmetric, so reading the table in either direction
// yields the same filter.
float coeffs[_Ntaps] =
	{0.01, 0.02, 0.04, 0.08,
	 0.16, 0.32, 0.64, 1.28,
	 1.28, 0.64, 0.32, 0.16,
	 0.08, 0.04, 0.02, 0.01};

// Sample buffers shared by main(), init_data() and both filter routines:
//   inp_data - stimulus filled by init_data()
//   out_o    - output of fir()
//   out_n    - output of fir_naive()
// SECTION is not defined in this file; presumably it comes from e_common.h
// and places each object in the named linker section -- confirm against the
// linker script (fast.ldf) named in the build command.
float inp_data[_Ndata] SECTION(".data_bank2");
float out_o[_Ndata]    SECTION(".data_bank3");
float out_n[_Ndata]    SECTION(".data_bank3");

/*
 * fir - hand-unrolled FIR filter, timed with E_CTIMER_0.
 *
 * Filters the _Ndata samples in a[] into b[]:
 *
 *     b[n] = sum over k of Coeff_k * a[n-k],   k = 0 .. _Ntaps-1
 *
 * with a[m] taken as 0 for m < 0.  coeffs[] is stored in descending order
 * (coeffs[0] = Coeff_(N-1)), so c[cp] is paired with sample a[n-(N-1)+cp].
 *
 * Eight outputs are produced per inner pass so that every coefficient load
 * feeds eight multiply-accumulates.
 *
 * Delay line layout: a circular buffer of `ring` slots, sample m living in
 * slot (m mod ring).  Every slot is written twice, at [s] and [s + ring], so
 * that any run of _Ntaps consecutive samples can be read with a plain
 * increasing index and no wrap test inside the MAC loop.
 *
 * The ring is 2*_Ntaps slots long on purpose.  A block stores its eight new
 * samples *before* computing its eight outputs, and the first of those
 * outputs still needs the _Ntaps-1 samples preceding the block.  With only
 * _Ntaps slots the new samples would overwrite seven of those older samples
 * and the outputs would pick up "future" input instead of history.
 * Constraints: ring must be a multiple of 8 and at least _Ntaps - 1 + 8.
 *
 * a: input samples,  at least _Ndata floats.
 * b: output samples, at least _Ndata floats.
 *
 * Returns the difference between the E_CTIMER_0 reading taken before the
 * filter loop and the one taken after it (start - end, which presumes the
 * timer counts down -- confirm against the e-lib documentation).
 * Coefficient copy and delay-line clearing are outside the timed region.
 *
 * NOTE: any cycle count quoted elsewhere for this routine should be
 * re-measured after changes to the delay-line handling.
 */
unsigned fir(float *a, float *b) {
	enum {
		blk  = 8,          // outputs produced per inner pass (matches the manual unroll)
		ring = 2 * _Ntaps  // logical length of the circular delay line
	};

	int rdp; // start of the current _Ntaps-sized chunk in a[] / b[]
	int wrp; // offset of the current 8-sample block inside that chunk
	int pos; // ring slot that receives the first sample of the block
	int cp;  // coefficient index
	int dlp; // delay-line index paired with c[cp] for output 0 of the block
	unsigned time, time_s, time_e;

	float c[_Ntaps];    // local copy of the FIR coefficients
	float dl[2 * ring]; // delay line: ring slots followed by their mirror
	float acc[8];       // one accumulator per output of the block

	for (cp=0; cp<_Ntaps; cp++)
	{
		c[cp] = coeffs[cp];
	}
	for (dlp=0; dlp<2*ring; dlp++)
	{
		dl[dlp] = 0.0f; // samples before a[0] are treated as zero
	}

	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	pos = 0;
	for (rdp=0; rdp<_Ndata; rdp+=_Ntaps)
	{
		for (wrp=0; wrp<_Ntaps; wrp+=blk)
		{
			// Store the eight new samples in the ring and in its mirror.
			dl[pos+0] = dl[pos+ring+0] = a[rdp+wrp+0];
			dl[pos+1] = dl[pos+ring+1] = a[rdp+wrp+1];
			dl[pos+2] = dl[pos+ring+2] = a[rdp+wrp+2];
			dl[pos+3] = dl[pos+ring+3] = a[rdp+wrp+3];
			dl[pos+4] = dl[pos+ring+4] = a[rdp+wrp+4];
			dl[pos+5] = dl[pos+ring+5] = a[rdp+wrp+5];
			dl[pos+6] = dl[pos+ring+6] = a[rdp+wrp+6];
			dl[pos+7] = dl[pos+ring+7] = a[rdp+wrp+7];

			acc[0] = acc[1] = acc[2] = acc[3] = 0;
			acc[4] = acc[5] = acc[6] = acc[7] = 0;

			// Output k of this block needs the _Ntaps samples ending at slot
			// pos+k.  The oldest of them is _Ntaps-1 slots earlier; adding
			// `ring` keeps the index non-negative and lands in the mirror
			// when necessary.  Highest index touched: pos + ring + 7.
			for (cp=0, dlp=(pos + ring - (_Ntaps-1)); cp<_Ntaps; cp++, dlp++)
			{
				acc[0] += c[cp] * dl[dlp + 0];
				acc[1] += c[cp] * dl[dlp + 1];
				acc[2] += c[cp] * dl[dlp + 2];
				acc[3] += c[cp] * dl[dlp + 3];
				acc[4] += c[cp] * dl[dlp + 4];
				acc[5] += c[cp] * dl[dlp + 5];
				acc[6] += c[cp] * dl[dlp + 6];
				acc[7] += c[cp] * dl[dlp + 7];
			}

			b[rdp+wrp+0] = acc[0];
			b[rdp+wrp+1] = acc[1];
			b[rdp+wrp+2] = acc[2];
			b[rdp+wrp+3] = acc[3];
			b[rdp+wrp+4] = acc[4];
			b[rdp+wrp+5] = acc[5];
			b[rdp+wrp+6] = acc[6];
			b[rdp+wrp+7] = acc[7];

			// Advance the write position, wrapping at the end of the ring.
			pos += blk;
			if (pos == ring)
			{
				pos = 0;
			}
		}
	}

	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);
	time = time_s - time_e;

	return time;
}


/*
 * fir_naive - straightforward shift-register FIR filter, timed with
 * E_CTIMER_0.  Serves as the reference/baseline for fir().
 *
 * Filters the _Ndata samples in a[] into b[]:
 *
 *     b[n] = sum over k of Coeff_k * a[n-k],   k = 0 .. _Ntaps-1
 *
 * with a[m] taken as 0 for m < 0.  coeffs[] is stored in descending order
 * (coeffs[0] = Coeff_(N-1), coeffs[_Ntaps-1] = Coeff_0), therefore the
 * coefficient for the sample that is k steps old is c[_Ntaps-1-k].  This is
 * the same pairing fir() uses, so both routines compute the same filter
 * even when the coefficient table is not symmetric.
 *
 * a: input samples,  at least _Ndata floats.
 * b: output samples, at least _Ndata floats.
 *
 * Returns the difference between the E_CTIMER_0 reading taken before the
 * filter loop and the one taken after it (start - end, which presumes the
 * timer counts down -- confirm against the e-lib documentation).
 * Coefficient copy and delay-line clearing are outside the timed region.
 */
unsigned fir_naive(float *a, float *b) {
	int rdp; // index of the current input/output sample
	int cp;  // tap index: age of the sample in the delay line
	unsigned time, time_s, time_e;

	float c[_Ntaps];  // local copy of the FIR coefficients
	float dl[_Ntaps]; // shift register; slots 0 .. _Ntaps-2 are used
	float acc;        // running sum for the current output

	for (cp=0; cp<_Ntaps; cp++)
	{
		dl[cp] = 0.0f; // samples before a[0] are treated as zero
		c[cp]  = coeffs[cp];
	}

	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	for (rdp=0; rdp<_Ndata; rdp+=1)
	{
		// On entry dl[k] holds a[rdp-1-k], so the oldest tap a[rdp-(N-1)]
		// is in dl[_Ntaps-2]; it is consumed here and then shifted out.
		acc = c[0] * dl[_Ntaps-2];
		for (cp=(_Ntaps-2); cp>=1; cp--)
		{
			dl[cp] = dl[cp-1];               // after the shift dl[cp] == a[rdp-cp]
			acc += c[_Ntaps-1-cp] * dl[cp];  // Coeff_cp * a[rdp-cp]
		}
		dl[0] = a[rdp];
		acc += c[_Ntaps-1] * dl[0];          // Coeff_0 * newest sample
		b[rdp] = acc;
	}

	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);
	time = time_s - time_e;

	return time;
}


void init_data(float *a);

/*
 * main - benchmark driver.
 *
 * Fills inp_data with the test stimulus, runs the optimized and the naive
 * filter over it (results in out_o and out_n respectively) and prints the
 * timer difference each routine reports.
 *
 * Returns EXIT_SUCCESS.
 */
int main(void) {
	unsigned time;

	init_data(inp_data);

	time = fir(inp_data, out_o);

	// time is unsigned, so it must be printed with %u, not %d.
	printf("Optimized FIR time = %u cycles\n", time);

	time = fir_naive(inp_data, out_n);

	printf("Naive FIR time     = %u cycles\n\n", time);

	// Debug aid: uncomment to print input and both outputs side by side.
//	for (int rdp=0; rdp<_Ndata; rdp++)
//	{
//		printf("%8d, %9.3f, %9.3f, %9.3f\n", rdp, inp_data[rdp], out_o[rdp], out_n[rdp]);
//	}

	return EXIT_SUCCESS;
}


/*
 * init_data - fill a[0 .. _Ndata-1] with the test stimulus.
 *
 * The stimulus is two impulses on an otherwise silent signal: amplitude 1
 * at index 0 and amplitude 2 at index _Ndata/2.  When both indices
 * coincide, the index-0 impulse takes precedence.
 *
 * Other stimuli that have been used here: a sine wave,
 *     a[i] = sin(2 * 3.14159 * freq / _Ndata * i)   with freq = 2,
 * and a falling ramp, a[i] = -i.
 *
 * a: destination buffer, at least _Ndata floats.
 */
void init_data(float *a)
{
	const int second_impulse = _Ndata / 2;
	int idx = 0;

	while (idx < _Ndata)
	{
		float sample;

		if (idx == 0)
		{
			sample = 1;
		}
		else if (idx == second_impulse)
		{
			sample = 2;
		}
		else
		{
			sample = 0;
		}

		a[idx] = sample;
		idx++;
	}
}
