// IIR - Dual BiQuads
// Optimized C: 992 cycles
// Naive code: 1536 cycles

/*
   e-gcc -Wall -O3 -std=c99 \
         -mlong-calls \
         -mfp-mode=round-nearest \
         -ffp-contract=fast \
         -ffast-math \
         -funroll-loops \
         -T ${EPIPHANY_HOME}/bsps/emek3/fast.ldf \
         -o ./iir.elf \
         ./iir.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <e_ctimers.h>
#include <e_regs.h>
#include <e_common.h>

// NOTE(review): e_ctimer_stop() is presumably already declared by
// <e_ctimers.h>; confirm whether this local prototype is still required.
e_ctimer_config_t e_ctimer_stop(e_ctimer_id_t timer);

// Filter dimensions:
//   _Ntaps  - coefficients (and delay-line slots) copied per stage and direction
//   _Nstage - number of filter stages; both are fed the same input sample
//   _Ndata  - number of samples in the input and output buffers
#define _Ntaps   3
#define _Nstage  2
#define _Ndata  64

// Second index of coeffs[][][]: selects the backward (IIR) or the
// forward (FIR) coefficient row of a stage.
#define IIR 0
#define FIR 1

// Fills a[0 .. _Ndata-1] with the test input signal (defined at end of file).
void init_data(float *a);

// Coefficient table, indexed [stage][IIR|FIR][tap].
// Each row holds _Ntaps+1 floats, but iir() and iir_naive() only read the
// first _Ntaps of them; the extra element is presumably padding -- TODO confirm.
// With the values below, all forward coefficients of stage 1 are zero, so
// stage 1 adds zero to the output sum.
// NOTE(review): ALIGN() and SECTION() are not defined in this file; presumably
// they come from <e_common.h> and request 8-byte alignment and placement in
// the ".data_bank2" linker section -- confirm against the SDK headers.
float coeffs[_Nstage][2][_Ntaps+1] ALIGN(8) SECTION(".data_bank2") =
	{{/*IIR-0*/ {1, 0.5, 0, 0}, /*FIR-0*/ {0.5, 0.5, 0, 0}},
	 {/*IIR-1*/ {1, 0,   0, 0}, /*FIR-1*/ {0,   0,   0, 0}}};
// Input samples, written by init_data() and read by the filters.
float inp_data[_Ndata] ALIGN(8) SECTION(".data_bank2");
// Output samples, overwritten by every call to iir() or iir_naive().
float out_data[_Ndata] ALIGN(8) SECTION(".data_bank2");

/*
 * Hand-unrolled version of the two-stage filter.
 *
 * Copies the first _Ntaps coefficients of each stage from coeffs[] into
 * local arrays, clears the delay lines, then filters inp_data[] into
 * out_data[] eight samples per loop iteration (_Ndata must therefore be a
 * multiple of 8). Both stages receive the same input sample and their
 * forward outputs are summed into the output sample.
 *
 * Returns the difference between the E_CTIMER_0 readings taken just before
 * and just after the filter loop (start minus end).
 */
unsigned iir()
{
	int register rdp; // index of the first sample handled by the current iteration
	int register cp;  // index used while copying the coefficients
	unsigned time, time_s, time_e;

	float ca[_Nstage][_Ntaps]; // bwd coefficients
	float cb[_Nstage][_Ntaps]; // fwd coefficients
	float dl[_Nstage][_Ntaps]; // delay line (the loop below only uses slots 0 and 1)
	float iir[4];              // temporaries: [0],[1] bwd results, [2],[3] fwd results of stage 0,1

	// Clear the delay lines and take local copies of the coefficients.
	// NOTE(review): presumably done so the compiler can keep them in
	// registers during the timed loop -- confirm with the generated code.
	for (cp=0; cp<_Ntaps; cp++)
	{
		dl[0][cp] = 0.0;
		ca[0][cp] = coeffs[0][IIR][cp];
		cb[0][cp] = coeffs[0][FIR][cp];

		dl[1][cp] = 0.0;
		ca[1][cp] = coeffs[1][IIR][cp];
		cb[1][cp] = coeffs[1][FIR][cp];
	}

	// Load and start timer 0, then take the starting reading.
	// NOTE(review): presumably the timer counts clock cycles down from
	// E_CTIMER_MAX -- confirm against the e-lib documentation.
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	// Timed section: the same per-sample step is written out eight times.
	for (rdp=0; rdp<_Ndata; rdp+=8)
	{
		// --- sample rdp+0 ---
		// Stage 0: backward part from the input and the two delayed values,
		// forward part from the new value and the same two delayed values,
		// then shift the delay line.
		iir[0]   = ca[0][0] * (inp_data[rdp+0] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		// Stage 1: identical computation on the same input sample.
		iir[1]   = ca[1][0] * (inp_data[rdp+0] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		// Output is the sum of both stages' forward results.
		out_data[rdp+0] = iir[2] + iir[3];


		// --- sample rdp+1 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+1] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+1] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+1] = iir[2] + iir[3];


		// --- sample rdp+2 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+2] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+2] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+2] = iir[2] + iir[3];


		// --- sample rdp+3 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+3] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+3] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+3] = iir[2] + iir[3];


		// --- sample rdp+4 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+4] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+4] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+4] = iir[2] + iir[3];


		// --- sample rdp+5 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+5] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+5] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+5] = iir[2] + iir[3];


		// --- sample rdp+6 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+6] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+6] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+6] = iir[2] + iir[3];


		// --- sample rdp+7 ---
		iir[0]   = ca[0][0] * (inp_data[rdp+7] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]           + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		iir[1]   = ca[1][0] * (inp_data[rdp+7] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]           + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		out_data[rdp+7] = iir[2] + iir[3];
	}

	// Take the closing reading before stopping the timer.
	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	// Start minus end: yields the elapsed count if the timer counts down
	// (see the NOTE(review) above the timer setup).
	time = time_s - time_e;

	return time;
}


/*
 * Straightforward (not unrolled) version of the two-stage filter, used as
 * the timing baseline.
 *
 * Copies the first _Ntaps coefficients of each stage from coeffs[] into
 * local arrays, clears the delay lines, then filters inp_data[] into
 * out_data[] one sample per loop iteration. Both stages receive the same
 * input sample and their forward outputs are summed into the output sample.
 *
 * Returns the difference between the E_CTIMER_0 readings taken just before
 * and just after the filter loop (start minus end).
 */
unsigned iir_naive()
{
	int register rdp; // index of the current sample in inp_data/out_data
	int register cp;  // index used while copying the coefficients
	unsigned time, time_s, time_e;

	float ca[_Nstage][_Ntaps]; // bwd coefficients
	float cb[_Nstage][_Ntaps]; // fwd coefficients
	float dl[_Nstage][_Ntaps]; // delay line (the loop below only uses slots 0 and 1)
	float iir[4];              // temporaries: [0],[1] bwd results, [2],[3] fwd results of stage 0,1

	// Take local copies of the coefficients and clear the delay lines.
	for (cp=0; cp<_Ntaps; cp++)
	{
		ca[0][cp] = coeffs[0][IIR][cp];
		cb[0][cp] = coeffs[0][FIR][cp];
		dl[0][cp] = 0.0;

		ca[1][cp] = coeffs[1][IIR][cp];
		cb[1][cp] = coeffs[1][FIR][cp];
		dl[1][cp] = 0.0;
	}

	// Load and start timer 0, then take the starting reading.
	// NOTE(review): presumably the timer counts clock cycles down from
	// E_CTIMER_MAX -- confirm against the e-lib documentation.
	e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
	e_ctimer_start(E_CTIMER_0, E_CTIMER_CLK);
	time_s = e_ctimer_get(E_CTIMER_0);

	// Timed section: one input sample per iteration.
	for (rdp=0; rdp<_Ndata; rdp++)
	{
		// Stage 0: backward part from the input and the two delayed values,
		// forward part from the new value and the same two delayed values,
		// then shift the delay line.
		iir[0]   = ca[0][0] * (inp_data[rdp] + ca[0][1] * dl[0][0] + ca[0][2] * dl[0][1]);
		iir[2]   = cb[0][0] * iir[0]         + cb[0][1] * dl[0][0] + cb[0][2] * dl[0][1];
		dl[0][1] = dl[0][0];
		dl[0][0] = iir[0];

		// Stage 1: identical computation on the same input sample.
		iir[1]   = ca[1][0] * (inp_data[rdp] + ca[1][1] * dl[1][0] + ca[1][2] * dl[1][1]);
		iir[3]   = cb[1][0] * iir[1]         + cb[1][1] * dl[1][0] + cb[1][2] * dl[1][1];
		dl[1][1] = dl[1][0];
		dl[1][0] = iir[1];

		// Output is the sum of both stages' forward results.
		out_data[rdp] = iir[2] + iir[3];
	}

	// Take the closing reading before stopping the timer.
	time_e = e_ctimer_get(E_CTIMER_0);
	e_ctimer_stop(E_CTIMER_0);

	// Start minus end: yields the elapsed count if the timer counts down
	// (see the NOTE(review) above the timer setup).
	time = time_s - time_e;

	return time;
}


/*
 * Program entry point.
 *
 * Fills inp_data with the test signal, runs the optimized and the naive
 * filter once each and prints the value each of them returns.
 * Note that out_data holds the naive filter's result afterwards, because
 * both filters write to the same buffer.
 *
 * Returns EXIT_SUCCESS.
 */
int main(void) {
	unsigned time;

	init_data(inp_data);

	time = iir();

	// time is unsigned, so it has to be printed with %u rather than %d.
	printf("Optimized IIR time = %u cycles\n", time);

	time = iir_naive();

	printf("Naive IIR time     = %u cycles\n", time);

	// Debug aid: uncomment to dump input and output sample by sample.
//	for (int rdp=0; rdp<_Ndata; rdp++)
//	{
//		printf("%8d, %9.3f, %9.3f\n", rdp, inp_data[rdp], out_data[rdp]);
//	}

	return EXIT_SUCCESS;
}


/*
 * Fill the test input buffer.
 *
 * Writes _Ndata samples to a[]: 1 at index 0, 2 at index _Ndata/2 and 0
 * everywhere else, i.e. two isolated pulses of different height.
 *
 * a: destination buffer; must have room for at least _Ndata floats.
 */
void init_data(float *a)
{
	int i;

	// Alternative stimuli kept for experimentation (swap in for the
	// assignment below):
	//   a[i] = sin(2 * 3.14159 * 2 / _Ndata * i);   // sine, 2 periods per buffer
	//   a[i] = -i;                                   // negative ramp
	for (i=0; i<_Ndata; i++)
	{
		a[i] = (i == 0) ? 1.0f : (i == _Ndata/2) ? 2.0f : 0.0f;
	}
}
