/*
  fft2d_unit.c

  Copyright (C) 2012 Adapteva, Inc.
  Contributed by Yaniv Sapir <yaniv@adapteva.com>

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program, see the file COPYING.  If not, see
  <http://www.gnu.org/licenses/>.
*/


// This program is the accelerator part of the 2D-FFT project.
//
// This program runs on the Epiphany system and answers the host with the
// calculation result of the operand matrices.
//
// Jan-2012, YS.


#include <e_coreid.h>
#include <e_ctimers.h>
#include "fft2dlib.h"
#include "fft2d.h"
#include "dmalib.h"
#include "dram_buffers.h"
#include "static_buffers.h"

// Forward declarations of the functions defined in this file.

// Per-core service loop: calls init(), then processes host requests forever.
int  fft2d_unit();
// Row FFTs + corner turn, done twice, with an all-core sync after each pass.
void FFT2D(fft_dir_t dir);
// Sends this core's blocks of the selected buffer to every core's target buffer.
void corner_turn(int pingpong);
// In-place scaling / zeroing of the local rows between the two FFT2D() calls.
void LPF(int lgNN);
// One-time setup of mailbox pointers, buffer pointers and sync flags.
void init();


///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
// Main per-core service loop of the accelerator.
//
// Calls init() once, then loops forever. Each iteration:
//   1. waits for a "go" (core 0 polls the host mailbox, every other core
//      polls its own go_sync word),
//   2. forwards the "go" to the next core through me.tgt_go_sync,
//   3. optionally loads this core's rows from DRAM (_USE_DRAM_),
//   4. runs FFT2D(forward) -> LPF -> FFT2D(backward),
//   5. optionally stores the rows back to DRAM,
//   6. takes part in the end-of-calculation handshake; core 0 also stores
//      the ctimer0 difference in the mailbox and raises "ready".
//
// dstate() is defined outside this file; the numbered calls look like
// progress markers for debugging - confirm against its definition.
//
// NOTE(review): the busy-wait loops below only behave as intended if the
// polled objects (me.go_sync, *me.tgt_go_sync, Mailbox.pCore->go) are
// volatile-qualified where they are declared - confirm.
//
// The declared return value is 0, but the while (1) loop contains no break,
// so the return statement is never reached.
int fft2d_unit()
{
	// Initialize data structures - mainly target pointers
	dstate(1);
	init();
	dstate(2);

	while (1)
	{
		dstate(3);
		if (me.corenum == 0)
		{
			// Core 0 is triggered by the host: spin until the "go" flag
			// in the mailbox becomes non-zero.
			while (Mailbox.pCore->go == 0) {};

			// Set up ctimer0 (E_CTIMER_CLK, E_CTIMER_MAX) and read the
			// ctimer0 register into me.time_s as the start timestamp.
			e_ctimer_set(E_CTIMER_0, E_CTIMER_CLK, E_CTIMER_MAX);
			__asm__ volatile ("movfs %0, ctimer0" : "=r" (me.time_s));

			// Clear "ready" for the duration of the calculation; it is set
			// back to 1 at the end of this iteration.
			Mailbox.pCore->ready = 0;

			// Mark own go_sync with 64. Core 0 later waits for this value
			// to be overwritten (see the end-of-calculation branch below).
			// NOTE(review): 64 must differ from every corenum + 1 written
			// by another core, i.e. this presumably assumes fewer than 64
			// cores - confirm.
			me.go_sync = 64;
		} else {
			// Every other core is triggered by the previous core in the
			// chain: spin until own go_sync becomes non-zero.
			while (me.go_sync == 0) {};
		}
		// Signal "go" to next core: write a non-zero value (corenum + 1)
		// into the go_sync word that me.tgt_go_sync points at (set in init()).
		dstate(4);
		*me.tgt_go_sync = me.corenum + 1;

		// Load _Score rows from DRAM: this core's slice of Mailbox.pA starts
		// at element me.corenum * _Score * _Sfft and goes to bank A (PING).
#ifdef _USE_DRAM_
#	ifdef _USE_DMA_E_
		dmacpye((void *) &(Mailbox.pA[me.corenum * _Score * _Sfft]), me.bank[_BankA][_PING]);
#	else // _USE_DMA_E_
#		warning "Using rowcpy() instead of DMA_E"
		rowcpy(&(Mailbox.pA[me.corenum * _Score * _Sfft]), me.bank[_BankA][_PING], _Score * _Sfft);
#	endif // _USE_DMA_E_
#endif // _USE_DRAM_

		dstate(5);
		
		// Calculate: forward 2D FFT, filter, backward 2D FFT. During this
		// time the host is expected to poll core 0's mailbox for the end of
		// the calculation ("ready" is raised at the bottom of this loop).
		FFT2D(e_fft_fwd);

		LPF(_lgSfft);

		FFT2D(e_fft_bwd);

		dstate(6);

		// Save _Score rows to DRAM: bank A (PING) goes to this core's slice
		// of Mailbox.pB, at the same element offset used for the load.
#ifdef _USE_DRAM_
#	ifdef _USE_DMA_E_
		dmacpye(me.bank[_BankA][_PING], (void *) &(Mailbox.pB[me.corenum * _Score * _Sfft]));
#	else // _USE_DMA_E_
#		warning "Using rowcpy() instead of DMA_E"
		rowcpy(me.bank[_BankA][_PING], &(Mailbox.pB[me.corenum * _Score * _Sfft]), _Score * _Sfft);
#	endif // _USE_DMA_E_
#endif // _USE_DRAM_

		// End-of-calculation handshake. A core reports completion by
		// clearing its own go_sync; each core watches the go_sync of the
		// next core through me.tgt_go_sync.
		dstate(7);
		if (me.corenum == 0)
		{
			// Wait until the 64 written above has been overwritten, i.e.
			// until some other core has written its "go" into core 0's
			// go_sync. Then clear it to signal own end-of-calculation.
			while (me.go_sync == 64) {};
			me.go_sync = 0;
			// Wait until the next core has cleared its go_sync.
			dstate(8);
			while (*me.tgt_go_sync > 0) {};
			dstate(9);

			// Read ctimer0 again as the end timestamp. The result is
			// start minus end, presumably because the timer counts down
			// from E_CTIMER_MAX - confirm.
			__asm__ volatile ("movfs %0, ctimer0" : "=r" (me.time_e));
			me.time_f = me.time_s - me.time_e;

			// Publish the timing, then drop "go" and raise "ready" for the host.
			Mailbox.pCore->time_f = me.time_f;
			Mailbox.pCore->go = 0;
			Mailbox.pCore->ready = 1;
			dstate(10);
		} else {
			// Once the next core has cleared its go_sync, clear own
			// go_sync to signal end-of-calculation to the previous core.
			dstate(11);
			while (*me.tgt_go_sync > 0) {};
			dstate(12);
			me.go_sync = 0;
			dstate(13);
		}
	}

	// Not reached: the loop above never terminates.
	return 0;
}


///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
// Run one distributed 2D FFT over this core's block of rows.
//
// dir selects where in the Wn table the 1D FFT reads its coefficients:
// e_fft_fwd starts at the beginning of the table, any other value starts
// _Sfft/2 entries in.
//
// The work is done in two passes. Each pass bit-reverses and transforms the
// _Score local rows, corner-turns the result to all cores, and then
// handshakes with all _Ncores cores through the sync flags. Pass 1 operates
// on the PING buffer of bank A, pass 2 on the PONG buffer.
void FFT2D(fft_dir_t dir)
{
	int r, c;
	int Wn_offset = (dir == e_fft_fwd) ? 0 : (_Sfft >> 1);

	dstate(100);
	// Start from a clean set of sync flags
	for (c = 0; c < _Ncores; c++)
	{
		me.sync[c] = 0;
	}

	// ---- Pass 1: PING buffer ----

	// Bit-reversal reordering of the local rows
	bitrev(me.bank[_BankA][_PING], _lgSfft, _Score);

	// 1D FFT on each of the _Score local rows
	for (r = 0; r < _Score; r++)
	{
		fft_1d_r2_dit(_lgSfft, &me.bank[_BankA][_PING][r * _Sfft], me.bank[_BankW][_PING] + Wn_offset, _Sfft);
	}

	dstate(101);
	// Distribute the transformed blocks to all cores
	corner_turn(_PING);

	dstate(102);
	// Tell every core that this core's pass-1 data has been sent
	for (c = 0; c < _Ncores; c++)
	{
		*me.tgt_sync[c] = 1;
	}

	// Block until every core has reported the same
	for (c = 0; c < _Ncores; c++)
	{
		while (me.sync[c] == 0)
		{
		}
	}

	// ---- Pass 2: PONG buffer ----

	dstate(103);
	// Bit-reversal reordering of the received rows
	bitrev(me.bank[_BankA][_PONG], _lgSfft, _Score);

	// 1D FFT on each of the _Score rows
	for (r = 0; r < _Score; r++)
	{
		fft_1d_r2_dit(_lgSfft, &me.bank[_BankA][_PONG][r * _Sfft], me.bank[_BankW][_PING] + Wn_offset, _Sfft);
	}

	dstate(104);
	// Distribute the transformed blocks to all cores
	corner_turn(_PONG);

	dstate(105);
	// Tell every core that this core's pass-2 data has been sent
	for (c = 0; c < _Ncores; c++)
	{
		*me.tgt_sync[c] = 2;
	}

	// Block until every core's flag has moved on from the pass-1 value
	for (c = 0; c < _Ncores; c++)
	{
		while (me.sync[c] == 1)
		{
		}
	}

	dstate(106);
	return;
}


///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
// Corner turn: send one block of the local buffer to every core.
//
// pingpong selects which local buffer of bank A is the source, and which
// entry of me.tgt_bk[][_BankA][] is the destination on each target core.
// For target core `peer`, the source block starts _Score * peer elements
// into the local buffer, and the destination block starts
// _Score * me.corenum elements into that core's target buffer.
//
// With _USE_DMA_I_ defined each block is handed to dmacpyi(); otherwise the
// _Score x _Score block is copied element by element with rows and columns
// swapped.
void corner_turn(int pingpong)
{
	unsigned peer;
#ifndef _USE_DMA_I_
#	warning "Using memcpy() instead of DMA_I"
	unsigned r, c;
#endif

	for (peer = 0; peer < _Ncores; peer++)
	{
#ifdef _USE_DMA_I_
		dstate(200 + peer);
		dmacpyi((void *) (me.bank[_BankA][pingpong] + _Score * peer),
		        (void *) (me.tgt_bk[peer][_BankA][pingpong] + _Score * me.corenum));
#else
		// Transposed copy of the block destined for this target core
		for (r = 0; r < _Score; r++)
		{
			for (c = 0; c < _Score; c++)
			{
				me.tgt_bk[peer][_BankA][pingpong][_Sfft * c + _Score * me.corenum + r] =
				        me.bank[_BankA][pingpong][_Sfft * r + _Score * peer + c];
			}
		}
#endif
	}

	return;
}


///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
// Filter the local rows of bank A (PING) in place.
//
// Fco sets the half-width of the zeroed band in sixteenths:
//   - a core whose number lies in [(8-Fco), (8+Fco)) * (_Ncores >> 4)
//     zeroes its whole _Score * _Sfft buffer;
//   - every other core, for each of its _Score rows, zeroes the columns in
//     [(8-Fco), (8+Fco)) * (_Sfft >> 4) and multiplies the remaining columns
//     up to 16 * (_Sfft >> 4) by recipro_2_by[lgNN + lgNN].
//
// recipro_2_by is defined outside this file; presumably entry i holds
// 1 / 2^i, making the factor a 1/N^2 normalization - confirm.
void LPF(int lgNN)
{
	#define Fco 2
	int row, col, k;

	// Cores that hold only stop-band rows: clear everything and leave
	if ((me.corenum >= (8-Fco)*(_Ncores >> 4)) && (me.corenum < (8+Fco)*(_Ncores >> 4)))
	{
		k = 0;
		while (k < (_Score * _Sfft))
		{
			me.bank[_BankA][_PING][k] = 0;
			k++;
		}
		return;
	}

	// All other cores: walk the buffer linearly, one row at a time
	k = 0;
	for (row = 0; row < _Score; row++)
	{
		for (col = 0; col < ((8+8)*(_Sfft>>4)); col++, k++)
		{
			if ((col >= ((8-Fco)*(_Sfft>>4))) && (col < ((8+Fco)*(_Sfft>>4))))
			{
				// Stop-band column
				me.bank[_BankA][_PING][k] = 0;
			}
			else
			{
				// Pass-band column: apply the scale factor
				me.bank[_BankA][_PING][k] *= recipro_2_by[lgNN+lgNN];
			}
		}
	}

	return;
}


///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
// One-time per-core setup, called once from fft2d_unit().
//
// Fills in the Mailbox pointers into shared DRAM, records this core's
// identity and coordinates in `me`, sets up the local buffer pointers,
// computes the remote addresses used for the corner turn and for the sync
// flags, generates the Wn table, clears the sync flags and finally marks
// the accelerator as ready in the mailbox.
void init()
{
	int row, col, idx;
	e_coreid_t peer;

	// Mailbox: base of the shared buffer plus the offsets of its members
	Mailbox.pBase = (void *) SHARED_DRAM;
	Mailbox.pA    = Mailbox.pBase + offsetof(shared_buf_t, A[0]);
	Mailbox.pB    = Mailbox.pBase + offsetof(shared_buf_t, B[0]);
	Mailbox.pCore = Mailbox.pBase + offsetof(shared_buf_t, core);

	// Identity of this core, and its coordinates relative to the first core
	me.coreID  = e_get_coreid();
	me.corenum = E_CORE_NUM;
	me.row     = E_CORE_ROW - E_FIRST_CORE_ROW;
	me.col     = E_CORE_COL - E_FIRST_CORE_COL;

	// Local buffers: ping/pong operand arrays and the Wn table
	me.bank[_BankA][_PING] = (cfloat *) &AA[0][0];
	me.bank[_BankA][_PONG] = (cfloat *) &BB[0][0];
	me.bank[_BankW][_PING] = (cfloat *) &Wn[0];

	// For every core of the chip, in row-major order, compute the remote
	// addresses this core writes to:
	//   - corner-turn targets are crossed: the PING entry points at the
	//     remote PONG buffer and the PONG entry at the remote PING buffer;
	//   - the sync target is this core's own slot (index me.corenum) in the
	//     remote core's sync array.
	idx = 0;
	for (row = E_FIRST_CORE_ROW; row < (E_FIRST_CORE_ROW + E_ROWS_IN_CHIP); row++)
	{
		for (col = E_FIRST_CORE_COL; col < (E_FIRST_CORE_COL + E_COLS_IN_CHIP); col++, idx++)
		{
			peer = e_coreid_from_coords(row, col);
			me.tgt_bk[idx][_BankA][_PING] = e_address_from_coreid(peer, (void *) me.bank[_BankA][_PONG]);
			me.tgt_bk[idx][_BankA][_PONG] = e_address_from_coreid(peer, (void *) me.bank[_BankA][_PING]);
			me.tgt_sync[idx]              = e_address_from_coreid(peer, (void *) &me.sync[me.corenum]);
		}
	}

	// The "go" chain: address of go_sync in the next core, wrapping around
	// the chip (E_NEXT_CORE, E_CHIP_WRAP)
	me.coreIDn = me.coreID;
	e_neighbor_id(&me.coreIDn, E_NEXT_CORE, E_CHIP_WRAP);
	me.tgt_go_sync = e_address_from_coreid(me.coreIDn, (void *) &me.go_sync);

	// Fill the Wn table
	generateWn(me.bank[_BankW][_PING], _lgSfft);

	// Inter-core sync signals start out cleared
	me.go_sync = 0;
	for (idx = 0; idx < _Ncores; idx++)
	{
		me.sync[idx] = 0;
	}

	// Host-accelerator sync signals: no request pending, accelerator ready
	Mailbox.pCore->go = 0;
	Mailbox.pCore->ready = 1;

#if 0
	// Initialize input image - to be removed XXX
	for (row=0; row<_Score; row++)
	{
		for (col=0; col<_Sfft; col++)
			*(me.bank[_BankA][_PING] + row * _Sfft + col) = (me.corenum * _Score + row)*1000.0 + col;
		// convert to eDMA
		rowcpy((me.bank[_BankA][_PING] + row * _Sfft), &(Mailbox.pA[(me.corenum * _Score + row) * _Sfft]), _Sfft);
	}
#endif

	return;
}
