User Tools

Site Tools


code_examples

HeatFlow

The Heatflow example is an application that simulates how heat spreads on a 1-Dimensional string. This string is divided in cells representing by a float data type that stores its temperature. The application calculates how the heat spreads in the string over N_ITER time-steps.

Code

#pragma polca output heatmap
void init(float **heatmap,float **heatmap_tmp) {
  int i;
 
#pragma polca memalloc (sizeof(float)) n_elem heatmap
  *heatmap = malloc(sizeof(float) * n_elem);
 
#pragma polca memalloc (sizeof(float)) n_elem heatmap_tmp
  *heatmap_tmp = malloc(sizeof(float) * n_elem);
 
  for(i=0; i<n_elem; i++) {
    (*heatmap)[i] = 0.0;
  }
 
  (*heatmap)[10] = 100.0;
}
 
#pragma polca inout heatmap
void heatspread(float *heatmap, float *heatmap_tmp) {
  int i;
 
#pragma polca stencil1D G 1 heatmap heatmap_tmp
  for(i=1; i<n_elem-1; i++)
    {
#pragma polca def G
#pragma polca input heatmap[i-1] heatmap[i] heatmap[i+1]
#pragma polca output heatmap_tmp[i]
      heatmap_tmp[i] = heatmap[i] + K * (heatmap[i-1] +  heatmap[i+1] - 2 * heatmap[i]);
    }
 
#pragma polca input heatmap[0] heatmap[n_elem-1]
#pragma polca output heatmap[0] heatmap[n_elem-1]
  {
    heatmap_tmp[0] = heatmap[0];
    heatmap_tmp[n_elem-1] = heatmap[n_elem-1];
  }
 
#pragma polca memcopy heatmap_tmp (sizeof(float)) n_elem heatmap
  memcpy((void*)heatmap, (void*)heatmap_tmp, n_elem * sizeof(float));
}
 
#pragma polca def print
#pragma polca input heatmap
#pragma polca io
void printCells(float *heatmap, int n) {
  int i;
 
  printf("i %05d:", n);
  for(i=0; i<n_elem; i++) {
    printf(" %10.6f", heatmap[i]);
  }
 
  printf("\n");
}
 
int main(void) {
  int i;
  float *heatmap,*heatmap_tmp;
 
  // The function is annotated 
  init(&heatmap, &heatmap_tmp);
 
#pragma polca itn heatspread heatmap N_ITER heatmap
  for(i=0; i<N_ITER; i++)
    {
      // The function is annotated
      heatspread(heatmap, heatmap_tmp);
    }
 
  // The function is annotated
  printCells(heatmap, N_ITER);
 
#pragma polca memFree heatmap
  free(heatmap);
 
#pragma polca memFree heatmap_tmp
  free(heatmap_tmp);
 
  return 0;
}

The heatspread() function calculates the heat value of each element (or “cell”) of the string for the next time step. In the main() function the iterator over the time steps can be found.

Application Structure

The annotations of the HeatFlow code previously shown and in particular usage of the Higher-Order functions allows the POLCA tools to extract the structure of the application as a pseudo-functional program, which can also be represented as an execution graph as shown in the next Figure.

INIT v = MEMOP1 v; STARTVALUES v
     where
       -- reserve memory
       MEMOP1 v = (let v = memAlloc(N_ELEM))  
       -- copy initial values
       STARTVALUES v = (let v = ...)          

HEATSPREADCYCLE v = MEMOP2 (stencil1D 1 G v) v
     where
       MEMOP2 v v' = (let v' = memCopy len(v))
       G x y z = gKernel x y z

In the structure of the application, it can be observed how the Higher Order functions are used to create a hierarchical structure of the application, in this example the function itn uses as a parameter the Higher Order function stencil1D. Having hierarchical annotations allows the transformation of the source code at different levels being able to change the non-functional behavior of the application depending on the compilation or execution parameters.

Edge Detection

The Edge Detection application implements a mathematical method which aims at identifying points in a digital image at which the image brightness changes sharply or, more formally, has discontinuities. This application assumes that all the frames of a video are available at a given time and proceeds to process them all. The main computational code is in the function filterFrame(), that applies the filter, a 2-Dimensional stencil, to all the pixels of the image except those in the borders.

At the start of the application the memory for the raw frames and the processed frames is allocated, then the raw frames are “loaded”, and after the processing the memory is accordingly de-allocated.

Code

The code, already annotated with the required extra information for the POLCA tools using the POLCA programming model can be seen in this section. The application, after allocating the buffers and loading the data process all the frames. As all the frames are available and their processing is independent this can be annotated with a map directive. The operator of the map is the function that performs the edge detection in a single frame, the filterFrame() function, which internally is a stencil1D. After all the frames have been processed the result is printed and the buffers de-allocated.

#define M_SIZE_X 8
#define M_SIZE_Y 8
#define M_SIZE M_SIZE_X * M_SIZE_Y
 
#define N_FRAMES 8
#define N_INTS M_SIZE * N_FRAMES
 
int fCounter = 0;
 
//Generate Input
void initFrames(int *frame) {
  int py;
  int px;
 
  py = (M_SIZE_Y/2) -1;
  px = (fCounter++ % (M_SIZE_X-3)) +1;
 
  frame[( py    * M_SIZE_X) + px]     = 1;
  frame[( py    * M_SIZE_X) + px + 1] = 1;
  frame[((py+1) * M_SIZE_X) + px]     = 1;
  frame[((py+1) * M_SIZE_X) + px + 1] = 1;
}
 
void printFrame(int *frame) {
  int i, j;
 
  for(j=0; j<M_SIZE_Y; j++) {
    for(i=0; i<M_SIZE_X; i++) {
      printf("%02d ", frame[j*M_SIZE_X + i]);
    }
    printf("\n");
  }
}
 
// Apply filter to all elements except the edge:
//  0 -1  0
// -1  4 -1
//  0 -1  0
#pragma polca def filterFrame
#pragma polca input dataFrame
#pragma polca output resultFrame
void filterFrame(int *dataFrame, int *resultFrame) {
  int i, j;
 
#pragma polca def innerLoop
#pragma polca stencil2D Filter 1 1 dataFrame resultFrame
  for(j=1; j<M_SIZE_Y-1; j++) {
    for(i=1; i<M_SIZE_X-1; i++) {
#pragma polca def Filter
#pragma polca input dataFrame[j*M_SIZE_X+i] dataFrame[j*M_SIZE_X+i-1] dataFrame[j*M_SIZE_X+i+1] dataFrame[(j-1)*M_SIZE_X+i] dataFrame[(j+1)*M_SIZE_X+i]
#pragma polca output resultFrame[j*M_SIZE_X+i]
      {
	resultFrame[j * M_SIZE_X + i]  = dataFrame[ j    * M_SIZE_X + i   ] *4;
	resultFrame[j * M_SIZE_X + i] -= dataFrame[ j    * M_SIZE_X + i -1];
	resultFrame[j * M_SIZE_X + i] -= dataFrame[ j    * M_SIZE_X + i +1];
	resultFrame[j * M_SIZE_X + i] -= dataFrame[(j-1) * M_SIZE_X + i   ];
	resultFrame[j * M_SIZE_X + i] -= dataFrame[(j+1) * M_SIZE_X + i   ];
      }
    }
  }
}
 
// Create Frames -> process -> print (original & processed)
int main(void) {
  int i;
  int *rawF;
  int *proF;
 
#pragma polca memcalloc (sizeof(int)) N_INTS rawF
  rawF = calloc(N_INTS, sizeof(int));
 
#pragma polca memcalloc (sizeof(int)) N_INTS proF
  proF = calloc(N_INTS, sizeof(int));
 
  for(i=0; i<N_FRAMES; i++) {
    initFrames(&(rawF[M_SIZE * i]));
  }
 
#pragma polca map filterFrame rawF proF
  for(i=0; i<N_FRAMES; i++) {
    filterFrame(&(rawF[M_SIZE * i]), &(proF[M_SIZE * i]));
  }
 
#pragma polca io
  for(i=0; i<N_FRAMES; i++) {
    printf("================\n");
    printFrame(&(rawF[M_SIZE * i]));
    printf("----------------\n");
    filterFrame(&(rawF[M_SIZE * i]), &(proF[M_SIZE * i]));
    printFrame(&(proF[M_SIZE * i]));
    printf("================\n");
  }
 
#pragma polca memfree rawF
  free(rawF);
 
#pragma polca memfree proF
  free(proF);
 
  return 0;
}

Application Structure

filterFrame fr = stencil2D 1 1 Filter fr

processFrames frs = map filterFrame frs

main frs = fps
     where
       fps = (let fps = processFrames frs)

The application gets a list of unprocessed frames frs and returns a list of processed frames fps. The processing algorithm takes each frame from the list and applies a filter to them. This is, in summary, a functional map because the same function is applied to all frames, and the filter is a 2-Dimensional stencil, which for POLCA is another Higher Order Function. In summary, the operator of the map is another HOF, creating a hierarchical structure.

In this example the memory specific operations (allocation and de-allocation) have been removed to increase simplicity of the structure. The execution structure is graphically represented in the previous picture.

N-Body

The N-Body problem is a simulation of a dynamical system of particles, usually under the influence of physical forces, such as gravity. N-Body simulations are widely used tools in astrophysics for investigating the dynamics of few-body systems like the Earth-Moon-Sun system to understanding the evolution of the large-scale structure of the universe.

The “particles” treated by the simulation may or may not correspond to physical objects which are particulate in nature. For example, an N-body simulation of a star cluster might have a particle per star, so each particle has some physical significance. On the other hand, a simulation of a gas cloud cannot afford to have a particle for each atom or molecule of gas.

N-body simulations are simple in principle, because they merely involve integrating the 6N ordinary differential equations defining the particle motions in Newtonian gravity. In practice, the number N of particles involved is usually very large and the number of particle-particle interactions needing to be computed increases on the order of O(N^2).

Initial code:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "params_2arr.h"
 
typedef struct
{
  float x, y, z;
} Triple;
 
void bodyForce(Triple *pos, Triple *frc, int n)
{
  for (int i = 0; i < n; i++)
    {
      frc[i].x = 0.0f;
      frc[i].y = 0.0f;
      frc[i].z = 0.0f;
 
      for (int j = 0; j < n; j++)
	{
	  float dx = pos[j].x - pos[i].x;
	  float dy = pos[j].y - pos[i].y;
	  float dz = pos[j].z - pos[i].z;
	  float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
	  float invDist = 1.0f / sqrtf(distSqr);
	  float invDist3 = invDist * invDist * invDist;
 
	  frc[i].x += dx * invDist3;
	  frc[i].y += dy * invDist3;
	  frc[i].z += dz * invDist3;
        }
    }
}
 
void velocities(Triple *frc, Triple *vel, float dt, int n)
{
  for (int i = 0; i < n; i++)
    {
      vel[i].x += dt*frc[i].x;
      vel[i].y += dt*frc[i].y;
      vel[i].z += dt*frc[i].z;
    }
}
 
void integrate(Triple *pos, Triple *vel, float dt, int n)
{
  for (int i = 0 ; i < n; i++)
    {
      pos[i].x += vel[i].x*dt;
      pos[i].y += vel[i].y*dt;
      pos[i].z += vel[i].z*dt;
    }
}
 
int main(const int argc, const char** argv)
{ 
  int nBodies,nElems;
  int dataType;
 
  const float dt   = DT;    // time step
  const int nIters = ITERS; // simulation iterations
 
  nBodies = nElems / DIM;
  Triple *pStruct = (Triple*)malloc(nBodies*sizeof(Triple));
  Triple *vStruct = (Triple*)malloc(nBodies*sizeof(Triple));
  Triple *fStruct = (Triple*)malloc(nBodies*sizeof(Triple));
 
  loadData("position_input.data", &pStruct, &nElems);
  loadData("velocities_input.data", &vStruct, &nElems);
 
  for (int iter = 0; iter < nIters; iter++)
    {
      bodyForce(pStruct, fStruct, nBodies); // compute interbody forces
      velocities(fStruct, vStruct, dt, nBodies); // update velocities
      integrate(pStruct, vStruct, dt, nBodies);  // integrate for dt
    }
 
  saveData("position_output.data", &pStruct, &nElems);
  saveData("velocities_output.data", &vStruct, &nElems);
 
  free(pStruct);
  free(vStruct);
  free(fStruct);
 
  return 0;
}

We will start annotating the code in the main() function. First we will annotate the allocation of the memory for the three array data structures. For this we will use the memAlloc directive, the first parameter being the size in bytes of the data structure, followed by the number of elements and then the pointer that refers to the starting address of the array in memory. For all the three memory allocations the data structure is Triple and all have the same amount of elements, nBodies. The last parameter is pStruct, vStruct or fStruct respectively. At the end of the main() function the memory is de-allocated using the free() system call, that is annotated with the memFree POLCA directive.

The computational part of the main() function takes place mainly in the for-loop. This loop follows an itn pattern as it is an iteration (the input is overwritten by the output of the loop for each iteration) and we know the number of iterations at the begging of the loop. As the itn directive takes only one input and output but our data is divided in different arrays we use the zip directive to wrap pStruct and vStruct into a single element. There is no need to declare fStruct as input or output as this data structure is only used as a temporal element within the body of this itn structure. The operator, which is the body of the loop, has been defined as MAINLOOP, which uses zip(pStruct, vStruct) as data input and output. The body of the loop uses three functions internally that are annotated individually:

#pragma polca memAlloc (sizeof(Triple)) nBodies pStruct
  Triple *pStruct = (Triple*)malloc(nBodies*sizeof(Triple));
#pragma polca memAlloc (sizeof(Triple)) nBodies vStruct
  Triple *vStruct = (Triple*)malloc(nBodies*sizeof(Triple));
#pragma polca memAlloc (sizeof(Triple)) nBodies fStruct
  Triple *fStruct = (Triple*)malloc(nBodies*sizeof(Triple));
 
  /* ... */
 
#pragma polca itn MAINLOOP zip(pStruct, vStruct) nIters zip(pStruct, vStruct)
  for (int iter = 0; iter < nIters; iter++)
#pragma polca def MAINLOOP
#pragma polca inout zip(pStruct, vStruct)
    {
      bodyForce(pStruct, fStruct, nBodies); // compute interbody forces
      velocities(fStruct, vStruct, dt, nBodies); // update velocities
      integrate(pStruct, vStruct, dt, nBodies);  // integrate for dt
    }
 
      /* ... */
 
#pragma polca memFree pStruct
  free(pStruct);
#pragma polca memFree vStruct
  free(vStruct);
#pragma polca memFree fStruct
  free(fStruct);

Then we continue with the annotation of the individual functions that form the body of the for-loop of the main() function.

First the bodyForce() function takes the array pos as input, make a certain operation with each individual element of the and stores the result in the same position of the output array frc. This execution pattern can be annotated with the map directive with the operation for each element defined as CALCFORCE.

The CALCFORCE operator receives the whole array pos as input and has frc[i] as output, internally it uses the whole array pos, even if this is the same element for all the executions of CALCFORCE it should be annotated as it is used by ADDFORCE. The loop inside CALCFORCE matches to a foldl execution pattern that takes takes zip(frc[i], pos[i]) and input and output (actually pos[i] is not output but has to be zipped with frc[i] as part of the single element input) and pos as the input array to reduce over. The operator of this foldl is the operator ADDFORCE. That takes frc[i] and pos[i] as input and output data zipped into a single element (to match the definition of the foldl directive) and pos[j] as input:

#pragma polca map CALCFORCE pos frc
void bodyForce(Triple *pos, Triple *frc, int n)
{
  for (int i = 0; i < n; i++)
#pragma polca def CALCFORCE
#pragma polca input pos
#pragma polca output frc[i]
    {
      frc[i].x = 0.0f;
      frc[i].y = 0.0f;
      frc[i].z = 0.0f;
 
#pragma polca foldl ADDFORCE zip(frc[i], pos[i]) pos zip(frc[i], pos[i])
      for (int j = 0; j < n; j++)
#pragma polca def ADDFORCE
#pragma polca input pos[j]
#pragma polca inout zip(frc[i], pos[i])
	{
	  float dx = pos[j].x - pos[i].x;
	  float dy = pos[j].y - pos[i].y;
	  float dz = pos[j].z - pos[i].z;
	  float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
	  float invDist = 1.0f / sqrtf(distSqr);
	  float invDist3 = invDist * invDist * invDist;
 
	  frc[i].x += dx * invDist3;
	  frc[i].y += dy * invDist3;
	  frc[i].z += dz * invDist3;
        }
    }
}

The velocities() function takes two arrays, makes an operation with the elements that are in the same position and stores the result in the same position of another array. For this reason it matches the execution pattern of a zipWith directive, where the input arrays are frc and vel and the output array is vel. The operator of the zipWith has been defined as UPDV and takes single elements of both arrays (vel[i] and frc[i]) as input and vel[i] as output. The other elements of the function, dt and n have not been declared as input or output because they behave as constants, their value never changes during the execution of the application:

#pragma polca ZipWith UPDV frc vel vel
void velocities(Triple *frc, Triple *vel, float dt, int n)
{
  for (int i = 0; i < n; i++)
#pragma polca def UPDV
#pragma polca input vel[i] frc[i]
#pragma polca output vel[i]
    {
      vel[i].x += dt*frc[i].x;
      vel[i].y += dt*frc[i].y;
      vel[i].z += dt*frc[i].z;
    }
}

The integrate() function takes two arrays, makes an operation with the elements that are in the same position and stores the result in the same position of another array. For this reason it matches the execution pattern of a zipWith directive, where the input arrays are pos and vel and the output array is pos. The operator of the zipWith has been defined as UPDP and takes single elements of both arrays (pos[i] and vel[i]) as input and pos[i] as output. The other elements of the function, dt and n have not been declared as input or output because they behave as constants, their value never changes during the execution of the application:

#pragma polca zipWith UPDP pos vel pos
void integrate(Triple *pos, Triple *vel, float dt, int n)
{
  for (int i = 0 ; i < n; i++)
#pragma polca def UPDP
#pragma polca input pos[i] vel[i]
#pragma polca output pos[i]
    {
      pos[i].x += vel[i].x*dt;
      pos[i].y += vel[i].y*dt;
      pos[i].z += vel[i].z*dt;
    }
}

The final annotated code:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "params_2arr.h"
 
typedef struct
{
  float x, y, z;
} Triple;
 
#pragma polca map CALCFORCE pos frc
void bodyForce(Triple *pos, Triple *frc, int n)
{
  for (int i = 0; i < n; i++)
#pragma polca def CALCFORCE
#pragma polca input pos
#pragma polca output frc[i]
    {
      frc[i].x = 0.0f;
      frc[i].y = 0.0f;
      frc[i].z = 0.0f;
 
#pragma polca foldl ADDFORCE zip(frc[i], pos[i]) pos zip(frc[i], pos[i])
      for (int j = 0; j < n; j++)
#pragma polca def ADDFORCE
#pragma polca input pos[j]
#pragma polca inout zip(frc[i], pos[i])
	{
	  float dx = pos[j].x - pos[i].x;
	  float dy = pos[j].y - pos[i].y;
	  float dz = pos[j].z - pos[i].z;
	  float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
	  float invDist = 1.0f / sqrtf(distSqr);
	  float invDist3 = invDist * invDist * invDist;
 
	  frc[i].x += dx * invDist3;
	  frc[i].y += dy * invDist3;
	  frc[i].z += dz * invDist3;
        }
    }
}
 
#pragma polca ZipWith UPDV frc vel vel
void velocities(Triple *frc, Triple *vel, float dt, int n)
{
  for (int i = 0; i < n; i++)
#pragma polca def UPDV
#pragma polca input vel[i] frc[i]
#pragma polca output vel[i]
    {
      vel[i].x += dt*frc[i].x;
      vel[i].y += dt*frc[i].y;
      vel[i].z += dt*frc[i].z;
    }
}
 
#pragma polca ZipWith UPDP pos vel pos
void integrate(Triple *pos, Triple *vel, float dt, int n)
{
  for (int i = 0 ; i < n; i++)
#pragma polca def UPDP
#pragma polca input pos[i] vel[i]
#pragma polca output pos[i]
    {
      pos[i].x += vel[i].x*dt;
      pos[i].y += vel[i].y*dt;
      pos[i].z += vel[i].z*dt;
    }
}
 
int main(const int argc, const char** argv)
{ 
  int nBodies,nElems;
  int dataType;
 
  const float dt   = DT;    // time step
  const int nIters = ITERS; // simulation iterations
 
  nBodies = nElems / DIM;
 
#pragma polca memAlloc (sizeof(Triple)) nBodies pStruct
  Triple *pStruct = (Triple*)malloc(nBodies*sizeof(Triple));
#pragma polca memAlloc (sizeof(Triple)) nBodies vStruct
  Triple *vStruct = (Triple*)malloc(nBodies*sizeof(Triple));
#pragma polca memAlloc (sizeof(Triple)) nBodies fStruct
  Triple *fStruct = (Triple*)malloc(nBodies*sizeof(Triple));
 
  loadData("position_input.data", &pStruct, &nElems);
  loadData("velocities_input.data", &vStruct, &nElems);
 
#pragma polca itn MAINLOOP zip(pStruct, vStruct) nIters zip(pStruct, vStruct)
  for (int iter = 0; iter < nIters; iter++)
#pragma polca def MAINLOOP
#pragma polca inout zip(pStruct, vStruct)
    {
      bodyForce(pStruct, fStruct, nBodies); // compute interbody forces
      velocities(fStruct, vStruct, dt, nBodies); // update velocities
      integrate(pStruct, vStruct, dt, nBodies);  // integrate for dt
    }
 
  saveData("position_output.data", &pStruct, &nElems);
  saveData("velocities_output.data", &vStruct, &nElems);
 
#pragma polca memFree pStruct
  free(pStruct);
#pragma polca memFree vStruct
  free(vStruct);
#pragma polca memFree fStruct
  free(fStruct);
 
  return 0;
}
code_examples.txt · Last modified: 2016/12/25 12:45 by daniel