diff -Nurd pfstmo-1.1/src/fattal02/pde.cpp pfstmo-1.1-ejb/src/fattal02/pde.cpp
--- pfstmo-1.1/src/fattal02/pde.cpp	2007-06-14 16:27:45.000000000 +0100
+++ pfstmo-1.1-ejb/src/fattal02/pde.cpp	2008-01-03 22:39:09.000000000 +0000
@@ -59,9 +59,9 @@
 #define V_CYCLE 2 /* number of v-cycles  */
 
 // precision
-#define EPS 1.0e-12
+#define EPS 1.0e-8
 
-void linbcg(unsigned long n, float b[], float x[], int itol, float tol,
+void linbcg(unsigned int n, float b[], float x[], int itol, float tol,
   int itmax, int *iter, float *err);
 
 inline float max( float a, float b )
@@ -111,13 +111,13 @@
   const float dx = (float)in->getCols() / (float)out->getCols();
   const float dy = (float)in->getRows() / (float)out->getRows();
 
-  const float filterSize = 0.5;
+  const float filterSize = 0.5f;
   
   float sx, sy;
   int x, y;
   
-  for( y = 0, sy = dy/2-0.5; y < outRows; y++, sy += dy )
-    for( x = 0, sx = dx/2-0.5; x < outCols; x++, sx += dx ) {
+  for( y = 0, sy = dy/2-0.5f; y < outRows; y++, sy += dy )
+    for( x = 0, sx = dx/2-0.5f; x < outCols; x++, sx += dx ) {
 
       float pixVal = 0;
       float w = 0;
@@ -191,8 +191,8 @@
       
       for( float ix = max( 0, ceilf( sx-filterSize ) ); ix <= min( floorf(sx+filterSize), inCols-1 ); ix++ )
         for( float iy = max( 0, ceilf( sy-filterSize ) ); iy <= min( floorf( sy+filterSize), inRows-1 ); iy++ ) {
-          float fx = fabs( sx - ix );
-          float fy = fabs( sy - iy );
+          float fx = fabsf( sx - ix );
+          float fy = fabsf( sy - iy );
 
           const float fval = (1-fx)*(1-fy);
           
@@ -276,7 +276,7 @@
   int sy = F->getRows();
   int x,y;
 
-  float h = 1.0/sqrt(sx*sy*1.0f);
+  float h = 1.0/sqrtf(sx*sy*1.0f);
   float h2 = h*h;
 
     setArray( U, 0.0f); return;   /* also works well?? */
@@ -334,7 +334,7 @@
 //   int x,y,i;
 //   int shx;	shift x
   
-//   float h = 1.0f/sqrt(sx*sy*1.0f);
+//   float h = 1.0f/sqrtf(sx*sy*1.0f);
 //   float h2 = h*h;
 
 //   h2 = 1;
@@ -367,7 +367,7 @@
   int sx = F->getCols();
   int sy = F->getRows();
 
-  float h = 1.0f/sqrt(sx*sy*1.0f);
+  float h = 1.0f/sqrtf(sx*sy*1.0f);
   float h2i = 1.0/(h*h);
 
   h2i = 1;
@@ -406,7 +406,7 @@
   int i;	// index for simple loops
   int k;	// index for iterating through levels
   int k2;	// index for iterating through levels in V-cycles
-  int x,y;
+//   int x,y;
 
   // 1. restrict f to coarse-grid (by the way count the number of levels)
   //	  k=0: fine-grid = f
@@ -434,7 +434,7 @@
 
   int sx=xmax;
   int sy=ymax;
-  DEBUG_STR << "FMG: #0 size " << sx << "x" << sy << endl;
+//   DEBUG_STR << "FMG: #0 size " << sx << "x" << sy << endl;
   for( k=0 ; k<levels ; k++ )
   {
     // calculate size of next level
@@ -448,7 +448,7 @@
     // restrict from level k to level k+1 (coarser-grid)
     restrict( RHS[k], RHS[k+1] );
 
-    DEBUG_STR << "FMG: #" << k+1 << " size " << sx << "x" << sy << endl;
+//     DEBUG_STR << "FMG: #" << k+1 << " size " << sx << "x" << sy << endl;
   }
 
   // 2. find exact sollution at the coarsest-grid (k=levels)
@@ -549,7 +549,7 @@
   delete IU;
   delete VF;
 
-  DEBUG_STR << "FMG: solved\n";
+//   DEBUG_STR << "FMG: solved\n";
 }
 
 
@@ -561,7 +561,7 @@
 
 void solve_pde_sor( pfs::Array2D *F, pfs::Array2D *U, int maxits)
 {
-  DEBUG_STR << "sor" << endl;
+//   DEBUG_STR << "sor" << endl;
 
   int xmax = F->getCols();
   int ymax = F->getRows();
@@ -574,7 +574,7 @@
 //		 norm has been reduced by a factor EPS.
   for (j = 0; j < xmax; j++)
     for (l = 0; l < ymax; l++) {
-      anormf += fabs( (*F)(j,l) );
+      anormf += fabsf( (*F)(j,l) );
       (*U)(j,l)=0.0f;
     }
 
@@ -600,7 +600,7 @@
           
 	  resid = (*U)(jp1,l) + (*U)(jm1,l) + (*U)(j,lp1) + (*U)(j,lm1)
             - 4.0* (*U)(j,l) - (*F)(j,l);
-	  anorm += fabs(resid);
+	  anorm += fabsf(resid);
 	  (*U)(j,l) -= omega * resid / -4.0;
 	}
 	lsw = 1 - lsw;
@@ -610,20 +610,20 @@
         : 1.0 / (1.0 - 0.25 * rjac * rjac * omega));
     }
     if( !(n%100) || n==1)
-      DEBUG_STR << "SOR:> " << n << "\tAnorm: " << anorm << "\n";
+//       DEBUG_STR << "SOR:> " << n << "\tAnorm: " << anorm << "\n";
     if (anorm < EPS * anormf ) {
-      DEBUG_STR << "SOR:> solved.\n";
+//       DEBUG_STR << "SOR:> solved.\n";
       return;
     }
   }
-  DEBUG_STR << "SOR:> MAXITS exceeded\n";
+//   DEBUG_STR << "SOR:> MAXITS exceeded\n";
 }
 
 
 
 //#define EPS 1.0e-14
 
-void asolve(unsigned long n, float b[], float x[], int itrnsp)
+void asolve(unsigned int /*n*/, float b[], float x[], int/* itrnsp*/)
 {
     for( int r = 0; r < rows; r++ )
       for( int c = 0; c < cols; c++ ) {
@@ -631,7 +631,7 @@
       }
 }
 
-void atimes(unsigned long n, float x[], float res[], int itrnsp)
+void atimes(unsigned int /*n*/, float x[], float res[], int /*itrnsp*/)
 {
   for( int r = 1; r < rows-1; r++ )
     for( int c = 1; c < cols-1; c++ ) {
@@ -659,21 +659,21 @@
     - 2*x[idx(rows-1,cols-1)];  
 }
 
-float snrm(unsigned long n, float sx[], int itol)
+float snrm(unsigned int n, float sx[], int itol)
 {
-	unsigned long i,isamax;
+	unsigned int i,isamax;
 	float ans;
 
 	if (itol <= 3) {
 		ans = 0.0;
 		for (i=1;i<=n;i++) ans += sx[i]*sx[i];
-		return sqrt(ans);
+		return sqrtf(ans);
 	} else {
 		isamax=1;
 		for (i=1;i<=n;i++) {
-			if (fabs(sx[i]) > fabs(sx[isamax])) isamax=i;
+			if (fabsf(sx[i]) > fabsf(sx[isamax])) isamax=i;
 		}
-		return fabs(sx[isamax]);
+		return fabsf(sx[isamax]);
 	}
 }
 
@@ -681,9 +681,9 @@
  * Biconjugate Gradient Method
  * from Numerical Recipes in C
  */
-void linbcg(unsigned long n, float b[], float x[], int itol, float tol,	int itmax, int *iter, float *err)
+void linbcg(unsigned int n, float b[], float x[], int itol, float tol,	int itmax, int *iter, float *err)
 {	
-	unsigned long j;
+	unsigned int j;
 	float ak,akden,bk,bkden,bknum,bnrm,dxnrm,xnrm,zm1nrm,znrm;
 	float *p,*pp,*r,*rr,*z,*zz;
 
@@ -749,9 +749,9 @@
 			*err=snrm(n,r,itol)/bnrm;
 		} else if (itol == 3 || itol == 4) {
 			znrm=snrm(n,z,itol);
-			if (fabs(zm1nrm-znrm) > EPS*znrm) {
-				dxnrm=fabs(ak)*snrm(n,p,itol);
-				*err=znrm/fabs(zm1nrm-znrm)*dxnrm;
+			if (fabsf(zm1nrm-znrm) > EPS*znrm) {
+				dxnrm=fabsf(ak)*snrm(n,p,itol);
+				*err=znrm/fabsf(zm1nrm-znrm)*dxnrm;
 			} else {
 				*err=znrm/bnrm;
 				continue;
diff -Nurd pfstmo-1.1/src/fattal02/tmo_fattal02.cpp pfstmo-1.1-ejb/src/fattal02/tmo_fattal02.cpp
--- pfstmo-1.1/src/fattal02/tmo_fattal02.cpp	2007-06-14 16:27:46.000000000 +0100
+++ pfstmo-1.1-ejb/src/fattal02/tmo_fattal02.cpp	2008-01-03 22:39:09.000000000 +0000
@@ -161,7 +161,7 @@
 {
   int width = H->getCols();
   int height = H->getRows();
-  float divider = pow( 2.0f, k+1 );
+  float divider = powf( 2.0f, k+1 );
   float avgGrad = 0.0f;
 
   for( int y=0 ; y<height ; y++ )
@@ -179,7 +179,7 @@
         
       gy = ((*H)(x,s)-(*H)(x,n)) / divider;
       
-      (*G)(x,y) = sqrt(gx*gx+gy*gy);
+      (*G)(x,y) = sqrtf(gx*gx+gy*gy);
       avgGrad += (*G)(x,y);
     }
   }
@@ -246,9 +246,9 @@
         float grad = (*gradients[k])(x,y);
         float a = alfa * avgGrad[k];
 
-        float value=1.0;
-        if( grad>1e-4 )
-          value = a/(grad+noise) * pow((grad+noise)/a, beta);
+        float value=1.0f;
+        if( grad>1e-4f )
+          value = powf((grad+noise)/a, beta - 1.0f);
         (*fi[k])(x,y) *= value;
       }
 		
@@ -314,7 +314,7 @@
   }
   pfs::Array2D* H = new pfs::Array2DImpl(width, height);
   for( i=0 ; i<size ; i++ )
-    (*H)(i) = log( 100.0f*(*Y)(i)/maxLum + 1e-4 );
+    (*H)(i) = logf( 100.0f*(*Y)(i)/maxLum + 1e-4f );
 
   DEBUG_STR << "tmo_fattal02: calculating attenuation matrix" << endl;
   
@@ -384,7 +384,7 @@
 
   for( y=0 ; y<height ; y++ )
     for( x=0 ; x<width ; x++ )
-      (*L)(x,y) = exp( (*U)(x,y) ) - 1e-4;
+      (*L)(x,y) = expf( (*U)(x,y) ) - 1e-4f;
 	
   // remove percentile of min and max values and renormalize
   findMaxMinPercentile(L, 0.001f, minLum, 0.995f, maxLum);
@@ -394,7 +394,7 @@
     {
       (*L)(x,y) = ((*L)(x,y)-minLum) / maxLum;
       if( (*L)(x,y)<=0.0f )
-        (*L)(x,y) = 1e-4;
+        (*L)(x,y) = 1e-4f;
     }
 
   // clean up
diff -Nurd pfstmo-1.1/src/mantiuk06/contrast_domain.cpp pfstmo-1.1-ejb/src/mantiuk06/contrast_domain.cpp
--- pfstmo-1.1/src/mantiuk06/contrast_domain.cpp	2007-06-20 11:50:07.000000000 +0100
+++ pfstmo-1.1-ejb/src/mantiuk06/contrast_domain.cpp	2008-01-03 22:39:53.000000000 +0000
@@ -28,8 +28,8 @@
  * ---------------------------------------------------------------------- 
  * 
  * @author Radoslaw Mantiuk, <radoslaw.mantiuk@gmail.com>
+ * Updated 2007/12/17 by Ed Brambley <E.J.Brambley@damtp.cam.ac.uk>
  *
- * $Id: contrast_domain.cpp,v 1.5 2007/06/16 19:23:08 rafm Exp $
  */
 
 
@@ -40,99 +40,67 @@
 
 #include "contrast_domain.h"
 
-typedef struct {
-  int rows, cols;
+typedef struct pyramid_s {
+  int rows;
+  int cols;
   float* Gx;
   float* Gy;
-  float* Cx;
-  float* Cy;
-  float* divG;
-  float* M;  // Gradient modules, needed only for contrast equalization
-}
-PYRAMID_LEVEL;
-
-struct PYRAMID{
-  PYRAMID_LEVEL* level;
-  PYRAMID* next;
-};
-
+  struct pyramid_s* next;
+  struct pyramid_s* prev;
+} pyramid_t;
 
-extern float W_table[];
-extern float R_table[];
-extern float xyz2rgbD65Mat[3][3];
-extern float rgb2xyzD65Mat[3][3];
 
 #define PYRAMID_MIN_PIXELS 3
 #define LOOKUP_W_TO_R 107
 
-class ContrastDomain {
-
-  int alloc_mem;
-  progress_callback progress_cb;
-  
-public:
-
-  ContrastDomain( ) {
-    alloc_mem = 0;
-  }
-
-  void tone_mapping(int, int, float* R, float* G, float* B, float* Y, float contrastFactor, float saturationFactor, progress_callback );
-  void contrast_equalization( PYRAMID *pp );
 
-  float* transform_to_luminance(PYRAMID* pyramid);
-  void matrix_upsample(int rows, int cols, float* data, float* res);
-  void matrix_downsample(int rows, int cols, float* data, float* res);
-  float* matrix_add(int n, float* a, float* b);
-  float* matrix_substract(int n, float* a, float* b);
-  float* matrix_copy(int n, float* a, float* b);
-  void matrix_multiply_const(int n, float* a, float val);
-  void matrix_divide(int n, float* a, float* b);
-  float* matrix_alloc(int size);
-  void matrix_free(float* m);
-  float matrix_DotProduct(int n, float* a, float* b);
-  void matrix_zero(int n, float* m);
-  void calculate_divergence(int rows, int cols, float* Gx, float* Gy, float* divG);
-  void pyramid_calculate_divergence(PYRAMID* pyramid);
-  void pyramid_calculate_divergence_sum_in(PYRAMID* pyramid, float* divG_sum);
-  void pyramid_calculate_divergence_sum(PYRAMID* pyramid, float* divG_sum);
-  float* calculate_scale_factor(int n, float* G);
-  void pyramid_calculate_scale_factor(PYRAMID* pyramid);
-  void scale_gradient(int n, float* G, float* C);
-  void pyramid_scale_gradient(PYRAMID* pyramid);
-  void pyramid_scale_gradient(PYRAMID* pyramid, PYRAMID* pC);
-  void pyramid_free(PYRAMID* pyramid);
-  PYRAMID* pyramid_allocate(int cols, int rows);
-  void calculate_gradient(int cols, int rows, float* lum, float* Gx, float* Gy);
-  void pyramid_calculate_gradient(PYRAMID* pyramid, float* lum);
-  //PYRAMID* pyramid_create(int lrows, int lcols, float* llum);
-  void solveX(int n, float* b, float* x);
-  void multiplyA(PYRAMID* px, PYRAMID* pyramid, float* x, float* divG_sum);
-  float* linbcg(PYRAMID* pyramid, float* b);
-  void matrix_log10(int n, float* m);
-  float lookup_table(int n, float* in_tab, float* out_tab, float val);
-  void transform_to_R(int n, float* G);
-  void pyramid_transform_to_R(PYRAMID* pyramid);
-  void transform_to_G(int n, float* R);
-  void pyramid_transform_to_G(PYRAMID* pyramid);
-  float matrix_median(int n, float* m);
-  void pyramid_gradient_multiply(PYRAMID* pyramid, float val);
-
-  void matrix_show(char* text, int rows, int cols, float* data);
-  void pyramid_show(PYRAMID* pyramid);
+static void contrast_equalization( pyramid_t *pp, const float contrastFactor );
 
-};
+static void transform_to_luminance(pyramid_t* pyramid, float* const x, progress_callback progress_cb, const bool bcg);
+static void matrix_add(const int n, const float* const a, float* const b);
+static void matrix_subtract(const int n, const float* const a, float* const b);
+static void matrix_copy(const int n, const float* const a, float* const b);
+static void matrix_multiply_const(const int n, float* const a, const float val);
+static void matrix_divide(const int n, const float* const a, float* const b);
+static float* matrix_alloc(const int size);
+static void matrix_free(float* m);
+static float matrix_DotProduct(const int n, const float* const a, const float* const b);
+static void matrix_zero(const int n, float* const m);
+static void calculate_and_add_divergence(const int rows, const int cols, const float* const Gx, const float* const Gy, float* const divG);
+static void pyramid_calculate_divergence(pyramid_t* pyramid);
+static void pyramid_calculate_divergence_sum(pyramid_t* pyramid, float* divG_sum);
+static void calculate_scale_factor(const int n, const float* const G, float* const C);
+static void pyramid_calculate_scale_factor(pyramid_t* pyramid, pyramid_t* pC);
+static void scale_gradient(const int n, float* const G, const float* const C);
+static void pyramid_scale_gradient(pyramid_t* pyramid, pyramid_t* pC);
+static void pyramid_free(pyramid_t* pyramid);
+static pyramid_t* pyramid_allocate(const int cols, const int rows);
+static void calculate_gradient(const int cols, const int rows, const float* const lum, float* const Gx, float* const Gy);
+static void pyramid_calculate_gradient(pyramid_t* pyramid, float* lum);
+static void solveX(const int n, const float* const b, float* const x);
+static void multiplyA(pyramid_t* px, pyramid_t* pyramid, const float* const x, float* const divG_sum);
+static void linbcg(pyramid_t* pyramid, pyramid_t* pC, const float* const b, float* const x, const int itmax, const float tol, progress_callback progress_cb);
+static void lincg(pyramid_t* pyramid, pyramid_t* pC, const float* const b, float* const x, const int itmax, const float tol, progress_callback progress_cb);
+static float lookup_table(const int n, const float* const in_tab, const float* const out_tab, const float val);
+static void transform_to_R(const int n, float* const G);
+static void pyramid_transform_to_R(pyramid_t* pyramid);
+static void transform_to_G(const int n, float* const R);
+static void pyramid_transform_to_G(pyramid_t* pyramid);
+static void pyramid_gradient_multiply(pyramid_t* pyramid, const float val);
 
+static void dump_matrix_to_file(const int width, const int height, const float* const m, const char * const file_name);
+static void matrix_show(const char* const text, int rows, int cols, const float* const data);
+static void pyramid_show(pyramid_t* pyramid);
 
-//#define DEBUG
 
-float W_table[] = {0.000000,0.010000,0.021180,0.031830,0.042628,0.053819,0.065556,0.077960,0.091140,0.105203,0.120255,0.136410,0.153788,0.172518,0.192739,0.214605,0.238282,0.263952,0.291817,0.322099,0.355040,0.390911,0.430009,0.472663,0.519238,0.570138,0.625811,0.686754,0.753519,0.826720,0.907041,0.995242,1.092169,1.198767,1.316090,1.445315,1.587756,1.744884,1.918345,2.109983,2.321863,2.556306,2.815914,3.103613,3.422694,3.776862,4.170291,4.607686,5.094361,5.636316,6.240338,6.914106,7.666321,8.506849,9.446889,10.499164,11.678143,13.000302,14.484414,16.151900,18.027221,20.138345,22.517282,25.200713,28.230715,31.655611,35.530967,39.920749,44.898685,50.549857,56.972578,64.280589,72.605654,82.100619,92.943020,105.339358,119.530154,135.795960,154.464484,175.919088,200.608905,229.060934,261.894494,299.838552,343.752526,394.651294,453.735325,522.427053,602.414859,695.706358,804.693100,932.229271,1081.727632,1257.276717,1463.784297,1707.153398,1994.498731,2334.413424,2737.298517,3215.770944,3785.169959,4464.187290,5275.653272,6247.520102,7414.094945,8817.590551,10510.080619};
-float R_table[] = {0.000000,0.009434,0.018868,0.028302,0.037736,0.047170,0.056604,0.066038,0.075472,0.084906,0.094340,0.103774,0.113208,0.122642,0.132075,0.141509,0.150943,0.160377,0.169811,0.179245,0.188679,0.198113,0.207547,0.216981,0.226415,0.235849,0.245283,0.254717,0.264151,0.273585,0.283019,0.292453,0.301887,0.311321,0.320755,0.330189,0.339623,0.349057,0.358491,0.367925,0.377358,0.386792,0.396226,0.405660,0.415094,0.424528,0.433962,0.443396,0.452830,0.462264,0.471698,0.481132,0.490566,0.500000,0.509434,0.518868,0.528302,0.537736,0.547170,0.556604,0.566038,0.575472,0.584906,0.594340,0.603774,0.613208,0.622642,0.632075,0.641509,0.650943,0.660377,0.669811,0.679245,0.688679,0.698113,0.707547,0.716981,0.726415,0.735849,0.745283,0.754717,0.764151,0.773585,0.783019,0.792453,0.801887,0.811321,0.820755,0.830189,0.839623,0.849057,0.858491,0.867925,0.877358,0.886792,0.896226,0.905660,0.915094,0.924528,0.933962,0.943396,0.952830,0.962264,0.971698,0.981132,0.990566,1.000000};
+static float W_table[] = {0.000000,0.010000,0.021180,0.031830,0.042628,0.053819,0.065556,0.077960,0.091140,0.105203,0.120255,0.136410,0.153788,0.172518,0.192739,0.214605,0.238282,0.263952,0.291817,0.322099,0.355040,0.390911,0.430009,0.472663,0.519238,0.570138,0.625811,0.686754,0.753519,0.826720,0.907041,0.995242,1.092169,1.198767,1.316090,1.445315,1.587756,1.744884,1.918345,2.109983,2.321863,2.556306,2.815914,3.103613,3.422694,3.776862,4.170291,4.607686,5.094361,5.636316,6.240338,6.914106,7.666321,8.506849,9.446889,10.499164,11.678143,13.000302,14.484414,16.151900,18.027221,20.138345,22.517282,25.200713,28.230715,31.655611,35.530967,39.920749,44.898685,50.549857,56.972578,64.280589,72.605654,82.100619,92.943020,105.339358,119.530154,135.795960,154.464484,175.919088,200.608905,229.060934,261.894494,299.838552,343.752526,394.651294,453.735325,522.427053,602.414859,695.706358,804.693100,932.229271,1081.727632,1257.276717,1463.784297,1707.153398,1994.498731,2334.413424,2737.298517,3215.770944,3785.169959,4464.187290,5275.653272,6247.520102,7414.094945,8817.590551,10510.080619};
+static float R_table[] = {0.000000,0.009434,0.018868,0.028302,0.037736,0.047170,0.056604,0.066038,0.075472,0.084906,0.094340,0.103774,0.113208,0.122642,0.132075,0.141509,0.150943,0.160377,0.169811,0.179245,0.188679,0.198113,0.207547,0.216981,0.226415,0.235849,0.245283,0.254717,0.264151,0.273585,0.283019,0.292453,0.301887,0.311321,0.320755,0.330189,0.339623,0.349057,0.358491,0.367925,0.377358,0.386792,0.396226,0.405660,0.415094,0.424528,0.433962,0.443396,0.452830,0.462264,0.471698,0.481132,0.490566,0.500000,0.509434,0.518868,0.528302,0.537736,0.547170,0.556604,0.566038,0.575472,0.584906,0.594340,0.603774,0.613208,0.622642,0.632075,0.641509,0.650943,0.660377,0.669811,0.679245,0.688679,0.698113,0.707547,0.716981,0.726415,0.735849,0.745283,0.754717,0.764151,0.773585,0.783019,0.792453,0.801887,0.811321,0.820755,0.830189,0.839623,0.849057,0.858491,0.867925,0.877358,0.886792,0.896226,0.905660,0.915094,0.924528,0.933962,0.943396,0.952830,0.962264,0.971698,0.981132,0.990566,1.000000};
 
 
 // display matrix in the console (debugging)
-void ContrastDomain::matrix_show(char* text, int cols, int rows, float* data){
-
-  int _cols = cols;
+static void matrix_show(const char* const text, int cols, int rows, const float* const data)
+{
+  const int _cols = cols;
 
   if(rows > 8)
     rows = 8;
@@ -148,430 +116,553 @@
   }
 }
 
-// display pyramid in the console (debugging)
-void ContrastDomain::pyramid_show(PYRAMID* pyramid){
 
-  if(pyramid->next != NULL)	
-    pyramid_show((PYRAMID*)pyramid->next);
-	
-  PYRAMID_LEVEL* pl = pyramid->level;
+// display pyramid in the console (debugging)
+static void pyramid_show(pyramid_t* pyramid)
+{
   char ss[30];
+
+  while (pyramid->next != NULL)
+    pyramid = pyramid->next;
+
+  while (pyramid != NULL)
+    {
+      printf("\n----- pyramid_t level %d,%d\n", pyramid->cols, pyramid->rows);
 	
-  printf("\n----- PYRAMID level %d,%d\n", pl->cols, pl->rows);
-	
-  sprintf(ss, "Gx %p ", pl->Gx);
-  if(pl->Gx != NULL)
-    matrix_show(ss,pl->cols, pl->rows, pl->Gx);
-  sprintf(ss, "Gy %p ", pl->Gy);	
-  if(pl->Gy != NULL)
-    matrix_show(ss,pl->cols, pl->rows, pl->Gy);
-  sprintf(ss, "Cx %p ", pl->Cx);	
-  if(pl->Cx != NULL)
-    matrix_show(ss,pl->cols, pl->rows, pl->Cx);
-  sprintf(ss, "Cy %p ", pl->Cy);	
-  if(pl->Cy != NULL)	
-    matrix_show(ss,pl->cols, pl->rows, pl->Cy);
-  if(pl->divG != NULL)	
-    matrix_show("divG",pl->cols, pl->rows, pl->divG);	
+      sprintf(ss, "Gx %p ", pyramid->Gx);
+      if(pyramid->Gx != NULL)
+	matrix_show(ss,pyramid->cols, pyramid->rows, pyramid->Gx);
+      sprintf(ss, "Gy %p ", pyramid->Gy);	
+      if(pyramid->Gy != NULL)
+	matrix_show(ss,pyramid->cols, pyramid->rows, pyramid->Gy);
 
-  return;
+      pyramid = pyramid->prev;
+    }
 }
 
 
 
-inline float max( float a, float b )
+static inline float max( float a, float b )
 {
   return a > b ? a : b;
 }
 
-inline float min( float a, float b )
+static inline float min( float a, float b )
 {
   return a < b ? a : b;
 }
 
+static void (*matrix_upsample)(int, int, const float*, float*) = NULL;
+static void (*matrix_downsample)(int, int, const float*, float*) = NULL;
+
 // upsample the matrix
 // upsampled matrix is twice bigger in each direction than data[]
 // res should be a pointer to allocated memory for bigger matrix
 // nearest neighborhood method - should be changed!
-void ContrastDomain::matrix_upsample(int cols, int rows, float* in, float* out){
+// cols and rows are the dimmensions of the output matrix
+static void matrix_upsample_mantiuk(const int outCols, const int outRows, const float* const in, float* const out)
+{
+  const int inRows = outRows/2;
+  const int inCols = outCols/2;
 
-  float dx = (float)cols / (float)(cols*2);
-  float dy = (float)rows / (float)(rows*2);
+  // Original
 
-  const int outRows = rows * 2;
-  const int outCols = cols * 2;
+  const float dx = (float)inCols / ((float)outCols);
+  const float dy = (float)inRows / ((float)outRows);
+  const float filterSize = 1;
 
-  const float inRows = rows;
-  const float inCols = cols;
+#pragma omp parallel for  
+  for(int y = 0; y < outRows; y++)
+    {
+      const float sy = (y+0.5f)*dy - 0.5f;
 
-  const float filterSize = 1;
-  
-  float sx, sy;
-  int x, y;
-  for( y = 0, sy = dy/2-0.5; y < outRows; y++, sy += dy )
-    for( x = 0, sx = dx/2-0.5; x < outCols; x++, sx += dx ) {
+      for(int x = 0; x < outCols; x++)
+	{
+	  const float sx = (x+0.5f)*dx - 0.5f;
 
-      float pixVal = 0;
-      float weight = 0;
-      
-      for( float ix = max( 0, ceilf( sx-filterSize ) ); 
-           ix <= min( floorf(sx+filterSize), inCols-1 ); ix++ )
-        for( float iy = max( 0, ceilf( sy-filterSize ) ); 
-             iy <= min( floorf( sy+filterSize), inRows-1 ); iy++ ) {
-		
-          float fx = fabs( sx - ix );
-          float fy = fabs( sy - iy );
+	  float pixVal = 0;
+	  float weight = 0;
+	  
+	  for( float ix = max( 0, ceilf( sx-filterSize ) ); 
+	       ix <= min( floorf(sx+filterSize), inCols-1 ); ix++ )
+	    for( float iy = max( 0, ceilf( sy-filterSize ) ); 
+		 iy <= min( floorf( sy+filterSize), inRows-1 ); iy++ ) {
+	      
+	      float fx = fabsf( sx - ix );
+	      float fy = fabsf( sy - iy );
+	      
+	      const float fval = (1-fx)*(1-fy);
+	      
+	      pixVal += in[ (int)ix + (int)inCols * (int)iy ] * fval;
+	      weight += fval;
+	    }
+	  
+	  out[x + outCols * y] = pixVal / weight;
+	} 	
+    }
+}
 
-          const float fval = (1-fx)*(1-fy);
-          
-          pixVal += in[ (int)ix + (int)inCols * (int)iy ] * fval;
-          weight += fval;
-        }
+// upsample the matrix
+// upsampled matrix is twice bigger in each direction than data[]
+// res should be a pointer to allocated memory for bigger matrix
+// cols and rows are the dimmensions of the output matrix
+static void matrix_upsample_brambley_fast(const int outCols, const int outRows, const float* const in, float* const out)
+{
+  const int inRows = outRows/2;
+  const int inCols = outCols/2;
+
+  // Transpose of experimental downsampling matrix (theoretically the correct thing to do)
+
+  const float dx = (float)inCols / ((float)outCols);
+  const float dy = (float)inRows / ((float)outRows);
+  const float factor = 1.0f / (dx*dy); // This gives a genuine upsampling matrix, not the transpose of the downsampling matrix
+  // const float factor = 1.0f; // Theoretically, this should be the best.
+
+#pragma omp parallel for
+  for (int y = 0; y < outRows; y++)
+    {
+      const float sy = y * dy;
+      const int iy1 = (  y   * inRows) / outRows;
+      const int iy2 = ((y+1) * inRows) / outRows;
+
+      for (int x = 0; x < outCols; x++)
+	{
+	  const float sx = x * dx;
+	  const int ix1 = (  x   * inCols) / outCols;
+	  const int ix2 = ((x+1) * inCols) / outCols;
+
+	  out[x + y*outCols] = (
+	    ((ix1+1) - sx)*((iy1+1 - sy)) * in[ix1 + iy1*inCols] +
+	    ((ix1+1) - sx)*(sy+dy - (iy1+1)) * in[ix1 + iy2*inCols] +
+	    (sx+dx - (ix1+1))*((iy1+1 - sy)) * in[ix2 + iy1*inCols] +
+	    (sx+dx - (ix1+1))*(sy+dx - (iy1+1)) * in[ix2 + iy2*inCols])*factor;
+	}
+    }
+}
+
+
+// upsample the matrix
+// upsampled matrix is twice bigger in each direction than data[]
+// res should be a pointer to allocated memory for bigger matrix
+// cols and rows are the dimmensions of the output matrix
+static void matrix_upsample_brambley_correct(const int outCols, const int outRows, const float* const in, float* const out)
+{
+  const int inRows = outRows/2;
+  const int inCols = outCols/2;
+
+  // Transpose of experimental downsampling matrix (theoretically the correct thing to do)
+
+  const float dx = (float)inCols / ((float)outCols);
+  const float dy = (float)inRows / ((float)outRows);
+  // const float factor = 1.0f / (dx*dy); // This gives a genuine upsampling matrix, not the transpose of the downsampling matrix
+  const float factor = 1.0f; // Theoretically, this should be the best.
+
+#pragma omp parallel for
+  for (int y = 0; y < outRows; y++)
+    {
+      const float sy = y * dy;
+      const int iy1 = (  y   * inRows) / outRows;
+      const int iy2 = ((y+1) * inRows) / outRows;
+
+      for (int x = 0; x < outCols; x++)
+	{
+	  const float sx = x * dx;
+	  const int ix1 = (  x   * inCols) / outCols;
+	  const int ix2 = ((x+1) * inCols) / outCols;
+
+	  out[x + y*outCols] = (
+	    ((ix1+1) - sx)*((iy1+1 - sy)) * in[ix1 + iy1*inCols] +
+	    ((ix1+1) - sx)*(sy+dy - (iy1+1)) * in[ix1 + iy2*inCols] +
+	    (sx+dx - (ix1+1))*((iy1+1 - sy)) * in[ix2 + iy1*inCols] +
+	    (sx+dx - (ix1+1))*(sy+dx - (iy1+1)) * in[ix2 + iy2*inCols])*factor;
+	}
+    }
+}
+
+
+// downsample the matrix
+static void matrix_downsample_mantiuk(const int inCols, const int inRows, const float* const data, float* const res)
+{
+  const int outRows = inRows / 2;
+  const int outCols = inCols / 2;
+
+  const float dx = (float)inCols / ((float)outCols);
+  const float dy = (float)inRows / ((float)outRows);
+
+  const float filterSize = 0.5f;
+  
+#pragma omp parallel for
+  for(int y = 0; y < outRows; y++)
+    {
+      const float sy = (y+0.5f)*dy - 0.5f;
       
-      out[x + outCols * y] = pixVal / weight;
-    } 	
-  return;
+      for(int x = 0; x < outCols; x++)
+	{
+	  const float sx = (x+0.5f)*dx - 0.5f;
+
+	  float pixVal = 0;
+	  float w = 0;
+	  for( float ix = max( 0, ceilf( sx-dx*filterSize ) ); ix <= min( floorf( sx+dx*filterSize ), inCols-1 ); ix++ )
+	    for( float iy = max( 0, ceilf( sy-dy*filterSize ) ); iy <= min( floorf( sy+dy*filterSize), inRows-1 ); iy++ ) {
+	      pixVal += data[(int)ix + (int)iy * (int)inCols];
+	      w += 1;
+	    }     
+	  res[x + y * outCols] = pixVal/w;      
+	}	
+    }
 }
 
 // downsample the matrix
-void ContrastDomain::matrix_downsample(int cols, int rows, float* data, float* res){
+static void matrix_downsample_brambley(const int inCols, const int inRows, const float* const data, float* const res)
+{
+  const int outRows = inRows / 2;
+  const int outCols = inCols / 2;
 
-  const float inRows = rows;
-  const float inCols = cols;
+  const float dx = (float)inCols / ((float)outCols);
+  const float dy = (float)inRows / ((float)outRows);
 
-  const int outRows = rows / 2;
-  const int outCols = cols / 2;
+  // New downsampling by Ed Brambley:
+  // Experimental downsampling that assumes pixels are square and
+  // integrates over each new pixel to find the average value of the
+  // underlying pixels.
+  //
+  // Consider the original pixels laid out, and the new (larger)
+  // pixels layed out over the top of them.  Then the new value for
+  // the larger pixels is just the integral over that pixel of what
+  // shows through; i.e., the values of the pixels underneath
+  // multiplied by how much of that pixel is showing.
+  //
+  // (ix1, iy1) is the coordinate of the top left visible pixel.
+  // (ix2, iy2) is the coordinate of the bottom right visible pixel.
+  // (fx1, fy1) is the fraction of the top left pixel showing.
+  // (fx2, fy2) is the fraction of the bottom right pixel showing.
 
-  const float dx = (float)inCols / (float)outCols;
-  const float dy = (float)inRows / (float)outRows;
+  const float normalize = 1.0f/(dx*dy);
+#pragma omp parallel for
+  for (int y = 0; y < outRows; y++)
+    {
+      const int iy1 = (  y   * inRows) / outRows;
+      const int iy2 = ((y+1) * inRows) / outRows;
+      const float fy1 = (iy1+1) - y * dy;
+      const float fy2 = (y+1) * dy - iy2;
 
-  const float filterSize = 0.5;
-  
-  float sx, sy;
-  int x, y;
-  
-  for( y = 0, sy = dy/2-0.5; y < outRows; y++, sy += dy )
-    for( x = 0, sx = dx/2-0.5; x < outCols; x++, sx += dx ) {
+      for (int x = 0; x < outCols; x++)
+	{
+	  const int ix1 = (  x   * inCols) / outCols;
+	  const int ix2 = ((x+1) * inCols) / outCols;
+	  const float fx1 = (ix1+1) - x * dx;
+	  const float fx2 = (x+1) * dx - ix2;
+	  
+	  float pixVal = 0.0f;
+	  float factorx, factory;
+	  for (int i = iy1; i <= iy2 && i < inRows; i++)
+	    {
+	      if (i == iy1)
+		factory = fy1;  // We're just getting the bottom edge of this pixel
+	      else if (i == iy2)
+		factory = fy2;  // We're just gettting the top edge of this pixel
+	      else
+		factory = 1.0f; // We've got the full height of this pixel
+	      for (int j = ix1; j <= ix2 && j < inCols; j++)
+		{
+		  if (j == ix1)
+		    factorx = fx1;  // We've just got the right edge of this pixel
+		  else if (j == ix2)
+		    factorx = fx2; // We've just got the left edge of this pixel
+		  else
+		    factorx = 1.0f; // We've got the full width of this pixel
 
-      float pixVal = 0;
-      float w = 0;
-      for( float ix = max( 0, ceilf( sx-dx*filterSize ) ); ix <= min( floorf( sx+dx*filterSize ), inCols-1 ); ix++ )
-        for( float iy = max( 0, ceilf( sy-dx*filterSize ) ); iy <= min( floorf( sy+dx*filterSize), inRows-1 ); iy++ ) {
-          pixVal += data[(int)ix + (int)iy * (int)inCols];
-          w += 1;
-        }     
-      res[x + y * outCols] = pixVal/w;      
-    }	
-	
-  return;
+		  pixVal += data[j + i*inCols] * factorx * factory;
+		}
+	    }
+
+	  res[x + y * outCols] = pixVal * normalize;  // Normalize by the area of the new pixel
+	}
+    }
 }
 
 // return = a + b
-float* ContrastDomain::matrix_add(int n, float* a, float* b){
-  for(int i=0; i<n; i++){
-    b[i] = b[i] + a[i];
-  }
-  return b;
+static inline void matrix_add(const int n, const float* const a, float* const b)
+{
+  //#pragma omp parallel for
+  for(int i=0; i<n; i++)
+    b[i] += a[i];
 }
 
 // return = a - b
-float* ContrastDomain::matrix_substract(int n, float* a, float* b){
-  for(int i=0; i<n; i++){
+static inline void matrix_subtract(const int n, const float* const a, float* const b)
+{
+  //#pragma omp parallel for
+  for(int i=0; i<n; i++)
     b[i] = a[i] - b[i];
-  }
-  return b;
 }
 
 // copy matix a to b, return = a 
-float* ContrastDomain::matrix_copy(int n, float* a, float* b){
-  for(int i=0; i<n; i++){
-    b[i] = a[i];
-  }
-  return b;
+static inline void matrix_copy(const int n, const float* const a, float* const b)
+{
+  memcpy(b, a, sizeof(float)*n);
 }
 
 // multiply matrix a by scalar val
-void ContrastDomain::matrix_multiply_const(int n, float* a, float val){
-  for(int i=0; i<n; i++){
-    a[i] = a[i] * val;
-  }
-  return;
+static inline void matrix_multiply_const(const int n, float* const a, const float val)
+{
+  //#pragma omp parallel for
+  for(int i=0; i<n; i++)
+    a[i] *= val;
 }
 
 // b = a[i] / b[i]
-void ContrastDomain::matrix_divide(int n, float* a, float* b){
-  for(int i=0; i<n; i++){
+static inline void matrix_divide(const int n, const float* const a, float* const b)
+{
+  //#pragma omp parallel for
+  for(int i=0; i<n; i++)
     b[i] = a[i] / b[i];
-  }
-  return;
 }
 
 
 // alloc memory for the float table
-float* ContrastDomain::matrix_alloc(int size){
+static inline float* matrix_alloc(int size){
 
   float* m = (float*)malloc(sizeof(float)*size);
   if(m == NULL){
     fprintf(stderr, "ERROR: malloc in matrix_alloc() (size:%d)", size);
     exit(155);
   }
-  alloc_mem += sizeof(float)*size;
+
   return m;
 }
 
 // free memory for matrix
-void ContrastDomain::matrix_free(float* m){
+static inline void matrix_free(float* m){
   if(m != NULL)
     free(m);
 }
 
 // multiply vector by vector (each vector should have one dimension equal to 1)
-float ContrastDomain::matrix_DotProduct(int n, float* a, float* b){
+static inline float matrix_DotProduct(const int n, const float* const a, const float* const b){
   float val = 0;
+  //#pragma omp parallel for reduction(+:val)
   for(int j=0;j<n;j++)
-    val += (a[j] * b[j]);
+    val += a[j] * b[j];
   return val;
 }
 
 // set zeros for matrix elements
-void ContrastDomain::matrix_zero(int n, float* m){
-  for(int j=0;j<n;j++)
-    m[j] = 0;
-  return;
+static inline void matrix_zero(int n, float* m)
+{
+  bzero(m, n*sizeof(float));
 }
 
 // calculate divergence of two gradient maps (Gx and Gy)
 // divG(x,y) = Gx(x,y) - Gx(x-1,y) + Gy(x,y) - Gy(x,y-1)  
-void ContrastDomain::calculate_divergence(int cols, int rows, float* Gx, float* Gy, float* divG){
-
+static inline void calculate_and_add_divergence(const int cols, const int rows, const float* const Gx, const float* const Gy, float* const divG)
+{
   float divGx, divGy;
+
+#pragma omp parallel for private(divGx, divGy)
+  for(int ky=0; ky<rows; ky++)
+    for(int kx=0; kx<cols; kx++)
+      {
+	const int idx = kx + ky*cols;
 	
-  for(int ky=0; ky<rows; ky++){
-    for(int kx=0; kx<cols; kx++){
-			
-      if(kx == 0)
-        divGx = Gx[kx + ky*cols];
-      else	
-        divGx = Gx[kx + ky*cols] - Gx[(kx-1) + ky*cols];
-			
-      if(ky == 0)
-        divGy = Gy[kx + ky*cols];
-      else	
-        divGy = Gy[kx + ky*cols] - Gy[kx + (ky-1)*cols];			
+	if(kx == 0)
+	  divGx = Gx[idx];
+	else	
+	  divGx = Gx[idx] - Gx[idx-1];
+	
+	if(ky == 0)
+	  divGy = Gy[idx];
+	else	
+	  divGy = Gy[idx] - Gy[idx - cols];			
 		
-			
-      divG[kx+ky*cols] = divGx + divGy;			
+	divG[idx] += divGx + divGy;			
     }
-  }
-  return;
 }
 
-// calculate divergence for the pyramid
-//  calculated divergence is put into PYRAMID_LEVEL::divG
-void ContrastDomain::pyramid_calculate_divergence(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_calculate_divergence((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-			
-  calculate_divergence(pl->cols, pl->rows, pl->Gx, pl->Gy, pl->divG);
-	
-  return;
-}
+// calculate the sum of divergences for the all pyramid level
+// the smaller divergence map is upsamled and added to the divergence map for the higher level of pyramid
+// temp is a temporary matrix of size (cols, rows), assumed to already be allocated
+static void pyramid_calculate_divergence_sum(pyramid_t* pyramid, float* divG_sum)
+{
+  float* temp = matrix_alloc(pyramid->rows*pyramid->cols);
 
+  // Find the coarsest pyramid, and the number of pyramid levels
+  int levels = 1;
+  while (pyramid->next != NULL)
+    {
+      levels++;
+      pyramid = pyramid->next;
+    }
 
-// calculate sum of divergences in the pyramid
-//	divergences for a particular levels should be calculated earlier 
-//	and set in PYRAMID_LEVEL::divG
-void ContrastDomain::pyramid_calculate_divergence_sum_in(PYRAMID* pyramid, float* divG_sum){
+  // For every level, we swap temp and divG_sum.  So, if there are an odd number of levels...
+  if (levels % 2)
+    {
+      float* const dummy = divG_sum;
+      divG_sum = temp;
+      temp = dummy;
+    }
 	
-  if(pyramid->next != NULL)	
-    pyramid_calculate_divergence_sum_in((PYRAMID*)pyramid->next, divG_sum);
+  // Add them all together
+  while (pyramid != NULL)
+    {
+      // Upsample or zero as needed
+      if (pyramid->next != NULL)
+	matrix_upsample(pyramid->cols, pyramid->rows, divG_sum, temp);
+      else
+	matrix_zero(pyramid->rows * pyramid->cols, temp);
 
-  PYRAMID_LEVEL* pl = pyramid->level;
-	
-  matrix_add(pl->rows * pl->cols, pl->divG, divG_sum);
-	
-  float* temp = matrix_alloc(pl->rows*pl->cols);
-  matrix_copy(pl->rows*pl->cols, divG_sum, temp);
+      // Add in the (freshly calculated) divergences
+      calculate_and_add_divergence(pyramid->cols, pyramid->rows, pyramid->Gx, pyramid->Gy, temp);
 
-  matrix_upsample(pl->cols, pl->rows, temp, divG_sum);
-	
-  matrix_free(temp);
+//   char name[256];
+//   sprintf( name, "Up_%d.pfs", pyramid->cols );
+//   dump_matrix_to_file( pyramid->cols, pyramid->rows, temp, name );  
 
-  return;
-}
+      // matrix_copy(pyramid->rows*pyramid->cols, temp, divG_sum);
 
+      // Rather than copying, just switch round the pointers: we know we get them the right way round at the end.
+      float* const dummy = divG_sum;
+      divG_sum = temp;
+      temp = dummy;
 
-// calculate the sum of divergences for the all pyramid level
-// the smaller divergence map is upsamled and added to the divergence map for the higher level of pyramid
-void ContrastDomain::pyramid_calculate_divergence_sum(PYRAMID* pyramid, float* divG_sum){
-	
-  PYRAMID_LEVEL* pl = pyramid->level;
-  matrix_zero(pl->rows * pl->cols, divG_sum);
-	
-  if(pyramid->next != NULL)	
-    pyramid_calculate_divergence_sum_in((PYRAMID*)pyramid->next, divG_sum);
+      pyramid = pyramid->prev;
+    }
 
-  matrix_add(pl->rows * pl->cols, pl->divG, divG_sum);
-		
-  return;
+  matrix_free(temp);
 }
 
 // calculate scale factors (Cx,Cy) for gradients (Gx,Gy)
 // C is equal to EDGE_WEIGHT for gradients smaller than GFIXATE or 1.0 otherwise
-float* ContrastDomain::calculate_scale_factor(int n, float* G){
-
-  float GFIXATE = 0.1;
-  float EDGE_WEIGHT = 0.01;
-	
-  float* C = matrix_alloc(n);
-
-  for(int i=0; i<n; i++){
-
-    if(fabs(G[i]) < GFIXATE)
-      C[i] = EDGE_WEIGHT;
-    else	
-      C[i] = 1.0;
-  }
+static inline void calculate_scale_factor(const int n, const float* const G, float* const C)
+{
+  float GFIXATE = 0.1f;
+  float EDGE_WEIGHT = 0.01f;
+  const float detectT = 0.001f;
+  const float a = 0.038737;
+  const float b = 0.537756;
 
-  return C;
+  //#pragma omp parallel for	
+  for(int i=0; i<n; i++)
+    {
+#if 1
+      const float g = max( detectT, fabsf(G[i]) );    
+      C[i] = 1.0f / (a*powf(g,b));
+#else
+      if(fabsf(G[i]) < GFIXATE)
+	C[i] = 1.0f / EDGE_WEIGHT;
+      else	
+	C[i] = 1.0f;
+#endif
+    }
 }
 
 // calculate scale factor for the whole pyramid
-void ContrastDomain::pyramid_calculate_scale_factor(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_calculate_scale_factor((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-			
-  pl->Cx = calculate_scale_factor(pl->rows*pl->cols, pl->Gx);	
-  pl->Cy = calculate_scale_factor(pl->rows*pl->cols, pl->Gy);	
-
-  return;
+static void pyramid_calculate_scale_factor(pyramid_t* pyramid, pyramid_t* pC)
+{
+  while (pyramid != NULL)
+    {
+      const int size = pyramid->rows * pyramid->cols;
+      calculate_scale_factor(size, pyramid->Gx, pC->Gx);
+      calculate_scale_factor(size, pyramid->Gy, pC->Gy);
+      pyramid = pyramid->next;
+      pC = pC->next;
+    }
 }
 
 // Scale gradient (Gx and Gy) by C (Cx and Cy)
 // G = G / C
-void ContrastDomain::scale_gradient(int n, float* G, float* C){
-
+static inline void scale_gradient(const int n, float* const G, const float* const C)
+{
+  //#pragma omp parallel for
   for(int i=0; i<n; i++)
-    G[i] = G[i] / C[i];
-  return;
-}
-
-// scale gradients for the whole pyramid
-void ContrastDomain::pyramid_scale_gradient(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_scale_gradient((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-  scale_gradient(pl->rows*pl->cols, pl->Gx, pl->Cx);	
-  scale_gradient(pl->rows*pl->cols, pl->Gy, pl->Cy);	
-
-  return;
+    G[i] *= C[i];
 }
 
 // scale gradients for the whole one pyramid with the use of (Cx,Cy) from the other pyramid
-void ContrastDomain::pyramid_scale_gradient(PYRAMID* pyramid, PYRAMID* pC){
-
-  if(pyramid->next != NULL)
-    pyramid_scale_gradient((PYRAMID*)pyramid->next, (PYRAMID*)pC->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-  PYRAMID_LEVEL* plC = pC->level;
-	
-  scale_gradient(pl->rows*pl->cols, pl->Gx, plC->Cx);	
-  scale_gradient(pl->rows*pl->cols, pl->Gy, plC->Cy);	
-
-  return;
+static void pyramid_scale_gradient(pyramid_t* pyramid, pyramid_t* pC)
+{
+  while (pyramid != NULL)
+    {
+      const int size = pyramid->rows * pyramid->cols;
+      scale_gradient(size, pyramid->Gx, pC->Gx);	
+      scale_gradient(size, pyramid->Gy, pC->Gy);
+      pyramid = pyramid->next;
+      pC = pC->next;
+    }
 }
 
 
 // free memory allocated for the pyramid
-void ContrastDomain::pyramid_free(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_free((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-
-  if(pl->Gx != NULL)
-    free(pl->Gx);
-  if(pl->Gy != NULL)
-    free(pl->Gy);
-  if(pl->Cx != NULL)
-    free(pl->Cx);
-  if(pl->Cy != NULL)
-    free(pl->Cy);
-  if(pl->divG != NULL)
-    free(pl->divG);	
-		
-  free(pl);
-  free(pyramid);
-			
-  return;
+static void pyramid_free(pyramid_t* pyramid)
+{
+  while (pyramid)
+    {
+      if(pyramid->Gx != NULL)
+	{
+	  free(pyramid->Gx);
+	  pyramid->Gx = NULL;
+	}
+      if(pyramid->Gy != NULL)
+	{
+	  free(pyramid->Gy);
+	  pyramid->Gy = NULL;
+	}
+      pyramid_t* const next = pyramid->next;
+      pyramid->prev = NULL;
+      pyramid->next = NULL;
+      free(pyramid);
+      pyramid = next;
+    }			
 }
 
 
 // allocate memory for the pyramid
-PYRAMID * ContrastDomain::pyramid_allocate(int cols, int rows){
-
-  PYRAMID_LEVEL* level = NULL;
-  int size;
-  PYRAMID* pyramid = NULL;
-  PYRAMID* pyramid_higher = NULL;
-	
-  PYRAMID* p = NULL;
-  pyramid_higher = NULL;
-		
-  while(1){
-	
-    level = (PYRAMID_LEVEL*)malloc(sizeof(PYRAMID_LEVEL));
-    if(level == NULL)
-      exit(155);
-    memset( level, 0, sizeof(PYRAMID_LEVEL) );
-		
-    level->rows = rows;
-    level->cols = cols;
-    size = level->rows * level->cols;
-    level->Gx = matrix_alloc(size);
-    level->Gy = matrix_alloc(size);
-    level->divG = matrix_alloc(size);
-	
-    pyramid = (PYRAMID*)malloc(sizeof(PYRAMID));
-    pyramid->level = level;
-    pyramid->next = NULL;
+static pyramid_t * pyramid_allocate(int cols, int rows)
+{
+  pyramid_t* level = NULL;
+  pyramid_t* pyramid = NULL;
+  pyramid_t* prev = NULL;
 
-    if(pyramid_higher != NULL)
-      pyramid_higher->next = pyramid;			
-    pyramid_higher = pyramid;
-		
-    if(p == NULL)
-      p = pyramid;
-		
-    rows /= 2;
-    cols /= 2;		
-    if(rows < PYRAMID_MIN_PIXELS || cols < PYRAMID_MIN_PIXELS)
-      break;
+  while(rows >= PYRAMID_MIN_PIXELS && cols >= PYRAMID_MIN_PIXELS)
+    {
+      level = (pyramid_t *) malloc(sizeof(pyramid_t));
+      if(level == NULL)
+	{
+	  fprintf(stderr, "ERROR: malloc in pyramid_alloc() (size:%z)", sizeof(pyramid_t));
+	  exit(155);
+	}
+      memset( level, 0, sizeof(pyramid_t) );
+      
+      level->rows = rows;
+      level->cols = cols;
+      const int size = level->rows * level->cols;
+      level->Gx = matrix_alloc(size);
+      level->Gy = matrix_alloc(size);
+      
+      level->prev = prev;
+      if(prev != NULL)
+	prev->next = level;
+      prev = level;
+      
+      if(pyramid == NULL)
+	pyramid = level;
+      
+      rows /= 2;
+      cols /= 2;		
   }
 	
-  return p;
+  return pyramid;
 }
 
 
 // calculate gradients
-void ContrastDomain::calculate_gradient(int cols, int rows, float* lum, float* Gx, float* Gy){
-
-  int idx;
-	
+static inline void calculate_gradient(const int cols, const int rows, const float* const lum, float* const Gx, float* const Gy)
+{
+#pragma omp parallel for
   for(int ky=0; ky<rows; ky++){
     for(int kx=0; kx<cols; kx++){
 			
-      idx = kx + ky*cols;
+      const int idx = kx + ky*cols;
 			
       if(kx == (cols - 1))
         Gx[idx] = 0;
@@ -584,503 +675,709 @@
         Gy[idx] = lum[idx + cols] - lum[idx];
     }
   }
-
-  return;
 }
 
 
-// calculate gradients for the pyramid
-void ContrastDomain::pyramid_calculate_gradient(PYRAMID* pyramid, float* lum){
+#define PFSEOL "\x0a"
+static void dump_matrix_to_file(const int width, const int height, const float* const m, const char * const file_name)
+{
+  FILE *fh = fopen( file_name, "wb" );
+  // assert( fh != NULL );
+  
+  fprintf( fh, "PFS1" PFSEOL "%d %d" PFSEOL "1" PFSEOL "0" PFSEOL
+    "%s" PFSEOL "0" PFSEOL "ENDH", width, height, "Y");
 
-  PYRAMID_LEVEL* pl = pyramid->level;
+  for( int y = 0; y < height; y++ )
+    for( int x = 0; x < width; x++ ) {
+      int idx = x + y*width;
+      fwrite( &(m[idx]), sizeof( float ), 1, fh );
+    }
+  
+  fclose( fh );
+}  
 
-  float* lum_temp = matrix_alloc(pl->rows * pl->cols);	
-  matrix_copy(pl->rows*pl->cols,lum,lum_temp);
+// calculate gradients for the pyramid
+// lum_temp gets overwritten!
+static void pyramid_calculate_gradient(pyramid_t* pyramid, float* lum_temp)
+{
+  float* temp = matrix_alloc((pyramid->rows/2)*(pyramid->cols/2));
+  float* const temp_saved = temp;
 
-  calculate_gradient(pl->cols, pl->rows, lum_temp, pl->Gx, pl->Gy);	
+  calculate_gradient(pyramid->cols, pyramid->rows, lum_temp, pyramid->Gx, pyramid->Gy);	
 
-  PYRAMID* pp = (PYRAMID*)pyramid->next;
-  float* temp;
+  pyramid = pyramid->next;
 
-  while(1){
-    if(pp == NULL)
-      break;
-    pl = pp->level;
-			
-    temp = matrix_alloc(pl->rows * pl->cols);			
-    matrix_downsample(pl->cols*2, pl->rows*2, lum_temp, temp);	
+  //  int l = 1;
+  while(pyramid)
+    {
+      matrix_downsample(pyramid->prev->cols, pyramid->prev->rows, lum_temp, temp);
 		
-    calculate_gradient(pl->cols, pl->rows, temp, pl->Gx, pl->Gy);
+//     fprintf( stderr, "%d x %d -> %d x %d\n", pyramid->cols, pyramid->rows,
+//       prev_pp->cols, prev_pp->rows );
+    
+//      char name[40];
+//      sprintf( name, "ds_%d.pfs", l++ );
+//      dump_matrix_to_file( pyramid->cols, pyramid->rows, temp, name );    
 		
-    matrix_free(lum_temp);
-    lum_temp = temp;	
+      calculate_gradient(pyramid->cols, pyramid->rows, temp, pyramid->Gx, pyramid->Gy);
+		
+      float* const dummy = lum_temp;
+      lum_temp = temp;	
+      temp = dummy;
 			
-    pp = (PYRAMID*)pp->next;	
+      pyramid = pyramid->next;
   }
-  matrix_free(lum_temp);
 
-  return;
+  matrix_free(temp_saved);
 }
 
 
 // x = -0.25 * b
-void ContrastDomain::solveX(int n, float* b, float* x){
-
+static inline void solveX(const int n, const float* const b, float* const x)
+{
   matrix_copy(n, b, x); // x = b
-  matrix_multiply_const(n, x, -0.25);
+  matrix_multiply_const(n, x, -0.25f);
 }
 
 // divG_sum = A * x = sum(divG(x))
 // memory for the temporary pyramid px should be allocated
-void ContrastDomain::multiplyA(PYRAMID* px, PYRAMID* pyramid, float* x, float* divG_sum){
-
-  pyramid_calculate_gradient(px, x);
-	
-  pyramid_scale_gradient(px, pyramid); // scale gradients by Cx,Cy from main pyramid
-	
-  pyramid_calculate_divergence(px); // calculate divergences for all pyramid levels
-
+static inline void multiplyA(pyramid_t* px, pyramid_t* pC, const float* const x, float* const divG_sum)
+{
+  const int n = pC->rows*pC->cols;
+  matrix_copy(n, x, divG_sum); // use divG_sum as a temp variable
+  pyramid_calculate_gradient(px, divG_sum);
+  pyramid_scale_gradient(px, pC); // scale gradients by Cx,Cy from main pyramid
   pyramid_calculate_divergence_sum(px, divG_sum); // calculate the sum of divergences
 
-  return;
+  // Make matrix positive definite by optimizing to give sum(x_i) = 0
+  const float factor = 0.5f/n;
+  float sum = 0.0f;
+  for (int i = 0; i < n; i++)
+    sum += x[i];
+  sum *= factor;
+  
+  for (int i = 0; i < n; i++)
+    divG_sum[i] += sum;
 } 
 
 
 // bi-conjugate linear equation solver
-float* ContrastDomain::linbcg(PYRAMID* pyramid, float* b){
-
-  int rows = pyramid->level->rows;
-  int cols = pyramid->level->cols;
-  int n = rows*cols;
-	
-  float* z = matrix_alloc(n);
-  float* zz = matrix_alloc(n);
-  float* p = matrix_alloc(n);
-  float* pp = matrix_alloc(n);
-  float* r = matrix_alloc(n);
-  float* rr = matrix_alloc(n);	
+// overwrites pyramid!
+static void linbcg(pyramid_t* pyramid, pyramid_t* pC, float* const b, float* const x, const int itmax, const float tol, progress_callback progress_cb)
+{
+  const int rows = pyramid->rows;
+  const int cols = pyramid->cols;
+  const int n = rows*cols;
+  const float tol2 = tol*tol;
 	
-  PYRAMID* pyramid_tmp = 
-    pyramid_allocate(pyramid->level->cols, pyramid->level->rows);
+  float* const z = matrix_alloc(n);
+  float* const zz = matrix_alloc(n);
+  float* const p = matrix_alloc(n);
+  float* const pp = matrix_alloc(n);
+  float* const r = matrix_alloc(n);
+  float* const rr = matrix_alloc(n);	
+  float* const x_save = matrix_alloc(n);	
 	
-  float* x = matrix_alloc(n); // allocate x matrix filled with zeros
   matrix_zero(n, x); // x = 0
 	
-  multiplyA(pyramid_tmp, pyramid, x, r); // r = A*x = divergence(x)
+  // multiplyA(pyramid, pC, x, r); // r = A*x = divergence(x)
+  // matrix_zero(n, r);
 
-  matrix_substract(n, b, r); // r = b - r
+  // matrix_substract(n, b, r); // r = b - r
+  matrix_copy(n, b, r);
 
-  matrix_copy(n, r, rr); // rr = r
-	
-  float bnrm;
-	
-  bnrm = sqrt(matrix_DotProduct(n, b, b));
-	
-  solveX(n, r, z); // z = ~A(-1) * r = -0.25 * r
+  // matrix_copy(n, r, rr); // rr = r
+  multiplyA(pyramid, pC, r, rr); // rr = A*r
+  
+  const float bnrm2 = matrix_DotProduct(n, b, b);
 	
-#define TOL 0.01
-#define ITMAX 40
-  int iter = 0;
-
-  float bknum=0;
-  int j;
-  float bk=0;
   float bkden = 0;
-  float akden = 0;
-  float ak = 0;
-  float err = 0;
-	
-  while(iter <= ITMAX){
+  float err2 = bnrm2;
+  float saved_err2 = err2;
+  
+  int iter = 0;
+  bool reset = true;
+  int num_backwards = 0;
+  const int num_backwards_ceiling = 3;
+  for (; iter < itmax; iter++)
+    {
+      if( progress_cb != NULL )
+	progress_cb( (int) (logf(bnrm2/err2)*100.0f/(-logf(tol2))));    
+      
+      solveX(n, r, z);   //  z = ~A(-1) *  r = -0.25 *  r
+      solveX(n, rr, zz); // zz = ~A(-1) * rr = -0.25 * rr
+      
+      const float bknum = matrix_DotProduct(n, z, rr);
+      
+      if(reset)
+	{
+	  reset = false;
+	  matrix_copy(n, z, p);
+	  matrix_copy(n, zz, pp); 
+	}
+      else
+	{
+	  const float bk = bknum / bkden; // beta = ...
+	  //#pragma omp parallel for
+	  for (int i = 0; i < n; i++)
+	    {
+	      p[i] = z[i] + bk * p[i];
+	      pp[i] = zz[i] + bk * pp[i];
+	    }
+	}
+      
+      bkden = bknum; // numerato becomes the dominator for the next iteration
+      
+      multiplyA(pyramid, pC, p, z); // z = A*p = divergence(p)
+      multiplyA(pyramid, pC, pp, zz); // zz = A*pp = divergence(pp)
+      
+      const float ak = bknum / matrix_DotProduct(n, z, pp); // alfa = ...
+      //#pragma omp parallel for
+      for(int i = 0 ; i < n ; i++ )
+	{
+	  x[i] += ak * p[i];	// x = x + alfa * p
+	  r[i] -= ak * z[i];		// r = r - alfa * z
+	  rr[i] -= ak * zz[i];	//rr = rr - alfa * zz
+	}
+      
+      const float old_err2 = err2;
+      err2 = matrix_DotProduct(n, r, r);
+      
+      // Have we gone unstable?
+      if (err2 > old_err2)
+	{
+	  // Save where we've got to if it's the best yet
+	  if (num_backwards == 0 && old_err2 < saved_err2)
+	    {
+	      saved_err2 = old_err2;
+	      matrix_copy(n, x, x_save);
+	      for (int i = 0; i < n; i++)
+		x_save[i] -= ak * p[i];
+	    }
+	  
+	  num_backwards++;
+	}
+      else
+	{
+	  num_backwards = 0;
+	}
+      
+      if (num_backwards > num_backwards_ceiling)
+	{
+	  // Reset
+	  reset = true;
+	  num_backwards = 0;
+	  
+	  // Recover saved value
+	  matrix_copy(n, x_save, x);
+	  
+	  // r = Ax
+	  multiplyA(pyramid, pC, x, r);
+	  
+	  // r = b - r
+	  matrix_subtract(n, b, r);
+	  
+	  // err2 = r.r
+	  err2 = matrix_DotProduct(n, r, r);
+	  saved_err2 = err2;
 
-    if( progress_cb != NULL )
-      progress_cb( iter*100/ITMAX );    
-          
-    iter++;
-		
-    solveX(n, rr, zz); // zz = ~A(-1) * rr = -0.25 * rr
-		
-    bknum = matrix_DotProduct(n, z, rr);
-		
-    if(iter == 1){
-      matrix_copy(n, z, p);
-      matrix_copy(n, zz, pp); 
-    }
-    else{
-      bk = bknum / bkden; // beta = ...
-      matrix_multiply_const(n, p, bk); // p = bk * p
-      matrix_add(n, z, p); // p = p + z
-      matrix_multiply_const(n, pp, bk); // pp = bk * pp
-      matrix_add(n, zz, pp); // pp = pp + zz
+	  // rr = A*r
+	  multiplyA(pyramid, pC, r, rr);
+	}
+      
+      // fprintf(stderr, "iter:%d err:%f\n", iter+1, sqrtf(err2/bnrm2));
+      if(err2/bnrm2 < tol2)
+	break;
     }
-		
-    bkden = bknum; // numerato becomes the dominator for the next iteration
 
-    multiplyA(pyramid_tmp, pyramid, p, z); // z = A*p = divergence(p)
-		
-    akden = matrix_DotProduct(n, z, pp);  // alfa denominator
-		
-    ak = bknum / akden; // alfa = ...
-		
-    multiplyA(pyramid_tmp, pyramid, pp, zz); // zz = A*pp = divergence(pp)
-
-    // values for the next iterration
-    for(j=0;j<n;j++){
-      x[j] = x[j] + ak * p[j];	// x = x + alfa * p
-      r[j] = r[j] - ak * z[j];		// r = r - alfa * z
-      rr[j] = rr[j] - ak * zz[j];	//rr = rr - alfa * zz
+  // Use the best version we found
+  if (err2 > saved_err2)
+    {
+      err2 = saved_err2;
+      matrix_copy(n, x_save, x);
     }
-	
-    solveX(n, r, z); // z = ~A(-1) * r
-		
-    err = sqrt(matrix_DotProduct(n, r, r));
-
-    //fprintf(stderr, "iter:%d err:%f\n", iter, err);
-    if(err <= TOL)
-      break;
-  }
 
-  if( iter <= ITMAX && progress_cb != NULL )
-    progress_cb( 100 );    
-  
+  if (err2/bnrm2 > tol2)
+    {
+      // Not converged
+      if( progress_cb != NULL )
+	progress_cb( (int) (logf(bnrm2/err2)*100.0f/(-logf(tol2))));    
+      if (iter == itmax)
+	fprintf(stderr, "\npfstmo_mantiuk06: Warning: Not converged (hit maximum iterations), error = %g (should be below %g).\n", sqrtf(err2/bnrm2), tol);  
+      else
+	fprintf(stderr, "\npfstmo_mantiuk06: Warning: Not converged (going unstable), error = %g (should be below %g).\n", sqrtf(err2/bnrm2), tol);  
+    }
+  else if (progress_cb != NULL)
+    progress_cb(100);
+      
+  matrix_free(x_save);
   matrix_free(p);
   matrix_free(pp);
   matrix_free(z);
   matrix_free(zz);
   matrix_free(r);
   matrix_free(rr);
+}
 
-  pyramid_free(pyramid_tmp);
+
+// conjugate linear equation solver
+// overwrites pyramid!
+// overwrites b!
+static void lincg(pyramid_t* pyramid, pyramid_t* pC, float* const b, float* const x, const int itmax, const float tol, progress_callback progress_cb)
+{
+  const int rows = pyramid->rows;
+  const int cols = pyramid->cols;
+  const int n = rows*cols;
+  const float tol2 = tol*tol;
 	
-  return x;
-}
+  float* const x_save = matrix_alloc(n);
+  float* const r = matrix_alloc(n);
+  float* const p = matrix_alloc(n);
+  float* const Ap = matrix_alloc(n);	
+	
+  // x = 0
+  matrix_zero(n, x);
+	
+  // r = b - Ax
+  // multiplyA(pyramid, pC, x, r);
+  // matrix_subtract(n, b, r);
+  matrix_copy(n, b, r);
 
-// rescale matrix from linear to logarithmic scale
-void ContrastDomain::matrix_log10(int n, float* m){
+  // bnrm2 = ||b||
+  const float bnrm2 = matrix_DotProduct(n, b, b);
+	
+  // p = r
+  matrix_copy(n, r, p);
 
-  for(int j=0;j<n;j++)
-    m[j] = log10f(m[j]);
-		
-  return;
-}
+  // rdotr = r.r
+  float rdotr = bnrm2;
+  float saved_rdotr = rdotr;
 
-// in_tab and out_tab should contain inscresing float values
-float ContrastDomain::lookup_table(int n, float* in_tab, float* out_tab, float val){
+  int iter = 0;
+  int num_backwards = 0;
+  const int num_backwards_ceiling = 3;
+  for (; iter < itmax; iter++)
+    {
+      if( progress_cb != NULL )
+	progress_cb( (int) (logf(bnrm2/rdotr)*100.0f/(-logf(tol2))));    
+      
+      // Ap = A p
+      multiplyA(pyramid, pC, p, Ap);
+      
+      // alpha = r.r / (p . Ap)
+      const float alpha = rdotr / matrix_DotProduct(n, p, Ap);
+      
+      // x = x + alpha p
+      // r = r - alpha Ap
+      //#pragma omp parallel for
+      for (int i = 0; i < n; i++)
+	{
+	  x[i] += alpha *  p[i];
+	  r[i] -= alpha * Ap[i];
+	}
+            
+      // rdotr = r.r
+      const float old_rdotr = rdotr;
+      rdotr = matrix_DotProduct(n, r, r);
+      
+      // Have we gone unstable?
+      if (rdotr > old_rdotr)
+	{
+	  // Save where we've got to
+	  if (num_backwards == 0 && old_rdotr < saved_rdotr)
+	    {
+	      saved_rdotr = old_rdotr;
+	      matrix_copy(n, x, x_save);
+	      for (int i = 0; i < n; i++)
+		x_save[i] -= alpha * p[i];
+	    }
 
-  float ret;
-  float dd;
+	  num_backwards++;
+	}
+      else
+	{
+	  num_backwards = 0;
+	}
+
+      // Exit if we're done
+      // fprintf(stderr, "iter:%d err:%f\n", iter+1, sqrtf(rdotr/bnrm2));
+      if(rdotr/bnrm2 < tol2)
+	break;
+      
+      if (num_backwards > num_backwards_ceiling)
+	{
+	  // Reset
+	  num_backwards = 0;
+	  matrix_copy(n, x_save, x);
+
+	  // r = Ax
+	  multiplyA(pyramid, pC, x, r);
+
+	  // r = b - r
+	  matrix_subtract(n, b, r);
+
+	  // rdotr = r.r
+	  rdotr = matrix_DotProduct(n, r, r);
+	  saved_rdotr = rdotr;
+
+	  // p = r
+	  matrix_copy(n, r, p);
+	}
+      else
+	{
+	  // p = r + beta p
+	  const float beta = rdotr/old_rdotr;
+	  //#pragma omp parallel for
+	  for (int i = 0; i < n; i++)
+	    p[i] = r[i] + beta*p[i];
+	}
+    }
+      
+  // Use the best version we found
+  if (rdotr > saved_rdotr)
+    {
+      rdotr = saved_rdotr;
+      matrix_copy(n, x_save, x);
+    }  
+
+  if (rdotr/bnrm2 > tol2)
+    {
+      // Not converged
+      if( progress_cb != NULL )
+	progress_cb( (int) (logf(bnrm2/rdotr)*100.0f/(-logf(tol2))));    
+      if (iter == itmax)
+	fprintf(stderr, "\npfstmo_mantiuk06: Warning: Not converged (hit maximum iterations), error = %g (should be below %g).\n", sqrtf(rdotr/bnrm2), tol);  
+      else
+	fprintf(stderr, "\npfstmo_mantiuk06: Warning: Not converged (going unstable), error = %g (should be below %g).\n", sqrtf(rdotr/bnrm2), tol);  
+    }
+  else if (progress_cb != NULL)
+    progress_cb(100);
+    
+  matrix_free(p);
+  matrix_free(Ap);
+  matrix_free(r);
+}
 
+
+// in_tab and out_tab should contain inccreasing float values
+static inline float lookup_table(const int n, const float* const in_tab, const float* const out_tab, const float val)
+{
   if(val < in_tab[0])
-    ret = out_tab[0];
-  else
-    if(val > in_tab[n-1])
-      ret = out_tab[n-1];		
-    else
-      for(int j=0;j<(n-1);j++){
-	
-        if(in_tab[j] == val)
-          ret = out_tab[j];
-        else
-          if( val > in_tab[j] && val < in_tab[j+1]){
-            dd = (val - in_tab[j]) / (in_tab[j+1] - in_tab[j]);
-            ret = out_tab[j] + (out_tab[j+1] - out_tab[j]) * dd;
-          }
+    return out_tab[0];
+
+  for (int j = 1; j < n; j++)
+    if(val < in_tab[j])
+      {
+	const float dd = (val - in_tab[j-1]) / (in_tab[j] - in_tab[j-1]);
+	return out_tab[j-1] + (out_tab[j] - out_tab[j-1]) * dd;
       }
-	
-  return ret;
+
+  return out_tab[n-1];
 }
 
 
 // transform gradient (Gx,Gy) to R
-void ContrastDomain::transform_to_R(int n, float* G){
-
-  int sign;
-  float absG;
-  for(int j=0;j<n;j++){
-	
-    // G to W
-    absG = fabs(G[j]);
-    if(G[j] < 0)
-      sign = -1;
-    else
-      sign = 1;	
-    G[j] = (powf(10.0, absG) - 1.0) * (float)sign;
+static inline void transform_to_R(const int n, float* const G)
+{
+#pragma omp parallel for
+  for(int j=0;j<n;j++)
+    {
+      // G to W
+      const float absG = fabsf(G[j]);
+      int sign;
+      if(G[j] < 0)
+	sign = -1;
+      else
+	sign = 1;	
+      G[j] = (exp10f(absG) - 1.0f) * sign;
 		
-    // W to RESP
-    if(G[j] < 0)
-      sign = -1;
-    else
-      sign = 1;	
-					
-    G[j] = (float)sign * lookup_table(LOOKUP_W_TO_R, W_table, R_table, fabs(G[j]));
-  }
-  return;
+      // W to RESP
+      if(G[j] < 0)
+	sign = -1;
+      else
+	sign = 1;	
+      
+      G[j] = sign * lookup_table(LOOKUP_W_TO_R, W_table, R_table, fabsf(G[j]));
+    }
 }
 
 // transform gradient (Gx,Gy) to R for the whole pyramid
-void ContrastDomain::pyramid_transform_to_R(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_transform_to_R((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-  transform_to_R(pl->rows*pl->cols, pl->Gx);	
-  transform_to_R(pl->rows*pl->cols, pl->Gy);	
-
-  return;
+static inline void pyramid_transform_to_R(pyramid_t* pyramid)
+{
+  while (pyramid != NULL)
+    {
+      const int size = pyramid->rows * pyramid->cols;
+      transform_to_R(size, pyramid->Gx);	
+      transform_to_R(size, pyramid->Gy);	
+      pyramid = pyramid->next;
+    }
 }
 
 // transform from R to G
-void ContrastDomain::transform_to_G(int n, float* R){
+static inline void transform_to_G(const int n, float* const R){
 
-  int sign;
+#pragma omp parallel for
   for(int j=0;j<n;j++){
 	
     // RESP to W
+    int sign;
     if(R[j] < 0)
       sign = -1;
     else
       sign = 1;			
-    R[j] = sign * lookup_table(LOOKUP_W_TO_R, R_table, W_table, fabs(R[j]));
+    R[j] = sign * lookup_table(LOOKUP_W_TO_R, R_table, W_table, fabsf(R[j]));
 	
     // W to G
     if(R[j] < 0)
       sign = -1;
     else
       sign = 1;	
-    R[j] = log10f(fabs(R[j]) + 1.0) * sign;
+    R[j] = log10f(fabsf(R[j]) + 1.0f) * sign;
 		
   }
-  return;
 }
 
-// transform from R to G for teh pyramid
-void ContrastDomain::pyramid_transform_to_G(PYRAMID* pyramid){
-
-  if(pyramid->next != NULL)
-    pyramid_transform_to_G((PYRAMID*)pyramid->next);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-  transform_to_G(pl->rows*pl->cols, pl->Gx);	
-  transform_to_G(pl->rows*pl->cols, pl->Gy);	
-
-  return;
+// transform from R to G for the pyramid
+static inline void pyramid_transform_to_G(pyramid_t* pyramid)
+{
+  while (pyramid != NULL)
+    {
+      transform_to_G(pyramid->rows*pyramid->cols, pyramid->Gx);	
+      transform_to_G(pyramid->rows*pyramid->cols, pyramid->Gy);	
+      pyramid = pyramid->next;
+    }
 }
 
 // multiply gradient (Gx,Gy) values by float scalar value for the whole pyramid
-void ContrastDomain::pyramid_gradient_multiply(PYRAMID* pyramid, float val){
-
-  if(pyramid->next != NULL)
-    pyramid_gradient_multiply((PYRAMID*)pyramid->next, val);
-		
-  PYRAMID_LEVEL* pl = pyramid->level;
-  matrix_multiply_const(pl->rows*pl->cols, pl->Gx, val);	
-  matrix_multiply_const(pl->rows*pl->cols, pl->Gy, val);	
-		
-  return;
+static inline void pyramid_gradient_multiply(pyramid_t* pyramid, const float val)
+{
+  while (pyramid != NULL)
+    {
+      matrix_multiply_const(pyramid->rows*pyramid->cols, pyramid->Gx, val);	
+      matrix_multiply_const(pyramid->rows*pyramid->cols, pyramid->Gy, val);	
+      pyramid = pyramid->next;
+    }
 }
 
-int sort_median(const void* v1, const void* v2){
-
-  float* f1 = (float*)v1;
-  float* f2 = (float*)v2;
 
-  if(f1[0] < f2[0])
+static int sort_float(const void* const v1, const void* const v2)
+{
+  if (*((float*)v1) < *((float*)v2))
     return -1;
-  else
-    if(f1[0] == f2[0])
-      return 0;
-    else
-      return 1;
-}
 
-float ContrastDomain::matrix_median(int n, float* m){
+  if(*((float*)v1) > *((float*)v2))
+    return 1;
 
-  float* temp = matrix_alloc(n);
-  matrix_copy(n, m, temp);
-	
-  qsort(temp, n, sizeof(float), sort_median);
+  return 0;
+}
 
-  float val = (temp[(int)(n/2)] + temp[(int)(n/2+1)]) / 2.0;
 
-  matrix_free(temp);
+// transform gradients to luminance
+static void transform_to_luminance(pyramid_t* pp, float* const x, progress_callback progress_cb, const bool bcg, const int itmax, const float tol)
+{
+  pyramid_t* pC = pyramid_allocate(pp->cols, pp->rows);
+  pyramid_calculate_scale_factor(pp, pC); // calculate (Cx,Cy)
+  pyramid_scale_gradient(pp, pC); // scale small gradients by (Cx,Cy);
 
-  return val;
+  float* b = matrix_alloc(pp->cols * pp->rows);
+  pyramid_calculate_divergence_sum(pp, b); // calculate the sum of divergences (equal to b)
+  
+  // calculate luminances from gradients
+  if (bcg)
+    linbcg(pp, pC, b, x, itmax, tol, progress_cb);
+  else
+    lincg(pp, pC, b, x, itmax, tol, progress_cb);
+  
+  matrix_free(b);
+  pyramid_free(pC);
 }
 
 
+struct hist_data
+{
+  float size;
+  float cdf;
+  int index;
+};
 
-// transform gradients to luminance
-float* ContrastDomain::transform_to_luminance(PYRAMID* pp){
+static int hist_data_order(const void* const v1, const void* const v2)
+{
+  if (((struct hist_data*) v1)->size < ((struct hist_data*) v2)->size)
+    return -1;
+  
+  if (((struct hist_data*) v1)->size > ((struct hist_data*) v2)->size)
+    return 1;
 
-  pyramid_calculate_scale_factor(pp); // calculate (Cx,Cy)
-  pyramid_scale_gradient(pp); // scale small gradients by (Cx,Cy);
-  pyramid_calculate_divergence(pp); // calculate divergence for the pyramid
-	
-  float* b = matrix_alloc(pp->level->cols * pp->level->rows);
-  pyramid_calculate_divergence_sum(pp, b); // calculate the sum od divergences (equal to b)
-	
-  float* x = linbcg(pp, b); // calculate luminances from gradients
-	
-  matrix_free(b);
-	
-  return x;
+  return 0;
 }
 
 
-void ContrastDomain::contrast_equalization( PYRAMID *pp )
+static int hist_data_index(const void* const v1, const void* const v2)
 {
-    // Histogram parameters
-    const int hist_size = 2048;
-    const float x_min = -2;
-    const float x_max = 4;
+  return ((struct hist_data*) v1)->index - ((struct hist_data*) v2)->index;
+}
+
+
+static void contrast_equalization( pyramid_t *pp, const float contrastFactor )
+{
+  // Count sizes
+  int total_pixels = 0;
+  pyramid_t* l = pp;
+  while (l != NULL)
+    {
+      total_pixels += l->rows * l->cols;
+      l = l->next;
+    }
+  
+  // Allocate memory
+  struct hist_data* hist = (struct hist_data*) malloc(sizeof(struct hist_data) * total_pixels);
+  if (hist == NULL)
+    {
+      fprintf(stderr, "ERROR: malloc in contrast_equalization() (size:%z)", sizeof(struct hist_data) * total_pixels);
+      exit(155);
+    }
     
-    // Build histogram
-    float hist[hist_size];
-    memset( hist, 0, sizeof( hist ) );
-    PYRAMID *l = pp;
-    while( l != NULL ) {
-      PYRAMID_LEVEL *lev = l->level;
-      const int pixels = lev->rows*lev->cols;
-      lev->M = (float*)malloc(sizeof(float)*pixels);
-      for( int c = 0; c < pixels; c++ ) {
-        lev->M[c] = log10( sqrt( lev->Gx[c]*lev->Gx[c] + lev->Gy[c]*lev->Gy[c] ) );
-        int bin = (int)((lev->M[c]-x_min)/(x_max-x_min)*hist_size+0.5);
-        if( bin < 0 ) bin = 0;
-        else if( bin >= hist_size ) bin = hist_size;
-        hist[bin]++;
-      } 
+  // Build histogram info
+  l = pp;
+  int index = 0;
+  while( l != NULL )
+    {
+      const int pixels = l->rows*l->cols;
+      for(int c = 0; c < pixels; c++ , index++)
+	{
+	  hist[index].size = sqrtf( l->Gx[c]*l->Gx[c] + l->Gy[c]*l->Gy[c] );
+	  hist[index].index = index;
+	} 
       l = l->next;
     }
+  
+  // Generate histogram
+  qsort(hist, total_pixels, sizeof(struct hist_data), hist_data_order);
 
-    // Compute cummulative histogram
-    hist[0] = 0;
-    for( int i = 1; i < hist_size; i++ )
-      hist[i] = hist[i-1] + hist[i];
+  // Calculate cdf
+  const float norm = 1.0f / (float) total_pixels;
+  for (int i = 0; i < total_pixels; i++)
+    hist[i].cdf = ((float) i) * norm;
 
-    // Renormalize cummulative histogram
-    float norm = hist[hist_size-1]*0.8;
-    for( int i = 1; i < hist_size; i++ )
-      hist[i] /= norm;
+  // Recalculate in terms of indexes
+  qsort(hist, total_pixels, sizeof(struct hist_data), hist_data_index);
+
+  //Remap gradient magnitudes
+  l = pp;
+  index = 0;
+  while( l != NULL ) {
+    const int pixels = l->rows*l->cols;
     
-    //Remap gradient magnitudes
-    l = pp;
-    while( l != NULL ) {
-      PYRAMID_LEVEL *lev = l->level;
-      const int pixels = lev->rows*lev->cols;
-      for( int c = 0; c < pixels; c++ ) {
-        int bin = (int)((lev->M[c]-x_min)/(x_max-x_min)*hist_size+0.5);
-        if( bin < 0 ) bin = 0;
-        else if( bin >= hist_size ) bin = hist_size;
-        float scale = hist[bin]/pow(10,lev->M[c]);
-        lev->Gx[c] *= scale;
-        lev->Gy[c] *= scale;        
+    for( int c = 0; c < pixels; c++ , index++ )
+      {
+	const float scale = contrastFactor * hist[index].cdf/hist[index].size;
+	l->Gx[c] *= scale;
+	l->Gy[c] *= scale;        
       } 
-      free( lev->M );
-      lev->M = NULL;
-      l = l->next;
-    }
+    l = l->next;
+  }
+
+  free(hist);
 }
 
 
 // tone mapping
-void ContrastDomain::tone_mapping(int c, int r, float* R, float* G, float* B, float* Y, float contrastFactor, float saturationFactor, progress_callback progress_cb ){
-
-  this->progress_cb = progress_cb;
+void tmo_mantiuk06_contmap(const int c, const int r, float* const R, float* const G, float* const B, float* const Y, const float contrastFactor, const float saturationFactor, const bool bcg, const int interpolate_method, const int itmax, const float tol, progress_callback progress_cb)
+{
   
-  int n = c*r;
-
-  const float clip_min = 1e-5;
+  const int n = c*r;
   
-  for(int j=0;j<n;j++){
+  /* Set the interpolation method */
+  if (interpolate_method == 3)
+    {
+      // Correct Brambley
+      matrix_upsample = matrix_upsample_brambley_correct;
+      matrix_downsample = matrix_downsample_brambley;
+    }
+  else if (interpolate_method == 2)
+    {
+      // Fast Brambley
+      matrix_upsample = matrix_upsample_brambley_fast;
+      matrix_downsample = matrix_downsample_brambley;
+    }
+  else
+    {
+      // Mantiuk
+      matrix_upsample = matrix_upsample_mantiuk;
+      matrix_downsample = matrix_downsample_mantiuk;
+    }
 
-    // clipping
-    if( R[j] < clip_min ) R[j] = clip_min;
-    if( G[j] < clip_min ) G[j] = clip_min;
-    if( B[j] < clip_min ) B[j] = clip_min;
-    if( Y[j] < clip_min ) Y[j] = clip_min;    
-	
-    R[j] = R[j] / Y[j];
-    G[j] = G[j] / Y[j];
-    B[j] = B[j] / Y[j];
-  }
-	
-  matrix_log10(n, Y); // transform Y to logaritmic scale
-	
-  PYRAMID* pp = pyramid_allocate(c,r); // create pyramid
-  pyramid_calculate_gradient(pp,Y); // calculate gradiens for pyramid
+#if 0
+  /* Normalize */
+  float Ymax = -1.0f;
+  for (int j = 0; j < n; j++)
+    if (Y[j] > Ymax)
+      Ymax = Y[j];
+
+  const float clip_min = 1e-7f*Ymax;
+  for(int j=0;j<n;j++)
+    {
+      if( R[j] < clip_min ) R[j] = clip_min;
+      if( G[j] < clip_min ) G[j] = clip_min;
+      if( B[j] < clip_min ) B[j] = clip_min;
+      if( Y[j] < clip_min ) Y[j] = clip_min;    
+    }
+#endif
+
+  //#pragma omp parallel for
+  for (int j = 0; j < n; j++)
+    {
+      R[j] /= Y[j];
+      G[j] /= Y[j];
+      B[j] /= Y[j];
+      Y[j] = log10f(Y[j]);
+    }
 	
+  pyramid_t* pp = pyramid_allocate(c,r); // create pyramid
+  pyramid_calculate_gradient(pp,Y); // calculate gradients for pyramid, destroys Y
   pyramid_transform_to_R(pp); // transform gradients to R
 
-
-  if( contrastFactor != 0 ) {
-    // Contrast mapping
-    pyramid_gradient_multiply(pp, contrastFactor); // scale gradient
-  } else {
-    // Contrast equalization
-    contrast_equalization( pp );
-  }
+  /* Contrast map */
+  if( contrastFactor > 0.0f )
+    pyramid_gradient_multiply(pp, contrastFactor); // Contrast mapping
+  else
+    contrast_equalization(pp, -contrastFactor); // Contrast equalization
 	
   pyramid_transform_to_G(pp); // transform R to gradients
-	
-  float* x = transform_to_luminance(pp); // transform gradients to luminance Y
-	
+  transform_to_luminance(pp, Y, progress_cb, bcg, itmax, tol); // transform gradients to luminance Y
+  pyramid_free(pp);
+
+  /* Renormalize luminance */
   float* temp = matrix_alloc(n);
 	
-  matrix_copy(n, x, temp); // copy x to temp
-  qsort(temp, n, sizeof(float), sort_median); // sort temp in ascending order
+  matrix_copy(n, Y, temp); // copy Y to temp
+  qsort(temp, n, sizeof(float), sort_float); // sort temp in ascending order
 	
-  float median = (temp[(int)((n-1)/2)] + temp[(int)((n-1)/2+1)]) / 2.0; // calculate median
-  // c and r should be even
-  float CUT_MARGIN = 0.1;
+  // const float median = (temp[(int)((n-1)/2)] + temp[(int)((n-1)/2+1)]) * 0.5f; // calculate median
+  const float CUT_MARGIN = 0.1f;
 	
-  float trim = (n-1) * CUT_MARGIN * 0.01;
-  float delta = trim - floor(trim);
-  float p_min = temp[(int)floor(trim)] * delta + temp[(int)ceil(trim)] * (1-delta);	
+  float trim = (n-1) * CUT_MARGIN * 0.01f;
+  float delta = trim - floorf(trim);
+  const float l_min = temp[(int)floorf(trim)] * delta + temp[(int)ceilf(trim)] * (1.0f-delta);	
 
-  trim = (n-1) * (100.0 - CUT_MARGIN) * 0.01;
-  delta = trim - floor(trim);
-  float p_max = temp[(int)floor(trim)] * delta + temp[(int)ceil(trim)] * (1-delta);	
+  trim = (n-1) * (100.0f - CUT_MARGIN) * 0.01f;
+  delta = trim - floorf(trim);
+  const float l_max = temp[(int)floorf(trim)] * delta + temp[(int)ceilf(trim)] * (1.0f-delta);	
 	
   matrix_free(temp);
 	
+  const float disp_dyn_range = 2.3f;
+  for(int j=0;j<n;j++)
+    Y[j] = (Y[j] - l_min) / (l_max - l_min) * disp_dyn_range - disp_dyn_range; // x scaled
 
-  float d;
-  if( (p_max - median) > (median - p_min) )
-    d = p_max - median;
-  else 
-    d = median - p_min;
-	
-  float l_max = median + d;
-  float l_min = median - d;
-        
-  for(int j=0;j<n;j++){
-	
-    x[j] = (x[j] - l_min) / (l_max - l_min) * 2.5 - 2.5; // x scaled to <-2.5,0> range
-  }
-	
-  for(int j=0;j<n;j++){	
-    Y[j] = powf(10, x[j]);  // transform from logaritmic to linear scale (x <0,10>)
-  }
-		
-  for(int j=0;j<n;j++){		// transform Y to (R,G,B)
-    R[j] = powf( R[j], saturationFactor) * Y[j];
-    G[j] = powf( G[j], saturationFactor) * Y[j];
-    B[j] = powf( B[j], saturationFactor) * Y[j];
-  }
-  
-  matrix_free(x);			
-  pyramid_free(pp);
-}
-
-
-
-void tmo_mantiuk06_contmap( int cols, int rows, float* R, float* G, float* B, float* Y,
-  float contrastFactor, float saturationFactor, progress_callback progress_report )
-{
-
-  ContrastDomain contrast;	
-  contrast.tone_mapping( cols, rows, R, G, B, Y, contrastFactor, saturationFactor, progress_report );  
+  /* Transform to linear scale RGB */
+  //#pragma omp parallel for
+  for(int j=0;j<n;j++)
+    {
+      Y[j] = exp10f(Y[j]);
+      R[j] = powf( R[j], saturationFactor) * Y[j];
+      G[j] = powf( G[j], saturationFactor) * Y[j];
+      B[j] = powf( B[j], saturationFactor) * Y[j];
+    }
 }
 
-
-
diff -Nurd pfstmo-1.1/src/mantiuk06/contrast_domain.h pfstmo-1.1-ejb/src/mantiuk06/contrast_domain.h
--- pfstmo-1.1/src/mantiuk06/contrast_domain.h	2007-06-15 12:20:00.000000000 +0100
+++ pfstmo-1.1-ejb/src/mantiuk06/contrast_domain.h	2008-01-03 22:39:09.000000000 +0000
@@ -32,9 +32,6 @@
  * $Id: contrast_domain.h,v 1.3 2007/06/15 10:28:42 rafm Exp $
  */
 
-#include <array2d.h>
-
-
 /** Progress reporting callback function:
  *    void report_progress( int progress )
  *  'progress' parameter grows from 0 to 100
@@ -51,7 +48,12 @@
  * @param Y luminance channel
  * @param contrastFactor contrast scaling factor (in 0-1 range)
  * @param saturationFactor color desaturation (in 0-1 range)
+ * @param bcg true if to use BiConjugate Gradients, false if to use Conjugate Gradients
+ * @param interpolate_method 1 = Mantiuk, 2 = fast Brambely, 3 = correct Brambley
+ * @param itmax maximum number of iterations for convergence (typically 50)
+ * @param tol tolerence to get within for convergence (typically 1e-3)
  * @param progress_cb callback function that reports progress
  */
 void tmo_mantiuk06_contmap( int cols, int rows, float* R, float* G, float* B, float* Y,
-  float contrastFactor, float saturationFactor, progress_callback progress_cb );
+			    float contrastFactor, float saturationFactor, bool bcg,
+			    int interpolate_method, int itmax, float tol, progress_callback progress_cb );
diff -Nurd pfstmo-1.1/src/mantiuk06/pfstmo_mantiuk06.1 pfstmo-1.1-ejb/src/mantiuk06/pfstmo_mantiuk06.1
--- pfstmo-1.1/src/mantiuk06/pfstmo_mantiuk06.1	2007-06-20 11:50:07.000000000 +0100
+++ pfstmo-1.1-ejb/src/mantiuk06/pfstmo_mantiuk06.1	2008-01-03 22:39:09.000000000 +0000
@@ -23,7 +23,7 @@
 The result of this TMO requires gamma correction.
 .SH OPTIONS
 .TP
---equalize-contrast, -e
+--equalize-contrast <val>, -e <val>
 
 Use the \fIcontrast equalization\fR algorithm. If this option is not
 specified, the \fIcontrast mapping\fR algorithm will be used. 
@@ -38,7 +38,7 @@
 Contrast scaling factor (values 0-1) from that determines how much
 large contrast magnitudes should be reduced. This option can be used
 only with the \fIcontrast mapping\fR algorithm. The lower value results in
-a sharper image. Default value: 0.3
+a sharper image. Default value: 0.1
 .TP
 --saturation <val>, -s <val>
 
diff -Nurd pfstmo-1.1/src/mantiuk06/pfstmo_mantiuk06.cpp pfstmo-1.1-ejb/src/mantiuk06/pfstmo_mantiuk06.cpp
--- pfstmo-1.1/src/mantiuk06/pfstmo_mantiuk06.cpp	2007-06-20 11:50:07.000000000 +0100
+++ pfstmo-1.1-ejb/src/mantiuk06/pfstmo_mantiuk06.cpp	2008-01-03 22:39:09.000000000 +0000
@@ -58,8 +58,12 @@
 void printHelp()
 {
   fprintf( stderr, PROG_NAME " (" PACKAGE_STRING ") : \n"
-    "\t[--factor <val>] [--saturation <val>] [--equalize-contrast]\n"
-    "\t[--help] \n"
+    "\t[--factor <val>] [--saturation <val>] [--equalize-contrast <val>]\n"
+    "\t[--interpolate <method>] [--itmax <val>] [--tol <val>] [--help] [--bcg]\n"
+    "\nInterpolate method is:\n"
+    "  1  Original Mantiuk (slow)\n"
+    "  2  Fast Brambley (faster, artifacts?)\n"
+    "  3  Correct Brambley (very slow)\n\n"
     "See man page for more information.\n" );
 }
 
@@ -82,24 +86,31 @@
 {
 
   //--- default tone mapping parameters;
-  float scaleFactor = 0.3f;
+  float scaleFactor = 0.1f;
   float saturationFactor = 0.8f;
-  bool cont_map = false, cont_eq = false;
+  bool cont_map = false, cont_eq = false, bcg = false;
+  int interpolate_method = 1;
+  int itmax = 50;
+  float tol = 1e-3;
 
   //--- process command line args
 
   static struct option cmdLineOptions[] = {
     { "help", no_argument, NULL, 'h' },
     { "verbose", no_argument, NULL, 'v' },
+    { "bcg", no_argument, NULL, 'b' },
     { "factor", required_argument, NULL, 'f' },
     { "saturation", required_argument, NULL, 's' },
-    { "equalize-contrast", no_argument, NULL, 'e' },
+    { "interpolate", required_argument, NULL, 'i' },
+    { "itmax", required_argument, NULL, 'm' },
+    { "tol", required_argument, NULL, 't' },
+    { "equalize-contrast", required_argument, NULL, 'e' },
     { NULL, 0, NULL, 0 }
   };
 
   int optionIndex = 0;
   while( 1 ) {
-    int c = getopt_long (argc, argv, "vhf:s:e", cmdLineOptions, &optionIndex);
+    int c = getopt_long (argc, argv, "vhbf:s:e:i:m:t:", cmdLineOptions, &optionIndex);
     if( c == -1 ) break;
     switch( c ) {
     case 'h':
@@ -108,9 +119,14 @@
     case 'v':
       verbose = true;
       break;
+    case 'b':
+      bcg = true;
+      break;
     case 'e':
-      scaleFactor = 0;
       cont_eq = true;
+      scaleFactor = 0.0f - (float)strtod( optarg, NULL );
+      if( scaleFactor > 0.0f )
+        throw pfs::Exception("incorrect contrast scale factor, accepted range is any positive number");
       break;
     case 's':
       saturationFactor = (float)strtod( optarg, NULL );
@@ -123,6 +139,21 @@
       if( scaleFactor < 0.0f || scaleFactor > 1.0f )
         throw pfs::Exception("incorrect contrast scale factor, accepted range is (0..1)");
       break;
+    case 'i':
+      interpolate_method = atoi( optarg );
+      if (interpolate_method < 1 || interpolate_method > 3)
+        throw pfs::Exception("incorrect interpolation method, accepted values are 1, 2 or 3.");
+      break;
+    case 'm':
+      itmax = atoi( optarg );
+      if (itmax < 1)
+        throw pfs::Exception("incorrect maximum number of iterations.");
+      break;
+    case 't':
+      tol = (float)strtod( optarg, NULL );
+      if( tol < 0.0f || tol > 1.0f )
+        throw pfs::Exception("incorrect convergence tolerance, accepted range is (0..1)");
+      break;
     case '?':
       throw QuietException();
     case ':':
@@ -133,8 +164,9 @@
   if( cont_eq && cont_map )
     throw pfs::Exception( "the 'factor' parameter cannot be used in combination with contrast equalization" );
 
-  if( scaleFactor == 0 ) {
+  if( scaleFactor < 0 ) {
     VERBOSE_STR << "algorithm: contrast equalization" << endl;
+    VERBOSE_STR << "contrast scale factor = " << -scaleFactor << endl;
   } else {
     VERBOSE_STR << "algorithm: contrast mapping" << endl;
     VERBOSE_STR << "contrast scale factor = " << scaleFactor << endl;
@@ -142,6 +174,30 @@
   
   VERBOSE_STR << "saturation factor = " << saturationFactor << endl;
   
+  if (bcg)
+    {
+      VERBOSE_STR << "using biconjugate gradients (itmax = " << itmax << ", tol = " << tol << ")." << endl;
+    }
+  else
+    {
+      VERBOSE_STR << "using conjugate gradients (itmax = " << itmax << ", tol = " << tol << ")." << endl;
+    }
+
+  if (interpolate_method == 1)
+    {
+      VERBOSE_STR << "interpolating using original Mantiuk." << endl;
+    }
+  else if (interpolate_method == 2)
+    {
+      VERBOSE_STR << "interpolating using fast Brambley." << endl;
+    }
+  else if (interpolate_method == 3)
+    {
+      VERBOSE_STR << "interpolating using correct Brambley." << endl;
+    }
+
+    
+
   pfs::DOMIO pfsio;
 	
   pfs::Frame *frame = pfsio.readFrame( stdin );
@@ -159,11 +215,10 @@
   
   pfs::transformColorSpace( pfs::CS_XYZ, inX, inY, inZ, pfs::CS_RGB, inX, &R, inZ );
 
-  tmo_mantiuk06_contmap( cols, rows, inX->getRawData(), R.getRawData(), inZ->getRawData(),
-    inY->getRawData(),
-    scaleFactor, saturationFactor, progress_report );	
+  tmo_mantiuk06_contmap( cols, rows, inX->getRawData(), R.getRawData(), inZ->getRawData(), inY->getRawData(),
+			 scaleFactor, saturationFactor, bcg, interpolate_method, itmax, tol, progress_report );	
 
-  pfs::transformColorSpace( pfs::CS_RGB, inX, inY, inZ, pfs::CS_XYZ, inX, inY, inZ );
+  pfs::transformColorSpace( pfs::CS_RGB, inX, &R, inZ, pfs::CS_XYZ, inX, inY, inZ );
   frame->getTags()->setString("LUMINANCE", "RELATIVE");
   
   pfsio.writeFrame( frame, stdout );