/*
 * new_shift_finder.hpp  --  Part of the CinePaint plug-in "Bracketing_to_HDR"
 *
 * Copyright 2005  Hartmut Sbosny  <hartmut.sbosny@gmx.de>
 *
 * LICENSE:
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*
   new_shift_finder.hpp

   Helper templates for Camera.hpp.

   80 percent of the code consists of variants for speed tests.

   Abbreviations: LO = "Links-Oben"   (top-left corner),
                  RU = "Rechts-Unten" (bottom-right corner)
*/
#ifndef SHIFT_FINDER_HPP
#define SHIFT_FINDER_HPP


#include <vector>          // std::vector for the row/column sum tables

#include "br_types.hpp"    // debug switches
#include "Rgb.hpp"
#include "Rgb_util.hpp"

/*
 * Switches selecting between code variants for speed tests.
 *   HDR_FAST:       variant number. The original notes list the values
 *                   0, 1 and 2 without describing them.
 *   USE_REFERENCES: when defined, find_shift_1_() binds a const reference
 *                   to each pixel once instead of indexing the image array
 *                   separately for every colour channel.
 */
#define HDR_FAST 1
#define USE_REFERENCES

/**
 * Declarations of the templates defined in this file.
 */
template <class Unsign>
double
correl (char channel, int Nx, int Ny,
        Array2D< Rgb<Unsign> >& A, int xA, int yA,
        Array2D< Rgb<Unsign> >& B, int xB, int yB);

template <class Unsign>
Rgb<double>
correl (int Nx, int Ny,
        Array2D< Rgb<Unsign> >& A, int xA, int yA,
        Array2D< Rgb<Unsign> >& B, int xB, int yB);

template <class Unsign>                           class ShiftFinder;
template <class Unsign, class sum_t>              class ShiftFinder_2;
template <class Unsign, class mul_t, class sum_t> class ShiftFinder_3;
template <class Unsign, class mul_t, class sum_t> class ShiftFinder_4;




/**
 * correl()
 *
 * Stand-alone function, intended for testing.
 * Computes the correlation coefficient of two [Nx,Ny] windows taken from the
 * images A and B, whose top-left ("LO") corners are at (xA,yA) and (xB,yB).
 *
 * @param channel  'R', 'G' or 'B': the colour channel to correlate.
 * @param Nx,Ny    width and height of the compared windows.
 * @param A,xA,yA  first image and top-left corner of its window.
 * @param B,xB,yB  second image and top-left corner of its window.
 * @return the correlation coefficient. 0.0 is returned when the coefficient
 *    is undefined (no usable pixel, or a window without variance);
 *    CORR_RES_ERR is returned for an invalid channel token.
 *
 * The type Unsign matters for the bad-pixel test, which is only compiled
 * with HDR_WORKING_RANGE: usable values lie strictly inside [0...z_max].
 */
template <class Unsign>
double
correl (char channel, int Nx, int Ny,
        Array2D< Rgb<Unsign> >& A, int xA, int yA,
        Array2D< Rgb<Unsign> >& B, int xB, int yB)
{
#ifdef HDR_DEBUG_1
    printf ("\n%s [extern, %c-Kanal]:\n", __func__, channel);
#endif

#ifdef HDR_WORKING_RANGE
    // All-ones value of Unsign. Author's caveat: this is the true maximum
    // ONLY for the maximal `CCDBits'.
    Unsign z_max = 0-1;
#endif
    double sA=0.0, sB=0.0, s2A=0.0, s2B=0.0, sAB=0.0;
    unsigned bad=0;

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        double a,b;
        switch (channel)
        {
        case 'R':
            a = A [yA+i][xA+j].r;
            b = B [yB+i][xB+j].r;
            break;
        case 'G':
            a = A [yA+i][xA+j].g;
            b = B [yB+i][xB+j].g;
            break;
        case 'B':
            a = A [yA+i][xA+j].b;
            b = B [yB+i][xB+j].b;
            break;
        default:
            printf ("Ungueltiges Kanal-Token %c\n", channel);
            return CORR_RES_ERR;
        }
#ifdef HDR_WORKING_RANGE
        // Skip pixels at the limits of the working range in either image.
        if (a==0 || a>=z_max || b==0 || b>=z_max)
        {   bad++;
            continue;
        }
#endif
        sA  += a;
        sB  += b;
        s2A += a*a;
        s2B += b*b;
        sAB += a * b;
    }

    int n = Nx * Ny - bad;             // number of pixels actually summed
    double cov = 0.0;                  // strictly speaking n*cov, not cov
    double rho = 0.0;
    if (n > 0)                         // n == 0 would divide by zero below
    {
        cov = sAB - sA*sB/n;
        const double varA = s2A - sA*sA/n;     // n * variance of A's window
        const double varB = s2B - sB*sB/n;     // n * variance of B's window

        // A constant window has no defined correlation; rounding may even
        // turn its variance term slightly negative. Report 0 instead of
        // +-inf or NaN in those cases.
        if (cov != 0.0 && varA > 0.0 && varB > 0.0)
            rho = cov / sqrt(varA * varB);
    }

#ifdef HDR_DEBUG_1
    printf ("\tsA = %f, s2A = %f\n", sA, s2A);
    printf ("\tsB = %f, s2B = %f,  sAB = %f\n", sB, s2B, sAB);
    printf ("\tcov(A,B) = %f, rho(AB) = %f\n", (n > 0) ? cov/n : 0.0, rho);
#endif

    return rho;
}

/**
 * Like the channel-wise correl(), but for all three channels simultaneously
 * and without HDR_WORKING_RANGE, which is of no use for correlations.
 *
 * @param Nx,Ny    width and height of the compared windows.
 * @param A,xA,yA  first image and top-left corner of its window.
 * @param B,xB,yB  second image and top-left corner of its window.
 * @return the correlation coefficients of the R, G and B channels. A channel
 *    whose coefficient is undefined (empty window, or a window without
 *    variance) yields 0.0.
 */
template <class Unsign>
Rgb<double>
correl (int Nx, int Ny,
        Array2D< Rgb<Unsign> >& A, int xA, int yA,
        Array2D< Rgb<Unsign> >& B, int xB, int yB)
{
#ifdef HDR_DEBUG_1
    printf ("\n%s [extern]:\n", __func__);
#endif

    Rgb<double> sA(0.0), sB(0.0), s2A(0.0), s2B(0.0), sAB(0.0);

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        // Conversion double <-- Unsign creates new objects, hence no
        // references are possible here.
        Rgb<double> a = A [yA+i][xA+j];
        Rgb<double> b = B [yB+i][xB+j];
        sA  += a;
        sB  += b;
        s2A += a*a;
        s2B += b*b;
        sAB += a * b;
    }

    int n = Nx * Ny;
    Rgb<double> cov(0.0);              // strictly speaking n*cov, not cov
    Rgb<double> rho(0.0);

    if (n > 0)                         // n == 0 would divide by zero below
    {
        cov = sAB - sA*sB/(double)n;

        // Per channel: a window without variance has no defined correlation,
        // and rounding may make the variance term slightly negative. Leave
        // rho at 0 in those cases instead of producing +-inf or NaN.
        double varA, varB;

        varA = s2A.r - sA.r*sA.r/n;
        varB = s2B.r - sB.r*sB.r/n;
        if (cov.r != 0.0 && varA > 0.0 && varB > 0.0)
            rho.r = cov.r / sqrt(varA * varB);

        varA = s2A.g - sA.g*sA.g/n;
        varB = s2B.g - sB.g*sB.g/n;
        if (cov.g != 0.0 && varA > 0.0 && varB > 0.0)
            rho.g = cov.g / sqrt(varA * varB);

        varA = s2A.b - sA.b*sA.b/n;
        varB = s2B.b - sB.b*sB.b/n;
        if (cov.b != 0.0 && varA > 0.0 && varB > 0.0)
            rho.b = cov.b / sqrt(varA * varB);
    }

#ifdef HDR_DEBUG_1
    std::cout << "\tsA  = " << sA  << '\n';
    std::cout << "\ts2A = " << s2A << '\n';
    std::cout << "\tsB  = " << sB  << '\n';
    std::cout << "\ts2B = " << s2B << '\n';
    std::cout << "\tsAB = " << sAB << '\n';
    if (n > 0)
        std::cout << "\tcov = " << cov / (double)n << '\n';
    std::cout << "\trho = " << rho << '\n';
#endif

    return rho;
}



/**===================================================================
 * ShiftFinder  --  class
 *
 * Searches the correlation maximum of an [Nx,Ny] region of A with its
 * top-left ("LO") corner at (xA,yA) inside an [Mx,My] region of B with its
 * top-left corner at (xB,yB), where Mx >= Nx and My >= Ny. A and B therefore
 * do not enter symmetrically: a larger area of B is needed!
 * In the accelerated variant aimed at here, taking bad pixels into account
 * is difficult and has not been achieved so far. The starting point should
 * be that the whole region has to be traversed once at every position for
 * sAB anyway; bad-pixel information could be gathered during that pass.
 */
template <class Unsign>
class ShiftFinder {

    // The two images; copy-initialised from the constructor arguments.
    TNT::Array2D< Rgb<Unsign> > A,B;

  public:

    ShiftFinder (TNT::Array2D< Rgb<Unsign> >& A_,
                 TNT::Array2D< Rgb<Unsign> >& B_);

    // All find_shift_*() variants below take:
    //   (xA,yA), Nx, Ny: top-left corner and size of the window in A;
    //   (xB,yB), Mx, My: top-left corner and size of the search region in B.

    // R channel only; column/row sums taken directly from the image.
    XYindex find_shift_r    (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    // R channel only; uses pre-tabulated row and column sums of B.
    XYindex find_shift_1r   (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    // Like find_shift_1r(), but all three channels in one pass.
    Rgb<XYindex>
            find_shift_1_   (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    // Like find_shift_1_(), using Rgb variables for the partial sums.
    Rgb<XYindex>
            find_shift_1all (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    Rgb<XYindex>
            find_shift_1    (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);

    template <typename sum_t>
    Rgb<XYindex>
            find_shift_2    (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);

    // NOTE: takes a different parameter list than the find_shift_*() variants
    // above; according to the note on find_shift() its return value also has
    // a different meaning.
    XYindex find_shift_sym  (int nx, int ny, int mx, int my,
                             int xcA, int ycA, int xcB, int ycB);
};

/**
 * Constructor: remembers the two images between which shifts are searched.
 * The members A and B are copy-initialised from A_ and B_.
 */
template <class Unsign>
ShiftFinder<Unsign>::ShiftFinder (TNT::Array2D< Rgb<Unsign> >& A_,
                                  TNT::Array2D< Rgb<Unsign> >& B_)
    : A (A_),
      B (B_)
{
}

/**
 * find_shift()  --   determines the shift
 *
 * @return: Shift from the top-left (LO) point of A to that of B, i.e. the
 *    position of A's top-left point inside the [0...Mx-1] x [0...My-1]
 *    rectangle (always >= 0; the search loops only produce values up to
 *    Mx-Nx and My-Ny).
 *
 * ATTENTION: The return values of find_shift() and find_shift_sym() mean
 *    different things!
 */

 
/**
 * find_shift_r()  --  R channel only.
 *
 * Slides the [Nx,Ny] window of A with top-left corner (xA,yA) over the
 * [Mx,My] region of B with top-left corner (xB,yB) and returns the offset
 * with the largest correlation coefficient of the R channel.
 * The sums over B are updated incrementally (one row or column leaves, one
 * enters); the mixed sum sAB is recomputed completely at every position.
 *
 * @param xA,yA  top-left corner of the window in A.
 * @param Nx,Ny  size of the window.
 * @param xB,yB  top-left corner of the search region in B.
 * @param Mx,My  size of the search region (Mx >= Nx, My >= Ny).
 * @return offset (x in [0...Mx-Nx], y in [0...My-Ny]) of the best position
 *    relative to (xB,yB). Positions whose correlation is undefined (a
 *    window without variance) are rated 0.
 *
 * Prints its parameters and the result to stdout.
 */
template <class Unsign>
XYindex
ShiftFinder<Unsign>::find_shift_r (int xA, int yA, int Nx, int Ny,
                                   int xB, int yB, int Mx, int My)
{
    printf ("\n%s:\n",__func__);
    printf ("ohne Rgb, direkt auf double, nur R-Kanal\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);

    // Both regions must lie completely inside their images, offsets included:
    // A is read at [yA...yA+Ny-1][xA...xA+Nx-1], B at [yB...yB+My-1][xB...xB+Mx-1].
    assert (xA >= 0 && xA + Nx <= A.dim2());
    assert (yA >= 0 && yA + Ny <= A.dim1());
    assert (xB >= 0 && xB + Mx <= B.dim2());
    assert (yB >= 0 && yB + My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    XYindex id;
    double rho_max = -2.0;    // smaller than the smallest possible value

    // For the starting position compute all sums once completely,
    // including everything about A that can be determined in advance.

    double sA=0.0, s2A=0.0;
    double sB=0.0, s2B=0.0, sAB=0.0;

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        double a = A[yA+i][xA+j].r;
        double b = B[yB+i][xB+j].r;
        sA  += a;
        sB  += b;
        s2A += a*a;
        s2B += b*b;
        sAB += a * b;
    }
    int   n = Nx * Ny;
    const double varA = s2A - sA*sA/n;  // n * variance of A's window (fixed)

#ifdef HDR_DEBUG_1
    printf ("\tsA=%f, s2A=%f\n", sA,s2A);
#endif

    double sB_0  = sB;        // remember the values at the left border
    double s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        if (p > 0)            // move down by one row
        {   // sB_0 = sB_0 - row(p-1) + row(p-1+Ny)
            for (int j=0; j < Nx; j++)
            {
                double b1 = B[yB+p-1   ][xB+j].r;    // double <-- Unsign
                double b2 = B[yB+p-1+Ny][xB+j].r;
                sB_0  += b2 - b1;
                s2B_0 += b2*b2 - b1*b1;
            }
            sB  = sB_0;
            s2B = s2B_0;

            // Multiply in double: the product of two integer pixel values
            // would be formed in the promoted integer type and overflows
            // for 16-bit (or wider) Unsign.
            sAB = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
                sAB += static_cast<double>(A[yA+i][xA+j].r)
                       * B[yB+p+i][xB+j].r;          // q=0
        }
        // at the left border (q=0)
        double cov, rho, varB;
        cov  = sAB - sA*sB/n;       // strictly speaking n*cov, not cov
        varB = s2B - sB*sB/n;
        // Undefined correlation (no variance, or a variance term driven
        // non-positive by rounding) counts as 0 rather than +-inf/NaN, which
        // could otherwise win the maximum search.
        if (cov != 0.0 && varA > 0.0 && varB > 0.0)
             rho = cov / sqrt(varA * varB);
        else rho = 0.0;

#ifdef HDR_DEBUG_1
        printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", 0,p, xB, yB+p);
        printf ("\tsB=%f, s2B=%f, sAB=%f,  rho=%f\n", sB,s2B,sAB, rho);
        correl ('R', Nx,Ny, A,xA,yA, B,xB,yB+p);
#endif
        if (rho > rho_max)
        {   rho_max = rho;
            id.x = 0;
            id.y = p;
        }

        // move right by q
        for (int q=1; q <= Mx-Nx; q++)
        {
            sAB = 0.0;
            // sB = sB - column(q-1) + column(q-1+Nx)
            for (int i=0; i < Ny; i++)
            {
                double b1 = B[yB+p+i][xB+q-1   ].r;
                double b2 = B[yB+p+i][xB+q-1+Nx].r;
                sB  += b2 - b1;
                s2B += b2*b2 - b1*b1;

                for (int j=0; j < Nx; j++)
                    sAB += static_cast<double>(A[yA+i][xA+j].r)
                           * B[yB+p+i][xB+q+j].r;
            }
            cov  = sAB - sA*sB/n;   // strictly speaking n*cov, not cov
            varB = s2B - sB*sB/n;
            if (cov != 0.0 && varA > 0.0 && varB > 0.0)
                 rho = cov / sqrt(varA * varB);
            else rho = 0.0;
#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            printf ("\tsB=%f, s2B=%f, sAB=%f,  rho=%f\n", sB,s2B,sAB, rho);
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho > rho_max)
            {   rho_max = rho;
                id.x = q;
                id.y = p;
            }
        }
    }
    printf ("\tR: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id.x,id.y, xB+id.x, yB+id.y, rho_max);

    return id;
}

/**
    find_shift_1r()

    Theoretically faster variant using pre-tabulated sums of B.
    For the time being R channel only.

    NOTE: If the outer loop over the [Mx,My] region runs from top to bottom
    (y dim) and the inner one from left to right (x dim), the column sums
    have to be shifted down on every new outer pass; as row sums, however,
    those on the left side suffice. The other way round it would be the
    other way round.
        |-|---|-|------| |   |      |-----|--------|
        | |   | | ->   | Ny  |      |-----|        |
        |_|___|_|______| |   My     |_____|________|
        |     |        |     |      |_____|        |
        |_____|________|     |      |_____|________|
           Nx
        ----- Mx ------

    @param xA,yA  top-left corner of the window in A.
    @param Nx,Ny  size of the window.
    @param xB,yB  top-left corner of the search region in B.
    @param Mx,My  size of the search region (Mx >= Nx, My >= Ny).
    @return offset (x in [0...Mx-Nx], y in [0...My-Ny]) of the position with
       the largest R-channel correlation, relative to (xB,yB). Positions
       whose correlation is undefined (a window without variance) are
       rated 0.

    Prints its parameters and the result to stdout.
*/
template <class Unsign>
XYindex
ShiftFinder<Unsign>::find_shift_1r (int xA, int yA, int Nx, int Ny,
                                    int xB, int yB, int Mx, int My)
{
    printf ("\n%s:\n",__func__);
    printf ("ohne Rgb, direkt auf double, nur R-Kanal\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);

    // Both regions must lie completely inside their images, offsets included:
    // A is read at [yA...yA+Ny-1][xA...xA+Nx-1], B at [yB...yB+My-1][xB...xB+Mx-1].
    assert (xA >= 0 && xA + Nx <= A.dim2());
    assert (yA >= 0 && yA + Ny <= A.dim1());
    assert (xB >= 0 && xB + Mx <= B.dim2());
    assert (yB >= 0 && yB + My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    // Heap-allocated tables instead of variable-length arrays, which are
    // not standard C++.
    std::vector<double> cols (Mx), cols2 (Mx);  // Mx column sums (plain & squares)
    std::vector<double> rows (My), rows2 (My);  // My row sums

    for (int p=0; p < Mx; p++)
    {   double s=0.0, s2=0.0;
        for (int i=0; i < Ny; i++)              // column sum over the top Ny rows
        {   double b = B [yB+i][xB+p].r;        // double <- Unsign
            s  += b;
            s2 += b*b;
        }
        cols [p] = s;
        cols2[p] = s2;
    }
    for (int p=0; p < My; p++)
    {   double s=0.0, s2=0.0;
        for (int i=0; i < Nx; i++)              // row sum over the left Nx columns
        {   double b = B [yB+p][xB+i].r;        // double <- Unsign
            s  += b;
            s2 += b*b;
        }
        rows [p] = s;
        rows2[p] = s2;
    }

    XYindex id;
    double rho_max = -2.0;    // smaller than the smallest possible value

    // For the starting position compute all sums (sA,...,s2B,sAB) once
    // completely; the B sums are taken from the row sums.

    double sA=0.0, s2A=0.0;
    double sB=0.0, s2B=0.0, sAB=0.0;

    for (int i=0; i < Ny; i++)
    {   sB  += rows [i];
        s2B += rows2[i];
    }

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        double a = A[yA+i][xA+j].r;
        sA  += a;
        s2A += a*a;
        sAB += a * B[yB+i][xB+j].r;
    }

    int   n = Nx * Ny;
    const double varA = s2A - sA*sA/n;  // n * variance of A's window (fixed)

#ifdef HDR_DEBUG_1
    printf ("\tsA=%f, s2A=%f\n", sA,s2A);
#endif

    double sB_0  = sB;        // remember the B sums at the left border
    double s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        if (p > 0)            // move down by one row
        {
            // sB_0 = sB_0 + row(p-1+Ny) - row(p-1)
            sB_0  += rows [p-1+Ny] - rows [p-1];
            s2B_0 += rows2[p-1+Ny] - rows2[p-1];

            // shift all column sums down by 1
            for (int i=0; i < Mx; i++)
            {   double b1 = B [yB+p-1   ][xB+i].r;    // to subtract
                double b2 = B [yB+p-1+Ny][xB+i].r;    // to add
                cols [i] += b2 - b1;
                cols2[i] += b2*b2 - b1*b1;
            }
            sB  = sB_0;
            s2B = s2B_0;

            // The mixed sum always has to be computed completely.
            // Multiply in double: the product of two integer pixel values
            // would be formed in the promoted integer type, and the product
            // of two maximal 16-bit values no longer fits into a 32-bit int.
            sAB = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
                sAB += static_cast<double>(A[yA+i][xA+j].r)
                       * B[yB+p+i][xB+j].r;           // q=0
        }
        // at the left border (q=0)
        double cov, rho, varB;
        cov  = sAB - sA*sB/n;       // strictly speaking n*cov, not cov
        varB = s2B - sB*sB/n;
        // Undefined correlation (no variance, or a variance term driven
        // non-positive by rounding) counts as 0 rather than +-inf/NaN, which
        // could otherwise win the maximum search.
        if (cov != 0.0 && varA > 0.0 && varB > 0.0)
             rho = cov / sqrt(varA * varB);
        else rho = 0.0;

#ifdef HDR_DEBUG_1
        printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", 0, p, xB, yB+p);
        printf ("\tsB=%f, s2B=%f, sAB=%f,  rho=%f\n", sB,s2B,sAB, rho);
        correl ('R', Nx,Ny, A,xA,yA, B,xB,yB+p);
#endif
        if (rho > rho_max)
        {   rho_max = rho;
            id.x = 0;
            id.y = p;
        }

        // move right by q
        for (int q=1; q <= Mx-Nx; q++)
        {
            // sB = sB + column(q-1+Nx) - column(q-1)
            sB  += cols [q-1+Nx] - cols [q-1];
            s2B += cols2[q-1+Nx] - cols2[q-1];

            // mixed sum completely, again multiplied in double
            sAB = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
                sAB += static_cast<double>(A[yA+i][xA+j].r)
                       * B[yB+p+i][xB+q+j].r;

            cov  = sAB - sA*sB/n;   // strictly speaking n*cov, not cov
            varB = s2B - sB*sB/n;
            if (cov != 0.0 && varA > 0.0 && varB > 0.0)
                 rho = cov / sqrt(varA * varB);
            else rho = 0.0;
#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            printf ("\tsB=%f, s2B=%f, sAB=%f,  rho=%f\n", sB,s2B,sAB, rho);
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho > rho_max)
            {   rho_max = rho;
                id.x = q;
                id.y = p;
            }
        }
    }
    printf ("\tR: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id.x,id.y, xB+id.x, yB+id.y, rho_max);

    return id;
}

/**
    find_shift_1_():

    With some options, calling find_shift_1r() three times suggested better
    efficiency than everything else. Hence this variant, which now really
    handles all three channels simultaneously.

    @param xA,yA  top-left corner of the window in A.
    @param Nx,Ny  size of the window.
    @param xB,yB  top-left corner of the search region in B.
    @param Mx,My  size of the search region (Mx >= Nx, My >= Ny).
    @return per channel the offset (x in [0...Mx-Nx], y in [0...My-Ny]) of
       the position with the largest correlation, relative to (xB,yB).
       Positions whose correlation is undefined (a window without variance)
       are rated 0.

    With USE_REFERENCES each pixel is bound to a const reference once;
    otherwise the image is indexed separately for every channel.
    Prints its parameters and the results to stdout.
*/
template <class Unsign>
Rgb<XYindex>
ShiftFinder<Unsign>::find_shift_1_ (int xA, int yA, int Nx, int Ny,
                                    int xB, int yB, int Mx, int My)
{
    printf ("\n%s:\n",__func__);
    printf ("ohne Rgb, direkt auf double, alle 3 Kanaele, ");
#ifdef USE_REFERENCES
    printf ("mit Referenzen\n");
#else
    printf ("ohne Referenzen\n");
#endif
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);

    // Both regions must lie completely inside their images, offsets included:
    // A is read at [yA...yA+Ny-1][xA...xA+Nx-1], B at [yB...yB+My-1][xB...xB+Mx-1].
    assert (xA >= 0 && xA + Nx <= A.dim2());
    assert (yA >= 0 && yA + Ny <= A.dim1());
    assert (xB >= 0 && xB + Mx <= B.dim2());
    assert (yB >= 0 && yB + My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    // Heap-allocated tables instead of variable-length arrays, which are
    // not standard C++.
    std::vector<double>
           cols_r (Mx), cols2_r (Mx),   // Mx column sums (plain & squares)
           cols_g (Mx), cols2_g (Mx),
           cols_b (Mx), cols2_b (Mx);
    std::vector<double>
           rows_r (My), rows2_r (My),   // My row sums
           rows_g (My), rows2_g (My),
           rows_b (My), rows2_b (My);

    for (int p=0; p < Mx; p++)
    {   double s_r=0.0, s2_r=0.0,
               s_g=0.0, s2_g=0.0,
               s_b=0.0, s2_b=0.0;
        for (int i=0; i < Ny; i++)              // column sum over the top Ny rows
        {
#ifdef USE_REFERENCES
            const Rgb<Unsign>& _B = B[yB+i][xB+p];
            double b_r = _B.r;                  // double <- Unsign
            double b_g = _B.g;
            double b_b = _B.b;
#else
            double b_r = B [yB+i][xB+p].r;      // double <- Unsign
            double b_g = B [yB+i][xB+p].g;
            double b_b = B [yB+i][xB+p].b;
#endif
            s_r  += b_r;
            s_g  += b_g;
            s_b  += b_b;
            s2_r += b_r * b_r;
            s2_g += b_g * b_g;
            s2_b += b_b * b_b;
        }
        cols_r [p] = s_r;
        cols_g [p] = s_g;
        cols_b [p] = s_b;
        cols2_r[p] = s2_r;
        cols2_g[p] = s2_g;
        cols2_b[p] = s2_b;
    }
    for (int p=0; p < My; p++)
    {
        double s_r=0.0, s2_r=0.0,
               s_g=0.0, s2_g=0.0,
               s_b=0.0, s2_b=0.0;
        for (int i=0; i < Nx; i++)              // row sum over the left Nx columns
        {
#ifdef USE_REFERENCES
            const Rgb<Unsign>& _B = B[yB+p][xB+i];
            double b_r = _B.r;                  // double <- Unsign
            double b_g = _B.g;
            double b_b = _B.b;
#else
            double b_r = B [yB+p][xB+i].r;      // double <- Unsign
            double b_g = B [yB+p][xB+i].g;
            double b_b = B [yB+p][xB+i].b;
#endif
            s_r  += b_r;
            s_g  += b_g;
            s_b  += b_b;
            s2_r += b_r * b_r;
            s2_g += b_g * b_g;
            s2_b += b_b * b_b;
        }
        rows_r [p] = s_r;
        rows_g [p] = s_g;
        rows_b [p] = s_b;
        rows2_r[p] = s2_r;
        rows2_g[p] = s2_g;
        rows2_b[p] = s2_b;
    }

    XYindex id_r, id_g, id_b;
    double rho_max_r = -2.0;            // smaller than the smallest possible value
    double rho_max_g = -2.0;
    double rho_max_b = -2.0;

    // For the starting position compute all sums (sA,...,s2B,sAB) once
    // completely; the B sums are taken from the row sums.

    double sA_r=0.0, s2A_r=0.0,
           sA_g=0.0, s2A_g=0.0,
           sA_b=0.0, s2A_b=0.0;
    double sB_r=0.0, s2B_r=0.0, sAB_r=0.0,
           sB_g=0.0, s2B_g=0.0, sAB_g=0.0,
           sB_b=0.0, s2B_b=0.0, sAB_b=0.0;

    for (int i=0; i < Ny; i++)
    {   sB_r  += rows_r [i];
        sB_g  += rows_g [i];
        sB_b  += rows_b [i];
        s2B_r += rows2_r[i];
        s2B_g += rows2_g[i];
        s2B_b += rows2_b[i];
    }

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
#ifdef USE_REFERENCES
        const Rgb<Unsign>& _A = A[yA+i][xA+j];
        double a_r = _A.r;
        double a_g = _A.g;
        double a_b = _A.b;
        sA_r  += a_r;
        sA_g  += a_g;
        sA_b  += a_b;
        s2A_r += a_r * a_r;
        s2A_g += a_g * a_g;
        s2A_b += a_b * a_b;
        const Rgb<Unsign>& _B = B[yB+i][xB+j];
        sAB_r += a_r * _B.r;
        sAB_g += a_g * _B.g;
        sAB_b += a_b * _B.b;
#else
        double a_r = A[yA+i][xA+j].r;
        double a_g = A[yA+i][xA+j].g;
        double a_b = A[yA+i][xA+j].b;
        sA_r  += a_r;
        sA_g  += a_g;
        sA_b  += a_b;
        s2A_r += a_r * a_r;
        s2A_g += a_g * a_g;
        s2A_b += a_b * a_b;
        sAB_r += a_r * B[yB+i][xB+j].r;
        sAB_g += a_g * B[yB+i][xB+j].g;
        sAB_b += a_b * B[yB+i][xB+j].b;
#endif
    }

    int   n = Nx * Ny;

    // n * variance of A's window per channel; fixed for the whole search.
    const double varA_r = s2A_r - sA_r*sA_r/n;
    const double varA_g = s2A_g - sA_g*sA_g/n;
    const double varA_b = s2A_b - sA_b*sA_b/n;

#ifdef HDR_DEBUG_1
    // Per-channel sums (this function has no plain sA/s2A variables).
    printf ("\tR: sA=%f, s2A=%f\n", sA_r,s2A_r);
    printf ("\tG: sA=%f, s2A=%f\n", sA_g,s2A_g);
    printf ("\tB: sA=%f, s2A=%f\n", sA_b,s2A_b);
#endif

    double sB_0_r  = sB_r,          // remember the B sums at the left border
           sB_0_g  = sB_g,
           sB_0_b  = sB_b;
    double s2B_0_r = s2B_r,
           s2B_0_g = s2B_g,
           s2B_0_b = s2B_b;

    for (int p=0; p <= My-Ny; p++)
    {
        if (p > 0)                  // move down by one row
        {
            // sB_0 = sB_0 + row(p-1+Ny) - row(p-1)
            sB_0_r  += rows_r [p-1+Ny] - rows_r [p-1];
            sB_0_g  += rows_g [p-1+Ny] - rows_g [p-1];
            sB_0_b  += rows_b [p-1+Ny] - rows_b [p-1];
            s2B_0_r += rows2_r[p-1+Ny] - rows2_r[p-1];
            s2B_0_g += rows2_g[p-1+Ny] - rows2_g[p-1];
            s2B_0_b += rows2_b[p-1+Ny] - rows2_b[p-1];

            // shift all column sums down by 1
            for (int i=0; i < Mx; i++)
            {
#ifdef USE_REFERENCES
                const Rgb<Unsign>& _B1 = B [yB+p-1   ][xB+i];
                const Rgb<Unsign>& _B2 = B [yB+p-1+Ny][xB+i];
                double b1_r = _B1.r;    // to subtract
                double b1_g = _B1.g;
                double b1_b = _B1.b;
                double b2_r = _B2.r;    // to add
                double b2_g = _B2.g;
                double b2_b = _B2.b;
#else
                double b1_r = B [yB+p-1   ][xB+i].r;    // to subtract
                double b1_g = B [yB+p-1   ][xB+i].g;
                double b1_b = B [yB+p-1   ][xB+i].b;
                double b2_r = B [yB+p-1+Ny][xB+i].r;    // to add
                double b2_g = B [yB+p-1+Ny][xB+i].g;
                double b2_b = B [yB+p-1+Ny][xB+i].b;
#endif
                cols_r [i] += b2_r - b1_r;
                cols_g [i] += b2_g - b1_g;
                cols_b [i] += b2_b - b1_b;
                cols2_r[i] += b2_r * b2_r - b1_r * b1_r;
                cols2_g[i] += b2_g * b2_g - b1_g * b1_g;
                cols2_b[i] += b2_b * b2_b - b1_b * b1_b;
            }
            sB_r  = sB_0_r;
            sB_g  = sB_0_g;
            sB_b  = sB_0_b;
            s2B_r = s2B_0_r;
            s2B_g = s2B_0_g;
            s2B_b = s2B_0_b;

            // The mixed sums always have to be computed completely.
            // Multiply in double: the product of two integer pixel values
            // would be formed in the promoted integer type and overflows
            // for 16-bit (or wider) Unsign.
            sAB_r = sAB_g = sAB_b = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
#ifdef USE_REFERENCES
                const Rgb<Unsign>& _A = A[yA  +i][xA+j];    // q=0
                const Rgb<Unsign>& _B = B[yB+p+i][xB+j];    // q=0
                sAB_r += static_cast<double>(_A.r) * _B.r;
                sAB_g += static_cast<double>(_A.g) * _B.g;
                sAB_b += static_cast<double>(_A.b) * _B.b;
#else
                sAB_r += static_cast<double>(A[yA+i][xA+j].r) * B[yB+p+i][xB+j].r;
                sAB_g += static_cast<double>(A[yA+i][xA+j].g) * B[yB+p+i][xB+j].g;
                sAB_b += static_cast<double>(A[yA+i][xA+j].b) * B[yB+p+i][xB+j].b;
#endif
            }
        }
        // at the left border (q=0)
        double cov_r, rho_r, varB_r,
               cov_g, rho_g, varB_g,
               cov_b, rho_b, varB_b;

        cov_r = sAB_r - sA_r*sB_r/n;    // strictly speaking n*cov, not cov
        cov_g = sAB_g - sA_g*sB_g/n;
        cov_b = sAB_b - sA_b*sB_b/n;

        varB_r = s2B_r - sB_r*sB_r/n;   // n * variance of B's current window
        varB_g = s2B_g - sB_g*sB_g/n;
        varB_b = s2B_b - sB_b*sB_b/n;

        // Undefined correlation (no variance, or a variance term driven
        // non-positive by rounding) counts as 0 rather than +-inf/NaN, which
        // could otherwise win the maximum search.
        if (cov_r != 0.0 && varA_r > 0.0 && varB_r > 0.0)
             rho_r = cov_r / sqrt(varA_r * varB_r);
        else rho_r = 0.0;

        if (cov_g != 0.0 && varA_g > 0.0 && varB_g > 0.0)
             rho_g = cov_g / sqrt(varA_g * varB_g);
        else rho_g = 0.0;

        if (cov_b != 0.0 && varA_b > 0.0 && varB_b > 0.0)
             rho_b = cov_b / sqrt(varA_b * varB_b);
        else rho_b = 0.0;

        if (rho_r > rho_max_r)
        {   rho_max_r = rho_r;
            id_r.x = 0;
            id_r.y = p;
        }
        if (rho_g > rho_max_g)
        {   rho_max_g = rho_g;
            id_g.x = 0;
            id_g.y = p;
        }
        if (rho_b > rho_max_b)
        {   rho_max_b = rho_b;
            id_b.x = 0;
            id_b.y = p;
        }

        // move right by q
        for (int q=1; q <= Mx-Nx; q++)
        {
            // sB = sB + column(q-1+Nx) - column(q-1)
            sB_r  += cols_r [q-1+Nx] - cols_r [q-1];
            sB_g  += cols_g [q-1+Nx] - cols_g [q-1];
            sB_b  += cols_b [q-1+Nx] - cols_b [q-1];
            s2B_r += cols2_r[q-1+Nx] - cols2_r[q-1];
            s2B_g += cols2_g[q-1+Nx] - cols2_g[q-1];
            s2B_b += cols2_b[q-1+Nx] - cols2_b[q-1];

            // mixed sums completely, again multiplied in double
            sAB_r = sAB_g = sAB_b = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
#ifdef USE_REFERENCES
                const Rgb<Unsign>& _A = A[yA  +i][xA+  j];
                const Rgb<Unsign>& _B = B[yB+p+i][xB+q+j];
                sAB_r += static_cast<double>(_A.r) * _B.r;
                sAB_g += static_cast<double>(_A.g) * _B.g;
                sAB_b += static_cast<double>(_A.b) * _B.b;
#else
                sAB_r += static_cast<double>(A[yA+i][xA+j].r) * B[yB+p+i][xB+q+j].r;
                sAB_g += static_cast<double>(A[yA+i][xA+j].g) * B[yB+p+i][xB+q+j].g;
                sAB_b += static_cast<double>(A[yA+i][xA+j].b) * B[yB+p+i][xB+q+j].b;
#endif
            }

            cov_r = sAB_r - sA_r*sB_r/n;    // strictly speaking n*cov, not cov
            cov_g = sAB_g - sA_g*sB_g/n;
            cov_b = sAB_b - sA_b*sB_b/n;

            varB_r = s2B_r - sB_r*sB_r/n;
            varB_g = s2B_g - sB_g*sB_g/n;
            varB_b = s2B_b - sB_b*sB_b/n;

            if (cov_r != 0.0 && varA_r > 0.0 && varB_r > 0.0)
                 rho_r = cov_r / sqrt(varA_r * varB_r);
            else rho_r = 0.0;

            if (cov_g != 0.0 && varA_g > 0.0 && varB_g > 0.0)
                 rho_g = cov_g / sqrt(varA_g * varB_g);
            else rho_g = 0.0;

            if (cov_b != 0.0 && varA_b > 0.0 && varB_b > 0.0)
                 rho_b = cov_b / sqrt(varA_b * varB_b);
            else rho_b = 0.0;

            if (rho_r > rho_max_r)
            {   rho_max_r = rho_r;
                id_r.x = q;
                id_r.y = p;
            }
            if (rho_g > rho_max_g)
            {   rho_max_g = rho_g;
                id_g.x = q;
                id_g.y = p;
            }
            if (rho_b > rho_max_b)
            {   rho_max_b = rho_b;
                id_b.x = q;
                id_b.y = p;
            }
        }
    }
    printf ("\tR: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id_r.x, id_r.y, xB+id_r.x, yB+id_r.y, rho_max_r);
    printf ("\tG: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id_g.x, id_g.y, xB+id_g.x, yB+id_g.y, rho_max_g);
    printf ("\tB: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id_b.x, id_b.y, xB+id_b.x, yB+id_b.y, rho_max_b);

    return Rgb<XYindex>(id_r, id_g, id_b);
}


/**
 * find_shift_1all ()
 *
 * Like find_shift_1_(), but working with Rgb's as structures as far as that
 * is possible without loss of speed. (It does not seem to be possible,
 * though.) -- The intention does not appear to have been implemented here
 * yet, whereas it has been in `find_shift_1()'.
 */
template <class Unsign>
Rgb<XYindex>
ShiftFinder<Unsign>::find_shift_1all (int xA, int yA, int Nx, int Ny,
                                      int xB, int yB, int Mx, int My)
{
    printf ("\n%s:\n",__func__);
    printf ("ohne Rgb-Op, doch mit Rgb-Variablen; direkt auf double; ");
#ifdef USE_REFERENCES
    printf ("mit Referenzen\n");
#else
    printf ("ohne Referenzen\n");
#endif
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);
    assert (Nx <= A.dim2());
    assert (Ny <= A.dim1());
    assert (Mx <= B.dim2());
    assert (My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    double cols_r[Mx], cols2_r[Mx],     // fuer Mx Spaltensummen (einf &
           cols_g[Mx], cols2_g[Mx],     // Quadrate)
           cols_b[Mx], cols2_b[Mx];
    double rows_r[My], rows2_r[My],     // fuer My Zeilensummen
           rows_g[My], rows2_g[My],
           rows_b[My], rows2_b[My];

    for (int p=0; p < Mx; p++)
    {   Rgb<double> s(0), s2(0);
        for (int i=0; i < Ny; i++)              // Spaltensumme
        {   Rgb<double> b = B [yB+i][xB+p];     // double <- Unsign
            s.r  += b.r;        
            s.g  += b.g;        // Haeh? Ich denke mit Rgb's arbeiten?
            s.b  += b.b;
            s2.r += b.r * b.r;
            s2.g += b.g * b.g;
            s2.b += b.b * b.b;
        }
        cols_r [p] = s.r;
        cols_g [p] = s.g;
        cols_b [p] = s.b;
        cols2_r[p] = s2.r;
        cols2_g[p] = s2.g;
        cols2_b[p] = s2.b;
    }
    for (int p=0; p < My; p++)
    {    Rgb<double> s(0), s2(0);
        for (int i=0; i < Nx; i++)              // Zeilensumme
        {   Rgb<double> b = B [yB+p][xB+i];     // double <- Unsign
            s.r  += b.r;
            s.g  += b.g;
            s.b  += b.b;
            s2.r += b.r * b.r;
            s2.g += b.g * b.g;
            s2.b += b.b * b.b;
        }
        rows_r [p] = s.r;
        rows_g [p] = s.g;
        rows_b [p] = s.b;
        rows2_r[p] = s2.r;
        rows2_g[p] = s2.g;
        rows2_b[p] = s2.b;
    }

    Rgb<XYindex> id;
    Rgb<double> rho_max(-2.0);    // kleiner als kleinstmoegl.

    // Fuer Ausgangslage alle Summen (sA,...,s2B,sAB) einmal vollstaendig
    // berechnen. B-Summen dabei aus den Reihensummen oder Spaltensummen.

    Rgb<double> sA(0), s2A(0);
    Rgb<double> sB(0), s2B(0), sAB(0);

    for (int i=0; i < Ny; i++)
    {   sB.r  += rows_r [i];
        sB.g  += rows_g [i];
        sB.b  += rows_b [i];
        s2B.r += rows2_r[i];
        s2B.g += rows2_g[i];
        s2B.b += rows2_b[i];
    }

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        Rgb<double> a = A[yA+i][xA+j];
        sA.r  += a.r;
        sA.g  += a.g;
        sA.b  += a.b;
        s2A.r += a.r * a.r;
        s2A.g += a.g * a.g;
        s2A.b += a.b * a.b;
        sAB.r += a.r * B[yB+i][xB+j].r;
        sAB.g += a.g * B[yB+i][xB+j].g;
        sAB.b += a.b * B[yB+i][xB+j].b;
    }

    int   n = Nx * Ny;
    //double EA = sA / n;               // Mittelwert von A
    //double D2A = s2A / n - EA*EA;     // Streuung von A

#ifdef HDR_DEBUG_1
    printf ("\tsA=%f, s2A=%f\n", sA,s2A);
    //printf ("\tsB=%f, s2B=%f\n", sB,s2B);
    //printf ("\tE(A)=%f, D^2(A)=%f\n", EA, D2A);
#endif

    Rgb<double> sB_0 = sB;              // B-Summen am linken Rand merken
    Rgb<double> s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        //printf ("p=%i\n",p);
        if (p > 0)                      // nach unten verschieben
        {
            // sB_0 = sB_0 + Zeile(p-1+Ny) - Zeile(p-1)
            sB_0.r  += rows_r [p-1+Ny] - rows_r [p-1];
            sB_0.g  += rows_g [p-1+Ny] - rows_g [p-1];
            sB_0.b  += rows_b [p-1+Ny] - rows_b [p-1];
            s2B_0.r += rows2_r[p-1+Ny] - rows2_r[p-1];
            s2B_0.g += rows2_g[p-1+Ny] - rows2_g[p-1];
            s2B_0.b += rows2_b[p-1+Ny] - rows2_b[p-1];

            // alle Spaltensummen um 1 nach unten verschieben
            for (int i=0; i < Mx; i++)
            {   Rgb<double> b1 = B [yB+p-1   ][xB+i];   // zu subtrahieren
                Rgb<double> b2 = B [yB+p-1+Ny][xB+i];   // zu addieren
                cols_r [i] += b2.r - b1.r;
                cols_g [i] += b2.g - b1.g;
                cols_b [i] += b2.b - b1.b;
                cols2_r[i] += b2.r * b2.r - b1.r * b1.r;
                cols2_g[i] += b2.g * b2.g - b1.g * b1.g;
                cols2_b[i] += b2.b * b2.b - b1.b * b1.b;
            }
            sB.r  = sB_0.r;
            sB.g  = sB_0.g;
            sB.b  = sB_0.b;
            s2B.r = s2B_0.r;
            s2B.g = s2B_0.g;
            s2B.b = s2B_0.b;

            // Korrelation ist stets komplett zu berechnen
            sAB.r = sAB.g = sAB.b = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
                sAB.r += A[yA+i][xA+j].r * B[yB+p+i][xB+j].r;    // q=0
                sAB.g += A[yA+i][xA+j].g * B[yB+p+i][xB+j].g;    // q=0
                sAB.b += A[yA+i][xA+j].b * B[yB+p+i][xB+j].b;    // q=0
            }
        }
        // am linken Rand (q=0)
        Rgb<double> cov, rho;

        cov.r = sAB.r - sA.r*sB.r/n;    // strengg. ist das n*cov, nicht cov
        cov.g = sAB.g - sA.g*sB.g/n;
        cov.b = sAB.b - sA.b*sB.b/n;

        if (cov.r == 0.0)
             rho.r = 0.0;
        else rho.r = cov.r / sqrt((s2A.r - sA.r*sA.r/n)*(s2B.r - sB.r*sB.r/n));

        if (cov.g == 0.0)
             rho.g = 0.0;
        else rho.g = cov.g / sqrt((s2A.g - sA.g*sA.g/n)*(s2B.g - sB.g*sB.g/n));

        if (cov.b == 0.0)
             rho.b = 0.0;
        else rho.b = cov.b / sqrt((s2A.b - sA.b*sA.b/n)*(s2B.b - sB.b*sB.b/n));

        if (rho.r > rho_max.r)
        {    rho_max.r = rho.r;
            id.r.x = 0;
            id.r.y = p;
        }
        if (rho.g > rho_max.g)
        {    rho_max.g = rho.g;
            id.g.x = 0;
            id.g.y = p;
        }
        if (rho.b > rho_max.b)
        {    rho_max.b = rho.b;
            id.b.x = 0;
            id.b.y = p;
        }

        // um q nach rechts verschieben
        for (int q=1; q <= Mx-Nx; q++)
        {
            // sB = sB + Spalte(q-1+Nx) - Spalte(q-1)
            sB.r  += cols_r [q-1+Nx] - cols_r [q-1];
            sB.g  += cols_g [q-1+Nx] - cols_g [q-1];
            sB.b  += cols_b [q-1+Nx] - cols_b [q-1];
            s2B.r += cols2_r[q-1+Nx] - cols2_r[q-1];
            s2B.g += cols2_g[q-1+Nx] - cols2_g[q-1];
            s2B.b += cols2_b[q-1+Nx] - cols2_b[q-1];

            // Korrelation komplett
            sAB.r = sAB.g = sAB.b = 0.0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
                sAB.r += A[yA+i][xA+j].r * B[yB+p+i][xB+q+j].r;
                sAB.g += A[yA+i][xA+j].g * B[yB+p+i][xB+q+j].g;
                sAB.b += A[yA+i][xA+j].b * B[yB+p+i][xB+q+j].b;
            }

            cov.r = sAB.r - sA.r*sB.r/n;   // strengg. ist das n*cov, nicht cov
            cov.g = sAB.g - sA.g*sB.g/n;
            cov.b = sAB.b - sA.b*sB.b/n;

            if (cov.r == 0.0)
                 rho.r = 0.0;
            else rho.r = cov.r / sqrt((s2A.r - sA.r*sA.r/n)*(s2B.r - sB.r*sB.r/n));

            if (cov.g == 0.0)
                 rho.g = 0.0;
            else rho.g = cov.g / sqrt((s2A.g - sA.g*sA.g/n)*(s2B.g - sB.g*sB.g/n));

            if (cov.b == 0.0)
                 rho.b = 0.0;
            else rho.b = cov.b / sqrt((s2A.b - sA.b*sA.b/n)*(s2B.b - sB.b*sB.b/n));

            if (rho.r > rho_max.r)
            {    rho_max.r = rho.r;
                id.r.x = q;
                id.r.y = p;
            }
            if (rho.g > rho_max.g)
            {    rho_max.g = rho.g;
                id.g.x = q;
                id.g.y = p;
            }
            if (rho.b > rho_max.b)
            {    rho_max.b = rho.b;
                id.b.x = q;
                id.b.y = p;
            }
        }
    }
    printf ("\tR: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id.r.x, id.r.y, xB+id.r.x, yB+id.r.y, rho_max.r);
    printf ("\tG: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id.g.x, id.g.y, xB+id.g.x, yB+id.g.y, rho_max.g);
    printf ("\tB: LO(dx,dy)=(%i,%i), (xB,yB)=(%i,%i), rho=%f\n",
        id.b.x, id.b.y, xB+id.b.x, yB+id.b.y, rho_max.b);

    return id;
}


/**
 * find_shift_1 ()
 *
 * Uses the Rgb template operators instead of the explicit three-step
 * (r, g, b) code of find_shift_1_().
 *
 * An Nx x Ny window is moved over the Mx x My search region of B whose
 * upper-left corner is (xB,yB) and correlated, per colour channel, with the
 * Nx x Ny window of A at upper-left corner (xA,yA). Sums over B are updated
 * incrementally from row and column sums; the mixed sum sAB is recomputed
 * for every window position.
 *
 * @param xA,yA  upper-left corner (x = second index) of the window in A
 * @param Nx,Ny  extent of the correlated windows in x and y
 * @param xB,yB  upper-left corner of the search region in B
 * @param Mx,My  extent of the search region in x and y (Mx>=Nx, My>=Ny)
 * @return per channel the offset (x=q, y=p) relative to (xB,yB) with the
 *         largest correlation coefficient.
 */
template <class Unsign>
Rgb<XYindex>
ShiftFinder<Unsign>::find_shift_1 (int xA, int yA, int Nx, int Ny,
                                   int xB, int yB, int Mx, int My)
{
    printf ("\n%s:\n",__func__);
    printf ("interne Rgb<double>-Variablen, aber ohne Rgb-Op\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);
    assert (Nx <= A.dim2());
    assert (Ny <= A.dim1());
    assert (Mx <= B.dim2());
    assert (My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    // Mx column sums (plain and squared), each taken over Ny rows
    Rgb<double> cols [Mx];
    Rgb<double> cols2[Mx];
    // My row sums (plain and squared), each taken over Nx columns
    Rgb<double> rows [My];
    Rgb<double> rows2[My];

    for (int x = 0; x < Mx; ++x)
    {
        Rgb<double> sum(0.0), sum_sq(0.0);

        for (int y = 0; y < Ny; ++y)
        {
            Rgb<double> pix = B[yB+y][xB+x];    // double <-- Unsign
            sum    += pix;
            sum_sq += pix*pix;                  // int mult. would be possible
        }
        cols [x] = sum;
        cols2[x] = sum_sq;
    }
    for (int y = 0; y < My; ++y)
    {
        Rgb<double> sum(0.0), sum_sq(0.0);

        for (int x = 0; x < Nx; ++x)
        {
            Rgb<double> pix = B[yB+y][xB+x];    // double <-- Unsign
            sum    += pix;
            sum_sq += pix*pix;                  // int mult. would be possible
        }
        rows [y] = sum;
        rows2[y] = sum_sq;
    }

    Rgb<XYindex> id;
    Rgb<double>  rho_max(-2.0);     // below the smallest possible rho

    // Start position: compute all sums (sA,...,s2B,sAB) completely once.
    // The B sums come from the row sums.
    Rgb<double> sA(0.0), s2A(0.0);
    Rgb<double> sB(0.0), s2B(0.0), sAB(0.0);

    for (int y = 0; y < Ny; ++y)
    {
        sB  += rows [y];
        s2B += rows2[y];
    }

    for (int y = 0; y < Ny; ++y)
    {
        for (int x = 0; x < Nx; ++x)
        {
            Rgb<double> a = A[yA+y][xA+x];      // double <-- Unsign
            Rgb<double> b = B[yB+y][xB+x];      // double <-- Unsign
            sA  += a;
            s2A += a*a;
            sAB += a * b;
        }
    }

    const int n = Nx * Ny;          // number of pixels per window

#ifdef HDR_DEBUG_1
    std::cout << "\tsA  = " << sA << '\n';
    std::cout << "\ts2A = " << s2A << '\n';
#endif

    // B sums at the left edge (q=0) of the current row position
    Rgb<double> sB_0  = sB;
    Rgb<double> s2B_0 = s2B;

    const int p_last = My - Ny;
    const int q_last = Mx - Nx;

    for (int p = 0; p <= p_last; ++p)
    {
        if (p > 0)
        {
            // Move down by one row: row p-1 leaves, row p-1+Ny enters.
            const int row_out = p - 1;
            const int row_in  = p - 1 + Ny;

            sB_0  += rows [row_in] - rows [row_out];
            s2B_0 += rows2[row_in] - rows2[row_out];

            // shift every column sum down by one row
            for (int x = 0; x < Mx; ++x)
            {
                Rgb<double> b1 = B[yB+row_out][xB+x];   // to subtract
                Rgb<double> b2 = B[yB+row_in ][xB+x];   // to add
                cols [x] += b2 - b1;
                cols2[x] += b2*b2 - b1*b1;
            }
            sB  = sB_0;
            s2B = s2B_0;

            // The mixed sum always has to be computed completely.
            // Operands are converted to double first; multiplying the raw
            // pixel values would overflow.
            sAB = 0.0;
            for (int y = 0; y < Ny; ++y)
            {
                for (int x = 0; x < Nx; ++x)
                {
                    Rgb<double> a = A[yA+y  ][xA+x];    // q=0
                    Rgb<double> b = B[yB+y+p][xB+x];    // double <-- Unsign
                    sAB += a * b;
                }
            }
        }

        // at the left edge (q=0)
        Rgb<double> cov, rho;
        cov = sAB - sA*sB/(double)n;    // strictly this is n*cov, not cov

        rho.r = (cov.r == 0.0) ? 0.0
              : cov.r / sqrt((s2A.r - sA.r*sA.r/n)*(s2B.r - sB.r*sB.r/n));
        rho.g = (cov.g == 0.0) ? 0.0
              : cov.g / sqrt((s2A.g - sA.g*sA.g/n)*(s2B.g - sB.g*sB.g/n));
        rho.b = (cov.b == 0.0) ? 0.0
              : cov.b / sqrt((s2A.b - sA.b*sA.b/n)*(s2B.b - sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
        printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", 0, p, xB, yB+p);
        std::cout << "\tsB  = " << sB << '\n';
        std::cout << "\ts2B = " << s2B << '\n';
        std::cout << "\tsAB = " << sAB << '\n';
        std::cout << "\trho = " << rho << '\n';
        correl ('R', Nx,Ny, A,xA,yA, B,xB,yB+p);
        correl (     Nx,Ny, A,xA,yA, B,xB,yB+p);
#endif
        if (rho.r > rho_max.r)
        {
            rho_max.r = rho.r;
            id.r.x = 0;
            id.r.y = p;
        }
        if (rho.g > rho_max.g)
        {
            rho_max.g = rho.g;
            id.g.x = 0;
            id.g.y = p;
        }
        if (rho.b > rho_max.b)
        {
            rho_max.b = rho.b;
            id.b.x = 0;
            id.b.y = p;
        }

        // move right column by column
        for (int q = 1; q <= q_last; ++q)
        {
            // column q-1 leaves, column q-1+Nx enters
            sB  += cols [q-1+Nx] - cols [q-1];
            s2B += cols2[q-1+Nx] - cols2[q-1];

            // mixed sum completely (converted to double, see above)
            sAB = 0.0;
            for (int y = 0; y < Ny; ++y)
            {
                for (int x = 0; x < Nx; ++x)
                {
                    Rgb<double> a = A[yA+y  ][xA+x  ];  // double <-- Unsign
                    Rgb<double> b = B[yB+y+p][xB+x+q];
                    sAB += a * b;
                }
            }

            cov = sAB - sA*sB/(double)n;    // strictly n*cov, not cov

            rho.r = (cov.r == 0.0) ? 0.0
                  : cov.r / sqrt((s2A.r - sA.r*sA.r/n)*(s2B.r - sB.r*sB.r/n));
            rho.g = (cov.g == 0.0) ? 0.0
                  : cov.g / sqrt((s2A.g - sA.g*sA.g/n)*(s2B.g - sB.g*sB.g/n));
            rho.b = (cov.b == 0.0) ? 0.0
                  : cov.b / sqrt((s2A.b - sA.b*sA.b/n)*(s2B.b - sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            std::cout << "\tsB  = " << sB << '\n';
            std::cout << "\ts2B = " << s2B << '\n';
            std::cout << "\tsAB = " << sAB << '\n';
            std::cout << "\trho = " << rho << '\n';
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
            correl (     Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho.r > rho_max.r)
            {
                rho_max.r = rho.r;
                id.r.x = q;
                id.r.y = p;
            }
            if (rho.g > rho_max.g)
            {
                rho_max.g = rho.g;
                id.g.x = q;
                id.g.y = p;
            }
            if (rho.b > rho_max.b)
            {
                rho_max.b = rho.b;
                id.b.x = q;
                id.b.y = p;
            }
        }
    }

    printf ("\tErgebnis %s:\n", __func__);
    printf ("\tr: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.r.x, id.r.y, xB + id.r.x, yB + id.r.y, rho_max.r);
    printf ("\tg: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.g.x, id.g.y, xB + id.g.x, yB + id.g.y, rho_max.g);
    printf ("\tb: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.b.x, id.b.y, xB + id.b.x, yB + id.b.y, rho_max.b);

    return id;
}

/**
   find_shift_sym():

    Unlike the find_shift variants called here, this function takes the
    centre points of the areas (-> "c") instead of their upper-left
    (LO) corners. They are converted to LO corners and passed on.
      m=3   n=3  c
    |- - -|- - -|-|- - -|- - -|
     xB    xA                    xA = xcA-n
                                 xB = xcB-(n+m)

    The same search is run with find_shift_1r(), find_shift_1_(),
    find_shift_1all() and find_shift_1(), each timed with a StopWatch; the
    per-channel results of the last three are only printed.

    @param nx,ny    half extent of the correlated windows (size 2*n+1)
    @param mx,my    additional half extent of the B search region
    @param xcA,ycA  centre of the window in A
    @param xcB,ycB  centre of the search region in B
    @return the LO result of find_shift_1r() reduced by (mx,my), i.e. the
            shift of the centre of B against that of A (>0,=0,<0).

    NOTE: the return values of the find_shift variants (LO offsets) and of
          find_shift_sym() (centre shift) mean different things!
*/
template <class Unsign>
XYindex
ShiftFinder<Unsign>::find_shift_sym (int nx, int ny, int mx, int my,
                    int xcA, int ycA, int xcB, int ycB)
{
    // extents of the correlated windows
    const int Nx = 2*nx + 1;
    const int Ny = 2*ny + 1;
    // extents of the B region that is scanned
    const int Mx = 2*(nx+mx) + 1;
    const int My = 2*(ny+my) + 1;
    // centre points -> upper-left corners
    const int xA = xcA - nx;
    const int yA = ycA - ny;
    const int xB = xcB - (nx+mx);
    const int yB = ycB - (ny+my);

    StopWatch uhr;

    // single-result variant; this is the value that gets returned
    uhr.start();
    XYindex id = find_shift_1r (xA,yA, Nx,Ny, xB,yB, Mx,My);
    uhr.result();

    // LO shift -> centre shift
    id.x -= mx;
    id.y -= my;
    printf ("Zentrumsshift (dx,dy) = (%i,%i)\n", id.x, id.y);

    Rgb<XYindex> rid;

    // explicit three-channel variant
    uhr.start();
    rid = find_shift_1_ (xA,yA, Nx,Ny, xB,yB, Mx,My);
    uhr.result();

    rid.r.x -= mx;  rid.r.y -= my;
    rid.g.x -= mx;  rid.g.y -= my;
    rid.b.x -= mx;  rid.b.y -= my;
    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", rid.r.x, rid.r.y);
    printf ("                       G: (%i,%i)\n", rid.g.x, rid.g.y);
    printf ("                       B: (%i,%i)\n", rid.b.x, rid.b.y);

    // Rgb variables, channel-wise arithmetic
    uhr.start();
    rid = find_shift_1all (xA,yA, Nx,Ny, xB,yB, Mx,My);
    uhr.result();

    rid.r.x -= mx;  rid.r.y -= my;
    rid.g.x -= mx;  rid.g.y -= my;
    rid.b.x -= mx;  rid.b.y -= my;
    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", rid.r.x, rid.r.y);
    printf ("                       G: (%i,%i)\n", rid.g.x, rid.g.y);
    printf ("                       B: (%i,%i)\n", rid.b.x, rid.b.y);

    // Rgb operator variant
    uhr.start();
    rid = find_shift_1 (xA,yA, Nx,Ny, xB,yB, Mx,My);
    uhr.result();

    rid.r.x -= mx;  rid.r.y -= my;
    rid.g.x -= mx;  rid.g.y -= my;
    rid.b.x -= mx;  rid.b.y -= my;
    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", rid.r.x, rid.r.y);
    printf ("                       G: (%i,%i)\n", rid.g.x, rid.g.y);
    printf ("                       B: (%i,%i)\n", rid.b.x, rid.b.y);

    return id;
}
/**
    NOTIZEN zu find_shift()-Prozeduren:
    Geschwindigkeit:
    - A-Daten sA und s2A vorab (ausserhalb) und nur einmal berechnen
      MOMENT: Da es auch von B abhaengt, welche Pixel in welcher A-B-Lage
      genommen werden, ist das jeweils zugehoerige sA und s2A auch von B
      abhaengig. Geht also nicht vorher.
    - badA-Feld und auch badB-Feld vorab anlegen; dann nur noch
        if (badA[][] || badB[][])
      Aber ist eine zweifache Indexberechnung wirklich schneller als
        if (a==0 || a>=z_max) ?
      Probieren!
    - alle drei Kanaele in einem Durchlauf!
    - Tabelle der 256 Quadrate; statt
        double a = arrays[...];
        sA += a;
        s2A += a*a;                // Gleitkomma-Multipl.!
      dann
        Unsign a = arrays[...];
        sA += a;
        s2A += qdr[a];        // entweder double qdr[n_z] oder long
    - der tatsaechlich variable Teil des B-Ausschnitts beim Verschieben
      ist immer nur eine Zeile oder Spalte
    - Als Datentyp fuer sA, s2A,... reicht bzgl. Umfang und Genauigk. float.
      Aber auch fuer die Summationen, insbesondere der Quadrate??
      Bei kleinen Karees vielleicht, aber bei grossen sicher nicht mehr.
      Umgestellt auf double.
*/

/*
    zweiparametriges Shift_Finder-Template
*/
template <class Unsign, class sum_t>
class ShiftFinder_2 {

    TNT::Array2D< Rgb<Unsign> > A,B;

  public:

      ShiftFinder_2 (TNT::Array2D< Rgb<Unsign> >& A_,
                   TNT::Array2D< Rgb<Unsign> >& B_)
        : A(A_), B(B_) {}

    Rgb<XYindex>
            find_shift      (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    Rgb<XYindex>
            find_shift_sym  (int nx, int ny, int mx, int my,
                             int xcA, int ycA, int xcB, int ycB);
};


/*
    ShiftFinder_2::find_shift()

    Floating point quantities (float Rgb's) are needed only for the quotients
    cov and rho. The sums can be accumulated in integer ullong's instead of
    double; these sums are then exact. Cost: ullong: 0.63 sec,
    double: 0.55 sec.

    An Nx x Ny window is moved over the Mx x My search region of B whose
    upper-left corner is (xB,yB) and correlated, per colour channel, with
    the Nx x Ny window of A at upper-left corner (xA,yA). Sums over B are
    updated incrementally from row and column sums held in sum_t; the mixed
    sum sAB is recomputed for every window position. HDR_FAST selects how
    the products for sAB are formed (speed-test variants).

    xA,yA  upper-left corner (x = second index) of the window in A
    Nx,Ny  extent of the correlated windows in x and y
    xB,yB  upper-left corner of the search region in B
    Mx,My  extent of the search region in x and y (Mx>=Nx, My>=Ny)
    Returns per channel the offset (x=q, y=p) relative to (xB,yB) with the
    largest correlation coefficient.
*/
template <class Unsign, class sum_t>
Rgb<XYindex>
ShiftFinder_2<Unsign,sum_t>::find_shift (
                    int xA, int yA, int Nx, int Ny,
                    int xB, int yB, int Mx, int My)
{
    printf ("\nShiftFinder_2::%s:\n",__func__);
    printf ("mit Rgb, mul_t\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);
    assert (Nx <= A.dim2());
    assert (Ny <= A.dim1());
    assert (Mx <= B.dim2());
    assert (My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    Rgb<sum_t> cols[Mx], cols2[Mx];    // Mx column sums (plain & squares)
    Rgb<sum_t> rows[My], rows2[My];    // My row sums

    for (int p=0; p < Mx; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Ny; i++)                // column sum
        {   Rgb<sum_t> b = B [yB+i][xB+p];        // sum_t <-- Unsign
            s  += b;
            s2 += b*b;
        }
        cols [p] = s;
        cols2[p] = s2;
    }
    for (int p=0; p < My; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Nx; i++)                // row sum
        {   Rgb<sum_t> b = B [yB+p][xB+i];        // sum_t <-- Unsign
            s  += b;
            s2 += b*b;
        }
        rows [p] = s;
        rows2[p] = s2;
    }

    Rgb<XYindex> id;
    Rgb<double> rho_max(-2.0);    // smaller than the smallest possible rho

    // For the start position compute all sums (sA,...,s2B,sAB) completely
    // once. The B sums are taken from the row sums.

    Rgb<sum_t> sA(0), s2A(0);
    Rgb<sum_t> sB(0), s2B(0), sAB(0);

    for (int i=0; i < Ny; i++)
    {   sB  += rows [i];
        s2B += rows2[i];
    }

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        Rgb<sum_t> a = A[yA+i][xA+j];   // sum_t <-- Unsign
        Rgb<sum_t> b = B[yB+i][xB+j];   // sum_t <-- Unsign
        sA  += a;
        s2A += a*a;
        sAB += a * b;
    }

    int   n = Nx * Ny;
    //Rgb<double> EA = Rgb<double>(sA) / (double)n; // mean of A
    //Rgb<double> D2A = s2A / (double)n - EA*EA;    // variance of A

#ifdef HDR_DEBUG_1
    std::cout << "\tsA  = " << sA << '\n';
    std::cout << "\ts2A = " << s2A << '\n';
    //std::cout << "\tsB  = " << sB << '\n';
    //std::cout << "\ts2B = " << s2B << '\n';
    //std::cout << "\tE(A)= " << EA << '\n';
    //std::cout << "\tD2A = " << D2A << '\n';
#endif

    Rgb<sum_t> sB_0 = sB;        // remember B sums at the left edge
    Rgb<sum_t> s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        //printf ("p=%i\n",p);
        if (p > 0)            // move one row down
        {
            // sB_0 = sB_0 + row(p-1+Ny) - row(p-1)
            sB_0  += rows [p-1+Ny] - rows [p-1];
            s2B_0 += rows2[p-1+Ny] - rows2[p-1];

            // move all column sums one row down
            for (int i=0; i < Mx; i++)
            {   Rgb<sum_t> b1 = B [yB+p-1   ][xB+i];    // to subtract
                Rgb<sum_t> b2 = B [yB+p-1+Ny][xB+i];    // to add
                cols [i] += b2 - b1;
                cols2[i] += b2*b2 - b1*b1;
            }
            sB  = sB_0;
            s2B = s2B_0;

            // the mixed sum always has to be computed completely
            sAB = 0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
#if HDR_FAST==0
                Rgb<sum_t> a = A[yA+i  ][xA+j];         // q=0
                Rgb<sum_t> b = B[yB+i+p][xB+j];         // sum_t <-- Unsign
                sAB += a * b;
//              sAB += A[yA+i][xA+j] * B[yB+p+i][xB+j];      overflow!
#elif HDR_FAST==1
                // Multiply in unsigned int rather than int: the product of
                // two 16-bit channel values (up to 65535*65535) does not fit
                // into a signed 32-bit int, but does fit into an unsigned
                // one.
                Rgb<unsigned int> a = A[yA+i  ][xA+j];  // q=0
                Rgb<unsigned int> b = B[yB+i+p][xB+j];  // unsigned <-- Unsign
                sAB += a * b;
#else
                // Widen one operand to sum_t first; Unsign*Unsign would be
                // multiplied in int and could overflow.
                sAB.r += static_cast<sum_t>(A[yA+i][xA+j].r)
                                          * B[yB+p+i][xB+j].r;
                sAB.g += static_cast<sum_t>(A[yA+i][xA+j].g)
                                          * B[yB+p+i][xB+j].g;
                sAB.b += static_cast<sum_t>(A[yA+i][xA+j].b)
                                          * B[yB+p+i][xB+j].b;
#endif
            }
        }
        // at the left edge (q=0)

        // cov = sAB - sA*sB / n        // strictly this is n*cov, not cov
        Rgb<double> cov (sAB);
        cov -= Rgb<double>(sA * sB) / (double)n;

        Rgb<double> rho;
        if (cov.r == 0.0)
             rho.r = 0.0;
        else rho.r = cov.r / sqrt((s2A.r -
                        (double)sA.r*sA.r/n)*(s2B.r - (double)sB.r*sB.r/n));

        if (cov.g == 0.0)
             rho.g = 0.0;
        else rho.g = cov.g / sqrt((s2A.g -
                        (double)sA.g*sA.g/n)*(s2B.g - (double)sB.g*sB.g/n));

        if (cov.b == 0.0)
             rho.b = 0.0;
        else rho.b = cov.b / sqrt((s2A.b -
                        (double)sA.b*sA.b/n)*(s2B.b - (double)sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
        printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", 0, p, xB, yB+p);
        std::cout << "\tsB  = " << sB << '\n';
        std::cout << "\ts2B = " << s2B << '\n';
        std::cout << "\tsAB = " << sAB << '\n';
        std::cout << "\trho = " << rho << '\n';
        correl ('R', Nx,Ny, A,xA,yA, B,xB,yB+p);
        correl (     Nx,Ny, A,xA,yA, B,xB,yB+p);
#endif
        if (rho.r > rho_max.r)
        {   rho_max.r = rho.r;
            id.r.x = 0;
            id.r.y = p;
            //printf ("r: (dx,dy)=(%i,%i),  rho=%.1f\n", id.r.x, id.r.y, rho_max.r);
        }
        if (rho.g > rho_max.g)
        {    rho_max.g = rho.g;
            id.g.x = 0;
            id.g.y = p;
        }
        if (rho.b > rho_max.b)
        {    rho_max.b = rho.b;
            id.b.x = 0;
            id.b.y = p;
        }

        // move q columns to the right
        for (int q=1; q <= Mx-Nx; q++)
        {
            // sB = sB + column(q-1+Nx) - column(q-1)
            sB  += cols [q-1+Nx] - cols [q-1];
            s2B += cols2[q-1+Nx] - cols2[q-1];

            // the mixed sum has to be computed completely
            sAB = 0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
#if HDR_FAST==0
                Rgb<sum_t> a = A[yA+i  ][xA+j  ];       // sum_t <-- Unsign
                Rgb<sum_t> b = B[yB+i+p][xB+j+q];
                sAB += a * b;
//                sAB += A[yA+i][xA+j] * B[yB+p+i][xB+q+j];    overflow!
#elif HDR_FAST==1
                // unsigned int instead of int, see overflow note above
                Rgb<unsigned int> a = A[yA+i  ][xA+j  ]; // unsigned <-- Unsign
                Rgb<unsigned int> b = B[yB+i+p][xB+j+q];
                sAB += a * b;
#else
                // widened to sum_t before multiplying, see above
                sAB.r += static_cast<sum_t>(A[yA+i][xA+j].r)
                                          * B[yB+p+i][xB+q+j].r;
                sAB.g += static_cast<sum_t>(A[yA+i][xA+j].g)
                                          * B[yB+p+i][xB+q+j].g;
                sAB.b += static_cast<sum_t>(A[yA+i][xA+j].b)
                                          * B[yB+p+i][xB+q+j].b;
#endif
            }

            //cov = sAB - sA*sB/n;    // strictly this is n*cov, not cov
            cov = Rgb<double>(sAB) - Rgb<double>(sA*sB) / (double)n;

            if (cov.r == 0.0)
                 rho.r = 0.0;
            else rho.r = cov.r / sqrt((s2A.r -
                        (double)sA.r*sA.r/n)*(s2B.r - (double)sB.r*sB.r/n));

            if (cov.g == 0.0)
                 rho.g = 0.0;
            else rho.g = cov.g / sqrt((s2A.g -
                        (double)sA.g*sA.g/n)*(s2B.g - (double)sB.g*sB.g/n));

            if (cov.b == 0.0)
                 rho.b = 0.0;
            else rho.b = cov.b / sqrt((s2A.b -
                        (double)sA.b*sA.b/n)*(s2B.b - (double)sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            std::cout << "\tsB  = " << sB << '\n';
            std::cout << "\ts2B = " << s2B << '\n';
            std::cout << "\tsAB = " << sAB << '\n';
            std::cout << "\trho = " << rho << '\n';
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
            correl (     Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho.r > rho_max.r)
            {    rho_max.r = rho.r;
                id.r.x = q;         // or id.r = XYindex(q,p)
                id.r.y = p;
            }
            if (rho.g > rho_max.g)
            {    rho_max.g = rho.g;
                id.g.x = q;
                id.g.y = p;
            }
            if (rho.b > rho_max.b)
            {    rho_max.b = rho.b;
                id.b.x = q;
                id.b.y = p;
            }
        }
    }
    printf ("\tErgebnis %s:\n", __func__);
    printf ("\tr: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.r.x, id.r.y, xB + id.r.x, yB + id.r.y, rho_max.r);
    printf ("\tg: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.g.x, id.g.y, xB + id.g.x, yB + id.g.y, rho_max.g);
    printf ("\tb: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.b.x, id.b.y, xB + id.b.x, yB + id.b.y, rho_max.b);

    return id;
}


/**
    find_shift_sym():

    Centre-point front end for find_shift(): the centres (xcA,ycA) and
    (xcB,ycB) are converted to upper-left (LO) corners, find_shift() is run
    and timed with a StopWatch, and the resulting LO offsets are reduced by
    (mx,my) to give centre shifts, which are printed and returned.

    @param nx,ny    half extent of the correlated windows (size 2*n+1)
    @param mx,my    additional half extent of the B search region
    @param xcA,ycA  centre of the window in A
    @param xcB,ycB  centre of the search region in B
    @return per channel the centre shift (dx,dy).
*/
template <class Unsign, class sum_t>
Rgb<XYindex>
ShiftFinder_2<Unsign,sum_t>::find_shift_sym (
                    int nx, int ny, int mx, int my,
                    int xcA, int ycA, int xcB, int ycB)
{
    // extents of the correlated windows
    const int Nx = 2*nx + 1;
    const int Ny = 2*ny + 1;
    // extents of the B region that is scanned
    const int Mx = 2*(nx+mx) + 1;
    const int My = 2*(ny+my) + 1;
    // centre points -> upper-left corners
    const int xA = xcA - nx;
    const int yA = ycA - ny;
    const int xB = xcB - (nx+mx);
    const int yB = ycB - (ny+my);

    StopWatch uhr;

    uhr.start();
    Rgb<XYindex> id = find_shift (xA,yA, Nx,Ny, xB,yB, Mx,My);
    uhr.result();

    // LO shift -> centre shift, channel by channel
    id.r.x -= mx;  id.r.y -= my;
    id.g.x -= mx;  id.g.y -= my;
    id.b.x -= mx;  id.b.y -= my;

    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", id.r.x, id.r.y);
    printf ("                       G: (%i,%i)\n", id.g.x, id.g.y);
    printf ("                       B: (%i,%i)\n", id.b.x, id.b.y);

    return id;
}


/*
    ShiftFinder_3

    Dreiparametriges Shift_Finder-Template.
    Vorzugsweise mul_t=uint32, sum_t=uint64.
*/
template <class Unsign, class mul_t, class sum_t>
class ShiftFinder_3 {

    TNT::Array2D< Rgb<Unsign> > A,B;

  public:

      ShiftFinder_3 (TNT::Array2D< Rgb<Unsign> >& A_,
                   TNT::Array2D< Rgb<Unsign> >& B_)
        : A(A_), B(B_) {}

    Rgb<XYindex>
            find_shift      (int xA, int yA, int Nx, int Ny,
                             int xB, int yB, int Mx, int My);
    Rgb<XYindex>
            find_shift_sym  (int nx, int ny, int mx, int my,
                             int xcA, int ycA, int xcB, int ycB);
};


/*
    Float-Groessen (Float-Rgb's) erst bei den Quotienten cov und rho notwendig,
    Summen koennen statt in double auch in ganzzahligen ullong's ausgefuehrt
    werden; dann sind diese Summen exakt; Kosten: ullong: 0.63 sec,
    double: 0.55 sec.
*/
template <class Unsign, class mul_t, class sum_t>
Rgb<XYindex>
ShiftFinder_3<Unsign,mul_t,sum_t>::find_shift (
                    int xA, int yA, int Nx, int Ny,
                    int xB, int yB, int Mx, int My)
{
    printf ("\nShiftFinder_3::%s:\n",__func__);
    printf ("mit Rgb, @(,)-Op, mul_t, sum_t\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);
    assert (Nx <= A.dim2());
    assert (Ny <= A.dim1());
    assert (Mx <= B.dim2());
    assert (My <= B.dim1());
    assert (Nx <= Mx);
    assert (Ny <= My);

    Rgb<sum_t> cols[Mx], cols2[Mx];    // fuer Mx Spaltensummen (einf & Qrd)
    Rgb<sum_t> rows[My], rows2[My];    // fuer My Zeilensummen

    for (int p=0; p < Mx; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Ny; i++)                // Spaltensumme
        {   Rgb<mul_t> b = B [yB+i][xB+p];        // mul_t <-- Unsign
            s  += b;
            s2 += b*b;
        }
        cols [p] = s;
        cols2[p] = s2;
    }
    for (int p=0; p < My; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Nx; i++)                // Zeilensumme
        {   Rgb<mul_t> b = B [yB+p][xB+i];        // mul_t <-- Unsign
            s  += b;
            s2 += b*b;
        }
        rows [p] = s;
        rows2[p] = s2;
    }

    Rgb<XYindex> id;
    Rgb<double> rho_max(-2.0);    // kleiner als kleinstmoegl.

    // Fuer Ausgangslage alle Summen (sA,...,s2B,sAB) einmal vollstaendig
    // berechnen. B-Summen dabei aus den Reihensummen oder Spaltensummen.

    Rgb<sum_t> sA(0), s2A(0);
    Rgb<sum_t> sB(0), s2B(0), sAB(0);

    for (int i=0; i < Ny; i++)     //    for (int i=0; i < Nx; i++)
    {   sB  += rows [i];           //    {   sB  += cols [i];
        s2B += rows2[i];           //        s2B += cols2[i];
    }                              //    }

    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        Rgb<mul_t> a = A[yA+i][xA+j];    // mul_t <-- Unsign
        Rgb<mul_t> b = B[yB+i][xB+j];    // mul_t <-- Unsign
        sA  += a;
        s2A += a*a;
        sAB += a * b;
    }

    int   n = Nx * Ny;
    //Rgb<double> EA = Rgb<double>(sA) / (double)n;  // Mittelwert von A
    //Rgb<double> D2A = s2A / (double)n - EA*EA;     // Streuung von A

#ifdef HDR_DEBUG_1
    std::cout << "\tsA  = " << sA << '\n';
    std::cout << "\ts2A = " << s2A << '\n';
    //std::cout << "\tsB  = " << sB << '\n';
    //std::cout << "\ts2B = " << s2B << '\n';
    //std::cout << "\tE(A)= " << EA << '\n';
    //std::cout << "\tD2A = " << D2A << '\n';
#endif

    Rgb<sum_t> sB_0 = sB;        // B-Summen am linken Rand merken
    Rgb<sum_t> s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        //printf ("p=%i\n",p);
        if (p > 0)            // nach unten verschieben
        {
            // sB_0 = sB_0 + Zeile(p-1+Ny) - Zeile(p-1)

            sB_0  += rows [p-1+Ny] - rows [p-1];
            s2B_0 += rows2[p-1+Ny] - rows2[p-1];

            // alle Spaltensummen um 1 nach unten verschieben

            for (int i=0; i < Mx; i++)
            {   Rgb<sum_t> b1 = B [yB+p-1   ][xB+i];    // zu subtrahieren
                Rgb<sum_t> b2 = B [yB+p-1+Ny][xB+i];    // zu addieren
                cols [i] += b2 - b1;
                cols2[i] += b2*b2 - b1*b1;

                // Hier koennen negative Zwischenwerte entstehen -->
                // Ueberlauf bei unsigned-Typen. Fuer ein korrektes Ergebnis
                // muessen dann Zwischentyp und Ergebnistyp gleich
                // sein. Deshalb hier gleich sum_t.
            }
            sB  = sB_0;
            s2B = s2B_0;

            // Korrelation ist stets komplett zu berechnen
            sAB = 0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
                Rgb<mul_t> a = A[yA+i  ][xA+j];            // q=0
                Rgb<mul_t> b = B[yB+i+p][xB+j];            // mul_t <-- Unsign
                sAB += a * b;
            }
        }
        // am linken Rand (q=0)

        // cov = sAB - sA*sB / n        // strengg. ist das n*cov, nicht cov
        Rgb<double> cov (sAB);
        cov -= Rgb<double>(sA * sB) / (double)n;

        Rgb<double> rho;
        if (cov.r == 0.0)
             rho.r = 0.0;
        else rho.r = cov.r / sqrt((s2A.r -
                        (double)sA.r*sA.r/n)*(s2B.r - (double)sB.r*sB.r/n));

        if (cov.g == 0.0)
             rho.g = 0.0;
        else rho.g = cov.g / sqrt((s2A.g -
                        (double)sA.g*sA.g/n)*(s2B.g - (double)sB.g*sB.g/n));

        if (cov.b == 0.0)
             rho.b = 0.0;
        else rho.b = cov.b / sqrt((s2A.b -
                        (double)sA.b*sA.b/n)*(s2B.b - (double)sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
        printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", 0, p, xB, yB+p);
        std::cout << "\tsB  = " << sB << '\n';
        std::cout << "\ts2B = " << s2B << '\n';
        std::cout << "\tsAB = " << sAB << '\n';
        std::cout << "\trho = " << rho << '\n';
        correl ('R', Nx,Ny, A,xA,yA, B,xB,yB+p);
        correl (     Nx,Ny, A,xA,yA, B,xB,yB+p);
#endif
        if (rho.r > rho_max.r)
        {   rho_max.r = rho.r;
            id.r.x = 0;
            id.r.y = p;
            //printf ("r: (dx,dy)=(%i,%i),  rho=%.1f\n", id.r.x, id.r.y, rho_max.r);
        }
        if (rho.g > rho_max.g)
        {   rho_max.g = rho.g;
            id.g.x = 0;
            id.g.y = p;
        }
        if (rho.b > rho_max.b)
        {   rho_max.b = rho.b;
            id.b.x = 0;
            id.b.y = p;
        }

        // um q nach rechts verschieben

        for (int q=1; q <= Mx-Nx; q++)
        {
            // sB = sB + Spalte(q-1+Nx) - Spalte(q-1)

            sB  += cols [q-1+Nx] - cols [q-1];
            s2B += cols2[q-1+Nx] - cols2[q-1];

            // Negative Zwischenergebnisse machen auch bei unsigend-Typen
            // nichts, solange sB und cols etc. vom gleichen Typ. Hoechstens
            // sB waere kleiner als der zu addierende Zwischenwert, aber
            // das ist hier ausgeschlossen, da das abzuziehende cols[q-1]
            // vormals zu sB addiert wurde.

            // Korrelation ist stets komplett zu berechnen
            sAB = 0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
                Rgb<mul_t> a = A[yA+i  ][xA+j  ];    // mul_t <-- Unsign
                //Rgb<mul_t> b = B[yB+i+p][xB+j+q];
                //sAB += a * b;
                a *= B[yB+i+p][xB+j+q];
                sAB += a;
            }

            //cov = sAB - sA*sB/n;    // strengg. ist das n*cov, nicht cov
            cov = Rgb<double>(sAB) - Rgb<double>(sA*sB) / (double)n;

            if (cov.r == 0.0)
                 rho.r = 0.0;
            else rho.r = cov.r / sqrt((s2A.r -
                        (double)sA.r*sA.r/n)*(s2B.r - (double)sB.r*sB.r/n));

            if (cov.g == 0.0)
                 rho.g = 0.0;
            else rho.g = cov.g / sqrt((s2A.g -
                        (double)sA.g*sA.g/n)*(s2B.g - (double)sB.g*sB.g/n));

            if (cov.b == 0.0)
                 rho.b = 0.0;
            else rho.b = cov.b / sqrt((s2A.b -
                        (double)sA.b*sA.b/n)*(s2B.b - (double)sB.b*sB.b/n));

#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            std::cout << "\tsB  = " << sB << '\n';
            std::cout << "\ts2B = " << s2B << '\n';
            std::cout << "\tsAB = " << sAB << '\n';
            std::cout << "\trho = " << rho << '\n';
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
            correl (     Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho.r > rho_max.r)
            {   rho_max.r = rho.r;
                id.r.x = q;        // oder id.r = XYindex(q,p)
                id.r.y = p;
            }
            if (rho.g > rho_max.g)
            {   rho_max.g = rho.g;
                id.g.x = q;
                id.g.y = p;
            }
            if (rho.b > rho_max.b)
            {   rho_max.b = rho.b;
                id.b.x = q;
                id.b.y = p;
            }
        }
    }
    printf ("\tErgebnis %s:\n", __func__);
    printf ("\tr: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.r.x, id.r.y, xB + id.r.x, yB + id.r.y, rho_max.r);
    printf ("\tg: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.g.x, id.g.y, xB + id.g.x, yB + id.g.y, rho_max.g);
    printf ("\tb: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.b.x, id.b.y, xB + id.b.x, yB + id.b.y, rho_max.b);

    return id;
}


/**
 *   find_shift_sym()
 *
 *   Centre-based front end for find_shift(). The correlated windows have
 *   the odd size (2*nx+1) x (2*ny+1) and are described by their centres
 *   (xcA,ycA) in A and (xcB,ycB) in B; the B window is displaced by up to
 *   +-mx columns and +-my rows around its centre.
 *
 *   @param nx,ny    half-widths of the correlated window
 *   @param mx,my    maximal displacement searched in x and y
 *   @param xcA,ycA  window centre in image A
 *   @param xcB,ycB  centre of the search region in image B
 *   @return per colour channel the displacement (dx,dy) of the best
 *           matching B window relative to (xcB,ycB), i.e. the offset found
 *           by find_shift() reduced by (mx,my).
 *
 *   The call to find_shift() is bracketed by a StopWatch, and the three
 *   resulting shifts are printed to stdout.
 */
template <class Unsign, class mul_t, class sum_t>
Rgb<XYindex>
ShiftFinder_3<Unsign,mul_t,sum_t>::find_shift_sym (
                    int nx, int ny, int mx, int my,
                    int xcA, int ycA, int xcB, int ycB)
{
    // Half-widths of the B region that is scanned (window plus margin).
    const int reach_x = nx + mx;
    const int reach_y = ny + my;

    StopWatch timer;

    timer.start();
    Rgb<XYindex> shift = find_shift (
            xcA - nx,        ycA - ny,           // top-left corner of the A window
            2*nx + 1,        2*ny + 1,           // size of the correlated windows
            xcB - reach_x,   ycB - reach_y,      // top-left corner of the B region
            2*reach_x + 1,   2*reach_y + 1);     // size of the scanned B region
    timer.result();

    // An offset of (mx,my) from the region's corner means "centres
    // coincide"; make that the origin.
    shift.r.x -= mx;  shift.r.y -= my;
    shift.g.x -= mx;  shift.g.y -= my;
    shift.b.x -= mx;  shift.b.y -= my;

    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", shift.r.x, shift.r.y);
    printf ("                       G: (%i,%i)\n", shift.g.x, shift.g.y);
    printf ("                       B: (%i,%i)\n", shift.b.x, shift.b.y);

    return shift;
}


/*
    ShiftFinder_4

    Shift finder template with three type parameters, the variant with
    maximal optimisation:
      Unsign - channel type of the pixels stored in the two images,
      mul_t  - type the pixels are converted to before products are formed,
      sum_t  - type in which the sums are accumulated.
    Preferably mul_t=double, sum_t=double.
*/
template <class Unsign, class mul_t, class sum_t>
class ShiftFinder_4
{
  public:

    // Stores copies of both array objects, made with Array2D's copy
    // constructor (NOTE(review): in TNT such a copy presumably shares the
    // pixel data with the original instead of duplicating it -- confirm).
    ShiftFinder_4 (TNT::Array2D< Rgb<Unsign> >& imgA,
                   TNT::Array2D< Rgb<Unsign> >& imgB)
      : A (imgA),
        B (imgB)
    {}

    // Search given by the top-left corners and the sizes of the A window
    // (Nx x Ny) and of the scanned B region (Mx x My).
    Rgb<XYindex> find_shift     (int xA, int yA, int Nx, int Ny,
                                 int xB, int yB, int Mx, int My);

    // Same search, given by window centres, half-widths (nx,ny) and the
    // maximal displacement (mx,my).
    Rgb<XYindex> find_shift_sym (int nx, int ny, int mx, int my,
                                 int xcA, int ycA, int xcB, int ycB);

  private:

    TNT::Array2D< Rgb<Unsign> > A, B;     // the two images to be aligned
};


/*
    ShiftFinder_4::find_shift()

    Slides an Nx x Ny window over the Mx x My region of B whose top-left
    corner is (xB,yB) and determines, separately for each colour channel,
    the window position with the largest correlation coefficient against
    the fixed Nx x Ny window of A whose top-left corner is (xA,yA).
    This variant prefers compound assignment operators (+=, -=, *=) on the
    Rgb values over binary operators.

    Parameters:
      xA,yA   top-left corner of the reference window in A
      Nx,Ny   size of the correlated windows
      xB,yB   top-left corner of the scanned region in B
      Mx,My   size of the scanned region (Mx >= Nx, My >= Ny)

    Returns per channel the offset (x,y) of the best window relative to
    (xB,yB), with 0 <= x <= Mx-Nx and 0 <= y <= My-Ny. Positions are
    visited row by row and compared with '>', so of several equally good
    positions the first one visited is kept.

    Progress and result are printed to stdout.
*/
template <class Unsign, class mul_t, class sum_t>
Rgb<XYindex>
ShiftFinder_4<Unsign,mul_t,sum_t>::find_shift (
                    int xA, int yA, int Nx, int Ny,
                    int xB, int yB, int Mx, int My)
{
    // Correlation coefficient of a single colour channel, computed from
    // the raw sums over the n pixels of both windows.
    struct Channel
    {
        static double rho (double n, double sA, double s2A,
                           double sB, double s2B, double sAB)
        {
            // Strictly speaking these are n*cov and n*variance; the
            // factor n cancels in the quotient. The product sA*sB is
            // formed in double: formed in an integer sum_t it can exceed
            // the range of that type for large windows.
            const double cov = sAB - sA*sB / n;
            if (cov == 0.0)
                return 0.0;

            const double varA = s2A - sA*sA / n;
            const double varB = s2B - sB*sB / n;

            // A constant window has no defined correlation. Rounding may
            // nevertheless leave cov != 0 there; without this guard the
            // quotient would be +-inf or NaN.
            if (varA <= 0.0 || varB <= 0.0)
                return 0.0;

            return cov / sqrt (varA * varB);
        }
    };

    printf ("\nShiftFinder_4::%s:\n",__func__);
    printf ("mit Rgb, vzgw. @=-Op, mul_t, sum_t\n");
    printf ("\tA: LO(x,y)=(%i,%i);  B: LO(x,y)=(%i,%i)\n", xA,yA, xB,yB);
    printf ("\tNx=%i, Ny=%i,  Mx=%i, My=%i\n", Nx,Ny,Mx,My);
    printf ("\tB-LO: x:[%i...%i] und y:[%i...%i]\n", xB,xB+Mx-Nx, yB,yB+My-Ny);

    // Both rectangles must lie completely inside their images: every
    // index used below is an offset added to (xA,yA) or (xB,yB).
    assert (Nx > 0 && Ny > 0);
    assert (Nx <= Mx);
    assert (Ny <= My);
    assert (xA >= 0 && xA + Nx <= A.dim2());
    assert (yA >= 0 && yA + Ny <= A.dim1());
    assert (xB >= 0 && xB + Mx <= B.dim2());
    assert (yB >= 0 && yB + My <= B.dim1());

    // NOTE(review): arrays with a run-time bound are a compiler extension,
    // not standard C++.
    Rgb<sum_t> cols[Mx], cols2[Mx];    // Mx column sums (plain & squared)
    Rgb<sum_t> rows[My], rows2[My];    // My row sums    (plain & squared)

    // Column sums over the topmost Ny rows of the B region.
    for (int p=0; p < Mx; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Ny; i++)
        {   Rgb<mul_t> b = B [yB+i][xB+p];        // mul_t <-- Unsign
            s += b;
            b *= b;
            s2 += b;
        }
        cols [p] = s;
        cols2[p] = s2;
    }
    // Row sums over the leftmost Nx columns of the B region.
    for (int p=0; p < My; p++)
    {
        Rgb<sum_t> s(0), s2(0);

        for (int i=0; i < Nx; i++)
        {   Rgb<mul_t> b = B [yB+p][xB+i];        // mul_t <-- Unsign
            s += b;
            b *= b;
            s2 += b;
        }
        rows [p] = s;
        rows2[p] = s2;
    }

    Rgb<XYindex> id;
    Rgb<double> rho_max(-2.0);    // smaller than the smallest possible rho

    // Sums for the starting position. The B sums are built from the first
    // Ny row sums (the first Nx column sums would serve equally well).
    Rgb<sum_t> sA(0), s2A(0);
    Rgb<sum_t> sB(0), s2B(0), sAB(0);

    for (int i=0; i < Ny; i++)
    {   sB  += rows [i];
        s2B += rows2[i];
    }

    // The A window never moves, so its sums are computed only once.
    for (int i=0; i < Ny; i++)
    for (int j=0; j < Nx; j++)
    {
        Rgb<mul_t> a = A[yA+i][xA+j];    // mul_t <-- Unsign
        sA += a;
        a *= a;                          // a = a*a
        s2A += a;
    }

    int   n = Nx * Ny;
    const double dn = static_cast<double>(n);

#ifdef HDR_DEBUG_1
    std::cout << "\tsA  = " << sA << '\n';
    std::cout << "\ts2A = " << s2A << '\n';
#endif

    Rgb<sum_t> sB_0 = sB;        // B sums at the left border (q=0)
    Rgb<sum_t> s2B_0 = s2B;

    for (int p=0; p <= My-Ny; p++)
    {
        if (p > 0)            // move the window one row down
        {
            // sB_0 = sB_0 + row(p-1+Ny) - row(p-1)
            sB_0  += rows [p-1+Ny];
            sB_0  -= rows [p-1];
            s2B_0 += rows2[p-1+Ny];
            s2B_0 -= rows2[p-1];

            // Move all column sums one row down.
            for (int i=0; i < Mx; i++)
            {   Rgb<sum_t> b1 = B [yB+p-1   ][xB+i];    // leaves the window
                Rgb<sum_t> b2 = B [yB+p-1+Ny][xB+i];    // enters the window
                cols [i] += b2;
                cols [i] -= b1;
                b2 *= b2;
                b1 *= b1;
                cols2[i] += b2;
                cols2[i] -= b1;

                // Intermediate values may wrap around for unsigned types.
                // The result is still correct provided intermediate and
                // result type are identical; therefore b1 and b2 are held
                // in sum_t right away.
            }
            sB  = sB_0;
            s2B = s2B_0;
        }

        for (int q=0; q <= Mx-Nx; q++)
        {
            if (q > 0)        // move the window one column to the right
            {
                // sB = sB + column(q-1+Nx) - column(q-1)
                sB  += cols [q-1+Nx] - cols [q-1];
                s2B += cols2[q-1+Nx] - cols2[q-1];

                // A negative intermediate difference is harmless even for
                // unsigned types as long as sB and cols have the same
                // type; the final sB cannot become negative because the
                // subtracted cols[q-1] was added to sB before.
            }

            // The cross sum cannot be updated incrementally; it is
            // computed from scratch for every window position.
            sAB = 0;
            for (int i=0; i < Ny; i++)
            for (int j=0; j < Nx; j++)
            {
                Rgb<mul_t> a = A[yA+i  ][xA+j  ];    // mul_t <-- Unsign
                a *= B[yB+i+p][xB+j+q];
                sAB += a;
            }

            Rgb<double> rho;
            rho.r = Channel::rho (dn,
                        static_cast<double>(sA.r), static_cast<double>(s2A.r),
                        static_cast<double>(sB.r), static_cast<double>(s2B.r),
                        static_cast<double>(sAB.r));
            rho.g = Channel::rho (dn,
                        static_cast<double>(sA.g), static_cast<double>(s2A.g),
                        static_cast<double>(sB.g), static_cast<double>(s2B.g),
                        static_cast<double>(sAB.g));
            rho.b = Channel::rho (dn,
                        static_cast<double>(sA.b), static_cast<double>(s2A.b),
                        static_cast<double>(sB.b), static_cast<double>(s2B.b),
                        static_cast<double>(sAB.b));

#ifdef HDR_DEBUG_1
            printf ("\t(dx,dy)=(%i,%i), (xB,yB)=(%i,%i)\n", q,p, xB+q, yB+p);
            std::cout << "\tsB  = " << sB << '\n';
            std::cout << "\ts2B = " << s2B << '\n';
            std::cout << "\tsAB = " << sAB << '\n';
            std::cout << "\trho = " << rho << '\n';
            correl ('R', Nx,Ny, A,xA,yA, B,xB+q,yB+p);
            correl (     Nx,Ny, A,xA,yA, B,xB+q,yB+p);
#endif
            if (rho.r > rho_max.r)
            {   rho_max.r = rho.r;
                id.r.x = q;
                id.r.y = p;
            }
            if (rho.g > rho_max.g)
            {   rho_max.g = rho.g;
                id.g.x = q;
                id.g.y = p;
            }
            if (rho.b > rho_max.b)
            {   rho_max.b = rho.b;
                id.b.x = q;
                id.b.y = p;
            }
        }
    }
    printf ("\tErgebnis %s:\n", __func__);
    printf ("\tr: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.r.x, id.r.y, xB + id.r.x, yB + id.r.y, rho_max.r);
    printf ("\tg: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.g.x, id.g.y, xB + id.g.x, yB + id.g.y, rho_max.g);
    printf ("\tb: LO(dx,dy)=(%d,%d), (xB,yB)=(%d,%d), rho=%f\n",
        id.b.x, id.b.y, xB + id.b.x, yB + id.b.y, rho_max.b);

    return id;
}


/**
 *   find_shift_sym():
 *
 *   Centre-based front end for find_shift(). The correlated windows have
 *   the odd size (2*nx+1) x (2*ny+1) and are described by their centres
 *   (xcA,ycA) in A and (xcB,ycB) in B; the B window is displaced by up to
 *   +-mx columns and +-my rows around its centre.
 *
 *   @param nx,ny    half-widths of the correlated window
 *   @param mx,my    maximal displacement searched in x and y
 *   @param xcA,ycA  window centre in image A
 *   @param xcB,ycB  centre of the search region in image B
 *   @return per colour channel the displacement (dx,dy) of the best
 *           matching B window relative to (xcB,ycB), i.e. the offset found
 *           by find_shift() reduced by (mx,my).
 *
 *   The call to find_shift() is bracketed by a StopWatch, and the three
 *   resulting shifts are printed to stdout.
 */
template <class Unsign, class mul_t, class sum_t>
Rgb<XYindex>
ShiftFinder_4<Unsign,mul_t,sum_t>::find_shift_sym (
                    int nx, int ny, int mx, int my,
                    int xcA, int ycA, int xcB, int ycB)
{
    // Half-widths of the B region that is scanned (window plus margin).
    const int reach_x = nx + mx;
    const int reach_y = ny + my;

    StopWatch timer;

    timer.start();
    Rgb<XYindex> shift = find_shift (
            xcA - nx,        ycA - ny,           // top-left corner of the A window
            2*nx + 1,        2*ny + 1,           // size of the correlated windows
            xcB - reach_x,   ycB - reach_y,      // top-left corner of the B region
            2*reach_x + 1,   2*reach_y + 1);     // size of the scanned B region
    timer.result();

    // An offset of (mx,my) from the region's corner means "centres
    // coincide"; make that the origin.
    shift.r.x -= mx;  shift.r.y -= my;
    shift.g.x -= mx;  shift.g.y -= my;
    shift.b.x -= mx;  shift.b.y -= my;

    printf ("Zentrumsshift (dx,dy)  R: (%i,%i)\n", shift.r.x, shift.r.y);
    printf ("                       G: (%i,%i)\n", shift.g.x, shift.g.y);
    printf ("                       B: (%i,%i)\n", shift.b.x, shift.b.y);

    return shift;
}

#endif
// SHIFT_FINDER_HPP

// END OF FILE
