Program Listing for File Lanczos.hpp

Return to documentation for file (src_ed/Hamiltonian/Lanczos.hpp)

#ifndef lanczos_h
#define lanczos_h

// #include "matrix.hpp"
#include "continued_fraction.hpp"
#include "parser.hpp"

void EigensystemTridiagonal(bool evector_flag, vector<double> &alpha, vector<double> &beta, vector<double> &energy, vector<double> &evector);
extern std::normal_distribution<double> normal_dis;

template<typename T, typename HilbertField>
void Lanczos(T &hamil, size_t dim, double &val, vector<HilbertField> &x, bool verb=false)
{
    vector<double> energy;
    vector<double> alpha;
    vector<double> beta;
    size_t niter = 0;

    LanczosEigenvalue(hamil, x, alpha, beta, energy, niter, verb);
    val = energy[0];
    LanczosEigenvector(hamil, x, alpha, beta, verb);
}



template<typename T, typename HilbertField>
void LanczosEigenvalue(T &H, vector<HilbertField> &gs, vector<double> &alpha, vector<double> &beta, vector<double> &energy, size_t &niter, bool verb=false)
{
    size_t dim = gs.size();
    size_t max_iter_lanczos = global_int("max_iter_lanczos");

    if(niter>dim) niter = dim;
    else niter = max_iter_lanczos;

    if(verb) cout << "\nLanczos method for the ground state; dim = " << dim << endl;

    energy.clear();
    alpha.clear();
    beta.clear();
    alpha.reserve(niter);
    beta.reserve(niter);
    energy.reserve(niter);
    vector<double> evector;

    vector<HilbertField> r(gs); // residue
    vector<HilbertField> q(dim); // Lanczos basis vector

    double Ritz=2.0*global_double("accur_lanczos");
    while(Ritz > global_double("accur_lanczos"))
    {
        check_signals();

        // building q
        if(beta.size()){ // does nothing anyway if j=0
            r *= 1.0/beta.back();
            q *= -beta.back();
        }

        // computing q = q + H*r
        H.mult_add(r,q);

        // swap q and r
        swap(r,q);

        HilbertField z = q*r;
        alpha.push_back(realpart(z)); // alpha=<q|r>;
        mult_add(-alpha.back(), q, r); // r=r-q*alpha

        // computing beta_j
        beta.push_back(norm(r));
        EigensystemTridiagonal(true,alpha,beta,energy,evector);
        Ritz = abs(evector.back()*beta.back());
        if(verb && alpha.size()%10 == 0){
            cout.precision(10);
            cout << "--> iteration " << beta.size() << ", evalue = " << energy[0] << ", residual = " << Ritz << endl;
        }
        if(alpha.size() > max_iter_lanczos) qcm_ED_throw("Lanczos procedure exceeded " + to_string(max_iter_lanczos) + " iterations");
    }
    niter = beta.size();
}





template<typename T, typename HilbertField>
void LanczosEigenvector(T &H, vector<HilbertField> &gs, vector<double> &alpha, vector<double> &beta, bool verb=false)
{
    if(verb) cout << "Lanczos: calculation of the eigenvector..." << endl;
    vector<HilbertField> r(gs); // residue. beware of the default copy constructor !!!
    vector<HilbertField> q(gs.size()); // Lanczos basis vector

    vector<double> energy;
    vector<double> evector;

    EigensystemTridiagonal(true,alpha,beta,energy,evector);

    gs *= evector[0];
    for(size_t j=1; j<evector.size(); ++j){
        check_signals();
        H.mult_add(r,q);
        mult_add(-alpha[j-1],r,q);
        for(size_t i=0; i<r.size(); i++){  // r = q/beta   &    q = -beta*r
            HilbertField tmp(r[i]);
            r[i] = q[i]/beta[j-1];
            q[i] = -beta[j-1]*tmp;
        }
        mult_add(evector[j],r,gs);
        if(verb && j%20 == 0) cout << "--> iteration " << j << endl;
    }

    // elective check at the end
    if(global_bool("check_lanczos_residual")){
        to_zero(q);
        H.mult_add(gs,q);
        mult_add(-energy[0],gs,q);
        cout << "norm of the ground state : " << norm(gs) << endl;
        cout << "norm of the Ritz residual : " << norm(q) << endl;
    }
}






template<typename T, typename HilbertField>
pair<vector<double>, vector<double>> LanczosGreen(T &H, vector<HilbertField> &psi, bool verb=false)
{
    vector<HilbertField> q(psi.size());

    vector<double> a;
    vector<double> b;
    b.push_back(1.0);

  double accur_continued_fraction = global_double("accur_continued_fraction");
  size_t max_iter_CF = global_int("max_iter_CF");

  double prod = 1.0;

//  while(fabs(b.back()) > accur_continued_fraction and a.size() < max_iter_CF){
  while(prod > accur_continued_fraction and fabs(b.back()) > accur_continued_fraction and a.size() < max_iter_CF){
    check_signals();
    if(a.size()) swap(q,psi,b.back()); // psi <-- q/beta   and   q <-- -beta*psi
    H.mult_add(psi,q); //calculate q = q + H*psi
    HilbertField z = psi*q;
    a.push_back(real(z));
    mult_add(-a.back(),psi,q); // q -= a*psi
    double zz = norm(q);
    b.push_back(zz);
    prod *= fabs(zz);
    }
  return {a,b};
}







template<typename TYPE, typename HilbertField>
bool bandLanczos(
                 TYPE &H,
                 vector<vector<HilbertField>> &phi,
                 vector<double> &evalues,
                 matrix<HilbertField> &evec_red,
                 matrix<HilbertField> &P0,
                 int &M0,
                 bool verb=false
                 )
{
    int i,j,k;
    int nd=0;           // number of deflated vectors
    int nvec;           // number of vectors currently allocated
    int nvec_max;       // maximum number of vectors currently allocated

    double accur_deflation = global_double("accur_deflation");
    double accur_band_lanczos = global_double("accur_band_lanczos");
    if(accur_band_lanczos<0.0) accur_band_lanczos = 0.0;
    size_t max_iter_BL = global_int("max_iter_BL");

    double band_lanczos_minimum_gap = global_double("band_lanczos_minimum_gap");
    bool no_degenerate_BL = global_bool("no_degenerate_BL");

    size_t dim = phi[0].size();

    if(verb) cout << "\nband Lanczos procedure with " << phi.size() << " starting vectors. Dimension " << dim << endl;

    // make sure the targeted number of iterations is a multiple of the number of starting vectors
    int pc = (int)phi.size();
    if(M0 > max_iter_BL) M0 = max_iter_BL;
    int M0p = M0/(int)phi.size();
    M0p++;
    M0p *= phi.size();
    M0 = M0p-pc;

    vector<vector<HilbertField>> v;  // trial and orthogonalized vectors
    v.assign(M0p,vector<HilbertField>());

    size_t I[M0p];
    matrix<HilbertField> t(M0);
    matrix<HilbertField> P((int)phi.size(),M0); // matrix of inner products <b[i]|v[j]>
    vector<int> def(M0);

    // step (1)
    for(i=0; i<phi.size(); ++i){
        v[i] = phi[i];
    }
    nvec = 2*pc;
    nvec_max = nvec;

    // main loop over j
    bool converged = false;
    double ev_old=1e12;
    for(j=0; j<M0; ++j){
    check_signals();
        double z = norm(v[j]); // step (3)
    if(z < accur_deflation){    // need to deflate vector (step (4))
            if(verb) cout << "deflating vector no " << j << endl;
            if(j >= pc){        // step (4a) [adjusted]
                I[nd] = j-pc;   // note : at any time, j > indices in I[]
                def[j-pc] = 1;
                nd++;
            }
            pc--;           // step (4b)
            if(pc==0) break;
            erase(v[j]);    // freeing memory
            nvec--;
            for(k=j; k<j+pc; ++k) { // step (4c)
                v[k] = v[k+1];
            }
            erase(v[j+pc]);
            j--; continue;      // step (4d)
        }
        v[j] *= 1.0/z;
        if(j>=pc){
            t(j,j-pc) = z;  // step (5)
        }

        // dot products with the starting vectors
        for(k=0; k<phi.size(); ++k) P(k,j) = phi[k]*v[j];

        size_t jm = j+pc;
        for(k=j+1; k<jm; ++k) { // step (6)
            HilbertField zz = v[j]*v[k];
            mult_add(-zz, v[j], v[k]); // v_k = v_k - (v_j*.v_k) v_j
            if(k>=pc){
                t(j,k-pc) = zz;
            }
        }

        // step (7)
        v[j+pc].resize(dim); nvec++;
        if(nvec > nvec_max) nvec_max = nvec;
        H.mult_add(v[j],v[j+pc]);

        int k0 = 0; // step (8)
        if(j>pc){
            k0 = j-pc;
            if(def[k0-1]==0){ // freeing memory on the way
                erase(v[k0-1]);
                nvec--;
            }
        }
        for(k=k0; k<j; ++k){
            t(k,j) = conjugate(t(j,k));
            mult_add(-t(k,j), v[k], v[j+pc]);   // v[j+pc] = v[j+pc] - z*v[k]
        }

        for(k=0; k<nd; ++k){ // step (9)
            QCM_ASSERT(I[k]<j);
            HilbertField w = v[I[k]]*v[j+pc];
            mult_add(-w, v[I[k]], v[j+pc]); // v[j+pc] = v[j+pc] - z*v[I[k]]
            t(I[k],j) = w;
        }

        HilbertField w =  v[j]*v[j+pc];
        mult_add(-w, v[j], v[j+pc]);    // v[j+pc] = v[j+pc] - w*v[j]
        t(j,j) = w;

        // steps (10) and (11) combined
        for(k=0; k<nd; ++k) t(j,I[k]) = conjugate(t(I[k],j));

        // step (12) : test for convergence based on the lowest eigenvalues of the projected Hamiltonian

        double evalue_test;
        if(j>= 4*phi.size() and j%phi.size()==0){
            matrix<HilbertField> Hpr(j);
            Hpr.assign(t);
            //Moise: in the loop, we don't need the eigen vector, we need it only on output.
            //So we will compute it latter to save some time
            //evec_red.set_size(j);
            //evec_red.assign(Hpr);
            vector<double> evalues_tmp(j);
            evalues_tmp.resize(j);
            Hpr.eigenvalues(evalues_tmp);
            //TODO: we don't need all the eigenvalue nor the eigen vector, to optimize:
            //Only 2 eigenvalues are actually used

#ifdef BL_test
            for(size_t i=0; i<j; i++) fev << j << setprecision(10) << '\t' << evalues_tmp[i] << endl;
            fev << endl;
#endif

            evalue_test = abs(evalues_tmp[0]-ev_old); ev_old = evalues_tmp[0];
          if(evalue_test < accur_band_lanczos) converged = true;

            int num=0;
            for(size_t i=0; i<v.size(); i++) if(v[i].size()>0) num++;
            if(verb){
                cout.precision(10);
                cout << "--> iteration " << j << ",\tdelta E = " << evalue_test << "\tgap = " << evalues_tmp[1]-evalues_tmp[0] << endl;
            }

            if (evalues_tmp[1]-evalues_tmp[0] < band_lanczos_minimum_gap and no_degenerate_BL){
                qcm_ED_throw("band Lanczos: the gap between the first two eigenvalues is smaller than " + to_string(band_lanczos_minimum_gap));
            }
            if(converged) break;
        }
    }
    M0 = j;
    if(j==0) return false;

    //if(!converged)
    {
        matrix<HilbertField> Hpr(j);
        Hpr.assign(t);
        evec_red.set_size(j);
        evalues.resize(j);
        Hpr.eigensystem(evalues, evec_red);
    }
    P0.set_size((int)phi.size(),M0);
    P0.assign(P);

#ifdef BL_test
    fev << "\n\n#-----------------------------------\n";
    fev.close();
#endif

    if(verb) cout << M0 << " iterations" << endl;
    return true;
}



template<typename TYPE, typename HilbertField>
bool blockLanczos(
    TYPE &H,
    vector<vector<HilbertField>> &phi,
    vector<matrix<HilbertField>> &A,
    vector<matrix<HilbertField>> &B,
    int &M0,
    bool verb=false
)
{
    int p = (int)phi.size();
    size_t dim = phi[0].size();

    double accur_deflation   = global_double("accur_deflation");
    double accur_band_lanczos = global_double("accur_band_lanczos");
    if(accur_band_lanczos < 0.0) accur_band_lanczos = 0.0;
    size_t max_iter_BL = global_int("max_iter_BL");
    if(M0 > (int)max_iter_BL) M0 = (int)max_iter_BL;

    if(verb) cout << "\nblock Lanczos: block size p=" << p << ", dim=" << dim << endl;

    A.clear();
    B.clear();

    // Three rolling blocks of p Hilbert-space vectors
    vector<vector<HilbertField>> Q_prev;                              // Q_{j-1}
    vector<vector<HilbertField>> Q_cur(p, vector<HilbertField>(dim)); // Q_j (current)
    vector<vector<HilbertField>> Z(p, vector<HilbertField>(dim));     // workspace / Q_{j+1}

    // --- Initialization: orthonormalize the starting block into Q_cur ---
    for(int k = 0; k < p; ++k) Q_cur[k] = phi[k];
    for(int l = 0; l < p; ++l){
        for(int k = 0; k < l; ++k){
            HilbertField z = Q_cur[k] * Q_cur[l];  // <Q_cur[k]|Q_cur[l]>
            mult_add(-z, Q_cur[k], Q_cur[l]);
        }
        double nrm = norm(Q_cur[l]);
        if(nrm < accur_deflation)
            qcm_ED_throw("blockLanczos: starting block is rank-deficient at column " + to_string(l));
        Q_cur[l] *= 1.0/nrm;
    }

    bool converged = false;
    double ev_old = 1e12;
    int j = 0;

    for(j = 0; j < M0; ++j){
        check_signals();

        // Step 1 — Z[k] = H * Q_cur[k]
        for(int k = 0; k < p; ++k){
            to_zero(Z[k]);
            H.mult_add(Q_cur[k], Z[k]);
        }

        // Step 2 — A_j(k,l) = <Q_cur[k] | Z[l]>   (Hermitian block)
        matrix<HilbertField> Aj(p);
        for(int l = 0; l < p; ++l){
            for(int k = 0; k <= l; ++k){
                HilbertField z = Q_cur[k] * Z[l];
                Aj(k,l) = z;
                Aj(l,k) = conjugate(z);
            }
        }
        A.push_back(Aj);

        // Step 3 — Z[l] -= sum_k A_j(k,l) * Q_cur[k]
        for(int l = 0; l < p; ++l)
            for(int k = 0; k < p; ++k)
                mult_add(-Aj(k,l), Q_cur[k], Z[l]);

        // Step 4 — Z[l] -= sum_k conj(B_{j-1}(l,k)) * Q_prev[k]   (j > 0 only)
        //         i.e.  Z -= Q_prev * B_{j-1}^H  column-by-column
        if(j > 0){
            const matrix<HilbertField> &Bprev = B.back();
            for(int l = 0; l < p; ++l)
                for(int k = 0; k < p; ++k)
                    mult_add(-conjugate(Bprev(l,k)), Q_prev[k], Z[l]);
        }

        // Step 5 — QR factorisation of Z via modified Gram-Schmidt:
        //           Z = Q_{j+1} * B_j  (B_j upper triangular)
        //   After this loop Z[l] holds Q_{j+1}[l] (orthonormal).
        matrix<HilbertField> Bj(p);
        bool singular_block = false;
        for(int l = 0; l < p; ++l){
            // project out already-orthonormalised columns
            for(int k = 0; k < l; ++k){
                HilbertField z = Z[k] * Z[l];  // <Z[k]|Z[l]>  (Z[k] already normalised)
                Bj(k,l) = z;
                mult_add(-z, Z[k], Z[l]);
            }
            double nrm = norm(Z[l]);
            if(nrm < accur_deflation){
                if(verb) cout << "block Lanczos: deflation at column " << l
                              << " of step " << j << endl;
                Bj(l,l) = HilbertField(0);
                // zero remaining columns of Z so they don't contaminate future steps
                for(int ll = l; ll < p; ++ll) to_zero(Z[ll]);
                singular_block = true;
                break;
            }
            Bj(l,l) = HilbertField(nrm);
            Z[l] *= 1.0/nrm;
        }
        B.push_back(Bj);

        // Step 6 — Convergence check every p steps (mirrors bandLanczos strategy)
        if((j+1) >= 2*p && (j+1) % p == 0){
            int sz = (j+1) * p;
            matrix<HilbertField> T(sz, sz);
            // diagonal blocks
            for(int jj = 0; jj <= j; ++jj)
                for(int r = 0; r < p; ++r)
                    for(int c = 0; c < p; ++c)
                        T(jj*p+r, jj*p+c) = A[jj](r,c);
            // off-diagonal blocks: T_{j+1,j} = B[j], T_{j,j+1} = B[j]^H
            for(int jj = 0; jj < j; ++jj)
                for(int r = 0; r < p; ++r)
                    for(int c = 0; c < p; ++c){
                        T((jj+1)*p+r, jj*p+c) = B[jj](r,c);
                        T(jj*p+c, (jj+1)*p+r) = conjugate(B[jj](r,c));
                    }
            vector<double> evals(sz);
            T.eigenvalues(evals);
            double delta = abs(evals[0] - ev_old);
            ev_old = evals[0];
            if(verb){
                cout.precision(10);
                cout << "--> block Lanczos step " << j+1
                     << ", delta E = " << delta << endl;
            }
            if(delta < accur_band_lanczos){ converged = true; break; }
        }

        if(singular_block) break;

        // Advance rolling window: Q_prev <- Q_cur, Q_cur <- Z
        Q_prev = Q_cur;
        Q_cur  = Z;
        for(int k = 0; k < p; ++k) to_zero(Z[k]);
    }

    M0 = j + 1;
    if(verb) cout << A.size() << " block Lanczos floors obtained" << endl;
    return converged;
}



template<typename TYPE, typename HilbertField>
bool blockLanczosSVD(
    TYPE &H,
    vector<vector<HilbertField>> &phi,
    vector<matrix<HilbertField>> &A,
    vector<matrix<HilbertField>> &B,
    int &M0,
    bool verb=false
)
{
    int p = (int)phi.size();
    size_t dim = phi[0].size();

    double accur_deflation    = global_double("accur_deflation");
    double accur_band_lanczos = global_double("accur_band_lanczos");
    if(accur_band_lanczos < 0.0) accur_band_lanczos = 0.0;
    size_t max_iter_BL = global_int("max_iter_BL");
    if(M0 > (int)max_iter_BL) M0 = (int)max_iter_BL;

    if(verb) cout << "\nblock Lanczos (polar/SVD): block size p=" << p << ", dim=" << dim << endl;

    A.clear();
    B.clear();

    // Three rolling blocks of p Hilbert-space vectors
    vector<vector<HilbertField>> Q_prev;
    vector<vector<HilbertField>> Q_cur(p, vector<HilbertField>(dim));
    vector<vector<HilbertField>> Z(p, vector<HilbertField>(dim));

    // --- Initialisation: orthonormalize the starting block into Q_cur ---
    for(int k = 0; k < p; ++k) Q_cur[k] = phi[k];
    for(int l = 0; l < p; ++l){
        for(int k = 0; k < l; ++k){
            HilbertField z = Q_cur[k] * Q_cur[l];
            mult_add(-z, Q_cur[k], Q_cur[l]);
        }
        double nrm = norm(Q_cur[l]);
        if(nrm < accur_deflation)
            qcm_ED_throw("blockLanczosSVD: starting block is rank-deficient at column " + to_string(l));
        Q_cur[l] *= 1.0/nrm;
    }

    bool converged = false;
    double ev_old = 1e12;
    int j = 0;

    for(j = 0; j < M0; ++j){
        check_signals();

        // Step 1 — Z[k] = H * Q_cur[k]
        for(int k = 0; k < p; ++k){
            to_zero(Z[k]);
            H.mult_add(Q_cur[k], Z[k]);
        }

        // Step 2 — A_j(k,l) = <Q_cur[k] | Z[l]>   (Hermitian block)
        matrix<HilbertField> Aj(p);
        for(int l = 0; l < p; ++l){
            for(int k = 0; k <= l; ++k){
                HilbertField z = Q_cur[k] * Z[l];
                Aj(k,l) = z;
                Aj(l,k) = conjugate(z);
            }
        }
        A.push_back(Aj);

        // Step 3 — Z[l] -= sum_k A_j(k,l) * Q_cur[k]
        for(int l = 0; l < p; ++l)
            for(int k = 0; k < p; ++k)
                mult_add(-Aj(k,l), Q_cur[k], Z[l]);

        // Step 4 — Z[l] -= sum_k (B_{j-1}^H)_{k,l} * Q_prev[k]   (j > 0 only)
        //   Note: with Hermitian B_{j-1}, conj(B_{j-1}(l,k)) = B_{j-1}(k,l),
        //   so the formula is identical to the upper-triangular case.
        if(j > 0){
            const matrix<HilbertField> &Bprev = B.back();
            for(int l = 0; l < p; ++l)
                for(int k = 0; k < p; ++k)
                    mult_add(-conjugate(Bprev(l,k)), Q_prev[k], Z[l]);
        }

        // Step 5 — Polar decomposition of the residual block Z:
        //   Gram matrix  G_{kl} = <Z[k]|Z[l]>
        //   G = V D V^H  (eigendecomposition, D real non-negative)
        //   B_j = V diag(sqrt(D)) V^H    (Hermitian PSD square root)
        //   Q_{j+1}[:,l] = sum_k (V diag(D^{-1/2}) V^H)_{kl} Z[k]
        matrix<HilbertField> G(p);
        for(int l = 0; l < p; ++l)
            for(int k = 0; k <= l; ++k){
                HilbertField z = Z[k] * Z[l];
                G(k,l) = z;
                G(l,k) = conjugate(z);
            }

        vector<double> D(p);
        matrix<HilbertField> V(p);
        G.eigensystem(D, V);  // G = V diag(D) V^H, eigenvalues in D (ascending)

        // Check for deflation: singular value sqrt(D[m]) < accur_deflation
        bool singular_block = false;
        for(int m = 0; m < p; ++m){
            if(D[m] < accur_deflation * accur_deflation){
                if(verb) cout << "block Lanczos SVD: deflation at singular value " << m
                              << " (sigma=" << sqrt(max(D[m],0.0)) << ") at step " << j << endl;
                singular_block = true;
                break;
            }
        }

        // Build B_j = V diag(sqrt(D)) V^H  (Hermitian PSD)
        matrix<HilbertField> Bj(p);
        for(int r = 0; r < p; ++r)
            for(int c = 0; c < p; ++c){
                HilbertField s = HilbertField(0);
                for(int m = 0; m < p; ++m)
                    s += V(r,m) * HilbertField(sqrt(max(D[m],0.0))) * conjugate(V(c,m));
                Bj(r,c) = s;
            }
        B.push_back(Bj);

        if(singular_block) break;

        // Build Binv = V diag(D^{-1/2}) V^H  (inverse of B_j)
        matrix<HilbertField> Binv(p);
        for(int r = 0; r < p; ++r)
            for(int c = 0; c < p; ++c){
                HilbertField s = HilbertField(0);
                for(int m = 0; m < p; ++m)
                    s += V(r,m) * HilbertField(1.0/sqrt(D[m])) * conjugate(V(c,m));
                Binv(r,c) = s;
            }

        // New Q_{j+1}[:,l] = sum_k Binv_{kl} Z[k]
        //   (derived from Z_mat = Q_mat Bj  =>  Q_mat = Z_mat Binv)
        vector<vector<HilbertField>> Q_next(p, vector<HilbertField>(dim));
        for(int l = 0; l < p; ++l){
            to_zero(Q_next[l]);
            for(int k = 0; k < p; ++k)
                mult_add(Binv(k,l), Z[k], Q_next[l]);
        }

        // Advance rolling window: Q_prev <- Q_cur, Q_cur <- Q_next
        Q_prev = Q_cur;
        Q_cur  = Q_next;
        for(int k = 0; k < p; ++k) to_zero(Z[k]);

        // Step 6 — Convergence check every p steps (mirrors blockLanczos strategy)
        if((j+1) >= 2*p && (j+1) % p == 0){
            int sz = (j+1) * p;
            matrix<HilbertField> T(sz, sz);
            for(int jj = 0; jj <= j; ++jj)
                for(int r = 0; r < p; ++r)
                    for(int c = 0; c < p; ++c)
                        T(jj*p+r, jj*p+c) = A[jj](r,c);
            for(int jj = 0; jj < j; ++jj)
                for(int r = 0; r < p; ++r)
                    for(int c = 0; c < p; ++c){
                        T((jj+1)*p+r, jj*p+c) = B[jj](r,c);
                        T(jj*p+c, (jj+1)*p+r) = conjugate(B[jj](r,c));
                    }
            vector<double> evals(sz);
            T.eigenvalues(evals);
            double delta = abs(evals[0] - ev_old);
            ev_old = evals[0];
            if(verb){
                cout.precision(10);
                cout << "--> block Lanczos SVD step " << j+1
                     << ", delta E = " << delta << endl;
            }
            if(delta < accur_band_lanczos){ converged = true; break; }
        }
    }

    M0 = j + 1;
    if(verb) cout << A.size() << " block Lanczos SVD floors obtained" << endl;
    return converged;
}



template<typename TYPE, typename HilbertField>
bool blockLanczosBI(
    TYPE &H,
    vector<vector<HilbertField>> &phi_R,
    vector<vector<HilbertField>> &phi_L,
    vector<matrix<HilbertField>> &A,
    vector<matrix<HilbertField>> &B,
    vector<matrix<HilbertField>> &C,
    int &M0,
    bool verb=false
)
{
    int p = (int)phi_R.size();
    if((int)phi_L.size() != p)
        qcm_ED_throw("blockLanczosBI: phi_L and phi_R must have the same block size");
    size_t dim = phi_R[0].size();

    double accur_deflation    = global_double("accur_deflation");
    double accur_band_lanczos = global_double("accur_band_lanczos");
    if(accur_band_lanczos < 0.0) accur_band_lanczos = 0.0;
    size_t max_iter_BL = global_int("max_iter_BL");
    if(M0 > (int)max_iter_BL) M0 = (int)max_iter_BL;

    if(verb) cout << "\nblock Lanczos biorthogonalization: p=" << p << ", dim=" << dim << endl;

    A.clear(); B.clear(); C.clear();

    // Rolling blocks: _prev = level j-1, _cur = level j
    vector<vector<HilbertField>> V_prev, W_prev;
    vector<vector<HilbertField>> V_cur(p, vector<HilbertField>(dim));
    vector<vector<HilbertField>> W_cur(p, vector<HilbertField>(dim));
    vector<vector<HilbertField>> R(p, vector<HilbertField>(dim)); // right residual -> V_next
    vector<vector<HilbertField>> S(p, vector<HilbertField>(dim)); // left residual

    for(int k = 0; k < p; ++k){ V_cur[k] = phi_R[k]; W_cur[k] = phi_L[k]; }

    // --- Biorthogonal modified Gram-Schmidt initialisation ---
    // Goal: <W_cur[k] | V_cur[l]> = delta_{kl}
    // For each pair (l,l): first remove projections from all already-normalised pairs (k<l),
    // then normalise the pair symmetrically so <W_cur[l]|V_cur[l]> = 1.
    for(int l = 0; l < p; ++l){
        for(int k = 0; k < l; ++k){
            // Remove V_cur[k] influence from V_cur[l]: enforce <W_cur[k]|V_cur[l]> = 0
            HilbertField zv = W_cur[k] * V_cur[l];
            mult_add(-zv, V_cur[k], V_cur[l]);
            // Remove W_cur[k] influence from W_cur[l]: enforce <W_cur[l]|V_cur[k]> = 0
            HilbertField zw = W_cur[l] * V_cur[k];
            mult_add(-zw, W_cur[k], W_cur[l]);
        }
        // Normalise: <a*W_cur[l] | b*V_cur[l]> = 1
        // Choosing b = 1/|rho|^{1/2} and a = rho/(|rho|^{3/2}) gives the result.
        // (Here |rho| = sqrt(rho * conj(rho)).)
        HilbertField rho   = W_cur[l] * V_cur[l];
        double       arho2 = realpart(rho * conjugate(rho)); // |rho|^2
        if(arho2 < accur_deflation * accur_deflation)
            qcm_ED_throw("blockLanczosBI: starting block is biorthogonally rank-deficient at column " + to_string(l));
        double arho      = sqrt(arho2);          // |rho|
        double sqrt_arho = sqrt(arho);           // |rho|^{1/2}
        V_cur[l]        *= 1.0 / sqrt_arho;
        HilbertField scale_W = rho / (arho * sqrt_arho);
        for(size_t i = 0; i < dim; ++i) W_cur[l][i] *= scale_W;
    }

    bool converged = false;
    double ev_old  = 1e12;
    int j = 0;

    for(j = 0; j < M0; ++j){
        check_signals();

        // Step 1 — Compute R[k] = H V_cur[k]  and  S[k] = H W_cur[k]
        for(int k = 0; k < p; ++k){
            to_zero(R[k]);  H.mult_add(V_cur[k], R[k]);
            to_zero(S[k]);  H.mult_add(W_cur[k], S[k]);
        }

        // Step 2 — Diagonal block: A_j(k,l) = <W_cur[k] | R[l]>
        matrix<HilbertField> Aj(p);
        for(int k = 0; k < p; ++k)
            for(int l = 0; l < p; ++l)
                Aj(k,l) = W_cur[k] * R[l];
        A.push_back(Aj);

        // Step 3 — Subtract diagonal from residuals
        // Right: R[l] -= sum_k A_j(k,l)         * V_cur[k]
        // Left:  S[m] -= sum_k conj(A_j(m,k))   * W_cur[k]   [= A_j^H(k,m) W_cur[k]]
        for(int l = 0; l < p; ++l)
            for(int k = 0; k < p; ++k)
                mult_add(-Aj(k,l), V_cur[k], R[l]);
        for(int m = 0; m < p; ++m)
            for(int k = 0; k < p; ++k)
                mult_add(-conjugate(Aj(m,k)), W_cur[k], S[m]);

        // Step 4 — Subtract previous off-diagonal contribution (j > 0)
        // Right: R[m] -= sum_l conj(C_{j-1}(m,l)) V_prev[l]   [= C_{j-1}^H row m]
        // Left:  S[m] -= sum_l conj(B_{j-1}(m,l)) W_prev[l]   [= B_{j-1}^H row m]
        if(j > 0){
            const matrix<HilbertField> &Bprev = B.back();
            const matrix<HilbertField> &Cprev = C.back();
            for(int m = 0; m < p; ++m)
                for(int l = 0; l < p; ++l){
                    mult_add(-conjugate(Cprev(m,l)), V_prev[l], R[m]);
                    mult_add(-conjugate(Bprev(m,l)), W_prev[l], S[m]);
                }
        }

        // Step 5 — QR factorisation of R via modified Gram-Schmidt -> V_next stored in R, B_j
        matrix<HilbertField> Bj(p);
        bool singular_R = false;
        for(int l = 0; l < p; ++l){
            for(int k = 0; k < l; ++k){
                HilbertField z = R[k] * R[l];   // Euclidean inner product (R[k] already normalised)
                Bj(k,l) = z;
                mult_add(-z, R[k], R[l]);
            }
            double nrm = norm(R[l]);
            if(nrm < accur_deflation){
                if(verb) cout << "blockLanczosBI: deflation in R at column " << l << " of step " << j << endl;
                Bj(l,l) = HilbertField(0);
                for(int ll = l; ll < p; ++ll) to_zero(R[ll]);
                singular_R = true;
                break;
            }
            Bj(l,l) = HilbertField(nrm);
            R[l] *= 1.0 / nrm;
        }
        B.push_back(Bj);
        if(singular_R) break;

        // Step 6 — Compute C_j: C_j(m,l) = <V_next[m] | S[l]>
        // Derivation: S = W_next * C_j, and W_next^H V_next = I requires C_j^H = S^H V_next,
        // i.e. C_j(m,l) = <V_next[m]|S[l]> = conj(<S[l]|V_next[m]>) = conj(C_j^H(l,m)). ✓
        matrix<HilbertField> Cj(p);
        for(int m = 0; m < p; ++m)
            for(int l = 0; l < p; ++l)
                Cj(m,l) = R[m] * S[l];          // R[m] = V_next[m] (after QR above)
        C.push_back(Cj);

        // Check for left-side breakdown: C_j must be invertible
        double Cj_diag_norm = 0.0;
        for(int k = 0; k < p; ++k) Cj_diag_norm += realpart(Cj(k,k) * conjugate(Cj(k,k)));
        if(sqrt(Cj_diag_norm) < accur_deflation * p){
            if(verb) cout << "blockLanczosBI: left-side breakdown at step " << j << endl;
            break;
        }

        // Step 7 — Compute W_next = S * C_j^{-1}
        // W_next[m] = sum_l (C_j^{-1})_{lm} S[l]
        // Then <W_next[k]|V_next[l]> = delta_{kl} by construction (see derivation above).
        matrix<HilbertField> Cj_inv(Cj);
        Cj_inv.inverse();

        vector<vector<HilbertField>> W_next(p, vector<HilbertField>(dim));
        for(int m = 0; m < p; ++m){
            to_zero(W_next[m]);
            for(int l = 0; l < p; ++l)
                mult_add(Cj_inv(l,m), S[l], W_next[m]);
        }

        // Step 8 — Convergence check every p steps via eigenvalues of the Hermitian part of T
        // (T + T^H)/2 has diagonal blocks (A_j + A_j^H)/2 and off-diagonal blocks (B_j + C_j)/2
        if((j+1) >= 2*p && (j+1) % p == 0){
            int sz = (j+1) * p;
            matrix<HilbertField> T_herm(sz);
            for(int jj = 0; jj <= j; ++jj){
                // Hermitian part of diagonal block: (A[jj] + A[jj]^H) / 2
                for(int r = 0; r < p; ++r)
                    for(int c = 0; c < p; ++c){
                        HilbertField sym = (A[jj](r,c) + conjugate(A[jj](c,r))) * HilbertField(0.5);
                        T_herm(jj*p+r, jj*p+c) = sym;
                    }
                // Off-diagonal: (B[jj] + C[jj]) / 2 and its conjugate transpose
                if(jj < j){
                    for(int r = 0; r < p; ++r)
                        for(int c = 0; c < p; ++c){
                            HilbertField lo = (B[jj](r,c) + C[jj](r,c)) * HilbertField(0.5);
                            T_herm((jj+1)*p+r, jj*p+c) = lo;
                            T_herm(jj*p+c, (jj+1)*p+r) = conjugate(lo);
                        }
                }
            }
            vector<double> evals(sz);
            T_herm.eigenvalues(evals);
            double delta = abs(evals[0] - ev_old);
            ev_old = evals[0];
            if(verb){
                cout.precision(10);
                cout << "--> block Lanczos BI step " << j+1
                     << ", delta E (herm. part) = " << delta << endl;
            }
            if(delta < accur_band_lanczos){ converged = true; break; }
        }

        // Advance rolling window
        V_prev = move(V_cur);  W_prev = move(W_cur);
        V_cur  = move(R);      W_cur  = move(W_next);
        R.assign(p, vector<HilbertField>(dim));
        S.assign(p, vector<HilbertField>(dim));
    }

    M0 = j + 1;
    if(verb) cout << A.size() << " block Lanczos BI steps completed" << endl;
    return converged;
}


#endif