mod_harmonics.F90 Source File


This file depends on

sourcefile~~mod_harmonics.f90~~EfferentGraph sourcefile~mod_harmonics.f90 mod_harmonics.F90 sourcefile~mod_constants.f90 mod_constants.F90 sourcefile~mod_harmonics.f90->sourcefile~mod_constants.f90 sourcefile~mod_fmm_utils.f90 mod_fmm_utils.F90 sourcefile~mod_harmonics.f90->sourcefile~mod_fmm_utils.f90 sourcefile~mod_fmm_utils.f90->sourcefile~mod_constants.f90

Files dependent on this one

sourcefile~~mod_harmonics.f90~~AfferentGraph sourcefile~mod_harmonics.f90 mod_harmonics.F90 sourcefile~mod_fmm.f90 mod_fmm.F90 sourcefile~mod_fmm.f90->sourcefile~mod_harmonics.f90 sourcefile~mod_fmm_interface.f90 mod_fmm_interface.F90 sourcefile~mod_fmm_interface.f90->sourcefile~mod_harmonics.f90 sourcefile~mod_fmm_interface.f90->sourcefile~mod_fmm.f90 sourcefile~mod_electrostatics.f90 mod_electrostatics.F90 sourcefile~mod_electrostatics.f90->sourcefile~mod_fmm_interface.f90 sourcefile~mod_qm_helper.f90 mod_qm_helper.F90 sourcefile~mod_qm_helper.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_mmpol.f90 mod_mmpol.F90 sourcefile~mod_qm_helper.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_prm.f90 mod_prm.F90 sourcefile~mod_qm_helper.f90->sourcefile~mod_prm.f90 sourcefile~mod_link_atom.f90 mod_link_atom.F90 sourcefile~mod_qm_helper.f90->sourcefile~mod_link_atom.f90 sourcefile~rotate_multipoles.f90 rotate_multipoles.F90 sourcefile~rotate_multipoles.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_mmpol.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_mmpol.f90->sourcefile~mod_link_atom.f90 sourcefile~mod_prm.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_link_atom.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_link_atom.f90->sourcefile~mod_prm.f90 sourcefile~mod_iohdf5.f90 mod_iohdf5.F90 sourcefile~mod_iohdf5.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_iohdf5.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_polarization.f90 mod_polarization.F90 sourcefile~mod_polarization.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_polarization.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_solvers.f90 mod_solvers.F90 sourcefile~mod_polarization.f90->sourcefile~mod_solvers.f90 sourcefile~mod_solvers.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_geomgrad.f90 mod_geomgrad.F90 sourcefile~mod_geomgrad.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_geomgrad.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_geomgrad.f90->sourcefile~mod_polarization.f90 sourcefile~mod_inputloader.f90 mod_inputloader.F90 sourcefile~mod_inputloader.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_inputloader.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_inputloader.f90->sourcefile~mod_prm.f90 sourcefile~mod_interface.f90 mod_interface.F90 sourcefile~mod_interface.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_interface.f90->sourcefile~mod_qm_helper.f90 sourcefile~mod_interface.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_interface.f90->sourcefile~mod_prm.f90 sourcefile~mod_interface.f90->sourcefile~mod_link_atom.f90 sourcefile~mod_interface.f90->sourcefile~mod_iohdf5.f90 sourcefile~mod_interface.f90->sourcefile~mod_polarization.f90 sourcefile~mod_interface.f90->sourcefile~mod_geomgrad.f90 sourcefile~mod_interface.f90->sourcefile~mod_inputloader.f90 sourcefile~mod_c_interface.f90 mod_c_interface.F90 sourcefile~mod_c_interface.f90->sourcefile~mod_electrostatics.f90 sourcefile~mod_c_interface.f90->sourcefile~mod_qm_helper.f90 sourcefile~mod_c_interface.f90->sourcefile~mod_mmpol.f90 sourcefile~mod_c_interface.f90->sourcefile~mod_interface.f90

Contents

Source Code


Source Code

module mod_harmonics
    use mod_constants, only : ip, rp, pi, eps_rp
    use mod_fmm_utils, only: n_sph_harm, ntot_sph_harm
    private

    real(rp), allocatable :: vscales(:), vscales_rel(:), vcnk(:), m2l_ztranslate_coef(:,:,:)
    integer(ip) :: vscales_p = 0, vcnk_dmax = 0, m2l_pm = 0, m2l_pl = 0
    
    public :: fmm_m2m, fmm_m2l, fmm_l2l, fmm_m2p, prepare_fmmm_constants

    contains

    subroutine prepare_fmmm_constants(pm, pl)
        integer(ip), intent(in) :: pm, pl

        
        call make_vscales(max(pm,pl))
        call make_vcnk(pm)
        call make_m2l_ztranslate_coef(pm, pl)
    end subroutine
!> Compute scaling factors of real normalized spherical harmonics
!!
!! Output values of scaling factors of \f$ Y_\ell^m \f$ harmonics are filled
!! only for non-negative \f$ m \f$ since scaling factor of \f$ Y_\ell^{-m} \f$
!! is the same as scaling factor of \f$ Y_\ell^m \f$.
!!
!! @param[in] p: Maximal degree of spherical harmonics. `p` >= 0
!! @param[out] vscales: Array of scaling factors. Dimension is `(p+1)**2`
!! @param[out] vscales: Array of values 4pi/(2l+1). Dimension is `p+1`
!! @param[out] vscales_rel: Array of relative scaling factors.
!!      Dimension is `(p+1)**, vscales2`.
    subroutine make_vscales(p)
        ! Input
        integer, intent(in) :: p
        real(rp) :: tmp, twolp1, tmp2
        integer :: l, ind, m

        if(p <= vscales_p) then
            ! Job is already done
            return
        end if 

        if(allocated(vscales)) then
            ! vscales is allocated but it should be expanded.
            ! Just remove everything and restart from scratch
            deallocate(vscales)
        end if
        
        if(allocated(vscales_rel)) then
            deallocate(vscales_rel)
        end if

        vscales_p = p
        allocate(vscales((vscales_p+1)**2))
        allocate(vscales_rel((vscales_p+1)**2))

        twolp1 = 1.0_rp
        do l = 0, p
            ! m = 0
            ind = l*l + l + 1
            tmp = 4.0*pi / twolp1
            tmp2 = tmp
            tmp = sqrt(tmp)
            vscales_rel(ind) = tmp
            vscales(ind) = 1.0_rp / tmp
            twolp1 = twolp1 + 2.0_rp
            tmp = vscales(ind) * sqrt(2.0)
            ! m != 0
            do m = 1, l
                tmp = -tmp / sqrt(dble((l-m+1)*(l+m)))
                vscales(ind+m) = tmp
                vscales(ind-m) = tmp
                vscales_rel(ind+m) = tmp * tmp2
                vscales_rel(ind-m) = vscales_rel(ind+m)
            end do
        end do
    end subroutine make_vscales


    !> Compute FMM-related constants
!!
!! @param[in] dmax: Maximal degree of spherical harmonics to be evaluated.
!!      `dmax` >= 0
!! @param[in] pm: Maximal degree of the multipole expansion. `pm` >= 0.
!! @param[in] pl: Maximal degree of the local expansion. `pl` >= 0.
!! @param[out] vcnk: Array of squre roots of combinatorial factors C_n^k.
!!      Dimension is `(2*dmax+1)*(dmax+1)`.
!! @param[out] m2l_ztranslate_coef: Constants for M2L translation over OZ axis.
!!      Dimension is `(pm+1, pl+1, pl+1)`.
!! @param[out] m2l_ztranslate_coef: Constants for adjoint M2L translation over
!!      OZ axis. Dimension is `(pl+1, pl+1, pm+1)`.
subroutine make_vcnk(dmax)
    ! Inputs
    integer, intent(in) :: dmax
    ! Local variables

    integer(ip) :: i, j, indi

    if(dmax <= vcnk_dmax) then
        ! Job is already done
        return
    end if 

    if(allocated(vcnk)) then
        ! vscales is allocated but it should be expanded.
        ! Just remove everything and restart from scratch
        deallocate(vcnk)
    end if

    vcnk_dmax = dmax
    allocate(vcnk((2*dmax+1)*(dmax+1)))

    vcnk(1) = 1.0_rp
    do i = 2, 2*dmax+1
        ! Offset to the C_{i-2}^{i-2}, next item to be stored is C_{i-1}^0
        indi = (i-1) * i / 2
        ! C_{i-1}^0 = 1
        vcnk(indi+1) = 1.0_rp
        ! C_{i-1}^{i-1} = 1
        vcnk(indi+i) = 1.0_rp
        ! C_{i-1}^{j-1} = C_{i-2}^{j-1} + C_{i-2}^{j-2}
        ! Offset to C_{i-3}^{i-3} is indi-i+1
        do j = 2, i-1
            vcnk(indi+j) = vcnk(indi-i+j+1) + vcnk(indi-i+j)
        end do
    end do
    ! Get square roots of C_n^k. sqrt(1.0_rp) is 1.0_rp, so no need to update C_n^0
    ! and C_n^n
    do i = 3, 2*dmax+1
        indi = (i-1) * i / 2
        do j = 2, i-1
            vcnk(indi+j) = sqrt(vcnk(indi+j))
        end do
    end do
end subroutine make_vcnk

!> Compute FMM-related constants
!!
!! @param[in] dmax: Maximal degree of spherical harmonics to be evaluated.
!!      `dmax` >= 0
!! @param[in] pm: Maximal degree of the multipole expansion. `pm` >= 0.
!! @param[in] pl: Maximal degree of the local expansion. `pl` >= 0.
!! @param[out] vcnk: Array of squre roots of combinatorial factors C_n^k.
!!      Dimension is `(2*dmax+1)*(dmax+1)`.
!! @param[out] m2l_ztranslate_coef: Constants for M2L translation over OZ axis.
!!      Dimension is `(pm+1, pl+1, pl+1)`.
!! @param[out] m2l_ztranslate_coef: Constants for adjoint M2L translation over
!!      OZ axis. Dimension is `(pl+1, pl+1, pm+1)`.
subroutine make_m2l_ztranslate_coef(pm, pl)
    ! Inputs
    integer, intent(in) :: pm, pl

    integer(ip) :: j, k, n, indjn
    real(rp) :: tmp1
    if(pm <= m2l_pm .and. pl <= m2l_pl) then
        ! Job is already done
        return
    end if 

    if(allocated(m2l_ztranslate_coef)) then
        ! vscales is allocated but it should be expanded.
        ! Just remove everything and restart from scratch
        deallocate(m2l_ztranslate_coef)
    end if
    
    m2l_pm = pm
    m2l_pl = pl
    allocate(m2l_ztranslate_coef(m2l_pm+1, m2l_pl+1, m2l_pl+1))
    ! Check if vcnk are already populated, otherwise prepare them
    call make_vcnk(m2l_pm)

    ! Fill in m2l_ztranslate_coef
    do j = 0, pl
        do k = 0, j
            tmp1 = 1.0_rp
            do n = k, pm
                indjn = (j+n)*(j+n+1)/2 + 1
                m2l_ztranslate_coef(n-k+1, k+1, j-k+1) = &
                    & tmp1 * vcnk(indjn+j-k) * vcnk(indjn+j+k)
                !m2l_ztranslate_adj_coef(j-k+1, k+1, n-k+1) = &
                !    & m2l_ztranslate_coef(n-k+1, k+1, j-k+1)
                tmp1 = -tmp1
            end do
        end do
    end do
end subroutine

subroutine make_vfact(p, vfact)
    integer, intent(in) :: p
    ! Outputs
    real(rp), intent(out) :: vfact(2*p+1)

    integer(ip) :: i

    vfact(1) = 1
    do i=2, 2*p+1
        vfact(i) = vfact(i-1) * sqrt(real(i-1, rp))
    end do
end subroutine
    

!> Compute arrays of \f$ \cos(m \phi) \f$ and \f$ \sin(m \phi) \f$
!!
!! All values are computed recurrently from input \f$ \cos(\phi) \f$ and \f$
!! \sin(\phi) \f$ without accessing arccos or arcsin functions.
!!
!! @param[in] cphi: \f$ \cos(\phi) \f$. -1 <= `cphi` <= 1
!! @param[in] sphi: \f$ \sin(\phi) \f$. -1 <= `sphi` <= 1
!! @param[in] p: Maximal value of \f$ m \f$, for which to compute \f$ \cos(m
!!      \phi) \f$ and \f$ \sin(m\phi) \f$. `p` >= 0
!! @param[out] vcos: Array of \f$ \cos(m\phi) \f$ for \f$ m=0..p \f$. Dimension
!!      is `(p+1)`
!! @param[out] vsin: Array of \f$ \sin(m\phi) \f$ for \f$ m=0..p \f$. Dimension
!!      is `(p+1)`
    subroutine trgev(cphi, sphi, p, vcos, vsin)
        ! Inputs
        integer, intent(in) :: p
        real(rp), intent(in) :: cphi, sphi
        ! Output
        real(rp), intent(out) :: vcos(p+1), vsin(p+1)
        ! Local variables
        integer :: m
        real(rp) :: c4phi, s4phi
        !! Treat values of p from 0 to 3 differently
        select case(p)
            ! Do nothing if p < 0
            case (:-1)
                return
            ! p = 0
            case (0)
                vcos(1) = 1.0
                vsin(1) = 0.0
                return
            ! p = 1
            case (1)
                vcos(1) = 1.0
                vsin(1) = 0.0
                vcos(2) = cphi
                vsin(2) = sphi
                return
            ! p = 2
            case (2)
                vcos(1) = 1.0
                vsin(1) = 0.0
                vcos(2) = cphi
                vsin(2) = sphi
                vcos(3) = cphi**2 - sphi**2
                vsin(3) = 2 * cphi * sphi
                return
            ! p = 3
            case (3)
                vcos(1) = 1.0
                vsin(1) = 0.0
                vcos(2) = cphi
                vsin(2) = sphi
                vcos(3) = cphi**2 - sphi**2
                vsin(3) = 2 * cphi * sphi
                vcos(4) = vcos(3)*cphi - vsin(3)*sphi
                vsin(4) = vcos(3)*sphi + vsin(3)*cphi
                return
            ! p >= 4
            case default
                vcos(1) = 1.0
                vsin(1) = 0.0
                vcos(2) = cphi
                vsin(2) = sphi
                vcos(3) = cphi**2 - sphi**2
                vsin(3) = 2 * cphi * sphi
                vcos(4) = vcos(3)*cphi - vsin(3)*sphi
                vsin(4) = vcos(3)*sphi + vsin(3)*cphi
                vcos(5) = vcos(3)**2 - vsin(3)**2
                vsin(5) = 2 * vcos(3) * vsin(3)
                c4phi = vcos(5)
                s4phi = vsin(5)
        end select
        ! Define cos(m*phi) and sin(m*phi) recurrently 4 values at a time
        do m = 6, p-2, 4
            vcos(m:m+3) = vcos(m-4:m-1)*c4phi - vsin(m-4:m-1)*s4phi
            vsin(m:m+3) = vcos(m-4:m-1)*s4phi + vsin(m-4:m-1)*c4phi
        end do
        ! Work with leftover
        select case(m-p)
            case (-1)
                vcos(p-1) = vcos(p-2)*cphi - vsin(p-2)*sphi
                vsin(p-1) = vcos(p-2)*sphi + vsin(p-2)*cphi
                vcos(p) = vcos(p-2)*vcos(3) - vsin(p-2)*vsin(3)
                vsin(p) = vcos(p-2)*vsin(3) + vsin(p-2)*vcos(3)
                vcos(p+1) = vcos(p-2)*vcos(4) - vsin(p-2)*vsin(4)
                vsin(p+1) = vcos(p-2)*vsin(4) + vsin(p-2)*vcos(4)
            case (0)
                vcos(p) = vcos(p-1)*cphi - vsin(p-1)*sphi
                vsin(p) = vcos(p-1)*sphi + vsin(p-1)*cphi
                vcos(p+1) = vcos(p-1)*vcos(3) - vsin(p-1)*vsin(3)
                vsin(p+1) = vcos(p-1)*vsin(3) + vsin(p-1)*vcos(3)
            case (1)
                vcos(p+1) = vcos(p)*cphi - vsin(p)*sphi
                vsin(p+1) = vcos(p)*sphi + vsin(p)*cphi
        end select
    end subroutine trgev

!> Direct M2M translation over OZ axis
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha M_M \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ M_M \f$ is a matrix of multipole-to-multipole
!! translation over OZ axis.
!!
!!
!! @param[in] z: OZ coordinate from new to old centers of harmonics
!! @param[in] src_r: Radius of old harmonics
!! @param[in] dst_r: Radius of new harmonics
!! @parma[in] p: Maximal degree of spherical harmonics
!! @param[in] vscales: Normalization constants for harmonics
!! @param[in] vcnk: Square roots of combinatorial numbers C_n^k
!! @param[in] alpha: Scalar multipler for `alpha`
!! @param[in] src_m: Expansion in old harmonics
!! @param[in] beta: Scalar multipler for `dst_m`
!! @param[inout] dst_m: Expansion in new harmonics
!! @param[out] work: Temporary workspace of a size (2*(p+1))
    subroutine fmm_m2m_ztranslate_work(z, p, vscales, vcnk, alpha, &
        & src_m, beta, dst_m, work)
    implicit none
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: z, vscales((p+1)*(p+1)), &
        & vcnk((2*p+1)*(p+1)), alpha, src_m((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst_m((p+1)*(p+1))
    ! Temporary workspace
    real(rp), intent(out), target :: work(2*(p+1))
    ! Local variables
    real(rp) :: r2, tmp1, tmp2, tmp3, res1, res2
    integer :: j, k, n, indj, indjn, indjk1, indjk2
    ! Pointers for temporary values of powers
    real(rp), pointer :: pow_r2(:)
    ! In case alpha is 0.0 just do a proper scaling of output
    if (abs(alpha) < eps_rp) then
        if (abs(beta) < eps_rp) then
            dst_m = 0.0
        else
            dst_m = beta * dst_m
        end if
        return
    end if
    ! Now alpha is non-0.0
    ! If harmonics have different centers
    if (abs(z) > eps_rp) then
        ! Prepare pointers
        pow_r2(1:p+1) => work(1:p+1)
        ! Get ratios r1 and r2
        r2 = z 
        pow_r2(1) = 1.0
        do j = 2, p+1
            pow_r2(j) = pow_r2(j-1) * r2
        end do
        ! Do actual M2M
        ! Overwrite output if beta is 0.0
        if (abs(beta) < eps_rp) then
            do j = 0, p
                ! Offset for dst_m
                indj = j*j + j + 1
                ! k = 0
                tmp1 = alpha * vscales(indj)
                tmp2 = tmp1
                res1 = 0.0
                ! Offset for vcnk
                indjk1 = j*(j+1)/2 + 1
                do n = 0, j
                    ! Offset for src_m
                    indjn = (j-n)**2 + (j-n) + 1
                    tmp3 = pow_r2(n+1) / &
                        & vscales(indjn) * vcnk(indjk1+n)**2
                    res1 = res1 + tmp3*src_m(indjn)
                end do
                dst_m(indj) = tmp2 * res1
                ! k != 0
                do k = 1, j
                    tmp2 = tmp1
                    res1 = 0.0
                    res2 = 0.0
                    ! Offsets for vcnk
                    indjk1 = (j-k)*(j-k+1)/2 + 1
                    indjk2 = (j+k)*(j+k+1)/2 + 1
                    do n = 0, j-k
                        ! Offset for src_m
                        indjn = (j-n)**2 + (j-n) + 1
                        tmp3 = pow_r2(n+1) / &
                            & vscales(indjn) * vcnk(indjk1+n) * &
                            & vcnk(indjk2+n)
                        res1 = res1 + tmp3*src_m(indjn+k)
                        res2 = res2 + tmp3*src_m(indjn-k)
                    end do
                    dst_m(indj+k) = tmp2 * res1
                    dst_m(indj-k) = tmp2 * res2
                end do
            end do
        ! Update output if beta is non-0.0
        else
            do j = 0, p
                ! Offset for dst_m
                indj = j*j + j + 1
                ! k = 0
                tmp1 = alpha * vscales(indj)
                tmp2 = tmp1
                res1 = 0.0
                ! Offset for vcnk
                indjk1 = j * (j+1) /2 + 1
                do n = 0, j
                    ! Offset for src_m
                    indjn = (j-n)**2 + (j-n) + 1
                    tmp3 = pow_r2(n+1) / &
                        & vscales(indjn) * vcnk(indjk1+n)**2
                    res1 = res1 + tmp3*src_m(indjn)
                end do
                dst_m(indj) = beta*dst_m(indj) + tmp2*res1
                ! k != 0
                do k = 1, j
                    tmp2 = tmp1
                    res1 = 0.0
                    res2 = 0.0
                    ! Offsets for vcnk
                    indjk1 = (j-k)*(j-k+1)/2 + 1
                    indjk2 = (j+k)*(j+k+1)/2 + 1
                    do n = 0, j-k
                        ! Offset for src_m
                        indjn = (j-n)**2 + (j-n) + 1
                        tmp3 = pow_r2(n+1) / &
                            & vscales(indjn) * vcnk(indjk1+n) * &
                            & vcnk(indjk2+n)
                        res1 = res1 + tmp3*src_m(indjn+k)
                        res2 = res2 + tmp3*src_m(indjn-k)
                    end do
                    dst_m(indj+k) = beta*dst_m(indj+k) + tmp2*res1
                    dst_m(indj-k) = beta*dst_m(indj-k) + tmp2*res2
                end do
            end do
        end if
    ! If harmonics are located at the same point
    else
        ! Overwrite output if beta is 0.0
        if (abs(beta) < eps_rp) then
            tmp1 = alpha 
            do j = 0, p
                indj = j*j + j + 1
                do k = indj-j, indj+j
                    dst_m(k) = tmp1 * src_m(k)
                end do
            end do
        ! Update output if beta is non-0.0
        else
            tmp1 = alpha
            do j = 0, p
                indj = j*j + j + 1
                do k = indj-j, indj+j
                    dst_m(k) = beta*dst_m(k) + tmp1*src_m(k)
                end do
                tmp1 = tmp1
            end do
        end if
    end if
end subroutine fmm_m2m_ztranslate_work 

!> Rotate spherical harmonics around OZ axis
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha R \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics corresponding to a new cartesion system of coordinates, \f$
!! \mathrm{src} \f$ is a vector of coefficients of input spherical harmonics
!! corresponding to the standard cartesian system of coordinates and \f$
!! R \f$ is a matrix of rotation of coordinates around OZ axis on angle \f$
!! \phi \f$, presented by \f$ \cos(m \phi) \f$ and \f$ \sin(m \phi) \f$.
!!
!!
!! @param[in] p: Maximal order of spherical harmonics
!! @param[in] vcos: Vector \f$ \{ \cos(m \phi) \}_{m=0}^p \f$
!! @param[in] vsin: Vector \f$ \{ \sin(m \phi) \}_{m=0}^p \f$
!! @param[in] alpha: Scalar multiplier for `src`
!! @param[in] src: Coefficients of initial spherical harmonics
!! @param[in] beta: Scalar multipler for `dst`
!! @param[inout] dst: Coefficients of rotated spherical harmonics
subroutine fmm_sph_rotate_oz_work(p, vcos, vsin, alpha, src, beta, dst)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: vcos(p+1), vsin(p+1), alpha, src((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst((p+1)*(p+1))
    ! Local variables
    integer :: l, m, ind
    real(rp) :: v1, v2, v3, v4
    ! In case alpha is 0.0 just scale output
    if (abs(alpha) < eps_rp) then
        ! Set output to 0.0 if beta is also 0.0
        if (abs(beta) < eps_rp) then
            dst = 0.0
        else
            dst = beta * dst
        end if
        ! Exit subroutine
        return
    end if
    ! Now alpha is non-0.0
    ! In case beta is 0.0 output is just overwritten without being read
    if (abs(beta) < eps_rp) then
        ! l = 0
        dst(1) = alpha*src(1)
        ! l > 0
        !!GCC$ unroll 4
        do l = 1, p
            ind = l*l + l + 1
            ! m = 0
            dst(ind) = alpha*src(ind)
            ! m != 0
            !!GCC$ unroll 4
            do m = 1, l
                v1 = src(ind+m)
                v2 = src(ind-m)
                v3 = vcos(1+m)
                v4 = vsin(1+m)
                ! m > 0
                dst(ind+m) = alpha * (v1*v3-v2*v4)
                ! m < 0
                dst(ind-m) = alpha * (v1*v4+v2*v3)
            end do
        end do
    else
        ! l = 0
        dst(1) = beta*dst(1) + alpha*src(1)
        ! l > 0
        !!GCC$ unroll 4
        do l = 1, p
            ind = l*l + l + 1
            ! m = 0
            dst(ind) = beta*dst(ind) + alpha*src(ind)
            ! m != 0
            !!GCC$ unroll 4
            do m = 1, l
                v1 = src(ind+m)
                v2 = src(ind-m)
                v3 = vcos(1+m)
                v4 = vsin(1+m)
                ! m > 0
                dst(ind+m) = beta*dst(ind+m) + alpha*(v1*v3-v2*v4)
                ! m < 0
                dst(ind-m) = beta*dst(ind-m) + alpha*(v1*v4+v2*v3)
            end do
        end do
    end if
end subroutine fmm_sph_rotate_oz_work

!> Convert input cartesian coordinate into spherical coordinate
!!
!! Output coordinate \f$ (\rho, \theta, \phi) \f$ is presented by \f$ (\rho,
!! \cos \theta, \sin \theta, \cos \phi, \sin\phi) \f$.
!!
!! @param[in] x: Cartesian coordinate
!! @param[out] rho: \f$ \rho \f$
!! @param[out] ctheta: \f$ \cos \theta \f$
!! @param[out] stheta: \f$ \sin \theta \f$
!! @param[out] cphi: \f$ \cos \phi \f$
!! @param[out] sphi: \f$ \sin \phi \f$
    subroutine carttosph(x, rho, ctheta, stheta, cphi, sphi)
        ! Input
        real(rp), intent(in) :: x(3)
        ! Output
        real(rp), intent(out) :: rho, ctheta, stheta, cphi, sphi
        ! Local variables
        real(rp) :: max12, ssq12
        ! Check x(1:2) = 0
        if ((abs(x(1)) < eps_rp) .and. (abs(x(2)) < eps_rp)) then
            rho = abs(x(3))
            ctheta = sign(1.0_rp, x(3))
            stheta = 0.0
            cphi = 1.0
            sphi = 0.0
            return
        end if
        ! In other situations use sum-of-scaled-squares technique
        ! Get norm of x(1:2) and cphi with sphi outputs
        if (abs(x(2)) .gt. abs(x(1))) then
            max12 = abs(x(2))
            ssq12 = 1.0 + (x(1)/x(2))**2
        else
            max12 = abs(x(1))
            ssq12 = 1.0 + (x(2)/x(1))**2
        end if
        stheta = max12 * sqrt(ssq12)
        cphi = x(1) / stheta
        sphi = x(2) / stheta
        ! Then compute rho, ctheta and stheta outputs
        if (abs(x(3)) .gt. max12) then
            rho = 1.0 + ssq12*(max12/x(3))**2
            rho = abs(x(3)) * sqrt(rho)
            stheta = stheta / rho
            ctheta = x(3) / rho
        else
            rho = ssq12 + (x(3)/max12)**2
            rho = max12 * sqrt(rho)
            stheta = stheta / rho
            ctheta = x(3) / rho
        end if
    end subroutine carttosph

!> Transform spherical harmonics in the OXZ plane
!!
!! This function implements @ref fmm_sph_rotate_oxz with predefined values
!! of parameters \p alpha=1.0 and \p beta=0.0.
!! 
!! @param[in] p: maximum order of spherical harmonics
!! @param[in] r1xz: 2D transformation matrix in the OXZ plane
!! @param[in] alpha: Scalar multiplier for `src`
!! @param[in] src: Coefficients of initial spherical harmonics
!! @param[in] beta: Scalar multipler for `dst`
!! @param[out] dst: coefficients of rotated spherical harmonics
!! @param[out] work: Temporary workspace of a size (2*(2*p+1)*(2*p+3))
    subroutine fmm_sph_rotate_oxz_work(p, ctheta, stheta, alpha, src, beta, dst, &
        & work)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: ctheta, stheta, alpha, src((p+1)**2), beta
    ! Output
    real(rp), intent(out) :: dst((p+1)*(p+1))
    ! Temporary workspace
    real(rp), intent(out), target :: work(4*p*p+13*p+4)
    ! Local variables
    real(rp) :: u, v, w, fl, fl2, tmp1, tmp2, vu(2), vv(2), vw(2), &
        ctheta2, stheta2, cstheta
    integer :: l, m, n, ind
    ! Pointers for a workspace
    real(rp), pointer :: r(:, :, :), r_prev(:, :, :), scal_uvw_m(:), &
        & scal_u_n(:), scal_v_n(:), scal_w_n(:), r_swap(:, :, :), vsqr(:)
    !! Spherical harmonics Y_l^m with negative m transform into harmonics Y_l^m
    !! with the same l and negative m, while harmonics Y_l^m with non-negative
    !! m transform into harmonics Y_l^m with the same l and non-negative m.
    !! Transformations for non-negative m will be stored in r(1, :, :) and for
    !! negative m will be in r(2, :, :)
    ! In case alpha is 0.0 just scale output
    if (abs(alpha) < eps_rp) then
        ! Set output to 0.0 if beta is also 0.0
        if (abs(beta) < eps_rp) then
            dst = 0.0
        else
            dst = beta * dst
        end if
        ! Exit subroutine
        return
    end if
    ! Now alpha is non-0.0
    ! In case beta is 0.0 output is just overwritten without being read
    if (abs(beta) < eps_rp) then
        ! Compute rotations/reflections
        ! l = 0
        dst(1) = alpha * src(1)
        if (p .eq. 0) then
            return
        end if
        ! l = 1
        dst(2) = alpha * src(2)
        dst(3) = alpha * (src(3)*ctheta - src(4)*stheta)
        dst(4) = alpha * (src(3)*stheta + src(4)*ctheta)
        if (p .eq. 1) then
            return
        end if
        ! Set pointers
        l = 2 * (p+1) * (p+1)
        r(1:2, 0:p, 0:p) => work(1:l)
        m = 2 * l
        r_prev(1:2, 0:p, 0:p) => work(l+1:m)
        l = m + p + 1
        scal_uvw_m(0:p) => work(m+1:l)
        m = l + p
        scal_u_n(0:p-1) => work(l+1:m)
        l = m + p + 1
        scal_v_n(0:p) => work(m+1:l)
        m = l + p - 2
        scal_w_n(1:p-2) => work(l+1:m)
        l = m + p
        vsqr(1:p) => work(m+1:l)
        ! l = 2, m >= 0
        ctheta2 = ctheta * ctheta
        cstheta = ctheta * stheta
        stheta2 = stheta * stheta
        r(1, 2, 2) = (ctheta2 + 1.0) / 2.0
        r(1, 1, 2) = cstheta
        r(1, 0, 2) = sqrt(3.0) / 2.0 * stheta2
        dst(9) = alpha * (src(9)*r(1, 2, 2) + src(8)*r(1, 1, 2) + &
            & src(7)*r(1, 0, 2))
        r(1, 2, 1) = -cstheta
        r(1, 1, 1) = ctheta2 - stheta2
        r(1, 0, 1) = sqrt(3.0) * cstheta
        dst(8) = alpha * (src(9)*r(1, 2, 1) + src(8)*r(1, 1, 1) + &
            & src(7)*r(1, 0, 1))
        r(1, 2, 0) = sqrt(3.0) / 2.0 * stheta2
        r(1, 1, 0) = -sqrt(3.0) * cstheta
        r(1, 0, 0) = (3.0*ctheta2-1.0) / 2.0
        dst(7) = alpha * (src(9)*r(1, 2, 0) + src(8)*r(1, 1, 0) + &
            & src(7)*r(1, 0, 0))
        ! l = 2,  m < 0
        r(2, 1, 1) = ctheta
        r(2, 2, 1) = -stheta
        dst(6) = alpha * (src(6)*r(2, 1, 1) + src(5)*r(2, 2, 1))
        r(2, 1, 2) = stheta
        r(2, 2, 2) = ctheta
        dst(5) = alpha * (src(6)*r(2, 1, 2) + src(5)*r(2, 2, 2))
        ! l > 2
        vsqr(1) = 1.0
        vsqr(2) = 4.0
        do l = 3, p
            ! Swap previous and current rotation matrices
            r_swap => r_prev
            r_prev => r
            r => r_swap
            ! Prepare scalar factors
            fl = dble(l)
            fl2 = fl * fl
            vsqr(l) = fl2
            scal_uvw_m(0) = 1.0 / fl
            !!GCC$ unroll 4
            do m = 1, l-1
                u = sqrt(fl2 - vsqr(m))
                scal_uvw_m(m) = 1.0 / u
            end do
            u = 2.0 * dble(l)
            u = sqrt(dble(u*(u-1.0)))
            scal_uvw_m(l) = 1.0 / u
            scal_u_n(0) = dble(l)
            scal_v_n(0) = -sqrt(dble(l*(l-1))) / sqrt(2.0)
            !!GCC$ unroll 4
            do n = 1, l-2
                u = sqrt(fl2-vsqr(n))
                scal_u_n(n) = u
            end do
            !!GCC$ unroll 4
            do n = 1, l-2
                v = dble(l+n)
                v = sqrt(v*v-v) / 2.0
                scal_v_n(n) = v
                w = dble(l-n)
                w = -sqrt(w*w-w) / 2.0
                scal_w_n(n) = w
            end do
            u = sqrt(dble(2*l-1))
            scal_u_n(l-1) = u
            v = sqrt(dble((2*l-1)*(2*l-2))) / 2.0
            scal_v_n(l-1) = v
            v = sqrt(dble(2*l*(2*l-1))) / 2.0
            scal_v_n(l) = v
            ind = l*l + l + 1
            ! m = l, n = l and m = -l, n = - l
            vv = ctheta*r_prev(:, l-1, l-1) + r_prev(2:1:-1, l-1, l-1)
            r(:, l, l) = vv * scal_v_n(l) * scal_uvw_m(l)
            tmp1 = src(ind+l) * r(1, l, l)
            tmp2 = src(ind-l) * r(2, l, l)
            ! m = l, n = l-1 and m = -l, n = 1-l
            vu = stheta * r_prev(:, l-1, l-1)
            vv = ctheta*r_prev(:, l-2, l-1) + r_prev(2:1:-1, l-2, l-1)
            r(:, l-1, l) = (vu*scal_u_n(l-1)+vv*scal_v_n(l-1)) * scal_uvw_m(l)
            tmp1 = tmp1 + src(ind+l-1)*r(1, l-1, l)
            tmp2 = tmp2 + src(ind-l+1)*r(2, l-1, l)
            ! m = l, n = 1 and m = -l, n = -1
            vu = stheta * r_prev(:, 1, l-1)
            vv(1) = ctheta * r_prev(1, 0, l-1)
            vv(2) = r_prev(1, 0, l-1)
            vw = ctheta*r_prev(:, 2, l-1) - r_prev(2:1:-1, 2, l-1)
            r(:, 1, l) = vu*scal_u_n(1) + vw*scal_w_n(1) + sqrt(2.0)*scal_v_n(1)*vv
            r(:, 1, l) = r(:, 1, l) * scal_uvw_m(l)
            tmp1 = tmp1 + src(ind+1)*r(1, 1, l)
            tmp2 = tmp2 + src(ind-1)*r(2, 1, l)
            ! m = l, n = 0
            u = stheta * r_prev(1, 0, l-1)
            v = ctheta*r_prev(1, 1, l-1) - r_prev(2, 1, l-1)
            r(1, 0, l) = (u*scal_u_n(0) + v*scal_v_n(0)) * scal_uvw_m(l)
            tmp1 = tmp1 + src(ind)*r(1, 0, l)
            ! m = l, n = 2..l-2 and m = -l, n = 2-l..-2
            !!GCC$ unroll 4
            do n = 2, l-2
                vu = stheta * r_prev(:, n, l-1)
                vv = ctheta*r_prev(:, n-1, l-1) + r_prev(2:1:-1, n-1, l-1)
                vw = ctheta*r_prev(:, n+1, l-1) - r_prev(2:1:-1, n+1, l-1)
                vu = vu*scal_u_n(n) + vv*scal_v_n(n) + vw*scal_w_n(n)
                r(:, n, l) = vu * scal_uvw_m(l)
                tmp1 = tmp1 + src(ind+n)*r(1, n, l)
                tmp2 = tmp2 + src(ind-n)*r(2, n, l)
            end do
            dst(ind+l) = alpha * tmp1
            dst(ind-l) = alpha * tmp2
            ! Now deal with m = 0
            ! n = l and n = -l
            v = -stheta * r_prev(1, l-1, 0)
            u = scal_v_n(l) * scal_uvw_m(0)
            r(1, l, 0) = v * u
            tmp1 = src(ind+l) * r(1, l, 0)
            ! n = l-1
            u = ctheta * r_prev(1, l-1, 0)
            v = -stheta * r_prev(1, l-2, 0)
            w = u*scal_u_n(l-1) + v*scal_v_n(l-1)
            r(1, l-1, 0) = w * scal_uvw_m(0)
            tmp1 = tmp1 + src(ind+l-1)*r(1, l-1, 0)
            ! n = 0
            u = ctheta * r_prev(1, 0, 0)
            v = -stheta * r_prev(1, 1, 0)
            w = u*scal_u_n(0) + v*scal_v_n(0)
            r(1, 0, 0) = w * scal_uvw_m(0)
            tmp1 = tmp1 + src(ind)*r(1, 0, 0)
            ! n = 1
            v = sqrt(2.0)*scal_v_n(1)*r_prev(1, 0, 0) + &
                & scal_w_n(1)*r_prev(1, 2, 0)
            u = ctheta * r_prev(1, 1, 0)
            w = scal_u_n(1)*u - stheta*v
            r(1, 1, 0) = w * scal_uvw_m(0)
            tmp1 = tmp1 + src(ind+1)*r(1, 1, 0)
            ! n = 2..l-2
            !!GCC$ unroll 4
            do n = 2, l-2
                v = scal_v_n(n)*r_prev(1, n-1, 0) + &
                    & scal_w_n(n)*r_prev(1, n+1, 0)
                u = ctheta * r_prev(1, n, 0)
                w = scal_u_n(n)*u - stheta*v
                r(1, n, 0) = w * scal_uvw_m(0)
                tmp1 = tmp1 + src(ind+n)*r(1, n, 0)
            end do
            dst(ind) = alpha * tmp1
            ! Now deal with m=1..l-1 and m=1-l..-1
            !!GCC$ unroll 4
            do m = 1, l-1
                ! n = l and n = -l
                vv = -stheta * r_prev(:, l-1, m)
                u = scal_v_n(l) * scal_uvw_m(m)
                r(:, l, m) = vv * u
                tmp1 = src(ind+l) * r(1, l, m)
                tmp2 = src(ind-l) * r(2, l, m)
                ! n = l-1 and n = 1-l
                vu = ctheta * r_prev(:, l-1, m)
                vv = -stheta * r_prev(:, l-2, m)
                vw = vu*scal_u_n(l-1) + vv*scal_v_n(l-1)
                r(:, l-1, m) = vw * scal_uvw_m(m)
                tmp1 = tmp1 + src(ind+l-1)*r(1, l-1, m)
                tmp2 = tmp2 + src(ind-l+1)*r(2, l-1, m)
                ! n = 0
                u = ctheta * r_prev(1, 0, m)
                v = -stheta * r_prev(1, 1, m)
                w = u*scal_u_n(0) + v*scal_v_n(0)
                r(1, 0, m) = w * scal_uvw_m(m)
                tmp1 = tmp1 + src(ind)*r(1, 0, m)
                ! n = 1
                v = sqrt(2.0)*scal_v_n(1)*r_prev(1, 0, m) + &
                    & scal_w_n(1)*r_prev(1, 2, m)
                u = ctheta * r_prev(1, 1, m)
                w = scal_u_n(1)*u - stheta*v
                r(1, 1, m) = w * scal_uvw_m(m)
                tmp1 = tmp1 + src(ind+1)*r(1, 1, m)
                ! n = -1
                u = ctheta * r_prev(2, 1, m)
                w = -stheta * r_prev(2, 2, m)
                v = u*scal_u_n(1) + w*scal_w_n(1)
                r(2, 1, m) = v * scal_uvw_m(m)
                tmp2 = tmp2 + src(ind-1)*r(2, 1, m)
                ! n = 2..l-2 and n = 2-l..-2
                !!GCC$ unroll 4
                do n = 2, l-2
                    vv = scal_v_n(n)*r_prev(:, n-1, m) + &
                        & scal_w_n(n)*r_prev(:, n+1, m)
                    vu = ctheta * r_prev(:, n, m)
                    vw = scal_u_n(n)*vu - stheta*vv
                    r(:, n, m) = vw * scal_uvw_m(m)
                    tmp1 = tmp1 + src(ind+n)*r(1, n, m)
                    tmp2 = tmp2 + src(ind-n)*r(2, n, m)
                end do
                dst(ind+m) = alpha * tmp1
                dst(ind-m) = alpha * tmp2
            end do
        end do
    else
        stop "Not Implemented"
    end if
end subroutine fmm_sph_rotate_oxz_work

!> Rotate spherical harmonics around OZ axis in an opposite direction
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha R \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics corresponding to a new cartesion system of coordinates, \f$
!! \mathrm{src} \f$ is a vector of coefficients of input spherical harmonics
!! corresponding to the standard cartesian system of coordinates and \f$
!! R \f$ is a matrix of rotation of coordinates around OZ axis on angle \f$
!! \phi \f$, presented by \f$ \cos(m \phi) \f$ and \f$ \sin(m \phi) \f$.
!!
!!
!! @param[in] p: Maximal order of spherical harmonics
!! @param[in] vcos: Vector \f$ \{ \cos(m \phi) \}_{m=0}^p \f$
!! @param[in] vsin: Vector \f$ \{ \sin(m \phi) \}_{m=0}^p \f$
!! @param[in] alpha: Scalar multiplier for `src`
!! @param[in] src: Coefficients of initial spherical harmonics
!! @param[in] beta: Scalar multipler for `dst`
!! @param[inout] dst: Coefficients of rotated spherical harmonics
    subroutine fmm_sph_rotate_oz_adj_work(p, vcos, vsin, alpha, src, beta, dst)
        ! Inputs
        integer, intent(in) :: p
        real(rp), intent(in) :: vcos(p+1), vsin(p+1), alpha, src((p+1)*(p+1)), beta
        ! Output
        real(rp), intent(inout) :: dst((p+1)*(p+1))
        ! Local variables
        integer :: l, m, ind
        real(rp) :: v1, v2, v3, v4
        ! In case alpha is 0.0 just scale output
        if (abs(alpha) < eps_rp) then
            ! Set output to 0.0 if beta is also 0.0
            if (abs(beta) < eps_rp) then
                dst = 0.0
            else
                dst = beta * dst
            end if
            ! Exit subroutine
            return
        end if
        ! Now alpha is non-0.0
        ! In case beta is 0.0 output is just overwritten without being read
        if (abs(beta) < eps_rp) then
            ! l = 0
            dst(1) = alpha*src(1)
            ! l > 0
            do l = 1, p
                ind = l*l + l + 1
                ! m = 0
                dst(ind) = alpha*src(ind)
                ! m != 0
                do m = 1, l
                    v1 = src(ind+m)
                    v2 = src(ind-m)
                    v3 = vcos(1+m)
                    v4 = vsin(1+m)
                    ! m > 0
                    dst(ind+m) = alpha * (v1*v3+v2*v4)
                    ! m < 0
                    dst(ind-m) = alpha * (v2*v3-v1*v4)
                end do
            end do
        else
            ! l = 0
            dst(1) = beta*dst(1) + alpha*src(1)
            ! l > 0
            do l = 1, p
                ind = l*l + l + 1
                ! m = 0
                dst(ind) = beta*dst(ind) + alpha*src(ind)
                ! m != 0
                do m = 1, l
                    v1 = src(ind+m)
                    v2 = src(ind-m)
                    v3 = vcos(1+m)
                    v4 = vsin(1+m)
                    ! m > 0
                    dst(ind+m) = beta*dst(ind+m) + alpha*(v1*v3+v2*v4)
                    ! m < 0
                    dst(ind-m) = beta*dst(ind-m) + alpha*(v2*v3-v1*v4)
                end do
            end do
        end if
    end subroutine fmm_sph_rotate_oz_adj_work

    !> Direct M2L translation over OZ axis
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha L_M \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ L_M \f$ is a matrix of multipole-to-local
!! translation over OZ axis.
!!
!!
!! @param[in] z: OZ coordinate from new to old centers of harmonics
!! @param[in] src_r: Radius of old (multipole) harmonics
!! @param[in] dst_r: Radius of new (local) harmonics
!! @parma[in] pm: Maximal degree of multipole spherical harmonics
!! @parma[in] pl: Maximal degree of local spherical harmonics
!! @param[in] vscales: Normalization constants for harmonics
!! @param[in] m2l_ztranslate_coef:
!! @param[in] alpha: Scalar multipler for `src_m`
!! @param[in] src_m: Expansion in old (multipole) harmonics
!! @param[in] beta: Scalar multipler for `dst_l`
!! @param[inout] dst_l: Expansion in new (local) harmonics
!! @param[out] work: Temporary workspace of a size (pm+2)*(pm+1)
subroutine fmm_m2l_ztranslate_work(z, pm, pl, vscales, &
    & m2l_ztranslate_coef, alpha, src_m, beta, dst_l, work)
! Inputs
integer, intent(in) :: pm, pl
real(rp), intent(in) :: z, vscales((pm+pl+1)*(pm+pl+1)), &
    & m2l_ztranslate_coef(pm+1, pl+1, pl+1), alpha, src_m((pm+1)*(pm+1)), &
    & beta
! Output
real(rp), intent(inout) :: dst_l((pl+1)*(pl+1))
! Temporary workspace
real(rp), intent(out), target :: work((pm+2)*(pm+1))
! Local variables
real(rp) :: tmp1, r1, r2, res1, res2, pow_r2
integer :: j, k, n, indj, indk1, indk2
! Pointers for temporary values of powers
real(rp), pointer :: src_m2(:), pow_r1(:)
! In case alpha is 0.0_rp just do a proper scaling of output
if (abs(alpha) < eps_rp) then
    if (abs(beta) < eps_rp) then
        dst_l = 0.0_rp
    else
        dst_l = beta * dst_l
    end if
    return
end if
! Now alpha is non-0.0_rp
! z cannot be 0.0_rp, as input sphere (multipole) must not intersect with
! output sphere (local)
if (abs(z) < eps_rp) then
    return
end if
! Prepare pointers
n = (pm+1) ** 2
src_m2(1:n) => work(1:n)
pow_r1(1:pm+1) => work(n+1:n+pm+1)
! Get powers of r1 and r2
r1 = 1.0_rp / z
r2 = 1.0_rp / z
! This abs(r1) makes it possible to work with negative z to avoid
! unnecessary rotation to positive z
tmp1 = abs(r1)
do j = 0, pm
    indj = j*j + j + 1
    pow_r1(j+1) = tmp1 / vscales(indj)
    tmp1 = tmp1 * r1
end do
pow_r2 = 1.0_rp
! Reorder source harmonics from (degree, order) to (order, degree)
! 0.0_rp order k=0 at first
do j = 0, pm
    indj = j*j + j + 1
    src_m2(j+1) = pow_r1(j+1) * src_m(indj)
end do
! Non-0.0_rp orders next, a positive k followed by a negative -k
indk1 = pm + 2
do k = 1, pm
    n = pm - k + 1
    indk2 = indk1 + n
    !!GCC$ unroll 4
    do j = k, pm
        indj = j*j + j + 1
        src_m2(indk1+j-k) = pow_r1(j+1) * src_m(indj+k)
        src_m2(indk2+j-k) = pow_r1(j+1) * src_m(indj-k)
    end do
    indk1 = indk2 + n
end do
! Do actual M2L
! Overwrite output if beta is 0.0_rp
if (abs(beta) < eps_rp) then
    do j = 0, pl
        ! Offset for dst_l
        indj = j*j + j + 1
        ! k = 0
        tmp1 = alpha * vscales(indj) * pow_r2
        pow_r2 = pow_r2 * r2
        res1 = 0.0_rp
        !!GCC$ unroll 4
        do n = 0, pm
            res1 = res1 + m2l_ztranslate_coef(n+1, 1, j+1)*src_m2(n+1)
        end do
        dst_l(indj) = tmp1 * res1
        ! k != 0
        !!GCC$ unroll 4
        do k = 1, j
            ! Offsets for src_m2
            indk1 = pm + 2 + (2*pm-k+2)*(k-1)
            indk2 = indk1 + pm - k + 1
            res1 = 0.0_rp
            res2 = 0.0_rp
            !!GCC$ unroll 4
            do n = k, pm
                res1 = res1 + &
                    & m2l_ztranslate_coef(n-k+1, k+1, j-k+1)* &
                    & src_m2(indk1+n-k)
                res2 = res2 + &
                    & m2l_ztranslate_coef(n-k+1, k+1, j-k+1)* &
                    & src_m2(indk2+n-k)
            end do
            dst_l(indj+k) = tmp1 * res1
            dst_l(indj-k) = tmp1 * res2
        end do
    end do
else
    do j = 0, pl
        ! Offset for dst_l
        indj = j*j + j + 1
        ! k = 0
        tmp1 = alpha * vscales(indj) * pow_r2
        pow_r2 = pow_r2 * r2
        res1 = 0.0_rp
        do n = 0, pm
            res1 = res1 + m2l_ztranslate_coef(n+1, 1, j+1)*src_m2(n+1)
        end do
        dst_l(indj) = beta*dst_l(indj) + tmp1*res1
        ! k != 0
        do k = 1, j
            ! Offsets for src_m2
            indk1 = pm + 2 + (2*pm-k+2)*(k-1)
            indk2 = indk1 + pm - k + 1
            res1 = 0.0_rp
            res2 = 0.0_rp
            do n = k, pm
                res1 = res1 + &
                    & m2l_ztranslate_coef(n-k+1, k+1, j-k+1)* &
                    & src_m2(indk1+n-k)
                res2 = res2 + &
                    & m2l_ztranslate_coef(n-k+1, k+1, j-k+1)* &
                    & src_m2(indk2+n-k)
            end do
            dst_l(indj+k) = beta*dst_l(indj+k) + tmp1*res1
            dst_l(indj-k) = beta*dst_l(indj-k) + tmp1*res2
        end do
    end do
end if
end subroutine fmm_m2l_ztranslate_work

!> Direct L2L translation over OZ axis
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha L_L \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ L_L \f$ is a matrix of local-to-local
!! translation over OZ axis.
!!
!!
!! @param[in] z: OZ coordinate from new to old centers of harmonics
!! @param[in] src_r: Radius of old harmonics
!! @param[in] dst_r: Radius of new harmonics
!! @parma[in] p: Maximal degree of spherical harmonics
!! @param[in] vscales: Normalization constants for harmonics
!! @param[in] vfact: Square roots of factorials
!! @param[in] alpha: Scalar multipler for `src_l`
!! @param[in] src_l: Expansion in old harmonics
!! @param[in] beta: Scalar multipler for `dst_l`
!! @param[inout] dst_l: Expansion in new harmonics
!! @param[out] work: Temporary workspace of a size (2*(p+1))
subroutine fmm_l2l_ztranslate_work(z, p, vscales, vfact, alpha, &
    & src_l, beta, dst_l, work)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: z, vscales((p+1)*(p+1)), &
        & vfact(2*p+1), alpha, src_l((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst_l((p+1)*(p+1))
    ! Temporary workspace
    real(rp), intent(out), target :: work(2*(p+1))
    ! Local variables
    real(rp) :: r1, r2, tmp1, tmp2
    integer :: j, k, n, indj, indn
    ! Pointers for temporary values of powers
    real(rp), pointer :: pow_r1(:), pow_r2(:)
    ! Init output
    if (abs(beta) < eps_rp) then
        dst_l = 0.0_rp
    else
        dst_l = beta * dst_l
    end if
    ! If harmonics have different centers
    if (abs(z) > eps_rp) then
        ! Prepare pointers
        pow_r1(1:p+1) => work(1:p+1)
        pow_r2(1:p+1) => work(p+2:2*(p+1))
        ! Get powers of r1 and r2
        r1 = z / 1.0_rp
        r2 = 1.0_rp / z
        pow_r1(1) = 1
        pow_r2(1) = 1
        do j = 2, p+1
            pow_r1(j) = pow_r1(j-1) * r1
            pow_r2(j) = pow_r2(j-1) * r2
        end do
        ! Do actual L2L
        do j = 0, p
            indj = j*j + j + 1
            do k = 0, j
                tmp1 = alpha * pow_r2(j+1) / vfact(j-k+1) / vfact(j+k+1) * &
                    & vscales(indj)
                do n = j, p
                    indn = n*n + n + 1
                    tmp2 = tmp1 * pow_r1(n+1) / vscales(indn) * &
                        & vfact(n-k+1) * vfact(n+k+1) / vfact(n-j+1) / &
                        & vfact(n-j+1)
                    if (mod(n+j, 2) .eq. 1) then
                        tmp2 = -tmp2
                    end if
                    if (k .eq. 0) then
                        dst_l(indj) = dst_l(indj) + tmp2*src_l(indn)
                    else
                        dst_l(indj+k) = dst_l(indj+k) + tmp2*src_l(indn+k)
                        dst_l(indj-k) = dst_l(indj-k) + tmp2*src_l(indn-k)
                    end if
                end do
            end do
        end do
    ! If harmonics are located at the same point
    else
        tmp1 = alpha
        do j = 0, p
            indj = j*j + j + 1
            do k = indj-j, indj+j
                dst_l(k) = dst_l(k) + src_l(k)*tmp1
            end do
            tmp1 = tmp1
        end do
    end if
end subroutine fmm_l2l_ztranslate_work

!> Direct M2M translation by 4 rotations and 1 translation
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha M_M \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ M_M \f$ is a matrix of a multipole-to-multipole
!! translation.
!!
!! Rotates around OZ and OY axes, translates over OZ and then rotates back
!! around OY and OZ axes.
!!
!!
!! @param[in] c: Radius-vector from new to old centers of harmonics
!! @param[in] src_r: Radius of old harmonics
!! @param[in] dst_r: Radius of new harmonics
!! @param[in] p: Maximal degree of spherical harmonics
!! @param[in] vscales: Normalization constants for Y_lm
!! @param[in] vcnk: Square roots of combinatorial numbers C_n^k
!! @param[in] alpha: Scalar multiplier for `src_m`
!! @param[in] src_m: Expansion in old harmonics
!! @param[in] beta: Scalar multiplier for `dst_m`
!! @param[inout] dst_m: Expansion in new harmonics
!! @param[out] work: Temporary workspace of a size 6*p*p+19*p+8
subroutine fmm_m2m_rotation_work(c, p, vscales, vcnk, alpha, &
    & src_m, beta, dst_m, work)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: c(3), vscales((p+1)*(p+1)), &
        & vcnk((2*p+1)*(p+1)), alpha, src_m((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst_m((p+1)*(p+1))
    ! Temporary workspace
    real(rp), intent(out), target :: work(6*p*p + 19*p + 8)
    ! Local variables
    real(rp) :: rho, ctheta, stheta, cphi, sphi
    integer :: m, n
    ! Pointers for temporary values of harmonics
    real(rp), pointer :: tmp_m(:), tmp_m2(:), vcos(:), vsin(:)
    ! Convert Cartesian coordinates into spherical
    call carttosph(c, rho, ctheta, stheta, cphi, sphi)
    ! If no need for rotations, just do translation along z
    if (abs(stheta) < eps_rp) then
        ! Workspace here is 2*(p+1)
        call fmm_m2m_ztranslate_work(c(3), p, vscales, vcnk, &
            & alpha, src_m, beta, dst_m, work)
        return
    end if
    ! Prepare pointers
    m = (p+1)**2
    n = 4*m + 5*p ! 4*p*p + 13*p + 4
    tmp_m(1:m) => work(n+1:n+m) ! 5*p*p + 15*p + 5
    n = n + m
    tmp_m2(1:m) => work(n+1:n+m) ! 6*p*p + 17*p + 6
    n = n + m
    m = p + 1
    vcos => work(n+1:n+m) ! 6*p*p + 18*p + 7
    n = n + m
    vsin => work(n+1:n+m) ! 6*p*p + 19*p + 8
    ! Compute arrays of cos and sin that are needed for rotations of harmonics
    call trgev(cphi, sphi, p, vcos, vsin)
    ! Rotate around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_adj_work(p, vcos, vsin, alpha, src_m, 0.0_rp, tmp_m)
    ! Perform rotation in the OXZ plane, work size is 4*p*p+13*p+4
    call fmm_sph_rotate_oxz_work(p, ctheta, -stheta, 1.0_rp, tmp_m, 0.0_rp, &
        & tmp_m2, work)
    ! OZ translation, workspace here is 2*(p+1)
    call fmm_m2m_ztranslate_work(rho, p, vscales, vcnk, 1.0_rp, &
        & tmp_m2, 0.0_rp, tmp_m, work)
    ! Backward rotation in the OXZ plane, work size is 4*p*p+13*p+4
    call fmm_sph_rotate_oxz_work(p, ctheta, stheta, 1.0_rp, tmp_m, 0.0_rp, tmp_m2, &
        & work)
    ! Backward rotation around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_work(p, vcos, vsin, 1.0_rp, tmp_m2, beta, dst_m)
end subroutine fmm_m2m_rotation_work

!> Direct M2L translation by 4 rotations and 1 translation
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha L_M \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ L_M \f$ is a matrix of a multipole-to-local
!! translation.
!!
!! Rotates around OZ and OY axes, translates over OZ and then rotates back
!! around OY and OZ axes.
!!
!!
!! @param[in] c: Radius-vector from new to old centers of harmonics
!! @param[in] src_r: Radius of old harmonics
!! @param[in] dst_r: Radius of new harmonics
!! @param[in] pm: Maximal degree of multipole spherical harmonics
!! @param[in] pl: Maximal degree of local spherical harmonics
!! @param[in] vscales: Normalization constants for Y_lm
!! @param[in] m2l_ztranslate_coef:
!! @param[in] alpha: Scalar multiplier for `src_m`
!! @param[in] src_m: Expansion in old harmonics
!! @param[in] beta: Scalar multiplier for `dst_l`
!! @param[inout] dst_l: Expansion in new harmonics
!! @param[out] work: Temporary workspace of a size 6*p*p+19*p+8 where p is a
!!      maximum of pm and pl
subroutine fmm_m2l_rotation_work(c, pm, pl, vscales, &
    & m2l_ztranslate_coef, alpha, src_m, beta, dst_l, work)
    ! Inputs
    integer, intent(in) :: pm, pl
    real(rp), intent(in) :: c(3), vscales((pm+pl+1)**2), &
        & m2l_ztranslate_coef(pm+1, pl+1, pl+1), alpha, src_m((pm+1)*(pm+1)), &
        & beta
    ! Output
    real(rp), intent(inout) :: dst_l((pl+1)*(pl+1))
    ! Temporary workspace
    real(rp), intent(out), target :: &
        & work(6*max(pm, pl)**2 + 19*max(pm, pl) + 8)
    ! Local variables
    real(rp) :: rho, ctheta, stheta, cphi, sphi
    integer :: m, n, p
    ! Pointers for temporary values of harmonics
    real(rp), pointer :: tmp_ml(:), tmp_ml2(:), vcos(:), vsin(:)
    ! Covert Cartesian coordinates into spherical
    call carttosph(c, rho, ctheta, stheta, cphi, sphi)
    ! If no need for rotations, just do translation along z
    if (abs(stheta) < eps_rp) then
        ! Workspace here is (pm+2)*(pm+1)
        call fmm_m2l_ztranslate_work(c(3), pm, pl, vscales, &
            & m2l_ztranslate_coef, alpha, src_m, beta, dst_l, work)
        return
    end if
    ! Prepare pointers
    p = max(pm, pl)
    m = (p+1)**2
    n = 4*m + 5*p ! 4*p*p + 13*p + 4
    tmp_ml(1:m) => work(n+1:n+m) ! 5*p*p + 15*p + 5
    n = n + m
    tmp_ml2(1:m) => work(n+1:n+m) ! 6*p*p + 17*p + 6
    n = n + m
    m = p + 1
    vcos => work(n+1:n+m) ! 6*p*p + 18*p + 7
    n = n + m
    vsin => work(n+1:n+m) ! 6*p*p + 19*p + 8
    ! Compute arrays of cos and sin that are needed for rotations of harmonics
    call trgev(cphi, sphi, p, vcos, vsin)
    ! Rotate around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_adj_work(pm, vcos, vsin, alpha, src_m, 0.0_rp, tmp_ml)
    ! Perform rotation in the OXZ plane, work size is 4*pm*pm+13*pm+4
    call fmm_sph_rotate_oxz_work(pm, ctheta, -stheta, 1.0_rp, tmp_ml, 0.0_rp, &
        & tmp_ml2, work)
    ! OZ translation, workspace here is (pm+2)*(pm+1)
    call fmm_m2l_ztranslate_work(rho, pm, pl, vscales, &
        & m2l_ztranslate_coef, 1.0_rp, tmp_ml2, 0.0_rp, tmp_ml, work)
    ! Backward rotation in the OXZ plane, work size is 4*pl*pl+13*pl+4
    call fmm_sph_rotate_oxz_work(pl, ctheta, stheta, 1.0_rp, tmp_ml, 0.0_rp, &
        & tmp_ml2, work)
    ! Backward rotation around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_work(pl, vcos, vsin, 1.0_rp, tmp_ml2, beta, dst_l)
end subroutine fmm_m2l_rotation_work

!> Direct L2L translation by 4 rotations and 1 translation
!!
!! Compute the following matrix-vector product:
!! \f[
!!      \mathrm{dst} = \beta \mathrm{dst} + \alpha L_L \mathrm{src},
!! \f]
!! where \f$ \mathrm{dst} \f$ is a vector of coefficients of output spherical
!! harmonics, \f$ \mathrm{src} \f$ is a vector of coefficients of input
!! spherical harmonics and \f$ L_L \f$ is a matrix of a local-to-local
!! translation.
!!
!! Rotates around OZ and OY axes, translates over OZ and then rotates back
!! around OY and OZ axes.
!!
!!
!! @param[in] c: Radius-vector from new to old centers of harmonics
!! @param[in] src_r: Radius of old harmonics
!! @param[in] dst_r: Radius of new harmonics
!! @param[in] p: Maximal degree of spherical harmonics
!! @param[in] vscales: Normalization constants for Y_lm
!! @param[in] vfact: Square roots of factorials
!! @param[in] alpha: Scalar multiplier for `src_l`
!! @param[in] src_l: Expansion in old harmonics
!! @param[in] beta: Scalar multiplier for `dst_l`
!! @param[inout] dst_l: Expansion in new harmonics
!! @param[out] work: Temporary workspace of a size 6*p*p+19*p+8
subroutine fmm_l2l_rotation_work(c, p, vscales, vfact, alpha, &
    & src_l, beta, dst_l, work)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: c(3), vscales((p+1)*(p+1)), &
        & vfact(2*p+1), alpha, src_l((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst_l((p+1)*(p+1))
    ! Temporary workspace
    real(rp), intent(out), target :: work(6*p*p + 19*p + 8)
    ! Local variables
    real(rp) :: rho, ctheta, stheta, cphi, sphi
    integer :: m, n
    ! Pointers for temporary values of harmonics
    real(rp), pointer :: tmp_l(:), tmp_l2(:), vcos(:), vsin(:)
    ! Covert Cartesian coordinates into spherical
    call carttosph(c, rho, ctheta, stheta, cphi, sphi)
    ! If no need for rotations, just do translation along z
    if (abs(stheta) < eps_rp) then
        ! Workspace here is 2*(p+1)
        call fmm_l2l_ztranslate_work(c(3), p, vscales, vfact, &
            & alpha, src_l, beta, dst_l, work)
        return
    end if
    ! Prepare pointers
    m = (p+1)**2
    n = 4*m + 5*p ! 4*p*p + 13*p + 4
    tmp_l(1:m) => work(n+1:n+m) ! 5*p*p + 15*p + 5
    n = n + m
    tmp_l2(1:m) => work(n+1:n+m) ! 6*p*p + 17*p + 6
    n = n + m
    m = p + 1
    vcos => work(n+1:n+m) ! 6*p*p + 18*p + 7
    n = n + m
    vsin => work(n+1:n+m) ! 6*p*p + 19*p + 8
    ! Compute arrays of cos and sin that are needed for rotations of harmonics
    call trgev(cphi, sphi, p, vcos, vsin)
    ! Rotate around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_adj_work(p, vcos, vsin, alpha, src_l, 0.0_rp, tmp_l)
    ! Perform rotation in the OXZ plane, work size is 4*p*p+13*p+4
    call fmm_sph_rotate_oxz_work(p, ctheta, -stheta, 1.0_rp, tmp_l, 0.0_rp, &
        & tmp_l2, work)
    ! OZ translation, workspace here is 2*(p+1)
    call fmm_l2l_ztranslate_work(rho, p, vscales, vfact, 1.0_rp, &
        & tmp_l2, 0.0_rp, tmp_l, work)
    ! Backward rotation in the OXZ plane, work size is 4*p*p+13*p+4
    call fmm_sph_rotate_oxz_work(p, ctheta, stheta, 1.0_rp, tmp_l, 0.0_rp, tmp_l2, &
        & work)
    ! Backward rotation around OZ axis (work array might appear in the future)
    call fmm_sph_rotate_oz_work(p, vcos, vsin, 1.0_rp, tmp_l2, beta, dst_l)
end subroutine fmm_l2l_rotation_work


subroutine fmm_m2m(c_st, pm, s, t)
    implicit none

    real(rp) :: c_st(3)
    !! Distance vector from source to target
    integer(ip) :: pm
    !! Maximum level of spherical harmonics expansion for multipoles
    real(rp) :: s(:)
    !! Source distribution expansion coefficients
    real(rp) :: t(:)
    !! Target distribution expansion coefficients

    real(rp), allocatable :: work(:)

    ! Allocate local variables
    allocate(work(6*pm**2 + 19*pm + 8))

    call fmm_m2m_rotation_work(c_st, pm, vscales, vcnk, 1.0_rp, s, 0.0_rp, t, work)

    deallocate(work)
end subroutine

subroutine fmm_m2l(c_st, pm, pl, s, t)
    implicit none
    
    real(rp) :: c_st(3)
    !! Distance vector from source to target
    integer(ip) :: pl
    !! Maximum level of spherical harmonics expansion for local exp. 
    integer(ip) :: pm
    !! Maximum level of spherical harmonics expansion for multipoles
    real(rp) :: s(:)
    !! Source distribution expansion coefficients
    real(rp) :: t(:)
    !! Target distribution expansion coefficients

    real(rp), allocatable :: work(:)

    ! Allocate local variables
    allocate(work(6*max(pm, pl)**2 + 19*max(pm, pl) + 8))

    call fmm_m2l_rotation_work(c_st, pm, pl, vscales, m2l_ztranslate_coef, 1.0_rp, s, 0.0_rp, t, work)

    deallocate(work)
end subroutine

subroutine fmm_l2l(c_st, r_s, r_t, pl, s, t)
    implicit none
    
    real(rp) :: c_st(3)
    !! Distance vector from source to target
    real(rp) :: r_s
    !! Size of source node
    real(rp) :: r_t
    !! Size of target node
    integer(ip) :: pl
    !! Maximum level of spherical harmonics expansion for local exp. 
    real(rp) :: s(:)
    !! Source distribution expansion coefficients
    real(rp) :: t(:)
    !! Target distribution expansion coefficients

    real(rp), allocatable :: vfact(:), work(:)

    ! Allocate local variables
    allocate(vfact(2*pl+1))
    allocate(work(6*pl**2 + 19*pl + 8))

    call make_vfact(pl, vfact)

    call fmm_l2l_rotation_work(c_st, pl, vscales, vfact, 1.0_rp, s, 0.0_rp, t, work)

    deallocate(vfact, work)
end subroutine
!> Accumulate potential, induced by multipole spherical harmonics
!!
!! This function relies on a user-provided temporary workspace
!!
!! Computes the following sum:
!! \f[
!!      v = \beta v + \alpha \sum_{\ell=0}^p \frac{4\pi}{\sqrt{2\ell+1}}
!!      \left( \frac{r}{\|c\|} \right)^{\ell+1} \sum_{m=-\ell}^\ell
!!      M_\ell^m Y_\ell^m \left( \frac{c}{\|c\|} \right),
!! \f]
!! where \f$ M \f$ is a vector of coefficients of input harmonics of
!! a degree up to \f$ p \f$ inclusively with a convergence radius \f$ r \f$
!! located at the origin, \f$ \alpha \f$ and \f$ \beta \f$ are scaling factors
!! and \f$ c \f$ is a location of a particle.
!!
!! Based on normalized real spherical harmonics \f$ Y_\ell^m \f$, scaled by \f$
!! r^{-\ell} \f$. It means corresponding coefficients are simply scaled by an
!! additional factor \f$ r^\ell \f$.
!!
!! @param[in] c: Coordinates of a particle (relative to center of harmonics)
!! @param[in] r: Radius of spherical harmonics
!! @param[in] p: Maximal degree of multipole basis functions
!! @param[in] vscales_rel: Relative normalization constants for
!!      \f$ Y_\ell^m \f$. Dimension is `(p+1)**2`
!! @param[in] alpha: Scalar multiplier for `src_m`
!! @param[in] src_m: Multipole coefficients. Dimension is `(p+1)**2`
!! @param[in] beta: Scalar multiplier for `v`
!! @param[inout] v: Value of induced potential
!! @param[out] work: Temporary workspace of size (p+1)
subroutine fmm_m2p_work(c, src_r, p, vscales_rel, alpha, src_m, &
    & beta, dst_v, work)
    ! Inputs
    integer, intent(in) :: p
    real(rp), intent(in) :: c(3), src_r, vscales_rel((p+1)*(p+1)), alpha, &
        & src_m((p+1)*(p+1)), beta
    ! Output
    real(rp), intent(inout) :: dst_v
    ! Workspace
    real(rp), intent(out), target :: work(p+1)
    ! Local variables
    real(rp) :: rho, ctheta, stheta, cphi, sphi, rcoef, t, tmp, tmp1, tmp2, &
        & tmp3, max12, ssq12, pl2m, pl1m, plm, pmm, cmphi, smphi, ylm
    integer :: l, m, indl
    ! Scale output
    if (abs(beta) < eps_rp) then
        dst_v = 0.0_rp
    else
        dst_v = beta * dst_v
    end if
    ! In case of 0.0_rp alpha nothing else is required no matter what is the
    ! value of the induced potential
    if (abs(alpha) < eps_rp) then
        return
    end if
    ! Get spherical coordinates
    if (abs(c(1)) <  eps_rp) then
        max12 = abs(c(2))
        ssq12 = 1.0_rp
    else if (abs(c(2)) .gt. abs(c(1))) then
        max12 = abs(c(2))
        ssq12 = 1.0_rp + (c(1)/c(2))**2
    else
        max12 = abs(c(1))
        ssq12 = 1.0_rp + (c(2)/c(1))**2
    end if
    ! Then we compute rho
    if (abs(c(3)) < eps_rp) then
        rho = max12 * sqrt(ssq12)
    else if (abs(c(3)) .gt. max12) then
        rho = 1.0_rp + ssq12 *(max12/c(3))**2
        rho = abs(c(3)) * sqrt(rho)
    else
        rho = ssq12 + (c(3)/max12)**2
        rho = max12 * sqrt(rho)
    end if
    ! In case of a singularity (rho=0.0_rp) induced potential is infinite and is
    ! not taken into account.
    if (abs(rho) < eps_rp) then
        return
    end if
    ! Compute the actual induced potential
    rcoef = src_r / rho
    ! Length of a vector x(1:2)
    stheta = max12 * sqrt(ssq12)
    ! Case x(1:2) != 0
    if (abs(stheta) > eps_rp) then
        ! Normalize cphi and sphi
        cphi = c(1) / stheta
        sphi = c(2) / stheta
        ! Normalize ctheta and stheta
        ctheta = c(3) / rho
        stheta = stheta / rho
        ! Treat easy cases
        select case(p)
            case (0)
                ! l = 0
                dst_v = dst_v + alpha*rcoef*vscales_rel(1)*src_m(1)
                return
            case (1)
                ! l = 0
                tmp = src_m(1) * vscales_rel(1)
                ! l = 1
                tmp2 = ctheta * src_m(3) * vscales_rel(3)
                tmp3 = vscales_rel(4) * stheta
                tmp2 = tmp2 - tmp3*sphi*src_m(2)
                tmp2 = tmp2 - tmp3*cphi*src_m(4)
                dst_v = dst_v + alpha*rcoef*(tmp+rcoef*tmp2)
                return
        end select
        ! Now p>1
        ! Precompute alpha*rcoef^{l+1}
        work(1) = alpha * rcoef
        do l = 1, p
            work(l+1) = rcoef * work(l)
        end do
        ! Case m = 0
        ! P_{l-2}^m which is P_0^0 now
        pl2m = 1.0_rp
        dst_v = dst_v + work(1)*src_m(1)*vscales_rel(1)
        ! Update P_m^m for the next iteration
        pmm = -stheta
        ! P_{l-1}^m which is P_{m+1}^m now
        pl1m = ctheta
        ylm = pl1m * vscales_rel(3)
        dst_v = dst_v + work(2)*src_m(3)*ylm
        ! P_l^m for l>m+1
        do l = 2, p
            plm = dble(2*l-1)*ctheta*pl1m - dble(l-1)*pl2m
            plm = plm / dble(l)
            ylm = plm * vscales_rel(l*l+l+1)
            dst_v = dst_v + work(l+1)*src_m(l*l+l+1)*ylm
            ! Update P_{l-2}^m and P_{l-1}^m for the next iteration
            pl2m = pl1m
            pl1m = plm
        end do
        ! Prepare cos(m*phi) and sin(m*phi) for m=1
        cmphi = cphi
        smphi = sphi
        ! Case 0<m<p
        do m = 1, p-1
            ! P_{l-2}^m which is P_m^m now
            pl2m = pmm
            ylm = pmm * vscales_rel((m+1)**2)
            tmp1 = cmphi*src_m((m+1)**2) + smphi*src_m(m*m+1)
            dst_v = dst_v + work(m+1)*ylm*tmp1
            ! Temporary to reduce number of operations
            tmp1 = dble(2*m+1) * pmm
            ! Update P_m^m for the next iteration
            pmm = -stheta * tmp1
            ! P_{l-1}^m which is P_{m+1}^m now
            pl1m = ctheta * tmp1
            ylm = pl1m * vscales_rel((m+1)*(m+3))
            tmp1 = cmphi*src_m((m+1)*(m+3)) + smphi*src_m((m+1)*(m+2)+1-m)
            dst_v = dst_v + work(m+2)*ylm*tmp1
            ! P_l^m for l>m+1
            do l = m+2, p
                plm = dble(2*l-1)*ctheta*pl1m - dble(l+m-1)*pl2m
                plm = plm / dble(l-m)
                ylm = plm * vscales_rel(l*l+l+1+m)
                tmp1 = cmphi*src_m(l*l+l+1+m) + smphi*src_m(l*l+l+1-m)
                dst_v = dst_v + work(l+1)*ylm*tmp1
                ! Update P_{l-2}^m and P_{l-1}^m for the next iteration
                pl2m = pl1m
                pl1m = plm
            end do
            ! Update cos(m*phi) and sin(m*phi) for the next iteration
            tmp1 = cmphi
            cmphi = cmphi*cphi - smphi*sphi
            smphi = tmp1*sphi + smphi*cphi
        end do
        ! Case m=p requires only to use P_m^m
        ylm = pmm * vscales_rel((p+1)**2)
        tmp1 = cmphi*src_m((p+1)**2) + smphi*src_m(p*p+1)
        dst_v = dst_v + work(p+1)*ylm*tmp1
    ! Case of x(1:2) = 0 and x(3) != 0
    else
        ! In this case Y_l^m = 0 for m != 0, so only case m = 0 is taken into
        ! account. Y_l^0 = ctheta^l in this case where ctheta is either +1 or
        ! -1. So, we copy sign(ctheta) into rcoef. But before that we
        ! initialize alpha/r factor for l=0 case without taking possible sign
        ! into account.
        t = alpha * rcoef
        if (c(3) .lt. 0.0_rp) then
            rcoef = -rcoef
        end if
        ! Proceed with accumulation of a potential
        indl = 1
        do l = 0, p
            ! Index of Y_l^0
            indl = indl + 2*l
            ! Add 4*pi/(2*l+1)*rcoef^{l+1}*Y_l^0 contribution
            dst_v = dst_v + t*src_m(indl)*vscales_rel(indl)
            ! Update t
            t = t * rcoef
        end do
    end if
end subroutine fmm_m2p_work

subroutine fmm_m2p(c_st, r_s,  pm, s, potential)
    implicit none
    
    real(rp) :: c_st(3)
    !! Distance vector from source to target
    real(rp) :: r_s
    !! Size of source node
    integer(ip) :: pm
    !! Maximum level of spherical harmonics expansion for multipoles
    real(rp) :: s(:)
    !! Target distribution expansion coefficients
    real(rp), intent(out) :: potential

    real(rp), allocatable :: work(:)

    ! Allocate local variables
    allocate(work(pm+1))


    call fmm_m2p_work(c_st, r_s, pm, vscales_rel, 1.0_rp, s, 0.0_rp, potential, work)

    deallocate(work)
end subroutine

end module