!{\src2tex{textfont=tt}}
!!****f* abinit/prep_fourwf
!! NAME
!! prep_fourwf
!!
!! FUNCTION
!! this routine prepares the data to the call of fourwf.
!!
!! COPYRIGHT
!! copyright (c) 1998-2005 abinit group (MT)
!! this file is distributed under the terms of the
!! gnu general public license, see ~abinit/infos/copyright
!! or http://www.gnu.org/copyleft/gpl.txt .
!! for the initials of contributors, see ~abinit/infos/contributors .
!!
!! INPUTS
!!  blocksize= size of block for FFT
!!  cwavef(2,npw*nspinor*ndat)=planewave coefficients of wavefunction.
!!  dimffnl=second dimension of ffnl (1+number of derivatives)
!!  dtfil <type(datafiles_type)>=variables related to files
!!  ffnl(npw,dimffnl,lmnmax,ntypat)=nonlocal form factors on basis sphere.
!!  gs_hamk <type(gs_hamiltonian_type)>=all data for the hamiltonian at k
!!  gvnlc=matrix elements <G|Vnonlocal|C>
!!  kg_k(3,npw_k)=reduced planewave coordinates.
!!  icall = order of call of this routine in lobpcgccwf
!!  lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps
!!        =if useylm=0, max number of (l,n)   comp. over all type of psps
!!  matblk=dimension of the array ph3d
!!  mgfft=maximum size of 1d ffts
!!  mpi_enreg=informations about mpi parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials
!!  natom=number of atoms in cell.
!!  nband_k=number of bands at this k point for that spin polarization
!!  nbdblock=
!!  npw_k=number of plane waves at this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in unit cell.
!!  nvloc=final dimension of vlocal (usually 1, but 4 for non-collinear)
!!  n4,n5,n6 used for dimensionning of vlocal
!!  ph3d(2,npw,matblk)=3-dim structure factors, for each atom and plane wave.
!!  prtvol=control print volume and debugging output
!!  vlocal(n4,n5,n6,nvloc)= local potential in real space, on the augmented fft grid
!!
!! OUTPUT
!!  gwavef=(2,npw*nspinor*ndat)=matrix elements <G|H|C>.
!!
!! SIDE EFFECTS
!!
!! PARENTS
!!      mkrho,vtowfk
!!
!! CHILDREN
!!      fourwf,sphereboundary,timab,xallgather_mpi,xallgatherv_mpi
!!      xalltoallv_mpi,xcomm_init
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine prep_fourwf(rhoaug,blocksize,cwavef,wfraug,gs_hamk,istwf_k,iblock,icall,kg_k,&
& mgfft,mpi_enreg,nbdblock,nband_k,npw_k,n4,n5,n6,occ_k,wtk)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_12ffts
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

#if defined MPI_FFT
          include 'mpif.h'
#endif
!Arguments ------------------------------------
 type(gs_hamiltonian_type) :: gs_hamk
 integer:: blocksize
 integer :: iblock,icall,istwf_k,mgfft,n4,n5
 integer :: n6,nband_k,nbdblock,npw_k
 type(mpi_type) :: mpi_enreg
 integer :: kg_k(3,npw_k)
 real(dp) :: wtk,cwavef(2,npw_k*blocksize)
 real(dp) :: occ_k(nband_k),rhoaug(n4,n5,n6),wfraug(2,n4,n5,n6)
!Local variables-------------------------------
  integer :: oldspacecomm,spacecomm=0
  integer:: ier,ipw
  integer:: old_me_g0 =  0,old_num_group_fft,old_paral_compil_fft,old_paral_level,tim_fourwf
  integer:: old_ngfft(18)
  complex(dp), allocatable :: dummy2(:,:)
  real(dp) :: dummy(2,1),weight,tsec(2)

!local variables for mpialltoallv
  real(dp), allocatable :: cwavef_alltoall(:,:)
  integer, allocatable,save :: kg_k_gather(:,:),kg_k_gather_all(:,:),rdispls_all(:),npw_per_proc(:)
  integer, save :: npw_tot
  integer :: iproc,ndatarecv,ndatarecvloc
  integer,  allocatable :: recvcounts(:)
  integer,  allocatable :: sendcounts(:),sdispls(:),rdispls(:)
  integer,  allocatable :: sendcountsloc(:),sdisplsloc(:),recvcountsloc(:),rdisplsloc(:)
!no_abirules
!correspondence with abinit. here for real wf but in complex mode
!this is the index of a given band

! *************************************************************************
 old_paral_level= mpi_enreg%paral_level
 mpi_enreg%paral_level=3
 call xcomm_init(mpi_enreg,spaceComm)
 if(mpi_enreg%mode_para=='b') spaceComm=mpi_enreg%comm_band

 allocate(sendcounts(blocksize))
 allocate(sdispls(blocksize))
 allocate(sendcountsloc(blocksize))
 allocate(sdisplsloc(blocksize))
 allocate(recvcountsloc(blocksize))
 allocate(rdisplsloc(blocksize))
 allocate(rdispls(blocksize))
 allocate(recvcounts(blocksize))
 call timab(548,1,tsec)
 call xallgather_mpi(npw_k,recvcounts,spaceComm,ier)
 call timab(548,2,tsec)
 rdispls(1)=0
 do iproc=2,blocksize
  rdispls(iproc)=rdispls(iproc-1)+recvcounts(iproc-1)
 end do
 ndatarecv=rdispls(blocksize)+recvcounts(blocksize)

 if (icall==1 .and. iblock==1) then
  if (allocated(kg_k_gather)) deallocate(kg_k_gather)
  allocate(kg_k_gather(3,ndatarecv))
  recvcountsloc(:)=recvcounts(:)*3
  rdisplsloc(:)=rdispls(:)*3
  call timab(548,1,tsec)
  call xallgatherv_mpi(kg_k(1,:),npw_k,kg_k_gather(1,:),recvcounts(:),rdispls,spaceComm,ier)
  call xallgatherv_mpi(kg_k(2,:),npw_k,kg_k_gather(2,:),recvcounts(:),rdispls,spaceComm,ier)
  call xallgatherv_mpi(kg_k(3,:),npw_k,kg_k_gather(3,:),recvcounts(:),rdispls,spaceComm,ier)
  call timab(548,2,tsec)
! I now recollect all the kg to have a common sphereboundary.
! First get the dimension of the whole kg array
  oldspacecomm=mpi_enreg%comm_fft
! Get dimension information
  allocate(npw_per_proc(mpi_enreg%nproc_fft),rdispls_all(mpi_enreg%nproc_fft))
  call xallgather_mpi(ndatarecv,npw_per_proc,oldspacecomm,ier)
  rdispls_all(1)=0
  do iproc=2,mpi_enreg%nproc_fft
   rdispls_all(iproc)=rdispls_all(iproc-1)+npw_per_proc(iproc-1)
  end do
  npw_tot=rdispls_all(mpi_enreg%nproc_fft)+npw_per_proc(mpi_enreg%nproc_fft)
! Transfer the kg on each proc to the whole array
  allocate(kg_k_gather_all(3,npw_tot))
  call timab(548,1,tsec)
  call xallgatherv_mpi&
&  (kg_k_gather(1,:),ndatarecv,kg_k_gather_all(1,:),npw_per_proc(:),rdispls_all,oldspaceComm,ier)
  call xallgatherv_mpi&
&  (kg_k_gather(2,:),ndatarecv,kg_k_gather_all(2,:),npw_per_proc(:),rdispls_all,oldspaceComm,ier)
  call xallgatherv_mpi&
&  (kg_k_gather(3,:),ndatarecv,kg_k_gather_all(3,:),npw_per_proc(:),rdispls_all,oldspaceComm,ier)
  call timab(548,2,tsec)
  call sphereboundary(gs_hamk%gbound,istwf_k,kg_k_gather_all,mgfft,npw_tot)
  deallocate(kg_k_gather_all,npw_per_proc,rdispls_all)
 end if !End of the icall=1 and iblock=1 conditions

 sendcounts(:)=npw_k
 do iproc=1,blocksize
  sdispls(iproc)=(iproc-1)*npw_k
 end do

 allocate(cwavef_alltoall(2,ndatarecv))
 recvcountsloc(:)=recvcounts(:)*2
 rdisplsloc(:)=rdispls(:)*2
 sendcountsloc(:)=sendcounts(:)*2
 sdisplsloc(:)=sdispls(:)*2
 call timab(547,1,tsec)
 call xalltoallv_mpi(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall,&
&         recvcountsloc,rdisplsloc,spaceComm,ier)
 call timab(547,2,tsec)
!Attention, c'est pour les essais. Le test est a remettre par la suite.
 if(abs(occ_k(mpi_enreg%coords(2)+1+(iblock-1)*blocksize)) >=tol8) then
  tim_fourwf=16
  weight=occ_k(mpi_enreg%coords(2)+1+(iblock-1)*blocksize)*wtk/gs_hamk%ucvol
  call fourwf(1,rhoaug,cwavef_alltoall,dummy,wfraug,&
&  gs_hamk%gbound,gs_hamk%gbound,&
&  istwf_k,kg_k_gather,kg_k_gather,mgfft,mpi_enreg,1,gs_hamk%ngfft,ndatarecv,1,n4,n5,n6,1,tim_fourwf,weight)
 end if
 if (gs_hamk%istwf_k==2) mpi_enreg%me_g0=old_me_g0

 mpi_enreg%paral_level= old_paral_level
 deallocate(recvcounts,sdispls,rdispls)
 deallocate(cwavef_alltoall)
 deallocate(sendcountsloc,sdisplsloc)
 deallocate(recvcountsloc,rdisplsloc)
end subroutine prep_fourwf
!!***
