diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 330a14c1..b2116433 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,12 +2,14 @@ set(SRC
   allocator.f90
   backend.f90
   common.f90
+  poisson_fft.f90
   solver.f90
   tdsops.f90
   time_integrator.f90
   omp/backend.f90
   omp/common.f90
   omp/kernels_dist.f90
+  omp/poisson_fft.f90
   omp/sendrecv.f90
   omp/exec_dist.f90
 )
@@ -17,7 +19,9 @@ set(CUDASRC
   cuda/common.f90
   cuda/exec_dist.f90
   cuda/kernels/distributed.f90
+  cuda/kernels/complex.f90
   cuda/kernels/reorder.f90
+  cuda/poisson_fft.f90
   cuda/sendrecv.f90
   cuda/tdsops.f90
 )
@@ -40,6 +44,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR
   set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp")
   set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast")
   target_link_options(x3d2 INTERFACE "-cuda")
+  target_link_options(x3d2 INTERFACE "-lcufft")
 
   target_compile_options(xcompact PRIVATE "-DCUDA")
   #  target_link_options(xcompact INTERFACE "-cuda")
diff --git a/src/backend.f90 b/src/backend.f90
index cf453dad..e17970e5 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -1,6 +1,7 @@
 module m_base_backend
    use m_allocator, only: allocator_t, field_t
    use m_common, only: dp
+   use m_poisson_fft, only: poisson_fft_t
    use m_tdsops, only: tdsops_t, dirps_t
 
    implicit none
@@ -24,6 +25,7 @@ module m_base_backend
       integer :: nx_loc, ny_loc, nz_loc
       class(allocator_t), pointer :: allocator
       class(dirps_t), pointer :: xdirps, ydirps, zdirps
+      class(poisson_fft_t), pointer :: poisson_fft
    contains
       procedure(transeq_ders), deferred :: transeq_x
       procedure(transeq_ders), deferred :: transeq_y
@@ -37,6 +39,7 @@ module m_base_backend
       procedure(get_field), deferred :: get_field
       procedure(set_field), deferred :: set_field
       procedure(alloc_tdsops), deferred :: alloc_tdsops
+      procedure(init_poisson_fft), deferred :: init_poisson_fft
    end type base_backend_t
 
    abstract interface
@@ -188,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, &
       end subroutine alloc_tdsops
    end interface
 
+   abstract interface
+      subroutine init_poisson_fft(self, xdirps, ydirps, zdirps)
+         import :: base_backend_t
+         import :: dirps_t
+         implicit none
+
+         class(base_backend_t) :: self
+         type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+      end subroutine init_poisson_fft
+   end interface
+
 end module m_base_backend
diff --git a/src/common.f90 b/src/common.f90
index 3b46919c..055620f8 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -7,14 +7,19 @@ module m_common
    integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
                          RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32
 
+   integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1
+
    type :: globs_t
       integer :: nx, ny, nz
       integer :: nx_loc, ny_loc, nz_loc
       integer :: n_groups_x, n_groups_y, n_groups_z
       real(dp) :: Lx, Ly, Lz
       real(dp) :: dx, dy, dz
+      real(dp) :: nu, dt
+      integer :: n_iters, n_output
       integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1
       character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e
+      integer :: poisson_solver_type
    end type globs_t
 
 contains
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 1097d574..a5322ff6 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -7,11 +7,13 @@ module m_cuda_backend
    use m_base_backend, only: base_backend_t
    use m_common, only: dp, globs_t, &
                        RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y
+   use m_poisson_fft, only: poisson_fft_t
    use m_tdsops, only: dirps_t, tdsops_t
 
    use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
    use m_cuda_common, only: SZ
    use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact
+   use m_cuda_poisson_fft, only: cuda_poisson_fft_t
    use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
    use m_cuda_tdsops, only: cuda_tdsops_t
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
@@ -45,6 +47,7 @@ module m_cuda_backend
       procedure :: scalar_product => scalar_product_cuda
       procedure :: set_field => set_field_cuda
       procedure :: get_field => get_field_cuda
+      procedure :: init_poisson_fft => init_cuda_poisson_fft
       procedure :: transeq_cuda_dist
       procedure :: transeq_cuda_thom
       procedure :: tds_solve_dist
@@ -63,6 +66,7 @@ function init(globs, allocator) result(backend)
       class(allocator_t), target, intent(inout) :: allocator
       type(cuda_backend_t) :: backend
 
+      type(cuda_poisson_fft_t) :: cuda_poisson_fft
       integer :: n_halo, n_block
 
       select type(allocator)
@@ -600,5 +604,20 @@ subroutine get_field_cuda(self, arr, f)
 
    end subroutine get_field_cuda
 
+   subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps)
+      implicit none
+
+      class(cuda_backend_t) :: self
+      type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+
+      allocate(cuda_poisson_fft_t :: self%poisson_fft)
+
+      select type (poisson_fft => self%poisson_fft)
+      type is (cuda_poisson_fft_t)
+         poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps)
+      end select
+
+   end subroutine init_cuda_poisson_fft
+
 end module m_cuda_backend
 
diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90
new file mode 100644
index 00000000..32c5041f
--- /dev/null
+++ b/src/cuda/kernels/complex.f90
@@ -0,0 +1,295 @@
+module m_cuda_complex
+   use cudafor
+
+   use m_common, only: dp
+   use m_cuda_common, only: SZ
+
+   implicit none
+
+contains
+
+   attributes(global) subroutine process_spectral_div_u( &
+         div, waves, nx, ny, nz, ax, bx, ay, by, az, bz &
+         )
+      implicit none
+
+      ! Arguments
+      complex(dp), device, intent(inout), dimension(:, :, :) :: div
+      complex(dp), device, intent(in), dimension(:, :, :) :: waves
+      real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz
+      integer, value, intent(in) :: nx, ny, nz
+
+      ! Local variables
+      integer :: i, j, b, ix, iy, iz
+      real(dp) :: tmp_r, tmp_c, div_r, div_c
+
+      i = threadIdx%x
+      b = blockIdx%x
+
+      do j = 1, nx
+         ! normalisation
+         div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz)
+         div_c = aimag(div(i, j, b))/(nx*ny*nz)
+
+         ! get the indices for x, y, z directions
+         ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1
+
+         ! post-process forward
+         ! post-process in z
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*bz(iz) + tmp_c*az(iz)
+         div_c = tmp_c*bz(iz) - tmp_r*az(iz)
+
+         ! post-process in y
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*by(iy) + tmp_c*ay(iy)
+         div_c = tmp_c*by(iy) - tmp_r*ay(iy)
+         if (iy > ny/2 + 1) div_r = -div_r
+         if (iy > ny/2 + 1) div_c = -div_c
+
+         ! post-process in x
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
+         div_c = tmp_c*bx(ix) - tmp_r*ax(ix)
+         if (ix > nx/2 + 1) div_r = -div_r
+         if (ix > nx/2 + 1) div_c = -div_c
+
+         ! Solve Poisson
+         tmp_r = real(waves(i, j, b), kind=dp)
+         tmp_c = aimag(waves(i, j, b))
+         if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then
+            div_r = 0._dp; div_c = 0._dp
+         else
+            div_r = -div_r/tmp_r
+            div_c = -div_c/tmp_c
+         end if
+
+         ! post-process backward
+         ! post-process in z
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*bz(iz) - tmp_c*az(iz)
+         div_c = -tmp_c*bz(iz) - tmp_r*az(iz)
+
+         ! post-process in y
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*by(iy) + tmp_c*ay(iy)
+         div_c = tmp_c*by(iy) - tmp_r*ay(iy)
+         if (iy > ny/2 + 1) div_r = -div_r
+         if (iy > ny/2 + 1) div_c = -div_c
+
+         ! post-process in x
+         tmp_r = div_r
+         tmp_c = div_c
+         div_r = tmp_r*bx(ix) + tmp_c*ax(ix)
+         div_c = -tmp_c*bx(ix) + tmp_r*ax(ix)
+         if (ix > nx/2 + 1) div_r = -div_r
+         if (ix > nx/2 + 1) div_c = -div_c
+
+         ! update the entry
+         div(i, j, b) = cmplx(div_r, div_c, kind=dp)
+      end do
+
+   end subroutine process_spectral_div_u
+
+   attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: u_y
+      complex(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      complex(dp), shared :: tile(SZ, SZ)
+
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1))
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine reorder_cmplx_x2y_T
+
+   attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: u_x
+      complex(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nz
+
+      complex(dp), shared :: tile(SZ, SZ)
+
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1))
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine reorder_cmplx_y2x_T
+
+   attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: u_z
+      complex(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nx, nz
+
+      complex(dp), shared :: tile(SZ, SZ)
+
+      integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z
+
+      i = threadIdx%x
+      j = threadIdx%y
+      k = threadIdx%z
+
+      b_x = blockIdx%z
+      b_y = blockIdx%y
+      b_z = blockIdx%x
+
+      ! copy into shared
+      if ( j + (b_z - 1)*SZ <= nz ) &
+         tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, &
+                          j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      if ( i + (b_z - 1)*SZ <= nz ) &
+         u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i)
+
+   end subroutine reorder_cmplx_y2z_T
+
+   attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: u_y
+      complex(dp), device, intent(in), dimension(:, :, :) :: u_z
+      integer, value, intent(in) :: nx, nz
+
+      complex(dp), shared :: tile(SZ, SZ)
+
+      integer :: i, j, k, b_x, b_y, b_z
+
+      i = threadIdx%x
+      j = threadIdx%y
+      k = threadIdx%z
+
+      b_x = blockIdx%z
+      b_y = blockIdx%y
+      b_z = blockIdx%x
+
+      ! copy into shared
+      if ( i + (b_z - 1)*SZ <= nz ) &
+         tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      if ( j + (b_z - 1)*SZ <= nz ) &
+         u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, &
+             j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i)
+
+   end subroutine reorder_cmplx_z2y_T
+
+   attributes(global) subroutine reshapeDSF(uout, uin)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: uout
+      real(dp), device, intent(in), dimension(:, :, :) :: uin
+
+      real(dp), shared :: tile(SZ + 1, SZ)
+
+      integer :: i, j, b_i, b
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b = blockIdx%y
+
+      tile(i, j) = uin(i, j + (b_i - 1)*SZ, b)
+
+      call syncthreads()
+
+      uout(i + (b_i - 1)*SZ, j, b) = tile(j, i)
+
+   end subroutine reshapeDSF
+
+   attributes(global) subroutine reshapeDSB(uout, uin)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: uout
+      real(dp), device, intent(in), dimension(:, :, :) :: uin
+
+      real(dp), shared :: tile(SZ + 1, SZ)
+
+      integer :: i, j, b_i, b
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b = blockIdx%y
+
+      tile(i, j) = uin(i + (b_i - 1)*SZ, j, b)
+
+      call syncthreads()
+
+      uout(i, j + (b_i - 1)*SZ, b) = tile(j, i)
+
+   end subroutine reshapeDSB
+
+   attributes(global) subroutine reshapeCDSF(uout, uin)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: uout
+      complex(dp), device, intent(in), dimension(:, :, :) :: uin
+
+      complex(dp), shared :: tile(SZ + 1, SZ)
+
+      integer :: i, j, b_i, b
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b = blockIdx%y
+
+      tile(i, j) = uin(i, j + (b_i - 1)*SZ, b)
+
+      call syncthreads()
+
+      uout(i + (b_i - 1)*SZ, j, b) = tile(j, i)
+
+   end subroutine reshapeCDSF
+
+   attributes(global) subroutine reshapeCDSB(uout, uin)
+      implicit none
+
+      complex(dp), device, intent(out), dimension(:, :, :) :: uout
+      complex(dp), device, intent(in), dimension(:, :, :) :: uin
+
+      complex(dp), shared :: tile(SZ + 1, SZ)
+
+      integer :: i, j, b_i, b
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b = blockIdx%y
+
+      tile(i, j) = uin(i + (b_i - 1)*SZ, j, b)
+
+      call syncthreads()
+
+      uout(i, j + (b_i - 1)*SZ, b) = tile(j, i)
+
+   end subroutine reshapeCDSB
+
+end module m_cuda_complex
diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90
new file mode 100644
index 00000000..3df787a2
--- /dev/null
+++ b/src/cuda/poisson_fft.f90
@@ -0,0 +1,244 @@
+module m_cuda_poisson_fft
+   use cudafor
+   use cufft
+
+   use m_allocator, only: field_t
+   use m_common, only: dp
+   use m_poisson_fft, only: poisson_fft_t
+   use m_tdsops, only: dirps_t
+
+   use m_cuda_allocator, only: cuda_field_t
+   use m_cuda_common, only: SZ
+   use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, &
+                             reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, &
+                             process_spectral_div_u
+
+   implicit none
+
+   type, extends(poisson_fft_t) :: cuda_poisson_fft_t
+      !! FFT based Poisson solver
+      !! It can only handle 1D decompositions along z direction.
+
+      !> Local domain sized arrays to store data in spectral space
+      complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev
+      !> Local domain sized array storing the spectral equivalence constants
+      complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev
+
+      real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, &
+         ay_dev, by_dev, az_dev, bz_dev
+
+      real(dp), device, allocatable, dimension(:, :, :) :: f_tmp
+
+      integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy
+   contains
+      procedure :: fft_forward => fft_forward_cuda
+      procedure :: fft_backward => fft_backward_cuda
+      procedure :: fft_postprocess => fft_postprocess_cuda
+   end type cuda_poisson_fft_t
+
+   interface cuda_poisson_fft_t
+      module procedure init
+   end interface cuda_poisson_fft_t
+
+   private :: init
+
+contains
+
+   function init(xdirps, ydirps, zdirps) result(poisson_fft)
+      implicit none
+
+      class(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+
+      type(cuda_poisson_fft_t) :: poisson_fft
+
+      integer :: nx, ny, nz
+
+      integer :: ierrfft
+      integer(int_ptr_kind()) :: worksize
+
+      call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ)
+
+      nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz
+
+      allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ))
+      poisson_fft%waves_dev = poisson_fft%waves
+
+      allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx))
+      allocate (poisson_fft%ay_dev(ny), poisson_fft%by_dev(ny))
+      allocate (poisson_fft%az_dev(nz), poisson_fft%bz_dev(nz))
+      poisson_fft%ax_dev = poisson_fft%ax; poisson_fft%bx_dev = poisson_fft%bx
+      poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by
+      poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz
+
+      allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1)))
+      allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1)))
+      allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1)))
+
+      ! We can't currently ask allocator to pass us an array with
+      ! exact shape we want, so we allocate an extra array here.
+      ! This will be removed when allocator is fixed.
+      allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ))
+
+      ! set cufft plans
+      ierrfft = cufftCreate(poisson_fft%planD2Zz)
+      ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, &
+                                  nz, 1, nz, nz/2+1, 1, nz/2+1, &
+                                  CUFFT_D2Z, nx*ny, worksize)
+      ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev)
+
+      ierrfft = cufftCreate(poisson_fft%planZ2Dz)
+      ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, &
+                                  nz/2+1, 1, nz/2+1, nz, 1, nz, &
+                                  CUFFT_Z2D, nx*ny, worksize)
+      ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev)
+
+      ierrfft = cufftCreate(poisson_fft%planZ2Zy)
+      ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, &
+                                  ny, 1, ny, ny, 1, ny, &
+                                  CUFFT_Z2Z, nx*(nz/2 + 1), worksize)
+      ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev)
+
+      ierrfft = cufftCreate(poisson_fft%planZ2Zx)
+      ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, &
+                                  nx, 1, nx, nx, 1, nx, &
+                                  CUFFT_Z2Z, ny*(nz/2 + 1), worksize)
+      ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev)
+
+   end function init
+
+   subroutine fft_forward_cuda(self, f)
+      implicit none
+
+      class(cuda_poisson_fft_t) :: self
+      class(field_t), intent(in) :: f
+
+      real(dp), device, pointer, dimension(:, :, :) :: f_dev
+      complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, &
+                                                          c_z_ptr
+
+      type(dim3) :: blocks, threads
+      integer :: ierrfft
+
+      select type(f); type is (cuda_field_t); f_dev => f%data_d; end select
+
+      ! First reorder f into cartesian-like data structure
+      blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
+      threads = dim3(SZ, SZ, 1)
+      call reshapeDSF<<<blocks, threads>>>(self%f_tmp, f_dev)
+
+      ! Forward FFT transform in z from real to complex
+      ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev)
+
+      ! Reorder from z to y
+      blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx)
+      threads = dim3(SZ, SZ, 1)
+      c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
+      c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev
+
+      call reorder_cmplx_z2y_T<<<blocks, threads>>>(c_y_ptr, c_z_ptr, &
+                                                    self%nx, self%nz/2 + 1)
+
+      ! In-place forward FFT in y
+      ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, &
+                             CUFFT_FORWARD)
+
+      ! Reorder from y to x
+      blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1)
+      threads = dim3(SZ, SZ, 1)
+      c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
+      c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
+
+      call reorder_cmplx_y2x_T<<<blocks, threads>>>(c_x_ptr, c_y_ptr, &
+                                                    self%nz/2 + 1)
+
+      ! In-place forward FFT in x
+      ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, &
+                             CUFFT_FORWARD)
+
+   end subroutine fft_forward_cuda
+
+   subroutine fft_backward_cuda(self, f)
+      implicit none
+
+      class(cuda_poisson_fft_t) :: self
+      class(field_t), intent(inout) :: f
+
+      real(dp), device, pointer, dimension(:, :, :) :: f_dev
+      complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, &
+                                                          c_z_ptr
+
+      type(dim3) :: blocks, threads
+      integer :: ierrfft
+
+      select type(f); type is (cuda_field_t); f_dev => f%data_d; end select
+
+      ! In-place backward FFT in x
+      ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, &
+                             CUFFT_INVERSE)
+
+      ! Reorder from x to y
+      blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1)
+      threads = dim3(SZ, SZ, 1)
+      c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
+      c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
+
+      call reorder_cmplx_x2y_T<<<blocks, threads>>>(c_y_ptr, c_x_ptr, &
+                                                    self%nz/2 + 1)
+
+      ! In-place backward FFT in y
+      ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, &
+                             CUFFT_INVERSE)
+
+      ! Reorder from y to z
+      blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx)
+      threads = dim3(SZ, SZ, 1)
+      c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev
+      c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev
+
+      call reorder_cmplx_y2z_T<<<blocks, threads>>>(c_z_ptr, c_y_ptr, &
+                                                    self%nx, self%nz/2 + 1)
+
+      ! Backward FFT transform in z from complex to real
+      ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp)
+
+      ! Finally reorder f back into our specialist data structure
+      blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
+      threads = dim3(SZ, SZ, 1)
+      call reshapeDSB<<<blocks, threads>>>(f_dev, self%f_tmp)
+
+   end subroutine fft_backward_cuda
+
+   subroutine fft_postprocess_cuda(self)
+      implicit none
+
+      class(cuda_poisson_fft_t) :: self
+
+      complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr
+      type(dim3) :: blocks, threads
+
+      ! Get some complex array pointers with right shape
+      c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
+      c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev
+
+      ! Reshape from cartesian-like to our specialist data structure
+      blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
+      threads = dim3(SZ, SZ, 1)
+      call reshapeCDSB<<<blocks, threads>>>(c_dev, c_x_ptr)
+
+      ! Postprocess
+      blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1)
+      threads = dim3(SZ, 1, 1)
+      call process_spectral_div_u<<<blocks, threads>>>( &
+         c_dev, self%waves_dev, self%nx, self%ny, self%nz, &
+         self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, &
+         self%az_dev, self%bz_dev &
+         )
+
+      ! Reshape from our specialist data structure to cartesian-like
+      blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
+      threads = dim3(SZ, SZ, 1)
+      call reshapeCDSF<<<blocks, threads>>>(c_x_ptr, c_dev)
+
+   end subroutine fft_postprocess_cuda
+
+end module m_cuda_poisson_fft
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index aca287bd..aff888ce 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -7,6 +7,7 @@ module m_omp_backend
    use m_omp_sendrecv, only: sendrecv_fields
 
    use m_omp_common, only: SZ
+   use m_omp_poisson_fft, only: omp_poisson_fft_t
 
    implicit none
 
@@ -33,6 +34,7 @@ module m_omp_backend
       procedure :: scalar_product => scalar_product_omp
       procedure :: set_field => set_field_omp
       procedure :: get_field => get_field_omp
+      procedure :: init_poisson_fft => init_omp_poisson_fft
       procedure :: transeq_omp_dist
    end type omp_backend_t
 
@@ -362,5 +364,20 @@ subroutine get_field_omp(self, arr, f)
 
    end subroutine get_field_omp
 
+   subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps)
+      implicit none
+
+      class(omp_backend_t) :: self
+      type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+
+      allocate(omp_poisson_fft_t :: self%poisson_fft)
+
+      select type (poisson_fft => self%poisson_fft)
+      type is (omp_poisson_fft_t)
+         poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps)
+      end select
+
+   end subroutine init_omp_poisson_fft
+
 end module m_omp_backend
 
diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90
new file mode 100644
index 00000000..a796f1fc
--- /dev/null
+++ b/src/omp/poisson_fft.f90
@@ -0,0 +1,60 @@
+module m_omp_poisson_fft
+   use m_allocator, only: field_t
+   use m_common, only: dp
+   use m_poisson_fft, only: poisson_fft_t
+   use m_tdsops, only: dirps_t
+
+   use m_omp_common, only: SZ
+
+   implicit none
+
+   type, extends(poisson_fft_t) :: omp_poisson_fft_t
+      !! FFT based Poisson solver
+      !! It can only handle 1D decompositions along z direction.
+      complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z
+   contains
+      procedure :: fft_forward => fft_forward_omp
+      procedure :: fft_backward => fft_backward_omp
+      procedure :: fft_postprocess => fft_postprocess_omp
+   end type omp_poisson_fft_t
+
+   interface omp_poisson_fft_t
+      module procedure init
+   end interface omp_poisson_fft_t
+
+   private :: init
+
+contains
+
+   function init(xdirps, ydirps, zdirps) result(poisson_fft)
+      implicit none
+
+      class(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+
+      type(omp_poisson_fft_t) :: poisson_fft
+
+      call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ)
+
+   end function init
+
+   subroutine fft_forward_omp(self, f_in)
+      implicit none
+
+      class(omp_poisson_fft_t) :: self
+      class(field_t), intent(in) :: f_in
+   end subroutine fft_forward_omp
+
+   subroutine fft_backward_omp(self, f_out)
+      implicit none
+
+      class(omp_poisson_fft_t) :: self
+      class(field_t), intent(inout) :: f_out
+   end subroutine fft_backward_omp
+
+   subroutine fft_postprocess_omp(self)
+      implicit none
+
+      class(omp_poisson_fft_t) :: self
+   end subroutine fft_postprocess_omp
+
+end module m_omp_poisson_fft
diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90
new file mode 100644
index 00000000..9ee0f575
--- /dev/null
+++ b/src/poisson_fft.f90
@@ -0,0 +1,198 @@
+module m_poisson_fft
+   use m_allocator, only: field_t
+   use m_common, only: dp, pi
+   use m_tdsops, only: dirps_t
+
+   implicit none
+
+   type, abstract :: poisson_fft_t
+      !! FFT based Poisson solver
+      !! It can only handle 1D decompositions along z direction.
+      integer :: nx, ny, nz
+      complex(dp), allocatable, dimension(:, :, :) :: waves
+      complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz
+   contains
+      procedure(fft_forward), deferred :: fft_forward
+      procedure(fft_backward), deferred :: fft_backward
+      procedure(fft_postprocess), deferred :: fft_postprocess
+      procedure :: base_init
+      procedure :: waves_set
+   end type poisson_fft_t
+
+   abstract interface
+      subroutine fft_forward(self, f_in)
+         import :: poisson_fft_t
+         import :: field_t
+         implicit none
+
+         class(poisson_fft_t) :: self
+         class(field_t), intent(in) :: f_in
+      end subroutine fft_forward
+
+      subroutine fft_backward(self, f_out)
+         import :: poisson_fft_t
+         import :: field_t
+         implicit none
+
+         class(poisson_fft_t) :: self
+         class(field_t), intent(inout) :: f_out
+      end subroutine fft_backward
+
+      subroutine fft_postprocess(self)
+         import :: poisson_fft_t
+         implicit none
+
+         class(poisson_fft_t) :: self
+      end subroutine fft_postprocess
+   end interface
+
+contains
+
+   subroutine base_init(self, xdirps, ydirps, zdirps, sz)
+      implicit none
+
+      class(poisson_fft_t) :: self
+      class(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+      integer, intent(in) :: sz
+
+      self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n
+
+      allocate (self%ax(self%nx), self%bx(self%nx))
+      allocate (self%ay(self%nx), self%by(self%nx))
+      allocate (self%az(self%nx), self%bz(self%nx))
+
+      allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz))
+
+      ! waves_set requires some of the preprocessed tdsops variables.
+      call self%waves_set(xdirps, ydirps, zdirps, sz)
+
+   end subroutine base_init
+
+   subroutine waves_set(self, xdirps, ydirps, zdirps, sz)
+      !! Ref. JCP 228 (2009), 5989–6015, Sec 4
+      implicit none
+
+      class(poisson_fft_t) :: self
+      type(dirps_t), intent(in) :: xdirps, ydirps, zdirps
+      integer, intent(in) :: sz
+
+      complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, &
+                                                exs, eys, ezs
+
+      integer :: nx, ny, nz
+      real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1
+      complex(dp) :: xt2, yt2, zt2, xyzk
+
+      integer :: i, j, ka, kb, ix, iy, iz
+
+      nx = xdirps%n; ny = ydirps%n; nz = zdirps%n
+
+      do i = 1, nx
+         self%ax(i) = sin((i-1)*pi/nx)
+         self%bx(i) = cos((i-1)*pi/nx)
+      end do
+
+      do i = 1, ny
+         self%ay(i) = sin((i-1)*pi/ny)
+         self%by(i) = cos((i-1)*pi/ny)
+      end do
+
+      do i = 1, nz
+         self%az(i) = sin((i-1)*pi/nz)
+         self%bz(i) = cos((i-1)*pi/nz)
+      end do
+
+      ! Now kxyz
+      allocate(xkx(nx), xk2(nx), exs(nx))
+      allocate(yky(ny), yk2(ny), eys(ny))
+      allocate(zkz(nz), zk2(nz), ezs(nz))
+      xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0
+
+      ! periodic-x
+      do i = 1, nx/2 + 1
+         w = 2*pi*(i - 1)/nx
+         wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) &
+              + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w)
+         wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w))
+
+         xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)
+         exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L)
+         xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2
+      end do
+      do i = nx/2 + 2, nx
+         xkx(i) = xkx(nx - i + 2)
+         exs(i) = exs(nx - i + 2)
+         xk2(i) = xk2(nx - i + 2)
+      end do
+
+      ! periodic-y
+      do i = 1, ny/2 + 1
+         w = 2*pi*(i - 1)/ny
+         wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) &
+              + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w)
+         wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w))
+
+         yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)
+         eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L)
+         yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2
+      end do
+      do i = ny/2 + 2, ny
+         yky(i) = yky(ny-i+2)
+         eys(i) = eys(ny-i+2)
+         yk2(i) = yk2(ny-i+2)
+      end do
+
+      ! periodic-z
+      do i = 1, nz/2 + 1
+         w = 2*pi*(i - 1)/nz
+         wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) &
+              + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w)
+         wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w))
+
+         zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)
+         ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L)
+         zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2
+      end do
+
+      print*, 'waves array is correctly set only for a single rank run'
+      ! TODO: do loop ranges below are valid only for single rank runs
+      do ka = 1, nz/2 + 1
+         do kb = 1, ny/sz
+            do j = 1, nx
+               do i = 1, sz
+                  ix = j; iy = (kb - 1)*sz + i; iz = ka
+                  rlexs = real(exs(ix), kind=dp)*xdirps%d
+                  rleys = real(eys(iy), kind=dp)*ydirps%d
+                  rlezs = real(ezs(iz), kind=dp)*zdirps%d
+
+                  xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) &
+                           + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) &
+                           + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) &
+                           + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp))
+                  ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) &
+                           + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) &
+                           + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) &
+                           + ydirps%interpl_v2p%d*cos(rleys*3.5_dp))
+                  ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) &
+                           + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) &
+                           + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) &
+                           + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp))
+
+                  xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs)
+                  yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys)
+                  zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs)
+
+                  xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2)
+                  yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2)
+                  zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2)
+
+                  xyzk = xt2 + yt2 + zt2
+                  self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk
+               end do
+            end do
+         end do
+      end do
+
+   end subroutine waves_set
+
+end module m_poisson_fft
diff --git a/src/solver.f90 b/src/solver.f90
index fad0a715..c9a04a11 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -2,7 +2,8 @@ module m_solver
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
    use m_common, only: dp, globs_t, &
-                       RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y
+                       RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
+                       POISSON_SOLVER_FFT, POISSON_SOLVER_CG
    use m_tdsops, only: tdsops_t, dirps_t
    use m_time_integrator, only: time_intg_t
 
@@ -37,23 +38,39 @@ module m_solver
       !! for later use.
 
       real(dp) :: dt, nu
+      integer :: n_iters, n_output
 
       class(field_t), pointer :: u, v, w
 
       class(base_backend_t), pointer :: backend
       class(dirps_t), pointer :: xdirps, ydirps, zdirps
       class(time_intg_t), pointer :: time_integrator
+      procedure(poisson_solver), pointer :: poisson => null()
    contains
       procedure :: transeq
       procedure :: divergence_v2p
       procedure :: gradient_p2v
       procedure :: curl
+      procedure :: output
       procedure :: run
    end type solver_t
 
+   abstract interface
+      subroutine poisson_solver(self, pressure, div_u)
+         import :: solver_t
+         import :: field_t
+         implicit none
+
+         class(solver_t) :: self
+         class(field_t), intent(inout) :: pressure
+         class(field_t), intent(in) :: div_u
+      end subroutine poisson_solver
+   end interface
+
    interface solver_t
       module procedure init
    end interface solver_t
+
 contains
 
    function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
@@ -69,8 +86,8 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init
       integer :: dims(3)
 
-      real(dp) :: dx, dy, dz
-      integer :: nx, ny, nz
+      real(dp) :: x, y, z
+      integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz
 
       solver%backend => backend
       solver%time_integrator => time_integrator
@@ -89,9 +106,31 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       allocate(v_init(dims(1), dims(2), dims(3)))
       allocate(w_init(dims(1), dims(2), dims(3)))
 
-      u_init = 0
-      v_init = 0
-      w_init = 0
+      solver%dt = globs%dt
+      solver%backend%nu = globs%nu
+      solver%n_iters = globs%n_iters
+      solver%n_output = globs%n_output
+
+      sz = dims(1)
+      nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc
+      do ka = 1, nz
+         do kb = 1, ny/sz
+            do j = 1, nx
+               do i = 1, sz
+                  ! Mapping to ix, iy, iz depends on global group numbering
+                  ix = j; iy = (kb - 1)*sz + i; iz = ka
+                  x = (ix - 1)*globs%dx
+                  y = (iy - 1)*globs%dy
+                  z = (iz - 1)*globs%dz
+
+                  b = ka + (kb - 1)*xdirps%n
+                  u_init(i, j, b) = sin(x)*cos(y)*cos(z)
+                  v_init(i, j, b) =-cos(x)*sin(y)*cos(z)
+                  w_init(i, j, b) = 0
+               end do
+            end do
+         end do
+      end do
 
       call solver%backend%set_field(solver%u, u_init)
       call solver%backend%set_field(solver%v, v_init)
@@ -100,13 +139,20 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       deallocate(u_init, v_init, w_init)
       print*, 'initial conditions are set'
 
-      nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc
-      dx = globs%dx; dy = globs%dy; dz = globs%dz
-
       ! Allocate and set the tdsops
-      call allocate_tdsops(solver%xdirps, nx, dx, solver%backend)
-      call allocate_tdsops(solver%ydirps, ny, dy, solver%backend)
-      call allocate_tdsops(solver%zdirps, nz, dz, solver%backend)
+      call allocate_tdsops(solver%xdirps, nx, globs%dx, solver%backend)
+      call allocate_tdsops(solver%ydirps, ny, globs%dy, solver%backend)
+      call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend)
+
+      select case (globs%poisson_solver_type)
+      case (POISSON_SOLVER_FFT)
+         print*, 'Poisson solver: FFT'
+         call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps)
+         solver%poisson => poisson_fft
+      case (POISSON_SOLVER_CG)
+         print*, 'Poisson solver: CG, not yet implemented'
+         solver%poisson => poisson_cg
+      end select
 
    end function init
 
@@ -489,20 +535,86 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w)
 
    end subroutine curl
 
-   subroutine run(self, n_iter, u_out, v_out, w_out)
+   subroutine poisson_fft(self, pressure, div_u)
+      implicit none
+
+      class(solver_t) :: self
+      class(field_t), intent(inout) :: pressure
+      class(field_t), intent(in) :: div_u
+
+      ! call forward FFT
+      ! output array in spectral space is stored at poisson_fft class
+      call self%backend%poisson_fft%fft_forward(div_u)
+
+      ! postprocess
+      call self%backend%poisson_fft%fft_postprocess
+
+      ! call backward FFT
+      call self%backend%poisson_fft%fft_backward(pressure)
+
+   end subroutine poisson_fft
+
+   subroutine poisson_cg(self, pressure, div_u)
+      implicit none
+
+      class(solver_t) :: self
+      class(field_t), intent(inout) :: pressure
+      class(field_t), intent(in) :: div_u
+
+   end subroutine poisson_cg
+
+   subroutine output(self, t, u_out)
+      implicit none
+
+      class(solver_t), intent(in) :: self
+      real(dp), intent(in) :: t
+      real(dp), dimension(:, :, :), intent(inout) :: u_out
+
+      class(field_t), pointer :: du, dv, dw
+      integer :: ngrid
+
+      ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n
+      print*, 'time = ', t
+
+      du => self%backend%allocator%get_block()
+      dv => self%backend%allocator%get_block()
+      dw => self%backend%allocator%get_block()
+
+      call self%curl(du, dv, dw, self%u, self%v, self%w)
+      print*, 'enstrophy:', 0.5_dp*( &
+         self%backend%scalar_product(du, du) &
+         + self%backend%scalar_product(dv, dv) &
+         + self%backend%scalar_product(dw, dw) &
+         )/ngrid
+
+      call self%backend%allocator%release_block(du)
+      call self%backend%allocator%release_block(dv)
+      call self%backend%allocator%release_block(dw)
+
+      call self%divergence_v2p(du, self%u, self%v, self%w)
+      call self%backend%get_field(u_out, du)
+      print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid
+
+   end subroutine output
+
+   subroutine run(self, u_out, v_out, w_out)
       implicit none
 
       class(solver_t), intent(in) :: self
-      integer, intent(in) :: n_iter
       real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out
 
       class(field_t), pointer :: du, dv, dw, div_u, pressure, dpdx, dpdy, dpdz
 
+      real(dp) :: t
       integer :: i
 
+      print*, 'initial conditions'
+      t = 0._dp
+      call self%output(t, u_out)
+
       print*, 'start run'
 
-      do i = 1, n_iter
+      do i = 1, self%n_iters
          du => self%backend%allocator%get_block()
          dv => self%backend%allocator%get_block()
          dw => self%backend%allocator%get_block()
@@ -524,7 +636,7 @@ subroutine run(self, n_iter, u_out, v_out, w_out)
 
          pressure => self%backend%allocator%get_block()
 
-         !call self%poisson(pressure, div_u)
+         call self%poisson(pressure, div_u)
 
          call self%backend%allocator%release_block(div_u)
 
@@ -544,6 +656,11 @@ subroutine run(self, n_iter, u_out, v_out, w_out)
          call self%backend%allocator%release_block(dpdx)
          call self%backend%allocator%release_block(dpdy)
          call self%backend%allocator%release_block(dpdz)
+
+         if ( mod(i, self%n_output) == 0 ) then
+            t = i*self%dt
+            call self%output(t, u_out)
+         end if
       end do
 
       print*, 'run end'
diff --git a/src/tdsops.f90 b/src/tdsops.f90
index c159cdd2..3db7b4d9 100644
--- a/src/tdsops.f90
+++ b/src/tdsops.f90
@@ -27,6 +27,7 @@ module m_tdsops
                                              dist_sa, dist_sc, & !! back subs.
                                              dist_af !! the auxiliary factors
       real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :)
+      real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp
       integer :: n, n_halo
    contains
       procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess
@@ -46,6 +47,7 @@ module m_tdsops
                                       stagder_v2p, stagder_p2v, &
                                       interpl_v2p, interpl_p2v
       integer :: nrank, nproc, pnext, pprev, n, n_blocks
+      real(dp) :: L, d
    end type dirps_t
 
 contains
@@ -154,6 +156,9 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym)
          error stop 'scheme is not defined'
       end select
 
+      self%alpha = alpha
+      self%a = afi; self%b = bfi
+
       self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, &
                         0._dp, &
                         afi, bfi, 0._dp, 0._dp]
@@ -321,6 +326,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, &
          error stop 'scheme is not defined'
       end select
 
+      self%alpha = alpha
+      self%a = asi; self%b = bsi; self%c = csi; self%d = dsi
+
       self%coeffs(:) = [dsi, csi, bsi, asi, &
                         -2._dp*(asi + bsi + csi + dsi), &
                         asi, bsi, csi, dsi]
@@ -511,6 +519,9 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym)
          error stop 'scheme is not defined'
       end select
 
+      self%alpha = alpha
+      self%a = aici; self%b = bici; self%c = cici; self%d = dici
+
       select case (from_to)
       case ('v2p')
          self%coeffs(:) = [0._dp, dici, cici, bici, &
@@ -633,6 +644,9 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym)
          error stop 'scheme is not defined'
       end select
 
+      self%alpha = alpha
+      self%a = aci; self%b = bci
+
       select case (from_to)
       case ('v2p')
          self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, &
diff --git a/src/xcompact.f90 b/src/xcompact.f90
index 779e09f0..635db2a7 100644
--- a/src/xcompact.f90
+++ b/src/xcompact.f90
@@ -3,7 +3,8 @@ program xcompact
 
    use m_allocator
    use m_base_backend
-   use m_common, only: pi, globs_t, set_pprev_pnext
+   use m_common, only: pi, globs_t, set_pprev_pnext, &
+                       POISSON_SOLVER_FFT, POISSON_SOLVER_CG
    use m_solver, only: solver_t
    use m_time_integrator, only: time_intg_t
    use m_tdsops, only: tdsops_t
@@ -55,13 +56,21 @@ program xcompact
 
    ! read L_x/y/z from the input file
    globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi
+   xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz
 
    ! read ns from the input file
-   globs%nx = 512; globs%ny = 512; globs%nz = 512
+   globs%nx = 256; globs%ny = 256; globs%nz = 256
+
+   globs%dt = 0.001_dp
+   globs%nu = 1._dp/1600._dp
+   globs%n_iters = 20000
+   globs%n_output = 100
 
    ! set nprocs based on run time arguments
    globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1
 
+   globs%poisson_solver_type = POISSON_SOLVER_FFT
+
    ! Lets allow a 1D decomposition for the moment
    !globs%nproc_x = nproc
 
@@ -91,6 +100,8 @@ program xcompact
    globs%dy = globs%Ly/globs%ny
    globs%dz = globs%Lz/globs%nz
 
+   xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz
+
    xdirps%n = globs%nx_loc
    ydirps%n = globs%ny_loc
    zdirps%n = globs%nz_loc
@@ -117,8 +128,6 @@ program xcompact
    print*, 'OpenMP backend instantiated'
 #endif
 
-   backend%nu = 1._dp
-
    allocate(u(SZ, globs%nx_loc, globs%n_groups_x))
    allocate(v(SZ, globs%nx_loc, globs%n_groups_x))
    allocate(w(SZ, globs%nx_loc, globs%n_groups_x))
@@ -130,7 +139,7 @@ program xcompact
 
    call cpu_time(t_start)
 
-   call solver%run(100, u, v, w)
+   call solver%run(u, v, w)
 
    call cpu_time(t_end)