Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPUed PRESTO2 #42

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
23 changes: 23 additions & 0 deletions Howto_GPU.md
@@ -0,0 +1,23 @@
# GPU-accelerated PRESTO

The `accelsearch` program in PRESTO has been accelerated with CUDA by Jintao Luo. Questions about how to compile and use the GPU routines can be sent to jluo@nrao.edu.

## How to compile

### Compile for CPU only
Whether PRESTO is compiled for CPU only or with the GPU routines is controlled by line 22 in $PRESTO/src/Makefile. For a CPU-only build (the intended default), this line should read:
>use_cuda = no# yes/no

### Compile with GPU routines
Open $PRESTO/src/Makefile and do the following:

1. Make sure line 22 is:
>use_cuda = yes# yes/no

2. Check the CUDA-related variables and flags, and modify them if necessary.

## How to use
To run accelsearch on a GPU, pass the -cuda option; without it the program runs on the CPU. For example:
>accelsearch -numharm 16 -zmax 256 ur_data.dat -cuda 0

The argument 0 selects the first GPU in your machine.
70 changes: 70 additions & 0 deletions include/accel_utils_gpu.h
@@ -0,0 +1,70 @@
/* Tuning constants for the acceleration search. */
/* NOTE: these macros are placed above this header's include guard, */
/* so they are processed again on every inclusion of the file. */
/* ACCEL_USELEN must be less than 65536 since we */
/* use unsigned short ints to index our arrays... */
/* Alternative values, currently disabled: */
/* #define ACCEL_USELEN 32160 */
/* #define ACCEL_USELEN 16000 */
#define ACCEL_USELEN 7560
/* NOTE(review): presumably the value stored in kernel.numbetween */
/* (2 = interbin) -- confirm against the callers. */
#define ACCEL_NUMBETWEEN 2
/* Stepsize in Fourier Freq */
#define ACCEL_DR 0.5
/* Reciprocal of ACCEL_DR */
#define ACCEL_RDR 2
/* Stepsize in Fourier F-dot */
#define ACCEL_DZ 2
/* Reciprocal of ACCEL_DZ */
#define ACCEL_RDZ 0.5
/* Closest candidates we will accept as independent */
#define ACCEL_CLOSEST_R 15.0
/* Padding for .dat file reading so that we don't SEGFAULT */
#define ACCEL_PADDING 2000


/* Fallback definition: only takes effect when no earlier header has */
/* already defined DBLCORRECT. Its use is not visible in this header. */
#ifndef DBLCORRECT
#define DBLCORRECT 1e-14
#endif

#ifndef _ACCEL_UTILS_GPU_
#define _ACCEL_UTILS_GPU_

/* Record describing the candidate selected from one sub-array/plane, */
/* together with the number of candidates that sub-array/plane holds. */
typedef struct accel_cand_gpu {
    float pow;        /* pow of the selected candidate */
    int   nof_cand;   /* candidate count in the sub_array/plane */
    int   z_ind;      /* z index of the selected candidate */
    int   r_ind;      /* r index of the selected candidate */
} accel_cand_gpu;

/* Declare fcomplex only once, even if another header also provides it. */
#ifndef _FCOMPLEX_DECLARED_
#define _FCOMPLEX_DECLARED_
/* Pair of floats; presumably the real (r) and imaginary (i) parts. */
typedef struct fcomplex {
    float r;
    float i;
} fcomplex;
#endif /* _FCOMPLEX_DECLARED_ */

/* A single FFTd kernel plus the bookkeeping needed to use it. */
typedef struct kernel {
    int       z;                /* Fourier f-dot this kernel corresponds to */
    int       fftlen;           /* Kernel length, in complex points */
    int       numgoodbins;      /* How many good points you can get back */
    int       numbetween;       /* Fourier freq resolution (2=interbin) */
    int       kern_half_width;  /* Half width of the raw kernel, in bins */
    fcomplex *data;             /* The FFTd kernel itself */
} kernel;

/* Per-sub-harmonic search information: its kernels and index table. */
typedef struct subharminfo {
    int             numharm;  /* Number of sub-harmonics */
    int             harmnum;  /* Sub-harmonic number (fundamental = numharm) */
    int             zmax;     /* Maximum Fourier f-dot for this harmonic */
    int             numkern;  /* Number of entries in the kern vector */
    kernel         *kern;     /* The kernels themselves */
    unsigned short *rinds;    /* Table of indices for Fourier Freqs */
} subharminfo;


/* Operation selectors used by the correlation/convolution routines. */
typedef enum {
    CONV,
    CORR,
    INPLACE_CONV,
    INPLACE_CORR
} presto_optype;

/* NOTE(review): presumably selects which inputs (data "D" and/or  */
/* kernel "K") still need to be FFTd -- confirm against the callers. */
typedef enum {
    FFTDK,
    FFTD,
    FFTK,
    NOFFTS
} presto_ffts;

#endif
5 changes: 5 additions & 0 deletions include/accelsearch_cmd.h
Expand Up @@ -72,6 +72,11 @@ typedef struct s_Cmdline {
/*@null*/char **argv;
/***** the whole command line concatenated */
char *full_cmd_line;
/***** cuda device parameters */
#ifdef USECUDA
char cudaP;
char cuda;
#endif
} Cmdline;


Expand Down
115 changes: 110 additions & 5 deletions src/Makefile
@@ -1,5 +1,6 @@
# Makefile for PRESTO
# by Scott M. Ransom
# GPU-related sections by Jintao Luo

# OS type
OS = Linux
Expand All @@ -8,13 +9,60 @@ OS = Linux
# Linux is the first choice
ifeq ($(OS),Linux)
LIBSUFFIX = .so
LIBCMD = -shared
LIBCMD = -shared
# else assume Darwin (i.e. OSX)
else
LIBSUFFIX = .dylib
LIBCMD = -dynamiclib
endif


#------------------------------------------------------------------------------------------
# Determine whether to compile with CUDA
use_cuda = yes# yes/no
# NOTE(review): Howto_GPU.md describes CPU-only as the default build, but the
# value above is "yes" -- confirm which default is intended.  Howto_GPU.md
# refers to the use_cuda line by its line number, so keep it where it is.

ifeq ($(use_cuda),yes)
$(info Compile with CUDA)
#******************************************************************
# Settings for the GPU (CUDA) version

# CUDA code generation flags, one variable per architecture family.
# The lowest architecture built is sm_11; no sm_10 code is generated.
GENCODE_SM11 := -gencode arch=compute_11,code=sm_11
GENCODE_SM20 := -gencode arch=compute_20,code=sm_20
GENCODE_SM30 := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35
GENCODE_FLAGS := $(GENCODE_SM11) $(GENCODE_SM20) $(GENCODE_SM30)

# Location of the CUDA Toolkit binaries and libraries.
# Modify it if necessary, e.g. /usr/local/cuda-5.0
CUDA_PATH ?= /usr/local/cuda

# Include search path passed to nvcc; modify it if necessary
CUDA_INC_PATH ?= -I $(CUDA_PATH)/include -I. -I.. -I $(CUDA_PATH)/samples/common/inc/ -I$(PRESTO)/include

CUDA_BIN_PATH ?= $(CUDA_PATH)/bin
NVCC ?= $(CUDA_BIN_PATH)/nvcc

# Modify it if necessary, for example -m32 on a 32-bit machine
NVCCFLAGS ?= -m64 -ccbin g++

EXTRA_NVCCFLAGS ?= -Xcompiler -fPIC -O2

# Modify it if necessary, -m64 to -m32 for a 32-bit machine
LINKCOMMAND_CUDA = g++ -m64 -O2 -o

# Modify it if necessary; a 32-bit toolkit may use a different library directory
CUDA_LINK_EXTRA_FLAGS = -L $(CUDA_PATH)/lib64 -lcufft -lcudart
#******************************************************************
else ifeq ($(use_cuda),no)
$(info Compile without CUDA)
else
# Every later conditional tests for exactly "yes" or "no"; any other value
# would leave the object list and link rules undefined, so stop right here.
$(error use_cuda must be "yes" or "no", got "$(use_cuda)")
endif
#------------------------------------------------------------------------------------------



# How to link with some needed libraries of PGPLOT
X11LINK := $(shell pkg-config --libs x11)
PNGLINK := $(shell pkg-config --libs libpng)
Expand All @@ -37,7 +85,14 @@ CFITSIOINC := $(shell pkg-config --cflags cfitsio)
CFITSIOLINK := $(shell pkg-config --libs cfitsio)

# The standard PRESTO libraries to link into executables
PRESTOLINK = $(CFITSIOLINK) -L$(PRESTO)/lib -lpresto $(FFTLINK)

# A CUDA build additionally links the CUDA libraries and libstdc++.
# Appending (instead of redefining the whole value per branch) keeps a single
# copy of the common part and leaves PRESTOLINK defined for every build.
# NOTE(review): SYSDIR is not set in the visible part of this Makefile; if it
# is unset the path below becomes /lib/x86_64-linux-gnu -- confirm.
ifeq ($(use_cuda),yes)
PRESTOLINK += $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++
endif

CC = gcc
FC = gfortran
Expand All @@ -46,6 +101,11 @@ FC = gfortran
CFLAGS = -I$(PRESTO)/include $(GLIBINC) $(CFITSIOINC) $(PGPLOTINC) $(FFTINC) \
-DUSEFFTW -DUSEMMAP -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \
-g -O3 -ffast-math -Wall -W -fPIC

# Define USECUDA for the C sources when building with GPU support
# (accelsearch_cmd.h, for example, guards its cuda fields with #ifdef USECUDA).
ifeq ($(use_cuda),yes)
CFLAGS += -DUSECUDA
endif

# -g -Wall -W -fPIC
CLINKFLAGS = $(CFLAGS)
# NOTE: Be careful of upping the optimization on the
Expand All @@ -69,6 +129,7 @@ CLIG = clig
mv ../clig/$*_cmd.c .
cp ../clig/$*.1 ../docs/

# Objects that go into libpresto in every build flavour.
# Kept as one list so the CPU and GPU builds cannot drift apart.
PRESTOOBJS = amoeba.o atwood.o barycenter.o birdzap.o cand_output.o\
	characteristics.o cldj.o chkio.o corr_prep.o corr_routines.o\
	correlations.o database.o dcdflib.o dispersion.o\
	fastffts.o fftcalls.o fminbr.o fold.o fresnl.o ioinf.o\
	get_candidates.o iomak.o ipmpar.o maximize_r.o maximize_rz.o\
	median.o minifft.o misc_utils.o clipping.o\
	orbint.o output.o read_fft.o responses.o\
	rzinterp.o rzwinterp.o select.o sorter.o swapendian.o\
	transpose.o twopass.o twopass_real_fwd.o\
	twopass_real_inv.o vectors.o multifiles.o mask.o\
	fitsfile.o hget.o hput.o imio.o djcl.o

# The CUDA build also adds the object compiled from accel_utils_gpu.cu
ifeq ($(use_cuda),yes)
PRESTOOBJS += accel_utils_gpu.o
endif

INSTRUMENTOBJS = backend_common.o zerodm.o sigproc_fb.o psrfits.o

Expand Down Expand Up @@ -112,6 +187,14 @@ indent:
prep:
touch *_cmd.c

# CPU-only build: just report that the GPU object is skipped
ifeq ($(use_cuda),no)
$(info accel_utils_gpu.o will not be compiled)
endif

# CUDA build: compile the .cu source with nvcc into a regular object file
ifeq ($(use_cuda),yes)
accel_utils_gpu.o: accel_utils_gpu.cu
	$(NVCC) $(EXTRA_NVCCFLAGS) $(NVCCFLAGS) $(CUDA_INC_PATH) $(GENCODE_FLAGS) -o $@ -c $<
endif

makewisdom:
$(CC) $(CLINKFLAGS) -o $@ makewisdom.c $(FFTLINK)
./makewisdom
Expand All @@ -124,8 +207,18 @@ timetest:

libpresto: libpresto$(LIBSUFFIX)

# Extra link inputs libpresto needs when the CUDA object is part of it;
# empty for a CPU-only build.
# NOTE(review): SYSDIR is not set in the visible part of this Makefile; if it
# is unset the path below becomes /lib/x86_64-linux-gnu -- confirm.
LIBPRESTO_GPULINK =
ifeq ($(use_cuda),yes)
LIBPRESTO_GPULINK = $(CUDA_LINK_EXTRA_FLAGS) -L $(SYSDIR)/lib/x86_64-linux-gnu -lstdc++
endif

# One rule for both build flavours: the library is linked exactly once, and
# the rule exists no matter how use_cuda is set.
libpresto$(LIBSUFFIX): $(PRESTOOBJS)
	$(LINKCOMMAND1) $(PRESTO)/lib/$@ $(PRESTOOBJS) $(FFTLINK) $(LIBPRESTO_GPULINK)

slalib: libsla$(LIBSUFFIX)
cd slalib ; $(FC) -o sla_test sla_test.f -fno-second-underscore -L$(PRESTO)/lib -lsla
Expand All @@ -138,6 +231,17 @@ libsla$(LIBSUFFIX):

binaries: $(BINARIES)

# GPU-only additions to the accelsearch link; both stay empty for CPU builds
ACCELSEARCH_GPUOBJS =
ACCELSEARCH_GPULINK =

ifeq ($(use_cuda),yes)
$(info link accelsearch with CUDA)
ACCELSEARCH_GPUOBJS = accel_utils_gpu.o
ACCELSEARCH_GPULINK = $(CUDA_LINK_EXTRA_FLAGS)
endif
ifeq ($(use_cuda),no)
$(info link accelsearch without CUDA)
endif

ACCELSEARCH_OBJS = accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(ACCELSEARCH_GPUOBJS)

# Single rule for both flavours.  glib is linked through $(GLIBLINK) in the
# CUDA build too, matching the CPU build, instead of a hard-coded -lglib-2.0.
accelsearch: accelsearch_cmd.c $(ACCELSEARCH_OBJS)
	$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ $(ACCELSEARCH_OBJS) $(PRESTOLINK) $(GLIBLINK) -lm $(ACCELSEARCH_GPULINK)

mpi: mpiprepsubband

mpiprepsubband_utils.o: mpiprepsubband_utils.c
Expand All @@ -149,8 +253,8 @@ mpiprepsubband.o: mpiprepsubband.c
mpiprepsubband: mpiprepsubband_cmd.c mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS)
mpicc $(CLINKFLAGS) -o $(PRESTO)/bin/$@ mpiprepsubband_cmd.o mpiprepsubband_utils.o mpiprepsubband.o $(INSTRUMENTOBJS) $(PRESTOLINK) -lcfitsio -lm

accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o
$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm
#accelsearch: accelsearch_cmd.c accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o
# $(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ accelsearch_cmd.o accel_utils.o accelsearch.o zapping.o $(PRESTOLINK) $(GLIBLINK) -lm

bary: bary.o
$(CC) $(CLINKFLAGS) -o $(PRESTO)/bin/$@ bary.o $(PRESTOLINK) -lm
Expand Down Expand Up @@ -305,3 +409,4 @@ squeaky: cleaner
cd $(PRESTO)/docs ; rm -f *# *~
cd $(PRESTO)/python ; rm -f *# *~ *.o *.pyc *.pyo
cd $(PRESTO)/include ; rm -f *# *~