R/Individual_Analysis_Results_Summary.R

#' Summarize individual-variant analysis results generated by \code{STAARpipeline} package
#'
#' The \code{Individual_Analysis_Results_Summary} function takes in the objects of individual analysis results
#' generated by \code{STAARpipeline} package,
#' the object from fitting the null model, and the set of known variants to be adjusted for in conditional analysis
#' to summarize the individual analysis results and analyze the conditional association between a quantitative/dichotomous phenotype and
#' the unconditional significant single variants.
#' @param agds_dir a data farme containing directory of GDS/aGDS files.
#' @param jobs_num a data frame containing the number of analysis results, including the number of individual analysis results,
#' the number of sliding window analysis results, and the number of dynamic window analysis results.
#' @param input_path the directory of individual analysis results that generated by \code{STAARpipeline} package.
#' @param output_path the directory for the output files.
#' @param individual_results_name the file name of individual analysis results generated by \code{STAARpipeline} package.
#' @param obj_nullmodel an object from fitting the null model, which is either the output from \code{fit_nullmodel} function in the \code{STAARpipeline} package,
#' or the output from \code{fitNullModel} function in the \code{GENESIS} package and transformed using the \code{genesis2staar_nullmodel} function in the \code{STAARpipeline} package.
#' @param known_loci the data frame of variants to be adjusted for in conditional analysis and should
#' contain 4 columns in the following order: chromosome (CHR), position (POS), reference allele (REF),
#' and alternative allele (ALT) (default = NULL).
#' @param method_cond a character value indicating the method for conditional analysis.
#' \code{optimal} refers to regressing residuals from the null model on \code{known_loci}
#' as well as all covariates used in fitting the null model (fully adjusted) and taking the residuals;
#' \code{naive} refers to regressing residuals from the null model on \code{known_loci}
#' and taking the residuals (default = \code{optimal}).
#' @param QC_label channel name of the QC label in the GDS/aGDS file.
#' @param variant_type type of variant included in the analysis. Choices include "variant", "SNV", or "Indel" (default = "variant").
#' @param geno_missing_imputation method of handling missing genotypes. Either "mean" or "minor" (default = "mean").
#' @param alpha p-value threshold of significant results (default = 5E-09).
#' @param manhattan_plot output manhattan plot or not (default = FALSE).
#' @param QQ_plot output Q-Q plot or not (default = FALSE).
#' @param SPA_p_filter logical: are only the variants with a score-test-based p-value smaller than a pre-specified threshold use the SPA method to recalculate the p-value, only used for imbalanced case-control setting (default = FALSE).
#' @param p_filter_cutoff threshold for the p-value recalculation using the SPA method, only used for imbalanced case-control setting (default = 0.05)
#' @param cond_null_model_name the null model name for conditional analysis in the SPA setting, only used for imbalanced case-control setting (default = NULL).
#' @param cond_null_model_dir the directory of storing the null model for conditional analysis in the SPA setting, only used for imbalanced case-control setting (default = NULL).
#' @return The function returns the following analysis results:
#' @return \code{results_individual_analysis_genome.Rdata}: a matrix contains the score test p-value and effect size estimation of each variant across the genome.
#' @return \code{results_individual_analysis_sig.Rdata} and \code{results_individual_analysis_sig.csv}: a matrix contains the score test p-values and effect size estimations of significant results (p-value < alpha).
#' @return \code{results_sig_cond.Rdata} and \code{results_sig_cond.csv}: a matrix contains the conditional score test p-values for each significant variant  (available if known_loci is not a NULL).
#' @return manhattan plot (optional) and Q-Q plot (optional) of the individual analysis results.
#' @references Li, Z., Li, X., et al. (2022). A framework for detecting
#' noncoding rare-variant associations of large-scale whole-genome sequencing
#' studies. \emph{Nature Methods}, \emph{19}(12), 1599-1611.
#' (\href{https://doi.org/10.1038/s41592-022-01640-x}{pub})
#' @export

Individual_Analysis_Results_Summary <- function(agds_dir,jobs_num,input_path,output_path,individual_results_name,
                                                obj_nullmodel,known_loci=NULL,
                                                method_cond=c("optimal","naive"),
                                                QC_label="annotation/filter",variant_type=c("variant","SNV","Indel"),geno_missing_imputation=c("mean","minor"),
                                                alpha=5E-09,manhattan_plot=FALSE,QQ_plot=FALSE,
                                                SPA_p_filter=FALSE,p_filter_cutoff=0.05,
                                                cond_null_model_name=NULL,cond_null_model_dir=NULL){

	## evaluate choices
	method_cond <- match.arg(method_cond)
	geno_missing_imputation <- match.arg(geno_missing_imputation)

	## set SPA
	if(!is.null(obj_nullmodel$use_SPA))
	{
		use_SPA <- obj_nullmodel$use_SPA
	}else
	{
		use_SPA <- FALSE
	}

	## Summarize Individual Analysis Results
	results_individual_analysis_genome <- c()
	num <- 0
	for(chr in 1:22)
	{
		print(chr)

		if(chr > 1)
		{
			num <- num + jobs_num$individual_analysis_num[chr-1]
		}

		results_individual_analysis_chr <- c()
		for(kk in 1:jobs_num$individual_analysis_num[chr])
		{
			print(kk)
			job_id <- kk + num
			results_individual_analysis <- get(load(paste0(input_path,individual_results_name,"_",job_id,".Rdata")))

			results_individual_analysis_chr <- rbind(results_individual_analysis_chr,results_individual_analysis)
		}
		results_individual_analysis_genome <- rbind(results_individual_analysis_genome,results_individual_analysis_chr)

		rm(results_individual_analysis_chr)
	}

	# save results
	save(results_individual_analysis_genome,file=paste0(output_path,"results_individual_analysis_genome.Rdata"))
	## Significant findings
	results_sig <- results_individual_analysis_genome[results_individual_analysis_genome$pvalue<alpha,]

	# save significant results
	save(results_sig,file=paste0(output_path,"results_individual_analysis_sig.Rdata"))
	write.csv(results_sig,paste0(output_path,"results_individual_analysis_sig.csv"))

	## manhattan plot
	if(manhattan_plot)
	{
		png(paste0(output_path,"manhattan_MAC_20.png"), width = 9, height = 6, units = 'in', res = 600)

		pvalue <- results_individual_analysis_genome$pvalue

		if(min(pvalue)==0)
		{
			if(!use_SPA)
			{
				print(manhattan_plot(results_individual_analysis_genome$CHR, results_individual_analysis_genome$POS, results_individual_analysis_genome$pvalue_log10, use_logp=TRUE, col = c("blue4", "orange3"),sig.level=alpha))
			}else
			{
				pvalue_log10 <- -log10(pvalue)
				pvalue_log10[!is.finite(pvalue_log10)] <- 308

				print(manhattan_plot(results_individual_analysis_genome$CHR, results_individual_analysis_genome$POS, pvalue_log10, use_logp=TRUE, col = c("blue4", "orange3"),sig.level=alpha))

				rm(pvalue_log10)
			}

		}else
		{
			print(manhattan_plot(results_individual_analysis_genome$CHR, results_individual_analysis_genome$POS, results_individual_analysis_genome$pvalue, col = c("blue4", "orange3"),sig.level=alpha))
		}

		rm(pvalue)
		gc()

		dev.off()
	}

	## Q-Q plot
	if(QQ_plot)
	{
		observed <- sort(results_individual_analysis_genome$pvalue)

		if(min(observed)==0)
		{
			if(!use_SPA)
			{
				lobs <- sort(results_individual_analysis_genome$pvalue_log10,decreasing = TRUE)
			}else
			{
				lobs <- -(log10(observed))
				lobs[!is.finite(lobs)] <- 308
			}
		}else
		{
			lobs <- -(log10(observed))
		}

		expected <- c(1:length(observed))
		lexp <- -(log10(expected / (length(expected)+1)))

		rm(results_individual_analysis_genome)
		gc()

		png(paste0(output_path,"qqplot_MAC_20.png"), width = 9, height = 9, units = 'in', res = 600)

		par(mar=c(5,6,4,4))
		plot(lexp,lobs,pch=20, cex=1, xlim = c(0, max(lexp)), ylim = c(0, max(lobs)),
		     xlab = expression(Expected ~ ~-log[10](italic(p))), ylab = expression(Observed ~ ~-log[10](italic(p))),
		     font.lab=2,cex.lab=2,cex.axis=2,font.axis=2)

		abline(0, 1, col="red",lwd=2)

		dev.off()
	}

	## Conditional Analysis
	if(length(known_loci)!=0)
	{
		if(!use_SPA)
		{
			results_sig_cond <- c()
			for(chr in 1:22)
			{
				if(sum(results_sig$CHR==chr)>=1)
				{
					results_sig_chr <- results_sig[results_sig$CHR==chr,]

					gds.path <- agds_dir[chr]
					genofile <- seqOpen(gds.path)

					results_sig_cond_chr <- Individual_Analysis_cond(chr=chr,individual_results=results_sig_chr,genofile=genofile,obj_nullmodel=obj_nullmodel,
					                                                 known_loci=known_loci,method_cond=method_cond,QC_label=QC_label,
					                                                 variant_type=variant_type,geno_missing_imputation=geno_missing_imputation)

					results_sig_cond <- rbind(results_sig_cond,results_sig_cond_chr)

					seqClose(genofile)
				}
			}
			save(results_sig_cond,file=paste0(output_path,"results_sig_cond.Rdata"))
			write.csv(results_sig_cond,paste0(output_path,"results_sig_cond.csv"))
		}else
		{
			results_sig_cond <- c()
			for(chr in 1:22)
			{
				if(sum(results_sig$CHR==chr)>=1)
				{
					if(file.exists(paste0(cond_null_model_dir,cond_null_model_name,".chr",chr,".Rdata")))
					{
						results_sig_chr <- results_sig[results_sig$CHR==chr,,drop=FALSE]

						gds.path <- agds_dir[chr]
						genofile <- seqOpen(gds.path)

						obj_nullmodel_cond <- get(load(paste0(cond_null_model_dir,cond_null_model_name,".chr",chr,".Rdata")))

						results_sig_cond_chr <- Individual_Analysis_cond_spa(chr=chr,individual_results=results_sig_chr,genofile=genofile,obj_nullmodel=obj_nullmodel_cond,
						                                                     QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
						                                                     SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff)

						results_sig_cond <- rbind(results_sig_cond,results_sig_cond_chr)

						seqClose(genofile)
					}else
					{
						results_sig_chr <- results_sig[results_sig$CHR==chr,,drop=FALSE]

						gds.path <- agds_dir[chr]
						genofile <- seqOpen(gds.path)

						results_sig_cond_chr <- Individual_Analysis_cond_spa(chr=chr,individual_results=results_sig_chr,genofile=genofile,obj_nullmodel=obj_nullmodel,
						                                                     QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
						                                                     SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff)

						results_sig_cond <- rbind(results_sig_cond,results_sig_cond_chr)

						seqClose(genofile)
					}
				}
			}
			save(results_sig_cond,file=paste0(output_path,"results_sig_cond.Rdata"))
			write.csv(results_sig_cond,paste0(output_path,"results_sig_cond.csv"))
		}
	}
}