Merge pull request #63 from bodkan/qpadm-rotation

Add prototype of qpAdm rotation strategy
bodkan · Jun 21, 2020 · fec8b6a · fec8b6a
2 parents bef1afb + 3e8f260
commit fec8b6a
Show file tree

Hide file tree

Showing 47 changed files with 1,017 additions and 176 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -27,4 +27,4 @@ before_install:
     # return to the top directory
   - popd
 after_success:
-  - Rscript -e 'covr::codecov()'
+  - Rscript -e 'covr::codecov(exclusions = "R/log.R", function_exclusions = c("print\\."))'
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: admixr
 Title: An Interface for Running 'ADMIXTOOLS' Analyses
-Version: 0.8.7
+Version: 0.9.0
 Authors@R: 
     person(given = "Martin",
            family = "Petr",

diff --git a/NAMESPACE b/NAMESPACE
@@ -15,6 +15,8 @@ export(keep_transitions)
 export(loginfo)
 export(merge_eigenstrat)
 export(qpAdm)
+export(qpAdm_filter)
+export(qpAdm_rotation)
 export(qpWave)
 export(read_geno)
 export(read_ind)
@@ -28,3 +30,4 @@ export(write_ind)
 export(write_snp)
 import(rlang)
 importFrom(magrittr,"%>%")
+importFrom(utils,combn)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,16 @@
+# admixr 0.9.0
+
+* New prototype implementation of exhaustive search of qpAdm models
+  through rotation of sources and outgroups (`qpAdm_rotation()`).
+* New tutorial describing the `qpAdm_rotation()` procedure.
+* Fixed broken regex parsing of qpWave log files in some cases.
+* Added type checking to a couple of functions (not all wrappers type
+  checked yet).
+
 # admixr 0.8.7
 
-* qpAdm proportions table now shows p-values and SNP counts (both for overall analysis but also for the target sample).
+* qpAdm proportions table now shows p-values and SNP counts (both for
+  overall analysis but also for the target sample).
 
 # admixr 0.8.6
 
@@ -25,114 +35,124 @@
 
 # admixr 0.8.1
 
-* Fixed a minor bug in single-source qpAdm analyses. The package now detects
-  this and correctly handles missing "subsets/patterns" information.
+* Fixed a minor bug in single-source qpAdm analyses. The package now
+  detects this and correctly handles missing "subsets/patterns"
+  information.
 
 # admixr 0.8.0
 
-Finally resumed the development of _admixr_! Apologies to everyone
-for having to wait so long. Thank you for your patience and feedback since the last
-release. I hope that things will start moving a little bit faster and we will
-reach version 1.0 in the next couple of months.
+Finally resumed the development of _admixr_! Apologies to everyone for
+having to wait so long. Thank you for your patience and feedback since
+the last release. I hope that things will start moving a little bit
+faster and we will reach version 1.0 in the next couple of months.
 
 New features and improvements:
 
-* New function `loginfo()` which operates on any output object from an admixr wrapper
-  and shows the full log output (the "log file" in ADMIXTOOLS jargon)
-  associated with the analysis. It also has options for saving the log file to
-  a permanent location and to only show a log file for a target sample of interest
-  (relevant for _qpAdm_ analyses with multiple targets at once).
-* _admixr_ can now (hopefully) detect broken/truncated output files generated
-  by ADMIXTOOLS whenever there is some fatal issue with an analysis.
-  The package should now detect this and inform the user.
-* Removed _qpAdm_ argument `details` - the user will always want to see the
-  full analysis summary so this option is redundant. It is still kept in
-  _qpWave_ - for now, until I figure out how useful the full output actually is.
-  Given the implementation of `loginfo()` above, we might remove the argument
-  from _qpWave_ too at some point soon.
-* The function `download_data()` now fetches data from a more stable location.
+* New function `loginfo()` which operates on any output object from an
+  admixr wrapper and shows the full log output (the "log file" in
+  ADMIXTOOLS jargon) associated with the analysis. It also has options
+  for saving the log file to a permanent location and to only show a
+  log file for a target sample of interest (relevant for _qpAdm_
+  analyses with multiple targets at once).
+* _admixr_ can now (hopefully) detect broken/truncated output files
+  generated by ADMIXTOOLS whenever there is some fatal issue with an
+  analysis.  The package should now detect this and inform the user.
+* Removed _qpAdm_ argument `details` - the user will always want to
+  see the full analysis summary so this option is redundant. It is
+  still kept in _qpWave_ - for now, until I figure out how useful the
+  full output actually is.  Given the implementation of `loginfo()`
+  above, we might remove the argument from _qpWave_ too at some point
+  soon.
+* The function `download_data()` now fetches data from a more stable
+  location.
 
 # admixr 0.7.1
 
-* Parse admixture proportions from the "best coefficients" line of the qpAdm
-  output file.
+* Parse admixture proportions from the "best coefficients" line of the
+  qpAdm output file.
 
 # admixr 0.7.0
 
 * Fixed handling of warnings when dealing with duplicated populations.
-* 9s are now replaced with NAs in `read_geno()` and `write_geno()`, which makes
-  it more convenient to write custom analytic code working on data.frames.
-* Renamed `qpAdm()` output elements and changed its function signature.
+* 9s are now replaced with NAs in `read_geno()` and `write_geno()`,
+  which makes it more convenient to write custom analytic code working
+  on data.frames.
+* Renamed `qpAdm()` output elements and changed its function
+  signature.
 
 # admixr 0.6.3
 
-* Rename `keep_transversions()` to `transversions_only()`. The old function is
-  now deprecated.
-* `print.EIGENSTRAT()` now uses a pre-calculated numbers of removed/remaining
-  sites, instead of calculating them each and every time.
-* Printing EIGENSTRAT objects now also shows "group" and "exclude" modifiers
-  only if present.
+* Rename `keep_transversions()` to `transversions_only()`. The old
+  function is now deprecated.
+* `print.EIGENSTRAT()` now uses a pre-calculated numbers of
+  removed/remaining sites, instead of calculating them each and every
+  time.
+* Printing EIGENSTRAT objects now also shows "group" and "exclude"
+  modifiers only if present.
 
 # admixr 0.6.2
 
 * `read_output()` made public.
-* Fixed issues with parsing of mis-formatted ind files (tabs at the ends of
-  lines etc.).
+* Fixed issues with parsing of mis-formatted ind files (tabs at the
+  ends of lines etc.).
 
 # admixr 0.6.1
 
-* It turned out that dragging along Rcpp and Boost dependencies just for the
-  VCF -> EIGENSTRAT conversion function causes unnecessary complications in
-  the installation process. It's not worth having it in the package if it
-  would be used only by a small fraction of potential users.
+* It turned out that dragging along Rcpp and Boost dependencies just
+  for the VCF -> EIGENSTRAT conversion function causes unnecessary
+  complications in the installation process. It's not worth having it
+  in the package if it would be used only by a small fraction of
+  potential users.
 
-  This function has been removed and the `vcf2eigenstrat` program is maintained
-  in its own repository.
+  This function has been removed and the `vcf2eigenstrat` program is
+  maintained in its own repository.
 
 # admixr 0.6.0
 
-* Conversion of VCF to EIGENSTRAT format is now implemented in C++ and should
-  be approximately infinitely faster than the old conversion function written
-  in pure R.
+* Conversion of VCF to EIGENSTRAT format is now implemented in C++ and
+  should be approximately infinitely faster than the old conversion
+  function written in pure R.
 * Conversion of EIGENSTRAT _into_ VCF has been removed.
 
 # admixr 0.5.0
 
 * Added full implementations of `qpWave()` and `qpAdm()` functions.
-* `filter_bed()` now implemented simply by calling `bedtools` in the background.
-  This turned out to be way faster and memory efficient than the previous
-  data.table-based solution.
+* `filter_bed()` now implemented simply by calling `bedtools` in the
+  background.  This turned out to be way faster and memory efficient
+  than the previous data.table-based solution.
 
 # admixr 0.4.1
 
 * Fixed missing `group_labels()` update.
-* Removed the huge built-in data set. Implemented `download_data()` function
-  that fetches the example data set from the web.
+* Removed the huge built-in data set. Implemented `download_data()`
+  function that fetches the example data set from the web.
 
 # admixr 0.4.0
 
-* The package now has a tutorial vignette describing the main functionality.
+* The package now has a tutorial vignette describing the main
+  functionality.
 * Simple SNP dataset is now included with the package.
-* The API of many utility functions has been simplified and their internals
-  re-written.
-* `filter_sites` is now implemented using `data.table` and allows overlap with
-  an arbitrary BED file.
+* The API of many utility functions has been simplified and their
+  internals re-written.
+* `filter_sites` is now implemented using `data.table` and allows
+  overlap with an arbitrary BED file.
 
 # admixr 0.3.0
 
 * All wrappers have been given simpler names (`qpDstat()` -> `d()`,
   `qpF4ratio()` -> `f4ratio()`, etc).
-* F4 statistic can now be calculated using a separate `f4()` function (`f4mode`
-  parameter remains in the `d()` function though, as `f4()` calls `d()`
-  internally).
-* All tests are performed on Travis CI using installed and compiled ADMIXTOOLS
-  software.
+* F4 statistic can now be calculated using a separate `f4()` function
+  (`f4mode` parameter remains in the `d()` function though, as `f4()`
+  calls `d()` internally).
+* All tests are performed on Travis CI using installed and compiled
+  ADMIXTOOLS software.
 
 # admixr 0.2.0
 
 * The package now includes qpAdm functionality.
 
-* Formal tests for all implemented wrapper functions have been implemented.
+* Formal tests for all implemented wrapper functions have been
+  implemented.
 
 # admixr 0.1.0
 

diff --git a/R/config_files.R b/R/config_files.R
@@ -43,6 +43,17 @@ create_qpDstat_pop_file <- function(W = NULL, X = NULL, Y = NULL, Z = NULL, file
 }
 
 
+# Generate a file with populations for a qpDstat run based on given
+# list of quartets.
+create_qpDstat_pop_file_quartets <- function(quartets, file) {
+  lines <- c()
+  for (q in quartets) {
+    lines <- c(lines, sprintf("%s %s %s %s", q[1], q[2], q[3], q[4]))
+  }
+  writeLines(lines, file)
+}
+
+
 # Generate a file with populations for a qp3Pop run.
 create_qp3Pop_pop_file <- function(A, B, C, file) {
   lines <- c()

diff --git a/R/log.R b/R/log.R
@@ -18,23 +18,23 @@ loginfo <- function(x, target = NA, save = FALSE, prefix = NA, dir = ".", suffix
   log_output <- attr(x, "log_output")
   targets <- names(log_output)
 
-  if (!is.na(target) && cmd != "qpAdm")
+  if (!is.na(target) && !cmd %in% c("qpAdm", "qpAdm_rotation"))
     stop(glue::glue("Specifying target does not make sense for examining the log output of {cmd}"),
          call. = FALSE)
 
   if (!is.na(target) && !target %in% targets)
-    stop(glue::glue("Target '{target}' is not present in the output (choices are: {paste(targets, collapse = ', ')})"),
+    stop(glue::glue("Target/model '{target}' is not present in the output (choices are: {paste(targets, collapse = ', ')})"),
          call. = FALSE)
 
-  # qpAdm's log output are stored as a list of character vectors but everything
-  # else is simply a character vector - we convert everything to a list to
-  # iterate over log outputs below
+  ## qpAdm's log output are stored as a list of character vectors but everything
+  ## else is simply a character vector - we convert everything to a list to
+  ## iterate over log outputs below
   if (!is.list(log_output))
     log_output <- list(log_output)
 
   for (i in seq_along(log_output)) {
-    # write only single target log if requested by the user
-    if (!is.na(target) && !is.null(targets) && target != targets[i]) next
+      ## write only single target/model log if requested by the user
+      if (!is.na(target) && !is.null(targets) && target != targets[i]) next
 
     if (save) {
       if (is.na(prefix)) prefix <- cmd
@@ -49,14 +49,17 @@ loginfo <- function(x, target = NA, save = FALSE, prefix = NA, dir = ".", suffix
     } else {
       if (cmd == "qpAdm") {
         title <- glue::glue("qpAdm for target '{targets[i]}'")
+      } else if (cmd == "qpAdm_rotation") {
+        title <- glue::glue("qpAdm rotation for model '{targets[i]}'")
       } else {
         title <- cmd
       }
 
       if (i > 1 && is.na(target)) cat("\n\n")
       cat(paste0("Full output log of ", title, ":\n"))
-      cat("==================================================\n\n")
+      cat("===================================================\n\n")
       cat(paste(log_output[[i]], collapse = "\n"))
+      cat("\n")
     }
   }
 }
@@ -70,7 +73,7 @@ loginfo <- function(x, target = NA, save = FALSE, prefix = NA, dir = ".", suffix
 #'
 #' @export
 print.admixr_result <- function(x, ...) {
-  if (attr(x, "command") == "qpAdm") {
+    if (attr(x, "command") %in% c("qpAdm", "qpAdm_rotation") && length(x) == 3) {
     print.default(list(
       proportions = x$proportions,
       ranks = x$ranks,

diff --git a/R/output_parsers.R b/R/output_parsers.R
@@ -171,7 +171,7 @@ read_qpWave <- function(log_lines, details = FALSE) {
   a_end <- c(test_pos[-c(1, 2)], which(stringr::str_detect(log_lines, "## end of run")))
 
   test_df <- log_lines[test_pos + 1] %>%
-    stringr::str_replace_all(" *[a-z0-9]+: ", "") %>%
+    stringr::str_replace_all("[a-z0-9]+: ", "") %>%
     stringr::str_replace_all(" +", "\t") %>%
     paste0(collapse = "\n") %>%
     readr::read_tsv(col_names = c("rank", "df", "chisq", "tail", "dfdiff",