feat: specify conda environments via their name (#1340)

* feat: specify conda environments via their name (preparation) * Implement named env support * docs * fix * fmt * fix code smells * Mention wildcard expansion. * add assertion * fix list envs * fix attribute * fix wildcard test * dbg * cleanup * fix test case
snakemake · Jan 25, 2022 · 735ab23 · 735ab23
1 parent dae7b8f
commit 735ab23
Show file tree

Hide file tree

Showing 11 changed files with 289 additions and 70 deletions.
diff --git a/docs/snakefiles/deployment.rst b/docs/snakefiles/deployment.rst
@@ -246,6 +246,8 @@ with the following `environment definition <https://conda.io/projects/conda/en/l
 
 The path to the environment definition is interpreted as **relative to the Snakefile that contains the rule** (unless it is an absolute path, which is discouraged).
 
+Instead of using a concrete path, it is also possible to provide a path containing wildcards (which must also occur in the output files of the rule), analogous to the specification of input files.
+
 .. sidebar:: Note
 
    Note that conda environments are only used with ``shell``, ``script`` and the ``wrapper`` directive, not the ``run`` directive.
@@ -260,6 +262,37 @@ Note that you need to clean up environments manually for now. However, in many c
 
 Conda deployment also works well for offline or air-gapped environments. Running ``snakemake --use-conda --conda-create-envs-only`` will only install the required conda environments without running the full workflow. Subsequent runs with ``--use-conda`` will make use of the local environments without requiring internet access.
 
+
+.. _conda_named_env:
+
+Using already existing named conda environments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes it can be handy to refer to an already existing named conda environment from a rule, instead of defining a new one from scratch.
+Importantly, one should be aware that this can **hamper reproducibility**, because the workflow then relies on this environment to be present
+**in exactly the same way** on any new system where the workflow is executed. Essentially, you will have to take care of this manually in such a case.
+Therefore, the approach using environment definition files described above is highly recommended and preferred.
+
+Nevertheless, in case you are still sure that you want to use an existing named environment, it can simply be put into the conda directive, e.g.
+
+.. code-block:: python
+
+    rule NAME:
+        input:
+            "table.txt"
+        output:
+            "plots/myplot.pdf"
+        conda:
+            "some-env-name"
+        script:
+            "scripts/plot-stuff.R"
+
+For such a rule, Snakemake will just activate the given environment, instead of automatically deploying anything.
+Instead of using a concrete name, it is also possible to provide a name containing wildcards (which must also occur in the output files of the rule), analogous to the specification of input files.
+
+Note that Snakemake distinguishes file-based environments from named ones as follows:
+if the given specification ends with ``.yaml`` or ``.yml``, Snakemake assumes it to be a path to an environment definition file; otherwise, it assumes the given specification
+to be the name of an existing environment.
+
 .. _singularity:
 
 

diff --git a/snakemake/dag.py b/snakemake/dag.py
@@ -275,37 +275,36 @@ def cleanup(self):
     def create_conda_envs(
         self, dryrun=False, forceall=False, init_only=False, quiet=False
     ):
-        # First deduplicate based on job.conda_env_file
+        # First deduplicate based on job.conda_env_spec
         jobs = self.jobs if forceall else self.needrun_jobs
         env_set = {
-            (job.conda_env_file, job.container_img_url)
+            (job.conda_env_spec, job.container_img_url)
             for job in jobs
-            if job.conda_env_file
+            if job.conda_env_spec
         }
         # Then based on md5sum values
         self.conda_envs = dict()
-        for (env_file, simg_url) in env_set:
+        for (env_spec, simg_url) in env_set:
             simg = None
             if simg_url and self.workflow.use_singularity:
                 assert (
                     simg_url in self.container_imgs
                 ), "bug: must first pull singularity images"
                 simg = self.container_imgs[simg_url]
-            env = conda.Env(
-                env_file,
+            env = env_spec.get_conda_env(
                 self.workflow,
                 container_img=simg,
                 cleanup=self.workflow.conda_cleanup_pkgs,
             )
-            self.conda_envs[(env_file, simg_url)] = env
+            self.conda_envs[(env_spec, simg_url)] = env
 
         if not init_only:
             for env in self.conda_envs.values():
-                if not dryrun or not quiet:
+                if (not dryrun or not quiet) and not env.is_named:
                     env.create(dryrun)
 
     def pull_container_imgs(self, dryrun=False, forceall=False, quiet=False):
-        # First deduplicate based on job.conda_env_file
+        # First deduplicate based on job.container_img_url
         jobs = self.jobs if forceall else self.needrun_jobs
         img_set = {
             (job.container_img_url, job.is_containerized)
@@ -2075,7 +2074,7 @@ def add(path):
                 logger.info("Archiving conda environments...")
                 envs = set()
                 for job in self.jobs:
-                    if job.conda_env_file:
+                    if job.conda_env_spec:
                         env_archive = job.archive_conda_env()
                         envs.add(env_archive)
                 for env in envs: