From 53972bfddcca836d5abb8cdd452cbea40ab2571f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Thu, 19 May 2022 13:26:26 +0200 Subject: [PATCH] feat: enable the definition of conda pin files in order to freeze an environment. This can drastically increase the robustness because it allows to freeze an environment at a working state. (#1667) * feat: enable the definition of conda pin files in order to freeze an environment * fixes and test case * skip on win * docs --- docs/snakefiles/deployment.rst | 85 +++++++--- snakemake/deployment/conda.py | 158 +++++++++++++----- tests/test_conda_pin_file/Snakefile | 10 ++ .../expected-results/test.txt | 1 + .../test-env.linux-64.pin.txt | 9 + tests/test_conda_pin_file/test-env.yaml | 4 + tests/tests.py | 5 + 7 files changed, 200 insertions(+), 72 deletions(-) create mode 100644 tests/test_conda_pin_file/Snakefile create mode 100644 tests/test_conda_pin_file/expected-results/test.txt create mode 100644 tests/test_conda_pin_file/test-env.linux-64.pin.txt create mode 100644 tests/test_conda_pin_file/test-env.yaml diff --git a/docs/snakefiles/deployment.rst b/docs/snakefiles/deployment.rst index 6835d6e30..f233ff642 100644 --- a/docs/snakefiles/deployment.rst +++ b/docs/snakefiles/deployment.rst @@ -287,44 +287,41 @@ Note that you need to clean up environments manually for now. However, in many c Conda deployment also works well for offline or air-gapped environments. Running ``snakemake --use-conda --conda-create-envs-only`` will only install the required conda environments without running the full workflow. Subsequent runs with ``--use-conda`` will make use of the local environments without requiring internet access. +Freezing environments to exactly pinned packages +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _conda_named_env: +If Snakemake finds a special file ending on ``<platform>.pin.txt`` next to a conda environment file (with ``<platform>`` being the current platform, e.g. 
``linux-64``), it will try to use the contents of that file to determine the conda packages to deploy. +The file is expected to contain conda's `explicit specification file format <https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#building-identical-conda-environments>`_. +Snakemake will first try to deploy the environment using that file, and only if that fails it will use the regular environment file. -Using already existing named conda environments -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This enables one to freeze an environment at a certain state, and will ensure that people using a workflow will get exactly the same environments down to the individual package builds, which is in fact very similar to providing the environment encapsulated in a container image. +Generating such pin files for conda environments can be automatically done using `Snakedeploy <https://snakedeploy.readthedocs.io>`_. +Let ``envs/ggplot.yaml`` be the conda environment file used in the example above. +Then, the pinning can be generated with -Sometimes it can be handy to refer to an already existing named conda environment from a rule, instead of defining a new one from scratch. -Importantly, one should be aware that this can **hamper reproducibility**, because the workflow then relies on this environment to be present -**in exactly the same way** on any new system where the workflow is executed. Essentially, you will have to take care of this manually in such a case. -Therefore, the approach using environment definition files described above is highly recommended and preferred. +.. code-block:: bash -Nevertheless, in case you are still sure that you want to use an existing named environment, it can simply be put into the conda directive, e.g. + snakedeploy pin-conda-envs envs/ggplot.yaml -.. code-block:: python +Multiple paths to environments can be provided at the same time; also see ``snakedeploy pin-conda-envs --help``. 
- rule NAME: - input: - "table.txt" - output: - "plots/myplot.pdf" - conda: - "some-env-name" - script: - "scripts/plot-stuff.R" +Of course, it is **important to update the pinnings** whenever the original environment is modified, such that they do not diverge. -For such a rule, Snakemake will just activate the given environment, instead of automatically deploying anything. -Instead of using a concrete name, it is also possible to provide a name containing wildcards (which must also occur in the output files of the rule), analogous to the specification of input files. +Updating environments +~~~~~~~~~~~~~~~~~~~~~ -Note that Snakemake distinguishes file based environments from named ones as follows: -if the given specification ends on ``.yaml`` or ``.yml``, Snakemake assumes it to be a path to an environment definition file; otherwise, it assumes the given specification -to be the name of an existing environment. +When a workflow contains many conda environments, it can be helpful to automatically update them to the latest versions of all packages. +This can be done automatically via `Snakedeploy <https://snakedeploy.readthedocs.io>`_: -.. _singularity: +.. code-block:: bash + + snakedeploy update-conda-envs envs/ggplot.yaml + +Multiple paths to environments can be provided at the same time; also see ``snakedeploy update-conda-envs --help``. ---------------------------------- Providing post-deployment scripts ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From Snakemake 6.14 onwards post-deployment shell-scripts can be provided to perform additional adjustments of a conda environment. This might be helpful in case a conda package is missing components or requires further configuration for execution. @@ -351,6 +348,42 @@ Post-deployment scripts must be placed next to their corresponding environment-f The path of the conda environment can be accessed within the script via ``$CONDA_PREFIX``. + +.. 
_conda_named_env: + +----------------------------------------------- +Using already existing named conda environments +----------------------------------------------- + +Sometimes it can be handy to refer to an already existing named conda environment from a rule, instead of defining a new one from scratch. +Importantly, one should be aware that this can **hamper reproducibility**, because the workflow then relies on this environment to be present +**in exactly the same way** on any new system where the workflow is executed. Essentially, you will have to take care of this manually in such a case. +Therefore, the approach using environment definition files described above is highly recommended and preferred. + +Nevertheless, in case you are still sure that you want to use an existing named environment, it can simply be put into the conda directive, e.g. + +.. code-block:: python + + rule NAME: + input: + "table.txt" + output: + "plots/myplot.pdf" + conda: + "some-env-name" + script: + "scripts/plot-stuff.R" + +For such a rule, Snakemake will just activate the given environment, instead of automatically deploying anything. +Instead of using a concrete name, it is also possible to provide a name containing wildcards (which must also occur in the output files of the rule), analogous to the specification of input files. + +Note that Snakemake distinguishes file based environments from named ones as follows: +if the given specification ends on ``.yaml`` or ``.yml``, Snakemake assumes it to be a path to an environment definition file; otherwise, it assumes the given specification +to be the name of an existing environment. + + +.. 
_singularity: + -------------------------- Running jobs in containers -------------------------- diff --git a/snakemake/deployment/conda.py b/snakemake/deployment/conda.py index 869bb70ef..8d4120c51 100644 --- a/snakemake/deployment/conda.py +++ b/snakemake/deployment/conda.py @@ -65,16 +65,27 @@ def __init__( container_img=None, cleanup=None, ): + self._conda = Conda(container_img) + self.file = None self.name = None self.post_deploy_file = None + self.pin_file = None if env_file is not None: self.file = infer_source_file(env_file) + deploy_file = Path(self.file.get_path_or_uri()).with_suffix( ".post-deploy.sh" ) if deploy_file.exists(): self.post_deploy_file = infer_source_file(deploy_file) + + pin_file = Path(self.file.get_path_or_uri()).with_suffix( + f".{self._conda.platform}.pin.txt" + ) + + if pin_file.exists(): + self.pin_file = infer_source_file(pin_file) if env_name is not None: assert env_file is None, "bug: both env_file and env_name specified" self.name = env_name @@ -92,6 +103,7 @@ def __init__( self._content_hash = None self._content = None self._content_deploy = None + self._content_pin = None self._path = None self._archive_file = None self._cleanup = cleanup @@ -116,6 +128,12 @@ def _get_content_deploy(self): return self.workflow.sourcecache.open(self.post_deploy_file, "rb").read() return None + def _get_content_pin(self): + self.check_is_file_based() + if self.pin_file: + return self.workflow.sourcecache.open(self.pin_file, "rb").read() + return None + @property def _env_archive_dir(self): return self.workflow.persistence.conda_env_archive_path @@ -136,6 +154,12 @@ def content_deploy(self): self._content_deploy = self._get_content_deploy() return self._content_deploy + @property + def content_pin(self): + if self._content_pin is None: + self._content_pin = self._get_content_pin() + return self._content_pin + @property def hash(self): if self._hash is None: @@ -309,9 +333,8 @@ def execute_deployment_script(self, env_file, deploy_file): 
os.path.relpath(path=deploy_file, start=os.getcwd()) ) ) - conda = Conda(self._container_img) shell.check_output( - conda.shellcmd(self.address, "sh {}".format(deploy_file)), + self._conda.shellcmd(self.address, "sh {}".format(deploy_file)), stderr=subprocess.STDOUT, ) @@ -324,8 +347,10 @@ def create(self, dryrun=False): # Read env file and create hash. env_file = self.file deploy_file = None + pin_file = None tmp_env_file = None tmp_deploy_file = None + tmp_pin_file = None if not isinstance(env_file, LocalSourceFile) or isinstance( env_file, LocalGitFile @@ -343,9 +368,15 @@ def create(self, dryrun=False): tmp.write(self.content_deploy) deploy_file = tmp.name tmp_deploy_file = tmp.name + if self.pin_file: + with tempfile.NamedTemporaryFile(delete=False, suffix="pin.txt") as tmp: + tmp.write(self.content_pin) + pin_file = tmp.name + tmp_pin_file = tmp.name else: env_file = env_file.get_path_or_uri() deploy_file = self.post_deploy_file + pin_file = self.pin_file env_path = self.address @@ -402,11 +433,9 @@ def create(self, dryrun=False): ) ) return env_path - conda = Conda(self._container_img) logger.info( "Creating conda environment {}...".format(self.file.simplify_path()) ) - # Check if env archive exists. Use that if present. env_archive = self.archive_file try: # Touch "start" flag file @@ -414,6 +443,7 @@ def create(self, dryrun=False): with open(os.path.join(env_path, "env_setup_start"), "a") as f: pass + # Check if env archive exists. Use that if present. if os.path.exists(env_archive): logger.info("Installing archived conda packages.") pkg_list = os.path.join(env_archive, "packages.txt") @@ -450,46 +480,78 @@ def create(self, dryrun=False): out = shell.check_output( cmd, stderr=subprocess.STDOUT, universal_newlines=True ) - else: - # Copy env file to env_path (because they can be on - # different volumes and singularity should only mount one). - # In addition, this allows to immediately see what an - # environment in .snakemake/conda contains. 
- target_env_file = env_path + ".yaml" - shutil.copy(env_file, target_env_file) - - logger.info("Downloading and installing remote packages.") - cmd = " ".join( - [ - self.frontend, - "env", - "create", - "--quiet", - '--file "{}"'.format(target_env_file), - '--prefix "{}"'.format(env_path), - ] - ) - if self._container_img: - cmd = singularity.shellcmd( - self._container_img.path, - cmd, - args=self._singularity_args, - envvars=self.get_singularity_envvars(), - ) - out = shell.check_output( - cmd, stderr=subprocess.STDOUT, universal_newlines=True - ) - # cleanup if requested - if self._cleanup is CondaCleanupMode.tarballs: - logger.info("Cleaning up conda package tarballs.") - shell.check_output("conda clean -y --tarballs") - elif self._cleanup is CondaCleanupMode.cache: - logger.info( - "Cleaning up conda package tarballs and package cache." + def create_env(env_file, filetype="yaml"): + # Copy env file to env_path (because they can be on + # different volumes and singularity should only mount one). + # In addition, this allows to immediately see what an + # environment in .snakemake/conda contains. 
+ target_env_file = env_path + f".{filetype}" + shutil.copy(env_file, target_env_file) + + logger.info("Downloading and installing remote packages.") + + subcommand = [self.frontend] + yes_flag = ["--yes"] + if filetype == "yaml": + subcommand.append("env") + yes_flag = [] + + cmd = " ".join( + subcommand + + [ + "create", + "--quiet", + '--file "{}"'.format(target_env_file), + '--prefix "{}"'.format(env_path), + ] + + yes_flag + ) + if self._container_img: + cmd = singularity.shellcmd( + self._container_img.path, + cmd, + args=self._singularity_args, + envvars=self.get_singularity_envvars(), + ) + out = shell.check_output( + cmd, stderr=subprocess.STDOUT, universal_newlines=True ) - shell.check_output("conda clean -y --tarballs --packages") + + # cleanup if requested + if self._cleanup is CondaCleanupMode.tarballs: + logger.info("Cleaning up conda package tarballs.") + shell.check_output("conda clean -y --tarballs") + elif self._cleanup is CondaCleanupMode.cache: + logger.info( + "Cleaning up conda package tarballs and package cache." + ) + shell.check_output("conda clean -y --tarballs --packages") + return out + + if pin_file is not None: + try: + logger.info( + f"Using pinnings from {self.pin_file.get_path_or_uri()}." + ) + out = create_env(pin_file, filetype="pin.txt") + except subprocess.CalledProcessError as e: + # remove potential partially installed environment + shutil.rmtree(env_path, ignore_errors=True) + advice = "" + if isinstance(self.file, LocalSourceFile): + advice = ( + " If that works, make sure to update the pin file with " + f"'snakedeploy pin-conda-env {self.file.get_path_or_uri()}'." + ) + logger.warning( + f"Failed to install conda environment from pin file ({self.pin_file.get_path_or_uri()}). 
" + f"Trying regular environment definition file.{advice}" + ) + out = create_env(env_file, filetype="yaml") + else: + out = create_env(env_file, filetype="yaml") # Execute post-deplay script if present if deploy_file: @@ -562,15 +624,19 @@ def __init__(self, container_img=None, prefix_path=None): container_img = container_img.path self.container_img = container_img + info = json.loads( + shell.check_output( + self._get_cmd("conda info --json"), universal_newlines=True + ) + ) + if prefix_path is None or container_img is not None: - self.prefix_path = json.loads( - shell.check_output( - self._get_cmd("conda info --json"), universal_newlines=True - ) - )["conda_prefix"] + self.prefix_path = info["conda_prefix"] else: self.prefix_path = prefix_path + self.platform = info["platform"] + # check conda installation self._check() diff --git a/tests/test_conda_pin_file/Snakefile b/tests/test_conda_pin_file/Snakefile new file mode 100644 index 000000000..098048136 --- /dev/null +++ b/tests/test_conda_pin_file/Snakefile @@ -0,0 +1,10 @@ +shell.executable("bash") + +rule a: + output: + "test.txt" + conda: + "test-env.yaml" + shell: + "rg --version | head -n1 | cut -f2 -d' ' > {output}" + diff --git a/tests/test_conda_pin_file/expected-results/test.txt b/tests/test_conda_pin_file/expected-results/test.txt new file mode 100644 index 000000000..02161ca86 --- /dev/null +++ b/tests/test_conda_pin_file/expected-results/test.txt @@ -0,0 +1 @@ +13.0.0 diff --git a/tests/test_conda_pin_file/test-env.linux-64.pin.txt b/tests/test_conda_pin_file/test-env.linux-64.pin.txt new file mode 100644 index 000000000..551c1b3fd --- /dev/null +++ b/tests/test_conda_pin_file/test-env.linux-64.pin.txt @@ -0,0 +1,9 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-12.1.0-h8d9b700_16.tar.bz2#f013cf7749536ce43d82afbffdf499ab +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-12.1.0-h8d9b700_16.tar.bz2#4f05bc9844f7c101e6e147dab3c88d5c +https://conda.anaconda.org/conda-forge/linux-64/ripgrep-13.0.0-h2f28480_2.tar.bz2#15a0bf4a1260b0a08198eb683eb272fb diff --git a/tests/test_conda_pin_file/test-env.yaml b/tests/test_conda_pin_file/test-env.yaml new file mode 100644 index 000000000..2fa4c015d --- /dev/null +++ b/tests/test_conda_pin_file/test-env.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - ripgrep =12.0 diff --git a/tests/tests.py b/tests/tests.py index 45b56a62b..82ec9052b 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1646,6 +1646,11 @@ def test_retries(): run(dpath("test_retries")) +@skip_on_windows # the testcase only has a linux-64 pin file +def test_conda_pin_file(): + run(dpath("test_conda_pin_file"), use_conda=True) + + @skip_on_windows # sufficient to test this on linux def test_github_issue1618(): run(dpath("test_github_issue1618"), cores=5)