diff --git a/docs/project_info/faq.rst b/docs/project_info/faq.rst index 4d51c004b..acc9a5a12 100644 --- a/docs/project_info/faq.rst +++ b/docs/project_info/faq.rst @@ -575,6 +575,26 @@ temporary file ``huge_file.csv`` could be kept at the compute node. $ snakemake --shadow-prefix /scratch some_summary_statistics.txt --cluster ... +If you want the input files of your rule to be copied to the node-local scratch directory +instead of just using symbolic links, you can use ``copy-minimal`` in the ``shadow`` directive. +This is useful, for example, when benchmarking tools as a black box. + +.. code-block:: python + + rule: + input: + "input_file.txt" + output: + file = "output_file.txt", + benchmark = "benchmark_results.txt", + shadow: "copy-minimal" + shell: + """ + /usr/bin/time -v command "{input}" "{output.file}" > "{output.benchmark}" + """ + +When snakemake is executed as shown above, the shell command then only accesses node-local storage. + How do I access elements of input or output by a variable index? ---------------------------------------------------------------- diff --git a/docs/snakefiles/rules.rst b/docs/snakefiles/rules.rst index 7107cf774..bbd9001ac 100644 --- a/docs/snakefiles/rules.rst +++ b/docs/snakefiles/rules.rst @@ -995,11 +995,21 @@ Note that any flag that forces re-creation of files still also applies to files Shadow rules ------------ -Shadow rules result in each execution of the rule to be run in isolated temporary directories. This "shadow" directory contains symlinks to files and directories in the current workdir. This is useful for running programs that generate lots of unused files which you don't want to manually cleanup in your snakemake workflow. It can also be useful if you want to keep your workdir clean while the program executes, or simplify your workflow by not having to worry about unique filenames for all outputs of all rules. +Shadow rules cause each execution of the rule to run in an isolated temporary directory. 
+This "shadow" directory contains symlinks to files and directories in the current workdir. +This is useful for running programs that generate lots of unused files which you don't want to manually clean up in your snakemake workflow. +It can also be useful if you want to keep your workdir clean while the program executes, +or simplify your workflow by not having to worry about unique filenames for all outputs of all rules. -By setting ``shadow: "shallow"``, the top level files and directories are symlinked, so that any relative paths in a subdirectory will be real paths in the filesystem. The setting ``shadow: "full"`` fully shadows the entire subdirectory structure of the current workdir. The setting ``shadow: "minimal"`` only symlinks the inputs to the rule. Once the rule successfully executes, the output file will be moved if necessary to the real path as indicated by ``output``. +By setting ``shadow: "shallow"``, the top-level files and directories are symlinked, +so that any relative paths in a subdirectory will be real paths in the filesystem. +The setting ``shadow: "full"`` fully shadows the entire subdirectory structure of the current workdir. +The setting ``shadow: "minimal"`` only symlinks the inputs to the rule, +and ``shadow: "copy-minimal"`` copies the inputs instead of just creating symlinks. +Once the rule executes successfully, the output file is moved, if necessary, to the real path indicated by ``output``. -Typically, you will not need to modify your rule for compatibility with ``shadow``, unless you reference parent directories relative to your workdir in a rule. +Typically, you will not need to modify your rule for compatibility with ``shadow``, +unless you reference parent directories relative to your workdir in a rule. .. 
code-block:: python @@ -1009,7 +1019,11 @@ Typically, you will not need to modify your rule for compatibility with ``shadow shadow: "shallow" shell: "somecommand --other_outputs other.txt {input} {output}" -Shadow directories are stored one per rule execution in ``.snakemake/shadow/``, and are cleared on successful execution. Consider running with the ``--cleanup-shadow`` argument every now and then to remove any remaining shadow directories from aborted jobs. The base shadow directory can be changed with the ``--shadow-prefix`` command line argument. +Shadow directories are stored one per rule execution in ``.snakemake/shadow/``, +and are cleared on successful execution. +Consider running with the ``--cleanup-shadow`` argument every now and then +to remove any remaining shadow directories from aborted jobs. +The base shadow directory can be changed with the ``--shadow-prefix`` command line argument. Flag files ---------- diff --git a/snakemake/jobs.py b/snakemake/jobs.py index 11bdac430..eadd19787 100644 --- a/snakemake/jobs.py +++ b/snakemake/jobs.py @@ -8,6 +8,7 @@ import base64 import tempfile import json +import shutil from collections import defaultdict from itertools import chain, filterfalse @@ -743,7 +744,12 @@ def prepare(self): ) cwd = os.getcwd() - if self.rule.shadow_depth == "minimal": + # "minimal" creates symlinks only to the input files in the shadow directory + # "copy-minimal" creates copies instead + if ( + self.rule.shadow_depth == "minimal" + or self.rule.shadow_depth == "copy-minimal" + ): # Re-create the directory structure in the shadow directory for (f, d) in set( [ @@ -763,19 +769,26 @@ def prepare(self): else: raise RuleException( "The following file name references a parent directory relative to your workdir.\n" - 'This isn\'t supported for shadow: "minimal". Consider using an absolute path instead.\n{}'.format( - f + 'This isn\'t supported for shadow: "{}". 
Consider using an absolute path instead.\n{}'.format( + self.rule.shadow_depth, f ), rule=self.rule, ) - # Symlink the input files - for rel_path in set( - [os.path.relpath(f) for f in self.input if not os.path.isabs(f)] - ): - link = os.path.join(self.shadow_dir, rel_path) - original = os.path.relpath(rel_path, os.path.dirname(link)) - os.symlink(original, link) + # Symlink or copy the input files + if self.rule.shadow_depth == "copy-minimal": + for rel_path in set( + [os.path.relpath(f) for f in self.input if not os.path.isabs(f)] + ): + copy = os.path.join(self.shadow_dir, rel_path) + shutil.copy(rel_path, copy) + else: + for rel_path in set( + [os.path.relpath(f) for f in self.input if not os.path.isabs(f)] + ): + link = os.path.join(self.shadow_dir, rel_path) + original = os.path.relpath(rel_path, os.path.dirname(link)) + os.symlink(original, link) # Shallow simply symlink everything in the working directory. elif self.rule.shadow_depth == "shallow": diff --git a/snakemake/workflow.py b/snakemake/workflow.py index b5a33dfd5..eef12b430 100644 --- a/snakemake/workflow.py +++ b/snakemake/workflow.py @@ -1352,16 +1352,22 @@ def decorate(ruleinfo): ruleinfo.threads = int(ruleinfo.threads) rule.resources["_cores"] = ruleinfo.threads if ruleinfo.shadow_depth: - if ruleinfo.shadow_depth not in (True, "shallow", "full", "minimal"): + if ruleinfo.shadow_depth not in ( + True, + "shallow", + "full", + "minimal", + "copy-minimal", + ): raise RuleException( - "Shadow must either be 'minimal', 'shallow', 'full', " + "Shadow must either be 'minimal', 'copy-minimal', 'shallow', 'full', " "or True (equivalent to 'full')", rule=rule, ) if ruleinfo.shadow_depth is True: rule.shadow_depth = "full" logger.warning( - "Shadow is set to True in rule {} (equivalent to 'full'). It's encouraged to use the more explicit options 'minimal|shallow|full' instead.".format( + "Shadow is set to True in rule {} (equivalent to 'full'). 
It's encouraged to use the more explicit options 'minimal|copy-minimal|shallow|full' instead.".format( rule ) ) diff --git a/tests/test_shadow_copy/Snakefile b/tests/test_shadow_copy/Snakefile new file mode 100644 index 000000000..275d0c648 --- /dev/null +++ b/tests/test_shadow_copy/Snakefile @@ -0,0 +1,102 @@ +rule all: + input: "minimal_ok.out" + +# Set up files for testing of shadow: "copy-minimal" +rule minimal_setup: + input: "test.in" + output: + "subdir1/subdir2/test.in", + "subdir1/subdir2/test.symbolic.in" + shell: + """ + cp -P {input} {output[0]} + cd subdir1/subdir2 + ln -s test.in test.symbolic.in + """ + +# Tests relative inputs/outputs in the current dir +rule minimal_rel_curdir: + input: "test.in" + output: protected("simple_minimal.out") + benchmark: "benchmark_minimal.txt" + log: "minimal.log" + shadow: "copy-minimal" + shell: + """ + if [ ! -f "{input}" -o -L "{input}" ]; then + echo "Input file is symbolic link and not a copy" + exit 1 + fi + + touch minimal_junk.out + cat {input} >> {output} + echo simple_minimal >> {output} + echo minimal_log > {log} + """ + +# Tests relative inputs/outputs in subdirectories +rule minimal_rel_subdir: + input: "subdir1/subdir2/test.in" + output: "outdir/minimal.out" + shadow: "copy-minimal" + shell: + """ + if [ ! -f "{input}" -o -L "{input}" ]; then + echo "Input file is symbolic link and not a copy" + exit 1 + fi + + touch outdir/minimal_junk.out + touch {output} + """ + +# Tests symlinked input and symlinked output +rule minimal_symbolic: + input: "subdir1/subdir2/test.symbolic.in" + output: "outdir/minimal_real.out", + "outdir/minimal_symbolic.out" + shadow: "copy-minimal" + shell: + """ + if [ ! 
-f "{input}" -o -L "{input}" ]; then + echo "Input file is symbolic link and not a copy" + exit 1 + fi + + touch outdir/minimal_real.out + cd outdir + ln -s minimal_real.out minimal_symbolic.out + """ + +# Tests absolute input/output +rule minimal_absolute: + input: + os.path.join(os.getcwd(),"test.in") + output: os.path.join(os.getcwd(),"outdir/minimal_absolute.out") + shadow: "copy-minimal" + shell: + """ + if [ ! -f "{input}" -o -L "{input}" ]; then + echo "Input file is symbolic link and not a copy" + exit 1 + fi + + touch {output} + """ + +# Aggregates tests for shadow: "copy-minimal" +rule minimal_ok: + input: "simple_minimal.out", + "outdir/minimal.out", + "outdir/minimal_symbolic.out", + os.path.join(os.getcwd(),"outdir/minimal_absolute.out") + output: "minimal_ok.out" + shell: + """ + #test ! -w {input[0]} + test -f benchmark_minimal.txt + test -f minimal.log + test ! -f minimal_junk.out + test ! -f outdir/minimal_junk.out + touch {output} + """ diff --git a/tests/test_shadow_copy/expected-results/simple_minimal.out b/tests/test_shadow_copy/expected-results/simple_minimal.out new file mode 100644 index 000000000..71d11e4a6 --- /dev/null +++ b/tests/test_shadow_copy/expected-results/simple_minimal.out @@ -0,0 +1,2 @@ +in +simple_minimal diff --git a/tests/test_shadow_copy/test.in b/tests/test_shadow_copy/test.in new file mode 100644 index 000000000..4935e88d3 --- /dev/null +++ b/tests/test_shadow_copy/test.in @@ -0,0 +1 @@ +in diff --git a/tests/tests.py b/tests/tests.py index 666598441..1400f4d4a 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -324,6 +324,11 @@ def test_shadow(): run(dpath("test_shadow")) +@skip_on_windows +def test_shadow_copy(): + run(dpath("test_shadow_copy")) + + @skip_on_windows # Symbolic link privileges needed to work def test_shadow_prefix(): run(dpath("test_shadow_prefix"), shadow_prefix="shadowdir")