diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 00000000..f58e4c63 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: psf/black@stable diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml new file mode 100644 index 00000000..e535a0a5 --- /dev/null +++ b/.github/workflows/test-refgenie-cli.yml @@ -0,0 +1,140 @@ +name: Test refgenie CLI + +on: + push: + branches: [master, dev] + +jobs: + test_CLI: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.8] + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dev dependancies + run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi + + - name: Install package + run: python -m pip install . 
+ + - name: install macOS-specific dependancies + if: startsWith(matrix.os, 'macOS') + run: brew install md5sha1sum + + - name: create genomes dir + run: mkdir genomes + + - name: refgenie init + working-directory: ./genomes + run: refgenie init -c g.yaml; cat g.yaml + + - name: refgenie list + working-directory: ./genomes + run: refgenie list -c g.yaml + + - name: refgenie build fasta (parent asset) + run: | + refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json + ./tests/assert_in_file.sh genomes/g.yaml t7 0 + ./tests/assert_in_file.sh genomes/g.yaml 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 0 # this is a digest that should be produced from this FASTA file + + - name: refgenie build fasta_child (child asset) + run: | + refgenie build -c genomes/g.yaml t7/fasta_child --recipe tests/data/recipe_child.json + ./tests/assert_in_file.sh genomes/g.yaml fasta_child 0 + if [ -L `refgenie seek -c genomes/g.yaml t7/fasta_child` ]; then + echo "`refgenie seek -c genomes/g.yaml t7/fasta_child` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7/fasta_child` does not exist." + exit 1 + fi + if [ -d genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default ]; then + echo "'genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default' exists." + else + echo "Error: 'genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default' does not exist." 
+ exit 1 + fi + + - name: refgenie list + working-directory: ./genomes + run: refgenie list -c g.yaml + + - name: refgenie build fasta + run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json + + - name: refgenie set aliases + run: | + refgenie alias set -c genomes/g.yaml --aliases t7_new t7_new1 --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 + ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 + ./tests/assert_in_file.sh genomes/g.yaml t7_new1 0 + if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new/fasta` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta` does not exist." + exit 1 + fi + if [ -L `refgenie seek -c genomes/g.yaml t7_new1/fasta` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new1/fasta` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7_new1/fasta` does not exist." + exit 1 + fi + + - name: refgenie remove aliases + run: | + refgenie alias set -c genomes/g.yaml --aliases t7_another --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 + refgenie alias remove -c genomes/g.yaml --aliases t7_new t7_new1 t7 --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 + ./tests/assert_in_file.sh genomes/g.yaml t7_new 1 + ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 + ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 + if [ -L genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz ]; then + echo "'genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz' exists." + exit 1 + else + echo "Error: 'genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz' does not exist." 
+ fi + + - name: refgenie get aliases + run: | + refgenie alias get -c genomes/g.yaml + + - name: refgenie add asset + run: | + refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"recipe": "recipe_parent.json"}' + ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 + if [ -L `refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` ]; then + echo "`refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` exists." + else + echo "Error: `refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` does not exist." + exit 1 + fi + + - name: refgenie tag asset + run: | + refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag -f + ./tests/assert_in_file.sh genomes/g.yaml new_tag 0 + if [ -f `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` ]; then + echo "`refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` exists." + else + echo "Error: `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` does not exist." 
+ exit 1 + fi + + - name: refgenie id + run: | + ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta_child:new_tag` 0 + + - name: refgenie remove fasta_child + run: | + refgenie remove -c genomes/g.yaml t7_another/fasta_child -f + ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 + ./tests/assert_in_file.sh genomes/g.yaml 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list diff --git a/.gitignore b/.gitignore index 998222cf..c6ff5ded 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,6 @@ refgenie.egg-info/ docs_jupyter/refgenie.yaml docs_jupyter/rCRSd* docs_jupyter/hs38d1* + +# build dir +build/ diff --git a/MANIFEST.in b/MANIFEST.in index e6d1be79..3fa2f075 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements/* +include refgenie/schemas/* include README.md include LICENSE.txt include refgenie/refgenie.yaml diff --git a/README.md b/README.md index a00ae270..0e8a99dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -![Build package](https://github.com/refgenie/refgenie/workflows/Build%20package/badge.svg) +[![Build package](https://github.com/refgenie/refgenie/workflows/Build%20package/badge.svg)](https://github.com/refgenie/refgenie/actions?query=workflow%3A%22Build+package%22) +[![Test refgenie CLI](https://github.com/refgenie/refgenie/workflows/Test%20refgenie%20CLI/badge.svg)](https://github.com/refgenie/refgenie/actions?query=workflow%3A%22Test+refgenie+CLI%22) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/refgenie/README.html) Refgenie
diff --git a/docs/README.md b/docs/README.md index a8bad9c8..9b4a5960 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,6 +20,8 @@ Refgenie manages storage, access, and transfer of reference genome resources. It 4. **It includes a python API**. For tool developers, you use `rgc = refgenconf.RefGenConf("genomes.yaml")` to get a Python object with paths to any genome asset, *e.g.*, `rgc.seek("hg38", "kallisto_index")`. +5. **It strictly determines genomes compatibility**. Users refer to genomes with arbitrary aliases, like "hg38", but refgenie uses sequence-derived identifiers to verify genome identity with asset servers. + ## Quick example @@ -43,11 +45,16 @@ refgenie listr Response: ```console -Querying available assets from server: http://refgenomes.databio.org/v2/assets -Remote genomes: mouse_chrM2x, rCRSd -Remote assets: - mouse_chrM2x/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default - rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.chrom_sizes:test, fasta.fai:default, fasta.fai:test, fasta:default, fasta:test + Remote refgenie assets + Server URL: http://refgenomes.databio.org +┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ genome ┃ assets ┃ +┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ mouse_chrM2x │ fasta, bwa_index, bowtie2_index │ +│ hg38 │ fasta, bowtie2_index │ +│ rCRSd │ fasta, bowtie2_index │ +│ human_repeats │ fasta, hisat2_index, bwa_index │ +└─────────────────────┴──────────────────────────────────────────────┘ ``` Next, pull one: @@ -58,8 +65,13 @@ refgenie pull rCRSd/bowtie2_index Response: ```console -'rCRSd/bowtie2_index:default' archive size: 116.8KB -Downloading URL: http://staging.refgenomes.databio.org/v2/asset/rCRSd/bowtie2_index/archive ... 
+Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index +94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index:default ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 128.0/117.0 KB • 1.8 MB/s • 0:00:00 +Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz +Extracting asset tarball: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz +Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default +Created alias directories: + - /Users/mstolarczyk/Desktop/testing/refgenie/alias/rCRSd/bowtie2_index/default ``` See [further reading on downloading assets](pull.md). @@ -70,7 +82,7 @@ Refgenie assets are scripted, so if what you need is not available remotely, you ```console -refgenie build mygenome/bwa_index --fasta mygenome.fa.gz +refgenie build mygenome/bwa_index ``` See [further reading on building assets](build.md). diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index c3bb9364..5fca1068 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -29,12 +29,64 @@ h4 .content { # Package `refgenconf` Documentation +## Class `ConfigNotCompliantError` +The format of the config file does not match required version/standards + + +## Class `DownloadJsonError` +Non-OK response from a JSON download attempt + + +```python +def __init__(self, resp) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + +## Class `GenomeConfigFormatError` +Exception for invalid genome config file format. + + +```python +def __init__(self, msg) +``` + +Initialize self. See help(type(self)) for accurate signature. 
+ + + +## Class `MissingAssetError` +Error type for request of an unavailable genome asset. + + +## Class `MissingConfigDataError` +Missing required configuration instance items + + +## Class `MissingGenomeError` +Error type for request of unknown genome/assembly. + + +## Class `MissingRecipeError` +Error type for request of an unavailable recipe. + + +## Class `MissingSeekKeyError` +Error type for request of an unavailable asset seek key. + + +## Class `MissingTagError` +Error type for request of an unavailable asset tag. + + ## Class `RefGenConf` A sort of oracle of available reference genome assembly assets ```python -def __init__(self, filepath=None, entries=None, writable=False, wait_max=10) +def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False, genome_exact=False, schema_source=None) ``` Create the config instance by with a filepath or key-value pairs. @@ -43,7 +95,8 @@ Create the config instance by with a filepath or key-value pairs. - `filepath` (`str`): a path to the YAML file to read - `entries` (`Iterable[(str, object)] | Mapping[str, object]`): config filepath or collection of key-value pairs - `writable` (`bool`): whether to create the object with write capabilities -- `wait_max` (`int`): how long to wait for creating an object when the file that data will be read from is locked +- `wait_max` (`int`): how long to wait for creating an object when thefile that data will be read from is locked +- `skip_read_lock` (`bool`): whether the file should not be locked forreading when object is created in read only mode #### Raises: @@ -54,6 +107,35 @@ Create the config instance by with a filepath or key-value pairs. 
+```python +def add(self, path, genome, asset, tag=None, seek_keys=None, force=False) +``` + +Add an external asset to the config +#### Parameters: + +- `path` (`str`): a path to the asset to add; must exist and be relativeto the genome_folder +- `genome` (`str`): genome name +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `seek_keys` (`dict`): seek keys to add +- `force` (`bool`): whether to force existing asset overwrite + + + + +```python +def alias_dir(self) +``` + +Path to the genome alias directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def assets_str(self, offset_text=' ', asset_sep=', ', genome_assets_delim='/ ', genome=None, order=None) ``` @@ -106,21 +188,22 @@ the parent genome will be removed as well ```python -def cfg_tag_asset(self, genome, asset, tag, new_tag) +def cfg_tag_asset(self, genome, asset, tag, new_tag, force=False) ``` Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. -This method does not override the original asset entry in the RefGenConf object. It creates its copy and tags -it with the new_tag. -Additionally, if the retagged asset has any children their parent will be retagged as new_tag that was -introduced upon this method execution. +This method does not override the original asset entry in the +RefGenConf object. It creates its copy and tags it with the new_tag. +Additionally, if the retagged asset has any children their parent will + be retagged as new_tag that was introduced upon this method execution. 
#### Parameters: - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of particular asset of interest - `tag` (`str`): name of the tag that identifies the asset of interest - `new_tag` (`str`): name of particular the new tag +- `force` (`bool`): force any actions that require approval #### Returns: @@ -141,7 +224,8 @@ def chk_digest_update_child(self, genome, remote_asset_name, child_name, server_ Check local asset digest against the remote one and populate children of the asset with the provided asset:tag. -In case the local asset does not exist, the config is populated with the remote asset digest and children data +In case the local asset does not exist, the config is populated with the remote + asset digest and children data #### Parameters: - `genome` (`str`): name of the genome to check the asset digests for @@ -157,14 +241,45 @@ In case the local asset does not exist, the config is populated with the remote +```python +def compare(self, genome1, genome2, explain=False) +``` + +Check genomes compatibility level. Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata +#### Parameters: + +- `genome1` (`str`): name of the first genome to compare +- `genome2` (`str`): name of the second genome to compare +- `explain` (`bool`): whether the returned code explanation should be displayed + + +#### Returns: + +- `int`: compatibility code + + + + +```python +def data_dir(self) +``` + +Path to the genome data directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def file_path(self) ``` -Return the path to the config file or None if not set +Path to the genome configuration file #### Returns: -- `str | None`: path to the file the object will would to +- `str`: path to the genome configuration file @@ -190,6 +305,30 @@ Determine path to a particular asset for a particular genome. 
+```python +def genome_aliases(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genomeidentifiers + + + + +```python +def genome_aliases_table(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genomeidentifiers + + + + ```python def genomes_list(self, order=None) ``` @@ -219,6 +358,42 @@ Get as single string this configuration's reference genome assembly IDs. +```python +def get_asds_path(self, genome) +``` + +Get path to the Annotated Sequence Digests JSON file for a given genome. Note that the path and/or genome may not exist. +#### Parameters: + +- `genome` (`str`): genome name + + +#### Returns: + +- `str`: ASDs path + + + + +```python +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fa582ccad08>) +``` + +Get a rich.Table object representing assets available locally +#### Parameters: + +- `genomes` (`list[str]`): genomes to restrict the results with +- `server_url` (`str`): server URL to query for the remote genome data +- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset + + +#### Returns: + +- `rich.table.Table`: table of assets available locally + + + + ```python def get_default_tag(self, genome, asset, use_existing=True) ``` @@ -228,7 +403,7 @@ Determine the asset tag to use as default. 
The one indicated by the 'default_tag - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest -- `use_existing` (`bool`): whether the first tag in the config should be returned in case there is no defaulttag defined for an asset +- `use_existing` (`bool`): whether the first tag in the config should be returned in case there is no default tag defined for an asset #### Returns: @@ -238,6 +413,54 @@ Determine the asset tag to use as default. The one indicated by the 'default_tag +```python +def get_genome_alias(self, digest, fallback=False, all_aliases=False) +``` + +Get the human readable alias for a genome digest +#### Parameters: + +- `digest` (`str`): digest to find human-readable alias for +- `fallback` (`bool`): whether to return the query digest in case of failure +- `all_aliases` (`bool`): whether to return all aliases instead of just the first one + + +#### Returns: + +- `str | list[str]`: human-readable aliases + + +#### Raises: + +- `GenomeConfigFormatError`: if "genome_digests" section does not exist in the config +- `UndefinedAliasError`: if no alias has been defined for the requested digest + + + + +```python +def get_genome_alias_digest(self, alias, fallback=False) +``` + +Get the genome digest for a human-readable alias +#### Parameters: + +- `alias` (`str`): alias to find digest for +- `fallback` (`bool`): whether to return the query alias in case of failure and in case it is one of the digests + + +#### Returns: + +- `str`: genome digest + + +#### Raises: + +- `UndefinedAliasError`: if the specified alias has not been assigned to any digest + + + + ```python def get_genome_attributes(self, genome) ``` @@ -274,7 +497,7 @@ List locally available reference genome IDs and assets by ID. 
```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x1051476a8>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fa582cd4840>) ``` List genomes and assets available remotely. @@ -293,7 +516,27 @@ List genomes and assets available remotely. ```python -def getseq(self, genome, locus) +def get_symlink_paths(self, genome, asset=None, tag=None, all_aliases=False) +``` + +Get path to the alias directory for the selected genome-asset-tag +#### Parameters: + +- `genome` (`str`): reference genome ID +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `all_aliases` (`bool`): whether to return a collection of symboliclinks or just the first one from the alias list + + +#### Returns: + +- `dict`: + + + + +```python +def getseq(self, genome, locus, as_str=False) ``` Return the sequence found in a selected range and chromosome. Something like the refget protocol. @@ -301,6 +544,12 @@ Return the sequence found in a selected range and chromosome. Something like the - `genome` (`str`): name of the sequence identifier - `locus` (`str`): 1-10' +- `as_str` (`bool`): whether to convert the resurned object to stringand return just the sequence + + +#### Returns: + +- `str | pyfaidx.FastaRecord | pyfaidx.Sequence`: selected sequence @@ -347,6 +596,28 @@ Initialize genome configuration file on disk +```python +def initialize_genome(self, fasta_path, alias, fasta_unzipped=False, skip_alias_write=False) +``` + +Initialize a genome + +Create a JSON file with Annotated Sequence Digests (ASDs) +for the FASTA file in the genome directory. 
+#### Parameters: + +- `fasta_path` (`str`): path to a FASTA file to initialize genome with +- `alias` (`str`): alias to set for the genome +- `skip_alias_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `str, list[dict[]]`: human-readable name for the genome + + + + ```python def is_asset_complete(self, genome, asset, tag) ``` @@ -394,7 +665,7 @@ List types/names of assets that are available for one--or all--genomes. - `genome` (`str | NoneType`): reference genome assembly ID, optional;if omitted, the full mapping from genome to asset names - `order` (`function(str) -> object`): how to key genome IDs and assetnames for sort -- `include_tags` (`bool`): whether asset tags should be included in the returned dict +- `include_tags` (`bool`): whether asset tags should be included in thereturned dict #### Returns: @@ -423,10 +694,10 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x1051477b8>) +def listr(self, genome=None, order=None, get_url= at 0x7fa582cd4950>, as_str=False) ``` -List genomes and assets available remotely. +List genomes and assets available remotely on all servers the object subscribes to #### Parameters: - `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance @@ -436,7 +707,7 @@ List genomes and assets available remotely. 
#### Returns: -- `str, str`: text reps of remotely available genomes and assets +- `dict[OrderedDict[list]]`: remotely available genomes and assetskeyed by genome keyed by source server endpoint @@ -454,7 +725,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x105147a60>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fa582cd4bf8>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -465,6 +736,8 @@ Download and possibly unpack one or more assets for a given ref gen. - `tag` (`str`): name of particular tag to fetch - `unpack` (`bool`): whether to unpack a tarball - `force` (`bool | NoneType`): how to handle case in which asset pathalready exists; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt to replace existing file, and True to auto-replay Yes for existing asset replacement. +- `force_large` (`bool | NoneType`): how to handle case in large (> 5GB)asset is to be pulled; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt, and True to auto-replay Yes +- `size_cutoff` (`float`): maximum archive file size to download withno prompt - `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset - `build_signal_handler` (`function(str) -> function`): how to createa signal handler to use during the download; the single argument to this function factory is the download filepath @@ -528,6 +801,24 @@ Remove any relationship links associated with the selected asset +```python +def remove_genome_aliases(self, digest, aliases=None) +``` + +Remove alias for a specified genome digest. 
This method will remove the digest both from the genomes object and from the aliases mapping in the config +#### Parameters: + +- `digest` (`str`): genome digest to remove an alias for +- `aliases` (`list[str]`): a collection of aliases to remove for the genome. If not provided, all aliases for the digest will be removed + + +#### Returns: + +- `bool`: whether the removal has been performed + + + + ```python def run_plugins(self, hook) ``` @@ -541,7 +832,38 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x1051472f0>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fa582cd4378>) +``` + +Seek path to a specified genome-asset-tag alias +#### Parameters: + +- `genome_name` (`str`): name of a reference genome assembly of interest +- `asset_name` (`str`): name of the particular asset to fetch +- `tag_name` (`str`): name of the particular asset tag to fetch +- `seek_key` (`str`): name of the particular subasset to fetch +- `strict_exists` (`bool | NoneType`): how to handle case in which path doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). +- `check_exist` (`function(callable) -> bool`): how to check for asset/path existence +- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g. 
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `all_aliases` (`bool`): whether to return paths to all asset aliases orjust the one for the specified 'genome_name` argument + + +#### Returns: + +- `str`: path to the asset + + +#### Raises: + +- `TypeError`: if the existence check is not a one-arg function +- `refgenconf.MissingGenomeError`: if the named assembly isn't knownto this configuration instance +- `refgenconf.MissingAssetError`: if the names assembly is known tothis configuration instance, but the requested asset is unknown + + + + +```python +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fa582cd4488>) ``` Seek path to a specified genome-asset-tag @@ -553,7 +875,7 @@ Seek path to a specified genome-asset-tag - `seek_key` (`str`): name of the particular subasset to fetch - `strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). - `check_exist` (`function(callable) -> bool`): how to check forasset/path existence -- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `enclosing_dir` (`bool`): whether a path to the entire enclosingdirectory should be returned, e.g. 
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned #### Returns: @@ -571,7 +893,7 @@ Seek path to a specified genome-asset-tag ```python -def set_default_pointer(self, genome, asset, tag, force=False) +def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None) ``` Point to the selected tag by default @@ -580,7 +902,33 @@ Point to the selected tag by default - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest - `tag` (`str`): name of the particular asset tag to point to by default -- `force` (`bool`): whether the default tag change should be forced (even if it exists) +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. +- `force` (`bool`): whether the default tag change should beforced (even if it exists) + + + + +```python +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fa582cd4ea0>) +``` + +Assign a human-readable alias to a genome identifier. + +Genomes are identified by a unique identifier which is derived from the +FASTA file (part of fasta asset). This way we can ensure genome +provenance and compatibility with the server. This function maps a +human-readable identifier to make referring to the genomes easier. +#### Parameters: + +- `genome` (`str`): name of the genome to assign to an identifier +- `digest` (`str`): identifier to use +- `overwrite` (`bool`): whether all the previously set aliases should beremoved and just the current one stored +- `no_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `bool`: whether the alias has been established @@ -602,7 +950,7 @@ Otherwise the current one will be appended to. 
```python -def tag(self, genome, asset, tag, new_tag, files=True) +def tag(self, genome, asset, tag, new_tag, files=True, force=False) ``` Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. @@ -647,7 +995,7 @@ Remove URLs the list of genome_servers. ```python -def update_assets(self, genome, asset=None, data=None) +def update_assets(self, genome, asset=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset mapping is missing, it will be created @@ -655,6 +1003,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -666,13 +1015,14 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass ```python -def update_genomes(self, genome, data=None) +def update_genomes(self, genome, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome is missing, it will be added #### Parameters: - `genome` (`str`): genome to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. 
- `data` (`Mapping`): data to be added/updated @@ -694,7 +1044,7 @@ A convenience method which wraps the update assets and uses it to update the ass - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated - `data` (`list`): asset parents to be added/updated -- `children` (`bool`): a logical indicating whether the relationship to be added is 'children' +- `children` (`bool`): a logical indicating whether the relationship to be added is 'children' #### Returns: @@ -705,7 +1055,7 @@ A convenience method which wraps the update assets and uses it to update the ass ```python -def update_seek_keys(self, genome, asset, tag=None, keys=None) +def update_seek_keys(self, genome, asset, tag=None, keys=None, force_digest=None) ``` A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. @@ -714,6 +1064,7 @@ A convenience method which wraps the updated assets and uses it to update the se - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. The alias will not be converted to the digest, even if provided. - `keys` (`Mapping`): seek_keys to be added/updated @@ -725,7 +1076,7 @@ A convenience method which wraps the updated assets and uses it to update the se ```python -def update_tags(self, genome, asset=None, tag=None, data=None) +def update_tags(self, genome, asset=None, tag=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset-tag mapping is missing, it will be created @@ -734,6 +1085,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. 
The alias will not be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -773,43 +1125,49 @@ Write the contents to a file. If pre- and post-update plugins are defined, they #### Raises: -- `OSError`: when the object has been created in a read only mode or other process has locked the file +- `OSError`: when the object has been created in a read only mode or other process has locked the file - `TypeError`: when the filepath cannot be determined.This takes place only if YacAttMap initialized with a Mapping as an input, not read from file. - `OSError`: when the write is called on an object with no write capabilitiesor when writing to a file that is locked by a different object -## Class `GenomeConfigFormatError` -Exception for invalid genome config file format. +## Class `RefgenconfError` +Base exception type for this package + + +## Class `RemoteDigestMismatchError` +Remote digest of the parent asset does not match its local counterpart ```python -def __init__(self, msg) +def __init__(self, asset, local_digest, remote_digest) ``` Initialize self. See help(type(self)) for accurate signature. -## Class `MissingAssetError` -Error type for request of an unavailable genome asset. +## Class `UnboundEnvironmentVariablesError` +Use of environment variable that isn't bound to a value. -## Class `MissingConfigDataError` -Missing required configuration instance items +```python +def get_dir_digest(path, pm=None) +``` +Generate an MD5 digest that reflects just the contents of the files in the selected directory. +#### Parameters: -## Class `MissingGenomeError` -Error type for request of unknown genome/assembly. +- `path` (`str`): path to the directory to digest +- `pm` (`pypiper.PipelineManager`): a pipeline object, optional. The subprocess module will be used if not provided -## Class `RefgenconfError` -Base exception type for this package +#### Returns: + +- `str`: a digest, e.g. 
a3c46f201a3ce7831d85cf4a125aa334 -## Class `UnboundEnvironmentVariablesError` -Use of environment variable that isn't bound to a value. ```python @@ -830,7 +1188,27 @@ Get path to genome configuration file. +```python +def upgrade_config(target_version, filepath, force=False, get_json_url=<function <lambda> at 0x7fa582cca730>, link_fun=<function <lambda> at 0x7fa582cd8158>) +``` + +Upgrade the config to a selected target version. + +Convert the config file to target_version format, update file structure +inside genome_folder. Drop genomes for which genome_digest is not available +on any of the servers and do not have a fasta asset locally. +#### Parameters: + +- `target_version` (`str`): the version updated to +- `filepath` (`str`): path to config file +- `force` (`bool`): whether the upgrade should be confirmed upfront +- `get_json_url` (`function(str, str) -> str`): how to build URL from genome server URL base, genome, and asset +- `link_fun` (`callable`): function to use to link files, e.g. os.symlink or os.link + + + + -*Version Information: `refgenconf` v0.7.1-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file +*Version Information: `refgenconf` v0.10.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file diff --git a/docs/available_assets.md b/docs/available_assets.md index b102b285..0ff38fcc 100644 --- a/docs/available_assets.md +++ b/docs/available_assets.md @@ -45,6 +45,27 @@ refgenie build rCRS/fasta --files fasta=rCRS.fa.gz refgenie seek rCRS/fasta ``` +### blacklist + + required files: `--files blacklist=/path/to/blacklist_file` (*e.g.* [hg38-blacklist.v2.bed.gz](https://github.com/Boyle-Lab/Blacklist/tree/master/lists)) + required parameters: *none* + required asset: *none* + required software: *none* + +The `blacklist` asset represents regions that should be excluded from sequencing experiments. The ENCODE blacklist represents a comprehensive listing of these regions for several model organisms [^Amemiya2019]. 
+ +Example blacklist files: + +- [hg19 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg19-blacklist.v2.bed.gz) +- [hg38 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg38-blacklist.v2.bed.gz) +- [mm10 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/mm10-blacklist.v2.bed.gz) +- [dm6 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/dm6-blacklist.v2.bed.gz) + +``` +wget https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg38-blacklist.v2.bed.gz +refgenie build hg38/blacklist --files blacklist=hg38-blacklist.v2.bed.gz +``` + ### refgene_anno required files: `--files refgene=/path/to/refGene_file` (*e.g.* [refGene.txt.gz](http://varianttools.sourceforge.net/Annotation/RefGene)) @@ -267,3 +288,6 @@ The `feat_annotation` asset includes the following genomic feature annotations: ``` refgenie build test/feat_annotation ``` + + +[^Amemiya2019]: Amemiya HM, Kundaje A, Boyle AP. The ENCODE Blacklist: Identification of Problematic Regions of the Genome. *Sci Rep* 2019;9, 9354. doi:10.1038/s41598-019-45839-z \ No newline at end of file diff --git a/docs/changelog.md b/docs/changelog.md index 3cd3dacb..fd33082f 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,24 @@ # Changelog This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.10.0] - 2021-03-11 + +**After updating to this version your configuration file and genome assets will not be compatible with the software. 
Please refer to the [upgrade tutorial](config_upgrade_03_to_04.md) for instructions on how to migrate the config between versions.** + +## Changed + +- instead of using human-readable names as genome identifiers refgenie uses sequence-derived digests in the config +- asset data moved to `data` directory +- asset files are now named after genome digests +- refgenieserver API v3 is now used for remote assets retrieval +- improved visual interface in `list`, `listr` and `pull` subcommands + +## Added + +- `data` and `alias` directories in genome directory that are used to store asset and aliases data, respectively +- `refgenie alias` command for genome aliases management +- `refgenie upgrade` command for config format upgrades +- `refgenie compare` command for genome compatibility determination ## [0.9.3] - 2020-07-29 diff --git a/docs/compare.md b/docs/compare.md new file mode 100644 index 00000000..a3b9b8c5 --- /dev/null +++ b/docs/compare.md @@ -0,0 +1,38 @@ +# Genomes compatibility + +## Motivation + +Many genomic analyses require less stringent comparison: simple compatibility between assets that are not necessarily identical. +For example, if we wanted to annotate genomic regions based on aligned BAM file using feature annotations we would need the reads to share just the coordinate system (the sequence lengths and names). Importantly, it does not require sequence identity at all. In this case, we would like to confirm that the given `bowtie2_index` asset is at least compatible with the coordinate system of feature annotation file. + +To sum up, we need a more detailed comparison that may ignore sequences but allow to compare other aspects of the reference genome. + +## Solution + +Refgenie facilitates such fine-grained comparison of genomes via `refgenie compare` command. 
A useful "byproduct" of genome initialization described in [Use aliases how-to guide](aliases.md) is a JSON file with annotated sequence digests, that are required to compare FASTA file contents. + +## Usage + +In order to compare two initialized genomes one needs to issue the following command + +```console +refgenie compare hg38 hg38_plus +``` + +The result is a set of informative flags that determine the level of compatibility of the genomes, for example: + +```console +Flag: 2005 +Binary: 0b11111010101 + +CONTENT_ALL_A_IN_B +LENGTHS_ALL_A_IN_B +NAMES_ALL_A_IN_B +CONTENT_A_ORDER +CONTENT_B_ORDER +CONTENT_ANY_SHARED +LENGTHS_ANY_SHARED +NAMES_ANY_SHARED +``` + +Based on the output above we can conclude that genome `hg38_plus` is a superset of `hg38`. diff --git a/docs/custom_assets.md b/docs/custom_assets.md index 89254bf3..66821147 100644 --- a/docs/custom_assets.md +++ b/docs/custom_assets.md @@ -18,14 +18,28 @@ If you want to, you could also just edit the config file by hand by adding this ```yaml genomes: - hg38: + 511fb1178275e7d529560d53b949dba40815f195623bce8e: + aliases: + - hg38 + - human assets: manual_anno: - asset_path: manual_anno - asset_description: Manual annotations from project X - seek_keys: - anno1: anno1.txt - anno2: anno2.txt + tags: + default: + asset_path: manual_anno + asset_description: Manual annotations from project X + seek_keys: + anno1: anno1.txt + anno2: anno2.txt + default_tag: default ``` -Now, you can access this asset with `refgenie` the same way you do all other assets... `refgenie list` will include it, `refgenie seek -g gh38 -a manual_anno` will retrieve the path, and from within python, `RefGenConf.get_asset('hg38', 'manual_anno')` will also work. The advantage of doing this is that it lets you include *all* your genome-associated resources, including manual ones, within the same framework. 
+The refgenie-compatible genome digest can be determined this way: + +```python +from refgenconf.seqcol import SeqColClient +digest, _ = SeqColClient({}).load_fasta("path/hg38.fa") +# digest -> 511fb1178275e7d529560d53b949dba40815f195623bce8e +``` + +Now, you can access this asset with `refgenie` the same way you do all other assets... `refgenie list` will include it, `refgenie seek hg38/manual_anno` will retrieve the path, and from within python, `RefGenConf.seek('hg38', 'manual_anno')` will also work. The advantage of doing this is that it lets you include *all* your genome-associated resources, including manual ones, within the same framework. diff --git a/docs/genome_config.md b/docs/genome_config.md index 18dc6e4a..a8634f3f 100644 --- a/docs/genome_config.md +++ b/docs/genome_config.md @@ -2,26 +2,51 @@ Refgenie will read and write a genome configuration file in yaml format. In general, you shouldn't need to mess with the config file. You create one with `refgenie init -c genome_config.yaml`, then you add assets using either `refgenie pull` or `refgenie build`. You can also add your own custom assets with `refgenie add`, which is explained in [using custom assets](custom_assets.md). Refgenie will use the config file to remember what assets are available and where they are. -But here's how the config file works, in case for some reason you do need to edit some things by hand. Here's an example file to get you started: +## Upgrading the configuration file + +Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. Starting with `refgenie v0.10.0` we introduced the `refgenie upgrade` command, which will automatically detect the current configuration file version and will: 1) reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. 
To reformat the config, run from the command line: + +``` +refgenie upgrade --target-version 0.4 -c /path/to/old/cfg.yml +``` + +Or from within Python: + +```python +from refgenconf import upgrade_config +upgrade_config(target_version="0.4", filepath="/path/to/old/cfg.yml") +``` + +Below is a CHANGELOG describing all changes introduced in configuration file versions. + +## Genome configuration file example + +Here's how the config file works, in case you do need to edit some things by hand. Here's an example file which manages fasta and bowtie2_index assets for hg38 genome. Keep in mind that some of the keys in this config are optional: ```yaml -genome_folder: /path/to/active/genomes -genome_servers: http://refgenomes.databio.org -genome_archive: /path/to/archived/genomes +config_version: 0.4 +genome_folder: /path/to/genomes +genome_archive_folder: /path/to/genome_archives +genome_archive_config: /path/to/genome_archive/config.yaml +remote_url_base: http://awspds.refgenie.databio.org/ +genome_servers: ['http://refgenomes.databio.org'] genomes: - rCRSd: + 511fb1178275e7d529560d53b949dba40815f195623bce8e: + aliases: + - hg38 + - human assets: fasta: tags: default: seek_keys: - fasta: rCRSd.fa - fai: rCRSd.fa.fai - chrom_sizes: rCRSd.chrom.sizes + fasta: 511fb1178275e7d529560d53b949dba40815f195623bce8e.fa + fai: 511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai + chrom_sizes: 511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes asset_parents: [] asset_path: fasta asset_digest: a3c46f201a3ce7831d85cf4a125aa334 - asset_children: ['bowtie2_index:default'] + asset_children: ['511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index:default'] default_tag: default bowtie2_index: asset_description: Genome index for bowtie, produced with bowtie-build @@ -31,28 +56,17 @@ genomes: seek_keys: bowtie2_index: . 
asset_digest: 0f9217d44264ae2188fcf8e469d2bb38 - asset_parents: ['fasta:default'] - default_tag: default - hg38: - assets: - gencode_gtf: - asset_description: GTF annotation asset which provides access to all annotated transcripts which make up an Ensembl gene set. - tags: - default: - asset_path: gencode_gtf - seek_keys: - gencode_gtf: hg38.gtf.gz - asset_digest: 4cd4eac99cdfdeb8ff72d8f8a4a16f9f - asset_parents: [] + asset_parents: ['511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:default'] default_tag: default ``` ## Details of config attributes +### Required - **genome_folder**: Path to parent folder refgenie-managed assets. -- **genome_servers**: URL to a refgenieserver instances. -- **genome_archive**: (optional; used by refgenieserver) Path to folder where asset archives will be stored. +- **genome_servers**: URLs of refgenieserver instances. - **genomes**: A list of genomes, each genome has a list of assets. Any relative paths in the asset `path` attributes are considered relative to the genome folder in the config file (or the file itself if not folder path is specified), with the genome name as an intervening path component, e.g. `folder/mm10/indexed_bowtie2`. +- **aliases**: A list of arbitrary strings that can be used to refer to the namespace - **tags**: A collection of tags defined for the asset - **default_tag**: A pointer to the tag that is currently defined as the default one - **asset_parents**: A list of assets that were used to build the asset in question @@ -60,17 +74,47 @@ genomes: - **seek_keys**: A mapping of names and paths of the specific files within an asset - **asset_path**: A path to the asset folder, relative to the genome config file - **asset_digest**: A digest of the asset directory (more precisely, of the file contents within one) used to address the asset provenance issues when the assets are pulled or built. 
+### Optional (used by refgenieserver) +- **genome_archive_folder**: Path to folder where asset archives will be stored. +- **genome_archive_config**: Path to the file where the asset archives config will be stored. +- **remote_url_base**: Path/URL to prepend to served asset archives, if non-local ones are to be served -Note that for a fully operational config just `genome_folder`, `genome_server`, `genomes`, `assets`, `tags` and `seek_keys` keys are required. For genomes that are managed by `refgenie` (that is, they were built or pulled with `refgenie`), these asset attributes will be automatically populated. You can edit them and refgenie will respect your edits (unless you re-build or re-pull the asset, which will overwrite those fields). You can also add your own assets and `refgenie` won't touch them. For more info, see [using custom assets](custom_assets.md). -## Genome config versions -### v0.2 -Up to version `0.4.4`, refgenie used a config file version that lacked the `assets` level in the hierarchy (so, assets were listed directly under the genome). Starting with version `0.5.0`, we moved the assets down a layer to accommodate other genome-level attributes we intend to use in the future (like a description, checksums, other provenance information). Earlier refgenie config files will need to be updated. +# Config file changelog + +## [0.4] - Unreleased; refgenie v0.10.0 + +### Config format changes + +- use sequence-derived unique genome identifiers instead of genome names everywhere +- add `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily + +### File tree structure changes + +- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name +- move all the contents from the refgenie directory to a new `data` directory +- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. 
The contents of the child directories are symbolic links to the asset files in the `data` directory + +## [0.3] - 2019-10-21; refgenie v0.7.0 + +### Config format changes + +- Added seek keys, tags, asset digests, default tag pointer, asset description. + + +## [0.2] - 2019-07-11; refgenie v0.5.0 + +### Config format changes + +- Added `config_version` entry +- Added the `assets` level in the config hierarchy. +- We moved the assets down a layer to accommodate other genome-level attributes we intend to use in the future (like a description, checksums, other provenance information). Earlier refgenie config files will need to be updated. + +## [0.1] - 2019-05-10; refgenie v0.3.0 + +- Initial version of the config file with the initial refgenie release. -### v0.3 -Upt to version `0.6.0`, refgenie used the config v0.2. Currently, it uses v0.3, where we introduced: seek keys, tags, asset digests, default tag pointer, asset description - - + \ No newline at end of file diff --git a/docs/refgenconf.md b/docs/refgenconf.md index bc90a2ea..2d346b93 100644 --- a/docs/refgenconf.md +++ b/docs/refgenconf.md @@ -38,6 +38,6 @@ bt2idx = rgc.seek(genome, "bowtie2_index") # run bowtie2... ``` -This enables you to write python software that will work on any computing environment without having to worry about passing around brittle environment-specific file paths. See [this tutorial](/jupyter_docs/refgenconf_usage) for more comprehensive example of how to work with `refgenconf` as a tool developer. +This enables you to write python software that will work on any computing environment without having to worry about passing around brittle environment-specific file paths. See [this tutorial](/refgenconf_usage) for more comprehensive example of how to work with `refgenconf` as a tool developer. See the complete [refgenconf python API](/autodoc_build/refgenconf) for more details. 
diff --git a/docs/refgenieserver.md b/docs/refgenieserver.md index 83204a42..72cae24c 100644 --- a/docs/refgenieserver.md +++ b/docs/refgenieserver.md @@ -18,3 +18,17 @@ docker run --rm -d -p 80:80 \ ``` Mount your archived genomes folder to `/genomes` in the container, and you're essentially good to go. + +### References + +We have scripted the process of building and archiving the assets to serve with `refgenieserver`. The process usually includes the following steps: + +1. Download raw input files for assets (FASTA files, GTF files etc.) +2. Build assets with refgenie build in a local refgenie instance +3. Archive assets with refgenieserver archive +4. Deploy the server (run `refgenieserver serve` on a cluster or locally) + +Check out these GitHub repositories for more details and all the required metadata: + +- [`refgenie/refgenomes.databio.org`](https://github.com/refgenie/refgenomes.databio.org) (primary refgenie assets server instance) +- [`refgenie/rg.databio.org`](https://github.com/refgenie/rg.databio.org) (development refgenie assets server instance) diff --git a/docs/servers.md b/docs/servers.md new file mode 100644 index 00000000..dafc3327 --- /dev/null +++ b/docs/servers.md @@ -0,0 +1,7 @@ +# Servers + +Here are some servers. Let us know if you're running your own refgenieserver instance and would like to be added to this list. + +- [Primary server](http://refgenomes.databio.org) +- [Dev server](http://rg.databio.org) +- [Plant references server](http://plantref.databio.org) diff --git a/docs/upgrade-config.md b/docs/upgrade-config.md new file mode 100644 index 00000000..0b256ccf --- /dev/null +++ b/docs/upgrade-config.md @@ -0,0 +1,38 @@ +# Refgenie configuration file upgrades + +Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. 
+ +Starting with the refgenie transition 0.9.3 -> 0.10.0 (configuration file versions: 0.3 -> 0.4) we introduced the `refgenie upgrade` functionality, which will take care of all the required reformatting. Running `refgenie upgrade` will automatically detect the current configuration file version and will: 1) reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. + +Below we describe the changes introduced in each configuration file version and how to upgrade: + +## Configuration file v0.4 (introduced: refgenie v0.10.0) + +### How to upgrade + +To reformat the config run from the command line: + +``` +refgenie upgrade --target-version 0.4 -c /path/to/old/cfg.yml +``` + +Or from within Python: + +```python +from refgenconf import upgrade_config +upgrade_config(target_version="0.4", filepath="/path/to/old/cfg.yml") +``` + +### Config format changes + +- use sequence-derived unique genome identifiers instead of genome names everywhere +- add `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily + +### File tree structure changes + +- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name +- move all the contents from the refgenie directory to a new `data` directory +- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. 
The contents of the child directories are symbolic links to the asset files in the `data` directory + + + diff --git a/docs/usage.md b/docs/usage.md index c70861ac..d93d1f5c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,15 +2,15 @@ ## `refgenie --help` ```console -version: 0.9.3 +version: 0.10.0 | refgenconf 0.10.0 usage: refgenie [-h] [--version] [--silent] [--verbosity V] [--logdev] - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe} + {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} ... refgenie - reference genome asset manager positional arguments: - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe} + {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} init Initialize a genome configuration. list List available local assets. listr List available remote assets. @@ -24,6 +24,9 @@ positional arguments: id Return the asset digest. subscribe Add a refgenieserver URL to the config. unsubscribe Remove a refgenieserver URL from the config. + alias Interact with aliases. + compare Compare two genomes. + upgrade Upgrade config. This will alter the files on disk. optional arguments: -h, --help show this help message and exit @@ -37,311 +40,342 @@ https://refgenie.databio.org ## `refgenie init --help` ```console -usage: refgenie init [-h] -c C [-s GENOME_SERVER [GENOME_SERVER ...]] +usage: refgenie init [-h] -c C [--skip-read-lock] [-s GENOME_SERVER [GENOME_SERVER ...]] [-f GENOME_FOLDER] [-a GENOME_ARCHIVE_FOLDER] - [-b GENOME_ARCHIVE_CONFIG] [-u REMOTE_URL_BASE] - [-j SETTINGS_JSON] + [-b GENOME_ARCHIVE_CONFIG] [-u REMOTE_URL_BASE] [-j SETTINGS_JSON] Initialize a genome configuration. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. 
+ -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - URL(s) to use for the genome_servers attribute in - config file. Default: http://refgenomes.databio.org. + URL(s) to use for the genome_servers attribute in + config file. Default: + http://refgenomes.databio.org. -f GENOME_FOLDER, --genome-folder GENOME_FOLDER - Absolute path to parent folder refgenie-managed - assets. + Absolute path to parent folder refgenie-managed + assets. -a GENOME_ARCHIVE_FOLDER, --genome-archive-folder GENOME_ARCHIVE_FOLDER - Absolute path to parent archive folder refgenie- - managed assets; used by refgenieserver. + Absolute path to parent archive folder refgenie- + managed assets; used by refgenieserver. -b GENOME_ARCHIVE_CONFIG, --genome-archive-config GENOME_ARCHIVE_CONFIG - Absolute path to desired archive config file; used by - refgenieserver. + Absolute path to desired archive config file; used + by refgenieserver. -u REMOTE_URL_BASE, --remote-url-base REMOTE_URL_BASE - URL to use as an alternative, remote archive location; - used by refgenieserver. + URL to use as an alternative, remote archive + location; used by refgenieserver. -j SETTINGS_JSON, --settings-json SETTINGS_JSON - Absolute path to a JSON file with the key value pairs - to inialize the configuration file with. Overwritten - by itemized specifications. + Absolute path to a JSON file with the key value + pairs to inialize the configuration file with. + Overwritten by itemized specifications. ``` ## `refgenie list --help` ```console -usage: refgenie list [-h] [-c C] [-g [GENOME [GENOME ...]]] +usage: refgenie list [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] [-r] List available local assets. 
optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g [GENOME [GENOME ...]], --genome [GENOME [GENOME ...]] - Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. + -r, --recipes List available recipes. ``` ## `refgenie listr --help` ```console -usage: refgenie listr [-h] [-c C] [-g [GENOME [GENOME ...]]] +usage: refgenie listr [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] List available remote assets. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g [GENOME [GENOME ...]], --genome [GENOME [GENOME ...]] - Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. ``` ## `refgenie pull --help` ```console -usage: refgenie pull [-h] [-c C] [-g G] [--no-overwrite | --force-overwrite] - [--no-large | --pull-large] [--size-cutoff S] [-b] +usage: refgenie pull [-h] [-c C] [--skip-read-lock] [-g G] + [--no-overwrite | --force-overwrite] [--no-large | --pull-large] + [--size-cutoff S] [-b] asset-registry-paths [asset-registry-paths ...] Download assets. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). 
+ asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. Prompt handling: These flags configure the pull prompt responses. - --no-overwrite Do not overwrite if asset exists. - --force-overwrite Overwrite if asset exists. - --no-large Do not pull archives over 5GB. - --pull-large Pull any archive, regardless of its size. - --size-cutoff S Maximum archive file size to download with no - confirmation required (in GB, default: 10) - -b, --batch Use batch mode: pull large archives, do no overwrite + --no-overwrite Do not overwrite if asset exists. + --force-overwrite Overwrite if asset exists. + --no-large Do not pull archives over 5GB. + --pull-large Pull any archive, regardless of its size. 
+ --size-cutoff S Maximum archive file size to download with no confirmation + required (in GB, default: 10) + -b, --batch Use batch mode: pull large archives, do no overwrite ``` ## `refgenie build --help` ```console -usage: refgenie build [-h] [-c C] [-R] [-C CONFIG_FILE] [-N] +usage: refgenie build [-h] [-c C] [--skip-read-lock] [-R] [-C CONFIG_FILE] [-N] [--tag-description TAG_DESCRIPTION] [--genome-description GENOME_DESCRIPTION] [-d] - [--assets ASSETS [ASSETS ...]] - [--files FILES [FILES ...]] - [--params PARAMS [PARAMS ...]] - [-v VOLUMES [VOLUMES ...]] [-o OUTFOLDER] [-q] - [-r RECIPE] [-g G] + [--assets ASSETS [ASSETS ...]] [--files FILES [FILES ...]] + [--params PARAMS [PARAMS ...]] [-v VOLUMES [VOLUMES ...]] + [-o OUTFOLDER] [-q] [-r RECIPE] [-g G] asset-registry-paths [asset-registry-paths ...] Build genome assets. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify + assets (e.g. hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -R, --recover Overwrite locks to recover from previous failed run - -C CONFIG_FILE, --config CONFIG_FILE - Pipeline configuration file (YAML). Relative paths are - with respect to the pipeline script. - -N, --new-start Overwrite all results to start a fresh run - --tag-description TAG_DESCRIPTION - Add tag level description (e.g. built with version - 0.3.2). + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. 
+ --skip-read-lock Whether the config file should not be locked for + reading + -R, --recover Overwrite locks to recover from previous failed + run + -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths + are with respect to the pipeline script. + -N, --new-start Overwrite all results to start a fresh run + --tag-description TAG_DESCRIPTION Add tag level description (e.g. built with version + 0.3.2). --genome-description GENOME_DESCRIPTION - Add genome level description (e.g. The mouse - mitochondrial genome, released in Dec 2013). - -d, --docker Run all commands in the refgenie docker container. - --assets ASSETS [ASSETS ...] - Override the default genome, asset and tag of the - parents (e.g. fasta=hg38/fasta:default - gtf=mm10/gencode_gtf:default). - --files FILES [FILES ...] - Provide paths to the required files (e.g. - fasta=/path/to/file.fa.gz). - --params PARAMS [PARAMS ...] - Provide required parameter values (e.g. - param1=value1). + Add genome level description (e.g. The mouse + mitochondrial genome, released in Dec 2013). + -d, --docker Run all commands in the refgenie docker container. + --assets ASSETS [ASSETS ...] Override the default genome, asset and tag of the + parents (e.g. fasta=hg38/fasta:default + gtf=mm10/gencode_gtf:default). + --files FILES [FILES ...] Provide paths to the required files (e.g. + fasta=/path/to/file.fa.gz). + --params PARAMS [PARAMS ...] Provide required parameter values (e.g. + param1=value1). -v VOLUMES [VOLUMES ...], --volumes VOLUMES [VOLUMES ...] - If using docker, also mount these folders as volumes. - -o OUTFOLDER, --outfolder OUTFOLDER - Override the default path to genomes folder, which is - the genome_folder attribute in the genome - configuration file. - -q, --requirements Show the build requirements for the specified asset - and exit. - -r RECIPE, --recipe RECIPE - Provide a recipe to use. - -g G, --genome G Reference assembly ID, e.g. mm10. 
+ If using docker, also mount these folders as + volumes. + -o OUTFOLDER, --outfolder OUTFOLDER Override the default path to genomes folder, which + is the genome_folder attribute in the genome + configuration file. + -q, --requirements Show the build requirements for the specified + asset and exit. + -r RECIPE, --recipe RECIPE Provide a recipe to use. + -g G, --genome G Reference assembly ID, e.g. mm10. ``` ## `refgenie seek --help` ```console -usage: refgenie seek [-h] [-c C] [-g G] [-e] +usage: refgenie seek [-h] [-c C] [--skip-read-lock] [-g G] [-e] asset-registry-paths [asset-registry-paths ...] Get the path to a local asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag or - hg38/fasta.fai:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag or hg38/fasta.fai:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -e, --check-exists Whether the returned asset path should be checked for - existence on disk. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -e, --check-exists Whether the returned asset path should be checked for existence + on disk. ``` ## `refgenie add --help` ```console -usage: refgenie add [-h] [-c C] [-g G] [-f] -p P [-s S] +usage: refgenie add [-h] [-c C] [--skip-read-lock] [-g G] [-f] -p P [-s S] asset-registry-paths [asset-registry-paths ...] Add local asset to the config file. 
positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -f, --force Do not prompt before action, approve upfront. - -p P, --path P Relative local path to asset. - -s S, --seek-keys S String representation of a JSON object with seek_keys, - e.g. '{"seek_key1": "file.txt"}') + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. + -p P, --path P Relative local path to asset. + -s S, --seek-keys S String representation of a JSON object with seek_keys, e.g. + '{"seek_key1": "file.txt"}' ``` ## `refgenie remove --help` ```console -usage: refgenie remove [-h] [-c C] [-g G] [-f] +usage: refgenie remove [-h] [-c C] [--skip-read-lock] [-g G] [-f] [-a] asset-registry-paths [asset-registry-paths ...] Remove a local asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. 
- -f, --force Do not prompt before action, approve upfront. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. + -a, --aliases Remove the genome alias if last asset for that genome is + removed. ``` ## `refgenie getseq --help` ```console -usage: refgenie getseq [-h] [-c C] -g G -l LOCUS +usage: refgenie getseq [-h] [-c C] [--skip-read-lock] -g G -l LOCUS Get sequences from a genome. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -l LOCUS, --locus LOCUS - Coordinates of desired sequence; e.g. - 'chr1:50000-50200'. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -l LOCUS, --locus LOCUS Coordinates of desired sequence; e.g. 'chr1:50000-50200'. ``` ## `refgenie tag --help` ```console -usage: refgenie tag [-h] [-c C] [-g G] (-t TAG | -d) +usage: refgenie tag [-h] [-c C] [--skip-read-lock] [-g G] [-f] (-t TAG | -d) asset-registry-paths [asset-registry-paths ...] Tag an asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). 
optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -t TAG, --tag TAG Tag to assign to an asset. - -d, --default Set the selected asset tag as the default one. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. + -t TAG, --tag TAG Tag to assign to an asset. + -d, --default Set the selected asset tag as the default one. ``` ## `refgenie id --help` ```console -usage: refgenie id [-h] [-c C] [-g G] +usage: refgenie id [-h] [-c C] [--skip-read-lock] [-g G] asset-registry-paths [asset-registry-paths ...] Return the asset digest. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. 
``` ## `refgenie subscribe --help` ```console -usage: refgenie subscribe [-h] [-c C] [-r] -s GENOME_SERVER +usage: refgenie subscribe [-h] [-c C] [--skip-read-lock] [-r] -s GENOME_SERVER [GENOME_SERVER ...] Add a refgenieserver URL to the config. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -r, --reset Overwrite the current list of server URLs. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -r, --reset Overwrite the current list of server URLs. -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - One or more URLs to add to the genome_servers - attribute in config file. + One or more URLs to add to the genome_servers + attribute in config file. ``` ## `refgenie unsubscribe --help` ```console -usage: refgenie unsubscribe [-h] [-c C] -s GENOME_SERVER [GENOME_SERVER ...] +usage: refgenie unsubscribe [-h] [-c C] [--skip-read-lock] -s GENOME_SERVER + [GENOME_SERVER ...] Remove a refgenieserver URL from the config. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - One or more URLs to remove from the genome_servers - attribute in config file. + One or more URLs to remove from the genome_servers + attribute in config file. 
+``` + +## `refgenie alias --help` +```console +usage: refgenie alias [-h] {remove,set,get} ... + +Interact with aliases. + +positional arguments: + {remove,set,get} + remove Remove aliases. + set Set aliases. + get Get aliases. + +optional arguments: + -h, --help show this help message and exit +``` + +## `refgenie upgrade --help` +```console +usage: refgenie upgrade [-h] [-c C] [--skip-read-lock] -v V [-f] + +Upgrade config. This will alter the files on disk. + +optional arguments: + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -v V, --target-version V Target config version for the upgrade. + -f, --force Do not prompt before action, approve upfront. ``` diff --git a/docs_jupyter/aliases.ipynb b/docs_jupyter/aliases.ipynb new file mode 100644 index 00000000..bfecaafa --- /dev/null +++ b/docs_jupyter/aliases.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Genome aliases\n", + "\n", + "TLDR; **The genome alias system in refgenie allows users to refer to assets with arbitrary strings managed with `refgenie alias` command.**\n", + "\n", + "## Motivation\n", + "\n", + "Many systems rely on human-readable identifiers of genomes, such as \"hg38\". However, two users may refer to different things with the same identifier, such as the many slight variations of the *hg38* genome assembly. Such identifier mismatches lead to compatibility issues that incur the wrath of bioinformaticians everywhere. A step toward solving this problem is to use unique identifiers that unambiguously identify a particular assembly, such as those provided by the NCBI Assembly database; however, this approach relies on a central authority, and therefore can not apply to custom genomes or custom assets. 
Besides, human-readable identifiers persist because there's something simple and satisfying about referring to a genome or piece of data with a simple string that makes some sense and is easy to remember, like *hg38*. \n", + "\n", + "## Solutions\n", + "\n", + "### Sequence-derived identifiers\n", + "\n", + "Refgenie’s approach extends the [refget](http://samtools.github.io/hts-specs/refget.html) algorithm by GA4GH, introduced in 2019, to *collections of annotated sequences*. This means that the unique sequence-derived genome identifier calculated by refgenie captures not only sequence content, but also related metadata like sequence names and length. So, instead of referring to the human genome as, e.g., \"hg38\", refgenie unambiguously identifies it as `58de7f33a36ccd9d6e3b1b3afe6b9f37cd5b2867bbfb929a`. \n", + "\n", + "#### Genome namespace initialization\n", + "\n", + "The genome digest is calculated based on a FASTA file once the genome namespace is first created. This can happen when the `fasta` asset is pulled or built.\n", + "\n", + "To start, initialize an empty refgenie configuration file from the shell and subscribe to the desired asset server:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized genome configuration file: /Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml\n", + "Created directories:\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/data\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias\n" + ] + } + ], + "source": [ + "export REFGENIE=$(pwd)/refgenie.yaml\n", + "refgenie init -c $REFGENIE -s http://rg.databio.org" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's pull a `fasta` asset, which is one way to initialize a genome:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type":
"stream", + "text": [ + "Compatible refgenieserver instances: ['http://rg.databio.org']\n", + "No local digest for genome alias: rCRSd\n", + "Setting 'rCRSd' identity with server: http://rg.databio.org/v3/genomes/genome_digest/rCRSd\n", + "Determined server digest for local genome alias (rCRSd): 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", + "Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta\n", + "\u001b[2K94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta:default \u001b[35m100.0%\u001b[0m • • • …\u001b[0m • • ? • … …\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/fasta__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/fasta__default.tgz\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta' set to: default\n", + "Initializing genome: rCRSd\n", + "Loaded AnnotatedSequenceDigestList (1 sequences)\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/fasta --force" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following the `refgenie pull` command logs we notice that multiple steps happened:\n", + "1. refgenie used the human-readable genome name from the `refgenie pull` call (`rCRSd`) to query the server for any digest associated with it\n", + "2. 
refgenie set the digest it got back from the server as the genome identifier and set the human-readable genome name as an alias\n", + "3. refgenie used the genome identifier (not the user-specified name) to query the server for the `fasta` asset\n", + "\n", + "From now on, the unique sequence-derived genome identifier will be used to query asset servers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compatible refgenieserver instances: ['http://rg.databio.org']\n", + "Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index\n", + "\u001b[2K94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index:defau… \u001b[35m100.…\u001b[0m • •\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/bowtie2_index --force" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Genome aliases\n", + "\n", + "To make the user's life easier, the genome alias system in refgenie allows setting arbitrary genome aliases that can then be used to refer to a genome.
Users can interact with genome aliases using the `refgenie alias` command:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: refgenie alias [-h] {remove,set,get} ...\n", + "\n", + "Interact with aliases.\n", + "\n", + "positional arguments:\n", + " {remove,set,get}\n", + " remove Remove aliases.\n", + " set Set aliases.\n", + " get Get aliases.\n", + "\n", + "optional arguments:\n", + " -h, --help show this help message and exit\n" + ] + } + ], + "source": [ + "refgenie alias --help" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set aliases\n", + "\n", + "To set an alias \"mito\" for the genome identified by digest `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4`, one needs to issue the command below:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: mito)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mito\n" + ] + } + ], + "source": [ + "refgenie alias set --aliases mito --digest 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get aliases\n", + "\n", + "To see the entire alias collection managed by refgenie, one needs to issue the command below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Genome aliases \u001b[0m\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1malias \u001b[0m\u001b[1m \u001b[0m┃\n", +
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", + "│ 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 │ rCRSd, mito │\n", + "└──────────────────────────────────────────────────┴─────────────┘\n" + ] + } + ], + "source": [ + "refgenie alias get" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `alias` and `data` directories\n", + "\n", + "Refgenie stores asset data in two directories: `alias` and `data`. The `data` directory consists of the actual asset files, which are built or pulled from asset servers. The files in this directory are named using the digests, which helps refgenie to unambiguously identify genomes. The `alias` directory holds symbolic links to asset data in the `data` directory. **This way users do not need to be aware of the digest-named files at all and there is no waste of disk space due to symbolic links**. \n", + "\n", + "Here's a general view of the contents of both directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34malias\u001b[00m\n", + "├── \u001b[01;34mmito\u001b[00m\n", + "│   ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + "│   │   └── \u001b[01;34mdefault\u001b[00m\n", + "│   └── \u001b[01;34mfasta\u001b[00m\n", + "│   └── \u001b[01;34mdefault\u001b[00m\n", + "└── \u001b[01;34mrCRSd\u001b[00m\n", + " ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + " │   └── \u001b[01;34mdefault\u001b[00m\n", + " └── \u001b[01;34mfasta\u001b[00m\n", + " └── \u001b[01;34mdefault\u001b[00m\n", + "\n", + "10 directories\n", + "\u001b[01;34mdata\u001b[00m\n", + "└── \u001b[01;34m94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\u001b[00m\n", + " ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + " │   └── \u001b[01;34mdefault\u001b[00m\n", + " └── \u001b[01;34mfasta\u001b[00m\n", + " └── \u001b[01;34mdefault\u001b[00m\n", + "\n", + "5 directories\n" + ] + } + ], + "source": [ + "tree alias -d
\n", + "tree data -d " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the `alias` directory holds *both* of the defined aliases. Let's take a closer look at one of them" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34malias/rCRSd/fasta\u001b[00m\n", + "└── \u001b[01;34mdefault\u001b[00m\n", + " ├── \u001b[01;36mrCRSd.chrom.sizes\u001b[00m -> ../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.chrom.sizes\n", + " ├── \u001b[01;36mrCRSd.fa\u001b[00m -> ../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa\n", + " └── \u001b[01;36mrCRSd.fa.fai\u001b[00m -> ../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.fai\n", + "\n", + "1 directory, 3 files\n" + ] + } + ], + "source": [ + "tree alias/rCRSd/fasta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This explicitly shows that the files inside `alias/rCRSd/fasta/default` are in fact symbolic links that point to the actual asset files in `data` directory." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + "file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_jupyter/config_upgrade_03_to_04.ipynb b/docs_jupyter/config_upgrade_03_to_04.ipynb new file mode 100644 index 00000000..b0b9abab --- /dev/null +++ b/docs_jupyter/config_upgrade_03_to_04.ipynb @@ -0,0 +1,629 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Configuration file upgrade demonstration \n", + "\n", + "In the following tutorial we will present the process of upgrading the refgenie configuration file and asset files from version **0.3** to version **0.4**.\n", + "\n", + "First, let's install the refgenie and refgenconf Python packages that support version 0.3 of refgenie configuration file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working environment setup\n", + "\n", + "Let's install the legacy refgenconf and refgenie Python packages" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting refgenconf==0.9.3\n", + " Using cached https://files.pythonhosted.org/packages/52/c3/6aed361205272e30cd3570ca1c33feae6ad977ad32ddff8e509752046272/refgenconf-0.9.3-py3-none-any.whl\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.9.3) (2.21.0)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.12.12.dev0)\n", + "Requirement already satisfied: tqdm>=4.38.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (4.47.0)\n", + "Requirement 
already satisfied: pyyaml in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.9.3) (5.1)\n", + "Requirement already satisfied: yacman>=0.6.9 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.7.0)\n", + "Requirement already satisfied: pyfaidx in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.5.9.1)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (1.24.1)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (2019.3.9)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (3.0.4)\n", + "Requirement already satisfied: ubiquerg>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from attmap>=0.12.5->refgenconf==0.9.3) (0.6.1)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.6.9->refgenconf==0.9.3) (0.9)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx->refgenconf==0.9.3) (41.0.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx->refgenconf==0.9.3) (1.12.0)\n", + "Installing collected packages: refgenconf\n", + " Found existing installation: refgenconf 0.10.0.dev0\n", + " Uninstalling refgenconf-0.10.0.dev0:\n", + " Successfully uninstalled 
refgenconf-0.10.0.dev0\n", + "Successfully installed refgenconf-0.9.3\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Collecting refgenie==0.9.3\n", + " Using cached https://files.pythonhosted.org/packages/af/52/c1e1bc63b3543f591ebdf44caccfaab3c730708256d926b9f4b1c34d1865/refgenie-0.9.3-py3-none-any.whl\n", + "Requirement already satisfied: pyfaidx>=0.5.5.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.5.9.1)\n", + "Requirement already satisfied: refgenconf>=0.9.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.9.3)\n", + "Requirement already satisfied: logmuse>=0.2.6 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.2.6)\n", + "Requirement already satisfied: piper>=0.12.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.12.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.9.3) (1.12.0)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.9.3) (41.0.1)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (0.12.12.dev0)\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (2.21.0)\n", + "Requirement already satisfied: tqdm>=4.38.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) 
(4.47.0)\n", + "Requirement already satisfied: pyyaml in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (5.1)\n", + "Requirement already satisfied: yacman>=0.6.9 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (0.7.0)\n", + "Requirement already satisfied: ubiquerg>=0.4.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.9.3) (0.6.1)\n", + "Requirement already satisfied: psutil in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from piper>=0.12.1->refgenie==0.9.3) (5.6.1)\n", + "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.9.3) (1.0.3)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (1.24.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (2019.3.9)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (2.8)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.6.9->refgenconf>=0.9.1->refgenie==0.9.3) (0.9)\n", + "Requirement already satisfied: pytz>=2017.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (2018.9)\n", + "Requirement already satisfied: 
python-dateutil>=2.6.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (2.8.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (1.17.3)\n", + "Installing collected packages: refgenie\n", + " Found existing installation: refgenie 0.10.0.dev0\n", + " Uninstalling refgenie-0.10.0.dev0:\n", + " Successfully uninstalled refgenie-0.10.0.dev0\n", + "Successfully installed refgenie-0.9.3\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "pip install refgenconf==0.9.3\n", + "pip install refgenie==0.9.3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's set up a directory that we will use for the config file and refgenie assets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "export WORKDIR=~/Desktop/testing/refgenie/upgrade_test\n", + "rm -r $WORKDIR # remove first just to make sure the directory does not exist\n", + "mkdir -p $WORKDIR\n", + "cd $WORKDIR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's set `$REFGENIE` environment variable to point refgenie to the configuration file location and initialize it" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized genome configuration file: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/g.yml\n" + ] + } + ], + "source": [ + "export REFGENIE=$WORKDIR/g.yml\n", + "refgenie init -c $REFGENIE -s http://rg.databio.org:82/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we subscribe to a test 
instance of refgenieserver that supports both the old and new refgenie clients. This is because it exposes different API versions that these clients use: `v2` (refgenie v0.9.3) and `v3` (refgenie v0.10.0-dev)\n", + "\n", + "## Pull/build test assets\n", + "\n", + "Next, let's retrieve a couple of assets. As mentioned above, the `v2` API is used to retrieve the assets." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading URL: http://rg.databio.org:82/v2/asset/rCRSd/fasta/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/fasta__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/fasta/default\n", + "Default tag for 'rCRSd/fasta' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/human_repeats/fasta/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/fasta__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/fasta/default\n", + "Default tag for 'human_repeats/fasta' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/rCRSd/bowtie2_index/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/bowtie2_index__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/bowtie2_index/default\n", + "Default tag for 'rCRSd/bowtie2_index' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/human_repeats/bwa_index/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/bwa_index__default.tgz\n", + "Extracting asset tarball and saving to: 
/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/bwa_index/default\n", + "Default tag for 'human_repeats/bwa_index' set to: default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/fasta human_repeats/fasta rCRSd/bowtie2_index human_repeats/bwa_index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's download a small FASTA file and build a fasta asset for an arbitrary genome, which is not available at `http://rg.databio.org:82/`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-10-12 17:39:25-- http://big.databio.org/refgenie_raw/files.human_alu.fasta.fasta\n", + "Resolving big.databio.org (big.databio.org)... 128.143.245.182, 128.143.245.181\n", + "Connecting to big.databio.org (big.databio.org)|128.143.245.182|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 501 [application/octet-stream]\n", + "Saving to: ‘human_alu.fa.gz’\n", + "\n", + "human_alu.fa.gz 100%[===================>] 501 --.-KB/s in 0s \n", + "\n", + "2020-10-12 17:39:25 (1.19 MB/s) - ‘human_alu.fa.gz’ saved [501/501]\n", + "\n" + ] + } + ], + "source": [ + "wget -O human_alu.fa.gz http://big.databio.org/refgenie_raw/files.human_alu.fasta.fasta\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using 'default' as the default tag for 'human_alu/fasta'\n", + "Building 'human_alu/fasta:default' using 'fasta' recipe\n", + "Saving outputs to:\n", + "- content: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu\n", + "- logs: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build\n", + "### Pipeline run code and environment:\n", + "\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build 
human_alu/fasta --files fasta=human_alu.fa.gz`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test\n", + "* Outfolder: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/\n", + "* Pipeline started at: (10-12 17:39:27) elapsed: 0.0 _TIME_\n", + "\n", + "### Version log:\n", + "\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", + "* Pypiper version: 0.12.1\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", + "* Pipeline version: None\n", + "\n", + "### Arguments passed to pipeline:\n", + "\n", + "* `asset_registry_paths`: `['human_alu/fasta']`\n", + "* `assets`: `None`\n", + "* `command`: `build`\n", + "* `config_file`: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/refgenie.yaml`\n", + "* `docker`: `False`\n", + "* `files`: `[['fasta=human_alu.fa.gz']]`\n", + "* `genome`: `None`\n", + "* `genome_config`: `None`\n", + "* `genome_description`: `None`\n", + "* `logdev`: `False`\n", + "* `new_start`: `False`\n", + "* `outfolder`: `/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test`\n", + "* `params`: `None`\n", + "* `recipe`: `None`\n", + "* `recover`: `False`\n", + "* `requirements`: `False`\n", + "* `silent`: `False`\n", + "* `tag_description`: `None`\n", + "* `verbosity`: `None`\n", + "* `volumes`: `None`\n", + "\n", + "----------------------------------------\n", + "\n", + "Target to produce: `/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/human_alu_fasta__default.flag` \n", + "\n", + "> `cp human_alu.fa.gz /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.gz` (70063)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70063)\n",
+      "Warning: couldn't add memory use for process: 70063\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70063;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.001GB\n", + "\n", + "\n", + "> `gzip -df /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.gz` (70064)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70064)\n",
+      "Warning: couldn't add memory use for process: 70064\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70064;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", + "\n", + "\n", + "> `samtools faidx /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa` (70065)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70065)\n",
+      "Warning: couldn't add memory use for process: 70065\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70065;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.0GB\n", + "\n", + "\n", + "> `cut -f 1,2 /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.fai > /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.chrom.sizes` (70066)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70066)\n",
+      "Warning: couldn't add memory use for process: 70066\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70066;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", + "\n", + "> `touch /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/human_alu_fasta__default.flag` (70068)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70068)\n",
+      "Warning: couldn't add memory use for process: 70068\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70068;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", + "\n", + "> `cd /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", + "Asset digest: 9e8fa06e6125f89be4fb974879cb91a6\n", + "Default tag for 'human_alu/fasta' set to: default\n", + "\n", + "### Pipeline completed. Epilogue\n", + "* Elapsed time (this run): 0:00:00\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.001 GB\n", + "* Pipeline completed time: 2020-10-12 17:39:27\n", + "Computing initial genome digest...\n", + "Initializing genome...\n", + "Finished building 'fasta' asset\n" + ] + } + ], + "source": [ + "refgenie build human_alu/fasta --files fasta=human_alu.fa.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the asset inventory" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Server subscriptions: http://rg.databio.org:82\n", + "Local genomes: human_alu, human_repeats, rCRSd\n", + "Local recipes: bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\n", + "Local assets:\n", + " human_alu/ fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n", + " human_repeats/ bwa_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n", + " rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n" + ] + } + ], + "source": [ + 
"refgenie list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, assets for all three genomes are available." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade refgenie software\n", + "\n", + "Now, let's upgrade to refgenie==0.10.0-dev, which introduces the concept of sequence-derived genome identifiers to uniquely identify genomes." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/refgenie/refgenconf.git@dev_config_upgrade\n", + " Cloning https://github.com/refgenie/refgenconf.git (to revision dev_config_upgrade) to /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-kxmw8i6n\n", + " Running command git clone -q https://github.com/refgenie/refgenconf.git /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-kxmw8i6n\n", + " Running command git checkout -b dev_config_upgrade --track origin/dev_config_upgrade\n", + " Switched to a new branch 'dev_config_upgrade'\n", + " Branch 'dev_config_upgrade' set up to track remote branch 'dev_config_upgrade' from 'origin'.\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.12.12.dev0)\n", + "Requirement already satisfied: pyyaml in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (5.1)\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (2.21.0)\n", + "Requirement already satisfied: yacman>=0.7.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.7.0)\n", + "Requirement already satisfied: future in 
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.17.1)\n", + "Requirement already satisfied: jsonschema in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (3.0.1)\n", + "Requirement already satisfied: rich in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (3.3.0)\n", + "Requirement already satisfied: ubiquerg>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from attmap>=0.12.5->refgenconf==0.10.0.dev0) (0.6.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (2019.3.9)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (2.8)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (1.24.1)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.7.0->refgenconf==0.10.0.dev0) (0.9)\n", + "Requirement already satisfied: attrs>=17.4.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (19.1.0)\n", + "Requirement already satisfied: pyrsistent>=0.14.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (0.14.11)\n", + "Requirement already satisfied: six>=1.11.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) 
(1.12.0)\n", + "Requirement already satisfied: setuptools in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (41.0.1)\n", + "Requirement already satisfied: colorama<0.5.0,>=0.4.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from rich->refgenconf==0.10.0.dev0) (0.4.1)\n", + "Requirement already satisfied: pprintpp<0.5.0,>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.4.0)\n", + "Requirement already satisfied: typing-extensions<4.0.0,>=3.7.4 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (3.7.4.2)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (2.6.1)\n", + "Requirement already satisfied: dataclasses<0.8,>=0.7; python_version >= \"3.6\" and python_version < \"3.7\" in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.7)\n", + "Requirement already satisfied: commonmark<0.10.0,>=0.9.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.9.1)\n", + "Building wheels for collected packages: refgenconf\n", + " Building wheel for refgenconf (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for refgenconf: filename=refgenconf-0.10.0.dev0-cp36-none-any.whl size=64959 sha256=37191046ce6136b2bd777b1aa274a2d6a5ffb508af7e4969ac0ae97c1682b1f5\n", + " Stored in directory: /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-ephem-wheel-cache-516dw93w/wheels/a8/b1/82/f79eaabaad4cf5c64fb4914e06dd04726c5c226785974aee4e\n", + "Successfully built refgenconf\n", + "Installing collected packages: refgenconf\n", + " Found existing installation: refgenconf 0.9.3\n", + " Uninstalling refgenconf-0.9.3:\n", + " Successfully uninstalled refgenconf-0.9.3\n", + "Successfully installed refgenconf-0.10.0.dev0\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Collecting git+https://github.com/refgenie/refgenie.git@dev_config_upgrade\n", + " Cloning https://github.com/refgenie/refgenie.git (to revision dev_config_upgrade) to /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-3i4zdr4w\n", + " Running command git clone -q https://github.com/refgenie/refgenie.git /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-3i4zdr4w\n", + " Running command git checkout -b dev_config_upgrade --track origin/dev_config_upgrade\n", + " Switched to a new branch 'dev_config_upgrade'\n", + " Branch 'dev_config_upgrade' set up to track remote branch 'dev_config_upgrade' from 'origin'.\n", + "Requirement already satisfied: logmuse>=0.2.6 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.10.0.dev0) (0.2.6)\n", + "Requirement already satisfied: piper>=0.12.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.10.0.dev0) (0.12.1)\n", + "Requirement already satisfied: pyfaidx>=0.5.5.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from 
refgenie==0.10.0.dev0) (0.5.9.1)\n", + "Requirement already satisfied: ubiquerg>=0.4.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.6.1)\n", + "Requirement already satisfied: yacman in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.7.0)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.12.12.dev0)\n", + "Requirement already satisfied: psutil in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (5.6.1)\n", + "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (1.0.3)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.10.0.dev0) (41.0.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.10.0.dev0) (1.12.0)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman->piper>=0.12.1->refgenie==0.10.0.dev0) (0.9)\n", + "Requirement already satisfied: pyyaml>=3.13 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from yacman->piper>=0.12.1->refgenie==0.10.0.dev0) (5.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (2018.9)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: python-dateutil>=2.6.1 in 
/Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (2.8.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (1.17.3)\n", + "Building wheels for collected packages: refgenie\n", + " Building wheel for refgenie (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for refgenie: filename=refgenie-0.10.0.dev0-cp36-none-any.whl size=29266 sha256=d78485a0207036ddd91c36eb66b1973bdb3588aaff925d165d5e5aed483f968c\n", + " Stored in directory: /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-ephem-wheel-cache-wmsjgl78/wheels/07/12/55/f50538357799dd2938a702a2f9e8b84a849975e61b0c59e7a0\n", + "Successfully built refgenie\n", + "Installing collected packages: refgenie\n", + " Found existing installation: refgenie 0.9.3\n", + " Uninstalling refgenie-0.9.3:\n", + " Successfully uninstalled refgenie-0.9.3\n", + "Successfully installed refgenie-0.10.0.dev0\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "pip install git+https://github.com/refgenie/refgenconf.git@dev_config_upgrade\n", + "pip install git+https://github.com/refgenie/refgenie.git@dev_config_upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\n" + ] + } + ], + "source": [ + "refgenie --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of refgenie commands fails since the config is incompatible:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "Traceback (most recent call last):\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie\", line 10, in \n", + " sys.exit(main())\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/refgenie.py\", line 821, in main\n", + " skip_read_lock=skip_read_lock)\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenconf/refgenconf.py\", line 110, in __init__\n", + " raise ConfigNotCompliantError(msg)\n", + "refgenconf.exceptions.ConfigNotCompliantError: This genome config (v0.3) is not compliant with v0.4 standards. \n", + "To use current refgenconf, please use upgrade_config function to upgrade, ordowngrade refgenconf: 'pip install \"refgenconf>=0.7.0,<0.10.0\"'. \n", + "If refgenie is installed, you can use 'refgenie upgrade --target-version 0.4'; For config format documentation please see http://refgenie.databio.org/en/latest/genome_config/\n" + ] + }, + { + "ename": "", + "evalue": "1", + "output_type": "error", + "traceback": [] + } + ], + "source": [ + "refgenie list " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade refgenie configuration file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's upgrade the config to v0.4, just as the error message suggests. We will use `--force` option to run the command in batch mode." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upgrading v0.3 config file format to v0.4\n", + "Retrieved rCRSd digest from the server (511fb1178275e7d529560d53b949dba40815f195623bce8e)\n", + "Retrieved human_repeats digest from the server (ebf26d2f064462bea7029e6b4d2298967d7435bff82ed224)\n", + "Genome digest for human_alu is not available on any of the servers. 
Generating the digest from a local fasta file\n", + "Loaded AnnotatedSequenceDigestList (8 sequences)\n", + "Creating 'data' and 'alias' directories in '/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test'.\n", + "Copying assets to 'data' and creating alias symlinks in 'alias'. Genomes that the digest could not be determined for 'will be ignored.\n", + "Removing genome assets that have been copied to 'data' directory.\n" + ] + } + ], + "source": [ + "refgenie upgrade --force --target-version 0.4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The upgrade succeeded for all the assets that were previously managed by refgenie, regardless of whether the sequence-derived genome identifiers were available on the server. For the ones that were not (`human_alu` genome), refgenie calculated the digest from the locally available FASTA file using the same algorithm that has been used to generate digests for the genomes on the server." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Local refgenie assets \u001b[0m\n", + "\u001b[3m Server subscriptions: \u001b[0m\n", + "\u001b[3m http://rg.databio.org:82 \u001b[0m\n", + "┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ rCRSd │ fasta, bowtie2_index │\n", + "│ human_repeats │ fasta, bwa_index │\n", + "│ human_alu │ fasta │\n", + "└───────────────┴──────────────────────┘\n" + ] + } + ], + "source": [ + "refgenie list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + 
"file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/docs_jupyter/refgenconf_usage.ipynb b/docs_jupyter/refgenconf_usage.ipynb index 2e00149f..fbe1c41d 100644 --- a/docs_jupyter/refgenconf_usage.ipynb +++ b/docs_jupyter/refgenconf_usage.ipynb @@ -23,12 +23,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ[\"REFGENIE\"] = \"./genomes.yaml\"\n", + "os.environ[\"REFGENIE\"] = \"./refgenie.yaml\"\n", "user_provided_cfg_path = None\n", "user_provided_genome = \"rCRSd\"" ] @@ -42,11 +42,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "from refgenconf import RefGenConf, select_genome_config, RefgenconfError, CFG_ENV_VARS, CFG_FOLDER_KEY" + "from refgenconf import RefGenConf, select_genome_config, RefgenconfError, CFG_ENV_VARS, CFG_FOLDER_KEY\n", + "from yacman import UndefinedAliasError" ] }, { @@ -58,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -74,14 +75,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "if not refgenie_cfg_path:\n", - " raise OSError(\"Could not determine path to a refgenie genome configuration file. 
\"\n", - " \"Use --rfg-config argument or set '{}' environment variable to provide it\".\n", - " format(CFG_ENV_VARS))" + " raise OSError(f\"Could not determine path to a refgenie genome configuration file.\"\n", + " f\"Use --rfg-config argument or set '{CFG_ENV_VARS}' environment variable to provide it\")" ] }, { @@ -93,25 +93,26 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "File '/Users/mstolarczyk/Uczelnia/UVA/code/refgenie/docs_jupyter/genomes.yaml' does not exist. Initializing refgenie genome configuration file.\n" + "File '/Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml' does not exist. Initializing refgenie genome configuration file.\n" ] } ], "source": [ "if isinstance(refgenie_cfg_path, str) and os.path.exists(refgenie_cfg_path):\n", - " print(\"Reading refgenie genome configuration file from file: {}\".format(refgenie_cfg_path))\n", + " print(f\"Reading refgenie genome configuration file from file: {refgenie_cfg_path}\")\n", " rgc = RefGenConf(filepath=refgenie_cfg_path)\n", "else:\n", - " print(\"File '{}' does not exist. Initializing refgenie genome configuration file.\".format(refgenie_cfg_path))\n", + " print(f\"File '{refgenie_cfg_path}' does not exist. 
Initializing refgenie genome configuration file.\")\n", " rgc = RefGenConf(entries={CFG_FOLDER_KEY: os.path.dirname(refgenie_cfg_path)})\n", - " rgc.initialize_config_file(filepath=refgenie_cfg_path)" + " rgc.initialize_config_file(filepath=refgenie_cfg_path)\n", + " rgc.subscribe(urls=\"http://rg.databio.org:82\", reset=True) # subscribe to the desired server, if needed" ] }, { @@ -123,43 +124,56 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - " " + "Could not determine path to chrom.sizes asset, pulling\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Could not determine path to chrom.sizes asset, pulling\n", - "Determined path to fasta asset: /Users/mstolarczyk/Uczelnia/UVA/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a258a7dec75f4bbcb3f2973e0a3b9cc6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "\r" + "Determined path to fasta asset: /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa\n" ] } ], "source": [ "try:\n", - " fasta = rgc.get_asset(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", + " fasta = rgc.seek(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", " seek_key=\"fasta\")\n", - "except RefgenconfError:\n", + "except (RefgenconfError, UndefinedAliasError):\n", " print(\"Could not determine path to chrom.sizes asset, pulling\")\n", - " rgc.pull_asset(genome=user_provided_genome, asset=\"fasta\", tag=\"default\")\n", - " fasta = rgc.get_asset(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", + " 
rgc.pull(genome=user_provided_genome, asset=\"fasta\", tag=\"default\")\n", + " fasta = rgc.seek(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", " seek_key=\"fasta\")\n", - "print(\"Determined path to fasta asset: {}\".format(fasta))" + "print(f\"Determined path to fasta asset: {fasta}\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index 2eb5c772..3f149ee2 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -8,7 +8,7 @@ "\n", "I assume you've already installed refgenie. In this tutorial I'll show you a few ways to use refgenie from the command line (commands that start with a `!`), and also some Python commands.\n", "\n", - "To start, initialize an empty refgenie configuration file from the shell:" + "To start, initialize an empty refgenie configuration file from the shell and subscribe to the desired asset server:" ] }, { @@ -22,12 +22,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Initialized genome configuration file: /home/nsheff/code/refgenie/docs_jupyter/refgenie.yaml\r\n" + "Initialized genome configuration file: /Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml\r\n", + "Created directories:\r\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/data\r\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias\r\n" ] } ], "source": [ - "!refgenie init -c refgenie.yaml" + "!refgenie init -c refgenie.yaml -s http://rg.databio.org" ] }, { @@ -46,9 +49,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "config_version: 0.3\r\n", - "genome_folder: /home/nsheff/code/refgenie/docs_jupyter\r\n", - "genome_servers: ['http://refgenomes.databio.org']\r\n", + "config_version: 0.4\r\n", + "genome_folder: /Users/mstolarczyk/code/refgenie/docs_jupyter\r\n", + "genome_servers: \r\n", + " - http://rg.databio.org\r\n", 
"genomes: null\r\n" ] } @@ -57,6 +61,87 @@ "!cat refgenie.yaml" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Remote refgenie assets \u001b[0m\r\n", + "\u001b[3m Server URL: http://rg.databio.org \u001b[0m\r\n", + "┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", + "┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", + "│ rCRSd │ fasta, bowtie2_index, bwa_index, hisat2_index, │\r\n", + "│ │ star_index, bismark_bt2_index │\r\n", + "│ hg18_cdna │ fasta, kallisto_index │\r\n", + "│ hs38d1 │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg38_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ human_repeats │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ rn6_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ mm10_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ hg38_chr22 │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg38 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bowtie2_index, │\r\n", + "│ │ bwa_index, tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ hg19_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ human_rDNA │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, 
star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ human_alu │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, bismark_bt2_index │\r\n", + "│ human_alphasat │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ mouse_chrM2x │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ t7 │ fasta, bowtie2_index │\r\n", + "│ mm10 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bwa_index, │\r\n", + "│ │ bowtie2_index, hisat2_index, tallymer_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ dm6 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ bowtie2_index │\r\n", + "│ hg18 │ fasta, gencode_gtf, fasta_txome, suffixerator_index, │\r\n", + "│ │ cellranger_reference, bwa_index, bowtie2_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg19 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bwa_index, │\r\n", + "│ │ bowtie2_index, tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ salmon_partial_sa_index, bismark_bt2_index │\r\n", + "│ rn6 │ fasta, ensembl_gtf, refgene_anno, fasta_txome, │\r\n", + "│ │ suffixerator_index, bwa_index, bowtie2_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ hg38_noalt_decoy │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, bismark_bt2_index │\r\n", + "│ mm10_primary │ fasta, bowtie2_index, bwa_index │\r\n", + "│ 
hg38_primary │ fasta, bowtie2_index, bwa_index │\r\n", + "│ hg38_mm10 │ fasta, bwa_index │\r\n", + "└──────────────────┴───────────────────────────────────────────────────────────┘\r\n", + "\u001b[2;3m use refgenie listr -g for more detailed view \u001b[0m\r\n" + ] + } + ], + "source": [ + "!refgenie listr -c refgenie.yaml" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -66,64 +151,70 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import refgenconf\n", - "rgc = refgenconf.RefGenConf(\"refgenie.yaml\")" + "rgc = refgenconf.RefGenConf(filepath=\"refgenie.yaml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Use `pull` to download the actual asset:" + "Use `pull` to download one of the assets:" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "abab7f40d9654ca6ba8c60471cbe303a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/plain": [ - "(['hs38d1', 'fasta', 'default'],\n", - " {'archive_digest': '310c578812a64fcdf08d2df60d7b79b4',\n", - " 'archive_size': '1.7MB',\n", - " 'asset_children': ['hs38d1/star_index:default',\n", - " 'hs38d1/bwa_index:default',\n", - " 'hs38d1/bowtie2_index:default',\n", - " 'hs38d1/bismark_bt1_index:default',\n", - " 'hs38d1/bismark_bt2_index:default',\n", - " 'hs38d1/hisat2_index:default',\n", - " 'hs38d1/tallymer_index:default',\n", - " 'hs38d1/suffixerator_index:default'],\n", - " 'asset_digest': 'eddf5466faa3391a7114e87648466dcb',\n", + "(['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a', 'fasta', 'default'],\n", + " {'asset_path': 'fasta',\n", + " 'asset_digest': '8dfe402f7d29d5b036dd8937119e4404',\n", + " 'archive_digest': 
'bfb7877ee114c61a17a50bd471de47a2',\n", + " 'asset_size': '39.4KB',\n", + " 'archive_size': '9.1KB',\n", + " 'seek_keys': {'fasta': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa',\n", + " 'fai': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa.fai',\n", + " 'chrom_sizes': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.chrom.sizes'},\n", " 'asset_parents': [],\n", - " 'asset_path': 'fasta',\n", - " 'asset_size': '6.0MB',\n", - " 'seek_keys': {'chrom_sizes': 'hs38d1.chrom.sizes',\n", - " 'fai': 'hs38d1.fa.fai',\n", - " 'fasta': 'hs38d1.fa'}},\n", - " 'http://refgenomes.databio.org')" + " 'asset_children': ['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/suffixerator_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bowtie2_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bwa_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/tallymer_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/hisat2_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/star_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bismark_bt2_index:default']},\n", + " 'http://rg.databio.org')" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rgc.pull(\"hs38d1\", \"fasta\", \"default\")" + "rgc.pull(\"mouse_chrM2x\", \"fasta\", \"default\")" ] }, { @@ -135,22 +226,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/home/nsheff/code/refgenie/docs_jupyter/hs38d1/fasta/default/hs38d1.fa'" + "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mouse_chrM2x/fasta/default/mouse_chrM2x.fa'" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rgc.seek(\"hs38d1\", \"fasta\")" + "rgc.seek(\"mouse_chrM2x\", \"fasta\")" ] }, { @@ 
-162,22 +253,22 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'eddf5466faa3391a7114e87648466dcb'" + "'8dfe402f7d29d5b036dd8937119e4404'" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "rgc.id(\"hs38d1\", \"fasta\")" + "rgc.id(\"mouse_chrM2x\", \"fasta\")" ] }, { @@ -196,36 +287,34 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-03-13 16:11:59-- http://big.databio.org/refgenie_raw/rCRSd.fa.gz\r\n", - "Resolving big.databio.org (big.databio.org)... 128.143.245.181\r\n", - "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\r\n", - "HTTP request sent, awaiting response... 200 OK\r\n", - "Length: 8399 (8.2K) [application/octet-stream]\r\n", - "Saving to: ‘rCRSd.fa.gz’\r\n", - "\r\n", - "\r", - "rCRSd.fa.gz 0%[ ] 0 --.-KB/s \r", - "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0s \r\n", - "\r\n", - "2020-03-13 16:11:59 (214 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\r\n", - "\r\n" + "--2021-03-09 12:22:40-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", + "Resolving big.databio.org (big.databio.org)... 128.143.245.181, 128.143.245.182\n", + "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 8399 (8.2K) [application/octet-stream]\n", + "Saving to: ‘rCRSd.fa.gz’\n", + "\n", + "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.006s \n", + "\n", + "2021-03-09 12:22:40 (1.35 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", + "\n" ] } ], "source": [ - "!wget http://big.databio.org/refgenie_raw/rCRSd.fa.gz" + "!wget -O rCRSd.fa.gz http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -233,24 +322,30 @@ "output_type": "stream", "text": [ "Using 'default' as the default tag for 'rCRSd/fasta'\n", + "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", "Building 'rCRSd/fasta:default' using 'fasta' recipe\n", + "Initializing genome: rCRSd\n", + "Loaded AnnotatedSequenceDigestList (1 sequences)\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", "Saving outputs to:\n", - "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n", - "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n", - "* Compute host: puma\n", - "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n", - "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/\n", - "* Pipeline started at: (03-13 16:11:59) elapsed: 
0.0 _TIME_\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/\n", + "* Pipeline started at: (03-09 12:22:41) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.7.6\n", - "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", "* Pypiper version: 0.12.1\n", - "* Pipeline dir: `/home/nsheff/.local/bin`\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", "* Pipeline version: None\n", "\n", "### Arguments passed to pipeline:\n", @@ -266,67 +361,74 @@ "* `genome_description`: `None`\n", "* `logdev`: `False`\n", "* `new_start`: `False`\n", - "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n", + "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", "* `params`: `None`\n", "* `recipe`: `None`\n", "* `recover`: `True`\n", "* `requirements`: `False`\n", "* `silent`: `False`\n", + "* `skip_read_lock`: `False`\n", "* `tag_description`: `None`\n", "* `verbosity`: `None`\n", "* `volumes`: `None`\n", "\n", "----------------------------------------\n", "\n", - "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` \n", "\n", - "> `cp rCRSd.fa.gz 
/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28689)\n", + "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63575)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63575)\n",
+      "Warning: couldn't add memory use for process: 63575\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 28689;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.0GB\n", + " PID: 63575;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `gzip -d /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28691)\n", + "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63576)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63576)\n",
+      "Warning: couldn't add memory use for process: 63576\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 28691;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", + " PID: 63576;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `samtools faidx /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa` (28693)\n", + "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa` (63577)\n", "
\n",
       "
\n", - "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.018GB. \n", - " PID: 28693;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.018GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63577;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.001GB\n", "\n", "\n", - "> `cut -f 1,2 /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.fai > /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.chrom.sizes` (28761)\n", + "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.chrom.sizes` (63578)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63578)\n",
+      "Warning: couldn't add memory use for process: 63578\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n", - " PID: 28761;\tCommand: cut;\tReturn code: 0;\tMemory used: 0.0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63578;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` (28763)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` (63580)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63580)\n",
+      "Warning: couldn't add memory use for process: 63580\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n", - " PID: 28763;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n", - "\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63580;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", - "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", "Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n", - "Default tag for 'rCRSd/fasta' set to: default\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:01\n", - "* Peak memory (this run): 0.0184 GB\n", - "* Pipeline completed time: 2020-03-13 16:12:00\n", - "Computing initial genome digest...\n", - "Initializing genome...\n", - "Finished building 'fasta' asset\n" + "* Elapsed time (this run): 0:00:00\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.0015 GB\n", + "* Pipeline completed time: 2021-03-09 12:22:41\n", + "Finished building 'fasta' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" ] } ], @@ -334,16 +436,53 @@ "!refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The asset should be available for local use, let's call `refgenie list` to check it:" + ] + }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Local refgenie assets \u001b[0m\r\n", + "\u001b[3m Server subscriptions: http://rg.databio.org \u001b[0m\r\n", + 
"┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1masset (\u001b[0m\u001b[1;3mseek_keys\u001b[0m\u001b[1m) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtags \u001b[0m\u001b[1m \u001b[0m┃\r\n", + "┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩\r\n", + "│ rCRSd │ fasta (\u001b[3mfasta, fai, chrom_sizes\u001b[0m) │ default │\r\n", + "└───────────┴────────────────────────────────────────────┴───────────┘\r\n" + ] + } + ], + "source": [ + "!refgenie list -c refgenie.yaml --genome rCRSd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can retrieve the path to this asset with:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\r\n" + "/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa\r\n" ] } ], @@ -355,21 +494,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can do the same thing from within python:" + "Naturally, we can do the same thing from within Python:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa'" + "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa'" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -383,14 +522,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " Now if you have bowtie2-build in your PATH you can build the bowtie2 index with no further requirements.\n", + "Now, if we have bowtie2-build in our `$PATH` we can build the `bowtie2_index` asset with no further requirements.\n", "\n", - "You can 
see the requirements with `--requirements`:\n" + "Let's check the requirements with `refgenie build --requirements`:\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -416,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -424,24 +563,25 @@ "output_type": "stream", "text": [ "Using 'default' as the default tag for 'rCRSd/bowtie2_index'\n", + "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", "Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n", "Saving outputs to:\n", - "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n", - "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", - "* Compute host: puma\n", - "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n", - "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/\n", - "* Pipeline started at: (03-13 16:12:02) elapsed: 0.0 _TIME_\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/\n", + "* Pipeline started at: 
(03-09 12:22:45) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.7.6\n", - "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", "* Pypiper version: 0.12.1\n", - "* Pipeline dir: `/home/nsheff/.local/bin`\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", "* Pipeline version: None\n", "\n", "### Arguments passed to pipeline:\n", @@ -457,25 +597,25 @@ "* `genome_description`: `None`\n", "* `logdev`: `False`\n", "* `new_start`: `False`\n", - "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n", + "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", "* `params`: `None`\n", "* `recipe`: `None`\n", "* `recover`: `False`\n", "* `requirements`: `False`\n", "* `silent`: `False`\n", + "* `skip_read_lock`: `False`\n", "* `tag_description`: `None`\n", "* `verbosity`: `None`\n", "* `volumes`: `None`\n", "\n", "----------------------------------------\n", "\n", - "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` \n", "\n", - "> `bowtie2-build /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd` (28812)\n", + "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` (63609)\n", "
\n",
-      "Building a SMALL index\n",
       "Settings:\n",
-      "  Output files: \"/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.*.bt2\"\n",
+      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.*.bt2\"\n",
       "  Line rate: 6 (line is 64 bytes)\n",
       "  Lines per side: 1 (side is 64 bytes)\n",
       "  Offset rate: 4 (one in 16)\n",
@@ -492,7 +632,8 @@
       "  Random seed: 0\n",
       "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
       "Input files DNA, FASTA:\n",
-      "  /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\n",
+      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa\n",
+      "Building a SMALL index\n",
       "Reading reference sizes\n",
       "  Time reading reference sizes: 00:00:00\n",
       "Calculating joined length\n",
@@ -545,8 +686,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -628,8 +769,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -658,28 +799,36 @@
       "    color: 0\n",
       "    reverse: 1\n",
       "Total time for backward call to driver() for mirror index: 00:00:00\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.019GB. \n", - " PID: 28812;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.019GB\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n", + " PID: 63609;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.003GB\n", "\n", "\n", - "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` (28879)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` (63611)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63611)\n",
+      "Warning: couldn't add memory use for process: 63611\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.019GB. \n", - " PID: 28879;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n", - "\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n", + " PID: 63611;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", - "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", - "Default tag for 'rCRSd/bowtie2_index' set to: default\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:01\n", - "* Peak memory (this run): 0.0188 GB\n", - "* Pipeline completed time: 2020-03-13 16:12:03\n", - "Finished building 'bowtie2_index' asset\n" + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.0028 GB\n", + "* Pipeline completed time: 2021-03-09 12:22:46\n", + "Finished building 'bowtie2_index' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" ] } ], @@ -691,48 +840,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see a list of available recipes like this:" + "We can see a list of available recipes like this:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Server subscriptions: http://refgenomes.databio.org\r\n", - "Local genomes: hs38d1, rCRSd\r\n", - "Local recipes: bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, 
hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\r\n", - "Local assets:\r\n", - " hs38d1/ fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n", - " rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n" + "bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index, tgMap\r\n" ] } ], "source": [ - "!refgenie list -c refgenie.yaml" + "!refgenie list -c refgenie.yaml --recipes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can get the unique digest for any asset with `refgenie id`:" + "We can get the unique digest for any asset with `refgenie id`:" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "rCRSd/fasta:default,4eb430296bc02ed7e4006624f1d5ac53\r\n" + "4eb430296bc02ed7e4006624f1d5ac53\r\n" ] } ], @@ -749,16 +893,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'3.5.2'" + "'3.6.5'" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -770,40 +914,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "refgenie 0.9.0-dev\r\n" + "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\r\n" ] } ], "source": [ "!refgenie --version" ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"'0.7.0-dev'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "refgenconf.__version__" - ] } ], "metadata": { @@ -822,7 +946,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.5" } }, "nbformat": 4, diff --git a/mkdocs.yml b/mkdocs.yml index 78ac8930..4b6991c4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,9 +18,12 @@ nav: - Add custom assets: custom_assets.md - Retrieve paths to assets: seek.md - Use asset tags: tag.md + - Use aliases: aliases.md + - Compare genomes: compare.md - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md - Use refgenie with iGenomes: igenomes.md + - Upgrade from config 0.3 to 0.4: config_upgrade_03_to_04.md - Reference: - Genome configuration file: genome_config.md - Glossary: glossary.md @@ -48,4 +51,4 @@ navbar: left: - text: Refgenomes server icon: fa-server - href: http://refgenomes.databio.org + href: servers diff --git a/recipes.md b/recipes.md deleted file mode 100644 index 821a1683..00000000 --- a/recipes.md +++ /dev/null @@ -1,36 +0,0 @@ -# Refgenie Recipes - -Here are a few easy scripts you can use to re-index some of your favorite genomes - -## hg19 - -```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit -GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n hg19 -``` - -## hg38 -(use the NCBI's official version for sequence alignments without _alt sequences:) -Old link: INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz - -This README describes the sequences: - -ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/README_analysis_sets.txt - 
-```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz -GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_23/gencode.v23.primary_assembly.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n hg38 -``` - -## mm10 - -```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/635/GCA_000001635.5_GRCm38.p3/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001635.5_GRCm38.p3_no_alt_analysis_set.fna.gz -GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M12/gencode.vM12.primary_assembly.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n mm10 -``` diff --git a/refgenie/__init__.py b/refgenie/__init__.py index b6aa9894..6e0e9f4e 100644 --- a/refgenie/__init__.py +++ b/refgenie/__init__.py @@ -1,3 +1,4 @@ from ._version import __version__ import logmuse -logmuse.init_logger("refgenie") \ No newline at end of file + +logmuse.init_logger("refgenie") diff --git a/refgenie/__main__.py b/refgenie/__main__.py index 7844a8ff..1fa5c244 100644 --- a/refgenie/__main__.py +++ b/refgenie/__main__.py @@ -1,7 +1,7 @@ -from .refgenie import main +from .cli import main import sys -if __name__ == '__main__': +if __name__ == "__main__": try: sys.exit(main()) except KeyboardInterrupt: diff --git a/refgenie/_version.py b/refgenie/_version.py index c5981731..61fb31ca 100644 --- a/refgenie/_version.py +++ b/refgenie/_version.py @@ -1 +1 @@ -__version__ = "0.9.3" +__version__ = "0.10.0" diff --git a/refgenie/add_assets_igenome.py b/refgenie/add_assets_igenome.py index a0620e64..c09a2bb2 100644 --- a/refgenie/add_assets_igenome.py +++ b/refgenie/add_assets_igenome.py @@ -6,7 +6,7 @@ Build/ Annotation/ Sequence/ """ -from .refgenie import _seek, _remove +from .refgenie import _seek from .exceptions import MissingGenomeConfigError from ubiquerg 
import untar, mkabs, query_yes_no @@ -29,15 +29,36 @@ def build_argparser(): :return argparse.ArgumentParser: constructed parser """ - parser = argparse.ArgumentParser(description='Integrates every asset from the downloaded iGenomes' - ' tarball/directory with Refgenie asset management system') - parser.add_argument('-p', '--path', dest="path", type=str, - help='path to the desired genome tarball or directory to integrate', required=True) - parser.add_argument('-g', '--genome', dest="genome", type=str, help='name to be assigned to the selected genome', - required=True) - parser.add_argument('-c', '--config', dest="config", type=str, - help="path to local genome configuration file. Optional if '{}' environment variable is set.". - format(", ".join(refgenconf.CFG_ENV_VARS)), required=False) + parser = argparse.ArgumentParser( + description="Integrates every asset from the downloaded iGenomes" + " tarball/directory with Refgenie asset management system" + ) + parser.add_argument( + "-p", + "--path", + dest="path", + type=str, + help="path to the desired genome tarball or directory to integrate", + required=True, + ) + parser.add_argument( + "-g", + "--genome", + dest="genome", + type=str, + help="name to be assigned to the selected genome", + required=True, + ) + parser.add_argument( + "-c", + "--config", + dest="config", + type=str, + help="path to local genome configuration file. 
Optional if '{}' environment variable is set.".format( + ", ".join(refgenconf.CFG_ENV_VARS) + ), + required=False, + ) return parser @@ -78,12 +99,15 @@ def refgenie_add(rgc, asset_dict, path, force=False): should be forced """ # remove the first directory from the provided path if it is the genome name - path = os.path.join(*path.split(os.sep)[1:]) \ - if path.split(os.sep)[0] == asset_dict["genome"] else path - tag = asset_dict["tag"] \ - or rgc.get_default_tag(asset_dict["genome"], asset_dict["asset"]) - outfolder = \ - os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"])) + path = ( + os.path.join(*path.split(os.sep)[1:]) + if path.split(os.sep)[0] == asset_dict["genome"] + else path + ) + tag = asset_dict["tag"] or rgc.get_default_tag( + asset_dict["genome"], asset_dict["asset"] + ) + outfolder = os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"])) abs_asset_path = os.path.join(outfolder, path) if asset_dict["seek_key"] is None: # if seek_key is not specified we're about to move a directory to @@ -101,25 +125,32 @@ def refgenie_add(rgc, asset_dict, path, force=False): if not os.path.exists(tag_path): cp(abs_asset_path, tag_path) else: - if not force and not \ - query_yes_no("Path '{}' exists. Do you want to overwrite?". - format(tag_path)): + if not force and not query_yes_no( + "Path '{}' exists. Do you want to overwrite?".format(tag_path) + ): return False else: _remove(tag_path) cp(abs_asset_path, tag_path) else: - raise OSError("Absolute path '{}' does not exist. " - "The provided path must be relative to: {}". - format(abs_asset_path, rgc[CFG_FOLDER_KEY])) + raise OSError( + "Absolute path '{}' does not exist. 
" + "The provided path must be relative to: {}".format( + abs_asset_path, rgc[CFG_FOLDER_KEY] + ) + ) rgc.make_writable() gat_bundle = [asset_dict["genome"], asset_dict["asset"], tag] - td = {CFG_ASSET_PATH_KEY: - path if os.path.isdir(abs_asset_path) else os.path.dirname(path)} + td = { + CFG_ASSET_PATH_KEY: path + if os.path.isdir(abs_asset_path) + else os.path.dirname(path) + } rgc.update_tags(*gat_bundle, data=td) # seek_key points to the entire dir if not specified - seek_key_value = os.path.basename(abs_asset_path) \ - if asset_dict["seek_key"] is not None else "." + seek_key_value = ( + os.path.basename(abs_asset_path) if asset_dict["seek_key"] is not None else "." + ) sk = {asset_dict["seek_key"] or asset_dict["asset"]: seek_key_value} rgc.update_seek_keys(*gat_bundle, keys=sk) rgc.set_default_pointer(asset_dict["genome"], asset_dict["asset"], tag) @@ -137,19 +168,27 @@ def main(): """ main workflow """ parser = build_argparser() args, remaining_args = parser.parse_known_args() - cfg = refgenconf.select_genome_config(filename=args.config, check_exist=True, strict_env=True) + cfg = refgenconf.select_genome_config( + filename=args.config, check_exist=True, strict_env=True + ) if not cfg: raise MissingGenomeConfigError(args.config) rgc = refgenconf.RefGenConf(filepath=cfg, writable=False) pths = [args.path, mkabs(args.path, rgc.genome_folder)] - if not untar_or_copy(pths[0], os.path.join(rgc.genome_folder, args.genome)) \ - and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)): - raise OSError("Path '{}' does not exist. Tried: {}".format(args.path, " and ".join(pths))) + if not untar_or_copy( + pths[0], os.path.join(rgc.genome_folder, args.genome) + ) and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)): + raise OSError( + "Path '{}' does not exist. 
Tried: {}".format(args.path, " and ".join(pths)) + ) path_components = [rgc.genome_folder] + [args.genome] + ["*"] * 3 + ["Sequence"] assets_paths = glob(os.path.join(*path_components)) - assert len(assets_paths) > 0, OSError("Your iGenomes directory is corrupted, more than one directory matched by {}." - "\nMatched dirs: {}".format(os.path.join(*path_components), - ", ".join(assets_paths))) + assert len(assets_paths) > 0, OSError( + "Your iGenomes directory is corrupted, more than one directory matched by {}." + "\nMatched dirs: {}".format( + os.path.join(*path_components), ", ".join(assets_paths) + ) + ) assets_path = assets_paths[0] asset_names = [d for d in os.listdir(assets_path) if os.path.isdir(assets_path)] processed = [] @@ -161,7 +200,25 @@ def main(): print("Added assets: \n- {}".format("\n- ".join(processed))) -if __name__ == '__main__': +def _remove(path): + """ + remove asset if it is a dir or a file + + :param str path: path to the entity to remove, either a file or a dir + :return str: removed path + """ + from shutil import rmtree + + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + rmtree(path) + else: + raise ValueError("path '{}' is neither a file nor a dir.".format(path)) + return path + + +if __name__ == "__main__": try: sys.exit(main()) except KeyboardInterrupt: diff --git a/refgenie/argparser.py b/refgenie/argparser.py new file mode 100644 index 00000000..ff7df2c7 --- /dev/null +++ b/refgenie/argparser.py @@ -0,0 +1,484 @@ +import pypiper + +from ubiquerg import VersionInHelpParser + +from ._version import __version__ +from .const import * +from refgenconf import __version__ as rgc_version + +from argparse import HelpFormatter + + +def build_argparser(): + """ + Builds argument parser. 
+ + :return argparse.ArgumentParser + """ + + banner = "%(prog)s - reference genome asset manager" + additional_description = "\nhttps://refgenie.databio.org" + + parser = VersionInHelpParser( + prog="refgenie", + version=f"{__version__} | refgenconf {rgc_version}", + description=banner, + epilog=additional_description, + ) + + subparsers = parser.add_subparsers(dest="command") + + def add_subparser(cmd, msg, subparsers): + return subparsers.add_parser( + cmd, + description=msg, + help=msg, + formatter_class=lambda prog: HelpFormatter( + prog, max_help_position=40, width=90 + ), + ) + + sps = {} + for cmd, desc in SUBPARSER_MESSAGES.items(): + sps[cmd] = add_subparser(cmd, desc, subparsers) + # alias is nested and alias subcommands require config path + if cmd == ALIAS_CMD: + continue + # It's required for init + sps[cmd].add_argument( + "-c", + "--genome-config", + required=(cmd == INIT_CMD), + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) + sps[cmd].add_argument( + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) + + # upgrade: upgrade config and alter file structure to the target version + sps[UPGRADE_CMD].add_argument( + "-v", + "--target-version", + required=True, + metavar="V", + help="Target config version for the upgrade.", + ) + sps[UPGRADE_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + sps[INIT_CMD].add_argument( + "-s", + "--genome-server", + nargs="+", + default=[DEFAULT_SERVER], + help="URL(s) to use for the {} attribute in config file. 
Default: {}.".format( + CFG_SERVERS_KEY, DEFAULT_SERVER + ), + ) + sps[INIT_CMD].add_argument( + "-f", + "--genome-folder", + help="Absolute path to parent folder refgenie-managed assets.", + ) + sps[INIT_CMD].add_argument( + "-a", + "--genome-archive-folder", + help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-b", + "--genome-archive-config", + help="Absolute path to desired archive config file; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-u", + "--remote-url-base", + help="URL to use as an alternative, remote archive location; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-j", + "--settings-json", + help="Absolute path to a JSON file with the key " + "value pairs to inialize the configuration " + "file with. Overwritten by itemized specifications.", + ) + sps[BUILD_CMD] = pypiper.add_pypiper_args( + sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"] + ) + + # Add any arguments specific to subcommands. + + sps[BUILD_CMD].add_argument( + "--tag-description", + required=False, + default=None, + type=str, + help="Add tag level description (e.g. built with version 0.3.2).", + ) + + sps[BUILD_CMD].add_argument( + "--genome-description", + required=False, + default=None, + type=str, + help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).", + ) + + sps[BUILD_CMD].add_argument( + "-d", + "--docker", + action="store_true", + help="Run all commands in the refgenie docker container.", + ) + + sps[BUILD_CMD].add_argument( + "--assets", + nargs="+", + action="append", + required=False, + default=None, + help="Override the default genome, asset and tag of the parents" + " (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).", + ) + + sps[BUILD_CMD].add_argument( + "--files", + nargs="+", + action="append", + required=False, + default=None, + help="Provide paths to the required files (e.g. 
fasta=/path/to/file.fa.gz).", + ) + + sps[BUILD_CMD].add_argument( + "--params", + nargs="+", + action="append", + required=False, + default=None, + help="Provide required parameter values (e.g. param1=value1).", + ) + + sps[BUILD_CMD].add_argument( + "-v", + "--volumes", + nargs="+", + required=False, + default=None, + help="If using docker, also mount these folders as volumes.", + ) + + sps[BUILD_CMD].add_argument( + "-o", + "--outfolder", + dest="outfolder", + required=False, + default=None, + help="Override the default path to genomes folder, which is the " + "genome_folder attribute in the genome configuration file.", + ) + + sps[BUILD_CMD].add_argument( + "-q", + "--requirements", + action="store_true", + help="Show the build requirements for the specified asset and exit.", + ) + + sps[BUILD_CMD].add_argument( + "-r", + "--recipe", + required=False, + default=None, + type=str, + help="Provide a recipe to use.", + ) + + alias_subparser = sps[ALIAS_CMD] + alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand") + + alias_sps = {} + for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): + alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) + alias_sps[cmd].add_argument( + "-c", + "--genome-config", + required=False, + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. 
Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) + alias_sps[cmd].add_argument( + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) + + alias_sps[ALIAS_SET_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to set; single if the digest is to be retrieved from the server.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-d", + "--digest", + metavar="D", + required=False, + type=str, + help="Digest to set; leave out if the digest is to be retrieved from the server.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-r", + "--reset", + action="store_true", + help="Whether all the aliases should be removed prior to setting new ones.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Whether the action should be forced, if genome does not exist.", + ) + + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to remove.", + ) + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-d", "--digest", metavar="D", required=True, type=str, help="Digest to remove." 
+ ) + + alias_sps[ALIAS_GET_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + type=str, + nargs="+", + help="Aliases to get the digests for.", + ) + + sps[COMPARE_CMD].add_argument( + "genome1", + metavar="GENOME1", + type=str, + nargs=1, + help="First genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "genome2", + metavar="GENOME2", + type=str, + nargs=1, + help="Second genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "-e", + "--no-explanation", + action="store_true", + help="Do not print compatibility code explanation.", + ) + + # add 'genome' argument to many commands + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + GETSEQ_CMD, + TAG_CMD, + ID_CMD, + ]: + # genome is not required for listing actions + sps[cmd].add_argument( + "-g", + "--genome", + required=cmd in GETSEQ_CMD, + metavar="G", + help="Reference assembly ID, e.g. mm10.", + ) + + for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: + sps[cmd].add_argument( + "-g", + "--genome", + required=False, + type=str, + metavar="G", + nargs="*", + help="Reference assembly ID, e.g. mm10.", + ) + + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + TAG_CMD, + ID_CMD, + ]: + sps[cmd].add_argument( + "asset_registry_paths", + metavar="asset-registry-paths", + type=str, + nargs="+", + help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" + + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."), + ) + + sps[LIST_LOCAL_CMD].add_argument( + "-r", "--recipes", action="store_true", help="List available recipes." 
+ ) + + for cmd in [REMOVE_CMD, INSERT_CMD]: + sps[cmd].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + sps[REMOVE_CMD].add_argument( + "-a", + "--aliases", + action="store_true", + help="Remove the genome alias if last asset for that genome is removed.", + ) + force_group = sps[PULL_CMD].add_argument_group( + title="Prompt handling", + description="These flags configure the pull prompt responses.", + ) + + overwrite_group = force_group.add_mutually_exclusive_group() + + overwrite_group.add_argument( + "--no-overwrite", action="store_true", help="Do not overwrite if asset exists." + ) + + overwrite_group.add_argument( + "--force-overwrite", action="store_true", help="Overwrite if asset exists." + ) + + large_group = force_group.add_mutually_exclusive_group() + + large_group.add_argument( + "--no-large", action="store_true", help="Do not pull archives over 5GB." + ) + + large_group.add_argument( + "--pull-large", + action="store_true", + help="Pull any archive, regardless of its size.", + ) + + force_group.add_argument( + "--size-cutoff", + type=float, + default=10, + metavar="S", + help="Maximum archive file size to download with no confirmation required (in GB, default: 10)", + ) + + force_group.add_argument( + "-b", + "--batch", + action="store_true", + help="Use batch mode: pull large archives, do no overwrite", + ) + + sps[INSERT_CMD].add_argument( + "-p", "--path", required=True, metavar="P", help="Relative local path to asset." + ) + + sps[INSERT_CMD].add_argument( + "-s", + "--seek-keys", + required=False, + type=str, + metavar="S", + help=""" + String representation of a JSON object with seek_keys, + e.g. '{"seek_key1": "file.txt"}' + """, + ) + + sps[GETSEQ_CMD].add_argument( + "-l", + "--locus", + required=True, + help="Coordinates of desired sequence; e.g. 
'chr1:50000-50200'.", + ) + + sps[GET_ASSET_CMD].add_argument( + "-e", + "--check-exists", + required=False, + action="store_true", + help="Whether the returned asset path should be checked for existence on disk.", + ) + + sps[TAG_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) + + group.add_argument("-t", "--tag", type=str, help="Tag to assign to an asset.") + + group.add_argument( + "-d", + "--default", + action="store_true", + help="Set the selected asset tag as the default one.", + ) + + sps[SUBSCRIBE_CMD].add_argument( + "-r", + "--reset", + action="store_true", + help="Overwrite the current list of server URLs.", + ) + + for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]: + sps[cmd].add_argument( + "-s", + "--genome-server", + nargs="+", + required=True, + help="One or more URLs to {action} the {key} attribute in config file.".format( + action="add to" if cmd == SUBSCRIBE_CMD else "remove from", + key=CFG_SERVERS_KEY, + ), + ) + + return parser diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 82981647..fc031eb1 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -6,7 +6,8 @@ # These building recipes should make use of arguments that are auto-populated, # or user-provided. The auto-populated arguments are: # - {genome} -# - {asset_outfolder} In addition to these, the recipe should refer in the +# - {asset_outfolder} +# In addition to these, the recipe should refer in the # same way, {var}, to any variables required to be provided, which will be # provided via the CLI. These should be listed as 'required_inputs' and # will be checked for existence before the commands are executed. 
@@ -23,7 +24,18 @@ KEY = "key" DEFAULT = "default" -RECIPE_CONSTS = ["DESC", "ASSET_DESC", "ASSETS", "PTH", "REQ_FILES", "REQ_ASSETS", "CONT", "CMD_LST", "KEY", "DEFAULT"] +RECIPE_CONSTS = [ + "DESC", + "ASSET_DESC", + "ASSETS", + "PTH", + "REQ_FILES", + "REQ_ASSETS", + "CONT", + "CMD_LST", + "KEY", + "DEFAULT", +] asset_build_packages = { "fasta": { @@ -31,14 +43,9 @@ ASSETS: { "fasta": "{genome}.fa", "fai": "{genome}.fa.fai", - "chrom_sizes": "{genome}.chrom.sizes" + "chrom_sizes": "{genome}.chrom.sizes", }, - REQ_FILES: [ - { - KEY: "fasta", - DESC: "gzipped fasta file" - } - ], + REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -47,21 +54,16 @@ "gzip -df {asset_outfolder}/{genome}.fa.gz", "samtools faidx {asset_outfolder}/{genome}.fa", "cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes", - ] + ], }, "fasta_txome": { DESC: "cDNA sequences in the FASTA format, indexed FASTA (produced with samtools index) and chromosome sizes file", ASSETS: { "fasta_txome": "{genome}.fa", "fai": "{genome}.fa.fai", - "chrom_sizes": "{genome}.chrom.sizes" + "chrom_sizes": "{genome}.chrom.sizes", }, - REQ_FILES: [ - { - KEY: "fasta", - DESC: "gzipped fasta file" - } - ], + REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -70,26 +72,21 @@ "gzip -df {asset_outfolder}/{genome}.fa.gz", "samtools faidx {asset_outfolder}/{genome}.fa", "cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes", - ] + ], }, "dbnsfp": { DESC: "A database developed for functional prediction and annotation of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome (Gencode release 29/Ensembl 94)", ASSETS: { "dbnsfp": "{genome}_dbNSFP.txt.gz", - "tabix": "{genome}_dbNSFP.txt.gz.tbi" + "tabix": "{genome}_dbNSFP.txt.gz.tbi", }, - REQ_FILES: [ - { - KEY: "dbnsfp", - DESC: "zipped dbSNFP 
database file" - } - ], + REQ_FILES: [{KEY: "dbnsfp", DESC: "zipped dbSNFP database file"}], REQ_ASSETS: [], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", @@ -102,256 +99,179 @@ "rm {asset_outfolder}/dbNSFP*_variant.chr*", "bgzip -@ {threads} {asset_outfolder}/{genome}_dbNSFP.txt", "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_dbNSFP.txt.gz", - "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/hg38_dbNSFP.txt.*'`" - ] + "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/hg38_dbNSFP.txt.*'`", + ], }, "dbsnp": { DESC: "The database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants", - ASSETS: { - "dbsnp": "{genome}_dbSNP.gz", - "tabix": "{genome}_dbSNP.gz.tbi" - }, + ASSETS: {"dbsnp": "{genome}_dbSNP.gz", "tabix": "{genome}_dbSNP.gz.tbi"}, REQ_FILES: [ - { - KEY: "dbsnp_vcf", - DESC: "SNP database file in Variant Call Format (VCF)" - }, - { - KEY: "dbsnp_tbi", - DESC: "tabix index of the dbsnp.vcf file" - } + {KEY: "dbsnp_vcf", DESC: "SNP database file in Variant Call Format (VCF)"}, + {KEY: "dbsnp_tbi", DESC: "tabix index of the dbsnp.vcf file"}, ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "cp {dbsnp_vcf} {asset_outfolder}/{genome}_dbSNP.gz", - "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi" - ] + "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi", + ], }, "bowtie2_index": { DESC: "Genome index for bowtie, produced with bowtie-build", - ASSETS: { - "bowtie2_index": "{genome}" - }, + ASSETS: {"bowtie2_index": "{genome}"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + 
REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "bowtie2-build {fasta} {asset_outfolder}/{genome}" - ] + CMD_LST: ["bowtie2-build {fasta} {asset_outfolder}/{genome}"], }, "bwa_index": { DESC: "Genome index for Burrows-Wheeler Alignment Tool, produced with bwa index", - ASSETS: { - "bwa_index": "{genome}.fa" - }, + ASSETS: {"bwa_index": "{genome}.fa"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", "bwa index {asset_outfolder}/{genome}.fa", - ] - }, + ], + }, "hisat2_index": { DESC: "Genome index for HISAT2, produced with hisat2-build", - ASSETS: { - "hisat2_index": "{genome}" - }, + ASSETS: {"hisat2_index": "{genome}"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "hisat2-build {fasta} {asset_outfolder}/{genome}" - ] + CMD_LST: ["hisat2-build {fasta} {asset_outfolder}/{genome}"], }, "bismark_bt2_index": { DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie2", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "bismark_bt2_index": "." 
- }, + ASSETS: {"bismark_bt2_index": "."}, CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", - "bismark_genome_preparation --bowtie2 {asset_outfolder}" - ] + "bismark_genome_preparation --bowtie2 {asset_outfolder}", + ], }, "bismark_bt1_index": { DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie1", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "bismark_bt1_index": "." - }, + ASSETS: {"bismark_bt1_index": "."}, CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", - "bismark_genome_preparation {asset_outfolder}" - ] - }, + "bismark_genome_preparation {asset_outfolder}", + ], + }, "kallisto_index": { DESC: "Genome index for kallisto, produced with kallisto index", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for transcriptome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"} ], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "kallisto_index": "." - }, + ASSETS: {"kallisto_index": "."}, CMD_LST: [ "kallisto index -i {asset_outfolder}/{genome}_kallisto_index.idx {fasta}" - ] + ], }, "salmon_index": { DESC: "Transcriptome index for salmon, produced with salmon index", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for transcriptome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"} ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_index": "." 
- }, + ASSETS: {"salmon_index": "."}, CMD_LST: [ "salmon index -t {fasta} -i {asset_outfolder} -k {kmer} -p {threads}" - ] + ], }, "salmon_sa_index": { DESC: "Transcriptome index for salmon, produced with salmon index using selective alignment method. Improves quantification accuracy compared to the regular index.", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - }, + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, { KEY: "fasta_txome", DEFAULT: "fasta_txome", - DESC: "fasta asset for transcriptome" - } + DESC: "fasta asset for transcriptome", + }, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_sa_index": "." - }, + ASSETS: {"salmon_sa_index": "."}, CMD_LST: [ "grep '^>' {fasta} | cut -d ' ' -f 1 > {asset_outfolder}/decoys.txt", "sed -i.bak -e 's/>//g' {asset_outfolder}/decoys.txt", "rm {asset_outfolder}/decoys.txt.bak", "cat {fasta_txome} {fasta} > {asset_outfolder}/gentrome.fa", "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}", - "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt" - ] + "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt", + ], }, "salmon_partial_sa_index": { DESC: "Transcriptome index for salmon, produced with salmon index using partial selective alignment method. Preparation includes transcriptome mapping to the genome and extraction of the relevant portion out from the genome and indexing it along with the transcriptome. 
Recipe source -- https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - }, + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, { KEY: "fasta_txome", DEFAULT: "fasta_txome", - DESC: "fasta asset for transcriptome" + DESC: "fasta asset for transcriptome", }, { KEY: "gtf", DEFAULT: "ensembl_gtf", - DESC: "GTF file for exonic features extraction" - } + DESC: "GTF file for exonic features extraction", + }, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_partial_sa_index": "." - }, + ASSETS: {"salmon_partial_sa_index": "."}, CMD_LST: [ "gunzip -c {gtf} > {asset_outfolder}/{genome}.gtf", "awk -v OFS='\t' '{{if ($3==\"exon\") {{print $1,$4,$5}}}}' {asset_outfolder}/{genome}.gtf > {asset_outfolder}/exons.bed", @@ -360,88 +280,86 @@ "awk -v OFS='\t' '{{print $6,$8,$9}}' {asset_outfolder}/mashmap.out | sort -k1,1 -k2,2n - > {asset_outfolder}/genome_found.sorted.bed", "bedtools merge -i {asset_outfolder}/genome_found.sorted.bed > {asset_outfolder}/genome_found_merged.bed", "bedtools getfasta -fi {asset_outfolder}/reference.masked.genome.fa -bed {asset_outfolder}/genome_found_merged.bed -fo {asset_outfolder}/genome_found.fa", - "awk '{{a=$0; getline;split(a, b, \":\"); r[b[1]] = r[b[1]]\"\"$0}} END {{ for (k in r) {{ print k\"\\n\"r[k] }} }}' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa", + 'awk \'{{a=$0; getline;split(a, b, ":"); r[b[1]] = r[b[1]]""$0}} END {{ for (k in r) {{ print k"\\n"r[k] }} }}\' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa', "cat 
{fasta_txome} {asset_outfolder}/decoy.fa > {asset_outfolder}/gentrome.fa", "grep '>' {asset_outfolder}/decoy.fa | awk '{{print substr($1,2); }}' > {asset_outfolder}/decoys.txt", "rm {asset_outfolder}/exons.bed {asset_outfolder}/reference.masked.genome.fa {asset_outfolder}/mashmap.out {asset_outfolder}/genome_found.sorted.bed {asset_outfolder}/genome_found_merged.bed {asset_outfolder}/genome_found.fa {asset_outfolder}/decoy.fa {asset_outfolder}/reference.masked.genome.fa.fai", - "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}" - ] + "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}", + ], }, - "epilog_index": { - DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller", + "tgMap": { + DESC: "Transcript to gene map file, containing two columns mapping of each transcript present in the reference to the corresponding gene.", REQ_FILES: [], REQ_ASSETS: [ { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" + KEY: "salmon_partial_sa_index", + DEFAULT: "salmon_partial_sa_index", + DESC: "partial salmon index asset", } ], + REQ_PARAMS: [], + ASSETS: {"tgMap": "{genome}_txp2gene.tsv"}, + CMD_LST: [ + "grep '^>' {salmon_partial_sa_index}/gentrome.fa | cut -d ' ' -f 1,7 | tr -s ' ' '\\t' | sed 's/[>'gene_symbol:']//g' > {asset_outfolder}/{genome}_txp2gene.tsv", + ], + }, + "epilog_index": { + DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller", + REQ_FILES: [], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [ { KEY: "context", - DEFAULT: 'CG', - DESC: "Substring to index. One or more space-separated strings to index. e.g. 'CG' or 'CG CA CT CC'" + DEFAULT: "CG", + DESC: "Substring to index. One or more space-separated strings to index. e.g. 
'CG' or 'CG CA CT CC'", } ], CONT: "databio/refgenie", - ASSETS: { - "epilog_index": "." - }, + ASSETS: {"epilog_index": "{genome}_{context}.tsv.gz"}, CMD_LST: [ - "epilog index -i {fasta} -o {asset_outfolder}/{genome}_{context}.tsv --context {context} -t" - ] + "epilog index -- --infile {fasta} --outfile {asset_outfolder}/{genome}_{context}.tsv --contexts {context}", + "bgzip {asset_outfolder}/{genome}_{context}.tsv", + "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_{context}.tsv.gz", + ], }, "star_index": { DESC: "Genome index for STAR RNA-seq aligner, produced with STAR --runMode genomeGenerate", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", - ASSETS: { - "star_index": "." 
- }, + ASSETS: {"star_index": "."}, CMD_LST: [ "mkdir -p {asset_outfolder}", - "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}" - ] + "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}", + ], }, "gencode_gtf": { DESC: "GTF annotation asset which provides access to all annotated transcripts which make up an Ensembl gene set.", REQ_FILES: [ { KEY: "gencode_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode" + DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode", } ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "gencode_gtf": "{genome}.gtf.gz" - }, - CMD_LST: [ - "cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz" - ] + ASSETS: {"gencode_gtf": "{genome}.gtf.gz"}, + CMD_LST: ["cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz"], }, "ensembl_gtf": { DESC: "Ensembl GTF, TSS, and gene body annotation", REQ_FILES: [ { KEY: "ensembl_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl" + DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl", } ], REQ_ASSETS: [], @@ -454,36 +372,27 @@ }, CMD_LST: [ "cp {ensembl_gtf} {asset_outfolder}/{genome}.gtf.gz", - "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep 'exon_number \"1\";' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1, $4, $5, $20, $14, $7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '{{if($6==\"+\"){{print $1\"\t\"$2+20\"\t\"$2+120\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$3-120\"\t\"$3-20\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed", - "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk '$3 == \"gene\"' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1,$4,$5,$14,$6,$7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '$4!=\"Metazoa_SRP\"' | awk '$4!=\"U3\"' | awk '$4!=\"7SK\"' | awk '($3-$2)>200' | awk '{{if($6==\"+\"){{print 
$1\"\t\"$2+500\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$2\"\t\"$3-500\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | awk '$3>$2' | LC_COLLATE=C sort -k4 -u > {asset_outfolder}/{genome}_ensembl_gene_body.bed" - ] + 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep \'exon_number "1";\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1, $4, $5, $20, $14, $7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'{{if($6=="+"){{print $1"\t"$2+20"\t"$2+120"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$3-120"\t"$3-20"\t"$4"\t"$5"\t"$6}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed', + 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk \'$3 == "gene"\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1,$4,$5,$14,$6,$7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'$4!="Metazoa_SRP"\' | awk \'$4!="U3"\' | awk \'$4!="7SK"\' | awk \'($3-$2)>200\' | awk \'{{if($6=="+"){{print $1"\t"$2+500"\t"$3"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$2"\t"$3-500"\t"$4"\t"$5"\t"$6}}}}\' | awk \'$3>$2\' | LC_COLLATE=C sort -k4 -u > {asset_outfolder}/{genome}_ensembl_gene_body.bed', + ], }, "ensembl_rb": { DESC: "A regulatory annotation file", REQ_FILES: [ { KEY: "gff", - DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl" + DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl", } ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "ensembl_rb": "{genome}.gff.gz" - }, - CMD_LST: [ - "cp {gff} {asset_outfolder}/{genome}.gff.gz" - ] + ASSETS: {"ensembl_rb": "{genome}.gff.gz"}, + CMD_LST: ["cp {gff} {asset_outfolder}/{genome}.gff.gz"], }, "refgene_anno": { DESC: "gene, TSS, exon, intron, and premature mRNA annotation files", - REQ_FILES: [ - { - KEY: "refgene", - DESC: "gzipped RefGene database annotation file" - } - ], + REQ_FILES: [{KEY: "refgene", DESC: "gzipped RefGene database annotation file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -496,11 +405,11 @@ }, CMD_LST: [ 
"cp {refgene} {asset_outfolder}/{genome}_refGene.txt.gz", - "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk '{{if($4==\"+\"){{print $3\"\t\"$5\"\t\"$5\"\t\"$13\"\t.\t\"$4}}else{{print $3\"\t\"$6\"\t\"$6\"\t\"$13\"\t.\t\"$4}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed", + 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk \'{{if($4=="+"){{print $3"\t"$5"\t"$5"\t"$13"\t.\t"$4}}else{{print $3"\t"$6"\t"$6"\t"$13"\t.\t"$4}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed', "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -v OFS='\t' '{{ n = split($10, a, \",\"); split($11, b, \",\"); for(i=1; i {asset_outfolder}/{genome}_exons.bed", "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -F'\t' '{{ exonCount=int($9);split($10,exonStarts,\"[,]\"); split($11,exonEnds,\"[,]\"); for(i=1;i {asset_outfolder}/{genome}_introns.bed", - "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep 'cmpl' | awk '{{print $3\"\t\"$5\"\t\"$6\"\t\"$13\"\t.\t\"$4}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed" - ] + 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep \'cmpl\' | awk \'{{print $3"\t"$5"\t"$6"\t"$13"\t.\t"$4}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed', + ], }, "suffixerator_index": { DESC: "Enhanced suffix array index for genomes using gt (GenomeTools) suffixerator program", @@ -508,61 +417,45 @@ { KEY: "memlimit", DEFAULT: "8GB", - DESC: "The maximum amount of memory available to be used during index construction." 
+ DESC: "The maximum amount of memory available to be used during index construction.", } ], REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], CONT: "databio/refgenie", - ASSETS: { - "esa": "{genome}.sft" - }, + ASSETS: {"esa": "{genome}.sft"}, CMD_LST: [ "gt suffixerator -dna -pl -tis -suf -lcp -v -showprogress -memlimit {memlimit} -db {fasta} -indexname {asset_outfolder}/{genome}.sft" - ] + ], }, "tallymer_index": { DESC: "Indexed k-mers for a given enhanced suffix array at a fixed value of k", REQ_PARAMS: [ - { - KEY: "mersize", - DEFAULT: "30", - DESC: "The mer size." - }, + {KEY: "mersize", DEFAULT: "30", DESC: "The mer size."}, { KEY: "minocc", DEFAULT: "2", - DESC: "The minimum occurrence number for the mers to index." - } + DESC: "The minimum occurrence number for the mers to index.", + }, ], REQ_FILES: [], REQ_ASSETS: [ { KEY: "esa", DEFAULT: "suffixerator_index", - DESC: "enhanced suffix array index for genome" + DESC: "enhanced suffix array index for genome", }, - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, ], CONT: "databio/refgenie", ASSETS: { "tindex": "{genome}.tal_{mersize}", - "search_file": "{genome}.tal_{mersize}.gtTxt" + "search_file": "{genome}.tal_{mersize}.gtTxt", }, CMD_LST: [ "gt tallymer mkindex -v -counts -pl -mersize {mersize} -minocc {minocc} -indexname {asset_outfolder}/{genome}.tal_{mersize} -esa {esa}/{genome}.sft", - "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt" - ] + "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt", + ], }, "feat_annotation": { DESC: "Combined genomic 
feature annotation created using an Ensembl GTF annotation asset and an Ensembl regulatory build annotation asset", @@ -574,28 +467,28 @@ { KEY: "ensembl_gtf", DEFAULT: "ensembl_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl" + DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl", }, { KEY: "ensembl_rb", DEFAULT: "ensembl_rb", - DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl" - } + DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl", + }, ], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"Exon\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_exons.bed", "gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{ split($20, a, \"\\\"\"); print \"chr\"$1, $4-1, $5, a[2], $6, $7}}' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u | awk 'seen[$4]++ && seen[$4] > 1' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3nr | env LC_COLLATE=C sort -k1,1 -k2,2n -u | env LC_COLLATE=C sort -k1,1 -k3,3n -u | awk -v OFS='\t' '{{if($4==prev4){{new2=prev3+1;}} {{prev4=$4; prev3=$3; print $1, new2, $2-1, \"Intron\", $5, $6}}}}' | awk -F'\t' '$2' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_introns.bed", - "gzip -dcf {ensembl_gtf} | awk '$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed", - "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed", + "gzip -dcf {ensembl_gtf} | awk 
'$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed", + "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"promoter\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_promoter.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"promoter_flanking_region\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter Flanking Region\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_promoter_flanking.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"enhancer\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Enhancer\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_enhancer.bed", "cat {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed | awk -F'\t' '!seen[$1, $2, $3]++' > {asset_outfolder}/{genome}_annotations.bed", "rm -f {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed", - "gzip -f {asset_outfolder}/{genome}_annotations.bed" - ] + "gzip -f {asset_outfolder}/{genome}_annotations.bed", + ], }, "cellranger_reference": { DESC: "Cell Ranger 
custom genome reference for read alignment and gene expression quantification", @@ -607,19 +500,15 @@ { KEY: "gtf", DEFAULT: "gencode_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode" + DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode", }, - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", @@ -627,25 +516,18 @@ "gunzip {gtf} -c > {asset_outfolder}/{genome}.gtf", "cellranger mkgtf {asset_outfolder}/{genome}.gtf {asset_outfolder}/{genome}_filtered.gtf", "rm {asset_outfolder}/{genome}.gtf", - "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} --genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}" - ] + "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} --genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}", + ], }, "blacklist": { DESC: "Atypical, unstructured, or high signal genomic regions present in next-generation sequencing experiments (e.g. 
from ENCODE)", ASSETS: { "blacklist": "{genome}_blacklist.bed.gz", }, - REQ_FILES: [ - { - KEY: "blacklist", - DESC: "gzipped blacklist file" - } - ], + REQ_FILES: [{KEY: "blacklist", DESC: "gzipped blacklist file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz" - ] - } + CMD_LST: ["cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz"], + }, } diff --git a/refgenie/build_all_genome.py b/refgenie/build_all_genome.py index 12be1c6c..98cf968c 100644 --- a/refgenie/build_all_genome.py +++ b/refgenie/build_all_genome.py @@ -8,23 +8,71 @@ import argparse import divvy -parser = argparse.ArgumentParser(description='Builds submission scripts for all assets for a genome') -parser.add_argument('-g', '--genome', dest="genome", type=str, - help='genome to build the submission scripts for') -parser.add_argument('-p', '--path', dest="path", type=str, - help='path to the desired submission directory location') -parser.add_argument('-pt', '--partition', dest="PARTITION", type=str, - help='partition in SLURM submission script', default="standard") -parser.add_argument('-m', '--mem', dest="MEM", type=str, - help='mem in SLURM submission script', default="200000") -parser.add_argument('-t', '--time', dest="TIME", type=str, - help='time in SLURM submission script', default="10:00:00") -parser.add_argument('-c', '--cores', dest="CORES", type=str, - help='cpus-per-task in SLURM submission script', default="4") -parser.add_argument('-o', '--output', dest="LOGFILE", type=str, - help='output in SLURM submission script', default=None) -parser.add_argument('-j', '--job-name', dest="JOBNAME", type=str, - help='job-name in SLURM submission script', default=None) +parser = argparse.ArgumentParser( + description="Builds submission scripts for all assets for a genome" +) +parser.add_argument( + "-g", + "--genome", + dest="genome", + type=str, + help="genome to build the submission scripts for", +) 
+parser.add_argument( + "-p", + "--path", + dest="path", + type=str, + help="path to the desired submission directory location", +) +parser.add_argument( + "-pt", + "--partition", + dest="PARTITION", + type=str, + help="partition in SLURM submission script", + default="standard", +) +parser.add_argument( + "-m", + "--mem", + dest="MEM", + type=str, + help="mem in SLURM submission script", + default="200000", +) +parser.add_argument( + "-t", + "--time", + dest="TIME", + type=str, + help="time in SLURM submission script", + default="10:00:00", +) +parser.add_argument( + "-c", + "--cores", + dest="CORES", + type=str, + help="cpus-per-task in SLURM submission script", + default="4", +) +parser.add_argument( + "-o", + "--output", + dest="LOGFILE", + type=str, + help="output in SLURM submission script", + default=None, +) +parser.add_argument( + "-j", + "--job-name", + dest="JOBNAME", + type=str, + help="job-name in SLURM submission script", + default=None, +) args = parser.parse_args() @@ -69,8 +117,10 @@ def _req_input_to_args(req_input): sub_script = os.path.join(subdir_path, asset + ".sub") req_input = asset_build_packages[asset]["required_inputs"] if req_input: - print("{} asset requires additional input in the command ({}), so '{}'" - " requires manual edit".format(asset, req_input, sub_script)) + print( + "{} asset requires additional input in the command ({}), so '{}'" + " requires manual edit".format(asset, req_input, sub_script) + ) req_str = " ".join(_req_input_to_args(req_input)) else: req_str = "" diff --git a/refgenie/cli.py b/refgenie/cli.py new file mode 100644 index 00000000..cb6a2b70 --- /dev/null +++ b/refgenie/cli.py @@ -0,0 +1,423 @@ +import logmuse +import sys +import json +import os + +from .argparser import build_argparser +from .refgenie import parse_registry_path, _skip_lock +from ._version import __version__ +from .const import * +from .exceptions import * +from .asset_build_packages import * +from .refgenie import refgenie_build +from .helpers 
import _raise_missing_recipe_error, _single_folder_writeable + +from refgenconf import ( + RefGenConf, + MissingAssetError, + MissingGenomeError, + DownloadJsonError, + upgrade_config, + __version__ as rgc_version, + select_genome_config, +) +from ubiquerg import query_yes_no +from requests.exceptions import MissingSchema + +from collections import OrderedDict +from rich.console import Console + + +def main(): + """ Primary workflow """ + parser = logmuse.add_logging_options(build_argparser()) + args, remaining_args = parser.parse_known_args() + global _LOGGER + _LOGGER = logmuse.logger_via_cli(args, make_root=True) + _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}") + _LOGGER.debug(f"Args: {args}") + + if not args.command: + parser.print_help() + _LOGGER.error("No command given") + sys.exit(1) + + if args.command == ALIAS_CMD and not args.subcommand: + parser.print_help() + _LOGGER.error("No alias subcommand command given") + sys.exit(1) + + gencfg = select_genome_config( + filename=args.genome_config, + check_exist=not args.command == INIT_CMD, + on_missing=lambda fp: fp, + strict_env=True, + ) + if gencfg is None: + raise MissingGenomeConfigError(args.genome_config) + _LOGGER.debug("Determined genome config: {}".format(gencfg)) + + skip_read_lock = _skip_lock(args.skip_read_lock, gencfg) + + # From user input we want to construct a list of asset dicts, where each + # asset has a genome name, asset name, and tag + if "asset_registry_paths" in args and args.asset_registry_paths: + _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) + asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] + + for a in asset_list: + # every asset must have a genome, either provided via registry path + # or the args.genome arg. + if not a["genome"]: + if args.genome: + a["genome"] = args.genome + else: + _LOGGER.error( + "Provided asset registry path ({}/{}:{}) is invalid. 
See help for usage reference.".format( + a["genome"], a["asset"], a["tag"] + ) + ) + sys.exit(1) + else: + if args.genome and args.genome != a["genome"]: + _LOGGER.warn( + "Two different genomes specified for asset '{}'.".format( + a["asset"] + ) + ) + + else: + if args.command in GENOME_ONLY_REQUIRED and not args.genome: + parser.error("You must provide either a genome or a registry path") + sys.exit(1) + if args.command in ASSET_REQUIRED: + parser.error("You must provide an asset registry path") + sys.exit(1) + + if args.command == INIT_CMD: + _LOGGER.debug("Initializing refgenie genome configuration") + entries = OrderedDict( + { + CFG_VERSION_KEY: REQ_CFG_VERSION, + CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), + CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], + CFG_GENOMES_KEY: None, + } + ) + if args.settings_json: + if os.path.isfile(args.settings_json): + with open(args.settings_json, "r") as json_file: + data = json.load(json_file) + entries.update(data) + else: + raise FileNotFoundError( + "JSON file with config init settings does not exist: {}".format( + args.settings_json + ) + ) + if args.genome_folder: + entries.update({CFG_FOLDER_KEY: args.genome_folder}) + if args.remote_url_base: + entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) + if args.genome_archive_folder: + entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) + if args.genome_archive_config: + entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) + _LOGGER.debug("initializing with entries: {}".format(entries)) + rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) + rgc.initialize_config_file(os.path.abspath(gencfg)) + + elif args.command == BUILD_CMD: + if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]): + _LOGGER.error("Build can only build assets for one genome") + sys.exit(1) + recipe_name = None + if args.recipe: + if len(asset_list) > 1: + _LOGGER.error("Recipes cannot be specified for 
multi-asset builds") + sys.exit(1) + recipe_name = args.recipe + if args.requirements: + for a in asset_list: + recipe = recipe_name or a["asset"] + if recipe not in asset_build_packages.keys(): + _raise_missing_recipe_error(recipe) + _LOGGER.info("'{}' recipe requirements: ".format(recipe)) + _make_asset_build_reqs(recipe) + sys.exit(0) + refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) + + elif args.command == GET_ASSET_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + check = args.check_exists if args.check_exists else None + for a in asset_list: + _LOGGER.debug( + "getting asset: '{}/{}.{}:{}'".format( + a["genome"], a["asset"], a["seek_key"], a["tag"] + ) + ) + print( + rgc.seek( + a["genome"], + a["asset"], + a["tag"], + a["seek_key"], + strict_exists=check, + ) + ) + return + + elif args.command == INSERT_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + + if len(asset_list) > 1: + raise NotImplementedError("Can only add 1 asset at a time") + else: + sk = args.seek_keys + if sk: + sk = json.loads(args.seek_keys) + rgc.add( + path=args.path, + genome=asset_list[0]["genome"], + asset=asset_list[0]["asset"], + tag=asset_list[0]["tag"], + seek_keys=sk, + force=args.force, + ) + + elif args.command == PULL_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + + # existing assets overwriting + if args.no_overwrite: + force = False + elif args.force_overwrite: + force = True + else: + force = None + # large archive pulling + if args.no_large: + force_large = False + elif args.pull_large: + force_large = True + else: + force_large = None + # batch mode takes precedence over other choices + if args.batch: + force_large = True + force = False + + outdir = rgc.data_dir + if not os.path.exists(outdir): + raise MissingFolderError(outdir) + if not perm_check_x(outdir): + return + if not _single_folder_writeable(outdir): + 
_LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) + return + + for a in asset_list: + rgc.pull( + a["genome"], + a["asset"], + a["tag"], + force=force, + force_large=force_large, + size_cutoff=args.size_cutoff, + ) + + elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + console = Console() + if args.command == LIST_REMOTE_CMD: + num_servers = 0 + bad_servers = [] + for server_url in rgc[CFG_SERVERS_KEY]: + num_servers += 1 + try: + table = rgc.get_asset_table( + genomes=args.genome, server_url=server_url + ) + except (DownloadJsonError, ConnectionError, MissingSchema): + bad_servers.append(server_url) + continue + else: + console.print(table) + if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: + _LOGGER.error( + "Could not list assets from the following servers: {}".format( + bad_servers + ) + ) + else: + if args.recipes: + print(", ".join(sorted(list(asset_build_packages.keys())))) + else: + console.print(rgc.get_asset_table(genomes=args.genome)) + + elif args.command == GETSEQ_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + print(rgc.getseq(args.genome, args.locus)) + + elif args.command == REMOVE_CMD: + force = args.force + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + for a in asset_list: + a["tag"] = a["tag"] or rgc.get_default_tag( + a["genome"], a["asset"], use_existing=False + ) + _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) + if a["seek_key"] is not None: + raise NotImplementedError("You can't remove a specific seek_key.") + gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} + try: + if not rgc.is_asset_complete(**gat): + with rgc as r: + r.cfg_remove_assets(**gat) + _LOGGER.info( + "Removed an incomplete asset " + "'{genome}/{asset}:{tag}'".format(*gat) + ) + return + except (KeyError, MissingAssetError, MissingGenomeError): + 
_LOGGER.info( + "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat) + ) + return + if len(asset_list) > 1: + if not query_yes_no( + "Are you sure you want to remove {} assets?".format(len(asset_list)) + ): + _LOGGER.info("Action aborted by the user") + return + force = True + for a in asset_list: + rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) + + elif args.command == TAG_CMD: + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + if len(asset_list) > 1: + raise NotImplementedError("Can only tag 1 asset at a time") + if args.default: + # set the default tag and exit + with rgc as r: + r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) + sys.exit(0) + rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) + + elif args.command == ID_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + if len(asset_list) == 1: + g, a = asset_list[0]["genome"], asset_list[0]["asset"] + t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) + print(rgc.id(g, a, t)) + return + for asset in asset_list: + g, a = asset["genome"], asset["asset"] + t = asset["tag"] or rgc.get_default_tag(g, a) + print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) + return + elif args.command == SUBSCRIBE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc.subscribe(urls=args.genome_server, reset=args.reset) + return + elif args.command == UNSUBSCRIBE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc.unsubscribe(urls=args.genome_server) + return + elif args.command == ALIAS_CMD: + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + if args.subcommand == ALIAS_GET_CMD: + if args.aliases is not None: + for a in args.aliases: + print(rgc.get_genome_alias_digest(alias=a)) + return + console = Console() + console.print(rgc.genome_aliases_table) + + if args.subcommand == ALIAS_SET_CMD: + 
rgc.set_genome_alias( + digest=args.digest, + genome=args.aliases, + reset_digest=args.reset, + create_genome=args.force, + ) + return + elif args.subcommand == ALIAS_REMOVE_CMD: + rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) + return + + elif args.command == COMPARE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + res = rgc.compare( + args.genome1[0], args.genome2[0], explain=not args.no_explanation + ) + if args.no_explanation: + print(res) + + elif args.command == UPGRADE_CMD: + upgrade_config( + target_version=args.target_version, filepath=gencfg, force=args.force + ) + + +def perm_check_x(file_to_check, message_tag="genome directory"): + """ + Check X_OK permission on a path, providing according messaging and bool val. + + :param str file_to_check: path to query for permission + :param str message_tag: context for error message if check fails + :return bool: os.access(path, X_OK) for the given path + :raise ValueError: if there's no filepath to check for permission + """ + if not file_to_check: + msg = "You must provide a path to {}".format(message_tag) + _LOGGER.error(msg) + raise ValueError(msg) + if not os.access(file_to_check, os.X_OK): + _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) + return False + return True + + +def _make_asset_build_reqs(asset): + """ + Prepare requirements and inputs lists and display it + + :params str asset: name of the asset + """ + + def _format_reqs(req_list): + """ + + :param list[dict] req_list: + :return list[str]: + """ + templ = "\t{} ({})" + return [ + templ.format(req[KEY], req[DESC]) + if DEFAULT not in req + else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) + for req in req_list + ] + + reqs_list = [] + if asset_build_packages[asset][REQ_FILES]: + reqs_list.append( + "- files:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])) + ) + ) + if 
asset_build_packages[asset][REQ_ASSETS]: + reqs_list.append( + "- assets:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])) + ) + ) + if asset_build_packages[asset][REQ_PARAMS]: + reqs_list.append( + "- params:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])) + ) + ) + _LOGGER.info("\n".join(reqs_list)) diff --git a/refgenie/const.py b/refgenie/const.py index 830f6fe5..d8ec6600 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -1,9 +1,11 @@ """ -Constant variables for refgenie package. -Ones that are integral to refgenconf and/or refgenieserver should be defined in refgenconf.const +Constant variables for refgenie package. Ones that are integral to refgenconf +and/or refgenieserver should be defined in refgenconf.const """ from refgenconf.const import * +PKG_NAME = "refgenie" + BUILD_CMD = "build" INIT_CMD = "init" PULL_CMD = "pull" @@ -17,6 +19,9 @@ ID_CMD = "id" SUBSCRIBE_CMD = "subscribe" UNSUBSCRIBE_CMD = "unsubscribe" +ALIAS_CMD = "alias" +COMPARE_CMD = "compare" +UPGRADE_CMD = "upgrade" GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] @@ -37,4 +42,17 @@ ID_CMD: "Return the asset digest.", SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", + ALIAS_CMD: "Interact with aliases.", + COMPARE_CMD: "Compare two genomes.", + UPGRADE_CMD: "Upgrade config. 
This will alter the files on disk.", +} + +ALIAS_GET_CMD = "get" +ALIAS_SET_CMD = "set" +ALIAS_REMOVE_CMD = "remove" + +ALIAS_SUBPARSER_MESSAGES = { + ALIAS_REMOVE_CMD: "Remove aliases.", + ALIAS_SET_CMD: "Set aliases.", + ALIAS_GET_CMD: "Get aliases.", } diff --git a/refgenie/exceptions.py b/refgenie/exceptions.py index 30d32291..7c17797a 100644 --- a/refgenie/exceptions.py +++ b/refgenie/exceptions.py @@ -1,10 +1,11 @@ from refgenconf import CFG_ENV_VARS -__all__ = ["RefgenieError", "MissingGenomeConfigError"] +__all__ = ["RefgenieError", "MissingGenomeConfigError", "MissingFolderError"] class RefgenieError(Exception): """ Base refgenie exception type """ + pass @@ -17,8 +18,9 @@ def __init__(self, conf_file=None): :param str conf_file: path attempted to be used as genome config file """ - msg = "You must provide a config file either as an argument or via an environment variable: {}"\ - .format(", ".join(CFG_ENV_VARS)) + msg = "You must provide a config file either as an argument or via an environment variable: {}".format( + ", ".join(CFG_ENV_VARS) + ) if conf_file: msg = "Not a file {} -- {}.".format(conf_file, msg) super(MissingGenomeConfigError, self).__init__(msg) @@ -32,4 +34,3 @@ def __init__(self, folder): :param str folder: path attempted to be used as folder to save a file to """ super(MissingFolderError, self).__init__(folder) - diff --git a/refgenie/helpers.py b/refgenie/helpers.py new file mode 100644 index 00000000..6a24ae12 --- /dev/null +++ b/refgenie/helpers.py @@ -0,0 +1,63 @@ +import os + +from refgenconf import MissingRecipeError +from ubiquerg import is_writable + +from .asset_build_packages import asset_build_packages +from .exceptions import MissingFolderError + + +def _parse_user_build_input(input): + """ + Parse user input specification. Used in build for specific parents and input parsing. + + :param Iterable[Iterable[str], ...] input: user command line input, + formatted as follows: [[fasta=txt, test=txt], ...] 
+ :return dict: mapping of keys, which are input names and values + """ + lst = [] + for i in input or []: + lst.extend(i) + return ( + {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} + if lst is not None + else lst + ) + + +def _single_folder_writeable(d): + return os.access(d, os.W_OK) and os.access(d, os.X_OK) + + +def _writeable(outdir, strict_exists=False): + outdir = outdir or "." + if os.path.exists(outdir): + return _single_folder_writeable(outdir) + elif strict_exists: + raise MissingFolderError(outdir) + return _writeable(os.path.dirname(outdir), strict_exists) + + +def _raise_missing_recipe_error(recipe): + """ + Raise an error for a missing recipe, when one is requested + + :param str recipe: recipe name + :raise MissingRecipeError: always + """ + raise MissingRecipeError( + f"Recipe '{recipe}' not found. Available recipes: " + f"{', '.join(list(asset_build_packages.keys()))}" + ) + + +def _skip_lock(skip_arg, cfg): + """ + If config read lock skip was not forced, check if dir is writable and set + the default to the result + + :param bool skip_arg: argument selected on the CLI + :param str cfg: path to the confjg + :return bool: decision -- whether to skip the file lock for read + """ + return is_writable(os.path.dirname(cfg)) if not skip_arg else True diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index a5e09d7a..9f62caa1 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -1,270 +1,64 @@ -#!/usr/bin/env python - -from collections import OrderedDict -from shutil import rmtree -from re import sub -from requests import ConnectionError import os import sys import csv import signal import json -from ._version import __version__ -from .exceptions import MissingGenomeConfigError, MissingFolderError from .asset_build_packages import * from .const import * +from .helpers import ( + _raise_missing_recipe_error, + _skip_lock, + _parse_user_build_input, + _writeable, +) -import logmuse import pypiper import refgenconf -from 
refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ - MissingRecipeError, DownloadJsonError, get_dir_digest -from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ - VersionInHelpParser, is_command_callable +from refgenconf import ( + RefGenConf, + get_dir_digest, +) +from ubiquerg import parse_registry_path as prp from ubiquerg.system import is_writable -from .refget import fasta_checksum - -_LOGGER = None - - -def build_argparser(): - """ - Builds argument parser. - - :return argparse.ArgumentParser - """ - - banner = "%(prog)s - reference genome asset manager" - additional_description = "\nhttps://refgenie.databio.org" - - parser = VersionInHelpParser( - prog="refgenie", - version=__version__, - description=banner, - epilog=additional_description) - - subparsers = parser.add_subparsers(dest="command") - - def add_subparser(cmd, description): - return subparsers.add_parser( - cmd, description=description, help=description) - - sps = {} - for cmd, desc in SUBPARSER_MESSAGES.items(): - sps[cmd] = add_subparser(cmd, desc) - # It's required for init - sps[cmd].add_argument( - '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", - help="Path to local genome configuration file. Optional if {} environment variable is set." - .format(", ".join(refgenconf.CFG_ENV_VARS))) - - sps[INIT_CMD].add_argument('-s', '--genome-server', nargs='+', default=DEFAULT_SERVER, - help="URL(s) to use for the {} attribute in config file. Default: {}." 
- .format(CFG_SERVERS_KEY, DEFAULT_SERVER)) - sps[INIT_CMD].add_argument('-f', '--genome-folder', - help="Absolute path to parent folder refgenie-managed assets.") - sps[INIT_CMD].add_argument('-a', '--genome-archive-folder', - help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.") - sps[INIT_CMD].add_argument('-b', '--genome-archive-config', - help="Absolute path to desired archive config file; used by refgenieserver.") - sps[INIT_CMD].add_argument('-u', '--remote-url-base', - help="URL to use as an alternative, remote archive location; used by refgenieserver.") - sps[INIT_CMD].add_argument('-j', '--settings-json', - help="Absolute path to a JSON file with the key " - "value pairs to inialize the configuration " - "file with. Overwritten by itemized specifications.") - sps[BUILD_CMD] = pypiper.add_pypiper_args( - sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"]) - - # Add any arguments specific to subcommands. - - sps[BUILD_CMD].add_argument( - '--tag-description', required=False, default=None, type=str, - help="Add tag level description (e.g. built with version 0.3.2).") - - sps[BUILD_CMD].add_argument( - '--genome-description', required=False, default=None, type=str, - help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).") - - sps[BUILD_CMD].add_argument( - "-d", "--docker", action="store_true", help="Run all commands in the refgenie docker container.") - - sps[BUILD_CMD].add_argument( - '--assets', nargs="+", action='append', required=False, default=None, - help='Override the default genome, asset and tag of the parents' - ' (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).') - - sps[BUILD_CMD].add_argument( - '--files', nargs="+", action='append', required=False, default=None, - help='Provide paths to the required files (e.g. 
fasta=/path/to/file.fa.gz).') - - sps[BUILD_CMD].add_argument( - '--params', nargs="+", action='append', required=False, default=None, - help='Provide required parameter values (e.g. param1=value1).') - - sps[BUILD_CMD].add_argument( - '-v', '--volumes', nargs="+", required=False, default=None, - help='If using docker, also mount these folders as volumes.') - - sps[BUILD_CMD].add_argument( - '-o', '--outfolder', dest='outfolder', required=False, default=None, - help='Override the default path to genomes folder, which is the ' - 'genome_folder attribute in the genome configuration file.') - - sps[BUILD_CMD].add_argument( - "-q", "--requirements", action="store_true", - help="Show the build requirements for the specified asset and exit.") - - sps[BUILD_CMD].add_argument( - "-r", "--recipe", required=False, default=None, type=str, - help="Provide a recipe to use.") - - # add 'genome' argument to many commands - for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: - # genome is not required for listing actions - sps[cmd].add_argument( - "-g", "--genome", required=cmd in GETSEQ_CMD, metavar="G", - help="Reference assembly ID, e.g. mm10.") - - for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: - sps[cmd].add_argument("-g", "--genome", required=False, type=str, - nargs="*", help="Reference assembly ID, e.g. mm10.") - - for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, TAG_CMD, ID_CMD]: - sps[cmd].add_argument( - "asset_registry_paths", metavar="asset-registry-paths", type=str, nargs='+', - help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" - + (" or hg38/fasta.fai:tag)." 
if cmd == GET_ASSET_CMD else ").")) - - for cmd in [REMOVE_CMD, INSERT_CMD]: - sps[cmd].add_argument( - "-f", "--force", action="store_true", - help="Do not prompt before action, approve upfront.") - - force_group = sps[PULL_CMD].add_argument_group( - title="Prompt handling", - description="These flags configure the pull prompt responses.") - - overwrite_group = force_group.add_mutually_exclusive_group() - - overwrite_group.add_argument("--no-overwrite", action="store_true", - help="Do not overwrite if asset exists.") - - overwrite_group.add_argument("--force-overwrite", action="store_true", - help="Overwrite if asset exists.") - - large_group = force_group.add_mutually_exclusive_group() - - large_group.add_argument("--no-large", action="store_true", - help="Do not pull archives over 5GB.") - - large_group.add_argument("--pull-large", action="store_true", - help="Pull any archive, regardless of its size.") - - force_group.add_argument("--size-cutoff", type=float, default=10, metavar="S", - help="Maximum archive file size to download with no confirmation required (in GB, default: 10)") - - force_group.add_argument("-b", "--batch", action="store_true", - help="Use batch mode: pull large archives, do no overwrite") - - sps[INSERT_CMD].add_argument( - "-p", "--path", required=True, metavar="P", - help="Relative local path to asset.") - - sps[INSERT_CMD].add_argument( - "-s", "--seek-keys", required=False, type=str, metavar="S", - help=""" - String representation of a JSON object with seek_keys, - e.g. '{"seek_key1": "file.txt"}') - """) - - sps[GETSEQ_CMD].add_argument( - "-l", "--locus", required=True, - help="Coordinates of desired sequence; e.g. 
'chr1:50000-50200'.") - - sps[GET_ASSET_CMD].add_argument( - "-e", "--check-exists", required=False, action="store_true", - help="Whether the returned asset path should be checked for existence on disk.") - - group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) - - group.add_argument( - "-t", "--tag", type=str, - help="Tag to assign to an asset.") +from yacman import UndefinedAliasError +from logging import getLogger - group.add_argument( - "-d", "--default", action="store_true", - help="Set the selected asset tag as the default one.") - - sps[SUBSCRIBE_CMD].add_argument( - "-r", "--reset", action="store_true", - help="Overwrite the current list of server URLs.") - - for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]: - sps[cmd].add_argument( - "-s", "--genome-server", nargs='+', required=True, - help="One or more URLs to {action} the {key} attribute in config file.". - format(action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY)) - - return parser +_LOGGER = getLogger(PKG_NAME) def parse_registry_path(path): - return prp(path, defaults=[ - ("protocol", None), - ("genome", None), - ("asset", None), - ("seek_key", None), - ("tag", None)]) - - -def copy_or_download_file(input_string, outfolder): - """ - Given an input file, which can be a local file or a URL, and output folder, - this downloads or copies the file into the output folder. - - :param str input_string: Can be either a URL or a path to a local file - :param str outfolder: Where to store the result. - :return str, str: output/result file and command - """ - result_file = os.path.join(outfolder, os.path.basename(input_string)) - parts = ["wget -O", result_file, input_string] \ - if is_url(input_string) else ["cp", input_string, result_file] - return result_file, " ".join(parts) - - -def convert_file(input_fasta, output_file, conversions): - """ - Given an input file, output file, and a list of conversions, gives the appropriate output file. 
- - :param str output_file: Path to local output file you want to create - :param dict conversions: A dictionary of shell commands to convert files of a given type. - """ - form = {"INPUT": input_fasta, "OUTPUT": output_file} - _, ext = os.path.splitext(input_fasta) - if ext in conversions: - return conversions[ext].format(**form) - - -def default_config_file(): - """ - Path to default compute environment settings file. - - :return str: Path to default compute settings file - """ - return os.path.join(os.path.dirname(__file__), "refgenie.yaml") - - -def get_asset_vars(genome, asset_key, tag, outfolder, specific_args=None, specific_params=None, **kwargs): + return prp( + path, + defaults=[ + ("protocol", None), + ("genome", None), + ("asset", None), + ("seek_key", None), + ("tag", None), + ], + ) + + +def get_asset_vars( + genome, + asset_key, + tag, + outfolder, + specific_args=None, + specific_params=None, + **kwargs, +): """ Gives a dict with variables used to populate an asset path. """ asset_outfolder = os.path.join(outfolder, asset_key, tag) - asset_vars = {"genome": genome, - "asset": asset_key, - "tag": tag, - "asset_outfolder": asset_outfolder} + asset_vars = { + "genome": genome, + "asset": asset_key, + "tag": tag, + "asset_outfolder": asset_outfolder, + } if specific_args: asset_vars.update(specific_args) if specific_params: @@ -287,7 +81,7 @@ def refgenie_initg(rgc, genome, content_checksums): :param str genome: name of the genome :param dict content_checksums: checksums of individual content_checksums, e.g. 
chromosomes """ - genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome) + genome_dir = os.path.join(rgc.data_dir, genome) if is_writable(genome_dir): output_file = os.path.join(genome_dir, "{}_sequence_digests.tsv".format(genome)) with open(output_file, "w") as contents_file: @@ -296,7 +90,11 @@ def refgenie_initg(rgc, genome, content_checksums): wr.writerow([key, val]) _LOGGER.debug("sequence digests saved to: {}".format(output_file)) else: - _LOGGER.warning("Could not save the genome sequence digests. '{}' is not writable".format(genome_dir)) + _LOGGER.warning( + "Could not save the genome sequence digests. '{}' is not writable".format( + genome_dir + ) + ) def refgenie_build(gencfg, genome, asset_list, recipe_name, args): @@ -306,23 +104,44 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): :param str gencfg: path to the genome configuration file :param argparse.Namespace args: parsed command-line options/arguments """ - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf( + filepath=gencfg, + writable=False, + skip_read_lock=_skip_lock(args.skip_read_lock, gencfg), + ) specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) - if not hasattr(args, "outfolder") or not args.outfolder: - # Default to genome_folder - _LOGGER.debug("No outfolder provided, using genome config.") - args.outfolder = rgc[CFG_FOLDER_KEY] + def _read_json_file(filepath): + """ + Read a JSON file - _LOGGER.debug("Default config file: {}".format(default_config_file())) + :param str filepath: path to the file to read + :return dict: read data + """ + with open(filepath, "r") as f: + data = json.load(f) + return data - if args.config_file and not os.path.isfile(args.config_file): - _LOGGER.debug("Config file path isn't a file: {}". 
- format(args.config_file)) - args.config_file = default_config_file() + if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(".json"): + recipe_name = _read_json_file(filepath=recipe_name) - def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, **kwargs): + if not hasattr(args, "outfolder") or not args.outfolder: + # Default to genome_folder + _LOGGER.debug("No outfolder provided, using genome config.") + args.outfolder = rgc.data_dir + + def _build_asset( + genome, + asset_key, + tag, + build_pkg, + genome_outfolder, + specific_args, + specific_params, + alias, + **kwargs, + ): """ Builds assets with pypiper and updates a genome config file. @@ -336,8 +155,14 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar assets. """ - log_outfolder = os.path.abspath(os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR)) - _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(genome_outfolder, log_outfolder)) + log_outfolder = os.path.abspath( + os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR) + ) + _LOGGER.info( + "Saving outputs to:\n- content: {}\n- logs: {}".format( + genome_outfolder, log_outfolder + ) + ) if args.docker: # Set up some docker stuff if args.volumes: @@ -347,29 +172,49 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar volumes = genome_outfolder if not _writeable(genome_outfolder): - _LOGGER.error("Insufficient permissions to write to output folder: {}". 
- format(genome_outfolder)) + _LOGGER.error( + "Insufficient permissions to write to output folder: {}".format( + genome_outfolder + ) + ) return - pm = pypiper.PipelineManager(name="refgenie", outfolder=log_outfolder, args=args) + pm = pypiper.PipelineManager( + name="refgenie", outfolder=log_outfolder, args=args + ) tk = pypiper.NGSTk(pm=pm) if args.docker: pm.get_container(build_pkg[CONT], volumes) _LOGGER.debug("Asset build package: " + str(build_pkg)) - gat = [genome, asset_key, tag] # create a bundle list to simplify calls below + # create a bundle list to simplify calls below + gat = [genome, asset_key, tag] # collect variables required to populate the command templates - asset_vars = get_asset_vars(genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs) + asset_vars = get_asset_vars( + genome, + asset_key, + tag, + genome_outfolder, + specific_args, + specific_params, + **kwargs, + ) # populate command templates # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method - command_list_populated = [x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) - for x in build_pkg[CMD_LST]] + command_list_populated = [ + x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) + for x in build_pkg[CMD_LST] + ] # create output directory tk.make_dir(asset_vars["asset_outfolder"]) - target = os.path.join(log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)) + target = os.path.join( + log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag) + ) # add target command command_list_populated.append("touch {target}".format(target=target)) - _LOGGER.debug("Command populated: '{}'".format(" ".join(command_list_populated))) + _LOGGER.debug( + "Command populated: '{}'".format(" ".join(command_list_populated)) + ) try: # run build command signal.signal(signal.SIGINT, _handle_sigint(gat)) @@ -380,523 +225,248 @@ def build_asset(genome, asset_key, tag, build_pkg, 
genome_outfolder, specific_ar else: # save build recipe to the JSON-formatted file recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag) - with open(os.path.join(log_outfolder, recipe_file_name), 'w') as outfile: + with open(os.path.join(log_outfolder, recipe_file_name), "w") as outfile: json.dump(build_pkg, outfile) - # in order to prevent locking the config file for writing once while - # being able to use the seek method for digest calculation we - # create a temporary object to run seek on. - tmp_rgc = RefGenConf() - tmp_rgc[CFG_FOLDER_KEY] = rgc[CFG_FOLDER_KEY] - tmp_rgc.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key}) - tmp_rgc.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) - digest = get_dir_digest( - _seek(tmp_rgc, genome, asset_key, tag, enclosing_dir=True), pm) + # since the assets are always built to a standard dir structure, we + # can just stitch a path together for asset digest calculation + asset_dir = os.path.join(rgc.data_dir, *gat) + if not os.path.exists(asset_dir): + raise OSError( + "Could not compute asset digest. 
Path does not " + "exist: {}".format(asset_dir) + ) + digest = get_dir_digest(asset_dir) _LOGGER.info("Asset digest: {}".format(digest)) - del tmp_rgc # add updates to config file with rgc as r: - r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}) - r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, - CFG_ASSET_CHECKSUM_KEY: digest}) - r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) - r.set_default_pointer(*gat) + if asset_key == "fasta": + r.update_genomes( + genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome + ) + r.update_assets( + *gat[0:2], + data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, + force_digest=genome, + ) + r.update_tags( + *gat, + force_digest=genome, + data={ + CFG_ASSET_PATH_KEY: asset_key, + CFG_ASSET_CHECKSUM_KEY: digest, + }, + ) + r.update_seek_keys( + *gat, + force_digest=genome, + keys={ + k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items() + }, + ) + r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True for a in asset_list: asset_key = a["asset"] - asset_tag = a["tag"] or rgc.get_default_tag(genome, a["asset"], use_existing=False) + asset_tag = a["tag"] or rgc.get_default_tag( + genome, a["asset"], use_existing=False + ) recipe_name = recipe_name or asset_key - if recipe_name in asset_build_packages.keys(): - asset_build_package = _check_recipe(asset_build_packages[recipe_name]) + if isinstance(recipe_name, dict) or ( + isinstance(recipe_name, str) and recipe_name in asset_build_packages.keys() + ): + if isinstance(recipe_name, dict): + _LOGGER.info("Using custom recipe: \n{}".format(recipe_name)) + asset_build_package = _check_recipe(recipe_name) + recipe_name = asset_build_package["name"] + else: + asset_build_package = _check_recipe(asset_build_packages[recipe_name]) # handle user-requested parents for the required assets input_assets = {} parent_assets = [] specified_asset_keys, specified_assets = None, None if 
args.assets is not None: parsed_parents_input = _parse_user_build_input(args.assets) - specified_asset_keys, specified_assets = \ - list(parsed_parents_input.keys()), list(parsed_parents_input.values()) - _LOGGER.debug("Custom assets requested: {}".format(args.assets)) + specified_asset_keys = list(parsed_parents_input.keys()) + specified_assets = list(parsed_parents_input.values()) + _LOGGER.debug(f"Custom assets requested: {args.assets}") if not specified_asset_keys and isinstance(args.assets, list): - _LOGGER.warning("Specified parent assets format is invalid. Using defaults.") + _LOGGER.warning( + "Specified parent assets format is invalid. Using defaults." + ) for req_asset in asset_build_package[REQ_ASSETS]: req_asset_data = parse_registry_path(req_asset[KEY]) # for each req asset see if non-default parents were requested - if specified_asset_keys is not None and req_asset_data["asset"] in specified_asset_keys: - parent_data = \ - parse_registry_path(specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) - g, a, t, s = parent_data["genome"], \ - parent_data["asset"], \ - parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \ - parent_data["seek_key"] + if ( + specified_asset_keys is not None + and req_asset_data["asset"] in specified_asset_keys + ): + parent_data = parse_registry_path( + specified_assets[ + specified_asset_keys.index(req_asset_data["asset"]) + ] + ) + g, a, t, s = ( + parent_data["genome"], + parent_data["asset"], + parent_data["tag"] + or rgc.get_default_tag(genome, parent_data["asset"]), + parent_data["seek_key"], + ) else: # if no custom parents requested for the req asset, use default one default = parse_registry_path(req_asset[DEFAULT]) - g, a, t, s = genome, default["asset"], \ - rgc.get_default_tag(genome, default["asset"]), \ - req_asset_data["seek_key"] - parent_assets.append("{}/{}:{}".format(g, a, t)) + g, a, t, s = ( + genome, + default["asset"], + rgc.get_default_tag(genome, default["asset"]), 
+ req_asset_data["seek_key"], + ) + parent_assets.append( + "{}/{}:{}".format( + rgc.get_genome_alias_digest(g, fallback=True), a, t + ) + ) input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s) _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets))) _LOGGER.debug("Provided files: {}".format(specified_args)) _LOGGER.debug("Provided parameters: {}".format(specified_params)) for required_file in asset_build_package[REQ_FILES]: - if specified_args is None or required_file[KEY] not in specified_args.keys(): - raise ValueError("Path to the '{x}' input ({desc}) is required, but not provided. " - "Specify it with: --files {x}=/path/to/{x}_file" - .format(x=required_file[KEY], desc=required_file[DESC])) + if ( + specified_args is None + or required_file[KEY] not in specified_args.keys() + ): + raise ValueError( + "Path to the '{x}' input ({desc}) is required, but not provided. " + "Specify it with: --files {x}=/path/to/{x}_file".format( + x=required_file[KEY], desc=required_file[DESC] + ) + ) for required_param in asset_build_package[REQ_PARAMS]: if specified_params is None: specified_params = {} if required_param[KEY] not in specified_params.keys(): if required_param[DEFAULT] is None: - raise ValueError("Value for the parameter '{x}' ({desc}) is required, but not provided. " - "Specify it with: --params {x}=value" - .format(x=required_param[KEY], desc=required_param[DESC])) + raise ValueError( + "Value for the parameter '{x}' ({desc}) is required, but not provided. 
" + "Specify it with: --params {x}=value".format( + x=required_param[KEY], desc=required_param[DESC] + ) + ) else: - specified_params.update({required_param[KEY]: required_param[DEFAULT]}) + specified_params.update( + {required_param[KEY]: required_param[DEFAULT]} + ) + _LOGGER.info( + "Building '{}/{}:{}' using '{}' recipe".format( + genome, asset_key, asset_tag, recipe_name + ) + ) + ori_genome = genome + if recipe_name == "fasta": + if ( + genome in rgc.genomes_list() + and "fasta" in rgc.list_assets_by_genome(genome) + ): + pretag = rgc.get_default_tag(genome, "fasta") + _LOGGER.warning( + "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})".format( + g=genome, a=asset_key, t=pretag + ) + ) + genome = rgc.get_genome_alias_digest(alias=genome, fallback=True) + else: + # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file + genome, _ = rgc.initialize_genome( + fasta_path=specified_args["fasta"], + alias=ori_genome, + skip_alias_write=True, + ) + else: + try: + genome = rgc.get_genome_alias_digest(genome, fallback=True) + except UndefinedAliasError: + _LOGGER.error( + "Genome '{}' has not been initialized yet; " + "no key found for this alias".format(genome) + ) + return + recipe_name = None genome_outfolder = os.path.join(args.outfolder, genome) - _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, asset_key, asset_tag, recipe_name)) - if recipe_name == 'fasta' and genome in rgc.genomes_list() \ - and 'fasta' in rgc.list_assets_by_genome(genome): - _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). 
" - "It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag)) - if not build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, - specified_args, specified_params, **input_assets): - log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag, - BUILD_STATS_DIR, ORI_LOG_NAME)) - _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. " - "See the log file for details: {}".format(genome, asset_key, asset_tag, log_path)) + if not _build_asset( + genome, + asset_key, + asset_tag, + asset_build_package, + genome_outfolder, + specified_args, + specified_params, + ori_genome, + **input_assets, + ): + log_path = os.path.abspath( + os.path.join( + genome_outfolder, + asset_key, + asset_tag, + BUILD_STATS_DIR, + ORI_LOG_NAME, + ) + ) + _LOGGER.info( + "'{}/{}:{}' was not added to the config, but directory has been left in place. " + "See the log file for details: {}".format( + genome, asset_key, asset_tag, log_path + ) + ) return - # If the recipe was a fasta, we init the genome - if recipe_name == 'fasta': - _LOGGER.info("Computing initial genome digest...") - collection_checksum, content_checksums = \ - fasta_checksum(_seek(rgc, genome, asset_key, asset_tag, "fasta")) - _LOGGER.info("Initializing genome...") - refgenie_initg(rgc, genome, content_checksums) _LOGGER.info("Finished building '{}' asset".format(asset_key)) with rgc as r: # update asset relationships - r.update_relatives_assets(genome, asset_key, asset_tag, parent_assets) # adds parents + r.update_relatives_assets( + genome, asset_key, asset_tag, parent_assets + ) # adds parents for i in parent_assets: parsed_parent = parse_registry_path(i) # adds child (currently built asset) to the parent - r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], parsed_parent["tag"], - ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) + r.update_relatives_assets( + parsed_parent["genome"], + 
parsed_parent["asset"], + parsed_parent["tag"], + ["{}/{}:{}".format(genome, asset_key, asset_tag)], + True, + ) if args.genome_description is not None: - _LOGGER.debug("adding genome ({}) description: '{}'".format(genome, args.genome_description)) - r.update_genomes(genome, {CFG_GENOME_DESC_KEY: args.genome_description}) + _LOGGER.debug( + "adding genome ({}) description: '{}'".format( + genome, args.genome_description + ) + ) + r.update_genomes( + genome, {CFG_GENOME_DESC_KEY: args.genome_description} + ) if args.tag_description is not None: - _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'".format(genome, asset_key, asset_tag, - args.tag_description)) - r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description}) - if recipe_name == "fasta": - # to save config lock time when building fasta assets - # (genome initialization takes some time for large genomes) we repeat the - # conditional here for writing the computed genome digest - r.update_genomes(genome, data={CFG_CHECKSUM_KEY: collection_checksum}) + _LOGGER.debug( + "adding tag ({}/{}:{}) description: '{}'".format( + genome, asset_key, asset_tag, args.tag_description + ) + ) + r.update_tags( + genome, + asset_key, + asset_tag, + {CFG_TAG_DESC_KEY: args.tag_description}, + ) + rgc._symlink_alias(genome, asset_key, asset_tag) else: _raise_missing_recipe_error(recipe_name) -def _exec_list(rgc, remote, genome): - if remote: - pfx = "Remote" - # we use this func looping through the server urls and assigning a - # single instance as the server for the object. 
That's why we can - # access the data with [0] below - assemblies, assets = \ - list(rgc.listr(genome=genome, as_str=True).values())[0] - recipes = None # Not implemented - else: - pfx = "Local" - assemblies, assets = rgc.get_local_data_str(genome=genome) - # also get recipes - recipes = ", ".join(sorted(list(asset_build_packages.keys()))) - return pfx, assemblies, assets, recipes - - -def perm_check_x(file_to_check, message_tag): - """ - Check X_OK permission on a path, providing according messaging and bool val. - - :param str file_to_check: path to query for permission - :param str message_tag: context for error message if check fails - :return bool: os.access(path, X_OK) for the given path - :raise ValueError: if there's no filepath to check for permission - """ - if not file_to_check: - msg = "You must provide a path to {}".format(message_tag) - _LOGGER.error(msg) - raise ValueError(msg) - if not os.access(file_to_check, os.X_OK): - _LOGGER.error("Insufficient permissions to write to {}: " - "{}".format(message_tag, file_to_check)) - return False - return True - - -def main(): - """ Primary workflow """ - parser = logmuse.add_logging_options(build_argparser()) - args, remaining_args = parser.parse_known_args() - global _LOGGER - _LOGGER = logmuse.logger_via_cli(args, make_root=True) - _LOGGER.debug("refgenie {}".format(__version__)) - _LOGGER.debug("Args: {}".format(args)) - - if not args.command: - parser.print_help() - _LOGGER.error("No command given") - sys.exit(1) - - gencfg = refgenconf.select_genome_config(filename=args.genome_config, check_exist=not args.command == INIT_CMD, - on_missing=lambda fp: fp, strict_env=True) - if gencfg is None: - raise MissingGenomeConfigError(args.genome_config) - _LOGGER.debug("Determined genome config: {}".format(gencfg)) - - # From user input we want to construct a list of asset dicts, where each - # asset has a genome name, asset name, and tag - - if "asset_registry_paths" in args and args.asset_registry_paths: - 
_LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) - asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] - - for a in asset_list: - # every asset must have a genome, either provided via registry path - # or the args.genome arg. - if not a["genome"]: - if args.genome: - a["genome"] = args.genome - else: - _LOGGER.error("Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.". - format(a["genome"], a["asset"], a["tag"])) - sys.exit(1) - else: - if args.genome and args.genome != a["genome"]: - _LOGGER.warn("Two different genomes specified for asset '{}'.".format(a["asset"])) - - else: - if args.command in GENOME_ONLY_REQUIRED and not args.genome: - parser.error("You must provide either a genome or a registry path") - sys.exit(1) - if args.command in ASSET_REQUIRED: - parser.error("You must provide an asset registry path") - sys.exit(1) - - if args.command == INIT_CMD: - _LOGGER.debug("Initializing refgenie genome configuration") - entries = OrderedDict({ - CFG_VERSION_KEY: REQ_CFG_VERSION, - CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), - CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], - CFG_GENOMES_KEY: None}) - if args.settings_json: - if os.path.isfile(args.settings_json): - with open(args.settings_json, 'r') as json_file: - data = json.load(json_file) - entries.update(data) - else: - raise FileNotFoundError( - "JSON file with config init settings does not exist: {}". 
- format(args.settings_json)) - if args.genome_folder: - entries.update({CFG_FOLDER_KEY: args.genome_folder}) - if args.remote_url_base: - entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) - if args.genome_archive_folder: - entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) - if args.genome_archive_config: - entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) - _LOGGER.debug("initializing with entries: {}".format(entries)) - rgc = RefGenConf(entries=entries) - rgc.initialize_config_file(os.path.abspath(gencfg)) - - elif args.command == BUILD_CMD: - if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]): - _LOGGER.error("Build can only build assets for one genome") - sys.exit(1) - recipe_name = None - if args.recipe: - if len(asset_list) > 1: - _LOGGER.error("Recipes cannot be specified for multi-asset builds") - sys.exit(1) - recipe_name = args.recipe - if args.requirements: - for a in asset_list: - recipe = recipe_name or a["asset"] - if recipe not in asset_build_packages.keys(): - _raise_missing_recipe_error(recipe) - _LOGGER.info("'{}' recipe requirements: ".format(recipe)) - _make_asset_build_reqs(recipe) - sys.exit(0) - refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) - - elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - check = args.check_exists if args.check_exists else None - for a in asset_list: - _LOGGER.debug("getting asset: '{}/{}.{}:{}'". 
- format(a["genome"], a["asset"], a["seek_key"], a["tag"])) - print(rgc.seek(a["genome"], a["asset"], a["tag"], a["seek_key"], - strict_exists=check)) - return - - elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - if len(asset_list) > 1: - raise NotImplementedError("Can only add 1 asset at a time") - else: - sk = args.seek_keys - if sk: - sk = json.loads(args.seek_keys) - rgc.add(path=args.path, genome=asset_list[0]["genome"], - asset=asset_list[0]["asset"], tag=asset_list[0]["tag"], - seek_keys=sk, force=args.force) - - elif args.command == PULL_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - # existing assets overwriting - if args.no_overwrite: - force = False - elif args.force_overwrite: - force = True - else: - force = None - # large archive pulling - if args.no_large: - force_large = False - elif args.pull_large: - force_large = True - else: - force_large = None - # batch mode takes precedence over other choices - if args.batch: - force_large = True - force = False - - outdir = rgc[CFG_FOLDER_KEY] - if not os.path.exists(outdir): - raise MissingFolderError(outdir) - target = _key_to_name(CFG_FOLDER_KEY) - if not perm_check_x(outdir, target): - return - if not _single_folder_writeable(outdir): - _LOGGER.error("Insufficient permissions to write to {}: {}". 
- format(target, outdir)) - return - - for a in asset_list: - rgc.pull(a["genome"], a["asset"], a["tag"], force=force, - force_large=force_large, size_cutoff=args.size_cutoff) - - elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False) - if args.command == LIST_REMOTE_CMD: - num_servers = 0 - # Keep all servers so that child updates maintain server list - server_list = rgc[CFG_SERVERS_KEY] - bad_servers = [] - for server_url in rgc[CFG_SERVERS_KEY]: - num_servers += 1 - try: - rgc[CFG_SERVERS_KEY] = [server_url] - pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome) - if assets is None and genomes is None: - continue - _LOGGER.info("Server URL: {}".format(server_url)) - _LOGGER.info("{} genomes: {}".format(pfx, genomes)) - if args.command != LIST_REMOTE_CMD: # Not implemented yet - _LOGGER.info("{} recipes: {}".format(pfx, recipes)) - _LOGGER.info("{} assets:\n{}\n".format(pfx, assets)) - except (DownloadJsonError, ConnectionError): - bad_servers.append(server_url) - continue - if num_servers >= len(server_list) and bad_servers: - _LOGGER.error("Could not list assets from the following server(s): {}".format(bad_servers)) - # Restore original server list, even when we couldn't find assets on a server - rgc[CFG_SERVERS_KEY] = server_list - else: # Only check local assets once - _LOGGER.info("Server subscriptions: {}".format(", ".join(rgc[CFG_SERVERS_KEY]))) - pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome) - _LOGGER.info("{} genomes: {}".format(pfx, genomes)) - if args.command != LIST_REMOTE_CMD: # Not implemented yet - _LOGGER.info("{} recipes: {}".format(pfx, recipes)) - _LOGGER.info("{} assets:\n{}".format(pfx, assets)) - - elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - print(rgc.getseq(args.genome, args.locus)) - - elif args.command == REMOVE_CMD: - force = args.force - rgc = 
RefGenConf(filepath=gencfg) - for a in asset_list: - a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"], - use_existing=False) - _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) - if a["seek_key"] is not None: - raise NotImplementedError("You can't remove a specific seek_key.") - bundle = [a["genome"], a["asset"], a["tag"]] - try: - if not rgc.is_asset_complete(*bundle): - with rgc as r: - r.cfg_remove_assets(*bundle) - _LOGGER.info("Removed an incomplete asset '{}/{}:{}'". - format(*bundle)) - return - except (KeyError, MissingAssetError, MissingGenomeError): - _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle)) - return - if len(asset_list) > 1: - if not query_yes_no("Are you sure you want to remove {} assets?". - format(len(asset_list))): - _LOGGER.info("Action aborted by the user") - return - force = True - for a in asset_list: - rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], - force=force) - - elif args.command == TAG_CMD: - rgc = RefGenConf(filepath=gencfg) - if len(asset_list) > 1: - raise NotImplementedError("Can only tag 1 asset at a time") - if args.default: - # set the default tag and exit - with rgc as r: - r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) - sys.exit(0) - rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) - - elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - if len(asset_list) == 1: - g, a = asset_list[0]["genome"], asset_list[0]["asset"] - t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) - print(rgc.id(g, a, t)) - return - for asset in asset_list: - g, a = asset["genome"], asset["asset"] - t = asset["tag"] or rgc.get_default_tag(g, a) - print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) - return - elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) - rgc.subscribe(urls=args.genome_server, reset=args.reset) - return - elif args.command == UNSUBSCRIBE_CMD: - rgc = 
RefGenConf(filepath=gencfg, writable=False) - rgc.unsubscribe(urls=args.genome_server) - return - - -def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): - """ - Message and save removed entity data - - :param str directory: removed dir - :param str entity_class: class of the entity - :param dict asset_dict: selected genome/asset:tag combination - :param list removed_entities: list of the removed entities to append to - """ - subclass = "asset" if entity_class == "genome" else "tag" - if os.path.basename(directory) == asset_dict[entity_class]: - _LOGGER.info("Last {sub} for {ec} '{en}' has been removed, removing {ec} directory". - format(sub=subclass, ec=entity_class, en=asset_dict[entity_class])) - removed_entities.append(_remove(directory)) - else: - _LOGGER.debug("Didn't remove '{}' since it does not match the {} name: {}". - format(directory, entity_class, asset_dict[entity_class])) - - -def _remove(path): - """ - remove asset if it is a dir or a file - - :param str path: path to the entity to remove, either a file or a dir - :return str: removed path - """ - if os.path.isfile(path): - os.remove(path) - elif os.path.isdir(path): - rmtree(path) - else: - raise ValueError("path '{}' is neither a file nor a dir.".format(path)) - return path - - def _key_to_name(k): return k.replace("_", " ") -def _single_folder_writeable(d): - return os.access(d, os.W_OK) and os.access(d, os.X_OK) - - -def _writeable(outdir, strict_exists=False): - outdir = outdir or "." 
- if os.path.exists(outdir): - return _single_folder_writeable(outdir) - elif strict_exists: - raise MissingFolderError(outdir) - return _writeable(os.path.dirname(outdir), strict_exists) - - -def _make_asset_build_reqs(asset): - """ - Prepare requirements and inputs lists and display it - - :params str asset: name of the asset - """ - def _format_reqs(req_list): - """ - - :param list[dict] req_list: - :return list[str]: - """ - templ = "\t{} ({})" - return [templ.format(req[KEY], req[DESC]) if DEFAULT not in req - else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) for req in req_list] - - reqs_list = [] - if asset_build_packages[asset][REQ_FILES]: - reqs_list.append("- files:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])))) - if asset_build_packages[asset][REQ_ASSETS]: - reqs_list.append("- assets:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])))) - if asset_build_packages[asset][REQ_PARAMS]: - reqs_list.append("- params:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])))) - _LOGGER.info("\n".join(reqs_list)) - - -def get_dir_digest(path, pm=None): - """ - Generate a MD5 digest that reflects just the contents of the files in the selected directory. - - :param str path: path to the directory to digest - :param pypiper.PipelineManager pm: a pipeline object, optional. The subprocess module will be used if not provided - :return str: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 - """ - if not is_command_callable("md5sum"): - raise OSError("md5sum command line tool is required for asset digest calculation. \n" - "Install and try again, e.g on macOS: 'brew install md5sha1sum'") - cmd = "cd {}; find . 
-type f -not -path './" + BUILD_STATS_DIR + \ - "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" - if isinstance(pm, pypiper.PipelineManager): - x = pm.checkprint(cmd.format(path)) - else: - try: - from subprocess import check_output - x = check_output(cmd.format(path), shell=True).decode("utf-8") - except Exception as e: - _LOGGER.warning("{}: could not calculate digest for '{}'".format(e.__class__.__name__, path)) - return - return str(sub(r'\W+', '', x)) # strips non-alphanumeric - - def _handle_sigint(gat): """ SIGINT handler, unlocks the config file and exists the program @@ -904,35 +474,12 @@ def _handle_sigint(gat): :param list gat: a list of genome, asset and tag. Used for a message generation. :return function: the SIGINT handling function """ + def handle(sig, frame): _LOGGER.warning("\nThe build was interrupted: {}/{}:{}".format(*gat)) sys.exit(0) - return handle - - -def _parse_user_build_input(input): - """ - Parse user input specification. Used in build for specific parents and input parsing. - - :param Iterable[Iterable[str], ...] input: user command line input, - formatted as follows: [[fasta=txt, test=txt], ...] - :return dict: mapping of keys, which are input names and values - """ - lst = [] - for i in input or []: - lst.extend(i) - return {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} if lst is not None else lst - - -def _raise_missing_recipe_error(recipe): - """ - Raise an error for a missing recipe, when one is requested - :param str recipe: recipe name - :raise MissingRecipeError: always - """ - raise MissingRecipeError("Recipe '{}' not found. Available recipes: {}". 
- format(recipe, ", ".join(list(asset_build_packages.keys())))) + return handle def _check_recipe(recipe): @@ -943,6 +490,21 @@ def _check_recipe(recipe): :param dict recipe: asset_build_package :raise ValueError: if any key names are duplicated """ + # experimental feature; recipe jsonschema validation + from jsonschema import validate + from yacman import load_yaml + + SCHEMA_SRC = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "schemas", "recipe_schema.yaml" + ) + if os.path.exists(SCHEMA_SRC): + validate(recipe, load_yaml(filepath=SCHEMA_SRC)) + _LOGGER.info( + "Recipe validated successfully against a schema: {}".format(SCHEMA_SRC) + ) + else: + _LOGGER.warning("Recipe schema not found: {}".format(SCHEMA_SRC)) + # end of validation req_keys = [] for req in [REQ_PARAMS, REQ_ASSETS, REQ_FILES]: req_keys.extend([req_dict[KEY] for req_dict in recipe[req]]) @@ -951,20 +513,25 @@ def _check_recipe(recipe): if k not in unique: unique.append(k) else: - raise ValueError("The recipe contains a duplicated requirement" - " key '{}', which is not permitted.".format(k)) + raise ValueError( + "The recipe contains a duplicated requirement" + " key '{}', which is not permitted.".format(k) + ) return recipe -def _seek(rgc, genome_name, asset_name, tag_name=None, - seek_key=None, enclosing_dir=False): +def _seek( + rgc, genome_name, asset_name, tag_name=None, seek_key=None, enclosing_dir=False +): """ Strict seek. Most use cases in this package require file existence check in seek. 
This function makes it easier """ - return rgc.seek(genome_name=genome_name, - asset_name=asset_name, - tag_name=tag_name, - seek_key=seek_key, - enclosing_dir=enclosing_dir, - strict_exists=True) + return rgc.seek_src( + genome_name=genome_name, + asset_name=asset_name, + tag_name=tag_name, + seek_key=seek_key, + enclosing_dir=enclosing_dir, + strict_exists=True, + ) diff --git a/refgenie/refgenie.yaml b/refgenie/refgenie.yaml deleted file mode 100644 index 2e9ad975..00000000 --- a/refgenie/refgenie.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Build configuration - -tools: - # absolute paths to required tools - bowtie2build: bowtie2-build - bismark_genome_preparation: bismark_genome_preparation - epilog_indexer: epilog_indexer.py - samtools: samtools - kallisto: kallisto - hisat2build: hisat2-build - suffixerator: gt suffixerator - tallymer: gt tallymer mkindex - -index: - bowtie2: True - bismark_bt1: False - bismark_bt2: False - epilog: False - hisat: False - kallisto: True - suffixerator: False - tallymer: False - -param: - epilog: - context: "cg" - tallymer: - minocc: 2 \ No newline at end of file diff --git a/refgenie/refget.py b/refgenie/refget.py index 1d5f4d82..b0861260 100644 --- a/refgenie/refget.py +++ b/refgenie/refget.py @@ -40,4 +40,4 @@ def fasta_checksum(fa_file, checksum_function=trunc512_digest): content_checksums[k] = checksum_function(str(fa_object[k])) collection_string = ";".join([":".join(i) for i in content_checksums.items()]) collection_checksum = checksum_function(collection_string) - return collection_checksum, content_checksums \ No newline at end of file + return collection_checksum, content_checksums diff --git a/refgenie/schemas/recipe_schema.yaml b/refgenie/schemas/recipe_schema.yaml new file mode 100644 index 00000000..c06bc03f --- /dev/null +++ b/refgenie/schemas/recipe_schema.yaml @@ -0,0 +1,38 @@ +description: refgenie recipe schema + +properties: + name: + type: string + pattern: "^\\S*$" + description: "name of the recipe with no 
whitespaces" + description: + type: string + description: "description of the recipe" + assets: + type: object + description: "seek keys to be produced" + required_files: + type: array + items: + type: object + description: "File-type input to the recipe" + required_assets: + type: array + items: + type: object + description: "Asset-type input to the recipe" + required_parameters: + type: array + items: + type: object + description: "Parameter-type input to the recipe" + container: + type: string + pattern: "^\\S*$" + description: "Registry path of the container to use" + command_list: + type: array + items: + type: string + description: "List of commands that create the asset" +required: [description, assets, required_files, required_assets, required_parameters, command_list] \ No newline at end of file diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 08942fb7..b7816e4c 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,5 @@ logmuse>=0.2.6 -refgenconf>=0.9.1 +refgenconf>=0.10.0 piper>=0.12.1 pyfaidx>=0.5.5.2 +yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt index a7372dc4..b1e1259c 100644 --- a/requirements/requirements-doc.txt +++ b/requirements/requirements-doc.txt @@ -1,2 +1 @@ https://github.com/databio/mkdocs-databio/archive/master.zip -refgenconf>=0.6.1 diff --git a/setup.py b/setup.py index d1f81150..f0c50dcf 100755 --- a/setup.py +++ b/setup.py @@ -8,37 +8,38 @@ for line in reqs_file: if not line.strip(): continue - #DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + # DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) DEPENDENCIES.append(line) # Additional keyword arguments for setup() extra = {"install_requires": DEPENDENCIES} # 2to3 -if sys.version_info >= (3, ): +if sys.version_info >= (3,): extra["use_2to3"] = True -with open("refgenie/_version.py", 'r') as versionfile: +with 
open("refgenie/_version.py", "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") # Handle the pypi README formatting. try: import pypandoc - long_description = pypandoc.convert_file('README.md', 'rst') + + long_description = pypandoc.convert_file("README.md", "rst") msg = "\033[032mPandoc conversion succeeded.\033[0m" -except(IOError, ImportError, OSError): +except (IOError, ImportError, OSError): msg = "\033[0;31mWarning: pandoc conversion failed!\033[0m" - long_description = open('README.md').read() + long_description = open("README.md").read() setup( - name='refgenie', + name="refgenie", packages=["refgenie"], version=version, - description='Refgenie creates a standardized folder structure for reference genome files and indexes. ' - 'You can download pre-built genomes or build your own for any fasta file', + description="Refgenie creates a standardized folder structure for reference genome files and indexes. " + "You can download pre-built genomes or build your own for any fasta file", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", @@ -46,21 +47,21 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], license="BSD2", entry_points={ "console_scripts": [ - 'refgenie = refgenie.__main__:main', - 'import_igenome = refgenie.add_assets_igenome:main' + "refgenie = refgenie.__main__:main", + "import_igenome = refgenie.add_assets_igenome:main", ], }, keywords="bioinformatics, sequencing, ngs", package_data={"refgenie": [os.path.join("refgenie", "*")]}, include_package_data=True, - url='http://refgenie.databio.org', - author=u'Nathan Sheffield, Vince Reuter, Michal Stolarczyk', + 
url="http://refgenie.databio.org", + author=u"Nathan Sheffield, Vince Reuter, Michal Stolarczyk", **extra ) -print(msg) \ No newline at end of file +print(msg) diff --git a/tests/assert_in_file.sh b/tests/assert_in_file.sh new file mode 100755 index 00000000..c0c192f0 --- /dev/null +++ b/tests/assert_in_file.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ $# -ne 3 ]; then + echo $0: usage: assert_in_file.sh filepath query inverse + exit 1 +fi + +if [[ "$3" == "1" ]]; then + echo -e "\nTesting if '$2' is not in '$1'" + if grep -q "$2" "$1"; then + echo -e "\ERROR: '$2' is in '$1'\nContents:\n" + cat "$1" + exit 1 + else + echo -e "\nSUCCESS: '$2' not in '$1'\n" + exit 0 + fi +else + echo -e "\nTesting if '$2' is in '$1'" + if grep -q "$2" "$1"; then + echo -e "\nSUCCESS: '$2' is in '$1'\n" + exit 0 + else + echo -e "\nERROR: '$2' not in '$1'\nContents:\n" + cat "$1" + exit 1 + fi +fi diff --git a/tests/data/recipe_child.json b/tests/data/recipe_child.json new file mode 100644 index 00000000..afe1d76e --- /dev/null +++ b/tests/data/recipe_child.json @@ -0,0 +1,20 @@ +{ + "name": "fasta_child", + "description": "child of an asset, dummy recipe", + "assets": { + "fasta_child": "{genome}_child.fa.gz" + }, + "required_assets": [ + { + "key": "fasta", + "default": "fasta", + "description": "fasta asset for genome" + } + ], + "required_parameters": [], + "required_files": [], + "container": "databio/refgenie", + "command_list": [ + "cp {fasta} {asset_outfolder}/{genome}_child.fa.gz" + ] +} \ No newline at end of file diff --git a/tests/data/recipe_parent.json b/tests/data/recipe_parent.json new file mode 100644 index 00000000..a664beef --- /dev/null +++ b/tests/data/recipe_parent.json @@ -0,0 +1,20 @@ +{ + "name": "fasta", + "description": "DNA sequences in the FASTA format, dummy recipe", + "assets": { + "fasta": "{genome}.fa" + }, + "required_files": [ + { + "key": "fasta", + "description": "gzipped fasta file" + } + ], + "required_assets": [], + "required_parameters": [], + 
"container": "databio/refgenie", + "command_list": [ + "cp {fasta} {asset_outfolder}/{genome}.fa.gz", + "gzip -df {asset_outfolder}/{genome}.fa.gz" + ] +} \ No newline at end of file diff --git a/tests/data/t7.fa.gz b/tests/data/t7.fa.gz new file mode 100644 index 00000000..a2168f52 Binary files /dev/null and b/tests/data/t7.fa.gz differ diff --git a/update-usage-docs.sh b/update-usage-docs.sh index 4aecfd4a..0841bc4b 100755 --- a/update-usage-docs.sh +++ b/update-usage-docs.sh @@ -1,8 +1,7 @@ #!/bin/bash cp docs/usage.template usage.template #looper --help > USAGE.temp 2>&1 - -for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help"; do +for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do echo $cmd echo -e "## \`refgenie $cmd\`" > USAGE_header.temp refgenie $cmd --help > USAGE.temp 2>&1