From 3e08def08e245134171903353c5ff8297c5bc381 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sun, 12 Jul 2020 16:40:49 -0400 Subject: [PATCH 001/110] dev version bump --- docs/changelog.md | 2 ++ refgenie/_version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/changelog.md b/docs/changelog.md index 2ff3c5cf..fb8357d9 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.10.0] - unreleased + ## [0.9.2] - 2020-07-01 ## Changed diff --git a/refgenie/_version.py b/refgenie/_version.py index a2fecb45..2aceaf4d 100644 --- a/refgenie/_version.py +++ b/refgenie/_version.py @@ -1 +1 @@ -__version__ = "0.9.2" +__version__ = "0.10.0-dev" From 5549687f274b855145cacb42f7a5a57cf3407ba8 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sun, 12 Jul 2020 16:44:06 -0400 Subject: [PATCH 002/110] implement both genome initialization scenarios --- refgenie/const.py | 4 ++- refgenie/refgenie.py | 60 +++++++++++++++++++------------ requirements/requirements-all.txt | 2 +- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/refgenie/const.py b/refgenie/const.py index 830f6fe5..147e1c98 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -17,6 +17,7 @@ ID_CMD = "id" SUBSCRIBE_CMD = "subscribe" UNSUBSCRIBE_CMD = "unsubscribe" +# COMPARE_CMD = "compare" GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] @@ -36,5 +37,6 @@ TAG_CMD: "Tag an asset.", ID_CMD: "Return the asset digest.", SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", - UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", + UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config." + # COMPARE_CMD: "compare two genomes." 
} diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index ad4687f7..5ef442f0 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -24,6 +24,7 @@ from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, MissingRecipeError, DownloadJsonError from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, VersionInHelpParser, is_command_callable from ubiquerg.system import is_writable +from yacman import UndefinedAliasError from .refget import fasta_checksum _LOGGER = None @@ -121,6 +122,13 @@ def add_subparser(cmd, description): "-r", "--recipe", required=False, default=None, type=str, help="Provide a recipe to use.") + # sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, + # help="First genome for compatibility check") + # sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, + # help="Second genome for compatibility check") + # sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", + # help="Do not print compatibility code explanation") + # add 'genome' argument to many commands for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: # genome is not required for listing actions @@ -482,7 +490,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar g, a, t, s = genome, default["asset"], \ rgc.get_default_tag(genome, default["asset"]), \ req_asset_data["seek_key"] - parent_assets.append("{}/{}:{}".format(g, a, t)) + parent_assets.append("{}/{}:{}".format(rgc.get_genome_alias_digest(g), a, t)) input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s) _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets))) _LOGGER.debug("Provided files: {}".format(specified_args)) @@ -502,12 +510,23 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar .format(x=required_param[KEY], desc=required_param[DESC])) else: 
specified_params.update({required_param[KEY]: required_param[DEFAULT]}) - genome_outfolder = os.path.join(args.outfolder, genome) _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, asset_key, asset_tag, recipe_name)) - if recipe_name == 'fasta' and genome in rgc.genomes_list() \ - and 'fasta' in rgc.list_assets_by_genome(genome): - _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). " - "It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag)) + if recipe_name == 'fasta': + ori_genome = genome + if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): + _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag)) + # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file + genome, _ = \ + rgc.initialize_genome(fasta_path=specified_args["fasta"], + alias=ori_genome) + else: + try: + genome = rgc.get_genome_alias_digest(genome) + except UndefinedAliasError: + _LOGGER.error("Genome '{}' has not been initialized yet; " + "no key found for this alias".format(genome)) + return + genome_outfolder = os.path.join(args.outfolder, genome) if not build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, specified_args, specified_params, **input_assets): log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag, @@ -515,13 +534,6 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. 
" "See the log file for details: {}".format(genome, asset_key, asset_tag, log_path)) return - # If the recipe was a fasta, we init the genome - if recipe_name == 'fasta': - _LOGGER.info("Computing initial genome digest...") - collection_checksum, content_checksums = \ - fasta_checksum(_seek(rgc, genome, asset_key, asset_tag, "fasta")) - _LOGGER.info("Initializing genome...") - refgenie_initg(rgc, genome, content_checksums) _LOGGER.info("Finished building '{}' asset".format(asset_key)) with rgc as r: # update asset relationships @@ -529,20 +541,15 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar for i in parent_assets: parsed_parent = parse_registry_path(i) # adds child (currently built asset) to the parent - r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], parsed_parent["tag"], - ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) + r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], + parsed_parent["tag"], ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) if args.genome_description is not None: _LOGGER.debug("adding genome ({}) description: '{}'".format(genome, args.genome_description)) r.update_genomes(genome, {CFG_GENOME_DESC_KEY: args.genome_description}) if args.tag_description is not None: - _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'".format(genome, asset_key, asset_tag, - args.tag_description)) + _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'". 
+ format(genome, asset_key, asset_tag, args.tag_description)) r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description}) - if recipe_name == "fasta": - # to save config lock time when building fasta assets - # (genome initialization takes some time for large genomes) we repeat the - # conditional here for writing the computed genome digest - r.update_genomes(genome, data={CFG_CHECKSUM_KEY: collection_checksum}) else: _raise_missing_recipe_error(recipe_name) @@ -719,7 +726,8 @@ def main(): unpack=not args.no_untar, force=force) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, + genome_exact=args.command == LIST_REMOTE_CMD) if args.command == LIST_REMOTE_CMD: num_servers = 0 # Keep all servers so that child updates maintain server list @@ -816,6 +824,12 @@ def main(): rgc = RefGenConf(filepath=gencfg, writable=False) rgc.unsubscribe(urls=args.genome_server) return + # elif args.command == COMPARE_CMD: + # rgc = RefGenConf(filepath=gencfg, writable=False) + # res = rgc.compare(args.genome1[0], args.genome2[0], + # explain=not args.no_explanation) + # if args.no_explanation: + # print(res) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 5154b55c..5469afb4 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,4 @@ logmuse>=0.2.6 -refgenconf>=0.9.0 +refgenconf>=0.10.0-dev piper>=0.12.1 pyfaidx>=0.5.5.2 From b8969fbf5b33813b91976226c90f787bd000301b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 13 Jul 2020 10:30:40 -0400 Subject: [PATCH 003/110] alias manipulation via CLI --- refgenie/const.py | 4 +++- refgenie/refgenie.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/refgenie/const.py b/refgenie/const.py 
index 147e1c98..93bab459 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -17,6 +17,7 @@ ID_CMD = "id" SUBSCRIBE_CMD = "subscribe" UNSUBSCRIBE_CMD = "unsubscribe" +ALIAS_CMD = "alias" # COMPARE_CMD = "compare" GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] @@ -37,6 +38,7 @@ TAG_CMD: "Tag an asset.", ID_CMD: "Return the asset digest.", SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", - UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config." + UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", + ALIAS_CMD: "Interact with aliases." # COMPARE_CMD: "compare two genomes." } diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 5ef442f0..dacdbaf7 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -122,6 +122,19 @@ def add_subparser(cmd, description): "-r", "--recipe", required=False, default=None, type=str, help="Provide a recipe to use.") + sps[ALIAS_CMD].add_argument( + "-r", "--remove", required=False, default=None, type=str, nargs="+", + help="Remove an alias.") + + sps[ALIAS_CMD].add_argument( + "-s", "--set", required=False, default=None, type=str, nargs=2, + help="Set an alias.") + + sps[ALIAS_CMD].add_argument( + "-g", "--get", required=False, default=None, type=str, nargs=1, + help="Get genome identifier for an alias.") + + # sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, # help="First genome for compatibility check") # sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, @@ -824,6 +837,25 @@ def main(): rgc = RefGenConf(filepath=gencfg, writable=False) rgc.unsubscribe(urls=args.genome_server) return + elif args.command == ALIAS_CMD: + rgc = RefGenConf(filepath=gencfg) + if args.get: + print(rgc.get_genome_alias_digest(alias=args.get)) + return + elif args.set: + with rgc as r: + r.set_genome_alias(genome=args.set[0], digest=args.set[1]) + return + elif args.remove: + with rgc as r: + 
[r[CFG_ALIASES_KEY].__delitem__(r.get_genome_alias_digest(a)) for a in args.remove] + return + else: + aliases = rgc.genome_aliases + print("genomeID".rjust(32) + "\talias".ljust(20) + "\tinitialized".ljust(20) + "\n") + print("\n".join("{}\t{}\t{}".format(k.rjust(32), v.ljust(20), ("*" if k in rgc.genomes else "").ljust(20)) for k, v in aliases.items())) + return + # elif args.command == COMPARE_CMD: # rgc = RefGenConf(filepath=gencfg, writable=False) # res = rgc.compare(args.genome1[0], args.genome2[0], From 5a74c1c251bba661046c3e4bb93b511e910b2af9 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 13 Jul 2020 10:35:02 -0400 Subject: [PATCH 004/110] dev reqs --- requirements/requirements-all.txt | 4 ++-- requirements/requirements-dev.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 5469afb4..1a903985 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,4 @@ logmuse>=0.2.6 -refgenconf>=0.10.0-dev +#refgenconf>=0.10.0-dev piper>=0.12.1 -pyfaidx>=0.5.5.2 +pyfaidx>=0.5.5.2 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index e69de29b..c368db1d 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -0,0 +1 @@ +-e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From 2c474c4bb957e5856731678a7626f78d959cfc55 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 13 Jul 2020 11:53:04 -0400 Subject: [PATCH 005/110] alias CLI enhancements --- refgenie/refgenie.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index dacdbaf7..292e35dd 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -122,16 +122,22 @@ def add_subparser(cmd, description): "-r", "--recipe", required=False, 
default=None, type=str, help="Provide a recipe to use.") - sps[ALIAS_CMD].add_argument( - "-r", "--remove", required=False, default=None, type=str, nargs="+", + alias_group = sps[ALIAS_CMD].add_mutually_exclusive_group( + # title='Aliases manipulation arguments', + # description='Specify the action you want to perform on the aliases' + ) + + alias_group.add_argument( + "-r", "--remove", metavar="A", required=False, default=None, type=str, nargs="+", help="Remove an alias.") - sps[ALIAS_CMD].add_argument( - "-s", "--set", required=False, default=None, type=str, nargs=2, - help="Set an alias.") + alias_group.add_argument( + "-s", "--set", metavar="K-V", required=False, default=None, type=str, nargs="+", + help="Key-value pair of alias and genome ID or just an alias when the " + "genome ID is to be looked up from a server") - sps[ALIAS_CMD].add_argument( - "-g", "--get", required=False, default=None, type=str, nargs=1, + alias_group.add_argument( + "-g", "--get", metavar="A", required=False, default=None, type=str, nargs=1, help="Get genome identifier for an alias.") @@ -840,11 +846,18 @@ def main(): elif args.command == ALIAS_CMD: rgc = RefGenConf(filepath=gencfg) if args.get: - print(rgc.get_genome_alias_digest(alias=args.get)) + print(rgc.get_genome_alias_digest(alias=args.get[0])) return elif args.set: with rgc as r: - r.set_genome_alias(genome=args.set[0], digest=args.set[1]) + if len(args.set) == 2: + r.set_genome_alias(genome=args.set[0], digest=args.set[1]) + elif len(args.set) == 1: + r.set_genome_alias(genome=args.set[0]) + else: + _LOGGER.error( + "You can specify either an alias-genomeID pair or just " + "an alias to look up the genomeID from a server") return elif args.remove: with rgc as r: From 61867562f7283eb698fabde9d3043fc3f5758aad Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 14 Jul 2020 14:23:07 -0400 Subject: [PATCH 006/110] add cli hook for alias elimination in remove --- refgenie/refgenie.py | 21 +++++++++++++-------- 1 file 
changed, 13 insertions(+), 8 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 292e35dd..9f31e316 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -170,6 +170,10 @@ def add_subparser(cmd, description): "-f", "--force", action="store_true", help="Do not prompt before action, approve upfront.") + sps[REMOVE_CMD].add_argument( + "-a", "--aliases", action="store_true", + help="Remove the genome alias if last asset for that genome is removed.") + sps[PULL_CMD].add_argument( "-n", "--no-overwrite", action="store_true", help="Do not overwrite if asset exists.") @@ -762,7 +766,7 @@ def main(): _LOGGER.info("{} genomes: {}".format(pfx, genomes)) if args.command != LIST_REMOTE_CMD: # Not implemented yet _LOGGER.info("{} recipes: {}".format(pfx, recipes)) - _LOGGER.info("{} assets:\n{}\n".format(pfx, assets)) + _LOGGER.info("{} assets:\n{}".format(pfx, assets)) except (DownloadJsonError, ConnectionError): bad_servers.append(server_url) continue @@ -791,16 +795,17 @@ def main(): _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) if a["seek_key"] is not None: raise NotImplementedError("You can't remove a specific seek_key.") - bundle = [a["genome"], a["asset"], a["tag"]] + gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} try: - if not rgc.is_asset_complete(*bundle): + if not rgc.is_asset_complete(**gat): with rgc as r: - r.cfg_remove_assets(*bundle) - _LOGGER.info("Removed an incomplete asset '{}/{}:{}'". - format(*bundle)) + r.cfg_remove_assets(**gat, aliases=args.aliases) + _LOGGER.info("Removed an incomplete asset " + "'{genome}/{asset}:{tag}'".format(*gat)) return except (KeyError, MissingAssetError, MissingGenomeError): - _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle)) + _LOGGER.info("Asset '{genome}/{asset}:{tag}' does not exist" + .format(**gat)) return if len(asset_list) > 1: if not query_yes_no("Are you sure you want to remove {} assets?". 
@@ -810,7 +815,7 @@ def main(): force = True for a in asset_list: rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], - force=force) + force=force, aliases=args.aliases) elif args.command == TAG_CMD: rgc = RefGenConf(filepath=gencfg) From 6ce7705f8b89fdd876e57b64e2a21dcdb4ba075b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:20:17 -0400 Subject: [PATCH 007/110] add recipe schema, test data --- MANIFEST.in | 1 + refgenie/schemas/recipe_schema.yaml | 38 ++++++++++++++++++++++++++++ tests/data/recipe_child.json | 20 +++++++++++++++ tests/data/recipe_parent.json | 20 +++++++++++++++ tests/data/t7.fa.gz | Bin 0 -> 12584 bytes 5 files changed, 79 insertions(+) create mode 100644 refgenie/schemas/recipe_schema.yaml create mode 100644 tests/data/recipe_child.json create mode 100644 tests/data/recipe_parent.json create mode 100644 tests/data/t7.fa.gz diff --git a/MANIFEST.in b/MANIFEST.in index e6d1be79..3fa2f075 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements/* +include refgenie/schemas/* include README.md include LICENSE.txt include refgenie/refgenie.yaml diff --git a/refgenie/schemas/recipe_schema.yaml b/refgenie/schemas/recipe_schema.yaml new file mode 100644 index 00000000..01e63706 --- /dev/null +++ b/refgenie/schemas/recipe_schema.yaml @@ -0,0 +1,38 @@ +description: refgenie recipe schema + +properties: + name: + type: string + pattern: "^\\S*$" + description: "name of the recipe with no whitespaces" + description: + type: string + description: "description of the recipe" + assets: + type: object + description: "seek keys to be produced" + required_files: + type: array + items: + type: object + description: "File-type input to the recipe" + required_assets: + type: array + items: + type: object + description: "Asset-type input to the recipe" + required_parameters: + type: array + items: + type: object + description: "Parameter-type input to the recipe" + container: + type: string + pattern: 
"^\\S*$" + description: "Registry path of the container to use" + command_list: + type: array + items: + type: string + description: "List of commands that create the asset" +required: [name, description, assets, required_files, required_assets, required_parameters, command_list] \ No newline at end of file diff --git a/tests/data/recipe_child.json b/tests/data/recipe_child.json new file mode 100644 index 00000000..d20d3000 --- /dev/null +++ b/tests/data/recipe_child.json @@ -0,0 +1,20 @@ +{ + "name": "child", + "description": "child of a asset", + "assets": { + "child_child": "{genome}_child.fa.gz" + }, + "required_assets": [ + { + "key": "fasta", + "default": "fasta", + "description": "fasta asset for genome" + } + ], + "required_parameters": [], + "required_files": [], + "container": "databio/refgenie", + "command_list": [ + "cp {child} {asset_outfolder}/{genome}_child.fa.gz" + ] +} \ No newline at end of file diff --git a/tests/data/recipe_parent.json b/tests/data/recipe_parent.json new file mode 100644 index 00000000..a664beef --- /dev/null +++ b/tests/data/recipe_parent.json @@ -0,0 +1,20 @@ +{ + "name": "fasta", + "description": "DNA sequences in the FASTA format, dummy recipe", + "assets": { + "fasta": "{genome}.fa" + }, + "required_files": [ + { + "key": "fasta", + "description": "gzipped fasta file" + } + ], + "required_assets": [], + "required_parameters": [], + "container": "databio/refgenie", + "command_list": [ + "cp {fasta} {asset_outfolder}/{genome}.fa.gz", + "gzip -df {asset_outfolder}/{genome}.fa.gz" + ] +} \ No newline at end of file diff --git a/tests/data/t7.fa.gz b/tests/data/t7.fa.gz new file mode 100644 index 0000000000000000000000000000000000000000..a2168f52cf5cecfecadda65782179684290e44cd GIT binary patch literal 12584 zcmV+@G1tx?iwFpX`UzhE19UeoW?=w*y-SWP%Z_E)m%$V=fI>DCAf&+w5IO&$Bnm)hA~$A8-!Ir;A& z-(`Gf^`DRZ{iXJj|J5tK^ZfpMX2-#m`}^nj((Dpu5o&+0ZzQu9-DRuZd(;l=@Y@dU z-w*o%e%a68;_u(eOy2ihd~dh=_}$nyZhf5?q1-# 
z{`bQ9=I0;&&9%Ode`{8q0ervw`{(z8{ps#c%`Csm{~J{wUj2UjM)mI0d(CgXHn6_Y zXHvYxm|6So^WA8#TM5Fyw`Yc5eAkBg^|jhLepe=h&JN{!VQ=^E=IsjZoA0>4yS?mN z?NUweZN6E|x5OL9dykzND|a{Ac747n>^&}_uq!$2Ja7H?hVyNZ>$3~j-;#|ZD%WEt z`i1?>ZrRn>hG`j>>Sv}?{kFy?A^H5 zH<51(Yd=G+)!lES~W`3Ck4eHV=zv$EgVzbNGrdy8R2yXU)ZJN9pFHvYena_#S8 z?Z%Wh_$Kk~&Ia}-o!a+o;O{~CCdFSjQi`#U6JuGw+izwZwx5fBE4CNi;M^15EY=q^ zo4eZh*lB+A-768aUZis!w&(0tHh1LXO>|u%s)m)@*x8W~UbVMNBpuSh{(AXS_nX`o zXavZ{p|ictY!BLAbiSk6gz+^`@8EV=TP}8T@-?-_`0j|4$ z+M$wq?w;6T3-&|jyYz5;If)bzSG7C^a-!V6z3MgNjgu?Rr%_q|;%0;Hnw~cN zoAkBWoq*k=F&3A-6a1rAb2H+2741U4Dyn%tsuE zJG^yAXwqlv1UR=mXhVL^DK?3InPPUWw_LIHgx0Ri=KMWrT$yOw;6+HVnZ6-?8dl=`V>=cu1bRd(kIUB{DF7T9lR~$^q=8sXbEUumYcP zu)d#)tr?Hc0Nplb&(^aIvM~#8{NBwN2Gp+oo_ROF77|@>wC7tMN#2azJQJDqL1_MJ9(0aN=csu{?ci`3gr9`GQsR zvZtnZ!^L}TPPlLn<8)IEMOQGH!+A}0|89Twg4w_ZlfLi@BqCu0k;P@4dvfSEsk@)w zNZ(EOz>(kxCkM+-=sF_QS^o2qK9w$iI9bE`!?2#4ER7MxHz-ey%d*j z?Kf0~$v@239<(J|`~I_uV5nZpu)M9gXJZ7U%`%%l%HldZxPNu3We>rld(MG0iw>#j zsb1GS`eQ?TP!3QHu5_1z?D{cy@YDmm`_IHKJVtnWNZdJvwS{TXb`T~1Uyx%;2mopK zZf`dKk|@k~cYLz}0Gv9^LM%UPO0HFE3l)RjHy&a|WxzZfM6@ST6u(F62EqdiNrhib zACm`g&n4OtV1T-#9U!|I7Tm4EuqmAwhAiI>fQPA0%L2rD!oAy~!t4IlRtB)m+$?sG zZ!@S%><1Gt9n-zo{JRXQ0T7I8A9^!HAoW$D1VKTFQhP+FB`|JG9Cc0Gp;3?3T*}ad>?4&SP|uIMGO+<+_d^m zfZ8pyO-ZnlmCat8VU`Gi`8{=LM5L+$Tu49V9US!_WdK$~gdU>_b8Go?uH}lDu9y)U z)QQW(2C33)aq}-UJIWNU3G@~PfE_|=6*7r^7vQ}GjwlFi{VfW>-8%ip{)TssfljQ#|`vJtF_qy3X z1~!1JYpyE7OHjPnm7PMn3*>?dzI0URD(G3T{K?OtdWhfI{isykcaS0TroOl8QM(r7 z?Ipm=_Yn3XajN*J`}_pih^0@-lkq^`|Bd7P#5@%*$D+9V_gdsXxt&6hnC$px@%7$V zsW(ytxm(pBpIk)bgg*#bV#_OeQ40?8zbdK#qe=mp5WJ>zotGD)e$5T#H{EckBky(x z7r?Q159W|;<#6TChA^B*46DdZ4Xi9Ehw~qzh+tR`4~Fgn^jArGc$Cx8Pril8+y!m! 
z&=wD(S`0gr=v~2yuta_!lKA^4btEQJdKH}CW;dJNsx5B!4M=5l2em5GmFjJXAYQ*U zs0Z9fgcw=qyUZTd-4?T6-YwMmLrcgOW6F4ZVJia)z}DEu;6<2{0>KQ`X~a3CzbeG)Sc4U^$KrC4 zO23pY-+^2%Do^_#%4 zjwtqR<-vj58=z9$lC}r22^I!#bR0h5`x?y$4ejFd3HdvSY?CYM*rrse09osa-Cx`c z>cZ8$Yzz5_%z4Gkx3^JRE~NjZRA2Vns|f_e-qOeYufq*V6M(n791>PL-=b^%WhR*V1v%)4tx z=;Hp|aFhc*0c??UwI!lp1JwRgJ^!;>lm=|B3o+876G~dYQLRS~86l3MqtWpRcAmSw z5y`>Vr0#j{AYgKS0f2d;_WN;7^#p5vCOp`=g5wC_xD{52(!O*(w2s7`FNG|L$N)rA_(T7V|&+wl<|BSvWLio@~^%9;oe z=r!GbY2?!1P*RR?CXcjP7Vn3=FP{D;b*1I9x4D%nL3IydV|%DSHyW~=wPx+StoVpV z-3R6RM`^9cm$A38xJ$eqzM14sm~VxrM}#tG-bv`C?gHD7MKv{F{owu@C3axAckOvZ zd$)XU4mf|eFqUP=?^r1mZeDCgn?n;GjHMFYR#Z5azkfn#8G4oSE7ea8_`JKs(&!np zE&#y!siZ3Lu*sojrDeT11Y^-aoUjuij0HDk@n{K>-Zhj zDU+ZDiq01@N7wj1E3+e(gyxe`kW&i}PveqeP}QSe@aRJ_Dt7stW@Fb+lC!zFq+hiL|8XXOfk!`thP~06p?? zBnmZ{170=9+k&#;M)yx}k{_FNM-1Ar-v7!#tzORHF`8ex6A!ELsM$IFevagM_*W7n z*ET6Uuf~}w$g9_)->$T3e2@6MDz5wlWz1lAe zWrpaUc)m10kg_dtCYM-nO>{>6`N`uP`wxjCs3_)>g{-~E)4(bT5a=Yh?uM|KW<^c)2Us%Y z@A(~-1AH$XFP>dzk>(I|1toiyj_eR+$68vNZZY_7gxxz8B#cNGG0FhV0Z~}_j;7iE zNd^rt6g!hxi8+=A+h^`8WQ#P9i8APyL?T21H-UfxF|CZD$f`aIlUjK@_)pgZhA~%H zovms`p-~$lSdLvZK*nx4E;8NTbFvCgXk)8|n$(}8dQc`}*}#@U7%&sr)#P0j4$~#Z zT1j4yNa^ARqJ{pehSPpvvbm;?Qq26oegg}LrM*O>#$Au1iPDx3Y{?XTM4z!!iY4qy zN|tEAx1SFn7M!1N)Y^u}(!O1oSl0E-wTI9JwVQ`$hNukBr{l&SEwN8tKGFd6te2;Ahe0>B|TaZQLY3bETAxA zJowLp5bFe0{(DWKw*pTUz20@lCV?=%{v+q>*41)`cm-vdVqQu8Og70v-r+7z*~N9)Ig~CiAZCF}exYN|a|^u9V^^ zmXTvejVo-om1ijb;U_-AL6~9~AE*Oo3W;+AEb0G7QlnLa)+F~~Vyb^@Q_+1CPv@Cu z4V_BA@!Or<1K>O?XV#ERn3@MbhpG5dHhGAO6To%s)2Z)h7V(m%9FC*sa>Qoi4%|mk zm_LOFt~qJzqIu?N#zBr|0F4~4>*+|gJS3M0nI`UzA=6l5w<>~Qs~24@Wt}vFqfjVASP7gqu78rTPK>~9pHt7# z+{AMf#Itv!4x_4cYvL?xrvs0uec78oUjLHO3O$l2(tFD(BnW7w*kWQ<}&8wp!KY;c}->YIn1!Qu8g zd$xF3f-P*+itphbrB9eURg=1R6Dh%U3Jv|lWpDhYEm%@C+Q*CKdYeR9MhILlW;{mEwdtE3%-TW2G+9|a(p~XCaxYv zgtfgI@w~30U0I!11n($~r54MZuQDB^^hP%ZnFWV$9p5V-3UkqF8R{M;OtzS7?i1Gh zaK3Wj2WTzI7Nhr$$~hJ}p(7>s=9h6cA0NApbp_NkDsWB2j6NVnlhrI$%%G0_bSjn 
zoY&^%{l!Pr)O{qkwB6sA>NR+q+^gDtg(7>dPO5M#%|(`}N550#NIFt!RNq~f?RaT{ zJ-+{lq)8)cg_NM|SP6fKXVCaQAxR4iT)yTn>~&A{I9-1`MD($>07Zo}vX);eC>ouB za(X8!j1OXr6ypfC*e$`V+{n)NvIU~SIh6*XiFvlb9+)f(g_Av1 zoV!sxSYZkB48?nF36<6-?|x%C9!*L;8~1$v;evc&!D>K$S|V2_ZcjDAIoMPm(` zdHQnBICoSyeXfy|#TipqsQR(+iT*&jYwm?5<}tr5-~lyxB#~io(?rr`Dcxs;T6;pD zzn;}`Z_4bQ9G?sFv+%_tJ%38Qk~R$iSi|#_WIe{R4WlG`9R05sGKyAlKp&y4lv1g4 z&?u9x*o`~HGx}ZV8jk9($I@k0)gCIc3MqdMeopz5F>y{JPowB(`puEcK`VXl^Ayt^@0`l%*ro&7~tv zkRxpbvf~n)tJh_Hy}M>$G4tRwR|EsjSRM<3QwbOo`u%IyBlaGW;FxZS z{93q2b$(s}69Q_tsNGV_hV+(l-)Tm}O|y9epBaP;n;a)V{!M4B7^P?gtrdE~T39pc zn1ypz0eo#V;FkVTCr4br-ktZrtCMny=QolRv07g|Jsu_38ot(|{G@0Xk)uq4akj|9 z^+&}(_ecCZHIo#7^L_}q#Masi8Gy#B_Ujscb2(4NYDJrIBQgGyq6j!0TDdh@ao3Ko zLvivFdeiotj#^wJJw` zyG>WeO=+yh(4HQ^f<-w5*HJg!!kx(x%vr4*i1OY9t^g3D7u)&7yX*Cd-qt9Ky4C?m z_E$vK4_I%*=&ES#Q^%?`c9>G+5SL43{-z66_XA}X_)7eol9WXm6=4JnTDfKCoQA1) zc-flKXPC}LRYwf@Tja2e+}wHZ^yj9FjF`ALNmnUxq9aBe4^E(Ds+}#!-^u`nu>~MX z`~5>>%d;Oa%qdnE+Ov9cJ#_EB3Hnx4>)*J9Ka@e=rRDarXNrV*kqse#4 zWHSjU50vUkc~Tnz2n0WKUL*~1OXx)UDE9*j}&Q4ejXG(2G9 ziimD>=wR)Me8_FDl-+J;+tH#=SrRWgf)v&m4(?Tp$vaL@oWGYEf&9u{+I`J&xna~U zvh-#S_&u@+rS^HYs&>!TpC~~blAq{$)Z8ZO7@MBxe{dh^v=~r(*81g>ld5ay3ve1R z#R0Y$7#7v>77`;D!H)Is9|?y^wQmc|8|$Au#|OI#g?Kw&6wx*^z!~!wn?_P|2R&Cl z1wv+Mzn%ct@>h_iT_Hk+rkJfCFDY(0Tivg+!-P-1p%dT_vN0=Hs6|CZz9!4B+_mT3 zCIPK$@N+ENm>;BZp^Le8Gthn+6btGzWsO{x=g3Hg%0%Xw?APuq^5GwKPsxNh;^b-# zS!|7$L4%V#@1Z!P6KMDAAcr-OE}a@eoXb`wk}I0pZ#Sh3Gh3h|Qg>ziS8br=7shz0 zNB{*`PP@$}7I^>ZAQIElg8o?5qI?0$(KM(?2+U0Zd-D8YDL(M4`lQRcR*o~T0A(MZ zFlXQJyBe$3(=j`(A>Zt1ns)q%5_Buv3J7~|80F{(fwwAbxE}i#{~Mlkm76&(rdQcs zMbw?`X^TJtBbji+;KrTGt73(E)K*h;zx8}|HH_CQ5)HC#tk4{82rnIie@X}%#W?dT zK)#Y0lNsJ!3VJ>1zQP&qv1&ty(yBVymVan*=5%>(XKO4R#v2>wRLxj$Y9aQf+37H;w4Rj;2HO|YeX`Cz1r zD;Ol*39(9|a@}k>@`J3M3_nL2ItIa$F3Ek_bP7Dqf8F#sK%_VO5gIqyEP|Z4Yg0zd z9}SB8PD$HEwEk45MS4u&mQ;eJ!V*{x<;jQp3#xLSzL#K$d+S6yLTM~%jX%ujt5KIn zjhtEQdgMxyX2esHL!I`+=w9;FmfQEoFUs~v5JUt{>$v}Obxtx9;sNU zJu!rTk1&4&J`{Al1A;A^Ob4CW4^#Gw4SE&!u`1G 
zeys&^8mLa*x1V<=&&;NaD9?z2CCWN}=OH^W; zgvPoS`R$+#2V$7bBIUG*IMl!>25lMC8xlu^dO3?i-3|(`xQ8*rCQ1?DOJ{i3rebQ6 z_2n>Rs@C`eDs8cw*fn7fvq+SC!YnDY2sR^_4IE<vImyUs#Jpp1Xl(_o*x`IP z5W_`fmSB(~p2~6cJF23DUU4@4EvK|nDhkw()Tvw2lvWDOiUpF^7-A<92JO6U@6WuG zPM&{a%cX;EIkg_?EcP3QdR2KkXkup>QG!ZcwiVs^#VJe5H=}8An5D;6>Sq%p{q5f& zhVz6PlG0%!-BBTYHY9?OjIu#(mnzP+T5EhPFd^)D=%VHY^LWlj)F~f(itdd(dLpl#|gc%t8mU_4g7ghRZhTq_y5 zc1j+UP?m;0CYyrFi~gu5QesVeKLTw-{g9GEpa9gFb95V-a{aV?&J7g(`^=#!kX9AX zh#ejRKOhus6F_)GG8c@g;FhW5Nn+|6M3mFHf^#;&*_6C=U*2#3B3RQtj4Z?M^aFbV zplWVf#;L}iU^i^ajsJYx>I^rI(o&|y6gzAZGW{I;V{ZH+jKu4{){8^Miq%Uy>hKCf zcTIkIz*av}^b<074y_Z)7(L$C{*DM{D72Dr^$M-38ysfKglJ_;*nR2@OR-;gB>GCy zx+n@E88M_}=ahfe-R8l&Z9I`kBNmbUiPBri=0Pk1U(U0n}UN!T-5i$4`0X#vjB`_%0&^i8)SZ+rXAWe z?v&h1=KI9ZE(LceuZo1%_MDP=H;Ay+xzCM55rd!b8hMkei(S4elcuVvd|3i0sZG7o zlZk2r0D%4=U#G=GHg*)+NeC8BZNL$FXq*PFIVf-^X$LC@wfVJtR8m|A$AT;goX&Mg z3A!E%KKFTkfUBs@l4*d%Q=;Li$PxRI6KfPpH~avU^fyqI-`KI{$ILM>b}2Mg6L~; z(g;E;$zwZ6Brcn$P4(j~gOrsfqvC88p9h!#dcKRNNArw?hglxN znsgFLh1cqMw2{2o=%72KqLh-Fy#bx_)0)zO$v3@U46!j!ltOgo#7!OZpY9IQt|yt9 zil}D4p-rP(uCAIECjA%Z8ou{eLy`)oMb<2zBmnmNN5vgPaFOa&ueX|U+c8m$ zX&~XS8gq~Pw6=QId|I?OZg}SoXn>ntNurXKdck#~@hC@-ldIB5tmgUCzbW(osn>R} zM*$x=5gStdlSCh;s2D4t-pkl^jMjA9C{vk1Kaow6k?-R(9Cwfgx~bLk9u_j&tPXJl z`02OMeKCd8azznrp1E25rXy}sfN{FDtNv(B=1ih0d(PEZg)kV6fxaj zLTT#99)M#Lqy`?_GSE zKgwjzH#l$~USh9*4$HZwK%{d&nxW2Pdvai0L`*Pf9B@#-Qeyta*px^d$m zB@xG(yltOTXGA@Tt+j}%=o4M$3+opLUAJQdr+%@9;)9$CIhKXDh1sjq;d5g%cA zV9u}VF-DD`O|HrxD%QG)WB!ul><_dYbkJVD{0CiBt5k1Vj%;Zr&dU6orvOQ@g8?*n z=~uTEQ-7XC^Y(<^!%>odJs3ANj3|1})_wffb42D#j25;!z%b!|Lusrz5}1(X#MIav zP$mleDVei0aJPYXx>EJ)00_|H-g_Fi*?u&U1X9k3k*Iz2Pm0l&Cf5}U&tq5-fZ zPKeQZ_4lSM%cEbn)wNEI`i6K71Rd&y?pVHDIrqLH=l&15cq63T0Jc+yk(Ue_T(1m}pT)SfWi) z^GwM}n-wrrH=05sV1Le9=`*l_uSEjzs^oU%dlqPN0i5QQyff)2V`4qg^Tr#b8KMok zhK63Un^hydN#NHT-miN>wAW7yQ%H{?g2!1rgbKX2sE=luQZ)(#+V1?Y4wg};A#Bb} z1Y2c&qcH7t>xoNYE?%ZBFHiVRk~pyn9WR_g=PqAIv+?sLDtm_8=pfz+kb=P5y= zLep8y*W1#Dr31ZHla`LB9pK&dLwxctMClu^A6Ot@<$LzoTANQ)5h_J#(o7;@dF)Wl 
z2ZDU&zU+DbG>y~`ea3^p&~=#C?gtwH4)PwweNzPWuKobq&euq28-U5kmqMu{8baXe zB+qW=Uv@Fbn$6``$`$d;FQ02R3wB77sBCAD(XkNtg?9q9hB-0RG* zX9j%@ntUtWhXGC{Ya7LHo2F1(cBjmgr2>HFb|dex6)|{T#YcuU9h~6u=bOW^VYG zRrqM`OgdQrFR;JT4tYvH5GtPk=#YJ!;YK}>72gTMCu*&W6fM!GOs+zASG}GaSDV5@ zQ4~}CngkxC1WnpNrD}axis*zD2K?j)^Nuu$?r@I3lD{Ui*c@ zoxdp2^!{D&^b?O;1X-8`>)&Gsk%_rTkkiJF=WV|R*@BefT_L54x);dldOthNvPb&v zb0ZGj2eDtgJ>8>UqrsDjl~^s7+h1%946_)$%p#vZ;dbm(+vIt$X>vZVnT4hCSjQPyBH$^99^h+K9(v~_l2DQt zH`l-Hj|Ty4-Lr~m#4-25Cz~Pr=bY6aZlt+z;R!dW@d4=BT-+WMq_+uJyxK;T@0I&1 z_=*sJFodHS%1i3F4{srYp|9L$qd#!dK0&mp`WUROu&2Rnwr0e3f&UjCr`(Ds`G59k zg^!~E-9HcW-qI5`LdIGGk2`^(5E}6&=p`Z}c~-p^c+~k$4V>|oGpGp=9R(MM0su>5 z-3t0`JB-G(3N83cwMf__232IA&L;{*v2SQH zA1$j8PoOX1q?`Xl8w()Uby_Lv`^gin^GTH*N(%W#tnkThF+m3)nAnGZiQl|{g|$LA zf=9P3-k*4er1aVicFPieBgP&rI}&h^QVKS#UxDZeRu}c!4rB)zHI?jzPsRPYvPMX~ z4vno-!aN8zB`BWAa|EAU0&uCAKY8cnE9nt_pHZ;LZZLBz?hP>sg;VI~k6U2ypU!RVeE0KG$`x%7N+17@PE zvNQFl2lUJOQ%25celBnG~fQ}qy@t(=t@{Xv12UA|6 z?U`-*C`{QLK;JYEjXk504ONp$Rc4FXnFu#FF_U52xy}d^ z^`s_7hvL-u?qdfs*;uylvs^cE>2W}Hx58DPexthYznd#LdkeJK8v7~I$2-$T!B4$w zOa6&i<8_l(lpXEx^60FrNHx0&Xe79kawhCUb|A>eS359I=|t_Imf22Hm+#D+pjQzw zy#+aTYIrFH3WjWG)cG^vGNeGVXcRGX#Qd6V1O%aOH%juIiz zh#Ib{3=Nn>P-=0p1c(D=wc1DUCuGffzA3=-GV%U!hnUli;E)q|%F3aRX9}-4 z{``w3)v9~V!{n#4T9*JZ*_MQK^kr<%>F2dngz>Rzf}z5-8-l4c$F|3DKwJO-D15+S z$8T|Z3SA^&1sknIRsNz`zeew()j+qaqsdJ7`CxRo!d_Kui(0LG4A_5dnCnjgeU!SE?R1-sm(Vqy}GtWxrQ|a=8)SAvd;87z@vf zfd)`Mun9J<4ry8mfd!E_nPxp26r4NBN&s}3YP~)Ry*eiNyonYq)=Qfls}4hjJ8WZc zBti|8nA*j1(TGI9OZ=t~zyu_?*rOxD(_2NUK*3=xlunba@C_(vM`M+$NQo15Y;iHa zU3J^&!BjGZ`%=GAXYz1T?ql1K3}XI*h&e7c6)>sF#+JM_>iz2vz>nP|tqTz9yGedi z-jPJ2>W3GrfQOY*;YjIsl1Dqy-)Ump>P4raL42N8$rXox2LVDMc~x|? 
zXc_QJY4&3`^5s*|sC);&yJt_bCH>BpV@o3|g9y@Y@yK)@Q8S29bJdU#Rbk*yx4jNg z=o3|bADC;DG|P6ynDEIMJG1l^Fp#dv>BTDM$+gp9Bcju_@ONr-t!@M&C(aKSZTS6& zO?8CN?sp(=N=;`)w1P@xw9uVzIk-ANWxL>u9f|d z@NGDFl7>lxKCKw*^V(=o7aByN!gi4q;vOL5j>~lQR00{d*d^Px-~5V*-lBj|$s4Vd zRU_B{4G?xly}^D)$r4Or+@twMII16*o<}dsP-{CgIVG^)|3-O2xmM8`S)BC;^W&rz zK{zjZngfgpiq=ulP6%{nGY~_btd$W*bM*vXlx2}l8*%YiP@e`-WYr{^dgY=iU8W(A1 zI^rFy2zc-1X?ZX Date: Wed, 15 Jul 2020 08:27:56 -0400 Subject: [PATCH 008/110] add experminental json recipe support, validate with jsonschema --- refgenie/refgenie.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 9f31e316..fbb59492 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -387,6 +387,20 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) + def _read_json_file(filepath): + """ + Read a JSON file + + :param str filepath: path to the file to read + :return dict: read data + """ + with open(filepath, 'r') as f: + data = json.load(f) + return data + + if os.path.isfile(recipe_name) and recipe_name.endswith(".json"): + recipe_name = _read_json_file(filepath=recipe_name) + if not hasattr(args, "outfolder") or not args.outfolder: # Default to genome_folder _LOGGER.debug("No outfolder provided, using genome config.") @@ -485,8 +499,13 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar asset_tag = a["tag"] or rgc.get_default_tag(genome, a["asset"], use_existing=False) recipe_name = recipe_name or asset_key - if recipe_name in asset_build_packages.keys(): - asset_build_package = _check_recipe(asset_build_packages[recipe_name]) + if recipe_name in asset_build_packages.keys() or isinstance(recipe_name, dict): + if isinstance(recipe_name, dict): + _LOGGER.info("Using custom recipe: \n{}".format(recipe_name)) + 
asset_build_package = _check_recipe(recipe_name) + recipe_name = asset_build_package["name"] + else: + asset_build_package = _check_recipe(asset_build_packages[recipe_name]) # handle user-requested parents for the required assets input_assets = {} parent_assets = [] @@ -1031,6 +1050,16 @@ def _check_recipe(recipe): :param dict recipe: asset_build_package :raise ValueError: if any key names are duplicated """ + # experimental feature; recipe jsonschema validation + from jsonschema import validate + from yacman import load_yaml + SCHEMA_SRC = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas", "recipe_schema.yaml") + if os.path.exists(SCHEMA_SRC): + validate(recipe, load_yaml(filepath=SCHEMA_SRC)) + _LOGGER.info("Recipe validated successfully against a schema: {}".format(SCHEMA_SRC)) + else: + _LOGGER.warning("Recipe schema not found: {}".format(SCHEMA_SRC)) + # end of validation req_keys = [] for req in [REQ_PARAMS, REQ_ASSETS, REQ_FILES]: req_keys.extend([req_dict[KEY] for req_dict in recipe[req]]) From 2bba971818cd62094ede9ab3597df2fbb02d96a5 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:29:03 -0400 Subject: [PATCH 009/110] add github action workflow, update requirements --- .github/workflows/test-refgenie-cli.yml | 78 +++++++++++++++++++++++++ requirements/requirements-all.txt | 1 + requirements/requirements-dev.txt | 5 +- tests/data/recipe_child.json | 2 +- 4 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test-refgenie-cli.yml diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml new file mode 100644 index 00000000..ea75f819 --- /dev/null +++ b/.github/workflows/test-refgenie-cli.yml @@ -0,0 +1,78 @@ +name: Test refgenie CLI + +on: + push: + branches: [master, dev] + +jobs: + test_CLI: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.8] + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v2 + 
+ - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dev dependancies + run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi + + - name: Install package + run: python -m pip install . + + - name: install macOS-specific dependancies + if: startsWith(matrix.os, 'macOS') + run: brew install md5sha1sum + + - name: create genomes dir + run: mkdir genomes + + - name: refgenie init + working-directory: ./genomes + run: refgenie init -c g.yaml; cat g.yaml + + - name: refgenie list + working-directory: ./genomes + run: refgenie list -c g.yaml + + - name: refgenie build fasta (parent asset) + run: | + refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json + ./tests/assert_in_file.sh genomes/g.yaml t7 0 + ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 0 # this is a digest that should be produced from this FASTA file + + - name: refgenie build fasta_child (child asset) + run: | + refgenie build -c genomes/g.yaml t7/fasta_child --recipe tests/data/recipe_child.json + ./tests/assert_in_file.sh genomes/g.yaml fasta_child 0 + + - name: refgenie list + working-directory: ./genomes + run: refgenie list -c g.yaml + + - name: refgenie remove fasta_child + run: | + refgenie remove -c genomes/g.yaml t7/fasta_child -f + ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 + ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d/fasta_child:default 1 # test if the entry was removed from the fasta children list + + - name: refgenie remove fasta, leave digest + run: | + refgenie remove -c genomes/g.yaml t7/fasta -f + ./tests/assert_in_file.sh genomes/g.yaml fasta 1 + ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 0 + + - name: refgenie build fasta + run: refgenie build -c genomes/g.yaml t7/fasta 
--files fasta=refgenie/tests/data/t7.fa.gz --recipe refgenie/tests/data/recipe_parent.json + + - name: refgenie remove fasta, remove digest + run: | + refgenie remove -c genomes/g.yaml t7/fasta -f -a + ./tests/assert_in_file.sh genomes/g.yaml fasta 1 + ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 1 \ No newline at end of file diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 1a903985..896e8f0b 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,5 @@ logmuse>=0.2.6 #refgenconf>=0.10.0-dev +#seqcol piper>=0.12.1 pyfaidx>=0.5.5.2 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index c368db1d..c0f5213b 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1 +1,4 @@ --e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file +-e git+git://github.com/databio/yacman@dev#egg=yacman +-e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf +-e git+git://github.com/databio/henge@master#egg=henge +-e git+git://github.com/refgenie/seqcol@master#egg=seqcol \ No newline at end of file diff --git a/tests/data/recipe_child.json b/tests/data/recipe_child.json index d20d3000..e0b0e51e 100644 --- a/tests/data/recipe_child.json +++ b/tests/data/recipe_child.json @@ -1,6 +1,6 @@ { "name": "child", - "description": "child of a asset", + "description": "child of an asset, dummy recipe", "assets": { "child_child": "{genome}_child.fa.gz" }, From 2d82d733cd9bec0d8884f3e93d8a018edb19f9a7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:33:59 -0400 Subject: [PATCH 010/110] add file content checking script --- tests/assert_in_file.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 tests/assert_in_file.sh diff --git a/tests/assert_in_file.sh b/tests/assert_in_file.sh new file mode 
100755 index 00000000..c0c192f0 --- /dev/null +++ b/tests/assert_in_file.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [ $# -ne 3 ]; then + echo $0: usage: assert_in_file.sh filepath query inverse + exit 1 +fi + +if [[ "$3" == "1" ]]; then + echo -e "\nTesting if '$2' is not in '$1'" + if grep -q "$2" "$1"; then + echo -e "\ERROR: '$2' is in '$1'\nContents:\n" + cat "$1" + exit 1 + else + echo -e "\nSUCCESS: '$2' not in '$1'\n" + exit 0 + fi +else + echo -e "\nTesting if '$2' is in '$1'" + if grep -q "$2" "$1"; then + echo -e "\nSUCCESS: '$2' is in '$1'\n" + exit 0 + else + echo -e "\nERROR: '$2' not in '$1'\nContents:\n" + cat "$1" + exit 1 + fi +fi From b78ea1d43a3dfd9059a8b40fefc186a10887e2df Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:37:32 -0400 Subject: [PATCH 011/110] update recipe --- tests/data/recipe_child.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data/recipe_child.json b/tests/data/recipe_child.json index e0b0e51e..afe1d76e 100644 --- a/tests/data/recipe_child.json +++ b/tests/data/recipe_child.json @@ -1,8 +1,8 @@ { - "name": "child", + "name": "fasta_child", "description": "child of an asset, dummy recipe", "assets": { - "child_child": "{genome}_child.fa.gz" + "fasta_child": "{genome}_child.fa.gz" }, "required_assets": [ { @@ -15,6 +15,6 @@ "required_files": [], "container": "databio/refgenie", "command_list": [ - "cp {child} {asset_outfolder}/{genome}_child.fa.gz" + "cp {fasta} {asset_outfolder}/{genome}_child.fa.gz" ] } \ No newline at end of file From fd6ab80c1271412578a26bcd58d512e11a1a6be0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:40:04 -0400 Subject: [PATCH 012/110] update workflow --- .github/workflows/test-refgenie-cli.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index ea75f819..85d50664 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ 
b/.github/workflows/test-refgenie-cli.yml @@ -69,7 +69,7 @@ jobs: ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 0 - name: refgenie build fasta - run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=refgenie/tests/data/t7.fa.gz --recipe refgenie/tests/data/recipe_parent.json + run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json - name: refgenie remove fasta, remove digest run: | From 2e11e8c6535aed74a85c301dcfd29bd1d50eb9ce Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 15 Jul 2020 08:54:14 -0400 Subject: [PATCH 013/110] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a00ae270..0e8a99dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -![Build package](https://github.com/refgenie/refgenie/workflows/Build%20package/badge.svg) +[![Build package](https://github.com/refgenie/refgenie/workflows/Build%20package/badge.svg)](https://github.com/refgenie/refgenie/actions?query=workflow%3A%22Build+package%22) +[![Test refgenie CLI](https://github.com/refgenie/refgenie/workflows/Test%20refgenie%20CLI/badge.svg)](https://github.com/refgenie/refgenie/actions?query=workflow%3A%22Test+refgenie+CLI%22) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/refgenie/README.html) Refgenie
From 145172e18be99f80b366881f843a22ab50c36811 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 16 Jul 2020 10:06:21 -0400 Subject: [PATCH 014/110] add test step in workflow --- .github/workflows/test-refgenie-cli.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 85d50664..289c13c9 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -71,8 +71,14 @@ jobs: - name: refgenie build fasta run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json + - name: refgenie change alias + run: | + refgenie alias --remove t7 + refgenie alias --set t7_new fde5c225d75637d6b2fd463a37ff875d + ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 + - name: refgenie remove fasta, remove digest run: | - refgenie remove -c genomes/g.yaml t7/fasta -f -a + refgenie remove -c genomes/g.yaml t7_new/fasta -f -a ./tests/assert_in_file.sh genomes/g.yaml fasta 1 ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 1 \ No newline at end of file From d36fb2cb95feac35aaa51ff67f975ef9f0504d14 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 16 Jul 2020 10:09:08 -0400 Subject: [PATCH 015/110] fix test step in workflow --- .github/workflows/test-refgenie-cli.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 289c13c9..796554d5 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -73,8 +73,8 @@ jobs: - name: refgenie change alias run: | - refgenie alias --remove t7 - refgenie alias --set t7_new fde5c225d75637d6b2fd463a37ff875d + refgenie alias -c genomes/g.yaml --remove t7 + refgenie alias -c genomes/g.yaml --set t7_new fde5c225d75637d6b2fd463a37ff875d ./tests/assert_in_file.sh 
genomes/g.yaml t7_new 0 - name: refgenie remove fasta, remove digest From ea721ffbccf82115530afed49b20afe1448d61d4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 13:27:07 -0400 Subject: [PATCH 016/110] make building work with local and external recipes --- refgenie/refgenie.py | 2 +- refgenie/schemas/recipe_schema.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index d26ce62d..f021941c 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -351,7 +351,7 @@ def _read_json_file(filepath): data = json.load(f) return data - if os.path.isfile(recipe_name) and recipe_name.endswith(".json"): + if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(".json"): recipe_name = _read_json_file(filepath=recipe_name) if not hasattr(args, "outfolder") or not args.outfolder: diff --git a/refgenie/schemas/recipe_schema.yaml b/refgenie/schemas/recipe_schema.yaml index 01e63706..c06bc03f 100644 --- a/refgenie/schemas/recipe_schema.yaml +++ b/refgenie/schemas/recipe_schema.yaml @@ -35,4 +35,4 @@ properties: items: type: string description: "List of commands that create the asset" -required: [name, description, assets, required_files, required_assets, required_parameters, command_list] \ No newline at end of file +required: [description, assets, required_files, required_assets, required_parameters, command_list] \ No newline at end of file From 1277e5d251ccef5e16e06e92e14dedbaf9168a1f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 16:16:21 -0400 Subject: [PATCH 017/110] force renaming genome --- refgenie/refgenie.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index f021941c..4d39c2ae 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -850,13 +850,14 @@ def main(): elif args.set: with rgc as r: if len(args.set) == 2: - r.set_genome_alias(genome=args.set[0], 
digest=args.set[1]) + r.set_genome_alias(genome=args.set[0], digest=args.set[1], + force=True) elif len(args.set) == 1: - r.set_genome_alias(genome=args.set[0]) + r.set_genome_alias(genome=args.set[0], force=True) else: _LOGGER.error( - "You can specify either an alias-genomeID pair or just " - "an alias to look up the genomeID from a server") + "You have to specify either an alias-genomeID pair or " + "just an alias to look up the genomeID from a server") return elif args.remove: with rgc as r: From 1a542f2c26c88c7868c48df1623d16db66d9b3cc Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 17:00:08 -0400 Subject: [PATCH 018/110] fix compare --- refgenie/const.py | 6 +++--- refgenie/refgenie.py | 32 +++++++++++++++----------------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/refgenie/const.py b/refgenie/const.py index 93bab459..47b2b4a3 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -18,7 +18,7 @@ SUBSCRIBE_CMD = "subscribe" UNSUBSCRIBE_CMD = "unsubscribe" ALIAS_CMD = "alias" -# COMPARE_CMD = "compare" +COMPARE_CMD = "compare" GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] @@ -39,6 +39,6 @@ ID_CMD: "Return the asset digest.", SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", - ALIAS_CMD: "Interact with aliases." - # COMPARE_CMD: "compare two genomes." + ALIAS_CMD: "Interact with aliases.", + COMPARE_CMD: "compare two genomes." 
} diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 4d39c2ae..cc550177 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -121,10 +121,7 @@ def add_subparser(cmd, description): "-r", "--recipe", required=False, default=None, type=str, help="Provide a recipe to use.") - alias_group = sps[ALIAS_CMD].add_mutually_exclusive_group( - # title='Aliases manipulation arguments', - # description='Specify the action you want to perform on the aliases' - ) + alias_group = sps[ALIAS_CMD].add_mutually_exclusive_group() alias_group.add_argument( "-r", "--remove", metavar="A", required=False, default=None, type=str, nargs="+", @@ -133,19 +130,20 @@ def add_subparser(cmd, description): alias_group.add_argument( "-s", "--set", metavar="K-V", required=False, default=None, type=str, nargs="+", help="Key-value pair of alias and genome ID or just an alias when the " - "genome ID is to be looked up from a server") + "genome ID is to be looked up from a server, " + "e.g. 'hg38 fc66db1b06b8e99bb177ae03c139713d' or 'hg38'") alias_group.add_argument( "-g", "--get", metavar="A", required=False, default=None, type=str, nargs=1, help="Get genome identifier for an alias.") - # sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, - # help="First genome for compatibility check") - # sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, - # help="Second genome for compatibility check") - # sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", - # help="Do not print compatibility code explanation") + sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, + help="First genome for compatibility check") + sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, + help="Second genome for compatibility check") + sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", + help="Do not print compatibility code explanation") # add 'genome' 
argument to many commands for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: @@ -869,12 +867,12 @@ def main(): print("\n".join("{}\t{}\t{}".format(k.rjust(32), v.ljust(20), ("*" if k in rgc.genomes else "").ljust(20)) for k, v in aliases.items())) return - # elif args.command == COMPARE_CMD: - # rgc = RefGenConf(filepath=gencfg, writable=False) - # res = rgc.compare(args.genome1[0], args.genome2[0], - # explain=not args.no_explanation) - # if args.no_explanation: - # print(res) + elif args.command == COMPARE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False) + res = rgc.compare(args.genome1[0], args.genome2[0], + explain=not args.no_explanation) + if args.no_explanation: + print(res) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): From 9f5ed95083ba7b0ceec826a88e27785b4661152b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 17:05:18 -0400 Subject: [PATCH 019/110] update reqs --- requirements/requirements-all.txt | 1 - requirements/requirements-dev.txt | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 896e8f0b..1a903985 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,4 @@ logmuse>=0.2.6 #refgenconf>=0.10.0-dev -#seqcol piper>=0.12.1 pyfaidx>=0.5.5.2 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index c0f5213b..c368db1d 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1 @@ --e git+git://github.com/databio/yacman@dev#egg=yacman --e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf --e git+git://github.com/databio/henge@master#egg=henge --e git+git://github.com/refgenie/seqcol@master#egg=seqcol \ No newline at end of file +-e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No 
newline at end of file From fe17344b76974bdc36aca80464a7a433e4fa6a2a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 17:13:00 -0400 Subject: [PATCH 020/110] install reqs for devel --- requirements/requirements-dev.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index c368db1d..e19a6d2d 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1 +1,3 @@ +-e git+git://github.com/databio/henge@master#egg=henge +-e git+git://github.com/refgenie/seqcol@master#egg=seqcol -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From c839ed403ce216157d045d2c2053c27b8b5d4ae2 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 29 Jul 2020 17:20:39 -0400 Subject: [PATCH 021/110] install reqs for devel --- requirements/requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index e19a6d2d..6d4e6b87 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,3 +1,4 @@ -e git+git://github.com/databio/henge@master#egg=henge -e git+git://github.com/refgenie/seqcol@master#egg=seqcol +-e git+git://github.com/databio/yacman@dev#egg=yacman -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From c0aa1d9f7612e1e19ecd427957e7c14c888bc777 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 4 Aug 2020 17:09:54 -0400 Subject: [PATCH 022/110] alias-digest dir mirroring changes --- refgenie/refgenie.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index cc550177..c622559d 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -315,7 +315,7 @@ def refgenie_initg(rgc, genome, content_checksums): :param str genome: name of the genome :param dict content_checksums: 
checksums of individual content_checksums, e.g. chromosomes """ - genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome) + genome_dir = os.path.join(rgc.data_dir, genome) if is_writable(genome_dir): output_file = os.path.join(genome_dir, "{}_sequence_digests.tsv".format(genome)) with open(output_file, "w") as contents_file: @@ -355,7 +355,7 @@ def _read_json_file(filepath): if not hasattr(args, "outfolder") or not args.outfolder: # Default to genome_folder _LOGGER.debug("No outfolder provided, using genome config.") - args.outfolder = rgc[CFG_FOLDER_KEY] + args.outfolder = rgc.data_dir _LOGGER.debug("Default config file: {}".format(default_config_file())) @@ -429,6 +429,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar # create a temporary object to run seek on. tmp_rgc = RefGenConf() tmp_rgc[CFG_FOLDER_KEY] = rgc[CFG_FOLDER_KEY] + tmp_rgc[CFG_ALIASES_KEY] = rgc[CFG_ALIASES_KEY] tmp_rgc.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key}) tmp_rgc.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) digest = get_dir_digest( @@ -543,6 +544,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'". format(genome, asset_key, asset_tag, args.tag_description)) r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description}) + rgc._symlink_alias(genome, asset_key, asset_tag) else: _raise_missing_recipe_error(recipe_name) @@ -564,7 +566,7 @@ def _exec_list(rgc, remote, genome): return pfx, assemblies, assets, recipes -def perm_check_x(file_to_check, message_tag): +def perm_check_x(file_to_check, message_tag="genome directory"): """ Check X_OK permission on a path, providing according messaging and bool val. 
@@ -578,8 +580,7 @@ def perm_check_x(file_to_check, message_tag): _LOGGER.error(msg) raise ValueError(msg) if not os.access(file_to_check, os.X_OK): - _LOGGER.error("Insufficient permissions to write to {}: " - "{}".format(message_tag, file_to_check)) + _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) return False return True @@ -724,15 +725,13 @@ def main(): force_large = True force = False - outdir = rgc[CFG_FOLDER_KEY] + outdir = rgc.data_dir if not os.path.exists(outdir): raise MissingFolderError(outdir) - target = _key_to_name(CFG_FOLDER_KEY) - if not perm_check_x(outdir, target): + if not perm_check_x(outdir): return if not _single_folder_writeable(outdir): - _LOGGER.error("Insufficient permissions to write to {}: {}". - format(target, outdir)) + _LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) return for a in asset_list: @@ -1053,7 +1052,7 @@ def _seek(rgc, genome_name, asset_name, tag_name=None, Strict seek. Most use cases in this package require file existence check in seek. This function makes it easier """ - return rgc.seek(genome_name=genome_name, + return rgc.seek_src(genome_name=genome_name, asset_name=asset_name, tag_name=tag_name, seek_key=seek_key, From 56f01b22d4af86b2bdbce0f03298108c54e65b67 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 11 Aug 2020 16:53:30 -0400 Subject: [PATCH 023/110] update refgenie alias CLI --- refgenie/const.py | 10 ++++++ refgenie/refgenie.py | 72 +++++++++++++++++++++++--------------------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/refgenie/const.py b/refgenie/const.py index 47b2b4a3..be6b4fcc 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -42,3 +42,13 @@ ALIAS_CMD: "Interact with aliases.", COMPARE_CMD: "compare two genomes." 
} + +ALIAS_GET_CMD = "get" +ALIAS_SET_CMD = "set" +ALIAS_REMOVE_CMD = "remove" + +ALIAS_SUBPARSER_MESSAGES = { + ALIAS_REMOVE_CMD: "remove aliases.", + ALIAS_SET_CMD: "set aliases.", + ALIAS_GET_CMD: "get aliases." +} diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c622559d..63b51c23 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -24,6 +24,7 @@ VersionInHelpParser, is_command_callable from ubiquerg.system import is_writable from yacman import UndefinedAliasError +from argparse import HelpFormatter from .refget import fasta_checksum _LOGGER = None @@ -47,13 +48,17 @@ def build_argparser(): subparsers = parser.add_subparsers(dest="command") - def add_subparser(cmd, description): + def add_subparser(cmd, msg, subparsers): return subparsers.add_parser( - cmd, description=description, help=description) + cmd, description=msg, help=msg, + formatter_class=lambda prog: HelpFormatter( + prog, max_help_position=37, width=90 + ) + ) sps = {} for cmd, desc in SUBPARSER_MESSAGES.items(): - sps[cmd] = add_subparser(cmd, desc) + sps[cmd] = add_subparser(cmd, desc, subparsers) # It's required for init sps[cmd].add_argument( '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", @@ -121,22 +126,30 @@ def add_subparser(cmd, description): "-r", "--recipe", required=False, default=None, type=str, help="Provide a recipe to use.") - alias_group = sps[ALIAS_CMD].add_mutually_exclusive_group() + alias_subparser = sps[ALIAS_CMD] + alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand") - alias_group.add_argument( - "-r", "--remove", metavar="A", required=False, default=None, type=str, nargs="+", - help="Remove an alias.") + alias_sps = {} + for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): + alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) - alias_group.add_argument( - "-s", "--set", metavar="K-V", required=False, default=None, type=str, nargs="+", - help="Key-value pair of alias and genome ID 
or just an alias when the " - "genome ID is to be looked up from a server, " - "e.g. 'hg38 fc66db1b06b8e99bb177ae03c139713d' or 'hg38'") + alias_sps[ALIAS_SET_CMD].add_argument( + "-a", "--aliases", metavar="A", required=False, default=None, type=str, + nargs="+", help="Aliases to set.") + alias_sps[ALIAS_SET_CMD].add_argument( + "-d", "--digest", metavar="D", required=True, type=str, + help="Digest to set.") - alias_group.add_argument( - "-g", "--get", metavar="A", required=False, default=None, type=str, nargs=1, - help="Get genome identifier for an alias.") + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-a", "--aliases", metavar="A", required=False, default=None, type=str, + nargs="+", help="Aliases to remove.") + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-d", "--digest", metavar="D", required=True, type=str, + help="Digest to remove.") + alias_sps[ALIAS_GET_CMD].add_argument( + "-a", "--aliases", metavar="A", required=True, type=str, nargs="+", + help="Aliases to get the digests for.") sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, help="First genome for compatibility check") @@ -841,30 +854,19 @@ def main(): return elif args.command == ALIAS_CMD: rgc = RefGenConf(filepath=gencfg) - if args.get: - print(rgc.get_genome_alias_digest(alias=args.get[0])) + if args.subcommand == ALIAS_GET_CMD: + for a in args.aliases: + print(rgc.get_genome_alias_digest(alias=a)) return - elif args.set: - with rgc as r: - if len(args.set) == 2: - r.set_genome_alias(genome=args.set[0], digest=args.set[1], - force=True) - elif len(args.set) == 1: - r.set_genome_alias(genome=args.set[0], force=True) - else: - _LOGGER.error( - "You have to specify either an alias-genomeID pair or " - "just an alias to look up the genomeID from a server") + if args.subcommand == ALIAS_SET_CMD: + rgc.set_genome_alias(digest=args.digest, genome=args.aliases) return - elif args.remove: - with rgc as r: - [r[CFG_ALIASES_KEY].__delitem__(r.get_genome_alias_digest(a)) for a in 
args.remove] + elif args.subcommand == ALIAS_REMOVE_CMD: + rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) return else: - aliases = rgc.genome_aliases - print("genomeID".rjust(32) + "\talias".ljust(20) + "\tinitialized".ljust(20) + "\n") - print("\n".join("{}\t{}\t{}".format(k.rjust(32), v.ljust(20), ("*" if k in rgc.genomes else "").ljust(20)) for k, v in aliases.items())) - return + # TODO: display entire alias table here + raise NotImplementedError("Alias table inspection is not implemented yet") elif args.command == COMPARE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) From 7c75bbe09e043121b5049a607b49d6e556e0d2d0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 12 Aug 2020 17:41:52 -0400 Subject: [PATCH 024/110] small alias CLI update --- refgenie/refgenie.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 63b51c23..7f2e2333 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -135,10 +135,10 @@ def add_subparser(cmd, msg, subparsers): alias_sps[ALIAS_SET_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, - nargs="+", help="Aliases to set.") + nargs="+", help="Aliases to set; single if the digest is to be retrieved from the server.") alias_sps[ALIAS_SET_CMD].add_argument( - "-d", "--digest", metavar="D", required=True, type=str, - help="Digest to set.") + "-d", "--digest", metavar="D", required=False, type=str, + help="Digest to set; leave out if the digest is to be retrieved from the server.") alias_sps[ALIAS_REMOVE_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, @@ -152,11 +152,11 @@ def add_subparser(cmd, msg, subparsers): help="Aliases to get the digests for.") sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, - help="First genome for compatibility check") + help="First genome for compatibility check.") 
sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, - help="Second genome for compatibility check") + help="Second genome for compatibility check.") sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", - help="Do not print compatibility code explanation") + help="Do not print compatibility code explanation.") # add 'genome' argument to many commands for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: From b110bc3bf1f36cbee78299dce87e63916cda145c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 13 Aug 2020 15:54:38 -0400 Subject: [PATCH 025/110] add reset option in set alias --- refgenie/refgenie.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 7f2e2333..4864b196 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -139,6 +139,9 @@ def add_subparser(cmd, msg, subparsers): alias_sps[ALIAS_SET_CMD].add_argument( "-d", "--digest", metavar="D", required=False, type=str, help="Digest to set; leave out if the digest is to be retrieved from the server.") + alias_sps[ALIAS_SET_CMD].add_argument( + "-r", "--reset", action="store_true", + help="Whether all the aliases should be removed prior to setting new ones.") alias_sps[ALIAS_REMOVE_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, @@ -859,7 +862,8 @@ def main(): print(rgc.get_genome_alias_digest(alias=a)) return if args.subcommand == ALIAS_SET_CMD: - rgc.set_genome_alias(digest=args.digest, genome=args.aliases) + rgc.set_genome_alias(digest=args.digest, genome=args.aliases, + reset_digest=args.reset) return elif args.subcommand == ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) From c112228e66b0e62f6a2b3f2f4133f831407f1cc4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sat, 15 Aug 2020 13:17:18 -0400 Subject: [PATCH 026/110] config argument 
in alias subcommand --- refgenie/refgenie.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 4864b196..fb17e2cc 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -25,7 +25,6 @@ from ubiquerg.system import is_writable from yacman import UndefinedAliasError from argparse import HelpFormatter -from .refget import fasta_checksum _LOGGER = None @@ -59,6 +58,9 @@ def add_subparser(cmd, msg, subparsers): sps = {} for cmd, desc in SUBPARSER_MESSAGES.items(): sps[cmd] = add_subparser(cmd, desc, subparsers) + # alias is nested and alias subcommands require config path + if cmd == ALIAS_CMD: + continue # It's required for init sps[cmd].add_argument( '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", @@ -132,6 +134,10 @@ def add_subparser(cmd, msg, subparsers): alias_sps = {} for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) + alias_sps[cmd].add_argument( + '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", + help="Path to local genome configuration file. Optional if {} environment variable is set." 
+ .format(", ".join(refgenconf.CFG_ENV_VARS))) alias_sps[ALIAS_SET_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, @@ -151,7 +157,7 @@ def add_subparser(cmd, msg, subparsers): help="Digest to remove.") alias_sps[ALIAS_GET_CMD].add_argument( - "-a", "--aliases", metavar="A", required=True, type=str, nargs="+", + "-a", "--aliases", metavar="A", required=False, type=str, nargs="+", help="Aliases to get the digests for.") sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, From 8479f4f00acbc1567f09d4f1bb77b1d42ea2ba4b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sat, 15 Aug 2020 13:17:50 -0400 Subject: [PATCH 027/110] display aliases and assets table --- refgenie/refgenie.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index fb17e2cc..62953de8 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -4,6 +4,8 @@ from shutil import rmtree from re import sub from requests import ConnectionError +from rich.console import Console + import os import sys import csv @@ -788,12 +790,8 @@ def main(): # Restore original server list, even when we couldn't find assets on a server rgc[CFG_SERVERS_KEY] = server_list else: # Only check local assets once - _LOGGER.info("Server subscriptions: {}".format(", ".join(rgc[CFG_SERVERS_KEY]))) - pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome) - _LOGGER.info("{} genomes: {}".format(pfx, genomes)) - if args.command != LIST_REMOTE_CMD: # Not implemented yet - _LOGGER.info("{} recipes: {}".format(pfx, recipes)) - _LOGGER.info("{} assets:\n{}".format(pfx, assets)) + console = Console() + console.print(rgc.get_local_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) @@ -864,9 +862,13 @@ def main(): elif args.command == ALIAS_CMD: rgc = RefGenConf(filepath=gencfg) 
if args.subcommand == ALIAS_GET_CMD: - for a in args.aliases: - print(rgc.get_genome_alias_digest(alias=a)) - return + if args.aliases is not None: + for a in args.aliases: + print(rgc.get_genome_alias_digest(alias=a)) + return + console = Console() + console.print(rgc.genome_aliases_table) + if args.subcommand == ALIAS_SET_CMD: rgc.set_genome_alias(digest=args.digest, genome=args.aliases, reset_digest=args.reset) @@ -874,9 +876,6 @@ def main(): elif args.subcommand == ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) return - else: - # TODO: display entire alias table here - raise NotImplementedError("Alias table inspection is not implemented yet") elif args.command == COMPARE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) From d42831c427166f246ee94d5be85aee39eb16274b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sat, 15 Aug 2020 13:22:18 -0400 Subject: [PATCH 028/110] simpler asset dir determination in build --- refgenie/refgenie.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 62953de8..8a44a6d1 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -448,18 +448,14 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag) with open(os.path.join(log_outfolder, recipe_file_name), 'w') as outfile: json.dump(build_pkg, outfile) - # in order to prevent locking the config file for writing once while - # being able to use the seek method for digest calculation we - # create a temporary object to run seek on. 
- tmp_rgc = RefGenConf() - tmp_rgc[CFG_FOLDER_KEY] = rgc[CFG_FOLDER_KEY] - tmp_rgc[CFG_ALIASES_KEY] = rgc[CFG_ALIASES_KEY] - tmp_rgc.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key}) - tmp_rgc.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) - digest = get_dir_digest( - _seek(tmp_rgc, genome, asset_key, tag, enclosing_dir=True), pm) + # since the assets are always built to a standard dir structure, we + # can just stitch a path together for asset digest calculation + asset_dir = os.path.join(rgc.data_dir, *gat) + if not os.path.exists(asset_dir): + raise OSError("Could not compute asset digest. Path does not " + "exist: {}".format(asset_dir)) + digest = get_dir_digest(asset_dir) _LOGGER.info("Asset digest: {}".format(digest)) - del tmp_rgc # add updates to config file with rgc as r: r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}) From 2a16fa246df2283a34c42e38c7aa8cabc582522b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Sat, 15 Aug 2020 15:34:16 -0400 Subject: [PATCH 029/110] flip conditional for custom recipe --- refgenie/refgenie.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 8a44a6d1..be5f00ca 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -471,7 +471,8 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar asset_tag = a["tag"] or rgc.get_default_tag(genome, a["asset"], use_existing=False) recipe_name = recipe_name or asset_key - if recipe_name in asset_build_packages.keys() or isinstance(recipe_name, dict): + if isinstance(recipe_name, dict) or \ + (isinstance(recipe_name, str) and recipe_name in asset_build_packages.keys()): if isinstance(recipe_name, dict): _LOGGER.info("Using custom recipe: \n{}".format(recipe_name)) asset_build_package = _check_recipe(recipe_name) From a8571990efd02ea71d3aed551e4eec42421cd56a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk 
Date: Mon, 17 Aug 2020 09:22:39 -0400 Subject: [PATCH 030/110] list and remove updates --- refgenie/refgenie.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index be5f00ca..e9cbc6dc 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -760,8 +760,7 @@ def main(): force_large=force_large, size_cutoff=args.size_cutoff) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False, - genome_exact=args.command == LIST_REMOTE_CMD) + rgc = RefGenConf(filepath=gencfg, writable=False) if args.command == LIST_REMOTE_CMD: num_servers = 0 # Keep all servers so that child updates maintain server list @@ -822,8 +821,7 @@ def main(): return force = True for a in asset_list: - rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], - force=force, aliases=args.aliases) + rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) elif args.command == TAG_CMD: rgc = RefGenConf(filepath=gencfg) From d60e6df61811f12596fa443b1b91d007f43d70b5 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 09:21:17 -0400 Subject: [PATCH 031/110] remove unnecessary reqs --- requirements/requirements-dev.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 6d4e6b87..493054a9 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,4 +1,2 @@ --e git+git://github.com/databio/henge@master#egg=henge --e git+git://github.com/refgenie/seqcol@master#egg=seqcol -e git+git://github.com/databio/yacman@dev#egg=yacman -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From e3efa04f4afab52ae20d62491becdf2211cf720e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 09:45:02 -0400 Subject: [PATCH 032/110] update test action --- .github/workflows/test-refgenie-cli.yml | 20 
++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 796554d5..931e59e2 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -45,7 +45,7 @@ jobs: run: | refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json ./tests/assert_in_file.sh genomes/g.yaml t7 0 - ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 0 # this is a digest that should be produced from this FASTA file + ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b 0 # this is a digest that should be produced from this FASTA file - name: refgenie build fasta_child (child asset) run: | @@ -60,25 +60,17 @@ jobs: run: | refgenie remove -c genomes/g.yaml t7/fasta_child -f ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 - ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d/fasta_child:default 1 # test if the entry was removed from the fasta children list - - - name: refgenie remove fasta, leave digest - run: | - refgenie remove -c genomes/g.yaml t7/fasta -f - ./tests/assert_in_file.sh genomes/g.yaml fasta 1 - ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 0 + ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:default 1 # test if the entry was removed from the fasta children list - name: refgenie build fasta run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json - name: refgenie change alias run: | - refgenie alias -c genomes/g.yaml --remove t7 - refgenie alias -c genomes/g.yaml --set t7_new fde5c225d75637d6b2fd463a37ff875d + refgenie alias remove -c genomes/g.yaml --digest t7 + refgenie alias set -c genomes/g.yaml --aliases t7_new --digest 
162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 - - name: refgenie remove fasta, remove digest + - name: refgenie get aliases run: | - refgenie remove -c genomes/g.yaml t7_new/fasta -f -a - ./tests/assert_in_file.sh genomes/g.yaml fasta 1 - ./tests/assert_in_file.sh genomes/g.yaml fde5c225d75637d6b2fd463a37ff875d 1 \ No newline at end of file + refgenie alias get -c genomes/g.yaml \ No newline at end of file From 20019ac297cb23d3d7466ec43418b7f117cba369 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 15:52:01 -0400 Subject: [PATCH 033/110] more cli tests --- .github/workflows/test-refgenie-cli.yml | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 931e59e2..b11b4a1a 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -51,6 +51,18 @@ jobs: run: | refgenie build -c genomes/g.yaml t7/fasta_child --recipe tests/data/recipe_child.json ./tests/assert_in_file.sh genomes/g.yaml fasta_child 0 + if [ -L `refgenie seek -c genomes/g.yaml t7/fasta_child` ]; then + echo "`refgenie seek -c genomes/g.yaml t7/fasta_child` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7/fasta_child` does not exist." + exit 1 + fi + if [ -d genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default ]; then + echo "'genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default' exists." + else + echo "Error: 'genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default' does not exist." 
+ exit 1 + fi - name: refgenie list working-directory: ./genomes @@ -65,11 +77,38 @@ jobs: - name: refgenie build fasta run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json - - name: refgenie change alias + - name: refgenie set aliases run: | - refgenie alias remove -c genomes/g.yaml --digest t7 - refgenie alias set -c genomes/g.yaml --aliases t7_new --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b + refgenie alias set -c genomes/g.yaml --aliases t7_new t7_new1 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 + ./tests/assert_in_file.sh genomes/g.yaml t7_new1 0 + if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta_child` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new/fasta_child` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta_child` does not exist." + exit 1 + fi + if [ -L `refgenie seek -c genomes/g.yaml t7_new1/fasta_child` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new1/fasta_child` exists." + else + echo "Error: `refgenie seek -c genomes/g.yaml t7_new1/fasta_child` does not exist." + exit 1 + fi + + - name: refgenie remove aliases + run: | + refgenie alias set -c genomes/g.yaml --aliases t7_another --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b + refgenie alias remove -c genomes/g.yaml --aliases t7_new t7_new1 t7 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b + ./tests/assert_in_file.sh genomes/g.yaml t7_new 1 + ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 + ./tests/assert_in_file.sh genomes/g.yaml t7 1 + ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 + if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta_child` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new/fasta_child` exists." + exit 1 + else + echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta_child` does not exist." 
+ fi - name: refgenie get aliases run: | From c0e9c81a4d739a7cc1f99ab35e67ff0733a6ffa3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 16:00:56 -0400 Subject: [PATCH 034/110] update asset name in tests --- .github/workflows/test-refgenie-cli.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index b11b4a1a..eeeff8bd 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -82,16 +82,16 @@ jobs: refgenie alias set -c genomes/g.yaml --aliases t7_new t7_new1 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 ./tests/assert_in_file.sh genomes/g.yaml t7_new1 0 - if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta_child` ]; then - echo "`refgenie seek -c genomes/g.yaml t7_new/fasta_child` exists." + if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new/fasta` exists." else - echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta_child` does not exist." + echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta` does not exist." exit 1 fi - if [ -L `refgenie seek -c genomes/g.yaml t7_new1/fasta_child` ]; then - echo "`refgenie seek -c genomes/g.yaml t7_new1/fasta_child` exists." + if [ -L `refgenie seek -c genomes/g.yaml t7_new1/fasta` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new1/fasta` exists." else - echo "Error: `refgenie seek -c genomes/g.yaml t7_new1/fasta_child` does not exist." + echo "Error: `refgenie seek -c genomes/g.yaml t7_new1/fasta` does not exist." 
exit 1 fi @@ -103,11 +103,11 @@ jobs: ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 ./tests/assert_in_file.sh genomes/g.yaml t7 1 ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 - if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta_child` ]; then - echo "`refgenie seek -c genomes/g.yaml t7_new/fasta_child` exists." + if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then + echo "`refgenie seek -c genomes/g.yaml t7_new/fasta` exists." exit 1 else - echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta_child` does not exist." + echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta` does not exist." fi - name: refgenie get aliases From 46182d54684a4a439dbb2c8c0e9d2d819768618d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 16:04:03 -0400 Subject: [PATCH 035/110] correct test --- .github/workflows/test-refgenie-cli.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index eeeff8bd..943823f8 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -101,7 +101,6 @@ jobs: refgenie alias remove -c genomes/g.yaml --aliases t7_new t7_new1 t7 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b ./tests/assert_in_file.sh genomes/g.yaml t7_new 1 ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 - ./tests/assert_in_file.sh genomes/g.yaml t7 1 ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then echo "`refgenie seek -c genomes/g.yaml t7_new/fasta` exists." 
From 9d8714b3748472c8ceca6219072f84fe67cd5fda Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 16:13:45 -0400 Subject: [PATCH 036/110] correct test --- .github/workflows/test-refgenie-cli.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 943823f8..17bc3bd6 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -102,11 +102,11 @@ jobs: ./tests/assert_in_file.sh genomes/g.yaml t7_new 1 ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 - if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then - echo "`refgenie seek -c genomes/g.yaml t7_new/fasta` exists." - exit 1 + if [ -L genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz ]; then + echo "'genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz' exists." + exit 1 else - echo "Error: `refgenie seek -c genomes/g.yaml t7_new/fasta` does not exist." + echo "Error: 'genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz' does not exist." 
fi - name: refgenie get aliases From 6e75488f7b40d59e5df2ee6ba002dcdec5622a35 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 18 Aug 2020 21:36:53 -0400 Subject: [PATCH 037/110] switch to table format in list remote --- refgenie/refgenie.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index e9cbc6dc..8144ed0f 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -761,33 +761,27 @@ def main(): elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: rgc = RefGenConf(filepath=gencfg, writable=False) + console = Console() if args.command == LIST_REMOTE_CMD: num_servers = 0 - # Keep all servers so that child updates maintain server list - server_list = rgc[CFG_SERVERS_KEY] bad_servers = [] for server_url in rgc[CFG_SERVERS_KEY]: num_servers += 1 try: - rgc[CFG_SERVERS_KEY] = [server_url] - pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome) - if assets is None and genomes is None: - continue - _LOGGER.info("Server URL: {}".format(server_url)) - _LOGGER.info("{} genomes: {}".format(pfx, genomes)) - if args.command != LIST_REMOTE_CMD: # Not implemented yet - _LOGGER.info("{} recipes: {}".format(pfx, recipes)) - _LOGGER.info("{} assets:\n{}".format(pfx, assets)) + table = rgc.get_asset_table( + genomes=args.genome, server_url=server_url) except (DownloadJsonError, ConnectionError): bad_servers.append(server_url) continue - if num_servers >= len(server_list) and bad_servers: - _LOGGER.error("Could not list assets from the following server(s): {}".format(bad_servers)) - # Restore original server list, even when we couldn't find assets on a server - rgc[CFG_SERVERS_KEY] = server_list + else: + console.print(table) + if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: + _LOGGER.error( + "Could not list assets from the following servers: {}". 
+ format(bad_servers) + ) else: # Only check local assets once - console = Console() - console.print(rgc.get_local_asset_table(genomes=args.genome)) + console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: rgc = RefGenConf(filepath=gencfg, writable=False) From 9858eaecb7f1482bf41f33538ad1dfdf8f212e7c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 19 Aug 2020 09:03:34 -0400 Subject: [PATCH 038/110] account for digest in building process --- refgenie/refgenie.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 8144ed0f..c4cf3353 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -505,7 +505,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar g, a, t, s = genome, default["asset"], \ rgc.get_default_tag(genome, default["asset"]), \ req_asset_data["seek_key"] - parent_assets.append("{}/{}:{}".format(rgc.get_genome_alias_digest(g), a, t)) + parent_assets.append("{}/{}:{}".format(rgc.get_genome_alias_digest(g, fallback=True), a, t)) input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s) _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets))) _LOGGER.debug("Provided files: {}".format(specified_args)) @@ -536,7 +536,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar alias=ori_genome) else: try: - genome = rgc.get_genome_alias_digest(genome) + genome = rgc.get_genome_alias_digest(genome, fallback=True) except UndefinedAliasError: _LOGGER.error("Genome '{}' has not been initialized yet; " "no key found for this alias".format(genome)) From e6ca573452f0456467c12d7038c543ef0aedef77 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 19 Aug 2020 13:58:04 -0400 Subject: [PATCH 039/110] reset recipe name for multi-asset build --- refgenie/refgenie.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 
c4cf3353..c3dfd7f3 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -374,6 +374,9 @@ def _read_json_file(filepath): return data if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(".json"): + if len(asset_list) != 1: + raise NotImplementedError( + "Custom recipes are not allowed when build more than asset at a time.") recipe_name = _read_json_file(filepath=recipe_name) if not hasattr(args, "outfolder") or not args.outfolder: @@ -541,6 +544,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar _LOGGER.error("Genome '{}' has not been initialized yet; " "no key found for this alias".format(genome)) return + recipe_name = None genome_outfolder = os.path.join(args.outfolder, genome) if not build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, specified_args, specified_params, **input_assets): From 147db6e323f758e2f00c6a6a2361c7a46a89201b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 19 Aug 2020 14:04:34 -0400 Subject: [PATCH 040/110] remove check for scenarion handled upstream --- refgenie/refgenie.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c3dfd7f3..e63aeddb 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -374,9 +374,6 @@ def _read_json_file(filepath): return data if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(".json"): - if len(asset_list) != 1: - raise NotImplementedError( - "Custom recipes are not allowed when build more than asset at a time.") recipe_name = _read_json_file(filepath=recipe_name) if not hasattr(args, "outfolder") or not args.outfolder: From d6248265cbe4c5d715186fbdf7041242228ea10b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 20 Aug 2020 17:04:15 -0400 Subject: [PATCH 041/110] test add cli --- .github/workflows/test-refgenie-cli.yml | 13 ++++++++++++- refgenie/refgenie.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) 
diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 17bc3bd6..c429e0a8 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -111,4 +111,15 @@ jobs: - name: refgenie get aliases run: | - refgenie alias get -c genomes/g.yaml \ No newline at end of file + refgenie alias get -c genomes/g.yaml + + - name: refgneie add asset + run: | + refgenie add t7/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"genomes": "genomes.yaml"}' + ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 + if [ -L `refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml`]; then + echo "`refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml` exists." + else + echo "Error: `refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml` does not exist." + exit 1 + fi \ No newline at end of file diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index e63aeddb..70bf224e 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -228,7 +228,7 @@ def add_subparser(cmd, msg, subparsers): "-s", "--seek-keys", required=False, type=str, metavar="S", help=""" String representation of a JSON object with seek_keys, - e.g. '{"seek_key1": "file.txt"}') + e.g. 
'{"seek_key1": "file.txt"}' """) sps[GETSEQ_CMD].add_argument( From db2689739b5c7865011b446eb472b679fe78dd7b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 20 Aug 2020 17:08:32 -0400 Subject: [PATCH 042/110] correct alias in test --- .github/workflows/test-refgenie-cli.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index c429e0a8..67c3d122 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -115,11 +115,11 @@ jobs: - name: refgneie add asset run: | - refgenie add t7/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"genomes": "genomes.yaml"}' + refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"genomes": "genomes.yaml"}' ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 - if [ -L `refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml`]; then - echo "`refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml` exists." + if [ -L `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml`]; then + echo "`refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` exists." else - echo "Error: `refgenie seek t7/test_asset.genomes:default -c genomes/g.yaml` does not exist." + echo "Error: `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` does not exist." 
exit 1 fi \ No newline at end of file From 015bc66a14907eb08415a34c0ae25011ff0513dd Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 21 Aug 2020 08:44:39 -0400 Subject: [PATCH 043/110] whitespace in wrkflw --- .github/workflows/test-refgenie-cli.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 67c3d122..830a35b7 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -117,7 +117,7 @@ jobs: run: | refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"genomes": "genomes.yaml"}' ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 - if [ -L `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml`]; then + if [ -L `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` ]; then echo "`refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` exists." else echo "Error: `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` does not exist." 
From ba167d1016a8a4fe7518fee2022de47bc28f238f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 21 Aug 2020 09:07:46 -0400 Subject: [PATCH 044/110] correct seek key --- .github/workflows/test-refgenie-cli.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 830a35b7..e1be0037 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -115,11 +115,11 @@ jobs: - name: refgneie add asset run: | - refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"genomes": "genomes.yaml"}' + refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"recipe": "recipe_parent.json"}' ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 - if [ -L `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` ]; then - echo "`refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` exists." + if [ -L `refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` ]; then + echo "`refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` exists." else - echo "Error: `refgenie seek t7_another/test_asset.genomes:default -c genomes/g.yaml` does not exist." + echo "Error: `refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` does not exist." 
exit 1 fi \ No newline at end of file From 804ee76b61972e0a29ef239e77e99a322c30da5a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 21 Aug 2020 09:39:56 -0400 Subject: [PATCH 045/110] test tag and id --- .github/workflows/test-refgenie-cli.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index e1be0037..4616deef 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -122,4 +122,19 @@ jobs: else echo "Error: `refgenie seek t7_another/test_asset.recipe:default -c genomes/g.yaml` does not exist." exit 1 - fi \ No newline at end of file + fi + + - name: refgenie tag asset + run: | + refgenie tag -c genomes/g.yaml t7_another/fasta:default -t new_tag + ./tests/assert_in_file.sh genomes/g.yaml new_tag 0 + if [ -f `refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` ]; then + echo "`refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` exists." + else + echo "Error: `refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` does not exist." 
+ exit 1 + fi + + - name: refgenie id + run: | + ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta:new_tag` 0 From 9768847d5af1c583b285bdc44e8bf238425700a1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 24 Aug 2020 09:44:12 -0400 Subject: [PATCH 046/110] add cli hook for skipping read lock, add default; closes #194 --- refgenie/refgenie.py | 46 +++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 70bf224e..1c46842c 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -53,7 +53,7 @@ def add_subparser(cmd, msg, subparsers): return subparsers.add_parser( cmd, description=msg, help=msg, formatter_class=lambda prog: HelpFormatter( - prog, max_help_position=37, width=90 + prog, max_help_position=40, width=90 ) ) @@ -68,6 +68,9 @@ def add_subparser(cmd, msg, subparsers): '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", help="Path to local genome configuration file. Optional if {} environment variable is set." .format(", ".join(refgenconf.CFG_ENV_VARS))) + sps[cmd].add_argument( + '--skip-read-lock', required=False, action="store_true", + help="Whether the config file should not be locked for reading") sps[INIT_CMD].add_argument('-s', '--genome-server', nargs='+', default=DEFAULT_SERVER, help="URL(s) to use for the {} attribute in config file. Default: {}." @@ -137,9 +140,12 @@ def add_subparser(cmd, msg, subparsers): for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) alias_sps[cmd].add_argument( - '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", + '-c', '--genome-config', required=False, dest="genome_config", metavar="C", help="Path to local genome configuration file. Optional if {} environment variable is set." 
.format(", ".join(refgenconf.CFG_ENV_VARS))) + alias_sps[cmd].add_argument( + '--skip-read-lock', required=False, action="store_true", + help="Whether the config file should not be locked for reading") alias_sps[ALIAS_SET_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, @@ -177,7 +183,7 @@ def add_subparser(cmd, msg, subparsers): help="Reference assembly ID, e.g. mm10.") for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: - sps[cmd].add_argument("-g", "--genome", required=False, type=str, + sps[cmd].add_argument("-g", "--genome", required=False, type=str, metavar="G", nargs="*", help="Reference assembly ID, e.g. mm10.") for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, TAG_CMD, ID_CMD]: @@ -358,7 +364,7 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): :param str gencfg: path to the genome configuration file :param argparse.Namespace args: parsed command-line options/arguments """ - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) @@ -627,9 +633,13 @@ def main(): raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) + # if config read lock skip was not forced, check if dir is writable and set + # the default to the result + skip_read_lock = is_writable(os.path.dirname(gencfg)) \ + if not args.skip_read_lock else True + # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag - if "asset_registry_paths" in args and args.asset_registry_paths: _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] @@ -681,7 +691,7 @@ def main(): if args.genome_archive_config: 
entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) _LOGGER.debug("initializing with entries: {}".format(entries)) - rgc = RefGenConf(entries=entries) + rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) rgc.initialize_config_file(os.path.abspath(gencfg)) elif args.command == BUILD_CMD: @@ -705,7 +715,7 @@ def main(): refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug("getting asset: '{}/{}.{}:{}'". @@ -715,7 +725,7 @@ def main(): return elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: @@ -727,7 +737,7 @@ def main(): seek_keys=sk, force=args.force) elif args.command == PULL_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) # existing assets overwriting if args.no_overwrite: force = False @@ -761,7 +771,7 @@ def main(): force_large=force_large, size_cutoff=args.size_cutoff) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) console = Console() if args.command == LIST_REMOTE_CMD: num_servers = 0 @@ -785,12 +795,12 @@ def main(): console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif args.command == 
REMOVE_CMD: force = args.force - rgc = RefGenConf(filepath=gencfg) + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) for a in asset_list: a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"], use_existing=False) @@ -819,7 +829,7 @@ def main(): rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) elif args.command == TAG_CMD: - rgc = RefGenConf(filepath=gencfg) + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: @@ -830,7 +840,7 @@ def main(): rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) @@ -842,15 +852,15 @@ def main(): print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: - rgc = RefGenConf(filepath=gencfg) + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) if args.subcommand == ALIAS_GET_CMD: if args.aliases is not None: for a in args.aliases: @@ -868,7 +878,7 @@ def main(): return elif args.command == COMPARE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) res = rgc.compare(args.genome1[0], 
args.genome2[0], explain=not args.no_explanation) if args.no_explanation: From 40a781439aa62253d9378aeb835dca775cbc26f1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 24 Aug 2020 09:49:08 -0400 Subject: [PATCH 047/110] determine skip for build cmd --- refgenie/refgenie.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 1c46842c..e322105a 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -364,6 +364,10 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): :param str gencfg: path to the genome configuration file :param argparse.Namespace args: parsed command-line options/arguments """ + # if config read lock skip was not forced, check if dir is writable and set + # the default to the result + skip_read_lock = is_writable(os.path.dirname(gencfg)) \ + if not args.skip_read_lock else True rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) From c7ff4266dcc9ad97662d6c4fb754622516d0ce9b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 24 Aug 2020 10:10:30 -0400 Subject: [PATCH 048/110] single source of read lock skipping decision --- refgenie/refgenie.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index e322105a..3c071b69 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -364,11 +364,8 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): :param str gencfg: path to the genome configuration file :param argparse.Namespace args: parsed command-line options/arguments """ - # if config read lock skip was not forced, check if dir is writable and set - # the default to the result - skip_read_lock = is_writable(os.path.dirname(gencfg)) \ - if not args.skip_read_lock else True - rgc = RefGenConf(filepath=gencfg, 
writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=_skip_lock(args.skip_read_lock, gencfg)) specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) @@ -637,10 +634,7 @@ def main(): raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) - # if config read lock skip was not forced, check if dir is writable and set - # the default to the result - skip_read_lock = is_writable(os.path.dirname(gencfg)) \ - if not args.skip_read_lock else True + skip_read_lock = _skip_lock(args.skip_read_lock, gencfg) # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag @@ -1073,3 +1067,15 @@ def _seek(rgc, genome_name, asset_name, tag_name=None, seek_key=seek_key, enclosing_dir=enclosing_dir, strict_exists=True) + + +def _skip_lock(skip_arg, cfg): + """ + If config read lock skip was not forced, check if dir is writable and set + the default to the result + + :param bool skip_arg: argument selected on the CLI + :param str cfg: path to the confjg + :return bool: decision -- whether to skip the file lock for read + """ + return is_writable(os.path.dirname(cfg)) if not skip_arg else True From 0d9b6e9e53bfb30bb92368f922b915115d366d8b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 25 Aug 2020 15:38:16 -0400 Subject: [PATCH 049/110] adjust cli to python api; create genome if does not exist --- refgenie/refgenie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 3c071b69..e63a4336 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -869,7 +869,7 @@ def main(): if args.subcommand == ALIAS_SET_CMD: rgc.set_genome_alias(digest=args.digest, genome=args.aliases, - reset_digest=args.reset) + reset_digest=args.reset, create_genome=True) return elif args.subcommand == 
ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) From 29fe0108b88f08e1d52bdfe25d087883e0e8c94a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 28 Aug 2020 13:31:33 -0400 Subject: [PATCH 050/110] write alias after succesfully completing build; #202 --- refgenie/refgenie.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index e63a4336..fcd274a5 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -395,7 +395,7 @@ def _read_json_file(filepath): format(args.config_file)) args.config_file = default_config_file() - def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, **kwargs): + def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, alias, **kwargs): """ Builds assets with pypiper and updates a genome config file. @@ -465,11 +465,11 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar _LOGGER.info("Asset digest: {}".format(digest)) # add updates to config file with rgc as r: - r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}) - r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, - CFG_ASSET_CHECKSUM_KEY: digest}) - r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) - r.set_default_pointer(*gat) + r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome) + r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome) + r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest}, force_digest=genome) + r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}, force_digest=genome) + r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True @@ -540,7 +540,7 @@ def build_asset(genome, asset_key, 
tag, build_pkg, genome_outfolder, specific_ar # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file genome, _ = \ rgc.initialize_genome(fasta_path=specified_args["fasta"], - alias=ori_genome) + alias=ori_genome, skip_alias_write=True) else: try: genome = rgc.get_genome_alias_digest(genome, fallback=True) @@ -550,8 +550,8 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar return recipe_name = None genome_outfolder = os.path.join(args.outfolder, genome) - if not build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, - specified_args, specified_params, **input_assets): + if not _build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, + specified_args, specified_params, ori_genome, **input_assets): log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag, BUILD_STATS_DIR, ORI_LOG_NAME)) _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. " @@ -573,7 +573,7 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'". 
format(genome, asset_key, asset_tag, args.tag_description)) r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description}) - rgc._symlink_alias(genome, asset_key, asset_tag) + rgc._symlink_alias(genome, asset_key, asset_tag) else: _raise_missing_recipe_error(recipe_name) From 8af5900422e6bdabe05132482cf4f1fbf08b8eeb Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 28 Aug 2020 13:35:05 -0400 Subject: [PATCH 051/110] fix for no fasta builds --- refgenie/refgenie.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index fcd274a5..c463b687 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -465,7 +465,8 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a _LOGGER.info("Asset digest: {}".format(digest)) # add updates to config file with rgc as r: - r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome) + if asset_key == "fasta": + r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome) r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome) r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest}, force_digest=genome) r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}, force_digest=genome) @@ -533,8 +534,8 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a else: specified_params.update({required_param[KEY]: required_param[DEFAULT]}) _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, asset_key, asset_tag, recipe_name)) + ori_genome = genome if recipe_name == 'fasta': - ori_genome = genome if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). 
It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag)) # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file From 557e0a37863ac8b7a02e6c5d58b5da6ae8c09371 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 11 Sep 2020 10:19:29 -0400 Subject: [PATCH 052/110] fix docstring --- refgenie/asset_build_packages.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 82981647..f592ca7d 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -6,7 +6,8 @@ # These building recipes should make use of arguments that are auto-populated, # or user-provided. The auto-populated arguments are: # - {genome} -# - {asset_outfolder} In addition to these, the recipe should refer in the +# - {asset_outfolder} +# In addition to these, the recipe should refer in the # same way, {var}, to any variables required to be provided, which will be # provided via the CLI. These should be listed as 'required_inputs' and # will be checked for existence before the commands are executed. 
From 4fe94fadb7b8136d1462dd76d43efeb66dbafaf4 Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Thu, 17 Sep 2020 15:06:17 -0400 Subject: [PATCH 053/110] add upgrade cmd --- .gitignore | 3 + refgenie/const.py | 7 +- refgenie/refgenie.py | 218 ++++++++++++++++++++++++++--------------- refgenie/refgenie.yaml | 2 +- 4 files changed, 149 insertions(+), 81 deletions(-) diff --git a/.gitignore b/.gitignore index 998222cf..c6ff5ded 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,6 @@ refgenie.egg-info/ docs_jupyter/refgenie.yaml docs_jupyter/rCRSd* docs_jupyter/hs38d1* + +# build dir +build/ diff --git a/refgenie/const.py b/refgenie/const.py index be6b4fcc..60db2905 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -19,11 +19,13 @@ UNSUBSCRIBE_CMD = "unsubscribe" ALIAS_CMD = "alias" COMPARE_CMD = "compare" +UPGRADE_CMD = "upgrade" GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] # For each asset we assume a genome is also required -ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, TAG_CMD, ID_CMD] +ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, + BUILD_CMD, INSERT_CMD, TAG_CMD, ID_CMD] SUBPARSER_MESSAGES = { INIT_CMD: "Initialize a genome configuration.", @@ -40,7 +42,8 @@ SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", ALIAS_CMD: "Interact with aliases.", - COMPARE_CMD: "compare two genomes." + COMPARE_CMD: "compare two genomes.", + UPGRADE_CMD: "Upgrade config. This will alter the files on disk." 
} ALIAS_GET_CMD = "get" diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c463b687..474de532 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -72,6 +72,12 @@ def add_subparser(cmd, msg, subparsers): '--skip-read-lock', required=False, action="store_true", help="Whether the config file should not be locked for reading") + # upgrade: upgrade config and alter file structure to the target version + sps[UPGRADE_CMD].add_argument('-v', '--target-version', required=True, metavar="V", + help="Target config version for the upgrade.") + sps[UPGRADE_CMD].add_argument('-f', '--force', action="store_true", + help="Do not prompt before action, approve upfront.") + sps[INIT_CMD].add_argument('-s', '--genome-server', nargs='+', default=DEFAULT_SERVER, help="URL(s) to use for the {} attribute in config file. Default: {}." .format(CFG_SERVERS_KEY, DEFAULT_SERVER)) @@ -142,7 +148,7 @@ def add_subparser(cmd, msg, subparsers): alias_sps[cmd].add_argument( '-c', '--genome-config', required=False, dest="genome_config", metavar="C", help="Path to local genome configuration file. Optional if {} environment variable is set." 
- .format(", ".join(refgenconf.CFG_ENV_VARS))) + .format(", ".join(refgenconf.CFG_ENV_VARS))) alias_sps[cmd].add_argument( '--skip-read-lock', required=False, action="store_true", help="Whether the config file should not be locked for reading") @@ -169,11 +175,11 @@ def add_subparser(cmd, msg, subparsers): help="Aliases to get the digests for.") sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, - help="First genome for compatibility check.") + help="First genome for compatibility check.") sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, - help="Second genome for compatibility check.") + help="Second genome for compatibility check.") sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", - help="Do not print compatibility code explanation.") + help="Do not print compatibility code explanation.") # add 'genome' argument to many commands for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: @@ -207,24 +213,24 @@ def add_subparser(cmd, msg, subparsers): overwrite_group = force_group.add_mutually_exclusive_group() overwrite_group.add_argument("--no-overwrite", action="store_true", - help="Do not overwrite if asset exists.") + help="Do not overwrite if asset exists.") overwrite_group.add_argument("--force-overwrite", action="store_true", - help="Overwrite if asset exists.") + help="Overwrite if asset exists.") large_group = force_group.add_mutually_exclusive_group() large_group.add_argument("--no-large", action="store_true", - help="Do not pull archives over 5GB.") + help="Do not pull archives over 5GB.") large_group.add_argument("--pull-large", action="store_true", - help="Pull any archive, regardless of its size.") + help="Pull any archive, regardless of its size.") force_group.add_argument("--size-cutoff", type=float, default=10, metavar="S", - help="Maximum archive file size to download with no confirmation required (in GB, default: 10)") + 
help="Maximum archive file size to download with no confirmation required (in GB, default: 10)") force_group.add_argument("-b", "--batch", action="store_true", - help="Use batch mode: pull large archives, do no overwrite") + help="Use batch mode: pull large archives, do no overwrite") sps[INSERT_CMD].add_argument( "-p", "--path", required=True, metavar="P", @@ -263,7 +269,7 @@ def add_subparser(cmd, msg, subparsers): sps[cmd].add_argument( "-s", "--genome-server", nargs='+', required=True, help="One or more URLs to {action} the {key} attribute in config file.". - format(action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY)) + format(action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY)) return parser @@ -347,14 +353,16 @@ def refgenie_initg(rgc, genome, content_checksums): """ genome_dir = os.path.join(rgc.data_dir, genome) if is_writable(genome_dir): - output_file = os.path.join(genome_dir, "{}_sequence_digests.tsv".format(genome)) + output_file = os.path.join( + genome_dir, "{}_sequence_digests.tsv".format(genome)) with open(output_file, "w") as contents_file: wr = csv.writer(contents_file, delimiter="\t") for key, val in content_checksums.items(): wr.writerow([key, val]) _LOGGER.debug("sequence digests saved to: {}".format(output_file)) else: - _LOGGER.warning("Could not save the genome sequence digests. '{}' is not writable".format(genome_dir)) + _LOGGER.warning( + "Could not save the genome sequence digests. '{}' is not writable".format(genome_dir)) def refgenie_build(gencfg, genome, asset_list, recipe_name, args): @@ -409,8 +417,10 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a assets. 
""" - log_outfolder = os.path.abspath(os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR)) - _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(genome_outfolder, log_outfolder)) + log_outfolder = os.path.abspath(os.path.join( + genome_outfolder, asset_key, tag, BUILD_STATS_DIR)) + _LOGGER.info( + "Saving outputs to:\n- content: {}\n- logs: {}".format(genome_outfolder, log_outfolder)) if args.docker: # Set up some docker stuff if args.volumes: @@ -424,14 +434,17 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a format(genome_outfolder)) return - pm = pypiper.PipelineManager(name="refgenie", outfolder=log_outfolder, args=args) + pm = pypiper.PipelineManager( + name="refgenie", outfolder=log_outfolder, args=args) tk = pypiper.NGSTk(pm=pm) if args.docker: pm.get_container(build_pkg[CONT], volumes) _LOGGER.debug("Asset build package: " + str(build_pkg)) - gat = [genome, asset_key, tag] # create a bundle list to simplify calls below + # create a bundle list to simplify calls below + gat = [genome, asset_key, tag] # collect variables required to populate the command templates - asset_vars = get_asset_vars(genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs) + asset_vars = get_asset_vars( + genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs) # populate command templates # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method command_list_populated = [x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) @@ -439,10 +452,12 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a # create output directory tk.make_dir(asset_vars["asset_outfolder"]) - target = os.path.join(log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)) + target = os.path.join( + log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)) # add target command 
command_list_populated.append("touch {target}".format(target=target)) - _LOGGER.debug("Command populated: '{}'".format(" ".join(command_list_populated))) + _LOGGER.debug("Command populated: '{}'".format( + " ".join(command_list_populated))) try: # run build command signal.signal(signal.SIGINT, _handle_sigint(gat)) @@ -466,17 +481,22 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a # add updates to config file with rgc as r: if asset_key == "fasta": - r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome) - r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome) - r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest}, force_digest=genome) - r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}, force_digest=genome) + r.update_genomes(genome, data={CFG_ALIASES_KEY: [ + alias]}, force_digest=genome) + r.update_assets( + *gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome) + r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, + CFG_ASSET_CHECKSUM_KEY: digest}, force_digest=genome) + r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) + for k, v in build_pkg[ASSETS].items()}, force_digest=genome) r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True for a in asset_list: asset_key = a["asset"] - asset_tag = a["tag"] or rgc.get_default_tag(genome, a["asset"], use_existing=False) + asset_tag = a["tag"] or rgc.get_default_tag( + genome, a["asset"], use_existing=False) recipe_name = recipe_name or asset_key if isinstance(recipe_name, dict) or \ @@ -486,7 +506,8 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a asset_build_package = _check_recipe(recipe_name) recipe_name = asset_build_package["name"] else: - asset_build_package = _check_recipe(asset_build_packages[recipe_name]) + asset_build_package = _check_recipe( 
+ asset_build_packages[recipe_name]) # handle user-requested parents for the required assets input_assets = {} parent_assets = [] @@ -494,26 +515,31 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a if args.assets is not None: parsed_parents_input = _parse_user_build_input(args.assets) specified_asset_keys, specified_assets = \ - list(parsed_parents_input.keys()), list(parsed_parents_input.values()) - _LOGGER.debug("Custom assets requested: {}".format(args.assets)) + list(parsed_parents_input.keys()), list( + parsed_parents_input.values()) + _LOGGER.debug( + "Custom assets requested: {}".format(args.assets)) if not specified_asset_keys and isinstance(args.assets, list): - _LOGGER.warning("Specified parent assets format is invalid. Using defaults.") + _LOGGER.warning( + "Specified parent assets format is invalid. Using defaults.") for req_asset in asset_build_package[REQ_ASSETS]: req_asset_data = parse_registry_path(req_asset[KEY]) # for each req asset see if non-default parents were requested if specified_asset_keys is not None and req_asset_data["asset"] in specified_asset_keys: parent_data = \ - parse_registry_path(specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) + parse_registry_path( + specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) g, a, t, s = parent_data["genome"], \ - parent_data["asset"], \ - parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \ - parent_data["seek_key"] + parent_data["asset"], \ + parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \ + parent_data["seek_key"] else: # if no custom parents requested for the req asset, use default one default = parse_registry_path(req_asset[DEFAULT]) g, a, t, s = genome, default["asset"], \ - rgc.get_default_tag(genome, default["asset"]), \ - req_asset_data["seek_key"] - parent_assets.append("{}/{}:{}".format(rgc.get_genome_alias_digest(g, fallback=True), a, t)) + 
rgc.get_default_tag(genome, default["asset"]), \ + req_asset_data["seek_key"] + parent_assets.append( + "{}/{}:{}".format(rgc.get_genome_alias_digest(g, fallback=True), a, t)) input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s) _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets))) _LOGGER.debug("Provided files: {}".format(specified_args)) @@ -532,12 +558,15 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a "Specify it with: --params {x}=value" .format(x=required_param[KEY], desc=required_param[DESC])) else: - specified_params.update({required_param[KEY]: required_param[DEFAULT]}) - _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, asset_key, asset_tag, recipe_name)) + specified_params.update( + {required_param[KEY]: required_param[DEFAULT]}) + _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, + asset_key, asset_tag, recipe_name)) ori_genome = genome if recipe_name == 'fasta': if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): - _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag)) + _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). 
It will be re-initialized.".format( + g=genome, a=asset_key, t=asset_tag)) # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file genome, _ = \ rgc.initialize_genome(fasta_path=specified_args["fasta"], @@ -552,7 +581,7 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a recipe_name = None genome_outfolder = os.path.join(args.outfolder, genome) if not _build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, - specified_args, specified_params, ori_genome, **input_assets): + specified_args, specified_params, ori_genome, **input_assets): log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag, BUILD_STATS_DIR, ORI_LOG_NAME)) _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. " @@ -561,19 +590,23 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a _LOGGER.info("Finished building '{}' asset".format(asset_key)) with rgc as r: # update asset relationships - r.update_relatives_assets(genome, asset_key, asset_tag, parent_assets) # adds parents + r.update_relatives_assets( + genome, asset_key, asset_tag, parent_assets) # adds parents for i in parent_assets: parsed_parent = parse_registry_path(i) # adds child (currently built asset) to the parent r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], - parsed_parent["tag"], ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) + parsed_parent["tag"], ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) if args.genome_description is not None: - _LOGGER.debug("adding genome ({}) description: '{}'".format(genome, args.genome_description)) - r.update_genomes(genome, {CFG_GENOME_DESC_KEY: args.genome_description}) + _LOGGER.debug("adding genome ({}) description: '{}'".format( + genome, args.genome_description)) + r.update_genomes( + genome, {CFG_GENOME_DESC_KEY: args.genome_description}) if 
args.tag_description is not None: _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'". format(genome, asset_key, asset_tag, args.tag_description)) - r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description}) + r.update_tags(genome, asset_key, asset_tag, { + CFG_TAG_DESC_KEY: args.tag_description}) rgc._symlink_alias(genome, asset_key, asset_tag) else: _raise_missing_recipe_error(recipe_name) @@ -610,7 +643,8 @@ def perm_check_x(file_to_check, message_tag="genome directory"): _LOGGER.error(msg) raise ValueError(msg) if not os.access(file_to_check, os.X_OK): - _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) + _LOGGER.error( + "Insufficient permissions to write to {}: ".format(file_to_check)) return False return True @@ -640,8 +674,10 @@ def main(): # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag if "asset_registry_paths" in args and args.asset_registry_paths: - _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) - asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] + _LOGGER.debug("Found registry_path: {}".format( + args.asset_registry_paths)) + asset_list = [parse_registry_path(x) + for x in args.asset_registry_paths] for a in asset_list: # every asset must have a genome, either provided via registry path @@ -655,7 +691,8 @@ def main(): sys.exit(1) else: if args.genome and args.genome != a["genome"]: - _LOGGER.warn("Two different genomes specified for asset '{}'.".format(a["asset"])) + _LOGGER.warn( + "Two different genomes specified for asset '{}'.".format(a["asset"])) else: if args.command in GENOME_ONLY_REQUIRED and not args.genome: @@ -680,7 +717,7 @@ def main(): else: raise FileNotFoundError( "JSON file with config init settings does not exist: {}". 
- format(args.settings_json)) + format(args.settings_json)) if args.genome_folder: entries.update({CFG_FOLDER_KEY: args.genome_folder}) if args.remote_url_base: @@ -688,7 +725,8 @@ def main(): if args.genome_archive_folder: entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) if args.genome_archive_config: - entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) + entries.update( + {CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) _LOGGER.debug("initializing with entries: {}".format(entries)) rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) rgc.initialize_config_file(os.path.abspath(gencfg)) @@ -700,7 +738,8 @@ def main(): recipe_name = None if args.recipe: if len(asset_list) > 1: - _LOGGER.error("Recipes cannot be specified for multi-asset builds") + _LOGGER.error( + "Recipes cannot be specified for multi-asset builds") sys.exit(1) recipe_name = args.recipe if args.requirements: @@ -711,10 +750,12 @@ def main(): _LOGGER.info("'{}' recipe requirements: ".format(recipe)) _make_asset_build_reqs(recipe) sys.exit(0) - refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) + refgenie_build( + gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug("getting asset: '{}/{}.{}:{}'". 
@@ -724,7 +765,8 @@ def main(): return elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: @@ -736,7 +778,8 @@ def main(): seek_keys=sk, force=args.force) elif args.command == PULL_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) # existing assets overwriting if args.no_overwrite: force = False @@ -762,7 +805,8 @@ def main(): if not perm_check_x(outdir): return if not _single_folder_writeable(outdir): - _LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) + _LOGGER.error( + "Insufficient permissions to write to: {}".format(outdir)) return for a in asset_list: @@ -770,7 +814,8 @@ def main(): force_large=force_large, size_cutoff=args.size_cutoff) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) console = Console() if args.command == LIST_REMOTE_CMD: num_servers = 0 @@ -788,13 +833,14 @@ def main(): if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: _LOGGER.error( "Could not list assets from the following servers: {}". 
- format(bad_servers) + format(bad_servers) ) else: # Only check local assets once console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif args.command == REMOVE_CMD: @@ -805,7 +851,8 @@ def main(): use_existing=False) _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) if a["seek_key"] is not None: - raise NotImplementedError("You can't remove a specific seek_key.") + raise NotImplementedError( + "You can't remove a specific seek_key.") gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} try: if not rgc.is_asset_complete(**gat): @@ -820,12 +867,13 @@ def main(): return if len(asset_list) > 1: if not query_yes_no("Are you sure you want to remove {} assets?". - format(len(asset_list))): + format(len(asset_list))): _LOGGER.info("Action aborted by the user") return force = True for a in asset_list: - rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) + rgc.remove(genome=a["genome"], asset=a["asset"], + tag=a["tag"], force=force) elif args.command == TAG_CMD: rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) @@ -839,7 +887,8 @@ def main(): rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) @@ -851,11 +900,13 @@ def main(): print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = 
RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: @@ -877,12 +928,17 @@ def main(): return elif args.command == COMPARE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) res = rgc.compare(args.genome1[0], args.genome2[0], explain=not args.no_explanation) if args.no_explanation: print(res) + elif args.command == UPGRADE_CMD: + RefGenConf.config_upgrade(target_version=args.target_version, + filepath=gencfg, force=args.force) + def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): """ @@ -954,11 +1010,14 @@ def _format_reqs(req_list): reqs_list = [] if asset_build_packages[asset][REQ_FILES]: - reqs_list.append("- files:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])))) + reqs_list.append( + "- files:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])))) if asset_build_packages[asset][REQ_ASSETS]: - reqs_list.append("- assets:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])))) + reqs_list.append( + "- assets:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])))) if asset_build_packages[asset][REQ_PARAMS]: - reqs_list.append("- params:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])))) + reqs_list.append( + "- params:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])))) _LOGGER.info("\n".join(reqs_list)) @@ -982,7 +1041,8 @@ def get_dir_digest(path, pm=None): from subprocess import check_output 
x = check_output(cmd.format(path), shell=True).decode("utf-8") except Exception as e: - _LOGGER.warning("{}: could not calculate digest for '{}'".format(e.__class__.__name__, path)) + _LOGGER.warning("{}: could not calculate digest for '{}'".format( + e.__class__.__name__, path)) return return str(sub(r'\W+', '', x)) # strips non-alphanumeric @@ -1036,10 +1096,12 @@ def _check_recipe(recipe): # experimental feature; recipe jsonschema validation from jsonschema import validate from yacman import load_yaml - SCHEMA_SRC = os.path.join(os.path.dirname(os.path.abspath(__file__)), "schemas", "recipe_schema.yaml") + SCHEMA_SRC = os.path.join(os.path.dirname( + os.path.abspath(__file__)), "schemas", "recipe_schema.yaml") if os.path.exists(SCHEMA_SRC): validate(recipe, load_yaml(filepath=SCHEMA_SRC)) - _LOGGER.info("Recipe validated successfully against a schema: {}".format(SCHEMA_SRC)) + _LOGGER.info( + "Recipe validated successfully against a schema: {}".format(SCHEMA_SRC)) else: _LOGGER.warning("Recipe schema not found: {}".format(SCHEMA_SRC)) # end of validation @@ -1063,11 +1125,11 @@ def _seek(rgc, genome_name, asset_name, tag_name=None, check in seek. 
This function makes it easier """ return rgc.seek_src(genome_name=genome_name, - asset_name=asset_name, - tag_name=tag_name, - seek_key=seek_key, - enclosing_dir=enclosing_dir, - strict_exists=True) + asset_name=asset_name, + tag_name=tag_name, + seek_key=seek_key, + enclosing_dir=enclosing_dir, + strict_exists=True) def _skip_lock(skip_arg, cfg): diff --git a/refgenie/refgenie.yaml b/refgenie/refgenie.yaml index 2e9ad975..d36f5a04 100644 --- a/refgenie/refgenie.yaml +++ b/refgenie/refgenie.yaml @@ -25,4 +25,4 @@ param: epilog: context: "cg" tallymer: - minocc: 2 \ No newline at end of file + minocc: 2 From 28bfab6762b6ca8019012bbf6adea35bad472952 Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Thu, 17 Sep 2020 19:16:51 -0400 Subject: [PATCH 054/110] show refgenconf version with refgenie --version resolve #206 --- refgenie/refgenie.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 474de532..dad7dcf9 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -22,6 +22,7 @@ import refgenconf from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ MissingRecipeError, DownloadJsonError, get_dir_digest +from refgenconf import __version__ as __refgenconf_version__ from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ VersionInHelpParser, is_command_callable from ubiquerg.system import is_writable @@ -43,7 +44,7 @@ def build_argparser(): parser = VersionInHelpParser( prog="refgenie", - version=__version__, + version=__version__ + " | refgenconf " + __refgenconf_version__, description=banner, epilog=additional_description) From bf5d55239fe4f526daf2713b43f9d075e7875655 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 18 Sep 2020 11:25:21 -0400 Subject: [PATCH 055/110] specify refgenonf dev req in main requirements;#207 --- requirements/requirements-all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 1a903985..e975f09d 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,4 @@ logmuse>=0.2.6 -#refgenconf>=0.10.0-dev +refgenconf>=0.10.0-dev piper>=0.12.1 pyfaidx>=0.5.5.2 \ No newline at end of file From a6cb59f1f5f84fa58525b22e9fbd8c82e6ab125c Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 18 Sep 2020 13:49:08 -0400 Subject: [PATCH 056/110] update epilog recipe to scala version --- refgenie/asset_build_packages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 82981647..983c27de 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -389,7 +389,9 @@ "epilog_index": "." }, CMD_LST: [ - "epilog index -i {fasta} -o {asset_outfolder}/{genome}_{context}.tsv --context {context} -t" + "epilog index -- --infile {fasta} --outfile {asset_outfolder}/{genome}_{context}.tsv --contexts {context}", + "bgzip {asset_outfolder}/{genome}_{context}.tsv", + "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_{context}.tsv.gz", ] }, "star_index": { From 7c1d861733074a5225329f251b45d0a99dc35093 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 18 Sep 2020 14:42:35 -0400 Subject: [PATCH 057/110] epilog recipe pointer --- refgenie/asset_build_packages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 983c27de..97a5da08 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -386,7 +386,7 @@ ], CONT: "databio/refgenie", ASSETS: { - "epilog_index": "." 
+ "epilog_index": "{genome}_{context}.tsv.gz" }, CMD_LST: [ "epilog index -- --infile {fasta} --outfile {asset_outfolder}/{genome}_{context}.tsv --contexts {context}", From f239dff283cb06378d246f52e31a6be8693965e0 Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Mon, 21 Sep 2020 23:24:11 -0400 Subject: [PATCH 058/110] add RefGenConf_old object for v0.3 config --- refgenie/refgenie.py | 82 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 18 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index dad7dcf9..70e1a7e7 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -20,13 +20,13 @@ import logmuse import pypiper import refgenconf -from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ +from refgenconf import RefGenConf, RefGenConf_old, MissingAssetError, MissingGenomeError, \ MissingRecipeError, DownloadJsonError, get_dir_digest from refgenconf import __version__ as __refgenconf_version__ from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ VersionInHelpParser, is_command_callable from ubiquerg.system import is_writable -from yacman import UndefinedAliasError +from yacman import UndefinedAliasError, YacAttMap from argparse import HelpFormatter _LOGGER = None @@ -755,8 +755,13 @@ def main(): gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug("getting asset: '{}/{}.{}:{}'". 
@@ -766,8 +771,14 @@ def main(): return elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: @@ -781,6 +792,7 @@ def main(): elif args.command == PULL_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + # existing assets overwriting if args.no_overwrite: force = False @@ -840,13 +852,24 @@ def main(): console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif args.command == REMOVE_CMD: force = args.force - rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, + skip_read_lock=skip_read_lock) for a in asset_list: a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"], use_existing=False) @@ -877,7 +900,13 @@ def main(): tag=a["tag"], force=force) elif args.command == TAG_CMD: - rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if 
config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, + skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: @@ -888,8 +917,13 @@ def main(): rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) @@ -901,13 +935,23 @@ def main(): print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] + if config_version < REQ_CFG_VERSION: + rgc = RefGenConf_old(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) + else: + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: @@ -937,8 +981,10 @@ def main(): print(res) 
elif args.command == UPGRADE_CMD: - RefGenConf.config_upgrade(target_version=args.target_version, - filepath=gencfg, force=args.force) + rgc = RefGenConf_old(filepath=gencfg, + skip_read_lock=skip_read_lock) + RefGenConf_old.config_upgrade(target_version=args.target_version, + filepath=gencfg, force=args.force) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): From f7883c99282c4a2d24ac38fe881d007622b5c4fa Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Tue, 22 Sep 2020 02:28:50 -0400 Subject: [PATCH 059/110] RefGenConf.upgrade writable --- refgenie/refgenie.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 70e1a7e7..7aff4029 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -981,10 +981,10 @@ def main(): print(res) elif args.command == UPGRADE_CMD: - rgc = RefGenConf_old(filepath=gencfg, + rgc = RefGenConf_old(filepath=gencfg, writable=True, skip_read_lock=skip_read_lock) - RefGenConf_old.config_upgrade(target_version=args.target_version, - filepath=gencfg, force=args.force) + rgc.config_upgrade( + target_version=args.target_version, force=args.force) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): From 0d71a05069e966fc68a5b6479bcf626065ff703d Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Tue, 22 Sep 2020 13:07:30 -0400 Subject: [PATCH 060/110] change config_upgrade to an standalone function --- refgenie/refgenie.py | 82 +++++++++++--------------------------------- 1 file changed, 20 insertions(+), 62 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 7aff4029..0d87b8b0 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -20,8 +20,8 @@ import logmuse import pypiper import refgenconf -from refgenconf import RefGenConf, RefGenConf_old, MissingAssetError, MissingGenomeError, \ - MissingRecipeError, DownloadJsonError, get_dir_digest +from refgenconf import 
RefGenConf, MissingAssetError, MissingGenomeError, \ + MissingRecipeError, DownloadJsonError, get_dir_digest, config_upgrade from refgenconf import __version__ as __refgenconf_version__ from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ VersionInHelpParser, is_command_callable @@ -755,13 +755,8 @@ def main(): gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: _LOGGER.debug("getting asset: '{}/{}.{}:{}'". @@ -771,13 +766,8 @@ def main(): return elif args.command == INSERT_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") @@ -852,24 +842,14 @@ def main(): console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif 
args.command == REMOVE_CMD: force = args.force - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, + skip_read_lock=skip_read_lock) for a in asset_list: a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"], use_existing=False) @@ -900,13 +880,8 @@ def main(): tag=a["tag"], force=force) elif args.command == TAG_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, + skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: @@ -917,13 +892,8 @@ def main(): rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) elif args.command == ID_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) @@ -935,23 +905,13 @@ def main(): print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - 
skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: - config_version = YacAttMap(filepath=gencfg)[CFG_VERSION_KEY] - if config_version < REQ_CFG_VERSION: - rgc = RefGenConf_old(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - else: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, + skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: @@ -981,10 +941,8 @@ def main(): print(res) elif args.command == UPGRADE_CMD: - rgc = RefGenConf_old(filepath=gencfg, writable=True, - skip_read_lock=skip_read_lock) - rgc.config_upgrade( - target_version=args.target_version, force=args.force) + config_upgrade(target_version=args.target_version, + filepath=gencfg, force=args.force) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): From 9e776a064e79ba31a36f46439ad50d3487b319e5 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 1 Oct 2020 09:20:41 -0400 Subject: [PATCH 061/110] add upgrade docs page; https://github.com/refgenie/refgenconf/pull/111#issuecomment-702111542 --- docs/upgrade-config.md | 41 +++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 42 insertions(+) create mode 100644 docs/upgrade-config.md diff --git a/docs/upgrade-config.md b/docs/upgrade-config.md new file mode 100644 index 00000000..cae175fe --- /dev/null +++ b/docs/upgrade-config.md @@ -0,0 +1,41 @@ +# Refgenie configuration file upgrades + +Refgenie is under active development and new features are added regularly. This sometimes necessitates changes of refgenie configuration file format and/or asset directory structure. 
+ +Starting with the refgenie transition 0.9.3 -> 0.10.0 (configuration file versions: 0.3 -> 0.4), we introduce the `refgenie upgrade` functionality, which will take care of all the required reformatting. Upon running the command, refgenie will automatically detect the current configuration file version and both reformat the file and make any necessary changes to the asset directory structure. + +Below we describe the changes introduced in each configuration file version and list the commands that need to be run to upgrade: + +## v0.4 + +**introduced: refgenie 0.10.0** + +### Config format changes + +- use sequence-derived unique genome identifiers instead of genome names everywhere +- add an `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily + +### File tree structure changes + +- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name +- move all the contents from the refgenie directory to a new `data` directory +- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. 
The contents of the child directories are symbolic links to the asset files in the `data` directory + +### Upgrade + +To reformat the config run from the command line: + +``` +refgenie upgrade --target-version 0.4 -c /path/to/old/cfg.yml +``` + +Or from within Python: + +```python +from refgenconf import upgrade_config +upgrade_config(target_version="0.4", filepath="/path/to/old/cfg.yml") +``` + + + + diff --git a/mkdocs.yml b/mkdocs.yml index 0a68732c..073c3d52 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,6 +21,7 @@ nav: - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md - Use refgenie with iGenomes: igenomes.md + - Upgrade config: upgrade-config.md - Reference: - Genome configuration file: genome_config.md - Glossary: glossary.md From 3ed4bfe2fd2252d1936b8eb824bbf26f4f42da4c Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Thu, 1 Oct 2020 09:54:30 -0400 Subject: [PATCH 062/110] change funct name to upgrade_config --- refgenie/refgenie.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 0d87b8b0..58dd0c5d 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -21,7 +21,7 @@ import pypiper import refgenconf from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ - MissingRecipeError, DownloadJsonError, get_dir_digest, config_upgrade + MissingRecipeError, DownloadJsonError, get_dir_digest, upgrade_config from refgenconf import __version__ as __refgenconf_version__ from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ VersionInHelpParser, is_command_callable @@ -941,7 +941,7 @@ def main(): print(res) elif args.command == UPGRADE_CMD: - config_upgrade(target_version=args.target_version, + upgrade_config(target_version=args.target_version, filepath=gencfg, force=args.force) From 7bfc90a5e6ef46137d74a7a44caa4ecb7c797142 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 12 Oct 2020 17:40:54 -0400 
Subject: [PATCH 063/110] add 03 to 04 cfg upgrade tutorial --- docs_jupyter/config_upgrade_03_to_04.ipynb | 629 +++++++++++++++++++++ 1 file changed, 629 insertions(+) create mode 100644 docs_jupyter/config_upgrade_03_to_04.ipynb diff --git a/docs_jupyter/config_upgrade_03_to_04.ipynb b/docs_jupyter/config_upgrade_03_to_04.ipynb new file mode 100644 index 00000000..b0b9abab --- /dev/null +++ b/docs_jupyter/config_upgrade_03_to_04.ipynb @@ -0,0 +1,629 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Configuration file upgrade demonstration \n", + "\n", + "In the following tutorial we will present the process of upgrading the refgenie configuration file and asset files from version **0.3** to version **0.4**.\n", + "\n", + "First, let's install the refgenie and refgenconf Python packages that support version 0.3 of refgenie configuration file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working environment setup\n", + "\n", + "Let's install the legacy refgenconf and refgenie Python packages" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting refgenconf==0.9.3\n", + " Using cached https://files.pythonhosted.org/packages/52/c3/6aed361205272e30cd3570ca1c33feae6ad977ad32ddff8e509752046272/refgenconf-0.9.3-py3-none-any.whl\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.9.3) (2.21.0)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.12.12.dev0)\n", + "Requirement already satisfied: tqdm>=4.38.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (4.47.0)\n", + "Requirement already satisfied: pyyaml in 
/Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.9.3) (5.1)\n", + "Requirement already satisfied: yacman>=0.6.9 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.7.0)\n", + "Requirement already satisfied: pyfaidx in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.9.3) (0.5.9.1)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (1.24.1)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (2019.3.9)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.9.3) (3.0.4)\n", + "Requirement already satisfied: ubiquerg>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from attmap>=0.12.5->refgenconf==0.9.3) (0.6.1)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.6.9->refgenconf==0.9.3) (0.9)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx->refgenconf==0.9.3) (41.0.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx->refgenconf==0.9.3) (1.12.0)\n", + "Installing collected packages: refgenconf\n", + " Found existing installation: refgenconf 0.10.0.dev0\n", + " Uninstalling refgenconf-0.10.0.dev0:\n", + " Successfully uninstalled refgenconf-0.10.0.dev0\n", + 
"Successfully installed refgenconf-0.9.3\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Collecting refgenie==0.9.3\n", + " Using cached https://files.pythonhosted.org/packages/af/52/c1e1bc63b3543f591ebdf44caccfaab3c730708256d926b9f4b1c34d1865/refgenie-0.9.3-py3-none-any.whl\n", + "Requirement already satisfied: pyfaidx>=0.5.5.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.5.9.1)\n", + "Requirement already satisfied: refgenconf>=0.9.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.9.3)\n", + "Requirement already satisfied: logmuse>=0.2.6 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.2.6)\n", + "Requirement already satisfied: piper>=0.12.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.9.3) (0.12.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.9.3) (1.12.0)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.9.3) (41.0.1)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (0.12.12.dev0)\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (2.21.0)\n", + "Requirement already satisfied: tqdm>=4.38.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (4.47.0)\n", + "Requirement already 
satisfied: pyyaml in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (5.1)\n", + "Requirement already satisfied: yacman>=0.6.9 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf>=0.9.1->refgenie==0.9.3) (0.7.0)\n", + "Requirement already satisfied: ubiquerg>=0.4.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.9.3) (0.6.1)\n", + "Requirement already satisfied: psutil in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from piper>=0.12.1->refgenie==0.9.3) (5.6.1)\n", + "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.9.3) (1.0.3)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (1.24.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (2019.3.9)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf>=0.9.1->refgenie==0.9.3) (2.8)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.6.9->refgenconf>=0.9.1->refgenie==0.9.3) (0.9)\n", + "Requirement already satisfied: pytz>=2017.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (2018.9)\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in 
/Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (2.8.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.9.3) (1.17.3)\n", + "Installing collected packages: refgenie\n", + " Found existing installation: refgenie 0.10.0.dev0\n", + " Uninstalling refgenie-0.10.0.dev0:\n", + " Successfully uninstalled refgenie-0.10.0.dev0\n", + "Successfully installed refgenie-0.9.3\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "pip install refgenconf==0.9.3\n", + "pip install refgenie==0.9.3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's set up a directory that we will use for the config file and refgenie assets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "export WORKDIR=~/Desktop/testing/refgenie/upgrade_test\n", + "rm -r $WORKDIR # remove first just to make sure the directory does not exist\n", + "mkdir -p $WORKDIR\n", + "cd $WORKDIR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's set `$REFGENIE` environment variable to point refgenie to the configuration file location and initialize it" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized genome configuration file: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/g.yml\n" + ] + } + ], + "source": [ + "export REFGENIE=$WORKDIR/g.yml\n", + "refgenie init -c $REFGENIE -s http://rg.databio.org:82/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we subscribe to a test instance of refgenieserver, 
that supports both the old and new refgenie clients. This is because it exposes different API versions that these clients use: `v2` (refgenie v0.9.3) and `v3` (refgenie v0.10.0-dev)\n", + "\n", + "## Pull/build test assets\n", + "\n", + "Next, let's retrieve a couple of assets. As mentioned above, the `v2` API is used to retrieve them." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading URL: http://rg.databio.org:82/v2/asset/rCRSd/fasta/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/fasta__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/fasta/default\n", + "Default tag for 'rCRSd/fasta' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/human_repeats/fasta/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/fasta__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/fasta/default\n", + "Default tag for 'human_repeats/fasta' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/rCRSd/bowtie2_index/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/bowtie2_index__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/rCRSd/bowtie2_index/default\n", + "Default tag for 'rCRSd/bowtie2_index' set to: default\n", + "Downloading URL: http://rg.databio.org:82/v2/asset/human_repeats/bwa_index/archive\n", + "Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/bwa_index__default.tgz\n", + "Extracting asset tarball and saving to: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_repeats/bwa_index/default\n", + 
"Default tag for 'human_repeats/bwa_index' set to: default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/fasta human_repeats/fasta rCRSd/bowtie2_index human_repeats/bwa_index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's download a small FASTA file and build a fasta asset for an arbitrary genome, which is not available at `http://rg.databio.org:82/`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-10-12 17:39:25-- http://big.databio.org/refgenie_raw/files.human_alu.fasta.fasta\n", + "Resolving big.databio.org (big.databio.org)... 128.143.245.182, 128.143.245.181\n", + "Connecting to big.databio.org (big.databio.org)|128.143.245.182|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 501 [application/octet-stream]\n", + "Saving to: ‘human_alu.fa.gz’\n", + "\n", + "human_alu.fa.gz 100%[===================>] 501 --.-KB/s in 0s \n", + "\n", + "2020-10-12 17:39:25 (1.19 MB/s) - ‘human_alu.fa.gz’ saved [501/501]\n", + "\n" + ] + } + ], + "source": [ + "wget -O human_alu.fa.gz http://big.databio.org/refgenie_raw/files.human_alu.fasta.fasta\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using 'default' as the default tag for 'human_alu/fasta'\n", + "Building 'human_alu/fasta:default' using 'fasta' recipe\n", + "Saving outputs to:\n", + "- content: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu\n", + "- logs: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build\n", + "### Pipeline run code and environment:\n", + "\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build human_alu/fasta --files fasta=human_alu.fa.gz`\n", + "* Compute host: MichalsMBP\n", + "* Working 
dir: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test\n", + "* Outfolder: /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/\n", + "* Pipeline started at: (10-12 17:39:27) elapsed: 0.0 _TIME_\n", + "\n", + "### Version log:\n", + "\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", + "* Pypiper version: 0.12.1\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", + "* Pipeline version: None\n", + "\n", + "### Arguments passed to pipeline:\n", + "\n", + "* `asset_registry_paths`: `['human_alu/fasta']`\n", + "* `assets`: `None`\n", + "* `command`: `build`\n", + "* `config_file`: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/refgenie.yaml`\n", + "* `docker`: `False`\n", + "* `files`: `[['fasta=human_alu.fa.gz']]`\n", + "* `genome`: `None`\n", + "* `genome_config`: `None`\n", + "* `genome_description`: `None`\n", + "* `logdev`: `False`\n", + "* `new_start`: `False`\n", + "* `outfolder`: `/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test`\n", + "* `params`: `None`\n", + "* `recipe`: `None`\n", + "* `recover`: `False`\n", + "* `requirements`: `False`\n", + "* `silent`: `False`\n", + "* `tag_description`: `None`\n", + "* `verbosity`: `None`\n", + "* `volumes`: `None`\n", + "\n", + "----------------------------------------\n", + "\n", + "Target to produce: `/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/human_alu_fasta__default.flag` \n", + "\n", + "> `cp human_alu.fa.gz /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.gz` (70063)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70063)\n",
+      "Warning: couldn't add memory use for process: 70063\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70063;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.001GB\n", + "\n", + "\n", + "> `gzip -df /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.gz` (70064)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70064)\n",
+      "Warning: couldn't add memory use for process: 70064\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70064;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", + "\n", + "\n", + "> `samtools faidx /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa` (70065)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70065)\n",
+      "Warning: couldn't add memory use for process: 70065\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70065;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.0GB\n", + "\n", + "\n", + "> `cut -f 1,2 /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.fa.fai > /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/human_alu.chrom.sizes` (70066)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70066)\n",
+      "Warning: couldn't add memory use for process: 70066\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70066;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", + "\n", + "> `touch /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default/_refgenie_build/human_alu_fasta__default.flag` (70068)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=70068)\n",
+      "Warning: couldn't add memory use for process: 70068\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 70068;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", + "\n", + "> `cd /Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test/human_alu/fasta/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", + "Asset digest: 9e8fa06e6125f89be4fb974879cb91a6\n", + "Default tag for 'human_alu/fasta' set to: default\n", + "\n", + "### Pipeline completed. Epilogue\n", + "* Elapsed time (this run): 0:00:00\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.001 GB\n", + "* Pipeline completed time: 2020-10-12 17:39:27\n", + "Computing initial genome digest...\n", + "Initializing genome...\n", + "Finished building 'fasta' asset\n" + ] + } + ], + "source": [ + "refgenie build human_alu/fasta --files fasta=human_alu.fa.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's inspect the asset inventory" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Server subscriptions: http://rg.databio.org:82\n", + "Local genomes: human_alu, human_repeats, rCRSd\n", + "Local recipes: bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\n", + "Local assets:\n", + " human_alu/ fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n", + " human_repeats/ bwa_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n", + " rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\n" + ] + } + ], + "source": [ + 
"refgenie list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, assets for all three genomes are available." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade refgenie software\n", + "\n", + "Now, let's upgrade to refgenie==0.10.0-dev, which introduces the concept of sequence-derived genome identifiers to uniqly identify genomes." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting git+https://github.com/refgenie/refgenconf.git@dev_config_upgrade\n", + " Cloning https://github.com/refgenie/refgenconf.git (to revision dev_config_upgrade) to /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-kxmw8i6n\n", + " Running command git clone -q https://github.com/refgenie/refgenconf.git /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-kxmw8i6n\n", + " Running command git checkout -b dev_config_upgrade --track origin/dev_config_upgrade\n", + " Switched to a new branch 'dev_config_upgrade'\n", + " Branch 'dev_config_upgrade' set up to track remote branch 'dev_config_upgrade' from 'origin'.\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.12.12.dev0)\n", + "Requirement already satisfied: pyyaml in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (5.1)\n", + "Requirement already satisfied: requests in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (2.21.0)\n", + "Requirement already satisfied: yacman>=0.7.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.7.0)\n", + "Requirement already satisfied: future in 
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (0.17.1)\n", + "Requirement already satisfied: jsonschema in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from refgenconf==0.10.0.dev0) (3.0.1)\n", + "Requirement already satisfied: rich in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenconf==0.10.0.dev0) (3.3.0)\n", + "Requirement already satisfied: ubiquerg>=0.2.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from attmap>=0.12.5->refgenconf==0.10.0.dev0) (0.6.1)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (2019.3.9)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (2.8)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from requests->refgenconf==0.10.0.dev0) (1.24.1)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman>=0.7.0->refgenconf==0.10.0.dev0) (0.9)\n", + "Requirement already satisfied: attrs>=17.4.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (19.1.0)\n", + "Requirement already satisfied: pyrsistent>=0.14.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (0.14.11)\n", + "Requirement already satisfied: six>=1.11.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from jsonschema->refgenconf==0.10.0.dev0) 
(1.12.0)\n", + "Requirement already satisfied: setuptools in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from jsonschema->refgenconf==0.10.0.dev0) (41.0.1)\n", + "Requirement already satisfied: colorama<0.5.0,>=0.4.0 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from rich->refgenconf==0.10.0.dev0) (0.4.1)\n", + "Requirement already satisfied: pprintpp<0.5.0,>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.4.0)\n", + "Requirement already satisfied: typing-extensions<4.0.0,>=3.7.4 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (3.7.4.2)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (2.6.1)\n", + "Requirement already satisfied: dataclasses<0.8,>=0.7; python_version >= \"3.6\" and python_version < \"3.7\" in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.7)\n", + "Requirement already satisfied: commonmark<0.10.0,>=0.9.0 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from rich->refgenconf==0.10.0.dev0) (0.9.1)\n", + "Building wheels for collected packages: refgenconf\n", + " Building wheel for refgenconf (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for refgenconf: filename=refgenconf-0.10.0.dev0-cp36-none-any.whl size=64959 sha256=37191046ce6136b2bd777b1aa274a2d6a5ffb508af7e4969ac0ae97c1682b1f5\n", + " Stored in directory: /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-ephem-wheel-cache-516dw93w/wheels/a8/b1/82/f79eaabaad4cf5c64fb4914e06dd04726c5c226785974aee4e\n", + "Successfully built refgenconf\n", + "Installing collected packages: refgenconf\n", + " Found existing installation: refgenconf 0.9.3\n", + " Uninstalling refgenconf-0.9.3:\n", + " Successfully uninstalled refgenconf-0.9.3\n", + "Successfully installed refgenconf-0.10.0.dev0\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", + "Collecting git+https://github.com/refgenie/refgenie.git@dev_config_upgrade\n", + " Cloning https://github.com/refgenie/refgenie.git (to revision dev_config_upgrade) to /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-3i4zdr4w\n", + " Running command git clone -q https://github.com/refgenie/refgenie.git /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-req-build-3i4zdr4w\n", + " Running command git checkout -b dev_config_upgrade --track origin/dev_config_upgrade\n", + " Switched to a new branch 'dev_config_upgrade'\n", + " Branch 'dev_config_upgrade' set up to track remote branch 'dev_config_upgrade' from 'origin'.\n", + "Requirement already satisfied: logmuse>=0.2.6 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.10.0.dev0) (0.2.6)\n", + "Requirement already satisfied: piper>=0.12.1 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from refgenie==0.10.0.dev0) (0.12.1)\n", + "Requirement already satisfied: pyfaidx>=0.5.5.2 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from 
refgenie==0.10.0.dev0) (0.5.9.1)\n", + "Requirement already satisfied: ubiquerg>=0.4.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.6.1)\n", + "Requirement already satisfied: yacman in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.7.0)\n", + "Requirement already satisfied: attmap>=0.12.5 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (0.12.12.dev0)\n", + "Requirement already satisfied: psutil in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (5.6.1)\n", + "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from piper>=0.12.1->refgenie==0.10.0.dev0) (1.0.3)\n", + "Requirement already satisfied: setuptools>=0.7 in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.10.0.dev0) (41.0.1)\n", + "Requirement already satisfied: six in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pyfaidx>=0.5.5.2->refgenie==0.10.0.dev0) (1.12.0)\n", + "Requirement already satisfied: oyaml in /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages (from yacman->piper>=0.12.1->refgenie==0.10.0.dev0) (0.9)\n", + "Requirement already satisfied: pyyaml>=3.13 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from yacman->piper>=0.12.1->refgenie==0.10.0.dev0) (5.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (2018.9)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: python-dateutil>=2.6.1 in 
/Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (2.8.0)\n", + "Requirement already satisfied: numpy>=1.13.3 in /Users/mstolarczyk/Library/Python/3.6/lib/python/site-packages (from pandas->piper>=0.12.1->refgenie==0.10.0.dev0) (1.17.3)\n", + "Building wheels for collected packages: refgenie\n", + " Building wheel for refgenie (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for refgenie: filename=refgenie-0.10.0.dev0-cp36-none-any.whl size=29266 sha256=d78485a0207036ddd91c36eb66b1973bdb3588aaff925d165d5e5aed483f968c\n", + " Stored in directory: /private/var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/pip-ephem-wheel-cache-wmsjgl78/wheels/07/12/55/f50538357799dd2938a702a2f9e8b84a849975e61b0c59e7a0\n", + "Successfully built refgenie\n", + "Installing collected packages: refgenie\n", + " Found existing installation: refgenie 0.9.3\n", + " Uninstalling refgenie-0.9.3:\n", + " Successfully uninstalled refgenie-0.9.3\n", + "Successfully installed refgenie-0.10.0.dev0\n", + "\u001b[33mWARNING: You are using pip version 19.2.3, however version 20.2.3 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "pip install git+https://github.com/refgenie/refgenconf.git@dev_config_upgrade\n", + "pip install git+https://github.com/refgenie/refgenie.git@dev_config_upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\n" + ] + } + ], + "source": [ + "refgenie --version" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of refgenie commands fails since the config is incompatible:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": 
[ + "Traceback (most recent call last):\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie\", line 10, in \n", + " sys.exit(main())\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/refgenie.py\", line 821, in main\n", + " skip_read_lock=skip_read_lock)\n", + " File \"/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenconf/refgenconf.py\", line 110, in __init__\n", + " raise ConfigNotCompliantError(msg)\n", + "refgenconf.exceptions.ConfigNotCompliantError: This genome config (v0.3) is not compliant with v0.4 standards. \n", + "To use current refgenconf, please use upgrade_config function to upgrade, ordowngrade refgenconf: 'pip install \"refgenconf>=0.7.0,<0.10.0\"'. \n", + "If refgenie is installed, you can use 'refgenie upgrade --target-version 0.4'; For config format documentation please see http://refgenie.databio.org/en/latest/genome_config/\n" + ] + }, + { + "ename": "", + "evalue": "1", + "output_type": "error", + "traceback": [] + } + ], + "source": [ + "refgenie list " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upgrade refgenie configuration file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's upgrade the config to v0.4, just as the error message suggests. We will use `--force` option to run the command in batch mode." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upgrading v0.3 config file format to v0.4\n", + "Retrieved rCRSd digest from the server (511fb1178275e7d529560d53b949dba40815f195623bce8e)\n", + "Retrieved human_repeats digest from the server (ebf26d2f064462bea7029e6b4d2298967d7435bff82ed224)\n", + "Genome digest for human_alu is not available on any of the servers. 
Generating the digest from a local fasta file\n", + "Loaded AnnotatedSequenceDigestList (8 sequences)\n", + "Creating 'data' and 'alias' directories in '/Users/mstolarczyk/Desktop/testing/refgenie/upgrade_test'.\n", + "Copying assets to 'data' and creating alias symlinks in 'alias'. Genomes that the digest could not be determined for 'will be ignored.\n", + "Removing genome assets that have been copied to 'data' directory.\n" + ] + } + ], + "source": [ + "refgenie upgrade --force --target-version 0.4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The upgrade succeeded for all the assets that were previously managed by refgenie, regardless of whether the sequence-derived genome identifiers were available on the server. For ones that were not (`human_alu` genome) refgenie calculated the digest from the locally available FASTA file using the same algorithm that has been used to generate digests for the genomes on the server." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Local refgenie assets \u001b[0m\n", + "\u001b[3m Server subscriptions: \u001b[0m\n", + "\u001b[3m http://rg.databio.org:82 \u001b[0m\n", + "┏━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ rCRSd │ fasta, bowtie2_index │\n", + "│ human_repeats │ fasta, bwa_index │\n", + "│ human_alu │ fasta │\n", + "└───────────────┴──────────────────────┘\n" + ] + } + ], + "source": [ + "refgenie list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + 
"file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From a2d9f685e4cbc8e8765e279dc385fc401d2b1df4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 13 Oct 2020 16:18:10 -0400 Subject: [PATCH 064/110] dont reinit if fasta exists --- refgenie/refgenie.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 58dd0c5d..6717e040 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -566,12 +566,13 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a ori_genome = genome if recipe_name == 'fasta': if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): - _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). It will be re-initialized.".format( - g=genome, a=asset_key, t=asset_tag)) - # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file - genome, _ = \ - rgc.initialize_genome(fasta_path=specified_args["fasta"], - alias=ori_genome, skip_alias_write=True) + pretag = rgc.get_default_tag(genome, "fasta") + _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})". 
+ format(g=genome, a=asset_key, t=pretag)) + else: + # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file + genome, _ = rgc.initialize_genome( + fasta_path=specified_args["fasta"], alias=ori_genome, skip_alias_write=True) else: try: genome = rgc.get_genome_alias_digest(genome, fallback=True) From ad783cf4ff7e454f7c5dae791d69c7696d2f4feb Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 27 Oct 2020 15:05:25 -0400 Subject: [PATCH 065/110] use genome digest in case of already initialized fasta builds --- refgenie/refgenie.py | 1 + 1 file changed, 1 insertion(+) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 6717e040..65cf07d2 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -569,6 +569,7 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a pretag = rgc.get_default_tag(genome, "fasta") _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})". format(g=genome, a=asset_key, t=pretag)) + genome = rgc.get_genome_alias_digest(alias=genome, fallback=True) else: # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file genome, _ = rgc.initialize_genome( From d77125d9e48a9d177f2519656f94e4e5a0a83898 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 29 Oct 2020 09:26:30 -0400 Subject: [PATCH 066/110] Minor text updates to upgrade docs --- docs/upgrade-config.md | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/docs/upgrade-config.md b/docs/upgrade-config.md index cae175fe..0b256ccf 100644 --- a/docs/upgrade-config.md +++ b/docs/upgrade-config.md @@ -1,27 +1,14 @@ # Refgenie configuration file upgrades -Refgenie is under active development and new features are added regularly. This sometimes necessitates changes of refgenie configuration file format and/or asset directory structure. 
+Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. -Starting with the refgenie transition 0.9.3 -> 0.10.0 (configuration file versions: 0.3 -> 0.4) we introduce `refgenie upgrade` functionality, which will take care of all the required reformatting. Upon running the command refgenie will automatically detect the current configuration file version and both reformat the file and make any necessary changes to the asset directory structure. +Starting with the refgenie transition 0.9.3 -> 0.10.0 (configuration file versions: 0.3 -> 0.4) we introduced the `refgenie upgrade` functionality, which will take care of all the required reformatting. Running `refgenie upgrade` will automatically detect the current configuration file version and will: 1) reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. -Below we describe the changes introduced in each configuration file version and list the commands that need to be run to upgrade: +Below we describe the changes introduced in each configuration file version and how to upgrade: -## v0.4 +## Configuration file v0.4 (introduced: refgenie v0.10.0) -**introduced: refgenie 0.10.0** - -### Config format changes - -- use sequence-derived unique genome identifiers instead of genome names everywhere -- add `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily - -### File tree structure changes - -- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name -- move all the contents from the refgenie directory to a new `data` directory -- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. 
The contents of the child directories are symbolic links to the asset files in the `data` directory - -### Upgrade +### How to upgrade To reformat the config run from the command line: @@ -36,6 +23,16 @@ from refgenconf import upgrade_config upgrade_config(target_version="0.4", filepath="/path/to/old/cfg.yml") ``` +### Config format changes + +- use sequence-derived unique genome identifiers instead of genome names everywhere +- add `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily + +### File tree structure changes + +- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name +- move all the contents from the refgenie directory to a new `data` directory +- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. The contents of the child directories are symbolic links to the asset files in the `data` directory From f3d722eb30dba90b91775f28d0c75cd655d749e9 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 29 Oct 2020 10:10:49 -0400 Subject: [PATCH 067/110] Restructure config docs --- docs/autodoc_build/refgenconf.md | 15 ++++---- docs/genome_config.md | 62 +++++++++++++++++++++++++++----- mkdocs.yml | 2 +- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index c3bb9364..40744375 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -34,7 +34,7 @@ A sort of oracle of available reference genome assembly assets ```python -def __init__(self, filepath=None, entries=None, writable=False, wait_max=10) +def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False) ``` Create the config instance by with a filepath or key-value pairs. @@ -43,7 +43,8 @@ Create the config instance by with a filepath or key-value pairs. 
- `filepath` (`str`): a path to the YAML file to read - `entries` (`Iterable[(str, object)] | Mapping[str, object]`): config filepath or collection of key-value pairs - `writable` (`bool`): whether to create the object with write capabilities -- `wait_max` (`int`): how long to wait for creating an object when the file that data will be read from is locked +- `wait_max` (`int`): how long to wait for creating an object when thefile that data will be read from is locked +- `skip_read_lock` (`bool`): whether the file should not be locked forreading when object is created in read only mode #### Raises: @@ -274,7 +275,7 @@ List locally available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x1051476a8>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f14d6d19050>) ``` List genomes and assets available remotely. @@ -423,7 +424,7 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x1051477b8>) +def listr(self, genome=None, order=None, get_url= at 0x7f14d6d19170>) ``` List genomes and assets available remotely. @@ -454,7 +455,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x105147a60>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x7f14d6d19440>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -541,7 +542,7 @@ Runs all installed plugins for the specified hook. 
```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x1051472f0>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f14d6d13c20>) ``` Seek path to a specified genome-asset-tag @@ -833,4 +834,4 @@ Get path to genome configuration file. -*Version Information: `refgenconf` v0.7.1-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file +*Version Information: `refgenconf` v0.9.0, generated by `lucidoc` v0.4.2* \ No newline at end of file diff --git a/docs/genome_config.md b/docs/genome_config.md index 18dc6e4a..53a9c7dd 100644 --- a/docs/genome_config.md +++ b/docs/genome_config.md @@ -2,7 +2,26 @@ Refgenie will read and write a genome configuration file in yaml format. In general, you shouldn't need to mess with the config file. You create one with `refgenie init -c genome_config.yaml`, then you add assets using either `refgenie pull` or `refgenie build`. You can also add your own custom assets with `refgenie add`, which is explained in [using custom assets](custom_assets.md). Refgenie will use the config file to remember what assets are available and where they are. -But here's how the config file works, in case for some reason you do need to edit some things by hand. Here's an example file to get you started: +## Upgrading the configuration file + +Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. Starting with `refgenie v0.10.0` we introduced the `refgenie upgrade` command, which will automatically detect the current configuration file version and will: 1. reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. 
To reformat the config, run from the command line: + +``` +refgenie upgrade --target-version 0.4 -c /path/to/old/cfg.yml +``` + +Or from within Python: + +```python +from refgenconf import upgrade_config +upgrade_config(target_version="0.4", filepath="/path/to/old/cfg.yml") +``` + +Below is a CHANGELOG describing all changes introduced in configuration file versions. + +## Genome configuration file example + +Here's how the config file works, in case you do need to edit some things by hand. Here's an example file: ```yaml genome_folder: /path/to/active/genomes @@ -65,12 +84,39 @@ Note that for a fully operational config just `genome_folder`, `genome_server`, For genomes that are managed by `refgenie` (that is, they were built or pulled with `refgenie`), these asset attributes will be automatically populated. You can edit them and refgenie will respect your edits (unless you re-build or re-pull the asset, which will overwrite those fields). You can also add your own assets and `refgenie` won't touch them. For more info, see [using custom assets](custom_assets.md). -## Genome config versions -### v0.2 -Up to version `0.4.4`, refgenie used a config file version that lacked the `assets` level in the hierarchy (so, assets were listed directly under the genome). Starting with version `0.5.0`, we moved the assets down a layer to accommodate other genome-level attributes we intend to use in the future (like a description, checksums, other provenance information). Earlier refgenie config files will need to be updated. 
+# Config file changelog + +## [0.4] - Unreleased; refgenie v0.10.0 + +### Config format changes + +- use sequence-derived unique genome identifiers instead of genome names everywhere +- add `aliases` key under each genome section to store the aliases that can be used to refer to the genomes easily + +### File tree structure changes + +- use sequence-derived unique genome identifiers instead of genome names in every file name and directory name +- move all the contents from the refgenie directory to a new `data` directory +- add an `alias` directory with contents corresponding to the aliases defined in the configuration file. The contents of the child directories are symbolic links to the asset files in the `data` directory + +## [0.3] - 2019-10-21; refgenie v0.7.0 + +### Config format changes + +- Added seek keys, tags, asset digests, default tag pointer, asset description. + + +## [0.2] - 2019-07-11; refgenie v0.5.0 + +### Config format changes + +- Added `config_version` entry +- Added the `assets` level in the config hierarchy. +- We moved the assets down a layer to accommodate other genome-level attributes we intend to use in the future (like a description, checksums, other provenance information). Earlier refgenie config files will need to be updated. + +## [0.1] - 2019-05-10; refgenie v0.3.0 + +- Initial version of the config file with the initial refgenie release. -### v0.3 -Upt to version `0.6.0`, refgenie used the config v0.2. 
Currently, it uses v0.3, where we introduced: seek keys, tags, asset digests, default tag pointer, asset description - - + \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 073c3d52..f75f8027 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,7 +21,7 @@ nav: - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md - Use refgenie with iGenomes: igenomes.md - - Upgrade config: upgrade-config.md + - Upgrade from config 0.3 to 0.4: config_upgrade_03_to_04.md - Reference: - Genome configuration file: genome_config.md - Glossary: glossary.md From 59bea6d7a05bc4768c3272475815c80bec3966f3 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 29 Oct 2020 10:29:34 -0400 Subject: [PATCH 068/110] typo --- docs/autodoc_build/refgenconf.md | 431 +++++++++++++++++++++++++++++-- docs/genome_config.md | 2 +- 2 files changed, 404 insertions(+), 29 deletions(-) diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 40744375..5b9c5460 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -29,12 +29,64 @@ h4 .content { # Package `refgenconf` Documentation +## Class `ConfigNotCompliantError` +The format of the config file does not match required version/standards + + +## Class `DownloadJsonError` +Non-OK response from a JSON download attempt + + +```python +def __init__(self, resp) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + +## Class `GenomeConfigFormatError` +Exception for invalid genome config file format. + + +```python +def __init__(self, msg) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + +## Class `MissingAssetError` +Error type for request of an unavailable genome asset. + + +## Class `MissingConfigDataError` +Missing required configuration instance items + + +## Class `MissingGenomeError` +Error type for request of unknown genome/assembly. 
+ + +## Class `MissingRecipeError` +Error type for request of an unavailable recipe. + + +## Class `MissingSeekKeyError` +Error type for request of an unavailable asset seek key. + + +## Class `MissingTagError` +Error type for request of an unavailable asset tag. + + ## Class `RefGenConf` A sort of oracle of available reference genome assembly assets ```python -def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False) +def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False, genome_exact=False) ``` Create the config instance by with a filepath or key-value pairs. @@ -55,6 +107,35 @@ Create the config instance by with a filepath or key-value pairs. +```python +def add(self, path, genome, asset, tag=None, seek_keys=None, force=False) +``` + +Add an external asset to the config +#### Parameters: + +- `path` (`str`): a path to the asset to add; must exist and be relativeto the genome_folder +- `genome` (`str`): genome name +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `seek_keys` (`dict`): seek keys to add +- `force` (`bool`): whether to force existing asset overwrite + + + + +```python +def alias_dir(self) +``` + +Path to the genome alias directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def assets_str(self, offset_text=' ', asset_sep=', ', genome_assets_delim='/ ', genome=None, order=None) ``` @@ -158,6 +239,37 @@ In case the local asset does not exist, the config is populated with the remote +```python +def compare(self, genome1, genome2, explain=False) +``` + +Check genomes compatibility level. 
Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata +#### Parameters: + +- `genome1` (`str`): name of the first genome to compare +- `genome2` (`str`): name of the second genome to compare +- `explain` (`bool`): whether the returned code explanation should be displayed + + +#### Returns: + +- `int`: compatibility code + + + + +```python +def data_dir(self) +``` + +Path to the genome data directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def file_path(self) ``` @@ -191,6 +303,30 @@ Determine path to a particular asset for a particular genome. +```python +def genome_aliases(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genome identifiers + + + + +```python +def genome_aliases_table(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genome identifiers + + + + ```python def genomes_list(self, order=None) ``` @@ -220,6 +356,42 @@ Get as single string this configuration's reference genome assembly IDs. +```python +def get_asds_path(self, genome) +``` + +Get path to the Annotated Sequence Digests JSON file for a given genome. Note that the path and/or genome may not exist.
+#### Parameters: + +- `genome` (`str`): genome name + + +#### Returns: + +- `str`: ASDs path + + + + +```python +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fabea3bca60>) +``` + +Get a rich.Table object representing assets available locally +#### Parameters: + +- `genomes` (`list[str]`): genomes to restrict the results with +- `server_url` (`str`): server URL to query for the remote genome data +- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset + + +#### Returns: + +- `rich.table.Table`: table of assets available locally + + + + ```python def get_default_tag(self, genome, asset, use_existing=True) ``` @@ -239,6 +411,54 @@ Determine the asset tag to use as default. The one indicated by the 'default_tag +```python +def get_genome_alias(self, digest, fallback=False, all_aliases=False) +``` + +Get the human readable alias for a genome digest +#### Parameters: + +- `digest` (`str`): digest to find human-readable alias for +- `fallback` (`bool`): whether to return the query digest in caseof failure +- `all_aliases` (`bool`): whether to return all aliases instead of justthe first one + + +#### Returns: + +- `str | list[str]`: human-readable aliases + + +#### Raises: + +- `GenomeConfigFormatError`: if "genome_digests" section doesnot exist in the config +- `UndefinedAliasError`: if a no alias has been defined for therequested digest + + + + +```python +def get_genome_alias_digest(self, alias, fallback=False) +``` + +Get the human readable alias for a genome digest +#### Parameters: + +- `alias` (`str`): alias to find digest for +- `fallback` (`bool`): whether to return the query alias in caseof failure and in case it is one of the digests + + +#### Returns: + +- `str`: genome digest + + +#### Raises: + +- `UndefinedAliasError`: if the specified alias has been assigned toany digests + + + + ```python def get_genome_attributes(self, genome) ``` @@ -275,7 +495,7 @@ List locally 
available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f14d6d19050>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fabea3be598>) ``` List genomes and assets available remotely. @@ -294,7 +514,27 @@ List genomes and assets available remotely. ```python -def getseq(self, genome, locus) +def get_symlink_paths(self, genome, asset=None, tag=None, all_aliases=False) +``` + +Get path to the alias directory for the selected genome-asset-tag +#### Parameters: + +- `genome` (`str`): reference genome ID +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `all_aliases` (`bool`): whether to return a collection of symbolic links or just the first one from the alias list + + +#### Returns: + +- `dict`: + + + + +```python +def getseq(self, genome, locus, as_str=False) ``` Return the sequence found in a selected range and chromosome. Something like the refget protocol. @@ -302,6 +542,12 @@ Return the sequence found in a selected range and chromosome. Something like the - `genome` (`str`): name of the sequence identifier - `locus` (`str`): 1-10' +- `as_str` (`bool`): whether to convert the returned object to string and return just the sequence + + +#### Returns: + +- `str | pyfaidx.FastaRecord | pyfaidx.Sequence`: selected sequence @@ -348,6 +594,28 @@ Initialize genome configuration file on disk +```python +def initialize_genome(self, fasta_path, alias, fasta_unzipped=False, skip_alias_write=False) +``` + +Initialize a genome + +Create a JSON file with Annotated Sequence Digests (ASDs) +for the FASTA file in the genome directory.
+#### Parameters: + +- `fasta_path` (`str`): path to a FASTA file to initialize genome with +- `alias` (`str`): alias to set for the genome +- `skip_alias_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `str, list[dict[]]`: human-readable name for the genome + + + + ```python def is_asset_complete(self, genome, asset, tag) ``` @@ -424,10 +692,10 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x7f14d6d19170>) +def listr(self, genome=None, order=None, get_url= at 0x7fabea3be6a8>, as_str=False) ``` -List genomes and assets available remotely. +List genomes and assets available remotely on all servers the object subscribes to #### Parameters: - `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance @@ -437,7 +705,7 @@ List genomes and assets available remotely. #### Returns: -- `str, str`: text reps of remotely available genomes and assets +- `dict[OrderedDict[list]]`: remotely available genomes and assetskeyed by genome keyed by source server endpoint @@ -455,7 +723,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x7f14d6d19440>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fabea3be950>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -466,6 +734,8 @@ Download and possibly unpack one or more assets for a given ref gen. 
- `tag` (`str`): name of particular tag to fetch - `unpack` (`bool`): whether to unpack a tarball - `force` (`bool | NoneType`): how to handle case in which asset pathalready exists; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt to replace existing file, and True to auto-replay Yes for existing asset replacement. +- `force_large` (`bool | NoneType`): how to handle the case in which a large (> 5GB) asset is to be pulled; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt, and True to auto-reply Yes +- `size_cutoff` (`float`): maximum archive file size to download with no prompt - `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset - `build_signal_handler` (`function(str) -> function`): how to createa signal handler to use during the download; the single argument to this function factory is the download filepath @@ -529,6 +799,24 @@ Remove any relationship links associated with the selected asset +```python +def remove_genome_aliases(self, digest, aliases=None) +``` + +Remove alias for a specified genome digest. This method will remove the digest both from the genomes object and from the aliases mapping in the config +#### Parameters: + +- `digest` (`str`): genome digest to remove an alias for +- `aliases` (`list[str]`): a collection of aliases to remove for the genome. If not provided, all aliases for the digest will be removed + + +#### Returns: + +- `bool`: whether the removal has been performed + + + + ```python def run_plugins(self, hook) ``` @@ -542,7 +830,38 @@ Runs all installed plugins for the specified hook.
```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f14d6d13c20>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fabea3be0d0>) +``` + +Seek path to a specified genome-asset-tag alias +#### Parameters: + +- `genome_name` (`str`): name of a reference genome assembly of interest +- `asset_name` (`str`): name of the particular asset to fetch +- `tag_name` (`str`): name of the particular asset tag to fetch +- `seek_key` (`str`): name of the particular subasset to fetch +- `strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). +- `check_exist` (`function(callable) -> bool`): how to check forasset/path existence +- `enclosing_dir` (`bool`): whether a path to the entire enclosingdirectory should be returned, e.g. 
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `all_aliases` (`bool`): whether to return paths to all asset aliases or just the one for the specified `genome_name` argument + + +#### Returns: + +- `str`: path to the asset + + +#### Raises: + +- `TypeError`: if the existence check is not a one-arg function +- `refgenconf.MissingGenomeError`: if the named assembly isn't known to this configuration instance +- `refgenconf.MissingAssetError`: if the named assembly is known to this configuration instance, but the requested asset is unknown + + + + +```python +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fabea3be1e0>) ``` Seek path to a specified genome-asset-tag @@ -554,7 +873,7 @@ Seek path to a specified genome-asset-tag - `seek_key` (`str`): name of the particular subasset to fetch - `strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). - `check_exist` (`function(callable) -> bool`): how to check forasset/path existence -- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned #### Returns: @@ -572,7 +891,7 @@ Seek path to a specified genome-asset-tag ```python -def set_default_pointer(self, genome, asset, tag, force=False) +def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None) ``` Point to the selected tag by default @@ -581,11 +900,37 @@ Point to the selected tag by default - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest - `tag` (`str`): name of the particular asset tag to point to by default +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `force` (`bool`): whether the default tag change should be forced (even if it exists) +```python +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fabea3bebf8>) +``` + +Assign a human-readable alias to a genome identifier. + +Genomes are identified by a unique identifier which is derived from the +FASTA file (part of fasta asset). This way we can ensure genome +provenance and compatibility with the server. This function maps a +human-readable identifier to make referring to the genomes easier. +#### Parameters: + +- `genome` (`str`): name of the genome to assign to an identifier +- `digest` (`str`): identifier to use +- `overwrite` (`bool`): whether all the previously set aliases should beremoved and just the current one stored +- `no_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `bool`: whether the alias has been established + + + + ```python def subscribe(self, urls, reset=False) ``` @@ -648,7 +993,7 @@ Remove URLs the list of genome_servers. 
```python -def update_assets(self, genome, asset=None, data=None) +def update_assets(self, genome, asset=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset mapping is missing, it will be created @@ -656,6 +1001,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -667,13 +1013,14 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass ```python -def update_genomes(self, genome, data=None) +def update_genomes(self, genome, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome is missing, it will be added #### Parameters: - `genome` (`str`): genome to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -706,7 +1053,7 @@ A convenience method which wraps the update assets and uses it to update the ass ```python -def update_seek_keys(self, genome, asset, tag=None, keys=None) +def update_seek_keys(self, genome, asset, tag=None, keys=None, force_digest=None) ``` A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. @@ -715,6 +1062,7 @@ A convenience method which wraps the updated assets and uses it to update the se - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. 
- `keys` (`Mapping`): seek_keys to be added/updated @@ -726,7 +1074,7 @@ A convenience method which wraps the updated assets and uses it to update the se ```python -def update_tags(self, genome, asset=None, tag=None, data=None) +def update_tags(self, genome, asset=None, tag=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset-tag mapping is missing, it will be created @@ -735,6 +1083,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -781,36 +1130,42 @@ Write the contents to a file. If pre- and post-update plugins are defined, they -## Class `GenomeConfigFormatError` -Exception for invalid genome config file format. +## Class `RefgenconfError` +Base exception type for this package + + +## Class `RemoteDigestMismatchError` +Remote digest of the parent asset does not match its local counterpart ```python -def __init__(self, msg) +def __init__(self, asset, local_digest, remote_digest) ``` Initialize self. See help(type(self)) for accurate signature. -## Class `MissingAssetError` -Error type for request of an unavailable genome asset. +## Class `UnboundEnvironmentVariablesError` +Use of environment variable that isn't bound to a value. -## Class `MissingConfigDataError` -Missing required configuration instance items +```python +def get_dir_digest(path, pm=None) +``` +Generate a MD5 digest that reflects just the contents of the files in the selected directory. +#### Parameters: -## Class `MissingGenomeError` -Error type for request of unknown genome/assembly. 
+- `path` (`str`): path to the directory to digest +- `pm` (`pypiper.PipelineManager`): a pipeline object, optional.The subprocess module will be used if not provided -## Class `RefgenconfError` -Base exception type for this package +#### Returns: + +- `str`: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 -## Class `UnboundEnvironmentVariablesError` -Use of environment variable that isn't bound to a value. ```python @@ -831,7 +1186,27 @@ Get path to genome configuration file. +```python +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fabea3bc510>, link_fun= at 0x7fabea3bfe18>) +``` + +Upgrade the config to a selected target version. + +Convert the config file to target_version format, update file structure +inside genome_folder. Drop genomes for which genome_digest is not available +on any of the servers and do not have a fasta asset locally. +#### Parameters: + +- `target_version` (`str`): the version updated to +- `filepath` (`str`): path to config file +- `force` (`bool`): whether the upgrade should be confirmed upfront +- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset +- `link_fun` (`callable`): function to use to link files, e.g os.symlink or os.link + + + + -*Version Information: `refgenconf` v0.9.0, generated by `lucidoc` v0.4.2* \ No newline at end of file +*Version Information: `refgenconf` v0.10.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file diff --git a/docs/genome_config.md b/docs/genome_config.md index 53a9c7dd..369ef07c 100644 --- a/docs/genome_config.md +++ b/docs/genome_config.md @@ -4,7 +4,7 @@ Refgenie will read and write a genome configuration file in yaml format. In gene ## Upgrading the configuration file -Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. 
Starting with `refgenie v0.10.0` we introduced the `refgenie upgrade` command, which will automatically detect the current configuration file version and will: 1. reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. To reformat the config, run from the command line: +Refgenie is under active development and new features are added regularly. This sometimes necessitates changes in the refgenie configuration file format or asset directory structure. Starting with `refgenie v0.10.0` we introduced the `refgenie upgrade` command, which will automatically detect the current configuration file version and will: 1) reformat the configuration file to the new version; and 2) make any necessary changes to the asset directory structure. To reformat the config, run from the command line: ``` refgenie upgrade --target-version 0.4 -c /path/to/old/cfg.yml From 9b2990ebc6560f363cdc4dbfbfa278e5f6a11293 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 09:31:35 -0400 Subject: [PATCH 069/110] update consts --- refgenie/const.py | 16 ++++++++-------- refgenie/refgenie.yaml | 28 ---------------------------- 2 files changed, 8 insertions(+), 36 deletions(-) delete mode 100644 refgenie/refgenie.yaml diff --git a/refgenie/const.py b/refgenie/const.py index 60db2905..05f6e1ba 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -1,6 +1,6 @@ """ -Constant variables for refgenie package. -Ones that are integral to refgenconf and/or refgenieserver should be defined in refgenconf.const +Constant variables for refgenie package. 
Ones that are integral to refgenconf +and/or refgenieserver should be defined in refgenconf.const """ from refgenconf.const import * @@ -24,8 +24,8 @@ GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] # For each asset we assume a genome is also required -ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, - BUILD_CMD, INSERT_CMD, TAG_CMD, ID_CMD] +ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, TAG_CMD, + ID_CMD] SUBPARSER_MESSAGES = { INIT_CMD: "Initialize a genome configuration.", @@ -42,7 +42,7 @@ SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.", UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", ALIAS_CMD: "Interact with aliases.", - COMPARE_CMD: "compare two genomes.", + COMPARE_CMD: "Compare two genomes.", UPGRADE_CMD: "Upgrade config. This will alter the files on disk." } @@ -51,7 +51,7 @@ ALIAS_REMOVE_CMD = "remove" ALIAS_SUBPARSER_MESSAGES = { - ALIAS_REMOVE_CMD: "remove aliases.", - ALIAS_SET_CMD: "set aliases.", - ALIAS_GET_CMD: "get aliases." + ALIAS_REMOVE_CMD: "Remove aliases.", + ALIAS_SET_CMD: "Set aliases.", + ALIAS_GET_CMD: "Get aliases." 
} diff --git a/refgenie/refgenie.yaml b/refgenie/refgenie.yaml deleted file mode 100644 index d36f5a04..00000000 --- a/refgenie/refgenie.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Build configuration - -tools: - # absolute paths to required tools - bowtie2build: bowtie2-build - bismark_genome_preparation: bismark_genome_preparation - epilog_indexer: epilog_indexer.py - samtools: samtools - kallisto: kallisto - hisat2build: hisat2-build - suffixerator: gt suffixerator - tallymer: gt tallymer mkindex - -index: - bowtie2: True - bismark_bt1: False - bismark_bt2: False - epilog: False - hisat: False - kallisto: True - suffixerator: False - tallymer: False - -param: - epilog: - context: "cg" - tallymer: - minocc: 2 From c87ad29827093845a613f25909295f99b4111eeb Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 09:50:22 -0400 Subject: [PATCH 070/110] update imports --- refgenie/refgenie.py | 69 ++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 65cf07d2..37535ba5 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -21,12 +21,12 @@ import pypiper import refgenconf from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ - MissingRecipeError, DownloadJsonError, get_dir_digest, upgrade_config -from refgenconf import __version__ as __refgenconf_version__ + MissingRecipeError, DownloadJsonError, get_dir_digest, upgrade_config, \ + __version__ as rgc_version, select_genome_config from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ VersionInHelpParser, is_command_callable from ubiquerg.system import is_writable -from yacman import UndefinedAliasError, YacAttMap +from yacman import UndefinedAliasError from argparse import HelpFormatter _LOGGER = None @@ -44,7 +44,7 @@ def build_argparser(): parser = VersionInHelpParser( prog="refgenie", - version=__version__ + " | refgenconf " + __refgenconf_version__, + 
version=f"{__version__} | refgenconf {rgc_version}", description=banner, epilog=additional_description) @@ -68,7 +68,7 @@ def add_subparser(cmd, msg, subparsers): sps[cmd].add_argument( '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", help="Path to local genome configuration file. Optional if {} environment variable is set." - .format(", ".join(refgenconf.CFG_ENV_VARS))) + .format(", ".join(CFG_ENV_VARS))) sps[cmd].add_argument( '--skip-read-lock', required=False, action="store_true", help="Whether the config file should not be locked for reading") @@ -149,7 +149,7 @@ def add_subparser(cmd, msg, subparsers): alias_sps[cmd].add_argument( '-c', '--genome-config', required=False, dest="genome_config", metavar="C", help="Path to local genome configuration file. Optional if {} environment variable is set." - .format(", ".join(refgenconf.CFG_ENV_VARS))) + .format(", ".join(CFG_ENV_VARS))) alias_sps[cmd].add_argument( '--skip-read-lock', required=False, action="store_true", help="Whether the config file should not be locked for reading") @@ -482,44 +482,46 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a # add updates to config file with rgc as r: if asset_key == "fasta": - r.update_genomes(genome, data={CFG_ALIASES_KEY: [ - alias]}, force_digest=genome) + r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, + force_digest=genome) r.update_assets( - *gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, force_digest=genome) - r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key, - CFG_ASSET_CHECKSUM_KEY: digest}, force_digest=genome) - r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) - for k, v in build_pkg[ASSETS].items()}, force_digest=genome) + *gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, + force_digest=genome) + r.update_tags( + *gat, force_digest=genome, + data={CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest}) + r.update_seek_keys( + *gat, 
force_digest=genome, + keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True for a in asset_list: asset_key = a["asset"] - asset_tag = a["tag"] or rgc.get_default_tag( - genome, a["asset"], use_existing=False) + asset_tag = a["tag"] or \ + rgc.get_default_tag(genome, a["asset"], use_existing=False) recipe_name = recipe_name or asset_key if isinstance(recipe_name, dict) or \ - (isinstance(recipe_name, str) and recipe_name in asset_build_packages.keys()): + (isinstance(recipe_name, str) + and recipe_name in asset_build_packages.keys()): if isinstance(recipe_name, dict): _LOGGER.info("Using custom recipe: \n{}".format(recipe_name)) asset_build_package = _check_recipe(recipe_name) recipe_name = asset_build_package["name"] else: - asset_build_package = _check_recipe( - asset_build_packages[recipe_name]) + asset_build_package = \ + _check_recipe(asset_build_packages[recipe_name]) # handle user-requested parents for the required assets input_assets = {} parent_assets = [] specified_asset_keys, specified_assets = None, None if args.assets is not None: parsed_parents_input = _parse_user_build_input(args.assets) - specified_asset_keys, specified_assets = \ - list(parsed_parents_input.keys()), list( - parsed_parents_input.values()) - _LOGGER.debug( - "Custom assets requested: {}".format(args.assets)) + specified_asset_keys = list(parsed_parents_input.keys()) + specified_assets = list(parsed_parents_input.values()) + _LOGGER.debug(f"Custom assets requested: {args.assets}") if not specified_asset_keys and isinstance(args.assets, list): _LOGGER.warning( "Specified parent assets format is invalid. 
Using defaults.") @@ -527,9 +529,8 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a req_asset_data = parse_registry_path(req_asset[KEY]) # for each req asset see if non-default parents were requested if specified_asset_keys is not None and req_asset_data["asset"] in specified_asset_keys: - parent_data = \ - parse_registry_path( - specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) + parent_data = parse_registry_path( + specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) g, a, t, s = parent_data["genome"], \ parent_data["asset"], \ parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \ @@ -559,10 +560,9 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a "Specify it with: --params {x}=value" .format(x=required_param[KEY], desc=required_param[DESC])) else: - specified_params.update( - {required_param[KEY]: required_param[DEFAULT]}) - _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, - asset_key, asset_tag, recipe_name)) + specified_params.update({required_param[KEY]: required_param[DEFAULT]}) + _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format( + genome, asset_key, asset_tag, recipe_name)) ori_genome = genome if recipe_name == 'fasta': if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): @@ -666,8 +666,9 @@ def main(): _LOGGER.error("No command given") sys.exit(1) - gencfg = refgenconf.select_genome_config(filename=args.genome_config, check_exist=not args.command == INIT_CMD, - on_missing=lambda fp: fp, strict_env=True) + gencfg = select_genome_config( + filename=args.genome_config, check_exist=not args.command == INIT_CMD, + on_missing=lambda fp: fp, strict_env=True) if gencfg is None: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) @@ -943,8 +944,8 @@ def main(): print(res) elif args.command == UPGRADE_CMD: - 
upgrade_config(target_version=args.target_version, - filepath=gencfg, force=args.force) + upgrade_config(target_version=args.target_version, filepath=gencfg, + force=args.force) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): From a385453edbbeaa5853f27f64337861d23e999fc9 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 10:25:03 -0400 Subject: [PATCH 071/110] update landing page --- docs/README.md | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/docs/README.md b/docs/README.md index a8bad9c8..b3fdc25a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -20,6 +20,8 @@ Refgenie manages storage, access, and transfer of reference genome resources. It 4. **It includes a python API**. For tool developers, you use `rgc = refgenconf.RefGenConf("genomes.yaml")` to get a Python object with paths to any genome asset, *e.g.*, `rgc.seek("hg38", "kallisto_index")`. +5. **It strictly determines genomes compatibility**. Users refer to genomes with arbitrary aliases, like "hg38", but refgenie uses sequence-derived identifiers to verify genome identity with asset servers. 
+ ## Quick example @@ -43,11 +45,16 @@ refgenie listr Response: ```console -Querying available assets from server: http://refgenomes.databio.org/v2/assets -Remote genomes: mouse_chrM2x, rCRSd -Remote assets: - mouse_chrM2x/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default - rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.chrom_sizes:test, fasta.fai:default, fasta.fai:test, fasta:default, fasta:test + Remote refgenie assets + Server URL: http://rg.databio.org:82 +┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ genome ┃ assets ┃ +┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ mouse_chrM2x │ fasta, bwa_index, bowtie2_index │ +│ hg38 │ fasta, bowtie2_index │ +│ rCRSd │ fasta, bowtie2_index │ +│ human_repeats │ fasta, hisat2_index, bwa_index │ +└─────────────────────┴──────────────────────────────────────────────┘ ``` Next, pull one: @@ -58,8 +65,14 @@ refgenie pull rCRSd/bowtie2_index Response: ```console -'rCRSd/bowtie2_index:default' archive size: 116.8KB -Downloading URL: http://staging.refgenomes.databio.org/v2/asset/rCRSd/bowtie2_index/archive ... +No local digest for genome alias: rCRSd +Setting 'rCRSd' identity with server: http://rg.databio.org:82/v3/alias/genome_digest/rCRSd +Determined server digest for local genome alias (rCRSd): 511fb1178275e7d529560d53b949dba40815f195623bce8e +Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd) +Created alias directories: + - /Users/mstolarczyk/demo/alias/rCRSd +Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive +... ``` See [further reading on downloading assets](pull.md). 
@@ -70,7 +83,7 @@ Refgenie assets are scripted, so if what you need is not available remotely, you ```console -refgenie build mygenome/bwa_index --fasta mygenome.fa.gz +refgenie build mygenome/bwa_index --files fasta=mygenome.fa.gz ``` See [further reading on building assets](build.md). From ee1cb9b1fa131a67bb0aa5cc32682ac3819c764e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 10:50:01 -0400 Subject: [PATCH 072/110] update tutorial notebook --- docs/autodoc_build/refgenconf.md | 16 +- docs_jupyter/tutorial.ipynb | 386 ++++++++++++++++++------------- 2 files changed, 230 insertions(+), 172 deletions(-) diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 5b9c5460..16dc9f31 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -374,7 +374,7 @@ Get path to the Annotated Sequence Digests JSON file for a given genome. Note th ```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fabea3bca60>) +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fe3a1bbba60>) ``` Get a rich.Table object representing assets available locally @@ -495,7 +495,7 @@ List locally available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fabea3be598>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fe3a1bbd598>) ``` List genomes and assets available remotely. @@ -692,7 +692,7 @@ List assemblies for which a particular asset is available. 
```python -def listr(self, genome=None, order=None, get_url= at 0x7fabea3be6a8>, as_str=False) +def listr(self, genome=None, order=None, get_url= at 0x7fe3a1bbd6a8>, as_str=False) ``` List genomes and assets available remotely on all servers the object subscribes to @@ -723,7 +723,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fabea3be950>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fe3a1bbd950>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -830,7 +830,7 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fabea3be0d0>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fe3a1bbd0d0>) ``` Seek path to a specified genome-asset-tag alias @@ -861,7 +861,7 @@ Seek path to a specified genome-asset-tag alias ```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fabea3be1e0>) +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fe3a1bbd1e0>) ``` Seek path to a specified genome-asset-tag @@ -907,7 +907,7 @@ Point to the selected tag by default ```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fabea3bebf8>) +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, 
get_json_url= at 0x7fe3a1bbdbf8>) ``` Assign a human-readable alias to a genome identifier. @@ -1187,7 +1187,7 @@ Get path to genome configuration file. ```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fabea3bc510>, link_fun= at 0x7fabea3bfe18>) +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fe3a1bbb510>, link_fun= at 0x7fe3a1bbee18>) ``` Upgrade the config to a selected target version. diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index 2eb5c772..754f30fc 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -22,12 +22,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Initialized genome configuration file: /home/nsheff/code/refgenie/docs_jupyter/refgenie.yaml\r\n" + "Initialized genome configuration file: /Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml\r\n", + "Created directories:\r\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/data\r\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias\r\n" ] } ], "source": [ - "!refgenie init -c refgenie.yaml" + "!refgenie init -c refgenie.yaml -s http://rg.databio.org:82" ] }, { @@ -46,9 +49,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "config_version: 0.3\r\n", - "genome_folder: /home/nsheff/code/refgenie/docs_jupyter\r\n", - "genome_servers: ['http://refgenomes.databio.org']\r\n", + "config_version: 0.4\r\n", + "genome_folder: /Users/mstolarczyk/code/refgenie/docs_jupyter\r\n", + "genome_servers: \r\n", + " - http://rg.databio.org:82\r\n", "genomes: null\r\n" ] } @@ -78,7 +82,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use `pull` to download the actual asset:" + "Use `listr` to see what's available on the server:" ] }, { @@ -86,35 +90,31 @@ "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, { "data": { "text/plain": [ - "(['hs38d1', 'fasta', 
'default'],\n", - " {'archive_digest': '310c578812a64fcdf08d2df60d7b79b4',\n", - " 'archive_size': '1.7MB',\n", - " 'asset_children': ['hs38d1/star_index:default',\n", - " 'hs38d1/bwa_index:default',\n", - " 'hs38d1/bowtie2_index:default',\n", - " 'hs38d1/bismark_bt1_index:default',\n", - " 'hs38d1/bismark_bt2_index:default',\n", - " 'hs38d1/hisat2_index:default',\n", - " 'hs38d1/tallymer_index:default',\n", - " 'hs38d1/suffixerator_index:default'],\n", - " 'asset_digest': 'eddf5466faa3391a7114e87648466dcb',\n", - " 'asset_parents': [],\n", - " 'asset_path': 'fasta',\n", - " 'asset_size': '6.0MB',\n", - " 'seek_keys': {'chrom_sizes': 'hs38d1.chrom.sizes',\n", - " 'fai': 'hs38d1.fa.fai',\n", - " 'fasta': 'hs38d1.fa'}},\n", - " 'http://refgenomes.databio.org')" + "{'http://rg.databio.org:82/v3/assets': OrderedDict([('hg38',\n", + " ['bowtie2_index:default',\n", + " 'fasta.chrom_sizes:default',\n", + " 'fasta.fai:default',\n", + " 'fasta:default']),\n", + " ('human_repeats',\n", + " ['bwa_index:default',\n", + " 'fasta.chrom_sizes:default',\n", + " 'fasta.fai:default',\n", + " 'fasta:default',\n", + " 'hisat2_index:default']),\n", + " ('mouse_chrM2x',\n", + " ['bowtie2_index:default',\n", + " 'bwa_index:default',\n", + " 'fasta.chrom_sizes:default',\n", + " 'fasta.fai:default',\n", + " 'fasta:default']),\n", + " ('rCRSd',\n", + " ['bowtie2_index:default',\n", + " 'fasta.chrom_sizes:default',\n", + " 'fasta.fai:default',\n", + " 'fasta:default'])])}" ] }, "execution_count": 4, @@ -123,14 +123,14 @@ } ], "source": [ - "rgc.pull(\"hs38d1\", \"fasta\", \"default\")" + "rgc.listr()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Once it's downloaded, use `seek` to retrieve a path to it." 
+ "Use `pull` to download on of the assets asset:" ] }, { @@ -138,10 +138,36 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "39ae8da8f45d4f0a959327e4fecbaf4b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/plain": [ - "'/home/nsheff/code/refgenie/docs_jupyter/hs38d1/fasta/default/hs38d1.fa'" + "(['194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496', 'fasta', 'default'],\n", + " {'asset_path': 'fasta',\n", + " 'asset_digest': '8dfe402f7d29d5b036dd8937119e4404',\n", + " 'archive_digest': 'deae753231ebb9df82622c7140e0bd3a',\n", + " 'asset_size': '46.8KB',\n", + " 'archive_size': '9.1KB',\n", + " 'seek_keys': {'fasta': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.fa',\n", + " 'fai': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.fa.fai',\n", + " 'chrom_sizes': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.chrom.sizes'},\n", + " 'asset_parents': [],\n", + " 'asset_children': ['194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496/bwa_index:default',\n", + " '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496/bowtie2_index:default']},\n", + " 'http://rg.databio.org:82')" ] }, "execution_count": 5, @@ -150,14 +176,14 @@ } ], "source": [ - "rgc.seek(\"hs38d1\", \"fasta\")" + "rgc.pull(\"mouse_chrM2x\", \"fasta\", \"default\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can get the unique asset identifier with `id()`" + "Once it's downloaded, use `seek` to retrieve a path to it." 
] }, { @@ -168,7 +194,7 @@ { "data": { "text/plain": [ - "'eddf5466faa3391a7114e87648466dcb'" + "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mouse_chrM2x/fasta/default/mouse_chrM2x.fa'" ] }, "execution_count": 6, @@ -177,7 +203,34 @@ } ], "source": [ - "rgc.id(\"hs38d1\", \"fasta\")" + "rgc.seek(\"mouse_chrM2x\", \"fasta\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can get the unique asset identifier with `id()`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'8dfe402f7d29d5b036dd8937119e4404'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rgc.id(\"mouse_chrM2x\", \"fasta\")" ] }, { @@ -196,36 +249,34 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2020-03-13 16:11:59-- http://big.databio.org/refgenie_raw/rCRSd.fa.gz\r\n", - "Resolving big.databio.org (big.databio.org)... 128.143.245.181\r\n", - "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\r\n", - "HTTP request sent, awaiting response... 200 OK\r\n", - "Length: 8399 (8.2K) [application/octet-stream]\r\n", - "Saving to: ‘rCRSd.fa.gz’\r\n", - "\r\n", - "\r", - "rCRSd.fa.gz 0%[ ] 0 --.-KB/s \r", - "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0s \r\n", - "\r\n", - "2020-03-13 16:11:59 (214 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\r\n", - "\r\n" + "--2020-10-30 10:48:54-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", + "Resolving big.databio.org (big.databio.org)... 128.143.245.182, 128.143.245.181\n", + "Connecting to big.databio.org (big.databio.org)|128.143.245.182|:80... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 8399 (8.2K) [application/octet-stream]\n", + "Saving to: ‘rCRSd.fa.gz’\n", + "\n", + "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.005s \n", + "\n", + "2020-10-30 10:48:54 (1.53 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", + "\n" ] } ], "source": [ - "!wget http://big.databio.org/refgenie_raw/rCRSd.fa.gz" + "!wget -O rCRSd.fa.gz http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -233,24 +284,30 @@ "output_type": "stream", "text": [ "Using 'default' as the default tag for 'rCRSd/fasta'\n", + "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", "Building 'rCRSd/fasta:default' using 'fasta' recipe\n", + "Initializing genome: rCRSd\n", + "Loaded AnnotatedSequenceDigestList (1 sequences)\n", + "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", "Saving outputs to:\n", - "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n", - "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n", - "* Compute host: puma\n", - "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n", - "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/\n", - "* Pipeline started at: (03-13 16:11:59) elapsed: 
0.0 _TIME_\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/\n", + "* Pipeline started at: (10-30 10:48:55) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.7.6\n", - "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", "* Pypiper version: 0.12.1\n", - "* Pipeline dir: `/home/nsheff/.local/bin`\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", "* Pipeline version: None\n", "\n", "### Arguments passed to pipeline:\n", @@ -266,67 +323,76 @@ "* `genome_description`: `None`\n", "* `logdev`: `False`\n", "* `new_start`: `False`\n", - "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n", + "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", "* `params`: `None`\n", "* `recipe`: `None`\n", "* `recover`: `True`\n", "* `requirements`: `False`\n", "* `silent`: `False`\n", + "* `skip_read_lock`: `False`\n", "* `tag_description`: `None`\n", "* `verbosity`: `None`\n", "* `volumes`: `None`\n", "\n", "----------------------------------------\n", "\n", - "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` \n", "\n", - "> `cp rCRSd.fa.gz 
/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28689)\n", + "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17428)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17428)\n",
+      "Warning: couldn't add memory use for process: 17428\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 28689;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.0GB\n", + " PID: 17428;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `gzip -d /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28691)\n", + "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17429)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17429)\n",
+      "Warning: couldn't add memory use for process: 17429\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 28691;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", + " PID: 17429;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `samtools faidx /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa` (28693)\n", + "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (17430)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17430)\n",
+      "Warning: couldn't add memory use for process: 17430\n",
       "
\n", - "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.018GB. \n", - " PID: 28693;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.018GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", + " PID: 17430;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `cut -f 1,2 /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.fai > /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.chrom.sizes` (28761)\n", + "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (17431)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17431)\n",
+      "Warning: couldn't add memory use for process: 17431\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n", - " PID: 28761;\tCommand: cut;\tReturn code: 0;\tMemory used: 0.0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", + " PID: 17431;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` (28763)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (17433)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17433)\n",
+      "Warning: couldn't add memory use for process: 17433\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n", - " PID: 28763;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n", - "\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", + " PID: 17433;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", - "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", "Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n", - "Default tag for 'rCRSd/fasta' set to: default\n", + "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:01\n", - "* Peak memory (this run): 0.0184 GB\n", - "* Pipeline completed time: 2020-03-13 16:12:00\n", - "Computing initial genome digest...\n", - "Initializing genome...\n", - "Finished building 'fasta' asset\n" + "* Elapsed time (this run): 0:00:00\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0 GB\n", + "* Pipeline completed time: 2020-10-30 10:48:55\n", + "Finished building 'fasta' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" ] } ], @@ -336,14 +402,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\r\n" + "/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa\r\n" ] } ], @@ -360,16 +426,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa'" + 
"'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa'" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -390,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -416,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -424,24 +490,25 @@ "output_type": "stream", "text": [ "Using 'default' as the default tag for 'rCRSd/bowtie2_index'\n", + "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", "Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n", "Saving outputs to:\n", - "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n", - "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", - "* Compute host: puma\n", - "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n", - "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/\n", - "* Pipeline started at: (03-13 16:12:02) elapsed: 0.0 _TIME_\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", + "* Outfolder: 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/\n", + "* Pipeline started at: (10-30 10:48:58) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.7.6\n", - "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n", + "* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", "* Pypiper version: 0.12.1\n", - "* Pipeline dir: `/home/nsheff/.local/bin`\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", "* Pipeline version: None\n", "\n", "### Arguments passed to pipeline:\n", @@ -457,25 +524,25 @@ "* `genome_description`: `None`\n", "* `logdev`: `False`\n", "* `new_start`: `False`\n", - "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n", + "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", "* `params`: `None`\n", "* `recipe`: `None`\n", "* `recover`: `False`\n", "* `requirements`: `False`\n", "* `silent`: `False`\n", + "* `skip_read_lock`: `False`\n", "* `tag_description`: `None`\n", "* `verbosity`: `None`\n", "* `volumes`: `None`\n", "\n", "----------------------------------------\n", "\n", - "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` \n", "\n", - "> `bowtie2-build /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd` (28812)\n", + "> `bowtie2-build 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (17461)\n", "
\n",
-      "Building a SMALL index\n",
       "Settings:\n",
-      "  Output files: \"/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.*.bt2\"\n",
+      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.*.bt2\"\n",
       "  Line rate: 6 (line is 64 bytes)\n",
       "  Lines per side: 1 (side is 64 bytes)\n",
       "  Offset rate: 4 (one in 16)\n",
@@ -492,7 +559,8 @@
       "  Random seed: 0\n",
       "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
       "Input files DNA, FASTA:\n",
-      "  /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\n",
+      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n",
+      "Building a SMALL index\n",
       "Reading reference sizes\n",
       "  Time reading reference sizes: 00:00:00\n",
       "Calculating joined length\n",
@@ -545,8 +613,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -628,8 +696,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -659,27 +727,35 @@
       "    reverse: 1\n",
       "Total time for backward call to driver() for mirror index: 00:00:00\n",
       "
\n", - "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.019GB. \n", - " PID: 28812;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.019GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 17461;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.001GB\n", "\n", "\n", - "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` (28879)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (17463)\n", "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17463)\n",
+      "Warning: couldn't add memory use for process: 17463\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.019GB. \n", - " PID: 28879;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 17463;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", - "\n", - "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n", "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", - "Default tag for 'rCRSd/bowtie2_index' set to: default\n", + "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:01\n", - "* Peak memory (this run): 0.0188 GB\n", - "* Pipeline completed time: 2020-03-13 16:12:03\n", - "Finished building 'bowtie2_index' asset\n" + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.001 GB\n", + "* Pipeline completed time: 2020-10-30 10:48:59\n", + "Finished building 'bowtie2_index' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" ] } ], @@ -696,19 +772,21 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Server subscriptions: http://refgenomes.databio.org\r\n", - "Local genomes: hs38d1, rCRSd\r\n", - "Local recipes: bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\r\n", - "Local 
assets:\r\n", - " hs38d1/ fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n", - " rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n" + "\u001b[3m Local refgenie assets \u001b[0m\r\n", + "\u001b[3m Server subscriptions: http://rg.databio.org:82 \u001b[0m\r\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", + "│ mouse_chrM2x │ fasta │\r\n", + "│ rCRSd │ fasta, bowtie2_index │\r\n", + "└───────────────────────────┴────────────────────────────────────────┘\r\n" ] } ], @@ -725,14 +803,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "rCRSd/fasta:default,4eb430296bc02ed7e4006624f1d5ac53\r\n" + "4eb430296bc02ed7e4006624f1d5ac53\r\n" ] } ], @@ -749,16 +827,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'3.5.2'" + "'3.6.5'" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -770,40 +848,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "refgenie 0.9.0-dev\r\n" + "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\r\n" ] } ], "source": [ "!refgenie --version" ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'0.7.0-dev'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "refgenconf.__version__" - ] } ], "metadata": { @@ -822,7 +880,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.5" } }, "nbformat": 4, From 15b321c31957e58f3549100cd5a006f12cd873fc Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 10:54:18 -0400 Subject: [PATCH 073/110] correct typos --- docs_jupyter/tutorial.ipynb | 390 ++++-------------------------------- 1 file changed, 37 insertions(+), 353 deletions(-) diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index 754f30fc..54fd5277 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -8,7 +8,7 @@ "\n", "I assume you've already installed refgenie. In this tutorial I'll show you a few ways to use refgenie from the command line (commands that start with a `!`), and also some Python commands.\n", "\n", - "To start, initialize an empty refgenie configuration file from the shell:" + "To start, initialize an empty refgenie configuration file from the shell and subscribe to the desired asset server:" ] }, { @@ -130,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use `pull` to download on of the assets asset:" + "Use `pull` to download one of the assets:" ] }, { @@ -141,7 +141,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "39ae8da8f45d4f0a959327e4fecbaf4b", + "model_id": "b0f4e24236b74ab9a5590e0f811cebd2", "version_major": 2, "version_minor": 0 }, @@ -256,7 +256,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2020-10-30 10:48:54-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", + "--2020-10-30 10:53:02-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", "Resolving big.databio.org (big.databio.org)... 128.143.245.182, 128.143.245.181\n", "Connecting to big.databio.org (big.databio.org)|128.143.245.182|:80... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", @@ -265,7 +265,7 @@ "\n", "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.005s \n", "\n", - "2020-10-30 10:48:54 (1.53 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", + "2020-10-30 10:53:02 (1.65 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", "\n" ] } @@ -300,7 +300,7 @@ "* Compute host: MichalsMBP\n", "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/\n", - "* Pipeline started at: (10-30 10:48:55) elapsed: 0.0 _TIME_\n", + "* Pipeline started at: (10-30 10:53:03) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", @@ -338,49 +338,49 @@ "\n", "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` \n", "\n", - "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17428)\n", + "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17678)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17428)\n",
-      "Warning: couldn't add memory use for process: 17428\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17678)\n",
+      "Warning: couldn't add memory use for process: 17678\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17428;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", + " PID: 17678;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17429)\n", + "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17680)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17429)\n",
-      "Warning: couldn't add memory use for process: 17429\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17680)\n",
+      "Warning: couldn't add memory use for process: 17680\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17429;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", + " PID: 17680;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (17430)\n", + "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (17681)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17430)\n",
-      "Warning: couldn't add memory use for process: 17430\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17681)\n",
+      "Warning: couldn't add memory use for process: 17681\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17430;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0GB\n", + " PID: 17681;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (17431)\n", + "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (17682)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17431)\n",
-      "Warning: couldn't add memory use for process: 17431\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17682)\n",
+      "Warning: couldn't add memory use for process: 17682\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17431;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", + " PID: 17682;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (17433)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (17684)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17433)\n",
-      "Warning: couldn't add memory use for process: 17433\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17684)\n",
+      "Warning: couldn't add memory use for process: 17684\n",
       "
\n", "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17433;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + " PID: 17684;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n", "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", @@ -389,7 +389,7 @@ "* Elapsed time (this run): 0:00:00\n", "* Total elapsed time (all runs): 0:00:00\n", "* Peak memory (this run): 0 GB\n", - "* Pipeline completed time: 2020-10-30 10:48:55\n", + "* Pipeline completed time: 2020-10-30 10:53:03\n", "Finished building 'fasta' asset\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" @@ -482,283 +482,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using 'default' as the default tag for 'rCRSd/bowtie2_index'\n", - "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", - "Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n", - "Saving outputs to:\n", - "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", - "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build\n", - "### Pipeline run code and environment:\n", - "\n", - "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", - "* Compute host: MichalsMBP\n", - "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", - "* Outfolder: 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/\n", - "* Pipeline started at: (10-30 10:48:58) elapsed: 0.0 _TIME_\n", - "\n", - "### Version log:\n", - "\n", - "* Python version: 3.6.5\n", - "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", - "* Pypiper version: 0.12.1\n", - "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", - "* Pipeline version: None\n", - "\n", - "### Arguments passed to pipeline:\n", - "\n", - "* `asset_registry_paths`: `['rCRSd/bowtie2_index']`\n", - "* `assets`: `None`\n", - "* `command`: `build`\n", - "* `config_file`: `refgenie.yaml`\n", - "* `docker`: `False`\n", - "* `files`: `None`\n", - "* `genome`: `None`\n", - "* `genome_config`: `refgenie.yaml`\n", - "* `genome_description`: `None`\n", - "* `logdev`: `False`\n", - "* `new_start`: `False`\n", - "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", - "* `params`: `None`\n", - "* `recipe`: `None`\n", - "* `recover`: `False`\n", - "* `requirements`: `False`\n", - "* `silent`: `False`\n", - "* `skip_read_lock`: `False`\n", - "* `tag_description`: `None`\n", - "* `verbosity`: `None`\n", - "* `volumes`: `None`\n", - "\n", - "----------------------------------------\n", - "\n", - "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` \n", - "\n", - "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (17461)\n", - "
\n",
-      "Settings:\n",
-      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.*.bt2\"\n",
-      "  Line rate: 6 (line is 64 bytes)\n",
-      "  Lines per side: 1 (side is 64 bytes)\n",
-      "  Offset rate: 4 (one in 16)\n",
-      "  FTable chars: 10\n",
-      "  Strings: unpacked\n",
-      "  Max bucket size: default\n",
-      "  Max bucket size, sqrt multiplier: default\n",
-      "  Max bucket size, len divisor: 4\n",
-      "  Difference-cover sample period: 1024\n",
-      "  Endianness: little\n",
-      "  Actual local endianness: little\n",
-      "  Sanity checking: disabled\n",
-      "  Assertions: disabled\n",
-      "  Random seed: 0\n",
-      "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
-      "Input files DNA, FASTA:\n",
-      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n",
-      "Building a SMALL index\n",
-      "Reading reference sizes\n",
-      "  Time reading reference sizes: 00:00:00\n",
-      "Calculating joined length\n",
-      "Writing header\n",
-      "Reserving space for joined string\n",
-      "Joining reference sequences\n",
-      "  Time to join reference sequences: 00:00:00\n",
-      "bmax according to bmaxDivN setting: 8284\n",
-      "Using parameters --bmax 6213 --dcv 1024\n",
-      "  Doing ahead-of-time memory usage test\n",
-      "  Passed!  Constructing with these parameters: --bmax 6213 --dcv 1024\n",
-      "Constructing suffix-array element generator\n",
-      "Building DifferenceCoverSample\n",
-      "  Building sPrime\n",
-      "  Building sPrimeOrder\n",
-      "  V-Sorting samples\n",
-      "  V-Sorting samples time: 00:00:00\n",
-      "  Allocating rank array\n",
-      "  Ranking v-sort output\n",
-      "  Ranking v-sort output time: 00:00:00\n",
-      "  Invoking Larsson-Sadakane on ranks\n",
-      "  Invoking Larsson-Sadakane on ranks time: 00:00:00\n",
-      "  Sanity-checking and returning\n",
-      "Building samples\n",
-      "Reserving space for 12 sample suffixes\n",
-      "Generating random suffixes\n",
-      "QSorting 12 sample offsets, eliminating duplicates\n",
-      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
-      "Multikey QSorting 12 samples\n",
-      "  (Using difference cover)\n",
-      "  Multikey QSorting samples time: 00:00:00\n",
-      "Calculating bucket sizes\n",
-      "Splitting and merging\n",
-      "  Splitting and merging time: 00:00:00\n",
-      "Avg bucket size: 33136 (target: 6212)\n",
-      "Converting suffix-array elements to index image\n",
-      "Allocating ftab, absorbFtab\n",
-      "Entering Ebwt loop\n",
-      "Getting block 1 of 1\n",
-      "  No samples; assembling all-inclusive block\n",
-      "  Sorting block of length 33136 for bucket 1\n",
-      "  (Using difference cover)\n",
-      "  Sorting block time: 00:00:00\n",
-      "Returning block of 33137 for bucket 1\n",
-      "Exited Ebwt loop\n",
-      "fchr[A]: 0\n",
-      "fchr[C]: 10248\n",
-      "fchr[G]: 20610\n",
-      "fchr[T]: 24948\n",
-      "fchr[$]: 33136\n",
-      "Exiting Ebwt::buildToDisk()\n",
-      "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.2.bt2\n",
-      "Re-opening _in1 and _in2 as input streams\n",
-      "Returning from Ebwt constructor\n",
-      "Headers:\n",
-      "    len: 33136\n",
-      "    bwtLen: 33137\n",
-      "    sz: 8284\n",
-      "    bwtSz: 8285\n",
-      "    lineRate: 6\n",
-      "    offRate: 4\n",
-      "    offMask: 0xfffffff0\n",
-      "    ftabChars: 10\n",
-      "    eftabLen: 20\n",
-      "    eftabSz: 80\n",
-      "    ftabLen: 1048577\n",
-      "    ftabSz: 4194308\n",
-      "    offsLen: 2072\n",
-      "    offsSz: 8288\n",
-      "    lineSz: 64\n",
-      "    sideSz: 64\n",
-      "    sideBwtSz: 48\n",
-      "    sideBwtLen: 192\n",
-      "    numSides: 173\n",
-      "    numLines: 173\n",
-      "    ebwtTotLen: 11072\n",
-      "    ebwtTotSz: 11072\n",
-      "    color: 0\n",
-      "    reverse: 0\n",
-      "Total time for call to driver() for forward index: 00:00:00\n",
-      "Reading reference sizes\n",
-      "  Time reading reference sizes: 00:00:00\n",
-      "Calculating joined length\n",
-      "Writing header\n",
-      "Reserving space for joined string\n",
-      "Joining reference sequences\n",
-      "  Time to join reference sequences: 00:00:00\n",
-      "  Time to reverse reference sequence: 00:00:00\n",
-      "bmax according to bmaxDivN setting: 8284\n",
-      "Using parameters --bmax 6213 --dcv 1024\n",
-      "  Doing ahead-of-time memory usage test\n",
-      "  Passed!  Constructing with these parameters: --bmax 6213 --dcv 1024\n",
-      "Constructing suffix-array element generator\n",
-      "Building DifferenceCoverSample\n",
-      "  Building sPrime\n",
-      "  Building sPrimeOrder\n",
-      "  V-Sorting samples\n",
-      "  V-Sorting samples time: 00:00:00\n",
-      "  Allocating rank array\n",
-      "  Ranking v-sort output\n",
-      "  Ranking v-sort output time: 00:00:00\n",
-      "  Invoking Larsson-Sadakane on ranks\n",
-      "  Invoking Larsson-Sadakane on ranks time: 00:00:00\n",
-      "  Sanity-checking and returning\n",
-      "Building samples\n",
-      "Reserving space for 12 sample suffixes\n",
-      "Generating random suffixes\n",
-      "QSorting 12 sample offsets, eliminating duplicates\n",
-      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
-      "Multikey QSorting 12 samples\n",
-      "  (Using difference cover)\n",
-      "  Multikey QSorting samples time: 00:00:00\n",
-      "Calculating bucket sizes\n",
-      "Splitting and merging\n",
-      "  Splitting and merging time: 00:00:00\n",
-      "Avg bucket size: 33136 (target: 6212)\n",
-      "Converting suffix-array elements to index image\n",
-      "Allocating ftab, absorbFtab\n",
-      "Entering Ebwt loop\n",
-      "Getting block 1 of 1\n",
-      "  No samples; assembling all-inclusive block\n",
-      "  Sorting block of length 33136 for bucket 1\n",
-      "  (Using difference cover)\n",
-      "  Sorting block time: 00:00:00\n",
-      "Returning block of 33137 for bucket 1\n",
-      "Exited Ebwt loop\n",
-      "fchr[A]: 0\n",
-      "fchr[C]: 10248\n",
-      "fchr[G]: 20610\n",
-      "fchr[T]: 24948\n",
-      "fchr[$]: 33136\n",
-      "Exiting Ebwt::buildToDisk()\n",
-      "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.2.bt2\n",
-      "Re-opening _in1 and _in2 as input streams\n",
-      "Returning from Ebwt constructor\n",
-      "Headers:\n",
-      "    len: 33136\n",
-      "    bwtLen: 33137\n",
-      "    sz: 8284\n",
-      "    bwtSz: 8285\n",
-      "    lineRate: 6\n",
-      "    offRate: 4\n",
-      "    offMask: 0xfffffff0\n",
-      "    ftabChars: 10\n",
-      "    eftabLen: 20\n",
-      "    eftabSz: 80\n",
-      "    ftabLen: 1048577\n",
-      "    ftabSz: 4194308\n",
-      "    offsLen: 2072\n",
-      "    offsSz: 8288\n",
-      "    lineSz: 64\n",
-      "    sideSz: 64\n",
-      "    sideBwtSz: 48\n",
-      "    sideBwtLen: 192\n",
-      "    numSides: 173\n",
-      "    numLines: 173\n",
-      "    ebwtTotLen: 11072\n",
-      "    ebwtTotSz: 11072\n",
-      "    color: 0\n",
-      "    reverse: 1\n",
-      "Total time for backward call to driver() for mirror index: 00:00:00\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", - " PID: 17461;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.001GB\n", - "\n", - "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (17463)\n", - "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17463)\n",
-      "Warning: couldn't add memory use for process: 17463\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", - " PID: 17463;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", - "\n", - "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", - "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", - "\n", - "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:00\n", - "* Peak memory (this run): 0.001 GB\n", - "* Pipeline completed time: 2020-10-30 10:48:59\n", - "Finished building 'bowtie2_index' asset\n", - "Created alias directories: \n", - " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" - ] - } - ], + "outputs": [], "source": [ "!refgenie build rCRSd/bowtie2_index -c refgenie.yaml" ] @@ -772,24 +498,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[3m Local refgenie assets \u001b[0m\r\n", - "\u001b[3m Server subscriptions: http://rg.databio.org:82 \u001b[0m\r\n", - "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", - "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", - "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", - "│ mouse_chrM2x │ fasta │\r\n", - "│ rCRSd │ fasta, bowtie2_index │\r\n", - "└───────────────────────────┴────────────────────────────────────────┘\r\n" - ] - } - ], + "outputs": [], "source": [ "!refgenie list -c refgenie.yaml" ] @@ -803,17 +514,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4eb430296bc02ed7e4006624f1d5ac53\r\n" - ] - } - ], + "outputs": [], "source": [ "!refgenie id rCRSd/fasta -c refgenie.yaml" 
] @@ -827,20 +530,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'3.6.5'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from platform import python_version \n", "python_version()" @@ -848,17 +540,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\r\n" - ] - } - ], + "outputs": [], "source": [ "!refgenie --version" ] From 6c51e49a10909ab36326746c03e7c72d8a062046 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 11:28:38 -0400 Subject: [PATCH 074/110] update docs; config, custom assets etc --- docs/autodoc_build/refgenconf.md | 16 +- docs/custom_assets.md | 28 ++- docs/genome_config.md | 45 ++--- docs_jupyter/tutorial.ipynb | 336 ++++++++++++++++++++++++++++++- 4 files changed, 377 insertions(+), 48 deletions(-) diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 16dc9f31..010b24d1 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -374,7 +374,7 @@ Get path to the Annotated Sequence Digests JSON file for a given genome. Note th ```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fe3a1bbba60>) +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7f80b6aa2a60>) ``` Get a rich.Table object representing assets available locally @@ -495,7 +495,7 @@ List locally available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fe3a1bbd598>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f80b6aa5598>) ``` List genomes and assets available remotely. 
@@ -692,7 +692,7 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x7fe3a1bbd6a8>, as_str=False) +def listr(self, genome=None, order=None, get_url= at 0x7f80b6aa56a8>, as_str=False) ``` List genomes and assets available remotely on all servers the object subscribes to @@ -723,7 +723,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fe3a1bbd950>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7f80b6aa5950>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -830,7 +830,7 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fe3a1bbd0d0>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7f80b6aa50d0>) ``` Seek path to a specified genome-asset-tag alias @@ -861,7 +861,7 @@ Seek path to a specified genome-asset-tag alias ```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fe3a1bbd1e0>) +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f80b6aa51e0>) ``` Seek path to a specified genome-asset-tag @@ -907,7 +907,7 @@ Point to the selected tag by default ```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fe3a1bbdbf8>) +def set_genome_alias(self, genome, digest=None, servers=None, 
overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7f80b6aa5bf8>) ``` Assign a human-readable alias to a genome identifier. @@ -1187,7 +1187,7 @@ Get path to genome configuration file. ```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fe3a1bbb510>, link_fun= at 0x7fe3a1bbee18>) +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7f80b6aa2510>, link_fun= at 0x7f80b6aa6e18>) ``` Upgrade the config to a selected target version. diff --git a/docs/custom_assets.md b/docs/custom_assets.md index 89254bf3..66821147 100644 --- a/docs/custom_assets.md +++ b/docs/custom_assets.md @@ -18,14 +18,28 @@ If you want to, you could also just edit the config file by hand by adding this ```yaml genomes: - hg38: + 511fb1178275e7d529560d53b949dba40815f195623bce8e: + aliases: + - hg38 + - human assets: manual_anno: - asset_path: manual_anno - asset_description: Manual annotations from project X - seek_keys: - anno1: anno1.txt - anno2: anno2.txt + tags: + default: + asset_path: manual_anno + asset_description: Manual annotations from project X + seek_keys: + anno1: anno1.txt + anno2: anno2.txt + default_tag: default ``` -Now, you can access this asset with `refgenie` the same way you do all other assets... `refgenie list` will include it, `refgenie seek -g gh38 -a manual_anno` will retrieve the path, and from within python, `RefGenConf.get_asset('hg38', 'manual_anno')` will also work. The advantage of doing this is that it lets you include *all* your genome-associated resources, including manual ones, within the same framework. +The refgenie-compatible genome digest can be determined this way: + +```python +from refgenconf.seqcol import SeqColClient +digest, _ = SeqColClient({}).load_fasta("path/hg38.fa") +# digest -> 511fb1178275e7d529560d53b949dba40815f195623bce8e +``` + +Now, you can access this asset with `refgenie` the same way you do all other assets... 
`refgenie list` will include it, `refgenie seek -g hg38 -a manual_anno` will retrieve the path, and from within python, `RefGenConf.seek('hg38', 'manual_anno')` will also work. The advantage of doing this is that it lets you include *all* your genome-associated resources, including manual ones, within the same framework. diff --git a/docs/genome_config.md b/docs/genome_config.md index 369ef07c..f7a40287 100644 --- a/docs/genome_config.md +++ b/docs/genome_config.md @@ -24,23 +24,29 @@ Below is a CHANGELOG describing all changes introduced in configuration file ver Here's how the config file works, in case you do need to edit some things by hand. Here's an example file: ```yaml -genome_folder: /path/to/active/genomes -genome_servers: http://refgenomes.databio.org -genome_archive: /path/to/archived/genomes +config_version: 0.4 +genome_folder: /path/to/genomes +genome_archive_folder: /path/to/genome_archives +genome_archive_config: /path/to/genome_archive/config.yaml +remote_url_base: http://awspds.refgenie.databio.org/ +genome_servers: ['http://refgenomes.databio.org'] genomes: - rCRSd: + 511fb1178275e7d529560d53b949dba40815f195623bce8e: + aliases: + - hg38 + - human assets: fasta: tags: default: seek_keys: - fasta: rCRSd.fa - fai: rCRSd.fa.fai - chrom_sizes: rCRSd.chrom.sizes + fasta: 511fb1178275e7d529560d53b949dba40815f195623bce8e.fa + fai: 511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai + chrom_sizes: 511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes asset_parents: [] asset_path: fasta asset_digest: a3c46f201a3ce7831d85cf4a125aa334 - asset_children: ['bowtie2_index:default'] + asset_children: ['511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index:default'] default_tag: default bowtie2_index: asset_description: Genome index for bowtie, produced with bowtie-build @@ -50,28 +56,17 @@ genomes: seek_keys: bowtie2_index: .
asset_digest: 0f9217d44264ae2188fcf8e469d2bb38 - asset_parents: ['fasta:default'] - default_tag: default - hg38: - assets: - gencode_gtf: - asset_description: GTF annotation asset which provides access to all annotated transcripts which make up an Ensembl gene set. - tags: - default: - asset_path: gencode_gtf - seek_keys: - gencode_gtf: hg38.gtf.gz - asset_digest: 4cd4eac99cdfdeb8ff72d8f8a4a16f9f - asset_parents: [] + asset_parents: ['511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:default'] default_tag: default ``` ## Details of config attributes +### Required - **genome_folder**: Path to parent folder refgenie-managed assets. -- **genome_servers**: URL to a refgenieserver instances. -- **genome_archive**: (optional; used by refgenieserver) Path to folder where asset archives will be stored. +- **genome_servers**: URLs of refgenieserver instances. - **genomes**: A list of genomes, each genome has a list of assets. Any relative paths in the asset `path` attributes are considered relative to the genome folder in the config file (or the file itself if not folder path is specified), with the genome name as an intervening path component, e.g. `folder/mm10/indexed_bowtie2`. +- **aliases**: A list of arbitrary strings that can be used to refer to the namespace - **tags**: A collection of tags defined for the asset - **default_tag**: A pointer to the tag that is currently defined as the default one - **asset_parents**: A list of assets that were used to build the asset in question @@ -79,6 +74,10 @@ genomes: - **seek_keys**: A mapping of names and paths of the specific files within an asset - **asset_path**: A path to the asset folder, relative to the genome config file - **asset_digest**: A digest of the asset directory (more precisely, of the file contents within one) used to address the asset provenance issues when the assets are pulled or built.
+### Optional (used by refgenieserver) +- **genome_archive_folder**: Path to folder where asset archives will be stored. +- **genome_archive_config**: Path to the file where the asset archives config will be stored. +- **remote_url_base**: Path/URL to prepend to served asset archives, if non-local ones are to be served Note that for a fully operational config just `genome_folder`, `genome_server`, `genomes`, `assets`, `tags` and `seek_keys` keys are required. diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index 54fd5277..2593a6c9 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -482,9 +482,283 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using 'default' as the default tag for 'rCRSd/bowtie2_index'\n", + "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", + "Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n", + "Saving outputs to:\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build\n", + "### Pipeline run code and environment:\n", + "\n", + "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", + "* Compute host: MichalsMBP\n", + "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/\n", + "* Pipeline started at: (10-30 10:53:06) elapsed: 0.0 _TIME_\n", + "\n", + "### Version log:\n", + "\n", +
"* Python version: 3.6.5\n", + "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n", + "* Pypiper version: 0.12.1\n", + "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n", + "* Pipeline version: None\n", + "\n", + "### Arguments passed to pipeline:\n", + "\n", + "* `asset_registry_paths`: `['rCRSd/bowtie2_index']`\n", + "* `assets`: `None`\n", + "* `command`: `build`\n", + "* `config_file`: `refgenie.yaml`\n", + "* `docker`: `False`\n", + "* `files`: `None`\n", + "* `genome`: `None`\n", + "* `genome_config`: `refgenie.yaml`\n", + "* `genome_description`: `None`\n", + "* `logdev`: `False`\n", + "* `new_start`: `False`\n", + "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n", + "* `params`: `None`\n", + "* `recipe`: `None`\n", + "* `recover`: `False`\n", + "* `requirements`: `False`\n", + "* `silent`: `False`\n", + "* `skip_read_lock`: `False`\n", + "* `tag_description`: `None`\n", + "* `verbosity`: `None`\n", + "* `volumes`: `None`\n", + "\n", + "----------------------------------------\n", + "\n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` \n", + "\n", + "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (17711)\n", + "
\n",
+      "Settings:\n",
+      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.*.bt2\"\n",
+      "  Line rate: 6 (line is 64 bytes)\n",
+      "  Lines per side: 1 (side is 64 bytes)\n",
+      "  Offset rate: 4 (one in 16)\n",
+      "  FTable chars: 10\n",
+      "  Strings: unpacked\n",
+      "  Max bucket size: default\n",
+      "  Max bucket size, sqrt multiplier: default\n",
+      "  Max bucket size, len divisor: 4\n",
+      "  Difference-cover sample period: 1024\n",
+      "  Endianness: little\n",
+      "  Actual local endianness: little\n",
+      "  Sanity checking: disabled\n",
+      "  Assertions: disabled\n",
+      "  Random seed: 0\n",
+      "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
+      "Input files DNA, FASTA:\n",
+      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n",
+      "Building a SMALL index\n",
+      "Reading reference sizes\n",
+      "  Time reading reference sizes: 00:00:00\n",
+      "Calculating joined length\n",
+      "Writing header\n",
+      "Reserving space for joined string\n",
+      "Joining reference sequences\n",
+      "  Time to join reference sequences: 00:00:00\n",
+      "bmax according to bmaxDivN setting: 8284\n",
+      "Using parameters --bmax 6213 --dcv 1024\n",
+      "  Doing ahead-of-time memory usage test\n",
+      "  Passed!  Constructing with these parameters: --bmax 6213 --dcv 1024\n",
+      "Constructing suffix-array element generator\n",
+      "Building DifferenceCoverSample\n",
+      "  Building sPrime\n",
+      "  Building sPrimeOrder\n",
+      "  V-Sorting samples\n",
+      "  V-Sorting samples time: 00:00:00\n",
+      "  Allocating rank array\n",
+      "  Ranking v-sort output\n",
+      "  Ranking v-sort output time: 00:00:00\n",
+      "  Invoking Larsson-Sadakane on ranks\n",
+      "  Invoking Larsson-Sadakane on ranks time: 00:00:00\n",
+      "  Sanity-checking and returning\n",
+      "Building samples\n",
+      "Reserving space for 12 sample suffixes\n",
+      "Generating random suffixes\n",
+      "QSorting 12 sample offsets, eliminating duplicates\n",
+      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
+      "Multikey QSorting 12 samples\n",
+      "  (Using difference cover)\n",
+      "  Multikey QSorting samples time: 00:00:00\n",
+      "Calculating bucket sizes\n",
+      "Splitting and merging\n",
+      "  Splitting and merging time: 00:00:00\n",
+      "Avg bucket size: 33136 (target: 6212)\n",
+      "Converting suffix-array elements to index image\n",
+      "Allocating ftab, absorbFtab\n",
+      "Entering Ebwt loop\n",
+      "Getting block 1 of 1\n",
+      "  No samples; assembling all-inclusive block\n",
+      "  Sorting block of length 33136 for bucket 1\n",
+      "  (Using difference cover)\n",
+      "  Sorting block time: 00:00:00\n",
+      "Returning block of 33137 for bucket 1\n",
+      "Exited Ebwt loop\n",
+      "fchr[A]: 0\n",
+      "fchr[C]: 10248\n",
+      "fchr[G]: 20610\n",
+      "fchr[T]: 24948\n",
+      "fchr[$]: 33136\n",
+      "Exiting Ebwt::buildToDisk()\n",
+      "Returning from initFromVector\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.2.bt2\n",
+      "Re-opening _in1 and _in2 as input streams\n",
+      "Returning from Ebwt constructor\n",
+      "Headers:\n",
+      "    len: 33136\n",
+      "    bwtLen: 33137\n",
+      "    sz: 8284\n",
+      "    bwtSz: 8285\n",
+      "    lineRate: 6\n",
+      "    offRate: 4\n",
+      "    offMask: 0xfffffff0\n",
+      "    ftabChars: 10\n",
+      "    eftabLen: 20\n",
+      "    eftabSz: 80\n",
+      "    ftabLen: 1048577\n",
+      "    ftabSz: 4194308\n",
+      "    offsLen: 2072\n",
+      "    offsSz: 8288\n",
+      "    lineSz: 64\n",
+      "    sideSz: 64\n",
+      "    sideBwtSz: 48\n",
+      "    sideBwtLen: 192\n",
+      "    numSides: 173\n",
+      "    numLines: 173\n",
+      "    ebwtTotLen: 11072\n",
+      "    ebwtTotSz: 11072\n",
+      "    color: 0\n",
+      "    reverse: 0\n",
+      "Total time for call to driver() for forward index: 00:00:00\n",
+      "Reading reference sizes\n",
+      "  Time reading reference sizes: 00:00:00\n",
+      "Calculating joined length\n",
+      "Writing header\n",
+      "Reserving space for joined string\n",
+      "Joining reference sequences\n",
+      "  Time to join reference sequences: 00:00:00\n",
+      "  Time to reverse reference sequence: 00:00:00\n",
+      "bmax according to bmaxDivN setting: 8284\n",
+      "Using parameters --bmax 6213 --dcv 1024\n",
+      "  Doing ahead-of-time memory usage test\n",
+      "  Passed!  Constructing with these parameters: --bmax 6213 --dcv 1024\n",
+      "Constructing suffix-array element generator\n",
+      "Building DifferenceCoverSample\n",
+      "  Building sPrime\n",
+      "  Building sPrimeOrder\n",
+      "  V-Sorting samples\n",
+      "  V-Sorting samples time: 00:00:00\n",
+      "  Allocating rank array\n",
+      "  Ranking v-sort output\n",
+      "  Ranking v-sort output time: 00:00:00\n",
+      "  Invoking Larsson-Sadakane on ranks\n",
+      "  Invoking Larsson-Sadakane on ranks time: 00:00:00\n",
+      "  Sanity-checking and returning\n",
+      "Building samples\n",
+      "Reserving space for 12 sample suffixes\n",
+      "Generating random suffixes\n",
+      "QSorting 12 sample offsets, eliminating duplicates\n",
+      "QSorting sample offsets, eliminating duplicates time: 00:00:00\n",
+      "Multikey QSorting 12 samples\n",
+      "  (Using difference cover)\n",
+      "  Multikey QSorting samples time: 00:00:00\n",
+      "Calculating bucket sizes\n",
+      "Splitting and merging\n",
+      "  Splitting and merging time: 00:00:00\n",
+      "Avg bucket size: 33136 (target: 6212)\n",
+      "Converting suffix-array elements to index image\n",
+      "Allocating ftab, absorbFtab\n",
+      "Entering Ebwt loop\n",
+      "Getting block 1 of 1\n",
+      "  No samples; assembling all-inclusive block\n",
+      "  Sorting block of length 33136 for bucket 1\n",
+      "  (Using difference cover)\n",
+      "  Sorting block time: 00:00:00\n",
+      "Returning block of 33137 for bucket 1\n",
+      "Exited Ebwt loop\n",
+      "fchr[A]: 0\n",
+      "fchr[C]: 10248\n",
+      "fchr[G]: 20610\n",
+      "fchr[T]: 24948\n",
+      "fchr[$]: 33136\n",
+      "Exiting Ebwt::buildToDisk()\n",
+      "Returning from initFromVector\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.2.bt2\n",
+      "Re-opening _in1 and _in2 as input streams\n",
+      "Returning from Ebwt constructor\n",
+      "Headers:\n",
+      "    len: 33136\n",
+      "    bwtLen: 33137\n",
+      "    sz: 8284\n",
+      "    bwtSz: 8285\n",
+      "    lineRate: 6\n",
+      "    offRate: 4\n",
+      "    offMask: 0xfffffff0\n",
+      "    ftabChars: 10\n",
+      "    eftabLen: 20\n",
+      "    eftabSz: 80\n",
+      "    ftabLen: 1048577\n",
+      "    ftabSz: 4194308\n",
+      "    offsLen: 2072\n",
+      "    offsSz: 8288\n",
+      "    lineSz: 64\n",
+      "    sideSz: 64\n",
+      "    sideBwtSz: 48\n",
+      "    sideBwtLen: 192\n",
+      "    numSides: 173\n",
+      "    numLines: 173\n",
+      "    ebwtTotLen: 11072\n",
+      "    ebwtTotSz: 11072\n",
+      "    color: 0\n",
+      "    reverse: 1\n",
+      "Total time for backward call to driver() for mirror index: 00:00:01\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 17711;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.001GB\n", + "\n", + "\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (17713)\n", + "
\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=17713)\n",
+      "Warning: couldn't add memory use for process: 17713\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 17713;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", + "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", + "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", + "\n", + "### Pipeline completed. Epilogue\n", + "* Elapsed time (this run): 0:00:00\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.0006 GB\n", + "* Pipeline completed time: 2020-10-30 10:53:07\n", + "Finished building 'bowtie2_index' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" + ] + } + ], "source": [ "!refgenie build rCRSd/bowtie2_index -c refgenie.yaml" ] @@ -498,9 +772,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Local refgenie assets \u001b[0m\r\n", + "\u001b[3m Server subscriptions: http://rg.databio.org:82 \u001b[0m\r\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", + "│ mouse_chrM2x │ fasta │\r\n", + "│ rCRSd │ fasta, bowtie2_index │\r\n", + "└───────────────────────────┴────────────────────────────────────────┘\r\n" + ] + } + ], "source": [ "!refgenie list -c refgenie.yaml" ] @@ -514,9 +803,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4eb430296bc02ed7e4006624f1d5ac53\r\n" + ] + } + ], "source": [ "!refgenie id rCRSd/fasta -c refgenie.yaml" ] @@ -530,9 
+827,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'3.6.5'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from platform import python_version \n", "python_version()" @@ -540,9 +848,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\r\n" + ] + } + ], "source": [ "!refgenie --version" ] From 6e1e92fb925938a1ae0172d14335c3c99ef6308a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 11:33:31 -0400 Subject: [PATCH 075/110] update usage --- docs/usage.md | 372 +++++++++++++++++++++++-------------------- update-usage-docs.sh | 6 +- 2 files changed, 205 insertions(+), 173 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index c70861ac..d5d3761f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,15 +2,15 @@ ## `refgenie --help` ```console -version: 0.9.3 +version: 0.10.0-dev | refgenconf 0.10.0-dev usage: refgenie [-h] [--version] [--silent] [--verbosity V] [--logdev] - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe} + {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} ... refgenie - reference genome asset manager positional arguments: - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe} + {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} init Initialize a genome configuration. list List available local assets. listr List available remote assets. @@ -24,6 +24,9 @@ positional arguments: id Return the asset digest. subscribe Add a refgenieserver URL to the config. 
unsubscribe Remove a refgenieserver URL from the config. + alias Interact with aliases. + compare Compare two genomes. + upgrade Upgrade config. This will alter the files on disk. optional arguments: -h, --help show this help message and exit @@ -37,311 +40,340 @@ https://refgenie.databio.org ## `refgenie init --help` ```console -usage: refgenie init [-h] -c C [-s GENOME_SERVER [GENOME_SERVER ...]] +usage: refgenie init [-h] -c C [--skip-read-lock] [-s GENOME_SERVER [GENOME_SERVER ...]] [-f GENOME_FOLDER] [-a GENOME_ARCHIVE_FOLDER] - [-b GENOME_ARCHIVE_CONFIG] [-u REMOTE_URL_BASE] - [-j SETTINGS_JSON] + [-b GENOME_ARCHIVE_CONFIG] [-u REMOTE_URL_BASE] [-j SETTINGS_JSON] Initialize a genome configuration. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - URL(s) to use for the genome_servers attribute in - config file. Default: http://refgenomes.databio.org. + URL(s) to use for the genome_servers attribute in + config file. Default: + http://refgenomes.databio.org. -f GENOME_FOLDER, --genome-folder GENOME_FOLDER - Absolute path to parent folder refgenie-managed - assets. + Absolute path to parent folder refgenie-managed + assets. -a GENOME_ARCHIVE_FOLDER, --genome-archive-folder GENOME_ARCHIVE_FOLDER - Absolute path to parent archive folder refgenie- - managed assets; used by refgenieserver. + Absolute path to parent archive folder refgenie- + managed assets; used by refgenieserver. 
-b GENOME_ARCHIVE_CONFIG, --genome-archive-config GENOME_ARCHIVE_CONFIG - Absolute path to desired archive config file; used by - refgenieserver. + Absolute path to desired archive config file; used + by refgenieserver. -u REMOTE_URL_BASE, --remote-url-base REMOTE_URL_BASE - URL to use as an alternative, remote archive location; - used by refgenieserver. + URL to use as an alternative, remote archive + location; used by refgenieserver. -j SETTINGS_JSON, --settings-json SETTINGS_JSON - Absolute path to a JSON file with the key value pairs - to inialize the configuration file with. Overwritten - by itemized specifications. + Absolute path to a JSON file with the key value + pairs to inialize the configuration file with. + Overwritten by itemized specifications. ``` ## `refgenie list --help` ```console -usage: refgenie list [-h] [-c C] [-g [GENOME [GENOME ...]]] +usage: refgenie list [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] List available local assets. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g [GENOME [GENOME ...]], --genome [GENOME [GENOME ...]] - Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. ``` ## `refgenie listr --help` ```console -usage: refgenie listr [-h] [-c C] [-g [GENOME [GENOME ...]]] +usage: refgenie listr [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] List available remote assets. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. 
- -g [GENOME [GENOME ...]], --genome [GENOME [GENOME ...]] - Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. ``` ## `refgenie pull --help` ```console -usage: refgenie pull [-h] [-c C] [-g G] [--no-overwrite | --force-overwrite] - [--no-large | --pull-large] [--size-cutoff S] [-b] +usage: refgenie pull [-h] [-c C] [--skip-read-lock] [-g G] + [--no-overwrite | --force-overwrite] [--no-large | --pull-large] + [--size-cutoff S] [-b] asset-registry-paths [asset-registry-paths ...] Download assets. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. Prompt handling: These flags configure the pull prompt responses. - --no-overwrite Do not overwrite if asset exists. - --force-overwrite Overwrite if asset exists. - --no-large Do not pull archives over 5GB. - --pull-large Pull any archive, regardless of its size. 
- --size-cutoff S Maximum archive file size to download with no - confirmation required (in GB, default: 10) - -b, --batch Use batch mode: pull large archives, do no overwrite + --no-overwrite Do not overwrite if asset exists. + --force-overwrite Overwrite if asset exists. + --no-large Do not pull archives over 5GB. + --pull-large Pull any archive, regardless of its size. + --size-cutoff S Maximum archive file size to download with no confirmation + required (in GB, default: 10) + -b, --batch Use batch mode: pull large archives, do no overwrite ``` ## `refgenie build --help` ```console -usage: refgenie build [-h] [-c C] [-R] [-C CONFIG_FILE] [-N] +usage: refgenie build [-h] [-c C] [--skip-read-lock] [-R] [-C CONFIG_FILE] [-N] [--tag-description TAG_DESCRIPTION] [--genome-description GENOME_DESCRIPTION] [-d] - [--assets ASSETS [ASSETS ...]] - [--files FILES [FILES ...]] - [--params PARAMS [PARAMS ...]] - [-v VOLUMES [VOLUMES ...]] [-o OUTFOLDER] [-q] - [-r RECIPE] [-g G] + [--assets ASSETS [ASSETS ...]] [--files FILES [FILES ...]] + [--params PARAMS [PARAMS ...]] [-v VOLUMES [VOLUMES ...]] + [-o OUTFOLDER] [-q] [-r RECIPE] [-g G] asset-registry-paths [asset-registry-paths ...] Build genome assets. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify + assets (e.g. hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -R, --recover Overwrite locks to recover from previous failed run - -C CONFIG_FILE, --config CONFIG_FILE - Pipeline configuration file (YAML). Relative paths are - with respect to the pipeline script. - -N, --new-start Overwrite all results to start a fresh run - --tag-description TAG_DESCRIPTION - Add tag level description (e.g. 
built with version - 0.3.2). + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -R, --recover Overwrite locks to recover from previous failed + run + -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths + are with respect to the pipeline script. + -N, --new-start Overwrite all results to start a fresh run + --tag-description TAG_DESCRIPTION Add tag level description (e.g. built with version + 0.3.2). --genome-description GENOME_DESCRIPTION - Add genome level description (e.g. The mouse - mitochondrial genome, released in Dec 2013). - -d, --docker Run all commands in the refgenie docker container. - --assets ASSETS [ASSETS ...] - Override the default genome, asset and tag of the - parents (e.g. fasta=hg38/fasta:default - gtf=mm10/gencode_gtf:default). - --files FILES [FILES ...] - Provide paths to the required files (e.g. - fasta=/path/to/file.fa.gz). - --params PARAMS [PARAMS ...] - Provide required parameter values (e.g. - param1=value1). + Add genome level description (e.g. The mouse + mitochondrial genome, released in Dec 2013). + -d, --docker Run all commands in the refgenie docker container. + --assets ASSETS [ASSETS ...] Override the default genome, asset and tag of the + parents (e.g. fasta=hg38/fasta:default + gtf=mm10/gencode_gtf:default). + --files FILES [FILES ...] Provide paths to the required files (e.g. + fasta=/path/to/file.fa.gz). + --params PARAMS [PARAMS ...] Provide required parameter values (e.g. + param1=value1). -v VOLUMES [VOLUMES ...], --volumes VOLUMES [VOLUMES ...] - If using docker, also mount these folders as volumes. - -o OUTFOLDER, --outfolder OUTFOLDER - Override the default path to genomes folder, which is - the genome_folder attribute in the genome - configuration file. 
- -q, --requirements Show the build requirements for the specified asset - and exit. - -r RECIPE, --recipe RECIPE - Provide a recipe to use. - -g G, --genome G Reference assembly ID, e.g. mm10. + If using docker, also mount these folders as + volumes. + -o OUTFOLDER, --outfolder OUTFOLDER Override the default path to genomes folder, which + is the genome_folder attribute in the genome + configuration file. + -q, --requirements Show the build requirements for the specified + asset and exit. + -r RECIPE, --recipe RECIPE Provide a recipe to use. + -g G, --genome G Reference assembly ID, e.g. mm10. ``` ## `refgenie seek --help` ```console -usage: refgenie seek [-h] [-c C] [-g G] [-e] +usage: refgenie seek [-h] [-c C] [--skip-read-lock] [-g G] [-e] asset-registry-paths [asset-registry-paths ...] Get the path to a local asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag or - hg38/fasta.fai:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag or hg38/fasta.fai:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -e, --check-exists Whether the returned asset path should be checked for - existence on disk. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -e, --check-exists Whether the returned asset path should be checked for existence + on disk. 
``` ## `refgenie add --help` ```console -usage: refgenie add [-h] [-c C] [-g G] [-f] -p P [-s S] +usage: refgenie add [-h] [-c C] [--skip-read-lock] [-g G] [-f] -p P [-s S] asset-registry-paths [asset-registry-paths ...] Add local asset to the config file. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -f, --force Do not prompt before action, approve upfront. - -p P, --path P Relative local path to asset. - -s S, --seek-keys S String representation of a JSON object with seek_keys, - e.g. '{"seek_key1": "file.txt"}') + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. + -p P, --path P Relative local path to asset. + -s S, --seek-keys S String representation of a JSON object with seek_keys, e.g. + '{"seek_key1": "file.txt"}' ``` ## `refgenie remove --help` ```console -usage: refgenie remove [-h] [-c C] [-g G] [-f] +usage: refgenie remove [-h] [-c C] [--skip-read-lock] [-g G] [-f] [-a] asset-registry-paths [asset-registry-paths ...] Remove a local asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). 
optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -f, --force Do not prompt before action, approve upfront. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. + -a, --aliases Remove the genome alias if last asset for that genome is + removed. ``` ## `refgenie getseq --help` ```console -usage: refgenie getseq [-h] [-c C] -g G -l LOCUS +usage: refgenie getseq [-h] [-c C] [--skip-read-lock] -g G -l LOCUS Get sequences from a genome. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -l LOCUS, --locus LOCUS - Coordinates of desired sequence; e.g. - 'chr1:50000-50200'. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -l LOCUS, --locus LOCUS Coordinates of desired sequence; e.g. 'chr1:50000-50200'. ``` ## `refgenie tag --help` ```console -usage: refgenie tag [-h] [-c C] [-g G] (-t TAG | -d) +usage: refgenie tag [-h] [-c C] [--skip-read-lock] [-g G] (-t TAG | -d) asset-registry-paths [asset-registry-paths ...] Tag an asset. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). 
+ asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. - -t TAG, --tag TAG Tag to assign to an asset. - -d, --default Set the selected asset tag as the default one. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -t TAG, --tag TAG Tag to assign to an asset. + -d, --default Set the selected asset tag as the default one. ``` ## `refgenie id --help` ```console -usage: refgenie id [-h] [-c C] [-g G] +usage: refgenie id [-h] [-c C] [--skip-read-lock] [-g G] asset-registry-paths [asset-registry-paths ...] Return the asset digest. positional arguments: - asset-registry-paths One or more registry path strings that identify assets - (e.g. hg38/fasta or hg38/fasta:tag). + asset-registry-paths One or more registry path strings that identify assets (e.g. + hg38/fasta or hg38/fasta:tag). optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -g G, --genome G Reference assembly ID, e.g. mm10. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -g G, --genome G Reference assembly ID, e.g. mm10. 
``` ## `refgenie subscribe --help` ```console -usage: refgenie subscribe [-h] [-c C] [-r] -s GENOME_SERVER +usage: refgenie subscribe [-h] [-c C] [--skip-read-lock] [-r] -s GENOME_SERVER [GENOME_SERVER ...] Add a refgenieserver URL to the config. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. - -r, --reset Overwrite the current list of server URLs. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -r, --reset Overwrite the current list of server URLs. -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - One or more URLs to add to the genome_servers - attribute in config file. + One or more URLs to add to the genome_servers + attribute in config file. ``` ## `refgenie unsubscribe --help` ```console -usage: refgenie unsubscribe [-h] [-c C] -s GENOME_SERVER [GENOME_SERVER ...] +usage: refgenie unsubscribe [-h] [-c C] [--skip-read-lock] -s GENOME_SERVER + [GENOME_SERVER ...] Remove a refgenieserver URL from the config. optional arguments: - -h, --help show this help message and exit - -c C, --genome-config C - Path to local genome configuration file. Optional if - REFGENIE environment variable is set. + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] - One or more URLs to remove from the genome_servers - attribute in config file. + One or more URLs to remove from the genome_servers + attribute in config file. 
+``` + +## `refgenie alias --help` +```console +usage: refgenie alias [-h] {remove,set,get} ... + +Interact with aliases. + +positional arguments: + {remove,set,get} + remove Remove aliases. + set Set aliases. + get Get aliases. + +optional arguments: + -h, --help show this help message and exit +``` + +## `refgenie upgrade --help` +```console +usage: refgenie upgrade [-h] [-c C] [--skip-read-lock] -v V [-f] + +Upgrade config. This will alter the files on disk. + +optional arguments: + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -v V, --target-version V Target config version for the upgrade. + -f, --force Do not prompt before action, approve upfront. ``` diff --git a/update-usage-docs.sh b/update-usage-docs.sh index 4aecfd4a..95d5324c 100755 --- a/update-usage-docs.sh +++ b/update-usage-docs.sh @@ -2,15 +2,15 @@ cp docs/usage.template usage.template #looper --help > USAGE.temp 2>&1 -for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help"; do +for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do echo $cmd - echo -e "## \`refgenie $cmd\`" > USAGE_header.temp + echo "## \`refgenie $cmd\`" > USAGE_header.temp refgenie $cmd --help > USAGE.temp 2>&1 # sed -i 's/^/\t/' USAGE.temp sed -i.bak '1s;^;\`\`\`console\ ;' USAGE.temp # sed -i '1s/^/\n\`\`\`console\n/' USAGE.temp - echo -e "\`\`\`\n" >> USAGE.temp + echo "\`\`\`\n" >> USAGE.temp #sed -i -e "/\`looper $cmd\`/r USAGE.temp" -e '$G' usage.template # for -in place inserts cat 
USAGE_header.temp USAGE.temp >> usage.template # to append to the end done From f4a1497ff15a070cfd12c06eef62a4dc3ddd4af7 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 14:14:33 -0400 Subject: [PATCH 076/110] add aliases documentation --- docs/autodoc_build/refgenconf.md | 16 +- docs_jupyter/aliases.ipynb | 300 +++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 309 insertions(+), 8 deletions(-) create mode 100644 docs_jupyter/aliases.ipynb diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 010b24d1..0fda612a 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -374,7 +374,7 @@ Get path to the Annotated Sequence Digests JSON file for a given genome. Note th ```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7f80b6aa2a60>) +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fe3c32a29d8>) ``` Get a rich.Table object representing assets available locally @@ -495,7 +495,7 @@ List locally available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f80b6aa5598>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fe3c32a3510>) ``` List genomes and assets available remotely. @@ -692,7 +692,7 @@ List assemblies for which a particular asset is available. 
```python -def listr(self, genome=None, order=None, get_url= at 0x7f80b6aa56a8>, as_str=False) +def listr(self, genome=None, order=None, get_url= at 0x7fe3c32a3620>, as_str=False) ``` List genomes and assets available remotely on all servers the object subscribes to @@ -723,7 +723,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7f80b6aa5950>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fe3c32a38c8>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -830,7 +830,7 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7f80b6aa50d0>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fe3c32a3048>) ``` Seek path to a specified genome-asset-tag alias @@ -861,7 +861,7 @@ Seek path to a specified genome-asset-tag alias ```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f80b6aa51e0>) +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fe3c32a3158>) ``` Seek path to a specified genome-asset-tag @@ -907,7 +907,7 @@ Point to the selected tag by default ```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7f80b6aa5bf8>) +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, 
get_json_url= at 0x7fe3c32a3b70>) ``` Assign a human-readable alias to a genome identifier. @@ -1187,7 +1187,7 @@ Get path to genome configuration file. ```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7f80b6aa2510>, link_fun= at 0x7f80b6aa6e18>) +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fe3c32a2488>, link_fun= at 0x7fe3c32a4d90>) ``` Upgrade the config to a selected target version. diff --git a/docs_jupyter/aliases.ipynb b/docs_jupyter/aliases.ipynb new file mode 100644 index 00000000..ca5de9cb --- /dev/null +++ b/docs_jupyter/aliases.ipynb @@ -0,0 +1,300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Genome aliases\n", + "\n", + "TLDR; **The genome alias system in refgenie allows users to refer to assets with arbitrary strings managed with `refgenie alias` command.**\n", + "\n", + "## Motivation\n", + "\n", + "Many systems rely on human-readable identifiers of genomes, such as \"hg38\". However, two users may refer to different things with the same identifier, such as the many slight variations of the *hg38* genome assembly. Such identifier mismatches lead to compatibility issues that incur the wrath of bioinformaticians everywhere. A step toward solving this problem is to use unique identifiers that unambiguously identify a particular assembly, such as those provided by the NCBI Assembly database; however, this approach relies on a central authority, and therefore can not apply to custom genomes or custom assets. Besides, human-readable identifiers persist because there's something simple and satisfying about referring to a genome or piece of data with a simple string that makes some sense and is easy to remember, like *hg38*. 
\n", + "\n", + "## Solutions\n", + "\n", + "### Sequence-derived identifiers\n", + "\n", + "Refgenie’s approach extends the [refget](http://samtools.github.io/hts-specs/refget.html) algorithm by GA4GH, introduced in 2019 to *collections of annotated sequences*. This means that the unique sequence-derived genome identifier calculated by refgenie captures not only sequence content, but also related metadata like sequence names and length. So, instead of referring to human genome as, e.g. \"hg38\" refgenie unambiguously identifies it as `58de7f33a36ccd9d6e3b1b3afe6b9f37cd5b2867bbfb929a`. \n", + "\n", + "#### Genome namespace initialization\n", + "\n", + "The genome digest is calculated based on a FASTA file once the genome namespace is first created. This can happen when the `fasta` asset is pulled or built.\n", + "\n", + "To start, initialize an empty refgenie configuration file from the shell and subscribe to the desired asset server:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialized genome configuration file: /Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml\n", + "Created directories:\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/data\n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias\n" + ] + } + ], + "source": [ + "export REFGENIE=$(pwd)/refgenie.yaml\n", + "refgenie init -c $REFGENIE -s http://rg.databio.org:82" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's pull a `fasta` asset, which is one way to initialize a genome:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No local digest for genome alias: rCRSd\n", + "Setting 'rCRSd' identity with server: http://rg.databio.org:82/v3/alias/genome_digest/rCRSd\n", + "Determined server digest for local genome alias 
(rCRSd): 511fb1178275e7d529560d53b949dba40815f195623bce8e\n", + "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", + "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive\n", + "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8… 107.3-:\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", + "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", + "Initializing genome: rCRSd\n", + "Loaded AnnotatedSequenceDigestList (1 sequences)\n", + "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/fasta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following the `refgenie pull` command logs we notice that multiple steps happened:\n", + "1. refgenie used the human-readable genome name from the `refgenie pull` call (`rCRSd`) to query the server for any digest associated with it\n", + "2. refgenie set the digest it got back from the server as the genome identifier and set the human-readable genome name as an alias\n", + "3. 
refgenie used the genome identifier (not the user-specified name) to query the server for the `fasta` asset\n", + "\n", + "From now on, refgenie identifies this genome by its unique sequence-derived digest." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Genome aliases\n", + "\n", + "To make the user's life easier, the genome alias system in refgenie allows setting arbitrary genome aliases that can then be used to refer to a genome. Users can interact with genome aliases using the `refgenie alias` command:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: refgenie alias [-h] {remove,set,get} ...\n", + "\n", + "Interact with aliases.\n", + "\n", + "positional arguments:\n", + " {remove,set,get}\n", + " remove Remove aliases.\n", + " set Set aliases.\n", + " get Get aliases.\n", + "\n", + "optional arguments:\n", + " -h, --help show this help message and exit\n" + ] + } + ], + "source": [ + "refgenie alias --help" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set aliases\n", + "\n", + "To set an alias \"mito\" for the genome identified by digest `511fb1178275e7d529560d53b949dba40815f195623bce8e` one needs to issue the command below:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: ['mito'])\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mito\n" + ] + } + ], + "source": [ + "refgenie alias set --aliases mito --digest 511fb1178275e7d529560d53b949dba40815f195623bce8e" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get aliases\n", + "\n", + "To see the entire aliases collection managed by refgenie one needs to issue the command below:\n" + ] + }, + { + "cell_type": "code", +
"execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Genome aliases \u001b[0m\n", + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1malias \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", + "│ 511fb1178275e7d529560d53b949dba40815f195623bce8e │ rCRSd, mito │\n", + "└──────────────────────────────────────────────────┴─────────────┘\n" + ] + } + ], + "source": [ + "refgenie alias get" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `alias` and `data` directories\n", + "\n", + "Refgenie stores asset data in two directories: `alias` and `data`. The `data` directory consists of the actual asset files, which are built or pulled from asset servers. The files in this directory are named using the digests, which helps refgenie to unambiguously identify genomes. The `alias` directory holds symbolic links to asset data in the `data` directory. **This way users do not need to be aware of the digest-named files at all and there is no waste of disk space due to symbolic links**.
\n", + "\n", + "Here's a general view of the contents of both directories:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34malias\u001b[00m\n", + "├── \u001b[01;34mmito\u001b[00m\n", + "│   └── \u001b[01;34mfasta\u001b[00m\n", + "│   └── \u001b[01;34mdefault\u001b[00m\n", + "└── \u001b[01;34mrCRSd\u001b[00m\n", + " └── \u001b[01;34mfasta\u001b[00m\n", + " └── \u001b[01;34mdefault\u001b[00m\n", + "\n", + "6 directories\n", + "\u001b[01;34mdata\u001b[00m\n", + "└── \u001b[01;34m511fb1178275e7d529560d53b949dba40815f195623bce8e\u001b[00m\n", + " └── \u001b[01;34mfasta\u001b[00m\n", + " └── \u001b[01;34mdefault\u001b[00m\n", + "\n", + "3 directories\n" + ] + } + ], + "source": [ + "tree alias -d \n", + "tree data -d " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the `alias` directory holds *both* of the defined aliases. Let's take a closer look at one of them" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34malias/rCRSd/fasta\u001b[00m\n", + "└── \u001b[01;34mdefault\u001b[00m\n", + " ├── \u001b[01;36mrCRSd.chrom.sizes\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes\n", + " ├── \u001b[01;36mrCRSd.fa\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n", + " └── \u001b[01;36mrCRSd.fa.fai\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai\n", + "\n", + "1 directory, 3 files\n" + ] + } + ], + "source": [ + "tree alias/rCRSd/fasta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "This explicitly shows that the files inside `alias/rCRSd/fasta/default` are in fact symbolic links that point to the actual asset files in the `data` directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Bash", + "language": "bash", + "name": "bash" + }, + "language_info": { + "codemirror_mode": "shell", + "file_extension": ".sh", + "mimetype": "text/x-sh", + "name": "bash" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mkdocs.yml b/mkdocs.yml index f75f8027..6d81fd6f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -18,6 +18,7 @@ nav: - Add custom assets: custom_assets.md - Retrieve paths to assets: seek.md - Use asset tags: tag.md + - Use aliases: aliases.md - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md - Use refgenie with iGenomes: igenomes.md From 01c2b7963b578078e11ba35d15ee73e9e63e47af Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 15:37:13 -0400 Subject: [PATCH 077/110] update changelog --- docs/changelog.md | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 0061a480..6918b537 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,24 @@ # Changelog This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.10.0] - unreleased + +**After updating to this version your configuration file and genome assets will not be compatible with the software.
Please refer to the [upgrade tutorial](config_upgrade_03_to_04.md) for instructions on how to migrate the config between versions.** + +## Changed + +- instead of using human-readable names as genome identifiers refgenie uses sequence-derived digests in the config +- asset data moved to `data` directory +- asset files are now named after genome digests +- refgenieserver API v3 is now used for remote assets retrieval +- improved visual interface in `list`, `listr` and `pull` subcommands + +## Added + +- `data` and `alias` directories in genome directory that are used to store asset and aliases data, respectively +- `refgenie alias` command for genome aliases management +- `refgenie upgrade` command for config format upgrades +- `refgenie compare` command for genome compatibility determination ## [0.9.3] - 2020-07-29 @@ -15,8 +33,6 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## Fixed - `refgenie add` issues -- added assets are no longer imported to the `genome_folder`; [#180](https://github.com/refgenie/refgenie/issues/180) -## [0.10.0] - unreleased - ## [0.9.2] - 2020-07-01 ## Changed From aea929addc5d30b764befc1d7b0312b818701c8d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 30 Oct 2020 16:03:17 -0400 Subject: [PATCH 078/110] add refgenie compare docs --- docs/compare.md | 37 +++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 38 insertions(+) create mode 100644 docs/compare.md diff --git a/docs/compare.md b/docs/compare.md new file mode 100644 index 00000000..b0a22ed1 --- /dev/null +++ b/docs/compare.md @@ -0,0 +1,37 @@ +# Genomes compatibility + +## Motivation + +Many genomic analyses require less stringent comparison: simple compatibility between assets that are not necessarily identical. +For example, if we wanted to annotate genomic regions based on aligned BAM file using feature annotations we would need the reads to share just the coordinate system (the sequence lengths and names). 
Importantly, it does not require sequence identity at all. In this case, we would like to confirm that the given `bowtie2_index` asset is at least compatible with the coordinate system of feature annotation file. + +To sum up, we need a more detailed comparison that may ignore sequences but allow to compare other aspects of the reference genome. + +## Solution + +Refgenie facilitates such fine-grained comparison of genomes via `refgenie compare` command. A useful "byproduct" of genome initialization described in [Use aliases how-to guide](aliases.md) is a JSON file with annotated sequence digests, that are required to compare FASTA file contents. + +## Usage + +In order to compare two initialized genomes one needs to issue the following command + +```console +refgenie compare hg38 hg38_plus +``` + +The result is a set of informative flags that determine the level of compatibility of the genomes, for example: + +```console +Binary: 0b1100111010101 + +CONTENT_ALL_A_IN_B # sequence content +LENGTHS_ALL_A_IN_B # sequence lengths +NAMES_ALL_A_IN_B # sequence names +TOPO_ALL_A_IN_B # sequence topology +TOPO_ALL_B_IN_A +CONTENT_ANY_SHARED +CONTENT_A_ORDER # sequence order +CONTENT_B_ORDER +``` + +Based on the output above we can conclude that genome `hg38_plus` is a superset of `hg38`. 
diff --git a/mkdocs.yml b/mkdocs.yml index 6d81fd6f..82a1988e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,6 +19,7 @@ nav: - Retrieve paths to assets: seek.md - Use asset tags: tag.md - Use aliases: aliases.md + - Compare genomes: compare.md - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md - Use refgenie with iGenomes: igenomes.md From f20e07597beba32fbd7834389b3951f0ef9dec5e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 08:33:33 -0500 Subject: [PATCH 079/110] recipe listing --- refgenie/refgenie.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 37535ba5..0d26f8e7 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -199,6 +199,9 @@ def add_subparser(cmd, msg, subparsers): help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ").")) + sps[LIST_LOCAL_CMD].add_argument("-r", "--recipes", action="store_true", + help="List available recipes.") + for cmd in [REMOVE_CMD, INSERT_CMD]: sps[cmd].add_argument( "-f", "--force", action="store_true", @@ -841,8 +844,11 @@ def main(): "Could not list assets from the following servers: {}". 
format(bad_servers) ) - else: # Only check local assets once - console.print(rgc.get_asset_table(genomes=args.genome)) + else: + if args.recipes: + print(", ".join(sorted(list(asset_build_packages.keys())))) + else: + console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, From 43625a4d3419733e9f4c61ab8964db33efcf8737 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 08:44:11 -0500 Subject: [PATCH 080/110] update usage --- docs/usage.md | 3 ++- update-usage-docs.sh | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index d5d3761f..0b64f018 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -76,7 +76,7 @@ optional arguments: ## `refgenie list --help` ```console -usage: refgenie list [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] +usage: refgenie list [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] [-r] List available local assets. @@ -87,6 +87,7 @@ optional arguments: --skip-read-lock Whether the config file should not be locked for reading -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. + -r, --recipes List available recipes. 
``` ## `refgenie listr --help` diff --git a/update-usage-docs.sh b/update-usage-docs.sh index 95d5324c..0841bc4b 100755 --- a/update-usage-docs.sh +++ b/update-usage-docs.sh @@ -1,16 +1,15 @@ #!/bin/bash cp docs/usage.template usage.template #looper --help > USAGE.temp 2>&1 - for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do echo $cmd - echo "## \`refgenie $cmd\`" > USAGE_header.temp + echo -e "## \`refgenie $cmd\`" > USAGE_header.temp refgenie $cmd --help > USAGE.temp 2>&1 # sed -i 's/^/\t/' USAGE.temp sed -i.bak '1s;^;\`\`\`console\ ;' USAGE.temp # sed -i '1s/^/\n\`\`\`console\n/' USAGE.temp - echo "\`\`\`\n" >> USAGE.temp + echo -e "\`\`\`\n" >> USAGE.temp #sed -i -e "/\`looper $cmd\`/r USAGE.temp" -e '$G' usage.template # for -in place inserts cat USAGE_header.temp USAGE.temp >> usage.template # to append to the end done From e4423ef27c8a1b7408962f1e02ab699004f5e28d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 08:50:12 -0500 Subject: [PATCH 081/110] update recipe listing in tutorial --- docs_jupyter/tutorial.ipynb | 115 ++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 63 deletions(-) diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index 2593a6c9..a3f5bee9 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -141,7 +141,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b0f4e24236b74ab9a5590e0f811cebd2", + "model_id": "7640cd78195940e7a21600a1313320f8", "version_major": 2, "version_minor": 0 }, @@ -256,16 +256,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2020-10-30 10:53:02-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", - "Resolving big.databio.org (big.databio.org)... 
128.143.245.182, 128.143.245.181\n", - "Connecting to big.databio.org (big.databio.org)|128.143.245.182|:80... connected.\n", + "--2020-11-02 08:49:28-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", + "Resolving big.databio.org (big.databio.org)... 128.143.245.181, 128.143.245.182\n", + "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 8399 (8.2K) [application/octet-stream]\n", "Saving to: ‘rCRSd.fa.gz’\n", "\n", - "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.005s \n", + "rCRSd.fa.gz 100%[===================>] 8.20K 10.3KB/s in 0.8s \n", "\n", - "2020-10-30 10:53:02 (1.65 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", + "2020-11-02 08:49:29 (10.3 KB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", "\n" ] } @@ -300,7 +300,7 @@ "* Compute host: MichalsMBP\n", "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/\n", - "* Pipeline started at: (10-30 10:53:03) elapsed: 0.0 _TIME_\n", + "* Pipeline started at: (11-02 08:49:30) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", @@ -338,49 +338,47 @@ "\n", "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` \n", "\n", - "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17678)\n", + "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (28209)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17678)\n",
-      "Warning: couldn't add memory use for process: 17678\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=28209)\n",
+      "Warning: couldn't add memory use for process: 28209\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17678;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", + " PID: 28209;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.0GB\n", "\n", "\n", - "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (17680)\n", + "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (28210)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17680)\n",
-      "Warning: couldn't add memory use for process: 17680\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=28210)\n",
+      "Warning: couldn't add memory use for process: 28210\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17680;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", + " PID: 28210;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", "\n", "\n", - "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (17681)\n", + "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (28211)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17681)\n",
-      "Warning: couldn't add memory use for process: 17681\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17681;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", + " PID: 28211;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.002GB\n", "\n", "\n", - "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (17682)\n", + "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (28212)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17682)\n",
-      "Warning: couldn't add memory use for process: 17682\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=28212)\n",
+      "Warning: couldn't add memory use for process: 28212\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17682;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", + " PID: 28212;\tCommand: cut;\tReturn code: 0;\tMemory used: 0.001GB\n", "\n", "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (17684)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (28214)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=17684)\n",
-      "Warning: couldn't add memory use for process: 17684\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=28214)\n",
+      "Warning: couldn't add memory use for process: 28214\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", - " PID: 17684;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", + " PID: 28214;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n", "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", @@ -388,11 +386,9 @@ "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:00\n", "* Total elapsed time (all runs): 0:00:00\n", - "* Peak memory (this run): 0 GB\n", - "* Pipeline completed time: 2020-10-30 10:53:03\n", - "Finished building 'fasta' asset\n", - "Created alias directories: \n", - " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" + "* Peak memory (this run): 0.0016 GB\n", + "* Pipeline completed time: 2020-11-02 08:49:30\n", + "Finished building 'fasta' asset\n" ] } ], @@ -501,7 +497,7 @@ "* Compute host: MichalsMBP\n", "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/\n", - "* Pipeline started at: (10-30 10:53:06) elapsed: 0.0 _TIME_\n", + "* Pipeline started at: (11-02 08:49:34) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", @@ -539,7 +535,7 @@ "\n", "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` \n", "\n", - "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (17711)\n", + "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (28246)\n", "
\n",
       "Settings:\n",
       "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.*.bt2\"\n",
@@ -725,34 +721,34 @@
       "    ebwtTotSz: 11072\n",
       "    color: 0\n",
       "    reverse: 1\n",
-      "Total time for backward call to driver() for mirror index: 00:00:01\n",
+      "Total time for backward call to driver() for mirror index: 00:00:00\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", - " PID: 17711;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.001GB\n", + "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.0GB. \n", + " PID: 28246;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.0GB\n", "\n", "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (17713)\n", - "
\n"
+      "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (28248)\n",
+      "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=28248)\n",
+      "Warning: couldn't add memory use for process: 28248\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", + " PID: 28248;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "psutil.ZombieProcess process still exists but it's a zombie (pid=17713)\n", - "Warning: couldn't add memory use for process: 17713\n", - "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", - " PID: 17713;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", - "\n", "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:00\n", - "* Total elapsed time (all runs): 0:00:00\n", - "* Peak memory (this run): 0.0006 GB\n", - "* Pipeline completed time: 2020-10-30 10:53:07\n", + "* Elapsed time (this run): 0:00:01\n", + "* Total elapsed time (all runs): 0:00:01\n", + "* Peak memory (this run): 0.0003 GB\n", + "* Pipeline completed time: 2020-11-02 08:49:34\n", "Finished building 'bowtie2_index' asset\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" @@ -779,19 +775,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[3m Local refgenie assets \u001b[0m\r\n", - "\u001b[3m Server subscriptions: http://rg.databio.org:82 \u001b[0m\r\n", - "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", - "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", - "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", - "│ mouse_chrM2x │ fasta │\r\n", - "│ rCRSd │ fasta, bowtie2_index │\r\n", - "└───────────────────────────┴────────────────────────────────────────┘\r\n" + "bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\r\n" ] } ], "source": [ - "!refgenie list -c refgenie.yaml" + "!refgenie list -c 
refgenie.yaml --recipes" ] }, { From f6a7846ebaf3b896e2f6766326ba43d0e5592fcb Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 08:51:43 -0500 Subject: [PATCH 082/110] update aliases notebook --- docs_jupyter/aliases.ipynb | 47 ++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/docs_jupyter/aliases.ipynb b/docs_jupyter/aliases.ipynb index ca5de9cb..45ed006f 100644 --- a/docs_jupyter/aliases.ipynb +++ b/docs_jupyter/aliases.ipynb @@ -69,7 +69,7 @@ "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive\n", - "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8… 107.3-:\n", + "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8.… 35.2-:\n", "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", @@ -94,7 +94,30 @@ "2. refgenie set the digest it got back from the server as the genome identifier and set the human-readable genome name as an alias\n", "3. 
refgenie used the genome idenfitier (not the user-specified name) to query the server for the `fasta` asset\n", "\n", - "From now on, the unique seqiue" + "From now on, the unique sequence-derived genome identifier will be used to query asset servers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/archive\n", + "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index:de… \u001b[35m1…\u001b[0m 128.0/117.\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/bowtie2_index__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/bowtie2_index__default.tgz\n", + "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" + ] + } + ], + "source": [ + "refgenie pull rCRSd/bowtie2_index" ] }, { @@ -109,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -146,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -174,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -207,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -216,19 +239,25 @@ "text": [ "\u001b[01;34malias\u001b[00m\n", "├── \u001b[01;34mmito\u001b[00m\n", + "│   ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + "│   │   └── 
\u001b[01;34mdefault\u001b[00m\n", "│   └── \u001b[01;34mfasta\u001b[00m\n", "│   └── \u001b[01;34mdefault\u001b[00m\n", "└── \u001b[01;34mrCRSd\u001b[00m\n", + " ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + " │   └── \u001b[01;34mdefault\u001b[00m\n", " └── \u001b[01;34mfasta\u001b[00m\n", " └── \u001b[01;34mdefault\u001b[00m\n", "\n", - "6 directories\n", + "10 directories\n", "\u001b[01;34mdata\u001b[00m\n", "└── \u001b[01;34m511fb1178275e7d529560d53b949dba40815f195623bce8e\u001b[00m\n", + " ├── \u001b[01;34mbowtie2_index\u001b[00m\n", + " │   └── \u001b[01;34mdefault\u001b[00m\n", " └── \u001b[01;34mfasta\u001b[00m\n", " └── \u001b[01;34mdefault\u001b[00m\n", "\n", - "3 directories\n" + "5 directories\n" ] } ], @@ -246,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { From ccc64a36dbad8ea634ac392d7bd281005a8dc2f4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 09:08:27 -0500 Subject: [PATCH 083/110] update refgenconf usage in pipelines --- docs/autodoc_build/refgenconf.md | 16 +++---- docs/refgenconf.md | 2 +- docs_jupyter/refgenconf_usage.ipynb | 74 +++++++++++++++++------------ 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 0fda612a..0445d538 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -374,7 +374,7 @@ Get path to the Annotated Sequence Digests JSON file for a given genome. Note th ```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fe3c32a29d8>) +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7f946329a9d8>) ``` Get a rich.Table object representing assets available locally @@ -495,7 +495,7 @@ List locally available reference genome IDs and assets by ID. 
```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fe3c32a3510>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f946329b510>) ``` List genomes and assets available remotely. @@ -692,7 +692,7 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x7fe3c32a3620>, as_str=False) +def listr(self, genome=None, order=None, get_url= at 0x7f946329b620>, as_str=False) ``` List genomes and assets available remotely on all servers the object subscribes to @@ -723,7 +723,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fe3c32a38c8>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7f946329b8c8>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -830,7 +830,7 @@ Runs all installed plugins for the specified hook. 
```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fe3c32a3048>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7f946329b048>) ``` Seek path to a specified genome-asset-tag alias @@ -861,7 +861,7 @@ Seek path to a specified genome-asset-tag alias ```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fe3c32a3158>) +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f946329b158>) ``` Seek path to a specified genome-asset-tag @@ -907,7 +907,7 @@ Point to the selected tag by default ```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fe3c32a3b70>) +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7f946329bb70>) ``` Assign a human-readable alias to a genome identifier. @@ -1187,7 +1187,7 @@ Get path to genome configuration file. ```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fe3c32a2488>, link_fun= at 0x7fe3c32a4d90>) +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7f946329a488>, link_fun= at 0x7f946329cd90>) ``` Upgrade the config to a selected target version. diff --git a/docs/refgenconf.md b/docs/refgenconf.md index bc90a2ea..2d346b93 100644 --- a/docs/refgenconf.md +++ b/docs/refgenconf.md @@ -38,6 +38,6 @@ bt2idx = rgc.seek(genome, "bowtie2_index") # run bowtie2... 
``` -This enables you to write python software that will work on any computing environment without having to worry about passing around brittle environment-specific file paths. See [this tutorial](/jupyter_docs/refgenconf_usage) for more comprehensive example of how to work with `refgenconf` as a tool developer. +This enables you to write python software that will work on any computing environment without having to worry about passing around brittle environment-specific file paths. See [this tutorial](/refgenconf_usage) for more comprehensive example of how to work with `refgenconf` as a tool developer. See the complete [refgenconf python API](/autodoc_build/refgenconf) for more details. diff --git a/docs_jupyter/refgenconf_usage.ipynb b/docs_jupyter/refgenconf_usage.ipynb index 2e00149f..fbe1c41d 100644 --- a/docs_jupyter/refgenconf_usage.ipynb +++ b/docs_jupyter/refgenconf_usage.ipynb @@ -23,12 +23,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ[\"REFGENIE\"] = \"./genomes.yaml\"\n", + "os.environ[\"REFGENIE\"] = \"./refgenie.yaml\"\n", "user_provided_cfg_path = None\n", "user_provided_genome = \"rCRSd\"" ] @@ -42,11 +42,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "from refgenconf import RefGenConf, select_genome_config, RefgenconfError, CFG_ENV_VARS, CFG_FOLDER_KEY" + "from refgenconf import RefGenConf, select_genome_config, RefgenconfError, CFG_ENV_VARS, CFG_FOLDER_KEY\n", + "from yacman import UndefinedAliasError" ] }, { @@ -58,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -74,14 +75,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "if not refgenie_cfg_path:\n", - " raise OSError(\"Could not determine path to a 
refgenie genome configuration file. \"\n", - " \"Use --rfg-config argument or set '{}' environment variable to provide it\".\n", - " format(CFG_ENV_VARS))" + " raise OSError(f\"Could not determine path to a refgenie genome configuration file.\"\n", + " f\"Use --rfg-config argument or set '{CFG_ENV_VARS}' environment variable to provide it\")" ] }, { @@ -93,25 +93,26 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "File '/Users/mstolarczyk/Uczelnia/UVA/code/refgenie/docs_jupyter/genomes.yaml' does not exist. Initializing refgenie genome configuration file.\n" + "File '/Users/mstolarczyk/code/refgenie/docs_jupyter/refgenie.yaml' does not exist. Initializing refgenie genome configuration file.\n" ] } ], "source": [ "if isinstance(refgenie_cfg_path, str) and os.path.exists(refgenie_cfg_path):\n", - " print(\"Reading refgenie genome configuration file from file: {}\".format(refgenie_cfg_path))\n", + " print(f\"Reading refgenie genome configuration file from file: {refgenie_cfg_path}\")\n", " rgc = RefGenConf(filepath=refgenie_cfg_path)\n", "else:\n", - " print(\"File '{}' does not exist. Initializing refgenie genome configuration file.\".format(refgenie_cfg_path))\n", + " print(f\"File '{refgenie_cfg_path}' does not exist. 
Initializing refgenie genome configuration file.\")\n", " rgc = RefGenConf(entries={CFG_FOLDER_KEY: os.path.dirname(refgenie_cfg_path)})\n", - " rgc.initialize_config_file(filepath=refgenie_cfg_path)" + " rgc.initialize_config_file(filepath=refgenie_cfg_path)\n", + " rgc.subscribe(urls=\"http://rg.databio.org:82\", reset=True) # subscribe to the desired server, if needed" ] }, { @@ -123,43 +124,56 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - " " + "Could not determine path to chrom.sizes asset, pulling\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Could not determine path to chrom.sizes asset, pulling\n", - "Determined path to fasta asset: /Users/mstolarczyk/Uczelnia/UVA/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a258a7dec75f4bbcb3f2973e0a3b9cc6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "\r" + "Determined path to fasta asset: /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa\n" ] } ], "source": [ "try:\n", - " fasta = rgc.get_asset(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", + " fasta = rgc.seek(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", " seek_key=\"fasta\")\n", - "except RefgenconfError:\n", + "except (RefgenconfError, UndefinedAliasError):\n", " print(\"Could not determine path to chrom.sizes asset, pulling\")\n", - " rgc.pull_asset(genome=user_provided_genome, asset=\"fasta\", tag=\"default\")\n", - " fasta = rgc.get_asset(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", + " 
rgc.pull(genome=user_provided_genome, asset=\"fasta\", tag=\"default\")\n", + " fasta = rgc.seek(genome_name=user_provided_genome, asset_name=\"fasta\", tag_name=\"default\",\n", " seek_key=\"fasta\")\n", - "print(\"Determined path to fasta asset: {}\".format(fasta))" + "print(f\"Determined path to fasta asset: {fasta}\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 107daae4de423dcb15d47e77306aff5f7a36e09b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 2 Nov 2020 09:23:28 -0500 Subject: [PATCH 084/110] update cfg docs --- docs/genome_config.md | 3 +-- docs_jupyter/aliases.ipynb | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/genome_config.md b/docs/genome_config.md index f7a40287..a8634f3f 100644 --- a/docs/genome_config.md +++ b/docs/genome_config.md @@ -21,7 +21,7 @@ Below is a CHANGELOG describing all changes introduced in configuration file ver ## Genome configuration file example -Here's how the config file works, in case you do need to edit some things by hand. Here's an example file: +Here's how the config file works, in case you do need to edit some things by hand. Here's an example file which manages fasta and bowtie2_index assets for hg38 genome. Keep in mind that some of the keys in this config are optional: ```yaml config_version: 0.4 @@ -79,7 +79,6 @@ genomes: - **genome_archive_folder**: Path to folder file asset archives config will be stored. - **remote_url_base**: Path/URL to prepend to served asset archives, if non-local ones are to be served -Note that for a fully operational config just `genome_folder`, `genome_server`, `genomes`, `assets`, `tags` and `seek_keys` keys are required. For genomes that are managed by `refgenie` (that is, they were built or pulled with `refgenie`), these asset attributes will be automatically populated. 
You can edit them and refgenie will respect your edits (unless you re-build or re-pull the asset, which will overwrite those fields). You can also add your own assets and `refgenie` won't touch them. For more info, see [using custom assets](custom_assets.md). diff --git a/docs_jupyter/aliases.ipynb b/docs_jupyter/aliases.ipynb index 45ed006f..be76bdac 100644 --- a/docs_jupyter/aliases.ipynb +++ b/docs_jupyter/aliases.ipynb @@ -69,7 +69,7 @@ "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive\n", - "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8.… 35.2-:\n", + "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8.… 60.3-:\n", "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", @@ -223,7 +223,7 @@ "source": [ "### `alias` and `data` directories\n", "\n", - "Refgenie stores asset data in two directories: `alias` and `data`. The `data` directory consists of the actual asset files, which are built or pulled from asset servers. The files in this directory are named using the digests, which helps refgenie to unambigously identify genomes. The `alias` holds symbolic links to asset data in `data` directory. **This way users do not need to be aware of the digest-named files at all and there is no wast of disk space due to symbolic links**. \n", + "Refgenie stores asset data in two directories: `alias` and `data`. 
The `data` directory consists of the actual asset files, which are built or pulled from asset servers. The files in this directory are named using the digests, which helps refgenie to unambiguously identify genomes. The `alias` holds symbolic links to asset data in `data` directory. **This way users do not need to be aware of the digest-named files at all and there is no waste of disk space due to symbolic links**. \n", "\n", "Here's a general view of the contents of both directories:" ] @@ -300,7 +300,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This explicitly shows that the files inside `alias/rCRSd/fasta/deafault` are in fact symbolic links that point to the actual asset files in `data` directory." + "This explicitly shows that the files inside `alias/rCRSd/fasta/default` are in fact symbolic links that point to the actual asset files in `data` directory." ] }, { From 7d37295f11b7f2a9247dcaf544eafa42c8e687ea Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 18 Nov 2020 16:21:30 -0500 Subject: [PATCH 085/110] add blacklist as buildable asset --- docs/available_assets.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/available_assets.md b/docs/available_assets.md index b102b285..0ff38fcc 100644 --- a/docs/available_assets.md +++ b/docs/available_assets.md @@ -45,6 +45,27 @@ refgenie build rCRS/fasta --files fasta=rCRS.fa.gz refgenie seek rCRS/fasta ``` +### blacklist + + required files: `--files blacklist=/path/to/blacklist_file` (*e.g.* [hg38-blacklist.v2.bed.gz](https://github.com/Boyle-Lab/Blacklist/tree/master/lists)) + required parameters: *none* + required asset: *none* + required software: *none* + +The `blacklist` asset represents regions that should be excluded from sequencing experiments. The ENCODE blacklist represents a comprehensive listing of these regions for several model organisms [^Amemiya2019]. 
+ +Example blacklist files: + +- [hg19 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg19-blacklist.v2.bed.gz) +- [hg38 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg38-blacklist.v2.bed.gz) +- [mm10 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/mm10-blacklist.v2.bed.gz) +- [dm6 blacklist](https://github.com/Boyle-Lab/Blacklist/blob/master/lists/dm6-blacklist.v2.bed.gz) + +``` +wget https://github.com/Boyle-Lab/Blacklist/raw/master/lists/hg38-blacklist.v2.bed.gz +refgenie build hg38/blacklist --files blacklist=hg38-blacklist.v2.bed.gz +``` + ### refgene_anno required files: `--files refgene=/path/to/refGene_file` (*e.g.* [refGene.txt.gz](http://varianttools.sourceforge.net/Annotation/RefGene)) @@ -267,3 +288,6 @@ The `feat_annotation` asset includes the following genomic feature annotations: ``` refgenie build test/feat_annotation ``` + + +[^Amemiya2019]: Amemiya HM, Kundaje A, Boyle AP. The ENCODE Blacklist: Identification of Problematic Regions of the Genome. *Sci Rep* 2019;9, 9354. 
doi:10.1038/s41598-019-45839-z \ No newline at end of file From bc22d8480e466b30a787ea9f30567d14f97aed96 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 17 Feb 2021 22:20:24 -0500 Subject: [PATCH 086/110] print help if no alias subcommand given; fix #221 --- refgenie/refgenie.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 0d26f8e7..2c498382 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -669,6 +669,11 @@ def main(): _LOGGER.error("No command given") sys.exit(1) + if args.command == ALIAS_CMD and not args.subcommand: + parser.print_help() + _LOGGER.error("No alias subcommand command given") + sys.exit(1) + gencfg = select_genome_config( filename=args.genome_config, check_exist=not args.command == INIT_CMD, on_missing=lambda fp: fp, strict_env=True) From fd96919a7a1aa3a07bc85c87dc9b066b5075e4ac Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 09:59:38 -0500 Subject: [PATCH 087/110] update cli testing workflow --- .github/workflows/test-refgenie-cli.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 4616deef..c6ef4a33 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -68,12 +68,6 @@ jobs: working-directory: ./genomes run: refgenie list -c g.yaml - - name: refgenie remove fasta_child - run: | - refgenie remove -c genomes/g.yaml t7/fasta_child -f - ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 - ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:default 1 # test if the entry was removed from the fasta children list - - name: refgenie build fasta run: refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe tests/data/recipe_parent.json @@ -126,15 +120,21 @@ jobs: - name: refgenie tag asset 
run: | - refgenie tag -c genomes/g.yaml t7_another/fasta:default -t new_tag + refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag ./tests/assert_in_file.sh genomes/g.yaml new_tag 0 - if [ -f `refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` ]; then - echo "`refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` exists." + if [ -f `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` ]; then + echo "`refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` exists." else - echo "Error: `refgenie seek t7_another/fasta:new_tag -c genomes/g.yaml` does not exist." + echo "Error: `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` does not exist." exit 1 fi + - name: refgenie remove fasta_child + run: | + refgenie remove -c genomes/g.yaml t7/fasta_child -f + ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 + ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list + - name: refgenie id run: | - ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta:new_tag` 0 + ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta_child:new_tag` 0 From 0a1462a0dee16b841714358c5167b8d9f54e6f0b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 10:30:54 -0500 Subject: [PATCH 088/110] add tag force flag --- refgenie/refgenie.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 2c498382..38766b42 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -255,6 +255,10 @@ def add_subparser(cmd, msg, subparsers): "-e", "--check-exists", required=False, action="store_true", help="Whether the returned asset path should be checked for existence on disk.") + sps[TAG_CMD].add_argument( + '-f', '--force', action="store_true", + help="Do not prompt before 
action, approve upfront.") + group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) group.add_argument( @@ -903,7 +907,7 @@ def main(): with rgc as r: r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) sys.exit(0) - rgc.tag(a["genome"], a["asset"], a["tag"], args.tag) + rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, args.force) elif args.command == ID_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, From 33eaaa870f2b98da54ef77b974afdb4af98e4546 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 10:31:35 -0500 Subject: [PATCH 089/110] use tag force in action --- .github/workflows/test-refgenie-cli.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index c6ef4a33..4d3fe19f 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -120,7 +120,7 @@ jobs: - name: refgenie tag asset run: | - refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag + refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag --force ./tests/assert_in_file.sh genomes/g.yaml new_tag 0 if [ -f `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` ]; then echo "`refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` exists." 
From 4fcb70a63c65ccbfe4f4af0e31c15e0ffe0d0d0a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 11:06:12 -0500 Subject: [PATCH 090/110] specify force arg explicitly --- .github/workflows/test-refgenie-cli.yml | 2 +- refgenie/refgenie.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 4d3fe19f..afdd3f89 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -120,7 +120,7 @@ jobs: - name: refgenie tag asset run: | - refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag --force + refgenie tag -c genomes/g.yaml t7_another/fasta_child:default -t new_tag -f ./tests/assert_in_file.sh genomes/g.yaml new_tag 0 if [ -f `refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` ]; then echo "`refgenie seek t7_another/fasta_child:new_tag -c genomes/g.yaml` exists." diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 38766b42..87a0bd0d 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -907,7 +907,7 @@ def main(): with rgc as r: r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) sys.exit(0) - rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, args.force) + rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) elif args.command == ID_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, From 719d331bb8f99ce7264f7b99f1afbf5770802e72 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 11:11:40 -0500 Subject: [PATCH 091/110] updare workflow --- .github/workflows/test-refgenie-cli.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index afdd3f89..b56f9b11 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -131,7 +131,7 @@ jobs: - name: refgenie remove 
fasta_child run: | - refgenie remove -c genomes/g.yaml t7/fasta_child -f + refgenie remove -c genomes/g.yaml t7_another/fasta_child -f ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list From 82613bc170a14fcbfa3cee29676be805e3aa629a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 18 Feb 2021 11:16:01 -0500 Subject: [PATCH 092/110] tweak cmd order --- .github/workflows/test-refgenie-cli.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index b56f9b11..5b1131d2 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -129,12 +129,12 @@ jobs: exit 1 fi + - name: refgenie id + run: | + ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta_child:new_tag` 0 + - name: refgenie remove fasta_child run: | refgenie remove -c genomes/g.yaml t7_another/fasta_child -f ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list - - - name: refgenie id - run: | - ./tests/assert_in_file.sh genomes/g.yaml `refgenie id -c genomes/g.yaml t7_another/fasta_child:new_tag` 0 From 0d6d0cc7380406c1802305accbc5f41de048cfa4 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Feb 2021 10:48:13 -0500 Subject: [PATCH 093/110] nonexistent genome alias setting must be forced --- refgenie/refgenie.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 87a0bd0d..b0723544 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -163,6 +163,9 @@ def add_subparser(cmd, msg, 
subparsers): alias_sps[ALIAS_SET_CMD].add_argument( "-r", "--reset", action="store_true", help="Whether all the aliases should be removed prior to setting new ones.") + alias_sps[ALIAS_SET_CMD].add_argument( + "-f", "--force", action="store_true", + help="Whether the action should be forced, if genome does not exist.") alias_sps[ALIAS_REMOVE_CMD].add_argument( "-a", "--aliases", metavar="A", required=False, default=None, type=str, @@ -944,7 +947,7 @@ def main(): if args.subcommand == ALIAS_SET_CMD: rgc.set_genome_alias(digest=args.digest, genome=args.aliases, - reset_digest=args.reset, create_genome=True) + reset_digest=args.reset, create_genome=args.force) return elif args.subcommand == ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) From b93791876c8387b44d7628c940aa75f94740a375 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Feb 2021 10:49:45 -0500 Subject: [PATCH 094/110] add linter action --- .github/workflows/black.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 00000000..f58e4c63 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: psf/black@stable From 2d7ce1195cf5487973c5c76822a0cf752d3a4a3d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Feb 2021 11:33:09 -0500 Subject: [PATCH 095/110] reformat --- refgenie/__init__.py | 3 +- refgenie/__main__.py | 2 +- refgenie/_version.py | 2 +- refgenie/add_assets_igenome.py | 105 ++- refgenie/asset_build_packages.py | 399 ++++------ refgenie/build_all_genome.py | 88 ++- refgenie/const.py | 7 +- refgenie/exceptions.py | 7 +- refgenie/refgenie.py | 1130 ++++++++++++++++++++--------- refgenie/refget.py | 2 +- 
requirements/requirements-all.txt | 5 +- requirements/requirements-dev.txt | 1 - setup.py | 35 +- 13 files changed, 1095 insertions(+), 691 deletions(-) diff --git a/refgenie/__init__.py b/refgenie/__init__.py index b6aa9894..6e0e9f4e 100644 --- a/refgenie/__init__.py +++ b/refgenie/__init__.py @@ -1,3 +1,4 @@ from ._version import __version__ import logmuse -logmuse.init_logger("refgenie") \ No newline at end of file + +logmuse.init_logger("refgenie") diff --git a/refgenie/__main__.py b/refgenie/__main__.py index 7844a8ff..3beb081f 100644 --- a/refgenie/__main__.py +++ b/refgenie/__main__.py @@ -1,7 +1,7 @@ from .refgenie import main import sys -if __name__ == '__main__': +if __name__ == "__main__": try: sys.exit(main()) except KeyboardInterrupt: diff --git a/refgenie/_version.py b/refgenie/_version.py index e553b2c7..2aceaf4d 100644 --- a/refgenie/_version.py +++ b/refgenie/_version.py @@ -1 +1 @@ -__version__ = "0.10.0-dev" \ No newline at end of file +__version__ = "0.10.0-dev" diff --git a/refgenie/add_assets_igenome.py b/refgenie/add_assets_igenome.py index a0620e64..7d0778f1 100644 --- a/refgenie/add_assets_igenome.py +++ b/refgenie/add_assets_igenome.py @@ -29,15 +29,36 @@ def build_argparser(): :return argparse.ArgumentParser: constructed parser """ - parser = argparse.ArgumentParser(description='Integrates every asset from the downloaded iGenomes' - ' tarball/directory with Refgenie asset management system') - parser.add_argument('-p', '--path', dest="path", type=str, - help='path to the desired genome tarball or directory to integrate', required=True) - parser.add_argument('-g', '--genome', dest="genome", type=str, help='name to be assigned to the selected genome', - required=True) - parser.add_argument('-c', '--config', dest="config", type=str, - help="path to local genome configuration file. Optional if '{}' environment variable is set.". 
- format(", ".join(refgenconf.CFG_ENV_VARS)), required=False) + parser = argparse.ArgumentParser( + description="Integrates every asset from the downloaded iGenomes" + " tarball/directory with Refgenie asset management system" + ) + parser.add_argument( + "-p", + "--path", + dest="path", + type=str, + help="path to the desired genome tarball or directory to integrate", + required=True, + ) + parser.add_argument( + "-g", + "--genome", + dest="genome", + type=str, + help="name to be assigned to the selected genome", + required=True, + ) + parser.add_argument( + "-c", + "--config", + dest="config", + type=str, + help="path to local genome configuration file. Optional if '{}' environment variable is set.".format( + ", ".join(refgenconf.CFG_ENV_VARS) + ), + required=False, + ) return parser @@ -78,12 +99,15 @@ def refgenie_add(rgc, asset_dict, path, force=False): should be forced """ # remove the first directory from the provided path if it is the genome name - path = os.path.join(*path.split(os.sep)[1:]) \ - if path.split(os.sep)[0] == asset_dict["genome"] else path - tag = asset_dict["tag"] \ - or rgc.get_default_tag(asset_dict["genome"], asset_dict["asset"]) - outfolder = \ - os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"])) + path = ( + os.path.join(*path.split(os.sep)[1:]) + if path.split(os.sep)[0] == asset_dict["genome"] + else path + ) + tag = asset_dict["tag"] or rgc.get_default_tag( + asset_dict["genome"], asset_dict["asset"] + ) + outfolder = os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"])) abs_asset_path = os.path.join(outfolder, path) if asset_dict["seek_key"] is None: # if seek_key is not specified we're about to move a directory to @@ -101,25 +125,32 @@ def refgenie_add(rgc, asset_dict, path, force=False): if not os.path.exists(tag_path): cp(abs_asset_path, tag_path) else: - if not force and not \ - query_yes_no("Path '{}' exists. Do you want to overwrite?". 
- format(tag_path)): + if not force and not query_yes_no( + "Path '{}' exists. Do you want to overwrite?".format(tag_path) + ): return False else: _remove(tag_path) cp(abs_asset_path, tag_path) else: - raise OSError("Absolute path '{}' does not exist. " - "The provided path must be relative to: {}". - format(abs_asset_path, rgc[CFG_FOLDER_KEY])) + raise OSError( + "Absolute path '{}' does not exist. " + "The provided path must be relative to: {}".format( + abs_asset_path, rgc[CFG_FOLDER_KEY] + ) + ) rgc.make_writable() gat_bundle = [asset_dict["genome"], asset_dict["asset"], tag] - td = {CFG_ASSET_PATH_KEY: - path if os.path.isdir(abs_asset_path) else os.path.dirname(path)} + td = { + CFG_ASSET_PATH_KEY: path + if os.path.isdir(abs_asset_path) + else os.path.dirname(path) + } rgc.update_tags(*gat_bundle, data=td) # seek_key points to the entire dir if not specified - seek_key_value = os.path.basename(abs_asset_path) \ - if asset_dict["seek_key"] is not None else "." + seek_key_value = ( + os.path.basename(abs_asset_path) if asset_dict["seek_key"] is not None else "." + ) sk = {asset_dict["seek_key"] or asset_dict["asset"]: seek_key_value} rgc.update_seek_keys(*gat_bundle, keys=sk) rgc.set_default_pointer(asset_dict["genome"], asset_dict["asset"], tag) @@ -137,19 +168,27 @@ def main(): """ main workflow """ parser = build_argparser() args, remaining_args = parser.parse_known_args() - cfg = refgenconf.select_genome_config(filename=args.config, check_exist=True, strict_env=True) + cfg = refgenconf.select_genome_config( + filename=args.config, check_exist=True, strict_env=True + ) if not cfg: raise MissingGenomeConfigError(args.config) rgc = refgenconf.RefGenConf(filepath=cfg, writable=False) pths = [args.path, mkabs(args.path, rgc.genome_folder)] - if not untar_or_copy(pths[0], os.path.join(rgc.genome_folder, args.genome)) \ - and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)): - raise OSError("Path '{}' does not exist. 
Tried: {}".format(args.path, " and ".join(pths))) + if not untar_or_copy( + pths[0], os.path.join(rgc.genome_folder, args.genome) + ) and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)): + raise OSError( + "Path '{}' does not exist. Tried: {}".format(args.path, " and ".join(pths)) + ) path_components = [rgc.genome_folder] + [args.genome] + ["*"] * 3 + ["Sequence"] assets_paths = glob(os.path.join(*path_components)) - assert len(assets_paths) > 0, OSError("Your iGenomes directory is corrupted, more than one directory matched by {}." - "\nMatched dirs: {}".format(os.path.join(*path_components), - ", ".join(assets_paths))) + assert len(assets_paths) > 0, OSError( + "Your iGenomes directory is corrupted, more than one directory matched by {}." + "\nMatched dirs: {}".format( + os.path.join(*path_components), ", ".join(assets_paths) + ) + ) assets_path = assets_paths[0] asset_names = [d for d in os.listdir(assets_path) if os.path.isdir(assets_path)] processed = [] @@ -161,7 +200,7 @@ def main(): print("Added assets: \n- {}".format("\n- ".join(processed))) -if __name__ == '__main__': +if __name__ == "__main__": try: sys.exit(main()) except KeyboardInterrupt: diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 7ef5105d..63726eec 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -6,7 +6,7 @@ # These building recipes should make use of arguments that are auto-populated, # or user-provided. The auto-populated arguments are: # - {genome} -# - {asset_outfolder} +# - {asset_outfolder} # In addition to these, the recipe should refer in the # same way, {var}, to any variables required to be provided, which will be # provided via the CLI. 
These should be listed as 'required_inputs' and @@ -24,7 +24,18 @@ KEY = "key" DEFAULT = "default" -RECIPE_CONSTS = ["DESC", "ASSET_DESC", "ASSETS", "PTH", "REQ_FILES", "REQ_ASSETS", "CONT", "CMD_LST", "KEY", "DEFAULT"] +RECIPE_CONSTS = [ + "DESC", + "ASSET_DESC", + "ASSETS", + "PTH", + "REQ_FILES", + "REQ_ASSETS", + "CONT", + "CMD_LST", + "KEY", + "DEFAULT", +] asset_build_packages = { "fasta": { @@ -32,14 +43,9 @@ ASSETS: { "fasta": "{genome}.fa", "fai": "{genome}.fa.fai", - "chrom_sizes": "{genome}.chrom.sizes" + "chrom_sizes": "{genome}.chrom.sizes", }, - REQ_FILES: [ - { - KEY: "fasta", - DESC: "gzipped fasta file" - } - ], + REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -48,21 +54,16 @@ "gzip -df {asset_outfolder}/{genome}.fa.gz", "samtools faidx {asset_outfolder}/{genome}.fa", "cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes", - ] + ], }, "fasta_txome": { DESC: "cDNA sequences in the FASTA format, indexed FASTA (produced with samtools index) and chromosome sizes file", ASSETS: { "fasta_txome": "{genome}.fa", "fai": "{genome}.fa.fai", - "chrom_sizes": "{genome}.chrom.sizes" + "chrom_sizes": "{genome}.chrom.sizes", }, - REQ_FILES: [ - { - KEY: "fasta", - DESC: "gzipped fasta file" - } - ], + REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -71,26 +72,21 @@ "gzip -df {asset_outfolder}/{genome}.fa.gz", "samtools faidx {asset_outfolder}/{genome}.fa", "cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes", - ] + ], }, "dbnsfp": { DESC: "A database developed for functional prediction and annotation of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome (Gencode release 29/Ensembl 94)", ASSETS: { "dbnsfp": "{genome}_dbNSFP.txt.gz", - "tabix": "{genome}_dbNSFP.txt.gz.tbi" + "tabix": "{genome}_dbNSFP.txt.gz.tbi", }, - 
REQ_FILES: [ - { - KEY: "dbnsfp", - DESC: "zipped dbSNFP database file" - } - ], + REQ_FILES: [{KEY: "dbnsfp", DESC: "zipped dbSNFP database file"}], REQ_ASSETS: [], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", @@ -103,256 +99,179 @@ "rm {asset_outfolder}/dbNSFP*_variant.chr*", "bgzip -@ {threads} {asset_outfolder}/{genome}_dbNSFP.txt", "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_dbNSFP.txt.gz", - "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/hg38_dbNSFP.txt.*'`" - ] + "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/hg38_dbNSFP.txt.*'`", + ], }, "dbsnp": { DESC: "The database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants", - ASSETS: { - "dbsnp": "{genome}_dbSNP.gz", - "tabix": "{genome}_dbSNP.gz.tbi" - }, + ASSETS: {"dbsnp": "{genome}_dbSNP.gz", "tabix": "{genome}_dbSNP.gz.tbi"}, REQ_FILES: [ - { - KEY: "dbsnp_vcf", - DESC: "SNP database file in Variant Call Format (VCF)" - }, - { - KEY: "dbsnp_tbi", - DESC: "tabix index of the dbsnp.vcf file" - } + {KEY: "dbsnp_vcf", DESC: "SNP database file in Variant Call Format (VCF)"}, + {KEY: "dbsnp_tbi", DESC: "tabix index of the dbsnp.vcf file"}, ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "cp {dbsnp_vcf} {asset_outfolder}/{genome}_dbSNP.gz", - "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi" - ] + "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi", + ], }, "bowtie2_index": { DESC: "Genome index for bowtie, produced with bowtie-build", - ASSETS: { - "bowtie2_index": "{genome}" - }, + ASSETS: {"bowtie2_index": "{genome}"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: 
"fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "bowtie2-build {fasta} {asset_outfolder}/{genome}" - ] + CMD_LST: ["bowtie2-build {fasta} {asset_outfolder}/{genome}"], }, "bwa_index": { DESC: "Genome index for Burrows-Wheeler Alignment Tool, produced with bwa index", - ASSETS: { - "bwa_index": "{genome}.fa" - }, + ASSETS: {"bwa_index": "{genome}.fa"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", "bwa index {asset_outfolder}/{genome}.fa", - ] - }, + ], + }, "hisat2_index": { DESC: "Genome index for HISAT2, produced with hisat2-build", - ASSETS: { - "hisat2_index": "{genome}" - }, + ASSETS: {"hisat2_index": "{genome}"}, REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "hisat2-build {fasta} {asset_outfolder}/{genome}" - ] + CMD_LST: ["hisat2-build {fasta} {asset_outfolder}/{genome}"], }, "bismark_bt2_index": { DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie2", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "bismark_bt2_index": "." 
- }, + ASSETS: {"bismark_bt2_index": "."}, CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", - "bismark_genome_preparation --bowtie2 {asset_outfolder}" - ] + "bismark_genome_preparation --bowtie2 {asset_outfolder}", + ], }, "bismark_bt1_index": { DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie1", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "bismark_bt1_index": "." - }, + ASSETS: {"bismark_bt1_index": "."}, CMD_LST: [ "ln -sf {fasta} {asset_outfolder}", - "bismark_genome_preparation {asset_outfolder}" - ] - }, + "bismark_genome_preparation {asset_outfolder}", + ], + }, "kallisto_index": { DESC: "Genome index for kallisto, produced with kallisto index", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for transcriptome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"} ], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "kallisto_index": "." - }, + ASSETS: {"kallisto_index": "."}, CMD_LST: [ "kallisto index -i {asset_outfolder}/{genome}_kallisto_index.idx {fasta}" - ] + ], }, "salmon_index": { DESC: "Transcriptome index for salmon, produced with salmon index", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for transcriptome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"} ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_index": "." 
- }, + ASSETS: {"salmon_index": "."}, CMD_LST: [ "salmon index -t {fasta} -i {asset_outfolder} -k {kmer} -p {threads}" - ] + ], }, "salmon_sa_index": { DESC: "Transcriptome index for salmon, produced with salmon index using selective alignment method. Improves quantification accuracy compared to the regular index.", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - }, + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, { KEY: "fasta_txome", DEFAULT: "fasta_txome", - DESC: "fasta asset for transcriptome" - } + DESC: "fasta asset for transcriptome", + }, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_sa_index": "." - }, + ASSETS: {"salmon_sa_index": "."}, CMD_LST: [ "grep '^>' {fasta} | cut -d ' ' -f 1 > {asset_outfolder}/decoys.txt", "sed -i.bak -e 's/>//g' {asset_outfolder}/decoys.txt", "rm {asset_outfolder}/decoys.txt.bak", "cat {fasta_txome} {fasta} > {asset_outfolder}/gentrome.fa", "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}", - "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt" - ] + "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt", + ], }, "salmon_partial_sa_index": { DESC: "Transcriptome index for salmon, produced with salmon index using partial selective alignment method. Preparation includes transcriptome mapping to the genome and extraction of the relevant portion out from the genome and indexing it along with the transcriptome. 
Recipe source -- https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh", REQ_FILES: [], REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - }, + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, { KEY: "fasta_txome", DEFAULT: "fasta_txome", - DESC: "fasta asset for transcriptome" + DESC: "fasta asset for transcriptome", }, { KEY: "gtf", DEFAULT: "ensembl_gtf", - DESC: "GTF file for exonic features extraction" - } + DESC: "GTF file for exonic features extraction", + }, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" - }, + DESC: "Number of threads to use for parallel computing", + }, { KEY: "kmer", DEFAULT: "31", - DESC: "The length of kmer to use to create the indices" - } + DESC: "The length of kmer to use to create the indices", + }, ], CONT: "combinelab/salmon", - ASSETS: { - "salmon_partial_sa_index": "." - }, + ASSETS: {"salmon_partial_sa_index": "."}, CMD_LST: [ "gunzip -c {gtf} > {asset_outfolder}/{genome}.gtf", "awk -v OFS='\t' '{{if ($3==\"exon\") {{print $1,$4,$5}}}}' {asset_outfolder}/{genome}.gtf > {asset_outfolder}/exons.bed", @@ -361,90 +280,70 @@ "awk -v OFS='\t' '{{print $6,$8,$9}}' {asset_outfolder}/mashmap.out | sort -k1,1 -k2,2n - > {asset_outfolder}/genome_found.sorted.bed", "bedtools merge -i {asset_outfolder}/genome_found.sorted.bed > {asset_outfolder}/genome_found_merged.bed", "bedtools getfasta -fi {asset_outfolder}/reference.masked.genome.fa -bed {asset_outfolder}/genome_found_merged.bed -fo {asset_outfolder}/genome_found.fa", - "awk '{{a=$0; getline;split(a, b, \":\"); r[b[1]] = r[b[1]]\"\"$0}} END {{ for (k in r) {{ print k\"\\n\"r[k] }} }}' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa", + 'awk \'{{a=$0; getline;split(a, b, ":"); r[b[1]] = r[b[1]]""$0}} END {{ for (k in r) {{ print k"\\n"r[k] }} }}\' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa', "cat 
{fasta_txome} {asset_outfolder}/decoy.fa > {asset_outfolder}/gentrome.fa", "grep '>' {asset_outfolder}/decoy.fa | awk '{{print substr($1,2); }}' > {asset_outfolder}/decoys.txt", "rm {asset_outfolder}/exons.bed {asset_outfolder}/reference.masked.genome.fa {asset_outfolder}/mashmap.out {asset_outfolder}/genome_found.sorted.bed {asset_outfolder}/genome_found_merged.bed {asset_outfolder}/genome_found.fa {asset_outfolder}/decoy.fa {asset_outfolder}/reference.masked.genome.fa.fai", - "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}" - ] + "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}", + ], }, "epilog_index": { DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [ { KEY: "context", - DEFAULT: 'CG', - DESC: "Substring to index. One or more space-separated strings to index. e.g. 'CG' or 'CG CA CT CC'" + DEFAULT: "CG", + DESC: "Substring to index. One or more space-separated strings to index. e.g. 
'CG' or 'CG CA CT CC'", } ], CONT: "databio/refgenie", - ASSETS: { - "epilog_index": "{genome}_{context}.tsv.gz" - }, + ASSETS: {"epilog_index": "{genome}_{context}.tsv.gz"}, CMD_LST: [ "epilog index -- --infile {fasta} --outfile {asset_outfolder}/{genome}_{context}.tsv --contexts {context}", "bgzip {asset_outfolder}/{genome}_{context}.tsv", "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_{context}.tsv.gz", - ] + ], }, "star_index": { DESC: "Genome index for STAR RNA-seq aligner, produced with STAR --runMode genomeGenerate", REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", - ASSETS: { - "star_index": "." - }, + ASSETS: {"star_index": "."}, CMD_LST: [ "mkdir -p {asset_outfolder}", - "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}" - ] + "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}", + ], }, "gencode_gtf": { DESC: "GTF annotation asset which provides access to all annotated transcripts which make up an Ensembl gene set.", REQ_FILES: [ { KEY: "gencode_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode" + DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode", } ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "gencode_gtf": "{genome}.gtf.gz" - }, - CMD_LST: [ - "cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz" - ] + ASSETS: {"gencode_gtf": "{genome}.gtf.gz"}, + CMD_LST: ["cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz"], }, "ensembl_gtf": { DESC: "Ensembl GTF, TSS, and gene body annotation", REQ_FILES: [ { KEY: "ensembl_gtf", - DESC: 
"Annotation file in Gene Transfer Format (GTF) from Ensembl" + DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl", } ], REQ_ASSETS: [], @@ -457,36 +356,27 @@ }, CMD_LST: [ "cp {ensembl_gtf} {asset_outfolder}/{genome}.gtf.gz", - "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep 'exon_number \"1\";' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1, $4, $5, $20, $14, $7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '{{if($6==\"+\"){{print $1\"\t\"$2+20\"\t\"$2+120\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$3-120\"\t\"$3-20\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed", - "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk '$3 == \"gene\"' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1,$4,$5,$14,$6,$7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '$4!=\"Metazoa_SRP\"' | awk '$4!=\"U3\"' | awk '$4!=\"7SK\"' | awk '($3-$2)>200' | awk '{{if($6==\"+\"){{print $1\"\t\"$2+500\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$2\"\t\"$3-500\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | awk '$3>$2' | LC_COLLATE=C sort -k4 -u > {asset_outfolder}/{genome}_ensembl_gene_body.bed" - ] + 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep \'exon_number "1";\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1, $4, $5, $20, $14, $7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'{{if($6=="+"){{print $1"\t"$2+20"\t"$2+120"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$3-120"\t"$3-20"\t"$4"\t"$5"\t"$6}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed', + 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk \'$3 == "gene"\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1,$4,$5,$14,$6,$7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'$4!="Metazoa_SRP"\' | awk \'$4!="U3"\' | awk \'$4!="7SK"\' | awk \'($3-$2)>200\' | awk \'{{if($6=="+"){{print $1"\t"$2+500"\t"$3"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$2"\t"$3-500"\t"$4"\t"$5"\t"$6}}}}\' | awk \'$3>$2\' | LC_COLLATE=C sort -k4 -u > 
{asset_outfolder}/{genome}_ensembl_gene_body.bed', + ], }, "ensembl_rb": { DESC: "A regulatory annotation file", REQ_FILES: [ { KEY: "gff", - DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl" + DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl", } ], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - ASSETS: { - "ensembl_rb": "{genome}.gff.gz" - }, - CMD_LST: [ - "cp {gff} {asset_outfolder}/{genome}.gff.gz" - ] + ASSETS: {"ensembl_rb": "{genome}.gff.gz"}, + CMD_LST: ["cp {gff} {asset_outfolder}/{genome}.gff.gz"], }, "refgene_anno": { DESC: "gene, TSS, exon, intron, and premature mRNA annotation files", - REQ_FILES: [ - { - KEY: "refgene", - DESC: "gzipped RefGene database annotation file" - } - ], + REQ_FILES: [{KEY: "refgene", DESC: "gzipped RefGene database annotation file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", @@ -499,11 +389,11 @@ }, CMD_LST: [ "cp {refgene} {asset_outfolder}/{genome}_refGene.txt.gz", - "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk '{{if($4==\"+\"){{print $3\"\t\"$5\"\t\"$5\"\t\"$13\"\t.\t\"$4}}else{{print $3\"\t\"$6\"\t\"$6\"\t\"$13\"\t.\t\"$4}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed", + 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk \'{{if($4=="+"){{print $3"\t"$5"\t"$5"\t"$13"\t.\t"$4}}else{{print $3"\t"$6"\t"$6"\t"$13"\t.\t"$4}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed', "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -v OFS='\t' '{{ n = split($10, a, \",\"); split($11, b, \",\"); for(i=1; i {asset_outfolder}/{genome}_exons.bed", "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -F'\t' '{{ exonCount=int($9);split($10,exonStarts,\"[,]\"); split($11,exonEnds,\"[,]\"); for(i=1;i {asset_outfolder}/{genome}_introns.bed", - "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep 
'cmpl' | awk '{{print $3\"\t\"$5\"\t\"$6\"\t\"$13\"\t.\t\"$4}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed" - ] + 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep \'cmpl\' | awk \'{{print $3"\t"$5"\t"$6"\t"$13"\t.\t"$4}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed', + ], }, "suffixerator_index": { DESC: "Enhanced suffix array index for genomes using gt (GenomeTools) suffixerator program", @@ -511,61 +401,45 @@ { KEY: "memlimit", DEFAULT: "8GB", - DESC: "The maximum amount of memory available to be used during index construction." + DESC: "The maximum amount of memory available to be used during index construction.", } ], REQ_FILES: [], - REQ_ASSETS: [ - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } - ], + REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}], CONT: "databio/refgenie", - ASSETS: { - "esa": "{genome}.sft" - }, + ASSETS: {"esa": "{genome}.sft"}, CMD_LST: [ "gt suffixerator -dna -pl -tis -suf -lcp -v -showprogress -memlimit {memlimit} -db {fasta} -indexname {asset_outfolder}/{genome}.sft" - ] + ], }, "tallymer_index": { DESC: "Indexed k-mers for a given enhanced suffix array at a fixed value of k", REQ_PARAMS: [ - { - KEY: "mersize", - DEFAULT: "30", - DESC: "The mer size." - }, + {KEY: "mersize", DEFAULT: "30", DESC: "The mer size."}, { KEY: "minocc", DEFAULT: "2", - DESC: "The minimum occurrence number for the mers to index." 
- } + DESC: "The minimum occurrence number for the mers to index.", + }, ], REQ_FILES: [], REQ_ASSETS: [ { KEY: "esa", DEFAULT: "suffixerator_index", - DESC: "enhanced suffix array index for genome" + DESC: "enhanced suffix array index for genome", }, - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, ], CONT: "databio/refgenie", ASSETS: { "tindex": "{genome}.tal_{mersize}", - "search_file": "{genome}.tal_{mersize}.gtTxt" + "search_file": "{genome}.tal_{mersize}.gtTxt", }, CMD_LST: [ "gt tallymer mkindex -v -counts -pl -mersize {mersize} -minocc {minocc} -indexname {asset_outfolder}/{genome}.tal_{mersize} -esa {esa}/{genome}.sft", - "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt" - ] + "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt", + ], }, "feat_annotation": { DESC: "Combined genomic feature annotation created using an Ensembl GTF annotation asset and an Ensembl regulatory build annotation asset", @@ -577,28 +451,28 @@ { KEY: "ensembl_gtf", DEFAULT: "ensembl_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl" + DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl", }, { KEY: "ensembl_rb", DEFAULT: "ensembl_rb", - DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl" - } + DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl", + }, ], REQ_PARAMS: [], CONT: "databio/refgenie", CMD_LST: [ "gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"Exon\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_exons.bed", "gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | 
grep -v 'pseudogene' | awk -v OFS='\t' '{{ split($20, a, \"\\\"\"); print \"chr\"$1, $4-1, $5, a[2], $6, $7}}' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u | awk 'seen[$4]++ && seen[$4] > 1' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3nr | env LC_COLLATE=C sort -k1,1 -k2,2n -u | env LC_COLLATE=C sort -k1,1 -k3,3n -u | awk -v OFS='\t' '{{if($4==prev4){{new2=prev3+1;}} {{prev4=$4; prev3=$3; print $1, new2, $2-1, \"Intron\", $5, $6}}}}' | awk -F'\t' '$2' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_introns.bed", - "gzip -dcf {ensembl_gtf} | awk '$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed", - "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed", + "gzip -dcf {ensembl_gtf} | awk '$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed", + "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"promoter\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_promoter.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"promoter_flanking_region\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter Flanking Region\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > 
{asset_outfolder}/{genome}_promoter_flanking.bed", "gzip -dcf {ensembl_rb} | awk '$3==\"enhancer\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Enhancer\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_enhancer.bed", "cat {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed | awk -F'\t' '!seen[$1, $2, $3]++' > {asset_outfolder}/{genome}_annotations.bed", "rm -f {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed", - "gzip -f {asset_outfolder}/{genome}_annotations.bed" - ] + "gzip -f {asset_outfolder}/{genome}_annotations.bed", + ], }, "cellranger_reference": { DESC: "Cell Ranger custom genome reference for read alignment and gene expression quantification", @@ -610,19 +484,15 @@ { KEY: "gtf", DEFAULT: "gencode_gtf", - DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode" + DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode", }, - { - KEY: "fasta", - DEFAULT: "fasta", - DESC: "fasta asset for genome" - } + {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}, ], REQ_PARAMS: [ { KEY: "threads", DEFAULT: "8", - DESC: "Number of threads to use for parallel computing" + DESC: "Number of threads to use for parallel computing", } ], CONT: "databio/refgenie", @@ -630,25 +500,18 @@ "gunzip {gtf} -c > {asset_outfolder}/{genome}.gtf", "cellranger mkgtf {asset_outfolder}/{genome}.gtf {asset_outfolder}/{genome}_filtered.gtf", "rm {asset_outfolder}/{genome}.gtf", - "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} 
--genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}" - ] + "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} --genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}", + ], }, "blacklist": { DESC: "Atypical, unstructured, or high signal genomic regions present in next-generation sequencing experiments (e.g. from ENCODE)", ASSETS: { "blacklist": "{genome}_blacklist.bed.gz", }, - REQ_FILES: [ - { - KEY: "blacklist", - DESC: "gzipped blacklist file" - } - ], + REQ_FILES: [{KEY: "blacklist", DESC: "gzipped blacklist file"}], REQ_ASSETS: [], REQ_PARAMS: [], CONT: "databio/refgenie", - CMD_LST: [ - "cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz" - ] - } + CMD_LST: ["cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz"], + }, } diff --git a/refgenie/build_all_genome.py b/refgenie/build_all_genome.py index 12be1c6c..98cf968c 100644 --- a/refgenie/build_all_genome.py +++ b/refgenie/build_all_genome.py @@ -8,23 +8,71 @@ import argparse import divvy -parser = argparse.ArgumentParser(description='Builds submission scripts for all assets for a genome') -parser.add_argument('-g', '--genome', dest="genome", type=str, - help='genome to build the submission scripts for') -parser.add_argument('-p', '--path', dest="path", type=str, - help='path to the desired submission directory location') -parser.add_argument('-pt', '--partition', dest="PARTITION", type=str, - help='partition in SLURM submission script', default="standard") -parser.add_argument('-m', '--mem', dest="MEM", type=str, - help='mem in SLURM submission script', default="200000") -parser.add_argument('-t', '--time', dest="TIME", type=str, - help='time in SLURM submission script', default="10:00:00") -parser.add_argument('-c', '--cores', dest="CORES", type=str, - help='cpus-per-task in SLURM submission script', default="4") -parser.add_argument('-o', '--output', dest="LOGFILE", type=str, - help='output in SLURM submission script', default=None) 
-parser.add_argument('-j', '--job-name', dest="JOBNAME", type=str, - help='job-name in SLURM submission script', default=None) +parser = argparse.ArgumentParser( + description="Builds submission scripts for all assets for a genome" +) +parser.add_argument( + "-g", + "--genome", + dest="genome", + type=str, + help="genome to build the submission scripts for", +) +parser.add_argument( + "-p", + "--path", + dest="path", + type=str, + help="path to the desired submission directory location", +) +parser.add_argument( + "-pt", + "--partition", + dest="PARTITION", + type=str, + help="partition in SLURM submission script", + default="standard", +) +parser.add_argument( + "-m", + "--mem", + dest="MEM", + type=str, + help="mem in SLURM submission script", + default="200000", +) +parser.add_argument( + "-t", + "--time", + dest="TIME", + type=str, + help="time in SLURM submission script", + default="10:00:00", +) +parser.add_argument( + "-c", + "--cores", + dest="CORES", + type=str, + help="cpus-per-task in SLURM submission script", + default="4", +) +parser.add_argument( + "-o", + "--output", + dest="LOGFILE", + type=str, + help="output in SLURM submission script", + default=None, +) +parser.add_argument( + "-j", + "--job-name", + dest="JOBNAME", + type=str, + help="job-name in SLURM submission script", + default=None, +) args = parser.parse_args() @@ -69,8 +117,10 @@ def _req_input_to_args(req_input): sub_script = os.path.join(subdir_path, asset + ".sub") req_input = asset_build_packages[asset]["required_inputs"] if req_input: - print("{} asset requires additional input in the command ({}), so '{}'" - " requires manual edit".format(asset, req_input, sub_script)) + print( + "{} asset requires additional input in the command ({}), so '{}'" + " requires manual edit".format(asset, req_input, sub_script) + ) req_str = " ".join(_req_input_to_args(req_input)) else: req_str = "" diff --git a/refgenie/const.py b/refgenie/const.py index 05f6e1ba..40db1316 100644 --- 
a/refgenie/const.py +++ b/refgenie/const.py @@ -24,8 +24,7 @@ GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] # For each asset we assume a genome is also required -ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, TAG_CMD, - ID_CMD] +ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, TAG_CMD, ID_CMD] SUBPARSER_MESSAGES = { INIT_CMD: "Initialize a genome configuration.", @@ -43,7 +42,7 @@ UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.", ALIAS_CMD: "Interact with aliases.", COMPARE_CMD: "Compare two genomes.", - UPGRADE_CMD: "Upgrade config. This will alter the files on disk." + UPGRADE_CMD: "Upgrade config. This will alter the files on disk.", } ALIAS_GET_CMD = "get" @@ -53,5 +52,5 @@ ALIAS_SUBPARSER_MESSAGES = { ALIAS_REMOVE_CMD: "Remove aliases.", ALIAS_SET_CMD: "Set aliases.", - ALIAS_GET_CMD: "Get aliases." + ALIAS_GET_CMD: "Get aliases.", } diff --git a/refgenie/exceptions.py b/refgenie/exceptions.py index 30d32291..9a5cc28a 100644 --- a/refgenie/exceptions.py +++ b/refgenie/exceptions.py @@ -5,6 +5,7 @@ class RefgenieError(Exception): """ Base refgenie exception type """ + pass @@ -17,8 +18,9 @@ def __init__(self, conf_file=None): :param str conf_file: path attempted to be used as genome config file """ - msg = "You must provide a config file either as an argument or via an environment variable: {}"\ - .format(", ".join(CFG_ENV_VARS)) + msg = "You must provide a config file either as an argument or via an environment variable: {}".format( + ", ".join(CFG_ENV_VARS) + ) if conf_file: msg = "Not a file {} -- {}.".format(conf_file, msg) super(MissingGenomeConfigError, self).__init__(msg) @@ -32,4 +34,3 @@ def __init__(self, folder): :param str folder: path attempted to be used as folder to save a file to """ super(MissingFolderError, self).__init__(folder) - diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index b0723544..4d3250c4 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -20,11 +20,24 @@ 
import logmuse import pypiper import refgenconf -from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \ - MissingRecipeError, DownloadJsonError, get_dir_digest, upgrade_config, \ - __version__ as rgc_version, select_genome_config -from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \ - VersionInHelpParser, is_command_callable +from refgenconf import ( + RefGenConf, + MissingAssetError, + MissingGenomeError, + MissingRecipeError, + DownloadJsonError, + get_dir_digest, + upgrade_config, + __version__ as rgc_version, + select_genome_config, +) +from ubiquerg import ( + is_url, + query_yes_no, + parse_registry_path as prp, + VersionInHelpParser, + is_command_callable, +) from ubiquerg.system import is_writable from yacman import UndefinedAliasError from argparse import HelpFormatter @@ -46,16 +59,19 @@ def build_argparser(): prog="refgenie", version=f"{__version__} | refgenconf {rgc_version}", description=banner, - epilog=additional_description) + epilog=additional_description, + ) subparsers = parser.add_subparsers(dest="command") def add_subparser(cmd, msg, subparsers): return subparsers.add_parser( - cmd, description=msg, help=msg, + cmd, + description=msg, + help=msg, formatter_class=lambda prog: HelpFormatter( prog, max_help_position=40, width=90 - ) + ), ) sps = {} @@ -66,79 +82,164 @@ def add_subparser(cmd, msg, subparsers): continue # It's required for init sps[cmd].add_argument( - '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C", - help="Path to local genome configuration file. Optional if {} environment variable is set." - .format(", ".join(CFG_ENV_VARS))) + "-c", + "--genome-config", + required=(cmd == INIT_CMD), + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. 
Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) sps[cmd].add_argument( - '--skip-read-lock', required=False, action="store_true", - help="Whether the config file should not be locked for reading") + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) # upgrade: upgrade config and alter file structure to the target version - sps[UPGRADE_CMD].add_argument('-v', '--target-version', required=True, metavar="V", - help="Target config version for the upgrade.") - sps[UPGRADE_CMD].add_argument('-f', '--force', action="store_true", - help="Do not prompt before action, approve upfront.") - - sps[INIT_CMD].add_argument('-s', '--genome-server', nargs='+', default=DEFAULT_SERVER, - help="URL(s) to use for the {} attribute in config file. Default: {}." - .format(CFG_SERVERS_KEY, DEFAULT_SERVER)) - sps[INIT_CMD].add_argument('-f', '--genome-folder', - help="Absolute path to parent folder refgenie-managed assets.") - sps[INIT_CMD].add_argument('-a', '--genome-archive-folder', - help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.") - sps[INIT_CMD].add_argument('-b', '--genome-archive-config', - help="Absolute path to desired archive config file; used by refgenieserver.") - sps[INIT_CMD].add_argument('-u', '--remote-url-base', - help="URL to use as an alternative, remote archive location; used by refgenieserver.") - sps[INIT_CMD].add_argument('-j', '--settings-json', - help="Absolute path to a JSON file with the key " - "value pairs to inialize the configuration " - "file with. 
Overwritten by itemized specifications.") + sps[UPGRADE_CMD].add_argument( + "-v", + "--target-version", + required=True, + metavar="V", + help="Target config version for the upgrade.", + ) + sps[UPGRADE_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + sps[INIT_CMD].add_argument( + "-s", + "--genome-server", + nargs="+", + default=DEFAULT_SERVER, + help="URL(s) to use for the {} attribute in config file. Default: {}.".format( + CFG_SERVERS_KEY, DEFAULT_SERVER + ), + ) + sps[INIT_CMD].add_argument( + "-f", + "--genome-folder", + help="Absolute path to parent folder refgenie-managed assets.", + ) + sps[INIT_CMD].add_argument( + "-a", + "--genome-archive-folder", + help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-b", + "--genome-archive-config", + help="Absolute path to desired archive config file; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-u", + "--remote-url-base", + help="URL to use as an alternative, remote archive location; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-j", + "--settings-json", + help="Absolute path to a JSON file with the key " + "value pairs to inialize the configuration " + "file with. Overwritten by itemized specifications.", + ) sps[BUILD_CMD] = pypiper.add_pypiper_args( - sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"]) + sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"] + ) # Add any arguments specific to subcommands. sps[BUILD_CMD].add_argument( - '--tag-description', required=False, default=None, type=str, - help="Add tag level description (e.g. built with version 0.3.2).") + "--tag-description", + required=False, + default=None, + type=str, + help="Add tag level description (e.g. 
built with version 0.3.2).", + ) sps[BUILD_CMD].add_argument( - '--genome-description', required=False, default=None, type=str, - help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).") + "--genome-description", + required=False, + default=None, + type=str, + help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).", + ) sps[BUILD_CMD].add_argument( - "-d", "--docker", action="store_true", help="Run all commands in the refgenie docker container.") + "-d", + "--docker", + action="store_true", + help="Run all commands in the refgenie docker container.", + ) sps[BUILD_CMD].add_argument( - '--assets', nargs="+", action='append', required=False, default=None, - help='Override the default genome, asset and tag of the parents' - ' (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).') + "--assets", + nargs="+", + action="append", + required=False, + default=None, + help="Override the default genome, asset and tag of the parents" + " (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).", + ) sps[BUILD_CMD].add_argument( - '--files', nargs="+", action='append', required=False, default=None, - help='Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).') + "--files", + nargs="+", + action="append", + required=False, + default=None, + help="Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).", + ) sps[BUILD_CMD].add_argument( - '--params', nargs="+", action='append', required=False, default=None, - help='Provide required parameter values (e.g. param1=value1).') + "--params", + nargs="+", + action="append", + required=False, + default=None, + help="Provide required parameter values (e.g. 
param1=value1).", + ) sps[BUILD_CMD].add_argument( - '-v', '--volumes', nargs="+", required=False, default=None, - help='If using docker, also mount these folders as volumes.') + "-v", + "--volumes", + nargs="+", + required=False, + default=None, + help="If using docker, also mount these folders as volumes.", + ) sps[BUILD_CMD].add_argument( - '-o', '--outfolder', dest='outfolder', required=False, default=None, - help='Override the default path to genomes folder, which is the ' - 'genome_folder attribute in the genome configuration file.') + "-o", + "--outfolder", + dest="outfolder", + required=False, + default=None, + help="Override the default path to genomes folder, which is the " + "genome_folder attribute in the genome configuration file.", + ) sps[BUILD_CMD].add_argument( - "-q", "--requirements", action="store_true", - help="Show the build requirements for the specified asset and exit.") + "-q", + "--requirements", + action="store_true", + help="Show the build requirements for the specified asset and exit.", + ) sps[BUILD_CMD].add_argument( - "-r", "--recipe", required=False, default=None, type=str, - help="Provide a recipe to use.") + "-r", + "--recipe", + required=False, + default=None, + type=str, + help="Provide a recipe to use.", + ) alias_subparser = sps[ALIAS_CMD] alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand") @@ -147,151 +248,289 @@ def add_subparser(cmd, msg, subparsers): for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) alias_sps[cmd].add_argument( - '-c', '--genome-config', required=False, dest="genome_config", metavar="C", - help="Path to local genome configuration file. Optional if {} environment variable is set." - .format(", ".join(CFG_ENV_VARS))) + "-c", + "--genome-config", + required=False, + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. 
Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) alias_sps[cmd].add_argument( - '--skip-read-lock', required=False, action="store_true", - help="Whether the config file should not be locked for reading") + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) alias_sps[ALIAS_SET_CMD].add_argument( - "-a", "--aliases", metavar="A", required=False, default=None, type=str, - nargs="+", help="Aliases to set; single if the digest is to be retrieved from the server.") + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to set; single if the digest is to be retrieved from the server.", + ) alias_sps[ALIAS_SET_CMD].add_argument( - "-d", "--digest", metavar="D", required=False, type=str, - help="Digest to set; leave out if the digest is to be retrieved from the server.") + "-d", + "--digest", + metavar="D", + required=False, + type=str, + help="Digest to set; leave out if the digest is to be retrieved from the server.", + ) alias_sps[ALIAS_SET_CMD].add_argument( - "-r", "--reset", action="store_true", - help="Whether all the aliases should be removed prior to setting new ones.") + "-r", + "--reset", + action="store_true", + help="Whether all the aliases should be removed prior to setting new ones.", + ) alias_sps[ALIAS_SET_CMD].add_argument( - "-f", "--force", action="store_true", - help="Whether the action should be forced, if genome does not exist.") + "-f", + "--force", + action="store_true", + help="Whether the action should be forced, if genome does not exist.", + ) alias_sps[ALIAS_REMOVE_CMD].add_argument( - "-a", "--aliases", metavar="A", required=False, default=None, type=str, - nargs="+", help="Aliases to remove.") + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to remove.", + ) alias_sps[ALIAS_REMOVE_CMD].add_argument( - 
"-d", "--digest", metavar="D", required=True, type=str, - help="Digest to remove.") + "-d", "--digest", metavar="D", required=True, type=str, help="Digest to remove." + ) alias_sps[ALIAS_GET_CMD].add_argument( - "-a", "--aliases", metavar="A", required=False, type=str, nargs="+", - help="Aliases to get the digests for.") - - sps[COMPARE_CMD].add_argument("genome1", metavar="GENOME1", type=str, nargs=1, - help="First genome for compatibility check.") - sps[COMPARE_CMD].add_argument("genome2", metavar="GENOME2", type=str, nargs=1, - help="Second genome for compatibility check.") - sps[COMPARE_CMD].add_argument("-e", "--no-explanation", action="store_true", - help="Do not print compatibility code explanation.") + "-a", + "--aliases", + metavar="A", + required=False, + type=str, + nargs="+", + help="Aliases to get the digests for.", + ) + + sps[COMPARE_CMD].add_argument( + "genome1", + metavar="GENOME1", + type=str, + nargs=1, + help="First genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "genome2", + metavar="GENOME2", + type=str, + nargs=1, + help="Second genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "-e", + "--no-explanation", + action="store_true", + help="Do not print compatibility code explanation.", + ) # add 'genome' argument to many commands - for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]: + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + GETSEQ_CMD, + TAG_CMD, + ID_CMD, + ]: # genome is not required for listing actions sps[cmd].add_argument( - "-g", "--genome", required=cmd in GETSEQ_CMD, metavar="G", - help="Reference assembly ID, e.g. mm10.") + "-g", + "--genome", + required=cmd in GETSEQ_CMD, + metavar="G", + help="Reference assembly ID, e.g. 
mm10.", + ) for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: - sps[cmd].add_argument("-g", "--genome", required=False, type=str, metavar="G", - nargs="*", help="Reference assembly ID, e.g. mm10.") + sps[cmd].add_argument( + "-g", + "--genome", + required=False, + type=str, + metavar="G", + nargs="*", + help="Reference assembly ID, e.g. mm10.", + ) - for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, TAG_CMD, ID_CMD]: + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + TAG_CMD, + ID_CMD, + ]: sps[cmd].add_argument( - "asset_registry_paths", metavar="asset-registry-paths", type=str, nargs='+', + "asset_registry_paths", + metavar="asset-registry-paths", + type=str, + nargs="+", help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" - + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ").")) + + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."), + ) - sps[LIST_LOCAL_CMD].add_argument("-r", "--recipes", action="store_true", - help="List available recipes.") + sps[LIST_LOCAL_CMD].add_argument( + "-r", "--recipes", action="store_true", help="List available recipes." 
+ ) for cmd in [REMOVE_CMD, INSERT_CMD]: sps[cmd].add_argument( - "-f", "--force", action="store_true", - help="Do not prompt before action, approve upfront.") + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) sps[REMOVE_CMD].add_argument( - "-a", "--aliases", action="store_true", - help="Remove the genome alias if last asset for that genome is removed.") + "-a", + "--aliases", + action="store_true", + help="Remove the genome alias if last asset for that genome is removed.", + ) force_group = sps[PULL_CMD].add_argument_group( title="Prompt handling", - description="These flags configure the pull prompt responses.") + description="These flags configure the pull prompt responses.", + ) overwrite_group = force_group.add_mutually_exclusive_group() - overwrite_group.add_argument("--no-overwrite", action="store_true", - help="Do not overwrite if asset exists.") + overwrite_group.add_argument( + "--no-overwrite", action="store_true", help="Do not overwrite if asset exists." + ) - overwrite_group.add_argument("--force-overwrite", action="store_true", - help="Overwrite if asset exists.") + overwrite_group.add_argument( + "--force-overwrite", action="store_true", help="Overwrite if asset exists." + ) large_group = force_group.add_mutually_exclusive_group() - large_group.add_argument("--no-large", action="store_true", - help="Do not pull archives over 5GB.") - - large_group.add_argument("--pull-large", action="store_true", - help="Pull any archive, regardless of its size.") - - force_group.add_argument("--size-cutoff", type=float, default=10, metavar="S", - help="Maximum archive file size to download with no confirmation required (in GB, default: 10)") - - force_group.add_argument("-b", "--batch", action="store_true", - help="Use batch mode: pull large archives, do no overwrite") + large_group.add_argument( + "--no-large", action="store_true", help="Do not pull archives over 5GB." 
+ ) + + large_group.add_argument( + "--pull-large", + action="store_true", + help="Pull any archive, regardless of its size.", + ) + + force_group.add_argument( + "--size-cutoff", + type=float, + default=10, + metavar="S", + help="Maximum archive file size to download with no confirmation required (in GB, default: 10)", + ) + + force_group.add_argument( + "-b", + "--batch", + action="store_true", + help="Use batch mode: pull large archives, do no overwrite", + ) sps[INSERT_CMD].add_argument( - "-p", "--path", required=True, metavar="P", - help="Relative local path to asset.") + "-p", "--path", required=True, metavar="P", help="Relative local path to asset." + ) sps[INSERT_CMD].add_argument( - "-s", "--seek-keys", required=False, type=str, metavar="S", + "-s", + "--seek-keys", + required=False, + type=str, + metavar="S", help=""" String representation of a JSON object with seek_keys, e.g. '{"seek_key1": "file.txt"}' - """) + """, + ) sps[GETSEQ_CMD].add_argument( - "-l", "--locus", required=True, - help="Coordinates of desired sequence; e.g. 'chr1:50000-50200'.") + "-l", + "--locus", + required=True, + help="Coordinates of desired sequence; e.g. 
'chr1:50000-50200'.", + ) sps[GET_ASSET_CMD].add_argument( - "-e", "--check-exists", required=False, action="store_true", - help="Whether the returned asset path should be checked for existence on disk.") + "-e", + "--check-exists", + required=False, + action="store_true", + help="Whether the returned asset path should be checked for existence on disk.", + ) sps[TAG_CMD].add_argument( - '-f', '--force', action="store_true", - help="Do not prompt before action, approve upfront.") + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) - group.add_argument( - "-t", "--tag", type=str, - help="Tag to assign to an asset.") + group.add_argument("-t", "--tag", type=str, help="Tag to assign to an asset.") group.add_argument( - "-d", "--default", action="store_true", - help="Set the selected asset tag as the default one.") + "-d", + "--default", + action="store_true", + help="Set the selected asset tag as the default one.", + ) sps[SUBSCRIBE_CMD].add_argument( - "-r", "--reset", action="store_true", - help="Overwrite the current list of server URLs.") + "-r", + "--reset", + action="store_true", + help="Overwrite the current list of server URLs.", + ) for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]: sps[cmd].add_argument( - "-s", "--genome-server", nargs='+', required=True, - help="One or more URLs to {action} the {key} attribute in config file.". 
- format(action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY)) + "-s", + "--genome-server", + nargs="+", + required=True, + help="One or more URLs to {action} the {key} attribute in config file.".format( + action="add to" if cmd == SUBSCRIBE_CMD else "remove from", + key=CFG_SERVERS_KEY, + ), + ) return parser def parse_registry_path(path): - return prp(path, defaults=[ - ("protocol", None), - ("genome", None), - ("asset", None), - ("seek_key", None), - ("tag", None)]) + return prp( + path, + defaults=[ + ("protocol", None), + ("genome", None), + ("asset", None), + ("seek_key", None), + ("tag", None), + ], + ) def copy_or_download_file(input_string, outfolder): @@ -304,8 +543,11 @@ def copy_or_download_file(input_string, outfolder): :return str, str: output/result file and command """ result_file = os.path.join(outfolder, os.path.basename(input_string)) - parts = ["wget -O", result_file, input_string] \ - if is_url(input_string) else ["cp", input_string, result_file] + parts = ( + ["wget -O", result_file, input_string] + if is_url(input_string) + else ["cp", input_string, result_file] + ) return result_file, " ".join(parts) @@ -331,15 +573,25 @@ def default_config_file(): return os.path.join(os.path.dirname(__file__), "refgenie.yaml") -def get_asset_vars(genome, asset_key, tag, outfolder, specific_args=None, specific_params=None, **kwargs): +def get_asset_vars( + genome, + asset_key, + tag, + outfolder, + specific_args=None, + specific_params=None, + **kwargs, +): """ Gives a dict with variables used to populate an asset path. 
""" asset_outfolder = os.path.join(outfolder, asset_key, tag) - asset_vars = {"genome": genome, - "asset": asset_key, - "tag": tag, - "asset_outfolder": asset_outfolder} + asset_vars = { + "genome": genome, + "asset": asset_key, + "tag": tag, + "asset_outfolder": asset_outfolder, + } if specific_args: asset_vars.update(specific_args) if specific_params: @@ -364,8 +616,7 @@ def refgenie_initg(rgc, genome, content_checksums): """ genome_dir = os.path.join(rgc.data_dir, genome) if is_writable(genome_dir): - output_file = os.path.join( - genome_dir, "{}_sequence_digests.tsv".format(genome)) + output_file = os.path.join(genome_dir, "{}_sequence_digests.tsv".format(genome)) with open(output_file, "w") as contents_file: wr = csv.writer(contents_file, delimiter="\t") for key, val in content_checksums.items(): @@ -373,7 +624,10 @@ def refgenie_initg(rgc, genome, content_checksums): _LOGGER.debug("sequence digests saved to: {}".format(output_file)) else: _LOGGER.warning( - "Could not save the genome sequence digests. '{}' is not writable".format(genome_dir)) + "Could not save the genome sequence digests. 
'{}' is not writable".format( + genome_dir + ) + ) def refgenie_build(gencfg, genome, asset_list, recipe_name, args): @@ -383,8 +637,11 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args): :param str gencfg: path to the genome configuration file :param argparse.Namespace args: parsed command-line options/arguments """ - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=_skip_lock(args.skip_read_lock, gencfg)) + rgc = RefGenConf( + filepath=gencfg, + writable=False, + skip_read_lock=_skip_lock(args.skip_read_lock, gencfg), + ) specified_args = _parse_user_build_input(args.files) specified_params = _parse_user_build_input(args.params) @@ -395,7 +652,7 @@ def _read_json_file(filepath): :param str filepath: path to the file to read :return dict: read data """ - with open(filepath, 'r') as f: + with open(filepath, "r") as f: data = json.load(f) return data @@ -410,11 +667,20 @@ def _read_json_file(filepath): _LOGGER.debug("Default config file: {}".format(default_config_file())) if args.config_file and not os.path.isfile(args.config_file): - _LOGGER.debug("Config file path isn't a file: {}". - format(args.config_file)) + _LOGGER.debug("Config file path isn't a file: {}".format(args.config_file)) args.config_file = default_config_file() - def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, alias, **kwargs): + def _build_asset( + genome, + asset_key, + tag, + build_pkg, + genome_outfolder, + specific_args, + specific_params, + alias, + **kwargs, + ): """ Builds assets with pypiper and updates a genome config file. @@ -428,10 +694,14 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a assets. 
""" - log_outfolder = os.path.abspath(os.path.join( - genome_outfolder, asset_key, tag, BUILD_STATS_DIR)) + log_outfolder = os.path.abspath( + os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR) + ) _LOGGER.info( - "Saving outputs to:\n- content: {}\n- logs: {}".format(genome_outfolder, log_outfolder)) + "Saving outputs to:\n- content: {}\n- logs: {}".format( + genome_outfolder, log_outfolder + ) + ) if args.docker: # Set up some docker stuff if args.volumes: @@ -441,12 +711,16 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a volumes = genome_outfolder if not _writeable(genome_outfolder): - _LOGGER.error("Insufficient permissions to write to output folder: {}". - format(genome_outfolder)) + _LOGGER.error( + "Insufficient permissions to write to output folder: {}".format( + genome_outfolder + ) + ) return pm = pypiper.PipelineManager( - name="refgenie", outfolder=log_outfolder, args=args) + name="refgenie", outfolder=log_outfolder, args=args + ) tk = pypiper.NGSTk(pm=pm) if args.docker: pm.get_container(build_pkg[CONT], volumes) @@ -455,20 +729,31 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a gat = [genome, asset_key, tag] # collect variables required to populate the command templates asset_vars = get_asset_vars( - genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs) + genome, + asset_key, + tag, + genome_outfolder, + specific_args, + specific_params, + **kwargs, + ) # populate command templates # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method - command_list_populated = [x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) - for x in build_pkg[CMD_LST]] + command_list_populated = [ + x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()}) + for x in build_pkg[CMD_LST] + ] # create output directory tk.make_dir(asset_vars["asset_outfolder"]) target = os.path.join( - 
log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)) + log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag) + ) # add target command command_list_populated.append("touch {target}".format(target=target)) - _LOGGER.debug("Command populated: '{}'".format( - " ".join(command_list_populated))) + _LOGGER.debug( + "Command populated: '{}'".format(" ".join(command_list_populated)) + ) try: # run build command signal.signal(signal.SIGINT, _handle_sigint(gat)) @@ -479,50 +764,64 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a else: # save build recipe to the JSON-formatted file recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag) - with open(os.path.join(log_outfolder, recipe_file_name), 'w') as outfile: + with open(os.path.join(log_outfolder, recipe_file_name), "w") as outfile: json.dump(build_pkg, outfile) # since the assets are always built to a standard dir structure, we # can just stitch a path together for asset digest calculation asset_dir = os.path.join(rgc.data_dir, *gat) if not os.path.exists(asset_dir): - raise OSError("Could not compute asset digest. Path does not " - "exist: {}".format(asset_dir)) + raise OSError( + "Could not compute asset digest. 
Path does not " + "exist: {}".format(asset_dir) + ) digest = get_dir_digest(asset_dir) _LOGGER.info("Asset digest: {}".format(digest)) # add updates to config file with rgc as r: if asset_key == "fasta": - r.update_genomes(genome, data={CFG_ALIASES_KEY: [alias]}, - force_digest=genome) + r.update_genomes( + genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome + ) r.update_assets( - *gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, - force_digest=genome) + *gat[0:2], + data={CFG_ASSET_DESC_KEY: build_pkg[DESC]}, + force_digest=genome, + ) r.update_tags( - *gat, force_digest=genome, - data={CFG_ASSET_PATH_KEY: asset_key, CFG_ASSET_CHECKSUM_KEY: digest}) + *gat, + force_digest=genome, + data={ + CFG_ASSET_PATH_KEY: asset_key, + CFG_ASSET_CHECKSUM_KEY: digest, + }, + ) r.update_seek_keys( - *gat, force_digest=genome, - keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()}) + *gat, + force_digest=genome, + keys={ + k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items() + }, + ) r.set_default_pointer(*gat, force_digest=genome) pm.stop_pipeline() return True for a in asset_list: asset_key = a["asset"] - asset_tag = a["tag"] or \ - rgc.get_default_tag(genome, a["asset"], use_existing=False) + asset_tag = a["tag"] or rgc.get_default_tag( + genome, a["asset"], use_existing=False + ) recipe_name = recipe_name or asset_key - if isinstance(recipe_name, dict) or \ - (isinstance(recipe_name, str) - and recipe_name in asset_build_packages.keys()): + if isinstance(recipe_name, dict) or ( + isinstance(recipe_name, str) and recipe_name in asset_build_packages.keys() + ): if isinstance(recipe_name, dict): _LOGGER.info("Using custom recipe: \n{}".format(recipe_name)) asset_build_package = _check_recipe(recipe_name) recipe_name = asset_build_package["name"] else: - asset_build_package = \ - _check_recipe(asset_build_packages[recipe_name]) + asset_build_package = _check_recipe(asset_build_packages[recipe_name]) # handle user-requested parents for 
the required assets input_assets = {} parent_assets = [] @@ -534,92 +833,170 @@ def _build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_a _LOGGER.debug(f"Custom assets requested: {args.assets}") if not specified_asset_keys and isinstance(args.assets, list): _LOGGER.warning( - "Specified parent assets format is invalid. Using defaults.") + "Specified parent assets format is invalid. Using defaults." + ) for req_asset in asset_build_package[REQ_ASSETS]: req_asset_data = parse_registry_path(req_asset[KEY]) # for each req asset see if non-default parents were requested - if specified_asset_keys is not None and req_asset_data["asset"] in specified_asset_keys: + if ( + specified_asset_keys is not None + and req_asset_data["asset"] in specified_asset_keys + ): parent_data = parse_registry_path( - specified_assets[specified_asset_keys.index(req_asset_data["asset"])]) - g, a, t, s = parent_data["genome"], \ - parent_data["asset"], \ - parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \ - parent_data["seek_key"] + specified_assets[ + specified_asset_keys.index(req_asset_data["asset"]) + ] + ) + g, a, t, s = ( + parent_data["genome"], + parent_data["asset"], + parent_data["tag"] + or rgc.get_default_tag(genome, parent_data["asset"]), + parent_data["seek_key"], + ) else: # if no custom parents requested for the req asset, use default one default = parse_registry_path(req_asset[DEFAULT]) - g, a, t, s = genome, default["asset"], \ - rgc.get_default_tag(genome, default["asset"]), \ - req_asset_data["seek_key"] + g, a, t, s = ( + genome, + default["asset"], + rgc.get_default_tag(genome, default["asset"]), + req_asset_data["seek_key"], + ) parent_assets.append( - "{}/{}:{}".format(rgc.get_genome_alias_digest(g, fallback=True), a, t)) + "{}/{}:{}".format( + rgc.get_genome_alias_digest(g, fallback=True), a, t + ) + ) input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s) _LOGGER.debug("Using parents: {}".format(", ".join(parent_assets))) 
_LOGGER.debug("Provided files: {}".format(specified_args)) _LOGGER.debug("Provided parameters: {}".format(specified_params)) for required_file in asset_build_package[REQ_FILES]: - if specified_args is None or required_file[KEY] not in specified_args.keys(): - raise ValueError("Path to the '{x}' input ({desc}) is required, but not provided. " - "Specify it with: --files {x}=/path/to/{x}_file" - .format(x=required_file[KEY], desc=required_file[DESC])) + if ( + specified_args is None + or required_file[KEY] not in specified_args.keys() + ): + raise ValueError( + "Path to the '{x}' input ({desc}) is required, but not provided. " + "Specify it with: --files {x}=/path/to/{x}_file".format( + x=required_file[KEY], desc=required_file[DESC] + ) + ) for required_param in asset_build_package[REQ_PARAMS]: if specified_params is None: specified_params = {} if required_param[KEY] not in specified_params.keys(): if required_param[DEFAULT] is None: - raise ValueError("Value for the parameter '{x}' ({desc}) is required, but not provided. " - "Specify it with: --params {x}=value" - .format(x=required_param[KEY], desc=required_param[DESC])) + raise ValueError( + "Value for the parameter '{x}' ({desc}) is required, but not provided. 
" + "Specify it with: --params {x}=value".format( + x=required_param[KEY], desc=required_param[DESC] + ) + ) else: - specified_params.update({required_param[KEY]: required_param[DEFAULT]}) - _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format( - genome, asset_key, asset_tag, recipe_name)) + specified_params.update( + {required_param[KEY]: required_param[DEFAULT]} + ) + _LOGGER.info( + "Building '{}/{}:{}' using '{}' recipe".format( + genome, asset_key, asset_tag, recipe_name + ) + ) ori_genome = genome - if recipe_name == 'fasta': - if genome in rgc.genomes_list() and 'fasta' in rgc.list_assets_by_genome(genome): + if recipe_name == "fasta": + if ( + genome in rgc.genomes_list() + and "fasta" in rgc.list_assets_by_genome(genome) + ): pretag = rgc.get_default_tag(genome, "fasta") - _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})". - format(g=genome, a=asset_key, t=pretag)) + _LOGGER.warning( + "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})".format( + g=genome, a=asset_key, t=pretag + ) + ) genome = rgc.get_genome_alias_digest(alias=genome, fallback=True) else: # if the recipe is "fasta" we first initialiaze the genome, based on the provided path to the input FASTA file genome, _ = rgc.initialize_genome( - fasta_path=specified_args["fasta"], alias=ori_genome, skip_alias_write=True) + fasta_path=specified_args["fasta"], + alias=ori_genome, + skip_alias_write=True, + ) else: try: genome = rgc.get_genome_alias_digest(genome, fallback=True) except UndefinedAliasError: - _LOGGER.error("Genome '{}' has not been initialized yet; " - "no key found for this alias".format(genome)) + _LOGGER.error( + "Genome '{}' has not been initialized yet; " + "no key found for this alias".format(genome) + ) return recipe_name = None genome_outfolder = os.path.join(args.outfolder, genome) - if not _build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder, - specified_args, specified_params, 
ori_genome, **input_assets): - log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag, - BUILD_STATS_DIR, ORI_LOG_NAME)) - _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. " - "See the log file for details: {}".format(genome, asset_key, asset_tag, log_path)) + if not _build_asset( + genome, + asset_key, + asset_tag, + asset_build_package, + genome_outfolder, + specified_args, + specified_params, + ori_genome, + **input_assets, + ): + log_path = os.path.abspath( + os.path.join( + genome_outfolder, + asset_key, + asset_tag, + BUILD_STATS_DIR, + ORI_LOG_NAME, + ) + ) + _LOGGER.info( + "'{}/{}:{}' was not added to the config, but directory has been left in place. " + "See the log file for details: {}".format( + genome, asset_key, asset_tag, log_path + ) + ) return _LOGGER.info("Finished building '{}' asset".format(asset_key)) with rgc as r: # update asset relationships r.update_relatives_assets( - genome, asset_key, asset_tag, parent_assets) # adds parents + genome, asset_key, asset_tag, parent_assets + ) # adds parents for i in parent_assets: parsed_parent = parse_registry_path(i) # adds child (currently built asset) to the parent - r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], - parsed_parent["tag"], ["{}/{}:{}".format(genome, asset_key, asset_tag)], True) + r.update_relatives_assets( + parsed_parent["genome"], + parsed_parent["asset"], + parsed_parent["tag"], + ["{}/{}:{}".format(genome, asset_key, asset_tag)], + True, + ) if args.genome_description is not None: - _LOGGER.debug("adding genome ({}) description: '{}'".format( - genome, args.genome_description)) + _LOGGER.debug( + "adding genome ({}) description: '{}'".format( + genome, args.genome_description + ) + ) r.update_genomes( - genome, {CFG_GENOME_DESC_KEY: args.genome_description}) + genome, {CFG_GENOME_DESC_KEY: args.genome_description} + ) if args.tag_description is not None: - _LOGGER.debug("adding tag 
({}/{}:{}) description: '{}'". - format(genome, asset_key, asset_tag, args.tag_description)) - r.update_tags(genome, asset_key, asset_tag, { - CFG_TAG_DESC_KEY: args.tag_description}) + _LOGGER.debug( + "adding tag ({}/{}:{}) description: '{}'".format( + genome, asset_key, asset_tag, args.tag_description + ) + ) + r.update_tags( + genome, + asset_key, + asset_tag, + {CFG_TAG_DESC_KEY: args.tag_description}, + ) rgc._symlink_alias(genome, asset_key, asset_tag) else: _raise_missing_recipe_error(recipe_name) @@ -631,8 +1008,7 @@ def _exec_list(rgc, remote, genome): # we use this func looping through the server urls and assigning a # single instance as the server for the object. That's why we can # access the data with [0] below - assemblies, assets = \ - list(rgc.listr(genome=genome, as_str=True).values())[0] + assemblies, assets = list(rgc.listr(genome=genome, as_str=True).values())[0] recipes = None # Not implemented else: pfx = "Local" @@ -656,8 +1032,7 @@ def perm_check_x(file_to_check, message_tag="genome directory"): _LOGGER.error(msg) raise ValueError(msg) if not os.access(file_to_check, os.X_OK): - _LOGGER.error( - "Insufficient permissions to write to {}: ".format(file_to_check)) + _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) return False return True @@ -682,8 +1057,11 @@ def main(): sys.exit(1) gencfg = select_genome_config( - filename=args.genome_config, check_exist=not args.command == INIT_CMD, - on_missing=lambda fp: fp, strict_env=True) + filename=args.genome_config, + check_exist=not args.command == INIT_CMD, + on_missing=lambda fp: fp, + strict_env=True, + ) if gencfg is None: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) @@ -693,10 +1071,8 @@ def main(): # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag if "asset_registry_paths" in args and args.asset_registry_paths: - 
_LOGGER.debug("Found registry_path: {}".format( - args.asset_registry_paths)) - asset_list = [parse_registry_path(x) - for x in args.asset_registry_paths] + _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) + asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] for a in asset_list: # every asset must have a genome, either provided via registry path @@ -705,13 +1081,19 @@ def main(): if args.genome: a["genome"] = args.genome else: - _LOGGER.error("Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.". - format(a["genome"], a["asset"], a["tag"])) + _LOGGER.error( + "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format( + a["genome"], a["asset"], a["tag"] + ) + ) sys.exit(1) else: if args.genome and args.genome != a["genome"]: _LOGGER.warn( - "Two different genomes specified for asset '{}'.".format(a["asset"])) + "Two different genomes specified for asset '{}'.".format( + a["asset"] + ) + ) else: if args.command in GENOME_ONLY_REQUIRED and not args.genome: @@ -723,20 +1105,25 @@ def main(): if args.command == INIT_CMD: _LOGGER.debug("Initializing refgenie genome configuration") - entries = OrderedDict({ - CFG_VERSION_KEY: REQ_CFG_VERSION, - CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), - CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], - CFG_GENOMES_KEY: None}) + entries = OrderedDict( + { + CFG_VERSION_KEY: REQ_CFG_VERSION, + CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), + CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], + CFG_GENOMES_KEY: None, + } + ) if args.settings_json: if os.path.isfile(args.settings_json): - with open(args.settings_json, 'r') as json_file: + with open(args.settings_json, "r") as json_file: data = json.load(json_file) entries.update(data) else: raise FileNotFoundError( - "JSON file with config init settings does not exist: {}". 
- format(args.settings_json)) + "JSON file with config init settings does not exist: {}".format( + args.settings_json + ) + ) if args.genome_folder: entries.update({CFG_FOLDER_KEY: args.genome_folder}) if args.remote_url_base: @@ -744,8 +1131,7 @@ def main(): if args.genome_archive_folder: entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) if args.genome_archive_config: - entries.update( - {CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) + entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) _LOGGER.debug("initializing with entries: {}".format(entries)) rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) rgc.initialize_config_file(os.path.abspath(gencfg)) @@ -757,8 +1143,7 @@ def main(): recipe_name = None if args.recipe: if len(asset_list) > 1: - _LOGGER.error( - "Recipes cannot be specified for multi-asset builds") + _LOGGER.error("Recipes cannot be specified for multi-asset builds") sys.exit(1) recipe_name = args.recipe if args.requirements: @@ -769,23 +1154,30 @@ def main(): _LOGGER.info("'{}' recipe requirements: ".format(recipe)) _make_asset_build_reqs(recipe) sys.exit(0) - refgenie_build( - gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) + refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) check = args.check_exists if args.check_exists else None for a in asset_list: - _LOGGER.debug("getting asset: '{}/{}.{}:{}'". 
- format(a["genome"], a["asset"], a["seek_key"], a["tag"])) - print(rgc.seek(a["genome"], a["asset"], a["tag"], a["seek_key"], - strict_exists=check)) + _LOGGER.debug( + "getting asset: '{}/{}.{}:{}'".format( + a["genome"], a["asset"], a["seek_key"], a["tag"] + ) + ) + print( + rgc.seek( + a["genome"], + a["asset"], + a["tag"], + a["seek_key"], + strict_exists=check, + ) + ) return elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") @@ -793,13 +1185,17 @@ def main(): sk = args.seek_keys if sk: sk = json.loads(args.seek_keys) - rgc.add(path=args.path, genome=asset_list[0]["genome"], - asset=asset_list[0]["asset"], tag=asset_list[0]["tag"], - seek_keys=sk, force=args.force) + rgc.add( + path=args.path, + genome=asset_list[0]["genome"], + asset=asset_list[0]["asset"], + tag=asset_list[0]["tag"], + seek_keys=sk, + force=args.force, + ) elif args.command == PULL_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) # existing assets overwriting if args.no_overwrite: @@ -826,17 +1222,21 @@ def main(): if not perm_check_x(outdir): return if not _single_folder_writeable(outdir): - _LOGGER.error( - "Insufficient permissions to write to: {}".format(outdir)) + _LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) return for a in asset_list: - rgc.pull(a["genome"], a["asset"], a["tag"], force=force, - force_large=force_large, size_cutoff=args.size_cutoff) + rgc.pull( + a["genome"], + a["asset"], + a["tag"], + force=force, + force_large=force_large, + size_cutoff=args.size_cutoff, + ) elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False, - 
skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) console = Console() if args.command == LIST_REMOTE_CMD: num_servers = 0 @@ -845,7 +1245,8 @@ def main(): num_servers += 1 try: table = rgc.get_asset_table( - genomes=args.genome, server_url=server_url) + genomes=args.genome, server_url=server_url + ) except (DownloadJsonError, ConnectionError): bad_servers.append(server_url) continue @@ -853,8 +1254,9 @@ def main(): console.print(table) if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: _LOGGER.error( - "Could not list assets from the following servers: {}". - format(bad_servers) + "Could not list assets from the following servers: {}".format( + bad_servers + ) ) else: if args.recipes: @@ -863,46 +1265,46 @@ def main(): console.print(rgc.get_asset_table(genomes=args.genome)) elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) print(rgc.getseq(args.genome, args.locus)) elif args.command == REMOVE_CMD: force = args.force - rgc = RefGenConf(filepath=gencfg, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) for a in asset_list: - a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"], - use_existing=False) + a["tag"] = a["tag"] or rgc.get_default_tag( + a["genome"], a["asset"], use_existing=False + ) _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) if a["seek_key"] is not None: - raise NotImplementedError( - "You can't remove a specific seek_key.") + raise NotImplementedError("You can't remove a specific seek_key.") gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} try: if not rgc.is_asset_complete(**gat): with rgc as r: r.cfg_remove_assets(**gat, aliases=args.aliases) - _LOGGER.info("Removed an incomplete asset " - "'{genome}/{asset}:{tag}'".format(*gat)) 
+ _LOGGER.info( + "Removed an incomplete asset " + "'{genome}/{asset}:{tag}'".format(*gat) + ) return except (KeyError, MissingAssetError, MissingGenomeError): - _LOGGER.info("Asset '{genome}/{asset}:{tag}' does not exist" - .format(**gat)) + _LOGGER.info( + "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat) + ) return if len(asset_list) > 1: - if not query_yes_no("Are you sure you want to remove {} assets?". - format(len(asset_list))): + if not query_yes_no( + "Are you sure you want to remove {} assets?".format(len(asset_list)) + ): _LOGGER.info("Action aborted by the user") return force = True for a in asset_list: - rgc.remove(genome=a["genome"], asset=a["asset"], - tag=a["tag"], force=force) + rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) elif args.command == TAG_CMD: - rgc = RefGenConf(filepath=gencfg, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) if len(asset_list) > 1: raise NotImplementedError("Can only tag 1 asset at a time") if args.default: @@ -913,8 +1315,7 @@ def main(): rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if len(asset_list) == 1: g, a = asset_list[0]["genome"], asset_list[0]["asset"] t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) @@ -926,13 +1327,11 @@ def main(): print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) return elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.subscribe(urls=args.genome_server, reset=args.reset) return elif args.command == UNSUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) + rgc = 
RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) rgc.unsubscribe(urls=args.genome_server) return elif args.command == ALIAS_CMD: @@ -946,24 +1345,29 @@ def main(): console.print(rgc.genome_aliases_table) if args.subcommand == ALIAS_SET_CMD: - rgc.set_genome_alias(digest=args.digest, genome=args.aliases, - reset_digest=args.reset, create_genome=args.force) + rgc.set_genome_alias( + digest=args.digest, + genome=args.aliases, + reset_digest=args.reset, + create_genome=args.force, + ) return elif args.subcommand == ALIAS_REMOVE_CMD: rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) return elif args.command == COMPARE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, - skip_read_lock=skip_read_lock) - res = rgc.compare(args.genome1[0], args.genome2[0], - explain=not args.no_explanation) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + res = rgc.compare( + args.genome1[0], args.genome2[0], explain=not args.no_explanation + ) if args.no_explanation: print(res) elif args.command == UPGRADE_CMD: - upgrade_config(target_version=args.target_version, filepath=gencfg, - force=args.force) + upgrade_config( + target_version=args.target_version, filepath=gencfg, force=args.force + ) def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): @@ -977,12 +1381,18 @@ def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entitie """ subclass = "asset" if entity_class == "genome" else "tag" if os.path.basename(directory) == asset_dict[entity_class]: - _LOGGER.info("Last {sub} for {ec} '{en}' has been removed, removing {ec} directory". 
- format(sub=subclass, ec=entity_class, en=asset_dict[entity_class])) + _LOGGER.info( + "Last {sub} for {ec} '{en}' has been removed, removing {ec} directory".format( + sub=subclass, ec=entity_class, en=asset_dict[entity_class] + ) + ) removed_entities.append(_remove(directory)) else: - _LOGGER.debug("Didn't remove '{}' since it does not match the {} name: {}". - format(directory, entity_class, asset_dict[entity_class])) + _LOGGER.debug( + "Didn't remove '{}' since it does not match the {} name: {}".format( + directory, entity_class, asset_dict[entity_class] + ) + ) def _remove(path): @@ -1024,6 +1434,7 @@ def _make_asset_build_reqs(asset): :params str asset: name of the asset """ + def _format_reqs(req_list): """ @@ -1031,19 +1442,32 @@ def _format_reqs(req_list): :return list[str]: """ templ = "\t{} ({})" - return [templ.format(req[KEY], req[DESC]) if DEFAULT not in req - else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) for req in req_list] + return [ + templ.format(req[KEY], req[DESC]) + if DEFAULT not in req + else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) + for req in req_list + ] reqs_list = [] if asset_build_packages[asset][REQ_FILES]: reqs_list.append( - "- files:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])))) + "- files:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])) + ) + ) if asset_build_packages[asset][REQ_ASSETS]: reqs_list.append( - "- assets:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])))) + "- assets:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])) + ) + ) if asset_build_packages[asset][REQ_PARAMS]: reqs_list.append( - "- params:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])))) + "- params:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])) + ) + ) _LOGGER.info("\n".join(reqs_list)) @@ -1056,21 +1480,30 @@ def 
get_dir_digest(path, pm=None): :return str: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 """ if not is_command_callable("md5sum"): - raise OSError("md5sum command line tool is required for asset digest calculation. \n" - "Install and try again, e.g on macOS: 'brew install md5sha1sum'") - cmd = "cd {}; find . -type f -not -path './" + BUILD_STATS_DIR + \ - "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" + raise OSError( + "md5sum command line tool is required for asset digest calculation. \n" + "Install and try again, e.g on macOS: 'brew install md5sha1sum'" + ) + cmd = ( + "cd {}; find . -type f -not -path './" + + BUILD_STATS_DIR + + "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" + ) if isinstance(pm, pypiper.PipelineManager): x = pm.checkprint(cmd.format(path)) else: try: from subprocess import check_output + x = check_output(cmd.format(path), shell=True).decode("utf-8") except Exception as e: - _LOGGER.warning("{}: could not calculate digest for '{}'".format( - e.__class__.__name__, path)) + _LOGGER.warning( + "{}: could not calculate digest for '{}'".format( + e.__class__.__name__, path + ) + ) return - return str(sub(r'\W+', '', x)) # strips non-alphanumeric + return str(sub(r"\W+", "", x)) # strips non-alphanumeric def _handle_sigint(gat): @@ -1080,9 +1513,11 @@ def _handle_sigint(gat): :param list gat: a list of genome, asset and tag. Used for a message generation. 
:return function: the SIGINT handling function """ + def handle(sig, frame): _LOGGER.warning("\nThe build was interrupted: {}/{}:{}".format(*gat)) sys.exit(0) + return handle @@ -1097,7 +1532,11 @@ def _parse_user_build_input(input): lst = [] for i in input or []: lst.extend(i) - return {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} if lst is not None else lst + return ( + {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} + if lst is not None + else lst + ) def _raise_missing_recipe_error(recipe): @@ -1107,8 +1546,11 @@ def _raise_missing_recipe_error(recipe): :param str recipe: recipe name :raise MissingRecipeError: always """ - raise MissingRecipeError("Recipe '{}' not found. Available recipes: {}". - format(recipe, ", ".join(list(asset_build_packages.keys())))) + raise MissingRecipeError( + "Recipe '{}' not found. Available recipes: {}".format( + recipe, ", ".join(list(asset_build_packages.keys())) + ) + ) def _check_recipe(recipe): @@ -1122,12 +1564,15 @@ def _check_recipe(recipe): # experimental feature; recipe jsonschema validation from jsonschema import validate from yacman import load_yaml - SCHEMA_SRC = os.path.join(os.path.dirname( - os.path.abspath(__file__)), "schemas", "recipe_schema.yaml") + + SCHEMA_SRC = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "schemas", "recipe_schema.yaml" + ) if os.path.exists(SCHEMA_SRC): validate(recipe, load_yaml(filepath=SCHEMA_SRC)) _LOGGER.info( - "Recipe validated successfully against a schema: {}".format(SCHEMA_SRC)) + "Recipe validated successfully against a schema: {}".format(SCHEMA_SRC) + ) else: _LOGGER.warning("Recipe schema not found: {}".format(SCHEMA_SRC)) # end of validation @@ -1139,23 +1584,28 @@ def _check_recipe(recipe): if k not in unique: unique.append(k) else: - raise ValueError("The recipe contains a duplicated requirement" - " key '{}', which is not permitted.".format(k)) + raise ValueError( + "The recipe contains a duplicated requirement" + " key '{}', which is 
not permitted.".format(k) + ) return recipe -def _seek(rgc, genome_name, asset_name, tag_name=None, - seek_key=None, enclosing_dir=False): +def _seek( + rgc, genome_name, asset_name, tag_name=None, seek_key=None, enclosing_dir=False +): """ Strict seek. Most use cases in this package require file existence check in seek. This function makes it easier """ - return rgc.seek_src(genome_name=genome_name, - asset_name=asset_name, - tag_name=tag_name, - seek_key=seek_key, - enclosing_dir=enclosing_dir, - strict_exists=True) + return rgc.seek_src( + genome_name=genome_name, + asset_name=asset_name, + tag_name=tag_name, + seek_key=seek_key, + enclosing_dir=enclosing_dir, + strict_exists=True, + ) def _skip_lock(skip_arg, cfg): diff --git a/refgenie/refget.py b/refgenie/refget.py index 1d5f4d82..b0861260 100644 --- a/refgenie/refget.py +++ b/refgenie/refget.py @@ -40,4 +40,4 @@ def fasta_checksum(fa_file, checksum_function=trunc512_digest): content_checksums[k] = checksum_function(str(fa_object[k])) collection_string = ";".join([":".join(i) for i in content_checksums.items()]) collection_checksum = checksum_function(collection_string) - return collection_checksum, content_checksums \ No newline at end of file + return collection_checksum, content_checksums diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index e975f09d..dd79a365 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,4 +1,5 @@ logmuse>=0.2.6 -refgenconf>=0.10.0-dev +# refgenconf>=0.10.0-dev piper>=0.12.1 -pyfaidx>=0.5.5.2 \ No newline at end of file +pyfaidx>=0.5.5.2 +yacman>=0.7.1 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 493054a9..c368db1d 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,2 +1 @@ --e git+git://github.com/databio/yacman@dev#egg=yacman -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No 
newline at end of file diff --git a/setup.py b/setup.py index d1f81150..f0c50dcf 100755 --- a/setup.py +++ b/setup.py @@ -8,37 +8,38 @@ for line in reqs_file: if not line.strip(): continue - #DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + # DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) DEPENDENCIES.append(line) # Additional keyword arguments for setup() extra = {"install_requires": DEPENDENCIES} # 2to3 -if sys.version_info >= (3, ): +if sys.version_info >= (3,): extra["use_2to3"] = True -with open("refgenie/_version.py", 'r') as versionfile: +with open("refgenie/_version.py", "r") as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") # Handle the pypi README formatting. try: import pypandoc - long_description = pypandoc.convert_file('README.md', 'rst') + + long_description = pypandoc.convert_file("README.md", "rst") msg = "\033[032mPandoc conversion succeeded.\033[0m" -except(IOError, ImportError, OSError): +except (IOError, ImportError, OSError): msg = "\033[0;31mWarning: pandoc conversion failed!\033[0m" - long_description = open('README.md').read() + long_description = open("README.md").read() setup( - name='refgenie', + name="refgenie", packages=["refgenie"], version=version, - description='Refgenie creates a standardized folder structure for reference genome files and indexes. ' - 'You can download pre-built genomes or build your own for any fasta file', + description="Refgenie creates a standardized folder structure for reference genome files and indexes. 
" + "You can download pre-built genomes or build your own for any fasta file", long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", @@ -46,21 +47,21 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], license="BSD2", entry_points={ "console_scripts": [ - 'refgenie = refgenie.__main__:main', - 'import_igenome = refgenie.add_assets_igenome:main' + "refgenie = refgenie.__main__:main", + "import_igenome = refgenie.add_assets_igenome:main", ], }, keywords="bioinformatics, sequencing, ngs", package_data={"refgenie": [os.path.join("refgenie", "*")]}, include_package_data=True, - url='http://refgenie.databio.org', - author=u'Nathan Sheffield, Vince Reuter, Michal Stolarczyk', + url="http://refgenie.databio.org", + author=u"Nathan Sheffield, Vince Reuter, Michal Stolarczyk", **extra ) -print(msg) \ No newline at end of file +print(msg) From 767d868a30bcc35900045c1470edb30c6382d87b Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Wed, 24 Feb 2021 23:27:15 -0500 Subject: [PATCH 096/110] create a table for each compatible server --- refgenie/refgenie.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 4d3250c4..c2943db0 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -4,6 +4,7 @@ from shutil import rmtree from re import sub from requests import ConnectionError +from requests.exceptions import MissingSchema from rich.console import Console import os @@ -1247,7 +1248,7 @@ def main(): table = rgc.get_asset_table( genomes=args.genome, server_url=server_url ) - except (DownloadJsonError, ConnectionError): + except (DownloadJsonError, 
ConnectionError, MissingSchema): bad_servers.append(server_url) continue else: From e718dd9a48c764f5239d9f838424735cc1b12f0b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 26 Feb 2021 12:29:50 -0500 Subject: [PATCH 097/110] make default server value a list --- refgenie/refgenie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c2943db0..c12a2aff 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -118,7 +118,7 @@ def add_subparser(cmd, msg, subparsers): "-s", "--genome-server", nargs="+", - default=DEFAULT_SERVER, + default=[DEFAULT_SERVER], help="URL(s) to use for the {} attribute in config file. Default: {}.".format( CFG_SERVERS_KEY, DEFAULT_SERVER ), From a836ce7ef1bb9280e250b95aa25230e139fcd895 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 2 Mar 2021 16:50:47 -0500 Subject: [PATCH 098/110] update dev reqs --- requirements/requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index c368db1d..493054a9 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1 +1,2 @@ +-e git+git://github.com/databio/yacman@dev#egg=yacman -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From d4d353dda0b57f1ad6aaf7e6c1f1559a51b3b7e0 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 2 Mar 2021 16:55:05 -0500 Subject: [PATCH 099/110] update digest in action --- .github/workflows/test-refgenie-cli.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index 5b1131d2..e535a0a5 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -45,7 +45,7 @@ jobs: run: | refgenie build -c genomes/g.yaml t7/fasta --files fasta=tests/data/t7.fa.gz --recipe 
tests/data/recipe_parent.json ./tests/assert_in_file.sh genomes/g.yaml t7 0 - ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b 0 # this is a digest that should be produced from this FASTA file + ./tests/assert_in_file.sh genomes/g.yaml 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 0 # this is a digest that should be produced from this FASTA file - name: refgenie build fasta_child (child asset) run: | @@ -57,10 +57,10 @@ jobs: echo "Error: `refgenie seek -c genomes/g.yaml t7/fasta_child` does not exist." exit 1 fi - if [ -d genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default ]; then - echo "'genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default' exists." + if [ -d genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default ]; then + echo "'genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default' exists." else - echo "Error: 'genomes/data/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child/default' does not exist." + echo "Error: 'genomes/data/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child/default' does not exist." 
exit 1 fi @@ -73,7 +73,7 @@ jobs: - name: refgenie set aliases run: | - refgenie alias set -c genomes/g.yaml --aliases t7_new t7_new1 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b + refgenie alias set -c genomes/g.yaml --aliases t7_new t7_new1 --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 ./tests/assert_in_file.sh genomes/g.yaml t7_new 0 ./tests/assert_in_file.sh genomes/g.yaml t7_new1 0 if [ -L `refgenie seek -c genomes/g.yaml t7_new/fasta` ]; then @@ -91,16 +91,16 @@ jobs: - name: refgenie remove aliases run: | - refgenie alias set -c genomes/g.yaml --aliases t7_another --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b - refgenie alias remove -c genomes/g.yaml --aliases t7_new t7_new1 t7 --digest 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b + refgenie alias set -c genomes/g.yaml --aliases t7_another --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 + refgenie alias remove -c genomes/g.yaml --aliases t7_new t7_new1 t7 --digest 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905 ./tests/assert_in_file.sh genomes/g.yaml t7_new 1 ./tests/assert_in_file.sh genomes/g.yaml t7_new1 1 ./tests/assert_in_file.sh genomes/g.yaml t7_another 0 - if [ -L genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz ]; then - echo "'genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz' exists." + if [ -L genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz ]; then + echo "'genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz' exists." exit 1 else - echo "Error: 'genomes/alias/t7_new/fasta/default/162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b.fa.gz' does not exist." + echo "Error: 'genomes/alias/t7_new/fasta/default/6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905.fa.gz' does not exist." 
fi - name: refgenie get aliases @@ -137,4 +137,4 @@ jobs: run: | refgenie remove -c genomes/g.yaml t7_another/fasta_child -f ./tests/assert_in_file.sh genomes/g.yaml fasta_child 1 - ./tests/assert_in_file.sh genomes/g.yaml 162a8922a1529a5ec1ce0e69e65d7476936258586a79ba0b/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list + ./tests/assert_in_file.sh genomes/g.yaml 6c5f19c9c2850e62cc3f89b04047fa05eee911662bd77905/fasta_child:new_tag 1 # test if the entry was removed from the fasta children list From 7d1f1ac82a3661959345efcee3ea4667cc780b8f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 3 Mar 2021 16:30:56 -0500 Subject: [PATCH 100/110] add tgMap recipe --- refgenie/asset_build_packages.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 63726eec..7329033d 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -287,6 +287,16 @@ "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}", ], }, + "tgMap": { + DESC: "Transcript to gene map file, containing two columns mapping of each transcript present in the reference to the corresponding gene.", + REQ_FILES: [], + REQ_ASSETS: [{KEY: "salmon_partial_sa_index", DEFAULT: "salmon_partial_sa_index", DESC: "partial salmon index asset"}], + REQ_PARAMS: [], + ASSETS: {"tgMap": "{genome}_txp2gene.tsv"}, + CMD_LST: [ + "grep '^>' {salmon_partial_sa_index}/gentrome.fa | cut -d ' ' -f 1,7 | tr -s ' ' '\\t' | sed 's/[>'gene_symbol:']//g' > {asset_outfolder}/{genome}_txp2gene.tsv", + ], + }, "epilog_index": { DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller", REQ_FILES: [], From 63ac5f13e6a388d7c41b02b4aa46bda4b43a4526 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 4 Mar 2021 12:22:42 -0500 Subject: [PATCH 101/110] reformat --- refgenie/asset_build_packages.py | 
8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py index 7329033d..fc031eb1 100644 --- a/refgenie/asset_build_packages.py +++ b/refgenie/asset_build_packages.py @@ -290,7 +290,13 @@ "tgMap": { DESC: "Transcript to gene map file, containing two columns mapping of each transcript present in the reference to the corresponding gene.", REQ_FILES: [], - REQ_ASSETS: [{KEY: "salmon_partial_sa_index", DEFAULT: "salmon_partial_sa_index", DESC: "partial salmon index asset"}], + REQ_ASSETS: [ + { + KEY: "salmon_partial_sa_index", + DEFAULT: "salmon_partial_sa_index", + DESC: "partial salmon index asset", + } + ], REQ_PARAMS: [], ASSETS: {"tgMap": "{genome}_txp2gene.tsv"}, CMD_LST: [ From e177a43628762540efc584b9a5050840b04c178d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Mar 2021 12:36:48 -0500 Subject: [PATCH 102/110] major code refactor --- refgenie/__main__.py | 2 +- refgenie/add_assets_igenome.py | 20 +- refgenie/argparser.py | 484 ++++++++++++++ refgenie/cli.py | 422 ++++++++++++ refgenie/exceptions.py | 2 +- refgenie/helpers.py | 63 ++ refgenie/refgenie.py | 1098 +------------------------------- 7 files changed, 997 insertions(+), 1094 deletions(-) create mode 100644 refgenie/argparser.py create mode 100644 refgenie/cli.py create mode 100644 refgenie/helpers.py diff --git a/refgenie/__main__.py b/refgenie/__main__.py index 3beb081f..1fa5c244 100644 --- a/refgenie/__main__.py +++ b/refgenie/__main__.py @@ -1,4 +1,4 @@ -from .refgenie import main +from .cli import main import sys if __name__ == "__main__": diff --git a/refgenie/add_assets_igenome.py b/refgenie/add_assets_igenome.py index 7d0778f1..c09a2bb2 100644 --- a/refgenie/add_assets_igenome.py +++ b/refgenie/add_assets_igenome.py @@ -6,7 +6,7 @@ Build/ Annotation/ Sequence/ """ -from .refgenie import _seek, _remove +from .refgenie import _seek from .exceptions import MissingGenomeConfigError from ubiquerg import 
untar, mkabs, query_yes_no @@ -200,6 +200,24 @@ def main(): print("Added assets: \n- {}".format("\n- ".join(processed))) +def _remove(path): + """ + remove asset if it is a dir or a file + + :param str path: path to the entity to remove, either a file or a dir + :return str: removed path + """ + from shutil import rmtree + + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + rmtree(path) + else: + raise ValueError("path '{}' is neither a file nor a dir.".format(path)) + return path + + if __name__ == "__main__": try: sys.exit(main()) diff --git a/refgenie/argparser.py b/refgenie/argparser.py new file mode 100644 index 00000000..ff7df2c7 --- /dev/null +++ b/refgenie/argparser.py @@ -0,0 +1,484 @@ +import pypiper + +from ubiquerg import VersionInHelpParser + +from ._version import __version__ +from .const import * +from refgenconf import __version__ as rgc_version + +from argparse import HelpFormatter + + +def build_argparser(): + """ + Builds argument parser. + + :return argparse.ArgumentParser + """ + + banner = "%(prog)s - reference genome asset manager" + additional_description = "\nhttps://refgenie.databio.org" + + parser = VersionInHelpParser( + prog="refgenie", + version=f"{__version__} | refgenconf {rgc_version}", + description=banner, + epilog=additional_description, + ) + + subparsers = parser.add_subparsers(dest="command") + + def add_subparser(cmd, msg, subparsers): + return subparsers.add_parser( + cmd, + description=msg, + help=msg, + formatter_class=lambda prog: HelpFormatter( + prog, max_help_position=40, width=90 + ), + ) + + sps = {} + for cmd, desc in SUBPARSER_MESSAGES.items(): + sps[cmd] = add_subparser(cmd, desc, subparsers) + # alias is nested and alias subcommands require config path + if cmd == ALIAS_CMD: + continue + # It's required for init + sps[cmd].add_argument( + "-c", + "--genome-config", + required=(cmd == INIT_CMD), + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. 
Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) + sps[cmd].add_argument( + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) + + # upgrade: upgrade config and alter file structure to the target version + sps[UPGRADE_CMD].add_argument( + "-v", + "--target-version", + required=True, + metavar="V", + help="Target config version for the upgrade.", + ) + sps[UPGRADE_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + sps[INIT_CMD].add_argument( + "-s", + "--genome-server", + nargs="+", + default=[DEFAULT_SERVER], + help="URL(s) to use for the {} attribute in config file. Default: {}.".format( + CFG_SERVERS_KEY, DEFAULT_SERVER + ), + ) + sps[INIT_CMD].add_argument( + "-f", + "--genome-folder", + help="Absolute path to parent folder refgenie-managed assets.", + ) + sps[INIT_CMD].add_argument( + "-a", + "--genome-archive-folder", + help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-b", + "--genome-archive-config", + help="Absolute path to desired archive config file; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-u", + "--remote-url-base", + help="URL to use as an alternative, remote archive location; used by refgenieserver.", + ) + sps[INIT_CMD].add_argument( + "-j", + "--settings-json", + help="Absolute path to a JSON file with the key " + "value pairs to inialize the configuration " + "file with. Overwritten by itemized specifications.", + ) + sps[BUILD_CMD] = pypiper.add_pypiper_args( + sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"] + ) + + # Add any arguments specific to subcommands. + + sps[BUILD_CMD].add_argument( + "--tag-description", + required=False, + default=None, + type=str, + help="Add tag level description (e.g. 
built with version 0.3.2).", + ) + + sps[BUILD_CMD].add_argument( + "--genome-description", + required=False, + default=None, + type=str, + help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).", + ) + + sps[BUILD_CMD].add_argument( + "-d", + "--docker", + action="store_true", + help="Run all commands in the refgenie docker container.", + ) + + sps[BUILD_CMD].add_argument( + "--assets", + nargs="+", + action="append", + required=False, + default=None, + help="Override the default genome, asset and tag of the parents" + " (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).", + ) + + sps[BUILD_CMD].add_argument( + "--files", + nargs="+", + action="append", + required=False, + default=None, + help="Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).", + ) + + sps[BUILD_CMD].add_argument( + "--params", + nargs="+", + action="append", + required=False, + default=None, + help="Provide required parameter values (e.g. param1=value1).", + ) + + sps[BUILD_CMD].add_argument( + "-v", + "--volumes", + nargs="+", + required=False, + default=None, + help="If using docker, also mount these folders as volumes.", + ) + + sps[BUILD_CMD].add_argument( + "-o", + "--outfolder", + dest="outfolder", + required=False, + default=None, + help="Override the default path to genomes folder, which is the " + "genome_folder attribute in the genome configuration file.", + ) + + sps[BUILD_CMD].add_argument( + "-q", + "--requirements", + action="store_true", + help="Show the build requirements for the specified asset and exit.", + ) + + sps[BUILD_CMD].add_argument( + "-r", + "--recipe", + required=False, + default=None, + type=str, + help="Provide a recipe to use.", + ) + + alias_subparser = sps[ALIAS_CMD] + alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand") + + alias_sps = {} + for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): + alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) + 
alias_sps[cmd].add_argument( + "-c", + "--genome-config", + required=False, + dest="genome_config", + metavar="C", + help="Path to local genome configuration file. Optional if {} environment variable is set.".format( + ", ".join(CFG_ENV_VARS) + ), + ) + alias_sps[cmd].add_argument( + "--skip-read-lock", + required=False, + action="store_true", + help="Whether the config file should not be locked for reading", + ) + + alias_sps[ALIAS_SET_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to set; single if the digest is to be retrieved from the server.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-d", + "--digest", + metavar="D", + required=False, + type=str, + help="Digest to set; leave out if the digest is to be retrieved from the server.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-r", + "--reset", + action="store_true", + help="Whether all the aliases should be removed prior to setting new ones.", + ) + alias_sps[ALIAS_SET_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Whether the action should be forced, if genome does not exist.", + ) + + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + default=None, + type=str, + nargs="+", + help="Aliases to remove.", + ) + alias_sps[ALIAS_REMOVE_CMD].add_argument( + "-d", "--digest", metavar="D", required=True, type=str, help="Digest to remove." 
+ ) + + alias_sps[ALIAS_GET_CMD].add_argument( + "-a", + "--aliases", + metavar="A", + required=False, + type=str, + nargs="+", + help="Aliases to get the digests for.", + ) + + sps[COMPARE_CMD].add_argument( + "genome1", + metavar="GENOME1", + type=str, + nargs=1, + help="First genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "genome2", + metavar="GENOME2", + type=str, + nargs=1, + help="Second genome for compatibility check.", + ) + sps[COMPARE_CMD].add_argument( + "-e", + "--no-explanation", + action="store_true", + help="Do not print compatibility code explanation.", + ) + + # add 'genome' argument to many commands + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + GETSEQ_CMD, + TAG_CMD, + ID_CMD, + ]: + # genome is not required for listing actions + sps[cmd].add_argument( + "-g", + "--genome", + required=cmd in GETSEQ_CMD, + metavar="G", + help="Reference assembly ID, e.g. mm10.", + ) + + for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: + sps[cmd].add_argument( + "-g", + "--genome", + required=False, + type=str, + metavar="G", + nargs="*", + help="Reference assembly ID, e.g. mm10.", + ) + + for cmd in [ + PULL_CMD, + GET_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + REMOVE_CMD, + TAG_CMD, + ID_CMD, + ]: + sps[cmd].add_argument( + "asset_registry_paths", + metavar="asset-registry-paths", + type=str, + nargs="+", + help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" + + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."), + ) + + sps[LIST_LOCAL_CMD].add_argument( + "-r", "--recipes", action="store_true", help="List available recipes." 
+ ) + + for cmd in [REMOVE_CMD, INSERT_CMD]: + sps[cmd].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + sps[REMOVE_CMD].add_argument( + "-a", + "--aliases", + action="store_true", + help="Remove the genome alias if last asset for that genome is removed.", + ) + force_group = sps[PULL_CMD].add_argument_group( + title="Prompt handling", + description="These flags configure the pull prompt responses.", + ) + + overwrite_group = force_group.add_mutually_exclusive_group() + + overwrite_group.add_argument( + "--no-overwrite", action="store_true", help="Do not overwrite if asset exists." + ) + + overwrite_group.add_argument( + "--force-overwrite", action="store_true", help="Overwrite if asset exists." + ) + + large_group = force_group.add_mutually_exclusive_group() + + large_group.add_argument( + "--no-large", action="store_true", help="Do not pull archives over 5GB." + ) + + large_group.add_argument( + "--pull-large", + action="store_true", + help="Pull any archive, regardless of its size.", + ) + + force_group.add_argument( + "--size-cutoff", + type=float, + default=10, + metavar="S", + help="Maximum archive file size to download with no confirmation required (in GB, default: 10)", + ) + + force_group.add_argument( + "-b", + "--batch", + action="store_true", + help="Use batch mode: pull large archives, do no overwrite", + ) + + sps[INSERT_CMD].add_argument( + "-p", "--path", required=True, metavar="P", help="Relative local path to asset." + ) + + sps[INSERT_CMD].add_argument( + "-s", + "--seek-keys", + required=False, + type=str, + metavar="S", + help=""" + String representation of a JSON object with seek_keys, + e.g. '{"seek_key1": "file.txt"}' + """, + ) + + sps[GETSEQ_CMD].add_argument( + "-l", + "--locus", + required=True, + help="Coordinates of desired sequence; e.g. 
'chr1:50000-50200'.", + ) + + sps[GET_ASSET_CMD].add_argument( + "-e", + "--check-exists", + required=False, + action="store_true", + help="Whether the returned asset path should be checked for existence on disk.", + ) + + sps[TAG_CMD].add_argument( + "-f", + "--force", + action="store_true", + help="Do not prompt before action, approve upfront.", + ) + + group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) + + group.add_argument("-t", "--tag", type=str, help="Tag to assign to an asset.") + + group.add_argument( + "-d", + "--default", + action="store_true", + help="Set the selected asset tag as the default one.", + ) + + sps[SUBSCRIBE_CMD].add_argument( + "-r", + "--reset", + action="store_true", + help="Overwrite the current list of server URLs.", + ) + + for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]: + sps[cmd].add_argument( + "-s", + "--genome-server", + nargs="+", + required=True, + help="One or more URLs to {action} the {key} attribute in config file.".format( + action="add to" if cmd == SUBSCRIBE_CMD else "remove from", + key=CFG_SERVERS_KEY, + ), + ) + + return parser diff --git a/refgenie/cli.py b/refgenie/cli.py new file mode 100644 index 00000000..234b36a7 --- /dev/null +++ b/refgenie/cli.py @@ -0,0 +1,422 @@ +import logmuse +import sys +import json + +from .argparser import build_argparser +from .refgenie import parse_registry_path, _skip_lock +from ._version import __version__ +from .const import * +from .exceptions import * +from .asset_build_packages import * +from .refgenie import refgenie_build +from .helpers import _raise_missing_recipe_error, _single_folder_writeable + +from refgenconf import ( + RefGenConf, + MissingAssetError, + MissingGenomeError, + DownloadJsonError, + upgrade_config, + __version__ as rgc_version, + select_genome_config, +) +from ubiquerg import query_yes_no +from requests.exceptions import MissingSchema + +from collections import OrderedDict +from rich.console import Console + + +def main(): + """ Primary workflow 
""" + parser = logmuse.add_logging_options(build_argparser()) + args, remaining_args = parser.parse_known_args() + global _LOGGER + _LOGGER = logmuse.logger_via_cli(args, make_root=True) + _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}") + _LOGGER.debug(f"Args: {args}") + + if not args.command: + parser.print_help() + _LOGGER.error("No command given") + sys.exit(1) + + if args.command == ALIAS_CMD and not args.subcommand: + parser.print_help() + _LOGGER.error("No alias subcommand command given") + sys.exit(1) + + gencfg = select_genome_config( + filename=args.genome_config, + check_exist=not args.command == INIT_CMD, + on_missing=lambda fp: fp, + strict_env=True, + ) + if gencfg is None: + raise MissingGenomeConfigError(args.genome_config) + _LOGGER.debug("Determined genome config: {}".format(gencfg)) + + skip_read_lock = _skip_lock(args.skip_read_lock, gencfg) + + # From user input we want to construct a list of asset dicts, where each + # asset has a genome name, asset name, and tag + if "asset_registry_paths" in args and args.asset_registry_paths: + _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) + asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] + + for a in asset_list: + # every asset must have a genome, either provided via registry path + # or the args.genome arg. + if not a["genome"]: + if args.genome: + a["genome"] = args.genome + else: + _LOGGER.error( + "Provided asset registry path ({}/{}:{}) is invalid. 
See help for usage reference.".format( + a["genome"], a["asset"], a["tag"] + ) + ) + sys.exit(1) + else: + if args.genome and args.genome != a["genome"]: + _LOGGER.warn( + "Two different genomes specified for asset '{}'.".format( + a["asset"] + ) + ) + + else: + if args.command in GENOME_ONLY_REQUIRED and not args.genome: + parser.error("You must provide either a genome or a registry path") + sys.exit(1) + if args.command in ASSET_REQUIRED: + parser.error("You must provide an asset registry path") + sys.exit(1) + + if args.command == INIT_CMD: + _LOGGER.debug("Initializing refgenie genome configuration") + entries = OrderedDict( + { + CFG_VERSION_KEY: REQ_CFG_VERSION, + CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), + CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], + CFG_GENOMES_KEY: None, + } + ) + if args.settings_json: + if os.path.isfile(args.settings_json): + with open(args.settings_json, "r") as json_file: + data = json.load(json_file) + entries.update(data) + else: + raise FileNotFoundError( + "JSON file with config init settings does not exist: {}".format( + args.settings_json + ) + ) + if args.genome_folder: + entries.update({CFG_FOLDER_KEY: args.genome_folder}) + if args.remote_url_base: + entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) + if args.genome_archive_folder: + entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) + if args.genome_archive_config: + entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) + _LOGGER.debug("initializing with entries: {}".format(entries)) + rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) + rgc.initialize_config_file(os.path.abspath(gencfg)) + + elif args.command == BUILD_CMD: + if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]): + _LOGGER.error("Build can only build assets for one genome") + sys.exit(1) + recipe_name = None + if args.recipe: + if len(asset_list) > 1: + _LOGGER.error("Recipes cannot be specified for 
multi-asset builds") + sys.exit(1) + recipe_name = args.recipe + if args.requirements: + for a in asset_list: + recipe = recipe_name or a["asset"] + if recipe not in asset_build_packages.keys(): + _raise_missing_recipe_error(recipe) + _LOGGER.info("'{}' recipe requirements: ".format(recipe)) + _make_asset_build_reqs(recipe) + sys.exit(0) + refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) + + elif args.command == GET_ASSET_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + check = args.check_exists if args.check_exists else None + for a in asset_list: + _LOGGER.debug( + "getting asset: '{}/{}.{}:{}'".format( + a["genome"], a["asset"], a["seek_key"], a["tag"] + ) + ) + print( + rgc.seek( + a["genome"], + a["asset"], + a["tag"], + a["seek_key"], + strict_exists=check, + ) + ) + return + + elif args.command == INSERT_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + + if len(asset_list) > 1: + raise NotImplementedError("Can only add 1 asset at a time") + else: + sk = args.seek_keys + if sk: + sk = json.loads(args.seek_keys) + rgc.add( + path=args.path, + genome=asset_list[0]["genome"], + asset=asset_list[0]["asset"], + tag=asset_list[0]["tag"], + seek_keys=sk, + force=args.force, + ) + + elif args.command == PULL_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + + # existing assets overwriting + if args.no_overwrite: + force = False + elif args.force_overwrite: + force = True + else: + force = None + # large archive pulling + if args.no_large: + force_large = False + elif args.pull_large: + force_large = True + else: + force_large = None + # batch mode takes precedence over other choices + if args.batch: + force_large = True + force = False + + outdir = rgc.data_dir + if not os.path.exists(outdir): + raise MissingFolderError(outdir) + if not perm_check_x(outdir): + return + if not _single_folder_writeable(outdir): + 
_LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) + return + + for a in asset_list: + rgc.pull( + a["genome"], + a["asset"], + a["tag"], + force=force, + force_large=force_large, + size_cutoff=args.size_cutoff, + ) + + elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + console = Console() + if args.command == LIST_REMOTE_CMD: + num_servers = 0 + bad_servers = [] + for server_url in rgc[CFG_SERVERS_KEY]: + num_servers += 1 + try: + table = rgc.get_asset_table( + genomes=args.genome, server_url=server_url + ) + except (DownloadJsonError, ConnectionError, MissingSchema): + bad_servers.append(server_url) + continue + else: + console.print(table) + if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: + _LOGGER.error( + "Could not list assets from the following servers: {}".format( + bad_servers + ) + ) + else: + if args.recipes: + print(", ".join(sorted(list(asset_build_packages.keys())))) + else: + console.print(rgc.get_asset_table(genomes=args.genome)) + + elif args.command == GETSEQ_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + print(rgc.getseq(args.genome, args.locus)) + + elif args.command == REMOVE_CMD: + force = args.force + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + for a in asset_list: + a["tag"] = a["tag"] or rgc.get_default_tag( + a["genome"], a["asset"], use_existing=False + ) + _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) + if a["seek_key"] is not None: + raise NotImplementedError("You can't remove a specific seek_key.") + gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} + try: + if not rgc.is_asset_complete(**gat): + with rgc as r: + r.cfg_remove_assets(**gat, aliases=args.aliases) + _LOGGER.info( + "Removed an incomplete asset " + "'{genome}/{asset}:{tag}'".format(*gat) + ) + return + except (KeyError, MissingAssetError, 
MissingGenomeError): + _LOGGER.info( + "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat) + ) + return + if len(asset_list) > 1: + if not query_yes_no( + "Are you sure you want to remove {} assets?".format(len(asset_list)) + ): + _LOGGER.info("Action aborted by the user") + return + force = True + for a in asset_list: + rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) + + elif args.command == TAG_CMD: + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + if len(asset_list) > 1: + raise NotImplementedError("Can only tag 1 asset at a time") + if args.default: + # set the default tag and exit + with rgc as r: + r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) + sys.exit(0) + rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) + + elif args.command == ID_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + if len(asset_list) == 1: + g, a = asset_list[0]["genome"], asset_list[0]["asset"] + t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) + print(rgc.id(g, a, t)) + return + for asset in asset_list: + g, a = asset["genome"], asset["asset"] + t = asset["tag"] or rgc.get_default_tag(g, a) + print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) + return + elif args.command == SUBSCRIBE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc.subscribe(urls=args.genome_server, reset=args.reset) + return + elif args.command == UNSUBSCRIBE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + rgc.unsubscribe(urls=args.genome_server) + return + elif args.command == ALIAS_CMD: + rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) + if args.subcommand == ALIAS_GET_CMD: + if args.aliases is not None: + for a in args.aliases: + print(rgc.get_genome_alias_digest(alias=a)) + return + console = Console() + console.print(rgc.genome_aliases_table) + + if args.subcommand == 
ALIAS_SET_CMD: + rgc.set_genome_alias( + digest=args.digest, + genome=args.aliases, + reset_digest=args.reset, + create_genome=args.force, + ) + return + elif args.subcommand == ALIAS_REMOVE_CMD: + rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) + return + + elif args.command == COMPARE_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + res = rgc.compare( + args.genome1[0], args.genome2[0], explain=not args.no_explanation + ) + if args.no_explanation: + print(res) + + elif args.command == UPGRADE_CMD: + upgrade_config( + target_version=args.target_version, filepath=gencfg, force=args.force + ) + + +def perm_check_x(file_to_check, message_tag="genome directory"): + """ + Check X_OK permission on a path, providing according messaging and bool val. + + :param str file_to_check: path to query for permission + :param str message_tag: context for error message if check fails + :return bool: os.access(path, X_OK) for the given path + :raise ValueError: if there's no filepath to check for permission + """ + if not file_to_check: + msg = "You must provide a path to {}".format(message_tag) + _LOGGER.error(msg) + raise ValueError(msg) + if not os.access(file_to_check, os.X_OK): + _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) + return False + return True + + +def _make_asset_build_reqs(asset): + """ + Prepare requirements and inputs lists and display it + + :params str asset: name of the asset + """ + + def _format_reqs(req_list): + """ + + :param list[dict] req_list: + :return list[str]: + """ + templ = "\t{} ({})" + return [ + templ.format(req[KEY], req[DESC]) + if DEFAULT not in req + else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) + for req in req_list + ] + + reqs_list = [] + if asset_build_packages[asset][REQ_FILES]: + reqs_list.append( + "- files:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])) + ) + ) + if 
asset_build_packages[asset][REQ_ASSETS]: + reqs_list.append( + "- assets:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])) + ) + ) + if asset_build_packages[asset][REQ_PARAMS]: + reqs_list.append( + "- params:\n{}".format( + "\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])) + ) + ) + _LOGGER.info("\n".join(reqs_list)) diff --git a/refgenie/exceptions.py b/refgenie/exceptions.py index 9a5cc28a..7c17797a 100644 --- a/refgenie/exceptions.py +++ b/refgenie/exceptions.py @@ -1,6 +1,6 @@ from refgenconf import CFG_ENV_VARS -__all__ = ["RefgenieError", "MissingGenomeConfigError"] +__all__ = ["RefgenieError", "MissingGenomeConfigError", "MissingFolderError"] class RefgenieError(Exception): diff --git a/refgenie/helpers.py b/refgenie/helpers.py new file mode 100644 index 00000000..6a24ae12 --- /dev/null +++ b/refgenie/helpers.py @@ -0,0 +1,63 @@ +import os + +from refgenconf import MissingRecipeError +from ubiquerg import is_writable + +from .asset_build_packages import asset_build_packages +from .exceptions import MissingFolderError + + +def _parse_user_build_input(input): + """ + Parse user input specification. Used in build for specific parents and input parsing. + + :param Iterable[Iterable[str], ...] input: user command line input, + formatted as follows: [[fasta=txt, test=txt], ...] + :return dict: mapping of keys, which are input names and values + """ + lst = [] + for i in input or []: + lst.extend(i) + return ( + {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} + if lst is not None + else lst + ) + + +def _single_folder_writeable(d): + return os.access(d, os.W_OK) and os.access(d, os.X_OK) + + +def _writeable(outdir, strict_exists=False): + outdir = outdir or "." 
+ if os.path.exists(outdir): + return _single_folder_writeable(outdir) + elif strict_exists: + raise MissingFolderError(outdir) + return _writeable(os.path.dirname(outdir), strict_exists) + + +def _raise_missing_recipe_error(recipe): + """ + Raise an error for a missing recipe, when one is requested + + :param str recipe: recipe name + :raise MissingRecipeError: always + """ + raise MissingRecipeError( + f"Recipe '{recipe}' not found. Available recipes: " + f"{', '.join(list(asset_build_packages.keys()))}" + ) + + +def _skip_lock(skip_arg, cfg): + """ + If config read lock skip was not forced, check if dir is writable and set + the default to the result + + :param bool skip_arg: argument selected on the CLI + :param str cfg: path to the confjg + :return bool: decision -- whether to skip the file lock for read + """ + return is_writable(os.path.dirname(cfg)) if not skip_arg else True diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c12a2aff..c093f680 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -1,526 +1,32 @@ #!/usr/bin/env python -from collections import OrderedDict -from shutil import rmtree -from re import sub -from requests import ConnectionError -from requests.exceptions import MissingSchema -from rich.console import Console - -import os import sys import csv import signal import json -from ._version import __version__ -from .exceptions import MissingGenomeConfigError, MissingFolderError from .asset_build_packages import * from .const import * +from .helpers import ( + _raise_missing_recipe_error, + _skip_lock, + _parse_user_build_input, + _writeable, +) -import logmuse import pypiper import refgenconf from refgenconf import ( RefGenConf, - MissingAssetError, - MissingGenomeError, - MissingRecipeError, - DownloadJsonError, get_dir_digest, - upgrade_config, - __version__ as rgc_version, - select_genome_config, -) -from ubiquerg import ( - is_url, - query_yes_no, - parse_registry_path as prp, - VersionInHelpParser, - 
is_command_callable, ) +from ubiquerg import parse_registry_path as prp from ubiquerg.system import is_writable from yacman import UndefinedAliasError -from argparse import HelpFormatter _LOGGER = None -def build_argparser(): - """ - Builds argument parser. - - :return argparse.ArgumentParser - """ - - banner = "%(prog)s - reference genome asset manager" - additional_description = "\nhttps://refgenie.databio.org" - - parser = VersionInHelpParser( - prog="refgenie", - version=f"{__version__} | refgenconf {rgc_version}", - description=banner, - epilog=additional_description, - ) - - subparsers = parser.add_subparsers(dest="command") - - def add_subparser(cmd, msg, subparsers): - return subparsers.add_parser( - cmd, - description=msg, - help=msg, - formatter_class=lambda prog: HelpFormatter( - prog, max_help_position=40, width=90 - ), - ) - - sps = {} - for cmd, desc in SUBPARSER_MESSAGES.items(): - sps[cmd] = add_subparser(cmd, desc, subparsers) - # alias is nested and alias subcommands require config path - if cmd == ALIAS_CMD: - continue - # It's required for init - sps[cmd].add_argument( - "-c", - "--genome-config", - required=(cmd == INIT_CMD), - dest="genome_config", - metavar="C", - help="Path to local genome configuration file. 
Optional if {} environment variable is set.".format( - ", ".join(CFG_ENV_VARS) - ), - ) - sps[cmd].add_argument( - "--skip-read-lock", - required=False, - action="store_true", - help="Whether the config file should not be locked for reading", - ) - - # upgrade: upgrade config and alter file structure to the target version - sps[UPGRADE_CMD].add_argument( - "-v", - "--target-version", - required=True, - metavar="V", - help="Target config version for the upgrade.", - ) - sps[UPGRADE_CMD].add_argument( - "-f", - "--force", - action="store_true", - help="Do not prompt before action, approve upfront.", - ) - - sps[INIT_CMD].add_argument( - "-s", - "--genome-server", - nargs="+", - default=[DEFAULT_SERVER], - help="URL(s) to use for the {} attribute in config file. Default: {}.".format( - CFG_SERVERS_KEY, DEFAULT_SERVER - ), - ) - sps[INIT_CMD].add_argument( - "-f", - "--genome-folder", - help="Absolute path to parent folder refgenie-managed assets.", - ) - sps[INIT_CMD].add_argument( - "-a", - "--genome-archive-folder", - help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.", - ) - sps[INIT_CMD].add_argument( - "-b", - "--genome-archive-config", - help="Absolute path to desired archive config file; used by refgenieserver.", - ) - sps[INIT_CMD].add_argument( - "-u", - "--remote-url-base", - help="URL to use as an alternative, remote archive location; used by refgenieserver.", - ) - sps[INIT_CMD].add_argument( - "-j", - "--settings-json", - help="Absolute path to a JSON file with the key " - "value pairs to inialize the configuration " - "file with. Overwritten by itemized specifications.", - ) - sps[BUILD_CMD] = pypiper.add_pypiper_args( - sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"] - ) - - # Add any arguments specific to subcommands. - - sps[BUILD_CMD].add_argument( - "--tag-description", - required=False, - default=None, - type=str, - help="Add tag level description (e.g. 
built with version 0.3.2).", - ) - - sps[BUILD_CMD].add_argument( - "--genome-description", - required=False, - default=None, - type=str, - help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).", - ) - - sps[BUILD_CMD].add_argument( - "-d", - "--docker", - action="store_true", - help="Run all commands in the refgenie docker container.", - ) - - sps[BUILD_CMD].add_argument( - "--assets", - nargs="+", - action="append", - required=False, - default=None, - help="Override the default genome, asset and tag of the parents" - " (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).", - ) - - sps[BUILD_CMD].add_argument( - "--files", - nargs="+", - action="append", - required=False, - default=None, - help="Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).", - ) - - sps[BUILD_CMD].add_argument( - "--params", - nargs="+", - action="append", - required=False, - default=None, - help="Provide required parameter values (e.g. param1=value1).", - ) - - sps[BUILD_CMD].add_argument( - "-v", - "--volumes", - nargs="+", - required=False, - default=None, - help="If using docker, also mount these folders as volumes.", - ) - - sps[BUILD_CMD].add_argument( - "-o", - "--outfolder", - dest="outfolder", - required=False, - default=None, - help="Override the default path to genomes folder, which is the " - "genome_folder attribute in the genome configuration file.", - ) - - sps[BUILD_CMD].add_argument( - "-q", - "--requirements", - action="store_true", - help="Show the build requirements for the specified asset and exit.", - ) - - sps[BUILD_CMD].add_argument( - "-r", - "--recipe", - required=False, - default=None, - type=str, - help="Provide a recipe to use.", - ) - - alias_subparser = sps[ALIAS_CMD] - alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand") - - alias_sps = {} - for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items(): - alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers) - 
alias_sps[cmd].add_argument( - "-c", - "--genome-config", - required=False, - dest="genome_config", - metavar="C", - help="Path to local genome configuration file. Optional if {} environment variable is set.".format( - ", ".join(CFG_ENV_VARS) - ), - ) - alias_sps[cmd].add_argument( - "--skip-read-lock", - required=False, - action="store_true", - help="Whether the config file should not be locked for reading", - ) - - alias_sps[ALIAS_SET_CMD].add_argument( - "-a", - "--aliases", - metavar="A", - required=False, - default=None, - type=str, - nargs="+", - help="Aliases to set; single if the digest is to be retrieved from the server.", - ) - alias_sps[ALIAS_SET_CMD].add_argument( - "-d", - "--digest", - metavar="D", - required=False, - type=str, - help="Digest to set; leave out if the digest is to be retrieved from the server.", - ) - alias_sps[ALIAS_SET_CMD].add_argument( - "-r", - "--reset", - action="store_true", - help="Whether all the aliases should be removed prior to setting new ones.", - ) - alias_sps[ALIAS_SET_CMD].add_argument( - "-f", - "--force", - action="store_true", - help="Whether the action should be forced, if genome does not exist.", - ) - - alias_sps[ALIAS_REMOVE_CMD].add_argument( - "-a", - "--aliases", - metavar="A", - required=False, - default=None, - type=str, - nargs="+", - help="Aliases to remove.", - ) - alias_sps[ALIAS_REMOVE_CMD].add_argument( - "-d", "--digest", metavar="D", required=True, type=str, help="Digest to remove." 
- ) - - alias_sps[ALIAS_GET_CMD].add_argument( - "-a", - "--aliases", - metavar="A", - required=False, - type=str, - nargs="+", - help="Aliases to get the digests for.", - ) - - sps[COMPARE_CMD].add_argument( - "genome1", - metavar="GENOME1", - type=str, - nargs=1, - help="First genome for compatibility check.", - ) - sps[COMPARE_CMD].add_argument( - "genome2", - metavar="GENOME2", - type=str, - nargs=1, - help="Second genome for compatibility check.", - ) - sps[COMPARE_CMD].add_argument( - "-e", - "--no-explanation", - action="store_true", - help="Do not print compatibility code explanation.", - ) - - # add 'genome' argument to many commands - for cmd in [ - PULL_CMD, - GET_ASSET_CMD, - BUILD_CMD, - INSERT_CMD, - REMOVE_CMD, - GETSEQ_CMD, - TAG_CMD, - ID_CMD, - ]: - # genome is not required for listing actions - sps[cmd].add_argument( - "-g", - "--genome", - required=cmd in GETSEQ_CMD, - metavar="G", - help="Reference assembly ID, e.g. mm10.", - ) - - for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD: - sps[cmd].add_argument( - "-g", - "--genome", - required=False, - type=str, - metavar="G", - nargs="*", - help="Reference assembly ID, e.g. mm10.", - ) - - for cmd in [ - PULL_CMD, - GET_ASSET_CMD, - BUILD_CMD, - INSERT_CMD, - REMOVE_CMD, - TAG_CMD, - ID_CMD, - ]: - sps[cmd].add_argument( - "asset_registry_paths", - metavar="asset-registry-paths", - type=str, - nargs="+", - help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" - + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."), - ) - - sps[LIST_LOCAL_CMD].add_argument( - "-r", "--recipes", action="store_true", help="List available recipes." 
- ) - - for cmd in [REMOVE_CMD, INSERT_CMD]: - sps[cmd].add_argument( - "-f", - "--force", - action="store_true", - help="Do not prompt before action, approve upfront.", - ) - - sps[REMOVE_CMD].add_argument( - "-a", - "--aliases", - action="store_true", - help="Remove the genome alias if last asset for that genome is removed.", - ) - force_group = sps[PULL_CMD].add_argument_group( - title="Prompt handling", - description="These flags configure the pull prompt responses.", - ) - - overwrite_group = force_group.add_mutually_exclusive_group() - - overwrite_group.add_argument( - "--no-overwrite", action="store_true", help="Do not overwrite if asset exists." - ) - - overwrite_group.add_argument( - "--force-overwrite", action="store_true", help="Overwrite if asset exists." - ) - - large_group = force_group.add_mutually_exclusive_group() - - large_group.add_argument( - "--no-large", action="store_true", help="Do not pull archives over 5GB." - ) - - large_group.add_argument( - "--pull-large", - action="store_true", - help="Pull any archive, regardless of its size.", - ) - - force_group.add_argument( - "--size-cutoff", - type=float, - default=10, - metavar="S", - help="Maximum archive file size to download with no confirmation required (in GB, default: 10)", - ) - - force_group.add_argument( - "-b", - "--batch", - action="store_true", - help="Use batch mode: pull large archives, do no overwrite", - ) - - sps[INSERT_CMD].add_argument( - "-p", "--path", required=True, metavar="P", help="Relative local path to asset." - ) - - sps[INSERT_CMD].add_argument( - "-s", - "--seek-keys", - required=False, - type=str, - metavar="S", - help=""" - String representation of a JSON object with seek_keys, - e.g. '{"seek_key1": "file.txt"}' - """, - ) - - sps[GETSEQ_CMD].add_argument( - "-l", - "--locus", - required=True, - help="Coordinates of desired sequence; e.g. 
'chr1:50000-50200'.", - ) - - sps[GET_ASSET_CMD].add_argument( - "-e", - "--check-exists", - required=False, - action="store_true", - help="Whether the returned asset path should be checked for existence on disk.", - ) - - sps[TAG_CMD].add_argument( - "-f", - "--force", - action="store_true", - help="Do not prompt before action, approve upfront.", - ) - - group = sps[TAG_CMD].add_mutually_exclusive_group(required=True) - - group.add_argument("-t", "--tag", type=str, help="Tag to assign to an asset.") - - group.add_argument( - "-d", - "--default", - action="store_true", - help="Set the selected asset tag as the default one.", - ) - - sps[SUBSCRIBE_CMD].add_argument( - "-r", - "--reset", - action="store_true", - help="Overwrite the current list of server URLs.", - ) - - for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]: - sps[cmd].add_argument( - "-s", - "--genome-server", - nargs="+", - required=True, - help="One or more URLs to {action} the {key} attribute in config file.".format( - action="add to" if cmd == SUBSCRIBE_CMD else "remove from", - key=CFG_SERVERS_KEY, - ), - ) - - return parser - - def parse_registry_path(path): return prp( path, @@ -534,46 +40,6 @@ def parse_registry_path(path): ) -def copy_or_download_file(input_string, outfolder): - """ - Given an input file, which can be a local file or a URL, and output folder, - this downloads or copies the file into the output folder. - - :param str input_string: Can be either a URL or a path to a local file - :param str outfolder: Where to store the result. - :return str, str: output/result file and command - """ - result_file = os.path.join(outfolder, os.path.basename(input_string)) - parts = ( - ["wget -O", result_file, input_string] - if is_url(input_string) - else ["cp", input_string, result_file] - ) - return result_file, " ".join(parts) - - -def convert_file(input_fasta, output_file, conversions): - """ - Given an input file, output file, and a list of conversions, gives the appropriate output file. 
- - :param str output_file: Path to local output file you want to create - :param dict conversions: A dictionary of shell commands to convert files of a given type. - """ - form = {"INPUT": input_fasta, "OUTPUT": output_file} - _, ext = os.path.splitext(input_fasta) - if ext in conversions: - return conversions[ext].format(**form) - - -def default_config_file(): - """ - Path to default compute environment settings file. - - :return str: Path to default compute settings file - """ - return os.path.join(os.path.dirname(__file__), "refgenie.yaml") - - def get_asset_vars( genome, asset_key, @@ -665,12 +131,6 @@ def _read_json_file(filepath): _LOGGER.debug("No outfolder provided, using genome config.") args.outfolder = rgc.data_dir - _LOGGER.debug("Default config file: {}".format(default_config_file())) - - if args.config_file and not os.path.isfile(args.config_file): - _LOGGER.debug("Config file path isn't a file: {}".format(args.config_file)) - args.config_file = default_config_file() - def _build_asset( genome, asset_key, @@ -1003,510 +463,10 @@ def _build_asset( _raise_missing_recipe_error(recipe_name) -def _exec_list(rgc, remote, genome): - if remote: - pfx = "Remote" - # we use this func looping through the server urls and assigning a - # single instance as the server for the object. That's why we can - # access the data with [0] below - assemblies, assets = list(rgc.listr(genome=genome, as_str=True).values())[0] - recipes = None # Not implemented - else: - pfx = "Local" - assemblies, assets = rgc.get_local_data_str(genome=genome) - # also get recipes - recipes = ", ".join(sorted(list(asset_build_packages.keys()))) - return pfx, assemblies, assets, recipes - - -def perm_check_x(file_to_check, message_tag="genome directory"): - """ - Check X_OK permission on a path, providing according messaging and bool val. 
- - :param str file_to_check: path to query for permission - :param str message_tag: context for error message if check fails - :return bool: os.access(path, X_OK) for the given path - :raise ValueError: if there's no filepath to check for permission - """ - if not file_to_check: - msg = "You must provide a path to {}".format(message_tag) - _LOGGER.error(msg) - raise ValueError(msg) - if not os.access(file_to_check, os.X_OK): - _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check)) - return False - return True - - -def main(): - """ Primary workflow """ - parser = logmuse.add_logging_options(build_argparser()) - args, remaining_args = parser.parse_known_args() - global _LOGGER - _LOGGER = logmuse.logger_via_cli(args, make_root=True) - _LOGGER.debug("refgenie {}".format(__version__)) - _LOGGER.debug("Args: {}".format(args)) - - if not args.command: - parser.print_help() - _LOGGER.error("No command given") - sys.exit(1) - - if args.command == ALIAS_CMD and not args.subcommand: - parser.print_help() - _LOGGER.error("No alias subcommand command given") - sys.exit(1) - - gencfg = select_genome_config( - filename=args.genome_config, - check_exist=not args.command == INIT_CMD, - on_missing=lambda fp: fp, - strict_env=True, - ) - if gencfg is None: - raise MissingGenomeConfigError(args.genome_config) - _LOGGER.debug("Determined genome config: {}".format(gencfg)) - - skip_read_lock = _skip_lock(args.skip_read_lock, gencfg) - - # From user input we want to construct a list of asset dicts, where each - # asset has a genome name, asset name, and tag - if "asset_registry_paths" in args and args.asset_registry_paths: - _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths)) - asset_list = [parse_registry_path(x) for x in args.asset_registry_paths] - - for a in asset_list: - # every asset must have a genome, either provided via registry path - # or the args.genome arg. 
- if not a["genome"]: - if args.genome: - a["genome"] = args.genome - else: - _LOGGER.error( - "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format( - a["genome"], a["asset"], a["tag"] - ) - ) - sys.exit(1) - else: - if args.genome and args.genome != a["genome"]: - _LOGGER.warn( - "Two different genomes specified for asset '{}'.".format( - a["asset"] - ) - ) - - else: - if args.command in GENOME_ONLY_REQUIRED and not args.genome: - parser.error("You must provide either a genome or a registry path") - sys.exit(1) - if args.command in ASSET_REQUIRED: - parser.error("You must provide an asset registry path") - sys.exit(1) - - if args.command == INIT_CMD: - _LOGGER.debug("Initializing refgenie genome configuration") - entries = OrderedDict( - { - CFG_VERSION_KEY: REQ_CFG_VERSION, - CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)), - CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER], - CFG_GENOMES_KEY: None, - } - ) - if args.settings_json: - if os.path.isfile(args.settings_json): - with open(args.settings_json, "r") as json_file: - data = json.load(json_file) - entries.update(data) - else: - raise FileNotFoundError( - "JSON file with config init settings does not exist: {}".format( - args.settings_json - ) - ) - if args.genome_folder: - entries.update({CFG_FOLDER_KEY: args.genome_folder}) - if args.remote_url_base: - entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) - if args.genome_archive_folder: - entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) - if args.genome_archive_config: - entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config}) - _LOGGER.debug("initializing with entries: {}".format(entries)) - rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock) - rgc.initialize_config_file(os.path.abspath(gencfg)) - - elif args.command == BUILD_CMD: - if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]): - _LOGGER.error("Build can only build assets for 
one genome") - sys.exit(1) - recipe_name = None - if args.recipe: - if len(asset_list) > 1: - _LOGGER.error("Recipes cannot be specified for multi-asset builds") - sys.exit(1) - recipe_name = args.recipe - if args.requirements: - for a in asset_list: - recipe = recipe_name or a["asset"] - if recipe not in asset_build_packages.keys(): - _raise_missing_recipe_error(recipe) - _LOGGER.info("'{}' recipe requirements: ".format(recipe)) - _make_asset_build_reqs(recipe) - sys.exit(0) - refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args) - - elif args.command == GET_ASSET_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - check = args.check_exists if args.check_exists else None - for a in asset_list: - _LOGGER.debug( - "getting asset: '{}/{}.{}:{}'".format( - a["genome"], a["asset"], a["seek_key"], a["tag"] - ) - ) - print( - rgc.seek( - a["genome"], - a["asset"], - a["tag"], - a["seek_key"], - strict_exists=check, - ) - ) - return - - elif args.command == INSERT_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - - if len(asset_list) > 1: - raise NotImplementedError("Can only add 1 asset at a time") - else: - sk = args.seek_keys - if sk: - sk = json.loads(args.seek_keys) - rgc.add( - path=args.path, - genome=asset_list[0]["genome"], - asset=asset_list[0]["asset"], - tag=asset_list[0]["tag"], - seek_keys=sk, - force=args.force, - ) - - elif args.command == PULL_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - - # existing assets overwriting - if args.no_overwrite: - force = False - elif args.force_overwrite: - force = True - else: - force = None - # large archive pulling - if args.no_large: - force_large = False - elif args.pull_large: - force_large = True - else: - force_large = None - # batch mode takes precedence over other choices - if args.batch: - force_large = True - force = False - - outdir = rgc.data_dir - if not 
os.path.exists(outdir): - raise MissingFolderError(outdir) - if not perm_check_x(outdir): - return - if not _single_folder_writeable(outdir): - _LOGGER.error("Insufficient permissions to write to: {}".format(outdir)) - return - - for a in asset_list: - rgc.pull( - a["genome"], - a["asset"], - a["tag"], - force=force, - force_large=force_large, - size_cutoff=args.size_cutoff, - ) - - elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - console = Console() - if args.command == LIST_REMOTE_CMD: - num_servers = 0 - bad_servers = [] - for server_url in rgc[CFG_SERVERS_KEY]: - num_servers += 1 - try: - table = rgc.get_asset_table( - genomes=args.genome, server_url=server_url - ) - except (DownloadJsonError, ConnectionError, MissingSchema): - bad_servers.append(server_url) - continue - else: - console.print(table) - if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers: - _LOGGER.error( - "Could not list assets from the following servers: {}".format( - bad_servers - ) - ) - else: - if args.recipes: - print(", ".join(sorted(list(asset_build_packages.keys())))) - else: - console.print(rgc.get_asset_table(genomes=args.genome)) - - elif args.command == GETSEQ_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - print(rgc.getseq(args.genome, args.locus)) - - elif args.command == REMOVE_CMD: - force = args.force - rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) - for a in asset_list: - a["tag"] = a["tag"] or rgc.get_default_tag( - a["genome"], a["asset"], use_existing=False - ) - _LOGGER.debug("Determined tag for removal: {}".format(a["tag"])) - if a["seek_key"] is not None: - raise NotImplementedError("You can't remove a specific seek_key.") - gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]} - try: - if not rgc.is_asset_complete(**gat): - with rgc as r: - r.cfg_remove_assets(**gat, aliases=args.aliases) - 
_LOGGER.info( - "Removed an incomplete asset " - "'{genome}/{asset}:{tag}'".format(*gat) - ) - return - except (KeyError, MissingAssetError, MissingGenomeError): - _LOGGER.info( - "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat) - ) - return - if len(asset_list) > 1: - if not query_yes_no( - "Are you sure you want to remove {} assets?".format(len(asset_list)) - ): - _LOGGER.info("Action aborted by the user") - return - force = True - for a in asset_list: - rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force) - - elif args.command == TAG_CMD: - rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) - if len(asset_list) > 1: - raise NotImplementedError("Can only tag 1 asset at a time") - if args.default: - # set the default tag and exit - with rgc as r: - r.set_default_pointer(a["genome"], a["asset"], a["tag"], True) - sys.exit(0) - rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force) - - elif args.command == ID_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - if len(asset_list) == 1: - g, a = asset_list[0]["genome"], asset_list[0]["asset"] - t = asset_list[0]["tag"] or rgc.get_default_tag(g, a) - print(rgc.id(g, a, t)) - return - for asset in asset_list: - g, a = asset["genome"], asset["asset"] - t = asset["tag"] or rgc.get_default_tag(g, a) - print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t)) - return - elif args.command == SUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - rgc.subscribe(urls=args.genome_server, reset=args.reset) - return - elif args.command == UNSUBSCRIBE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - rgc.unsubscribe(urls=args.genome_server) - return - elif args.command == ALIAS_CMD: - rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock) - if args.subcommand == ALIAS_GET_CMD: - if args.aliases is not None: - for a in args.aliases: - 
print(rgc.get_genome_alias_digest(alias=a)) - return - console = Console() - console.print(rgc.genome_aliases_table) - - if args.subcommand == ALIAS_SET_CMD: - rgc.set_genome_alias( - digest=args.digest, - genome=args.aliases, - reset_digest=args.reset, - create_genome=args.force, - ) - return - elif args.subcommand == ALIAS_REMOVE_CMD: - rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases) - return - - elif args.command == COMPARE_CMD: - rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - res = rgc.compare( - args.genome1[0], args.genome2[0], explain=not args.no_explanation - ) - if args.no_explanation: - print(res) - - elif args.command == UPGRADE_CMD: - upgrade_config( - target_version=args.target_version, filepath=gencfg, force=args.force - ) - - -def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities): - """ - Message and save removed entity data - - :param str directory: removed dir - :param str entity_class: class of the entity - :param dict asset_dict: selected genome/asset:tag combination - :param list removed_entities: list of the removed entities to append to - """ - subclass = "asset" if entity_class == "genome" else "tag" - if os.path.basename(directory) == asset_dict[entity_class]: - _LOGGER.info( - "Last {sub} for {ec} '{en}' has been removed, removing {ec} directory".format( - sub=subclass, ec=entity_class, en=asset_dict[entity_class] - ) - ) - removed_entities.append(_remove(directory)) - else: - _LOGGER.debug( - "Didn't remove '{}' since it does not match the {} name: {}".format( - directory, entity_class, asset_dict[entity_class] - ) - ) - - -def _remove(path): - """ - remove asset if it is a dir or a file - - :param str path: path to the entity to remove, either a file or a dir - :return str: removed path - """ - if os.path.isfile(path): - os.remove(path) - elif os.path.isdir(path): - rmtree(path) - else: - raise ValueError("path '{}' is neither a file nor a 
dir.".format(path)) - return path - - def _key_to_name(k): return k.replace("_", " ") -def _single_folder_writeable(d): - return os.access(d, os.W_OK) and os.access(d, os.X_OK) - - -def _writeable(outdir, strict_exists=False): - outdir = outdir or "." - if os.path.exists(outdir): - return _single_folder_writeable(outdir) - elif strict_exists: - raise MissingFolderError(outdir) - return _writeable(os.path.dirname(outdir), strict_exists) - - -def _make_asset_build_reqs(asset): - """ - Prepare requirements and inputs lists and display it - - :params str asset: name of the asset - """ - - def _format_reqs(req_list): - """ - - :param list[dict] req_list: - :return list[str]: - """ - templ = "\t{} ({})" - return [ - templ.format(req[KEY], req[DESC]) - if DEFAULT not in req - else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) - for req in req_list - ] - - reqs_list = [] - if asset_build_packages[asset][REQ_FILES]: - reqs_list.append( - "- files:\n{}".format( - "\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES])) - ) - ) - if asset_build_packages[asset][REQ_ASSETS]: - reqs_list.append( - "- assets:\n{}".format( - "\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS])) - ) - ) - if asset_build_packages[asset][REQ_PARAMS]: - reqs_list.append( - "- params:\n{}".format( - "\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS])) - ) - ) - _LOGGER.info("\n".join(reqs_list)) - - -def get_dir_digest(path, pm=None): - """ - Generate a MD5 digest that reflects just the contents of the files in the selected directory. - - :param str path: path to the directory to digest - :param pypiper.PipelineManager pm: a pipeline object, optional. The subprocess module will be used if not provided - :return str: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 - """ - if not is_command_callable("md5sum"): - raise OSError( - "md5sum command line tool is required for asset digest calculation. 
\n" - "Install and try again, e.g on macOS: 'brew install md5sha1sum'" - ) - cmd = ( - "cd {}; find . -type f -not -path './" - + BUILD_STATS_DIR - + "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" - ) - if isinstance(pm, pypiper.PipelineManager): - x = pm.checkprint(cmd.format(path)) - else: - try: - from subprocess import check_output - - x = check_output(cmd.format(path), shell=True).decode("utf-8") - except Exception as e: - _LOGGER.warning( - "{}: could not calculate digest for '{}'".format( - e.__class__.__name__, path - ) - ) - return - return str(sub(r"\W+", "", x)) # strips non-alphanumeric - - def _handle_sigint(gat): """ SIGINT handler, unlocks the config file and exists the program @@ -1522,38 +482,6 @@ def handle(sig, frame): return handle -def _parse_user_build_input(input): - """ - Parse user input specification. Used in build for specific parents and input parsing. - - :param Iterable[Iterable[str], ...] input: user command line input, - formatted as follows: [[fasta=txt, test=txt], ...] - :return dict: mapping of keys, which are input names and values - """ - lst = [] - for i in input or []: - lst.extend(i) - return ( - {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} - if lst is not None - else lst - ) - - -def _raise_missing_recipe_error(recipe): - """ - Raise an error for a missing recipe, when one is requested - - :param str recipe: recipe name - :raise MissingRecipeError: always - """ - raise MissingRecipeError( - "Recipe '{}' not found. 
Available recipes: {}".format( - recipe, ", ".join(list(asset_build_packages.keys())) - ) - ) - - def _check_recipe(recipe): """ Check whether there are any key name clashes in the recipe requirements @@ -1607,15 +535,3 @@ def _seek( enclosing_dir=enclosing_dir, strict_exists=True, ) - - -def _skip_lock(skip_arg, cfg): - """ - If config read lock skip was not forced, check if dir is writable and set - the default to the result - - :param bool skip_arg: argument selected on the CLI - :param str cfg: path to the confjg - :return bool: decision -- whether to skip the file lock for read - """ - return is_writable(os.path.dirname(cfg)) if not skip_arg else True From 2b97ad59006a2abdbd1ab469a6b1fd56e41727df Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Mar 2021 12:38:44 -0500 Subject: [PATCH 103/110] add missing import --- refgenie/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/refgenie/cli.py b/refgenie/cli.py index 234b36a7..ab40f8c5 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -1,6 +1,7 @@ import logmuse import sys import json +import os from .argparser import build_argparser from .refgenie import parse_registry_path, _skip_lock From 8df360b29633c02c8dbd3f23c64aab0334ea0cfe Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Mar 2021 12:40:12 -0500 Subject: [PATCH 104/110] add missing import --- refgenie/refgenie.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index c093f680..4d74a1f6 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python - +import os import sys import csv import signal From 0f81287eb0695eca46ffa5673f8419537bea1f22 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 8 Mar 2021 12:48:24 -0500 Subject: [PATCH 105/110] get logger --- refgenie/const.py | 2 ++ refgenie/refgenie.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/refgenie/const.py b/refgenie/const.py index 
40db1316..d8ec6600 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -4,6 +4,8 @@ """ from refgenconf.const import * +PKG_NAME = "refgenie" + BUILD_CMD = "build" INIT_CMD = "init" PULL_CMD = "pull" diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 4d74a1f6..9f62caa1 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -22,8 +22,9 @@ from ubiquerg import parse_registry_path as prp from ubiquerg.system import is_writable from yacman import UndefinedAliasError +from logging import getLogger -_LOGGER = None +_LOGGER = getLogger(PKG_NAME) def parse_registry_path(path): From 4ac5dd186044aadc3a21c52f01d0fca6e630f1d0 Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 8 Mar 2021 21:18:35 +0000 Subject: [PATCH 106/110] update servers. Fix #228 --- docs/autodoc_build/refgenconf.md | 431 ++----------------------------- docs/servers.md | 7 + mkdocs.yml | 2 +- 3 files changed, 36 insertions(+), 404 deletions(-) create mode 100644 docs/servers.md diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 0445d538..4f96a359 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -29,64 +29,12 @@ h4 .content { # Package `refgenconf` Documentation -## Class `ConfigNotCompliantError` -The format of the config file does not match required version/standards - - -## Class `DownloadJsonError` -Non-OK response from a JSON download attempt - - -```python -def __init__(self, resp) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -## Class `GenomeConfigFormatError` -Exception for invalid genome config file format. - - -```python -def __init__(self, msg) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -## Class `MissingAssetError` -Error type for request of an unavailable genome asset. 
- - -## Class `MissingConfigDataError` -Missing required configuration instance items - - -## Class `MissingGenomeError` -Error type for request of unknown genome/assembly. - - -## Class `MissingRecipeError` -Error type for request of an unavailable recipe. - - -## Class `MissingSeekKeyError` -Error type for request of an unavailable asset seek key. - - -## Class `MissingTagError` -Error type for request of an unavailable asset tag. - - ## Class `RefGenConf` A sort of oracle of available reference genome assembly assets ```python -def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False, genome_exact=False) +def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False) ``` Create the config instance by with a filepath or key-value pairs. @@ -107,35 +55,6 @@ Create the config instance by with a filepath or key-value pairs. -```python -def add(self, path, genome, asset, tag=None, seek_keys=None, force=False) -``` - -Add an external asset to the config -#### Parameters: - -- `path` (`str`): a path to the asset to add; must exist and be relativeto the genome_folder -- `genome` (`str`): genome name -- `asset` (`str`): asset name -- `tag` (`str`): tag name -- `seek_keys` (`dict`): seek keys to add -- `force` (`bool`): whether to force existing asset overwrite - - - - -```python -def alias_dir(self) -``` - -Path to the genome alias directory -#### Returns: - -- `str`: path to the directory where the assets are stored - - - - ```python def assets_str(self, offset_text=' ', asset_sep=', ', genome_assets_delim='/ ', genome=None, order=None) ``` @@ -239,37 +158,6 @@ In case the local asset does not exist, the config is populated with the remote -```python -def compare(self, genome1, genome2, explain=False) -``` - -Check genomes compatibility level. 
Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata -#### Parameters: - -- `genome1` (`str`): name of the first genome to compare -- `genome2` (`str`): name of the first genome to compare -- `explain` (`bool`): whether the returned code explanation shouldbe displayed - - -#### Returns: - -- `int`: compatibility code - - - - -```python -def data_dir(self) -``` - -Path to the genome data directory -#### Returns: - -- `str`: path to the directory where the assets are stored - - - - ```python def file_path(self) ``` @@ -303,30 +191,6 @@ Determine path to a particular asset for a particular genome. -```python -def genome_aliases(self) -``` - -Mapping of human-readable genome identifiers to genome identifiers -#### Returns: - -- `dict`: mapping of human-readable genome identifiers to genomeidentifiers - - - - -```python -def genome_aliases_table(self) -``` - -Mapping of human-readable genome identifiers to genome identifiers -#### Returns: - -- `dict`: mapping of human-readable genome identifiers to genomeidentifiers - - - - ```python def genomes_list(self, order=None) ``` @@ -356,42 +220,6 @@ Get as single string this configuration's reference genome assembly IDs. -```python -def get_asds_path(self, genome) -``` - -Get path to the Annotated Sequence Digests JSON file for a given genome. Note that the path and/or genome may not exist. 
-#### Parameters: - -- `genome` (`str`): genome name - - -#### Returns: - -- `str`: ASDs path - - - - -```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7f946329a9d8>) -``` - -Get a rich.Table object representing assets available locally -#### Parameters: - -- `genomes` (`list[str]`): genomes to restrict the results with -- `server_url` (`str`): server URL to query for the remote genome data -- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset - - -#### Returns: - -- `rich.table.Table`: table of assets available locally - - - - ```python def get_default_tag(self, genome, asset, use_existing=True) ``` @@ -411,54 +239,6 @@ Determine the asset tag to use as default. The one indicated by the 'default_tag -```python -def get_genome_alias(self, digest, fallback=False, all_aliases=False) -``` - -Get the human readable alias for a genome digest -#### Parameters: - -- `digest` (`str`): digest to find human-readable alias for -- `fallback` (`bool`): whether to return the query digest in caseof failure -- `all_aliases` (`bool`): whether to return all aliases instead of justthe first one - - -#### Returns: - -- `str | list[str]`: human-readable aliases - - -#### Raises: - -- `GenomeConfigFormatError`: if "genome_digests" section doesnot exist in the config -- `UndefinedAliasError`: if a no alias has been defined for therequested digest - - - - -```python -def get_genome_alias_digest(self, alias, fallback=False) -``` - -Get the human readable alias for a genome digest -#### Parameters: - -- `alias` (`str`): alias to find digest for -- `fallback` (`bool`): whether to return the query alias in caseof failure and in case it is one of the digests - - -#### Returns: - -- `str`: genome digest - - -#### Raises: - -- `UndefinedAliasError`: if the specified alias has been assigned toany digests - - - - ```python def get_genome_attributes(self, genome) ``` @@ -495,7 +275,7 @@ List locally 
available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f946329b510>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f6f6a5b8f80>) ``` List genomes and assets available remotely. @@ -514,27 +294,7 @@ List genomes and assets available remotely. ```python -def get_symlink_paths(self, genome, asset=None, tag=None, all_aliases=False) -``` - -Get path to the alias directory for the selected genome-asset-tag -#### Parameters: - -- `genome` (`str`): reference genome ID -- `asset` (`str`): asset name -- `tag` (`str`): tag name -- `all_aliases` (`bool`): whether to return a collection of symboliclinks or just the first one from the alias list - - -#### Returns: - -- `dict`: - - - - -```python -def getseq(self, genome, locus, as_str=False) +def getseq(self, genome, locus) ``` Return the sequence found in a selected range and chromosome. Something like the refget protocol. @@ -542,12 +302,6 @@ Return the sequence found in a selected range and chromosome. Something like the - `genome` (`str`): name of the sequence identifier - `locus` (`str`): 1-10' -- `as_str` (`bool`): whether to convert the resurned object to stringand return just the sequence - - -#### Returns: - -- `str | pyfaidx.FastaRecord | pyfaidx.Sequence`: selected sequence @@ -594,28 +348,6 @@ Initialize genome configuration file on disk -```python -def initialize_genome(self, fasta_path, alias, fasta_unzipped=False, skip_alias_write=False) -``` - -Initialize a genome - -Create a JSON file with Annotated Sequence Digests (ASDs) -for the FASTA file in the genome directory. 
-#### Parameters: - -- `fasta_path` (`str`): path to a FASTA file to initialize genome with -- `alias` (`str`): alias to set for the genome -- `skip_alias_write` (`bool`): whether to skip writing the alias to the file - - -#### Returns: - -- `str, list[dict[]]`: human-readable name for the genome - - - - ```python def is_asset_complete(self, genome, asset, tag) ``` @@ -692,10 +424,10 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x7f946329b620>, as_str=False) +def listr(self, genome=None, order=None, get_url= at 0x7f6f6a5bd0e0>) ``` -List genomes and assets available remotely on all servers the object subscribes to +List genomes and assets available remotely. #### Parameters: - `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance @@ -705,7 +437,7 @@ List genomes and assets available remotely on all servers the object subscribes #### Returns: -- `dict[OrderedDict[list]]`: remotely available genomes and assetskeyed by genome keyed by source server endpoint +- `str, str`: text reps of remotely available genomes and assets @@ -723,7 +455,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7f946329b8c8>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x7f6f6a5bd3b0>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -734,8 +466,6 @@ Download and possibly unpack one or more assets for a given ref gen. 
- `tag` (`str`): name of particular tag to fetch - `unpack` (`bool`): whether to unpack a tarball - `force` (`bool | NoneType`): how to handle case in which asset pathalready exists; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt to replace existing file, and True to auto-replay Yes for existing asset replacement. -- `force_large` (`bool | NoneType`): how to handle case in large (> 5GB)asset is to be pulled; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt, and True to auto-replay Yes -- `size_cutoff` (`float`): maximum archive file size to download withno prompt - `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset - `build_signal_handler` (`function(str) -> function`): how to createa signal handler to use during the download; the single argument to this function factory is the download filepath @@ -799,24 +529,6 @@ Remove any relationship links associated with the selected asset -```python -def remove_genome_aliases(self, digest, aliases=None) -``` - -Remove alias for a specified genome digest. This method will remove the digest both from the genomes object and from the aliases mapping in tbe config -#### Parameters: - -- `digest` (`str`): genome digest to remove an alias for -- `aliases` (`list[str]`): a collection to aliases to remove for thegenome. If not provided, all aliases for the digest will be remove - - -#### Returns: - -- `bool`: whether the removal has been performed - - - - ```python def run_plugins(self, hook) ``` @@ -830,38 +542,7 @@ Runs all installed plugins for the specified hook. 
```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7f946329b048>) -``` - -Seek path to a specified genome-asset-tag alias -#### Parameters: - -- `genome_name` (`str`): name of a reference genome assembly of interest -- `asset_name` (`str`): name of the particular asset to fetch -- `tag_name` (`str`): name of the particular asset tag to fetch -- `seek_key` (`str`): name of the particular subasset to fetch -- `strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). -- `check_exist` (`function(callable) -> bool`): how to check forasset/path existence -- `enclosing_dir` (`bool`): whether a path to the entire enclosingdirectory should be returned, e.g. for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned -- `all_aliases` (`bool`): whether to return paths to all asset aliases orjust the one for the specified 'genome_name` argument - - -#### Returns: - -- `str`: path to the asset - - -#### Raises: - -- `TypeError`: if the existence check is not a one-arg function -- `refgenconf.MissingGenomeError`: if the named assembly isn't knownto this configuration instance -- `refgenconf.MissingAssetError`: if the names assembly is known tothis configuration instance, but the requested asset is unknown - - - - -```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f946329b158>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f6f6a5b8b90>) ``` Seek path to a specified genome-asset-tag @@ -873,7 +554,7 @@ Seek path to a specified genome-asset-tag - `seek_key` (`str`): name of the particular subasset to fetch - 
`strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). - `check_exist` (`function(callable) -> bool`): how to check forasset/path existence -- `enclosing_dir` (`bool`): whether a path to the entire enclosingdirectory should be returned, e.g. for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned #### Returns: @@ -891,7 +572,7 @@ Seek path to a specified genome-asset-tag ```python -def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None) +def set_default_pointer(self, genome, asset, tag, force=False) ``` Point to the selected tag by default @@ -900,37 +581,11 @@ Point to the selected tag by default - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest - `tag` (`str`): name of the particular asset tag to point to by default -- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `force` (`bool`): whether the default tag change should be forced (even if it exists) -```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7f946329bb70>) -``` - -Assign a human-readable alias to a genome identifier. - -Genomes are identified by a unique identifier which is derived from the -FASTA file (part of fasta asset). This way we can ensure genome -provenance and compatibility with the server. This function maps a -human-readable identifier to make referring to the genomes easier. 
-#### Parameters: - -- `genome` (`str`): name of the genome to assign to an identifier -- `digest` (`str`): identifier to use -- `overwrite` (`bool`): whether all the previously set aliases should beremoved and just the current one stored -- `no_write` (`bool`): whether to skip writing the alias to the file - - -#### Returns: - -- `bool`: whether the alias has been established - - - - ```python def subscribe(self, urls, reset=False) ``` @@ -993,7 +648,7 @@ Remove URLs the list of genome_servers. ```python -def update_assets(self, genome, asset=None, data=None, force_digest=None) +def update_assets(self, genome, asset=None, data=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset mapping is missing, it will be created @@ -1001,7 +656,6 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated -- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -1013,14 +667,13 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass ```python -def update_genomes(self, genome, data=None, force_digest=None) +def update_genomes(self, genome, data=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome is missing, it will be added #### Parameters: - `genome` (`str`): genome to be added/updated -- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. 
- `data` (`Mapping`): data to be added/updated @@ -1053,7 +706,7 @@ A convenience method which wraps the update assets and uses it to update the ass ```python -def update_seek_keys(self, genome, asset, tag=None, keys=None, force_digest=None) +def update_seek_keys(self, genome, asset, tag=None, keys=None) ``` A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. @@ -1062,7 +715,6 @@ A convenience method which wraps the updated assets and uses it to update the se - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated -- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `keys` (`Mapping`): seek_keys to be added/updated @@ -1074,7 +726,7 @@ A convenience method which wraps the updated assets and uses it to update the se ```python -def update_tags(self, genome, asset=None, tag=None, data=None, force_digest=None) +def update_tags(self, genome, asset=None, tag=None, data=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset-tag mapping is missing, it will be created @@ -1083,7 +735,6 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated -- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -1130,42 +781,36 @@ Write the contents to a file. If pre- and post-update plugins are defined, they -## Class `RefgenconfError` -Base exception type for this package - - -## Class `RemoteDigestMismatchError` -Remote digest of the parent asset does not match its local counterpart +## Class `GenomeConfigFormatError` +Exception for invalid genome config file format. 
```python -def __init__(self, asset, local_digest, remote_digest) +def __init__(self, msg) ``` Initialize self. See help(type(self)) for accurate signature. -## Class `UnboundEnvironmentVariablesError` -Use of environment variable that isn't bound to a value. - +## Class `MissingAssetError` +Error type for request of an unavailable genome asset. -```python -def get_dir_digest(path, pm=None) -``` -Generate a MD5 digest that reflects just the contents of the files in the selected directory. -#### Parameters: +## Class `MissingConfigDataError` +Missing required configuration instance items -- `path` (`str`): path to the directory to digest -- `pm` (`pypiper.PipelineManager`): a pipeline object, optional.The subprocess module will be used if not provided +## Class `MissingGenomeError` +Error type for request of unknown genome/assembly. -#### Returns: -- `str`: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 +## Class `RefgenconfError` +Base exception type for this package +## Class `UnboundEnvironmentVariablesError` +Use of environment variable that isn't bound to a value. ```python @@ -1186,27 +831,7 @@ Get path to genome configuration file. -```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7f946329a488>, link_fun= at 0x7f946329cd90>) -``` - -Upgrade the config to a selected target version. - -Convert the config file to target_version format, update file structure -inside genome_folder. Drop genomes for which genome_digest is not available -on any of the servers and do not have a fasta asset locally. 
-#### Parameters: - -- `target_version` (`str`): the version updated to -- `filepath` (`str`): path to config file -- `force` (`bool`): whether the upgrade should be confirmed upfront -- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset -- `link_fun` (`callable`): function to use to link files, e.g os.symlink or os.link - - - - -*Version Information: `refgenconf` v0.10.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file +*Version Information: `refgenconf` v0.9.0, generated by `lucidoc` v0.4.2* \ No newline at end of file diff --git a/docs/servers.md b/docs/servers.md new file mode 100644 index 00000000..dafc3327 --- /dev/null +++ b/docs/servers.md @@ -0,0 +1,7 @@ +# Servers + +Here are some servers. Let us know if you're running your own refgenieserver instance and would like to be added to this list. + +- [Primary server](http://refgenomes.databio.org) +- [Dev server](http://rg.databio.org) +- [Plant references server](http://plantref.databio.org) diff --git a/mkdocs.yml b/mkdocs.yml index c44b4d81..4b6991c4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,4 +51,4 @@ navbar: left: - text: Refgenomes server icon: fa-server - href: http://refgenomes.databio.org + href: servers From 8526a7d8d627b9d41535a44b1a01f624f15c0e1b Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 9 Mar 2021 13:13:13 -0500 Subject: [PATCH 107/110] update docs --- docs/README.md | 17 +- docs/autodoc_build/refgenconf.md | 461 ++++++++++++++++++++++++++++--- docs/compare.md | 17 +- docs/refgenieserver.md | 14 + docs/usage.md | 5 +- docs_jupyter/aliases.ipynb | 59 ++-- docs_jupyter/tutorial.ipynb | 351 ++++++++++++++--------- recipes.md | 36 --- refgenie/cli.py | 2 +- 9 files changed, 695 insertions(+), 267 deletions(-) delete mode 100644 recipes.md diff --git a/docs/README.md b/docs/README.md index b3fdc25a..9b4a5960 100644 --- a/docs/README.md +++ b/docs/README.md @@ -46,7 +46,7 @@ refgenie listr Response: 
```console Remote refgenie assets - Server URL: http://rg.databio.org:82 + Server URL: http://refgenomes.databio.org ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ genome ┃ assets ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ @@ -65,14 +65,13 @@ refgenie pull rCRSd/bowtie2_index Response: ```console -No local digest for genome alias: rCRSd -Setting 'rCRSd' identity with server: http://rg.databio.org:82/v3/alias/genome_digest/rCRSd -Determined server digest for local genome alias (rCRSd): 511fb1178275e7d529560d53b949dba40815f195623bce8e -Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd) +Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index +94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index:default ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 128.0/117.0 KB • 1.8 MB/s • 0:00:00 +Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz +Extracting asset tarball: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz +Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default Created alias directories: - - /Users/mstolarczyk/demo/alias/rCRSd -Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive -... + - /Users/mstolarczyk/Desktop/testing/refgenie/alias/rCRSd/bowtie2_index/default ``` See [further reading on downloading assets](pull.md). 
@@ -83,7 +82,7 @@ Refgenie assets are scripted, so if what you need is not available remotely, you ```console -refgenie build mygenome/bwa_index --files fasta=mygenome.fa.gz +refgenie build mygenome/bwa_index ``` See [further reading on building assets](build.md). diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 4f96a359..5fca1068 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -29,12 +29,64 @@ h4 .content { # Package `refgenconf` Documentation +## Class `ConfigNotCompliantError` +The format of the config file does not match required version/standards + + +## Class `DownloadJsonError` +Non-OK response from a JSON download attempt + + +```python +def __init__(self, resp) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + +## Class `GenomeConfigFormatError` +Exception for invalid genome config file format. + + +```python +def __init__(self, msg) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + +## Class `MissingAssetError` +Error type for request of an unavailable genome asset. + + +## Class `MissingConfigDataError` +Missing required configuration instance items + + +## Class `MissingGenomeError` +Error type for request of unknown genome/assembly. + + +## Class `MissingRecipeError` +Error type for request of an unavailable recipe. + + +## Class `MissingSeekKeyError` +Error type for request of an unavailable asset seek key. + + +## Class `MissingTagError` +Error type for request of an unavailable asset tag. + + ## Class `RefGenConf` A sort of oracle of available reference genome assembly assets ```python -def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False) +def __init__(self, filepath=None, entries=None, writable=False, wait_max=60, skip_read_lock=False, genome_exact=False, schema_source=None) ``` Create the config instance by with a filepath or key-value pairs. 
@@ -55,6 +107,35 @@ Create the config instance by with a filepath or key-value pairs. +```python +def add(self, path, genome, asset, tag=None, seek_keys=None, force=False) +``` + +Add an external asset to the config +#### Parameters: + +- `path` (`str`): a path to the asset to add; must exist and be relativeto the genome_folder +- `genome` (`str`): genome name +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `seek_keys` (`dict`): seek keys to add +- `force` (`bool`): whether to force existing asset overwrite + + + + +```python +def alias_dir(self) +``` + +Path to the genome alias directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def assets_str(self, offset_text=' ', asset_sep=', ', genome_assets_delim='/ ', genome=None, order=None) ``` @@ -107,21 +188,22 @@ the parent genome will be removed as well ```python -def cfg_tag_asset(self, genome, asset, tag, new_tag) +def cfg_tag_asset(self, genome, asset, tag, new_tag, force=False) ``` Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. -This method does not override the original asset entry in the RefGenConf object. It creates its copy and tags -it with the new_tag. -Additionally, if the retagged asset has any children their parent will be retagged as new_tag that was -introduced upon this method execution. +This method does not override the original asset entry in the +RefGenConf object. It creates its copy and tags it with the new_tag. +Additionally, if the retagged asset has any children their parent will + be retagged as new_tag that was introduced upon this method execution. 
#### Parameters: - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of particular asset of interest - `tag` (`str`): name of the tag that identifies the asset of interest - `new_tag` (`str`): name of particular the new tag +- `force` (`bool`): force any actions that require approval #### Returns: @@ -142,7 +224,8 @@ def chk_digest_update_child(self, genome, remote_asset_name, child_name, server_ Check local asset digest against the remote one and populate children of the asset with the provided asset:tag. -In case the local asset does not exist, the config is populated with the remote asset digest and children data +In case the local asset does not exist, the config is populated with the remote + asset digest and children data #### Parameters: - `genome` (`str`): name of the genome to check the asset digests for @@ -158,14 +241,45 @@ In case the local asset does not exist, the config is populated with the remote +```python +def compare(self, genome1, genome2, explain=False) +``` + +Check genomes compatibility level. Compares Annotated Sequence Digests (ASDs) -- digested sequences and metadata +#### Parameters: + +- `genome1` (`str`): name of the first genome to compare +- `genome2` (`str`): name of the first genome to compare +- `explain` (`bool`): whether the returned code explanation shouldbe displayed + + +#### Returns: + +- `int`: compatibility code + + + + +```python +def data_dir(self) +``` + +Path to the genome data directory +#### Returns: + +- `str`: path to the directory where the assets are stored + + + + ```python def file_path(self) ``` -Return the path to the config file or None if not set +Path to the genome configuration file #### Returns: -- `str | None`: path to the file the object will would to +- `str`: path to the genome configuration file @@ -191,6 +305,30 @@ Determine path to a particular asset for a particular genome. 
+```python +def genome_aliases(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genomeidentifiers + + + + +```python +def genome_aliases_table(self) +``` + +Mapping of human-readable genome identifiers to genome identifiers +#### Returns: + +- `dict`: mapping of human-readable genome identifiers to genomeidentifiers + + + + ```python def genomes_list(self, order=None) ``` @@ -220,6 +358,42 @@ Get as single string this configuration's reference genome assembly IDs. +```python +def get_asds_path(self, genome) +``` + +Get path to the Annotated Sequence Digests JSON file for a given genome. Note that the path and/or genome may not exist. +#### Parameters: + +- `genome` (`str`): genome name + + +#### Returns: + +- `str`: ASDs path + + + + +```python +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fa582ccad08>) +``` + +Get a rich.Table object representing assets available locally +#### Parameters: + +- `genomes` (`list[str]`): genomes to restrict the results with +- `server_url` (`str`): server URL to query for the remote genome data +- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset + + +#### Returns: + +- `rich.table.Table`: table of assets available locally + + + + ```python def get_default_tag(self, genome, asset, use_existing=True) ``` @@ -229,7 +403,7 @@ Determine the asset tag to use as default. 
The one indicated by the 'default_tag - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest -- `use_existing` (`bool`): whether the first tag in the config should be returned in case there is no defaulttag defined for an asset +- `use_existing` (`bool`): whether the first tag in the config should be returned in case there is no default tag defined for an asset #### Returns: @@ -239,6 +413,54 @@ Determine the asset tag to use as default. The one indicated by the 'default_tag +```python +def get_genome_alias(self, digest, fallback=False, all_aliases=False) +``` + +Get the human readable alias for a genome digest +#### Parameters: + +- `digest` (`str`): digest to find human-readable alias for +- `fallback` (`bool`): whether to return the query digest in case of failure +- `all_aliases` (`bool`): whether to return all aliases instead of just the first one + + +#### Returns: + +- `str | list[str]`: human-readable aliases + + +#### Raises: + +- `GenomeConfigFormatError`: if "genome_digests" section does not exist in the config +- `UndefinedAliasError`: if no alias has been defined for the requested digest + + + + +```python +def get_genome_alias_digest(self, alias, fallback=False) +``` + +Get the genome digest for a human-readable genome alias +#### Parameters: + +- `alias` (`str`): alias to find digest for +- `fallback` (`bool`): whether to return the query alias in case of failure and in case it is one of the digests + + +#### Returns: + +- `str`: genome digest + + +#### Raises: + +- `UndefinedAliasError`: if the specified alias has not been assigned to any digest + + + + ```python def get_genome_attributes(self, genome) ``` @@ -275,7 +497,7 @@ List locally available reference genome IDs and assets by ID.
```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7f6f6a5b8f80>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fa582cd4840>) ``` List genomes and assets available remotely. @@ -294,7 +516,27 @@ List genomes and assets available remotely. ```python -def getseq(self, genome, locus) +def get_symlink_paths(self, genome, asset=None, tag=None, all_aliases=False) +``` + +Get path to the alias directory for the selected genome-asset-tag +#### Parameters: + +- `genome` (`str`): reference genome ID +- `asset` (`str`): asset name +- `tag` (`str`): tag name +- `all_aliases` (`bool`): whether to return a collection of symbolic links or just the first one from the alias list + + +#### Returns: + +- `dict`: + + + + +```python +def getseq(self, genome, locus, as_str=False) ``` Return the sequence found in a selected range and chromosome. Something like the refget protocol. @@ -302,6 +544,12 @@ Return the sequence found in a selected range and chromosome. Something like the - `genome` (`str`): name of the sequence identifier - `locus` (`str`): 1-10' +- `as_str` (`bool`): whether to convert the returned object to string and return just the sequence + + +#### Returns: + +- `str | pyfaidx.FastaRecord | pyfaidx.Sequence`: selected sequence @@ -348,6 +596,28 @@ Initialize genome configuration file on disk +```python +def initialize_genome(self, fasta_path, alias, fasta_unzipped=False, skip_alias_write=False) +``` + +Initialize a genome + +Create a JSON file with Annotated Sequence Digests (ASDs) +for the FASTA file in the genome directory.
+#### Parameters: + +- `fasta_path` (`str`): path to a FASTA file to initialize genome with +- `alias` (`str`): alias to set for the genome +- `skip_alias_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `str, list[dict[]]`: human-readable name for the genome + + + + ```python def is_asset_complete(self, genome, asset, tag) ``` @@ -395,7 +665,7 @@ List types/names of assets that are available for one--or all--genomes. - `genome` (`str | NoneType`): reference genome assembly ID, optional;if omitted, the full mapping from genome to asset names - `order` (`function(str) -> object`): how to key genome IDs and assetnames for sort -- `include_tags` (`bool`): whether asset tags should be included in the returned dict +- `include_tags` (`bool`): whether asset tags should be included in thereturned dict #### Returns: @@ -424,10 +694,10 @@ List assemblies for which a particular asset is available. ```python -def listr(self, genome=None, order=None, get_url= at 0x7f6f6a5bd0e0>) +def listr(self, genome=None, order=None, get_url= at 0x7fa582cd4950>, as_str=False) ``` -List genomes and assets available remotely. +List genomes and assets available remotely on all servers the object subscribes to #### Parameters: - `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance @@ -437,7 +707,7 @@ List genomes and assets available remotely. 
#### Returns: -- `str, str`: text reps of remotely available genomes and assets +- `dict[OrderedDict[list]]`: remotely available genomes and assetskeyed by genome keyed by source server endpoint @@ -455,7 +725,7 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, get_json_url= at 0x7f6f6a5bd3b0>, build_signal_handler=) +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fa582cd4bf8>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -466,6 +736,8 @@ Download and possibly unpack one or more assets for a given ref gen. - `tag` (`str`): name of particular tag to fetch - `unpack` (`bool`): whether to unpack a tarball - `force` (`bool | NoneType`): how to handle case in which asset pathalready exists; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt to replace existing file, and True to auto-replay Yes for existing asset replacement. +- `force_large` (`bool | NoneType`): how to handle case in large (> 5GB)asset is to be pulled; null for prompt (on a per-asset basis), False to effectively auto-reply No to the prompt, and True to auto-replay Yes +- `size_cutoff` (`float`): maximum archive file size to download withno prompt - `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset - `build_signal_handler` (`function(str) -> function`): how to createa signal handler to use during the download; the single argument to this function factory is the download filepath @@ -529,6 +801,24 @@ Remove any relationship links associated with the selected asset +```python +def remove_genome_aliases(self, digest, aliases=None) +``` + +Remove alias for a specified genome digest. 
This method will remove the digest both from the genomes object and from the aliases mapping in the config +#### Parameters: + +- `digest` (`str`): genome digest to remove an alias for +- `aliases` (`list[str]`): a collection of aliases to remove for the genome. If not provided, all aliases for the digest will be removed + + +#### Returns: + +- `bool`: whether the removal has been performed + + + + ```python def run_plugins(self, hook) ``` @@ -542,7 +832,38 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7f6f6a5b8b90>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fa582cd4378>) +``` + +Seek path to a specified genome-asset-tag alias +#### Parameters: + +- `genome_name` (`str`): name of a reference genome assembly of interest +- `asset_name` (`str`): name of the particular asset to fetch +- `tag_name` (`str`): name of the particular asset tag to fetch +- `seek_key` (`str`): name of the particular subasset to fetch +- `strict_exists` (`bool | NoneType`): how to handle case in which path doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). +- `check_exist` (`function(callable) -> bool`): how to check for asset/path existence +- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `all_aliases` (`bool`): whether to return paths to all asset aliases orjust the one for the specified 'genome_name` argument + + +#### Returns: + +- `str`: path to the asset + + +#### Raises: + +- `TypeError`: if the existence check is not a one-arg function +- `refgenconf.MissingGenomeError`: if the named assembly isn't knownto this configuration instance +- `refgenconf.MissingAssetError`: if the names assembly is known tothis configuration instance, but the requested asset is unknown + + + + +```python +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fa582cd4488>) ``` Seek path to a specified genome-asset-tag @@ -554,7 +875,7 @@ Seek path to a specified genome-asset-tag - `seek_key` (`str`): name of the particular subasset to fetch - `strict_exists` (`bool | NoneType`): how to handle case in whichpath doesn't exist; True to raise IOError, False to raise RuntimeWarning, and None to do nothing at all. Default: None (do not check). - `check_exist` (`function(callable) -> bool`): how to check forasset/path existence -- `enclosing_dir` (`bool`): whether a path to the entire enclosing directory should be returned, e.g.for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned +- `enclosing_dir` (`bool`): whether a path to the entire enclosingdirectory should be returned, e.g. 
for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned #### Returns: @@ -572,7 +893,7 @@ Seek path to a specified genome-asset-tag ```python -def set_default_pointer(self, genome, asset, tag, force=False) +def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None) ``` Point to the selected tag by default @@ -581,7 +902,33 @@ Point to the selected tag by default - `genome` (`str`): name of a reference genome assembly of interest - `asset` (`str`): name of the particular asset of interest - `tag` (`str`): name of the particular asset tag to point to by default -- `force` (`bool`): whether the default tag change should be forced (even if it exists) +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. +- `force` (`bool`): whether the default tag change should beforced (even if it exists) + + + + +```python +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fa582cd4ea0>) +``` + +Assign a human-readable alias to a genome identifier. + +Genomes are identified by a unique identifier which is derived from the +FASTA file (part of fasta asset). This way we can ensure genome +provenance and compatibility with the server. This function maps a +human-readable identifier to make referring to the genomes easier. +#### Parameters: + +- `genome` (`str`): name of the genome to assign to an identifier +- `digest` (`str`): identifier to use +- `overwrite` (`bool`): whether all the previously set aliases should beremoved and just the current one stored +- `no_write` (`bool`): whether to skip writing the alias to the file + + +#### Returns: + +- `bool`: whether the alias has been established @@ -603,7 +950,7 @@ Otherwise the current one will be appended to. 
```python -def tag(self, genome, asset, tag, new_tag, files=True) +def tag(self, genome, asset, tag, new_tag, files=True, force=False) ``` Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. @@ -648,7 +995,7 @@ Remove URLs the list of genome_servers. ```python -def update_assets(self, genome, asset=None, data=None) +def update_assets(self, genome, asset=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset mapping is missing, it will be created @@ -656,6 +1003,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -667,13 +1015,14 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass ```python -def update_genomes(self, genome, data=None) +def update_genomes(self, genome, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome is missing, it will be added #### Parameters: - `genome` (`str`): genome to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. 
- `data` (`Mapping`): data to be added/updated @@ -695,7 +1044,7 @@ A convenience method which wraps the update assets and uses it to update the ass - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated - `data` (`list`): asset parents to be added/updated -- `children` (`bool`): a logical indicating whether the relationship to be added is 'children' +- `children` (`bool`): a logical indicating whether the relationship to beadded is 'children' #### Returns: @@ -706,7 +1055,7 @@ A convenience method which wraps the update assets and uses it to update the ass ```python -def update_seek_keys(self, genome, asset, tag=None, keys=None) +def update_seek_keys(self, genome, asset, tag=None, keys=None, force_digest=None) ``` A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. @@ -715,6 +1064,7 @@ A convenience method which wraps the updated assets and uses it to update the se - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. The alias willnot be converted to the digest, even if provided. - `keys` (`Mapping`): seek_keys to be added/updated @@ -726,7 +1076,7 @@ A convenience method which wraps the updated assets and uses it to update the se ```python -def update_tags(self, genome, asset=None, tag=None, data=None) +def update_tags(self, genome, asset=None, tag=None, data=None, force_digest=None) ``` Updates the genomes in RefGenConf object at any level. If a requested genome-asset-tag mapping is missing, it will be created @@ -735,6 +1085,7 @@ Updates the genomes in RefGenConf object at any level. If a requested genome-ass - `genome` (`str`): genome to be added/updated - `asset` (`str`): asset to be added/updated - `tag` (`str`): tag to be added/updated +- `force_digest` (`str`): digest to force update of. 
The alias willnot be converted to the digest, even if provided. - `data` (`Mapping`): data to be added/updated @@ -774,43 +1125,49 @@ Write the contents to a file. If pre- and post-update plugins are defined, they #### Raises: -- `OSError`: when the object has been created in a read only mode or other process has locked the file +- `OSError`: when the object has been created in a read only mode or otherprocess has locked the file - `TypeError`: when the filepath cannot be determined.This takes place only if YacAttMap initialized with a Mapping as an input, not read from file. - `OSError`: when the write is called on an object with no write capabilitiesor when writing to a file that is locked by a different object -## Class `GenomeConfigFormatError` -Exception for invalid genome config file format. +## Class `RefgenconfError` +Base exception type for this package + + +## Class `RemoteDigestMismatchError` +Remote digest of the parent asset does not match its local counterpart ```python -def __init__(self, msg) +def __init__(self, asset, local_digest, remote_digest) ``` Initialize self. See help(type(self)) for accurate signature. -## Class `MissingAssetError` -Error type for request of an unavailable genome asset. +## Class `UnboundEnvironmentVariablesError` +Use of environment variable that isn't bound to a value. -## Class `MissingConfigDataError` -Missing required configuration instance items +```python +def get_dir_digest(path, pm=None) +``` +Generate a MD5 digest that reflects just the contents of the files in the selected directory. +#### Parameters: -## Class `MissingGenomeError` -Error type for request of unknown genome/assembly. +- `path` (`str`): path to the directory to digest +- `pm` (`pypiper.PipelineManager`): a pipeline object, optional.The subprocess module will be used if not provided -## Class `RefgenconfError` -Base exception type for this package +#### Returns: + +- `str`: a digest, e.g. 
a3c46f201a3ce7831d85cf4a125aa334 -## Class `UnboundEnvironmentVariablesError` -Use of environment variable that isn't bound to a value. ```python @@ -831,7 +1188,27 @@ Get path to genome configuration file. +```python +def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fa582cca730>, link_fun= at 0x7fa582cd8158>) +``` + +Upgrade the config to a selected target version. + +Convert the config file to target_version format, update file structure +inside genome_folder. Drop genomes for which genome_digest is not available +on any of the servers and do not have a fasta asset locally. +#### Parameters: + +- `target_version` (`str`): the version updated to +- `filepath` (`str`): path to config file +- `force` (`bool`): whether the upgrade should be confirmed upfront +- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset +- `link_fun` (`callable`): function to use to link files, e.g os.symlink or os.link + + + + -*Version Information: `refgenconf` v0.9.0, generated by `lucidoc` v0.4.2* \ No newline at end of file +*Version Information: `refgenconf` v0.10.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file diff --git a/docs/compare.md b/docs/compare.md index b0a22ed1..a3b9b8c5 100644 --- a/docs/compare.md +++ b/docs/compare.md @@ -22,16 +22,17 @@ refgenie compare hg38 hg38_plus The result is a set of informative flags that determine the level of compatibility of the genomes, for example: ```console -Binary: 0b1100111010101 +Flag: 2005 +Binary: 0b11111010101 -CONTENT_ALL_A_IN_B # sequence content -LENGTHS_ALL_A_IN_B # sequence lengths -NAMES_ALL_A_IN_B # sequence names -TOPO_ALL_A_IN_B # sequence topology -TOPO_ALL_B_IN_A -CONTENT_ANY_SHARED -CONTENT_A_ORDER # sequence order +CONTENT_ALL_A_IN_B +LENGTHS_ALL_A_IN_B +NAMES_ALL_A_IN_B +CONTENT_A_ORDER CONTENT_B_ORDER +CONTENT_ANY_SHARED +LENGTHS_ANY_SHARED +NAMES_ANY_SHARED ``` Based on the output above we can conclude that genome 
`hg38_plus` is a superset of `hg38`. diff --git a/docs/refgenieserver.md b/docs/refgenieserver.md index 83204a42..72cae24c 100644 --- a/docs/refgenieserver.md +++ b/docs/refgenieserver.md @@ -18,3 +18,17 @@ docker run --rm -d -p 80:80 \ ``` Mount your archived genomes folder to `/genomes` in the container, and you're essentially good to go. + +### References + +We have scripted the process of building and archiving the assets to serve with `refgenieserver`. The process usually includes the following steps: + +1. Download raw input files for assets (FASTA files, GTF files etc.) +2. Build assets with refgenie build in a local refgenie instance +3. Archive assets with refgenieserver archive +4. Deploy the server (run `refgenieserver serve` on a cluster or locally) + +Check out these GitHub repositories for more details and all the required metadata: + +- [`refgenie/refgenomes.databio.org`](https://github.com/refgenie/refgenomes.databio.org) (primary refgenie assets server instance) +- [`refgenie/rg.databio.org`](https://github.com/refgenie/rg.databio.org) (development refgenie assets server instace) diff --git a/docs/usage.md b/docs/usage.md index 0b64f018..d93d1f5c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,7 +2,7 @@ ## `refgenie --help` ```console -version: 0.10.0-dev | refgenconf 0.10.0-dev +version: 0.10.0 | refgenconf 0.10.0 usage: refgenie [-h] [--version] [--silent] [--verbosity V] [--logdev] {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} ... @@ -272,7 +272,7 @@ optional arguments: ## `refgenie tag --help` ```console -usage: refgenie tag [-h] [-c C] [--skip-read-lock] [-g G] (-t TAG | -d) +usage: refgenie tag [-h] [-c C] [--skip-read-lock] [-g G] [-f] (-t TAG | -d) asset-registry-paths [asset-registry-paths ...] Tag an asset. @@ -287,6 +287,7 @@ optional arguments: environment variable is set. 
--skip-read-lock Whether the config file should not be locked for reading -g G, --genome G Reference assembly ID, e.g. mm10. + -f, --force Do not prompt before action, approve upfront. -t TAG, --tag TAG Tag to assign to an asset. -d, --default Set the selected asset tag as the default one. ``` diff --git a/docs_jupyter/aliases.ipynb b/docs_jupyter/aliases.ipynb index be76bdac..bfecaafa 100644 --- a/docs_jupyter/aliases.ipynb +++ b/docs_jupyter/aliases.ipynb @@ -43,7 +43,7 @@ ], "source": [ "export REFGENIE=$(pwd)/refgenie.yaml\n", - "refgenie init -c $REFGENIE -s http://rg.databio.org:82" + "refgenie init -c $REFGENIE -s http://rg.databio.org" ] }, { @@ -62,27 +62,28 @@ "name": "stdout", "output_type": "stream", "text": [ + "Compatible refgenieserver instances: ['http://rg.databio.org']\n", "No local digest for genome alias: rCRSd\n", - "Setting 'rCRSd' identity with server: http://rg.databio.org:82/v3/alias/genome_digest/rCRSd\n", - "Determined server digest for local genome alias (rCRSd): 511fb1178275e7d529560d53b949dba40815f195623bce8e\n", - "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Setting 'rCRSd' identity with server: http://rg.databio.org/v3/genomes/genome_digest/rCRSd\n", + "Determined server digest for local genome alias (rCRSd): 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", - "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/archive\n", - "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta:def… \u001b[35m10…\u001b[0m 24.0/8.… 60.3-:\n", - "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", - "Extracting asset tarball: 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/fasta__default.tgz\n", - "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", + "Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta\n", + "\u001b[2K94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta:default \u001b[35m100.0%\u001b[0m • • • …\u001b[0m • • ? • … …\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/fasta__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/fasta__default.tgz\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta' set to: default\n", "Initializing genome: rCRSd\n", "Loaded AnnotatedSequenceDigestList (1 sequences)\n", - "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" ] } ], "source": [ - "refgenie pull rCRSd/fasta" + "refgenie pull rCRSd/fasta --force" ] }, { @@ -106,18 +107,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Downloading URL: http://rg.databio.org:82/v3/asset/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/archive\n", - "\u001b[2K511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index:de… \u001b[35m1…\u001b[0m 128.0/117.\n", - "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/bowtie2_index__default.tgz\n", - "Extracting asset tarball: 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/bowtie2_index__default.tgz\n", - "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", + "Compatible refgenieserver instances: ['http://rg.databio.org']\n", + "Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index\n", + "\u001b[2K94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index:defau… \u001b[35m100.…\u001b[0m • •\n", + "\u001b[?25hDownload complete: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz\n", + "Extracting asset tarball: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" ] } ], "source": [ - "refgenie pull rCRSd/bowtie2_index" + "refgenie pull rCRSd/bowtie2_index --force" ] }, { @@ -164,7 +166,7 @@ "source": [ "#### Set aliases\n", "\n", - "To set an alias \"mito\" for genome identified by digest `511fb1178275e7d529560d53b949dba40815f195623bce8e` one needs to issue the command below:" + "To set an alias \"mito\" for genome identified by digest `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` one needs to issue the command below:" ] }, { @@ -176,14 +178,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: ['mito'])\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: mito)\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mito\n" ] } ], "source": [ - "refgenie alias set 
--aliases mito --digest 511fb1178275e7d529560d53b949dba40815f195623bce8e" + "refgenie alias set --aliases mito --digest 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4" ] }, { @@ -208,7 +210,7 @@ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1malias \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", - "│ 511fb1178275e7d529560d53b949dba40815f195623bce8e │ rCRSd, mito │\n", + "│ 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 │ rCRSd, mito │\n", "└──────────────────────────────────────────────────┴─────────────┘\n" ] } @@ -251,7 +253,7 @@ "\n", "10 directories\n", "\u001b[01;34mdata\u001b[00m\n", - "└── \u001b[01;34m511fb1178275e7d529560d53b949dba40815f195623bce8e\u001b[00m\n", + "└── \u001b[01;34m94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\u001b[00m\n", " ├── \u001b[01;34mbowtie2_index\u001b[00m\n", " │   └── \u001b[01;34mdefault\u001b[00m\n", " └── \u001b[01;34mfasta\u001b[00m\n", @@ -284,9 +286,9 @@ "text": [ "\u001b[01;34malias/rCRSd/fasta\u001b[00m\n", "└── \u001b[01;34mdefault\u001b[00m\n", - " ├── \u001b[01;36mrCRSd.chrom.sizes\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes\n", - " ├── \u001b[01;36mrCRSd.fa\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n", - " └── \u001b[01;36mrCRSd.fa.fai\u001b[00m -> ../../../../data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai\n", + " ├── \u001b[01;36mrCRSd.chrom.sizes\u001b[00m -> ../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.chrom.sizes\n", + " ├── \u001b[01;36mrCRSd.fa\u001b[00m -> 
../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa\n", + " └── \u001b[01;36mrCRSd.fa.fai\u001b[00m -> ../../../../data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.fai\n", "\n", "1 directory, 3 files\n" ] @@ -302,13 +304,6 @@ "source": [ "This explicitly shows that the files inside `alias/rCRSd/fasta/default` are in fact symbolic links that point to the actual asset files in `data` directory." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs_jupyter/tutorial.ipynb b/docs_jupyter/tutorial.ipynb index a3f5bee9..3f149ee2 100644 --- a/docs_jupyter/tutorial.ipynb +++ b/docs_jupyter/tutorial.ipynb @@ -30,7 +30,7 @@ } ], "source": [ - "!refgenie init -c refgenie.yaml -s http://rg.databio.org:82" + "!refgenie init -c refgenie.yaml -s http://rg.databio.org" ] }, { @@ -52,7 +52,7 @@ "config_version: 0.4\r\n", "genome_folder: /Users/mstolarczyk/code/refgenie/docs_jupyter\r\n", "genome_servers: \r\n", - " - http://rg.databio.org:82\r\n", + " - http://rg.databio.org\r\n", "genomes: null\r\n" ] } @@ -61,69 +61,102 @@ "!cat refgenie.yaml" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's enter python and do some stuff." 
- ] - }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Remote refgenie assets \u001b[0m\r\n", + "\u001b[3m Server URL: http://rg.databio.org \u001b[0m\r\n", + "┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massets \u001b[0m\u001b[1m \u001b[0m┃\r\n", + "┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\r\n", + "│ rCRSd │ fasta, bowtie2_index, bwa_index, hisat2_index, │\r\n", + "│ │ star_index, bismark_bt2_index │\r\n", + "│ hg18_cdna │ fasta, kallisto_index │\r\n", + "│ hs38d1 │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg38_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ human_repeats │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ rn6_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ mm10_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ hg38_chr22 │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg38 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bowtie2_index, │\r\n", + "│ │ bwa_index, tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ hg19_cdna │ fasta, kallisto_index, salmon_index │\r\n", + "│ human_rDNA │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", 
+ "│ human_alu │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, bismark_bt2_index │\r\n", + "│ human_alphasat │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ mouse_chrM2x │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ t7 │ fasta, bowtie2_index │\r\n", + "│ mm10 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bwa_index, │\r\n", + "│ │ bowtie2_index, hisat2_index, tallymer_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ dm6 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ bowtie2_index │\r\n", + "│ hg18 │ fasta, gencode_gtf, fasta_txome, suffixerator_index, │\r\n", + "│ │ cellranger_reference, bwa_index, bowtie2_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index │\r\n", + "│ hg19 │ fasta, gencode_gtf, ensembl_gtf, refgene_anno, │\r\n", + "│ │ fasta_txome, ensembl_rb, feat_annotation, │\r\n", + "│ │ suffixerator_index, cellranger_reference, bwa_index, │\r\n", + "│ │ bowtie2_index, tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ salmon_partial_sa_index, bismark_bt2_index │\r\n", + "│ rn6 │ fasta, ensembl_gtf, refgene_anno, fasta_txome, │\r\n", + "│ │ suffixerator_index, bwa_index, bowtie2_index, │\r\n", + "│ │ tallymer_index, hisat2_index, star_index, │\r\n", + "│ │ bismark_bt2_index, salmon_partial_sa_index │\r\n", + "│ hg38_noalt_decoy │ fasta, suffixerator_index, bowtie2_index, bwa_index, │\r\n", + "│ │ tallymer_index, hisat2_index, bismark_bt2_index │\r\n", + "│ mm10_primary │ fasta, bowtie2_index, bwa_index │\r\n", + "│ hg38_primary │ fasta, bowtie2_index, bwa_index │\r\n", 
+ "│ hg38_mm10 │ fasta, bwa_index │\r\n", + "└──────────────────┴───────────────────────────────────────────────────────────┘\r\n", + "\u001b[2;3m use refgenie listr -g for more detailed view \u001b[0m\r\n" + ] + } + ], "source": [ - "import refgenconf\n", - "rgc = refgenconf.RefGenConf(\"refgenie.yaml\")" + "!refgenie listr -c refgenie.yaml" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Use `listr` to see what's available on the server:" + "Now let's enter python and do some stuff." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'http://rg.databio.org:82/v3/assets': OrderedDict([('hg38',\n", - " ['bowtie2_index:default',\n", - " 'fasta.chrom_sizes:default',\n", - " 'fasta.fai:default',\n", - " 'fasta:default']),\n", - " ('human_repeats',\n", - " ['bwa_index:default',\n", - " 'fasta.chrom_sizes:default',\n", - " 'fasta.fai:default',\n", - " 'fasta:default',\n", - " 'hisat2_index:default']),\n", - " ('mouse_chrM2x',\n", - " ['bowtie2_index:default',\n", - " 'bwa_index:default',\n", - " 'fasta.chrom_sizes:default',\n", - " 'fasta.fai:default',\n", - " 'fasta:default']),\n", - " ('rCRSd',\n", - " ['bowtie2_index:default',\n", - " 'fasta.chrom_sizes:default',\n", - " 'fasta.fai:default',\n", - " 'fasta:default'])])}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "rgc.listr()" + "import refgenconf\n", + "rgc = refgenconf.RefGenConf(filepath=\"refgenie.yaml\")" ] }, { @@ -141,7 +174,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7640cd78195940e7a21600a1313320f8", + "model_id": "abab7f40d9654ca6ba8c60471cbe303a", "version_major": 2, "version_minor": 0 }, @@ -155,19 +188,24 @@ { "data": { "text/plain": [ - "(['194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496', 'fasta', 'default'],\n", + "(['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a', 'fasta', 'default'],\n", " 
{'asset_path': 'fasta',\n", " 'asset_digest': '8dfe402f7d29d5b036dd8937119e4404',\n", - " 'archive_digest': 'deae753231ebb9df82622c7140e0bd3a',\n", - " 'asset_size': '46.8KB',\n", + " 'archive_digest': 'bfb7877ee114c61a17a50bd471de47a2',\n", + " 'asset_size': '39.4KB',\n", " 'archive_size': '9.1KB',\n", - " 'seek_keys': {'fasta': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.fa',\n", - " 'fai': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.fa.fai',\n", - " 'chrom_sizes': '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496.chrom.sizes'},\n", + " 'seek_keys': {'fasta': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa',\n", + " 'fai': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa.fai',\n", + " 'chrom_sizes': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.chrom.sizes'},\n", " 'asset_parents': [],\n", - " 'asset_children': ['194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496/bwa_index:default',\n", - " '194f8681e3d9e35b9eca2d17ec5e36bbf5e8c2beea486496/bowtie2_index:default']},\n", - " 'http://rg.databio.org:82')" + " 'asset_children': ['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/suffixerator_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bowtie2_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bwa_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/tallymer_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/hisat2_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/star_index:default',\n", + " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bismark_bt2_index:default']},\n", + " 'http://rg.databio.org')" ] }, "execution_count": 5, @@ -256,16 +294,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2020-11-02 08:49:28-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", + "--2021-03-09 12:22:40-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n", "Resolving big.databio.org 
(big.databio.org)... 128.143.245.181, 128.143.245.182\n", "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 8399 (8.2K) [application/octet-stream]\n", "Saving to: ‘rCRSd.fa.gz’\n", "\n", - "rCRSd.fa.gz 100%[===================>] 8.20K 10.3KB/s in 0.8s \n", + "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.006s \n", "\n", - "2020-11-02 08:49:29 (10.3 KB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", + "2021-03-09 12:22:40 (1.35 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n", "\n" ] } @@ -288,19 +326,19 @@ "Building 'rCRSd/fasta:default' using 'fasta' recipe\n", "Initializing genome: rCRSd\n", "Loaded AnnotatedSequenceDigestList (1 sequences)\n", - "Set genome alias (511fb1178275e7d529560d53b949dba40815f195623bce8e: rCRSd)\n", + "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n", "Saving outputs to:\n", - "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", - "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n", "* Compute host: MichalsMBP\n", "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", - "* Outfolder: 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/\n", - "* Pipeline started at: (11-02 08:49:30) elapsed: 0.0 _TIME_\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/\n", + "* Pipeline started at: (03-09 12:22:41) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", @@ -336,59 +374,61 @@ "\n", "----------------------------------------\n", "\n", - "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` \n", "\n", - "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (28209)\n", + "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63575)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=28209)\n",
-      "Warning: couldn't add memory use for process: 28209\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63575)\n",
+      "Warning: couldn't add memory use for process: 63575\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", - " PID: 28209;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", + " PID: 63575;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.gz` (28210)\n", + "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63576)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=28210)\n",
-      "Warning: couldn't add memory use for process: 28210\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63576)\n",
+      "Warning: couldn't add memory use for process: 63576\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", - " PID: 28210;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n", + " PID: 63576;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa` (28211)\n", + "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa` (63577)\n", "
\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", - " PID: 28211;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.002GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63577;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.001GB\n", "\n", "\n", - "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.chrom.sizes` (28212)\n", + "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.chrom.sizes` (63578)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=28212)\n",
-      "Warning: couldn't add memory use for process: 28212\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63578)\n",
+      "Warning: couldn't add memory use for process: 63578\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", - " PID: 28212;\tCommand: cut;\tReturn code: 0;\tMemory used: 0.001GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63578;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_fasta__default.flag` (28214)\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` (63580)\n", "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=28214)\n",
-      "Warning: couldn't add memory use for process: 28214\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63580)\n",
+      "Warning: couldn't add memory use for process: 63580\n",
       "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.002GB. \n", - " PID: 28214;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n", + " PID: 63580;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", "\n", "Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n", - "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta' set to: default\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:00\n", "* Total elapsed time (all runs): 0:00:00\n", - "* Peak memory (this run): 0.0016 GB\n", - "* Pipeline completed time: 2020-11-02 08:49:30\n", - "Finished building 'fasta' asset\n" + "* Peak memory (this run): 0.0015 GB\n", + "* Pipeline completed time: 2021-03-09 12:22:41\n", + "Finished building 'fasta' asset\n", + "Created alias directories: \n", + " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n" ] } ], @@ -396,10 +436,47 @@ "!refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The asset should be available for local use, let's call `refgenie list` to check it:" + ] + }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[3m Local refgenie assets \u001b[0m\r\n", + "\u001b[3m Server subscriptions: http://rg.databio.org \u001b[0m\r\n", + "┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓\r\n", + "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1masset (\u001b[0m\u001b[1;3mseek_keys\u001b[0m\u001b[1m) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtags \u001b[0m\u001b[1m \u001b[0m┃\r\n", + 
"┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩\r\n", + "│ rCRSd │ fasta (\u001b[3mfasta, fai, chrom_sizes\u001b[0m) │ default │\r\n", + "└───────────┴────────────────────────────────────────────┴───────────┘\r\n" + ] + } + ], + "source": [ + "!refgenie list -c refgenie.yaml --genome rCRSd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can retrieve the path to this asset with:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -417,12 +494,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can do the same thing from within python:" + "Naturally, we can do the same thing from within Python:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -431,7 +508,7 @@ "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa'" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -445,14 +522,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " Now if you have bowtie2-build in your PATH you can build the bowtie2 index with no further requirements.\n", + "Now, if we have bowtie2-build in our `$PATH` we can build the `bowtie2_index` asset with no further requirements.\n", "\n", - "You can see the requirements with `--requirements`:\n" + "Let's check the requirements with `refgenie build --requirements`:\n" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -478,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -489,15 +566,15 @@ "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n", "Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n", 
"Saving outputs to:\n", - "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e\n", - "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build\n", + "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n", + "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build\n", "### Pipeline run code and environment:\n", "\n", "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n", "* Compute host: MichalsMBP\n", "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n", - "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/\n", - "* Pipeline started at: (11-02 08:49:34) elapsed: 0.0 _TIME_\n", + "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/\n", + "* Pipeline started at: (03-09 12:22:45) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", @@ -533,12 +610,12 @@ "\n", "----------------------------------------\n", "\n", - "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` \n", + "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` \n", "\n", - "> `bowtie2-build 
/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e` (28246)\n", + "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` (63609)\n", "
\n",
       "Settings:\n",
-      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.*.bt2\"\n",
+      "  Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.*.bt2\"\n",
       "  Line rate: 6 (line is 64 bytes)\n",
       "  Lines per side: 1 (side is 64 bytes)\n",
       "  Offset rate: 4 (one in 16)\n",
@@ -555,7 +632,7 @@
       "  Random seed: 0\n",
       "  Sizeofs: void*:8, int:4, long:8, size_t:8\n",
       "Input files DNA, FASTA:\n",
-      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/fasta/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.fa\n",
+      "  /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa\n",
       "Building a SMALL index\n",
       "Reading reference sizes\n",
       "  Time reading reference sizes: 00:00:00\n",
@@ -609,8 +686,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -692,8 +769,8 @@
       "fchr[$]: 33136\n",
       "Exiting Ebwt::buildToDisk()\n",
       "Returning from initFromVector\n",
-      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.1.bt2\n",
-      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/511fb1178275e7d529560d53b949dba40815f195623bce8e.rev.2.bt2\n",
+      "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.1.bt2\n",
+      "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.2.bt2\n",
       "Re-opening _in1 and _in2 as input streams\n",
       "Returning from Ebwt constructor\n",
       "Headers:\n",
@@ -722,33 +799,33 @@
       "    color: 0\n",
       "    reverse: 1\n",
       "Total time for backward call to driver() for mirror index: 00:00:00\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.0GB. \n", - " PID: 28246;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.0GB\n", - "\n", - "\n", - "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index/default/_refgenie_build/511fb1178275e7d529560d53b949dba40815f195623bce8e_bowtie2_index__default.flag` (28248)\n", - "
\n",
-      "psutil.ZombieProcess process still exists but it's a zombie (pid=28248)\n",
-      "Warning: couldn't add memory use for process: 28248\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.0GB. \n", - " PID: 28248;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", - "\n" + "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n", + " PID: 63609;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.003GB\n", + "\n", + "\n", + "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` (63611)\n", + "
\n",
+      "psutil.ZombieProcess process still exists but it's a zombie (pid=63611)\n",
+      "Warning: couldn't add memory use for process: 63611\n",
+      "
\n", + "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n", + " PID: 63611;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n", + "\n", "Asset digest: 1262e30d4a87db9365d501de8559b3b4\n", - "Default tag for '511fb1178275e7d529560d53b949dba40815f195623bce8e/bowtie2_index' set to: default\n", + "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:01\n", - "* Total elapsed time (all runs): 0:00:01\n", - "* Peak memory (this run): 0.0003 GB\n", - "* Pipeline completed time: 2020-11-02 08:49:34\n", + "* Total elapsed time (all runs): 0:00:00\n", + "* Peak memory (this run): 0.0028 GB\n", + "* Pipeline completed time: 2021-03-09 12:22:46\n", "Finished building 'bowtie2_index' asset\n", "Created alias directories: \n", " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n" @@ -763,19 +840,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can see a list of available recipes like this:" + "We can see a list of available recipes like this:" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\r\n" + "bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, 
suffixerator_index, tallymer_index, tgMap\r\n" ] } ], @@ -787,12 +864,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can get the unique digest for any asset with `refgenie id`:" + "We can get the unique digest for any asset with `refgenie id`:" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -816,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -825,7 +902,7 @@ "'3.6.5'" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -837,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { diff --git a/recipes.md b/recipes.md deleted file mode 100644 index 821a1683..00000000 --- a/recipes.md +++ /dev/null @@ -1,36 +0,0 @@ -# Refgenie Recipes - -Here are a few easy scripts you can use to re-index some of your favorite genomes - -## hg19 - -```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit -GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n hg19 -``` - -## hg38 -(use the NCBI's official version for sequence alignments without _alt sequences:) -Old link: INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz - -This README describes the sequences: - -ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/README_analysis_sets.txt - -```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz 
-GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_23/gencode.v23.primary_assembly.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n hg38 -``` - -## mm10 - -```console -BUILDER=${CODEBASE}refgenie/src/refgenie.py -INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/635/GCA_000001635.5_GRCm38.p3/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001635.5_GRCm38.p3_no_alt_analysis_set.fna.gz -GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M12/gencode.vM12.primary_assembly.annotation.gtf.gz -${BUILDER} -i ${INPUT} -a ${GTF} -n mm10 -``` diff --git a/refgenie/cli.py b/refgenie/cli.py index ab40f8c5..cb6a2b70 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -273,7 +273,7 @@ def main(): try: if not rgc.is_asset_complete(**gat): with rgc as r: - r.cfg_remove_assets(**gat, aliases=args.aliases) + r.cfg_remove_assets(**gat) _LOGGER.info( "Removed an incomplete asset " "'{genome}/{asset}:{tag}'".format(*gat) From 16bfb5d4011530208891b369fffec91d39edac11 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 10 Mar 2021 13:16:50 -0500 Subject: [PATCH 108/110] update yacman req --- requirements/requirements-all.txt | 2 +- requirements/requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index dd79a365..e9a19412 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -2,4 +2,4 @@ logmuse>=0.2.6 # refgenconf>=0.10.0-dev piper>=0.12.1 pyfaidx>=0.5.5.2 -yacman>=0.7.1 \ No newline at end of file +yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 493054a9..c368db1d 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,2 +1 @@ --e git+git://github.com/databio/yacman@dev#egg=yacman -e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file From
5d196bcf6c9c60ab0583961329118bf6097b5b09 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 11 Mar 2021 12:41:16 -0500 Subject: [PATCH 109/110] update changelog and version --- docs/changelog.md | 2 +- refgenie/_version.py | 2 +- requirements/requirements-all.txt | 2 +- requirements/requirements-doc.txt | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 6918b537..fd33082f 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,7 +1,7 @@ # Changelog This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. -## [0.10.0] - unreleased +## [0.10.0] - 2021-03-11 **After updating to this version your configuration file and genome assets will not be compatible with the software. Please refer to the [upgrade tutorial](config_upgrade_03_to_04.md) for instructions on how to migrate the config between versions.** diff --git a/refgenie/_version.py b/refgenie/_version.py index 2aceaf4d..61fb31ca 100644 --- a/refgenie/_version.py +++ b/refgenie/_version.py @@ -1 +1 @@ -__version__ = "0.10.0-dev" +__version__ = "0.10.0" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index e9a19412..5180d4c9 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ logmuse>=0.2.6 -# refgenconf>=0.10.0-dev +# refgenconf>=0.10.0 piper>=0.12.1 pyfaidx>=0.5.5.2 yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt index a7372dc4..b1e1259c 100644 --- a/requirements/requirements-doc.txt +++ b/requirements/requirements-doc.txt @@ -1,2 +1 @@ https://github.com/databio/mkdocs-databio/archive/master.zip -refgenconf>=0.6.1 From f68a1b61aa99fa72a4c8b1f8a301bf93c235387c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 11 Mar 2021 13:09:26 -0500 Subject: [PATCH 110/110] update 
refgenconf requirement --- requirements/requirements-all.txt | 2 +- requirements/requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 5180d4c9..b7816e4c 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ logmuse>=0.2.6 -# refgenconf>=0.10.0 +refgenconf>=0.10.0 piper>=0.12.1 pyfaidx>=0.5.5.2 yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index c368db1d..e69de29b 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1 +0,0 @@ --e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf \ No newline at end of file