From 32dcf2ce4cb95cbdea7dcc2c2ed87f0b8d8c760f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Thu, 11 Mar 2021 13:32:57 -0500 Subject: [PATCH 01/44] add refgenconf to doc reqs --- requirements/requirements-doc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt index b1e1259c..13abedf0 100644 --- a/requirements/requirements-doc.txt +++ b/requirements/requirements-doc.txt @@ -1 +1,2 @@ https://github.com/databio/mkdocs-databio/archive/master.zip +refgenconf>=0.10.0 \ No newline at end of file From 120c84b3e5837df465781f95fc15fd8af9d6dee1 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 17 Mar 2021 11:46:50 -0400 Subject: [PATCH 02/44] implement populate cmd. See #230 --- refgenie/argparser.py | 8 ++++++++ refgenie/cli.py | 19 +++++++++++++++++++ refgenie/const.py | 2 ++ tests/data/pop_test.txt | 6 ++++++ 4 files changed, 35 insertions(+) create mode 100644 tests/data/pop_test.txt diff --git a/refgenie/argparser.py b/refgenie/argparser.py index ff7df2c7..36bf9dc4 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -481,4 +481,12 @@ def add_subparser(cmd, msg, subparsers): ), ) + sps[POPULATE_CMD].add_argument( + "-f", + "--file", + metavar="F", + help="File with registry paths to populate") + + + return parser diff --git a/refgenie/cli.py b/refgenie/cli.py index cb6a2b70..f6ec282d 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -17,6 +17,7 @@ MissingAssetError, MissingGenomeError, DownloadJsonError, + populate_refgenie_refs, upgrade_config, __version__ as rgc_version, select_genome_config, @@ -360,6 +361,24 @@ def main(): target_version=args.target_version, filepath=gencfg, force=args.force ) + elif args.command == POPULATE_CMD: + _LOGGER.debug("Populating file: {}".format(args.file)) + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + # demo = {"genome": 'refgenie://hg19/fasta', + # "other_attr": "something", + # "bt2": 
'refgenie://t7/bwa_index'} + # res = populate_refgenie_refs(rgc, demo) + # print(res) + if args.file: + with open(args.file) as fp: + for line in fp: + sys.stdout.write(populate_refgenie_refs(rgc, line)) + else: + for line in sys.stdin: + if 'q' == line.rstrip(): + break + sys.stdout.write(populate_refgenie_refs(rgc, line)) + def perm_check_x(file_to_check, message_tag="genome directory"): """ diff --git a/refgenie/const.py b/refgenie/const.py index d8ec6600..21e49507 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -9,6 +9,7 @@ BUILD_CMD = "build" INIT_CMD = "init" PULL_CMD = "pull" +POPULATE_CMD = "populate" LIST_LOCAL_CMD = "list" LIST_REMOTE_CMD = "listr" GET_ASSET_CMD = "seek" @@ -45,6 +46,7 @@ ALIAS_CMD: "Interact with aliases.", COMPARE_CMD: "Compare two genomes.", UPGRADE_CMD: "Upgrade config. This will alter the files on disk.", + POPULATE_CMD: "Populate registry paths with real paths." } ALIAS_GET_CMD = "get" diff --git a/tests/data/pop_test.txt b/tests/data/pop_test.txt new file mode 100644 index 00000000..10a07b2c --- /dev/null +++ b/tests/data/pop_test.txt @@ -0,0 +1,6 @@ +refgenie://t7/fasta +refgenie://mm10/fasta +blah +lorem ipsum --- ... 
+ +refgenie://hg38/fasta From 5fa2ddcd4a19ca28b1dc8b6793eb27a1e77bdee7 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 18 Mar 2021 15:49:46 -0400 Subject: [PATCH 03/44] update to populate method --- refgenie/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/refgenie/cli.py b/refgenie/cli.py index f6ec282d..f1cfcdd3 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -17,7 +17,6 @@ MissingAssetError, MissingGenomeError, DownloadJsonError, - populate_refgenie_refs, upgrade_config, __version__ as rgc_version, select_genome_config, @@ -372,12 +371,12 @@ def main(): if args.file: with open(args.file) as fp: for line in fp: - sys.stdout.write(populate_refgenie_refs(rgc, line)) + sys.stdout.write(rgc.populate_refgenie_refs(line)) else: for line in sys.stdin: if 'q' == line.rstrip(): break - sys.stdout.write(populate_refgenie_refs(rgc, line)) + sys.stdout.write(rgc.populate_refgenie_refs(line)) def perm_check_x(file_to_check, message_tag="genome directory"): From fdd4b835c082f0bab24996a8915457d65cdfae4a Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 19 Mar 2021 16:42:26 -0400 Subject: [PATCH 04/44] implement refgneie seekr --- refgenie/argparser.py | 8 +++++++- refgenie/cli.py | 18 ++++++++++++++++++ refgenie/const.py | 12 +++++++++++- requirements/requirements-all.txt | 2 +- requirements/requirements-dev.txt | 1 + setup.py | 1 + 6 files changed, 39 insertions(+), 3 deletions(-) diff --git a/refgenie/argparser.py b/refgenie/argparser.py index ff7df2c7..ea1ddd12 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -308,6 +308,7 @@ def add_subparser(cmd, msg, subparsers): for cmd in [ PULL_CMD, GET_ASSET_CMD, + GET_REMOTE_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, @@ -338,6 +339,7 @@ def add_subparser(cmd, msg, subparsers): for cmd in [ PULL_CMD, GET_ASSET_CMD, + GET_REMOTE_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, @@ -350,7 +352,11 @@ def add_subparser(cmd, msg, subparsers): type=str, nargs="+", help="One or 
more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag" - + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."), + + ( + " or hg38/fasta.fai:tag)." + if cmd in [GET_ASSET_CMD, GET_REMOTE_ASSET_CMD] + else ")." + ), ) sps[LIST_LOCAL_CMD].add_argument( diff --git a/refgenie/cli.py b/refgenie/cli.py index cb6a2b70..542796cf 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -167,6 +167,24 @@ def main(): ) return + elif args.command == GET_REMOTE_ASSET_CMD: + rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + for a in asset_list: + _LOGGER.debug( + "getting remote asset path: '{}/{}.{}:{}'".format( + a["genome"], a["asset"], a["seek_key"], a["tag"] + ) + ) + print( + rgc.seekr( + a["genome"], + a["asset"], + a["seek_key"], + a["tag"], + ) + ) + return + elif args.command == INSERT_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) diff --git a/refgenie/const.py b/refgenie/const.py index d8ec6600..20782b02 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -12,6 +12,7 @@ LIST_LOCAL_CMD = "list" LIST_REMOTE_CMD = "listr" GET_ASSET_CMD = "seek" +GET_REMOTE_ASSET_CMD = "seekr" INSERT_CMD = "add" REMOVE_CMD = "remove" GETSEQ_CMD = "getseq" @@ -26,7 +27,15 @@ GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD] # For each asset we assume a genome is also required -ASSET_REQUIRED = [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, TAG_CMD, ID_CMD] +ASSET_REQUIRED = [ + PULL_CMD, + GET_ASSET_CMD, + GET_REMOTE_ASSET_CMD, + BUILD_CMD, + INSERT_CMD, + TAG_CMD, + ID_CMD, +] SUBPARSER_MESSAGES = { INIT_CMD: "Initialize a genome configuration.", @@ -35,6 +44,7 @@ PULL_CMD: "Download assets.", BUILD_CMD: "Build genome assets.", GET_ASSET_CMD: "Get the path to a local asset.", + GET_REMOTE_ASSET_CMD: "Get the path to a remote asset.", INSERT_CMD: "Add local asset to the config file.", REMOVE_CMD: "Remove a local asset.", GETSEQ_CMD: "Get sequences from a genome.", diff --git 
a/requirements/requirements-all.txt b/requirements/requirements-all.txt index b7816e4c..221dcd7c 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ logmuse>=0.2.6 -refgenconf>=0.10.0 +# refgenconf>=0.11.0 piper>=0.12.1 pyfaidx>=0.5.5.2 yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index e69de29b..15afb46c 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -0,0 +1 @@ +https://github.com/refgenie/refgencionf/archive/dev.zip diff --git a/setup.py b/setup.py index f0c50dcf..458f24b7 100755 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Bio-Informatics", ], license="BSD2", From 50d47724edac7fdf77049b24ee0124931870dd9f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 19 Mar 2021 16:44:21 -0400 Subject: [PATCH 05/44] dev req format change --- requirements/requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 15afb46c..0eabb7aa 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1 +1 @@ -https://github.com/refgenie/refgencionf/archive/dev.zip +-e git+git://github.com/databio/refgenconf@dev#egg=refgenconf From 940685ddc264e24fc6811070bad921ec4e74321f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 09:15:32 -0400 Subject: [PATCH 06/44] add dev refgenconf to reqs --- refgenie/argparser.py | 8 ++------ refgenie/cli.py | 12 ++++++------ refgenie/const.py | 2 +- requirements/requirements-all.txt | 2 +- requirements/requirements-dev.txt | 1 + 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/refgenie/argparser.py 
b/refgenie/argparser.py index 36bf9dc4..92f1b1b6 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -482,11 +482,7 @@ def add_subparser(cmd, msg, subparsers): ) sps[POPULATE_CMD].add_argument( - "-f", - "--file", - metavar="F", - help="File with registry paths to populate") - - + "-f", "--file", metavar="F", help="File with registry paths to populate" + ) return parser diff --git a/refgenie/cli.py b/refgenie/cli.py index f1cfcdd3..21483ee5 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -363,8 +363,8 @@ def main(): elif args.command == POPULATE_CMD: _LOGGER.debug("Populating file: {}".format(args.file)) rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - # demo = {"genome": 'refgenie://hg19/fasta', - # "other_attr": "something", + # demo = {"genome": 'refgenie://hg19/fasta', + # "other_attr": "something", # "bt2": 'refgenie://t7/bwa_index'} # res = populate_refgenie_refs(rgc, demo) # print(res) @@ -372,11 +372,11 @@ def main(): with open(args.file) as fp: for line in fp: sys.stdout.write(rgc.populate_refgenie_refs(line)) - else: - for line in sys.stdin: - if 'q' == line.rstrip(): + else: + for line in sys.stdin: + if "q" == line.rstrip(): break - sys.stdout.write(rgc.populate_refgenie_refs(line)) + sys.stdout.write(rgc.populate_refgenie_refs(line)) def perm_check_x(file_to_check, message_tag="genome directory"): diff --git a/refgenie/const.py b/refgenie/const.py index 21e49507..12b61fc8 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -46,7 +46,7 @@ ALIAS_CMD: "Interact with aliases.", COMPARE_CMD: "Compare two genomes.", UPGRADE_CMD: "Upgrade config. This will alter the files on disk.", - POPULATE_CMD: "Populate registry paths with real paths." 
+ POPULATE_CMD: "Populate registry paths with real paths.", } ALIAS_GET_CMD = "get" diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index b7816e4c..221dcd7c 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,5 +1,5 @@ logmuse>=0.2.6 -refgenconf>=0.10.0 +# refgenconf>=0.11.0 piper>=0.12.1 pyfaidx>=0.5.5.2 yacman>=0.8.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index e69de29b..0ea736f7 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -0,0 +1 @@ +-e git+git://github.com/refgenie/refgenconf@dev#egg=refgenconf From 9aacacbad08df11740f0c7845a445d0df4069efd Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 09:20:05 -0400 Subject: [PATCH 07/44] edit action --- .github/workflows/build-package.yml | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index 3c4b7881..d1219d33 100644 --- a/.github/workflows/build-package.yml +++ b/.github/workflows/build-package.yml @@ -7,7 +7,7 @@ on: branches: [master, dev] jobs: - pytest: + build-package: runs-on: ${{ matrix.os }} strategy: matrix: @@ -25,17 +25,8 @@ jobs: - name: Install dev dependancies run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi - # - name: Install test dependancies - # run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + - name: Install test dependancies + run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - name: Install package run: python -m pip install . 
- - # - name: Run pytest tests - # run: pytest tests --remote-data --cov=./ --cov-report=xml - - # - name: Upload coverage to Codecov - # uses: codecov/codecov-action@v1 - # with: - # file: ./coverage.xml - # name: py-${{ matrix.python-version }}-${{ matrix.os }} From 1b370fa8e5e7aa7a5bfaaa2e7cd0eec2da7e7024 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 09:25:58 -0400 Subject: [PATCH 08/44] fix indent --- .github/workflows/build-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index d1219d33..ee558a08 100644 --- a/.github/workflows/build-package.yml +++ b/.github/workflows/build-package.yml @@ -25,8 +25,8 @@ jobs: - name: Install dev dependancies run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi - - name: Install test dependancies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + - name: Install test dependancies + run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - name: Install package run: python -m pip install . 
From f30c688ddbaf6bcf289de90acd08113aaa1dab65 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 09:39:53 -0400 Subject: [PATCH 09/44] update populate method name, more cli breaking statements --- refgenie/cli.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/refgenie/cli.py b/refgenie/cli.py index 3c7ee03f..6163262b 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -379,22 +379,17 @@ def main(): ) elif args.command == POPULATE_CMD: - _LOGGER.debug("Populating file: {}".format(args.file)) rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - # demo = {"genome": 'refgenie://hg19/fasta', - # "other_attr": "something", - # "bt2": 'refgenie://t7/bwa_index'} - # res = populate_refgenie_refs(rgc, demo) - # print(res) if args.file: + _LOGGER.debug(f"Populating file: {args.file}") with open(args.file) as fp: for line in fp: - sys.stdout.write(rgc.populate_refgenie_refs(line)) + sys.stdout.write(rgc.populate(line)) else: for line in sys.stdin: - if "q" == line.rstrip(): + if line.rstrip() in ["q", "quit", "exit"]: break - sys.stdout.write(rgc.populate_refgenie_refs(line)) + sys.stdout.write(rgc.populate(line)) def perm_check_x(file_to_check, message_tag="genome directory"): From e6c613680fbe7dfeb4a55a5a3cf5c814a833227f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 13:13:47 -0400 Subject: [PATCH 10/44] allow using remote features with no cfg --- refgenie/cli.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/refgenie/cli.py b/refgenie/cli.py index 6163262b..ab42a219 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -53,12 +53,11 @@ def main(): on_missing=lambda fp: fp, strict_env=True, ) - if gencfg is None: + if gencfg is None and args.command not in [GET_REMOTE_ASSET_CMD, LIST_REMOTE_CMD]: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) - skip_read_lock = 
_skip_lock(args.skip_read_lock, gencfg) - + skip_read_lock = True if gencfg is None else _skip_lock(args.skip_read_lock, gencfg) # From user input we want to construct a list of asset dicts, where each # asset has a genome name, asset name, and tag if "asset_registry_paths" in args and args.asset_registry_paths: From 99fdebf73525c60210fbf80ed6c790ab33397d2f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 13:33:51 -0400 Subject: [PATCH 11/44] enable non-persistent server subscriptions for seekr and listr --- refgenie/argparser.py | 18 ++++++++++++++++++ refgenie/cli.py | 5 ++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/refgenie/argparser.py b/refgenie/argparser.py index efc4f103..3ba88484 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -481,12 +481,30 @@ def add_subparser(cmd, msg, subparsers): "--genome-server", nargs="+", required=True, + metavar="S", help="One or more URLs to {action} the {key} attribute in config file.".format( action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY, ), ) + for cmd in [LIST_REMOTE_CMD, GET_REMOTE_ASSET_CMD]: + sps[cmd].add_argument( + "-s", + "--genome-server", + nargs="+", + required=False, + metavar="S", + help="One or more URLs to use. " + "This information will not persist in the genome config file." + ) + sps[cmd].add_argument( + "-p", + "--append-server", + action="store_true", + help="Whether the provided servers should be appended to the list." 
+ ) + sps[POPULATE_CMD].add_argument( "-f", "--file", metavar="F", help="File with registry paths to populate" ) diff --git a/refgenie/cli.py b/refgenie/cli.py index ab42a219..f26f136a 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -168,6 +168,8 @@ def main(): elif args.command == GET_REMOTE_ASSET_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + if args.genome_server is not None: + rgc.subscribe(urls=args.genome_server, reset=not args.append_server) for a in asset_list: _LOGGER.debug( "getting remote asset path: '{}/{}.{}:{}'".format( @@ -186,7 +188,6 @@ def main(): elif args.command == INSERT_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - if len(asset_list) > 1: raise NotImplementedError("Can only add 1 asset at a time") else: @@ -247,6 +248,8 @@ def main(): rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) console = Console() if args.command == LIST_REMOTE_CMD: + if args.genome_server is not None: + rgc.subscribe(urls=args.genome_server, reset=not args.append_server) num_servers = 0 bad_servers = [] for server_url in rgc[CFG_SERVERS_KEY]: From 4dfd54f722571e00edbcf90c4938e2856c07b926 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 13:46:14 -0400 Subject: [PATCH 12/44] use no_write in on-the-fly subscriptions --- refgenie/cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/refgenie/cli.py b/refgenie/cli.py index f26f136a..0712952f 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -169,7 +169,9 @@ def main(): elif args.command == GET_REMOTE_ASSET_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) if args.genome_server is not None: - rgc.subscribe(urls=args.genome_server, reset=not args.append_server) + rgc.subscribe( + urls=args.genome_server, reset=not args.append_server, no_write=True + ) for a in asset_list: _LOGGER.debug( "getting remote asset path: 
'{}/{}.{}:{}'".format( @@ -249,7 +251,9 @@ def main(): console = Console() if args.command == LIST_REMOTE_CMD: if args.genome_server is not None: - rgc.subscribe(urls=args.genome_server, reset=not args.append_server) + rgc.subscribe( + urls=args.genome_server, reset=not args.append_server, no_write=True + ) num_servers = 0 bad_servers = [] for server_url in rgc[CFG_SERVERS_KEY]: From 410f491772b69457155014536af42ee71f07c71e Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 22 Mar 2021 13:49:21 -0400 Subject: [PATCH 13/44] format --- refgenie/argparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/refgenie/argparser.py b/refgenie/argparser.py index 3ba88484..8e879dc5 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -496,13 +496,13 @@ def add_subparser(cmd, msg, subparsers): required=False, metavar="S", help="One or more URLs to use. " - "This information will not persist in the genome config file." + "This information will not persist in the genome config file.", ) sps[cmd].add_argument( "-p", "--append-server", action="store_true", - help="Whether the provided servers should be appended to the list." 
+ help="Whether the provided servers should be appended to the list.", ) sps[POPULATE_CMD].add_argument( From 300c389ad8c6c10cb24da8a0651e51b8a6d3c2d2 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 09:24:10 -0400 Subject: [PATCH 14/44] allow specifying remote-class via CLI --- .github/workflows/test-refgenie-cli.yml | 2 +- refgenie/argparser.py | 7 +++++++ refgenie/cli.py | 3 +-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index e535a0a5..be6b0aa8 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -107,7 +107,7 @@ jobs: run: | refgenie alias get -c genomes/g.yaml - - name: refgneie add asset + - name: refgenie add asset run: | refgenie add t7_another/test_asset -c genomes/g.yaml --path ../tests/data --seek-keys '{"recipe": "recipe_parent.json"}' ./tests/assert_in_file.sh genomes/g.yaml test_asset 0 diff --git a/refgenie/argparser.py b/refgenie/argparser.py index 8e879dc5..33c9371f 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -504,6 +504,13 @@ def add_subparser(cmd, msg, subparsers): action="store_true", help="Whether the provided servers should be appended to the list.", ) + sps[GET_REMOTE_ASSET_CMD].add_argument( + "-r", + "--remote-class", + metavar="RC", + type=str, + help="Remote data provider class, e.g. 
'html' or 's3'", + ) sps[POPULATE_CMD].add_argument( "-f", "--file", metavar="F", help="File with registry paths to populate" diff --git a/refgenie/cli.py b/refgenie/cli.py index 0712952f..f2855285 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -116,8 +116,6 @@ def main(): ) if args.genome_folder: entries.update({CFG_FOLDER_KEY: args.genome_folder}) - if args.remote_url_base: - entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base}) if args.genome_archive_folder: entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder}) if args.genome_archive_config: @@ -184,6 +182,7 @@ def main(): a["asset"], a["seek_key"], a["tag"], + args.remote_class, ) ) return From afaf7920482d66c01785456535bdec786811fc37 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 10:00:09 -0400 Subject: [PATCH 15/44] allow remote paths in refgenie populate; #234 --- refgenie/argparser.py | 21 ++++++++++++++------- refgenie/cli.py | 10 +++++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/refgenie/argparser.py b/refgenie/argparser.py index 33c9371f..a3dcc190 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -504,16 +504,23 @@ def add_subparser(cmd, msg, subparsers): action="store_true", help="Whether the provided servers should be appended to the list.", ) - sps[GET_REMOTE_ASSET_CMD].add_argument( - "-r", - "--remote-class", - metavar="RC", - type=str, - help="Remote data provider class, e.g. 'html' or 's3'", - ) + for cmd in [POPULATE_CMD, GET_REMOTE_ASSET_CMD]: + sps[cmd].add_argument( + "--remote-class", + metavar="RC", + type=str, + help="Remote data provider class, e.g. 
'html' or 's3'", + ) sps[POPULATE_CMD].add_argument( "-f", "--file", metavar="F", help="File with registry paths to populate" ) + sps[POPULATE_CMD].add_argument( + "-r", + "--remote", + action="store_true", + help="Whether to populate using remote data paths", + ) + return parser diff --git a/refgenie/cli.py b/refgenie/cli.py index f2855285..7b6ee992 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -26,6 +26,7 @@ from collections import OrderedDict from rich.console import Console +from functools import partial def main(): @@ -180,8 +181,8 @@ def main(): rgc.seekr( a["genome"], a["asset"], - a["seek_key"], a["tag"], + a["seek_key"], args.remote_class, ) ) @@ -385,16 +386,19 @@ def main(): elif args.command == POPULATE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + pop_func = partial( + rgc.populate, remote=args.remote, remote_class=args.remote_class + ) if args.file: _LOGGER.debug(f"Populating file: {args.file}") with open(args.file) as fp: for line in fp: - sys.stdout.write(rgc.populate(line)) + sys.stdout.write(pop_func(glob=line)) else: for line in sys.stdin: if line.rstrip() in ["q", "quit", "exit"]: break - sys.stdout.write(rgc.populate(line)) + sys.stdout.write(pop_func(glob=line)) def perm_check_x(file_to_check, message_tag="genome directory"): From 718168f098743e05d2740fdae3880a4cc04bf50d Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 12:03:13 -0400 Subject: [PATCH 16/44] add refgenie populater subcommand; #234 --- refgenie/__init__.py | 3 +- refgenie/__main__.py | 3 +- refgenie/add_assets_igenome.py | 19 ++++---- refgenie/argparser.py | 27 +++++------- refgenie/build_all_genome.py | 8 ++-- refgenie/cli.py | 79 +++++++++++++++++++--------------- refgenie/const.py | 5 ++- refgenie/refgenie.py | 27 ++++++------ refgenie/refget.py | 5 ++- 9 files changed, 93 insertions(+), 83 deletions(-) diff --git a/refgenie/__init__.py b/refgenie/__init__.py index 6e0e9f4e..7fa2296c 100644 --- 
a/refgenie/__init__.py +++ b/refgenie/__init__.py @@ -1,4 +1,5 @@ -from ._version import __version__ import logmuse +from ._version import __version__ + logmuse.init_logger("refgenie") diff --git a/refgenie/__main__.py b/refgenie/__main__.py index 1fa5c244..0edeef18 100644 --- a/refgenie/__main__.py +++ b/refgenie/__main__.py @@ -1,6 +1,7 @@ -from .cli import main import sys +from .cli import main + if __name__ == "__main__": try: sys.exit(main()) diff --git a/refgenie/add_assets_igenome.py b/refgenie/add_assets_igenome.py index c09a2bb2..3239fa43 100644 --- a/refgenie/add_assets_igenome.py +++ b/refgenie/add_assets_igenome.py @@ -6,21 +6,20 @@ Build/ Annotation/ Sequence/ """ -from .refgenie import _seek -from .exceptions import MissingGenomeConfigError - -from ubiquerg import untar, mkabs, query_yes_no +import argparse +import os +import sys +import tarfile +from glob import glob +from shutil import move import refgenconf from refgenconf import get_dir_digest from refgenconf.const import * -from glob import glob +from ubiquerg import mkabs, query_yes_no, untar -import os -import argparse -import sys -import tarfile -from shutil import move +from .exceptions import MissingGenomeConfigError +from .refgenie import _seek def build_argparser(): diff --git a/refgenie/argparser.py b/refgenie/argparser.py index a3dcc190..6529ac3e 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -1,12 +1,11 @@ -import pypiper +from argparse import HelpFormatter +import pypiper +from refgenconf import __version__ as rgc_version from ubiquerg import VersionInHelpParser from ._version import __version__ from .const import * -from refgenconf import __version__ as rgc_version - -from argparse import HelpFormatter def build_argparser(): @@ -488,7 +487,7 @@ def add_subparser(cmd, msg, subparsers): ), ) - for cmd in [LIST_REMOTE_CMD, GET_REMOTE_ASSET_CMD]: + for cmd in [LIST_REMOTE_CMD, GET_REMOTE_ASSET_CMD, POPULATE_REMOTE_CMD]: sps[cmd].add_argument( "-s", "--genome-server", @@ 
-504,23 +503,19 @@ def add_subparser(cmd, msg, subparsers): action="store_true", help="Whether the provided servers should be appended to the list.", ) - for cmd in [POPULATE_CMD, GET_REMOTE_ASSET_CMD]: + + for cmd in [POPULATE_REMOTE_CMD, GET_REMOTE_ASSET_CMD]: sps[cmd].add_argument( "--remote-class", metavar="RC", type=str, + default="http", help="Remote data provider class, e.g. 'html' or 's3'", ) - sps[POPULATE_CMD].add_argument( - "-f", "--file", metavar="F", help="File with registry paths to populate" - ) - - sps[POPULATE_CMD].add_argument( - "-r", - "--remote", - action="store_true", - help="Whether to populate using remote data paths", - ) + for cmd in [POPULATE_REMOTE_CMD, POPULATE_CMD]: + sps[cmd].add_argument( + "-f", "--file", metavar="F", help="File with registry paths to populate" + ) return parser diff --git a/refgenie/build_all_genome.py b/refgenie/build_all_genome.py index 98cf968c..2def4255 100644 --- a/refgenie/build_all_genome.py +++ b/refgenie/build_all_genome.py @@ -2,11 +2,13 @@ A helper script to create SLURM submission scripts for all the assets defined in asset_build_packages for a given genome """ -from .asset_build_packages import asset_build_packages -from ubiquerg import expandpath -import os import argparse +import os + import divvy +from ubiquerg import expandpath + +from .asset_build_packages import asset_build_packages parser = argparse.ArgumentParser( description="Builds submission scripts for all assets for a genome" diff --git a/refgenie/cli.py b/refgenie/cli.py index 7b6ee992..4518948f 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -1,32 +1,29 @@ -import logmuse -import sys import json import os +import sys +from collections import OrderedDict +from functools import partial -from .argparser import build_argparser -from .refgenie import parse_registry_path, _skip_lock -from ._version import __version__ -from .const import * -from .exceptions import * -from .asset_build_packages import * -from .refgenie import 
refgenie_build -from .helpers import _raise_missing_recipe_error, _single_folder_writeable - +import logmuse from refgenconf import ( - RefGenConf, + DownloadJsonError, MissingAssetError, MissingGenomeError, - DownloadJsonError, - upgrade_config, - __version__ as rgc_version, - select_genome_config, + RefGenConf, ) -from ubiquerg import query_yes_no +from refgenconf import __version__ as rgc_version +from refgenconf import select_genome_config, upgrade_config from requests.exceptions import MissingSchema - -from collections import OrderedDict from rich.console import Console -from functools import partial +from ubiquerg import query_yes_no + +from ._version import __version__ +from .argparser import build_argparser +from .asset_build_packages import * +from .const import * +from .exceptions import * +from .helpers import _raise_missing_recipe_error, _single_folder_writeable +from .refgenie import _skip_lock, parse_registry_path, refgenie_build def main(): @@ -54,7 +51,11 @@ def main(): on_missing=lambda fp: fp, strict_env=True, ) - if gencfg is None and args.command not in [GET_REMOTE_ASSET_CMD, LIST_REMOTE_CMD]: + if gencfg is None and args.command not in [ + GET_REMOTE_ASSET_CMD, + LIST_REMOTE_CMD, + POPULATE_REMOTE_CMD, + ]: raise MissingGenomeConfigError(args.genome_config) _LOGGER.debug("Determined genome config: {}".format(gencfg)) @@ -386,19 +387,29 @@ def main(): elif args.command == POPULATE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - pop_func = partial( - rgc.populate, remote=args.remote, remote_class=args.remote_class - ) - if args.file: - _LOGGER.debug(f"Populating file: {args.file}") - with open(args.file) as fp: - for line in fp: - sys.stdout.write(pop_func(glob=line)) - else: - for line in sys.stdin: - if line.rstrip() in ["q", "quit", "exit"]: - break - sys.stdout.write(pop_func(glob=line)) + process_populate(args, rgc.populate) + + elif args.command == POPULATE_REMOTE_CMD: + rgc = 
RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) + if args.genome_server is not None: + rgc.subscribe( + urls=args.genome_server, reset=not args.append_server, no_write=True + ) + pop_fun = partial(rgc.populater, remote_class=args.remote_class) + process_populate(args, pop_fun) + + +def process_populate(args, pop_fun): + if args.file: + _LOGGER.debug(f"Populating file: {args.file}") + with open(args.file) as fp: + for line in fp: + sys.stdout.write(pop_fun(glob=line)) + else: + for line in sys.stdin: + if line.rstrip() in ["q", "quit", "exit"]: + break + sys.stdout.write(pop_fun(glob=line)) def perm_check_x(file_to_check, message_tag="genome directory"): diff --git a/refgenie/const.py b/refgenie/const.py index a500c92e..3bbdc8a4 100644 --- a/refgenie/const.py +++ b/refgenie/const.py @@ -10,6 +10,7 @@ INIT_CMD = "init" PULL_CMD = "pull" POPULATE_CMD = "populate" +POPULATE_REMOTE_CMD = "populater" LIST_LOCAL_CMD = "list" LIST_REMOTE_CMD = "listr" GET_ASSET_CMD = "seek" @@ -56,9 +57,11 @@ ALIAS_CMD: "Interact with aliases.", COMPARE_CMD: "Compare two genomes.", UPGRADE_CMD: "Upgrade config. 
This will alter the files on disk.", - POPULATE_CMD: "Populate registry paths with real paths.", + POPULATE_CMD: "Populate registry paths with local paths.", + POPULATE_REMOTE_CMD: "Populate registry paths with remote paths.", } + ALIAS_GET_CMD = "get" ALIAS_SET_CMD = "set" ALIAS_REMOVE_CMD = "remove" diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index 9f62caa1..f1b38f7e 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -1,29 +1,26 @@ -import os -import sys import csv -import signal import json +import os +import signal +import sys +from logging import getLogger + +import pypiper +import refgenconf +from refgenconf import RefGenConf, get_dir_digest +from ubiquerg import parse_registry_path as prp +from ubiquerg.system import is_writable +from yacman import UndefinedAliasError from .asset_build_packages import * from .const import * from .helpers import ( + _parse_user_build_input, _raise_missing_recipe_error, _skip_lock, - _parse_user_build_input, _writeable, ) -import pypiper -import refgenconf -from refgenconf import ( - RefGenConf, - get_dir_digest, -) -from ubiquerg import parse_registry_path as prp -from ubiquerg.system import is_writable -from yacman import UndefinedAliasError -from logging import getLogger - _LOGGER = getLogger(PKG_NAME) diff --git a/refgenie/refget.py b/refgenie/refget.py index b0861260..f81633e2 100644 --- a/refgenie/refget.py +++ b/refgenie/refget.py @@ -1,11 +1,12 @@ # TO be imported from refget package when it is finished # from refget import fasta_checksum -import hashlib import binascii -import pyfaidx +import hashlib import os +import pyfaidx + def trunc512_digest(seq, offset=24): digest = hashlib.sha512(seq.encode()).digest() From d245197bcab5997d2f2917cc2a48215f1358d4f6 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 12:36:57 -0400 Subject: [PATCH 17/44] document process_populate --- refgenie/argparser.py | 2 +- refgenie/cli.py | 20 ++++++++++++++------ 2 files changed, 15 
insertions(+), 7 deletions(-) diff --git a/refgenie/argparser.py b/refgenie/argparser.py index 6529ac3e..8bb64ac2 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -510,7 +510,7 @@ def add_subparser(cmd, msg, subparsers): metavar="RC", type=str, default="http", - help="Remote data provider class, e.g. 'html' or 's3'", + help="Remote data provider class, e.g. 'http' or 's3'", ) for cmd in [POPULATE_REMOTE_CMD, POPULATE_CMD]: diff --git a/refgenie/cli.py b/refgenie/cli.py index 4518948f..d7aa6233 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -387,7 +387,7 @@ def main(): elif args.command == POPULATE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) - process_populate(args, rgc.populate) + process_populate(pop_fun=rgc.populate, file_path=args.file) elif args.command == POPULATE_REMOTE_CMD: rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock) @@ -396,13 +396,21 @@ def main(): urls=args.genome_server, reset=not args.append_server, no_write=True ) pop_fun = partial(rgc.populater, remote_class=args.remote_class) - process_populate(args, pop_fun) + process_populate(pop_fun=pop_fun, file_path=args.file) -def process_populate(args, pop_fun): - if args.file: - _LOGGER.debug(f"Populating file: {args.file}") - with open(args.file) as fp: +def process_populate(pop_fun, file_path=None): + """ + Process a populate request (file or stdin) with a custom populator function + + :param callable(dict | str | list) -> dict | str | list: a function that populates + refgenie registry paths in objects + :param str file_path: path to the file to populate refgenie registry paths in, + skip for stdin processing + """ + if file_path is not None: + _LOGGER.debug(f"Populating file: {file_path}") + with open(file_path) as fp: for line in fp: sys.stdout.write(pop_fun(glob=line)) else: From 6651d2314a5659f4423e9e05c17a38ad366cf638 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 16:13:26 
-0400 Subject: [PATCH 18/44] numerous docs updates --- docs/README.md | 50 ++++++-- docs/autodoc_build/refgenconf.md | 198 +++++++++++++++++-------------- docs/demo_videos.md | 27 +++++ docs/manuscripts.md | 10 ++ docs/remote.md | 112 +++++++++++++++++ docs/seek.md | 4 +- docs/seekr.md | 17 +++ docs/usage.md | 90 ++++++++++++-- mkdocs.yml | 7 +- refgenie/cli.py | 4 +- setup.py | 3 +- update-usage-docs.sh | 2 +- 12 files changed, 408 insertions(+), 116 deletions(-) create mode 100644 docs/demo_videos.md create mode 100644 docs/manuscripts.md create mode 100644 docs/remote.md create mode 100644 docs/seekr.md diff --git a/docs/README.md b/docs/README.md index 9b4a5960..8db0065e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,24 +17,47 @@ Refgenie manages storage, access, and transfer of reference genome resources. It 2. **It's scripted**. In case you need resources *not* on the server, such as for a custom genome, you can `build` your own: `refgenie build custom_genome/bowtie2_index`. 3. **It simplifies finding local asset locations**. When you need a path to an asset, you can `seek` it, making your pipelines portable across computing environments: `refgenie seek hg38/salmon_index`. + +4. **It provides remote operation mode**, useful for cloud applications. Get a path to an asset file hosted on AWS S3: `refgenie seekr hg38/fasta --remote-class s3`. -4. **It includes a python API**. For tool developers, you use `rgc = refgenconf.RefGenConf("genomes.yaml")` to get a Python object with paths to any genome asset, *e.g.*, `rgc.seek("hg38", "kallisto_index")`. - -5. **It strictly determines genomes compatibility**. Users refer to genomes with arbitrary aliases, like "hg38", but refgenie uses sequence-derived identifiers to verify genome identity with asset servers. +5. **It includes a Python API**. 
For tool developers, you use `rgc = refgenconf.RefGenConf("genomes.yaml")` to get a Python object with paths to any genome asset, *e.g.*, `rgc.seek("hg38", "kallisto_index")`. +6. **It strictly determines genomes compatibility**. Users refer to genomes with arbitrary aliases, like "hg38", but refgenie uses sequence-derived identifiers to verify genome identity with asset servers. ## Quick example -### Install and initialize +### Install -Refgenie keeps track of what's available using a configuration file initialized by `refgenie init`: +Refgenie is a Python package; install it from [PyPI](https://pypi.org/project/refgenie/): ```console pip install --user refgenie +``` + +Or [conda](https://anaconda.org/bioconda/refgenie): + +```console +conda install refgenie +``` + +And that's it! If you wish to use refgenie in *remote mode*, see [further reading on remote mode in refgenie](remote.md). + +If you're connected to the Internet, call a test command, e.g.: + +```console +refgenie seekr hg38/fasta +``` + +### Initialize to use refgenie locally + +Refgenie keeps track of what's available using a configuration file initialized by `refgenie init`: + +```console export REFGENIE='genome_config.yaml' refgenie init -c $REFGENIE ``` + ### Download indexes and assets for a remote reference genome Use `refgenie pull` to download pre-built assets from a remote server. View available remote assets with `listr`: @@ -87,7 +110,7 @@ refgenie build mygenome/bwa_index See [further reading on building assets](build.md). -### Retrieve paths to refgenie-managed assets +### Retrieve paths to *local* refgenie-managed assets Once you've populated your refgenie with a few assets, use `seek` to retrieve their local file paths: @@ -97,4 +120,17 @@ refgenie seek mm10/bowtie2_index This will return the path to the particular asset of interest, regardless of your computing environment. This gives you an ultra-portable asset manager! See [further reading on retrieving asset paths](seek.md).
-If you want to read more about the motivation behind refgenie and the software engineering that makes refgenie work, proceed next to the [overview](overview.md). +### Retrieve paths to *remote* refgenie-managed assets + +Use `seekr` (short for "seek remote") to retrieve remote `seek_key` targets: + +```console +refgenie seekr mm10/fasta.fai +``` + +This will return the path to the particular remote file of interest, here: FASTA index file, which is a part of `mm10/fasta` asset. + +See [further reading on seeking remote asset files](seekr.md). + +--- +If you want to read more about the motivation behind refgenie and the software engineering that makes refgenie work, proceed next to the [overview](overview.md). \ No newline at end of file diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index 5fca1068..a220a977 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -29,58 +29,6 @@ h4 .content { # Package `refgenconf` Documentation -## Class `ConfigNotCompliantError` -The format of the config file does not match required version/standards - - -## Class `DownloadJsonError` -Non-OK response from a JSON download attempt - - -```python -def __init__(self, resp) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -## Class `GenomeConfigFormatError` -Exception for invalid genome config file format. - - -```python -def __init__(self, msg) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -## Class `MissingAssetError` -Error type for request of an unavailable genome asset. - - -## Class `MissingConfigDataError` -Missing required configuration instance items - - -## Class `MissingGenomeError` -Error type for request of unknown genome/assembly. - - -## Class `MissingRecipeError` -Error type for request of an unavailable recipe. - - -## Class `MissingSeekKeyError` -Error type for request of an unavailable asset seek key. 
- - -## Class `MissingTagError` -Error type for request of an unavailable asset tag. - - ## Class `RefGenConf` A sort of oracle of available reference genome assembly assets @@ -376,7 +324,7 @@ Get path to the Annotated Sequence Digests JSON file for a given genome. Note th ```python -def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fa582ccad08>) +def get_asset_table(self, genomes=None, server_url=None, get_json_url= at 0x7fac824c4e18>) ``` Get a rich.Table object representing assets available locally @@ -497,13 +445,13 @@ List locally available reference genome IDs and assets by ID. ```python -def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fa582cd4840>) +def get_remote_data_str(self, genome=None, order=None, get_url= at 0x7fac824cca60>) ``` List genomes and assets available remotely. #### Parameters: -- `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance +- `get_url` (`function(serverUrl, operationId) -> str`): how to determineURL request, given server URL and endpoint operationID - `genome` (`list[str] | str`): genomes that the assets should be found for - `order` (`function(str) -> object`): how to key genome IDs and assetnames for sort @@ -694,13 +642,13 @@ List assemblies for which a particular asset is available. 
```python -def listr(self, genome=None, order=None, get_url= at 0x7fa582cd4950>, as_str=False) +def listr(self, genome=None, get_url= at 0x7fac824ccb70>, as_digests=False) ``` List genomes and assets available remotely on all servers the object subscribes to #### Parameters: -- `get_url` (`function(refgenconf.RefGenConf) -> str`): how to determineURL request, given RefGenConf instance +- `get_url` (`function(serverUrl, operationId) -> str`): how to determineURL request, given server URL and endpoint operationID - `genome` (`list[str] | str`): genomes that the assets should be found for - `order` (`function(str) -> object`): how to key genome IDs and assetnames for sort @@ -725,7 +673,42 @@ Plugins registered by entry points in the current Python env ```python -def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fa582cd4bf8>, build_signal_handler=) +def populate(self, glob) +``` + +Populates *local* refgenie references from refgenie://genome/asset.seek_key:tag registry paths +#### Parameters: + +- `glob` (`dict | str | list`): String which may contain refgenie registry paths asvalues; or a dict, for which values may contain refgenie registry paths. Dict include nested dicts. + + +#### Returns: + +- `dict | str | list`: modified input dict with refgenie paths populated + + + + +```python +def populater(self, glob, remote_class=None) +``` + +Populates *remote* refgenie references from refgenie://genome/asset:tag registry paths +#### Parameters: + +- `glob` (`dict | str | list`): String which may contain refgenie registry paths asvalues; or a dict, for which values may contain refgenie registry paths. Dict include nested dicts. +- `remote_class` (`str`): remote data provider class, e.g. 
'http' or 's3' + + +#### Returns: + +- `dict | str | list`: modified input dict with refgenie paths populated + + + + +```python +def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, get_json_url= at 0x7fac824cce18>, build_signal_handler=) ``` Download and possibly unpack one or more assets for a given ref gen. @@ -832,7 +815,7 @@ Runs all installed plugins for the specified hook. ```python -def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fa582cd4378>) +def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, all_aliases=False, check_exist= at 0x7fac824cc488>) ``` Seek path to a specified genome-asset-tag alias @@ -863,7 +846,7 @@ Seek path to a specified genome-asset-tag alias ```python -def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fa582cd4488>) +def seek_src(self, genome_name, asset_name, tag_name=None, seek_key=None, strict_exists=None, enclosing_dir=False, check_exist= at 0x7fac824cc6a8>) ``` Seek path to a specified genome-asset-tag @@ -892,6 +875,28 @@ Seek path to a specified genome-asset-tag +```python +def seekr(self, genome_name, asset_name, tag_name=None, seek_key=None, remote_class='html', get_url= at 0x7fac824cc598>) +``` + +Seek a remote path to a specified genome/asset.seek_key:tag +#### Parameters: + +- `genome_name` (`str`): name of a reference genome assembly of interest +- `asset_name` (`str`): name of the particular asset to fetch +- `tag_name` (`str`): name of the particular asset tag to fetch +- `seek_key` (`str`): name of the particular subasset to fetch +- `remote_class` (`str`): remote data provider class, e.g. 
'html' or 's3' +- `get_url` (`function(serverUrl, operationId) -> str`): how to determineURL request, given server URL and endpoint operationID + + +#### Returns: + +- `str`: path to the asset + + + + ```python def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None) ``` @@ -909,7 +914,7 @@ Point to the selected tag by default ```python -def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fa582cd4ea0>) +def set_genome_alias(self, genome, digest=None, servers=None, overwrite=False, reset_digest=False, create_genome=False, no_write=False, get_json_url= at 0x7fac824cd158>) ``` Assign a human-readable alias to a genome identifier. @@ -934,7 +939,7 @@ human-readable identifier to make referring to the genomes easier. ```python -def subscribe(self, urls, reset=False) +def subscribe(self, urls, reset=False, no_write=False) ``` Add URLs the list of genome_servers. @@ -983,7 +988,7 @@ genome configuration file changes ```python -def unsubscribe(self, urls) +def unsubscribe(self, urls, no_write=False) ``` Remove URLs the list of genome_servers. @@ -1132,78 +1137,87 @@ Write the contents to a file. If pre- and post-update plugins are defined, they -## Class `RefgenconfError` -Base exception type for this package - - -## Class `RemoteDigestMismatchError` -Remote digest of the parent asset does not match its local counterpart +## Class `GenomeConfigFormatError` +Exception for invalid genome config file format. ```python -def __init__(self, asset, local_digest, remote_digest) +def __init__(self, msg) ``` Initialize self. See help(type(self)) for accurate signature. +## Class `MissingAssetError` +Error type for request of an unavailable genome asset. + + +## Class `MissingConfigDataError` +Missing required configuration instance items + + +## Class `MissingGenomeError` +Error type for request of unknown genome/assembly. 
+ + +## Class `RefgenconfError` +Base exception type for this package + + ## Class `UnboundEnvironmentVariablesError` Use of environment variable that isn't bound to a value. ```python -def get_dir_digest(path, pm=None) +def select_genome_config(filename=None, conf_env_vars=['REFGENIE'], **kwargs) ``` -Generate a MD5 digest that reflects just the contents of the files in the selected directory. +Get path to genome configuration file. #### Parameters: -- `path` (`str`): path to the directory to digest -- `pm` (`pypiper.PipelineManager`): a pipeline object, optional.The subprocess module will be used if not provided +- `filename` (`str`): name/path of genome configuration file +- `conf_env_vars` (`Iterable[str]`): names of environment variables toconsider; basically, a prioritized search list #### Returns: -- `str`: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 +- `str`: path to genome configuration file ```python -def select_genome_config(filename=None, conf_env_vars=['REFGENIE'], **kwargs) +def get_dir_digest(path, pm=None) ``` -Get path to genome configuration file. +Generate a MD5 digest that reflects just the contents of the files in the selected directory. #### Parameters: -- `filename` (`str`): name/path of genome configuration file -- `conf_env_vars` (`Iterable[str]`): names of environment variables toconsider; basically, a prioritized search list +- `path` (`str`): path to the directory to digest +- `pm` (`pypiper.PipelineManager`): a pipeline object, optional.The subprocess module will be used if not provided #### Returns: -- `str`: path to genome configuration file +- `str`: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 ```python -def upgrade_config(target_version, filepath, force=False, get_json_url= at 0x7fa582cca730>, link_fun= at 0x7fa582cd8158>) +def looper_refgenie_plugin(namespaces) ``` -Upgrade the config to a selected target version. - -Convert the config file to target_version format, update file structure -inside genome_folder. 
Drop genomes for which genome_digest is not available -on any of the servers and do not have a fasta asset locally. +A looper plugin that populates refgenie references in a PEP from refgenie://genome/asset:tag registry paths. This can be used to convert all refgenie references into their local paths at the looper stage, so the final paths are passed to the workflow. This way the workflow does not need to depend on refgenie to resolve the paths. This is useful for example for CWL pipelines, which are built to have paths resolved outside the workflow. #### Parameters: -- `target_version` (`str`): the version updated to -- `filepath` (`str`): path to config file -- `force` (`bool`): whether the upgrade should be confirmed upfront -- `get_json_url` (`function(str, str) -> str`): how to build URL fromgenome server URL base, genome, and asset -- `link_fun` (`callable`): function to use to link files, e.g os.symlink or os.link +- `namespaces` (`dict`): variable namespaces dict + + +#### Returns: + +- `dict`: sample namespace dict @@ -1211,4 +1225,4 @@ on any of the servers and do not have a fasta asset locally. -*Version Information: `refgenconf` v0.10.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file +*Version Information: `refgenconf` v0.11.0-dev, generated by `lucidoc` v0.4.3* \ No newline at end of file diff --git a/docs/demo_videos.md b/docs/demo_videos.md new file mode 100644 index 00000000..7cc97111 --- /dev/null +++ b/docs/demo_videos.md @@ -0,0 +1,27 @@ +# Demo videos - start working with refgenie + +Check out these demonstration videos, which show how to quickly start working with refgenie. + +After refgenie Python package is installed with `pip install refgenie`, refgenie can be used with or without genome configuration file. + +## Without config - remote mode + +If used *without* genome configuration file, the data does not persist from session to session. It's really useful in cloud applications. 
+ +**hint**: remote commands end with `r`: + +- `refgenie listr` +- `refgenie seekr` +- `refgenie populater` + +**!!! Record a new asciicast and insert here** + + + +## With config - local mode + +If used *with* genome configuration file, the data persists from session to session. That's the most common scenario: you can `build`, `pull` and `seek` for assets managed locally. + +**!!! Record a new asciicast and insert here** + + \ No newline at end of file diff --git a/docs/manuscripts.md b/docs/manuscripts.md new file mode 100644 index 00000000..1aa3a72b --- /dev/null +++ b/docs/manuscripts.md @@ -0,0 +1,10 @@ +# Refgenie manuscripts + +- [Stolarczyk et al. (2020) *Refgenie: a reference genome resource manager*](https://doi.org/10.1093/gigascience/giz149): introductory publication +- [Stolarczyk et al. (2021) *Identity and compatibility of reference genome resources*](https://www.biorxiv.org/content/10.1101/2021.03.15.435425v1): sequence-derived genome identifiers, data provenance tracking + +# Manuscripts using refgenie + +Did you use refgenie in your research? [Add your manuscript to this list!](https://github.com/refgenie/refgenie/edit/master/docs/manuscripts.md) + +- [VijayKrishna et al. (2020) *Expanding the Galaxy’s reference data*](https://www.biorxiv.org/content/10.1101/2020.10.09.327114v1): refgenie-Galaxy integration \ No newline at end of file diff --git a/docs/remote.md b/docs/remote.md new file mode 100644 index 00000000..2b7db9b0 --- /dev/null +++ b/docs/remote.md @@ -0,0 +1,112 @@ +# Remote mode in refgenie + +Starting with version 0.11.0, refgenie can be used in *remote mode*, which means that in some cases the genome configuration file is not required. **Therefore, you can skip `refgenie init` and start working with refgenie right after installation!** + +## Commands available in remote mode + +*Hint: all of these commands end with "r"* + +There are a few commands that do not require genome configuration file to run.
`-s/--genome-servers` argument specifies the list of servers you want refgenie to query. Default server (http://refgenomes.databio.org) is used if not provided. + +### List remote assets with `refgenie listr` + +You can list assets available on remote servers with `refgenie listr`. + +```console +~ refgenie listr -s http://rg.databio.org +Using default config. No config found in env var: ['REFGENIE'] +Subscribed to: http://rg.databio.org + + Remote refgenie assets + Server URL: http://rg.databio.org +┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ genome ┃ assets ┃ +┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ mouse_chrM2x │ fasta, bowtie2_index, bwa_index │ +│ rCRSd │ fasta, bowtie2_index │ +│ human_repeats │ fasta, hisat2_index, bwa_index │ +│ hg38 │ fasta, bowtie2_index │ +└─────────────────────┴──────────────────────────────────────────────┘ + use refgenie listr -g for more detailed view +``` + +### Find remote asset paths with `refgenie seekr` + +You can seek for remote asset paths with `refgenie seekr`: + +```console +~ refgenie seekr hg38/fasta -s http://rg.databio.org --remote-class s3 + +Using default config. No config found in env var: ['REFGENIE'] +Subscribed to: http://rg.databio.org +No local digest for genome alias: hg38 +Setting 'hg38' identity with server: http://rg.databio.org/v3/genomes/genome_digest/hg38 +Determined server digest for local genome alias (hg38): 2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 +Set genome alias (2230c535660fb4774114bfa966a62f823fdb6d21acf138d4: hg38) +s3://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa +``` + +`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: http://rg.databio.org/remotes/dict. 
+ +### Replace asset registry paths with remote asset paths with `refgenie populater` + +You can replace refgenie asset registry paths in text of files with `refgenie populater`. Any string that matches the following format will be replaced with a remote path: + +``` +refgenie://genome_alias/asset.seek_key:tag +``` + + +#### populate text from standard input + +```console +~ echo 'test remote populating refgenie://hg38/fasta.fasta:default' | refgenie populater hg38/fasta -s http://rg.databio.org --remote-class s3 + +Using default config. No config found in env var: ['REFGENIE'] +Subscribed to: http://rg.databio.org +No local digest for genome alias: hg38 +Setting 'hg38' identity with server: http://rg.databio.org/v3/genomes/genome_digest/hg38 +Determined server digest for local genome alias (hg38): 2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 +Set genome alias (2230c535660fb4774114bfa966a62f823fdb6d21acf138d4: hg38) +test remote populating s3://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa:default +``` + +`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: http://rg.databio.org/remotes/dict. + +#### populate text from file + +1. check input file contents +```console +~ cat remote_populate_test.txt + +human genome FASTA file: refgenie://hg38/fasta.fasta:default +yeast doubled genome FASTA file: refgenie://rCRSd/fasta.fasta:default +``` + +2. run `refgenie populater` +```console +~ refgenie populater -f remote_populate_test.txt -s http://rg.databio.org > remote_populate_test_output.txt + +Using default config. 
No config found in env var: ['REFGENIE'] +Subscribed to: http://rg.databio.org +No local digest for genome alias: hg38 +Setting 'hg38' identity with server: http://rg.databio.org/v3/genomes/genome_digest/hg38 +Determined server digest for local genome alias (hg38): 2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 +Set genome alias (2230c535660fb4774114bfa966a62f823fdb6d21acf138d4: hg38) +No local digest for genome alias: rCRSd +Setting 'rCRSd' identity with server: http://rg.databio.org/v3/genomes/genome_digest/rCRSd +Determined server digest for local genome alias (rCRSd): 94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 +Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd) +``` + +3. check output file contents +```console +~ cat remote_populate_test_output.txt + +human genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa:default +yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta__default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa:default +``` + +## Motivation + +... \ No newline at end of file diff --git a/docs/seek.md b/docs/seek.md index abcb5bd8..dda9f5fb 100644 --- a/docs/seek.md +++ b/docs/seek.md @@ -1,9 +1,9 @@ -# Retrieve paths to assets +# Retrieve paths to *local* assets Once you've assembled a few assets, either by downloading or by building them, you'll be able to use `refgenie seek` to retrieve the paths. It's quite simple, really -- say you've built the `bowtie2_index` and `fasta` assets for `hg38`. 
If you type: ```console -refgenie seek -c CONFIG.yaml hg38/bowtie2_index +refgenie seek hg38/bowtie2_index ``` You'll get back the absolute path on your system to the `bowtie2_index` asset, something like: diff --git a/docs/seekr.md b/docs/seekr.md new file mode 100644 index 00000000..44a15f02 --- /dev/null +++ b/docs/seekr.md @@ -0,0 +1,17 @@ +# Retrieve paths to *remote* assets + +**Please read the documentation for [`refgenie seek`](seek.md) command first to learn more about the basic concepts of asset paths retrieval.** + +## Motivation + +The implementation of `refgenie seekr` was motivated by cloud computing needs. + +## Examples + +```console +refgenie seekr hg38/fasta +``` + +```console +refgenie seekr mm10/bowtie2_index:v0.4 +``` \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index d93d1f5c..dd85d996 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,21 +2,22 @@ ## `refgenie --help` ```console -version: 0.10.0 | refgenconf 0.10.0 +version: 0.10.0 | refgenconf 0.11.0-dev usage: refgenie [-h] [--version] [--silent] [--verbosity V] [--logdev] - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} + {init,list,listr,pull,build,seek,seekr,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade,populate,populater} ... refgenie - reference genome asset manager positional arguments: - {init,list,listr,pull,build,seek,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade} + {init,list,listr,pull,build,seek,seekr,add,remove,getseq,tag,id,subscribe,unsubscribe,alias,compare,upgrade,populate,populater} init Initialize a genome configuration. list List available local assets. listr List available remote assets. pull Download assets. build Build genome assets. seek Get the path to a local asset. + seekr Get the path to a remote asset. add Add local asset to the config file. remove Remove a local asset. getseq Get sequences from a genome.
@@ -27,6 +28,8 @@ positional arguments: alias Interact with aliases. compare Compare two genomes. upgrade Upgrade config. This will alter the files on disk. + populate Populate registry paths with local paths. + populater Populate registry paths with remote paths. optional arguments: -h, --help show this help message and exit @@ -92,7 +95,7 @@ optional arguments: ## `refgenie listr --help` ```console -usage: refgenie listr [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] +usage: refgenie listr [-h] [-c C] [--skip-read-lock] [-g [G [G ...]]] [-s S [S ...]] [-p] List available remote assets. @@ -103,6 +106,11 @@ optional arguments: --skip-read-lock Whether the config file should not be locked for reading -g [G [G ...]], --genome [G [G ...]] Reference assembly ID, e.g. mm10. + -s S [S ...], --genome-server S [S ...] + One or more URLs to use. This information will not + persist in the genome config file. + -p, --append-server Whether the provided servers should be appended to + the list. ``` ## `refgenie pull --help` @@ -210,6 +218,70 @@ optional arguments: on disk. ``` +## `refgenie seekr --help` +```console +usage: refgenie seekr [-h] [-c C] [--skip-read-lock] [-g G] [-s S [S ...]] [-p] + [--remote-class RC] + asset-registry-paths [asset-registry-paths ...] + +Get the path to a remote asset. + +positional arguments: + asset-registry-paths One or more registry path strings that identify + assets (e.g. hg38/fasta or hg38/fasta:tag or + hg38/fasta.fai:tag). + +optional arguments: + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -g G, --genome G Reference assembly ID, e.g. mm10. + -s S [S ...], --genome-server S [S ...] + One or more URLs to use. This information will not + persist in the genome config file. 
+ -p, --append-server Whether the provided servers should be appended to + the list. + --remote-class RC Remote data provider class, e.g. 'http' or 's3' +``` + +## `refgenie populate --help` +```console +usage: refgenie populate [-h] [-c C] [--skip-read-lock] [-f F] + +Populate registry paths with local paths. + +optional arguments: + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional if REFGENIE + environment variable is set. + --skip-read-lock Whether the config file should not be locked for reading + -f F, --file F File with registry paths to populate +``` + +## `refgenie populater --help` +```console +usage: refgenie populater [-h] [-c C] [--skip-read-lock] [-s S [S ...]] [-p] + [--remote-class RC] [-f F] + +Populate registry paths with remote paths. + +optional arguments: + -h, --help show this help message and exit + -c C, --genome-config C Path to local genome configuration file. Optional + if REFGENIE environment variable is set. + --skip-read-lock Whether the config file should not be locked for + reading + -s S [S ...], --genome-server S [S ...] + One or more URLs to use. This information will not + persist in the genome config file. + -p, --append-server Whether the provided servers should be appended to + the list. + --remote-class RC Remote data provider class, e.g. 'http' or 's3' + -f F, --file F File with registry paths to populate +``` + ## `refgenie add --help` ```console usage: refgenie add [-h] [-c C] [--skip-read-lock] [-g G] [-f] -p P [-s S] @@ -313,8 +385,7 @@ optional arguments: ## `refgenie subscribe --help` ```console -usage: refgenie subscribe [-h] [-c C] [--skip-read-lock] [-r] -s GENOME_SERVER - [GENOME_SERVER ...] +usage: refgenie subscribe [-h] [-c C] [--skip-read-lock] [-r] -s S [S ...] Add a refgenieserver URL to the config. 
@@ -325,15 +396,14 @@ optional arguments: --skip-read-lock Whether the config file should not be locked for reading -r, --reset Overwrite the current list of server URLs. - -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] + -s S [S ...], --genome-server S [S ...] One or more URLs to add to the genome_servers attribute in config file. ``` ## `refgenie unsubscribe --help` ```console -usage: refgenie unsubscribe [-h] [-c C] [--skip-read-lock] -s GENOME_SERVER - [GENOME_SERVER ...] +usage: refgenie unsubscribe [-h] [-c C] [--skip-read-lock] -s S [S ...] Remove a refgenieserver URL from the config. @@ -343,7 +413,7 @@ optional arguments: if REFGENIE environment variable is set. --skip-read-lock Whether the config file should not be locked for reading - -s GENOME_SERVER [GENOME_SERVER ...], --genome-server GENOME_SERVER [GENOME_SERVER ...] + -s S [S ...], --genome-server S [S ...] One or more URLs to remove from the genome_servers attribute in config file. 
``` diff --git a/mkdocs.yml b/mkdocs.yml index 4b6991c4..bc1e274c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,11 +3,11 @@ site_logo: img/refgenie_logo_light.svg site_url: http://refgenie.databio.org/en/latest/ repo_url: http://github.com/databio/refgenie pypi_name: refgenie -paper_link: https://doi.org/10.1093/gigascience/giz149 nav: - Getting Started: - Introduction: README.md + - Demo videos: demo_videos.md - Overview: overview.md - Install and configure: install.md - Basic tutorial: tutorial.md @@ -22,6 +22,7 @@ nav: - Compare genomes: compare.md - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md + - Use refgenie on the cloud: remote.md - Use refgenie with iGenomes: igenomes.md - Upgrade from config 0.3 to 0.4: config_upgrade_03_to_04.md - Reference: @@ -52,3 +53,7 @@ navbar: - text: Refgenomes server icon: fa-server href: servers + right: + - text: Manuscripts + icon: fa-file-alt + href: manuscripts diff --git a/refgenie/cli.py b/refgenie/cli.py index d7aa6233..8320ff0c 100644 --- a/refgenie/cli.py +++ b/refgenie/cli.py @@ -403,8 +403,8 @@ def process_populate(pop_fun, file_path=None): """ Process a populate request (file or stdin) with a custom populator function - :param callable(dict | str | list) -> dict | str | list: a function that populates - refgenie registry paths in objects + :param callable(dict | str | list) -> dict | str | list pop_fun: a function + that populates refgenie registry paths in objects :param str file_path: path to the file to populate refgenie registry paths in, skip for stdin processing """ diff --git a/setup.py b/setup.py index 458f24b7..e4e85558 100755 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ import os -from setuptools import setup import sys +from setuptools import setup + # Ordinary dependencies DEPENDENCIES = [] with open("requirements/requirements-all.txt", "r") as reqs_file: diff --git a/update-usage-docs.sh b/update-usage-docs.sh index 0841bc4b..45440505 100755 --- 
a/update-usage-docs.sh +++ b/update-usage-docs.sh @@ -1,7 +1,7 @@ #!/bin/bash cp docs/usage.template usage.template #looper --help > USAGE.temp 2>&1 -for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do +for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "seekr --help" "populate --help" "populater --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do echo $cmd echo -e "## \`refgenie $cmd\`" > USAGE_header.temp refgenie $cmd --help > USAGE.temp 2>&1 From fae3b4070605cc97bfe14c84b161650e55afb195 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Tue, 23 Mar 2021 16:19:44 -0400 Subject: [PATCH 19/44] small remote docs tweaks --- docs/remote.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/remote.md b/docs/remote.md index 2b7db9b0..32496b30 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -6,7 +6,7 @@ Starting with version 0.11.0, refgenie can be used in *remote mode*, which means *Hint: all of these commands end with "r"* -There are a few commands that do not require genome configuration file to run. `-s/--genome-servers` argument specifies the list of servers you want refgenie to query. Default server (http://refgenomes.databio.org) is used if not provided. +There are a few commands that do not require genome configuration file to run. `-s/--genome-servers` argument specifies the list of servers you want refgenie to query. Default server ([http://refgenomes.databio.org](http://refgenomes.databio.org)) is used if not provided. 
### List remote assets with `refgenie listr` @@ -46,7 +46,7 @@ Set genome alias (2230c535660fb4774114bfa966a62f823fdb6d21acf138d4: hg38) s3://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa ``` -`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: http://rg.databio.org/remotes/dict. +`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: [http://rg.databio.org/remotes/dict](http://rg.databio.org/remotes/dict). ### Replace asset registry paths with remote asset paths with `refgenie populater` @@ -71,19 +71,19 @@ Set genome alias (2230c535660fb4774114bfa966a62f823fdb6d21acf138d4: hg38) test remote populating s3://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa:default ``` -`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: http://rg.databio.org/remotes/dict. +`-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: [http://rg.databio.org/remotes/dict](http://rg.databio.org/remotes/dict). #### populate text from file -1. 
check input file contents +- Check input file contents ```console ~ cat remote_populate_test.txt -human genome FASTA file: refgenie://hg38/fasta.fasta:default -yeast doubled genome FASTA file: refgenie://rCRSd/fasta.fasta:default +human genome FASTA file: refgenie://hg38/fasta.fasta +yeast doubled genome FASTA file: refgenie://rCRSd/fasta.fasta ``` -2. run `refgenie populater` +- Run `refgenie populater` ```console ~ refgenie populater -f remote_populate_test.txt -s http://rg.databio.org > remote_populate_test_output.txt @@ -99,12 +99,12 @@ Determined server digest for local genome alias (rCRSd): 94e0d21feb576e6af61cd2a Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd) ``` -3. check output file contents +- Check output file contents ```console ~ cat remote_populate_test_output.txt -human genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa:default -yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta__default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa:default +human genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa +yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta__default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa ``` ## Motivation From c5e15477e440a1d2b6db891a9e72d2ec0a1a2232 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 24 Mar 2021 14:43:38 -0400 Subject: [PATCH 20/44] docs updates --- docs/demo_videos.md | 4 ++-- docs/populate.md | 43 +++++++++++++++++++++++++++++++++++++++++++ docs/remote.md | 4 +++- mkdocs.yml | 1 + refgenie/argparser.py | 4 +--- 5 files changed, 50 
insertions(+), 6 deletions(-) create mode 100644 docs/populate.md diff --git a/docs/demo_videos.md b/docs/demo_videos.md index 7cc97111..01819919 100644 --- a/docs/demo_videos.md +++ b/docs/demo_videos.md @@ -8,7 +8,7 @@ After refgenie Python package is installed with `pip install refgenie`, refgenie If used *without* genome configuration file, the data does not persist from session to session. It's really useful in cloud applications. -**hint**: remote commands end with `r`: +*Hint: all of these commands end with "r"*: - `refgenie listr` - `refgenie seekr` @@ -20,7 +20,7 @@ If used *without* genome configuration file, the data does not persist from sess ## With config - local mode -If used *with* genome configuration file, the data persists from session to session. That's the most common scenario, you can `build`, `pull` and `seek` for assets managed locally. +If used *with* genome configuration file, the data persists from session to session. That's the most common scenario, you can `build`, `pull` and `seek` assets managed locally. **!!! Record a new asciicast and insert here** diff --git a/docs/populate.md b/docs/populate.md new file mode 100644 index 00000000..e897b707 --- /dev/null +++ b/docs/populate.md @@ -0,0 +1,43 @@ +# Replace refgenie registry paths with asset file paths + +`refgenie populate` and `refgenie populater` commands can be used to replace refgenie registry paths with asset file paths -- **go from `refgenie://hg38/fasta` to `/home/johndoe/genomes/hg38/fasta/default/hg38.fa`**. + +# Motivation + +In some cases it is desirable to run a refgenie-unaware workflow and benefit from refgenie framework. Is such cases we need to populate input for a workflow run as a pre-processing step, outside of the workflow. For instance, this is the way [Common Workflow Language](https://www.commonwl.org/) (CWL) works; CWL workflows in best practices require knowledge of all input files before the workflow run begins. 
So, it doesn't make sense to pass a registry path, which is then resolved by refgenie inside the workflow. + +# Usage examples + +Both `populate` commands can populate refgenie registry paths both in file and string. + +## Text intput + +To populate an argument to `bowtie2` command with a local path to `bowtie2_index` asset managed by refgenie and run the aligner call: + +```console +echo 'bowtie2 -x refgenie://hg38/bowtie2_index -U reads_1.fq -S eg1.sam' | refgenie populate | sh +``` + +## File input + +Example input in `test/config_template.yaml`: +```yaml +config: + param1: value1 + fasta: refgenie://hg38/fasta + bowtie2_index: refgenie://hg38/bowtie2_index +``` + +To populate a bowtie2 index and FASTA file paths in a YAML configuration file of an arbitrary pipelne call: + +```console +refgenie populate --file test/config_template.yaml > test/config.yaml +``` + +Example output in `test/config.yaml`: +```yaml +config: + param1: value1 + fasta: /home/johndoe/genomes/hg38/fasta/default/hg38.fa + bowtie2_index: /home/johndoe/genomes/hg38/bowtie2_index/default/hg38 +``` diff --git a/docs/remote.md b/docs/remote.md index 32496b30..fdf6a196 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -109,4 +109,6 @@ yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.o ## Motivation -... \ No newline at end of file +The motivation behind the remote mode in refgenie is *cloud computing*. It is becoming a common practice to farm out jobs that require refgenie assets to computing clusters, where refgenie environment is not configured. + +Up until now, the user was expected to `init` the refgenie config, `pull` desired assets and then `seek` the path in order to pass it to the data processing workflow. With the new `seekr` the configuration and data acquisition steps can be skipped. What is more, refgenieserver software provides full flexibility regarding place where the asset files and archives are stored. 
Therefore, in some cases the data may be readily avilable within the cloud services provider's servers. For example, [http://refgenomes.databio.org](http://refgenomes.databio.org) refgenieserver instance stores the data in AWS S3, so any jobs running on AWS servers would benefit from the increased performance. \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index bc1e274c..e3fd420e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,6 +19,7 @@ nav: - Retrieve paths to assets: seek.md - Use asset tags: tag.md - Use aliases: aliases.md + - Populate refgenie paths: populate.md - Compare genomes: compare.md - Run my own asset server: refgenieserver.md - Use refgenie from Python: refgenconf.md diff --git a/refgenie/argparser.py b/refgenie/argparser.py index 8bb64ac2..9e003219 100644 --- a/refgenie/argparser.py +++ b/refgenie/argparser.py @@ -81,9 +81,7 @@ def add_subparser(cmd, msg, subparsers): "--genome-server", nargs="+", default=[DEFAULT_SERVER], - help="URL(s) to use for the {} attribute in config file. Default: {}.".format( - CFG_SERVERS_KEY, DEFAULT_SERVER - ), + help=f"URL(s) to use for the {CFG_SERVERS_KEY} attribute in config file. Default: {DEFAULT_SERVER}.", ) sps[INIT_CMD].add_argument( "-f", From 2ff916107bae9af28c4ec2289b7208e008763523 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Wed, 24 Mar 2021 17:10:18 -0400 Subject: [PATCH 21/44] fix typos in docs --- docs/populate.md | 8 ++++---- docs/remote.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/populate.md b/docs/populate.md index e897b707..f63e9fab 100644 --- a/docs/populate.md +++ b/docs/populate.md @@ -1,14 +1,14 @@ # Replace refgenie registry paths with asset file paths -`refgenie populate` and `refgenie populater` commands can be used to replace refgenie registry paths with asset file paths -- **go from `refgenie://hg38/fasta` to `/home/johndoe/genomes/hg38/fasta/default/hg38.fa`**. 
+`refgenie populate` and `refgenie populater` commands can be used to replace refgenie registry paths with asset file paths. For example, go from `refgenie://hg38/fasta` to `/home/johndoe/genomes/hg38/fasta/default/hg38.fa`. # Motivation -In some cases it is desirable to run a refgenie-unaware workflow and benefit from refgenie framework. Is such cases we need to populate input for a workflow run as a pre-processing step, outside of the workflow. For instance, this is the way [Common Workflow Language](https://www.commonwl.org/) (CWL) works; CWL workflows in best practices require knowledge of all input files before the workflow run begins. So, it doesn't make sense to pass a registry path, which is then resolved by refgenie inside the workflow. +In some cases it is desirable to run a refgenie-unaware workflow and benefit from the refgenie framework. In such cases we need to populate an input for a workflow run as a pre-processing step, outside of the workflow. For instance, this is the way [Common Workflow Language](https://www.commonwl.org/) (CWL) works; CWL workflows in best practices require knowledge of all input files before the workflow run begins. So, it doesn't make sense to pass a registry path, which is then resolved by refgenie inside the workflow. # Usage examples -Both `populate` commands can populate refgenie registry paths both in file and string. +Both `populate` commands can populate refgenie registry paths both in **file** and **string**. 
## Text intput @@ -28,7 +28,7 @@ config: bowtie2_index: refgenie://hg38/bowtie2_index ``` -To populate a bowtie2 index and FASTA file paths in a YAML configuration file of an arbitrary pipelne call: +To populate a bowtie2 index and FASTA file paths in a YAML configuration file of an arbitrary pipeline call: ```console refgenie populate --file test/config_template.yaml > test/config.yaml diff --git a/docs/remote.md b/docs/remote.md index fdf6a196..f7f2d243 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -111,4 +111,4 @@ yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.o The motivation behind the remote mode in refgenie is *cloud computing*. It is becoming a common practice to farm out jobs that require refgenie assets to computing clusters, where refgenie environment is not configured. -Up until now, the user was expected to `init` the refgenie config, `pull` desired assets and then `seek` the path in order to pass it to the data processing workflow. With the new `seekr` the configuration and data acquisition steps can be skipped. What is more, refgenieserver software provides full flexibility regarding place where the asset files and archives are stored. Therefore, in some cases the data may be readily avilable within the cloud services provider's servers. For example, [http://refgenomes.databio.org](http://refgenomes.databio.org) refgenieserver instance stores the data in AWS S3, so any jobs running on AWS servers would benefit from the increased performance. \ No newline at end of file +Up until now, the user was expected to `init` the refgenie config, `pull` desired assets and then `seek` the path in order to pass it to the data processing workflow. With the new `seekr` the configuration and data acquisition steps can be skipped. What is more, refgenieserver software provides full flexibility regarding place where the asset files and archives are stored. 
Therefore, in some cases the data may be readily available within the cloud services provider's servers. For example, [http://refgenomes.databio.org](http://refgenomes.databio.org) refgenieserver instance stores the data in AWS S3, so any jobs running on AWS servers would benefit from the increased performance. \ No newline at end of file From ba3299f2021743fe5976a4dc976cc60634be6f6f Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 26 Mar 2021 13:14:25 -0400 Subject: [PATCH 22/44] add a dir seek key by default; #217 --- refgenie/refgenie.py | 14 +++++--------- refgenie/schemas/recipe_schema.yaml | 4 +++- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index f1b38f7e..d5c9c1ec 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -156,9 +156,7 @@ def _build_asset( os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR) ) _LOGGER.info( - "Saving outputs to:\n- content: {}\n- logs: {}".format( - genome_outfolder, log_outfolder - ) + f"Saving outputs to:\n- content: {genome_outfolder}\n- logs: {log_outfolder}" ) if args.docker: # Set up some docker stuff @@ -169,11 +167,7 @@ def _build_asset( volumes = genome_outfolder if not _writeable(genome_outfolder): - _LOGGER.error( - "Insufficient permissions to write to output folder: {}".format( - genome_outfolder - ) - ) + _LOGGER.error(f"Insufficient permissions to write to output folder: {genome_outfolder}") return pm = pypiper.PipelineManager( @@ -233,7 +227,9 @@ def _build_asset( "exist: {}".format(asset_dir) ) digest = get_dir_digest(asset_dir) - _LOGGER.info("Asset digest: {}".format(digest)) + _LOGGER.info(f"Asset digest: {digest}") + # add a 'dir' seek_key that points to the asset directory + build_pkg[ASSETS].update({"dir": "."}) # add updates to config file with rgc as r: if asset_key == "fasta": diff --git a/refgenie/schemas/recipe_schema.yaml b/refgenie/schemas/recipe_schema.yaml index c06bc03f..2b506418 100644 --- 
a/refgenie/schemas/recipe_schema.yaml +++ b/refgenie/schemas/recipe_schema.yaml @@ -10,7 +10,9 @@ properties: description: "description of the recipe" assets: type: object - description: "seek keys to be produced" + description: "seek keys to be produced, property names must be different from 'dir'" + propertyNames: + pattern: "^(?!.*dir$).*$" required_files: type: array items: From d337bc77d422209f64583ed042a8c6c107cfe5d1 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Fri, 26 Mar 2021 14:00:51 -0400 Subject: [PATCH 23/44] reformat --- .github/workflows/build-package.yml | 2 +- .github/workflows/test-refgenie-cli.yml | 2 +- .pre-commit-config.yaml | 20 ++++++++++++++++++++ refgenie/refgenie.py | 4 +++- 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.yml index ee558a08..2472afee 100644 --- a/.github/workflows/build-package.yml +++ b/.github/workflows/build-package.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8, 3.9] os: [ubuntu-latest, macos-latest] steps: diff --git a/.github/workflows/test-refgenie-cli.yml b/.github/workflows/test-refgenie-cli.yml index be6b0aa8..0d704122 100644 --- a/.github/workflows/test-refgenie-cli.yml +++ b/.github/workflows/test-refgenie-cli.yml @@ -9,7 +9,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.8] + python-version: [3.6, 3.9] os: [ubuntu-latest, macos-latest] steps: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..ab5489e2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.4.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: trailing-whitespace + + - repo: 
https://github.com/PyCQA/isort + rev: 5.7.0 + hooks: + - id: isort + args: ["--profile", "black"] + + - repo: https://github.com/psf/black + rev: 20.8b1 + hooks: + - id: black diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py index d5c9c1ec..9a13333c 100755 --- a/refgenie/refgenie.py +++ b/refgenie/refgenie.py @@ -167,7 +167,9 @@ def _build_asset( volumes = genome_outfolder if not _writeable(genome_outfolder): - _LOGGER.error(f"Insufficient permissions to write to output folder: {genome_outfolder}") + _LOGGER.error( + f"Insufficient permissions to write to output folder: {genome_outfolder}" + ) return pm = pypiper.PipelineManager( From ab86df6afc68bdaed5ef114f0b9439be7cf02013 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 29 Mar 2021 09:26:48 -0400 Subject: [PATCH 24/44] fix typos --- docs/remote.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/remote.md b/docs/remote.md index f7f2d243..66dda307 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -17,8 +17,8 @@ You can list assets available on remote servers with `refgenie listr`. Using default config. 
No config found in env var: ['REFGENIE'] Subscribed to: http://rg.databio.org - Remote refgenie assets - Server URL: http://rg.databio.org + Remote refgenie assets + Server URL: http://rg.databio.org ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ genome ┃ assets ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ @@ -27,7 +27,7 @@ Subscribed to: http://rg.databio.org │ human_repeats │ fasta, hisat2_index, bwa_index │ │ hg38 │ fasta, bowtie2_index │ └─────────────────────┴──────────────────────────────────────────────┘ - use refgenie listr -g for more detailed view + use refgenie listr -g for more detailed view ``` ### Find remote asset paths with `refgenie seekr` @@ -48,9 +48,9 @@ s3://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f82 `-r`/`--remote-class` command specifies the data provider link to be used in the output. Please refer to the refgenieserver instance API endpoint that lists available options, for example: [http://rg.databio.org/remotes/dict](http://rg.databio.org/remotes/dict). -### Replace asset registry paths with remote asset paths with `refgenie populater` +### Replace asset registry paths with remote asset paths using `refgenie populater` -You can replace refgenie asset registry paths in text of files with `refgenie populater`. Any string that matches the following format will be replaced with a remote path: +You can replace refgenie asset registry paths in **text** or **files** with `refgenie populater`. 
Any string that matches the following format will be replaced with a remote path: ``` refgenie://genome_alias/asset.seek_key:tag @@ -60,7 +60,7 @@ refgenie://genome_alias/asset.seek_key:tag #### populate text from standard input ```console -~ echo 'test remote populating refgenie://hg38/fasta.fasta:default' | refgenie populater hg38/fasta -s http://rg.databio.org --remote-class s3 +~ echo 'test remote populating refgenie://hg38/fasta.fasta:default' | refgenie populater -s http://rg.databio.org --r s3 Using default config. No config found in env var: ['REFGENIE'] Subscribed to: http://rg.databio.org @@ -77,7 +77,7 @@ test remote populating s3://awspds.refgenie.databio.org/rg.databio.org/2230c5356 - Check input file contents ```console -~ cat remote_populate_test.txt +~ cat remote_populate_test.txt human genome FASTA file: refgenie://hg38/fasta.fasta yeast doubled genome FASTA file: refgenie://rCRSd/fasta.fasta @@ -101,7 +101,7 @@ Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd) - Check output file contents ```console -~ cat remote_populate_test_output.txt +~ cat remote_populate_test_output.txt human genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta__default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.fa yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.org/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta__default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa @@ -109,6 +109,6 @@ yeast doubled genome FASTA file: http://awspds.refgenie.databio.org/rg.databio.o ## Motivation -The motivation behind the remote mode in refgenie is *cloud computing*. It is becoming a common practice to farm out jobs that require refgenie assets to computing clusters, where refgenie environment is not configured. +The motivation behind the remote mode in refgenie is *cloud computing*. 
It is becoming a common practice to farm out jobs that require refgenie assets to computing clusters, where refgenie environment is not configured. -Up until now, the user was expected to `init` the refgenie config, `pull` desired assets and then `seek` the path in order to pass it to the data processing workflow. With the new `seekr` the configuration and data acquisition steps can be skipped. What is more, refgenieserver software provides full flexibility regarding place where the asset files and archives are stored. Therefore, in some cases the data may be readily available within the cloud services provider's servers. For example, [http://refgenomes.databio.org](http://refgenomes.databio.org) refgenieserver instance stores the data in AWS S3, so any jobs running on AWS servers would benefit from the increased performance. \ No newline at end of file +Up until now, the user was expected to `init` the refgenie config, `pull` desired assets and then `seek` the path in order to pass it to the data processing workflow. With the new `seekr` command the configuration and data acquisition steps can be skipped. What is more, refgenieserver software provides full flexibility regarding place where the asset files and archives are stored. Therefore, in some cases the data may be readily available within the cloud services provider's servers. For example, [http://refgenomes.databio.org](http://refgenomes.databio.org) refgenieserver instance stores the data in AWS S3, so any jobs running on AWS servers would benefit from the increased performance. 
From 43282d678e7c956786f95be13267abc37785cd9c Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 29 Mar 2021 10:38:29 -0400 Subject: [PATCH 25/44] update asciicasts --- docs/demo_videos.md | 12 ++++-------- docs/remote.md | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/demo_videos.md b/docs/demo_videos.md index 01819919..2697fb43 100644 --- a/docs/demo_videos.md +++ b/docs/demo_videos.md @@ -1,8 +1,8 @@ # Demo videos - start working with refgenie -Check out these demonstration videos, which show how to quickly start working with refgenie. +Check out these demonstration videos, which show how to quickly start working with refgenie. -After refgenie Python package is installed with `pip install refgenie`, refgenie can be used with or without genome configuration file. +After refgenie Python package is installed with `pip install refgenie`, refgenie can be used with or without genome configuration file. ## Without config - remote mode @@ -14,14 +14,10 @@ If used *without* genome configuration file, the data does not persist from sess - `refgenie seekr` - `refgenie populater` -**!!! Record a new asciicast and insert here** - - + ## With config - local mode If used *with* genome configuration file, the data persists from session to session. That's the most common scenario, you can `build`, `pull` and `seek` assets managed locally. -**!!! Record a new asciicast and insert here** - - \ No newline at end of file + diff --git a/docs/remote.md b/docs/remote.md index 66dda307..9f4e46cb 100644 --- a/docs/remote.md +++ b/docs/remote.md @@ -35,7 +35,7 @@ Subscribed to: http://rg.databio.org You can seek for remote asset paths with `refgenie seekr`: ```console -~ refgenie seekr hg38/fasta -s http://rg.databio.org --remote-class s3 +~ refgenie seekr hg38/fasta -s http://rg.databio.org -r s3 Using default config. 
No config found in env var: ['REFGENIE'] Subscribed to: http://rg.databio.org @@ -60,7 +60,7 @@ refgenie://genome_alias/asset.seek_key:tag #### populate text from standard input ```console -~ echo 'test remote populating refgenie://hg38/fasta.fasta:default' | refgenie populater -s http://rg.databio.org --r s3 +~ echo 'test remote populating refgenie://hg38/fasta.fasta:default' | refgenie populater -s http://rg.databio.org -r s3 Using default config. No config found in env var: ['REFGENIE'] Subscribed to: http://rg.databio.org From 7d9b3b16012e26ae17adcb1f6d986a5aea13e371 Mon Sep 17 00:00:00 2001 From: Michal Stolarczyk Date: Mon, 29 Mar 2021 11:08:03 -0400 Subject: [PATCH 26/44] docs tweaks --- docs/README.md | 14 ++++++++------ docs/autodoc_build/refgenconf.md | 24 ++++++++++++------------ docs/seekr.md | 17 ----------------- 3 files changed, 20 insertions(+), 35 deletions(-) delete mode 100644 docs/seekr.md diff --git a/docs/README.md b/docs/README.md index 8db0065e..07dc2325 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,6 +10,8 @@ Refgenie manages storage, access, and transfer of reference genome resources. It provides command-line and Python interfaces to *download* pre-built reference genome "assets", like indexes used by bioinformatics tools. It can also *build* assets for custom genome assemblies. Refgenie provides programmatic access to a standard genome folder structure, so software can swap from one genome to another. +**In a hurry?** Check out the [demo videos](demo_videos.md) that present the most relevant refgenie features in 3 minutes! + ## What makes refgenie better? 1. **It provides a command-line interface to download individual resources**. Think of it as `GitHub` for reference genomes. You just type `refgenie pull hg38/bwa_index`. @@ -17,7 +19,7 @@ Refgenie manages storage, access, and transfer of reference genome resources. It 2. **It's scripted**. 
In case you need resources *not* on the server, such as for a custom genome, you can `build` your own: `refgenie build custom_genome/bowtie2_index`. 3. **It simplifies finding local asset locations**. When you need a path to an asset, you can `seek` it, making your pipelines portable across computing environments: `refgenie seek hg38/salmon_index`. - + 4. **It provides remote operation mode**, useful for cloud applications. Get a path to an asset file hosted on AWS S3: `refgenie seekr hg38/fasta --remote-class s3`. 5. **It includes a Python API**. For tool developers, you use `rgc = refgenconf.RefGenConf("genomes.yaml")` to get a Python object with paths to any genome asset, *e.g.*, `rgc.seek("hg38", "kallisto_index")`. @@ -68,8 +70,8 @@ refgenie listr Response: ```console - Remote refgenie assets - Server URL: http://refgenomes.databio.org + Remote refgenie assets + Server URL: http://refgenomes.databio.org ┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ genome ┃ assets ┃ ┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ @@ -93,7 +95,7 @@ Downloading URL: http://rg.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a Download complete: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz Extracting asset tarball: /Users/mstolarczyk/Desktop/testing/refgenie/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/bowtie2_index__default.tgz Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default -Created alias directories: +Created alias directories: - /Users/mstolarczyk/Desktop/testing/refgenie/alias/rCRSd/bowtie2_index/default ``` @@ -130,7 +132,7 @@ refgenie seekr mm10/fasta.fai This will return the path to the particular remote file of interest, here: FASTA index file, which is a part of `mm10/fasta` asset. -See [further reading on seeking remote asset files](seekr.md). 
+See [further reading on using refgenie in remote mode](remote.md). --- -If you want to read more about the motivation behind refgenie and the software engineering that makes refgenie work, proceed next to the [overview](overview.md). \ No newline at end of file +If you want to read more about the motivation behind refgenie and the software engineering that makes refgenie work, proceed next to the [overview](overview.md). diff --git a/docs/autodoc_build/refgenconf.md b/docs/autodoc_build/refgenconf.md index a220a977..951b8143 100644 --- a/docs/autodoc_build/refgenconf.md +++ b/docs/autodoc_build/refgenconf.md @@ -7,7 +7,7 @@ document.addEventListener('DOMContentLoaded', (event) => {