for more detailed view \u001b[0m\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!refgenie listr -c refgenie.yaml"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -66,64 +151,70 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import refgenconf\n",
- "rgc = refgenconf.RefGenConf(\"refgenie.yaml\")"
+ "rgc = refgenconf.RefGenConf(filepath=\"refgenie.yaml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Use `pull` to download the actual asset:"
+ "Use `pull` to download one of the assets:"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " \r"
- ]
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "abab7f40d9654ca6ba8c60471cbe303a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Output()"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
{
"data": {
"text/plain": [
- "(['hs38d1', 'fasta', 'default'],\n",
- " {'archive_digest': '310c578812a64fcdf08d2df60d7b79b4',\n",
- " 'archive_size': '1.7MB',\n",
- " 'asset_children': ['hs38d1/star_index:default',\n",
- " 'hs38d1/bwa_index:default',\n",
- " 'hs38d1/bowtie2_index:default',\n",
- " 'hs38d1/bismark_bt1_index:default',\n",
- " 'hs38d1/bismark_bt2_index:default',\n",
- " 'hs38d1/hisat2_index:default',\n",
- " 'hs38d1/tallymer_index:default',\n",
- " 'hs38d1/suffixerator_index:default'],\n",
- " 'asset_digest': 'eddf5466faa3391a7114e87648466dcb',\n",
+ "(['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a', 'fasta', 'default'],\n",
+ " {'asset_path': 'fasta',\n",
+ " 'asset_digest': '8dfe402f7d29d5b036dd8937119e4404',\n",
+ " 'archive_digest': 'bfb7877ee114c61a17a50bd471de47a2',\n",
+ " 'asset_size': '39.4KB',\n",
+ " 'archive_size': '9.1KB',\n",
+ " 'seek_keys': {'fasta': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa',\n",
+ " 'fai': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.fa.fai',\n",
+ " 'chrom_sizes': '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a.chrom.sizes'},\n",
" 'asset_parents': [],\n",
- " 'asset_path': 'fasta',\n",
- " 'asset_size': '6.0MB',\n",
- " 'seek_keys': {'chrom_sizes': 'hs38d1.chrom.sizes',\n",
- " 'fai': 'hs38d1.fa.fai',\n",
- " 'fasta': 'hs38d1.fa'}},\n",
- " 'http://refgenomes.databio.org')"
+ " 'asset_children': ['43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/suffixerator_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bowtie2_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bwa_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/tallymer_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/hisat2_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/star_index:default',\n",
+ " '43f14ba8beed34d52edb244e26f193df6edbb467bd55d37a/bismark_bt2_index:default']},\n",
+ " 'http://rg.databio.org')"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "rgc.pull(\"hs38d1\", \"fasta\", \"default\")"
+ "rgc.pull(\"mouse_chrM2x\", \"fasta\", \"default\")"
]
},
{
@@ -135,22 +226,22 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/nsheff/code/refgenie/docs_jupyter/hs38d1/fasta/default/hs38d1.fa'"
+ "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/mouse_chrM2x/fasta/default/mouse_chrM2x.fa'"
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "rgc.seek(\"hs38d1\", \"fasta\")"
+ "rgc.seek(\"mouse_chrM2x\", \"fasta\")"
]
},
{
@@ -162,22 +253,22 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'eddf5466faa3391a7114e87648466dcb'"
+ "'8dfe402f7d29d5b036dd8937119e4404'"
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "rgc.id(\"hs38d1\", \"fasta\")"
+ "rgc.id(\"mouse_chrM2x\", \"fasta\")"
]
},
{
@@ -196,36 +287,34 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "--2020-03-13 16:11:59-- http://big.databio.org/refgenie_raw/rCRSd.fa.gz\r\n",
- "Resolving big.databio.org (big.databio.org)... 128.143.245.181\r\n",
- "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\r\n",
- "HTTP request sent, awaiting response... 200 OK\r\n",
- "Length: 8399 (8.2K) [application/octet-stream]\r\n",
- "Saving to: ‘rCRSd.fa.gz’\r\n",
- "\r\n",
- "\r",
- "rCRSd.fa.gz 0%[ ] 0 --.-KB/s \r",
- "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0s \r\n",
- "\r\n",
- "2020-03-13 16:11:59 (214 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\r\n",
- "\r\n"
+ "--2021-03-09 12:22:40-- http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta\n",
+ "Resolving big.databio.org (big.databio.org)... 128.143.245.181, 128.143.245.182\n",
+ "Connecting to big.databio.org (big.databio.org)|128.143.245.181|:80... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 8399 (8.2K) [application/octet-stream]\n",
+ "Saving to: ‘rCRSd.fa.gz’\n",
+ "\n",
+ "rCRSd.fa.gz 100%[===================>] 8.20K --.-KB/s in 0.006s \n",
+ "\n",
+ "2021-03-09 12:22:40 (1.35 MB/s) - ‘rCRSd.fa.gz’ saved [8399/8399]\n",
+ "\n"
]
}
],
"source": [
- "!wget http://big.databio.org/refgenie_raw/rCRSd.fa.gz"
+ "!wget -O rCRSd.fa.gz http://big.databio.org/refgenie_raw/files.rCRSd.fasta.fasta"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -233,24 +322,30 @@
"output_type": "stream",
"text": [
"Using 'default' as the default tag for 'rCRSd/fasta'\n",
+ "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n",
"Building 'rCRSd/fasta:default' using 'fasta' recipe\n",
+ "Initializing genome: rCRSd\n",
+ "Loaded AnnotatedSequenceDigestList (1 sequences)\n",
+ "Set genome alias (94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4: rCRSd)\n",
+ "Created alias directories: \n",
+ " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd\n",
"Saving outputs to:\n",
- "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n",
- "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build\n",
+ "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n",
+ "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build\n",
"### Pipeline run code and environment:\n",
"\n",
- "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n",
- "* Compute host: puma\n",
- "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n",
- "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/\n",
- "* Pipeline started at: (03-13 16:11:59) elapsed: 0.0 _TIME_\n",
+ "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R`\n",
+ "* Compute host: MichalsMBP\n",
+ "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n",
+ "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/\n",
+ "* Pipeline started at: (03-09 12:22:41) elapsed: 0.0 _TIME_\n",
"\n",
"### Version log:\n",
"\n",
- "* Python version: 3.7.6\n",
- "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n",
+ "* Python version: 3.6.5\n",
+ "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n",
"* Pypiper version: 0.12.1\n",
- "* Pipeline dir: `/home/nsheff/.local/bin`\n",
+ "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n",
"* Pipeline version: None\n",
"\n",
"### Arguments passed to pipeline:\n",
@@ -266,67 +361,74 @@
"* `genome_description`: `None`\n",
"* `logdev`: `False`\n",
"* `new_start`: `False`\n",
- "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n",
+ "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n",
"* `params`: `None`\n",
"* `recipe`: `None`\n",
"* `recover`: `True`\n",
"* `requirements`: `False`\n",
"* `silent`: `False`\n",
+ "* `skip_read_lock`: `False`\n",
"* `tag_description`: `None`\n",
"* `verbosity`: `None`\n",
"* `volumes`: `None`\n",
"\n",
"----------------------------------------\n",
"\n",
- "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` \n",
+ "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` \n",
"\n",
- "> `cp rCRSd.fa.gz /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28689)\n",
+ "> `cp rCRSd.fa.gz /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63575)\n",
"\n",
+ "psutil.ZombieProcess process still exists but it's a zombie (pid=63575)\n",
+ "Warning: couldn't add memory use for process: 63575\n",
"
\n",
"Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n",
- " PID: 28689;\tCommand: cp;\tReturn code: 0;\tMemory used: 0.0GB\n",
+ " PID: 63575;\tCommand: cp;\tReturn code: 0;\tMemory used: 0GB\n",
"\n",
"\n",
- "> `gzip -d /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.gz` (28691)\n",
+ "> `gzip -df /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.gz` (63576)\n",
"\n",
+ "psutil.ZombieProcess process still exists but it's a zombie (pid=63576)\n",
+ "Warning: couldn't add memory use for process: 63576\n",
"
\n",
"Command completed. Elapsed time: 0:00:00. Running peak memory: 0GB. \n",
- " PID: 28691;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0.0GB\n",
+ " PID: 63576;\tCommand: gzip;\tReturn code: 0;\tMemory used: 0GB\n",
"\n",
"\n",
- "> `samtools faidx /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa` (28693)\n",
+ "> `samtools faidx /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa` (63577)\n",
"\n",
"
\n",
- "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.018GB. \n",
- " PID: 28693;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.018GB\n",
+ "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n",
+ " PID: 63577;\tCommand: samtools;\tReturn code: 0;\tMemory used: 0.001GB\n",
"\n",
"\n",
- "> `cut -f 1,2 /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa.fai > /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.chrom.sizes` (28761)\n",
+ "> `cut -f 1,2 /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa.fai > /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.chrom.sizes` (63578)\n",
"\n",
+ "psutil.ZombieProcess process still exists but it's a zombie (pid=63578)\n",
+ "Warning: couldn't add memory use for process: 63578\n",
"
\n",
- "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n",
- " PID: 28761;\tCommand: cut;\tReturn code: 0;\tMemory used: 0.0GB\n",
+ "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n",
+ " PID: 63578;\tCommand: cut;\tReturn code: 0;\tMemory used: 0GB\n",
"\n",
"\n",
- "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/_refgenie_build/rCRSd_fasta__default.flag` (28763)\n",
+ "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_fasta__default.flag` (63580)\n",
"\n",
+ "psutil.ZombieProcess process still exists but it's a zombie (pid=63580)\n",
+ "Warning: couldn't add memory use for process: 63580\n",
"
\n",
- "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.018GB. \n",
- " PID: 28763;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n",
- "\n",
+ "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.001GB. \n",
+ " PID: 63580;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n",
"\n",
- "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n",
"Asset digest: 4eb430296bc02ed7e4006624f1d5ac53\n",
- "Default tag for 'rCRSd/fasta' set to: default\n",
+ "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta' set to: default\n",
"\n",
"### Pipeline completed. Epilogue\n",
- "* Elapsed time (this run): 0:00:01\n",
- "* Total elapsed time (all runs): 0:00:01\n",
- "* Peak memory (this run): 0.0184 GB\n",
- "* Pipeline completed time: 2020-03-13 16:12:00\n",
- "Computing initial genome digest...\n",
- "Initializing genome...\n",
- "Finished building 'fasta' asset\n"
+ "* Elapsed time (this run): 0:00:00\n",
+ "* Total elapsed time (all runs): 0:00:00\n",
+ "* Peak memory (this run): 0.0015 GB\n",
+ "* Pipeline completed time: 2021-03-09 12:22:41\n",
+ "Finished building 'fasta' asset\n",
+ "Created alias directories: \n",
+ " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default\n"
]
}
],
@@ -334,16 +436,53 @@
"!refgenie build rCRSd/fasta -c refgenie.yaml --files fasta=rCRSd.fa.gz -R"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The asset should be available for local use, let's call `refgenie list` to check it:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[3m Local refgenie assets \u001b[0m\r\n",
+ "\u001b[3m Server subscriptions: http://rg.databio.org \u001b[0m\r\n",
+ "┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓\r\n",
+ "┃\u001b[1m \u001b[0m\u001b[1mgenome \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1masset (\u001b[0m\u001b[1;3mseek_keys\u001b[0m\u001b[1m) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtags \u001b[0m\u001b[1m \u001b[0m┃\r\n",
+ "┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩\r\n",
+ "│ rCRSd │ fasta (\u001b[3mfasta, fai, chrom_sizes\u001b[0m) │ default │\r\n",
+ "└───────────┴────────────────────────────────────────────┴───────────┘\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!refgenie list -c refgenie.yaml --genome rCRSd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can retrieve the path to this asset with:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\r\n"
+ "/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa\r\n"
]
}
],
@@ -355,21 +494,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can do the same thing from within python:"
+ "Naturally, we can do the same thing from within Python:"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa'"
+ "'/Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/fasta/default/rCRSd.fa'"
]
},
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -383,14 +522,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- " Now if you have bowtie2-build in your PATH you can build the bowtie2 index with no further requirements.\n",
+ "Now, if we have bowtie2-build in our `$PATH` we can build the `bowtie2_index` asset with no further requirements.\n",
"\n",
- "You can see the requirements with `--requirements`:\n"
+ "Let's check the requirements with `refgenie build --requirements`:\n"
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -416,7 +555,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -424,24 +563,25 @@
"output_type": "stream",
"text": [
"Using 'default' as the default tag for 'rCRSd/bowtie2_index'\n",
+ "Recipe validated successfully against a schema: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/refgenie/schemas/recipe_schema.yaml\n",
"Building 'rCRSd/bowtie2_index:default' using 'bowtie2_index' recipe\n",
"Saving outputs to:\n",
- "- content: /home/nsheff/code/refgenie/docs_jupyter/rCRSd\n",
- "- logs: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build\n",
+ "- content: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4\n",
+ "- logs: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build\n",
"### Pipeline run code and environment:\n",
"\n",
- "* Command: `/home/nsheff/.local/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n",
- "* Compute host: puma\n",
- "* Working dir: /home/nsheff/code/refgenie/docs_jupyter\n",
- "* Outfolder: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/\n",
- "* Pipeline started at: (03-13 16:12:02) elapsed: 0.0 _TIME_\n",
+ "* Command: `/Library/Frameworks/Python.framework/Versions/3.6/bin/refgenie build rCRSd/bowtie2_index -c refgenie.yaml`\n",
+ "* Compute host: MichalsMBP\n",
+ "* Working dir: /Users/mstolarczyk/code/refgenie/docs_jupyter\n",
+ "* Outfolder: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/\n",
+ "* Pipeline started at: (03-09 12:22:45) elapsed: 0.0 _TIME_\n",
"\n",
"### Version log:\n",
"\n",
- "* Python version: 3.7.6\n",
- "* Pypiper dir: `/home/nsheff/.local/lib/python3.7/site-packages/pypiper`\n",
+ "* Python version: 3.6.5\n",
+ "* Pypiper dir: `/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pypiper`\n",
"* Pypiper version: 0.12.1\n",
- "* Pipeline dir: `/home/nsheff/.local/bin`\n",
+ "* Pipeline dir: `/Library/Frameworks/Python.framework/Versions/3.6/bin`\n",
"* Pipeline version: None\n",
"\n",
"### Arguments passed to pipeline:\n",
@@ -457,25 +597,25 @@
"* `genome_description`: `None`\n",
"* `logdev`: `False`\n",
"* `new_start`: `False`\n",
- "* `outfolder`: `/home/nsheff/code/refgenie/docs_jupyter`\n",
+ "* `outfolder`: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data`\n",
"* `params`: `None`\n",
"* `recipe`: `None`\n",
"* `recover`: `False`\n",
"* `requirements`: `False`\n",
"* `silent`: `False`\n",
+ "* `skip_read_lock`: `False`\n",
"* `tag_description`: `None`\n",
"* `verbosity`: `None`\n",
"* `volumes`: `None`\n",
"\n",
"----------------------------------------\n",
"\n",
- "Target to produce: `/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` \n",
+ "Target to produce: `/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` \n",
"\n",
- "> `bowtie2-build /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd` (28812)\n",
+ "> `bowtie2-build /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` (63609)\n",
"\n",
- "Building a SMALL index\n",
"Settings:\n",
- " Output files: \"/home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.*.bt2\"\n",
+ " Output files: \"/Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.*.bt2\"\n",
" Line rate: 6 (line is 64 bytes)\n",
" Lines per side: 1 (side is 64 bytes)\n",
" Offset rate: 4 (one in 16)\n",
@@ -492,7 +632,8 @@
" Random seed: 0\n",
" Sizeofs: void*:8, int:4, long:8, size_t:8\n",
"Input files DNA, FASTA:\n",
- " /home/nsheff/code/refgenie/docs_jupyter/rCRSd/fasta/default/rCRSd.fa\n",
+ " /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/fasta/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.fa\n",
+ "Building a SMALL index\n",
"Reading reference sizes\n",
" Time reading reference sizes: 00:00:00\n",
"Calculating joined length\n",
@@ -545,8 +686,8 @@
"fchr[$]: 33136\n",
"Exiting Ebwt::buildToDisk()\n",
"Returning from initFromVector\n",
- "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.1.bt2\n",
- "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.2.bt2\n",
+ "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.1.bt2\n",
+ "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.2.bt2\n",
"Re-opening _in1 and _in2 as input streams\n",
"Returning from Ebwt constructor\n",
"Headers:\n",
@@ -628,8 +769,8 @@
"fchr[$]: 33136\n",
"Exiting Ebwt::buildToDisk()\n",
"Returning from initFromVector\n",
- "Wrote 4205567 bytes to primary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.1.bt2\n",
- "Wrote 8292 bytes to secondary EBWT file: /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/rCRSd.rev.2.bt2\n",
+ "Wrote 4205567 bytes to primary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.1.bt2\n",
+ "Wrote 8292 bytes to secondary EBWT file: /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4.rev.2.bt2\n",
"Re-opening _in1 and _in2 as input streams\n",
"Returning from Ebwt constructor\n",
"Headers:\n",
@@ -658,28 +799,36 @@
" color: 0\n",
" reverse: 1\n",
"Total time for backward call to driver() for mirror index: 00:00:00\n",
- "
\n",
- "Command completed. Elapsed time: 0:00:01. Running peak memory: 0.019GB. \n",
- " PID: 28812;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.019GB\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n",
+ " PID: 63609;\tCommand: bowtie2-build;\tReturn code: 0;\tMemory used: 0.003GB\n",
"\n",
"\n",
- "> `touch /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default/_refgenie_build/rCRSd_bowtie2_index__default.flag` (28879)\n",
+ "> `touch /Users/mstolarczyk/code/refgenie/docs_jupyter/data/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index/default/_refgenie_build/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4_bowtie2_index__default.flag` (63611)\n",
"\n",
+ "psutil.ZombieProcess process still exists but it's a zombie (pid=63611)\n",
+ "Warning: couldn't add memory use for process: 63611\n",
"
\n",
- "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.019GB. \n",
- " PID: 28879;\tCommand: touch;\tReturn code: 0;\tMemory used: 0.0GB\n",
- "\n",
+ "Command completed. Elapsed time: 0:00:00. Running peak memory: 0.003GB. \n",
+ " PID: 63611;\tCommand: touch;\tReturn code: 0;\tMemory used: 0GB\n",
"\n",
- "> `cd /home/nsheff/code/refgenie/docs_jupyter/rCRSd/bowtie2_index/default; find . -type f -not -path './_refgenie_build*' -exec md5sum {} \\; | sort -k 2 | awk '{print $1}' | md5sum`\n",
"Asset digest: 1262e30d4a87db9365d501de8559b3b4\n",
- "Default tag for 'rCRSd/bowtie2_index' set to: default\n",
+ "Default tag for '94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index' set to: default\n",
"\n",
"### Pipeline completed. Epilogue\n",
"* Elapsed time (this run): 0:00:01\n",
- "* Total elapsed time (all runs): 0:00:01\n",
- "* Peak memory (this run): 0.0188 GB\n",
- "* Pipeline completed time: 2020-03-13 16:12:03\n",
- "Finished building 'bowtie2_index' asset\n"
+ "* Total elapsed time (all runs): 0:00:00\n",
+ "* Peak memory (this run): 0.0028 GB\n",
+ "* Pipeline completed time: 2021-03-09 12:22:46\n",
+ "Finished building 'bowtie2_index' asset\n",
+ "Created alias directories: \n",
+ " - /Users/mstolarczyk/code/refgenie/docs_jupyter/alias/rCRSd/bowtie2_index/default\n"
]
}
],
@@ -691,48 +840,43 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can see a list of available recipes like this:"
+ "We can see a list of available recipes like this:"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Server subscriptions: http://refgenomes.databio.org\r\n",
- "Local genomes: hs38d1, rCRSd\r\n",
- "Local recipes: bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index\r\n",
- "Local assets:\r\n",
- " hs38d1/ fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n",
- " rCRSd/ bowtie2_index:default, fasta.chrom_sizes:default, fasta.fai:default, fasta:default\r\n"
+ "bismark_bt1_index, bismark_bt2_index, blacklist, bowtie2_index, bwa_index, cellranger_reference, dbnsfp, dbsnp, ensembl_gtf, ensembl_rb, epilog_index, fasta, fasta_txome, feat_annotation, gencode_gtf, hisat2_index, kallisto_index, refgene_anno, salmon_index, salmon_partial_sa_index, salmon_sa_index, star_index, suffixerator_index, tallymer_index, tgMap\r\n"
]
}
],
"source": [
- "!refgenie list -c refgenie.yaml"
+ "!refgenie list -c refgenie.yaml --recipes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can get the unique digest for any asset with `refgenie id`:"
+ "We can get the unique digest for any asset with `refgenie id`:"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "rCRSd/fasta:default,4eb430296bc02ed7e4006624f1d5ac53\r\n"
+ "4eb430296bc02ed7e4006624f1d5ac53\r\n"
]
}
],
@@ -749,16 +893,16 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'3.5.2'"
+ "'3.6.5'"
]
},
- "execution_count": 15,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -770,40 +914,20 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "refgenie 0.9.0-dev\r\n"
+ "refgenie 0.10.0-dev | refgenconf 0.10.0-dev\r\n"
]
}
],
"source": [
"!refgenie --version"
]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'0.7.0-dev'"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "refgenconf.__version__"
- ]
}
],
"metadata": {
@@ -822,7 +946,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.5.2"
+ "version": "3.6.5"
}
},
"nbformat": 4,
diff --git a/mkdocs.yml b/mkdocs.yml
index 78ac8930..4b6991c4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -18,9 +18,12 @@ nav:
- Add custom assets: custom_assets.md
- Retrieve paths to assets: seek.md
- Use asset tags: tag.md
+ - Use aliases: aliases.md
+ - Compare genomes: compare.md
- Run my own asset server: refgenieserver.md
- Use refgenie from Python: refgenconf.md
- Use refgenie with iGenomes: igenomes.md
+ - Upgrade from config 0.3 to 0.4: config_upgrade_03_to_04.md
- Reference:
- Genome configuration file: genome_config.md
- Glossary: glossary.md
@@ -48,4 +51,4 @@ navbar:
left:
- text: Refgenomes server
icon: fa-server
- href: http://refgenomes.databio.org
+ href: servers
diff --git a/recipes.md b/recipes.md
deleted file mode 100644
index 821a1683..00000000
--- a/recipes.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Refgenie Recipes
-
-Here are a few easy scripts you can use to re-index some of your favorite genomes
-
-## hg19
-
-```console
-BUILDER=${CODEBASE}refgenie/src/refgenie.py
-INPUT=http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.2bit
-GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
-${BUILDER} -i ${INPUT} -a ${GTF} -n hg19
-```
-
-## hg38
-(use the NCBI's official version for sequence alignments without _alt sequences:)
-Old link: INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
-
-This README describes the sequences:
-
-ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/README_analysis_sets.txt
-
-```console
-BUILDER=${CODEBASE}refgenie/src/refgenie.py
-INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
-GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_23/gencode.v23.primary_assembly.annotation.gtf.gz
-${BUILDER} -i ${INPUT} -a ${GTF} -n hg38
-```
-
-## mm10
-
-```console
-BUILDER=${CODEBASE}refgenie/src/refgenie.py
-INPUT=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/635/GCA_000001635.5_GRCm38.p3/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001635.5_GRCm38.p3_no_alt_analysis_set.fna.gz
-GTF=ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M12/gencode.vM12.primary_assembly.annotation.gtf.gz
-${BUILDER} -i ${INPUT} -a ${GTF} -n mm10
-```
diff --git a/refgenie/__init__.py b/refgenie/__init__.py
index b6aa9894..6e0e9f4e 100644
--- a/refgenie/__init__.py
+++ b/refgenie/__init__.py
@@ -1,3 +1,4 @@
from ._version import __version__
import logmuse
-logmuse.init_logger("refgenie")
\ No newline at end of file
+
+logmuse.init_logger("refgenie")
diff --git a/refgenie/__main__.py b/refgenie/__main__.py
index 7844a8ff..1fa5c244 100644
--- a/refgenie/__main__.py
+++ b/refgenie/__main__.py
@@ -1,7 +1,7 @@
-from .refgenie import main
+from .cli import main
import sys
-if __name__ == '__main__':
+if __name__ == "__main__":
try:
sys.exit(main())
except KeyboardInterrupt:
diff --git a/refgenie/_version.py b/refgenie/_version.py
index c5981731..61fb31ca 100644
--- a/refgenie/_version.py
+++ b/refgenie/_version.py
@@ -1 +1 @@
-__version__ = "0.9.3"
+__version__ = "0.10.0"
diff --git a/refgenie/add_assets_igenome.py b/refgenie/add_assets_igenome.py
index a0620e64..c09a2bb2 100644
--- a/refgenie/add_assets_igenome.py
+++ b/refgenie/add_assets_igenome.py
@@ -6,7 +6,7 @@
Build/
Annotation/ Sequence/
"""
-from .refgenie import _seek, _remove
+from .refgenie import _seek
from .exceptions import MissingGenomeConfigError
from ubiquerg import untar, mkabs, query_yes_no
@@ -29,15 +29,36 @@ def build_argparser():
:return argparse.ArgumentParser: constructed parser
"""
- parser = argparse.ArgumentParser(description='Integrates every asset from the downloaded iGenomes'
- ' tarball/directory with Refgenie asset management system')
- parser.add_argument('-p', '--path', dest="path", type=str,
- help='path to the desired genome tarball or directory to integrate', required=True)
- parser.add_argument('-g', '--genome', dest="genome", type=str, help='name to be assigned to the selected genome',
- required=True)
- parser.add_argument('-c', '--config', dest="config", type=str,
- help="path to local genome configuration file. Optional if '{}' environment variable is set.".
- format(", ".join(refgenconf.CFG_ENV_VARS)), required=False)
+ parser = argparse.ArgumentParser(
+ description="Integrates every asset from the downloaded iGenomes"
+ " tarball/directory with Refgenie asset management system"
+ )
+ parser.add_argument(
+ "-p",
+ "--path",
+ dest="path",
+ type=str,
+ help="path to the desired genome tarball or directory to integrate",
+ required=True,
+ )
+ parser.add_argument(
+ "-g",
+ "--genome",
+ dest="genome",
+ type=str,
+ help="name to be assigned to the selected genome",
+ required=True,
+ )
+ parser.add_argument(
+ "-c",
+ "--config",
+ dest="config",
+ type=str,
+ help="path to local genome configuration file. Optional if '{}' environment variable is set.".format(
+ ", ".join(refgenconf.CFG_ENV_VARS)
+ ),
+ required=False,
+ )
return parser
@@ -78,12 +99,15 @@ def refgenie_add(rgc, asset_dict, path, force=False):
should be forced
"""
# remove the first directory from the provided path if it is the genome name
- path = os.path.join(*path.split(os.sep)[1:]) \
- if path.split(os.sep)[0] == asset_dict["genome"] else path
- tag = asset_dict["tag"] \
- or rgc.get_default_tag(asset_dict["genome"], asset_dict["asset"])
- outfolder = \
- os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"]))
+ path = (
+ os.path.join(*path.split(os.sep)[1:])
+ if path.split(os.sep)[0] == asset_dict["genome"]
+ else path
+ )
+ tag = asset_dict["tag"] or rgc.get_default_tag(
+ asset_dict["genome"], asset_dict["asset"]
+ )
+ outfolder = os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], asset_dict["genome"]))
abs_asset_path = os.path.join(outfolder, path)
if asset_dict["seek_key"] is None:
# if seek_key is not specified we're about to move a directory to
@@ -101,25 +125,32 @@ def refgenie_add(rgc, asset_dict, path, force=False):
if not os.path.exists(tag_path):
cp(abs_asset_path, tag_path)
else:
- if not force and not \
- query_yes_no("Path '{}' exists. Do you want to overwrite?".
- format(tag_path)):
+ if not force and not query_yes_no(
+ "Path '{}' exists. Do you want to overwrite?".format(tag_path)
+ ):
return False
else:
_remove(tag_path)
cp(abs_asset_path, tag_path)
else:
- raise OSError("Absolute path '{}' does not exist. "
- "The provided path must be relative to: {}".
- format(abs_asset_path, rgc[CFG_FOLDER_KEY]))
+ raise OSError(
+ "Absolute path '{}' does not exist. "
+ "The provided path must be relative to: {}".format(
+ abs_asset_path, rgc[CFG_FOLDER_KEY]
+ )
+ )
rgc.make_writable()
gat_bundle = [asset_dict["genome"], asset_dict["asset"], tag]
- td = {CFG_ASSET_PATH_KEY:
- path if os.path.isdir(abs_asset_path) else os.path.dirname(path)}
+ td = {
+ CFG_ASSET_PATH_KEY: path
+ if os.path.isdir(abs_asset_path)
+ else os.path.dirname(path)
+ }
rgc.update_tags(*gat_bundle, data=td)
# seek_key points to the entire dir if not specified
- seek_key_value = os.path.basename(abs_asset_path) \
- if asset_dict["seek_key"] is not None else "."
+ seek_key_value = (
+ os.path.basename(abs_asset_path) if asset_dict["seek_key"] is not None else "."
+ )
sk = {asset_dict["seek_key"] or asset_dict["asset"]: seek_key_value}
rgc.update_seek_keys(*gat_bundle, keys=sk)
rgc.set_default_pointer(asset_dict["genome"], asset_dict["asset"], tag)
@@ -137,19 +168,27 @@ def main():
""" main workflow """
parser = build_argparser()
args, remaining_args = parser.parse_known_args()
- cfg = refgenconf.select_genome_config(filename=args.config, check_exist=True, strict_env=True)
+ cfg = refgenconf.select_genome_config(
+ filename=args.config, check_exist=True, strict_env=True
+ )
if not cfg:
raise MissingGenomeConfigError(args.config)
rgc = refgenconf.RefGenConf(filepath=cfg, writable=False)
pths = [args.path, mkabs(args.path, rgc.genome_folder)]
- if not untar_or_copy(pths[0], os.path.join(rgc.genome_folder, args.genome)) \
- and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)):
- raise OSError("Path '{}' does not exist. Tried: {}".format(args.path, " and ".join(pths)))
+ if not untar_or_copy(
+ pths[0], os.path.join(rgc.genome_folder, args.genome)
+ ) and not untar_or_copy(pths[1], os.path.join(rgc.genome_folder, args.genome)):
+ raise OSError(
+ "Path '{}' does not exist. Tried: {}".format(args.path, " and ".join(pths))
+ )
path_components = [rgc.genome_folder] + [args.genome] + ["*"] * 3 + ["Sequence"]
assets_paths = glob(os.path.join(*path_components))
- assert len(assets_paths) > 0, OSError("Your iGenomes directory is corrupted, more than one directory matched by {}."
- "\nMatched dirs: {}".format(os.path.join(*path_components),
- ", ".join(assets_paths)))
+ assert len(assets_paths) > 0, OSError(
+        "Your iGenomes directory is corrupted, no directory was matched by {}."
+ "\nMatched dirs: {}".format(
+ os.path.join(*path_components), ", ".join(assets_paths)
+ )
+ )
assets_path = assets_paths[0]
asset_names = [d for d in os.listdir(assets_path) if os.path.isdir(assets_path)]
processed = []
@@ -161,7 +200,25 @@ def main():
print("Added assets: \n- {}".format("\n- ".join(processed)))
-if __name__ == '__main__':
+def _remove(path):
+ """
+ remove asset if it is a dir or a file
+
+ :param str path: path to the entity to remove, either a file or a dir
+ :return str: removed path
+ """
+ from shutil import rmtree
+
+ if os.path.isfile(path):
+ os.remove(path)
+ elif os.path.isdir(path):
+ rmtree(path)
+ else:
+ raise ValueError("path '{}' is neither a file nor a dir.".format(path))
+ return path
+
+
+if __name__ == "__main__":
try:
sys.exit(main())
except KeyboardInterrupt:
diff --git a/refgenie/argparser.py b/refgenie/argparser.py
new file mode 100644
index 00000000..ff7df2c7
--- /dev/null
+++ b/refgenie/argparser.py
@@ -0,0 +1,484 @@
+import pypiper
+
+from ubiquerg import VersionInHelpParser
+
+from ._version import __version__
+from .const import *
+from refgenconf import __version__ as rgc_version
+
+from argparse import HelpFormatter
+
+
+def build_argparser():
+ """
+ Builds argument parser.
+
+ :return argparse.ArgumentParser
+ """
+
+ banner = "%(prog)s - reference genome asset manager"
+ additional_description = "\nhttps://refgenie.databio.org"
+
+ parser = VersionInHelpParser(
+ prog="refgenie",
+ version=f"{__version__} | refgenconf {rgc_version}",
+ description=banner,
+ epilog=additional_description,
+ )
+
+ subparsers = parser.add_subparsers(dest="command")
+
+ def add_subparser(cmd, msg, subparsers):
+ return subparsers.add_parser(
+ cmd,
+ description=msg,
+ help=msg,
+ formatter_class=lambda prog: HelpFormatter(
+ prog, max_help_position=40, width=90
+ ),
+ )
+
+ sps = {}
+ for cmd, desc in SUBPARSER_MESSAGES.items():
+ sps[cmd] = add_subparser(cmd, desc, subparsers)
+ # alias is nested and alias subcommands require config path
+ if cmd == ALIAS_CMD:
+ continue
+ # It's required for init
+ sps[cmd].add_argument(
+ "-c",
+ "--genome-config",
+ required=(cmd == INIT_CMD),
+ dest="genome_config",
+ metavar="C",
+ help="Path to local genome configuration file. Optional if {} environment variable is set.".format(
+ ", ".join(CFG_ENV_VARS)
+ ),
+ )
+ sps[cmd].add_argument(
+ "--skip-read-lock",
+ required=False,
+ action="store_true",
+ help="Whether the config file should not be locked for reading",
+ )
+
+ # upgrade: upgrade config and alter file structure to the target version
+ sps[UPGRADE_CMD].add_argument(
+ "-v",
+ "--target-version",
+ required=True,
+ metavar="V",
+ help="Target config version for the upgrade.",
+ )
+ sps[UPGRADE_CMD].add_argument(
+ "-f",
+ "--force",
+ action="store_true",
+ help="Do not prompt before action, approve upfront.",
+ )
+
+ sps[INIT_CMD].add_argument(
+ "-s",
+ "--genome-server",
+ nargs="+",
+ default=[DEFAULT_SERVER],
+ help="URL(s) to use for the {} attribute in config file. Default: {}.".format(
+ CFG_SERVERS_KEY, DEFAULT_SERVER
+ ),
+ )
+ sps[INIT_CMD].add_argument(
+ "-f",
+ "--genome-folder",
+        help="Absolute path to parent folder of refgenie-managed assets.",
+ )
+ sps[INIT_CMD].add_argument(
+ "-a",
+ "--genome-archive-folder",
+ help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.",
+ )
+ sps[INIT_CMD].add_argument(
+ "-b",
+ "--genome-archive-config",
+ help="Absolute path to desired archive config file; used by refgenieserver.",
+ )
+ sps[INIT_CMD].add_argument(
+ "-u",
+ "--remote-url-base",
+ help="URL to use as an alternative, remote archive location; used by refgenieserver.",
+ )
+ sps[INIT_CMD].add_argument(
+ "-j",
+ "--settings-json",
+ help="Absolute path to a JSON file with the key "
+        "value pairs to initialize the configuration "
+ "file with. Overwritten by itemized specifications.",
+ )
+ sps[BUILD_CMD] = pypiper.add_pypiper_args(
+ sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"]
+ )
+
+ # Add any arguments specific to subcommands.
+
+ sps[BUILD_CMD].add_argument(
+ "--tag-description",
+ required=False,
+ default=None,
+ type=str,
+ help="Add tag level description (e.g. built with version 0.3.2).",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "--genome-description",
+ required=False,
+ default=None,
+ type=str,
+ help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "-d",
+ "--docker",
+ action="store_true",
+ help="Run all commands in the refgenie docker container.",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "--assets",
+ nargs="+",
+ action="append",
+ required=False,
+ default=None,
+ help="Override the default genome, asset and tag of the parents"
+ " (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "--files",
+ nargs="+",
+ action="append",
+ required=False,
+ default=None,
+ help="Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "--params",
+ nargs="+",
+ action="append",
+ required=False,
+ default=None,
+ help="Provide required parameter values (e.g. param1=value1).",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "-v",
+ "--volumes",
+ nargs="+",
+ required=False,
+ default=None,
+ help="If using docker, also mount these folders as volumes.",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "-o",
+ "--outfolder",
+ dest="outfolder",
+ required=False,
+ default=None,
+ help="Override the default path to genomes folder, which is the "
+ "genome_folder attribute in the genome configuration file.",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "-q",
+ "--requirements",
+ action="store_true",
+ help="Show the build requirements for the specified asset and exit.",
+ )
+
+ sps[BUILD_CMD].add_argument(
+ "-r",
+ "--recipe",
+ required=False,
+ default=None,
+ type=str,
+ help="Provide a recipe to use.",
+ )
+
+ alias_subparser = sps[ALIAS_CMD]
+ alias_subsubparsers = alias_subparser.add_subparsers(dest="subcommand")
+
+ alias_sps = {}
+ for cmd, desc in ALIAS_SUBPARSER_MESSAGES.items():
+ alias_sps[cmd] = add_subparser(cmd, desc, alias_subsubparsers)
+ alias_sps[cmd].add_argument(
+ "-c",
+ "--genome-config",
+ required=False,
+ dest="genome_config",
+ metavar="C",
+ help="Path to local genome configuration file. Optional if {} environment variable is set.".format(
+ ", ".join(CFG_ENV_VARS)
+ ),
+ )
+ alias_sps[cmd].add_argument(
+ "--skip-read-lock",
+ required=False,
+ action="store_true",
+ help="Whether the config file should not be locked for reading",
+ )
+
+ alias_sps[ALIAS_SET_CMD].add_argument(
+ "-a",
+ "--aliases",
+ metavar="A",
+ required=False,
+ default=None,
+ type=str,
+ nargs="+",
+ help="Aliases to set; single if the digest is to be retrieved from the server.",
+ )
+ alias_sps[ALIAS_SET_CMD].add_argument(
+ "-d",
+ "--digest",
+ metavar="D",
+ required=False,
+ type=str,
+ help="Digest to set; leave out if the digest is to be retrieved from the server.",
+ )
+ alias_sps[ALIAS_SET_CMD].add_argument(
+ "-r",
+ "--reset",
+ action="store_true",
+ help="Whether all the aliases should be removed prior to setting new ones.",
+ )
+ alias_sps[ALIAS_SET_CMD].add_argument(
+ "-f",
+ "--force",
+ action="store_true",
+ help="Whether the action should be forced, if genome does not exist.",
+ )
+
+ alias_sps[ALIAS_REMOVE_CMD].add_argument(
+ "-a",
+ "--aliases",
+ metavar="A",
+ required=False,
+ default=None,
+ type=str,
+ nargs="+",
+ help="Aliases to remove.",
+ )
+ alias_sps[ALIAS_REMOVE_CMD].add_argument(
+ "-d", "--digest", metavar="D", required=True, type=str, help="Digest to remove."
+ )
+
+ alias_sps[ALIAS_GET_CMD].add_argument(
+ "-a",
+ "--aliases",
+ metavar="A",
+ required=False,
+ type=str,
+ nargs="+",
+ help="Aliases to get the digests for.",
+ )
+
+ sps[COMPARE_CMD].add_argument(
+ "genome1",
+ metavar="GENOME1",
+ type=str,
+ nargs=1,
+ help="First genome for compatibility check.",
+ )
+ sps[COMPARE_CMD].add_argument(
+ "genome2",
+ metavar="GENOME2",
+ type=str,
+ nargs=1,
+ help="Second genome for compatibility check.",
+ )
+ sps[COMPARE_CMD].add_argument(
+ "-e",
+ "--no-explanation",
+ action="store_true",
+ help="Do not print compatibility code explanation.",
+ )
+
+ # add 'genome' argument to many commands
+ for cmd in [
+ PULL_CMD,
+ GET_ASSET_CMD,
+ BUILD_CMD,
+ INSERT_CMD,
+ REMOVE_CMD,
+ GETSEQ_CMD,
+ TAG_CMD,
+ ID_CMD,
+ ]:
+ # genome is not required for listing actions
+ sps[cmd].add_argument(
+ "-g",
+ "--genome",
+            required=cmd == GETSEQ_CMD,
+ metavar="G",
+ help="Reference assembly ID, e.g. mm10.",
+ )
+
+ for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD:
+ sps[cmd].add_argument(
+ "-g",
+ "--genome",
+ required=False,
+ type=str,
+ metavar="G",
+ nargs="*",
+ help="Reference assembly ID, e.g. mm10.",
+ )
+
+ for cmd in [
+ PULL_CMD,
+ GET_ASSET_CMD,
+ BUILD_CMD,
+ INSERT_CMD,
+ REMOVE_CMD,
+ TAG_CMD,
+ ID_CMD,
+ ]:
+ sps[cmd].add_argument(
+ "asset_registry_paths",
+ metavar="asset-registry-paths",
+ type=str,
+ nargs="+",
+ help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag"
+ + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."),
+ )
+
+ sps[LIST_LOCAL_CMD].add_argument(
+ "-r", "--recipes", action="store_true", help="List available recipes."
+ )
+
+ for cmd in [REMOVE_CMD, INSERT_CMD]:
+ sps[cmd].add_argument(
+ "-f",
+ "--force",
+ action="store_true",
+ help="Do not prompt before action, approve upfront.",
+ )
+
+ sps[REMOVE_CMD].add_argument(
+ "-a",
+ "--aliases",
+ action="store_true",
+ help="Remove the genome alias if last asset for that genome is removed.",
+ )
+ force_group = sps[PULL_CMD].add_argument_group(
+ title="Prompt handling",
+ description="These flags configure the pull prompt responses.",
+ )
+
+ overwrite_group = force_group.add_mutually_exclusive_group()
+
+ overwrite_group.add_argument(
+ "--no-overwrite", action="store_true", help="Do not overwrite if asset exists."
+ )
+
+ overwrite_group.add_argument(
+ "--force-overwrite", action="store_true", help="Overwrite if asset exists."
+ )
+
+ large_group = force_group.add_mutually_exclusive_group()
+
+ large_group.add_argument(
+ "--no-large", action="store_true", help="Do not pull archives over 5GB."
+ )
+
+ large_group.add_argument(
+ "--pull-large",
+ action="store_true",
+ help="Pull any archive, regardless of its size.",
+ )
+
+ force_group.add_argument(
+ "--size-cutoff",
+ type=float,
+ default=10,
+ metavar="S",
+ help="Maximum archive file size to download with no confirmation required (in GB, default: 10)",
+ )
+
+ force_group.add_argument(
+ "-b",
+ "--batch",
+ action="store_true",
+        help="Use batch mode: pull large archives, do not overwrite",
+ )
+
+ sps[INSERT_CMD].add_argument(
+ "-p", "--path", required=True, metavar="P", help="Relative local path to asset."
+ )
+
+ sps[INSERT_CMD].add_argument(
+ "-s",
+ "--seek-keys",
+ required=False,
+ type=str,
+ metavar="S",
+ help="""
+ String representation of a JSON object with seek_keys,
+ e.g. '{"seek_key1": "file.txt"}'
+ """,
+ )
+
+ sps[GETSEQ_CMD].add_argument(
+ "-l",
+ "--locus",
+ required=True,
+ help="Coordinates of desired sequence; e.g. 'chr1:50000-50200'.",
+ )
+
+ sps[GET_ASSET_CMD].add_argument(
+ "-e",
+ "--check-exists",
+ required=False,
+ action="store_true",
+ help="Whether the returned asset path should be checked for existence on disk.",
+ )
+
+ sps[TAG_CMD].add_argument(
+ "-f",
+ "--force",
+ action="store_true",
+ help="Do not prompt before action, approve upfront.",
+ )
+
+ group = sps[TAG_CMD].add_mutually_exclusive_group(required=True)
+
+ group.add_argument("-t", "--tag", type=str, help="Tag to assign to an asset.")
+
+ group.add_argument(
+ "-d",
+ "--default",
+ action="store_true",
+ help="Set the selected asset tag as the default one.",
+ )
+
+ sps[SUBSCRIBE_CMD].add_argument(
+ "-r",
+ "--reset",
+ action="store_true",
+ help="Overwrite the current list of server URLs.",
+ )
+
+ for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]:
+ sps[cmd].add_argument(
+ "-s",
+ "--genome-server",
+ nargs="+",
+ required=True,
+ help="One or more URLs to {action} the {key} attribute in config file.".format(
+ action="add to" if cmd == SUBSCRIBE_CMD else "remove from",
+ key=CFG_SERVERS_KEY,
+ ),
+ )
+
+ return parser
diff --git a/refgenie/asset_build_packages.py b/refgenie/asset_build_packages.py
index 82981647..fc031eb1 100644
--- a/refgenie/asset_build_packages.py
+++ b/refgenie/asset_build_packages.py
@@ -6,7 +6,8 @@
# These building recipes should make use of arguments that are auto-populated,
# or user-provided. The auto-populated arguments are:
# - {genome}
-# - {asset_outfolder} In addition to these, the recipe should refer in the
+# - {asset_outfolder}
+# In addition to these, the recipe should refer in the
# same way, {var}, to any variables required to be provided, which will be
# provided via the CLI. These should be listed as 'required_inputs' and
# will be checked for existence before the commands are executed.
@@ -23,7 +24,18 @@
KEY = "key"
DEFAULT = "default"
-RECIPE_CONSTS = ["DESC", "ASSET_DESC", "ASSETS", "PTH", "REQ_FILES", "REQ_ASSETS", "CONT", "CMD_LST", "KEY", "DEFAULT"]
+RECIPE_CONSTS = [
+ "DESC",
+ "ASSET_DESC",
+ "ASSETS",
+ "PTH",
+ "REQ_FILES",
+ "REQ_ASSETS",
+ "CONT",
+ "CMD_LST",
+ "KEY",
+ "DEFAULT",
+]
asset_build_packages = {
"fasta": {
@@ -31,14 +43,9 @@
ASSETS: {
"fasta": "{genome}.fa",
"fai": "{genome}.fa.fai",
- "chrom_sizes": "{genome}.chrom.sizes"
+ "chrom_sizes": "{genome}.chrom.sizes",
},
- REQ_FILES: [
- {
- KEY: "fasta",
- DESC: "gzipped fasta file"
- }
- ],
+ REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
@@ -47,21 +54,16 @@
"gzip -df {asset_outfolder}/{genome}.fa.gz",
"samtools faidx {asset_outfolder}/{genome}.fa",
"cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes",
- ]
+ ],
},
"fasta_txome": {
DESC: "cDNA sequences in the FASTA format, indexed FASTA (produced with samtools index) and chromosome sizes file",
ASSETS: {
"fasta_txome": "{genome}.fa",
"fai": "{genome}.fa.fai",
- "chrom_sizes": "{genome}.chrom.sizes"
+ "chrom_sizes": "{genome}.chrom.sizes",
},
- REQ_FILES: [
- {
- KEY: "fasta",
- DESC: "gzipped fasta file"
- }
- ],
+ REQ_FILES: [{KEY: "fasta", DESC: "gzipped fasta file"}],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
@@ -70,26 +72,21 @@
"gzip -df {asset_outfolder}/{genome}.fa.gz",
"samtools faidx {asset_outfolder}/{genome}.fa",
"cut -f 1,2 {asset_outfolder}/{genome}.fa.fai > {asset_outfolder}/{genome}.chrom.sizes",
- ]
+ ],
},
"dbnsfp": {
DESC: "A database developed for functional prediction and annotation of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome (Gencode release 29/Ensembl 94)",
ASSETS: {
"dbnsfp": "{genome}_dbNSFP.txt.gz",
- "tabix": "{genome}_dbNSFP.txt.gz.tbi"
+ "tabix": "{genome}_dbNSFP.txt.gz.tbi",
},
- REQ_FILES: [
- {
- KEY: "dbnsfp",
- DESC: "zipped dbSNFP database file"
- }
- ],
+ REQ_FILES: [{KEY: "dbnsfp", DESC: "zipped dbSNFP database file"}],
REQ_ASSETS: [],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
+ DESC: "Number of threads to use for parallel computing",
}
],
CONT: "databio/refgenie",
@@ -102,256 +99,179 @@
"rm {asset_outfolder}/dbNSFP*_variant.chr*",
"bgzip -@ {threads} {asset_outfolder}/{genome}_dbNSFP.txt",
"tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_dbNSFP.txt.gz",
- "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/hg38_dbNSFP.txt.*'`"
- ]
+            "rm `find {asset_outfolder} -type f -not -path '{asset_outfolder}/_refgenie_build*' -not -path '{asset_outfolder}/{genome}_dbNSFP.txt.*'`",
+ ],
},
"dbsnp": {
DESC: "The database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants",
- ASSETS: {
- "dbsnp": "{genome}_dbSNP.gz",
- "tabix": "{genome}_dbSNP.gz.tbi"
- },
+ ASSETS: {"dbsnp": "{genome}_dbSNP.gz", "tabix": "{genome}_dbSNP.gz.tbi"},
REQ_FILES: [
- {
- KEY: "dbsnp_vcf",
- DESC: "SNP database file in Variant Call Format (VCF)"
- },
- {
- KEY: "dbsnp_tbi",
- DESC: "tabix index of the dbsnp.vcf file"
- }
+ {KEY: "dbsnp_vcf", DESC: "SNP database file in Variant Call Format (VCF)"},
+ {KEY: "dbsnp_tbi", DESC: "tabix index of the dbsnp.vcf file"},
],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
CMD_LST: [
"cp {dbsnp_vcf} {asset_outfolder}/{genome}_dbSNP.gz",
- "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi"
- ]
+ "cp {dbsnp_tbi} {asset_outfolder}/{genome}_dbSNP.gz.tbi",
+ ],
},
"bowtie2_index": {
DESC: "Genome index for bowtie, produced with bowtie-build",
- ASSETS: {
- "bowtie2_index": "{genome}"
- },
+ ASSETS: {"bowtie2_index": "{genome}"},
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- CMD_LST: [
- "bowtie2-build {fasta} {asset_outfolder}/{genome}"
- ]
+ CMD_LST: ["bowtie2-build {fasta} {asset_outfolder}/{genome}"],
},
"bwa_index": {
DESC: "Genome index for Burrows-Wheeler Alignment Tool, produced with bwa index",
- ASSETS: {
- "bwa_index": "{genome}.fa"
- },
+ ASSETS: {"bwa_index": "{genome}.fa"},
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [],
CONT: "databio/refgenie",
CMD_LST: [
"ln -sf {fasta} {asset_outfolder}",
"bwa index {asset_outfolder}/{genome}.fa",
- ]
- },
+ ],
+ },
"hisat2_index": {
DESC: "Genome index for HISAT2, produced with hisat2-build",
- ASSETS: {
- "hisat2_index": "{genome}"
- },
+ ASSETS: {"hisat2_index": "{genome}"},
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- CMD_LST: [
- "hisat2-build {fasta} {asset_outfolder}/{genome}"
- ]
+ CMD_LST: ["hisat2-build {fasta} {asset_outfolder}/{genome}"],
},
"bismark_bt2_index": {
DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie2",
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- ASSETS: {
- "bismark_bt2_index": "."
- },
+ ASSETS: {"bismark_bt2_index": "."},
CMD_LST: [
"ln -sf {fasta} {asset_outfolder}",
- "bismark_genome_preparation --bowtie2 {asset_outfolder}"
- ]
+ "bismark_genome_preparation --bowtie2 {asset_outfolder}",
+ ],
},
"bismark_bt1_index": {
DESC: "Genome index for Bisulfite-Seq applications, produced by bismark_genome_preparation using bowtie1",
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- ASSETS: {
- "bismark_bt1_index": "."
- },
+ ASSETS: {"bismark_bt1_index": "."},
CMD_LST: [
"ln -sf {fasta} {asset_outfolder}",
- "bismark_genome_preparation {asset_outfolder}"
- ]
- },
+ "bismark_genome_preparation {asset_outfolder}",
+ ],
+ },
"kallisto_index": {
DESC: "Genome index for kallisto, produced with kallisto index",
REQ_FILES: [],
REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for transcriptome"
- }
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"}
],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- ASSETS: {
- "kallisto_index": "."
- },
+ ASSETS: {"kallisto_index": "."},
CMD_LST: [
"kallisto index -i {asset_outfolder}/{genome}_kallisto_index.idx {fasta}"
- ]
+ ],
},
"salmon_index": {
DESC: "Transcriptome index for salmon, produced with salmon index",
REQ_FILES: [],
REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for transcriptome"
- }
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for transcriptome"}
],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
- },
+ DESC: "Number of threads to use for parallel computing",
+ },
{
KEY: "kmer",
DEFAULT: "31",
- DESC: "The length of kmer to use to create the indices"
- }
+ DESC: "The length of kmer to use to create the indices",
+ },
],
CONT: "combinelab/salmon",
- ASSETS: {
- "salmon_index": "."
- },
+ ASSETS: {"salmon_index": "."},
CMD_LST: [
"salmon index -t {fasta} -i {asset_outfolder} -k {kmer} -p {threads}"
- ]
+ ],
},
"salmon_sa_index": {
DESC: "Transcriptome index for salmon, produced with salmon index using selective alignment method. Improves quantification accuracy compared to the regular index.",
REQ_FILES: [],
REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- },
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"},
{
KEY: "fasta_txome",
DEFAULT: "fasta_txome",
- DESC: "fasta asset for transcriptome"
- }
+ DESC: "fasta asset for transcriptome",
+ },
],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
- },
+ DESC: "Number of threads to use for parallel computing",
+ },
{
KEY: "kmer",
DEFAULT: "31",
- DESC: "The length of kmer to use to create the indices"
- }
+ DESC: "The length of kmer to use to create the indices",
+ },
],
CONT: "combinelab/salmon",
- ASSETS: {
- "salmon_sa_index": "."
- },
+ ASSETS: {"salmon_sa_index": "."},
CMD_LST: [
"grep '^>' {fasta} | cut -d ' ' -f 1 > {asset_outfolder}/decoys.txt",
"sed -i.bak -e 's/>//g' {asset_outfolder}/decoys.txt",
"rm {asset_outfolder}/decoys.txt.bak",
"cat {fasta_txome} {fasta} > {asset_outfolder}/gentrome.fa",
"salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}",
- "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt"
- ]
+ "rm {asset_outfolder}/gentrome.fa {asset_outfolder}/decoys.txt",
+ ],
},
"salmon_partial_sa_index": {
DESC: "Transcriptome index for salmon, produced with salmon index using partial selective alignment method. Preparation includes transcriptome mapping to the genome and extraction of the relevant portion out from the genome and indexing it along with the transcriptome. Recipe source -- https://github.com/COMBINE-lab/SalmonTools/blob/master/scripts/generateDecoyTranscriptome.sh",
REQ_FILES: [],
REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- },
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"},
{
KEY: "fasta_txome",
DEFAULT: "fasta_txome",
- DESC: "fasta asset for transcriptome"
+ DESC: "fasta asset for transcriptome",
},
{
KEY: "gtf",
DEFAULT: "ensembl_gtf",
- DESC: "GTF file for exonic features extraction"
- }
+ DESC: "GTF file for exonic features extraction",
+ },
],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
- },
+ DESC: "Number of threads to use for parallel computing",
+ },
{
KEY: "kmer",
DEFAULT: "31",
- DESC: "The length of kmer to use to create the indices"
- }
+ DESC: "The length of kmer to use to create the indices",
+ },
],
CONT: "combinelab/salmon",
- ASSETS: {
- "salmon_partial_sa_index": "."
- },
+ ASSETS: {"salmon_partial_sa_index": "."},
CMD_LST: [
"gunzip -c {gtf} > {asset_outfolder}/{genome}.gtf",
"awk -v OFS='\t' '{{if ($3==\"exon\") {{print $1,$4,$5}}}}' {asset_outfolder}/{genome}.gtf > {asset_outfolder}/exons.bed",
@@ -360,88 +280,86 @@
"awk -v OFS='\t' '{{print $6,$8,$9}}' {asset_outfolder}/mashmap.out | sort -k1,1 -k2,2n - > {asset_outfolder}/genome_found.sorted.bed",
"bedtools merge -i {asset_outfolder}/genome_found.sorted.bed > {asset_outfolder}/genome_found_merged.bed",
"bedtools getfasta -fi {asset_outfolder}/reference.masked.genome.fa -bed {asset_outfolder}/genome_found_merged.bed -fo {asset_outfolder}/genome_found.fa",
- "awk '{{a=$0; getline;split(a, b, \":\"); r[b[1]] = r[b[1]]\"\"$0}} END {{ for (k in r) {{ print k\"\\n\"r[k] }} }}' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa",
+ 'awk \'{{a=$0; getline;split(a, b, ":"); r[b[1]] = r[b[1]]""$0}} END {{ for (k in r) {{ print k"\\n"r[k] }} }}\' {asset_outfolder}/genome_found.fa > {asset_outfolder}/decoy.fa',
"cat {fasta_txome} {asset_outfolder}/decoy.fa > {asset_outfolder}/gentrome.fa",
"grep '>' {asset_outfolder}/decoy.fa | awk '{{print substr($1,2); }}' > {asset_outfolder}/decoys.txt",
"rm {asset_outfolder}/exons.bed {asset_outfolder}/reference.masked.genome.fa {asset_outfolder}/mashmap.out {asset_outfolder}/genome_found.sorted.bed {asset_outfolder}/genome_found_merged.bed {asset_outfolder}/genome_found.fa {asset_outfolder}/decoy.fa {asset_outfolder}/reference.masked.genome.fa.fai",
- "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}"
- ]
+ "salmon index -t {asset_outfolder}/gentrome.fa -d {asset_outfolder}/decoys.txt -i {asset_outfolder} -k {kmer} -p {threads}",
+ ],
},
- "epilog_index": {
- DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller",
+ "tgMap": {
+ DESC: "Transcript to gene map file, containing two columns mapping of each transcript present in the reference to the corresponding gene.",
REQ_FILES: [],
REQ_ASSETS: [
{
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
+ KEY: "salmon_partial_sa_index",
+ DEFAULT: "salmon_partial_sa_index",
+ DESC: "partial salmon index asset",
}
],
+ REQ_PARAMS: [],
+ ASSETS: {"tgMap": "{genome}_txp2gene.tsv"},
+ CMD_LST: [
+ "grep '^>' {salmon_partial_sa_index}/gentrome.fa | cut -d ' ' -f 1,7 | tr -s ' ' '\\t' | sed 's/[>'gene_symbol:']//g' > {asset_outfolder}/{genome}_txp2gene.tsv",
+ ],
+ },
+ "epilog_index": {
+ DESC: "Genome index for CpG sites, produced by the epilog DNA methylation caller",
+ REQ_FILES: [],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [
{
KEY: "context",
- DEFAULT: 'CG',
- DESC: "Substring to index. One or more space-separated strings to index. e.g. 'CG' or 'CG CA CT CC'"
+ DEFAULT: "CG",
+ DESC: "Substring to index. One or more space-separated strings to index. e.g. 'CG' or 'CG CA CT CC'",
}
],
CONT: "databio/refgenie",
- ASSETS: {
- "epilog_index": "."
- },
+ ASSETS: {"epilog_index": "{genome}_{context}.tsv.gz"},
CMD_LST: [
- "epilog index -i {fasta} -o {asset_outfolder}/{genome}_{context}.tsv --context {context} -t"
- ]
+ "epilog index -- --infile {fasta} --outfile {asset_outfolder}/{genome}_{context}.tsv --contexts {context}",
+ "bgzip {asset_outfolder}/{genome}_{context}.tsv",
+ "tabix -s 1 -b 2 -e 2 {asset_outfolder}/{genome}_{context}.tsv.gz",
+ ],
},
"star_index": {
DESC: "Genome index for STAR RNA-seq aligner, produced with STAR --runMode genomeGenerate",
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
+ DESC: "Number of threads to use for parallel computing",
}
],
CONT: "databio/refgenie",
- ASSETS: {
- "star_index": "."
- },
+ ASSETS: {"star_index": "."},
CMD_LST: [
"mkdir -p {asset_outfolder}",
- "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}"
- ]
+ "STAR --runThreadN {threads} --runMode genomeGenerate --genomeDir {asset_outfolder} --genomeFastaFiles {fasta}",
+ ],
},
"gencode_gtf": {
DESC: "GTF annotation asset which provides access to all annotated transcripts which make up an Ensembl gene set.",
REQ_FILES: [
{
KEY: "gencode_gtf",
- DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode"
+ DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode",
}
],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- ASSETS: {
- "gencode_gtf": "{genome}.gtf.gz"
- },
- CMD_LST: [
- "cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz"
- ]
+ ASSETS: {"gencode_gtf": "{genome}.gtf.gz"},
+ CMD_LST: ["cp {gencode_gtf} {asset_outfolder}/{genome}.gtf.gz"],
},
"ensembl_gtf": {
DESC: "Ensembl GTF, TSS, and gene body annotation",
REQ_FILES: [
{
KEY: "ensembl_gtf",
- DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl"
+ DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl",
}
],
REQ_ASSETS: [],
@@ -454,36 +372,27 @@
},
CMD_LST: [
"cp {ensembl_gtf} {asset_outfolder}/{genome}.gtf.gz",
- "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep 'exon_number \"1\";' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1, $4, $5, $20, $14, $7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '{{if($6==\"+\"){{print $1\"\t\"$2+20\"\t\"$2+120\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$3-120\"\t\"$3-20\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed",
- "gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk '$3 == \"gene\"' | sed 's/^/chr/' | awk -v OFS='\t' '{{print $1,$4,$5,$14,$6,$7}}' | sed 's/\";//g' | sed 's/\"//g' | awk '$4!=\"Metazoa_SRP\"' | awk '$4!=\"U3\"' | awk '$4!=\"7SK\"' | awk '($3-$2)>200' | awk '{{if($6==\"+\"){{print $1\"\t\"$2+500\"\t\"$3\"\t\"$4\"\t\"$5\"\t\"$6}}else{{print $1\"\t\"$2\"\t\"$3-500\"\t\"$4\"\t\"$5\"\t\"$6}}}}' | awk '$3>$2' | LC_COLLATE=C sort -k4 -u > {asset_outfolder}/{genome}_ensembl_gene_body.bed"
- ]
+ 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | grep \'exon_number "1";\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1, $4, $5, $20, $14, $7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'{{if($6=="+"){{print $1"\t"$2+20"\t"$2+120"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$3-120"\t"$3-20"\t"$4"\t"$5"\t"$6}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_ensembl_TSS.bed',
+ 'gzip -dcf {asset_outfolder}/{genome}.gtf.gz | awk \'$3 == "gene"\' | sed \'s/^/chr/\' | awk -v OFS=\'\t\' \'{{print $1,$4,$5,$14,$6,$7}}\' | sed \'s/";//g\' | sed \'s/"//g\' | awk \'$4!="Metazoa_SRP"\' | awk \'$4!="U3"\' | awk \'$4!="7SK"\' | awk \'($3-$2)>200\' | awk \'{{if($6=="+"){{print $1"\t"$2+500"\t"$3"\t"$4"\t"$5"\t"$6}}else{{print $1"\t"$2"\t"$3-500"\t"$4"\t"$5"\t"$6}}}}\' | awk \'$3>$2\' | LC_COLLATE=C sort -k4 -u > {asset_outfolder}/{genome}_ensembl_gene_body.bed',
+ ],
},
"ensembl_rb": {
DESC: "A regulatory annotation file",
REQ_FILES: [
{
KEY: "gff",
- DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl"
+ DESC: "Regulatory build annotation file in Gene Feature Format (GFF) from Ensembl",
}
],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- ASSETS: {
- "ensembl_rb": "{genome}.gff.gz"
- },
- CMD_LST: [
- "cp {gff} {asset_outfolder}/{genome}.gff.gz"
- ]
+ ASSETS: {"ensembl_rb": "{genome}.gff.gz"},
+ CMD_LST: ["cp {gff} {asset_outfolder}/{genome}.gff.gz"],
},
"refgene_anno": {
DESC: "gene, TSS, exon, intron, and premature mRNA annotation files",
- REQ_FILES: [
- {
- KEY: "refgene",
- DESC: "gzipped RefGene database annotation file"
- }
- ],
+ REQ_FILES: [{KEY: "refgene", DESC: "gzipped RefGene database annotation file"}],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
@@ -496,11 +405,11 @@
},
CMD_LST: [
"cp {refgene} {asset_outfolder}/{genome}_refGene.txt.gz",
- "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk '{{if($4==\"+\"){{print $3\"\t\"$5\"\t\"$5\"\t\"$13\"\t.\t\"$4}}else{{print $3\"\t\"$6\"\t\"$6\"\t\"$13\"\t.\t\"$4}}}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed",
+ 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk \'{{if($4=="+"){{print $3"\t"$5"\t"$5"\t"$13"\t.\t"$4}}else{{print $3"\t"$6"\t"$6"\t"$13"\t.\t"$4}}}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_TSS.bed',
"gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -v OFS='\t' '{{ n = split($10, a, \",\"); split($11, b, \",\"); for(i=1; i {asset_outfolder}/{genome}_exons.bed",
"gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | awk -v OFS='\t' '$9>1' | awk -F'\t' '{{ exonCount=int($9);split($10,exonStarts,\"[,]\"); split($11,exonEnds,\"[,]\"); for(i=1;i {asset_outfolder}/{genome}_introns.bed",
- "gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep 'cmpl' | awk '{{print $3\"\t\"$5\"\t\"$6\"\t\"$13\"\t.\t\"$4}}' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed"
- ]
+ 'gzip -dcf {asset_outfolder}/{genome}_refGene.txt.gz | grep \'cmpl\' | awk \'{{print $3"\t"$5"\t"$6"\t"$13"\t.\t"$4}}\' | LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_pre-mRNA.bed',
+ ],
},
"suffixerator_index": {
DESC: "Enhanced suffix array index for genomes using gt (GenomeTools) suffixerator program",
@@ -508,61 +417,45 @@
{
KEY: "memlimit",
DEFAULT: "8GB",
- DESC: "The maximum amount of memory available to be used during index construction."
+ DESC: "The maximum amount of memory available to be used during index construction.",
}
],
REQ_FILES: [],
- REQ_ASSETS: [
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
- ],
+ REQ_ASSETS: [{KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"}],
CONT: "databio/refgenie",
- ASSETS: {
- "esa": "{genome}.sft"
- },
+ ASSETS: {"esa": "{genome}.sft"},
CMD_LST: [
"gt suffixerator -dna -pl -tis -suf -lcp -v -showprogress -memlimit {memlimit} -db {fasta} -indexname {asset_outfolder}/{genome}.sft"
- ]
+ ],
},
"tallymer_index": {
DESC: "Indexed k-mers for a given enhanced suffix array at a fixed value of k",
REQ_PARAMS: [
- {
- KEY: "mersize",
- DEFAULT: "30",
- DESC: "The mer size."
- },
+ {KEY: "mersize", DEFAULT: "30", DESC: "The mer size."},
{
KEY: "minocc",
DEFAULT: "2",
- DESC: "The minimum occurrence number for the mers to index."
- }
+ DESC: "The minimum occurrence number for the mers to index.",
+ },
],
REQ_FILES: [],
REQ_ASSETS: [
{
KEY: "esa",
DEFAULT: "suffixerator_index",
- DESC: "enhanced suffix array index for genome"
+ DESC: "enhanced suffix array index for genome",
},
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"},
],
CONT: "databio/refgenie",
ASSETS: {
"tindex": "{genome}.tal_{mersize}",
- "search_file": "{genome}.tal_{mersize}.gtTxt"
+ "search_file": "{genome}.tal_{mersize}.gtTxt",
},
CMD_LST: [
"gt tallymer mkindex -v -counts -pl -mersize {mersize} -minocc {minocc} -indexname {asset_outfolder}/{genome}.tal_{mersize} -esa {esa}/{genome}.sft",
- "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt"
- ]
+ "gt tallymer search -output qseqnum qpos -strand fp -tyr {asset_outfolder}/{genome}.tal_{mersize} -q {fasta} > {asset_outfolder}/{genome}.tal_{mersize}.gtTxt",
+ ],
},
"feat_annotation": {
DESC: "Combined genomic feature annotation created using an Ensembl GTF annotation asset and an Ensembl regulatory build annotation asset",
@@ -574,28 +467,28 @@
{
KEY: "ensembl_gtf",
DEFAULT: "ensembl_gtf",
- DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl"
+ DESC: "Annotation file in Gene Transfer Format (GTF) from Ensembl",
},
{
KEY: "ensembl_rb",
DEFAULT: "ensembl_rb",
- DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl"
- }
+ DESC: "Regulatory annotation file in General Feature Format (GTF) from Ensembl",
+ },
],
REQ_PARAMS: [],
CONT: "databio/refgenie",
CMD_LST: [
"gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"Exon\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_exons.bed",
"gzip -dcf {ensembl_gtf} | awk '$3==\"exon\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{ split($20, a, \"\\\"\"); print \"chr\"$1, $4-1, $5, a[2], $6, $7}}' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u | awk 'seen[$4]++ && seen[$4] > 1' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3nr | env LC_COLLATE=C sort -k1,1 -k2,2n -u | env LC_COLLATE=C sort -k1,1 -k3,3n -u | awk -v OFS='\t' '{{if($4==prev4){{new2=prev3+1;}} {{prev4=$4; prev3=$3; print $1, new2, $2-1, \"Intron\", $5, $6}}}}' | awk -F'\t' '$2' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_introns.bed",
- "gzip -dcf {ensembl_gtf} | awk '$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed",
- "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5\'\\\'\' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed",
+ "gzip -dcf {ensembl_gtf} | awk '$3==\"three_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"3'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_3utr.bed",
+ "gzip -dcf {ensembl_gtf} | awk '$3==\"five_prime_utr\"' | grep -v 'pseudogene' | awk -v OFS='\t' '{{print \"chr\"$1, $4-1, $5, \"5'\\'' UTR\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -u > {asset_outfolder}/{genome}_5utr.bed",
"gzip -dcf {ensembl_rb} | awk '$3==\"promoter\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_promoter.bed",
"gzip -dcf {ensembl_rb} | awk '$3==\"promoter_flanking_region\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Promoter Flanking Region\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_promoter_flanking.bed",
"gzip -dcf {ensembl_rb} | awk '$3==\"enhancer\"' | awk -v OFS='\t' '{{print \"chr\"$1, $4, $5, \"Enhancer\", $6, $7}}' | awk '$2<$3' | env LC_COLLATE=C sort -k1,1 -k2,2n -k3,3n -u > {asset_outfolder}/{genome}_enhancer.bed",
"cat {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed | awk -F'\t' '!seen[$1, $2, $3]++' > {asset_outfolder}/{genome}_annotations.bed",
"rm -f {asset_outfolder}/{genome}_enhancer.bed {asset_outfolder}/{genome}_promoter.bed {asset_outfolder}/{genome}_promoter_flanking.bed {asset_outfolder}/{genome}_5utr.bed {asset_outfolder}/{genome}_3utr.bed {asset_outfolder}/{genome}_exons.bed {asset_outfolder}/{genome}_introns.bed",
- "gzip -f {asset_outfolder}/{genome}_annotations.bed"
- ]
+ "gzip -f {asset_outfolder}/{genome}_annotations.bed",
+ ],
},
"cellranger_reference": {
DESC: "Cell Ranger custom genome reference for read alignment and gene expression quantification",
@@ -607,19 +500,15 @@
{
KEY: "gtf",
DEFAULT: "gencode_gtf",
- DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode"
+ DESC: "Annotation file in Gene Transfer Format (GTF) from Gencode",
},
- {
- KEY: "fasta",
- DEFAULT: "fasta",
- DESC: "fasta asset for genome"
- }
+ {KEY: "fasta", DEFAULT: "fasta", DESC: "fasta asset for genome"},
],
REQ_PARAMS: [
{
KEY: "threads",
DEFAULT: "8",
- DESC: "Number of threads to use for parallel computing"
+ DESC: "Number of threads to use for parallel computing",
}
],
CONT: "databio/refgenie",
@@ -627,25 +516,18 @@
"gunzip {gtf} -c > {asset_outfolder}/{genome}.gtf",
"cellranger mkgtf {asset_outfolder}/{genome}.gtf {asset_outfolder}/{genome}_filtered.gtf",
"rm {asset_outfolder}/{genome}.gtf",
- "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} --genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}"
- ]
+ "cd {asset_outfolder}; cellranger mkref --genome=ref --fasta={fasta} --genes={asset_outfolder}/{genome}_filtered.gtf --nthreads={threads}",
+ ],
},
"blacklist": {
DESC: "Atypical, unstructured, or high signal genomic regions present in next-generation sequencing experiments (e.g. from ENCODE)",
ASSETS: {
"blacklist": "{genome}_blacklist.bed.gz",
},
- REQ_FILES: [
- {
- KEY: "blacklist",
- DESC: "gzipped blacklist file"
- }
- ],
+ REQ_FILES: [{KEY: "blacklist", DESC: "gzipped blacklist file"}],
REQ_ASSETS: [],
REQ_PARAMS: [],
CONT: "databio/refgenie",
- CMD_LST: [
- "cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz"
- ]
- }
+ CMD_LST: ["cp {blacklist} {asset_outfolder}/{genome}_blacklist.bed.gz"],
+ },
}
diff --git a/refgenie/build_all_genome.py b/refgenie/build_all_genome.py
index 12be1c6c..98cf968c 100644
--- a/refgenie/build_all_genome.py
+++ b/refgenie/build_all_genome.py
@@ -8,23 +8,71 @@
import argparse
import divvy
-parser = argparse.ArgumentParser(description='Builds submission scripts for all assets for a genome')
-parser.add_argument('-g', '--genome', dest="genome", type=str,
- help='genome to build the submission scripts for')
-parser.add_argument('-p', '--path', dest="path", type=str,
- help='path to the desired submission directory location')
-parser.add_argument('-pt', '--partition', dest="PARTITION", type=str,
- help='partition in SLURM submission script', default="standard")
-parser.add_argument('-m', '--mem', dest="MEM", type=str,
- help='mem in SLURM submission script', default="200000")
-parser.add_argument('-t', '--time', dest="TIME", type=str,
- help='time in SLURM submission script', default="10:00:00")
-parser.add_argument('-c', '--cores', dest="CORES", type=str,
- help='cpus-per-task in SLURM submission script', default="4")
-parser.add_argument('-o', '--output', dest="LOGFILE", type=str,
- help='output in SLURM submission script', default=None)
-parser.add_argument('-j', '--job-name', dest="JOBNAME", type=str,
- help='job-name in SLURM submission script', default=None)
+parser = argparse.ArgumentParser(
+ description="Builds submission scripts for all assets for a genome"
+)
+parser.add_argument(
+ "-g",
+ "--genome",
+ dest="genome",
+ type=str,
+ help="genome to build the submission scripts for",
+)
+parser.add_argument(
+ "-p",
+ "--path",
+ dest="path",
+ type=str,
+ help="path to the desired submission directory location",
+)
+parser.add_argument(
+ "-pt",
+ "--partition",
+ dest="PARTITION",
+ type=str,
+ help="partition in SLURM submission script",
+ default="standard",
+)
+parser.add_argument(
+ "-m",
+ "--mem",
+ dest="MEM",
+ type=str,
+ help="mem in SLURM submission script",
+ default="200000",
+)
+parser.add_argument(
+ "-t",
+ "--time",
+ dest="TIME",
+ type=str,
+ help="time in SLURM submission script",
+ default="10:00:00",
+)
+parser.add_argument(
+ "-c",
+ "--cores",
+ dest="CORES",
+ type=str,
+ help="cpus-per-task in SLURM submission script",
+ default="4",
+)
+parser.add_argument(
+ "-o",
+ "--output",
+ dest="LOGFILE",
+ type=str,
+ help="output in SLURM submission script",
+ default=None,
+)
+parser.add_argument(
+ "-j",
+ "--job-name",
+ dest="JOBNAME",
+ type=str,
+ help="job-name in SLURM submission script",
+ default=None,
+)
args = parser.parse_args()
@@ -69,8 +117,10 @@ def _req_input_to_args(req_input):
sub_script = os.path.join(subdir_path, asset + ".sub")
req_input = asset_build_packages[asset]["required_inputs"]
if req_input:
- print("{} asset requires additional input in the command ({}), so '{}'"
- " requires manual edit".format(asset, req_input, sub_script))
+ print(
+ "{} asset requires additional input in the command ({}), so '{}'"
+ " requires manual edit".format(asset, req_input, sub_script)
+ )
req_str = " ".join(_req_input_to_args(req_input))
else:
req_str = ""
diff --git a/refgenie/cli.py b/refgenie/cli.py
new file mode 100644
index 00000000..cb6a2b70
--- /dev/null
+++ b/refgenie/cli.py
@@ -0,0 +1,423 @@
+import logmuse
+import sys
+import json
+import os
+
+from .argparser import build_argparser
+from .refgenie import parse_registry_path, _skip_lock
+from ._version import __version__
+from .const import *
+from .exceptions import *
+from .asset_build_packages import *
+from .refgenie import refgenie_build
+from .helpers import _raise_missing_recipe_error, _single_folder_writeable
+
+from refgenconf import (
+ RefGenConf,
+ MissingAssetError,
+ MissingGenomeError,
+ DownloadJsonError,
+ upgrade_config,
+ __version__ as rgc_version,
+ select_genome_config,
+)
+from ubiquerg import query_yes_no
+from requests.exceptions import MissingSchema
+
+from collections import OrderedDict
+from rich.console import Console
+
+
+def main():
+ """ Primary workflow """
+ parser = logmuse.add_logging_options(build_argparser())
+ args, remaining_args = parser.parse_known_args()
+ global _LOGGER
+ _LOGGER = logmuse.logger_via_cli(args, make_root=True)
+ _LOGGER.debug(f"versions: refgenie {__version__} | refgenconf {rgc_version}")
+ _LOGGER.debug(f"Args: {args}")
+
+ if not args.command:
+ parser.print_help()
+ _LOGGER.error("No command given")
+ sys.exit(1)
+
+ if args.command == ALIAS_CMD and not args.subcommand:
+ parser.print_help()
+ _LOGGER.error("No alias subcommand command given")
+ sys.exit(1)
+
+ gencfg = select_genome_config(
+ filename=args.genome_config,
+ check_exist=not args.command == INIT_CMD,
+ on_missing=lambda fp: fp,
+ strict_env=True,
+ )
+ if gencfg is None:
+ raise MissingGenomeConfigError(args.genome_config)
+ _LOGGER.debug("Determined genome config: {}".format(gencfg))
+
+ skip_read_lock = _skip_lock(args.skip_read_lock, gencfg)
+
+ # From user input we want to construct a list of asset dicts, where each
+ # asset has a genome name, asset name, and tag
+ if "asset_registry_paths" in args and args.asset_registry_paths:
+ _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths))
+ asset_list = [parse_registry_path(x) for x in args.asset_registry_paths]
+
+ for a in asset_list:
+ # every asset must have a genome, either provided via registry path
+ # or the args.genome arg.
+ if not a["genome"]:
+ if args.genome:
+ a["genome"] = args.genome
+ else:
+ _LOGGER.error(
+ "Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".format(
+ a["genome"], a["asset"], a["tag"]
+ )
+ )
+ sys.exit(1)
+ else:
+ if args.genome and args.genome != a["genome"]:
+ _LOGGER.warn(
+ "Two different genomes specified for asset '{}'.".format(
+ a["asset"]
+ )
+ )
+
+ else:
+ if args.command in GENOME_ONLY_REQUIRED and not args.genome:
+ parser.error("You must provide either a genome or a registry path")
+ sys.exit(1)
+ if args.command in ASSET_REQUIRED:
+ parser.error("You must provide an asset registry path")
+ sys.exit(1)
+
+ if args.command == INIT_CMD:
+ _LOGGER.debug("Initializing refgenie genome configuration")
+ entries = OrderedDict(
+ {
+ CFG_VERSION_KEY: REQ_CFG_VERSION,
+ CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
+ CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
+ CFG_GENOMES_KEY: None,
+ }
+ )
+ if args.settings_json:
+ if os.path.isfile(args.settings_json):
+ with open(args.settings_json, "r") as json_file:
+ data = json.load(json_file)
+ entries.update(data)
+ else:
+ raise FileNotFoundError(
+ "JSON file with config init settings does not exist: {}".format(
+ args.settings_json
+ )
+ )
+ if args.genome_folder:
+ entries.update({CFG_FOLDER_KEY: args.genome_folder})
+ if args.remote_url_base:
+ entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base})
+ if args.genome_archive_folder:
+ entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder})
+ if args.genome_archive_config:
+ entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config})
+ _LOGGER.debug("initializing with entries: {}".format(entries))
+ rgc = RefGenConf(entries=entries, skip_read_lock=skip_read_lock)
+ rgc.initialize_config_file(os.path.abspath(gencfg))
+
+ elif args.command == BUILD_CMD:
+ if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]):
+ _LOGGER.error("Build can only build assets for one genome")
+ sys.exit(1)
+ recipe_name = None
+ if args.recipe:
+ if len(asset_list) > 1:
+ _LOGGER.error("Recipes cannot be specified for multi-asset builds")
+ sys.exit(1)
+ recipe_name = args.recipe
+ if args.requirements:
+ for a in asset_list:
+ recipe = recipe_name or a["asset"]
+ if recipe not in asset_build_packages.keys():
+ _raise_missing_recipe_error(recipe)
+ _LOGGER.info("'{}' recipe requirements: ".format(recipe))
+ _make_asset_build_reqs(recipe)
+ sys.exit(0)
+ refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args)
+
+ elif args.command == GET_ASSET_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ check = args.check_exists if args.check_exists else None
+ for a in asset_list:
+ _LOGGER.debug(
+ "getting asset: '{}/{}.{}:{}'".format(
+ a["genome"], a["asset"], a["seek_key"], a["tag"]
+ )
+ )
+ print(
+ rgc.seek(
+ a["genome"],
+ a["asset"],
+ a["tag"],
+ a["seek_key"],
+ strict_exists=check,
+ )
+ )
+ return
+
+ elif args.command == INSERT_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+
+ if len(asset_list) > 1:
+ raise NotImplementedError("Can only add 1 asset at a time")
+ else:
+ sk = args.seek_keys
+ if sk:
+ sk = json.loads(args.seek_keys)
+ rgc.add(
+ path=args.path,
+ genome=asset_list[0]["genome"],
+ asset=asset_list[0]["asset"],
+ tag=asset_list[0]["tag"],
+ seek_keys=sk,
+ force=args.force,
+ )
+
+ elif args.command == PULL_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+
+ # existing assets overwriting
+ if args.no_overwrite:
+ force = False
+ elif args.force_overwrite:
+ force = True
+ else:
+ force = None
+ # large archive pulling
+ if args.no_large:
+ force_large = False
+ elif args.pull_large:
+ force_large = True
+ else:
+ force_large = None
+ # batch mode takes precedence over other choices
+ if args.batch:
+ force_large = True
+ force = False
+
+ outdir = rgc.data_dir
+ if not os.path.exists(outdir):
+ raise MissingFolderError(outdir)
+ if not perm_check_x(outdir):
+ return
+ if not _single_folder_writeable(outdir):
+ _LOGGER.error("Insufficient permissions to write to: {}".format(outdir))
+ return
+
+ for a in asset_list:
+ rgc.pull(
+ a["genome"],
+ a["asset"],
+ a["tag"],
+ force=force,
+ force_large=force_large,
+ size_cutoff=args.size_cutoff,
+ )
+
+ elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ console = Console()
+ if args.command == LIST_REMOTE_CMD:
+ num_servers = 0
+ bad_servers = []
+ for server_url in rgc[CFG_SERVERS_KEY]:
+ num_servers += 1
+ try:
+ table = rgc.get_asset_table(
+ genomes=args.genome, server_url=server_url
+ )
+ except (DownloadJsonError, ConnectionError, MissingSchema):
+ bad_servers.append(server_url)
+ continue
+ else:
+ console.print(table)
+ if num_servers >= len(rgc[CFG_SERVERS_KEY]) and bad_servers:
+ _LOGGER.error(
+ "Could not list assets from the following servers: {}".format(
+ bad_servers
+ )
+ )
+ else:
+ if args.recipes:
+ print(", ".join(sorted(list(asset_build_packages.keys()))))
+ else:
+ console.print(rgc.get_asset_table(genomes=args.genome))
+
+ elif args.command == GETSEQ_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ print(rgc.getseq(args.genome, args.locus))
+
+ elif args.command == REMOVE_CMD:
+ force = args.force
+ rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
+ for a in asset_list:
+ a["tag"] = a["tag"] or rgc.get_default_tag(
+ a["genome"], a["asset"], use_existing=False
+ )
+ _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
+ if a["seek_key"] is not None:
+ raise NotImplementedError("You can't remove a specific seek_key.")
+ gat = {"genome": a["genome"], "asset": a["asset"], "tag": a["tag"]}
+ try:
+ if not rgc.is_asset_complete(**gat):
+ with rgc as r:
+ r.cfg_remove_assets(**gat)
+ _LOGGER.info(
+ "Removed an incomplete asset "
+ "'{genome}/{asset}:{tag}'".format(*gat)
+ )
+ return
+ except (KeyError, MissingAssetError, MissingGenomeError):
+ _LOGGER.info(
+ "Asset '{genome}/{asset}:{tag}' does not exist".format(**gat)
+ )
+ return
+ if len(asset_list) > 1:
+ if not query_yes_no(
+ "Are you sure you want to remove {} assets?".format(len(asset_list))
+ ):
+ _LOGGER.info("Action aborted by the user")
+ return
+ force = True
+ for a in asset_list:
+ rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"], force=force)
+
+ elif args.command == TAG_CMD:
+ rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
+ if len(asset_list) > 1:
+ raise NotImplementedError("Can only tag 1 asset at a time")
+ if args.default:
+ # set the default tag and exit
+ with rgc as r:
+ r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
+ sys.exit(0)
+ rgc.tag(a["genome"], a["asset"], a["tag"], args.tag, force=args.force)
+
+ elif args.command == ID_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ if len(asset_list) == 1:
+ g, a = asset_list[0]["genome"], asset_list[0]["asset"]
+ t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
+ print(rgc.id(g, a, t))
+ return
+ for asset in asset_list:
+ g, a = asset["genome"], asset["asset"]
+ t = asset["tag"] or rgc.get_default_tag(g, a)
+ print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
+ return
+ elif args.command == SUBSCRIBE_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ rgc.subscribe(urls=args.genome_server, reset=args.reset)
+ return
+ elif args.command == UNSUBSCRIBE_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ rgc.unsubscribe(urls=args.genome_server)
+ return
+ elif args.command == ALIAS_CMD:
+ rgc = RefGenConf(filepath=gencfg, skip_read_lock=skip_read_lock)
+ if args.subcommand == ALIAS_GET_CMD:
+ if args.aliases is not None:
+ for a in args.aliases:
+ print(rgc.get_genome_alias_digest(alias=a))
+ return
+ console = Console()
+ console.print(rgc.genome_aliases_table)
+
+ if args.subcommand == ALIAS_SET_CMD:
+ rgc.set_genome_alias(
+ digest=args.digest,
+ genome=args.aliases,
+ reset_digest=args.reset,
+ create_genome=args.force,
+ )
+ return
+ elif args.subcommand == ALIAS_REMOVE_CMD:
+ rgc.remove_genome_aliases(digest=args.digest, aliases=args.aliases)
+ return
+
+ elif args.command == COMPARE_CMD:
+ rgc = RefGenConf(filepath=gencfg, writable=False, skip_read_lock=skip_read_lock)
+ res = rgc.compare(
+ args.genome1[0], args.genome2[0], explain=not args.no_explanation
+ )
+ if args.no_explanation:
+ print(res)
+
+ elif args.command == UPGRADE_CMD:
+ upgrade_config(
+ target_version=args.target_version, filepath=gencfg, force=args.force
+ )
+
+
+def perm_check_x(file_to_check, message_tag="genome directory"):
+ """
+ Check X_OK permission on a path, providing according messaging and bool val.
+
+ :param str file_to_check: path to query for permission
+ :param str message_tag: context for error message if check fails
+ :return bool: os.access(path, X_OK) for the given path
+ :raise ValueError: if there's no filepath to check for permission
+ """
+ if not file_to_check:
+ msg = "You must provide a path to {}".format(message_tag)
+ _LOGGER.error(msg)
+ raise ValueError(msg)
+ if not os.access(file_to_check, os.X_OK):
+ _LOGGER.error("Insufficient permissions to write to {}: ".format(file_to_check))
+ return False
+ return True
+
+
+def _make_asset_build_reqs(asset):
+ """
+ Prepare requirements and inputs lists and display it
+
+ :params str asset: name of the asset
+ """
+
+ def _format_reqs(req_list):
+ """
+
+ :param list[dict] req_list:
+ :return list[str]:
+ """
+ templ = "\t{} ({})"
+ return [
+ templ.format(req[KEY], req[DESC])
+ if DEFAULT not in req
+ else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT])
+ for req in req_list
+ ]
+
+ reqs_list = []
+ if asset_build_packages[asset][REQ_FILES]:
+ reqs_list.append(
+ "- files:\n{}".format(
+ "\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES]))
+ )
+ )
+ if asset_build_packages[asset][REQ_ASSETS]:
+ reqs_list.append(
+ "- assets:\n{}".format(
+ "\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS]))
+ )
+ )
+ if asset_build_packages[asset][REQ_PARAMS]:
+ reqs_list.append(
+ "- params:\n{}".format(
+ "\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS]))
+ )
+ )
+ _LOGGER.info("\n".join(reqs_list))
diff --git a/refgenie/const.py b/refgenie/const.py
index 830f6fe5..d8ec6600 100644
--- a/refgenie/const.py
+++ b/refgenie/const.py
@@ -1,9 +1,11 @@
"""
-Constant variables for refgenie package.
-Ones that are integral to refgenconf and/or refgenieserver should be defined in refgenconf.const
+Constant variables for refgenie package. Ones that are integral to refgenconf
+and/or refgenieserver should be defined in refgenconf.const
"""
from refgenconf.const import *
+PKG_NAME = "refgenie"
+
BUILD_CMD = "build"
INIT_CMD = "init"
PULL_CMD = "pull"
@@ -17,6 +19,9 @@
ID_CMD = "id"
SUBSCRIBE_CMD = "subscribe"
UNSUBSCRIBE_CMD = "unsubscribe"
+ALIAS_CMD = "alias"
+COMPARE_CMD = "compare"
+UPGRADE_CMD = "upgrade"
GENOME_ONLY_REQUIRED = [REMOVE_CMD, GETSEQ_CMD]
@@ -37,4 +42,17 @@
ID_CMD: "Return the asset digest.",
SUBSCRIBE_CMD: "Add a refgenieserver URL to the config.",
UNSUBSCRIBE_CMD: "Remove a refgenieserver URL from the config.",
+ ALIAS_CMD: "Interact with aliases.",
+ COMPARE_CMD: "Compare two genomes.",
+ UPGRADE_CMD: "Upgrade config. This will alter the files on disk.",
+}
+
+ALIAS_GET_CMD = "get"
+ALIAS_SET_CMD = "set"
+ALIAS_REMOVE_CMD = "remove"
+
+ALIAS_SUBPARSER_MESSAGES = {
+ ALIAS_REMOVE_CMD: "Remove aliases.",
+ ALIAS_SET_CMD: "Set aliases.",
+ ALIAS_GET_CMD: "Get aliases.",
}
diff --git a/refgenie/exceptions.py b/refgenie/exceptions.py
index 30d32291..7c17797a 100644
--- a/refgenie/exceptions.py
+++ b/refgenie/exceptions.py
@@ -1,10 +1,11 @@
from refgenconf import CFG_ENV_VARS
-__all__ = ["RefgenieError", "MissingGenomeConfigError"]
+__all__ = ["RefgenieError", "MissingGenomeConfigError", "MissingFolderError"]
class RefgenieError(Exception):
""" Base refgenie exception type """
+
pass
@@ -17,8 +18,9 @@ def __init__(self, conf_file=None):
:param str conf_file: path attempted to be used as genome config file
"""
- msg = "You must provide a config file either as an argument or via an environment variable: {}"\
- .format(", ".join(CFG_ENV_VARS))
+ msg = "You must provide a config file either as an argument or via an environment variable: {}".format(
+ ", ".join(CFG_ENV_VARS)
+ )
if conf_file:
msg = "Not a file {} -- {}.".format(conf_file, msg)
super(MissingGenomeConfigError, self).__init__(msg)
@@ -32,4 +34,3 @@ def __init__(self, folder):
:param str folder: path attempted to be used as folder to save a file to
"""
super(MissingFolderError, self).__init__(folder)
-
diff --git a/refgenie/helpers.py b/refgenie/helpers.py
new file mode 100644
index 00000000..6a24ae12
--- /dev/null
+++ b/refgenie/helpers.py
@@ -0,0 +1,63 @@
+import os
+
+from refgenconf import MissingRecipeError
+from ubiquerg import is_writable
+
+from .asset_build_packages import asset_build_packages
+from .exceptions import MissingFolderError
+
+
+def _parse_user_build_input(input):
+ """
+ Parse user input specification. Used in build for specific parents and input parsing.
+
+ :param Iterable[Iterable[str], ...] input: user command line input,
+ formatted as follows: [[fasta=txt, test=txt], ...]
+    :return dict: mapping of input names to their values
+ """
+ lst = []
+ for i in input or []:
+ lst.extend(i)
+ return (
+ {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x}
+ if lst is not None
+ else lst
+ )
+
+
+def _single_folder_writeable(d):
+ return os.access(d, os.W_OK) and os.access(d, os.X_OK)
+
+
+def _writeable(outdir, strict_exists=False):
+ outdir = outdir or "."
+ if os.path.exists(outdir):
+ return _single_folder_writeable(outdir)
+ elif strict_exists:
+ raise MissingFolderError(outdir)
+ return _writeable(os.path.dirname(outdir), strict_exists)
+
+
+def _raise_missing_recipe_error(recipe):
+ """
+ Raise an error for a missing recipe, when one is requested
+
+ :param str recipe: recipe name
+ :raise MissingRecipeError: always
+ """
+ raise MissingRecipeError(
+ f"Recipe '{recipe}' not found. Available recipes: "
+ f"{', '.join(list(asset_build_packages.keys()))}"
+ )
+
+
+def _skip_lock(skip_arg, cfg):
+ """
+ If config read lock skip was not forced, check if dir is writable and set
+ the default to the result
+
+ :param bool skip_arg: argument selected on the CLI
+    :param str cfg: path to the config
+ :return bool: decision -- whether to skip the file lock for read
+ """
+ return is_writable(os.path.dirname(cfg)) if not skip_arg else True
diff --git a/refgenie/refgenie.py b/refgenie/refgenie.py
index a5e09d7a..9f62caa1 100755
--- a/refgenie/refgenie.py
+++ b/refgenie/refgenie.py
@@ -1,270 +1,64 @@
-#!/usr/bin/env python
-
-from collections import OrderedDict
-from shutil import rmtree
-from re import sub
-from requests import ConnectionError
import os
import sys
import csv
import signal
import json
-from ._version import __version__
-from .exceptions import MissingGenomeConfigError, MissingFolderError
from .asset_build_packages import *
from .const import *
+from .helpers import (
+ _raise_missing_recipe_error,
+ _skip_lock,
+ _parse_user_build_input,
+ _writeable,
+)
-import logmuse
import pypiper
import refgenconf
-from refgenconf import RefGenConf, MissingAssetError, MissingGenomeError, \
- MissingRecipeError, DownloadJsonError, get_dir_digest
-from ubiquerg import is_url, query_yes_no, parse_registry_path as prp, \
- VersionInHelpParser, is_command_callable
+from refgenconf import (
+ RefGenConf,
+ get_dir_digest,
+)
+from ubiquerg import parse_registry_path as prp
from ubiquerg.system import is_writable
-from .refget import fasta_checksum
-
-_LOGGER = None
-
-
-def build_argparser():
- """
- Builds argument parser.
-
- :return argparse.ArgumentParser
- """
-
- banner = "%(prog)s - reference genome asset manager"
- additional_description = "\nhttps://refgenie.databio.org"
-
- parser = VersionInHelpParser(
- prog="refgenie",
- version=__version__,
- description=banner,
- epilog=additional_description)
-
- subparsers = parser.add_subparsers(dest="command")
-
- def add_subparser(cmd, description):
- return subparsers.add_parser(
- cmd, description=description, help=description)
-
- sps = {}
- for cmd, desc in SUBPARSER_MESSAGES.items():
- sps[cmd] = add_subparser(cmd, desc)
- # It's required for init
- sps[cmd].add_argument(
- '-c', '--genome-config', required=(cmd == INIT_CMD), dest="genome_config", metavar="C",
- help="Path to local genome configuration file. Optional if {} environment variable is set."
- .format(", ".join(refgenconf.CFG_ENV_VARS)))
-
- sps[INIT_CMD].add_argument('-s', '--genome-server', nargs='+', default=DEFAULT_SERVER,
- help="URL(s) to use for the {} attribute in config file. Default: {}."
- .format(CFG_SERVERS_KEY, DEFAULT_SERVER))
- sps[INIT_CMD].add_argument('-f', '--genome-folder',
- help="Absolute path to parent folder refgenie-managed assets.")
- sps[INIT_CMD].add_argument('-a', '--genome-archive-folder',
- help="Absolute path to parent archive folder refgenie-managed assets; used by refgenieserver.")
- sps[INIT_CMD].add_argument('-b', '--genome-archive-config',
- help="Absolute path to desired archive config file; used by refgenieserver.")
- sps[INIT_CMD].add_argument('-u', '--remote-url-base',
- help="URL to use as an alternative, remote archive location; used by refgenieserver.")
- sps[INIT_CMD].add_argument('-j', '--settings-json',
- help="Absolute path to a JSON file with the key "
- "value pairs to inialize the configuration "
- "file with. Overwritten by itemized specifications.")
- sps[BUILD_CMD] = pypiper.add_pypiper_args(
- sps[BUILD_CMD], groups=None, args=["recover", "config", "new-start"])
-
- # Add any arguments specific to subcommands.
-
- sps[BUILD_CMD].add_argument(
- '--tag-description', required=False, default=None, type=str,
- help="Add tag level description (e.g. built with version 0.3.2).")
-
- sps[BUILD_CMD].add_argument(
- '--genome-description', required=False, default=None, type=str,
- help="Add genome level description (e.g. The mouse mitochondrial genome, released in Dec 2013).")
-
- sps[BUILD_CMD].add_argument(
- "-d", "--docker", action="store_true", help="Run all commands in the refgenie docker container.")
-
- sps[BUILD_CMD].add_argument(
- '--assets', nargs="+", action='append', required=False, default=None,
- help='Override the default genome, asset and tag of the parents'
- ' (e.g. fasta=hg38/fasta:default gtf=mm10/gencode_gtf:default).')
-
- sps[BUILD_CMD].add_argument(
- '--files', nargs="+", action='append', required=False, default=None,
- help='Provide paths to the required files (e.g. fasta=/path/to/file.fa.gz).')
-
- sps[BUILD_CMD].add_argument(
- '--params', nargs="+", action='append', required=False, default=None,
- help='Provide required parameter values (e.g. param1=value1).')
-
- sps[BUILD_CMD].add_argument(
- '-v', '--volumes', nargs="+", required=False, default=None,
- help='If using docker, also mount these folders as volumes.')
-
- sps[BUILD_CMD].add_argument(
- '-o', '--outfolder', dest='outfolder', required=False, default=None,
- help='Override the default path to genomes folder, which is the '
- 'genome_folder attribute in the genome configuration file.')
-
- sps[BUILD_CMD].add_argument(
- "-q", "--requirements", action="store_true",
- help="Show the build requirements for the specified asset and exit.")
-
- sps[BUILD_CMD].add_argument(
- "-r", "--recipe", required=False, default=None, type=str,
- help="Provide a recipe to use.")
-
- # add 'genome' argument to many commands
- for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, GETSEQ_CMD, TAG_CMD, ID_CMD]:
- # genome is not required for listing actions
- sps[cmd].add_argument(
- "-g", "--genome", required=cmd in GETSEQ_CMD, metavar="G",
- help="Reference assembly ID, e.g. mm10.")
-
- for cmd in LIST_REMOTE_CMD, LIST_LOCAL_CMD:
- sps[cmd].add_argument("-g", "--genome", required=False, type=str,
- nargs="*", help="Reference assembly ID, e.g. mm10.")
-
- for cmd in [PULL_CMD, GET_ASSET_CMD, BUILD_CMD, INSERT_CMD, REMOVE_CMD, TAG_CMD, ID_CMD]:
- sps[cmd].add_argument(
- "asset_registry_paths", metavar="asset-registry-paths", type=str, nargs='+',
- help="One or more registry path strings that identify assets (e.g. hg38/fasta or hg38/fasta:tag"
- + (" or hg38/fasta.fai:tag)." if cmd == GET_ASSET_CMD else ")."))
-
- for cmd in [REMOVE_CMD, INSERT_CMD]:
- sps[cmd].add_argument(
- "-f", "--force", action="store_true",
- help="Do not prompt before action, approve upfront.")
-
- force_group = sps[PULL_CMD].add_argument_group(
- title="Prompt handling",
- description="These flags configure the pull prompt responses.")
-
- overwrite_group = force_group.add_mutually_exclusive_group()
-
- overwrite_group.add_argument("--no-overwrite", action="store_true",
- help="Do not overwrite if asset exists.")
-
- overwrite_group.add_argument("--force-overwrite", action="store_true",
- help="Overwrite if asset exists.")
-
- large_group = force_group.add_mutually_exclusive_group()
-
- large_group.add_argument("--no-large", action="store_true",
- help="Do not pull archives over 5GB.")
-
- large_group.add_argument("--pull-large", action="store_true",
- help="Pull any archive, regardless of its size.")
-
- force_group.add_argument("--size-cutoff", type=float, default=10, metavar="S",
- help="Maximum archive file size to download with no confirmation required (in GB, default: 10)")
-
- force_group.add_argument("-b", "--batch", action="store_true",
- help="Use batch mode: pull large archives, do no overwrite")
-
- sps[INSERT_CMD].add_argument(
- "-p", "--path", required=True, metavar="P",
- help="Relative local path to asset.")
-
- sps[INSERT_CMD].add_argument(
- "-s", "--seek-keys", required=False, type=str, metavar="S",
- help="""
- String representation of a JSON object with seek_keys,
- e.g. '{"seek_key1": "file.txt"}')
- """)
-
- sps[GETSEQ_CMD].add_argument(
- "-l", "--locus", required=True,
- help="Coordinates of desired sequence; e.g. 'chr1:50000-50200'.")
-
- sps[GET_ASSET_CMD].add_argument(
- "-e", "--check-exists", required=False, action="store_true",
- help="Whether the returned asset path should be checked for existence on disk.")
-
- group = sps[TAG_CMD].add_mutually_exclusive_group(required=True)
-
- group.add_argument(
- "-t", "--tag", type=str,
- help="Tag to assign to an asset.")
+from yacman import UndefinedAliasError
+from logging import getLogger
- group.add_argument(
- "-d", "--default", action="store_true",
- help="Set the selected asset tag as the default one.")
-
- sps[SUBSCRIBE_CMD].add_argument(
- "-r", "--reset", action="store_true",
- help="Overwrite the current list of server URLs.")
-
- for cmd in [SUBSCRIBE_CMD, UNSUBSCRIBE_CMD]:
- sps[cmd].add_argument(
- "-s", "--genome-server", nargs='+', required=True,
- help="One or more URLs to {action} the {key} attribute in config file.".
- format(action="add to" if cmd == SUBSCRIBE_CMD else "remove from", key=CFG_SERVERS_KEY))
-
- return parser
+_LOGGER = getLogger(PKG_NAME)
def parse_registry_path(path):
- return prp(path, defaults=[
- ("protocol", None),
- ("genome", None),
- ("asset", None),
- ("seek_key", None),
- ("tag", None)])
-
-
-def copy_or_download_file(input_string, outfolder):
- """
- Given an input file, which can be a local file or a URL, and output folder,
- this downloads or copies the file into the output folder.
-
- :param str input_string: Can be either a URL or a path to a local file
- :param str outfolder: Where to store the result.
- :return str, str: output/result file and command
- """
- result_file = os.path.join(outfolder, os.path.basename(input_string))
- parts = ["wget -O", result_file, input_string] \
- if is_url(input_string) else ["cp", input_string, result_file]
- return result_file, " ".join(parts)
-
-
-def convert_file(input_fasta, output_file, conversions):
- """
- Given an input file, output file, and a list of conversions, gives the appropriate output file.
-
- :param str output_file: Path to local output file you want to create
- :param dict conversions: A dictionary of shell commands to convert files of a given type.
- """
- form = {"INPUT": input_fasta, "OUTPUT": output_file}
- _, ext = os.path.splitext(input_fasta)
- if ext in conversions:
- return conversions[ext].format(**form)
-
-
-def default_config_file():
- """
- Path to default compute environment settings file.
-
- :return str: Path to default compute settings file
- """
- return os.path.join(os.path.dirname(__file__), "refgenie.yaml")
-
-
-def get_asset_vars(genome, asset_key, tag, outfolder, specific_args=None, specific_params=None, **kwargs):
+ return prp(
+ path,
+ defaults=[
+ ("protocol", None),
+ ("genome", None),
+ ("asset", None),
+ ("seek_key", None),
+ ("tag", None),
+ ],
+ )
+
+
+def get_asset_vars(
+ genome,
+ asset_key,
+ tag,
+ outfolder,
+ specific_args=None,
+ specific_params=None,
+ **kwargs,
+):
"""
Gives a dict with variables used to populate an asset path.
"""
asset_outfolder = os.path.join(outfolder, asset_key, tag)
- asset_vars = {"genome": genome,
- "asset": asset_key,
- "tag": tag,
- "asset_outfolder": asset_outfolder}
+ asset_vars = {
+ "genome": genome,
+ "asset": asset_key,
+ "tag": tag,
+ "asset_outfolder": asset_outfolder,
+ }
if specific_args:
asset_vars.update(specific_args)
if specific_params:
@@ -287,7 +81,7 @@ def refgenie_initg(rgc, genome, content_checksums):
:param str genome: name of the genome
:param dict content_checksums: checksums of individual content_checksums, e.g. chromosomes
"""
- genome_dir = os.path.join(rgc[CFG_FOLDER_KEY], genome)
+ genome_dir = os.path.join(rgc.data_dir, genome)
if is_writable(genome_dir):
output_file = os.path.join(genome_dir, "{}_sequence_digests.tsv".format(genome))
with open(output_file, "w") as contents_file:
@@ -296,7 +90,11 @@ def refgenie_initg(rgc, genome, content_checksums):
wr.writerow([key, val])
_LOGGER.debug("sequence digests saved to: {}".format(output_file))
else:
- _LOGGER.warning("Could not save the genome sequence digests. '{}' is not writable".format(genome_dir))
+ _LOGGER.warning(
+ "Could not save the genome sequence digests. '{}' is not writable".format(
+ genome_dir
+ )
+ )
def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
@@ -306,23 +104,44 @@ def refgenie_build(gencfg, genome, asset_list, recipe_name, args):
:param str gencfg: path to the genome configuration file
:param argparse.Namespace args: parsed command-line options/arguments
"""
- rgc = RefGenConf(filepath=gencfg, writable=False)
+ rgc = RefGenConf(
+ filepath=gencfg,
+ writable=False,
+ skip_read_lock=_skip_lock(args.skip_read_lock, gencfg),
+ )
specified_args = _parse_user_build_input(args.files)
specified_params = _parse_user_build_input(args.params)
- if not hasattr(args, "outfolder") or not args.outfolder:
- # Default to genome_folder
- _LOGGER.debug("No outfolder provided, using genome config.")
- args.outfolder = rgc[CFG_FOLDER_KEY]
+ def _read_json_file(filepath):
+ """
+ Read a JSON file
- _LOGGER.debug("Default config file: {}".format(default_config_file()))
+ :param str filepath: path to the file to read
+ :return dict: read data
+ """
+ with open(filepath, "r") as f:
+ data = json.load(f)
+ return data
- if args.config_file and not os.path.isfile(args.config_file):
- _LOGGER.debug("Config file path isn't a file: {}".
- format(args.config_file))
- args.config_file = default_config_file()
+ if recipe_name and os.path.isfile(recipe_name) and recipe_name.endswith(".json"):
+ recipe_name = _read_json_file(filepath=recipe_name)
- def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_args, specific_params, **kwargs):
+ if not hasattr(args, "outfolder") or not args.outfolder:
+ # Default to genome_folder
+ _LOGGER.debug("No outfolder provided, using genome config.")
+ args.outfolder = rgc.data_dir
+
+ def _build_asset(
+ genome,
+ asset_key,
+ tag,
+ build_pkg,
+ genome_outfolder,
+ specific_args,
+ specific_params,
+ alias,
+ **kwargs,
+ ):
"""
Builds assets with pypiper and updates a genome config file.
@@ -336,8 +155,14 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar
assets.
"""
- log_outfolder = os.path.abspath(os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
- _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(genome_outfolder, log_outfolder))
+ log_outfolder = os.path.abspath(
+ os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR)
+ )
+ _LOGGER.info(
+ "Saving outputs to:\n- content: {}\n- logs: {}".format(
+ genome_outfolder, log_outfolder
+ )
+ )
if args.docker:
# Set up some docker stuff
if args.volumes:
@@ -347,29 +172,49 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar
volumes = genome_outfolder
if not _writeable(genome_outfolder):
- _LOGGER.error("Insufficient permissions to write to output folder: {}".
- format(genome_outfolder))
+ _LOGGER.error(
+ "Insufficient permissions to write to output folder: {}".format(
+ genome_outfolder
+ )
+ )
return
- pm = pypiper.PipelineManager(name="refgenie", outfolder=log_outfolder, args=args)
+ pm = pypiper.PipelineManager(
+ name="refgenie", outfolder=log_outfolder, args=args
+ )
tk = pypiper.NGSTk(pm=pm)
if args.docker:
pm.get_container(build_pkg[CONT], volumes)
_LOGGER.debug("Asset build package: " + str(build_pkg))
- gat = [genome, asset_key, tag] # create a bundle list to simplify calls below
+ # create a bundle list to simplify calls below
+ gat = [genome, asset_key, tag]
# collect variables required to populate the command templates
- asset_vars = get_asset_vars(genome, asset_key, tag, genome_outfolder, specific_args, specific_params, **kwargs)
+ asset_vars = get_asset_vars(
+ genome,
+ asset_key,
+ tag,
+ genome_outfolder,
+ specific_args,
+ specific_params,
+ **kwargs,
+ )
# populate command templates
# prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
- command_list_populated = [x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()})
- for x in build_pkg[CMD_LST]]
+ command_list_populated = [
+ x.format(**{k.split(".")[0]: v for k, v in asset_vars.items()})
+ for x in build_pkg[CMD_LST]
+ ]
# create output directory
tk.make_dir(asset_vars["asset_outfolder"])
- target = os.path.join(log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag))
+ target = os.path.join(
+ log_outfolder, TEMPLATE_TARGET.format(genome, asset_key, tag)
+ )
# add target command
command_list_populated.append("touch {target}".format(target=target))
- _LOGGER.debug("Command populated: '{}'".format(" ".join(command_list_populated)))
+ _LOGGER.debug(
+ "Command populated: '{}'".format(" ".join(command_list_populated))
+ )
try:
# run build command
signal.signal(signal.SIGINT, _handle_sigint(gat))
@@ -380,523 +225,248 @@ def build_asset(genome, asset_key, tag, build_pkg, genome_outfolder, specific_ar
else:
# save build recipe to the JSON-formatted file
recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
- with open(os.path.join(log_outfolder, recipe_file_name), 'w') as outfile:
+ with open(os.path.join(log_outfolder, recipe_file_name), "w") as outfile:
json.dump(build_pkg, outfile)
- # in order to prevent locking the config file for writing once while
- # being able to use the seek method for digest calculation we
- # create a temporary object to run seek on.
- tmp_rgc = RefGenConf()
- tmp_rgc[CFG_FOLDER_KEY] = rgc[CFG_FOLDER_KEY]
- tmp_rgc.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key})
- tmp_rgc.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()})
- digest = get_dir_digest(
- _seek(tmp_rgc, genome, asset_key, tag, enclosing_dir=True), pm)
+ # since the assets are always built to a standard dir structure, we
+ # can just stitch a path together for asset digest calculation
+ asset_dir = os.path.join(rgc.data_dir, *gat)
+ if not os.path.exists(asset_dir):
+ raise OSError(
+ "Could not compute asset digest. Path does not "
+ "exist: {}".format(asset_dir)
+ )
+ digest = get_dir_digest(asset_dir)
_LOGGER.info("Asset digest: {}".format(digest))
- del tmp_rgc
# add updates to config file
with rgc as r:
- r.update_assets(*gat[0:2], data={CFG_ASSET_DESC_KEY: build_pkg[DESC]})
- r.update_tags(*gat, data={CFG_ASSET_PATH_KEY: asset_key,
- CFG_ASSET_CHECKSUM_KEY: digest})
- r.update_seek_keys(*gat, keys={k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()})
- r.set_default_pointer(*gat)
+ if asset_key == "fasta":
+ r.update_genomes(
+ genome, data={CFG_ALIASES_KEY: [alias]}, force_digest=genome
+ )
+ r.update_assets(
+ *gat[0:2],
+ data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
+ force_digest=genome,
+ )
+ r.update_tags(
+ *gat,
+ force_digest=genome,
+ data={
+ CFG_ASSET_PATH_KEY: asset_key,
+ CFG_ASSET_CHECKSUM_KEY: digest,
+ },
+ )
+ r.update_seek_keys(
+ *gat,
+ force_digest=genome,
+ keys={
+ k: v.format(**asset_vars) for k, v in build_pkg[ASSETS].items()
+ },
+ )
+ r.set_default_pointer(*gat, force_digest=genome)
pm.stop_pipeline()
return True
for a in asset_list:
asset_key = a["asset"]
- asset_tag = a["tag"] or rgc.get_default_tag(genome, a["asset"], use_existing=False)
+ asset_tag = a["tag"] or rgc.get_default_tag(
+ genome, a["asset"], use_existing=False
+ )
recipe_name = recipe_name or asset_key
- if recipe_name in asset_build_packages.keys():
- asset_build_package = _check_recipe(asset_build_packages[recipe_name])
+ if isinstance(recipe_name, dict) or (
+ isinstance(recipe_name, str) and recipe_name in asset_build_packages.keys()
+ ):
+ if isinstance(recipe_name, dict):
+ _LOGGER.info("Using custom recipe: \n{}".format(recipe_name))
+ asset_build_package = _check_recipe(recipe_name)
+ recipe_name = asset_build_package["name"]
+ else:
+ asset_build_package = _check_recipe(asset_build_packages[recipe_name])
# handle user-requested parents for the required assets
input_assets = {}
parent_assets = []
specified_asset_keys, specified_assets = None, None
if args.assets is not None:
parsed_parents_input = _parse_user_build_input(args.assets)
- specified_asset_keys, specified_assets = \
- list(parsed_parents_input.keys()), list(parsed_parents_input.values())
- _LOGGER.debug("Custom assets requested: {}".format(args.assets))
+ specified_asset_keys = list(parsed_parents_input.keys())
+ specified_assets = list(parsed_parents_input.values())
+ _LOGGER.debug(f"Custom assets requested: {args.assets}")
if not specified_asset_keys and isinstance(args.assets, list):
- _LOGGER.warning("Specified parent assets format is invalid. Using defaults.")
+ _LOGGER.warning(
+ "Specified parent assets format is invalid. Using defaults."
+ )
for req_asset in asset_build_package[REQ_ASSETS]:
req_asset_data = parse_registry_path(req_asset[KEY])
# for each req asset see if non-default parents were requested
- if specified_asset_keys is not None and req_asset_data["asset"] in specified_asset_keys:
- parent_data = \
- parse_registry_path(specified_assets[specified_asset_keys.index(req_asset_data["asset"])])
- g, a, t, s = parent_data["genome"], \
- parent_data["asset"], \
- parent_data["tag"] or rgc.get_default_tag(genome, parent_data["asset"]), \
- parent_data["seek_key"]
+ if (
+ specified_asset_keys is not None
+ and req_asset_data["asset"] in specified_asset_keys
+ ):
+ parent_data = parse_registry_path(
+ specified_assets[
+ specified_asset_keys.index(req_asset_data["asset"])
+ ]
+ )
+ g, a, t, s = (
+ parent_data["genome"],
+ parent_data["asset"],
+ parent_data["tag"]
+ or rgc.get_default_tag(genome, parent_data["asset"]),
+ parent_data["seek_key"],
+ )
else: # if no custom parents requested for the req asset, use default one
default = parse_registry_path(req_asset[DEFAULT])
- g, a, t, s = genome, default["asset"], \
- rgc.get_default_tag(genome, default["asset"]), \
- req_asset_data["seek_key"]
- parent_assets.append("{}/{}:{}".format(g, a, t))
+ g, a, t, s = (
+ genome,
+ default["asset"],
+ rgc.get_default_tag(genome, default["asset"]),
+ req_asset_data["seek_key"],
+ )
+ parent_assets.append(
+ "{}/{}:{}".format(
+ rgc.get_genome_alias_digest(g, fallback=True), a, t
+ )
+ )
input_assets[req_asset[KEY]] = _seek(rgc, g, a, t, s)
_LOGGER.debug("Using parents: {}".format(", ".join(parent_assets)))
_LOGGER.debug("Provided files: {}".format(specified_args))
_LOGGER.debug("Provided parameters: {}".format(specified_params))
for required_file in asset_build_package[REQ_FILES]:
- if specified_args is None or required_file[KEY] not in specified_args.keys():
- raise ValueError("Path to the '{x}' input ({desc}) is required, but not provided. "
- "Specify it with: --files {x}=/path/to/{x}_file"
- .format(x=required_file[KEY], desc=required_file[DESC]))
+ if (
+ specified_args is None
+ or required_file[KEY] not in specified_args.keys()
+ ):
+ raise ValueError(
+ "Path to the '{x}' input ({desc}) is required, but not provided. "
+ "Specify it with: --files {x}=/path/to/{x}_file".format(
+ x=required_file[KEY], desc=required_file[DESC]
+ )
+ )
for required_param in asset_build_package[REQ_PARAMS]:
if specified_params is None:
specified_params = {}
if required_param[KEY] not in specified_params.keys():
if required_param[DEFAULT] is None:
- raise ValueError("Value for the parameter '{x}' ({desc}) is required, but not provided. "
- "Specify it with: --params {x}=value"
- .format(x=required_param[KEY], desc=required_param[DESC]))
+ raise ValueError(
+ "Value for the parameter '{x}' ({desc}) is required, but not provided. "
+ "Specify it with: --params {x}=value".format(
+ x=required_param[KEY], desc=required_param[DESC]
+ )
+ )
else:
- specified_params.update({required_param[KEY]: required_param[DEFAULT]})
+ specified_params.update(
+ {required_param[KEY]: required_param[DEFAULT]}
+ )
+ _LOGGER.info(
+ "Building '{}/{}:{}' using '{}' recipe".format(
+ genome, asset_key, asset_tag, recipe_name
+ )
+ )
+ ori_genome = genome
+ if recipe_name == "fasta":
+ if (
+ genome in rgc.genomes_list()
+ and "fasta" in rgc.list_assets_by_genome(genome)
+ ):
+ pretag = rgc.get_default_tag(genome, "fasta")
+ _LOGGER.warning(
+ "'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t})".format(
+ g=genome, a=asset_key, t=pretag
+ )
+ )
+ genome = rgc.get_genome_alias_digest(alias=genome, fallback=True)
+ else:
+                # if the recipe is "fasta" we first initialize the genome, based on the provided path to the input FASTA file
+ genome, _ = rgc.initialize_genome(
+ fasta_path=specified_args["fasta"],
+ alias=ori_genome,
+ skip_alias_write=True,
+ )
+ else:
+ try:
+ genome = rgc.get_genome_alias_digest(genome, fallback=True)
+ except UndefinedAliasError:
+ _LOGGER.error(
+ "Genome '{}' has not been initialized yet; "
+ "no key found for this alias".format(genome)
+ )
+ return
+ recipe_name = None
genome_outfolder = os.path.join(args.outfolder, genome)
- _LOGGER.info("Building '{}/{}:{}' using '{}' recipe".format(genome, asset_key, asset_tag, recipe_name))
- if recipe_name == 'fasta' and genome in rgc.genomes_list() \
- and 'fasta' in rgc.list_assets_by_genome(genome):
- _LOGGER.warning("'{g}' genome is already initialized with other fasta asset ({g}/{a}:{t}). "
- "It will be re-initialized.".format(g=genome, a=asset_key, t=asset_tag))
- if not build_asset(genome, asset_key, asset_tag, asset_build_package, genome_outfolder,
- specified_args, specified_params, **input_assets):
- log_path = os.path.abspath(os.path.join(genome_outfolder, asset_key, asset_tag,
- BUILD_STATS_DIR, ORI_LOG_NAME))
- _LOGGER.info("'{}/{}:{}' was not added to the config, but directory has been left in place. "
- "See the log file for details: {}".format(genome, asset_key, asset_tag, log_path))
+ if not _build_asset(
+ genome,
+ asset_key,
+ asset_tag,
+ asset_build_package,
+ genome_outfolder,
+ specified_args,
+ specified_params,
+ ori_genome,
+ **input_assets,
+ ):
+ log_path = os.path.abspath(
+ os.path.join(
+ genome_outfolder,
+ asset_key,
+ asset_tag,
+ BUILD_STATS_DIR,
+ ORI_LOG_NAME,
+ )
+ )
+ _LOGGER.info(
+ "'{}/{}:{}' was not added to the config, but directory has been left in place. "
+ "See the log file for details: {}".format(
+ genome, asset_key, asset_tag, log_path
+ )
+ )
return
- # If the recipe was a fasta, we init the genome
- if recipe_name == 'fasta':
- _LOGGER.info("Computing initial genome digest...")
- collection_checksum, content_checksums = \
- fasta_checksum(_seek(rgc, genome, asset_key, asset_tag, "fasta"))
- _LOGGER.info("Initializing genome...")
- refgenie_initg(rgc, genome, content_checksums)
_LOGGER.info("Finished building '{}' asset".format(asset_key))
with rgc as r:
# update asset relationships
- r.update_relatives_assets(genome, asset_key, asset_tag, parent_assets) # adds parents
+ r.update_relatives_assets(
+ genome, asset_key, asset_tag, parent_assets
+ ) # adds parents
for i in parent_assets:
parsed_parent = parse_registry_path(i)
# adds child (currently built asset) to the parent
- r.update_relatives_assets(parsed_parent["genome"], parsed_parent["asset"], parsed_parent["tag"],
- ["{}/{}:{}".format(genome, asset_key, asset_tag)], True)
+ r.update_relatives_assets(
+ parsed_parent["genome"],
+ parsed_parent["asset"],
+ parsed_parent["tag"],
+ ["{}/{}:{}".format(genome, asset_key, asset_tag)],
+ True,
+ )
if args.genome_description is not None:
- _LOGGER.debug("adding genome ({}) description: '{}'".format(genome, args.genome_description))
- r.update_genomes(genome, {CFG_GENOME_DESC_KEY: args.genome_description})
+ _LOGGER.debug(
+ "adding genome ({}) description: '{}'".format(
+ genome, args.genome_description
+ )
+ )
+ r.update_genomes(
+ genome, {CFG_GENOME_DESC_KEY: args.genome_description}
+ )
if args.tag_description is not None:
- _LOGGER.debug("adding tag ({}/{}:{}) description: '{}'".format(genome, asset_key, asset_tag,
- args.tag_description))
- r.update_tags(genome, asset_key, asset_tag, {CFG_TAG_DESC_KEY: args.tag_description})
- if recipe_name == "fasta":
- # to save config lock time when building fasta assets
- # (genome initialization takes some time for large genomes) we repeat the
- # conditional here for writing the computed genome digest
- r.update_genomes(genome, data={CFG_CHECKSUM_KEY: collection_checksum})
+ _LOGGER.debug(
+ "adding tag ({}/{}:{}) description: '{}'".format(
+ genome, asset_key, asset_tag, args.tag_description
+ )
+ )
+ r.update_tags(
+ genome,
+ asset_key,
+ asset_tag,
+ {CFG_TAG_DESC_KEY: args.tag_description},
+ )
+ rgc._symlink_alias(genome, asset_key, asset_tag)
else:
_raise_missing_recipe_error(recipe_name)
-def _exec_list(rgc, remote, genome):
- if remote:
- pfx = "Remote"
- # we use this func looping through the server urls and assigning a
- # single instance as the server for the object. That's why we can
- # access the data with [0] below
- assemblies, assets = \
- list(rgc.listr(genome=genome, as_str=True).values())[0]
- recipes = None # Not implemented
- else:
- pfx = "Local"
- assemblies, assets = rgc.get_local_data_str(genome=genome)
- # also get recipes
- recipes = ", ".join(sorted(list(asset_build_packages.keys())))
- return pfx, assemblies, assets, recipes
-
-
-def perm_check_x(file_to_check, message_tag):
- """
- Check X_OK permission on a path, providing according messaging and bool val.
-
- :param str file_to_check: path to query for permission
- :param str message_tag: context for error message if check fails
- :return bool: os.access(path, X_OK) for the given path
- :raise ValueError: if there's no filepath to check for permission
- """
- if not file_to_check:
- msg = "You must provide a path to {}".format(message_tag)
- _LOGGER.error(msg)
- raise ValueError(msg)
- if not os.access(file_to_check, os.X_OK):
- _LOGGER.error("Insufficient permissions to write to {}: "
- "{}".format(message_tag, file_to_check))
- return False
- return True
-
-
-def main():
- """ Primary workflow """
- parser = logmuse.add_logging_options(build_argparser())
- args, remaining_args = parser.parse_known_args()
- global _LOGGER
- _LOGGER = logmuse.logger_via_cli(args, make_root=True)
- _LOGGER.debug("refgenie {}".format(__version__))
- _LOGGER.debug("Args: {}".format(args))
-
- if not args.command:
- parser.print_help()
- _LOGGER.error("No command given")
- sys.exit(1)
-
- gencfg = refgenconf.select_genome_config(filename=args.genome_config, check_exist=not args.command == INIT_CMD,
- on_missing=lambda fp: fp, strict_env=True)
- if gencfg is None:
- raise MissingGenomeConfigError(args.genome_config)
- _LOGGER.debug("Determined genome config: {}".format(gencfg))
-
- # From user input we want to construct a list of asset dicts, where each
- # asset has a genome name, asset name, and tag
-
- if "asset_registry_paths" in args and args.asset_registry_paths:
- _LOGGER.debug("Found registry_path: {}".format(args.asset_registry_paths))
- asset_list = [parse_registry_path(x) for x in args.asset_registry_paths]
-
- for a in asset_list:
- # every asset must have a genome, either provided via registry path
- # or the args.genome arg.
- if not a["genome"]:
- if args.genome:
- a["genome"] = args.genome
- else:
- _LOGGER.error("Provided asset registry path ({}/{}:{}) is invalid. See help for usage reference.".
- format(a["genome"], a["asset"], a["tag"]))
- sys.exit(1)
- else:
- if args.genome and args.genome != a["genome"]:
- _LOGGER.warn("Two different genomes specified for asset '{}'.".format(a["asset"]))
-
- else:
- if args.command in GENOME_ONLY_REQUIRED and not args.genome:
- parser.error("You must provide either a genome or a registry path")
- sys.exit(1)
- if args.command in ASSET_REQUIRED:
- parser.error("You must provide an asset registry path")
- sys.exit(1)
-
- if args.command == INIT_CMD:
- _LOGGER.debug("Initializing refgenie genome configuration")
- entries = OrderedDict({
- CFG_VERSION_KEY: REQ_CFG_VERSION,
- CFG_FOLDER_KEY: os.path.dirname(os.path.abspath(gencfg)),
- CFG_SERVERS_KEY: args.genome_server or [DEFAULT_SERVER],
- CFG_GENOMES_KEY: None})
- if args.settings_json:
- if os.path.isfile(args.settings_json):
- with open(args.settings_json, 'r') as json_file:
- data = json.load(json_file)
- entries.update(data)
- else:
- raise FileNotFoundError(
- "JSON file with config init settings does not exist: {}".
- format(args.settings_json))
- if args.genome_folder:
- entries.update({CFG_FOLDER_KEY: args.genome_folder})
- if args.remote_url_base:
- entries.update({CFG_REMOTE_URL_BASE_KEY: args.remote_url_base})
- if args.genome_archive_folder:
- entries.update({CFG_ARCHIVE_KEY: args.genome_archive_folder})
- if args.genome_archive_config:
- entries.update({CFG_ARCHIVE_CONFIG_KEY: args.genome_archive_config})
- _LOGGER.debug("initializing with entries: {}".format(entries))
- rgc = RefGenConf(entries=entries)
- rgc.initialize_config_file(os.path.abspath(gencfg))
-
- elif args.command == BUILD_CMD:
- if not all([x["genome"] == asset_list[0]["genome"] for x in asset_list]):
- _LOGGER.error("Build can only build assets for one genome")
- sys.exit(1)
- recipe_name = None
- if args.recipe:
- if len(asset_list) > 1:
- _LOGGER.error("Recipes cannot be specified for multi-asset builds")
- sys.exit(1)
- recipe_name = args.recipe
- if args.requirements:
- for a in asset_list:
- recipe = recipe_name or a["asset"]
- if recipe not in asset_build_packages.keys():
- _raise_missing_recipe_error(recipe)
- _LOGGER.info("'{}' recipe requirements: ".format(recipe))
- _make_asset_build_reqs(recipe)
- sys.exit(0)
- refgenie_build(gencfg, asset_list[0]["genome"], asset_list, recipe_name, args)
-
- elif args.command == GET_ASSET_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- check = args.check_exists if args.check_exists else None
- for a in asset_list:
- _LOGGER.debug("getting asset: '{}/{}.{}:{}'".
- format(a["genome"], a["asset"], a["seek_key"], a["tag"]))
- print(rgc.seek(a["genome"], a["asset"], a["tag"], a["seek_key"],
- strict_exists=check))
- return
-
- elif args.command == INSERT_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- if len(asset_list) > 1:
- raise NotImplementedError("Can only add 1 asset at a time")
- else:
- sk = args.seek_keys
- if sk:
- sk = json.loads(args.seek_keys)
- rgc.add(path=args.path, genome=asset_list[0]["genome"],
- asset=asset_list[0]["asset"], tag=asset_list[0]["tag"],
- seek_keys=sk, force=args.force)
-
- elif args.command == PULL_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- # existing assets overwriting
- if args.no_overwrite:
- force = False
- elif args.force_overwrite:
- force = True
- else:
- force = None
- # large archive pulling
- if args.no_large:
- force_large = False
- elif args.pull_large:
- force_large = True
- else:
- force_large = None
- # batch mode takes precedence over other choices
- if args.batch:
- force_large = True
- force = False
-
- outdir = rgc[CFG_FOLDER_KEY]
- if not os.path.exists(outdir):
- raise MissingFolderError(outdir)
- target = _key_to_name(CFG_FOLDER_KEY)
- if not perm_check_x(outdir, target):
- return
- if not _single_folder_writeable(outdir):
- _LOGGER.error("Insufficient permissions to write to {}: {}".
- format(target, outdir))
- return
-
- for a in asset_list:
- rgc.pull(a["genome"], a["asset"], a["tag"], force=force,
- force_large=force_large, size_cutoff=args.size_cutoff)
-
- elif args.command in [LIST_LOCAL_CMD, LIST_REMOTE_CMD]:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- if args.command == LIST_REMOTE_CMD:
- num_servers = 0
- # Keep all servers so that child updates maintain server list
- server_list = rgc[CFG_SERVERS_KEY]
- bad_servers = []
- for server_url in rgc[CFG_SERVERS_KEY]:
- num_servers += 1
- try:
- rgc[CFG_SERVERS_KEY] = [server_url]
- pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome)
- if assets is None and genomes is None:
- continue
- _LOGGER.info("Server URL: {}".format(server_url))
- _LOGGER.info("{} genomes: {}".format(pfx, genomes))
- if args.command != LIST_REMOTE_CMD: # Not implemented yet
- _LOGGER.info("{} recipes: {}".format(pfx, recipes))
- _LOGGER.info("{} assets:\n{}\n".format(pfx, assets))
- except (DownloadJsonError, ConnectionError):
- bad_servers.append(server_url)
- continue
- if num_servers >= len(server_list) and bad_servers:
- _LOGGER.error("Could not list assets from the following server(s): {}".format(bad_servers))
- # Restore original server list, even when we couldn't find assets on a server
- rgc[CFG_SERVERS_KEY] = server_list
- else: # Only check local assets once
- _LOGGER.info("Server subscriptions: {}".format(", ".join(rgc[CFG_SERVERS_KEY])))
- pfx, genomes, assets, recipes = _exec_list(rgc, args.command == LIST_REMOTE_CMD, args.genome)
- _LOGGER.info("{} genomes: {}".format(pfx, genomes))
- if args.command != LIST_REMOTE_CMD: # Not implemented yet
- _LOGGER.info("{} recipes: {}".format(pfx, recipes))
- _LOGGER.info("{} assets:\n{}".format(pfx, assets))
-
- elif args.command == GETSEQ_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- print(rgc.getseq(args.genome, args.locus))
-
- elif args.command == REMOVE_CMD:
- force = args.force
- rgc = RefGenConf(filepath=gencfg)
- for a in asset_list:
- a["tag"] = a["tag"] or rgc.get_default_tag(a["genome"], a["asset"],
- use_existing=False)
- _LOGGER.debug("Determined tag for removal: {}".format(a["tag"]))
- if a["seek_key"] is not None:
- raise NotImplementedError("You can't remove a specific seek_key.")
- bundle = [a["genome"], a["asset"], a["tag"]]
- try:
- if not rgc.is_asset_complete(*bundle):
- with rgc as r:
- r.cfg_remove_assets(*bundle)
- _LOGGER.info("Removed an incomplete asset '{}/{}:{}'".
- format(*bundle))
- return
- except (KeyError, MissingAssetError, MissingGenomeError):
- _LOGGER.info("Asset '{}/{}:{}' does not exist".format(*bundle))
- return
- if len(asset_list) > 1:
- if not query_yes_no("Are you sure you want to remove {} assets?".
- format(len(asset_list))):
- _LOGGER.info("Action aborted by the user")
- return
- force = True
- for a in asset_list:
- rgc.remove(genome=a["genome"], asset=a["asset"], tag=a["tag"],
- force=force)
-
- elif args.command == TAG_CMD:
- rgc = RefGenConf(filepath=gencfg)
- if len(asset_list) > 1:
- raise NotImplementedError("Can only tag 1 asset at a time")
- if args.default:
- # set the default tag and exit
- with rgc as r:
- r.set_default_pointer(a["genome"], a["asset"], a["tag"], True)
- sys.exit(0)
- rgc.tag(a["genome"], a["asset"], a["tag"], args.tag)
-
- elif args.command == ID_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- if len(asset_list) == 1:
- g, a = asset_list[0]["genome"], asset_list[0]["asset"]
- t = asset_list[0]["tag"] or rgc.get_default_tag(g, a)
- print(rgc.id(g, a, t))
- return
- for asset in asset_list:
- g, a = asset["genome"], asset["asset"]
- t = asset["tag"] or rgc.get_default_tag(g, a)
- print("{}/{}:{},".format(g, a, t) + rgc.id(g, a, t))
- return
- elif args.command == SUBSCRIBE_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- rgc.subscribe(urls=args.genome_server, reset=args.reset)
- return
- elif args.command == UNSUBSCRIBE_CMD:
- rgc = RefGenConf(filepath=gencfg, writable=False)
- rgc.unsubscribe(urls=args.genome_server)
- return
-
-
-def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities):
- """
- Message and save removed entity data
-
- :param str directory: removed dir
- :param str entity_class: class of the entity
- :param dict asset_dict: selected genome/asset:tag combination
- :param list removed_entities: list of the removed entities to append to
- """
- subclass = "asset" if entity_class == "genome" else "tag"
- if os.path.basename(directory) == asset_dict[entity_class]:
- _LOGGER.info("Last {sub} for {ec} '{en}' has been removed, removing {ec} directory".
- format(sub=subclass, ec=entity_class, en=asset_dict[entity_class]))
- removed_entities.append(_remove(directory))
- else:
- _LOGGER.debug("Didn't remove '{}' since it does not match the {} name: {}".
- format(directory, entity_class, asset_dict[entity_class]))
-
-
-def _remove(path):
- """
- remove asset if it is a dir or a file
-
- :param str path: path to the entity to remove, either a file or a dir
- :return str: removed path
- """
- if os.path.isfile(path):
- os.remove(path)
- elif os.path.isdir(path):
- rmtree(path)
- else:
- raise ValueError("path '{}' is neither a file nor a dir.".format(path))
- return path
-
-
def _key_to_name(k):
return k.replace("_", " ")
-def _single_folder_writeable(d):
- return os.access(d, os.W_OK) and os.access(d, os.X_OK)
-
-
-def _writeable(outdir, strict_exists=False):
- outdir = outdir or "."
- if os.path.exists(outdir):
- return _single_folder_writeable(outdir)
- elif strict_exists:
- raise MissingFolderError(outdir)
- return _writeable(os.path.dirname(outdir), strict_exists)
-
-
-def _make_asset_build_reqs(asset):
- """
- Prepare requirements and inputs lists and display it
-
- :params str asset: name of the asset
- """
- def _format_reqs(req_list):
- """
-
- :param list[dict] req_list:
- :return list[str]:
- """
- templ = "\t{} ({})"
- return [templ.format(req[KEY], req[DESC]) if DEFAULT not in req
- else (templ + "; default: {}").format(req[KEY], req[DESC], req[DEFAULT]) for req in req_list]
-
- reqs_list = []
- if asset_build_packages[asset][REQ_FILES]:
- reqs_list.append("- files:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_FILES]))))
- if asset_build_packages[asset][REQ_ASSETS]:
- reqs_list.append("- assets:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_ASSETS]))))
- if asset_build_packages[asset][REQ_PARAMS]:
- reqs_list.append("- params:\n{}".format("\n".join(_format_reqs(asset_build_packages[asset][REQ_PARAMS]))))
- _LOGGER.info("\n".join(reqs_list))
-
-
-def get_dir_digest(path, pm=None):
- """
- Generate a MD5 digest that reflects just the contents of the files in the selected directory.
-
- :param str path: path to the directory to digest
- :param pypiper.PipelineManager pm: a pipeline object, optional. The subprocess module will be used if not provided
- :return str: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334
- """
- if not is_command_callable("md5sum"):
- raise OSError("md5sum command line tool is required for asset digest calculation. \n"
- "Install and try again, e.g on macOS: 'brew install md5sha1sum'")
- cmd = "cd {}; find . -type f -not -path './" + BUILD_STATS_DIR + \
- "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum"
- if isinstance(pm, pypiper.PipelineManager):
- x = pm.checkprint(cmd.format(path))
- else:
- try:
- from subprocess import check_output
- x = check_output(cmd.format(path), shell=True).decode("utf-8")
- except Exception as e:
- _LOGGER.warning("{}: could not calculate digest for '{}'".format(e.__class__.__name__, path))
- return
- return str(sub(r'\W+', '', x)) # strips non-alphanumeric
-
-
def _handle_sigint(gat):
"""
SIGINT handler, unlocks the config file and exists the program
@@ -904,35 +474,12 @@ def _handle_sigint(gat):
:param list gat: a list of genome, asset and tag. Used for a message generation.
:return function: the SIGINT handling function
"""
+
def handle(sig, frame):
_LOGGER.warning("\nThe build was interrupted: {}/{}:{}".format(*gat))
sys.exit(0)
- return handle
-
-
-def _parse_user_build_input(input):
- """
- Parse user input specification. Used in build for specific parents and input parsing.
-
- :param Iterable[Iterable[str], ...] input: user command line input,
- formatted as follows: [[fasta=txt, test=txt], ...]
- :return dict: mapping of keys, which are input names and values
- """
- lst = []
- for i in input or []:
- lst.extend(i)
- return {x.split("=")[0]: x.split("=")[1] for x in lst if "=" in x} if lst is not None else lst
-
-
-def _raise_missing_recipe_error(recipe):
- """
- Raise an error for a missing recipe, when one is requested
- :param str recipe: recipe name
- :raise MissingRecipeError: always
- """
- raise MissingRecipeError("Recipe '{}' not found. Available recipes: {}".
- format(recipe, ", ".join(list(asset_build_packages.keys()))))
+ return handle
def _check_recipe(recipe):
@@ -943,6 +490,21 @@ def _check_recipe(recipe):
:param dict recipe: asset_build_package
:raise ValueError: if any key names are duplicated
"""
+ # experimental feature; recipe jsonschema validation
+ from jsonschema import validate
+ from yacman import load_yaml
+
+ SCHEMA_SRC = os.path.join(
+ os.path.dirname(os.path.abspath(__file__)), "schemas", "recipe_schema.yaml"
+ )
+ if os.path.exists(SCHEMA_SRC):
+ validate(recipe, load_yaml(filepath=SCHEMA_SRC))
+ _LOGGER.info(
+ "Recipe validated successfully against a schema: {}".format(SCHEMA_SRC)
+ )
+ else:
+ _LOGGER.warning("Recipe schema not found: {}".format(SCHEMA_SRC))
+ # end of validation
req_keys = []
for req in [REQ_PARAMS, REQ_ASSETS, REQ_FILES]:
req_keys.extend([req_dict[KEY] for req_dict in recipe[req]])
@@ -951,20 +513,25 @@ def _check_recipe(recipe):
if k not in unique:
unique.append(k)
else:
- raise ValueError("The recipe contains a duplicated requirement"
- " key '{}', which is not permitted.".format(k))
+ raise ValueError(
+ "The recipe contains a duplicated requirement"
+ " key '{}', which is not permitted.".format(k)
+ )
return recipe
-def _seek(rgc, genome_name, asset_name, tag_name=None,
- seek_key=None, enclosing_dir=False):
+def _seek(
+ rgc, genome_name, asset_name, tag_name=None, seek_key=None, enclosing_dir=False
+):
"""
Strict seek. Most use cases in this package require file existence
check in seek. This function makes it easier
"""
- return rgc.seek(genome_name=genome_name,
- asset_name=asset_name,
- tag_name=tag_name,
- seek_key=seek_key,
- enclosing_dir=enclosing_dir,
- strict_exists=True)
+ return rgc.seek_src(
+ genome_name=genome_name,
+ asset_name=asset_name,
+ tag_name=tag_name,
+ seek_key=seek_key,
+ enclosing_dir=enclosing_dir,
+ strict_exists=True,
+ )
diff --git a/refgenie/refgenie.yaml b/refgenie/refgenie.yaml
deleted file mode 100644
index 2e9ad975..00000000
--- a/refgenie/refgenie.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# Build configuration
-
-tools:
- # absolute paths to required tools
- bowtie2build: bowtie2-build
- bismark_genome_preparation: bismark_genome_preparation
- epilog_indexer: epilog_indexer.py
- samtools: samtools
- kallisto: kallisto
- hisat2build: hisat2-build
- suffixerator: gt suffixerator
- tallymer: gt tallymer mkindex
-
-index:
- bowtie2: True
- bismark_bt1: False
- bismark_bt2: False
- epilog: False
- hisat: False
- kallisto: True
- suffixerator: False
- tallymer: False
-
-param:
- epilog:
- context: "cg"
- tallymer:
- minocc: 2
\ No newline at end of file
diff --git a/refgenie/refget.py b/refgenie/refget.py
index 1d5f4d82..b0861260 100644
--- a/refgenie/refget.py
+++ b/refgenie/refget.py
@@ -40,4 +40,4 @@ def fasta_checksum(fa_file, checksum_function=trunc512_digest):
content_checksums[k] = checksum_function(str(fa_object[k]))
collection_string = ";".join([":".join(i) for i in content_checksums.items()])
collection_checksum = checksum_function(collection_string)
- return collection_checksum, content_checksums
\ No newline at end of file
+ return collection_checksum, content_checksums
diff --git a/refgenie/schemas/recipe_schema.yaml b/refgenie/schemas/recipe_schema.yaml
new file mode 100644
index 00000000..c06bc03f
--- /dev/null
+++ b/refgenie/schemas/recipe_schema.yaml
@@ -0,0 +1,38 @@
+description: refgenie recipe schema
+
+properties:
+ name:
+ type: string
+ pattern: "^\\S*$"
+ description: "name of the recipe with no whitespaces"
+ description:
+ type: string
+ description: "description of the recipe"
+ assets:
+ type: object
+ description: "seek keys to be produced"
+ required_files:
+ type: array
+ items:
+ type: object
+ description: "File-type input to the recipe"
+ required_assets:
+ type: array
+ items:
+ type: object
+ description: "Asset-type input to the recipe"
+ required_parameters:
+ type: array
+ items:
+ type: object
+ description: "Parameter-type input to the recipe"
+ container:
+ type: string
+ pattern: "^\\S*$"
+ description: "Registry path of the container to use"
+ command_list:
+ type: array
+ items:
+ type: string
+ description: "List of commands that create the asset"
+required: [description, assets, required_files, required_assets, required_parameters, command_list]
\ No newline at end of file
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 08942fb7..b7816e4c 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,4 +1,5 @@
logmuse>=0.2.6
-refgenconf>=0.9.1
+refgenconf>=0.10.0
piper>=0.12.1
pyfaidx>=0.5.5.2
+yacman>=0.8.0
\ No newline at end of file
diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt
index a7372dc4..b1e1259c 100644
--- a/requirements/requirements-doc.txt
+++ b/requirements/requirements-doc.txt
@@ -1,2 +1 @@
https://github.com/databio/mkdocs-databio/archive/master.zip
-refgenconf>=0.6.1
diff --git a/setup.py b/setup.py
index d1f81150..f0c50dcf 100755
--- a/setup.py
+++ b/setup.py
@@ -8,37 +8,38 @@
for line in reqs_file:
if not line.strip():
continue
- #DEPENDENCIES.append(line.split("=")[0].rstrip("<>"))
+ # DEPENDENCIES.append(line.split("=")[0].rstrip("<>"))
DEPENDENCIES.append(line)
# Additional keyword arguments for setup()
extra = {"install_requires": DEPENDENCIES}
# 2to3
-if sys.version_info >= (3, ):
+if sys.version_info >= (3,):
extra["use_2to3"] = True
-with open("refgenie/_version.py", 'r') as versionfile:
+with open("refgenie/_version.py", "r") as versionfile:
version = versionfile.readline().split()[-1].strip("\"'\n")
# Handle the pypi README formatting.
try:
import pypandoc
- long_description = pypandoc.convert_file('README.md', 'rst')
+
+ long_description = pypandoc.convert_file("README.md", "rst")
msg = "\033[032mPandoc conversion succeeded.\033[0m"
-except(IOError, ImportError, OSError):
+except (IOError, ImportError, OSError):
msg = "\033[0;31mWarning: pandoc conversion failed!\033[0m"
- long_description = open('README.md').read()
+ long_description = open("README.md").read()
setup(
- name='refgenie',
+ name="refgenie",
packages=["refgenie"],
version=version,
- description='Refgenie creates a standardized folder structure for reference genome files and indexes. '
- 'You can download pre-built genomes or build your own for any fasta file',
+ description="Refgenie creates a standardized folder structure for reference genome files and indexes. "
+ "You can download pre-built genomes or build your own for any fasta file",
long_description=long_description,
- long_description_content_type='text/markdown',
+ long_description_content_type="text/markdown",
classifiers=[
"Development Status :: 4 - Beta",
"License :: OSI Approved :: BSD License",
@@ -46,21 +47,21 @@
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
- "Topic :: Scientific/Engineering :: Bio-Informatics"
- ],
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+ ],
license="BSD2",
entry_points={
"console_scripts": [
- 'refgenie = refgenie.__main__:main',
- 'import_igenome = refgenie.add_assets_igenome:main'
+ "refgenie = refgenie.__main__:main",
+ "import_igenome = refgenie.add_assets_igenome:main",
],
},
keywords="bioinformatics, sequencing, ngs",
package_data={"refgenie": [os.path.join("refgenie", "*")]},
include_package_data=True,
- url='http://refgenie.databio.org',
- author=u'Nathan Sheffield, Vince Reuter, Michal Stolarczyk',
+ url="http://refgenie.databio.org",
+ author=u"Nathan Sheffield, Vince Reuter, Michal Stolarczyk",
**extra
)
-print(msg)
\ No newline at end of file
+print(msg)
diff --git a/tests/assert_in_file.sh b/tests/assert_in_file.sh
new file mode 100755
index 00000000..c0c192f0
--- /dev/null
+++ b/tests/assert_in_file.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+if [ $# -ne 3 ]; then
+ echo $0: usage: assert_in_file.sh filepath query inverse
+ exit 1
+fi
+
+if [[ "$3" == "1" ]]; then
+ echo -e "\nTesting if '$2' is not in '$1'"
+ if grep -q "$2" "$1"; then
+ echo -e "\nERROR: '$2' is in '$1'\nContents:\n"
+ cat "$1"
+ exit 1
+ else
+ echo -e "\nSUCCESS: '$2' not in '$1'\n"
+ exit 0
+ fi
+else
+ echo -e "\nTesting if '$2' is in '$1'"
+ if grep -q "$2" "$1"; then
+ echo -e "\nSUCCESS: '$2' is in '$1'\n"
+ exit 0
+ else
+ echo -e "\nERROR: '$2' not in '$1'\nContents:\n"
+ cat "$1"
+ exit 1
+ fi
+fi
diff --git a/tests/data/recipe_child.json b/tests/data/recipe_child.json
new file mode 100644
index 00000000..afe1d76e
--- /dev/null
+++ b/tests/data/recipe_child.json
@@ -0,0 +1,20 @@
+{
+ "name": "fasta_child",
+ "description": "child of an asset, dummy recipe",
+ "assets": {
+ "fasta_child": "{genome}_child.fa.gz"
+ },
+ "required_assets": [
+ {
+ "key": "fasta",
+ "default": "fasta",
+ "description": "fasta asset for genome"
+ }
+ ],
+ "required_parameters": [],
+ "required_files": [],
+ "container": "databio/refgenie",
+ "command_list": [
+ "cp {fasta} {asset_outfolder}/{genome}_child.fa.gz"
+ ]
+}
\ No newline at end of file
diff --git a/tests/data/recipe_parent.json b/tests/data/recipe_parent.json
new file mode 100644
index 00000000..a664beef
--- /dev/null
+++ b/tests/data/recipe_parent.json
@@ -0,0 +1,20 @@
+{
+ "name": "fasta",
+ "description": "DNA sequences in the FASTA format, dummy recipe",
+ "assets": {
+ "fasta": "{genome}.fa"
+ },
+ "required_files": [
+ {
+ "key": "fasta",
+ "description": "gzipped fasta file"
+ }
+ ],
+ "required_assets": [],
+ "required_parameters": [],
+ "container": "databio/refgenie",
+ "command_list": [
+ "cp {fasta} {asset_outfolder}/{genome}.fa.gz",
+ "gzip -df {asset_outfolder}/{genome}.fa.gz"
+ ]
+}
\ No newline at end of file
diff --git a/tests/data/t7.fa.gz b/tests/data/t7.fa.gz
new file mode 100644
index 00000000..a2168f52
Binary files /dev/null and b/tests/data/t7.fa.gz differ
diff --git a/update-usage-docs.sh b/update-usage-docs.sh
index 4aecfd4a..0841bc4b 100755
--- a/update-usage-docs.sh
+++ b/update-usage-docs.sh
@@ -1,8 +1,7 @@
#!/bin/bash
cp docs/usage.template usage.template
#looper --help > USAGE.temp 2>&1
-
-for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help"; do
+for cmd in "--help" "init --help" "list --help" "listr --help" "pull --help" "build --help" "seek --help" "add --help" "remove --help" "getseq --help" "tag --help" "id --help" "subscribe --help" "unsubscribe --help" "alias --help" "upgrade --help"; do
echo $cmd
echo -e "## \`refgenie $cmd\`" > USAGE_header.temp
refgenie $cmd --help > USAGE.temp 2>&1