Skip to content

Commit 1b35484

Browse files
authored
feat!: refactoring for bollonaster (#22)
1 parent d1e4233 commit 1b35484

File tree

103 files changed

+2925
-304
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+2925
-304
lines changed

.editorconfig

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# http://editorconfig.org
2+
3+
root = true
4+
5+
[*]
6+
charset = utf-8
7+
end_of_line = lf
8+
insert_final_newline = true
9+
trim_trailing_whitespace = true
10+
11+
[*.{py,rst,ini}]
12+
indent_style = space
13+
indent_size = 4
14+
15+
[*.py]
16+
line_length=120
17+
18+
[*.{html,css,scss,json,yml,yaml,json}]
19+
indent_style = space
20+
indent_size = 2
21+
22+
[*.md]
23+
trim_trailing_whitespace = false
24+
25+
[Makefile]
26+
indent_style = tab

.gitignore

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,28 @@
1+
**/download
2+
**/rocksdb*
3+
**/.done
4+
5+
annos/**/*.vcf*
6+
annos/**/*.bed*
7+
annos/**/*.tsv*
8+
9+
features/**/*.bed*
10+
features/**/*.tsv*
11+
features/**/*.md5
12+
13+
genes/**/*.bed*
14+
genes/**/*.tsv*
15+
genes/**/*.jsonl*
16+
genes/**/*.md5
17+
18+
vardbs/**/*.tsv*
19+
vardbs/**/*.bed*
20+
vardbs/**/*.md5
21+
22+
reference/**/*.fa*
23+
24+
tracks/**/*.bed*
25+
126
/stats-*/
227
/*.xlsx
328
.~*

Snakefile

Lines changed: 49 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,53 @@
1-
import glob
2-
from itertools import product
3-
import os
4-
import re
5-
import sys
6-
import textwrap
7-
import json
8-
9-
from snakemake import shell
101
from tools.sv_db_to_tsv import to_tsv
112

123

13-
# Ensure that the configuration file exists and then load it.
14-
if not os.path.exists("config.yaml"):
15-
print("No config.yaml exists yet. Try `cp config.yaml.example config.yaml`.", file=sys.stderr)
16-
sys.exit(1)
17-
18-
19-
configfile: "config.yaml"
20-
21-
22-
# Print configuration.
23-
print("Configuration:", file=sys.stderr)
24-
print("\n---\n%s\n---\n" % json.dumps(config, indent=" "), file=sys.stderr)
25-
26-
#: Use strict mode and also print each command.
27-
shell.prefix("set -x; set -euo pipefail; ")
28-
29-
#: The canonical chromosome names (without "Y").
30-
CHROMS_NO_Y = list(map(str, range(1, 23))) + ["X"]
31-
32-
#: The canonical chromosome names.
33-
CHROMS = CHROMS_NO_Y + ["Y"]
34-
35-
#: List for collecting all result files below.
36-
ALL_RESULT = []
37-
38-
39-
def input_all(wildcards):
40-
return ALL_RESULT
41-
42-
43-
rule all:
4+
rule default:
445
input:
45-
input_all,
46-
47-
48-
# Load all snakemake files `snakefiles/*/*.smk`.
49-
snakefiles = list(sorted(glob.glob("snakefiles/*/*.smk")))
50-
print("Loading Snakefiles...", file=sys.stderr)
51-
52-
for path in snakefiles:
53-
54-
include: path
55-
56-
57-
# Derive the overall list of output files from rules whose names start with the prefix "result_".
58-
print("Constructing list of output files from all `result_*` rules...\n", file=sys.stderr)
59-
60-
ALL_RESULT = []
61-
for rule in workflow.rules:
62-
if rule.name.startswith("result_"):
63-
for genome_build in ("GRCh37", "GRCh38"):
64-
vals = {**config, "genome_build": genome_build}
65-
for path in rule.output:
66-
if "{chrom}" in path:
67-
chroms = CHROMS
68-
else:
69-
chroms = ["-"]
70-
if "{chrom_no_y}" in path:
71-
chroms_no_y = CHROMS_NO_Y
72-
else:
73-
chroms_no_y = ["-"]
74-
for chrom, chrom_no_y in product(chroms, chroms_no_y):
75-
path = re.sub(r"{([^,]+)(,.*)?}", r"{\1}", path)
76-
ALL_RESULT.append(path.format(chrom=chrom, chrom_no_y=chrom_no_y, **vals))
6+
"annos/grch37/cadd/.done",
7+
"annos/grch37/dbnsfp-4.4a/.done",
8+
"annos/grch37/dbnsfp-4.4c/.done",
9+
"annos/grch37/dbscsnv/.done",
10+
"annos/grch37/helixmtdb/helixmtdb.vcf",
11+
"annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz",
12+
"annos/grch37/ucsc_conservation/ucsc_conservation.tsv",
13+
"annos/grch37/dbsnp/dbsnp.vcf.gz",
14+
"annos/grch37/gnomad_exomes/.done",
15+
"annos/grch37/gnomad_genomes/.done",
16+
"annos/grch38/cadd/.done",
17+
"annos/grch38/dbnsfp-4.4a/.done",
18+
"annos/grch38/dbnsfp-4.4c/.done",
19+
"annos/grch38/gnomad_exomes/.done",
20+
"annos/grch38/gnomad_genomes/.done",
21+
"annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz",
22+
"annos/grch38/helixmtdb/helixmtdb.vcf",
23+
"features/grch37/tads/imr90.bed",
24+
"features/grch37/tads/hesc.bed",
25+
"features/grch37/gene_regions/refseq.bed.gz",
26+
"features/grch37/gene_regions/ensembl.bed.gz",
27+
"features/grch37/masked/repeat.bed.gz",
28+
"features/grch37/masked/segdup.bed.gz",
29+
"genes/hgnc/hgnc_info.jsonl",
30+
"genes/ncbi/gene_info.jsonl",
31+
"genes/dbnsfp/genes.tsv.gz",
32+
"genes/xlink/ensembl.tsv",
33+
"genes/xlink/hgnc.tsv",
34+
"genes/mim2gene/mim2gene.tsv",
35+
"tracks/grch37/ucsc_genomicSuperDups.bed.gz",
36+
"tracks/grch37/ucsc_rmsk.bed.gz",
37+
"tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz",
38+
"tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz",
39+
"vardbs/grch37/strucvar/clinvar.bed.gz",
40+
"vardbs/grch37/strucvar/dbvar.bed.gz",
41+
"vardbs/grch37/strucvar/dgv.bed.gz",
42+
"vardbs/grch37/strucvar/dgv_gs.bed.gz",
43+
"vardbs/grch37/strucvar/g1k.bed.gz",
44+
"vardbs/grch37/strucvar/gnomad_sv.bed.gz",
45+
"vardbs/grch37/strucvar/exac.bed.gz",
46+
47+
48+
include: "snakefiles/annos.smk"
49+
include: "snakefiles/genes.smk"
50+
include: "snakefiles/features.smk"
51+
include: "snakefiles/vardbs-grch37-strucvars.smk"
52+
include: "snakefiles/tracks-grch37.smk"
53+
include: "snakefiles/reference.smk"

Snakefile.old

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import glob
2+
from itertools import product
3+
import os
4+
import re
5+
import sys
6+
import textwrap
7+
import json
8+
9+
from snakemake import shell
10+
from tools.sv_db_to_tsv import to_tsv
11+
12+
13+
# Ensure that the configuration file exists and then load it.
14+
if not os.path.exists("config.yaml"):
15+
print("No config.yaml exists yet. Try `cp config.yaml.example config.yaml`.", file=sys.stderr)
16+
sys.exit(1)
17+
18+
19+
configfile: "config.yaml"
20+
21+
22+
# Print configuration.
23+
print("Configuration:", file=sys.stderr)
24+
print("\n---\n%s\n---\n" % json.dumps(config, indent=" "), file=sys.stderr)
25+
26+
#: Use strict mode and also print each command.
27+
shell.prefix("set -x; set -euo pipefail; ")
28+
29+
#: The canonical chromosome names (without "Y").
30+
CHROMS_NO_Y = list(map(str, range(1, 23))) + ["X"]
31+
32+
#: The canonical chromosome names.
33+
CHROMS = CHROMS_NO_Y + ["Y"]
34+
35+
#: List for collecting all result files below.
36+
ALL_RESULT = []
37+
38+
39+
def input_all(wildcards):
40+
return ALL_RESULT
41+
42+
43+
rule all:
44+
input:
45+
input_all,
46+
47+
48+
# Load all snakemake files `snakefiles/*/*.smk`.
49+
snakefiles = list(sorted(glob.glob("snakefiles/*/*.smk")))
50+
print("Loading Snakefiles...", file=sys.stderr)
51+
52+
for path in snakefiles:
53+
54+
include: path
55+
56+
57+
# Derive the overall list of output files from rules whose names start with the prefix "result_".
58+
print("Constructing list of output files from all `result_*` rules...\n", file=sys.stderr)
59+
60+
ALL_RESULT = []
61+
for rule in workflow.rules:
62+
if rule.name.startswith("result_"):
63+
for genome_build in ("GRCh37", "GRCh38"):
64+
vals = {**config, "genome_build": genome_build}
65+
for path in rule.output:
66+
if "{chrom}" in path:
67+
chroms = CHROMS
68+
else:
69+
chroms = ["-"]
70+
if "{chrom_no_y}" in path:
71+
chroms_no_y = CHROMS_NO_Y
72+
else:
73+
chroms_no_y = ["-"]
74+
for chrom, chrom_no_y in product(chroms, chroms_no_y):
75+
path = re.sub(r"{([^,]+)(,.*)?}", r"{\1}", path)
76+
ALL_RESULT.append(path.format(chrom=chrom, chrom_no_y=chrom_no_y, **vals))

config.yaml.example

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
# String to use for the download date of data that is not explicitly versioned, such as
44
# ENSEMBL BioMart or VISTA.
5-
download_date: 20210728
5+
download_date: 20230115
66

77
# String to use for the release name.
8-
release_name: 20210728b
8+
release_name: bollonaster
99

1010
# Memory to use for sorting (passed to sort -S).
1111
sort_memory: 16G

environment.yaml

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,23 @@ channels:
44
- conda-forge
55
- defaults
66
dependencies:
7-
- python =3.9
8-
- snakemake =7.8
9-
- bedops ==2.4.35
7+
- python =3.10
8+
- snakemake-minimal =7
9+
- bedops =2
1010
- bcftools =1.15
11-
- htslib =1.15
12-
- samtools =1.15
13-
- drmaa ==0.7.9
14-
- bedtools ==2.27.1
11+
- htslib =1.16
12+
- samtools =1.16
13+
- bedtools =2.30
1514
- var-agg ==0.1.1
1615
- lftp
17-
- varfish-annotator-cli ==0.16
18-
- jannovar-cli
1916
- snakefmt
17+
- interval-binning =1
18+
- vcfpy
19+
- tqdm
20+
- prov =2
21+
- attrs
22+
- cattrs
23+
- jq
24+
- aria2
25+
- pigz
26+
- qsv
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"dc:format": "text/x-bed",
3+
"dc:identifier": "features/GRCh37/gene-regions/ensembl.bed:r87",
4+
"dc:title": "ENSEMBL gene regions (r87) for GRCh37",
5+
"dc:description": "This BED file contains the ENSEMBL gene regions (release r87 from 2017).",
6+
"dc:created": "2023-02-06",
7+
"dc:creator": "ENSEMBL Team",
8+
"dc:contributor": [
9+
"VarFish Developer Team"
10+
],
11+
"dc:source": [
12+
"PMID:34791404",
13+
"http://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/"
14+
],
15+
"tsv:columns": {
16+
"chrom": "Chromosome name without chr prefix",
17+
"begin": "0-based begin position",
18+
"end": "0-based end position",
19+
"ensembl_gene_id": "ENSEMBL gene ID"
20+
}
21+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"dc:format": "text/x-bed",
3+
"dc:identifier": "features/GRCh37/gene-regions/refseq.bed:105.20220307",
4+
"dc:title": "NCBI RefSeq gene regions (105.20220307) for GRCh37",
5+
"dc:description": "This BED file contains the RefSeq gene regions (release 105.20220307).",
6+
"dc:created": "2023-02-06",
7+
"dc:creator": "NCBI",
8+
"dc:contributor": [
9+
"VarFish Developer Team"
10+
],
11+
"dc:source": [
12+
"PMID:24259432",
13+
"https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/105.20220307/"
14+
],
15+
"tsv:columns": {
16+
"chrom": "Chromosome name without chr prefix",
17+
"begin": "0-based begin position",
18+
"end": "0-based end position",
19+
"entrez_id": "The Entrez/NCBI gene ID"
20+
}
21+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"dc:format": "text/x-bed",
3+
"dc:identifier": "features/GRCh37/masked/repeat.bed.gz:2020-02-20",
4+
"dc:title": "Repeat Masked Sequence",
5+
"dc:description": "This BED file contains the repeat-masked sequence from the UCSC Genome Browser as created by RepeatMasker.",
6+
"dc:created": "2023-02-06",
7+
"dc:creator": "UCSC Genome Browser Team",
8+
"dc:contributor": [
9+
"VarFish Developer Team"
10+
],
11+
"dc:source": [
12+
"https://genome-euro.ucsc.edu/cgi-bin/hgTrackUi?db=hg19&g=rmsk"
13+
],
14+
"tsv:columns": {
15+
"chrom": "Chromosome name without chr prefix",
16+
"begin": "0-based begin position",
17+
"end": "0-based end position",
18+
"label": "label describing the repeat"
19+
}
20+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"dc:format": "text/x-bed",
3+
"dc:identifier": "features/GRCh37/masked/segdup.bed.gz:2020-02-20",
4+
"dc:title": "Segmental Duplications",
5+
"dc:description": "This BED file contains the segmental duplication annotation from the UCSC Genome Browser.",
6+
"dc:created": "2011-09-26",
7+
"dc:creator": "UCSC Genome Browser Team",
8+
"dc:contributor": [
9+
"VarFish Developer Team"
10+
],
11+
"dc:source": [
12+
"https://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=genomicSuperDups"
13+
],
14+
"tsv:columns": {
15+
"chrom": "Chromosome name without chr prefix",
16+
"begin": "0-based begin position",
17+
"end": "0-based end position",
18+
"label": "label describing the segmental duplication"
19+
}
20+
}

0 commit comments

Comments
 (0)