varfish-org
diff --git a/‎.editorconfig
Lines changed: 26 additions & 0 deletions b/‎.editorconfig
Lines changed: 26 additions & 0 deletions
diff --git a/‎.gitignore
Lines changed: 25 additions & 0 deletions b/‎.gitignore
Lines changed: 25 additions & 0 deletions
diff --git a/‎Snakefile
Lines changed: 49 additions & 72 deletions b/‎Snakefile
Lines changed: 49 additions & 72 deletions
diff --git a/‎Snakefile.old
Lines changed: 76 additions & 0 deletions b/‎Snakefile.old
Lines changed: 76 additions & 0 deletions
diff --git a/‎config.yaml.example
Lines changed: 2 additions & 2 deletions b/‎config.yaml.example
Lines changed: 2 additions & 2 deletions
diff --git a/‎environment.yaml
Lines changed: 16 additions & 9 deletions b/‎environment.yaml
Lines changed: 16 additions & 9 deletions
diff --git a/‎features/grch37/gene_regions/ensembl.spec.json
Lines changed: 21 additions & 0 deletions b/‎features/grch37/gene_regions/ensembl.spec.json
Lines changed: 21 additions & 0 deletions
diff --git a/‎features/grch37/gene_regions/refseq.spec.json
Lines changed: 21 additions & 0 deletions b/‎features/grch37/gene_regions/refseq.spec.json
Lines changed: 21 additions & 0 deletions
diff --git a/‎features/grch37/masked/repeat.spec.json
Lines changed: 20 additions & 0 deletions b/‎features/grch37/masked/repeat.spec.json
Lines changed: 20 additions & 0 deletions
diff --git a/‎features/grch37/masked/segdup.spec.json
Lines changed: 20 additions & 0 deletions b/‎features/grch37/masked/segdup.spec.json
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,26 @@
+# http://editorconfig.org
+
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.{py,rst,ini}]
+indent_style = space
+indent_size = 4
+
+[*.py]
+line_length=120
+
+[*.{html,css,scss,json,yml,yaml}]
+indent_style = space
+indent_size = 2
+
+[*.md]
+trim_trailing_whitespace = false
+
+[Makefile]
+indent_style = tab
@@ -1,3 +1,28 @@
+**/download
+**/rocksdb*
+**/.done
+
+annos/**/*.vcf*
+annos/**/*.bed*
+annos/**/*.tsv*
+
+features/**/*.bed*
+features/**/*.tsv*
+features/**/*.md5
+
+genes/**/*.bed*
+genes/**/*.tsv*
+genes/**/*.jsonl*
+genes/**/*.md5
+
+vardbs/**/*.tsv*
+vardbs/**/*.bed*
+vardbs/**/*.md5
+
+reference/**/*.fa*
+
+tracks/**/*.bed*
+
 /stats-*/
 /*.xlsx
 .~*
 
@@ -1,76 +1,53 @@
-import glob
-from itertools import product
-import os
-import re
-import sys
-import textwrap
-import json
-
-from snakemake import shell
 from tools.sv_db_to_tsv import to_tsv
 
 
-# Ensure that the configuration file exists and then load it.
-if not os.path.exists("config.yaml"):
-    print("No config.yaml exists yet. Try `cp config.yaml.example config.yaml`.", file=sys.stderr)
-    sys.exit(1)
-
-
-configfile: "config.yaml"
-
-
-# Print configuration.
-print("Configuration:", file=sys.stderr)
-print("\n---\n%s\n---\n" % json.dumps(config, indent="  "), file=sys.stderr)
-
-#: Use strict mode and also print each command.
-shell.prefix("set -x; set -euo pipefail; ")
-
-#: The canonical chromosome names (without "Y").
-CHROMS_NO_Y = list(map(str, range(1, 23))) + ["X"]
-
-#: The canonical chromosome names.
-CHROMS = CHROMS_NO_Y + ["Y"]
-
-#: List for collecting all result files below.
-ALL_RESULT = []
-
-
-def input_all(wildcards):
-    return ALL_RESULT
-
-
-rule all:
+rule default:
     input:
-        input_all,
-
-
-# Load all snakemake files `snakefiles/*/*.smk`.
-snakefiles = list(sorted(glob.glob("snakefiles/*/*.smk")))
-print("Loading Snakefiles...", file=sys.stderr)
-
-for path in snakefiles:
-
-    include: path
-
-
-# Derive output overall output files from rules starting with prefix "output_".
-print("Constructing list of output files from all `result_*` rules...\n", file=sys.stderr)
-
-ALL_RESULT = []
-for rule in workflow.rules:
-    if rule.name.startswith("result_"):
-        for genome_build in ("GRCh37", "GRCh38"):
-            vals = {**config, "genome_build": genome_build}
-            for path in rule.output:
-                if "{chrom}" in path:
-                    chroms = CHROMS
-                else:
-                    chroms = ["-"]
-                if "{chrom_no_y}" in path:
-                    chroms_no_y = CHROMS_NO_Y
-                else:
-                    chroms_no_y = ["-"]
-                for chrom, chrom_no_y in product(chroms, chroms_no_y):
-                    path = re.sub(r"{([^,]+)(,.*)?}", r"{\1}", path)
-                    ALL_RESULT.append(path.format(chrom=chrom, chrom_no_y=chrom_no_y, **vals))
+        "annos/grch37/cadd/.done",
+        "annos/grch37/dbnsfp-4.4a/.done",
+        "annos/grch37/dbnsfp-4.4c/.done",
+        "annos/grch37/dbscsnv/.done",
+        "annos/grch37/helixmtdb/helixmtdb.vcf",
+        "annos/grch37/gnomad_mtdna/gnomad_mtdna.vcf.gz",
+        "annos/grch37/ucsc_conservation/ucsc_conservation.tsv",
+        "annos/grch37/dbsnp/dbsnp.vcf.gz",
+        "annos/grch37/gnomad_exomes/.done",
+        "annos/grch37/gnomad_genomes/.done",
+        "annos/grch38/cadd/.done",
+        "annos/grch38/dbnsfp-4.4a/.done",
+        "annos/grch38/dbnsfp-4.4c/.done",
+        "annos/grch38/gnomad_exomes/.done",
+        "annos/grch38/gnomad_genomes/.done",
+        "annos/grch38/gnomad_mtdna/gnomad_mtdna.vcf.gz",
+        "annos/grch38/helixmtdb/helixmtdb.vcf",
+        "features/grch37/tads/imr90.bed",
+        "features/grch37/tads/hesc.bed",
+        "features/grch37/gene_regions/refseq.bed.gz",
+        "features/grch37/gene_regions/ensembl.bed.gz",
+        "features/grch37/masked/repeat.bed.gz",
+        "features/grch37/masked/segdup.bed.gz",
+        "genes/hgnc/hgnc_info.jsonl",
+        "genes/ncbi/gene_info.jsonl",
+        "genes/dbnsfp/genes.tsv.gz",
+        "genes/xlink/ensembl.tsv",
+        "genes/xlink/hgnc.tsv",
+        "genes/mim2gene/mim2gene.tsv",
+        "tracks/grch37/ucsc_genomicSuperDups.bed.gz",
+        "tracks/grch37/ucsc_rmsk.bed.gz",
+        "tracks/grch37/ucsc_fixSeqLiftOverPsl.bed.gz",
+        "tracks/grch37/ucsc_altSeqLiftOverPsl.bed.gz",
+        "vardbs/grch37/strucvar/clinvar.bed.gz",
+        "vardbs/grch37/strucvar/dbvar.bed.gz",
+        "vardbs/grch37/strucvar/dgv.bed.gz",
+        "vardbs/grch37/strucvar/dgv_gs.bed.gz",
+        "vardbs/grch37/strucvar/g1k.bed.gz",
+        "vardbs/grch37/strucvar/gnomad_sv.bed.gz",
+        "vardbs/grch37/strucvar/exac.bed.gz",
+
+
+include: "snakefiles/annos.smk"
+include: "snakefiles/genes.smk"
+include: "snakefiles/features.smk"
+include: "snakefiles/vardbs-grch37-strucvars.smk"
+include: "snakefiles/tracks-grch37.smk"
+include: "snakefiles/reference.smk"
@@ -0,0 +1,76 @@
+import glob
+from itertools import product
+import os
+import re
+import sys
+import textwrap
+import json
+
+from snakemake import shell
+from tools.sv_db_to_tsv import to_tsv
+
+
+# Ensure that the configuration file exists and then load it.
+if not os.path.exists("config.yaml"):
+    print("No config.yaml exists yet. Try `cp config.yaml.example config.yaml`.", file=sys.stderr)
+    sys.exit(1)
+
+
+configfile: "config.yaml"
+
+
+# Print configuration.
+print("Configuration:", file=sys.stderr)
+print("\n---\n%s\n---\n" % json.dumps(config, indent="  "), file=sys.stderr)
+
+#: Use strict mode and also print each command.
+shell.prefix("set -x; set -euo pipefail; ")
+
+#: The canonical chromosome names (without "Y").
+CHROMS_NO_Y = list(map(str, range(1, 23))) + ["X"]
+
+#: The canonical chromosome names.
+CHROMS = CHROMS_NO_Y + ["Y"]
+
+#: List for collecting all result files below.
+ALL_RESULT = []
+
+
+def input_all(wildcards):
+    return ALL_RESULT
+
+
+rule all:
+    input:
+        input_all,
+
+
+# Load all snakemake files `snakefiles/*/*.smk`.
+snakefiles = list(sorted(glob.glob("snakefiles/*/*.smk")))
+print("Loading Snakefiles...", file=sys.stderr)
+
+for path in snakefiles:
+
+    include: path
+
+
+# Derive the overall output files from all rules whose names start with the prefix "result_".
+print("Constructing list of output files from all `result_*` rules...\n", file=sys.stderr)
+
+ALL_RESULT = []
+for rule in workflow.rules:
+    if rule.name.startswith("result_"):
+        for genome_build in ("GRCh37", "GRCh38"):
+            vals = {**config, "genome_build": genome_build}
+            for path in rule.output:
+                if "{chrom}" in path:
+                    chroms = CHROMS
+                else:
+                    chroms = ["-"]
+                if "{chrom_no_y}" in path:
+                    chroms_no_y = CHROMS_NO_Y
+                else:
+                    chroms_no_y = ["-"]
+                for chrom, chrom_no_y in product(chroms, chroms_no_y):
+                    path = re.sub(r"{([^,]+)(,.*)?}", r"{\1}", path)
+                    ALL_RESULT.append(path.format(chrom=chrom, chrom_no_y=chrom_no_y, **vals))
@@ -2,10 +2,10 @@
 
 # String to use for the download date of data that is not explicitly versioned such as
 # ENSEMBL BioMart or VISTA.
-download_date: 20210728
+download_date: 20230115
 
 # String to use for the release name.
-release_name: 20210728b
+release_name: bollonaster
 
 # Memory to use for sorting (passed to sort -S).
 sort_memory: 16G
@@ -4,16 +4,23 @@ channels:
 - conda-forge
 - defaults
 dependencies:
-- python =3.9
-- snakemake =7.8
-- bedops ==2.4.35
+- python =3.10
+- snakemake-minimal =7
+- bedops =2
 - bcftools =1.15
-- htslib =1.15
-- samtools =1.15
-- drmaa ==0.7.9
-- bedtools ==2.27.1
+- htslib =1.16
+- samtools =1.16
+- bedtools =2.30
 - var-agg ==0.1.1
 - lftp
-- varfish-annotator-cli ==0.16
-- jannovar-cli
 - snakefmt
+- interval-binning =1
+- vcfpy
+- tqdm
+- prov =2
+- attrs
+- cattrs
+- jq
+- aria2
+- pigz
+- qsv
@@ -0,0 +1,21 @@
+{
+  "dc:format": "text/x-bed",
+  "dc:identifier": "features/GRCh37/gene-regions/ensembl.bed:r87",
+  "dc:title": "ENSEMBL gene regions (r87) for GRCh37",
+  "dc:description": "This BED file contains the ENSEMBL gene regions (release r87 from 2017).",
+  "dc:created": "2023-02-06",
+  "dc:creator": "ENSEMBL Team",
+  "dc:contributor": [
+      "VarFish Developer Team"
+  ],
+  "dc:source": [
+      "PMID:34791404",
+      "http://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/"
+  ],
+  "tsv:columns": {
+    "chrom": "Chromosome name without chr prefix",
+    "begin": "0-based begin position",
+    "end": "0-based end position",
+    "ensembl_gene_id": "ENSEMBL gene ID"
+  }
+}
@@ -0,0 +1,21 @@
+{
+  "dc:format": "text/x-bed",
+  "dc:identifier": "features/GRCh37/gene-regions/refseq.bed:105.20220307",
+  "dc:title": "NCBI RefSeq gene regions (105.20220307) for GRCh37",
+  "dc:description": "This BED file contains the RefSeq gene regions (release 105.20220307).",
+  "dc:created": "2023-02-06",
+  "dc:creator": "NCBI",
+  "dc:contributor": [
+      "VarFish Developer Team"
+  ],
+  "dc:source": [
+      "PMID:24259432",
+      "https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Homo_sapiens/105.20220307/"
+  ],
+  "tsv:columns": {
+    "chrom": "Chromosome name without chr prefix",
+    "begin": "0-based begin position",
+    "end": "0-based end position",
+    "entrez_id": "The Entrez/NCBI gene ID"
+  }
+}
@@ -0,0 +1,20 @@
+{
+  "dc:format": "text/x-bed",
+  "dc:identifier": "features/GRCh37/masked/repeat.bed.gz:2020-02-20",
+  "dc:title": "Repeat Masked Sequence",
+  "dc:description": "This BED file contains the repeat-masked sequence from the UCSC Genome Browser as created by RepeatMasker.",
+  "dc:created": "2023-02-06",
+  "dc:creator": "UCSC Genome Browser Team",
+  "dc:contributor": [
+      "VarFish Developer Team"
+  ],
+  "dc:source": [
+      "https://genome-euro.ucsc.edu/cgi-bin/hgTrackUi?db=hg19&g=rmsk"
+  ],
+  "tsv:columns": {
+    "chrom": "Chromosome name without chr prefix",
+    "begin": "0-based begin position",
+    "end": "0-based end position",
+    "label": "label describing the repeat"
+  }
+}
@@ -0,0 +1,20 @@
+{
+  "dc:format": "text/x-bed",
+  "dc:identifier": "features/GRCh37/masked/segdup.bed.gz:2020-02-20",
+  "dc:title": "Segmental Duplications",
+  "dc:description": "This BED contains the segmental duplication annotation from UCSC Genome Browser.",
+  "dc:created": "2011-09-26",
+  "dc:creator": "UCSC Genome Browser Team",
+  "dc:contributor": [
+      "VarFish Developer Team"
+  ],
+  "dc:source": [
+      "https://genome-euro.ucsc.edu/cgi-bin/hgc?db=hg19&g=genomicSuperDups"
+  ],
+  "tsv:columns": {
+    "chrom": "Chromosome name without chr prefix",
+    "begin": "0-based begin position",
+    "end": "0-based end position",
+    "label": "label describing the segmental duplication"
+  }
+}