bulik · explodecomputer · Feb 9, 2019 · Feb 9, 2019 · Feb 9, 2019 · Feb 11, 2019
diff --git a/environment.yml b/environment.yml
@@ -9,5 +9,6 @@ dependencies:
 - pip
 - pip:
   - scipy==0.18
-  - pandas==0.20
+  - pandas==0.23.4
   - numpy==1.16
+- pysam
diff --git a/ldsc.py b/ldsc.py
@@ -490,7 +490,8 @@ def ldscore(args, log):
 # Basic Flags for Working with Variance Components
 parser.add_argument('--h2', default=None, type=str,
     help='Filename for a .sumstats[.gz] file for one-phenotype LD Score regression. '
-    '--h2 requires at minimum also setting the --ref-ld and --w-ld flags.')
+    '--h2 requires at minimum also setting the --ref-ld and --w-ld flags.'
+    'Can alternatively provide a vcf/vcf.gz/bcf of GWAS summary data in which case ideally you should also specify --snplist flag to list the rs IDs to retain for the analysis.')
 parser.add_argument('--h2-cts', default=None, type=str,
     help='Filename for a .sumstats[.gz] file for cell-type-specific analysis. '
     '--h2-cts requires the --ref-ld-chr, --w-ld, and --ref-ld-chr-cts flags.')
@@ -543,6 +544,8 @@ def ldscore(args, log):
 parser.add_argument('--ref-ld-chr-cts', default=None, type=str,
     help='Name of a file that has a list of file name prefixes for cell-type-specific analysis.')
 parser.add_argument('--print-all-cts', action='store_true', default=False)
+parser.add_argument('--snplist', default=None, type=str,
+    help='Filename for a .snplist[.gz] file which lists the rs IDs to extract from a bcf file being analysed by --h2. One rsID per line.')
 
 # Flags for both LD Score estimation and h2/gencor estimation
 parser.add_argument('--print-cov', default=False, action='store_true',

diff --git a/ldscore/parse.py b/ldscore/parse.py
@@ -10,6 +10,8 @@
 import pandas as pd
 import os
 import glob
+from pysam import VariantFile
+import gzip
 
 
 def series_eq(x, y):
@@ -51,13 +53,18 @@ def which_compression(fh):
         compression = None
     else:
         raise IOError('Could not open {F}[./gz/bz2]'.format(F=fh))
-
     return suffix, compression
 
 
 def get_compression(fh):
     '''Which sort of compression should we use with read_csv?'''
-    if fh.endswith('gz'):
+    if fh.endswith('vcf'):
+        compression = 'bcf'
+    elif fh.endswith('vcf.gz'):
+        compression = 'bcf'
+    elif fh.endswith('bcf'):
+        compression = 'bcf'
+    elif fh.endswith('gz'):
         compression = 'gzip'
     elif fh.endswith('bz2'):
         compression = 'bz2'
@@ -77,25 +84,111 @@ def read_cts(fh, match_snps):
     return cts.ANNOT.values
 
 
-def sumstats(fh, alleles=False, dropna=True):
+def sumstats(fh, alleles=False, dropna=True, slh=None):
     '''Parses .sumstats files. See docs/file_formats_sumstats.txt.'''
     dtype_dict = {'SNP': str,   'Z': float, 'N': float, 'A1': str, 'A2': str}
     compression = get_compression(fh)
     usecols = ['SNP', 'Z', 'N']
     if alleles:
         usecols += ['A1', 'A2']
 
-    try:
-        x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
-    except (AttributeError, ValueError) as e:
-        raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
+    if compression == 'bcf':
+        try:
+            x = read_vcf(fh, alleles, slh)
+        except (AttributeError, ValueError) as e:
+            raise ValueError('Improperly formatted bcf/vcf file: ' + str(e.args))
+    else:
+        try:
+            x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
+        except (AttributeError, ValueError) as e:
+            raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
 
     if dropna:
         x = x.dropna(how='any')
 
     return x
 
 
+
+def read_vcf(fh, alleles, slh=None):
+    vcf_in = VariantFile(fh)
+    sample = list(vcf_in.header.samples)[0]
+    availcols = next(vcf_in.fetch()).format.keys()
+    vcf_in.seek(0)
+
+    # Check if sample size info is in header
+    global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0]
+    if alleles:
+        dtype_dict = {'SNP': str,   'Z': float, 'N': float, 'A1': str, 'A2': str}
+        usecols = list(dtype_dict.keys())
+
+        # Read in data
+        if 'SS' in availcols:
+            o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()]
+            N = pd.Series([x[2] for x in o], dtype='float')
+        else:
+            o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()]
+            if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys():
+                N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float')
+            elif 'TotalControls' in global_fields.keys():
+                N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float')
+            else:
+                N = pd.Series([np.NaN] * len(o), dtype='float')
+
+        p = pd.DataFrame(
+            {'SNP': pd.Series([x[0] for x in o], dtype='str'),
+            'Z': pd.Series([x[1] for x in o], dtype='float'),
+            'N': N,
+            'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'),
+            'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')}
+        )
+    else:
+        dtype_dict = {'SNP': str,   'Z': float, 'N': float}
+        usecols = list(dtype_dict.keys())
+        if 'SS' in availcols:
+            o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()]
+            N = pd.Series([x[2] for x in o], dtype='float')
+        else:
+            o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0]] for rec in vcf_in.fetch()]
+            if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys():
+                N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float')
+            elif 'TotalControls' in global_fields.keys():
+                N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float')
+            else:
+                N = pd.Series([np.NaN] * len(o), dtype='float')
+
+        p = pd.DataFrame(
+            {'SNP': pd.Series([x[0] for x in o], dtype='str'),
+            'Z': pd.Series([x[1] for x in o], dtype='float'),
+            'N': N}
+        )
+
+    vcf_in.close()
+
+    if slh is not None:
+        compression = get_compression(slh)
+        sl = []
+        if compression == "gzip":
+            try:
+                with gzip.open(slh) as f:
+                    for line in f:
+                        sl.append(line.strip())
+            except (AttributeError, ValueError) as e:
+                raise ValueError('Improperly formatted snplist file: ' + str(e.args))
+        else:
+            try:
+                with open(slh) as f:
+                    for line in f:
+                        sl.append(line.strip())
+            except (AttributeError, ValueError) as e:
+                raise ValueError('Improperly formatted snplist file: ' + str(e.args))
+        f.close()
+        p = p.loc[p['SNP'].isin(sl)]
+
+    return(p)
+
+
+
 def ldscore_fromlist(flist, num=None):
     '''Sideways concatenation of a list of LD Score files.'''
     ldscore_array = []

diff --git a/ldscore/sumstats.py b/ldscore/sumstats.py
@@ -160,7 +160,11 @@ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
 def _read_sumstats(args, log, fh, alleles=False, dropna=False):
     '''Parse summary statistics.'''
     log.log('Reading summary statistics from {S} ...'.format(S=fh))
-    sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
+    if args.snplist:
+        log.log('and extracting SNPs specified in {S} ...'.format(S=args.snplist))
+        sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna, slh=args.snplist)
+    else:
+        sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
     log_msg = 'Read summary statistics for {N} SNPs.'
     log.log(log_msg.format(N=len(sumstats)))
     m = len(sumstats)

diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ pybedtools>=0.7,<0.8
 scipy>=0.18,<0.19
 numpy>=1.16,<1.17
 pandas>=0.20,<0.21
+pysam
diff --git a/test/test_parse.py b/test/test_parse.py
@@ -21,6 +21,9 @@ def test_series_eq():
 
 
 def test_get_compression():
+    assert_equal(ps.get_compression('vcf'), 'bcf')
+    assert_equal(ps.get_compression('bcf'), 'bcf')
+    assert_equal(ps.get_compression('vcf.gz'), 'bcf')
     assert_equal(ps.get_compression('gz'), 'gzip')
     assert_equal(ps.get_compression('bz2'), 'bz2')
     assert_equal(ps.get_compression('asdf'), None)
@@ -42,6 +45,18 @@ def test_read_sumstats():
     assert_raises(ValueError, ps.sumstats, os.path.join(
         DIR, 'parse_test/test.l2.ldscore.gz'))
 
+def test_read_vcf():
+    x1 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=True)
+    x2 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=False)
+    x3 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=True)
+    x4 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=False)
+    x5 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=True)
+    x6 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=False)
+    assert_equal(len(x1), len(x2))
+    assert_equal(len(x3), len(x4))
+    assert_equal(len(x5), len(x6))
+    assert('rs' in x1.SNP[0])
+    # assert_raises(IOError, ps.read_vcf, os.path.join(DIR, 'test/vcf_test/example1.vcf.gz'), alleles=True)
 
 def test_frq_parser():
     x = ps.frq_parser(os.path.join(DIR, 'parse_test/test1.frq'), compression=None)

diff --git a/test/vcf_test/example1.vcf.gz b/test/vcf_test/example1.vcf.gz
diff --git a/test/vcf_test/example2.vcf.gz b/test/vcf_test/example2.vcf.gz
diff --git a/test/vcf_test/example3.vcf.gz b/test/vcf_test/example3.vcf.gz