Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow GWAS-VCF files to be read by ldsc #289

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
3 changes: 2 additions & 1 deletion environment.yml
Expand Up @@ -9,5 +9,6 @@ dependencies:
- pip
- pip:
- scipy==0.18
- pandas==0.20
- pandas==0.23.4
- numpy==1.16
- pysam
5 changes: 4 additions & 1 deletion ldsc.py
Expand Up @@ -490,7 +490,8 @@ def ldscore(args, log):
# Basic Flags for Working with Variance Components
parser.add_argument('--h2', default=None, type=str,
help='Filename for a .sumstats[.gz] file for one-phenotype LD Score regression. '
'--h2 requires at minimum also setting the --ref-ld and --w-ld flags.')
'--h2 requires at minimum also setting the --ref-ld and --w-ld flags.'
'Can alternatively provide a vcf/vcf.gz/bcf of GWAS summary data in which case ideally you should also specify --snplist flag to list the rs IDs to retain for the analysis.')
parser.add_argument('--h2-cts', default=None, type=str,
help='Filename for a .sumstats[.gz] file for cell-type-specific analysis. '
'--h2-cts requires the --ref-ld-chr, --w-ld, and --ref-ld-chr-cts flags.')
Expand Down Expand Up @@ -543,6 +544,8 @@ def ldscore(args, log):
parser.add_argument('--ref-ld-chr-cts', default=None, type=str,
help='Name of a file that has a list of file name prefixes for cell-type-specific analysis.')
parser.add_argument('--print-all-cts', action='store_true', default=False)
parser.add_argument('--snplist', default=None, type=str,
help='Filename for a .snplist[.gz] file which lists the rs IDs to extract from a bcf file being analysed by --h2. One rsID per line.')

# Flags for both LD Score estimation and h2/gencor estimation
parser.add_argument('--print-cov', default=False, action='store_true',
Expand Down
107 changes: 100 additions & 7 deletions ldscore/parse.py
Expand Up @@ -10,6 +10,8 @@
import pandas as pd
import os
import glob
from pysam import VariantFile
import gzip


def series_eq(x, y):
Expand Down Expand Up @@ -51,13 +53,18 @@ def which_compression(fh):
compression = None
else:
raise IOError('Could not open {F}[./gz/bz2]'.format(F=fh))

return suffix, compression


def get_compression(fh):
'''Which sort of compression should we use with read_csv?'''
if fh.endswith('gz'):
if fh.endswith('vcf'):
compression = 'bcf'
elif fh.endswith('vcf.gz'):
compression = 'bcf'
elif fh.endswith('bcf'):
compression = 'bcf'
elif fh.endswith('gz'):
compression = 'gzip'
elif fh.endswith('bz2'):
compression = 'bz2'
Expand All @@ -77,25 +84,111 @@ def read_cts(fh, match_snps):
return cts.ANNOT.values


def sumstats(fh, alleles=False, dropna=True):
def sumstats(fh, alleles=False, dropna=True, slh=None):
'''Parses .sumstats files. See docs/file_formats_sumstats.txt.'''
dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
compression = get_compression(fh)
usecols = ['SNP', 'Z', 'N']
if alleles:
usecols += ['A1', 'A2']

try:
x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
except (AttributeError, ValueError) as e:
raise ValueError('Improperly formatted sumstats file: ' + str(e.args))
if compression == 'bcf':
try:
x = read_vcf(fh, alleles, slh)
except (AttributeError, ValueError) as e:
raise ValueError('Improperly formatted bcf/vcf file: ' + str(e.args))
else:
try:
x = read_csv(fh, usecols=usecols, dtype=dtype_dict, compression=compression)
except (AttributeError, ValueError) as e:
raise ValueError('Improperly formatted sumstats file: ' + str(e.args))

if dropna:
x = x.dropna(how='any')

return x



def read_vcf(fh, alleles, slh=None):
vcf_in = VariantFile(fh)
sample = list(vcf_in.header.samples)[0]
availcols = next(vcf_in.fetch()).format.keys()
vcf_in.seek(0)

# Check if sample size info is in header
global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0]
if alleles:
dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
usecols = list(dtype_dict.keys())

# Read in data
if 'SS' in availcols:
o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()]
N = pd.Series([x[2] for x in o], dtype='float')
else:
o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.alts[0], rec.ref] for rec in vcf_in.fetch()]
if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys():
N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float')
elif 'TotalControls' in global_fields.keys():
N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float')
else:
N = pd.Series([np.NaN] * len(o), dtype='float')

p = pd.DataFrame(
{'SNP': pd.Series([x[0] for x in o], dtype='str'),
'Z': pd.Series([x[1] for x in o], dtype='float'),
'N': N,
'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'),
'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')}
)
else:
dtype_dict = {'SNP': str, 'Z': float, 'N': float}
usecols = list(dtype_dict.keys())
if 'SS' in availcols:
o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0], rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()]
N = pd.Series([x[2] for x in o], dtype='float')
else:
o = [[rec.id, rec.samples[sample]['ES'][0]/rec.samples[sample]['SE'][0]] for rec in vcf_in.fetch()]
if 'TotalControls' in global_fields.keys() and 'TotalCases' in global_fields.keys():
N = pd.Series([float(global_fields['TotalControls']) + float(global_fields['TotalCases'])] * len(o), dtype='float')
elif 'TotalControls' in global_fields.keys():
N = pd.Series([float(global_fields['TotalControls'])] * len(o), dtype='float')
else:
N = pd.Series([np.NaN] * len(o), dtype='float')

p = pd.DataFrame(
{'SNP': pd.Series([x[0] for x in o], dtype='str'),
'Z': pd.Series([x[1] for x in o], dtype='float'),
'N': N}
)

vcf_in.close()

if slh is not None:
compression = get_compression(slh)
sl = []
if compression == "gzip":
try:
with gzip.open(slh) as f:
for line in f:
sl.append(line.strip())
except (AttributeError, ValueError) as e:
raise ValueError('Improperly formatted snplist file: ' + str(e.args))
else:
try:
with open(slh) as f:
for line in f:
sl.append(line.strip())
except (AttributeError, ValueError) as e:
raise ValueError('Improperly formatted snplist file: ' + str(e.args))
f.close()
p = p.loc[p['SNP'].isin(sl)]

return(p)



def ldscore_fromlist(flist, num=None):
'''Sideways concatenation of a list of LD Score files.'''
ldscore_array = []
Expand Down
6 changes: 5 additions & 1 deletion ldscore/sumstats.py
Expand Up @@ -160,7 +160,11 @@ def _read_chr_split_files(chr_arg, not_chr_arg, log, noun, parsefunc, **kwargs):
def _read_sumstats(args, log, fh, alleles=False, dropna=False):
'''Parse summary statistics.'''
log.log('Reading summary statistics from {S} ...'.format(S=fh))
sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
if args.snplist:
log.log('and extracting SNPs specified in {S} ...'.format(S=args.snplist))
sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna, slh=args.snplist)
else:
sumstats = ps.sumstats(fh, alleles=alleles, dropna=dropna)
log_msg = 'Read summary statistics for {N} SNPs.'
log.log(log_msg.format(N=len(sumstats)))
m = len(sumstats)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -4,3 +4,4 @@ pybedtools>=0.7,<0.8
scipy>=0.18,<0.19
numpy>=1.16,<1.17
pandas>=0.20,<0.21
pysam
15 changes: 15 additions & 0 deletions test/test_parse.py
Expand Up @@ -21,6 +21,9 @@ def test_series_eq():


def test_get_compression():
assert_equal(ps.get_compression('vcf'), 'bcf')
assert_equal(ps.get_compression('bcf'), 'bcf')
assert_equal(ps.get_compression('vcf.gz'), 'bcf')
assert_equal(ps.get_compression('gz'), 'gzip')
assert_equal(ps.get_compression('bz2'), 'bz2')
assert_equal(ps.get_compression('asdf'), None)
Expand All @@ -42,6 +45,18 @@ def test_read_sumstats():
assert_raises(ValueError, ps.sumstats, os.path.join(
DIR, 'parse_test/test.l2.ldscore.gz'))

def test_read_vcf():
x1 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=True)
x2 = ps.read_vcf(os.path.join(DIR, "vcf_test/example1.vcf.gz"), alleles=False)
x3 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=True)
x4 = ps.read_vcf(os.path.join(DIR, "vcf_test/example2.vcf.gz"), alleles=False)
x5 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=True)
x6 = ps.read_vcf(os.path.join(DIR, "vcf_test/example3.vcf.gz"), alleles=False)
assert_equal(len(x1), len(x2))
assert_equal(len(x3), len(x4))
assert_equal(len(x5), len(x6))
assert('rs' in x1.SNP[0])
# assert_raises(IOError, ps.read_vcf, os.path.join(DIR, 'test/vcf_test/example1.vcf.gz'), alleles=True)

def test_frq_parser():
x = ps.frq_parser(os.path.join(DIR, 'parse_test/test1.frq'), compression=None)
Expand Down
Binary file added test/vcf_test/example1.vcf.gz
Binary file not shown.
Binary file added test/vcf_test/example2.vcf.gz
Binary file not shown.
Binary file added test/vcf_test/example3.vcf.gz
Binary file not shown.