Port ldsc to Python 3 #360

Open · wants to merge 12 commits into master
3 changes: 3 additions & 0 deletions .gitignore
@@ -84,3 +84,6 @@ docs/_build/
 # sublime text
 *.idea*
 *sublime*
+
+# vscode
+.vscode/
2 changes: 2 additions & 0 deletions CHANGELOG
@@ -1,3 +1,5 @@
+30.01.23 v2.0.1; fix a bug introduced in "Fix globbing bug in splitp #221" that broke some flows
+11.10.22 v2.0.0; port to Python 3 and update to newer versions of pandas, numpy, and more
 02.01.20 Fix KeyError in allele_merge
 30.07.19 v1.0.1; update changelog and increment version for edits on this date (see #164)
 30.07.19 Fix to display of liability scale gencov, h2 in rg results log (see #162)
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
 
-# LDSC (LD SCore) `v1.0.1`
+# LDSC (LD SCore) `v2.0.1`
 
 `ldsc` is a command line tool for estimating heritability and genetic correlation from GWAS summary statistics. `ldsc` also computes LD Scores.
 
14 changes: 7 additions & 7 deletions environment.yml
@@ -2,12 +2,12 @@ name: ldsc
 channels:
 - bioconda
 dependencies:
-- python=2.7
-- bitarray=0.8
-- nose=1.3
-- pybedtools=0.7
+- python=3.10
 - pip
 - pip:
-  - scipy==0.18
-  - pandas==0.20
-  - numpy==1.16
+  - bitarray==2.6.0
+  - nose==1.3.7
+  - pybedtools==0.9.0
+  - scipy==1.9.2
+  - pandas==1.5.0
+  - numpy==1.23.3
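The environment now targets Python 3.10, with every third-party dependency pinned under pip. A minimal sketch, not part of the PR, for checking that an activated environment matches these pins (the `PINS` table below just restates the file):

```python
# Sketch: verify the active environment matches environment.yml pins.
from importlib.metadata import version, PackageNotFoundError

PINS = {
    "bitarray": "2.6.0",
    "nose": "1.3.7",
    "pybedtools": "0.9.0",
    "scipy": "1.9.2",
    "pandas": "1.5.0",
    "numpy": "1.23.3",
}

for pkg, want in PINS.items():
    try:
        got = version(pkg)
        status = "OK" if got == want else f"MISMATCH (have {got}, want {want})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{pkg:12s} {status}")
```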
47 changes: 24 additions & 23 deletions ldsc.py
@@ -8,7 +8,7 @@
 3. genetic covariance / correlation
 
 '''
-from __future__ import division
+
 import ldscore.ldscore as ld
 import ldscore.parse as ps
 import ldscore.sumstats as sumstats
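With Python 3, true division is the default, so `from __future__ import division` can simply be deleted; the same removal recurs in `ldscore/irwls.py`, `ldscore/jackknife.py`, and `ldscore/ldscore.py` below. A two-line illustration of the semantics the code now gets for free:

```python
# Python 3 division defaults, formerly supplied by the __future__ import:
print(7 / 2)   # 3.5 -> '/' is true division
print(7 // 2)  # 3   -> floor division must be written explicitly
```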
@@ -18,6 +18,7 @@
 from subprocess import call
 from itertools import product
 import time, sys, traceback, argparse
+from functools import reduce
 
 
 try:
@@ -26,7 +27,7 @@
 except AttributeError:
     raise ImportError('LDSC requires pandas version >= 0.17.0')
 
-__version__ = '1.0.1'
+__version__ = '2.0.0'
 MASTHEAD = "*********************************************************************\n"
 MASTHEAD += "* LD Score Regression (LDSC)\n"
 MASTHEAD += "* Version {V}\n".format(V=__version__)
@@ -37,7 +38,7 @@
 pd.set_option('display.max_rows', 500)
 pd.set_option('display.max_columns', 500)
 pd.set_option('display.width', 1000)
-pd.set_option('precision', 4)
+pd.set_option('display.precision', 4)
 pd.set_option('max_colwidth',1000)
 np.set_printoptions(linewidth=1000)
 np.set_printoptions(precision=4)
@@ -73,15 +74,15 @@ class Logger(object):
 
     '''
     def __init__(self, fh):
-        self.log_fh = open(fh, 'wb')
+        self.log_fh = open(fh, 'w')
 
     def log(self, msg):
         '''
         Print to log file and stdout with a single command.
 
         '''
-        print >>self.log_fh, msg
-        print msg
+        print(msg, file=self.log_fh)
+        print(msg)
 
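This hunk shows both halves of the print port: Python 2's `print >>fh, msg` becomes `print(msg, file=fh)`, and since `print()` writes `str` rather than bytes, the log file opens in text mode (`'w'`, not `'wb'`). A minimal sketch of the same pattern, with `demo.log` as a hypothetical path:

```python
# Sketch of the ported logging pattern; "demo.log" is a hypothetical path.
log_fh = open("demo.log", "w")   # text mode: print() emits str, not bytes
print("hello", file=log_fh)      # Python 3 form of: print >>log_fh, "hello"
print("hello")                   # echoed to stdout as well
log_fh.close()
```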
def __filter__(fname, noun, verb, merge_obj):
Expand All @@ -90,12 +91,12 @@ def __filter__(fname, noun, verb, merge_obj):
f = lambda x,n: x.format(noun=noun, verb=verb, fname=fname, num=n)
x = ps.FilterFile(fname)
c = 'Read list of {num} {noun} to {verb} from {fname}'
print f(c, len(x.IDList))
print(f(c, len(x.IDList)))
merged_list = merge_obj.loj(x.IDList)
len_merged_list = len(merged_list)
if len_merged_list > 0:
c = 'After merging, {num} {noun} remain'
print f(c, len_merged_list)
print(f(c, len_merged_list))
else:
error_msg = 'No {noun} retained for analysis'
raise ValueError(f(error_msg, 0))
@@ -106,7 +107,7 @@ def annot_sort_key(s):
     '''For use with --cts-bin. Fixes weird pandas crosstab column order.'''
     if type(s) == tuple:
         s = [x.split('_')[0] for x in s]
-        s = map(lambda x: float(x) if x != 'min' else -float('inf'), s)
+        s = [float(x) if x != 'min' else -float('inf') for x in s]
     else:  # type(s) = str:
         s = s.split('_')[0]
         if s == 'min':
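In Python 3, `map()` returns a lazy iterator (as does `range`, which replaces `xrange`), so any result that is later indexed, sorted, or reused has to be materialized; hence the list comprehension above. A small demonstration with toy values:

```python
# map() is lazy in Python 3; the comprehension yields the list the old code expected.
vals = ['min', '0.5', '2.0']
m = map(lambda x: float(x) if x != 'min' else -float('inf'), vals)
lst = [float(x) if x != 'min' else -float('inf') for x in vals]

print(m)        # <map object ...> : cannot be indexed or passed to len()
print(lst[0])   # -inf             : the list supports indexing, sorting, reuse
print(list(m))  # [-inf, 0.5, 2.0] : an iterator must be drained to get values
```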
@@ -184,7 +185,7 @@ def ldscore(args, log):
             raise ValueError(msg)
 
     else:
-        cts_colnames = ['ANNOT'+str(i) for i in xrange(len(cts_fnames))]
+        cts_colnames = ['ANNOT'+str(i) for i in range(len(cts_fnames))]
 
     log.log('Reading numbers with which to bin SNPs from {F}'.format(F=args.cts_bin))
 
@@ -215,7 +216,7 @@ def ldscore(args, log):
         name_breaks[0] = 'min'
         name_breaks[-1] = 'max'
         name_breaks = [str(x) for x in name_breaks]
-        labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in xrange(n_breaks-1)]
+        labs = [name_breaks[i]+'_'+name_breaks[i+1] for i in range(n_breaks-1)]
         cut_vec = pd.Series(pd.cut(vec, bins=cut_breaks, labels=labs))
         cts_levs.append(cut_vec)
         full_labs.append(labs)
@@ -283,7 +284,7 @@ def ldscore(args, log):
 
     if args.ld_wind_snps:
         max_dist = args.ld_wind_snps
-        coords = np.array(xrange(geno_array.m))
+        coords = np.array(range(geno_array.m))
     elif args.ld_wind_kb:
         max_dist = args.ld_wind_kb*1000
         coords = np.array(array_snps.df['BP'])[geno_array.kept_snps]
@@ -339,7 +340,7 @@ def ldscore(args, log):
             F=args.print_snps, N=len(print_snps)))
 
         print_snps.columns=['SNP']
-        df = df.ix[df.SNP.isin(print_snps.SNP),:]
+        df = df.loc[df.SNP.isin(print_snps.SNP),:]
         if len(df) == 0:
             raise ValueError('After merging with --print-snps, no SNPs remain.')
         else:
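pandas removed the hybrid `.ix` indexer, and the replacement depends on intent: positional slices become `.iloc`, while label and boolean-mask selections become `.loc` (a boolean Series passed to `.iloc` raises in recent pandas, which is why the mask above takes `.loc`). A toy sketch of the distinction, with made-up data:

```python
# .loc vs .iloc after the removal of .ix (toy data, not from the PR).
import pandas as pd

df = pd.DataFrame({"SNP": ["rs1", "rs2", "rs3"], "L2": [1.0, 2.5, 0.7]})
keep = pd.Series(["rs1", "rs3"])

subset = df.loc[df.SNP.isin(keep), :]  # boolean mask -> .loc
first_two = df.iloc[0:2, :]            # positional slice -> .iloc

print(subset)
print(first_two)
```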
@@ -361,12 +362,12 @@ def ldscore(args, log):
 
     # print .M
-    fout_M = open(args.out + '.'+ file_suffix +'.M','wb')
-    print >>fout_M, '\t'.join(map(str,M))
+    fout_M = open(args.out + '.'+ file_suffix +'.M','w')
+    print('\t'.join(map(str,M)), file=fout_M)
     fout_M.close()
 
     # print .M_5_50
-    fout_M_5_50 = open(args.out + '.'+ file_suffix +'.M_5_50','wb')
-    print >>fout_M_5_50, '\t'.join(map(str,M_5_50))
+    fout_M_5_50 = open(args.out + '.'+ file_suffix +'.M_5_50','w')
+    print('\t'.join(map(str,M_5_50)), file=fout_M_5_50)
     fout_M_5_50.close()
 
     # print annot matrix
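`map(str, M)` can survive the port unchanged even though `map()` is now lazy, because `join()` consumes the iterator on the spot; the switch to text-mode `open` matters for the same reason as in `Logger`, since `print()` emits `str`. A compact sketch, with `demo.M` as a hypothetical output path:

```python
# Lazy map() is fine when consumed immediately, as in the .M writers above.
M = [9997, 1203, 455]                  # toy per-annotation SNP counts
line = '\t'.join(map(str, M))          # join() drains the iterator right away
with open("demo.M", "w") as fout_M:    # "demo.M" is a hypothetical output path
    print(line, file=fout_M)
```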
@@ -383,19 +384,19 @@ def ldscore(args, log):
     # print LD Score summary
     pd.set_option('display.max_rows', 200)
     log.log('\nSummary of LD Scores in {F}'.format(F=out_fname+l2_suffix))
-    t = df.ix[:,4:].describe()
-    log.log( t.ix[1:,:] )
+    t = df.iloc[:,4:].describe()
+    log.log( t.iloc[1:,:] )
 
     np.seterr(divide='ignore', invalid='ignore')  # print NaN instead of weird errors
     # print correlation matrix including all LD Scores and sample MAF
     log.log('')
     log.log('MAF/LD Score Correlation Matrix')
-    log.log( df.ix[:,4:].corr() )
+    log.log( df.iloc[:,4:].corr() )
 
     # print condition number
     if n_annot > 1:  # condition number of a column vector w/ nonzero var is trivially one
         log.log('\nLD Score Matrix Condition Number')
-        cond_num = np.linalg.cond(df.ix[:,5:])
+        cond_num = np.linalg.cond(df.iloc[:,5:])
         log.log( reg.remove_brackets(str(np.matrix(cond_num))) )
         if cond_num > 10000:
             log.log('WARNING: ill-conditioned LD Score Matrix!')
@@ -588,7 +589,7 @@
     try:
         defaults = vars(parser.parse_args(''))
         opts = vars(args)
-        non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
+        non_defaults = [x for x in list(opts.keys()) if opts[x] != defaults[x]]
         header = MASTHEAD
         header += "Call: \n"
         header += './ldsc.py \\\n'
@@ -647,9 +648,9 @@
 
         # bad flags
         else:
-            print header
-            print 'Error: no analysis selected.'
-            print 'ldsc.py -h describes options.'
+            print(header)
+            print('Error: no analysis selected.')
+            print('ldsc.py -h describes options.')
     except Exception:
         ex_type, ex, tb = sys.exc_info()
         log.log( traceback.format_exc(ex) )
10 changes: 5 additions & 5 deletions ldscore/irwls.py
@@ -4,9 +4,9 @@
 Iteratively re-weighted least squares.
 
 '''
-from __future__ import division
+
 import numpy as np
-import jackknife as jk
+from . import jackknife as jk
 
 
 class IRWLS(object):
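Python 3 removed implicit relative imports, so a sibling module inside the `ldscore` package needs an explicit form. Roughly the equivalents, assuming this repo's package layout:

```python
# Import forms for a sibling module inside the ldscore package
# (requires package context; shown here for comparison):
#
#   import jackknife as jk           # Python 2 implicit relative import; fails on Python 3
#   from . import jackknife as jk    # explicit relative import, as used in this PR
#   import ldscore.jackknife as jk   # absolute form, assuming the repo root is on sys.path

import importlib.util

# Python 3 resolves bare names absolutely, so 'jackknife' is not found as a top-level module:
print(importlib.util.find_spec("jackknife"))  # None (unless a 'jackknife' package is installed)
```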
@@ -109,10 +109,10 @@ def irwls(cls, x, y, update_func, n_blocks, w, slow=False, separators=None):
                 'w has shape {S}. w must have shape ({N}, 1).'.format(S=w.shape, N=n))
 
         w = np.sqrt(w)
-        for i in xrange(2):  # update this later
+        for i in range(2):  # update this later
             new_w = np.sqrt(update_func(cls.wls(x, y, w)))
             if new_w.shape != w.shape:
-                print 'IRWLS update:', new_w.shape, w.shape
+                print('IRWLS update:', new_w.shape, w.shape)
                 raise ValueError('New weights must have same shape.')
             else:
                 w = new_w
@@ -158,7 +158,7 @@ def wls(cls, x, y, w):
 
         x = cls._weight(x, w)
         y = cls._weight(y, w)
-        coef = np.linalg.lstsq(x, y)
+        coef = np.linalg.lstsq(x, y, rcond=None)
         return coef
 
     @classmethod
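Recent NumPy expects an explicit `rcond` for `np.linalg.lstsq`; `rcond=None` opts into the newer cutoff for small singular values (scaled by machine precision and matrix size) and silences the FutureWarning the bare call would emit. A self-contained toy example:

```python
# np.linalg.lstsq with the explicit rcond=None this PR adds (toy system).
import numpy as np

x = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])  # design matrix with intercept
y = np.array([[1.1], [1.9], [3.2]])

coef, residuals, rank, sv = np.linalg.lstsq(x, y, rcond=None)
print(coef.ravel())  # least-squares intercept and slope
```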
12 changes: 6 additions & 6 deletions ldscore/jackknife.py
@@ -12,7 +12,7 @@
 
 '''
 
-from __future__ import division
+
 import numpy as np
 from scipy.optimize import nnls
 np.seterr(divide='raise', invalid='raise')
@@ -218,7 +218,7 @@ def __init__(self, x, y, n_blocks=None, nn=False, separators=None):
             func = lambda x, y: np.atleast_2d(nnls(x, np.array(y).T[0])[0])
         else:
             func = lambda x, y: np.atleast_2d(
-                np.linalg.lstsq(x, np.array(y).T[0])[0])
+                np.linalg.lstsq(x, np.array(y).T[0], rcond=None)[0])
 
         self.est = func(x, y)
         self.delete_values = self.delete_values(x, y, func, self.separators)
@@ -256,7 +256,7 @@ def delete_values(cls, x, y, func, s):
         '''
         _check_shape(x, y)
         d = [func(np.vstack([x[0:s[i], ...], x[s[i + 1]:, ...]]), np.vstack([y[0:s[i], ...], y[s[i + 1]:, ...]]))
-             for i in xrange(len(s) - 1)]
+             for i in range(len(s) - 1)]
 
         return np.concatenate(d, axis=0)
 
@@ -346,7 +346,7 @@ def block_values(cls, x, y, s):
         n_blocks = len(s) - 1
         xtx_block_values = np.zeros((n_blocks, p, p))
         xty_block_values = np.zeros((n_blocks, p))
-        for i in xrange(n_blocks):
+        for i in range(n_blocks):
             xty_block_values[i, ...] = np.dot(
                 x[s[i]:s[i + 1], ...].T, y[s[i]:s[i + 1], ...]).reshape((1, p))
             xtx_block_values[i, ...] = np.dot(
@@ -417,7 +417,7 @@ def block_values_to_delete_values(cls, xty_block_values, xtx_block_values):
         delete_values = np.zeros((n_blocks, p))
         xty_tot = np.sum(xty_block_values, axis=0)
         xtx_tot = np.sum(xtx_block_values, axis=0)
-        for j in xrange(n_blocks):
+        for j in range(n_blocks):
             delete_xty = xty_tot - xty_block_values[j]
             delete_xtx = xtx_tot - xtx_block_values[j]
             delete_values[j, ...] = np.linalg.solve(
@@ -507,7 +507,7 @@ def delete_values_to_pseudovalues(cls, est, denom, numer):
         '''
         n_blocks, p = denom.shape
         pseudovalues = np.zeros((n_blocks, p))
-        for j in xrange(0, n_blocks):
+        for j in range(0, n_blocks):
             pseudovalues[j, ...] = n_blocks * est - \
                 (n_blocks - 1) * numer[j, ...] / denom[j, ...]
 
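The loop above computes standard delete-one-block jackknife pseudovalues, `pseudovalue_j = n_blocks * est - (n_blocks - 1) * delete_j`, with `delete_j` assembled here as `numer[j] / denom[j]`. A toy numeric sketch of the idea for a simple mean (not LDSC's actual estimator):

```python
# Block-jackknife pseudovalues for a simple mean (toy data, not LDSC's estimator).
import numpy as np

vals = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
n_blocks = 3
blocks = vals.reshape(n_blocks, -1)   # 3 blocks of 2 observations each

est = vals.mean()                     # whole-sample estimate
delete_ests = np.array([np.delete(blocks, j, axis=0).mean() for j in range(n_blocks)])
pseudovalues = n_blocks * est - (n_blocks - 1) * delete_ests

print(pseudovalues.mean())  # 3.5, equals est: the mean is a linear statistic
print(pseudovalues.std(ddof=1) / np.sqrt(n_blocks))  # block-jackknife standard error
```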
20 changes: 10 additions & 10 deletions ldscore/ldscore.py
@@ -1,4 +1,4 @@
-from __future__ import division
+
 import numpy as np
 import bitarray as ba
 
@@ -24,7 +24,7 @@ def getBlockLefts(coords, max_dist):
     M = len(coords)
     j = 0
     block_left = np.zeros(M)
-    for i in xrange(M):
+    for i in range(M):
         while j < M and abs(coords[j] - coords[i]) > max_dist:
             j += 1
 
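`getBlockLefts` is a two-pointer scan: for each SNP `i` it records the index of the leftmost SNP within `max_dist`, and because `j` only ever advances, the whole pass is O(M). A standalone restatement of the hunk's logic with toy coordinates (`get_block_lefts` is just an illustrative name):

```python
# Two-pointer window scan, as in getBlockLefts (standalone restatement, toy coords).
import numpy as np

def get_block_lefts(coords, max_dist):
    M = len(coords)
    j = 0
    block_left = np.zeros(M, dtype=int)
    for i in range(M):
        # advance j until coords[j] is within max_dist of coords[i]
        while j < M and abs(coords[j] - coords[i]) > max_dist:
            j += 1
        block_left[i] = j
    return block_left

print(get_block_lefts(np.array([100, 200, 350, 600]), max_dist=200))
# -> [0 0 1 3]: the last SNP (bp 600) is >200 bp from all earlier SNPs,
#    so its window starts at itself (index 3)
```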
@@ -51,7 +51,7 @@ def block_left_to_right(block_left):
     M = len(block_left)
     j = 0
     block_right = np.zeros(M)
-    for i in xrange(M):
+    for i in range(M):
         while j < M and block_left[j] <= i:
             j += 1
 
@@ -85,7 +85,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=
             self.n)
 
         if self.n > 0:
-            print 'After filtering, {n} individuals remain'.format(n=self.n)
+            print('After filtering, {n} individuals remain'.format(n=self.n))
         else:
             raise ValueError('After filtering, no individuals remain')
 
@@ -99,7 +99,7 @@ def __init__(self, fname, n, snp_list, keep_snps=None, keep_indivs=None, mafMin=
             self.geno, self.m, self.n, self.mafMin, keep_snps)
 
         if self.m > 0:
-            print 'After filtering, {m} SNPs remain'.format(m=self.m)
+            print('After filtering, {m} SNPs remain'.format(m=self.m))
         else:
             raise ValueError('After filtering, no SNPs remain')
 
@@ -190,7 +190,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
         rfuncAB = np.zeros((b, c))
         rfuncBB = np.zeros((c, c))
         # chunk inside of block
-        for l_B in xrange(0, b, c):  # l_B := index of leftmost SNP in matrix B
+        for l_B in range(0, b, c):  # l_B := index of leftmost SNP in matrix B
             B = A[:, l_B:l_B+c]
             np.dot(A.T, B / n, out=rfuncAB)
             rfuncAB = func(rfuncAB)
@@ -199,7 +199,7 @@ def __corSumVarBlocks__(self, block_left, c, func, snp_getter, annot=None):
         b0 = b
         md = int(c*np.floor(m/c))
         end = md + 1 if md != m else md
-        for l_B in xrange(b0, end, c):
+        for l_B in range(b0, end, c):
             # check if the annot matrix is all zeros for this block + chunk
             # this happens w/ sparse categories (i.e., pathways)
             # update the block
@@ -294,7 +294,7 @@ def __filter_indivs__(self, geno, keep_indivs, m, n):
         nru_new = n_new + e
         nru = self.nru
         z = ba.bitarray(m*2*nru_new, endian="little")
-        z.setall(0)
+        z.setall(0)
         for e, i in enumerate(keep_indivs):
             z[2*e::2*nru_new] = geno[2*i::2*nru]
             z[2*e+1::2*nru_new] = geno[2*i+1::2*nru]
@@ -335,7 +335,7 @@ def __filter_snps_maf__(self, geno, m, n, mafMin, keep_snps):
         m_poly = 0
         y = ba.bitarray()
         if keep_snps is None:
-            keep_snps = xrange(m)
+            keep_snps = range(m)
         kept_snps = []
         freq = []
         for e, j in enumerate(keep_snps):
@@ -397,7 +397,7 @@ def nextSNPs(self, b, minorRef=None):
         X = np.array(slice.decode(self._bedcode), dtype="float64").reshape((b, nru)).T
         X = X[0:n, :]
         Y = np.zeros(X.shape)
-        for j in xrange(0, b):
+        for j in range(0, b):
             newsnp = X[:, j]
             ii = newsnp != 9
             avg = np.mean(newsnp[ii])