Merge pull request #37 from computational-metabolomics/new_features
Update portals, peaklist and peak_matrix models, and fix minor bugs
RJMW committed May 29, 2018
2 parents cde30e3 + 02eeca7 commit ddcaf23
Showing 15 changed files with 116 additions and 73 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -32,7 +32,7 @@ will help you to make the PR if you are new to `git`.
Developers & Contributors
-------------------------
- Ralf J. M. Weber (r.j.weber@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
- Jiarui (Albert) Zhou (j.zhou.3@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
- Jiarui (Albert) Zhou (j.zhou.3@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_, `HIT Shenzhen (China) <http://www.hitsz.edu.cn>`_
- Thomas N. Lawson (tnl495@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_


4 changes: 3 additions & 1 deletion dimspy/models/peak_matrix.py
@@ -401,6 +401,7 @@ def rsd(self, *args, **kwargs):
:param args: tags or untyped tag values for RSD calculation, no value = calculate over all samples
:param kwargs: typed tags for RSD calculation, no value = calculate over all samples
:param on_attr: calculate RSD on given attribute. Default = "intensity"
:param flagged_only: whether to calculate on flagged peaks only. Default = True
:type: numpy array
@@ -413,6 +414,7 @@ def rsd(self, *args, **kwargs):
corresponding rsd value will be set to np.nan.
"""
on_attr = kwargs.pop('on_attr') if kwargs.has_key('on_attr') else 'intensity'
flagged_only = kwargs.pop('flagged_only') if kwargs.has_key('flagged_only') else True

if self.shape[0] < 2:
@@ -423,7 +425,7 @@ def rsd(self, *args, **kwargs):
unmask_peakmatrix(self, *args, **kwargs)) as m:
if m.shape[0] == 0: raise AttributeError('peak matrix does not have label(s) [%s]' %
join(map(lambda x: str(x)[1:-1], (args, kwargs)), ', '))
ints = m.attr_matrix('intensity', flagged_only)
ints = m.attr_matrix(on_attr, flagged_only)
rsd = m._present_std(ints, 0, flagged_only) / m._present_mean(ints, 0, flagged_only) * 100

rsd[np.where(map(lambda x: len(set(x[np.nonzero(x)])) == 1, ints.T))] = np.nan # only one valid value
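
A minimal usage sketch of the extended signature; the peak matrix `pm` and the "QC" tag value are assumed, not part of this change:

    rsd_qc = pm.rsd("QC")                                            # default: RSD of intensity, flagged peaks only
    rsd_all = pm.rsd("QC", on_attr="intensity", flagged_only=False)  # include unflagged peaks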
5 changes: 5 additions & 0 deletions dimspy/models/peaklist.py
@@ -140,11 +140,16 @@ def ID(self):
Property of the peaklist ID.
:getter: returns the peaklist ID
:setter: set the peaklist ID
:type: str (ID values are coerced to str by the setter)
"""
return self._id

@ID.setter
def ID(self, value):
self._id = str(value)
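
Minimal illustration of the coercion performed by the setter (a PeakList instance `pl` is assumed):

    pl.ID = 42
    assert pl.ID == "42"  # any input ID is stored as str
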

@property
def size(self):
"""
35 changes: 29 additions & 6 deletions dimspy/portals/mzml_portal.py
@@ -14,16 +14,21 @@
import pymzml
import numpy as np
import zipfile
from copy import deepcopy
from dimspy.models.peaklist import PeakList
from dimspy.experiment import mz_range_from_header


class Mzml:
def __init__(self, filename="", archive=None):
def __init__(self, filename="", archive=None, preload=True):
self.filename = filename
self.archive = archive
self._preload = preload
self._cache = None

def run(self):
if self._cache is not None: return self._cache

if not self.filename.lower().endswith(".mzml") and not self.filename.lower().endswith(".mzml.gz") and not self.filename.lower().endswith(".zip"):
raise IOError('Incorrect file format for mzML parser')
if self.archive is not None:
@@ -32,11 +37,15 @@ def run(self):
zf = zipfile.ZipFile(self.archive, 'r')
if self.filename not in zf.namelist():
raise IOError("{} does not exist in zip file".format(self.filename))
return pymzml.run.Reader('', file_object=zf.open(self.filename))
dat = pymzml.run.Reader('', file_object=zf.open(self.filename))
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
return dat
elif self.filename.lower().endswith(".mzml") or self.filename.lower().endswith(".mzml.gz"):
if not os.path.isfile(self.filename):
raise IOError("{} does not exist".format(self.filename))
return pymzml.run.Reader(self.filename)
dat = pymzml.run.Reader(self.filename)
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
return dat
else:
return None
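
A short sketch of the new preload/caching behaviour; the file name is hypothetical:

    mzml = Mzml("sample.mzML", preload=True)
    scans = mzml.run()        # parsed once and cached as a tuple of copied spectra
    scans_again = mzml.run()  # second call returns the cached tuple without re-reading the file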

@@ -62,7 +71,10 @@ def peaklist(self, scan_id, function_noise="median"):
for scan in self.run():
if scan["id"] == scan_id:

mzs, ints = zip(*scan.peaks)
if len(scan.peaks) > 0:
mzs, ints = zip(*scan.peaks)
else:
mzs, ints = [], []

scan_time = scan["MS:1000016"]
tic = scan["total ion current"]
@@ -101,10 +113,21 @@ def tics(self):
# print self.run()[2]
for scan in self.run():
if scan["id"] == "TIC":
tics = zip(*scan.peaks)[1]
return tics
return zip(*scan.peaks)[1]
return

def injection_times(self):
injection_times = {}
for scan in self.run():
injection_times[scan['id']] = None
for element in scan.xmlTree:
if "MS:1000927" == element.get('accession'):
injection_times[scan['id']] = float(element.get("value"))
break
if scan['id'] not in injection_times:
injection_times[scan['id']] = None
return injection_times
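
Assumed usage of the new injection_times() method; values are read from accession MS:1000927 and are None when it is absent:

    times = Mzml("sample.mzML").injection_times()
    missing = [sid for sid, t in times.items() if t is None]  # scans without an ion injection time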

def scan_dependents(self):
l = []
for scan in self.run():
77 changes: 34 additions & 43 deletions dimspy/portals/thermo_raw_portal.py
@@ -24,66 +24,53 @@


def mz_range_from_header(h):
"""
Extract the m/z range from a scan filter header string.
:rtype: list
"""
return [float(m) for m in re.findall(r'([\w\.-]+)-([\w\.-]+)', h)[0]]


class ThermoRaw:
"""
Portal for extracting scan data from Thermo *.raw files.
"""

def __init__(self, filename):
self.run = RawFileReader.RawFileReaderAdapter.FileFactory(filename)
self.run.SelectInstrument(Business.Device.MS, 1)
self.filename = filename

def headers(self):
"""
Extract the scan filter headers and the scan ids associated with each from the *.raw file.
:rtype: dict
"""

sids = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
sids.setdefault(str(self.run.GetFilterForScanNumber(scan_id).Filter), []).append(scan_id)
return sids

def scan_ids(self):
"""
Extract the scan ids and their scan filter strings from the *.raw file.
:rtype: dict
"""

sids = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
sids[scan_id] = str(self.run.GetFilterForScanNumber(scan_id).Filter)
return sids

def peaklist(self, scan_id, function_noise="noise_packets"):
"""
Extract a particular scan from a *.raw file and return a PeakList object.

:param scan_id: the id of the scan to extract
:param function_noise: noise function used for the signal-to-noise calculation ("noise_packets", "mean", "median" or "mad")
:rtype: PeakList object
"""
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")

scan = self.run.GetCentroidStream(scan_id, False)

mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
mz_ibn.sort()
mzs, ints, baseline, noise = zip(*mz_ibn)
if scan.Masses is not None:
mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
mz_ibn.sort()
mzs, ints, baseline, noise = zip(*mz_ibn)
else:
mzs, ints, baseline, noise = [], [], [], []

if function_noise == "noise_packets":
snr = [p.SignalToNoise for p in scan.GetCentroids()]
elif function_noise == "median":
elif function_noise == "median" and len(ints) > 0:
snr = ints / np.median(ints)
elif function_noise == "mean":
elif function_noise == "mean" and len(ints) > 0:
snr = ints / np.mean(ints)
elif function_noise == "mad":
elif function_noise == "mad" and len(ints) > 0:
snr = ints / np.median(np.abs(np.subtract(ints, np.median(ints))))
else:
snr = []

scan_stats = self.run.GetScanStatsForScanNumber(scan_id)

@@ -119,39 +106,43 @@ def peaklist(self, scan_id, function_noise="noise_packets"):
tic=tic,
function_noise=function_noise)

pl.add_attribute('snr', snr)
pl.add_attribute('noise', noise)
pl.add_attribute('baseline', baseline)
if len(pl.mz) > 0:
pl.add_attribute('snr', snr)
pl.add_attribute('noise', noise)
pl.add_attribute('baseline', baseline)

return pl

def peaklists(self, scan_ids, function_noise="noise_packets"):
"""
Extract the given scans from a *.raw file and return a list of PeakList objects.
:param scan_ids: the ids of the scans to extract
:rtype: list
"""
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")

return [self.peaklist(scan_id, function_noise=function_noise) for scan_id in scan_ids]
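
Illustrative extraction of every scan in a raw file; the file name is hypothetical:

    raw = ThermoRaw("sample.raw")
    pls = raw.peaklists(list(raw.scan_ids().keys()), function_noise="noise_packets")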

def tics(self):
# somehow i can not access the scans directly when run() uses an open archive object
# print self.run()[2]
tics = []
tics = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
scan_stats = self.run.GetScanStatsForScanNumber(scan_id)
tics.append(scan_stats.TIC)
tics[scan_id] = scan_stats.TIC
return tics

def injection_times(self):
injection_times = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
extra_values = list(self.run.GetTrailerExtraInformation(scan_id).Values)
extra_labels = list(self.run.GetTrailerExtraInformation(scan_id).Labels)
for i, label in enumerate(extra_labels):
if "Ion Injection Time (ms):" == label:
injection_times[scan_id] = float(extra_values[i])
if scan_id not in injection_times:
injection_times[scan_id] = None
return injection_times
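
Assumed usage; values come from the "Ion Injection Time (ms):" trailer entry and are None when that label is missing:

    injection_times = ThermoRaw("sample.raw").injection_times()  # OrderedDict: scan id -> time (ms) or None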

def scan_dependents(self):
l = []
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
gsd = self.run.GetScanDependents(scan_id, 5)
if gsd is not None:
for i, d in enumerate(gsd.ScanDependentDetailArray):
print scan_id, self.run.GetFilterForScanNumber(scan_id).Filter, d.ScanIndex, d.FilterString
l.append([scan_id, d.ScanIndex])
return l
28 changes: 15 additions & 13 deletions dimspy/process/peak_filters.py
@@ -67,46 +67,48 @@ def filter_ringing(pl, threshold, bin_size=1.0, flag_name='ringing_flag', flag_i
return pl.add_attribute(flag_name, pl.intensity > (mask * threshold), is_flag=True, on_index=flag_index)


def filter_mz_ranges(pl, mz_remove_rngs, flag_name='mz_range_remove_flag', flag_index=None):
def filter_mz_ranges(pl, mz_ranges, flag_name='mz_ranges_flag', flagged_only=False, flag_index=None):
"""
Peaklist mz range filter.
:param pl: the target peaklist
:param mz_remove_rngs: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max2), (mz_min2, mz_max2), ...]
:param mz_ranges: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max1), (mz_min2, mz_max2), ...]
:param flag_name: name of the new flag attribute. Default = 'mz_ranges_flag'
:param flagged_only: whether to apply the m/z range check to flagged peaks only. Default = False
:param flag_index: index of the new flag to be inserted into the peaklist. Default = None
:rtype: PeakList object
This filter will remove all the peaks whose mz values are within any of the ranges in mz_ranges.
"""
mzrs_removed_flags = np.ones(pl.shape[0], dtype=bool)
for mzr in mz_remove_rngs:
if flagged_only:
flags = np.ones(pl.shape[0], dtype=bool)
else:
flags = np.ones(pl.full_size, dtype=bool)

for mzr in mz_ranges:
if len(mzr) != 2:
raise ValueError('mzr_remove: Provide a list of "start" and "end" values for each m/z range that needs to be removed.')
if mzr[0] >= mzr[1]:
raise ValueError('mzr_remove: Start value cannot be larger than end value.')
mzrs_removed_flags[(pl.mz >= mzr[0]) & (pl.mz <= mzr[1])] = False
pl.add_attribute(flag_name, mzrs_removed_flags, is_flag=True, on_index=flag_index)
flags[(pl.get_attribute("mz", flagged_only) >= mzr[0]) & (pl.get_attribute("mz", flagged_only) <= mzr[1])] = False
pl.add_attribute(flag_name, flags, flagged_only=flagged_only, is_flag=True, on_index=flag_index)
return pl
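
A sketch removing two assumed m/z windows from a peaklist `pl`:

    pl = filter_mz_ranges(pl, [(100.0, 101.5), (255.0, 256.0)], flag_name="mz_ranges_flag", flagged_only=False)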


# PeakMatrix filters
def filter_rsd(pm, rsd_threshold, qc_tag, flag_name='rsd_flag'):
def filter_rsd(pm, rsd_threshold, qc_tag, on_attr='intensity', flag_name='rsd_flag'):
"""
PeakMatrix RSD filter.
:param pm: the target peak matrix
:param rsd_threshold: threshold of the RSD of the QC samples
:param qc_tag: tag (label) to unmask qc samples
:param on_attr: calculate RSD on given attribute. Default = "intensity"
:param flag_name: name of the new flag. Default = 'rsd_flag'
:rtype: PeakMatrix object
This filter will calculate the RSD values of the QC samples. A peak with a QC RSD value larger than the
threshold will be unflagged.
"""
rsd_values = pm.rsd(qc_tag)
rsd_values = pm.rsd(qc_tag, on_attr=on_attr)
if np.any(np.isnan(rsd_values)):
logging.warning('nan found in QC rsd values, filter might not work properly')
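
Assumed call unflagging peaks whose QC RSD on intensity exceeds 30%; the "QC" tag value is hypothetical:

    pm = filter_rsd(pm, rsd_threshold=30.0, qc_tag="QC", on_attr="intensity")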

@@ -136,10 +138,10 @@ def filter_fraction(pm, fraction_threshold, within_classes=False, class_tag_type
raise KeyError('must provide class tag type for within classes filtering')
if not all(map(lambda t: t.has_tag_type(class_tag_type), pm.peaklist_tags)):
raise AttributeError('not all tags have tag type [%s]' % class_tag_type)
flg = np.ones(pm.shape[1])
flg = np.zeros(pm.shape[1])
for tag in pm.tags_of(class_tag_type):
with unmask_peakmatrix(pm, tag) as m:
flg = np.logical_and(flg, (m.fraction >= fraction_threshold))
flg = np.logical_or(flg, (m.fraction >= fraction_threshold))
pm.add_flag(flag_name, flg)
return pm
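
With the switch to logical_or over a zero-initialised flag vector, a peak is kept when it reaches the fraction threshold in at least one class rather than in every class. A small standalone illustration with made-up fractions:

    import numpy as np
    fraction_per_class = [np.array([0.9, 0.2, 0.7]), np.array([0.3, 0.1, 0.8])]
    flg = np.zeros(3, dtype=bool)
    for frac in fraction_per_class:
        flg = np.logical_or(flg, frac >= 0.5)
    # flg -> array([ True, False,  True])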

18 changes: 13 additions & 5 deletions dimspy/process/replicate_processing.py
@@ -119,7 +119,7 @@ def read_scans(fn, source, function_noise, min_scans=1, filter_scan_events=None)
return scans


def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, block_size=5000, ncpus=None):
def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, rsd_on="intensity", block_size=5000, ncpus=None):

emlst = np.array(map(lambda x: x.size == 0, pls))
if np.sum(emlst) > 0:
@@ -137,18 +137,26 @@ def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0
if v is not None:
pl_avg.metadata[k].append(v)

pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2)
if rsd_on != "intensity":
pl_avg.add_attribute(rsd_on, pm.attr_mean_vector(rsd_on), on_index=2)
rsd_label = "rsd_{}".format(rsd_on)
shift = 1
else:
rsd_label = "rsd"
shift = 0

pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2+shift)
pl_avg.add_attribute("snr_flag", np.ones(pl_avg.full_size), flagged_only=False, is_flag=True)

pl_avg.add_attribute("rsd", pm.rsd(flagged_only=False), on_index=5)
pl_avg.add_attribute(rsd_label, pm.rsd(on_attr=rsd_on, flagged_only=False), on_index=5+shift)

if min_fraction is not None:
pl_avg.add_attribute("fraction_flag", (pm.present / float(pm.shape[0])) >= min_fraction, flagged_only=False, is_flag=True)
if rsd_thres is not None:
if pm.shape[0] == 1:
logging.warning('applying RSD filter on single scan, all peaks removed')
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute("rsd", flagged_only=False))
pl_avg.add_attribute("rsd_flag", rsd_flag, flagged_only=False, is_flag=True)
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute(rsd_label, flagged_only=False))
pl_avg.add_attribute("{}_flag".format(rsd_label), rsd_flag, flagged_only=False, is_flag=True)
return pl_avg
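
Hypothetical call averaging a list of replicate scan peaklists (`pls`) while reporting RSD on intensity:

    pl_avg = average_replicate_scans("sample_rep_scans", pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, rsd_on="intensity")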

