Merge pull request #37 from computational-metabolomics/new_features
Update portals, peaklist and peak_matrix models, and fix minor bugs
RJMW committed May 29, 2018
2 parents cde30e3 + 02eeca7 commit ddcaf23
Showing 15 changed files with 116 additions and 73 deletions.
2 changes: 1 addition & 1 deletion README.rst
@@ -32,7 +32,7 @@ will help you to make the PR if you are new to `git`.
Developers & Contributors
-------------------------
- Ralf J. M. Weber (r.j.weber@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
- Jiarui (Albert) Zhou (j.zhou.3@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
- Jiarui (Albert) Zhou (j.zhou.3@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_, `HIT Shenzhen (China) <http://www.hitsz.edu.cn>`_
- Thomas N. Lawson (tnl495@bham.ac.uk) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_


4 changes: 3 additions & 1 deletion dimspy/models/peak_matrix.py
@@ -401,6 +401,7 @@ def rsd(self, *args, **kwargs):
:param args: tags or untyped tag values for RSD calculation, no value = calculate over all samples
:param kwargs: typed tags for RSD calculation, no value = calculate over all samples
:param on_attr: calculate RSD on given attribute. Default = "intensity"
:param flagged_only: whether to calculate on flagged peaks only. Default = True
:type: numpy array
@@ -413,6 +414,7 @@ def rsd(self, *args, **kwargs):
corresponding rsd value will be set to np.nan.
"""
on_attr = kwargs.pop('on_attr') if kwargs.has_key('on_attr') else 'intensity'
flagged_only = kwargs.pop('flagged_only') if kwargs.has_key('flagged_only') else True

if self.shape[0] < 2:
@@ -423,7 +425,7 @@ def rsd(self, *args, **kwargs):
unmask_peakmatrix(self, *args, **kwargs)) as m:
if m.shape[0] == 0: raise AttributeError('peak matrix does not have label(s) [%s]' %
join(map(lambda x: str(x)[1:-1], (args, kwargs)), ', '))
ints = m.attr_matrix('intensity', flagged_only)
ints = m.attr_matrix(on_attr, flagged_only)
rsd = m._present_std(ints, 0, flagged_only) / m._present_mean(ints, 0, flagged_only) * 100

rsd[np.where(map(lambda x: len(set(x[np.nonzero(x)])) == 1, ints.T))] = np.nan # only one valid value
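
A minimal usage sketch of the extended signature; the peak matrix `pm` and the "QC" tag value are assumed, not part of this change:

    rsd_qc = pm.rsd("QC")                                            # default: RSD of intensity, flagged peaks only
    rsd_all = pm.rsd("QC", on_attr="intensity", flagged_only=False)  # include unflagged peaks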
5 changes: 5 additions & 0 deletions dimspy/models/peaklist.py
@@ -140,11 +140,16 @@ def ID(self):
Property of the peaklist ID.
:getter: returns the peaklist ID
:setter: set the peaklist ID
:type: str (ID values are coerced to str by the setter)
"""
return self._id

@ID.setter
def ID(self, value):
self._id = str(value)
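
Minimal illustration of the coercion performed by the setter (a PeakList instance `pl` is assumed):

    pl.ID = 42
    assert pl.ID == "42"  # any input ID is stored as str
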

@property
def size(self):
"""
35 changes: 29 additions & 6 deletions dimspy/portals/mzml_portal.py
@@ -14,16 +14,21 @@
import pymzml
import numpy as np
import zipfile
from copy import deepcopy
from dimspy.models.peaklist import PeakList
from dimspy.experiment import mz_range_from_header


class Mzml:
def __init__(self, filename="", archive=None):
def __init__(self, filename="", archive=None, preload=True):
self.filename = filename
self.archive = archive
self._preload = preload
self._cache = None

def run(self):
if self._cache is not None: return self._cache

if not self.filename.lower().endswith(".mzml") and not self.filename.lower().endswith(".mzml.gz") and not self.filename.lower().endswith(".zip"):
raise IOError('Incorrect file format for mzML parser')
if self.archive is not None:
@@ -32,11 +37,15 @@ def run(self):
zf = zipfile.ZipFile(self.archive, 'r')
if self.filename not in zf.namelist():
raise IOError("{} does not exist in zip file".format(self.filename))
return pymzml.run.Reader('', file_object=zf.open(self.filename))
dat = pymzml.run.Reader('', file_object=zf.open(self.filename))
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
return dat
elif self.filename.lower().endswith(".mzml") or self.filename.lower().endswith(".mzml.gz"):
if not os.path.isfile(self.filename):
raise IOError("{} does not exist".format(self.filename))
return pymzml.run.Reader(self.filename)
dat = pymzml.run.Reader(self.filename)
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
return dat
else:
return None
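
A short sketch of the new preload/caching behaviour; the file name is hypothetical:

    mzml = Mzml("sample.mzML", preload=True)
    scans = mzml.run()        # parsed once and cached as a tuple of copied spectra
    scans_again = mzml.run()  # second call returns the cached tuple without re-reading the file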

@@ -62,7 +71,10 @@ def peaklist(self, scan_id, function_noise="median"):
for scan in self.run():
if scan["id"] == scan_id:

mzs, ints = zip(*scan.peaks)
if len(scan.peaks) > 0:
mzs, ints = zip(*scan.peaks)
else:
mzs, ints = [], []

scan_time = scan["MS:1000016"]
tic = scan["total ion current"]
@@ -101,10 +113,21 @@ def tics(self):
# print self.run()[2]
for scan in self.run():
if scan["id"] == "TIC":
tics = zip(*scan.peaks)[1]
return tics
return zip(*scan.peaks)[1]
return

def injection_times(self):
injection_times = {}
for scan in self.run():
injection_times[scan['id']] = None
for element in scan.xmlTree:
if "MS:1000927" == element.get('accession'):
injection_times[scan['id']] = float(element.get("value"))
break
if scan['id'] not in injection_times:
injection_times[scan['id']] = None
return injection_times
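
Assumed usage of the new injection_times() method; values are read from accession MS:1000927 and are None when it is absent:

    times = Mzml("sample.mzML").injection_times()
    missing = [sid for sid, t in times.items() if t is None]  # scans without an ion injection time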

def scan_dependents(self):
l = []
for scan in self.run():
77 changes: 34 additions & 43 deletions dimspy/portals/thermo_raw_portal.py
@@ -24,66 +24,53 @@


def mz_range_from_header(h):
"""
Extract the m/z range from a scan filter header string.
:rtype: list
"""
return [float(m) for m in re.findall(r'([\w\.-]+)-([\w\.-]+)', h)[0]]


class ThermoRaw:
"""
Portal for extracting scan data from Thermo *.raw files.
"""

def __init__(self, filename):
self.run = RawFileReader.RawFileReaderAdapter.FileFactory(filename)
self.run.SelectInstrument(Business.Device.MS, 1)
self.filename = filename

def headers(self):
"""
Extract the scan filter headers and the scan ids associated with each from the *.raw file.
:rtype: dict
"""

sids = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
sids.setdefault(str(self.run.GetFilterForScanNumber(scan_id).Filter), []).append(scan_id)
return sids

def scan_ids(self):
"""
Extract the scan ids and their scan filter strings from the *.raw file.
:rtype: dict
"""

sids = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
sids[scan_id] = str(self.run.GetFilterForScanNumber(scan_id).Filter)
return sids

def peaklist(self, scan_id, function_noise="noise_packets"):
"""
Extract a particular scan from a *.raw file and return a PeakList object.

:param scan_id: the id of the scan to extract
:param function_noise: noise function used for the signal-to-noise calculation ("noise_packets", "mean", "median" or "mad")
:rtype: PeakList object
"""
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")

scan = self.run.GetCentroidStream(scan_id, False)

mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
mz_ibn.sort()
mzs, ints, baseline, noise = zip(*mz_ibn)
if scan.Masses is not None:
mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
mz_ibn.sort()
mzs, ints, baseline, noise = zip(*mz_ibn)
else:
mzs, ints, baseline, noise = [], [], [], []

if function_noise == "noise_packets":
snr = [p.SignalToNoise for p in scan.GetCentroids()]
elif function_noise == "median":
elif function_noise == "median" and len(ints) > 0:
snr = ints / np.median(ints)
elif function_noise == "mean":
elif function_noise == "mean" and len(ints) > 0:
snr = ints / np.mean(ints)
elif function_noise == "mad":
elif function_noise == "mad" and len(ints) > 0:
snr = ints / np.median(np.abs(np.subtract(ints, np.median(ints))))
else:
snr = []

scan_stats = self.run.GetScanStatsForScanNumber(scan_id)

@@ -119,39 +106,43 @@ def peaklist(self, scan_id, function_noise="noise_packets"):
tic=tic,
function_noise=function_noise)

pl.add_attribute('snr', snr)
pl.add_attribute('noise', noise)
pl.add_attribute('baseline', baseline)
if len(pl.mz) > 0:
pl.add_attribute('snr', snr)
pl.add_attribute('noise', noise)
pl.add_attribute('baseline', baseline)

return pl

def peaklists(self, scan_ids, function_noise="noise_packets"):
"""
Extract the given scans from a *.raw file and return a list of PeakList objects.
:param scan_ids: the ids of the scans to extract
:rtype: list
"""
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")

return [self.peaklist(scan_id, function_noise=function_noise) for scan_id in scan_ids]
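
Illustrative extraction of every scan in a raw file; the file name is hypothetical:

    raw = ThermoRaw("sample.raw")
    pls = raw.peaklists(list(raw.scan_ids().keys()), function_noise="noise_packets")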

def tics(self):
# somehow i can not access the scans directly when run() uses an open archive object
# print self.run()[2]
tics = []
tics = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
scan_stats = self.run.GetScanStatsForScanNumber(scan_id)
tics.append(scan_stats.TIC)
tics[scan_id] = scan_stats.TIC
return tics

def injection_times(self):
injection_times = collections.OrderedDict()
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
extra_values = list(self.run.GetTrailerExtraInformation(scan_id).Values)
extra_labels = list(self.run.GetTrailerExtraInformation(scan_id).Labels)
for i, label in enumerate(extra_labels):
if "Ion Injection Time (ms):" == label:
injection_times[scan_id] = float(extra_values[i])
if scan_id not in injection_times:
injection_times[scan_id] = None
return injection_times
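
Assumed usage; values come from the "Ion Injection Time (ms):" trailer entry and are None when that label is missing:

    injection_times = ThermoRaw("sample.raw").injection_times()  # OrderedDict: scan id -> time (ms) or None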

def scan_dependents(self):
l = []
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
gsd = self.run.GetScanDependents(scan_id, 5)
if gsd is not None:
for i, d in enumerate(gsd.ScanDependentDetailArray):
print scan_id, self.run.GetFilterForScanNumber(scan_id).Filter, d.ScanIndex, d.FilterString
l.append([scan_id, d.ScanIndex])
return l
28 changes: 15 additions & 13 deletions dimspy/process/peak_filters.py
@@ -67,46 +67,48 @@ def filter_ringing(pl, threshold, bin_size=1.0, flag_name='ringing_flag', flag_i
return pl.add_attribute(flag_name, pl.intensity > (mask * threshold), is_flag=True, on_index=flag_index)


def filter_mz_ranges(pl, mz_remove_rngs, flag_name='mz_range_remove_flag', flag_index=None):
def filter_mz_ranges(pl, mz_ranges, flag_name='mz_ranges_flag', flagged_only=False, flag_index=None):
"""
Peaklist mz range filter.
:param pl: the target peaklist
:param mz_remove_rngs: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max2), (mz_min2, mz_max2), ...]
:param mz_ranges: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max1), (mz_min2, mz_max2), ...]
:param flag_name: name of the new flag attribute. Default = 'mz_ranges_flag'
:param flagged_only: whether to apply the m/z range check to flagged peaks only. Default = False
:param flag_index: index of the new flag to be inserted into the peaklist. Default = None
:rtype: PeakList object
This filter will remove all the peaks whose mz values are within any of the ranges in mz_ranges.
"""
mzrs_removed_flags = np.ones(pl.shape[0], dtype=bool)
for mzr in mz_remove_rngs:
if flagged_only:
flags = np.ones(pl.shape[0], dtype=bool)
else:
flags = np.ones(pl.full_size, dtype=bool)

for mzr in mz_ranges:
if len(mzr) != 2:
raise ValueError('mzr_remove: Provide a list of "start" and "end" values for each m/z range that needs to be removed.')
if mzr[0] >= mzr[1]:
raise ValueError('mzr_remove: Start value cannot be larger than end value.')
mzrs_removed_flags[(pl.mz >= mzr[0]) & (pl.mz <= mzr[1])] = False
pl.add_attribute(flag_name, mzrs_removed_flags, is_flag=True, on_index=flag_index)
flags[(pl.get_attribute("mz", flagged_only) >= mzr[0]) & (pl.get_attribute("mz", flagged_only) <= mzr[1])] = False
pl.add_attribute(flag_name, flags, flagged_only=flagged_only, is_flag=True, on_index=flag_index)
return pl
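
A sketch removing two assumed m/z windows from a peaklist `pl`:

    pl = filter_mz_ranges(pl, [(100.0, 101.5), (255.0, 256.0)], flag_name="mz_ranges_flag", flagged_only=False)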


# PeakMatrix filters
def filter_rsd(pm, rsd_threshold, qc_tag, flag_name='rsd_flag'):
def filter_rsd(pm, rsd_threshold, qc_tag, on_attr='intensity', flag_name='rsd_flag'):
"""
PeakMatrix RSD filter.
:param pm: the target peak matrix
:param rsd_threshold: threshold of the RSD of the QC samples
:param qc_tag: tag (label) to unmask qc samples
:param on_attr: calculate RSD on given attribute. Default = "intensity"
:param flag_name: name of the new flag. Default = 'rsd_flag'
:rtype: PeakMatrix object
This filter will calculate the RSD values of the QC samples. A peak with a QC RSD value larger than the
threshold will be unflagged.
"""
rsd_values = pm.rsd(qc_tag)
rsd_values = pm.rsd(qc_tag, on_attr=on_attr)
if np.any(np.isnan(rsd_values)):
logging.warning('nan found in QC rsd values, filter might not work properly')
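
Assumed call unflagging peaks whose QC RSD on intensity exceeds 30%; the "QC" tag value is hypothetical:

    pm = filter_rsd(pm, rsd_threshold=30.0, qc_tag="QC", on_attr="intensity")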

@@ -136,10 +138,10 @@ def filter_fraction(pm, fraction_threshold, within_classes=False, class_tag_type
raise KeyError('must provide class tag type for within classes filtering')
if not all(map(lambda t: t.has_tag_type(class_tag_type), pm.peaklist_tags)):
raise AttributeError('not all tags have tag type [%s]' % class_tag_type)
flg = np.ones(pm.shape[1])
flg = np.zeros(pm.shape[1])
for tag in pm.tags_of(class_tag_type):
with unmask_peakmatrix(pm, tag) as m:
flg = np.logical_and(flg, (m.fraction >= fraction_threshold))
flg = np.logical_or(flg, (m.fraction >= fraction_threshold))
pm.add_flag(flag_name, flg)
return pm
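
With the switch to logical_or over a zero-initialised flag vector, a peak is kept when it reaches the fraction threshold in at least one class rather than in every class. A small standalone illustration with made-up fractions:

    import numpy as np
    fraction_per_class = [np.array([0.9, 0.2, 0.7]), np.array([0.3, 0.1, 0.8])]
    flg = np.zeros(3, dtype=bool)
    for frac in fraction_per_class:
        flg = np.logical_or(flg, frac >= 0.5)
    # flg -> array([ True, False,  True])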

18 changes: 13 additions & 5 deletions dimspy/process/replicate_processing.py
@@ -119,7 +119,7 @@ def read_scans(fn, source, function_noise, min_scans=1, filter_scan_events=None)
return scans


def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, block_size=5000, ncpus=None):
def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, rsd_on="intensity", block_size=5000, ncpus=None):

emlst = np.array(map(lambda x: x.size == 0, pls))
if np.sum(emlst) > 0:
@@ -137,18 +137,26 @@ def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0
if v is not None:
pl_avg.metadata[k].append(v)

pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2)
if rsd_on != "intensity":
pl_avg.add_attribute(rsd_on, pm.attr_mean_vector(rsd_on), on_index=2)
rsd_label = "rsd_{}".format(rsd_on)
shift = 1
else:
rsd_label = "rsd"
shift = 0

pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2+shift)
pl_avg.add_attribute("snr_flag", np.ones(pl_avg.full_size), flagged_only=False, is_flag=True)

pl_avg.add_attribute("rsd", pm.rsd(flagged_only=False), on_index=5)
pl_avg.add_attribute(rsd_label, pm.rsd(on_attr=rsd_on, flagged_only=False), on_index=5+shift)

if min_fraction is not None:
pl_avg.add_attribute("fraction_flag", (pm.present / float(pm.shape[0])) >= min_fraction, flagged_only=False, is_flag=True)
if rsd_thres is not None:
if pm.shape[0] == 1:
logging.warning('applying RSD filter on single scan, all peaks removed')
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute("rsd", flagged_only=False))
pl_avg.add_attribute("rsd_flag", rsd_flag, flagged_only=False, is_flag=True)
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute(rsd_label, flagged_only=False))
pl_avg.add_attribute("{}_flag".format(rsd_label), rsd_flag, flagged_only=False, is_flag=True)
return pl_avg
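
Hypothetical call averaging a list of replicate scan peaklists (`pls`) while reporting RSD on intensity:

    pl_avg = average_replicate_scans("sample_rep_scans", pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, rsd_on="intensity")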

