Skip to content

Commit

Permalink
Merge pull request #1 from applied-systems-biology/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
amedyukhina committed Sep 4, 2019
2 parents 045fe1a + 0822419 commit a4f8277
Show file tree
Hide file tree
Showing 20 changed files with 2,491 additions and 525 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
Expand Down
5 changes: 3 additions & 2 deletions SPHARM/classes/surface.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,8 @@ def plot_points(self):

with warnings.catch_warnings():
warnings.simplefilter("ignore")
mesh = mlab.points3d(self.x, self.y, self.z, self.z, scale_mode='none', scale_factor=0.2).scene
mesh = mlab.points3d(self.x, self.y, self.z, self.z, scale_mode='none',
scale_factor=0.1, mode='sphere').scene
mesh.background = (1, 1, 1)
mesh.magnification = 3
return mesh
Expand Down Expand Up @@ -358,7 +359,7 @@ def plot_surface(self, points=False, extent=None):
mesh = mlab.mesh(x * np.abs(grid), y * np.abs(grid), z * np.abs(grid), scalars=grid,
colormap='jet')
if points:
mesh = mlab.points3d(self.x, self.y, self.z, self.z, scale_mode='none', scale_factor=0.2)
mesh = mlab.points3d(self.x, self.y, self.z, self.z, scale_mode='none', scale_factor=0.05)

mesh.scene.background = (1, 1, 1)
mesh.scene.magnification = 3
Expand Down
Empty file added SPHARM/lib/__init__.py
Empty file.
285 changes: 285 additions & 0 deletions SPHARM/lib/classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
from __future__ import division
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut

from SPHARM.classes.spectrum import Spectrum
from SPHARM.classes.time_spectrum import TimeSpectrum
from SPHARM.classes.stratified_group_shuffle_split import GroupShuffleSplitStratified


def extract_features(input_stat, cell_id='Name', group='Group', static=True, dynamic_features=None,
timelength=10, static_features='amplitude', one_time_point=True, rotation_invariant=True):
"""
Extract spectral features for classification.
Parameters
----------
input_stat : pandas.DataFrame
Input data to extract features
cell_id : str
Column in the input data sheet to group connected time points.
Default is 'TrackID'
group : str, optional
Column in the input data sheet to use for grouping.
Default is 'Group'.
static : bool, optional
If True, static features will be extracted.
If False, dynamic features will be extracted.
Default is True.
dynamic_features : str, optional
Name of the feature to use for computing shape dynamics.
Valid values: 'time', 'derivative', 'frequency'.
Default is 'frequency'.
static_features : str, optional
Name of the feature to represent the harmonic coefficients.
Valid values: 'amplitude', 'real_imag'.
Default is 'amplitude'
timelength : int, optional
Number of time points to include into dynamic features.
Default is 10.
one_time_point : bool, optional
If True, only the first time point of each cell will be used.
If False, all time points will be used as independent samples.
Default is True.
rotation_invariant : bool, optional
If True, rotation-invariant descriptors (frequencies) will be computed.
If False, the whole spectrum will be used as a feature vector.
Default is True.
Returns
-------
features : N x K numpy.array
The returned feature vector of N samples and K features
classes : array of length N
Labels of the true classes.
"""

features = []
classes = []
names = []
group_names = []
samples = []

groups = input_stat[group].unique()
for i in range(len(groups)):
stat = input_stat[input_stat[group] == groups[i]]
if len(stat) > 0:
for name in stat[cell_id].unique():
subsubstat = stat[stat[cell_id] == name].reset_index()
subsubstat = subsubstat.sort_values('Time')
times = subsubstat['Time'].unique()
if static:
spectrum = Spectrum()
if one_time_point:
spectrum.harmonics_csv = subsubstat[subsubstat['Time'] == times[0]]
features.append(spectrum.return_feature_vector(static_features=static_features,
rotation_invariant=rotation_invariant))
classes.append(i)
names.append(spectrum.harmonics_csv.iloc[0]['Name'])
group_names.append(groups[i])
if 'Sample' in subsubstat.columns:
samples.append(subsubstat['Sample'].iloc[0])
else:
for t in times:
spectrum.harmonics_csv = subsubstat[subsubstat['Time'] == t]
features.append(spectrum.return_feature_vector(static_features=static_features,
rotation_invariant=rotation_invariant))
classes.append(i)
names.append(spectrum.harmonics_csv.iloc[0]['Name'])
group_names.append(groups[i])
if 'Sample' in subsubstat.columns:
samples.append(subsubstat['Sample'].iloc[0])
else:
if len(times) >= timelength:
spectrum = TimeSpectrum()
for t in times[:timelength]:
sp = Spectrum()
sp.harmonics_csv = subsubstat[subsubstat['Time'] == t]
spectrum.add_spectrum(sp, timepoint=t)
features.append(spectrum.return_feature_vector(dynamic_features=dynamic_features,
static_features=static_features,
rotation_invariant=rotation_invariant))
classes.append(i)
group_names.append(groups[i])
names.append(spectrum.data.iloc[0]['Name'])
if 'Sample' in subsubstat.columns:
samples.append(subsubstat['Sample'].iloc[0])
classes = np.array(classes)
features = np.array(features)
names = np.array(names)
group_names = np.array(group_names)
samples = np.array(samples)

return features, classes, names, group_names, samples


def predict_classes_loo(features, classes, C=1, groups=None):
"""
Computes accuracy and predicts classes by cross-validation
Parameters
----------
features : array-like
The data to fit.
classes : array-like
The target classes to try to predict.
C : float, optional
Penalty parameter C of the error term.
Default is 1.
groups : array-like, optional
Group labels for the samples used while splitting the dataset into train/test set.
Default is None.
Returns
-------
accuracy: pandas DataFrame
Array of scores of the estimator for each run of the cross validation.
predicted: pandas DataFrame
Predicted classes
"""
clf = svm.SVC(kernel='linear', C=C, cache_size=1000, decision_function_shape='ovo', random_state=0)
cv = LeaveOneGroupOut()
accuracy = pd.DataFrame({'Accuracy': cross_val_score(clf, X=features, y=classes, groups=groups, cv=cv)})
predicted = pd.DataFrame({'Actual class': classes,
'Predicted class': cross_val_predict(clf, X=features,
y=classes, groups=groups, cv=cv)})

return accuracy, predicted


def predict_classes_st_kfold(features, classes, nsplits=7, random_state=0, C=1):
"""
Computes accuracy and predicts classes by cross-validation
Parameters
----------
features : array-like
The data to fit.
classes : array-like
The target classes to try to predict.
nsplits : int, optional
Number of folds.
Default is 7.
random_state : int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Used when ``shuffle`` == True.
C : float, optional
Penalty parameter C of the error term.
Default is 1.
Returns
-------
accuracy: pandas DataFrame
Array of scores of the estimator for each run of the cross validation.
predicted: pandas DataFrame
Predicted classes
"""

clf = svm.SVC(kernel='linear', C=C, cache_size=1000, decision_function_shape='ovo', random_state=0)

cv = StratifiedKFold(n_splits=nsplits, random_state=random_state)
accuracy = pd.DataFrame({'Accuracy': cross_val_score(clf, X=features, y=classes,cv=cv)})
predicted = pd.DataFrame({'Actual class': classes,
'Predicted class': cross_val_predict(clf, X=features, y=classes, cv=cv)})

return accuracy, predicted


def predict_shuffle_split(features, classes, C=1, nsplits=100, test_size=2./5, random_state=0):
"""
Computes accuracy and predicts classes by cross-validation
Parameters
----------
features : array-like
The data to fit.
classes : array-like
The target classes to try to predict.
C : float, optional
Penalty parameter C of the error term.
Default is 1.
nsplits : int, optional
Number of folds.
Default is 100.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples.
Default is 2./5
random_state : int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Used when ``shuffle`` == True.
Returns
-------
accuracy: pandas DataFrame
Array of scores of the estimator for each run of the cross validation.
"""

clf = svm.SVC(kernel='linear', C=C, cache_size=1000, decision_function_shape='ovo', random_state=0)
cv = StratifiedShuffleSplit(n_splits=nsplits, test_size=test_size, random_state=random_state)
accuracy = pd.DataFrame({'Accuracy': cross_val_score(clf, features, classes, cv=cv)})
return accuracy


def predict_group_shuffle_split(features, classes, C=1, nsplits=100, test_size=1, random_state=0, groups=None):
"""
Computes accuracy and predicts classes by cross-validation
Parameters
----------
features : array-like
The data to fit.
classes : array-like
The target classes to try to predict.
C : float, optional
Penalty parameter C of the error term.
Default is 1.
nsplits : int, optional
Number of folds.
Default is 100.
test_size : float, int, None, optional
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples.
Default is 1
random_state : int, RandomState instance or None, optional, default=None
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`. Used when ``shuffle`` == True.
groups : array-like, optional
Group labels for the samples used while splitting the dataset into train/test set.
Default is None.
Returns
-------
accuracy: pandas DataFrame
Array of scores of the estimator for each run of the cross validation.
"""

clf = svm.SVC(kernel='linear', C=C, cache_size=1000, decision_function_shape='ovo', random_state=0)
cv = GroupShuffleSplitStratified(n_splits=nsplits, test_size=test_size, random_state=random_state)
accuracy = pd.DataFrame({'Accuracy': cross_val_score(clf, X=features, y=classes, groups=groups, cv=cv)})
return accuracy















0 comments on commit a4f8277

Please sign in to comment.