/
DataSplitHGG.py
121 lines (93 loc) · 4.95 KB
/
DataSplitHGG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import numpy as np
import SimpleITK as sitk
from tqdm import tqdm
from preprocessing import *
from sklearn.model_selection import train_test_split
import os
from keras.utils import to_categorical
# DataSplitHGG uses the same functions and preprocessing as the LGG files, however re-organised in order to be able to
# apply them on the HGG data. The following code reads in the HGG data, splits the data into train/validate/test
# directly from the patients files and saves these sets. The train and validate sets are then passed through the
# process_HGG function which normalises each image and separates the MRI modality images from the ground truth images.
# The two datasets are then processed through the preprocessing functions and the ground truth data is one-hot-encoded.
# The data sets are then saved in their respective files.
#
# This code was edited and contributed to by all three group members, but adapted from the Multimodal Brain Tumour
# Segmentation GitHub repository by Aryaman Sinha. The link to the repository can be found here:
# https://github.com/as791/Multimodal-Brain-Tumor-Segmentation/blob/master/Main_file.ipynb.
# File path to HGG data:
pathHGG = "/BraTSData/MICCAI_BraTS_2018_Data_Training/HGG/"
def load_HGG(path):
# The load_HGG function enters the directory containing the patient files, then immediately splits the data
# into the train, validate, and test sets.
my_dir = sorted(os.listdir(path))
print(len(my_dir))
X_train_HG, X_test_HG = train_test_split(my_dir, test_size=0.10, random_state=42)
X_train_HG, X_val_HG = train_test_split(X_train_HG, test_size=0.25, random_state=42)
return X_train_HG, X_val_HG, X_test_HG
# Calling the load_HGG function to separate the patients into train, val, test sets:
X_train_HG, X_val_HG, X_test_HG = load_HGG(pathHGG)
print("Length of train, validate, and test:", len(X_train_HG), len(X_val_HG), len(X_test_HG))
def process_HGG(path, data):
# The process_HGG function enters the directory containing the patient files, then iterates through each patient
# modality scan, reads in the image, organises the images by modality or ground truth, normalises each image, and
# saves the images to a data and ground truth array.
data_vector = []
gt_vector = []
for p in tqdm(data):
data_list = sorted(os.listdir(path + p))
# FLAIR images:
img_itk = sitk.ReadImage(path + p + '/' + data_list[0])
flair = sitk.GetArrayFromImage(img_itk)
flair = normalise(flair)
# Ground truth images:
img_itk = sitk.ReadImage(path + p + '/' + data_list[1])
seg = sitk.GetArrayFromImage(img_itk)
# T1 images:
img_itk = sitk.ReadImage(path + p + '/' + data_list[2])
t1 = sitk.GetArrayFromImage(img_itk)
t1 = normalise(t1)
# T1ce (T1Gd) images:
img_itk = sitk.ReadImage(path + p + '/' + data_list[3])
t1ce = sitk.GetArrayFromImage(img_itk)
t1ce = normalise(t1ce)
# T2 images:
img_itk = sitk.ReadImage(path + p + '/' + data_list[4])
t2 = sitk.GetArrayFromImage(img_itk)
t2 = normalise(t2)
data_vector.append([flair, t1, t1ce, t2])
gt_vector.append(seg)
data_vector = np.asarray(data_vector, dtype=np.float32)
gt_vector = np.asarray(gt_vector, dtype=np.uint8)
return data_vector, gt_vector
# Applying the process_HGG function on the training data:
HGG_train_X, HGG_train_Y = process_HGG(pathHGG, X_train_HG)
print("Train data:", len(HGG_train_X), len(HGG_train_Y))
print("Train shapes:", HGG_train_X.shape, HGG_train_Y.shape)
print("Train types:", HGG_train_X.dtype, HGG_train_Y.dtype)
# Applying the process_HGG function on the validation data:
HGG_val_X, HGG_val_Y = process_HGG(pathHGG, X_val_HG)
print("Val data:", len(HGG_val_X), len(HGG_val_Y))
print("Val shapes:", HGG_val_X.shape, HGG_val_Y.shape)
print("Val types:", HGG_val_X.dtype, HGG_val_Y.dtype)
# Applying the processing steps from the Preprocessing file:
HGG_train_X = transpose_data(HGG_train_X)
HGG_train_X, HGG_train_Y = slice_crop(HGG_train_X, HGG_train_Y)
HGG_val_X = transpose_data(HGG_val_X)
HGG_val_X, HGG_val_Y = slice_crop(HGG_val_X, HGG_val_Y)
HGG_train_Y = ground_truth_4_to_3(HGG_train_Y)
HGG_val_Y = ground_truth_4_to_3(HGG_val_Y)
# One-hot-encoding the ground truth labels
HGG_train_Y = to_categorical(HGG_train_Y)
HGG_val_Y = to_categorical(HGG_val_Y)
print("Before saving to files data shape")
print("HGG_train_X, HGG_train_Y shape", HGG_train_X.shape, HGG_train_Y.shape)
print("HGG_val_X, HGG_val_Y shape", HGG_val_X.shape, HGG_val_Y.shape)
# Saving all the X and Y datasets separately for the train and validate sets, and saving the test data without any
# processing.
np.save('Training_data/HGG_train_X.npy', HGG_train_X)
np.save('Training_data/HGG_train_Y.npy', HGG_train_Y)
np.save('Validation_data/HGG_val_X.npy', HGG_val_X)
np.save('Validation_data/HGG_val_Y.npy', HGG_val_Y)
np.save('Test_data/HGG_test_data.npy', X_test_HG)
print("Data saved successfully")