
Commit 0aaa370

Versioning + Minor adaptations
1 parent 5548e67 commit 0aaa370

File tree: 6 files changed (+246, -125 lines)


requirements.txt

Lines changed: 2 additions & 0 deletions

@@ -4,3 +4,5 @@ neurokit2
 openpyxl
 pandas
 matlabengine
+tensorflow
+wfdb

src/icentia11k/icentia11k_dataset_builder.py

Lines changed: 36 additions & 33 deletions

@@ -11,13 +11,12 @@
 project_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.append(project_path)

-
 class Builder(tfds.core.GeneratorBasedBuilder):
   """DatasetBuilder for icentia11k dataset."""

-  VERSION = tfds.core.Version('1.0.0')
+  VERSION = tfds.core.Version('1.0.3')
   RELEASE_NOTES = {
-    '1.0.0': 'Initial release.',
+    '1.0.3': 'Initial release.',
   }

   def _info(self) -> tfds.core.DatasetInfo:

@@ -44,41 +43,45 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager):

     # TODO: Adjust the path to your needs - due to the size of the dataset (> 1 TB unzipped) no automatic
     # download is provided
-    path = './data/'
-    for k in range(0, 11000):
+    path = '/mnt/sdb/icentia11k-single-lead-continuous-raw-electrocardiogram-dataset-1.0/'
+    np.random.seed(42)
+    ids = np.random.choice(range(11000), 100, replace=False)
+    for k in ids:
       result.update({str(k): self._generate_examples(path, k)})

     return result

   def _generate_examples(self, path, subject):

     preprocessor = Preprocess(125, 125, peak='R', final_length=500)
-
     for segment in range(0, 50):
-      t = segment // 1000
-      filename = path + 'p' + f"{t :02d}" + '/p' + f"{subject :05d}" + '/p' + f"{subject :05d}" + '_s' + f"{segment :02d}"
-      rec = wfdb.rdrecord(filename)
-      ann = wfdb.rdann(filename, "atr")
-
-      ann.symbol = np.array(ann.symbol)
-      ann.sample = np.array(ann.sample)
-
-      signal = rec.p_signal.flatten()
-      rpeaks = pd.DataFrame(ann.sample[ann.symbol != '+'], columns=['ECG_R_Peaks'])
-      labels = ann.symbol[ann.symbol != '+']
-
-      ecg_clean, q, ind = preprocessor.preprocess(data=signal, sampling_rate=ann.fs, rpeaks=rpeaks)
-      labels = labels[ind]
-
-      for i, k in enumerate(ecg_clean[:, :, 0]):
-        yield str(subject) + '_' + str(segment) + '_' + str(i), {
-          'ecg': {
-            'I': k,
-          },
-          'patient': int(subject),
-          'segment': int(segment),
-          # TODO: the rhythm information is now statically set to N
-          'rhythm': 'N',
-          'beat': labels[i],
-          'quality': q[i],
-        }
+      try:
+        t = subject // 1000
+        filename = path + 'p' + f"{t :02d}" + '/p' + f"{subject :05d}" + '/p' + f"{subject :05d}" + '_s' + f"{segment :02d}"
+        rec = wfdb.rdrecord(filename)
+        ann = wfdb.rdann(filename, "atr")
+
+        ann.symbol = np.array(ann.symbol)
+        ann.sample = np.array(ann.sample)
+
+        signal = rec.p_signal.flatten()
+        rpeaks = pd.DataFrame(ann.sample[ann.symbol != '+'], columns=['ECG_R_Peaks'])
+        labels = ann.symbol[ann.symbol != '+']
+
+        ecg_clean, q, ind = preprocessor.preprocess(data=signal, sampling_rate=ann.fs, rpeaks=rpeaks)
+        labels = labels[ind]
+
+        for i, k in enumerate(ecg_clean[:, :, 0]):
+          yield str(subject) + '_' + str(segment) + '_' + str(i), {
+            'ecg': {
+              'I': k,
+            },
+            'patient': int(subject),
+            'segment': int(segment),
+            # TODO: the rhythm information is now statically set to N
+            'rhythm': 'N',
+            'beat': labels[i],
+            'quality': q[i],
+          }
+      except:
+        continue
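
Note on the hunk above: with version 1.0.3 the builder no longer walks all 11,000 patients but a seeded random subset of 100, and each segment read is wrapped in a try/except so missing records are skipped. A minimal sketch that reproduces the draw outside the builder (the dataset root used here is a placeholder, not the path from the commit, and must point at a local copy):

import numpy as np

# Reproduce the seeded subsampling from _split_generators: 100 patient ids
# drawn without replacement from the 11,000 available subjects.
np.random.seed(42)
ids = np.random.choice(range(11000), 100, replace=False)

# Each selected patient contributes up to 50 segments; records follow the
# icentia11k layout pXX/pXXXXX/pXXXXX_sYY. The root path below is an
# assumption and must match your local copy of the dataset.
path = './data/'
subject, segment = int(ids[0]), 0
record = path + f"p{subject // 1000:02d}/p{subject:05d}/p{subject:05d}_s{segment:02d}"
print(sorted(ids)[:10], record)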

src/medalcare/medalcare_dataset_builder.py

Lines changed: 43 additions & 34 deletions

@@ -10,9 +10,9 @@
 class Builder(tfds.core.GeneratorBasedBuilder):
   """DatasetBuilder for medalcare dataset."""

-  VERSION = tfds.core.Version('1.0.0')
+  VERSION = tfds.core.Version('1.0.4')
   RELEASE_NOTES = {
-    '1.0.0': 'Initial release.',
+    '1.0.4': 'Initial release.',
   }

   def _info(self) -> tfds.core.DatasetInfo:

@@ -23,6 +23,8 @@ def _info(self) -> tfds.core.DatasetInfo:
       'ecg': tfds.features.Sequence({
         'I': np.float64,
       }),
+      'subject': np.int32,
+      'diagnosis': tfds.features.ClassLabel(names=['avblock', 'fam', 'iab', 'lae', 'lbbb', 'mi', 'rbbb', 'sinus']),
       'v_G.lungs': np.float64,
       'v_G.ischemia': np.float64,
       'v_G.torso': np.float64,

@@ -191,7 +193,7 @@ def _generate_examples(self, path, mode):
     files = glob.glob(path + '/*.txt', recursive=True)
     preprocessor = Preprocess(250, 250, peak='R', final_length=500)

-    for k in list(set([item[:item.rfind('_')] for item in files])):
+    for subject, k in enumerate(list(set([item[:item.rfind('_')] for item in files]))):
       try:
         atrial = pd.read_csv(k + '_AtrialParameters.txt', sep=' ', skiprows=2).ffill(axis=1)
         ventricular = pd.read_csv(k + '_VentricularParameters.txt', sep=' ').ffill(axis=1)

@@ -201,39 +203,46 @@
         data_prep, q, ind = preprocessor.preprocess(data=signal, sampling_rate=500)
         atrial.iloc[:, -1] = atrial.iloc[:, -1].apply(
           lambda x: x.replace('mm/s', '').replace('deg', '').replace('mm', ''))
-        dict = {
-          'ecg': {
-            'I': np.median(data_prep, axis=0).flatten(),
-          },
-        }

-        for j in allowed:
-          dict.update({j: 0.0})
-
-        for j, val in ventricular.iterrows():
-          ke = 'v_' + val.iloc[0]
-
-          if ke in allowed:
+        result = [part for part in k.split("/") if part]
+        print(result[10])
+
+        for i, t in enumerate(data_prep):
+          dict = {
+            'ecg': {
+              'I': t.flatten(),
+            },
+            'subject': subject,
+            'diagnosis': result[10] #TODO: 10 is set statically --> needs to be adapt
+          }
+
+          for j in allowed:
+            dict.update({j: 0.0})
+
+          for j, val in ventricular.iterrows():
+            ke = 'v_' + val.iloc[0]
+
+            if ke in allowed:
+              va = 0.0
+              try:
+                va = np.float64(val.iloc[-1])
+              except:
+                if (val.iloc[-1] == 'True') | (val.iloc[-1] == 'False'):
+                  va = val.iloc[-1]
+              dict.update({ke: va})
+
+          for j, val in atrial.iterrows():
+            ke = 'a_' + val.iloc[0]
             va = 0.0
-            try:
-              va = np.float64(val.iloc[-1])
-            except:
-              if (val.iloc[-1] == 'True') | (val.iloc[-1] == 'False'):
-                va = val.iloc[-1]
-            dict.update({ke: va})
-
-        for j, val in atrial.iterrows():
-          ke = 'a_' + val.iloc[0]
-          va = 0.0
-          if ke in allowed:
-            try:
-              va = np.float64(val.iloc[-1])
-            except:
-              if (val.iloc[-1] == 'True') | (val.iloc[-1] == 'False'):
-                va = val.iloc[-1]
-            dict.update({ke: va})
-
-        yield k, dict
+            if ke in allowed:
+              try:
+                va = np.float64(val.iloc[-1])
+              except:
+                if (val.iloc[-1] == 'True') | (val.iloc[-1] == 'False'):
+                  va = val.iloc[-1]
+              dict.update({ke: va})
+
+          yield k + '_' + str(i), dict

       except Exception as e:
         print(e)
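
On the result[10] lookup flagged by the new TODO: the diagnosis label is currently taken from a fixed path component. A hedged sketch of an alternative (the helper name and path layout are assumptions, not part of the commit) that matches path components against the ClassLabel names instead:

# Hypothetical helper: derive the diagnosis from a file path by matching its
# components against the ClassLabel names added in this commit, rather than
# indexing a fixed position (result[10]).
DIAGNOSES = ('avblock', 'fam', 'iab', 'lae', 'lbbb', 'mi', 'rbbb', 'sinus')

def diagnosis_from_path(k: str) -> str:
    parts = [part for part in k.split('/') if part]
    matches = [part for part in parts if part in DIAGNOSES]
    if len(matches) != 1:
        raise ValueError(f'cannot infer diagnosis from path: {k}')
    return matches[0]

# Example with a made-up path layout:
# diagnosis_from_path('/data/medalcare/mi/run_0042_signal') -> 'mi'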

src/ptb/ptb_dataset_builder.py

Lines changed: 16 additions & 14 deletions

@@ -16,9 +16,9 @@
 class Builder(tfds.core.GeneratorBasedBuilder):
   """DatasetBuilder for ptb dataset."""

-  VERSION = tfds.core.Version('1.0.0')
+  VERSION = tfds.core.Version('1.0.3')
   RELEASE_NOTES = {
-    '1.0.0': 'Initial release.',
+    '1.0.3': 'Initial release.',
   }

   def _info(self) -> tfds.core.DatasetInfo:

@@ -28,6 +28,7 @@ def _info(self) -> tfds.core.DatasetInfo:
       'ecg': tfds.features.Sequence({
         'I': np.float64,
       }),
+      'subject': np.int32,
       'quality': tfds.features.ClassLabel(names=['Unacceptable', 'Barely acceptable', 'Excellent', '[]']),
       'age': np.uint8,
       'gender': np.uint8,

@@ -67,15 +68,16 @@ def _generate_examples(self, path):
       data = wfdb.rdsamp(str(path) + '/' + row['filename_hr'])[0][:, 0]
       data_prep, q, ind = preprocessor.preprocess(data=data, sampling_rate=500)

-      #for j, k in enumerate(data_prep):
-      key = str(row['patient_id']) + "_" + str(index)
-      diagnostic = "NAV" if len(row['diagnostic_superclass']) == 0 else row['diagnostic_superclass'][0]
-      yield key, {
-        'ecg': {
-          'I':np.median(data_prep, axis=0).flatten(),
-        },
-        'quality': str(q[0]),
-        'age': row['age'],
-        'gender': row['sex'],
-        'diagnostic': diagnostic,
-      }
+      for j, k in enumerate(data_prep):
+        key = str(row['patient_id']) + "_" + str(index) + "_" + str(j)
+        diagnostic = "NAV" if len(row['diagnostic_superclass']) == 0 else row['diagnostic_superclass'][0]
+        yield key, {
+          'ecg': {
+            'I': k.flatten(),
+          },
+          'subject': index,
+          'quality': str(q[0]),
+          'age': row['age'],
+          'gender': row['sex'],
+          'diagnostic': diagnostic,
+        }
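
With version 1.0.3 the ptb builder yields one example per preprocessed beat and adds the 'subject' feature. A small sketch of how the rebuilt dataset could be inspected (assumes the builder has been built locally, e.g. with tfds build from src/ptb; the split name 'train' is an assumption):

import tensorflow_datasets as tfds

# Load the locally built ptb dataset; the name follows the builder directory,
# the split name is assumed.
ds = tfds.load('ptb', split='train')

for example in ds.take(3):
    # 'subject' is the per-record index added in this commit; 'ecg/I' now
    # holds a single preprocessed beat instead of the per-record median beat.
    print(int(example['subject']), example['ecg']['I'].shape)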
