Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GroupedSplitter #2809

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
8b1ecf4
adding grouped-splitter in notebook
Sep 16, 2020
3b9c7fe
update grouped splitter
YSaxon Sep 16, 2020
c629414
Merge remote-tracking branch 'origin/master'
YSaxon Sep 16, 2020
2c6fef5
minor (change word subgroups to groups)
YSaxon Sep 16, 2020
1aa1139
clean file with nb-clean
YSaxon Sep 17, 2020
a49c7d0
updated wording
YSaxon Sep 17, 2020
15472ad
change apply to applymap so it works correctly for elementwise funcs
YSaxon Sep 21, 2020
49cb3f8
Merge remote-tracking branch 'origin/master' into groupSplitter
YSaxon Oct 5, 2020
2d53a6f
undo most of nbclean to fix conflict
YSaxon Oct 5, 2020
a861599
Merge remote-tracking branch 'origin/master' into groupSplitter
YSaxon Oct 5, 2020
b6413fe
Merge remote-tracking branch 'origin/master' into groupSplitter
YSaxon Nov 5, 2020
b75c1da
enhanced with multiple splits and warnings
YSaxon Nov 5, 2020
491c3a3
docs
YSaxon Nov 5, 2020
bc30c16
more docs and one less test
YSaxon Nov 5, 2020
a7a9cd9
nbdev_build_lib
YSaxon Nov 5, 2020
a94864c
nbdev_clean_nbs
YSaxon Nov 5, 2020
d5d4d91
minor renames and doc changes
YSaxon Nov 12, 2020
fdbae86
changes to alice bob charlie example
YSaxon Nov 12, 2020
e65b9fd
zebra finch example
YSaxon Nov 12, 2020
dbc224d
Merge remote-tracking branch 'origin/master' into enhancedGroupedSpli…
YSaxon Nov 12, 2020
fa8bba1
removing warning since it isn't working
YSaxon Nov 12, 2020
a87158a
update checks.txt
YSaxon Nov 25, 2020
5f11f1e
Merge remote-tracking branch 'origin/master' into enhancedGroupedSpli…
YSaxon Nov 25, 2020
c2b3f0a
added #slow to zebra_finch tests
YSaxon Nov 25, 2020
a62a372
Split GroupedSplitter into two, one for lists, one for dfs
YSaxon Nov 25, 2020
6c83829
improve documentation
YSaxon Nov 25, 2020
6b13149
sync
hamelsmu Apr 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions fastai/_nbdev.py
Expand Up @@ -196,6 +196,7 @@
"FileSplitter": "05_data.transforms.ipynb",
"ColSplitter": "05_data.transforms.ipynb",
"RandomSubsetSplitter": "05_data.transforms.ipynb",
"GroupedSplitter": "05_data.transforms.ipynb",
"parent_label": "05_data.transforms.ipynb",
"RegexLabeller": "05_data.transforms.ipynb",
"ColReader": "05_data.transforms.ipynb",
Expand Down
33 changes: 29 additions & 4 deletions fastai/data/transforms.py
Expand Up @@ -2,10 +2,10 @@

__all__ = ['get_files', 'FileGetter', 'image_extensions', 'get_image_files', 'ImageGetter', 'get_text_files',
'ItemGetter', 'AttrGetter', 'RandomSplitter', 'TrainTestSplitter', 'IndexSplitter', 'GrandparentSplitter',
'FuncSplitter', 'MaskSplitter', 'FileSplitter', 'ColSplitter', 'RandomSubsetSplitter', 'parent_label',
'RegexLabeller', 'ColReader', 'CategoryMap', 'Categorize', 'Category', 'MultiCategorize', 'MultiCategory',
'OneHotEncode', 'EncodedMultiCategorize', 'RegressionSetup', 'get_c', 'ToTensor', 'IntToFloatTensor',
'broadcast_vec', 'Normalize']
'FuncSplitter', 'MaskSplitter', 'FileSplitter', 'ColSplitter', 'RandomSubsetSplitter', 'GroupedSplitter',
'parent_label', 'RegexLabeller', 'ColReader', 'CategoryMap', 'Categorize', 'Category', 'MultiCategorize',
'MultiCategory', 'OneHotEncode', 'EncodedMultiCategorize', 'RegressionSetup', 'get_c', 'ToTensor',
'IntToFloatTensor', 'broadcast_vec', 'Normalize']

# Cell
from ..torch_basics import *
Expand Down Expand Up @@ -164,6 +164,31 @@ def _inner(o):
return idxs[:train_len],idxs[train_len:train_len+valid_len]
return _inner

# Cell
def GroupedSplitter(groupkey, valid_pct=0.2, seed=None):
    """Split groups of items between train/valid randomly, so that valid holds close to
    `valid_pct` of the total number of items (similar to `RandomSplitter`) while never
    splitting a group across the two sets.

    `groupkey` is either a function/lambda applied to each item to compute its group key,
    or a column name when the input is a `DataFrame`. `seed` fixes the group shuffle for
    reproducibility. Returns a function of `o` producing `(train_idxs, valid_idxs)`.
    """
    def _inner(o):
        if callable(groupkey):
            # Wrap items in a one-column frame; applymap applies groupkey elementwise
            ids = pd.DataFrame(o)
            ids['group_keys'] = ids.applymap(groupkey)
            keycol = 'group_keys'
        else:
            # fix: original message had invalid escape "\l" ("function\lambda")
            assert isinstance(o, pd.DataFrame), "o is not a DataFrame, so groupkey must be a function/lambda that extracts a group key from an item"
            assert groupkey in o, "groupkey is not a colname in the DataFrame o"
            keycol = groupkey
            ids = o
        gk = ids.groupby(keycol).count()                     # item count per group
        shuffled_gk = gk.sample(frac=1, random_state=seed)   # random group order
        cumsum = shuffled_gk.cumsum()
        desired_valid = len(o)*valid_pct
        # Cut the shuffled groups where the cumulative item count is closest to the
        # target valid size; at least one group always goes to valid (+1).
        abs_diff = abs(cumsum - desired_valid)
        valid_rows = abs_diff.iloc[:,0].argmin() + 1
        shuffled_gk['is_valid'] = ([True] * valid_rows +
                                   [False] * (len(shuffled_gk) - valid_rows))
        # Propagate each group's flag back onto its items, then reuse ColSplitter
        split_df = ids.join(shuffled_gk.loc[:,'is_valid'], on=keycol)
        return ColSplitter()(split_df)
    return _inner

# Cell
def parent_label(o):
"Label `item` with the parent folder name."
Expand Down
76 changes: 76 additions & 0 deletions nbs/05_data.transforms.ipynb
Expand Up @@ -671,6 +671,82 @@
"test_eq(len(splits[1]), 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# export\n",
"def GroupedSplitter(groupkey,valid_pct=0.2, seed=None):\n",
" \"Splits groups of items between train/val randomly, such that val should have close to `valid_pct` of the total number of items (similar to RandomSplitter). Groups are defined by a `groupkey`, a function/lambda to apply to individual items, or a colname if `o` is a DataFrame\"\n",
" def _inner(o):\n",
" if callable(groupkey):\n",
" ids=pd.DataFrame(o)\n",
" ids['group_keys']=ids.applymap(groupkey)\n",
" keycol='group_keys'\n",
" else:\n",
" assert isinstance(o, pd.DataFrame), \"o is not a DataFrame, so groupkey must be a function\\lambda that extracts a group key from an item\"\n",
" assert groupkey in o, \"groupkey is not a colname in the DataFrame o\"\n",
" keycol=groupkey\n",
" ids=o\n",
" gk=ids.groupby(keycol).count()\n",
" shuffled_gk=gk.sample(frac=1,random_state=seed)\n",
" cumsum=shuffled_gk.cumsum()\n",
" desired_valid=len(o)*valid_pct\n",
" abs_diff=abs(cumsum-desired_valid)\n",
" valid_rows=abs_diff.iloc[:,0].argmin()+1\n",
" shuffled_gk['is_valid']=([True] * valid_rows + \n",
" [False]*(len(shuffled_gk) - valid_rows))\n",
" split_df=ids.join(shuffled_gk.loc[:,'is_valid'],on=keycol)\n",
" return ColSplitter()(split_df)\n",
" return _inner"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = list(range(10000))\n",
"key_f=lambda x:x%100\n",
"f = GroupedSplitter(key_f,seed=42)\n",
"trn,val = f(src)\n",
"assert 0<len(trn)<len(src)\n",
"assert all(o not in val for o in trn)\n",
"k_trn=np.unique([key_f(o) for o in trn])\n",
"k_val=np.unique([key_f(o) for o in val])\n",
"assert all(k not in k_val for k in k_trn)\n",
"test_eq(len(trn), len(src)-len(val))\n",
"# # test random seed consistency\n",
"test_eq(f(src)[0], trn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f = GroupedSplitter('keys',seed=41)\n",
"src = list(range(1000))\n",
"key_f=lambda x:x%10\n",
"src2=pd.DataFrame(src)\n",
"src2['keys']=src2.apply(key_f)\n",
"src2['conconfounding_col_1']='test'\n",
"src2.insert(0,'confounding_col_0','test')\n",
"trn,val=f(src2)\n",
"assert 0<len(trn)<len(src2)\n",
"assert all(o not in val for o in trn)\n",
"k_trn=np.unique([key_f(o) for o in trn])\n",
"k_val=np.unique([key_f(o) for o in val])\n",
"assert all(k not in k_val for k in k_trn)\n",
"test_eq(len(trn), len(src2)-len(val))\n",
"# # test random seed consistency\n",
"test_eq(f(src2)[0], trn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down