Bug in StackBoWBP

INGEOTEC · Feb 1, 2024 · 6d5071c · 6d5071c
1 parent 62f94a1
commit 6d5071c
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 5 deletions.
diff --git a/EvoMSA/__init__.py b/EvoMSA/__init__.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = '2.0.6'
+__version__ = '2.0.7'
 
 try:
     from EvoMSA.text_repr import BoW, TextRepresentations, StackGeneralization, DenseBoW

diff --git a/EvoMSA/back_prop.py b/EvoMSA/back_prop.py
@@ -65,6 +65,25 @@ def stack_model_binary(params, X, df):
     return Y * pesos[0] + nn.sigmoid(df) * pesos[1] - 0.5
 
 
+def initial_parameters(df, df2, y, score=None):
+    """Estimate the initial parameters of :py:class:`~EvoMSA.back_prop.StackBoWBP`"""
+    from sklearn.metrics import f1_score
+    from scipy.special import softmax
+
+    def f(x):
+        hy = (x[0] * df + x[1] * df2).argmax(axis=1)
+        return score(y, hy)
+
+    if score is None:
+        score = lambda y, hy: f1_score(y, hy, average='macro')
+    df = softmax(df, axis=1)
+    df2 = softmax(df2, axis=1)
+    value = np.linspace(0, 1, 100)
+    _ = [f([v, 1-v]) for v in value]
+    index = np.argmax(_)
+    return jnp.array([value[index], 1 - value[index]])
+
+
 class BoWBP(BoW):
     """BoWBP is a :py:class:`~EvoMSA.text_repr.BoW` with the difference that the parameters are fine-tuned using jax
     
@@ -289,7 +308,8 @@ def initial_parameters(self, X, y, df):
 
     def model_args(self, D: List[Union[dict, list]]):
         if not hasattr(self, '_bow_ins'):
-            hy = BoW.train_predict_decision_function(self, D)
+            X = self._transform(D)
+            hy = self.train_predict_decision_function(D, X=X)
         else:
             X = super(StackBoWBP, self)._transform(D)
             hy = getattr(self._bow_ins, self.decision_function_name)(X)
@@ -303,4 +323,4 @@ def fit(self, D: List[Union[dict, list]],
         _ = self._transform(D)
         labels = self.dependent_variable(D, y=y)
         self._bow_ins = self.estimator_class(**self.estimator_kwargs).fit(_, labels)
-        return self
+        return self
diff --git a/EvoMSA/text_repr.py b/EvoMSA/text_repr.py
@@ -379,14 +379,16 @@ def b4msa_fit(self, D: List[Union[List, dict]]):
         return self.bow.fit(_)
 
     def train_predict_decision_function(self, D: List[Union[dict, list]], 
-                                        y: Union[np.ndarray, None]=None) -> np.ndarray:
+                                        y: Union[np.ndarray, None]=None,
+                                        X=None) -> np.ndarray:
         """
         Method to compute the kfold predictions on dataset `D` with response `y`
 
         :param D: Texts to be transformed. In the case, it is a list of dictionaries the text is on the key :py:attr:`BoW.key`
         :type D: List of texts or dictionaries. 
         :param y: Response variable
         :type y: Array or None
+        :param X: Already-transformed dataset; when None, it is computed with :py:func:`transform`
 
         For example, the following code computes the accuracy using k-fold cross-validation on the dataset found on `TWEETS` 
 
@@ -411,7 +413,8 @@ def train_predict(tr, vs):
         y = self.dependent_variable(D, y=y)
         kf = self.kfold_class(**self.kfold_kwargs)
         kfolds = [x for x in kf.split(D, y)]
-        X = self.transform(D, y=y)
+        if X is None:
+            X = self.transform(D, y=y)
         hys = Parallel(n_jobs=self.n_jobs)(delayed(train_predict)(tr, vs)
                                             for tr, vs in kfolds)
         K = np.unique(y).shape[0]