Skip to content

Commit

Permalink
add duplicate value warning, since the order of those values will be arbitrary
Browse files Browse the repository at this point in the history
  • Loading branch information
zqfang committed Nov 5, 2022
1 parent e909bc3 commit cbcb701
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 0 deletions.
10 changes: 10 additions & 0 deletions gseapy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,16 @@ def _load_ranking(self, rnk: Union[pd.DataFrame, pd.Series, str]) -> pd.Series:
self._logger.warning("Input gene rankings contains inf values!")
rankser.replace(-np.inf, method="ffill", inplace=True)
rankser.replace(np.inf, method="bfill", inplace=True)

# check duplicate values and warning
dups = rankser.duplicated().sum()
if dups > 0:
msg = "Duplicated values found in preranked stats: {:.2%} of genes".format(
dups / rankser.size
)
msg += "The order of those genes will be arbitrary, which may produce unexpected results."
self._logger.warning(msg)

# return series
return rankser

Expand Down
10 changes: 10 additions & 0 deletions gseapy/gsea.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,16 @@ def load_ranking(self):
} # column-wise min
ranking = ranking.replace({col: col_min_max for col in ranking.columns})

# check ties in prerank stats
dups = ranking.apply(lambda df: df.duplicated().sum() / df.size)
if (dups > 0).sum() > 0:
msg = (
"Duplicated values found in preranked stats:\nsample\tratio\n%s\n"
% (dups.to_string(float_format="{:,.2%}".format))
)
msg += "The order of those genes will be arbitrary, which may produce unexpected results."
self._logger.warning(msg)

return ranking

# @profile
Expand Down

0 comments on commit cbcb701

Please sign in to comment.