# -*- coding: utf-8 -*-
"""
Text Data Visualization

Rank the features of a document-value matrix (CountVectorizer, TfidfVectorizer,
etc.) per label and plot the top-ranked ones.

Original code (Thomas Buhrman)[from https://buhrmann.github.io/tfidf-analysis.html]

The public functions were renamed for issue #22:
``top_feats_label`` -> ``rank_features_by_label``,
``top_feats_all`` -> ``rank_features_for_all_labels``,
``plot_top_feats`` -> ``plot_top_features``.
The old names are kept at the bottom of this module as thin wrappers.
"""

import pandas as pd
import numpy as np
from typing import Collection, Callable, List, Optional, Tuple

__all__ = [
    'rank_features_by_label', 'rank_features_for_all_labels', 'plot_top_features',
    # pre-rename names, retained so existing callers keep working
    'top_feats_label', 'top_feats_all', 'plot_top_feats',
]


def rank_features_by_label(X: np.ndarray, features: Collection[str],
                           label_idx: Optional[Collection[bool]] = None,
                           min_value: float = 0.1,
                           aggregation_function: Callable = np.mean) -> pd.DataFrame:
    """
    Rank features by their encoded values (e.g., CountVectorizer, TfidfVectorizer, etc.)
    aggregated with `aggregation_function`.

    :param X np.ndarray: Document-value matrix (one row per document, one column per feature).
    :param features Collection[str]: Feature names, one per column of `X`.
    :param label_idx Collection[bool]: Row selector applied as `X[label_idx]` (typically a
        boolean mask of the rows carrying one label); `None` uses every row.
    :param min_value float: Values strictly below this threshold are counted as 0.
    :param aggregation_function Callable: Column-wise aggregator called with `axis=0`,
        such as `np.mean` or `np.sum`.
    :return: A DataFrame with 'feature', 'score' and 'ngram', sorted by descending score.
    :raises ValueError: If the number of feature names differs from the number of scores.
    """
    values = np.asarray(X)
    rows = values[label_idx] if label_idx is not None else values

    # np.where builds a new array, so the caller's X is never modified
    # (in-place assignment would write into X itself when label_idx is None).
    thresholded = np.where(rows < min_value, 0, rows)

    scores = np.asarray(aggregation_function(thresholded, axis=0))
    names = list(features)
    if len(names) != len(scores):
        raise ValueError(
            f"got {len(names)} feature names for {len(scores)} aggregated scores"
        )

    # Highest score first; the positional index of the result is the rank.
    # A stable sort keeps the order of tied scores deterministic.
    order = np.argsort(scores, kind="stable")[::-1]
    ranked_names = [names[i] for i in order]

    return pd.DataFrame({
        'feature': ranked_names,
        'score': scores[order],
        # number of distinct space-separated tokens in the feature name
        'ngram': [len(set(name.split(' '))) for name in ranked_names],
    })


def rank_features_for_all_labels(X: np.ndarray, y: np.ndarray, features: Collection[str],
                                 min_value: float = 0.1,
                                 aggregation_function: Callable = np.mean) -> List[pd.DataFrame]:
    """
    Rank features for every distinct label by their encoded values
    (e.g., CountVectorizer, TfidfVectorizer, etc.) aggregated with `aggregation_function`.

    :param X np.ndarray: Document-value matrix.
    :param y np.ndarray: Labels, one per row of `X`.
    :param features Collection[str]: Feature names.
    :param min_value float: Minimum value to take into account for each feature.
    :param aggregation_function Callable: Function to aggregate features such as `np.mean` or `np.sum`.
    :return: A list of DataFrames (one per label, in `np.unique(y)` order) with
        'rank' (rank within label), 'feature', 'score', 'ngram' and 'label'.
    """
    # Coerce so that `labels == label` is an element-wise mask even for a plain list.
    labels = np.asarray(y)

    frames = []
    for label in np.unique(labels):
        ranked = rank_features_by_label(
            X, features, labels == label, min_value, aggregation_function
        ).reset_index()
        ranked['label'] = label
        ranked.columns = ['rank', 'feature', 'score', 'ngram', 'label']
        frames.append(ranked)
    return frames


def plot_top_features(dataframes: Collection[pd.DataFrame], top_n: int = 25,
                      ngram_range: Tuple[int, int] = (1, 2)) -> None:
    """
    Plot top features from a collection of `rank_features_for_all_labels` dataframes,
    one horizontal bar chart per dataframe, and show the figure.

    :param dataframes Collection[pd.DataFrame]: Dataframes with 'feature', 'score',
        'ngram' and 'label' columns, already sorted best-first.
    :param top_n int: Maximum number of features to show per dataframe.
    :param ngram_range Tuple[int,int]: Inclusive range of ngram sizes to show.
    :return: None
    """
    # Imported here so the ranking helpers stay usable without a plotting backend.
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(12, 9), facecolor="w")
    for i, df in enumerate(dataframes):
        in_range = (df.ngram >= ngram_range[0]) & (df.ngram <= ngram_range[1])
        top = df[in_range][:top_n]
        # Size the axis from the rows that survived the filter: fewer than
        # top_n may remain, and barh needs positions and widths of equal length.
        positions = np.arange(len(top))
        # Positional lookup; the frame may be empty or carry a non-default index.
        label = df['label'].iloc[0] if len(df) else None

        ax = fig.add_subplot(1, len(dataframes), i + 1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Score", labelpad=16, fontsize=14)
        ax.set_title(f"Label = {str(label)}", fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
        ax.barh(positions, top.score, align='center', color='#3F5D7D')
        ax.set_yticks(positions)
        ax.set_ylim([-1, len(top)])
        ax.invert_yaxis()  # best-ranked feature at the top
        ax.set_yticklabels(top.feature)
    plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()


def top_feats_label(X: np.ndarray, features: Collection[str],
                    label_idx: Optional[Collection[bool]] = None,
                    min_val: float = 0.1, agg_func: Callable = np.mean) -> pd.DataFrame:
    """Pre-rename name and keyword names for `rank_features_by_label`."""
    return rank_features_by_label(X, features, label_idx, min_val, agg_func)


def top_feats_all(X: np.ndarray, y: np.ndarray, features: Collection[str],
                  min_val: float = 0.1, agg_func: Callable = np.mean) -> List[pd.DataFrame]:
    """Pre-rename name and keyword names for `rank_features_for_all_labels`."""
    return rank_features_for_all_labels(X, y, features, min_val, agg_func)


def plot_top_feats(dfs: Collection[pd.DataFrame], top_n: int = 25,
                   ngram_range: Tuple[int, int] = (1, 2)) -> None:
    """Pre-rename name and keyword names for `plot_top_features`."""
    plot_top_features(dfs, top_n, ngram_range)