Merge pull request #554 from brightics/brtc-issue-553
LGTM
krazyeom committed May 31, 2019
2 parents 8f1d161 + dce0b85 commit 072cc51
Showing 4 changed files with 316 additions and 9 deletions.
3 changes: 3 additions & 0 deletions function/python/brightics/function/textanalytics/__init__.py
@@ -14,6 +14,9 @@
limitations under the License.
"""



from .ngram import ngram
from .lda import lda
from .tfidf import tfidf
from .tfidf import tfidf2
@@ -36,7 +36,8 @@
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [
"String"
"String",
"String[]"
],
"validation": [],
"multiple": false
198 changes: 198 additions & 0 deletions function/python/brightics/function/textanalytics/meta/tfidf2.json
@@ -0,0 +1,198 @@
{
"script": {
"type": "",
"content": ""
},
"specJson": {
"category": "textanalytics",
"func": "brightics.function.textanalytics$tfidf297577",
"name": "brightics.function.textanalytics$tfidf2",
"context": "python",
"label": "TF-IDF",
"description": "This is a function to calculate TF-IDF, abbreviated term for term frequency-inverse document frequency. \n\nReference:\n+ <https://en.wikipedia.org/wiki/Tf-idf>",
"tags": [],
"version": "3.6",
"inputs": {
"table": ""
},
"outputs": {
"table_1": "",
"table_2": "",
"model": ""
},
"meta": {
"table": {
"type": "table"
},
"table_1": {
"type": "table"
},
"table_2": {
"type": "table"
},
"model": {
"type": "model"
}
},
"params": [
{
"id": "input_col",
"label": "Input Column",
"description": "",
"mandatory": true,
"items": [],
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [
"String",
"String[]"
],
"validation": [],
"multiple": false
},
{
"id": "max_df",
"label": "Maximum Document Frequency",
"description": "When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "the number of documents",
"type": "Integer"
},
{
"id": "min_df",
"label": "Minimum Document Frequency",
"description": "When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "1 (value >= 0)",
"type": "Integer",
"min": 0
},
{
"id": "num_voca",
"label": "Number of Vocabularies",
"description": "The number of vocabularies that will be utilized to count their frequencies in the entire documents. It should be greater than or equal to two.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "100 (value >= 2)",
"type": "Integer",
"min": 2
},
{
"id": "idf_weighting_scheme",
"label": "IDF Weighting Scheme",
"description": "Weighting scheme for IDF. Currently it is providing \"Unary\" and \"Inverse Document Frequency\" only.",
"mandatory": false,
"items": [
{
"label": "Unary",
"value": "unary",
"default": false
},
{
"label": "Inverse Document Frequency",
"value": "inverseDocumentFrequency",
"default": true
}
],
"visibleOption": [],
"control": "RadioButton",
"columnType": [],
"validation": [],
"targetTable": []
},
{
"id": "norm",
"label": "Norm",
"description": "Norm used to normalize term vectors.",
"mandatory": false,
"items": [
{
"label": "L1",
"value": "l1",
"default": false
},
{
"label": "L2",
"value": "l2",
"default": true
}
],
"visibleOption": [],
"control": "RadioButton",
"columnType": [],
"validation": [],
"targetTable": []
},
{
"id": "smooth_idf",
"label": "Smooth IDF",
"description": "Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": true
},
{
"id": "sublinear_tf",
"label": "Sublinear TF",
"description": "Apply sublinear tf scaling, i.e. replace \"tf\" with \"1 + log(tf)\".",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": false
},
{
"id": "output_type",
"label": "Remove Zero Counts",
"description": "Delete zero counts.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": true
},
{
"id": "group_by",
"label": "Group By",
"description": "Columns to group by",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [],
"validation": [],
"multiple": true,
"rowCount": 5
}
]
},
"md": ""
}
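
Note: the parameters in this spec map closely onto scikit-learn's text APIs. Below is a minimal sketch of that mapping, assuming scikit-learn semantics; the helper name build_vectorizers and the toy corpus are illustrative and not part of this commit.

# A minimal sketch, assuming scikit-learn semantics for the spec's parameters.
# build_vectorizers and the sample corpus are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def build_vectorizers(max_df, min_df=1, num_voca=100, norm='l2',
                      smooth_idf=True, sublinear_tf=False):
    # max_df / min_df are the document-frequency cutoffs; num_voca caps the vocabulary size.
    tf = CountVectorizer(stop_words='english', max_df=max_df,
                         min_df=min_df, max_features=num_voca)
    # norm, smooth_idf, and sublinear_tf mirror the RadioButton/BooleanRadio params above.
    tfidf = TfidfTransformer(norm=norm, use_idf=True,
                             smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    return tf, tfidf

corpus = ["the cat sat", "the dog sat", "the cat ran"]
tf, tfidf = build_vectorizers(max_df=len(corpus))
counts = tf.fit_transform(corpus)     # term-frequency CSR matrix
scores = tfidf.fit_transform(counts)  # row-normalized TF-IDF scores
# With smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1,
# which is the "adding one to document frequencies" behavior described above.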
121 changes: 113 additions & 8 deletions function/python/brightics/function/textanalytics/tfidf.py
@@ -29,7 +29,7 @@
from sklearn.feature_extraction.text import TfidfTransformer


-def tfidf(table, group_by=None, **params):
+def tfidf(table, group_by=None, **params): # This will be deprecated.
    check_required_parameters(_tfidf, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tfidf)
    param_validation_check = [greater_than_or_equal_to(params, 0, 'min_df'),
@@ -62,17 +62,14 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting
    docID_list = []
    if output_type == False:
        vocabulary_list = []
-       index_list = []
        label_table = pd.DataFrame()
        for doc in range(len(corpus)):
-           docID_list += ['doc_{}'.format(doc + 1) for _ in range(len_voca)]
+           docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
-           index_list += [voca_dict[j][1] for j in range(len_voca)]
        label_table['document_id'] = docID_list
        label_table[input_col] = document_list
        label_table['vocabulary'] = vocabulary_list
-       label_table['index'] = index_list
        tfidf_table = label_table
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
@@ -82,12 +79,11 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting

    elif output_type == True:
        for doc in range(len(corpus)):
-           docID_list += ['doc_{}'.format(doc + 1) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
+           docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
            document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices]
-       tfidf_table['index'] = csr_matrix_tf.indices
        tfidf_table['frequency'] = csr_matrix_tf.data
        data_list = []
        for doc in range(len(corpus)):
@@ -104,7 +100,6 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
-   idf_table['index'] = [voca_dict[j][1] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
@@ -149,3 +144,113 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
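
Note: both _tfidf above and the new _tfidf2 below walk the sparse result matrices row by row through the CSR fields indptr, indices, and data. A self-contained sketch of that traversal follows; the toy matrix is illustrative only.

# Illustrative only: how the per-document loops in this file read a CSR matrix.
# indptr[doc]..indptr[doc + 1] delimits the nonzeros of row `doc`;
# indices holds their column (vocabulary) ids and data their values.
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
for doc in range(m.shape[0]):
    start, end = m.indptr[doc], m.indptr[doc + 1]
    print(doc, m.indices[start:end], m.data[start:end])
# prints: 0 [0 2] [1 2]
#         1 [1] [3]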


def tfidf2(table, group_by=None, **params):
    check_required_parameters(_tfidf2, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tfidf2)
    param_validation_check = [greater_than_or_equal_to(params, 0, 'min_df'),
                              greater_than_or_equal_to(params, 2, 'num_voca'),
                              greater_than(params, 0, 'max_df')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_tfidf2, table, group_by=group_by, **params)
    else:
        return _tfidf2(table, **params)


def _tfidf2(table, input_col, max_df=None, min_df=1, num_voca=100, idf_weighting_scheme='inverseDocumentFrequency', norm='l2', smooth_idf=True, sublinear_tf=False, output_type=True):
    corpus = np.array(table[input_col])
    if max_df == None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)

    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    len_voca = len(voca_dict)

    # tf-idf table

    tfidf_table = pd.DataFrame()
    document_list = []
    docID_list = []
    if output_type == False:
        vocabulary_list = []
        label_table = pd.DataFrame()
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
        label_table['document_id'] = docID_list
        label_table[input_col] = document_list
        label_table['vocabulary'] = vocabulary_list
        tfidf_table = label_table
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf_score'] = np.ravel(csr_matrix_tfidf.todense())
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf_score'] = list(map(float, np.array(tfidf_table['frequency'])))

    elif output_type == True:
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
            document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices]
        tfidf_table['frequency'] = csr_matrix_tf.data
        data_list = []
        for doc in range(len(corpus)):
            data_list += [csr_matrix_tfidf.data[i] for i in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])][::-1]
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf_score'] = data_list
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf_score'] = list(map(float, np.array(tfidf_table['frequency'])))

    else:
        raise_runtime_error("Please check 'output_type'.")

    # idf table

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf_weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf_weight'] = float(1)

    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'table_1' : idf_table, 'table_2':tfidf_table, 'model':model}
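
Note: a hypothetical usage sketch for the new entry point, based on the spec above and the export in __init__.py; the sample data is made up.

import pandas as pd
from brightics.function.textanalytics import tfidf2

df = pd.DataFrame({'text': ['the cat sat on the mat',
                            'the dog sat on the log']})
res = tfidf2(df, input_col='text', min_df=1, num_voca=10)
print(res['table_1'])  # idf_table: vocabulary, idf_weight
print(res['table_2'])  # tfidf_table: document_id, text, vocabulary, frequency, tfidf_score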
