Merge pull request #554 from brightics/brtc-issue-553
LGTM
krazyeom committed May 31, 2019
2 parents 8f1d161 + dce0b85 commit 072cc51
Showing 4 changed files with 316 additions and 9 deletions.
3 changes: 3 additions & 0 deletions function/python/brightics/function/textanalytics/__init__.py
@@ -14,6 +14,9 @@
limitations under the License.
"""



from .ngram import ngram
from .lda import lda
from .tfidf import tfidf
from .tfidf import tfidf2
@@ -36,7 +36,8 @@
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [
"String"
"String",
"String[]"
],
"validation": [],
"multiple": false
198 changes: 198 additions & 0 deletions function/python/brightics/function/textanalytics/meta/tfidf2.json
@@ -0,0 +1,198 @@
{
"script": {
"type": "",
"content": ""
},
"specJson": {
"category": "textanalytics",
"func": "brightics.function.textanalytics$tfidf297577",
"name": "brightics.function.textanalytics$tfidf2",
"context": "python",
"label": "TF-IDF",
"description": "This is a function to calculate TF-IDF, abbreviated term for term frequency-inverse document frequency. \n\nReference:\n+ <https://en.wikipedia.org/wiki/Tf-idf>",
"tags": [],
"version": "3.6",
"inputs": {
"table": ""
},
"outputs": {
"table_1": "",
"table_2": "",
"model": ""
},
"meta": {
"table": {
"type": "table"
},
"table_1": {
"type": "table"
},
"table_2": {
"type": "table"
},
"model": {
"type": "model"
}
},
"params": [
{
"id": "input_col",
"label": "Input Column",
"description": "",
"mandatory": true,
"items": [],
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [
"String",
"String[]"
],
"validation": [],
"multiple": false
},
{
"id": "max_df",
"label": "Maximum Document Frequency",
"description": "When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "the number of documents",
"type": "Integer"
},
{
"id": "min_df",
"label": "Minimum Document Frequency",
"description": "When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "1 (value >= 0)",
"type": "Integer",
"min": 0
},
{
"id": "num_voca",
"label": "Number of Vocabularies",
"description": "The number of vocabularies that will be utilized to count their frequencies in the entire documents. It should be greater than or equal to two.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "InputBox",
"columnType": [],
"validation": [],
"targetTable": [],
"placeHolder": "100 (value >= 2)",
"type": "Integer",
"min": 2
},
{
"id": "idf_weighting_scheme",
"label": "IDF Weighting Scheme",
"description": "Weighting scheme for IDF. Currently it is providing \"Unary\" and \"Inverse Document Frequency\" only.",
"mandatory": false,
"items": [
{
"label": "Unary",
"value": "unary",
"default": false
},
{
"label": "Inverse Document Frequency",
"value": "inverseDocumentFrequency",
"default": true
}
],
"visibleOption": [],
"control": "RadioButton",
"columnType": [],
"validation": [],
"targetTable": []
},
{
"id": "norm",
"label": "Norm",
"description": "Norm used to normalize term vectors.",
"mandatory": false,
"items": [
{
"label": "L1",
"value": "l1",
"default": false
},
{
"label": "L2",
"value": "l2",
"default": true
}
],
"visibleOption": [],
"control": "RadioButton",
"columnType": [],
"validation": [],
"targetTable": []
},
{
"id": "smooth_idf",
"label": "Smooth IDF",
"description": "Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": true
},
{
"id": "sublinear_tf",
"label": "Sublinear TF",
"description": "Apply sublinear tf scaling, i.e. replace \"tf\" with \"1 + log(tf)\".",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": false
},
{
"id": "output_type",
"label": "Remove Zero Counts",
"description": "Delete zero counts.",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "BooleanRadio",
"columnType": [],
"validation": [],
"targetTable": [],
"defaultValue": true
},
{
"id": "group_by",
"label": "Group By",
"description": "Columns to group by",
"mandatory": false,
"items": [],
"visibleOption": [],
"control": "ColumnSelector",
"columnType": [],
"validation": [],
"multiple": true,
"rowCount": 5
}
]
},
"md": ""
}
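
Note: the parameters in this spec map closely onto scikit-learn's text APIs. Below is a minimal sketch of that mapping, assuming scikit-learn semantics; the helper name build_vectorizers and the toy corpus are illustrative and not part of this commit.

# A minimal sketch, assuming scikit-learn semantics for the spec's parameters.
# build_vectorizers and the sample corpus are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def build_vectorizers(max_df, min_df=1, num_voca=100, norm='l2',
                      smooth_idf=True, sublinear_tf=False):
    # max_df / min_df are the document-frequency cutoffs; num_voca caps the vocabulary size.
    tf = CountVectorizer(stop_words='english', max_df=max_df,
                         min_df=min_df, max_features=num_voca)
    # norm, smooth_idf, and sublinear_tf mirror the RadioButton/BooleanRadio params above.
    tfidf = TfidfTransformer(norm=norm, use_idf=True,
                             smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    return tf, tfidf

corpus = ["the cat sat", "the dog sat", "the cat ran"]
tf, tfidf = build_vectorizers(max_df=len(corpus))
counts = tf.fit_transform(corpus)     # term-frequency CSR matrix
scores = tfidf.fit_transform(counts)  # row-normalized TF-IDF scores
# With smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1,
# which is the "adding one to document frequencies" behavior described above.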
121 changes: 113 additions & 8 deletions function/python/brightics/function/textanalytics/tfidf.py
@@ -29,7 +29,7 @@
from sklearn.feature_extraction.text import TfidfTransformer


-def tfidf(table, group_by=None, **params):
+def tfidf(table, group_by=None, **params): # This will be deprecated.
    check_required_parameters(_tfidf, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tfidf)
    param_validation_check = [greater_than_or_equal_to(params, 0, 'min_df'),
@@ -62,17 +62,14 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting
    docID_list = []
    if output_type == False:
        vocabulary_list = []
-       index_list = []
        label_table = pd.DataFrame()
        for doc in range(len(corpus)):
-           docID_list += ['doc_{}'.format(doc + 1) for _ in range(len_voca)]
+           docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
-           index_list += [voca_dict[j][1] for j in range(len_voca)]
        label_table['document_id'] = docID_list
        label_table[input_col] = document_list
        label_table['vocabulary'] = vocabulary_list
-       label_table['index'] = index_list
        tfidf_table = label_table
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
@@ -82,12 +79,11 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting

    elif output_type == True:
        for doc in range(len(corpus)):
-           docID_list += ['doc_{}'.format(doc + 1) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
+           docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
            document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices]
-       tfidf_table['index'] = csr_matrix_tf.indices
        tfidf_table['frequency'] = csr_matrix_tf.data
        data_list = []
        for doc in range(len(corpus)):
@@ -104,7 +100,6 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
-   idf_table['index'] = [voca_dict[j][1] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
@@ -149,3 +144,113 @@ def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting
    model['_repr_brtc_'] = rb.get()

    return {'model' : model}
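
Note: both _tfidf above and the new _tfidf2 below walk the sparse result matrices row by row through the CSR fields indptr, indices, and data. A self-contained sketch of that traversal follows; the toy matrix is illustrative only.

# Illustrative only: how the per-document loops in this file read a CSR matrix.
# indptr[doc]..indptr[doc + 1] delimits the nonzeros of row `doc`;
# indices holds their column (vocabulary) ids and data their values.
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
for doc in range(m.shape[0]):
    start, end = m.indptr[doc], m.indptr[doc + 1]
    print(doc, m.indices[start:end], m.data[start:end])
# prints: 0 [0 2] [1 2]
#         1 [1] [3]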


def tfidf2(table, group_by=None, **params):
    check_required_parameters(_tfidf2, params, ['table'])
    params = get_default_from_parameters_if_required(params, _tfidf2)
    param_validation_check = [greater_than_or_equal_to(params, 0, 'min_df'),
                              greater_than_or_equal_to(params, 2, 'num_voca'),
                              greater_than(params, 0, 'max_df')]
    validate(*param_validation_check)
    if group_by is not None:
        return _function_by_group(_tfidf2, table, group_by=group_by, **params)
    else:
        return _tfidf2(table, **params)


def _tfidf2(table, input_col, max_df=None, min_df=1, num_voca=100, idf_weighting_scheme='inverseDocumentFrequency', norm='l2', smooth_idf=True, sublinear_tf=False, output_type=True):
    corpus = np.array(table[input_col])
    if max_df == None:
        max_df = len(corpus)
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    csr_matrix_tf = tf_vectorizer.transform(corpus)
    tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)
    csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf)

    voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1))
    len_voca = len(voca_dict)

    # tf-idf table

    tfidf_table = pd.DataFrame()
    document_list = []
    docID_list = []
    if output_type == False:
        vocabulary_list = []
        label_table = pd.DataFrame()
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)]
            document_list += [str(corpus[doc]) for _ in range(len_voca)]
            vocabulary_list += [voca_dict[j][0] for j in range(len_voca)]
        label_table['document_id'] = docID_list
        label_table[input_col] = document_list
        label_table['vocabulary'] = vocabulary_list
        tfidf_table = label_table
        tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense())
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf_score'] = np.ravel(csr_matrix_tfidf.todense())
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf_score'] = list(map(float, np.array(tfidf_table['frequency'])))

    elif output_type == True:
        for doc in range(len(corpus)):
            docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
            document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])]
        tfidf_table['document_id'] = docID_list
        tfidf_table[input_col] = document_list
        tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices]
        tfidf_table['frequency'] = csr_matrix_tf.data
        data_list = []
        for doc in range(len(corpus)):
            data_list += [csr_matrix_tfidf.data[i] for i in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])][::-1]
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            tfidf_table['tfidf_score'] = data_list
        elif idf_weighting_scheme == 'unary':
            tfidf_table['tfidf_score'] = list(map(float, np.array(tfidf_table['frequency'])))

    else:
        raise_runtime_error("Please check 'output_type'.")

    # idf table

    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))]
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf_weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        idf_table['idf_weight'] = float(1)

    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('tfidf')
    model['csr_matrix_tf'] = csr_matrix_tf
    model['csr_matrix_tfidf'] = csr_matrix_tfidf
    model['parameter'] = params
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()

    return {'table_1' : idf_table, 'table_2':tfidf_table, 'model':model}
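
Note: a hypothetical usage sketch for the new entry point, based on the spec above and the export in __init__.py; the sample data is made up.

import pandas as pd
from brightics.function.textanalytics import tfidf2

df = pd.DataFrame({'text': ['the cat sat on the mat',
                            'the dog sat on the log']})
res = tfidf2(df, input_col='text', min_df=1, num_voca=10)
print(res['table_1'])  # idf_table: vocabulary, idf_weight
print(res['table_2'])  # tfidf_table: document_id, text, vocabulary, frequency, tfidf_score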
