diff --git a/tests/data/vectorizer/expected.json b/tests/data/vectorizer/expected.json
deleted file mode 100644
index b3f54d1..0000000
--- a/tests/data/vectorizer/expected.json
+++ /dev/null
@@ -1,636 +0,0 @@
-{
- "test_simpleVectorizer_binary": {
- "(0, 2)": 1.0,
- "(0, 3)": 1.0,
- "(1, 0)": 1.0,
- "(1, 5)": 1.0,
- "(2, 2)": 1.0,
- "(2, 4)": 1.0,
- "(3, 1)": 1.0,
- "(3, 5)": 1.0
- },
- "test_simpleVectorizer_triple": {
- "(0, 1)": 1.0,
- "(0, 3)": 1.0,
- "(0, 8)": 1.0,
- "(1, 1)": 1.0,
- "(1, 5)": 1.0,
- "(1, 6)": 1.0,
- "(2, 0)": 1.0,
- "(2, 4)": 1.0,
- "(2, 8)": 1.0,
- "(3, 0)": 1.0,
- "(3, 5)": 1.0,
- "(3, 7)": 1.0,
- "(4, 2)": 1.0,
- "(4, 4)": 1.0,
- "(4, 6)": 1.0,
- "(5, 2)": 1.0,
- "(5, 3)": 1.0,
- "(5, 7)": 1.0
- },
- "test_vectorizer_bigrams_1": {
- "bigrams_ _gnorcyvmer": 0.68309028,
- "bigrams_a_common": 0.78013025,
- "bigrams_a_known": 0.68309028,
- "bigrams_be_treated": 0.81649658,
- "bigrams_bmzvpvwbpw_failed": 0.81649658,
- "bigrams_can_be": 0.81649658,
- "bigrams_clinical_trials": 0.81649658,
- "bigrams_common_treatment": 0.78013025,
- "bigrams_effect_of": 0.68309028,
- "bigrams_failed_clinical": 0.81649658,
- "bigrams_for_kyekjnkrfo": 0.81649658,
- "bigrams_for_zgwivlcmly": 0.78013025,
- "bigrams_gnorcyvmer_is": 0.68309028,
- "bigrams_is_a": 1.10705635,
- "bigrams_known_side": 0.68309028,
- "bigrams_kyekjnkrfo_.": 0.81649658,
- "bigrams_of_ruswdgzajr": 0.68309028,
- "bigrams_ootopaoxbg_can": 0.81649658,
- "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025,
- "bigrams_ruswdgzajr_.": 0.68309028,
- "bigrams_side_effect": 0.68309028,
- "bigrams_treated_with": 0.81649658,
- "bigrams_treatment_for": 0.78013025,
- "bigrams_trials_for": 0.81649658,
- "bigrams_vgypkemhjr_.": 0.81649658,
- "bigrams_with_vgypkemhjr": 0.81649658,
- "bigrams_zgwivlcmly_.": 0.78013025
- },
- "test_vectorizer_bigrams_2": {
- "bigrams_ _gnorcyvmer": 0.68309028,
- "bigrams_a_common": 0.78013025,
- "bigrams_a_known": 0.68309028,
- "bigrams_be_treated": 0.81649658,
- "bigrams_bmzvpvwbpw_failed": 0.81649658,
- "bigrams_can_be": 0.81649658,
- "bigrams_clinical_trials": 0.81649658,
- "bigrams_common_treatment": 0.78013025,
- "bigrams_effect_of": 0.68309028,
- "bigrams_failed_clinical": 0.81649658,
- "bigrams_for_kyekjnkrfo": 0.81649658,
- "bigrams_for_zgwivlcmly": 0.78013025,
- "bigrams_gnorcyvmer_is": 0.68309028,
- "bigrams_is_a": 1.10705635,
- "bigrams_known_side": 0.68309028,
- "bigrams_kyekjnkrfo_.": 0.81649658,
- "bigrams_of_ruswdgzajr": 0.68309028,
- "bigrams_ootopaoxbg_can": 0.81649658,
- "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025,
- "bigrams_ruswdgzajr_.": 0.68309028,
- "bigrams_side_effect": 0.68309028,
- "bigrams_treated_with": 0.81649658,
- "bigrams_treatment_for": 0.78013025,
- "bigrams_trials_for": 0.81649658,
- "bigrams_vgypkemhjr_.": 0.81649658,
- "bigrams_with_vgypkemhjr": 0.81649658,
- "bigrams_zgwivlcmly_.": 0.78013025
- },
- "test_vectorizer_bigrams_noTFIDF_1": {
- "bigrams_ _gnorcyvmer": 4.0,
- "bigrams_a_common": 4.0,
- "bigrams_a_known": 4.0,
- "bigrams_be_treated": 4.0,
- "bigrams_bmzvpvwbpw_failed": 4.0,
- "bigrams_can_be": 4.0,
- "bigrams_clinical_trials": 4.0,
- "bigrams_common_treatment": 4.0,
- "bigrams_effect_of": 4.0,
- "bigrams_failed_clinical": 4.0,
- "bigrams_for_kyekjnkrfo": 4.0,
- "bigrams_for_zgwivlcmly": 4.0,
- "bigrams_gnorcyvmer_is": 4.0,
- "bigrams_is_a": 8.0,
- "bigrams_known_side": 4.0,
- "bigrams_kyekjnkrfo_.": 4.0,
- "bigrams_of_ruswdgzajr": 4.0,
- "bigrams_ootopaoxbg_can": 4.0,
- "bigrams_pehhjnlvvewbjccovflf_is": 4.0,
- "bigrams_ruswdgzajr_.": 4.0,
- "bigrams_side_effect": 4.0,
- "bigrams_treated_with": 4.0,
- "bigrams_treatment_for": 4.0,
- "bigrams_trials_for": 4.0,
- "bigrams_vgypkemhjr_.": 4.0,
- "bigrams_with_vgypkemhjr": 4.0,
- "bigrams_zgwivlcmly_.": 4.0
- },
- "test_vectorizer_bigrams_noTFIDF_2": {
- "bigrams_ _gnorcyvmer": 4.0,
- "bigrams_a_common": 4.0,
- "bigrams_a_known": 4.0,
- "bigrams_be_treated": 4.0,
- "bigrams_bmzvpvwbpw_failed": 4.0,
- "bigrams_can_be": 4.0,
- "bigrams_clinical_trials": 4.0,
- "bigrams_common_treatment": 4.0,
- "bigrams_effect_of": 4.0,
- "bigrams_failed_clinical": 4.0,
- "bigrams_for_kyekjnkrfo": 4.0,
- "bigrams_for_zgwivlcmly": 4.0,
- "bigrams_gnorcyvmer_is": 4.0,
- "bigrams_is_a": 8.0,
- "bigrams_known_side": 4.0,
- "bigrams_kyekjnkrfo_.": 4.0,
- "bigrams_of_ruswdgzajr": 4.0,
- "bigrams_ootopaoxbg_can": 4.0,
- "bigrams_pehhjnlvvewbjccovflf_is": 4.0,
- "bigrams_ruswdgzajr_.": 4.0,
- "bigrams_side_effect": 4.0,
- "bigrams_treated_with": 4.0,
- "bigrams_treatment_for": 4.0,
- "bigrams_trials_for": 4.0,
- "bigrams_vgypkemhjr_.": 4.0,
- "bigrams_with_vgypkemhjr": 4.0,
- "bigrams_zgwivlcmly_.": 4.0
- },
- "test_vectorizer_defaults_1": {
- "bigrams_ _gnorcyvmer": 0.68309028,
- "bigrams_a_common": 0.78013025,
- "bigrams_a_known": 0.68309028,
- "bigrams_be_treated": 0.81649658,
- "bigrams_bmzvpvwbpw_failed": 0.81649658,
- "bigrams_can_be": 0.81649658,
- "bigrams_clinical_trials": 0.81649658,
- "bigrams_common_treatment": 0.78013025,
- "bigrams_effect_of": 0.68309028,
- "bigrams_failed_clinical": 0.81649658,
- "bigrams_for_kyekjnkrfo": 0.81649658,
- "bigrams_for_zgwivlcmly": 0.78013025,
- "bigrams_gnorcyvmer_is": 0.68309028,
- "bigrams_is_a": 1.10705635,
- "bigrams_known_side": 0.68309028,
- "bigrams_kyekjnkrfo_.": 0.81649658,
- "bigrams_of_ruswdgzajr": 0.68309028,
- "bigrams_ootopaoxbg_can": 0.81649658,
- "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025,
- "bigrams_ruswdgzajr_.": 0.68309028,
- "bigrams_side_effect": 0.68309028,
- "bigrams_treated_with": 0.81649658,
- "bigrams_treatment_for": 0.78013025,
- "bigrams_trials_for": 0.81649658,
- "bigrams_vgypkemhjr_.": 0.81649658,
- "bigrams_with_vgypkemhjr": 0.81649658,
- "bigrams_zgwivlcmly_.": 0.78013025,
- "dependencypathelements_amod": 2.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 6.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 8.0,
- "dependencypathelements_prep": 8.0,
- "dependencypathnearselectedtoken_0_nsubj": 3.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 3.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0,
- "ngrams_betweenentities_a": 1.45195225,
- "ngrams_betweenentities_be": 1.0,
- "ngrams_betweenentities_can": 1.0,
- "ngrams_betweenentities_clinical": 1.05815267,
- "ngrams_betweenentities_common": 1.03733099,
- "ngrams_betweenentities_effect": 0.88174599,
- "ngrams_betweenentities_failed": 1.05815267,
- "ngrams_betweenentities_for": 1.58541958,
- "ngrams_betweenentities_is": 1.45195225,
- "ngrams_betweenentities_known": 0.88174599,
- "ngrams_betweenentities_of": 0.88174599,
- "ngrams_betweenentities_side": 0.88174599,
- "ngrams_betweenentities_treated": 1.0,
- "ngrams_betweenentities_treatment": 1.03733099,
- "ngrams_betweenentities_trials": 1.05815267,
- "ngrams_betweenentities_with": 1.0,
- "selectedtokentypes_0_disease": 2.0,
- "selectedtokentypes_0_disease2": 2.0,
- "selectedtokentypes_0_drug": 4.0,
- "selectedtokentypes_1_disease": 2.0,
- "selectedtokentypes_1_disease2": 2.0,
- "selectedtokentypes_1_drug": 4.0
- },
- "test_vectorizer_defaults_2": {
- "bigrams_ _gnorcyvmer": 0.0,
- "bigrams_a_common": 1.05815267,
- "bigrams_a_known": 0.0,
- "bigrams_be_treated": 0.81649658,
- "bigrams_bmzvpvwbpw_failed": 1.0,
- "bigrams_can_be": 0.81649658,
- "bigrams_clinical_trials": 2.15470054,
- "bigrams_common_treatment": 1.05815267,
- "bigrams_effect_of": 0.0,
- "bigrams_failed_clinical": 2.15470054,
- "bigrams_for_kyekjnkrfo": 0.0,
- "bigrams_for_zgwivlcmly": 0.0,
- "bigrams_gnorcyvmer_is": 0.0,
- "bigrams_is_a": 0.80058652,
- "bigrams_known_side": 0.0,
- "bigrams_kyekjnkrfo_.": 2.0,
- "bigrams_of_ruswdgzajr": 0.0,
- "bigrams_ootopaoxbg_can": 0.81649658,
- "bigrams_pehhjnlvvewbjccovflf_is": 2.0,
- "bigrams_ruswdgzajr_.": 0.0,
- "bigrams_side_effect": 0.0,
- "bigrams_treated_with": 0.81649658,
- "bigrams_treatment_for": 1.05815267,
- "bigrams_trials_for": 2.15470054,
- "bigrams_vgypkemhjr_.": 0.81649658,
- "bigrams_with_vgypkemhjr": 0.81649658,
- "bigrams_zgwivlcmly_.": 0.0,
- "dependencypathelements_amod": 4.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 8.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 10.0,
- "dependencypathelements_prep": 10.0,
- "dependencypathnearselectedtoken_0_nsubj": 4.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 4.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0,
- "ngrams_betweenentities_a": 0.78483307,
- "ngrams_betweenentities_be": 1.0,
- "ngrams_betweenentities_can": 1.0,
- "ngrams_betweenentities_clinical": 2.11630534,
- "ngrams_betweenentities_common": 1.03733099,
- "ngrams_betweenentities_effect": 0.0,
- "ngrams_betweenentities_failed": 2.11630534,
- "ngrams_betweenentities_for": 2.3860061,
- "ngrams_betweenentities_is": 1.99154812,
- "ngrams_betweenentities_known": 0.0,
- "ngrams_betweenentities_of": 1.59494162,
- "ngrams_betweenentities_side": 0.0,
- "ngrams_betweenentities_treated": 1.0,
- "ngrams_betweenentities_treatment": 1.03733099,
- "ngrams_betweenentities_trials": 2.11630534,
- "ngrams_betweenentities_with": 1.0,
- "selectedtokentypes_0_disease": 5.0,
- "selectedtokentypes_0_disease2": 4.0,
- "selectedtokentypes_0_drug": 9.0,
- "selectedtokentypes_1_disease": 5.0,
- "selectedtokentypes_1_disease2": 4.0,
- "selectedtokentypes_1_drug": 9.0
- },
- "test_vectorizer_defaults_triple_1": {
- "bigrams_and_targets": 2.44948974,
- "bigrams_be_treated": 3.15197269,
- "bigrams_by_fvdxdietdx": 2.28320249,
- "bigrams_by_zkrkzlyfef": 2.28320249,
- "bigrams_can_be": 3.15197269,
- "bigrams_elvptnpvyc_.": 2.28320249,
- "bigrams_fvdxdietdx_inhibition": 2.28320249,
- "bigrams_hxlfssirgk_.": 2.44948974,
- "bigrams_inhibition_using": 3.15197269,
- "bigrams_knetvjnjun_and": 2.44948974,
- "bigrams_kyekjnkrfo_can": 2.28320249,
- "bigrams_oxzbaapqct_treats": 2.44948974,
- "bigrams_targets_hxlfssirgk": 2.44948974,
- "bigrams_treated_by": 3.15197269,
- "bigrams_treats_knetvjnjun": 2.44948974,
- "bigrams_usckfljzxu_.": 2.28320249,
- "bigrams_using_elvptnpvyc": 2.28320249,
- "bigrams_using_usckfljzxu": 2.28320249,
- "bigrams_zgwivlcmly_can": 2.28320249,
- "bigrams_zkrkzlyfef_inhibition": 2.28320249,
- "dependencypathelements_0_1_acl": 8.0,
- "dependencypathelements_0_1_agent": 8.0,
- "dependencypathelements_0_1_compound": 8.0,
- "dependencypathelements_0_1_conj": 4.0,
- "dependencypathelements_0_1_dobj": 12.0,
- "dependencypathelements_0_1_nsubj": 4.0,
- "dependencypathelements_0_1_nsubjpass": 8.0,
- "dependencypathelements_0_1_pobj": 8.0,
- "dependencypathelements_0_2_acl": 8.0,
- "dependencypathelements_0_2_agent": 8.0,
- "dependencypathelements_0_2_compound": 8.0,
- "dependencypathelements_0_2_conj": 4.0,
- "dependencypathelements_0_2_dobj": 12.0,
- "dependencypathelements_0_2_nsubj": 4.0,
- "dependencypathelements_0_2_nsubjpass": 8.0,
- "dependencypathelements_0_2_pobj": 8.0,
- "dependencypathelements_1_2_acl": 8.0,
- "dependencypathelements_1_2_agent": 8.0,
- "dependencypathelements_1_2_compound": 8.0,
- "dependencypathelements_1_2_conj": 4.0,
- "dependencypathelements_1_2_dobj": 12.0,
- "dependencypathelements_1_2_nsubj": 4.0,
- "dependencypathelements_1_2_nsubjpass": 8.0,
- "dependencypathelements_1_2_pobj": 8.0,
- "dependencypathnearselectedtoken_0_compound": 4.0,
- "dependencypathnearselectedtoken_0_conj": 2.0,
- "dependencypathnearselectedtoken_0_nsubj": 2.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 4.0,
- "dependencypathnearselectedtoken_1_compound": 4.0,
- "dependencypathnearselectedtoken_1_conj": 2.0,
- "dependencypathnearselectedtoken_1_nsubj": 2.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 4.0,
- "dependencypathnearselectedtoken_2_compound": 4.0,
- "dependencypathnearselectedtoken_2_conj": 2.0,
- "dependencypathnearselectedtoken_2_nsubj": 2.0,
- "dependencypathnearselectedtoken_2_nsubjpass": 4.0,
- "ngrams_betweenentities_0_1_and": 1.46201744,
- "ngrams_betweenentities_0_1_be": 2.08991136,
- "ngrams_betweenentities_0_1_by": 2.08991136,
- "ngrams_betweenentities_0_1_can": 2.08991136,
- "ngrams_betweenentities_0_1_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_0_1_inhibition": 2.08991136,
- "ngrams_betweenentities_0_1_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_0_1_targets": 1.46201744,
- "ngrams_betweenentities_0_1_treated": 2.08991136,
- "ngrams_betweenentities_0_1_treats": 1.46201744,
- "ngrams_betweenentities_0_1_using": 2.08991136,
- "ngrams_betweenentities_0_1_zkrkzlyfef": 0.8510011,
- "ngrams_betweenentities_0_2_and": 1.46201744,
- "ngrams_betweenentities_0_2_be": 2.08991136,
- "ngrams_betweenentities_0_2_by": 2.08991136,
- "ngrams_betweenentities_0_2_can": 2.08991136,
- "ngrams_betweenentities_0_2_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_0_2_inhibition": 2.08991136,
- "ngrams_betweenentities_0_2_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_0_2_targets": 1.46201744,
- "ngrams_betweenentities_0_2_treated": 2.08991136,
- "ngrams_betweenentities_0_2_treats": 1.46201744,
- "ngrams_betweenentities_0_2_using": 2.08991136,
- "ngrams_betweenentities_0_2_zkrkzlyfef": 0.8510011,
- "ngrams_betweenentities_1_2_and": 1.46201744,
- "ngrams_betweenentities_1_2_be": 2.08991136,
- "ngrams_betweenentities_1_2_by": 2.08991136,
- "ngrams_betweenentities_1_2_can": 2.08991136,
- "ngrams_betweenentities_1_2_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_1_2_inhibition": 2.08991136,
- "ngrams_betweenentities_1_2_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_1_2_targets": 1.46201744,
- "ngrams_betweenentities_1_2_treated": 2.08991136,
- "ngrams_betweenentities_1_2_treats": 1.46201744,
- "ngrams_betweenentities_1_2_using": 2.08991136,
- "ngrams_betweenentities_1_2_zkrkzlyfef": 0.8510011,
- "selectedtokentypes_0_disease": 6.0,
- "selectedtokentypes_0_drug": 6.0,
- "selectedtokentypes_0_gene": 6.0,
- "selectedtokentypes_1_disease": 6.0,
- "selectedtokentypes_1_drug": 6.0,
- "selectedtokentypes_1_gene": 6.0,
- "selectedtokentypes_2_disease": 6.0,
- "selectedtokentypes_2_drug": 6.0,
- "selectedtokentypes_2_gene": 6.0
- },
- "test_vectorizer_defaults_triple_2": {
- "bigrams_and_targets": 2.44948974,
- "bigrams_be_treated": 3.15197269,
- "bigrams_by_fvdxdietdx": 2.28320249,
- "bigrams_by_zkrkzlyfef": 2.28320249,
- "bigrams_can_be": 3.15197269,
- "bigrams_elvptnpvyc_.": 2.28320249,
- "bigrams_fvdxdietdx_inhibition": 2.28320249,
- "bigrams_hxlfssirgk_.": 2.44948974,
- "bigrams_inhibition_using": 3.15197269,
- "bigrams_knetvjnjun_and": 2.44948974,
- "bigrams_kyekjnkrfo_can": 2.28320249,
- "bigrams_oxzbaapqct_treats": 2.44948974,
- "bigrams_targets_hxlfssirgk": 2.44948974,
- "bigrams_treated_by": 3.15197269,
- "bigrams_treats_knetvjnjun": 2.44948974,
- "bigrams_usckfljzxu_.": 2.28320249,
- "bigrams_using_elvptnpvyc": 2.28320249,
- "bigrams_using_usckfljzxu": 2.28320249,
- "bigrams_zgwivlcmly_can": 2.28320249,
- "bigrams_zkrkzlyfef_inhibition": 2.28320249,
- "dependencypathelements_0_1_acl": 8.0,
- "dependencypathelements_0_1_agent": 8.0,
- "dependencypathelements_0_1_compound": 8.0,
- "dependencypathelements_0_1_conj": 4.0,
- "dependencypathelements_0_1_dobj": 12.0,
- "dependencypathelements_0_1_nsubj": 4.0,
- "dependencypathelements_0_1_nsubjpass": 8.0,
- "dependencypathelements_0_1_pobj": 8.0,
- "dependencypathelements_0_2_acl": 8.0,
- "dependencypathelements_0_2_agent": 8.0,
- "dependencypathelements_0_2_compound": 8.0,
- "dependencypathelements_0_2_conj": 4.0,
- "dependencypathelements_0_2_dobj": 12.0,
- "dependencypathelements_0_2_nsubj": 4.0,
- "dependencypathelements_0_2_nsubjpass": 8.0,
- "dependencypathelements_0_2_pobj": 8.0,
- "dependencypathelements_1_2_acl": 8.0,
- "dependencypathelements_1_2_agent": 8.0,
- "dependencypathelements_1_2_compound": 8.0,
- "dependencypathelements_1_2_conj": 4.0,
- "dependencypathelements_1_2_dobj": 12.0,
- "dependencypathelements_1_2_nsubj": 4.0,
- "dependencypathelements_1_2_nsubjpass": 8.0,
- "dependencypathelements_1_2_pobj": 8.0,
- "dependencypathnearselectedtoken_0_compound": 4.0,
- "dependencypathnearselectedtoken_0_conj": 2.0,
- "dependencypathnearselectedtoken_0_nsubj": 2.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 4.0,
- "dependencypathnearselectedtoken_1_compound": 4.0,
- "dependencypathnearselectedtoken_1_conj": 2.0,
- "dependencypathnearselectedtoken_1_nsubj": 2.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 4.0,
- "dependencypathnearselectedtoken_2_compound": 4.0,
- "dependencypathnearselectedtoken_2_conj": 2.0,
- "dependencypathnearselectedtoken_2_nsubj": 2.0,
- "dependencypathnearselectedtoken_2_nsubjpass": 4.0,
- "ngrams_betweenentities_0_1_and": 1.46201744,
- "ngrams_betweenentities_0_1_be": 2.08991136,
- "ngrams_betweenentities_0_1_by": 2.08991136,
- "ngrams_betweenentities_0_1_can": 2.08991136,
- "ngrams_betweenentities_0_1_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_0_1_inhibition": 2.08991136,
- "ngrams_betweenentities_0_1_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_0_1_targets": 1.46201744,
- "ngrams_betweenentities_0_1_treated": 2.08991136,
- "ngrams_betweenentities_0_1_treats": 1.46201744,
- "ngrams_betweenentities_0_1_using": 2.08991136,
- "ngrams_betweenentities_0_1_zkrkzlyfef": 0.8510011,
- "ngrams_betweenentities_0_2_and": 1.46201744,
- "ngrams_betweenentities_0_2_be": 2.08991136,
- "ngrams_betweenentities_0_2_by": 2.08991136,
- "ngrams_betweenentities_0_2_can": 2.08991136,
- "ngrams_betweenentities_0_2_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_0_2_inhibition": 2.08991136,
- "ngrams_betweenentities_0_2_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_0_2_targets": 1.46201744,
- "ngrams_betweenentities_0_2_treated": 2.08991136,
- "ngrams_betweenentities_0_2_treats": 1.46201744,
- "ngrams_betweenentities_0_2_using": 2.08991136,
- "ngrams_betweenentities_0_2_zkrkzlyfef": 0.8510011,
- "ngrams_betweenentities_1_2_and": 1.46201744,
- "ngrams_betweenentities_1_2_be": 2.08991136,
- "ngrams_betweenentities_1_2_by": 2.08991136,
- "ngrams_betweenentities_1_2_can": 2.08991136,
- "ngrams_betweenentities_1_2_fvdxdietdx": 0.8510011,
- "ngrams_betweenentities_1_2_inhibition": 2.08991136,
- "ngrams_betweenentities_1_2_knetvjnjun": 0.8909307,
- "ngrams_betweenentities_1_2_targets": 1.46201744,
- "ngrams_betweenentities_1_2_treated": 2.08991136,
- "ngrams_betweenentities_1_2_treats": 1.46201744,
- "ngrams_betweenentities_1_2_using": 2.08991136,
- "ngrams_betweenentities_1_2_zkrkzlyfef": 0.8510011,
- "selectedtokentypes_0_disease": 6.0,
- "selectedtokentypes_0_drug": 6.0,
- "selectedtokentypes_0_gene": 6.0,
- "selectedtokentypes_1_disease": 6.0,
- "selectedtokentypes_1_drug": 6.0,
- "selectedtokentypes_1_gene": 6.0,
- "selectedtokentypes_2_disease": 6.0,
- "selectedtokentypes_2_drug": 6.0,
- "selectedtokentypes_2_gene": 6.0
- },
- "test_vectorizer_dependencyPathEdgesNearEntities_1": {
- "dependencypathnearselectedtoken_0_nsubj": 3.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 3.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0
- },
- "test_vectorizer_dependencyPathEdgesNearEntities_2": {
- "dependencypathnearselectedtoken_0_nsubj": 3.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 3.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0
- },
- "test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_1": {
- "dependencypathnearselectedtoken_0_nsubj": 3.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 3.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0
- },
- "test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_2": {
- "dependencypathnearselectedtoken_0_nsubj": 3.0,
- "dependencypathnearselectedtoken_0_nsubjpass": 1.0,
- "dependencypathnearselectedtoken_1_nsubj": 3.0,
- "dependencypathnearselectedtoken_1_nsubjpass": 1.0
- },
- "test_vectorizer_dependencyPathEdges_1": {
- "dependencypathelements_amod": 2.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 6.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 8.0,
- "dependencypathelements_prep": 8.0
- },
- "test_vectorizer_dependencyPathEdges_2": {
- "dependencypathelements_amod": 2.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 6.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 8.0,
- "dependencypathelements_prep": 8.0
- },
- "test_vectorizer_dependencyPathEdges_noTFIDF_1": {
- "dependencypathelements_amod": 2.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 6.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 8.0,
- "dependencypathelements_prep": 8.0
- },
- "test_vectorizer_dependencyPathEdges_noTFIDF_2": {
- "dependencypathelements_amod": 2.0,
- "dependencypathelements_attr": 4.0,
- "dependencypathelements_nsubj": 6.0,
- "dependencypathelements_nsubjpass": 2.0,
- "dependencypathelements_pobj": 8.0,
- "dependencypathelements_prep": 8.0
- },
- "test_vectorizer_entityTypes_1": {
- "selectedtokentypes_0_disease": 2.0,
- "selectedtokentypes_0_disease2": 2.0,
- "selectedtokentypes_0_drug": 4.0,
- "selectedtokentypes_1_disease": 2.0,
- "selectedtokentypes_1_disease2": 2.0,
- "selectedtokentypes_1_drug": 4.0
- },
- "test_vectorizer_entityTypes_2": {
- "selectedtokentypes_0_disease": 5.0,
- "selectedtokentypes_0_disease2": 4.0,
- "selectedtokentypes_0_drug": 9.0,
- "selectedtokentypes_1_disease": 5.0,
- "selectedtokentypes_1_disease2": 4.0,
- "selectedtokentypes_1_drug": 9.0
- },
- "test_vectorizer_entityTypes_noTFIDF_1": {
- "selectedtokentypes_0_disease": 2.0,
- "selectedtokentypes_0_disease2": 2.0,
- "selectedtokentypes_0_drug": 4.0,
- "selectedtokentypes_1_disease": 2.0,
- "selectedtokentypes_1_disease2": 2.0,
- "selectedtokentypes_1_drug": 4.0
- },
- "test_vectorizer_entityTypes_noTFIDF_2": {
- "selectedtokentypes_0_disease": 2.0,
- "selectedtokentypes_0_disease2": 2.0,
- "selectedtokentypes_0_drug": 4.0,
- "selectedtokentypes_1_disease": 2.0,
- "selectedtokentypes_1_disease2": 2.0,
- "selectedtokentypes_1_drug": 4.0
- },
- "test_vectorizer_unigramsBetweenEntities_1": {
- "ngrams_betweenentities_a": 1.45195225,
- "ngrams_betweenentities_be": 1.0,
- "ngrams_betweenentities_can": 1.0,
- "ngrams_betweenentities_clinical": 1.05815267,
- "ngrams_betweenentities_common": 1.03733099,
- "ngrams_betweenentities_effect": 0.88174599,
- "ngrams_betweenentities_failed": 1.05815267,
- "ngrams_betweenentities_for": 1.58541958,
- "ngrams_betweenentities_is": 1.45195225,
- "ngrams_betweenentities_known": 0.88174599,
- "ngrams_betweenentities_of": 0.88174599,
- "ngrams_betweenentities_side": 0.88174599,
- "ngrams_betweenentities_treated": 1.0,
- "ngrams_betweenentities_treatment": 1.03733099,
- "ngrams_betweenentities_trials": 1.05815267,
- "ngrams_betweenentities_with": 1.0
- },
- "test_vectorizer_unigramsBetweenEntities_2": {
- "ngrams_betweenentities_a": 1.45195225,
- "ngrams_betweenentities_be": 1.0,
- "ngrams_betweenentities_can": 1.0,
- "ngrams_betweenentities_clinical": 1.05815267,
- "ngrams_betweenentities_common": 1.03733099,
- "ngrams_betweenentities_effect": 0.88174599,
- "ngrams_betweenentities_failed": 1.05815267,
- "ngrams_betweenentities_for": 1.58541958,
- "ngrams_betweenentities_is": 1.45195225,
- "ngrams_betweenentities_known": 0.88174599,
- "ngrams_betweenentities_of": 0.88174599,
- "ngrams_betweenentities_side": 0.88174599,
- "ngrams_betweenentities_treated": 1.0,
- "ngrams_betweenentities_treatment": 1.03733099,
- "ngrams_betweenentities_trials": 1.05815267,
- "ngrams_betweenentities_with": 1.0
- },
- "test_vectorizer_unigramsBetweenEntities_noTFIDF_1": {
- "ngrams_betweenentities_a": 4.0,
- "ngrams_betweenentities_be": 2.0,
- "ngrams_betweenentities_can": 2.0,
- "ngrams_betweenentities_clinical": 2.0,
- "ngrams_betweenentities_common": 2.0,
- "ngrams_betweenentities_effect": 2.0,
- "ngrams_betweenentities_failed": 2.0,
- "ngrams_betweenentities_for": 4.0,
- "ngrams_betweenentities_is": 4.0,
- "ngrams_betweenentities_known": 2.0,
- "ngrams_betweenentities_of": 2.0,
- "ngrams_betweenentities_side": 2.0,
- "ngrams_betweenentities_treated": 2.0,
- "ngrams_betweenentities_treatment": 2.0,
- "ngrams_betweenentities_trials": 2.0,
- "ngrams_betweenentities_with": 2.0
- },
- "test_vectorizer_unigramsBetweenEntities_noTFIDF_2": {
- "ngrams_betweenentities_a": 4.0,
- "ngrams_betweenentities_be": 2.0,
- "ngrams_betweenentities_can": 2.0,
- "ngrams_betweenentities_clinical": 2.0,
- "ngrams_betweenentities_common": 2.0,
- "ngrams_betweenentities_effect": 2.0,
- "ngrams_betweenentities_failed": 2.0,
- "ngrams_betweenentities_for": 4.0,
- "ngrams_betweenentities_is": 4.0,
- "ngrams_betweenentities_known": 2.0,
- "ngrams_betweenentities_of": 2.0,
- "ngrams_betweenentities_side": 2.0,
- "ngrams_betweenentities_treated": 2.0,
- "ngrams_betweenentities_treatment": 2.0,
- "ngrams_betweenentities_trials": 2.0,
- "ngrams_betweenentities_with": 2.0
- }
-}
\ No newline at end of file
diff --git a/tests/test_vectorizer.py b/tests/test_vectorizer.py
index 52dd7aa..56956bf 100644
--- a/tests/test_vectorizer.py
+++ b/tests/test_vectorizer.py
@@ -3,27 +3,7 @@
import os
import json
-from kindred.datageneration import generateData,generateTestData
-
-def check(valueName,value):
- write = False
-
- scriptDir = os.path.dirname(__file__)
- jsonPath = os.path.join(scriptDir,'data','vectorizer','expected.json')
- if os.path.isfile(jsonPath):
- with open(jsonPath) as f:
- data = json.load(f)
- else:
- data = {}
-
- if write:
- data[valueName] = value
- with open(jsonPath,'w') as f:
- json.dump(data,f,indent=2,sort_keys=True)
-
- assert valueName in data
- assert data[valueName] == value
-
+from kindred.datageneration import generateData,generateTestData
def test_simpleVectorizer_binary():
text = 'Erlotinib is a common treatment for NSCLC. Aspirin is the main cause of boneitis . '
@@ -39,14 +19,9 @@ def test_simpleVectorizer_binary():
# We'll just get the vectors for the entityTypes
vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"])
vectors = vectorizer.fit_transform(candidateRelations)
- vectorsCSR = vectors.tocsr()
- rows,cols = vectors.nonzero()
-
- expected = {(0, 2): 1.0, (0, 3): 1.0, (1, 0): 1.0, (1, 5): 1.0, (2, 2): 1.0, (2, 4): 1.0, (3, 1): 1.0, (3, 5): 1.0}
-
- namedCols = { str((r,c)):vectorsCSR[r,c] for r,c in zip(rows.tolist(),cols.tolist()) }
- check('test_simpleVectorizer_binary',namedCols)
+ assert vectors.shape[0] == 4
+ assert len(vectors.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
def test_simpleVectorizer_triple():
text = 'Erlotinib is a common treatment for NSCLC which targets EGFR. '
@@ -62,14 +37,9 @@ def test_simpleVectorizer_triple():
# We'll just get the vectors for the entityTypes
vectorizer = kindred.Vectorizer(entityCount=3,featureChoice=["entityTypes"])
vectors = vectorizer.fit_transform(candidateRelations)
- vectorsCSR = vectors.tocsr()
- rows,cols = vectors.nonzero()
-
- expected = {(0, 1): 1.0, (0, 3): 1.0, (0, 8): 1.0, (1, 1): 1.0, (1, 5): 1.0, (1, 6): 1.0, (2, 0): 1.0, (2, 4): 1.0, (2, 8): 1.0, (3, 0): 1.0, (3, 5): 1.0, (3, 7): 1.0, (4, 2): 1.0, (4, 4): 1.0, (4, 6): 1.0, (5, 2): 1.0, (5, 3): 1.0, (5, 7): 1.0}
-
- namedCols = { str((r,c)):vectorsCSR[r,c] for r,c in zip(rows.tolist(),cols.tolist()) }
-
- check('test_simpleVectorizer_triple',namedCols)
+
+ assert vectors.shape[0] == 6
+ assert len(vectors.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
def test_vectorizer_defaults():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -87,16 +57,11 @@ def test_vectorizer_defaults():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
-
- colnames = vectorizer.getFeatureNames()
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_defaults_1',namedCols1)
- colmeans2 = np.sum(matrix2,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_defaults_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()[0]) > 0 # same check for the transform-only matrix
def test_vectorizer_entityTypes():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -116,15 +81,10 @@ def test_vectorizer_entityTypes():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_entityTypes_1',namedCols1)
- colmeans2 = np.sum(matrix2,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_entityTypes_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()[0]) > 0 # same check for the transform-only matrix
def test_vectorizer_unigramsBetweenEntities():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -144,15 +104,10 @@ def test_vectorizer_unigramsBetweenEntities():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_unigramsBetweenEntities_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_unigramsBetweenEntities_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()[0]) > 0 # same check for the transform-only matrix
def test_vectorizer_bigrams():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -172,15 +127,10 @@ def test_vectorizer_bigrams():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_bigrams_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_bigrams_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()[0]) > 0 # same check for the transform-only matrix
def test_vectorizer_dependencyPathEdges():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -200,15 +150,10 @@ def test_vectorizer_dependencyPathEdges():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_dependencyPathEdges_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_dependencyPathEdges_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()[0]) > 0 # nonzero() returns a (rows, cols) tuple; measure the row-index array, not the 2-tuple
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()[0]) > 0 # same check for the transform-only matrix
def test_vectorizer_dependencyPathEdgesNearEntities():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -228,15 +173,10 @@ def test_vectorizer_dependencyPathEdgesNearEntities():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_dependencyPathEdgesNearEntities_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_dependencyPathEdgesNearEntities_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_entityTypes_noTFIDF():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -256,15 +196,10 @@ def test_vectorizer_entityTypes_noTFIDF():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_entityTypes_noTFIDF_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_entityTypes_noTFIDF_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_unigramsBetweenEntities_noTFIDF():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -284,15 +219,10 @@ def test_vectorizer_unigramsBetweenEntities_noTFIDF():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_unigramsBetweenEntities_noTFIDF_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_unigramsBetweenEntities_noTFIDF_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_bigrams_noTFIDF():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -312,15 +242,10 @@ def test_vectorizer_bigrams_noTFIDF():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_bigrams_noTFIDF_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_bigrams_noTFIDF_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_dependencyPathEdges_noTFIDF():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -340,15 +265,10 @@ def test_vectorizer_dependencyPathEdges_noTFIDF():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_dependencyPathEdges_noTFIDF_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_dependencyPathEdges_noTFIDF_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF():
corpus1, _ = generateTestData(positiveCount=5,negativeCount=5)
@@ -368,15 +288,10 @@ def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_2',namedCols2)
+ assert matrix1.shape[0] == 8
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 18
+ assert len(matrix2.nonzero()) > 0
def test_vectorizer_defaults_triple():
corpus1, _ = generateTestData(entityCount=3,positiveCount=5,negativeCount=5)
@@ -395,15 +310,10 @@ def test_vectorizer_defaults_triple():
matrix1 = vectorizer.fit_transform(candidateRelations1)
matrix2 = vectorizer.transform(candidateRelations2)
- colnames = vectorizer.getFeatureNames()
-
- # As a quick check, we'll confirm that the column means are as expected
- colmeans1 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) }
- check('test_vectorizer_defaults_triple_1',namedCols1)
- colmeans2 = np.sum(matrix1,axis=0).tolist()[0]
- namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) }
- check('test_vectorizer_defaults_triple_2',namedCols2)
+ assert matrix1.shape[0] == 18
+ assert len(matrix1.nonzero()) > 0
+ assert matrix2.shape[0] == 60
+ assert len(matrix2.nonzero()) > 0
if __name__ == '__main__':
test_vectorizer_defaults_triple()