From 24508d20ceeec730e8457482b60f827a6446db15 Mon Sep 17 00:00:00 2001 From: Jake Lever Date: Fri, 3 Mar 2023 21:13:09 +0000 Subject: [PATCH] Simplify vectorizer tests to do basic checks --- tests/data/vectorizer/expected.json | 636 ---------------------------- tests/test_vectorizer.py | 198 +++------ 2 files changed, 54 insertions(+), 780 deletions(-) delete mode 100644 tests/data/vectorizer/expected.json diff --git a/tests/data/vectorizer/expected.json b/tests/data/vectorizer/expected.json deleted file mode 100644 index b3f54d1..0000000 --- a/tests/data/vectorizer/expected.json +++ /dev/null @@ -1,636 +0,0 @@ -{ - "test_simpleVectorizer_binary": { - "(0, 2)": 1.0, - "(0, 3)": 1.0, - "(1, 0)": 1.0, - "(1, 5)": 1.0, - "(2, 2)": 1.0, - "(2, 4)": 1.0, - "(3, 1)": 1.0, - "(3, 5)": 1.0 - }, - "test_simpleVectorizer_triple": { - "(0, 1)": 1.0, - "(0, 3)": 1.0, - "(0, 8)": 1.0, - "(1, 1)": 1.0, - "(1, 5)": 1.0, - "(1, 6)": 1.0, - "(2, 0)": 1.0, - "(2, 4)": 1.0, - "(2, 8)": 1.0, - "(3, 0)": 1.0, - "(3, 5)": 1.0, - "(3, 7)": 1.0, - "(4, 2)": 1.0, - "(4, 4)": 1.0, - "(4, 6)": 1.0, - "(5, 2)": 1.0, - "(5, 3)": 1.0, - "(5, 7)": 1.0 - }, - "test_vectorizer_bigrams_1": { - "bigrams_ _gnorcyvmer": 0.68309028, - "bigrams_a_common": 0.78013025, - "bigrams_a_known": 0.68309028, - "bigrams_be_treated": 0.81649658, - "bigrams_bmzvpvwbpw_failed": 0.81649658, - "bigrams_can_be": 0.81649658, - "bigrams_clinical_trials": 0.81649658, - "bigrams_common_treatment": 0.78013025, - "bigrams_effect_of": 0.68309028, - "bigrams_failed_clinical": 0.81649658, - "bigrams_for_kyekjnkrfo": 0.81649658, - "bigrams_for_zgwivlcmly": 0.78013025, - "bigrams_gnorcyvmer_is": 0.68309028, - "bigrams_is_a": 1.10705635, - "bigrams_known_side": 0.68309028, - "bigrams_kyekjnkrfo_.": 0.81649658, - "bigrams_of_ruswdgzajr": 0.68309028, - "bigrams_ootopaoxbg_can": 0.81649658, - "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025, - "bigrams_ruswdgzajr_.": 0.68309028, - "bigrams_side_effect": 0.68309028, - "bigrams_treated_with": 0.81649658, - "bigrams_treatment_for": 0.78013025, - "bigrams_trials_for": 0.81649658, - "bigrams_vgypkemhjr_.": 0.81649658, - "bigrams_with_vgypkemhjr": 0.81649658, - "bigrams_zgwivlcmly_.": 0.78013025 - }, - "test_vectorizer_bigrams_2": { - "bigrams_ _gnorcyvmer": 0.68309028, - "bigrams_a_common": 0.78013025, - "bigrams_a_known": 0.68309028, - "bigrams_be_treated": 0.81649658, - "bigrams_bmzvpvwbpw_failed": 0.81649658, - "bigrams_can_be": 0.81649658, - "bigrams_clinical_trials": 0.81649658, - "bigrams_common_treatment": 0.78013025, - "bigrams_effect_of": 0.68309028, - "bigrams_failed_clinical": 0.81649658, - "bigrams_for_kyekjnkrfo": 0.81649658, - "bigrams_for_zgwivlcmly": 0.78013025, - "bigrams_gnorcyvmer_is": 0.68309028, - "bigrams_is_a": 1.10705635, - "bigrams_known_side": 0.68309028, - "bigrams_kyekjnkrfo_.": 0.81649658, - "bigrams_of_ruswdgzajr": 0.68309028, - "bigrams_ootopaoxbg_can": 0.81649658, - "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025, - "bigrams_ruswdgzajr_.": 0.68309028, - "bigrams_side_effect": 0.68309028, - "bigrams_treated_with": 0.81649658, - "bigrams_treatment_for": 0.78013025, - "bigrams_trials_for": 0.81649658, - "bigrams_vgypkemhjr_.": 0.81649658, - "bigrams_with_vgypkemhjr": 0.81649658, - "bigrams_zgwivlcmly_.": 0.78013025 - }, - "test_vectorizer_bigrams_noTFIDF_1": { - "bigrams_ _gnorcyvmer": 4.0, - "bigrams_a_common": 4.0, - "bigrams_a_known": 4.0, - "bigrams_be_treated": 4.0, - "bigrams_bmzvpvwbpw_failed": 4.0, - "bigrams_can_be": 4.0, - "bigrams_clinical_trials": 4.0, - "bigrams_common_treatment": 4.0, - "bigrams_effect_of": 4.0, - "bigrams_failed_clinical": 4.0, - "bigrams_for_kyekjnkrfo": 4.0, - "bigrams_for_zgwivlcmly": 4.0, - "bigrams_gnorcyvmer_is": 4.0, - "bigrams_is_a": 8.0, - "bigrams_known_side": 4.0, - "bigrams_kyekjnkrfo_.": 4.0, - "bigrams_of_ruswdgzajr": 4.0, - "bigrams_ootopaoxbg_can": 4.0, - "bigrams_pehhjnlvvewbjccovflf_is": 4.0, - "bigrams_ruswdgzajr_.": 4.0, - "bigrams_side_effect": 4.0, - "bigrams_treated_with": 4.0, - "bigrams_treatment_for": 4.0, - "bigrams_trials_for": 4.0, - "bigrams_vgypkemhjr_.": 4.0, - "bigrams_with_vgypkemhjr": 4.0, - "bigrams_zgwivlcmly_.": 4.0 - }, - "test_vectorizer_bigrams_noTFIDF_2": { - "bigrams_ _gnorcyvmer": 4.0, - "bigrams_a_common": 4.0, - "bigrams_a_known": 4.0, - "bigrams_be_treated": 4.0, - "bigrams_bmzvpvwbpw_failed": 4.0, - "bigrams_can_be": 4.0, - "bigrams_clinical_trials": 4.0, - "bigrams_common_treatment": 4.0, - "bigrams_effect_of": 4.0, - "bigrams_failed_clinical": 4.0, - "bigrams_for_kyekjnkrfo": 4.0, - "bigrams_for_zgwivlcmly": 4.0, - "bigrams_gnorcyvmer_is": 4.0, - "bigrams_is_a": 8.0, - "bigrams_known_side": 4.0, - "bigrams_kyekjnkrfo_.": 4.0, - "bigrams_of_ruswdgzajr": 4.0, - "bigrams_ootopaoxbg_can": 4.0, - "bigrams_pehhjnlvvewbjccovflf_is": 4.0, - "bigrams_ruswdgzajr_.": 4.0, - "bigrams_side_effect": 4.0, - "bigrams_treated_with": 4.0, - "bigrams_treatment_for": 4.0, - "bigrams_trials_for": 4.0, - "bigrams_vgypkemhjr_.": 4.0, - "bigrams_with_vgypkemhjr": 4.0, - "bigrams_zgwivlcmly_.": 4.0 - }, - "test_vectorizer_defaults_1": { - "bigrams_ _gnorcyvmer": 0.68309028, - "bigrams_a_common": 0.78013025, - "bigrams_a_known": 0.68309028, - "bigrams_be_treated": 0.81649658, - "bigrams_bmzvpvwbpw_failed": 0.81649658, - "bigrams_can_be": 0.81649658, - "bigrams_clinical_trials": 0.81649658, - "bigrams_common_treatment": 0.78013025, - "bigrams_effect_of": 0.68309028, - "bigrams_failed_clinical": 0.81649658, - "bigrams_for_kyekjnkrfo": 0.81649658, - "bigrams_for_zgwivlcmly": 0.78013025, - "bigrams_gnorcyvmer_is": 0.68309028, - "bigrams_is_a": 1.10705635, - "bigrams_known_side": 0.68309028, - "bigrams_kyekjnkrfo_.": 0.81649658, - "bigrams_of_ruswdgzajr": 0.68309028, - "bigrams_ootopaoxbg_can": 0.81649658, - "bigrams_pehhjnlvvewbjccovflf_is": 0.78013025, - "bigrams_ruswdgzajr_.": 0.68309028, - "bigrams_side_effect": 0.68309028, - "bigrams_treated_with": 0.81649658, - "bigrams_treatment_for": 0.78013025, - "bigrams_trials_for": 0.81649658, - "bigrams_vgypkemhjr_.": 0.81649658, - "bigrams_with_vgypkemhjr": 0.81649658, - "bigrams_zgwivlcmly_.": 0.78013025, - "dependencypathelements_amod": 2.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 6.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 8.0, - "dependencypathelements_prep": 8.0, - "dependencypathnearselectedtoken_0_nsubj": 3.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 3.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0, - "ngrams_betweenentities_a": 1.45195225, - "ngrams_betweenentities_be": 1.0, - "ngrams_betweenentities_can": 1.0, - "ngrams_betweenentities_clinical": 1.05815267, - "ngrams_betweenentities_common": 1.03733099, - "ngrams_betweenentities_effect": 0.88174599, - "ngrams_betweenentities_failed": 1.05815267, - "ngrams_betweenentities_for": 1.58541958, - "ngrams_betweenentities_is": 1.45195225, - "ngrams_betweenentities_known": 0.88174599, - "ngrams_betweenentities_of": 0.88174599, - "ngrams_betweenentities_side": 0.88174599, - "ngrams_betweenentities_treated": 1.0, - "ngrams_betweenentities_treatment": 1.03733099, - "ngrams_betweenentities_trials": 1.05815267, - "ngrams_betweenentities_with": 1.0, - "selectedtokentypes_0_disease": 2.0, - "selectedtokentypes_0_disease2": 2.0, - "selectedtokentypes_0_drug": 4.0, - "selectedtokentypes_1_disease": 2.0, - "selectedtokentypes_1_disease2": 2.0, - "selectedtokentypes_1_drug": 4.0 - }, - "test_vectorizer_defaults_2": { - "bigrams_ _gnorcyvmer": 0.0, - "bigrams_a_common": 1.05815267, - "bigrams_a_known": 0.0, - "bigrams_be_treated": 0.81649658, - "bigrams_bmzvpvwbpw_failed": 1.0, - "bigrams_can_be": 0.81649658, - "bigrams_clinical_trials": 2.15470054, - "bigrams_common_treatment": 1.05815267, - "bigrams_effect_of": 0.0, - "bigrams_failed_clinical": 2.15470054, - "bigrams_for_kyekjnkrfo": 0.0, - "bigrams_for_zgwivlcmly": 0.0, - "bigrams_gnorcyvmer_is": 0.0, - "bigrams_is_a": 0.80058652, - "bigrams_known_side": 0.0, - "bigrams_kyekjnkrfo_.": 2.0, - "bigrams_of_ruswdgzajr": 0.0, - "bigrams_ootopaoxbg_can": 0.81649658, - "bigrams_pehhjnlvvewbjccovflf_is": 2.0, - "bigrams_ruswdgzajr_.": 0.0, - "bigrams_side_effect": 0.0, - "bigrams_treated_with": 0.81649658, - "bigrams_treatment_for": 1.05815267, - "bigrams_trials_for": 2.15470054, - "bigrams_vgypkemhjr_.": 0.81649658, - "bigrams_with_vgypkemhjr": 0.81649658, - "bigrams_zgwivlcmly_.": 0.0, - "dependencypathelements_amod": 4.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 8.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 10.0, - "dependencypathelements_prep": 10.0, - "dependencypathnearselectedtoken_0_nsubj": 4.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 4.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0, - "ngrams_betweenentities_a": 0.78483307, - "ngrams_betweenentities_be": 1.0, - "ngrams_betweenentities_can": 1.0, - "ngrams_betweenentities_clinical": 2.11630534, - "ngrams_betweenentities_common": 1.03733099, - "ngrams_betweenentities_effect": 0.0, - "ngrams_betweenentities_failed": 2.11630534, - "ngrams_betweenentities_for": 2.3860061, - "ngrams_betweenentities_is": 1.99154812, - "ngrams_betweenentities_known": 0.0, - "ngrams_betweenentities_of": 1.59494162, - "ngrams_betweenentities_side": 0.0, - "ngrams_betweenentities_treated": 1.0, - "ngrams_betweenentities_treatment": 1.03733099, - "ngrams_betweenentities_trials": 2.11630534, - "ngrams_betweenentities_with": 1.0, - "selectedtokentypes_0_disease": 5.0, - "selectedtokentypes_0_disease2": 4.0, - "selectedtokentypes_0_drug": 9.0, - "selectedtokentypes_1_disease": 5.0, - "selectedtokentypes_1_disease2": 4.0, - "selectedtokentypes_1_drug": 9.0 - }, - "test_vectorizer_defaults_triple_1": { - "bigrams_and_targets": 2.44948974, - "bigrams_be_treated": 3.15197269, - "bigrams_by_fvdxdietdx": 2.28320249, - "bigrams_by_zkrkzlyfef": 2.28320249, - "bigrams_can_be": 3.15197269, - "bigrams_elvptnpvyc_.": 2.28320249, - "bigrams_fvdxdietdx_inhibition": 2.28320249, - "bigrams_hxlfssirgk_.": 2.44948974, - "bigrams_inhibition_using": 3.15197269, - "bigrams_knetvjnjun_and": 2.44948974, - "bigrams_kyekjnkrfo_can": 2.28320249, - "bigrams_oxzbaapqct_treats": 2.44948974, - "bigrams_targets_hxlfssirgk": 2.44948974, - "bigrams_treated_by": 3.15197269, - "bigrams_treats_knetvjnjun": 2.44948974, - "bigrams_usckfljzxu_.": 2.28320249, - "bigrams_using_elvptnpvyc": 2.28320249, - "bigrams_using_usckfljzxu": 2.28320249, - "bigrams_zgwivlcmly_can": 2.28320249, - "bigrams_zkrkzlyfef_inhibition": 2.28320249, - "dependencypathelements_0_1_acl": 8.0, - "dependencypathelements_0_1_agent": 8.0, - "dependencypathelements_0_1_compound": 8.0, - "dependencypathelements_0_1_conj": 4.0, - "dependencypathelements_0_1_dobj": 12.0, - "dependencypathelements_0_1_nsubj": 4.0, - "dependencypathelements_0_1_nsubjpass": 8.0, - "dependencypathelements_0_1_pobj": 8.0, - "dependencypathelements_0_2_acl": 8.0, - "dependencypathelements_0_2_agent": 8.0, - "dependencypathelements_0_2_compound": 8.0, - "dependencypathelements_0_2_conj": 4.0, - "dependencypathelements_0_2_dobj": 12.0, - "dependencypathelements_0_2_nsubj": 4.0, - "dependencypathelements_0_2_nsubjpass": 8.0, - "dependencypathelements_0_2_pobj": 8.0, - "dependencypathelements_1_2_acl": 8.0, - "dependencypathelements_1_2_agent": 8.0, - "dependencypathelements_1_2_compound": 8.0, - "dependencypathelements_1_2_conj": 4.0, - "dependencypathelements_1_2_dobj": 12.0, - "dependencypathelements_1_2_nsubj": 4.0, - "dependencypathelements_1_2_nsubjpass": 8.0, - "dependencypathelements_1_2_pobj": 8.0, - "dependencypathnearselectedtoken_0_compound": 4.0, - "dependencypathnearselectedtoken_0_conj": 2.0, - "dependencypathnearselectedtoken_0_nsubj": 2.0, - "dependencypathnearselectedtoken_0_nsubjpass": 4.0, - "dependencypathnearselectedtoken_1_compound": 4.0, - "dependencypathnearselectedtoken_1_conj": 2.0, - "dependencypathnearselectedtoken_1_nsubj": 2.0, - "dependencypathnearselectedtoken_1_nsubjpass": 4.0, - "dependencypathnearselectedtoken_2_compound": 4.0, - "dependencypathnearselectedtoken_2_conj": 2.0, - "dependencypathnearselectedtoken_2_nsubj": 2.0, - "dependencypathnearselectedtoken_2_nsubjpass": 4.0, - "ngrams_betweenentities_0_1_and": 1.46201744, - "ngrams_betweenentities_0_1_be": 2.08991136, - "ngrams_betweenentities_0_1_by": 2.08991136, - "ngrams_betweenentities_0_1_can": 2.08991136, - "ngrams_betweenentities_0_1_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_0_1_inhibition": 2.08991136, - "ngrams_betweenentities_0_1_knetvjnjun": 0.8909307, - "ngrams_betweenentities_0_1_targets": 1.46201744, - "ngrams_betweenentities_0_1_treated": 2.08991136, - "ngrams_betweenentities_0_1_treats": 1.46201744, - "ngrams_betweenentities_0_1_using": 2.08991136, - "ngrams_betweenentities_0_1_zkrkzlyfef": 0.8510011, - "ngrams_betweenentities_0_2_and": 1.46201744, - "ngrams_betweenentities_0_2_be": 2.08991136, - "ngrams_betweenentities_0_2_by": 2.08991136, - "ngrams_betweenentities_0_2_can": 2.08991136, - "ngrams_betweenentities_0_2_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_0_2_inhibition": 2.08991136, - "ngrams_betweenentities_0_2_knetvjnjun": 0.8909307, - "ngrams_betweenentities_0_2_targets": 1.46201744, - "ngrams_betweenentities_0_2_treated": 2.08991136, - "ngrams_betweenentities_0_2_treats": 1.46201744, - "ngrams_betweenentities_0_2_using": 2.08991136, - "ngrams_betweenentities_0_2_zkrkzlyfef": 0.8510011, - "ngrams_betweenentities_1_2_and": 1.46201744, - "ngrams_betweenentities_1_2_be": 2.08991136, - "ngrams_betweenentities_1_2_by": 2.08991136, - "ngrams_betweenentities_1_2_can": 2.08991136, - "ngrams_betweenentities_1_2_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_1_2_inhibition": 2.08991136, - "ngrams_betweenentities_1_2_knetvjnjun": 0.8909307, - "ngrams_betweenentities_1_2_targets": 1.46201744, - "ngrams_betweenentities_1_2_treated": 2.08991136, - "ngrams_betweenentities_1_2_treats": 1.46201744, - "ngrams_betweenentities_1_2_using": 2.08991136, - "ngrams_betweenentities_1_2_zkrkzlyfef": 0.8510011, - "selectedtokentypes_0_disease": 6.0, - "selectedtokentypes_0_drug": 6.0, - "selectedtokentypes_0_gene": 6.0, - "selectedtokentypes_1_disease": 6.0, - "selectedtokentypes_1_drug": 6.0, - "selectedtokentypes_1_gene": 6.0, - "selectedtokentypes_2_disease": 6.0, - "selectedtokentypes_2_drug": 6.0, - "selectedtokentypes_2_gene": 6.0 - }, - "test_vectorizer_defaults_triple_2": { - "bigrams_and_targets": 2.44948974, - "bigrams_be_treated": 3.15197269, - "bigrams_by_fvdxdietdx": 2.28320249, - "bigrams_by_zkrkzlyfef": 2.28320249, - "bigrams_can_be": 3.15197269, - "bigrams_elvptnpvyc_.": 2.28320249, - "bigrams_fvdxdietdx_inhibition": 2.28320249, - "bigrams_hxlfssirgk_.": 2.44948974, - "bigrams_inhibition_using": 3.15197269, - "bigrams_knetvjnjun_and": 2.44948974, - "bigrams_kyekjnkrfo_can": 2.28320249, - "bigrams_oxzbaapqct_treats": 2.44948974, - "bigrams_targets_hxlfssirgk": 2.44948974, - "bigrams_treated_by": 3.15197269, - "bigrams_treats_knetvjnjun": 2.44948974, - "bigrams_usckfljzxu_.": 2.28320249, - "bigrams_using_elvptnpvyc": 2.28320249, - "bigrams_using_usckfljzxu": 2.28320249, - "bigrams_zgwivlcmly_can": 2.28320249, - "bigrams_zkrkzlyfef_inhibition": 2.28320249, - "dependencypathelements_0_1_acl": 8.0, - "dependencypathelements_0_1_agent": 8.0, - "dependencypathelements_0_1_compound": 8.0, - "dependencypathelements_0_1_conj": 4.0, - "dependencypathelements_0_1_dobj": 12.0, - "dependencypathelements_0_1_nsubj": 4.0, - "dependencypathelements_0_1_nsubjpass": 8.0, - "dependencypathelements_0_1_pobj": 8.0, - "dependencypathelements_0_2_acl": 8.0, - "dependencypathelements_0_2_agent": 8.0, - "dependencypathelements_0_2_compound": 8.0, - "dependencypathelements_0_2_conj": 4.0, - "dependencypathelements_0_2_dobj": 12.0, - "dependencypathelements_0_2_nsubj": 4.0, - "dependencypathelements_0_2_nsubjpass": 8.0, - "dependencypathelements_0_2_pobj": 8.0, - "dependencypathelements_1_2_acl": 8.0, - "dependencypathelements_1_2_agent": 8.0, - "dependencypathelements_1_2_compound": 8.0, - "dependencypathelements_1_2_conj": 4.0, - "dependencypathelements_1_2_dobj": 12.0, - "dependencypathelements_1_2_nsubj": 4.0, - "dependencypathelements_1_2_nsubjpass": 8.0, - "dependencypathelements_1_2_pobj": 8.0, - "dependencypathnearselectedtoken_0_compound": 4.0, - "dependencypathnearselectedtoken_0_conj": 2.0, - "dependencypathnearselectedtoken_0_nsubj": 2.0, - "dependencypathnearselectedtoken_0_nsubjpass": 4.0, - "dependencypathnearselectedtoken_1_compound": 4.0, - "dependencypathnearselectedtoken_1_conj": 2.0, - "dependencypathnearselectedtoken_1_nsubj": 2.0, - "dependencypathnearselectedtoken_1_nsubjpass": 4.0, - "dependencypathnearselectedtoken_2_compound": 4.0, - "dependencypathnearselectedtoken_2_conj": 2.0, - "dependencypathnearselectedtoken_2_nsubj": 2.0, - "dependencypathnearselectedtoken_2_nsubjpass": 4.0, - "ngrams_betweenentities_0_1_and": 1.46201744, - "ngrams_betweenentities_0_1_be": 2.08991136, - "ngrams_betweenentities_0_1_by": 2.08991136, - "ngrams_betweenentities_0_1_can": 2.08991136, - "ngrams_betweenentities_0_1_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_0_1_inhibition": 2.08991136, - "ngrams_betweenentities_0_1_knetvjnjun": 0.8909307, - "ngrams_betweenentities_0_1_targets": 1.46201744, - "ngrams_betweenentities_0_1_treated": 2.08991136, - "ngrams_betweenentities_0_1_treats": 1.46201744, - "ngrams_betweenentities_0_1_using": 2.08991136, - "ngrams_betweenentities_0_1_zkrkzlyfef": 0.8510011, - "ngrams_betweenentities_0_2_and": 1.46201744, - "ngrams_betweenentities_0_2_be": 2.08991136, - "ngrams_betweenentities_0_2_by": 2.08991136, - "ngrams_betweenentities_0_2_can": 2.08991136, - "ngrams_betweenentities_0_2_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_0_2_inhibition": 2.08991136, - "ngrams_betweenentities_0_2_knetvjnjun": 0.8909307, - "ngrams_betweenentities_0_2_targets": 1.46201744, - "ngrams_betweenentities_0_2_treated": 2.08991136, - "ngrams_betweenentities_0_2_treats": 1.46201744, - "ngrams_betweenentities_0_2_using": 2.08991136, - "ngrams_betweenentities_0_2_zkrkzlyfef": 0.8510011, - "ngrams_betweenentities_1_2_and": 1.46201744, - "ngrams_betweenentities_1_2_be": 2.08991136, - "ngrams_betweenentities_1_2_by": 2.08991136, - "ngrams_betweenentities_1_2_can": 2.08991136, - "ngrams_betweenentities_1_2_fvdxdietdx": 0.8510011, - "ngrams_betweenentities_1_2_inhibition": 2.08991136, - "ngrams_betweenentities_1_2_knetvjnjun": 0.8909307, - "ngrams_betweenentities_1_2_targets": 1.46201744, - "ngrams_betweenentities_1_2_treated": 2.08991136, - "ngrams_betweenentities_1_2_treats": 1.46201744, - "ngrams_betweenentities_1_2_using": 2.08991136, - "ngrams_betweenentities_1_2_zkrkzlyfef": 0.8510011, - "selectedtokentypes_0_disease": 6.0, - "selectedtokentypes_0_drug": 6.0, - "selectedtokentypes_0_gene": 6.0, - "selectedtokentypes_1_disease": 6.0, - "selectedtokentypes_1_drug": 6.0, - "selectedtokentypes_1_gene": 6.0, - "selectedtokentypes_2_disease": 6.0, - "selectedtokentypes_2_drug": 6.0, - "selectedtokentypes_2_gene": 6.0 - }, - "test_vectorizer_dependencyPathEdgesNearEntities_1": { - "dependencypathnearselectedtoken_0_nsubj": 3.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 3.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0 - }, - "test_vectorizer_dependencyPathEdgesNearEntities_2": { - "dependencypathnearselectedtoken_0_nsubj": 3.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 3.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0 - }, - "test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_1": { - "dependencypathnearselectedtoken_0_nsubj": 3.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 3.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0 - }, - "test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_2": { - "dependencypathnearselectedtoken_0_nsubj": 3.0, - "dependencypathnearselectedtoken_0_nsubjpass": 1.0, - "dependencypathnearselectedtoken_1_nsubj": 3.0, - "dependencypathnearselectedtoken_1_nsubjpass": 1.0 - }, - "test_vectorizer_dependencyPathEdges_1": { - "dependencypathelements_amod": 2.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 6.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 8.0, - "dependencypathelements_prep": 8.0 - }, - "test_vectorizer_dependencyPathEdges_2": { - "dependencypathelements_amod": 2.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 6.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 8.0, - "dependencypathelements_prep": 8.0 - }, - "test_vectorizer_dependencyPathEdges_noTFIDF_1": { - "dependencypathelements_amod": 2.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 6.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 8.0, - "dependencypathelements_prep": 8.0 - }, - "test_vectorizer_dependencyPathEdges_noTFIDF_2": { - "dependencypathelements_amod": 2.0, - "dependencypathelements_attr": 4.0, - "dependencypathelements_nsubj": 6.0, - "dependencypathelements_nsubjpass": 2.0, - "dependencypathelements_pobj": 8.0, - "dependencypathelements_prep": 8.0 - }, - "test_vectorizer_entityTypes_1": { - "selectedtokentypes_0_disease": 2.0, - "selectedtokentypes_0_disease2": 2.0, - "selectedtokentypes_0_drug": 4.0, - "selectedtokentypes_1_disease": 2.0, - "selectedtokentypes_1_disease2": 2.0, - "selectedtokentypes_1_drug": 4.0 - }, - "test_vectorizer_entityTypes_2": { - "selectedtokentypes_0_disease": 5.0, - "selectedtokentypes_0_disease2": 4.0, - "selectedtokentypes_0_drug": 9.0, - "selectedtokentypes_1_disease": 5.0, - "selectedtokentypes_1_disease2": 4.0, - "selectedtokentypes_1_drug": 9.0 - }, - "test_vectorizer_entityTypes_noTFIDF_1": { - "selectedtokentypes_0_disease": 2.0, - "selectedtokentypes_0_disease2": 2.0, - "selectedtokentypes_0_drug": 4.0, - "selectedtokentypes_1_disease": 2.0, - "selectedtokentypes_1_disease2": 2.0, - "selectedtokentypes_1_drug": 4.0 - }, - "test_vectorizer_entityTypes_noTFIDF_2": { - "selectedtokentypes_0_disease": 2.0, - "selectedtokentypes_0_disease2": 2.0, - "selectedtokentypes_0_drug": 4.0, - "selectedtokentypes_1_disease": 2.0, - "selectedtokentypes_1_disease2": 2.0, - "selectedtokentypes_1_drug": 4.0 - }, - "test_vectorizer_unigramsBetweenEntities_1": { - "ngrams_betweenentities_a": 1.45195225, - "ngrams_betweenentities_be": 1.0, - "ngrams_betweenentities_can": 1.0, - "ngrams_betweenentities_clinical": 1.05815267, - "ngrams_betweenentities_common": 1.03733099, - "ngrams_betweenentities_effect": 0.88174599, - "ngrams_betweenentities_failed": 1.05815267, - "ngrams_betweenentities_for": 1.58541958, - "ngrams_betweenentities_is": 1.45195225, - "ngrams_betweenentities_known": 0.88174599, - "ngrams_betweenentities_of": 0.88174599, - "ngrams_betweenentities_side": 0.88174599, - "ngrams_betweenentities_treated": 1.0, - "ngrams_betweenentities_treatment": 1.03733099, - "ngrams_betweenentities_trials": 1.05815267, - "ngrams_betweenentities_with": 1.0 - }, - "test_vectorizer_unigramsBetweenEntities_2": { - "ngrams_betweenentities_a": 1.45195225, - "ngrams_betweenentities_be": 1.0, - "ngrams_betweenentities_can": 1.0, - "ngrams_betweenentities_clinical": 1.05815267, - "ngrams_betweenentities_common": 1.03733099, - "ngrams_betweenentities_effect": 0.88174599, - "ngrams_betweenentities_failed": 1.05815267, - "ngrams_betweenentities_for": 1.58541958, - "ngrams_betweenentities_is": 1.45195225, - "ngrams_betweenentities_known": 0.88174599, - "ngrams_betweenentities_of": 0.88174599, - "ngrams_betweenentities_side": 0.88174599, - "ngrams_betweenentities_treated": 1.0, - "ngrams_betweenentities_treatment": 1.03733099, - "ngrams_betweenentities_trials": 1.05815267, - "ngrams_betweenentities_with": 1.0 - }, - "test_vectorizer_unigramsBetweenEntities_noTFIDF_1": { - "ngrams_betweenentities_a": 4.0, - "ngrams_betweenentities_be": 2.0, - "ngrams_betweenentities_can": 2.0, - "ngrams_betweenentities_clinical": 2.0, - "ngrams_betweenentities_common": 2.0, - "ngrams_betweenentities_effect": 2.0, - "ngrams_betweenentities_failed": 2.0, - "ngrams_betweenentities_for": 4.0, - "ngrams_betweenentities_is": 4.0, - "ngrams_betweenentities_known": 2.0, - "ngrams_betweenentities_of": 2.0, - "ngrams_betweenentities_side": 2.0, - "ngrams_betweenentities_treated": 2.0, - "ngrams_betweenentities_treatment": 2.0, - "ngrams_betweenentities_trials": 2.0, - "ngrams_betweenentities_with": 2.0 - }, - "test_vectorizer_unigramsBetweenEntities_noTFIDF_2": { - "ngrams_betweenentities_a": 4.0, - "ngrams_betweenentities_be": 2.0, - "ngrams_betweenentities_can": 2.0, - "ngrams_betweenentities_clinical": 2.0, - "ngrams_betweenentities_common": 2.0, - "ngrams_betweenentities_effect": 2.0, - "ngrams_betweenentities_failed": 2.0, - "ngrams_betweenentities_for": 4.0, - "ngrams_betweenentities_is": 4.0, - "ngrams_betweenentities_known": 2.0, - "ngrams_betweenentities_of": 2.0, - "ngrams_betweenentities_side": 2.0, - "ngrams_betweenentities_treated": 2.0, - "ngrams_betweenentities_treatment": 2.0, - "ngrams_betweenentities_trials": 2.0, - "ngrams_betweenentities_with": 2.0 - } -} \ No newline at end of file diff --git a/tests/test_vectorizer.py b/tests/test_vectorizer.py index 52dd7aa..56956bf 100644 --- a/tests/test_vectorizer.py +++ b/tests/test_vectorizer.py @@ -3,27 +3,7 @@ import os import json -from kindred.datageneration import generateData,generateTestData - -def check(valueName,value): - write = False - - scriptDir = os.path.dirname(__file__) - jsonPath = os.path.join(scriptDir,'data','vectorizer','expected.json') - if os.path.isfile(jsonPath): - with open(jsonPath) as f: - data = json.load(f) - else: - data = {} - - if write: - data[valueName] = value - with open(jsonPath,'w') as f: - json.dump(data,f,indent=2,sort_keys=True) - - assert valueName in data - assert data[valueName] == value - +from kindred.datageneration import generateData,generateTestData def test_simpleVectorizer_binary(): text = 'Erlotinib is a common treatment for NSCLC. Aspirin is the main cause of boneitis . ' @@ -39,14 +19,9 @@ def test_simpleVectorizer_binary(): # We'll just get the vectors for the entityTypes vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"]) vectors = vectorizer.fit_transform(candidateRelations) - vectorsCSR = vectors.tocsr() - rows,cols = vectors.nonzero() - - expected = {(0, 2): 1.0, (0, 3): 1.0, (1, 0): 1.0, (1, 5): 1.0, (2, 2): 1.0, (2, 4): 1.0, (3, 1): 1.0, (3, 5): 1.0} - - namedCols = { str((r,c)):vectorsCSR[r,c] for r,c in zip(rows.tolist(),cols.tolist()) } - check('test_simpleVectorizer_binary',namedCols) + assert vectors.shape[0] == 4 + assert len(vectors.nonzero()) > 0 def test_simpleVectorizer_triple(): text = 'Erlotinib is a common treatment for NSCLC which targets EGFR. ' @@ -62,14 +37,9 @@ def test_simpleVectorizer_triple(): # We'll just get the vectors for the entityTypes vectorizer = kindred.Vectorizer(entityCount=3,featureChoice=["entityTypes"]) vectors = vectorizer.fit_transform(candidateRelations) - vectorsCSR = vectors.tocsr() - rows,cols = vectors.nonzero() - - expected = {(0, 1): 1.0, (0, 3): 1.0, (0, 8): 1.0, (1, 1): 1.0, (1, 5): 1.0, (1, 6): 1.0, (2, 0): 1.0, (2, 4): 1.0, (2, 8): 1.0, (3, 0): 1.0, (3, 5): 1.0, (3, 7): 1.0, (4, 2): 1.0, (4, 4): 1.0, (4, 6): 1.0, (5, 2): 1.0, (5, 3): 1.0, (5, 7): 1.0} - - namedCols = { str((r,c)):vectorsCSR[r,c] for r,c in zip(rows.tolist(),cols.tolist()) } - - check('test_simpleVectorizer_triple',namedCols) + + assert vectors.shape[0] == 6 + assert len(vectors.nonzero()) > 0 def test_vectorizer_defaults(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -87,16 +57,11 @@ def test_vectorizer_defaults(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - - colnames = vectorizer.getFeatureNames() - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_defaults_1',namedCols1) - colmeans2 = np.sum(matrix2,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_defaults_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_entityTypes(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -116,15 +81,10 @@ def test_vectorizer_entityTypes(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_entityTypes_1',namedCols1) - colmeans2 = np.sum(matrix2,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_entityTypes_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_unigramsBetweenEntities(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -144,15 +104,10 @@ def test_vectorizer_unigramsBetweenEntities(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_unigramsBetweenEntities_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_unigramsBetweenEntities_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_bigrams(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -172,15 +127,10 @@ def test_vectorizer_bigrams(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_bigrams_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_bigrams_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_dependencyPathEdges(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -200,15 +150,10 @@ def test_vectorizer_dependencyPathEdges(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_dependencyPathEdges_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_dependencyPathEdges_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_dependencyPathEdgesNearEntities(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -228,15 +173,10 @@ def test_vectorizer_dependencyPathEdgesNearEntities(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_dependencyPathEdgesNearEntities_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_dependencyPathEdgesNearEntities_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_entityTypes_noTFIDF(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -256,15 +196,10 @@ def test_vectorizer_entityTypes_noTFIDF(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_entityTypes_noTFIDF_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_entityTypes_noTFIDF_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_unigramsBetweenEntities_noTFIDF(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -284,15 +219,10 @@ def test_vectorizer_unigramsBetweenEntities_noTFIDF(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_unigramsBetweenEntities_noTFIDF_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_unigramsBetweenEntities_noTFIDF_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_bigrams_noTFIDF(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -312,15 +242,10 @@ def test_vectorizer_bigrams_noTFIDF(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_bigrams_noTFIDF_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_bigrams_noTFIDF_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_dependencyPathEdges_noTFIDF(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -340,15 +265,10 @@ def test_vectorizer_dependencyPathEdges_noTFIDF(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_dependencyPathEdges_noTFIDF_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_dependencyPathEdges_noTFIDF_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF(): corpus1, _ = generateTestData(positiveCount=5,negativeCount=5) @@ -368,15 +288,10 @@ def test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_dependencyPathEdgesNearEntities_noTFIDF_2',namedCols2) + assert matrix1.shape[0] == 8 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 18 + assert len(matrix2.nonzero()) > 0 def test_vectorizer_defaults_triple(): corpus1, _ = generateTestData(entityCount=3,positiveCount=5,negativeCount=5) @@ -395,15 +310,10 @@ def test_vectorizer_defaults_triple(): matrix1 = vectorizer.fit_transform(candidateRelations1) matrix2 = vectorizer.transform(candidateRelations2) - colnames = vectorizer.getFeatureNames() - - # As a quick check, we'll confirm that the column means are as expected - colmeans1 = np.sum(matrix1,axis=0).tolist()[0] - namedCols1 = { col:round(v,8) for col,v in zip(colnames,colmeans1) } - check('test_vectorizer_defaults_triple_1',namedCols1) - colmeans2 = np.sum(matrix1,axis=0).tolist()[0] - namedCols2 = { col:round(v,8) for col,v in zip(colnames,colmeans2) } - check('test_vectorizer_defaults_triple_2',namedCols2) + assert matrix1.shape[0] == 18 + assert len(matrix1.nonzero()) > 0 + assert matrix2.shape[0] == 60 + assert len(matrix2.nonzero()) > 0 if __name__ == '__main__': test_vectorizer_defaults_triple()