diff --git a/.gitignore b/.gitignore index c375d17de..b32a3093b 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ dataset_prep.log # Datasets matminer/datasets/*.json.gz +matminer/utils/data_files/mp_transport/transport_database.csv diff --git a/dev_scripts/dataset_management/refind2matminer.py b/dev_scripts/dataset_management/refind2matminer.py new file mode 100644 index 000000000..7f30de658 --- /dev/null +++ b/dev_scripts/dataset_management/refind2matminer.py @@ -0,0 +1,204 @@ +import os +import glob + +""" +Script used to transform the data from the original repository to +an easier-to-use database for matminer. Very ugly but should work, +at least with the commit 5a22ce0a3fa41fe6e9d26330686b32d83388e760 of +https://github.com/polyanskiy/refractiveindex.info-database + +Place this script in your refractiveindex repo, and run it. +A folder database_matminer should appear containing the new database. + +We use only files from main and other/semiconductors alloys, other/alloys and other/intermetallics. +Within these data, some files are not used because they are outliers when looking at +plots of properties vs composition. +Some files are renamed to represent the composition in an cleaner way. + +Overall, this is very ugly and represents manual manipulations. +This is just added so that anyone can update the database used by matminer +with new data added on the refractiveindex repository. +Add only data that you are sure is representative of 3D crystals, there are no automatic checks! +""" + +os.system("cp -r database database_copy") +os.mkdir("database_matminer") + +# Get the list of files to remove +files_to_remove = [ + "data/main/Al2O3/Querry.yml", + "data/main/Au/*-*nm*.yml", + "data/main/C/Arakawa.yml", + "data/main/C/Djurisic-e.yml", + "data/main/C/Djurisic-o.yml", + "data/main/C/El-Sayed.yml", + "data/main/C/Ermolaev.yml", + "data/main/C/Hagemann.yml", + "data/main/C/Querry*.yml", + "data/main/C/Song*.yml", + "data/main/C/Weber.yml", + "data/main/Ge/*nm*.yml", + "data/main/InSb/Adachi.yml", + "data/main/MoS2/Beal.yml", + "data/main/MoS2/Ermolaev*.yml", + "data/main/MoS2/Hsu*.yml", + "data/main/MoS2/Islam*.yml", + "data/main/MoS2/Jung.yml", + "data/main/MoS2/Song-*L.yml", + "data/main/MoS2/Yim*.yml", + "data/main/MoS2/Zhang.yml", + "data/main/Na/Inagaki-liquid.yml", + "data/main/Bi2Se3/Fang-2L.yml", + "data/main/Pb/Mathewson-140K.yml", + "data/main/Si/Pierce.yml", + "data/main/WSe2/*L*.yml", + "data/main/*/*Werner*.yml", + "data/other/alloys/Ni-Fe/*10nm*", + "data/other/alloys/Ni-Fe/*gold150nm*", + "data/other/intermetallics/AuAl2/Supansomboon.yml", + "data/main/*/Munkhbat-e.yml", + "data/main/*/Munkhbat-gamma.yml" +] + +flat_list = [] +for f in files_to_remove: + flat_list += glob.glob(os.path.join("database_copy", f)) + +flat_list += ["\'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-0.yml\'"] +flat_list += ["\'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-0.yml\'"] +flat_list.remove("database_copy/data/main/Ta/Werner.yml") +flat_list.remove("database_copy/data/main/Ta/Werner-DFT.yml") + +files_to_remove = flat_list + +for f in files_to_remove: + os.system("rm " + f) + +# Also change a file +os.system("sed -i \'s/760/780/g\' database_copy/data/main/C/Peter.yml") + +# Move some files +os.system("mv database_copy/data/other/alloys/Au-Ag/Rioux-Au0Ag100.yml database_copy/data/main/Ag/Rioux.yml") +os.system("mv database_copy/data/other/alloys/Au-Ag/Rioux-Au100Ag0.yml database_copy/data/main/Au/Rioux.yml") +os.system("mv \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-0.yml\' database_copy/data/main/GaSb/Ferrini.yml") + +# Remove entire folders +folders_to_remove = [ + "data/other/alloys/Pd-H", + "data/other/alloys/V-H", + "data/other/alloys/Zr-H", + "data/other/alloys/Nb-Sn", + "data/other/alloys/V-Ga" +] + +for f in folders_to_remove: + os.system("rm -rf database_copy/" + f) + +# Now the database has been filtered. +# We move the files into database_matminer, +# with some renaming... +os.system("cp -r database_copy/data/main/* database_matminer/") +os.system("cp -r database_copy/data/other/intermetallics/* database_matminer/") +os.mkdir("database_matminer/Au2Ag3") +os.mkdir("database_matminer/Au3Ag2") +os.mkdir("database_matminer/Au3Ag7") +os.mkdir("database_matminer/Au4Ag") +os.mkdir("database_matminer/Au7Ag3") +os.mkdir("database_matminer/Au9Ag") +os.mkdir("database_matminer/AuAg") +os.mkdir("database_matminer/AuAg4") +os.mkdir("database_matminer/AuAg9") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au40Ag60.yml database_matminer/Au2Ag3/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au60Ag40.yml database_matminer/Au3Ag2/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au30Ag70.yml database_matminer/Au3Ag7/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au80Ag20.yml database_matminer/Au4Ag/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au70Ag30.yml database_matminer/Au7Ag3/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au90Ag10.yml database_matminer/Au9Ag/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au50Ag50.yml database_matminer/AuAg/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au20Ag80.yml database_matminer/AuAg4/") +os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au10Ag90.yml database_matminer/AuAg9/") + +os.mkdir("database_matminer/Cu17Zn3") +os.mkdir("database_matminer/Cu7Zn3") +os.mkdir("database_matminer/Cu9Zn") +os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu85Zn15.yml database_matminer/Cu17Zn3/") +os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu70Zn30.yml database_matminer/Cu7Zn3/") +os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu90Zn10.yml database_matminer/Cu9Zn/") + +os.mkdir("database_matminer/Ni4Fe") +os.system("cp database_copy/data/other/alloys/Ni-Fe/Tikuisis_bare150nm.yml database_matminer/Ni4Fe/") + + +os.mkdir("database_matminer/Al198Ga802As1000") +os.mkdir("database_matminer/Al219Ga781As1000") +os.mkdir("database_matminer/Al315Ga685As1000") +os.mkdir("database_matminer/Al342Ga658As1000") +os.mkdir("database_matminer/Al411Ga589As1000") +os.mkdir("database_matminer/Al419Ga581As1000") +os.mkdir("database_matminer/Al452Ga548As1000") +os.mkdir("database_matminer/Al491Ga509As1000") +os.mkdir("database_matminer/Al590Ga410As1000") +os.mkdir("database_matminer/Al700Ga300As1000") +os.mkdir("database_matminer/Al804Ga196As1000") +os.mkdir("database_matminer/Al97Ga903As1000") +os.mkdir("database_matminer/Al99Ga901As1000") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-19.8.yml\' database_matminer/Al198Ga802As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-21.9.yml\' database_matminer/Al219Ga781As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Adachi-0.315.yml\' database_matminer/Al315Ga685As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-31.5.yml\' database_matminer/Al315Ga685As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-34.2.yml\' database_matminer/Al342Ga658As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-41.1.yml\' database_matminer/Al411Ga589As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-41.9.yml\' database_matminer/Al419Ga581As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-45.2.yml\' database_matminer/Al452Ga548As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-49.1.yml\' database_matminer/Al491Ga509As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-59.0.yml\' database_matminer/Al590Ga410As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Adachi-0.700.yml\' database_matminer/Al700Ga300As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-70.0.yml\' database_matminer/Al700Ga300As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-80.4.yml\' database_matminer/Al804Ga196As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-9.7.yml\' database_matminer/Al97Ga903As1000/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-9.9.yml\' database_matminer/Al99Ga901As1000/") + +os.mkdir("database_matminer/Al2329O2612N588") +os.mkdir("database_matminer/Al2351O2547N653") +os.mkdir("database_matminer/Al2356O2531N669") +os.mkdir("database_matminer/Al2372O2483N717") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-5.88.yml\' database_matminer/Al2329O2612N588/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-6.53.yml\' database_matminer/Al2351O2547N653/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-6.69.yml\' database_matminer/Al2356O2531N669/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-7.17.yml\' database_matminer/Al2372O2483N717/") + +os.mkdir("database_matminer/Al10Ga90Sb100") +os.mkdir("database_matminer/Al30Ga70Sb100") +os.mkdir("database_matminer/Al50Ga50Sb100") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-10.yml\' database_matminer/Al10Ga90Sb100/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-30.yml\' database_matminer/Al30Ga70Sb100/") +os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-50.yml\' database_matminer/Al50Ga50Sb100/") + +os.mkdir("database_matminer/In52Ga48As100") +os.system("cp \'database_copy/data/other/semiconductor alloys/GaAs-InAs/Adachi.yml\' database_matminer/In52Ga48As100/") + +os.mkdir("database_matminer/In52Ga48As24P76") +os.system("cp \'database_copy/data/other/semiconductor alloys/GaAs-InAs-GaP-InP/Adachi.yml\' database_matminer/In52Ga48As24P76/") + +os.mkdir("database_matminer/Ga51In49P100") +os.system("cp \'database_copy/data/other/semiconductor alloys/GaP-InP/Schubert.yml\' database_matminer/Ga51In49P100/") + +os.mkdir("database_matminer/Si11Ge89") +os.mkdir("database_matminer/Si20Ge80") +os.mkdir("database_matminer/Si28Ge72") +os.mkdir("database_matminer/Si47Ge53") +os.mkdir("database_matminer/Si48Ge52") +os.mkdir("database_matminer/Si65Ge35") +os.mkdir("database_matminer/Si85Ge15") +os.mkdir("database_matminer/Si98Ge2") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-11.yml\' database_matminer/Si11Ge89/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-20.yml\' database_matminer/Si20Ge80/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-28.yml\' database_matminer/Si28Ge72/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-47.yml\' database_matminer/Si47Ge53/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-48.yml\' database_matminer/Si48Ge52/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-65.yml\' database_matminer/Si65Ge35/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-85.yml\' database_matminer/Si85Ge15/") +os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-98.yml\' database_matminer/Si98Ge2/") + +# Remove unncessary folders and files +os.system("rm -r database_copy") diff --git a/matminer/featurizers/composition/alloy.py b/matminer/featurizers/composition/alloy.py index e30073f86..bc661df51 100644 --- a/matminer/featurizers/composition/alloy.py +++ b/matminer/featurizers/composition/alloy.py @@ -4,6 +4,7 @@ import collections import os +import warnings from functools import reduce import numpy as np @@ -16,6 +17,7 @@ from matminer.featurizers.composition.packing import AtomicPackingEfficiency from matminer.featurizers.utils.stats import PropertyStats from matminer.utils.data import CohesiveEnergyData, MagpieData, MixingEnthalpy +from matminer.utils.warnings import IMPUTE_NAN_WARNING module_dir = os.path.dirname(os.path.abspath(__file__)) data_dir = os.path.join(module_dir, "..", "..", "utils", "data_files") @@ -56,6 +58,9 @@ class Miedema(BaseFeaturizer): 'valence_electrons', 'a_const', 'R_const', 'H_trans' 'compressibility', 'shear_modulus', 'melting_point' 'structural_stability'. Please see the references for details. + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. Returns: (list of floats) Miedema formation enthalpies (eV/atom) for input struct_types: @@ -65,7 +70,7 @@ class Miedema(BaseFeaturizer): -Miedema_deltaH_amor: for amorphous phase """ - def __init__(self, struct_types="all", ss_types="min", data_source="Miedema"): + def __init__(self, struct_types="all", ss_types="min", data_source="Miedema", impute_nan=False): if isinstance(struct_types, list): self.struct_types = struct_types else: @@ -88,13 +93,31 @@ def __init__(self, struct_types="all", ss_types="min", data_source="Miedema"): else: raise NotImplementedError(f"data_source {data_source} not implemented yet") + self.impute_nan = impute_nan + if self.impute_nan: + elem_list = [Element(estr) for estr in self.df_dataset.index] + missing_elements = [e for e in Element if e not in elem_list] + missing_element_names = [e.symbol for e in missing_elements] + element_ids = [e.Z for e in missing_elements] + indices = list(range(len(elem_list), len(elem_list) + len(missing_elements))) + df_missing = pd.DataFrame(data=[indices, missing_element_names, element_ids]).T + df_missing.columns = ["Unnamed: 0", "element", "element_id"] + df_missing.set_index("element", inplace=True) + for col in self.df_dataset: + if col not in df_missing: + df_missing[col] = np.mean(self.df_dataset[col]) + self.df_dataset = pd.concat([self.df_dataset, df_missing]) + self.df_dataset.fillna(self.df_dataset.mean(), inplace=True) + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.element_list = [Element(estr) for estr in self.df_dataset.index] def precheck(self, c: Composition) -> bool: """ Precheck a single entry. Miedema does not work for compositions containing any elements for which the Miedema model has no parameters. - To precheck an entire dataframe (qnd automatically gather + To precheck an entire dataframe (and automatically gather the fraction of structures that will pass the precheck), please use precheck_dataframe. @@ -195,7 +218,9 @@ def deltaH_elast(self, elements, fracs): alp[1] * (elec[1] - elec[0]) / n_ws[1], ] ) - alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3)) + alp_a = np.multiply(1.5, np.power(np.abs(v_a), 2 / 3)) / reduce( + lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3) + ) # effective volume in alloy vab_a = v_molar[0] + np.array( @@ -212,13 +237,17 @@ def deltaH_elast(self, elements, fracs): ) # H_elast A in B - hab_elast = (2 * compr[0] * shear_mod[1] * np.power((vab_a[0] - vba_a[0]), 2)) / ( - 4 * shear_mod[1] * vab_a[0] + 3 * compr[0] * vba_a[0] - ) - # H_elast B in A - hba_elast = (2 * compr[1] * shear_mod[0] * np.power((vba_a[1] - vab_a[1]), 2)) / ( - 4 * shear_mod[0] * vba_a[1] + 3 * compr[1] * vab_a[1] - ) + if all(compr == 0) and all(shear_mod == 0): # This is the case for H-N + hab_elast = 0 + hba_elast = 0 + else: + hab_elast = (2 * compr[0] * shear_mod[1] * np.power((vab_a[0] - vba_a[0]), 2)) / ( + 4 * shear_mod[1] * vab_a[0] + 3 * compr[0] * vba_a[0] + ) + # H_elast B in A + hba_elast = (2 * compr[1] * shear_mod[0] * np.power((vba_a[1] - vab_a[1]), 2)) / ( + 4 * shear_mod[0] * vba_a[1] + 3 * compr[1] * vab_a[1] + ) deltaH_elast = np.multiply.reduce(fracs, 0) * (fracs[1] * hab_elast + fracs[0] * hba_elast) return deltaH_elast @@ -457,17 +486,25 @@ class YangSolidSolution(BaseFeaturizer): Yang omega - Mixing thermochemistry feature, Omega Yang delta - Atomic size mismatch term + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. + References: .. Yang and Zhang (2012) `https://linkinghub.elsevier.com/retrieve/pii/S0254058411009357`. """ - def __init__(self): + def __init__(self, impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) # Load in the mixing enthalpy data # Creates a lookup table of the liquid mixing enthalpies - self.dhf_mix = MixingEnthalpy() + self.dhf_mix = MixingEnthalpy(impute_nan=self.impute_nan) # Load in a table of elemental properties - self.elem_data = MagpieData() + self.elem_data = MagpieData(impute_nan=self.impute_nan) def precheck(self, c: Composition) -> bool: """ @@ -476,7 +513,7 @@ def precheck(self, c: Composition) -> bool: parameters. We can nearly equivalently approximate this by checking against the unary element list. - To precheck an entire dataframe (qnd automatically gather + To precheck an entire dataframe (and automatically gather the fraction of structures that will pass the precheck), please use precheck_dataframe. @@ -553,6 +590,9 @@ def compute_delta(self, comp): elements, fractions = zip(*comp.element_composition.items()) + if len(fractions) == 1: + return 0.0 + # Get the radii of elements radii = self.elem_data.get_elemental_properties(elements, "MiracleRadius") mean_r = PropertyStats.mean(radii, fractions) @@ -612,15 +652,23 @@ class WenAlloys(BaseFeaturizer): Shear modulus strength model Copyright 2020 Battelle Energy Alliance, LLC ALL RIGHTS RESERVED + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): # Use of Miedema to retrieve the shear modulus - self.data_source_miedema = Miedema(data_source="Miedema") - self.data_source_magpie = MagpieData().all_elemental_props - self.data_source_cohesive_energy = CohesiveEnergyData() - self.data_source_enthalpy = MixingEnthalpy() - self.yss = YangSolidSolution() + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source_miedema = Miedema(data_source="Miedema", impute_nan=self.impute_nan) + self.data_source_magpie = MagpieData(impute_nan=self.impute_nan).all_elemental_props + self.data_source_cohesive_energy = CohesiveEnergyData(impute_nan=self.impute_nan) + self.data_source_enthalpy = MixingEnthalpy(impute_nan=self.impute_nan) + self.yss = YangSolidSolution(impute_nan=self.impute_nan) def precheck(self, comp): return self.yss.precheck(comp) @@ -746,11 +794,14 @@ def compute_delta(variable, fractions): element :math:`i`, and :math:`\\bar{v}` is the fraction-weighted average of the variable. Args: - variable (list): List of properties to asses - fractions (list): List of fractions to asses + variable (list): List of properties to assess + fractions (list): List of fractions to assess Returns: (float) delta """ + if len(fractions) == 1: + return 0.0 + mean_variable = PropertyStats.mean(variable, fractions) dev_variable = np.power(1.0 - np.divide(variable, mean_variable), 2) return np.sqrt(PropertyStats.mean(dev_variable, fractions)) @@ -864,6 +915,9 @@ def compute_strength_local_mismatch_shear(shear_modulus, mean_shear_modulus, fra array_shear = np.array(shear_modulus) array_fractions = np.array(fractions) + if len(fractions) == 1: + return 0.0 + modulus_combination = ( 2 * array_fractions * (array_shear - mean_shear_modulus) / (array_shear + mean_shear_modulus) ) diff --git a/matminer/featurizers/composition/composite.py b/matminer/featurizers/composition/composite.py index eec6cd89d..f13269c44 100644 --- a/matminer/featurizers/composition/composite.py +++ b/matminer/featurizers/composition/composite.py @@ -2,6 +2,8 @@ Composition featurizers for composite features containing more than 1 category of general-purpose data. """ +import warnings + from matminer.featurizers.base import BaseFeaturizer from matminer.featurizers.composition.element import ElementFraction from matminer.featurizers.composition.orbital import ValenceOrbital @@ -11,8 +13,11 @@ MagpieData, MatscholarElementData, MEGNetElementData, + OpticalData, PymatgenData, + TransportData, ) +from matminer.utils.warnings import IMPUTE_NAN_WARNING class ElementProperty(BaseFeaturizer): @@ -43,21 +48,37 @@ class ElementProperty(BaseFeaturizer): (these must be supported by data_source) stats (list of strings): a list of weighted statistics to compute to for each property (see PropertyStats for available stats) + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self, data_source, features, stats): + def __init__(self, data_source, features, stats, impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) if data_source == "pymatgen": - self.data_source = PymatgenData() + self.data_source = PymatgenData(impute_nan=self.impute_nan) elif data_source == "magpie": - self.data_source = MagpieData() + self.data_source = MagpieData(impute_nan=self.impute_nan) elif data_source == "deml": - self.data_source = DemlData() + self.data_source = DemlData(impute_nan=self.impute_nan) elif data_source == "matscholar_el": - self.data_source = MatscholarElementData() + self.data_source = MatscholarElementData(impute_nan=self.impute_nan) elif data_source == "megnet_el": - self.data_source = MEGNetElementData() + self.data_source = MEGNetElementData(impute_nan=self.impute_nan) + elif data_source == "optical": + self.data_source = OpticalData(impute_nan=self.impute_nan) + elif data_source == "mp_transport": + self.data_source = TransportData(impute_nan=self.impute_nan) else: self.data_source = data_source + if self.impute_nan: + warnings.warn( + """The data_source has been specified externally and impute_nan is set to True. + Please make sure that the NaNs imputation has been done correctly + in the provided data_source to proceed.""" + ) self.features = features self.stats = stats @@ -65,12 +86,15 @@ def __init__(self, data_source, features, stats): self.pstats = PropertyStats() @classmethod - def from_preset(cls, preset_name): + def from_preset(cls, preset_name, impute_nan=False): """ Return ElementProperty from a preset string Args: preset_name: (str) can be one of "magpie", "deml", "matminer", "matscholar_el", or "megnet_el". + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. Returns: ElementProperty based on the preset name. @@ -147,17 +171,27 @@ def from_preset(cls, preset_name): elif preset_name == "matscholar_el": data_source = "matscholar_el" stats = ["minimum", "maximum", "range", "mean", "std_dev"] - features = MatscholarElementData().prop_names + features = MatscholarElementData(impute_nan=impute_nan).prop_names elif preset_name == "megnet_el": data_source = "megnet_el" stats = ["minimum", "maximum", "range", "mean", "std_dev"] - features = MEGNetElementData().prop_names + features = MEGNetElementData(impute_nan=impute_nan).prop_names + + elif preset_name == "optical": + data_source = "optical" + stats = ["minimum", "maximum", "range", "mean", "std_dev", "mode"] + features = OpticalData(impute_nan=impute_nan).prop_names + + elif preset_name == "mp_transport": + data_source = "mp_transport" + stats = ["minimum", "maximum", "range", "mean", "std_dev", "mode"] + features = TransportData(impute_nan=impute_nan).prop_names else: raise ValueError("Invalid preset_name specified!") - return cls(data_source, features, stats) + return cls(data_source, features, stats, impute_nan=impute_nan) def featurize(self, comp): """ @@ -236,12 +270,43 @@ def citations(self): r"adsurl = {https://ui.adsabs.harvard.edu/\#abs/2018arXiv181205055C}," "adsnote = {Provided by the SAO/NASA Astrophysics Data System}}" ] + elif self.data_source.__class__.__name__ == "OpticalData": + citation = [ + "@misc{mtgx," + "author = {Guillaume Brunin, Guido Petretto, David Waroquiers (Matgenix)}," + "year = {2022}" + ] + citation += [ + "@misc{rii," + "author = {Mikhail N. Polyanskiy}," + "title = {Refractive index database}," + "howpublished = {https://refractiveindex.info}," + "note = {Accessed on 2022-06-30}}" + ] + elif self.data_source.__class__.__name__ == "TransportData": + citation = [ + "@misc{mtgx," + "author = {Guillaume Brunin, Guido Petretto, David Waroquiers (Matgenix)}," + "year = {2022}" + ] + citation += [ + "@article{ricci2017ab," + "title={An ab initio electronic transport database for inorganic materials}," + "author={Ricci, Francesco and Chen, Wei and Aydemir, Umut and Snyder, G Jeffrey" + "and Rignanese, Gian-Marco and Jain, Anubhav and Hautier, Geoffroy}," + "journal={Scientific data}," + "volume={4}," + "number={1}," + "pages={1--13}," + "year={2017}," + "publisher={Nature Publishing Group}}" + ] else: citation = [] return citation def implementors(self): - return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn"] + return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn", "Guillaume Brunin (Matgenix)"] class Meredig(BaseFeaturizer): @@ -259,10 +324,17 @@ class Meredig(BaseFeaturizer): Mean number of valence electrons in each orbital Fraction of total valence electrons in each orbital + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): - self.data_source = MagpieData() + def __init__(self, impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = MagpieData(impute_nan=self.impute_nan) # The labels for statistics on element properties self._element_property_feature_labels = [ @@ -305,14 +377,16 @@ def featurize(self, comp): element_property_features[i] = self.pstats.calc_stat(elem_data, stat, fractions) # Final 8 features are statistics on valence orbitals, available from the ValenceOrbital featurizer - valence_orbital_features = ValenceOrbital(orbitals=("s", "p", "d", "f"), props=("avg", "frac")).featurize(comp) + valence_orbital_features = ValenceOrbital( + orbitals=("s", "p", "d", "f"), props=("avg", "frac"), impute_nan=self.impute_nan + ).featurize(comp) return element_fraction_features + element_property_features + valence_orbital_features def feature_labels(self): # Since we have more features than just element fractions, append 'fraction' to element symbols for clarity element_fraction_features = [e + " fraction" for e in ElementFraction().feature_labels()] - valence_orbital_features = ValenceOrbital().feature_labels() + valence_orbital_features = ValenceOrbital(impute_nan=self.impute_nan).feature_labels() return element_fraction_features + self._element_property_feature_labels + valence_orbital_features def citations(self): diff --git a/matminer/featurizers/composition/element.py b/matminer/featurizers/composition/element.py index 206122b17..940d3f3c2 100644 --- a/matminer/featurizers/composition/element.py +++ b/matminer/featurizers/composition/element.py @@ -2,10 +2,13 @@ Composition featurizers for elemental data and stoichiometry. """ +import warnings + from pymatgen.core import Element from matminer.featurizers.base import BaseFeaturizer from matminer.utils.data import DemlData, MagpieData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class ElementFraction(BaseFeaturizer): @@ -27,7 +30,7 @@ def featurize(self, comp): vector (list of floats): fraction of each element in a composition """ - vector = [0] * 103 + vector = [0] * 118 el_list = list(comp.element_composition.fractional_composition.items()) for el in el_list: obj = el @@ -37,7 +40,7 @@ def featurize(self, comp): def feature_labels(self): labels = [] - for i in range(1, 104): + for i in range(1, 119): labels.append(Element.from_Z(i).symbol) return labels @@ -207,12 +210,16 @@ class BandCenter(BaseFeaturizer): - Band center """ - magpie_data = MagpieData() - deml_data = DemlData() + def __init__(self, impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.magpie_data = MagpieData(impute_nan=self.impute_nan) + self.deml_data = DemlData(impute_nan=self.impute_nan) def featurize(self, comp): """ - (Rough) estimation of absolution position of band center using + (Rough) estimation of absolute position of band center using geometric mean of electronegativity. Args: diff --git a/matminer/featurizers/composition/ion.py b/matminer/featurizers/composition/ion.py index 2e780eae8..0b98719bd 100644 --- a/matminer/featurizers/composition/ion.py +++ b/matminer/featurizers/composition/ion.py @@ -2,6 +2,7 @@ Composition featurizers for compositions with ionic data. """ import itertools +import warnings import numpy as np @@ -10,6 +11,7 @@ from matminer.featurizers.utils.oxidation import has_oxidation_states from matminer.featurizers.utils.stats import PropertyStats from matminer.utils.data import DemlData, PymatgenData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class CationProperty(ElementProperty): @@ -35,7 +37,7 @@ class CationProperty(ElementProperty): """ @classmethod - def from_preset(cls, preset_name): + def from_preset(cls, preset_name, impute_nan=False): if preset_name == "deml": data_source = "deml" features = [ @@ -48,7 +50,10 @@ def from_preset(cls, preset_name): stats = ["minimum", "maximum", "range", "mean", "std_dev"] else: raise ValueError('Preset "%s" not found' % preset_name) - return cls(data_source, features, stats) + + if not impute_nan: + warnings.warn("CationProperty(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + return cls(data_source, features, stats, impute_nan=impute_nan) def feature_labels(self): return [f + " of cations" for f in super().feature_labels()] @@ -141,7 +146,7 @@ class IonProperty(BaseFeaturizer): Ionic property attributes. Similar to ElementProperty. """ - def __init__(self, data_source=PymatgenData(), fast=False): + def __init__(self, data_source=None, impute_nan=False, fast=False): """ Args: @@ -151,7 +156,10 @@ def __init__(self, data_source=PymatgenData(), fast=False): which can dramatically accelerate the calculation of whether an ionic compound is possible, but will miss heterovalent compounds like Fe3O4. """ - self.data_source = data_source + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = data_source or PymatgenData(impute_nan=self.impute_nan) self.fast = fast def featurize(self, comp): @@ -236,10 +244,18 @@ class ElectronAffinity(BaseFeaturizer): Calculate average electron affinity times formal charge of anion elements. Note: The formal charges must already be computed before calling `featurize`. Generates average (electron affinity*formal charge) of anions. + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): - self.data_source = DemlData() + def __init__(self, impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = DemlData(impute_nan=self.impute_nan) def featurize(self, comp): """ diff --git a/matminer/featurizers/composition/orbital.py b/matminer/featurizers/composition/orbital.py index f28abcb8c..54f7d8473 100644 --- a/matminer/featurizers/composition/orbital.py +++ b/matminer/featurizers/composition/orbital.py @@ -11,6 +11,7 @@ from matminer.featurizers.base import BaseFeaturizer from matminer.featurizers.utils.stats import PropertyStats from matminer.utils.data import MagpieData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class AtomicOrbitals(BaseFeaturizer): @@ -51,7 +52,7 @@ def featurize(self, comp): integer_comp, factor = comp.get_integer_formula_and_factor() # warning message if composition is dilute and truncated - if not (len(Composition(comp).elements) == len(Composition(integer_comp).elements)): + if not len(Composition(comp).elements) == len(Composition(integer_comp).elements): warn(f"AtomicOrbitals: {comp} truncated to {integer_comp}") homo_lumo = MolecularOrbitals(integer_comp).band_edges @@ -103,10 +104,16 @@ class ValenceOrbital(BaseFeaturizer): orbitals (list): orbitals to calculate props (list): specifies whether to return average number of electrons in each orbital, fraction of electrons in each orbital, or both + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self, orbitals=("s", "p", "d", "f"), props=("avg", "frac")): - self.data_source = MagpieData() + def __init__(self, orbitals=("s", "p", "d", "f"), props=("avg", "frac"), impute_nan=False): + self.impute_nan = impute_nan + if not self.impute_nan: + warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = MagpieData(impute_nan=self.impute_nan) self.orbitals = orbitals self.props = props @@ -126,7 +133,7 @@ def featurize(self, comp): # Get the mean number of electrons in each shell avg = [ PropertyStats.mean( - self.data_source.get_elemental_properties(elements, "N%sValence" % orb), + self.data_source.get_elemental_properties(elements, f"N{orb}Valence"), weights=fractions, ) for orb in self.orbitals diff --git a/matminer/featurizers/composition/packing.py b/matminer/featurizers/composition/packing.py index 2e167d446..0cabf8ac1 100644 --- a/matminer/featurizers/composition/packing.py +++ b/matminer/featurizers/composition/packing.py @@ -3,6 +3,7 @@ """ import itertools +import warnings from functools import lru_cache import numpy as np @@ -13,6 +14,7 @@ from matminer.featurizers.composition.element import ElementFraction from matminer.featurizers.utils.stats import PropertyStats from matminer.utils.data import MagpieData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class AtomicPackingEfficiency(BaseFeaturizer): @@ -49,7 +51,7 @@ class AtomicPackingEfficiency(BaseFeaturizer): for bulk metallic glasses, Nat. Commun. 6 (2015) 8123. doi:10.1038/ncomms9123. """ - def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6): + def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6, impute_nan=False): """ Initialize the featurizer @@ -60,6 +62,9 @@ def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6): max_types (int): Maximum number of atom types to consider when looking for efficient clusters. The process for finding efficient clusters very expensive for large numbers of types + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ # Store the options @@ -74,7 +79,10 @@ def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6): self._n_elems = len(self._el_frac.featurize(Composition("H"))) # Tool for looking up radii - self._data_source = MagpieData() + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self._data_source = MagpieData(impute_nan=impute_nan) # Lookup table of ideal radius ratios self.ideal_ratio = dict( diff --git a/matminer/featurizers/composition/tests/base.py b/matminer/featurizers/composition/tests/base.py index a1ac4f5d2..9f72d06a1 100644 --- a/matminer/featurizers/composition/tests/base.py +++ b/matminer/featurizers/composition/tests/base.py @@ -17,6 +17,15 @@ def setUp(self): } ) + self.df_nans = pd.DataFrame( + { + "composition": [ + # Composition("U2Og3"), + Composition({Specie("U", 3): 2, Specie("Og", -2): 3}), + ] + } + ) + if __name__ == "__main__": unittest.main() diff --git a/matminer/featurizers/composition/tests/test_alloy.py b/matminer/featurizers/composition/tests/test_alloy.py index 48d109fd3..7df5c0c7f 100644 --- a/matminer/featurizers/composition/tests/test_alloy.py +++ b/matminer/featurizers/composition/tests/test_alloy.py @@ -21,7 +21,7 @@ def test_miedema_all(self): ] } ) - miedema = Miedema(struct_types="all") + miedema = Miedema(struct_types="all", impute_nan=False) self.assertTrue(miedema.precheck(df["composition"].iloc[0])) self.assertFalse(miedema.precheck(df["composition"].iloc[-1])) self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition"), 2 / 3) @@ -52,6 +52,51 @@ def test_miedema_all(self): self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300) self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755) + # Tests impute_nan=True + df = pd.DataFrame( + { + "composition": [ + Composition("TiZr"), + Composition("Mg10Cu50Ca40"), + Composition("Fe2O3"), + ] + } + ) + miedema = Miedema(struct_types="all", impute_nan=True) + self.assertTrue(miedema.precheck(df["composition"].iloc[0])) + self.assertTrue(miedema.precheck(df["composition"].iloc[-1])) + self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition"), 1) + + # test precheck for oxidation-state decorated compositions + df = CompositionToOxidComposition(return_original_on_error=True).featurize_dataframe(df, "composition") + self.assertTrue(miedema.precheck(df["composition_oxid"].iloc[0])) + self.assertTrue(miedema.precheck(df["composition_oxid"].iloc[-1])) + self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition_oxid"), 1) + + mfps = miedema.featurize_dataframe(df, col_id="composition") + self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][0], -0.003445022152) + self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755) + + self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][1], -0.235125978427) + self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][1], -0.164541848271) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][1], -0.05280843311) + + self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][2], -0.1664609094669129) + self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][2], -0.08553195254092998) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273) + + # make sure featurization works equally for compositions with or without + # oxidation states + mfps = miedema.featurize_dataframe(df, col_id="composition_oxid") + self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][0], -0.003445022152) + self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755) + + self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][2], -0.1664609094669129) + self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][2], -0.08553195254092998) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273) + def test_miedema_ss(self): df = pd.DataFrame( { @@ -62,7 +107,7 @@ def test_miedema_ss(self): ] } ) - miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"]) + miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"], impute_nan=False) mfps = miedema.featurize_dataframe(df, col_id="composition") self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755) self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][0], 0.04700027066) @@ -82,6 +127,36 @@ def test_miedema_ss(self): self.assertAlmostEqual(math.isnan(mfps["Miedema_deltaH_ss_hcp"][2]), True) self.assertAlmostEqual(math.isnan(mfps["Miedema_deltaH_ss_no_latt"][2]), True) + # Test impute_nan=True + df = pd.DataFrame( + { + "composition": [ + Composition("TiZr"), + Composition("Mg10Cu50Ca40"), + Composition("Fe2O3"), + ] + } + ) + miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"], impute_nan=True) + mfps = miedema.featurize_dataframe(df, col_id="composition") + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][0], 0.04700027066) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][0], 0.08327522653) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][0], 0.03663599755) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][0], 0.036635998) + + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][1], -0.05280843311) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][1], 0.03010575174) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][1], -0.05280843311) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][1], 0.03010575174) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][1], -0.0035781359) + + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][2], 0.21122883524487504) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][2], 0.25737114702211505) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][2], 0.19402130236056273) + self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][2], 0.258796589515172) + def test_yang(self): comps = list( map( @@ -91,7 +166,7 @@ def test_yang(self): ) # Run the featurization - feat = YangSolidSolution() + feat = YangSolidSolution(impute_nan=False) df = pd.DataFrame({"composition": comps}) self.assertFalse(feat.precheck(df["composition"].iloc[-1])) @@ -118,8 +193,31 @@ def test_yang(self): np.testing.assert_array_almost_equal([158.5, 0.0315], features[2], decimal=1) np.testing.assert_array_almost_equal([5.06, 0.0482], features[3], decimal=1) + # Test with impute_nan=True + feat = YangSolidSolution(impute_nan=True) + + df = pd.DataFrame({"composition": comps}) + self.assertTrue(feat.precheck(df["composition"].iloc[-1])) + self.assertAlmostEqual(feat.precheck_dataframe(df, "composition"), 1, places=10) + + # test precheck for oxidation-state decorated compositions + df = CompositionToOxidComposition(return_original_on_error=True).featurize_dataframe(df, "composition") + self.assertTrue(feat.precheck(df["composition_oxid"].iloc[-1])) + self.assertAlmostEqual(feat.precheck_dataframe(df, "composition_oxid"), 1, places=10) + + feat.set_n_jobs(1) + features = feat.featurize_many(comps) + + self.assertEqual((5, 2), np.array(features).shape) + np.testing.assert_array_almost_equal([0.95, 0.1021], features[0], decimal=2) + np.testing.assert_array_almost_equal([2.22, 0.0], features[1], decimal=2) + np.testing.assert_array_almost_equal([158.5, 0.0315], features[2], decimal=1) + np.testing.assert_array_almost_equal([5.06, 0.0482], features[3], decimal=1) + + np.testing.assert_array_almost_equal([0.14893, 0.56212], features[4], decimal=5) + def test_WenAlloys(self): - wa = WenAlloys() + wa = WenAlloys(impute_nan=False) c1 = "Fe0.62C0.000953Mn0.000521Si0.00102Cr0.00011Ni0.192" "Mo0.0176V0.000112Nb6.16e-05Co0.146Al0.00318Ti0.0185" c2 = ( "Fe0.623C0.00854Mn0.000104Si0.000203Cr0.147Ni9.71e-05" @@ -187,6 +285,122 @@ def test_WenAlloys(self): self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5) self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5) + # Test impute_nan=True + wa = WenAlloys(impute_nan=True) + c1 = "Fe0.62C0.000953Mn0.000521Si0.00102Cr0.00011Ni0.192" "Mo0.0176V0.000112Nb6.16e-05Co0.146Al0.00318Ti0.0185" + c2 = ( + "Fe0.623C0.00854Mn0.000104Si0.000203Cr0.147Ni9.71e-05" + "Mo0.0179V0.00515N0.00163Nb6.14e-05Co0.188W0.00729Al0.000845" + ) + comp = Composition(c1) + + # Test prechecking + comp_bad = Composition("LaO3") + self.assertTrue(wa.precheck(comp)) + self.assertTrue(wa.precheck(comp_bad)) + + f = wa.featurize(comp) + + d = dict(zip(wa.feature_labels(), f)) + correct = { + "APE mean": 0.018915555593392162, + "Atomic Fraction": "Fe0.6199642900568927C0.0009529451103616431Mn0.0005209699921284533Si" + "0.0010199412513839203Cr0.00010999366436493258Ni0.1919889414369732Mo" + "0.017598986298389213V0.0001119935491715677Nb6.159645204436224e-05Co" + "0.14599159088436503Al0.003179816842549869Ti0.018498934461375023", + "Atomic weight mean": 57.24008321450784, + "Configuration entropy": -0.008958911485121818, + "Electronegativity delta": 0.042327487126447516, + "Electronegativity local mismatch": 0.08262466022141576, + "Interant d electrons": 45.0, + "Interant electrons": 53.0, + "Interant f electrons": 0, + "Interant p electrons": 5.0, + "Interant s electrons": 3.0, + "Lambda entropy": -12.084431980055149, + "Mean cohesive energy": 4.382084353941212, + "Mixing enthalpy": 3.6650695863166347, + "Radii gamma": 1.4183511064895242, + "Radii local mismatch": 0.7953797741513383, + "Shear modulus delta": 0.1794147729878139, + "Shear modulus local mismatch": 3.192861083726266, + "Shear modulus mean": 79.48600137832061, + "Shear modulus strength model": -0.009636621848440554, + "Total weight": 57.243028243301005, + "VEC mean": 8.395723406331793, + "Weight Fraction": "Fe0.6048579375087819 C0.00019995792415715736 " + "Mn0.0005000210911858884 Si0.0005004488909678273 " + "Cr9.991733798026916e-05 Ni0.19686472127404955 " + "Mo0.029497810507563525 V9.967061797901463e-05 " + "Nb9.997781710071831e-05 Co0.15031081922202344 " + "Al0.0014988950686416751 Ti0.015469822739568852 ", + "Yang delta": 0.027227922269552986, + "Yang omega": 4.4226005659658725, + } + + for flabel, fvalue in d.items(): + correct_value = correct[flabel] + if isinstance(correct_value, str): + self.assertEqual(correct_value, fvalue) + else: + self.assertAlmostEqual(correct_value, fvalue, places=8) + + self.assertEqual(len(wa.feature_labels()), 25) + + df = pd.DataFrame({"composition": [comp, Composition(c2)]}) + + df = wa.featurize_dataframe(df, "composition") + self.assertTupleEqual(df.shape, (2, 26)) + self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5) + self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5) + + f = wa.featurize(comp_bad) + + d = dict(zip(wa.feature_labels(), f)) + correct = { + "APE mean": 0.001579487142797431, + "Atomic Fraction": "La0.25O0.75", + "Atomic weight mean": 46.7259175, + "Configuration entropy": -0.004675254392360772, + "Electronegativity delta": 0.09973459781368167, + "Electronegativity local mismatch": 0.1654880136986303, + "Interant d electrons": 1.0, + "Interant electrons": 5.0, + "Interant f electrons": 0, + "Interant p electrons": 4.0, + "Interant s electrons": 0, + "Lambda entropy": -0.014796268010071023, + "Mean cohesive energy": 3.0675, + "Mixing enthalpy": 10.652682648401827, + "Radii gamma": 1.9699221262511677, + "Radii local mismatch": 23.0625, + "Shear modulus delta": 0.37931021448197916, + "Shear modulus local mismatch": 7.13935787671233, + "Shear modulus mean": 43.467431506849316, + "Shear modulus strength model": -0.08010552710644916, + "Total weight": 186.90367, + "VEC mean": 5.393835616438356, + "Weight Fraction": "La0.743192843671823 O0.25680715632817697 ", + "Yang delta": 0.5621167528521687, + "Yang omega": 0.14893408828673313, + } + + for flabel, fvalue in d.items(): + correct_value = correct[flabel] + if isinstance(correct_value, str): + self.assertEqual(correct_value, fvalue) + else: + self.assertAlmostEqual(correct_value, fvalue, places=8) + + self.assertEqual(len(wa.feature_labels()), 25) + + df = pd.DataFrame({"composition": [comp, Composition(c2)]}) + + df = wa.featurize_dataframe(df, "composition") + self.assertTupleEqual(df.shape, (2, 26)) + self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5) + self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5) + if __name__ == "__main__": unittest.main() diff --git a/matminer/featurizers/composition/tests/test_composite.py b/matminer/featurizers/composition/tests/test_composite.py index ddb45edcf..13e0850ff 100644 --- a/matminer/featurizers/composition/tests/test_composite.py +++ b/matminer/featurizers/composition/tests/test_composite.py @@ -7,7 +7,9 @@ class CompositeFeaturesTest(CompositionFeaturesTest): def test_elem(self): - df_elem = ElementProperty.from_preset("magpie").featurize_dataframe(self.df, col_id="composition") + df_elem = ElementProperty.from_preset("magpie", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 8) self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 26) self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 18) @@ -15,24 +17,113 @@ def test_elem(self): self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 8.64) self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 8) + df_elem = ElementProperty.from_preset("magpie", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 132) + + df_elem = ElementProperty.from_preset("magpie", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 8) + self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 26) + self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 18) + self.assertAlmostEqual(df_elem["MagpieData mean Number"][0], 15.2) + self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 8.64) + self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 8) + self.assertEqual(df_elem.isna().sum().sum(), 0) + + df_elem = ElementProperty.from_preset("magpie", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 56.5) + self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 92) + self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 35.5) + self.assertAlmostEqual(df_elem["MagpieData mean Number"][0], 70.7) + self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 17.04) + self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 56.5) + self.assertEqual(df_elem.isna().sum().sum(), 0) + def test_elem_deml(self): - df_elem_deml = ElementProperty.from_preset("deml").featurize_dataframe(self.df, col_id="composition") + df_elem_deml = ElementProperty.from_preset("deml", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 8) + self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 26) + self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 18) + self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 15.2) + self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 12.7279, 4) + + df_elem_deml = ElementProperty.from_preset("deml", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition", ignore_errors=True + ) + self.assertEqual(df_elem_deml.isna().sum().sum(), 80) + + df_elem_deml = ElementProperty.from_preset("deml", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 8) self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 26) self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 18) self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 15.2) self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 12.7279, 4) + self.assertEqual(df_elem_deml.isna().sum().sum(), 0) + + df_elem_deml = ElementProperty.from_preset("deml", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 92) + self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 118) + self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 26) + self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 107.6) + self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 18.3848, 4) + self.assertEqual(df_elem_deml.isna().sum().sum(), 0) def test_elem_matminer(self): - df_elem = ElementProperty.from_preset("matminer").featurize_dataframe(self.df, col_id="composition") + df_elem = ElementProperty.from_preset("matminer", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) self.assertAlmostEqual(df_elem["PymatgenData minimum melting_point"][0], 54.8, 1) self.assertTrue(math.isnan(df_elem["PymatgenData maximum bulk_modulus"][0])) self.assertAlmostEqual(df_elem["PymatgenData range X"][0], 1.61, 1) self.assertAlmostEqual(df_elem["PymatgenData mean X"][0], 2.796, 1) self.assertAlmostEqual(df_elem["PymatgenData maximum block"][0], 3, 1) + self.assertEqual(df_elem.isna().sum().sum(), 30) + + df_elem = ElementProperty.from_preset("matminer", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 45) + self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 987.903, 3) + + df_elem = ElementProperty.from_preset("matminer", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["PymatgenData minimum melting_point"][0], 54.8, 1) + self.assertFalse(math.isnan(df_elem["PymatgenData maximum bulk_modulus"][0])) + self.assertAlmostEqual(df_elem["PymatgenData range X"][0], 1.61, 1) + self.assertAlmostEqual(df_elem["PymatgenData mean X"][0], 2.796, 1) + self.assertAlmostEqual(df_elem["PymatgenData maximum block"][0], 3, 1) + self.assertEqual(df_elem.isna().sum().sum(), 0) + + df_elem = ElementProperty.from_preset("matminer", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 0) + self.assertAlmostEqual( + df_elem[ + [col for col in df_elem.columns if "composition" not in col and "electrical_resistivity" not in col] + ] + .sum() + .sum(), + 16295.221, + 3, + ) def test_elem_matscholar_el(self): - df_elem = ElementProperty.from_preset("matscholar_el").featurize_dataframe(self.df, col_id="composition") + df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) self.assertAlmostEqual( df_elem["MatscholarElementData range embedding 149"].iloc[0], 0.06827970966696739, @@ -50,8 +141,64 @@ def test_elem_matscholar_el(self): -0.02483355056028813, ) + df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition", ignore_errors=True + ) + self.assertEqual(df_elem.isna().sum().sum(), 1000) + + df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual( + df_elem["MatscholarElementData range embedding 149"].iloc[0], + 0.06827970966696739, + ) + self.assertAlmostEqual( + df_elem["MatscholarElementData range embedding 149"].iloc[1], + 0.06827970966696739, + ) + self.assertAlmostEqual( + df_elem["MatscholarElementData mean embedding 18"].iloc[0], + -0.020534400502219795, + ) + self.assertAlmostEqual( + df_elem["MatscholarElementData mean embedding 18"].iloc[1], + -0.02483355056028813, + ) + self.assertEqual(df_elem.isna().sum().sum(), 0) + + df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 0) + self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 17.666, 3) + def test_elem_megnet_el(self): - ep = ElementProperty.from_preset("megnet_el") + ep = ElementProperty.from_preset("megnet_el", impute_nan=False) + df_elem = ep.featurize_dataframe(self.df, col_id="composition") + self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], 0.127333, places=6) + self.assertAlmostEqual( + df_elem["MEGNetElementData maximum embedding 11"].iloc[0], + 0.160505, + places=6, + ) + self.assertAlmostEqual( + df_elem["MEGNetElementData maximum embedding 11"].iloc[1], + 0.160505, + places=6, + ) + self.assertTrue(ep.citations()) + + df_elem = ep.featurize_dataframe(self.df_nans, col_id="composition") + self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], -0.044911, places=6) + self.assertAlmostEqual( + df_elem["MEGNetElementData maximum embedding 11"].iloc[0], + 0.191229, + places=6, + ) + self.assertTrue(ep.citations()) + + ep = ElementProperty.from_preset("megnet_el", impute_nan=True) df_elem = ep.featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], 0.127333, places=6) self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[1], 0.127333, places=6) @@ -66,27 +213,137 @@ def test_elem_megnet_el(self): places=6, ) self.assertTrue(ep.citations()) + self.assertEqual(df_elem.isna().sum().sum(), 0) + + df_elem = ep.featurize_dataframe(self.df_nans, col_id="composition") + self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], -0.001072, places=6) + self.assertAlmostEqual( + df_elem["MEGNetElementData maximum embedding 11"].iloc[0], + 0.191229, + places=6, + ) + self.assertTrue(ep.citations()) + self.assertEqual(df_elem.isna().sum().sum(), 0) def test_meredig(self): - df_val = Meredig().featurize_dataframe(self.df, col_id="composition") + df_val = Meredig(impute_nan=False).featurize_dataframe(self.df, col_id="composition") + self.assertAlmostEqual(df_val["Fe fraction"].iloc[0], 2.0 / 5.0) + self.assertAlmostEqual(df_val["Fe fraction"].iloc[1], 0.5) + self.assertAlmostEqual(df_val["O fraction"].iloc[0], 3.0 / 5.0) + self.assertAlmostEqual(df_val["O fraction"].iloc[1], 0.5) + self.assertAlmostEqual(df_val["frac s valence electrons"].iloc[0], 0.294117647) + self.assertAlmostEqual(df_val["mean Number"].iloc[0], 15.2) + + df_val = Meredig(impute_nan=False).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_val.isna().sum().sum(), 17) + self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 1, 10) + + df_val = Meredig(impute_nan=True).featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_val["Fe fraction"].iloc[0], 2.0 / 5.0) self.assertAlmostEqual(df_val["Fe fraction"].iloc[1], 0.5) self.assertAlmostEqual(df_val["O fraction"].iloc[0], 3.0 / 5.0) self.assertAlmostEqual(df_val["O fraction"].iloc[1], 0.5) self.assertAlmostEqual(df_val["frac s valence electrons"].iloc[0], 0.294117647) self.assertAlmostEqual(df_val["mean Number"].iloc[0], 15.2) + self.assertEqual(df_val.isna().sum().sum(), 0) + + df_val = Meredig(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_val.isna().sum().sum(), 0) + self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 311.5897, 4) def test_fere_corr(self): df_fere_corr = ElementProperty( features=["FERE correction"], stats=["minimum", "maximum", "range", "mean", "std_dev"], data_source="deml", + impute_nan=False, + ).featurize_dataframe(self.df, col_id="composition") + self.assertAlmostEqual(df_fere_corr["DemlData minimum FERE correction"][0], -0.15213431610903) + self.assertAlmostEqual(df_fere_corr["DemlData maximum FERE correction"][0], 0.23) + self.assertAlmostEqual(df_fere_corr["DemlData range FERE correction"][0], 0.382134316) + self.assertAlmostEqual(df_fere_corr["DemlData mean FERE correction"][0], 0.077146274) + self.assertAlmostEqual(df_fere_corr["DemlData std_dev FERE correction"][0], 0.270209766) + + df_fere_corr = ElementProperty( + features=["FERE correction"], + stats=["minimum", "maximum", "range", "mean", "std_dev"], + data_source="deml", + impute_nan=False, + ).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_fere_corr.isna().sum().sum(), 5) + + df_fere_corr = ElementProperty( + features=["FERE correction"], + stats=["minimum", "maximum", "range", "mean", "std_dev"], + data_source="deml", + impute_nan=True, ).featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_fere_corr["DemlData minimum FERE correction"][0], -0.15213431610903) self.assertAlmostEqual(df_fere_corr["DemlData maximum FERE correction"][0], 0.23) self.assertAlmostEqual(df_fere_corr["DemlData range FERE correction"][0], 0.382134316) self.assertAlmostEqual(df_fere_corr["DemlData mean FERE correction"][0], 0.077146274) self.assertAlmostEqual(df_fere_corr["DemlData std_dev FERE correction"][0], 0.270209766) + self.assertEqual(df_fere_corr.isna().sum().sum(), 0) + + df_fere_corr = ElementProperty( + features=["FERE correction"], + stats=["minimum", "maximum", "range", "mean", "std_dev"], + data_source="deml", + impute_nan=True, + ).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_fere_corr.isna().sum().sum(), 0) + self.assertAlmostEqual(df_fere_corr.drop(columns="composition").sum().sum(), 0.2795, 4) + + def test_elem_optical(self): + df_elem = ElementProperty.from_preset("optical", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["OpticalData mean n_400.0"].iloc[0], 1.98229162203492) + self.assertAlmostEqual(df_elem["OpticalData range k_760.0"].iloc[1], 4.88738594404032) + self.assertAlmostEqual(df_elem["OpticalData maximum R_720.0"].iloc[0], 0.621705031591809) + + df_elem = ElementProperty.from_preset("optical", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 180) + + df_elem = ElementProperty.from_preset("optical", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["OpticalData mean n_400.0"].iloc[0], 1.98229162203492) + self.assertAlmostEqual(df_elem["OpticalData range k_760.0"].iloc[1], 4.88738594404032) + self.assertAlmostEqual(df_elem["OpticalData maximum R_720.0"].iloc[0], 0.621705031591809) + + df_elem = ElementProperty.from_preset("optical", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 0) + self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 201.3255, 4) + + def test_elem_transport(self): + df_elem = ElementProperty.from_preset("mp_transport", impute_nan=False).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["TransportData mean sigma_p"].iloc[0], 14933.7481377614, places=6) + self.assertAlmostEqual(df_elem["TransportData std_dev S_n"].iloc[1], 489.973884028426) + self.assertAlmostEqual(df_elem["TransportData mean m_p"].iloc[0], -0.00019543531213698) + + df_elem = ElementProperty.from_preset("mp_transport", impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertEqual(df_elem.isna().sum().sum(), 60) + + df_elem = ElementProperty.from_preset("mp_transport", impute_nan=True).featurize_dataframe( + self.df, col_id="composition" + ) + self.assertAlmostEqual(df_elem["TransportData mean sigma_p"].iloc[0], 14933.7481377614, places=6) + self.assertAlmostEqual(df_elem["TransportData std_dev S_n"].iloc[1], 489.973884028426) + self.assertAlmostEqual(df_elem["TransportData mean m_p"].iloc[0], -0.00019543531213698) + + df_elem = ElementProperty.from_preset("mp_transport", impute_nan=True).featurize_dataframe( + self.df_nans, col_id="composition" + ) + self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 9798095.622017656, 4) if __name__ == "__main__": diff --git a/matminer/featurizers/composition/tests/test_element.py b/matminer/featurizers/composition/tests/test_element.py index 729c0a0f9..ac0c604df 100644 --- a/matminer/featurizers/composition/tests/test_element.py +++ b/matminer/featurizers/composition/tests/test_element.py @@ -35,9 +35,23 @@ def test_tm_fraction(self): self.assertAlmostEqual(df_tm_frac["transition metal fraction"][0], 0.4) def test_band_center(self): - df_band_center = BandCenter().featurize_dataframe(self.df, col_id="composition") + df_band_center = BandCenter(impute_nan=False).featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_band_center["band center"][0], 5.870418816395603) - self.assertAlmostEqual(BandCenter().featurize(Composition("Ag33O500V200"))[0], 6.033480099340539) + self.assertAlmostEqual( + BandCenter(impute_nan=False).featurize(Composition("Ag33O500V200"))[0], 6.033480099340539 + ) + + df_band_center = BandCenter(impute_nan=False).featurize_dataframe( + self.df_nans, col_id="composition", ignore_errors=True + ) + self.assertEqual(df_band_center.isna().sum().sum(), 1, 10) + + df_band_center = BandCenter(impute_nan=True).featurize_dataframe(self.df, col_id="composition") + self.assertAlmostEqual(df_band_center["band center"][0], 5.870418816395603) + self.assertAlmostEqual(BandCenter(impute_nan=True).featurize(Composition("Ag33O500V200"))[0], 6.033480099340539) + + df_band_center = BandCenter(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition") + self.assertAlmostEqual(df_band_center["band center"][0], 4.598904, 6) if __name__ == "__main__": diff --git a/matminer/featurizers/composition/tests/test_ion.py b/matminer/featurizers/composition/tests/test_ion.py index d39d186e4..41e3723fe 100644 --- a/matminer/featurizers/composition/tests/test_ion.py +++ b/matminer/featurizers/composition/tests/test_ion.py @@ -1,5 +1,7 @@ +import math import unittest +import pytest from pymatgen.core import Composition from pymatgen.core.periodic_table import Specie @@ -20,9 +22,10 @@ def test_is_ionic(self): self.assertTrue(is_ionic(Composition({Specie("Fe", 2): 1, Specie("O", -2): 1}))) self.assertFalse(is_ionic(Composition({Specie("Fe", 0): 1, Specie("Al", 0): 1}))) + self.assertTrue(is_ionic(self.df_nans.loc[0, "composition"])) def test_ionic(self): - featurizer = IonProperty() + featurizer = IonProperty(impute_nan=False) df_ionic = featurizer.featurize_dataframe(self.df, col_id="composition") self.assertEqual(df_ionic["compound possible"][0], 1.0) self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.476922164) @@ -39,8 +42,38 @@ def test_ionic(self): featurizer.featurize(Composition({Specie("Fe", 2): 1, Specie("Fe", 3): 2, Specie("O", -2): 4}))[0], ) + featurizer = IonProperty(impute_nan=False) + df_ionic = featurizer.featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_ionic["compound possible"][0], 1.0) + self.assertTrue(math.isnan(df_ionic["max ionic char"][0])) + self.assertTrue(math.isnan(df_ionic["avg ionic char"][0])) + + featurizer = IonProperty(impute_nan=True) + df_ionic = featurizer.featurize_dataframe(self.df, col_id="composition") + self.assertEqual(df_ionic["compound possible"][0], 1.0) + self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.476922164) + self.assertAlmostEqual(df_ionic["avg ionic char"][0], 0.114461319) + + # Test 'fast' + self.assertEqual(1.0, featurizer.featurize(Composition("Fe3O4"))[0]) + featurizer.fast = True + self.assertEqual(0, featurizer.featurize(Composition("Fe3O4"))[0]) + + # Make sure 'fast' works if I use-precomputed oxidation states + self.assertEqual( + 1, + featurizer.featurize(Composition({Specie("Fe", 2): 1, Specie("Fe", 3): 2, Specie("O", -2): 4}))[0], + ) + + featurizer = IonProperty(impute_nan=True) + df_ionic = featurizer.featurize_dataframe(self.df_nans, col_id="composition") + + self.assertEqual(df_ionic["compound possible"][0], 1.0) + self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.028909, 6) + self.assertAlmostEqual(df_ionic["avg ionic char"][0], 0.006938, 6) + def test_cation_properties(self): - featurizer = CationProperty.from_preset("deml") + featurizer = CationProperty.from_preset("deml", impute_nan=False) features = dict( zip( featurizer.feature_labels(), @@ -53,9 +86,52 @@ def test_cation_properties(self): self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 5.48) self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0) + with pytest.raises(KeyError): + featurizer = CationProperty.from_preset("deml", impute_nan=False) + features = dict( + zip( + featurizer.feature_labels(), + featurizer.featurize(self.df_nans["composition"][0]), + ) + ) + + featurizer = CationProperty.from_preset("deml", impute_nan=True) + features = dict( + zip( + featurizer.feature_labels(), + featurizer.featurize(self.df["composition"][1]), + ) + ) + self.assertAlmostEqual(features["DemlData minimum magn_moment of cations"], 5.48) + self.assertAlmostEqual(features["DemlData maximum magn_moment of cations"], 5.48) + self.assertAlmostEqual(features["DemlData range magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 5.48) + self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0) + + featurizer = CationProperty.from_preset("deml", impute_nan=True) + features = dict( + zip( + featurizer.feature_labels(), + featurizer.featurize(self.df_nans["composition"][0]), + ) + ) + self.assertAlmostEqual(features["DemlData minimum magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData maximum magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData range magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0) + self.assertAlmostEqual(features["DemlData minimum total_ioniz of cations"], 811242.222222, 6) + self.assertAlmostEqual(features["DemlData maximum total_ioniz of cations"], 811242.222222, 6) + self.assertAlmostEqual(features["DemlData mean total_ioniz of cations"], 811242.222222, 6) + def test_elec_affin(self): - featurizer = ElectronAffinity() + featurizer = ElectronAffinity(impute_nan=False) + self.assertAlmostEqual(-141000 * 2, featurizer.featurize(self.df["composition"][1])[0]) + self.assertTrue(math.isnan(featurizer.featurize(self.df_nans["composition"][0])[0])) + + featurizer = ElectronAffinity(impute_nan=True) self.assertAlmostEqual(-141000 * 2, featurizer.featurize(self.df["composition"][1])[0]) + self.assertAlmostEqual(featurizer.featurize(self.df_nans["composition"][0])[0], -153716.470588, 6) def test_en_diff(self): featurizer = ElectronegativityDiff() diff --git a/matminer/featurizers/composition/tests/test_orbital.py b/matminer/featurizers/composition/tests/test_orbital.py index bb9511ea9..e328e5bd8 100644 --- a/matminer/featurizers/composition/tests/test_orbital.py +++ b/matminer/featurizers/composition/tests/test_orbital.py @@ -8,7 +8,7 @@ class OrbitalFeaturesTest(CompositionFeaturesTest): def test_valence(self): - df_val = ValenceOrbital().featurize_dataframe(self.df, col_id="composition") + df_val = ValenceOrbital(impute_nan=False).featurize_dataframe(self.df, col_id="composition") self.assertAlmostEqual(df_val["avg s valence electrons"][0], 2.0) self.assertAlmostEqual(df_val["avg p valence electrons"][0], 2.4) self.assertAlmostEqual(df_val["avg d valence electrons"][0], 2.4) @@ -18,6 +18,23 @@ def test_valence(self): self.assertAlmostEqual(df_val["frac p valence electrons"][0], 0.352941176) self.assertAlmostEqual(df_val["frac f valence electrons"][0], 0) + df_val = ValenceOrbital(impute_nan=False).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_val.isna().sum().sum(), 8) + + df_val = ValenceOrbital(impute_nan=True).featurize_dataframe(self.df, col_id="composition") + self.assertAlmostEqual(df_val["avg s valence electrons"][0], 2.0) + self.assertAlmostEqual(df_val["avg p valence electrons"][0], 2.4) + self.assertAlmostEqual(df_val["avg d valence electrons"][0], 2.4) + self.assertAlmostEqual(df_val["avg f valence electrons"][0], 0.0) + self.assertAlmostEqual(df_val["frac s valence electrons"][0], 0.294117647) + self.assertAlmostEqual(df_val["frac d valence electrons"][0], 0.352941176) + self.assertAlmostEqual(df_val["frac p valence electrons"][0], 0.352941176) + self.assertAlmostEqual(df_val["frac f valence electrons"][0], 0) + + df_val = ValenceOrbital(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition") + self.assertEqual(df_val.isna().sum().sum(), 0) + self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 10.342857, 6) + def test_atomic_orbitals(self): df_atomic_orbitals = AtomicOrbitals().featurize_dataframe(self.df, col_id="composition") self.assertEqual(df_atomic_orbitals["HOMO_character"][0], "d") diff --git a/matminer/featurizers/composition/tests/test_packing.py b/matminer/featurizers/composition/tests/test_packing.py index 5d717d222..ebf43d9fe 100644 --- a/matminer/featurizers/composition/tests/test_packing.py +++ b/matminer/featurizers/composition/tests/test_packing.py @@ -11,11 +11,11 @@ class PackingFeaturesTest(CompositionFeaturesTest): - def test_ape(self): - f = AtomicPackingEfficiency() + def test_ape1(self): + f = AtomicPackingEfficiency(impute_nan=False) + f.set_n_jobs(1) ef = ElementFraction() ef.set_n_jobs(1) - f.set_n_jobs(1) # Test the APE calculation routines self.assertAlmostEqual(1.11632, f.get_ideal_radius_ratio(15)) @@ -90,6 +90,85 @@ def test_ape(self): feat = f.compute_nearest_cluster_distance(Composition("Al")) np.testing.assert_array_almost_equal([1] * 3, feat) + def test_ape2(self): + f = AtomicPackingEfficiency(impute_nan=True) + f.set_n_jobs(1) + ef = ElementFraction() + ef.set_n_jobs(1) + + # Test the APE calculation routines + self.assertAlmostEqual(1.11632, f.get_ideal_radius_ratio(15)) + self.assertAlmostEqual(0.154701, f.get_ideal_radius_ratio(2)) + self.assertAlmostEqual(1.65915, f.get_ideal_radius_ratio(27)) + self.assertAlmostEqual(15, f.find_ideal_cluster_size(1.116)[0]) + self.assertAlmostEqual(3, f.find_ideal_cluster_size(0.1)[0]) + self.assertAlmostEqual(24, f.find_ideal_cluster_size(2)[0]) + + # Test the nearest neighbor lookup tool + nn_lookup = f.create_cluster_lookup_tool([Element("Cu"), Element("Zr")]) + + # Check that the table gets the correct structures + stable_clusters = [ + Composition("CuZr10"), + Composition("Cu6Zr6"), + Composition("Cu8Zr5"), + Composition("Cu13Zr1"), + Composition("Cu3Zr12"), + Composition("Cu8Zr8"), + Composition("Cu12Zr5"), + Composition("Cu17Zr"), + ] + ds, _ = nn_lookup.kneighbors(ef.featurize_many(stable_clusters), n_neighbors=1) + np.testing.assert_array_almost_equal([[0]] * 8, ds) + self.assertEqual(8, nn_lookup._fit_X.shape[0]) + + # Swap the order of the clusters, make sure it gets the same list + nn_lookup_swapped = f.create_cluster_lookup_tool([Element("Zr"), Element("Cu")]) + np.testing.assert_array_almost_equal( + sorted(nn_lookup._fit_X.tolist()), sorted(nn_lookup_swapped._fit_X.tolist()) + ) + + # Make sure we had a cache hit + self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().misses) + self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().hits) + + # Change the tolerance, see if it changes the results properly + f.threshold = 0.002 + nn_lookup = f.create_cluster_lookup_tool([Element("Cu"), Element("Zr")]) + self.assertEqual(2, nn_lookup._fit_X.shape[0]) + ds, _ = nn_lookup.kneighbors( + ef.featurize_many([Composition("CuZr10"), Composition("Cu3Zr12")]), + n_neighbors=1, + ) + np.testing.assert_array_almost_equal([[0]] * 2, ds) + + # Make sure we had a cache miss + self.assertEqual(5, f._create_cluster_lookup_tool.cache_info().misses) + self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().hits) + + # Compute the distances from Cu50Zr50 + mean_dists = f.compute_nearest_cluster_distance(Composition("CuZr")) + np.testing.assert_array_almost_equal([0.424264, 0.667602, 0.800561], mean_dists, decimal=6) + + # Compute the optimal APE for Cu50Zr50 + np.testing.assert_array_almost_equal( + [0.000233857, 0.003508794], + f.compute_simultaneous_packing_efficiency(Composition("Cu50Zr50")), + ) + + # Test the dataframe calculator + df = pd.DataFrame({"comp": [Composition("CuZr")]}) + df = f.featurize_dataframe(df, "comp") + + self.assertEqual(6, len(df.columns)) + self.assertIn("dist from 5 clusters |APE| < 0.002", df.columns) + + self.assertAlmostEqual(0.003508794, df["mean abs simul. packing efficiency"][0]) + + # Make sure it works with composition that do not match any efficient clusters + feat = f.compute_nearest_cluster_distance(Composition("Al")) + np.testing.assert_array_almost_equal([1] * 3, feat) + if __name__ == "__main__": unittest.main() diff --git a/matminer/featurizers/site/chemical.py b/matminer/featurizers/site/chemical.py index 8715bbfed..b05b27aef 100644 --- a/matminer/featurizers/site/chemical.py +++ b/matminer/featurizers/site/chemical.py @@ -2,6 +2,8 @@ Site featurizers based on local chemical information, rather than geometry alone. """ +import warnings + import numpy as np import pymatgen.analysis.local_env from pymatgen.analysis.ewald import EwaldSummation @@ -13,6 +15,7 @@ from matminer.featurizers.base import BaseFeaturizer from matminer.utils.caching import get_nearest_neighbors from matminer.utils.data import MagpieData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class ChemicalSRO(BaseFeaturizer): @@ -310,7 +313,8 @@ class LocalPropertyDifference(BaseFeaturizer): def __init__( self, - data_source=MagpieData(), + data_source=None, + impute_nan=False, weight="area", properties=("Electronegativity",), signed=False, @@ -318,7 +322,7 @@ def __init__( """Initialize the featurizer Args: - data_source (AbstractData) - Class from which to retrieve + data_source (AbstractData or None) - Class from which to retrieve elemental properties weight (str) - What aspect of each voronoi facet to use to weigh each neighbor (see VoronoiNN) @@ -326,13 +330,16 @@ def __init__( signed (bool) - whether to return absolute difference or signed difference of properties(default=False (absolute difference)) """ - self.data_source = data_source + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = data_source or MagpieData(impute_nan=self.impute_nan) self.properties = properties self.weight = weight self.signed = signed @staticmethod - def from_preset(preset): + def from_preset(preset, impute_nan=False): """ Create a new LocalPropertyDifference class according to a preset @@ -342,7 +349,7 @@ def from_preset(preset): if preset == "ward-prb-2017": return LocalPropertyDifference( - data_source=MagpieData(), + data_source=MagpieData(impute_nan=impute_nan), properties=[ "Number", "MendeleevNumber", @@ -441,14 +448,20 @@ class SiteElementalProperty(BaseFeaturizer): `Schmidt et al., _Chem Mater_. (2017) `_ """ - def __init__(self, data_source=None, properties=("Number",)): + def __init__(self, data_source=None, properties=("Number",), impute_nan=False): """Initialize the featurizer Args: data_source (AbstractData): Tool used to look up elemental properties properties ([string]): List of properties to use for features + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - self.data_source = data_source or MagpieData() + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = data_source or MagpieData(impute_nan=self.impute_nan) self.properties = properties self._preset_citations = [] @@ -472,7 +485,7 @@ def implementors(self): return ["Logan Ward"] @staticmethod - def from_preset(preset): + def from_preset(preset, impute_nan=False): """Create the class with pre-defined settings Args: @@ -483,7 +496,7 @@ def from_preset(preset): if preset == "seko-prb-2017": output = SiteElementalProperty( - data_source=MagpieData(), + data_source=MagpieData(impute_nan=impute_nan), properties=[ "Number", "AtomicWeight", diff --git a/matminer/featurizers/site/misc.py b/matminer/featurizers/site/misc.py index bb2c2a1ca..c82a1f08d 100644 --- a/matminer/featurizers/site/misc.py +++ b/matminer/featurizers/site/misc.py @@ -1,6 +1,8 @@ """ Miscellaneous site featurizers. """ +import warnings + import numpy as np import pymatgen.analysis.local_env from pymatgen.analysis.local_env import VoronoiNN, solid_angle, vol_tetra @@ -10,6 +12,7 @@ from matminer.featurizers.utils.stats import PropertyStats from matminer.utils.caching import get_nearest_neighbors from matminer.utils.data import MagpieData +from matminer.utils.warnings import IMPUTE_NAN_WARNING class IntersticeDistribution(BaseFeaturizer): @@ -48,9 +51,12 @@ class IntersticeDistribution(BaseFeaturizer): support sub-list of ['dist', 'area', 'vol']. stats ([str]): statistics of distance/area/volume interstices. radius_type (str): source of radius estimate. (default: "MiracleRadius") + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="MiracleRadius"): + def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="MiracleRadius", impute_nan=False): self.cutoff = cutoff self.interstice_types = ["dist", "area", "vol"] if interstice_types is None else interstice_types if isinstance(self.interstice_types, str): @@ -59,7 +65,10 @@ def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="M raise ValueError("interstice_types only support sub-list of " "['dist', 'area', 'vol']") self.stats = ["mean", "std_dev", "minimum", "maximum"] if stats is None else stats self.radius_type = radius_type - self.data_source = MagpieData() + self.impute_nan = impute_nan + if not self.impute_nan: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + self.data_source = MagpieData(impute_nan=impute_nan) def featurize(self, struct, idx): """ diff --git a/matminer/featurizers/site/tests/base.py b/matminer/featurizers/site/tests/base.py index 50de75d3e..9294b3f52 100644 --- a/matminer/featurizers/site/tests/base.py +++ b/matminer/featurizers/site/tests/base.py @@ -60,6 +60,14 @@ def setUp(self): coords_are_cartesian=False, site_properties=None, ) + self.nans = Structure( + Lattice([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]]), + ["Og", "U"], + [[0, 0, 0], [0.5, 0.5, 0.5]], + validate_proximity=False, + to_unit_cell=False, + coords_are_cartesian=False, + ) def tearDown(self): del self.sc diff --git a/matminer/featurizers/site/tests/test_chemical.py b/matminer/featurizers/site/tests/test_chemical.py index f7e73df90..d1389bb83 100644 --- a/matminer/featurizers/site/tests/test_chemical.py +++ b/matminer/featurizers/site/tests/test_chemical.py @@ -95,34 +95,74 @@ def test_ewald_site(self): np.testing.assert_array_almost_equal(ewald.featurize(self.sc, 0), [0]) def test_local_prop_diff(self): - f = LocalPropertyDifference() + f = LocalPropertyDifference(impute_nan=False) # Test for Al, all features should be zero features = f.featurize(self.sc, 0) np.testing.assert_array_almost_equal(features, [0]) + # Test for fictive structure that leads to NaNs + features = f.featurize(self.nans, 0) + assert np.isnan(features[0]) + + # Change the property to Number, compute for B1 + f.set_params(properties=["Number"]) + for i in range(2): + features = f.featurize(self.b1, i) + np.testing.assert_array_almost_equal(features, [1]) + + for i in range(2): + features = f.featurize(self.nans, i) + assert np.isnan(features[0]) + + f = LocalPropertyDifference(impute_nan=True) + + # Test for Al, all features should be zero + features = f.featurize(self.sc, 0) + np.testing.assert_array_almost_equal(features, [0]) + + # Test for fictive structure that leads to NaNs + features = f.featurize(self.nans, 0) + np.testing.assert_array_almost_equal(features, [0.26003609]) + # Change the property to Number, compute for B1 f.set_params(properties=["Number"]) for i in range(2): features = f.featurize(self.b1, i) np.testing.assert_array_almost_equal(features, [1]) + for i in range(2): + features = f.featurize(self.nans, i) + np.testing.assert_array_almost_equal(features, [27.54767206]) + def test_site_elem_prop(self): - f = SiteElementalProperty.from_preset("seko-prb-2017") + f = SiteElementalProperty.from_preset("seko-prb-2017", impute_nan=False) # Make sure it does the B1 structure correctly feat_labels = f.feature_labels() feats = f.featurize(self.b1, 0) self.assertAlmostEqual(1, feats[feat_labels.index("site Number")]) + assert np.isnan(feats[feat_labels.index("site SecondIonizationEnergy")]) feats = f.featurize(self.b1, 1) self.assertAlmostEqual(2, feats[feat_labels.index("site Number")]) + assert np.isnan(feats[feat_labels.index("site Electronegativity")]) # Test the citations citations = f.citations() self.assertEqual(1, len(citations)) self.assertIn("Seko2017", citations[0]) + f = SiteElementalProperty.from_preset("seko-prb-2017", impute_nan=True) + feat_labels = f.feature_labels() + feats = f.featurize(self.b1, 0) + self.assertAlmostEqual(1, feats[feat_labels.index("site Number")]) + self.assertAlmostEqual(feats[feat_labels.index("site SecondIonizationEnergy")], 18.781681, 6) + + feats = f.featurize(self.b1, 1) + self.assertAlmostEqual(2, feats[feat_labels.index("site Number")]) + self.assertAlmostEqual(feats[feat_labels.index("site Electronegativity")], 1.715102, 6) + if __name__ == "__main__": unittest.main() diff --git a/matminer/featurizers/site/tests/test_misc.py b/matminer/featurizers/site/tests/test_misc.py index f39d51c8e..b74c372da 100644 --- a/matminer/featurizers/site/tests/test_misc.py +++ b/matminer/featurizers/site/tests/test_misc.py @@ -17,7 +17,7 @@ def test_interstice_distribution_of_crystal(self): ) df_bcc_li = pd.DataFrame({"struct": [bcc_li], "site": [1]}) - interstice_distribution = IntersticeDistribution() + interstice_distribution = IntersticeDistribution(impute_nan=False) intersticefp = interstice_distribution.featurize_dataframe(df_bcc_li, ["struct", "site"]) self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.32, 2) @@ -33,6 +33,36 @@ def test_interstice_distribution_of_crystal(self): self.assertAlmostEqual(intersticefp["Interstice_dist_minimum"][0], 0, 3) self.assertAlmostEqual(intersticefp["Interstice_dist_maximum"][0], 0.15461, 5) + bcc_og = Structure( + Lattice([[3.51, 0, 0], [0, 3.51, 0], [0, 0, 3.51]]), + ["Og"] * 2, + [[0, 0, 0], [0.5, 0.5, 0.5]], + ) + df_bcc_og = pd.DataFrame({"struct": [bcc_og], "site": [1]}) + + intersticefp = interstice_distribution.featurize_dataframe(df_bcc_og, ["struct", "site"]) + self.assertEqual(intersticefp.isna().sum().sum(), 4) + self.assertEqual(intersticefp.drop(columns="struct").sum().sum(), 1) + + interstice_distribution = IntersticeDistribution(impute_nan=True) + intersticefp = interstice_distribution.featurize_dataframe(df_bcc_li, ["struct", "site"]) + + self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.32, 2) + self.assertAlmostEqual(intersticefp["Interstice_vol_std_dev"][0], 0) + self.assertAlmostEqual(intersticefp["Interstice_vol_minimum"][0], 0.32, 2) + self.assertAlmostEqual(intersticefp["Interstice_vol_maximum"][0], 0.32, 2) + self.assertAlmostEqual(intersticefp["Interstice_area_mean"][0], 0.16682, 5) + self.assertAlmostEqual(intersticefp["Interstice_area_std_dev"][0], 0) + self.assertAlmostEqual(intersticefp["Interstice_area_minimum"][0], 0.16682, 5) + self.assertAlmostEqual(intersticefp["Interstice_area_maximum"][0], 0.16682, 5) + self.assertAlmostEqual(intersticefp["Interstice_dist_mean"][0], 0.06621, 5) + self.assertAlmostEqual(intersticefp["Interstice_dist_std_dev"][0], 0.07655, 5) + self.assertAlmostEqual(intersticefp["Interstice_dist_minimum"][0], 0, 3) + self.assertAlmostEqual(intersticefp["Interstice_dist_maximum"][0], 0.15461, 5) + intersticefp = interstice_distribution.featurize_dataframe(df_bcc_og, ["struct", "site"]) + self.assertEqual(intersticefp.isna().sum().sum(), 0) + self.assertAlmostEqual(intersticefp.drop(columns="struct").sum().sum(), 2.581817, 6) + def test_interstice_distribution_of_glass(self): cuzr_glass = Structure( Lattice([[25, 0, 0], [0, 25, 0], [0, 0, 25]]), @@ -72,7 +102,7 @@ def test_interstice_distribution_of_glass(self): ) df_glass = pd.DataFrame({"struct": [cuzr_glass], "site": [0]}) - interstice_distribution = IntersticeDistribution() + interstice_distribution = IntersticeDistribution(impute_nan=False) intersticefp = interstice_distribution.featurize_dataframe(df_glass, ["struct", "site"]) self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.28905, 5) diff --git a/matminer/utils/data.py b/matminer/utils/data.py index 9a3963ae1..de8aea73d 100644 --- a/matminer/utils/data.py +++ b/matminer/utils/data.py @@ -7,11 +7,21 @@ import abc import json import os +import tarfile +import warnings +from copy import deepcopy from glob import glob import numpy as np import pandas as pd +from pymatgen.core import Composition from pymatgen.core.periodic_table import Element +from ruamel import yaml +from scipy import stats +from scipy.interpolate import interp1d + +from matminer.utils.utils import get_elem_in_data, get_pseudo_inverse +from matminer.utils.warnings import IMPUTE_NAN_WARNING __author__ = "Kiran Mathew, Jiming Chen, Logan Ward, Anubhav Jain, Alex Dunn" @@ -97,13 +107,31 @@ class CohesiveEnergyData(AbstractData): (http://www.knowledgedoor.com/2/elements_handbook/cohesive_energy.html), which in turn got the data from Introduction to Solid State Physics, 8th Edition, by Charles Kittel (ISBN 978-0-471-41526-8), 2005. + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): # Load elemental cohesive energy data from json file with open(os.path.join(module_dir, "data_files", "cohesive_energies.json")) as f: self.cohesive_energy_data = json.load(f) + self.impute_nan = impute_nan + if self.impute_nan: + elem_list = list(self.cohesive_energy_data) + missing_elements = [e.symbol for e in Element if e.symbol not in elem_list] + avg_cohesive_energy = np.nanmean(list(self.cohesive_energy_data.values())) + + for e in Element: + if e.symbol in missing_elements or np.isnan(self.cohesive_energy_data[e.symbol]): + self.cohesive_energy_data[e.symbol] = avg_cohesive_energy + + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + def get_elemental_property(self, elem, property_name="cohesive energy"): """ Args: @@ -126,12 +154,17 @@ class DemlData(OxidationStateDependentData, OxidationStatesMixin): The meanings of each feature in the data can be found in ./data_files/deml_elementdata.py + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): from matminer.utils.data_files.deml_elementdata import properties - self.all_props = properties + self.all_props = deepcopy(properties) self.available_props = list(self.all_props.keys()) + [ "formal_charge", "valence_s", @@ -141,12 +174,6 @@ def __init__(self): "total_ioniz", ] - # Compute the FERE correction energy - fere_corr = {} - for k, v in self.all_props["GGAU_Etot"].items(): - fere_corr[k] = self.all_props["mus_fere"][k] - v - self.all_props["FERE correction"] = fere_corr - # List out the available charge-dependent properties self.charge_dependent_properties = [ "xtal_field_split", @@ -155,6 +182,47 @@ def __init__(self): "sat_magn", ] + self.impute_nan = impute_nan + if self.impute_nan: + avg_ioniz = np.nanmean([self.all_props["ionization_en"].get(e.symbol, [float("NaN")])[0] for e in Element]) + + for e in Element: + if e.symbol not in self.all_props["atom_num"]: + self.all_props["atom_num"][e.symbol] = e.Z + if e.symbol not in self.all_props["atom_mass"]: + self.all_props["atom_mass"][e.symbol] = e.atomic_mass + if e.symbol not in self.all_props["row_num"]: + self.all_props["row_num"][e.symbol] = e.row + if e.symbol not in self.all_props["ionization_en"] or np.isnan( + self.all_props["ionization_en"][e.symbol][0] + ): + self.all_props["ionization_en"][e.symbol] = [avg_ioniz] + for key in [ + "col_num", + "atom_radius", + "molar_vol", + "heat_fusion", + "melting_point", + "boiling_point", + "heat_cap", + "electron_affin", + "electronegativity", + "electric_pol", + "GGAU_Etot", + "mus_fere", + "electron_affin", + ]: + if e.symbol not in self.all_props[key] or np.isnan(self.all_props[key][e.symbol]): + self.all_props[key][e.symbol] = np.nanmean(list(self.all_props[key].values())) + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + + # Compute the FERE correction energy + fere_corr = {} + for k, v in self.all_props["GGAU_Etot"].items(): + fere_corr[k] = self.all_props["mus_fere"][k] - v + self.all_props["FERE correction"] = fere_corr + def get_elemental_property(self, elem, property_name): if "valence" in property_name: valence_dict = self.all_props["valence_e"][self.all_props["col_num"][elem.symbol]] @@ -169,13 +237,18 @@ def get_elemental_property(self, elem, property_name): return self.all_props[property_name].get(elem.symbol, float("NaN")) def get_oxidation_states(self, elem): - return self.all_props["charge_states"][elem.symbol] + if not self.impute_nan: + return self.all_props["charge_states"][elem.symbol] + else: + return self.all_props["charge_states"].get(elem.symbol, [0]) def get_charge_dependent_property(self, element, charge, property_name): if property_name == "total_ioniz": if charge < 0: raise ValueError("total ionization energy only defined for charge > 0") return sum(self.all_props["ionization_en"][element.symbol][:charge]) + elif self.impute_nan: + return self.all_props[property_name].get(element.symbol, {}).get(charge, 0) else: return self.all_props[property_name].get(element.symbol, {}).get(charge, np.nan) @@ -190,9 +263,14 @@ class MagpieData(AbstractData, OxidationStatesMixin): Finding the exact meaning of each of these features can be quite difficult. Reproduced in ./data_files/magpie_elementdata_feature_descriptions.txt. + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): self.all_elemental_props = dict() available_props = [] self.data_dir = os.path.join(module_dir, "data_files", "magpie_elementdata") @@ -216,6 +294,37 @@ def __init__(self): prop_value = float("NaN") self.all_elemental_props[descriptor_name][Element.from_Z(atomic_no).symbol] = prop_value + self.impute_nan = impute_nan + if self.impute_nan: + for prop in self.all_elemental_props: + if prop == "OxidationStates": + nested_props = list(self.all_elemental_props["OxidationStates"].values()) + flatten_props = [] + for l in nested_props: + if isinstance(l, list): + for e in l: + flatten_props.append(e) + else: + flatten_props.append(l) + + avg_prop = np.round(np.nanmean(flatten_props)) + for e in Element: + if ( + e.symbol not in self.all_elemental_props[prop] + or self.all_elemental_props[prop][e.symbol] == [] + or np.any(np.isnan(self.all_elemental_props[prop][e.symbol])) + ): + self.all_elemental_props[prop][e.symbol] = [avg_prop] + else: + avg_prop = np.nanmean(list(self.all_elemental_props[prop].values())) + for e in Element: + if e.symbol not in self.all_elemental_props[prop] or np.isnan( + self.all_elemental_props[prop][e.symbol] + ): + self.all_elemental_props[prop][e.symbol] = avg_prop + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + def get_elemental_property(self, elem, property_name): return self.all_elemental_props[property_name][elem.symbol] @@ -232,10 +341,16 @@ class PymatgenData(OxidationStateDependentData, OxidationStatesMixin): Meanings of each feature can be obtained from the pymatgen.Composition documentation (attributes). + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self, use_common_oxi_states=True): + def __init__(self, use_common_oxi_states=True, impute_nan=False): self.use_common_oxi_states = use_common_oxi_states + self.impute_nan = impute_nan def get_elemental_property(self, elem, property_name): if property_name == "block": @@ -243,7 +358,22 @@ def get_elemental_property(self, elem, property_name): return block_key[getattr(elem, property_name)] else: value = getattr(elem, property_name) - return np.nan if value is None else value + if self.impute_nan: + if value and not pd.isnull(value): + return value + else: + if property_name == "ionic_radii": + return {0: self.get_elemental_property(elem, "atomic_radius")} + elif property_name in ["common_oxidation_states", "icsd_oxidation_states"]: + return (0,) + else: + all_values = [getattr(e, property_name) for e in Element] + all_values = [val if val else float("NaN") for val in all_values] + value_avg = np.nanmean(all_values) + return value_avg + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + return np.nan if value is None else value def get_oxidation_states(self, elem): """Get the oxidation states of an element @@ -258,7 +388,13 @@ def get_oxidation_states(self, elem): return elem.common_oxidation_states if self.use_common_oxi_states else elem.oxidation_states def get_charge_dependent_property(self, element, charge, property_name): - return getattr(element, property_name)[charge] + if self.impute_nan: + if property_name == "ionic_radii": + return self.get_elemental_property(element, property_name)[charge] + elif property_name in ["common_oxidation_states", "icsd_oxidation_states"]: + return self.get_elemental_property(element, property_name)[charge] + else: + return getattr(element, property_name)[charge] class MixingEnthalpy: @@ -275,12 +411,17 @@ class MixingEnthalpy: valid_element_list ([Element]): A list of elements for which the mixing enthalpy parameters are defined (although no guarantees are provided that all combinations of this list will be available). + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): mixing_dataset = pd.read_csv( os.path.join(module_dir, "data_files", "MiedemaLiquidDeltaHf.tsv"), - delim_whitespace=True, + sep=r"\s+", ) self.mixing_data = {} for a, b, dHf in mixing_dataset.itertuples(index=False): @@ -365,6 +506,18 @@ def __init__(self): ] self.valid_element_list = [Element(e) for e in valid_elements] + self.impute_nan = impute_nan + if self.impute_nan: + avg_value = np.nanmean(list(self.mixing_data.values())) + for e1 in Element: + for e2 in Element: + key = tuple(sorted((e1.symbol, e2.symbol))) + if key not in self.mixing_data or np.isnan(self.mixing_data[key]): + self.mixing_data[key] = avg_value + self.valid_element_list = list(Element) + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + def get_mixing_enthalpy(self, elemA, elemB): """ Get the mixing enthalpy between different elements @@ -392,9 +545,14 @@ class MatscholarElementData(AbstractData): Tshitoyan, V., Dagdelen, J., Weston, L. et al. Unsupervised word embeddings capture latent knowledge from materials science literature. Nature 571, 95–98 (2019). https://doi.org/10.1038/s41586-019-1335-8 + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): dfile = os.path.join(module_dir, "data_files/matscholar_els.json") with open(dfile) as fp: embeddings = json.load(fp) @@ -404,6 +562,22 @@ def __init__(self): all_element_data[el] = dict(zip(self.prop_names, embedding)) self.all_element_data = all_element_data + self.impute_nan = impute_nan + if self.impute_nan: + for embedding in self.prop_names: + avg_value = np.nanmean( + [self.all_element_data[e].get(embedding, float("NaN")) for e in self.all_element_data] + ) + for e in Element: + if e.symbol not in self.all_element_data: + self.all_element_data[e.symbol] = {embedding: avg_value} + elif embedding not in self.all_element_data[e.symbol] or np.isnan( + self.all_element_data[e.symbol][embedding] + ): + self.all_element_data[e.symbol][embedding] = avg_value + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + def get_elemental_property(self, elem, property_name): return self.all_element_data[str(elem)][property_name] @@ -429,9 +603,14 @@ class MEGNetElementData(AbstractData): The representations are learned during training to predict a specific property, though they may be useful for a range of properties. + + Args: + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. """ - def __init__(self): + def __init__(self, impute_nan=False): dfile = os.path.join(module_dir, "data_files/megnet_elemental_embedding.json") self._dummy = "Dummy" with open(dfile) as fp: @@ -445,6 +624,22 @@ def __init__(self): else: self.all_element_data[str(Element.from_Z(i))] = embedding_dict + self.impute_nan = impute_nan + if self.impute_nan: + for embedding in self.prop_names: + avg_value = np.nanmean( + [self.all_element_data[e].get(embedding, float("NaN")) for e in self.all_element_data] + ) + for e in Element: + if e.symbol not in self.all_element_data: + self.all_element_data[e.symbol] = {embedding: avg_value} + elif embedding not in self.all_element_data[e.symbol] or np.isnan( + self.all_element_data[e.symbol][embedding] + ): + self.all_element_data[e.symbol][embedding] = avg_value + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + def get_elemental_property(self, elem, property_name): estr = str(elem) if estr not in self.all_element_data.keys(): @@ -572,3 +767,581 @@ def get_bv_params(self, cation, anion, cat_val, an_val): ] return bond_val_list.iloc[0] # If multiple values exist, take first one # as recommended for reliability. + + +class OpticalData(AbstractData): + """ + Class to use optical data from https://www.refractiveindex.info + The properties are the refractive index n, the extinction coefficient ĸ + (measured or computed with DFT), and the reflectivity R as obtained from + Fresnel's equation. + Data is by default considered if available from 380 to 780 nm, + but other ranges can be chosen as well. + + In case new data becomes available and needs to be added to the database, + it should be added in matminer/utils/data_files/optical_polyanskiy/database, + which should then be compressed in the tar.xz format. + To add a file for a compound, follow any of the formats of refractiveindex.info. + + The database is used to extract: + 1) the properties of single elements when available. + 2) the pseudo-inverse of the properties of single elements, + based on the data for ~200 compounds. These pseudo-inverse + contributions correspond to the coefficients of a least-square fit + from the compositions to the properties. This can allow to better take into account + data from different compounds for a given element. + + Using the pseudo-inverses (method="pseudo_inverse") instead of + the elemental properties (method="exact") leads to better results as far as we have checked. + Another possibility is to use method="combined", where the exact values are taken + for compounds present as pure compounds in the database, and the pseudo-inverse + is taken if the element is not present purely in the database. + + n, ĸ, and R are spectra. These are composed of n_wl wavelengths, from min_wl to max_wl. + We split these spectra into bins (initially 10) where their average values are taken. + These averaged values are the final features. The wavelength corresponding to a given bin + is its midpoint. + + Args: + props: optical properties to include. Should be a list with + "refractive" and/or "extinction" and/or "reflectivity". + method: type of values, either "exact", "pseudo_inverse", or "combined". + min_wl: minimum wavelength to include in the spectra (µm). + max_wl : maximum wavelength to include in the spectra (µm). + n_wl: number of wavelengths to include in the spectra. + bins: number of bins to split the spectra. + saving_dir: folder to save the data and csv file used for the featurization. Saving them helps fasten the + featurization. + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. + """ + + def __init__( + self, + props=None, + method="pseudo_inverse", + min_wl=0.38, + max_wl=0.78, + n_wl=401, + bins=10, + saving_dir="~/.matminer/optical_props/", + impute_nan=False, + ): + # Handles the saving folder + saving_dir = os.path.expanduser(saving_dir) + os.makedirs(os.path.join(saving_dir, "database"), exist_ok=True) + self.saving_dir = saving_dir + + # Handle the selection of properties + if props is None: + props = ["refractive", "extinction", "reflectivity"] + elif not all([prop in ["refractive", "extinction", "reflectivity"] for prop in props]): + raise ValueError("This property is not available: choose from refractive, extinction, or reflectivity") + + self.props = props + self.method = method + self.n_wl = n_wl + self.min_wl = min_wl + self.max_wl = max_wl + self.wavelengths = np.linspace(min_wl, max_wl, n_wl) + + # The data might have already been treated : it is faster to read the data from file + dbfile = os.path.join(self.saving_dir, f"optical_polyanskiy_{self.min_wl}_{self.max_wl}_{self.n_wl}.csv") + + if os.path.isfile(dbfile): + data = pd.read_csv(dbfile) + data.set_index("Compound", inplace=True) + self.data = data + else: + # Recompute the data file from the database + warnings.warn( + """Datafile not existing for these wavelengths: recollecting the data from the database. + This can take a few seconds...""" + ) + self.data = self._get_optical_data_from_database() + self.data.to_csv(dbfile) + warnings.warn("The data has been collected and stored.") + + self.elem_data = self._get_element_props() + + # Split into bins + bins *= len(props) + slices = np.linspace(0, len(self.elem_data.T), bins + 1, True).astype(int) + counts = np.diff(slices) + + cols = self.elem_data.columns[slices[:-1] + counts // 2] + labels = list(cols) # [col for col in cols] + + all_element_data = pd.DataFrame( + np.add.reduceat(self.elem_data.values, slices[:-1], axis=1) / counts, + columns=cols, + index=self.elem_data.index, + ) + + self.all_element_data = all_element_data + self.prop_names = labels + + self.impute_nan = impute_nan + if self.impute_nan: + self.all_element_data.fillna(self.all_element_data.mean(), inplace=True) + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + + def _get_element_props(self): + """ + Returns the properties of single elements from the data contained in the database. + """ + + data = self.data.copy() + + cols = [] + if "refractive" in self.props: + cols += [name for name in data.columns if "n_" in name] + if "extinction" in self.props: + cols += [name for name in data.columns if "k_" in name] + if "reflectivity" in self.props: + cols += [name for name in data.columns if "R_" in name] + + data = data[cols] + + # Compute the exact values + res = [] + elem, elem_absent = get_elem_in_data(data, as_pure=True) + for e in elem: + res.append(data.loc[e].values) + for e in elem_absent: + res.append(np.nan * np.ones(len(self.props) * self.n_wl)) + + res = np.array(res) + df_exact = pd.DataFrame(res, columns=cols, index=pd.Index(elem + elem_absent)) + + if self.method == "exact": + return df_exact + else: + # Compute the pseudo-inversed values + df_pi = get_pseudo_inverse(self.data, cols) + + if self.method == "pseudo_inverse": + return df_pi + elif self.method == "combined": + res_combined = [] + for i, e in df_exact.iterrows(): + if e.isnull().sum() == 0: + res_combined.append(e.values) + else: + res_combined.append(df_pi.loc[i].values) + res_combined = np.array(res_combined) + df_combined = pd.DataFrame(res_combined, columns=cols, index=df_exact.index) + return df_combined + + else: + raise ValueError("The method should be either exact, pseudo_inverse or combined.") + + def _get_optical_data_from_database(self): + """ + Get a dataframe with the refractive index, extinction coefficients, and reflectivity + as obtained from the initial database, for an array of wavelengths. + We need to handle the database that is in different formats... + + Returns: + DataFrame with the data + """ + + db_dir = os.path.join(self.saving_dir, "database") + # The database has been compressed, it needs to be untarred if it is not already the case. + if not os.listdir(db_dir): + db_file = os.path.join(module_dir, "data_files/optical_polyanskiy/database.tar.xz") + with tarfile.open(db_file, mode="r:xz") as tar: + tar.extractall(self.saving_dir) + + names = [] + compos = [] + N = [] + K = [] + + for material in os.listdir(db_dir): + # Some materials have the data in the needed wavelengths range, + # others don't and throw an error + try: + n_avg = self._get_nk_avg(os.path.join(db_dir, material)) + compos.append(Composition(material)) + names.append(material) + N.append(n_avg.real) + K.append(n_avg.imag) + except ValueError: + pass + + W = 1000 * self.wavelengths + N = np.array(N) + K = np.array(K) + R = ((N - 1) ** 2 + K**2) / ((N + 1) ** 2 + K**2) + + cols_names_n = [f"n_{np.round(wl, 2)}" for wl in W] + cols_names_k = [f"k_{np.round(wl, 2)}" for wl in W] + cols_names_r = [f"R_{np.round(wl, 2)}" for wl in W] + + df_n = pd.DataFrame(N, columns=cols_names_n) + df_k = pd.DataFrame(K, columns=cols_names_k) + df_r = pd.DataFrame(R, columns=cols_names_r) + + df = pd.concat([df_n, df_k, df_r], axis=1) + df["Composition"] = compos + df["Compound"] = names + df.set_index("Compound", inplace=True) + + return df + + def _get_optical_data_from_file(self, yaml_file): + """ + From a yml file, returns the refractive index for a given wavelength + + Args: + yaml_file: path to the yml file containing the data + + Returns: + refractive index (complex number) + """ + + # Open the yml file + with open(yaml_file) as yml: + data = yaml.YAML(typ="safe", pure=True).load(yml)["DATA"][0] + + data_format = data["type"] + + # We now treat different formats for the data + if data_format in ["tabulated nk", "tabulated n"]: + # We parse the data to get the wavelength, n and kappa + if data_format == "tabulated nk": + arr = np.fromstring(data["data"].replace("\n", " "), sep=" ").reshape((-1, 3)) + K = arr[:, 2] + # kappa not available -> 0 + elif data_format == "tabulated n": + arr = np.fromstring(data["data"].replace("\n", " "), sep=" ").reshape((-1, 2)) + K = np.zeros(len(arr)) + + wl = arr[:, 0] + range_wl = np.array([np.min(wl), np.max(wl)]) + N = arr[:, 1] + + interpN = interp1d(wl, N) + interpK = interp1d(wl, K) + + # Check that self.wavelengths are within the range + if np.any(self.min_wl < range_wl[0]) or np.any(self.max_wl > range_wl[1]): + raise ValueError( + f"""The values of lambda asked to be returned is outside the range + of available data. This can lead to strong deviation as extrapolation might be bad. For information, the + range is [{range_wl[0]}, {range_wl[1]}] microns.""" + ) + else: + return np.array([x for x in np.nditer(interpN(self.wavelengths) + 1j * interpK(self.wavelengths))]) + + # If the data is not tabulated, it is given with a formula + elif "formula" in data_format: + range_wl = np.fromstring(data["wavelength_range"], sep=" ") + # Check that lamb is within the range + if np.any(self.min_wl < range_wl[0]) or np.any(self.max_wl > range_wl[1]): + raise ValueError( + f""""The values of lambda asked to be returned is outside the range + of available data. This can lead to strong deviation as extrapolation might be bad. For information, the + range is [{range_wl[0]}, {range_wl[1]}] microns.""" + ) + else: + coeff_file = np.fromstring(data["coefficients"], sep=" ") + + N = np.zeros(self.n_wl) + 1j * np.zeros(self.n_wl) + + if data_format == "formula 1": + coeffs = np.zeros(17) + coeffs[0 : len(coeff_file)] = coeff_file + N += 1 + coeffs[0] + for i in range(1, 17, 2): + N += (coeffs[i] * self.wavelengths**2) / (self.wavelengths**2 - coeffs[i + 1] ** 2) + N = np.sqrt(N) + + elif data_format == "formula 2": + coeffs = np.zeros(17) + coeffs[0 : len(coeff_file)] = coeff_file + N += 1 + coeffs[0] + for i in range(1, 17, 2): + N += (coeffs[i] * self.wavelengths**2) / (self.wavelengths**2 - coeffs[i + 1]) + N = np.sqrt(N) + + elif data_format == "formula 3": + coeffs = np.zeros(17) + coeffs[0 : len(coeff_file)] = coeff_file + N += coeffs[0] + for i in range(1, 17, 2): + N += coeffs[i] * self.wavelengths ** coeffs[i + 1] + N = np.sqrt(N) + + elif data_format == "formula 4": + coeffs = np.zeros(17) + coeffs[0 : len(coeff_file)] = coeff_file + N += coeffs[0] + N += coeffs[1] * self.wavelengths ** coeffs[2] / (self.wavelengths**2 - coeffs[3] ** coeffs[4]) + N += coeffs[5] * self.wavelengths ** coeffs[6] / (self.wavelengths**2 - coeffs[7] ** coeffs[8]) + for i in range(9, 17, 2): + N += coeffs[i] * self.wavelengths ** coeffs[i + 1] + N = np.sqrt(N) + + elif data_format == "formula 5": + coeffs = np.zeros(11) + coeffs[0 : len(coeff_file)] = coeff_file + N += coeffs[0] + for i in range(1, 11, 2): + N += coeffs[i] * self.wavelengths ** coeffs[i + 1] + + elif data_format == "formula 6": + coeffs = np.zeros(11) + coeffs[0 : len(coeff_file)] = coeff_file + N += 1 + coeffs[0] + for i in range(1, 11, 2): + N += coeffs[i] * self.wavelengths**2 / (coeffs[i + 1] * self.wavelengths**2 - 1) + + elif data_format == "formula 7": + coeffs = np.zeros(6) + coeffs[0 : len(coeff_file)] = coeff_file + N += ( + coeffs[0] + + coeffs[1] / (self.wavelengths**2 - 0.028) + + coeffs[2] / (self.wavelengths**2 - 0.028) ** 2 + ) + N += ( + coeffs[3] * self.wavelengths**2 + + coeffs[4] * self.wavelengths**4 + + coeffs[5] * self.wavelengths**6 + ) + + elif data_format == "formula 8": + coeffs = np.zeros(4) + coeffs[0 : len(coeff_file)] = coeff_file + N += ( + coeffs[0] + + coeffs[1] * self.wavelengths**2 / (self.wavelengths**2 - coeffs[2]) + + coeffs[3] * self.wavelengths**2 + ) + N = np.sqrt((1 + 2 * N) / (1 - N)) + + elif data_format == "formula 9": + coeffs = np.zeros(6) + coeffs[0 : len(coeff_file)] = coeff_file + N += ( + coeffs[0] + + coeffs[1] / (self.wavelengths**2 - coeffs[2]) + + coeffs[3] * (self.wavelengths - coeffs[4]) / ((self.wavelengths - coeffs[4]) ** 2 + coeffs[5]) + ) + N = np.sqrt(N) + + return N + 1j * np.zeros(len(N)) + + else: + raise ValueError("UnsupportedDataType: This data type is currently not supported !") + + def _get_nk_avg(self, dirname): + """ + Compute the average of n and ĸ for a compound of the database. + + Args: + dirname: path to the compound of interest + + Returns: + refractive index (complex number) + """ + + files = os.listdir(dirname) + + navg = [] + for f in files: + # Some files have only kappa : they raise a ValueError + try: + # We get the average of all curves in the region of interest. + # For some systems, the region of interest is not covered. + navg.append(self._get_optical_data_from_file(os.path.join(dirname, f))) + except ValueError: + pass + + # We go further only if there is data + if navg: + navg = np.array(navg).mean(axis=0) + Navg = navg.real + Kavg = navg.imag + return Navg + 1j * Kavg + else: + raise ValueError(f"No correct data for {dirname}") + + def get_elemental_property(self, elem, property_name): + estr = str(elem) + return self.all_element_data.loc[estr][property_name] + + +class TransportData(AbstractData): + """ + Class to use transport data from Ricci et al., see + An ab initio electronic transport database for inorganic materials. + Ricci, F., Chen, W., Aydemir, U., Snyder, G. J., Rignanese, G. M., Jain, A., & Hautier, G. (2017). + Scientific data, 4(1), 1-13. + https://doi.org/10.1038/sdata.2017.85 + + The database has been used to extract: + 1) the properties of single elements when available. + These are stored in matminer/utils/data_files/mp_transport/transport_pure_elems.csv + 2) the pseudo-inverse of the properties of single elements. + These pseudo-inverse contributions correspond to the coefficients of a least-square fit + from the compositions to the properties. This can allow to better take into account + data from different compounds for a given element. + + Using the pseudo-inverses (method="pseudo_inverse") instead of + the elemental properties (method="exact") leads to better results as far as we have checked. + Another possibility is to use method="combined", where the exact values are taken + for compounds present as pure compounds in the database, and the pseudo-inverse + is taken if the element is not present purely in the database. + + For the effective mass, the pseudo-inverse is obtained on 1/(alpha+m), + then m is re-obtained for single elements. This is to avoid + huge errors coming from the huge spread in data (12 orders of magnitude). + + Args: + props: optical properties to include. Should be a (sub)list of + ["sigma_p", "sigma_n", "S_p", "S_n", "kappa_p", "kappa_n", "PF_p", "PF_n", "m_p", "m_n"] + for the hole (_p) and electron (_n) conductivity (sigma), Seebeck coefficient (S), + thermal conductivity (kappa), power factor (PF) and effective mass (m). + method: type of values, either "exact", "pseudo_inverse", or "combined". + alpha: Value used to featurize the effective mass. + The values of the effective masses span 12 orders of magnitude, which makes the pseudo-inverse biased + To overcome this, we use 1 / (alpha + m) for the pseudo-inversion. + The value of alpha can be tested. A file for each of them is created, + so that it is not computed each time. + Defaults to 0, and used only if method != "exact". + saving_dir: folder to save the data and csv file used for the featurization. Saving them helps fasten the + featurization. + impute_nan (bool): if True, the features for the elements + that are missing from the data_source or are NaNs are replaced by the + average of each features over the available elements. + """ + + def __init__( + self, + props=None, + method="pseudo_inverse", + alpha=0, + saving_dir="~/.matminer/transport_props/", + impute_nan=False, + ): + # Handles the saving folder + saving_dir = os.path.expanduser(saving_dir) + os.makedirs(saving_dir, exist_ok=True) + self.saving_dir = saving_dir + + # Handle the selection of properties + possible_props = ["sigma_p", "sigma_n", "S_p", "S_n", "kappa_p", "kappa_n", "PF_p", "PF_n", "m_p", "m_n"] + if props is None: + props = possible_props + elif not all([prop in possible_props for prop in props]): + raise ValueError("This property is not available: choose from " + ", ".join(possible_props)) + + self.props = props + self.method = method + self.alpha = alpha + + # Read the database as a compressed or csv file + dfile = os.path.join(module_dir, "data_files/mp_transport/", "transport_database.csv") + dfile_tarred = os.path.join(module_dir, "data_files/mp_transport/", "transport_database.tar.xz") + if not os.path.isfile(dfile) and os.path.isfile(dfile_tarred): + with tarfile.open(dfile_tarred, mode="r:xz") as tar: + tar.extractall(os.path.join(module_dir, "data_files/mp_transport/")) + data = pd.read_csv(dfile).drop(columns=["mp_id"]) + data.rename(columns={"pretty_formula": "Compound"}, inplace=True) + data.set_index("Compound", inplace=True) + + # Remove outliers + data = data[(np.abs(stats.zscore(data)) < 3).all(axis=1)] + + # Add composition + compos = [] + for c in data.index: + compos.append(Composition(c)) + data["Composition"] = compos + + self.data = data + + self.all_element_data = self._get_element_props() + + self.prop_names = list(self.all_element_data.columns) + + self.impute_nan = impute_nan + if self.impute_nan: + self.all_element_data.fillna(self.all_element_data.mean(), inplace=True) + else: + warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING) + + def _get_element_props(self): + # + # Compute the exact values + # + dfile = os.path.join(module_dir, "data_files/mp_transport/", "transport_pure_elems.csv") + df_exact = pd.read_csv(dfile) + df_exact.set_index("Element", inplace=True) + df_exact.drop(columns=["mp_id"], inplace=True) + + elem, elem_absent = get_elem_in_data(df_exact, as_pure=True) + res = [] + for e in elem: + res.append(df_exact.loc[e][self.props].values) + for e in elem_absent: + res.append(np.nan * np.ones(len(self.props))) + + res = np.array(res) + cols = self.props + df_exact = pd.DataFrame(res, columns=cols, index=elem + elem_absent) + df_exact = df_exact[self.props] + + if self.method == "exact": + return df_exact + else: + # + # Retrieve or compute the pseudo-inversed values. + # + dbfile = os.path.join(self.saving_dir, f"transport_pi_{self.alpha}.csv") + if os.path.isfile(dbfile): + df_pi = pd.read_csv(dbfile) + df_pi.set_index("Element", inplace=True) + df_pi = df_pi[self.props] + else: + warnings.warn( + """Pseudo-inverse values not found for this value of alpha. Recomputing them... + This can take a few seconds...""" + ) + TP = self.data.copy() + TP["1/m_p"] = 1 / (self.alpha + TP["m_p"].values) + TP["1/m_n"] = 1 / (self.alpha + TP["m_n"].values) + + df_pi = get_pseudo_inverse(TP) + + df_pi["m_p"] = 1 / df_pi["1/m_p"].values - self.alpha + df_pi["m_n"] = 1 / df_pi["1/m_n"].values - self.alpha + df_pi.drop(columns=["1/m_p", "1/m_n"], inplace=True) + df_pi.index.name = "Element" + + df_pi.to_csv(dbfile) + warnings.warn("The pseudo-inverse coefficients have been collected and stored.") + + if self.method == "pseudo_inverse": + return df_pi + elif self.method == "combined": + res_combined = [] + for i, e in df_exact.iterrows(): + if e.isnull().sum() == 0: + res_combined.append(e.values) + else: + res_combined.append(df_pi.loc[i].values) + res_combined = np.array(res_combined) + df_combined = pd.DataFrame(res_combined, columns=df_exact.columns, index=df_exact.index) + return df_combined + else: + raise ValueError("The method should be either exact or pseudo_inverse.") + + def get_elemental_property(self, elem, property_name): + estr = str(elem) + return self.all_element_data.loc[estr][property_name] diff --git a/matminer/utils/data_files/mp_transport/transport_database.tar.xz b/matminer/utils/data_files/mp_transport/transport_database.tar.xz new file mode 100644 index 000000000..e7c88ef98 Binary files /dev/null and b/matminer/utils/data_files/mp_transport/transport_database.tar.xz differ diff --git a/matminer/utils/data_files/mp_transport/transport_pure_elems.csv b/matminer/utils/data_files/mp_transport/transport_pure_elems.csv new file mode 100644 index 000000000..490094056 --- /dev/null +++ b/matminer/utils/data_files/mp_transport/transport_pure_elems.csv @@ -0,0 +1,51 @@ +mp_id,Element,sigma_p,sigma_n,S_p,S_n,kappa_p,kappa_n,PF_p,PF_n,m_p,m_n +mp-135,Li,925330,925361,-2.43243,-2.43481,6820550,6820580,5.47493,5.48581,3.05E-05,3.05E-05 +mp-20,Be,1093100,1093600,-18.1913,-18.169,7591050,7588400,361.73,361.009,2.58E-05,2.58E-05 +mp-160,B,95.0934,34.1649,390.474,-655.344,537.571,189.446,14.6366,14.6929,0.296333954,0.82480567 +mp-48,C,5252.98,5276.01,5.26675,-12.5669,111354,110907,0.0297537,1.10388,0.00536446,0.005341044 +mp-154,N,1.89187,10.0096,887.732,-707.978,5.53941,53.6861,1.49093,5.01714,14.89499979,2.815237696 +mp-12957,O,5.00329,10.6565,776.765,-738.948,21.7704,50.8498,3.0109,5.87144,6.41334,4.22984 +mp-127,Na,732035,732088,-3.22304,-3.2284,5406820,5407110,7.60438,7.63023,3.85E-05,3.85E-05 +mp-153,Mg,1192170,1192230,-3.36428,-3.3631,8709300,8709530,16.3136,16.2827,2.36E-05,2.36E-05 +mp-134,Al,3132280,3132330,-1.60472,-1.60452,22912300,22912900,8.06597,8.06411,9.00E-06,9.00E-06 +mp-149,Si,88.7122,95.893,525.926,-513.639,558.792,523.124,24.5376,25.299,0.317649695,0.293862985 +mp-157,P,165.26,182.981,209.384,-227.097,5389.87,5021.21,8.91698,9.31858,0.170515571,0.154001799 +mp-77,S,6.87888,4.19204,610.272,-660.083,24.552,11.3981,2.72008,1.84124,4.096510369,6.722121747 +mp-58,K,352794,352833,-4.52238,-4.52063,2584820,2585150,7.21529,7.21053,7.99E-05,7.99E-05 +mp-45,Ca,302737,302738,-0.809231,-0.806981,2196400,2196440,0.198249,0.197149,9.31E-05,9.31E-05 +mp-67,Sc,288099,288098,-0.945031,-0.942615,1974750,1974720,9.05286,9.06289,9.78E-05,9.78E-05 +mp-72,Ti,165898,165902,-4.22226,-4.22989,1331760,1331800,14.0871,14.1085,0.00016986,0.000169856 +mp-146,V,1355480,1355470,3.24851,3.24894,9812320,9812260,14.3041,14.3079,2.08E-05,2.08E-05 +mp-90,Cr,1025850,1025840,1.20852,1.21177,7357490,7357790,1.49828,1.50633,2.75E-05,2.75E-05 +mp-542909,Mn,53352.6,53352.6,-0.302024,-0.30516,429129,429131,0.00486675,0.00496835,0.000528173,0.000528173 +mp-13,Fe,813572,813563,3.87831,3.87783,5971310,5971220,12.2372,12.234,3.46E-05,3.46E-05 +mp-54,Co,705466,705480,-5.46444,-5.46471,5189260,5189310,21.1108,21.1134,3.99E-05,3.99E-05 +mp-23,Ni,812607,812601,10.1936,10.1932,6120530,6120510,84.4369,84.4304,3.47E-05,3.47E-05 +mp-30,Cu,1630220,1630230,-1.41329,-1.41834,12126400,12127100,3.2562,3.27951,1.73E-05,1.73E-05 +mp-79,Zn,1649870,1650010,-2.2726,-2.25933,11757100,11755300,8.567,8.47377,1.71E-05,1.71E-05 +mp-142,Ga,540375,540276,6.79349,6.77515,4114570,4114110,37.6701,37.6324,5.21E-05,5.22E-05 +mp-32,Ge,1764.96,1926.27,-84.9765,-118.158,53593.9,50314.7,12.7448,26.8933,0.015966029,0.014629 +mp-11,As,104292,104176,21.8047,21.7828,797177,796004,67.0258,66.7326,0.000270197,0.000270498 +mp-70,Rb,286207,286244,-4.43738,-4.43716,2092990,2093160,5.63552,5.63567,9.85E-05,9.84E-05 +mp-76,Sr,72801.8,72825.8,-30.5886,-30.7422,991104,990971,68.1178,68.8265,0.00038707,0.000386943 +mp-112,Y,320047,320045,0.876605,0.879684,2200450,2200410,1.9417,1.95116,8.80E-05,8.80E-05 +mp-131,Zr,304499,304524,-11.4099,-11.4144,2334460,2334590,41.553,41.5854,9.25E-05,9.25E-05 +mp-75,Nb,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN +mp-129,Mo,1491480,1491490,-0.00429693,0.00554035,10290200,10290100,2.75E-05,4.58E-05,1.89E-05,1.89E-05 +mp-33,Ru,1463730,1463720,0.434967,0.434432,10749000,10749000,5.23266,5.24393,1.93E-05,1.93E-05 +mp-74,Rh,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN +mp-2,Pd,1007340,1007330,8.33372,8.33356,7473540,7473380,69.9605,69.9571,2.80E-05,2.80E-05 +mp-124,Ag,1631090,1631240,-3.77911,-3.78152,12041200,12040500,23.2946,23.3265,1.73E-05,1.73E-05 +mp-85,In,1988630,1988620,-0.538077,-0.542589,14778700,14780000,0.575763,0.585455,1.42E-05,1.42E-05 +mp-117,Sn,27096.3,27158.8,-16.1396,-16.7422,252932,253013,7.05821,7.61264,0.001039972,0.001037579 +mp-104,Sb,28155.1,28216.8,-14.6875,-15.964,316195,316514,7.06549,7.97584,0.001000863,0.000998675 +mp-122,Ba,156444,156416,11.9025,11.9038,1145330,1145140,22.1632,22.1641,0.000180125,0.000180157 +mp-103,Hf,340719,340730,-3.76607,-3.76486,2485960,2486050,6.09816,6.09334,8.27E-05,8.27E-05 +mp-42,Ta,45661.8,45659.2,7.94558,7.95357,323410,323408,4.66919,4.6756,0.000617133,0.000617168 +mp-91,W,1087360,1087430,-6.02206,-6.04159,8615380,8615410,39.4334,39.6922,2.59E-05,2.59E-05 +mp-49,Os,1392840,1392830,-0.00168482,-0.00302132,10231700,10232000,0.71973,0.710894,2.02E-05,2.02E-05 +mp-101,Ir,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN +mp-126,Pt,1467730,1467710,6.41142,6.41156,10719000,10718800,60.3328,60.3349,1.92E-05,1.92E-05 +mp-81,Au,1559610,1559550,0.396481,0.394712,11447200,11448700,0.245167,0.242974,1.81E-05,1.81E-05 +mp-20483,Pb,1638450,1638460,-0.50349,-0.496498,11809500,11809800,0.415351,0.403897,1.72E-05,1.72E-05 +mp-23152,Bi,33807.5,33539.1,90.3033,89.4824,416624,416229,476.761,465.214,0.000833525,0.000840196 diff --git a/matminer/utils/data_files/optical_polyanskiy/README b/matminer/utils/data_files/optical_polyanskiy/README new file mode 100644 index 000000000..5477cb1ef --- /dev/null +++ b/matminer/utils/data_files/optical_polyanskiy/README @@ -0,0 +1,5 @@ +This data has been retrieved from https://github.com/polyanskiy/refractiveindex.info-database +on August 27, 2022. The script used to transform the data from the original repository to +an easier-to-use database for matminer can be found in matminer/dev_scripts/dataset_management/handle_refractiveindex.py +This script is ugly because the data format is not homogeneous and many things have been handled by hand. +Do not hesitate to contact Matgenix in case you have troubles to re-create the dataset used by matminer (in case you want to update the database). diff --git a/matminer/utils/data_files/optical_polyanskiy/database.tar.xz b/matminer/utils/data_files/optical_polyanskiy/database.tar.xz new file mode 100644 index 000000000..d67c29a66 Binary files /dev/null and b/matminer/utils/data_files/optical_polyanskiy/database.tar.xz differ diff --git a/matminer/utils/tests/test_data.py b/matminer/utils/tests/test_data.py index 52a801486..2d174a0c6 100644 --- a/matminer/utils/tests/test_data.py +++ b/matminer/utils/tests/test_data.py @@ -2,6 +2,7 @@ from math import isnan from unittest import TestCase +import pytest from pymatgen.core import Element from pymatgen.core.periodic_table import Specie @@ -12,7 +13,9 @@ MatscholarElementData, MEGNetElementData, MixingEnthalpy, + OpticalData, PymatgenData, + TransportData, ) @@ -20,7 +23,8 @@ class TestDemlData(TestCase): """Tests for the DemlData Class""" def setUp(self): - self.data_source = DemlData() + self.data_source = DemlData(impute_nan=False) + self.data_source_imputed = DemlData(impute_nan=True) def test_get_property(self): self.assertAlmostEqual( @@ -45,27 +49,102 @@ def test_get_property(self): self.data_source.get_charge_dependent_property_from_specie(Specie("V", 3), "xtal_field_split"), ) + self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "mus_fere"))) + self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "electron_affin"))) + with pytest.raises(KeyError): + self.data_source.get_elemental_property(Element("Og"), "first_ioniz") + with pytest.raises(KeyError): + self.data_source.get_elemental_property(Element("Og"), "total_ioniz") + self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "xtal_field_split"))) + + self.assertAlmostEqual( + -4.3853, + self.data_source_imputed.get_elemental_property(Element("Bi"), "mus_fere"), + 4, + ) + self.assertEqual( + 59600, + self.data_source_imputed.get_elemental_property(Element("Li"), "electron_affin"), + ) + self.assertAlmostEqual( + 2372300, + self.data_source_imputed.get_elemental_property(Element("He"), "first_ioniz"), + ) + self.assertAlmostEqual( + sum([2372300, 5250500]), + self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("He", 2), "total_ioniz"), + ) + self.assertAlmostEqual( + 18.6, + self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("V", 3), "xtal_field_split"), + ) + + self.assertAlmostEqual( + -3.934405, + self.data_source_imputed.get_elemental_property(Element("Og"), "mus_fere"), + 6, + ) + self.assertAlmostEqual( + 76858.235294, + self.data_source_imputed.get_elemental_property(Element("Og"), "electron_affin"), + 6, + ) + self.assertAlmostEqual( + 811242.222222, + self.data_source_imputed.get_elemental_property(Element("Og"), "first_ioniz"), + 6, + ) + self.assertAlmostEqual( + 811242.222222, + self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("Og", 2), "total_ioniz"), + 6, + ) + self.assertEqual( + 0, + self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("Og", 3), "xtal_field_split"), + ) + def test_get_oxidation(self): self.assertEqual([1], self.data_source.get_oxidation_states(Element("Li"))) + with pytest.raises(KeyError): + self.data_source.get_oxidation_states(Element("Og")) + + self.assertEqual([1], self.data_source_imputed.get_oxidation_states(Element("Li"))) + self.assertEqual([0], self.data_source_imputed.get_oxidation_states(Element("Og"))) class TestMagpieData(TestCase): def setUp(self): - self.data_source = MagpieData() + self.data_source = MagpieData(impute_nan=False) + self.data_source_imputed = MagpieData(impute_nan=True) def test_get_property(self): self.assertAlmostEqual( 9.012182, self.data_source.get_elemental_property(Element("Be"), "AtomicWeight"), ) + self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "AtomicWeight"))) + + self.assertAlmostEqual( + 9.012182, + self.data_source_imputed.get_elemental_property(Element("Be"), "AtomicWeight"), + ) + self.assertAlmostEqual( + 138.710894, self.data_source_imputed.get_elemental_property(Element("Og"), "AtomicWeight"), 6 + ) def test_get_oxidation(self): self.assertEqual([-4, 2, 4], self.data_source.get_oxidation_states(Element("C"))) + self.assertTrue(isnan(self.data_source.get_oxidation_states(Element("Og")))) + + self.assertEqual([-4, 2, 4], self.data_source_imputed.get_oxidation_states(Element("C"))) + self.assertEqual([3.0], self.data_source_imputed.get_oxidation_states(Element("Og"))) class TestPymatgenData(TestCase): def setUp(self): - self.data_source = PymatgenData() + self.data_source = PymatgenData(impute_nan=False) + self.data_source_imputed = PymatgenData(impute_nan=True) def test_get_property(self): self.assertAlmostEqual( @@ -76,16 +155,49 @@ def test_get_property(self): 1.26, self.data_source.get_charge_dependent_property(Element("Ac"), 3, "ionic_radii"), ) + with pytest.raises(KeyError): + self.data_source.get_charge_dependent_property(Element("Og"), 0, "ionic_radii") + with pytest.raises(IndexError): + self.data_source.get_charge_dependent_property(Element("Og"), 0, "common_oxidation_states") + with pytest.raises(IndexError): + self.data_source.get_charge_dependent_property(Element("Og"), 0, "icsd_oxidation_states") + + self.assertAlmostEqual( + 9.012182, + self.data_source_imputed.get_elemental_property(Element("Be"), "atomic_mass"), + ) + self.assertAlmostEqual( + 1.26, + self.data_source_imputed.get_charge_dependent_property(Element("Ac"), 3, "ionic_radii"), + ) + self.assertAlmostEqual( + 1.472889, self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "ionic_radii"), 6 + ) + self.assertEqual( + 0, + self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "common_oxidation_states"), + ) + self.assertEqual( + 0, + self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "icsd_oxidation_states"), + ) def test_get_oxidation(self): self.assertEqual((3,), self.data_source.get_oxidation_states(Element("Nd"))) self.data_source.use_common_oxi_states = False self.assertEqual((2, 3), self.data_source.get_oxidation_states(Element("Nd"))) + self.assertEqual((), self.data_source.get_oxidation_states(Element("Og"))) + + self.assertEqual((3,), self.data_source_imputed.get_oxidation_states(Element("Nd"))) + self.data_source_imputed.use_common_oxi_states = False + self.assertEqual((2, 3), self.data_source_imputed.get_oxidation_states(Element("Nd"))) + self.assertEqual((), self.data_source_imputed.get_oxidation_states(Element("Og"))) class TestMatScholarData(TestCase): def setUp(self): - self.data_source = MatscholarElementData() + self.data_source = MatscholarElementData(impute_nan=False) + self.data_source_imputed = MatscholarElementData(impute_nan=True) def test_get_property(self): embedding_cu = self.data_source.get_elemental_property(Element("Cu"), "embedding 3") @@ -94,10 +206,17 @@ def test_get_property(self): with self.assertRaises(KeyError): self.data_source.get_elemental_property(Element("Db"), "embedding 9") + embedding_cu = self.data_source_imputed.get_elemental_property(Element("Cu"), "embedding 3") + self.assertAlmostEqual(0.028666902333498, embedding_cu) + + embedding_db = self.data_source_imputed.get_elemental_property(Element("Db"), "embedding 9") + self.assertAlmostEqual(0.015063986881835006, embedding_db) + class TestMEGNetData(TestCase): def setUp(self): - self.data_source = MEGNetElementData() + self.data_source = MEGNetElementData(impute_nan=False) + self.data_source_imputed = MEGNetElementData(impute_nan=True) def test_get_property(self): embedding_cu = self.data_source.get_elemental_property(Element("Cu"), "embedding 1") @@ -111,16 +230,27 @@ def test_get_property(self): embedding_dummy = self.data_source.all_element_data["Dummy"]["embedding 1"] self.assertAlmostEqual(-0.044910576194524765, embedding_dummy) + embedding_cu = self.data_source_imputed.get_elemental_property(Element("Cu"), "embedding 1") + self.assertAlmostEqual(0.18259364366531372, embedding_cu) + + embedding_md = self.data_source_imputed.get_elemental_property(Element("Md"), "embedding 1") + self.assertAlmostEqual(-0.0010715560693489877, embedding_md) + class TestMixingEnthalpy(TestCase): def setUp(self): - self.data = MixingEnthalpy() + self.data = MixingEnthalpy(impute_nan=False) + self.data_imputed = MixingEnthalpy(impute_nan=True) def test_get_data(self): self.assertEqual(-27, self.data.get_mixing_enthalpy(Element("H"), Element("Pd"))) self.assertEqual(-27, self.data.get_mixing_enthalpy(Element("Pd"), Element("H"))) self.assertTrue(isnan(self.data.get_mixing_enthalpy(Element("He"), Element("H")))) + self.assertEqual(-27, self.data_imputed.get_mixing_enthalpy(Element("H"), Element("Pd"))) + self.assertEqual(-27, self.data_imputed.get_mixing_enthalpy(Element("Pd"), Element("H"))) + self.assertAlmostEqual(-14.203577, self.data_imputed.get_mixing_enthalpy(Element("He"), Element("H")), 6) + class TestIUCrBondValenceData(TestCase): def setUp(self): @@ -131,5 +261,55 @@ def test_get_data(self): self.assertAlmostEqual(nacl["Ro"], 2.15) +class TestOpticalData(TestCase): + def setUp(self): + self.data_source = OpticalData(impute_nan=False) + self.data_source_imputed = OpticalData(impute_nan=True) + + def test_get_data(self): + au_r = self.data_source.get_elemental_property(elem="Au", property_name="R_400.0") + self.assertAlmostEqual(au_r, 0.25489326484782054) + ag_n = self.data_source.get_elemental_property(elem="Ag", property_name="n_600.0") + self.assertAlmostEqual(ag_n, 0.13644859985917585) + c_k = self.data_source.get_elemental_property(elem="C", property_name="k_760.0") + self.assertAlmostEqual(c_k, 0.7462931865379264) + og_r = self.data_source.get_elemental_property(elem="Og", property_name="R_400.0") + self.assertTrue(isnan(og_r)) + + au_r = self.data_source_imputed.get_elemental_property(elem="Au", property_name="R_400.0") + self.assertAlmostEqual(au_r, 0.25489326484782054) + ag_n = self.data_source_imputed.get_elemental_property(elem="Ag", property_name="n_600.0") + self.assertAlmostEqual(ag_n, 0.13644859985917585) + c_k = self.data_source_imputed.get_elemental_property(elem="C", property_name="k_760.0") + self.assertAlmostEqual(c_k, 0.7462931865379264) + og_r = self.data_source_imputed.get_elemental_property(elem="Og", property_name="R_400.0") + self.assertAlmostEqual(og_r, 0.4624005395190695) + + +class TestTransportData(TestCase): + def setUp(self): + self.data_source = TransportData(impute_nan=False) + self.data_source_imputed = TransportData(impute_nan=True) + + def test_get_data(self): + ca_mn = self.data_source.get_elemental_property(elem="Ca", property_name="m_n") + self.assertAlmostEqual(ca_mn, 0.000279554) + cr_sigmap = self.data_source.get_elemental_property(elem="Cr", property_name="sigma_p") + self.assertAlmostEqual(cr_sigmap, 205569.89838499163) + cu_kappan = self.data_source.get_elemental_property(elem="Cu", property_name="kappa_n") + self.assertAlmostEqual(cu_kappan, 1814544.75663, places=5) + og_mn = self.data_source.get_elemental_property(elem="Og", property_name="m_n") + self.assertTrue(isnan(og_mn)) + + ca_mn = self.data_source_imputed.get_elemental_property(elem="Ca", property_name="m_n") + self.assertAlmostEqual(ca_mn, 0.000279554) + cr_sigmap = self.data_source_imputed.get_elemental_property(elem="Cr", property_name="sigma_p") + self.assertAlmostEqual(cr_sigmap, 205569.89838499163) + cu_kappan = self.data_source_imputed.get_elemental_property(elem="Cu", property_name="kappa_n") + self.assertAlmostEqual(cu_kappan, 1814544.75663, places=5) + og_mn = self.data_source_imputed.get_elemental_property(elem="Og", property_name="m_n") + self.assertAlmostEqual(og_mn, 0.03237036761677134) + + if __name__ == "__main__": unittest.main() diff --git a/matminer/utils/tests/test_utils.py b/matminer/utils/tests/test_utils.py new file mode 100644 index 000000000..9773dbcf5 --- /dev/null +++ b/matminer/utils/tests/test_utils.py @@ -0,0 +1,37 @@ +import os +import unittest +from unittest import TestCase + +import pandas as pd +from pymatgen.core import Composition + +from matminer.utils.utils import get_elem_in_data, get_pseudo_inverse + +test_dir = os.path.dirname(os.path.abspath(__file__)) + + +class UtilsTest(TestCase): + def setUp(self): + data = pd.read_csv(os.path.join(test_dir, "utils_dataframe.csv")) + data.set_index("Compound", inplace=True) + comp = [] + for c in data.index: + comp.append(Composition(c)) + data["Composition"] = comp + self.data = data + + def test_get_elem_in_data(self): + elem, elem_absent = get_elem_in_data(self.data, as_pure=True) + self.assertListEqual(elem, ["Zn", "Au"]) + + elem, elem_absent = get_elem_in_data(self.data, as_pure=False) + self.assertListEqual(elem, ["N", "O", "Al", "Si", "S", "Zn", "Ga", "Ge", "As", "Y", "Ag", "Sn", "Au"]) + + def test_get_pseudo_inverse(self): + PI = get_pseudo_inverse(self.data, cols=["n_380.0"]) + self.assertAlmostEqual(PI["n_380.0"][0], -1.6345896211391995) + self.assertAlmostEqual(PI["n_380.0"][4], 4.155467999999999) + + +if __name__ == "__main__": + unittest.main() diff --git a/matminer/utils/tests/utils_dataframe.csv b/matminer/utils/tests/utils_dataframe.csv new file mode 100644 index 000000000..99fa9c578 --- /dev/null +++ b/matminer/utils/tests/utils_dataframe.csv @@ -0,0 +1,10 @@ +Compound,n_380.0 +Y3 Al5 O12,1.866205082 +Zn,9.671 +Au2 Ag3,1.12038 +Al198 Ga802 As1000,4.1915 +Si3 N4,2.098345509 +Al1 As1,4.010296333 +Sn1 S2,3.46289 +Si85 Ge15,6.234 +Au,1.617719566 diff --git a/matminer/utils/utils.py b/matminer/utils/utils.py index 4191addc6..7f91fd9a4 100644 --- a/matminer/utils/utils.py +++ b/matminer/utils/utils.py @@ -1,6 +1,10 @@ +""" Utility module """ import warnings +import numpy as np import pandas as pd +from numpy.linalg import pinv +from pymatgen.core import Composition, Element def homogenize_multiindex(df, default_key, coerce=False): @@ -38,3 +42,100 @@ def homogenize_multiindex(df, default_key, coerce=False): "multiindexed Matminer featurization without coercion " "to 2 levels." ) + + +def get_elem_in_data(df, as_pure=False): + """ + Looks for all elements present in the compounds forming the index of a dataframe. + + Args: + df: DataFrame containing the compounds formula (as strings) as index + as_pure: if True, consider only the pure compounds + + Returns: + List of elements (str) + """ + elems_in_df = [] + elems_not_in_df = [] + + # Find the elements in the data, as pure or not + if as_pure: + for elem in Element: + if elem.name in df.index: + elems_in_df.append(elem.name) + else: + for elem in Element: + for compound in df.index.to_list(): + if elem.name in compound and elem.name not in elems_in_df: + elems_in_df.append(elem.name) + + # Find the elements not in the data + for elem in Element: + if elem.name not in elems_in_df: + elems_not_in_df.append(elem.name) + + return elems_in_df, elems_not_in_df + + +def get_pseudo_inverse(df_init, cols=None): + """ + Computes the pseudo-inverse matrix of a dataframe containing properties for multiple compositions. + From a compositions matrix (containing the elemental fractions of each element present in the data), + and a property column (or multiple properties gathered in a matrix), the pseudo-inverse column + (or matrix if multiple properties are present) is defined by + + compositions * pseudo-inverse = property + + The pseudo-inverse coefficients are therefore average contributions of each element present in the data to the + property of the compounds containing this element (in a least-square fit manner). + This allows to take many compounds-property into consideration. + + Note that the pseudo-inverse coefficients do not represent the property of single elements in their pure form: + negative values may appear for single elements and for physically-positive properties, but this only reflects + the fact that the presence of the element in compounds generally decreases the value of the property. + + For elements that are not present in compounds of the data, nan are returned. + + Args: + df_init: DataFrame with a column named "Composition" containing compositions + (anything that can be turned into a Pymatgen Composition object), + and other columns containing properties to be inversed. + cols: list of columns of the dataframe giving the features to be pseudo-inversed. + + Returns: + DataFrame with the pseudo-inverse coefficients for all elements present in the initial compositions and all + properties. + """ + df = df_init.copy() + + if "Composition" not in df.columns: + raise LookupError("The DataFrame should contain a column named Composition") + + if cols is None: + cols = list(df.columns) + cols.remove("Composition") + + data = df[cols] + data.index = df["Composition"].map(lambda s: str(s)) + + elems_in_df, elems_not_in_df = get_elem_in_data(data, as_pure=False) + n_elem_tot = len(elems_in_df) + + # Initialize the matrix + A = np.zeros([len(data), n_elem_tot]) + + compos = df["Composition"] + for i, comp in enumerate(compos): + comp = Composition(comp) + for j in range(n_elem_tot): + if elems_in_df[j] in comp: + A[i, j] = comp.get_atomic_fraction(elems_in_df[j]) + + pi_A = pinv(A) + + res_pi = pi_A @ data.values + res_pi = np.vstack([res_pi, np.nan * np.ones([len(elems_not_in_df), len(df.T) - 1])]) + + df_pi = pd.DataFrame(res_pi, columns=cols, index=pd.Index(elems_in_df + elems_not_in_df)) + + return df_pi diff --git a/matminer/utils/warnings.py b/matminer/utils/warnings.py new file mode 100644 index 000000000..e11f9c965 --- /dev/null +++ b/matminer/utils/warnings.py @@ -0,0 +1,6 @@ +IMPUTE_NAN_WARNING = """In a future release, impute_nan will be set to True by default. + This means that features that are missing or are NaNs for elements + from the data source will be replaced by the average of that value + over the available elements. + This avoids NaNs after featurization that are often replaced by + dataset-dependent averages."""