diff --git a/.gitignore b/.gitignore
index c375d17de..b32a3093b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,4 @@ dataset_prep.log
 
 # Datasets
 matminer/datasets/*.json.gz
+matminer/utils/data_files/mp_transport/transport_database.csv
diff --git a/dev_scripts/dataset_management/refind2matminer.py b/dev_scripts/dataset_management/refind2matminer.py
new file mode 100644
index 000000000..7f30de658
--- /dev/null
+++ b/dev_scripts/dataset_management/refind2matminer.py
@@ -0,0 +1,204 @@
+import os
+import glob
+
+"""
+Script used to transform the data from the original repository to 
+an easier-to-use database for matminer. Very ugly but should work, 
+at least with the commit 5a22ce0a3fa41fe6e9d26330686b32d83388e760 of 
+https://github.com/polyanskiy/refractiveindex.info-database
+
+Place this script in your refractiveindex repo, and run it.
+A folder database_matminer should appear containing the new database.
+
+We use only files from main and other/semiconductors alloys, other/alloys and other/intermetallics.
+Within these data, some files are not used because they are outliers when looking at
+plots of properties vs composition. 
+Some files are renamed to represent the composition in an cleaner way.
+
+Overall, this is very ugly and represents manual manipulations. 
+This is just added so that anyone can update the database used by matminer 
+with new data added on the refractiveindex repository. 
+Add only data that you are sure is representative of 3D crystals, there are no automatic checks!
+"""
+
+os.system("cp -r database database_copy")
+os.mkdir("database_matminer")
+
+# Get the list of files to remove
+files_to_remove = [
+    "data/main/Al2O3/Querry.yml",
+    "data/main/Au/*-*nm*.yml",
+    "data/main/C/Arakawa.yml",
+    "data/main/C/Djurisic-e.yml",
+    "data/main/C/Djurisic-o.yml",
+    "data/main/C/El-Sayed.yml",
+    "data/main/C/Ermolaev.yml",
+    "data/main/C/Hagemann.yml",
+    "data/main/C/Querry*.yml",
+    "data/main/C/Song*.yml",
+    "data/main/C/Weber.yml",
+    "data/main/Ge/*nm*.yml",
+    "data/main/InSb/Adachi.yml",
+    "data/main/MoS2/Beal.yml",
+    "data/main/MoS2/Ermolaev*.yml",
+    "data/main/MoS2/Hsu*.yml",
+    "data/main/MoS2/Islam*.yml",
+    "data/main/MoS2/Jung.yml",
+    "data/main/MoS2/Song-*L.yml",
+    "data/main/MoS2/Yim*.yml",
+    "data/main/MoS2/Zhang.yml",
+    "data/main/Na/Inagaki-liquid.yml",
+    "data/main/Bi2Se3/Fang-2L.yml",
+    "data/main/Pb/Mathewson-140K.yml",
+    "data/main/Si/Pierce.yml",
+    "data/main/WSe2/*L*.yml",
+    "data/main/*/*Werner*.yml",
+    "data/other/alloys/Ni-Fe/*10nm*",
+    "data/other/alloys/Ni-Fe/*gold150nm*",
+    "data/other/intermetallics/AuAl2/Supansomboon.yml",
+    "data/main/*/Munkhbat-e.yml",
+    "data/main/*/Munkhbat-gamma.yml"
+]
+
+flat_list = []
+for f in files_to_remove:
+    flat_list += glob.glob(os.path.join("database_copy", f))
+
+flat_list += ["\'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-0.yml\'"]
+flat_list += ["\'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-0.yml\'"]
+flat_list.remove("database_copy/data/main/Ta/Werner.yml")
+flat_list.remove("database_copy/data/main/Ta/Werner-DFT.yml")
+
+files_to_remove = flat_list
+
+for f in files_to_remove:
+    os.system("rm " + f)
+
+# Also change a file
+os.system("sed -i \'s/760/780/g\' database_copy/data/main/C/Peter.yml")
+
+# Move some files
+os.system("mv database_copy/data/other/alloys/Au-Ag/Rioux-Au0Ag100.yml database_copy/data/main/Ag/Rioux.yml")
+os.system("mv database_copy/data/other/alloys/Au-Ag/Rioux-Au100Ag0.yml database_copy/data/main/Au/Rioux.yml")
+os.system("mv \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-0.yml\' database_copy/data/main/GaSb/Ferrini.yml")
+
+# Remove entire folders
+folders_to_remove = [
+    "data/other/alloys/Pd-H",
+    "data/other/alloys/V-H",
+    "data/other/alloys/Zr-H",
+    "data/other/alloys/Nb-Sn",
+    "data/other/alloys/V-Ga"
+]
+
+for f in folders_to_remove:
+    os.system("rm -rf database_copy/" + f)
+
+# Now the database has been filtered.
+# We move the files into database_matminer,
+# with some renaming...
+os.system("cp -r database_copy/data/main/* database_matminer/")
+os.system("cp -r database_copy/data/other/intermetallics/* database_matminer/")
+os.mkdir("database_matminer/Au2Ag3")
+os.mkdir("database_matminer/Au3Ag2")
+os.mkdir("database_matminer/Au3Ag7")
+os.mkdir("database_matminer/Au4Ag")
+os.mkdir("database_matminer/Au7Ag3")
+os.mkdir("database_matminer/Au9Ag")
+os.mkdir("database_matminer/AuAg")
+os.mkdir("database_matminer/AuAg4")
+os.mkdir("database_matminer/AuAg9")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au40Ag60.yml database_matminer/Au2Ag3/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au60Ag40.yml database_matminer/Au3Ag2/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au30Ag70.yml database_matminer/Au3Ag7/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au80Ag20.yml database_matminer/Au4Ag/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au70Ag30.yml database_matminer/Au7Ag3/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au90Ag10.yml database_matminer/Au9Ag/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au50Ag50.yml database_matminer/AuAg/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au20Ag80.yml database_matminer/AuAg4/")
+os.system("cp database_copy/data/other/alloys/Au-Ag/Rioux-Au10Ag90.yml database_matminer/AuAg9/")
+
+os.mkdir("database_matminer/Cu17Zn3")
+os.mkdir("database_matminer/Cu7Zn3")
+os.mkdir("database_matminer/Cu9Zn")
+os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu85Zn15.yml database_matminer/Cu17Zn3/")
+os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu70Zn30.yml database_matminer/Cu7Zn3/")
+os.system("cp database_copy/data/other/alloys/Cu-Zn/Querry-Cu90Zn10.yml database_matminer/Cu9Zn/")
+
+os.mkdir("database_matminer/Ni4Fe")
+os.system("cp database_copy/data/other/alloys/Ni-Fe/Tikuisis_bare150nm.yml database_matminer/Ni4Fe/")
+
+
+os.mkdir("database_matminer/Al198Ga802As1000")
+os.mkdir("database_matminer/Al219Ga781As1000")
+os.mkdir("database_matminer/Al315Ga685As1000")
+os.mkdir("database_matminer/Al342Ga658As1000")
+os.mkdir("database_matminer/Al411Ga589As1000")
+os.mkdir("database_matminer/Al419Ga581As1000")
+os.mkdir("database_matminer/Al452Ga548As1000")
+os.mkdir("database_matminer/Al491Ga509As1000")
+os.mkdir("database_matminer/Al590Ga410As1000")
+os.mkdir("database_matminer/Al700Ga300As1000")
+os.mkdir("database_matminer/Al804Ga196As1000")
+os.mkdir("database_matminer/Al97Ga903As1000")
+os.mkdir("database_matminer/Al99Ga901As1000")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-19.8.yml\' database_matminer/Al198Ga802As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-21.9.yml\' database_matminer/Al219Ga781As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Adachi-0.315.yml\' database_matminer/Al315Ga685As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-31.5.yml\' database_matminer/Al315Ga685As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-34.2.yml\' database_matminer/Al342Ga658As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-41.1.yml\' database_matminer/Al411Ga589As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-41.9.yml\' database_matminer/Al419Ga581As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-45.2.yml\' database_matminer/Al452Ga548As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-49.1.yml\' database_matminer/Al491Ga509As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-59.0.yml\' database_matminer/Al590Ga410As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Adachi-0.700.yml\' database_matminer/Al700Ga300As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-70.0.yml\' database_matminer/Al700Ga300As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-80.4.yml\' database_matminer/Al804Ga196As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Papatryfonos-9.7.yml\' database_matminer/Al97Ga903As1000/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlAs-GaAs/Aspnes-9.9.yml\' database_matminer/Al99Ga901As1000/")
+
+os.mkdir("database_matminer/Al2329O2612N588")
+os.mkdir("database_matminer/Al2351O2547N653")
+os.mkdir("database_matminer/Al2356O2531N669")
+os.mkdir("database_matminer/Al2372O2483N717")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-5.88.yml\' database_matminer/Al2329O2612N588/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-6.53.yml\' database_matminer/Al2351O2547N653/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-6.69.yml\' database_matminer/Al2356O2531N669/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlN-Al2O3/Hartnett-7.17.yml\' database_matminer/Al2372O2483N717/")
+
+os.mkdir("database_matminer/Al10Ga90Sb100")
+os.mkdir("database_matminer/Al30Ga70Sb100")
+os.mkdir("database_matminer/Al50Ga50Sb100")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-10.yml\' database_matminer/Al10Ga90Sb100/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-30.yml\' database_matminer/Al30Ga70Sb100/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/AlSb-GaSb/Ferrini-50.yml\' database_matminer/Al50Ga50Sb100/")
+
+os.mkdir("database_matminer/In52Ga48As100")
+os.system("cp \'database_copy/data/other/semiconductor alloys/GaAs-InAs/Adachi.yml\' database_matminer/In52Ga48As100/")
+
+os.mkdir("database_matminer/In52Ga48As24P76")
+os.system("cp \'database_copy/data/other/semiconductor alloys/GaAs-InAs-GaP-InP/Adachi.yml\' database_matminer/In52Ga48As24P76/")
+
+os.mkdir("database_matminer/Ga51In49P100")
+os.system("cp \'database_copy/data/other/semiconductor alloys/GaP-InP/Schubert.yml\' database_matminer/Ga51In49P100/")
+
+os.mkdir("database_matminer/Si11Ge89")
+os.mkdir("database_matminer/Si20Ge80")
+os.mkdir("database_matminer/Si28Ge72")
+os.mkdir("database_matminer/Si47Ge53")
+os.mkdir("database_matminer/Si48Ge52")
+os.mkdir("database_matminer/Si65Ge35")
+os.mkdir("database_matminer/Si85Ge15")
+os.mkdir("database_matminer/Si98Ge2")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-11.yml\' database_matminer/Si11Ge89/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-20.yml\' database_matminer/Si20Ge80/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-28.yml\' database_matminer/Si28Ge72/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-47.yml\' database_matminer/Si47Ge53/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-48.yml\' database_matminer/Si48Ge52/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-65.yml\' database_matminer/Si65Ge35/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-85.yml\' database_matminer/Si85Ge15/")
+os.system("cp \'database_copy/data/other/semiconductor alloys/Si-Ge/Jellison-98.yml\' database_matminer/Si98Ge2/")
+
+# Remove unncessary folders and files
+os.system("rm -r database_copy")
diff --git a/matminer/featurizers/composition/alloy.py b/matminer/featurizers/composition/alloy.py
index e30073f86..bc661df51 100644
--- a/matminer/featurizers/composition/alloy.py
+++ b/matminer/featurizers/composition/alloy.py
@@ -4,6 +4,7 @@
 
 import collections
 import os
+import warnings
 from functools import reduce
 
 import numpy as np
@@ -16,6 +17,7 @@
 from matminer.featurizers.composition.packing import AtomicPackingEfficiency
 from matminer.featurizers.utils.stats import PropertyStats
 from matminer.utils.data import CohesiveEnergyData, MagpieData, MixingEnthalpy
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 module_dir = os.path.dirname(os.path.abspath(__file__))
 data_dir = os.path.join(module_dir, "..", "..", "utils", "data_files")
@@ -56,6 +58,9 @@ class Miedema(BaseFeaturizer):
             'valence_electrons', 'a_const', 'R_const', 'H_trans'
             'compressibility', 'shear_modulus', 'melting_point'
             'structural_stability'. Please see the references for details.
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     Returns:
         (list of floats) Miedema formation enthalpies (eV/atom) for input
             struct_types:
@@ -65,7 +70,7 @@ class Miedema(BaseFeaturizer):
             -Miedema_deltaH_amor: for amorphous phase
     """
 
-    def __init__(self, struct_types="all", ss_types="min", data_source="Miedema"):
+    def __init__(self, struct_types="all", ss_types="min", data_source="Miedema", impute_nan=False):
         if isinstance(struct_types, list):
             self.struct_types = struct_types
         else:
@@ -88,13 +93,31 @@ def __init__(self, struct_types="all", ss_types="min", data_source="Miedema"):
         else:
             raise NotImplementedError(f"data_source {data_source} not implemented yet")
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            elem_list = [Element(estr) for estr in self.df_dataset.index]
+            missing_elements = [e for e in Element if e not in elem_list]
+            missing_element_names = [e.symbol for e in missing_elements]
+            element_ids = [e.Z for e in missing_elements]
+            indices = list(range(len(elem_list), len(elem_list) + len(missing_elements)))
+            df_missing = pd.DataFrame(data=[indices, missing_element_names, element_ids]).T
+            df_missing.columns = ["Unnamed: 0", "element", "element_id"]
+            df_missing.set_index("element", inplace=True)
+            for col in self.df_dataset:
+                if col not in df_missing:
+                    df_missing[col] = np.mean(self.df_dataset[col])
+            self.df_dataset = pd.concat([self.df_dataset, df_missing])
+            self.df_dataset.fillna(self.df_dataset.mean(), inplace=True)
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
         self.element_list = [Element(estr) for estr in self.df_dataset.index]
 
     def precheck(self, c: Composition) -> bool:
         """
         Precheck a single entry. Miedema does not work for compositions
         containing any elements for which the Miedema model has no parameters.
-        To precheck an entire dataframe (qnd automatically gather
+        To precheck an entire dataframe (and automatically gather
         the fraction of structures that will pass the precheck), please use
         precheck_dataframe.
 
@@ -195,7 +218,9 @@ def deltaH_elast(self, elements, fracs):
                 alp[1] * (elec[1] - elec[0]) / n_ws[1],
             ]
         )
-        alp_a = np.multiply(1.5, np.power(v_a, 2 / 3)) / reduce(lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3))
+        alp_a = np.multiply(1.5, np.power(np.abs(v_a), 2 / 3)) / reduce(
+            lambda x, y: 1 / x + 1 / y, np.power(n_ws, 1 / 3)
+        )
 
         # effective volume in alloy
         vab_a = v_molar[0] + np.array(
@@ -212,13 +237,17 @@ def deltaH_elast(self, elements, fracs):
         )
 
         # H_elast A in B
-        hab_elast = (2 * compr[0] * shear_mod[1] * np.power((vab_a[0] - vba_a[0]), 2)) / (
-            4 * shear_mod[1] * vab_a[0] + 3 * compr[0] * vba_a[0]
-        )
-        # H_elast B in A
-        hba_elast = (2 * compr[1] * shear_mod[0] * np.power((vba_a[1] - vab_a[1]), 2)) / (
-            4 * shear_mod[0] * vba_a[1] + 3 * compr[1] * vab_a[1]
-        )
+        if all(compr == 0) and all(shear_mod == 0):  # This is the case for H-N
+            hab_elast = 0
+            hba_elast = 0
+        else:
+            hab_elast = (2 * compr[0] * shear_mod[1] * np.power((vab_a[0] - vba_a[0]), 2)) / (
+                4 * shear_mod[1] * vab_a[0] + 3 * compr[0] * vba_a[0]
+            )
+            # H_elast B in A
+            hba_elast = (2 * compr[1] * shear_mod[0] * np.power((vba_a[1] - vab_a[1]), 2)) / (
+                4 * shear_mod[0] * vba_a[1] + 3 * compr[1] * vab_a[1]
+            )
 
         deltaH_elast = np.multiply.reduce(fracs, 0) * (fracs[1] * hab_elast + fracs[0] * hba_elast)
         return deltaH_elast
@@ -457,17 +486,25 @@ class YangSolidSolution(BaseFeaturizer):
         Yang omega - Mixing thermochemistry feature, Omega
         Yang delta - Atomic size mismatch term
 
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
+
     References:
         .. Yang and Zhang (2012) `https://linkinghub.elsevier.com/retrieve/pii/S0254058411009357`.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
         # Load in the mixing enthalpy data
         #  Creates a lookup table of the liquid mixing enthalpies
-        self.dhf_mix = MixingEnthalpy()
+        self.dhf_mix = MixingEnthalpy(impute_nan=self.impute_nan)
 
         # Load in a table of elemental properties
-        self.elem_data = MagpieData()
+        self.elem_data = MagpieData(impute_nan=self.impute_nan)
 
     def precheck(self, c: Composition) -> bool:
         """
@@ -476,7 +513,7 @@ def precheck(self, c: Composition) -> bool:
         parameters. We can nearly equivalently approximate this by checking
         against the unary element list.
 
-        To precheck an entire dataframe (qnd automatically gather
+        To precheck an entire dataframe (and automatically gather
         the fraction of structures that will pass the precheck), please use
         precheck_dataframe.
 
@@ -553,6 +590,9 @@ def compute_delta(self, comp):
 
         elements, fractions = zip(*comp.element_composition.items())
 
+        if len(fractions) == 1:
+            return 0.0
+
         # Get the radii of elements
         radii = self.elem_data.get_elemental_properties(elements, "MiracleRadius")
         mean_r = PropertyStats.mean(radii, fractions)
@@ -612,15 +652,23 @@ class WenAlloys(BaseFeaturizer):
         Shear modulus strength model
 
     Copyright 2020 Battelle Energy Alliance, LLC  ALL RIGHTS RESERVED
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         # Use of Miedema to retrieve the shear modulus
-        self.data_source_miedema = Miedema(data_source="Miedema")
-        self.data_source_magpie = MagpieData().all_elemental_props
-        self.data_source_cohesive_energy = CohesiveEnergyData()
-        self.data_source_enthalpy = MixingEnthalpy()
-        self.yss = YangSolidSolution()
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source_miedema = Miedema(data_source="Miedema", impute_nan=self.impute_nan)
+        self.data_source_magpie = MagpieData(impute_nan=self.impute_nan).all_elemental_props
+        self.data_source_cohesive_energy = CohesiveEnergyData(impute_nan=self.impute_nan)
+        self.data_source_enthalpy = MixingEnthalpy(impute_nan=self.impute_nan)
+        self.yss = YangSolidSolution(impute_nan=self.impute_nan)
 
     def precheck(self, comp):
         return self.yss.precheck(comp)
@@ -746,11 +794,14 @@ def compute_delta(variable, fractions):
         element :math:`i`, and :math:`\\bar{v}` is the fraction-weighted
         average of the variable.
         Args:
-            variable (list): List of properties to asses
-            fractions (list): List of fractions to asses
+            variable (list): List of properties to assess
+            fractions (list): List of fractions to assess
         Returns:
             (float) delta
         """
+        if len(fractions) == 1:
+            return 0.0
+
         mean_variable = PropertyStats.mean(variable, fractions)
         dev_variable = np.power(1.0 - np.divide(variable, mean_variable), 2)
         return np.sqrt(PropertyStats.mean(dev_variable, fractions))
@@ -864,6 +915,9 @@ def compute_strength_local_mismatch_shear(shear_modulus, mean_shear_modulus, fra
 
         array_shear = np.array(shear_modulus)
         array_fractions = np.array(fractions)
+        if len(fractions) == 1:
+            return 0.0
+
         modulus_combination = (
             2 * array_fractions * (array_shear - mean_shear_modulus) / (array_shear + mean_shear_modulus)
         )
diff --git a/matminer/featurizers/composition/composite.py b/matminer/featurizers/composition/composite.py
index eec6cd89d..f13269c44 100644
--- a/matminer/featurizers/composition/composite.py
+++ b/matminer/featurizers/composition/composite.py
@@ -2,6 +2,8 @@
 Composition featurizers for composite features containing more than 1 category of general-purpose data.
 """
 
+import warnings
+
 from matminer.featurizers.base import BaseFeaturizer
 from matminer.featurizers.composition.element import ElementFraction
 from matminer.featurizers.composition.orbital import ValenceOrbital
@@ -11,8 +13,11 @@
     MagpieData,
     MatscholarElementData,
     MEGNetElementData,
+    OpticalData,
     PymatgenData,
+    TransportData,
 )
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class ElementProperty(BaseFeaturizer):
@@ -43,21 +48,37 @@ class ElementProperty(BaseFeaturizer):
             (these must be supported by data_source)
         stats (list of strings): a list of weighted statistics to compute to for each
             property (see PropertyStats for available stats)
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self, data_source, features, stats):
+    def __init__(self, data_source, features, stats, impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
         if data_source == "pymatgen":
-            self.data_source = PymatgenData()
+            self.data_source = PymatgenData(impute_nan=self.impute_nan)
         elif data_source == "magpie":
-            self.data_source = MagpieData()
+            self.data_source = MagpieData(impute_nan=self.impute_nan)
         elif data_source == "deml":
-            self.data_source = DemlData()
+            self.data_source = DemlData(impute_nan=self.impute_nan)
         elif data_source == "matscholar_el":
-            self.data_source = MatscholarElementData()
+            self.data_source = MatscholarElementData(impute_nan=self.impute_nan)
         elif data_source == "megnet_el":
-            self.data_source = MEGNetElementData()
+            self.data_source = MEGNetElementData(impute_nan=self.impute_nan)
+        elif data_source == "optical":
+            self.data_source = OpticalData(impute_nan=self.impute_nan)
+        elif data_source == "mp_transport":
+            self.data_source = TransportData(impute_nan=self.impute_nan)
         else:
             self.data_source = data_source
+            if self.impute_nan:
+                warnings.warn(
+                    """The data_source has been specified externally and impute_nan is set to True.
+                    Please make sure that the NaNs imputation has been done correctly
+                    in the provided data_source to proceed."""
+                )
 
         self.features = features
         self.stats = stats
@@ -65,12 +86,15 @@ def __init__(self, data_source, features, stats):
         self.pstats = PropertyStats()
 
     @classmethod
-    def from_preset(cls, preset_name):
+    def from_preset(cls, preset_name, impute_nan=False):
         """
         Return ElementProperty from a preset string
         Args:
             preset_name: (str) can be one of "magpie", "deml", "matminer",
                 "matscholar_el", or "megnet_el".
+            impute_nan (bool): if True, the features for the elements
+                that are missing from the data_source or are NaNs are replaced by the
+                average of each features over the available elements.
 
         Returns:
             ElementProperty based on the preset name.
@@ -147,17 +171,27 @@ def from_preset(cls, preset_name):
         elif preset_name == "matscholar_el":
             data_source = "matscholar_el"
             stats = ["minimum", "maximum", "range", "mean", "std_dev"]
-            features = MatscholarElementData().prop_names
+            features = MatscholarElementData(impute_nan=impute_nan).prop_names
 
         elif preset_name == "megnet_el":
             data_source = "megnet_el"
             stats = ["minimum", "maximum", "range", "mean", "std_dev"]
-            features = MEGNetElementData().prop_names
+            features = MEGNetElementData(impute_nan=impute_nan).prop_names
+
+        elif preset_name == "optical":
+            data_source = "optical"
+            stats = ["minimum", "maximum", "range", "mean", "std_dev", "mode"]
+            features = OpticalData(impute_nan=impute_nan).prop_names
+
+        elif preset_name == "mp_transport":
+            data_source = "mp_transport"
+            stats = ["minimum", "maximum", "range", "mean", "std_dev", "mode"]
+            features = TransportData(impute_nan=impute_nan).prop_names
 
         else:
             raise ValueError("Invalid preset_name specified!")
 
-        return cls(data_source, features, stats)
+        return cls(data_source, features, stats, impute_nan=impute_nan)
 
     def featurize(self, comp):
         """
@@ -236,12 +270,43 @@ def citations(self):
                 r"adsurl = {https://ui.adsabs.harvard.edu/\#abs/2018arXiv181205055C},"
                 "adsnote = {Provided by the SAO/NASA Astrophysics Data System}}"
             ]
+        elif self.data_source.__class__.__name__ == "OpticalData":
+            citation = [
+                "@misc{mtgx,"
+                "author = {Guillaume Brunin, Guido Petretto, David Waroquiers (Matgenix)},"
+                "year = {2022}"
+            ]
+            citation += [
+                "@misc{rii,"
+                "author = {Mikhail N. Polyanskiy},"
+                "title = {Refractive index database},"
+                "howpublished = {https://refractiveindex.info},"
+                "note = {Accessed on 2022-06-30}}"
+            ]
+        elif self.data_source.__class__.__name__ == "TransportData":
+            citation = [
+                "@misc{mtgx,"
+                "author = {Guillaume Brunin, Guido Petretto, David Waroquiers (Matgenix)},"
+                "year = {2022}"
+            ]
+            citation += [
+                "@article{ricci2017ab,"
+                "title={An ab initio electronic transport database for inorganic materials},"
+                "author={Ricci, Francesco and Chen, Wei and Aydemir, Umut and Snyder, G Jeffrey"
+                "and Rignanese, Gian-Marco and Jain, Anubhav and Hautier, Geoffroy},"
+                "journal={Scientific data},"
+                "volume={4},"
+                "number={1},"
+                "pages={1--13},"
+                "year={2017},"
+                "publisher={Nature Publishing Group}}"
+            ]
         else:
             citation = []
         return citation
 
     def implementors(self):
-        return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn"]
+        return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn", "Guillaume Brunin (Matgenix)"]
 
 
 class Meredig(BaseFeaturizer):
@@ -259,10 +324,17 @@ class Meredig(BaseFeaturizer):
             Mean number of valence electrons in each orbital
             Fraction of total valence electrons in each orbital
 
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
-        self.data_source = MagpieData()
+    def __init__(self, impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = MagpieData(impute_nan=self.impute_nan)
 
         # The labels for statistics on element properties
         self._element_property_feature_labels = [
@@ -305,14 +377,16 @@ def featurize(self, comp):
             element_property_features[i] = self.pstats.calc_stat(elem_data, stat, fractions)
 
         # Final 8 features are statistics on valence orbitals, available from the ValenceOrbital featurizer
-        valence_orbital_features = ValenceOrbital(orbitals=("s", "p", "d", "f"), props=("avg", "frac")).featurize(comp)
+        valence_orbital_features = ValenceOrbital(
+            orbitals=("s", "p", "d", "f"), props=("avg", "frac"), impute_nan=self.impute_nan
+        ).featurize(comp)
 
         return element_fraction_features + element_property_features + valence_orbital_features
 
     def feature_labels(self):
         # Since we have more features than just element fractions, append 'fraction' to element symbols for clarity
         element_fraction_features = [e + " fraction" for e in ElementFraction().feature_labels()]
-        valence_orbital_features = ValenceOrbital().feature_labels()
+        valence_orbital_features = ValenceOrbital(impute_nan=self.impute_nan).feature_labels()
         return element_fraction_features + self._element_property_feature_labels + valence_orbital_features
 
     def citations(self):
diff --git a/matminer/featurizers/composition/element.py b/matminer/featurizers/composition/element.py
index 206122b17..940d3f3c2 100644
--- a/matminer/featurizers/composition/element.py
+++ b/matminer/featurizers/composition/element.py
@@ -2,10 +2,13 @@
 Composition featurizers for elemental data and stoichiometry.
 """
 
+import warnings
+
 from pymatgen.core import Element
 
 from matminer.featurizers.base import BaseFeaturizer
 from matminer.utils.data import DemlData, MagpieData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class ElementFraction(BaseFeaturizer):
@@ -27,7 +30,7 @@ def featurize(self, comp):
             vector (list of floats): fraction of each element in a composition
         """
 
-        vector = [0] * 103
+        vector = [0] * 118
         el_list = list(comp.element_composition.fractional_composition.items())
         for el in el_list:
             obj = el
@@ -37,7 +40,7 @@ def featurize(self, comp):
 
     def feature_labels(self):
         labels = []
-        for i in range(1, 104):
+        for i in range(1, 119):
             labels.append(Element.from_Z(i).symbol)
         return labels
 
@@ -207,12 +210,16 @@ class BandCenter(BaseFeaturizer):
         - Band center
     """
 
-    magpie_data = MagpieData()
-    deml_data = DemlData()
+    def __init__(self, impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.magpie_data = MagpieData(impute_nan=self.impute_nan)
+        self.deml_data = DemlData(impute_nan=self.impute_nan)
 
     def featurize(self, comp):
         """
-        (Rough) estimation of absolution position of band center using
+        (Rough) estimation of absolute position of band center using
         geometric mean of electronegativity.
 
         Args:
diff --git a/matminer/featurizers/composition/ion.py b/matminer/featurizers/composition/ion.py
index 2e780eae8..0b98719bd 100644
--- a/matminer/featurizers/composition/ion.py
+++ b/matminer/featurizers/composition/ion.py
@@ -2,6 +2,7 @@
 Composition featurizers for compositions with ionic data.
 """
 import itertools
+import warnings
 
 import numpy as np
 
@@ -10,6 +11,7 @@
 from matminer.featurizers.utils.oxidation import has_oxidation_states
 from matminer.featurizers.utils.stats import PropertyStats
 from matminer.utils.data import DemlData, PymatgenData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class CationProperty(ElementProperty):
@@ -35,7 +37,7 @@ class CationProperty(ElementProperty):
     """
 
     @classmethod
-    def from_preset(cls, preset_name):
+    def from_preset(cls, preset_name, impute_nan=False):
         if preset_name == "deml":
             data_source = "deml"
             features = [
@@ -48,7 +50,10 @@ def from_preset(cls, preset_name):
             stats = ["minimum", "maximum", "range", "mean", "std_dev"]
         else:
             raise ValueError('Preset "%s" not found' % preset_name)
-        return cls(data_source, features, stats)
+
+        if not impute_nan:
+            warnings.warn("CationProperty(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        return cls(data_source, features, stats, impute_nan=impute_nan)
 
     def feature_labels(self):
         return [f + " of cations" for f in super().feature_labels()]
@@ -141,7 +146,7 @@ class IonProperty(BaseFeaturizer):
     Ionic property attributes. Similar to ElementProperty.
     """
 
-    def __init__(self, data_source=PymatgenData(), fast=False):
+    def __init__(self, data_source=None, impute_nan=False, fast=False):
         """
 
         Args:
@@ -151,7 +156,10 @@ def __init__(self, data_source=PymatgenData(), fast=False):
                 which can dramatically accelerate the calculation of whether an ionic compound
                 is possible, but will miss heterovalent compounds like Fe3O4.
         """
-        self.data_source = data_source
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = data_source or PymatgenData(impute_nan=self.impute_nan)
         self.fast = fast
 
     def featurize(self, comp):
@@ -236,10 +244,18 @@ class ElectronAffinity(BaseFeaturizer):
     Calculate average electron affinity times formal charge of anion elements.
     Note: The formal charges must already be computed before calling `featurize`.
     Generates average (electron affinity*formal charge) of anions.
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
-        self.data_source = DemlData()
+    def __init__(self, impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = DemlData(impute_nan=self.impute_nan)
 
     def featurize(self, comp):
         """
diff --git a/matminer/featurizers/composition/orbital.py b/matminer/featurizers/composition/orbital.py
index f28abcb8c..54f7d8473 100644
--- a/matminer/featurizers/composition/orbital.py
+++ b/matminer/featurizers/composition/orbital.py
@@ -11,6 +11,7 @@
 from matminer.featurizers.base import BaseFeaturizer
 from matminer.featurizers.utils.stats import PropertyStats
 from matminer.utils.data import MagpieData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class AtomicOrbitals(BaseFeaturizer):
@@ -51,7 +52,7 @@ def featurize(self, comp):
         integer_comp, factor = comp.get_integer_formula_and_factor()
 
         # warning message if composition is dilute and truncated
-        if not (len(Composition(comp).elements) == len(Composition(integer_comp).elements)):
+        if not len(Composition(comp).elements) == len(Composition(integer_comp).elements):
             warn(f"AtomicOrbitals: {comp} truncated to {integer_comp}")
 
         homo_lumo = MolecularOrbitals(integer_comp).band_edges
@@ -103,10 +104,16 @@ class ValenceOrbital(BaseFeaturizer):
         orbitals (list): orbitals to calculate
         props (list): specifies whether to return average number of electrons in each orbital,
             fraction of electrons in each orbital, or both
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self, orbitals=("s", "p", "d", "f"), props=("avg", "frac")):
-        self.data_source = MagpieData()
+    def __init__(self, orbitals=("s", "p", "d", "f"), props=("avg", "frac"), impute_nan=False):
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = MagpieData(impute_nan=self.impute_nan)
         self.orbitals = orbitals
         self.props = props
 
@@ -126,7 +133,7 @@ def featurize(self, comp):
         # Get the mean number of electrons in each shell
         avg = [
             PropertyStats.mean(
-                self.data_source.get_elemental_properties(elements, "N%sValence" % orb),
+                self.data_source.get_elemental_properties(elements, f"N{orb}Valence"),
                 weights=fractions,
             )
             for orb in self.orbitals
diff --git a/matminer/featurizers/composition/packing.py b/matminer/featurizers/composition/packing.py
index 2e167d446..0cabf8ac1 100644
--- a/matminer/featurizers/composition/packing.py
+++ b/matminer/featurizers/composition/packing.py
@@ -3,6 +3,7 @@
 """
 
 import itertools
+import warnings
 from functools import lru_cache
 
 import numpy as np
@@ -13,6 +14,7 @@
 from matminer.featurizers.composition.element import ElementFraction
 from matminer.featurizers.utils.stats import PropertyStats
 from matminer.utils.data import MagpieData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class AtomicPackingEfficiency(BaseFeaturizer):
@@ -49,7 +51,7 @@ class AtomicPackingEfficiency(BaseFeaturizer):
         for bulk metallic glasses, Nat. Commun. 6 (2015) 8123. doi:10.1038/ncomms9123.
     """
 
-    def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6):
+    def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6, impute_nan=False):
         """
         Initialize the featurizer
 
@@ -60,6 +62,9 @@ def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6):
             max_types (int): Maximum number of atom types to consider when
                 looking for efficient clusters. The process for finding
                 efficient clusters very expensive for large numbers of types
+            impute_nan (bool): if True, the features for the elements
+                that are missing from the data_source or are NaNs are replaced by the
+                average of each features over the available elements.
         """
 
         # Store the options
@@ -74,7 +79,10 @@ def __init__(self, threshold=0.01, n_nearest=(1, 3, 5), max_types=6):
         self._n_elems = len(self._el_frac.featurize(Composition("H")))
 
         # Tool for looking up radii
-        self._data_source = MagpieData()
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self._data_source = MagpieData(impute_nan=impute_nan)
 
         # Lookup table of ideal radius ratios
         self.ideal_ratio = dict(
diff --git a/matminer/featurizers/composition/tests/base.py b/matminer/featurizers/composition/tests/base.py
index a1ac4f5d2..9f72d06a1 100644
--- a/matminer/featurizers/composition/tests/base.py
+++ b/matminer/featurizers/composition/tests/base.py
@@ -17,6 +17,15 @@ def setUp(self):
             }
         )
 
+        self.df_nans = pd.DataFrame(
+            {
+                "composition": [
+                    # Composition("U2Og3"),
+                    Composition({Specie("U", 3): 2, Specie("Og", -2): 3}),
+                ]
+            }
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/matminer/featurizers/composition/tests/test_alloy.py b/matminer/featurizers/composition/tests/test_alloy.py
index 48d109fd3..7df5c0c7f 100644
--- a/matminer/featurizers/composition/tests/test_alloy.py
+++ b/matminer/featurizers/composition/tests/test_alloy.py
@@ -21,7 +21,7 @@ def test_miedema_all(self):
                 ]
             }
         )
-        miedema = Miedema(struct_types="all")
+        miedema = Miedema(struct_types="all", impute_nan=False)
         self.assertTrue(miedema.precheck(df["composition"].iloc[0]))
         self.assertFalse(miedema.precheck(df["composition"].iloc[-1]))
         self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition"), 2 / 3)
@@ -52,6 +52,51 @@ def test_miedema_all(self):
         self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300)
         self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755)
 
+        # Tests impute_nan=True
+        df = pd.DataFrame(
+            {
+                "composition": [
+                    Composition("TiZr"),
+                    Composition("Mg10Cu50Ca40"),
+                    Composition("Fe2O3"),
+                ]
+            }
+        )
+        miedema = Miedema(struct_types="all", impute_nan=True)
+        self.assertTrue(miedema.precheck(df["composition"].iloc[0]))
+        self.assertTrue(miedema.precheck(df["composition"].iloc[-1]))
+        self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition"), 1)
+
+        # test precheck for oxidation-state decorated compositions
+        df = CompositionToOxidComposition(return_original_on_error=True).featurize_dataframe(df, "composition")
+        self.assertTrue(miedema.precheck(df["composition_oxid"].iloc[0]))
+        self.assertTrue(miedema.precheck(df["composition_oxid"].iloc[-1]))
+        self.assertAlmostEqual(miedema.precheck_dataframe(df, "composition_oxid"), 1)
+
+        mfps = miedema.featurize_dataframe(df, col_id="composition")
+        self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][0], -0.003445022152)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755)
+
+        self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][1], -0.235125978427)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][1], -0.164541848271)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][1], -0.05280843311)
+
+        self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][2], -0.1664609094669129)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][2], -0.08553195254092998)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273)
+
+        # make sure featurization works equally for compositions with or without
+        # oxidation states
+        mfps = miedema.featurize_dataframe(df, col_id="composition_oxid")
+        self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][0], -0.003445022152)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][0], 0.0707658836300)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755)
+
+        self.assertAlmostEqual(mfps["Miedema_deltaH_inter"][2], -0.1664609094669129)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_amor"][2], -0.08553195254092998)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273)
+
     def test_miedema_ss(self):
         df = pd.DataFrame(
             {
@@ -62,7 +107,7 @@ def test_miedema_ss(self):
                 ]
             }
         )
-        miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"])
+        miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"], impute_nan=False)
         mfps = miedema.featurize_dataframe(df, col_id="composition")
         self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755)
         self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][0], 0.04700027066)
@@ -82,6 +127,36 @@ def test_miedema_ss(self):
         self.assertAlmostEqual(math.isnan(mfps["Miedema_deltaH_ss_hcp"][2]), True)
         self.assertAlmostEqual(math.isnan(mfps["Miedema_deltaH_ss_no_latt"][2]), True)
 
+        # Test impute_nan=True
+        df = pd.DataFrame(
+            {
+                "composition": [
+                    Composition("TiZr"),
+                    Composition("Mg10Cu50Ca40"),
+                    Composition("Fe2O3"),
+                ]
+            }
+        )
+        miedema = Miedema(struct_types="ss", ss_types=["min", "fcc", "bcc", "hcp", "no_latt"], impute_nan=True)
+        mfps = miedema.featurize_dataframe(df, col_id="composition")
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][0], 0.03663599755)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][0], 0.04700027066)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][0], 0.08327522653)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][0], 0.03663599755)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][0], 0.036635998)
+
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][1], -0.05280843311)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][1], 0.03010575174)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][1], -0.05280843311)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][1], 0.03010575174)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][1], -0.0035781359)
+
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_min"][2], 0.19402130236056273)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_fcc"][2], 0.21122883524487504)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_bcc"][2], 0.25737114702211505)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_hcp"][2], 0.19402130236056273)
+        self.assertAlmostEqual(mfps["Miedema_deltaH_ss_no_latt"][2], 0.258796589515172)
+
     def test_yang(self):
         comps = list(
             map(
@@ -91,7 +166,7 @@ def test_yang(self):
         )
 
         # Run the featurization
-        feat = YangSolidSolution()
+        feat = YangSolidSolution(impute_nan=False)
 
         df = pd.DataFrame({"composition": comps})
         self.assertFalse(feat.precheck(df["composition"].iloc[-1]))
@@ -118,8 +193,31 @@ def test_yang(self):
         np.testing.assert_array_almost_equal([158.5, 0.0315], features[2], decimal=1)
         np.testing.assert_array_almost_equal([5.06, 0.0482], features[3], decimal=1)
 
+        # Test with impute_nan=True
+        feat = YangSolidSolution(impute_nan=True)
+
+        df = pd.DataFrame({"composition": comps})
+        self.assertTrue(feat.precheck(df["composition"].iloc[-1]))
+        self.assertAlmostEqual(feat.precheck_dataframe(df, "composition"), 1, places=10)
+
+        # test precheck for oxidation-state decorated compositions
+        df = CompositionToOxidComposition(return_original_on_error=True).featurize_dataframe(df, "composition")
+        self.assertTrue(feat.precheck(df["composition_oxid"].iloc[-1]))
+        self.assertAlmostEqual(feat.precheck_dataframe(df, "composition_oxid"), 1, places=10)
+
+        feat.set_n_jobs(1)
+        features = feat.featurize_many(comps)
+
+        self.assertEqual((5, 2), np.array(features).shape)
+        np.testing.assert_array_almost_equal([0.95, 0.1021], features[0], decimal=2)
+        np.testing.assert_array_almost_equal([2.22, 0.0], features[1], decimal=2)
+        np.testing.assert_array_almost_equal([158.5, 0.0315], features[2], decimal=1)
+        np.testing.assert_array_almost_equal([5.06, 0.0482], features[3], decimal=1)
+
+        np.testing.assert_array_almost_equal([0.14893, 0.56212], features[4], decimal=5)
+
     def test_WenAlloys(self):
-        wa = WenAlloys()
+        wa = WenAlloys(impute_nan=False)
         c1 = "Fe0.62C0.000953Mn0.000521Si0.00102Cr0.00011Ni0.192" "Mo0.0176V0.000112Nb6.16e-05Co0.146Al0.00318Ti0.0185"
         c2 = (
             "Fe0.623C0.00854Mn0.000104Si0.000203Cr0.147Ni9.71e-05"
@@ -187,6 +285,122 @@ def test_WenAlloys(self):
         self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5)
         self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5)
 
+        # Test impute_nan=True
+        wa = WenAlloys(impute_nan=True)
+        c1 = "Fe0.62C0.000953Mn0.000521Si0.00102Cr0.00011Ni0.192" "Mo0.0176V0.000112Nb6.16e-05Co0.146Al0.00318Ti0.0185"
+        c2 = (
+            "Fe0.623C0.00854Mn0.000104Si0.000203Cr0.147Ni9.71e-05"
+            "Mo0.0179V0.00515N0.00163Nb6.14e-05Co0.188W0.00729Al0.000845"
+        )
+        comp = Composition(c1)
+
+        # Test prechecking
+        comp_bad = Composition("LaO3")
+        self.assertTrue(wa.precheck(comp))
+        self.assertTrue(wa.precheck(comp_bad))
+
+        f = wa.featurize(comp)
+
+        d = dict(zip(wa.feature_labels(), f))
+        correct = {
+            "APE mean": 0.018915555593392162,
+            "Atomic Fraction": "Fe0.6199642900568927C0.0009529451103616431Mn0.0005209699921284533Si"
+            "0.0010199412513839203Cr0.00010999366436493258Ni0.1919889414369732Mo"
+            "0.017598986298389213V0.0001119935491715677Nb6.159645204436224e-05Co"
+            "0.14599159088436503Al0.003179816842549869Ti0.018498934461375023",
+            "Atomic weight mean": 57.24008321450784,
+            "Configuration entropy": -0.008958911485121818,
+            "Electronegativity delta": 0.042327487126447516,
+            "Electronegativity local mismatch": 0.08262466022141576,
+            "Interant d electrons": 45.0,
+            "Interant electrons": 53.0,
+            "Interant f electrons": 0,
+            "Interant p electrons": 5.0,
+            "Interant s electrons": 3.0,
+            "Lambda entropy": -12.084431980055149,
+            "Mean cohesive energy": 4.382084353941212,
+            "Mixing enthalpy": 3.6650695863166347,
+            "Radii gamma": 1.4183511064895242,
+            "Radii local mismatch": 0.7953797741513383,
+            "Shear modulus delta": 0.1794147729878139,
+            "Shear modulus local mismatch": 3.192861083726266,
+            "Shear modulus mean": 79.48600137832061,
+            "Shear modulus strength model": -0.009636621848440554,
+            "Total weight": 57.243028243301005,
+            "VEC mean": 8.395723406331793,
+            "Weight Fraction": "Fe0.6048579375087819 C0.00019995792415715736 "
+            "Mn0.0005000210911858884 Si0.0005004488909678273 "
+            "Cr9.991733798026916e-05 Ni0.19686472127404955 "
+            "Mo0.029497810507563525 V9.967061797901463e-05 "
+            "Nb9.997781710071831e-05 Co0.15031081922202344 "
+            "Al0.0014988950686416751 Ti0.015469822739568852 ",
+            "Yang delta": 0.027227922269552986,
+            "Yang omega": 4.4226005659658725,
+        }
+
+        for flabel, fvalue in d.items():
+            correct_value = correct[flabel]
+            if isinstance(correct_value, str):
+                self.assertEqual(correct_value, fvalue)
+            else:
+                self.assertAlmostEqual(correct_value, fvalue, places=8)
+
+        self.assertEqual(len(wa.feature_labels()), 25)
+
+        df = pd.DataFrame({"composition": [comp, Composition(c2)]})
+
+        df = wa.featurize_dataframe(df, "composition")
+        self.assertTupleEqual(df.shape, (2, 26))
+        self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5)
+        self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5)
+
+        f = wa.featurize(comp_bad)
+
+        d = dict(zip(wa.feature_labels(), f))
+        correct = {
+            "APE mean": 0.001579487142797431,
+            "Atomic Fraction": "La0.25O0.75",
+            "Atomic weight mean": 46.7259175,
+            "Configuration entropy": -0.004675254392360772,
+            "Electronegativity delta": 0.09973459781368167,
+            "Electronegativity local mismatch": 0.1654880136986303,
+            "Interant d electrons": 1.0,
+            "Interant electrons": 5.0,
+            "Interant f electrons": 0,
+            "Interant p electrons": 4.0,
+            "Interant s electrons": 0,
+            "Lambda entropy": -0.014796268010071023,
+            "Mean cohesive energy": 3.0675,
+            "Mixing enthalpy": 10.652682648401827,
+            "Radii gamma": 1.9699221262511677,
+            "Radii local mismatch": 23.0625,
+            "Shear modulus delta": 0.37931021448197916,
+            "Shear modulus local mismatch": 7.13935787671233,
+            "Shear modulus mean": 43.467431506849316,
+            "Shear modulus strength model": -0.08010552710644916,
+            "Total weight": 186.90367,
+            "VEC mean": 5.393835616438356,
+            "Weight Fraction": "La0.743192843671823 O0.25680715632817697 ",
+            "Yang delta": 0.5621167528521687,
+            "Yang omega": 0.14893408828673313,
+        }
+
+        for flabel, fvalue in d.items():
+            correct_value = correct[flabel]
+            if isinstance(correct_value, str):
+                self.assertEqual(correct_value, fvalue)
+            else:
+                self.assertAlmostEqual(correct_value, fvalue, places=8)
+
+        self.assertEqual(len(wa.feature_labels()), 25)
+
+        df = pd.DataFrame({"composition": [comp, Composition(c2)]})
+
+        df = wa.featurize_dataframe(df, "composition")
+        self.assertTupleEqual(df.shape, (2, 26))
+        self.assertAlmostEqual(df["Configuration entropy"].iloc[0], -0.008959, places=5)
+        self.assertAlmostEqual(df["Configuration entropy"].iloc[1], -0.009039, places=5)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/matminer/featurizers/composition/tests/test_composite.py b/matminer/featurizers/composition/tests/test_composite.py
index ddb45edcf..13e0850ff 100644
--- a/matminer/featurizers/composition/tests/test_composite.py
+++ b/matminer/featurizers/composition/tests/test_composite.py
@@ -7,7 +7,9 @@
 
 class CompositeFeaturesTest(CompositionFeaturesTest):
     def test_elem(self):
-        df_elem = ElementProperty.from_preset("magpie").featurize_dataframe(self.df, col_id="composition")
+        df_elem = ElementProperty.from_preset("magpie", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
         self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 8)
         self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 26)
         self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 18)
@@ -15,24 +17,113 @@ def test_elem(self):
         self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 8.64)
         self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 8)
 
+        df_elem = ElementProperty.from_preset("magpie", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 132)
+
+        df_elem = ElementProperty.from_preset("magpie", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 8)
+        self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 26)
+        self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 18)
+        self.assertAlmostEqual(df_elem["MagpieData mean Number"][0], 15.2)
+        self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 8.64)
+        self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 8)
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+
+        df_elem = ElementProperty.from_preset("magpie", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["MagpieData minimum Number"][0], 56.5)
+        self.assertAlmostEqual(df_elem["MagpieData maximum Number"][0], 92)
+        self.assertAlmostEqual(df_elem["MagpieData range Number"][0], 35.5)
+        self.assertAlmostEqual(df_elem["MagpieData mean Number"][0], 70.7)
+        self.assertAlmostEqual(df_elem["MagpieData avg_dev Number"][0], 17.04)
+        self.assertAlmostEqual(df_elem["MagpieData mode Number"][0], 56.5)
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+
     def test_elem_deml(self):
-        df_elem_deml = ElementProperty.from_preset("deml").featurize_dataframe(self.df, col_id="composition")
+        df_elem_deml = ElementProperty.from_preset("deml", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 8)
+        self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 26)
+        self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 18)
+        self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 15.2)
+        self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 12.7279, 4)
+
+        df_elem_deml = ElementProperty.from_preset("deml", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition", ignore_errors=True
+        )
+        self.assertEqual(df_elem_deml.isna().sum().sum(), 80)
+
+        df_elem_deml = ElementProperty.from_preset("deml", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
         self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 8)
         self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 26)
         self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 18)
         self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 15.2)
         self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 12.7279, 4)
+        self.assertEqual(df_elem_deml.isna().sum().sum(), 0)
+
+        df_elem_deml = ElementProperty.from_preset("deml", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem_deml["DemlData minimum atom_num"][0], 92)
+        self.assertAlmostEqual(df_elem_deml["DemlData maximum atom_num"][0], 118)
+        self.assertAlmostEqual(df_elem_deml["DemlData range atom_num"][0], 26)
+        self.assertAlmostEqual(df_elem_deml["DemlData mean atom_num"][0], 107.6)
+        self.assertAlmostEqual(df_elem_deml["DemlData std_dev atom_num"][0], 18.3848, 4)
+        self.assertEqual(df_elem_deml.isna().sum().sum(), 0)
 
     def test_elem_matminer(self):
-        df_elem = ElementProperty.from_preset("matminer").featurize_dataframe(self.df, col_id="composition")
+        df_elem = ElementProperty.from_preset("matminer", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
         self.assertAlmostEqual(df_elem["PymatgenData minimum melting_point"][0], 54.8, 1)
         self.assertTrue(math.isnan(df_elem["PymatgenData maximum bulk_modulus"][0]))
         self.assertAlmostEqual(df_elem["PymatgenData range X"][0], 1.61, 1)
         self.assertAlmostEqual(df_elem["PymatgenData mean X"][0], 2.796, 1)
         self.assertAlmostEqual(df_elem["PymatgenData maximum block"][0], 3, 1)
+        self.assertEqual(df_elem.isna().sum().sum(), 30)
+
+        df_elem = ElementProperty.from_preset("matminer", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 45)
+        self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 987.903, 3)
+
+        df_elem = ElementProperty.from_preset("matminer", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["PymatgenData minimum melting_point"][0], 54.8, 1)
+        self.assertFalse(math.isnan(df_elem["PymatgenData maximum bulk_modulus"][0]))
+        self.assertAlmostEqual(df_elem["PymatgenData range X"][0], 1.61, 1)
+        self.assertAlmostEqual(df_elem["PymatgenData mean X"][0], 2.796, 1)
+        self.assertAlmostEqual(df_elem["PymatgenData maximum block"][0], 3, 1)
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+
+        df_elem = ElementProperty.from_preset("matminer", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+        self.assertAlmostEqual(
+            df_elem[
+                [col for col in df_elem.columns if "composition" not in col and "electrical_resistivity" not in col]
+            ]
+            .sum()
+            .sum(),
+            16295.221,
+            3,
+        )
 
     def test_elem_matscholar_el(self):
-        df_elem = ElementProperty.from_preset("matscholar_el").featurize_dataframe(self.df, col_id="composition")
+        df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
         self.assertAlmostEqual(
             df_elem["MatscholarElementData range embedding 149"].iloc[0],
             0.06827970966696739,
@@ -50,8 +141,64 @@ def test_elem_matscholar_el(self):
             -0.02483355056028813,
         )
 
+        df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition", ignore_errors=True
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 1000)
+
+        df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(
+            df_elem["MatscholarElementData range embedding 149"].iloc[0],
+            0.06827970966696739,
+        )
+        self.assertAlmostEqual(
+            df_elem["MatscholarElementData range embedding 149"].iloc[1],
+            0.06827970966696739,
+        )
+        self.assertAlmostEqual(
+            df_elem["MatscholarElementData mean embedding 18"].iloc[0],
+            -0.020534400502219795,
+        )
+        self.assertAlmostEqual(
+            df_elem["MatscholarElementData mean embedding 18"].iloc[1],
+            -0.02483355056028813,
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+
+        df_elem = ElementProperty.from_preset("matscholar_el", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+        self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 17.666, 3)
+
     def test_elem_megnet_el(self):
-        ep = ElementProperty.from_preset("megnet_el")
+        ep = ElementProperty.from_preset("megnet_el", impute_nan=False)
+        df_elem = ep.featurize_dataframe(self.df, col_id="composition")
+        self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], 0.127333, places=6)
+        self.assertAlmostEqual(
+            df_elem["MEGNetElementData maximum embedding 11"].iloc[0],
+            0.160505,
+            places=6,
+        )
+        self.assertAlmostEqual(
+            df_elem["MEGNetElementData maximum embedding 11"].iloc[1],
+            0.160505,
+            places=6,
+        )
+        self.assertTrue(ep.citations())
+
+        df_elem = ep.featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], -0.044911, places=6)
+        self.assertAlmostEqual(
+            df_elem["MEGNetElementData maximum embedding 11"].iloc[0],
+            0.191229,
+            places=6,
+        )
+        self.assertTrue(ep.citations())
+
+        ep = ElementProperty.from_preset("megnet_el", impute_nan=True)
         df_elem = ep.featurize_dataframe(self.df, col_id="composition")
         self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], 0.127333, places=6)
         self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[1], 0.127333, places=6)
@@ -66,27 +213,137 @@ def test_elem_megnet_el(self):
             places=6,
         )
         self.assertTrue(ep.citations())
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+
+        df_elem = ep.featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertAlmostEqual(df_elem["MEGNetElementData maximum embedding 1"].iloc[0], -0.001072, places=6)
+        self.assertAlmostEqual(
+            df_elem["MEGNetElementData maximum embedding 11"].iloc[0],
+            0.191229,
+            places=6,
+        )
+        self.assertTrue(ep.citations())
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
 
     def test_meredig(self):
-        df_val = Meredig().featurize_dataframe(self.df, col_id="composition")
+        df_val = Meredig(impute_nan=False).featurize_dataframe(self.df, col_id="composition")
+        self.assertAlmostEqual(df_val["Fe fraction"].iloc[0], 2.0 / 5.0)
+        self.assertAlmostEqual(df_val["Fe fraction"].iloc[1], 0.5)
+        self.assertAlmostEqual(df_val["O fraction"].iloc[0], 3.0 / 5.0)
+        self.assertAlmostEqual(df_val["O fraction"].iloc[1], 0.5)
+        self.assertAlmostEqual(df_val["frac s valence electrons"].iloc[0], 0.294117647)
+        self.assertAlmostEqual(df_val["mean Number"].iloc[0], 15.2)
+
+        df_val = Meredig(impute_nan=False).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_val.isna().sum().sum(), 17)
+        self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 1, 10)
+
+        df_val = Meredig(impute_nan=True).featurize_dataframe(self.df, col_id="composition")
         self.assertAlmostEqual(df_val["Fe fraction"].iloc[0], 2.0 / 5.0)
         self.assertAlmostEqual(df_val["Fe fraction"].iloc[1], 0.5)
         self.assertAlmostEqual(df_val["O fraction"].iloc[0], 3.0 / 5.0)
         self.assertAlmostEqual(df_val["O fraction"].iloc[1], 0.5)
         self.assertAlmostEqual(df_val["frac s valence electrons"].iloc[0], 0.294117647)
         self.assertAlmostEqual(df_val["mean Number"].iloc[0], 15.2)
+        self.assertEqual(df_val.isna().sum().sum(), 0)
+
+        df_val = Meredig(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_val.isna().sum().sum(), 0)
+        self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 311.5897, 4)
 
     def test_fere_corr(self):
         df_fere_corr = ElementProperty(
             features=["FERE correction"],
             stats=["minimum", "maximum", "range", "mean", "std_dev"],
             data_source="deml",
+            impute_nan=False,
+        ).featurize_dataframe(self.df, col_id="composition")
+        self.assertAlmostEqual(df_fere_corr["DemlData minimum FERE correction"][0], -0.15213431610903)
+        self.assertAlmostEqual(df_fere_corr["DemlData maximum FERE correction"][0], 0.23)
+        self.assertAlmostEqual(df_fere_corr["DemlData range FERE correction"][0], 0.382134316)
+        self.assertAlmostEqual(df_fere_corr["DemlData mean FERE correction"][0], 0.077146274)
+        self.assertAlmostEqual(df_fere_corr["DemlData std_dev FERE correction"][0], 0.270209766)
+
+        df_fere_corr = ElementProperty(
+            features=["FERE correction"],
+            stats=["minimum", "maximum", "range", "mean", "std_dev"],
+            data_source="deml",
+            impute_nan=False,
+        ).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_fere_corr.isna().sum().sum(), 5)
+
+        df_fere_corr = ElementProperty(
+            features=["FERE correction"],
+            stats=["minimum", "maximum", "range", "mean", "std_dev"],
+            data_source="deml",
+            impute_nan=True,
         ).featurize_dataframe(self.df, col_id="composition")
         self.assertAlmostEqual(df_fere_corr["DemlData minimum FERE correction"][0], -0.15213431610903)
         self.assertAlmostEqual(df_fere_corr["DemlData maximum FERE correction"][0], 0.23)
         self.assertAlmostEqual(df_fere_corr["DemlData range FERE correction"][0], 0.382134316)
         self.assertAlmostEqual(df_fere_corr["DemlData mean FERE correction"][0], 0.077146274)
         self.assertAlmostEqual(df_fere_corr["DemlData std_dev FERE correction"][0], 0.270209766)
+        self.assertEqual(df_fere_corr.isna().sum().sum(), 0)
+
+        df_fere_corr = ElementProperty(
+            features=["FERE correction"],
+            stats=["minimum", "maximum", "range", "mean", "std_dev"],
+            data_source="deml",
+            impute_nan=True,
+        ).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_fere_corr.isna().sum().sum(), 0)
+        self.assertAlmostEqual(df_fere_corr.drop(columns="composition").sum().sum(), 0.2795, 4)
+
+    def test_elem_optical(self):
+        df_elem = ElementProperty.from_preset("optical", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["OpticalData mean n_400.0"].iloc[0], 1.98229162203492)
+        self.assertAlmostEqual(df_elem["OpticalData range k_760.0"].iloc[1], 4.88738594404032)
+        self.assertAlmostEqual(df_elem["OpticalData maximum R_720.0"].iloc[0], 0.621705031591809)
+
+        df_elem = ElementProperty.from_preset("optical", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 180)
+
+        df_elem = ElementProperty.from_preset("optical", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["OpticalData mean n_400.0"].iloc[0], 1.98229162203492)
+        self.assertAlmostEqual(df_elem["OpticalData range k_760.0"].iloc[1], 4.88738594404032)
+        self.assertAlmostEqual(df_elem["OpticalData maximum R_720.0"].iloc[0], 0.621705031591809)
+
+        df_elem = ElementProperty.from_preset("optical", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 0)
+        self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 201.3255, 4)
+
+    def test_elem_transport(self):
+        df_elem = ElementProperty.from_preset("mp_transport", impute_nan=False).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["TransportData mean sigma_p"].iloc[0], 14933.7481377614, places=6)
+        self.assertAlmostEqual(df_elem["TransportData std_dev S_n"].iloc[1], 489.973884028426)
+        self.assertAlmostEqual(df_elem["TransportData mean m_p"].iloc[0], -0.00019543531213698)
+
+        df_elem = ElementProperty.from_preset("mp_transport", impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertEqual(df_elem.isna().sum().sum(), 60)
+
+        df_elem = ElementProperty.from_preset("mp_transport", impute_nan=True).featurize_dataframe(
+            self.df, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem["TransportData mean sigma_p"].iloc[0], 14933.7481377614, places=6)
+        self.assertAlmostEqual(df_elem["TransportData std_dev S_n"].iloc[1], 489.973884028426)
+        self.assertAlmostEqual(df_elem["TransportData mean m_p"].iloc[0], -0.00019543531213698)
+
+        df_elem = ElementProperty.from_preset("mp_transport", impute_nan=True).featurize_dataframe(
+            self.df_nans, col_id="composition"
+        )
+        self.assertAlmostEqual(df_elem.drop(columns="composition").sum().sum(), 9798095.622017656, 4)
 
 
 if __name__ == "__main__":
diff --git a/matminer/featurizers/composition/tests/test_element.py b/matminer/featurizers/composition/tests/test_element.py
index 729c0a0f9..ac0c604df 100644
--- a/matminer/featurizers/composition/tests/test_element.py
+++ b/matminer/featurizers/composition/tests/test_element.py
@@ -35,9 +35,23 @@ def test_tm_fraction(self):
         self.assertAlmostEqual(df_tm_frac["transition metal fraction"][0], 0.4)
 
     def test_band_center(self):
-        df_band_center = BandCenter().featurize_dataframe(self.df, col_id="composition")
+        df_band_center = BandCenter(impute_nan=False).featurize_dataframe(self.df, col_id="composition")
         self.assertAlmostEqual(df_band_center["band center"][0], 5.870418816395603)
-        self.assertAlmostEqual(BandCenter().featurize(Composition("Ag33O500V200"))[0], 6.033480099340539)
+        self.assertAlmostEqual(
+            BandCenter(impute_nan=False).featurize(Composition("Ag33O500V200"))[0], 6.033480099340539
+        )
+
+        df_band_center = BandCenter(impute_nan=False).featurize_dataframe(
+            self.df_nans, col_id="composition", ignore_errors=True
+        )
+        self.assertEqual(df_band_center.isna().sum().sum(), 1, 10)
+
+        df_band_center = BandCenter(impute_nan=True).featurize_dataframe(self.df, col_id="composition")
+        self.assertAlmostEqual(df_band_center["band center"][0], 5.870418816395603)
+        self.assertAlmostEqual(BandCenter(impute_nan=True).featurize(Composition("Ag33O500V200"))[0], 6.033480099340539)
+
+        df_band_center = BandCenter(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertAlmostEqual(df_band_center["band center"][0], 4.598904, 6)
 
 
 if __name__ == "__main__":
diff --git a/matminer/featurizers/composition/tests/test_ion.py b/matminer/featurizers/composition/tests/test_ion.py
index d39d186e4..41e3723fe 100644
--- a/matminer/featurizers/composition/tests/test_ion.py
+++ b/matminer/featurizers/composition/tests/test_ion.py
@@ -1,5 +1,7 @@
+import math
 import unittest
 
+import pytest
 from pymatgen.core import Composition
 from pymatgen.core.periodic_table import Specie
 
@@ -20,9 +22,10 @@ def test_is_ionic(self):
 
         self.assertTrue(is_ionic(Composition({Specie("Fe", 2): 1, Specie("O", -2): 1})))
         self.assertFalse(is_ionic(Composition({Specie("Fe", 0): 1, Specie("Al", 0): 1})))
+        self.assertTrue(is_ionic(self.df_nans.loc[0, "composition"]))
 
     def test_ionic(self):
-        featurizer = IonProperty()
+        featurizer = IonProperty(impute_nan=False)
         df_ionic = featurizer.featurize_dataframe(self.df, col_id="composition")
         self.assertEqual(df_ionic["compound possible"][0], 1.0)
         self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.476922164)
@@ -39,8 +42,38 @@ def test_ionic(self):
             featurizer.featurize(Composition({Specie("Fe", 2): 1, Specie("Fe", 3): 2, Specie("O", -2): 4}))[0],
         )
 
+        featurizer = IonProperty(impute_nan=False)
+        df_ionic = featurizer.featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_ionic["compound possible"][0], 1.0)
+        self.assertTrue(math.isnan(df_ionic["max ionic char"][0]))
+        self.assertTrue(math.isnan(df_ionic["avg ionic char"][0]))
+
+        featurizer = IonProperty(impute_nan=True)
+        df_ionic = featurizer.featurize_dataframe(self.df, col_id="composition")
+        self.assertEqual(df_ionic["compound possible"][0], 1.0)
+        self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.476922164)
+        self.assertAlmostEqual(df_ionic["avg ionic char"][0], 0.114461319)
+
+        # Test 'fast'
+        self.assertEqual(1.0, featurizer.featurize(Composition("Fe3O4"))[0])
+        featurizer.fast = True
+        self.assertEqual(0, featurizer.featurize(Composition("Fe3O4"))[0])
+
+        # Make sure 'fast' works if I use-precomputed oxidation states
+        self.assertEqual(
+            1,
+            featurizer.featurize(Composition({Specie("Fe", 2): 1, Specie("Fe", 3): 2, Specie("O", -2): 4}))[0],
+        )
+
+        featurizer = IonProperty(impute_nan=True)
+        df_ionic = featurizer.featurize_dataframe(self.df_nans, col_id="composition")
+
+        self.assertEqual(df_ionic["compound possible"][0], 1.0)
+        self.assertAlmostEqual(df_ionic["max ionic char"][0], 0.028909, 6)
+        self.assertAlmostEqual(df_ionic["avg ionic char"][0], 0.006938, 6)
+
     def test_cation_properties(self):
-        featurizer = CationProperty.from_preset("deml")
+        featurizer = CationProperty.from_preset("deml", impute_nan=False)
         features = dict(
             zip(
                 featurizer.feature_labels(),
@@ -53,9 +86,52 @@ def test_cation_properties(self):
         self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 5.48)
         self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0)
 
+        with pytest.raises(KeyError):
+            featurizer = CationProperty.from_preset("deml", impute_nan=False)
+            features = dict(
+                zip(
+                    featurizer.feature_labels(),
+                    featurizer.featurize(self.df_nans["composition"][0]),
+                )
+            )
+
+        featurizer = CationProperty.from_preset("deml", impute_nan=True)
+        features = dict(
+            zip(
+                featurizer.feature_labels(),
+                featurizer.featurize(self.df["composition"][1]),
+            )
+        )
+        self.assertAlmostEqual(features["DemlData minimum magn_moment of cations"], 5.48)
+        self.assertAlmostEqual(features["DemlData maximum magn_moment of cations"], 5.48)
+        self.assertAlmostEqual(features["DemlData range magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 5.48)
+        self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0)
+
+        featurizer = CationProperty.from_preset("deml", impute_nan=True)
+        features = dict(
+            zip(
+                featurizer.feature_labels(),
+                featurizer.featurize(self.df_nans["composition"][0]),
+            )
+        )
+        self.assertAlmostEqual(features["DemlData minimum magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData maximum magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData range magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData mean magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData std_dev magn_moment of cations"], 0)
+        self.assertAlmostEqual(features["DemlData minimum total_ioniz of cations"], 811242.222222, 6)
+        self.assertAlmostEqual(features["DemlData maximum total_ioniz of cations"], 811242.222222, 6)
+        self.assertAlmostEqual(features["DemlData mean total_ioniz of cations"], 811242.222222, 6)
+
     def test_elec_affin(self):
-        featurizer = ElectronAffinity()
+        featurizer = ElectronAffinity(impute_nan=False)
+        self.assertAlmostEqual(-141000 * 2, featurizer.featurize(self.df["composition"][1])[0])
+        self.assertTrue(math.isnan(featurizer.featurize(self.df_nans["composition"][0])[0]))
+
+        featurizer = ElectronAffinity(impute_nan=True)
         self.assertAlmostEqual(-141000 * 2, featurizer.featurize(self.df["composition"][1])[0])
+        self.assertAlmostEqual(featurizer.featurize(self.df_nans["composition"][0])[0], -153716.470588, 6)
 
     def test_en_diff(self):
         featurizer = ElectronegativityDiff()
diff --git a/matminer/featurizers/composition/tests/test_orbital.py b/matminer/featurizers/composition/tests/test_orbital.py
index bb9511ea9..e328e5bd8 100644
--- a/matminer/featurizers/composition/tests/test_orbital.py
+++ b/matminer/featurizers/composition/tests/test_orbital.py
@@ -8,7 +8,7 @@
 
 class OrbitalFeaturesTest(CompositionFeaturesTest):
     def test_valence(self):
-        df_val = ValenceOrbital().featurize_dataframe(self.df, col_id="composition")
+        df_val = ValenceOrbital(impute_nan=False).featurize_dataframe(self.df, col_id="composition")
         self.assertAlmostEqual(df_val["avg s valence electrons"][0], 2.0)
         self.assertAlmostEqual(df_val["avg p valence electrons"][0], 2.4)
         self.assertAlmostEqual(df_val["avg d valence electrons"][0], 2.4)
@@ -18,6 +18,23 @@ def test_valence(self):
         self.assertAlmostEqual(df_val["frac p valence electrons"][0], 0.352941176)
         self.assertAlmostEqual(df_val["frac f valence electrons"][0], 0)
 
+        df_val = ValenceOrbital(impute_nan=False).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_val.isna().sum().sum(), 8)
+
+        df_val = ValenceOrbital(impute_nan=True).featurize_dataframe(self.df, col_id="composition")
+        self.assertAlmostEqual(df_val["avg s valence electrons"][0], 2.0)
+        self.assertAlmostEqual(df_val["avg p valence electrons"][0], 2.4)
+        self.assertAlmostEqual(df_val["avg d valence electrons"][0], 2.4)
+        self.assertAlmostEqual(df_val["avg f valence electrons"][0], 0.0)
+        self.assertAlmostEqual(df_val["frac s valence electrons"][0], 0.294117647)
+        self.assertAlmostEqual(df_val["frac d valence electrons"][0], 0.352941176)
+        self.assertAlmostEqual(df_val["frac p valence electrons"][0], 0.352941176)
+        self.assertAlmostEqual(df_val["frac f valence electrons"][0], 0)
+
+        df_val = ValenceOrbital(impute_nan=True).featurize_dataframe(self.df_nans, col_id="composition")
+        self.assertEqual(df_val.isna().sum().sum(), 0)
+        self.assertAlmostEqual(df_val.drop(columns="composition").sum().sum(), 10.342857, 6)
+
     def test_atomic_orbitals(self):
         df_atomic_orbitals = AtomicOrbitals().featurize_dataframe(self.df, col_id="composition")
         self.assertEqual(df_atomic_orbitals["HOMO_character"][0], "d")
diff --git a/matminer/featurizers/composition/tests/test_packing.py b/matminer/featurizers/composition/tests/test_packing.py
index 5d717d222..ebf43d9fe 100644
--- a/matminer/featurizers/composition/tests/test_packing.py
+++ b/matminer/featurizers/composition/tests/test_packing.py
@@ -11,11 +11,11 @@
 
 
 class PackingFeaturesTest(CompositionFeaturesTest):
-    def test_ape(self):
-        f = AtomicPackingEfficiency()
+    def test_ape1(self):
+        f = AtomicPackingEfficiency(impute_nan=False)
+        f.set_n_jobs(1)
         ef = ElementFraction()
         ef.set_n_jobs(1)
-        f.set_n_jobs(1)
 
         # Test the APE calculation routines
         self.assertAlmostEqual(1.11632, f.get_ideal_radius_ratio(15))
@@ -90,6 +90,85 @@ def test_ape(self):
         feat = f.compute_nearest_cluster_distance(Composition("Al"))
         np.testing.assert_array_almost_equal([1] * 3, feat)
 
+    def test_ape2(self):
+        f = AtomicPackingEfficiency(impute_nan=True)
+        f.set_n_jobs(1)
+        ef = ElementFraction()
+        ef.set_n_jobs(1)
+
+        # Test the APE calculation routines
+        self.assertAlmostEqual(1.11632, f.get_ideal_radius_ratio(15))
+        self.assertAlmostEqual(0.154701, f.get_ideal_radius_ratio(2))
+        self.assertAlmostEqual(1.65915, f.get_ideal_radius_ratio(27))
+        self.assertAlmostEqual(15, f.find_ideal_cluster_size(1.116)[0])
+        self.assertAlmostEqual(3, f.find_ideal_cluster_size(0.1)[0])
+        self.assertAlmostEqual(24, f.find_ideal_cluster_size(2)[0])
+
+        # Test the nearest neighbor lookup tool
+        nn_lookup = f.create_cluster_lookup_tool([Element("Cu"), Element("Zr")])
+
+        #  Check that the table gets the correct structures
+        stable_clusters = [
+            Composition("CuZr10"),
+            Composition("Cu6Zr6"),
+            Composition("Cu8Zr5"),
+            Composition("Cu13Zr1"),
+            Composition("Cu3Zr12"),
+            Composition("Cu8Zr8"),
+            Composition("Cu12Zr5"),
+            Composition("Cu17Zr"),
+        ]
+        ds, _ = nn_lookup.kneighbors(ef.featurize_many(stable_clusters), n_neighbors=1)
+        np.testing.assert_array_almost_equal([[0]] * 8, ds)
+        self.assertEqual(8, nn_lookup._fit_X.shape[0])
+
+        # Swap the order of the clusters, make sure it gets the same list
+        nn_lookup_swapped = f.create_cluster_lookup_tool([Element("Zr"), Element("Cu")])
+        np.testing.assert_array_almost_equal(
+            sorted(nn_lookup._fit_X.tolist()), sorted(nn_lookup_swapped._fit_X.tolist())
+        )
+
+        # Make sure we had a cache hit
+        self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().misses)
+        self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().hits)
+
+        # Change the tolerance, see if it changes the results properly
+        f.threshold = 0.002
+        nn_lookup = f.create_cluster_lookup_tool([Element("Cu"), Element("Zr")])
+        self.assertEqual(2, nn_lookup._fit_X.shape[0])
+        ds, _ = nn_lookup.kneighbors(
+            ef.featurize_many([Composition("CuZr10"), Composition("Cu3Zr12")]),
+            n_neighbors=1,
+        )
+        np.testing.assert_array_almost_equal([[0]] * 2, ds)
+
+        # Make sure we had a cache miss
+        self.assertEqual(5, f._create_cluster_lookup_tool.cache_info().misses)
+        self.assertEqual(4, f._create_cluster_lookup_tool.cache_info().hits)
+
+        # Compute the distances from Cu50Zr50
+        mean_dists = f.compute_nearest_cluster_distance(Composition("CuZr"))
+        np.testing.assert_array_almost_equal([0.424264, 0.667602, 0.800561], mean_dists, decimal=6)
+
+        # Compute the optimal APE for Cu50Zr50
+        np.testing.assert_array_almost_equal(
+            [0.000233857, 0.003508794],
+            f.compute_simultaneous_packing_efficiency(Composition("Cu50Zr50")),
+        )
+
+        # Test the dataframe calculator
+        df = pd.DataFrame({"comp": [Composition("CuZr")]})
+        df = f.featurize_dataframe(df, "comp")
+
+        self.assertEqual(6, len(df.columns))
+        self.assertIn("dist from 5 clusters |APE| < 0.002", df.columns)
+
+        self.assertAlmostEqual(0.003508794, df["mean abs simul. packing efficiency"][0])
+
+        # Make sure it works with composition that do not match any efficient clusters
+        feat = f.compute_nearest_cluster_distance(Composition("Al"))
+        np.testing.assert_array_almost_equal([1] * 3, feat)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/matminer/featurizers/site/chemical.py b/matminer/featurizers/site/chemical.py
index 8715bbfed..b05b27aef 100644
--- a/matminer/featurizers/site/chemical.py
+++ b/matminer/featurizers/site/chemical.py
@@ -2,6 +2,8 @@
 Site featurizers based on local chemical information, rather than geometry alone.
 """
 
+import warnings
+
 import numpy as np
 import pymatgen.analysis.local_env
 from pymatgen.analysis.ewald import EwaldSummation
@@ -13,6 +15,7 @@
 from matminer.featurizers.base import BaseFeaturizer
 from matminer.utils.caching import get_nearest_neighbors
 from matminer.utils.data import MagpieData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class ChemicalSRO(BaseFeaturizer):
@@ -310,7 +313,8 @@ class LocalPropertyDifference(BaseFeaturizer):
 
     def __init__(
         self,
-        data_source=MagpieData(),
+        data_source=None,
+        impute_nan=False,
         weight="area",
         properties=("Electronegativity",),
         signed=False,
@@ -318,7 +322,7 @@ def __init__(
         """Initialize the featurizer
 
         Args:
-            data_source (AbstractData) - Class from which to retrieve
+            data_source (AbstractData or None) - Class from which to retrieve
                 elemental properties
             weight (str) - What aspect of each voronoi facet to use to
                 weigh each neighbor (see VoronoiNN)
@@ -326,13 +330,16 @@ def __init__(
             signed (bool) - whether to return absolute difference or signed difference of
                             properties(default=False (absolute difference))
         """
-        self.data_source = data_source
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = data_source or MagpieData(impute_nan=self.impute_nan)
         self.properties = properties
         self.weight = weight
         self.signed = signed
 
     @staticmethod
-    def from_preset(preset):
+    def from_preset(preset, impute_nan=False):
         """
         Create a new LocalPropertyDifference class according to a preset
 
@@ -342,7 +349,7 @@ def from_preset(preset):
 
         if preset == "ward-prb-2017":
             return LocalPropertyDifference(
-                data_source=MagpieData(),
+                data_source=MagpieData(impute_nan=impute_nan),
                 properties=[
                     "Number",
                     "MendeleevNumber",
@@ -441,14 +448,20 @@ class SiteElementalProperty(BaseFeaturizer):
         `Schmidt et al., _Chem Mater_. (2017) <http://dx.doi.org/10.1021/acs.chemmater.7b00156>`_
     """
 
-    def __init__(self, data_source=None, properties=("Number",)):
+    def __init__(self, data_source=None, properties=("Number",), impute_nan=False):
         """Initialize the featurizer
 
         Args:
             data_source (AbstractData): Tool used to look up elemental properties
             properties ([string]): List of properties to use for features
+            impute_nan (bool): if True, the features for the elements
+                that are missing from the data_source or are NaNs are replaced by the
+                average of each features over the available elements.
         """
-        self.data_source = data_source or MagpieData()
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = data_source or MagpieData(impute_nan=self.impute_nan)
         self.properties = properties
         self._preset_citations = []
 
@@ -472,7 +485,7 @@ def implementors(self):
         return ["Logan Ward"]
 
     @staticmethod
-    def from_preset(preset):
+    def from_preset(preset, impute_nan=False):
         """Create the class with pre-defined settings
 
         Args:
@@ -483,7 +496,7 @@ def from_preset(preset):
 
         if preset == "seko-prb-2017":
             output = SiteElementalProperty(
-                data_source=MagpieData(),
+                data_source=MagpieData(impute_nan=impute_nan),
                 properties=[
                     "Number",
                     "AtomicWeight",
diff --git a/matminer/featurizers/site/misc.py b/matminer/featurizers/site/misc.py
index bb2c2a1ca..c82a1f08d 100644
--- a/matminer/featurizers/site/misc.py
+++ b/matminer/featurizers/site/misc.py
@@ -1,6 +1,8 @@
 """
 Miscellaneous site featurizers.
 """
+import warnings
+
 import numpy as np
 import pymatgen.analysis.local_env
 from pymatgen.analysis.local_env import VoronoiNN, solid_angle, vol_tetra
@@ -10,6 +12,7 @@
 from matminer.featurizers.utils.stats import PropertyStats
 from matminer.utils.caching import get_nearest_neighbors
 from matminer.utils.data import MagpieData
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 
 class IntersticeDistribution(BaseFeaturizer):
@@ -48,9 +51,12 @@ class IntersticeDistribution(BaseFeaturizer):
             support sub-list of ['dist', 'area', 'vol'].
         stats ([str]): statistics of distance/area/volume interstices.
         radius_type (str): source of radius estimate. (default: "MiracleRadius")
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="MiracleRadius"):
+    def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="MiracleRadius", impute_nan=False):
         self.cutoff = cutoff
         self.interstice_types = ["dist", "area", "vol"] if interstice_types is None else interstice_types
         if isinstance(self.interstice_types, str):
@@ -59,7 +65,10 @@ def __init__(self, cutoff=6.5, interstice_types=None, stats=None, radius_type="M
             raise ValueError("interstice_types only support sub-list of " "['dist', 'area', 'vol']")
         self.stats = ["mean", "std_dev", "minimum", "maximum"] if stats is None else stats
         self.radius_type = radius_type
-        self.data_source = MagpieData()
+        self.impute_nan = impute_nan
+        if not self.impute_nan:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+        self.data_source = MagpieData(impute_nan=impute_nan)
 
     def featurize(self, struct, idx):
         """
diff --git a/matminer/featurizers/site/tests/base.py b/matminer/featurizers/site/tests/base.py
index 50de75d3e..9294b3f52 100644
--- a/matminer/featurizers/site/tests/base.py
+++ b/matminer/featurizers/site/tests/base.py
@@ -60,6 +60,14 @@ def setUp(self):
             coords_are_cartesian=False,
             site_properties=None,
         )
+        self.nans = Structure(
+            Lattice([[3.52, 0, 0], [0, 3.52, 0], [0, 0, 3.52]]),
+            ["Og", "U"],
+            [[0, 0, 0], [0.5, 0.5, 0.5]],
+            validate_proximity=False,
+            to_unit_cell=False,
+            coords_are_cartesian=False,
+        )
 
     def tearDown(self):
         del self.sc
diff --git a/matminer/featurizers/site/tests/test_chemical.py b/matminer/featurizers/site/tests/test_chemical.py
index f7e73df90..d1389bb83 100644
--- a/matminer/featurizers/site/tests/test_chemical.py
+++ b/matminer/featurizers/site/tests/test_chemical.py
@@ -95,34 +95,74 @@ def test_ewald_site(self):
         np.testing.assert_array_almost_equal(ewald.featurize(self.sc, 0), [0])
 
     def test_local_prop_diff(self):
-        f = LocalPropertyDifference()
+        f = LocalPropertyDifference(impute_nan=False)
 
         # Test for Al, all features should be zero
         features = f.featurize(self.sc, 0)
         np.testing.assert_array_almost_equal(features, [0])
 
+        # Test for fictive structure that leads to NaNs
+        features = f.featurize(self.nans, 0)
+        assert np.isnan(features[0])
+
+        # Change the property to Number, compute for B1
+        f.set_params(properties=["Number"])
+        for i in range(2):
+            features = f.featurize(self.b1, i)
+            np.testing.assert_array_almost_equal(features, [1])
+
+        for i in range(2):
+            features = f.featurize(self.nans, i)
+            assert np.isnan(features[0])
+
+        f = LocalPropertyDifference(impute_nan=True)
+
+        # Test for Al, all features should be zero
+        features = f.featurize(self.sc, 0)
+        np.testing.assert_array_almost_equal(features, [0])
+
+        # Test for fictive structure that leads to NaNs
+        features = f.featurize(self.nans, 0)
+        np.testing.assert_array_almost_equal(features, [0.26003609])
+
         # Change the property to Number, compute for B1
         f.set_params(properties=["Number"])
         for i in range(2):
             features = f.featurize(self.b1, i)
             np.testing.assert_array_almost_equal(features, [1])
 
+        for i in range(2):
+            features = f.featurize(self.nans, i)
+            np.testing.assert_array_almost_equal(features, [27.54767206])
+
     def test_site_elem_prop(self):
-        f = SiteElementalProperty.from_preset("seko-prb-2017")
+        f = SiteElementalProperty.from_preset("seko-prb-2017", impute_nan=False)
 
         # Make sure it does the B1 structure correctly
         feat_labels = f.feature_labels()
         feats = f.featurize(self.b1, 0)
         self.assertAlmostEqual(1, feats[feat_labels.index("site Number")])
+        assert np.isnan(feats[feat_labels.index("site SecondIonizationEnergy")])
 
         feats = f.featurize(self.b1, 1)
         self.assertAlmostEqual(2, feats[feat_labels.index("site Number")])
+        assert np.isnan(feats[feat_labels.index("site Electronegativity")])
 
         # Test the citations
         citations = f.citations()
         self.assertEqual(1, len(citations))
         self.assertIn("Seko2017", citations[0])
 
+        f = SiteElementalProperty.from_preset("seko-prb-2017", impute_nan=True)
+        feat_labels = f.feature_labels()
+        feats = f.featurize(self.b1, 0)
+        self.assertAlmostEqual(1, feats[feat_labels.index("site Number")])
+        self.assertAlmostEqual(feats[feat_labels.index("site SecondIonizationEnergy")], 18.781681, 6)
+
+        feats = f.featurize(self.b1, 1)
+        self.assertAlmostEqual(2, feats[feat_labels.index("site Number")])
+        self.assertAlmostEqual(feats[feat_labels.index("site Electronegativity")], 1.715102, 6)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/matminer/featurizers/site/tests/test_misc.py b/matminer/featurizers/site/tests/test_misc.py
index f39d51c8e..b74c372da 100644
--- a/matminer/featurizers/site/tests/test_misc.py
+++ b/matminer/featurizers/site/tests/test_misc.py
@@ -17,7 +17,7 @@ def test_interstice_distribution_of_crystal(self):
         )
         df_bcc_li = pd.DataFrame({"struct": [bcc_li], "site": [1]})
 
-        interstice_distribution = IntersticeDistribution()
+        interstice_distribution = IntersticeDistribution(impute_nan=False)
         intersticefp = interstice_distribution.featurize_dataframe(df_bcc_li, ["struct", "site"])
 
         self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.32, 2)
@@ -33,6 +33,36 @@ def test_interstice_distribution_of_crystal(self):
         self.assertAlmostEqual(intersticefp["Interstice_dist_minimum"][0], 0, 3)
         self.assertAlmostEqual(intersticefp["Interstice_dist_maximum"][0], 0.15461, 5)
 
+        bcc_og = Structure(
+            Lattice([[3.51, 0, 0], [0, 3.51, 0], [0, 0, 3.51]]),
+            ["Og"] * 2,
+            [[0, 0, 0], [0.5, 0.5, 0.5]],
+        )
+        df_bcc_og = pd.DataFrame({"struct": [bcc_og], "site": [1]})
+
+        intersticefp = interstice_distribution.featurize_dataframe(df_bcc_og, ["struct", "site"])
+        self.assertEqual(intersticefp.isna().sum().sum(), 4)
+        self.assertEqual(intersticefp.drop(columns="struct").sum().sum(), 1)
+
+        interstice_distribution = IntersticeDistribution(impute_nan=True)
+        intersticefp = interstice_distribution.featurize_dataframe(df_bcc_li, ["struct", "site"])
+
+        self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.32, 2)
+        self.assertAlmostEqual(intersticefp["Interstice_vol_std_dev"][0], 0)
+        self.assertAlmostEqual(intersticefp["Interstice_vol_minimum"][0], 0.32, 2)
+        self.assertAlmostEqual(intersticefp["Interstice_vol_maximum"][0], 0.32, 2)
+        self.assertAlmostEqual(intersticefp["Interstice_area_mean"][0], 0.16682, 5)
+        self.assertAlmostEqual(intersticefp["Interstice_area_std_dev"][0], 0)
+        self.assertAlmostEqual(intersticefp["Interstice_area_minimum"][0], 0.16682, 5)
+        self.assertAlmostEqual(intersticefp["Interstice_area_maximum"][0], 0.16682, 5)
+        self.assertAlmostEqual(intersticefp["Interstice_dist_mean"][0], 0.06621, 5)
+        self.assertAlmostEqual(intersticefp["Interstice_dist_std_dev"][0], 0.07655, 5)
+        self.assertAlmostEqual(intersticefp["Interstice_dist_minimum"][0], 0, 3)
+        self.assertAlmostEqual(intersticefp["Interstice_dist_maximum"][0], 0.15461, 5)
+        intersticefp = interstice_distribution.featurize_dataframe(df_bcc_og, ["struct", "site"])
+        self.assertEqual(intersticefp.isna().sum().sum(), 0)
+        self.assertAlmostEqual(intersticefp.drop(columns="struct").sum().sum(), 2.581817, 6)
+
     def test_interstice_distribution_of_glass(self):
         cuzr_glass = Structure(
             Lattice([[25, 0, 0], [0, 25, 0], [0, 0, 25]]),
@@ -72,7 +102,7 @@ def test_interstice_distribution_of_glass(self):
         )
         df_glass = pd.DataFrame({"struct": [cuzr_glass], "site": [0]})
 
-        interstice_distribution = IntersticeDistribution()
+        interstice_distribution = IntersticeDistribution(impute_nan=False)
         intersticefp = interstice_distribution.featurize_dataframe(df_glass, ["struct", "site"])
 
         self.assertAlmostEqual(intersticefp["Interstice_vol_mean"][0], 0.28905, 5)
diff --git a/matminer/utils/data.py b/matminer/utils/data.py
index 9a3963ae1..de8aea73d 100644
--- a/matminer/utils/data.py
+++ b/matminer/utils/data.py
@@ -7,11 +7,21 @@
 import abc
 import json
 import os
+import tarfile
+import warnings
+from copy import deepcopy
 from glob import glob
 
 import numpy as np
 import pandas as pd
+from pymatgen.core import Composition
 from pymatgen.core.periodic_table import Element
+from ruamel import yaml
+from scipy import stats
+from scipy.interpolate import interp1d
+
+from matminer.utils.utils import get_elem_in_data, get_pseudo_inverse
+from matminer.utils.warnings import IMPUTE_NAN_WARNING
 
 __author__ = "Kiran Mathew, Jiming Chen, Logan Ward, Anubhav Jain, Alex Dunn"
 
@@ -97,13 +107,31 @@ class CohesiveEnergyData(AbstractData):
     (http://www.knowledgedoor.com/2/elements_handbook/cohesive_energy.html),
     which in turn got the data from Introduction to Solid State Physics,
     8th Edition, by Charles Kittel (ISBN 978-0-471-41526-8), 2005.
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         # Load elemental cohesive energy data from json file
         with open(os.path.join(module_dir, "data_files", "cohesive_energies.json")) as f:
             self.cohesive_energy_data = json.load(f)
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            elem_list = list(self.cohesive_energy_data)
+            missing_elements = [e.symbol for e in Element if e.symbol not in elem_list]
+            avg_cohesive_energy = np.nanmean(list(self.cohesive_energy_data.values()))
+
+            for e in Element:
+                if e.symbol in missing_elements or np.isnan(self.cohesive_energy_data[e.symbol]):
+                    self.cohesive_energy_data[e.symbol] = avg_cohesive_energy
+
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
     def get_elemental_property(self, elem, property_name="cohesive energy"):
         """
         Args:
@@ -126,12 +154,17 @@ class DemlData(OxidationStateDependentData, OxidationStatesMixin):
 
     The meanings of each feature in the data can be found in
     ./data_files/deml_elementdata.py
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         from matminer.utils.data_files.deml_elementdata import properties
 
-        self.all_props = properties
+        self.all_props = deepcopy(properties)
         self.available_props = list(self.all_props.keys()) + [
             "formal_charge",
             "valence_s",
@@ -141,12 +174,6 @@ def __init__(self):
             "total_ioniz",
         ]
 
-        # Compute the FERE correction energy
-        fere_corr = {}
-        for k, v in self.all_props["GGAU_Etot"].items():
-            fere_corr[k] = self.all_props["mus_fere"][k] - v
-        self.all_props["FERE correction"] = fere_corr
-
         # List out the available charge-dependent properties
         self.charge_dependent_properties = [
             "xtal_field_split",
@@ -155,6 +182,47 @@ def __init__(self):
             "sat_magn",
         ]
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            avg_ioniz = np.nanmean([self.all_props["ionization_en"].get(e.symbol, [float("NaN")])[0] for e in Element])
+
+            for e in Element:
+                if e.symbol not in self.all_props["atom_num"]:
+                    self.all_props["atom_num"][e.symbol] = e.Z
+                if e.symbol not in self.all_props["atom_mass"]:
+                    self.all_props["atom_mass"][e.symbol] = e.atomic_mass
+                if e.symbol not in self.all_props["row_num"]:
+                    self.all_props["row_num"][e.symbol] = e.row
+                if e.symbol not in self.all_props["ionization_en"] or np.isnan(
+                    self.all_props["ionization_en"][e.symbol][0]
+                ):
+                    self.all_props["ionization_en"][e.symbol] = [avg_ioniz]
+                for key in [
+                    "col_num",
+                    "atom_radius",
+                    "molar_vol",
+                    "heat_fusion",
+                    "melting_point",
+                    "boiling_point",
+                    "heat_cap",
+                    "electron_affin",
+                    "electronegativity",
+                    "electric_pol",
+                    "GGAU_Etot",
+                    "mus_fere",
+                    "electron_affin",
+                ]:
+                    if e.symbol not in self.all_props[key] or np.isnan(self.all_props[key][e.symbol]):
+                        self.all_props[key][e.symbol] = np.nanmean(list(self.all_props[key].values()))
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
+        # Compute the FERE correction energy
+        fere_corr = {}
+        for k, v in self.all_props["GGAU_Etot"].items():
+            fere_corr[k] = self.all_props["mus_fere"][k] - v
+        self.all_props["FERE correction"] = fere_corr
+
     def get_elemental_property(self, elem, property_name):
         if "valence" in property_name:
             valence_dict = self.all_props["valence_e"][self.all_props["col_num"][elem.symbol]]
@@ -169,13 +237,18 @@ def get_elemental_property(self, elem, property_name):
             return self.all_props[property_name].get(elem.symbol, float("NaN"))
 
     def get_oxidation_states(self, elem):
-        return self.all_props["charge_states"][elem.symbol]
+        if not self.impute_nan:
+            return self.all_props["charge_states"][elem.symbol]
+        else:
+            return self.all_props["charge_states"].get(elem.symbol, [0])
 
     def get_charge_dependent_property(self, element, charge, property_name):
         if property_name == "total_ioniz":
             if charge < 0:
                 raise ValueError("total ionization energy only defined for charge > 0")
             return sum(self.all_props["ionization_en"][element.symbol][:charge])
+        elif self.impute_nan:
+            return self.all_props[property_name].get(element.symbol, {}).get(charge, 0)
         else:
             return self.all_props[property_name].get(element.symbol, {}).get(charge, np.nan)
 
@@ -190,9 +263,14 @@ class MagpieData(AbstractData, OxidationStatesMixin):
 
     Finding the exact meaning of each of these features can be quite difficult.
     Reproduced in ./data_files/magpie_elementdata_feature_descriptions.txt.
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         self.all_elemental_props = dict()
         available_props = []
         self.data_dir = os.path.join(module_dir, "data_files", "magpie_elementdata")
@@ -216,6 +294,37 @@ def __init__(self):
                         prop_value = float("NaN")
                     self.all_elemental_props[descriptor_name][Element.from_Z(atomic_no).symbol] = prop_value
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            for prop in self.all_elemental_props:
+                if prop == "OxidationStates":
+                    nested_props = list(self.all_elemental_props["OxidationStates"].values())
+                    flatten_props = []
+                    for l in nested_props:
+                        if isinstance(l, list):
+                            for e in l:
+                                flatten_props.append(e)
+                        else:
+                            flatten_props.append(l)
+
+                    avg_prop = np.round(np.nanmean(flatten_props))
+                    for e in Element:
+                        if (
+                            e.symbol not in self.all_elemental_props[prop]
+                            or self.all_elemental_props[prop][e.symbol] == []
+                            or np.any(np.isnan(self.all_elemental_props[prop][e.symbol]))
+                        ):
+                            self.all_elemental_props[prop][e.symbol] = [avg_prop]
+                else:
+                    avg_prop = np.nanmean(list(self.all_elemental_props[prop].values()))
+                    for e in Element:
+                        if e.symbol not in self.all_elemental_props[prop] or np.isnan(
+                            self.all_elemental_props[prop][e.symbol]
+                        ):
+                            self.all_elemental_props[prop][e.symbol] = avg_prop
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
     def get_elemental_property(self, elem, property_name):
         return self.all_elemental_props[property_name][elem.symbol]
 
@@ -232,10 +341,16 @@ class PymatgenData(OxidationStateDependentData, OxidationStatesMixin):
 
     Meanings of each feature can be obtained from the pymatgen.Composition
     documentation (attributes).
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self, use_common_oxi_states=True):
+    def __init__(self, use_common_oxi_states=True, impute_nan=False):
         self.use_common_oxi_states = use_common_oxi_states
+        self.impute_nan = impute_nan
 
     def get_elemental_property(self, elem, property_name):
         if property_name == "block":
@@ -243,7 +358,22 @@ def get_elemental_property(self, elem, property_name):
             return block_key[getattr(elem, property_name)]
         else:
             value = getattr(elem, property_name)
-            return np.nan if value is None else value
+            if self.impute_nan:
+                if value and not pd.isnull(value):
+                    return value
+                else:
+                    if property_name == "ionic_radii":
+                        return {0: self.get_elemental_property(elem, "atomic_radius")}
+                    elif property_name in ["common_oxidation_states", "icsd_oxidation_states"]:
+                        return (0,)
+                    else:
+                        all_values = [getattr(e, property_name) for e in Element]
+                        all_values = [val if val else float("NaN") for val in all_values]
+                        value_avg = np.nanmean(all_values)
+                    return value_avg
+            else:
+                warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+                return np.nan if value is None else value
 
     def get_oxidation_states(self, elem):
         """Get the oxidation states of an element
@@ -258,7 +388,13 @@ def get_oxidation_states(self, elem):
         return elem.common_oxidation_states if self.use_common_oxi_states else elem.oxidation_states
 
     def get_charge_dependent_property(self, element, charge, property_name):
-        return getattr(element, property_name)[charge]
+        if self.impute_nan:
+            if property_name == "ionic_radii":
+                return self.get_elemental_property(element, property_name)[charge]
+            elif property_name in ["common_oxidation_states", "icsd_oxidation_states"]:
+                return self.get_elemental_property(element, property_name)[charge]
+        else:
+            return getattr(element, property_name)[charge]
 
 
 class MixingEnthalpy:
@@ -275,12 +411,17 @@ class MixingEnthalpy:
         valid_element_list ([Element]): A list of elements for which the
             mixing enthalpy parameters are defined (although no guarantees
             are provided that all combinations of this list will be available).
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         mixing_dataset = pd.read_csv(
             os.path.join(module_dir, "data_files", "MiedemaLiquidDeltaHf.tsv"),
-            delim_whitespace=True,
+            sep=r"\s+",
         )
         self.mixing_data = {}
         for a, b, dHf in mixing_dataset.itertuples(index=False):
@@ -365,6 +506,18 @@ def __init__(self):
         ]
         self.valid_element_list = [Element(e) for e in valid_elements]
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            avg_value = np.nanmean(list(self.mixing_data.values()))
+            for e1 in Element:
+                for e2 in Element:
+                    key = tuple(sorted((e1.symbol, e2.symbol)))
+                    if key not in self.mixing_data or np.isnan(self.mixing_data[key]):
+                        self.mixing_data[key] = avg_value
+            self.valid_element_list = list(Element)
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
     def get_mixing_enthalpy(self, elemA, elemB):
         """
         Get the mixing enthalpy between different elements
@@ -392,9 +545,14 @@ class MatscholarElementData(AbstractData):
     Tshitoyan, V., Dagdelen, J., Weston, L. et al. Unsupervised word embeddings
     capture latent knowledge from materials science literature. Nature 571,
     95–98 (2019). https://doi.org/10.1038/s41586-019-1335-8
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         dfile = os.path.join(module_dir, "data_files/matscholar_els.json")
         with open(dfile) as fp:
             embeddings = json.load(fp)
@@ -404,6 +562,22 @@ def __init__(self):
             all_element_data[el] = dict(zip(self.prop_names, embedding))
         self.all_element_data = all_element_data
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            for embedding in self.prop_names:
+                avg_value = np.nanmean(
+                    [self.all_element_data[e].get(embedding, float("NaN")) for e in self.all_element_data]
+                )
+                for e in Element:
+                    if e.symbol not in self.all_element_data:
+                        self.all_element_data[e.symbol] = {embedding: avg_value}
+                    elif embedding not in self.all_element_data[e.symbol] or np.isnan(
+                        self.all_element_data[e.symbol][embedding]
+                    ):
+                        self.all_element_data[e.symbol][embedding] = avg_value
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
     def get_elemental_property(self, elem, property_name):
         return self.all_element_data[str(elem)][property_name]
 
@@ -429,9 +603,14 @@ class MEGNetElementData(AbstractData):
 
     The representations are learned during training to predict a specific
     property, though they may be useful for a range of properties.
+
+    Args:
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
     """
 
-    def __init__(self):
+    def __init__(self, impute_nan=False):
         dfile = os.path.join(module_dir, "data_files/megnet_elemental_embedding.json")
         self._dummy = "Dummy"
         with open(dfile) as fp:
@@ -445,6 +624,22 @@ def __init__(self):
             else:
                 self.all_element_data[str(Element.from_Z(i))] = embedding_dict
 
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            for embedding in self.prop_names:
+                avg_value = np.nanmean(
+                    [self.all_element_data[e].get(embedding, float("NaN")) for e in self.all_element_data]
+                )
+                for e in Element:
+                    if e.symbol not in self.all_element_data:
+                        self.all_element_data[e.symbol] = {embedding: avg_value}
+                    elif embedding not in self.all_element_data[e.symbol] or np.isnan(
+                        self.all_element_data[e.symbol][embedding]
+                    ):
+                        self.all_element_data[e.symbol][embedding] = avg_value
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
     def get_elemental_property(self, elem, property_name):
         estr = str(elem)
         if estr not in self.all_element_data.keys():
@@ -572,3 +767,581 @@ def get_bv_params(self, cation, anion, cat_val, an_val):
         ]
         return bond_val_list.iloc[0]  # If multiple values exist, take first one
         # as recommended for reliability.
+
+
+class OpticalData(AbstractData):
+    """
+    Class to use optical data from https://www.refractiveindex.info
+    The properties are the refractive index n, the extinction coefficient ĸ
+    (measured or computed with DFT), and the reflectivity R as obtained from
+    Fresnel's equation.
+    Data is by default considered if available from 380 to 780 nm,
+    but other ranges can be chosen as well.
+
+    In case new data becomes available and needs to be added to the database,
+    it should be added in matminer/utils/data_files/optical_polyanskiy/database,
+    which should then be compressed in the tar.xz format.
+    To add a file for a compound, follow any of the formats of refractiveindex.info.
+
+    The database is used to extract:
+    1) the properties of single elements when available.
+    2) the pseudo-inverse of the properties of single elements,
+       based on the data for ~200 compounds. These pseudo-inverse
+       contributions correspond to the coefficients of a least-square fit
+       from the compositions to the properties. This can allow to better take into account
+       data from different compounds for a given element.
+
+    Using the pseudo-inverses (method="pseudo_inverse") instead of
+    the elemental properties (method="exact") leads to better results as far as we have checked.
+    Another possibility is to use method="combined", where the exact values are taken
+    for compounds present as pure compounds in the database, and the pseudo-inverse
+    is taken if the element is not present purely in the database.
+
+    n, ĸ, and R are spectra. These are composed of n_wl wavelengths, from min_wl to max_wl.
+    We split these spectra into bins (initially 10) where their average values are taken.
+    These averaged values are the final features. The wavelength corresponding to a given bin
+    is its midpoint.
+
+    Args:
+        props: optical properties to include. Should be a list with
+               "refractive" and/or "extinction" and/or "reflectivity".
+        method: type of values, either "exact", "pseudo_inverse", or "combined".
+        min_wl: minimum wavelength to include in the spectra (µm).
+        max_wl : maximum wavelength to include in the spectra (µm).
+        n_wl: number of wavelengths to include in the spectra.
+        bins: number of bins to split the spectra.
+        saving_dir: folder to save the data and csv file used for the featurization. Saving them helps fasten the
+                    featurization.
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
+    """
+
+    def __init__(
+        self,
+        props=None,
+        method="pseudo_inverse",
+        min_wl=0.38,
+        max_wl=0.78,
+        n_wl=401,
+        bins=10,
+        saving_dir="~/.matminer/optical_props/",
+        impute_nan=False,
+    ):
+        # Handles the saving folder
+        saving_dir = os.path.expanduser(saving_dir)
+        os.makedirs(os.path.join(saving_dir, "database"), exist_ok=True)
+        self.saving_dir = saving_dir
+
+        # Handle the selection of properties
+        if props is None:
+            props = ["refractive", "extinction", "reflectivity"]
+        elif not all([prop in ["refractive", "extinction", "reflectivity"] for prop in props]):
+            raise ValueError("This property is not available: choose from refractive, extinction, or reflectivity")
+
+        self.props = props
+        self.method = method
+        self.n_wl = n_wl
+        self.min_wl = min_wl
+        self.max_wl = max_wl
+        self.wavelengths = np.linspace(min_wl, max_wl, n_wl)
+
+        # The data might have already been treated : it is faster to read the data from file
+        dbfile = os.path.join(self.saving_dir, f"optical_polyanskiy_{self.min_wl}_{self.max_wl}_{self.n_wl}.csv")
+
+        if os.path.isfile(dbfile):
+            data = pd.read_csv(dbfile)
+            data.set_index("Compound", inplace=True)
+            self.data = data
+        else:
+            # Recompute the data file from the database
+            warnings.warn(
+                """Datafile not existing for these wavelengths: recollecting the data from the database.
+                   This can take a few seconds..."""
+            )
+            self.data = self._get_optical_data_from_database()
+            self.data.to_csv(dbfile)
+            warnings.warn("The data has been collected and stored.")
+
+        self.elem_data = self._get_element_props()
+
+        # Split into bins
+        bins *= len(props)
+        slices = np.linspace(0, len(self.elem_data.T), bins + 1, True).astype(int)
+        counts = np.diff(slices)
+
+        cols = self.elem_data.columns[slices[:-1] + counts // 2]
+        labels = list(cols)  # [col for col in cols]
+
+        all_element_data = pd.DataFrame(
+            np.add.reduceat(self.elem_data.values, slices[:-1], axis=1) / counts,
+            columns=cols,
+            index=self.elem_data.index,
+        )
+
+        self.all_element_data = all_element_data
+        self.prop_names = labels
+
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            self.all_element_data.fillna(self.all_element_data.mean(), inplace=True)
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
+    def _get_element_props(self):
+        """
+        Returns the properties of single elements from the data contained in the database.
+        """
+
+        data = self.data.copy()
+
+        cols = []
+        if "refractive" in self.props:
+            cols += [name for name in data.columns if "n_" in name]
+        if "extinction" in self.props:
+            cols += [name for name in data.columns if "k_" in name]
+        if "reflectivity" in self.props:
+            cols += [name for name in data.columns if "R_" in name]
+
+        data = data[cols]
+
+        # Compute the exact values
+        res = []
+        elem, elem_absent = get_elem_in_data(data, as_pure=True)
+        for e in elem:
+            res.append(data.loc[e].values)
+        for e in elem_absent:
+            res.append(np.nan * np.ones(len(self.props) * self.n_wl))
+
+        res = np.array(res)
+        df_exact = pd.DataFrame(res, columns=cols, index=pd.Index(elem + elem_absent))
+
+        if self.method == "exact":
+            return df_exact
+        else:
+            # Compute the pseudo-inversed values
+            df_pi = get_pseudo_inverse(self.data, cols)
+
+            if self.method == "pseudo_inverse":
+                return df_pi
+            elif self.method == "combined":
+                res_combined = []
+                for i, e in df_exact.iterrows():
+                    if e.isnull().sum() == 0:
+                        res_combined.append(e.values)
+                    else:
+                        res_combined.append(df_pi.loc[i].values)
+                res_combined = np.array(res_combined)
+                df_combined = pd.DataFrame(res_combined, columns=cols, index=df_exact.index)
+                return df_combined
+
+            else:
+                raise ValueError("The method should be either exact, pseudo_inverse or combined.")
+
+    def _get_optical_data_from_database(self):
+        """
+        Get a dataframe with the refractive index, extinction coefficients, and reflectivity
+        as obtained from the initial database, for an array of wavelengths.
+        We need to handle the database that is in different formats...
+
+        Returns:
+            DataFrame with the data
+        """
+
+        db_dir = os.path.join(self.saving_dir, "database")
+        # The database has been compressed, it needs to be untarred if it is not already the case.
+        if not os.listdir(db_dir):
+            db_file = os.path.join(module_dir, "data_files/optical_polyanskiy/database.tar.xz")
+            with tarfile.open(db_file, mode="r:xz") as tar:
+                tar.extractall(self.saving_dir)
+
+        names = []
+        compos = []
+        N = []
+        K = []
+
+        for material in os.listdir(db_dir):
+            # Some materials have the data in the needed wavelengths range,
+            # others don't and throw an error
+            try:
+                n_avg = self._get_nk_avg(os.path.join(db_dir, material))
+                compos.append(Composition(material))
+                names.append(material)
+                N.append(n_avg.real)
+                K.append(n_avg.imag)
+            except ValueError:
+                pass
+
+        W = 1000 * self.wavelengths
+        N = np.array(N)
+        K = np.array(K)
+        R = ((N - 1) ** 2 + K**2) / ((N + 1) ** 2 + K**2)
+
+        cols_names_n = [f"n_{np.round(wl, 2)}" for wl in W]
+        cols_names_k = [f"k_{np.round(wl, 2)}" for wl in W]
+        cols_names_r = [f"R_{np.round(wl, 2)}" for wl in W]
+
+        df_n = pd.DataFrame(N, columns=cols_names_n)
+        df_k = pd.DataFrame(K, columns=cols_names_k)
+        df_r = pd.DataFrame(R, columns=cols_names_r)
+
+        df = pd.concat([df_n, df_k, df_r], axis=1)
+        df["Composition"] = compos
+        df["Compound"] = names
+        df.set_index("Compound", inplace=True)
+
+        return df
+
+    def _get_optical_data_from_file(self, yaml_file):
+        """
+        From a yml file, returns the refractive index for a given wavelength
+
+        Args:
+            yaml_file: path to the yml file containing the data
+
+        Returns:
+            refractive index (complex number)
+        """
+
+        # Open the yml file
+        with open(yaml_file) as yml:
+            data = yaml.YAML(typ="safe", pure=True).load(yml)["DATA"][0]
+
+        data_format = data["type"]
+
+        # We now treat different formats for the data
+        if data_format in ["tabulated nk", "tabulated n"]:
+            # We parse the data to get the wavelength, n and kappa
+            if data_format == "tabulated nk":
+                arr = np.fromstring(data["data"].replace("\n", " "), sep=" ").reshape((-1, 3))
+                K = arr[:, 2]
+            # kappa not available -> 0
+            elif data_format == "tabulated n":
+                arr = np.fromstring(data["data"].replace("\n", " "), sep=" ").reshape((-1, 2))
+                K = np.zeros(len(arr))
+
+            wl = arr[:, 0]
+            range_wl = np.array([np.min(wl), np.max(wl)])
+            N = arr[:, 1]
+
+            interpN = interp1d(wl, N)
+            interpK = interp1d(wl, K)
+
+            # Check that self.wavelengths are within the range
+            if np.any(self.min_wl < range_wl[0]) or np.any(self.max_wl > range_wl[1]):
+                raise ValueError(
+                    f"""The values of lambda asked to be returned is outside the range
+                of available data. This can lead to strong deviation as extrapolation might be bad. For information, the
+                range is [{range_wl[0]}, {range_wl[1]}] microns."""
+                )
+            else:
+                return np.array([x for x in np.nditer(interpN(self.wavelengths) + 1j * interpK(self.wavelengths))])
+
+        # If the data is not tabulated, it is given with a formula
+        elif "formula" in data_format:
+            range_wl = np.fromstring(data["wavelength_range"], sep=" ")
+            # Check that lamb is within the range
+            if np.any(self.min_wl < range_wl[0]) or np.any(self.max_wl > range_wl[1]):
+                raise ValueError(
+                    f""""The values of lambda asked to be returned is outside the range
+                of available data. This can lead to strong deviation as extrapolation might be bad. For information, the
+                range is [{range_wl[0]}, {range_wl[1]}] microns."""
+                )
+            else:
+                coeff_file = np.fromstring(data["coefficients"], sep=" ")
+
+                N = np.zeros(self.n_wl) + 1j * np.zeros(self.n_wl)
+
+                if data_format == "formula 1":
+                    coeffs = np.zeros(17)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += 1 + coeffs[0]
+                    for i in range(1, 17, 2):
+                        N += (coeffs[i] * self.wavelengths**2) / (self.wavelengths**2 - coeffs[i + 1] ** 2)
+                    N = np.sqrt(N)
+
+                elif data_format == "formula 2":
+                    coeffs = np.zeros(17)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += 1 + coeffs[0]
+                    for i in range(1, 17, 2):
+                        N += (coeffs[i] * self.wavelengths**2) / (self.wavelengths**2 - coeffs[i + 1])
+                    N = np.sqrt(N)
+
+                elif data_format == "formula 3":
+                    coeffs = np.zeros(17)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += coeffs[0]
+                    for i in range(1, 17, 2):
+                        N += coeffs[i] * self.wavelengths ** coeffs[i + 1]
+                    N = np.sqrt(N)
+
+                elif data_format == "formula 4":
+                    coeffs = np.zeros(17)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += coeffs[0]
+                    N += coeffs[1] * self.wavelengths ** coeffs[2] / (self.wavelengths**2 - coeffs[3] ** coeffs[4])
+                    N += coeffs[5] * self.wavelengths ** coeffs[6] / (self.wavelengths**2 - coeffs[7] ** coeffs[8])
+                    for i in range(9, 17, 2):
+                        N += coeffs[i] * self.wavelengths ** coeffs[i + 1]
+                    N = np.sqrt(N)
+
+                elif data_format == "formula 5":
+                    coeffs = np.zeros(11)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += coeffs[0]
+                    for i in range(1, 11, 2):
+                        N += coeffs[i] * self.wavelengths ** coeffs[i + 1]
+
+                elif data_format == "formula 6":
+                    coeffs = np.zeros(11)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += 1 + coeffs[0]
+                    for i in range(1, 11, 2):
+                        N += coeffs[i] * self.wavelengths**2 / (coeffs[i + 1] * self.wavelengths**2 - 1)
+
+                elif data_format == "formula 7":
+                    coeffs = np.zeros(6)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += (
+                        coeffs[0]
+                        + coeffs[1] / (self.wavelengths**2 - 0.028)
+                        + coeffs[2] / (self.wavelengths**2 - 0.028) ** 2
+                    )
+                    N += (
+                        coeffs[3] * self.wavelengths**2
+                        + coeffs[4] * self.wavelengths**4
+                        + coeffs[5] * self.wavelengths**6
+                    )
+
+                elif data_format == "formula 8":
+                    coeffs = np.zeros(4)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += (
+                        coeffs[0]
+                        + coeffs[1] * self.wavelengths**2 / (self.wavelengths**2 - coeffs[2])
+                        + coeffs[3] * self.wavelengths**2
+                    )
+                    N = np.sqrt((1 + 2 * N) / (1 - N))
+
+                elif data_format == "formula 9":
+                    coeffs = np.zeros(6)
+                    coeffs[0 : len(coeff_file)] = coeff_file
+                    N += (
+                        coeffs[0]
+                        + coeffs[1] / (self.wavelengths**2 - coeffs[2])
+                        + coeffs[3] * (self.wavelengths - coeffs[4]) / ((self.wavelengths - coeffs[4]) ** 2 + coeffs[5])
+                    )
+                    N = np.sqrt(N)
+
+                return N + 1j * np.zeros(len(N))
+
+        else:
+            raise ValueError("UnsupportedDataType: This data type is currently not supported !")
+
+    def _get_nk_avg(self, dirname):
+        """
+        Compute the average of n and ĸ for a compound of the database.
+
+        Args:
+            dirname: path to the compound of interest
+
+        Returns:
+            refractive index (complex number)
+        """
+
+        files = os.listdir(dirname)
+
+        navg = []
+        for f in files:
+            # Some files have only kappa : they raise a ValueError
+            try:
+                # We get the average of all curves in the region of interest.
+                # For some systems, the region of interest is not covered.
+                navg.append(self._get_optical_data_from_file(os.path.join(dirname, f)))
+            except ValueError:
+                pass
+
+        # We go further only if there is data
+        if navg:
+            navg = np.array(navg).mean(axis=0)
+            Navg = navg.real
+            Kavg = navg.imag
+            return Navg + 1j * Kavg
+        else:
+            raise ValueError(f"No correct data for {dirname}")
+
+    def get_elemental_property(self, elem, property_name):
+        estr = str(elem)
+        return self.all_element_data.loc[estr][property_name]
+
+
+class TransportData(AbstractData):
+    """
+    Class to use transport data from Ricci et al., see
+    An ab initio electronic transport database for inorganic materials.
+    Ricci, F., Chen, W., Aydemir, U., Snyder, G. J., Rignanese, G. M., Jain, A., & Hautier, G. (2017).
+    Scientific data, 4(1), 1-13.
+    https://doi.org/10.1038/sdata.2017.85
+
+    The database has been used to extract:
+    1) the properties of single elements when available.
+       These are stored in matminer/utils/data_files/mp_transport/transport_pure_elems.csv
+    2) the pseudo-inverse of the properties of single elements.
+       These pseudo-inverse contributions correspond to the coefficients of a least-square fit
+       from the compositions to the properties. This can allow to better take into account
+       data from different compounds for a given element.
+
+    Using the pseudo-inverses (method="pseudo_inverse") instead of
+    the elemental properties (method="exact") leads to better results as far as we have checked.
+    Another possibility is to use method="combined", where the exact values are taken
+    for compounds present as pure compounds in the database, and the pseudo-inverse
+    is taken if the element is not present purely in the database.
+
+    For the effective mass, the pseudo-inverse is obtained on 1/(alpha+m),
+    then m is re-obtained for single elements. This is to avoid
+    huge errors coming from the huge spread in data (12 orders of magnitude).
+
+    Args:
+        props: optical properties to include. Should be a (sub)list of
+               ["sigma_p", "sigma_n", "S_p", "S_n", "kappa_p", "kappa_n", "PF_p", "PF_n", "m_p", "m_n"]
+               for the hole (_p) and electron (_n) conductivity (sigma), Seebeck coefficient (S),
+               thermal conductivity (kappa), power factor (PF) and effective mass (m).
+        method: type of values, either "exact", "pseudo_inverse", or "combined".
+        alpha: Value used to featurize the effective mass.
+               The values of the effective masses span 12 orders of magnitude, which makes the pseudo-inverse biased
+               To overcome this, we use 1 / (alpha + m) for the pseudo-inversion.
+               The value of alpha can be tested. A file for each of them is created,
+               so that it is not computed each time.
+               Defaults to 0, and used only if method != "exact".
+        saving_dir: folder to save the data and csv file used for the featurization. Saving them helps fasten the
+                    featurization.
+        impute_nan (bool): if True, the features for the elements
+            that are missing from the data_source or are NaNs are replaced by the
+            average of each features over the available elements.
+    """
+
+    def __init__(
+        self,
+        props=None,
+        method="pseudo_inverse",
+        alpha=0,
+        saving_dir="~/.matminer/transport_props/",
+        impute_nan=False,
+    ):
+        # Handles the saving folder
+        saving_dir = os.path.expanduser(saving_dir)
+        os.makedirs(saving_dir, exist_ok=True)
+        self.saving_dir = saving_dir
+
+        # Handle the selection of properties
+        possible_props = ["sigma_p", "sigma_n", "S_p", "S_n", "kappa_p", "kappa_n", "PF_p", "PF_n", "m_p", "m_n"]
+        if props is None:
+            props = possible_props
+        elif not all([prop in possible_props for prop in props]):
+            raise ValueError("This property is not available: choose from " + ", ".join(possible_props))
+
+        self.props = props
+        self.method = method
+        self.alpha = alpha
+
+        # Read the database as a compressed or csv file
+        dfile = os.path.join(module_dir, "data_files/mp_transport/", "transport_database.csv")
+        dfile_tarred = os.path.join(module_dir, "data_files/mp_transport/", "transport_database.tar.xz")
+        if not os.path.isfile(dfile) and os.path.isfile(dfile_tarred):
+            with tarfile.open(dfile_tarred, mode="r:xz") as tar:
+                tar.extractall(os.path.join(module_dir, "data_files/mp_transport/"))
+        data = pd.read_csv(dfile).drop(columns=["mp_id"])
+        data.rename(columns={"pretty_formula": "Compound"}, inplace=True)
+        data.set_index("Compound", inplace=True)
+
+        # Remove outliers
+        data = data[(np.abs(stats.zscore(data)) < 3).all(axis=1)]
+
+        # Add composition
+        compos = []
+        for c in data.index:
+            compos.append(Composition(c))
+        data["Composition"] = compos
+
+        self.data = data
+
+        self.all_element_data = self._get_element_props()
+
+        self.prop_names = list(self.all_element_data.columns)
+
+        self.impute_nan = impute_nan
+        if self.impute_nan:
+            self.all_element_data.fillna(self.all_element_data.mean(), inplace=True)
+        else:
+            warnings.warn(f"{self.__class__.__name__}(impute_nan=False):\n" + IMPUTE_NAN_WARNING)
+
+    def _get_element_props(self):
+        #
+        # Compute the exact values
+        #
+        dfile = os.path.join(module_dir, "data_files/mp_transport/", "transport_pure_elems.csv")
+        df_exact = pd.read_csv(dfile)
+        df_exact.set_index("Element", inplace=True)
+        df_exact.drop(columns=["mp_id"], inplace=True)
+
+        elem, elem_absent = get_elem_in_data(df_exact, as_pure=True)
+        res = []
+        for e in elem:
+            res.append(df_exact.loc[e][self.props].values)
+        for e in elem_absent:
+            res.append(np.nan * np.ones(len(self.props)))
+
+        res = np.array(res)
+        cols = self.props
+        df_exact = pd.DataFrame(res, columns=cols, index=elem + elem_absent)
+        df_exact = df_exact[self.props]
+
+        if self.method == "exact":
+            return df_exact
+        else:
+            #
+            # Retrieve or compute the pseudo-inversed values.
+            #
+            dbfile = os.path.join(self.saving_dir, f"transport_pi_{self.alpha}.csv")
+            if os.path.isfile(dbfile):
+                df_pi = pd.read_csv(dbfile)
+                df_pi.set_index("Element", inplace=True)
+                df_pi = df_pi[self.props]
+            else:
+                warnings.warn(
+                    """Pseudo-inverse values not found for this value of alpha. Recomputing them...
+                       This can take a few seconds..."""
+                )
+                TP = self.data.copy()
+                TP["1/m_p"] = 1 / (self.alpha + TP["m_p"].values)
+                TP["1/m_n"] = 1 / (self.alpha + TP["m_n"].values)
+
+                df_pi = get_pseudo_inverse(TP)
+
+                df_pi["m_p"] = 1 / df_pi["1/m_p"].values - self.alpha
+                df_pi["m_n"] = 1 / df_pi["1/m_n"].values - self.alpha
+                df_pi.drop(columns=["1/m_p", "1/m_n"], inplace=True)
+                df_pi.index.name = "Element"
+
+                df_pi.to_csv(dbfile)
+                warnings.warn("The pseudo-inverse coefficients have been collected and stored.")
+
+            if self.method == "pseudo_inverse":
+                return df_pi
+            elif self.method == "combined":
+                res_combined = []
+                for i, e in df_exact.iterrows():
+                    if e.isnull().sum() == 0:
+                        res_combined.append(e.values)
+                    else:
+                        res_combined.append(df_pi.loc[i].values)
+                res_combined = np.array(res_combined)
+                df_combined = pd.DataFrame(res_combined, columns=df_exact.columns, index=df_exact.index)
+                return df_combined
+            else:
+                raise ValueError("The method should be either exact or pseudo_inverse.")
+
+    def get_elemental_property(self, elem, property_name):
+        estr = str(elem)
+        return self.all_element_data.loc[estr][property_name]
diff --git a/matminer/utils/data_files/mp_transport/transport_database.tar.xz b/matminer/utils/data_files/mp_transport/transport_database.tar.xz
new file mode 100644
index 000000000..e7c88ef98
Binary files /dev/null and b/matminer/utils/data_files/mp_transport/transport_database.tar.xz differ
diff --git a/matminer/utils/data_files/mp_transport/transport_pure_elems.csv b/matminer/utils/data_files/mp_transport/transport_pure_elems.csv
new file mode 100644
index 000000000..490094056
--- /dev/null
+++ b/matminer/utils/data_files/mp_transport/transport_pure_elems.csv
@@ -0,0 +1,51 @@
+mp_id,Element,sigma_p,sigma_n,S_p,S_n,kappa_p,kappa_n,PF_p,PF_n,m_p,m_n
+mp-135,Li,925330,925361,-2.43243,-2.43481,6820550,6820580,5.47493,5.48581,3.05E-05,3.05E-05
+mp-20,Be,1093100,1093600,-18.1913,-18.169,7591050,7588400,361.73,361.009,2.58E-05,2.58E-05
+mp-160,B,95.0934,34.1649,390.474,-655.344,537.571,189.446,14.6366,14.6929,0.296333954,0.82480567
+mp-48,C,5252.98,5276.01,5.26675,-12.5669,111354,110907,0.0297537,1.10388,0.00536446,0.005341044
+mp-154,N,1.89187,10.0096,887.732,-707.978,5.53941,53.6861,1.49093,5.01714,14.89499979,2.815237696
+mp-12957,O,5.00329,10.6565,776.765,-738.948,21.7704,50.8498,3.0109,5.87144,6.41334,4.22984
+mp-127,Na,732035,732088,-3.22304,-3.2284,5406820,5407110,7.60438,7.63023,3.85E-05,3.85E-05
+mp-153,Mg,1192170,1192230,-3.36428,-3.3631,8709300,8709530,16.3136,16.2827,2.36E-05,2.36E-05
+mp-134,Al,3132280,3132330,-1.60472,-1.60452,22912300,22912900,8.06597,8.06411,9.00E-06,9.00E-06
+mp-149,Si,88.7122,95.893,525.926,-513.639,558.792,523.124,24.5376,25.299,0.317649695,0.293862985
+mp-157,P,165.26,182.981,209.384,-227.097,5389.87,5021.21,8.91698,9.31858,0.170515571,0.154001799
+mp-77,S,6.87888,4.19204,610.272,-660.083,24.552,11.3981,2.72008,1.84124,4.096510369,6.722121747
+mp-58,K,352794,352833,-4.52238,-4.52063,2584820,2585150,7.21529,7.21053,7.99E-05,7.99E-05
+mp-45,Ca,302737,302738,-0.809231,-0.806981,2196400,2196440,0.198249,0.197149,9.31E-05,9.31E-05
+mp-67,Sc,288099,288098,-0.945031,-0.942615,1974750,1974720,9.05286,9.06289,9.78E-05,9.78E-05
+mp-72,Ti,165898,165902,-4.22226,-4.22989,1331760,1331800,14.0871,14.1085,0.00016986,0.000169856
+mp-146,V,1355480,1355470,3.24851,3.24894,9812320,9812260,14.3041,14.3079,2.08E-05,2.08E-05
+mp-90,Cr,1025850,1025840,1.20852,1.21177,7357490,7357790,1.49828,1.50633,2.75E-05,2.75E-05
+mp-542909,Mn,53352.6,53352.6,-0.302024,-0.30516,429129,429131,0.00486675,0.00496835,0.000528173,0.000528173
+mp-13,Fe,813572,813563,3.87831,3.87783,5971310,5971220,12.2372,12.234,3.46E-05,3.46E-05
+mp-54,Co,705466,705480,-5.46444,-5.46471,5189260,5189310,21.1108,21.1134,3.99E-05,3.99E-05
+mp-23,Ni,812607,812601,10.1936,10.1932,6120530,6120510,84.4369,84.4304,3.47E-05,3.47E-05
+mp-30,Cu,1630220,1630230,-1.41329,-1.41834,12126400,12127100,3.2562,3.27951,1.73E-05,1.73E-05
+mp-79,Zn,1649870,1650010,-2.2726,-2.25933,11757100,11755300,8.567,8.47377,1.71E-05,1.71E-05
+mp-142,Ga,540375,540276,6.79349,6.77515,4114570,4114110,37.6701,37.6324,5.21E-05,5.22E-05
+mp-32,Ge,1764.96,1926.27,-84.9765,-118.158,53593.9,50314.7,12.7448,26.8933,0.015966029,0.014629
+mp-11,As,104292,104176,21.8047,21.7828,797177,796004,67.0258,66.7326,0.000270197,0.000270498
+mp-70,Rb,286207,286244,-4.43738,-4.43716,2092990,2093160,5.63552,5.63567,9.85E-05,9.84E-05
+mp-76,Sr,72801.8,72825.8,-30.5886,-30.7422,991104,990971,68.1178,68.8265,0.00038707,0.000386943
+mp-112,Y,320047,320045,0.876605,0.879684,2200450,2200410,1.9417,1.95116,8.80E-05,8.80E-05
+mp-131,Zr,304499,304524,-11.4099,-11.4144,2334460,2334590,41.553,41.5854,9.25E-05,9.25E-05
+mp-75,Nb,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
+mp-129,Mo,1491480,1491490,-0.00429693,0.00554035,10290200,10290100,2.75E-05,4.58E-05,1.89E-05,1.89E-05
+mp-33,Ru,1463730,1463720,0.434967,0.434432,10749000,10749000,5.23266,5.24393,1.93E-05,1.93E-05
+mp-74,Rh,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
+mp-2,Pd,1007340,1007330,8.33372,8.33356,7473540,7473380,69.9605,69.9571,2.80E-05,2.80E-05
+mp-124,Ag,1631090,1631240,-3.77911,-3.78152,12041200,12040500,23.2946,23.3265,1.73E-05,1.73E-05
+mp-85,In,1988630,1988620,-0.538077,-0.542589,14778700,14780000,0.575763,0.585455,1.42E-05,1.42E-05
+mp-117,Sn,27096.3,27158.8,-16.1396,-16.7422,252932,253013,7.05821,7.61264,0.001039972,0.001037579
+mp-104,Sb,28155.1,28216.8,-14.6875,-15.964,316195,316514,7.06549,7.97584,0.001000863,0.000998675
+mp-122,Ba,156444,156416,11.9025,11.9038,1145330,1145140,22.1632,22.1641,0.000180125,0.000180157
+mp-103,Hf,340719,340730,-3.76607,-3.76486,2485960,2486050,6.09816,6.09334,8.27E-05,8.27E-05
+mp-42,Ta,45661.8,45659.2,7.94558,7.95357,323410,323408,4.66919,4.6756,0.000617133,0.000617168
+mp-91,W,1087360,1087430,-6.02206,-6.04159,8615380,8615410,39.4334,39.6922,2.59E-05,2.59E-05
+mp-49,Os,1392840,1392830,-0.00168482,-0.00302132,10231700,10232000,0.71973,0.710894,2.02E-05,2.02E-05
+mp-101,Ir,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
+mp-126,Pt,1467730,1467710,6.41142,6.41156,10719000,10718800,60.3328,60.3349,1.92E-05,1.92E-05
+mp-81,Au,1559610,1559550,0.396481,0.394712,11447200,11448700,0.245167,0.242974,1.81E-05,1.81E-05
+mp-20483,Pb,1638450,1638460,-0.50349,-0.496498,11809500,11809800,0.415351,0.403897,1.72E-05,1.72E-05
+mp-23152,Bi,33807.5,33539.1,90.3033,89.4824,416624,416229,476.761,465.214,0.000833525,0.000840196
diff --git a/matminer/utils/data_files/optical_polyanskiy/README b/matminer/utils/data_files/optical_polyanskiy/README
new file mode 100644
index 000000000..5477cb1ef
--- /dev/null
+++ b/matminer/utils/data_files/optical_polyanskiy/README
@@ -0,0 +1,5 @@
+This data has been retrieved from https://github.com/polyanskiy/refractiveindex.info-database
+on August 27, 2022. The script used to transform the data from the original repository to
+an easier-to-use database for matminer can be found in matminer/dev_scripts/dataset_management/handle_refractiveindex.py
+This script is ugly because the data format is not homogeneous and many things have been handled by hand.
+Do not hesitate to contact Matgenix in case you have troubles to re-create the dataset used by matminer (in case you want to update the database).
diff --git a/matminer/utils/data_files/optical_polyanskiy/database.tar.xz b/matminer/utils/data_files/optical_polyanskiy/database.tar.xz
new file mode 100644
index 000000000..d67c29a66
Binary files /dev/null and b/matminer/utils/data_files/optical_polyanskiy/database.tar.xz differ
diff --git a/matminer/utils/tests/test_data.py b/matminer/utils/tests/test_data.py
index 52a801486..2d174a0c6 100644
--- a/matminer/utils/tests/test_data.py
+++ b/matminer/utils/tests/test_data.py
@@ -2,6 +2,7 @@
 from math import isnan
 from unittest import TestCase
 
+import pytest
 from pymatgen.core import Element
 from pymatgen.core.periodic_table import Specie
 
@@ -12,7 +13,9 @@
     MatscholarElementData,
     MEGNetElementData,
     MixingEnthalpy,
+    OpticalData,
     PymatgenData,
+    TransportData,
 )
 
 
@@ -20,7 +23,8 @@ class TestDemlData(TestCase):
     """Tests for the DemlData Class"""
 
     def setUp(self):
-        self.data_source = DemlData()
+        self.data_source = DemlData(impute_nan=False)
+        self.data_source_imputed = DemlData(impute_nan=True)
 
     def test_get_property(self):
         self.assertAlmostEqual(
@@ -45,27 +49,102 @@ def test_get_property(self):
             self.data_source.get_charge_dependent_property_from_specie(Specie("V", 3), "xtal_field_split"),
         )
 
+        self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "mus_fere")))
+        self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "electron_affin")))
+        with pytest.raises(KeyError):
+            self.data_source.get_elemental_property(Element("Og"), "first_ioniz")
+        with pytest.raises(KeyError):
+            self.data_source.get_elemental_property(Element("Og"), "total_ioniz")
+        self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "xtal_field_split")))
+
+        self.assertAlmostEqual(
+            -4.3853,
+            self.data_source_imputed.get_elemental_property(Element("Bi"), "mus_fere"),
+            4,
+        )
+        self.assertEqual(
+            59600,
+            self.data_source_imputed.get_elemental_property(Element("Li"), "electron_affin"),
+        )
+        self.assertAlmostEqual(
+            2372300,
+            self.data_source_imputed.get_elemental_property(Element("He"), "first_ioniz"),
+        )
+        self.assertAlmostEqual(
+            sum([2372300, 5250500]),
+            self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("He", 2), "total_ioniz"),
+        )
+        self.assertAlmostEqual(
+            18.6,
+            self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("V", 3), "xtal_field_split"),
+        )
+
+        self.assertAlmostEqual(
+            -3.934405,
+            self.data_source_imputed.get_elemental_property(Element("Og"), "mus_fere"),
+            6,
+        )
+        self.assertAlmostEqual(
+            76858.235294,
+            self.data_source_imputed.get_elemental_property(Element("Og"), "electron_affin"),
+            6,
+        )
+        self.assertAlmostEqual(
+            811242.222222,
+            self.data_source_imputed.get_elemental_property(Element("Og"), "first_ioniz"),
+            6,
+        )
+        self.assertAlmostEqual(
+            811242.222222,
+            self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("Og", 2), "total_ioniz"),
+            6,
+        )
+        self.assertEqual(
+            0,
+            self.data_source_imputed.get_charge_dependent_property_from_specie(Specie("Og", 3), "xtal_field_split"),
+        )
+
     def test_get_oxidation(self):
         self.assertEqual([1], self.data_source.get_oxidation_states(Element("Li")))
+        with pytest.raises(KeyError):
+            self.data_source.get_oxidation_states(Element("Og"))
+
+        self.assertEqual([1], self.data_source_imputed.get_oxidation_states(Element("Li")))
+        self.assertEqual([0], self.data_source_imputed.get_oxidation_states(Element("Og")))
 
 
 class TestMagpieData(TestCase):
     def setUp(self):
-        self.data_source = MagpieData()
+        self.data_source = MagpieData(impute_nan=False)
+        self.data_source_imputed = MagpieData(impute_nan=True)
 
     def test_get_property(self):
         self.assertAlmostEqual(
             9.012182,
             self.data_source.get_elemental_property(Element("Be"), "AtomicWeight"),
         )
+        self.assertTrue(isnan(self.data_source.get_elemental_property(Element("Og"), "AtomicWeight")))
+
+        self.assertAlmostEqual(
+            9.012182,
+            self.data_source_imputed.get_elemental_property(Element("Be"), "AtomicWeight"),
+        )
+        self.assertAlmostEqual(
+            138.710894, self.data_source_imputed.get_elemental_property(Element("Og"), "AtomicWeight"), 6
+        )
 
     def test_get_oxidation(self):
         self.assertEqual([-4, 2, 4], self.data_source.get_oxidation_states(Element("C")))
+        self.assertTrue(isnan(self.data_source.get_oxidation_states(Element("Og"))))
+
+        self.assertEqual([-4, 2, 4], self.data_source_imputed.get_oxidation_states(Element("C")))
+        self.assertEqual([3.0], self.data_source_imputed.get_oxidation_states(Element("Og")))
 
 
 class TestPymatgenData(TestCase):
     def setUp(self):
-        self.data_source = PymatgenData()
+        self.data_source = PymatgenData(impute_nan=False)
+        self.data_source_imputed = PymatgenData(impute_nan=True)
 
     def test_get_property(self):
         self.assertAlmostEqual(
@@ -76,16 +155,49 @@ def test_get_property(self):
             1.26,
             self.data_source.get_charge_dependent_property(Element("Ac"), 3, "ionic_radii"),
         )
+        with pytest.raises(KeyError):
+            self.data_source.get_charge_dependent_property(Element("Og"), 0, "ionic_radii")
+        with pytest.raises(IndexError):
+            self.data_source.get_charge_dependent_property(Element("Og"), 0, "common_oxidation_states")
+        with pytest.raises(IndexError):
+            self.data_source.get_charge_dependent_property(Element("Og"), 0, "icsd_oxidation_states")
+
+        self.assertAlmostEqual(
+            9.012182,
+            self.data_source_imputed.get_elemental_property(Element("Be"), "atomic_mass"),
+        )
+        self.assertAlmostEqual(
+            1.26,
+            self.data_source_imputed.get_charge_dependent_property(Element("Ac"), 3, "ionic_radii"),
+        )
+        self.assertAlmostEqual(
+            1.472889, self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "ionic_radii"), 6
+        )
+        self.assertEqual(
+            0,
+            self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "common_oxidation_states"),
+        )
+        self.assertEqual(
+            0,
+            self.data_source_imputed.get_charge_dependent_property(Element("Og"), 0, "icsd_oxidation_states"),
+        )
 
     def test_get_oxidation(self):
         self.assertEqual((3,), self.data_source.get_oxidation_states(Element("Nd")))
         self.data_source.use_common_oxi_states = False
         self.assertEqual((2, 3), self.data_source.get_oxidation_states(Element("Nd")))
+        self.assertEqual((), self.data_source.get_oxidation_states(Element("Og")))
+
+        self.assertEqual((3,), self.data_source_imputed.get_oxidation_states(Element("Nd")))
+        self.data_source_imputed.use_common_oxi_states = False
+        self.assertEqual((2, 3), self.data_source_imputed.get_oxidation_states(Element("Nd")))
+        self.assertEqual((), self.data_source_imputed.get_oxidation_states(Element("Og")))
 
 
 class TestMatScholarData(TestCase):
     def setUp(self):
-        self.data_source = MatscholarElementData()
+        self.data_source = MatscholarElementData(impute_nan=False)
+        self.data_source_imputed = MatscholarElementData(impute_nan=True)
 
     def test_get_property(self):
         embedding_cu = self.data_source.get_elemental_property(Element("Cu"), "embedding 3")
@@ -94,10 +206,17 @@ def test_get_property(self):
         with self.assertRaises(KeyError):
             self.data_source.get_elemental_property(Element("Db"), "embedding 9")
 
+        embedding_cu = self.data_source_imputed.get_elemental_property(Element("Cu"), "embedding 3")
+        self.assertAlmostEqual(0.028666902333498, embedding_cu)
+
+        embedding_db = self.data_source_imputed.get_elemental_property(Element("Db"), "embedding 9")
+        self.assertAlmostEqual(0.015063986881835006, embedding_db)
+
 
 class TestMEGNetData(TestCase):
     def setUp(self):
-        self.data_source = MEGNetElementData()
+        self.data_source = MEGNetElementData(impute_nan=False)
+        self.data_source_imputed = MEGNetElementData(impute_nan=True)
 
     def test_get_property(self):
         embedding_cu = self.data_source.get_elemental_property(Element("Cu"), "embedding 1")
@@ -111,16 +230,27 @@ def test_get_property(self):
         embedding_dummy = self.data_source.all_element_data["Dummy"]["embedding 1"]
         self.assertAlmostEqual(-0.044910576194524765, embedding_dummy)
 
+        embedding_cu = self.data_source_imputed.get_elemental_property(Element("Cu"), "embedding 1")
+        self.assertAlmostEqual(0.18259364366531372, embedding_cu)
+
+        embedding_md = self.data_source_imputed.get_elemental_property(Element("Md"), "embedding 1")
+        self.assertAlmostEqual(-0.0010715560693489877, embedding_md)
+
 
 class TestMixingEnthalpy(TestCase):
     def setUp(self):
-        self.data = MixingEnthalpy()
+        self.data = MixingEnthalpy(impute_nan=False)
+        self.data_imputed = MixingEnthalpy(impute_nan=True)
 
     def test_get_data(self):
         self.assertEqual(-27, self.data.get_mixing_enthalpy(Element("H"), Element("Pd")))
         self.assertEqual(-27, self.data.get_mixing_enthalpy(Element("Pd"), Element("H")))
         self.assertTrue(isnan(self.data.get_mixing_enthalpy(Element("He"), Element("H"))))
 
+        self.assertEqual(-27, self.data_imputed.get_mixing_enthalpy(Element("H"), Element("Pd")))
+        self.assertEqual(-27, self.data_imputed.get_mixing_enthalpy(Element("Pd"), Element("H")))
+        self.assertAlmostEqual(-14.203577, self.data_imputed.get_mixing_enthalpy(Element("He"), Element("H")), 6)
+
 
 class TestIUCrBondValenceData(TestCase):
     def setUp(self):
@@ -131,5 +261,55 @@ def test_get_data(self):
         self.assertAlmostEqual(nacl["Ro"], 2.15)
 
 
+class TestOpticalData(TestCase):
+    def setUp(self):
+        self.data_source = OpticalData(impute_nan=False)
+        self.data_source_imputed = OpticalData(impute_nan=True)
+
+    def test_get_data(self):
+        au_r = self.data_source.get_elemental_property(elem="Au", property_name="R_400.0")
+        self.assertAlmostEqual(au_r, 0.25489326484782054)
+        ag_n = self.data_source.get_elemental_property(elem="Ag", property_name="n_600.0")
+        self.assertAlmostEqual(ag_n, 0.13644859985917585)
+        c_k = self.data_source.get_elemental_property(elem="C", property_name="k_760.0")
+        self.assertAlmostEqual(c_k, 0.7462931865379264)
+        og_r = self.data_source.get_elemental_property(elem="Og", property_name="R_400.0")
+        self.assertTrue(isnan(og_r))
+
+        au_r = self.data_source_imputed.get_elemental_property(elem="Au", property_name="R_400.0")
+        self.assertAlmostEqual(au_r, 0.25489326484782054)
+        ag_n = self.data_source_imputed.get_elemental_property(elem="Ag", property_name="n_600.0")
+        self.assertAlmostEqual(ag_n, 0.13644859985917585)
+        c_k = self.data_source_imputed.get_elemental_property(elem="C", property_name="k_760.0")
+        self.assertAlmostEqual(c_k, 0.7462931865379264)
+        og_r = self.data_source_imputed.get_elemental_property(elem="Og", property_name="R_400.0")
+        self.assertAlmostEqual(og_r, 0.4624005395190695)
+
+
+class TestTransportData(TestCase):
+    def setUp(self):
+        self.data_source = TransportData(impute_nan=False)
+        self.data_source_imputed = TransportData(impute_nan=True)
+
+    def test_get_data(self):
+        ca_mn = self.data_source.get_elemental_property(elem="Ca", property_name="m_n")
+        self.assertAlmostEqual(ca_mn, 0.000279554)
+        cr_sigmap = self.data_source.get_elemental_property(elem="Cr", property_name="sigma_p")
+        self.assertAlmostEqual(cr_sigmap, 205569.89838499163)
+        cu_kappan = self.data_source.get_elemental_property(elem="Cu", property_name="kappa_n")
+        self.assertAlmostEqual(cu_kappan, 1814544.75663, places=5)
+        og_mn = self.data_source.get_elemental_property(elem="Og", property_name="m_n")
+        self.assertTrue(isnan(og_mn))
+
+        ca_mn = self.data_source_imputed.get_elemental_property(elem="Ca", property_name="m_n")
+        self.assertAlmostEqual(ca_mn, 0.000279554)
+        cr_sigmap = self.data_source_imputed.get_elemental_property(elem="Cr", property_name="sigma_p")
+        self.assertAlmostEqual(cr_sigmap, 205569.89838499163)
+        cu_kappan = self.data_source_imputed.get_elemental_property(elem="Cu", property_name="kappa_n")
+        self.assertAlmostEqual(cu_kappan, 1814544.75663, places=5)
+        og_mn = self.data_source_imputed.get_elemental_property(elem="Og", property_name="m_n")
+        self.assertAlmostEqual(og_mn, 0.03237036761677134)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/matminer/utils/tests/test_utils.py b/matminer/utils/tests/test_utils.py
new file mode 100644
index 000000000..9773dbcf5
--- /dev/null
+++ b/matminer/utils/tests/test_utils.py
@@ -0,0 +1,37 @@
+import os
+import unittest
+from unittest import TestCase
+
+import pandas as pd
+from pymatgen.core import Composition
+
+from matminer.utils.utils import get_elem_in_data, get_pseudo_inverse
+
+test_dir = os.path.dirname(os.path.abspath(__file__))
+
+
+class UtilsTest(TestCase):
+    def setUp(self):
+        data = pd.read_csv(os.path.join(test_dir, "utils_dataframe.csv"))
+        data.set_index("Compound", inplace=True)
+        comp = []
+        for c in data.index:
+            comp.append(Composition(c))
+        data["Composition"] = comp
+        self.data = data
+
+    def test_get_elem_in_data(self):
+        elem, elem_absent = get_elem_in_data(self.data, as_pure=True)
+        self.assertListEqual(elem, ["Zn", "Au"])
+
+        elem, elem_absent = get_elem_in_data(self.data, as_pure=False)
+        self.assertListEqual(elem, ["N", "O", "Al", "Si", "S", "Zn", "Ga", "Ge", "As", "Y", "Ag", "Sn", "Au"])
+
+    def test_get_pseudo_inverse(self):
+        PI = get_pseudo_inverse(self.data, cols=["n_380.0"])
+        self.assertAlmostEqual(PI["n_380.0"][0], -1.6345896211391995)
+        self.assertAlmostEqual(PI["n_380.0"][4], 4.155467999999999)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/matminer/utils/tests/utils_dataframe.csv b/matminer/utils/tests/utils_dataframe.csv
new file mode 100644
index 000000000..99fa9c578
--- /dev/null
+++ b/matminer/utils/tests/utils_dataframe.csv
@@ -0,0 +1,10 @@
+Compound,n_380.0
+Y3 Al5 O12,1.866205082
+Zn,9.671
+Au2 Ag3,1.12038
+Al198 Ga802 As1000,4.1915
+Si3 N4,2.098345509
+Al1 As1,4.010296333
+Sn1 S2,3.46289
+Si85 Ge15,6.234
+Au,1.617719566
diff --git a/matminer/utils/utils.py b/matminer/utils/utils.py
index 4191addc6..7f91fd9a4 100644
--- a/matminer/utils/utils.py
+++ b/matminer/utils/utils.py
@@ -1,6 +1,10 @@
+""" Utility module """
 import warnings
 
+import numpy as np
 import pandas as pd
+from numpy.linalg import pinv
+from pymatgen.core import Composition, Element
 
 
 def homogenize_multiindex(df, default_key, coerce=False):
@@ -38,3 +42,100 @@ def homogenize_multiindex(df, default_key, coerce=False):
             "multiindexed Matminer featurization without coercion "
             "to 2 levels."
         )
+
+
+def get_elem_in_data(df, as_pure=False):
+    """
+    Looks for all elements present in the compounds forming the index of a dataframe.
+
+    Args:
+        df: DataFrame containing the compounds formula (as strings) as index
+        as_pure: if True, consider only the pure compounds
+
+    Returns:
+        List of elements (str)
+    """
+    elems_in_df = []
+    elems_not_in_df = []
+
+    # Find the elements in the data, as pure or not
+    if as_pure:
+        for elem in Element:
+            if elem.name in df.index:
+                elems_in_df.append(elem.name)
+    else:
+        for elem in Element:
+            for compound in df.index.to_list():
+                if elem.name in compound and elem.name not in elems_in_df:
+                    elems_in_df.append(elem.name)
+
+    # Find the elements not in the data
+    for elem in Element:
+        if elem.name not in elems_in_df:
+            elems_not_in_df.append(elem.name)
+
+    return elems_in_df, elems_not_in_df
+
+
+def get_pseudo_inverse(df_init, cols=None):
+    """
+    Computes the pseudo-inverse matrix of a dataframe containing properties for multiple compositions.
+    From a compositions matrix (containing the elemental fractions of each element present in the data),
+    and a property column (or multiple properties gathered in a matrix), the pseudo-inverse column
+    (or matrix if multiple properties are present) is defined by
+
+    compositions * pseudo-inverse = property
+
+    The pseudo-inverse coefficients are therefore average contributions of each element present in the data to the
+    property of the compounds containing this element (in a least-square fit manner).
+    This allows to take many compounds-property into consideration.
+
+    Note that the pseudo-inverse coefficients do not represent the property of single elements in their pure form:
+    negative values may appear for single elements and for physically-positive properties, but this only reflects
+    the fact that the presence of the element in compounds generally decreases the value of the property.
+
+    For elements that are not present in compounds of the data, nan are returned.
+
+    Args:
+        df_init: DataFrame with a column named "Composition" containing compositions
+                 (anything that can be turned into a Pymatgen Composition object),
+                 and other columns containing properties to be inversed.
+        cols: list of columns of the dataframe giving the features to be pseudo-inversed.
+
+    Returns:
+        DataFrame with the pseudo-inverse coefficients for all elements present in the initial compositions and all
+        properties.
+    """
+    df = df_init.copy()
+
+    if "Composition" not in df.columns:
+        raise LookupError("The DataFrame should contain a column named Composition")
+
+    if cols is None:
+        cols = list(df.columns)
+        cols.remove("Composition")
+
+    data = df[cols]
+    data.index = df["Composition"].map(lambda s: str(s))
+
+    elems_in_df, elems_not_in_df = get_elem_in_data(data, as_pure=False)
+    n_elem_tot = len(elems_in_df)
+
+    # Initialize the matrix
+    A = np.zeros([len(data), n_elem_tot])
+
+    compos = df["Composition"]
+    for i, comp in enumerate(compos):
+        comp = Composition(comp)
+        for j in range(n_elem_tot):
+            if elems_in_df[j] in comp:
+                A[i, j] = comp.get_atomic_fraction(elems_in_df[j])
+
+    pi_A = pinv(A)
+
+    res_pi = pi_A @ data.values
+    res_pi = np.vstack([res_pi, np.nan * np.ones([len(elems_not_in_df), len(df.T) - 1])])
+
+    df_pi = pd.DataFrame(res_pi, columns=cols, index=pd.Index(elems_in_df + elems_not_in_df))
+
+    return df_pi
diff --git a/matminer/utils/warnings.py b/matminer/utils/warnings.py
new file mode 100644
index 000000000..e11f9c965
--- /dev/null
+++ b/matminer/utils/warnings.py
@@ -0,0 +1,6 @@
+IMPUTE_NAN_WARNING = """In a future release, impute_nan will be set to True by default.
+                    This means that features that are missing or are NaNs for elements
+                    from the data source will be replaced by the average of that value
+                    over the available elements.
+                    This avoids NaNs after featurization that are often replaced by
+                    dataset-dependent averages."""