Merge pull request #2708 from jdebacker/taxsim_c_set

TAXSIM-35 Validation, "c" files
PSLmodels · Mar 12, 2024 · 598f1d6 · 598f1d6
2 parents 4b665fc + a2173ff
commit 598f1d6
Show file tree

Hide file tree

Showing 14 changed files with 163 additions and 159 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,4 +20,7 @@ puf.csv
 *.ipynb_checkpoints*
 
 # Built documentation
-docs/_build/
+docs/_build/
+
+# Validation files
+taxcalc/validation/taxsim35/actual_differences/
diff --git a/taxcalc/validation/taxsim35/Differences_Explained.md b/taxcalc/validation/taxsim35/Differences_Explained.md
@@ -27,6 +27,8 @@ This document explains the sources of known differences (that exceed $1) between
 
 ## `b` files:
 
+The following are notes that explain differences in addition to those documented above for the `a` files.
+
 ### All years
 
 * Differences in AGI between TAXSIM-35 and Tax-Calculator are due to an incorrect calculation of the SECA tax liability in TAXSIM-35.  Half of the SECA tax amount is deductible from AGI on individuals' returns.
@@ -37,4 +39,16 @@ This document explains the sources of known differences (that exceed $1) between
 ### 2020
 
 * Three records in the test files with differences in the recovery rebate credit amount (RRC). The reasons TAXSIM-35 shows different results vary and include: TAXSIM-35 not counting qualifying children (e.g., file "a", id 7); TAXSIM-35 not differentiating single/head of household filing status (e.g., file "a", id 31); TAXSIM-35 not counting Economic Impact Payment 2 (e.g., file "a", id 33); and TAXSIM-35 counting the wrong number of children (e.g., file "a", id 59). Note that some of these are not errors per se, but can be related to different variable inputs in the two models.
-* There is also a single record with a differences in `e02300`, unemployment insurance benefits, and input to the models.  This variable is zeroed out in TAXSIM-35, but not in Tax-Calculator.
+* There is also a single record with a difference in `e02300`, unemployment insurance benefits, an input to the models.  This variable is zeroed out in TAXSIM-35, but not in Tax-Calculator.
+
+## `c` files:
+
+The following are notes that explain differences in addition to those documented above for the `b` files.
+
+### All years
+
+* The `c` file set is the only one that simulates itemized deduction amounts. We have documented differences in `c04470` in all years of the `c` files. In the version of TAXSIM-35 we are using, the itemized deduction is always returned as zero. Hand calculations have confirmed Tax-Calculator's itemized deduction amounts are correct.
+
+### 2017
+
+* There are differences in variable `c21040`, itemized deductions that are phased out.  This only affects the `c` set as itemized deductions are not included in records in the `a` and `b` sets.  Further, tax law only has a phase out of itemized deductions in 2017 and earlier, hence no effect on later years.  The root source of the error is the known difference in the handling of itemized deductions between TAXSIM-35 and Tax-Calculator noted above.
diff --git a/taxcalc/validation/taxsim35/actual_differences/b17-taxdiffs-actual.csv b/taxcalc/validation/taxsim35/actual_differences/b17-taxdiffs-actual.csv
diff --git a/taxcalc/validation/taxsim35/actual_differences/b18-taxdiffs-actual.csv b/taxcalc/validation/taxsim35/actual_differences/b18-taxdiffs-actual.csv
diff --git a/taxcalc/validation/taxsim35/actual_differences/b19-taxdiffs-actual.csv b/taxcalc/validation/taxsim35/actual_differences/b19-taxdiffs-actual.csv
diff --git a/taxcalc/validation/taxsim35/actual_differences/b20-taxdiffs-actual.csv b/taxcalc/validation/taxsim35/actual_differences/b20-taxdiffs-actual.csv
diff --git a/taxcalc/validation/taxsim35/actual_differences/b21-taxdiffs-actual.csv b/taxcalc/validation/taxsim35/actual_differences/b21-taxdiffs-actual.csv
diff --git a/taxcalc/validation/taxsim35/input_setup.py b/taxcalc/validation/taxsim35/input_setup.py
@@ -2,6 +2,7 @@
 Generates TAXSIM-35 `.in` input files, downloads `.in.out-taxsim` output files,
 prepares files for Tax Calculator and zips them
 """
+
 import os
 import glob
 import taxsim_input

diff --git a/taxcalc/validation/taxsim35/main_comparison.py b/taxcalc/validation/taxsim35/main_comparison.py
@@ -5,6 +5,7 @@
 import sys
 import os
 import pandas as pd
+import numpy as np
 import tc_sims
 
 CUR_PATH = os.path.abspath(os.path.dirname(__file__))
@@ -14,6 +15,8 @@
 
 
 def main(letter, year):
+
+    test_passed = False  # set boolean to False, change if tests pass
     # (1) generate TAXSIM-35-formatted output using Tax-Calculator tc CLI
     tc_sims.io(letter, year)
 
@@ -102,7 +105,11 @@ def main(letter, year):
         # delim_whitespace=True,
         index_col=False,
     )
-    with pd.ExcelWriter(os.path.join(CUR_PATH, "actual_differences", f"{letter}{year}differences.xlsx")) as writer:
+    with pd.ExcelWriter(
+        os.path.join(
+            CUR_PATH, "actual_differences", f"{letter}{year}differences.xlsx"
+        )
+    ) as writer:
         # use to_excel function and specify the sheet_name and index
         # to store the dataframe in specified sheet
         taxsim_df.to_excel(writer, sheet_name="taxsim", index=False)
@@ -123,7 +130,9 @@ def main(letter, year):
             key=lambda x: x[1],
         )
 
-        diff_dict["max_diff"].append(taxcalc_df.loc[ind, col] - taxsim_df.loc[ind, col])
+        diff_dict["max_diff"].append(
+            taxcalc_df.loc[ind, col] - taxsim_df.loc[ind, col]
+        )
         if max_val != 0:
             diff_dict["max_diff_index"].append(ind)
             diff_dict["max_diff_taxsim_val"].append(taxsim_df.loc[ind, col])
@@ -134,15 +143,22 @@ def main(letter, year):
             diff_dict["max_diff_taxcalc_val"].append("no diff")
 
     actual_df = pd.DataFrame(diff_dict, index=taxsim_df.columns[3:])
-    print(f"Difference in dataframes for assumption set {letter} in year {year}")
+    print(
+        f"Difference in dataframes for assumption set {letter} in year {year}"
+    )
     print(actual_df)
 
     # (3) check for difference between LYY.taxdiffs-actual and LYY.taxdiffs-expect
-    expected_file_name = os.path.join(CUR_PATH, "expected_differences", f"{letter}{year}-taxdiffs-expect.csv")
+    expected_file_name = os.path.join(
+        CUR_PATH, "expected_differences", f"{letter}{year}-taxdiffs-expect.csv"
+    )
     if os.path.isfile(expected_file_name):
         expect_df = pd.read_csv(expected_file_name, index_col=0)
-
         print(actual_df.eq(expect_df))
+        test_passed = np.allclose(
+            actual_df[["# of differing records", "max_diff"]].values,
+            expect_df[["# of differing records", "max_diff"]].values
+                      )
 
         print(
             "Above, True values mean the element is the same between the ACTUAL and EXPECT dataframes. "
@@ -152,7 +168,15 @@ def main(letter, year):
         print("This EXPECT file doesn't exist.")
 
     # (4) Write the created df to *.taxdiffs-actual
-    actual_df.to_csv(os.path.join(CUR_PATH, "actual_differences", f"{letter}{year}-taxdiffs-actual.csv"))
+    actual_df.to_csv(
+        os.path.join(
+            CUR_PATH,
+            "actual_differences",
+            f"{letter}{year}-taxdiffs-actual.csv",
+        )
+    )
+
+    return test_passed
 
 
 if __name__ == "__main__":

diff --git a/taxcalc/validation/taxsim35/prepare_taxcalc_input.py b/taxcalc/validation/taxsim35/prepare_taxcalc_input.py
@@ -1,6 +1,7 @@
 """
 Translates TAXSIM-35 input file to Tax-Calculator tc input file.
 """
+
 # CODING-STYLE CHECKS:
 # pycodestyle prepare_tc_input.py
 # pylint --disable=locally-disabled prepare_tc_input.py
@@ -101,12 +102,16 @@ def translate(ivar):
     invar = ivar.rename(TAXSIM_TC_MAP, axis=1)
     invar["n24"] = ivar["dep17"]
     # Create variables for Tax-Calculator that aren't directly represented in TAXSIM
-    invar["e02000"] += invar["e26270"]  # add active scorp income to "otherprop" income from taxsim
+    invar["e02000"] += invar[
+        "e26270"
+    ]  # add active scorp income to "otherprop" income from taxsim
     mstat = ivar["mstat"]
     assert np.all(np.logical_or(mstat == 1, mstat == 2))
     num_deps = ivar["depx"]
     mars = np.where(mstat == 1, np.where(num_deps > 0, 4, 1), 2)
-    assert np.all(np.logical_or(mars == 1, np.logical_or(mars == 2, mars == 4)))
+    assert np.all(
+        np.logical_or(mars == 1, np.logical_or(mars == 2, mars == 4))
+    )
     invar["MARS"] = mars
     num_eitc_qualified_kids = ivar["dep18"]
     invar["f2441"] = invar["nu13"]
@@ -123,12 +128,12 @@ def translate(ivar):
     invar["e01500"] = invar["e01700"]
     invar["e02300"] = invar["pui"] + invar["sui"]
     # variables for QBID calculation
-    invar[
-        "PT_SSTB_income"
-    ] = 0  # np.where(invar['pprofinc'] + invar['sprofinc'] > 0, 1, 0)
-    invar[
-        "PT_SSTB_income"
-    ] = 0  # np.where(invar['e26270'] > 0, 1, invar['PT_SSTB_income'])
+    invar["PT_SSTB_income"] = (
+        0  # np.where(invar['pprofinc'] + invar['sprofinc'] > 0, 1, 0)
+    )
+    invar["PT_SSTB_income"] = (
+        0  # np.where(invar['e26270'] > 0, 1, invar['PT_SSTB_income'])
+    )
 
     # Drop TAXSIM variables not used in Tax-Calculator
     invar.drop(

diff --git a/taxcalc/validation/taxsim35/process_taxcalc_output.py b/taxcalc/validation/taxsim35/process_taxcalc_output.py
@@ -1,6 +1,7 @@
 """
 Translates tc --dump output file into file formatted like TAXSIM-35 output.
 """
+
 # CODING-STYLE CHECKS:
 # pycodestyle process_tc_output.py
 # pylint --disable=locally-disabled process_tc_output.py
@@ -68,24 +69,32 @@ def write_taxsim_formatted_output(filename, tcvar):
     tcvar["state"] = 0  # state code is always zero
     tcvar["statetax"] = 0.0  # no state income tax calculation
     tcvar["mtr_state"] = 0.0  # no state income tax calculation
-    tcvar["zero_bracket_amount"] = 0.0  # always set zero-bracket amount to zero
+    tcvar["zero_bracket_amount"] = (
+        0.0  # always set zero-bracket amount to zero
+    )
     pre_phase_out_pe = tcvar["pre_c04600"].values
     post_phase_out_pe = tcvar["c04600"].values
     phased_out_pe = pre_phase_out_pe - post_phase_out_pe
-    tcvar["post_phase_out_pe"] = post_phase_out_pe  # post-phase-out personal exemption
-    tcvar["phased_out_pe"] = phased_out_pe  # personal exemption that is phased out
+    tcvar["post_phase_out_pe"] = (
+        post_phase_out_pe  # post-phase-out personal exemption
+    )
+    tcvar["phased_out_pe"] = (
+        phased_out_pe  # personal exemption that is phased out
+    )
     tcvar["exemption_surtax"] = 0.0  # always set exemption surtax to zero
     tcvar["gen_tax_credit"] = 0.0  # always set general tax credit to zero
     tcvar["non_refundable_child_odep_credit"] = (
         tcvar["c07220"] + tcvar["odc"] + tcvar["ctc_new"]
     )  # non-refundable child+odep credit
-    tcvar["refundable_CDCC"] = tcvar["CDCC_refund"] # refundable CDCC
+    tcvar["refundable_CDCC"] = tcvar["CDCC_refund"]  # refundable CDCC
     tcvar["amt_liability"] = tcvar["c09600"]  # federal AMT liability
     # var28 from TAXSIM-35 is federal income tax before credits; the Tax-Calculator
     # tcvar['c05800'] is this concept but includes AMT liability
     # while Internet-TAXSIM tcvar[28] explicitly excludes AMT liability, so
     # we have the following:
-    tcvar["iitax_before_credits_ex_AMT"] = tcvar["c05800"] - tcvar["amt_liability"]
+    tcvar["iitax_before_credits_ex_AMT"] = (
+        tcvar["c05800"] - tcvar["amt_liability"]
+    )
     tcvar = tcvar[
         [
             "RECID",
@@ -117,11 +126,15 @@ def write_taxsim_formatted_output(filename, tcvar):
             "c62100",
             "amt_liability",
             "iitax_before_credits_ex_AMT",
-            "recovery_rebate_credit"
+            "recovery_rebate_credit",
         ]
     ]
     # better mapping to how TAXSIM-35 handles refundable credits in 2021
-    tcvar.loc[tcvar["FLPDYR"] == 2021, "c11070"] = tcvar.loc[tcvar["FLPDYR"] == 2021, "non_refundable_child_odep_credit"]
-    tcvar.loc[tcvar["FLPDYR"] == 2021, "c07180"] = tcvar.loc[tcvar["FLPDYR"] == 2021, "refundable_CDCC"]
+    tcvar.loc[tcvar["FLPDYR"] == 2021, "c11070"] = tcvar.loc[
+        tcvar["FLPDYR"] == 2021, "non_refundable_child_odep_credit"
+    ]
+    tcvar.loc[tcvar["FLPDYR"] == 2021, "c07180"] = tcvar.loc[
+        tcvar["FLPDYR"] == 2021, "refundable_CDCC"
+    ]
     tcvar.round(decimals=2)
     tcvar.to_csv(filename)