twitter · presnick · Mar 8, 2024 · Mar 8, 2024
diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
@@ -182,9 +182,13 @@ def _filter_misleading_notes(
   """
   This function actually filters ratings (not notes), based on which notes they rate.
 
-  Filter out ratings of notes that say the Tweet isn't misleading.
-  Also filter out ratings of deleted notes, unless they were deleted after
-    c.deletedNotesTombstoneLaunchTime, and appear in noteStatusHistory.
+  Keep ratings of undeleted notes that either:
+    - say the Tweet is misleading
+    - OR say the Tweet isn't misleading, but fall after the new UI launch time, c.notMisleadingUILaunchTime.
+       (After that timestamp, we started assessing the helpfulness of notes that say the Tweet isn't misleading.
+        Before that timestamp, we did not assess the helpfulness of such notes.)
+  Also keep ratings of deleted notes if:
+    - they were previously scored (i.e. they appear in noteStatusHistory)
 
   Args:
       notes (pd.DataFrame): _description_
@@ -228,13 +232,16 @@ def _filter_misleading_notes(
       f"Preprocess Data: Filter misleading notes, starting with {len(ratings)} ratings on {len(np.unique(ratings[c.noteIdKey]))} notes"
     )
     print(
-      f"  Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} misleading notes"
+      f"  Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} notes that claim the tweet is misleading"
     )
     print(
       f"  Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)"
     )
     print(
-      f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading."
+      f"  Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} notes that do not claim the tweet is misleading, but after the new UI launch time"
+    )
+    print(
+      f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but do not claim the tweet is misleading."
     )
     print(
       f"  Removing {deletedNotInNSH.sum()} ratings on {len(np.unique(ratings.loc[deletedNotInNSH, c.noteIdKey]))} notes that were deleted and not in note status history (e.g. old)."
@@ -255,7 +262,7 @@ def _filter_misleading_notes(
   return ratings
 
 
-def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
+def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame:  
   """Drop duplicate ratings, then assert that there is exactly one rating per noteId per raterId.
 
   Args: