update with ggplot

dalejn · Oct 18, 2020 · fde08d3 · fde08d3
1 parent 728365e
commit fde08d3
Showing 1 changed file with 81 additions and 3 deletions.
diff --git a/cleanBib.ipynb b/cleanBib.ipynb
@@ -680,7 +680,7 @@
     "hispanic = [10]\n",
     "print ('looping through your references, predicting gender and race')\n",
     "\n",
-    "columns=['Reference Key','Author','Gender','W','A']\n",
+    "columns=['CitationKey','Author','Gender','W','A', 'GendCat']\n",
     "paper_df = pd.DataFrame(columns=columns)\n",
     "\n",
     "gender = []\n",
@@ -765,9 +765,9 @@
     "\tif la_gender['gender'] == 'unknown':\n",
     "\t\tla_g = gb[2:] \n",
     "\t\n",
-    "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:])]).reshape(1,5)\n",
+    "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:]), '']).reshape(1,6)\n",
     "\tpaper_df = paper_df.append(pd.DataFrame(fa_data,columns=columns),ignore_index =True)\n",
-    "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:])]).reshape(1,5)\n",
+    "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1,6)\n",
     "\tpaper_df = paper_df.append(pd.DataFrame(la_data,columns=columns),ignore_index =True)\n",
     "\n",
     "\tmm = fa_g[0]*la_g[0]\n",
@@ -887,6 +887,84 @@
     "paper_df.to_csv('/home/jovyan/predictions.csv')"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "kernel": "R"
+   },
+   "outputs": [],
+   "source": [
+    "# Plot citation counts and citation rates per predicted gender category #\n",
+    "library(ggplot2)\n",
+    "\n",
+    "# predictions.csv holds two rows per cited paper (first author, then last author).\n",
+    "# Only the last-author row carries the combined first+last author category in GendCat;\n",
+    "# the first-author row has an empty GendCat.\n",
+    "names <- read.csv('/home/jovyan/predictions.csv', header = TRUE, stringsAsFactors = FALSE)\n",
+    "\n",
+    "# Abbreviate the labels. 'female' must be replaced before 'male', which it contains.\n",
+    "names$GendCat <- gsub('female', 'W', names$GendCat, fixed = TRUE)\n",
+    "names$GendCat <- gsub('male', 'M', names$GendCat, fixed = TRUE)\n",
+    "names$GendCat <- gsub('unknown', 'U', names$GendCat, fixed = TRUE)\n",
+    "\n",
+    "# Keep one row per cited paper so that every paper is counted exactly once, both in\n",
+    "# the per-category counts and in the total used as the percentage denominator.\n",
+    "cited <- names[!is.na(names$GendCat) & names$GendCat != '', , drop = FALSE]\n",
+    "total_citations <- nrow(cited)\n",
+    "if (total_citations == 0) {\n",
+    "  stop('No gender categories found in predictions.csv; nothing to plot.', call. = FALSE)\n",
+    "}\n",
+    "gend_cats <- unique(cited$GendCat)  # get a vector of all the gender categories in your paper\n",
+    "\n",
+    "\n",
+    "### For each gender category from your paper: the category (e.g., WW, MM), the number of\n",
+    "### citations in it, and its percentage (number / total number of citations * 100) ###\n",
+    "number_gend_cat <- tabulate(match(cited$GendCat, gend_cats), nbins = length(gend_cats))\n",
+    "dat_for_plot <- data.frame(gender_category = gend_cats,\n",
+    "                           number = number_gend_cat,\n",
+    "                           percentage = (number_gend_cat / total_citations) * 100,\n",
+    "                           stringsAsFactors = FALSE)\n",
+    "\n",
+    "\n",
+    "# Create a data frame with only the WW, MW, WM, MM categories and their base rates - to plot percent citations relative to benchmarks.\n",
+    "# Base rates are matched to categories by label, and a category with no citations is\n",
+    "# kept with a count of 0, so the table does not depend on which categories occur\n",
+    "# in the data or on the order in which they first appear.\n",
+    "benchmark_cats <- c('WW', 'MW', 'WM', 'MM')\n",
+    "benchmark_row <- match(benchmark_cats, dat_for_plot$gender_category)\n",
+    "dat_for_baserate_plot <- data.frame(gender_category = benchmark_cats,\n",
+    "                                    number = dat_for_plot$number[benchmark_row],\n",
+    "                                    percentage = dat_for_plot$percentage[benchmark_row],\n",
+    "                                    baserate = c(6.7, 9.4, 25.5, 58.4),\n",
+    "                                    stringsAsFactors = FALSE)\n",
+    "absent <- is.na(benchmark_row)\n",
+    "dat_for_baserate_plot$number[absent] <- 0L\n",
+    "dat_for_baserate_plot$percentage[absent] <- 0\n",
+    "dat_for_baserate_plot$citation_rel_to_baserate <- dat_for_baserate_plot$percentage - dat_for_baserate_plot$baserate\n",
+    "\n",
+    "\n",
+    "# Plot the number of papers per predicted gender category #\n",
+    "# Fix the factor levels so categories keep their order of first appearance in the legend.\n",
+    "dat_for_plot$gender_category <- factor(dat_for_plot$gender_category, levels = dat_for_plot$gender_category)\n",
+    "ggplot(dat_for_plot, aes(x = gender_category, y = number, fill = gender_category)) +\n",
+    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
+    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM', 'UW', 'UM', 'WU', 'MU', 'UU')) +\n",
+    "  geom_text(aes(label = number), vjust = -0.3, color = 'black', size = 2.5) +\n",
+    "  # theme_classic() is a complete theme, so it goes first and theme() then adjusts it\n",
+    "  theme_classic(base_size = 15) + theme(legend.position = 'right') +\n",
+    "  xlab('Predicted gender category') + ylab('Number of papers') + ggtitle('')\n",
+    "\n",
+    "\n",
+    "# Plot the % citations relative to benchmarks against predicted gender category\n",
+    "ggplot(dat_for_baserate_plot, aes(x = gender_category, y = citation_rel_to_baserate, fill = gender_category)) +\n",
+    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
+    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM')) +\n",
+    "  geom_text(aes(label = round(citation_rel_to_baserate, digits = 2)), vjust = -0.3, color = 'black', size = 2.5) +\n",
+    "  theme_classic(base_size = 15) + theme(legend.position = 'right') +\n",
+    "  xlab('Predicted gender category') + ylab('% of citations relative to benchmarks') + ggtitle('')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {