From fde08d31b90cba48e42fb2cccf4695b725048eea Mon Sep 17 00:00:00 2001
From: Dale Zhou <dalejn@gmail.com>
Date: Sun, 18 Oct 2020 16:40:39 -0400
Subject: [PATCH] update with ggplot

---
 cleanBib.ipynb | 84 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 3 deletions(-)

diff --git a/cleanBib.ipynb b/cleanBib.ipynb
index cbf9d25..1f280d2 100644
--- a/cleanBib.ipynb
+++ b/cleanBib.ipynb
@@ -680,7 +680,7 @@
     "hispanic = [10]\n",
     "print ('looping through your references, predicting gender and race')\n",
     "\n",
-    "columns=['Reference Key','Author','Gender','W','A']\n",
+    "columns=['CitationKey','Author','Gender','W','A', 'GendCat']\n",
     "paper_df = pd.DataFrame(columns=columns)\n",
     "\n",
     "gender = []\n",
@@ -765,9 +765,9 @@
     "\tif la_gender['gender'] == 'unknown':\n",
     "\t\tla_g = gb[2:] \n",
     "\t\n",
-    "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:])]).reshape(1,5)\n",
+    "\tfa_data = np.array([paper,'%s,%s'%(fa_fname,fa_lname),'%s,%s'%(fa_gender['gender'],fa_gender['accuracy']),fa_race[0],np.sum(fa_race[1:]), '']).reshape(1,6)\n",
     "\tpaper_df = paper_df.append(pd.DataFrame(fa_data,columns=columns),ignore_index =True)\n",
-    "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:])]).reshape(1,5)\n",
+    "\tla_data = np.array([paper,'%s,%s'%(la_fname,la_lname),'%s,%s'%(la_gender['gender'],la_gender['accuracy']),la_race[0],np.sum(la_race[1:]), '%s%s' % (fa_gender['gender'], la_gender['gender'])]).reshape(1,6)\n",
     "\tpaper_df = paper_df.append(pd.DataFrame(la_data,columns=columns),ignore_index =True)\n",
     "\n",
     "\tmm = fa_g[0]*la_g[0]\n",
@@ -887,6 +887,84 @@
     "paper_df.to_csv('/home/jovyan/predictions.csv')"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "kernel": "R"
+   },
+   "outputs": [],
+   "source": [
+    "# Load predictions.csv and recode GendCat (first + last author gender) into W/M/U pairs, e.g. 'femalemale' -> 'WM' #\n",
+    "names <- read.csv('/home/jovyan/predictions.csv', header = TRUE)\n",
+    "names$GendCat <- gsub(\"female\", \"W\", names$GendCat, fixed = TRUE)  # must run before \"male\", which is a substring of \"female\"\n",
+    "names$GendCat <- gsub(\"male\", \"M\", names$GendCat, fixed = TRUE)\n",
+    "names$GendCat <- gsub(\"unknown\", \"U\", names$GendCat, fixed = TRUE)\n",
+    "total_citations <- sum(!is.na(names$GendCat) & names$GendCat != '')  # one per cited paper: only last-author rows carry a GendCat, first-author rows are written with ''\n",
+    "gend_cats <- unique(names$GendCat)  # get a vector of all the gender categories in your paper (this still includes the blank first-author entry)\n",
+    "\n",
+    "# Build the data frame behind both histograms: one row per gender category (e.g., WW, MM) holding the category, its citation count, and its percentage of all citations (count / total_citations * 100) #\n",
+    "# Row 1 is an all-NA placeholder that is removed again before plotting #\n",
+    "dat_for_plot <- data.frame(gender_category = NA,\n",
+    "                           number = NA,\n",
+    "                           percentage = NA)\n",
+    "\n",
+    "\n",
+    "### Tally each gender category found in your paper and append one row per category to dat_for_plot ###\n",
+    "# NOTE(review): this guard skips the tally when the CSV holds exactly one row - intent unclear, confirm\n",
+    "if (length(names$GendCat) != 1) {\n",
+    "  \n",
+    "  # seq_along() (unlike 1:length()) runs zero times when there are no categories at all\n",
+    "  for (i in seq_along(gend_cats)) {\n",
+    "    \n",
+    "    # Category label, number of citations carrying it, and that number as a percentage of all citations\n",
+    "    gend_cat <- gend_cats[i]\n",
+    "    # sum(..., na.rm = TRUE) keeps missing GendCat values out of every category's count\n",
+    "    number_gend_cat <- sum(names$GendCat == gend_cat, na.rm = TRUE)\n",
+    "    perc_gend_cat <- (number_gend_cat / total_citations) * 100\n",
+    "    \n",
+    "    # One-row data frame for this category, appended below the rows collected so far\n",
+    "    temp_df <- data.frame(gender_category = gend_cat,\n",
+    "                          number = number_gend_cat,\n",
+    "                          percentage = perc_gend_cat,\n",
+    "                          stringsAsFactors = FALSE)\n",
+    "    dat_for_plot <- rbind(dat_for_plot, temp_df)\n",
+    "    \n",
+    "  }\n",
+    "  \n",
+    "}\n",
+    "\n",
+    "\n",
+    "# Keep only the WW, MW, WM, MM categories and attach each one's benchmark base rate - to plot percent citations relative to benchmarks\n",
+    "dat_for_baserate_plot <- subset(dat_for_plot, gender_category %in% c('WW', 'MW', 'WM', 'MM'))\n",
+    "dat_for_baserate_plot$baserate <- unname(c(WW = 6.7, MW = 9.4, WM = 25.5, MM = 58.4)[as.character(dat_for_baserate_plot$gender_category)])  # look up by category name: rows follow the order in which categories first appear in the data, and some may be absent\n",
+    "dat_for_baserate_plot$citation_rel_to_baserate <- dat_for_baserate_plot$percentage - dat_for_baserate_plot$baserate\n",
+    "\n",
+    "\n",
+    "# Plot the Histogram of Number of Papers per category against predicted gender category #\n",
+    "\n",
+    "library(ggplot2)\n",
+    "\n",
+    "# Drop the NA placeholder row and the blank (first-author) category by value rather than by position, so that no real category is lost\n",
+    "dat_for_plot <- dat_for_plot[!is.na(dat_for_plot$gender_category) & dat_for_plot$gender_category != '', ]\n",
+    "dat_for_plot$gender_category <- factor(dat_for_plot$gender_category, levels = dat_for_plot$gender_category)\n",
+    "ggplot(dat_for_plot, aes(x = gender_category, y = number, fill = gender_category)) +\n",
+    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) + \n",
+    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM', 'UW', 'UM', 'WU', 'MU', 'UU')) +\n",
+    "  geom_text(aes(label = number), vjust = -0.3, color = 'black', size = 2.5) +\n",
+    "  theme_classic(base_size = 15) + theme(legend.position = 'right') +\n",
+    "  xlab('Predicted gender category') + ylab('Number of papers') + ggtitle(\"\")\n",
+    "\n",
+    "\n",
+    "# Second histogram: each category's citation percentage minus its benchmark base rate, by predicted gender category\n",
+    "ggplot(data = dat_for_baserate_plot,\n",
+    "       mapping = aes(x = gender_category, y = citation_rel_to_baserate, fill = gender_category)) +\n",
+    "  geom_bar(stat = 'identity', width = 0.75, na.rm = TRUE, show.legend = TRUE) +\n",
+    "  geom_text(aes(label = round(citation_rel_to_baserate, digits = 2)), vjust = -0.3, color = 'black', size = 2.5) +\n",
+    "  scale_x_discrete(limits = c('WW', 'MW', 'WM', 'MM')) +\n",
+    "  labs(x = 'Predicted gender category', y = '% of citations relative to benchmarks') + ggtitle(\"\") + theme_classic(base_size = 15)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {