Skip to content

Commit

Permalink
Documentation
Browse files Browse the repository at this point in the history
Added documentation for the jupyter notebooks I worked on
  • Loading branch information
kmn5409 committed Aug 9, 2018
1 parent 876dffd commit 1b1b849
Show file tree
Hide file tree
Showing 6 changed files with 422 additions and 1,201 deletions.
209 changes: 134 additions & 75 deletions notebooks/PiperMail.ipynb
Expand Up @@ -3,14 +3,14 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import augur\n",
"from augur.piper_reader import PiperMail\n",
"from sqlalchemy.ext.declarative import declarative_base\n",
"# import everything from githubapi.py and ghtorrent.py so we can\n",
"# just copy and paste our function later\n",
"import json\n",
"import pandas as pd\n",
"from perceval.backends.core.pipermail import Pipermail,PipermailList\n",
Expand All @@ -24,6 +24,14 @@
"from dateutil.parser import parse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load:\n",
    "First loads the user's data from 'augur.config.json' to obtain the database information (e.g. the name of the database, the port of the database). Then connects to the database using augur.App.ghtorrentplus(), loads the piper_reader, and loads the path to the list of mailing lists, 'runtime/mailing_lists.csv'."
]
},
{
"cell_type": "code",
"execution_count": 2,
Expand All @@ -40,14 +48,12 @@
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"#connect.db.execute(\"\"\"DROP TABLE mailing_list_jobs\"\"\")"
"## Connect:\n",
"\n",
    "Queries which tables are in the database and determines whether 'mail_lists' is there: if it is, 'mail_lists' is set to 'True'; if it isn't, 'mail_lists' is set to 'False'. Then we determine which mailing lists are in 'mailing_list_jobs' and how many rows it contains. If 'mailing_list_jobs' is not in the database, it is created and the column 'augurlistID' is set as the primary key. We then add a connection to 'mailing_list_jobs' so that we can update the column 'last_message_date' if new messages were downloaded for a mailing list."
]
},
{
Expand Down Expand Up @@ -107,6 +113,15 @@
"res = session.query(table).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sifting through messages:\n",
"\n",
    "The function 'write_message' is used to fetch the messages from the downloaded MBoxes. The messages fetched are either only those newer than the last message downloaded on a previous run of the jupyter notebook, or, if it is the first run, all of the messages. The function 'write_message' then determines which message contains the full message thread by checking whether the 'ID' of the message is stored in 'References'. Only if the next message downloaded does not reference the previous message is the message added to a dictionary, and once a certain number of messages are stored they are added to the table 'mail_lists' using piper_reader."
]
},
{
"cell_type": "code",
"execution_count": 5,
Expand All @@ -120,72 +135,43 @@
" store = None\n",
" k = 0\n",
" di = {}\n",
" #print(\"HEREEEE\")\n",
" for message in repo.fetch(from_date=time):\n",
" #print(message,\"\\n\\n\\n\\n\\n\\n\\n\\n\")\n",
" #print(message['data']['Message-ID'])\n",
" if(type_archive == 'not_new'):\n",
" mess_check = Piper.convert_date(message['data']['Date'])\n",
" #mess_check = Piper.convert_date(\"Thu, 24 Mar 2019 20:37:11 +0000\")\n",
" #print(time)\n",
" if(type_archive == 'not_new' and mess_check <= time ):\n",
" print(\"Right here\")\n",
" continue \n",
" elif(type_archive == 'not_new' and mess_check > time):\n",
" mail_check[pos] = 'update'\n",
" \n",
" ID = message['data']['Message-ID']\n",
" try:\n",
" message['data']['References']\n",
" '''if(message['data']['Message-ID'] == '<CAFHD1sO814do11F9cKVZgr5fo+dw5q-VmfrYO_Q9vv6kXe8NjA@mail.gmail.com>'):\n",
" print(thread)\n",
" print(store)''' \n",
" message['data']['References'] \n",
" if((not thread == None) and (thread['data']['Message-ID'] not in message['data']['References'])):\n",
" #bj = json.dumps(thread, indent=4, sort_keys=True)\n",
" di[k] = thread\n",
" #utfile.write(obj)\n",
" #utfile.write('\\n')\n",
" store = None\n",
" k+=1\n",
" print(\"why\")\n",
" \n",
" elif( (not store == None) and (store['data']['Message-ID'] not in message['data']['References'])):\n",
" #print(message['data']['References'])\n",
" di[k] = store\n",
" #bj = json.dumps(store, indent=4, sort_keys=True)\n",
" #utfile.write(obj)\n",
" #utfile.write('\\n')\n",
" store = None\n",
" print(\"yep\")\n",
" k+=1\n",
" thread = message\n",
" except:\n",
" #print(\"got'em\")\n",
" if(not thread == None):\n",
" di[k] = thread\n",
" #bj = json.dumps(thread, indent=4, sort_keys=True)\n",
" #utfile.write(obj)\n",
" #utfile.write('\\n')\n",
" thread = None\n",
" print(\"got-em\")\n",
" k+=1\n",
" elif(not store == None):\n",
" di[k] = store\n",
" #bj = json.dumps(store, indent=4, sort_keys=True)\n",
" #utfile.write(obj)\n",
" #utfile.write('\\n')\n",
" store = None\n",
" print(\"getting\") \n",
" k+=1\n",
" store = message\n",
" if(len(di) == 5000):\n",
" numb,mail_lists = Piper.make(connect.db,mail_check,archives,mail_lists,res,session,di,numb)\n",
" di = {}\n",
" k = 0\n",
" #print(\"!\"*50,\"NEW MESSAGE\",\"!\"*50)\n",
" if(len(di) < 5000 and len(di) > 0):\n",
" print(len(di))\n",
" #print(di)\n",
" numb,mail_lists = Piper.make(connect.db,mail_check,archives,mail_lists,res,session,di,numb)\n",
" di = {}\n",
" k = 0\n",
Expand All @@ -196,44 +182,130 @@
" good = 1\n",
" elif( (thread == None) and (not store == None) ):\n",
" di[k] = store\n",
" #obj = json.dumps(store, indent=4, sort_keys=True)\n",
" #outfile.write(obj)\n",
" #outfile.write('\\n')\n",
" elif( (store == None) and (not thread == None)):\n",
" di[k] = thread\n",
" #obj = json.dumps(thread, indent=4, sort_keys=True)\n",
" #outfile.write(obj)\n",
" #outfile.write('\\n')\n",
" elif(store['data']['Message-ID'] in thread['data']['References']):\n",
" di[k] = thread\n",
" #obj = json.dumps(thread, indent=4, sort_keys=True)\n",
" #outfile.write(obj)\n",
" #outfile.write('\\n')\n",
" else:\n",
" di[k] = store\n",
" #obj = json.dumps(store, indent=4, sort_keys=True)\n",
" #outfile.write(obj)\n",
" #outfile.write('\\n') \n",
" #outfile.close()\n",
" if(bool(di)):\n",
" numb,mail_lists = Piper.make(connect.db,mail_check,archives,mail_lists,res,session,di,numb)\n",
" return numb,mail_lists\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Iteration through mailing lists:\n",
"\n",
    "Determines whether the file with the mailing lists has been created; if not, it writes a set of default mailing lists (to show how the program would work). The mailing lists are then loaded into a dataframe, and we iterate through the mailing lists by grouping them by their links. We then check whether each mailing list is in the 'mailing_list_jobs' table in the SQL database: if so, we assign 'not_new' to 'mail_check' and store the last message date recorded in 'mailing_list_jobs' in 'time'. If the mailing list is not in 'mailing_list_jobs', we assign 'new' to 'mail_check' and set 'time' to 'None'."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"collapsed": true
"scrolled": false
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-08-01 16:50:02 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Looking for messages from 'https://lists.opendaylight.org/pipermail/aalldp-dev/' since 1970-01-01 00:00:00+00:00\n",
"2018-08-01 16:50:02 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Downloading mboxes from 'https://lists.opendaylight.org/pipermail/aalldp-dev/' to since 1970-01-01 00:00:00+00:00\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"runtime/mailing_lists.csv Place\n",
"yeah\n",
"Link,mail_list\n",
"\n",
"https://lists.opendaylight.org/pipermail/,\"aalldp-dev\"\n",
"\n",
"['aalldp-dev', 'archetypes-dev'] mail_list\n",
"{'aalldp-dev': False, 'archetypes-dev': False}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-08-01 16:50:07 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO 2/2 MBoxes downloaded\n",
"2018-08-01 16:50:07 keanu-Inspiron-5567 perceval.backends.core.mbox[18033] INFO Done. 6/6 messages fetched; 0 ignored\n",
"2018-08-01 16:50:07 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Fetch process completed\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"getting\n",
"getting\n",
"yep\n",
"got-em\n",
"4\n",
"['aalldp-dev']\n",
"2018-07-06 18:39:58\n",
"File uploaded 13\n",
"Mailing List Job uploaded\n",
"Finished\n",
"['aalldp-dev']\n",
"File uploaded 4\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-08-01 16:50:31 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Looking for messages from 'https://lists.opendaylight.org/pipermail/archetypes-dev/' since 1970-01-01 00:00:00+00:00\n",
"2018-08-01 16:50:31 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Downloading mboxes from 'https://lists.opendaylight.org/pipermail/archetypes-dev/' to since 1970-01-01 00:00:00+00:00\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mailing List Job uploaded\n",
"Finished\n",
"Created File aalldp-dev\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2018-08-01 16:50:33 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO 1/1 MBoxes downloaded\n",
"2018-08-01 16:50:33 keanu-Inspiron-5567 perceval.backends.core.mbox[18033] INFO Done. 2/2 messages fetched; 0 ignored\n",
"2018-08-01 16:50:33 keanu-Inspiron-5567 perceval.backends.core.pipermail[18033] INFO Fetch process completed\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"getting\n",
"1\n",
"['archetypes-dev']\n",
"File uploaded 4\n",
"Mailing List Job uploaded\n",
"Finished\n",
"['archetypes-dev']\n",
"File uploaded 4\n",
"Mailing List Job uploaded\n",
"Finished\n",
"Created File archetypes-dev\n",
"{'aalldp-dev': 'new', 'archetypes-dev': 'new'}\n",
"Finished downloading files\n"
]
}
],
"source": [
"# create an Augur application so we can test our function\n",
"if \"notebooks\" in os.getcwd():\n",
" os.chdir(\"..\")\n",
"Piper, path= augurApp.piper()\n",
"print(path,\"Place\")\n",
"if(not os.path.exists(path)):\n",
" file = open(path, \"w+\")\n",
"else:\n",
Expand All @@ -253,36 +325,24 @@
" if(count == 2):\n",
" break\n",
"if(count == 2):\n",
" #print(pd.read_csv(path))\n",
" file.close()\n",
" df = pd.read_csv(path)\n",
" groups = df.groupby('Link').groups\n",
" for group in groups:\n",
" link = group\n",
" mail_list = (df.loc[df['Link'] == group]['mail_list']).tolist()\n",
" print(mail_list,\"mail_list\") \n",
" #link = \"https://lists.opendaylight.org/pipermail/\"\n",
" #mail = [\"aalldp-dev\",\"alto-dev\",\"archetypes-dev\"]\n",
" #mail = [\"aalldp-dev\",\"alto-dev\",\"archetypes-dev\",\"dev\"]\n",
" #mail = [\"aalldp-dev\",\"archetypes-dev\",\"alto-dev\"]\n",
" #mail = [\"aalldp-dev\",\"archetypes-dev\"]\n",
" mail_check = {key:False for key in mail_list}\n",
" print(mail_check)\n",
" #print(os.getcwd())\n",
" file = \"mail_list\"\n",
" path = \"/augur/data/archive-\" \n",
" #numb = 0\n",
" for x in range(len(mail_list)):\n",
" #print(link+mail[x])\n",
" if(mail_list[x] not in df1['project'].values ):\n",
" mail_check[mail_list[x]] = 'new'\n",
" #print(os.getcwd())\n",
" #print(os.path.join(os.getcwd() + path+'.json'))\n",
" place = os.path.join(os.getcwd() + path + mail_list[x] +'.json') \n",
" repo = Pipermail(url = link+ mail_list[x] + \"/\",dirpath=\"tmp/archives_\"+mail_list[x])\n",
" #print(\"Broken\")\n",
" #break\n",
" #print(repo)\n",
" outfile = open(place,\"w+\")\n",
" numb,mail_lists = write_message(repo,'new',mail_check,mail_list[x],connect.db,res,session,\\\n",
" [mail_list[x]],numb,mail_lists)\n",
" print(\"Created File\",mail_list[x])\n",
Expand All @@ -294,7 +354,6 @@
" time = time.astype(object)\n",
" place = os.path.join(os.getcwd() + path + 'temp_' + mail_list[x] +'.json') \n",
" repo = Pipermail(url = link+ mail_list[x] + \"/\",dirpath=\"tmp/archives_\"+mail_list[x])\n",
" outfile = open(place,\"w+\")\n",
" print(time[0])\n",
" print(type(time[0]))\n",
" numb,mail_lists = write_message(repo,'not_new',mail_check,mail_list[x],connect.db,\\\n",
Expand Down

0 comments on commit 1b1b849

Please sign in to comment.