Commit

Spark DataFrame example
stevelowenthal committed Oct 7, 2015
1 parent fa11543 commit b80955c
Showing 3 changed files with 325 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.ipynb_checkpoints/
6 changes: 3 additions & 3 deletions jupyter/Collect data as lists.ipynb
@@ -268,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 63,
"metadata": {
"collapsed": false
},
@@ -279,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 64,
"metadata": {
"collapsed": false
},
@@ -290,7 +290,7 @@
"<table><tr><th>album_title</th><th>album_year</th><th>album_genre</th><th>performer</th><th>tracks</th><tr><tr><td>Duos For Violin and Cello</td><td>2000</td><td>Classical</td><td>Nigel Kennedy</td><td>[Sonata for Violin and Cello - Allegro, Sonata for Violin and Cello - Tres vif, Sonata for Violin and Cello - Lent, Sonata for Violin and Cello - Vif, avec entrain, Passacaglia, Duo for Violin and Cello Op. 7 - Allegro serioso, non troppo, Duo for Violin and Cello Op. 7 - Adagio-Andante-Tempo I, Duo for Violin and Cello Op. 7 - Maestoso e largamente, ma non troppo lento-Presto, Two-Part Intervention No. 6 in E]</td></tr><tr><td>Golden Boy Elvis</td><td>1981</td><td>Rock</td><td>Elvis Presley</td><td>[She's Not You, Return To Sender, Joshua Fit The Battle, Don't, Tutti Frutti, It's Now Or Never, Surrender, Do The Clam, Kiss Me Quick, Such A Night, Blueberry Hill, Don't Be Cruel, Heartbreak Hotel, Fun In Acapulco, Hound Dog, Wooden Heart]</td></tr><tr><td>Home</td><td>1995</td><td>Rock</td><td>Blessid Union of Souls</td><td>[I Believe, Let Me Be The One, All Along, Oh Virginia, Nora, Would You Be There, Home, End Of The World, Heaven, Forever For Tonight, Lucky To Be Here]</td></tr><tr><td>Little Sweet Delirium</td><td>1996</td><td>Unknown</td><td>Creeker</td><td>[Chase, Revolution, 829, Reaction, Hands Like Wings, God Shaped Hole, Fall, Point of You]</td></tr><tr><td>Merry Christmas From Elvis Presley, A</td><td>1982</td><td>Rock</td><td>Elvis Presley</td><td>[O Come All Ye Faithful, The First Noel, On A Snowy Christmas Night, Winter Wonderland, The Wonderful World Of Christmas, It Won't Seem Like Christmas, I'll Be Home On Christmas Day, If I Get Home On Christmas Day, Holly Leaves And Christmas Trees, Merry Christmas Baby, Silver Bells]</td></tr></table>"
]
},
"execution_count": 58,
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
321 changes: 321 additions & 0 deletions jupyter/Dataframes Scala.ipynb
@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%cql select * from music.tracks_by_album limit 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a SQL Context"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"val sqlContext = new org.apache.spark.sql.SQLContext(sc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a dataframe on a cassandra table"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"val df = sqlContext.read.format(\"org.apache.spark.sql.cassandra\").options(Map(\"keyspace\"->\"music\", \"table\" -> \"tracks_by_album\")).load()\t"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Explain the query plan and view some data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.printSchema"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.explain"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.show"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.select(\"album_year\").distinct.show"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"df.groupBy(\"album_year\").count().show"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Group By Decade\n",
"You can use various spark sql functions. Let's use *floor*."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import org.apache.spark.sql.functions._"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------------------+-----+\n",
"|(FLOOR((album_year / 10)) * 10)|count|\n",
"+-------------------------------+-----+\n",
"| 2000.0| 9497|\n",
"| 1950.0| 143|\n",
"| 1960.0| 1616|\n",
"| 1970.0| 4346|\n",
"| 1980.0| 6390|\n",
"| 1990.0|14759|\n",
"+-------------------------------+-----+\n",
"\n"
]
}
],
"source": [
"df.groupBy(floor(df(\"album_year\") / 10) * 10).count.show\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Clean it up"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+\n",
"|decade|count|\n",
"+------+-----+\n",
"| 1950| 143|\n",
"| 1960| 1616|\n",
"| 1970| 4346|\n",
"| 1980| 6390|\n",
"| 1990|14759|\n",
"| 2000| 9497|\n",
"+------+-----+\n",
"\n"
]
}
],
"source": [
"val tmp = df.groupBy((floor(df(\"album_year\") / 10) * 10).cast(\"int\").alias(\"decade\")).count\n",
"tmp.show"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----------+\n",
"|decade|album_count|\n",
"+------+-----------+\n",
"| 1950| 143|\n",
"| 1960| 1616|\n",
"| 1970| 4346|\n",
"| 1980| 6390|\n",
"| 1990| 14759|\n",
"| 2000| 9497|\n",
"+------+-----------+\n",
"\n"
]
}
],
"source": [
"val count_by_decade = tmp.select(tmp(\"decade\"), tmp(\"count\").as(\"album_count\"))\n",
"count_by_decade.show"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save to a new table"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table><tr></tr></table>"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%cql create table if not exists steve.albums_by_decade (decade int primary key, album_count int)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"count_by_decade.write.format(\"org.apache.spark.sql.cassandra\").options(Map( \"table\" -> \"albums_by_decade\", \"keyspace\" -> \"steve\")).save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check on it"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table><tr><th>decade</th><th>album_count</th></tr><tr><td>1960</td><td>1616</td></tr><tr><td>1950</td><td>143</td></tr><tr><td>1990</td><td>14759</td></tr><tr><td>2000</td><td>9497</td></tr><tr><td>1970</td><td>4346</td></tr><tr><td>1980</td><td>6390</td></tr></table>"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%cql select * from steve.albums_by_decade"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Spark 1.2.1 (Scala 2.10.4)",
"language": "scala",
"name": "spark"
},
"language_info": {
"name": "scala"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
