Commit

Spark DataFrame example
stevelowenthal committed Oct 7, 2015
1 parent fa11543 commit b80955c
Showing 3 changed files with 325 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
.ipynb_checkpoints/
6 changes: 3 additions & 3 deletions jupyter/Collect data as lists.ipynb
@@ -268,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 63,
"metadata": {
"collapsed": false
},
@@ -279,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 64,
"metadata": {
"collapsed": false
},
@@ -290,7 +290,7 @@
"<table><tr><th>album_title</th><th>album_year</th><th>album_genre</th><th>performer</th><th>tracks</th><tr><tr><td>Duos For Violin and Cello</td><td>2000</td><td>Classical</td><td>Nigel Kennedy</td><td>[Sonata for Violin and Cello - Allegro, Sonata for Violin and Cello - Tres vif, Sonata for Violin and Cello - Lent, Sonata for Violin and Cello - Vif, avec entrain, Passacaglia, Duo for Violin and Cello Op. 7 - Allegro serioso, non troppo, Duo for Violin and Cello Op. 7 - Adagio-Andante-Tempo I, Duo for Violin and Cello Op. 7 - Maestoso e largamente, ma non troppo lento-Presto, Two-Part Intervention No. 6 in E]</td></tr><tr><td>Golden Boy Elvis</td><td>1981</td><td>Rock</td><td>Elvis Presley</td><td>[She's Not You, Return To Sender, Joshua Fit The Battle, Don't, Tutti Frutti, It's Now Or Never, Surrender, Do The Clam, Kiss Me Quick, Such A Night, Blueberry Hill, Don't Be Cruel, Heartbreak Hotel, Fun In Acapulco, Hound Dog, Wooden Heart]</td></tr><tr><td>Home</td><td>1995</td><td>Rock</td><td>Blessid Union of Souls</td><td>[I Believe, Let Me Be The One, All Along, Oh Virginia, Nora, Would You Be There, Home, End Of The World, Heaven, Forever For Tonight, Lucky To Be Here]</td></tr><tr><td>Little Sweet Delirium</td><td>1996</td><td>Unknown</td><td>Creeker</td><td>[Chase, Revolution, 829, Reaction, Hands Like Wings, God Shaped Hole, Fall, Point of You]</td></tr><tr><td>Merry Christmas From Elvis Presley, A</td><td>1982</td><td>Rock</td><td>Elvis Presley</td><td>[O Come All Ye Faithful, The First Noel, On A Snowy Christmas Night, Winter Wonderland, The Wonderful World Of Christmas, It Won't Seem Like Christmas, I'll Be Home On Christmas Day, If I Get Home On Christmas Day, Holly Leaves And Christmas Trees, Merry Christmas Baby, Silver Bells]</td></tr></table>"
]
},
"execution_count": 58,
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
321 changes: 321 additions & 0 deletions jupyter/Dataframes Scala.ipynb
@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%cql select * from music.tracks_by_album limit 5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a SQL Context"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"val sqlContext = new org.apache.spark.sql.SQLContext(sc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a dataframe on a cassandra table"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"val df = sqlContext.read.format(\"org.apache.spark.sql.cassandra\").options(Map(\"keyspace\"->\"music\", \"table\" -> \"tracks_by_album\")).load()\t"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Explain the query plan and view some data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.printSchema"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.explain"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.show"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.select(\"album_year\").distinct.show"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"df.groupBy(\"album_year\").count().show"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Group By Decade\n",
"You can use various spark sql functions. Let's use *floor*."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import org.apache.spark.sql.functions._"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------------------+-----+\n",
"|(FLOOR((album_year / 10)) * 10)|count|\n",
"+-------------------------------+-----+\n",
"| 2000.0| 9497|\n",
"| 1950.0| 143|\n",
"| 1960.0| 1616|\n",
"| 1970.0| 4346|\n",
"| 1980.0| 6390|\n",
"| 1990.0|14759|\n",
"+-------------------------------+-----+\n",
"\n"
]
}
],
"source": [
"df.groupBy(floor(df(\"album_year\") / 10) * 10).count.show\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Clean it up"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----+\n",
"|decade|count|\n",
"+------+-----+\n",
"| 1950| 143|\n",
"| 1960| 1616|\n",
"| 1970| 4346|\n",
"| 1980| 6390|\n",
"| 1990|14759|\n",
"| 2000| 9497|\n",
"+------+-----+\n",
"\n"
]
}
],
"source": [
"val tmp = df.groupBy((floor(df(\"album_year\") / 10) * 10).cast(\"int\").alias(\"decade\")).count\n",
"tmp.show"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------+-----------+\n",
"|decade|album_count|\n",
"+------+-----------+\n",
"| 1950| 143|\n",
"| 1960| 1616|\n",
"| 1970| 4346|\n",
"| 1980| 6390|\n",
"| 1990| 14759|\n",
"| 2000| 9497|\n",
"+------+-----------+\n",
"\n"
]
}
],
"source": [
"val count_by_decade = tmp.select(tmp(\"decade\"), tmp(\"count\").as(\"album_count\"))\n",
"count_by_decade.show"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save to a new table"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table><tr></tr></table>"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%cql create table if not exists steve.albums_by_decade (decade int primary key, album_count int)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"count_by_decade.write.format(\"org.apache.spark.sql.cassandra\").options(Map( \"table\" -> \"albums_by_decade\", \"keyspace\" -> \"steve\")).save()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check on it"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<table><tr><th>decade</th><th>album_count</th></tr><tr><td>1960</td><td>1616</td></tr><tr><td>1950</td><td>143</td></tr><tr><td>1990</td><td>14759</td></tr><tr><td>2000</td><td>9497</td></tr><tr><td>1970</td><td>4346</td></tr><tr><td>1980</td><td>6390</td></tr></table>"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%cql select * from steve.albums_by_decade"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Spark 1.2.1 (Scala 2.10.4)",
"language": "scala",
"name": "spark"
},
"language_info": {
"name": "scala"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
