Skip to content

duartejr/uber_rides_dataset_analysis

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

14 Commits
 
 
 
 
 
 

Repository files navigation

Uber dataset analysis

This project's goal is to analyze data about Uber rides while using various data visualization frameworks that are available for Python. Data on Uber rides in New York from April to September of 2014 are included in the dataset. The information was gathered by FiveThirtyEight and is accessible on Kaggle.

The variables in the dataset are:

name description
Date/Time The date and hour of Uber pickup
Lat The Latitude of the Uber pickup
Lon The longitude of the Uber pickup
Base The TLC (Taxi & Limousine Commission) base company code affiliated with the Uber pickup

Loading the data

import pandas as pd 

# One CSV per month, listed in calendar order (April through September).
monthly_files = (
    './data/uber-raw-data-apr14.csv',  # pickups in April
    './data/uber-raw-data-may14.csv',  # pickups in May
    './data/uber-raw-data-jun14.csv',  # pickups in June
    './data/uber-raw-data-jul14.csv',  # pickups in July
    './data/uber-raw-data-aug14.csv',  # pickups in August
    './data/uber-raw-data-sep14.csv',  # pickups in September
)

# Read every monthly subset and stack them into one frame. The per-month
# frames only live inside the list, so nothing has to be deleted afterwards.
data = pd.concat([pd.read_csv(path) for path in monthly_files])

# Parse the pickup timestamp once (string -> datetime) ...
pickup_time = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')
data['Date/Time'] = pickup_time

# ... then expose each calendar/clock component as its own column.
for part in ('day', 'month', 'year', 'day_of_week', 'hour', 'minute', 'second'):
    data[part] = getattr(pickup_time.dt, part)

data.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Date/Time Lat Lon Base day month year day_of_week hour minute second
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 4 2014 1 0 11 0
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 4 2014 1 0 17 0
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 4 2014 1 0 21 0
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 4 2014 1 0 28 0
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 4 2014 1 0 33 0
# Lookup tables that turn the numeric codes stored in the frame into labels.
# Weekday codes come from Series.dt.day_of_week, where 0 is Monday.
days_names = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))
# Only months 4 (April) through 9 (September) are mapped.
months_names = dict(zip(
    range(4, 10),
    ['April', 'May', 'June', 'July', 'August', 'September'],
))

Trips by the hours in a day

# Count pickups per hour of day: the index is the hour, the single column is 'Total'.
hour_data = data.groupby('hour')['hour'].count().to_frame().rename(columns={'hour':'Total'})
import matplotlib.pyplot as plt

# Work on the 'Total' column as a Series so every lookup below is a scalar;
# hour_data.loc[h] would hand matplotlib a one-element Series as a coordinate.
hourly_total = hour_data['Total']
split_hour = 15  # hour that separates the two groups compared in the chart

# Derive the annotated figures from the data instead of hard-coding them.
# Hour `split_hour` is counted in the late group only, so the two groups
# add up to the overall number of trips.
late_trips = int(hourly_total[hourly_total.index >= split_hour].sum())
early_trips = int(hourly_total[hourly_total.index < split_hour].sum())
split_trips = int(hourly_total.loc[split_hour])
peak_hour = int(hourly_total.idxmax())
peak_trips = int(hourly_total.max())

fig, ax = plt.subplots(figsize=(11, 6), dpi=100)
ax.plot(hour_data, color='#09091a')
ax.set_xlim(0, 23)
ax.set_xticks(range(24))
ax.set_ylim(0, peak_trips + 5000)  # leave head-room above the peak for the label
ax.spines[['top', 'right']].set_visible(False)
ax.set_xlabel('Hour', fontsize=14, color='#222233')
plt.suptitle('Number of trips during the day', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('Most trips in 2014 took place after 15:00',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Vertical guide at the split hour, with one summary text on each side of it.
ax.vlines(split_hour, ymin=0, ymax=peak_trips, color='#c0c0c8')
ax.text(split_hour + 0.3, split_trips / 1.2,
        f'{late_trips:,} trips \nwere recorded between\n {split_hour:02d}:00 and 23:00 hours.',
        color='#222233')
ax.text(split_hour - 0.2, split_trips / 3,
        f'{early_trips:,} trips \nwere recorded between\n 00:00 and {split_hour:02d}:00.',
        horizontalalignment='right', color='#222233')

# Mark and label the busiest hour.
ax.plot(peak_hour, peak_trips, 'o', color='#222233')
ax.text(peak_hour, peak_trips * 1.02,
        f'Pickups peak at {peak_hour:02d}:00 with {peak_trips:,} trips.')
plt.show()

png

Trips by hour and month

# Pickup counts with one row per hour and one column per month; the month
# numbers in the column header are swapped for their names in the same step.
data_hour_month = pd.crosstab(data['hour'], data['month']).rename(columns=months_names)
data_hour_month
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
month April May June July August September
hour
0 11910 13875 14514 17953 21451 24133
1 7769 8186 9167 11527 14471 16107
2 4935 5372 6189 8562 10105 10702
3 5040 5946 6937 9199 10376 10789
4 6095 6945 7701 10040 11774 12675
5 9476 10789 11955 14932 16525 20262
6 18498 21015 22030 23456 24907 33307
7 24924 27413 30834 32545 34064 43314
8 22843 25460 29771 33387 34566 44477
9 17939 20507 24298 28486 30195 38542
10 17865 20801 23584 28558 30706 37634
11 18774 22055 24155 30120 31778 38821
12 19425 23595 25233 30900 32106 39193
13 22603 27699 28937 35832 35764 45042
14 27190 34363 34428 41357 40644 52643
15 35324 43087 41586 46053 48197 61219
16 42003 49127 48162 52403 53481 68224
17 45475 51508 50452 58260 57122 73373
18 43003 48965 45013 57268 55390 75040
19 38923 42387 38203 52332 53008 69660
20 36244 40731 40108 51859 51674 63988
21 36964 42217 40791 49528 51354 60606
22 30645 35556 35614 42218 46008 51817
23 20649 24836 24182 29346 33609 36568
# Stacked bars: one bar per hour, one segment per month. The first five
# columns are drawn in greys and the last one in the accent colour.
month_colors = ['#d9d9d9', '#999999', '#747474', '#5d5d5d', '#3f3f3f', '#1fbad6']

fig, ax = plt.subplots(figsize=(11, 6), dpi=100)
data_hour_month.plot.bar(stacked=True, color=month_colors, ax=ax)

# Drop the top and right frame lines.
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_xlabel('Hour', fontsize=14, color='#222233')
ax.legend(title='Month')
fig.suptitle('Number of trips by month and hour', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('In the month of September, more trips were registered',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Note placed at bar position 17, at the height of hour 17's overall total.
ax.text(17, hour_data.loc[17], 'Peak hours are \nthe same every month.',
        horizontalalignment='right', color='#222233')
plt.show()

png

Trips by day and month

# Trips per day of the month (1-31), pooled over all months, and their mean.
daily_trips = data.day.value_counts()
trips_avg = round(daily_trips.mean(), 0)
days_above_avg = daily_trips[daily_trips > trips_avg].to_frame().sort_index()

# One colour per day of the month, in ascending day order:
#   day 30 -> accent colour, day 31 -> lightest grey,
#   other days above the average -> dark grey, the rest -> mid grey.
palette = []
for day in range(1, 32):
    if day == 30:
        palette.append('#1fbad6')
    elif day == 31:
        palette.append('#d9d9d9')
    elif day in days_above_avg.index:
        palette.append('#3f3f3f')
    else:
        palette.append('#999999')
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(10, 6),
            'axes.facecolor':'white',
            'figure.facecolor':'white'})
ax = sns.countplot(data=data, x='day', palette=palette)
ax.set_xlabel('Day', fontsize=12)  # the x axis holds days of the month
ax.set_ylabel('')
plt.suptitle('Number of trips by day and month', fontsize=18, color='#09091a',
             x=0.123, y=1.05, ha='left')
ax.set_title('17 out of 31 days are above average trips.',
             fontsize=14, loc='left', color='#1fbad6', y=1.1, ha='left')

# Dashed reference line at the average, labelled to the right of the last bar.
ax.hlines(trips_avg, xmin=-0.5, xmax=31, ls='--', colors='k')
ax.text(31, trips_avg, f"Average = {int(trips_avg)}", va='center')

# Bar positions are zero-based: position 30 is day 31 and position 29 is day 30.
ax.text(30, daily_trips.loc[31], f"{daily_trips.loc[31]} trips")
ax.text(29, daily_trips.loc[30], f"{daily_trips.loc[30]} trips", color='#1fbad6', weight='bold')
ax;

png

Trips by week day and month

# Work on a copy in which the month and weekday codes are replaced by names,
# so the chart's axis and legend show readable labels.
data2 = data.copy()
data2 = data2.replace({'month': months_names, 'day_of_week': days_names})
import plotly.express as px

palette = ['#0d47a1', '#1565c0', '#1976d2', '#1e88e5', '#2196f3', '#42a5f5', '#64b5f6', '#90caf9']

# Take the weekday order from the same mapping used for the replacement above
# (Monday first), so the legend order cannot drift from the actual labels.
weekday_order = list(days_names.values())

px.histogram(data2, x='month', color='day_of_week', barmode='group',
             labels = {'month':'Months', 'day_of_week':'Day of week'},
             title = 'Trips by week day and month',
             color_discrete_sequence = palette,
             category_orders = {'day_of_week': weekday_order}
             ).update_layout(yaxis_title = '',
                             plot_bgcolor = 'rgb(255, 255, 255)')

Trips by month

from plotnine import ggplot
from plotnine import *
import plotnine as p9

# Number of pickups per month as a two-column frame: 'month' and 'Total'.
trips = data.groupby('month').size().reset_index(name='Total')
# Every month bar gets the same fill colour.
palette = ('#2d9dff',) * 6

p9.options.figure_size = (10, 6)

# Horizontal bar chart: months on the (flipped) x axis, totals as bar length,
# each bar labelled with its total.
(
    ggplot(trips)
    + aes(x='month', y='Total', fill='factor(month)')
    + geom_col()
    + coord_flip()
    + geom_text(aes(label='Total'), ha='right')
    + labs(y='Trips', x='Months', title='Trips by month')
    + theme_minimal()
    + theme(legend_position='none')
    + scale_x_continuous(
        breaks=list(range(4, 10)),
        labels=['April', 'May', 'June', 'July', 'August', 'September'],
    )
    + scale_fill_manual(values=palette)
)

png

<ggplot: (167300155338)>

Trips by Base

# Number of pickups per base as a two-column frame: 'Base' and 'Total'.
base_trips = data.groupby('Base').size().reset_index(name='Total')
base_trips
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Base Total
0 B02512 205673
1 B02598 1393113
2 B02617 1458853
3 B02682 1212789
4 B02764 263899
import altair as alt

# Horizontal bars: total on x, base on y.
bars = (
    alt.Chart(base_trips, title='Trips by Base')
    .mark_bar()
    .encode(x='Total', y="Base")
)

# Same encoding as the bars, drawn as white text nudged 3px left of the bar end.
text = bars.mark_text(align='right', baseline='middle', dx=-3, color='#ffffff').encode(text='Total')

# Layer the labels over the bars.
alt.layer(bars, text).properties(height=200)

Trips by base and month

# Pickup counts with one row per base and one column per month; the month
# numbers in the column header are swapped for their names in the same step.
month_base_trips = pd.crosstab(data['Base'], data['month']).rename(columns=months_names)
month_base_trips
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
month April May June July August September
Base
B02512 35536 36765 32509 35021 31472 34370
B02598 183263 260549 242975 245597 220129 240600
B02617 108001 122734 184460 310160 355803 377695
B02682 227808 222883 194926 196754 173280 197138
B02764 9908 9504 8974 8589 48591 178333
from bokeh.io import show
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure

# Flatten the base x month table into parallel lists: a (base, month) factor
# for every cell, walked row by row, and the matching count.
x = []
counts = []
for base_code in month_base_trips.index.values[:]:
    for month_name in month_base_trips.columns:
        x.append((base_code, month_name))
        counts.append(month_base_trips.loc[base_code, month_name])

source = ColumnDataSource(data={'x': x, 'counts': counts})

# Nested categorical x axis built from the (base, month) factors.
p = figure(
    x_range=FactorRange(*x),
    plot_height=350,
    title="Trips by base and month",
    toolbar_location=None,
    tools="",
)
p.vbar(x='x', top='counts', width=0.9, source=source)

# Axis cosmetics: bars start at zero, padded x range, slanted labels, no x grid.
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
show(p)

alt text

Trips by Base and day of week

# Copy of the data with month and weekday codes replaced by their names.
data2 = data.copy().replace({'month': months_names, 'day_of_week': days_names})
# Pickup counts with one row per base and one column per weekday name.
base_days_week_trips = pd.crosstab(data2['Base'], data2['day_of_week'])
base_days_week_trips
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
day_of_week Friday Monday Saturday Sunday Thursday Tuesday Wednesday
Base
B02512 33319 25460 26773 20490 35032 31670 32929
B02598 229908 163542 198832 146652 235157 202378 216644
B02617 234379 176416 206554 164452 240216 214167 222669
B02682 201594 143372 170160 126511 205091 176198 189863
B02764 41939 32682 43795 32075 39649 39376 34383
import pygal
from pygal.style import LightenStyle
dark_lighten_style = LightenStyle('#336676')

# Series are added in calendar order rather than the table's alphabetical
# column order.
weekday_order = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

bar_chart = pygal.Bar(style=dark_lighten_style, height=250)
bar_chart.title = 'Trips by Base and day of week'
# One x position per base; one bar series per weekday.
bar_chart.x_labels = base_days_week_trips.index.values[:]
for weekday in weekday_order:
    bar_chart.add(weekday, base_days_week_trips[weekday])

bar_chart.render_to_file('trips_base_week_day.svg')

alt text

Heatmaps

Heatmap: trips by hour and day

# Pickups per (hour, day of month), expressed in thousands of trips.
trips = pd.crosstab(data.hour, data.day) / 1_000
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(10, 10))
# 13 discrete colour steps over the 0-13 range, i.e. one step per thousand trips.
im = ax.imshow(trips, cmap=plt.get_cmap("Blues", 13), vmin=0, vmax=13)

# Label every column with its day and every row with its hour.
ax.set_xticks(np.arange(len(trips.columns)), labels=trips.columns, fontsize=10)
ax.set_yticks(np.arange(len(trips.index)), labels=trips.index, fontsize=10)
ax.set_title("Trips by hour and day", fontsize=20)

cbar = ax.figure.colorbar(im, ticks=np.arange(14), fraction=0.035, ax=ax)
cbar.ax.set_ylabel("Trips in thousands", rotation=-90, va="bottom", fontsize=12)

ax.spines[:].set_visible(False)

# Minor ticks sit on the cell borders; a thick white minor grid drawn there
# separates the cells visually.
ax.set_xticks(np.arange(trips.shape[1]+1)-.5, minor=True)
ax.set_yticks(np.arange(trips.shape[0]+1)-.5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
ax.tick_params(which="minor", bottom=False, left=False)

ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('Hour', fontsize=12)

plt.show()

Heatmap: Trips by month and day

import seaborn as sns

# Pickups per (month, day of month), expressed in thousands of trips.
trips = pd.crosstab(data.month, data.day) / 1_000
# Same table with month numbers replaced by month names; this is the frame
# that gets plotted, so the heatmap rows carry readable labels.
corridas_plot = trips.rename(index=months_names)

fig, ax = plt.subplots(figsize=(20, 7))
sns.heatmap(corridas_plot,
            vmin=0,
            vmax=45,
            cmap=plt.get_cmap("Blues", 9),  # 9 discrete colour steps over 0-45
            ax=ax,
            linewidths=2)
ax.set_title('Trips by month and day', fontsize=20)
ax.set_xlabel('Day', fontsize=12)
ax.set_ylabel('', fontsize=12)  # row labels are month names; no axis title needed
ax.collections[0].colorbar.set_label('Trips in thousands', fontsize=12)

png

Heatmap: Trips by month and week day

# Pickups per (month, weekday) in thousands, with names instead of codes
# on both axes.
trips = (
    pd.crosstab(data.month, data.day_of_week)
    .div(1_000)
    .rename(index=months_names, columns=days_names)
)
import plotly.graph_objs as go

# Heatmap trace: weekdays on x, months on y, fixed 0-165 colour range and a
# 2px gap between cells.
plot = go.Heatmap(
    z=trips.values[:],
    x=trips.columns,
    y=trips.index,
    colorscale='Blues',
    xgap=2,
    ygap=2,
    zmin=0,
    zmax=165,
    colorbar={'title': 'Trips in thousands'},
)

layout = go.Layout(title='Trips by month and week day')

fig = go.Figure(data=plot, layout=layout)
fig.show()

Heatmap: Trips by Base and month

# Long-format table with one row per (Base, month): month codes replaced by
# names, and totals expressed in thousands rounded to two decimals.
trips = (
    data.groupby(['Base', 'month'])['hour']
    .count()
    .reset_index()
    .rename(columns={'hour': 'Total'})
    .replace({'month': months_names})
)
trips['Total'] = (trips['Total'] / 1000).round(2)
from plotnine import *
import plotnine as p9

p9.options.figure_size = (10, 6)

# Tile heatmap: months on x (in calendar order), bases on y, each tile filled
# by and labelled with its total.
(
    ggplot(trips)
    + aes(x='month', y='Base', fill='Total')
    + geom_tile(aes(width=.95, height=.95))
    + geom_text(aes(label='Total'), size=10)
    + labs(y='Base', x='', title='Trips by Base and month')
    + theme_minimal()
    + scale_fill_gradient(low='#cbe7ff', high='#08306b')
    + scale_x_discrete(limits=('April', 'May', 'June', 'July', 'August', 'September'))
)

png

<ggplot: (167257035978)>

Heatmap: Trips by base and week day

# Long-format table with one row per (Base, weekday): weekday codes replaced
# by names, and totals expressed in thousands rounded to two decimals.
trips = (
    data.groupby(['Base', 'day_of_week'])['hour']
    .count()
    .reset_index()
    .rename(columns={'hour': 'Total'})
    .replace({'day_of_week': days_names})
)
trips['Total'] = (trips['Total'] / 1000).round(2)
trips.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Base day_of_week Total
0 B02512 Monday 25.46
1 B02512 Tuesday 31.67
2 B02512 Wednesday 32.93
3 B02512 Thursday 35.03
4 B02512 Friday 33.32
import altair as alt

# Explicit calendar order for the x axis.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']

# Rect heatmap: weekday on x, base on y, colour taken from 'Total'.
weekday_axis = alt.X('day_of_week', axis=alt.Axis(title='Week day'), sort=weekday_order)
total_color = alt.Color('Total', scale=alt.Scale(scheme='blues'))

alt.Chart(trips, title='Trips by Base and week day').mark_rect().encode(
    x=weekday_axis,
    y='Base',
    color=total_color,
).properties(height=300, width=300)

Maps of Uber rides

import pandas as pd
import numpy as np

from bokeh.plotting import figure
from bokeh.tile_providers import get_provider, WIKIMEDIA
from bokeh.io import output_notebook, show
from pyproj import Transformer

import warnings
# NOTE: this silences every warning for the rest of the session, not just
# the ones raised in this cell.
warnings.filterwarnings("ignore")

# Longitude/latitude (EPSG:4326) -> Web Mercator (EPSG:3857), the coordinate
# system the tile layer is drawn in. always_xy=True keeps (lon, lat) order.
to_web_mercator = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)

# Project each distinct pickup location once, in a single vectorised call
# instead of one Python-level call per point.
unique_points = data[["Lon", "Lat"]].drop_duplicates()
merc_x, merc_y = to_web_mercator.transform(
    unique_points["Lon"].to_numpy(),
    unique_points["Lat"].to_numpy(),
)
data_map = pd.DataFrame({"MercatorX": merc_x, "MercatorY": merc_y})

data_map.head()
wikimedia = get_provider(WIKIMEDIA)

# Corners of the map window. Ranges are passed in ascending order
# (west -> east, south -> north) so neither axis is drawn flipped.
west, south = to_web_mercator.transform(-74.15, 40.58)
east, north = to_web_mercator.transform(-73.7, 40.92)

p = figure(plot_width=900, plot_height=700,
           x_range=(west, east), y_range=(south, north),
           x_axis_type="mercator", y_axis_type="mercator",
           title="Uber rides in NY")

p.add_tile(wikimedia)

# One small translucent dot per distinct pickup location.
p.circle(x="MercatorX", y="MercatorY",
         size=2,
         fill_color="dodgerblue", line_color="dodgerblue",
         fill_alpha=0.3,
         source=data_map)

show(p)

alt text

Map Uber rides by base

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cartopy import crs as ccrs
from cartopy import feature as cfeature
# Set the domain for defining the plot region.
latN = 40.92
latS = 40.58
lonW = -74.15
lonE = -73.7
# Centre of the region, used as the origin of the map projection.
cLat = (latN + latS)/2
cLon = (lonW + lonE )/2

# Fixed colour per base so the legend is stable between runs.
base_colors = {'B02512':'red', 'B02598':'green', 'B02617':'blue', 'B02682':'yellow', 'B02764':'gray'}
bases = data.Base.unique()

proj = ccrs.LambertConformal(central_longitude=cLon, central_latitude=cLat)
# Natural Earth scale for the ocean/coastline features: '10m' is the most
# detailed (and slowest to draw); '50m' and '110m' are coarser and faster.
res = '10m'
fig = plt.figure(figsize=(18, 12))
ax = plt.subplot(1 ,1, 1, projection=proj)
ax.set_extent ([lonW, lonE, latS, latN])
ax.add_feature (cfeature.OCEAN.with_scale(res))
ax.add_feature(cfeature.COASTLINE.with_scale(res))
ax.set_title ('New York Map on Uber rides during 2014 (Apr-Sep) by Base')

# One scatter layer per base. The rows are filtered once per base and both
# coordinates are read from that subset.
for base in bases:
    base_rides = data[data.Base == base]
    ax.scatter(base_rides.Lon, base_rides.Lat, s=9, c=base_colors[base],
               edgecolor=None, alpha=0.75,
               transform=ccrs.PlateCarree(),  # the points are plain lon/lat
               label=base)

plt.legend()
plt.show()

png

Releases

No releases published

Packages

No packages published