# Import libraries

import torch
import torch.nn as nn

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

plt.rcParams['font.size'] = 14
palette = sns.color_palette('hls', 10)


# Read the data
df = pd.read_csv('Data/NYCTaxiFares.csv')
df.shape

(120000, 8)


# Check head
df.head()


# Check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
pickup_datetime      120000 non-null object
fare_amount          120000 non-null float64
fare_class           120000 non-null int64
pickup_longitude     120000 non-null float64
pickup_latitude      120000 non-null float64
dropoff_longitude    120000 non-null float64
dropoff_latitude     120000 non-null float64
passenger_count      120000 non-null int64
dtypes: float64(5), int64(2), object(1)
memory usage: 7.3+ MB


# Check fare amount
df['fare_amount'].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64


def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine (great-circle) distance between 2 sets of GPS
    coordinates in df.

    Parameters
    ----------
    df : mapping of column label -> array-like (e.g. a pandas DataFrame)
        Holds the coordinate columns; values are converted from degrees
        to radians inside this function.
    lat1, long1 : labels of the latitude / longitude columns of the first point.
    lat2, long2 : labels of the latitude / longitude columns of the second point.

    Returns
    -------
    Element-wise distance in kilometers, in the same container type that
    indexing ``df`` yields (a Series for a DataFrame).
    """
    r = 6371  # average radius of Earth in kilometers

    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return r * c  # in kilometers


df['dist_km'] = haversine_distance(df,'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')
df.head()


# Convert the UTC pickup time to New York local time.
# tz_convert applies the correct UTC offset per timestamp (and labels the
# result with the local zone) instead of subtracting a fixed 4 hours, which
# would be wrong outside daylight-saving time and leave the values tagged UTC.
df['EDTdate'] = pd.to_datetime(df['pickup_datetime'], utc=True).dt.tz_convert('America/New_York')


# Check head
df.head()


# Hour of the day (0-23) taken from the shifted pickup timestamp
df['Hour'] = df['EDTdate'].dt.hour


# 'am' for hours before noon, 'pm' for everything else
df['AMorPM'] = np.where(df['Hour'].lt(12), 'am', 'pm')


# Abbreviated weekday name (strftime "%a")
df['Weekday'] = df['EDTdate'].dt.strftime('%a')


# Check head
df.head()


# Check min and max dates
# Series.min()/max() are vectorized; the builtins iterate the column in Python.
print(df['EDTdate'].min())
print(df['EDTdate'].max())

2010-04-11 00:00:10+00:00
2010-04-24 23:59:42+00:00


# Distribution of fare amount
plt.figure(figsize=(12,5))
sns.kdeplot(df['fare_amount'])
sns.despine()
plt.xlabel('Fare amount')
plt.ylabel('Density')
plt.title('Fare Distribution')
plt.show()


# Distribution of passenger count
# catplot is figure-level: it creates its own figure (sized through
# height/aspect), so a preceding plt.figure() would only leave an empty figure.
sns.catplot(x='passenger_count', kind='count', data=df,
            aspect=2.5, palette="icefire")
sns.despine()
plt.title('Passenger Count');


# Trips by day of the week
# catplot is figure-level: it creates its own figure (sized through
# height/aspect), so a preceding plt.figure() would only leave an empty figure.
sns.catplot(x='Weekday', kind='count', data=df,
            aspect=2.5, palette="icefire")
sns.despine()
plt.title('Trips by Day of the Week')
plt.show()

<Figure size 1200x500 with 0 Axes>

<Figure size 1200x500 with 0 Axes>


# Trips by hour of the day
# catplot is figure-level: it creates its own figure (sized through
# height/aspect), so a preceding plt.figure() would only leave an empty figure.
sns.catplot(x='Hour', kind='count', data=df,
            aspect=2.5, palette="icefire")
sns.despine()
plt.title('Trips by Hour of the Day');


# Hour-of-day distribution for each weekday.
# catplot is figure-level: it creates its own figure (sized through
# height/aspect), so no plt.figure() call is made here.
sns.catplot(x='Weekday',y='Hour',data=df, aspect=3, kind='boxen', palette="icefire")
plt.title('Trips by Day of the Week and Hour')
plt.show()

<Figure size 1200x500 with 0 Axes>

<Figure size 1200x500 with 0 Axes>


# Distribution of Trip Distance
plt.figure(figsize=(12,5))
sns.kdeplot(df['dist_km'])
sns.despine()
plt.xlabel('Trip Distance (in km)')
plt.ylabel('Density')
plt.title('Distribution of Trip Distance')
plt.show()


sns.catplot(x='Weekday',y='dist_km',data=df, aspect=2.5, hue='AMorPM', kind='boxen')
plt.title('Distance travelled by Weekday and AM/PM')
plt.show()


# Create plot
# Figure size comes from catplot's own height/aspect arguments; a separate
# plt.figure() would only produce an additional empty figure.
sns.catplot(x='Weekday', y='dist_km',data=df, aspect=2.5, hue='Hour', height=10, kind='boxen')
plt.title('Distance travelled by Day and Hour')
plt.show()

<Figure size 1500x1000 with 0 Axes>


# Plot
# Figure size comes from catplot's own height/aspect arguments; a separate
# plt.figure() would only produce an additional empty figure.
sns.catplot(x='Weekday',y='fare_amount',data=df, aspect=2.5, hue='Hour', kind='boxen', height=10)
plt.title('Fare Amount by Day and Hour')
plt.show()

<Figure size 1500x1000 with 0 Axes>


# Plot
sns.jointplot(x='fare_amount', y='dist_km', data=df);


# Check data types
df.dtypes

pickup_datetime                   object
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                               int64
AMorPM                            object
Weekday                           object
dtype: object


# Identify categorical columns
cat_cols = ['Hour', 'AMorPM', 'Weekday']

# Identify continuous columns
cont_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count', 'dist_km']

# Identify the target/response variable
y_col = ['fare_amount']  # this column contains the labels


# Re-encode each categorical feature column with pandas' category dtype
for name in cat_cols:
    df[name] = pd.Categorical(df[name])


# Check data types
df.dtypes

pickup_datetime                   object
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                            category
AMorPM                          category
Weekday                         category
dtype: object


# Check categories for Hour
df['Hour'].cat.categories

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23],
           dtype='int64')


# Check categories for AMorPM
df['AMorPM'].cat.categories

Index(['am', 'pm'], dtype='object')


# Check categories for Weekday
df['Weekday'].cat.categories

Index(['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed'], dtype='object')


# Check codes of categories for AMorPM
df['AMorPM'].head().cat.codes

0    0
1    0
2    0
3    1
4    1
dtype: int8


# Stack the integer category codes side by side: one column per categorical feature
cats_comb = np.stack(
    [df[name].cat.codes.values for name in cat_cols],
    axis=1,
)


# Check shape
cats_comb.shape

(120000, 3)


# Convert categorical columns to a tensor
cats_comb = torch.tensor(cats_comb, dtype=torch.int64)


# Check type and shape
print(cats_comb.type())
print(cats_comb.shape)

torch.LongTensor
torch.Size([120000, 3])


# Convert continuous columns to a float tensor (one column per feature)
cont_comb = torch.tensor(
    np.stack([df[name].values for name in cont_cols], axis=1),
    dtype=torch.float,
)
cont_comb[:5]

tensor([[ 40.7305, -73.9924,  40.7447, -73.9755,   1.0000,   2.1263],
        [ 40.7406, -73.9901,  40.7441, -73.9742,   1.0000,   1.3923],
        [ 40.7511, -73.9941,  40.7662, -73.9601,   2.0000,   3.3268],
        [ 40.7564, -73.9905,  40.7482, -73.9712,   1.0000,   1.8641],
        [ 40.7342, -73.9910,  40.7431, -73.9060,   1.0000,   7.2313]])


# Check type and shape
print(cont_comb.type())
print(cont_comb.shape)

torch.FloatTensor
torch.Size([120000, 6])


# Convert response variable to a tensor
y = torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1,1)
y.shape

torch.Size([120000, 1])


# Number of distinct categories in each categorical column
cat_szs = [df[name].cat.categories.size for name in cat_cols]
cat_szs

[24, 2, 7]


# Embedding size per column: (number of categories, embedding width),
# where the width is half the category count rounded up, capped at 50
emb_szs = [(n_levels, min(50, (n_levels + 1) // 2)) for n_levels in cat_szs]
emb_szs

[(24, 12), (2, 1), (7, 4)]


class TabularModel(nn.Module):
    """
    Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own embedding, the continuous
    columns are batch-normalized, and the concatenation of both is fed to a
    stack of Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a
    final Linear layer.

    Parameters
    ----------
    emb_szs : list of (num_categories, embedding_dim) tuples, one per categorical column
    n_cont  : number of continuous input columns
    layers  : list with the number of neurons of each hidden layer (may be empty,
              in which case the model is a single Linear layer)
    n_out   : number of output values
    p       : dropout probability used for the embeddings and after every hidden layer
    """

    def __init__(self, emb_szs, n_cont, layers, n_out, p=0.5):
        super().__init__()

        # One embedding per categorical column; applied column-wise in forward()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])

        # Dropout applied to the concatenated embeddings
        self.emb_drop = nn.Dropout(p)

        # Normalize continuous variables
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Input width of the first Linear layer = continuous columns + all embedding widths
        n_emb = sum(nf for _, nf in emb_szs)
        n_in = n_cont + n_emb

        # Build the hidden blocks. `n_in` always tracks the width of the
        # previous layer's output.
        #
        # e.g. layers = [200, 500, 100] gives Linear layers of shape
        #   n_in x 200, 200 x 500, 500 x 100, and a final 100 x n_out
        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width

        # Output layer. Using n_in (rather than layers[-1]) keeps this valid
        # when `layers` is empty.
        layerlist.append(nn.Linear(n_in, n_out))

        # Combine list of layers together
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cats, x_cont):
        """
        Run a forward pass.

        x_cats : 2-D integer tensor; column i holds the category codes fed to
                 the i-th embedding
        x_cont : 2-D float tensor with n_cont columns

        Returns the output of the final Linear layer (one row per input row).
        """
        # Look up each categorical column in its own embedding table
        embedded = [emb(x_cats[:, i]) for i, emb in enumerate(self.embeds)]

        # Concatenate embeddings and apply dropout
        cat_var = self.emb_drop(torch.cat(embedded, dim=1))

        # Normalize continuous variables
        cont_var = self.bn_cont(x_cont)

        # Concatenate categorical and continuous features and run the layer stack
        return self.layers(torch.cat([cat_var, cont_var], dim=1))


# Build model
torch.manual_seed(33)
model = TabularModel(emb_szs, cont_comb.shape[1], layers=[200,300,200], n_out=1, p=0.4)


# See model details
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4)
    (4): Linear(in_features=200, out_features=300, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4)
    (8): Linear(in_features=300, out_features=200, bias=True)
    (9): ReLU(inplace)
    (10): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4)
    (12): Linear(in_features=200, out_features=1, bias=True)
  )
)


# Number of rows used overall, and how many of them are held out for testing
batch_size = 60000
test_size = int(batch_size * 0.2)


# Rows [0, split_at) form the training set, rows [split_at, batch_size) the test set
split_at = batch_size - test_size
cont_train, cont_test = cont_comb[:split_at], cont_comb[split_at:batch_size]
cat_train, cat_test = cats_comb[:split_at], cats_comb[split_at:batch_size]

# Same split for the response variable
y_train, y_test = y[:split_at], y[split_at:batch_size]


# Check train, test lengths
print(len(cont_train))
print(len(cont_test))

48000
12000


# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


%%time

epochs = 300
losses = []  # RMSE per epoch, stored as plain floats

# Make sure Dropout / BatchNorm are in training mode (matters when this cell
# is re-run after the model has been switched to eval mode).
model.train()

for i in range(1, epochs + 1):
    # generate prediction on the full training set
    y_pred = model(cat_train, cont_train)
    # compute loss (RMSE = sqrt of MSE)
    loss = torch.sqrt(criterion(y_pred, y_train))
    # .item() stores a float instead of a tensor that keeps the autograd
    # graph alive and cannot be plotted directly
    losses.append(loss.item())

    if i % 10 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print loss for last epoch

epoch:   1  loss: 12.55616188
epoch:  11  loss: 11.54192066
epoch:  21  loss: 10.93466187
epoch:  31  loss: 10.50934124
epoch:  41  loss: 10.15295029
epoch:  51  loss: 9.84051609
epoch:  61  loss: 9.50658894
epoch:  71  loss: 9.12140083
epoch:  81  loss: 8.67223740
epoch:  91  loss: 8.12860489
epoch: 101  loss: 7.53865576
epoch: 111  loss: 6.87278032
epoch: 121  loss: 6.14578295
epoch: 131  loss: 5.40913677
epoch: 141  loss: 4.70694065
epoch: 151  loss: 4.10182142
epoch: 161  loss: 3.70679307
epoch: 171  loss: 3.56206393
epoch: 181  loss: 3.50243330
epoch: 191  loss: 3.46999526
epoch: 201  loss: 3.45166826
epoch: 211  loss: 3.42403889
epoch: 221  loss: 3.39421129
epoch: 231  loss: 3.40854263
epoch: 241  loss: 3.38766265
epoch: 251  loss: 3.33984447
epoch: 261  loss: 3.33268213
epoch: 271  loss: 3.33059573
epoch: 281  loss: 3.32077336
epoch: 291  loss: 3.30195522
epoch: 300  loss: 3.30223155
CPU times: user 1h 25min 9s, sys: 4min 5s, total: 1h 29min 15s
Wall time: 13min 28s


plt.plot(range(epochs), losses)
plt.ylabel('RMSE Loss')
plt.xlabel('epoch');


# Switch Dropout and BatchNorm layers to inference behaviour; torch.no_grad()
# alone only disables gradient tracking and leaves them in training mode.
model.eval()

with torch.no_grad():
    # Generate prediction
    y_val = model(cat_test, cont_test)
    # Compute loss (RMSE)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss:.8f}')

RMSE: 3.24607873


# Side-by-side table of the first 20 predictions, the true fares and the absolute error
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
for row in range(20):
    predicted = y_val[row].item()
    actual = y_test[row].item()
    diff = np.abs(predicted - actual)
    print(f'{row+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')

   PREDICTED   ACTUAL     DIFF
 1.   5.2433   2.9000   2.3433
 2.  23.3398   5.7000  17.6398
 3.   5.7719   7.7000   1.9281
 4.  13.4511  12.5000   0.9511
 5.   5.1976   4.1000   1.0976
 6.   6.6128   5.3000   1.3128
 7.   3.3402   3.7000   0.3598
 8.  17.9253  14.5000   3.4253
 9.   4.5816   5.7000   1.1184
10.  11.3231  10.1000   1.2231
11.   7.0768   4.5000   2.5768
12.   6.2211   6.1000   0.1211
13.   6.5000   6.9000   0.4000
14.  13.2582  14.1000   0.8418
15.   5.6556   4.5000   1.1556
16.  40.2175  34.1000   6.1175
17.   2.2090  12.5000  10.2910
18.   5.2140   4.1000   1.1140
19.  10.4609   8.5000   1.9609
20.   5.8903   5.3000   0.5903


# Only persist the weights when a full training run (one loss per epoch) has been recorded
if len(losses) != epochs:
    print('Model has not been trained. Consider loading a trained model instead.')
else:
    torch.save(model.state_dict(), 'TaxiFareRegModel.pt')
    print('Model saved')

Model saved

	pickup_datetime	fare_amount	fare_class	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count
0	2010-04-19 08:17:56 UTC	6.5	0	-73.992365	40.730521	-73.975499	40.744746	1
1	2010-04-17 15:43:53 UTC	6.9	0	-73.990078	40.740558	-73.974232	40.744114	1
2	2010-04-17 11:23:26 UTC	10.1	1	-73.994149	40.751118	-73.960064	40.766235	2
3	2010-04-11 21:25:03 UTC	8.9	0	-73.990485	40.756422	-73.971205	40.748192	1
4	2010-04-17 02:19:01 UTC	19.7	1	-73.990976	40.734202	-73.905956	40.743115	1

	pickup_datetime	fare_amount	fare_class	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count	dist_km
0	2010-04-19 08:17:56 UTC	6.5	0	-73.992365	40.730521	-73.975499	40.744746	1	2.126312
1	2010-04-17 15:43:53 UTC	6.9	0	-73.990078	40.740558	-73.974232	40.744114	1	1.392307
2	2010-04-17 11:23:26 UTC	10.1	1	-73.994149	40.751118	-73.960064	40.766235	2	3.326763
3	2010-04-11 21:25:03 UTC	8.9	0	-73.990485	40.756422	-73.971205	40.748192	1	1.864129
4	2010-04-17 02:19:01 UTC	19.7	1	-73.990976	40.734202	-73.905956	40.743115	1	7.231321

	pickup_datetime	fare_amount	fare_class	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count	dist_km	EDTdate
0	2010-04-19 08:17:56 UTC	6.5	0	-73.992365	40.730521	-73.975499	40.744746	1	2.126312	2010-04-19 04:17:56+00:00
1	2010-04-17 15:43:53 UTC	6.9	0	-73.990078	40.740558	-73.974232	40.744114	1	1.392307	2010-04-17 11:43:53+00:00
2	2010-04-17 11:23:26 UTC	10.1	1	-73.994149	40.751118	-73.960064	40.766235	2	3.326763	2010-04-17 07:23:26+00:00
3	2010-04-11 21:25:03 UTC	8.9	0	-73.990485	40.756422	-73.971205	40.748192	1	1.864129	2010-04-11 17:25:03+00:00
4	2010-04-17 02:19:01 UTC	19.7	1	-73.990976	40.734202	-73.905956	40.743115	1	7.231321	2010-04-16 22:19:01+00:00

	pickup_datetime	fare_amount	fare_class	pickup_longitude	pickup_latitude	dropoff_longitude	dropoff_latitude	passenger_count	dist_km	EDTdate	Hour	AMorPM	Weekday
0	2010-04-19 08:17:56 UTC	6.5	0	-73.992365	40.730521	-73.975499	40.744746	1	2.126312	2010-04-19 04:17:56+00:00	4	am	Mon
1	2010-04-17 15:43:53 UTC	6.9	0	-73.990078	40.740558	-73.974232	40.744114	1	1.392307	2010-04-17 11:43:53+00:00	11	am	Sat
2	2010-04-17 11:23:26 UTC	10.1	1	-73.994149	40.751118	-73.960064	40.766235	2	3.326763	2010-04-17 07:23:26+00:00	7	am	Sat
3	2010-04-11 21:25:03 UTC	8.9	0	-73.990485	40.756422	-73.971205	40.748192	1	1.864129	2010-04-11 17:25:03+00:00	17	pm	Sun
4	2010-04-17 02:19:01 UTC	19.7	1	-73.990976	40.734202	-73.905956	40.743115	1	7.231321	2010-04-16 22:19:01+00:00	22	pm	Fri

NYC Taxi Fare Prediction using Pytorch¶

Table of Contents

Overview¶

Introduction¶

Read the Data¶

Feature Engineering¶

Calculate distance¶

Extract date time columns¶

Add columns for hour, day of week and am/pm¶

Data Exploration¶

Distribution of Fare Amount¶

Distribution of Passengers¶

Trips by Day of the Week¶

Trips by Hour of the Day¶

Trips by Day of the Week and Hour¶

Distribution of Trip Distance¶

Distance travelled by Weekday and AM/PM¶

Distance travelled by Day and Hour¶

Fare Amount by Day and Hour¶

Fare amount by Distance¶

Categorify the data¶

Separate categorical and continuous columns¶

Convert columns to category type¶

Combine categorical columns¶

Tensors and Embeddings¶

Create tensors for categorical and continuous data¶

Set Embedding Sizes (learned dense vectors used in place of one-hot encoding for categorical data)¶

Build the Model¶

Define Tabular Model¶

Build Model¶

Train and Test the Model¶

Split the data¶

Define loss function and optimizer¶

Train the Model¶

Plot Training Loss¶

Validate the Model¶

Save the Model¶

Summary¶