In [290]:
#Importing the required libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
mpl.style.use("ggplot")
import matplotlib.pyplot as plt
import scipy as sp
import sklearn

Here we use the flights data and the planes data from R under the library "nycflights13" to perform our exploratory analysis

In [291]:
#Read the csv file into a local variable to create a data frame
flights_df= pd.read_csv('flights.csv')
In [292]:
#Use the head function to see the first 10 rows of the data frame
flights_df.head(10)
Out[292]:
Unnamed: 0 year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight origin dest air_time distance hour minute
0 1 2013 1 1 517 2 830 11 UA N14228 1545 EWR IAH 227 1400 5 17
1 2 2013 1 1 533 4 850 20 UA N24211 1714 LGA IAH 227 1416 5 33
2 3 2013 1 1 542 2 923 33 AA N619AA 1141 JFK MIA 160 1089 5 42
3 4 2013 1 1 544 -1 1004 -18 B6 N804JB 725 JFK BQN 183 1576 5 44
4 5 2013 1 1 554 -6 812 -25 DL N668DN 461 LGA ATL 116 762 5 54
5 6 2013 1 1 554 -4 740 12 UA N39463 1696 EWR ORD 150 719 5 54
6 7 2013 1 1 555 -5 913 19 B6 N516JB 507 EWR FLL 158 1065 5 55
7 8 2013 1 1 557 -3 709 -14 EV N829AS 5708 LGA IAD 53 229 5 57
8 9 2013 1 1 557 -3 838 -8 B6 N593JB 79 JFK MCO 140 944 5 57
9 10 2013 1 1 558 -2 753 8 AA N3ALAA 301 LGA ORD 138 733 5 58
In [293]:
#Check to see all the destination airports
flights_df["dest"].unique()
Out[293]:
array(['IAH', 'MIA', 'BQN', 'ATL', 'ORD', 'FLL', 'IAD', 'MCO', 'PBI',
       'TPA', 'LAX', 'SFO', 'DFW', 'BOS', 'LAS', 'MSP', 'DTW', 'RSW',
       'SJU', 'PHX', 'BWI', 'CLT', 'BUF', 'DEN', 'SNA', 'MSY', 'SLC',
       'XNA', 'MKE', 'SEA', 'ROC', 'SYR', 'SRQ', 'RDU', 'CMH', 'JAX',
       'CHS', 'MEM', 'PIT', 'SAN', 'DCA', 'CLE', 'STL', 'MYR', 'JAC',
       'MDW', 'HNL', 'BNA', 'AUS', 'BTV', 'PHL', 'STT', 'EGE', 'AVL',
       'PWM', 'IND', 'SAV', 'CAK', 'HOU', 'LGB', 'DAY', 'ALB', 'BDL',
       'MHT', 'MSN', 'GSO', 'CVG', 'BUR', 'RIC', 'GSP', 'GRR', 'MCI',
       'ORF', 'SAT', 'SDF', 'PDX', 'SJC', 'OMA', 'CRW', 'OAK', 'SMF',
       'TUL', 'TYS', 'OKC', 'PVD', 'DSM', 'PSE', 'BHM', 'CAE', 'HDN',
       'BZN', 'MTJ', 'EYW', 'PSP', 'ACK', 'BGR', 'ABQ', 'ILM', 'MVY',
       'SBN', 'LEX', 'CHO', 'TVC', 'ANC', 'LGA'], dtype=object)
In [312]:
flights_df.head()
#Store the data frame in a new variable
flights = flights_df

#Create a new column in the data frame called dates
flights["date"] = pd.to_datetime(flights['year']*10000 + flights['month']*100 + flights['day'], format="%Y%m%d")
#Group all the rows by dates and find mean of the other column values
flights_new = flights.reset_index().groupby(['date']).mean()
In [313]:
flights_new.head()
Out[313]:
index Unnamed: 0 year month day dep_time dep_delay arr_time arr_delay flight air_time distance hour minute
date
2013-01-01 420.5 421.5 2013 1 1 1384.991647 11.548926 1562.344086 12.651023 1821.496437 169.652226 1077.429929 13.538186 31.173031
2013-01-02 1313.0 1314.0 2013 1 2 1353.960428 13.858824 1532.729904 12.692888 1917.732768 162.198276 1053.117709 13.228877 31.072727
2013-01-03 2241.5 2242.5 2013 1 3 1356.665929 10.987832 1536.290929 5.733333 1913.176149 156.593333 1037.370897 13.253319 31.334071
2013-01-04 3156.0 3157.0 2013 1 4 1347.856986 8.951595 1519.116612 -1.932819 1930.147541 150.997797 1032.475410 13.171617 30.695270
2013-01-05 3973.5 3974.5 2013 1 5 1326.089261 5.732218 1509.136681 -1.525802 1740.019444 160.792190 1067.591667 12.949791 31.110181
In [314]:
#Plot showcasing the departure delays for the year 2013
flights_new["dep_delay"].plot(figsize=(20, 5),color='green', title='Delay', linewidth = 5, label = "Departure Delay")

flights_new["arr_delay"].plot(figsize=(20, 5),color='red', title='Delay',\
                              linestyle = "dashed", linewidth = 2, label = "Arrival Delay")
import pylab
pylab.legend(loc = "upper right")
#plt.plot(flights["dep_delay"],color="red",linewidth = 1.5)
#plt.text(70, 80, r'8th March')
#plt.title("Worst day to fly out of NYC")

plt.show()

We can see from the above plot that the highest departure and arrival delay was some time in the start of March which we calculated and got as 8th March. This date was definitely the worst date to fly on for 2013.

In [299]:
#Create a new data frame which are grouped by destination and we take the mean of the other columns
flights_dest = flights_df.groupby('dest').mean()
#lights_explore.head(20)

#Create a new column called dest and use the index to populate the values
flights_dest["dest"] = flights_dest.index
flights_dest.head(10)
Out[299]:
Unnamed: 0 year month day dep_time dep_delay arr_time arr_delay flight air_time distance hour minute dest
dest
ABQ 191964.271654 2013 8.338583 16.177165 2005.732283 13.740157 2049.397638 4.381890 830.354331 249.169291 1826.000000 19.736220 32.110236 ABQ
ACK 254071.362264 2013 7.532075 16.381132 1032.664151 6.456604 1145.079245 4.852273 1301.641509 42.068182 199.000000 9.984906 34.173585 ACK
ALB 145174.808656 2013 5.665148 15.136674 1627.188544 23.620525 1701.593301 14.397129 4561.744875 31.787081 143.000000 15.980907 29.097852 ALB
ANC 279033.000000 2013 7.500000 15.000000 1635.375000 12.875000 1968.000000 -2.500000 887.000000 413.125000 3370.000000 16.125000 22.875000 ANC
ATL 168184.380076 2013 6.536509 15.712286 1293.291395 12.509824 1513.462336 11.300113 1881.501481 112.930451 757.108220 12.587111 34.580305 ATL
AUS 170830.947109 2013 6.488725 15.679787 1521.476427 13.025641 1614.333058 6.019909 1024.496515 212.727914 1514.252973 14.916460 29.830438 AUS
AVL 196533.741818 2013 8.170909 15.716364 1174.764259 8.190114 1372.638783 8.003831 4278.534545 89.888889 583.581818 11.414449 33.319392 AVL
BDL 182838.677201 2013 5.345372 15.758465 1490.310680 17.720874 1548.604369 7.048544 4353.214447 25.466019 116.000000 14.609223 29.388350 BDL
BGR 173297.730667 2013 8.709333 16.112000 1689.716667 19.475000 1715.203911 8.027933 5299.541333 54.117318 378.000000 16.572222 32.494444 BGR
BHM 175548.336700 2013 6.228956 15.949495 1943.639706 29.694853 2028.096654 16.877323 5033.511785 122.776952 865.996633 19.095588 34.080882 BHM
In [300]:
#Create a plot that shows the average air time based on destination
y_pos = np.arange(len(flights_dest["dest"]))
performance = flights_dest["air_time"]


plt.figure(figsize=(30,10))

plt.bar(y_pos, performance, alpha=0.8, color = "green",width=1, align = "center")
plt.xticks(y_pos, flights_dest["dest"],rotation=90)
plt.ylabel('Air Time')
plt.title('Average Air Time by Destination')

plt.autoscale()
plt.show()

From this plot we can see that the flights to Honolulu were on average the longest "in air" flights from NYC

In [301]:
#Create a new data frame which are grouped by destination and we take the mean of the other columns
flights_dest = flights_df.groupby('dest').sum()
#lights_explore.head(20)

#Create a new column called dest and use the index to populate the values
flights_dest["dest"] = flights_dest.index
flights_dest.head(10)
Out[301]:
Unnamed: 0 year month day dep_time dep_delay arr_time arr_delay flight air_time distance hour minute dest
dest
ABQ 48758925 511302 2118 4109 509456 3490 520547 1113 210910 63289 463804 5013 8156 ABQ
ACK 67328911 533445 1996 4341 273656 1711 303446 1281 344935 11106 52735 2646 9056 ACK
ALB 63731741 883707 2487 6645 681792 9897 711266 6018 2002606 13287 62777 6696 12192 ALB
ANC 2232264 16104 60 120 13083 103 15744 -20 7096 3305 26960 129 183 ANC
ATL 2895294103 34653795 112526 270487 21854038 211391 25536650 190260 32390048 1901410 13033618 212697 584338 ATL
AUS 416656680 4909707 15826 38243 3678930 31496 3901843 14514 2498747 512887 3693263 36068 72130 AUS
AVL 54046779 553575 2247 4322 308963 2154 361004 2089 1176597 23461 160485 3002 8763 AVL
BDL 80997534 891759 2368 6981 614008 7301 638025 2904 1928474 10492 51388 6019 12108 BDL
BGR 64986649 754875 3266 6042 608298 7011 614043 2874 1987328 19374 141750 5966 11698 BGR
BHM 52137856 597861 1850 4737 528670 8077 545558 4540 1494953 33027 257201 5194 9270 BHM
In [333]:
#Create a plot that shows the average air time based on destination
y_pos = np.arange(len(flights_dest["dest"]))
performance = flights_dest["distance"]


plt.figure(figsize=(30,10))

plt.bar(y_pos, performance, alpha=0.8, color = "green",width=1, align = "center")
plt.xticks(y_pos, flights_dest["dest"],rotation=90)
plt.ylabel('Distance')
plt.title('Average Distance by Destination')

plt.autoscale()
plt.show()

This plot shows that on average the distance covered by flights from NYC to Los Angeles were the longest in terms of distance which is understandle since the two locations are on the opposite ends of the country.

In [331]:
flights_new_hour["hour"] = flights_new_hour.index
flights_new_hour.head(4)

plt.figure(figsize=(20,5))

plt.scatter(flights_new_hour["hour"], flights_new_hour["dep_delay"], s=150, alpha=0.5, color = "green")
plt.scatter(flights_new_hour["hour"], flights_new_hour["arr_delay"], s=150, alpha=0.5, color = "blue")
Out[331]:
<matplotlib.collections.PathCollection at 0x12d2646d0>

As we can clearly see that the departure delay is at its highest during the early morning hours from midnight to 4AM. Through the rest of the day although there is some departure delay, it is not as significant as the early morning delays. Also, we can see that at around 4-5AM the delay is minimum and tends to increase over the course of the day with the highest peak at around 3AM. From about 9PM the delays tend to start getting higher. The arrival delay follows a similar pattern as the departure delay as can be seen which might lead us to believe that the flights that arrived late were probably the ones that departed late as well. To confirm this we would have to perform further analysis.

In [304]:
#Create a scatter plot to further showcase the relation between the Average Departure Delay and 
#Average Distance travelled by the flights
plt.figure(figsize=(20,5))

colors = np.arange(len(flights_explore["carrier"]))
plt.scatter(flights_explore["dep_delay"], flights_explore["distance"], s=150, alpha=0.75, color = "green")
plt.xlabel('Departure Delays')
plt.ylabel('Distance')
plt.text(5.1, 5000, r'HA')
plt.text(20.5, 1500, r'F9')
plt.text(20.25, 500, r'EV')
#plt.legend()

plt.show()

We can see from the above scatter plot which is also supported by the bar plots that the flights that travel large distances tend to have lower departure delay as can be seen by the carrier "HA". We also notice a small cluster of carriers that have a higher departure delay on average and the interesting thing to note here is that they all travelled smaller distances on average, in fact, they all travelled less than 2000 miles. There are carriers where even for smaller distances there is a small delay but for the most of the carriers we can see that the average departure delay is lower if the average distance is higher and for a higher departure delay we have carriers travelling a smaller distance on average.

In [305]:
#Store the planes data set in a new data frame
planes_df= pd.read_csv('planes.csv')

#Merge the two data frames flights and planes together by using left join based on the tailnum column
flights_planes = pd.merge(flights, planes_df, how='left', on=['tailnum'])
#planes_df.head(10)

#Group the data in the new merged data frame based on date and engines and then take the mean of the other columns
flights_planes_grouped = flights_planes.groupby(['date','engines']).mean()

#Create a new column called season based on the below conditions
flights_planes_grouped.loc[(flights_planes_grouped["month"] == 1) |\
                           (flights_planes_grouped["month"] == 2) |\
                           (flights_planes_grouped["month"] == 12),"season"]="Winter"
flights_planes_grouped.loc[(flights_planes_grouped['month'] == 3) |\
                           (flights_planes_grouped['month'] == 4) |\
                           (flights_planes_grouped['month'] == 5), 'season'] = 'Spring'
flights_planes_grouped.loc[(flights_planes_grouped['month'] == 6) |\
                           (flights_planes_grouped['month'] == 7) |\
                           (flights_planes_grouped['month'] == 8), 'season'] = 'Summer'
flights_planes_grouped.loc[(flights_planes_grouped['month'] == 9) |\
                           (flights_planes_grouped['month'] == 10)|\
                           (flights_planes_grouped['month'] == 11), 'season'] = 'Autumn'

flights_planes_grouped.reset_index(inplace=True)
flights_planes_grouped.head(20)
#flights.head()
#Plot a relation between arrival delay and distance
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
ggplot(aes(x="distance",y="arr_delay",color="season"),data=flights_planes_grouped)+\
geom_point(size = 100, alpha=0.5)+\
xlab("Mean Distance")+ylab("Mean Arrival Delay")+ggtitle("Mean Arrival Delay vs Mean Distance")+gsize
Out[305]:
<ggplot: (331524537)>

From the above plot we can see that most of the flights have a mean distance ranging from 750 to 1500 miles. There are a few outliers as well that are beyond 2500 miles but what is the most noticeable in the plot is the cluster of distances during Autumn. Also, most of the arrival delays seem to happen during summer.

In [306]:
flights_planes_grouped.reset_index(inplace=True)
flights_planes_grouped.head(20)
#flights.head()
#Plot a relation between departure delay and distance
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
ggplot(aes(x="distance",y="dep_delay",color="season"),data=flights_planes_grouped)+\
geom_point(size = 100, alpha = 0.5)+xlab("Mean Distance")+ylab("Mean Departure Delay")+\
ggtitle("Mean Departure Delay vs Mean Distance")+gsize
Out[306]:
<ggplot: (331626329)>

From the above plot we can see a most of the points are clustered between 700 to 1500 miles and similar to the plot comprising of the arrival delay we can see some outliers beyond the 2500 miles mark. There are 3-4 cases of where the departure delay is high during the summer and 1 such case during spring.

In [307]:
#Group the data in the new merged data frame based on engines and take the mean of the other columns
flights_planes_grouped = flights_planes.groupby('engines').mean()

flights_planes_grouped["engines"] = flights_planes_grouped.index
flights_planes_grouped.head(10)

gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
ggplot(aes(x="air_time",y="distance",color="engines"),data=flights_planes_grouped)+\
geom_point(size = 150, alpha = 0.75)+xlab("Mean Air Time")+ylab("Mean Distance")+\
ggtitle("Mean Distance vs Mean Air Time")+gsize
Out[307]:
<ggplot: (316538169)>

Here we try to look at the mean distance versus the mean air time. It is obvious that with increase in distance the air time is increasing. Something to note here is that air time for a 1 engine plane is highest on average. This could be because there are a lot of flights that are travelling using a single engine as compared to a flight with more number of engines.

In [308]:
#Group the data in the new merged data frame based on manufacturer and take the mean of the other columns
flights_planes_grouped = flights_planes.groupby('manufacturer').mean()
flights_planes_grouped["manufacturer"] = flights_planes_grouped.index

#Create a plot that shows the average departure delay based on manufacturer
y_pos = np.arange(len(flights_planes_grouped["manufacturer"]))
performance = flights_planes_grouped["dep_delay"]


plt.figure(figsize=(30,10))

plt.bar(y_pos, performance, alpha=0.75, color = "green",width=1, align = "center")
plt.xticks(y_pos, flights_planes_grouped["manufacturer"],rotation=90)
plt.ylabel('Mean Departure Delay')
plt.title('Mean Departure Delay by Manufacturer')

plt.autoscale()
plt.show()

The above plot shows that on average the departure delay is high for most of the manufacturers except 3. However, this could be owing to the fact that the flights that are manufactured by them are far greater in number. The highest departure delay is for the manufacturer "AUGUSTA SPA" which is significantly higher than the rest of the cases of departure delay.

In [309]:
#Group the data in the new merged data frame based on date and take the mean of the other columns
flights_planes_grouped = flights_planes.groupby('manufacturer').mean()
flights_planes_grouped["manufacturer"] = flights_planes_grouped.index

#Create a plot that shows the average air time based on destination
y_pos = np.arange(len(flights_planes_grouped["manufacturer"]))
performance = flights_planes_grouped["arr_delay"]


plt.figure(figsize=(30,10))

plt.bar(y_pos, performance, alpha=0.75, color = "green",width=1, align = "center")
plt.xticks(y_pos, flights_planes_grouped["manufacturer"],rotation=90)
plt.ylabel('Mean Arrival Delay')
plt.title('Mean Arrival Delay by Manufacturer')

plt.autoscale()
plt.show()

The above plot shows that on average the arrival delay is lesser as compared to the average departure delay that we discussed about in the above plot. The arrival delay is considerably lesser for most of the manufacturers except for "AUGUSTA SPA" which has a relatively higher delay as compared to the rest of the manufacturers.

In [310]:
#Group the data in the new merged data frame based on manufactured year and take the mean of the other columns
flights_planes_grouped = flights_planes.groupby('year_y').mean()
flights_planes_grouped["year_y"] = flights_planes_grouped.index
flights_planes_grouped.head(5)

#Create a plot that shows mean departure delay by year of manufacture
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
ggplot(aes(x="year_y",y="dep_delay"),data=flights_planes_grouped)+\
geom_point(color="green", size = 150, alpha = 0.75)+xlab("Year of Manufacture")+\
ylab("Mean Departure Delay")+ggtitle("Mean Departure Delay by Year of Manufacture")+gsize
Out[310]:
<ggplot: (316346325)>

The above plot shows how the mean departure delay for planes manufactured during different years. The older planes seem to have a lower mean departure delay which is quite strange. It could be because these planes are used for shorter distances and lesser number of journeys which results in lower delays as compared to the flights that were manufactured recently which might make a lot of journeys and hence have delays that might average out to be a higher number.

In [311]:
#Group the data in the new merged data frame based on manufactured year and take the mean of the other columns
flights_planes_grouped = flights_planes.groupby('year_y').mean()
flights_planes_grouped["year_y"] = flights_planes_grouped.index
flights_planes_grouped.head(5)

#Create a plot that shows mean arrival delay by year of manufacture
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
ggplot(aes(x="year_y",y="arr_delay"),data=flights_planes_grouped)+\
geom_point(color="green", size = 150, alpha = 0.75)+xlab("Year of Manufacture")+\
ylab("Mean Arrival Delay")+ggtitle("Mean Arrival Delay by Year of Manufacture")+gsize
Out[311]:
<ggplot: (344065509)>

Similar to the plot of the departure delay spread out by year of manufacture, the above plot shows how the mean arrival delay for planes manufactured during different years. The spread of the delay is quite consistent but there is a slightly lesser delay in the planes manufactured in the earlier years.

In [ ]: