#Importing the required libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
mpl.style.use("ggplot")
import matplotlib.pyplot as plt
import scipy as sp
import sklearn
Here we use the flights and planes data sets from the R package "nycflights13" to perform our exploratory analysis.
# Read the flights CSV file into a data frame
flights_df = pd.read_csv('flights.csv')
# Preview the first 10 rows of the data frame
flights_df.head(10)
# List every distinct destination airport
flights_df["dest"].unique()
flights_df.head()
# Work on an independent copy: a plain assignment would only alias
# flights_df, so the "date" column added below would leak into it
flights = flights_df.copy()
# Build a datetime column by packing year/month/day into a YYYYMMDD integer
flights["date"] = pd.to_datetime(
    flights['year'] * 10000 + flights['month'] * 100 + flights['day'],
    format="%Y%m%d",
)
# Average the numeric columns per calendar date.
# numeric_only=True keeps non-numeric columns out of the mean (pandas >= 2.0
# raises TypeError on them instead of silently dropping them); the former
# reset_index() is gone because it only added a meaningless averaged
# row-number column.
flights_new = flights.groupby('date').mean(numeric_only=True)
flights_new.head()
# Plot the mean departure and arrival delay per date for the year 2013.
# Both series are drawn on one explicit Axes object so the legend and labels
# do not depend on matplotlib's implicit "current axes" state.
ax = flights_new["dep_delay"].plot(
    figsize=(20, 5), color='green', title='Delay', linewidth=5,
    label="Departure Delay",
)
flights_new["arr_delay"].plot(
    ax=ax, color='red', linestyle="dashed", linewidth=2,
    label="Arrival Delay",
)
# NOTE(review): pylab is no longer used in this cell (the legend goes through
# the Axes object); the import is kept in case other cells reference it.
import pylab
ax.set_ylabel("Mean Delay")
ax.legend(loc="upper right")
plt.show()
We can see from the above plot that the highest departure and arrival delays occurred at the start of March; we calculated the exact date to be 8th March. This was definitely the worst date to fly on in 2013.
# Group the flights by destination and average the numeric columns.
# numeric_only=True keeps non-numeric columns out of the mean, which
# pandas >= 2.0 would otherwise reject with a TypeError.
flights_dest = flights_df.groupby('dest').mean(numeric_only=True)
# Expose the destination codes (the group index) as a regular column
flights_dest["dest"] = flights_dest.index
flights_dest.head(10)
# Bar chart of the average air time for each destination
y_pos = np.arange(len(flights_dest["dest"]))  # one bar position per destination
performance = flights_dest["air_time"]
plt.figure(figsize=(30, 10))
plt.bar(y_pos, performance, alpha=0.8, color="green", width=1, align="center")
plt.xticks(y_pos, flights_dest["dest"], rotation=90)
plt.ylabel('Air Time')
plt.title('Average Air Time by Destination')
plt.autoscale()
plt.show()
From this plot we can see that flights to Honolulu had, on average, the longest air time of all flights from NYC.
# Group the flights by destination and SUM the numeric columns.
# The values plotted below are therefore totals over all flights, not
# per-flight averages; numeric_only=True keeps non-numeric columns out of
# the sum.
flights_dest = flights_df.groupby('dest').sum(numeric_only=True)
# Expose the destination codes (the group index) as a regular column
flights_dest["dest"] = flights_dest.index
flights_dest.head(10)
# Bar chart of the total distance flown to each destination
y_pos = np.arange(len(flights_dest["dest"]))  # one bar position per destination
performance = flights_dest["distance"]
plt.figure(figsize=(30, 10))
plt.bar(y_pos, performance, alpha=0.8, color="green", width=1, align="center")
plt.xticks(y_pos, flights_dest["dest"], rotation=90)
plt.ylabel('Distance')
# The title used to say "Average", which contradicted the sum() above
plt.title('Total Distance by Destination')
plt.autoscale()
plt.show()
This plot shows that, summed over all flights, the distance covered from NYC to Los Angeles was the largest, which is understandable since the two locations are on opposite ends of the country.
# Average the numeric columns per hour.
# NOTE(review): flights_new_hour was not defined anywhere in the visible part
# of this file; grouping flights by an "hour" column is assumed from the index
# being copied into an "hour" column below — confirm against the original cell.
flights_new_hour = flights.groupby('hour').mean(numeric_only=True)
# Expose the hour (the group index) as a regular column for plotting
flights_new_hour["hour"] = flights_new_hour.index
flights_new_hour.head(4)
# Scatter of the mean departure (green) and arrival (blue) delay for each hour
plt.figure(figsize=(20, 5))
plt.scatter(flights_new_hour["hour"], flights_new_hour["dep_delay"],
            s=150, alpha=0.5, color="green", label="Departure Delay")
plt.scatter(flights_new_hour["hour"], flights_new_hour["arr_delay"],
            s=150, alpha=0.5, color="blue", label="Arrival Delay")
plt.xlabel('Hour')
plt.ylabel('Mean Delay')
plt.legend(loc="upper right")
plt.show()
We can clearly see that the departure delay is at its highest during the early morning hours from midnight to 4AM. Through the rest of the day, although there is some departure delay, it is not as significant as the early morning delays. Also, we can see that the delay is at its minimum at around 4-5AM and then tends to increase over the course of the day, starting to climb more steeply from about 9PM and reaching its highest peak at around 3AM. The arrival delay follows a similar pattern to the departure delay, which might lead us to believe that the flights that arrived late were probably the ones that departed late as well. To confirm this we would have to perform further analysis.
# Scatter plot of the average departure delay against the average distance
# travelled, one point per carrier.
# NOTE(review): flights_explore was not defined anywhere in the visible part
# of this file; per-carrier means are assumed from the "carrier" column and
# the carrier-code labels used below — confirm against the original cell.
flights_explore = flights.groupby('carrier').mean(numeric_only=True)
flights_explore["carrier"] = flights_explore.index
plt.figure(figsize=(20, 5))
plt.scatter(flights_explore["dep_delay"], flights_explore["distance"],
            s=150, alpha=0.75, color="green")
plt.xlabel('Departure Delays')
plt.ylabel('Distance')
# Hand-placed labels at fixed (delay, distance) coordinates for three carriers
plt.text(5.1, 5000, r'HA')
plt.text(20.5, 1500, r'F9')
plt.text(20.25, 500, r'EV')
plt.show()
We can see from the above scatter plot, which is also supported by the bar plots, that flights that travel large distances tend to have a lower departure delay, as can be seen for the carrier "HA". We also notice a small cluster of carriers that have a higher departure delay on average; the interesting thing to note is that they all travelled smaller distances on average, in fact less than 2000 miles. There are carriers with only a small delay even for smaller distances, but for most carriers the average departure delay is lower when the average distance is higher, and carriers with a higher departure delay travel a smaller distance on average.
# Load the planes data set into its own data frame
planes_df = pd.read_csv('planes.csv')
# Left-join the planes onto the flights via the shared tailnum column, so
# every flight row is kept even when no plane record matches
flights_planes = pd.merge(flights, planes_df, how='left', on=['tailnum'])
# Average the numeric columns per (date, engines) pair.
# numeric_only=True keeps non-numeric columns out of the mean, which
# pandas >= 2.0 would otherwise reject with a TypeError.
flights_planes_grouped = flights_planes.groupby(['date', 'engines']).mean(numeric_only=True)
# Label each row with a season derived from its month; rows whose month is
# in none of the lists are left without a season (NaN), as before.
months_by_season = {
    "Winter": [1, 2, 12],
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
}
for season_name, season_months in months_by_season.items():
    in_season = flights_planes_grouped["month"].isin(season_months)
    flights_planes_grouped.loc[in_season, "season"] = season_name
# Turn the (date, engines) index levels back into ordinary columns
flights_planes_grouped.reset_index(inplace=True)
flights_planes_grouped.head(20)
# Scatter of mean arrival delay against mean distance, coloured by season.
# NOTE(review): ggplot, aes, geom_point, xlab, ylab, ggtitle and
# theme_matplotlib are not imported in the visible part of this file —
# confirm they are brought into scope elsewhere.
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
(
    ggplot(aes(x="distance", y="arr_delay", color="season"), data=flights_planes_grouped)
    + geom_point(size=100, alpha=0.5)
    + xlab("Mean Distance")
    + ylab("Mean Arrival Delay")
    + ggtitle("Mean Arrival Delay vs Mean Distance")
    + gsize
)
From the above plot we can see that most of the flights have a mean distance ranging from 750 to 1500 miles. There are a few outliers beyond 2500 miles, but what is most noticeable in the plot is the cluster of distances during Autumn. Also, most of the arrival delays seem to happen during summer.
# The frame's index was already reset when it was built, so it is not reset
# again here: a second reset_index(inplace=True) only adds a junk "index"
# column and makes the cell fail when it is re-run.
flights_planes_grouped.head(20)
# Scatter of mean departure delay against mean distance, coloured by season.
# NOTE(review): ggplot, aes, geom_point, xlab, ylab, ggtitle and
# theme_matplotlib are not imported in the visible part of this file —
# confirm they are brought into scope elsewhere.
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
(
    ggplot(aes(x="distance", y="dep_delay", color="season"), data=flights_planes_grouped)
    + geom_point(size=100, alpha=0.5)
    + xlab("Mean Distance")
    + ylab("Mean Departure Delay")
    + ggtitle("Mean Departure Delay vs Mean Distance")
    + gsize
)
From the above plot we can see that most of the points are clustered between 700 and 1500 miles and, similar to the arrival delay plot, there are some outliers beyond the 2500 mile mark. There are 3-4 cases where the departure delay is high during the summer and 1 such case during spring.
# Group the merged flights/planes frame by number of engines and average the
# numeric columns. numeric_only=True keeps non-numeric columns out of the
# mean, which pandas >= 2.0 would otherwise reject with a TypeError.
flights_planes_grouped = flights_planes.groupby('engines').mean(numeric_only=True)
# Expose the engine count (the group index) as a regular column for plotting
flights_planes_grouped["engines"] = flights_planes_grouped.index
flights_planes_grouped.head(10)
# Scatter of mean distance against mean air time, coloured by engine count.
# NOTE(review): ggplot, aes, geom_point, xlab, ylab, ggtitle and
# theme_matplotlib are not imported in the visible part of this file —
# confirm they are brought into scope elsewhere.
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
(
    ggplot(aes(x="air_time", y="distance", color="engines"), data=flights_planes_grouped)
    + geom_point(size=150, alpha=0.75)
    + xlab("Mean Air Time")
    + ylab("Mean Distance")
    + ggtitle("Mean Distance vs Mean Air Time")
    + gsize
)
Here we look at the mean distance versus the mean air time. It is clear that the air time increases as the distance increases. Something to note here is that the air time for 1-engine planes is the highest on average. This could be because there are a lot more flights using a single-engine plane than flights using planes with more engines.
# Group the merged flights/planes frame by manufacturer and average the
# numeric columns. numeric_only=True keeps non-numeric columns out of the
# mean, which pandas >= 2.0 would otherwise reject with a TypeError.
flights_planes_grouped = flights_planes.groupby('manufacturer').mean(numeric_only=True)
# Expose the manufacturer names (the group index) as a regular column
flights_planes_grouped["manufacturer"] = flights_planes_grouped.index
# Bar chart of the mean departure delay for each manufacturer
y_pos = np.arange(len(flights_planes_grouped["manufacturer"]))  # one bar position per manufacturer
performance = flights_planes_grouped["dep_delay"]
plt.figure(figsize=(30, 10))
plt.bar(y_pos, performance, alpha=0.75, color="green", width=1, align="center")
plt.xticks(y_pos, flights_planes_grouped["manufacturer"], rotation=90)
plt.ylabel('Mean Departure Delay')
plt.title('Mean Departure Delay by Manufacturer')
plt.autoscale()
plt.show()
The above plot shows that on average the departure delay is high for most of the manufacturers except 3. However, this could be owing to the fact that the planes manufactured by them are far greater in number. The highest departure delay is for the manufacturer "AGUSTA SPA", which is significantly higher than the departure delay of any other manufacturer.
# Group the merged flights/planes frame by manufacturer and average the
# numeric columns. numeric_only=True keeps non-numeric columns out of the
# mean, which pandas >= 2.0 would otherwise reject with a TypeError.
flights_planes_grouped = flights_planes.groupby('manufacturer').mean(numeric_only=True)
# Expose the manufacturer names (the group index) as a regular column
flights_planes_grouped["manufacturer"] = flights_planes_grouped.index
# Bar chart of the mean arrival delay for each manufacturer
y_pos = np.arange(len(flights_planes_grouped["manufacturer"]))  # one bar position per manufacturer
performance = flights_planes_grouped["arr_delay"]
plt.figure(figsize=(30, 10))
plt.bar(y_pos, performance, alpha=0.75, color="green", width=1, align="center")
plt.xticks(y_pos, flights_planes_grouped["manufacturer"], rotation=90)
plt.ylabel('Mean Arrival Delay')
plt.title('Mean Arrival Delay by Manufacturer')
plt.autoscale()
plt.show()
The above plot shows that on average the arrival delay is lower than the average departure delay discussed for the previous plot. The arrival delay is considerably lower for most of the manufacturers except for "AGUSTA SPA", which has a relatively higher delay compared to the rest of the manufacturers.
# Group the merged flights/planes frame by year_y (the year of manufacture)
# and average the numeric columns. numeric_only=True keeps non-numeric
# columns out of the mean, which pandas >= 2.0 would otherwise reject with a
# TypeError.
flights_planes_grouped = flights_planes.groupby('year_y').mean(numeric_only=True)
# Expose the year of manufacture (the group index) as a regular column
flights_planes_grouped["year_y"] = flights_planes_grouped.index
flights_planes_grouped.head(5)
# Scatter of the mean departure delay for each year of manufacture.
# NOTE(review): ggplot, aes, geom_point, xlab, ylab, ggtitle and
# theme_matplotlib are not imported in the visible part of this file —
# confirm they are brought into scope elsewhere.
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
(
    ggplot(aes(x="year_y", y="dep_delay"), data=flights_planes_grouped)
    + geom_point(color="green", size=150, alpha=0.75)
    + xlab("Year of Manufacture")
    + ylab("Mean Departure Delay")
    + ggtitle("Mean Departure Delay by Year of Manufacture")
    + gsize
)
The above plot shows the mean departure delay for planes manufactured in different years. The older planes seem to have a lower mean departure delay, which is quite strange. It could be because these planes are used for shorter distances and fewer journeys, which results in lower delays compared to the planes that were manufactured recently, which might make a lot of journeys and hence have delays that average out to a higher number.
# Group the merged flights/planes frame by year_y (the year of manufacture)
# and average the numeric columns. numeric_only=True keeps non-numeric
# columns out of the mean, which pandas >= 2.0 would otherwise reject with a
# TypeError.
flights_planes_grouped = flights_planes.groupby('year_y').mean(numeric_only=True)
# Expose the year of manufacture (the group index) as a regular column
flights_planes_grouped["year_y"] = flights_planes_grouped.index
flights_planes_grouped.head(5)
# Scatter of the mean arrival delay for each year of manufacture.
# NOTE(review): ggplot, aes, geom_point, xlab, ylab, ggtitle and
# theme_matplotlib are not imported in the visible part of this file —
# confirm they are brought into scope elsewhere.
gsize = theme_matplotlib(rc={"figure.figsize": "20, 5"}, matplotlib_defaults=False)
(
    ggplot(aes(x="year_y", y="arr_delay"), data=flights_planes_grouped)
    + geom_point(color="green", size=150, alpha=0.75)
    + xlab("Year of Manufacture")
    + ylab("Mean Arrival Delay")
    + ggtitle("Mean Arrival Delay by Year of Manufacture")
    + gsize
)
Similar to the plot of the departure delay by year of manufacture, the above plot shows the mean arrival delay for planes manufactured in different years. The spread of the delay is quite consistent, but there is a slightly lower delay for the planes manufactured in the earlier years.