Answer To: Flight Data Analysis Project Flight Data Analysis Task You work for American Airlines as a Data...
Saravana answered on Jun 05 2021
---
title: "Flight_Data_Analysis"
date: "6/3/2021"
output: pdf_document
---
# Flight Data Analysis
The Flight data set pulled form DOT flight dataase contains various information regarding the flights on a particular day. Theflight details available from Aug 20 to Feb 21. The objective of the analysis is to gain insights in to questions regarding Airline performance from this data set.
## Data Preparation
The following data cleaning steps were performed to make the dataset suitable for the questions regarding the Airline performance.
+ The Data of flight information was available for individual months, so we binded them to form a complete dataset.
+ The only columns that are necessary to for analysis were selected.
+ The time zone information of Orgin and Destination Airports were added from airportData.csv file
+ The date and time of the Departure and Arrival for flights were added along with time zone information.
```{r Library, include = FALSE, message = FALSE, warning = FALSE, echo = FALSE}
# Load necessary Libraries
rm(list = ls())
library("dplyr") # Load dplyr package
library("plyr") # Load plyr package
library("readr") #
library("tidyverse")
library("ggplot2")
library("pipeR")
library("forcats")
library("data.table")
library("ggpubr")
library("magrittr")
library("captioner")
figs <- captioner(prefix="Figure")
tbls <- captioner(prefix="Table")
```
### Summary of Data
```{r, Data_Prep, include = FALSE, message = FALSE, warning = FALSE, echo = FALSE}
# Flight information of seperate months were binded together
data_all <- list.files(path = "/media/priyan/Files/GreyNodes/Assignment22/onTime",
pattern = "*.csv", full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows
# Smaller dataset with only the required columns is createdd
data_AA <- data_all %>%
select(FL_DATE, MKT_UNIQUE_CARRIER,BRANDED_CODE_SHARE, ORIGIN, ORIGIN_CITY_NAME, DEST, DEST_CITY_NAME, CRS_DEP_TIME,DEP_TIME, CRS_ARR_TIME, ARR_TIME, CANCELLED, DIVERTED, DISTANCE)
# rows with missing data are eliminated. This steps removes the cancelled flights row from the analysis
data_AA <- data_AA[complete.cases(data_AA), ]
# A smaller Dataset with Cancelled flights information
data_AA2 <- data_all %>%
select(FL_DATE, MKT_UNIQUE_CARRIER,BRANDED_CODE_SHARE, ORIGIN, ORIGIN_CITY_NAME, DEST, DEST_CITY_NAME, CRS_DEP_TIME,DEP_TIME, CRS_ARR_TIME, ARR_TIME, CANCELLED, DIVERTED, DISTANCE)
# The time zone information is added from "airportData.csv"
airport_data <- list.files(path = "/media/priyan/Files/GreyNodes/Assignment22/Data",
pattern = "airportData.csv",full.names = TRUE) %>%
read_csv %>%
dplyr:: select(airCode, tz)
# the time zone of the Origin and Destination airports are addedd to the data
colnames(airport_data)[1]<- "DEST"
data_AA<- data_AA %>% inner_join(airport_data,by="DEST")
colnames(data_AA)[15] <- "tz_Dest"
colnames(airport_data)[1]<- "ORIGIN"
data_AA <- data_AA %>% inner_join(airport_data,by="ORIGIN")
colnames(data_AA)[16] <- "tz_Origin"
# A new column with with only Month and year of flight is created from the flight date column: FL_DATE
data_AA$Month_Yr <- format(as.Date(data_AA$FL_DATE), "%Y-%m")
# Summary of the data
nrows_all <- nrow(data_all)
Maj_carriers <- unique(data_AA$MKT_UNIQUE_CARRIER)
M_Yr <- unique(data_AA$Month_Yr)
nrows_Carrier <- data_AA %>%
group_by(MKT_UNIQUE_CARRIER)%>%
dplyr :: summarize(Nflight_carrier = n())
```
The summary information regarding the flights and carriers anslysed in data:
+ Total number of flights analysed `r nrows_all`
+ Major Carriers `r Maj_carriers`
+ Months Analysed `r M_Yr`
+ The number of flights anlaysed for carriers: `r nrows_Carrier`
```{r time_zone, include = FALSE, message = FALSE, warning = FALSE, echo = FALSE}
# the date and time information are pastedd togther
data_AA <- data_AA%>%
mutate(CRS_DEP = paste(FL_DATE, paste(as.numeric(CRS_DEP_TIME) %/% 100, as.numeric(CRS_DEP_TIME) %% 100, sep = ":" ), sep = " "),
CRS_ARR = paste(FL_DATE, paste(as.numeric(CRS_ARR_TIME) %/% 100, as.numeric(CRS_ARR_TIME) %% 100, sep = ":" ), sep = " "),
ARR = paste(FL_DATE, paste(as.numeric(ARR_TIME) %/% 100, as.numeric(ARR_TIME) %% 100, sep = ":" ), sep = " "),
DEP = paste(FL_DATE, paste(as.numeric(DEP_TIME) %/% 100, as.numeric(DEP_TIME) %% 100, sep = ":" ), sep = " ")
)
# the time zone informatin are added rowwise
data_AA <- as.data.table(data_AA)
data_AA[, CRS_DEP_tz2 := as.POSIXct(CRS_DEP, tz = tz_Origin[1L]), by = tz_Origin]
data_AA[, CRS_ARR_tz2 := as.POSIXct(CRS_ARR, tz = tz_Dest[1L]), by = tz_Dest]
data_AA[, DEP_tz2 := as.POSIXct(DEP, tz = tz_Origin[1L]), by = tz_Origin]
data_AA[, ARR_tz2 := as.POSIXct(ARR, tz = tz_Dest[1L]), by = tz_Dest]
# the rowise operation provides the numerical values of date and time which are then converted to
# proper format
data_AA$CRS_DEP_tz2 <- as.POSIXct(data_AA$CRS_DEP_tz2, origin = "1970-01-01")
data_AA$CRS_ARR_tz2 <- as.POSIXct(data_AA$CRS_ARR_tz2, origin = "1970-01-01")
data_AA$DEP_tz2 <- as.POSIXct(data_AA$DEP_tz2, origin = "1970-01-01")
data_AA$ARR_tz2 <- as.POSIXct(data_AA$ARR_tz2, origin = "1970-01-01")
```
```{r time_diff, include = FALSE, message = FALSE, warning = FALSE, echo = FALSE}
# The time difference between scheduled departure and actual departure
# The time difference between scheduled arrival and actual arrival time are computed
# the difference is expressed in Minutes
# A new column 'Late_DEP' is added with 1 for late departure and 0 for schedduled or early departure.
# A new column 'Late_ARR' is added with 1 for late arrival and 0 for schedduled or early arrivals
# A new column 'Late_DEP_ARR_ONTIME' is added with 1 for late departure and early arrival flights and 0 otherwise
data_AA <- data_AA%>%
mutate(dep_late = difftime(CRS_DEP_tz2, DEP_tz2, units = c("mins")),
Arr_late = difftime(CRS_ARR_tz2, ARR_tz2, units = c("mins")))%>%
mutate(Late_DEP = case_when(dep_late < 0 ~ 1,
TRUE ~ 0),
Late_ARR = case_when(Arr_late < 0 ~ 1,
TRUE ~ 0))%>%
mutate(Late_DEP_ARR_ONTIME = case_when(Late_DEP == 1 & Late_ARR == 0 ~ 1,
TRUE ~ 0) )
```
## On-time Analysis
### What is our overall on-time arrival rate?
The overall on-time arrival rate for AA flights is computed. The overall total number of Ontime 'AA' flights is divided by the overall total number of 'AA' flights to obtain the overall on-time arrival rate.
```{r late_arr_rate, message = FALSE, warning = FALSE, echo=FALSE, include = TRUE, echo = FALSE}
# the difference between total number of flights and late flights is computed to obtian the number of ontime arrival flights.
# The ONtime arrival rate is computed by dividing the nflights_ONtime by total_nflghts.
LAR <- data_AA%>%
filter(MKT_UNIQUE_CARRIER == 'AA')%>% # only AA flights analysedd
dplyr :: summarize(total_nflghts = n(), # total number of flights
nflight_late = sum(Late_ARR), # Total late arrivals
nflights_ONtime = total_nflghts - nflight_late, # total ontime arrivals
Ontime_Arrival_Rate = nflights_ONtime / total_nflghts, # ontime arrival rate
Late_Arrival_Rate = nflight_late / total_nflghts) # late arrival rate
tbls(name="late_arr_rate","Ontime and Late arrival performance between Aug20 and Feb21")
knitr::kable(LAR,
caption = "Ontime and Late arrival performance between Aug20 and Feb21",
digits = 2)
```
The Ontime arrival rate of AA flights from the table `r tbls("late_arr_rate",display="cite")` is `r LAR$Ontime_Arrival_Rate`.
The on-time arrival rate of our flights is 73.57%.
### How has it changed between Aug20 and Feb21?
The Ontime arrival rate calculated above is for the entire period from Aug20 and Feb21. So, we now assess the monthly Ontime arrival rate for our flights.
The overall aggregate data is split month wise and the monthly Ontime and Late arrival rates were calculated.
```{r ,flt_month, fig.cap = "Ontime arrival rate of AA flights between Aug20 and Feb21", message = FALSE, warning = FALSE,include = TRUE, echo = FALSE}
flt_month<- data_AA%>%
filter(MKT_UNIQUE_CARRIER == 'AA')%>% # Only 'AA' flights analysed
group_by(Month_Yr) %>% # dataset is analysed Monthwise
dplyr :: summarize(total_nflghts = n(), # total number of flights
nflight_late = sum(Late_ARR), # total number of late flights
nflights_ONtime = total_nflghts - nflight_late, # Total ontime flights
Ontime_Arrival_Rate = nflights_ONtime / total_nflghts, # Ontime arrival rate
Late_Arrival_Rate = nflight_late / total_nflghts) # Late arrival rate
tbls(name="flt_month","Ontime arrival rate of AA flights between Aug20 and Feb21")
knitr::kable(flt_month,
caption = "Ontime and Late arrival performance between Aug20 and Feb21",
digits = 2)
```
```{r ,flt_month2, fig.cap = "Ontime arrival rate of AA flights between Aug20 and Feb21", message = FALSE, warning = FALSE,include = TRUE, echo = FALSE, fig.height=5, fig.width=7}
figs(name="flt_month2","Ontime arrival rate of AA flights between Aug20 and Feb21")
# the Ontime arrival rate between Aug20 and Feb21 is plotted
# A line plot is plotted to visualise the change over time
p1<- ggplot(data=flt_month, aes(x=Month_Yr, y=Ontime_Arrival_Rate, group= 1)) +
geom_line(color="red")+
geom_point()+
ylim(0.65, 0.80)+
ggtitle("Ontime arrival rate of AA flights from 08/2020 to 02/2021")+
theme_bw()
p1
```
From the table `r tbls("flt_month",display="cite")` and plot `r figs("flt_month2",display="cite")`, we can observe that from Oct-2020 to Feb-2021, our Ontime arrival rates has fluctuated between 70-75%. This is a substantial decrease from 77% ontime arrival rate during Aug2020.Our ontime arrival rate was 71% in Feb 2021.
### Report overall rate for AA-operated flights and a breakout by contractors.
The overall ontime arrival rate for AA and AA codeshare contractors were assessed.
```{r ,flt_cntctrs, fig.cap = "Ontime arrival rate of AA flights and contractors", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE}
LR_contractors<- data_AA%>%
filter(MKT_UNIQUE_CARRIER == 'AA')%>% # Only AA flights analysed
group_by(BRANDED_CODE_SHARE) %>% ## data is split between AA and AA codeshare
dplyr :: summarize(total_nflghts = n(), # total number of flights
nflight_late = sum(Late_ARR), # total number of late flights
nflights_ONtime = total_nflghts - nflight_late, # Total ontime flights
Ontime_Arrival_Rate = nflights_ONtime / total_nflghts, # Ontime arrival rate
Late_Arrival_Rate = nflight_late / total_nflghts) # late arrival rate
tbls(name="flt_cntctrs","Ontime arrival rate of AA flights and contractors")
knitr::kable(LR_contractors,
caption = "Ontime and Late arrival performance of AA flights and Contractors",
digits = 2)
```
```{r ,flt_cntctrs2, fig.cap = "Ontime arrival rate of AA flights and contractors", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE, fig.height=5, fig.width=7}
figs(name="flt_cntctrs2","Ontime arrival rate of AA flights and contractors")
# Horizontal bar plot is created to visualise the difference
p2<- ggplot(data=LR_contractors, aes(x=BRANDED_CODE_SHARE, y=Ontime_Arrival_Rate)) +
geom_bar(stat="identity")+
ylim(0.0, 0.8)+
ggtitle("On time arrival rate of AA flights and contractors")+
theme_bw() + coord_flip()
p2
```
From the table `r tbls("flt_cntctrs",display="cite")` and plot `r figs("flt_cntctrs2",display="cite")`, we can observe that our codeshare partner's Ontime arrival rate `r LR_contractors$Ontime_Arrival_Rate[2]` is lees than our AA flights ontime arrival rate of `r LR_contractors$Ontime_Arrival_Rate[1]`.
### How do our rates compare to the other major carriers?
The ontime arrival rates of major carriers was assessed. The data was split across major carriers and their ontime arrival rates were calculated
```{r, Maj_carriers, fig.cap = "Ontime arrival rate of Major Carriers", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE}
LR_MjrCarriers<- data_AA%>%
group_by(MKT_UNIQUE_CARRIER) %>% # split across major carriers
dplyr :: summarize(total_nflghts = n(), # total number of flights
nflight_late = sum(Late_ARR), # total number of Late flights
nflights_ONtime = total_nflghts - nflight_late, # Total ontime flights
Ontime_Arrival_Rate = nflights_ONtime / total_nflghts, # Ontime arrival rate
Late_Arrival_Rate = nflight_late / total_nflghts) # late arrival rate
# the data frame is ordered in the descending order of Ontime_Arrival_Rates
LR_MjrCarriers<-LR_MjrCarriers[order(LR_MjrCarriers$Ontime_Arrival_Rate),]
tbls(name="Maj_carriers","Ontime arrival rate of Major Carriers")
knitr::kable(LR_MjrCarriers,
caption = "Ontime arrival rate of Major Carriers",
digits = 2)
```
```{r, Maj_carriers2, fig.cap = "Ontime arrival rate of Major Carriers", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE}
figs(name="Maj_carriers2","Ontime arrival rate of Major Carriers")
# Sorted horizontal bar plot is created to compare the differences
area.color <- c("steelblue", "steelblue", "green", "steelblue", "steelblue", "steelblue", "steelblue", "steelblue", "steelblue", "steelblue") # AA ontime rates are highlighted
p3 <- LR_MjrCarriers %>%
mutate(MKT_UNIQUE_CARRIER = fct_reorder(MKT_UNIQUE_CARRIER, desc(Ontime_Arrival_Rate))) %>% # reorder factor levels in decreasing order
ggplot(aes(x=MKT_UNIQUE_CARRIER, y=Ontime_Arrival_Rate, fill = c)) +
geom_bar(stat="identity", fill=area.color) +
theme_minimal() +
ggtitle("Ontime arrival rate of Major Carriers")+
coord_flip()
p3
```
From the table `r tbls("Maj_carriers",display="cite")` and plot `r figs("Maj_carriers2",display="cite")`, we can observe that our Ontime arrival rate `r LR_MjrCarriers$Ontime_Arrival_Rate[3]` is the third lowest among the major carriers.
### What proportion of flights depart late but arrive on time?
We assesed the proportion of flights that depart late but arrive on time. We computed both the month wise and overall proportion of flights that depart late but arrive on time. By computing both the monthwise and overall proportion we can visualise whether our monthly performance is over or below the average.
```{r, LD_AT_calc, fig.cap = "Late departure but Ontime arrival rate of AA flights", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE}
LD_AT<- data_AA%>%
filter(MKT_UNIQUE_CARRIER == 'AA')%>% ## extract only AA flights
group_by(Month_Yr) %>% # monthwise breakout calculates
dplyr :: summarize(total_nflghts = n(), # total flights
nflights_LD_AT = sum(Late_DEP_ARR_ONTIME), # total flights depart late and arrive ontime
Late_DEP_ARR_OT_Rate = nflights_LD_AT / total_nflghts) # proportion
LD_AT2<- data_AA%>%
filter(MKT_UNIQUE_CARRIER == 'AA')%>%
group_by(MKT_UNIQUE_CARRIER) %>%
dplyr :: summarize(total_nflghts = n(),
nflights_LD_AT = sum(Late_DEP_ARR_ONTIME),
Late_DEP_ARR_OT_Rate = nflights_LD_AT / total_nflghts) # overall proportion
colnames(LD_AT2)[1]<- "Month_Yr"
LD_AT <- rbind(LD_AT, LD_AT2) # combine both results
tbls(name="LD_AT_calc","Late departure but Ontime arrival rate of AA flights")
knitr::kable(LD_AT,
caption = "Late departure but Ontime arrival rate of AA flights",
digits = 3)
```
```{r, LD_AT_calc2, fig.cap = "Late departure but Ontime arrival rate of AA flights", message = FALSE, warning = FALSE, include = TRUE, echo = FALSE, fig.height=5, fig.width=7}
figs(name="LD_AT_calc2","Late departure but Ontime arrival rate of AA flights")
# Line plot to visualise changes over month and compare with Overall proportion
highlight_df <- LD_AT %>%
filter(Month_Yr == "AA") # overall proportion is highlighted in plot
p4<- ggplot(data=LD_AT, aes(x=Month_Yr, y=Late_DEP_ARR_OT_Rate, group= 1)) +
geom_line(color="red")+
geom_point()+
geom_point(data=highlight_df, aes(x=Month_Yr,y=Late_DEP_ARR_OT_Rate),
color='blue', size=5, shape=17)+
ylim(0.025, 0.0725)+
ggtitle("Late departure but arrival Ontime of AA flights from 08/2020 to 02/2021")+
theme_bw()+
theme(axis.text.x = element_text(colour=c("black", "black", "black", "black", "black", "black", "black", "blue")),
axis.text.y = element_text(colour="black"))
p4
```
From the table `r tbls("LD_AT_calc",display="cite")` and plot `r figs("LD_AT_calc2",display="cite")`, we can observe that proportion of flights that depart late but arrive on time has improved from the timeperiod of 12/ 2020 to 02/2021 compared to earlier timeperiod of...