top of page

Airplane Crashes and Fatalities Since 1908 - Multiple regression, Visualizations, Analysis.

# Packages used by this analysis.
library(ggplot2)
library(data.table)
library(tidyr)
library(tm)

# NOTE(review): this hard-coded working directory only exists on the original
# author's machine; adjust it (or give read.csv a full path) before running.
setwd("C:/Users/Ed/Desktop/datasets")

# Empty strings and the literal "NA" are both read as missing values.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 (where the
# default changed to FALSE), matching the factor-based structure this analysis
# describes.
ac <- read.csv(
  "aircrashes.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = TRUE
)

# Interactive inspection of the raw data and its column types.
View(ac)
str(ac)

If we look into the structure, we can see that most of the variables are factors and that there are no continuous variables except for 'Fatalities' and 'Aboard'. The factor variables also have a very large number of levels, which is not good for modelling. We'll therefore reduce the number of levels by grouping them into as few categories as possible.

Data Preparation

# =============================================================================
# Create a new column `sumnew` that collapses the free-text crash summaries
# into a small, fixed set of categories.

# Lower-case the summaries so keyword matching ignores case. A factor keeps
# its type (only its levels are rewritten); a character column is lower-cased
# directly, so this also works when Summary was not read in as a factor.
if (is.factor(ac$Summary)) {
  levels(ac$Summary) <- tolower(levels(ac$Summary))
} else {
  ac$Summary <- tolower(ac$Summary)
}

# Build one classification rule: a summary matches when it contains ALL of
# the given keywords (as plain substrings).
summary_rule <- function(label, ...) {
  list(keywords = c(...), label = label)
}

# Rules are tried top to bottom and the first match wins, so the more
# specific keyword combinations must stay above the single-keyword fallbacks.
# Labels are kept exactly as in the original analysis (including their
# spelling and capitalisation) because they become factor levels used later.
summary_rules <- list(
  summary_rule("struck by bird and crashed", "struck", "bird", "crash"),
  summary_rule("struck by obstacle and Crashed", "struck", "obstacle", "crash"),
  summary_rule("missing and crashed", "missing", "crash"),
  summary_rule("shot down and crashed", "shot", "crash"),
  summary_rule("struck by lightening and crashed", "struck", "lightning"),
  summary_rule("exploded and crashed", "explode", "crash"),
  summary_rule("crashed at landing", "crash", "land"),
  summary_rule("crashed due to bad weather", "weather", "crash"),
  summary_rule("collided and crashed", "coll", "crash"),
  summary_rule("crashed shortly after takeoff", "takeoff", "crash"),
  summary_rule("runway crash", "runway", "crash"),
  summary_rule("crashed due to heavy fog", "fog", "crash"),
  summary_rule("caught fire and crashed", "fire", "crash"),
  summary_rule("lost", "lost"),
  summary_rule("struck at obstacle and crashed", "struck"),
  summary_rule("engine failure", "engine"),
  summary_rule("shot down", "shot"),
  summary_rule("weather related", "weather"),
  summary_rule("collision", "coll"),
  summary_rule("crashed", "crash"),
  summary_rule("exploded", "explode"),
  summary_rule("caught fire", "fire"),
  summary_rule("Disappeared", "disappear")
)

# Assign each summary the label of the first rule whose keywords it all
# contains; summaries matching no rule (including missing ones, for which
# grepl() returns FALSE) get `default`.
classify_summary <- function(text, rules, default = "others") {
  text <- as.character(text)
  label <- rep(default, length(text))
  unassigned <- rep(TRUE, length(text))
  for (rule in rules) {
    hit <- unassigned
    for (keyword in rule$keywords) {
      hit <- hit & grepl(keyword, text, fixed = TRUE)
    }
    label[hit] <- rule$label
    unassigned <- unassigned & !hit
  }
  label
}

ac$sumnew <- classify_summary(ac$Summary, summary_rules)

# Store the crash category as a factor for plotting and modelling.
ac$sumnew <- as.factor(ac$sumnew)
View(ac)

# Levels have been brought down from the 4000s to 24.
levels(ac$sumnew)

# =============================================================================
# Split the flight route into origin and destination on the "-" separator.
ac1 <- separate(ac, col = "Route", into = c("From", "To"), sep = "-")

# Split the location into place and country on the "," separator.
ac2 <- separate(
  ac1,
  col = "Location",
  into = c("Place", "Country"),
  sep = ","
)

# Splitting on "," leaves the whitespace that followed the comma attached to
# the country (e.g. " Russia"); trim it so countries can be matched by their
# plain spelling.
ac2$Country <- as.factor(trimws(ac2$Country))

# The original write-up reported 599 country levels (before trimming).
levels(ac2$Country)

# =============================================================================
# Create a new column holding the survival rate (in percent) of each crash.
ac2$Fatalities <- as.numeric(ac2$Fatalities)
ac2$Aboard <- as.numeric(ac2$Aboard)

ac2$survivalrate <- ((ac2$Aboard - ac2$Fatalities) / ac2$Aboard) * 100

# When nobody is recorded aboard the division by zero yields NaN or -Inf;
# treat the survival rate as missing in that case instead.
ac2$survivalrate[!is.finite(ac2$survivalrate)] <- NA_real_

ac2$survivalrate <- round(ac2$survivalrate, 2)

# =============================================================================
# Derive a coarse aircraft type and operator from the first word of 'Type'
# and 'Operator'.

# The pattern matches the first run of letters plus everything after it and
# keeps only that run of letters (any text before the first letter is left
# in place).
first_word_pattern <- "([A-Za-z]+).*"

ac2$typeone <- gsub(first_word_pattern, "\\1", ac2$Type)
ac2$Operatormain <- gsub(first_word_pattern, "\\1", ac2$Operator)

ac2$Operatormain <- as.factor(ac2$Operatormain)
ac2$typeone <- as.factor(ac2$typeone)

# The original write-up reported 252 levels for typeone ...
levels(ac2$typeone)
# ... and 1420 levels for Operatormain.
levels(ac2$Operatormain)

# Date formatting

# Parse the month/day/year text into a proper Date.
ac2$Date <- as.Date(ac2$Date, "%m/%d/%Y")

# Year of the crash, stored as a factor. Going through as.numeric() first
# orders the factor levels numerically.
ac2$datenew <- format(ac2$Date, "%Y")
ac2$datenew <- as.numeric(ac2$datenew)
ac2$datenew <- as.factor(ac2$datenew)

View(ac2)
str(ac2)

# =============================================================================
# Survival rate category
#
# findInterval() returns how many cut-points are <= the value (0 when the
# value is below the first one), so adding 1 indexes straight into the label
# vector: < 25, < 50, < 75, and everything else.
ac2$surcat <- c(
  "Poor survival rate",
  "Below average survival rate",
  "Average survival rate",
  "Good survival rate"
)[findInterval(ac2$survivalrate, c(25, 50, 75)) + 1L]

# Plane size category, based on the number of people aboard:
# < 3, < 30, < 60, < 100, < 150, and everything else.
ac2$planesz <- c(
  "very small aircraft",
  "small aircraft",
  "medium sized aircraft",
  "big aircraft",
  "large aircraft",
  "huge aircraft"
)[findInterval(ac2$Aboard, c(3, 30, 60, 100, 150)) + 1L]

ac2$planesz <- as.factor(ac2$planesz)

# =============================================================================
# Dependent and independent variables
#
# Dependent variable:    Fatalities
# Independent variables: sumnew, planesz, datenew, typeone, Country
ac2$Fatalities <- as.integer(ac2$Fatalities)

# The data is now prepared for visualizations and modelling.

# =============================================================================
# Multiple linear regression
# =============================================================================

View(ac2)

# Model 1: categorical predictors only.
# na.exclude drops incomplete rows for fitting but pads fitted() and
# residuals() with NA so they stay aligned with the rows of ac2.
mlr1 <- lm(
  Fatalities ~ typeone + Country + planesz + sumnew + datenew,
  data = ac2,
  na.action = na.exclude
)

# Model 2: model 1 plus the operator and the number of people aboard.
mlr2 <- lm(
  Fatalities ~ typeone + Country + planesz + sumnew + datenew +
    Operatormain + Aboard,
  data = ac2,
  na.action = na.exclude
)

# Summary of the models
summary(mlr1)
summary(mlr2)

# Regression fitted plots. fitted + residuals reconstructs the observed
# Fatalities, so each plot shows observed values (x) against fitted values (y).
plot(fitted(mlr1) + residuals(mlr1), fitted(mlr1))

# NOTE(review): the y axis shows the fitted values, so the "Predictors" label
# is misleading; it is left unchanged here to keep the published figure as is.
plot(
  fitted(mlr2) + residuals(mlr2),
  fitted(mlr2),
  xlab = "Fatalities",
  ylab = "Predictors",
  col = "Red",
  main = "Multiple regression plot"
)

# Comparing the models, the second one seems to be more linear.
# However, in the bottom left corner many fatalities are crowded into one
# spot. The model we obtained is also not a perfect one.

# =============================================================================
# Prediction for a chosen set of independent-variable values.
#
# Every categorical value must match a level seen when fitting mlr2 exactly
# (including any surrounding whitespace in that level).
# NOTE(review): planesz = "small aircraft" does not agree with Aboard = 50,
# which the plane-size rule would label "medium sized aircraft"; the inputs
# are kept as published - confirm which one was intended.
preddata <- data.frame(
  typeone = "Tupolev",
  sumnew = "crashed",
  planesz = "small aircraft",
  Country = "Russia",
  datenew = "1973",
  Operatormain = "Aeroflot",
  Aboard = 50
)

# The original write-up reported a predicted value of 24.08.
predict(mlr2, preddata)

Visualizations

# =============================================================================
# Fatalities over the years: one bar per year, stacking the fatalities of
# every crash recorded in that year.
ggplot(ac2, aes(x = datenew, y = Fatalities)) +
  geom_col(fill = "purple") +
  xlab(" Year ") +
  ylab("Fatalities") +
  ggtitle(" Fatalities over the years") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

# =============================================================================
# Fatalities per plane size category, with each bar split by crash category.
ggplot(ac2, aes(x = planesz, y = Fatalities, fill = sumnew)) +
  geom_col() +
  xlab(" Sizes of planes ") +
  ylab("Fatalities") +
  ggtitle(" Fatalities per Size of Plane") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

# =============================================================================
# Crash counts for the 10 most frequent plane models, operators and countries.
#
# These charts count rows (one row per crash), so the axis and titles say
# "crashes" rather than "fatalities".

# Keep only the rows of `data` whose `column` value is one of `top`, and
# re-level that column in the reverse order of `top`.
keep_top_levels <- function(data, column, top) {
  kept <- data[data[[column]] %in% top, , drop = FALSE]
  kept[[column]] <- factor(kept[[column]], levels = rev(top))
  kept
}

# Bar chart of the number of rows per level of `column`.
plot_crash_counts <- function(data, column, x_label, title) {
  plot_data <- data.frame(value = data[[column]])
  ggplot(plot_data, aes(x = value)) +
    geom_bar(stat = "count", fill = "purple") +
    theme_classic() +
    xlab(x_label) +
    ylab("Number of crashes") +
    ggtitle(title) +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.text.x = element_text(angle = 90, hjust = 1)
    )
}

# ---- Crashes per model of plane ---------------------------------------------
tab <- table(ac2$Type)
# Sort ascending by frequency, so the 10 most frequent models are at the tail.
tab_s <- sort(tab)
top10t <- tail(names(tab_s), 10)
# Reversing the ascending order puts the most frequent model first.
ac2t <- keep_top_levels(ac2, "Type", top10t)

plot_crash_counts(
  ac2t, "Type",
  x_label = " Model of Plane ",
  title = "Crashes by Model of plane (Top 10 frequent)"
)

# ---- Crashes per operator ---------------------------------------------------
tab1 <- table(ac2$Operator)
tab_s1 <- sort(tab1)
top10p <- tail(names(tab_s1), 10)
ac2p <- keep_top_levels(ac2, "Operator", top10p)

plot_crash_counts(
  ac2p, "Operator",
  x_label = " Operators ",
  title = "Crashes by Operators (Top 10 frequent)"
)

# ---- Crashes per country ----------------------------------------------------
tab2 <- table(ac2$Country)
tab_s2 <- sort(tab2)
top10c <- tail(names(tab_s2), 10)
ac2c <- keep_top_levels(ac2, "Country", top10c)

plot_crash_counts(
  ac2c, "Country",
  x_label = " Country",
  title = "Crashes by Country (Top 10 frequent)"
)

# =============================================================================
# Survival rate characteristics: fatalities per survival-rate category, with
# each bar split by crash category.
ggplot(ac2, aes(x = surcat, y = Fatalities, fill = sumnew)) +
  geom_col() +
  xlab(" Survival Rates") +
  ylab("Fatalities") +
  ggtitle("Survival Rate Statistics") +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

# =============================================================================
# Type of crash statistics: total fatalities per crash category.
ggplot(ac2, aes(x = sumnew, y = Fatalities)) +
  geom_col(fill = "purple") +
  xlab(" Type of crashes") +
  ylab("Fatalities") +
  ggtitle("Type of crash statistics") +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

Conclusion

Number of people who died in the crashes: 105479

Number of people who survived: 39193

About 27% of all the people aboard survived the crashes.

Most of the inferences can be obtained from the visualizations.

Highlights:

  1. Aeroflot from Russia and the US Military Air Force have accounted for the largest number of crashes.

  2. By 2001, the number of crashes and fatalities had started to decrease.

  3. In most of the crashes, there is only a negligible number of survivors.

  4. Aircraft crashes saw a gradual rise from 1945 to 1973 and have been stable since then.

  5. Very few aircraft have encountered engine failures while in the air.

Note: The multiple regression is performed with mostly categorical variables, each of which has hundreds of levels. The levels have been grouped as far as possible. Any suggestions for improving the model are welcome. Feel free to comment below.


Featured Posts
Check back soon
Once posts are published, you’ll see them here.
Recent Posts
Archive
Search By Tags
No tags yet.
Follow Us
  • Facebook Basic Square
  • Twitter Basic Square
  • Google+ Basic Square
bottom of page