Data Preparation 1 - Manipulation, imputation of missing values using Mean and Median

Edwin Varghese
May 30, 2017
2 min read

In this exercise, we'll see how to manipulate the data in a CSV file to prepare it for analysis. The file can be downloaded from here. Let's look at the current state of the CSV file.

#=========================================================================================
# Data Manipulation
#=========================================================================================

# Folder that holds the downloaded CSV file. The full path is built with
# file.path() so the file can be read without calling setwd(); the working
# directory of the R session is left untouched.
data_dir <- "C:/Users/Ed/Desktop/datasets"

# Raw data, exactly as read.csv() parses it.
ap <- read.csv(file.path(data_dir, "arabiapop.csv"))
#View(ap)

# Comparing the data as viewed in R with the original data in Excel showed that two
# column names were missing, so names were assigned to those columns in the Excel file.

# Some columns hold several variables glued together with ";".
# Split them apart one column at a time.

#install.packages("tidyr")
library(tidyr)
library(tm)

# First column -> Year, Population Indicator, Indicator.Value
ap1 <- separate(
  ap,
  col = "Year.Population.Indicator.Indicator.Value",
  into = c("Year", "Population Indicator", "Indicator.Value"),
  sep = ";"
)

# Column "extra" -> second piece of the indicator name and of the value
ap2 <- separate(
  ap1,
  col = "extra",
  into = c("Population Indicator2", "Indicator.Value2"),
  sep = ";"
)

# Column "extra2" -> third piece of the indicator name and of the value
ap3 <- separate(
  ap2,
  col = "extra2",
  into = c("Population Indicator3", "Indicator.Value3"),
  sep = ";"
)
#View(ap3)

# Join the three indicator-name pieces back into one column (paste() puts a
# single space between the pieces), then drop the two helper columns.
ap3[["Population Indicator"]] <- paste(
  ap3[["Population Indicator"]],
  ap3[["Population Indicator2"]],
  ap3[["Population Indicator3"]]
)
ap3[["Population Indicator2"]] <- NULL
ap3[["Population Indicator3"]] <- NULL

#View(ap3)

# Do the same for the three value pieces.
ap3[["Indicator.Value"]] <- paste(
  ap3[["Indicator.Value"]],
  ap3[["Indicator.Value2"]],
  ap3[["Indicator.Value3"]]
)
ap3[["Indicator.Value2"]] <- NULL
ap3[["Indicator.Value3"]] <- NULL

#View(ap3)

# Merging the pieces with paste() wrote the text "NA" into Indicator.Value
# wherever a piece was missing. Delete every standalone "NA" token; the \\b word
# boundaries keep the match from touching anything that merely contains "NA".
# A single base-R regex is enough here, so no text-mining package is needed.
ap3$Indicator.Value <- gsub("\\bNA\\b", "", ap3$Indicator.Value, perl = TRUE)

# Convert the cleaned text to numbers. Cells that contained nothing but "NA"
# tokens are now blank and therefore become real NA values.
ap3$Indicator.Value <- as.numeric(ap3$Indicator.Value)
View(ap3)

#=========================================================================================
# Creating columns of group mean and group median
#=========================================================================================

library(plyr)
#head(ap3)

# Split ap3 by indicator and let transform() append, to every row, the mean and
# the median of Indicator.Value within that row's group (NAs are ignored).
ap4 <- ddply(
  ap3,
  "`Population Indicator`",
  transform,
  group.mean = mean(Indicator.Value, na.rm = TRUE),
  group.median = median(Indicator.Value, na.rm = TRUE)
)

# Make sure the ddply result is held as a plain data frame.
ap4 <- as.data.frame(ap4)
View(ap4)

# Drop the duplicate indicator column.
ap4$`Population Indicator` <- NULL
View(ap4)

#=========================================================================================
# Imputing with group mean
#=========================================================================================

# Within each indicator group, fill the missing Indicator.Value entries with the
# mean of that group's observed values and store the result in a new column,
# Value. Base R's replace() is used so this step needs no package beyond plyr,
# and Value is an ordinary numeric column.
ap_mean <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = replace(
    Indicator.Value,
    is.na(Indicator.Value),
    mean(Indicator.Value, na.rm = TRUE)
  )
)

# Drop the duplicate indicator column and the un-imputed values; Value replaces
# Indicator.Value from here on.
ap_mean$`Population Indicator` <- NULL
ap_mean$Indicator.Value <- NULL
View(ap_mean)

#=========================================================================================
# Imputing with group median
#=========================================================================================

# Within each indicator group, fill the missing Indicator.Value entries with the
# median of that group's observed values and store the result in a new column,
# Value. Base R's replace() is used so this step needs no package beyond plyr,
# and Value is an ordinary numeric column.
ap_median <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = replace(
    Indicator.Value,
    is.na(Indicator.Value),
    median(Indicator.Value, na.rm = TRUE)
  )
)

# Drop the duplicate indicator column and the un-imputed values; Value replaces
# Indicator.Value from here on.
ap_median$`Population Indicator` <- NULL
ap_median$Indicator.Value <- NULL
View(ap_median)

Sample Visualization

library(ggplot2)

# Bar chart of the mean-imputed values for a single indicator, one bar per year.
# The aesthetics are written as bare column names so that ggplot looks them up
# in whichever data frame a layer draws: the bar layer draws only the subset,
# and vectors such as ap_mean$Year would keep the length of the full table and
# not line up with it.
ggplot(ap_mean, aes(x = Year, y = Value)) +
  geom_bar(
    data = subset(
      ap_mean,
      Population.Indicator == "Inflation GDP deflator (annual %)"
    ),
    stat = "identity",  # bar height is the Value itself, not a row count
    colour = "black",
    fill = "Purple"
  ) +
  theme_bw() +
  ggtitle("Inflation GDP deflator (annual %)") +
  xlab("Year -->") +
  ylab("Indicator Value") +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title