Data Preparation 1 - Manipulation, imputation of missing values using Mean and Median
In this exercise, we'll see how to manipulate the data in a CSV file for data analysis. The file can be downloaded from here. Let's see the current state of the CSV file.
#=========================================================================================
# Data Manipulation
#=========================================================================================
# Reads arabiapop.csv, splits the ";"-joined columns into separate variables,
# merges the split pieces back into one indicator-name column and one numeric
# indicator-value column, and leaves the result in `ap3`.

# install.packages("tidyr")
library(tidyr) # separate()
library(tm)    # removeWords()

# NOTE(review): machine-specific absolute path -- point this at the folder
# that holds arabiapop.csv on your machine.
setwd("C:/Users/Ed/Desktop/datasets")
ap <- read.csv("arabiapop.csv")
# View(ap)

# Compared the data viewed in R with the original data in Excel and found that
# two column names were missing, so names were allotted to those columns in
# the Excel file (presumably the `extra` and `extra2` columns used below).

# Step by step, separate each column whose values hold several variables
# attached together with ";".
ap1 <- separate(
  ap,
  col = "Year.Population.Indicator.Indicator.Value",
  into = c("Year", "Population Indicator", "Indicator.Value"),
  sep = ";"
)
ap2 <- separate(
  ap1,
  col = "extra",
  into = c("Population Indicator2", "Indicator.Value2"),
  sep = ";"
)
ap3 <- separate(
  ap2,
  col = "extra2",
  into = c("Population Indicator3", "Indicator.Value3"),
  sep = ";"
)
# View(ap3)

# Merge the three indicator-name pieces into one column, then drop the
# now-redundant pieces.
ap3$`Population Indicator` <- paste(
  ap3$`Population Indicator`,
  ap3$`Population Indicator2`,
  ap3$`Population Indicator3`
)
ap3$`Population Indicator2` <- NULL
ap3$`Population Indicator3` <- NULL
# View(ap3)

# Do the same for the three indicator-value pieces.
ap3$Indicator.Value <- paste(
  ap3$Indicator.Value,
  ap3$Indicator.Value2,
  ap3$Indicator.Value3
)
ap3$Indicator.Value2 <- NULL
ap3$Indicator.Value3 <- NULL
# View(ap3)

# paste() writes missing pieces as the literal text "NA". Strip that token
# with tm::removeWords() and then convert what is left to numeric.
ap3$Indicator.Value <- as.numeric(removeWords(ap3$Indicator.Value, "NA"))

# View() needs an interactive session; skip it when run as a batch script.
if (interactive()) {
  View(ap3)
}
#=========================================================================================
# Creating columns of group mean and group median
#=========================================================================================
# Builds `ap4`: every row of `ap3` plus the mean and median of Indicator.Value
# (NAs ignored) for that row's `Population Indicator` group.
library(plyr)
# head(ap3)

# ddply() splits ap3 by `Population Indicator` and applies transform() to each
# piece, so group.mean / group.median are computed per group.
ap4 <- ddply(
  ap3,
  "`Population Indicator`",
  transform,
  group.mean = mean(Indicator.Value, na.rm = TRUE),
  group.median = median(Indicator.Value, na.rm = TRUE)
)
ap4 <- as.data.frame(ap4) # keep the result as a plain data frame

# View() needs an interactive session; skip it when run as a batch script.
if (interactive()) {
  View(ap4)
}

# Drop the duplicate indicator-name column.
ap4$`Population Indicator` <- NULL

if (interactive()) {
  View(ap4)
}
#=========================================================================================
# Imputing with group mean and group median
#=========================================================================================
# Builds `ap_mean` and `ap_median`: copies of `ap3` in which the missing
# Indicator.Value entries of each `Population Indicator` group are replaced by
# that group's mean (or median), stored in a new `Value` column.

# Replace the NAs in `x` with `fun()` of its non-missing values.
#
# x:   vector that may contain NAs.
# fun: summary function applied to the non-missing values, e.g. mean or median.
# Returns `x` with every NA replaced by that summary (a plain vector).
#
# Base-R stand-in for impute(), which no package loaded in this script
# provides. It is defined at top level on purpose: the transform() call below
# is evaluated from inside plyr, so it can only see helpers that are reachable
# from the global environment.
fill_missing <- function(x, fun) {
  is_missing <- is.na(x)
  x[is_missing] <- fun(x[!is_missing])
  x
}

# --- group mean ---
ap_mean <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = fill_missing(Indicator.Value, mean)
)
ap_mean$`Population Indicator` <- NULL # drop the duplicate name column
ap_mean$Indicator.Value <- NULL        # superseded by the imputed Value

# View() needs an interactive session; skip it when run as a batch script.
if (interactive()) {
  View(ap_mean)
}

#=========================================================================================
# Imputing with group median
#=========================================================================================
ap_median <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = fill_missing(Indicator.Value, median)
)
ap_median$`Population Indicator` <- NULL # drop the duplicate name column
ap_median$Indicator.Value <- NULL        # superseded by the imputed Value

if (interactive()) {
  View(ap_median)
}
Sample Visualization
# Bar chart of the mean-imputed values for one indicator, by year.
library(ggplot2)

# The aesthetics use bare column names so they are looked up in the subset
# being plotted. Passing ap_mean$Year / ap_mean$Value would supply full-length
# vectors that do not match the subset's row count.
# NOTE(review): `Population.Indicator` is presumably the syntactic column name
# left in ap_mean after the transform() step -- confirm with names(ap_mean).
ggplot(
  subset(ap_mean, Population.Indicator == "Inflation GDP deflator (annual %)"),
  aes(x = Year, y = Value)
) +
  geom_bar(stat = "identity", col = "black", fill = "Purple") +
  theme_bw() +
  ggtitle("Inflation GDP deflator (annual %)") +
  xlab("Year -->") +
  ylab("Indicator Value") +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title
In the next post, I will be performing imputation with the help of linear regression. Thank You.