top of page

Data Preparation 1 - Manipulation, imputation of missing values using Mean and Median

In this exercise, we'll see how to manipulate the data in a CSV file to prepare it for analysis. The file can be downloaded from here. Let's look at the current state of the CSV file.

#=========================================================================================
# Data Manipulation
#=========================================================================================

# Folder that holds the downloaded dataset. The full path is handed to
# read.csv() directly instead of calling setwd(), so running this script does
# not change the working directory of the R session.
data_dir <- "C:/Users/Ed/Desktop/datasets"

# Raw population-indicator table, exactly as read.csv() parses the file.
ap <- read.csv(file.path(data_dir, "arabiapop.csv"))
# View(ap)

# Comparing the data as shown in R with the original spreadsheet revealed two
# columns without names, so names were assigned to them in the Excel file.
# (The `extra` and `extra2` columns used below are presumably those two;
# confirm against the file.)
#
# Several variables are packed into single columns, joined by ";". They are
# unpacked one column at a time.

# install.packages("tidyr")
library(tidyr)
library(tm)

# First packed column: year, indicator name and indicator value.
ap1 <- separate(
  data = ap,
  col = "Year.Population.Indicator.Indicator.Value",
  into = c("Year", "Population Indicator", "Indicator.Value"),
  sep = ";"
)

# Second packed column: a further indicator-name / value pair.
ap2 <- separate(
  data = ap1,
  col = "extra",
  into = c("Population Indicator2", "Indicator.Value2"),
  sep = ";"
)

# Third packed column: one more indicator-name / value pair.
ap3 <- separate(
  data = ap2,
  col = "extra2",
  into = c("Population Indicator3", "Indicator.Value3"),
  sep = ";"
)
# View(ap3)

# Collapse the three indicator-name pieces into the main name column, joined
# by single spaces, then discard the two helper columns.
ap3[["Population Indicator"]] <- paste(
  ap3[["Population Indicator"]],
  ap3[["Population Indicator2"]],
  ap3[["Population Indicator3"]],
  sep = " "
)
ap3[c("Population Indicator2", "Population Indicator3")] <- NULL
# View(ap3)

# Same treatment for the three value pieces. Note that paste() writes a
# missing piece out as the text "NA".
ap3[["Indicator.Value"]] <- paste(
  ap3[["Indicator.Value"]],
  ap3[["Indicator.Value2"]],
  ap3[["Indicator.Value3"]],
  sep = " "
)
ap3[c("Indicator.Value2", "Indicator.Value3")] <- NULL
# View(ap3)

# Merging the value columns with paste() left the literal text "NA" inside
# Indicator.Value wherever a piece was missing. Remove those tokens before
# converting the column to numeric.
#
# A base-R gsub() is used instead of tm::removeWords(): it applies the same
# whole-word pattern without needing a text-mining package for a single
# substitution, and it no longer creates a global named `stopwords` that
# shadows tm's function of the same name.
na_token_pattern <- "(*UCP)\\b(NA)\\b"
ap3$Indicator.Value <- gsub(
  na_token_pattern,
  "",
  ap3$Indicator.Value,
  perl = TRUE
)

# Entries with no number left in them convert to a real NA here.
ap3$Indicator.Value <- as.numeric(ap3$Indicator.Value)
View(ap3)

#=========================================================================================
# Creating columns of group mean and group median
#=========================================================================================

library(plyr)
# head(ap3)

# For each indicator group, attach that group's mean and median of
# Indicator.Value (missing values ignored) to every row of the group.
# The ddply() result is wrapped in as.data.frame() to keep a plain data frame.
ap4 <- as.data.frame(
  ddply(
    .data = ap3,
    .variables = "`Population Indicator`",
    .fun = transform,
    group.mean = mean(Indicator.Value, na.rm = TRUE),
    group.median = median(Indicator.Value, na.rm = TRUE)
  )
)
View(ap4)

# Drop the duplicate indicator column.
ap4[["Population Indicator"]] <- NULL
View(ap4)

#=========================================================================================
# Imputing with group mean
#=========================================================================================

# Within each indicator group, fill the missing Indicator.Value entries with
# the mean of that group's observed values and store the result as `Value`.
#
# The fill is written with base R's replace(): no package loaded by this
# script (tidyr, tm, plyr) provides an impute() function, so the earlier
# impute(Indicator.Value, mean) call could not be resolved.
ap_mean <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = replace(
    Indicator.Value,
    is.na(Indicator.Value),
    mean(Indicator.Value, na.rm = TRUE)
  )
)

# Drop the duplicate indicator column and the un-imputed values.
ap_mean$`Population Indicator` <- NULL
ap_mean$Indicator.Value <- NULL
View(ap_mean)

#=========================================================================================
# Imputing with group median
#=========================================================================================

# Within each indicator group, fill the missing Indicator.Value entries with
# the median of that group's observed values and store the result as `Value`.
#
# The fill is written with base R's replace(): no package loaded by this
# script (tidyr, tm, plyr) provides an impute() function, so the earlier
# impute(Indicator.Value, median) call could not be resolved.
ap_median <- ddply(
  ap3,
  ~ `Population Indicator`,
  transform,
  Value = replace(
    Indicator.Value,
    is.na(Indicator.Value),
    median(Indicator.Value, na.rm = TRUE)
  )
)

# Drop the duplicate indicator column and the un-imputed values.
ap_median$`Population Indicator` <- NULL
ap_median$Indicator.Value <- NULL
View(ap_median)

Sample Visualization

# Bar chart of the mean-imputed values for one indicator, by year.
library(ggplot2)

# Filter the rows first so the plot data and the aesthetics always have the
# same length. Mapping ap_mean$Year / ap_mean$Value inside aes() while handing
# only a subset to the layer gives ggplot full-length vectors for fewer rows.
# NOTE(review): `Population.Indicator` is presumably the syntactic name that
# transform() gives the indicator column; confirm with names(ap_mean).
inflation <- subset(
  ap_mean,
  Population.Indicator == "Inflation GDP deflator (annual %)"
)

ggplot(inflation, aes(x = Year, y = Value)) +
  geom_bar(stat = "identity", col = "black", fill = "Purple") +
  theme_bw() +
  ggtitle("Inflation GDP deflator (annual %)") +
  xlab("Year -->") +
  ylab("Indicator Value") +
  theme(plot.title = element_text(hjust = 0.5))

In the next post, I will be performing imputation with the help of linear regression. Thank You.


Featured Posts
Check back soon
Once posts are published, you’ll see them here.
Recent Posts
Archive
Search By Tags
No tags yet.
Follow Us
  • Facebook Basic Square
  • Twitter Basic Square
  • Google+ Basic Square
bottom of page