top of page

Facebook 'Post' Analysis

In this analysis, we'll try to analyse the Facebook posts of two world leaders, Mr. Narendra Modi and Mr. Donald Trump. We are going to use the Rfacebook package to connect Facebook to RStudio. For that we need to generate an app id and an app secret token. The following steps will guide you through connecting and obtaining details of the posts.




Connecting Facebook and RStudio

#=========================================================================================
# Connecting Facebook and RStudio
#=========================================================================================

# Install once from the console, not on every script run:
# install.packages("Rfacebook")
library(Rfacebook)
library(tm)
library(wordcloud)
library(ggplot2)

# Authenticate against the Facebook Graph API with the app id / app secret.
# NOTE(review): never commit a real app secret to source control — read it
# from an environment variable (Sys.getenv) instead of hard-coding it.
myfb <- fbOAuth(app_id = "1889656487983100",
                app_secret = "XXXXXXXXXXXXXXXXXXXXXXXXXX")

# Cache the OAuth token so later sessions can reload it without
# repeating the browser handshake.
save(myfb, file = "myfb")
load("myfb")

#=========================================================================================

#Mining the data from the facebook page

Getting the Facebook Page Id from www.findmyfbid.com

Narendra Modi

#=========================================================================================
# Posts of Mr. Narendra Modi
#=========================================================================================

# Fetch up to 3000 posts from the page's feed.
# The page id is passed as a string: very large ids stored as numeric
# doubles can silently lose precision.
getpagedataNM <- getPage("177526890164", token = myfb, n = 3000)
View(getpagedataNM)

'Post' Type Distribution

#=========================================================================================
# Categorizing likes into groups and seeing the distribution of each post type
#=========================================================================================

getpagedataNM$likes_count <- as.numeric(getpagedataNM$likes_count)

# Bucket the like counts into labelled ranges.
# include.lowest = TRUE keeps posts with exactly 0 likes — without it,
# cut() excludes the lower bound of the first interval and those posts
# become NA. Posts above the top break (4m) still become NA by design.
like_breaks <- c(0, 30000, 60000, 100000, 140000, 200000, 300000, 400000,
                 500000, 600000, 750000, 900000, 1000000, 1500000, 1700000,
                 2000000, 4000000)
like_labels <- c("up to 30k", "up to 60k", "up to 100k", "up to 140k",
                 "up to 200k", "up to 300k", "up to 400k", "up to 500k",
                 "up to 600k", "up to 750k", "up to 900k", "up to 1m",
                 "up to 1.5m", "up to 1.7m", "up to 2m", "up to 4m")
getpagedataNM$likes_group <- cut(getpagedataNM$likes_count,
                                 breaks = like_breaks,
                                 labels = like_labels,
                                 include.lowest = TRUE)

# Bar chart: number of posts per likes bucket, coloured by post type.
# Columns are referenced by bare name inside aes(); using df$col inside
# aes() bypasses the data argument and is a well-known ggplot2 pitfall.
ggplot(getpagedataNM, aes(x = likes_group, fill = type)) +
  geom_bar(stat = "count") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  ggtitle("'Post' type Distribution") +
  xlab("Likes group") +
  ylab("Counts")

Top Posts Analysis

#====================================================================================
# Viewing the top posts and plotting them
#====================================================================================
View(getpagedataNM)

# Keep only the posts in the highest like buckets. Select the group column
# by name rather than by position ([, 12]) so the code survives column
# reordering, and use %in% so NA group values are treated as "not selected"
# (the original ==/| comparison propagates NA and keeps junk rows).
NM1 <- getpagedataNM[getpagedataNM$likes_group %in%
                       c("up to 1.5m", "up to 2m", "up to 4m"), 1:12]
NM1$message <- substr(NM1$message, 1, 40)  # truncate for readable axis labels
View(NM1)

# Drop posts with a missing or empty message.
NM1 <- NM1[!(is.na(NM1$message) | NM1$message == ""), ]

# Likes per top post.
ggplot(NM1, aes(x = message, y = likes_count)) +
  geom_bar(stat = "identity", fill = "purple") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  ggtitle("Top Posts") +
  xlab("Posts") +
  ylab("Number of likes")

Time Series

#====================================================================================
# Likes Timeline - Time series
#====================================================================================

# created_time is an ISO-8601 timestamp; its first 10 characters are the
# "YYYY-MM-DD" date, which as.Date() parses directly.
getpagedataNM$date <- as.Date(substr(getpagedataNM$created_time, 1, 10))

# Likes over time (stat = "identity" is the geom_line default, so omitted).
ggplot(getpagedataNM, aes(x = date, y = likes_count)) +
  geom_line(color = "red") +
  theme_bw() +
  ggtitle("Time Series") +
  xlab("Year") +
  ylab("Number of likes") +
  theme(plot.title = element_text(hjust = 0.5))
View(getpagedataNM)

WordCloud Analysis

#==================================================================
# Word Cloud Analysis - Narendra Modi
#==================================================================

# Build a text corpus from the post messages and normalise it:
# lowercase, strip punctuation and extra whitespace, drop stopwords.
Corpus1 <- Corpus(VectorSource(getpagedataNM$message))
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))

# Document-term matrix and overall term frequencies.
dtm  <- DocumentTermMatrix(Corpus1)
dtm2 <- as.matrix(dtm)
frequency1 <- sort(colSums(dtm2), decreasing = TRUE)
head(frequency1)

# Word cloud of the most frequent terms. Cap at the vocabulary size:
# indexing words[1:100] when fewer than 100 terms exist would inject NAs.
words <- names(frequency1)
pal <- brewer.pal(8, "Dark2")
n_top <- min(100, length(frequency1))
wordcloud(words[seq_len(n_top)], frequency1[seq_len(n_top)], colors = pal)

(Note: WordCloud analysis basically shows the words which are used most frequently — for example, here they are "will", "India", "people", etc.)

Donald Trump

#=========================================================================================
# Donald Trump Analysis
#=========================================================================================

# Fetch up to 3000 posts from the page's feed; the page id is passed as a
# string to avoid numeric-precision surprises with large ids.
dt <- getPage("153080620724", token = myfb, n = 3000)

#=========================================================================================

'Post' Type Distribution

#=========================================================================================
# Categorizing likes into groups and seeing the distribution of each post type
#=========================================================================================

dt$likes_count <- as.numeric(dt$likes_count)

# Same buckets as the Modi analysis so the two charts are comparable.
# include.lowest = TRUE keeps posts with exactly 0 likes (without it,
# cut() excludes the lower bound of the first interval → NA).
dt$likes_group <- cut(dt$likes_count,
                      breaks = c(0, 30000, 60000, 100000, 140000, 200000,
                                 300000, 400000, 500000, 600000, 750000,
                                 900000, 1000000, 1500000, 1700000,
                                 2000000, 4000000),
                      labels = c("up to 30k", "up to 60k", "up to 100k",
                                 "up to 140k", "up to 200k", "up to 300k",
                                 "up to 400k", "up to 500k", "up to 600k",
                                 "up to 750k", "up to 900k", "up to 1m",
                                 "up to 1.5m", "up to 1.7m", "up to 2m",
                                 "up to 4m"),
                      include.lowest = TRUE)

# Posts per likes bucket, coloured by post type; columns referenced by
# bare name inside aes() instead of dt$col.
ggplot(dt, aes(x = likes_group, fill = type)) +
  geom_bar(stat = "count") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5)) +
  ggtitle("'Post' Type Distribution") +
  xlab("Likes group") +
  ylab("Counts")

#=========================================================================================
# Viewing the top posts and plotting them
#=========================================================================================

# Select the highest like buckets by column name; %in% treats NA group
# values as "not selected" (the original ==/| comparison propagates NA).
dt1 <- dt[dt$likes_group %in% c("up to 300k", "up to 400k"), 1:12]
dt1$message <- substr(dt1$message, 1, 40)  # truncate for readable labels
View(dt1)

# BUG FIX: the original filtered dt1 using NM1$message — the Modi data
# frame — so the wrong rows were kept/dropped. Filter on dt1's own column.
dt1 <- dt1[!(is.na(dt1$message) | dt1$message == ""), ]

# Likes per top post.
ggplot(dt1, aes(x = message, y = likes_count)) +
  geom_bar(stat = "identity", fill = "purple") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +
  ggtitle("Top posts") +
  xlab("posts") +
  ylab("Number of likes")

Time Series

#=========================================================================================
# Likes Timeline - Time series
#=========================================================================================

# First 10 characters of the ISO-8601 created_time are the "YYYY-MM-DD" date.
dt$date <- as.Date(substr(dt$created_time, 1, 10))

ggplot(dt, aes(x = date, y = likes_count)) +
  geom_line(color = "red") +
  theme_bw() +
  ggtitle("Time Series") +
  xlab("Year") +
  ylab("Number of likes") +
  theme(plot.title = element_text(hjust = 0.5))

WordCloud Analysis

#=========================================================================================
# Word Cloud Analysis - Donald Trump
#=========================================================================================

# Build and clean a corpus from the post messages (same pipeline as the
# Modi analysis: lowercase, strip punctuation/whitespace, drop stopwords).
Corpus2 <- Corpus(VectorSource(dt$message))
Corpus2 <- tm_map(Corpus2, content_transformer(tolower))
Corpus2 <- tm_map(Corpus2, removePunctuation)
Corpus2 <- tm_map(Corpus2, stripWhitespace)
Corpus2 <- tm_map(Corpus2, removeWords, stopwords("english"))

# Document-term matrix and overall term frequencies.
dtma  <- DocumentTermMatrix(Corpus2)
dtm2a <- as.matrix(dtma)
frequency2 <- sort(colSums(dtm2a), decreasing = TRUE)
head(frequency2)

# Word cloud, capped at the vocabulary size to avoid NA injection when
# fewer than 100 distinct terms exist.
words2 <- names(frequency2)
pal <- brewer.pal(8, "Dark2")
n_top2 <- min(100, length(frequency2))
wordcloud(words2[seq_len(n_top2)], frequency2[seq_len(n_top2)], colors = pal)

#===================================================================================

Comparison and Conclusion

While performing the analysis, I faced errors while mining the data using the getPage command. The results were consistently truncated and not many posts could be mined, especially Trump's. Next time, I'll probably use Python for the data mining.

With whatever data we have, we have performed the above analysis. It can be seen that both leaders are assuring the people of their respective countries that they "will" do things that are beneficial for the country. In the wordcloud analysis, we can see the priorities of each leader. It is also interesting to note that the likes count of Mr. Narendra Modi is on a downward trend (see the Time Series).

Mr. Donald Trump is more active on Twitter than on Facebook, so next time I'll perform sentiment analysis based on tweets. The comparison will then be more meaningful because we can ensure we get an equal number of tweets from both leaders.

Feel free to give suggestions. Thank you for reading.


Featured Posts
Check back soon
Once posts are published, you’ll see them here.
Recent Posts
Archive
Search By Tags
No tags yet.
Follow Us
  • Facebook Basic Square
  • Twitter Basic Square
  • Google+ Basic Square
bottom of page