For this tutorial, download the file SwissPoliticians-tweets.json.gz (~21MB) and SwissPoliticians.csv.
Today, we will work together through the introductory part of the fourth exercise, in which we analyze the network of Swiss politicians on Twitter. Here, we will cover the basics of loading and processing Twitter data from a file of tweets stored as JSON.
First load the packages we will use in this exercise: dplyr, tidygraph, jsonlite, and ggraph.
library(dplyr)
library(tidygraph)
library(jsonlite)
library(ggraph)
Download the file SwissPoliticians.csv and read it as a csv in R. Take into account that separators are tabs. Change the screen names of accounts to lower case and add a column with a sequential id from 1 to the number of politicians.
# Load the politician table: tab-separated with a header row. Text columns are
# kept as character vectors rather than converted to factors.
poldf <- read.csv(
  "SwissPoliticians.csv",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Normalise the account handles to lower case.
poldf$screenName <- tolower(poldf$screenName)
# Sequential id 1..n. seq_len() yields an empty vector for a zero-row table,
# whereas seq(1, 0) would count down and produce c(1, 0).
poldf$id <- seq_len(nrow(poldf))
Read the politician tweets file taking into account that it is compressed. Print a random line and its content read as JSON. Check Exercise 2 (SIT on Twitter) if you need an example of how to do this.
# Read the gzip-compressed tweet file, one JSON document per line. The
# connection is opened and closed explicitly so it is not left for the garbage
# collector to clean up (which triggers a "closing unused connection" warning).
tweets_con <- gzfile("SwissPoliticians-tweets.json.gz", open = "rt")
lines <- readLines(tweets_con)
close(tweets_con)
# Pick one line uniformly at random and print its raw text.
line <- lines[[sample.int(length(lines), 1L)]]
line
## [1] "{\"created_at\":\"Mon Dec 09 14:29:21 +0000 2013\",\"id\":410053560643363000,\"id_str\":\"410053560643362816\",\"full_text\":\"Les réponses de la CF EWS à l'heure des questions prouve que le Parlement aurait dû voter la Lex USA. Les banques sont livrées au DoJ !\",\"truncated\":false,\"display_text_range\":[0,135],\"entities\":{\"hashtags\":[],\"symbols\":[],\"user_mentions\":[],\"urls\":[],\"media\":{}},\"source\":\"<a href=\\\"http://twitter.com/download/iphone\\\" rel=\\\"nofollow\\\">Twitter for iPhone</a>\",\"user\":{\"id\":627501106,\"id_str\":\"627501106\",\"name\":\"Dominique de Buman\",\"screen_name\":\"DdeBuman\",\"location\":\"Fribourg\",\"description\":\"Conseiller national / Nationalrat @PDCFribourg @CVP_PDC\",\"url\":\"http://t.co/u9VHA7jwJh\",\"entities\":{\"url\":{\"urls\":[{\"url\":\"http://t.co/u9VHA7jwJh\",\"expanded_url\":\"http://www.debuman.ch\",\"display_url\":\"debuman.ch\",\"indices\":[0,22]}]},\"description\":{\"urls\":[]}},\"protected\":false,\"followers_count\":3006,\"friends_count\":1119,\"listed_count\":144,\"created_at\":\"Thu Jul 05 15:56:00 +0000 
2012\",\"favourites_count\":1,\"geo_enabled\":false,\"verified\":false,\"statuses_count\":137,\"contributors_enabled\":false,\"is_translator\":false,\"is_translation_enabled\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http://abs.twimg.com/images/themes/theme1/bg.png\",\"profile_background_image_url_https\":\"https://abs.twimg.com/images/themes/theme1/bg.png\",\"profile_background_tile\":false,\"profile_image_url\":\"http://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg\",\"profile_image_url_https\":\"https://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg\",\"profile_banner_url\":\"https://pbs.twimg.com/profile_banners/627501106/1402920965\",\"profile_link_color\":\"FF691F\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"has_extended_profile\":false,\"default_profile\":false,\"default_profile_image\":false,\"following\":false,\"follow_request_sent\":false,\"notifications\":false,\"translator_type\":\"none\",\"withheld_in_countries\":[]},\"is_quote_status\":false,\"retweet_count\":2,\"favorite_count\":1,\"favorited\":false,\"retweeted\":false,\"lang\":\"fr\",\"quoted_status_permalink\":{},\"quoted_status\":{\"display_text_range\":{},\"entities\":{\"hashtags\":{},\"symbols\":{},\"user_mentions\":{},\"urls\":{},\"media\":{}},\"extended_entities\":{\"media\":{}},\"user\":{\"entities\":{\"url\":{\"urls\":{}},\"description\":{\"urls\":{}}},\"withheld_in_countries\":{}}},\"extended_entities\":{\"media\":{}},\"retweeted_status\":{\"display_text_range\":{},\"entities\":{\"hashtags\":{},\"symbols\":{},\"user_mentions\":{},\"urls\":{},\"media\":{}},\"extended_entities\":{\"media\":{}},\"user\":{\"entities\":{\"url\":{\"urls\":{}},\"description\":{\"urls\":{}}},\"withheld_in_countries\":{}}}}"
# Parse the sampled line as JSON and print the resulting nested list.
fromJSON(txt = line)
## $created_at
## [1] "Mon Dec 09 14:29:21 +0000 2013"
##
## $id
## [1] 4.100536e+17
##
## $id_str
## [1] "410053560643362816"
##
## $full_text
## [1] "Les réponses de la CF EWS à l'heure des questions prouve que le Parlement aurait dû voter la Lex USA. Les banques sont livrées au DoJ !"
##
## $truncated
## [1] FALSE
##
## $display_text_range
## [1] 0 135
##
## $entities
## $entities$hashtags
## list()
##
## $entities$symbols
## list()
##
## $entities$user_mentions
## list()
##
## $entities$urls
## list()
##
## $entities$media
## named list()
##
##
## $source
## [1] "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>"
##
## $user
## $user$id
## [1] 627501106
##
## $user$id_str
## [1] "627501106"
##
## $user$name
## [1] "Dominique de Buman"
##
## $user$screen_name
## [1] "DdeBuman"
##
## $user$location
## [1] "Fribourg"
##
## $user$description
## [1] "Conseiller national / Nationalrat @PDCFribourg @CVP_PDC"
##
## $user$url
## [1] "http://t.co/u9VHA7jwJh"
##
## $user$entities
## $user$entities$url
## $user$entities$url$urls
## url expanded_url display_url indices
## 1 http://t.co/u9VHA7jwJh http://www.debuman.ch debuman.ch 0, 22
##
##
## $user$entities$description
## $user$entities$description$urls
## list()
##
##
##
## $user$protected
## [1] FALSE
##
## $user$followers_count
## [1] 3006
##
## $user$friends_count
## [1] 1119
##
## $user$listed_count
## [1] 144
##
## $user$created_at
## [1] "Thu Jul 05 15:56:00 +0000 2012"
##
## $user$favourites_count
## [1] 1
##
## $user$geo_enabled
## [1] FALSE
##
## $user$verified
## [1] FALSE
##
## $user$statuses_count
## [1] 137
##
## $user$contributors_enabled
## [1] FALSE
##
## $user$is_translator
## [1] FALSE
##
## $user$is_translation_enabled
## [1] FALSE
##
## $user$profile_background_color
## [1] "000000"
##
## $user$profile_background_image_url
## [1] "http://abs.twimg.com/images/themes/theme1/bg.png"
##
## $user$profile_background_image_url_https
## [1] "https://abs.twimg.com/images/themes/theme1/bg.png"
##
## $user$profile_background_tile
## [1] FALSE
##
## $user$profile_image_url
## [1] "http://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg"
##
## $user$profile_image_url_https
## [1] "https://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg"
##
## $user$profile_banner_url
## [1] "https://pbs.twimg.com/profile_banners/627501106/1402920965"
##
## $user$profile_link_color
## [1] "FF691F"
##
## $user$profile_sidebar_border_color
## [1] "000000"
##
## $user$profile_sidebar_fill_color
## [1] "000000"
##
## $user$profile_text_color
## [1] "000000"
##
## $user$profile_use_background_image
## [1] FALSE
##
## $user$has_extended_profile
## [1] FALSE
##
## $user$default_profile
## [1] FALSE
##
## $user$default_profile_image
## [1] FALSE
##
## $user$following
## [1] FALSE
##
## $user$follow_request_sent
## [1] FALSE
##
## $user$notifications
## [1] FALSE
##
## $user$translator_type
## [1] "none"
##
## $user$withheld_in_countries
## list()
##
##
## $is_quote_status
## [1] FALSE
##
## $retweet_count
## [1] 2
##
## $favorite_count
## [1] 1
##
## $favorited
## [1] FALSE
##
## $retweeted
## [1] FALSE
##
## $lang
## [1] "fr"
##
## $quoted_status_permalink
## named list()
##
## $quoted_status
## $quoted_status$display_text_range
## named list()
##
## $quoted_status$entities
## $quoted_status$entities$hashtags
## named list()
##
## $quoted_status$entities$symbols
## named list()
##
## $quoted_status$entities$user_mentions
## named list()
##
## $quoted_status$entities$urls
## named list()
##
## $quoted_status$entities$media
## named list()
##
##
## $quoted_status$extended_entities
## $quoted_status$extended_entities$media
## named list()
##
##
## $quoted_status$user
## $quoted_status$user$entities
## $quoted_status$user$entities$url
## $quoted_status$user$entities$url$urls
## named list()
##
##
## $quoted_status$user$entities$description
## $quoted_status$user$entities$description$urls
## named list()
##
##
##
## $quoted_status$user$withheld_in_countries
## named list()
##
##
##
## $extended_entities
## $extended_entities$media
## named list()
##
##
## $retweeted_status
## $retweeted_status$display_text_range
## named list()
##
## $retweeted_status$entities
## $retweeted_status$entities$hashtags
## named list()
##
## $retweeted_status$entities$symbols
## named list()
##
## $retweeted_status$entities$user_mentions
## named list()
##
## $retweeted_status$entities$urls
## named list()
##
## $retweeted_status$entities$media
## named list()
##
##
## $retweeted_status$extended_entities
## $retweeted_status$extended_entities$media
## named list()
##
##
## $retweeted_status$user
## $retweeted_status$user$entities
## $retweeted_status$user$entities$url
## $retweeted_status$user$entities$url$urls
## named list()
##
##
## $retweeted_status$user$entities$description
## $retweeted_status$user$entities$description$urls
## named list()
##
##
##
## $retweeted_status$user$withheld_in_countries
## named list()
Iterate over all the lines you read from the file, interpreting each one as a JSON object with the data of a tweet. For each tweet that is a retweet, save the screen name of the user who tweeted it and the screen name of the user who made the tweet being retweeted. Save these two in a data frame with two columns.
# Collect one (retweeting user, retweeted author) screen-name pair per retweet.
# The vectors are preallocated to the number of lines (an upper bound on the
# number of retweets) and trimmed afterwards, instead of being grown by one
# element on every iteration.
userName <- character(length(lines))
RTuserName <- character(length(lines))
n_retweets <- 0L
for (line in lines) {
  tweet <- fromJSON(line)
  # `[[` matches names exactly; `$` on a list allows partial matching.
  retweeted <- tweet[["retweeted_status"]]
  # A record counts as a retweet only if retweeted_status carries an id_str.
  # The sample record printed earlier has a retweeted_status made of empty
  # placeholders only, so checking for the field's mere presence is not enough.
  if (!is.null(retweeted[["id_str"]])) {
    n_retweets <- n_retweets + 1L
    userName[n_retweets] <- tweet[["user"]][["screen_name"]]
    RTuserName[n_retweets] <- retweeted[["user"]][["screen_name"]]
  }
}
# Drop the unused tail of the preallocated vectors.
userName <- userName[seq_len(n_retweets)]
RTuserName <- RTuserName[seq_len(n_retweets)]
# Lower-case both columns and keep them as character vectors: comparing two
# factor columns with different level sets would fail on R versions where
# data.frame() converts strings to factors by default.
tweetsdf <- data.frame(
  userName = tolower(userName),
  RTuserName = tolower(RTuserName),
  stringsAsFactors = FALSE
)
As a last step, filter the data frame to remove cases in which a politician was retweeting themselves. How many retweets did the data frame contain before and after this filter?
# Row count before dropping self-retweets.
nrow(tweetsdf)
## [1] 19047
# Keep only rows where the retweeting account differs from the retweeted one.
tweetsdf <- tweetsdf %>%
  filter(userName != RTuserName)
# Row count after the filter.
nrow(tweetsdf)
## [1] 18959