For this tutorial, download the files SwissPoliticians-tweets.json.gz (~21MB) and SwissPoliticians.csv.

Today we will go through the introductory part of the fourth exercise together, in which we analyze the network of Swiss politicians on Twitter. Here we will cover the basics of loading and processing Twitter data from a file of tweets in JSON format.

1. Load the timeline data of Twitter user accounts of Swiss politicians

First load the packages we will use in this exercise: dplyr, tidygraph, jsonlite, and ggraph.

library(dplyr)
library(tidygraph)
library(jsonlite)
library(ggraph)

Download the file SwissPoliticians.csv and read it as a csv in R. Take into account that separators are tabs. Change the screen names of accounts to lower case and add a column with a sequential id from 1 to the number of politicians.

# Politician metadata: the file is tab-separated despite its .csv extension.
poldf <- read.csv(
  "SwissPoliticians.csv",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Lower-case the handles so they can be matched case-insensitively against the
# screen names extracted from the tweets.
poldf$screenName <- tolower(poldf$screenName)
# Sequential id 1..n. seq_len() yields an empty vector for an empty table,
# whereas seq(1, 0) would count down to c(1, 0) and break the assignment.
poldf$id <- seq_len(nrow(poldf))

Read the politician tweets file taking into account that it is compressed. Print a random line and its content read as JSON. Check Exercise 2 (SIT on Twitter) if you need an example of how to do this.

# Open the gzip-compressed file explicitly so that the connection can be
# closed again; readLines(gzfile(...)) leaves the connection object behind
# for the garbage collector to clean up.
tweets_con <- gzfile("SwissPoliticians-tweets.json.gz", open = "rt")
lines <- readLines(tweets_con)
close(tweets_con)

# Pick one tweet at random and show its raw JSON text.
line <- lines[sample(length(lines), 1)]
line
## [1] "{\"created_at\":\"Mon Dec 09 14:29:21 +0000 2013\",\"id\":410053560643363000,\"id_str\":\"410053560643362816\",\"full_text\":\"Les réponses de la CF EWS à l'heure des questions prouve que le Parlement aurait dû voter la Lex USA. Les banques sont livrées au DoJ !\",\"truncated\":false,\"display_text_range\":[0,135],\"entities\":{\"hashtags\":[],\"symbols\":[],\"user_mentions\":[],\"urls\":[],\"media\":{}},\"source\":\"<a href=\\\"http://twitter.com/download/iphone\\\" rel=\\\"nofollow\\\">Twitter for iPhone</a>\",\"user\":{\"id\":627501106,\"id_str\":\"627501106\",\"name\":\"Dominique de Buman\",\"screen_name\":\"DdeBuman\",\"location\":\"Fribourg\",\"description\":\"Conseiller national / Nationalrat @PDCFribourg @CVP_PDC\",\"url\":\"http://t.co/u9VHA7jwJh\",\"entities\":{\"url\":{\"urls\":[{\"url\":\"http://t.co/u9VHA7jwJh\",\"expanded_url\":\"http://www.debuman.ch\",\"display_url\":\"debuman.ch\",\"indices\":[0,22]}]},\"description\":{\"urls\":[]}},\"protected\":false,\"followers_count\":3006,\"friends_count\":1119,\"listed_count\":144,\"created_at\":\"Thu Jul 05 15:56:00 +0000 
2012\",\"favourites_count\":1,\"geo_enabled\":false,\"verified\":false,\"statuses_count\":137,\"contributors_enabled\":false,\"is_translator\":false,\"is_translation_enabled\":false,\"profile_background_color\":\"000000\",\"profile_background_image_url\":\"http://abs.twimg.com/images/themes/theme1/bg.png\",\"profile_background_image_url_https\":\"https://abs.twimg.com/images/themes/theme1/bg.png\",\"profile_background_tile\":false,\"profile_image_url\":\"http://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg\",\"profile_image_url_https\":\"https://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg\",\"profile_banner_url\":\"https://pbs.twimg.com/profile_banners/627501106/1402920965\",\"profile_link_color\":\"FF691F\",\"profile_sidebar_border_color\":\"000000\",\"profile_sidebar_fill_color\":\"000000\",\"profile_text_color\":\"000000\",\"profile_use_background_image\":false,\"has_extended_profile\":false,\"default_profile\":false,\"default_profile_image\":false,\"following\":false,\"follow_request_sent\":false,\"notifications\":false,\"translator_type\":\"none\",\"withheld_in_countries\":[]},\"is_quote_status\":false,\"retweet_count\":2,\"favorite_count\":1,\"favorited\":false,\"retweeted\":false,\"lang\":\"fr\",\"quoted_status_permalink\":{},\"quoted_status\":{\"display_text_range\":{},\"entities\":{\"hashtags\":{},\"symbols\":{},\"user_mentions\":{},\"urls\":{},\"media\":{}},\"extended_entities\":{\"media\":{}},\"user\":{\"entities\":{\"url\":{\"urls\":{}},\"description\":{\"urls\":{}}},\"withheld_in_countries\":{}}},\"extended_entities\":{\"media\":{}},\"retweeted_status\":{\"display_text_range\":{},\"entities\":{\"hashtags\":{},\"symbols\":{},\"user_mentions\":{},\"urls\":{},\"media\":{}},\"extended_entities\":{\"media\":{}},\"user\":{\"entities\":{\"url\":{\"urls\":{}},\"description\":{\"urls\":{}}},\"withheld_in_countries\":{}}}}"
# Parse the sampled line as JSON and print the resulting nested list.
jsonlite::fromJSON(txt = line)
## $created_at
## [1] "Mon Dec 09 14:29:21 +0000 2013"
## 
## $id
## [1] 4.100536e+17
## 
## $id_str
## [1] "410053560643362816"
## 
## $full_text
## [1] "Les réponses de la CF EWS à l'heure des questions prouve que le Parlement aurait dû voter la Lex USA. Les banques sont livrées au DoJ !"
## 
## $truncated
## [1] FALSE
## 
## $display_text_range
## [1]   0 135
## 
## $entities
## $entities$hashtags
## list()
## 
## $entities$symbols
## list()
## 
## $entities$user_mentions
## list()
## 
## $entities$urls
## list()
## 
## $entities$media
## named list()
## 
## 
## $source
## [1] "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>"
## 
## $user
## $user$id
## [1] 627501106
## 
## $user$id_str
## [1] "627501106"
## 
## $user$name
## [1] "Dominique de Buman"
## 
## $user$screen_name
## [1] "DdeBuman"
## 
## $user$location
## [1] "Fribourg"
## 
## $user$description
## [1] "Conseiller national / Nationalrat @PDCFribourg @CVP_PDC"
## 
## $user$url
## [1] "http://t.co/u9VHA7jwJh"
## 
## $user$entities
## $user$entities$url
## $user$entities$url$urls
##                      url          expanded_url display_url indices
## 1 http://t.co/u9VHA7jwJh http://www.debuman.ch  debuman.ch   0, 22
## 
## 
## $user$entities$description
## $user$entities$description$urls
## list()
## 
## 
## 
## $user$protected
## [1] FALSE
## 
## $user$followers_count
## [1] 3006
## 
## $user$friends_count
## [1] 1119
## 
## $user$listed_count
## [1] 144
## 
## $user$created_at
## [1] "Thu Jul 05 15:56:00 +0000 2012"
## 
## $user$favourites_count
## [1] 1
## 
## $user$geo_enabled
## [1] FALSE
## 
## $user$verified
## [1] FALSE
## 
## $user$statuses_count
## [1] 137
## 
## $user$contributors_enabled
## [1] FALSE
## 
## $user$is_translator
## [1] FALSE
## 
## $user$is_translation_enabled
## [1] FALSE
## 
## $user$profile_background_color
## [1] "000000"
## 
## $user$profile_background_image_url
## [1] "http://abs.twimg.com/images/themes/theme1/bg.png"
## 
## $user$profile_background_image_url_https
## [1] "https://abs.twimg.com/images/themes/theme1/bg.png"
## 
## $user$profile_background_tile
## [1] FALSE
## 
## $user$profile_image_url
## [1] "http://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg"
## 
## $user$profile_image_url_https
## [1] "https://pbs.twimg.com/profile_images/478511608281317377/rbUgEK_B_normal.jpeg"
## 
## $user$profile_banner_url
## [1] "https://pbs.twimg.com/profile_banners/627501106/1402920965"
## 
## $user$profile_link_color
## [1] "FF691F"
## 
## $user$profile_sidebar_border_color
## [1] "000000"
## 
## $user$profile_sidebar_fill_color
## [1] "000000"
## 
## $user$profile_text_color
## [1] "000000"
## 
## $user$profile_use_background_image
## [1] FALSE
## 
## $user$has_extended_profile
## [1] FALSE
## 
## $user$default_profile
## [1] FALSE
## 
## $user$default_profile_image
## [1] FALSE
## 
## $user$following
## [1] FALSE
## 
## $user$follow_request_sent
## [1] FALSE
## 
## $user$notifications
## [1] FALSE
## 
## $user$translator_type
## [1] "none"
## 
## $user$withheld_in_countries
## list()
## 
## 
## $is_quote_status
## [1] FALSE
## 
## $retweet_count
## [1] 2
## 
## $favorite_count
## [1] 1
## 
## $favorited
## [1] FALSE
## 
## $retweeted
## [1] FALSE
## 
## $lang
## [1] "fr"
## 
## $quoted_status_permalink
## named list()
## 
## $quoted_status
## $quoted_status$display_text_range
## named list()
## 
## $quoted_status$entities
## $quoted_status$entities$hashtags
## named list()
## 
## $quoted_status$entities$symbols
## named list()
## 
## $quoted_status$entities$user_mentions
## named list()
## 
## $quoted_status$entities$urls
## named list()
## 
## $quoted_status$entities$media
## named list()
## 
## 
## $quoted_status$extended_entities
## $quoted_status$extended_entities$media
## named list()
## 
## 
## $quoted_status$user
## $quoted_status$user$entities
## $quoted_status$user$entities$url
## $quoted_status$user$entities$url$urls
## named list()
## 
## 
## $quoted_status$user$entities$description
## $quoted_status$user$entities$description$urls
## named list()
## 
## 
## 
## $quoted_status$user$withheld_in_countries
## named list()
## 
## 
## 
## $extended_entities
## $extended_entities$media
## named list()
## 
## 
## $retweeted_status
## $retweeted_status$display_text_range
## named list()
## 
## $retweeted_status$entities
## $retweeted_status$entities$hashtags
## named list()
## 
## $retweeted_status$entities$symbols
## named list()
## 
## $retweeted_status$entities$user_mentions
## named list()
## 
## $retweeted_status$entities$urls
## named list()
## 
## $retweeted_status$entities$media
## named list()
## 
## 
## $retweeted_status$extended_entities
## $retweeted_status$extended_entities$media
## named list()
## 
## 
## $retweeted_status$user
## $retweeted_status$user$entities
## $retweeted_status$user$entities$url
## $retweeted_status$user$entities$url$urls
## named list()
## 
## 
## $retweeted_status$user$entities$description
## $retweeted_status$user$entities$description$urls
## named list()
## 
## 
## 
## $retweeted_status$user$withheld_in_countries
## named list()

Iterate over all the lines you read from the file, interpreting each one as a JSON object with the data of a tweet. For each tweet that is a retweet, save the screen name of the user who tweeted it and the screen name of the user who made the tweet being retweeted. Save these two in a data frame with two columns.

# Collect one (retweeter, retweeted) screen-name pair per retweet.
# Both vectors are preallocated to the number of lines and trimmed after the
# loop, so they are never grown element by element.
userName <- character(length(lines))
RTuserName <- character(length(lines))
n_retweets <- 0L
for (line in lines) {
  tweet <- fromJSON(line)
  # Only tweets whose retweeted_status carries an id_str are retweets.
  if (!is.null(tweet$retweeted_status$id_str)) {
    n_retweets <- n_retweets + 1L
    userName[n_retweets] <- tweet$user$screen_name
    RTuserName[n_retweets] <- tweet$retweeted_status$user$screen_name
  }
}
# Drop the unused tail of the preallocated vectors.
userName <- userName[seq_len(n_retweets)]
RTuserName <- RTuserName[seq_len(n_retweets)]

# Lower-case both columns so they match the lower-cased politician handles.
# stringsAsFactors = FALSE keeps the names as character on R < 4.0 as well,
# consistent with how the politicians table is read.
tweetsdf <- data.frame(
  userName = tolower(userName),
  RTuserName = tolower(RTuserName),
  stringsAsFactors = FALSE
)

As a last step, filter the data frame to remove cases in which a politician was retweeting themselves. How many tweets did you have in the dataset before and after this filter?

# Number of retweets before removing self-retweets.
nrow(tweetsdf)
## [1] 19047
# Keep only rows where the retweeting and the retweeted account differ.
tweetsdf <- filter(tweetsdf, userName != RTuserName)
nrow(tweetsdf)
## [1] 18959

2. Build social network of retweets

Using inner_join, merge the tweets data frame with the politicians data frame such that each row also contains the information of the politician who wrote the tweet. After this, each tweet should be in one row including also the id of the user that posted it and the political party they belong to.

# Attach the retweeting politician's party and id to every retweet.
# Columns are renamed by name rather than by position, so the labels stay
# correct even if the column order of poldf changes.
mergedf1 <- tweetsdf %>%
  inner_join(poldf, by = c("userName" = "screenName")) %>%
  rename(userParty = party, userid = id)

As above, use inner_join to merge the result of the previous chunk with the politicians data frame, but now match by the screen name of the politician being retweeted. After this, the resulting data frame should contain the id and the party of both the politician retweeting and the politician being retweeted.

# Attach the retweeted politician's party and id as well. Retweets of accounts
# that are not in poldf are dropped by the inner join.
# Columns are renamed by name rather than by position, so the labels stay
# correct even if the column order of poldf changes.
mergedf2 <- mergedf1 %>%
  inner_join(poldf, by = c("RTuserName" = "screenName")) %>%
  rename(RTparty = party, RTuserid = id)

Build the vertices and edges data frames for the network. The vertices data frame only needs to contain the id of each politician, their screen name (as a column called “name”), and the party they belong to. The edges dataframe needs the id of the user being retweeted as “from” and the id of the user retweeting as “to”. This way edges mark information flow. Use group_by to aggregate the multiple instances of these pairs such that the weight of edges is the number of times a user retweeted another.

# Node table: one row per politician with id, screen name (as `name`) and party.
vertices <- poldf %>%
  select(id, name = screenName, party)

# Edge table: from = retweeted account, to = retweeting account, and
# weight = number of times that pair occurs.
# .groups = "drop" returns an ungrouped table, so no grouping by `from`
# leaks into later steps and summarise() prints no grouping message.
edges <- mergedf2 %>%
  select(from = RTuserid, to = userid) %>%
  group_by(from, to) %>%
  summarize(weight = n(), .groups = "drop")

Now do the corresponding call to tbl_graph to build the graph as an undirected graph, using the column id of nodes as identifier (node_key).

# Assemble the undirected retweet network from the node and edge tables.
# Edge endpoints are integer ids; these coincide with row positions in
# `vertices` because the ids were assigned sequentially starting at 1.
graph <- tbl_graph(
  nodes = vertices,
  edges = edges,
  node_key = "id",
  directed = FALSE
)
graph
## # A tbl_graph: 388 nodes and 739 edges
## #
## # An undirected multigraph with 143 components
## #
## # Node Data: 388 × 3 (active)
##      id name            party
##   <int> <chr>           <chr>
## 1     1 andreaskirstein AL   
## 2     2 bergerwthur     AL   
## 3     3 maenij          AL   
## 4     4 walterangst     AL   
## 5     5 asba_j          BDP  
## 6     6 beining_bdp     BDP  
## # … with 382 more rows
## #
## # Edge Data: 739 × 3
##    from    to weight
##   <int> <int>  <int>
## 1     1     4      4
## 2     5    13      3
## 3     5    14      5
## # … with 736 more rows

Show a simple visualization of the graph with the FR layout algorithm. Does it look like a social network?

# Quick look at the network with the "fr" layout, drawing plain edges and
# node points on ggraph's graph theme.
ggraph(graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  theme_graph()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.