# Author : Elise Tancoigne - elise.tancoigne@unige.ch
# License: CC-by 4.0 (https://creativecommons.org/licenses/by/4.0/)
# September 2018

# This R script makes it possible to
# - retrieve the account information of Twitter users who identify with citizen science in their bios
# - add supplementary qualitative variables to these accounts (gender, profession...)
# - do some basic descriptive statistics on these variables
# - build the network of followers/followees links within the dataset
# - compute three network metrics.


##### PREAMBLE: access to Twitter data is not straightforward
# - Twitter requires a registered phone number before it grants access to its API.
# - rate limitations apply with the free API.


##################################################
##### SET THE ENVIRONMENT

# install the required packages
# Install rtweet from GitHub only when it is not installed yet, and do it
# BEFORE loading the package: an unconditional install would re-download and
# reinstall an already loaded package on every run of the script.
# To force the most recent development version, run the install line by hand.
if (!requireNamespace("rtweet", quietly = TRUE)) {
  devtools::install_github("mkearney/rtweet")
}
library(rtweet) # to get data from Twitter
library(rgexf) # to export the network and open it in Gephi

# save (or load) your working environment
setwd(dir = "~/path/to/myworkingdirectory/")
load("myenvironment.RData")
# save.image("myenvironment.RData")


##################################################
###### CREATE CONNECTION WITH TWITTER API

# open the package documentation that explains how to create connections
vignette("tokens")

# go to https://apps.twitter.com/app/12124871/settings
consumerKey <- "the_consumer_key_provided_by_twitter"
consumerSecret <- "the_consumer_secret_provided_by_twitter"

# build the token used by the API calls below
token <- create_token(
  app = "my_application_name",
  consumer_key = consumerKey,
  consumer_secret = consumerSecret
)


##################################################
####### SEARCH USERS THROUGH TWITTER API

### exact match not supported, nor concatenation (see documentation)
### see also the thread on https://github.com/mkearney/rtweet/issues/29
# NOTE(review): the result of this first query is overwritten by the keyword
# searches below; the call still consumes part of the users/search rate limit.
users <- search_users(q = "\"citizen sciences\" OR citizensciences", n = 1000, verbose = TRUE)

# get users and their data
# (check from time to time that you did not reach your download limit - wait if yes)
rate_limit(token, query = "users/search")

# run one search per keyword and stack the results in query order
queries <- c(
  "citsci", "citizenscience", "citizensciences",
  "citizen science", "citizen sciences", "citizen scientist"
)
users <- do.call(
  rbind,
  lapply(queries, function(q) search_users(q, n = 10000, verbose = TRUE))
)
# > 900 users. There are duplicates (e.g. people using "citsci" and "citizenscience" at the same time)
# and false positives (e.g. "citizen of the world, science addict")

# keep only the accounts whose screen name, name or description contains one
# of the keywords (the match must be case-insensitive)
keywords <- "citizen scientist|citsci|citizenscience|citizensciences|citizen science|citizen sciences"
is_match <- grepl(
  keywords,
  paste(users$screen_name, users$name, users$description),
  ignore.case = TRUE
)
users <- users[is_match, ]

# keep one row per account. Deduplicating on user_id rather than on whole rows
# also removes copies of an account whose counters (statuses, followers...)
# changed between two searches; such copies would otherwise survive unique()
# and duplicate rows in the later merge on user_id.
users <- users[!duplicated(users$user_id), ]
# 595 unique users in the original run

# delete url and descr_url (encoded as a list); selected by position, so check
# the column order if the installed rtweet version differs
users <- users[, -c(19, 20)]

rm(queries, keywords, is_match)


##################################################
## CODE DATA IN A SPREADSHEET

# export the dataset from R to CSV
write.csv(
  x = users,
  file = "../twitter/595nodes_to_code.csv",
  row.names = FALSE # no extra column of row names in the spreadsheet
)
## !!! be careful - ids should be imported as characters in OpenOffice - otherwise they are rounded and shown in scientific notation

# then code accounts in a spreadsheet with the following variables:
# linkedin (LinkedIn URL, when found)
# url (URLs of other sources of information)
# type1 ("individual" or "organization")
# type2_raw (raw data, eg. "Senior Technical Staff Member at IBM")
# type2 (coded data, e.g. "researcher", "citizen science project")
# employer_1 (raw data, e.g. "Karlus Inc.")
# employer_2 (coded data, e.g. "research organisation", "government"...)
# field (e.g. "DIY", "conservation"), 
# coord.founder (responsibility for a project; yes or no)
# PhD_start (starting date, when relevant)
# PhD_end (defense date, when relevant)
# country of work (e.g. "Germany", "Canada")
# gender ("male" or "female" or "unknown")


##################################################
## LOAD DATA IN R
# read the coded spreadsheet back into R;
# numerals = "no.loss" keeps numbers that cannot be stored exactly as doubles
# unconverted (presumably to protect the long account ids - TODO confirm)
codes <- read.csv(
  file = "~/Documents/THEMES_Citizen_Sciences/twitter/595nodes_to_code.csv",
  numerals = "no.loss",
  stringsAsFactors = FALSE
)


##################################################
# CHECK DATA COMPLETION
# based on LinkedIn sources, for individuals
# % of individual accounts with a non-empty LinkedIn entry
sum(codes$linkedin[codes$type1 == "individual"] != "", na.rm = TRUE) /
  sum(codes$type1 == "individual", na.rm = TRUE) * 100

# % of individual accounts with a LinkedIn entry or another source
# NOTE(review): this line reads `other_URL` while the organization check below
# reads `url` - confirm which column name the coded spreadsheet really uses
sum(
  codes$linkedin[codes$type1 == "individual"] != "" |
    codes$other_URL[codes$type1 == "individual"] != "",
  na.rm = TRUE
) / sum(codes$type1 == "individual", na.rm = TRUE) * 100

# % of organization accounts with a non-empty `url` entry
sum(codes$url[codes$type1 == "organization"] != "", na.rm = TRUE) /
  sum(codes$type1 == "organization", na.rm = TRUE) * 100


##################################################
# GROUP CODES (=RECODE IN BROADER CATEGORIES)
# The replacements below are applied in sequence and the patterns are not
# anchored, so the order of the statements matters: keep it unchanged.

# --- field ---
codes$field <- gsub(
  pattern = "conservation|environment|biodiversity|ornithology",
  replacement = "environment",
  x = codes$field
)
codes$field <- gsub(
  pattern = "distrib_comp|gender|geog/urbanism|linguistics|open science|science",
  replacement = "other",
  x = codes$field
)
codes$field <- gsub(
  pattern = "educ/STS|SHS|UX/design/gaming/web",
  replacement = "meta-CS",
  x = codes$field
)

# --- type2 (profession / type of collective) ---
codes$type2 <- gsub(
  pattern = "education|journalist/writer/comm",
  replacement = "outreach",
  x = codes$type2
)
codes$type2 <- gsub(
  pattern = "other|sc_policy",
  replacement = "other",
  x = codes$type2
)

# --- employer_2 (type of employer), refined with employer_3 ---
codes$employer_2 <- gsub(
  pattern = "art center|media|school",
  replacement = "outreach",
  x = codes$employer_2
)
codes$employer_2 <- gsub(
  pattern = "university|research institute|resHybrid|resNGO|museum",
  replacement = "research",
  x = codes$employer_2
)
codes$employer_2[grepl("services for research", codes$employer_3)] <- "research"
codes$employer_2[grepl("services for education|outdoor", codes$employer_3)] <- "outreach"
codes$employer_2 <- gsub(
  pattern = "unemployed|NGO|government",
  replacement = "other",
  x = codes$employer_2
)
codes$employer_2[grepl("tech|other", codes$employer_3)] <- "other"

# --- location_2 (country of work grouped in broader areas) ---
codes$location_2 <- gsub(
  pattern = "Belgium|Denmark|France|Germany|Italy|Netherlands|Poland|Spain|Sweden|Switzerland",
  replacement = "Europe",
  x = codes$location_2
)
codes$location_2 <- gsub(
  pattern = "Australia|UK|South Africa|Canada|India|Kenya|New Zealand",
  replacement = "Commonwealth",
  x = codes$location_2
)
codes$location_2 <- gsub(
  pattern = "Brazil|Colombia|Costa Rica|Japan|China|Puerto Rico",
  replacement = "other",
  x = codes$location_2
)


##################################################
# COMPUTE SOME STATISTICS

# % of individual vs. organizational accounts
# share (%) of each account type (type1)
round(prop.table(table(codes$type1)) * 100, digits = 0)
# share (%) of accounts whose type2 contains "unknown"
sum(grepl("unknown", codes$type2)) / length(codes$type2) * 100

# professions (type2) of individuals, in %, most frequent first
sort(
  round(prop.table(table(codes$type2[codes$type1 == "individual"])) * 100, digits = 0),
  decreasing = TRUE
)
# fields of the accounts coded "outreach", in %, most frequent first
sort(
  round(prop.table(table(codes$field[codes$type2 == "outreach"])) * 100, digits = 0),
  decreasing = TRUE
)
# fields of the accounts coded "researcher", in %, most frequent first
sort(
  round(prop.table(table(codes$field[codes$type2 == "researcher"])) * 100, digits = 0),
  decreasing = TRUE
)

# % of PhD among individuals:
# TRUE = PhD_end is something other than "not_applicable"
table(!(codes$PhD_end[codes$type1 == "individual"] %in% "not_applicable")) /
  sum(codes$type1 == "individual", na.rm = TRUE) * 100
# proportion of project coordinators/founders among researchers
prop.table(table(codes$coord.founder[codes$type2 == "researcher"]))


# gender of individuals (%)
prop.table(table(codes$gender[codes$type1 == "individual"])) * 100
# gender split (%) within each profession (columns sum to 100)
round(
  prop.table(
    table(codes[codes$type1 == "individual", c("gender", "type2")]),
    margin = 2
  ) * 100,
  digits = 0
)
# gender split (%) within each field (columns sum to 100)
round(
  prop.table(
    table(codes[codes$type1 == "individual", c("gender", "field")]),
    margin = 2
  ) * 100,
  digits = 0
)


##################################################
### DATA USED TO CREATE FIG. 2 WITH EXCEL

# professions
# professions of individual accounts (%)
sort(
  round(prop.table(table(codes$type2[codes$type1 == "individual"])) * 100, digits = 0),
  decreasing = TRUE
)
# types of collectives, i.e. type2 of organization accounts (%)
sort(
  round(prop.table(table(codes$type2[codes$type1 == "organization"])) * 100, digits = 0),
  decreasing = TRUE
)
# fields of the accounts coded "researcher" (%)
sort(
  round(prop.table(table(codes$field[codes$type2 == "researcher"])) * 100, digits = 0),
  decreasing = TRUE
)
# fields of the accounts coded "CS_project" (%)
sort(
  round(prop.table(table(codes$field[codes$type2 == "CS_project"])) * 100, digits = 0),
  decreasing = TRUE
)

# copy-paste the data in Excel and create the graphs

# from here on, `codes` holds the users' data joined (on user_id) with the
# type1, type2, field and gender codes only
codes <- merge(
  x = users,
  y = codes[, c("user_id", "type1", "type2", "field", "gender")],
  by = "user_id"
)


##################################################
# GET THE FOLLOWERS AND FOLLOWEES LINKS BETWEEN THESE 595 USERS
# If A --> B, the link can be traced through A's followees or through B's followers
# so we will only get the followers for each account 

data <- users

# the function get_followers is limited to 75000 responses
max(data$followers_count) # max in our dataset < 75000 so no need to worry about limits in responses
options(timeout = 1200) # we'll wait up to 20 min if the website does not respond

# edge list of the retrieved follower links (source follows target);
# it starts with zero rows so that no dummy ""/"" edge is stored
links <- data.frame(
  source = character(0), target = character(0),
  stringsAsFactors = FALSE
)

# row of `data` at which the loop starts: 1 for a full run. To resume an
# interrupted run, set it to the next row to process and re-run ONLY the loop
# below (re-running the lines above would empty `links`).
start <- 1
# rows of `data` for which the API call failed (kept for inspection)
failed_users <- integer(0)

# record starting time
starting_time <- Sys.time()
cat(paste0("Starting at ", starting_time, "\n"))

# for each user, from row `start` to the last row
user_rows <- seq_len(nrow(data))
for (i in user_rows[user_rows >= start]) {
  # display where we are in the loop
  cat(paste0("User #", i, " ---> Start\n"))

  # build followers links (accounts with 0 or unknown followers are skipped)
  if (isTRUE(data$followers_count[i] != 0)) {
    # an API error for one account must not stop the whole collection
    followersID <- tryCatch(
      get_followers(data$screen_name[i], n = 75000)$user_id,
      error = function(e) e
    )

    if (inherits(followersID, "error")) {
      cat(paste0("User #", i, " ---> Failed: ", conditionMessage(followersID), "\n"))
      failed_users <- c(failed_users, i)
    } else {
      # get rid of NA ids
      followersID <- followersID[!is.na(followersID)]
      # data.frame() cannot combine zero sources with one target, so only
      # append when at least one follower id was returned
      if (length(followersID) > 0) {
        # `links` is grown inside the loop on purpose: everything retrieved so
        # far stays available if the run is interrupted
        links <- rbind(
          links,
          data.frame(
            source = followersID, target = data$user_id[i],
            stringsAsFactors = FALSE
          )
        )
      }
    }
  }

  # display where we are in the loop
  cat(paste0("User #", i, " ---> Done\n"))

  # handle twitter rate limits
  limit <- rate_limit(token, query = "followers/ids")
  # if the limit is reached (isTRUE also covers an empty or NA answer)
  if (isTRUE(limit$remaining == 0)) {
    # time before the limit resets; the original code treats it as minutes
    wait_mins <- max(0, as.numeric(limit$reset))
    # display a message
    cat(paste("Waiting", round(wait_mins, 2), "mins for rate limitation\n", sep = " "))
    # wait for the necessary time
    Sys.sleep(wait_mins * 60 + 1)
  }

  # restart the loop for the next user
}

# remove the temporary objects that exist (some are only created conditionally)
rm(list = intersect(
  c("i", "start", "user_rows", "followersID", "limit", "wait_mins"),
  ls()
))

# at the end, print total execution time (format() keeps the time unit)
cat(paste("Time elapsed:", format(Sys.time() - starting_time), "\n", sep = " "))
rm(starting_time)

# errors may occur for a few accounts (#462 and #165 in the original run):
# their row numbers are collected in `failed_users`
if (length(failed_users) > 0) {
  cat("Accounts that could not be retrieved (rows of data):", failed_users, "\n")
}


##################################################
# BUILD THE NETWORK

# build the data frame of nodes, with nodes attributes
# node table: one row per account, with its id and display name
nodes <- data.frame(
  id = codes$user_id,
  label = codes$name,
  stringsAsFactors = FALSE
)

# node attributes
# NOTE(review): `url` is selected here although a `url` column appears to be
# dropped from `users` earlier in the script - confirm it exists in `codes`
nodes.att <- subset(
  codes,
  select = c(
    description, followers_count, friends_count, statuses_count,
    url, location, lang, screen_name,
    type1, type2, field, gender
  )
)
# "friends_count" (third selected column) is renamed "followees_count", for consistency
names(nodes.att)[names(nodes.att) == "friends_count"] <- "followees_count"

# keep only the edges whose two ends both belong to the coded users
# (i.e. remove links whose source or target is not in codes$user_id)
both_ends_known <- links$source %in% codes$user_id & links$target %in% codes$user_id
mut_links <- links[both_ends_known, ]
rm(both_ends_known)


##################################################
# COMPUTE NETWORK METRICS WITH PACKAGE IGRAPH

# create the graph object in igraph
library(igraph)
n <- graph.data.frame(mut_links, directed = TRUE)

# add attributes to the vertices
# (vertex names are user ids: look up the row of each vertex in `codes` once)
idx <- match(V(n)$name, codes$user_id)
V(n)$type <- codes$type1[idx]
V(n)$acc_name <- codes$name[idx]
V(n)$screen_name <- codes$screen_name[idx]
V(n)$followers_count <- codes$followers_count[idx]
V(n)$statuses_count <- as.numeric(codes$statuses_count[idx])
rm(idx)

# compute network indicators: in-degree, out-degree, total degree, betweenness
V(n)$in_degree <- degree(n, v = V(n), mode = "in", loops = FALSE)
V(n)$out_degree <- degree(n, v = V(n), mode = "out", loops = FALSE)
# total degree: it was read below but never computed, which left `be` without
# valid values. NOTE(review): in + out degree is assumed to be the intent.
V(n)$degree <- degree(n, v = V(n), mode = "all", loops = FALSE)
# `nobigint` is left at its default (TRUE, the value used before) because the
# argument is deprecated in recent igraph versions
V(n)$betw <- betweenness(n, v = V(n), directed = TRUE, weights = NULL, normalized = FALSE)

# compute the indices used in the study
# ba / bb / be: in-, out- and total degree as a % of the number of vertices
V(n)$ba <- V(n)$in_degree / length(V(n)) * 100
V(n)$bb <- V(n)$out_degree / length(V(n)) * 100
V(n)$be <- V(n)$degree / length(V(n)) * 100
# bc: in-degree within the network as a % of the account's followers count
# (NaN or Inf when followers_count is 0)
V(n)$bc <- V(n)$in_degree / V(n)$followers_count * 100

# transform the list object into a dataframe
attr <- as.data.frame(get.vertex.attribute(n))

##################################################
# EXPORT DATA TO PLOT FIG. 3 WITH EXCEL
# file = "" (the default) prints the CSV to the console instead of a file
write.csv(x = attr[, c("name", "bb", "ba", "betw")], file = "")

