#==============================================================================
# Dataset: Africa YouTube species data analysis
# Author: Anthony Basooma
# Section: Fish Biology and Ecology (Ecological Modelling)
# Organisation: National Fisheries Resources Research Institute
#==============================================================================

# Set the working directory
setwd(r"(C:\Users\anthbasooma\Documents\Anthony\personal publications\youtube)")
#load('sentiment.RData')

#==============================================================================
# Packages
library(tidyr)
library(dplyr)
library(ggplot2)
library(extrafont)
library(lubridate)
library(caret)
library(vosonSML)
library(syuzhet)
library(tm)
library(e1071)
library(GGally)
library(ggwordcloud)
library(tidytext)
library(sf)
library(ggspatial)

options(scipen = 999)
options(dplyr.summarise.inform = FALSE)

# Load the video dataset and derive per-day engagement metrics
youtube <- read.csv(file = "smedia.csv", header = TRUE, strip.white = TRUE,
                    sep = ',', na.strings = c('', NA)) %>%
  filter(!views > 3e7) %>%
  mutate(yposted = as.factor(year(dposted)),
         ddiff = abs(time_length(interval(ymd(dposted), ymd(dret)), "day")),
         vpd = views/ddiff,      # views per day
         cts = 1,
         dpd = unlikes/ddiff,    # dislikes per day
         Duration = Duration/100,
         lpd = likes/ddiff,      # likes per day
         cpd = comments/ddiff,   # comments per day
         title_len = nchar(iconv(title, "latin1", "UTF-8", sub = '')))

# Administrative boundaries for mapping channel origins
admbound <- sf::st_read('ne_10m_admin_0_countries/newadminlayer.shp', quiet = TRUE)

# Remove duplicate titles (drop the second record of any duplicated title)
ytube <- youtube %>% group_by(title) %>% filter(!row_number() == 2)

mdata <- ytube %>% group_by(message) %>% summarise(cts = length(message))

#==============================================================================
# Descriptive statistics
des <- ytube %>%
  mutate(platform = 'YT') %>%
  group_by(platform) %>%
  summarise(totalviews = sum(views),
            totallike = sum(likes),
            totdur = sum(Duration),
            totaldilikes = sum(unlikes),
            totaldcomments = sum(comments, na.rm = TRUE))

# Total subscribers, counting each channel once at its maximum subscriber count
des <- ytube %>%
  mutate(platform = 'YT') %>%
  group_by(platform, channel) %>%
  slice(which.max(subscriber)) %>%
  group_by(platform) %>%
  summarise(totalviews = sum(subscriber))

#==============================================================================
# 1a. Are videos specific to a particular region in Africa?
region <- ytube %>%
  group_by(region) %>%
  summarise(totvideo = length(vpd),
            meanvpd = mean(vpd),
            meanlikes = mean(lpd),
            meanun = mean(dpd),
            meancpd = mean(cpd, na.rm = TRUE),
            sdvpd = sd(vpd)/sqrt(length(vpd)),
            sdlikes = sd(lpd)/sqrt(length(vpd)),
            sdun = sd(dpd)/sqrt(length(vpd)),
            sdcpd = sd(cpd, na.rm = TRUE)/sqrt(length(cpd))) %>%
  mutate(meanvpd1 = paste(round(meanvpd, 2), '±', round(sdvpd, 2)),
         meanlike1 = paste(round(meanlikes, 2), '±', round(sdlikes, 2)),
         meanun1 = paste(round(meanun, 2), '±', round(sdun, 2)),
         meancpd1 = paste(round(meancpd, 2), '±', round(sdcpd, 2))) %>%
  dplyr::select(1, 2, 11:14)

xlsx::write.xlsx(region, 'regional2.xlsx')

# Regional differences in engagement (Kruskal-Wallis)
kruskal.test(vpd ~ region, data = ytube)
kruskal.test(lpd ~ region, data = ytube)
kruskal.test(dpd ~ region, data = ytube)
kruskal.test(cpd ~ region, data = ytube)
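#------------------------------------------------------------------------------
# Optional check (not part of the original workflow): if any regional
# Kruskal-Wallis test above is significant, a post-hoc Dunn test shows which
# regions differ. A minimal sketch mirroring the country-level Dunn test used
# later in this script; the 0.025 cut-off is an assumption carried over from
# that code.
dn_region <- as.data.frame(
  dunn.test::dunn.test(ytube$vpd, ytube$region, list = TRUE)) %>%
  filter(P <= 0.025)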
#==============================================================================
# 1b. At country level
geodata <- ytube %>% filter(nature %in% 'country')

ctry <- geodata %>%
  group_by(geo) %>%
  summarise(totvideo = length(vpd),
            meanvpd = mean(vpd),
            meanlikes = mean(lpd),
            meanun = mean(dpd),
            meancpd = mean(cpd, na.rm = TRUE),
            sdvpd = sd(vpd)/sqrt(length(vpd)),
            sdlikes = sd(lpd)/sqrt(length(vpd)),
            sdun = sd(dpd)/sqrt(length(vpd)),
            sdcpd = sd(cpd, na.rm = TRUE)/sqrt(length(cpd))) %>%
  mutate(meanvpd1 = paste(round(meanvpd, 2), '±', round(sdvpd, 2)),
         meanlike1 = paste(round(meanlikes, 2), '±', round(sdlikes, 2)),
         meanun1 = paste(round(meanun, 2), '±', round(sdun, 2)),
         meancpd1 = paste(round(meancpd, 2), '±', round(sdcpd, 2))) %>%
  dplyr::select(1, 2, 11:14)

kruskal.test(vpd ~ geo, data = geodata)
kruskal.test(lpd ~ geo, data = geodata)
kruskal.test(dpd ~ geo, data = geodata)
kruskal.test(cpd ~ geo, data = geodata)

#==============================================================================
# Graphical presentation
geosum <- geodata %>%
  ungroup() %>%
  dplyr::select(geo, vpd, lpd, cpd) %>%
  group_by(geo) %>%
  mutate(nv = length(vpd)) %>%
  gather(cates, cvalue, -c(geo, nv)) %>%
  group_by(geo, cates) %>%
  summarise(nv = log(mean(nv)),
            meancates = log(mean(cvalue, na.rm = TRUE)),
            sdvpd = log(sd(cvalue, na.rm = TRUE)/sqrt(length(cvalue)))) %>%
  mutate(cates = case_when(cates == 'vpd' ~ 'Views per day',
                           cates == 'lpd' ~ 'Likes per day',
                           TRUE ~ 'Comments per day'),
         cates = factor(cates, levels = c("Views per day", "Likes per day",
                                          "Comments per day")))

#==============================================================================
# Pie chart for country video posting
ctvideo <- geodata %>%
  group_by(geo) %>%
  summarise(ct = length(geo)) %>%
  mutate(ctry = case_when(ct > 10 ~ geo, TRUE ~ 'Others')) %>%
  group_by(ctry) %>%
  summarise(ct = sum(ct)) %>%
  mutate(per = prop.table(ct) * 100)

ggplot(data = ctvideo, aes(x = "", y = per, fill = ctry)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  scale_fill_viridis_d(direction = -1) +
  theme_void() +
  geom_text(aes(label = paste(ctry, '\n', round(per, 2), '%')),
            position = position_stack(vjust = 0.5),
            family = 'Cambria', color = 'black') +
  theme(text = element_text(family = 'Cambria', size = 11),
        legend.position = 'None')

#==============================================================================
bg <- theme(panel.background = element_blank(),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            legend.background = element_blank())

library(tidytext)

ggplot(geosum, aes(x = reorder_within(geo, meancates, cates),
                   y = meancates, fill = cates)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~cates, scales = 'free', ncol = 4) +
  theme_bw() + #bg+
  scale_y_continuous(expand = expansion(mult = c(0, 0.051))) +
  scale_fill_manual(values = c('grey20', 'grey50', 'grey80')) +
  geom_errorbar(aes(ymin = meancates, ymax = meancates + sdvpd), width = 0.5) +
  theme(#text = element_text(family = 'Cambria', size = 10),
        axis.text.y = element_text(hjust = 0),
        #axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = 'None') +
  #geom_text(aes(label= nv, color=nv), size= 3, hjust = -7.2)+
  scale_color_gradient(low = 'grey30', high = 'grey5') +
  labs(y = 'Mean views, likes, and comments (±SE)', x = 'Countries',
       color = 'Videos')

# New plot
ggplot(data = geosum, #%>% filter(cates=='Views per day'),
       aes(x = reorder_within(geo, meancates, cates), y = meancates,
           fill = cates, color = cates, shape = cates)) +
  geom_point() +
  coord_flip() +
  scale_x_reordered() +
  theme_bw() + bg +
  facet_wrap(~cates, scales = 'free', ncol = 4) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.051))) +
  geom_errorbar(aes(ymin = meancates - sdvpd, ymax = meancates + sdvpd),
                width = 0.5, linetype = 'solid') +
  theme(axis.text.y = element_text(hjust = 0),
        text = element_text(size = 13),
        legend.position = 'None') +
  ggpubr::stat_compare_means(label = "p.signif", method = "wilcox.test",
                             ref.group = ".all.") +
  scale_color_manual(values = c('darkblue', 'darkcyan', 'darkorange4')) +
  geom_hline(yintercept = 0, linetype = 'dotted', color = 'grey50',
             linewidth = 0.5) +
  labs(y = 'Logarithmic mean views, likes and comments per day (±SE)',
       x = 'Countries', color = 'Videos')

ggsave('reviews/vpdlpd1_new.png', dpi = 1000, width = 10.4, height = 6.4,
       units = 'in')
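#------------------------------------------------------------------------------
# Optional cross-check (not part of the original workflow): stat_compare_means()
# above only annotates the plot of country means, so the same country contrasts
# can be checked in tabular form on the raw per-video values with base R's
# pairwise Wilcoxon test. A minimal sketch; the Benjamini-Hochberg adjustment
# is an assumption.
pairwise.wilcox.test(geodata$vpd, geodata$geo, p.adjust.method = "BH")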
# Post-hoc Dunn test for country differences in likes per day
dn <- as.data.frame(dunn.test::dunn.test(geodata$lpd, geodata$geo,
                                         list = TRUE)) %>%
  filter(P <= 0.025)
xlsx::write.xlsx(dn, 'differencescountriesdpd.xlsx')

#==============================================================================
# Ecoregions
ecoregions <- st_read('reviews/Africa_Ecoregions/ecoregion_cleaned.shp',
                      quiet = TRUE)
ecoregions1 <- st_transform(ecoregions, crs = 4326)

centroids <- read.csv(file = 'reviews/centroids.csv') %>% select(-c(1, 3))

# Merge with country centroids and convert to point geometries
geocentroid <- as.data.frame(geodata) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  select(id, geo, vpd, cpd, lpd) %>%
  left_join(y = centroids %>% rename(geo = name), by = 'geo') %>%
  filter(!is.na(lat)) %>%
  sf::st_as_sf(coords = c('lon', 'lat'), crs = st_crs(4326))

# Spatial join: assign each video (via its country centroid) to an ecoregion
ecojoin <- st_join(geocentroid, st_make_valid(ecoregions1),
                   join = st_intersects)

# Videos and mean engagement per ecoregion, binned for mapping
vpbiom <- ecojoin %>%
  st_drop_geometry() %>%
  group_by(ECO_NAME) %>%
  summarise(nv = length(ECO_NAME),
            meanvpd = mean(vpd, na.rm = TRUE),
            meanlpd = mean(lpd, na.rm = TRUE),
            meancpd = mean(cpd, na.rm = TRUE)) %>%
  filter(!is.na(ECO_NAME)) %>%
  mutate(nv2 = case_when(nv > 0 & nv < 25 ~ '1-25',
                         nv >= 25 & nv < 50 ~ '25-50',
                         nv >= 50 & nv < 75 ~ '50-75',
                         nv >= 75 & nv < 100 ~ '75-100'),
         meanvp = case_when(meanvpd > 0 & meanvpd < 200 ~ '1-200',
                            meanvpd >= 200 & meanvpd < 400 ~ '200-400',
                            meanvpd >= 400 & meanvpd < 600 ~ '400-600',
                            meanvpd >= 600 & meanvpd < 800 ~ '600-800'),
         meanlp = case_when(meanlpd > 0 & meanlpd < 3 ~ '1-3',
                            meanlpd >= 3 & meanlpd < 6 ~ '3-6',
                            meanlpd >= 6 & meanlpd < 9 ~ '6-9',
                            meanlpd >= 9 & meanlpd < 12 ~ '9-12'),
         meancp = case_when(meancpd > 0 & meancpd < 0.4 ~ '0.1-0.4',
                            meancpd >= 0.4 & meancpd < 0.8 ~ '0.4-0.8',
                            meancpd >= 0.8 & meancpd < 1.4 ~ '0.8-1.4'))

# Rejoin the summaries onto the ecoregion polygons
ecof <- ecoregions1 %>% left_join(vpbiom, by = 'ECO_NAME') %>%
  replace(is.na(.), 0)

bg2 <- theme(legend.position.inside = c(0.2, 0.3),
             legend.position = 'inside',
             text = element_text(size = 10),
             legend.text = element_text(size = 10),
             plot.margin = unit(c(0, 0, 0, 0), units = 'cm'))

vpdd <- ggplot() +
  geom_sf(data = ecof, aes(fill = nv2)) +
  scale_fill_viridis_d() +
  theme_bw() + bg + bg2 +
  labs(x = 'Longitude', y = 'Latitude', fill = 'videos', title = '(A)')

mvpd <- ggplot() +
  geom_sf(data = ecof, aes(fill = meanvp)) +
  scale_fill_viridis_d() +
  theme_bw() + bg + bg2 +
  labs(x = 'Longitude', y = 'Latitude', fill = 'Mean vpd', title = '(B)')

mlpd <- ggplot() +
  geom_sf(data = ecof, aes(fill = meanlp)) +
  scale_fill_viridis_d() +
  theme_bw() + bg + bg2 +
  labs(x = 'Longitude', y = 'Latitude', fill = 'Mean lpd', title = '(C)')

mcpd <- ggplot() +
  geom_sf(data = ecof, aes(fill = meancp)) +
  scale_fill_viridis_d() +
  theme_bw() + bg + bg2 +
  labs(x = 'Longitude', y = 'Latitude', fill = 'Mean cpd', title = '(D)')

library(gridExtra)
gplots <- grid.arrange(vpdd, mvpd, mlpd, mcpd, nrow = 2)
ggsave(filename = "reviews/ecoregions.png", gplots, width = 6.5, height = 6.8,
       dpi = 600, units = 'in')
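#------------------------------------------------------------------------------
# Optional sanity checks (not part of the original workflow) for the spatial
# join above: confirm both layers share the same CRS and count how many country
# centroids fell outside every ecoregion polygon. A minimal sketch using only
# objects created above.
st_crs(geocentroid) == st_crs(ecoregions1)   # should be TRUE (both EPSG:4326)
sum(is.na(ecojoin$ECO_NAME))                 # centroids with no ecoregion match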
#==============================================================================
# Engagement across taxonomic groups
# Keep one record per title and lower taxon; per-taxon counts go into a
# separate object so the per-day metrics remain available for the tests below
taxa <- youtube %>%
  filter(biota %in% 'Yes') %>%
  group_by(title, lowertaxa) %>%
  filter(!row_number() == 2) %>%
  ungroup()

taxacount <- taxa %>% group_by(hightaxa) %>% summarise(ct = length(hightaxa))

taxasum <- taxa %>%
  group_by(hightaxa) %>%
  summarise(totvideo = length(vpd),
            meanvpd = mean(vpd),
            meanlikes = mean(lpd),
            meanun = mean(dpd),
            meancpd = mean(cpd, na.rm = TRUE),
            sdvpd = sd(vpd)/sqrt(length(vpd)),
            sdlikes = sd(lpd)/sqrt(length(vpd)),
            sdun = sd(dpd)/sqrt(length(vpd)),
            sdcpd = sd(cpd, na.rm = TRUE)/sqrt(length(cpd))) %>%
  mutate(meanvpd1 = paste(round(meanvpd, 2), '±', round(sdvpd, 2)),
         meanlike1 = paste(round(meanlikes, 2), '±', round(sdlikes, 2)),
         meanun1 = paste(round(meanun, 2), '±', round(sdun, 2)),
         meancpd1 = paste(round(meancpd, 2), '±', round(sdcpd, 2))) %>%
  dplyr::select(1, 2, 11:14)

xlsx::write.xlsx(taxasum, 'meanacrosstaxa.xlsx')

# Differences across higher taxa (Kruskal-Wallis)
kruskal.test(vpd ~ hightaxa, data = taxa)
kruskal.test(lpd ~ hightaxa, data = taxa)
kruskal.test(dpd ~ hightaxa, data = taxa)
kruskal.test(cpd ~ hightaxa, data = taxa)

dn <- as.data.frame(dunn.test::dunn.test(taxa$vpd, taxa$hightaxa,
                                         list = TRUE)) %>%
  filter(P <= 0.025)
write.csv(dn, 'reviews/differences.csv', row.names = FALSE)

#==============================================================================
# Specific (lower) taxa: keep taxa represented by more than one video
taxaspec <- taxa %>%
  group_by(lowertaxa) %>%
  mutate(ct = length(lowertaxa)) %>%
  filter(ct != 1, !is.na(lowertaxa)) #%>%
  #group_by(lowertaxa) %>% summarise(ct = length(lowertaxa))

write.csv(taxaspec, 'lowertaxacount.csv', row.names = FALSE)

taxasum <- taxaspec %>%
  group_by(lowertaxa) %>%
  summarise(totvideo = length(vpd),
            meanvpd = mean(vpd),
            meanlikes = mean(lpd),
            meanun = mean(dpd),
            meancpd = mean(cpd, na.rm = TRUE),
            sdvpd = sd(vpd)/sqrt(length(vpd)),
            sdlikes = sd(lpd)/sqrt(length(vpd)),
            sdun = sd(dpd)/sqrt(length(vpd)),
            sdcpd = sd(cpd, na.rm = TRUE)/sqrt(length(cpd)))

# Differences across lower taxa (Kruskal-Wallis)
kruskal.test(vpd ~ lowertaxa, data = taxaspec)
kruskal.test(lpd ~ lowertaxa, data = taxaspec)
kruskal.test(dpd ~ lowertaxa, data = taxaspec)
kruskal.test(cpd ~ lowertaxa, data = taxaspec)

dn <- as.data.frame(dunn.test::dunn.test(taxaspec$vpd, taxaspec$lowertaxa,
                                         list = TRUE)) %>%
  filter(P <= 0.025)
write.csv(dn, 'reviews/lowertaxadifferences.csv', row.names = FALSE)
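#------------------------------------------------------------------------------
# Optional addition (not part of the original workflow): the Kruskal-Wallis
# tests above report significance but not magnitude. rstatix::kruskal_effsize()
# returns the eta-squared effect size for the same comparison; a minimal sketch
# for views per day across lower taxa, assuming the rstatix package is
# installed (it is attached later in this script for the Dunn tests).
taxaspec %>% ungroup() %>% rstatix::kruskal_effsize(vpd ~ lowertaxa)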
#==============================================================================
# Graphical presentation of lower taxa
taxaplot <- taxaspec %>%
  dplyr::select(hightaxa, lowertaxa, vpd, lpd, cpd) %>%
  group_by(hightaxa) %>%
  mutate(nv = length(vpd)) %>%
  gather(cates, cvalue, -c(hightaxa, nv, lowertaxa)) %>%
  group_by(hightaxa, cates, lowertaxa) %>%
  summarise(nv = log(mean(nv)),
            meancates = log(mean(cvalue, na.rm = TRUE)),
            sdvpd = log(sd(cvalue, na.rm = TRUE)/sqrt(length(cvalue)))) %>%
  mutate(cates = case_when(cates == 'vpd' ~ 'Views per day',
                           cates == 'lpd' ~ 'Likes per day',
                           cates == 'dpd' ~ 'Dislikes per day',
                           TRUE ~ 'Comments per day'),
         cates = factor(cates, levels = c("Views per day", "Likes per day",
                                          "Comments per day")),
         taxa = case_when(hightaxa == 'Amphibians' ~ 'Amph',
                          TRUE ~ lowertaxa))

# Bar-chart version
ggplot(taxaplot, aes(x = reorder_within(lowertaxa, meancates, cates),
                     y = meancates, fill = cates)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  theme_bw() +
  scale_x_reordered() +
  facet_wrap(~cates, scales = 'free') +
  scale_fill_manual(values = c('grey20', 'grey50', 'grey80')) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.051))) +
  geom_errorbar(aes(ymin = meancates, ymax = meancates + sdvpd), width = 0.3) +
  theme(text = element_text(family = 'Cambria', size = 11),
        axis.text.y = element_text(hjust = 0),
        legend.position = 'None',
        #axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.y = element_text(size = 7)) +
  #geom_text(aes(label= nv, color=nv), size= 3.2, hjust = -7.2)+
  scale_color_gradient(low = 'grey30', high = 'grey5') +
  #scale_fill_viridis_d()+
  labs(y = 'Mean views, likes, and comments (±SE)',
       x = 'Lower taxonomic ranks')

# Dot-and-error-bar version
ggplot(taxaplot, aes(x = reorder_within(lowertaxa, meancates, cates),
                     y = meancates, fill = cates, colour = cates,
                     shape = cates)) +
  geom_point() +
  coord_flip() +
  theme_bw() + bg +
  scale_x_reordered() +
  facet_wrap(~cates, scales = 'free') +
  scale_y_continuous(expand = expansion(mult = c(0, 0.051))) +
  geom_errorbar(aes(ymin = meancates - sdvpd, ymax = meancates + sdvpd),
                width = 0.3) +
  theme(text = element_text(size = 13),
        axis.text.y = element_text(hjust = 0),
        legend.position = 'None') +
  scale_color_manual(values = c('darkblue', 'darkcyan', 'darkorange4')) +
  geom_hline(yintercept = 0, color = 'grey50', linetype = 'dotted') +
  labs(y = 'Logarithmic mean views, likes, and comments per day',
       x = 'Lower taxonomic ranks')

ggsave('reviews/subtaxa2.png', dpi = 1000, width = 10.4, height = 6.4,
       units = 'in')

#==============================================================================
# Question 1: Are governments harnessing the strength of YouTube in
# distributing information?
govdata <- ytube %>%
  group_by(category) %>%
  summarise(totvideo = length(vpd),
            meanvpd = mean(vpd),
            meanlikes = mean(lpd),
            meanun = mean(dpd),
            meancpd = mean(cpd, na.rm = TRUE),
            sdvpd = sd(vpd)/sqrt(length(vpd)),
            sdlikes = sd(lpd)/sqrt(length(vpd)),
            sdun = sd(dpd)/sqrt(length(vpd)),
            sdcpd = sd(cpd, na.rm = TRUE)/sqrt(length(cpd))) %>%
  mutate(meanvpd1 = paste(round(meanvpd, 2), '±', round(sdvpd, 2)),
         meanlike1 = paste(round(meanlikes, 2), '±', round(sdlikes, 2)),
         meanun1 = paste(round(meanun, 2), '±', round(sdun, 2)),
         meancpd1 = paste(round(meancpd, 2), '±', round(sdcpd, 2))) %>%
  dplyr::select(1, 2, 11:14)

xlsx::write.xlsx(govdata, 'government1.xlsx')

# Differences across channel categories (Kruskal-Wallis)
kruskal.test(vpd ~ category, ytube)
kruskal.test(lpd ~ category, ytube)
kruskal.test(dpd ~ category, ytube)
kruskal.test(cpd ~ category, ytube)

# Post-hoc Dunn tests of each engagement metric against channel category
library(rstatix)

govcol <- ytube %>% ungroup() %>% dplyr::select(cpd, vpd, lpd, dpd, category)

dtest <- list()
a <- colnames(govcol)
for (i in 1:4) {
  dtest[[i]] <- as.data.frame(
    dunn_test(govcol, as.formula(paste(a[i], a[5], sep = "~")),
              p.adjust.method = "BH"))
}
df <- do.call(cbind, dtest)
xlsx::write.xlsx(df, 'channelcategoriesdunntest.xlsx')

#==============================================================================
# Which words are used in the video titles?
titles <- iconv(ytube$title, to = "utf-8")
cclean <- sapply(titles, function(row) iconv(row, "latin1", "ASCII", sub = ""))

corpus <- Corpus(VectorSource(cclean))
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
ccorpus <- tm_map(corpus, removeWords, stopwords('english'))

# Harmonise word variants
ccorpus <- tm_map(ccorpus, gsub, pattern = "chines", replacement = "china")
ccorpus <- tm_map(ccorpus, gsub, pattern = "africas", replacement = "africa")
ccorpus <- tm_map(ccorpus, gsub, pattern = "african", replacement = "africa")
ccorpus <- tm_map(ccorpus, gsub, pattern = "extinction", replacement = "extinct")
ccorpus <- tm_map(ccorpus, gsub, pattern = "invasions", replacement = "invasive")
ccorpus <- tm_map(ccorpus, gsub, pattern = "invasion", replacement = "invasive")
ccorpus <- tm_map(ccorpus, gsub, pattern = "invading", replacement = "invasive")
ccorpus <- tm_map(ccorpus, gsub, pattern = "polluted", replacement = "pollution")
ccorpus <- tm_map(ccorpus, gsub, pattern = "polluting", replacement = "pollution")
ccorpus <- tm_map(ccorpus, gsub, pattern = "pollutionrelated", replacement = "pollution")
#ccorpus<- tm_map(ccorpus, gsub, pattern="killing",replacement="kill")
#ccorpus <- tm_map(ccorpus, stemDocument)

# Remove URLs and excess whitespace
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
ccorpus <- tm_map(ccorpus, content_transformer(removeURL))
ccorpus <- tm_map(ccorpus, stripWhitespace)

#==============================================================================
# Convert the unstructured corpus into a term-document matrix
tdm <- TermDocumentMatrix(ccorpus)
tdmatrix <- as.matrix(tdm)
words <- rowSums(tdmatrix)

# Keep words occurring at least five times, dropping uninformative terms
wfinal <- as.data.frame(subset(words, words >= 5)) %>%
  tibble::rownames_to_column("word") %>%
  filter(!word %in% c('africa', 'biodiversity', 'conservation', 'south'))
worddf <- setNames(wfinal, c('word', 'freq'))
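#------------------------------------------------------------------------------
# Optional inspection (not part of the original workflow): before plotting,
# the most frequent title terms can be listed directly from the term-document
# matrix. A minimal sketch using tm::findFreqTerms() and the counts computed
# above; the threshold of 10 is an arbitrary illustrative choice.
findFreqTerms(tdm, lowfreq = 10)            # terms appearing at least 10 times
head(sort(words, decreasing = TRUE), 10)    # top 10 terms with their counts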
set.seed(1144)

# Word cloud of frequent title terms (ggwordcloud version)
ggplot(worddf, aes(label = word, size = freq, color = freq)) +
  geom_text_wordcloud_area(family = "Calibri", shape = 'circle') +
  scale_size_area(max_size = 16) +
  theme_bw() +
  theme_minimal() +
  scale_colour_viridis_b(direction = 1) +
  theme(text = element_text(family = 'Helvetica', size = 11)) #+
  #ggtitle('Topical words')

ggsave('words.png', dpi = 600, width = 5.0, height = 4.3, units = 'in',
       bg = "white")

# Alternative word cloud (wordcloud package)
library(wordcloud)
set.seed(1234) # for reproducibility
wordcloud(words = worddf$word, freq = worddf$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.65,
          colors = brewer.pal(8, "Dark2"))

#==============================================================================
# Linear model (lm): which video attributes are associated with views?
y <- lm(views ~ likes + unlikes + subscriber + ddiff + comments + Duration +
          title_len, data = ytube)

table <- as.data.frame(summary(y)$coefficients) %>%
  tibble::rownames_to_column("parameters") %>%
  setNames(c('parameter', 'estimate', 'stderror', 'tvalue', 'pvalue')) %>%
  ITNr::round_df(digits = 3)

xlsx::write.xlsx(table, 'glmoutput.xlsx')

#==============================================================================
# Origin of the channels (map of channel counts by country)
odata <- ytube %>% group_by(origin) %>% summarise(cts = length(origin))

orign <- admbound %>%
  dplyr::select(SOVEREIGNT) %>%
  left_join(y = odata %>% rename(SOVEREIGNT = origin), by = 'SOVEREIGNT') %>%
  filter(!SOVEREIGNT == 'Antarctica') %>%
  replace(., is.na(.), 0) #%>%
  #group_by(SOVEREIGNT) %>% slice(which.max(cts))

ggplot(orign) +
  geom_sf(aes(fill = cts)) +
  coord_sf(expand = FALSE) +
  scale_fill_viridis_c(direction = 1, option = 'B') +
  theme_bw() + bg +
  theme(legend.position.inside = c(0.1, 0.3), legend.position = 'inside') +
  #theme(text = element_text(family = 'Cambria', size = 11))+
  #suppressWarnings(annotation_scale(aes(location = "bl", width_hint = 0.3))) +
  #annotation_north_arrow(location = "bl", which_north = "true",
  #                       pad_x = unit(0.0, "in"), pad_y = unit(0.2, "in"),
  #                       style = north_arrow_fancy_orienteering)+
  labs(y = 'Latitude', x = 'Longitude', fill = 'Channels')

dc <- tmaptools::get_asp_ratio(orign)
ggsave('reviews/locations.png', dpi = 1000, width = dc * 3, height = 4.1,
       units = 'in')
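#------------------------------------------------------------------------------
# Optional diagnostics (not part of the original workflow) for the linear
# model `y` fitted above: residual plots and variance inflation factors flag
# non-normal residuals and collinearity among predictors (likes, comments and
# subscribers are often strongly correlated). A minimal sketch, assuming the
# car package is installed.
par(mfrow = c(2, 2)); plot(y); par(mfrow = c(1, 1))  # residual diagnostics
car::vif(y)                                          # variance inflation factors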