# Article Code

####################
# Upload libraries #
####################

library(data.table)
library(dplyr)
library(ggplot2)
library(Matrix)
library(arules)

###################
# Upload datasets #
###################

# 1) Initial dataset: (vacancy ID, processed skill name) [2-column] -> `vac_skill`
vac_skill <- fread('vacancy_skill.csv')

# 2) Processed soft skills with synonyms -> the set of soft skills (`softs`)
sh_soft_clusters <- fread('sh_soft_clusters.csv', sep = ';')
softs <- unique(c(sh_soft_clusters$ETALON, 'Time management'))
softs <- unique(c(softs, 'English', 'Agile', 'Scrum', 'Kanban'))
softs <- softs[which(softs != 'remote work')]

# 3) Jaccard matrix between skills (based on their sets of vacancy IDs) -> `mat`
mat <- fread('jaccard_matrix.csv')
mat <- as.matrix(mat, rownames = 1)
mat <- as(mat, "dgCMatrix")

####################################################
# Computation of Jaccard Matrix `mat` (do not run) #
####################################################

# Pairs of skills for the Jaccard computation [used to detect hard-soft combinations]
giv_com <- data.frame(t(combn(unique(vac_skill$lv), 2)), stringsAsFactors = F)

# MinHash (adapted from https://github.com/chrisjmccormick/MinHash/blob/master/runMinHashExample.py)
maxShingleID = 2^32 - 1
nextPrime = 4294967311

pickRandomCoeffs <- function(k){
  # Create a list of 'k' unique random values.
  randList = c()
  while (k > 0){
    # Get a random shingle ID.
    randIndex = sample.int(maxShingleID, 1)
    # Ensure that each random number is unique.
    while (randIndex %in% randList){
      randIndex = sample.int(maxShingleID, 1)
    }
    # Add the random number to the list.
    randList <- c(randList, randIndex)
    k = k - 1
  }
  randList
}

# Use 100 hash functions
numHashes = 100
coeffA = pickRandomCoeffs(numHashes)
coeffB = pickRandomCoeffs(numHashes)

signatures <- c()
for (docID in 1:length(unique(vac_skill$lv))){
  # Get the shingle set (vacancy IDs) for this document (skill).
  shingleIDSet = unique(vac_skill$vacid[which(vac_skill$lv %in% unique(vac_skill$lv)[docID])])
  # The resulting MinHash signature for this document.
  signature = c()
  # For each of the random hash functions...
  for (i in 1:numHashes){
    # For each of the shingles actually in the document, calculate its hash code
    # using hash function 'i'.
    # Track the lowest hash ID seen. Initialize 'minHashCode' to be greater than
    # the maximum possible value output by the hash.
    minHashCode = nextPrime + 1
    # For each shingle in the document...
    for (shingleID in 1:length(shingleIDSet)){
      # Evaluate the hash function.
      hashCode = (coeffA[i] * shingleIDSet[shingleID] + coeffB[i]) %% nextPrime
      # Track the lowest hash code seen.
      if (hashCode < minHashCode){
        minHashCode = hashCode
      }
    }
    # Add the smallest hash code value as component number 'i' of the signature.
    signature <- c(signature, minHashCode)
  }
  # Store the MinHash signature for this document.
  signatures <- c(signatures, signature)
}

# Signature matrix: one row per skill, one column per hash function
sigmat <- matrix(signatures, ncol = numHashes, byrow = T)
giv_nums <- data.frame(t(combn(1:length(unique(vac_skill$lv)), 2)), stringsAsFactors = F)

# Estimate Jaccard similarity from the MinHash signatures
# (number of matching signature components out of `numHashes`)
giv_jac <- apply(giv_nums, 1, function(x) sum(sigmat[x[1],] == sigmat[x[2],]))
giv_com$jac <- giv_jac
giv_com <- giv_com[which(giv_com$jac > 0),]
giv_com <- data.table(giv_com, stringsAsFactors = F)

# Assemble the symmetric Jaccard similarity matrix `mat`
mat <- giv_com %>%
  mutate_at(1:2, factor, levels = unique(c(.$X1, .$X2))) %>%
  xtabs(jac ~ X1 + X2, data = ., sparse = TRUE)
mat <- mat[order(rownames(mat)), order(rownames(mat))]
mat <- mat + t(mat)
mat <- mat/100
mat <- mat + Diagonal(nrow(mat))
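# Optional sanity check (not part of the article pipeline): the MinHash estimate
# above approximates the Jaccard similarity of two skills' vacancy-ID sets by the
# share of matching signature components. The helper below is our own illustrative
# sketch (assuming `vacid` holds atomic vacancy IDs, as in the loop above); it
# computes the exact Jaccard similarity for one pair of skills and can be used to
# spot-check individual entries of `mat`.
exact_jaccard <- function(skill_a, skill_b, d = vac_skill){
  # Vacancy-ID sets of the two skills
  a <- unique(d$vacid[which(d$lv == skill_a)])
  b <- unique(d$vacid[which(d$lv == skill_b)])
  # |A intersect B| / |A union B|
  length(intersect(a, b)) / length(union(a, b))
}
# Example call (skill names are placeholders):
# exact_jaccard('SQL', 'Databases')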
##############################
# Job occupations clustering #
##############################

# Prepare the matrix with `hard` skills only (drop soft skills and their synonyms)
matnew <- mat[which(!colnames(mat) %in% union(softs, sh_soft_clusters$V1)),
              which(!colnames(mat) %in% union(softs, sh_soft_clusters$V1))]

# Exclude `hard` skills that do not co-occur with any `soft` skill
matnew2 <- mat[which(!colnames(mat) %in% union(softs, sh_soft_clusters$V1)),
               which(colnames(mat) %in% union(softs, sh_soft_clusters$V1))]
matnew <- matnew[as.numeric(which(apply(matnew2, 1, sum) > 0)),
                 as.numeric(which(apply(matnew2, 1, sum) > 0))]

# Final hierarchical clustering; the number of clusters (10) and the minimum
# cluster size (28) were chosen empirically
k = 1
while(k > 0){
  # Re-cluster, then check whether any cluster is too small
  ij_clus <- hclust(as.dist(1 - matnew), method = 'mcquitty')
  ij_done <- cutree(ij_clus, k = 10)
  k <- length(which(table(ij_done) < 28))
  if (k > 0){
    # Drop skills that fall into clusters with fewer than 28 elements
    matnew <- matnew[-which(ij_done %in% as.numeric(which(table(ij_done) < 28))),
                     -which(ij_done %in% as.numeric(which(table(ij_done) < 28)))]
  }
}

# Obtained:
# `ij_done`: (skill name, cluster number), 10 clusters
# `matnew`: processed Jaccard matrix for `hard` skills (644 `hard` skills after processing)
# The interpretation of the clusters is provided in the paper
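# Optional inspection (not part of the article pipeline): cluster sizes and a few
# example skills per cluster, using the objects produced above.
table(ij_done)                                    # number of skills per cluster
lapply(split(names(ij_done), ij_done), head, 5)   # sample skills from each cluster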
####################################################
# Manual post-processing of obtained `hard` skills #
####################################################

# Store ambiguous (too general) `hard` skills manually
ambig_sk <- c(
  c('Databases', 'Framework'),
  c('automation', 'crossover', 'Development', 'programming', 'Software Development', 'Testing'),
  c('business analyst', 'business processes', 'corporate culture', 'Digital', 'executive', 'logistics',
    'payroll', 'payroll and hr', 'president', 'processing', 'product', 'production', 'Research',
    'senior', 'the company', 'vice', 'warehouse'),
  c('business', 'correspondence', 'customer', 'database', 'email', 'Management', 'monitoring',
    'orders', 'pc skills', 'calls', 'procurement', 'purchase', 'support', 'Trading', 'documentation'),
  c('Adobe', 'Adobe After Effect', 'Artist', 'Design', 'graphics', 'photo', 'writing', 'writing articles'),
  c('Amazon', 'architect', 'developer', 'equipment', 'Hardware', 'Information Technology',
    'maintenance', 'network', 'remote', 'repair', 'server', 'software', 'Web Services'),
  c('content', 'domains', 'email marketing', 'E-Mail Marketing', 'google', 'loyalty programs', 'media',
    'newsletter', 'optimization', 'planning', 'production of commercials', 'project', 'search', 'site',
    'Social Media Marketing', 'the semantic core', 'yandex', 'social network', 'audit', 'Advertising'),
  c('analytics', 'Data Science', 'Data Scientist', 'Intermediate', 'marketing', 'Modeling', 'Usability'),
  c('Backend', 'Frontend', 'fullstack', 'highload', 'web', 'analytical studies'),
  c('control', 'distribution', 'Engineering', 'Specifications'),
  c('literacy', 'foreign language'))

# Generate the set of `hard` skills (`hards`)
hards <- colnames(matnew)[which(!colnames(matnew) %in% ambig_sk)]

#####################
# Association rules #
#####################

# Prepare dataset with (vacancy ID, set of processed skills) [2-column] -> `sh_as`
sh_as <- vac_skill[, .(vacid = unlist(vacid)), by = .(lv)]
sh_as <- sh_as[which(sh_as$lv %in% c(softs, hards)),]
sh_as <- sh_as[, .(lvs = list(unique(lv))), by = vacid]
sh_as <- sh_as[which(unlist(lapply(sh_as$lvs, length)) > 1),]

# Collect association rules (thresholds were set after empirical computations) [14768 rules]
rules <- apriori(data = sh_as$lvs,
                 parameter = list(supp = 0.0005, conf = 0.001, minlen = 2, maxlen = 30),
                 appearance = list(default = "lhs", lhs = hards,
                                   rhs = intersect(softs, unlist(sh_as$lvs))),
                 control = list(verbose = F))

# Convert rules to data.frame -> `ruledf`
ruledf <- data.frame(
  lhs = labels(lhs(rules)),
  rhs = labels(rhs(rules)),
  rules@quality)
ruledf$lhs <- gsub('(\\{|\\})', '', ruledf$lhs)
ruledf$rhs <- gsub('(\\{|\\})', '', ruledf$rhs)

# For each rule, check whether all of its `hard` skills (lhs) belong to the same
# cluster; store that cluster number, or zero otherwise -> `gr_cl`
ruledf2 <- ruledf
gr_cl <- c()
clnew2 <- stack(lapply(ij_done, as.character))
clnew2 <- transform(clnew2, ind = as.character(clnew2$ind))
for (i in 1:length(ruledf2$lhs)){
  rultj <- strsplit(ruledf2$lhs[i], ',')[[1]]
  if (length(unique(clnew2$values[which(clnew2$ind %in% rultj)])) == 1){
    gr_cl <- c(gr_cl, unique(clnew2$values[which(clnew2$ind %in% rultj)]))
  } else{
    gr_cl <- c(gr_cl, 0)
  }
}

# Assign cluster numbers to the data.frame with rules
ruledf2$gr <- gr_cl

# Merge the two Big Data & ML clusters (8 and 9), based on the elements inside them
ruledf2$gr[which(ruledf2$gr == '9')] <- '8'

# Eliminate rules whose `hard` skills come from different clusters
ruledf2 <- data.table(ruledf2)
ruledf2 <- ruledf2[which(ruledf2$gr != 0),]

############################
# Generate confidence Grid #
############################

# Manually assign occupation labels to cluster numbers (`labs`);
# aggregate rules by (soft skill, cluster) -> `grid`
labs <- data.table(gr = c('1','2','3','4','5','6','7','8','9','10'),
                   gr1 = c('Databases', 'Testing', 'Analytics', 'Support', 'WebDev',
                           'Hardware', 'SEO', 'Big Data & ML', 'Big Data & ML', 'Engineering'))
grid <- ruledf2[, .(sup = mean(support), conf = mean(confidence), lift = mean(lift)), by = c('rhs', 'gr')]
grid <- transform(grid, gr = as.character(gr))
grid <- merge(grid, labs)

# Set up graphical properties and plot the Grid
grid <- grid[grid$lift > 1]  # keep only rules with lift greater than 1
grid$rhs <- tools::toTitleCase(grid$rhs)
grid$radius <- grid$conf*5
range01 <- function(x){(x - min(x) + 1)/(2 - min(x))}

ggplot(grid, aes(rhs, gr1)) + xlab("") + ylab("") +
  geom_point(aes(size = range01(grid$conf)*13, fill = conf), shape = 21) + #,fill="white")+#,colour = grid$lift*100)+
  scale_fill_gradient(low = "white", high = "grey55") +
  geom_text(aes(label = round(conf*100, 0)), size = range01(grid$conf)*7, family = "CMU Serif") +
  scale_y_discrete(limits = rev(levels(factor(grid$gr1)))) +
  scale_size_identity() +
  theme(panel.grid.major = element_line(linetype = 3, color = "grey50"),
        legend.position = "none",
        panel.background = element_rect(fill = "white", colour = "white"),
        axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 12, colour = 'black'),
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 12, colour = 'black'),
        text = element_text(family = "CMU Serif"))

# End
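# Optional addendum (not part of the article code): persist the outputs.
# The file names and figure dimensions below are illustrative assumptions.
# fwrite(grid, 'confidence_grid.csv')
# ggsave('confidence_grid.pdf', width = 10, height = 6)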