#' pca: A principal component analysis function
#'
#' This is a flexible PCA function that can be run on a standard data frame.
#' It is a wrapper for prcomp/ggplot2 code and can be customised with different colours and font sizes
#' and more.
#'
#' @param mydata Data frame or matrix: if dataframe/matrix should have samples as columns and rows as features
#' @param printres Logical flag: whether to print the PCA into current directory
#' @param labels Character vector: if we want to just label with gender for example
#' @param text Character vector: if we wanted to label the samples with text IDs to look for outliers
#' @param axistextsize Numerical value: axis text size
#' @param legendtextsize Numerical value: legend text size
#' @param dotsize Numerical value: dot size
#' @param textlabelsize Numerical value: text inside plot label size
#' @param legendtitle Character vector: text legend title
#' @param controlscale Logical flag: whether to control the colour scale
#' @param scale Numerical value: 1=spectral palette, 2=manual low and high palette, 3=categorical labels
#' @param low Character vector: continuous scale low colour
#' @param high Character vector: continuous scale high colour
#' @param colvec Character vector: a series of colours in vector for categorical labels, e.g. c("sky blue", "gold")
#' @param printheight Numerical value: png height (default=20)
#' @param printwidth Numerical value: png width (default=22)
#' @param pcx Numerical value: which PC to plot on X axis (default=1)
#' @param pcy Numerical value: which PC to plot on Y axis (default=2)
#' @param scaler Logical flag: whether to scale the features of the input data (rows) (default=FALSE)
#'
#' @return A PCA plot object
#' @export
#'
#' @examples
#' PCA <- pca(mydata)
pca <- function(mydata, printres = FALSE, labels = FALSE, text = FALSE, axistextsize = 18,
legendtextsize = 18, dotsize = 5, textlabelsize = 4, legendtitle = 'Group',
controlscale = FALSE, scale = 1, low = 'grey', high = 'red',
colvec = c("skyblue", "gold", "violet", "darkorchid", "slateblue", "forestgreen",
"violetred", "orange", "midnightblue", "grey31", "black"),
printheight = 20, printwidth = 22, pcx=1, pcy=2, scaler=FALSE){
## basic error handling
if ( controlscale == TRUE && class(labels) %in% c( "character", "factor") && scale %in% c(1,2) ) {
stop("when categorical labels, use scale=3")
}
if ( controlscale == TRUE && class(labels) %in% c( "numeric") && scale %in% c(3) ) {
stop("when continuous labels, use scale=1 or scale=2")
}
if ( controlscale == FALSE && scale %in% c(2,3) ) {
warning("if your trying to control the scale, please set controlscale=TRUE")
}
if (sum(is.na(labels)) > 0 && class(labels) %in% c('character','factor')){
warning("there is NA values in the labels vector, setting to unknown")
labels <- as.character(labels)
labels[is.na(labels)] <- 'Unknown'
}
if (sum(is.na(text)) > 0 && class(text) %in% c('character','factor')){
warning("there is NA values in the text vector, setting to unknown")
text <- as.character(text)
text[is.na(text)] <- 'Unknown'
}
##
message('***PCA wrapper function***')
message('running...')
##
if (scaler){
mydata <- data.frame(t(scale(t(mydata))))
}
PC1 <- pcx
PC2 <- pcy
##
if (labels[1] == FALSE && text[1] == FALSE){
pca1 = prcomp(t(mydata))
scores <- data.frame(pca1$x) # PC score matrix
eigs <- pca1$sdev^2
variance_percentage <- (eigs / sum(eigs))*100
pc1var <- round(variance_percentage[PC1],digits=0)
pc2var <- round(variance_percentage[PC2],digits=0)
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy]) ) + geom_point(colour='skyblue', size = dotsize) +
theme_bw() +
theme(legend.position = "none", panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize)) +
scale_colour_manual(values = colvec) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
if (printres == TRUE){
message('printing PCA to current directory...')
png('PCA.png', height = printheight, width = printwidth, units = 'cm',
res = 900, type = 'cairo')
print(p) # print ggplot CDF in main plotting window
dev.off()
}
}else if (labels[1] != FALSE && text[1] == FALSE){ #### KEY
pca1 = prcomp(t(mydata))
scores <- data.frame(pca1$x) # PC score matrix
eigs <- pca1$sdev^2
variance_percentage <- (eigs / sum(eigs))*100
pc1var <- round(variance_percentage[PC1],digits=0)
pc2var <- round(variance_percentage[PC2],digits=0)
if (controlscale == TRUE){
if (scale == 1){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy]) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) + scale_colour_distiller(palette = "Spectral") +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
#scale_colour_gradient(low="red", high="white")
}else if (scale == 2){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy]) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) + #scale_colour_distiller(palette = "Spectral")
scale_colour_gradient(low=low, high=high) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}else if (scale == 3){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy]) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) +
scale_colour_manual(values = colvec) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}
}else{
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy]) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}
if (printres == TRUE){
message('printing PCA to current directory...')
png('PCAlabeled.png', height = printheight, width = printwidth, units = 'cm',
res = 900, type = 'cairo')
print(p) # print ggplot CDF in main plotting window
dev.off()
}
}else if (labels[1] != FALSE && text[1] != FALSE){ ##### KEY
pca1 = prcomp(t(mydata))
scores <- data.frame(pca1$x) # PC score matrix
scores$label <- text
eigs <- pca1$sdev^2
variance_percentage <- (eigs / sum(eigs))*100
pc1var <- round(variance_percentage[PC1],digits=0)
pc2var <- round(variance_percentage[PC2],digits=0)
if (controlscale == TRUE){
if (scale == 1){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy], label = label) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) + scale_colour_distiller(palette = "Spectral")+
geom_text(vjust="inward",hjust="inward",size=textlabelsize) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
#scale_colour_gradient(low="red", high="white")
}else if (scale == 2){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy], label = label) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) + #scale_colour_distiller(palette = "Spectral")
scale_colour_gradient(low=low, high=high)+
geom_text(vjust="inward",hjust="inward",size=textlabelsize) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}else if (scale == 3){
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy], label = label) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) +
scale_colour_manual(values = colvec)+
geom_text(vjust="inward",hjust="inward",size=textlabelsize) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}
}else{
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy], label = label) ) + geom_point(aes(colour = labels), size = dotsize) +
theme_bw() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize),
legend.title = element_text(size = legendtextsize),
legend.text = element_text(size = legendtextsize)) +
#guides(colour=guide_legend(title=legendtitle)) +
labs(colour = legendtitle) +
geom_text(vjust="inward",hjust="inward",size=textlabelsize) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
}
if (printres == TRUE){
message('printing PCA to current directory...')
png('PCAlabeled.png', height = printheight, width = printwidth, units = 'cm',
res = 900, type = 'cairo')
print(p) # print ggplot CDF in main plotting window
dev.off()
}
}else if (labels[1] == FALSE && text[1] != FALSE){
pca1 = prcomp(t(mydata))
scores <- data.frame(pca1$x) # PC score matrix
scores$label <- text
eigs <- pca1$sdev^2
variance_percentage <- (eigs / sum(eigs))*100
pc1var <- round(variance_percentage[PC1],digits=0)
pc2var <- round(variance_percentage[PC2],digits=0)
p <- ggplot(data = scores, aes(x = scores[,pcx], y = scores[,pcy], label = label) ) +
geom_point(aes(colour = factor(rep(1, ncol(mydata)))), size = dotsize) +
theme_bw() +
theme(legend.position = "none", panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
axis.text.y = element_text(size = axistextsize, colour = 'black'),
axis.text.x = element_text(size = axistextsize, colour = 'black'),
axis.title.x = element_text(size = axistextsize),
axis.title.y = element_text(size = axistextsize)) +
scale_colour_manual(values = colvec) +
geom_text(vjust="inward",hjust="inward",size=textlabelsize) +
xlab(paste('PC',PC1,' (',pc1var,'% variance)',sep='')) +
ylab(paste('PC',PC2,' (',pc2var,'% variance)',sep=''))
if (printres == TRUE){
message('printing PCA to current directory...')
png('PCA.png', height = printheight, width = printwidth, units = 'cm',
res = 900, type = 'cairo')
print(p) # print ggplot CDF in main plotting window
dev.off()
}
}else{
message('no valid options detected')
}
message('done.')
return(p)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.