R/auxiliary_functions.R

Defines functions .construct_smoothing_matrix .moving_average winsor swinsor_vector .align_smoothing_matrix GR2norm_df norm_df2GR .correct_merged

Documented in GR2norm_df norm_df2GR swinsor_vector winsor

### Auxiliary functions for package RNAprobR

#Auxiliary function speeding up calculations in functions operating on
#sliding windows.
#Returns a matrix with (length(input_vector) + window_size - 1) columns and
#window_size rows. Each row contains input vector values starting at column
#equal to row number.
.construct_smoothing_matrix <- function(input_vector, window_size){

    vector_length <- length(input_vector)
    #Create empty matrix.
    my_mat <- matrix(nrow=window_size, ncol=(vector_length+window_size-1))
    #Fill the matrix, offsetting by 1 in each cycle.
    for(i in 1:(window_size))
    {
        my_mat[i,i:(vector_length+i-1)] <- input_vector
    }

    my_mat
}

#Moving average calculation. NA values ignored.
#Rreturns a numeric vector. Each position represents the mean of values in
#window_size positions centred at this position.
.moving_average <- function(input_vector, window_size)
{
    window_side <- window_size/2-0.5

    colMeans(.construct_smoothing_matrix(input_vector, window_size),
        na.rm=TRUE)[(window_side+1):(length(input_vector)+window_side)]
}

#' Winsor normalization with fitting to <0,1> range.
#'
#' Function performs Winsor normalization of a supplied vector. Steps:
#' 1. Calcualate top winsor value [(1+winsor_level)/2 quantile], and bottom
#' winsor value ((1-winsor_level)/2 quantile)
#' 2. Each value below bottom winsor value set to bottom winsor value; each
#' value above top winsor value set to top winsor value
#' 3. Transform linearly all the values to [0,1] range
#'
#' @param input_vector Vector with values to be Winsorized
#' @param winsor_level Winsorization level. Bottom outliers will be set to
#' (1-winsor_level)/2 quantile and top outliers to (1+winsor_level)/2 quantile.
#' @param only_top If TRUE then bottom values are not Winsorized and the lowest
#' is set to 0.
#' @return Vector of numerics within <0,1>.
#' @author Lukasz Jan Kielpinski
#' @references Hastings, Cecil; Mosteller, Frederick; Tukey, John W.; Winsor,
#' Charles P. Low Moments for Small Samples: A Comparative Study of Order
#' Statistics. The Annals of Mathematical Statistics 18 (1947), no. 3,
#' 413--426.
#' @keywords ~winsorising
#' @examples
#'
#' data_set <- runif(1:100)*100
#' plot(winsor(data_set, winsor_level=0.8) ~ data_set)
#'
#' @export winsor
winsor <- function(input_vector, winsor_level=0.9, only_top= FALSE)
{
    #!!! Function changes integers to numerics!
    bounds <- quantile(input_vector, c((1-winsor_level)/2,
        1-(1-winsor_level)/2), names= FALSE, na.rm=TRUE)

    if(only_top)
        bounds[1] <- 0
    input_vector[input_vector < bounds[1]] <- bounds[1]
    input_vector[input_vector > bounds[2]] <- bounds[2]

    if((bounds[2]-bounds[1])>0)
        input_vector <- (input_vector-bounds[1])/(bounds[2]-bounds[1])
    else
        input_vector[1:length(input_vector)] <- NA

    input_vector
}

#' Smooth Winsor Normalization
#'
#' Function performs Winsor normalization (see winsor() function) of each
#' window of specified window_size, sliding in a given vector by 1 position,
#' and reports a list of (1) mean Winsorized values for each vector position
#' (mean of Winsorized value for a given position as calculated within each
#' overlapping window) and (2) standard deviation of those Winsorized values.
#'
#' @param input_vector Vector with values to be smooth-Winsorized
#' @param window_size Size of a sliding window.
#' @param winsor_level Winsorization level. Bottom outliers will be set to
#' (1-winsor_level)/2 quantile and top outliers to (1+winsor_level)/2 quantile.
#' @param only_top If TRUE then bottom values are not Winsorized and are set to
#' 0.
#' @return \item{comp1}{Vector with mean Winsorized values for each
#' input_vector position} \item{comp2}{Vector with standard deviation of
#' Winsorized values for each input_vector position}
#' @author Lukasz Jan Kielpinski
#' @references "Analysis of sequencing based RNA structure probing data"
#' Kielpinski, Sidiropoulos, Vinther. Chapter in "Methods in Enzymology"
#' (in preparation)
#' @examples
#'
#' data_set <- runif(1:100)*100
#' plot(swinsor_vector(data_set, window_size=71,
#'                     winsor_level=0.8)[[1]] ~ data_set)
#'
#' @export swinsor_vector
swinsor_vector <- function(input_vector, window_size, winsor_level=0.9,
    only_top= FALSE)
{
    my_matrix <- .construct_smoothing_matrix(input_vector, window_size)
    winsorized_matrix <- apply(my_matrix, 2, winsor, winsor_level=winsor_level,
        only_top=only_top)

    #Control the border behaviour
    winsorized_matrix[,c(1:(window_size-1),
        (length(input_vector)+1):ncol(winsorized_matrix))] <- NA

    aligned_winsorized_matrix <- .align_smoothing_matrix(winsorized_matrix)
    means_vector <- colMeans(aligned_winsorized_matrix, na.rm=TRUE)
    sds_vector <- apply(aligned_winsorized_matrix, 2, FUN=sd, na.rm=TRUE)

    list(means_vector, sds_vector)
}

#Define function for aligning matrix generated by construct_smoothing_matrix
.align_smoothing_matrix <- function(input_matrix)
{
    #Length of the vector used to construct the matrix in
    #construct_smoothing_matrix function.
    vector_length <- ncol(input_matrix)-nrow(input_matrix)+1

    #Create empty matrix.
    my_mat <- matrix(nrow=nrow(input_matrix), ncol=vector_length)

    #Fill the matrix, offseting by -1 in each cycle.
    for(i in 1:nrow(input_matrix))
        my_mat[i,] <- input_matrix[i, i:(vector_length + i - 1)]

    my_mat
}


#' Export normalized GRanges object to data frame
#'
#' Function to make data frame out of GRanges output of normalizing functions
#' (dtcr(), slograt(), swinsor(), compdata()) for all or a set of chosen
#' transcripts in the file.
#'
#' @param norm_GR GRanges object made by other normalization function (dtcr(),
#' slograt(), swinsor(), compdata()) from which data is to be extracted
#' @param RNAid Transcript identifiers of transcripts that are to be extracted
#' @param norm_methods Names of the columns to be extracted.
#' @return Data frame object with columns: RNAid, Pos and desired metadata
#' columns (e.g. nt, dtcr)
#' @author Lukasz Jan Kielpinski, Nikos Sidiropoulos
#' @seealso \code{\link{norm_df2GR}}, \code{\link{dtcr}},
#' \code{\link{swinsor}}, \code{\link{slograt}}, \code{\link{compdata}}
#' @examples
#'
#' dummy_euc_GR_treated <- GRanges(seqnames="DummyRNA",
#'                                 IRanges(start=round(runif(100)*100),
#'                                 width=round(runif(100)*100+1)), strand="+",
#'                                 EUC=round(runif(100)*100))
#' dummy_comp_GR_treated <- comp(dummy_euc_GR_treated)
#' dummy_swinsor <- swinsor(dummy_comp_GR_treated)
#' GR2norm_df(dummy_swinsor)
#'
#' @import GenomicRanges
#' @export GR2norm_df
GR2norm_df <- function(norm_GR, RNAid="all", norm_methods="all")
{

    if(norm_methods=="all" || missing(norm_methods))
        norm_methods <- names(mcols(norm_GR))

    if(RNAid=="all" || missing(RNAid))
        RNAid <- levels(seqnames(norm_GR))

    #Parsing normalized file based on the selected RNAid(s) and normalization
    #method(s).
    norm_GR <- norm_GR[seqnames(norm_GR) %in% RNAid, norm_methods]
    data.frame(RNAid=as.character(seqnames(norm_GR)),
               Pos=as.integer(start(norm_GR)), mcols(norm_GR))
}



#' Function to convert norm_df data frame (product of GR2norm_df()) to GRanges.
#'
#' @param norm_df norm_df data frame needs to have columns: RNAid (equivalent
#' to seqnames in GRanges) and Pos (equivalent to start in GRanges) and
#' metadata
#' @return GRanges compatible with objects created by normalizing functions
#' (dtcr(), slograt(), swinsor(), compdata())
#' @author Lukasz Jan Kielpinski
#' @seealso \code{\link{dtcr}}, \code{\link{slograt}}, \code{\link{swinsor}},
#' \code{\link{compdata}}, \code{\link{GR2norm_df}}, \code{\link{norm2bedgraph}}
#' @examples
#'
#' dummy_norm_df <- data.frame(RNAid="dummyRNA", Pos=1:100,
#'                             my_data1=runif(1:100))
#' norm_df2GR(dummy_norm_df)
#'
#' @export norm_df2GR
norm_df2GR <- function(norm_df)
{
    GRanges(seqnames=norm_df$RNAid, IRanges(start=norm_df$Pos, width=1),
        strand="+", norm_df[names(norm_df)!="RNAid" & names(norm_df)!="Pos"])
}


#Special function to repair merged Comp_df data frames if there is gap between
#merged nucleotides:
.correct_merged <- function(oneRNA_compmerg){
    df_gaps <- diff(oneRNA_compmerg$Pos)
    rnaid_column <- which(names(oneRNA_compmerg)=="RNAid")
    oneRNA_out <- oneRNA_compmerg[c(rep(1:length(df_gaps), df_gaps),
                                    nrow(oneRNA_compmerg)),]
    oneRNA_out[duplicated(rep(1:length(df_gaps), df_gaps)),
               (1:ncol(oneRNA_out))[-rnaid_column]] <- NA
    oneRNA_out$Pos <- oneRNA_compmerg$Pos[1]:(oneRNA_compmerg$Pos[1] +
                                                  nrow(oneRNA_out)-1)
    oneRNA_out$nt[is.na(oneRNA_out$nt)] <- "N"
    oneRNA_out[is.na(oneRNA_out)] <- 0

    oneRNA_out
}
sidiropoulos/RNAprobR-old documentation built on May 29, 2019, 9:58 p.m.