#' preprocessInput_sv
#'
#' Annotating the mutation data with necessary fields for further analysis
#' @param input_data Table describing somatic structural rearrangements.
#' @param ensgene Gene table for annotations. A table of Ensembl genes is provided with the package.
#' @param resdir Results directory where graphical outputs should be exported.
#'
#' @export
#'
preprocessInput_sv = function (input_data = NULL, Sample.col = "Sample", CHROM_1.col = "CHROM_1",
CHROM_2.col = "CHROM_2", POS_1.col = "POS_1", POS_2.col = "POS_2",
type.col = "Type",resdir = resdir, genome_build = "hg19")
{
packs <- c("NMF", "SomaticSignatures")
if ("NMF" %in% ls() | "SomaticSignatures" %in% ls()) {
detach("package:NMF", unload = TRUE)
detach("package:SomaticSignatures", unload = TRUE)
}
if(genome_build == "hg19") ensgene <- ensgene_hg19
if(genome_build == "hg38") ensgene <- ensgene_hg38
chroms <- unique(input_data[, CHROM_1.col])
if (1 %in% chroms == TRUE) {
input_data[, CHROM_1.col] <- paste("chr", input_data[,
CHROM_1.col], sep = "")
input_data[, CHROM_2.col] <- paste("chr", input_data[,
CHROM_2.col], sep = "")
}
input_data <- input_data[mixedorder(input_data[, CHROM_1.col]),
]
input_data$strand.mut <- "+"
input_data$strand.gene <- NA
print("Annotating mutation data:")
vcf <- palimpsest_dfPosXSegm_2(input_data, dfPos.chrom.col = CHROM_1.col,
dfPos.pos.col = POS_1.col, ensgene, dfSegm.chrom.col = "Chromosome.Name",
dfSegm.start.col = "Gene.Start..bp.", dfSegm.end.col = "Gene.End..bp.",
colsToAdd = "Associated.Gene.Name", namesColsToAdd = "Associated.Gene.Name")
vcf$strand.gene <- c("-", NA, "+")[ensgene[match(vcf$Associated.Gene.Name,
ensgene$Associated.Gene.Name), "Strand"] + 2]
vcf <- palimpsest_addSVcategoriesToVcf(vcf, type.col = type.col,
sample.col = Sample.col, CHROM_1.col = CHROM_1.col,
CHROM_2.col = CHROM_2.col, POS_1.col = POS_1.col, POS_2.col = POS_2.col,
resdir = resdir, genome_build = genome_build)
return(vcf)
}
#' palimpsest_dfPosXSegm_2
#'
#' Annotating the mutation data with necessary fields for further analysis
palimpsest_dfPosXSegm_2 = function (dfPos = NULL, dfPos.chrom.col = "chrom", dfPos.pos.col = "pos",
dfSegm = NULL, dfSegm.chrom.col = "chrom", dfSegm.start.col = "start",
dfSegm.end.col = "end", colsToAdd = NULL, namesColsToAdd = NULL,
multseg = c(NA, "first", "all")[1])
{
for (col in namesColsToAdd) dfPos[, col] <- NA
dfSegm. <- split(dfSegm, dfSegm[, dfSegm.chrom.col])
dfPos. <- split(dfPos, dfPos[, dfPos.chrom.col])
chr_listy <- mixedsort(intersect(names(dfSegm.), names(dfPos.)))
total <- length(chr_listy)
pb <- txtProgressBar(min = 0, max = total, style = 3)
counter_start <- c()
if (is.na(multseg)) {
for (chr in chr_listy) {
counter_start <- match(chr, chr_listy)
Sys.sleep(0.1)
setTxtProgressBar(pb, counter_start)
ind <- unlist(sapply(dfPos.[[chr]][, dfPos.pos.col],
function(pos) {
tmp <- which(dfSegm.[[chr]][, dfSegm.start.col] <=
pos & dfSegm.[[chr]][, dfSegm.end.col] >=
pos)
if (length(tmp) == 1)
tmp
else {
NA
}
}))
if(!all(is.na(ind))){ ## Quentin added, sinon ca plante si y'a aucun SV sur toute la série trouve dans tout un chromosome (ca m'est arrive dans GEPELIN avec le chrY)
dfPos.[[chr]][, namesColsToAdd] <- dfSegm.[[chr]][ind, colsToAdd]}
}
close(pb)
}else{
if (multseg == "first") {
for (chr in chr_listy) {
counter_start <- match(chr, chr_listy)
Sys.sleep(0.1)
setTxtProgressBar(pb, counter_start)
ind <- unlist(sapply(dfPos.[[chr]][, dfPos.pos.col],
function(pos) {
tmp <- which(dfSegm.[[chr]][, dfSegm.start.col] <=
pos & dfSegm.[[chr]][, dfSegm.end.col] >=
pos)[1]
}))
dfPos.[[chr]][, namesColsToAdd] <- dfSegm.[[chr]][ind,
colsToAdd]
}
close(pb)
}
if (multseg == "all") {
for (chr in chr_listy) {
counter_start <- match(chr, chr_listy)
Sys.sleep(0.1)
setTxtProgressBar(pb, counter_start)
ind <- sapply(dfPos.[[chr]][, dfPos.pos.col],
function(pos) {
tmp <- which(dfSegm.[[chr]][, dfSegm.start.col] <=
pos & dfSegm.[[chr]][, dfSegm.end.col] >=
pos)
tmp
})
for (j in 1:length(colsToAdd)) {
dfPos.[[chr]][, namesColsToAdd[j]] <- unlist(lapply(ind,
function(z) paste(dfSegm.[[chr]][z, colsToAdd[j]],
collapse = ",")))
dfPos.[[chr]][which(sapply(ind, length) ==
0), namesColsToAdd[j]] <- NA
}
}
close(pb)
}
}
dfPos <- unsplit(dfPos., dfPos[, dfPos.chrom.col])
dfPos
}
#' palimpsest_addSVcategoriesToVcf
#'
#' Function to add SV mutation categories to vcf
#' @param sv vcf data frame containing the SVs
#' @param type.col SV type description column name in mutation_data ["DEL","DUP","INV"]
#' @param sample.col Sample column name in sv
#' @param CHROM_1.col Start chromosome column name in sv
#' @param CHROM_2.col End chromosome column name in sv
#' @param POS_1.col Start position column name in sv
#' @param POS_2.col End position column name in sv
#' @param resdir Result directory
#'
#' @export
palimpsest_addSVcategoriesToVcf = function (sv = sv, type.col = "Type", sample.col = "Sample",
CHROM_1.col = "CHROM", CHROM_2.col = "CHROM", POS_1.col = "POS",
POS_2.col = "POS", resdir = resdir, genome_build = "hg19")
{
if(genome_build == "hg19") cyto <- cytoband_hg19
if(genome_build == "hg38") cyto <- cytoband_hg38
sv$Size <- 0
sv$size.cat <- NA
ind <- which(sv[, type.col] %in% c("DEL", "DUP", "INV"))
sv[ind, "Size"] <- abs(sv[ind, POS_2.col] - sv[ind, POS_1.col]) ### ICI
sv[which(sv$Size <= 1000), "size.cat"] <- "a_0-1kb"
sv[which(sv$Size > 1000 & sv$Size <= 10000), "size.cat"] <- "b_1-10kb"
sv[which(sv$Size > 10000 & sv$Size <= 1e+05), "size.cat"] <- "c_10-100kb"
sv[which(sv$Size > 1e+05 & sv$Size <= 1e+06), "size.cat"] <- "d_100kb-1Mb"
sv[which(sv$Size > 1e+06 & sv$Size <= 1e+07), "size.cat"] <- "e_1-10Mb"
sv[which(sv$Size > 1e+07), "size.cat"] <- "f_>10Mb"
sv$Clustered <- 0
sv$Cluster <- NA
minbkp <- 10
workdir <- file.path(resdir, "bkp_clustering")
if (!file.exists(workdir))
dir.create(workdir)
setwd(workdir)
tmp_1 = paste(sv[,CHROM_1.col], sv[, POS_1.col])
tmp_2 = paste(sv[, CHROM_2.col], sv[,POS_2.col])
for (s in unique(sv[, sample.col])) {
print(s)
chr <- pos <- c()
indsamp <- which(sv[, sample.col] == s)
for (i in indsamp) {
# if(sv[i,"Type"]=="BND"){ # ADDED \\\\\\\\\\\\\\\\\\\\\\\\\\\\\ si 2lignes de BND / trandloc (4/reciproque) ## formater si ca change
# chr <- c(chr,sv[i,CHROM_1.col])
# pos <- c(pos,sv[i,POS_1.col])
# }else{
# chr <- c(chr,rep(sv[i,CHROM_1.col],2))
# pos <- c(pos,as.numeric(sv[i,c(POS_1.col,POS_2.col)]))
# } # ADDED ///////////////////////////////////////////////
# # OLD
chr <- c(chr, sv[i, CHROM_1.col], sv[i, CHROM_2.col])
pos <- c(pos, as.numeric(sv[i, c(POS_1.col, POS_2.col)]))
}
bkp <- data.frame(chr, pos, dist = NA, stringsAsFactors = F)
tmp <- sub("X", 23, sub("Y", 24, bkp$chr))
bkp <- bkp[order(tmp, bkp$pos), ]
for (c in unique(bkp$chr)) {
ind <- which(bkp$chr == c)
if (length(ind) > 1) {
bkp[ind, ] <- bkp[ind[order(bkp[ind, "pos"])],
]
bkp[ind[-1], "dist"] <- bkp[ind[-1], "pos"] -
bkp[ind[-length(ind)], "pos"]
}
}
# bed <- bkp[, c(1, 2, 2, 3)]
# indi <- bed2index(bed, sort = TRUE)
# tmp <- cluster.region(x = indi, distance = "1000000",
# verbose = T)
# tmp[, "chr"] <- data.frame(do.call("rbind", strsplit(as.character(tmp$index),
# ":", fixed = TRUE)))[1]
# tmp[, c("start", "end")] <- data.frame(do.call("rbind",
# strsplit(as.character(tmp$index), "-", fixed = TRUE)))[2]
# bkp$cluster <- tmp[, "regionIndex"]
bed <- bkp[, c(1, 2, 2)]
write.table(bed,"tmp.bed",sep="\t",quote=F,col.names=F,row.names=F) # added
system(paste("bedtools cluster -i tmp.bed -d 1000000","> res.bed")) # added
tmp <- read.delim("res.bed",as.is=T,header=F) # added
bkp$cluster <- tmp[,4] # added
bkp$clustered <- 0
tt <- table(bkp$cluster)
for (clst in names(which(tt >= 10))) {
bkp[which(bkp$cluster == clst), "clustered"] <- 1
}
# # attention ici créer les tmp1 et tmp2 a chaque itération (cf loop modifiee) prend un temps fou...
# for (i in which(bkp$clustered == 1)) {
# ind <- which(sv[, sample.col] == s &
# (paste(sv[,CHROM_1.col], sv[, POS_1.col]) == paste(bkp$chr[i],bkp$pos[i]) |
# paste(sv[, CHROM_2.col], sv[,POS_2.col]) == paste(bkp$chr[i], bkp$pos[i])))
# sv[ind, "Clustered"] <- 1
# sv[ind, "Cluster"] <- paste(s, bkp[i, "cluster"], sep = ".")
# # ind <- ind[which(sv[ind,"Type"]=="BND")]
# # for(k in ind){
# # sv[which(sv$POS_2 == sv[k,"POS_1"] & sv$POS_1 == sv[k,"POS_2"]), "Clustered"] <- 1
# # # sv[match(sv[k,"MATEID"],sv$ID),"Cluster"] <- paste(s,bkp[i,"cluster"],sep=".")
# # }
# }
tmp_0 = sv[, sample.col] == s
# tmp_1 = paste(sv[,CHROM_1.col], sv[, POS_1.col]) # deplace avant la boucle principale
# tmp_2 = paste(sv[, CHROM_2.col], sv[,POS_2.col])
tmp_3 = paste(bkp$chr, bkp$pos)
for (i in which(bkp$clustered == 1)) {
ind <- which(tmp_0 &
(tmp_1 == tmp_3[i] |
tmp_2 == tmp_3[i]))
sv[ind, "Clustered"] <- 1
sv[ind, "Cluster"] <- paste(s, bkp[i, "cluster"], sep = ".")
}
chr <- pos <- clustered <- c()
indsamp <- which(sv[, sample.col] == s)
for (i in indsamp) {
chr <- c(chr, sv[i, CHROM_1.col], sv[i, CHROM_2.col])
pos <- c(pos, as.numeric(sv[i, c(POS_1.col, POS_2.col)]))
clustered <- c(clustered, rep(sv[i, "Clustered"],
2))
}
bkp <- data.frame(chr, pos, clustered, stringsAsFactors = F)
bkp$clustered[is.na(bkp$clustered)] <- 0
pdf(paste("bkp_", s, ".pdf", sep = ""), width = 15,
height = 3)
for (chr in unique(bkp$chr)) {
ind <- which(bkp$chr == chr)
if (length(ind) > 1)
plot(bkp$pos[ind], rep(1, length(ind)), col = bkp$clustered[ind] +
1, main = chr, type = "h", ylim = c(0, 1.5),
yaxt = "n", bty = "l", las = 1, ylab = "",
xlim = c(min(cyto[which(cyto$Chromosome ==
sub("chr", "", chr)), "Start"]), max(cyto[which(cyto$Chromosome ==
sub("chr", "", chr)), "End"])), xlab = "Position")
else next
}
dev.off()
}
sv$Clustered[is.na(sv$Clustered)] <- 0
sv$Category1 <- paste(sv[, type.col], substr(sv$size.cat,
3, nchar(sv$size.cat)), "clust", sv$Clustered, sep = "_")
sv$Category1 <- sub("BND_0-1kb", "BND", sv$Category1)
return(sv)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.