R/seurat_interaction.R

Defines functions .get_top_n_regions .get_features add_to_seurat

Documented in add_to_seurat

#' @description Add meta.data about CNAs to a Seurat object from an infercnv_obj
#'
#' @title add_to_seurat()
#'
#' @param seurat_obj Seurat object to add meta.data to (default: NULL)
#'
#' @param assay_name Name of the assay in the Seurat object if provided. (default: "RNA")
#'
#' @param infercnv_output_path Path to the output folder of the infercnv run to use
#'
#' @param top_n How many of the largest CNA (in number of genes) to get.
#'
#' @param bp_tolerance How many bp of tolerance to have around feature start/end positions for top_n largest CNVs.
#' 
#' @param column_prefix String to add as a prefix to the Seurat metadata columns. Only applied to the seurat_obj, if supplied. Default is NULL
#'
#' @return seurat_obj
#'
#' @export
#'

add_to_seurat <- function(seurat_obj = NULL,
                          assay_name="RNA",
                          infercnv_output_path,
                          top_n = 10,
                          bp_tolerance = 2000000,
                          column_prefix = NULL) {
    lfiles <- list.files(infercnv_output_path, full.names = FALSE)
    
    if (!file.exists(paste(infercnv_output_path, "run.final.infercnv_obj", sep=.Platform$file.sep))) {
        flog.warn(sprintf("::Could not find \"run.final.infercnv_obj\" file at: %s", paste(infercnv_output_path, "run.final.infercnv_obj", sep=.Platform$file.sep)))
        stop()
    }
    infercnv_obj = readRDS(paste(infercnv_output_path, "run.final.infercnv_obj", sep=.Platform$file.sep))
    
    if (is.null(seurat_obj)) {
        flog.info("No Seurat object provided, will only write metadata matrix.")
    }
    else if(!(setequal(row.names(seurat_obj@meta.data), colnames(infercnv_obj@expr.data)) ||
         setequal(colnames(seurat_obj@assays[[assay_name]]), colnames(infercnv_obj@expr.data)))) {
        flog.warn("::Cell names in Seurat object and infercnv results do not match")
        stop()
    }

    # all(colnames(infercnv_obj@expr.data)[match(colnames(seurat_obj@assays[[assay_name]]), colnames(infercnv_obj@expr.data))] == colnames(seurat_obj@assays[[assay_name]]))
    if (is.null(seurat_obj)) {
        cell_ordering_match = seq_len(ncol(infercnv_obj@expr.data))
    }
    else {
        cell_ordering_match = match(colnames(seurat_obj@assays[[assay_name]]), colnames(infercnv_obj@expr.data))
    }
    
    ## add check that data row/col names match seurat obj
    analysis_mode_pattern = "samples"
    if (!is.null(infercnv_obj@options$analysis_mode)) {
            analysis_mode_pattern = gsub('[\"]', '', infercnv_obj@options$analysis_mode)   ## gsub to remove _"\"_ around the name of the options if it is given as string and not through a variable
    }

    by_cells = FALSE
    if (!is.null(infercnv_obj@options$HMM_report_by) && infercnv_obj@options$HMM_report_by == "cell") {
        by_cells = TRUE
    }

    scaling_factor = 2
    if (any(grep(lfiles, pattern=paste0("HMM_CNV_predictions.HMM.*", analysis_mode_pattern, ".Pnorm_0.[0-9]+")))) {
        ###### states used to be 0/0.5/1/1.5/2, they are now 1/2/3/4/5/6
        # scaling_factor = 1
        if (any(grep(lfiles, pattern=paste0("HMM_CNV_predictions.HMMi6.*", analysis_mode_pattern, ".Pnorm_0.[0-9]+")))) {
            mode = "i6"
            center_state = 3
        }
        else if (any(grep(lfiles, pattern=paste0("HMM_CNV_predictions.HMMi3.*", analysis_mode_pattern, ".Pnorm_0.[0-9]+")))) {
            mode = "i3"
            center_state = 2
        }
        else {
            flog.warn("::Found filtered HMM predictions output, but they do not match any known model type.")
            stop()
        }
        # sort to take lowest BayesProb if there are multiple
        regions = read.table(paste(infercnv_output_path, sort(lfiles[grep(lfiles, pattern=paste0("HMM_CNV_predictions.HMMi[36].*", analysis_mode_pattern, ".Pnorm_0.[0-9]+.pred_cnv_regions.dat"))])[1], sep=.Platform$file.sep), sep="\t", header=TRUE, check.names=FALSE, colClasses = c("character", "character", "integer", "character", "integer", "integer"))
        hmm_genes = read.table(paste(infercnv_output_path, sort(lfiles[grep(lfiles, pattern=paste0("HMM_CNV_predictions.HMMi[36].*", analysis_mode_pattern, ".Pnorm_0.[0-9]+.pred_cnv_genes.dat"))])[1], sep=.Platform$file.sep), sep="\t", header=TRUE, check.names=FALSE, colClasses = c("character", "character", "integer", "character", "character", "integer", "integer"))
        # from_pbayes()
    }
    else if (any(grep(lfiles, pattern = paste0("17_HMM_predHMM.*", analysis_mode_pattern)))) {
        ###### states are 1/2/3/4/5/6
        # scaling_factor = 2
        if (any(grep(lfiles, pattern = paste0("17_HMM_predHMMi6.*", analysis_mode_pattern)))) {
            mode = "i6"
            center_state = 3
        }
        else if (any(grep(lfiles, pattern = paste0("17_HMM_predHMMi3.*", analysis_mode_pattern)))) {
            mode = "i3"
            center_state = 2
        }
        else {
            flog.warn("::Found HMM predictions output, but they do not match any known model type")
            stop()
        }
        regions = read.table(paste(infercnv_output_path, lfiles[grep(lfiles, pattern=paste0("17_HMM_predHMMi[36].*", analysis_mode_pattern, ".pred_cnv_regions.dat"))][1], sep=.Platform$file.sep), sep="\t", header=TRUE, check.names=FALSE, colClasses = c("character", "character", "integer", "character", "integer", "integer"))
        hmm_genes = read.table(paste(infercnv_output_path, lfiles[grep(lfiles, pattern=paste0("17_HMM_predHMMi[36].*", analysis_mode_pattern, ".pred_cnv_genes.dat"))][1], sep=.Platform$file.sep), sep="\t", header=TRUE, check.names=FALSE, colClasses = c("character", "character", "integer", "character", "character", "integer", "integer"))
        # from_hmm()
    }
    else {
        flog.warn(sprintf("::Could not find any HMM predictions outputs at: %s", infercnv_output_path))
        flog.warn("::add_to_seurat works by transcribing (and copying if a Seurat object is provided) the HMM results from infercnv, so it cannot be used without having run the HMM.")
        # stop()
    }
    
    # subcluster ids information
    subcluster_groups = vector()
    annotation_names = c(names(infercnv_obj@reference_grouped_cell_indices),
                         names(infercnv_obj@observation_grouped_cell_indices),
                         "all_observations")
    for (annot_name in annotation_names) {
        for (subcluster in names(infercnv_obj@tumor_subclusters$subclusters[[ annot_name ]])) {
            tmp = infercnv_obj@tumor_subclusters$subclusters[[ annot_name ]][[subcluster]]
            tmp[] = subcluster
            subcluster_groups = c(subcluster_groups, tmp)
        }
    }
    subcluster_groups = subcluster_groups[colnames(infercnv_obj@expr.data)]

    features_to_add <- .get_features(infercnv_obj = infercnv_obj,
                                     infercnv_output_path = infercnv_output_path,
                                     regions = regions, 
                                     hmm_genes = hmm_genes, 
                                     center_state = center_state, 
                                     scaling_factor = scaling_factor,
                                     mode = mode,
                                     by_cells = by_cells,
                                     top_n = top_n,
                                     bp_tolerance = bp_tolerance)
    if (!is.null(seurat_obj)) {
      
        if (! is.null(column_prefix)) {
          prefix <- paste(column_prefix, "_", sep = "")
          
        } else {
          prefix <- ""
        }
      
      
        for (lv in levels(infercnv_obj@gene_order$chr)) {
            seurat_obj@meta.data[[paste0(prefix, "has_cnv_", lv)]] = features_to_add$feature_vector_chrs_has_cnv[[lv]][cell_ordering_match]
            seurat_obj@meta.data[[paste0(prefix, "has_loss_", lv)]] = features_to_add$feature_vector_chrs_has_loss[[lv]][cell_ordering_match]
            seurat_obj@meta.data[[paste0(prefix, "has_dupli_", lv)]] = features_to_add$feature_vector_chrs_has_dupli[[lv]][cell_ordering_match]
            seurat_obj@meta.data[[paste0(prefix, "proportion_cnv_", lv)]] = features_to_add$feature_vector_chrs_gene_cnv_proportion[[lv]][cell_ordering_match]
            seurat_obj@meta.data[[paste0(prefix, "proportion_loss_", lv)]] = features_to_add$feature_vector_chrs_gene_loss_proportion[[lv]][cell_ordering_match]
            seurat_obj@meta.data[[paste0(prefix, "proportion_dupli_", lv)]] = features_to_add$feature_vector_chrs_gene_dupli_proportion[[lv]][cell_ordering_match]
            if (mode == "i6") {
                seurat_obj@meta.data[[paste0(prefix, "proportion_scaled_cnv_", lv)]] = features_to_add$feature_vector_chrs_gene_cnv_proportion_scaled[[lv]][cell_ordering_match]
                seurat_obj@meta.data[[paste0(prefix, "proportion_scaled_loss_", lv)]] = features_to_add$feature_vector_chrs_gene_loss_proportion_scaled[[lv]][cell_ordering_match][cell_ordering_match]
                seurat_obj@meta.data[[paste0(prefix, "proportion_scaled_dupli_", lv)]] = features_to_add$feature_vector_chrs_gene_dupli_proportion_scaled[[lv]][cell_ordering_match]
            }
        }
      
        for (n in names(features_to_add)[grep(names(features_to_add), pattern = "top_")] ) {
          
            seurat_obj@meta.data[[paste0(prefix, n)]] = features_to_add[[n]][cell_ordering_match]
        }

        seurat_obj@meta.data[["infercnv_subcluster"]] = subcluster_groups[cell_ordering_match]
    }

    if (mode == "i6") {
        out_mat = matrix(NA, ncol=((9 * length(levels(infercnv_obj@gene_order$chr))) + length(features_to_add) - 9), nrow=ncol(infercnv_obj@expr.data))
        out_mat_feature_names = vector("character", ((9 * length(levels(infercnv_obj@gene_order$chr))) + length(features_to_add) - 9))
    }
    else {
        out_mat = matrix(NA, ncol=((6 * length(levels(infercnv_obj@gene_order$chr))) + length(features_to_add) - 6), nrow=ncol(infercnv_obj@expr.data))
        out_mat_feature_names = vector("character", ((6 * length(levels(infercnv_obj@gene_order$chr))) + length(features_to_add) - 6))
    }
    
    i = 1
    for (lv in levels(infercnv_obj@gene_order$chr)) {
        
        out_mat[, i] = features_to_add$feature_vector_chrs_has_cnv[[lv]]
        out_mat[, i+1] = features_to_add$feature_vector_chrs_has_loss[[lv]]
        out_mat[, i+2] = features_to_add$feature_vector_chrs_has_dupli[[lv]]
        out_mat[, i+3] = features_to_add$feature_vector_chrs_gene_cnv_proportion[[lv]]
        out_mat[, i+4] = features_to_add$feature_vector_chrs_gene_loss_proportion[[lv]]
        out_mat[, i+5] = features_to_add$feature_vector_chrs_gene_dupli_proportion[[lv]]
        if (mode == "i6") {
            out_mat[, i+6] = features_to_add$feature_vector_chrs_gene_cnv_proportion_scaled[[lv]]
            out_mat[, i+7] = features_to_add$feature_vector_chrs_gene_loss_proportion_scaled[[lv]]
            out_mat[, i+8] = features_to_add$feature_vector_chrs_gene_dupli_proportion_scaled[[lv]]
        
            out_mat_feature_names[i:(i+8)] = c(paste0("has_cnv_", lv), paste0("has_loss_", lv), paste0("has_dupli_", lv), paste0("proportion_cnv_", lv), paste0("proportion_loss_", lv), paste0("proportion_dupli_", lv), paste0("proportion_scaled_cnv_", lv), paste0("proportion_scaled_loss_", lv), paste0("proportion_scaled_dupli_", lv))
            i = i + 9
        }
        else {
            out_mat_feature_names[i:(i+5)] = c(paste0("has_cnv_", lv), paste0("has_loss_", lv), paste0("has_dupli_", lv), paste0("proportion_cnv_", lv), paste0("proportion_loss_", lv), paste0("proportion_dupli_", lv))
            i = i + 6
        }
    }
    
    for (n in names(features_to_add)[grep(names(features_to_add), pattern = "top_")] ) {
        out_mat[, i] = features_to_add[[n]]
        out_mat_feature_names[i] = n
        i = i + 1
    }
    
    out_mat_feature_names = c("subcluster", out_mat_feature_names)
    out_mat = cbind(subcluster_groups, out_mat)

    colnames(out_mat) = out_mat_feature_names
    row.names(out_mat) = colnames(infercnv_obj@expr.data)
    
    write.table(out_mat, paste(infercnv_output_path, "map_metadata_from_infercnv.txt", sep=.Platform$file.sep) , quote=FALSE, sep="\t")
    
    return(seurat_obj)
}


#' @title .get_features()
#'
#' @description Get data from infercnv objects to add to Seurat meta.data
#'
#' @param infercnv_obj infercnv hmm object
#'
#' @param infercnv_output_path infercnv output folder
#'
#' @param regions Table with predicted CNAs regions from the HMM model
#'
#' @param hmm_genes Table with predicted CNAs genes from the HMM model
#'
#' @param center_state Value that represents the neutral state in the HMM results.
#'
#' @param scaling_factor Factor to multiply divergence from center_state to get CNA amplitude.
#'
#' @param mode "i3" or "i6", if i3 there is no scaled proportion field as the precise number of lost or gained copies is unknown. 
#'
#' @param by_cells Whether the HMM predictions are given by cells or not.
#'
#' @param top_n How many of the largest CNA (in number of genes) to get.
#'
#' @return all_features A list of all the calculated meta.data to add.
#'
#' @keywords internal
#' @noRd
#'
.get_features <- function(infercnv_obj, infercnv_output_path, regions, hmm_genes, center_state, scaling_factor, by_cells, mode, top_n, bp_tolerance) {
    
    chr_gene_count = table(infercnv_obj@gene_order$chr)
    
    # features templates for initialization
    double_feature_vector = vector(mode="double", length=ncol(infercnv_obj@expr.data))
    names(double_feature_vector) = colnames(infercnv_obj@expr.data)
    logical_feature_vector = vector(mode="logical", length=ncol(infercnv_obj@expr.data))
    names(logical_feature_vector) = colnames(infercnv_obj@expr.data)
    
    # initialize features lists
    all_features = c()
    all_features$feature_vector_chrs_has_cnv = c()
    all_features$feature_vector_chrs_has_dupli = c()
    all_features$feature_vector_chrs_has_loss = c()
    all_features$feature_vector_chrs_gene_cnv_proportion = c()
    all_features$feature_vector_chrs_gene_dupli_proportion = c()
    all_features$feature_vector_chrs_gene_loss_proportion = c()
    if (mode == "i6") {
        all_features$feature_vector_chrs_gene_cnv_proportion_scaled = c()
        all_features$feature_vector_chrs_gene_dupli_proportion_scaled = c()
        all_features$feature_vector_chrs_gene_loss_proportion_scaled = c()
    }
    for (lv in levels(infercnv_obj@gene_order$chr)) {
        all_features$feature_vector_chrs_has_cnv[[lv]] = logical_feature_vector
        all_features$feature_vector_chrs_has_dupli[[lv]] = logical_feature_vector
        all_features$feature_vector_chrs_has_loss[[lv]] = logical_feature_vector
        all_features$feature_vector_chrs_gene_cnv_proportion[[lv]] = double_feature_vector
        all_features$feature_vector_chrs_gene_dupli_proportion[[lv]] = double_feature_vector
        all_features$feature_vector_chrs_gene_loss_proportion[[lv]] = double_feature_vector
        if (mode == "i6") {
            all_features$feature_vector_chrs_gene_cnv_proportion_scaled[[lv]] = double_feature_vector
            all_features$feature_vector_chrs_gene_dupli_proportion_scaled[[lv]] = double_feature_vector
            all_features$feature_vector_chrs_gene_loss_proportion_scaled[[lv]] = double_feature_vector
        }
    }
    
    # map for top_n mapping
    subclust_name_to_clust = list()

    if (!by_cells) {
        for (clust in names(infercnv_obj@tumor_subclusters$subclusters)) {
            for (subclust in names(infercnv_obj@tumor_subclusters$subclusters[[clust]])) {
                subclust_name = paste(clust, subclust, sep=".")
                subclust_name_to_clust[[subclust_name]] = c(clust, subclust)
                res = regions[regions$cell_group_name == subclust_name, , drop=FALSE]
                gres = hmm_genes[hmm_genes$cell_group_name == subclust_name, , drop=FALSE]
                if (nrow(res) > 0) {
                    for (c in unique(res$chr)) {
                        all_features$feature_vector_chrs_has_cnv[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
                        all_features$feature_vector_chrs_gene_cnv_proportion[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (length(which(gres$chr == c)) / chr_gene_count[[c]])
                        if (mode == "i6") {
                               all_features$feature_vector_chrs_gene_cnv_proportion_scaled[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (sum(abs(gres[(which(gres$chr == c)), "state"] - center_state)) / (chr_gene_count[[c]] * scaling_factor))
                        }
                    }
                    
                    sub_gres = gres[gres$state < center_state, ]
                    for (c in unique(sub_gres$chr)) {
                        all_features$feature_vector_chrs_has_loss[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
                        all_features$feature_vector_chrs_gene_loss_proportion[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (length(which(sub_gres$chr == c)) / chr_gene_count[[c]])
                        if (mode == "i6") {
                            all_features$feature_vector_chrs_gene_loss_proportion_scaled[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (abs(sum(sub_gres[(which(sub_gres$chr == c)), "state"] - center_state)) / (chr_gene_count[[c]] * scaling_factor))
                        }
                    }
                    
                    sub_gres = gres[gres$state > center_state, ]
                    for (c in unique(sub_gres$chr)) {
                        all_features$feature_vector_chrs_has_dupli[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
                        all_features$feature_vector_chrs_gene_dupli_proportion[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (length(which(sub_gres$chr == c)) / chr_gene_count[[c]])
                        if (mode == "i6") {
                            all_features$feature_vector_chrs_gene_dupli_proportion_scaled[[c]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = (sum(sub_gres[(which(sub_gres$chr == c)), "state"] - center_state) / (chr_gene_count[[c]] * scaling_factor))
                        }
                    }
                }
            }
        }
    } else {
        for (cell_id in seq_len(ncol(infercnv_obj@expr.data))) {
            cell_name = colnames(infercnv_obj@expr.data)[cell_id]
            res = regions[regions$cell_group_name == cell_name, , drop=FALSE]
            gres = hmm_genes[hmm_genes$cell_group_name == cell_name, , drop=FALSE]
            if (nrow(res) > 0) {
                for (c in unique(res$chr)) {
                    all_features$feature_vector_chrs_has_cnv[[c]][cell_name] = TRUE
                    all_features$feature_vector_chrs_gene_cnv_proportion[[c]][cell_name] = (length(which(gres$chr == c)) / chr_gene_count[[c]])
                    if (mode == "i6") {
                        all_features$feature_vector_chrs_gene_cnv_proportion_scaled[[c]][cell_name] = (sum(abs(gres[(which(gres$chr == c)), "state"] - center_state)) / (chr_gene_count[[c]] * scaling_factor))
                    }
                }
                
                sub_gres = gres[gres$state < center_state, ]
                for (c in unique(sub_gres$chr)) {
                    all_features$feature_vector_chrs_has_loss[[c]][cell_name] = TRUE
                    all_features$feature_vector_chrs_gene_loss_proportion[[c]][cell_name] = (length(which(sub_gres$chr == c)) / chr_gene_count[[c]])
                    if (mode == "i6") {
                        all_features$feature_vector_chrs_gene_loss_proportion_scaled[[c]][cell_name] = (abs(sum(sub_gres[(which(sub_gres$chr == c)), "state"] - center_state)) / (chr_gene_count[[c]] * scaling_factor))
                    }
                }
                
                sub_gres = gres[gres$state > center_state, ]
                for (c in unique(sub_gres$chr)) {
                    all_features$feature_vector_chrs_has_dupli[[c]][cell_name] = TRUE
                    all_features$feature_vector_chrs_gene_dupli_proportion[[c]][cell_name] = (length(which(sub_gres$chr == c)) / chr_gene_count[[c]])
                    if (mode == "i6") {
                        all_features$feature_vector_chrs_gene_dupli_proportion_scaled[[c]][cell_name] = (sum(sub_gres[(which(sub_gres$chr == c)), "state"] - center_state) / (chr_gene_count[[c]] * scaling_factor))
                    }
                }
            }
        }
    }
    
    # sorted_regions = sort(table(hmm_genes$gene_region_name), decreasing=TRUE)
    sorted_regions_loss = sort(table(hmm_genes$gene_region_name[hmm_genes$state < center_state]), decreasing=TRUE)
    sorted_regions_dupli = sort(table(hmm_genes$gene_region_name[hmm_genes$state > center_state]), decreasing=TRUE)
    
    # top_n_cnv = .get_top_n_regions(hmm_genes = hmm_genes, sorted_regions = sorted_regions, top_n = top_n, bp_tolerance = bp_tolerance)
    top_n_loss = .get_top_n_regions(hmm_genes = hmm_genes, sorted_regions = sorted_regions_loss, top_n = top_n, bp_tolerance = bp_tolerance)
    top_n_dupli = .get_top_n_regions(hmm_genes = hmm_genes, sorted_regions = sorted_regions_dupli, top_n = top_n, bp_tolerance = bp_tolerance)
    
    # for (i in seq_along(top_n_cnv)) {
    #     feature_name = paste0("top_cnv_", i)
    #     all_features[[feature_name]] = logical_feature_vector
        
    #     for (subclust_name in top_n_cnv[[i]]$subclust_name) {
    #         clust = subclust_name_to_clust[[subclust_name]][1]
    #         subclust = subclust_name_to_clust[[subclust_name]][2]
    #         all_features[[feature_name]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
    #     }
    # }

    to_write = c()
    for (i in seq_along(top_n_loss)) {
        feature_name = paste0("top_loss_", i)
        all_features[[feature_name]] = logical_feature_vector
        line_to_write = c()
        
        if (!by_cells) {
            for (subclust_name in top_n_loss[[i]]$subclust_name) {
                clust = subclust_name_to_clust[[subclust_name]][1]
                subclust = subclust_name_to_clust[[subclust_name]][2]
                all_features[[feature_name]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
                # line_to_write = c(line_to_write, names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]]))
                line_to_write = c(line_to_write, paste(subclust_name, names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]]), sep=";"))
            }
        } else {
            for (cell_id in top_n_loss[[i]]$subclust_name) {
                all_features[[feature_name]][cell_id] = TRUE
                line_to_write = c(line_to_write, paste(cell_id, cell_id, sep=";"))
            }
        }

        # to_write = c(to_write, paste(feature_name, paste(top_n_loss[[i]]$subclust_name, sep=" "), paste(line_to_write, sep=" "), sep=";"))
        to_write = c(to_write, paste(feature_name, line_to_write, sep=";"))

    }

    if (is.null(to_write)) {
        to_write = ""
    }

    fileConn<-file(paste(infercnv_output_path, "top_losses.txt", sep=.Platform$file.sep), open="w")
    writeLines(to_write, con=fileConn)
    close(fileConn)

    # to_write = to_write = vector(mode="vector", length=length(top_n_dupli))
    to_write = c()
    for (i in seq_along(top_n_dupli)) {
        feature_name = paste0("top_dupli_", i)
        all_features[[feature_name]] = logical_feature_vector
        line_to_write = c()

        if (!by_cells) {
            for (subclust_name in top_n_dupli[[i]]$subclust_name) {
                clust = subclust_name_to_clust[[subclust_name]][1]
                subclust = subclust_name_to_clust[[subclust_name]][2]
                all_features[[feature_name]][names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]])] = TRUE
                # line_to_write = c(line_to_write, paste(feature_name, paste(top_n_dupli[[i]]$subclust_name, sep=" "), paste(line_to_write, sep=" "), sep=";"))
                 # line_to_write = c(line_to_write, names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]]))
                 line_to_write = c(line_to_write, paste(subclust_name, names(infercnv_obj@tumor_subclusters$subclusters[[clust]][[subclust]]), sep=";"))
            }
        } else {
            for (cell_id in top_n_dupli[[i]]$subclust_name) {
                all_features[[feature_name]][cell_id] = TRUE
                line_to_write = c(line_to_write, paste(cell_id, cell_id, sep=";"))
            }
        }

        # to_write = c(to_write, paste(feature_name, paste(top_n_dupli[i]]$subclust_name, sep=" "), paste(line_to_write, sep=" "), sep=";"))
        to_write = c(to_write, paste(feature_name, line_to_write, sep=";"))

    }

    if (is.null(to_write)) {
        to_write = ""
    }

    fileConn<-file(paste(infercnv_output_path, "top_duplis.txt", sep=.Platform$file.sep), open="w")
    writeLines(to_write, con=fileConn)
    close(fileConn)
    
    return(all_features)
}


#' @title .get_top_n_regions()
#'
#' @description Get top n largest CNA regions in number of genes
#'
#' @param hmm_genes Table with predicted CNAs genes from the HMM model
#'
#' @param sorted_region List of regions sorted by size in number of genes for the CNA type desired (gain/loss/both)
#'
#' @param top_n How many of the largest CNA (in number of genes) to get.
#'
#' @return all_features A list of all the calculated meta.data to add.
#'
#' @keywords internal
#' @noRd
#'
.get_top_n_regions <- function(hmm_genes, sorted_regions, top_n, bp_tolerance) {
    
    if (is.null(nrow(sorted_regions)) || (length(sorted_regions) < 1)) {
        return(c())
    }

    j = 1
    top_regions = vector("list", top_n)
    used_regions = c()
    # flog.debug("sorted regions are:")
    # for(sr in names(sorted_regions)) {
    #     flog.debug(paste(sr, sorted_regions[sr]))
    # }

    for (i in seq_len(nrow(sorted_regions))) {

        if (names(sorted_regions[i]) %in% used_regions) {
            next
        }

        genes_in_region = hmm_genes[which(hmm_genes$gene_region_name %in% names(sorted_regions[i])), ]
        region_chr = genes_in_region$chr[1]
        region_start_low = min(genes_in_region$start)
        region_start_high = region_start_low
        region_end_low = max(genes_in_region$end)
        region_end_high = region_end_low

        to_ignore = which(hmm_genes$gene_region_name %in% used_regions)
        
        if (length(to_ignore) > 0) {
            same_chr = setdiff(which(hmm_genes$chr == region_chr), to_ignore)
        }
        else {
            same_chr = which(hmm_genes$chr == region_chr)
        }

        initial_close = list()
        repeat {

            close_start = same_chr[which((hmm_genes$start[same_chr] <= region_start_high + bp_tolerance) & (hmm_genes$start[same_chr] >= region_start_low - bp_tolerance))]
            close_end = same_chr[which((hmm_genes$end[same_chr] <= region_end_high + bp_tolerance) & (hmm_genes$end[same_chr] >= region_end_low - bp_tolerance))]
            close_start_end = intersect(unique(hmm_genes$gene_region_name[close_start]), unique(hmm_genes$gene_region_name[close_end]))
        
            if ((length(setdiff(close_start_end, initial_close)) == 0) && (length(setdiff(initial_close, close_start_end)) == 0)) {
                break
            }
            else {
                initial_close = close_start_end
                starts = c()
                ends = c()
                for (regi in close_start_end) {
                    starts = c(starts, min(hmm_genes$start[which(hmm_genes$gene_region_name == regi)]))
                    ends = c(ends, max(hmm_genes$end[which(hmm_genes$gene_region_name == regi)]))
                }

                region_start_low = min(starts)
                region_start_high = max(starts)
                region_end_low = min(ends)
                region_end_high = max(ends)
            }
        }

        if (length(close_start_end) > 0) {
            top_regions[[j]]$subclust_names = unique(hmm_genes$cell_group_name[which(hmm_genes$gene_region_name %in% close_start_end)])
            top_regions[[j]]$regions_names = close_start_end
            flog.debug(paste0("top cnv ", j, " is composed of subclusts: "))#, paste(close_start_end, sep="   ")))
            flog.debug(paste(top_regions[[j]]$subclust_names, sep="  "))
            flog.debug("and region names: ")
            flog.debug(paste(top_regions[[j]]$regions_names, sep="  "))
        }
        else {
            flog.error("Did not even find itself, error.")
            stop()
        }

        used_regions = c(used_regions, close_start_end)

        if (length(used_regions) != length(unique(used_regions))) {
            flog.error("Used the same region twice")
            stop()
        }

        j = j + 1

        if (j == (top_n + 1)) {
            break
        }
    }

    return(top_regions[1:(j-1)])
}


# .get_top_n_regions <- function(hmm_genes, sorted_regions, top_n, bp_tolerance) {
#     j = 1
#     previous_region_chr = -1
#     previous_region_start = -1
#     previous_region_end = -1
#     top_regions = vector("list", top_n)
    
#     for (i in seq_len(nrow(sorted_regions))) {
#         genes_in_region = hmm_genes[which(hmm_genes$gene_region_name %in% names(sorted_regions[i])), ]
#         region_chr = genes_in_region$chr[1]
#         region_start = min(genes_in_region$start)
#         region_end = max(genes_in_region$end)
#         # check if the current region is the same as the previous one for a different subcluster or not
#         # if it is, extend the previous assignment without increasing the count of found top hits
#         if (region_chr == previous_region_chr && region_start <= previous_region_start + bp_tolerance && region_start >= previous_region_start - bp_tolerance && region_end <= previous_region_end + bp_tolerance && region_end >= previous_region_end - bp_tolerance) {
#             top_regions[[j]]$subclust_names = c(top_regions[[j]]$subclust_names, genes_in_region$cell_group_name[1])
#             top_regions[[j]]$regions_names = c(top_regions[[j]]$regions_names, genes_in_region$gene_region_name[1])
#         }
#         else {
#             top_regions[[j]]$subclust_names = genes_in_region$cell_group_name[1]
#             top_regions[[j]]$regions_names = genes_in_region$gene_region_name[1]
#             previous_region_chr = region_chr
#             previous_region_start = region_start
#             previous_region_end = region_end
#             j = j + 1
#         }
#         if (j == top_n + 1) {
#             break
#         }
#     }
    
#     if (j < top_n + 1) { # if less non unique regions than top_n
#         top_regions = top_regions[1:j]
#     }
    
#     return(top_regions)
# }




##' @keywords internal
##' @noRd
##'
#make_seurat_from_infercnv_obj <- function(infercnv_obj) {
#    return(CreateSeuratObject(counts = infercnv_obj@count.data, project="infercnv", min.cells = 3, min.features = 200))
#}
#
##' @keywords internal
##' @noRd
##'
#make_seurat_from_infercnv <- function(infercnv_output_path) {
#    if (file.exists(paste(infercnv_output_path, "run.final.infercnv_obj", sep=.Platform$file.sep))) {
#        return(make_seurat_from_infercnv_obj(readRDS(paste(infercnv_output_path, "run.final.infercnv_obj", sep=.Platform$file.sep))))
#    }
#    else {
#        stop()
#    }
#}
broadinstitute/inferCNV documentation built on Aug. 6, 2024, 11:14 a.m.