Singleton filtering now done by paraclu and distclu directly

Charles Plessy · Charles Plessy · commit 4702c46257a7 · 2024-05-23T13:22:24.000+09:00
The updated `.ctss_summary_for_clusters` does not remove clusters anymore.

The information on dominant CTSS score and total number of CTSS is now
stored in the dominantCTSS object directly.

Also, reorder computations so that score is the first metadata column in the cluster objects.
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export("genomeName<-")
 export("inputFiles<-")
 export("inputFilesType<-")
 export("sampleLabels<-")
+export(.ctss_summary_for_clusters)
 export(CAGEexp)
 export(CTSS)
 export(CTSScoordinatesGR)
diff --git a/NEWS.md b/NEWS.md
@@ -16,6 +16,8 @@ BACKWARDS-INCOMPATIBLE CHANGES
 -   The `removeSingletons` option of clustering methods is removed and the
     default value of `keepSingletonsAbove` is set to `0`, which keeps the
     standard behavior.
+-   In cluster objects, the dominant CTSS score is now stored in the
+    `dominantCTSS` object directly.
 
 BUG FIXES
 
@@ -34,6 +36,9 @@ NEW FEATURES
 OTHER CHANGES
 
 -   Accelerated the computation of cumulative sums ~10×.
+-   Singleton filtering is now done by the `paraclu` and `distclu` functions
+    themeselves; `.ctss_summary_for_clusters` does not change the input clusters
+    except for adding information.
 
 # Changes in version 2.11.1
 
diff --git a/R/ClusteringFunctions.R b/R/ClusteringFunctions.R
@@ -1,29 +1,46 @@
 #' @include CAGEexp.R CTSS.R
 NULL
 
-#' @noRd
+#' Summarise CTSSs included in clusters
+#' 
+#' @param ctss A [`CTSS`] object.
+#' 
+#' @param clusters A [`TagClusters`], [`ConsensusClusters`] or any other
+#' object implementing the [`GRanges`] class.
+#' 
+#' @return The `clusters` object with a new `dominant_CTSS` metadata in `CTSS`
+#' format reporting the genomic coordinate and expression score of most
+#' highly expressed position in each cluster, plus a `nr_ctss` metadata reporting
+#' the number of expressed CTSSs in each cluster.
 #' 
 #' @importFrom S4Vectors queryHits subjectHits runLength runValue
+#' @export
 #' 
 #' @examples 
 #' # See also benchmarks/dominant_ctss.md
-#' (ctss <- GRanges('chr1', IRanges(start = 1:10, end = 1:10), '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
-#' (clusters <- GRanges('chr1', IRanges(start = c(1,9), end = c(8,10)), '+'))
+#' (ctss <- CTSS( 'chr1', IRanges(start = 1:10, end = 1:10)
+#'              , '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
+#' (clusters <- GRanges( 'chr1', IRanges(start = c(1,9)
+#'                     , end = c(8,10)), '+')) |> as("TagClusters")
 #' 
 #' # The function assumes that all CTSSes have a score above zero
-#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters, keepSingletonsAbove = Inf)
+#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters)
 #' # If not the case, it will give incorrect nr_ctss and  fail to remove singletons
-#' .ctss_summary_for_clusters(ctss, clusters, keepSingletonsAbove = Inf)
+#' .ctss_summary_for_clusters(ctss, clusters)
 #' 
 #' # The function needs its output to be sorted and is not going to check it.
 #' .ctss_summary_for_clusters(rev(ctss), clusters)
 #' .ctss_summary_for_clusters(ctss, rev(clusters))
 #' 
 #' # Ties are resolved with 5' preference for both plus and minus strands.
 #' # This may create a small bias.
-#' .ctss_summary_for_clusters(ctss |> plyranges::mutate(strand = '-'), clusters |> plyranges::mutate(strand = '-'))
+#' ctss_minus <- ctss
+#' strand(ctss_minus) <- '-'
+#' clusters_minus <- clusters
+#' strand(clusters_minus) <- '-'
+#' .ctss_summary_for_clusters(ctss_minus, clusters_minus)
 
-.ctss_summary_for_clusters <- function(ctss, clusters, keepSingletonsAbove = 0) {
+.ctss_summary_for_clusters <- function(ctss, clusters) {
   # Match the clusters and the CTSS
   o <- findOverlaps(clusters, ctss)
 
@@ -47,21 +64,18 @@ NULL
   # Find absolute position of dominant CTSS in each run.
   global_max_ids <- cluster_start_idx + local_max_idx - 1
   
-  # Record dominant CTSS as GRanges object.
-  clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids]
-  
-  # Record dominant CTSS score.  Mabye we should use its GRanges's score instead.
-  clusters$tpm.dominant_ctss <-   score(ctss)[subjectHits(o)][global_max_ids]
-  
   # Record total expression of the cluster
   score(clusters) <- Rle(sum(grouped_scores))
+  
+  # Record dominant CTSS as CTSS object.
+  clusters$dominant_ctss <- CTSS(granges(ctss)[subjectHits(o)][global_max_ids])
+  
+  # Record dominant CTSS score.
+  score(clusters$dominant_ctss) <- score(ctss)[subjectHits(o)][global_max_ids]
 
   # Count the number of clusters   
   clusters$nr_ctss <- rl
   
-  # Remove clusters that match only one CTSS unless their expression is high enough
-  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
-  
   # Give numerical names to the clusters
   names(clusters) <- seq_along(clusters)
   
diff --git a/R/Distclu.R b/R/Distclu.R
@@ -50,8 +50,9 @@ setMethod("distclu", "SummarizedExperiment",
 
 .distclu_CTSS <- function(object, max.dist, keepSingletonsAbove) {
   clusters <- reduce(GRanges(object), min = max.dist)
-  clusters <- .ctss_summary_for_clusters(object, clusters,
-                                         keepSingletonsAbove = keepSingletonsAbove)
+  clusters <- .ctss_summary_for_clusters(object, clusters)
+  # Remove clusters that match only one CTSS unless their expression is high enough
+  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
   names(clusters) <- seq_along(clusters)
   as(clusters, "TagClusters")
 }
diff --git a/R/ExportMethods.R b/R/ExportMethods.R
@@ -557,17 +557,10 @@ function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
 
 
 .exportToTrack_clusters <- function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
-  # Simplify this after the format of dominant_ctss is standardised in all cluster objects.
-  ranges_ <- function(x) {
-    if (inherits(x, "GRanges")) return(IRanges(ranges(x)))
-    IRanges(x)
-  }
-  object$thick <- ranges_(object$dominant_ctss)
+  object$thick <- IRanges(ranges(object$dominant_ctss))
   object$dominant_ctss <- NULL
   names(object) <- NULL
   object$name <- NA
-  object$nr_ctss <- NULL
-  object$tpm.dominant_ctss <- NULL
   exportToTrack( GRanges(object), qLow = qLow, qUp = qUp
                  , colorByExpressionProfile = colorByExpressionProfile
                  , oneTrack = oneTrack)
diff --git a/R/Paraclu.R b/R/Paraclu.R
@@ -150,9 +150,9 @@ setMethod("paraclu", "CTSS",
   clusters <- clusters[(clusters$max_d  >= (minStability * clusters$min_d)) &
                        (width(clusters) <= maxLength)]
   # Compute score and dominant CTSs, and remove singletons as wanted.
-  clusters <-
-    .ctss_summary_for_clusters( object, clusters
-                              , keepSingletonsAbove = keepSingletonsAbove)
+  clusters <- .ctss_summary_for_clusters(object, clusters)
+  # Remove clusters that match only one CTSS unless their expression is high enough
+  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
   # Reduce to non-overlapping as wanted
   if(reduceToNonoverlapping == TRUE){
     o <- findOverlaps(clusters, drop.self = TRUE, type = "within")