Skip to content

Commit 4702c46

Browse files
author
Charles Plessy
committed
Singleton filtering now done by paraclu and distclu directly
The updated `.ctss_summary_for_clusters` does not remove clusters anymore. The information on dominant CTSS score and total number of CTSS is now stored in the dominantCTSS object directly. Also, reorder computations so that score is the first metadata column in the cluster objects.
1 parent 8ed1da1 commit 4702c46

6 files changed

+43
-29
lines changed

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export("genomeName<-")
1111
export("inputFiles<-")
1212
export("inputFilesType<-")
1313
export("sampleLabels<-")
14+
export(.ctss_summary_for_clusters)
1415
export(CAGEexp)
1516
export(CTSS)
1617
export(CTSScoordinatesGR)

NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ BACKWARDS-INCOMPATIBLE CHANGES
1616
- The `removeSingletons` option of clustering methods is removed and the
1717
default value of `keepSingletonsAbove` is set to `0`, which keeps the
1818
standard behavior.
19+
- In cluster objects, the dominant CTSS score is now stored in the
20+
`dominantCTSS` object directly.
1921

2022
BUG FIXES
2123

@@ -34,6 +36,9 @@ NEW FEATURES
3436
OTHER CHANGES
3537

3638
- Accelerated the computation of cumulative sums ~10×.
39+
- Singleton filtering is now done by the `paraclu` and `distclu` functions
40+
themselves; `.ctss_summary_for_clusters` does not change the input clusters
41+
except for adding information.
3742

3843
# Changes in version 2.11.1
3944

R/ClusteringFunctions.R

+30-16
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,46 @@
11
#' @include CAGEexp.R CTSS.R
22
NULL
33

4-
#' @noRd
4+
#' Summarise CTSSs included in clusters
5+
#'
6+
#' @param ctss A [`CTSS`] object.
7+
#'
8+
#' @param clusters A [`TagClusters`], [`ConsensusClusters`] or any other
9+
#' object implementing the [`GRanges`] class.
10+
#'
11+
#' @return The `clusters` object with a new `dominant_ctss` metadata in `CTSS`
12+
#' format reporting the genomic coordinate and expression score of the most
13+
#' highly expressed position in each cluster, plus a `nr_ctss` metadata reporting
14+
#' the number of expressed CTSSs in each cluster.
515
#'
616
#' @importFrom S4Vectors queryHits subjectHits runLength runValue
17+
#' @export
718
#'
819
#' @examples
920
#' # See also benchmarks/dominant_ctss.md
10-
#' (ctss <- GRanges('chr1', IRanges(start = 1:10, end = 1:10), '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
11-
#' (clusters <- GRanges('chr1', IRanges(start = c(1,9), end = c(8,10)), '+'))
21+
#' (ctss <- CTSS( 'chr1', IRanges(start = 1:10, end = 1:10)
22+
#' , '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
23+
#' (clusters <- GRanges( 'chr1', IRanges(start = c(1,9)
24+
#' , end = c(8,10)), '+')) |> as("TagClusters")
1225
#'
1326
#' # The function assumes that all CTSSes have a score above zero
14-
#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters, keepSingletonsAbove = Inf)
27+
#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters)
1528
#' # If not the case, it will give an incorrect nr_ctss (which would mislead singleton filtering in callers)
16-
#' .ctss_summary_for_clusters(ctss, clusters, keepSingletonsAbove = Inf)
29+
#' .ctss_summary_for_clusters(ctss, clusters)
1730
#'
1831
#' # The function needs its input to be sorted and is not going to check it.
1932
#' .ctss_summary_for_clusters(rev(ctss), clusters)
2033
#' .ctss_summary_for_clusters(ctss, rev(clusters))
2134
#'
2235
#' # Ties are resolved with 5' preference for both plus and minus strands.
2336
#' # This may create a small bias.
24-
#' .ctss_summary_for_clusters(ctss |> plyranges::mutate(strand = '-'), clusters |> plyranges::mutate(strand = '-'))
37+
#' ctss_minus <- ctss
38+
#' strand(ctss_minus) <- '-'
39+
#' clusters_minus <- clusters
40+
#' strand(clusters_minus) <- '-'
41+
#' .ctss_summary_for_clusters(ctss_minus, clusters_minus)
2542

26-
.ctss_summary_for_clusters <- function(ctss, clusters, keepSingletonsAbove = 0) {
43+
.ctss_summary_for_clusters <- function(ctss, clusters) {
2744
# Match the clusters and the CTSS
2845
o <- findOverlaps(clusters, ctss)
2946

@@ -47,21 +64,18 @@ NULL
4764
# Find absolute position of dominant CTSS in each run.
4865
global_max_ids <- cluster_start_idx + local_max_idx - 1
4966

50-
# Record dominant CTSS as GRanges object.
51-
clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids]
52-
53-
# Record dominant CTSS score. Mabye we should use its GRanges's score instead.
54-
clusters$tpm.dominant_ctss <- score(ctss)[subjectHits(o)][global_max_ids]
55-
5667
# Record total expression of the cluster
5768
score(clusters) <- Rle(sum(grouped_scores))
69+
70+
# Record dominant CTSS as CTSS object.
71+
clusters$dominant_ctss <- CTSS(granges(ctss)[subjectHits(o)][global_max_ids])
72+
73+
# Record dominant CTSS score.
74+
score(clusters$dominant_ctss) <- score(ctss)[subjectHits(o)][global_max_ids]
5875

5976
# Count the number of clusters
6077
clusters$nr_ctss <- rl
6178

62-
# Remove clusters that match only one CTSS unless their expression is high enough
63-
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
64-
6579
# Give numerical names to the clusters
6680
names(clusters) <- seq_along(clusters)
6781

R/Distclu.R

+3-2
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ setMethod("distclu", "SummarizedExperiment",
5050

5151
.distclu_CTSS <- function(object, max.dist, keepSingletonsAbove) {
5252
clusters <- reduce(GRanges(object), min = max.dist)
53-
clusters <- .ctss_summary_for_clusters(object, clusters,
54-
keepSingletonsAbove = keepSingletonsAbove)
53+
clusters <- .ctss_summary_for_clusters(object, clusters)
54+
# Remove clusters that match only one CTSS unless their expression is high enough
55+
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
5556
names(clusters) <- seq_along(clusters)
5657
as(clusters, "TagClusters")
5758
}

R/ExportMethods.R

+1-8
Original file line numberDiff line numberDiff line change
@@ -557,17 +557,10 @@ function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
557557

558558

559559
.exportToTrack_clusters <- function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
560-
# Simplify this after the format of dominant_ctss is standardised in all cluster objects.
561-
ranges_ <- function(x) {
562-
if (inherits(x, "GRanges")) return(IRanges(ranges(x)))
563-
IRanges(x)
564-
}
565-
object$thick <- ranges_(object$dominant_ctss)
560+
object$thick <- IRanges(ranges(object$dominant_ctss))
566561
object$dominant_ctss <- NULL
567562
names(object) <- NULL
568563
object$name <- NA
569-
object$nr_ctss <- NULL
570-
object$tpm.dominant_ctss <- NULL
571564
exportToTrack( GRanges(object), qLow = qLow, qUp = qUp
572565
, colorByExpressionProfile = colorByExpressionProfile
573566
, oneTrack = oneTrack)

R/Paraclu.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@ setMethod("paraclu", "CTSS",
150150
clusters <- clusters[(clusters$max_d >= (minStability * clusters$min_d)) &
151151
(width(clusters) <= maxLength)]
152152
# Compute score and dominant CTSS; singletons are removed in the next step.
153-
clusters <-
154-
.ctss_summary_for_clusters( object, clusters
155-
, keepSingletonsAbove = keepSingletonsAbove)
153+
clusters <- .ctss_summary_for_clusters(object, clusters)
154+
# Remove clusters that match only one CTSS unless their expression is high enough
155+
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
156156
# Reduce to non-overlapping as wanted
157157
if(reduceToNonoverlapping == TRUE){
158158
o <- findOverlaps(clusters, drop.self = TRUE, type = "within")

0 commit comments

Comments
 (0)