Skip to content

Commit dc4a334

Browse files
author
Charles Plessy
committed
Totally replace clusterCTSS() with distclu() and paraclu()
The expression filtering is now done by `filterLowExpCTSS` because 1) it has consequences beyond tag cluster generation and 2) because it simplifies the argument list of the clustering functions. The remaining `clusterCTSS` function had a set of mutually exclusive arguments for `distclu` and `paraclu`; I find it easier to study and use the two methods when they are taken care of by different functions.
1 parent 4702c46 commit dc4a334

36 files changed

+520
-412
lines changed

NAMESPACE

+2-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ export(aggregateTagClusters)
3131
export(annotateCTSS)
3232
export(annotateConsensusClusters)
3333
export(annotateTagClusters)
34-
export(clusterCTSS)
3534
export(consensusClustersDESeq2)
3635
export(consensusClustersGR)
3736
export(consensusClustersSE)
@@ -40,8 +39,10 @@ export(cumulativeCTSSdistribution)
4039
export(distclu)
4140
export(exportToTrack)
4241
export(expressionClasses)
42+
export(filterLowExpCTSS)
4343
export(findStrandInvaders)
4444
export(flagByUpstreamSequences)
45+
export(flagLowExpCTSS)
4546
export(genomeName)
4647
export(getCTSS)
4748
export(getExpressionProfiles)

NEWS.md

+4
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,14 @@ BACKWARDS-INCOMPATIBLE CHANGES
1818
standard behavior.
1919
- In cluster objects, the dominant CTSS score is now stored in the
2020
`dominantCTSS` object directly.
21+
- The `clusterCTSS` function is replaced by the new `paraclu` and `distclu`
22+
functions. CTSS filtering is done beforehand with the new `filterLowExpCTSS`
23+
function.
2124

2225
BUG FIXES
2326

2427
- The `importPublicData` function was repaired for FANTOM samples.
28+
- CTSS filtering now works correctly with `threshold = 0, thresholdIsTpm = TRUE`.
2529

2630
NEW FEATURES
2731

R/AggregationMethods.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
#' @param nrCores Number of cores to use when `useMulticore = TRUE`. Default
3333
#' (`NULL`) uses all detected cores.
3434
#'
35-
#' @details Since the tag clusters (TCs) returned by the [`clusterCTSS`]
35+
#' @details Since the tag clusters (TCs) returned by a CTSS clustering
3636
#' function are constructed separately for every CAGE sample within the CAGEr
3737
#' object, they can differ between samples in both their number, genomic
3838
#' coordinates, position of dominant TSS and overall signal. To be able to
@@ -122,7 +122,7 @@ setMethod( "aggregateTagClusters", "CAGEr"
122122
.aggregateTagClustersGRL(gr.list = TC.list, CAGEexp_obj = object, maxDist = maxDist)
123123

124124
if (excludeSignalBelowThreshold) {
125-
filter <- .filterCtss( object
125+
filter <- flagLowExpCTSS( object
126126
, threshold = tpmThreshold
127127
, nrPassThreshold = 1
128128
, thresholdIsTpm = TRUE)
@@ -244,7 +244,7 @@ setMethod( "CustomConsensusClusters", c("CAGEexp", "GRanges")
244244

245245
clusters <- .ConsensusClusters(clusters)
246246

247-
filter <- .filterCtss( object
247+
filter <- flagLowExpCTSS( object
248248
, threshold = threshold
249249
, nrPassThreshold = nrPassThreshold
250250
, thresholdIsTpm = thresholdIsTpm)

R/CAGEexp.R

+2-1
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,8 @@ setMethod( "initialize", "CAGEexp"
174174
#' CTSStoGenes() |>
175175
#' normalizeTagCount() |>
176176
#' getExpressionProfiles("CTSS") |>
177-
#' clusterCTSS() |>
177+
#' filterLowExpCTSS() |>
178+
#' distclu() |>
178179
#' annotateTagClusters(exampleZv9_annot) |>
179180
#' cumulativeCTSSdistribution("tagClusters") |>
180181
#' quantilePositions("tagClusters") |>

R/CAGEr.R

+66-22
Original file line numberDiff line numberDiff line change
@@ -159,41 +159,85 @@ setMethod("validSamples", "CAGEr", function (object, x){
159159
})
160160

161161

162-
#' @name .filterCtss
163-
#' @noRd
164-
#' @param threshold,nrPassThreshold Only CTSSs with signal \code{>= threshold} in
165-
#' \code{>= nrPassThreshold} experiments will be used for clustering and will
166-
#' contribute towards total signal of the cluster.
167-
#' @param thresholdIsTpm Logical, is threshold raw tag count value (FALSE) or
168-
#' normalized signal (TRUE).
169-
#' @title Private function
170-
#' @details Check if a vector of strings or numbers can be used to identify a sample.
162+
#' Flag CTSSes based on sample expression
163+
#'
164+
#' Flag CTSSes that do not pass an expression threshold in at least a given
165+
#' number of samples. This is typically used to ignore CTSSes that have been
166+
#' seen only once in a single sample, as they can be considered to not be
167+
#' reproduced.
168+
#'
169+
#' @param object An object from the _CAGEr_ package that contains expression
170+
#' values for multiple samples.
171+
#'
172+
#' @param threshold Flag CTSSs with signal `<= threshold`.
173+
#'
174+
#' @param nrPassThreshold Only flag CTSSs when signal is above threshold in
175+
#' fewer than `nrPassThreshold` samples (capped at the number of samples).
176+
#'
177+
#' @param thresholdIsTpm Logical, is threshold raw tag count value (`FALSE`) or
178+
#' normalized signal (`TRUE`).
179+
#'
180+
#' @returns `flagLowExpCTSS` returns a [`Rle`] vector where `TRUE` indicates the
181+
#' index of a CTSS that passes the filter.
182+
#'
183+
#' @export
184+
#'
185+
#' @examples
186+
#' flagLowExpCTSS(exampleCAGEexp, threshold = 100, nrPassThreshold = 2)
171187

172-
setGeneric(".filterCtss", function( object
173-
, threshold = 0
174-
, nrPassThreshold = 1
175-
, thresholdIsTpm = TRUE) {
176-
if (threshold == 0) return(Rle(TRUE))
177-
standardGeneric(".filterCtss")
178-
})
188+
setGeneric("flagLowExpCTSS", function( object
189+
, threshold = 1
190+
, nrPassThreshold = 1
191+
, thresholdIsTpm = TRUE)
192+
standardGeneric("flagLowExpCTSS")
193+
)
179194

180-
setMethod(".filterCtss", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
181-
.filterCtss(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
195+
#' @rdname flagLowExpCTSS
196+
197+
setMethod("flagLowExpCTSS", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
198+
flagLowExpCTSS(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
182199
})
183200

184-
setMethod(".filterCtss", "RangedSummarizedExperiment", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
201+
#' @rdname flagLowExpCTSS
202+
203+
setMethod("flagLowExpCTSS", "RangedSummarizedExperiment", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
185204
assay <- ifelse(thresholdIsTpm, "normalizedTpmMatrix", "counts")
186205
if(assay == "normalizedTpmMatrix" & is.null(assays(object)[[assay]]))
187206
stop("Normalise the CAGEr object first with ", sQuote("normalizeTagCount()"), ".")
188-
.filterCtss(assays(object)[[assay]], threshold, nrPassThreshold, thresholdIsTpm)
207+
flagLowExpCTSS(assays(object)[[assay]], threshold, nrPassThreshold, thresholdIsTpm)
189208
})
190209

191-
setMethod(".filterCtss", "DataFrame", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
210+
#' @rdname flagLowExpCTSS
211+
212+
setMethod("flagLowExpCTSS", "DataFrame", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
192213
nr.pass.threshold <- rowSums.RleDataFrame(lapply(object, \(x) x > threshold) |> DataFrame())
193214
nr.pass.threshold >= min(nrPassThreshold, ncol(object))
194215
})
195216

196-
setMethod(".filterCtss", "matrix", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
217+
#' @rdname flagLowExpCTSS
218+
219+
setMethod("flagLowExpCTSS", "matrix", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
197220
nr.pass.threshold <- rowSums(object > threshold)
198221
nr.pass.threshold >= min(nrPassThreshold, ncol(object))
222+
})
223+
224+
#' @rdname flagLowExpCTSS
225+
#'
226+
#' @return `filterLowExpCTSS` returns the `CAGEr` object where the output of
227+
#' `flagLowExpCTSS` was stored internally.
228+
#'
229+
#' @export
230+
231+
setGeneric("filterLowExpCTSS", function( object
232+
, threshold = 1
233+
, nrPassThreshold = 1
234+
, thresholdIsTpm = TRUE)
235+
standardGeneric("filterLowExpCTSS")
236+
)
237+
238+
#' @rdname flagLowExpCTSS
239+
240+
setMethod("filterLowExpCTSS", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
241+
filteredCTSSidx(object) <- flagLowExpCTSS(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
242+
object
199243
})

R/ClusteringMethods.R

-151
Original file line numberDiff line numberDiff line change
@@ -1,156 +1,5 @@
11
#' @include CTSS.R Multicore.R
22

3-
#' @name clusterCTSS
4-
#'
5-
#' @title Cluster CTSSs into tag clusters
6-
#'
7-
#' @description Clusters individual CAGE transcription start sites (CTSSs) along
8-
#' the genome into tag clusters (TCs) using specified _ab initio_ method, or
9-
#' assigns them to predefined genomic regions.
10-
#'
11-
#' @param object A [`CAGEr`] object.
12-
#'
13-
#' @param threshold,nrPassThreshold Ignore CTSSs with signal `< threshold`
14-
#' in `< nrPassThreshold` experiments.
15-
#'
16-
#' @param thresholdIsTpm Logical indicating if `threshold` is expressed in
17-
#' raw tag counts (`FALSE`) or normalized signal (`TRUE`).
18-
#'
19-
#' @param method Clustering method: `"distclu"` or `"paraclu"`.
20-
#'
21-
#' @param maxDist Maximal distance between two neighbouring CTSSs for them to be
22-
#' part of the same cluster. Used only when `method = "distclu"`,
23-
#' otherwise ignored.
24-
#'
25-
#' @param keepSingletonsAbove Remove "singleton" tag clusters of width 1 with
26-
#' signal `< keepSingletonsAbove`. Default value `0` results in keeping
27-
#' all TCs by default. Setting it to `Inf` removes all singletons.
28-
#'
29-
#' @param minStability Minimal stability of the cluster, where stability is
30-
#' defined as ratio between maximal and minimal density value for which
31-
#' this cluster is maximal scoring. For definition of stability refer to
32-
#' Frith _et al._, Genome Research, 2007. Clusters with stability
33-
#' `< minStability` will be discarded. Used only when `method = "paraclu"`.
34-
#'
35-
#' @param maxLength Maximal length of cluster in base-pairs. Clusters with length
36-
#' `> maxLength` will be discarded.
37-
#'
38-
#' @param reduceToNonoverlapping Logical, should smaller clusters contained
39-
#' within bigger cluster be removed to make a final set of tag clusters
40-
#' non-overlapping. Used only `method = "paraclu"`.
41-
#'
42-
#' @param useMulticore Logical, should multicore be used. `useMulticore = TRUE`
43-
#' has no effect on non-Unix-like platforms.
44-
#'
45-
#' @param nrCores Number of cores to use when `useMulticore = TRUE`. Default
46-
#' value `NULL` uses all detected cores.
47-
#'
48-
#' @details The `"distclu"` method is an implementation of simple distance-based
49-
#' clustering of data attached to sequences, where two neighbouring TSSs are
50-
#' joined together if they are closer than some specified distance (see
51-
#' [`GenomicRanges::reduce`] for implementation details.
52-
#'
53-
#' `"paraclu"` is an implementation of Paraclu algorithm for parametric
54-
#' clustering of data attached to sequences (Frith _et al._, Genome Research,
55-
#' 2007). Since Paraclu finds clusters within clusters (unlike distclu),
56-
#' additional parameters (`keepSingletonsAbove`,
57-
#' `minStability`, `maxLength` and `reduceToNonoverlapping`) can be specified to
58-
#' simplify the output by discarding too small (singletons) or too big clusters,
59-
#' and to reduce the clusters to a final set of non-overlapping clusters.
60-
#'
61-
#' Clustering is done for every CAGE dataset within the CAGEr object separately,
62-
#' resulting in a different set of tag clusters for every CAGE dataset. TCs from
63-
#' different datasets can further be aggregated into a single referent set of
64-
#' consensus clusters by calling the [`aggregateTagClusters`] function.
65-
#'
66-
#' @return Returns the [`CAGEexp`] object, in which, the results will be stored as a `GRangesList` of
67-
#' [`TagClusters`] objects in the metadata slot `tagClusters`. The
68-
#' `TagClusters` objects will contain a `filteredCTSSidx` column if appropriate.
69-
#' The clustering method name is saved in the metadata slot of the `GRangesList`.
70-
#'
71-
#' @references Frith _et al._ (2007) A code for transcription initiation in
72-
#' mammalian genomes, _Genome Research_ **18**(1):1-12,
73-
#' (\href{http://www.cbrc.jp/paraclu/}{http://www.cbrc.jp/paraclu/}).
74-
#'
75-
#' @author Vanja Haberle
76-
#'
77-
#' @seealso [`aggregateTagClusters`]
78-
#'
79-
#' @family CAGEr object modifiers
80-
#' @family CAGEr clusters functions
81-
#'
82-
#' @examples
83-
#'
84-
#' # Using 'distclu', notice argument 'maxDist'
85-
#' ce <- clusterCTSS( exampleCAGEexp, threshold = 50, thresholdIsTpm = TRUE
86-
#' , nrPassThreshold = 1, method = "distclu", maxDist = 20
87-
#' , keepSingletonsAbove = 100)
88-
#' tagClustersGR(ce, "Zf.30p.dome")
89-
#'
90-
#' # Using 'paraclu', notice arguments 'maxLength' and 'minStability'
91-
#' ce <- clusterCTSS( exampleCAGEexp, threshold = 50, thresholdIsTpm = TRUE
92-
#' , nrPassThreshold = 1, method = "paraclu"
93-
#' , keepSingletonsAbove = 100
94-
#' , maxLength = 500, minStability = 1
95-
#' , reduceToNonoverlapping = TRUE)
96-
#' tagClustersGR(ce, "Zf.30p.dome")
97-
#'
98-
#' @export
99-
100-
setGeneric( "clusterCTSS"
101-
, function( object
102-
, threshold = 1, nrPassThreshold = 1, thresholdIsTpm = TRUE
103-
, method = c("distclu", "paraclu"), maxDist = 20
104-
, keepSingletonsAbove = 0
105-
, minStability = 1, maxLength = 500
106-
, reduceToNonoverlapping = TRUE
107-
, useMulticore = FALSE, nrCores = NULL)
108-
standardGeneric("clusterCTSS"))
109-
110-
#' @rdname clusterCTSS
111-
112-
setMethod( "clusterCTSS", "CAGEexp"
113-
, function( object, threshold, nrPassThreshold, thresholdIsTpm, method, maxDist
114-
, keepSingletonsAbove, minStability, maxLength
115-
, reduceToNonoverlapping, useMulticore, nrCores) {
116-
117-
assay <- ifelse(isTRUE(thresholdIsTpm), "normalizedTpmMatrix", "counts")
118-
data <- CTSStagCountSE(object)
119-
120-
if (! "normalizedTpmMatrix" %in% assayNames(data))
121-
stop( "Could not find normalized CAGE signal values, see ?normalizeTagCount.\n"
122-
, "clusterCTSS() needs normalized values to create its output tables, that "
123-
, "include TPM expression columns.")
124-
125-
message("\nFiltering out CTSSs below threshold...")
126-
filteredCTSSidx(object) <-
127-
.filterCtss(data, threshold = threshold
128-
, nrPassThreshold = nrPassThreshold, thresholdIsTpm = thresholdIsTpm)
129-
130-
message("Clustering...")
131-
method <- match.arg(method)
132-
133-
if (method == "distclu") {
134-
ctss.cluster.list <- distclu( object = data[decode(filteredCTSSidx(object)),]
135-
, max.dist = maxDist, keepSingletonsAbove = keepSingletonsAbove)
136-
} else if (method == "paraclu") {
137-
ctss.cluster.list <- paraclu( object = data[decode(filteredCTSSidx(object)),]
138-
, minStability = minStability, maxLength = maxLength
139-
, keepSingletonsAbove = keepSingletonsAbove
140-
, reduceToNonoverlapping = reduceToNonoverlapping
141-
, useMulticore = useMulticore, nrCores = nrCores)
142-
} else if(method == "custom") {
143-
stop("Deprecated method. See ", dQuote("CustomConsensusClusters()"), " instead.")
144-
}
145-
146-
seqlevels(ctss.cluster.list) <- seqlevels(CTSStagCountSE(object))
147-
seqinfo(ctss.cluster.list) <- seqinfo(CTSStagCountSE(object))
148-
# Changing the sequence levels may change the sort order. Re-sort
149-
ctss.cluster.list <- sort(ctss.cluster.list)
150-
metadata(object)$tagClusters <- ctss.cluster.list
151-
object
152-
})
153-
1543
#' @rdname byCtss
1554
#'
1565
#' @title Apply functions to identical CTSSes.

0 commit comments

Comments
 (0)