Skip to content

Commit 7a4f5ef

Browse files
fitPowerLaw: S4 function accepting various CAGEr objects.
The goal is to easily record powerLaw fitting parameters in a CAGEexp object. Also change .fit.power.law.to.reverse.cumulative to use Rle instead of data.table to compute the reverse cumulative distributions. Next: make the normalisation and plotting functions reuse the fitted values if they already are found in the metadata.
1 parent 18d7295 commit 7a4f5ef

File tree

4 files changed

+152
-49
lines changed

4 files changed

+152
-49
lines changed

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ export(cumulativeCTSSdistribution)
3939
export(exportToTrack)
4040
export(expressionClasses)
4141
export(findStrandInvaders)
42+
export(fitPowerLaw)
4243
export(flagByUpstreamSequences)
4344
export(genomeName)
4445
export(getCTSS)

R/ExportMethods.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,7 @@ setMethod( "plotReverseCumulatives", "CAGEr"
105105
, normalized = CTSSnormalizedTpmDF(object))
106106

107107
if(! is.null(fitInRange)) {
108-
fit.coefs.m <- as.matrix(data.frame(lapply(tag.count, function(x) {.fit.power.law.to.reverse.cumulative(values = decode(x), val.range = fitInRange)})))
109-
fit.slopes <- fit.coefs.m[1,]
108+
fit.slopes <- fitPowerLaw(tag.count)$plSlope
110109
names(fit.slopes) <- sample.labels
111110
reference.slope <- min(median(fit.slopes), -1.05)
112111
reference.library.size <- 10^floor(log10(median(sapply(tag.count, sum))))

R/NormalizationFunctions.R

+95-47
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ setGeneric(".powerLaw", function(tag.counts, fitInRange = c(10, 1000), alpha = 1
3434
standardGeneric(".powerLaw")})
3535

3636
setMethod(".powerLaw", "numeric", function (tag.counts, fitInRange, alpha, T) {
37-
fit.coef <- .fit.power.law.to.reverse.cumulative(values = tag.counts, val.range = fitInRange)
37+
fit.coef <- fitPowerLaw(tag.counts, fitInRange)
3838
.normalize.to.reference.power.law.distribution(values = tag.counts, lin.reg.coef = fit.coef, alpha = alpha, T = T)
3939
})
4040

@@ -50,60 +50,108 @@ setMethod(".powerLaw", "DataFrame", function (tag.counts, fitInRange, alpha, T)
5050
DataFrame(sapply(tag.counts, function(X) .powerLaw(X, fitInRange, alpha, T)))
5151
})
5252

53-
#' @name .fit.power.law.to.reverse.cumulative
53+
#' Fit a power law to the CAGE data
5454
#'
5555
#' Function that fits power-law distribution to reverse cumulative of given
56-
#' values - fitting is done using only the range specified in val.range
56+
#' values. Fitting is done using only a specified range.
5757
#'
58-
#' @return Two coefficients describing fitted power-law distribution (a = slope
59-
#' in the logX-logY plot (where X=signal, Y=number of sites that have >= of given
60-
#' signal), b = intercept in the logX-logY plot)
58+
#' @param x The object containing the data.
59+
#' @param fitInRange The range in which to fit the power law.
60+
#'
61+
#' @return Returns the slope (`plSlope`) and the intercept (`plInt`) of the
62+
#' fitted power-law distribution. If the input is a single sample, function
63+
#' returns a numeric vector. If the input is multiple samples, the function
64+
#' returns a `DataFrame`. If the input is a whole _CAGEexp_ object, the
65+
#' function returns the object with information added to its `colData`.
6166
#'
62-
#' @importFrom data.table data.table setkeyv setnames
6367
#' @importFrom stats coefficients lm
64-
#' @noRd
68+
#'
69+
#' @author Vanja Haberle
70+
#' @author Charles Plessy
71+
#'
72+
#' @export
6573

66-
.fit.power.law.to.reverse.cumulative <- function(values, val.range = c(10, 1000)) {
74+
setGeneric("fitPowerLaw", function(x, fitInRange = c(10, 1000)) {
75+
standardGeneric("fitPowerLaw")})
6776

68-
# using data.table package
69-
v <- data.table(num = 1, nr_tags = values)
70-
num <- nr_tags <- NULL # Keep R CMD check happy.
71-
v <- v[, sum(num), by = nr_tags]
72-
setkeyv(v, "nr_tags")
73-
v$V1 <- rev(cumsum(rev(v$V1)))
74-
setnames(v, c('nr_tags', 'reverse_cumulative'))
75-
76-
# check if range values are > 0
77-
if(any(val.range < 1)){
78-
stop(paste("The range of values for fitting the power law",
79-
"arg 'fitInRange', expects integers > 0."))
80-
}
81-
82-
v <- v[nr_tags >= min(val.range) & nr_tags <= max(val.range)]
83-
84-
# check if specified range values have at least 1 entry in v
85-
if(nrow(v) < 1){
86-
stop(paste("Selected range for fitting the power law does not contain any",
87-
"tag count values. Consider changing/increasing 'fitInRange'."))
88-
}
89-
# v <- aggregate(values, by = list(values), FUN = length)
90-
# v$x <- rev(cumsum(rev(v$x)))
91-
# colnames(v) <- c('nr_tags', 'reverse_cumulative')
92-
# v <- subset(v, nr_tags >= min(val.range) & nr_tags <= max(val.range))
93-
94-
lin.m <- lm(log(reverse_cumulative) ~ log(nr_tags), data = v)
95-
96-
a <- coefficients(lin.m)[2]
97-
b <- coefficients(lin.m)[1]
98-
99-
# check if specified range values have >1 entries in v
100-
if(is.na(a) && b == 0){
101-
stop(paste("Selected range for fitting the power law does not",
102-
"contain enough values. Consider changing/increasing 'fitInRange'."))
103-
}
104-
return(c(a, b))
105-
}
77+
#' @rdname fitPowerLaw
78+
#' @examples
79+
#' exampleCAGEexp |> fitPowerLaw() |> colData()
10680

81+
setMethod("fitPowerLaw", "CAGEexp", function (x, fitInRange) {
82+
DF <- fitPowerLaw(CTSStagCountDF(x), fitInRange)
83+
colData(x) <- cbind(colData(x), DF)
84+
x
85+
})
86+
87+
#' @rdname fitPowerLaw
88+
#' @examples
89+
#' exampleCAGEexp |> CTSStagCountDF() |> fitPowerLaw()
90+
91+
setMethod("fitPowerLaw", "DataFrame", function (x, fitInRange) {
92+
m <- sapply(x, fitPowerLaw, fitInRange)
93+
DF <- as(t(m), "DataFrame")
94+
rownames(DF) <- names(x)
95+
DF
96+
})
97+
98+
#' @rdname fitPowerLaw
99+
#' @examples
100+
#' exampleCAGEexp |> CTSStagCountGR('all') |> fitPowerLaw()
101+
102+
setMethod("fitPowerLaw", "GRangesList", function (x, fitInRange) {
103+
m <- sapply(x, fitPowerLaw, fitInRange)
104+
DF <- as(t(m), "DataFrame")
105+
rownames(DF) <- names(x)
106+
DF
107+
})
108+
109+
#' @rdname fitPowerLaw
110+
#' @examples
111+
#' exampleCAGEexp |> CTSStagCountGR(1) |> fitPowerLaw()
112+
113+
setMethod("fitPowerLaw", "GRanges", function (x, fitInRange) {
114+
fitPowerLaw(score(x), fitInRange)
115+
})
116+
117+
#' @rdname fitPowerLaw
118+
#' @examples
119+
#' exampleCAGEexp |> CTSStagCountGR(1) |> score() |> fitPowerLaw()
120+
121+
setMethod("fitPowerLaw", "Rle", function (x, fitInRange) {
122+
.fit.power.law.to.reverse.cumulative(x, fitInRange)
123+
})
124+
125+
#' @rdname fitPowerLaw
126+
#' @examples
127+
#' exampleCAGEexp |> CTSStagCountGR(1) |> score() |> decode() |> fitPowerLaw()
128+
129+
setMethod("fitPowerLaw", "numeric", function(x, fitInRange) {
130+
.fit.power.law.to.reverse.cumulative(x, fitInRange)
131+
})
132+
133+
.fit.power.law.to.reverse.cumulative <- function(values, val.range = c(10, 1000)) {
134+
# Using Rle
135+
v <- sort(Rle(values), decr = TRUE)
136+
nr_tags <- rev(runValue(v))
137+
reverse_cumulative <- v |> runLength() |> cumsum() |> rev()
138+
139+
inRange <- nr_tags >= min(val.range) & nr_tags <= max(val.range)
140+
141+
lin.m <- lm(log(reverse_cumulative[inRange]) ~ log(nr_tags[inRange]))
142+
143+
a <- coefficients(lin.m)[2]
144+
names(a) <- 'plSlope'
145+
b <- coefficients(lin.m)[1]
146+
names(b) <- 'plInt'
147+
148+
# check if specified range values have >1 entries in v
149+
if(is.na(a) && b == 0){
150+
stop(paste("Selected range for fitting the power law does not",
151+
"contain enough values. Consider changing/increasing 'fitInRange'."))
152+
}
153+
c(a, b)
154+
}
107155

108156
#' .normalize.to.reference.power.law.distribution
109157
#'

man/fitPowerLaw.Rd

+55
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)