From ad9819ff80ed5934eefbe3179532e42656ac8940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Wed, 27 Dec 2023 16:54:49 -0500 Subject: [PATCH 1/4] Refactor BioMarkovChain struct to be BioJulia compliangt using correct alphabet types --- src/BioMarkovChains.jl | 4 +++- src/extended.jl | 2 +- src/models.jl | 8 ++++---- src/transitions.jl | 10 +++++----- src/types.jl | 12 ++++++------ src/utils.jl | 10 +++++----- 6 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/BioMarkovChains.jl b/src/BioMarkovChains.jl index 35289da..ee98b86 100644 --- a/src/BioMarkovChains.jl +++ b/src/BioMarkovChains.jl @@ -11,6 +11,8 @@ using BioSequences: NucleicAcidAlphabet, DNA, DNAAlphabet, + RNAAlphabet, + Alphabet, #RNA RNA, @@ -27,7 +29,6 @@ using BioSequences: #tests and precompilation using PrecompileTools: @setup_workload, @compile_workload -# using StatsAPI: StatsAPI, fit, fit! using VectorizedKmers: count_kmers include("types.jl") @@ -58,6 +59,7 @@ include("extended.jl") # they belong to your package or not (on Julia 1.8 and higher) transition_count_matrix(seq) transition_probability_matrix(seq) + end end diff --git a/src/extended.jl b/src/extended.jl index 326dec0..51f2079 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -48,7 +48,7 @@ end Base.length(bmc::BioMarkovChain) = length(bmc.inits) function Base.eltype(bmc::BioMarkovChain) - return bmc.statespace + return bmc.alphabet end """ diff --git a/src/models.jl b/src/models.jl index 48cbeba..a202bce 100644 --- a/src/models.jl +++ b/src/models.jl @@ -8,7 +8,7 @@ const ECOLICDS = begin inits = [0.245, 0.243, 0.273, 0.239] - BMC(DNA, tpm, inits) + BMC(DNAAlphabet{4}(), tpm, inits) end const ECOLINOCDS = begin @@ -21,7 +21,7 @@ const ECOLINOCDS = begin inits = [0.262, 0.239, 0.240, 0.259] - BMC(DNA, tpm, inits) + BMC(DNAAlphabet{4}(), tpm, inits) end @@ -35,7 +35,7 @@ const CPGPOS = begin inits = [0.262, 0.239, 0.240, 0.259] # not stablished - BMC(DNA, tpm, inits) + BMC(DNAAlphabet{4}(), tpm, inits) end @@ -49,5 +49,5 @@ const CPGNEG = begin inits = [0.262, 0.239, 0.240, 0.259] # not stablished - BMC(DNA, tpm, inits) + BMC(DNAAlphabet{4}(), tpm, inits) end \ No newline at end of file diff --git a/src/transitions.jl b/src/transitions.jl index 6a04301..d6115ea 100644 --- a/src/transitions.jl +++ b/src/transitions.jl @@ -144,7 +144,7 @@ function odds_ratio_matrix( sequence::SeqOrView{A}, model::BioMarkovChain; ) where A - @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent." + @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent." tpm = transition_probability_matrix(sequence) return tpm ./ model.tpm end @@ -173,7 +173,7 @@ function log_odds_ratio_matrix( model::BioMarkovChain; b::Number = ℯ ) where A - @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent." + @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent." @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1." tpm = transition_probability_matrix(sequence) @@ -206,7 +206,7 @@ function log_odds_ratio_matrix( model2::BioMarkovChain; b::Number = ℯ ) - @assert model1.statespace == model2.statespace "Models state spaces are inconsistent" + @assert model1.alphabet == model2.alphabet "Models state spaces are inconsistent" @assert round.(sum(model1.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 1 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1." @assert round.(sum(model2.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 2 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1." @@ -239,7 +239,7 @@ function log_odds_ratio_score( model::BioMarkovChain; b::Number = ℯ ) where A - @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent." + @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent." @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1." tpm = transition_probability_matrix(sequence) @@ -303,7 +303,7 @@ function dnaseqprobability( sequence::NucleicSeqOrView{A}, model::BioMarkovChain ) where A - @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent." + @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent." init = model.inits[_dna_to_int(sequence[1])] probability = init diff --git a/src/types.jl b/src/types.jl index 0f22a18..055f023 100644 --- a/src/types.jl +++ b/src/types.jl @@ -6,7 +6,7 @@ abstract type AbstractBioMarkovChain end A BioMarkovChain represents a Markov chain used in biological sequence analysis. It contains a transition probability matrix (tpm) and an initial distribution of probabilities (inits) and also the order of the Markov chain. # Fields -- `statespace::S`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s. +- `alphabet::A`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s. - `tpm::M`: The transition probability matrix. - `inits::I`: The initial distribution of probabilities. - `n::N`: The order of the Markov chain. @@ -34,20 +34,20 @@ BioMarkovChain: - Markov Chain Order:2 ``` """ -struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain - statespace::S # The sequence DataType (DNA,RNA,AminoAcid) +struct BioMarkovChain{A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain + alphabet::A # The sequence alphabet (DNAAlphabet, RNAAlphabet, AminoAcidAlphabet) tpm::M # The probabilities of the TransitionProbabilityMatrix struct inits::I # the initials distribution of probabilities n::N # The order of the Markov chain - function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} - bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n) + function BioMarkovChain(alphabet::A, tpm::M, inits::I, n::N=1) where {A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} + bmc = new{A,M,I,N}(alphabet, n > 1 ? tpm^n : tpm, inits, n) return bmc end function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A inits = initials(sequence) tpm = transition_probability_matrix(sequence) - bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n) + bmc = new{Alphabet, Matrix{Float64}, Vector{Float64},Int64}(Alphabet(sequence), n > 1 ? tpm^n : tpm, inits, n) return bmc end end diff --git a/src/utils.jl b/src/utils.jl index 9ba8e16..ea9e2af 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -10,13 +10,13 @@ function _dna_to_int(nucleotide::DNA) return reinterpret.(Int8, modifier(nucleotide))[1] #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A)) end -function randbmc(statespace::DataType, n::Int64=1) +function randbmc(A::Alphabet, n::Int64=1) - if !(statespace in (DNA, RNA, AminoAcid)) - throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType.")) + if !(A in (DNAAlphabet{4}(), RNAAlphabet{4}(), AminoAcidAlphabet())) + throw(ArgumentError("Alphabet must be of the DNAAlphabet, RNAAlphabet, or AminoAcidAlphabet.")) end - nstates = (statespace == AminoAcid) ? 20 : 4 + nstates = (A == AminoAcidAlphabet) ? 20 : 4 tpm = rand(nstates, nstates) # Normalize rows of the transition probability matrix @@ -28,5 +28,5 @@ function randbmc(statespace::DataType, n::Int64=1) initsum = sum(inits) @views inits ./= initsum - return BMC(statespace, tpm, inits, n) + return BMC(A, tpm, inits, n) end From 33708e52aa5aa16502322be7c7cef31b876325af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Thu, 28 Dec 2023 13:06:36 -0500 Subject: [PATCH 2/4] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16fa56a..d126990 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver ## [UNRELEASED](https://github.com/camilogarciabotero/GeneFinder.jl/compare/v0.0.10...main) +## [0.9.0] + +- `BioMarkoChain` now has a compliant `BioSequences` alphabet. + ## [0.8.1] - Fix `BioMarkoChain` checks compats. From 76dc6b000183fb51181c38f1d223bc98ed3eef67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Thu, 28 Dec 2023 13:06:45 -0500 Subject: [PATCH 3/4] Update version to 0.9.0 in Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index bafaa23..3406cb2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioMarkovChains" uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c" authors = ["Camilo García-Botero"] -version = "0.8.1" +version = "0.9.0" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" From a5b089d5b7881dd44024fc9ca3660f60d3649b11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Thu, 28 Dec 2023 13:10:11 -0500 Subject: [PATCH 4/4] Update BioMarkovChain with DNAAlphabet{4}() Alphabet --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e4526ff..01a6c00 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ BioMarkovChain(orfdna, 2) ``` ``` -BioMarkovChain with DNA Alphabet: +BioMarkovChain with DNAAlphabet{4}() Alphabet: - Transition Probability Matrix -> Matrix{Float64}(4 × 4): 0.2123 0.2731 0.278 0.2366 0.2017 0.3072 0.2687 0.2224 @@ -94,7 +94,7 @@ ECOLICDS ``` ``` -BioMarkovChain with DNA Alphabet: +BioMarkovChain with DNAAlphabet{4}() Alphabet: - Transition Probability Matrix -> Matrix{Float64}(4 × 4): 0.31 0.224 0.199 0.268 0.251 0.215 0.313 0.221