Merge pull request #21 from camilogarciabotero:dev

Refactor BioMarkovChain struct to be BioJulia compliant using correct alphabet types
BioJulia · Dec 28, 2023 · ae5cfbe · ae5cfbe · camilogarciabotero · Dec 28, 2023
2 parents 8d64c31 + a5b089d
commit ae5cfbe
Show file tree

Hide file tree

Showing 9 changed files with 31 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver
 
 ## [UNRELEASED](https://github.com/camilogarciabotero/GeneFinder.jl/compare/v0.0.10...main)
 
+## [0.9.0]
+
+- `BioMarkoChain` now has a compliant `BioSequences` alphabet.
+
 ## [0.8.1]
 
 - Fix `BioMarkoChain` checks compats.

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioMarkovChains"
 uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c"
 authors = ["Camilo García-Botero<ca.garcia2@uniandes.edu.co>"]
-version = "0.8.1"
+version = "0.9.0"
 
 [deps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"

diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ BioMarkovChain(orfdna, 2)
 ```
 
 ```
-BioMarkovChain with DNA Alphabet:
+BioMarkovChain with DNAAlphabet{4}() Alphabet:
   - Transition Probability Matrix -> Matrix{Float64}(4 × 4):
    0.2123  0.2731  0.278   0.2366
    0.2017  0.3072  0.2687  0.2224
@@ -94,7 +94,7 @@ ECOLICDS
 ```
 
 ```
-BioMarkovChain with DNA Alphabet:
+BioMarkovChain with DNAAlphabet{4}() Alphabet:
   - Transition Probability Matrix -> Matrix{Float64}(4 × 4):
    0.31    0.224   0.199   0.268
    0.251   0.215   0.313   0.221

diff --git a/src/BioMarkovChains.jl b/src/BioMarkovChains.jl
@@ -11,6 +11,8 @@ using BioSequences:
     NucleicAcidAlphabet,
     DNA,
     DNAAlphabet,
+    RNAAlphabet,
+    Alphabet,
 
     #RNA
     RNA,
@@ -27,7 +29,6 @@ using BioSequences:
     #tests and precompilation
 
 using PrecompileTools: @setup_workload, @compile_workload
-# using StatsAPI: StatsAPI, fit, fit!
 using VectorizedKmers: count_kmers
 
 include("types.jl")
@@ -58,6 +59,7 @@ include("extended.jl")
         # they belong to your package or not (on Julia 1.8 and higher)
         transition_count_matrix(seq)
         transition_probability_matrix(seq)
+
     end
 end
 

diff --git a/src/extended.jl b/src/extended.jl
@@ -48,7 +48,7 @@ end
 Base.length(bmc::BioMarkovChain) = length(bmc.inits)
 
 function Base.eltype(bmc::BioMarkovChain)
-    return bmc.statespace
+    return bmc.alphabet
 end
 
 """

diff --git a/src/models.jl b/src/models.jl
@@ -8,7 +8,7 @@ const ECOLICDS = begin
 
     inits = [0.245, 0.243, 0.273, 0.239]
 
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 const ECOLINOCDS = begin
@@ -21,7 +21,7 @@ const ECOLINOCDS = begin
 
     inits = [0.262, 0.239, 0.240, 0.259]
 
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 
@@ -35,7 +35,7 @@ const CPGPOS = begin
 
     inits = [0.262, 0.239, 0.240, 0.259] # not stablished
 
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 
@@ -49,5 +49,5 @@ const CPGNEG = begin
 
     inits = [0.262, 0.239, 0.240, 0.259] # not stablished
 
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
diff --git a/src/transitions.jl b/src/transitions.jl
@@ -144,7 +144,7 @@ function odds_ratio_matrix(
     sequence::SeqOrView{A},
     model::BioMarkovChain;
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     tpm = transition_probability_matrix(sequence)
     return tpm ./ model.tpm
 end
@@ -173,7 +173,7 @@ function log_odds_ratio_matrix(
     model::BioMarkovChain;
     b::Number = ℯ
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
 
     tpm = transition_probability_matrix(sequence)
@@ -206,7 +206,7 @@ function log_odds_ratio_matrix(
     model2::BioMarkovChain;
     b::Number = ℯ
 )
-    @assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
+    @assert model1.alphabet == model2.alphabet "Models state spaces are inconsistent"
     @assert round.(sum(model1.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 1 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
     @assert round.(sum(model2.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 2 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
 
@@ -239,7 +239,7 @@ function log_odds_ratio_score(
     model::BioMarkovChain;
     b::Number = ℯ
 ) where A 
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
 
     tpm = transition_probability_matrix(sequence)
@@ -303,7 +303,7 @@ function dnaseqprobability(
     sequence::NucleicSeqOrView{A},
     model::BioMarkovChain
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     init = model.inits[_dna_to_int(sequence[1])]
 
     probability = init

diff --git a/src/types.jl b/src/types.jl
@@ -6,7 +6,7 @@ abstract type AbstractBioMarkovChain end
 A BioMarkovChain represents a Markov chain used in biological sequence analysis. It contains a transition probability matrix (tpm) and an initial distribution of probabilities (inits) and also the order of the Markov chain.
 
 # Fields
-- `statespace::S`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s.
+- `alphabet::A`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s.
 - `tpm::M`: The transition probability matrix.
 - `inits::I`: The initial distribution of probabilities.
 - `n::N`: The order of the Markov chain.
@@ -34,20 +34,20 @@ BioMarkovChain:
   - Markov Chain Order:2
 ```
 """
-struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
-  statespace::S # The sequence DataType (DNA,RNA,AminoAcid)
+struct BioMarkovChain{A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
+  alphabet::A # The sequence alphabet (DNAAlphabet, RNAAlphabet, AminoAcidAlphabet)
   tpm::M # The probabilities of the TransitionProbabilityMatrix struct
   inits::I # the initials distribution of probabilities
   n::N # The order of the Markov chain
-  function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} 
-    bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
+  function BioMarkovChain(alphabet::A, tpm::M, inits::I, n::N=1) where {A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} 
+    bmc = new{A,M,I,N}(alphabet, n > 1 ? tpm^n : tpm, inits, n)
     return bmc
   end
 
   function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A
     inits = initials(sequence)
     tpm = transition_probability_matrix(sequence)
-    bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n)
+    bmc = new{Alphabet, Matrix{Float64}, Vector{Float64},Int64}(Alphabet(sequence), n > 1 ? tpm^n : tpm, inits, n)
     return bmc
   end
 end

diff --git a/src/utils.jl b/src/utils.jl
@@ -10,13 +10,13 @@ function _dna_to_int(nucleotide::DNA)
     return reinterpret.(Int8, modifier(nucleotide))[1]  #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
 end
 
-function randbmc(statespace::DataType, n::Int64=1)
+function randbmc(A::Alphabet, n::Int64=1)
 
-    if !(statespace in (DNA, RNA, AminoAcid))
-        throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType."))
+    if !(A in (DNAAlphabet{4}(), RNAAlphabet{4}(), AminoAcidAlphabet()))
+        throw(ArgumentError("Alphabet must be of the DNAAlphabet, RNAAlphabet, or AminoAcidAlphabet."))
     end
 
-    nstates = (statespace == AminoAcid) ? 20 : 4
+    nstates = (A == AminoAcidAlphabet) ? 20 : 4
     tpm = rand(nstates, nstates)
 
     # Normalize rows of the transition probability matrix
@@ -28,5 +28,5 @@ function randbmc(statespace::DataType, n::Int64=1)
     initsum = sum(inits)
     @views inits ./= initsum
 
-    return BMC(statespace, tpm, inits, n)
+    return BMC(A, tpm, inits, n)
 end