From ad9819ff80ed5934eefbe3179532e42656ac8940 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= <ca.garcia2@uniandes.edu.co>
Date: Wed, 27 Dec 2023 16:54:49 -0500
Subject: [PATCH 1/4] Refactor BioMarkovChain struct to be BioJulia compliangt
 using correct alphabet types

---
 src/BioMarkovChains.jl |  4 +++-
 src/extended.jl        |  2 +-
 src/models.jl          |  8 ++++----
 src/transitions.jl     | 10 +++++-----
 src/types.jl           | 12 ++++++------
 src/utils.jl           | 10 +++++-----
 6 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/BioMarkovChains.jl b/src/BioMarkovChains.jl
index 35289da..ee98b86 100644
--- a/src/BioMarkovChains.jl
+++ b/src/BioMarkovChains.jl
@@ -11,6 +11,8 @@ using BioSequences:
     NucleicAcidAlphabet,
     DNA,
     DNAAlphabet,
+    RNAAlphabet,
+    Alphabet,
     
     #RNA
     RNA,
@@ -27,7 +29,6 @@ using BioSequences:
     #tests and precompilation
 
 using PrecompileTools: @setup_workload, @compile_workload
-# using StatsAPI: StatsAPI, fit, fit!
 using VectorizedKmers: count_kmers
 
 include("types.jl")
@@ -58,6 +59,7 @@ include("extended.jl")
         # they belong to your package or not (on Julia 1.8 and higher)
         transition_count_matrix(seq)
         transition_probability_matrix(seq)
+
     end
 end
 
diff --git a/src/extended.jl b/src/extended.jl
index 326dec0..51f2079 100644
--- a/src/extended.jl
+++ b/src/extended.jl
@@ -48,7 +48,7 @@ end
 Base.length(bmc::BioMarkovChain) = length(bmc.inits)
 
 function Base.eltype(bmc::BioMarkovChain)
-    return bmc.statespace
+    return bmc.alphabet
 end
 
 """
diff --git a/src/models.jl b/src/models.jl
index 48cbeba..a202bce 100644
--- a/src/models.jl
+++ b/src/models.jl
@@ -8,7 +8,7 @@ const ECOLICDS = begin
 
     inits = [0.245, 0.243, 0.273, 0.239]
 
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 const ECOLINOCDS = begin
@@ -21,7 +21,7 @@ const ECOLINOCDS = begin
     
     inits = [0.262, 0.239, 0.240, 0.259]
     
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 
@@ -35,7 +35,7 @@ const CPGPOS = begin
     
     inits = [0.262, 0.239, 0.240, 0.259] # not stablished
     
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
 
 
@@ -49,5 +49,5 @@ const CPGNEG = begin
     
     inits = [0.262, 0.239, 0.240, 0.259] # not stablished
     
-    BMC(DNA, tpm, inits)
+    BMC(DNAAlphabet{4}(), tpm, inits)
 end
\ No newline at end of file
diff --git a/src/transitions.jl b/src/transitions.jl
index 6a04301..d6115ea 100644
--- a/src/transitions.jl
+++ b/src/transitions.jl
@@ -144,7 +144,7 @@ function odds_ratio_matrix(
     sequence::SeqOrView{A},
     model::BioMarkovChain;
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     tpm = transition_probability_matrix(sequence)
     return tpm ./ model.tpm
 end
@@ -173,7 +173,7 @@ function log_odds_ratio_matrix(
     model::BioMarkovChain;
     b::Number = ℯ
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
     
     tpm = transition_probability_matrix(sequence)
@@ -206,7 +206,7 @@ function log_odds_ratio_matrix(
     model2::BioMarkovChain;
     b::Number = ℯ
 )
-    @assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
+    @assert model1.alphabet == model2.alphabet "Models state spaces are inconsistent"
     @assert round.(sum(model1.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 1 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
     @assert round.(sum(model2.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 2 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
 
@@ -239,7 +239,7 @@ function log_odds_ratio_score(
     model::BioMarkovChain;
     b::Number = ℯ
 ) where A 
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     @assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."  
 
     tpm = transition_probability_matrix(sequence)
@@ -303,7 +303,7 @@ function dnaseqprobability(
     sequence::NucleicSeqOrView{A},
     model::BioMarkovChain
 ) where A
-    @assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
+    @assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
     init = model.inits[_dna_to_int(sequence[1])]
 
     probability = init
diff --git a/src/types.jl b/src/types.jl
index 0f22a18..055f023 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -6,7 +6,7 @@ abstract type AbstractBioMarkovChain end
 A BioMarkovChain represents a Markov chain used in biological sequence analysis. It contains a transition probability matrix (tpm) and an initial distribution of probabilities (inits) and also the order of the Markov chain.
 
 # Fields
-- `statespace::S`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s.
+- `alphabet::A`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s.
 - `tpm::M`: The transition probability matrix.
 - `inits::I`: The initial distribution of probabilities.
 - `n::N`: The order of the Markov chain.
@@ -34,20 +34,20 @@ BioMarkovChain:
   - Markov Chain Order:2
 ```
 """
-struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
-  statespace::S # The sequence DataType (DNA,RNA,AminoAcid)
+struct BioMarkovChain{A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
+  alphabet::A # The sequence alphabet (DNAAlphabet, RNAAlphabet, AminoAcidAlphabet)
   tpm::M # The probabilities of the TransitionProbabilityMatrix struct
   inits::I # the initials distribution of probabilities
   n::N # The order of the Markov chain
-  function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} 
-    bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
+  function BioMarkovChain(alphabet::A, tpm::M, inits::I, n::N=1) where {A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} 
+    bmc = new{A,M,I,N}(alphabet, n > 1 ? tpm^n : tpm, inits, n)
     return bmc
   end
 
   function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A
     inits = initials(sequence)
     tpm = transition_probability_matrix(sequence)
-    bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n)
+    bmc = new{Alphabet, Matrix{Float64}, Vector{Float64},Int64}(Alphabet(sequence), n > 1 ? tpm^n : tpm, inits, n)
     return bmc
   end
 end
diff --git a/src/utils.jl b/src/utils.jl
index 9ba8e16..ea9e2af 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -10,13 +10,13 @@ function _dna_to_int(nucleotide::DNA)
     return reinterpret.(Int8, modifier(nucleotide))[1]  #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
 end
 
-function randbmc(statespace::DataType, n::Int64=1)
+function randbmc(A::Alphabet, n::Int64=1)
 
-    if !(statespace in (DNA, RNA, AminoAcid))
-        throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType."))
+    if !(A in (DNAAlphabet{4}(), RNAAlphabet{4}(), AminoAcidAlphabet()))
+        throw(ArgumentError("Alphabet must be of the DNAAlphabet, RNAAlphabet, or AminoAcidAlphabet."))
     end
 
-    nstates = (statespace == AminoAcid) ? 20 : 4
+    nstates = (A == AminoAcidAlphabet) ? 20 : 4
     tpm = rand(nstates, nstates)
     
     # Normalize rows of the transition probability matrix
@@ -28,5 +28,5 @@ function randbmc(statespace::DataType, n::Int64=1)
     initsum = sum(inits)
     @views inits ./= initsum
     
-    return BMC(statespace, tpm, inits, n)
+    return BMC(A, tpm, inits, n)
 end

From 33708e52aa5aa16502322be7c7cef31b876325af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= <ca.garcia2@uniandes.edu.co>
Date: Thu, 28 Dec 2023 13:06:36 -0500
Subject: [PATCH 2/4] Update CHANGELOG.md

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16fa56a..d126990 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver
 
 ## [UNRELEASED](https://github.com/camilogarciabotero/GeneFinder.jl/compare/v0.0.10...main)
 
+## [0.9.0]
+
+- `BioMarkoChain` now has a compliant `BioSequences` alphabet.
+
 ## [0.8.1]
 
 - Fix `BioMarkoChain` checks compats.

From 76dc6b000183fb51181c38f1d223bc98ed3eef67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= <ca.garcia2@uniandes.edu.co>
Date: Thu, 28 Dec 2023 13:06:45 -0500
Subject: [PATCH 3/4] Update version to 0.9.0 in Project.toml

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index bafaa23..3406cb2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioMarkovChains"
 uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c"
 authors = ["Camilo García-Botero<ca.garcia2@uniandes.edu.co>"]
-version = "0.8.1"
+version = "0.9.0"
 
 [deps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"

From a5b089d5b7881dd44024fc9ca3660f60d3649b11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= <ca.garcia2@uniandes.edu.co>
Date: Thu, 28 Dec 2023 13:10:11 -0500
Subject: [PATCH 4/4] Update BioMarkovChain with DNAAlphabet{4}() Alphabet

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e4526ff..01a6c00 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ BioMarkovChain(orfdna, 2)
 ```
 
 ```
-BioMarkovChain with DNA Alphabet:
+BioMarkovChain with DNAAlphabet{4}() Alphabet:
   - Transition Probability Matrix -> Matrix{Float64}(4 × 4):
    0.2123  0.2731  0.278   0.2366
    0.2017  0.3072  0.2687  0.2224
@@ -94,7 +94,7 @@ ECOLICDS
 ```
 
 ```
-BioMarkovChain with DNA Alphabet:
+BioMarkovChain with DNAAlphabet{4}() Alphabet:
   - Transition Probability Matrix -> Matrix{Float64}(4 × 4):
    0.31    0.224   0.199   0.268
    0.251   0.215   0.313   0.221