Skip to content

Commit

Permalink
Merge pull request #21 from camilogarciabotero:dev
Browse files Browse the repository at this point in the history
Refactor BioMarkovChain struct to be BioJulia compliant using correct alphabet types
  • Loading branch information
camilogarciabotero authored Dec 28, 2023
2 parents 8d64c31 + a5b089d commit ae5cfbe
Show file tree
Hide file tree
Showing 9 changed files with 31 additions and 25 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ The format is based on Keep a Changelog and this project adheres to Semantic Ver

## [UNRELEASED](https://github.com/camilogarciabotero/GeneFinder.jl/compare/v0.0.10...main)

## [0.9.0]

- `BioMarkovChain` now has a compliant `BioSequences` alphabet.

## [0.8.1]

- Fix `BioMarkovChain` compat checks.
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioMarkovChains"
uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c"
authors = ["Camilo García-Botero<ca.garcia2@uniandes.edu.co>"]
version = "0.8.1"
version = "0.9.0"

[deps]
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ BioMarkovChain(orfdna, 2)
```

```
BioMarkovChain with DNA Alphabet:
BioMarkovChain with DNAAlphabet{4}() Alphabet:
- Transition Probability Matrix -> Matrix{Float64}(4 × 4):
0.2123 0.2731 0.278 0.2366
0.2017 0.3072 0.2687 0.2224
Expand All @@ -94,7 +94,7 @@ ECOLICDS
```

```
BioMarkovChain with DNA Alphabet:
BioMarkovChain with DNAAlphabet{4}() Alphabet:
- Transition Probability Matrix -> Matrix{Float64}(4 × 4):
0.31 0.224 0.199 0.268
0.251 0.215 0.313 0.221
Expand Down
4 changes: 3 additions & 1 deletion src/BioMarkovChains.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ using BioSequences:
NucleicAcidAlphabet,
DNA,
DNAAlphabet,
RNAAlphabet,
Alphabet,

#RNA
RNA,
Expand All @@ -27,7 +29,6 @@ using BioSequences:
#tests and precompilation

using PrecompileTools: @setup_workload, @compile_workload
# using StatsAPI: StatsAPI, fit, fit!
using VectorizedKmers: count_kmers

include("types.jl")
Expand Down Expand Up @@ -58,6 +59,7 @@ include("extended.jl")
# they belong to your package or not (on Julia 1.8 and higher)
transition_count_matrix(seq)
transition_probability_matrix(seq)

end
end

Expand Down
2 changes: 1 addition & 1 deletion src/extended.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ end
Base.length(bmc::BioMarkovChain) = length(bmc.inits)

function Base.eltype(bmc::BioMarkovChain)
return bmc.statespace
return bmc.alphabet
end

"""
Expand Down
8 changes: 4 additions & 4 deletions src/models.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const ECOLICDS = begin

inits = [0.245, 0.243, 0.273, 0.239]

BMC(DNA, tpm, inits)
BMC(DNAAlphabet{4}(), tpm, inits)
end

const ECOLINOCDS = begin
Expand All @@ -21,7 +21,7 @@ const ECOLINOCDS = begin

inits = [0.262, 0.239, 0.240, 0.259]

BMC(DNA, tpm, inits)
BMC(DNAAlphabet{4}(), tpm, inits)
end


Expand All @@ -35,7 +35,7 @@ const CPGPOS = begin

inits = [0.262, 0.239, 0.240, 0.259] # not stablished

BMC(DNA, tpm, inits)
BMC(DNAAlphabet{4}(), tpm, inits)
end


Expand All @@ -49,5 +49,5 @@ const CPGNEG = begin

inits = [0.262, 0.239, 0.240, 0.259] # not stablished

BMC(DNA, tpm, inits)
BMC(DNAAlphabet{4}(), tpm, inits)
end
10 changes: 5 additions & 5 deletions src/transitions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ function odds_ratio_matrix(
sequence::SeqOrView{A},
model::BioMarkovChain;
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
tpm = transition_probability_matrix(sequence)
return tpm ./ model.tpm
end
Expand Down Expand Up @@ -173,7 +173,7 @@ function log_odds_ratio_matrix(
model::BioMarkovChain;
b::Number =
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
@assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

tpm = transition_probability_matrix(sequence)
Expand Down Expand Up @@ -206,7 +206,7 @@ function log_odds_ratio_matrix(
model2::BioMarkovChain;
b::Number =
)
@assert model1.statespace == model2.statespace "Models state spaces are inconsistent"
@assert model1.alphabet == model2.alphabet "Models state spaces are inconsistent"
@assert round.(sum(model1.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 1 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."
@assert round.(sum(model2.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model 2 transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

Expand Down Expand Up @@ -239,7 +239,7 @@ function log_odds_ratio_score(
model::BioMarkovChain;
b::Number =
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
@assert round.(sum(model.tpm, dims=2)') == [1.0 1.0 1.0 1.0] "Model transition probability matrix must be row-stochastic. That is, their row sums must be equal to 1."

tpm = transition_probability_matrix(sequence)
Expand Down Expand Up @@ -303,7 +303,7 @@ function dnaseqprobability(
sequence::NucleicSeqOrView{A},
model::BioMarkovChain
) where A
@assert model.statespace == eltype(sequence) "Sequence and model state space are inconsistent."
@assert model.alphabet == Alphabet(sequence) "Sequence and model state space are inconsistent."
init = model.inits[_dna_to_int(sequence[1])]

probability = init
Expand Down
12 changes: 6 additions & 6 deletions src/types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ abstract type AbstractBioMarkovChain end
A BioMarkovChain represents a Markov chain used in biological sequence analysis. It contains a transition probability matrix (tpm) and an initial distribution of probabilities (inits) and also the order of the Markov chain.
# Fields
- `statespace::S`: Is the state space of the sequence whether DNA, RNA AminoAcid `DataType`s.
- `alphabet::A`: The alphabet of the sequence's state space, a `BioSequences` `Alphabet` such as `DNAAlphabet`, `RNAAlphabet` or `AminoAcidAlphabet`.
- `tpm::M`: The transition probability matrix.
- `inits::I`: The initial distribution of probabilities.
- `n::N`: The order of the Markov chain.
Expand Down Expand Up @@ -34,20 +34,20 @@ BioMarkovChain:
- Markov Chain Order:2
```
"""
struct BioMarkovChain{S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
statespace::S # The sequence DataType (DNA,RNA,AminoAcid)
struct BioMarkovChain{A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer} <: AbstractBioMarkovChain
alphabet::A # The sequence alphabet (DNAAlphabet, RNAAlphabet, AminoAcidAlphabet)
tpm::M # The probabilities of the TransitionProbabilityMatrix struct
inits::I # the initials distribution of probabilities
n::N # The order of the Markov chain
function BioMarkovChain(statespace::S, tpm::M, inits::I, n::N=1) where {S<:DataType, M<:AbstractMatrix, I<:AbstractVector, N<:Integer}
bmc = new{S,M,I,N}(statespace, n > 1 ? tpm^n : tpm, inits, n)
function BioMarkovChain(alphabet::A, tpm::M, inits::I, n::N=1) where {A<:Alphabet, M<:AbstractMatrix, I<:AbstractVector, N<:Integer}
bmc = new{A,M,I,N}(alphabet, n > 1 ? tpm^n : tpm, inits, n)
return bmc
end

function BioMarkovChain(sequence::SeqOrView{A}, n::Int64=1) where A
inits = initials(sequence)
tpm = transition_probability_matrix(sequence)
bmc = new{DataType,Matrix{Float64},Vector{Float64},Int64}(eltype(sequence), n > 1 ? tpm^n : tpm, inits, n)
bmc = new{Alphabet, Matrix{Float64}, Vector{Float64},Int64}(Alphabet(sequence), n > 1 ? tpm^n : tpm, inits, n)
return bmc
end
end
Expand Down
10 changes: 5 additions & 5 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ function _dna_to_int(nucleotide::DNA)
return reinterpret.(Int8, modifier(nucleotide))[1] #searchsortedfirst(A, nucleotide) # findfirst(nucleotide, LongSequence{DNAAlphabet{4}}(A))
end

function randbmc(statespace::DataType, n::Int64=1)
function randbmc(A::Alphabet, n::Int64=1)

if !(statespace in (DNA, RNA, AminoAcid))
throw(ArgumentError("Alphabet must be of the DNA, RNA, or AminoAcid DataType."))
if !(A in (DNAAlphabet{4}(), RNAAlphabet{4}(), AminoAcidAlphabet()))
throw(ArgumentError("Alphabet must be of the DNAAlphabet, RNAAlphabet, or AminoAcidAlphabet."))
end

nstates = (statespace == AminoAcid) ? 20 : 4
nstates = (A == AminoAcidAlphabet) ? 20 : 4
tpm = rand(nstates, nstates)

# Normalize rows of the transition probability matrix
Expand All @@ -28,5 +28,5 @@ function randbmc(statespace::DataType, n::Int64=1)
initsum = sum(inits)
@views inits ./= initsum

return BMC(statespace, tpm, inits, n)
return BMC(A, tpm, inits, n)
end

2 comments on commit ae5cfbe

@camilogarciabotero
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/97861

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.9.0 -m "<description of version>" ae5cfbe195a75544b0685b05d8aaee66ccf9556b
git push origin v0.9.0

Please sign in to comment.