Skip to content

Commit

Permalink
Merge pull request #13 from ClapeyronThermo/strict_coverage
Browse files Browse the repository at this point in the history
add option to allow higher order groups
  • Loading branch information
longemen3000 authored Jul 26, 2024
2 parents 90007e7 + c33241b commit e09b709
Show file tree
Hide file tree
Showing 10 changed files with 67 additions and 31 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ jobs:
fail-fast: false
matrix:
version:
- '1.6'
- 'lts'
- '1'
- 'nightly'
- 'pre'
os:
- ubuntu-latest
- windows-latest
Expand All @@ -36,10 +36,13 @@ jobs:
- uses: julia-actions/cache@v2
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v3
if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
- uses: codecov/codecov-action@v4
if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
with:
files: lcov.info
# possibly other stuff
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: false # or true if you want CI to fail when Codecov fails
docs:
name: Documentation
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ GCIdentifierClapeyronExt = "Clapeyron"
ChemicalIdentifiers = "0.1"
Clapeyron = "0.4,0.5,0.6"
Combinatorics = "1"
MolecularGraph = "0.14,0.15,0.16"
MolecularGraph = "0.14,0.15,0.16,0.17"
julia = "1.6"

[extras]
Expand Down
2 changes: 1 addition & 1 deletion src/database/Joback.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
JobackGroups = [GCPair(raw"[CX4H3]","-CH3"),
const JobackGroups = [GCPair(raw"[CX4H3]","-CH3"),
GCPair(raw"[!R;CX4H2]","-CH2-"),
GCPair(raw"[!R;CX4H]",">CH-"),
GCPair(raw"[!R;CX4H0]",">C<"),
Expand Down
2 changes: 1 addition & 1 deletion src/database/SAFTgammaMie.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"),
const SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"),
GCPair(raw"[!R;CX4H2]","CH2"),
GCPair(raw"[!R;CX4H]","CH"),
GCPair(raw"[!R;CX4H0]","C"),
Expand Down
2 changes: 1 addition & 1 deletion src/database/UNIFAC.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
const UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
GCPair(raw"[CX4;H2;!R]","CH2"),
GCPair(raw"[CX4;H1;!R]","CH"),
GCPair(raw"[CX4;H0;!R]","C"),
Expand Down
2 changes: 1 addition & 1 deletion src/database/gcPCSAFT.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
gcPCSAFTGroups = [
const gcPCSAFTGroups = [
GCPair(raw"[CX4H3]", "CH3"),
GCPair(raw"[!R;CX4H2]", "CH2"),
GCPair(raw"[!R;CX4H]", "CH"),
Expand Down
2 changes: 1 addition & 1 deletion src/database/gcPPCSAFT.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"),
const gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"),
GCPair(raw"[!R;CX4H2]","CH2"),
GCPair(raw"[!R;CX4H]","CH"),
GCPair(raw"[!R;CX4H0]","C"),
Expand Down
2 changes: 1 addition & 1 deletion src/database/ogUNIFAC.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
const ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
GCPair(raw"[CX4;H2;!R]","CH2"),
GCPair(raw"[CX4;H1;!R]","CH"),
GCPair(raw"[CX4;H0;!R]","C"),
Expand Down
57 changes: 43 additions & 14 deletions src/group_search.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@

"""
GCPair
Struct used to hold a description of a group. contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name.
GCPair(smarts,name;group_order = 1)
Struct used to hold a description of a group. Contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name.
The `group_order` parameter is used for groups that follow a Constantinou-Gani approach: the list of `GCPair` with `group_order = 1` will be matched with strict coverage (failing if there are atoms left uncovered), while second-order groups and above will not be strictly checked for total coverage. Each group order will be matched independently.
"""
struct GCPair
    # SMARTS pattern used to match the group.
    smarts::String
    # Name assigned to the group.
    name::String
    # Order of the group; the convenience constructor below defaults it to 1.
    group_order::Int
end

# Convenience constructor: `group_order` is an optional keyword that defaults to 1
# and is forwarded to the positional three-argument constructor.
function GCPair(smarts, name; group_order = 1)
    return GCPair(smarts, name, group_order)
end

export GCPair

# Field accessors for `GCPair`.
smarts(pair::GCPair) = pair.smarts
name(pair::GCPair) = pair.name

group_order(pair::GCPair) = pair.group_order
# `true` when the pair's `group_order` field equals 1.
first_group_order(pair::GCPair) = isone(pair.group_order)
#sorting comparison between 2 smatches
function _isless_smatch(smatch1,smatch2)
#fallback, if one is not matched, throw to the end
Expand Down Expand Up @@ -87,7 +91,7 @@ function get_grouplist end
get_grouplist(x::Vector{GCPair}) = x

"""
get_groups_from_smiles(smiles::String,groups;connectivity = false)
get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true)
Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.
Expand All @@ -103,23 +107,48 @@ julia> get_groups_from_smiles("CCO",JobackGroups,connectivity = true)
("CCO", ["-CH3" => 1, "-CH2-" => 1, "-OH (alcohol)" => 1], [("-CH3", "-CH2-") => 1, ("-CH2-", "-OH (alcohol)") => 1])
```
"""
function get_groups_from_smiles(smiles::String,groups;connectivity = false)
function get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true)
groups = get_grouplist(groups)
return get_groups_from_smiles(smiles,groups;connectivity = connectivity)
count(first_group_order,groups) == length(groups) && return _get_groups_from_smiles(smiles,groups,connectivity,check)
group_orders = group_order.(groups) |> unique! |> sort!

#find all group orders, perform a match for each order, then join the results.
conectivity_result = Vector{Pair{Tuple{String,String},Int}}[]
results = Tuple{String,Vector{Pair{String,Int}}}[]
for order in group_orders
groups_n = filter(x -> group_order(x) == order,groups)
if order == 1
result1 = _get_groups_from_smiles(smiles,groups_n,connectivity,check)
if connectivity
push!(conectivity_result,result1[3])
end
push!(results,(result1[1],result1[2]))
else
result_n = _get_groups_from_smiles(smiles,groups_n,false,false)
push!(results,result_n)
end
end

gc_pairs = mapreduce(last,vcat,results)
smiles_res = results[1][1]
if connectivity
return (smiles_res,gc_pairs,reduce(vcat,conectivity_result[1]))
else
return (smiles_res,gc_pairs)
end
end

function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivity=false,check = true)
function _get_groups_from_smiles(smiles::String,groups::Vector{GCPair},connectivity=false,check = true)
mol = get_mol(smiles)
atoms = get_atoms(mol)
natoms = length(atoms)
__bonds = __getbondlist(mol)

group_id_expanded, bond_mat_minimum = get_expanded_groups(mol, groups, atoms, __bonds, check)

group_id = unique(group_id_expanded)
group_occ_list = [sum(group_id_expanded .== i) for i in group_id]

gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]

if check
if sum(bond_mat_minimum) != natoms
Expand All @@ -135,7 +164,7 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivi
end

function find_covered_atoms(mol, groups, atoms, __bonds, check)
smatches = []
smatches = Vector{Dict{String, Vector{Int64}}}[]
smatches_idx = Int[]

#step 0.a, find all groups that could get a match
Expand All @@ -146,7 +175,6 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check)
push!(smatches_idx,i)
end
end

#step 0.b, sort the matches by the amount of matched atoms. biggest groups come first.
perm = sortperm(smatches,lt = _isless_smatch,rev = true)
smatches = smatches[perm]
Expand All @@ -162,8 +190,9 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check)
# Create a matrix with the atoms that are in each group
bond_mat = zeros(Int64, ngroups, natoms)
for i in 1:ngroups
for j in 1:length(smatches_expanded[i]["atoms"])
bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1
smatches_expanded_i_atoms = smatches_expanded[i]["atoms"]
for j in 1:length(smatches_expanded_i_atoms)
bond_mat[i, smatches_expanded_i_atoms[j]+1] = 1
end
end
if check
Expand Down
14 changes: 9 additions & 5 deletions src/missing_groups.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,12 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=
if isnothing(groups)
missing_atoms = ones(Bool, length(atoms))
else
smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, atoms, __bonds, false)
if count(first_group_order,groups) == length(groups)
first_order_groups = groups
else
first_order_groups = filter(x -> group_order(x) == 1, groups)
end
smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, first_order_groups, atoms, __bonds, false)
missing_atoms = (sum(atom_coverage, dims=1) .== 0)[:]
end

Expand Down Expand Up @@ -83,11 +88,10 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=

if max_group_size == 1
unique_smarts = unique(smarts)
unique_names = []
unique_names = String[]
for i in 1:length(unique_smarts)
push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])
push!(unique_names, names[findall(isequal(unique_smarts[i]), smarts)[1]])
end

new_groups = [GCPair(unique_smarts[i], unique_names[i]) for i in 1:length(unique_smarts)]

return new_groups
Expand Down Expand Up @@ -184,7 +188,7 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=
unique_smarts = unique(smarts)
end
# find the names of the unique smarts
unique_names = []
unique_names = String[]
occurrence = zeros(Int, length(unique_smarts))
for i in 1:length(unique_smarts)
push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])
Expand Down

0 comments on commit e09b709

Please sign in to comment.