Merge pull request #13 from ClapeyronThermo/strict_coverage

add option to allow higher order groups
ClapeyronThermo · Jul 26, 2024 · e09b709 · e09b709
2 parents 90007e7 + c33241b
commit e09b709
Show file tree

Hide file tree

Showing 10 changed files with 67 additions and 31 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -18,9 +18,9 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - 'lts'
           - '1'
-          - 'nightly'
+          - 'pre'
         os:
           - ubuntu-latest
           - windows-latest
@@ -36,10 +36,13 @@ jobs:
       - uses: julia-actions/cache@v2
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
-      - uses: julia-actions/julia-processcoverage@v1
-      - uses: codecov/codecov-action@v3
+        if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
+      - uses: codecov/codecov-action@v4
+        if: matrix.version == '1' && matrix.os == 'ubuntu-latest'
         with:
-          files: lcov.info
+          # possibly other stuff
+          token: ${{ secrets.CODECOV_TOKEN }}
+          fail_ci_if_error: false  # or true if you want CI to fail when Codecov fails
   docs:
     name: Documentation
     runs-on: ubuntu-latest

diff --git a/Project.toml b/Project.toml
@@ -21,7 +21,7 @@ GCIdentifierClapeyronExt = "Clapeyron"
 ChemicalIdentifiers = "0.1"
 Clapeyron = "0.4,0.5,0.6"
 Combinatorics = "1"
-MolecularGraph = "0.14,0.15,0.16"
+MolecularGraph = "0.14,0.15,0.16,0.17"
 julia = "1.6"
 
 [extras]

diff --git a/src/database/Joback.jl b/src/database/Joback.jl
@@ -1,4 +1,4 @@
-JobackGroups = [GCPair(raw"[CX4H3]","-CH3"),
+const JobackGroups = [GCPair(raw"[CX4H3]","-CH3"),
 GCPair(raw"[!R;CX4H2]","-CH2-"),
 GCPair(raw"[!R;CX4H]",">CH-"),
 GCPair(raw"[!R;CX4H0]",">C<"),

diff --git a/src/database/SAFTgammaMie.jl b/src/database/SAFTgammaMie.jl
@@ -1,4 +1,4 @@
-SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"),
+const SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"),
 GCPair(raw"[!R;CX4H2]","CH2"),
 GCPair(raw"[!R;CX4H]","CH"),
 GCPair(raw"[!R;CX4H0]","C"),

diff --git a/src/database/UNIFAC.jl b/src/database/UNIFAC.jl
@@ -1,4 +1,4 @@
-UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
+const UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
 GCPair(raw"[CX4;H2;!R]","CH2"),
 GCPair(raw"[CX4;H1;!R]","CH"),
 GCPair(raw"[CX4;H0;!R]","C"),

diff --git a/src/database/gcPCSAFT.jl b/src/database/gcPCSAFT.jl
@@ -1,4 +1,4 @@
-gcPCSAFTGroups = [
+const gcPCSAFTGroups = [
     GCPair(raw"[CX4H3]", "CH3"),
     GCPair(raw"[!R;CX4H2]", "CH2"),
     GCPair(raw"[!R;CX4H]", "CH"),

diff --git a/src/database/gcPPCSAFT.jl b/src/database/gcPPCSAFT.jl
@@ -1,4 +1,4 @@
-gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"),
+const gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"),
 GCPair(raw"[!R;CX4H2]","CH2"),
 GCPair(raw"[!R;CX4H]","CH"),
 GCPair(raw"[!R;CX4H0]","C"),

diff --git a/src/database/ogUNIFAC.jl b/src/database/ogUNIFAC.jl
@@ -1,4 +1,4 @@
-ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
+const ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"),
 GCPair(raw"[CX4;H2;!R]","CH2"),
 GCPair(raw"[CX4;H1;!R]","CH"),
 GCPair(raw"[CX4;H0;!R]","C"),

diff --git a/src/group_search.jl b/src/group_search.jl
@@ -1,20 +1,24 @@
 
 """
-    GCPair
-
-Struct used to hold a description of a group. contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name.
+    GCPair(smarts,name;group_order = 1)
 
+Struct used to hold a description of a group. Contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name.
+The `group_order` parameter is used for groups that follow a Constantinou-Gani approach: the list of `GCPair` with `group_order = 1` will be matched with strict coverage (failing if there are atoms left uncovered), while second-order groups and above will not be strictly checked for total coverage. Each group order is matched independently.
 """
 struct GCPair
     smarts::String
     name::String
+    group_order::Int
 end
 
+GCPair(smarts,name;group_order = 1) = GCPair(smarts,name,group_order)
+
 export GCPair
 
 smarts(x::GCPair) = x.smarts
 name(x::GCPair) = x.name
-
+group_order(x::GCPair) = x.group_order
+first_group_order(x::GCPair) = x.group_order == 1
 #sorting comparison between 2 smatches
 function _isless_smatch(smatch1,smatch2)
     #fallback, if one is not matched, throw to the end
@@ -87,7 +91,7 @@ function get_grouplist end
 get_grouplist(x::Vector{GCPair}) = x
 
 """
-    get_groups_from_smiles(smiles::String,groups;connectivity = false)
+    get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true)
 
 Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount.
 
@@ -103,23 +107,48 @@ julia> get_groups_from_smiles("CCO",JobackGroups,connectivity = true)
 ("CCO", ["-CH3" => 1, "-CH2-" => 1, "-OH (alcohol)" => 1], [("-CH3", "-CH2-") => 1, ("-CH2-", "-OH (alcohol)") => 1])
 ```
 """
-function get_groups_from_smiles(smiles::String,groups;connectivity = false)
+function get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true)
     groups = get_grouplist(groups)
-    return get_groups_from_smiles(smiles,groups;connectivity = connectivity)
+    count(first_group_order,groups) == length(groups) && return _get_groups_from_smiles(smiles,groups,connectivity,check)
+    group_orders = group_order.(groups) |> unique! |> sort!
+    #mixed group orders: each order is matched on its own, then the results are joined.
+    #only first-order groups use the requested connectivity/check options, higher orders are matched with both disabled.
+    conectivity_result = Vector{Pair{Tuple{String,String},Int}}[]
+    results = Tuple{String,Vector{Pair{String,Int}}}[]
+    for order in group_orders
+        groups_n = filter(x -> group_order(x) == order,groups)
+        if order == 1
+            result1 = _get_groups_from_smiles(smiles,groups_n,connectivity,check)
+            if connectivity
+                push!(conectivity_result,result1[3])
+            end
+            push!(results,(result1[1],result1[2]))
+        else
+            result_n = _get_groups_from_smiles(smiles,groups_n,false,false)
+            push!(results,result_n)
+        end
+    end
+    #only first-order groups fill conectivity_result; the init value makes the reduction return a Vector even if it is empty.
+    gc_pairs = mapreduce(last,vcat,results)
+    smiles_res = results[1][1]
+    if connectivity
+        return (smiles_res,gc_pairs,reduce(vcat,conectivity_result;init = Pair{Tuple{String,String},Int}[]))
+    else
+        return (smiles_res,gc_pairs)
+    end
 end
 
-function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivity=false,check = true)
+function _get_groups_from_smiles(smiles::String,groups::Vector{GCPair},connectivity=false,check = true)
     mol = get_mol(smiles)
     atoms = get_atoms(mol)
     natoms = length(atoms)
     __bonds = __getbondlist(mol)
-
     group_id_expanded, bond_mat_minimum = get_expanded_groups(mol, groups, atoms, __bonds, check)
 
     group_id = unique(group_id_expanded)
     group_occ_list = [sum(group_id_expanded .== i) for i in group_id]
 
-    gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]   
+    gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)]
 
     if check
         if sum(bond_mat_minimum) != natoms
@@ -135,7 +164,7 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivi
 end
 
 function find_covered_atoms(mol, groups, atoms, __bonds, check)
-    smatches = []
+    smatches = Vector{Dict{String, Vector{Int64}}}[]
     smatches_idx = Int[]
 
     #step 0.a, find all groups that could get a match
@@ -146,7 +175,6 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check)
             push!(smatches_idx,i)
         end
     end
-
     #step 0.b, sort the matches by the amount of matched atoms. biggest groups come first.
     perm = sortperm(smatches,lt = _isless_smatch,rev = true)
     smatches = smatches[perm]
@@ -162,8 +190,9 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check)
     # Create a matrix with the atoms that are in each group
     bond_mat = zeros(Int64, ngroups, natoms)
     for i in 1:ngroups
-        for j in 1:length(smatches_expanded[i]["atoms"])
-            bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1
+        smatches_expanded_i_atoms = smatches_expanded[i]["atoms"]
+        for j in 1:length(smatches_expanded_i_atoms)
+            bond_mat[i, smatches_expanded_i_atoms[j]+1] = 1
         end
     end
     if check

diff --git a/src/missing_groups.jl b/src/missing_groups.jl
@@ -35,7 +35,12 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=
     if isnothing(groups)
         missing_atoms = ones(Bool, length(atoms))
     else
-        smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, atoms, __bonds, false)
+        if count(first_group_order,groups) == length(groups)
+            first_order_groups = groups
+        else
+            first_order_groups = filter(x -> group_order(x) == 1, groups)
+        end
+        smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, first_order_groups, atoms, __bonds, false)
         missing_atoms = (sum(atom_coverage, dims=1) .== 0)[:]
     end
 
@@ -83,11 +88,10 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=
 
     if max_group_size == 1
         unique_smarts = unique(smarts)
-        unique_names = []
+        unique_names = String[]
         for i in 1:length(unique_smarts)
-            push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])
+            push!(unique_names, names[findall(isequal(unique_smarts[i]), smarts)[1]])
         end
-
         new_groups = [GCPair(unique_smarts[i], unique_names[i]) for i in 1:length(unique_smarts)]
 
         return new_groups
@@ -184,7 +188,7 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size=
         unique_smarts = unique(smarts)
     end
     # find the names of the unique smarts
-    unique_names = []
+    unique_names = String[]
     occurrence = zeros(Int, length(unique_smarts))
     for i in 1:length(unique_smarts)
         push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])