diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6503bb9..542d87f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -18,9 +18,9 @@ jobs: fail-fast: false matrix: version: - - '1.6' + - 'lts' - '1' - - 'nightly' + - 'pre' os: - ubuntu-latest - windows-latest @@ -36,10 +36,14 @@ jobs: - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v3 + if: matrix.version == '1' && matrix.os == 'ubuntu-latest' # process coverage only on the job that uploads it + - uses: codecov/codecov-action@v4 + if: matrix.version == '1' && matrix.os == 'ubuntu-latest' with: - files: lcov.info + # possibly other stuff + token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: false # or true if you want CI to fail when Codecov fails docs: name: Documentation runs-on: ubuntu-latest diff --git a/Project.toml b/Project.toml index 64e0b29..1d8d78d 100644 --- a/Project.toml +++ b/Project.toml @@ -21,7 +21,7 @@ GCIdentifierClapeyronExt = "Clapeyron" ChemicalIdentifiers = "0.1" Clapeyron = "0.4,0.5,0.6" Combinatorics = "1" -MolecularGraph = "0.14,0.15,0.16" +MolecularGraph = "0.14,0.15,0.16,0.17" julia = "1.6" [extras] diff --git a/src/database/Joback.jl b/src/database/Joback.jl index 5e4c403..efbeb31 100644 --- a/src/database/Joback.jl +++ b/src/database/Joback.jl @@ -1,4 +1,4 @@ -JobackGroups = [GCPair(raw"[CX4H3]","-CH3"), +const JobackGroups = [GCPair(raw"[CX4H3]","-CH3"), GCPair(raw"[!R;CX4H2]","-CH2-"), GCPair(raw"[!R;CX4H]",">CH-"), GCPair(raw"[!R;CX4H0]",">C<"), diff --git a/src/database/SAFTgammaMie.jl b/src/database/SAFTgammaMie.jl index 64e31b4..1609075 100644 --- a/src/database/SAFTgammaMie.jl +++ b/src/database/SAFTgammaMie.jl @@ -1,4 +1,4 @@ -SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"), +const SAFTgammaMieGroups = [GCPair(raw"[CX4H3]","CH3"), GCPair(raw"[!R;CX4H2]","CH2"), GCPair(raw"[!R;CX4H]","CH"), GCPair(raw"[!R;CX4H0]","C"), diff --git 
a/src/database/UNIFAC.jl b/src/database/UNIFAC.jl index 9794c97..bed8005 100644 --- a/src/database/UNIFAC.jl +++ b/src/database/UNIFAC.jl @@ -1,4 +1,4 @@ -UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"), +const UNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"), GCPair(raw"[CX4;H2;!R]","CH2"), GCPair(raw"[CX4;H1;!R]","CH"), GCPair(raw"[CX4;H0;!R]","C"), diff --git a/src/database/gcPCSAFT.jl b/src/database/gcPCSAFT.jl index 91ffb4a..817dffb 100644 --- a/src/database/gcPCSAFT.jl +++ b/src/database/gcPCSAFT.jl @@ -1,4 +1,4 @@ -gcPCSAFTGroups = [ +const gcPCSAFTGroups = [ GCPair(raw"[CX4H3]", "CH3"), GCPair(raw"[!R;CX4H2]", "CH2"), GCPair(raw"[!R;CX4H]", "CH"), diff --git a/src/database/gcPPCSAFT.jl b/src/database/gcPPCSAFT.jl index aa169b5..13d08cf 100644 --- a/src/database/gcPPCSAFT.jl +++ b/src/database/gcPPCSAFT.jl @@ -1,4 +1,4 @@ -gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"), +const gcPPCSAFTGroups = [GCPair(raw"[CX4H3]","CH3"), GCPair(raw"[!R;CX4H2]","CH2"), GCPair(raw"[!R;CX4H]","CH"), GCPair(raw"[!R;CX4H0]","C"), diff --git a/src/database/ogUNIFAC.jl b/src/database/ogUNIFAC.jl index bc4257f..aec0697 100644 --- a/src/database/ogUNIFAC.jl +++ b/src/database/ogUNIFAC.jl @@ -1,4 +1,4 @@ -ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"), +const ogUNIFACGroups = [GCPair(raw"[CX4;H3;!R]","CH3"), GCPair(raw"[CX4;H2;!R]","CH2"), GCPair(raw"[CX4;H1;!R]","CH"), GCPair(raw"[CX4;H0;!R]","C"), diff --git a/src/group_search.jl b/src/group_search.jl index c95f08a..672df93 100644 --- a/src/group_search.jl +++ b/src/group_search.jl @@ -1,20 +1,24 @@ """ - GCPair - -Struct used to hold a description of a group. contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name. + GCPair(smarts,name;group_order = 1) +Struct used to hold a description of a group. Contains the SMARTS string necessary to match the group within a SMILES query, and the assigned name. 
+The `group_order` parameter is used for groups that follow a Constantinou-Gani approach: the list of `GCPair` with `group_order = 1` will be matched with strict coverage (failing if any atoms are left uncovered), while second-order groups and above will not be strictly checked for total coverage. Each group order is matched independently. """ struct GCPair smarts::String name::String + group_order::Int end +GCPair(smarts,name;group_order = 1) = GCPair(smarts,name,group_order) + export GCPair smarts(x::GCPair) = x.smarts name(x::GCPair) = x.name - +group_order(x::GCPair) = x.group_order +first_group_order(x::GCPair) = x.group_order == 1 #sorting comparison between 2 smatches function _isless_smatch(smatch1,smatch2) #fallback, if one is not matched, throw to the end @@ -87,7 +91,7 @@ function get_grouplist end get_grouplist(x::Vector{GCPair}) = x """ - get_groups_from_smiles(smiles::String,groups;connectivity = false) + get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true) Given a SMILES string and a group list (`groups::Vector{GCPair}`), returns a list of groups and their corresponding amount. @@ -103,23 +107,48 @@ julia> get_groups_from_smiles("CCO",JobackGroups,connectivity = true) ("CCO", ["-CH3" => 1, "-CH2-" => 1, "-OH (alcohol)" => 1], [("-CH3", "-CH2-") => 1, ("-CH2-", "-OH (alcohol)") => 1]) ``` """ -function get_groups_from_smiles(smiles::String,groups;connectivity = false) +function get_groups_from_smiles(smiles::String,groups;connectivity = false,check = true) groups = get_grouplist(groups) - return get_groups_from_smiles(smiles,groups;connectivity = connectivity) + count(first_group_order,groups) == length(groups) && return _get_groups_from_smiles(smiles,groups,connectivity,check) + group_orders = group_order.(groups) |> unique! |> sort! + + #find all group orders, perform a match for each order, then join the results. 
+ connectivity_result = Vector{Pair{Tuple{String,String},Int}}[] + results = Tuple{String,Vector{Pair{String,Int}}}[] + for order in group_orders + groups_n = filter(x -> group_order(x) == order,groups) + if order == 1 + result1 = _get_groups_from_smiles(smiles,groups_n,connectivity,check) + if connectivity + push!(connectivity_result,result1[3]) + end + push!(results,(result1[1],result1[2])) + else + result_n = _get_groups_from_smiles(smiles,groups_n,false,false) + push!(results,result_n) + end + end + + gc_pairs = mapreduce(last,vcat,results) + smiles_res = results[1][1] + if connectivity + return (smiles_res,gc_pairs,reduce(vcat,connectivity_result;init = Pair{Tuple{String,String},Int}[])) #join the per-order bond lists; `init` keeps the result a Vector even when it is empty + else + return (smiles_res,gc_pairs) + end end -function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivity=false,check = true) +function _get_groups_from_smiles(smiles::String,groups::Vector{GCPair},connectivity=false,check = true) mol = get_mol(smiles) atoms = get_atoms(mol) natoms = length(atoms) __bonds = __getbondlist(mol) - group_id_expanded, bond_mat_minimum = get_expanded_groups(mol, groups, atoms, __bonds, check) group_id = unique(group_id_expanded) group_occ_list = [sum(group_id_expanded .== i) for i in group_id] - gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)] + gcpairs = [name(groups[group_id[i]]) => group_occ_list[i] for i in 1:length(group_id)] if check if sum(bond_mat_minimum) != natoms @@ -135,7 +164,7 @@ function get_groups_from_smiles(smiles::String,groups::Vector{GCPair};connectivi end function find_covered_atoms(mol, groups, atoms, __bonds, check) - smatches = [] + smatches = Vector{Dict{String, Vector{Int64}}}[] smatches_idx = Int[] #step 0.a, find all groups that could get a match @@ -146,7 +175,6 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check) push!(smatches_idx,i) end end - #step 0.b, sort the matches by the amount of matched atoms. biggest groups come first. 
perm = sortperm(smatches,lt = _isless_smatch,rev = true) smatches = smatches[perm] @@ -162,8 +190,9 @@ function find_covered_atoms(mol, groups, atoms, __bonds, check) # Create a matrix with the atoms that are in each group bond_mat = zeros(Int64, ngroups, natoms) for i in 1:ngroups - for j in 1:length(smatches_expanded[i]["atoms"]) - bond_mat[i, smatches_expanded[i]["atoms"][j]+1] = 1 + smatches_expanded_i_atoms = smatches_expanded[i]["atoms"] + for j in 1:length(smatches_expanded_i_atoms) + bond_mat[i, smatches_expanded_i_atoms[j]+1] = 1 end end if check diff --git a/src/missing_groups.jl b/src/missing_groups.jl index 5deb803..f4f90c8 100644 --- a/src/missing_groups.jl +++ b/src/missing_groups.jl @@ -35,7 +35,12 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size= if isnothing(groups) missing_atoms = ones(Bool, length(atoms)) else - smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, groups, atoms, __bonds, false) + if count(first_group_order,groups) == length(groups) + first_order_groups = groups + else + first_order_groups = filter(x -> group_order(x) == 1, groups) + end + smatches_idx_expanded, atom_coverage = find_covered_atoms(mol, first_order_groups, atoms, __bonds, false) missing_atoms = (sum(atom_coverage, dims=1) .== 0)[:] end @@ -83,11 +88,10 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size= if max_group_size == 1 unique_smarts = unique(smarts) - unique_names = [] + unique_names = String[] for i in 1:length(unique_smarts) - push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]]) + push!(unique_names, names[findall(isequal(unique_smarts[i]), smarts)[1]]) end - new_groups = [GCPair(unique_smarts[i], unique_names[i]) for i in 1:length(unique_smarts)] return new_groups @@ -184,7 +188,7 @@ function find_missing_groups_from_smiles(smiles, groups=nothing; max_group_size= unique_smarts = unique(smarts) end # find the names of the unique smarts - unique_names = [] + 
unique_names = String[] occurrence = zeros(Int, length(unique_smarts)) for i in 1:length(unique_smarts) push!(unique_names, names[findall(x->x==unique_smarts[i], smarts)[1]])