-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscreen.py
executable file
·335 lines (290 loc) · 14.9 KB
/
screen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/env python
# Copyright 2022 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse, time
import traceback
import utils, rdkit_utils
from dm_job_utilities.dm_log import DmLog
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols
"""
Filter a file of SMILES using fingerprint based similarity.
One or more query molecules can be specified as SMILES, or the queries can be contained in a SD file or SMILES file.
Similarities for each query are calculated.
When more than one query is specified the min similarity, max similarity, the arithmetic mean of the similarities, the
geometric mean of the similarities and the product of the similarities are also generated and added to the list of
similarities.
The inputs are filtered using a threshold score and the index of the similarity score to use.
For instance when specifying the query as SMILES and where there is only a single query molecule there is only a single
score, so the index to use is 0.
When there are two query molecules specified as SMILES there are two scores plus the extra four. So if you want to
filter by the sum of the individual scores then use an index of 4. For two inputs the indices are:
- 0 similarity to first molecule
- 1 similarity to second molecule
- 2 minimum of the individual scores
- 3 maximum of the individual scores
- 4 arithmetic mean of the individual scores
- 5 geometric mean of the individual scores
- 6 product of the individual scores
When the queries are specified in a file it is assumed that there will be a relatively large number of query molecules
so the individual scores are not output, nor is the product as that is almost always close to zero. In this case the
index to use to filter will be:
- 0 minimum of the individual scores
- 1 maximum of the individual scores
- 2 arithmetic mean of the individual scores
- 3 geometric mean of the individual scores
The descriptor and metric to use can be specified. See the descriptors and metrics properties for the permitted values.
When using Morgan fingerprints you need to be careful about which metric you use. By default Morgan fingerprints use
counts, but these can only be used with tanimoto, dice or tversky metrics. If using another metric you must generate
Morgan bit vectors which you do by specifying the nbits parameter e.g. with a value of 1024.
See http://rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints
When using the tversky metric you can also specify the alpha and beta parameters. Default values are 1 and 0. These
parameters are ignored when using other metrics.
The output is that same as the input with extra similarity scores appended to each line, with only lines passing the
threshold filter being written.
"""
descriptors = {
'maccs': lambda m, nBits: MACCSkeys.GenMACCSKeys(m),
'morgan2': lambda m, nBits: AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=nBits) if nBits else AllChem.GetMorganFingerprint(m, 2),
'morgan3': lambda m, nBits: AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=nBits) if nBits else AllChem.GetMorganFingerprint(m, 3),
'rdkit': lambda m, nBits: FingerprintMols.FingerprintMol(m),
}
metrics = {
'asymmetric': lambda m1, m2, a, b: DataStructs.AsymmetricSimilarity(m1, m2),
'braunblanquet': lambda m1, m2, a, b: DataStructs.BraunBlanquetSimilarity(m1, m2),
'cosine': lambda m1, m2, a, b: DataStructs.CosineSimilarity(m1, m2),
'dice': lambda m1, m2, a, b: DataStructs.DiceSimilarity(m1, m2),
'kulczynski': lambda m1, m2, a, b: DataStructs.KulczynskiSimilarity(m1, m2),
'mcconnaughey': lambda m1, m2, a, b: DataStructs.McConnaugheySimilarity(m1, m2),
'rogotgoldberg': lambda m1, m2, a, b: DataStructs.RogotGoldbergSimilarity(m1, m2),
'russel': lambda m1, m2, a, b: DataStructs.RusselSimilarity(m1, m2),
'sokal': lambda m1, m2, a, b: DataStructs.SokalSimilarity(m1, m2),
'tanimoto': lambda m1, m2, a, b: DataStructs.TanimotoSimilarity(m1, m2),
'tversky': lambda m1, m2, a, b: DataStructs.TverskySimilarity(m2, m1, a, b)
}
def multiply_list(scores) :
result = 1
for x in scores:
result = result * x
return result
def validate_params(descriptor, metric, alpha, beta, nbits):
if descriptor == 'morgan2' or descriptor == 'morgan3':
if not (metric == 'tamimoto' or metric == 'dice' or metric == 'tversky'):
if not nbits:
DmLog.emit_event('When using', descriptor, 'descriptor and', metric, 'metric',
'the nbits parameter must be defined')
exit(1)
else:
if metric == 'tversky':
DmLog.emit_event('Using {} bit vector and {} metric with alpha={}, beta={}'
.format(descriptor, metric, alpha, beta))
else:
DmLog.emit_event('Using {} bit vector and {} metric'.format(descriptor, metric))
else:
if metric == 'tversky':
DmLog.emit_event('Using {} counts and {} metric with alpha={}, beta={}'
.format(descriptor, metric, alpha, beta))
else:
DmLog.emit_event('Using {} counts and {} metric'.format(descriptor, metric))
elif metric == 'tversky':
DmLog.emit_event('Using {} descriptor and {} metric with alpha={}, beta={}'
.format(descriptor, metric, alpha, beta))
else:
DmLog.emit_event('Using {} descriptor and {} metric'.format(descriptor, metric))
def execute(query_smis,
query_file,
inputfile,
outputfile,
descriptor,
metric,
delimiter='\t',
threshold=0.7,
sim_idx=0,
read_header=False,
write_header=False,
id_column=None,
mol_column=0,
read_records=100,
queries_read_header=False,
queries_delimiter=None,
alpha=1.0,
beta=0.0,
nbits=None,
interval=None):
if query_smis and query_file:
raise ValueError("Specify queries as SMILES or a file, not both")
validate_params(descriptor, metric, alpha, beta, nbits)
utils.expand_path(outputfile)
count = 0
hits = 0
errors = 0
my_descriptor = descriptors[descriptor]
my_metric = metrics[metric]
q_mols = []
if query_smis:
for smi in query_smis:
mol = Chem.MolFromSmiles(smi)
if not mol:
raise ValueError('Failed to read query smiles:', smi)
q_mols.append(mol)
elif query_file:
reader = rdkit_utils.create_reader(
query_file,
id_column=id_column,
mol_column=mol_column,
read_records=read_records,
read_header=queries_read_header,
delimiter=queries_delimiter)
q_count = 0
while True:
q_count += 1
t = reader.read()
# break if no more data to read
if not t:
break
mol, smi, id, props = t
if not mol:
raise ValueError('Failed to read query molecule:', q_count)
q_mols.append(mol)
DmLog.emit_event('Read', len(q_mols), 'query molecules')
q_fps = []
for q_mol in q_mols:
q_fp = my_descriptor(q_mol, nbits)
q_fps.append(q_fp)
with open(outputfile, 'wt') as outf:
with open(inputfile) as inf:
if read_header:
line = next(inf)
line = line.strip()
headers = line.split(delimiter)
else:
headers = None
for line in inf:
count += 1
if interval and count % interval == 0:
DmLog.emit_event("Processed {} records, {} hits".format(count, hits))
line = line.strip()
tokens = line.split(delimiter)
smi = tokens[0]
if write_header and count == 1:
if headers is None:
# we need to generate generic field names
headers = []
for i, t in enumerate(tokens):
if i == 0:
headers.append('smiles')
else:
headers.append('field' + str(i + 1))
if query_smis:
for i in range(len(query_smis)):
headers.append('score_' + str(i + 1))
if query_file or (query_smis and len(query_smis) > 1):
headers.extend(['score_min', 'score_max', 'score_amean', 'score_gmean', 'score_prod'])
outf.write(delimiter.join(headers) + '\n')
try:
t_mol = Chem.MolFromSmiles(smi)
if not t_mol:
errors += 1
DmLog.emit_event('Failed to process molecule', count, smi)
continue
t_fp = my_descriptor(t_mol, nbits)
sims = []
for q_fp in q_fps:
sims.append(my_metric(q_fp, t_fp, alpha, beta))
if len(sims) > 1:
min_sims = min(sims)
max_sims = max(sims)
amean_sims = sum(sims) / len(q_mols)
gmean_sims = utils.calc_geometric_mean(sims)
prod_sims = multiply_list(sims)
if query_file:
# clear the sims as we don't want to write individual scores when using a query file
sims = []
sims.append(min_sims)
sims.append(max_sims)
sims.append(amean_sims)
sims.append(gmean_sims)
sims.append(prod_sims)
sim = sims[sim_idx]
if sim > threshold:
hits += 1
#utils.log(smi, sims)
outl = line
for sim in sims:
outl += delimiter + str(sim)
outf.write(outl + '\n')
except Exception as e:
errors += 1
utils.log('Failed to process molecule', count, smi)
traceback.print_exc()
return count, hits, errors
def main():
# Examples:
# python screen.py --smiles 'O=C(Nc1ccc(Cl)cc1)c1ccccn1' --input data/10000.smi --delimiter tab -o foo.smi\
# -d rdkit -m tanimoto
# python screen.py --queries-file data/10.smi --input data/10000.smi --delimiter tab --id-column 1 -o foo.smi \
# -d rdkit -m tanimoto --queries-delimiter tab --threshold 0.4
parser = argparse.ArgumentParser(description='screen')
inputs = parser.add_mutually_exclusive_group(required=True)
inputs.add_argument('-s', '--smiles', nargs='+', help="Query SMILES")
inputs.add_argument('--queries-file', help="File with query molecules")
parser.add_argument('--queries-delimiter', help="Delimiter for queries file (text format)")
parser.add_argument('--queries-read-header', action='store_true',
help="Does the queries file contain a header line (text format)")
parser.add_argument('-i', '--input', required=True, help="SMILES file with molecules to search")
parser.add_argument('--delimiter', default='\t', help="Delimiter")
parser.add_argument('--id-column', help="Column for name field (zero based integer for .smi, text for SDF)")
parser.add_argument('--mol-column', type=int, default=0,
help="Column index for molecule when using delineated text formats (zero based integer)")
parser.add_argument('--read-header', action='store_true', help="Read a header line with the field names")
parser.add_argument('-o', '--output', required=True, help="Output file as SMILES")
parser.add_argument('--write-header', action='store_true', help='Write a header line')
parser.add_argument('--read-records', default=100, type=int,
help="Read this many records to determine the fields that are present")
parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()), default='rdkit',
help='Descriptor or fingerprint type (default rdkit)')
parser.add_argument('-m', '--metric', type=str.lower, choices=list(metrics.keys()), default='tanimoto',
help='Similarity metric (default tanimoto)')
parser.add_argument("--threshold", type=float, default=0.7, help="Similarity threshold")
parser.add_argument("--sim-index", type=int, default=0, help="Similarity score index")
parser.add_argument("--alpha", type=float, default=1.0, help="Tversky alpha parameter")
parser.add_argument("--beta", type=float, default=0.0, help="Tversky beta parameter")
parser.add_argument("--nbits", type=int, default=None, help="Number of bits if using Morgan as bit vector e.g. 1024")
parser.add_argument("--interval", type=int, help="Reporting interval")
args = parser.parse_args()
DmLog.emit_event("screen: ", args)
delimiter = utils.read_delimiter(args.delimiter)
queries_delimiter = utils.read_delimiter(args.queries_delimiter)
start = time.time()
input_count, hit_count, error_count = \
execute(args.smiles, args.queries_file, args.input, args.output,
args.descriptor, args.metric,
id_column=args.id_column, mol_column=args.mol_column, read_records=args.read_records,
threshold=args.threshold, sim_idx=args.sim_index, delimiter=delimiter,
read_header=args.read_header, write_header=args.write_header,
queries_read_header=args.queries_read_header, queries_delimiter=queries_delimiter,
alpha=args.alpha, beta=args.beta, nbits=args.nbits, interval=args.interval
)
end = time.time()
duration_s = int(end - start)
if duration_s < 1:
duration_s = 1
DmLog.emit_event(input_count, 'inputs,', hit_count, 'hits,', error_count, 'errors.',
'Time (s):', duration_s)
DmLog.emit_cost(input_count)
if __name__ == "__main__":
main()