Skip to content

Commit

Permalink
Boltz: Add smiles lookup from compound name from PubChem API
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Nov 19, 2024
1 parent 0b6d456 commit e2ca9e8
Showing 1 changed file with 42 additions and 0 deletions.
42 changes: 42 additions & 0 deletions Boltz1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"import re\n",
"import hashlib\n",
"import random\n",
"import requests\n",
"from string import ascii_uppercase\n",
"\n",
"# Function to add a hash to the jobname\n",
Expand All @@ -63,6 +64,8 @@
"#@markdown - Use `:` to specify multiple ligands as smile strings\n",
"ligand_input_ccd = 'SAH' #@param {type:\"string\"}\n",
"#@markdown - Use `:` to specify multiple ligands as CCD codes (three-letter codes)\n",
"ligand_input_common_name = '' #@param {type:\"string\"}\n",
"#@markdown - Use `:` to specify multiple ligands with their common name (e.g. Aspirin; SMILES fetched from [PubChem](https://pubchem.ncbi.nlm.nih.gov) API)\n",
"dna_input = '' #@param {type:\"string\"}\n",
"#@markdown - Use `:` to specify multiple DNA sequences\n",
"jobname = 'test' #@param {type:\"string\"}\n",
Expand All @@ -71,6 +74,7 @@
"query_sequence = \"\".join(query_sequence.split())\n",
"ligand_input = \"\".join(ligand_input.split())\n",
"ligand_input_ccd = \"\".join(ligand_input_ccd.split())\n",
"ligand_input_common_name = \"\".join(ligand_input_common_name.split())\n",
"dna_input = \"\".join(dna_input.split())\n",
"basejobname = \"\".join(jobname.split())\n",
"basejobname = re.sub(r'\\W+', '', basejobname)\n",
Expand All @@ -95,8 +99,46 @@
"protein_sequences = query_sequence.strip().split(':') if query_sequence.strip() else []\n",
"ligand_sequences = ligand_input.strip().split(':') if ligand_input.strip() else []\n",
"ligand_sequences_ccd = ligand_input_ccd.strip().split(':') if ligand_input_ccd.strip() else []\n",
"ligand_sequences_common_name = ligand_input_common_name.strip().split(':') if ligand_input_common_name.strip() else []\n",
"dna_sequences = dna_input.strip().split(':') if dna_input.strip() else []\n",
"\n",
"def get_smiles(compound_name):\n",
" autocomplete_url = f\"https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete/compound/{compound_name}/json?limit=1\"\n",
" autocomplete_response = requests.get(autocomplete_url)\n",
" if autocomplete_response.status_code != 200:\n",
" return None\n",
"\n",
" autocomplete_data = autocomplete_response.json()\n",
" if autocomplete_data.get(\"status\", {}).get(\"code\") != 0 or autocomplete_data.get(\"total\", 0) == 0:\n",
" return None\n",
"\n",
" suggested_compound = autocomplete_data.get(\"dictionary_terms\", {}).get(\"compound\", [])\n",
" if not suggested_compound:\n",
" return None\n",
" suggested_compound_name = suggested_compound[0]\n",
"\n",
" smiles_url = f\"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{suggested_compound_name}/property/CanonicalSMILES/JSON\"\n",
" smiles_response = requests.get(smiles_url)\n",
" if smiles_response.status_code != 200:\n",
" return None\n",
"\n",
" smiles_data = smiles_response.json()\n",
" properties = smiles_data.get(\"PropertyTable\", {}).get(\"Properties\", [])\n",
" if len(properties) == 0:\n",
" return None\n",
"\n",
" return properties[0].get(\"CanonicalSMILES\")\n",
"\n",
"smiles_cache = {}\n",
"for name in ligand_sequences_common_name:\n",
" if name not in smiles_cache:\n",
" smiles_cache[name] = get_smiles(name)\n",
" if smiles_cache[name] is not None:\n",
" print(f\"Mapped compound {name} to {smiles_cache[name]}\")\n",
"\n",
" if smiles_cache[name] is not None:\n",
" ligand_sequences.append(smiles_cache[name])\n",
"\n",
"# Initialize chain labels starting from 'A'\n",
"chain_labels = iter(ascii_uppercase)\n",
"\n",
Expand Down

0 comments on commit e2ca9e8

Please sign in to comment.