| 1 | # Copyright 2021 DeepMind Technologies Limited
|
|---|
| 2 | #
|
|---|
| 3 | # Licensed under the Apache License, Version 2.0 (the "License");
|
|---|
| 4 | # you may not use this file except in compliance with the License.
|
|---|
| 5 | # You may obtain a copy of the License at
|
|---|
| 6 | #
|
|---|
| 7 | # http://www.apache.org/licenses/LICENSE-2.0
|
|---|
| 8 | #
|
|---|
| 9 | # Unless required by applicable law or agreed to in writing, software
|
|---|
| 10 | # distributed under the License is distributed on an "AS IS" BASIS,
|
|---|
| 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 12 | # See the License for the specific language governing permissions and
|
|---|
| 13 | # limitations under the License.
|
|---|
| 14 |
|
|---|
| 15 | """Singularity launch script for Alphafold singularity image."""
|
|---|
| 16 |
|
|---|
| 17 | import os
|
|---|
| 18 | import pathlib
|
|---|
| 19 | import signal
|
|---|
| 20 | from typing import Tuple
|
|---|
| 21 |
|
|---|
| 22 | from absl import app
|
|---|
| 23 | from absl import flags
|
|---|
| 24 | from absl import logging
|
|---|
| 25 |
|
|---|
# Command-line interface of the launcher. Every flag is registered with absl at
# import time; the parsed values are read through `flags.FLAGS`.

# --- GPU selection ---------------------------------------------------------
flags.DEFINE_boolean(
    name='use_gpu',
    default=True,
    help='Enable NVIDIA runtime to run with GPUs.')
flags.DEFINE_string(
    name='gpu_devices',
    # Default to the SGE_GPU environment variable when it is set, else '0'.
    default=os.environ.get('SGE_GPU', '0'),
    help='Comma separated list GPU identifiers to set environment variable CUDA_VISIBLE_DEVICES.')

# --- Prediction inputs -----------------------------------------------------
flags.DEFINE_list(
    name='fasta_paths',
    default=None,
    help=(
        'Paths to FASTA files, each containing a prediction '
        'target that will be folded one after another. If a FASTA file contains '
        'multiple sequences, then it will be folded as a multimer. Paths should be '
        'separated by commas. All FASTA paths must have a unique basename as the '
        'basename is used to name the output directories for each prediction.'))
flags.DEFINE_list(
    name='is_prokaryote_list',
    default=None,
    help=(
        'Optional for multimer system, not used by the '
        'single chain system. This list should contain a boolean for each fasta '
        'specifying true where the target complex is from a prokaryote, and false '
        'where it is not, or where the origin is unknown. These values determine '
        'the pairing method for the MSA.'))

# --- Locations on disk -----------------------------------------------------
flags.DEFINE_string(
    name='output_dir',
    default='/tmp/alphafold',
    help='Path to a directory that will store the results.')
flags.DEFINE_string(
    name='data_dir',
    default='/wynton/group/databases/alphafold_CASP14',
    help=(
        'Path to directory with supporting data: AlphaFold parameters and genetic '
        'and template databases. Set to the target of download_all_databases.sh.'))
flags.DEFINE_string(
    name='singularity_image_path',
    default='/wynton/home/ferrin/goddard/alphafold_singularity/alphafold21.sif',
    help='Path to the AlphaFold singularity image.')

# --- Pipeline configuration ------------------------------------------------
flags.DEFINE_string(
    name='max_template_date',
    default='2100-01-01',
    help=(
        'Maximum template release date to consider (ISO-8601 format: YYYY-MM-DD). '
        'Important if folding historical test sets.'))
flags.DEFINE_enum(
    name='db_preset',
    default='full_dbs',
    enum_values=['full_dbs', 'reduced_dbs'],
    help=(
        'Choose preset MSA database configuration - smaller genetic database '
        'config (reduced_dbs) or full genetic database config (full_dbs)'))
flags.DEFINE_enum(
    name='model_preset',
    default='monomer',
    enum_values=['monomer', 'monomer_casp14', 'monomer_ptm', 'multimer'],
    help=(
        'Choose preset model configuration - the monomer model, the monomer model '
        'with extra ensembling, monomer model with pTM head, or multimer model'))
flags.DEFINE_boolean(
    name='benchmark',
    default=False,
    help=(
        'Run multiple JAX model evaluations to obtain a timing that excludes the '
        'compilation time, which should be more indicative of the time required '
        'for inferencing many proteins.'))
flags.DEFINE_boolean(
    name='use_precomputed_msas',
    default=False,
    help=(
        'Whether to read MSAs that have been written to disk. WARNING: This will '
        'not check if the sequence, database or configuration have changed.'))
|
|---|
| 74 |
|
|---|
# Module-wide handle to absl's flag registry; flag values are read from it
# inside main() once app.run() has parsed the command line.
FLAGS = flags.FLAGS

# Directory prefix under which the disabled _create_mount helper below would
# place its bind mounts. No active code in the visible file reads it.
_ROOT_MOUNT_DIRECTORY = '/mnt/'

# The Docker bind-mount helper is disabled by wrapping it in a bare string
# literal, so it is never defined. It refers to `types.Mount`, and no `types`
# name is imported in the visible part of this file, so it could not be
# re-enabled without restoring that import.
'''
def _create_mount(mount_name: str, path: str) -> Tuple[types.Mount, str]:
    path = os.path.abspath(path)
    source_path = os.path.dirname(path)
    target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, mount_name)
    logging.info('Mounting %s -> %s', source_path, target_path)
    mount = types.Mount(target_path, source_path, type='bind', read_only=True)
    return mount, os.path.join(target_path, os.path.basename(path))
'''
|
|---|
| 88 |
|
|---|
def main(argv):
    """Assemble and print the Singularity command line for an AlphaFold run.

    Database locations are derived from ``FLAGS.data_dir``; the subset needed
    for the selected ``FLAGS.model_preset`` and ``FLAGS.db_preset`` is turned
    into ``--<name>=<path>`` arguments. Together with the remaining flags and
    a small set of environment variables these are formatted into an
    ``env ... singularity run ...`` shell command, which is printed to stdout.

    Args:
      argv: Positional command-line arguments left after flag parsing. Only
        the program name is accepted.

    Raises:
      app.UsageError: If extra positional arguments are supplied, or if
        ``FLAGS.data_dir`` equals, or lies inside, the directory two levels
        above this script.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    data_dir = FLAGS.data_dir

    def data_path(*parts):
        """Return the path of a database entry below the data directory."""
        return os.path.join(data_dir, *parts)

    # The directory two levels above this script is treated as the AlphaFold
    # repository; the databases must not be stored in it or below it.
    alphafold_path = pathlib.Path(__file__).parent.parent
    data_dir_path = pathlib.Path(data_dir)
    if alphafold_path == data_dir_path or alphafold_path in data_dir_path.parents:
        raise app.UsageError(
            f'The download directory {FLAGS.data_dir} should not be a subdirectory '
            f'in the AlphaFold repository directory. If it is, the Docker build is '
            f'slow since the large databases are copied during the image creation.')

    # The individual locations below can be edited if the data lives somewhere
    # other than FLAGS.data_dir. The order of this list is the order in which
    # the arguments appear on the command line.
    database_paths = [
        # Uniref90 and MGnify databases, for use by JackHMMER.
        ('uniref90_database_path', data_path('uniref90', 'uniref90.fasta')),
        ('mgnify_database_path',
         data_path('mgnify', 'mgy_clusters_2018_12.fa')),
        ('data_dir', data_dir),
        # Directory with template mmCIF structures, each named <pdb_id>.cif.
        ('template_mmcif_dir', data_path('pdb_mmcif', 'mmcif_files')),
        # File mapping obsolete PDB IDs to their replacements.
        ('obsolete_pdbs_path', data_path('pdb_mmcif', 'obsolete.dat')),
    ]

    if FLAGS.model_preset == 'multimer':
        database_paths += [
            # Uniprot database, for use by JackHMMER.
            ('uniprot_database_path', data_path('uniprot', 'uniprot.fasta')),
            # PDB seqres database, for use by hmmsearch.
            ('pdb_seqres_database_path',
             data_path('pdb_seqres', 'pdb_seqres.txt')),
        ]
    else:
        # PDB70 database, for use by HHsearch.
        database_paths.append(
            ('pdb70_database_path', data_path('pdb70', 'pdb70')))

    if FLAGS.db_preset == 'reduced_dbs':
        # Small BFD database, for use by JackHMMER.
        database_paths.append(
            ('small_bfd_database_path',
             data_path('small_bfd', 'bfd-first_non_consensus_sequences.fasta')))
    else:
        database_paths += [
            # Uniclust30 and BFD databases, for use by HHblits.
            ('uniclust30_database_path',
             data_path('uniclust30', 'uniclust30_2018_08', 'uniclust30_2018_08')),
            ('bfd_database_path',
             data_path('bfd',
                       'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt')),
        ]

    # FASTA paths are forwarded exactly as given; they are not remapped to a
    # container-side mount point.
    command_args = [f'--fasta_paths={",".join(FLAGS.fasta_paths)}']
    command_args.extend(
        f'--{name}={path}' for name, path in database_paths if path)

    # NOTE(review): FLAGS.output_dir is not consulted. Results are always
    # directed to ./output below the current working directory, which is one
    # of the two directories bound into the container further down.
    output_target_path = os.path.abspath('output')

    command_args.extend([
        f'--output_dir={output_target_path}',
        f'--max_template_date={FLAGS.max_template_date}',
        f'--db_preset={FLAGS.db_preset}',
        f'--model_preset={FLAGS.model_preset}',
        f'--benchmark={FLAGS.benchmark}',
        f'--use_precomputed_msas={FLAGS.use_precomputed_msas}',
        '--logtostderr',
    ])

    if FLAGS.is_prokaryote_list:
        command_args.append(
            f'--is_prokaryote_list={",".join(FLAGS.is_prokaryote_list)}')

    env_vars = {
        'CUDA_VISIBLE_DEVICES': FLAGS.gpu_devices,
        # The following flags allow us to make predictions on proteins that
        # would typically be too long to fit into GPU memory.
        'TF_FORCE_UNIFIED_MEMORY': '1',
        'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
    }
    env_assignments = ['%s=%s' % (key, value) for key, value in env_vars.items()]

    print('Running Alphafold with args:\n%s\nenvironment:\n%s'
          % ('\n'.join(command_args), '\n'.join(env_assignments)))

    # Honour --use_gpu: Singularity's NVIDIA support (--nv) is requested only
    # when the flag is set. With the default (True) the command is unchanged.
    nv_option = '--nv ' if FLAGS.use_gpu else ''
    cmd = ('env %s singularity run %s-B "%s" -B "%s" %s %s' %
           (' '.join(env_assignments), nv_option, FLAGS.data_dir, os.getcwd(),
            FLAGS.singularity_image_path, ' '.join(command_args)))
    print(cmd)

    # NOTE(review): the command is only printed, never executed. The earlier
    # Docker-based launch (container start, SIGINT handler that kills the
    # container, streaming of container logs) was disabled and has no
    # Singularity counterpart here. Confirm whether this script is meant to
    # run the command itself.
|
|---|
| 243 |
|
|---|
if __name__ == '__main__':
    # At least one FASTA input is mandatory, so absl is told to reject a
    # command line that omits --fasta_paths before main() is entered.
    flags.mark_flag_as_required('fasta_paths')
    app.run(main)
|
|---|