# Find PDB structures that contain two sequences, using the BLAST PDB hits
# for each of the sequences, and write out a JSON list of possible PDB dimers.

import json
from itertools import combinations_with_replacement


def dimer_pdbs(seq_pdbs):
    """Return candidate dimers among the sequences in *seq_pdbs*.

    seq_pdbs maps a sequence name to its list of BLAST PDB hits (see
    pdb_chains() for the hit layout).

    Every unordered pair of sequences is examined, including a sequence
    paired with itself (so homodimers are reported), in the order the names
    appear in seq_pdbs.  The result is a list of
    (seq1, seq2, [(pdb_id, chains1, chains2), ...]) tuples, one for each
    pair that shares at least one PDB entry.
    """
    # Parse the hits once per sequence instead of once per pair.
    chains_by_seq = {name: pdb_chains(hits) for name, hits in seq_pdbs.items()}

    dimers = []
    # combinations_with_replacement yields (a, a), (a, b), ..., (b, b), ...
    # which is the same order as looping over seq_names and seq_names[i:].
    for seq1, seq2 in combinations_with_replacement(tuple(chains_by_seq), 2):
        shared = common_pdbs(chains_by_seq[seq1], chains_by_seq[seq2])
        if shared:
            dimers.append((seq1, seq2, shared))
    return dimers


def common_pdbs(pdb_chains1, pdb_chains2):
    """Return the PDB entries present in both chain mappings.

    Each argument maps a PDB id to a list of chain ids, as produced by
    pdb_chains().  The result is a list of (pdb_id, chains1, chains2)
    tuples in the iteration order of pdb_chains1.  An entry where both
    sides are the same single chain is skipped, since that is a monomer
    rather than a dimer.
    """
    plist = []
    for pdb_id, c1 in pdb_chains1.items():
        if pdb_id not in pdb_chains2:
            continue
        c2 = pdb_chains2[pdb_id]
        if c1 == c2 and len(c1) == 1:
            continue    # Monomer
        plist.append((pdb_id, c1, c2))
    return plist


def pdb_chains(pdb_hits):
    """Group BLAST hits by PDB entry.

    pdb_hits is an iterable of (pdb_chain, evalue, score, descrip) records
    where pdb_chain has the form '<pdb_id>_<chain_id>'.  Only pdb_chain is
    used here.

    Returns a dict mapping each PDB id to a sorted list of its distinct
    chain ids.  Sorting makes the result (and the JSON written from it)
    reproducible instead of depending on set iteration order.
    """
    chains = {}
    for pdb_chain, _evalue, _score, _descrip in pdb_hits:
        pdb_id, chain_id = pdb_chain.split('_')
        # BLAST can return multiple hits to the same chain
        # (eg. gene MSL1, PDB 7EDX_A), so collect chain ids in a set.
        chains.setdefault(pdb_id, set()).add(chain_id)
    return {pdb_id: sorted(chain_ids) for pdb_id, chain_ids in chains.items()}


def main(path='all_monomers_pdb.json', output_path='pdb_dimers.json'):
    """Read per-sequence BLAST PDB hits from *path*, report the candidate
    dimers on stdout and write them as JSON to *output_path*."""
    # Use a context manager so the input file is closed promptly.
    with open(path, 'r', encoding='utf-8') as f:
        pdbs = json.load(f)
    print('monomers found', '\n'.join(name.split('_')[-1] for name in pdbs.keys()))

    dimers = dimer_pdbs(pdbs)
    for seq1, seq2, shared in dimers:
        print(f'{seq1} {seq2} : {", ".join(str(pcc) for pcc in shared)}')

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dimers, f)

    # Alternative compact listing of the pairs:
    #for seq1, seq2, shared in dimers:
    #    print(f'{seq1.split("_")[2]} pp {seq2.split("_")[2]}')


if __name__ == '__main__':
    main()