# Compare pTM values for 466 Mycoplasma genitalium Boltz predictions made with GPU-computed MSAs versus k-mer CPU-computed MSAs.
# In the case of protein MG_007, the pTM was higher and the MSA had twice as many sequences with the k-mer method as with the GPU method.

def compare_ptm(predictions1_dir, msa1_dir, predictions2_dir, msa2_dir):
    """Collect pTM scores and MSA sizes for predictions present in both runs.

    Every entry name found in ``predictions1_dir`` that also exists in
    ``predictions2_dir`` is treated as a prediction name ``pname``.  For each
    such name the 'ptm' value is read from
    ``<predictions_dir>/<pname>/confidence_<pname>_model_0.json`` in both runs,
    and the MSA size is estimated from ``<msa_dir>/<pname>.a3m``.

    Parameters:
        predictions1_dir: directory of per-prediction subdirectories, run 1.
        msa1_dir: directory holding the ``<pname>.a3m`` files used for run 1.
        predictions2_dir: directory of per-prediction subdirectories, run 2.
        msa2_dir: directory holding the ``<pname>.a3m`` files used for run 2.

    Returns:
        A list of ``(pname, ptm1, ptm2, nseq1, nseq2)`` tuples, ordered by
        ``pname``.

    Raises:
        OSError: if a confidence file or MSA file for a shared name is missing
            or unreadable.
        KeyError: if a confidence file has no 'ptm' entry.
    """
    from os import listdir
    from os.path import join, exists
    import json

    def read_ptm(prediction_dir, pname):
        # Load the 'ptm' value from the model 0 confidence file.
        conf_path = join(prediction_dir, f'confidence_{pname}_model_0.json')
        with open(conf_path, 'r') as f:
            return json.load(f)['ptm']

    ptm_scores = []
    # os.listdir() returns names in arbitrary order; sort so that the result
    # (and any report printed from it) is reproducible between runs.
    for pname in sorted(listdir(predictions1_dir)):
        pdir1 = join(predictions1_dir, pname)
        pdir2 = join(predictions2_dir, pname)
        if not exists(pdir2):
            # Only compare predictions that were made in both runs.
            continue
        ptm1 = read_ptm(pdir1, pname)
        ptm2 = read_ptm(pdir2, pname)
        # Halving the line count assumes each sequence occupies exactly two
        # lines (header + sequence) in the .a3m file -- TODO confirm.
        nseq1 = count_lines(join(msa1_dir, pname + '.a3m')) // 2
        nseq2 = count_lines(join(msa2_dir, pname + '.a3m')) // 2
        ptm_scores.append((pname, ptm1, ptm2, nseq1, nseq2))
    return ptm_scores

def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is streamed line by line rather than loaded into a list, so
    memory use stays constant however large the file is.  A final line
    without a trailing newline still counts as a line; an empty file gives 0.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, 'r') as f:
        return sum(1 for _ in f)
    
# Input locations for the two runs being compared.  Run 1's prediction path
# names the k-mer run; run 2 is presumably the GPU-MSA run (verify against
# how the result directories were produced).
msa1_dir = '/home/goddard/mgen_msa/results27'
predictions1_dir = '/home/goddard/mgen_msa/boltz_1200_kmer/boltz_results_/predictions'
msa2_dir = '/home/goddard/mgen_msa/results11'
predictions2_dir = '/home/goddard/mgen_msa/boltz_monomers_1200/boltz_results_/predictions'
ptm_scores = compare_ptm(
    predictions1_dir=predictions1_dir,
    msa1_dir=msa1_dir,
    predictions2_dir=predictions2_dir,
    msa2_dir=msa2_dir,
)

# Report only the predictions whose pTM differs by at least 0.1 between runs.
# Columns: sign, name, run 1 pTM, run 2 pTM, run 1 MSA size, run 2 MSA size.
for name, ptm1, ptm2, nseq1, nseq2 in ptm_scores:
    if not (abs(ptm1 - ptm2) >= 0.1):
        continue
    # The sign is that of (ptm2 - ptm1): '+' when run 2 scored higher.
    if ptm1 > ptm2:
        sign = '-'
    else:
        sign = '+'
    print(sign, name, '%.3f' % ptm1, '%.3f' % ptm2, nseq1, nseq2)
