Skip to content

Commit 5148a14

Browse files
committed
added multiple anchors to work without netmhcpan
1 parent 2a72ff4 commit 5148a14

File tree

4 files changed

+72
-23
lines changed

4 files changed

+72
-23
lines changed

run_PMGen.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def remove_files_in_directory(directory):
2121

2222

2323
def main():
24+
allowed_mpnn_models = [i.replace('.pt','') for i in os.listdir('ProteinMPNN/vanilla_model_weights/')]
2425
parser = argparse.ArgumentParser(description="Run PMGen wrapper or modeling.")
2526

2627
# Default settings
@@ -89,6 +90,7 @@ def main():
8990
"Fixed positions should be provided as a list for each row in --df, and the columnname should be"
9091
"'fixed_positions'. It will automaticallly enables --fix_anchors as well, but uses fixed positions"
9192
"given to it only.")
93+
parser.add_argument("--proteinmpnn_model_name", type=str, default="v_48_020_soft_ft", help=f"ProteinMPNN model name. Allowed values: {allowed_mpnn_models}")
9294

9395
# BioEmu Arguments
9496
parser.add_argument('--run_bioemu', action='store_true', help='Enables bioemu pMHC sampling.')
@@ -124,6 +126,7 @@ def main():
124126
parser.add_argument('--iterative_peptide_gen', type=int, default=0, help='If used, the iterative peptide generation is performed, defines the number of iterations.')
125127

126128
args = parser.parse_args()
129+
assert(args.proteinmpnn_model_name) in allowed_mpnn_models, f"Allowed models: {allowed_mpnn_models}"
127130
bioemu_assertions(args)
128131
for iteration in range(args.iterative_peptide_gen + 1):
129132
if iteration == 0:

run_utils.py

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,8 @@ def __init__(self, PMGen_pdb, output_dir,
587587
only_pseudo_sequence_design=True, anchor_pred=True,
588588
sampling_temp=5, batch_size=1, hot_spot_thr=6.0,
589589
save_hotspots=True, binder_pred=False, fix_anchors=False,
590-
anchor_and_peptide=None, return_match_allele=False):
590+
anchor_and_peptide=None, return_match_allele=False,
591+
model_name='v_48_020'):
591592
'''
592593
Args:
593594
PMGen_pdb: (str) Single Chain pdb path generated by PMGen AFfine.
@@ -609,6 +610,7 @@ def __init__(self, PMGen_pdb, output_dir,
609610
Only used if fix_anchors==True, Default: False
610611
return_match_allele: (bool) Returns a list of one or two elementsself.matched_alleles if binder_pred is True,
611612
Depending on MHC type.
613+
model_name (string): model name of proteinmpnn, allowed models found in ProteinMPNN/vanilla_model_weights
612614
'''
613615
self.pdb = PMGen_pdb
614616
self.output_dir = output_dir
@@ -627,6 +629,7 @@ def __init__(self, PMGen_pdb, output_dir,
627629
self.fix_anchors = fix_anchors
628630
self.anchor_and_peptide = anchor_and_peptide
629631
self.return_match_allele = return_match_allele
632+
self.model_name = model_name
630633
self.input_assertion()
631634

632635
os.makedirs(self.output_dir, exist_ok=True)
@@ -684,7 +687,8 @@ def __mhc_design(self):
684687
"--seed", "37",
685688
"--batch_size", f'{self.batch_size}',
686689
"--save_probs", "1",
687-
"--save_score", "1"
690+
"--save_score", "1",
691+
"--model_name", f"{self.model_name}"
688692
], check=True)
689693
print('Full MHC Sequence Generation Mode Done! *****\n')
690694

@@ -721,6 +725,8 @@ def __peptide_design(self):
721725
"--save_probs", "1",
722726
"--save_score", "1",
723727
"--omit_AAs", "X",
728+
"--path_to_model_weights", "ProteinMPNN/vanilla_model_weights",
729+
"--model_name", f"{self.model_name}"
724730
]
725731
if self.fix_anchors:# to fix anchors, fixed_pdbs file and design_only_positions should be generated
726732
# we have anchors, we need to define designable positions which are non-anchor positions
@@ -811,7 +817,8 @@ def __only_pseudo_sequence_design(self):
811817
"--seed", "37",
812818
"--batch_size", f'{self.batch_size}',
813819
"--save_probs", "1",
814-
"--save_score", "1"
820+
"--save_score", "1",
821+
"--model_name", f"{self.model_name}"
815822
], check=True)
816823
print('MHC Pseudo Sequence Generation Mode Done! *****\n')
817824

@@ -857,7 +864,8 @@ def run_single_proteinmpnn(path, directory, args, anchor_and_peptide=None):
857864
hot_spot_thr=args.hot_spot_thr,
858865
binder_pred=args.binder_pred,
859866
fix_anchors=args.fix_anchors,
860-
anchor_and_peptide=anchor_and_peptide
867+
anchor_and_peptide=anchor_and_peptide,
868+
model_name=args.proteinmpnn_model_name
861869
)
862870
runner_mpnn.run() #
863871

@@ -978,25 +986,45 @@ def _process_row(self, row):
978986
assert len(mhc_seq_list) == 1, (f'mhc_seq for mhc_type==1, should be string with no "/", '
979987
f'found: \n {str(row.mhc_seq)}')
980988
parallel = True if self.args.run == 'parallel' else False
981-
netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=self.args.max_cores, parallel=parallel)
982-
seen_cores = []
983989
results = {'anchors': [], 'mhc_seqs': [], 'ids': [], 'peptides': [], 'mhc_types': []}
984-
counter = 0
985-
for j, net_row in netmhc_df.iterrows():
986-
peptide2 = str(net_row['Core'])
987-
peptide1 = str(row.peptide)
988-
predicted_anchors, pept1, pept2 = processing_functions.align_and_find_anchors_mhc(peptide1, peptide2,
989-
mhc_type)
990-
if not predicted_anchors in seen_cores:
991-
seen_cores.append(predicted_anchors)
992-
results['anchors'].append(";".join([str(pp) for pp in predicted_anchors]))
993-
results['mhc_seqs'].append(str(row['mhc_seq']))
990+
try:
991+
netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=1, parallel=False)
992+
seen_cores = []
993+
counter = 0
994+
for j, net_row in netmhc_df.iterrows():
995+
peptide2 = str(net_row['Core'])
996+
peptide1 = str(row.peptide)
997+
predicted_anchors, pept1, pept2 = processing_functions.align_and_find_anchors_mhc(peptide1, peptide2,
998+
mhc_type)
999+
if not predicted_anchors in seen_cores:
1000+
seen_cores.append(predicted_anchors)
1001+
results['anchors'].append(";".join([str(pp) for pp in predicted_anchors]))
1002+
results['mhc_seqs'].append(str(row['mhc_seq']))
1003+
results['ids'].append(str(row['id']) + '_' + str(counter))
1004+
results['peptides'].append(str(row['peptide']))
1005+
results['mhc_types'].append(int(row['mhc_type']))
1006+
counter += 1
1007+
if counter == self.args.top_k: break
1008+
return results
1009+
except Exception as e: # if netmhcpan fails or does not exist, take all possible anchors
1010+
peptide = str(row.peptide)
1011+
mhc_type = int(row.mhc_type)
1012+
pep_len = len(peptide)
1013+
anchor_combinations = []
1014+
if mhc_type == 1:
1015+
anchor_combinations = processing_functions.anchor_combinations_mhc1(pep_len)
1016+
elif mhc_type == 2:
1017+
anchor_combinations = processing_functions.anchor_combinations_mhc2(pep_len)
1018+
assert len(anchor_combinations) > 0, f'no anchor combination is found for {row.id}. The peptide sequence "{row.peptide}" length should be longer than 9 for mhc2 and 8 for mhc1'
1019+
for counter, anchors in enumerate(anchor_combinations):
1020+
results['anchors'].append(';'.join([str(i) for i in anchors]))
1021+
results['mhc_seqs'].append(str(row.mhc_seq))
9941022
results['ids'].append(str(row['id']) + '_' + str(counter))
9951023
results['peptides'].append(str(row['peptide']))
9961024
results['mhc_types'].append(int(row['mhc_type']))
997-
counter += 1
998-
if counter == self.args.top_k: break
999-
return results
1025+
return results
1026+
1027+
10001028

10011029
def process(self):
10021030
"""
@@ -1006,7 +1034,7 @@ def process(self):
10061034
DataFrame with processed results
10071035
"""
10081036
df = pd.read_csv(self.args.df, sep='\t')
1009-
print(f" Starting Multiple Anchor Mode on {self.args.max_cores} cores. Make Sure NetMHCpan is installed")
1037+
print(f" Starting Multiple Anchor Mode on {self.args.max_cores} cores. I netMHCpan is installed it is used, if not, all anchor combinations are processed")
10101038
# Determine number of processes
10111039
num_processes = min(cpu_count(), int(self.args.max_cores))
10121040
# Create multiprocessing pool

user_setting.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
##### PLEASE UPDATE #####
22
#Absolute path to NetMHCIPan executable file e.g. 'home/user/netMHCpan-4.1/netMHCpan'
3-
netmhcipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIpan-4.1/netMHCpan'
4-
netmhciipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIIpan-4.3/netMHCIIpan'
3+
netmhcipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIpan-4.1/netMHCpan323'
4+
netmhciipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIIpan-4.3/netMHCIIpan323'
55

66
##### Do not Change #######
77
import os

utils/processing_functions.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from Bio import pairwise2, PDB, SeqIO
88
from Bio.Align import substitution_matrices
99
from scipy.spatial.distance import cdist
10-
import shutil
1110
import random
1211
import sys
1312
import Levenshtein
@@ -1643,3 +1642,22 @@ def add_plddt_as_bfactor_verbose(array_path, pdb_path, output_pdb_path, verbose=
16431642
print("=" * 60)
16441643
return stats
16451644

1645+
1646+
def anchor_combinations_mhc2(n, d1=3, d2=2, d3=3):
    """Enumerate four-position anchor sets with fixed spacing up to position n.

    Candidate start positions run from 1 to ``n`` inclusive. For each start
    ``p`` the set is ``[p, p + d1, p + d1 + d2, p + d1 + d2 + d3]``; it is
    kept only when its last position does not exceed ``n``.

    Args:
        n: Largest allowed position (presumably the peptide length; confirm
            against the caller).
        d1: Offset from the first to the second position.
        d2: Offset from the second to the third position.
        d3: Offset from the third to the fourth position.

    Returns:
        A list of four-element lists, ordered by increasing start position.
        Empty when no start position satisfies the bound.
    """
    return [
        [start, start + d1, start + d1 + d2, start + d1 + d2 + d3]
        for start in range(1, n + 1)
        # Keep only sets whose final position still fits within n.
        if start + d1 + d2 + d3 <= n
    ]
1655+
1656+
1657+
def anchor_combinations_mhc1(n, min_distance=6):
    """Enumerate position pairs within 1..n that are far enough apart.

    Every pair ``[first, last]`` with ``1 <= first < last <= n`` is
    considered, and it is kept when ``last - first >= min_distance``.

    Args:
        n: Largest allowed position (presumably the peptide length; confirm
            against the caller).
        min_distance: Minimum required difference between the two positions.

    Returns:
        A list of two-element lists, ordered by ``first`` and then by
        ``last``. Empty when no pair satisfies the distance requirement.
    """
    return [
        [first, last]
        for first in range(1, n + 1)
        for last in range(first + 1, n + 1)
        if last - first >= min_distance
    ]

0 commit comments

Comments
 (0)