Added multiple anchors to work without NetMHCpan

AmirAsgary · AmirAsgary · commit 5148a144e0f2 · 2026-01-22T11:09:09.000+01:00
diff --git a/run_PMGen.py b/run_PMGen.py
@@ -21,6 +21,7 @@ def remove_files_in_directory(directory):
 
 
 def main():
+    allowed_mpnn_models = [i.replace('.pt','') for i in os.listdir('ProteinMPNN/vanilla_model_weights/')]
     parser = argparse.ArgumentParser(description="Run PMGen wrapper or modeling.")
 
     # Default settings
@@ -89,6 +90,7 @@ def main():
                                                                              "Fixed positions should be provided as a list for each row in --df, and the columnname should be"
                                                                              "'fixed_positions'. It will automaticallly enables --fix_anchors as well, but uses fixed positions"
                                                                              "given to it only.")
+    parser.add_argument("--proteinmpnn_model_name", type=str, default="v_48_020_soft_ft", help=f"ProteinMPNN model name. Allowed values: {allowed_mpnn_models}")
 
     # BioEmu Argumetns
     parser.add_argument('--run_bioemu', action='store_true', help='Enables bioemu pMHC sampling.')
@@ -124,6 +126,7 @@ def main():
     parser.add_argument('--iterative_peptide_gen', type=int, default=0, help='If used, the iterative peptide generation is performed, defines the number of iterations.')
 
     args = parser.parse_args()
+    assert(args.proteinmpnn_model_name) in allowed_mpnn_models, f"Allowed models: {allowed_mpnn_models}"
     bioemu_assertions(args)
     for iteration in range(args.iterative_peptide_gen + 1):
         if iteration == 0:
diff --git a/run_utils.py b/run_utils.py
@@ -587,7 +587,8 @@ def __init__(self, PMGen_pdb, output_dir,
                  only_pseudo_sequence_design=True, anchor_pred=True,
                  sampling_temp=5, batch_size=1, hot_spot_thr=6.0,
                  save_hotspots=True, binder_pred=False, fix_anchors=False,
-                 anchor_and_peptide=None, return_match_allele=False):
+                 anchor_and_peptide=None, return_match_allele=False,
+                 model_name='v_48_020'):
         '''
         Args:
             PMGen_pdb: (str) Single Chain pdb path generated by PMGen AFfine.
@@ -609,6 +610,7 @@ def __init__(self, PMGen_pdb, output_dir,
                                 Only used if fix_anchors==True, Default: False
             return_match_allele: (bool) Returns a list of one or two elementsself.matched_alleles if binder_pred is True,
                                 Depending on MHC type.
+            model_name (string): model name of proteinmpnn, allowed models found in ProteinMPNN/vanilla_model_weights
         '''
         self.pdb = PMGen_pdb
         self.output_dir = output_dir
@@ -627,6 +629,7 @@ def __init__(self, PMGen_pdb, output_dir,
         self.fix_anchors = fix_anchors
         self.anchor_and_peptide = anchor_and_peptide
         self.return_match_allele = return_match_allele
+        self.model_name = model_name
         self.input_assertion()
 
         os.makedirs(self.output_dir, exist_ok=True)
@@ -684,7 +687,8 @@ def __mhc_design(self):
             "--seed", "37",
             "--batch_size", f'{self.batch_size}',
             "--save_probs", "1",
-            "--save_score", "1"
+            "--save_score", "1",
+            "--model_name", f"{self.model_name}"
         ], check=True)
         print('Full MHC Sequence Generation Mode Done! *****\n')
 
@@ -721,6 +725,8 @@ def __peptide_design(self):
             "--save_probs", "1",
             "--save_score", "1",
             "--omit_AAs", "X",
+            "--path_to_model_weights", "ProteinMPNN/vanilla_model_weights",
+            "--model_name", f"{self.model_name}"
         ]
         if self.fix_anchors:# to fix anchors, fixed_pdbs file and design_only_positions should be generated
             # we have anchors, we need to define designable positions which are non-anchor positions
@@ -811,7 +817,8 @@ def __only_pseudo_sequence_design(self):
             "--seed", "37",
             "--batch_size", f'{self.batch_size}',
             "--save_probs", "1",
-            "--save_score", "1"
+            "--save_score", "1",
+            "--model_name", f"{self.model_name}"
         ], check=True)
         print('MHC Pseudo Sequence Generation Mode Done! *****\n')
 
@@ -857,7 +864,8 @@ def run_single_proteinmpnn(path, directory, args, anchor_and_peptide=None):
         hot_spot_thr=args.hot_spot_thr,
         binder_pred=args.binder_pred,
         fix_anchors=args.fix_anchors,
-        anchor_and_peptide=anchor_and_peptide
+        anchor_and_peptide=anchor_and_peptide,
+        model_name=args.proteinmpnn_model_name
     )
     runner_mpnn.run() #
 
@@ -978,25 +986,45 @@ def _process_row(self, row):
             assert len(mhc_seq_list) == 1, (f'mhc_seq for mhc_type==1, should be string with no "/", '
                                             f'found: \n {str(row.mhc_seq)}')
         parallel = True if self.args.run == 'parallel' else False
-        netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=self.args.max_cores, parallel=parallel)
-        seen_cores = []
         results = {'anchors': [], 'mhc_seqs': [], 'ids': [], 'peptides': [], 'mhc_types': []}
-        counter = 0
-        for j, net_row in netmhc_df.iterrows():
-            peptide2 = str(net_row['Core'])
-            peptide1 = str(row.peptide)
-            predicted_anchors, pept1, pept2 = processing_functions.align_and_find_anchors_mhc(peptide1, peptide2,
-                                                                                              mhc_type)
-            if not predicted_anchors in seen_cores:
-                seen_cores.append(predicted_anchors)
-                results['anchors'].append(";".join([str(pp) for pp in predicted_anchors]))
-                results['mhc_seqs'].append(str(row['mhc_seq']))
+        try:
+            netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=1, parallel=False)
+            seen_cores = []
+            counter = 0
+            for j, net_row in netmhc_df.iterrows():
+                peptide2 = str(net_row['Core'])
+                peptide1 = str(row.peptide)
+                predicted_anchors, pept1, pept2 = processing_functions.align_and_find_anchors_mhc(peptide1, peptide2,
+                                                                                                  mhc_type)
+                if not predicted_anchors in seen_cores:
+                    seen_cores.append(predicted_anchors)
+                    results['anchors'].append(";".join([str(pp) for pp in predicted_anchors]))
+                    results['mhc_seqs'].append(str(row['mhc_seq']))
+                    results['ids'].append(str(row['id']) + '_' + str(counter))
+                    results['peptides'].append(str(row['peptide']))
+                    results['mhc_types'].append(int(row['mhc_type']))
+                    counter += 1
+                if counter == self.args.top_k: break
+            return results
+        except Exception as e: # if netmhcpan fails or does not exist, take all possible anchors
+            peptide = str(row.peptide)
+            mhc_type = int(row.mhc_type)
+            pep_len = len(peptide)
+            anchor_combinations = []
+            if mhc_type == 1:
+                anchor_combinations = processing_functions.anchor_combinations_mhc1(pep_len)
+            elif mhc_type == 2:
+                anchor_combinations = processing_functions.anchor_combinations_mhc2(pep_len)
+            assert len(anchor_combinations) > 0, f'no anchor combination is found for {row.id}. The peptide sequence "{row.peptide}" length should be longer than 9 for mhc2 and 8 for mhc1'
+            for counter, anchors in enumerate(anchor_combinations):
+                results['anchors'].append(';'.join([str(i) for i in anchors]))
+                results['mhc_seqs'].append(str(row.mhc_seq))
                 results['ids'].append(str(row['id']) + '_' + str(counter))
                 results['peptides'].append(str(row['peptide']))
                 results['mhc_types'].append(int(row['mhc_type']))
-                counter += 1
-            if counter == self.args.top_k: break
-        return results
+            return results
+
+
 
     def process(self):
         """
@@ -1006,7 +1034,7 @@ def process(self):
             DataFrame with processed results
         """
         df = pd.read_csv(self.args.df, sep='\t')
-        print(f" Starting Multiple Anchor Mode on {self.args.max_cores} cores. Make Sure NetMHCpan is installed")
+        print(f" Starting Multiple Anchor Mode on {self.args.max_cores} cores. I netMHCpan is installed it is used, if not, all anchor combinations are processed")
         # Determine number of processes
         num_processes = min(cpu_count(), int(self.args.max_cores))
         # Create multiprocessing pool
diff --git a/user_setting.py b/user_setting.py
@@ -1,7 +1,7 @@
 ##### PLEASE UPDATE #####
 #Absolute path to NetMHCIPan executable file e.g. 'home/user/netMHCpan-4.1/netMHCpan'
-netmhcipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIpan-4.1/netMHCpan'
-netmhciipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIIpan-4.3/netMHCIIpan'
+netmhcipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIpan-4.1/netMHCpan323'
+netmhciipan_path = '/home/amir/amir/ParseFold/PMGen/netMHCIIpan-4.3/netMHCIIpan323'
 
 ##### Do not Change #######
 import os
diff --git a/utils/processing_functions.py b/utils/processing_functions.py
@@ -7,7 +7,6 @@
 from Bio import pairwise2, PDB, SeqIO
 from Bio.Align import substitution_matrices
 from scipy.spatial.distance import cdist
-import shutil
 import random
 import sys
 import Levenshtein
@@ -1643,3 +1642,22 @@ def add_plddt_as_bfactor_verbose(array_path, pdb_path, output_pdb_path, verbose=
         print("=" * 60)
     return stats
 
+
+def anchor_combinations_mhc2(n, d1=3, d2=2, d3=3):
+    quadruplets = []
+    for i in range(1, n + 1):
+        j = i + d1
+        k = j + d2
+        l = k + d3
+        if l <= n:
+            quadruplets.append([i, j, k, l])
+    return quadruplets
+
+
+def anchor_combinations_mhc1(n, min_distance=6):
+    pairs = []
+    for i in range(1, n + 1):
+        for j in range(i + 1, n + 1):
+            if j - i >= min_distance:
+                pairs.append([i, j])
+    return pairs