Add clean_fasta_headers to support older versions of NetMHCpan

AmirAsgary · AmirAsgary · commit e1cc39ee7e11 · 2025-10-28T09:45:04.000+01:00
diff --git a/run_utils.py b/run_utils.py
@@ -747,6 +747,7 @@ def __binder_pred(self):
         output_dir = os.path.join(self.output_dir, 'peptide_design')
         peptide_fasta_file = [i for i in os.listdir(output_dir+'/'+'seqs') if i.endswith('.fa')][0]
         peptide_fasta_file = os.path.join(output_dir+'/'+'seqs', peptide_fasta_file)
+        processing_functions.clean_fasta_headers(peptide_fasta_file, peptide_fasta_file)
         mhc_type = 2 if len(self.chain_dict_dist.keys()) == 2 else 1
         mhc_seq_dict = processing_functions.fetch_polypeptide_sequences(self.multichain_pdb)
         mhc_seq_list = [mhc_seq_dict['A'], mhc_seq_dict['B']] if mhc_type==2 else [mhc_seq_dict['A']]
diff --git a/utils/processing_functions.py b/utils/processing_functions.py
@@ -1188,6 +1188,61 @@ def run_netmhcpan_parallel(peptide_fasta, allele_list, output, mhc_type,
 
 ###################################################################
 
+def clean_fasta_headers(input_fasta, output_fasta):
+    """
+    Clean FASTA headers by standardizing sample numbering and reorganizing metadata.
+    Keeps the first header unchanged and only modifies subsequent headers.
+
+    The input is read fully before the output is opened, so input_fasta and
+    output_fasta may be the same path (the file is then rewritten in place).
+
+    Parameters:
+    -----------
+    input_fasta : str
+        Path to input FASTA file
+    output_fasta : str
+        Path to output cleaned FASTA file (may be the same as input_fasta)
+
+    Example:
+    --------
+    First header: unchanged
+    Subsequent headers:
+        Input:  >T=1.5, sample=1, score=1.8596, global_score=1.7745, seq_recovery=0.5000
+        Output: >sample_1, score=1.8596, global_score=1.7745, seq_recovery=0.5000, T=1.5
+    """
+    # Read everything up front: opening output_fasta with "w" truncates it, which
+    # would empty the input as well when both arguments name the same file.
+    with open(input_fasta) as f:
+        lines = [line.strip() for line in f]
+
+    cleaned = []
+    sample_counter = 1
+    seen_first_header = False
+    for line in lines:
+        if not line.startswith(">") or not seen_first_header:
+            # Sequence lines and the first header are copied through unchanged
+            seen_first_header = seen_first_header or line.startswith(">")
+            cleaned.append(line)
+            continue
+
+        # Extract T value
+        t_match = re.search(r"T=([\d\.]+)", line)
+        # Remove "T=..." and "sample=..." parts, then the leading ">" and whitespace
+        clean_header = re.sub(r"T=[\d\.]+,?\s*", "", line)
+        clean_header = re.sub(r"sample=\d+,?\s*", "", clean_header)
+        clean_header = clean_header.lstrip("> ").strip()
+
+        # Add new standardized header: samples renumbered from 1, T moved to the end
+        new_header = f">sample_{sample_counter}, {clean_header}"
+        if t_match:
+            new_header += f", T={t_match.group(1)}"
+        cleaned.append(new_header)
+        sample_counter += 1
+
+    with open(output_fasta, "w") as out:
+        out.writelines(line + "\n" for line in cleaned)
+
+
 def fetch_polypeptide_sequences(pdb_path):
     """
     Fetches the polypeptide sequences from a PDB file.