@@ -1188,6 +1188,61 @@ def run_netmhcpan_parallel(peptide_fasta, allele_list, output, mhc_type,
11881188
11891189###################################################################
11901190
def clean_fasta_headers(input_fasta, output_fasta):
    """
    Clean FASTA headers by standardizing sample numbering and reorganizing metadata.

    The first header in the file is written unchanged. Every subsequent
    header is rewritten so that it starts with a sequential ``sample_N``
    label (N counts from 1 in file order; the original ``sample=`` value is
    discarded), followed by the remaining metadata, with the ``T=`` field
    (if present) moved to the end. Non-header lines are copied through with
    surrounding whitespace stripped.

    Parameters:
    -----------
    input_fasta : str
        Path to input FASTA file
    output_fasta : str
        Path to output cleaned FASTA file. Must differ from ``input_fasta``:
        the output is opened for writing (and therefore truncated) before
        the input is read.

    Example:
    --------
    First header: unchanged
    Subsequent headers:
        Input:  >T=1.5, sample=1, score=1.8596, global_score=1.7745, seq_recovery=0.5000
        Output: >sample_1, score=1.8596, global_score=1.7745, seq_recovery=0.5000, T=1.5
    """
    # Compiled once, outside the per-line loop. The leading \b stops the
    # patterns from matching keys that merely end in "T" or "sample"
    # (e.g. "WT=..." or "subsample=...").
    t_field = re.compile(r"\bT=([\d.]+),?\s*")
    sample_field = re.compile(r"\bsample=\d+,?\s*")

    headers_seen = 0
    with open(input_fasta) as f, open(output_fasta, "w") as out:
        for line in f:
            line = line.strip()

            # Sequence (or blank) line: copy through as-is.
            if not line.startswith(">"):
                out.write(line + "\n")
                continue

            headers_seen += 1

            # Keep first header unchanged.
            if headers_seen == 1:
                out.write(line + "\n")
                continue

            # Remember the T value before the field is removed.
            t_match = t_field.search(line)

            # Drop the "T=..." and "sample=..." fields, then the leading ">"
            # and any separator left dangling when a removed field was the
            # last one in the header.
            remainder = sample_field.sub("", t_field.sub("", line))
            remainder = remainder.lstrip("> ").strip().rstrip(", ")

            # Assemble only the non-empty pieces so that a header with no
            # other metadata does not produce an empty ", ," segment.
            parts = [f"sample_{headers_seen - 1}"]
            if remainder:
                parts.append(remainder)
            if t_match:
                parts.append(f"T={t_match.group(1)}")

            out.write(">" + ", ".join(parts) + "\n")
1244+
1245+
11911246def fetch_polypeptide_sequences (pdb_path ):
11921247 """
11931248 Fetches the polypeptide sequences from a PDB file.
0 commit comments