Skip to content

Commit e1cc39e

Browse files
committed
clean_fasta_headers added for lower versions of netmhcpan
1 parent 1df42c3 commit e1cc39e

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

run_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,7 @@ def __binder_pred(self):
747747
output_dir = os.path.join(self.output_dir, 'peptide_design')
748748
peptide_fasta_file = [i for i in os.listdir(output_dir+'/'+'seqs') if i.endswith('.fa')][0]
749749
peptide_fasta_file = os.path.join(output_dir+'/'+'seqs', peptide_fasta_file)
750+
processing_functions.clean_fasta_headers(peptide_fasta_file, peptide_fasta_file)
750751
mhc_type = 2 if len(self.chain_dict_dist.keys()) == 2 else 1
751752
mhc_seq_dict = processing_functions.fetch_polypeptide_sequences(self.multichain_pdb)
752753
mhc_seq_list = [mhc_seq_dict['A'], mhc_seq_dict['B']] if mhc_type==2 else [mhc_seq_dict['A']]

utils/processing_functions.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,6 +1188,61 @@ def run_netmhcpan_parallel(peptide_fasta, allele_list, output, mhc_type,
11881188

11891189
###################################################################
11901190

1191+
def clean_fasta_headers(input_fasta, output_fasta):
    """
    Clean FASTA headers by standardizing sample numbering and reorganizing metadata.

    The first header in the file is kept unchanged. Every subsequent header has
    its ``T=...`` and ``sample=...`` fields removed, is prefixed with a
    sequential ``sample_<n>`` identifier (n starts at 1), and gets the original
    ``T`` value, if one was present, appended as the last field. Sequence lines
    (and blank lines) are written back with surrounding whitespace stripped.

    ``input_fasta`` and ``output_fasta`` may be the same path: the input is read
    completely before the output is opened for writing.

    Parameters:
    -----------
    input_fasta : str
        Path to input FASTA file
    output_fasta : str
        Path to output cleaned FASTA file (may equal ``input_fasta``)

    Example:
    --------
    First header: unchanged
    Subsequent headers:
        Input:  >T=1.5, sample=1, score=1.8596, global_score=1.7745, seq_recovery=0.5000
        Output: >sample_1, score=1.8596, global_score=1.7745, seq_recovery=0.5000, T=1.5
    """
    # Anchor on a word boundary so only the standalone "T" / "sample" keys match,
    # not keys that merely end with those characters.
    t_field = re.compile(r"\bT=([\d\.]+),?\s*")
    sample_field = re.compile(r"\bsample=\d+,?\s*")

    # Read everything BEFORE opening the output: opening with "w" truncates the
    # file immediately, which would wipe the input when both paths are the same.
    with open(input_fasta) as f:
        lines = [line.strip() for line in f]

    cleaned = []
    sample_counter = 0
    first_header_seen = False
    for line in lines:
        if not line.startswith(">"):
            # Sequence line
            cleaned.append(line)
            continue

        if not first_header_seen:
            # Keep first header unchanged
            first_header_seen = True
            cleaned.append(line)
            continue

        sample_counter += 1
        t_match = t_field.search(line)

        # Remove "T=..." and "sample=..." parts, then the leading ">" and any
        # separator left dangling when a removed field was the last one.
        remainder = sample_field.sub("", t_field.sub("", line))
        remainder = remainder.lstrip("> ").strip().rstrip(",").strip()

        # Assemble from non-empty parts so no empty ", ," fields are emitted.
        parts = [f"sample_{sample_counter}"]
        if remainder:
            parts.append(remainder)
        if t_match is not None:
            parts.append(f"T={t_match.group(1)}")
        cleaned.append(">" + ", ".join(parts))

    with open(output_fasta, "w") as out:
        out.writelines(f"{line}\n" for line in cleaned)
1244+
1245+
11911246
def fetch_polypeptide_sequences(pdb_path):
11921247
"""
11931248
Fetches the polypeptide sequences from a PDB file.

0 commit comments

Comments
 (0)