Skip to content

Commit 03fbb6c

Browse files
EBI Revio updates and sample sheet validation notebook (#361)
* add simple notebook to validate existing sample sheet file * add functionality to sort plate df in interleaved order * modify revio model_name and platform for EBI requirements per Gail * add unit tests for sample sheet utilities notebook, extend notebook helper to test some notebook contents * make code review fixes
1 parent a709fea commit 03fbb6c

File tree

9 files changed

+632
-3
lines changed

9 files changed

+632
-3
lines changed

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,5 @@ dependencies:
3535
- coverage
3636
- coveralls
3737
# notebook requirements
38+
- ipyfilechooser
3839
- ipykernel

metapool/config/sequencer_types.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,10 @@ NovaSeqXPlus:
120120
# qp-klp comments say RapidRun was HiSeq 2500 (?) which is
121121
# confusing, so not adding it here until/unless we need it.
122122
Revio:
123-
model_name: 'PacBio Revio'
123+
model_name: 'Revio'
124124
machine_prefix: 'r'
125125
profile_name: 'Revio'
126126
# NB: *no* revcomp_samplesheet_i5_index entry because that isn't
127127
# relevant to this technology
128-
platform: "PacBio"
128+
platform: "PacBio_SMRT"
129129
sequencing_method: "single molecule real-time (SMRT) long read sequencing"

metapool/plate.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from datetime import datetime
44
import numpy as np
55
import pandas as pd
6+
import re
67
import warnings
78
from scipy.stats import zscore
89
from sklearn.linear_model import LogisticRegression
@@ -202,6 +203,102 @@ def _well_to_row_and_col(well):
202203
return ord(well[0].upper()) - 64, int(well[1:])
203204

204205

206+
def _parse_and_validate_well_384(well):
207+
"""Validate input 384-well plate well ID and parse into (row, column).
208+
209+
Parameters
210+
----------
211+
well : str
212+
Well identifier like 'A1', 'P24', 'A01', etc.
213+
214+
Returns
215+
-------
216+
tuple
217+
(row_number, column_number) where row A=1, B=2, etc.
218+
219+
Raises
220+
------
221+
ValueError
222+
If well format is invalid or outside 384-well plate bounds.
223+
"""
224+
225+
VALID_384_WELL_ROWS = set('ABCDEFGHIJKLMNOP')
226+
MAX_384_WELL_COL = 24
227+
228+
# Check type and basic format using regex
229+
# Format: single letter A-Pa-p followed by 1-2 digits
230+
if not isinstance(well, str) or not re.match(r'^[A-Pa-p]\d{1,2}$', well):
231+
raise ValueError(
232+
"Well must be a letter A-P followed by a number 1-24, "
233+
"e.g., A1 or A01")
234+
235+
row_letter = well[0].upper()
236+
col_str = well[1:]
237+
238+
# Validate row is A-P
239+
if row_letter not in VALID_384_WELL_ROWS:
240+
raise ValueError(
241+
f"Well row '{row_letter}' is invalid. "
242+
f"Row must be A-P for a 384-well plate.")
243+
244+
# Parse and validate column
245+
col = int(col_str)
246+
if col < 1 or col > MAX_384_WELL_COL:
247+
raise ValueError(
248+
f"Well column {col} is invalid. "
249+
f"Column must be 1-24 for a 384-well plate.")
250+
251+
row = ord(row_letter) - 64
252+
return (row, col)
253+
254+
255+
def sort_by_interleaved_plates(df, well_column):
    """Sort DataFrame rows by interleaved 96-well plate order.

    Sorts wells in the order of four interleaved 96-well plates within a
    384-well plate. The order is: quadrant 1, then quadrant 2, then quadrant 3,
    then quadrant 4. Within each quadrant, wells are sorted by row first,
    then by column.

    Quadrant mapping (based on 384-well position):
    - Quadrant 1: Odd rows (A, C, E, ...) + Odd columns (1, 3, 5, ...)
    - Quadrant 2: Odd rows + Even columns (2, 4, 6, ...)
    - Quadrant 3: Even rows (B, D, F, ...) + Odd columns
    - Quadrant 4: Even rows + Even columns

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing well identifiers. Its index may hold any
        labels (including duplicates); it is not used for the sort.
    well_column : str
        Name of the column containing 384-well IDs.

    Returns
    -------
    pd.DataFrame
        New DataFrame sorted by quadrant, then row, then column, with a
        fresh 0..n-1 index. Rows with identical sort keys keep their
        original relative order.

    Raises
    ------
    ValueError
        If well_column doesn't exist or contains invalid well IDs.
    """
    if well_column not in df.columns:
        raise ValueError(
            f"Column '{well_column}' not found in DataFrame.")

    def _get_sort_key(well):
        row, col = _parse_and_validate_well_384(well)
        # not necessary to validate plate position output since we know the
        # well is a valid one if we got here
        quadrant = int(_plate_position(well))
        return (quadrant, row, col)

    sort_keys = [_get_sort_key(well) for well in df[well_column]]

    # Reorder by *position* rather than by index label: label-based lookup
    # (df.loc) would multiply rows whenever the input index contains
    # duplicate labels. sorted() is stable, so rows with equal keys keep
    # their input order.
    positions = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)
    return df.iloc[positions].reset_index(drop=True)
300+
301+
205302
def _decompress_well(well):
206303
"""Returns a 96 well plate ID from a compressed 384 well plate ID"""
207304

0 commit comments

Comments
 (0)