1- import re
2- import os
3- import gzip
4- import warnings
5- import pandas as pd
6-
7- from glob import glob
1+ from collections import Counter , defaultdict
82from datetime import datetime
9- from string import ascii_letters , digits
3+ from glob import glob
104from metapool .mp_strings import get_short_name_and_id
115from metapool .plate import PlateReplication
12- from collections import Counter
6+ from os import sep , listdir
7+ from os .path import (basename , isdir , join , split , abspath , exists ,
8+ normpath )
139from string import ascii_letters , digits
14- from os import sep
15- from os .path import join , split , abspath , exists
16- from collections import defaultdict
10+ from gzip import open as gz_open
11+ import pandas as pd
12+ import re
13+ import warnings
1714
1815
1916REQUIRED_MF_COLUMNS = {'sample_name' , 'barcode' , 'primer' , 'primer_plate' ,
@@ -139,7 +136,7 @@ def parse_illumina_run_id(run_id):
139136
140137def is_nonempty_gz_file (name ):
141138 """Taken from https://stackoverflow.com/a/37878550/379593"""
142- with gzip . open (name , 'rb' ) as f :
139+ with gz_open (name , 'rb' ) as f :
143140 try :
144141 file_content = f .read (1 )
145142 return len (file_content ) > 0
@@ -171,11 +168,11 @@ def get_run_prefix(run_path, project, sample_id, lane):
171168 The run prefix of the sequence file in the lane, only if the sequence
172169 file is not empty.
173170 """
174- base = os . path . join (run_path , project )
171+ base = join (run_path , project )
175172 path = base
176173
177- qc = os . path . join (base , 'trimmed_sequences' )
178- hf = os . path . join (base , 'filtered_sequences' )
174+ qc = join (base , 'trimmed_sequences' )
175+ hf = join (base , 'filtered_sequences' )
179176
180177 if _exists_and_has_files (qc ) and _exists_and_has_files (hf ):
181178 path = hf
@@ -188,10 +185,10 @@ def get_run_prefix(run_path, project, sample_id, lane):
188185
189186 search_me = '%s_S*_L*%s_R*.fastq.gz' % (sample_id , lane )
190187
191- results = glob (os . path . join (path , search_me ))
188+ results = glob (join (path , search_me ))
192189
193190 with open ('found_files.log' , 'a' ) as f :
194- f .write ("SEARCHING: %s\n " % os . path . join (path , "FFFF" , search_me ))
191+ f .write ("SEARCHING: %s\n " % join (path , "FFFF" , search_me ))
195192 for item in results :
196193 f .write ("%s\n " % item )
197194 f .write ("\n " )
@@ -200,7 +197,7 @@ def get_run_prefix(run_path, project, sample_id, lane):
200197 if len (results ) == 2 :
201198 forward , reverse = sorted (results )
202199 if is_nonempty_gz_file (forward ) and is_nonempty_gz_file (reverse ):
203- f , r = os . path . basename (forward ), os . path . basename (reverse )
200+ f , r = basename (forward ), basename (reverse )
204201 if len (f ) != len (r ):
205202 raise ValueError ("Forward and reverse sequences filenames "
206203 "don't match f:%s r:%s" % (f , r ))
@@ -228,15 +225,15 @@ def get_run_prefix(run_path, project, sample_id, lane):
228225
229226
230227def get_run_prefix_mf (run_path , project ):
231- search_path = os . path . join (run_path , project , 'amplicon' ,
232- '*_SMPL1_S*R?_*.fastq.gz' )
228+ search_path = join (run_path , project , 'amplicon' ,
229+ '*_SMPL1_S*R?_*.fastq.gz' )
233230 results = glob (search_path )
234231
235232 # at this stage there should only be two files forward and reverse
236233 if len (results ) == 2 :
237234 forward , reverse = sorted (results )
238235 if is_nonempty_gz_file (forward ) and is_nonempty_gz_file (reverse ):
239- f , r = os . path . basename (forward ), os . path . basename (reverse )
236+ f , r = basename (forward ), basename (reverse )
240237 if len (f ) != len (r ):
241238 raise ValueError ("Forward and reverse sequences filenames "
242239 "don't match f:%s r:%s" % (f , r ))
@@ -263,12 +260,12 @@ def get_run_prefix_mf(run_path, project):
263260
264261
265262def _file_list (path ):
266- return [f for f in os . listdir (path )
267- if not os . path . isdir (os . path . join (path , f ))]
263+ return [f for f in listdir (path )
264+ if not isdir (join (path , f ))]
268265
269266
270267def _exists_and_has_files (path ):
271- return os . path . exists (path ) and len (_file_list (path ))
268+ return exists (path ) and len (_file_list (path ))
272269
273270
274271def get_machine_code (instrument_model ):
@@ -459,7 +456,7 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
459456 Dictionary keyed by run identifier, project name and lane. Values are
460457 preparations represented as DataFrames.
461458 """
462- _ , run_id = os . path . split (os . path . normpath (run_path ))
459+ _ , run_id = split (normpath (run_path ))
463460 run_date , instrument_code = parse_illumina_run_id (run_id )
464461 instrument_model , run_center = get_model_and_center (instrument_code )
465462
@@ -509,8 +506,6 @@ def log_me(msg):
509506
510507 all_columns = sorted (carried_prep_columns + generated_prep_columns )
511508
512- from json import dumps
513-
514509 for project , project_sheet in sheet .groupby ('sample_project' ):
515510 project_name , qiita_id = get_short_name_and_id (project )
516511
@@ -1076,7 +1071,7 @@ def _find_filtered_files(fp):
10761071 tmp = fastq_fp .replace (fp , '' )
10771072 # remove any leading and/or trailing '/' characters from the
10781073 # remaining path.
1079- # use os. sep instead of '/' to be more platform independent.
1074+ # use sep instead of '/' to be more platform independent.
10801075 tmp = tmp .strip (sep )
10811076 tmp = tmp .split (sep )
10821077
@@ -1124,5 +1119,3 @@ def _foo_get_run_prefix(file_name):
11241119
11251120 # if no orientations were found, then return None.
11261121 return None if pos == - 1 else file_name [0 :pos ]
1127-
1128-
0 commit comments