Broken tests reviewed and updated as needed.

charles-cowart · charles-cowart · commit 6dc65c6a8932 · 2025-02-18T17:39:54.000-08:00
diff --git a/metapool/prep.py b/metapool/prep.py
@@ -1,19 +1,16 @@
-import re
-import os
-import gzip
-import warnings
-import pandas as pd
-
-from glob import glob
+from collections import Counter, defaultdict
 from datetime import datetime
-from string import ascii_letters, digits
+from glob import glob
 from metapool.mp_strings import get_short_name_and_id
 from metapool.plate import PlateReplication
-from collections import Counter
+from os import sep, listdir
+from os.path import (basename, isdir, join, split, abspath, exists,
+                     normpath)
 from string import ascii_letters, digits
-from os import sep
-from os.path import join, split, abspath, exists
-from collections import defaultdict
+from gzip import open as gz_open
+import pandas as pd
+import re
+import warnings
 
 
 REQUIRED_MF_COLUMNS = {'sample_name', 'barcode', 'primer', 'primer_plate',
@@ -139,7 +136,7 @@ def parse_illumina_run_id(run_id):
 
 def is_nonempty_gz_file(name):
     """Taken from https://stackoverflow.com/a/37878550/379593"""
-    with gzip.open(name, 'rb') as f:
+    with gz_open(name, 'rb') as f:
         try:
             file_content = f.read(1)
             return len(file_content) > 0
@@ -171,11 +168,11 @@ def get_run_prefix(run_path, project, sample_id, lane):
         The run prefix of the sequence file in the lane, only if the sequence
         file is not empty.
     """
-    base = os.path.join(run_path, project)
+    base = join(run_path, project)
     path = base
 
-    qc = os.path.join(base, 'trimmed_sequences')
-    hf = os.path.join(base, 'filtered_sequences')
+    qc = join(base, 'trimmed_sequences')
+    hf = join(base, 'filtered_sequences')
 
     if _exists_and_has_files(qc) and _exists_and_has_files(hf):
         path = hf
@@ -188,10 +185,10 @@ def get_run_prefix(run_path, project, sample_id, lane):
 
     search_me = '%s_S*_L*%s_R*.fastq.gz' % (sample_id, lane)
 
-    results = glob(os.path.join(path, search_me))
+    results = glob(join(path, search_me))
 
     with open('found_files.log', 'a') as f:
-        f.write("SEARCHING: %s\n" % os.path.join(path, "FFFF", search_me))
+        f.write("SEARCHING: %s\n" % join(path, "FFFF", search_me))
         for item in results:
             f.write("%s\n" % item)
         f.write("\n")
@@ -200,7 +197,7 @@ def get_run_prefix(run_path, project, sample_id, lane):
     if len(results) == 2:
         forward, reverse = sorted(results)
         if is_nonempty_gz_file(forward) and is_nonempty_gz_file(reverse):
-            f, r = os.path.basename(forward), os.path.basename(reverse)
+            f, r = basename(forward), basename(reverse)
             if len(f) != len(r):
                 raise ValueError("Forward and reverse sequences filenames "
                                  "don't match f:%s r:%s" % (f, r))
@@ -228,15 +225,15 @@ def get_run_prefix(run_path, project, sample_id, lane):
 
 
 def get_run_prefix_mf(run_path, project):
-    search_path = os.path.join(run_path, project, 'amplicon',
-                               '*_SMPL1_S*R?_*.fastq.gz')
+    search_path = join(run_path, project, 'amplicon',
+                       '*_SMPL1_S*R?_*.fastq.gz')
     results = glob(search_path)
 
     # at this stage there should only be two files forward and reverse
     if len(results) == 2:
         forward, reverse = sorted(results)
         if is_nonempty_gz_file(forward) and is_nonempty_gz_file(reverse):
-            f, r = os.path.basename(forward), os.path.basename(reverse)
+            f, r = basename(forward), basename(reverse)
             if len(f) != len(r):
                 raise ValueError("Forward and reverse sequences filenames "
                                  "don't match f:%s r:%s" % (f, r))
@@ -263,12 +260,12 @@ def get_run_prefix_mf(run_path, project):
 
 
 def _file_list(path):
-    return [f for f in os.listdir(path)
-            if not os.path.isdir(os.path.join(path, f))]
+    return [f for f in listdir(path)
+            if not isdir(join(path, f))]
 
 
 def _exists_and_has_files(path):
-    return os.path.exists(path) and len(_file_list(path))
+    return exists(path) and len(_file_list(path))
 
 
 def get_machine_code(instrument_model):
@@ -459,7 +456,7 @@ def preparations_for_run(run_path, sheet, generated_prep_columns,
         Dictionary keyed by run identifier, project name and lane. Values are
         preparations represented as DataFrames.
     """
-    _, run_id = os.path.split(os.path.normpath(run_path))
+    _, run_id = split(normpath(run_path))
     run_date, instrument_code = parse_illumina_run_id(run_id)
     instrument_model, run_center = get_model_and_center(instrument_code)
 
@@ -509,8 +506,6 @@ def log_me(msg):
 
     all_columns = sorted(carried_prep_columns + generated_prep_columns)
 
-    from json import dumps
-
     for project, project_sheet in sheet.groupby('sample_project'):
         project_name, qiita_id = get_short_name_and_id(project)
 
@@ -1076,7 +1071,7 @@ def _find_filtered_files(fp):
         tmp = fastq_fp.replace(fp, '')
         # remove any leading and/or trailing '/' characters from the
         # remaining path.
-        # use os.sep instead of '/' to be more platform independent.
+        # use os.sep (imported as sep), not '/', to be platform independent.
         tmp = tmp.strip(sep)
         tmp = tmp.split(sep)
 
@@ -1124,5 +1119,3 @@ def _foo_get_run_prefix(file_name):
 
     # if no orientations were found, then return None.
     return None if pos == -1 else file_name[0:pos]
-
-
diff --git a/metapool/scripts/tests/test_seqpro.py b/metapool/scripts/tests/test_seqpro.py
@@ -7,7 +7,6 @@
 from os.path import join, exists
 from subprocess import Popen, PIPE
 import pandas as pd
-import warnings
 from glob import glob
 from os.path import basename
 
@@ -37,7 +36,7 @@ def setUp(self):
         )
 
     def tearDown(self):
-        #rmtree(self.vf_test_dir, ignore_errors=True)
+        rmtree(self.vf_test_dir, ignore_errors=True)
         pass
 
     def test_fastp_run(self):
@@ -173,7 +172,8 @@ def test_fastp_run(self):
                         "raw_reads_r1r2": 2300000,
                         "total_biological_reads_r1r2": 61404.0,
                         "quality_filtered_reads_r1r2": 16.0,
-                        "fraction_passing_quality_filter": 6.956521739130435e-06
+                        "fraction_passing_quality_filter":
+                        6.956521739130435e-06
                       }
                     },
                 "200318_A00953_0082_AH5TWYDSXY.Trojecp_666.1.tsv": {
@@ -391,7 +391,7 @@ def test_fastp_run(self):
             self.assertEqual(sorted([basename(x) for x
                                      in glob("./*.tsv")]), exp_preps)
 
-            for prep, exp_lines in zip(exp_preps, [5,4]):
+            for prep, exp_lines in zip(exp_preps, [5, 4]):
                 with open(prep) as f:
                     lines = f.readlines()
                     lines = [x.strip() for x in lines]
diff --git a/metapool/tests/data/runs/200318_A00953_0082_AH5TWYDSXY/mgv90_test_sheet.csv b/metapool/tests/data/runs/200318_A00953_0082_AH5TWYDSXY/mgv90_test_sheet.csv
@@ -22,11 +22,11 @@ ReverseComplement,0,,,,,,,,,
 Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description
 1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1
 1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2
-3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1
-3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2
-3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5
-3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6
-3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7
+1,sample3,sample3,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s3
+1,sample4,sample4,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s4
+1,sample5,sample5,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5
+1,sample6,sample6,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6
+1,sample7,sample7,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7
 ,,,,,,,,,,
 [Bioinformatics],,,,,,,,,,
 Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,,
diff --git a/metapool/tests/test_count.py b/metapool/tests/test_count.py
@@ -54,15 +54,15 @@ def test_parsefier_multiple_matches_raises(self):
             run = os.path.join(tmp, 'funky-rerun-with-repeated-samples')
             shutil.copytree(self.run_dir, run)
 
-            # sample 3 exists, but not with cell number S458, so this should
+            # sample 5 exists, but not with cell number S458, so this should
             # raise an error because if this happense something else went wrong
             fake = os.path.join(run, 'Trojecp_666', 'json',
-                                'sample3_S458_L003_R1_001.json')
+                                'sample5_S458_L001_R1_001.json')
             with open(fake, 'w') as f:
                 f.write(json.dumps({}))
 
             msg = ('Multiple matches found for the same samples in the same '
-                   'lane, only one match is expected: sample3 in lane 3')
+                   'lane, only one match is expected: sample5 in lane 1')
             with self.assertRaisesRegex(ValueError, msg):
                 _parsefier(run, self.ss, 'json', '.json', 'halloween',
                            lambda x: 1)
@@ -290,12 +290,8 @@ def test_raw_read_counts_malformed_lane(self):
 
     def test_raw_read_counts(self):
         obs = raw_read_counts(self.run_dir, self.ss)
-
         exp = self.stats[['raw_reads_r1r2']] * 2
 
-        exp.to_csv('exp.csv', sep=',', index=True)
-        obs.to_csv('obs.csv', sep=',', index=True)
-
         pd.testing.assert_frame_equal(obs.sort_index(), exp)
 
     def tearDown(self):