nf-core · ljwharbers · Oct 7, 2024 · Oct 11, 2024 · Mar 18, 2025 · Jun 9, 2025
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -2,20 +2,32 @@
 
 ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
 
-  > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
+> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
 
-  > Langer BE, Amaral A, Baudement MO, Bonath F, Charles M, Chitneedi PK, Clark EL, Di Tommaso P, Djebali S, Ewels PA, Eynard S, Fellows Yates JA, Fischer D, Floden EW, Foissac S, Gabernet G, Garcia MU, Gillard G, Gundappa MK, Guyomar C, Hakkaart C, Hanssen F, Harrison PW, Hörtenhuber M, Kurylo C, Kühn C, Lagarrigue S, Lallias D, Macqueen DJ, Miller E, Mir-Pedrol J, Moreira GCM, Nahnsen S, Patel H, Peltzer A, Pitel F, Ramayo-Caldas Y, Ribeiro-Dantas MDC, Rocha D, Salavati M, Sokolov A, Espinosa-Carrasco J, Notredame C, Community TN. Empowering bioinformatics communities with Nextflow and nf-core. Genome Biol. 2025 Jul 29;26(1):228. doi: 10.1186/s13059-025-03673-9. PMID: 40731283; PMCID: PMC12309086.
+> Langer BE, Amaral A, Baudement MO, Bonath F, Charles M, Chitneedi PK, Clark EL, Di Tommaso P, Djebali S, Ewels PA, Eynard S, Fellows Yates JA, Fischer D, Floden EW, Foissac S, Gabernet G, Garcia MU, Gillard G, Gundappa MK, Guyomar C, Hakkaart C, Hanssen F, Harrison PW, Hörtenhuber M, Kurylo C, Kühn C, Lagarrigue S, Lallias D, Macqueen DJ, Miller E, Mir-Pedrol J, Moreira GCM, Nahnsen S, Patel H, Peltzer A, Pitel F, Ramayo-Caldas Y, Ribeiro-Dantas MDC, Rocha D, Salavati M, Sokolov A, Espinosa-Carrasco J, Notredame C, Community TN. Empowering bioinformatics communities with Nextflow and nf-core. Genome Biol. 2025 Jul 29;26(1):228. doi: 10.1186/s13059-025-03673-9. PMID: 40731283; PMCID: PMC12309086.
 
 ## [nf-core/scnanoseq](https://doi.org/10.1093/bioinformatics/btaf487)
 
-  > Trull A, Worthey EA, Ianov L. scnanoseq: an nf-core pipeline for Oxford Nanopore single-cell RNA-sequencing.  Bioinformatics. 2025 Sep 1;41(9):btaf487. doi: 10.1093/bioinformatics/btaf487. PMID: 40905625; PMCID: PMC12449243.
+> Trull A, Worthey EA, Ianov L. scnanoseq: an nf-core pipeline for Oxford Nanopore single-cell RNA-sequencing. Bioinformatics. 2025 Sep 1;41(9):btaf487. doi: 10.1093/bioinformatics/btaf487. PMID: 40905625; PMCID: PMC12449243.
 
 ## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/)
 
-  > Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
+> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
 
 ## Pipeline tools
 
+- [SeqKit2](https://pubmed.ncbi.nlm.nih.gov/38898985/)
+
+  > Shen W, Sipos B, Zhao L. SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta. 2024 Apr 5;3(3):e191. doi: 10.1002/imt2.191. PMID: 38898985; PMCID: PMC11183193.
+
+- [Flexiplex](https://pubmed.ncbi.nlm.nih.gov/38379414/)
+
+  > Cheng O, Ling MH, Wang C, Wu S, Ritchie ME, Göke J, Amin N, Davidson NM. Flexiplex: a versatile demultiplexer and search tool for omics data. Bioinformatics. 2024 Mar 4;40(3):btae102. doi: 10.1093/bioinformatics/btae102. PMID: 38379414; PMCID: PMC10914444.
+
+- [Flexiformatter](https://github.com/ljwharbers/flexiformatter)
+
+  > Luuk Harbers. (2025). ljwharbers/flexiformatter: 1.0.6 (1.0.6). Zenodo. https://doi.org/10.5281/zenodo.18098066
+
 - [BLAZE](https://pubmed.ncbi.nlm.nih.gov/37024980/)
 
   > You Y, Prawer YDJ, De Paoli-Iseppi R, Hunt CPJ, Parish CL, Shim H, Clark MB. Identification of cell barcodes from long-read single-cell RNA-seq with BLAZE. Genome Biol. 2023 Apr 6;24(1):66. doi: 10.1186/s13059-023-02907-y. PMID: 37024980; PMCID: PMC10077662.
@@ -40,9 +52,9 @@
 
   > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794.
 
-- [Nanofilt](https://pubmed.ncbi.nlm.nih.gov/29547981/)
+- [Chopper](https://pubmed.ncbi.nlm.nih.gov/37171891/)
 
-  > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics 2018 Aug 1; 34(15):2666-9 doi:10.1093/bioinformatics/bty149. PubMed PMID: 29547981; PubMed Central PMCID: PMC6061794.
+  > De Coster W, Rademakers R. NanoPack2: population-scale evaluation of long-read sequencing data. Bioinformatics. 2023 May 4;39(5):btad311. doi: 10.1093/bioinformatics/btad311. PMID: 37171891; PMCID: PMC10196664.
 
 - [NanoPlot](https://pubmed.ncbi.nlm.nih.gov/29547981/)
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -23,6 +23,12 @@
             },
             "cell_count": {
                 "type": "integer"
+            },
+            "type": {
+                "type": "string",
+                "enum": ["dna", "cdna"],
+                "default": "cdna",
+                "errorMessage": "Type must be either 'dna' or 'cdna'. Default is 'cdna'."
             }
         },
         "required": ["sample", "fastq", "cell_count"]

diff --git a/...s/whitelist/3M-3pgex-may-2023_TRU.txt.zip → ...ts/whitelist/3M-3pgex-may-2023_TRU.txt.gz b/...s/whitelist/3M-3pgex-may-2023_TRU.txt.zip → ...ts/whitelist/3M-3pgex-may-2023_TRU.txt.gz
diff --git a/assets/whitelist/3M-5pgex-jan-2023.txt.zip → assets/whitelist/3M-5pgex-jan-2023.txt.gz b/assets/whitelist/3M-5pgex-jan-2023.txt.zip → assets/whitelist/3M-5pgex-jan-2023.txt.gz
diff --git a/assets/whitelist/3M-february-2018.zip → assets/whitelist/3M-february-2018.txt.gz b/assets/whitelist/3M-february-2018.zip → assets/whitelist/3M-february-2018.txt.gz
diff --git a/assets/whitelist/737K-august-2016.txt.gz b/assets/whitelist/737K-august-2016.txt.gz
diff --git a/assets/whitelist/737K-august-2016.txt.zip b/assets/whitelist/737K-august-2016.txt.zip
diff --git a/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz b/assets/whitelist/cellranger_arc_atac.737K-arc-v1.txt.gz
diff --git a/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz b/assets/whitelist/cellranger_arc_rna.737K-arc-v1.txt.gz
diff --git a/bin/generate_read_counts.sh b/bin/generate_read_counts.sh
@@ -1,19 +1,17 @@
-
 get_fastqc_counts()
 {
     fastqc_file=$1
-    counts=$(unzip -p ${fastqc_file} $(basename ${fastqc_file} .zip)/fastqc_data.txt | \
+    counts=$(unzip -p "${fastqc_file}" "$(basename "${fastqc_file}" .zip)/fastqc_data.txt" | \
         grep 'Total Sequences' | \
         cut -f2 -d$'\t')
-    echo $counts
-
+    echo "$counts"
 }
 
 get_nanoplot_counts()
 {
     nanoplot_file=$1
-    counts=$(grep 'Number of reads' $nanoplot_file | awk '{print $NF}' | cut -f1 -d'.' | sed 's/,//g')
-    echo $counts
+    counts=$(grep 'Number of reads' "$nanoplot_file" | awk '{print $NF}' | cut -f1 -d'.' | sed 's/,//g')
+    echo "$counts"
 }
 
 output=""
@@ -22,27 +20,23 @@ input=""
 while [[ $# -gt 0 ]]
 do
     flag=$1
-
     case "${flag}" in
         --input) input=$2; shift;;
         --output) output=$2; shift;;
-        *) echo "Unknown option $1 ${reset}" && exit 1
+        *) echo "Unknown option $1" && exit 1
     esac
     shift
 done
 
-header=""
-data=""
-
 header="sample,base_fastq_counts,trimmed_read_counts,extracted_read_counts,corrected_read_counts"
-echo "$header" > $output
+echo "$header" > "$output"
 
-for sample_name in $(for file in $(readlink -f $input)/*.tsv; do basename $file; done | cut -f1 -d'.' | sort -u)
-do
-    ###############
-    # INPUT_FILES #
-    ###############
+# Collect sample names from regular files only - the grouped -name tests keep -type f applied to both barcode file patterns
+sample_names=$(find "$input" -type f \( -name "*.corrected_bc_umi.tsv" -o -name "*_known_barcodes.txt" \) | \
+    sed -E 's|.*/||; s/_known_barcodes\.txt$//; s/\.corrected_bc_umi\.tsv$//' | sort -u)
 
+for sample_name in $sample_names
+do
     raw_fastqc="${sample_name}.raw_fastqc.zip"
     raw_nanoplot="${sample_name}.raw_NanoStats.txt"
 
@@ -52,18 +46,18 @@ do
     extract_fastqc="${sample_name}.extracted_fastqc.zip"
     extract_nanoplot="${sample_name}.extracted_NanoStats.txt"
 
-    correct_csv="${sample_name}.corrected_bc_umi.tsv"
-    data="$(basename $sample_name)"
+    corrected_tsv="${sample_name}.corrected_bc_umi.tsv"
+    known_barcodes="${sample_name}_known_barcodes.txt"
+
+    data="$(basename "$sample_name")"
 
     ####################
     # RAW FASTQ COUNTS #
     ####################
-    if [[ -s "$raw_fastqc" ]]
-    then
+    if [[ -s "$raw_fastqc" ]]; then
         fastqc_counts=$(get_fastqc_counts "$raw_fastqc")
         data="$data,$fastqc_counts"
-    elif [[ -s "$raw_nanoplot" ]]
-    then
+    elif [[ -s "$raw_nanoplot" ]]; then
         nanoplot_counts=$(get_nanoplot_counts "$raw_nanoplot")
         data="$data,$nanoplot_counts"
     else
@@ -73,12 +67,10 @@ do
     ###############
     # TRIM COUNTS #
     ###############
-    if [[ -s "$trim_fastqc" ]]
-    then
+    if [[ -s "$trim_fastqc" ]]; then
         trim_counts=$(get_fastqc_counts "$trim_fastqc")
         data="$data,$trim_counts"
-    elif [[ -s "$trim_nanoplot" ]]
-    then
+    elif [[ -s "$trim_nanoplot" ]]; then
         nanoplot_counts=$(get_nanoplot_counts "$trim_nanoplot")
         data="$data,$nanoplot_counts"
     else
@@ -88,12 +80,10 @@ do
     #####################
     # PREEXTRACT COUNTS #
     #####################
-    if [[ -s "$extract_fastqc" ]]
-    then
+    if [[ -s "$extract_fastqc" ]]; then
         extract_counts=$(get_fastqc_counts "$extract_fastqc")
         data="$data,$extract_counts"
-    elif [[ -s "$extract_nanoplot" ]]
-    then
+    elif [[ -s "$extract_nanoplot" ]]; then
         nanoplot_counts=$(get_nanoplot_counts "$extract_nanoplot")
         data="$data,$nanoplot_counts"
     else
@@ -103,12 +93,15 @@ do
     ##################
     # CORRECT COUNTS #
     ##################
-    if [[ -s $correct_csv ]]
-    then
-        correct_counts=$(cut -f6 $correct_csv | awk '{if ($0 != "") {print $0}}' | wc -l)
+    if [[ -s "$known_barcodes" ]]; then
+        correct_sum=$(awk -F'\t' '$2 != "" {sum += $2} END {print sum + 0}' "$known_barcodes") # sum + 0 prints 0 instead of an empty field when no count was read
+        data="$data,$correct_sum"
+    elif [[ -s "$corrected_tsv" ]]; then
+        correct_counts=$(awk -F'\t' '$6 != "" {n++} END {print n + 0}' "$corrected_tsv") # single awk pass counts rows with a non-empty sixth field; avoids the blank-padded wc -l output of BSD userlands
         data="$data,$correct_counts"
     else
         data="$data,"
     fi
-    echo "$data" >> $output
+
+    echo "$data" >> "$output"
 done