Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
tests:
strategy:
matrix:
rust-version: [1.62.1, stable]
rust-version: [stable]
runs-on: ubuntu-latest
steps:
- name: Checkout
Expand Down
135 changes: 135 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# This file is autogenerated by maturin v1.8.2
# To update, run
#
#    maturin generate-ci github
#
name: build wheels
on:
  push:
    branches:
      - main
      - master
    tags:
      - "*"
  pull_request:
  workflow_dispatch:
  release:
    types: [published]

jobs:
  linux:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: ubuntu-22.04
            target: x86_64
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
          manylinux: auto
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python-version }}
          path: dist

  macos:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: macos-13
            target: x86_64
          - runner: macos-14
            target: aarch64
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
          manylinux: auto
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python-version }}
          path: dist

  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist
      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: dist

  upload-release:
    name: Upload release artifacts to GitHub
    needs: [linux, macos, sdist]
    runs-on: ubuntu-latest
    if: github.event.release.tag_name
    steps:
      - name: Download wheel artifacts
        uses: actions/download-artifact@v4

      - name: List downloaded wheels
        run: ls -l wheels-*/*

      - name: Upload Wheels to Release
        uses: svenstaro/upload-release-action@v2
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: wheels-*/**
          tag: ${{ github.event.release.tag_name }}
          file_glob: true

  release:
    name: Release to PyPI
    runs-on: ubuntu-latest
    if: github.event.release.tag_name
    needs: [linux, macos, sdist]
    permissions:
      # Use to sign the release artifacts
      id-token: write
      # Used to upload release artifacts
      contents: write
      # Used to generate artifact attestation
      attestations: write
    steps:
      - uses: actions/download-artifact@v4
      - name: Generate artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-path: "wheels-*/**"

      - name: Publish to PyPI
        if: ${{ startsWith(github.ref, 'refs/tags/') }}
        uses: PyO3/maturin-action@v1
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
        with:
          command: upload
          args: --non-interactive --skip-existing wheels-*/**.whl wheels-*/**.tar.gz
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,26 @@ This repository provides a library and command-line interface that reimplements
You may build Finch from source, which requires Rust >= `1.49`. Rust's Cargo package manager (see [rustup](https://www.rustup.rs) for Cargo installation instructions) can automatically build and install Finch with `cargo install finch_cli`.
If you require Python bindings, you must take extra steps (see [python support](#python-support)). Alternatively, [download a prebuilt binary](https://github.com/onecodex/finch-rs/releases) or install from [PyPI](https://pypi.org/project/finch-sketch/) `pip install finch-sketch`.

### Development ###

To build wheels locally, run:

```sh
uv venv --python 3.11
source .venv/bin/activate
uv build

# or, using maturin by itself:

maturin build --features python --release --strip
```

#### Building binary wheels and pushing to PyPI

There is a GitHub workflow that will build Python wheels for macOS (x86 and
ARM) and Ubuntu (x86). To run it, create a new release.


### Example Usage ###
To get started, we first compute sketches for several FASTA or FASTQ files. These sketches are compact, sampled representations of the underlying genomic data, and what allow `finch` to rapidly estimate distances between datasets. Sketching files uses the `finch sketch` command:

Expand Down Expand Up @@ -206,7 +226,7 @@ cont, jacc = sketch_one.compare(sketch_two)

## Cap'n Proto

There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
+ the changes by @bovee in https://github.com/bovee/Mash/blob/master/src/mash/capnp/MinHash.capnp

Both are generated after installing `capnp` and `cargo install capnpc` with the following command:
Expand Down
2 changes: 1 addition & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ pub fn update_sketch_params(
if let Some(new_scale_num) = new_scale {
if matches.occurrences_of("scale") == 0 {
*scale = new_scale_num;
} else if (*scale - new_scale_num).abs() < std::f64::EPSILON {
} else if (*scale - new_scale_num).abs() < f64::EPSILON {
// TODO: maybe this should have a slightly larger delta?
bail!(
"Specified scale {} does not match {} from sketch {}",
Expand Down
2 changes: 1 addition & 1 deletion lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "finch"
version = "0.6.1"
version = "0.6.2"
authors = ["One Codex <vincent@onecodex.com>"]
description = "An implementation of min-wise independent permutation locality sensitive hashing ('MinHashing') for genomic data and command-line utility for manipulation."
keywords = ["minhash", "bioinformatics", "sketches"]
Expand Down
64 changes: 32 additions & 32 deletions lib/src/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ pub fn raw_distance(
// at this point we've exhausted one of the two sketches, but we may have
// more counts in the other to compare if these were scaled sketches
if scale > 0. {
let max_hash = u64::max_value() / scale.recip() as u64;
let max_hash = u64::MAX / scale.recip() as u64;
while query_hashes
.get(i)
.map(|kmer_count| kmer_count.hash < max_hash)
Expand Down Expand Up @@ -125,6 +125,37 @@ pub fn raw_distance(
(containment, jaccard, common, total)
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
///
/// Returns `(containment, jaccard, common, total)` where `total` is the number
/// of reference hashes considered and `common` is the count of exact hash
/// matches between the two (sorted) sketches.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
    // Guard against empty inputs: the unguarded loop below would panic
    // indexing `query_sketch[0]` (and `query_sketch.len() - 1` would
    // underflow) when the query sketch is empty; an empty reference would
    // yield a 0/0 = NaN containment. Nothing can be in common either way.
    if query_sketch.is_empty() || ref_sketch.is_empty() {
        return (0., 0., 0, ref_sketch.len() as u64);
    }

    let mut i: usize = 0;
    let mut common: u64 = 0;
    let mut total: u64 = 0;

    for ref_hash in ref_sketch {
        // Advance the query cursor up to (but never past) the current
        // reference hash; bounds are checked before indexing.
        while (i < query_sketch.len() - 1) && (query_sketch[i].hash < ref_hash.hash) {
            i += 1;
        }

        if query_sketch[i].hash == ref_hash.hash {
            common += 1;
        }

        total += 1;
    }

    // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
    let containment: f64 = common as f64 / total as f64;
    let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
    (containment, jaccard, common, total)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -306,37 +337,6 @@ mod tests {
}
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
let mut i: usize = 0;
let mut common: u64 = 0;
let mut total: u64 = 0;

for ref_hash in ref_sketch {
while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
i += 1;
}

if query_sketch[i].hash == ref_hash.hash {
common += 1;
}

total += 1;
}

// Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
let containment: f64 = common as f64 / total as f64;
let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
(containment, jaccard, common, total)
}

// TODO: add another method like this to allow 0's in ref sketch for hashes present in sketches?
// TODO: maybe we want to do NNLS on these matrices in Rust? for example code, see:
// https://github.com/igmanthony/fnnls/blob/master/src/fnnls.rs
Expand Down
14 changes: 3 additions & 11 deletions lib/src/filtering.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ impl FilterParams {
Some(u32::max(l, sketch.filter_params.abun_filter.0.unwrap_or(0))),
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(Some(l), None) => (
Expand All @@ -46,11 +42,7 @@ impl FilterParams {
None,
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(None, None) => (None, None),
Expand Down Expand Up @@ -341,7 +333,7 @@ pub fn filter_abundance(
) -> Vec<KmerCount> {
let mut filtered = Vec::new();
let lo_threshold = low.unwrap_or(0u32);
let hi_threshold = high.unwrap_or(u32::max_value());
let hi_threshold = high.unwrap_or(u32::MAX);
for kmer in sketch {
if lo_threshold <= kmer.count && kmer.count <= hi_threshold {
filtered.push(kmer.clone());
Expand Down
2 changes: 1 addition & 1 deletion lib/src/serialization/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ impl<'de> Deserialize<'de> for QuotedU64 {
{
struct QuotedU64Visitor;

impl<'de> Visitor<'de> for QuotedU64Visitor {
impl Visitor<'_> for QuotedU64Visitor {
type Value = QuotedU64;

fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
Expand Down
9 changes: 2 additions & 7 deletions lib/src/serialization/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,8 @@ pub fn write_finch_file(file: &mut dyn Write, sketches: &[Sketch]) -> FinchResul
let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
cap_filter_params.set_high_abun_filter(
sketch
.filter_params
.abun_filter
.1
.unwrap_or(::std::u32::MAX),
);
cap_filter_params
.set_high_abun_filter(sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX));
cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);

Expand Down
1 change: 0 additions & 1 deletion lib/src/sketch_schemes/mash.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

Expand Down
3 changes: 1 addition & 2 deletions lib/src/sketch_schemes/scaled.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

Expand Down Expand Up @@ -29,7 +28,7 @@ impl ScaledSketcher {
total_kmers: 0,
total_bases: 0,
size,
max_hash: u64::max_value() / iscale,
max_hash: u64::MAX / iscale,
seed,
}
}
Expand Down
6 changes: 4 additions & 2 deletions lib/src/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ pub fn cardinality(sketch: &[KmerCount]) -> Result<u64, &'static str> {
if sketch.is_empty() {
return Ok(0u64);
}
Ok(((sketch.len() - 1) as f32
/ (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
Ok(
((sketch.len() - 1) as f32 / (sketch.last().unwrap().hash as f32 / usize::MAX as f32))
as u64,
)
}

/// Generates a Vec of numbers of kmers for each coverage level
Expand Down
Loading
Loading