diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 05a53ea..8ee4b60 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -45,7 +45,7 @@ jobs:
   tests:
     strategy:
       matrix:
-        rust-version: [1.62.1, stable]
+        rust-version: [stable]
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..aaacd7e
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,135 @@
+# This file is autogenerated by maturin v1.8.2
+# To update, run
+#
+# maturin generate-ci github
+#
+name: build wheels
+on:
+  push:
+    branches:
+      - main
+      - master
+    tags:
+      - "*"
+  pull_request:
+  workflow_dispatch:
+  release:
+    types: [published]
+
+jobs:
+  linux:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: ubuntu-22.04
+            target: x86_64
+        python-version:
+          - "3.11"
+          - "3.12"
+          - "3.13"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
+          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
+          manylinux: auto
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python-version }}
+          path: dist
+
+  macos:
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        platform:
+          - runner: macos-13
+            target: x86_64
+          - runner: macos-14
+            target: aarch64
+        python-version:
+          - "3.11"
+          - "3.12"
+          - "3.13"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          target: ${{ matrix.platform.target }}
+          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
+          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
+          manylinux: auto
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python-version }}
+          path: dist
+
+  sdist:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build sdist
+        uses: PyO3/maturin-action@v1
+        with:
+          command: sdist
+          args: --out dist
+      - name: Upload sdist
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-sdist
+          path: dist
+
+  upload-release:
+    name: Upload release artifacts to GitHub
+    needs: [linux, macos, sdist]
+    runs-on: ubuntu-latest
+    if: github.event.release.tag_name
+    steps:
+      - name: Download wheel artifacts
+        uses: actions/download-artifact@v4
+
+      - name: List downloaded wheels
+        run: ls -l wheels-*/*
+
+      - name: Upload Wheels to Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          repo_token: ${{ secrets.GITHUB_TOKEN }}
+          file: wheels-*/**
+          tag: ${{ github.event.release.tag_name }}
+          file_glob: true
+
+  release:
+    name: Release to PyPI
+    runs-on: ubuntu-latest
+    if: github.event.release.tag_name
+    needs: [linux, macos, sdist]
+    permissions:
+      # Use to sign the release artifacts
+      id-token: write
+      # Used to upload release artifacts
+      contents: write
+      # Used to generate artifact attestation
+      attestations: write
+    steps:
+      - uses: actions/download-artifact@v4
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-path: "wheels-*/**"
+
+      - name: Publish to PyPI
+        if: ${{ startsWith(github.ref, 'refs/tags/') }}
+        uses: PyO3/maturin-action@v1
+        env:
+          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+        with:
+          command: upload
+          args: --non-interactive --skip-existing wheels-*/**.whl wheels-*/**.tar.gz
diff --git a/Cargo.lock b/Cargo.lock
index 35c035a..cecda67 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "adler"
@@ -262,7 +262,7 @@ dependencies = [
 
 [[package]]
 name = "finch"
-version = "0.6.1"
+version = "0.6.2"
 dependencies = [
  "bincode",
  "capnp",
diff --git a/README.md b/README.md
index ff92c60..8b88208 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,26 @@ This repository provides a library and command-line interface that reimplements
 
 You may build Finch from source, which requires Rust >= `1.49`. Rust's Cargo package manager (see [rustup](https://www.rustup.rs) for Cargo installation instructions) can automatically build and install Finch with `cargo install finch_cli`. If you require python bindings, you must take extra steps (see [python support](#python-support)). Alternatively, [download a prebuilt binary](https://github.com/onecodex/finch-rs/releases) or install from [PyPi](https://pypi.org/project/finch-sketch/) `pip install finch-sketch`.
 
+### Development ###
+
+To build wheels locally, run:
+
+```sh
+uv venv --python 3.11
+source .venv/bin/activate
+uv build
+
+# or, using maturin by itself:
+
+maturin build --features python --release --strip
+```
+
+#### Building binary wheels and pushing to PyPI ####
+
+There is a GitHub Workflow that builds Python wheels for macOS (x86 and
+ARM) and Ubuntu (x86). To run it, create a new release.
+
+
 ### Example Usage ###
 
 To get started, we first compute sketches for several FASTA or FASTQ files. These sketches are compact, sampled representations of the underlying genomic data, and what allow `finch` to rapidly estimate distances between datasets. Sketching files uses the `finch sketch` command:
@@ -206,7 +226,7 @@ cont, jacc = sketch_one.compare(sketch_two)
 
 ## Cap'n Proto
 
-There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
+There is a `finch.capnp` file in `src/serialization`; it is the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp) plus the changes by @bovee in https://github.com/bovee/Mash/blob/master/src/mash/capnp/MinHash.capnp
 
 Both are generated after installing `capnp` and `cargo install capnpc` with the following command:
 
diff --git a/cli/src/main.rs b/cli/src/main.rs
index fff5832..78ba197 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -413,7 +413,7 @@ pub fn update_sketch_params(
     if let Some(new_scale_num) = new_scale {
         if matches.occurrences_of("scale") == 0 {
             *scale = new_scale_num;
-        } else if (*scale - new_scale_num).abs() < std::f64::EPSILON {
+        } else if (*scale - new_scale_num).abs() < f64::EPSILON {
             // TODO: maybe this should have a slightly larger delta?
             bail!(
                 "Specified scale {} does not match {} from sketch {}",
diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index bb73efa..bae5006 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "finch"
-version = "0.6.1"
+version = "0.6.2"
 authors = ["One Codex "]
 description = "An implementation of min-wise independent permutation locality sensitive hashing ('MinHashing') for genomic data and command-line utility for manipulation."
 keywords = ["minhash", "bioinformatics", "sketches"]
diff --git a/lib/src/distance.rs b/lib/src/distance.rs
index eddcebe..662a679 100644
--- a/lib/src/distance.rs
+++ b/lib/src/distance.rs
@@ -97,7 +97,7 @@ pub fn raw_distance(
     // at this point we've exhausted one of the two sketches, but we may have
     // more counts in the other to compare if these were scaled sketches
     if scale > 0. {
-        let max_hash = u64::max_value() / scale.recip() as u64;
+        let max_hash = u64::MAX / scale.recip() as u64;
         while query_hashes
             .get(i)
             .map(|kmer_count| kmer_count.hash < max_hash)
@@ -125,6 +125,37 @@ pub fn raw_distance(
     (containment, jaccard, common, total)
 }
 
+/// This computes set statistics from one set of hashes to another.
+///
+/// Every hash in the reference set is considered while only those hashes in the
+/// query set that are in the same range as the reference set are compared. This
+/// should be a more accurate representation of the query set's containment in
+/// the reference set because we consider all of the reference set. In
+/// practice, there may be issues especially if the query is sketched to a
+/// different effective scale than the reference.
+pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
+    let mut i: usize = 0;
+    let mut common: u64 = 0;
+    let mut total: u64 = 0;
+
+    for ref_hash in ref_sketch {
+        while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
+            i += 1;
+        }
+
+        if query_sketch[i].hash == ref_hash.hash {
+            common += 1;
+        }
+
+        total += 1;
+    }
+
+    // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
+    let containment: f64 = common as f64 / total as f64;
+    let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
+    (containment, jaccard, common, total)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -306,37 +337,6 @@ mod tests {
     }
 }
 
-/// This computes set statistics from one set of hashes to another.
-///
-/// Every hash in the reference set is considered while only those hashes in the
-/// query set that are in the same range as the reference set are compared. This
-/// should be a more accurate representation of the query set's containment in
-/// the reference set because we consider all of the reference set. In
-/// practice, there may be issues especially if the query is sketched to a
-/// different effective scale than the reference.
-pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
-    let mut i: usize = 0;
-    let mut common: u64 = 0;
-    let mut total: u64 = 0;
-
-    for ref_hash in ref_sketch {
-        while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
-            i += 1;
-        }
-
-        if query_sketch[i].hash == ref_hash.hash {
-            common += 1;
-        }
-
-        total += 1;
-    }
-
-    // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
-    let containment: f64 = common as f64 / total as f64;
-    let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
-    (containment, jaccard, common, total)
-}
-
 // TODO: add another method like this to allow 0's in ref sketch for hashes present in sketches?
 // TODO: maybe we want to do NNLS on these matrices in Rust? for example code, see:
 // https://github.com/igmanthony/fnnls/blob/master/src/fnnls.rs
diff --git a/lib/src/filtering.rs b/lib/src/filtering.rs
index bd2bbd7..58ffd08 100644
--- a/lib/src/filtering.rs
+++ b/lib/src/filtering.rs
@@ -31,11 +31,7 @@ impl FilterParams {
                 Some(u32::max(l, sketch.filter_params.abun_filter.0.unwrap_or(0))),
                 Some(u32::min(
                     h,
-                    sketch
-                        .filter_params
-                        .abun_filter
-                        .1
-                        .unwrap_or(u32::max_value()),
+                    sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
                 )),
             ),
             (Some(l), None) => (
@@ -46,11 +42,7 @@ impl FilterParams {
                 None,
                 Some(u32::min(
                     h,
-                    sketch
-                        .filter_params
-                        .abun_filter
-                        .1
-                        .unwrap_or(u32::max_value()),
+                    sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
                 )),
             ),
             (None, None) => (None, None),
@@ -341,7 +333,7 @@ pub fn filter_abundance(
 ) -> Vec<KmerCount> {
     let mut filtered = Vec::new();
     let lo_threshold = low.unwrap_or(0u32);
-    let hi_threshold = high.unwrap_or(u32::max_value());
+    let hi_threshold = high.unwrap_or(u32::MAX);
     for kmer in sketch {
         if lo_threshold <= kmer.count && kmer.count <= hi_threshold {
             filtered.push(kmer.clone());
diff --git a/lib/src/serialization/json.rs b/lib/src/serialization/json.rs
index 56fcf89..3417d6d 100644
--- a/lib/src/serialization/json.rs
+++ b/lib/src/serialization/json.rs
@@ -247,7 +247,7 @@ impl<'de> Deserialize<'de> for QuotedU64 {
     {
         struct QuotedU64Visitor;
 
-        impl<'de> Visitor<'de> for QuotedU64Visitor {
+        impl Visitor<'_> for QuotedU64Visitor {
             type Value = QuotedU64;
 
             fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
diff --git a/lib/src/serialization/mod.rs b/lib/src/serialization/mod.rs
index 19ccb37..d6b94f0 100644
--- a/lib/src/serialization/mod.rs
+++ b/lib/src/serialization/mod.rs
@@ -151,13 +151,8 @@ pub fn write_finch_file(file: &mut dyn Write, sketches: &[Sketch]) -> FinchResul
         let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
         cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
         cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
-        cap_filter_params.set_high_abun_filter(
-            sketch
-                .filter_params
-                .abun_filter
-                .1
-                .unwrap_or(::std::u32::MAX),
-        );
+        cap_filter_params
+            .set_high_abun_filter(sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX));
         cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
         cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);
 
diff --git a/lib/src/sketch_schemes/mash.rs b/lib/src/sketch_schemes/mash.rs
index cfaab4c..67e9d5f 100644
--- a/lib/src/sketch_schemes/mash.rs
+++ b/lib/src/sketch_schemes/mash.rs
@@ -1,6 +1,5 @@
 use std::collections::{BinaryHeap, HashMap};
 use std::hash::BuildHasherDefault;
-use std::usize;
 
 use needletail::Sequence;
 
diff --git a/lib/src/sketch_schemes/scaled.rs b/lib/src/sketch_schemes/scaled.rs
index e1d823a..d0238a3 100644
--- a/lib/src/sketch_schemes/scaled.rs
+++ b/lib/src/sketch_schemes/scaled.rs
@@ -1,6 +1,5 @@
 use std::collections::{BinaryHeap, HashMap};
 use std::hash::BuildHasherDefault;
-use std::usize;
 
 use needletail::Sequence;
 
@@ -29,7 +28,7 @@ impl ScaledSketcher {
             total_kmers: 0,
             total_bases: 0,
             size,
-            max_hash: u64::max_value() / iscale,
+            max_hash: u64::MAX / iscale,
             seed,
         }
     }
diff --git a/lib/src/statistics.rs b/lib/src/statistics.rs
index 9c00e52..ff91b91 100644
--- a/lib/src/statistics.rs
+++ b/lib/src/statistics.rs
@@ -16,8 +16,10 @@ pub fn cardinality(sketch: &[KmerCount]) -> Result {
     if sketch.is_empty() {
         return Ok(0u64);
     }
-    Ok(((sketch.len() - 1) as f32
-        / (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
+    Ok(
+        ((sketch.len() - 1) as f32 / (sketch.last().unwrap().hash as f32 / usize::MAX as f32))
+            as u64,
+    )
 }
 
 /// Generates a Vec of numbers of kmers for each coverage level
diff --git a/pyproject.toml b/pyproject.toml
index 59b7372..19e20a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,12 +1,20 @@
 [build-system]
-requires = ["maturin>=0.14,<0.15"]
+requires = ["maturin>=1.0,<2.0"]
 build-backend = "maturin"
 
 [project]
-name = "finch"
+name = "finch-sketch"
 maintainer = "One Codex"
 classifier = [
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",
-    "Topic :: Scientific/Engineering :: Bio-Informatics"
-]
\ No newline at end of file
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dynamic = ["version"]
+readme = "README.md"
+
+[tool.maturin]
+manifest-path = "lib/Cargo.toml"
+bindings = "pyo3"
+features = ["python"]
+exclude = ["lib/README.md"]