2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -45,7 +45,7 @@ jobs:
tests:
strategy:
matrix:
rust-version: [1.62.1, stable]
rust-version: [stable]
runs-on: ubuntu-latest
steps:
- name: Checkout
135 changes: 135 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,135 @@
# This file is autogenerated by maturin v1.8.2
# To update, run
#
# maturin generate-ci github
#
name: build wheels
on:
push:
branches:
- main
- master
tags:
- "*"
pull_request:
workflow_dispatch:
release:
types: [published]

jobs:
linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-22.04
target: x86_64
python-version:
- "3.11"
- "3.12"
- "3.13"
steps:
- uses: actions/checkout@v4
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python-version }}
path: dist

macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
python-version:
- "3.11"
- "3.12"
- "3.13"
steps:
- uses: actions/checkout@v4
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python-version }}
path: dist

sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
args: --out dist
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist

upload-release:
name: Upload release artifacts to GitHub
needs: [linux, macos, sdist]
runs-on: ubuntu-latest
if: github.event.release.tag_name
steps:
- name: Download wheel artifacts
uses: actions/download-artifact@v4

- name: List downloaded wheels
run: ls -l wheels-*/*

- name: Upload Wheels to Release
uses: svenstaro/upload-release-action@v2
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: wheels-*/**
tag: ${{ github.event.release.tag_name }}
file_glob: true

release:
name: Release to PyPI
runs-on: ubuntu-latest
if: github.event.release.tag_name
needs: [linux, macos, sdist]
permissions:
# Use to sign the release artifacts
id-token: write
# Used to upload release artifacts
contents: write
# Used to generate artifact attestation
attestations: write
steps:
- uses: actions/download-artifact@v4
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v1
with:
subject-path: "wheels-*/**"

- name: Publish to PyPI
if: ${{ startsWith(github.ref, 'refs/tags/') }}
uses: PyO3/maturin-action@v1
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
with:
command: upload
args: --non-interactive --skip-existing wheels-*/**.whl wheels-*/**.tar.gz
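
Note that the `release` job above reads `secrets.PYPI_API_TOKEN`, so that repository secret has to exist before publishing can work. One way to set it, assuming the GitHub CLI is available (an illustrative sketch, not part of this PR):

```sh
# store the PyPI API token as a repository secret; gh prompts for the value
gh secret set PYPI_API_TOKEN
```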
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

22 changes: 21 additions & 1 deletion README.md
@@ -12,6 +12,26 @@ This repository provides a library and command-line interface that reimplements
You may build Finch from source, which requires Rust >= `1.49`. Rust's Cargo package manager (see [rustup](https://www.rustup.rs) for Cargo installation instructions) can automatically build and install Finch with `cargo install finch_cli`.
If you require Python bindings, you must take extra steps (see [python support](#python-support)). Alternatively, [download a prebuilt binary](https://github.com/onecodex/finch-rs/releases) or install from [PyPI](https://pypi.org/project/finch-sketch/) with `pip install finch-sketch`.

### Development ###

To build wheels locally, run:

```sh
uv venv --python 3.11
source .venv/bin/activate
uv build

# or, using maturin by itself:

maturin build --features python --release --strip
```
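
To smoke-test a locally built wheel, install it into the same virtualenv. This is a minimal sketch: `dist/` is where `uv build` writes by default, `target/wheels/` is maturin's default, and the exact wheel filename will differ by version and platform.

```sh
# install whichever wheel was just produced (adjust the glob to the actual filename)
pip install dist/finch_sketch-*.whl
# or, for a bare maturin build:
pip install target/wheels/finch_sketch-*.whl
```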

#### Building binary wheels and pushing to PyPI

There is a GitHub workflow that builds Python wheels for macOS (x86 and
ARM) and Ubuntu (x86). To run it, create a new release.
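
For example, with the GitHub CLI (the tag name here is illustrative and should match the version being released):

```sh
# tag and publish a GitHub release, which triggers the wheel-building workflow
gh release create v0.6.2 --title "v0.6.2" --generate-notes
```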


### Example Usage ###
To get started, we first compute sketches for several FASTA or FASTQ files. These sketches are compact, sampled representations of the underlying genomic data, and are what allow `finch` to rapidly estimate distances between datasets. Sketching files uses the `finch sketch` command:
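
As an illustration only (a minimal sketch of the invocation; the exact flags and output naming may differ, see `finch sketch --help`):

```sh
# sketch two sequence files (inputs may also be gzipped FASTA/FASTQ)
finch sketch sample_1.fastq.gz sample_2.fastq.gz
```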

@@ -206,7 +226,7 @@ cont, jacc = sketch_one.compare(sketch_two)

## Cap'n Proto

There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
+ the changes by @bovee in https://github.com/bovee/Mash/blob/master/src/mash/capnp/MinHash.capnp

Both are generated after installing `capnp` and `cargo install capnpc` with the following command:
2 changes: 1 addition & 1 deletion cli/src/main.rs
@@ -413,7 +413,7 @@ pub fn update_sketch_params(
if let Some(new_scale_num) = new_scale {
if matches.occurrences_of("scale") == 0 {
*scale = new_scale_num;
} else if (*scale - new_scale_num).abs() < std::f64::EPSILON {
} else if (*scale - new_scale_num).abs() < f64::EPSILON {
// TODO: maybe this should have a slightly larger delta?
bail!(
"Specified scale {} does not match {} from sketch {}",
2 changes: 1 addition & 1 deletion lib/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "finch"
version = "0.6.1"
version = "0.6.2"
authors = ["One Codex <vincent@onecodex.com>"]
description = "An implementation of min-wise independent permutation locality sensitive hashing ('MinHashing') for genomic data and command-line utility for manipulation."
keywords = ["minhash", "bioinformatics", "sketches"]
64 changes: 32 additions & 32 deletions lib/src/distance.rs
@@ -97,7 +97,7 @@ pub fn raw_distance(
// at this point we've exhausted one of the two sketches, but we may have
// more counts in the other to compare if these were scaled sketches
if scale > 0. {
let max_hash = u64::max_value() / scale.recip() as u64;
let max_hash = u64::MAX / scale.recip() as u64;
while query_hashes
.get(i)
.map(|kmer_count| kmer_count.hash < max_hash)
@@ -125,6 +125,37 @@ pub fn raw_distance(
(containment, jaccard, common, total)
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
let mut i: usize = 0;
let mut common: u64 = 0;
let mut total: u64 = 0;

for ref_hash in ref_sketch {
while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
i += 1;
}

if query_sketch[i].hash == ref_hash.hash {
common += 1;
}

total += 1;
}

// Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
let containment: f64 = common as f64 / total as f64;
let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
(containment, jaccard, common, total)
}
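
As a reading of the closing computation (the comment notes that |A| == |B| == `total` is enforced): containment is $|A \cap B| / |A|$, and the Jaccard estimate expands $|A \cup B| = |A| + |B| - |A \cap B|$, giving

$$ J = \frac{|A \cap B|}{|A \cup B|} = \frac{\text{common}}{\text{total} + \text{total} - \text{common}} = \frac{\text{common}}{\text{common} + 2\,(\text{total} - \text{common})}, $$

which matches `common as f64 / (common + 2 * (total - common)) as f64`.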

#[cfg(test)]
mod tests {
use super::*;
@@ -306,37 +337,6 @@ mod tests {
}
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
let mut i: usize = 0;
let mut common: u64 = 0;
let mut total: u64 = 0;

for ref_hash in ref_sketch {
while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
i += 1;
}

if query_sketch[i].hash == ref_hash.hash {
common += 1;
}

total += 1;
}

// Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
let containment: f64 = common as f64 / total as f64;
let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
(containment, jaccard, common, total)
}

// TODO: add another method like this to allow 0's in ref sketch for hashes present in sketches?
// TODO: maybe we want to do NNLS on these matrices in Rust? for example code, see:
// https://github.com/igmanthony/fnnls/blob/master/src/fnnls.rs
14 changes: 3 additions & 11 deletions lib/src/filtering.rs
@@ -31,11 +31,7 @@ impl FilterParams {
Some(u32::max(l, sketch.filter_params.abun_filter.0.unwrap_or(0))),
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(Some(l), None) => (
@@ -46,11 +42,7 @@
None,
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(None, None) => (None, None),
@@ -341,7 +333,7 @@ pub fn filter_abundance(
) -> Vec<KmerCount> {
let mut filtered = Vec::new();
let lo_threshold = low.unwrap_or(0u32);
let hi_threshold = high.unwrap_or(u32::max_value());
let hi_threshold = high.unwrap_or(u32::MAX);
for kmer in sketch {
if lo_threshold <= kmer.count && kmer.count <= hi_threshold {
filtered.push(kmer.clone());
2 changes: 1 addition & 1 deletion lib/src/serialization/json.rs
@@ -247,7 +247,7 @@ impl<'de> Deserialize<'de> for QuotedU64 {
{
struct QuotedU64Visitor;

impl<'de> Visitor<'de> for QuotedU64Visitor {
impl Visitor<'_> for QuotedU64Visitor {
type Value = QuotedU64;

fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
9 changes: 2 additions & 7 deletions lib/src/serialization/mod.rs
@@ -151,13 +151,8 @@ pub fn write_finch_file(file: &mut dyn Write, sketches: &[Sketch]) -> FinchResul
let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
cap_filter_params.set_high_abun_filter(
sketch
.filter_params
.abun_filter
.1
.unwrap_or(::std::u32::MAX),
);
cap_filter_params
.set_high_abun_filter(sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX));
cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);

1 change: 0 additions & 1 deletion lib/src/sketch_schemes/mash.rs
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

3 changes: 1 addition & 2 deletions lib/src/sketch_schemes/scaled.rs
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

@@ -29,7 +28,7 @@ impl ScaledSketcher {
total_kmers: 0,
total_bases: 0,
size,
max_hash: u64::max_value() / iscale,
max_hash: u64::MAX / iscale,
seed,
}
}
6 changes: 4 additions & 2 deletions lib/src/statistics.rs
@@ -16,8 +16,10 @@ pub fn cardinality(sketch: &[KmerCount]) -> Result<u64, &'static str> {
if sketch.is_empty() {
return Ok(0u64);
}
Ok(((sketch.len() - 1) as f32
/ (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
Ok(
((sketch.len() - 1) as f32 / (sketch.last().unwrap().hash as f32 / usize::MAX as f32))
as u64,
)
}
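
This appears to be the standard k-minimum-values cardinality estimate (a reading of the expression, not something stated in the diff): with $k = \texttt{sketch.len()}$ hashes kept and the largest kept hash normalized as $u = \texttt{hash} / \texttt{usize::MAX} \in (0, 1]$, the number of distinct k-mers is estimated as

$$ \hat{n} \approx \frac{k - 1}{u}, $$

which is the expression above, evaluated in `f32` and truncated to `u64`.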

/// Generates a Vec of numbers of kmers for each coverage level