Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
tests:
strategy:
matrix:
rust-version: [1.62.1, stable]
rust-version: [stable]
runs-on: ubuntu-latest
steps:
- name: Checkout
Expand Down
135 changes: 135 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# This file is autogenerated by maturin v1.8.2
# To update, run
#
#    maturin generate-ci github
#
name: build wheels
on:
  push:
    branches:
      - main
      - master
    tags:
      - "*"
  pull_request:
  workflow_dispatch:
  release:
    types: [published]

jobs:
  linux:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: ubuntu-22.04
            target: x86_64
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
          manylinux: auto
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python-version }}
          path: dist

  macos:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: macos-13
            target: x86_64
          - runner: macos-14
            target: aarch64
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --features python --release --out dist --interpreter python${{ matrix.python-version }}
          sccache: ${{ !startsWith(github.ref, 'refs/tags/') }}
          manylinux: auto
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python-version }}
          path: dist

  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist
      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: dist

  upload-release:
    name: Upload release artifacts to GitHub
    needs: [linux, macos, sdist]
    runs-on: ubuntu-latest
    if: github.event.release.tag_name
    steps:
      - name: Download wheel artifacts
        uses: actions/download-artifact@v4

      - name: List downloaded wheels
        run: ls -l wheels-*/*

      - name: Upload Wheels to Release
        uses: svenstaro/upload-release-action@v2
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: wheels-*/**
          tag: ${{ github.event.release.tag_name }}
          file_glob: true

  release:
    name: Release to PyPI
    runs-on: ubuntu-latest
    if: github.event.release.tag_name
    needs: [linux, macos, sdist]
    permissions:
      # Use to sign the release artifacts
      id-token: write
      # Used to upload release artifacts
      contents: write
      # Used to generate artifact attestation
      attestations: write
    steps:
      - uses: actions/download-artifact@v4
      - name: Generate artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-path: "wheels-*/**"

      - name: Publish to PyPI
        if: ${{ startsWith(github.ref, 'refs/tags/') }}
        uses: PyO3/maturin-action@v1
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
        with:
          command: upload
          args: --non-interactive --skip-existing wheels-*/**.whl wheels-*/**.tar.gz
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,26 @@ This repository provides a library and command-line interface that reimplements
You may build Finch from source, which requires Rust >= `1.49`. Rust's Cargo package manager (see [rustup](https://www.rustup.rs) for Cargo installation instructions) can automatically build and install Finch with `cargo install finch_cli`.
If you require Python bindings, you must take extra steps (see [python support](#python-support)). Alternatively, [download a prebuilt binary](https://github.com/onecodex/finch-rs/releases) or install from [PyPI](https://pypi.org/project/finch-sketch/) `pip install finch-sketch`.

### Development ###

To build wheels locally, run:

```sh
uv venv --python 3.11
source .venv/bin/activate
uv build

# or, using maturin by itself:

maturin build --features python --release --strip
```

#### Building binary wheels and pushing to PyPI

There is a GitHub workflow that will build Python wheels for macOS (x86 and
ARM) and Ubuntu (x86). To run it, create a new release.


### Example Usage ###
To get started, we first compute sketches for several FASTA or FASTQ files. These sketches are compact, sampled representations of the underlying genomic data, and what allow `finch` to rapidly estimate distances between datasets. Sketching files uses the `finch sketch` command:

Expand Down Expand Up @@ -206,7 +226,7 @@ cont, jacc = sketch_one.compare(sketch_two)

## Cap'n Proto

There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
There is a `finch.capnp` in `src/serialization` file and the output of the MinHash schema (https://github.com/marbl/Mash/blob/54e6d66b7720035a2605a02892cad027ef3231ef/src/mash/capnp/MinHash.capnp)
+ the changes by @bovee in https://github.com/bovee/Mash/blob/master/src/mash/capnp/MinHash.capnp

Both are generated after installing `capnp` and `cargo install capnpc` with the following command:
Expand Down
2 changes: 1 addition & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ pub fn update_sketch_params(
if let Some(new_scale_num) = new_scale {
if matches.occurrences_of("scale") == 0 {
*scale = new_scale_num;
} else if (*scale - new_scale_num).abs() < std::f64::EPSILON {
} else if (*scale - new_scale_num).abs() < f64::EPSILON {
// TODO: maybe this should have a slightly larger delta?
bail!(
"Specified scale {} does not match {} from sketch {}",
Expand Down
2 changes: 1 addition & 1 deletion lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "finch"
version = "0.6.1"
version = "0.6.2"
authors = ["One Codex <vincent@onecodex.com>"]
description = "An implementation of min-wise independent permutation locality sensitive hashing ('MinHashing') for genomic data and command-line utility for manipulation."
keywords = ["minhash", "bioinformatics", "sketches"]
Expand Down
64 changes: 32 additions & 32 deletions lib/src/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ pub fn raw_distance(
// at this point we've exhausted one of the two sketches, but we may have
// more counts in the other to compare if these were scaled sketches
if scale > 0. {
let max_hash = u64::max_value() / scale.recip() as u64;
let max_hash = u64::MAX / scale.recip() as u64;
while query_hashes
.get(i)
.map(|kmer_count| kmer_count.hash < max_hash)
Expand Down Expand Up @@ -125,6 +125,37 @@ pub fn raw_distance(
(containment, jaccard, common, total)
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
///
/// Returns `(containment, jaccard, common, total)` where `total` is the number
/// of reference hashes considered and `common` is the count of exact hash
/// matches between the two (sorted) sketches.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
    // Guard against empty inputs: the unguarded loop below would panic
    // indexing `query_sketch[0]` (and `query_sketch.len() - 1` would
    // underflow) when the query sketch is empty; an empty reference would
    // yield a 0/0 = NaN containment. Nothing can be in common either way.
    if query_sketch.is_empty() || ref_sketch.is_empty() {
        return (0., 0., 0, ref_sketch.len() as u64);
    }

    let mut i: usize = 0;
    let mut common: u64 = 0;
    let mut total: u64 = 0;

    for ref_hash in ref_sketch {
        // Advance the query cursor up to (but never past) the current
        // reference hash; bounds are checked before indexing.
        while (i < query_sketch.len() - 1) && (query_sketch[i].hash < ref_hash.hash) {
            i += 1;
        }

        if query_sketch[i].hash == ref_hash.hash {
            common += 1;
        }

        total += 1;
    }

    // Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
    let containment: f64 = common as f64 / total as f64;
    let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
    (containment, jaccard, common, total)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down Expand Up @@ -306,37 +337,6 @@ mod tests {
}
}

/// This computes set statistics from one set of hashes to another.
///
/// Every hash in the reference set is considered while only those hashes in the
/// query set that are in the same range as the reference set are compared. This
/// should be a more accurate representation of the query set's containment in
/// the reference set because we consider all of the reference set. In
/// practice, there may be issues especially if the query is sketched to a
/// different effective scale than the reference.
pub fn old_distance(query_sketch: &[KmerCount], ref_sketch: &[KmerCount]) -> (f64, f64, u64, u64) {
let mut i: usize = 0;
let mut common: u64 = 0;
let mut total: u64 = 0;

for ref_hash in ref_sketch {
while (query_sketch[i].hash < ref_hash.hash) && (i < query_sketch.len() - 1) {
i += 1;
}

if query_sketch[i].hash == ref_hash.hash {
common += 1;
}

total += 1;
}

// Numerator is A-intersect-B, |A| is the denominator, we enforce |A| == |B|
let containment: f64 = common as f64 / total as f64;
let jaccard: f64 = common as f64 / (common + 2 * (total - common)) as f64;
(containment, jaccard, common, total)
}

// TODO: add another method like this to allow 0's in ref sketch for hashes present in sketches?
// TODO: maybe we want to do NNLS on these matrices in Rust? for example code, see:
// https://github.com/igmanthony/fnnls/blob/master/src/fnnls.rs
Expand Down
14 changes: 3 additions & 11 deletions lib/src/filtering.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ impl FilterParams {
Some(u32::max(l, sketch.filter_params.abun_filter.0.unwrap_or(0))),
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(Some(l), None) => (
Expand All @@ -46,11 +42,7 @@ impl FilterParams {
None,
Some(u32::min(
h,
sketch
.filter_params
.abun_filter
.1
.unwrap_or(u32::max_value()),
sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX),
)),
),
(None, None) => (None, None),
Expand Down Expand Up @@ -341,7 +333,7 @@ pub fn filter_abundance(
) -> Vec<KmerCount> {
let mut filtered = Vec::new();
let lo_threshold = low.unwrap_or(0u32);
let hi_threshold = high.unwrap_or(u32::max_value());
let hi_threshold = high.unwrap_or(u32::MAX);
for kmer in sketch {
if lo_threshold <= kmer.count && kmer.count <= hi_threshold {
filtered.push(kmer.clone());
Expand Down
2 changes: 1 addition & 1 deletion lib/src/serialization/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ impl<'de> Deserialize<'de> for QuotedU64 {
{
struct QuotedU64Visitor;

impl<'de> Visitor<'de> for QuotedU64Visitor {
impl Visitor<'_> for QuotedU64Visitor {
type Value = QuotedU64;

fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
Expand Down
9 changes: 2 additions & 7 deletions lib/src/serialization/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,8 @@ pub fn write_finch_file(file: &mut dyn Write, sketches: &[Sketch]) -> FinchResul
let mut cap_filter_params = cap_sketch.reborrow().init_filter_params();
cap_filter_params.set_filtered(sketch.filter_params.filter_on.unwrap_or(false));
cap_filter_params.set_low_abun_filter(sketch.filter_params.abun_filter.0.unwrap_or(0));
cap_filter_params.set_high_abun_filter(
sketch
.filter_params
.abun_filter
.1
.unwrap_or(::std::u32::MAX),
);
cap_filter_params
.set_high_abun_filter(sketch.filter_params.abun_filter.1.unwrap_or(u32::MAX));
cap_filter_params.set_err_filter(sketch.filter_params.err_filter);
cap_filter_params.set_strand_filter(sketch.filter_params.strand_filter);

Expand Down
1 change: 0 additions & 1 deletion lib/src/sketch_schemes/mash.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

Expand Down
3 changes: 1 addition & 2 deletions lib/src/sketch_schemes/scaled.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::{BinaryHeap, HashMap};
use std::hash::BuildHasherDefault;
use std::usize;

use needletail::Sequence;

Expand Down Expand Up @@ -29,7 +28,7 @@ impl ScaledSketcher {
total_kmers: 0,
total_bases: 0,
size,
max_hash: u64::max_value() / iscale,
max_hash: u64::MAX / iscale,
seed,
}
}
Expand Down
6 changes: 4 additions & 2 deletions lib/src/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ pub fn cardinality(sketch: &[KmerCount]) -> Result<u64, &'static str> {
if sketch.is_empty() {
return Ok(0u64);
}
Ok(((sketch.len() - 1) as f32
/ (sketch.last().unwrap().hash as f32 / usize::max_value() as f32)) as u64)
Ok(
((sketch.len() - 1) as f32 / (sketch.last().unwrap().hash as f32 / usize::MAX as f32))
as u64,
)
}

/// Generates a Vec of numbers of kmers for each coverage level
Expand Down
Loading
Loading