Skip to content

Commit 7551fd0

Browse files
authored
Merge pull request #153 from jaebeom-kim/master
Syncmer
2 parents a65c014 + ebbbf93 commit 7551fd0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+5578
-3026
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,19 @@ Please cite: [Kim J, Steinegger M. Metabuli: sensitive and specific metagenomic
3131
3232

3333
---
34+
### Update after latest release
35+
- ***Syncmer is introduced:*** half database size, double classification speed.
36+
- Older Metabuli versions are not compatible with the new database format.
37+
- New GTDB R226 database [here](https://hulk.mmseqs.com/jaebeom/gtdb226db/)
38+
- Species-representative genomes with CheckM2 completeness > 70% and contamination < 5% are included.
39+
- 129,671 species out of 143,614 species in GTDB R226 are included.
40+
- New pre-built databases are available at [https://mmseqs.com/metabuli](https://mmseqs.com/metabuli)
41+
42+
### Update in v1.1.1
43+
- `--validate-input` option added for FASTA/Q validation.
44+
- `--validate-db` option added for database validation.
45+
- `classifiedRefiner` module added to refine classification results.
46+
3447
### Update in v1.1.0
3548
- Fix errors in v1.0.9
3649
- Custom DB creation became easier

src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
include_directories(commons)
22
include_directories(workflow)
33
include_directories(util)
4+
include_directories(benchmark)
45
include_directories(../lib/prodigal)
56
include_directories(../lib/fasta_validator)
67
include_directories(../lib/fastq_utils)
@@ -9,11 +10,13 @@ add_subdirectory(commons)
910
add_subdirectory(util)
1011
add_subdirectory(version)
1112
add_subdirectory(workflow)
13+
add_subdirectory(benchmark)
1214

1315
add_executable(metabuli
1416
${commons_source_files}
1517
${util_source_files}
1618
${workflow_source_files}
19+
${benchmark_source_files}
1720
metabuli.cpp
1821
LocalCommandDeclarations.h
1922
util/filter_by_genus.cpp

src/LocalCommandDeclarations.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ extern int grade(int argc, const char **argv, const Command& command);
1010
extern int gradeByCladeSize(int argc, const char **argv, const Command& command);
1111
extern int seqHeader2TaxId(int argc, const char **argv, const Command& command);
1212
extern int addToLibrary(int argc, const char **argv, const Command& command);
13-
extern int applyThreshold(int argc, const char **argv, const Command& command);
1413
extern int binning2report(int argc, const char **argv, const Command& command);
1514
extern int filterByGenus(int argc, const char **argv, const Command& command);
1615
extern int databaseReport(int argc, const char **argv, const Command& command);
@@ -27,5 +26,9 @@ extern int editNames(int argc, const char **argv, const Command& command);
2726
extern int createnewtaxalist(int argc, const char **argv, const Command& command);
2827
extern int classifiedRefiner(int argc, const char **argv, const Command& command);
2928
extern int validateDatabase(int argc, const char **argv, const Command& command);
29+
extern int printDeltaIdx(int argc, const char **argv, const Command& command);
30+
extern int makeBenchmarkSet(int argc, const char **argv, const Command &command);
31+
extern int makeQuerySet(int argc, const char **argv, const Command &command);
32+
extern int makeVirusBenchmarkSet(int argc, const char **argv, const Command &command);
3033

3134
#endif //ADCLASSIFIER2_LOCALCOMMANDDECLARATIONS_H

src/MetabuliBase.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,13 @@ std::vector<Command> metabuliCommands = {
5252
"<diffIdx>",
5353
CITATION_SPACEPHARER,
5454
{{"Differential index", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::empty}}},
55+
{"printDeltaIdx", printDeltaIdx, &localPar.expand_diffidx, COMMAND_EXPERT,
56+
"Print k-mers stored in delta index",
57+
nullptr,
58+
"Jaebeom Kim <[email protected]>",
59+
"<deltaIdx>",
60+
CITATION_SPACEPHARER,
61+
{{"deltaIdx", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
5562
{"printInfo", printInfo, &localPar.printInfo, COMMAND_EXPERT,
5663
"Print k-mer information",
5764
nullptr,
@@ -196,7 +203,32 @@ std::vector<Command> metabuliCommands = {
196203
"Jaebeom Kim <[email protected]>",
197204
"<i: database directory>",
198205
CITATION_SPACEPHARER,
199-
{{"database directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}
206+
{{"database directory", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
207+
{"maketestsets", makeBenchmarkSet, &localPar.makeBenchmarkSet, COMMAND_EXPERT,
208+
"Create test sets for benchmarking",
209+
nullptr,
210+
"Jaebeom Kim <[email protected]}",
211+
"<i: Assembly accessions> <i: Taxonomy dump>",
212+
CITATION_SPACEPHARER,
213+
{{"Assembly accessions", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
214+
{"Taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
215+
{"makeInclusionTestQueries", makeQuerySet, &localPar.makeBenchmarkSet, COMMAND_EXPERT,
216+
"Create query sets for inclusion tests.",
217+
nullptr,
218+
"Jaebeom Kim <[email protected]}",
219+
"<i: Assembly accessions> <i: Taxonomy dump>",
220+
CITATION_SPACEPHARER,
221+
{{"Assembly accessions", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
222+
{"Taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}},
223+
{"make-virus-benchmark-set", makeVirusBenchmarkSet, &localPar.makeBenchmarkSet, COMMAND_EXPERT,
224+
"Create test sets for benchmarking virus classification",
225+
nullptr,
226+
"Jaebeom Kim <[email protected]}",
227+
"<i: assembly accessions> <i: taxonomy dump> <i: assembly accession 2 taxid>",
228+
CITATION_SPACEPHARER,
229+
{{"Assembly accessions", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile},
230+
{"Taxonomy dump", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory},
231+
{"Assembly accession to taxid mapping", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}
200232
};
201233

202234
std::vector<KmerThreshold> externalThreshold = {};

src/benchmark/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
set(benchmark_source_files
2+
benchmark/makeInclusionQuerySet.cpp
3+
benchmark/makeBenchmarkSet.cpp
4+
benchmark/makeVirusBenchmarkSet.cpp
5+
PARENT_SCOPE)

0 commit comments

Comments
 (0)