Skip to content

Commit 2feadb1

Browse files
authored
Merge pull request #312 from jacksonrnewhouse/faster_run_unions
faster run container unions.
2 parents 55ff8cc + 77da837 commit 2feadb1

File tree

2 files changed

+110
-19
lines changed

2 files changed

+110
-19
lines changed

arraycontainer.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -395,11 +395,19 @@ func (ac *arrayContainer) iorBitmap(bc2 *bitmapContainer) container {
395395
}
396396

397397
func (ac *arrayContainer) iorRun16(rc *runContainer16) container {
398-
bc1 := ac.toBitmapContainer()
399-
bc2 := rc.toBitmapContainer()
400-
bc1.iorBitmap(bc2)
401-
*ac = *newArrayContainerFromBitmap(bc1)
402-
return ac
398+
runCardinality := rc.getCardinality()
399+
// heuristic for if the container should maybe be an
400+
// array container.
401+
if runCardinality < ac.getCardinality() &&
402+
runCardinality+ac.getCardinality() < arrayDefaultMaxSize {
403+
var result container
404+
result = ac
405+
for _, run := range rc.iv {
406+
result = result.iaddRange(int(run.start), int(run.start)+int(run.length))
407+
}
408+
return result
409+
}
410+
return rc.orArray(ac)
403411
}
404412

405413
func (ac *arrayContainer) lazyIOR(a container) container {

runcontainer.go

Lines changed: 97 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ import (
4747
// runContainer16 does run-length encoding of sets of
4848
// uint16 integers.
4949
type runContainer16 struct {
50-
iv []interval16
50+
iv []interval16
5151
}
5252

5353
// interval16 is the internal to runContainer16
@@ -849,7 +849,7 @@ func (rc *runContainer16) numIntervals() int {
849849
//
850850
// runContainer16.search always returns whichInterval16 < len(rc.iv).
851851
//
852-
// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there
852+
// The search space is from startIndex to endxIndex. If endxIndex is set to zero, then there
853853
// is no upper bound.
854854
//
855855
func (rc *runContainer16) searchRange(key int, startIndex int, endxIndex int) (whichInterval16 int, alreadyPresent bool, numCompares int) {
@@ -968,14 +968,12 @@ func (rc *runContainer16) getCardinality() int {
968968
return n
969969
}
970970

971-
972971
// isEmpty returns true if the container is empty.
973972
// It runs in constant time.
974973
func (rc *runContainer16) isEmpty() bool {
975974
return len(rc.iv) == 0
976975
}
977976

978-
979977
// AsSlice decompresses the contents into a []uint16 slice.
980978
func (rc *runContainer16) AsSlice() []uint16 {
981979
s := make([]uint16, rc.getCardinality())
@@ -1198,7 +1196,7 @@ func (ri *runIterator16) advanceIfNeeded(minval uint16) {
11981196
// before calling next() to ensure there are contents.
11991197
type runReverseIterator16 struct {
12001198
rc *runContainer16
1201-
curIndex int // index into rc.iv
1199+
curIndex int // index into rc.iv
12021200
curPosInIndex uint16 // offset in rc.iv[curIndex]
12031201
}
12041202

@@ -1288,7 +1286,6 @@ func (ri *runIterator16) nextMany(hs uint32, buf []uint32) int {
12881286
return n
12891287
}
12901288

1291-
12921289
func (ri *runIterator16) nextMany64(hs uint64, buf []uint64) int {
12931290
n := 0
12941291

@@ -1424,7 +1421,7 @@ func intersectWithLeftover16(astart, alast, bstart, blast int) (isOverlap, isLef
14241421
return
14251422
}
14261423

1427-
func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex int, key int) (index int, done bool) {
1424+
func (rc *runContainer16) findNextIntervalThatIntersectsStartingFrom(startIndex int, key int) (index int, done bool) {
14281425
w, _, _ := rc.searchRange(key, startIndex, 0)
14291426
// rc.search always returns w < len(rc.iv)
14301427
if w < startIndex {
@@ -1448,7 +1445,6 @@ func sliceToString16(m []interval16) string {
14481445
return s
14491446
}
14501447

1451-
14521448
// helper for invert
14531449
func (rc *runContainer16) invertlastInterval(origin uint16, lastIdx int) []interval16 {
14541450
cur := rc.iv[lastIdx]
@@ -2152,9 +2148,21 @@ func (rc *runContainer16) orBitmapContainerCardinality(bc *bitmapContainer) int
21522148

21532149
// orArray finds the union of rc and ac.
21542150
func (rc *runContainer16) orArray(ac *arrayContainer) container {
2155-
bc1 := newBitmapContainerFromRun(rc)
2156-
bc2 := ac.toBitmapContainer()
2157-
return bc1.orBitmap(bc2)
2151+
if ac.isEmpty() {
2152+
return rc.clone()
2153+
}
2154+
if rc.isEmpty() {
2155+
return ac.clone()
2156+
}
2157+
intervals, cardMinusOne := runArrayUnionToRuns(rc, ac)
2158+
result := newRunContainer16TakeOwnership(intervals)
2159+
if len(intervals) >= 2048 && cardMinusOne >= arrayDefaultMaxSize {
2160+
return newBitmapContainerFromRun(result)
2161+
}
2162+
if len(intervals)*2 > 1+int(cardMinusOne) {
2163+
return result.toArrayContainer()
2164+
}
2165+
return result
21582166
}
21592167

21602168
// orArray finds the union of rc and ac.
@@ -2197,13 +2205,88 @@ func (rc *runContainer16) iorBitmapContainer(bc *bitmapContainer) container {
21972205
}
21982206

21992207
func (rc *runContainer16) iorArray(ac *arrayContainer) container {
2200-
it := ac.getShortIterator()
2201-
for it.hasNext() {
2202-
rc.Add(it.next())
2208+
if rc.isEmpty() {
2209+
return ac.clone()
2210+
}
2211+
if ac.isEmpty() {
2212+
return rc
2213+
}
2214+
var cardMinusOne uint16
2215+
//TODO: perform the union algorithm in-place using rc.iv
2216+
// this can be done with methods like the in-place array container union
2217+
// but maybe lazily moving the remaining elements back.
2218+
rc.iv, cardMinusOne = runArrayUnionToRuns(rc, ac)
2219+
if len(rc.iv) >= 2048 && cardMinusOne >= arrayDefaultMaxSize {
2220+
return newBitmapContainerFromRun(rc)
2221+
}
2222+
if len(rc.iv)*2 > 1+int(cardMinusOne) {
2223+
return rc.toArrayContainer()
22032224
}
22042225
return rc
22052226
}
22062227

2228+
func runArrayUnionToRuns(rc *runContainer16, ac *arrayContainer) ([]interval16, uint16) {
2229+
pos1 := 0
2230+
pos2 := 0
2231+
length1 := len(ac.content)
2232+
length2 := len(rc.iv)
2233+
target := make([]interval16, 0, len(rc.iv))
2234+
// have to find the first range
2235+
// options are
2236+
// 1. from array container
2237+
// 2. from run container
2238+
var previousInterval interval16
2239+
var cardMinusOne uint16
2240+
if ac.content[0] < rc.iv[0].start {
2241+
previousInterval.start = ac.content[0]
2242+
previousInterval.length = 0
2243+
pos1++
2244+
} else {
2245+
previousInterval.start = rc.iv[0].start
2246+
previousInterval.length = rc.iv[0].length
2247+
pos2++
2248+
}
2249+
2250+
for pos1 < length1 || pos2 < length2 {
2251+
if pos1 < length1 {
2252+
s1 := ac.content[pos1]
2253+
if s1 <= previousInterval.start+previousInterval.length {
2254+
pos1++
2255+
continue
2256+
}
2257+
if previousInterval.last() < MaxUint16 && previousInterval.last()+1 == s1 {
2258+
previousInterval.length++
2259+
pos1++
2260+
continue
2261+
}
2262+
}
2263+
if pos2 < length2 {
2264+
range2 := rc.iv[pos2]
2265+
if range2.start <= previousInterval.last() || range2.start > 0 && range2.start-1 == previousInterval.last() {
2266+
pos2++
2267+
if previousInterval.last() < range2.last() {
2268+
previousInterval.length = range2.last() - previousInterval.start
2269+
}
2270+
continue
2271+
}
2272+
}
2273+
cardMinusOne += previousInterval.length + 1
2274+
target = append(target, previousInterval)
2275+
if pos2 == length2 || pos1 < length1 && ac.content[pos1] < rc.iv[pos2].start {
2276+
previousInterval.start = ac.content[pos1]
2277+
previousInterval.length = 0
2278+
pos1++
2279+
} else {
2280+
previousInterval = rc.iv[pos2]
2281+
pos2++
2282+
}
2283+
}
2284+
cardMinusOne += previousInterval.length + 1
2285+
target = append(target, previousInterval)
2286+
2287+
return target, cardMinusOne
2288+
}
2289+
22072290
// lazyIOR is described (not yet implemented) in
22082291
// this nice note from @lemire on
22092292
// https://github.com/RoaringBitmap/roaring/pull/70#issuecomment-263613737

0 commit comments

Comments
 (0)