Skip to content

feat(bloomfilter): add salt #742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jun 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions bench-unions/Bench/Unions.hs
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ doSetup gopts = do
createDirectoryIfMissing True $ rootDir gopts

-- Populate the specified number of tables
LSM.withSession (rootDir gopts) $ \session -> do
LSM.withOpenSession (rootDir gopts) $ \session -> do
-- Create a "baseline" table
--
-- We create a single table that *already has* all the same key value pairs
Expand Down Expand Up @@ -337,7 +337,7 @@ tableRange gopts =
-- | Count duplicate keys in all tables that will be unioned together
doCollisionAnalysis :: GlobalOpts -> IO ()
doCollisionAnalysis gopts = do
LSM.withSession (rootDir gopts) $ \session -> do
LSM.withOpenSession (rootDir gopts) $ \session -> do
seenRef <- newIORef Set.empty
dupRef <- newIORef Set.empty

Expand Down Expand Up @@ -381,7 +381,7 @@ doRun gopts opts = do
withFile dataPath WriteMode $ \h -> do
hPutStrLn h "# iteration \t baseline (ops/sec) \t union (ops/sec) \t union debt"

LSM.withSession (rootDir gopts) $ \session -> do
LSM.withOpenSession (rootDir gopts) $ \session -> do
-- Load the baseline table
LSM.withTableFromSnapshot session baselineTableName label
$ \baselineTable -> do
Expand Down
13 changes: 9 additions & 4 deletions bench/macro/lsm-tree-bench-bloomfilter.hs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ benchmarks = do
benchmark "bloomQueries"
"(this is the batch lookup, less the cost of computing and hashing the keys)"
(benchInBatches benchmarkBatchSize rng0
(\ks -> Bloom.bloomQueries vbs ks `seq` ()))
(\ks -> Bloom.bloomQueries benchSalt vbs ks `seq` ()))
(fromIntegralChecked benchmarkNumLookups)
hashcost
0
Expand Down Expand Up @@ -200,6 +200,8 @@ totalNumEntriesSanityCheck l1 filterSizes =
==
sum [ 2^l1 * sizeFactor | (_, sizeFactor, _) <- filterSizes ]

benchSalt :: Bloom.Salt
benchSalt = 4

-- | Input environment for benchmarking 'Bloom.elemMany'.
--
Expand All @@ -223,7 +225,10 @@ elemManyEnv :: [BloomFilterSizeInfo]
elemManyEnv filterSizes rng0 =
stToIO $ do
-- create the filters
mbs <- sequence [ Bloom.new bsize | (_, _, bsize) <- filterSizes ]
mbs <- sequence
[ Bloom.new bsize benchSalt
| (_, _, bsize) <- filterSizes
]
-- add elements
foldM_
(\rng (i, mb) -> do
Expand Down Expand Up @@ -264,7 +269,7 @@ benchInBatches !b !rng0 !action =
benchMakeHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchMakeHashes !_bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
!khs = V.convert (V.map (Bloom.hashesWithSalt benchSalt) ks)
in khs `seq` ()

-- | This gives us a combined cost of calculating the series of keys, their
Expand All @@ -273,7 +278,7 @@ benchMakeHashes !_bs !ks =
benchElemHashes :: Vector (Bloom SerialisedKey) -> BatchBench
benchElemHashes !bs !ks =
let khs :: VP.Vector (Bloom.Hashes SerialisedKey)
!khs = V.convert (V.map Bloom.hashes ks)
!khs = V.convert (V.map (Bloom.hashesWithSalt benchSalt) ks)
in V.foldl'
(\_ b -> VP.foldl'
(\_ kh -> Bloom.elemHashes b kh `seq` ())
Expand Down
15 changes: 9 additions & 6 deletions bench/macro/lsm-tree-bench-lookups.hs
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ entryBitsWithOverhead = entryBits -- key and value size
numEntriesFitInPage :: Fractional a => a
numEntriesFitInPage = fromIntegral unusedPageBits / fromIntegral entryBitsWithOverhead

benchSalt :: Bloom.Salt
benchSalt = 4

benchmarks :: Run.RunDataCaching -> IO ()
benchmarks !caching = withFS $ \hfs hbio -> do
#ifdef NO_IGNORE_ASSERTS
Expand Down Expand Up @@ -351,7 +354,7 @@ lookupsEnv runSizes keyRng0 hfs hbio caching = do

-- create the runs
rbs <- sequence
[ RunBuilder.new hfs hbio
[ RunBuilder.new hfs hbio benchSalt
RunParams {
runParamCaching = caching,
runParamAlloc = RunAllocFixed benchmarkNumBitsPerEntry,
Expand Down Expand Up @@ -428,7 +431,7 @@ benchBloomQueries !bs !keyRng !n
| n <= 0 = ()
| otherwise =
let (!ks, !keyRng') = genLookupBatch keyRng benchmarkGenBatchSize
in bloomQueries bs ks `seq`
in bloomQueries benchSalt bs ks `seq`
benchBloomQueries bs keyRng' (n-benchmarkGenBatchSize)

-- | This gives us the combined cost of calculating batches of keys, performing
Expand All @@ -445,7 +448,7 @@ benchIndexSearches !arenaManager !bs !ics !hs !keyRng !n
| n <= 0 = pure ()
| otherwise = do
let (!ks, !keyRng') = genLookupBatch keyRng benchmarkGenBatchSize
!rkixs = bloomQueries bs ks
!rkixs = bloomQueries benchSalt bs ks
!_ioops <- withArena arenaManager $ \arena -> stToIO $ indexSearches arena ics hs ks rkixs
benchIndexSearches arenaManager bs ics hs keyRng' (n-benchmarkGenBatchSize)

Expand All @@ -463,7 +466,7 @@ benchPrepLookups !arenaManager !bs !ics !hs !keyRng !n
| n <= 0 = pure ()
| otherwise = do
let (!ks, !keyRng') = genLookupBatch keyRng benchmarkGenBatchSize
(!_rkixs, !_ioops) <- withArena arenaManager $ \arena -> stToIO $ prepLookups arena bs ics hs ks
(!_rkixs, !_ioops) <- withArena arenaManager $ \arena -> stToIO $ prepLookups arena benchSalt bs ics hs ks
benchPrepLookups arenaManager bs ics hs keyRng' (n-benchmarkGenBatchSize)

-- | This gives us the combined cost of calculating batches of keys, and
Expand All @@ -489,7 +492,7 @@ benchLookupsIO !hbio !arenaManager !resolve !wb !wbblobs !rs !bs !ics !hs =
| otherwise = do
let (!ks, !keyRng') = genLookupBatch keyRng benchmarkGenBatchSize
!_ <- lookupsIOWithWriteBuffer
hbio arenaManager resolve wb wbblobs rs bs ics hs ks
hbio arenaManager resolve benchSalt wb wbblobs rs bs ics hs ks
go keyRng' (n-benchmarkGenBatchSize)

{-------------------------------------------------------------------------------
Expand Down Expand Up @@ -524,7 +527,7 @@ classifyLookups !bs !keyRng0 !n0 =
| otherwise =
unsafePerformIO (putStr ".") `seq`
let (!ks, !keyRng') = genLookupBatch keyRng benchmarkGenBatchSize
!rkixs = bloomQueries bs ks
!rkixs = bloomQueries benchSalt bs ks
in loop (positives + VP.length rkixs) keyRng' (n-benchmarkGenBatchSize)

-- | Fill a mutable vector with uniformly random values.
Expand Down
7 changes: 5 additions & 2 deletions bench/macro/lsm-tree-bench-wp8.hs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ benchTableConfig :: LSM.TableConfig
benchTableConfig =
LSM.defaultTableConfig {LSM.confFencePointerIndex = LSM.CompactIndex}

benchSalt :: LSM.Salt
benchSalt = 4

-------------------------------------------------------------------------------
-- Keys and values
-------------------------------------------------------------------------------
Expand Down Expand Up @@ -413,7 +416,7 @@ doSetup' gopts opts = do

let name = LSM.toSnapshotName "bench"

LSM.withSession (mkTracer gopts) hasFS hasBlockIO (FS.mkFsPath []) $ \session -> do
LSM.withOpenSession (mkTracer gopts) hasFS hasBlockIO benchSalt (FS.mkFsPath []) $ \session -> do
tbl <- LSM.newTableWith @IO @K @V @B (mkTableConfigSetup gopts opts benchTableConfig) session

forM_ (groupsOfN 256 [ 0 .. initialSize gopts ]) $ \batch -> do
Expand Down Expand Up @@ -575,7 +578,7 @@ doRun gopts opts = do

let name = LSM.toSnapshotName "bench"

LSM.withSession (mkTracer gopts) hasFS hasBlockIO (FS.mkFsPath []) $ \session ->
LSM.withOpenSession (mkTracer gopts) hasFS hasBlockIO benchSalt (FS.mkFsPath []) $ \session ->
withLatencyHandle $ \h -> do
-- open snapshot
-- In checking mode we start with an empty table, since our pure
Expand Down
12 changes: 8 additions & 4 deletions bench/micro/Bench/Database/LSMTree.hs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import Control.DeepSeq
import Control.Exception
import Control.Tracer
import Criterion.Main
import qualified Data.BloomFilter.Hash as Bloom
import Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as SBS
import Data.Foldable
Expand Down Expand Up @@ -82,6 +83,9 @@ benchConfig = defaultTableConfig
, confFencePointerIndex = CompactIndex
}

benchSalt :: Bloom.Salt
benchSalt = 4

{-------------------------------------------------------------------------------
Large Value vs. Small Value Blob
-------------------------------------------------------------------------------}
Expand Down Expand Up @@ -135,7 +139,7 @@ benchLargeValueVsSmallValueBlob =

initialise inss = do
(tmpDir, hfs, hbio) <- mkFiles
s <- openSession nullTracer hfs hbio (FS.mkFsPath [])
s <- openSession nullTracer hfs hbio benchSalt (FS.mkFsPath [])
t <- newTableWith benchConfig s
V.mapM_ (inserts t) inss
pure (tmpDir, hfs, hbio, s, t)
Expand Down Expand Up @@ -220,7 +224,7 @@ benchCursorScanVsRangeLookupScan =

initialise inss = do
(tmpDir, hfs, hbio) <- mkFiles
s <- openSession nullTracer hfs hbio (FS.mkFsPath [])
s <- openSession nullTracer hfs hbio benchSalt (FS.mkFsPath [])
t <- newTableWith benchConfig s
V.mapM_ (inserts t) inss
pure (tmpDir, hfs, hbio, s, t)
Expand Down Expand Up @@ -265,7 +269,7 @@ benchInsertBatches =

initialise = do
(tmpDir, hfs, hbio) <- mkFiles
s <- openSession nullTracer hfs hbio (FS.mkFsPath [])
s <- openSession nullTracer hfs hbio benchSalt (FS.mkFsPath [])
t <- newTableWith _benchConfig s
pure (tmpDir, hfs, hbio, s, t)

Expand Down Expand Up @@ -451,7 +455,7 @@ mkTable ::
, Table IO K V3 B3
)
mkTable hfs hbio conf = do
sesh <- openSession nullTracer hfs hbio (FS.mkFsPath [])
sesh <- openSession nullTracer hfs hbio benchSalt (FS.mkFsPath [])
t <- newTableWith conf sesh
pure (sesh, t)

Expand Down
7 changes: 5 additions & 2 deletions bench/micro/Bench/Database/LSMTree/Internal/BloomFilter.hs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.BloomFilter" [
]
]

benchSalt :: Bloom.Salt
benchSalt = 4

-- | Input environment for benchmarking 'Bloom.elem'.
elemEnv ::
Double -- ^ False positive rate
Expand All @@ -61,7 +64,7 @@ elemEnv fpr nbloom nelemsPositive nelemsNegative = do
$ uniformWithoutReplacement @UTxOKey g1 (nbloom + nelemsNegative)
ys2 = sampleUniformWithReplacement @UTxOKey g2 nelemsPositive xs
zs = shuffle (ys1 ++ ys2) g3
pure ( Bloom.fromList (Bloom.policyForFPR fpr) (fmap serialiseKey xs)
pure ( Bloom.fromList (Bloom.policyForFPR fpr) benchSalt (fmap serialiseKey xs)
, fmap serialiseKey zs
)

Expand All @@ -86,5 +89,5 @@ constructBloom ::
constructBloom fpr m =
-- For faster construction, avoid going via lists and use Bloom.create,
-- traversing the map inserting the keys
Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) $ \b ->
Bloom.create (Bloom.sizeForFPR fpr (Map.size m)) benchSalt $ \b ->
BiFold.bifoldMap (\k -> Bloom.insert b k) (\_v -> pure ()) m
18 changes: 11 additions & 7 deletions bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import Database.LSMTree.Extras.Random (frequency, randomByteStringR,
import Database.LSMTree.Extras.UTxO
import Database.LSMTree.Internal.Arena (ArenaManager, closeArena,
newArena, newArenaManager, withArena)
import qualified Database.LSMTree.Internal.BloomFilter as Bloom
import Database.LSMTree.Internal.Entry (Entry (..), NumEntries (..))
import Database.LSMTree.Internal.Index as Index
import Database.LSMTree.Internal.Lookup (bloomQueries, indexSearches,
Expand Down Expand Up @@ -84,6 +85,9 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.Lookup" [
}
]

benchSalt :: Bloom.Salt
benchSalt = 4

benchLookups :: Config -> Benchmark
benchLookups conf@Config{name} =
withEnv $ \ ~(_dir, arenaManager, _hasFS, hasBlockIO, wbblobs, rs, ks) ->
Expand All @@ -96,23 +100,23 @@ benchLookups conf@Config{name} =
-- The bloomfilter is queried for all lookup keys. The result is an
-- unboxed vector, so only use @whnf@.
bench "Bloomfilter query" $
whnf (\ks' -> bloomQueries blooms ks') ks
whnf (\ks' -> bloomQueries benchSalt blooms ks') ks
-- The compact index is only searched for (true and false) positive
-- lookup keys. We use whnf here because the result is
, env (pure $ bloomQueries blooms ks) $ \rkixs ->
, env (pure $ bloomQueries benchSalt blooms ks) $ \rkixs ->
bench "Compact index search" $
whnfAppIO (\ks' -> withArena arenaManager $ \arena -> stToIO $ indexSearches arena indexes kopsFiles ks' rkixs) ks
-- prepLookups combines bloom filter querying and index searching.
-- The implementation forces the results to WHNF, so we use
-- whnfAppIO here instead of nfAppIO.
, bench "Lookup preparation in memory" $
whnfAppIO (\ks' -> withArena arenaManager $ \arena -> stToIO $ prepLookups arena blooms indexes kopsFiles ks') ks
whnfAppIO (\ks' -> withArena arenaManager $ \arena -> stToIO $ prepLookups arena benchSalt blooms indexes kopsFiles ks') ks
-- Submit the IOOps we get from prepLookups to HasBlockIO. We use
-- perRunEnv because IOOps contain mutable buffers, so we want fresh
-- ones for each run of the benchmark. We manually evaluate the
-- result to WHNF since it is unboxed vector.
, bench "Submit IOOps" $
perRunEnv (withArena arenaManager $ \arena -> stToIO $ prepLookups arena blooms indexes kopsFiles ks) $ \ ~(_rkixs, ioops) -> do
perRunEnv (withArena arenaManager $ \arena -> stToIO $ prepLookups arena benchSalt blooms indexes kopsFiles ks) $ \ ~(_rkixs, ioops) -> do
!_ioress <- FS.submitIO hasBlockIO ioops
pure ()
-- When IO result have been collected, intra-page lookups searches
Expand All @@ -125,7 +129,7 @@ benchLookups conf@Config{name} =
, bench "Perform intra-page lookups" $
perRunEnvWithCleanup
( do arena <- newArena arenaManager
(rkixs, ioops) <- stToIO (prepLookups arena blooms indexes kopsFiles ks)
(rkixs, ioops) <- stToIO (prepLookups arena benchSalt blooms indexes kopsFiles ks)
ioress <- FS.submitIO hasBlockIO ioops
pure (rkixs, ioops, ioress, arena)
)
Expand All @@ -141,7 +145,7 @@ benchLookups conf@Config{name} =
, bench "Lookups in IO" $
whnfAppIO (\ks' -> lookupsIOWithWriteBuffer
hasBlockIO arenaManager resolveV
WB.empty wbblobs
benchSalt WB.empty wbblobs
rs blooms indexes kopsFiles ks') ks
]
-- TODO: consider adding benchmarks that also use the write buffer
Expand Down Expand Up @@ -192,7 +196,7 @@ lookupsInBatchesEnv Config {..} = do
wbblobs <- WBB.new hasFS (FS.mkFsPath ["0.wbblobs"])
wb <- WB.fromMap <$> traverse (traverse (WBB.addBlob hasFS wbblobs)) storedKeys
let fsps = RunFsPaths (FS.mkFsPath []) (RunNumber 0)
r <- Run.fromWriteBuffer hasFS hasBlockIO runParams fsps wb wbblobs
r <- Run.fromWriteBuffer hasFS hasBlockIO benchSalt runParams fsps wb wbblobs
let NumEntries nentriesReal = Run.size r
assertEqual nentriesReal nentries $ pure ()
-- 42 to 43 entries per page
Expand Down
10 changes: 7 additions & 3 deletions bench/micro/Bench/Database/LSMTree/Internal/Merge.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import Database.LSMTree.Extras.Orphans ()
import qualified Database.LSMTree.Extras.Random as R
import Database.LSMTree.Extras.RunData
import Database.LSMTree.Extras.UTxO
import qualified Database.LSMTree.Internal.BloomFilter as Bloom
import Database.LSMTree.Internal.Entry
import qualified Database.LSMTree.Internal.Index as Index (IndexType (Compact))
import Database.LSMTree.Internal.Merge (MergeType (..))
Expand Down Expand Up @@ -220,6 +221,9 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.Merge" [
| w <- weights
]

benchSalt :: Bloom.Salt
benchSalt = 4

runParams :: RunBuilder.RunParams
runParams =
RunBuilder.RunParams {
Expand Down Expand Up @@ -273,7 +277,7 @@ merge ::
merge fs hbio Config {..} targetPaths runs = do
let f = fromMaybe const mergeResolve
m <- fromMaybe (error "empty inputs, no merge created") <$>
Merge.new fs hbio runParams mergeType f targetPaths runs
Merge.new fs hbio benchSalt runParams mergeType f targetPaths runs
Merge.stepsToCompletion m stepSize

fsPath :: FS.FsPath
Expand Down Expand Up @@ -397,7 +401,7 @@ randomRuns ::
randomRuns hasFS hasBlockIO config@Config {..} rng0 = do
counter <- inputRunPathsCounter
fmap V.fromList $
mapM (unsafeCreateRun hasFS hasBlockIO runParams fsPath counter) $
mapM (unsafeCreateRun hasFS hasBlockIO benchSalt runParams fsPath counter) $
zipWith
(randomRunData config)
nentries
Expand Down Expand Up @@ -446,5 +450,5 @@ randomRunData Config {..} runentries g0 =
-- Each run entry needs a distinct key.
randomWord64OutOf :: Int -> Rnd SerialisedKey
randomWord64OutOf possibleKeys =
first (serialiseKey . Hash.hash64)
first (serialiseKey . Hash.hashSalt64 benchSalt)
. uniformR (0, fromIntegral possibleKeys :: Word64)
3 changes: 2 additions & 1 deletion blockio/src-linux/System/FS/BlockIO/Async.hs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ ctxParamsConv API.IOCtxParams{API.ioctxBatchSizeLimit, API.ioctxConcurrencyLimit
}

submitIO ::
HasFS IO HandleIO
HasCallStack
=> HasFS IO HandleIO
-> I.IOCtx
-> V.Vector (IOOp RealWorld HandleIO)
-> IO (VU.Vector IOResult)
Expand Down
5 changes: 3 additions & 2 deletions blockio/src/System/FS/BlockIO/Serial.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import Control.Monad.Primitive (PrimMonad, PrimState, RealWorld)
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM
import GHC.Stack (HasCallStack)
import System.FS.API
import qualified System.FS.BlockIO.API as API
import System.FS.BlockIO.API (IOOp (..), IOResult (..), LockMode (..))
Expand Down Expand Up @@ -55,7 +56,7 @@ serialHasBlockIO hSetNoCache hAdvise hAllocate tryLockFile hSynchronise synchron
data IOCtx m = IOCtx { ctxFS :: SomeHasFS m, openVar :: MVar m Bool }

{-# SPECIALISE guardIsOpen :: IOCtx IO -> IO () #-}
guardIsOpen :: (MonadMVar m, MonadThrow m) => IOCtx m -> m ()
guardIsOpen :: (HasCallStack, MonadMVar m, MonadThrow m) => IOCtx m -> m ()
guardIsOpen ctx = readMVar (openVar ctx) >>= \b ->
unless b $ throwIO (API.mkClosedError (ctxFS ctx) "submitIO")

Expand All @@ -72,7 +73,7 @@ close ctx = modifyMVar_ (openVar ctx) $ const (pure False)
-> IOCtx IO -> V.Vector (IOOp RealWorld h)
-> IO (VU.Vector IOResult) #-}
submitIO ::
(MonadMVar m, MonadThrow m, PrimMonad m)
(HasCallStack, MonadMVar m, MonadThrow m, PrimMonad m)
=> HasFS m h
-> IOCtx m
-> V.Vector (IOOp (PrimState m) h)
Expand Down
Loading