
Lift the restriction of minimum 8-byte keys for the compact index #746


Open · wants to merge 3 commits into main
9 changes: 3 additions & 6 deletions bench/micro/Bench/Database/LSMTree/Internal/Index.hs
@@ -11,15 +11,13 @@ import Criterion.Main (Benchmark, Benchmarkable, bench, bgroup, env,
import Data.List (foldl')
-- foldl' is included in the Prelude from base 4.20 onwards
#endif
import Database.LSMTree.Extras.Generators (getKeyForIndexCompact,
mkPages, toAppends)
import Database.LSMTree.Extras.Generators (mkPages, toAppends)
-- also for @Arbitrary@ instantiation of @SerialisedKey@
import Database.LSMTree.Extras.Index (Append, append)
import Database.LSMTree.Internal.Index (Index,
IndexType (Compact, Ordinary), newWithDefaults, search,
unsafeEnd)
import Database.LSMTree.Internal.Serialise
(SerialisedKey (SerialisedKey))
import Database.LSMTree.Internal.Serialise (SerialisedKey)
import Test.QuickCheck (choose, vector)
import Test.QuickCheck.Gen (Gen (MkGen))
import Test.QuickCheck.Random (mkQCGen)
@@ -61,8 +59,7 @@ generated (MkGen exec) = exec (mkQCGen 411) 30
keysForIndexCompact :: Int -- ^ Number of keys
-> [SerialisedKey] -- ^ Constructed keys
keysForIndexCompact = vector >>>
generated >>>
map (getKeyForIndexCompact >>> SerialisedKey)
generated

{-|
Constructs append operations whose serialised keys conform to the key size
2 changes: 1 addition & 1 deletion doc/format-run.md
@@ -198,7 +198,7 @@ The compact index type is designed to work with keys that are large
cryptographic hashes, e.g. 32 bytes. In particular it requires:

* keys must be uniformly distributed
* keys must be at least 8 bytes (64bits), but can otherwise be variable length
* keys should ideally be at least 8 bytes (64bits), but can otherwise be variable length

For this important special case, we can do significantly better than storing a
whole key per page: we can typically store just 8 bytes (64bits) per page. This
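To make the "8 bytes (64 bits) per page" idea above concrete, here is a simplified sketch, not the library's actual data structure (the real compact index also handles clashes between equal prefixes and multi-page entries): the index keeps one 64-bit prefix of each page's minimum key, and a search compares the query key's prefix against them. With the relaxed requirement, keys shorter than 8 bytes simply yield zero-padded prefixes. The names PagePrefixes and candidatePage are illustrative only.

import Data.Word (Word64)
import qualified Data.Vector.Primitive as VP

-- One 64-bit prefix per page: the prefix of each page's minimum serialised key.
-- Keys shorter than 8 bytes are zero-padded, so they still fit this scheme;
-- they just carry fewer distinguishing bits.
newtype PagePrefixes = PagePrefixes (VP.Vector Word64)

-- Candidate page for a query key's 64-bit prefix: the last page whose minimum
-- prefix is not greater than the query prefix. Assumes a non-empty vector in
-- ascending order; a linear scan stands in for the real search.
candidatePage :: Word64 -> PagePrefixes -> Int
candidatePage p (PagePrefixes prefixes) =
    case VP.findIndex (> p) prefixes of
      Nothing -> VP.length prefixes - 1
      Just 0  -> 0
      Just i  -> i - 1
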
24 changes: 0 additions & 24 deletions src-extras/Database/LSMTree/Extras/Generators.hs
@@ -33,8 +33,6 @@ module Database.LSMTree.Extras.Generators (
, genRawBytesSized
, packRawBytesPinnedOrUnpinned
, LargeRawBytes (..)
, isKeyForIndexCompact
, KeyForIndexCompact (..)
, BiasedKey (..)
-- * helpers
, shrinkVec
@@ -510,28 +508,6 @@ instance Arbitrary LargeRawBytes where

deriving newtype instance SerialiseValue LargeRawBytes

-- Serialised keys for the compact index must be at least 8 bytes long.

genKeyForIndexCompact :: Gen RawBytes
genKeyForIndexCompact =
genRawBytesN =<< QC.sized (\s -> QC.chooseInt (8, s + 8))

isKeyForIndexCompact :: RawBytes -> Bool
isKeyForIndexCompact rb = RB.size rb >= 8

newtype KeyForIndexCompact =
KeyForIndexCompact { getKeyForIndexCompact :: RawBytes }
deriving stock (Eq, Ord, Show)

instance Arbitrary KeyForIndexCompact where
arbitrary =
KeyForIndexCompact <$> genKeyForIndexCompact
shrink (KeyForIndexCompact rawBytes) =
[KeyForIndexCompact rawBytes' | rawBytes' <- shrink rawBytes,
isKeyForIndexCompact rawBytes']

deriving newtype instance SerialiseKey KeyForIndexCompact

-- we try to make collisions and close keys more likely (very crudely)
arbitraryBiasedKey :: (RawBytes -> k) -> Gen RawBytes -> Gen k
arbitraryBiasedKey fromRB genUnbiased = fromRB <$> frequency
3 changes: 1 addition & 2 deletions src/Database/LSMTree.hs
@@ -140,7 +140,6 @@ module Database.LSMTree (
serialiseKeyIdentity,
serialiseKeyIdentityUpToSlicing,
serialiseKeyPreservesOrdering,
serialiseKeyMinimalSize,
serialiseValueIdentity,
serialiseValueIdentityUpToSlicing,
packSlice,
@@ -216,7 +215,7 @@ import Database.LSMTree.Internal.Config
DiskCachePolicy (..), FencePointerIndexType (..),
LevelNo (..), MergePolicy (..), MergeSchedule (..),
SizeRatio (..), TableConfig (..), WriteBufferAlloc (..),
defaultTableConfig, serialiseKeyMinimalSize)
defaultTableConfig)
import Database.LSMTree.Internal.Config.Override
(OverrideDiskCachePolicy (..))
import Database.LSMTree.Internal.Entry (NumEntries (..))
14 changes: 1 addition & 13 deletions src/Database/LSMTree/Internal/Config.hs
@@ -20,7 +20,6 @@ module Database.LSMTree.Internal.Config (
-- * Fence pointer index
, FencePointerIndexType (..)
, indexTypeForRun
, serialiseKeyMinimalSize
-- * Disk cache policy
, DiskCachePolicy (..)
, diskCachePolicyForLevel
@@ -32,11 +31,9 @@ import Control.DeepSeq (NFData (..))
import Database.LSMTree.Internal.Index (IndexType)
import qualified Database.LSMTree.Internal.Index as Index
(IndexType (Compact, Ordinary))
import qualified Database.LSMTree.Internal.RawBytes as RB
import Database.LSMTree.Internal.Run (RunDataCaching (..))
import Database.LSMTree.Internal.RunAcc (RunBloomFilterAlloc (..))
import Database.LSMTree.Internal.RunBuilder (RunParams (..))
import Database.LSMTree.Internal.Serialise.Class (SerialiseKey (..))

newtype LevelNo = LevelNo Int
deriving stock (Show, Eq, Ord)
@@ -303,12 +300,7 @@ data FencePointerIndexType =
| {- |
Compact indexes are designed for the case where the keys in the database are uniformly distributed, e.g., when the keys are hashes.

When using a compact index, the 'Database.LSMTree.Internal.Serialise.Class.serialiseKey' function must satisfy the following additional law:

[Minimal size]
@'Database.LSMTree.Internal.RawBytes.size' ('Database.LSMTree.Internal.Serialise.Class.serialiseKey' x) >= 8@

Use 'serialiseKeyMinimalSize' to test this law.
When using a compact index, serialised keys should ideally contain 8 bytes or more.
-}
CompactIndex
deriving stock (Eq, Show)
@@ -321,10 +313,6 @@ indexTypeForRun :: FencePointerIndexType -> IndexType
indexTypeForRun CompactIndex = Index.Compact
indexTypeForRun OrdinaryIndex = Index.Ordinary

-- | Test the __Minimal size__ law for the 'CompactIndex' option.
serialiseKeyMinimalSize :: SerialiseKey k => k -> Bool
serialiseKeyMinimalSize x = RB.size (serialiseKey x) >= 8

{-------------------------------------------------------------------------------
Disk cache policy
-------------------------------------------------------------------------------}
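For reference, the compact index is selected through the table configuration exported from this module. A minimal sketch of opting in, assuming the relevant TableConfig field is named confFencePointerIndex (the field name is an assumption, not confirmed by this diff; TableConfig (..), defaultTableConfig and CompactIndex are exported as shown above):

import Database.LSMTree.Internal.Config
         (FencePointerIndexType (CompactIndex), TableConfig (..),
          defaultTableConfig)

-- A table configuration that opts into the compact index. With this change,
-- keys shorter than 8 bytes are accepted as well; they merely contribute
-- fewer distinguishing bits per fence pointer. NOTE: the field name
-- confFencePointerIndex is assumed here, check the actual TableConfig fields.
compactIndexConfig :: TableConfig
compactIndexConfig =
    defaultTableConfig { confFencePointerIndex = CompactIndex }
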
4 changes: 2 additions & 2 deletions src/Database/LSMTree/Internal/MergeSchedule.hs
@@ -666,7 +666,7 @@ addRunToLevels tr conf@TableConfig{..} resolve hfs hbio root uc r0 reg levels ul
traceWith tr $ AtLevel ln TraceAddLevel
-- Make a new level
let policyForLevel = mergePolicyForLevel confMergePolicy ln V.empty ul
ir <- newMerge policyForLevel MR.MergeLastLevel ln rs
ir <- newMerge policyForLevel (mergeTypeForLevel V.empty ul) ln rs
pure $! V.singleton $ Level ir V.empty
go !ln rs' (V.uncons -> Just (Level ir rs, ls)) = do
r <- expectCompletedMerge ln ir
@@ -707,7 +707,7 @@
-- Otherwise we start merging the incoming runs into the run.
LevelLevelling -> do
assert (V.null rs && V.null ls) $ pure ()
ir' <- newMerge LevelLevelling MR.MergeLastLevel ln (rs' `V.snoc` r)
ir' <- newMerge LevelLevelling (mergeTypeForLevel ls ul) ln (rs' `V.snoc` r)
pure $! Level ir' V.empty `V.cons` V.empty

-- Releases the incoming run.
26 changes: 22 additions & 4 deletions src/Database/LSMTree/Internal/RawBytes.hs
@@ -51,7 +51,7 @@ module Database.LSMTree.Internal.RawBytes (
) where

import Control.DeepSeq (NFData)
import Control.Exception (assert)
import Data.Bits (Bits (..))
import Data.BloomFilter.Hash (Hashable (..), hashByteArray)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BB
@@ -71,6 +71,9 @@ import GHC.Stack
import GHC.Word
import Text.Printf (printf)

-- $setup
-- >>> import Numeric

{- Note: [Export structure]
~~~~~~~~~~~~~~~~~~~~~~~
Since RawBytes are very similar to Primitive Vectors, the code is sectioned
@@ -172,14 +175,29 @@ drop = coerce VP.drop
--
-- The /top/ corresponds to the most significant bit (big-endian).
--
-- PRECONDITION: The byte-size of the raw bytes should be at least 8 bytes.
-- If the number of bits is smaller than @64@, then any missing bits default to
-- @0@s.
--
-- >>> showHex (topBits64 (pack [1,0,0,0,0,0,0,0])) ""
-- "100000000000000"
--
-- >>> showHex (topBits64 (pack [1,0,0])) ""
-- "100000000000000"
--
-- TODO: optimisation ideas: use unsafe shift/byteswap primops, look at GHC
-- core, find other opportunities for using primops.
--
topBits64 :: RawBytes -> Word64
topBits64 rb@(RawBytes (VP.Vector (I# off#) _size (ByteArray k#))) =
assert (size rb >= 8) $ toWord64 (indexWord8ArrayAsWord64# k# off#)
topBits64 rb@(RawBytes v@(VP.Vector (I# off#) _size (ByteArray k#)))
| n >= 8
= toWord64 (indexWord8ArrayAsWord64# k# off#)
| otherwise
= VP.foldl' f 0 v `unsafeShiftL` ((8 - n) * 8)
where
!n = size rb

f :: Word64 -> Word8 -> Word64
f acc w = acc `unsafeShiftL` 8 + fromIntegral w

#if (MIN_VERSION_GLASGOW_HASKELL(9, 4, 0, 0))
toWord64 :: Word64# -> Word64
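The fallback path of topBits64 above folds the n available bytes into the low 8 * n bits of the result and then shifts left by (8 - n) * 8 bits, so the bytes end up in the most significant positions, consistent with the fast path applied to a zero-padded key. A worked example of that arithmetic for a 3-byte key (a standalone sketch; the name example is ours, not from the library):

import Data.Bits (shiftL)
import Data.Word (Word64)
import Numeric (showHex)

-- For the 3-byte key [0x01, 0x02, 0x03]: the fold packs the bytes into the
-- low 24 bits, and the final shift by (8 - 3) * 8 = 40 bits moves them into
-- the most significant positions.
example :: String
example = showHex (folded `shiftL` ((8 - 3) * 8) :: Word64) ""
  where
    folded = ((0x01 `shiftL` 8 + 0x02) `shiftL` 8) + 0x03   -- == 0x010203
-- example == "102030000000000", i.e. 0x0102030000000000
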
6 changes: 2 additions & 4 deletions src/Database/LSMTree/Simple.hs
@@ -132,7 +132,6 @@ module Database.LSMTree.Simple (
serialiseKeyIdentity,
serialiseKeyIdentityUpToSlicing,
serialiseKeyPreservesOrdering,
serialiseKeyMinimalSize,
serialiseValueIdentity,
serialiseValueIdentityUpToSlicing,
packSlice,
@@ -178,9 +177,8 @@ import Database.LSMTree (BloomFilterAlloc, CursorClosedError (..),
TableTooLargeError (..), UnionCredits (..), UnionDebt (..),
WriteBufferAlloc, isValidSnapshotName, packSlice,
serialiseKeyIdentity, serialiseKeyIdentityUpToSlicing,
serialiseKeyMinimalSize, serialiseKeyPreservesOrdering,
serialiseValueIdentity, serialiseValueIdentityUpToSlicing,
toSnapshotName)
serialiseKeyPreservesOrdering, serialiseValueIdentity,
serialiseValueIdentityUpToSlicing, toSnapshotName)
import qualified Database.LSMTree as LSMT
import Prelude hiding (lookup, take, takeWhile)

3 changes: 0 additions & 3 deletions test/Test/Database/LSMTree/Generators.hs
@@ -63,9 +63,6 @@ tests = testGroup "Test.Database.LSMTree.Generators" [
prop_arbitraryAndShrinkPreserveInvariant
(\(LargeRawBytes rb) -> labelRawBytes rb)
(deepseqInvariant @LargeRawBytes)
, testGroup "KeyForIndexCompact" $
prop_arbitraryAndShrinkPreserveInvariant noTags $
isKeyForIndexCompact . getKeyForIndexCompact
, testGroup "BiasedKey" $
prop_arbitraryAndShrinkPreserveInvariant
(labelTestKOps @BiasedKey)
13 changes: 4 additions & 9 deletions test/Test/Database/LSMTree/Internal/Index/Compact.hs
@@ -28,7 +28,7 @@ import Data.Word
import Database.LSMTree.Extras
import Database.LSMTree.Extras.Generators (ChunkSize (..),
LogicalPageSummaries, LogicalPageSummary (..), Pages (..),
genRawBytes, isKeyForIndexCompact, labelPages, toAppends)
genRawBytes, labelPages, toAppends)
import Database.LSMTree.Extras.Index (Append (..), appendToCompact)
import Database.LSMTree.Internal.BitMath
import Database.LSMTree.Internal.Chunk as Chunk (toByteString)
@@ -54,9 +54,7 @@ import Text.Printf (printf)

tests :: TestTree
tests = testGroup "Test.Database.LSMTree.Internal.Index.Compact" [
testGroup "TestKey" $
prop_arbitraryAndShrinkPreserveInvariant @TestKey noTags isTestKey
, testProperty "prop_distribution @TestKey" $
testProperty "prop_distribution @TestKey" $
prop_distribution @TestKey
, testProperty "prop_searchMinMaxKeysAfterConstruction" $
prop_searchMinMaxKeysAfterConstruction @TestKey 100
@@ -173,15 +171,12 @@ instance Arbitrary TestKey where
-- Shrink keys extensively: most failures will occur in small counterexamples,
-- so we don't have to limit the number of shrinks as much.
shrink (TestKey bytes) = [
TestKey bytes'
testkey'
| let RawBytes vec = bytes
, vec' <- VP.fromList <$> shrink (VP.toList vec)
, let bytes' = RawBytes vec'
, isKeyForIndexCompact bytes'
, let testkey' = TestKey $ RawBytes vec'
]

isTestKey :: TestKey -> Bool
isTestKey (TestKey bytes) = isKeyForIndexCompact bytes

{-------------------------------------------------------------------------------
Properties
10 changes: 3 additions & 7 deletions test/Test/Database/LSMTree/Internal/Lookup.hs
@@ -60,7 +60,7 @@ import qualified Database.LSMTree.Internal.Run as Run
import Database.LSMTree.Internal.RunAcc as Run
import Database.LSMTree.Internal.RunBuilder
(RunDataCaching (CacheRunData), RunParams (RunParams))
import Database.LSMTree.Internal.Serialise
import Database.LSMTree.Internal.Serialise as Serialise
import Database.LSMTree.Internal.Serialise.Class
import Database.LSMTree.Internal.UniqCounter
import qualified Database.LSMTree.Internal.WriteBuffer as WB
@@ -563,14 +563,10 @@ liftShrink3InMemLookupData shrinkKey shrinkValue shrinkBlob InMemLookupData{ run
shrinkEntry = liftShrink2 shrinkValue shrinkBlob

genSerialisedKey :: Gen SerialisedKey
genSerialisedKey = frequency [
(9, arbitrary `suchThat` (\k -> sizeofKey k >= 8))
, (1, do x <- getSmall <$> arbitrary
pure $ SerialisedKey (RB.pack [0,0,0,0,0,0,0, x]))
]
genSerialisedKey = Serialise.serialiseKey <$> arbitraryBoundedIntegral @Word64

shrinkSerialisedKey :: SerialisedKey -> [SerialisedKey]
shrinkSerialisedKey k = [k' | k' <- shrink k, sizeofKey k' >= 8]
shrinkSerialisedKey k = Serialise.serialiseKey <$> shrink (Serialise.deserialiseKey k :: Word64)

genSerialisedValue :: Gen SerialisedValue
genSerialisedValue = frequency [ (50, arbitrary), (1, genLongValue) ]
32 changes: 29 additions & 3 deletions test/Test/Database/LSMTree/Internal/RawBytes.hs
@@ -1,8 +1,13 @@
{-# LANGUAGE OverloadedLists #-}

module Test.Database.LSMTree.Internal.RawBytes (tests) where

import Data.Bits (Bits (shiftL))
import qualified Data.List as List
import qualified Data.Vector.Primitive as VP
import Database.LSMTree.Extras.Generators ()
import Database.LSMTree.Internal.RawBytes (RawBytes)
import qualified Database.LSMTree.Internal.RawBytes as RB (size)
import Database.LSMTree.Internal.RawBytes (RawBytes (RawBytes))
import qualified Database.LSMTree.Internal.RawBytes as RB
import Test.QuickCheck (Property, classify, collect, mapSize,
withDiscardRatio, withMaxSuccess, (.||.), (===), (==>))
import Test.Tasty (TestTree, testGroup)
@@ -26,7 +31,9 @@ tests = testGroup "Test.Database.LSMTree.Internal.RawBytes" $
testProperty "Transitivity" prop_ordTransitivity,
testProperty "Reflexivity" prop_ordReflexivity,
testProperty "Antisymmetry" prop_ordAntisymmetry
]
],
testProperty "prop_topBits64" prop_topBits64,
testProperty "prop_topBits64_default0s" prop_topBits64_default0s
]

-- * Utilities
@@ -92,3 +99,22 @@ prop_ordAntisymmetry = mapSize (const 4) $
untunedProp block1 block2
= withFirstBlockSizeInfo block1 $
block1 <= block2 && block2 <= block1 ==> block1 === block2

{-------------------------------------------------------------------------------
Accessors
-------------------------------------------------------------------------------}

-- | Compare 'topBits64' against a model
prop_topBits64 :: RawBytes -> Property
prop_topBits64 x@(RawBytes v) =
expected === RB.topBits64 x
where
expected =
let ws = take 8 (VP.toList v ++ repeat 0)
in List.foldl' (\acc w -> acc `shiftL` 8 + fromIntegral w) 0 ws

-- | If @x@ has fewer than 8 bytes, then all missing bits in the result default
-- to 0s.
prop_topBits64_default0s :: RawBytes -> Property
prop_topBits64_default0s x =
RB.topBits64 x === RB.topBits64 (x <> mconcat (replicate 8 [0]))
5 changes: 2 additions & 3 deletions test/Test/Database/LSMTree/Internal/RunAcc.hs
@@ -140,8 +140,7 @@ fromProtoValue (Proto.Value bs) = SerialisedValue . RB.fromShortByteString $ SBS.
fromProtoBlobRef :: Proto.BlobRef -> BlobSpan
fromProtoBlobRef (Proto.BlobRef x y) = BlobSpan x y

-- | Wrapper around 'PageLogical' that generates nearly-full pages, and
-- keys that are always large enough (>= 8 bytes) for the compact index.
-- | Wrapper around 'PageLogical' that generates nearly-full pages.
newtype PageLogical' = PageLogical' { getPrototypeKOps :: [(Proto.Key, Proto.Operation)] }
deriving stock Show

@@ -150,7 +149,7 @@ getRealKOps = fmap fromProtoKOp . getPrototypeKOps

instance Arbitrary PageLogical' where
arbitrary = PageLogical' <$>
Proto.genPageContentFits Proto.DiskPage4k (Proto.MinKeySize 8)
Proto.genPageContentFits Proto.DiskPage4k Proto.noMinKeySize
shrink (PageLogical' page) =
[ PageLogical' page' | page' <- shrink page ]

6 changes: 3 additions & 3 deletions test/Test/Database/LSMTree/StateMachine.hs
@@ -92,12 +92,12 @@ import qualified Database.LSMTree as R
import Database.LSMTree.Class (Entry (..), LookupResult (..))
import qualified Database.LSMTree.Class as Class
import Database.LSMTree.Extras (showPowersOf)
import Database.LSMTree.Extras.Generators (KeyForIndexCompact)
import Database.LSMTree.Extras.Generators ()
import Database.LSMTree.Extras.NoThunks (propNoThunks)
import qualified Database.LSMTree.Internal.Config as R
(TableConfig (TableConfig))
import Database.LSMTree.Internal.Serialise (SerialisedBlob,
SerialisedValue)
SerialisedKey, SerialisedValue)
import qualified Database.LSMTree.Internal.Types as R.Types
import qualified Database.LSMTree.Internal.Unsafe as R.Unsafe
import qualified Database.LSMTree.Model.IO as ModelIO
@@ -567,7 +567,7 @@ handleFsError = Model.ErrFsError . displayException
Key and value types
-------------------------------------------------------------------------------}

newtype Key = Key KeyForIndexCompact
newtype Key = Key SerialisedKey
deriving stock (Show, Eq, Ord)
deriving newtype (Arbitrary, R.SerialiseKey)
