Skip to content

Commit 19a3f66

Browse files
[Enhancement] improve cloud native pk index rebuild and compaction strategy (backport #47065) (#47227)
Signed-off-by: Yixin Luo <[email protected]> Co-authored-by: Yixin Luo <[email protected]>
1 parent 573b080 commit 19a3f66

File tree

9 files changed

+231
-64
lines changed

9 files changed

+231
-64
lines changed

be/src/common/config.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -986,7 +986,9 @@ CONF_mInt64(lake_pk_compaction_min_input_segments, "5");
986986
// Used for control memory usage of update state cache and compaction state cache
987987
CONF_mInt32(lake_pk_preload_memory_limit_percent, "30");
988988
CONF_mInt32(lake_pk_index_sst_min_compaction_versions, "2");
989-
CONF_mInt32(lake_pk_index_sst_max_compaction_bytes, /*1GB*/ "1073741824");
989+
CONF_mInt32(lake_pk_index_sst_max_compaction_versions, "100");
990+
// When the ratio of cumulative level to base level is greater than this config, use base merge.
991+
CONF_mDouble(lake_pk_index_cumulative_base_compaction_ratio, "0.1");
990992
CONF_Int32(lake_pk_index_block_cache_limit_percent, "10");
991993

992994
CONF_mBool(dependency_librdkafka_debug_enable, "false");

be/src/storage/lake/lake_persistent_index.cpp

Lines changed: 106 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -67,12 +67,18 @@ void KeyValueMerger::flush() {
6767

6868
IndexValuesWithVerPB index_value_pb;
6969
for (const auto& index_value_with_ver : _index_value_vers) {
70+
if (_merge_base_level && index_value_with_ver.second == IndexValue(NullIndexValue)) {
71+
// deleted
72+
continue;
73+
}
7074
auto* value = index_value_pb.add_values();
7175
value->set_version(index_value_with_ver.first);
7276
value->set_rssid(index_value_with_ver.second.get_rssid());
7377
value->set_rowid(index_value_with_ver.second.get_rowid());
7478
}
75-
_builder->Add(Slice(_key), Slice(index_value_pb.SerializeAsString()));
79+
if (index_value_pb.values_size() > 0) {
80+
_builder->Add(Slice(_key), Slice(index_value_pb.SerializeAsString()));
81+
}
7682
_index_value_vers.clear();
7783
}
7884

@@ -130,7 +136,7 @@ Status LakePersistentIndex::minor_compact() {
130136
PersistentIndexSstablePB sstable_pb;
131137
sstable_pb.set_filename(filename);
132138
sstable_pb.set_filesize(filesize);
133-
sstable_pb.set_version(_immutable_memtable->max_version());
139+
sstable_pb.set_max_rss_rowid(_immutable_memtable->max_rss_rowid());
134140
auto* block_cache = _tablet_mgr->update_mgr()->block_cache();
135141
if (block_cache == nullptr) {
136142
return Status::InternalError("Block cache is null.");
@@ -260,10 +266,54 @@ Status LakePersistentIndex::replace(size_t n, const Slice* keys, const IndexValu
260266
return Status::OK();
261267
}
262268

269+
void LakePersistentIndex::pick_sstables_for_merge(const PersistentIndexSstableMetaPB& sstable_meta,
270+
std::vector<PersistentIndexSstablePB>* sstables,
271+
bool* merge_base_level) {
272+
// There are two levels in persistent index:
273+
// 1) base level. It contains only one sst file.
274+
// 2) cumulative level. Sst files that except base level.
275+
// And there are two kinds of merge:
276+
// 1) base merge. Merge all sst files.
277+
// 2) cumulative merge. Only merge cumulative sst files.
278+
//
279+
// And we use this strategy to decide whether to use base merge or cumulative merge:
280+
// 1. When total size of cumulative level sst files reach 1/10 of base level, use base merge.
281+
// 2. Otherwise, use cumulative merge.
282+
DCHECK(sstable_meta.sstables_size() > 0);
283+
int64_t base_level_bytes = 0;
284+
int64_t cumulative_level_bytes = 0;
285+
std::vector<PersistentIndexSstablePB> cumulative_sstables;
286+
for (int i = 0; i < sstable_meta.sstables_size(); i++) {
287+
if (i == 0) {
288+
base_level_bytes = sstable_meta.sstables(i).filesize();
289+
} else {
290+
cumulative_level_bytes += sstable_meta.sstables(i).filesize();
291+
cumulative_sstables.push_back(sstable_meta.sstables(i));
292+
}
293+
}
294+
295+
if ((double)base_level_bytes * config::lake_pk_index_cumulative_base_compaction_ratio >
296+
(double)cumulative_level_bytes) {
297+
// cumulative merge
298+
sstables->swap(cumulative_sstables);
299+
*merge_base_level = false;
300+
} else {
301+
// base merge
302+
sstables->push_back(sstable_meta.sstables(0));
303+
sstables->insert(sstables->end(), cumulative_sstables.begin(), cumulative_sstables.end());
304+
*merge_base_level = true;
305+
}
306+
// Limit max sstable count that can do merge, to avoid cost too much memory.
307+
const int32_t max_limit = config::lake_pk_index_sst_max_compaction_versions;
308+
if (sstables->size() > max_limit) {
309+
sstables->resize(max_limit);
310+
}
311+
}
312+
263313
Status LakePersistentIndex::prepare_merging_iterator(
264314
TabletManager* tablet_mgr, const TabletMetadata& metadata, TxnLogPB* txn_log,
265315
std::vector<std::shared_ptr<PersistentIndexSstable>>* merging_sstables,
266-
std::unique_ptr<sstable::Iterator>* merging_iter_ptr) {
316+
std::unique_ptr<sstable::Iterator>* merging_iter_ptr, bool* merge_base_level) {
267317
sstable::ReadOptions read_options;
268318
// No need to cache input sst's blocks.
269319
read_options.fill_cache = false;
@@ -274,11 +324,16 @@ Status LakePersistentIndex::prepare_merging_iterator(
274324
}
275325
});
276326

277-
auto max_compaction_bytes = config::lake_pk_index_sst_max_compaction_bytes;
278327
iters.reserve(metadata.sstable_meta().sstables().size());
279-
size_t total_filesize = 0;
280328
std::stringstream ss_debug;
281-
for (const auto& sstable_pb : metadata.sstable_meta().sstables()) {
329+
std::vector<PersistentIndexSstablePB> sstables_to_merge;
330+
// Pick sstable for merge, decide to use base merge or cumulative merge.
331+
pick_sstables_for_merge(metadata.sstable_meta(), &sstables_to_merge, merge_base_level);
332+
if (sstables_to_merge.size() <= 1) {
333+
// no need to do merge
334+
return Status::OK();
335+
}
336+
for (const auto& sstable_pb : sstables_to_merge) {
282337
// build sstable from meta, instead of reuse `_sstables`, to keep it thread safe
283338
ASSIGN_OR_RETURN(auto rf,
284339
fs::new_random_access_file(tablet_mgr->sst_location(metadata.id(), sstable_pb.filename())));
@@ -287,14 +342,9 @@ Status LakePersistentIndex::prepare_merging_iterator(
287342
merging_sstables->push_back(merging_sstable);
288343
sstable::Iterator* iter = merging_sstable->new_iterator(read_options);
289344
iters.emplace_back(iter);
290-
total_filesize += sstable_pb.filesize();
291345
// add input sstable.
292346
txn_log->mutable_op_compaction()->add_input_sstables()->CopyFrom(merging_sstable->sstable_pb());
293347
ss_debug << sstable_pb.filename() << " | ";
294-
if (total_filesize >= max_compaction_bytes &&
295-
merging_sstables->size() >= config::lake_pk_index_sst_min_compaction_versions) {
296-
break;
297-
}
298348
}
299349
sstable::Options options;
300350
(*merging_iter_ptr).reset(sstable::NewMergingIterator(options.comparator, &iters[0], iters.size()));
@@ -304,9 +354,9 @@ Status LakePersistentIndex::prepare_merging_iterator(
304354
return Status::OK();
305355
}
306356

307-
Status LakePersistentIndex::merge_sstables(std::unique_ptr<sstable::Iterator> iter_ptr,
308-
sstable::TableBuilder* builder) {
309-
auto merger = std::make_unique<KeyValueMerger>(iter_ptr->key().to_string(), builder);
357+
Status LakePersistentIndex::merge_sstables(std::unique_ptr<sstable::Iterator> iter_ptr, sstable::TableBuilder* builder,
358+
bool base_level_merge) {
359+
auto merger = std::make_unique<KeyValueMerger>(iter_ptr->key().to_string(), builder, base_level_merge);
310360
while (iter_ptr->Valid()) {
311361
RETURN_IF_ERROR(merger->merge(iter_ptr->key().to_string(), iter_ptr->value().to_string()));
312362
iter_ptr->Next();
@@ -324,8 +374,14 @@ Status LakePersistentIndex::major_compact(TabletManager* tablet_mgr, const Table
324374

325375
std::vector<std::shared_ptr<PersistentIndexSstable>> sstable_vec;
326376
std::unique_ptr<sstable::Iterator> merging_iter_ptr;
377+
bool merge_base_level = false;
327378
// build merge iterator
328-
RETURN_IF_ERROR(prepare_merging_iterator(tablet_mgr, metadata, txn_log, &sstable_vec, &merging_iter_ptr));
379+
RETURN_IF_ERROR(prepare_merging_iterator(tablet_mgr, metadata, txn_log, &sstable_vec, &merging_iter_ptr,
380+
&merge_base_level));
381+
if (merging_iter_ptr == nullptr) {
382+
// no need to do merge
383+
return Status::OK();
384+
}
329385
if (!merging_iter_ptr->Valid()) {
330386
return merging_iter_ptr->status();
331387
}
@@ -338,7 +394,7 @@ Status LakePersistentIndex::major_compact(TabletManager* tablet_mgr, const Table
338394
filter_policy.reset(const_cast<sstable::FilterPolicy*>(sstable::NewBloomFilterPolicy(10)));
339395
options.filter_policy = filter_policy.get();
340396
sstable::TableBuilder builder(options, wf.get());
341-
RETURN_IF_ERROR(merge_sstables(std::move(merging_iter_ptr), &builder));
397+
RETURN_IF_ERROR(merge_sstables(std::move(merging_iter_ptr), &builder, merge_base_level));
342398
RETURN_IF_ERROR(wf->close());
343399

344400
// record output sstable pb
@@ -354,7 +410,8 @@ Status LakePersistentIndex::apply_opcompaction(const TxnLogPB_OpCompaction& op_c
354410

355411
PersistentIndexSstablePB sstable_pb;
356412
sstable_pb.CopyFrom(op_compaction.output_sstable());
357-
sstable_pb.set_version(op_compaction.input_sstables(op_compaction.input_sstables().size() - 1).version());
413+
sstable_pb.set_max_rss_rowid(
414+
op_compaction.input_sstables(op_compaction.input_sstables().size() - 1).max_rss_rowid());
358415
auto sstable = std::make_unique<PersistentIndexSstable>();
359416
ASSIGN_OR_RETURN(auto rf, fs::new_random_access_file(_tablet_mgr->sst_location(_tablet_id, sstable_pb.filename())));
360417
auto* block_cache = _tablet_mgr->update_mgr()->block_cache();
@@ -367,24 +424,31 @@ Status LakePersistentIndex::apply_opcompaction(const TxnLogPB_OpCompaction& op_c
367424
for (const auto& input_sstable : op_compaction.input_sstables()) {
368425
filenames.insert(input_sstable.filename());
369426
}
427+
// Erase merged sstable from sstable list
370428
_sstables.erase(std::remove_if(_sstables.begin(), _sstables.end(),
371429
[&](const std::unique_ptr<PersistentIndexSstable>& sstable) {
372430
return filenames.contains(sstable->sstable_pb().filename());
373431
}),
374432
_sstables.end());
375-
_sstables.insert(_sstables.begin(), std::move(sstable));
433+
// Insert sstable to sstable list by `max_rss_rowid` order.
434+
auto lower_it = std::lower_bound(
435+
_sstables.begin(), _sstables.end(), sstable,
436+
[](const std::unique_ptr<PersistentIndexSstable>& a, const std::unique_ptr<PersistentIndexSstable>& b) {
437+
return a->sstable_pb().max_rss_rowid() < b->sstable_pb().max_rss_rowid();
438+
});
439+
_sstables.insert(lower_it, std::move(sstable));
376440
return Status::OK();
377441
}
378442

379443
Status LakePersistentIndex::commit(MetaFileBuilder* builder) {
380444
PersistentIndexSstableMetaPB sstable_meta;
381-
int64_t last_version = 0;
445+
int64_t last_max_rss_rowid = 0;
382446
for (auto& sstable : _sstables) {
383-
int64_t sstable_version = sstable->sstable_pb().version();
384-
if (last_version > sstable_version) {
385-
return Status::InternalError("Versions of sstables are not ordered");
447+
int64_t max_rss_rowid = sstable->sstable_pb().max_rss_rowid();
448+
if (last_max_rss_rowid > max_rss_rowid) {
449+
return Status::InternalError("sstables are not ordered");
386450
}
387-
last_version = sstable_version;
451+
last_max_rss_rowid = max_rss_rowid;
388452
auto* sstable_pb = sstable_meta.add_sstables();
389453
sstable_pb->CopyFrom(sstable->sstable_pb());
390454
}
@@ -406,13 +470,9 @@ Status LakePersistentIndex::load_from_lake_tablet(TabletManager* tablet_mgr, con
406470
_key_size = PrimaryKeyEncoder::get_encoded_fixed_size(pkey_schema);
407471

408472
const auto& sstables = metadata->sstable_meta().sstables();
409-
int64_t max_sstable_version = sstables.empty() ? 0 : sstables.rbegin()->version();
410-
if (max_sstable_version > base_version) {
411-
return Status::OK();
412-
}
413-
TRACE_COUNTER_INCREMENT("max_sstable_version", max_sstable_version);
414-
TRACE_COUNTER_INCREMENT("new_version", metadata->version());
415-
473+
// Rebuild persistent index from `rebuild_rss_rowid_point`
474+
const uint64_t rebuild_rss_rowid_point = sstables.empty() ? 0 : sstables.rbegin()->max_rss_rowid();
475+
const uint32_t rebuild_rss_id = rebuild_rss_rowid_point >> 32;
416476
OlapReaderStatistics stats;
417477
std::unique_ptr<Column> pk_column;
418478
if (pk_columns.size() > 1) {
@@ -428,18 +488,13 @@ Status LakePersistentIndex::load_from_lake_tablet(TabletManager* tablet_mgr, con
428488
auto rowsets = Rowset::get_rowsets(tablet_mgr, metadata);
429489
// Rowsets that contain segments at or after the rebuild point (`rebuild_rss_id`) should be recovered.
430490
for (auto& rowset : rowsets) {
431-
TRACE_COUNTER_INCREMENT("total_rowsets", 1);
432-
TRACE_COUNTER_INCREMENT("total_segments", rowset->num_segments());
433-
TRACE_COUNTER_INCREMENT("total_datasize_bytes", rowset->data_size());
491+
TRACE_COUNTER_INCREMENT("total_segment_cnt", rowset->num_segments());
434492
TRACE_COUNTER_INCREMENT("total_num_rows", rowset->num_rows());
435-
// If it is upgraded from old version of sr, the rowset version will be not set.
436-
// The generated rowset version will be treated as base_version.
437-
int64_t rowset_version = rowset->version() != 0 ? rowset->version() : base_version;
438-
// The data whose version is max_sstable_version in memtable may be not flushed to sstable.
439-
// So rowset whose version is max_sstable_version should also be recovered.
440-
if (rowset_version < max_sstable_version) {
493+
if (rowset->id() + rowset->num_segments() <= rebuild_rss_id) {
494+
// None of the segments in this rowset need to be rebuilt
441495
continue;
442496
}
497+
const int64_t rowset_version = rowset->version() != 0 ? rowset->version() : base_version;
443498
auto res = rowset->get_each_segment_iterator_with_delvec(pkey_schema, base_version, builder, &stats);
444499
if (!res.ok()) {
445500
return res.status();
@@ -451,6 +506,13 @@ Status LakePersistentIndex::load_from_lake_tablet(TabletManager* tablet_mgr, con
451506
if (itr == nullptr) {
452507
continue;
453508
}
509+
if (rowset->id() + i < rebuild_rss_id) {
510+
// lower than rebuild point, skip
511+
// Notice: segment id that equal `rebuild_rss_id` can't be skip because
512+
// there are maybe some rows need to rebuild.
513+
continue;
514+
}
515+
TRACE_COUNTER_INCREMENT("rebuild_index_segment_cnt", 1);
454516
while (true) {
455517
chunk->reset();
456518
rowids.clear();
@@ -476,6 +538,11 @@ Status LakePersistentIndex::load_from_lake_tablet(TabletManager* tablet_mgr, con
476538
for (uint32_t i = 0; i < pkc->size(); i++) {
477539
values.emplace_back(base + rowids[i]);
478540
}
541+
if (values.back().get_value() <= rebuild_rss_rowid_point) {
542+
// lower than or equal to the rebuild point, skip
543+
continue;
544+
}
545+
TRACE_COUNTER_INCREMENT("rebuild_index_num_rows", pkc->size());
479546
Status st;
480547
if (pkc->is_binary()) {
481548
RETURN_IF_ERROR(insert(pkc->size(), reinterpret_cast<const Slice*>(pkc->raw_data()),
@@ -495,10 +562,6 @@ Status LakePersistentIndex::load_from_lake_tablet(TabletManager* tablet_mgr, con
495562
}
496563
itr->close();
497564
}
498-
TRACE_COUNTER_INCREMENT("loaded_rowsets", 1);
499-
TRACE_COUNTER_INCREMENT("loaded_segments", rowset->num_segments());
500-
TRACE_COUNTER_INCREMENT("loaded_datasize_bytes", rowset->data_size());
501-
TRACE_COUNTER_INCREMENT("loaded_num_rows", rowset->num_rows());
502565
}
503566
return Status::OK();
504567
}

be/src/storage/lake/lake_persistent_index.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ using IndexValueWithVer = std::pair<int64_t, IndexValue>;
3939

4040
class KeyValueMerger {
4141
public:
42-
explicit KeyValueMerger(const std::string& key, sstable::TableBuilder* builder)
43-
: _key(std::move(key)), _builder(builder) {}
42+
explicit KeyValueMerger(const std::string& key, sstable::TableBuilder* builder, bool merge_base_level)
43+
: _key(std::move(key)), _builder(builder), _merge_base_level(merge_base_level) {}
4444

4545
Status merge(const std::string& key, const std::string& value);
4646

@@ -53,6 +53,8 @@ class KeyValueMerger {
5353
std::string _key;
5454
sstable::TableBuilder* _builder;
5555
std::list<IndexValueWithVer> _index_value_vers;
56+
// When merging the base level, NullIndexValue (deleted) items can be dropped safely.
57+
bool _merge_base_level = false;
5658
};
5759

5860
// LakePersistentIndex is not thread-safe.
@@ -125,6 +127,9 @@ class LakePersistentIndex : public PersistentIndex {
125127

126128
size_t memory_usage() const override;
127129

130+
static void pick_sstables_for_merge(const PersistentIndexSstableMetaPB& sstable_meta,
131+
std::vector<PersistentIndexSstablePB>* sstables, bool* merge_base_level);
132+
128133
private:
129134
Status flush_memtable();
130135

@@ -153,9 +158,11 @@ class LakePersistentIndex : public PersistentIndex {
153158
// get sstable's iterator that need to compact and modify txn_log
154159
static Status prepare_merging_iterator(TabletManager* tablet_mgr, const TabletMetadata& metadata, TxnLogPB* txn_log,
155160
std::vector<std::shared_ptr<PersistentIndexSstable>>* merging_sstables,
156-
std::unique_ptr<sstable::Iterator>* merging_iter_ptr);
161+
std::unique_ptr<sstable::Iterator>* merging_iter_ptr,
162+
bool* merge_base_level);
157163

158-
static Status merge_sstables(std::unique_ptr<sstable::Iterator> iter_ptr, sstable::TableBuilder* builder);
164+
static Status merge_sstables(std::unique_ptr<sstable::Iterator> iter_ptr, sstable::TableBuilder* builder,
165+
bool base_level_merge);
159166

160167
private:
161168
std::unique_ptr<PersistentIndexMemtable> _memtable;

be/src/storage/lake/persistent_index_memtable.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ Status PersistentIndexMemtable::upsert(size_t n, const Slice* keys, const IndexV
4444
nfound += old_value.get_value() != NullIndexValue;
4545
update_index_value(&old_index_value_vers, version, value);
4646
}
47+
_max_rss_rowid = std::max(_max_rss_rowid, value.get_value());
4748
}
4849
*num_found = nfound;
49-
_max_version = std::max(_max_version, version);
5050
return Status::OK();
5151
}
5252

@@ -65,8 +65,8 @@ Status PersistentIndexMemtable::insert(size_t n, const Slice* keys, const IndexV
6565
return Status::AlreadyExist(msg);
6666
}
6767
_keys_size += key.capacity() + sizeof(std::string);
68+
_max_rss_rowid = std::max(_max_rss_rowid, value.get_value());
6869
}
69-
_max_version = std::max(_max_version, version);
7070
return Status::OK();
7171
}
7272

@@ -90,7 +90,6 @@ Status PersistentIndexMemtable::erase(size_t n, const Slice* keys, IndexValue* o
9090
}
9191
}
9292
*num_found = nfound;
93-
_max_version = std::max(_max_version, version);
9493
return Status::OK();
9594
}
9695

@@ -106,8 +105,8 @@ Status PersistentIndexMemtable::replace(const Slice* keys, const IndexValue* val
106105
} else {
107106
_keys_size += key.capacity() + sizeof(std::string);
108107
}
108+
_max_rss_rowid = std::max(_max_rss_rowid, value.get_value());
109109
}
110-
_max_version = std::max(_max_version, version);
111110
return Status::OK();
112111
}
113112

be/src/storage/lake/persistent_index_memtable.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class PersistentIndexMemtable {
5858

5959
void clear();
6060

61-
const int64_t max_version() const { return _max_version; }
61+
const int64_t max_rss_rowid() const { return _max_rss_rowid; }
6262

6363
private:
6464
static void update_index_value(std::list<IndexValueWithVer>* index_value_info, int64_t version,
@@ -67,8 +67,8 @@ class PersistentIndexMemtable {
6767
private:
6868
// The size can be up to 230K. The performance of std::map may be poor.
6969
phmap::btree_map<std::string, std::list<IndexValueWithVer>, std::less<>> _map;
70-
int64_t _max_version{0};
7170
int64_t _keys_size{0};
71+
uint64_t _max_rss_rowid{0};
7272
};
7373

7474
} // namespace starrocks::lake

0 commit comments

Comments
 (0)