Skip to content
This repository was archived by the owner on May 3, 2024. It is now read-only.

Commit c3b78e4

Browse files
author
Tim Shaffer
authored
CORTX-33652: Add min_success parameter for index operations (#1991)
Problem: This PR adds a per-operation setting for the minimum number of successful CAS operations for distributed index operations. Initially, all CAS operations were required to succeed for the operation as a whole to succeed. A previous commit (f2ba0d1) changed the behavior to reduce IO errors during degraded mode. Currently, a single successful operation is sufficient to consider the parent operation successful. This can lead to consistency issues, however. In the read-after-write case, stale data may be returned from successful index operations if the child operations succeed on disjoint sets of CAS services. Solution: This PR changes adds a set sockopt-like function (m0_idx_op_setoption) for tuning parameters of index operations. The only option initially available is M0_DIX_MIN_REPLICA_QUORUM. This defaults to (N+K)/2 + 1 to prevent the situation above (disjoint sets of CAS services). Clients may choose a more lax requirement if they care more about availability than consistency. Clients (RGW) may want to add configuration options for controlling this quorum requirement, or could use this functionality as part of S3 storage tiers, e.g. setting a bucket to use reduced consistency operations. Note that ensuring correctness will also require a mechanism to compare versions of replies from CAS services, which is not in the scope of this PR. Follows is taken care of: * Fail dix operations that don't meet the specified min_success * Update unit test to check dix min_success * Add m0_idx_op_setoption with M0_OIO_MIN_SUCCESS * More detailed comment on min_success handling * Use M0_DIX_MIN_REPLICA_QUORUM by default for dix operations * Fix variable alignment * Make dix_item_version_cmp consistent with rest of codebase * Only send a single CAS GET for meta requests * Handle M0_DIX_MIN_REPLICA_QUORUM in dix_rop_ctx_init Signed-off-by: Tim Shaffer <[email protected]>
1 parent 5e90b7b commit c3b78e4

File tree

10 files changed

+209
-38
lines changed

10 files changed

+209
-38
lines changed

dix/req.c

Lines changed: 88 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "dix/fid_convert.h"
4747
#include "dix/dix_addb.h"
4848
#include "dtm0/dtx.h" /* m0_dtx0_* API */
49+
#include "motr/idx.h" /* M0_DIX_MIN_REPLICA_QUORUM */
4950

5051
static struct m0_sm_state_descr dix_req_states[] = {
5152
[DIXREQ_INIT] = {
@@ -209,11 +210,15 @@ M0_INTERNAL int m0_dix_req_wait(struct m0_dix_req *req, uint64_t states,
209210
static void dix_req_init(struct m0_dix_req *req,
210211
struct m0_dix_cli *cli,
211212
struct m0_sm_group *grp,
213+
int64_t min_success,
212214
bool meta)
213215
{
216+
M0_PRE(ergo(min_success < 1,
217+
min_success == M0_DIX_MIN_REPLICA_QUORUM));
214218
M0_SET0(req);
215219
req->dr_cli = cli;
216220
req->dr_is_meta = meta;
221+
req->dr_min_success = min_success;
217222
m0_sm_init(&req->dr_sm, &dix_req_sm_conf, DIXREQ_INIT, grp);
218223
m0_sm_addb2_counter_init(&req->dr_sm);
219224
}
@@ -222,14 +227,15 @@ M0_INTERNAL void m0_dix_mreq_init(struct m0_dix_req *req,
222227
struct m0_dix_cli *cli,
223228
struct m0_sm_group *grp)
224229
{
225-
dix_req_init(req, cli, grp, true);
230+
dix_req_init(req, cli, grp, 1, true);
226231
}
227232

228233
M0_INTERNAL void m0_dix_req_init(struct m0_dix_req *req,
229234
struct m0_dix_cli *cli,
230-
struct m0_sm_group *grp)
235+
struct m0_sm_group *grp,
236+
int64_t min_success)
231237
{
232-
dix_req_init(req, cli, grp, false);
238+
dix_req_init(req, cli, grp, min_success, false);
233239
}
234240

235241
static enum m0_dix_req_state dix_req_state(const struct m0_dix_req *req)
@@ -1304,12 +1310,13 @@ static int dix_rop_ctx_init(struct m0_dix_req *req,
13041310
const struct m0_bufvec *keys,
13051311
uint64_t *indices)
13061312
{
1307-
struct m0_dix *dix = &req->dr_indices[0];
1308-
struct m0_dix_ldesc *ldesc;
1309-
uint32_t keys_nr;
1310-
struct m0_buf key;
1311-
uint32_t i;
1312-
int rc = 0;
1313+
struct m0_dix *dix = &req->dr_indices[0];
1314+
struct m0_dix_ldesc *ldesc;
1315+
struct m0_pool_version *pver;
1316+
uint32_t keys_nr;
1317+
struct m0_buf key;
1318+
uint32_t i;
1319+
int rc = 0;
13131320

13141321
M0_ENTRY();
13151322
M0_PRE(M0_IS0(rop));
@@ -1320,6 +1327,13 @@ static int dix_rop_ctx_init(struct m0_dix_req *req,
13201327
M0_PRE(keys_nr != 0);
13211328
ldesc = &dix->dd_layout.u.dl_desc;
13221329
rop->dg_pver = dix_pver_find(req, &ldesc->ld_pver);
1330+
M0_ASSERT(ergo(req->dr_min_success < 1,
1331+
req->dr_min_success == M0_DIX_MIN_REPLICA_QUORUM));
1332+
if (req->dr_min_success == M0_DIX_MIN_REPLICA_QUORUM) {
1333+
pver = m0_dix_pver(req->dr_cli, &req->dr_indices[0]);
1334+
req->dr_min_success = (pver->pv_attr.pa_N +
1335+
pver->pv_attr.pa_K)/2 + 1;
1336+
}
13231337
M0_ALLOC_ARR(rop->dg_rec_ops, keys_nr);
13241338
M0_ALLOC_ARR(rop->dg_target_rop, rop->dg_pver->pv_attr.pa_P);
13251339
if (rop->dg_rec_ops == NULL || rop->dg_target_rop == NULL)
@@ -1402,6 +1416,20 @@ static void dix_rop(struct m0_dix_req *req)
14021416
M0_LEAVE();
14031417
}
14041418

1419+
/** Checks if the given cas get reply has a newer version of the value */
1420+
static int dix_item_version_cmp(const struct m0_dix_item *ditem,
1421+
const struct m0_cas_get_reply *get_rep) {
1422+
/*
1423+
* TODO: once cas versions are propagated, check if the get reply
1424+
* has a newer version than seen previously. Will need to add
1425+
* version info to struct m0_dix_item. This function should return
1426+
* true if no previous value is set, or if the previous value has
1427+
* an older version. For now, always return true so the last
1428+
* reply in the array wins.
1429+
*/
1430+
return -1;
1431+
}
1432+
14051433
static void dix_item_rc_update(struct m0_dix_req *req,
14061434
struct m0_cas_req *creq,
14071435
uint64_t key_idx,
@@ -1418,7 +1446,8 @@ static void dix_item_rc_update(struct m0_dix_req *req,
14181446
case DIX_GET:
14191447
m0_cas_get_rep(creq, key_idx, &get_rep);
14201448
rc = get_rep.cge_rc;
1421-
if (rc == 0) {
1449+
if (rc == 0 && dix_item_version_cmp(ditem, &get_rep) < 0) {
1450+
m0_buf_free(&ditem->dxi_val);
14221451
ditem->dxi_val = get_rep.cge_val;
14231452
/* Value will be freed at m0_dix_req_fini(). */
14241453
m0_cas_rep_mlock(creq, key_idx);
@@ -1620,29 +1649,61 @@ static void dix_cas_rop_rc_update(struct m0_dix_cas_rop *cas_rop, int rc)
16201649

16211650
static void dix_rop_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast)
16221651
{
1623-
struct m0_dix_req *req = ast->sa_datum;
1624-
struct m0_dix_rop_ctx *rop = req->dr_rop;
1625-
struct m0_dix_rop_ctx *rop_del_phase2 = NULL;
1626-
bool del_phase2 = false;
1627-
struct m0_dix_cas_rop *cas_rop;
1652+
struct m0_dix_req *req = ast->sa_datum;
1653+
struct m0_dix_rop_ctx *rop = req->dr_rop;
1654+
struct m0_dix_rop_ctx *rop_del_phase2 = NULL;
1655+
bool del_phase2 = false;
1656+
struct m0_dix_cas_rop *cas_rop;
1657+
int64_t min_success;
1658+
int64_t successful_ops = 0;
16281659

16291660
(void)grp;
16301661
if (req->dr_type == DIX_NEXT)
16311662
m0_dix_next_result_prepare(req);
16321663
else {
1664+
min_success = req->dr_min_success;
1665+
M0_ASSERT(min_success > 0);
1666+
1667+
successful_ops = m0_tl_reduce(cas_rop, scan, &rop->dg_cas_reqs, 0,
1668+
+ !!(scan->crp_creq.ccr_sm.sm_rc == 0));
1669+
16331670
/*
1634-
* Consider DIX request to be successful if there is at least
1635-
* one successful CAS request.
1671+
* The idea here is that transient failures are likely to
1672+
* occur and may not persist long enough that the node gets
1673+
* marked as failed. These will still affect individual
1674+
* operations, so we need to make sure that dix correctly
1675+
* handles the issues (if possible) or returns a failure to
1676+
* the client. We therefore let the user choose min_success,
1677+
* which determines the minimum number of successful cas
1678+
* operations to consider the parent dix operation successful.
1679+
* This is necessary to ensure read-after-write consistency.
1680+
* If min_success is set to (N+K)/2 + 1 for both reads and
1681+
* writes, then even in the presence of transient failures at
1682+
* least one copy of the most recent version of data will be
1683+
* found. Other values can be set for reduced consistency or
1684+
* balancing read vs. write.
1685+
*
1686+
* Here we compare the previously computed successful_ops
1687+
* and min_success to decide if we can ignore failed cas
1688+
* operations. If successful_ops >= min_success, we've met
1689+
* the quorum requirement and can ignore failures. This is
1690+
* done by skipping dix_cas_rop_rc_update for failed cas
1691+
* operations. We're guaranteed to have at least one
1692+
* successful cas op somewhere in the list, so this results
1693+
* in the parent dix operation being considered a success,
1694+
* and cas version is used to break ties between multiple
1695+
* successful replies (see dix_item_version_cmp). In the
1696+
* case that successful_ops < min_success, we call
1697+
* dix_cas_rop_rc_update for every cas op, with the result
1698+
* that the failed operations will cause the parent dix op
1699+
* to fail. Since min_success must be greater than 0, this
1700+
* covers the case that all cas requests fail.
16361701
*/
1637-
if (m0_tl_forall(cas_rop, cas_rop,
1638-
&rop->dg_cas_reqs,
1639-
cas_rop->crp_creq.ccr_sm.sm_rc != 0))
1640-
dix_cas_rop_rc_update(cas_rop_tlist_tail(
1641-
&rop->dg_cas_reqs), 0);
1642-
16431702
m0_tl_for (cas_rop, &rop->dg_cas_reqs, cas_rop) {
1644-
if (cas_rop->crp_creq.ccr_sm.sm_rc == 0)
1703+
if (successful_ops < min_success ||
1704+
cas_rop->crp_creq.ccr_sm.sm_rc == 0) {
16451705
dix_cas_rop_rc_update(cas_rop, 0);
1706+
}
16461707
m0_cas_req_fini(&cas_rop->crp_creq);
16471708
} m0_tl_endfor;
16481709
}
@@ -2137,10 +2198,11 @@ static void dix_rop_units_set(struct m0_dix_req *req)
21372198
m0_rwlock_read_unlock(&pm->pm_lock);
21382199

21392200
/*
2140-
* Only one CAS GET request should be sent for every record.
2201+
* For meta requests,
2202+
* only one CAS GET request should be sent for every record.
21412203
* Choose the best destination for every record.
21422204
*/
2143-
if (req->dr_type == DIX_GET) {
2205+
if (req->dr_type == DIX_GET && req->dr_is_meta) {
21442206
for (i = 0; i < rop->dg_rec_ops_nr; i++)
21452207
dix_online_unit_choose(req, &rop->dg_rec_ops[i]);
21462208
}

dix/req.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,11 @@ struct m0_dix_req {
253253
* starting key in DIX_NEXT request.
254254
*/
255255
uint32_t *dr_recs_nr;
256+
/**
257+
* Minimum number of successful CAS operations to treat
258+
* parent DIX operation as successful.
259+
*/
260+
int64_t dr_min_success;
256261
/** Request flags bitmask of m0_cas_op_flags values. */
257262
uint32_t dr_flags;
258263

@@ -283,7 +288,8 @@ struct m0_dix_next_reply {
283288
/** Initialises DIX request. */
284289
M0_INTERNAL void m0_dix_req_init(struct m0_dix_req *req,
285290
struct m0_dix_cli *cli,
286-
struct m0_sm_group *grp);
291+
struct m0_sm_group *grp,
292+
int64_t min_success);
287293

288294
/**
289295
* Initialises DIX request operating with meta-indices.

dix/ut/client_ut.c

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,7 +1244,7 @@ static int dix_common_idx_flagged_op(const struct m0_dix *indices,
12441244
int rc;
12451245
int i;
12461246

1247-
m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp);
1247+
m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp, 1);
12481248
m0_dix_req_lock(&req);
12491249
switch (type) {
12501250
case REQ_CREATE:
@@ -1285,6 +1285,7 @@ static int dix_common_rec_op(const struct m0_dix *index,
12851285
const struct m0_bufvec *keys,
12861286
struct m0_bufvec *vals,
12871287
const uint32_t *recs_nr,
1288+
int64_t min_success,
12881289
uint32_t flags,
12891290
struct dix_rep_arr *rep,
12901291
enum ut_dix_req_type type)
@@ -1294,7 +1295,7 @@ static int dix_common_rec_op(const struct m0_dix *index,
12941295
int i;
12951296
int k = 0;
12961297

1297-
m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp);
1298+
m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp, min_success);
12981299
m0_dix_req_lock(&req);
12991300
switch (type) {
13001301
case REQ_PUT:
@@ -1397,21 +1398,31 @@ static int dix_ut_put(const struct m0_dix *index,
13971398
uint32_t flags,
13981399
struct dix_rep_arr *rep)
13991400
{
1400-
return dix_common_rec_op(index, keys, vals, NULL, flags, rep, REQ_PUT);
1401+
return dix_common_rec_op(index, keys, vals, NULL, 1, flags, rep, REQ_PUT);
1402+
}
1403+
1404+
static int dix_ut_put_min_success(const struct m0_dix *index,
1405+
const struct m0_bufvec *keys,
1406+
struct m0_bufvec *vals,
1407+
int64_t min_success,
1408+
uint32_t flags,
1409+
struct dix_rep_arr *rep)
1410+
{
1411+
return dix_common_rec_op(index, keys, vals, NULL, min_success, flags, rep, REQ_PUT);
14011412
}
14021413

14031414
static int dix_ut_get(const struct m0_dix *index,
14041415
const struct m0_bufvec *keys,
14051416
struct dix_rep_arr *rep)
14061417
{
1407-
return dix_common_rec_op(index, keys, NULL, NULL, 0, rep, REQ_GET);
1418+
return dix_common_rec_op(index, keys, NULL, NULL, 1, 0, rep, REQ_GET);
14081419
}
14091420

14101421
static int dix_ut_del(const struct m0_dix *index,
14111422
const struct m0_bufvec *keys,
14121423
struct dix_rep_arr *rep)
14131424
{
1414-
return dix_common_rec_op(index, keys, NULL, NULL, 0, rep, REQ_DEL);
1425+
return dix_common_rec_op(index, keys, NULL, NULL, 1, 0, rep, REQ_DEL);
14151426
}
14161427

14171428
static int dix_ut_next(const struct m0_dix *index,
@@ -1420,7 +1431,7 @@ static int dix_ut_next(const struct m0_dix *index,
14201431
uint32_t flags,
14211432
struct dix_rep_arr *rep)
14221433
{
1423-
return dix_common_rec_op(index, start_keys, NULL, recs_nr, flags,
1434+
return dix_common_rec_op(index, start_keys, NULL, recs_nr, 1, flags,
14241435
rep, REQ_NEXT);
14251436
}
14261437

@@ -2655,17 +2666,35 @@ static void local_failures(void)
26552666
dix_kv_alloc_and_fill(&keys, &vals, COUNT);
26562667
rc = dix_common_idx_op(&index, 1, REQ_CREATE);
26572668
M0_UT_ASSERT(rc == 0);
2669+
26582670
/*
2659-
* Consider DIX request to be successful if there is at least
2660-
* one successful CAS request. Here two cas requests can be
2661-
* sent successfully.
2671+
* Consider DIX request to be successful only if there are
2672+
* enough successful CAS requests to satisfy min_success.
2673+
* Here two cas requests can be sent successfully. First, try with
2674+
* min_success = 3, which should result in all CAS requests failing.
26622675
*/
26632676
m0_fi_enable_off_n_on_m("cas_req_replied_cb", "send-failure", 2, 3);
2664-
rc = dix_ut_put(&index, &keys, &vals, 0, &rep);
2677+
rc = dix_ut_put_min_success(&index, &keys, &vals, 3, 0, &rep);
2678+
m0_fi_disable("cas_req_replied_cb", "send-failure");
2679+
M0_UT_ASSERT(rc == 0);
2680+
M0_UT_ASSERT(rep.dra_nr == COUNT);
2681+
M0_UT_ASSERT(m0_forall(i, COUNT, rep.dra_rep[i].dre_rc == -ENOTCONN));
2682+
2683+
dix_rep_free(&rep);
2684+
rc = dix_ut_del(&index, &keys, &rep);
2685+
M0_UT_ASSERT(rc == 0);
2686+
dix_rep_free(&rep);
2687+
2688+
/*
2689+
* Now try again with min_success = 2, which should succeed.
2690+
*/
2691+
m0_fi_enable_off_n_on_m("cas_req_replied_cb", "send-failure", 2, 3);
2692+
rc = dix_ut_put_min_success(&index, &keys, &vals, 2, 0, &rep);
26652693
m0_fi_disable("cas_req_replied_cb", "send-failure");
26662694
M0_UT_ASSERT(rc == 0);
26672695
M0_UT_ASSERT(rep.dra_nr == COUNT);
26682696
M0_UT_ASSERT(m0_forall(i, COUNT, rep.dra_rep[i].dre_rc == 0));
2697+
26692698
dix_rep_free(&rep);
26702699
dix_kv_destroy(&keys, &vals);
26712700
dix_index_fini(&index);

motr/client.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,24 @@ enum m0_idx_opcode {
568568
M0_IC_NR /* 21 */
569569
} M0_XCA_ENUM;
570570

571+
/**
572+
* Option to set on an index operation.
573+
*
574+
* See @ref m0_idx_op_setoption.
575+
*/
576+
enum m0_op_idx_option {
577+
/**
578+
* Index operations are distributed across one or more CAS services
579+
* to achieve the configured durability. This option specifies the
580+
* minimum number of services that must successfully complete an
581+
* operation for the operation as a whole to be considered
582+
* successful. The value for this option must be greater than zero
583+
* or a special value. This should normally be set to
584+
* @ref M0_DIX_MIN_REPLICA_QUORUM to ensure consistency.
585+
*/
586+
M0_OIO_MIN_SUCCESS = 1,
587+
};
588+
571589
/**
572590
* Flags passed to m0_obj_op() to specify object IO operation behaviour.
573591
*/
@@ -1628,6 +1646,18 @@ int m0_idx_op(struct m0_idx *idx,
16281646
uint32_t flags,
16291647
struct m0_op **op);
16301648

1649+
/**
1650+
* Set an option on an index operation.
1651+
*
1652+
* The index op must have been previously initialized with @ref m0_idx_op.
1653+
* It is undefined behavior to change an option after the index op has
1654+
* been launched. The meaning of each option is documented in
1655+
* @ref m0_op_idx_option.
1656+
*/
1657+
void m0_idx_op_setoption(struct m0_op *op,
1658+
enum m0_op_idx_option option,
1659+
int64_t value);
1660+
16311661
void m0_realm_create(struct m0_realm *realm,
16321662
uint64_t wcount, uint64_t rcount,
16331663
struct m0_op **op);

motr/client_internal.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,12 @@ struct m0_op_idx {
230230
struct m0_sm_group *oi_sm_grp;
231231
struct m0_ast_rc oi_ar;
232232

233+
/**
234+
* Minimum number of successful CAS operations to treat
235+
* parent DIX operation as successful.
236+
*/
237+
int64_t oi_min_success;
238+
233239
/* A bit-mask of m0_op_idx_flags. */
234240
uint32_t oi_flags;
235241

0 commit comments

Comments
 (0)