[lustre-devel] [PATCH] Place a memory barrier around cp_state changes
Shaun Tancheff
shaun at tancheff.com
Thu Jun 6 20:54:19 PDT 2019
On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb at suse.com> wrote:
>
> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > Ensure all uses of cl_page->cp_state are smp-safe.
> >
> > LBUG Output:
> > In __cl_page_state_set()
> > ..
> > old = page->cp_state;
> > PASSERT(env, page, allowed_transitions[old][state]);
> > ..
> > Asserted with the following:
> > page at ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> What do all these fields mean? Or where is this printed?
> The closest I can find is in cl_page_header_print() which uses the
> format string:
>
> "page@%p[%d %p %d %d %p]\n",
>
> But that only has 5 fields in the [], while the output you provided has
> 6.
>
Sorry for the confusion, here is my (long) explanation with source snippets:
The LBUG in full
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
page at ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
vvp-page at ffff80be1fcd6650(0:0) vm at ffff7e02fa0e60c0 5000000000001029 4:0
ffff80be1fcd6600 1049548 lru
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
lov-page at ffff80be1fcd6690, comp index: 0, gen: 0
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
osc-page at ffff80be1fcd66c8 1049548: 1< 0x845fed 2 0 - - > 2< 4298948608 0
4096 0x0 0x420 | (null) ffff809e473708e8 ffff80be199c1e40 > 3< 0
0 0 > 4< 5 13 64 0 + | - - - - > 5< - - - - | 0 - | 0 - ->
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) end
page at ffff80be1fcd6600
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
allowed_transitions[old][state]
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) ASSERTION( 0 )
failed:
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0()) LBUG
The source snippet around L342
...
339
340 ENTRY;
341 old = page->cp_state;
342 PASSERT(env, page, allowed_transitions[old][state]);
343 CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old,
state);
344 PASSERT(env, page, page->cp_state == old);
345 PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner
!= NULL));
346
347 cs_pagestate_dec(page->cp_obj, page->cp_state);
348 cs_pagestate_inc(page->cp_obj, state);
The PASSERT macro ...
...
# define PASSERT(env, page, expr) \
do { \
if (unlikely(!(expr))) { \
CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \
LASSERT(0); \
} \
} while (0)
#else /* !LIBCFS_DEBUG */
...
/**
* Helper macro, dumping detailed information about \a page into a log.
*/
#define CL_PAGE_DEBUG(mask, env, page, format, ...) \
do { \
if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \
LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \
cl_page_print(env, &msgdata, lu_cdebug_printer, page); \
CDEBUG(mask, format , ## __VA_ARGS__); \
} \
} while (0)
Almost there
...
1062 /**
1063 * Prints human readable representation of \a pg to the \a f.
1064 */
1065 void cl_page_print(const struct lu_env *env, void *cookie,
1066 lu_printer_t printer, const struct cl_page *pg)
1067 {
1068 const struct cl_page_slice *slice;
1069 int result = 0;
1070
1071 cl_page_header_print(env, cookie, printer, pg);
1072 list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
1073 if (slice->cpl_ops->cpo_print != NULL)
1074 result = (*slice->cpl_ops->cpo_print)(env,
slice,
1075
cookie, printer);
Finally the 'good stuff'
...
1051 void cl_page_header_print(const struct lu_env *env, void *cookie,
1052 lu_printer_t printer, const struct
cl_page *pg)
1053 {
1054 (*printer)(env, cookie,
1055 "page@%p[%d %p %d %d %p]\n",
1056 pg, atomic_read(&pg->cp_ref), pg->cp_obj,
1057 pg->cp_state, pg->cp_type,
1058 pg->cp_owner);
1059 }
...
Given the above along with the first line from the LBUG ....
LustreError: 86407:0:(cl_page.c:342:cl_page_state_set0())
page at ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e28cc0]
Implies the following:
pg == ffff80be1fcd6600
pg->cp_ref == 3
pg->cp_obj == ffff80be1fca5cc0
pg->cp_state == 0 (state == CPS_CACHED)
pg->cp_type == 1
pg->cp_owner == ffff809e60e28cc0
Also known caller was cl_page_assume+0xdc/0x3e0 [obdclass] (from the
backtrace)
So we know the on-stack / register value of state == CPS_OWNED
The backtrace:
...
dump_backtrace+0x0/0x248
show_stack+0x24/0x30
dump_stack+0xbc/0xf4
libcfs_call_trace+0xec/0x120 [libcfs]
lbug_with_loc+0x4c/0xb8 [libcfs]
cl_page_state_set0+0x2b4/0x6e0 [obdclass]
cl_page_assume+0xdc/0x3e0 [obdclass]
ll_io_read_page+0x144c/0x1de0 [lustre]
ll_write_begin+0x3d4/0xee8 [lustre]
and the contents of cl_page_assume
596 void cl_page_assume(const struct lu_env *env,
597 struct cl_io *io, struct cl_page *pg)
598 {
599 const struct cl_page_slice *slice;
600
601 PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
602
603 ENTRY;
604 io = cl_io_top(io);
605
606 list_for_each_entry(slice, &pg->cp_layers, cpl_linkage) {
607 if (slice->cpl_ops->cpo_assume != NULL)
608 (*slice->cpl_ops->cpo_assume)(env, slice,
io);
609 }
610
611 PASSERT(env, pg, pg->cp_owner == NULL);
612 pg->cp_owner = cl_io_top(io);
613 cl_page_owner_set(pg);
614 cl_page_state_set(env, pg, CPS_OWNED);
615 EXIT;
616 }
While I believe my conclusion to be correct I am certainly open to being
dead wrong.
On Thu, Jun 6, 2019 at 7:57 PM NeilBrown <neilb at suse.com> wrote:
> On Thu, Jun 06 2019, Shaun Tancheff wrote:
>
> > Ensure all uses of cl_page->cp_state are smp-safe.
> >
> > LBUG Output:
> > In __cl_page_state_set()
> > ..
> > old = page->cp_state;
> > PASSERT(env, page, allowed_transitions[old][state]);
> > ..
> > Asserted with the following:
> > page at ffff80be1fcd6600[3 ffff80be1fca5cc0 0 1 ffff809e60e2 8cc0]
>
> What do all these fields mean? Or where is this printed?
> The closest I can find is in cl_page_header_print() which uses the
> format string:
>
> "page@%p[%d %p %d %d %p]\n",
>
> But that only has 5 fields in the [], while the output you provided has
> 6.
>
> >
> > However cp_state 0 (CPS_CACHED) to 1 (CPS_OWNED) is a valid transition
> > leading to the conclusion that cp_state became 0 during the
> > assertion.
>
> Where is the evidence that this was the transition that was happening?
> If it was in some part of the LBUG output that wasn't quoted - then
> please fix that by quoting the entire LBUG output.
>
> We need to understand which change happened at this time to cause the
> race.
> Then we need to explain why the added barriers actually close the race
> window.
>
> If these barrier do actually fix a race, then there is something *very*
> wrong.
> The comments say that changes to cp_state are protected by the page lock
> on the corresponding VM page.
> Locking and unlocking a VM page entails sufficient barriers that changes
> made while one thread holds the lock will be visible to another
> thread once it also gets the lock.
>
>
True, but a page lock is just a bit on the page flags. And we absolutely
wait on I/O while the
page is 'locked' so a ping-pong (our task *can* sleep) is entirely
reasonable, as far as
I understand anyway. Since there is multiple state-changes during a single
page lock/unlock
there is a race, however rare and unlikely, and it's arch specific based on
what I understand
of the memory barrier rules.
So the only possible explanation for a race as you suspect, is if the
> vmpage *isn't* locked when something changes cp_state, and that would be
> bad.
>
> Thanks,
> NeilBrown
>
>
> >
> > Signed-off-by: Shaun Tancheff <stancheff at cray.com>
> > ---
> > fs/lustre/include/cl_object.h | 11 +++++++++++
> > fs/lustre/llite/rw26.c | 2 +-
> > fs/lustre/llite/vvp_page.c | 6 +++---
> > fs/lustre/obdclass/cl_page.c | 34 +++++++++++++++++++--------------
> > fs/lustre/obdecho/echo_client.c | 2 +-
> > fs/lustre/osc/osc_cache.c | 18 +++++++++--------
> > 6 files changed, 46 insertions(+), 27 deletions(-)
> >
> > diff --git a/fs/lustre/include/cl_object.h
> b/fs/lustre/include/cl_object.h
> > index 691c2f5da53a..d6e1f6f05f50 100644
> > --- a/fs/lustre/include/cl_object.h
> > +++ b/fs/lustre/include/cl_object.h
> > @@ -752,6 +752,17 @@ struct cl_page {
> > struct cl_sync_io *cp_sync_io;
> > };
> >
> > +static inline enum cl_page_state cl_page_state_get(const struct cl_page
> *pg)
> > +{
> > + /*
> > + * Paired with smp_store_release in cl_page_state_set_trust
> > + * and ensures that we see the most recent value of cp_state
> > + * even when the last modification was not performed on the
> > + * current processor
> > + */
> > + return smp_load_acquire(&pg->cp_state);
> > +}
> > +
> > /**
> > * Per-layer part of cl_page.
> > *
> > diff --git a/fs/lustre/llite/rw26.c b/fs/lustre/llite/rw26.c
> > index e4ce3b6f5772..364dec208ccd 100644
> > --- a/fs/lustre/llite/rw26.c
> > +++ b/fs/lustre/llite/rw26.c
> > @@ -200,7 +200,7 @@ static ssize_t ll_direct_IO_seg(const struct lu_env
> *env, struct cl_io *io,
> >
> > rc = cl_page_own(env, io, clp);
> > if (rc) {
> > - LASSERT(clp->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(clp) == CPS_FREEING);
> > cl_page_put(env, clp);
> > break;
> > }
> > diff --git a/fs/lustre/llite/vvp_page.c b/fs/lustre/llite/vvp_page.c
> > index 590e5f5e43c9..38b8c488d765 100644
> > --- a/fs/lustre/llite/vvp_page.c
> > +++ b/fs/lustre/llite/vvp_page.c
> > @@ -323,18 +323,18 @@ static int vvp_page_make_ready(const struct lu_env
> *env,
> >
> > lock_page(vmpage);
> > if (clear_page_dirty_for_io(vmpage)) {
> > - LASSERT(pg->cp_state == CPS_CACHED);
> > + LASSERT(cl_page_state_get(pg) == CPS_CACHED);
> > /* This actually clears the dirty bit in the radix tree. */
> > set_page_writeback(vmpage);
> > CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
> > - } else if (pg->cp_state == CPS_PAGEOUT) {
> > + } else if (cl_page_state_get(pg) == CPS_PAGEOUT) {
> > /* is it possible for osc_flush_async_page() to already
> > * make it ready?
> > */
> > result = -EALREADY;
> > } else {
> > CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state
> %d.\n",
> > - pg->cp_state);
> > + cl_page_state_get(pg));
> > LBUG();
> > }
> > unlock_page(vmpage);
> > diff --git a/fs/lustre/obdclass/cl_page.c b/fs/lustre/obdclass/cl_page.c
> > index 349f19e014e0..da4429b82932 100644
> > --- a/fs/lustre/obdclass/cl_page.c
> > +++ b/fs/lustre/obdclass/cl_page.c
> > @@ -97,7 +97,7 @@ static void cl_page_free(const struct lu_env *env,
> struct cl_page *page)
> >
> > PASSERT(env, page, list_empty(&page->cp_batch));
> > PASSERT(env, page, !page->cp_owner);
> > - PASSERT(env, page, page->cp_state == CPS_FREEING);
> > + PASSERT(env, page, cl_page_state_get(page) == CPS_FREEING);
> >
> > while ((slice = list_first_entry_or_null(&page->cp_layers,
> > struct cl_page_slice,
> > @@ -119,8 +119,14 @@ static void cl_page_free(const struct lu_env *env,
> struct cl_page *page)
> > static inline void cl_page_state_set_trust(struct cl_page *page,
> > enum cl_page_state state)
> > {
> > - /* bypass const. */
> > - *(enum cl_page_state *)&page->cp_state = state;
> > + /*
> > + * Paired with smp_load_acquire in cl_page_state_get
> > + * and ensures that we see the most recent value of cp_state
> > + * is available even when the next access is not performed on the
> > + * current processor.
> > + * Note we also cast away const as the only modifier of cp_state.
> > + */
> > + smp_store_release((enum cl_page_state *)&page->cp_state, state);
> > }
> >
> > struct cl_page *cl_page_alloc(const struct lu_env *env,
> > @@ -270,10 +276,10 @@ static void __cl_page_state_set(const struct
> lu_env *env,
> > }
> > };
> >
> > - old = page->cp_state;
> > + old = cl_page_state_get(page);
> > PASSERT(env, page, allowed_transitions[old][state]);
> > CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
> > - PASSERT(env, page, page->cp_state == old);
> > + PASSERT(env, page, cl_page_state_get(page) == old);
> > PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner));
> > cl_page_state_set_trust(page, state);
> > }
> > @@ -313,7 +319,7 @@ void cl_page_put(const struct lu_env *env, struct
> cl_page *page)
> > refcount_read(&page->cp_ref));
> >
> > if (refcount_dec_and_test(&page->cp_ref)) {
> > - LASSERT(page->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(page) == CPS_FREEING);
> >
> > LASSERT(refcount_read(&page->cp_ref) == 0);
> > PASSERT(env, page, !page->cp_owner);
> > @@ -378,7 +384,7 @@ void __cl_page_disown(const struct lu_env *env,
> > const struct cl_page_slice *slice;
> > enum cl_page_state state;
> >
> > - state = pg->cp_state;
> > + state = cl_page_state_get(pg);
> > cl_page_owner_clear(pg);
> >
> > if (state == CPS_OWNED)
> > @@ -402,7 +408,7 @@ int cl_page_is_owned(const struct cl_page *pg, const
> struct cl_io *io)
> > struct cl_io *top = cl_io_top((struct cl_io *)io);
> >
> > LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
> > - return pg->cp_state == CPS_OWNED && pg->cp_owner == top;
> > + return cl_page_state_get(pg) == CPS_OWNED && pg->cp_owner == top;
> > }
> > EXPORT_SYMBOL(cl_page_is_owned);
> >
> > @@ -434,7 +440,7 @@ static int __cl_page_own(const struct lu_env *env,
> struct cl_io *io,
> >
> > io = cl_io_top(io);
> >
> > - if (pg->cp_state == CPS_FREEING) {
> > + if (cl_page_state_get(pg) == CPS_FREEING) {
> > result = -ENOENT;
> > goto out;
> > }
> > @@ -453,7 +459,7 @@ static int __cl_page_own(const struct lu_env *env,
> struct cl_io *io,
> > PASSERT(env, pg, !pg->cp_owner);
> > pg->cp_owner = cl_io_top(io);
> > cl_page_owner_set(pg);
> > - if (pg->cp_state != CPS_FREEING) {
> > + if (cl_page_state_get(pg) != CPS_FREEING) {
> > cl_page_state_set(env, pg, CPS_OWNED);
> > } else {
> > __cl_page_disown(env, io, pg);
> > @@ -593,7 +599,7 @@ static void __cl_page_delete(const struct lu_env
> *env, struct cl_page *pg)
> > {
> > const struct cl_page_slice *slice;
> >
> > - PASSERT(env, pg, pg->cp_state != CPS_FREEING);
> > + PASSERT(env, pg, cl_page_state_get(pg) != CPS_FREEING);
> >
> > /*
> > * Sever all ways to obtain new pointers to @pg.
> > @@ -756,7 +762,7 @@ void cl_page_completion(const struct lu_env *env,
> > const struct cl_page_slice *slice;
> >
> > PASSERT(env, pg, crt < CRT_NR);
> > - PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
> > + PASSERT(env, pg, cl_page_state_get(pg) == cl_req_type_state(crt));
> >
> > CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
> >
> > @@ -805,7 +811,7 @@ int cl_page_make_ready(const struct lu_env *env,
> struct cl_page *pg,
> > }
> >
> > if (result >= 0) {
> > - PASSERT(env, pg, pg->cp_state == CPS_CACHED);
> > + PASSERT(env, pg, cl_page_state_get(pg) == CPS_CACHED);
> > cl_page_io_start(env, pg, crt);
> > result = 0;
> > }
> > @@ -870,7 +876,7 @@ void cl_page_header_print(const struct lu_env *env,
> void *cookie,
> > (*printer)(env, cookie,
> > "page@%p[%d %p %d %d %p]\n",
> > pg, refcount_read(&pg->cp_ref), pg->cp_obj,
> > - pg->cp_state, pg->cp_type,
> > + cl_page_state_get(pg), pg->cp_type,
> > pg->cp_owner);
> > }
> > EXPORT_SYMBOL(cl_page_header_print);
> > diff --git a/fs/lustre/obdecho/echo_client.c
> b/fs/lustre/obdecho/echo_client.c
> > index 317123fd27cb..d879f109e641 100644
> > --- a/fs/lustre/obdecho/echo_client.c
> > +++ b/fs/lustre/obdecho/echo_client.c
> > @@ -1046,7 +1046,7 @@ static int cl_echo_object_brw(struct echo_object
> *eco, int rw, u64 offset,
> >
> > rc = cl_page_own(env, io, clp);
> > if (rc) {
> > - LASSERT(clp->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(clp) == CPS_FREEING);
> > cl_page_put(env, clp);
> > break;
> > }
> > diff --git a/fs/lustre/osc/osc_cache.c b/fs/lustre/osc/osc_cache.c
> > index f8fddbfe6a7e..75984b98b229 100644
> > --- a/fs/lustre/osc/osc_cache.c
> > +++ b/fs/lustre/osc/osc_cache.c
> > @@ -1045,7 +1045,7 @@ static int osc_extent_truncate(struct osc_extent
> *ext, pgoff_t trunc_index,
> > cl_page_discard(env, io, page);
> > cl_page_disown(env, io, page);
> > } else {
> > - LASSERT(page->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(page) == CPS_FREEING);
> > LASSERT(0);
> > }
> >
> > @@ -1356,10 +1356,12 @@ static int osc_completion(const struct lu_env
> *env, struct osc_async_page *oap,
> > int srvlock;
> >
> > cmd &= ~OBD_BRW_NOQUOTA;
> > - LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ),
> > - "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> > - LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE),
> > - "cp_state:%u, cmd:%d\n", page->cp_state, cmd);
> > + LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEIN,
> > + cmd == OBD_BRW_READ),
> > + "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> > + LASSERTF(equi(cl_page_state_get(page) == CPS_PAGEOUT,
> > + cmd == OBD_BRW_WRITE),
> > + "cp_state:%u, cmd:%d\n", cl_page_state_get(page), cmd);
> > LASSERT(opg->ops_transfer_pinned);
> >
> > crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
> > @@ -3061,7 +3063,7 @@ bool osc_page_gang_lookup(const struct lu_env
> *env, struct cl_io *io,
> >
> > page = ops->ops_cl.cpl_page;
> > LASSERT(page->cp_type == CPT_CACHEABLE);
> > - if (page->cp_state == CPS_FREEING)
> > + if (cl_page_state_get(page) == CPS_FREEING)
> > continue;
> >
> > cl_page_get(page);
> > @@ -3142,7 +3144,7 @@ static bool check_and_discard_cb(const struct
> lu_env *env, struct cl_io *io,
> > cl_page_discard(env, io, page);
> > cl_page_disown(env, io, page);
> > } else {
> > - LASSERT(page->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(page) == CPS_FREEING);
> > }
> > }
> >
> > @@ -3169,7 +3171,7 @@ static bool discard_cb(const struct lu_env *env,
> struct cl_io *io,
> > cl_page_discard(env, io, page);
> > cl_page_disown(env, io, page);
> > } else {
> > - LASSERT(page->cp_state == CPS_FREEING);
> > + LASSERT(cl_page_state_get(page) == CPS_FREEING);
> > }
> >
> > return true;
> > --
> > 2.17.1
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.lustre.org/pipermail/lustre-devel-lustre.org/attachments/20190606/6bdc81e1/attachment-0001.html>
More information about the lustre-devel
mailing list