diff options
Diffstat (limited to 'drivers/block/drbd/drbd_receiver.c')
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 251 |
1 files changed, 220 insertions, 31 deletions
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ccfcf00f2798..c7ad88d91a09 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -50,7 +50,7 @@ #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) struct packet_info { enum drbd_packet cmd; @@ -1490,14 +1490,129 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } -static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +/* + * Mapping "discard" to ZEROOUT with UNMAP does not work for us: + * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it + * will directly go to fallback mode, submitting normal writes, and + * never even try to UNMAP. + * + * And dm-thin does not do this (yet), mostly because in general it has + * to assume that "skip_block_zeroing" is set. See also: + * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html + * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html + * + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that this "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. 
+ * + * At least for LVM/DM thin, with skip_block_zeroing=false, + * the result is effectively "discard_zeroes_data=1". + */ +/* flags: EE_TRIM|EE_ZEROOUT */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; - if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, - GFP_NOIO, 0)) - peer_req->flags |= EE_WAS_ERROR; + if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many + * layers are below us, some may have smaller granularity */ + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= max_discard_sectors) { + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); + nr_sectors -= max_discard_sectors; + start += max_discard_sectors; + } + if (nr_sectors) { + /* max_discard_sectors is unsigned int (and a multiple of + * granularity, we made sure of that above already); + * nr is < max_discard_sectors; 
+ * I don't need sector_div here, even though nr is sector_t */ + nr = nr_sectors; + nr -= (unsigned int)nr % granularity; + if (nr) { + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start += nr; + } + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, + (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); + } + return err != 0; +} +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct disk_conf *dc; + bool can_do; + + if (!blk_queue_discard(q)) + return false; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; +} + +static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) + peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); } @@ -1550,7 +1665,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * Correctness first, performance later. Next step is to code an * asynchronous variant of the same. */ - if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { + if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { /* wait for all pending IO completions, before we start * zeroing things out. 
*/ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1567,8 +1682,8 @@ int drbd_submit_peer_request(struct drbd_device *device, spin_unlock_irq(&device->resource->req_lock); } - if (peer_req->flags & EE_IS_TRIM) - drbd_issue_peer_discard(device, peer_req); + if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) + drbd_issue_peer_discard_or_zero_out(device, peer_req); else /* EE_WRITE_SAME */ drbd_issue_peer_wsame(device, peer_req); return 0; @@ -1765,6 +1880,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; @@ -1786,6 +1902,10 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); + } else if (zeroes) { + if (!expect(data_size == 0)) + return NULL; + ds = be32_to_cpu(zeroes->size); } else if (wsame) { if (data_size != queue_logical_block_size(device->rq_queue)) { drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", @@ -1802,7 +1922,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(IS_ALIGNED(ds, 512))) return NULL; - if (trim || wsame) { + if (trim || wsame || zeroes) { if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) @@ -1827,7 +1947,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, peer_req->flags |= EE_WRITE; if (trim) { - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; + return peer_req; + } + if (zeroes) { + peer_req->flags |= EE_ZEROOUT; return peer_req; } if (wsame) @@ -2326,8 +2450,12 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf) static unsigned long 
wire_flags_to_bio_op(u32 dpf) { - if (dpf & DP_DISCARD) + if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; + if (dpf & DP_DISCARD) + return REQ_OP_DISCARD; + if (dpf & DP_WSAME) + return REQ_OP_WRITE_SAME; else return REQ_OP_WRITE; } @@ -2518,8 +2646,19 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * op_flags = wire_flags_to_bio_flags(dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, op == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + /* need to play safe: an older DRBD sender + * may mean zero-out while sending P_TRIM. */ + if (0 == (connection->agreed_features & DRBD_FF_WZEROES)) + peer_req->flags |= EE_ZEROOUT; + } else if (pi->cmd == P_ZEROES) { + D_ASSERT(peer_device, peer_req->i.size > 0); D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); + /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ + if (dp_flags & DP_DISCARD) + peer_req->flags |= EE_TRIM; } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); @@ -2587,7 +2726,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. 
*/ - if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) + if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -3364,7 +3503,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, enum drbd_conns rv = C_MASK; enum drbd_disk_state mydisk; struct net_conf *nc; - int hg, rule_nr, rr_conflict, tentative; + int hg, rule_nr, rr_conflict, tentative, always_asbp; mydisk = device->state.disk; if (mydisk == D_NEGOTIATING) @@ -3415,8 +3554,12 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, rcu_read_lock(); nc = rcu_dereference(peer_device->connection->net_conf); + always_asbp = nc->always_asbp; + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); - if (hg == 100 || (hg == -100 && nc->always_asbp)) { + if (hg == 100 || (hg == -100 && always_asbp)) { int pcount = (device->state.role == R_PRIMARY) + (peer_role == R_PRIMARY); int forced = (hg == -100); @@ -3455,9 +3598,6 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, "Sync from %s node\n", (hg < 0) ? "peer" : "this"); } - rr_conflict = nc->rr_conflict; - tentative = nc->tentative; - rcu_read_unlock(); if (hg == -100) { /* FIXME this log message is not correct if we end up here @@ -3980,6 +4120,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? 
p->qlim : NULL; enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, p_csize, my_usize; + sector_t new_size, cur_size; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -3987,6 +4128,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (!peer_device) return config_unknown_volume(connection, pi); device = peer_device->device; + cur_size = drbd_get_capacity(device->this_bdev); p_size = be64_to_cpu(p->d_size); p_usize = be64_to_cpu(p->u_size); @@ -3997,7 +4139,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info device->p_size = p_size; if (get_ldev(device)) { - sector_t new_size, cur_size; rcu_read_lock(); my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); @@ -4012,13 +4153,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info if (device->state.conn == C_WF_REPORT_PARAMS) p_usize = min_not_zero(my_usize, p_usize); - /* Never shrink a device with usable data during connect. - But allow online shrinking if we are connected. */ + /* Never shrink a device with usable data during connect, + * or "attach" on the peer. + * But allow online shrinking if we are connected. */ new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); - cur_size = drbd_get_capacity(device->this_bdev); if (new_size < cur_size && device->state.disk >= D_OUTDATED && - device->state.conn < C_CONNECTED) { + (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) { drbd_err(device, "The peer's disk size is too small! 
(%llu < %llu sectors)\n", (unsigned long long)new_size, (unsigned long long)cur_size); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); @@ -4046,8 +4187,8 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info synchronize_rcu(); kfree(old_disk_conf); - drbd_info(device, "Peer sets u_size to %lu sectors\n", - (unsigned long)my_usize); + drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n", + (unsigned long)p_usize, (unsigned long)my_usize); } put_ldev(device); @@ -4080,9 +4221,36 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info * * However, if he sends a zero current size, * take his (user-capped or) backing disk size anyways. + * + * Unless of course he does not have a disk himself. + * In which case we ignore this completely. */ + sector_t new_size = p_csize ?: p_usize ?: p_size; drbd_reconsider_queue_parameters(device, NULL, o); - drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); + if (new_size == 0) { + /* Ignore, peer does not know anything. */ + } else if (new_size == cur_size) { + /* nothing to do */ + } else if (cur_size != 0 && p_size == 0) { + drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n", + (unsigned long long)new_size, (unsigned long long)cur_size); + } else if (new_size < cur_size && device->state.role == R_PRIMARY) { + drbd_err(device, "The peer's device size is too small! 
(%llu < %llu sectors); demote me first!\n", + (unsigned long long)new_size, (unsigned long long)cur_size); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } else { + /* I believe the peer, if + * - I don't have a current size myself + * - we agree on the size anyways + * - I do have a current size, am Secondary, + * and he has the only disk + * - I do have a current size, am Primary, + * and he has the only disk, + * which is larger than my current size + */ + drbd_set_my_capacity(device, new_size); + } } if (get_ldev(device)) { @@ -4142,7 +4310,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info kfree(device->p_uuid); device->p_uuid = p_uuid; - if (device->state.conn < C_CONNECTED && + if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) && device->state.disk < D_INCONSISTENT && device->state.role == R_PRIMARY && (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { @@ -4368,6 +4536,25 @@ static int receive_state(struct drbd_connection *connection, struct packet_info if (peer_state.conn == C_AHEAD) ns.conn = C_BEHIND; + /* TODO: + * if (primary and diskless and peer uuid != effective uuid) + * abort attach on peer; + * + * If this node does not have good data, was already connected, but + * the peer did a late attach only now, trying to "negotiate" with me, + * AND I am currently Primary, possibly frozen, with some specific + * "effective" uuid, this should never be reached, really, because + * we first send the uuids, then the current state. + * + * In this scenario, we already dropped the connection hard + * when we received the unsuitable uuids (receive_uuids()). + * + * Should we want to change this, that is: not drop the connection in + * receive_uuids() already, then we would need to add a branch here + * that aborts the attach of "unsuitable uuids" on the peer in case + * this node is currently Diskless Primary. 
+ */ + if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(device, D_NEGOTIATING)) { int cr; /* consider resync */ @@ -4380,7 +4567,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info (peer_state.disk == D_NEGOTIATING || os.disk == D_NEGOTIATING)); /* if we have both been inconsistent, and the peer has been - * forced to be UpToDate with --overwrite-data */ + * forced to be UpToDate with --force */ cr |= test_bit(CONSIDER_RESYNC, &device->flags); /* if we had been plain connected, and the admin requested to * start a sync by "invalidate" or "invalidate-remote" */ @@ -4845,7 +5032,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac peer_req->w.cb = e_end_resync_block; peer_req->submit_jif = jiffies; - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; spin_lock_irq(&device->resource->req_lock); list_add_tail(&peer_req->w.list, &device->sync_ee); @@ -4913,6 +5100,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; @@ -5197,11 +5385,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", connection->agreed_features, connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", connection->agreed_features & DRBD_FF_THIN_RESYNC ? 
" THIN_RESYNC" : "", - connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", + connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : connection->agreed_features ? "" : " none"); return 1; @@ -5284,7 +5473,7 @@ static int drbd_do_auth(struct drbd_connection *connection) if (pi.cmd != P_AUTH_CHALLENGE) { drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); - rv = 0; + rv = -1; goto fail; } |