1 files changed, 117 insertions, 30 deletions
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3ae2c0086563..2255dcfebd2b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
 		kref_get(&req->kref); /* wait for the DONE */
 
 	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
-		/* potentially already completed in the asender thread */
+		/* potentially already completed in the ack_receiver thread */
 		if (!(s & RQ_NET_DONE)) {
 			atomic_add(req->i.size >> 9, &device->ap_in_flight);
 			set_if_null_req_not_net_done(peer_device, req);
 		}
-		if (s & RQ_NET_PENDING)
+		if (req->rq_state & RQ_NET_PENDING)
 			set_if_null_req_ack_pending(peer_device, req);
 	}
 
@@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req)
 	return false;
 }
 
+bool drbd_should_do_remote(union drbd_dev_state s)
+{
+	return s.pdsk == D_UP_TO_DATE ||
+		(s.pdsk >= D_INCONSISTENT &&
+		 s.conn >= C_WF_BITMAP_T &&
+		 s.conn < C_AHEAD);
+	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+	   states. */
+}
+
+static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+	   since we enter state C_AHEAD only if proto >= 96 */
+}
+
 /* returns number of connections (== 1, for drbd 8.4)
  * expected to actually write this data,
  * which does NOT include those that we are L_AHEAD for. */
@@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	 * stable storage, and this is a WRITE, we may not even submit
 	 * this bio. */
 	if (get_ldev(device)) {
-		req->pre_submit_jif = jiffies;
 		if (drbd_insert_fault(device,
 				      rw == WRITE ? DRBD_FAULT_DT_WR
 				    : rw == READ  ? DRBD_FAULT_DT_RD
@@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 			&device->pending_master_completion[rw == WRITE]);
 	if (req->private_bio) {
 		/* needs to be marked within the same spinlock */
+		req->pre_submit_jif = jiffies;
 		list_add_tail(&req->req_pending_local,
 			&device->pending_completion[rw == WRITE]);
 		_req_mod(req, TO_BE_SUBMITTED);
@@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
+static bool net_timeout_reached(struct drbd_request *net_req,
+		struct drbd_connection *connection,
+		unsigned long now, unsigned long ent,
+		unsigned int ko_count, unsigned int timeout)
+{
+	struct drbd_device *device = net_req->device;
+
+	if (!time_after(now, net_req->pre_send_jif + ent))
+		return false;
+
+	if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
+		return false;
+
+	if (net_req->rq_state & RQ_NET_PENDING) {
+		drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return true;
+	}
+
+	/* We received an ACK already (or are using protocol A),
+	 * but are waiting for the epoch closing barrier ack.
+	 * Check if we sent the barrier already.  We should not blame the peer
+	 * for being unresponsive, if we did not even ask it yet. */
+	if (net_req->epoch == connection->send.current_epoch_nr) {
+		drbd_warn(device,
+			"We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return false;
+	}
+
+	/* Worst case: we may have been blocked for whatever reason, then
+	 * suddenly are able to send a lot of requests (and epoch separating
+	 * barriers) in quick succession.
+	 * The timestamp of the net_req may be much too old and not correspond
+	 * to the sending time of the relevant unack'ed barrier packet, so
+	 * would trigger a spurious timeout.  The latest barrier packet may
+	 * have a too recent timestamp to trigger the timeout, potentially miss
+	 * a timeout.  Right now we don't have a place to conveniently store
+	 * these timestamps.
+	 * But in this particular situation, the application requests are still
+	 * completed to upper layers, DRBD should still "feel" responsive.
+	 * No need yet to kill this connection, it may still recover.
+	 * If not, eventually we will have queued enough into the network for
+	 * us to block. From that point of view, the timestamp of the last sent
+	 * barrier packet is relevant enough.
+	 */
+	if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
+		drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			connection->send.last_sent_barrier_jif, now,
+			jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
+		return true;
+	}
+	return false;
+}
+
+/* A request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ *   with some state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ *   resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ *   for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+
 void request_timer_fn(unsigned long data)
 {
 	struct drbd_device *device = (struct drbd_device *) data;
@@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data)
 	unsigned long oldest_submit_jif;
 	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
 	unsigned long now;
+	unsigned int ko_count = 0, timeout = 0;
 
 	rcu_read_lock();
 	nc = rcu_dereference(connection->net_conf);
-	if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
-		ent = nc->timeout * HZ/10 * nc->ko_count;
+	if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
+		ko_count = nc->ko_count;
+		timeout = nc->timeout;
+	}
 
 	if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
 		dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
@@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data)
 	}
 	rcu_read_unlock();
 
+
+	ent = timeout * HZ/10 * ko_count;
 	et = min_not_zero(dt, ent);
 
 	if (!et)
@@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data)
 	spin_lock_irq(&device->resource->req_lock);
 	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
 	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
-	req_peer = connection->req_not_net_done;
+
 	/* maybe the oldest request waiting for the peer is in fact still
-	 * blocking in tcp sendmsg */
-	if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
-		req_peer = connection->req_next;
+	 * blocking in tcp sendmsg.  That's ok, though, that's handled via the
+	 * socket send timeout, requesting a ping, and bumping ko-count in
+	 * we_should_drop_the_connection().
+	 */
+
+	/* check the oldest request we did successfully sent,
+	 * but which is still waiting for an ACK. */
+	req_peer = connection->req_ack_pending;
+
+	/* if we don't have such request (e.g. protocoll A)
+	 * check the oldest requests which is still waiting on its epoch
+	 * closing barrier ack. */
+	if (!req_peer)
+		req_peer = connection->req_not_net_done;
 
 	/* evaluate the oldest peer request only in one timer! */
 	if (req_peer && req_peer->device != device)
@@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data)
 		: req_write ? req_write->pre_submit_jif
 		: req_read ? req_read->pre_submit_jif : now;
 
-	/* The request is considered timed out, if
-	 * - we have some effective timeout from the configuration,
-	 *   with above state restrictions applied,
-	 * - the oldest request is waiting for a response from the network
-	 *   resp. the local disk,
-	 * - the oldest request is in fact older than the effective timeout,
-	 * - the connection was established (resp. disk was attached)
-	 *   for longer than the timeout already.
-	 * Note that for 32bit jiffies and very stable connections/disks,
-	 * we may have a wrap around, which is catched by
-	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
-	 *
-	 * Side effect: once per 32bit wrap-around interval, which means every
-	 * ~198 days with 250 HZ, we have a window where the timeout would need
-	 * to expire twice (worst case) to become effective. Good enough.
-	 */
-	if (ent && req_peer &&
-		 time_after(now, req_peer->pre_send_jif + ent) &&
-		!time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
-		drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+	if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
 		_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
-	}
+
 	if (dt && oldest_submit_jif != now &&
 		 time_after(now, oldest_submit_jif + dt) &&
 		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {