From 31ce479ecab2638892f4fba191d300aa5a9d9746 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <73628@studenti.unimore.it>
Date: Mon, 12 Sep 2011 22:40:31 +0200
Subject: [PATCH 1/3] block: prepare I/O context code for BFQ-v3 for 3.0

BFQ uses struct cfq_io_context to store its per-process per-device data,
reusing the same code for cic handling of CFQ.  The code is not shared
ATM to minimize the impact of these patches.

This patch introduces a new hlist to each io_context to store all the
cic's allocated by BFQ to allow calling the right destructor on module
unload; the radix tree used for cic lookup needs to be duplicated
because it can contain dead keys inserted by a scheduler and later
retrieved by the other one.

Update the io_context exit and free paths to take care also of
the BFQ cic's.

Change the type of cfqq inside struct cfq_io_context to void *
to use it also for BFQ per-queue data.

A new bfq-specific ioprio_changed field is necessary, too, to avoid
clobbering cfq's one, so switch ioprio_changed to a bitmap, with one
element per scheduler.

Signed-off-by: Fabio Checconi <fabio@gandalf.sssup.it>
Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
---
 block/Kconfig.iosched     |   26 ++++++++++++++++++++++++++
 block/blk-ioc.c           |   30 +++++++++++++++++-------------
 block/cfq-iosched.c       |   10 +++++++---
 fs/ioprio.c               |    9 +++++++--
 include/linux/iocontext.h |   18 +++++++++++++++---
 5 files changed, 72 insertions(+), 21 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76..5905452 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -43,6 +43,28 @@ config CFQ_GROUP_IOSCHED
 	---help---
 	  Enable group IO scheduling in CFQ.
 
+config IOSCHED_BFQ
+	tristate "BFQ I/O scheduler"
+	depends on EXPERIMENTAL
+	default n
+	---help---
+	  The BFQ I/O scheduler tries to distribute bandwidth among
+	  all processes according to their weights.
+	  It aims at distributing the bandwidth as desired, independently of
+	  the disk parameters and with any workload. It also tries to
+	  guarantee low latency to interactive and soft real-time
+	  applications.  If compiled built-in (saying Y here), BFQ can
+	  be configured to support hierarchical scheduling.
+
+config CGROUP_BFQIO
+	bool "BFQ hierarchical scheduling support"
+	depends on CGROUPS && IOSCHED_BFQ=y
+	default n
+	---help---
+	  Enable hierarchical scheduling in BFQ, using the cgroups
+	  filesystem interface.  The name of the subsystem will be
+	  bfqio.
+
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
@@ -56,6 +78,9 @@ choice
 	config DEFAULT_CFQ
 		bool "CFQ" if IOSCHED_CFQ=y
 
+	config DEFAULT_BFQ
+		bool "BFQ" if IOSCHED_BFQ=y
+
 	config DEFAULT_NOOP
 		bool "No-op"
 
@@ -65,6 +90,7 @@ config DEFAULT_IOSCHED
 	string
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
+	default "bfq" if DEFAULT_BFQ
 	default "noop" if DEFAULT_NOOP
 
 endmenu
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 342eae9..21f13b1 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/bio.h>
+#include <linux/bitmap.h>
 #include <linux/blkdev.h>
 #include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
 #include <linux/slab.h>
@@ -16,13 +17,12 @@
  */
 static struct kmem_cache *iocontext_cachep;
 
-static void cfq_dtor(struct io_context *ioc)
+static void hlist_sched_dtor(struct io_context *ioc, struct hlist_head *list)
 {
-	if (!hlist_empty(&ioc->cic_list)) {
+	if (!hlist_empty(list)) {
 		struct cfq_io_context *cic;
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
+		cic = hlist_entry(list->first, struct cfq_io_context, cic_list);
 		cic->dtor(ioc);
 	}
 }
@@ -40,7 +40,9 @@ int put_io_context(struct io_context *ioc)
 
 	if (atomic_long_dec_and_test(&ioc->refcount)) {
 		rcu_read_lock();
-		cfq_dtor(ioc);
+
+		hlist_sched_dtor(ioc, &ioc->cic_list);
+		hlist_sched_dtor(ioc, &ioc->bfq_cic_list);
 		rcu_read_unlock();
 
 		kmem_cache_free(iocontext_cachep, ioc);
@@ -50,15 +52,14 @@ int put_io_context(struct io_context *ioc)
 }
 EXPORT_SYMBOL(put_io_context);
 
-static void cfq_exit(struct io_context *ioc)
+static void hlist_sched_exit(struct io_context *ioc, struct hlist_head *list)
 {
 	rcu_read_lock();
 
-	if (!hlist_empty(&ioc->cic_list)) {
+	if (!hlist_empty(list)) {
 		struct cfq_io_context *cic;
 
-		cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
-								cic_list);
+		cic = hlist_entry(list->first, struct cfq_io_context, cic_list);
 		cic->exit(ioc);
 	}
 	rcu_read_unlock();
@@ -74,9 +75,10 @@ void exit_io_context(struct task_struct *task)
 	task->io_context = NULL;
 	task_unlock(task);
 
-	if (atomic_dec_and_test(&ioc->nr_tasks))
-		cfq_exit(ioc);
-
+	if (atomic_dec_and_test(&ioc->nr_tasks)) {
+		hlist_sched_exit(ioc, &ioc->cic_list);
+		hlist_sched_exit(ioc, &ioc->bfq_cic_list);
+	}
 	put_io_context(ioc);
 }
 
@@ -89,12 +91,14 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 		atomic_long_set(&ret->refcount, 1);
 		atomic_set(&ret->nr_tasks, 1);
 		spin_lock_init(&ret->lock);
-		ret->ioprio_changed = 0;
+		bitmap_zero(ret->ioprio_changed, IOC_IOPRIO_CHANGED_BITS);
 		ret->ioprio = 0;
 		ret->last_waited = 0; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
 		INIT_HLIST_HEAD(&ret->cic_list);
+		INIT_RADIX_TREE(&ret->bfq_radix_root, GFP_ATOMIC | __GFP_HIGH);
+		INIT_HLIST_HEAD(&ret->bfq_cic_list);
 		ret->ioc_data = NULL;
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 		ret->cgroup_changed = 0;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ae21919..b581793 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2919,7 +2919,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
 static void cfq_ioc_set_ioprio(struct io_context *ioc)
 {
 	call_for_each_cic(ioc, changed_ioprio);
-	ioc->ioprio_changed = 0;
 }
 
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -3204,8 +3203,13 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 		goto err_free;
 
 out:
-	smp_read_barrier_depends();
-	if (unlikely(ioc->ioprio_changed))
+	/*
+	 * test_and_clear_bit() implies a memory barrier, paired with
+	 * the wmb() in fs/ioprio.c, so the value seen for ioprio is the
+	 * new one.
+	 */
+	if (unlikely(test_and_clear_bit(IOC_CFQ_IOPRIO_CHANGED,
+					ioc->ioprio_changed)))
 		cfq_ioc_set_ioprio(ioc);
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7da2a06..95a6c2b 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -30,7 +30,7 @@
 
 int set_task_ioprio(struct task_struct *task, int ioprio)
 {
-	int err;
+	int err, i;
 	struct io_context *ioc;
 	const struct cred *cred = current_cred(), *tcred;
 
@@ -60,12 +60,17 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 			err = -ENOMEM;
 			break;
 		}
+		/* let other ioc users see the new values */
+		smp_wmb();
 		task->io_context = ioc;
 	} while (1);
 
 	if (!err) {
 		ioc->ioprio = ioprio;
-		ioc->ioprio_changed = 1;
+		/* make sure schedulers see the new ioprio value */
+		wmb();
+		for (i = 0; i < IOC_IOPRIO_CHANGED_BITS; i++)
+			set_bit(i, ioc->ioprio_changed);
 	}
 
 	task_unlock(task);
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index b2eee89..7cbace3 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -1,14 +1,14 @@
 #ifndef IOCONTEXT_H
 #define IOCONTEXT_H
 
+#include <linux/bitmap.h>
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 
-struct cfq_queue;
 struct cfq_io_context {
 	void *key;
 
-	struct cfq_queue *cfqq[2];
+	void *cfqq[2];
 
 	struct io_context *ioc;
 
@@ -28,6 +28,16 @@ struct cfq_io_context {
 };
 
 /*
+ * Indexes into the ioprio_changed bitmap.  A bit set indicates that
+ * the corresponding I/O scheduler needs to see a ioprio update.
+ */
+enum {
+	IOC_CFQ_IOPRIO_CHANGED,
+	IOC_BFQ_IOPRIO_CHANGED,
+	IOC_IOPRIO_CHANGED_BITS
+};
+
+/*
  * I/O subsystem state of the associated processes.  It is refcounted
  * and kmalloc'ed. These could be shared between processes.
  */
@@ -39,7 +49,7 @@ struct io_context {
 	spinlock_t lock;
 
 	unsigned short ioprio;
-	unsigned short ioprio_changed;
+	DECLARE_BITMAP(ioprio_changed, IOC_IOPRIO_CHANGED_BITS);
 
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 	unsigned short cgroup_changed;
@@ -53,6 +63,8 @@ struct io_context {
 
 	struct radix_tree_root radix_root;
 	struct hlist_head cic_list;
+	struct radix_tree_root bfq_radix_root;
+	struct hlist_head bfq_cic_list;
 	void __rcu *ioc_data;
 };
 
-- 
1.7.4.1

