/*-
 * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id$
 * $FreeBSD$
 *
 * This code implements a round-robin anticipatory scheduler, with
 * per-client queues.
 * The goal of this scheduler is to give weighted-fair service to
 * devices where seek times are not relevant (e.g. flash memories).
 * In these environments, and in presence of synchronous requests
 * as typically issued by disk clients, we might be unable to achieve
 * weighted-fair service because flows are never backlogged after
 * their request has been served.
 * We use anticipation to simulate an arrival rate that approximates the
 * weight of each flow. In detail:
 *
 * The system is organized in rounds, during which each flow can receive
 * an amount of service proportional to its weight.
 * For each flow we keep the amount of service received so far,
 * and also the 'round' in which it should be served first.
 *
 * The system maintains a 'current round' index, and three round-robin
 * lists of flows:
 * CURR: flows with pending requests for the current round,
 * NEXT: flows with pending requests that cannot be served in this round;
 * STAGING: flows which received service in the current round but have
 *   no pending requests.
 * The scheduler has three states: READY, BUSY, IDLING.
 *
 * Initially, all flows start idle and with a service=0.
 *
 * When a request arrives for an idle flow, it is put at the end of
 * CURR or NEXT depending on the amount of service
 * received so far. Requests for busy flows are simply appended to the
 * list of pending requests for the same flow.
 * We then generate a 'dispatch' event.
 *
 * On a dispatch event:
 * if state == BUSY, just return; otherwise,
 * if CURR is not empty, serve the first request from it;
 * if CURR is empty, state is READY (i.e. not IDLING), and NEXT
 * contains entries, the current round number is incremented, and
 * we call dispatch again;
 * otherwise, just return.
 *
 * When a flow is served, the scheduler goes into BUSY state, stops a
 * timer if any, charges the flow for the request, and depending on
 * residual requests and the amount of work received, the flow
 * - remains at the head of CURR if it has pending requests
 *   and is still within the budget;
 * - is moved to the tail of the NEXT queue if it has pending
 *   requests but it has exhausted its budget;
 * - is moved to the STAGING queue if it has no more pending requests.
 *
 * When a service request is completed:
 * if CURR is not empty, or both CURR and STAGING are empty,
 *     change state to READY and call dispatch;
 * otherwise (CURR is empty but STAGING is not empty), set state to IDLING,
 *    start a timer and return.
 *
 * When a timer expires, drain the STAGING queue, change state to READY
 * and call dispatch.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include "gs_scheduler.h"

struct g_ssd_softc;

/*
 * Per client queue structure.  Each client in the system is
 * represented by this structure where we store the client's
 * requests.
 *
 * A queue with pending requests is linked in the round-robin list of
 * its scheduler instance through q_tailq; a queue that ran out of
 * requests before using up its budget may instead be parked on the
 * staging list through q_staging (see g_ssd_next()).
 */
struct g_ssd_queue {
	struct g_ssd_softc *q_sc;	/* reference to the parent */

	struct bio_queue_head q_bioq;	/* requests waiting to be served */
	unsigned int	q_service;	/* service charged so far (sum of the
					 * bio_length of the requests served),
					 * cleared when the budget is used up
					 * or the queue leaves staging by
					 * timeout */
	unsigned int	q_budget;	/* service allowed before the queue is
					 * moved to the next round, in bytes */

	int		q_round;	/* round in which the queue is served */
	int		q_wait_end;	/* idling window end, in ticks */

	LIST_ENTRY(g_ssd_queue) q_staging; /* staging list link field */
	TAILQ_ENTRY(g_ssd_queue) q_tailq; /* RR list link field */
};

/*
 * List types: g_ssd_tailq is the head of the round-robin list (queues
 * linked through q_tailq), g_ssd_staging the head of the staging list
 * (queues linked through q_staging).
 */
TAILQ_HEAD(g_ssd_tailq, g_ssd_queue);
LIST_HEAD(g_ssd_staging, g_ssd_queue);

/*
 * Default slice for RR between queues: 0x00800000 bytes, i.e. 8192 KB,
 * the same value as the default of the slice_kb tunable.
 * NOTE(review): not referenced in the code visible here; the budget of
 * each queue is computed from slice_kb in g_ssd_init_class().
 */
#define	G_SSD_DEFAULT_BUDGET	0x00800000

/*
 * Per device descriptor, holding the Round Robin list of queues
 * accessing the disk, the list of staging queues, a reference to the
 * geom, the idling timer and the round bookkeeping.
 */
struct g_ssd_softc {
	struct g_geom	*sc_geom;	/* geom this instance is attached to */

	int		sc_nqueues;	/* Number of queues of this instance
					 * (counted in g_ssd_init_class() and
					 * g_ssd_fini_class()). */
	struct callout	sc_wait;	/* Idling timer, runs
					 * g_ssd_wait_timeout(). */
	struct g_ssd_tailq sc_ssd_tailq; /* The round-robin list. */
	struct g_ssd_staging sc_staging; /* List of staging queues. */

	int		sc_round;	/* Round of the queue selected last. */
	int		sc_next_round;	/* Round given to newly activated
					 * queues. */
	int		sc_in_flight;	/* Requests in the driver. */
	int		sc_wait_ticks;	/* Idling time in ticks, computed from
					 * wait_ms in g_ssd_init(). */
	int		sc_waiting;	/* Nonzero while sc_wait is armed. */
};

/*
 * Descriptor for bounded values: x_cur is the current setting (it can
 * be changed through sysctl), x_min and x_max the limits enforced by
 * get_bounded() when the value is read.
 */
struct x_bound {		
	int	x_min;		/* lowest value returned */
	int	x_cur;		/* current, unchecked setting */
	int	x_max;		/* highest value returned */
};

/*
 * Parameters, config and stats, shared by all the instances of the
 * scheduler.
 */
struct g_ssd_params {
	int	queue_depth;		/* Max nr. of parallel requests. */
	int	units;			/* How many instances. */
	int	queues;			/* Total number of queues. */
	int	qrefs;			/* Total number of refs to queues.
					 * NOTE(review): not updated anywhere
					 * in the code visible here. */

	struct x_bound wait_ms;		/* Wait time in milliseconds. */
	struct x_bound slice_kb;	/* slice size in Kb (1024 bytes) */
};

/*
 * The one global instance of the parameters.  The x_bound initializers
 * are positional, in the order { x_min, x_cur, x_max }; the members not
 * listed here (the counters) start at zero.
 */
static struct g_ssd_params me = {
	.queue_depth =	8,
	.wait_ms =	{ 1, 	10,	30},
	.slice_kb =	{ 16, 	8192,	65536},
};

/*
 * Sysctl interface, under the ssd node of _kern_geom_sched:
 *  - units and queues are read-only counters;
 *  - wait_ms, slice_kb and queue_depth are writable tunables.
 * The values written are not range-checked here: wait_ms and slice_kb
 * are clamped by get_bounded() when they are read.  wait_ms is read
 * when an instance is created (g_ssd_init()), slice_kb when a queue is
 * created (g_ssd_init_class()), queue_depth at each g_ssd_next() call.
 * NOTE(review): the variables exported are declared int but use
 * SYSCTL_UINT -- confirm this is intended.
 */
SYSCTL_DECL(_kern_geom_sched);
SYSCTL_NODE(_kern_geom_sched, OID_AUTO, ssd, CTLFLAG_RW, 0,
    "GEOM_SCHED ANTICIPATORY SSD stuff");
SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, units, CTLFLAG_RD,
    &me.units, 0, "Scheduler instances");
SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, queues, CTLFLAG_RD,
    &me.queues, 0, "Total ssd queues");
SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, wait_ms, CTLFLAG_RW,
    &me.wait_ms.x_cur, 0, "Wait time milliseconds");
SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, slice_kb, CTLFLAG_RW,
    &me.slice_kb.x_cur, 0, "Slice size Kbytes");
SYSCTL_UINT(_kern_geom_sched_ssd, OID_AUTO, queue_depth, CTLFLAG_RW,
    &me.queue_depth, 0, "Maximum simultaneous requests");

/*
 * Read a bounded value: the current setting of v, clamped to the
 * [x_min, x_max] interval.
 *
 * If t_min is zero the clamped value is returned as is.  Otherwise the
 * value is taken as milliseconds and converted to ticks, and the
 * result is never smaller than t_min ticks.
 */
static int
get_bounded(struct x_bound *v, int t_min)
{
	int val;

	/* Sample the setting once, then force it within the limits. */
	val = v->x_cur;
	val = (val < v->x_min) ? v->x_min :
	    ((val > v->x_max) ? v->x_max : val);

	/* No conversion requested. */
	if (t_min == 0)
		return (val);

	/* Milliseconds to ticks, with a floor of t_min. */
	val = val * hz / 1000;
	return (val < t_min ? t_min : val);
}

/*
 * Get a reference to the queue that holds requests for tp, allocating
 * it if necessary.
 */
static int
g_ssd_init_class(void *data, void *priv, struct thread *tp)
{
	struct g_ssd_softc *sc = data;
	struct g_ssd_queue *qp = priv;

	qp->q_sc = sc;
	gs_bioq_init(&qp->q_bioq);

	/* compute the slice size in bytes */
	qp->q_budget = 1024 * get_bounded(&me.slice_kb, 0);

	qp->q_sc->sc_nqueues++;
	me.queues++;

	return (0);
}

/*
 * Get a reference to the queue that holds the requests of the client
 * that issued bp, as returned by g_sched_get_class().
 * The result can be NULL; g_ssd_start() handles that as an allocation
 * failure.
 */
static struct g_ssd_queue *
g_ssd_queue_get(struct g_ssd_softc *sc, struct bio *bp)
{
	struct g_ssd_queue *queue;

	queue = g_sched_get_class(sc->sc_geom, bp);

	return (queue);
}
 
/*
 * Drop one reference to the queue, through g_sched_put_class() on the
 * geom of the instance the queue belongs to.
 */
static void
g_ssd_queue_put(struct g_ssd_queue *qp)
{
	struct g_geom *gp;

	gp = qp->q_sc->sc_geom;
	g_sched_put_class(gp, qp);
}

/*
 * Class destructor: undo the accounting done by g_ssd_init_class() for
 * the queue found in priv.  The queue must have no pending requests.
 * data is not used, the instance is reached through the queue itself.
 */
static void
g_ssd_fini_class(void *data, void *priv)
{
	struct g_ssd_queue *queue;

	queue = priv;

	KASSERT(gs_bioq_first(&queue->q_bioq) == NULL,
	    ("released nonempty queue"));

	/* One queue less, overall and for the owning instance. */
	me.queues--;
	queue->q_sc->sc_nqueues--;
}

/*
 * Tell whether the queue has used up its budget: returns 1 when the
 * service charged has reached (or gone past) q_budget, 0 otherwise.
 */
static inline int
g_ssd_queue_expired(struct g_ssd_queue *qp)
{
	unsigned int charged, allowed;

	charged = qp->q_service;
	allowed = qp->q_budget;

	return (charged >= allowed ? 1 : 0);
}

/*
 * Tell whether the queue is scheduled for the round that immediately
 * follows the current one of the instance: returns 1 if so, 0 if not.
 */
static inline int
g_ssd_next_round(struct g_ssd_softc *sc, struct g_ssd_queue *qp)
{
	int following;

	following = sc->sc_round + 1;

	return (qp->q_round == following);
}


/*
 * Walk the staging list and retire the queues whose idling window is
 * over, or all of them if force is nonzero.  A retired queue gets its
 * service counter cleared, is unlinked from the list and has the
 * reference it was holding dropped.
 */
static void
g_ssd_cleanup_staging(struct g_ssd_softc *sc, int force)
{
	struct g_ssd_queue *cur, *tmp;

	LIST_FOREACH_SAFE(cur, &sc->sc_staging, q_staging, tmp) {
		/* Still inside its window and nobody is insisting. */
		if ((ticks - cur->q_wait_end) < 0 && !force)
			continue;

		cur->q_service = 0;
		LIST_REMOVE(cur, q_staging);
		g_ssd_queue_put(cur);
	}
}

/*
 * Handler of the idling timer, run when no staged queue produced a new
 * request in time.  With the scheduler lock held, mark the timer as no
 * longer armed, retire every staged queue and ask for a new dispatch.
 * data is the g_ssd_softc of the instance.
 */
static void
g_ssd_wait_timeout(void *data)
{
	struct g_ssd_softc *softc;
	struct g_geom *gp;

	softc = data;
	gp = softc->sc_geom;

	g_sched_lock(gp);

	softc->sc_waiting = 0;
	g_ssd_cleanup_staging(softc, 1);	/* unconditional drain */
	g_sched_dispatch(gp);

	g_sched_unlock(gp);
}

/*
 * called on a request arrival, timeout or completion.
 * Try to serve a request among those queued.
 *
 * data is the g_ssd_softc of the instance.  When force is nonzero the
 * limit on the requests in flight is ignored, the staging list is
 * emptied regardless of the idling windows, and a queue that runs out
 * of requests is released at once instead of being staged.
 *
 * Returns the request to dispatch, or NULL if nothing can be served
 * now: too many requests in flight, no queue with pending requests, or
 * we are idling for the staged queues before starting the next round.
 */
static struct bio *
g_ssd_next(void *data, int force)
{
	struct g_ssd_softc *sc = data;
	struct g_ssd_queue *qp;
	struct bio *bp, *next;

	/* Retire the staged queues that waited long enough (all if forced). */
	g_ssd_cleanup_staging(sc, force);

	if (!force && sc->sc_in_flight >= me.queue_depth)
		return (NULL);

	qp = TAILQ_FIRST(&sc->sc_ssd_tailq);
	if (qp == NULL)
		return (NULL);

	/*
	 * The first queue belongs to the next round, but some queue is
	 * still staged: do not start the new round yet, and make sure
	 * the timer that ends the wait is armed.
	 */
	if (!LIST_EMPTY(&sc->sc_staging) && g_ssd_next_round(sc, qp)) {
		if (!sc->sc_waiting) {
			callout_reset(&sc->sc_wait, sc->sc_wait_ticks,
			    g_ssd_wait_timeout, sc);
			sc->sc_waiting = 1;
		}
		return (NULL);
	}

	/* Select the new queue for service. */
	sc->sc_round = qp->q_round;

	/* A queue is kept in the list only while it has requests. */
	bp = gs_bioq_takefirst(&qp->q_bioq);
	KASSERT(bp != NULL, ("scheduled queue with no requests"));
	qp->q_service += bp->bio_length;
	next = gs_bioq_first(&qp->q_bioq);

	if (g_ssd_queue_expired(qp)) {
		/*
		 * Budget used up: start a new accounting period and, if
		 * there is more work, requeue for the following round;
		 * otherwise drop the reference held by the list.
		 */
		TAILQ_REMOVE(&sc->sc_ssd_tailq, qp, q_tailq);
		qp->q_service = 0;
		if (next) {
			qp->q_round++;
			sc->sc_next_round = qp->q_round;
			TAILQ_INSERT_TAIL(&sc->sc_ssd_tailq, qp, q_tailq);
		} else
			g_ssd_queue_put(qp);
	} else if (!next) {
		/* Out of requests with budget left. */
		TAILQ_REMOVE(&sc->sc_ssd_tailq, qp, q_tailq);
		if (!force) {
			/*
			 * Stage the queue.  The window set here is a
			 * long one; g_ssd_done() replaces it with
			 * sc_wait_ticks when a request of the queue
			 * completes.
			 */
			qp->q_wait_end = ticks + 30 * sc->sc_wait_ticks;
			LIST_INSERT_HEAD(&sc->sc_staging, qp, q_staging);
		} else {
			/*
			 * The queue is released without being staged.
			 * Clear the service counter as
			 * g_ssd_cleanup_staging() does: g_ssd_start()
			 * takes an idle queue of the current round with
			 * nonzero service to be on the staging list, and
			 * would unlink it from there.
			 */
			qp->q_service = 0;
			g_ssd_queue_put(qp);
		}
	}

	sc->sc_in_flight++;

	return (bp);
}

/*
 * Called when a real request for disk I/O arrives.
 * Locate the queue associated with the client and add the request to
 * it.  If the queue had no pending requests it also has to be linked
 * in the round-robin list:
 *  - if it already received service in the current round (it is one
 *    of the queues we are anticipating for) it is taken off the
 *    staging list, the timer is stopped and the queue goes back to
 *    the head of the list;
 *  - otherwise this is the first request: the queue takes a new
 *    reference, is assigned the round in sc_next_round and is added
 *    at the tail of the list.
 *
 * Returns 0 on success, -1 if no queue could be obtained.
 */
static int
g_ssd_start(void *data, struct bio *bp)
{
	struct g_ssd_softc *softc = data;
	struct g_ssd_queue *queue;
	int idle, resuming;

	/* Get the queue for the thread that issued the request. */
	queue = g_ssd_queue_get(softc, bp);
	if (queue == NULL)
		return (-1);	/* allocation failed, tell upstream */

	/* Is this an insertion into an empty queue, and of which kind? */
	idle = (gs_bioq_first(&queue->q_bioq) == NULL);
	resuming = (idle && softc->sc_round == queue->q_round &&
	    queue->q_service != 0);

	if (resuming) {
		/* The queue we were waiting for: stop anticipating. */
		LIST_REMOVE(queue, q_staging);
		callout_stop(&softc->sc_wait);
		softc->sc_waiting = 0;
		TAILQ_INSERT_HEAD(&softc->sc_ssd_tailq, queue, q_tailq);
	} else if (idle) {
		/* First request, the queue needs to be activated. */
		g_sched_priv_ref(queue);

		queue->q_round = softc->sc_next_round;
		TAILQ_INSERT_TAIL(&softc->sc_ssd_tailq, queue, q_tailq);
		KASSERT(queue->q_service == 0, ("invalid service"));
	}

	/*
	 * The reference obtained above is handed over to the request,
	 * which keeps it until g_ssd_done().
	 */
	bp->bio_caller1 = queue;
	gs_bioq_disksort(&queue->q_bioq, bp);

	return (0);
}

/*
 * Module glue -- allocate the descriptor of a new scheduler instance
 * for geom, initialize its lists and the callout structure, and
 * compute the idling time from the wait_ms tunable.
 * Returns the new descriptor.
 */
static void *
g_ssd_init(struct g_geom *geom)
{
	struct g_ssd_softc *softc;

	softc = malloc(sizeof(*softc), M_GEOM_SCHED, M_WAITOK | M_ZERO);

	TAILQ_INIT(&softc->sc_ssd_tailq);
	LIST_INIT(&softc->sc_staging);
	callout_init(&softc->sc_wait, CALLOUT_MPSAFE);

	softc->sc_geom = geom;
	/* Milliseconds converted to ticks, at least 2 of them. */
	softc->sc_wait_ticks = get_bounded(&me.wait_ms, 2);

	me.units++;

	return (softc);
}

/*
 * Module glue -- drain the callout structure, release the queues
 * still on the staging list, and free the descriptor.
 * Both lists are expected to be empty at that point.
 */
static void
g_ssd_fini(void *data)
{
	struct g_ssd_softc *softc;

	softc = data;

	/* Get rid of the timer first, then of what it was waiting for. */
	callout_drain(&softc->sc_wait);
	g_ssd_cleanup_staging(softc, 1);

	KASSERT(TAILQ_EMPTY(&softc->sc_ssd_tailq), ("still scheduled queues"));
	KASSERT(LIST_EMPTY(&softc->sc_staging), ("still staging queues"));

	me.units--;
	free(softc, M_GEOM_SCHED);
}

/*
 * Called when the request under service terminates.
 * Account for the completion, restart the idling window of the queue
 * the request came from, release the reference the request was
 * holding and, once nothing is left in flight, ask for a new dispatch.
 */
static void
g_ssd_done(void *data, struct bio *bp)
{
	struct g_ssd_softc *softc = data;
	struct g_ssd_queue *queue = bp->bio_caller1;

	softc->sc_in_flight--;

	/* The window is measured from the completion time. */
	queue->q_wait_end = ticks + softc->sc_wait_ticks;
	g_ssd_queue_put(queue);

	if (softc->sc_in_flight == 0)
		g_sched_dispatch(softc->sc_geom);
}

/*
 * Scheduler descriptor: the name of the algorithm, the size of the
 * per-client private area (one struct g_ssd_queue) and the callbacks
 * defined in this file.
 */
static struct g_gsched g_ssd = {
	.gs_name = "ssd",
	.gs_priv_size = sizeof(struct g_ssd_queue),
	.gs_init = g_ssd_init,
	.gs_fini = g_ssd_fini,
	.gs_start = g_ssd_start,
	.gs_done = g_ssd_done,
	.gs_next = g_ssd_next,
	.gs_init_class = g_ssd_init_class,
	.gs_fini_class = g_ssd_fini_class,
};

/* Declare the scheduler module, named ssd, with the descriptor above. */
DECLARE_GSCHED_MODULE(ssd, &g_ssd);
