/*
 * nasd_od_linuxio.c
 *
 * Linux kernel drive I/O module
 *
 * Authors: Jim Zelenka, Nat Lanza
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1999, 2000.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#define __KERNEL_SYSCALLS__
#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_od.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_sys.h>
#include <nasd/nasd_timeout.h>
#include <nasd/nasd_nonce_mgmt.h>
#define NASD_DRIVE_IO_MODULE 1
#include <nasd/nasd_drive_io.h>
#include <nasd/nasd_ioqueue.h>

#undef asm

#include <linux/config.h>
#include <linux/version.h>
#include <linux/types.h>
#include <linux/malloc.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/utime.h>
#include <linux/time.h>
#include <linux/kdev_t.h>
#include <linux/wait.h>
#include <linux/locks.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/swap.h>
#include <linux/unistd.h>
#include <linux/version.h>

#include <asm/uaccess.h>

#include <asm/unistd.h>
/*
 * Don't want to give me the syscalls? Fine, I'll just TAKE them.
 */
/*
 * File-private syscall stubs generated with the _syscallN() macros.
 * readv/writev are used by the I/O threads below; _llseek is only
 * generated on non-alpha builds.
 */
static inline _syscall3(int,ioctl,int,fd,unsigned int,cmd,unsigned long,arg)
static inline _syscall3(int,readv,int,fd,struct iovec *,iov,int,iov_count)
static inline _syscall3(int,writev,int,fd,struct iovec *,iov,int,iov_count)
#ifndef __alpha__
static inline _syscall5(int,_llseek,int,fd,unsigned long,offset_high,unsigned long,
  offset_low,loff_t *,result,unsigned int,origin)
#endif /* !__alpha__ */
/*
 * File-local errno. It is printed in I/O failure diagnostics below;
 * presumably the _syscallN() stubs store their error code here -- confirm.
 */
static int errno; /* burn in hell, me, yep */

/*
 * I love how kernel interfaces stay so constant between
 * MINOR REVISIONS of a STABLE RELEASE CYCLE. Fucking freaks.
 *
 * Sure, explicitly testing kernel versions is ugly, but if
 * they're going to change symbol names between minor versions, I
 * don't really have much choice.
 */
/*
 * GLOBAL_EVENT names the kernel-wide event counter that the seek helpers
 * increment to produce a new f_version; the symbol is "event" before
 * 2.2.14 and "global_event" from 2.2.14 on.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,2,14)
# define GLOBAL_EVENT event
#else
# define GLOBAL_EVENT global_event
#endif

/*
 * We have two implementations of the actual disk I/O stuff
 * buried in here. One attempts to use the low-level I/O
 * interfaces via ll_rw_block(). This fails because that
 * function seems to want everything to be exactly 1k long.
 * We have massive deadline pressure here and I don't really
 * have the time to figure out what's wrong with my usage
 * of these interfaces, or what the right interfaces are.
 * There's nothing even remotely like documentation for any
 * of this, so I had to blindly guess. Anyway, if you set
 * this value nonzero, it will enable that code.
 */
/* 0 selects the syscall-based I/O path; nonzero selects the ll_rw_block() path */
#define NASD_LINUX_USE_LLIO 0

/* Thread group tracking the periodic dirty-block flush thread */
nasd_threadgroup_t nasd_od_linux_flush_threadgroup;

void nasd_od_linux_flush_proc(nasd_threadarg_t ignored);

/* Wait queue the flush thread sleeps on between flush passes */
struct wait_queue *nasd_od_linux_flush_wq = NULL;

/* Wake the flush thread early (used at shutdown) */
#define FLUSH_PROC_KICK() wake_up(&nasd_od_linux_flush_wq)

/* Device the drive operates on; set by nasd_od_io_init() */
kdev_t nasd_od_linux_kdev;

/* NOTE(review): not referenced in the visible part of this file -- confirm use elsewhere */
nasd_threadgroup_t nasd_od_linux_handler_group;
struct wait_queue *nasd_od_linux_wq;

/*
 * Linux requires us to set a special "DMA" flag when allocating
 * memory that will be used for I/O. We can't set this flag on
 * every allocation or we exhaust resources, so we set it only
 * when allocating I/O pages. We also have to have this temporary
 * buf, nasd_od_linux_headerbuf, to use as a bounce-buffer when
 * reading and writing disk headers.
 */
/* Allocated with GFP_DMA in nasd_od_io_init(); released by the shutdown hook */
nasd_od_disk_t *nasd_od_linux_headerbuf = NULL;

/*
 * The linux page allocator wants to give us 2^n pages.
 * NASD_OD_LINUX_PAGEBLOCK_ORD is lg(NASD_OD_BASIC_BLOCKSIZE/PAGE_SIZE)
 */
/* Number of kernel pages occupied by one basic drive block */
#define NASD_OD_BASIC_BLOCK_PAGES (NASD_OD_BASIC_BLOCKSIZE/PAGE_SIZE)

/*
 * Only the 1-page and 2-page cases are handled here; for any other
 * ratio NASD_OD_LINUX_PAGEBLOCK_ORD is left undefined.
 */
#if NASD_OD_BASIC_BLOCK_PAGES == 1
#define NASD_OD_LINUX_PAGEBLOCK_ORD 0
#endif /* NASD_OD_BASIC_BLOCK_PAGES == 1 */

#if NASD_OD_BASIC_BLOCK_PAGES == 2
#define NASD_OD_LINUX_PAGEBLOCK_ORD 1
#endif /* NASD_OD_BASIC_BLOCK_PAGES == 2 */

/*
 * Hacking this thing to try to work within the immensely
 * restrictive confines of the linux kernel has been such
 * an abusive process that I've acquired a kind of amorality
 * about hacks like this.
 */
/* Size of the statically allocated page-buffer pool */
#define NASD_OD_LINUX_NUM_EVILPAGES 512
nasd_pagebuf_t nasd_od_linux_evilpages[NASD_OD_LINUX_NUM_EVILPAGES];

/*
 * True when _buf_ points anywhere inside the static pool above
 * (address-range test: first element inclusive, one-past-last exclusive).
 */
#define NASD_OD_LINUX_ISEVIL_PAGE(_buf_) \
  ((((unsigned long)(_buf_)) >= ((unsigned long)nasd_od_linux_evilpages)) && \
    (((unsigned long)(_buf_)) < \
      ((unsigned long)&nasd_od_linux_evilpages[NASD_OD_LINUX_NUM_EVILPAGES])))

/* Head of the singly linked list of pool entries, built by nasd_od_linux_evil_init() */
nasd_pagebuf_t *nasd_od_linux_evil_pagelist = NULL;

/*
 * Thread every entry of the static nasd_od_linux_evilpages[] pool onto
 * nasd_od_linux_evil_pagelist. Entries are pushed at the head in index
 * order, so the last array element ends up first on the list.
 */
void
nasd_od_linux_evil_init()
{
  nasd_pagebuf_t *page, *pool_end;

  pool_end = &nasd_od_linux_evilpages[NASD_OD_LINUX_NUM_EVILPAGES];
  for (page = &nasd_od_linux_evilpages[0]; page != pool_end; page++) {
    /* push onto the front of the list */
    page->next = nasd_od_linux_evil_pagelist;
    nasd_od_linux_evil_pagelist = page;
  }

  nasd_printf("DRIVE: created %d evil pages\n", NASD_OD_LINUX_NUM_EVILPAGES);
}

#if NASD_LINUX_USE_LLIO == 0

/*
 * Record the position of the most recently completed I/O.
 * _off_ is a byte offset; it is converted to a sector number by shifting
 * right NASD_OD_SECT_SHIFT bits and stored in nasd_od_io_last_completed_sect.
 * Expands to a braced block, so the argument is evaluated exactly once.
 */
#define NASD_IO_LASTCOMP_ASSIGN(_off_) { \
  loff_t _off; \
  _off = (_off_); \
  _off >>= NASD_OD_SECT_SHIFT; \
  nasd_od_io_last_completed_sect = _off; \
}

/* Thread group for the nasd_od_linuxio_proc() worker threads */
nasd_threadgroup_t nasd_k_io_threadgroup;

/* Path of the backing device; each worker thread open()s it for itself */
char nasd_k_devname[MAXPATHLEN];
/* Serializes use of nasd_k_extrafile (header I/O and synchronous flushes) */
NASD_DECLARE_MUTEX(nasd_k_extrafd_lock)
/* Extra open file on the device, used outside the worker threads */
struct file *nasd_k_extrafile;
/* Set to NULL in nasd_od_io_init(); not otherwise used in this file */
fl_owner_t nasd_k_extrafile_id;

/* Signalled by worker threads as they start and as they exit */
NASD_DECLARE_COND(nasd_k_run_cond)
/* Signalled when work is added to nasd_k_pending, broadcast at shutdown */
NASD_DECLARE_COND(nasd_k_q_cond)
/* Protects nasd_k_pending and nasd_od_io_ios_outstanding in this path */
NASD_DECLARE_MUTEX(nasd_k_q_lock)

#define NASD_IO_EFD_LOCK()   NASD_LOCK_MUTEX(nasd_k_extrafd_lock)
#define NASD_IO_EFD_UNLOCK() NASD_UNLOCK_MUTEX(nasd_k_extrafd_lock)

/*
 * If I/O lock must be held, lock before this
 */
#define NASD_IO_PEND_LOCK()   NASD_LOCK_MUTEX(nasd_k_q_lock)
#define NASD_IO_PEND_UNLOCK() NASD_UNLOCK_MUTEX(nasd_k_q_lock)

/* Number of times a short or failed readv/writev is retried before panicking */
int nasd_k_max_retries = 3;

/* Sentinel head of the circular cnext/cprev list of I/O chains awaiting a worker */
nasd_odc_ent_t nasd_k_pending;

/* Local definition of the "absolute offset" whence value used by the seek helpers */
#define SEEK_SET 0

extern void nasd_linux_drive_module_start();
extern void nasd_linux_drive_module_stop();

/*
 * Helpers for temporarily running the current task with uid/euid/suid/fsuid
 * of 0 and the address limit set to KERNEL_DS, so that file operations can
 * be issued on kernel buffers.
 *
 * NASD_OD_LINUX_SAVEPROC_DECL declares the locals that hold the saved state;
 * place it among the declarations of any function using the other two.
 */
#define NASD_OD_LINUX_SAVEPROC_DECL \
  uid_t oldsuid, olduid, oldeuid, oldfsuid; \
  mm_segment_t oldfs;

/*
 * Save the task's address limit and the four uid fields, then elevate.
 * Each saved value is read from the field that NASD_OD_LINUX_RESTOREPROC()
 * later writes it back to (oldsuid comes from suid, not fsuid).
 * Must not be invoked twice without an intervening RESTOREPROC: a second
 * call would overwrite the saved values with the already-elevated ones.
 */
#define NASD_OD_LINUX_SAVEPROC() { \
  oldfs = get_fs(); \
  set_fs(KERNEL_DS); \
  olduid = current->uid; \
  oldeuid = current->euid; \
  oldsuid = current->suid; \
  oldfsuid = current->fsuid; \
  current->uid = 0; \
  current->euid = 0; \
  current->suid = 0; \
  current->fsuid = 0; \
}

/*
 * Undo NASD_OD_LINUX_SAVEPROC(): put back the saved uid fields and the
 * saved address limit.
 */
#define NASD_OD_LINUX_RESTOREPROC() { \
  current->uid = olduid; \
  current->euid = oldeuid; \
  current->suid = oldsuid; \
  current->fsuid = oldfsuid; \
  set_fs(oldfs); \
}

#ifndef __alpha__
#ifdef lseek
#undef lseek
#endif /* lseek */

#define do_lseek(_file_,_off_,_whence_) _do_lseek(_file_,_off_,_whence_,__FILE__,__LINE__)
#define do_lseek_fd(_fd_,_off_,_whence_) _do_lseek_fd(_fd_,_off_,_whence_,__FILE__,__LINE__)

/*
 * Home-grown seek on a struct file, used because the lseek()/_llseek()
 * syscalls did not work from here.
 *
 * fil    - file to reposition
 * off    - requested offset
 * whence - seek origin; only SEEK_SET is accepted when the file has no
 *          llseek method (anything else panics)
 * file, line - call site, supplied by the do_lseek() wrapper for diagnostics
 *
 * Runs entirely under the big kernel lock. Always returns the requested
 * offset "off", including when the file is missing its dentry/inode or
 * when the file's own llseek reports a different position (those cases
 * are only logged).
 */
static inline loff_t
_do_lseek(
  struct file  *fil,
  loff_t        off,
  int           whence,
  char         *file,
  int           line)
{
  struct dentry *dent;
  loff_t landed;

  lock_kernel();

  dent = fil->f_dentry;
  if (dent == NULL) {
    nasd_printf("seek at %s:%d file=0x%lx no dentry\n", file, line, fil);
    goto out;
  }
  if (dent->d_inode == NULL) {
    nasd_printf("seek at %s:%d file=0x%lx no inode\n", file, line, fil);
    goto out;
  }

  if ((fil->f_op == NULL) || (fil->f_op->llseek == NULL)) {
    /* no file-specific seek: reposition by hand */
    if (whence != SEEK_SET)
      NASD_PANIC();
    fil->f_pos = off;
    fil->f_reada = 0;
    fil->f_version = ++GLOBAL_EVENT;
    goto out;
  }

  nasd_printf("call file-specific seek to %" NASD_64s_FMT "\n", off);
  landed = fil->f_op->llseek(fil, off, whence);
  if (landed != off) {
    nasd_printf("got=%" NASD_64s_FMT " off=%" NASD_64s_FMT " file=0x%lx\n",
      landed, off, fil);
    nasd_printf("seek was (0x%lx, %" NASD_64s_FMT ", %d) at %s:%d\n",
      fil, off, whence, file, line);
  }

out:
  unlock_kernel();
  return(off);
}

/*
 * Descriptor-based variant of the home-grown seek: looks the file up with
 * fget(), repositions it, and drops the reference again.
 *
 * fd     - descriptor in the current task's file table
 * off    - requested offset
 * whence - seek origin; only SEEK_SET is accepted when the file has no
 *          llseek method (anything else panics)
 * file, line - call site, supplied by the do_lseek_fd() wrapper
 *
 * Runs under the big kernel lock. Always returns "off"; a bad descriptor,
 * missing dentry/inode, or a mismatching llseek result is only logged.
 */
static inline loff_t
_do_lseek_fd(
  int      fd,
  loff_t   off,
  int      whence,
  char    *file,
  int      line)
{
  struct dentry *dent;
  struct file *fp;
  loff_t landed;

  lock_kernel();

  fp = fget(fd);
  if (fp == NULL) {
    nasd_printf("seek at %s:%d fd=%d fd is BAD\n", file, line, fd);
    goto out_unlock;
  }

  dent = fp->f_dentry;
  if (dent == NULL) {
    nasd_printf("seek at %s:%d fd=%d no dentry\n", file, line, fd);
    goto out_put;
  }
  if (dent->d_inode == NULL) {
    nasd_printf("seek at %s:%d fd=%d no inode\n", file, line, fd);
    goto out_put;
  }

  if ((fp->f_op == NULL) || (fp->f_op->llseek == NULL)) {
    /* no file-specific seek: reposition by hand */
    if (whence != SEEK_SET)
      NASD_PANIC();
    fp->f_pos = off;
    fp->f_reada = 0;
    fp->f_version = ++GLOBAL_EVENT;
    goto out_put;
  }

  nasd_printf("call file-specific seek to %" NASD_64s_FMT "\n", off);
  landed = fp->f_op->llseek(fp, off, whence);
  if (landed != off) {
    nasd_printf("got=%" NASD_64s_FMT " off=%" NASD_64s_FMT " fd=%d\n",
      landed, off, fd);
    nasd_printf("seek was (%d, %" NASD_64s_FMT ", %d) at %s:%d\n",
      fd, off, whence, file, line);
  }

out_put:
  fput(fp);
out_unlock:
  unlock_kernel();
  return(off);
}

/*
 * Write "count" bytes from "buf" to "file" at absolute offset "pos" by
 * calling the file's write method directly with a private position
 * variable (the file's own f_pos is not passed in).
 *
 * Returns whatever the write method returns, or -EBADF when the file is
 * not open for writing or has no write method. The whole call runs under
 * the big kernel lock, and the write itself under the inode semaphore.
 */
static inline ssize_t
do_write_pos(
  struct file  *file,
  void         *buf,
  size_t        count,
  loff_t        pos)
{
  struct inode *ino;
  loff_t cursor;
  int nwritten;

  lock_kernel();

  nwritten = -EBADF;
  if (file->f_mode & FMODE_WRITE) {
    ino = file->f_dentry->d_inode;
    if ((file->f_op != NULL) && (file->f_op->write != NULL)) {
      /*
       * Holding i_sem across the write serializes all writers on this
       * inode, so there is no write concurrency here.
       */
      cursor = pos;
      down(&ino->i_sem);
      nwritten = file->f_op->write(file, buf, count, &cursor);
      up(&ino->i_sem);
    }
  }

  unlock_kernel();
  return(nwritten);
}

/*
 * Read "count" bytes into "buf" from "file" at absolute offset "pos" by
 * calling the file's read method directly with a private position
 * variable (the file's own f_pos is not passed in).
 *
 * Returns whatever the read method returns, or -EBADF when the file is
 * not open for reading or has no read method. Runs under the big kernel
 * lock. Unlike do_write_pos(), the inode semaphore is not taken, so the
 * inode is not looked up at all.
 */
static inline ssize_t
do_read_pos(
  struct file  *file,
  void         *buf,
  size_t        count,
  loff_t        pos)
{
  loff_t tmp_pos;
  int ret;

  lock_kernel();

  if (!(file->f_mode & FMODE_READ)) {
    ret = -EBADF;
    goto done;
  }
  if ((file->f_op == NULL) || (file->f_op->read == NULL)) {
    ret = -EBADF;
    goto done;
  }

  /* read through a scratch offset so file->f_pos is left alone here */
  tmp_pos = pos;
  ret = file->f_op->read(file, buf, count, &tmp_pos);

done:
  unlock_kernel();
  return(ret);
}

#endif /* !__alpha__ */

#endif /* NASD_LINUX_USE_LLIO == 0 */

/*
 * Shutdown hook: destroy the flush thread group. A failure is reported
 * with a warning and otherwise ignored. The argument is unused.
 */
void
nasd_od_linux_shutdown_flush_threadgroup(
  void  *ignored)
{
  nasd_status_t status;

  status = nasd_destroy_threadgroup(&nasd_od_linux_flush_threadgroup);
  if (!status)
    return;

  nasd_printf(
    "DRIVE WARNING: got 0x%x (%s) destroying "
    "nasd_od_linux_flush_threadgroup\n",
    status, nasd_error_string(status));
}

/*
 * Stop the periodic flush thread and wait for it to exit.
 * The argument is unused.
 */
void
nasd_od_linux_stop_flush_proc(
  void  *ignored)
{
  nasd_odc_dirty_kick();
  /* mark the group as shutting down before waking the sleeper */
  NASD_THREADGROUP_INDICATE_SHUTDOWN(&nasd_od_linux_flush_threadgroup);
  /* wake the flush thread out of its timed sleep so it sees the flag */
  FLUSH_PROC_KICK();
  NASD_THREADGROUP_WAIT_STOP(&nasd_od_linux_flush_threadgroup);
}

/*
 * Body of the periodic flush thread. Until the flush thread group is
 * told to shut down, it sleeps on nasd_od_linux_flush_wq for up to five
 * seconds and then flushes dirty cache blocks via nasd_odc_flush_dirty(0).
 * A flush failure is fatal (panic). The thread argument is unused.
 */
void
nasd_od_linux_flush_proc(
  nasd_threadarg_t  ignored)
{
  struct timespec nap;
  nasd_status_t status;

  NASD_THREADGROUP_RUNNING(&nasd_od_linux_flush_threadgroup);

  for (;;) {
    if (NASD_THREADGROUP_SHUTDOWNP(&nasd_od_linux_flush_threadgroup))
      break;

    /*
     * XXX make this configurable or something
     */
    nap.tv_sec = 5;
    nap.tv_nsec = 0;
    interruptible_sleep_on_timeout(&nasd_od_linux_flush_wq,
      timespec_to_jiffies(&nap));

    /* we may have been woken specifically to shut down */
    if (NASD_THREADGROUP_SHUTDOWNP(&nasd_od_linux_flush_threadgroup))
      break;

    NASD_IO_INC_STAT(auto_flush);
    status = nasd_odc_flush_dirty(0);
    if (status) {
      NASD_PANIC();
    }
  }

  NASD_THREADGROUP_DONE(&nasd_od_linux_flush_threadgroup);
}

#if NASD_LINUX_USE_LLIO > 0

nasd_status_t nasd_od_io_launch_internal(nasd_odc_ent_t *entlist);
void nasd_od_k_io_done(struct buffer_head *bh, int uptodate);

/*
 * b_end_io completion callback for buffers submitted by
 * nasd_od_io_launch_internal().
 *
 * bh       - the completed buffer head; b_dev_id carries the cache entry
 * uptodate - nonzero when the transfer succeeded
 *
 * On success the buffer is marked up to date, the entry is detached from
 * its I/O chain and completion is announced through nasd_od_io_iodone().
 * A failed transfer clears the up-to-date bit and panics.
 *
 * If the entry was the head of its chain (NASD_CI_IOHEAD), the next chain
 * is dequeued and launched; when nothing is queued the outstanding-I/O
 * count is dropped instead.
 */
void
nasd_od_k_io_done(
  struct buffer_head  *bh,
  int                  uptodate)
{
  nasd_odc_ent_t *ent, *el;
  nasd_status_t rc;
  int washead;
  loff_t off;

  /*
   * Same polarity as the other completion callbacks in this file:
   * nonzero uptodate means the data is valid.
   */
  if (uptodate) {
    set_bit(BH_Uptodate, &bh->b_state);
  }
  else {
    clear_bit(BH_Uptodate, &bh->b_state);
    NASD_PANIC();
  }

  ent = (nasd_odc_ent_t *)bh->b_dev_id;

  NASD_IO_LOCK();

  /* remember head status before the flag is cleared below */
  if (ent->io_flags&NASD_CI_IOHEAD)
    washead = 1;
  else
    washead = 0;

  ent->io_flags &= ~(NASD_CI_DISPATCH|NASD_CI_IOHEAD);
  NASD_IO_TM_COMPLETE(ent);
#if NASD_DRIVE_DEBUG_PHYS_OUTSTANDING > 0
  ent->kbuf = NULL;
#endif /* NASD_DRIVE_DEBUG_PHYS_OUTSTANDING > 0 */

  nasd_od_io_last_completed_sect = ent->real_sectno;

  ent->inext = ent->iprev = NULL;
  ent->cnext = ent->cprev = NULL;

  NASD_IO_UNLOCK();

  /* announce completion without the I/O lock held */
  nasd_od_io_iodone(ent);

  if (washead) {
    NASD_IO_LOCK();

    nasd_od_io_deq_next(&el, 1);

    if (el) {
      rc = nasd_od_io_launch_internal(el);
      if (rc) {
        NASD_PANIC();
      }
    }
    else {
      /* nothing more queued: this I/O slot is free again */
      nasd_od_io_ios_outstanding--;
    }

    NASD_IO_UNLOCK();
  }
}

/*
 * Caller holds I/O lock here
 */
/*
 * Submit a chain of cache entries (linked through inext) to the block
 * layer with a single ll_rw_block() call.
 *
 * entlist - head of the chain; every entry must already be flagged
 *           NASD_CI_DISPATCH and share the same iodir. The chain may hold
 *           at most NASD_IO_MAX_COALESCE entries.
 *
 * Each entry's buffer head (e->rbh) is filled in to describe one basic
 * block at e->real_sectno, with nasd_od_k_io_done() as completion callback
 * and the entry itself as b_dev_id. The head entry is flagged
 * NASD_CI_IOHEAD so the completion path knows to pull the next chain.
 *
 * Returns NASD_SUCCESS once the request has been handed to the block
 * layer; an unknown I/O direction panics. Callers treat any nonzero
 * return as a launch failure.
 */
nasd_status_t
nasd_od_io_launch_internal(
  nasd_odc_ent_t  *entlist)
{
  struct buffer_head *bharr[NASD_IO_MAX_COALESCE], *bh;
  nasd_odc_ent_t *e;
  int i, dir;

  NASD_IO_INC_STAT(pull_ios);

  dir = entlist->iodir;

  for(i=0,e=entlist;e;e=e->inext,i++) {
    NASD_ASSERT((e->io_flags&NASD_CI_DISPATCH) == NASD_CI_DISPATCH);
    NASD_ASSERT(e->iodir == dir);
    NASD_ASSERT(i < NASD_IO_MAX_COALESCE);
    bh = bharr[i] = e->rbh;

    /* describe the transfer */
    bh->b_next = NULL;
    bh->b_blocknr = e->real_sectno;
    bh->b_size = NASD_OD_BASIC_BLOCKSIZE;
    bh->b_dev = nasd_od_linux_kdev;
    bh->b_rdev = nasd_od_linux_kdev;
    bh->b_rsector = e->real_sectno;
    bh->b_this_page = NULL;
    bh->b_state = 0;
    bh->b_next_free = NULL;
    bh->b_count = 1;

    bh->b_data = e->data.buf;
    bh->b_list = BUF_CLEAN;
    bh->b_flushtime = 0;

    init_waitqueue(&bh->b_wait);
    bh->b_pprev = NULL;
    bh->b_prev_free = NULL;
    bh->b_reqnext = NULL;

    /* completion callback gets the entry back through b_dev_id */
    bh->b_end_io = nasd_od_k_io_done;
    bh->b_dev_id = e;

    NASD_ASSERT((e->io_flags&NASD_CI_DISPATCH) == NASD_CI_DISPATCH);
    NASD_ASSERT(e->iodir == dir);

#if NASD_DRIVE_DEBUG_PHYS_OUTSTANDING > 0
    e->kbuf = bh;
#endif /* NASD_DRIVE_DEBUG_PHYS_OUTSTANDING > 0 */
  }
  NASD_ASSERT(i > 0);

  entlist->io_flags |= NASD_CI_IOHEAD;

  /* NOTE(review): READA/WRITEA are the read-ahead request types -- confirm intended */
  if (dir == NASD_U_READ) {
    ll_rw_block(READA, i, bharr);
  }
  else if (dir == NASD_U_WRITE) {
    ll_rw_block(WRITEA, i, bharr);
  }
  else {
    NASD_PANIC();
  }

  /* the request was submitted; completion arrives via nasd_od_k_io_done() */
  return(NASD_SUCCESS);
}

/*
 * Caller holds I/O lock here
 */
/*
 * Launch a chain of I/Os if an I/O slot is available.
 * Returns NASD_IOSYS_FULL when nasd_od_ioq_max_outstanding I/Os are
 * already in flight; otherwise claims a slot and returns the result of
 * nasd_od_io_launch_internal().
 */
nasd_status_t
nasd_od_io_launch(
  nasd_odc_ent_t  *entlist)
{
  if (nasd_od_io_ios_outstanding < nasd_od_ioq_max_outstanding) {
    nasd_od_io_ios_outstanding++;
    return(nasd_od_io_launch_internal(entlist));
  }

  return(NASD_IOSYS_FULL);
}

/*
 * b_end_io callback for asynchronous block flushes: records the
 * transfer result in the buffer's up-to-date bit, then finishes the
 * flush for the cache entry stored in b_dev_id.
 */
void
nasd_od_k_blkio_done_async(
  struct buffer_head  *bh,
  int                  uptodate)
{
  nasd_odc_ent_t *owner = (nasd_odc_ent_t *)bh->b_dev_id;

  if (!uptodate) {
    clear_bit(BH_Uptodate, &bh->b_state);
  }
  else {
    set_bit(BH_Uptodate, &bh->b_state);
  }

  /* XXX */
  NASD_IO_TM_COMPLETE(owner);

  nasd_od_io_flush_block_async_finish(owner);
}

/*
 * b_end_io callback for synchronous block I/O. It only records the
 * transfer result in the buffer's up-to-date bit; the thread waiting on
 * the buffer does everything else.
 */
void
nasd_od_k_blkio_done(
  struct buffer_head  *bh,
  int                  uptodate)
{
  if (!uptodate) {
    clear_bit(BH_Uptodate, &bh->b_state);
    return;
  }

  set_bit(BH_Uptodate, &bh->b_state);
}

/*
 * Perform one block-layer transfer through an on-stack buffer head and
 * wait for it to complete.
 *
 * ent         - entry used for I/O timing and passed to the callback via
 *               b_dev_id; not guaranteed to be a "real" cache entry
 * sectno      - starting sector (used for both b_blocknr and b_rsector)
 * buf, len    - data buffer and transfer length in bytes
 * iodir       - NASD_U_READ or NASD_U_WRITE; anything else panics
 * finish_proc - b_end_io callback to install on the buffer head
 *
 * Returns NASD_SUCCESS when the buffer ends up marked up to date;
 * otherwise logs an error and panics (NASD_FAIL is returned after that).
 */
nasd_status_t
nasd_od_k_blkio(
  nasd_odc_ent_t   *ent,   /* not guaranteed to be "real" */
  nasd_blkno_t      sectno,
  void             *buf,
  int               len,
  int               iodir,
  void            (*finish_proc)(struct buffer_head *, int))
{
  struct buffer_head stack_bh;
  struct buffer_head *bh = &stack_bh;

  /* describe the transfer */
  bh->b_next = NULL;
  bh->b_blocknr = sectno;
  bh->b_size = len;
  bh->b_dev = nasd_od_linux_kdev;
  bh->b_rdev = nasd_od_linux_kdev;
  bh->b_rsector = sectno;
  bh->b_this_page = NULL;
  bh->b_state = 0;
  bh->b_next_free = NULL;
  bh->b_count = 1;

  bh->b_data = buf;
  bh->b_list = BUF_CLEAN;
  bh->b_flushtime = 0;

  init_waitqueue(&bh->b_wait);
  bh->b_pprev = NULL;
  bh->b_prev_free = NULL;
  bh->b_reqnext = NULL;

  bh->b_end_io = finish_proc;
  bh->b_dev_id = ent;

  nasd_od_io_sync_launch(sectno);

  NASD_IO_TM_LAUNCH(ent);

  switch(iodir) {
    case NASD_U_READ:
      ll_rw_block(READA, 1, &bh);
      break;
    case NASD_U_WRITE:
      ll_rw_block(WRITEA, 1, &bh);
      break;
    default:
      NASD_PANIC();
  }

  /* block until the transfer has finished */
  wait_on_buffer(bh);

  NASD_IO_TM_COMPLETE(ent);

  NASD_ASSERT(bh->b_count == 1);

  if (!test_bit(BH_Uptodate, &bh->b_state)) {
    nasd_printf("DRIVE ERROR: failed completing blkio\n");
    NASD_PANIC();
    return(NASD_FAIL);
  }

  return(NASD_SUCCESS);
}

/*
 * Synchronously write one cache block to its on-disk location through
 * nasd_od_k_blkio(). Any failure is fatal. On return the entry's sector
 * is recorded as the last completed sector.
 */
void
nasd_od_io_sys_flush_block(
  nasd_odc_ent_t  *ent)
{
  nasd_status_t status;

  NASD_IO_INC_SIZE_STAT(1,write);
  nasd_od_io_sync_launch(ent->real_sectno);
  NASD_IO_TM_LAUNCH(ent);

  status = nasd_od_k_blkio(ent, ent->real_sectno, ent->data.buf,
    NASD_OD_BASIC_BLOCKSIZE, NASD_U_WRITE, nasd_od_k_blkio_done);
  if (status != NASD_SUCCESS) {
    NASD_PANIC();
  }

  NASD_IO_TM_COMPLETE(ent);

  nasd_od_io_last_completed_sect = ent->real_sectno;
}

/*
 * Write one cache block through nasd_od_k_blkio() with the asynchronous
 * completion callback installed, so the flush is finished from
 * nasd_od_k_blkio_done_async(). Any failure is fatal.
 */
void
nasd_od_io_sys_flush_block_async(
  nasd_odc_ent_t  *ent)
{
  nasd_status_t status;

  NASD_IO_INC_SIZE_STAT(1,write);
  nasd_od_io_sync_launch(ent->real_sectno);
  NASD_IO_TM_LAUNCH(ent);

  status = nasd_od_k_blkio(ent, ent->real_sectno, ent->data.buf,
    NASD_OD_BASIC_BLOCKSIZE, NASD_U_WRITE, nasd_od_k_blkio_done_async);
  if (status != NASD_SUCCESS) {
    NASD_PANIC();
  }
}

/*
 * Synchronously write one sector's worth of disk header from "disk" to
 * sector "sectno". fake_ent is only used for I/O bookkeeping.
 * Returns the status of nasd_od_k_blkio().
 */
nasd_status_t
nasd_od_k_write_header(
  nasd_odc_ent_t  *fake_ent,
  nasd_blkno_t     sectno,
  nasd_od_disk_t  *disk)
{
  NASD_IO_INC_IO_STAT(header_write,write);

  return(nasd_od_k_blkio(fake_ent, sectno, (void *)disk,
    NASD_OD_SECT_SIZE, NASD_U_WRITE, nasd_od_k_blkio_done));
}

/*
 * Synchronously read one sector at "sectno" into "disk". A throwaway
 * on-stack entry is used for I/O timing. Any failure is fatal.
 */
void
nasd_od_io_read_header(
  nasd_blkno_t     sectno,
  nasd_od_disk_t  *disk)
{
  nasd_odc_ent_t fake_ent;
  nasd_status_t status;

  NASD_IO_TM_ENQ(&fake_ent);

  status = nasd_od_k_blkio(&fake_ent, sectno, (void *)disk,
    NASD_OD_SECT_SIZE, NASD_U_READ, nasd_od_k_blkio_done);
  if (status != NASD_SUCCESS) {
    NASD_PANIC();
  }

  NASD_IO_TM_DONE(&fake_ent);
}

/*
 * Call with diskstate locked
 */
/*
 * Write the in-core disk header to disk (ll_rw_block variant).
 *
 * force_sync - nonzero: write both the primary header and its duplicate.
 *              zero: write only one copy, chosen by which header block is
 *              closer to the last completed sector.
 *
 * Returns the status of the primary write if it failed, otherwise the
 * status of the duplicate write (NASD_SUCCESS for a copy that was not
 * written).
 */
nasd_status_t
nasd_od_write_diskstate(
  int  force_sync)
{
  nasd_sectno_t recent, dist_primary, dist_dup;
  nasd_status_t primary_rc, dup_rc;
  nasd_odc_ent_t fake_ent;
  int target;

  NASD_IO_TM_ENQ(&fake_ent);

  nasd_odc_state->disk->mod_time = nasd_odc_state->nvstate->mod_time;

  if (force_sync) {
    NASD_IO_INC_IO_STAT(header_force_sync,write);
  }

  /* pick a single copy (1 = primary, 2 = duplicate) when not forcing both */
  target = 0; /* shut up whiner compiler */
  if (!force_sync) {
    recent = nasd_od_io_last_completed_sect;
    if (recent >= nasd_diskheader_dup_blk) {
      target = 2;
    }
    else {
      NASD_ASSERT(nasd_diskheader_blk <= recent);
      NASD_ASSERT(nasd_diskheader_dup_blk > recent);
      dist_primary = recent - nasd_diskheader_blk;
      dist_dup = nasd_diskheader_dup_blk - recent;
      target = (dist_primary > dist_dup) ? 2 : 1;
    }
  }

  primary_rc = NASD_SUCCESS;
  if (force_sync || (target == 1)) {
    NASD_IO_INC_IO_STAT(header1_write,write);
    primary_rc = nasd_od_k_write_header(&fake_ent, nasd_diskheader_blk,
      &nasd_odc_state->disk);
    /* close out this timing record and open one for a possible second write */
    NASD_IO_TM_DONE(&fake_ent);
    NASD_IO_TM_ENQ(&fake_ent);
  }

  dup_rc = NASD_SUCCESS;
  if (force_sync || (target == 2)) {
    NASD_IO_INC_IO_STAT(header2_write,write);
    dup_rc = nasd_od_k_write_header(&fake_ent, nasd_diskheader_dup_blk,
      &nasd_odc_state->disk);
  }
  NASD_IO_TM_DONE(&fake_ent);

  if (primary_rc)
    return(primary_rc);
  return(dup_rc);
}

#else /* NASD_LINUX_USE_LLIO > 0 */

/*
 * This code is largely lifted from nasd_od_uio.c
 * It implements the drive I/O interface in terms
 * of syscalls using the __KERNEL_SYSCALLS__ stuff.
 * It's not nearly as efficient as the "right" thing
 * would be. See the comment at NASD_LINUX_USE_LLIO
 * for details.
 */

/*
 * Body of one syscall-path I/O worker thread.
 *
 * arg - thread index, passed as an integer cast to nasd_threadarg_t
 *
 * The thread elevates its credentials, opens its own descriptor on
 * nasd_k_devname (panicking if that fails), and then loops until the
 * I/O thread group is shut down:
 *   - take the first chain off nasd_k_pending, or sleep on nasd_k_q_cond
 *     when the list is empty;
 *   - build one iovec entry per chained cache entry and issue a single
 *     seek + readv/writev for the whole chain;
 *   - on a short or failed transfer, retry up to nasd_k_max_retries
 *     times, then panic;
 *   - clear NASD_CI_DISPATCH on every entry and report each one through
 *     nasd_od_io_iodone();
 *   - look for more work, first on nasd_k_pending and then via
 *     nasd_od_io_deq_next(); if there is none, give back the I/O slot by
 *     decrementing nasd_od_io_ios_outstanding.
 *
 * nasd_k_q_lock (NASD_IO_PEND_LOCK) is held at the top of the loop and at
 * the do_dispatch label, and is dropped around the actual I/O and the
 * completion callbacks.
 */
void
nasd_od_linuxio_proc(
  nasd_threadarg_t  arg)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  int j, wb, t, fd, rc, iodir, retry_count;
  struct iovec iov[NASD_IO_MAX_COALESCE];
  nasd_odc_ent_t *e, *dispatch, *next;
  loff_t want;

  NASD_OD_LINUX_SAVEPROC();

  t = (int)((u_long)arg);
  fd = open(nasd_k_devname, O_RDWR, 0);
  if (fd < 0) {
    NASD_PANIC();
  }
  NASD_IO_PEND_LOCK();
  NASD_THREADGROUP_RUNNING(&nasd_k_io_threadgroup)
  NASD_SIGNAL_COND(nasd_k_run_cond);
  while(!NASD_THREADGROUP_SHUTDOWNP(&nasd_k_io_threadgroup)) {
    if (nasd_k_pending.cnext != &nasd_k_pending) {
      /*
       * We have a pending I/O, dispatch it.
       */
      dispatch = nasd_k_pending.cnext;
      /* remove from pending queue */
      dispatch->cnext->cprev = dispatch->cprev;
      dispatch->cprev->cnext = dispatch->cnext;
      dispatch->cnext = dispatch->cprev = NULL;
do_dispatch:
      /* pending lock is held here */
      iodir = dispatch->iodir;
      NASD_IO_INC_STAT(pull_ios);

      /* one iovec slot per chained entry; wb is the total byte count */
      for(wb=j=0,e=dispatch;e;e=e->inext) {
        NASD_ASSERT(j<NASD_UIO_MAXIOV);
        NASD_ASSERT(j<NASD_IO_MAX_COALESCE);
        NASD_ASSERT(e->blkno <= nasd_od_blocks);
        NASD_ASSERT(e->iodir == iodir);
        NASD_ASSERT((e->io_flags&(NASD_CI_DISPATCH|NASD_CI_IOQ)) == NASD_CI_DISPATCH);
        iov[j].iov_base = (caddr_t)e->data.buf;
        iov[j].iov_len = NASD_OD_BASIC_BLOCKSIZE;
        j++;
        wb += NASD_OD_BASIC_BLOCKSIZE;
        NASD_IO_TM_LAUNCH(e);
      }
      if (iodir == NASD_U_READ) {
        NASD_IO_INC_SIZE_STAT(j,read);
      }
      else {
        NASD_IO_INC_SIZE_STAT(j,write);
      }
      NASD_IO_PEND_UNLOCK();
      /*
       * Actually do the I/O.
       */
      retry_count = 0;
      /* byte offset of the first block in the chain */
      want = ((loff_t)dispatch->real_sectno) << NASD_OD_SECT_SHIFT;
do_retry:
      do_lseek_fd(fd, want, SEEK_SET);
      switch(iodir) {
        case NASD_U_READ:
          NASD_IO_INC_STAT(num_io_reads);
          rc = readv(fd, iov, j);
          break;
        case NASD_U_WRITE:
          NASD_IO_INC_STAT(num_io_writes);
          rc = writev(fd, iov, j);
          break;
        default:
          NASD_PANIC();
      }
      if (rc > 0) {
        NASD_IO_LASTCOMP_ASSIGN(want + (loff_t)rc);
      }

      if (rc != wb) {
        nasd_printf("DRIVE UIO: got rc=%d wanted wb=%d\n", rc, wb);
        /*
         * want is a loff_t, so print it with the 64-bit format used for
         * loff_t elsewhere in this file; a plain %lu would misread the
         * arguments that follow wherever loff_t is wider than long.
         */
        nasd_printf("DRIVE UIO: want=%" NASD_64s_FMT
          " j=%d errno %d retry_count %d\n",
          want, j, errno, retry_count);
        retry_count++;
        if (retry_count <= nasd_k_max_retries) {
          NASD_IO_INC_STAT(retries);
          goto do_retry;
        }
        NASD_PANIC();
      }

      NASD_IO_LOCK();
      for(e=dispatch;e;e=e->inext) {
        e->io_flags &= ~NASD_CI_DISPATCH;
        NASD_IO_TM_COMPLETE(e);
      }
      NASD_IO_UNLOCK();
      /*
       * Announce completion of all I/Os in list
       */
      for(e=dispatch;e;e=next) {
        /* grab the successor first; the links are cleared before the callback */
        next = e->inext;
        e->inext = e->iprev = NULL;
        nasd_od_io_iodone(e);
      }
      /* look for more work: pending list first, then the I/O queue */
      NASD_IO_PEND_LOCK();
      if (nasd_k_pending.cnext != &nasd_k_pending) {
        dispatch = nasd_k_pending.cnext;
        dispatch->cnext->cprev = dispatch->cprev;
        dispatch->cprev->cnext = dispatch->cnext;
        dispatch->cnext = dispatch->cprev = NULL;
      }
      else {
        dispatch = NULL;
      }
      NASD_IO_PEND_UNLOCK();
      if (dispatch == NULL) {
        nasd_od_io_deq_next(&dispatch, 0);
      }
      NASD_IO_PEND_LOCK();
      if (dispatch) {
        /* We already have our next I/O in hand. Do it. */
        goto do_dispatch;
      }
      else {
        /* nothing left to do: release this I/O slot */
        nasd_od_io_ios_outstanding--;
      }
    }
    else {
      NASD_WAIT_COND(nasd_k_q_cond,nasd_k_q_lock);
    }
  }

  close(fd);

  NASD_IO_PEND_UNLOCK();

  NASD_OD_LINUX_RESTOREPROC();

  NASD_SIGNAL_COND(nasd_k_run_cond);
  NASD_THREADGROUP_DONE(&nasd_k_io_threadgroup)
}

/*
 * Shutdown hook: flush (if the file has a flush method) and release the
 * extra device file opened by nasd_od_io_init(), then forget the pointer.
 * Runs with elevated credentials and, for the flush/fput, under the big
 * kernel lock. The argument is unused.
 */
void
nasd_k_close_file(
  void  *ignored)
{
  NASD_OD_LINUX_SAVEPROC_DECL

  NASD_OD_LINUX_SAVEPROC();

  lock_kernel();

  NASD_ASSERT(nasd_k_extrafile != NULL);
  NASD_ASSERT(nasd_k_extrafile->f_count > 0);

  if ((nasd_k_extrafile->f_op != NULL)
    && (nasd_k_extrafile->f_op->flush != NULL))
  {
    nasd_k_extrafile->f_op->flush(nasd_k_extrafile);
  }

  /* drop our reference */
  fput(nasd_k_extrafile);

  unlock_kernel();

  nasd_k_extrafile = NULL;

  NASD_OD_LINUX_RESTOREPROC();
}

/*
 * Shutdown hook: destroy the I/O worker thread group. A failure is
 * reported with a warning and otherwise ignored. The argument is unused.
 */
void
nasd_k_shutdown_io_threadgroup(
  void  *ignored)
{
  nasd_status_t status;

  status = nasd_destroy_threadgroup(&nasd_k_io_threadgroup);
  if (!status)
    return;

  nasd_printf("DRIVE WARNING: got 0x%x (%s) destroying nasd_k_io_threadgroup\n",
    status, nasd_error_string(status));
}

/*
 * Stop all I/O worker threads and wait for them to exit.
 * The argument is unused.
 */
void
nasd_k_stop_iothreads(
  void  *ignored)
{
  NASD_THREADGROUP_INDICATE_SHUTDOWN(&nasd_k_io_threadgroup);
  /* wake every worker sleeping on the queue condition so it sees the flag */
  NASD_BROADCAST_COND(nasd_k_q_cond);
  NASD_THREADGROUP_WAIT_STOP(&nasd_k_io_threadgroup);
}

/*
 * Initialize condition variable _c_ and register it on the
 * nasd_odc_shutdown list. Expects a local "rc" in the enclosing function
 * and RETURNS FROM THAT FUNCTION with rc if either step fails.
 */
#define NASD_K_ICOND(_c_) { \
  rc = nasd_cond_init(_c_); \
  if (rc) { \
    return(rc); \
  } \
  rc = nasd_shutdown_cond(nasd_odc_shutdown, _c_); \
  if (rc) { \
    return(rc); \
  } \
}

/*
 * Initialize mutex _m_ and register it on the nasd_odc_shutdown list.
 * Expects a local "rc" in the enclosing function and RETURNS FROM THAT
 * FUNCTION with rc if either step fails.
 */
#define NASD_K_MUTEX(_m_) { \
  rc = nasd_mutex_init(_m_); \
  if (rc) { \
    return(rc); \
  } \
  rc = nasd_shutdown_mutex(nasd_odc_shutdown, _m_); \
  if (rc) { \
    return(rc); \
  } \
}

/*
 * Caller holds I/O lock here
 */
/*
 * Hand a chain of I/Os to the worker threads (syscall variant).
 *
 * If nasd_od_ioq_max_outstanding I/Os are already in flight, returns
 * NASD_IOSYS_FULL and queues nothing. Otherwise claims an I/O slot,
 * appends entlist to the tail of the nasd_k_pending list, wakes one
 * worker, and returns NASD_SUCCESS.
 */
nasd_status_t
nasd_od_io_launch(
  nasd_odc_ent_t  *entlist)
{
  nasd_odc_ent_t *tail;

  NASD_IO_PEND_LOCK();

  if (nasd_od_io_ios_outstanding < nasd_od_ioq_max_outstanding) {
    nasd_od_io_ios_outstanding++;

    /* link in just before the sentinel, i.e. at the tail */
    tail = nasd_k_pending.cprev;
    entlist->cprev = tail;
    entlist->cnext = &nasd_k_pending;
    tail->cnext = entlist;
    nasd_k_pending.cprev = entlist;

    NASD_IO_PEND_UNLOCK();

    NASD_SIGNAL_COND(nasd_k_q_cond);

    return(NASD_SUCCESS);
  }

  NASD_IO_PEND_UNLOCK();
  return(NASD_IOSYS_FULL);
}

/*
 * Synchronously write one cache block to its on-disk location through
 * the extra device file (syscall variant). The write runs with elevated
 * credentials and under the extra-file mutex; a short write trips an
 * assertion. After a successful write the last-completed sector is
 * updated to the end of the written range.
 */
void
nasd_od_io_sys_flush_block(
  nasd_odc_ent_t  *ent)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  loff_t byte_off;
  int nbytes;

  NASD_OD_LINUX_SAVEPROC();

  NASD_IO_EFD_LOCK();

  /* byte offset of the block on the device */
  byte_off = ((loff_t)ent->real_sectno) << NASD_OD_SECT_SHIFT;

  NASD_IO_INC_SIZE_STAT(1,write);
  nasd_od_io_sync_launch(ent->real_sectno);
  NASD_IO_TM_LAUNCH(ent);
  nbytes = do_write_pos(nasd_k_extrafile, ent->data.buf,
    NASD_OD_BASIC_BLOCKSIZE, byte_off);
  NASD_ASSERT(nbytes == NASD_OD_BASIC_BLOCKSIZE);
  NASD_IO_TM_COMPLETE(ent);

  NASD_IO_EFD_UNLOCK();

  NASD_OD_LINUX_RESTOREPROC();

  if (nbytes > 0) {
    NASD_IO_LASTCOMP_ASSIGN(byte_off + (loff_t)nbytes);
  }
}

/*
 * "Asynchronous" block flush for the syscall variant. There is no real
 * asynchrony: the block is written synchronously and the async
 * completion routine is then called directly.
 */
void
nasd_od_io_sys_flush_block_async(
  nasd_odc_ent_t  *ent)
{
  /*
   * Eventually, something clever could be done here.
   */
  nasd_od_io_sys_flush_block(ent);
  nasd_od_io_flush_block_async_finish(ent);
}

/*
 * Call with diskstate locked
 */
/*
 * Write the in-core disk header to disk (syscall variant).
 *
 * force_sync - nonzero: write both the primary header and its duplicate.
 *              zero: write only one copy, chosen by which header block is
 *              closer to the last completed sector.
 *
 * The header is first copied into the DMA-capable bounce buffer
 * nasd_od_linux_headerbuf and written from there through the extra
 * device file. Credentials are elevated exactly once for the whole
 * operation and restored exactly once at the end. A short or failed
 * write is fatal (panic), so the function only ever returns NASD_SUCCESS.
 */
nasd_status_t
nasd_od_write_diskstate(
  int  force_sync)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  nasd_sectno_t last_comp, diff1, diff2;
  nasd_odc_ent_t fake_ent;
  int ret, write_which;
  loff_t want;

  NASD_IO_TM_ENQ(&fake_ent);

  nasd_odc_state->disk->mod_time = nasd_odc_state->nvstate->mod_time;

  if (force_sync) {
    NASD_IO_INC_IO_STAT(header_force_sync,write);
  }

  /* pick a single copy (1 = primary, 2 = duplicate) when not forcing both */
  write_which = 0; /* shut up whiner compiler */
  if (force_sync == 0) {
    last_comp = nasd_od_io_last_completed_sect;
    if (last_comp >= nasd_diskheader_dup_blk) {
      write_which = 2;
    }
    else {
      NASD_ASSERT(nasd_diskheader_blk <= last_comp);
      NASD_ASSERT(nasd_diskheader_dup_blk > last_comp);
      diff1 = last_comp - nasd_diskheader_blk;
      diff2 = nasd_diskheader_dup_blk - last_comp;
      if (diff1 > diff2)
        write_which = 2;
      else
        write_which = 1;
    }
  }

  /*
   * Elevate once for both writes. SAVEPROC must not be invoked again
   * before the matching RESTOREPROC below: a nested save would overwrite
   * the saved uid/fs values with the elevated ones, and the final restore
   * would then leave this task running as uid 0 with KERNEL_DS.
   */
  NASD_OD_LINUX_SAVEPROC();

  NASD_IO_EFD_LOCK();

  /*
   * See comment before the declaration of nasd_od_linux_headerbuf
   * near the top of this file for an explanation.
   */
  bcopy((char *)&nasd_odc_state->disk, nasd_od_linux_headerbuf,
    sizeof(nasd_od_disk_t));

  if (force_sync || (write_which == 1)) {
    want = ((loff_t)nasd_diskheader_blk) << NASD_OD_SECT_SHIFT;
    NASD_IO_TM_LAUNCH(&fake_ent);
    nasd_od_io_sync_launch(nasd_diskheader_blk);
    ret = do_write_pos(nasd_k_extrafile, (void *)nasd_od_linux_headerbuf,
      NASD_OD_SECT_SIZE, want);
    if (ret != NASD_OD_SECT_SIZE) {
      nasd_printf("ret=%d wanted %d buf 0x%lx errno %d\n",
        ret, NASD_OD_SECT_SIZE,
        (unsigned long)nasd_od_linux_headerbuf, errno);
      NASD_PANIC();
    }
    NASD_IO_TM_COMPLETE(&fake_ent);

    /* when both copies are written, the second write sets the position */
    if ((force_sync == 0) && (ret > 0)) {
      NASD_IO_LASTCOMP_ASSIGN(want + (loff_t)ret);
    }

    NASD_IO_INC_IO_STAT(header_write,write);
    NASD_IO_INC_IO_STAT(header1_write,write);
  }

  if (force_sync || (write_which == 2)) {
    want = ((loff_t)nasd_diskheader_dup_blk) << NASD_OD_SECT_SHIFT;
    NASD_IO_TM_LAUNCH(&fake_ent);
    nasd_od_io_sync_launch(nasd_diskheader_dup_blk);
    /* credentials are already elevated by the SAVEPROC above */
    ret = do_write_pos(nasd_k_extrafile, (void *)nasd_od_linux_headerbuf,
      NASD_OD_SECT_SIZE, want);
    if (ret != NASD_OD_SECT_SIZE) {
      nasd_printf("ret=%d wanted %d buf 0x%lx\n",
        ret, NASD_OD_SECT_SIZE,
        (unsigned long)nasd_od_linux_headerbuf);
      NASD_PANIC();
    }
    NASD_IO_TM_COMPLETE(&fake_ent);

    if (ret > 0) {
      NASD_IO_LASTCOMP_ASSIGN(want + (loff_t)ret);
    }

    NASD_IO_INC_IO_STAT(header_write,write);
    NASD_IO_INC_IO_STAT(header2_write,write);
  }

  NASD_IO_EFD_UNLOCK();

  NASD_OD_LINUX_RESTOREPROC();

  NASD_IO_TM_DONE(&fake_ent);

  return(NASD_SUCCESS);
}

/*
 * Read the disk header stored at sector "sectno" into "disk" (syscall
 * variant). One sector is read through the extra device file into the
 * bounce buffer nasd_od_linux_headerbuf and then copied out. Runs with
 * elevated credentials and under the extra-file mutex; a short or failed
 * read is fatal.
 */
void
nasd_od_io_read_header(
  nasd_blkno_t     sectno,
  nasd_od_disk_t  *disk)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  nasd_odc_ent_t fake_ent;
  loff_t byte_off;
  int nbytes;

  NASD_IO_TM_ENQ(&fake_ent);

  NASD_OD_LINUX_SAVEPROC();

  NASD_IO_EFD_LOCK();

  /* byte offset of the header sector on the device */
  byte_off = ((loff_t)sectno) << NASD_OD_SECT_SHIFT;
  nasd_od_io_sync_launch(sectno);
  NASD_IO_TM_LAUNCH(&fake_ent);
  nbytes = do_read_pos(nasd_k_extrafile, (void *)nasd_od_linux_headerbuf,
    NASD_OD_SECT_SIZE, byte_off);
  if (nbytes != NASD_OD_SECT_SIZE) {
    NASD_PANIC();
  }

  /*
   * See comment before the declaration of nasd_od_linux_headerbuf
   * near the top of this file for an explanation.
   */
  bcopy((char *)nasd_od_linux_headerbuf, (char *)disk,
    sizeof(nasd_od_disk_t));
  NASD_IO_TM_COMPLETE(&fake_ent);

  if (nbytes > 0) {
    NASD_IO_LASTCOMP_ASSIGN(byte_off + (loff_t)nbytes);
  }
  NASD_IO_INC_IO_STAT(header_read,read);

  NASD_IO_EFD_UNLOCK();

  NASD_IO_TM_DONE(&fake_ent);

  NASD_OD_LINUX_RESTOREPROC();
}

#endif /* NASD_LINUX_USE_LLIO > 0 */

/*
 * Shutdown hook: free the header bounce buffer allocated in
 * nasd_od_io_init() and clear the pointer. The argument is unused.
 */
void
nasd_od_linux_shutdown_disk_headerbuf(
  void  *ignored)
{
  kfree(nasd_od_linux_headerbuf);
  nasd_od_linux_headerbuf = NULL;
}

/*
 * Initialize the drive I/O module.
 *
 * kdev    - device the drive operates on (stored in nasd_od_linux_kdev)
 * devname - path of that device; must fit in nasd_k_devname
 * config  - drive configuration; ios_outstanding sets the maximum number
 *           of concurrent I/Os and the number of worker threads
 *
 * Sets up the flush thread group and wait queue and the DMA-capable
 * header bounce buffer. In the syscall build it additionally opens the
 * extra device file, creates the queue locks/conditions, and starts one
 * worker thread per allowed outstanding I/O. Every resource is registered
 * on nasd_odc_shutdown as soon as it exists; if registration itself
 * fails, the resource is released immediately.
 *
 * Returns NASD_SUCCESS, NASD_BAD_IOQUEUE_LEN for ios_outstanding < 1,
 * NASD_NO_MEM if the bounce buffer cannot be allocated, NASD_FAIL if the
 * device name is too long or the device cannot be opened, or the status
 * of whichever setup call failed.
 */
nasd_status_t
nasd_od_io_init(
  kdev_t             kdev,
  char              *devname,
  nasd_od_config_t  *config)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  nasd_thread_t handle;
  nasd_status_t rc;
  int i;

  nasd_od_linux_kdev = kdev;

  nasd_od_ioq_max_outstanding = config->ios_outstanding;
  if (nasd_od_ioq_max_outstanding < 1)
    return(NASD_BAD_IOQUEUE_LEN);

  NASD_IO_MODULE_INIT();

  rc = nasd_init_threadgroup(&nasd_od_linux_flush_threadgroup);
  if (rc)
    return(rc);
  rc = nasd_shutdown_proc(nasd_odc_shutdown,
    nasd_od_linux_shutdown_flush_threadgroup, NULL);
  if (rc) {
    nasd_od_linux_shutdown_flush_threadgroup(NULL);
    return(rc);
  }

  init_waitqueue(&nasd_od_linux_flush_wq);

  nasd_od_linux_headerbuf = (nasd_od_disk_t *)kmalloc(
    sizeof(nasd_od_disk_t), GFP_KERNEL|GFP_DMA);
  if (nasd_od_linux_headerbuf == NULL) {
    return(NASD_NO_MEM);
  }
  rc = nasd_shutdown_proc(nasd_odc_shutdown,
    nasd_od_linux_shutdown_disk_headerbuf, NULL);
  if (rc) {
    nasd_od_linux_shutdown_disk_headerbuf(NULL);
    /* report the registration failure itself, as every other step does */
    return(rc);
  }

#if NASD_LINUX_USE_LLIO == 0

  /* empty pending list: the sentinel points at itself */
  bzero((char *)&nasd_k_pending, sizeof(nasd_k_pending));
  nasd_k_pending.cnext = nasd_k_pending.cprev = &nasd_k_pending;

  /* refuse a name that would overflow the fixed-size buffer */
  if (strlen(devname) >= sizeof(nasd_k_devname)) {
    return(NASD_FAIL);
  }
  strcpy(nasd_k_devname, devname);

  NASD_OD_LINUX_SAVEPROC();

  nasd_k_extrafile = filp_open(nasd_k_devname, O_RDWR, 0);

  /*
   * AFAICT, this is only used with posix locks and flock and such.
   * I think we'll be able to get away with not dealing with this.
   */
  nasd_k_extrafile_id = NULL;

  NASD_OD_LINUX_RESTOREPROC();

  if (nasd_k_extrafile == NULL) {
    return(NASD_FAIL);
  }
  if (IS_ERR(nasd_k_extrafile)) {
    /* don't leave an error-encoded pointer in the global */
    nasd_k_extrafile = NULL;
    return(NASD_FAIL);
  }

  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_k_close_file, NULL);
  if (rc) {
    nasd_k_close_file(NULL);
    return(rc);
  }

  /* these macros return from this function on failure */
  NASD_K_ICOND(&nasd_k_run_cond);
  NASD_K_ICOND(&nasd_k_q_cond);

  NASD_K_MUTEX(&nasd_k_q_lock);
  NASD_K_MUTEX(&nasd_k_extrafd_lock);

  rc = nasd_init_threadgroup(&nasd_k_io_threadgroup);
  if (rc)
    return(rc);
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_k_shutdown_io_threadgroup,
    NULL);
  if (rc) {
    nasd_k_shutdown_io_threadgroup(NULL);
    return(rc);
  }

  /* create I/O threads */
  for(i=0;i<nasd_od_ioq_max_outstanding;i++) {
    rc = nasd_thread_create_w_name(&handle, nasd_od_linuxio_proc,
      (nasd_threadarg_t)((u_long)i), "nasd_k_proc");
    if (rc) {
      /* let the threads created so far come up, then take them down */
      NASD_THREADGROUP_WAIT_START(&nasd_k_io_threadgroup);
      nasd_k_stop_iothreads(NULL);
      return(rc);
    }
    NASD_THREADGROUP_STARTED(&nasd_k_io_threadgroup);
  }

  /* wait for threads to start */
  NASD_THREADGROUP_WAIT_START(&nasd_k_io_threadgroup);

  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_k_stop_iothreads, NULL);
  if (rc) {
    nasd_k_stop_iothreads(NULL);
    return(rc);
  }

#endif /* NASD_LINUX_USE_LLIO == 0 */

  return(NASD_SUCCESS);
}

/*
 * Start the flush thread (nasd_od_linux_flush_proc), wait for it to
 * come up, and register nasd_od_linux_stop_flush_proc on
 * nasd_odc_shutdown. If the registration fails the thread is stopped
 * immediately. Returns NASD_SUCCESS or the status of the failing step.
 */
nasd_status_t
nasd_od_io_go()
{
  nasd_thread_t flush_thread;
  nasd_status_t status;

  status = nasd_thread_create_w_name(&flush_thread,
    nasd_od_linux_flush_proc, NULL, "nasd_k_flush_proc");
  if (status) {
    return(status);
  }

  /* account for the new thread, then block until it is running */
  NASD_THREADGROUP_STARTED(&nasd_od_linux_flush_threadgroup);
  NASD_THREADGROUP_WAIT_START(&nasd_od_linux_flush_threadgroup);

  status = nasd_shutdown_proc(nasd_odc_shutdown,
    nasd_od_linux_stop_flush_proc, NULL);
  if (!status) {
    return(NASD_SUCCESS);
  }

  /* could not register the stop routine, so run it now */
  nasd_od_linux_stop_flush_proc(NULL);
  return(status);
}

/*
 * Give back a cache page buffer. Buffers for which
 * NASD_OD_LINUX_ISEVIL_PAGE() is true are pushed onto the head of
 * nasd_od_linux_evil_pagelist for reuse; all others are released
 * with NASD_SYS_FREE_PGPWER().
 */
void
nasd_od_linux_cachepage_free(
  void  *buf)
{
  nasd_pagebuf_t *node;

  if (NASD_OD_LINUX_ISEVIL_PAGE(buf)) {
    /* the buffer itself serves as the list node */
    node = (nasd_pagebuf_t *)buf;
    node->next = nasd_od_linux_evil_pagelist;
    nasd_od_linux_evil_pagelist = node;
    return;
  }

  NASD_SYS_FREE_PGPWER(buf, NASD_OD_LINUX_PAGEBLOCK_ORD);
}

/*
 * Attach a zeroed page buffer to a cache entry.
 *
 * A buffer is taken from nasd_od_linux_evil_pagelist when one is
 * available. Otherwise up to five rounds of __get_free_pages() are
 * attempted; each round tries the masks TRYGFP0, TRYGFP1 and TRYGFP2
 * in turn, yielding before every attempt and pausing 10 milliseconds
 * before the second and third.
 *
 * Returns NASD_SUCCESS with ent->data.buf set, or NASD_NO_MEM.
 */
nasd_status_t
nasd_odc_io_alloc_page(
  nasd_odc_ent_t  *ent)
{
#define TRYGFP0 (__GFP_LOW | __GFP_WAIT | __GFP_IO)
#define TRYGFP1 GFP_KERNEL
#define TRYGFP2 GFP_NFS

  int gfp_order[3] = { TRYGFP0, TRYGFP1, TRYGFP2 };
  nasd_delaycounter_t delayer;
  nasd_pagebuf_t *recycled;
  int round, step;
  void *buf;

  buf = NULL;

  /* prefer a recycled buffer over a fresh allocation */
  recycled = nasd_od_linux_evil_pagelist;
  if (recycled) {
    nasd_od_linux_evil_pagelist = recycled->next;
    buf = recycled->buf;
    NASD_ASSERT(((void *)buf) == ((void *)recycled));
  }

  for (round = 0; (round < 5) && (buf == NULL); round++) {
    for (step = 0; (step < 3) && (buf == NULL); step++) {
      if (step > 0) {
        /* previous mask failed: back off before the next one */
        nasd_sys_thread_yield();
        NASD_BEGIN_DELAYCNT(&delayer);
        NASD_DELAY_FROM(&delayer,10000); /* 10 milliseconds */
      }
      nasd_sys_thread_yield();
      buf = (void *) __get_free_pages(gfp_order[step],
        NASD_OD_LINUX_PAGEBLOCK_ORD);
    }
  }

  if (buf == NULL) {
    return(NASD_NO_MEM);
  }

  bzero(buf, NASD_OD_BASIC_BLOCKSIZE);
  ent->data.buf = buf;

#if NASD_LINUX_USE_LLIO > 0
  NASD_Malloc(ent->rbh, sizeof(struct buffer_head), (struct buffer_head *));
  if (ent->rbh == NULL) {
    /* undo the page assignment so the entry is left empty */
    ent->data.buf = NULL;
    nasd_od_linux_cachepage_free(buf);
    return(NASD_NO_MEM);
  }
#else /* NASD_LINUX_USE_LLIO > 0 */
  ent->rbh = NULL;
#endif /* NASD_LINUX_USE_LLIO > 0 */

  return(NASD_SUCCESS);
}

/*
 * Detach and free the page buffer of a cache entry. Does nothing
 * when the entry has no buffer. With NASD_LINUX_USE_LLIO > 0 the
 * entry's buffer_head is freed as well; otherwise it is asserted
 * to be NULL.
 */
void
nasd_odc_io_release_page(
  nasd_odc_ent_t  *ent)
{
  if (ent->data.buf != NULL) {
    nasd_od_linux_cachepage_free(ent->data.buf);
    ent->data.buf = NULL;

#if NASD_LINUX_USE_LLIO > 0
    NASD_Free(ent->rbh, sizeof(struct buffer_head));
    ent->rbh = NULL;
#else /* NASD_LINUX_USE_LLIO > 0 */
    NASD_ASSERT(ent->rbh == NULL);
#endif /* NASD_LINUX_USE_LLIO > 0 */
  }
}

/*
 * Remote shutdown is not implemented in this module: the flags are
 * ignored and NASD_OP_NOT_SUPPORTED is always returned.
 */
nasd_status_t
nasd_od_sys_rshutdown(
  nasd_drive_rshutdown_flags_t  flags)
{
  nasd_status_t unsupported = NASD_OP_NOT_SUPPORTED;

  return(unsupported);
}

/*
 * Cleanup routine that destroys a threadgroup. arg is the
 * nasd_threadgroup_t to destroy; a failure from
 * nasd_destroy_threadgroup() is logged and otherwise ignored.
 */
void
nasd_od_linux_killhandlergroup(
  void  *arg)
{
  nasd_threadgroup_t *tg = (nasd_threadgroup_t *)arg;
  nasd_status_t status;

  status = nasd_destroy_threadgroup(tg);
  if (!status) {
    return;
  }

  nasd_printf("DRIVE: got 0x%x (%s) destroying thread group 0x%lx\n",
    status, nasd_error_string(status), (unsigned long)tg);
}

/*
 * Wait on the given threadgroup via NASD_THREADGROUP_WAIT_STOP().
 * nasd_linux_psrv_go() uses this to wait for the handler thread
 * after stopping the RPC subsystem.
 */
void
nasd_od_linux_wait_handler(
  nasd_threadgroup_t  *group)
{
  NASD_THREADGROUP_WAIT_STOP(group);
}

/*
 * Body of the RPC handler thread created by nasd_linux_psrv_go().
 *
 * arg is the nasd_svinfo_t describing the server. The thread marks
 * itself running in nasd_od_linux_handler_group, starts the RPC
 * subsystem, optionally adjusts the RPC stack size, and then calls
 * nasd_drive_rpc_listen(). When startup fails or the listener
 * returns, it marks itself done, wakes nasd_od_linux_wq and exits.
 */
void
nasd_psrv_thread(
  nasd_threadarg_t  arg)
{
  nasd_svinfo_t *info;
  struct nasd_serv *srv;
  nasd_status_t status;

  NASD_THREADGROUP_RUNNING(&nasd_od_linux_handler_group);

  info = (nasd_svinfo_t *)arg;
  srv = &info->srv;

#if NASD_RPC_PACKAGE == NASD_RPC_PACKAGE_DCE
  if (!srv->use_tcp) {
    nasd_drive_dce_setup_udp();
    nasd_printf("DRIVE: using DCE-UDP\n");
  }
  else {
    nasd_drive_dce_setup_tcp();
    nasd_printf("DRIVE: using DCE-TCP\n");
  }
#endif /* NASD_RPC_PACKAGE == NASD_RPC_PACKAGE_DCE */

  if (srv->verbose) {
    nasd_printf("DRIVE: startup RPC\n");
  }

  status = nasd_drive_startup_rpc();
  if (status) {
    /* without RPC there is nothing to serve; fall through to exit */
    nasd_printf("DRIVE: got 0x%x (%s) from nasd_startup_rpc()\n",
      status, nasd_error_string(status));
  }
  else {
    if (srv->stack_size) {
      /* a stack size problem is reported but does not stop the server */
      status = nasd_drive_rpc_set_stacksize(srv->stack_size);
      if (status == NASD_OP_NOT_SUPPORTED) {
        nasd_printf("DRIVE: RPC package does not support stack size "
          "adjustment\n");
      }
      else if (status) {
        nasd_printf("DRIVE: got 0x%x (%s) setting RPC stack size to %d\n",
          status, nasd_error_string(status), srv->stack_size);
      }
      else if (srv->verbose) {
        nasd_printf("DRIVE: set RPC stacksize to %d\n", srv->stack_size);
      }
    }

    status = nasd_drive_rpc_listen(srv->svc_threads, srv->ipport);

    if (status || srv->verbose) {
      nasd_printf("DRIVE: RPC listener returns 0x%x (%s)\n",
        status, nasd_error_string(status));
    }
  }

  NASD_THREADGROUP_DONE(&nasd_od_linux_handler_group);
  wake_up(&nasd_od_linux_wq);
  NASD_THREAD_KILL_SELF();
}

/*
 * Bring up the drive, run it until the handler thread finishes, and
 * tear it down again.
 *
 * Call with NASD_D_LOCK held, consumes the lock.
 *
 * svinfo  server description; svinfo->srv supplies the device,
 *         root_path, cache size and other settings, and svinfo->size
 *         is filled in with the device size in use
 * arg     user-level address of the same nasd_serv structure; must
 *         be non-NULL
 *
 * Sequence: validate cache_blocks, query the device size with
 * open/ioctl(BLKGETSIZE)/close, then nasd_basic_init(),
 * nasd_od_io_init(), nasd_setup_disk(), nasd_od_io_go(), create the
 * handler threadgroup and thread, release the lock and sleep on
 * nasd_od_linux_wq. After wakeup the RPC subsystem is stopped, the
 * handler thread is waited for and nasd_basic_shutdown() is run.
 *
 * Any failure after nasd_basic_init() succeeds is followed by
 * nasd_basic_shutdown(); a failing nasd_basic_shutdown() panics.
 *
 * Returns 0 after a clean run, otherwise a negated errno value.
 */
int
nasd_linux_psrv_go(
  nasd_svinfo_t  *svinfo,
  void           *arg)
{
  NASD_OD_LINUX_SAVEPROC_DECL
  struct nasd_serv *nasdsrv, *nasdsrv_arg;
  nasd_thread_t nasd_k_handler_thread;
  int error, lockheld, fd, saved_errno;
  unsigned long size;
  nasd_status_t rc;
  kdev_t kdev;
  dev_t dev;

  NASD_ASSERT(svinfo != NULL);
  NASD_ASSERT(arg != NULL);

  lockheld = 1;
  size = 0;
  error = 0;

  /*
   * nasdsrv_arg is nasdsrv in the user-level.
   * NOTE(review): it was meant for the open syscall, but the code
   * below opens nasdsrv->root_path and never reads nasdsrv_arg.
   */

  nasd_linux_drive_module_start();

  nasdsrv = &svinfo->srv;
  nasdsrv_arg = arg;
  dev = (dev_t)nasdsrv->dev;
  kdev = to_kdev_t(dev);

  nasd_odc_force_format = nasdsrv->do_format;

  if ((nasdsrv->cache_blocks < 10) || (nasdsrv->cache_blocks > 262144)) {
    nasd_printf("DRIVE: bad cache_blocks value %d\n", nasdsrv->cache_blocks);
    error = EINVAL;
    goto done;
  }
  nasd_odc_size = nasdsrv->cache_blocks;

  init_waitqueue(&nasd_od_linux_wq);

  /*
   * Check device exists
   * Load device blocks into size
   *
   * This uses the open/ioctl/close syscall wrappers from kernel
   * context, bracketed by NASD_OD_LINUX_SAVEPROC/RESTOREPROC.
   */

  NASD_OD_LINUX_SAVEPROC();

  fd = open(nasdsrv->root_path, O_RDWR, 0);

  if (fd < 0) {
    /* capture errno first, and undo SAVEPROC on this exit path too */
    saved_errno = errno;
    NASD_OD_LINUX_RESTOREPROC();
    nasd_printf("DRIVE: could not open %s (%d=%u,%d)\n",
      nasdsrv->root_path, fd, (unsigned int)fd, saved_errno);
    /* never report success for a failed open */
    error = saved_errno ? saved_errno : EIO;
    goto done;
  }

  error = ioctl(fd, BLKGETSIZE, (unsigned long)&size);
  /* close() below may overwrite errno, so capture it now */
  saved_errno = errno;

  close(fd);

  NASD_OD_LINUX_RESTOREPROC();

  if (error < 0) {
    nasd_printf("DRIVE: could not get blocks for %s (%d,%d)\n",
      nasdsrv->root_path, fd, saved_errno);
    /*
     * Stop here so the caller sees the real cause instead of the
     * ENOSPC the zero-size check below would substitute.
     */
    error = saved_errno ? saved_errno : EIO;
    goto done;
  }
  /* from here on, error is only set on failure paths */
  error = 0;

  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: %lu sectors\n", size);
  }

  /* a non-zero max_disk_len caps the size that will be used */
  if (nasdsrv->max_disk_len) {
    svinfo->size = NASD_MIN(size, nasdsrv->max_disk_len);
  }
  else {
    svinfo->size = size;
  }

  if (size == 0) {
    nasd_printf("DRIVE: invalid or zero-length device\n");
    error = ENOSPC;
    goto done;
  }

  rc = nasd_basic_init();
  if (rc) {
    nasd_printf("DRIVE: failed nasd_basic_init(), error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    goto done;
  }

  rc = nasd_od_io_init(kdev, nasdsrv->root_path, &nasdsrv->config);
  if (rc) {
    nasd_printf("DRIVE: failed nasd_od_io_init(), error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }

  rc = nasd_setup_disk(svinfo->size, nasdsrv->dev, &nasdsrv->config);
  if (rc) {
    nasd_printf("DRIVE: failed nasd_setup_disk(), error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }

  rc = nasd_od_io_go();
  if (rc) {
    nasd_printf("DRIVE: failed nasd_od_io_go(), error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }

  rc = nasd_init_threadgroup(&nasd_od_linux_handler_group);
  if (rc) {
    nasd_printf("DRIVE: failed initializing handler group, error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_od_linux_killhandlergroup,
    &nasd_od_linux_handler_group);
  if (rc) {
    /* registration failed, so destroy the group by hand */
    nasd_od_linux_killhandlergroup(&nasd_od_linux_handler_group);
    nasd_printf("DRIVE: failed setting shutdown for handler group,"
      " error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }

  rc = nasd_thread_create_w_name(&nasd_k_handler_thread,
    (void (*)())nasd_psrv_thread, (void *)svinfo, "nasd_k_handler");
  if (rc) {
    nasd_printf("DRIVE: failed creating handler thread, error=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    error = nasd_linux_nasd_status_to_errno(rc);
    rc = nasd_basic_shutdown();
    if (rc) {
      nasd_printf("DRIVE: failed nasd_basic_shutdown(), error=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      NASD_PANIC();
    }
    goto done;
  }
  NASD_THREADGROUP_STARTED(&nasd_od_linux_handler_group);
  NASD_THREADGROUP_WAIT_START(&nasd_od_linux_handler_group);
  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: handler thread running id %" NASD_THREAD_ID_FMT "\n",
      nasd_thread_self());
  }

  /* the drive is up: give the lock back before going to sleep */
  NASD_ASSERT(lockheld == 1);
  NASD_D_UNLOCK();
  lockheld = 0;

  /* nasd_psrv_thread() wakes this queue when it is about to exit */
  interruptible_sleep_on(&nasd_od_linux_wq);

  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: stop RPC subsystem\n");
  }

  nasd_drive_stop_rpc();

  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: waiting for handler thread\n");
  }

  /* wait to be done */
  nasd_od_linux_wait_handler(&nasd_od_linux_handler_group);

  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: handler thread no longer running\n");
  }

  if (nasdsrv->verbose) {
    nasd_printf("DRIVE: sequence shutdown\n");
  }

  rc = nasd_basic_shutdown();
  if (rc) {
    nasd_printf("DRIVE: nasd_basic_shutdown() returned 0x%x (%s)\n",
      rc, nasd_error_string(rc));
    NASD_PANIC();
  }

done:
  /* error exits taken before the drive came up still hold the lock */
  if (lockheld) {
    NASD_D_UNLOCK();
  }

  nasd_linux_drive_module_stop();

  return(-error);
}

/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
