5045-use-atomic_inc_*-atomic_dec_*-instead-of-atomic_add_* Wdiff usr/src/uts/common/fs/ufs/ufs_directio.c

Print this page

5045 use atomic_{inc,dec}_* instead of atomic_add_*

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/ufs/ufs_directio.c
          +++ new/usr/src/uts/common/fs/ufs/ufs_directio.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27   27  /* All Rights Reserved */
  28   28  
  29   29  /*
  30   30   * Portions of this source code were derived from Berkeley 4.3 BSD
  31   31   * under license from the Regents of the University of California.
  32   32   */
  33   33  
  34   34  #include <sys/types.h>
  35   35  #include <sys/t_lock.h>
  36   36  #include <sys/param.h>
  37   37  #include <sys/time.h>
  38   38  #include <sys/systm.h>
  39   39  #include <sys/sysmacros.h>
  40   40  #include <sys/resource.h>
  41   41  #include <sys/signal.h>
  42   42  #include <sys/cred.h>
  43   43  #include <sys/user.h>
  44   44  #include <sys/buf.h>
  45   45  #include <sys/vfs.h>
  46   46  #include <sys/vnode.h>
  47   47  #include <sys/proc.h>
  48   48  #include <sys/disp.h>
  49   49  #include <sys/file.h>
  50   50  #include <sys/fcntl.h>
  51   51  #include <sys/flock.h>
  52   52  #include <sys/kmem.h>
  53   53  #include <sys/uio.h>
  54   54  #include <sys/dnlc.h>
  55   55  #include <sys/conf.h>
  56   56  #include <sys/mman.h>
  57   57  #include <sys/pathname.h>
  58   58  #include <sys/debug.h>
  59   59  #include <sys/vmsystm.h>
  60   60  #include <sys/cmn_err.h>
  61   61  #include <sys/filio.h>
  62   62  #include <sys/atomic.h>
  63   63  
  64   64  #include <sys/fssnap_if.h>
  65   65  #include <sys/fs/ufs_fs.h>
  66   66  #include <sys/fs/ufs_lockfs.h>
  67   67  #include <sys/fs/ufs_filio.h>
  68   68  #include <sys/fs/ufs_inode.h>
  69   69  #include <sys/fs/ufs_fsdir.h>
  70   70  #include <sys/fs/ufs_quota.h>
  71   71  #include <sys/fs/ufs_trans.h>
  72   72  #include <sys/fs/ufs_panic.h>
  73   73  #include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
  74   74  #include <sys/errno.h>
  75   75  
  76   76  #include <sys/filio.h>          /* _FIOIO */
  77   77  
  78   78  #include <vm/hat.h>
  79   79  #include <vm/page.h>
  80   80  #include <vm/pvn.h>
  81   81  #include <vm/as.h>
  82   82  #include <vm/seg.h>
  83   83  #include <vm/seg_map.h>
  84   84  #include <vm/seg_vn.h>
  85   85  #include <vm/seg_kmem.h>
  86   86  #include <vm/rm.h>
  87   87  #include <sys/swap.h>
  88   88  #include <sys/epm.h>
  89   89  
  90   90  #include <fs/fs_subr.h>
  91   91  
  /*
   * Private buffer of zeroes and its length in bytes.  The buffer itself
   * is allocated by ufs_directio_init().
   */
  static void     *ufs_directio_zero_buf;
  static int      ufs_directio_zero_len = 8192;

  /*
   * Master switch for the feature: non-zero means direct I/O is enabled.
   */
  int     ufs_directio_enabled = 1;
  96   96  
  97   97  /*
  98   98   * for kstats reader
  99   99   */
 100  100  struct ufs_directio_kstats {
 101  101          kstat_named_t   logical_reads;
 102  102          kstat_named_t   phys_reads;
 103  103          kstat_named_t   hole_reads;
 104  104          kstat_named_t   nread;
 105  105          kstat_named_t   logical_writes;
 106  106          kstat_named_t   phys_writes;
 107  107          kstat_named_t   nwritten;
 108  108          kstat_named_t   nflushes;
 109  109  } ufs_directio_kstats = {
 110  110          { "logical_reads",      KSTAT_DATA_UINT64 },
 111  111          { "phys_reads",         KSTAT_DATA_UINT64 },
 112  112          { "hole_reads",         KSTAT_DATA_UINT64 },
 113  113          { "nread",              KSTAT_DATA_UINT64 },
 114  114          { "logical_writes",     KSTAT_DATA_UINT64 },
 115  115          { "phys_writes",        KSTAT_DATA_UINT64 },
 116  116          { "nwritten",           KSTAT_DATA_UINT64 },
 117  117          { "nflushes",           KSTAT_DATA_UINT64 },
 118  118  };
 119  119  
 120  120  kstat_t *ufs_directio_kstatsp;
 121  121  
 122  122  /*
 123  123   * use kmem_cache_create for direct-physio buffers. This has shown
 124  124   * a better cache distribution compared to buffers on the
 125  125   * stack. It also avoids semaphore construction/deconstruction
 126  126   * per request
 127  127   */
 128  128  struct directio_buf {
 129  129          struct directio_buf     *next;
 130  130          char            *addr;
 131  131          size_t          nbytes;
 132  132          struct buf      buf;
 133  133  };
 134  134  static struct kmem_cache *directio_buf_cache;
 135  135  
 136  136  
 137  137  /* ARGSUSED */
 138  138  static int
 139  139  directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
 140  140  {
 141  141          bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
 142  142          return (0);
 143  143  }
 144  144  
 145  145  /* ARGSUSED */
 146  146  static void
 147  147  directio_buf_destructor(void *dbp, void *cdrarg)
 148  148  {
 149  149          biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
 150  150  }
 151  151  
 152  152  void
 153  153  directio_bufs_init(void)
 154  154  {
 155  155          directio_buf_cache = kmem_cache_create("directio_buf_cache",
 156  156              sizeof (struct directio_buf), 0,
 157  157              directio_buf_constructor, directio_buf_destructor,
 158  158              NULL, NULL, NULL, 0);
 159  159  }
 160  160  
 161  161  void
 162  162  ufs_directio_init(void)
 163  163  {
 164  164          /*
 165  165           * kstats
 166  166           */
 167  167          ufs_directio_kstatsp = kstat_create("ufs", 0,
 168  168              "directio", "ufs", KSTAT_TYPE_NAMED,
 169  169              sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
 170  170              KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
 171  171          if (ufs_directio_kstatsp) {
 172  172                  ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
 173  173                  kstat_install(ufs_directio_kstatsp);
 174  174          }
 175  175          /*
 176  176           * kzero is broken so we have to use a private buf of zeroes
 177  177           */
 178  178          ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
 179  179          directio_bufs_init();
 180  180  }
 181  181  
 182  182  /*
 183  183   * Wait for the first direct IO operation to finish
 184  184   */
 185  185  static int
 186  186  directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
 187  187  {
 188  188          buf_t   *bp;
 189  189          int     error;
 190  190  
 191  191          /*
 192  192           * Wait for IO to finish
 193  193           */
 194  194          bp = &dbp->buf;
 195  195          error = biowait(bp);
 196  196  
 197  197          /*
 198  198           * bytes_io will be used to figure out a resid
 199  199           * for the caller. The resid is approximated by reporting
 200  200           * the bytes following the first failed IO as the residual.
 201  201           *
 202  202           * I am cautious about using b_resid because I
 203  203           * am not sure how well the disk drivers maintain it.
 204  204           */
 205  205          if (error)
 206  206                  if (bp->b_resid)
 207  207                          *bytes_iop = bp->b_bcount - bp->b_resid;
 208  208                  else
 209  209                          *bytes_iop = 0;
 210  210          else
 211  211                  *bytes_iop += bp->b_bcount;
 212  212          /*
 213  213           * Release direct IO resources
 214  214           */
 215  215          bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
 216  216          kmem_cache_free(directio_buf_cache, dbp);
 217  217          return (error);
 218  218  }
 219  219  
 220  220  /*
 221  221   * Wait for all of the direct IO operations to finish
 222  222   */
 223  223  
 224  224  uint32_t        ufs_directio_drop_kpri = 0;     /* enable kpri hack */
 225  225  
 226  226  static int
 227  227  directio_wait(struct directio_buf *tail, long *bytes_iop)
 228  228  {
 229  229          int     error = 0, newerror;
 230  230          struct directio_buf     *dbp;
 231  231          uint_t  kpri_req_save;
 232  232  
 233  233          /*
 234  234           * The linked list of directio buf structures is maintained
 235  235           * in reverse order (tail->last request->penultimate request->...)
 236  236           */
 237  237          /*
 238  238           * This is the k_pri_req hack. Large numbers of threads
 239  239           * sleeping with kernel priority will cause scheduler thrashing
 240  240           * on an MP machine. This can be seen running Oracle using
 241  241           * directio to ufs files. Sleep at normal priority here to
 242  242           * more closely mimic physio to a device partition. This
 243  243           * workaround is disabled by default as a niced thread could
 244  244           * be starved from running while holding i_rwlock and i_contents.
 245  245           */
 246  246          if (ufs_directio_drop_kpri) {
 247  247                  kpri_req_save = curthread->t_kpri_req;
 248  248                  curthread->t_kpri_req = 0;
 249  249          }
 250  250          while ((dbp = tail) != NULL) {
 251  251                  tail = dbp->next;
 252  252                  newerror = directio_wait_one(dbp, bytes_iop);
 253  253                  if (error == 0)
 254  254                          error = newerror;
 255  255          }
 256  256          if (ufs_directio_drop_kpri)
 257  257                  curthread->t_kpri_req = kpri_req_save;
 258  258          return (error);
 259  259  }
 260  260  /*
 261  261   * Initiate direct IO request
 262  262   */
 263  263  static void
 264  264  directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
 265  265          offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
 266  266          struct directio_buf **tailp, page_t **pplist)
 267  267  {
 268  268          buf_t *bp;
 269  269          struct directio_buf *dbp;
 270  270  
 271  271          /*
 272  272           * Allocate a directio buf header
 273  273           *   Note - list is maintained in reverse order.
 274  274           *   directio_wait_one() depends on this fact when
 275  275           *   adjusting the ``bytes_io'' param. bytes_io
 276  276           *   is used to compute a residual in the case of error.
 277  277           */
 278  278          dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
 279  279          dbp->next = *tailp;
 280  280          *tailp = dbp;
 281  281  
 282  282          /*
 283  283           * Initialize buf header
 284  284           */
 285  285          dbp->addr = addr;
 286  286          dbp->nbytes = nbytes;
 287  287          bp = &dbp->buf;
 288  288          bp->b_edev = ip->i_dev;
 289  289          bp->b_lblkno = btodt(offset);
 290  290          bp->b_bcount = nbytes;
 291  291          bp->b_un.b_addr = addr;
 292  292          bp->b_proc = procp;
 293  293          bp->b_file = ip->i_vnode;
 294  294  
 295  295          /*
 296  296           * Note that S_WRITE implies B_READ and vice versa: a read(2)
 297  297           * will B_READ data from the filesystem and S_WRITE it into
 298  298           * the user's buffer; a write(2) will S_READ data from the
 299  299           * user's buffer and B_WRITE it to the filesystem.
 300  300           */
 301  301          if (rw == S_WRITE) {
 302  302                  bp->b_flags = B_BUSY | B_PHYS | B_READ;
 303  303                  ufs_directio_kstats.phys_reads.value.ui64++;
 304  304                  ufs_directio_kstats.nread.value.ui64 += nbytes;
 305  305          } else {
 306  306                  bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
 307  307                  ufs_directio_kstats.phys_writes.value.ui64++;
 308  308                  ufs_directio_kstats.nwritten.value.ui64 += nbytes;
 309  309          }
 310  310          bp->b_shadow = pplist;
 311  311          if (pplist != NULL)
 312  312                  bp->b_flags |= B_SHADOW;
 313  313  
 314  314          /*
 315  315           * Issue I/O request.
 316  316           */
 317  317          ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
 318  318          if (ufsvfsp->vfs_snapshot)
 319  319                  fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
 320  320          else
 321  321                  (void) bdev_strategy(bp);
 322  322  
 323  323          if (rw == S_WRITE)
 324  324                  lwp_stat_update(LWP_STAT_OUBLK, 1);
 325  325          else
 326  326                  lwp_stat_update(LWP_STAT_INBLK, 1);
 327  327  
 328  328  }
 329  329  
 /*
  * Direct write counters.
  */

 /* writes done with the contents lock held shared */
 uint32_t        ufs_shared_writes;

 /* number of shared direct writes currently in progress */
 uint32_t        ufs_cur_writes;

 /* high water mark of ufs_cur_writes */
 uint32_t        ufs_maxcur_writes;

 /* writes done with the contents lock held exclusive */
 uint32_t        ufs_posix_hits;

 /*
  * Force POSIX synchronous data integrity on all writes for testing.
  */
 uint32_t        ufs_force_posix_sdi = 0;
 339  339  
 340  340  /*
 341  341   * Direct Write
 342  342   */
 343  343  
 344  344  int
 345  345  ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
 346  346          cred_t *cr, int *statusp)
 347  347  {
 348  348          long            resid, bytes_written;
 349  349          u_offset_t      size, uoff;
 350  350          uio_t           *uio = arg_uio;
 351  351          rlim64_t        limit = uio->uio_llimit;
 352  352          int             on, n, error, newerror, len, has_holes;
 353  353          daddr_t         bn;
 354  354          size_t          nbytes;
 355  355          struct fs       *fs;
 356  356          vnode_t         *vp;
 357  357          iovec_t         *iov;
 358  358          struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
 359  359          struct proc     *procp;
 360  360          struct as       *as;
 361  361          struct directio_buf     *tail;
 362  362          int             exclusive, ncur, bmap_peek;
 363  363          uio_t           copy_uio;
 364  364          iovec_t         copy_iov;
 365  365          char            *copy_base;
 366  366          long            copy_resid;
 367  367  
 368  368          /*
 369  369           * assume that directio isn't possible (normal case)
 370  370           */
 371  371          *statusp = DIRECTIO_FAILURE;
 372  372  
 373  373          /*
 374  374           * Don't go direct
 375  375           */
 376  376          if (ufs_directio_enabled == 0)
 377  377                  return (0);
 378  378  
 379  379          /*
 380  380           * mapped file; nevermind
 381  381           */
 382  382          if (ip->i_mapcnt)
 383  383                  return (0);
 384  384  
 385  385          /*
 386  386           * CAN WE DO DIRECT IO?
 387  387           */
 388  388          uoff = uio->uio_loffset;
 389  389          resid = uio->uio_resid;
 390  390  
 391  391          /*
 392  392           * beyond limit
 393  393           */
 394  394          if (uoff + resid > limit)
 395  395                  return (0);
 396  396  
 397  397          /*
 398  398           * must be sector aligned
 399  399           */
 400  400          if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
 401  401                  return (0);
 402  402  
 403  403          /*
 404  404           * SHOULD WE DO DIRECT IO?
 405  405           */
 406  406          size = ip->i_size;
 407  407          has_holes = -1;
 408  408  
 409  409          /*
 410  410           * only on regular files; no metadata
 411  411           */
 412  412          if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
 413  413                  return (0);
 414  414  
 415  415          /*
 416  416           * Synchronous, allocating writes run very slow in Direct-Mode
 417  417           *      XXX - can be fixed with bmap_write changes for large writes!!!
 418  418           *      XXX - can be fixed for updates to "almost-full" files
 419  419           *      XXX - WARNING - system hangs if bmap_write() has to
 420  420           *                      allocate lots of pages since pageout
 421  421           *                      suspends on locked inode
 422  422           */
 423  423          if (!rewrite && (ip->i_flag & ISYNC)) {
 424  424                  if ((uoff + resid) > size)
 425  425                          return (0);
 426  426                  has_holes = bmap_has_holes(ip);
 427  427                  if (has_holes)
 428  428                          return (0);
 429  429          }
 430  430  
 431  431          /*
 432  432           * Each iovec must be short aligned and sector aligned.  If
 433  433           * one is not, then kmem_alloc a new buffer and copy all of
 434  434           * the smaller buffers into the new buffer.  This new
 435  435           * buffer will be short aligned and sector aligned.
 436  436           */
 437  437          iov = uio->uio_iov;
 438  438          nbytes = uio->uio_iovcnt;
 439  439          while (nbytes--) {
 440  440                  if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
 441  441                      (intptr_t)(iov->iov_base) & 1) {
 442  442                          copy_resid = uio->uio_resid;
 443  443                          copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
 444  444                          if (copy_base == NULL)
 445  445                                  return (0);
 446  446                          copy_iov.iov_base = copy_base;
 447  447                          copy_iov.iov_len = copy_resid;
 448  448                          copy_uio.uio_iov = &copy_iov;
 449  449                          copy_uio.uio_iovcnt = 1;
 450  450                          copy_uio.uio_segflg = UIO_SYSSPACE;
 451  451                          copy_uio.uio_extflg = UIO_COPY_DEFAULT;
 452  452                          copy_uio.uio_loffset = uio->uio_loffset;
 453  453                          copy_uio.uio_resid = uio->uio_resid;
 454  454                          copy_uio.uio_llimit = uio->uio_llimit;
 455  455                          error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
 456  456                          if (error) {
 457  457                                  kmem_free(copy_base, copy_resid);
 458  458                                  return (0);
 459  459                          }
 460  460                          uio = &copy_uio;
 461  461                          break;
 462  462                  }
 463  463                  iov++;
 464  464          }
 465  465  
 466  466          /*
 467  467           * From here on down, all error exits must go to errout and
 468  468           * not simply return a 0.
 469  469           */
 470  470  
 471  471          /*
 472  472           * DIRECTIO
 473  473           */
 474  474  
 475  475          fs = ip->i_fs;
 476  476  
 477  477          /*
 478  478           * POSIX check. If attempting a concurrent re-write, make sure
 479  479           * that this will be a single request to the driver to meet
 480  480           * POSIX synchronous data integrity requirements.
 481  481           */
 482  482          bmap_peek = 0;
 483  483          if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
 484  484                  int upgrade = 0;
 485  485  
 486  486                  /* check easy conditions first */
 487  487                  if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
 488  488                          upgrade = 1;
 489  489                  } else {
 490  490                          /* now look for contiguous allocation */
 491  491                          len = (ssize_t)blkroundup(fs, resid);
 492  492                          error = bmap_read(ip, uoff, &bn, &len);
 493  493                          if (error || bn == UFS_HOLE || len == 0)
 494  494                                  goto errout;
 495  495                          /* save a call to bmap_read later */
 496  496                          bmap_peek = 1;
 497  497                          if (len < resid)
 498  498                                  upgrade = 1;
 499  499                  }
 500  500                  if (upgrade) {
 501  501                          rw_exit(&ip->i_contents);
 502  502                          rw_enter(&ip->i_contents, RW_WRITER);
 503  503                          ufs_posix_hits++;
 504  504                  }
 505  505          }
 506  506  
 507  507  
 508  508          /*
 509  509           * allocate space
 510  510           */
 511  511  
 512  512          /*
 513  513           * If attempting a re-write, there is no allocation to do.
 514  514           * bmap_write would trip an ASSERT if i_contents is held shared.
 515  515           */
 516  516          if (rewrite)
 517  517                  goto skip_alloc;
 518  518  
 519  519          do {
 520  520                  on = (int)blkoff(fs, uoff);
 521  521                  n = (int)MIN(fs->fs_bsize - on, resid);
 522  522                  if ((uoff + n) > ip->i_size) {
 523  523                          error = bmap_write(ip, uoff, (int)(on + n),
 524  524                              (int)(uoff & (offset_t)MAXBOFFSET) == 0,
 525  525                              NULL, cr);
 526  526                          /* Caller is responsible for updating i_seq if needed */
 527  527                          if (error)
 528  528                                  break;
 529  529                          ip->i_size = uoff + n;
 530  530                          ip->i_flag |= IATTCHG;
 531  531                  } else if (n == MAXBSIZE) {
 532  532                          error = bmap_write(ip, uoff, (int)(on + n),
 533  533                              BI_ALLOC_ONLY, NULL, cr);
 534  534                          /* Caller is responsible for updating i_seq if needed */
 535  535                  } else {
 536  536                          if (has_holes < 0)
 537  537                                  has_holes = bmap_has_holes(ip);
 538  538                          if (has_holes) {
 539  539                                  uint_t  blk_size;
 540  540                                  u_offset_t offset;
 541  541  
 542  542                                  offset = uoff & (offset_t)fs->fs_bmask;
 543  543                                  blk_size = (int)blksize(fs, ip,
 544  544                                      (daddr_t)lblkno(fs, offset));
 545  545                                  error = bmap_write(ip, uoff, blk_size,
 546  546                                      BI_NORMAL, NULL, cr);
 547  547                                  /*
 548  548                                   * Caller is responsible for updating
 549  549                                   * i_seq if needed
 550  550                                   */
 551  551                          } else
 552  552                                  error = 0;
 553  553                  }
 554  554                  if (error)
 555  555                          break;
 556  556                  uoff += n;
 557  557                  resid -= n;
 558  558                  /*
 559  559                   * if file has grown larger than 2GB, set flag
 560  560                   * in superblock if not already set
 561  561                   */
 562  562                  if ((ip->i_size > MAXOFF32_T) &&
 563  563                      !(fs->fs_flags & FSLARGEFILES)) {
 564  564                          ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
 565  565                          mutex_enter(&ufsvfsp->vfs_lock);
 566  566                          fs->fs_flags |= FSLARGEFILES;
 567  567                          ufs_sbwrite(ufsvfsp);
 568  568                          mutex_exit(&ufsvfsp->vfs_lock);
 569  569                  }
 570  570          } while (resid);
 571  571  
 572  572          if (error) {
 573  573                  /*
 574  574                   * restore original state
 575  575                   */
 576  576                  if (resid) {
 577  577                          if (size == ip->i_size)
 578  578                                  goto errout;
 579  579                          (void) ufs_itrunc(ip, size, 0, cr);
 580  580                  }
 581  581                  /*
 582  582                   * try non-directio path
 583  583                   */
 584  584                  goto errout;
 585  585          }
 586  586  skip_alloc:
 587  587  
 588  588          /*
 589  589           * get rid of cached pages
 590  590           */
 591  591          vp = ITOV(ip);
 592  592          exclusive = rw_write_held(&ip->i_contents);
 593  593          if (vn_has_cached_data(vp)) {
 594  594                  if (!exclusive) {
 595  595                          /*
 596  596                           * Still holding i_rwlock, so no allocations
 597  597                           * can happen after dropping contents.
 598  598                           */
 599  599                          rw_exit(&ip->i_contents);
 600  600                          rw_enter(&ip->i_contents, RW_WRITER);
 601  601                  }
 602  602                  (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
 603  603                      B_INVAL, cr, NULL);
 604  604                  if (vn_has_cached_data(vp))
 605  605                          goto errout;
 606  606                  if (!exclusive)

↓ open down ↓

606 lines elided

↑ open up ↑

 607  607                          rw_downgrade(&ip->i_contents);
 608  608                  ufs_directio_kstats.nflushes.value.ui64++;
 609  609          }
 610  610  
 611  611          /*
 612  612           * Direct Writes
 613  613           */
 614  614  
 615  615          if (!exclusive) {
 616  616                  ufs_shared_writes++;
 617      -                ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
      617 +                ncur = atomic_inc_32_nv(&ufs_cur_writes);
 618  618                  if (ncur > ufs_maxcur_writes)
 619  619                          ufs_maxcur_writes = ncur;
 620  620          }
 621  621  
 622  622          /*
 623  623           * proc and as are for VM operations in directio_start()
 624  624           */
 625  625          if (uio->uio_segflg == UIO_USERSPACE) {
 626  626                  procp = ttoproc(curthread);
 627  627                  as = procp->p_as;

 628  628          } else {
 629  629                  procp = NULL;
 630  630                  as = &kas;
 631  631          }
 632  632          *statusp = DIRECTIO_SUCCESS;
 633  633          error = 0;
 634  634          newerror = 0;
 635  635          resid = uio->uio_resid;
 636  636          bytes_written = 0;
 637  637          ufs_directio_kstats.logical_writes.value.ui64++;
 638  638          while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
 639  639                  size_t pglck_len, pglck_size;
 640  640                  caddr_t pglck_base;
 641  641                  page_t **pplist, **spplist;
 642  642  
 643  643                  tail = NULL;
 644  644  
 645  645                  /*
 646  646                   * Adjust number of bytes
 647  647                   */
 648  648                  iov = uio->uio_iov;
 649  649                  pglck_len = (size_t)MIN(iov->iov_len, resid);
 650  650                  pglck_base = iov->iov_base;
 651  651                  if (pglck_len == 0) {
 652  652                          uio->uio_iov++;
 653  653                          uio->uio_iovcnt--;
 654  654                          continue;
 655  655                  }
 656  656  
 657  657                  /*
 658  658                   * Try to lock down the largest chunk of pages possible.
 659  659                   */
 660  660                  pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
 661  661                  error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
 662  662  
 663  663                  if (error)
 664  664                          break;
 665  665  
 666  666                  pglck_size = pglck_len;
 667  667                  while (pglck_len) {
 668  668  
 669  669                          nbytes = pglck_len;
 670  670                          uoff = uio->uio_loffset;
 671  671  
 672  672                          if (!bmap_peek) {
 673  673  
 674  674                                  /*
 675  675                                   * Re-adjust number of bytes to contiguous
 676  676                                   * range. May have already called bmap_read
 677  677                                   * in the case of a concurrent rewrite.
 678  678                                   */
 679  679                                  len = (ssize_t)blkroundup(fs, nbytes);
 680  680                                  error = bmap_read(ip, uoff, &bn, &len);
 681  681                                  if (error)
 682  682                                          break;
 683  683                                  if (bn == UFS_HOLE || len == 0)
 684  684                                          break;
 685  685                          }
 686  686                          nbytes = (size_t)MIN(nbytes, len);
 687  687                          bmap_peek = 0;
 688  688  
 689  689                          /*
 690  690                           * Get the pagelist pointer for this offset to be
 691  691                           * passed to directio_start.
 692  692                           */
 693  693  
 694  694                          if (pplist != NULL)
 695  695                                  spplist = pplist +
 696  696                                      btop((uintptr_t)iov->iov_base -
 697  697                                      ((uintptr_t)pglck_base & PAGEMASK));
 698  698                          else
 699  699                                  spplist = NULL;
 700  700  
 701  701                          /*
 702  702                           * Kick off the direct write requests
 703  703                           */
 704  704                          directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
 705  705                              iov->iov_base, S_READ, procp, &tail, spplist);
 706  706  
 707  707                          /*
 708  708                           * Adjust pointers and counters
 709  709                           */
 710  710                          iov->iov_len -= nbytes;
 711  711                          iov->iov_base += nbytes;
 712  712                          uio->uio_loffset += nbytes;
 713  713                          resid -= nbytes;
 714  714                          pglck_len -= nbytes;
 715  715                  }
 716  716  
 717  717                  /*
 718  718                   * Wait for outstanding requests
 719  719                   */

↓ open down ↓

92 lines elided

↑ open up ↑

 720  720                  newerror = directio_wait(tail, &bytes_written);
 721  721  
 722  722                  /*
 723  723                   * Release VM resources
 724  724                   */
 725  725                  as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
 726  726  
 727  727          }
 728  728  
 729  729          if (!exclusive) {
 730      -                atomic_add_32(&ufs_cur_writes, -1);
      730 +                atomic_dec_32(&ufs_cur_writes);
 731  731                  /*
 732  732                   * If this write was done shared, readers may
 733  733                   * have pulled in unmodified pages. Get rid of
 734  734                   * these potentially stale pages.
 735  735                   */
 736  736                  if (vn_has_cached_data(vp)) {
 737  737                          rw_exit(&ip->i_contents);
 738  738                          rw_enter(&ip->i_contents, RW_WRITER);
 739  739                          (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
 740  740                              B_INVAL, cr, NULL);

 741  741                          ufs_directio_kstats.nflushes.value.ui64++;
 742  742                          rw_downgrade(&ip->i_contents);
 743  743                  }
 744  744          }
 745  745  
 746  746          /*
 747  747           * If error, adjust resid to begin at the first
 748  748           * un-writable byte.
 749  749           */
 750  750          if (error == 0)
 751  751                  error = newerror;
 752  752          if (error)
 753  753                  resid = uio->uio_resid - bytes_written;
 754  754          arg_uio->uio_resid = resid;
 755  755  
 756  756          if (!rewrite) {
 757  757                  ip->i_flag |= IUPD | ICHG;
 758  758                  /* Caller will update i_seq */
 759  759                  TRANS_INODE(ip->i_ufsvfs, ip);
 760  760          }
 761  761          /*
 762  762           * If there is a residual, adjust the EOF if necessary
 763  763           */
 764  764          if (resid) {
 765  765                  if (size != ip->i_size) {
 766  766                          if (uio->uio_loffset > size)
 767  767                                  size = uio->uio_loffset;
 768  768                          (void) ufs_itrunc(ip, size, 0, cr);
 769  769                  }
 770  770          }
 771  771  
 772  772          if (uio == &copy_uio)
 773  773                  kmem_free(copy_base, copy_resid);
 774  774  
 775  775          return (error);
 776  776  
 777  777  errout:
 778  778          if (uio == &copy_uio)
 779  779                  kmem_free(copy_base, copy_resid);
 780  780  
 781  781          return (0);
 782  782  }
 783  783  /*
 784  784   * Direct read of a hole
            *
            * Copies nbytes from ufs_directio_zero_buf (presumably a zero-filled
            * buffer -- confirm at its definition) to the address held in the
            * caller's current iovec, moving at most ufs_directio_zero_len bytes
            * per uiomove() call.  The copy goes through a private uio/iovec pair,
            * so the caller's uio is only read here; ufs_directio_read() advances
            * its own iovec, offset and counters after this returns.
            *
            * Also bumps the hole_reads kstat and adds nbytes to the nread kstat.
            *
            * Returns 0 on success (including nbytes == 0), otherwise the error
            * returned by uiomove().
 785  785   */
 786  786  static int
 787  787  directio_hole(struct uio *uio, size_t nbytes)
 788  788  {
 789  789          int             error = 0, nzero;
 790  790          uio_t           phys_uio;
 791  791          iovec_t         phys_iov;
 792  792  
 793  793          ufs_directio_kstats.hole_reads.value.ui64++;
 794  794          ufs_directio_kstats.nread.value.ui64 += nbytes;
 795  795  
                   /*
                    * Private single-segment iovec aimed at the caller's current
                    * buffer position.
                    */
 796  796          phys_iov.iov_base = uio->uio_iov->iov_base;
 797  797          phys_iov.iov_len = nbytes;
 798  798  
                   /*
                    * Only the fields assigned below are initialized in phys_uio;
                    * the segment and extended flags are inherited from the caller.
                    */
 799  799          phys_uio.uio_iov = &phys_iov;
 800  800          phys_uio.uio_iovcnt = 1;
 801  801          phys_uio.uio_resid = phys_iov.iov_len;
 802  802          phys_uio.uio_segflg = uio->uio_segflg;
 803  803          phys_uio.uio_extflg = uio->uio_extflg;
                   /*
                    * Move one chunk per pass until the request is satisfied or
                    * uiomove() fails.  The chunk size is taken from iov_len, which
                    * presumably shrinks as uiomove() consumes the iovec -- confirm.
                    */
 804  804          while (error == 0 && phys_uio.uio_resid) {
 805  805                  nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
 806  806                  error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
 807  807                      &phys_uio);
 808  808          }
 809  809          return (error);
 810  810  }
 811  811  
 812  812  /*
 813  813   * Direct Read
            *
            * Attempts to satisfy the read described by uio directly between the
            * device and the caller's buffers.
            *
            * *statusp reports whether direct I/O handled the request:
            *   DIRECTIO_FAILURE - direct I/O was not attempted; 0 is returned and
            *       uio is left untouched.  This happens when direct I/O is
            *       disabled, the file is mapped, the offset, residual or any iovec
            *       is not suitably aligned, or cached pages could not be flushed.
            *   DIRECTIO_SUCCESS - the request was handled here (including the
            *       "offset at or past EOF" case, which transfers nothing).
            *
            * The return value is 0, or the first error met while mapping, locking
            * or waiting for the I/O.  uio->uio_resid is reduced by the number of
            * bytes accounted as read, whether or not an error occurred.
            *
            * The i_contents lock is released and re-taken below, so the caller
            * must hold it on entry (presumably as reader -- confirm at the call
            * site).
 814  814   */
 815  815  int
 816  816  ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
 817  817  {
 818  818          ssize_t         resid, bytes_read;
 819  819          u_offset_t      size, uoff;
 820  820          int             error, newerror, len;
 821  821          size_t          nbytes;
 822  822          struct fs       *fs;
 823  823          vnode_t         *vp;
 824  824          daddr_t         bn;
 825  825          iovec_t         *iov;
 826  826          struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
 827  827          struct proc     *procp;
 828  828          struct as       *as;
 829  829          struct directio_buf     *tail;
 830  830  
 831  831          /*
 832  832           * assume that directio isn't possible (normal case)
 833  833           */
 834  834          *statusp = DIRECTIO_FAILURE;
 835  835  
 836  836          /*
 837  837           * Don't go direct
 838  838           */
 839  839          if (ufs_directio_enabled == 0)
 840  840                  return (0);
 841  841  
 842  842          /*
 843  843           * mapped file; nevermind
 844  844           */
 845  845          if (ip->i_mapcnt)
 846  846                  return (0);
 847  847  
 848  848          /*
 849  849           * CAN WE DO DIRECT IO?
 850  850           */
 851  851          /*
 852  852           * must be sector aligned
 853  853           */
 854  854          uoff = uio->uio_loffset;
 855  855          resid = uio->uio_resid;
 856  856          if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
 857  857                  return (0);
 858  858          /*
 859  859           * must be short aligned and sector aligned
                    * (each iovec length must be a DEV_BSIZE multiple and each base
                    * address even; nbytes is borrowed here as the iovec counter)
 860  860           */
 861  861          iov = uio->uio_iov;
 862  862          nbytes = uio->uio_iovcnt;
 863  863          while (nbytes--) {
 864  864                  if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
 865  865                          return (0);
 866  866                  if ((intptr_t)(iov++->iov_base) & 1)
 867  867                          return (0);
 868  868          }
 869  869  
 870  870          /*
 871  871           * DIRECTIO
 872  872           */
 873  873          fs = ip->i_fs;
 874  874  
 875  875          /*
 876  876           * don't read past EOF
 877  877           */
 878  878          size = ip->i_size;
 879  879  
 880  880          /*
 881  881           * The file offset is past EOF so bail out here; we don't want
 882  882           * to update uio_resid and make it look like we read something.
 883  883           * We say that direct I/O was a success to avoid having rdip()
 884  884           * go through the same "read past EOF logic".
 885  885           */
 886  886          if (uoff >= size) {
 887  887                  *statusp = DIRECTIO_SUCCESS;
 888  888                  return (0);
 889  889          }
 890  890  
 891  891          /*
 892  892           * The read would extend past EOF so make it smaller.
                    * Only the local resid is trimmed; uio->uio_resid is adjusted
                    * at the end by the number of bytes actually read.
 893  893           */
 894  894          if ((uoff + resid) > size) {
 895  895                  resid = size - uoff;
 896  896                  /*
 897  897                   * recheck sector alignment
 898  898                   */
 899  899                  if (resid & (DEV_BSIZE - 1))
 900  900                          return (0);
 901  901          }
 902  902  
 903  903          /*
 904  904           * At this point, we know there is some real work to do.
 905  905           */
 906  906          ASSERT(resid);
 907  907  
 908  908          /*
 909  909           * get rid of cached pages
                    *
                    * The current hold on i_contents is dropped and re-taken as
                    * writer around the invalidation, then downgraded again.
                    *
                    * NOTE(review): if cached pages remain after VOP_PUTPAGE(), the
                    * early return below happens before rw_downgrade(), so
                    * i_contents is still held as writer on that path -- confirm
                    * that callers cope with this.
 910  910           */
 911  911          vp = ITOV(ip);
 912  912          if (vn_has_cached_data(vp)) {
 913  913                  rw_exit(&ip->i_contents);
 914  914                  rw_enter(&ip->i_contents, RW_WRITER);
 915  915                  (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
 916  916                      B_INVAL, cr, NULL);
 917  917                  if (vn_has_cached_data(vp))
 918  918                          return (0);
 919  919                  rw_downgrade(&ip->i_contents);
 920  920                  ufs_directio_kstats.nflushes.value.ui64++;
 921  921          }
 922  922          /*
 923  923           * Direct Reads
 924  924           */
 925  925  
 926  926          /*
 927  927           * proc and as are for VM operations in directio_start()
 928  928           */
 929  929          if (uio->uio_segflg == UIO_USERSPACE) {
 930  930                  procp = ttoproc(curthread);
 931  931                  as = procp->p_as;
 932  932          } else {
 933  933                  procp = NULL;
 934  934                  as = &kas;
 935  935          }
 936  936  
                   /*
                    * From here on the request is reported as handled by direct
                    * I/O; *statusp is not changed again, even if an error occurs.
                    */
 937  937          *statusp = DIRECTIO_SUCCESS;
 938  938          error = 0;
 939  939          newerror = 0;
 940  940          bytes_read = 0;
 941  941          ufs_directio_kstats.logical_reads.value.ui64++;
                   /*
                    * Each pass of the outer loop locks down one chunk of the
                    * current iovec, issues the reads for it, waits for them and
                    * unlocks the chunk again.
                    */
 942  942          while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
 943  943                  size_t pglck_len, pglck_size;
 944  944                  caddr_t pglck_base;
 945  945                  page_t **pplist, **spplist;
 946  946  
 947  947                  tail = NULL;
 948  948  
 949  949                  /*
 950  950                   * Adjust number of bytes
                            * (an exhausted iovec is skipped by moving to the next one)
 951  951                   */
 952  952                  iov = uio->uio_iov;
 953  953                  pglck_len = (size_t)MIN(iov->iov_len, resid);
 954  954                  pglck_base = iov->iov_base;
 955  955                  if (pglck_len == 0) {
 956  956                          uio->uio_iov++;
 957  957                          uio->uio_iovcnt--;
 958  958                          continue;
 959  959                  }
 960  960  
 961  961                  /*
 962  962                   * Try to Lock down the largest chunk of pages possible.
                            * The chunk is capped at vfs_ioclustsz.  S_WRITE is
                            * presumably used because the device read stores into
                            * these pages -- confirm against as_pagelock().
 963  963                   */
 964  964                  pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
 965  965                  error = as_pagelock(as, &pplist, pglck_base,
 966  966                      pglck_len, S_WRITE);
 967  967  
 968  968                  if (error)
 969  969                          break;
 970  970  
                           /*
                            * pglck_len is consumed by the inner loop; pglck_size keeps
                            * the locked length for the matching as_pageunlock().
                            */
 971  971                  pglck_size = pglck_len;
 972  972                  while (pglck_len) {
 973  973  
 974  974                          nbytes = pglck_len;
 975  975                          uoff = uio->uio_loffset;
 976  976  
 977  977                          /*
 978  978                           * Re-adjust number of bytes to contiguous range
 979  979                           */
 980  980                          len = (ssize_t)blkroundup(fs, nbytes);
 981  981                          error = bmap_read(ip, uoff, &bn, &len);
 982  982                          if (error)
 983  983                                  break;
 984  984  
 985  985                          if (bn == UFS_HOLE) {
                                           /*
                                            * Handle the hole at most up to the end of
                                            * the current file system block.
                                            */
 986  986                                  nbytes = (size_t)MIN(fs->fs_bsize -
 987  987                                      (long)blkoff(fs, uoff), nbytes);
 988  988                                  error = directio_hole(uio, nbytes);
 989  989                                  /*
 990  990                                   * Hole reads are not added to the list
 991  991                                   * processed by directio_wait() below so
 992  992                                   * account for bytes read here.
 993  993                                   */
 994  994                                  if (!error)
 995  995                                          bytes_read += nbytes;
 996  996                          } else {
 997  997                                  nbytes = (size_t)MIN(nbytes, len);
 998  998  
 999  999                                  /*
1000 1000                                   * Get the pagelist pointer for this offset
1001 1001                                   * to be passed to directio_start.
                                            * The index is the page distance of the
                                            * current iov_base from the masked
                                            * pglck_base.  pplist may be NULL even
                                            * though as_pagelock() succeeded; NULL is
                                            * then passed through.
1002 1002                                   */
1003 1003                                  if (pplist != NULL)
1004 1004                                          spplist = pplist +
1005 1005                                              btop((uintptr_t)iov->iov_base -
1006 1006                                              ((uintptr_t)pglck_base & PAGEMASK));
1007 1007                                  else
1008 1008                                          spplist = NULL;
1009 1009  
1010 1010                                  /*
1011 1011                                   * Kick off the direct read requests
1012 1012                                   */
1013 1013                                  directio_start(ufsvfsp, ip, nbytes,
1014 1014                                      ldbtob(bn), iov->iov_base,
1015 1015                                      S_WRITE, procp, &tail, spplist);
1016 1016                          }
1017 1017  
                                   /*
                                    * Only directio_hole() can have set error here;
                                    * leave the iovec and counters untouched on failure.
                                    */
1018 1018                          if (error)
1019 1019                                  break;
1020 1020  
1021 1021                          /*
1022 1022                           * Adjust pointers and counters
1023 1023                           */
1024 1024                          iov->iov_len -= nbytes;
1025 1025                          iov->iov_base += nbytes;
1026 1026                          uio->uio_loffset += nbytes;
1027 1027                          resid -= nbytes;
1028 1028                          pglck_len -= nbytes;
1029 1029                  }
1030 1030  
1031 1031                  /*
1032 1032                   * Wait for outstanding requests
                            * (also done when the inner loop stopped on an error, so
                            * requests already started are collected; directio_wait()
                            * presumably adds the completed bytes to bytes_read --
                            * confirm at its definition)
1033 1033                   */
1034 1034                  newerror = directio_wait(tail, &bytes_read);
1035 1035                  /*
1036 1036                   * Release VM resources
1037 1037                   */
1038 1038                  as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1039 1039  
1040 1040          }
1041 1041  
1042 1042          /*
1043 1043           * If error, adjust resid to begin at the first
1044 1044           * un-read byte.
                    * An error from mapping or locking takes precedence over one
                    * reported by directio_wait().
1045 1045           */
1046 1046          if (error == 0)
1047 1047                  error = newerror;
1048 1048          uio->uio_resid -= bytes_read;
1049 1049          return (error);
1050 1050  }

↓ open down ↓

310 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX