as-lock-macros Wdiff usr/src/uts/common/os/vm_subr.c

Print this page

patch as-lock-macro-simplification

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/os/vm_subr.c
          +++ new/usr/src/uts/common/os/vm_subr.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26   26  /*        All Rights Reserved   */
  27   27  
  28   28  /*
  29   29   * University Copyright- Copyright (c) 1982, 1986, 1988
  30   30   * The Regents of the University of California
  31   31   * All Rights Reserved
  32   32   *
  33   33   * University Acknowledgment- Portions of this document are derived from
  34   34   * software developed by the University of California, Berkeley, and its
  35   35   * contributors.
  36   36   */
  37   37  
  38   38  #include <sys/types.h>
  39   39  #include <sys/t_lock.h>
  40   40  #include <sys/param.h>
  41   41  #include <sys/errno.h>
  42   42  #include <sys/debug.h>
  43   43  #include <sys/cmn_err.h>
  44   44  #include <sys/kmem.h>
  45   45  #include <sys/sysmacros.h>
  46   46  #include <sys/inline.h>
  47   47  #include <sys/buf.h>
  48   48  #include <sys/uio.h>
  49   49  #include <sys/user.h>
  50   50  #include <sys/proc.h>
  51   51  #include <sys/systm.h>
  52   52  #include <sys/vmsystm.h>
  53   53  #include <sys/cpuvar.h>
  54   54  #include <sys/mman.h>
  55   55  #include <sys/cred.h>
  56   56  #include <sys/vnode.h>
  57   57  #include <sys/file.h>
  58   58  #include <sys/vm.h>
  59   59  
  60   60  #include <sys/swap.h>
  61   61  #include <sys/vtrace.h>
  62   62  #include <sys/tnf_probe.h>
  63   63  #include <sys/fs/snode.h>
  64   64  #include <sys/copyops.h>
  65   65  #include <sys/conf.h>
  66   66  #include <sys/sdt.h>
  67   67  
  68   68  #include <vm/anon.h>
  69   69  #include <vm/hat.h>
  70   70  #include <vm/as.h>
  71   71  #include <vm/seg.h>
  72   72  #include <vm/page.h>
  73   73  #include <vm/seg_vn.h>
  74   74  #include <vm/seg_kmem.h>
  75   75  
  76   76  extern int maxphys;
  77   77  
  78   78  void
  79   79  minphys(struct buf *bp)
  80   80  {
  81   81          if (bp->b_bcount > maxphys)
  82   82                  bp->b_bcount = maxphys;
  83   83  }
  84   84  
  85   85  /*
  86   86   * Use kmem_cache_create for physio buffers. This has shown
  87   87   * a better cache distribution compared to buffers on the
  88   88   * stack. It also avoids semaphore construction/destruction
  89   89   * per request.
  90   90   */
  91   91  
  92   92  static struct kmem_cache *physio_buf_cache;
  93   93  
  94   94  /* ARGSUSED */
  95   95  static int
  96   96  physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
  97   97  {
  98   98          bioinit((struct buf *)buf);
  99   99          return (0);
 100  100  }
 101  101  
 102  102  /* ARGSUSED */
 103  103  static void
 104  104  physio_buf_destructor(void *buf, void *cdrarg)
 105  105  {
 106  106          biofini((struct buf *)buf);
 107  107  }
 108  108  
 109  109  void
 110  110  physio_bufs_init(void)
 111  111  {
 112  112          physio_buf_cache = kmem_cache_create("physio_buf_cache",
 113  113              sizeof (struct buf), 0, physio_buf_constructor,
 114  114              physio_buf_destructor, NULL, NULL, NULL, 0);
 115  115  }
 116  116  
 117  117  
 118  118  
 119  119  /*
 120  120   * initiate raw I/O request
 121  121   *
 122  122   * allocate buf header if necessary
 123  123   * adjust max size of each I/O request
 124  124   * lock down user pages and verify access protections
 125  125   * call driver's strategy routine to submit request
 126  126   * wait for I/O completion
 127  127   * unlock user pages and free allocated buf header
 128  128   */
 129  129  
 130  130  int
 131  131  default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
 132  132          int rw, void (*mincnt)(struct buf *), struct uio *uio)
 133  133  {
 134  134          struct iovec *iov;
 135  135          struct proc *procp;
 136  136          struct as *asp;
 137  137          ssize_t c;
 138  138          char *a;
 139  139          int error = 0;
 140  140          page_t **pplist;
 141  141          int allocbuf = 0;
 142  142  
 143  143          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
 144  144  
 145  145          /* Kernel probe */
 146  146          TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
 147  147              tnf_device,         device,         dev,
 148  148              tnf_offset,         offset,         uio->uio_loffset,
 149  149              tnf_size,           size,           uio->uio_resid,
 150  150              tnf_bioflags,       rw,             rw);
 151  151  
 152  152          if (rw == B_READ) {
 153  153                  CPU_STATS_ADD_K(sys, phread, 1);
 154  154          } else {
 155  155                  CPU_STATS_ADD_K(sys, phwrite, 1);
 156  156          }
 157  157  
 158  158          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
 159  159              "getbuf_start: bp %p", bp);
 160  160  
 161  161          if (bp == NULL) {
 162  162                  bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
 163  163                  bp->b_iodone = NULL;
 164  164                  bp->b_resid = 0;
 165  165                  allocbuf = 1;
 166  166          }
 167  167          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
 168  168  
 169  169          if (uio->uio_segflg == UIO_USERSPACE) {
 170  170                  procp = ttoproc(curthread);
 171  171                  asp = procp->p_as;
 172  172          } else {
 173  173                  procp = NULL;
 174  174                  asp = &kas;
 175  175          }
 176  176          ASSERT(SEMA_HELD(&bp->b_sem));
 177  177  
 178  178          /*
 179  179           * We need to prepare this buffer for the io:::start probe, including
 180  180           * NULL'ing out the file, clearing the offset, and filling in the
 181  181           * b_dip field.
 182  182           */
 183  183          bp->b_file = NULL;
 184  184          bp->b_offset = -1;
 185  185  
 186  186          if (dev != NODEV) {
 187  187                  (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
 188  188                      DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
 189  189          } else {
 190  190                  bp->b_dip = NULL;
 191  191          }
 192  192  
 193  193          while (uio->uio_iovcnt > 0) {
 194  194                  iov = uio->uio_iov;
 195  195  
 196  196                  bp->b_error = 0;
 197  197                  bp->b_proc = procp;
 198  198  
 199  199                  while (iov->iov_len > 0) {
 200  200                          if (uio->uio_resid == 0)
 201  201                                  break;
 202  202                          if (uio->uio_loffset < 0) {
 203  203                                  error = EINVAL;
 204  204                                  break;
 205  205                          }
 206  206  #ifdef  _ILP32
 207  207                          /*
 208  208                           * For 32-bit kernels, check against SPEC_MAXOFFSET_T
 209  209                           * which represents the maximum size that can be
 210  210                           * supported by the IO subsystem.
 211  211                           * XXX this code assumes a D_64BIT driver.
 212  212                           */
 213  213                          if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
 214  214                                  error = EINVAL;
 215  215                                  break;
 216  216                          }
 217  217  #endif  /* _ILP32 */
 218  218                          bp->b_flags = B_BUSY | B_PHYS | rw;
 219  219                          bp->b_edev = dev;
 220  220                          bp->b_lblkno = btodt(uio->uio_loffset);
 221  221  
 222  222                          /*
 223  223                           * Don't count on b_addr remaining untouched by the
 224  224                           * code below (it may be reset because someone does
 225  225                           * a bp_mapin on the buffer) -- reset from the iov
 226  226                           * each time through, updating the iov's base address
 227  227                           * instead.
 228  228                           */
 229  229                          a = bp->b_un.b_addr = iov->iov_base;
 230  230                          bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
 231  231                          (*mincnt)(bp);
 232  232                          c = bp->b_bcount;
 233  233  
 234  234                          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
 235  235                              "as_pagelock_start: bp %p", bp);
 236  236  
 237  237                          error = as_pagelock(asp, &pplist, a,
 238  238                              c, rw == B_READ? S_WRITE : S_READ);
 239  239  
 240  240                          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
 241  241                              "as_pagelock_end:");
 242  242  
 243  243                          if (error != 0) {
 244  244                                  bp->b_flags |= B_ERROR;
 245  245                                  bp->b_error = error;
 246  246                                  bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
 247  247                                  break;
 248  248                          }
 249  249                          bp->b_shadow = pplist;
 250  250                          if (pplist != NULL) {
 251  251                                  bp->b_flags |= B_SHADOW;
 252  252                          }
 253  253  
 254  254                          DTRACE_IO1(start, struct buf *, bp);
 255  255                          bp->b_flags |= B_STARTED;
 256  256  
 257  257                          (void) (*strat)(bp);
 258  258                          error = biowait(bp);
 259  259  
 260  260                          /*
 261  261                           * unlock the pages
 262  262                           */
 263  263                          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
 264  264                              "as_pageunlock_start: bp %p", bp);
 265  265  
 266  266                          as_pageunlock(asp, pplist, a, c,
 267  267                              rw == B_READ? S_WRITE : S_READ);
 268  268  
 269  269                          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
 270  270                              "as_pageunlock_end:");
 271  271  
 272  272                          c -= bp->b_resid;
 273  273                          iov->iov_base += c;
 274  274                          iov->iov_len -= c;
 275  275                          uio->uio_resid -= c;
 276  276                          uio->uio_loffset += c;
 277  277                          /* bp->b_resid - temp kludge for tape drives */
 278  278                          if (bp->b_resid || error)
 279  279                                  break;
 280  280                  }
 281  281                  bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
 282  282                  /* bp->b_resid - temp kludge for tape drives */
 283  283                  if (bp->b_resid || error)
 284  284                          break;
 285  285                  uio->uio_iov++;
 286  286                  uio->uio_iovcnt--;
 287  287          }
 288  288  
 289  289          if (allocbuf) {
 290  290                  kmem_cache_free(physio_buf_cache, bp);
 291  291          }
 292  292  
 293  293          /* Kernel probe */
 294  294          TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
 295  295                  tnf_device,     device,         dev);
 296  296  
 297  297          TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
 298  298  
 299  299          return (error);
 300  300  }
 301  301  
 302  302  /*
 303  303   * Returns 0 on success, or an error on failure.
 304  304   *
 305  305   * This function is no longer a part of the DDI/DKI.
 306  306   * However, for compatibility, its interface should not
 307  307   * be changed and it should not be removed from the kernel.
 308  308   */
 309  309  int
 310  310  useracc(void *addr, size_t count, int access)
 311  311  {
 312  312          uint_t prot;
 313  313  
 314  314          prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
 315  315          return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
 316  316  }
 317  317  
 318  318  #define MAX_MAPIN_PAGES 8
 319  319  
 320  320  /*
 321  321   * This function temporarily "borrows" user pages for kernel use. If
 322  322   * "cow" is on, it also sets up copy-on-write protection (only feasible
 323  323   * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 324  324   * pages from any changes by the user. The caller is responsible for
 325  325   * unlocking and tearing down cow settings when it's done with the pages.
 326  326   * For an example, see kcfree().
 327  327   *
 328  328   * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 329  329   * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 330  330   * kaddr != -1. On entering this function, cached_ppp contains a list
 331  331   * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 332  332   * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp],
 333  333   * the kernel map won't need to be reloaded again.
 334  334   *
 335  335   * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 336  336   * reference count, and changes the user-mapping to read-only. This
 337  337   * scheme should work on all types of segment drivers. But to be safe,
 338  338   * we check against segvn here.
 339  339   *
 340  340   * Since this function is used to emulate copyin() semantics, it checks
 341  341   * to make sure the user-mappings allow "user-read".
 342  342   *
 343  343   * On exit "lenp" contains the number of bytes successfully locked and
 344  344   * mapped in. For the unsuccessful ones, the caller can fall back to
 345  345   * copyin().
 346  346   *
 347  347   * Error return:
 348  348   * ENOTSUP - operation like this is not supported either on this segment
 349  349   * type, or on this platform type.
 350  350   */
 351  351  int
 352  352  cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
 353  353      struct anon **app, size_t *lenp, int cow)
 354  354  {
 355  355          struct          hat *hat;
 356  356          struct seg      *seg;

↓ open down ↓

356 lines elided

↑ open up ↑

 357  357          caddr_t         base;
 358  358          page_t          *pp, *ppp[MAX_MAPIN_PAGES];
 359  359          long            i;
 360  360          int             flags;
 361  361          size_t          size, total = *lenp;
 362  362          char            first = 1;
 363  363          faultcode_t     res;
 364  364  
 365  365          *lenp = 0;
 366  366          if (cow) {
 367      -                AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
      367 +                AS_LOCK_ENTER(as, RW_WRITER);
 368  368                  seg = as_findseg(as, uaddr, 0);
 369  369                  if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
 370  370                      (uaddr + total) > base + seg->s_size) {
 371      -                        AS_LOCK_EXIT(as, &as->a_lock);
      371 +                        AS_LOCK_EXIT(as);
 372  372                          return (EINVAL);
 373  373                  }
 374  374                  /*
 375  375                   * The COW scheme should work for all segment types.
 376  376                   * But to be safe, we check against segvn.
 377  377                   */
 378  378                  if (seg->s_ops != &segvn_ops) {
 379      -                        AS_LOCK_EXIT(as, &as->a_lock);
      379 +                        AS_LOCK_EXIT(as);
 380  380                          return (ENOTSUP);
 381  381                  } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
 382      -                        AS_LOCK_EXIT(as, &as->a_lock);
      382 +                        AS_LOCK_EXIT(as);
 383  383                          return (ENOTSUP);
 384  384                  }
 385  385          }
 386  386          hat = as->a_hat;
 387  387          size = total;
 388  388  tryagain:
 389  389          /*
 390  390           * If (cow), hat_softlock will also change the user protection to RO.
 391  391           * This is the first step toward setting up cow. Before we
 392  392           * bump up an_refcnt, we can't allow any cow-fault on this

 393  393           * address. Otherwise segvn_fault will change the protection back
 394  394           * to RW upon seeing an_refcnt == 1.
 395  395           * The solution is to hold the writer lock on "as".
 396  396           */
 397  397          res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
 398  398          size = total - size;
 399  399          *lenp += size;
 400  400          size = size >> PAGESHIFT;
 401  401          i = 0;
 402  402          while (i < size) {
 403  403                  pp = ppp[i];
 404  404                  if (cow) {
 405  405                          kmutex_t *ahm;
 406  406                          /*
 407  407                           * Another solution is to hold SE_EXCL on pp, and
 408  408                           * disable PROT_WRITE. This also works for MAP_SHARED
 409  409                           * segment. The disadvantage is that it locks the
 410  410                           * page from being used by anybody else.
 411  411                           */
 412  412                          ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
 413  413                          mutex_enter(ahm);
 414  414                          *app = swap_anon(pp->p_vnode, pp->p_offset);
 415  415                          /*
 416  416                           * Since we are holding the as lock, this avoids a
 417  417                           * potential race with anon_decref. (segvn_unmap and
 418  418                           * segvn_free need the as writer lock to do anon_free.)
 419  419                           */
 420  420                          if (*app != NULL) {
 421  421  #if 0
 422  422                                  if ((*app)->an_refcnt == 0)
 423  423                                  /*
 424  424                                   * Consider the following scenario (unlikely
 425  425                                   * though):
 426  426                                   * 1. an_refcnt == 2
 427  427                                   * 2. we softlock the page.
 428  428                                   * 3. cow occurs on this addr. So a new ap,
 429  429                                   * page and mapping is established on addr.
 430  430                                   * 4. an_refcnt drops to 1 (segvn_faultpage
 431  431                                   * -> anon_decref(oldap))
 432  432                                   * 5. the last ref to ap also drops (from
 433  433                                   * another as). It ends up blocked inside
 434  434                                   * anon_decref trying to get page's excl lock.
 435  435                                   * 6. Later kcfree unlocks the page, call
 436  436                                   * anon_decref -> oops, ap is gone already.
 437  437                                   *
 438  438                                   * Holding as writer lock solves all problems.
 439  439                                   */
 440  440                                          *app = NULL;
 441  441                                  else
 442  442  #endif
 443  443                                          (*app)->an_refcnt++;
 444  444                          }
 445  445                          mutex_exit(ahm);
 446  446                  } else {
 447  447                          *app = NULL;
 448  448                  }
 449  449                  if (kaddr != (caddr_t)-1) {
 450  450                          if (pp != *cached_ppp) {
 451  451                                  if (*cached_ppp == NULL)
 452  452                                          flags = HAT_LOAD_LOCK | HAT_NOSYNC |
 453  453                                              HAT_LOAD_NOCONSIST;
 454  454                                  else
 455  455                                          flags = HAT_LOAD_REMAP |
 456  456                                              HAT_LOAD_NOCONSIST;
 457  457                                  /*
 458  458                                   * In order to cache the kernel mapping after
 459  459                                   * the user page is unlocked, we call
 460  460                                   * hat_devload instead of hat_memload so
 461  461                                   * that the kernel mapping we set up here is
 462  462                                   * "invisible" to the rest of the world. This
 463  463                                   * is not very pretty. But as long as the
 464  464                                   * caller bears the responsibility of keeping
 465  465                                   * cache consistency, we should be ok -
 466  466                                   * HAT_NOCONSIST will get us an uncached
 467  467                                   * mapping on VAC. hat_softlock will flush
 468  468                                   * a VAC_WRITEBACK cache. Therefore the kaddr
 469  469                                   * doesn't have to be of the same vcolor as
 470  470                                   * uaddr.
 471  471                                   * The alternative is - change hat_devload
 472  472                                   * to get a cached mapping. Allocate a kaddr
 473  473                                   * with the same vcolor as uaddr. Then
 474  474                                   * hat_softlock won't need to flush the VAC.
 475  475                                   */
 476  476                                  hat_devload(kas.a_hat, kaddr, PAGESIZE,

↓ open down ↓

84 lines elided

↑ open up ↑

 477  477                                      page_pptonum(pp), PROT_READ, flags);
 478  478                                  *cached_ppp = pp;
 479  479                          }
 480  480                          kaddr += PAGESIZE;
 481  481                  }
 482  482                  cached_ppp++;
 483  483                  app++;
 484  484                  ++i;
 485  485          }
 486  486          if (cow) {
 487      -                AS_LOCK_EXIT(as, &as->a_lock);
      487 +                AS_LOCK_EXIT(as);
 488  488          }
 489  489          if (first && res == FC_NOMAP) {
 490  490                  /*
 491  491                   * If the address is not mapped yet, we call as_fault to
 492  492                   * fault the pages in. We could've fallen back to copy and
 493  493                   * let it fault in the pages. But for a mapped file, we
 494  494                   * normally reference each page only once. For zero-copy to
 495  495                   * be of any use, we'd better fault in the page now and try
 496  496                   * again.
 497  497                   */
 498  498                  first = 0;
 499  499                  size = size << PAGESHIFT;
 500  500                  uaddr += size;
 501  501                  total -= size;
 502  502                  size = total;
 503  503                  res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
 504  504                  if (cow)
 505      -                        AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
      505 +                        AS_LOCK_ENTER(as, RW_WRITER);
 506  506                  goto tryagain;
 507  507          }
 508  508          switch (res) {
 509  509          case FC_NOSUPPORT:
 510  510                  return (ENOTSUP);
 511  511          case FC_PROT:   /* Pretend we don't know about it. This will be */
 512  512                          /* caught by the caller when uiomove fails. */
 513  513          case FC_NOMAP:
 514  514          case FC_OBJERR:
 515  515          default:
 516  516                  return (0);
 517  517          }
 518  518  }

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX