1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2012 by Delphix. All rights reserved.
  28  */
  29 
  30 #include <sys/types.h>
  31 #include <sys/devops.h>
  32 #include <sys/conf.h>
  33 #include <sys/modctl.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/stat.h>
  36 #include <sys/poll_impl.h>
  37 #include <sys/errno.h>
  38 #include <sys/kmem.h>
  39 #include <sys/mkdev.h>
  40 #include <sys/debug.h>
  41 #include <sys/file.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/systm.h>
  44 #include <sys/bitmap.h>
  45 #include <sys/devpoll.h>
  46 #include <sys/rctl.h>
  47 #include <sys/resource.h>
  48 
/*
 * Placeholder stored in devpolltbl[] while dpopen() sets up a slot with
 * devpoll_lock dropped; distinguishes "claimed" from "free" (NULL).
 */
#define RESERVED        1

/* local data struct */
static  dp_entry_t      **devpolltbl;   /* dev poll entries */
static  size_t          dptblsize;      /* current capacity of devpolltbl */

static  kmutex_t        devpoll_lock;   /* lock protecting dev tbl */
int                     devpoll_init;   /* is /dev/poll initialized already */

/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;      /* devinfo handle; set in dpattach() */
  68 
  69 
/*
 * Character device entry points.  Only open, close, write, ioctl and
 * poll are implemented; everything else is nodev.  D_MP declares the
 * driver MT-safe.
 */
static struct cb_ops    dp_cb_ops = {
        dpopen,                 /* open */
        dpclose,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        dpwrite,                /* write */
        dpioctl,                /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        dppoll,                 /* poll */
        ddi_prop_op,            /* prop_op */
        (struct streamtab *)0,  /* streamtab */
        D_MP,                   /* flags */
        CB_REV,                 /* cb_ops revision */
        nodev,                  /* aread */
        nodev                   /* awrite */
};
  90 
  91 static int dpattach(dev_info_t *, ddi_attach_cmd_t);
  92 static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
  93 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
  94 
/*
 * Device operations.  This is a pseudo device, so no probe is needed
 * and quiesce is not required (ddi_quiesce_not_needed).
 */
static struct dev_ops dp_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        dpinfo,                 /* info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        dpattach,               /* attach */
        dpdetach,               /* detach */
        nodev,                  /* reset */
        &dp_cb_ops,         /* driver operations */
        (struct bus_ops *)NULL, /* bus operations */
        nulldev,                /* power */
        ddi_quiesce_not_needed,         /* quiesce */
};
 109 
 110 
/* Driver linkage: identifies this module as a device driver. */
static struct modldrv modldrv = {
        &mod_driverops,             /* type of module - a driver */
        "/dev/poll driver",
        &dp_ops,
};
 116 
/* Module linkage: a single driver element, NULL-terminated. */
static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};
 122 
 123 /*
 124  * Locking Design
 125  *
 126  * The /dev/poll driver shares most of its code with poll sys call whose
 127  * code is in common/syscall/poll.c. In poll(2) design, the pollcache
 128  * structure is per lwp. An implicit assumption is made there that some
 129  * portion of pollcache will never be touched by other lwps. E.g., in
 130  * poll(2) design, no lwp will ever need to grow bitmap of other lwp.
 131  * This assumption is not true for /dev/poll; hence the need for extra
 132  * locking.
 133  *
 134  * To allow more parallelism, each /dev/poll file descriptor (indexed by
 135  * minor number) has its own lock. Since read (dpioctl) is a much more
 136  * frequent operation than write, we want to allow multiple reads on same
 137  * /dev/poll fd. However, we prevent writes from being starved by giving
 138  * priority to write operation. Theoretically writes can starve reads as
 * well. But in a practical sense this is not important because (1) writes
 * happen less often than reads, and (2) a write operation defines the
 * contents of the cached poll fd set. If writes happen so often that they
 142  * can starve reads, that means the cached set is very unstable. It may
 143  * not make sense to read an unstable cache set anyway. Therefore, the
 144  * writers starving readers case is not handled in this design.
 145  */
 146 
 147 int
 148 _init()
 149 {
 150         int     error;
 151 
 152         dptblsize = DEVPOLLSIZE;
 153         devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
 154         mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
 155         devpoll_init = 1;
 156         if ((error = mod_install(&modlinkage)) != 0) {
 157                 mutex_destroy(&devpoll_lock);
 158                 kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
 159                 devpoll_init = 0;
 160         }
 161         return (error);
 162 }
 163 
 164 int
 165 _fini()
 166 {
 167         int error;
 168 
 169         if ((error = mod_remove(&modlinkage)) != 0) {
 170                 return (error);
 171         }
 172         mutex_destroy(&devpoll_lock);
 173         kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
 174         return (0);
 175 }
 176 
 177 int
 178 _info(struct modinfo *modinfop)
 179 {
 180         return (mod_info(&modlinkage, modinfop));
 181 }
 182 
 183 /*ARGSUSED*/
 184 static int
 185 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 186 {
 187         if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
 188             == DDI_FAILURE) {
 189                 ddi_remove_minor_node(devi, NULL);
 190                 return (DDI_FAILURE);
 191         }
 192         dpdevi = devi;
 193         return (DDI_SUCCESS);
 194 }
 195 
 196 static int
 197 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
 198 {
 199         if (cmd != DDI_DETACH)
 200                 return (DDI_FAILURE);
 201 
 202         ddi_remove_minor_node(devi, NULL);
 203         return (DDI_SUCCESS);
 204 }
 205 
 206 /* ARGSUSED */
 207 static int
 208 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 209 {
 210         int error;
 211 
 212         switch (infocmd) {
 213         case DDI_INFO_DEVT2DEVINFO:
 214                 *result = (void *)dpdevi;
 215                 error = DDI_SUCCESS;
 216                 break;
 217         case DDI_INFO_DEVT2INSTANCE:
 218                 *result = (void *)0;
 219                 error = DDI_SUCCESS;
 220                 break;
 221         default:
 222                 error = DDI_FAILURE;
 223         }
 224         return (error);
 225 }
 226 
/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting at
 * where it was stopped last time, instead of always starting from 0,
 * (2) since user may not have cleaned up the cached fds when they are
 * closed, some polldats in cache may refer to closed or reused fds. We
 * need to check for those cases.
 *
 * Returns 0 or the first error reported by VOP_POLL().  On return
 * *fdcntp holds the number of pollfd entries written into pfdp (at
 * most nfds).  Caller must hold pcp->pc_lock throughout.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *       poll(2) caches but NOT for /dev/poll caches. So expect some
 *       stale entries!
 */
static int
dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
        int             start, ostart, end;
        int             fdcnt, fd;
        boolean_t       done;
        file_t          *fp;
        short           revent;
        boolean_t       no_wrap;
        pollhead_t      *php;
        polldat_t       *pdp;
        int             error = 0;

        ASSERT(MUTEX_HELD(&pcp->pc_lock));
        if (pcp->pc_bitmap == NULL) {
                /*
                 * No need to search because no poll fd
                 * has been cached.
                 */
                return (error);
        }
retry:
        start = ostart = pcp->pc_mapstart;
        end = pcp->pc_mapend;
        php = NULL;

        if (start == 0) {
                /*
                 * Started from the very beginning; no need to wrap around.
                 */
                no_wrap = B_TRUE;
        } else {
                no_wrap = B_FALSE;
        }
        done = B_FALSE;
        fdcnt = 0;
        while ((fdcnt < nfds) && !done) {
                php = NULL;
                revent = 0;
                /*
                 * Examine the bit map in a circular fashion
                 * to avoid starvation. Always resume from
                 * last stop. Scan till end of the map. Then
                 * wrap around.
                 */
                fd = bt_getlowbit(pcp->pc_bitmap, start, end);
                ASSERT(fd <= end);
                if (fd >= 0) {
                        if (fd == end) {
                                if (no_wrap) {
                                        done = B_TRUE;
                                } else {
                                        /* Wrap: rescan [0, ostart). */
                                        start = 0;
                                        end = ostart - 1;
                                        no_wrap = B_TRUE;
                                }
                        } else {
                                start = fd + 1;
                        }
                        pdp = pcache_lookup_fd(pcp, fd);
repoll:
                        ASSERT(pdp != NULL);
                        ASSERT(pdp->pd_fd == fd);
                        if (pdp->pd_fp == NULL) {
                                /*
                                 * The fd is POLLREMOVed. This fd is
                                 * logically no longer cached. So move
                                 * on to the next one.
                                 */
                                continue;
                        }
                        if ((fp = getf(fd)) == NULL) {
                                /*
                                 * The fd has been closed, but user has not
                                 * done a POLLREMOVE on this fd yet. Instead
                                 * of cleaning it here implicitly, we return
                                 * POLLNVAL. This is consistent with poll(2)
                                 * polling a closed fd. Hope this will remind
                                 * user to do a POLLREMOVE.
                                 */
                                pfdp[fdcnt].fd = fd;
                                pfdp[fdcnt].revents = POLLNVAL;
                                fdcnt++;
                                continue;
                        }
                        if (fp != pdp->pd_fp) {
                                /*
                                 * user is polling on a cached fd which was
                                 * closed and then reused. Unfortunately
                                 * there is no good way to inform user.
                                 * If the file struct is also reused, we
                                 * may not be able to detect the fd reuse
                                 * at all.  As long as this does not
                                 * cause system failure and/or memory leak,
                                 * we will play along. Man page states if
                                 * user does not clean up closed fds, polling
                                 * results will be indeterministic.
                                 *
                                 * XXX - perhaps log the detection of fd
                                 *       reuse?
                                 */
                                pdp->pd_fp = fp;
                        }
                        /*
                         * XXX - pollrelock() logic needs to know which
                         * pollcache lock to grab. It'd be a
                         * cleaner solution if we could pass pcp as
                         * an argument in VOP_POLL interface instead
                         * of implicitly passing it using thread_t
                         * struct. On the other hand, changing VOP_POLL
                         * interface will require all driver/file system
                         * poll routine to change. May want to revisit
                         * the tradeoff later.
                         */
                        curthread->t_pollcache = pcp;
                        error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
                            &revent, &php, NULL);
                        curthread->t_pollcache = NULL;
                        releasef(fd);
                        if (error != 0) {
                                break;
                        }
                        /*
                         * layered devices (e.g. console driver)
                         * may change the vnode and thus the pollhead
                         * pointer out from underneath us.
                         */
                        if (php != NULL && pdp->pd_php != NULL &&
                            php != pdp->pd_php) {
                                pollhead_delete(pdp->pd_php, pdp);
                                pdp->pd_php = php;
                                pollhead_insert(php, pdp);
                                /*
                                 * The bit should still be set.
                                 */
                                ASSERT(BT_TEST(pcp->pc_bitmap, fd));
                                goto retry;
                        }

                        if (revent != 0) {
                                /* An event fired; report it to the caller. */
                                pfdp[fdcnt].fd = fd;
                                pfdp[fdcnt].events = pdp->pd_events;
                                pfdp[fdcnt].revents = revent;
                                fdcnt++;
                        } else if (php != NULL) {
                                /*
                                 * We clear a bit or cache a poll fd if
                                 * the driver returns a poll head ptr,
                                 * which is expected in the case of 0
                                 * revents. Some buggy driver may return
                                 * NULL php pointer with 0 revents. In
                                 * this case, we just treat the driver as
                                 * "noncachable" and not clearing the bit
                                 * in bitmap.
                                 */
                                if ((pdp->pd_php != NULL) &&
                                    ((pcp->pc_flag & T_POLLWAKE) == 0)) {
                                        BT_CLEAR(pcp->pc_bitmap, fd);
                                }
                                if (pdp->pd_php == NULL) {
                                        pollhead_insert(php, pdp);
                                        pdp->pd_php = php;
                                        /*
                                         * An event of interest may have
                                         * arrived between the VOP_POLL() and
                                         * the pollhead_insert(); check again.
                                         */
                                        goto repoll;
                                }
                        }
                } else {
                        /*
                         * No bit set in the range. Check for wrap around.
                         */
                        if (!no_wrap) {
                                start = 0;
                                end = ostart - 1;
                                no_wrap = B_TRUE;
                        } else {
                                done = B_TRUE;
                        }
                }
        }

        if (!done) {
                /* Remember where we stopped; the next scan resumes here. */
                pcp->pc_mapstart = start;
        }
        ASSERT(*fdcntp == 0);
        *fdcntp = fdcnt;
        return (error);
}
 430 
/*
 * Open entry point: claim a free minor number (growing devpolltbl by
 * DEVPOLLSIZE when full, up to MAXMIN) and bind a fresh dp_entry_t /
 * pollcache_t pair to it.  The pollcache bitmap is allocated lazily at
 * dpwrite() time.  Returns 0, or ENXIO when all MAXMIN minors are in
 * use.
 */
/*ARGSUSED*/
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
        minor_t         minordev;
        dp_entry_t      *dpep;
        pollcache_t     *pcp;

        ASSERT(devpoll_init);
        ASSERT(dptblsize <= MAXMIN);
        mutex_enter(&devpoll_lock);
        /*
         * Claim the first free slot with the RESERVED placeholder so it
         * stays ours while devpoll_lock is dropped below.
         */
        for (minordev = 0; minordev < dptblsize; minordev++) {
                if (devpolltbl[minordev] == NULL) {
                        devpolltbl[minordev] = (dp_entry_t *)RESERVED;
                        break;
                }
        }
        if (minordev == dptblsize) {
                dp_entry_t      **newtbl;
                size_t          oldsize;

                /*
                 * Used up every entry in the existing devpoll table.
                 * Grow the table by DEVPOLLSIZE.
                 */
                if ((oldsize = dptblsize) >= MAXMIN) {
                        mutex_exit(&devpoll_lock);
                        return (ENXIO);
                }
                dptblsize += DEVPOLLSIZE;
                if (dptblsize > MAXMIN) {
                        dptblsize = MAXMIN;
                }
                newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
                bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
                kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
                devpolltbl = newtbl;
                /* minordev == oldsize: first slot of the new region. */
                devpolltbl[minordev] = (dp_entry_t *)RESERVED;
        }
        mutex_exit(&devpoll_lock);

        dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
        /*
         * allocate a pollcache skeleton here. Delay allocating bitmap
         * structures until dpwrite() time, since we don't know the
         * optimal size yet.
         */
        pcp = pcache_alloc();
        dpep->dpe_pcache = pcp;
        /* Record the owner; dpwrite/dpioctl refuse other processes. */
        pcp->pc_pid = curproc->p_pid;
        *devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
        mutex_enter(&devpoll_lock);
        ASSERT(minordev < dptblsize);
        ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
        devpolltbl[minordev] = dpep;
        mutex_exit(&devpoll_lock);
        return (0);
}
 489 
/*
 * Write to /dev/poll: add/remove fd's to/from a cached poll fd set,
 * or change poll events for a watched fd.
 *
 * The user buffer is an array of pollfd_t.  An entry whose events field
 * is POLLREMOVE drops that fd from the cached set; any other entry adds
 * the fd or ORs additional events into an already-cached entry.
 */
/*ARGSUSED*/
static int
dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
{
        minor_t         minor;
        dp_entry_t      *dpep;
        pollcache_t     *pcp;
        pollfd_t        *pollfdp, *pfdp;
        int             error;
        ssize_t         uiosize;
        nfds_t          pollfdnum;
        struct pollhead *php = NULL;
        polldat_t       *pdp;
        int             fd;
        file_t          *fp;

        minor = getminor(dev);

        mutex_enter(&devpoll_lock);
        ASSERT(minor < dptblsize);
        dpep = devpolltbl[minor];
        ASSERT(dpep != NULL);
        mutex_exit(&devpoll_lock);
        pcp = dpep->dpe_pcache;
        /* Only the process that opened this /dev/poll fd may write it. */
        if (curproc->p_pid != pcp->pc_pid) {
                return (EACCES);
        }
        uiosize = uiop->uio_resid;
        /*
         * NOTE(review): uiosize is not validated as a non-zero multiple of
         * sizeof (pollfd_t); a short or zero-length write is silently
         * truncated -- confirm this is the intended contract.
         */
        pollfdnum = uiosize / sizeof (pollfd_t);
        mutex_enter(&curproc->p_lock);
        /* Refuse requests larger than the process's open-file rlimit. */
        if (pollfdnum > (uint_t)rctl_enforced_value(
            rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                /*
                 * NOTE(review): driver entry points conventionally return a
                 * plain errno value; set_errno() here (and for EINTR below)
                 * looks suspect -- verify against write(9E) conventions.
                 */
                return (set_errno(EINVAL));
        }
        mutex_exit(&curproc->p_lock);
        /*
         * Copy in the pollfd array.  Walk through the array and add
         * each polled fd to the cached set.
         */
        pollfdp = kmem_alloc(uiosize, KM_SLEEP);

        /*
         * Although /dev/poll uses the write(2) interface to cache fds, it's
         * not supposed to function as a seekable device. To prevent offset
         * from growing and eventually exceed the maximum, reset the offset
         * here for every call.
         */
        uiop->uio_loffset = 0;
        if ((error = uiomove((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop))
            != 0) {
                kmem_free(pollfdp, uiosize);
                return (error);
        }
        /*
         * We are about to enter the core portion of dpwrite(). Make sure this
         * write has exclusive access in this portion of the code, i.e., no
         * other writers in this code and no other readers in dpioctl.
         */
        mutex_enter(&dpep->dpe_lock);
        dpep->dpe_writerwait++;
        while (dpep->dpe_refcnt != 0) {
                if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
                        /* Interrupted by a signal while waiting our turn. */
                        dpep->dpe_writerwait--;
                        mutex_exit(&dpep->dpe_lock);
                        kmem_free(pollfdp, uiosize);
                        return (set_errno(EINTR));
                }
        }
        dpep->dpe_writerwait--;
        dpep->dpe_flag |= DP_WRITER_PRESENT;
        dpep->dpe_refcnt++;
        mutex_exit(&dpep->dpe_lock);

        mutex_enter(&pcp->pc_lock);
        if (pcp->pc_bitmap == NULL) {
                /* First write on this cache: size it to this request. */
                pcache_create(pcp, pollfdnum);
        }
        for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) {
                fd = pfdp->fd;
                /* Skip fds beyond the process's file table bounds. */
                if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles)
                        continue;
                pdp = pcache_lookup_fd(pcp, fd);
                if (pfdp->events != POLLREMOVE) {
                        if (pdp == NULL) {
                                /* Newly watched fd: allocate and cache it. */
                                pdp = pcache_alloc_fd(0);
                                pdp->pd_fd = fd;
                                pdp->pd_pcache = pcp;
                                pcache_insert_fd(pcp, pdp, pollfdnum);
                        }
                        ASSERT(pdp->pd_fd == fd);
                        ASSERT(pdp->pd_pcache == pcp);
                        if (fd >= pcp->pc_mapsize) {
                                /* pc_lock must be dropped to grow the map. */
                                mutex_exit(&pcp->pc_lock);
                                pcache_grow_map(pcp, fd);
                                mutex_enter(&pcp->pc_lock);
                        }
                        if (fd > pcp->pc_mapend) {
                                pcp->pc_mapend = fd;
                        }
                        if ((fp = getf(fd)) == NULL) {
                                /*
                                 * The fd is not valid. Since we can't pass
                                 * this error back in the write() call, set
                                 * the bit in bitmap to force DP_POLL ioctl
                                 * to examine it.
                                 */
                                BT_SET(pcp->pc_bitmap, fd);
                                pdp->pd_events |= pfdp->events;
                                continue;
                        }
                        /*
                         * Don't do VOP_POLL for an already cached fd with
                         * same poll events.
                         */
                        if ((pdp->pd_events == pfdp->events) &&
                            (pdp->pd_fp != NULL)) {
                                /*
                                 * the events are already cached
                                 */
                                releasef(fd);
                                continue;
                        }

                        /*
                         * do VOP_POLL and cache this poll fd.
                         */
                        /*
                         * XXX - pollrelock() logic needs to know which
                         * pollcache lock to grab. It'd be a
                         * cleaner solution if we could pass pcp as
                         * an argument in VOP_POLL interface instead
                         * of implicitly passing it using thread_t
                         * struct. On the other hand, changing VOP_POLL
                         * interface will require all driver/file system
                         * poll routine to change. May want to revisit
                         * the tradeoff later.
                         */
                        curthread->t_pollcache = pcp;
                        error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
                            &pfdp->revents, &php, NULL);
                        curthread->t_pollcache = NULL;
                        /*
                         * We always set the bit when this fd is cached;
                         * this forces the first DP_POLL to poll this fd.
                         * Real performance gain comes from subsequent
                         * DP_POLL.  We also attempt a pollhead_insert();
                         * if it's not possible, we'll do it in dpioctl().
                         */
                        BT_SET(pcp->pc_bitmap, fd);
                        if (error != 0) {
                                releasef(fd);
                                break;
                        }
                        pdp->pd_fp = fp;
                        pdp->pd_events |= pfdp->events;
                        if (php != NULL) {
                                if (pdp->pd_php == NULL) {
                                        pollhead_insert(php, pdp);
                                        pdp->pd_php = php;
                                } else {
                                        if (pdp->pd_php != php) {
                                                /*
                                                 * Layered driver changed the
                                                 * pollhead; move to the new
                                                 * one.
                                                 */
                                                pollhead_delete(pdp->pd_php,
                                                    pdp);
                                                pollhead_insert(php, pdp);
                                                pdp->pd_php = php;
                                        }
                                }

                        }
                        releasef(fd);
                } else {
                        /* POLLREMOVE: drop this fd from the cached set. */
                        if (pdp == NULL) {
                                continue;
                        }
                        ASSERT(pdp->pd_fd == fd);
                        pdp->pd_fp = NULL;
                        pdp->pd_events = 0;
                        ASSERT(pdp->pd_thread == NULL);
                        if (pdp->pd_php != NULL) {
                                pollhead_delete(pdp->pd_php, pdp);
                                pdp->pd_php = NULL;
                        }
                        BT_CLEAR(pcp->pc_bitmap, fd);
                }
        }
        mutex_exit(&pcp->pc_lock);
        /* Release our reference and wake any waiting readers/writers. */
        mutex_enter(&dpep->dpe_lock);
        dpep->dpe_flag &= ~DP_WRITER_PRESENT;
        ASSERT(dpep->dpe_refcnt == 1);
        dpep->dpe_refcnt--;
        cv_broadcast(&dpep->dpe_cv);
        mutex_exit(&dpep->dpe_lock);
        kmem_free(pollfdp, uiosize);
        return (error);
}
 692 
/*
 * ioctl handler for a /dev/poll minor device.  Two commands are
 * supported:
 *
 *   DP_POLL	 - wait for events on the set of fds cached in this
 *		   pollcache, subject to the timeout in the dvpoll struct.
 *   DP_ISPOLLED - report whether a given fd is present in the cached set.
 *
 * Only the process that created the pollcache may use it; any other
 * process gets EACCES.
 */
/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	hrtime_t	now;
	int		error = 0;
	STRUCT_DECL(dvpoll, dvpoll);

	if (cmd == DP_POLL) {
		/* do this now, before we sleep on DP_WRITER_PRESENT */
		now = gethrtime();
	}

	/* map the minor number to its dp_entry_t */
	minor = getminor(dev);
	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	mutex_exit(&devpoll_lock);
	ASSERT(dpep != NULL);
	pcp = dpep->dpe_pcache;
	/* only the pollcache's creating process may poll through it */
	if (curproc->p_pid != pcp->pc_pid)
		return (EACCES);

	/*
	 * Wait until no writer (dpwrite) is active or queued, then take
	 * a reference so the entry stays live while we operate on it.
	 * The reference is dropped via DP_REFRELE on every exit path.
	 */
	mutex_enter(&dpep->dpe_lock);
	while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
	    (dpep->dpe_writerwait != 0)) {
		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			/* interrupted by a signal while waiting */
			mutex_exit(&dpep->dpe_lock);
			return (EINTR);
		}
	}
	dpep->dpe_refcnt++;
	mutex_exit(&dpep->dpe_lock);

	switch (cmd) {
	case	DP_POLL:
	{
		pollstate_t	*ps;
		nfds_t		nfds;
		int		fdcnt = 0;
		hrtime_t	deadline = 0;

		STRUCT_INIT(dvpoll, mode);
		error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
		    STRUCT_SIZE(dvpoll));
		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}

		deadline = STRUCT_FGET(dvpoll, dp_timeout);
		if (deadline > 0) {
			/*
			 * Convert the deadline from relative milliseconds
			 * to absolute nanoseconds.  They must wait for at
			 * least a tick.
			 */
			deadline = deadline * NANOSEC / MILLISEC;
			deadline = MAX(deadline, nsec_per_tick);
			deadline += now;
		}

		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
			/*
			 * We are just using DP_POLL to sleep, so
			 * we don't need any of the devpoll apparatus.
			 * Do not check for signals if we have a zero timeout.
			 */
			DP_REFRELE(dpep);
			if (deadline == 0)
				return (0);
			mutex_enter(&curthread->t_delay_lock);
			while ((error =
			    cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
			    &curthread->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&curthread->t_delay_lock);
			/* 0 means a signal arrived; < 0 means timed out */
			return (error == 0 ? EINTR : 0);
		}

		/*
		 * XXX It would be nice not to have to alloc each time, but it
		 * requires another per thread structure hook. This can be
		 * implemented later if data suggests that it's necessary.
		 */
		if ((ps = curthread->t_pollstate) == NULL) {
			curthread->t_pollstate = pollstate_create();
			ps = curthread->t_pollstate;
		}
		if (ps->ps_dpbufsize < nfds) {
			struct proc *p = ttoproc(curthread);
			/*
			 * The maximum size should be no larger than
			 * current maximum open file count.
			 */
			mutex_enter(&p->p_lock);
			if (nfds > p->p_fno_ctl) {
				mutex_exit(&p->p_lock);
				DP_REFRELE(dpep);
				return (EINVAL);
			}
			mutex_exit(&p->p_lock);
			/* grow the per-thread result buffer to nfds entries */
			kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) *
			    ps->ps_dpbufsize);
			ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) *
			    nfds, KM_SLEEP);
			ps->ps_dpbufsize = nfds;
		}

		mutex_enter(&pcp->pc_lock);
		for (;;) {
			/* clear the wakeup flag before scanning the cache */
			pcp->pc_flag = 0;
			error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt);
			if (fdcnt > 0 || error != 0)
				break;

			/*
			 * A pollwake has happened since we polled cache.
			 */
			if (pcp->pc_flag & T_POLLWAKE)
				continue;

			/*
			 * Sleep until we are notified, signaled, or timed out.
			 */
			if (deadline == 0) {
				/* immediate timeout; do not check signals */
				break;
			}
			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
			    &pcp->pc_lock, deadline);
			/*
			 * If we were awakened by a signal or timeout
			 * then break the loop, else poll again.
			 * (0 = signal -> EINTR; < 0 = timeout -> success
			 * with zero fds; > 0 = notified -> rescan.)
			 */
			if (error <= 0) {
				error = (error == 0) ? EINTR : 0;
				break;
			} else {
				error = 0;
			}
		}
		mutex_exit(&pcp->pc_lock);

		/* hand the ready pollfds back to the caller */
		if (error == 0 && fdcnt > 0) {
			if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll,
			    dp_fds), sizeof (pollfd_t) * fdcnt)) {
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = fdcnt;
		}
		break;
	}

	case	DP_ISPOLLED:
	{
		pollfd_t	pollfd;
		polldat_t	*pdp;

		STRUCT_INIT(dvpoll, mode);
		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}
		mutex_enter(&pcp->pc_lock);
		if (pcp->pc_hash == NULL) {
			/*
			 * No Need to search because no poll fd
			 * has been cached.
			 */
			mutex_exit(&pcp->pc_lock);
			DP_REFRELE(dpep);
			return (0);
		}
		/* negative fds are never cached; report "not polled" */
		if (pollfd.fd < 0) {
			mutex_exit(&pcp->pc_lock);
			break;
		}
		pdp = pcache_lookup_fd(pcp, pollfd.fd);
		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
		    (pdp->pd_fp != NULL)) {
			/* report back the events this fd is cached for */
			pollfd.revents = pdp->pd_events;
			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
				mutex_exit(&pcp->pc_lock);
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = 1;
		}
		mutex_exit(&pcp->pc_lock);
		break;
	}

	default:
		DP_REFRELE(dpep);
		return (EINVAL);
	}
	DP_REFRELE(dpep);
	return (error);
}
 898 
 899 /*ARGSUSED*/
 900 static int
 901 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
 902     struct pollhead **phpp)
 903 {
 904         /*
 905          * Polling on a /dev/poll fd is not fully supported yet.
 906          */
 907         *reventsp = POLLERR;
 908         return (0);
 909 }
 910 
 911 /*
 912  * devpoll close should do enough clean up before the pollcache is deleted,
 913  * i.e., it should ensure no one still references the pollcache later.
 914  * There is no "permission" check in here. Any process having the last
 915  * reference of this /dev/poll fd can close.
 916  */
 917 /*ARGSUSED*/
 918 static int
 919 dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
 920 {
 921         minor_t         minor;
 922         dp_entry_t      *dpep;
 923         pollcache_t     *pcp;
 924         int             i;
 925         polldat_t       **hashtbl;
 926         polldat_t       *pdp;
 927 
 928         minor = getminor(dev);
 929 
 930         mutex_enter(&devpoll_lock);
 931         dpep = devpolltbl[minor];
 932         ASSERT(dpep != NULL);
 933         devpolltbl[minor] = NULL;
 934         mutex_exit(&devpoll_lock);
 935         pcp = dpep->dpe_pcache;
 936         ASSERT(pcp != NULL);
 937         /*
 938          * At this point, no other lwp can access this pollcache via the
 939          * /dev/poll fd. This pollcache is going away, so do the clean
 940          * up without the pc_lock.
 941          */
 942         hashtbl = pcp->pc_hash;
 943         for (i = 0; i < pcp->pc_hashsize; i++) {
 944                 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
 945                         if (pdp->pd_php != NULL) {
 946                                 pollhead_delete(pdp->pd_php, pdp);
 947                                 pdp->pd_php = NULL;
 948                                 pdp->pd_fp = NULL;
 949                         }
 950                 }
 951         }
 952         /*
 953          * pollwakeup() may still interact with this pollcache. Wait until
 954          * it is done.
 955          */
 956         mutex_enter(&pcp->pc_no_exit);
 957         ASSERT(pcp->pc_busy >= 0);
 958         while (pcp->pc_busy > 0)
 959                 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
 960         mutex_exit(&pcp->pc_no_exit);
 961         pcache_destroy(pcp);
 962         ASSERT(dpep->dpe_refcnt == 0);
 963         kmem_free(dpep, sizeof (dp_entry_t));
 964         return (0);
 965 }