1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /* ONC_PLUS EXTRACT START */
  22 
  23 /*
  24  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  25  * Use is subject to license terms.
  26  */
  27 
  28 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  29 /*      All Rights Reserved */
  30 
  31 #pragma ident   "%Z%%M% %I%     %E% SMI"
  32 
  33 #include <sys/flock_impl.h>
  34 #include <sys/vfs.h>
  35 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  36 #include <sys/callb.h>
  37 #include <sys/clconf.h>
  38 #include <sys/cladm.h>
  39 #include <sys/nbmlock.h>
  40 #include <sys/cred.h>
  41 #include <sys/policy.h>
  42 
  43 /*
  44  * The following variables are used for statistics purposes and are
  45  * not protected by locks. They may not be accurate but will at least be
  46  * close to the actual values.
  47  */
  48 
  49 int     flk_lock_allocs;
  50 int     flk_lock_frees;
  51 int     edge_allocs;
  52 int     edge_frees;
  53 int     flk_proc_vertex_allocs;
  54 int     flk_proc_edge_allocs;
  55 int     flk_proc_vertex_frees;
  56 int     flk_proc_edge_frees;
  57 
  58 static kmutex_t flock_lock;
  59 
  60 #ifdef DEBUG
  61 int check_debug = 0;
  62 #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  63                                         check_active_locks(gp);
  64 #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  65                                                 check_sleeping_locks(gp);
  66 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  67                 if (check_debug)        \
  68                         check_owner_locks(gp, pid, sysid, vp);
  69 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
  70         { \
  71                 if (check_lock_transition(old_state, new_state)) { \
  72                         cmn_err(CE_PANIC, "Illegal lock transition " \
  73                             "from %d to %d", old_state, new_state); \
  74                 } \
  75         }
  76 #else
  77 
  78 #define CHECK_ACTIVE_LOCKS(gp)
  79 #define CHECK_SLEEPING_LOCKS(gp)
  80 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  81 #define CHECK_LOCK_TRANSITION(old_state, new_state)
  82 
  83 #endif /* DEBUG */
  84 
  85 struct kmem_cache       *flk_edge_cache;
  86 
  87 graph_t         *lock_graph[HASH_SIZE];
  88 proc_graph_t    pgraph;
  89 
  90 /*
  91  * Clustering.
  92  *
  93  * NLM REGISTRY TYPE IMPLEMENTATION
  94  *
  95  * Assumptions:
  96  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
  97  *      integers; maximum node id is returned by clconf_maximum_nodeid().
  98  *  2.  We use this node id to identify the node an NLM server runs on.
  99  */
 100 
 101 /*
 102  * The NLM registry object keeps track of the NLM servers (via their
 103  * nlmids, which are the node ids of the cluster nodes they run on)
 104  * that have requested locks at the LLM with which this registry is
 105  * associated.
 106  *
 107  * Representation of abstraction:
 108  *    rep = record[     states: array[nlm_state],
 109  *                      lock: mutex]
 110  *
 111  *    Representation invariants:
 112  *      1. index i of rep.states is between 0 and n - 1 where n is the
 113  *         number of elements in the array, which happens to be the
 114  *         maximum node id in the cluster configuration + 1.
 115  *      2. map nlmid to index i of rep.states
 116  *              0   -> 0
 117  *              1   -> 1
 118  *              2   -> 2
 119  *              n-1 -> clconf_maximum_nodeid()
 120  *      3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 121  *          from forgetting to subtract 1 from the index.
 122  *      4.  The reason we keep the 0th index is the following.  A legitimate
 123  *          cluster configuration includes making a UFS file system NFS
 124  *          exportable.  The code is structured so that if you're in a cluster
 125  *          you do one thing; otherwise, you do something else.  The problem
 126  *          is what to do if you think you're in a cluster with PXFS loaded,
 127  *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 128  *          encode the node id of the node where NLM server runs; these bytes
 129  *          are zero for UFS.  Since the nodeid is used to index into the
 130  *          registry, we can record the NLM server state information at index
 131  *          0 using the same mechanism used for PXFS file locks!
 132  */
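     /*
      * For illustration, a registry lookup for a lock request would go
      * roughly as follows (a sketch assuming the identity mapping above,
      * with GETNLMID() extracting the node id from the upper bytes of
      * the sysid):
      *
      *         nlmid = GETNLMID(request->l_flock.l_sysid);
      *         state = nlm_reg_status[nlmid];    (read under nlm_reg_lock)
      *
      * A UFS sysid has zero upper bytes, so it maps to index 0, which is
      * why the 0th entry is kept.
      */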
 133 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
 134 static kmutex_t nlm_reg_lock;                   /* lock to protect array */
 135 static uint_t nlm_status_size;                  /* size of state array */
 136 
 137 /*
 138  * Although we need a global lock dependency graph (and associated data
 139  * structures), we also need a per-zone notion of whether the lock manager is
 140  * running, and so whether to allow lock manager requests or not.
 141  *
 142  * Thus, on a per-zone basis we maintain a ``global'' variable
 143  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 144  * manager is determined to be changing state (starting or stopping).
 145  *
 146  * Each graph/zone pair also has a copy of this variable, which is protected by
 147  * the graph's mutex.
 148  *
 149  * The per-graph copies are used to synchronize lock requests with shutdown
 150  * requests.  The global copy is used to initialize the per-graph field when a
 151  * new graph is created.
 152  */
 153 struct flock_globals {
 154         flk_lockmgr_status_t flk_lockmgr_status;
 155         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 156 };
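     /*
      * Sketch of how the per-graph copy is consulted (condensed from the
      * checks in flk_wait_execute_request() below; the names are real, the
      * control flow is abbreviated):
      *
      *         fg = flk_get_globals();
      *         if (fg->lockmgr_status[HASH_INDEX(vp)] != FLK_LOCKMGR_UP)
      *                 bail out with ENOLCK;
      *
      * The per-graph entry is read while holding that graph's gp_mutex.
      */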
 157 
 158 zone_key_t flock_zone_key;
 159 
 160 static void create_flock(lock_descriptor_t *, flock64_t *);
 161 static lock_descriptor_t        *flk_get_lock(void);
 162 static void     flk_free_lock(lock_descriptor_t *lock);
 163 static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 164 static int flk_process_request(lock_descriptor_t *);
 165 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 166 static edge_t *flk_get_edge(void);
 167 static int flk_wait_execute_request(lock_descriptor_t *);
 168 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 169 static void flk_insert_active_lock(lock_descriptor_t *);
 170 static void flk_delete_active_lock(lock_descriptor_t *, int);
 171 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 172 static void flk_graph_uncolor(graph_t *);
 173 static void flk_wakeup(lock_descriptor_t *, int);
 174 static void flk_free_edge(edge_t *);
 175 static void flk_recompute_dependencies(lock_descriptor_t *,
 176                         lock_descriptor_t **,  int, int);
 177 static int flk_find_barriers(lock_descriptor_t *);
 178 static void flk_update_barriers(lock_descriptor_t *);
 179 static int flk_color_reachables(lock_descriptor_t *);
 180 static int flk_canceled(lock_descriptor_t *);
 181 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 182 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 183 static void wait_for_lock(lock_descriptor_t *);
 184 static void unlock_lockmgr_granted(struct flock_globals *);
 185 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 186 
 187 /* Clustering hooks */
 188 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 189 static void cl_flk_wakeup_sleeping_nlm_locks(int);
 190 static void cl_flk_unlock_nlm_granted(int);
 191 
 192 #ifdef DEBUG
 193 static int check_lock_transition(int, int);
 194 static void check_sleeping_locks(graph_t *);
 195 static void check_active_locks(graph_t *);
 196 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 197 static void path(lock_descriptor_t *, lock_descriptor_t *);
 198 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 199 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 200 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 201 #endif
 202 
 203 /*      proc_graph function definitions */
 204 static int flk_check_deadlock(lock_descriptor_t *);
 205 static void flk_proc_graph_uncolor(void);
 206 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 207 static proc_edge_t *flk_get_proc_edge(void);
 208 static void flk_proc_release(proc_vertex_t *);
 209 static void flk_free_proc_edge(proc_edge_t *);
 210 static void flk_update_proc_graph(edge_t *, int);
 211 
 212 /* Non-blocking mandatory locking */
 213 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 214                         u_offset_t);
 215 
 216 static struct flock_globals *
 217 flk_get_globals(void)
 218 {
 219         /*
 220          * The KLM module had better be loaded if we're attempting to handle
 221          * lockmgr requests.
 222          */
 223         ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 224         return (zone_getspecific(flock_zone_key, curproc->p_zone));
 225 }
 226 
 227 static flk_lockmgr_status_t
 228 flk_get_lockmgr_status(void)
 229 {
 230         struct flock_globals *fg;
 231 
 232         ASSERT(MUTEX_HELD(&flock_lock));
 233 
 234         if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 235                 /*
 236                  * KLM module not loaded; lock manager definitely not running.
 237                  */
 238                 return (FLK_LOCKMGR_DOWN);
 239         }
 240         fg = flk_get_globals();
 241         return (fg->flk_lockmgr_status);
 242 }
 243 
 244 /*
 245  * Routine called from fs_frlock in fs/fs_subr.c
 246  */
 247 
 248 int
 249 reclock(vnode_t         *vp,
 250         flock64_t       *lckdat,
 251         int             cmd,
 252         int             flag,
 253         u_offset_t      offset,
 254         flk_callback_t  *flk_cbp)
 255 {
 256         lock_descriptor_t       stack_lock_request;
 257         lock_descriptor_t       *lock_request;
 258         int error = 0;
 259         graph_t *gp;
 260         int                     nlmid;
 261 
 262         /*
 263          * Check access permissions
 264          */
 265         if ((cmd & SETFLCK) &&
 266                 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 267                 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 268                         return (EBADF);
 269 
 270         /*
 271          * for query and unlock we use the stack_lock_request
 272          */
 273 
 274         if ((lckdat->l_type == F_UNLCK) ||
 275                         !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 276                 lock_request = &stack_lock_request;
 277                 (void) bzero((caddr_t)lock_request,
 278                                 sizeof (lock_descriptor_t));
 279 
 280                 /*
 281                  * The following is needed so that the assertions in
 282                  * flk_execute_request() pass.
 283                  */
 284 
 285                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 286                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 287                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 288                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 289                 lock_request->l_status = FLK_INITIAL_STATE;
 290         } else {
 291                 lock_request = flk_get_lock();
 292         }
 293         lock_request->l_state = 0;
 294         lock_request->l_vnode = vp;
 295         lock_request->l_zoneid = getzoneid();
 296 
 297         /*
 298          * Convert the request range into the canonical start and end
 299          * values.  The NLM protocol supports locking over the entire
 300          * 32-bit range, so there's no range checking for remote requests,
 301          * but we still need to verify that local requests obey the rules.
 302          */
 303         /* Clustering */
 304         if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 305                 ASSERT(lckdat->l_whence == 0);
 306                 lock_request->l_start = lckdat->l_start;
 307                 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 308                         lckdat->l_start + (lckdat->l_len - 1);
 309         } else {
 310                 /* check the validity of the lock range */
 311                 error = flk_convert_lock_data(vp, lckdat,
 312                         &lock_request->l_start, &lock_request->l_end,
 313                         offset);
 314                 if (error) {
 315                         goto done;
 316                 }
 317                 error = flk_check_lock_data(lock_request->l_start,
 318                                             lock_request->l_end, MAXEND);
 319                 if (error) {
 320                         goto done;
 321                 }
 322         }
 323 
 324         ASSERT(lock_request->l_end >= lock_request->l_start);
 325 
 326         lock_request->l_type = lckdat->l_type;
 327         if (cmd & INOFLCK)
 328                 lock_request->l_state |= IO_LOCK;
 329         if (cmd & SLPFLCK)
 330                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 331         if (cmd & RCMDLCK)
 332                 lock_request->l_state |= LOCKMGR_LOCK;
 333         if (cmd & NBMLCK)
 334                 lock_request->l_state |= NBMAND_LOCK;
 335         /*
 336          * Clustering: set flag for PXFS locks
 337          * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 338          * also be of type 'RCMDLCK'.
 339          * We do not _only_ check the GETPXFSID() macro because local PXFS
 340          * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 341          */
 342 
 343         if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 344                 lock_request->l_state |= PXFS_LOCK;
 345         }
 346         if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 347                 if (lock_request->l_type == F_RDLCK ||
 348                         lock_request->l_type == F_WRLCK)
 349                         lock_request->l_state |= QUERY_LOCK;
 350         }
 351         lock_request->l_flock = (*lckdat);
 352         lock_request->l_callbacks = flk_cbp;
 353 
 354         /*
 355          * We are ready for processing the request
 356          */
 357         if (IS_LOCKMGR(lock_request)) {
 358                 /*
 359                  * If the lock request is an NLM server request ....
 360                  */
 361                 if (nlm_status_size == 0) { /* not booted as cluster */
 362                         mutex_enter(&flock_lock);
 363                         /*
 364                          * Bail out if this is a lock manager request and the
 365                          * lock manager is not supposed to be running.
 366                          */
 367                         if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 368                                 mutex_exit(&flock_lock);
 369                                 error = ENOLCK;
 370                                 goto done;
 371                         }
 372                         mutex_exit(&flock_lock);
 373                 } else {                        /* booted as a cluster */
 374                         nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 375                         ASSERT(nlmid < nlm_status_size && nlmid >= 0);
 376 
 377                         mutex_enter(&nlm_reg_lock);
 378                         /*
 379                          * If the NLM registry does not know about this
 380                          * NLM server making the request, add its nlmid
 381                          * to the registry.
 382                          */
 383                         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 384                                 nlmid)) {
 385                                 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 386                         } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 387                                 nlmid)) {
 388                                 /*
 389                                  * If the NLM server is already known (has made
 390                                  * previous lock requests) and its state is
 391                                  * not NLM_UP (meaning the NLM server is
 392                                  * shutting down), then bail out with an
 393                                  * error to deny the lock request.
 394                                  */
 395                                 mutex_exit(&nlm_reg_lock);
 396                                 error = ENOLCK;
 397                                 goto done;
 398                         }
 399                         mutex_exit(&nlm_reg_lock);
 400                 }
 401         }
 402 
 403         /* Now get the lock graph for a particular vnode */
 404         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 405 
 406         /*
 407          * We drop the rwlock here; otherwise, a deadlock might result
 408          * if this IO_LOCK request sleeps (bugid # 1183392).
 409          */
 410 
 411         if (IS_IO_LOCK(lock_request)) {
 412                 VOP_RWUNLOCK(vp,
 413                         (lock_request->l_type == F_RDLCK) ?
 414                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 415         }
 416         mutex_enter(&gp->gp_mutex);
 417 
 418         lock_request->l_state |= REFERENCED_LOCK;
 419         lock_request->l_graph = gp;
 420 
 421         switch (lock_request->l_type) {
 422         case F_RDLCK:
 423         case F_WRLCK:
 424                 if (IS_QUERY_LOCK(lock_request)) {
 425                         flk_get_first_blocking_lock(lock_request);
 426                         (*lckdat) = lock_request->l_flock;
 427                         break;
 428                 }
 429 
 430                 /* process the request now */
 431 
 432                 error = flk_process_request(lock_request);
 433                 break;
 434 
 435         case F_UNLCK:
 436                 /* unlock request will not block so execute it immediately */
 437 
 438                 if (IS_LOCKMGR(lock_request) &&
 439                     flk_canceled(lock_request)) {
 440                         error = 0;
 441                 } else {
 442                         error = flk_execute_request(lock_request);
 443                 }
 444                 break;
 445 
 446         case F_UNLKSYS:
 447                 /*
 448                  * Recovery mechanism to release lock manager locks when
 449                  * an NFS client crashes and restarts. The NFS server will
 450                  * clear the old locks and grant new ones.
 451                  */
 452 
 453                 if (lock_request->l_flock.l_sysid == 0) {
 454                         mutex_exit(&gp->gp_mutex);
 455                         return (EINVAL);
 456                 }
 457                 if (secpolicy_nfs(CRED()) != 0) {
 458                         mutex_exit(&gp->gp_mutex);
 459                         return (EPERM);
 460                 }
 461                 flk_delete_locks_by_sysid(lock_request);
 462                 lock_request->l_state &= ~REFERENCED_LOCK;
 463                 flk_set_state(lock_request, FLK_DEAD_STATE);
 464                 flk_free_lock(lock_request);
 465                 mutex_exit(&gp->gp_mutex);
 466                 return (0);
 467 
 468         default:
 469                 error = EINVAL;
 470                 break;
 471         }
 472 
 473         /* Clustering: For blocked PXFS locks, return */
 474         if (error == PXFS_LOCK_BLOCKED) {
 475                 lock_request->l_state &= ~REFERENCED_LOCK;
 476                 mutex_exit(&gp->gp_mutex);
 477                 return (error);
 478         }
 479 
 480         /*
 481          * Now that we have seen the status of locks in the system for
 482          * this vnode we acquire the rwlock if it is an IO_LOCK.
 483          */
 484 
 485         if (IS_IO_LOCK(lock_request)) {
 486                 (void) VOP_RWLOCK(vp,
 487                         (lock_request->l_type == F_RDLCK) ?
 488                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 489                 if (!error) {
 490                         lckdat->l_type = F_UNLCK;
 491 
 492                         /*
 493                          * This wakeup is needed; otherwise, if the
 494                          * IO_LOCK request has slept, its dependents
 495                          * would never be woken up (bugid # 1185482).
 496                          */
 497 
 498                         flk_wakeup(lock_request, 1);
 499                         flk_set_state(lock_request, FLK_DEAD_STATE);
 500                         flk_free_lock(lock_request);
 501                 }
 502                 /*
 503                  * Otherwise, if an error occurred, either
 504                  * flk_process_request() returned EDEADLK, in which case
 505                  * there are no dependents for this lock, or
 506                  * flk_wait_execute_request() returned EINTR, in which
 507                  * case flk_cancel_sleeping_lock() was done; same for EBADF.
 508                  */
 509         }
 510 
 511         if (lock_request == &stack_lock_request) {
 512                 flk_set_state(lock_request, FLK_DEAD_STATE);
 513         } else {
 514                 lock_request->l_state &= ~REFERENCED_LOCK;
 515                 if ((error != 0) || IS_DELETED(lock_request)) {
 516                         flk_set_state(lock_request, FLK_DEAD_STATE);
 517                         flk_free_lock(lock_request);
 518                 }
 519         }
 520 
 521         mutex_exit(&gp->gp_mutex);
 522         return (error);
 523 
 524 done:
 525         flk_set_state(lock_request, FLK_DEAD_STATE);
 526         if (lock_request != &stack_lock_request)
 527                 flk_free_lock(lock_request);
 528         return (error);
 529 }
 530 
 531 /*
 532  * Invoke the callbacks in the given list.  If before sleeping, invoke in
 533  * list order.  If after sleeping, invoke in reverse order.
 534  *
 535  * CPR (suspend/resume) support: if one of the callbacks returns a
 536  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 537  * while it is sleeping.  There should be at most one callb_cpr_t for the
 538  * thread.
 539  * XXX This is unnecessarily complicated.  The CPR information should just
 540  * get passed in directly through VOP_FRLOCK and reclock, rather than
 541  * sneaking it in via a callback.
 542  */
 543 
 544 callb_cpr_t *
 545 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 546 {
 547         callb_cpr_t *cpr_callbackp = NULL;
 548         callb_cpr_t *one_result;
 549         flk_callback_t *cb;
 550 
 551         if (cblist == NULL)
 552                 return (NULL);
 553 
 554         if (when == FLK_BEFORE_SLEEP) {
 555                 cb = cblist;
 556                 do {
 557                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 558                         if (one_result != NULL) {
 559                                 ASSERT(cpr_callbackp == NULL);
 560                                 cpr_callbackp = one_result;
 561                         }
 562                         cb = cb->cb_next;
 563                 } while (cb != cblist);
 564         } else {
 565                 cb = cblist->cb_prev;
 566                 do {
 567                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 568                         if (one_result != NULL) {
 569                                 cpr_callbackp = one_result;
 570                         }
 571                         cb = cb->cb_prev;
 572                 } while (cb != cblist->cb_prev);
 573         }
 574 
 575         return (cpr_callbackp);
 576 }
 577 
 578 /*
 579  * Initialize a flk_callback_t to hold the given callback.
 580  */
 581 
 582 void
 583 flk_init_callback(flk_callback_t *flk_cb,
 584         callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 585 {
 586         flk_cb->cb_next = flk_cb;
 587         flk_cb->cb_prev = flk_cb;
 588         flk_cb->cb_callback = cb_fcn;
 589         flk_cb->cb_data = cbdata;
 590 }
 591 
 592 /*
 593  * Initialize an flk_callback_t and then link it into the head of an
 594  * existing list (which may be NULL).
 595  */
 596 
 597 void
 598 flk_add_callback(flk_callback_t *newcb,
 599                 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 600                 void *cbdata, flk_callback_t *cblist)
 601 {
 602         flk_init_callback(newcb, cb_fcn, cbdata);
 603 
 604         if (cblist == NULL)
 605                 return;
 606 
 607         newcb->cb_prev = cblist->cb_prev;
 608         newcb->cb_next = cblist;
 609         cblist->cb_prev->cb_next = newcb;
 610         cblist->cb_prev = newcb;
 611 }
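     /*
      * Usage sketch (my_frlock_cb and my_state are placeholder names, not
      * anything defined in the kernel): a file system that wants to stay
      * CPR-safe while a lock request sleeps could do
      *
      *         flk_callback_t flk_cb;
      *         flk_add_callback(&flk_cb, my_frlock_cb, my_state, cb_list);
      *         error = reclock(vp, lckdat, cmd, flag, offset, &flk_cb);
      *
      * where my_frlock_cb() returns the caller's callb_cpr_t when invoked
      * with FLK_BEFORE_SLEEP and typically NULL for FLK_AFTER_SLEEP.
      */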
 612 /* ONC_PLUS EXTRACT END */
 613 
 614 /*
 615  * Initialize the flk_edge_cache data structure and create the
 616  * nlm_reg_status array.
 617  */
 618 
 619 void
 620 flk_init(void)
 621 {
 622         uint_t  i;
 623 
 624         flk_edge_cache = kmem_cache_create("flk_edges",
 625                 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 626         if (flk_edge_cache == NULL) {
 627                 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 628         }
 629         /*
 630          * Create the NLM registry object.
 631          */
 632 
 633         if (cluster_bootflags & CLUSTER_BOOTED) {
 634                 /*
 635                  * This routine tells you the maximum node id that will be used
 636                  * in the cluster.  This number plus 1 will be the size of
 637                  * the nlm registry status array.  We add 1 because we will
 638                  * be using all entries indexed from 0 to maxnodeid; e.g.,
 639                  * from 0 to 64, for a total of 65 entries.
 640                  */
 641                 nlm_status_size = clconf_maximum_nodeid() + 1;
 642         } else {
 643                 nlm_status_size = 0;
 644         }
 645 
 646         if (nlm_status_size != 0) {     /* booted as a cluster */
 647                 nlm_reg_status = (flk_nlm_status_t *)
 648                         kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 649                                 KM_SLEEP);
 650 
 651                 /* initialize all NLM states in array to NLM_UNKNOWN */
 652                 for (i = 0; i < nlm_status_size; i++) {
 653                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 654                 }
 655         }
 656 }
 657 
 658 /*
 659  * Zone constructor/destructor callbacks to be executed when a zone is
 660  * created/destroyed.
 661  */
 662 /* ARGSUSED */
 663 void *
 664 flk_zone_init(zoneid_t zoneid)
 665 {
 666         struct flock_globals *fg;
 667         uint_t i;
 668 
 669         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 670         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 671         for (i = 0; i < HASH_SIZE; i++)
 672                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 673         return (fg);
 674 }
 675 
 676 /* ARGSUSED */
 677 void
 678 flk_zone_fini(zoneid_t zoneid, void *data)
 679 {
 680         struct flock_globals *fg = data;
 681 
 682         kmem_free(fg, sizeof (*fg));
 683 }
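     /*
      * Note: flock_zone_key itself is registered elsewhere (presumably when
      * the KLM module is loaded, given the assertion in flk_get_globals());
      * a sketch of that registration with the standard zone-key interface
      * would be
      *
      *         zone_key_create(&flock_zone_key, flk_zone_init, NULL,
      *             flk_zone_fini);
      *
      * after which flk_zone_init()/flk_zone_fini() run per zone.
      */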
 684 
 685 /*
 686  * Get a lock_descriptor structure with initialization of edge lists.
 687  */
 688 
 689 static lock_descriptor_t *
 690 flk_get_lock(void)
 691 {
 692         lock_descriptor_t       *l;
 693 
 694         l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
 695 
 696         cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
 697         l->l_edge.edge_in_next = &l->l_edge;
 698         l->l_edge.edge_in_prev = &l->l_edge;
 699         l->l_edge.edge_adj_next = &l->l_edge;
 700         l->l_edge.edge_adj_prev = &l->l_edge;
 701         l->pvertex = -1;
 702         l->l_status = FLK_INITIAL_STATE;
 703         flk_lock_allocs++;
 704         return (l);
 705 }
 706 
 707 /*
 708  * Free a lock_descriptor structure. If some thread still holds a
 709  * reference to it (as in reclock()), just set the DELETED_LOCK flag.
 710  */
 711 
 712 void
 713 flk_free_lock(lock_descriptor_t *lock)
 714 {
 715         ASSERT(IS_DEAD(lock));
 716         if (IS_REFERENCED(lock)) {
 717                 lock->l_state |= DELETED_LOCK;
 718                 return;
 719         }
 720         flk_lock_frees++;
 721         kmem_free((void *)lock, sizeof (lock_descriptor_t));
 722 }
 723 
 724 void
 725 flk_set_state(lock_descriptor_t *lock, int new_state)
 726 {
 727         /*
 728          * Locks in the sleeping list may be woken up in a number of ways,
 729          * and more than once.  If a sleeping lock is signaled awake more
 730          * than once, then it may or may not change state depending on its
 731          * current state.
 732          * Also note that NLM locks that are sleeping could be moved to an
 733          * interrupted state more than once if the unlock request is
 734          * retransmitted by the NLM client - the second time around, this is
 735          * just a nop.
 736          * The ordering of being signaled awake is:
 737          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
 738          * The checks below implement this ordering.
 739          */
 740         if (IS_INTERRUPTED(lock)) {
 741                 if ((new_state == FLK_CANCELLED_STATE) ||
 742                     (new_state == FLK_GRANTED_STATE) ||
 743                     (new_state == FLK_INTERRUPTED_STATE)) {
 744                         return;
 745                 }
 746         }
 747         if (IS_CANCELLED(lock)) {
 748                 if ((new_state == FLK_GRANTED_STATE) ||
 749                     (new_state == FLK_CANCELLED_STATE)) {
 750                         return;
 751                 }
 752         }
 753         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
 754         if (IS_PXFS(lock)) {
 755                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
 756         }
 757         lock->l_status = new_state;
 758 }
 759 
 760 /*
 761  * Routine that checks whether there are any blocking locks in the system.
 762  *
 763  * The policy followed is that if a write lock is sleeping, we don't
 764  * allow read locks before this write lock even though there may not be
 765  * any active locks corresponding to the read locks' region.
 766  *
 767  * The flk_add_edge() function adds an edge between l1 and l2 iff there
 768  * is no path between l1 and l2. This is done to have a "minimum
 769  * storage representation" of the dependency graph.
 770  *
 771  * Another property of the graph is that, since only the new request
 772  * throws edges to the existing locks in the graph, the graph is always
 773  * topologically ordered.
 774  */
 775 
 776 static int
 777 flk_process_request(lock_descriptor_t *request)
 778 {
 779         graph_t *gp = request->l_graph;
 780         lock_descriptor_t *lock;
 781         int request_blocked_by_active = 0;
 782         int request_blocked_by_granted = 0;
 783         int request_blocked_by_sleeping = 0;
 784         vnode_t *vp = request->l_vnode;
 785         int     error = 0;
 786         int request_will_wait = 0;
 787         int found_covering_lock = 0;
 788         lock_descriptor_t *covered_by = NULL;
 789 
 790         ASSERT(MUTEX_HELD(&gp->gp_mutex));
 791         request_will_wait = IS_WILLING_TO_SLEEP(request);
 792 
 793         /*
 794          * check active locks
 795          */
 796 
 797         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 798 
 799 
 800         if (lock) {
 801                 do {
 802                         if (BLOCKS(lock, request)) {
 803                                 if (!request_will_wait)
 804                                         return (EAGAIN);
 805                                 request_blocked_by_active = 1;
 806                                 break;
 807                         }
 808                         /*
 809                          * Grant a read request if the same owner already
 810                          * holds an active lock that covers it.
 811                          */
 812 
 813                         if (SAME_OWNER(lock, request) &&
 814                                         COVERS(lock, request) &&
 815                                                 (request->l_type == F_RDLCK))
 816                                 return (flk_execute_request(request));
 817                         lock = lock->l_next;
 818                 } while (lock->l_vnode == vp);
 819         }
 820 
 821         if (!request_blocked_by_active) {
 822                 lock_descriptor_t *lk[1];
 823                 lock_descriptor_t *first_glock = NULL;
 824                 /*
 825                  * Shall we grant this?! NO!!
 826                  * What about those locks that were just granted and are
 827                  * still in the sleep queue?  Those threads have been woken
 828                  * up, so the locks are almost active.
 829                  */
 830                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 831                 if (lock) {
 832                         do {
 833                                 if (BLOCKS(lock, request)) {
 834                                         if (IS_GRANTED(lock)) {
 835                                                 request_blocked_by_granted = 1;
 836                                         } else {
 837                                                 request_blocked_by_sleeping = 1;
 838                                         }
 839                                 }
 840 
 841                                 lock = lock->l_next;
 842                         } while ((lock->l_vnode == vp));
 843                         first_glock = lock->l_prev;
 844                         ASSERT(first_glock->l_vnode == vp);
 845                 }
 846 
 847                 if (request_blocked_by_granted)
 848                         goto block;
 849 
 850                 if (!request_blocked_by_sleeping) {
 851                         /*
 852                          * If the request isn't going to be blocked by a
 853                          * sleeping request, we know that it isn't going to
 854                          * be blocked; we can just execute the request --
 855                          * without performing costly deadlock detection.
 856                          */
 857                         ASSERT(!request_blocked_by_active);
 858                         return (flk_execute_request(request));
 859                 } else if (request->l_type == F_RDLCK) {
 860                         /*
 861                          * If we have a sleeping writer in the requested
 862                          * lock's range, block.
 863                          */
 864                         goto block;
 865                 }
 866 
 867                 lk[0] = request;
 868                 request->l_state |= RECOMPUTE_LOCK;
 869                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 870                 if (lock) {
 871                         do {
 872                                 flk_recompute_dependencies(lock, lk, 1, 0);
 873                                 lock = lock->l_next;
 874                         } while (lock->l_vnode == vp);
 875                 }
 876                 lock = first_glock;
 877                 if (lock) {
 878                         do {
 879                                 if (IS_GRANTED(lock)) {
 880                                 flk_recompute_dependencies(lock, lk, 1, 0);
 881                                 }
 882                                 lock = lock->l_prev;
 883                         } while ((lock->l_vnode == vp));
 884                 }
 885                 request->l_state &= ~RECOMPUTE_LOCK;
 886                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
 887                         return (EDEADLK);
 888                 return (flk_execute_request(request));
 889         }
 890 
 891 block:
 892         if (request_will_wait)
 893                 flk_graph_uncolor(gp);
 894 
 895         /* check sleeping locks */
 896 
 897         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 898 
 899         /*
 900          * If we find a sleeping write lock that is a superset of the
 901          * region wanted by the request, we can be assured that by adding
 902          * an edge to this write lock we have paths to all locks in the
 903          * graph that block the request, except in one case; that is why
 904          * there is another check for SAME_OWNER in the loop below. The
 905          * exception is when the process that owns the sleeping write lock
 906          * 'l1' has other locks l2, l3, l4 in the system that arrived
 907          * before l1. l1 has no path to these locks as they are from the
 908          * same process. We break when we find a second covering sleeping
 909          * lock l5 owned by a process different from the one owning l1,
 910          * because none of l2, l3, l4, etc. can have arrived before l5; if
 911          * one had, l1 would already have produced a deadlock.
 912          */
 913 
 914         if (lock) {
 915                 do {
 916                         if (BLOCKS(lock, request)) {
 917                                 if (!request_will_wait)
 918                                         return (EAGAIN);
 919                                 if (COVERS(lock, request) &&
 920                                                 lock->l_type == F_WRLCK) {
 921                                         if (found_covering_lock &&
 922                                             !SAME_OWNER(lock, covered_by)) {
 923                                                 found_covering_lock++;
 924                                                 break;
 925                                         }
 926                                         found_covering_lock = 1;
 927                                         covered_by = lock;
 928                                 }
 929                                 if (found_covering_lock &&
 930                                         !SAME_OWNER(lock, covered_by)) {
 931                                         lock = lock->l_next;
 932                                         continue;
 933                                 }
 934                                 if ((error = flk_add_edge(request, lock,
 935                                                 !found_covering_lock, 0)))
 936                                         return (error);
 937                         }
 938                         lock = lock->l_next;
 939                 } while (lock->l_vnode == vp);
 940         }
 941 
 942 /*
 943  * found_covering_lock == 2 iff at this point 'request' has paths to all
 944  * locks that block 'request'. found_covering_lock == 1 iff at this point
 945  * 'request' has paths to all locks that block 'request' whose owners are
 946  * not the same as the one that covers 'request' (covered_by above), and
 947  * the active list may contain locks whose owner is the same as covered_by.
 948  */
 949 
 950         if (request_blocked_by_active && found_covering_lock != 2) {
 951                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 952                 ASSERT(lock != NULL);
 953                 do {
 954                         if (BLOCKS(lock, request)) {
 955                                 if (found_covering_lock &&
 956                                         !SAME_OWNER(lock, covered_by)) {
 957                                         lock = lock->l_next;
 958                                         continue;
 959                                 }
 960                                 if ((error = flk_add_edge(request, lock,
 961                                                         CHECK_CYCLE, 0)))
 962                                         return (error);
 963                         }
 964                         lock = lock->l_next;
 965                 } while (lock->l_vnode == vp);
 966         }
 967 
 968         if (NOT_BLOCKED(request)) {
 969                 /*
 970                  * request not dependent on any other locks
 971                  * so execute this request
 972                  */
 973                 return (flk_execute_request(request));
 974         } else {
 975                 /*
 976                  * check for deadlock
 977                  */
 978                 if (flk_check_deadlock(request))
 979                         return (EDEADLK);
 980                 /*
 981                  * this thread has to sleep
 982                  */
 983                 return (flk_wait_execute_request(request));
 984         }
 985 }
 986 
 987 /* ONC_PLUS EXTRACT START */
 988 /*
 989  * In the simple case, the actual execution of the request is only to
 990  * insert the 'request' in the list of active locks if it is not an
 991  * UNLOCK.
 992  * We have to consider the existing active locks' relation to
 993  * this 'request' if they are owned by the same process. flk_relation()
 994  * does this job and sees to it that the dependency graph information
 995  * is maintained properly.
 996  */
 997 
 998 int
 999 flk_execute_request(lock_descriptor_t *request)
1000 {
1001         graph_t *gp = request->l_graph;
1002         vnode_t *vp = request->l_vnode;
1003         lock_descriptor_t       *lock, *lock1;
1004         int done_searching = 0;
1005 
1006         CHECK_SLEEPING_LOCKS(gp);
1007         CHECK_ACTIVE_LOCKS(gp);
1008 
1009         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1010 
1011         flk_set_state(request, FLK_START_STATE);
1012 
1013         ASSERT(NOT_BLOCKED(request));
1014 
1015         /* IO_LOCK requests are only to check status */
1016 
1017         if (IS_IO_LOCK(request))
1018                 return (0);
1019 
1020         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1021 
1022         if (lock == NULL && request->l_type == F_UNLCK)
1023                 return (0);
1024         if (lock == NULL) {
1025                 flk_insert_active_lock(request);
1026                 return (0);
1027         }
1028 
1029         do {
1030                 lock1 = lock->l_next;
1031                 if (SAME_OWNER(request, lock)) {
1032                         done_searching = flk_relation(lock, request);
1033                 }
1034                 lock = lock1;
1035         } while (lock->l_vnode == vp && !done_searching);
1036 
1037         /*
1038          * insert in active queue
1039          */
1040 
1041         if (request->l_type != F_UNLCK)
1042                 flk_insert_active_lock(request);
1043 
1044         return (0);
1045 }
1046 /* ONC_PLUS EXTRACT END */
1047 
1048 /*
1049  * 'request' is blocked by someone, so we put it into the sleep queue.
1050  */
1051 static int
1052 flk_wait_execute_request(lock_descriptor_t *request)
1053 {
1054         graph_t *gp = request->l_graph;
1055         callb_cpr_t     *cprp;          /* CPR info from callback */
1056         struct flock_globals *fg;
1057         int index;
1058 
1059         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1060         ASSERT(IS_WILLING_TO_SLEEP(request));
1061 
1062         flk_insert_sleeping_lock(request);
1063 
1064         if (IS_LOCKMGR(request)) {
1065                 index = HASH_INDEX(request->l_vnode);
1066                 fg = flk_get_globals();
1067 
1068                 if (nlm_status_size == 0) {     /* not booted as a cluster */
1069                         if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1070                                 flk_cancel_sleeping_lock(request, 1);
1071                                 return (ENOLCK);
1072                         }
1073                 } else {                        /* booted as a cluster */
1074                         /*
1075                          * If the request is an NLM server lock request,
1076                          * and the NLM state of the lock request is not
1077                          * NLM_UP (because the NLM server is shutting
1078                          * down), then cancel the sleeping lock and
1079                          * return ENOLCK, which will encourage the
1080                          * client to retransmit.
1081                          */
1082                         if (!IS_NLM_UP(request)) {
1083                                 flk_cancel_sleeping_lock(request, 1);
1084                                 return (ENOLCK);
1085                         }
1086                 }
1087         }
1088 
1089         /* Clustering: For blocking PXFS locks, return */
1090         if (IS_PXFS(request)) {
1091                 /*
1092                  * PXFS locks sleep on the client side.
1093                  * The callback argument is used to wake up the sleeper
1094                  * when the lock is granted.
1095                  * We return PXFS_LOCK_BLOCKED (rather than an errno
1096                  * value) to indicate that the client side should sleep.
1097                  */
1098                 return (PXFS_LOCK_BLOCKED);
1099         }
1100 
1101         if (request->l_callbacks != NULL) {
1102                 /*
1103                  * To make sure the shutdown code works correctly, either
1104                  * the callback must happen after putting the lock on the
1105                  * sleep list, or we must check the shutdown status after
1106                  * returning from the callback (and before sleeping).  At
1107                  * least for now, we'll use the first option.  If a
1108                  * shutdown or signal or whatever happened while the graph
1109                  * mutex was dropped, that will be detected by
1110                  * wait_for_lock().
1111                  */
1112                 mutex_exit(&gp->gp_mutex);
1113 
1114                 cprp = flk_invoke_callbacks(request->l_callbacks,
1115                                             FLK_BEFORE_SLEEP);
1116 
1117                 mutex_enter(&gp->gp_mutex);
1118 
1119                 if (cprp == NULL) {
1120                         wait_for_lock(request);
1121                 } else {
1122                         mutex_enter(cprp->cc_lockp);
1123                         CALLB_CPR_SAFE_BEGIN(cprp);
1124                         mutex_exit(cprp->cc_lockp);
1125                         wait_for_lock(request);
1126                         mutex_enter(cprp->cc_lockp);
1127                         CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1128                         mutex_exit(cprp->cc_lockp);
1129                 }
1130 
1131                 mutex_exit(&gp->gp_mutex);
1132                 (void) flk_invoke_callbacks(request->l_callbacks,
1133                                             FLK_AFTER_SLEEP);
1134                 mutex_enter(&gp->gp_mutex);
1135         } else {
1136                 wait_for_lock(request);
1137         }
1138 
1139         if (IS_LOCKMGR(request)) {
1140                 /*
1141                  * If the lock manager is shutting down, return an
1142                  * error that will encourage the client to retransmit.
1143                  */
1144                 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1145                         !IS_GRANTED(request)) {
1146                         flk_cancel_sleeping_lock(request, 1);
1147                         return (ENOLCK);
1148                 }
1149         }
1150 
1151         if (IS_INTERRUPTED(request)) {
1152                 /* we got a signal, or act like we did */
1153                 flk_cancel_sleeping_lock(request, 1);
1154                 return (EINTR);
1155         }
1156 
1157         /* Cancelled if some other thread has closed the file */
1158 
1159         if (IS_CANCELLED(request)) {
1160                 flk_cancel_sleeping_lock(request, 1);
1161                 return (EBADF);
1162         }
1163 
1164         request->l_state &= ~GRANTED_LOCK;
1165         REMOVE_SLEEP_QUEUE(request);
1166         return (flk_execute_request(request));
1167 }
1168 
1169 /*
1170  * This routine adds an edge between from and to because from depends
1171  * on to. If asked to check for deadlock, it checks whether there are any
1172  * locks reachable from "from_lock" that are owned by the same process
1173  * as "from_lock".
1174  * NOTE: It is the caller's responsibility to make sure that the color
1175  * of the graph is consistent between the calls to flk_add_edge as done
1176  * in flk_process_request. This routine does not color and check for
1177  * deadlock explicitly.
1178  */
1179 
1180 static int
1181 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1182                         int check_cycle, int update_graph)
1183 {
1184         edge_t  *edge;
1185         edge_t  *ep;
1186         lock_descriptor_t       *vertex;
1187         lock_descriptor_t *vertex_stack;
1188 
1189         STACK_INIT(vertex_stack);
1190 
1191         /*
1192          * If the to vertex already has mark_color, just return without
1193          * adding an edge, since it is already reachable from the from
1194          * vertex.
1195          */
1196 
1197         if (COLORED(to_lock))
1198                 return (0);
1199 
1200         edge = flk_get_edge();
1201 
1202         /*
1203          * set the from and to vertex
1204          */
1205 
1206         edge->from_vertex = from_lock;
1207         edge->to_vertex = to_lock;
1208 
1209         /*
1210          * put in the adjacency list of the from vertex
1211          */
1212 
1213         from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1214         edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1215         edge->edge_adj_prev = &from_lock->l_edge;
1216         from_lock->l_edge.edge_adj_next = edge;
1217 
1218         /*
1219          * put in the in-list of the to vertex
1220          */
1221 
1222         to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1223         edge->edge_in_next = to_lock->l_edge.edge_in_next;
1224         to_lock->l_edge.edge_in_next = edge;
1225         edge->edge_in_prev = &to_lock->l_edge;
1226 
1227 
1228         if (update_graph) {
1229                 flk_update_proc_graph(edge, 0);
1230                 return (0);
1231         }
1232         if (!check_cycle) {
1233                 return (0);
1234         }
1235 
1236         STACK_PUSH(vertex_stack, from_lock, l_stack);
1237 
1238         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1239 
1240                 STACK_POP(vertex_stack, l_stack);
1241 
1242                 for (ep = FIRST_ADJ(vertex);
1243                         ep != HEAD(vertex);
1244                                 ep = NEXT_ADJ(ep)) {
1245                         if (COLORED(ep->to_vertex))
1246                                 continue;
1247                         COLOR(ep->to_vertex);
1248                         if (SAME_OWNER(ep->to_vertex, from_lock))
1249                                 goto dead_lock;
1250                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1251                 }
1252         }
1253         return (0);
1254 
1255 dead_lock:
1256 
1257         /*
1258          * remove all edges
1259          */
1260 
1261         ep = FIRST_ADJ(from_lock);
1262 
1263         while (ep != HEAD(from_lock)) {
1264                 IN_LIST_REMOVE(ep);
1265                 from_lock->l_sedge = NEXT_ADJ(ep);
1266                 ADJ_LIST_REMOVE(ep);
1267                 flk_free_edge(ep);
1268                 ep = from_lock->l_sedge;
1269         }
1270         return (EDEADLK);
1271 }
1272 
1273 /*
1274  * Get an edge structure for representing the dependency between two locks.
1275  */
1276 
1277 static edge_t *
1278 flk_get_edge()
1279 {
1280         edge_t  *ep;
1281 
1282         ASSERT(flk_edge_cache != NULL);
1283 
1284         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1285         edge_allocs++;
1286         return (ep);
1287 }
1288 
1289 /*
1290  * Free the edge structure.
1291  */
1292 
1293 static void
1294 flk_free_edge(edge_t *ep)
1295 {
1296         edge_frees++;
1297         kmem_cache_free(flk_edge_cache, (void *)ep);
1298 }
1299 
1300 /*
1301  * Check the relationship of request with lock, recompute the
1302  * dependencies, break the lock if required, and return 1 if the
1303  * request cannot have any further relationship with the remaining
1304  * active locks.
1305  * The 'lock' and 'request' are compared, and in case of overlap we
1306  * delete 'lock' and form new locks to represent the non-overlapping
1307  * portions of the original 'lock'. This function has side effects:
1308  * 'lock' may be freed and new locks may be added to the active list.
1309  */
1310 
1311 static int
1312 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1313 {
1314         int lock_effect;
1315         lock_descriptor_t *lock1, *lock2;
1316         lock_descriptor_t *topology[3];
1317         int nvertex = 0;
1318         int i;
1319         edge_t  *ep;
1320         graph_t *gp = (lock->l_graph);
1321 
1322 
1323         CHECK_SLEEPING_LOCKS(gp);
1324         CHECK_ACTIVE_LOCKS(gp);
1325 
1326         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1327 
1328         topology[0] = topology[1] = topology[2] = NULL;
1329 
1330         if (request->l_type == F_UNLCK)
1331                 lock_effect = FLK_UNLOCK;
1332         else if (request->l_type == F_RDLCK &&
1333                         lock->l_type == F_WRLCK)
1334                 lock_effect = FLK_DOWNGRADE;
1335         else if (request->l_type == F_WRLCK &&
1336                         lock->l_type == F_RDLCK)
1337                 lock_effect = FLK_UPGRADE;
1338         else
1339                 lock_effect = FLK_STAY_SAME;
1340 
1341         if (lock->l_end < request->l_start) {
1342                 if (lock->l_end == request->l_start - 1 &&
1343                                 lock_effect == FLK_STAY_SAME) {
1344                         topology[0] = request;
1345                         request->l_start = lock->l_start;
1346                         nvertex = 1;
1347                         goto recompute;
1348                 } else {
1349                         return (0);
1350                 }
1351         }
1352 
1353         if (lock->l_start > request->l_end) {
1354                 if (request->l_end == lock->l_start - 1 &&
1355                                         lock_effect == FLK_STAY_SAME) {
1356                         topology[0] = request;
1357                         request->l_end = lock->l_end;
1358                         nvertex = 1;
1359                         goto recompute;
1360                 } else {
1361                         return (1);
1362                 }
1363         }
1364 
1365         if (request->l_end < lock->l_end) {
1366                 if (request->l_start > lock->l_start) {
1367                         if (lock_effect == FLK_STAY_SAME) {
1368                                 request->l_start = lock->l_start;
1369                                 request->l_end = lock->l_end;
1370                                 topology[0] = request;
1371                                 nvertex = 1;
1372                         } else {
1373                                 lock1 = flk_get_lock();
1374                                 lock2 = flk_get_lock();
1375                                 COPY(lock1, lock);
1376                                 COPY(lock2, lock);
1377                                 lock1->l_start = lock->l_start;
1378                                 lock1->l_end = request->l_start - 1;
1379                                 lock2->l_start = request->l_end + 1;
1380                                 lock2->l_end = lock->l_end;
1381                                 topology[0] = lock1;
1382                                 topology[1] = lock2;
1383                                 topology[2] = request;
1384                                 nvertex = 3;
1385                         }
1386                 } else if (request->l_start < lock->l_start) {
1387                         if (lock_effect == FLK_STAY_SAME) {
1388                                 request->l_end = lock->l_end;
1389                                 topology[0] = request;
1390                                 nvertex = 1;
1391                         } else {
1392                                 lock1 = flk_get_lock();
1393                                 COPY(lock1, lock);
1394                                 lock1->l_start = request->l_end + 1;
1395                                 topology[0] = lock1;
1396                                 topology[1] = request;
1397                                 nvertex = 2;
1398                         }
1399                 } else  {
1400                         if (lock_effect == FLK_STAY_SAME) {
1401                                 request->l_start = lock->l_start;
1402                                 request->l_end = lock->l_end;
1403                                 topology[0] = request;
1404                                 nvertex = 1;
1405                         } else {
1406                                 lock1 = flk_get_lock();
1407                                 COPY(lock1, lock);
1408                                 lock1->l_start = request->l_end + 1;
1409                                 topology[0] = lock1;
1410                                 topology[1] = request;
1411                                 nvertex = 2;
1412                         }
1413                 }
1414         } else if (request->l_end > lock->l_end) {
1415                 if (request->l_start > lock->l_start)  {
1416                         if (lock_effect == FLK_STAY_SAME) {
1417                                 request->l_start = lock->l_start;
1418                                 topology[0] = request;
1419                                 nvertex = 1;
1420                         } else {
1421                                 lock1 = flk_get_lock();
1422                                 COPY(lock1, lock);
1423                                 lock1->l_end = request->l_start - 1;
1424                                 topology[0] = lock1;
1425                                 topology[1] = request;
1426                                 nvertex = 2;
1427                         }
                } else {
                        topology[0] = request;
                        nvertex = 1;
                }
1435         } else {
1436                 if (request->l_start > lock->l_start) {
1437                         if (lock_effect == FLK_STAY_SAME) {
1438                                 request->l_start = lock->l_start;
1439                                 topology[0] = request;
1440                                 nvertex = 1;
1441                         } else {
1442                                 lock1 = flk_get_lock();
1443                                 COPY(lock1, lock);
1444                                 lock1->l_end = request->l_start - 1;
1445                                 topology[0] = lock1;
1446                                 topology[1] = request;
1447                                 nvertex = 2;
1448                         }
1449                 } else if (request->l_start < lock->l_start) {
1450                         topology[0] = request;
1451                         nvertex = 1;
1452                 } else {
1453                         if (lock_effect !=  FLK_UNLOCK) {
1454                                 topology[0] = request;
1455                                 nvertex = 1;
1456                         } else {
1457                                 flk_delete_active_lock(lock, 0);
1458                                 flk_wakeup(lock, 1);
1459                                 flk_free_lock(lock);
1460                                 CHECK_SLEEPING_LOCKS(gp);
1461                                 CHECK_ACTIVE_LOCKS(gp);
1462                                 return (1);
1463                         }
1464                 }
1465         }
1466 
1467 recompute:
1468 
        /*
         * For an unlock we don't send 'request' in for recomputing
         * dependencies, because no lock will add an edge to it.
         */
1473 
1474         if (lock_effect == FLK_UNLOCK) {
1475                 topology[nvertex-1] = NULL;
1476                 nvertex--;
1477         }
1478         for (i = 0; i < nvertex; i++) {
1479                 topology[i]->l_state |= RECOMPUTE_LOCK;
1480                 topology[i]->l_color = NO_COLOR;
1481         }
1482 
1483         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1484 
        /*
         * Remove the in-edges of the vertex 'lock' from the adjacency
         * lists of their source vertices.
         */
1489 
1490         ep = FIRST_IN(lock);
1491         while (ep != HEAD(lock)) {
1492                 ADJ_LIST_REMOVE(ep);
1493                 ep = NEXT_IN(ep);
1494         }
1495 
1496         flk_delete_active_lock(lock, 0);
1497 
1498         /* We are ready for recomputing the dependencies now */
1499 
1500         flk_recompute_dependencies(lock, topology, nvertex, 1);
1501 
1502         for (i = 0; i < nvertex; i++) {
1503                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1504                 topology[i]->l_color = NO_COLOR;
1505         }
1506 
1507 
1508         if (lock_effect == FLK_UNLOCK) {
1509                 nvertex++;
1510         }
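        /*
         * Insert the remainder pieces into the active list.  The last
         * topology slot holds 'request' (NULL in the unlock case) and is
         * deliberately not inserted here.
         */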
1511         for (i = 0; i < nvertex - 1; i++) {
1512                 flk_insert_active_lock(topology[i]);
1513         }
1514 
1515 
1516         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1517                 flk_wakeup(lock, 0);
1518         } else {
1519                 ep = FIRST_IN(lock);
1520                 while (ep != HEAD(lock)) {
1521                         lock->l_sedge = NEXT_IN(ep);
1522                         IN_LIST_REMOVE(ep);
1523                         flk_update_proc_graph(ep, 1);
1524                         flk_free_edge(ep);
1525                         ep = lock->l_sedge;
1526                 }
1527         }
1528         flk_free_lock(lock);
1529 
1530         CHECK_SLEEPING_LOCKS(gp);
1531         CHECK_ACTIVE_LOCKS(gp);
1532         return (0);
1533 }
1534 
1535 /*
1536  * Insert a lock into the active queue.
1537  */
1538 
1539 static void
1540 flk_insert_active_lock(lock_descriptor_t *new_lock)
1541 {
1542         graph_t *gp = new_lock->l_graph;
1543         vnode_t *vp = new_lock->l_vnode;
1544         lock_descriptor_t *first_lock, *lock;
1545 
1546         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1547 
1548         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1549         first_lock = lock;
1550 
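        /*
         * Active locks for a vnode are kept sorted by start offset; walk
         * forward to the first lock that does not start before new_lock,
         * and insert new_lock in front of it.
         */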
1551         if (first_lock != NULL) {
1552                 for (; (lock->l_vnode == vp &&
1553                         lock->l_start < new_lock->l_start); lock = lock->l_next)
1554                         ;
1555         } else {
1556                 lock = ACTIVE_HEAD(gp);
1557         }
1558 
1559         lock->l_prev->l_next = new_lock;
1560         new_lock->l_next = lock;
1561         new_lock->l_prev = lock->l_prev;
1562         lock->l_prev = new_lock;
1563 
1564         if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1565                 vp->v_filocks = (struct filock *)new_lock;
1566         }
1567         flk_set_state(new_lock, FLK_ACTIVE_STATE);
1568         new_lock->l_state |= ACTIVE_LOCK;
1569 
1570         CHECK_ACTIVE_LOCKS(gp);
1571         CHECK_SLEEPING_LOCKS(gp);
1572 }
1573 
/*
 * Delete an active lock: depending on the value of the second
 * parameter, either just remove the lock from the active list, or
 * both remove it and free it.
 */
1579 
1580 static void
1581 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1582 {
1583         vnode_t *vp = lock->l_vnode;
1584         graph_t *gp = lock->l_graph;
1585 
1586         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1587         if (free_lock)
1588                 ASSERT(NO_DEPENDENTS(lock));
1589         ASSERT(NOT_BLOCKED(lock));
1590         ASSERT(IS_ACTIVE(lock));
1591 
1592         ASSERT((vp->v_filocks != NULL));
1593 
1594         if (vp->v_filocks == (struct filock *)lock) {
1595                 vp->v_filocks = (struct filock *)
1596                                 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1597                                                                 NULL);
1598         }
1599         lock->l_next->l_prev = lock->l_prev;
1600         lock->l_prev->l_next = lock->l_next;
1601         lock->l_next = lock->l_prev = NULL;
1602         flk_set_state(lock, FLK_DEAD_STATE);
1603         lock->l_state &= ~ACTIVE_LOCK;
1604 
1605         if (free_lock)
1606                 flk_free_lock(lock);
1607         CHECK_ACTIVE_LOCKS(gp);
1608         CHECK_SLEEPING_LOCKS(gp);
1609 }
1610 
1611 /*
1612  * Insert into the sleep queue.
1613  */
1614 
1615 static void
1616 flk_insert_sleeping_lock(lock_descriptor_t *request)
1617 {
1618         graph_t *gp = request->l_graph;
1619         vnode_t *vp = request->l_vnode;
1620         lock_descriptor_t       *lock;
1621 
1622         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1623         ASSERT(IS_INITIAL(request));
1624 
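        /*
         * The sleeping list is kept sorted by vnode; find the first
         * entry whose vnode is not less than vp and insert the request
         * in front of it.
         */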
1625         for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1626                 lock->l_vnode < vp); lock = lock->l_next)
1627                 ;
1628 
1629         lock->l_prev->l_next = request;
1630         request->l_prev = lock->l_prev;
1631         lock->l_prev = request;
1632         request->l_next = lock;
1633         flk_set_state(request, FLK_SLEEPING_STATE);
1634         request->l_state |= SLEEPING_LOCK;
1635 }
1636 
/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph; therefore we must recompute the dependencies of
 * all vertices that have a path to this vertex, with respect to all
 * vertices reachable from this vertex.
 */
1643 
1644 void
1645 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1646 {
1647         graph_t *gp = request->l_graph;
1648         vnode_t *vp = request->l_vnode;
1649         lock_descriptor_t **topology = NULL;
1650         edge_t  *ep;
1651         lock_descriptor_t *vertex, *lock;
1652         int nvertex = 0;
1653         int i;
1654         lock_descriptor_t *vertex_stack;
1655 
1656         STACK_INIT(vertex_stack);
1657 
1658         ASSERT(MUTEX_HELD(&gp->gp_mutex));
        /*
         * Count the number of vertex pointers that have to be allocated,
         * i.e., all vertices that are reachable from 'request'.
         */
1663 
1664         STACK_PUSH(vertex_stack, request, l_stack);
1665 
1666         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1667                 STACK_POP(vertex_stack, l_stack);
1668                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1669                                         ep = NEXT_ADJ(ep)) {
1670                         if (IS_RECOMPUTE(ep->to_vertex))
1671                                 continue;
1672                         ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1673                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1674                         nvertex++;
1675                 }
1676         }
1677 
1678         /*
1679          * allocate memory for holding the vertex pointers
1680          */
1681 
1682         if (nvertex) {
1683                 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1684                                                 KM_SLEEP);
1685         }
1686 
        /*
         * Make one more pass to actually store the vertices in the
         * allocated array.  Sleeping locks are checked before active
         * locks so that the topology array ends up in topological
         * order.
         */
1694 
1695         nvertex = 0;
1696         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1697 
1698         if (lock) {
1699                 do {
1700                         if (IS_RECOMPUTE(lock)) {
1701                                 lock->l_index = nvertex;
1702                                 topology[nvertex++] = lock;
1703                         }
1704                         lock->l_color = NO_COLOR;
1705                         lock = lock->l_next;
1706                 } while (lock->l_vnode == vp);
1707         }
1708 
1709         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1710 
1711         if (lock) {
1712                 do {
1713                         if (IS_RECOMPUTE(lock)) {
1714                                 lock->l_index = nvertex;
1715                                 topology[nvertex++] = lock;
1716                         }
1717                         lock->l_color = NO_COLOR;
1718                         lock = lock->l_next;
1719                 } while (lock->l_vnode == vp);
1720         }
1721 
        /*
         * Detach the in-edges of 'request' from the adjacency lists of
         * their source vertices.  The edges themselves are freed below,
         * after the proc graph has been updated.
         */
1726 
1727         for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1728                 ADJ_LIST_REMOVE(ep);
1729         }
1730 
1731 
1732         if (remove_from_queue)
1733                 REMOVE_SLEEP_QUEUE(request);
1734 
1735         /* we are ready to recompute */
1736 
1737         flk_recompute_dependencies(request, topology, nvertex, 1);
1738 
1739         ep = FIRST_ADJ(request);
1740         while (ep != HEAD(request)) {
1741                 IN_LIST_REMOVE(ep);
1742                 request->l_sedge = NEXT_ADJ(ep);
1743                 ADJ_LIST_REMOVE(ep);
1744                 flk_update_proc_graph(ep, 1);
1745                 flk_free_edge(ep);
1746                 ep = request->l_sedge;
1747         }
1748 
1749 
1750         /*
1751          * unset the RECOMPUTE flag in those vertices
1752          */
1753 
1754         for (i = 0; i < nvertex; i++) {
1755                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1756         }
1757 
1758         /*
1759          * free the topology
1760          */
1761         if (nvertex)
1762                 kmem_free((void *)topology,
1763                         (nvertex * sizeof (lock_descriptor_t *)));
        /*
         * Some locks may have become unblocked now, so wake them up.
         */
1767 
1768         flk_wakeup(request, 0);
1769 
        /*
         * We expect to have a correctly recomputed graph now.
         */
1773         flk_set_state(request, FLK_DEAD_STATE);
1774         flk_free_lock(request);
1775         CHECK_SLEEPING_LOCKS(gp);
1776         CHECK_ACTIVE_LOCKS(gp);
1777 
1778 }
1779 
/*
 * Uncoloring the graph simply increments the graph's mark value
 * (a vertex is colored by setting its l_color to the current mark).
 * Only when the mark wraps around do we explicitly reset the color of
 * every vertex in the graph.
 */
1785 
1786 static void
1787 flk_graph_uncolor(graph_t *gp)
1788 {
1789         lock_descriptor_t *lock;
1790 
        if (gp->mark == UINT_MAX) {
                gp->mark = 1;
                for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
                    lock = lock->l_next)
                        lock->l_color = 0;

                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp); lock = lock->l_next)
                        lock->l_color = 0;
        } else {
                gp->mark++;
        }
1803 }
1804 
1805 /*
1806  * Wake up locks that are blocked on the given lock.
1807  */
1808 
1809 static void
1810 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1811 {
1812         edge_t  *ep;
1813         graph_t *gp = lock->l_graph;
1814         lock_descriptor_t       *lck;
1815 
1816         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1817         if (NO_DEPENDENTS(lock))
1818                 return;
1819         ep = FIRST_IN(lock);
1820         do {
                /*
                 * Delete the edge from the adjacency list of the
                 * from-vertex.  If that vertex has no more adjacent
                 * edges, wake up its process.
                 */
1826                 lck = ep->from_vertex;
1827                 if (adj_list_remove)
1828                         ADJ_LIST_REMOVE(ep);
1829                 flk_update_proc_graph(ep, 1);
1830                 if (NOT_BLOCKED(lck)) {
1831                         GRANT_WAKEUP(lck);
1832                 }
1833                 lock->l_sedge = NEXT_IN(ep);
1834                 IN_LIST_REMOVE(ep);
1835                 flk_free_edge(ep);
1836                 ep = lock->l_sedge;
1837         } while (ep != HEAD(lock));
1838         ASSERT(NO_DEPENDENTS(lock));
1839 }
1840 
/*
 * The dependents of 'request' are checked for dependencies against the
 * locks in 'topology'.  (The array is called topology because it is, and
 * should be, in topological order for this algorithm; if it is not, the
 * inner loop below might add more edges than necessary.  Topological
 * ordering of the vertices means that all edges go from left to right,
 * i.e., topology[i] can have an edge to topology[j] only if i < j.)
 * If a lock l1 in the dependent set of 'request' is dependent on (blocked
 * by) a lock l2 in 'topology' but does not already have a path to it, we
 * add an edge in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there already exists
 * a path from l1 to l2, so care has to be taken with those vertices that
 * have two paths to 'request'.  These vertices are referred to here as
 * barrier locks.
 *
 * The barriers (those vertices that originally had two paths to
 * 'request') have to be found, because otherwise we may add edges
 * unnecessarily to vertices in 'topology': a barrier vertex could end up
 * with both an edge to a vertex in 'topology' and a path to it.
 */
1862 
1863 static void
1864 flk_recompute_dependencies(lock_descriptor_t *request,
1865                 lock_descriptor_t **topology,
1866                         int nvertex, int update_graph)
1867 {
1868         lock_descriptor_t *vertex, *lock;
1869         graph_t *gp = request->l_graph;
1870         int i, count;
1871         int barrier_found = 0;
1872         edge_t  *ep;
1873         lock_descriptor_t *vertex_stack;
1874 
1875         STACK_INIT(vertex_stack);
1876 
1877         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878         if (nvertex == 0)
1879                 return;
1880         flk_graph_uncolor(request->l_graph);
1881         barrier_found = flk_find_barriers(request);
1882         request->l_state |= RECOMPUTE_DONE;
1883 
1884         STACK_PUSH(vertex_stack, request, l_stack);
1885         request->l_sedge = FIRST_IN(request);
1886 
1887 
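        /*
         * Depth-first walk of the vertices that depend on 'request',
         * using each vertex's l_sedge field as its private cursor over
         * its incoming edges.
         */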
1888         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1889                 if (vertex->l_state & RECOMPUTE_DONE) {
1890                         count = 0;
1891                         goto next_in_edge;
1892                 }
1893                 if (IS_BARRIER(vertex)) {
1894                         /* decrement the barrier count */
1895                         if (vertex->l_index) {
1896                                 vertex->l_index--;
1897                                 /* this guy will be pushed again anyway ? */
1898                                 STACK_POP(vertex_stack, l_stack);
1899                                 if (vertex->l_index == 0)  {
                                /*
                                 * The barrier is over; we can recompute
                                 * dependencies for this lock on the
                                 * next stack pop.
                                 */
1905                                         vertex->l_state &= ~BARRIER_LOCK;
1906                                 }
1907                                 continue;
1908                         }
1909                 }
1910                 vertex->l_state |= RECOMPUTE_DONE;
1911                 flk_graph_uncolor(gp);
1912                 count = flk_color_reachables(vertex);
1913                 for (i = 0; i < nvertex; i++) {
1914                         lock = topology[i];
1915                         if (COLORED(lock))
1916                                 continue;
1917                         if (BLOCKS(lock, vertex)) {
1918                                 (void) flk_add_edge(vertex, lock,
1919                                     NO_CHECK_CYCLE, update_graph);
1920                                 COLOR(lock);
1921                                 count++;
1922                                 count += flk_color_reachables(lock);
1923                         }
1924 
1925                 }
1926 
1927 next_in_edge:
1928                 if (count == nvertex ||
1929                                 vertex->l_sedge == HEAD(vertex)) {
1930                         /* prune the tree below this */
1931                         STACK_POP(vertex_stack, l_stack);
1932                         vertex->l_state &= ~RECOMPUTE_DONE;
1933                         /* update the barrier locks below this! */
1934                         if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1935                                 flk_graph_uncolor(gp);
1936                                 flk_update_barriers(vertex);
1937                         }
1938                         continue;
1939                 }
1940 
1941                 ep = vertex->l_sedge;
1942                 lock = ep->from_vertex;
1943                 STACK_PUSH(vertex_stack, lock, l_stack);
1944                 lock->l_sedge = FIRST_IN(lock);
1945                 vertex->l_sedge = NEXT_IN(ep);
1946         }
1947 
1948 }
1949 
1950 /*
1951  * Color all reachable vertices from vertex that belongs to topology (here
1952  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1953  *
1954  * Note: we need to use a different stack_link l_stack1 because this is
1955  * called from flk_recompute_dependencies() that already uses a stack with
1956  * l_stack as stack_link.
1957  */
1958 
1959 static int
1960 flk_color_reachables(lock_descriptor_t *vertex)
1961 {
1962         lock_descriptor_t *ver, *lock;
1963         int count;
1964         edge_t  *ep;
1965         lock_descriptor_t *vertex_stack;
1966 
1967         STACK_INIT(vertex_stack);
1968 
1969         STACK_PUSH(vertex_stack, vertex, l_stack1);
1970         count = 0;
1971         while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1972 
1973                 STACK_POP(vertex_stack, l_stack1);
1974                 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1975                                         ep = NEXT_ADJ(ep)) {
1976                         lock = ep->to_vertex;
1977                         if (COLORED(lock))
1978                                 continue;
1979                         COLOR(lock);
1980                         if (IS_RECOMPUTE(lock))
1981                                 count++;
1982                         STACK_PUSH(vertex_stack, lock, l_stack1);
1983                 }
1984 
1985         }
1986         return (count);
1987 }
1988 
/*
 * Called from flk_recompute_dependencies(), this routine decrements the
 * barrier count of the barrier vertices that are reachable from 'lock'.
 */
1993 
1994 static void
1995 flk_update_barriers(lock_descriptor_t *lock)
1996 {
1997         lock_descriptor_t *vertex, *lck;
1998         edge_t  *ep;
1999         lock_descriptor_t *vertex_stack;
2000 
2001         STACK_INIT(vertex_stack);
2002 
2003         STACK_PUSH(vertex_stack, lock, l_stack1);
2004 
2005         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2006                 STACK_POP(vertex_stack, l_stack1);
2007                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2008                                                 ep = NEXT_IN(ep)) {
2009                         lck = ep->from_vertex;
2010                         if (COLORED(lck)) {
2011                                 if (IS_BARRIER(lck)) {
2012                                         ASSERT(lck->l_index > 0);
2013                                         lck->l_index--;
2014                                         if (lck->l_index == 0)
2015                                                 lck->l_state &= ~BARRIER_LOCK;
2016                                 }
2017                                 continue;
2018                         }
2019                         COLOR(lck);
2020                         if (IS_BARRIER(lck)) {
2021                                 ASSERT(lck->l_index > 0);
2022                                 lck->l_index--;
2023                                 if (lck->l_index == 0)
2024                                         lck->l_state &= ~BARRIER_LOCK;
2025                         }
2026                         STACK_PUSH(vertex_stack, lck, l_stack1);
2027                 }
2028         }
2029 }
2030 
/*
 * Find all vertices that are reachable from 'lock' more than once, mark
 * them as barrier vertices, and increment their barrier count.
 * The barrier count is one less than the total number of paths from
 * 'lock' to that vertex.
 */
2037 
2038 static int
2039 flk_find_barriers(lock_descriptor_t *lock)
2040 {
2041         lock_descriptor_t *vertex, *lck;
2042         int found = 0;
2043         edge_t  *ep;
2044         lock_descriptor_t *vertex_stack;
2045 
2046         STACK_INIT(vertex_stack);
2047 
2048         STACK_PUSH(vertex_stack, lock, l_stack1);
2049 
2050         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2051                 STACK_POP(vertex_stack, l_stack1);
2052                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2053                                                 ep = NEXT_IN(ep)) {
2054                         lck = ep->from_vertex;
2055                         if (COLORED(lck)) {
2056                                 /* this is a barrier */
2057                                 lck->l_state |= BARRIER_LOCK;
2058                                 /* index will have barrier count */
2059                                 lck->l_index++;
2060                                 if (!found)
2061                                         found = 1;
2062                                 continue;
2063                         }
2064                         COLOR(lck);
2065                         lck->l_index = 0;
2066                         STACK_PUSH(vertex_stack, lck, l_stack1);
2067                 }
2068         }
2069         return (found);
2070 }
2071 
2072 /*
2073  * Finds the first lock that is mainly responsible for blocking this
2074  * request.  If there is no such lock, request->l_flock.l_type is set to
2075  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2076  * of the blocking lock.
2077  *
2078  * Note: It is possible a request is blocked by a sleeping lock because
2079  * of the fairness policy used in flk_process_request() to construct the
2080  * dependencies. (see comments before flk_process_request()).
2081  */
2082 
2083 static void
2084 flk_get_first_blocking_lock(lock_descriptor_t *request)
2085 {
2086         graph_t *gp = request->l_graph;
2087         vnode_t *vp = request->l_vnode;
2088         lock_descriptor_t *lock, *blocker;
2089 
2090         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2091         blocker = NULL;
2092         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2093 
2094         if (lock) {
2095                 do {
2096                         if (BLOCKS(lock, request)) {
2097                                 blocker = lock;
2098                                 break;
2099                         }
2100                         lock = lock->l_next;
2101                 } while (lock->l_vnode == vp);
2102         }
2103 
2104         if (blocker) {
2105                 report_blocker(blocker, request);
2106         } else
2107                 request->l_flock.l_type = F_UNLCK;
2108 }
2109 
2110 /*
2111  * Get the graph_t structure associated with a vnode.
2112  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2113  * not yet been initialized, then a new element is allocated and returned.
2114  */
2115 graph_t *
2116 flk_get_lock_graph(vnode_t *vp, int initialize)
2117 {
2118         graph_t *gp;
2119         graph_t *gp_alloc = NULL;
2120         int index = HASH_INDEX(vp);
2121 
2122         if (initialize == FLK_USE_GRAPH) {
2123                 mutex_enter(&flock_lock);
2124                 gp = lock_graph[index];
2125                 mutex_exit(&flock_lock);
2126                 return (gp);
2127         }
2128 
2129         ASSERT(initialize == FLK_INIT_GRAPH);
2130 
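        /*
         * Optimistically allocate and initialize a new graph without
         * holding flock_lock; the hash slot is re-checked under the lock
         * below, and the allocation is discarded if we lost the race.
         */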
2131         if (lock_graph[index] == NULL) {
2132 
2133                 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2134 
2135                 /* Initialize the graph */
2136 
2137                 gp_alloc->active_locks.l_next =
2138                     gp_alloc->active_locks.l_prev =
2139                     (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2140                 gp_alloc->sleeping_locks.l_next =
2141                     gp_alloc->sleeping_locks.l_prev =
2142                     (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2143                 gp_alloc->index = index;
2144                 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2145         }
2146 
2147         mutex_enter(&flock_lock);
2148 
2149         gp = lock_graph[index];
2150 
2151         /* Recheck the value within flock_lock */
2152         if (gp == NULL) {
2153                 struct flock_globals *fg;
2154 
2155                 /* We must have previously allocated the graph_t structure */
2156                 ASSERT(gp_alloc != NULL);
2157                 lock_graph[index] = gp = gp_alloc;
2158                 /*
2159                  * The lockmgr status is only needed if KLM is loaded.
2160                  */
2161                 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2162                         fg = flk_get_globals();
2163                         fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2164                 }
2165         }
2166 
2167         mutex_exit(&flock_lock);
2168 
2169         if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2170                 /* There was a race to allocate the graph_t and we lost */
2171                 mutex_destroy(&gp_alloc->gp_mutex);
2172                 kmem_free(gp_alloc, sizeof (graph_t));
2173         }
2174 
2175         return (gp);
2176 }
2177 
/*
 * PSARC case 1997/292
 *
 * Return non-zero if the given vnode has any active or sleeping lock
 * manager locks whose NLM id matches 'nlmid'; return zero otherwise.
 * Only meaningful when the node is booted as a cluster.
 */
2181 int
2182 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2183 {
2184         lock_descriptor_t *lock;
2185         int result = 0;
2186         graph_t *gp;
2187         int                     lock_nlmid;
2188 
2189         /*
2190          * Check to see if node is booted as a cluster. If not, return.
2191          */
2192         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2193                 return (0);
2194         }
2195 
2196         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2197         if (gp == NULL) {
2198                 return (0);
2199         }
2200 
2201         mutex_enter(&gp->gp_mutex);
2202 
2203         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2204 
2205         if (lock) {
2206                 while (lock->l_vnode == vp) {
2207                         /* get NLM id from sysid */
2208                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2209 
2210                         /*
2211                          * If NLM server request _and_ nlmid of lock matches
2212                          * nlmid of argument, then we've found a remote lock.
2213                          */
2214                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2215                                 result = 1;
2216                                 goto done;
2217                         }
2218                         lock = lock->l_next;
2219                 }
2220         }
2221 
2222         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2223 
2224         if (lock) {
2225                 while (lock->l_vnode == vp) {
2226                         /* get NLM id from sysid */
2227                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2228 
2229                         /*
2230                          * If NLM server request _and_ nlmid of lock matches
2231                          * nlmid of argument, then we've found a remote lock.
2232                          */
2233                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2234                                 result = 1;
2235                                 goto done;
2236                         }
2237                         lock = lock->l_next;
2238                 }
2239         }
2240 
2241 done:
2242         mutex_exit(&gp->gp_mutex);
2243         return (result);
2244 }
2245 
2246 /* ONC_PLUS EXTRACT START */
2247 /*
2248  * Determine whether there are any locks for the given vnode with a remote
2249  * sysid.  Returns zero if not, non-zero if there are.
2250  *
2251  * Note that the return value from this function is potentially invalid
2252  * once it has been returned.  The caller is responsible for providing its
2253  * own synchronization mechanism to ensure that the return value is useful
2254  * (e.g., see nfs_lockcompletion()).
2255  */
2256 int
2257 flk_has_remote_locks(vnode_t *vp)
2258 {
2259         lock_descriptor_t *lock;
2260         int result = 0;
2261         graph_t *gp;
2262 
2263         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2264         if (gp == NULL) {
2265                 return (0);
2266         }
2267 
2268         mutex_enter(&gp->gp_mutex);
2269 
2270         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2271 
2272         if (lock) {
2273                 while (lock->l_vnode == vp) {
2274                         if (IS_REMOTE(lock)) {
2275                                 result = 1;
2276                                 goto done;
2277                         }
2278                         lock = lock->l_next;
2279                 }
2280         }
2281 
2282         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2283 
2284         if (lock) {
2285                 while (lock->l_vnode == vp) {
2286                         if (IS_REMOTE(lock)) {
2287                                 result = 1;
2288                                 goto done;
2289                         }
2290                         lock = lock->l_next;
2291                 }
2292         }
2293 
2294 done:
2295         mutex_exit(&gp->gp_mutex);
2296         return (result);
2297 }
2298 
2299 /*
2300  * Determine if there are any locks owned by the given sysid.
2301  * Returns zero if not, non-zero if there are.  Note that this return code
2302  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2303  * avoids all the memory allocations of those routines.
2304  *
2305  * This routine has the same synchronization issues as
2306  * flk_has_remote_locks.
2307  */
2308 
2309 int
2310 flk_sysid_has_locks(int sysid, int lck_type)
2311 {
2312         int             has_locks = 0;
2313         lock_descriptor_t       *lock;
2314         graph_t         *gp;
2315         int             i;
2316 
2317         for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2318                 mutex_enter(&flock_lock);
2319                 gp = lock_graph[i];
2320                 mutex_exit(&flock_lock);
2321                 if (gp == NULL) {
2322                         continue;
2323                 }
2324 
2325                 mutex_enter(&gp->gp_mutex);
2326 
2327                 if (lck_type & FLK_QUERY_ACTIVE) {
2328                         for (lock = ACTIVE_HEAD(gp)->l_next;
2329                             lock != ACTIVE_HEAD(gp) && !has_locks;
2330                             lock = lock->l_next) {
2331                                 if (lock->l_flock.l_sysid == sysid)
2332                                         has_locks = 1;
2333                         }
2334                 }
2335 
2336                 if (lck_type & FLK_QUERY_SLEEPING) {
2337                         for (lock = SLEEPING_HEAD(gp)->l_next;
2338                                 lock != SLEEPING_HEAD(gp) && !has_locks;
2339                                 lock = lock->l_next) {
2340                                 if (lock->l_flock.l_sysid == sysid)
2341                                         has_locks = 1;
2342                         }
2343                 }
2344                 mutex_exit(&gp->gp_mutex);
2345         }
2346 
2347         return (has_locks);
2348 }
2349 
2350 
2351 /*
2352  * PSARC case 1997/292
2353  *
 * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
2355  *  quantity, the real sysid generated by the NLM server; the upper half
2356  *  identifies the node of the cluster where the NLM server ran.
2357  *  This routine is only called by an NLM server running in a cluster.
2358  * Effects: Remove all locks held on behalf of the client identified
2359  *  by "sysid."
2360  */
2361 void
2362 cl_flk_remove_locks_by_sysid(int sysid)
2363 {
2364         graph_t *gp;
2365         int i;
2366         lock_descriptor_t *lock, *nlock;
2367 
2368         /*
2369          * Check to see if node is booted as a cluster. If not, return.
2370          */
2371         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2372                 return;
2373         }
2374 
2375         ASSERT(sysid != 0);
2376         for (i = 0; i < HASH_SIZE; i++) {
2377                 mutex_enter(&flock_lock);
2378                 gp = lock_graph[i];
2379                 mutex_exit(&flock_lock);
2380 
2381                 if (gp == NULL)
2382                         continue;
2383 
2384                 mutex_enter(&gp->gp_mutex);      /*  get mutex on lock graph */
2385 
2386                 /* signal sleeping requests so that they bail out */
2387                 lock = SLEEPING_HEAD(gp)->l_next;
2388                 while (lock != SLEEPING_HEAD(gp)) {
2389                         nlock = lock->l_next;
2390                         if (lock->l_flock.l_sysid == sysid) {
2391                                 INTERRUPT_WAKEUP(lock);
2392                         }
2393                         lock = nlock;
2394                 }
2395 
2396                 /* delete active locks */
2397                 lock = ACTIVE_HEAD(gp)->l_next;
2398                 while (lock != ACTIVE_HEAD(gp)) {
2399                         nlock = lock->l_next;
2400                         if (lock->l_flock.l_sysid == sysid) {
2401                                 flk_delete_active_lock(lock, 0);
2402                                 flk_wakeup(lock, 1);
2403                                 flk_free_lock(lock);
2404                         }
2405                         lock = nlock;
2406                 }
2407                 mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2408         }
2409 }
2410 
/*
 * Delete all locks in the system that belong to the sysid of the request.
 */
2414 
2415 static void
2416 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2417 {
2418         int     sysid  = request->l_flock.l_sysid;
2419         lock_descriptor_t *lock, *nlock;
2420         graph_t *gp;
2421         int i;
2422 
2423         ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2424         ASSERT(sysid != 0);
2425 
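        /*
         * Drop the caller's graph mutex while all lock graphs are
         * walked; it is re-acquired before returning.
         */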
2426         mutex_exit(&request->l_graph->gp_mutex);
2427 
2428         for (i = 0; i < HASH_SIZE; i++) {
2429                 mutex_enter(&flock_lock);
2430                 gp = lock_graph[i];
2431                 mutex_exit(&flock_lock);
2432 
2433                 if (gp == NULL)
2434                         continue;
2435 
2436                 mutex_enter(&gp->gp_mutex);
2437 
2438                 /* signal sleeping requests so that they bail out */
2439                 lock = SLEEPING_HEAD(gp)->l_next;
2440                 while (lock != SLEEPING_HEAD(gp)) {
2441                         nlock = lock->l_next;
2442                         if (lock->l_flock.l_sysid == sysid) {
2443                                 INTERRUPT_WAKEUP(lock);
2444                         }
2445                         lock = nlock;
2446                 }
2447 
2448                 /* delete active locks */
2449                 lock = ACTIVE_HEAD(gp)->l_next;
2450                 while (lock != ACTIVE_HEAD(gp)) {
2451                         nlock = lock->l_next;
2452                         if (lock->l_flock.l_sysid == sysid) {
2453                                 flk_delete_active_lock(lock, 0);
2454                                 flk_wakeup(lock, 1);
2455                                 flk_free_lock(lock);
2456                         }
2457                         lock = nlock;
2458                 }
2459                 mutex_exit(&gp->gp_mutex);
2460         }
2461 
2462         mutex_enter(&request->l_graph->gp_mutex);
2463 }
2464 
2465 /*
2466  * Clustering: Deletes PXFS locks
2467  * Effects: Delete all locks on files in the given file system and with the
2468  *  given PXFS id.
2469  */
2470 void
2471 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2472 {
2473         lock_descriptor_t *lock, *nlock;
2474         graph_t *gp;
2475         int i;
2476 
2477         for (i = 0; i < HASH_SIZE; i++) {
2478                 mutex_enter(&flock_lock);
2479                 gp = lock_graph[i];
2480                 mutex_exit(&flock_lock);
2481 
2482                 if (gp == NULL)
2483                         continue;
2484 
2485                 mutex_enter(&gp->gp_mutex);
2486 
2487                 /* signal sleeping requests so that they bail out */
2488                 lock = SLEEPING_HEAD(gp)->l_next;
2489                 while (lock != SLEEPING_HEAD(gp)) {
2490                         nlock = lock->l_next;
2491                         if (lock->l_vnode->v_vfsp == vfsp) {
2492                                 ASSERT(IS_PXFS(lock));
2493                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2494                                     pxfsid) {
2495                                         flk_set_state(lock,
2496                                             FLK_CANCELLED_STATE);
2497                                         flk_cancel_sleeping_lock(lock, 1);
2498                                 }
2499                         }
2500                         lock = nlock;
2501                 }
2502 
2503                 /* delete active locks */
2504                 lock = ACTIVE_HEAD(gp)->l_next;
2505                 while (lock != ACTIVE_HEAD(gp)) {
2506                         nlock = lock->l_next;
2507                         if (lock->l_vnode->v_vfsp == vfsp) {
2508                                 ASSERT(IS_PXFS(lock));
2509                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2510                                     pxfsid) {
2511                                         flk_delete_active_lock(lock, 0);
2512                                         flk_wakeup(lock, 1);
2513                                         flk_free_lock(lock);
2514                                 }
2515                         }
2516                         lock = nlock;
2517                 }
2518                 mutex_exit(&gp->gp_mutex);
2519         }
2520 }
2521 
2522 /*
2523  * Search for a sleeping lock manager lock which matches exactly this lock
2524  * request; if one is found, fake a signal to cancel it.
2525  *
2526  * Return 1 if a matching lock was found, 0 otherwise.
2527  */
2528 
2529 static int
2530 flk_canceled(lock_descriptor_t *request)
2531 {
2532         lock_descriptor_t *lock, *nlock;
2533         graph_t *gp = request->l_graph;
2534         vnode_t *vp = request->l_vnode;
2535 
2536         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2537         ASSERT(IS_LOCKMGR(request));
2538         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2539 
2540         if (lock) {
2541                 while (lock->l_vnode == vp) {
2542                         nlock = lock->l_next;
2543                         if (SAME_OWNER(lock, request) &&
2544                                 lock->l_start == request->l_start &&
2545                                         lock->l_end == request->l_end) {
2546                                 INTERRUPT_WAKEUP(lock);
2547                                 return (1);
2548                         }
2549                         lock = nlock;
2550                 }
2551         }
2552         return (0);
2553 }
2554 
/*
 * Remove all locks on the given vnode that belong to the given pid and
 * sysid.
 */
2558 
2559 void
2560 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2561 {
2562         graph_t *gp;
2563         lock_descriptor_t *lock, *nlock;
2564         lock_descriptor_t *link_stack;
2565 
2566         STACK_INIT(link_stack);
2567 
2568         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2569 
2570         if (gp == NULL)
2571                 return;
2572         mutex_enter(&gp->gp_mutex);
2573 
2574         CHECK_SLEEPING_LOCKS(gp);
2575         CHECK_ACTIVE_LOCKS(gp);
2576 
2577         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2578 
2579         if (lock) {
2580                 do {
2581                         nlock = lock->l_next;
2582                         if ((lock->l_flock.l_pid == pid ||
2583                                         pid == IGN_PID) &&
2584                                 lock->l_flock.l_sysid == sysid) {
2585                                 CANCEL_WAKEUP(lock);
2586                         }
2587                         lock = nlock;
2588                 } while (lock->l_vnode == vp);
2589         }
2590 
2591         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2592 
2593         if (lock) {
2594                 do {
2595                         nlock = lock->l_next;
2596                         if ((lock->l_flock.l_pid == pid ||
2597                                         pid == IGN_PID) &&
2598                                 lock->l_flock.l_sysid == sysid) {
2599                                 flk_delete_active_lock(lock, 0);
2600                                 STACK_PUSH(link_stack, lock, l_stack);
2601                         }
2602                         lock = nlock;
2603                 } while (lock->l_vnode == vp);
2604         }
2605 
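        /*
         * Now that the walk of the active list is finished, wake up the
         * waiters of each removed lock and free it.
         */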
2606         while ((lock = STACK_TOP(link_stack)) != NULL) {
2607                 STACK_POP(link_stack, l_stack);
2608                 flk_wakeup(lock, 1);
2609                 flk_free_lock(lock);
2610         }
2611 
2612         CHECK_SLEEPING_LOCKS(gp);
2613         CHECK_ACTIVE_LOCKS(gp);
2614         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2615         mutex_exit(&gp->gp_mutex);
2616 }
2617 /* ONC_PLUS EXTRACT END */
2618 
2619 
2620 /*
2621  * Called from 'fs' read and write routines for files that have mandatory
2622  * locking enabled.
2623  */
2624 
2625 int
2626 chklock(
2627         struct vnode    *vp,
2628         int             iomode,
2629         u_offset_t      offset,
2630         ssize_t         len,
2631         int             fmode,
2632         caller_context_t *ct)
2633 {
2634         register int    i;
2635         struct flock64  bf;
2636         int             error = 0;
2637 
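        /*
         * Build an flock64 describing the I/O range and query reclock();
         * if it reports a conflicting lock (l_type comes back as anything
         * other than F_UNLCK) or fails, the I/O is rejected with that
         * error, or EAGAIN.
         */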
2638         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2639         bf.l_whence = 0;
2640         bf.l_start = offset;
2641         bf.l_len = len;
2642         if (ct == NULL) {
2643                 bf.l_pid = curproc->p_pid;
2644                 bf.l_sysid = 0;
2645         } else {
2646                 bf.l_pid = ct->cc_pid;
2647                 bf.l_sysid = ct->cc_sysid;
2648         }
2649         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2650         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2651             bf.l_type != F_UNLCK)
2652                 error = i ? i : EAGAIN;
2653         return (error);
2654 }
2655 
2656 /* ONC_PLUS EXTRACT START */
/*
 * convoff - convert the given lock data (l_start, l_whence) to the
 * given whence.
 */
2661 int
convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
{
2668         int             error;
2669         struct vattr    vattr;
2670 
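        /*
         * The l_whence/whence values follow lseek(2):
         * 0 is SEEK_SET, 1 is SEEK_CUR, 2 is SEEK_END.
         */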
2671         if ((lckdat->l_whence == 2) || (whence == 2)) {
2672                 vattr.va_mask = AT_SIZE;
2673                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2674                         return (error);
2675         }
2676 
2677         switch (lckdat->l_whence) {
2678         case 1:
2679                 lckdat->l_start += offset;
2680                 break;
2681         case 2:
2682                 lckdat->l_start += vattr.va_size;
2683                 /* FALLTHRU */
2684         case 0:
2685                 break;
2686         default:
2687                 return (EINVAL);
2688         }
2689 
2690         if (lckdat->l_start < 0)
2691                 return (EINVAL);
2692 
2693         switch (whence) {
2694         case 1:
2695                 lckdat->l_start -= offset;
2696                 break;
2697         case 2:
2698                 lckdat->l_start -= vattr.va_size;
2699                 /* FALLTHRU */
2700         case 0:
2701                 break;
2702         default:
2703                 return (EINVAL);
2704         }
2705 
2706         lckdat->l_whence = (short)whence;
2707         return (0);
2708 }
2709 /* ONC_PLUS EXTRACT END */
2710 
2711 
2712 /*      proc_graph function definitions */
2713 
/*
 * Check for a deadlock caused by the new 'lock'.  If a deadlock is
 * found, the edges of this lock are freed and 1 is returned.
 */
2718 
2719 static int
2720 flk_check_deadlock(lock_descriptor_t *lock)
2721 {
2722         proc_vertex_t   *start_vertex, *pvertex;
2723         proc_vertex_t *dvertex;
2724         proc_edge_t *pep, *ppep;
2725         edge_t  *ep, *nep;
2726         proc_vertex_t *process_stack;
2727 
2728         STACK_INIT(process_stack);
2729 
2730         mutex_enter(&flock_lock);
2731         start_vertex = flk_get_proc_vertex(lock);
2732         ASSERT(start_vertex != NULL);
2733 
2734         /* construct the edges from this process to other processes */
2735 
2736         ep = FIRST_ADJ(lock);
2737         while (ep != HEAD(lock)) {
2738                 proc_vertex_t *adj_proc;
2739 
2740                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2741                 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2742                         if (pep->to_proc == adj_proc) {
2743                                 ASSERT(pep->refcount);
2744                                 pep->refcount++;
2745                                 break;
2746                         }
2747                 }
2748                 if (pep == NULL) {
2749                         pep = flk_get_proc_edge();
2750                         pep->to_proc = adj_proc;
2751                         pep->refcount = 1;
2752                         adj_proc->incount++;
2753                         pep->next = start_vertex->edge;
2754                         start_vertex->edge = pep;
2755                 }
2756                 ep = NEXT_ADJ(ep);
2757         }
2758 
2759         ep = FIRST_IN(lock);
2760 
2761         while (ep != HEAD(lock)) {
2762                 proc_vertex_t *in_proc;
2763 
2764                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2765 
2766                 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2767                         if (pep->to_proc == start_vertex) {
2768                                 ASSERT(pep->refcount);
2769                                 pep->refcount++;
2770                                 break;
2771                         }
2772                 }
2773                 if (pep == NULL) {
2774                         pep = flk_get_proc_edge();
2775                         pep->to_proc = start_vertex;
2776                         pep->refcount = 1;
2777                         start_vertex->incount++;
2778                         pep->next = in_proc->edge;
2779                         in_proc->edge = pep;
2780                 }
2781                 ep = NEXT_IN(ep);
2782         }
2783 
2784         if (start_vertex->incount == 0) {
2785                 mutex_exit(&flock_lock);
2786                 return (0);
2787         }
2788 
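        /*
         * Depth-first search of the process graph starting at this
         * process.  Reaching a vertex that has been arrived at but not
         * yet departed means we have closed a cycle, i.e. a deadlock.
         */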
2789         flk_proc_graph_uncolor();
2790 
2791         start_vertex->p_sedge = start_vertex->edge;
2792 
2793         STACK_PUSH(process_stack, start_vertex, p_stack);
2794 
2795         while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2796                 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2797                         dvertex = pep->to_proc;
2798                         if (!PROC_ARRIVED(dvertex)) {
2799                                 STACK_PUSH(process_stack, dvertex, p_stack);
2800                                 dvertex->p_sedge = dvertex->edge;
2801                                 PROC_ARRIVE(pvertex);
2802                                 pvertex->p_sedge = pep->next;
2803                                 break;
2804                         }
2805                         if (!PROC_DEPARTED(dvertex))
2806                                 goto deadlock;
2807                 }
2808                 if (pep == NULL) {
2809                         PROC_DEPART(pvertex);
2810                         STACK_POP(process_stack, p_stack);
2811                 }
2812         }
2813         mutex_exit(&flock_lock);
2814         return (0);
2815 
2816 deadlock:
2817 
2818         /* we remove all lock edges and proc edges */
2819 
2820         ep = FIRST_ADJ(lock);
2821         while (ep != HEAD(lock)) {
2822                 proc_vertex_t *adj_proc;
2823                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2824                 nep = NEXT_ADJ(ep);
2825                 IN_LIST_REMOVE(ep);
2826                 ADJ_LIST_REMOVE(ep);
2827                 flk_free_edge(ep);
2828                 ppep = start_vertex->edge;
2829                 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2830                                                 pep = ppep->next) {
2831                         if (pep->to_proc == adj_proc) {
2832                                 pep->refcount--;
2833                                 if (pep->refcount == 0) {
2834                                         if (pep == ppep) {
2835                                                 start_vertex->edge = pep->next;
2836                                         } else {
2837                                                 ppep->next = pep->next;
2838                                         }
2839                                         adj_proc->incount--;
2840                                         flk_proc_release(adj_proc);
2841                                         flk_free_proc_edge(pep);
2842                                 }
2843                                 break;
2844                         }
2845                 }
2846                 ep = nep;
2847         }
2848         ep = FIRST_IN(lock);
2849         while (ep != HEAD(lock)) {
2850                 proc_vertex_t *in_proc;
2851                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2852                 nep = NEXT_IN(ep);
2853                 IN_LIST_REMOVE(ep);
2854                 ADJ_LIST_REMOVE(ep);
2855                 flk_free_edge(ep);
2856                 ppep = in_proc->edge;
2857                 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2858                                                 pep = ppep->next) {
2859                         if (pep->to_proc == start_vertex) {
2860                                 pep->refcount--;
2861                                 if (pep->refcount == 0) {
2862                                         if (pep == ppep) {
2863                                                 in_proc->edge = pep->next;
2864                                         } else {
2865                                                 ppep->next = pep->next;
2866                                         }
2867                                         start_vertex->incount--;
2868                                         flk_proc_release(in_proc);
2869                                         flk_free_proc_edge(pep);
2870                                 }
2871                                 break;
2872                         }
2873                 }
2874                 ep = nep;
2875         }
2876         flk_proc_release(start_vertex);
2877         mutex_exit(&flock_lock);
2878         return (1);
2879 }
2880 
2881 /*
2882  * Get a proc vertex. If the lock's pvertex value indexes a matching proc
2883  * vertex in the list, return that; otherwise allocate one. If necessary,
2884  * the list of vertices is also grown.
2885  */
2886 
2887 static proc_vertex_t *
2888 flk_get_proc_vertex(lock_descriptor_t *lock)
2889 {
2890         int i;
2891         proc_vertex_t   *pv;
2892         proc_vertex_t   **palloc;
2893 
2894         ASSERT(MUTEX_HELD(&flock_lock));
2895         if (lock->pvertex != -1) {
2896                 ASSERT(lock->pvertex >= 0);
2897                 pv = pgraph.proc[lock->pvertex];
2898                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2899                         return (pv);
2900                 }
2901         }
2902         for (i = 0; i < pgraph.gcount; i++) {
2903                 pv = pgraph.proc[i];
2904                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2905                         lock->pvertex = pv->index = i;
2906                         return (pv);
2907                 }
2908         }
2909         pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2910         pv->pid = lock->l_flock.l_pid;
2911         pv->sysid = lock->l_flock.l_sysid;
2912         flk_proc_vertex_allocs++;
2913         if (pgraph.free != 0) {
2914                 for (i = 0; i < pgraph.gcount; i++) {
2915                         if (pgraph.proc[i] == NULL) {
2916                                 pgraph.proc[i] = pv;
2917                                 lock->pvertex = pv->index = i;
2918                                 pgraph.free--;
2919                                 return (pv);
2920                         }
2921                 }
2922         }
2923         palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2924                                 sizeof (proc_vertex_t *), KM_SLEEP);
2925 
2926         if (pgraph.proc) {
2927                 bcopy(pgraph.proc, palloc,
2928                         pgraph.gcount * sizeof (proc_vertex_t *));
2929 
2930                 kmem_free(pgraph.proc,
2931                         pgraph.gcount * sizeof (proc_vertex_t *));
2932         }
2933         pgraph.proc = palloc;
2934         pgraph.free += (PROC_CHUNK - 1);
2935         pv->index = lock->pvertex = pgraph.gcount;
2936         pgraph.gcount += PROC_CHUNK;
2937         pgraph.proc[pv->index] = pv;
2938         return (pv);
2939 }
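
/*
 * A minimal, non-compiled sketch of the allocation strategy used by
 * flk_get_proc_vertex() above: reuse a free slot when one exists, otherwise
 * grow the pointer array by a fixed chunk so the copy cost is amortized.
 * The names (example_table, example_insert, CHUNK) are hypothetical and
 * exist only for illustration.
 */
#if 0
#define CHUNK   20

struct example_table {
        void    **slot;         /* growable array of pointers */
        int     count;          /* slots allocated */
        int     free;           /* slots currently NULL */
};

static int
example_insert(struct example_table *t, void *item)
{
        void **bigger;
        int i;

        /* First preference: fill a hole left by a previous removal. */
        if (t->free != 0) {
                for (i = 0; i < t->count; i++) {
                        if (t->slot[i] == NULL) {
                                t->slot[i] = item;
                                t->free--;
                                return (i);
                        }
                }
        }
        /* No free slot: grow by CHUNK, copying the old contents over. */
        bigger = kmem_zalloc((t->count + CHUNK) * sizeof (void *), KM_SLEEP);
        if (t->slot != NULL) {
                bcopy(t->slot, bigger, t->count * sizeof (void *));
                kmem_free(t->slot, t->count * sizeof (void *));
        }
        t->slot = bigger;
        t->slot[t->count] = item;
        t->free += (CHUNK - 1);
        i = t->count;
        t->count += CHUNK;
        return (i);
}
#endif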
2940 
2941 /*
2942  * Allocate a proc edge.
2943  */
2944 
2945 static proc_edge_t *
2946 flk_get_proc_edge()
2947 {
2948         proc_edge_t *pep;
2949 
2950         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2951         flk_proc_edge_allocs++;
2952         return (pep);
2953 }
2954 
2955 /*
2956  * Free the proc edge. Called whenever its reference count goes to zero.
2957  */
2958 
2959 static void
2960 flk_free_proc_edge(proc_edge_t *pep)
2961 {
2962         ASSERT(pep->refcount == 0);
2963         kmem_free((void *)pep, sizeof (proc_edge_t));
2964         flk_proc_edge_frees++;
2965 }
2966 
2967 /*
2968  * Uncolor the proc graph; explicit clearing is done only at mark wraparound.
2969  */
2970 
2971 static void
2972 flk_proc_graph_uncolor()
2973 {
2974         int i;
2975 
2976         if (pgraph.mark == UINT_MAX) {
2977                 for (i = 0; i < pgraph.gcount; i++)
2978                         if (pgraph.proc[i] != NULL) {
2979                                 pgraph.proc[i]->atime = 0;
2980                                 pgraph.proc[i]->dtime = 0;
2981                         }
2982                 pgraph.mark = 1;
2983         } else {
2984                 pgraph.mark++;
2985         }
2986 }
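
/*
 * A non-compiled sketch of the "mark" trick used by flk_proc_graph_uncolor()
 * above: rather than clearing a visited flag on every vertex before each
 * traversal, a global mark is bumped and a vertex counts as visited only if
 * its stamp equals the current mark.  All names here (example_vertex,
 * example_mark, EXAMPLE_*) are hypothetical.
 */
#if 0
struct example_vertex {
        uint_t  stamp;                  /* compared against the global mark */
};

uint_t example_mark = 1;

/* "Uncolor" every vertex in O(1) by moving the mark forward. */
static void
example_uncolor(struct example_vertex *v, int nvertices)
{
        int i;

        if (example_mark == UINT_MAX) {
                /* Only on wrap-around is every vertex actually touched. */
                for (i = 0; i < nvertices; i++)
                        v[i].stamp = 0;
                example_mark = 1;
        } else {
                example_mark++;
        }
}

#define EXAMPLE_VISIT(vtx)      ((vtx)->stamp = example_mark)
#define EXAMPLE_VISITED(vtx)    ((vtx)->stamp == example_mark)
#endif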
2987 
2988 /*
2989  * Release the proc vertex iff it has neither in edges nor out edges.
2990  */
2991 
2992 static void
2993 flk_proc_release(proc_vertex_t *proc)
2994 {
2995         ASSERT(MUTEX_HELD(&flock_lock));
2996         if (proc->edge == NULL && proc->incount == 0) {
2997                 pgraph.proc[proc->index] = NULL;
2998                 pgraph.free++;
2999                 kmem_free(proc, sizeof (proc_vertex_t));
3000                 flk_proc_vertex_frees++;
3001         }
3002 }
3003 
3004 /*
3005  * Update the process graph to reflect a change in a lock graph.
3006  * Note: this function should be called only after the lock graph has been
3007  * correctly recomputed; otherwise a deadlock might go undetected.
3008  * E.g., flk_relation() calls this function after
3009  * flk_recompute_dependencies(); otherwise, if a process tries to lock a
3010  * vnode hashed into another graph, it might sleep forever.
3011  */
3012 
3013 static void
3014 flk_update_proc_graph(edge_t *ep, int delete)
3015 {
3016         proc_vertex_t *toproc, *fromproc;
3017         proc_edge_t *pep, *prevpep;
3018 
3019         mutex_enter(&flock_lock);
3020         toproc = flk_get_proc_vertex(ep->to_vertex);
3021         fromproc = flk_get_proc_vertex(ep->from_vertex);
3022 
3023         if (!delete)
3024                 goto add;
3025         pep = prevpep = fromproc->edge;
3026 
3027         ASSERT(pep != NULL);
3028         while (pep != NULL) {
3029                 if (pep->to_proc == toproc) {
3030                         ASSERT(pep->refcount > 0);
3031                         pep->refcount--;
3032                         if (pep->refcount == 0) {
3033                                 if (pep == prevpep) {
3034                                         fromproc->edge = pep->next;
3035                                 } else {
3036                                         prevpep->next = pep->next;
3037                                 }
3038                                 toproc->incount--;
3039                                 flk_proc_release(toproc);
3040                                 flk_free_proc_edge(pep);
3041                         }
3042                         break;
3043                 }
3044                 prevpep = pep;
3045                 pep = pep->next;
3046         }
3047         flk_proc_release(fromproc);
3048         mutex_exit(&flock_lock);
3049         return;
3050 add:
3051 
3052         pep = fromproc->edge;
3053 
3054         while (pep != NULL) {
3055                 if (pep->to_proc == toproc) {
3056                         ASSERT(pep->refcount > 0);
3057                         pep->refcount++;
3058                         break;
3059                 }
3060                 pep = pep->next;
3061         }
3062         if (pep == NULL) {
3063                 pep = flk_get_proc_edge();
3064                 pep->to_proc = toproc;
3065                 pep->refcount = 1;
3066                 toproc->incount++;
3067                 pep->next = fromproc->edge;
3068                 fromproc->edge = pep;
3069         }
3070         mutex_exit(&flock_lock);
3071 }
3072 
3073 /* ONC_PLUS EXTRACT START */
3074 /*
3075  * Set the control status for lock manager requests.
3076  *
3077  */
3078 
3079 /*
3080  * PSARC case 1997/292
3081  *
3082  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3083  * Effects: Set the state of the NLM server identified by "nlmid"
3084  *   in the NLM registry to state "nlm_state."
3085  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3086  *   NLM server to this LLM.
3087  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3088  *   may be lock requests that have gotten started but not finished.  In
3089  *   particular, there may be blocking requests that are in the callback code
3090  *   before sleeping (so they're not holding the lock for the graph).  If
3091  *   such a thread reacquires the graph's lock (to go to sleep) after
3092  *   the NLM state in the NLM registry is set to a non-up value,
3093  *   it will notice the status and bail out.  If the request gets
3094  *   granted before the thread can check the NLM registry, let it
3095  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3096  *
3097  * Modifies: nlm_reg_obj (global)
3098  * Arguments:
3099  *    nlmid     (IN):    id uniquely identifying an NLM server
3100  *    nlm_state (IN):    NLM server state to change "nlmid" to
3101  */
3102 void
3103 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3104 {
3105         /*
3106          * Check to see if node is booted as a cluster. If not, return.
3107          */
3108         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3109                 return;
3110         }
3111 
3112         /*
3113          * Check for development/debugging.  It is possible to boot a node
3114          * in non-cluster mode, and then run a special script, currently
3115          * available only to developers, to bring up the node as part of a
3116          * cluster.  The problem is that running such a script does not
3117          * result in the routine flk_init() being called and hence the global
3118          * array nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3119          * but the LLM needs to do an additional check to see if the global
3120          * array has been created or not. If nlm_reg_status is NULL, then
3121          * return, else continue.
3122          */
3123         if (nlm_reg_status == NULL) {
3124                 return;
3125         }
3126 
3127         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3128         mutex_enter(&nlm_reg_lock);
3129 
3130         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3131                 /*
3132                  * If the NLM server "nlmid" is unknown in the NLM registry,
3133                  * add it to the registry in the nlm shutting down state.
3134                  */
3135                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3136                         FLK_NLM_SHUTTING_DOWN);
3137         } else {
3138                 /*
3139                  * Change the state of the NLM server identified by "nlmid"
3140                  * in the NLM registry to the argument "nlm_state."
3141                  */
3142                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3143                         nlm_state);
3144         }
3145 
3146         /*
3147          *  The reason we must register the NLM server that is shutting down
3148          *  with an LLM that doesn't already know about it (never sent a lock
3149  *  request) is to correctly handle a race between shutdown and a new
3150          *  lock request.  Suppose that a shutdown request from the NLM server
3151          *  invokes this routine at the LLM, and a thread is spawned to
3152          *  service the request. Now suppose a new lock request is in
3153          *  progress and has already passed the first line of defense in
3154  *  reclock(), which denies new lock requests from NLM servers
3155          *  that are not in the NLM_UP state.  After the current routine
3156          *  is invoked for both phases of shutdown, the routine will return,
3157          *  having done nothing, and the lock request will proceed and
3158          *  probably be granted.  The problem is that the shutdown was ignored
3159          *  by the lock request because there was no record of that NLM server
3160  *  shutting down.  We will be in the peculiar position of thinking
3161  *  that we've shut down the NLM server and all locks at all LLMs have
3162  *  been discarded, but in fact there's still one lock held.
3163  *  The solution is to record the existence of the NLM server and change
3164          *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3165          *  progress may proceed because the next phase NLM_DOWN will catch
3166          *  this lock and discard it.
3167          */
3168         mutex_exit(&nlm_reg_lock);
3169 
3170         switch (nlm_state) {
3171         case FLK_NLM_UP:
3172                 /*
3173                  * Change the NLM state of all locks still held on behalf of
3174                  * the NLM server identified by "nlmid" to NLM_UP.
3175                  */
3176                 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3177                 break;
3178 
3179         case FLK_NLM_SHUTTING_DOWN:
3180                 /*
3181                  * Wake up all sleeping locks for the NLM server identified
3182                  * by "nlmid." Note that eventually all woken threads will
3183                  * have their lock requests cancelled and descriptors
3184                  * removed from the sleeping lock list.  Note that the NLM
3185                  * server state associated with each lock descriptor is
3186                  * changed to FLK_NLM_SHUTTING_DOWN.
3187                  */
3188                 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3189                 break;
3190 
3191         case FLK_NLM_DOWN:
3192                 /*
3193                  * Discard all active, granted locks for this NLM server
3194                  * identified by "nlmid."
3195                  */
3196                 cl_flk_unlock_nlm_granted(nlmid);
3197                 break;
3198 
3199         default:
3200                 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3201         }
3202 }
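
/*
 * A non-compiled, caller-side sketch of the two shutdown phases described
 * above.  Only cl_flk_set_nlm_status() and the FLK_NLM_* states come from
 * this file; example_nlm_shutdown() and the surrounding control flow are
 * hypothetical.
 */
#if 0
void
example_nlm_shutdown(int nlmid)
{
        /* Phase 1: wake sleepers; the server is recorded as shutting down. */
        cl_flk_set_nlm_status(nlmid, FLK_NLM_SHUTTING_DOWN);

        /* ... wait for the NLM server on "nlmid" to stop ... */

        /* Phase 2: discard every granted lock still held for "nlmid". */
        cl_flk_set_nlm_status(nlmid, FLK_NLM_DOWN);
}
#endif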
3203 
3204 /*
3205  * Set the control status for lock manager requests.
3206  *
3207  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3208  * may be lock requests that have gotten started but not finished.  In
3209  * particular, there may be blocking requests that are in the callback code
3210  * before sleeping (so they're not holding the lock for the graph).  If
3211  * such a thread reacquires the graph's lock (to go to sleep) after
3212  * flk_lockmgr_status is set to a non-up value, it will notice the status
3213  * and bail out.  If the request gets granted before the thread can check
3214  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3215  * we are called with FLK_LOCKMGR_DOWN.
3216  */
3217 
3218 void
3219 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3220 {
3221         int i;
3222         graph_t *gp;
3223         struct flock_globals *fg;
3224 
3225         fg = flk_get_globals();
3226         ASSERT(fg != NULL);
3227 
3228         mutex_enter(&flock_lock);
3229         fg->flk_lockmgr_status = status;
3230         mutex_exit(&flock_lock);
3231 
3232         /*
3233          * If the lock manager is coming back up, all that's needed is to
3234          * propagate this information to the graphs.  If the lock manager
3235          * is going down, additional action is required, and each graph's
3236          * copy of the state is updated atomically with this other action.
3237          */
3238         switch (status) {
3239         case FLK_LOCKMGR_UP:
3240                 for (i = 0; i < HASH_SIZE; i++) {
3241                         mutex_enter(&flock_lock);
3242                         gp = lock_graph[i];
3243                         mutex_exit(&flock_lock);
3244                         if (gp == NULL)
3245                                 continue;
3246                         mutex_enter(&gp->gp_mutex);
3247                         fg->lockmgr_status[i] = status;
3248                         mutex_exit(&gp->gp_mutex);
3249                 }
3250                 break;
3251         case FLK_WAKEUP_SLEEPERS:
3252                 wakeup_sleeping_lockmgr_locks(fg);
3253                 break;
3254         case FLK_LOCKMGR_DOWN:
3255                 unlock_lockmgr_granted(fg);
3256                 break;
3257         default:
3258                 panic("flk_set_lockmgr_status: bad status (%d)", status);
3259                 break;
3260         }
3261 }
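
/*
 * A non-compiled, caller-side sketch of taking the local lock manager down
 * and bringing it back up, following the ordering implied by the comment
 * above.  Only flk_set_lockmgr_status() and the status values come from
 * this file; example_lockmgr_restart() is hypothetical.
 */
#if 0
void
example_lockmgr_restart(void)
{
        /* Wake all sleeping lock manager requests so they can bail out. */
        flk_set_lockmgr_status(FLK_WAKEUP_SLEEPERS);

        /* Flush every granted lock manager lock. */
        flk_set_lockmgr_status(FLK_LOCKMGR_DOWN);

        /* ... restart the lock manager ... */

        flk_set_lockmgr_status(FLK_LOCKMGR_UP);
}
#endif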
3262 
3263 /*
3264  * This routine returns all the locks that are active or sleeping and are
3265  * associated with a particular set of identifiers.  If lock_state != 0, then
3266  * only locks that match the lock_state are returned. If lock_state == 0, then
3267  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3268  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3269  * vnode pointer is ignored.
3270  *
3271  * A list containing the vnode pointer and an flock structure
3272  * describing the lock is returned.  Each element in the list is
3273  * dynamically allocated and must be freed by the caller.  The
3274  * last item in the list is denoted by a NULL value in the ll_next
3275  * field.
3276  *
3277  * The vnode pointers returned are held.  The caller is responsible
3278  * for releasing these.  Note that the returned list is only a snapshot of
3279  * the current lock information, and that it is a snapshot of a moving
3280  * target (only one graph is locked at a time).
3281  */
3282 
3283 locklist_t *
3284 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3285                 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3286 {
3287         lock_descriptor_t       *lock;
3288         lock_descriptor_t       *graph_head;
3289         locklist_t              listhead;
3290         locklist_t              *llheadp;
3291         locklist_t              *llp;
3292         locklist_t              *lltp;
3293         graph_t                 *gp;
3294         int                     i;
3295         int                     first_index; /* graph index */
3296         int                     num_indexes; /* graph index */
3297 
3298         ASSERT((list_type == FLK_ACTIVE_STATE) ||
3299             (list_type == FLK_SLEEPING_STATE));
3300 
3301         /*
3302          * Get a pointer to something to use as a list head while building
3303          * the rest of the list.
3304          */
3305         llheadp = &listhead;
3306         lltp = llheadp;
3307         llheadp->ll_next = (locklist_t *)NULL;
3308 
3309         /* Figure out which graphs we want to look at. */
3310         if (vp == NULL) {
3311                 first_index = 0;
3312                 num_indexes = HASH_SIZE;
3313         } else {
3314                 first_index = HASH_INDEX(vp);
3315                 num_indexes = 1;
3316         }
3317 
3318         for (i = first_index; i < first_index + num_indexes; i++) {
3319                 mutex_enter(&flock_lock);
3320                 gp = lock_graph[i];
3321                 mutex_exit(&flock_lock);
3322                 if (gp == NULL) {
3323                         continue;
3324                 }
3325 
3326                 mutex_enter(&gp->gp_mutex);
3327                 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3328                         ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3329                 for (lock = graph_head->l_next;
3330                     lock != graph_head;
3331                     lock = lock->l_next) {
3332                         if (use_sysid && lock->l_flock.l_sysid != sysid)
3333                                 continue;
3334                         if (pid != NOPID && lock->l_flock.l_pid != pid)
3335                                 continue;
3336                         if (vp != NULL && lock->l_vnode != vp)
3337                                 continue;
3338                         if (lock_state && !(lock_state & lock->l_state))
3339                                 continue;
3340                         if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3341                                 continue;
3342                         /*
3343                          * A matching lock was found.  Allocate
3344                          * space for a new locklist entry and fill
3345                          * it in.
3346                          */
3347                         llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3348                         lltp->ll_next = llp;
3349                         VN_HOLD(lock->l_vnode);
3350                         llp->ll_vp = lock->l_vnode;
3351                         create_flock(lock, &(llp->ll_flock));
3352                         llp->ll_next = (locklist_t *)NULL;
3353                         lltp = llp;
3354                 }
3355                 mutex_exit(&gp->gp_mutex);
3356         }
3357 
3358         llp = llheadp->ll_next;
3359         return (llp);
3360 }
3361 
3362 /*
3363  * These two functions are simply interfaces to get_lock_list.  They return
3364  * a list of sleeping or active locks for the given sysid and pid.  See
3365  * get_lock_list for details.
3366  *
3367  * In either case we don't particularly care to specify the zone of interest;
3368  * the sysid-space is global across zones, so the sysid will map to exactly one
3369  * zone, and we'll return information for that zone.
3370  */
3371 
3372 locklist_t *
3373 flk_get_sleeping_locks(int sysid, pid_t pid)
3374 {
3375         return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3376                     ALL_ZONES));
3377 }
3378 
3379 locklist_t *
3380 flk_get_active_locks(int sysid, pid_t pid)
3381 {
3382         return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3383                     ALL_ZONES));
3384 }
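
/*
 * A non-compiled, caller-side sketch of consuming the snapshot returned by
 * the interfaces above.  flk_free_locklist() (defined later in this file)
 * drops the vnode holds and frees each entry.  example_dump_active() is
 * hypothetical.
 */
#if 0
void
example_dump_active(int sysid, pid_t pid)
{
        locklist_t *head, *llp;

        head = flk_get_active_locks(sysid, pid);
        for (llp = head; llp != NULL; llp = llp->ll_next) {
                /* llp->ll_vp is held; llp->ll_flock describes the lock */
        }
        flk_free_locklist(head);
}
#endif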
3385 
3386 /*
3387  * Another interface to get_lock_list.  This one returns all the active
3388  * locks for a given vnode.  Again, see get_lock_list for details.
3389  *
3390  * We don't need to specify which zone's locks we're interested in.  The matter
3391  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3392  * be used by multiple zones, so the list of locks will all be from the right
3393  * zone.
3394  */
3395 
3396 locklist_t *
3397 flk_active_locks_for_vp(const vnode_t *vp)
3398 {
3399         return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3400                     ALL_ZONES));
3401 }
3402 
3403 /*
3404  * Another interface to get_lock_list.  This one returns all the active
3405  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3406  *
3407  * See the comment for flk_active_locks_for_vp() for why we don't care to
3408  * specify the particular zone of interest.
3409  */
3410 locklist_t *
3411 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3412 {
3413         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3414                                 NOPID, vp, ALL_ZONES));
3415 }
3416 
3417 /*
3418  * Another interface to get_lock_list.  This one returns all the active
3419  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3420  *
3421  * The zone doesn't need to be specified here; the locks held by a
3422  * particular process will either be local (ie, non-NFS) or from the zone
3423  * the process is executing in.  This is because other parts of the system
3424  * ensure that an NFS vnode can't be used in a zone other than that in
3425  * which it was opened.
3426  */
3427 locklist_t *
3428 flk_active_nbmand_locks(pid_t pid)
3429 {
3430         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3431                                 pid, NULL, ALL_ZONES));
3432 }
3433 
3434 /*
3435  * Free up all entries in the locklist.
3436  */
3437 void
3438 flk_free_locklist(locklist_t *llp)
3439 {
3440         locklist_t *next_llp;
3441 
3442         while (llp) {
3443                 next_llp = llp->ll_next;
3444                 VN_RELE(llp->ll_vp);
3445                 kmem_free(llp, sizeof (*llp));
3446                 llp = next_llp;
3447         }
3448 }
3449 
3450 static void
3451 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3452 {
3453         /*
3454          * For each graph "lg" in the hash table lock_graph do
3455          * a.  Get the list of sleeping locks
3456          * b.  For each lock descriptor in the list do
3457          *      i.   If the requested lock is an NLM server request AND
3458          *              the nlmid is the same as the routine argument then
3459          *              change the lock descriptor's state field to
3460          *              "nlm_state."
3461          * c.  Get the list of active locks
3462          * d.  For each lock descriptor in the list do
3463          *      i.   If the requested lock is an NLM server request AND
3464          *              the nlmid is the same as the routine argument then
3465          *              change the lock descriptor's state field to
3466          *              "nlm_state."
3467          */
3468 
3469         int                     i;
3470         graph_t                 *gp;                    /* lock graph */
3471         lock_descriptor_t       *lock;                  /* lock */
3472         lock_descriptor_t       *nlock = NULL;          /* next lock */
3473         int                     lock_nlmid;
3474 
3475         for (i = 0; i < HASH_SIZE; i++) {
3476                 mutex_enter(&flock_lock);
3477                 gp = lock_graph[i];
3478                 mutex_exit(&flock_lock);
3479                 if (gp == NULL) {
3480                         continue;
3481                 }
3482 
3483                 /* Get list of sleeping locks in current lock graph. */
3484                 mutex_enter(&gp->gp_mutex);
3485                 for (lock = SLEEPING_HEAD(gp)->l_next;
3486                     lock != SLEEPING_HEAD(gp);
3487                     lock = nlock) {
3488                         nlock = lock->l_next;
3489                         /* get NLM id */
3490                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3491 
3492                         /*
3493                          * If NLM server request AND nlmid of lock matches
3494                          * nlmid of argument, then set the NLM state of the
3495                          * lock to "nlm_state."
3496                          */
3497                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3498                                 SET_NLM_STATE(lock, nlm_state);
3499                         }
3500                 }
3501 
3502                 /* Get list of active locks in current lock graph. */
3503                 for (lock = ACTIVE_HEAD(gp)->l_next;
3504                     lock != ACTIVE_HEAD(gp);
3505                     lock = nlock) {
3506                         nlock = lock->l_next;
3507                         /* get NLM id */
3508                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3509 
3510                         /*
3511                          * If NLM server request AND nlmid of lock matches
3512                          * nlmid of argument, then set the NLM state of the
3513                          * lock to "nlm_state."
3514                          */
3515                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3516                                 ASSERT(IS_ACTIVE(lock));
3517                                 SET_NLM_STATE(lock, nlm_state);
3518                         }
3519                 }
3520                 mutex_exit(&gp->gp_mutex);
3521         }
3522 }
3523 
3524 /*
3525  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3526  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3527  *   identified by "nlmid." Poke those lock requests.
3528  */
3529 static void
3530 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3531 {
3532         lock_descriptor_t *lock;
3533         lock_descriptor_t *nlock = NULL; /* next lock */
3534         int i;
3535         graph_t *gp;
3536         int     lock_nlmid;
3537 
3538         for (i = 0; i < HASH_SIZE; i++) {
3539                 mutex_enter(&flock_lock);
3540                 gp = lock_graph[i];
3541                 mutex_exit(&flock_lock);
3542                 if (gp == NULL) {
3543                         continue;
3544                 }
3545 
3546                 mutex_enter(&gp->gp_mutex);
3547                 for (lock = SLEEPING_HEAD(gp)->l_next;
3548                     lock != SLEEPING_HEAD(gp);
3549                     lock = nlock) {
3550                         nlock = lock->l_next;
3551                         /*
3552                          * If NLM server request _and_ nlmid of lock matches
3553                          * nlmid of argument, then set the NLM state of the
3554                          * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3555                          * request.
3556                          */
3557                         if (IS_LOCKMGR(lock)) {
3558                                 /* get NLM id */
3559                                 lock_nlmid =
3560                                         GETNLMID(lock->l_flock.l_sysid);
3561                                 if (nlmid == lock_nlmid) {
3562                                         SET_NLM_STATE(lock,
3563                                                 FLK_NLM_SHUTTING_DOWN);
3564                                         INTERRUPT_WAKEUP(lock);
3565                                 }
3566                         }
3567                 }
3568                 mutex_exit(&gp->gp_mutex);
3569         }
3570 }
3571 
3572 /*
3573  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3574  * Effects:  Find all active (granted) lock manager locks _only_ for the
3575  *   NLM server identified by "nlmid" and release them.
3576  */
3577 static void
3578 cl_flk_unlock_nlm_granted(int nlmid)
3579 {
3580         lock_descriptor_t *lock;
3581         lock_descriptor_t *nlock = NULL; /* next lock */
3582         int i;
3583         graph_t *gp;
3584         int     lock_nlmid;
3585 
3586         for (i = 0; i < HASH_SIZE; i++) {
3587                 mutex_enter(&flock_lock);
3588                 gp = lock_graph[i];
3589                 mutex_exit(&flock_lock);
3590                 if (gp == NULL) {
3591                         continue;
3592                 }
3593 
3594                 mutex_enter(&gp->gp_mutex);
3595                 for (lock = ACTIVE_HEAD(gp)->l_next;
3596                     lock != ACTIVE_HEAD(gp);
3597                     lock = nlock) {
3598                         nlock = lock->l_next;
3599                         ASSERT(IS_ACTIVE(lock));
3600 
3601                         /*
3602                          * If it's an NLM server request _and_ the nlmid
3603                          * of the lock matches the nlmid argument, then
3604                          * remove the active lock from the list, wake up
3605                          * blocked threads, and free the storage for the lock.
3606                          * Note that there's no need to mark the NLM state
3607                          * of this lock to NLM_DOWN because the lock will
3608                          * be deleted anyway and its storage freed.
3609                          */
3610                         if (IS_LOCKMGR(lock)) {
3611                                 /* get NLM id */
3612                                 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3613                                 if (nlmid == lock_nlmid) {
3614                                         flk_delete_active_lock(lock, 0);
3615                                         flk_wakeup(lock, 1);
3616                                         flk_free_lock(lock);
3617                                 }
3618                         }
3619                 }
3620                 mutex_exit(&gp->gp_mutex);
3621         }
3622 }
3623 
3624 /*
3625  * Find all sleeping lock manager requests and poke them.
3626  */
3627 static void
3628 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3629 {
3630         lock_descriptor_t *lock;
3631         lock_descriptor_t *nlock = NULL; /* next lock */
3632         int i;
3633         graph_t *gp;
3634         zoneid_t zoneid = getzoneid();
3635 
3636         for (i = 0; i < HASH_SIZE; i++) {
3637                 mutex_enter(&flock_lock);
3638                 gp = lock_graph[i];
3639                 mutex_exit(&flock_lock);
3640                 if (gp == NULL) {
3641                         continue;
3642                 }
3643 
3644                 mutex_enter(&gp->gp_mutex);
3645                 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3646                 for (lock = SLEEPING_HEAD(gp)->l_next;
3647                     lock != SLEEPING_HEAD(gp);
3648                     lock = nlock) {
3649                         nlock = lock->l_next;
3650                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3651                                 INTERRUPT_WAKEUP(lock);
3652                         }
3653                 }
3654                 mutex_exit(&gp->gp_mutex);
3655         }
3656 }
3657 
3658 
3659 /*
3660  * Find all active (granted) lock manager locks and release them.
3661  */
3662 static void
3663 unlock_lockmgr_granted(struct flock_globals *fg)
3664 {
3665         lock_descriptor_t *lock;
3666         lock_descriptor_t *nlock = NULL; /* next lock */
3667         int i;
3668         graph_t *gp;
3669         zoneid_t zoneid = getzoneid();
3670 
3671         for (i = 0; i < HASH_SIZE; i++) {
3672                 mutex_enter(&flock_lock);
3673                 gp = lock_graph[i];
3674                 mutex_exit(&flock_lock);
3675                 if (gp == NULL) {
3676                         continue;
3677                 }
3678 
3679                 mutex_enter(&gp->gp_mutex);
3680                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3681                 for (lock = ACTIVE_HEAD(gp)->l_next;
3682                     lock != ACTIVE_HEAD(gp);
3683                     lock = nlock) {
3684                         nlock = lock->l_next;
3685                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3686                                 ASSERT(IS_ACTIVE(lock));
3687                                 flk_delete_active_lock(lock, 0);
3688                                 flk_wakeup(lock, 1);
3689                                 flk_free_lock(lock);
3690                         }
3691                 }
3692                 mutex_exit(&gp->gp_mutex);
3693         }
3694 }
3695 /* ONC_PLUS EXTRACT END */
3696 
3697 
3698 /*
3699  * Wait until a lock is granted, cancelled, or interrupted.
3700  */
3701 
3702 static void
3703 wait_for_lock(lock_descriptor_t *request)
3704 {
3705         graph_t *gp = request->l_graph;
3706 
3707         ASSERT(MUTEX_HELD(&gp->gp_mutex));
3708 
3709         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3710             !(IS_INTERRUPTED(request))) {
3711                 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3712                         flk_set_state(request, FLK_INTERRUPTED_STATE);
3713                         request->l_state |= INTERRUPTED_LOCK;
3714                 }
3715         }
3716 }
3717 
3718 /* ONC_PLUS EXTRACT START */
3719 /*
3720  * Create an flock structure from the existing lock information
3721  *
3722  * This routine is used to create flock structures for the lock manager
3723  * to use in a reclaim request.  Since the lock originated on this
3724  * host, it must conform to UNIX semantics, so no checking is
3725  * done to make sure it falls within the lower half of the 32-bit range.
3726  */
3727 
3728 static void
3729 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3730 {
3731         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3732         ASSERT(lp->l_end >= lp->l_start);
3733 
3734         flp->l_type = lp->l_type;
3735         flp->l_whence = 0;
3736         flp->l_start = lp->l_start;
3737         flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3738                 (lp->l_end - lp->l_start + 1);
3739         flp->l_sysid = lp->l_flock.l_sysid;
3740         flp->l_pid = lp->l_flock.l_pid;
3741 }
3742 
3743 /*
3744  * Convert flock64_t data describing a lock range into u_offset_t starting
3745  * and ending points, which are returned through "start" and "end".
3746  * Returns 0 or an errno value.
3747  * Large Files: the caller checks the result against its maximum offset and
3748  * returns EOVERFLOW as defined by the LFS API.
3749  */
3750 
3751 int
3752 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3753     u_offset_t *start, u_offset_t *end, offset_t offset)
3754 {
3755         struct vattr    vattr;
3756         int     error;
3757 
3758         /*
3759          * Determine the starting point of the request
3760          */
3761         switch (flp->l_whence) {
3762         case 0:         /* SEEK_SET */
3763                 *start = (u_offset_t)flp->l_start;
3764                 break;
3765         case 1:         /* SEEK_CUR */
3766                 *start = (u_offset_t)(flp->l_start + offset);
3767                 break;
3768         case 2:         /* SEEK_END */
3769                 vattr.va_mask = AT_SIZE;
3770                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3771                         return (error);
3772                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3773                 break;
3774         default:
3775                 return (EINVAL);
3776         }
3777 
3778         /*
3779          * Determine the range covered by the request.
3780          */
3781         if (flp->l_len == 0)
3782                 *end = MAX_U_OFFSET_T;
3783         else if ((offset_t)flp->l_len > 0) {
3784                 *end = (u_offset_t)(*start + (flp->l_len - 1));
3785         } else {
3786                 /*
3787                  * Negative length; why do we even allow this?
3788                  * Because this allows easy specification of
3789                  * the last n bytes of the file.
3790                  */
3791                 *end = *start;
3792                 *start += (u_offset_t)flp->l_len;
3793                 (*start)++;
3794         }
3795         return (0);
3796 }
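
/*
 * Worked example of the range arithmetic above (values hypothetical):
 *
 *   l_whence = 0 (SEEK_SET), l_start = 100, l_len = 10
 *       start = 100
 *       end   = 100 + (10 - 1) = 109            bytes 100..109
 *
 *   l_whence = 0 (SEEK_SET), l_start = 1000, l_len = -10
 *       end   = 1000                            end is the old start
 *       start = 1000 + (-10) + 1 = 991          the 10 bytes ending at 1000
 *
 *   l_len = 0
 *       end   = MAX_U_OFFSET_T                  lock to end of file
 */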
3797 
3798 /*
3799  * Check the validity of lock data.  This can be used by the NFS
3800  * frlock routines to check data before contacting the server.  The
3801  * server must support semantics that aren't as restrictive as
3802  * the UNIX API, so the NFS client is required to check.
3803  * The maximum is now passed in by the caller.
3804  */
3805 
3806 int
3807 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3808 {
3809         /*
3810          * The end (length) for local locking should never be greater
3811          * than MAXEND. However, the representation for
3812          * the entire file is MAX_U_OFFSET_T.
3813          */
3814         if ((start > max) ||
3815             ((end > max) && (end != MAX_U_OFFSET_T))) {
3816                 return (EINVAL);
3817         }
3818         if (start > end) {
3819                 return (EINVAL);
3820         }
3821         return (0);
3822 }
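
/*
 * Illustrative calls (values hypothetical), assuming the caller passes
 * max == MAXEND, the largest local (signed) offset:
 */
#if 0
ASSERT(flk_check_lock_data(0, MAX_U_OFFSET_T, MAXEND) == 0);    /* whole file */
ASSERT(flk_check_lock_data(100, 199, MAXEND) == 0);             /* bytes 100..199 */
ASSERT(flk_check_lock_data(200, 100, MAXEND) == EINVAL);        /* start > end */
ASSERT(flk_check_lock_data(0, (u_offset_t)MAXEND + 1, MAXEND) == EINVAL);
#endif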
3823 
3824 /*
3825  * Fill in request->l_flock with information about the lock blocking the
3826  * request.  The complexity here is that lock manager requests are allowed
3827  * to see into the upper part of the 32-bit address range, whereas local
3828  * requests are only allowed to see signed values.
3829  *
3830  * What should be done when "blocker" is a lock manager lock that uses the
3831  * upper portion of the 32-bit range, but "request" is local?  Since the
3832  * request has already been determined to have been blocked by the blocker,
3833  * at least some portion of "blocker" must be in the range of the request,
3834  * or the request extends to the end of file.  For the first case, the
3835  * portion in the lower range is returned with the indication that it goes
3836  * "to EOF."  For the second case, the last byte of the lower range is
3837  * returned with the indication that it goes "to EOF."
3838  */
3839 
3840 static void
3841 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3842 {
3843         flock64_t *flrp;                        /* l_flock portion of request */
3844 
3845         ASSERT(blocker != NULL);
3846 
3847         flrp = &request->l_flock;
3848         flrp->l_whence = 0;
3849         flrp->l_type = blocker->l_type;
3850         flrp->l_pid = blocker->l_flock.l_pid;
3851         flrp->l_sysid = blocker->l_flock.l_sysid;
3852 
3853         if (IS_LOCKMGR(request)) {
3854                 flrp->l_start = blocker->l_start;
3855                 if (blocker->l_end == MAX_U_OFFSET_T)
3856                         flrp->l_len = 0;
3857                 else
3858                         flrp->l_len = blocker->l_end - blocker->l_start + 1;
3859         } else {
3860                 if (blocker->l_start > MAXEND) {
3861                         flrp->l_start = MAXEND;
3862                         flrp->l_len = 0;
3863                 } else {
3864                         flrp->l_start = blocker->l_start;
3865                         if (blocker->l_end == MAX_U_OFFSET_T)
3866                                 flrp->l_len = 0;
3867                         else
3868                                 flrp->l_len = blocker->l_end -
3869                                         blocker->l_start + 1;
3870                 }
3871         }
3872 }
3873 /* ONC_PLUS EXTRACT END */
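
/*
 * Worked example of the clamping performed by report_blocker() above for a
 * local (non-lock-manager) request (values hypothetical):
 *
 *   blocker->l_start = MAXEND + 10 (entirely in the upper range):
 *       flrp->l_start = MAXEND, flrp->l_len = 0    reported as "MAXEND to EOF"
 *
 *   blocker->l_start = 100, blocker->l_end = MAX_U_OFFSET_T:
 *       flrp->l_start = 100, flrp->l_len = 0       a lock to end of file is
 *                                                  reported with length 0
 */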
3874 
3875 /*
3876  * PSARC case 1997/292
3877  */
3878 /*
3879  * This is the public routine exported by flock.h.
3880  */
3881 void
3882 cl_flk_change_nlm_state_to_unknown(int nlmid)
3883 {
3884         /*
3885          * Check to see if node is booted as a cluster. If not, return.
3886          */
3887         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3888                 return;
3889         }
3890 
3891         /*
3892          * See comment in cl_flk_set_nlm_status().
3893          */
3894         if (nlm_reg_status == NULL) {
3895                 return;
3896         }
3897 
3898         /*
3899  * Protect the NLM registry state with a mutex.
3900          */
3901         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3902         mutex_enter(&nlm_reg_lock);
3903         FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3904         mutex_exit(&nlm_reg_lock);
3905 }
3906 
3907 /*
3908  * Return non-zero if the given I/O request conflicts with an active NBMAND
3909  * lock.
3910  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3911  * locks.
3912  */
3913 
3914 int
3915 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3916                 ssize_t length, int svmand, caller_context_t *ct)
3917 {
3918         int conflict = 0;
3919         graph_t                 *gp;
3920         lock_descriptor_t       *lock;
3921         pid_t pid;
3922         int sysid;
3923 
3924         if (ct == NULL) {
3925                 pid = curproc->p_pid;
3926                 sysid = 0;
3927         } else {
3928                 pid = ct->cc_pid;
3929                 sysid = ct->cc_sysid;
3930         }
3931 
3932         mutex_enter(&flock_lock);
3933         gp = lock_graph[HASH_INDEX(vp)];
3934         mutex_exit(&flock_lock);
3935         if (gp == NULL)
3936                 return (0);
3937 
3938         mutex_enter(&gp->gp_mutex);
3939         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3940 
3941         for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3942                 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3943                     (lock->l_flock.l_sysid != sysid ||
3944                     lock->l_flock.l_pid != pid) &&
3945                     lock_blocks_io(op, offset, length,
3946                                 lock->l_type, lock->l_start, lock->l_end)) {
3947                         conflict = 1;
3948                         break;
3949                 }
3950         }
3951         mutex_exit(&gp->gp_mutex);
3952 
3953         return (conflict);
3954 }
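
/*
 * A non-compiled, caller-side sketch: before a 512-byte write at offset 0,
 * a file system can ask whether a mandatory lock held by another owner
 * blocks the I/O.  Only nbl_lock_conflict() and its argument types come
 * from this file; example_write_check() and its error handling are
 * hypothetical.
 */
#if 0
static int
example_write_check(vnode_t *vp, caller_context_t *ct)
{
        if (nbl_lock_conflict(vp, NBL_WRITE, 0, 512, 0, ct))
                return (EACCES);        /* a conflicting NBMAND lock exists */
        return (0);
}
#endif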
3955 
3956 /*
3957  * Return non-zero if the given I/O request conflicts with the given lock.
3958  */
3959 
3960 static int
3961 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3962             int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3963 {
3964         ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3965         ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3966 
3967         if (op == NBL_READ && lock_type == F_RDLCK)
3968                 return (0);
3969 
3970         if (offset <= lock_start && lock_start < offset + length)
3971                 return (1);
3972         if (lock_start <= offset && offset <= lock_end)
3973                 return (1);
3974 
3975         return (0);
3976 }
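
/*
 * Worked example for lock_blocks_io() above: an I/O request covering bytes
 * offset = 100 .. offset + length - 1 = 149 (values hypothetical):
 *
 *   lock 120..200, F_WRLCK:  100 <= 120 && 120 < 150          -> conflict
 *   lock  50..100, F_WRLCK:  50 <= 100 && 100 <= 100          -> conflict
 *   lock 150..200, F_WRLCK:  neither test holds (150 >= 150)  -> no conflict
 *   lock 120..200, F_RDLCK, op == NBL_READ                    -> no conflict
 */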
3977 
3978 #ifdef DEBUG
3979 static void
3980 check_active_locks(graph_t *gp)
3981 {
3982         lock_descriptor_t *lock, *lock1;
3983         edge_t  *ep;
3984 
3985         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3986                                                 lock = lock->l_next) {
3987                 ASSERT(IS_ACTIVE(lock));
3988                 ASSERT(NOT_BLOCKED(lock));
3989                 ASSERT(!IS_BARRIER(lock));
3990 
3991                 ep = FIRST_IN(lock);
3992 
3993                 while (ep != HEAD(lock)) {
3994                         ASSERT(IS_SLEEPING(ep->from_vertex));
3995                         ASSERT(!NOT_BLOCKED(ep->from_vertex));
3996                         ep = NEXT_IN(ep);
3997                 }
3998 
3999                 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4000                                         lock1 = lock1->l_next) {
4001                         if (lock1->l_vnode == lock->l_vnode) {
4002                                 if (BLOCKS(lock1, lock)) {
4003                                         cmn_err(CE_PANIC,
4004                                             "active lock %p blocks %p",
4005                                             (void *)lock1, (void *)lock);
4006                                 } else if (BLOCKS(lock, lock1)) {
4007                                         cmn_err(CE_PANIC,
4008                                             "active lock %p blocks %p",
4009                                             (void *)lock, (void *)lock1);
4010                                 }
4011                         }
4012                 }
4013         }
4014 }
4015 
4016 /*
4017  * Effect: This function checks to see if the transition from 'old_state' to
4018  *      'new_state' is a valid one.  It returns 0 if the transition is valid
4019  *      and 1 if it is not.
4020  *      For a map of valid transitions, see sys/flock_impl.h
4021  */
4022 static int
4023 check_lock_transition(int old_state, int new_state)
4024 {
4025         switch (old_state) {
4026         case FLK_INITIAL_STATE:
4027                 if ((new_state == FLK_START_STATE) ||
4028                     (new_state == FLK_SLEEPING_STATE) ||
4029                     (new_state == FLK_ACTIVE_STATE) ||
4030                     (new_state == FLK_DEAD_STATE)) {
4031                         return (0);
4032                 } else {
4033                         return (1);
4034                 }
4035         case FLK_START_STATE:
4036                 if ((new_state == FLK_ACTIVE_STATE) ||
4037                     (new_state == FLK_DEAD_STATE)) {
4038                         return (0);
4039                 } else {
4040                         return (1);
4041                 }
4042         case FLK_ACTIVE_STATE:
4043                 if (new_state == FLK_DEAD_STATE) {
4044                         return (0);
4045                 } else {
4046                         return (1);
4047                 }
4048         case FLK_SLEEPING_STATE:
4049                 if ((new_state == FLK_GRANTED_STATE) ||
4050                     (new_state == FLK_INTERRUPTED_STATE) ||
4051                     (new_state == FLK_CANCELLED_STATE)) {
4052                         return (0);
4053                 } else {
4054                         return (1);
4055                 }
4056         case FLK_GRANTED_STATE:
4057                 if ((new_state == FLK_START_STATE) ||
4058                     (new_state == FLK_INTERRUPTED_STATE) ||
4059                     (new_state == FLK_CANCELLED_STATE)) {
4060                         return (0);
4061                 } else {
4062                         return (1);
4063                 }
4064         case FLK_CANCELLED_STATE:
4065                 if ((new_state == FLK_INTERRUPTED_STATE) ||
4066                     (new_state == FLK_DEAD_STATE)) {
4067                         return (0);
4068                 } else {
4069                         return (1);
4070                 }
4071         case FLK_INTERRUPTED_STATE:
4072                 if (new_state == FLK_DEAD_STATE) {
4073                         return (0);
4074                 } else {
4075                         return (1);
4076                 }
4077         case FLK_DEAD_STATE:
4078                 /* May be set more than once */
4079                 if (new_state == FLK_DEAD_STATE) {
4080                         return (0);
4081                 } else {
4082                         return (1);
4083                 }
4084         default:
4085                 return (1);
4086         }
4087 }
4088 
4089 static void
4090 check_sleeping_locks(graph_t *gp)
4091 {
4092         lock_descriptor_t *lock1, *lock2;
4093         edge_t *ep;
4094         for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4095             lock1 = lock1->l_next) {
4096                 ASSERT(!IS_BARRIER(lock1));
4097                 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4098                     lock2 = lock2->l_next) {
4099                         if (lock1->l_vnode == lock2->l_vnode) {
4100                                 if (BLOCKS(lock2, lock1)) {
4101                                         ASSERT(!IS_GRANTED(lock1));
4102                                         ASSERT(!NOT_BLOCKED(lock1));
4103                                         path(lock1, lock2);
4104                                 }
4105                         }
4106                 }
4107 
4108                 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4109                     lock2 = lock2->l_next) {
4110                         ASSERT(!IS_BARRIER(lock1));
4111                         if (lock1->l_vnode == lock2->l_vnode) {
4112                                 if (BLOCKS(lock2, lock1)) {
4113                                         ASSERT(!IS_GRANTED(lock1));
4114                                         ASSERT(!NOT_BLOCKED(lock1));
4115                                         path(lock1, lock2);
4116                                 }
4117                         }
4118                 }
4119                 ep = FIRST_ADJ(lock1);
4120                 while (ep != HEAD(lock1)) {
4121                         ASSERT(BLOCKS(ep->to_vertex, lock1));
4122                         ep = NEXT_ADJ(ep);
4123                 }
4124         }
4125 }
4126 
4127 static int
4128 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4129 {
4130         edge_t  *ep;
4131         lock_descriptor_t       *vertex;
4132         lock_descriptor_t *vertex_stack;
4133 
4134         STACK_INIT(vertex_stack);
4135 
4136         flk_graph_uncolor(lock1->l_graph);
4137         ep = FIRST_ADJ(lock1);
4138         ASSERT(ep != HEAD(lock1));
4139         while (ep != HEAD(lock1)) {
4140                 if (no_path)
4141                         ASSERT(ep->to_vertex != lock2);
4142                 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4143                 COLOR(ep->to_vertex);
4144                 ep = NEXT_ADJ(ep);
4145         }
4146 
4147         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4148                 STACK_POP(vertex_stack, l_dstack);
4149                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4150                                                 ep = NEXT_ADJ(ep)) {
4151                         if (COLORED(ep->to_vertex))
4152                                 continue;
4153                         COLOR(ep->to_vertex);
4154                         if (ep->to_vertex == lock2)
4155                                 return (1);
4156 
4157                         STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4158                 }
4159         }
4160         return (0);
4161 }
4162 
4163 static void
4164 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4165 {
4166         lock_descriptor_t *lock;
4167 
4168         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4169 
4170         if (lock) {
4171                 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4172                         if (lock->l_flock.l_pid == pid &&
4173                             lock->l_flock.l_sysid == sysid)
4174                                 cmn_err(CE_PANIC,
4175                                     "owner pid %d's lock %p in active queue",
4176                                     pid, (void *)lock);
4177                         lock = lock->l_next;
4178                 }
4179         }
4180         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4181 
4182         if (lock) {
4183                 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4184                         if (lock->l_flock.l_pid == pid &&
4185                             lock->l_flock.l_sysid == sysid)
4186                                 cmn_err(CE_PANIC,
4187                                     "owner pid %d's lock %p in sleep queue",
4188                                     pid, (void *)lock);
4189                         lock = lock->l_next;
4190                 }
4191         }
4192 }
4193 
4194 static int
4195 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4196 {
4197         edge_t *ep = FIRST_ADJ(lock1);
4198 
4199         while (ep != HEAD(lock1)) {
4200                 if (ep->to_vertex == lock2)
4201                         return (1);
4202                 else
4203                         ep = NEXT_ADJ(ep);
4204         }
4205         return (0);
4206 }
4207 
4208 static int
4209 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4210 {
4211         return (!level_two_path(lock1, lock2, 1));
4212 }
4213 
4214 static void
4215 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4216 {
4217         if (level_one_path(lock1, lock2)) {
4218                 if (level_two_path(lock1, lock2, 0) != 0) {
4219                         cmn_err(CE_WARN,
4220                             "one edge one path from lock1 %p lock2 %p",
4221                             (void *)lock1, (void *)lock2);
4222                 }
4223         } else if (no_path(lock1, lock2)) {
4224                 cmn_err(CE_PANIC,
4225                     "No path from lock1 %p to lock2 %p",
4226                     (void *)lock1, (void *)lock2);
4227         }
4228 }
4229 #endif /* DEBUG */