5045 use atomic_{inc,dec}_* instead of atomic_add_*
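The change itself is mechanical: each reference-count style atomic_add_32(&counter, 1) and atomic_add_32(&counter, -1) call becomes atomic_inc_32(&counter) or atomic_dec_32(&counter) from <sys/atomic.h>. A minimal sketch of the pattern follows; the counter and function names here are illustrative only, not taken from htable.c (which applies the same rewrite to active_ptables and htable_dont_cache):

#include <sys/types.h>
#include <sys/atomic.h>

/* Illustrative counter, standing in for active_ptables et al. */
static uint32_t example_cnt;

void
example_hold(void)
{
	/* was: atomic_add_32(&example_cnt, 1); */
	atomic_inc_32(&example_cnt);
}

void
example_rele(void)
{
	/* was: atomic_add_32(&example_cnt, -1); */
	atomic_dec_32(&example_cnt);
}

The dedicated increment/decrement forms state the intent directly and avoid passing a signed -1 delta for an unsigned 32-bit counter.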
--- old/usr/src/uts/i86pc/vm/htable.c
+++ new/usr/src/uts/i86pc/vm/htable.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 #include <sys/types.h>
27 27 #include <sys/sysmacros.h>
28 28 #include <sys/kmem.h>
29 29 #include <sys/atomic.h>
30 30 #include <sys/bitmap.h>
31 31 #include <sys/machparam.h>
32 32 #include <sys/machsystm.h>
33 33 #include <sys/mman.h>
34 34 #include <sys/systm.h>
35 35 #include <sys/cpuvar.h>
36 36 #include <sys/thread.h>
37 37 #include <sys/proc.h>
38 38 #include <sys/cpu.h>
39 39 #include <sys/kmem.h>
40 40 #include <sys/disp.h>
41 41 #include <sys/vmem.h>
42 42 #include <sys/vmsystm.h>
43 43 #include <sys/promif.h>
44 44 #include <sys/var.h>
45 45 #include <sys/x86_archext.h>
46 46 #include <sys/archsystm.h>
47 47 #include <sys/bootconf.h>
48 48 #include <sys/dumphdr.h>
49 49 #include <vm/seg_kmem.h>
50 50 #include <vm/seg_kpm.h>
51 51 #include <vm/hat.h>
52 52 #include <vm/hat_i86.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/panic.h>
55 55
56 56 #ifdef __xpv
57 57 #include <sys/hypervisor.h>
58 58 #include <sys/xpv_panic.h>
59 59 #endif
60 60
61 61 #include <sys/bootinfo.h>
62 62 #include <vm/kboot_mmu.h>
63 63
64 64 static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);
65 65
66 66 kmem_cache_t *htable_cache;
67 67
68 68 /*
69 69 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
70 70 * is used in order to facilitate testing of the htable_steal() code.
71 71 * By resetting htable_reserve_amount to a lower value, we can force
72 72 * stealing to occur. The reserve amount is a guess to get us through boot.
73 73 */
74 74 #define HTABLE_RESERVE_AMOUNT (200)
75 75 uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
76 76 kmutex_t htable_reserve_mutex;
77 77 uint_t htable_reserve_cnt;
78 78 htable_t *htable_reserve_pool;
79 79
80 80 /*
81 81 * Used to hand test htable_steal().
82 82 */
83 83 #ifdef DEBUG
84 84 ulong_t force_steal = 0;
85 85 ulong_t ptable_cnt = 0;
86 86 #endif
87 87
88 88 /*
89 89 * This variable is so that we can tune this via /etc/system
90 90 * Any value works, but a power of two <= mmu.ptes_per_table is best.
91 91 */
92 92 uint_t htable_steal_passes = 8;
93 93
94 94 /*
95 95 * mutex stuff for access to htable hash
96 96 */
97 97 #define NUM_HTABLE_MUTEX 128
98 98 kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
99 99 #define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))
100 100
101 101 #define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
102 102 #define HTABLE_EXIT(h) mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
103 103
104 104 /*
105 105 * forward declarations
106 106 */
107 107 static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
108 108 static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
109 109 static void htable_free(htable_t *ht);
110 110 static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
111 111 static void x86pte_release_pagetable(htable_t *ht);
112 112 static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
113 113 x86pte_t new);
114 114
115 115 /*
116 116 * A counter to track if we are stealing or reaping htables. When non-zero
117 117 * htable_free() will directly free htables (either to the reserve or kmem)
118 118 * instead of putting them in a hat's htable cache.
119 119 */
120 120 uint32_t htable_dont_cache = 0;
121 121
122 122 /*
123 123 * Track the number of active pagetables, so we can know how many to reap
124 124 */
125 125 static uint32_t active_ptables = 0;
126 126
127 127 #ifdef __xpv
128 128 /*
129 129 * Deal with hypervisor complications.
130 130 */
131 131 void
132 132 xen_flush_va(caddr_t va)
133 133 {
134 134 struct mmuext_op t;
135 135 uint_t count;
136 136
137 137 if (IN_XPV_PANIC()) {
138 138 mmu_tlbflush_entry((caddr_t)va);
139 139 } else {
140 140 t.cmd = MMUEXT_INVLPG_LOCAL;
141 141 t.arg1.linear_addr = (uintptr_t)va;
142 142 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
143 143 panic("HYPERVISOR_mmuext_op() failed");
144 144 ASSERT(count == 1);
145 145 }
146 146 }
147 147
148 148 void
149 149 xen_gflush_va(caddr_t va, cpuset_t cpus)
150 150 {
151 151 struct mmuext_op t;
152 152 uint_t count;
153 153
154 154 if (IN_XPV_PANIC()) {
155 155 mmu_tlbflush_entry((caddr_t)va);
156 156 return;
157 157 }
158 158
159 159 t.cmd = MMUEXT_INVLPG_MULTI;
160 160 t.arg1.linear_addr = (uintptr_t)va;
161 161 /*LINTED: constant in conditional context*/
162 162 set_xen_guest_handle(t.arg2.vcpumask, &cpus);
163 163 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
164 164 panic("HYPERVISOR_mmuext_op() failed");
165 165 ASSERT(count == 1);
166 166 }
167 167
168 168 void
169 169 xen_flush_tlb()
170 170 {
171 171 struct mmuext_op t;
172 172 uint_t count;
173 173
174 174 if (IN_XPV_PANIC()) {
175 175 xpv_panic_reload_cr3();
176 176 } else {
177 177 t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
178 178 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
179 179 panic("HYPERVISOR_mmuext_op() failed");
180 180 ASSERT(count == 1);
181 181 }
182 182 }
183 183
184 184 void
185 185 xen_gflush_tlb(cpuset_t cpus)
186 186 {
187 187 struct mmuext_op t;
188 188 uint_t count;
189 189
190 190 ASSERT(!IN_XPV_PANIC());
191 191 t.cmd = MMUEXT_TLB_FLUSH_MULTI;
192 192 /*LINTED: constant in conditional context*/
193 193 set_xen_guest_handle(t.arg2.vcpumask, &cpus);
194 194 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
195 195 panic("HYPERVISOR_mmuext_op() failed");
196 196 ASSERT(count == 1);
197 197 }
198 198
199 199 /*
200 200 * Install/Adjust a kpm mapping under the hypervisor.
201 201 * Value of "how" should be:
202 202 * PT_WRITABLE | PT_VALID - regular kpm mapping
203 203 * PT_VALID - make mapping read-only
204 204 * 0 - remove mapping
205 205 *
206 206 * returns 0 on success. non-zero for failure.
207 207 */
208 208 int
209 209 xen_kpm_page(pfn_t pfn, uint_t how)
210 210 {
211 211 paddr_t pa = mmu_ptob((paddr_t)pfn);
212 212 x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;
213 213
214 214 if (kpm_vbase == NULL)
215 215 return (0);
216 216
217 217 if (how)
218 218 pte |= pa_to_ma(pa) | how;
219 219 else
220 220 pte = 0;
221 221 return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
222 222 pte, UVMF_INVLPG | UVMF_ALL));
223 223 }
224 224
225 225 void
226 226 xen_pin(pfn_t pfn, level_t lvl)
227 227 {
228 228 struct mmuext_op t;
229 229 uint_t count;
230 230
231 231 t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
232 232 t.arg1.mfn = pfn_to_mfn(pfn);
233 233 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
234 234 panic("HYPERVISOR_mmuext_op() failed");
235 235 ASSERT(count == 1);
236 236 }
237 237
238 238 void
239 239 xen_unpin(pfn_t pfn)
240 240 {
241 241 struct mmuext_op t;
242 242 uint_t count;
243 243
244 244 t.cmd = MMUEXT_UNPIN_TABLE;
245 245 t.arg1.mfn = pfn_to_mfn(pfn);
246 246 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
247 247 panic("HYPERVISOR_mmuext_op() failed");
248 248 ASSERT(count == 1);
249 249 }
250 250
251 251 static void
252 252 xen_map(uint64_t pte, caddr_t va)
253 253 {
254 254 if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
255 255 UVMF_INVLPG | UVMF_LOCAL))
256 256 panic("HYPERVISOR_update_va_mapping() failed");
257 257 }
258 258 #endif /* __xpv */
259 259
260 260 /*
261 261 * Allocate a memory page for a hardware page table.
262 262 *
263 263 * A wrapper around page_get_physical(), with some extra checks.
264 264 */
265 265 static pfn_t
266 266 ptable_alloc(uintptr_t seed)
267 267 {
268 268 pfn_t pfn;
269 269 page_t *pp;
270 270
271 271 pfn = PFN_INVALID;
272 272
273 273 /*
274 274 * The first check is to see if there is memory in the system. If we
275 275 * drop to throttlefree, then fail the ptable_alloc() and let the
276 276 * stealing code kick in. Note that we have to do this test here,
277 277 * since the test in page_create_throttle() would let the NOSLEEP
278 278 * allocation go through and deplete the page reserves.
279 279 *
280 280 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
281 281 */
282 282 if (!NOMEMWAIT() && freemem <= throttlefree + 1)
283 283 return (PFN_INVALID);
284 284
285 285 #ifdef DEBUG
286 286 /*
287 287 * This code makes htable_steal() easier to test. By setting
288 288 * force_steal we force pagetable allocations to fall
289 289 * into the stealing code. Roughly 1 in every "force_steal"
290 290 * page table allocations will fail.
291 291 */
292 292 if (proc_pageout != NULL && force_steal > 1 &&
293 293 ++ptable_cnt > force_steal) {
294 294 ptable_cnt = 0;
295 295 return (PFN_INVALID);
296 296 }
297 297 #endif /* DEBUG */
298 298
299 299 pp = page_get_physical(seed);
300 300 if (pp == NULL)
301 301 return (PFN_INVALID);
302 302 ASSERT(PAGE_SHARED(pp));
303 303 pfn = pp->p_pagenum;
304 304 if (pfn == PFN_INVALID)
305 305 panic("ptable_alloc(): Invalid PFN!!");
306 - atomic_add_32(&active_ptables, 1);
306 + atomic_inc_32(&active_ptables);
307 307 HATSTAT_INC(hs_ptable_allocs);
308 308 return (pfn);
309 309 }
310 310
311 311 /*
312 312 * Free an htable's associated page table page. See the comments
313 313 * for ptable_alloc().
314 314 */
315 315 static void
316 316 ptable_free(pfn_t pfn)
317 317 {
318 318 page_t *pp = page_numtopp_nolock(pfn);
319 319
320 320 /*
321 321 * need to destroy the page used for the pagetable
322 322 */
323 323 ASSERT(pfn != PFN_INVALID);
324 324 HATSTAT_INC(hs_ptable_frees);
325 - atomic_add_32(&active_ptables, -1);
325 + atomic_dec_32(&active_ptables);
326 326 if (pp == NULL)
327 327 panic("ptable_free(): no page for pfn!");
328 328 ASSERT(PAGE_SHARED(pp));
329 329 ASSERT(pfn == pp->p_pagenum);
330 330 ASSERT(!IN_XPV_PANIC());
331 331
332 332 /*
333 333 * Get an exclusive lock, might have to wait for a kmem reader.
334 334 */
335 335 if (!page_tryupgrade(pp)) {
336 336 u_offset_t off = pp->p_offset;
337 337 page_unlock(pp);
338 338 pp = page_lookup(&kvp, off, SE_EXCL);
339 339 if (pp == NULL)
340 340 panic("page not found");
341 341 }
342 342 #ifdef __xpv
343 343 if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
344 344 panic("failure making kpm r/w pfn=0x%lx", pfn);
345 345 #endif
346 346 page_hashout(pp, NULL);
347 347 page_free(pp, 1);
348 348 page_unresv(1);
349 349 }
350 350
351 351 /*
352 352 * Put one htable on the reserve list.
353 353 */
354 354 static void
355 355 htable_put_reserve(htable_t *ht)
356 356 {
357 357 ht->ht_hat = NULL; /* no longer tied to a hat */
358 358 ASSERT(ht->ht_pfn == PFN_INVALID);
359 359 HATSTAT_INC(hs_htable_rputs);
360 360 mutex_enter(&htable_reserve_mutex);
361 361 ht->ht_next = htable_reserve_pool;
362 362 htable_reserve_pool = ht;
363 363 ++htable_reserve_cnt;
364 364 mutex_exit(&htable_reserve_mutex);
365 365 }
366 366
367 367 /*
368 368 * Take one htable from the reserve.
369 369 */
370 370 static htable_t *
371 371 htable_get_reserve(void)
372 372 {
373 373 htable_t *ht = NULL;
374 374
375 375 mutex_enter(&htable_reserve_mutex);
376 376 if (htable_reserve_cnt != 0) {
377 377 ht = htable_reserve_pool;
378 378 ASSERT(ht != NULL);
379 379 ASSERT(ht->ht_pfn == PFN_INVALID);
380 380 htable_reserve_pool = ht->ht_next;
381 381 --htable_reserve_cnt;
382 382 HATSTAT_INC(hs_htable_rgets);
383 383 }
384 384 mutex_exit(&htable_reserve_mutex);
385 385 return (ht);
386 386 }
387 387
388 388 /*
389 389 * Allocate initial htables and put them on the reserve list
390 390 */
391 391 void
392 392 htable_initial_reserve(uint_t count)
393 393 {
394 394 htable_t *ht;
395 395
396 396 count += HTABLE_RESERVE_AMOUNT;
397 397 while (count > 0) {
398 398 ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
399 399 ASSERT(ht != NULL);
400 400
401 401 ASSERT(use_boot_reserve);
402 402 ht->ht_pfn = PFN_INVALID;
403 403 htable_put_reserve(ht);
404 404 --count;
405 405 }
406 406 }
407 407
408 408 /*
409 409 * Readjust the reserves after a thread finishes using them.
410 410 */
411 411 void
412 412 htable_adjust_reserve()
413 413 {
414 414 htable_t *ht;
415 415
416 416 /*
417 417 * Free any excess htables in the reserve list
418 418 */
419 419 while (htable_reserve_cnt > htable_reserve_amount &&
420 420 !USE_HAT_RESERVES()) {
421 421 ht = htable_get_reserve();
422 422 if (ht == NULL)
423 423 return;
424 424 ASSERT(ht->ht_pfn == PFN_INVALID);
425 425 kmem_cache_free(htable_cache, ht);
426 426 }
427 427 }
428 428
429 429
430 430 /*
431 431 * This routine steals htables from user processes for htable_alloc() or
432 432 * for htable_reap().
433 433 */
434 434 static htable_t *
435 435 htable_steal(uint_t cnt)
436 436 {
437 437 hat_t *hat = kas.a_hat; /* list starts with khat */
438 438 htable_t *list = NULL;
439 439 htable_t *ht;
440 440 htable_t *higher;
441 441 uint_t h;
442 442 uint_t h_start;
443 443 static uint_t h_seed = 0;
444 444 uint_t e;
445 445 uintptr_t va;
446 446 x86pte_t pte;
447 447 uint_t stolen = 0;
448 448 uint_t pass;
449 449 uint_t threshold;
450 450
451 451 /*
452 452 * Limit htable_steal_passes to something reasonable
453 453 */
454 454 if (htable_steal_passes == 0)
455 455 htable_steal_passes = 1;
456 456 if (htable_steal_passes > mmu.ptes_per_table)
457 457 htable_steal_passes = mmu.ptes_per_table;
458 458
459 459 /*
460 460 * Loop through all user hats. The 1st pass takes cached htables that
461 461 * aren't in use. The later passes steal by removing mappings, too.
462 462 */
463 - atomic_add_32(&htable_dont_cache, 1);
463 + atomic_inc_32(&htable_dont_cache);
464 464 for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
465 465 threshold = pass * mmu.ptes_per_table / htable_steal_passes;
466 466 hat = kas.a_hat;
467 467 for (;;) {
468 468
469 469 /*
470 470 * Clear the victim flag and move to next hat
471 471 */
472 472 mutex_enter(&hat_list_lock);
473 473 if (hat != kas.a_hat) {
474 474 hat->hat_flags &= ~HAT_VICTIM;
475 475 cv_broadcast(&hat_list_cv);
476 476 }
477 477 hat = hat->hat_next;
478 478
479 479 /*
480 480 * Skip any hat that is already being stolen from.
481 481 *
482 482 * We skip SHARED hats, as these are dummy
483 483 * hats that host ISM shared page tables.
484 484 *
485 485 * We also skip if HAT_FREEING because hat_pte_unmap()
486 486 * won't zero out the PTE's. That would lead to hitting
487 487 * stale PTEs either here or under hat_unload() when we
488 488 * steal and unload the same page table in competing
489 489 * threads.
490 490 */
491 491 while (hat != NULL &&
492 492 (hat->hat_flags &
493 493 (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
494 494 hat = hat->hat_next;
495 495
496 496 if (hat == NULL) {
497 497 mutex_exit(&hat_list_lock);
498 498 break;
499 499 }
500 500
501 501 /*
502 502 * Are we finished?
503 503 */
504 504 if (stolen == cnt) {
505 505 /*
506 506 * Try to spread the pain of stealing,
507 507 * move victim HAT to the end of the HAT list.
508 508 */
509 509 if (pass >= 1 && cnt == 1 &&
510 510 kas.a_hat->hat_prev != hat) {
511 511
512 512 /* unlink victim hat */
513 513 if (hat->hat_prev)
514 514 hat->hat_prev->hat_next =
515 515 hat->hat_next;
516 516 else
517 517 kas.a_hat->hat_next =
518 518 hat->hat_next;
519 519 if (hat->hat_next)
520 520 hat->hat_next->hat_prev =
521 521 hat->hat_prev;
522 522 else
523 523 kas.a_hat->hat_prev =
524 524 hat->hat_prev;
525 525
526 526
527 527 /* relink at end of hat list */
528 528 hat->hat_next = NULL;
529 529 hat->hat_prev = kas.a_hat->hat_prev;
530 530 if (hat->hat_prev)
531 531 hat->hat_prev->hat_next = hat;
532 532 else
533 533 kas.a_hat->hat_next = hat;
534 534 kas.a_hat->hat_prev = hat;
535 535
536 536 }
537 537
538 538 mutex_exit(&hat_list_lock);
539 539 break;
540 540 }
541 541
542 542 /*
543 543 * Mark the HAT as a stealing victim.
544 544 */
545 545 hat->hat_flags |= HAT_VICTIM;
546 546 mutex_exit(&hat_list_lock);
547 547
548 548 /*
549 549 * Take any htables from the hat's cached "free" list.
550 550 */
551 551 hat_enter(hat);
552 552 while ((ht = hat->hat_ht_cached) != NULL &&
553 553 stolen < cnt) {
554 554 hat->hat_ht_cached = ht->ht_next;
555 555 ht->ht_next = list;
556 556 list = ht;
557 557 ++stolen;
558 558 }
559 559 hat_exit(hat);
560 560
561 561 /*
562 562 * Don't steal on first pass.
563 563 */
564 564 if (pass == 0 || stolen == cnt)
565 565 continue;
566 566
567 567 /*
568 568 * Search the active htables for one to steal.
569 569 * Start at a different hash bucket every time to
570 570 * help spread the pain of stealing.
571 571 */
572 572 h = h_start = h_seed++ % hat->hat_num_hash;
573 573 do {
574 574 higher = NULL;
575 575 HTABLE_ENTER(h);
576 576 for (ht = hat->hat_ht_hash[h]; ht;
577 577 ht = ht->ht_next) {
578 578
579 579 /*
580 580 * Can we rule out reaping?
581 581 */
582 582 if (ht->ht_busy != 0 ||
583 583 (ht->ht_flags & HTABLE_SHARED_PFN)||
584 584 ht->ht_level > 0 ||
585 585 ht->ht_valid_cnt > threshold ||
586 586 ht->ht_lock_cnt != 0)
587 587 continue;
588 588
589 589 /*
590 590 * Increment busy so the htable can't
591 591 * disappear. We drop the htable mutex
592 592 * to avoid deadlocks with
593 593 * hat_pageunload() and the hment mutex
594 594 * while we call hat_pte_unmap()
595 595 */
596 596 ++ht->ht_busy;
597 597 HTABLE_EXIT(h);
598 598
599 599 /*
600 600 * Try stealing.
601 601 * - unload and invalidate all PTEs
602 602 */
603 603 for (e = 0, va = ht->ht_vaddr;
604 604 e < HTABLE_NUM_PTES(ht) &&
605 605 ht->ht_valid_cnt > 0 &&
606 606 ht->ht_busy == 1 &&
607 607 ht->ht_lock_cnt == 0;
608 608 ++e, va += MMU_PAGESIZE) {
609 609 pte = x86pte_get(ht, e);
610 610 if (!PTE_ISVALID(pte))
611 611 continue;
612 612 hat_pte_unmap(ht, e,
613 613 HAT_UNLOAD, pte, NULL);
614 614 }
615 615
616 616 /*
617 617 * Reacquire htable lock. If we didn't
618 618 * remove all mappings in the table,
619 619 * or another thread added a new mapping
620 620 * behind us, give up on this table.
621 621 */
622 622 HTABLE_ENTER(h);
623 623 if (ht->ht_busy != 1 ||
624 624 ht->ht_valid_cnt != 0 ||
625 625 ht->ht_lock_cnt != 0) {
626 626 --ht->ht_busy;
627 627 continue;
628 628 }
629 629
630 630 /*
631 631 * Steal it and unlink the page table.
632 632 */
633 633 higher = ht->ht_parent;
634 634 unlink_ptp(higher, ht, ht->ht_vaddr);
635 635
636 636 /*
637 637 * remove from the hash list
638 638 */
639 639 if (ht->ht_next)
640 640 ht->ht_next->ht_prev =
641 641 ht->ht_prev;
642 642
643 643 if (ht->ht_prev) {
644 644 ht->ht_prev->ht_next =
645 645 ht->ht_next;
646 646 } else {
647 647 ASSERT(hat->hat_ht_hash[h] ==
648 648 ht);
649 649 hat->hat_ht_hash[h] =
650 650 ht->ht_next;
651 651 }
652 652
653 653 /*
654 654 * Break to outer loop to release the
655 655 * higher (ht_parent) pagetable. This
656 656 * spreads out the pain caused by
657 657 * pagefaults.
658 658 */
659 659 ht->ht_next = list;
660 660 list = ht;
661 661 ++stolen;
662 662 break;
663 663 }
664 664 HTABLE_EXIT(h);
665 665 if (higher != NULL)
666 666 htable_release(higher);
667 667 if (++h == hat->hat_num_hash)
668 668 h = 0;
669 669 } while (stolen < cnt && h != h_start);
670 670 }
671 671 }
672 - atomic_add_32(&htable_dont_cache, -1);
672 + atomic_dec_32(&htable_dont_cache);
673 673 return (list);
674 674 }
675 675
676 676 /*
677 677 * This is invoked from kmem when the system is low on memory. We try
678 678 * to free hments, htables, and ptables to improve the memory situation.
679 679 */
680 680 /*ARGSUSED*/
681 681 static void
682 682 htable_reap(void *handle)
683 683 {
684 684 uint_t reap_cnt;
685 685 htable_t *list;
686 686 htable_t *ht;
687 687
688 688 HATSTAT_INC(hs_reap_attempts);
689 689 if (!can_steal_post_boot)
690 690 return;
691 691
692 692 /*
693 693 * Try to reap 5% of the page tables bounded by a maximum of
694 694 * 5% of physmem and a minimum of 10.
695 695 */
696 696 reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);
697 697
698 698 /*
699 699 * Let htable_steal() do the work, we just call htable_free()
700 700 */
701 701 XPV_DISALLOW_MIGRATE();
702 702 list = htable_steal(reap_cnt);
703 703 XPV_ALLOW_MIGRATE();
704 704 while ((ht = list) != NULL) {
705 705 list = ht->ht_next;
706 706 HATSTAT_INC(hs_reaped);
707 707 htable_free(ht);
708 708 }
709 709
710 710 /*
711 711 * Free up excess reserves
712 712 */
713 713 htable_adjust_reserve();
714 714 hment_adjust_reserve();
715 715 }
716 716
717 717 /*
718 718 * Allocate an htable, stealing one or using the reserve if necessary
719 719 */
720 720 static htable_t *
721 721 htable_alloc(
722 722 hat_t *hat,
723 723 uintptr_t vaddr,
724 724 level_t level,
725 725 htable_t *shared)
726 726 {
727 727 htable_t *ht = NULL;
728 728 uint_t is_vlp;
729 729 uint_t is_bare = 0;
730 730 uint_t need_to_zero = 1;
731 731 int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
732 732
733 733 if (level < 0 || level > TOP_LEVEL(hat))
734 734 panic("htable_alloc(): level %d out of range\n", level);
735 735
736 736 is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
737 737 if (is_vlp || shared != NULL)
738 738 is_bare = 1;
739 739
740 740 /*
741 741 * First reuse a cached htable from the hat_ht_cached field, this
742 742 * avoids unnecessary trips through kmem/page allocators.
743 743 */
744 744 if (hat->hat_ht_cached != NULL && !is_bare) {
745 745 hat_enter(hat);
746 746 ht = hat->hat_ht_cached;
747 747 if (ht != NULL) {
748 748 hat->hat_ht_cached = ht->ht_next;
749 749 need_to_zero = 0;
750 750 /* XX64 ASSERT() they're all zero somehow */
751 751 ASSERT(ht->ht_pfn != PFN_INVALID);
752 752 }
753 753 hat_exit(hat);
754 754 }
755 755
756 756 if (ht == NULL) {
757 757 /*
758 758 * Allocate an htable, possibly refilling the reserves.
759 759 */
760 760 if (USE_HAT_RESERVES()) {
761 761 ht = htable_get_reserve();
762 762 } else {
763 763 /*
764 764 * Donate successful htable allocations to the reserve.
765 765 */
766 766 for (;;) {
767 767 ht = kmem_cache_alloc(htable_cache, kmflags);
768 768 if (ht == NULL)
769 769 break;
770 770 ht->ht_pfn = PFN_INVALID;
771 771 if (USE_HAT_RESERVES() ||
772 772 htable_reserve_cnt >= htable_reserve_amount)
773 773 break;
774 774 htable_put_reserve(ht);
775 775 }
776 776 }
777 777
778 778 /*
779 779 * allocate a page for the hardware page table if needed
780 780 */
781 781 if (ht != NULL && !is_bare) {
782 782 ht->ht_hat = hat;
783 783 ht->ht_pfn = ptable_alloc((uintptr_t)ht);
784 784 if (ht->ht_pfn == PFN_INVALID) {
785 785 if (USE_HAT_RESERVES())
786 786 htable_put_reserve(ht);
787 787 else
788 788 kmem_cache_free(htable_cache, ht);
789 789 ht = NULL;
790 790 }
791 791 }
792 792 }
793 793
794 794 /*
795 795 * If allocations failed, kick off a kmem_reap() and resort to
796 796 * htable steal(). We may spin here if the system is very low on
797 797 * memory. If the kernel itself has consumed all memory and kmem_reap()
798 798 * can't free up anything, then we'll really get stuck here.
799 799 * That should only happen in a system where the administrator has
800 800 * misconfigured VM parameters via /etc/system.
801 801 */
802 802 while (ht == NULL && can_steal_post_boot) {
803 803 kmem_reap();
804 804 ht = htable_steal(1);
805 805 HATSTAT_INC(hs_steals);
806 806
807 807 /*
808 808 * If we stole for a bare htable, release the pagetable page.
809 809 */
810 810 if (ht != NULL) {
811 811 if (is_bare) {
812 812 ptable_free(ht->ht_pfn);
813 813 ht->ht_pfn = PFN_INVALID;
814 814 #if defined(__xpv) && defined(__amd64)
815 815 /*
816 816 * make stolen page table writable again in kpm
817 817 */
818 818 } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
819 819 PT_VALID | PT_WRITABLE) < 0) {
820 820 panic("failure making kpm r/w pfn=0x%lx",
821 821 ht->ht_pfn);
822 822 #endif
823 823 }
824 824 }
825 825 }
826 826
827 827 /*
828 828 * All attempts to allocate or steal failed. This should only happen
829 829 * if we run out of memory during boot, due perhaps to a huge
830 830 * boot_archive. At this point there's no way to continue.
831 831 */
832 832 if (ht == NULL)
833 833 panic("htable_alloc(): couldn't steal\n");
834 834
835 835 #if defined(__amd64) && defined(__xpv)
836 836 /*
837 837 * Under the 64-bit hypervisor, we have 2 top level page tables.
838 838 * If this allocation fails, we'll resort to stealing.
839 839 * We use the stolen page indirectly, by freeing the
840 840 * stolen htable first.
841 841 */
842 842 if (level == mmu.max_level) {
843 843 for (;;) {
844 844 htable_t *stolen;
845 845
846 846 hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
847 847 if (hat->hat_user_ptable != PFN_INVALID)
848 848 break;
849 849 stolen = htable_steal(1);
850 850 if (stolen == NULL)
851 851 panic("2nd steal ptable failed\n");
852 852 htable_free(stolen);
853 853 }
854 854 block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
855 855 MMU_PAGESIZE);
856 856 }
857 857 #endif
858 858
859 859 /*
860 860 * Shared page tables have all entries locked and entries may not
861 861 * be added or deleted.
862 862 */
863 863 ht->ht_flags = 0;
864 864 if (shared != NULL) {
865 865 ASSERT(shared->ht_valid_cnt > 0);
866 866 ht->ht_flags |= HTABLE_SHARED_PFN;
867 867 ht->ht_pfn = shared->ht_pfn;
868 868 ht->ht_lock_cnt = 0;
869 869 ht->ht_valid_cnt = 0; /* updated in hat_share() */
870 870 ht->ht_shares = shared;
871 871 need_to_zero = 0;
872 872 } else {
873 873 ht->ht_shares = NULL;
874 874 ht->ht_lock_cnt = 0;
875 875 ht->ht_valid_cnt = 0;
876 876 }
877 877
878 878 /*
879 879 * setup flags, etc. for VLP htables
880 880 */
881 881 if (is_vlp) {
882 882 ht->ht_flags |= HTABLE_VLP;
883 883 ASSERT(ht->ht_pfn == PFN_INVALID);
884 884 need_to_zero = 0;
885 885 }
886 886
887 887 /*
888 888 * fill in the htable
889 889 */
890 890 ht->ht_hat = hat;
891 891 ht->ht_parent = NULL;
892 892 ht->ht_vaddr = vaddr;
893 893 ht->ht_level = level;
894 894 ht->ht_busy = 1;
895 895 ht->ht_next = NULL;
896 896 ht->ht_prev = NULL;
897 897
898 898 /*
899 899 * Zero out any freshly allocated page table
900 900 */
901 901 if (need_to_zero)
902 902 x86pte_zero(ht, 0, mmu.ptes_per_table);
903 903
904 904 #if defined(__amd64) && defined(__xpv)
905 905 if (!is_bare && kpm_vbase) {
906 906 (void) xen_kpm_page(ht->ht_pfn, PT_VALID);
907 907 if (level == mmu.max_level)
908 908 (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
909 909 }
910 910 #endif
911 911
912 912 return (ht);
913 913 }
914 914
915 915 /*
916 916 * Free up an htable, either to a hat's cached list, the reserves or
917 917 * back to kmem.
918 918 */
919 919 static void
920 920 htable_free(htable_t *ht)
921 921 {
922 922 hat_t *hat = ht->ht_hat;
923 923
924 924 /*
925 925 * If the process isn't exiting, cache the free htable in the hat
926 926 * structure. We always do this for the boot time reserve. We don't
927 927 * do this if the hat is exiting or we are stealing/reaping htables.
928 928 */
929 929 if (hat != NULL &&
930 930 !(ht->ht_flags & HTABLE_SHARED_PFN) &&
931 931 (use_boot_reserve ||
932 932 (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
933 933 ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
934 934 ASSERT(ht->ht_pfn != PFN_INVALID);
935 935 hat_enter(hat);
936 936 ht->ht_next = hat->hat_ht_cached;
937 937 hat->hat_ht_cached = ht;
938 938 hat_exit(hat);
939 939 return;
940 940 }
941 941
942 942 /*
943 943 * If we have a hardware page table, free it.
944 944 * We don't free page tables that are accessed by sharing.
945 945 */
946 946 if (ht->ht_flags & HTABLE_SHARED_PFN) {
947 947 ASSERT(ht->ht_pfn != PFN_INVALID);
948 948 } else if (!(ht->ht_flags & HTABLE_VLP)) {
949 949 ptable_free(ht->ht_pfn);
950 950 #if defined(__amd64) && defined(__xpv)
951 951 if (ht->ht_level == mmu.max_level) {
952 952 ptable_free(hat->hat_user_ptable);
953 953 hat->hat_user_ptable = PFN_INVALID;
954 954 }
955 955 #endif
956 956 }
957 957 ht->ht_pfn = PFN_INVALID;
958 958
959 959 /*
960 960 * Free it or put into reserves.
961 961 */
962 962 if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
963 963 htable_put_reserve(ht);
964 964 } else {
965 965 kmem_cache_free(htable_cache, ht);
966 966 htable_adjust_reserve();
967 967 }
968 968 }
969 969
970 970
971 971 /*
972 972 * This is called when a hat is being destroyed or swapped out. We reap all
973 973 * the remaining htables in the hat cache. If destroying all left over
974 974 * the remaining htables in the hat cache. If destroying, all left over
975 975 *
976 976 * We also don't need to invalidate any of the PTPs nor do any demapping.
977 977 */
978 978 void
979 979 htable_purge_hat(hat_t *hat)
980 980 {
981 981 htable_t *ht;
982 982 int h;
983 983
984 984 /*
985 985 * Purge the htable cache if just reaping.
986 986 */
987 987 if (!(hat->hat_flags & HAT_FREEING)) {
988 - atomic_add_32(&htable_dont_cache, 1);
988 + atomic_inc_32(&htable_dont_cache);
989 989 for (;;) {
990 990 hat_enter(hat);
991 991 ht = hat->hat_ht_cached;
992 992 if (ht == NULL) {
993 993 hat_exit(hat);
994 994 break;
995 995 }
996 996 hat->hat_ht_cached = ht->ht_next;
997 997 hat_exit(hat);
998 998 htable_free(ht);
999 999 }
1000 - atomic_add_32(&htable_dont_cache, -1);
1000 + atomic_dec_32(&htable_dont_cache);
1001 1001 return;
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * if freeing, no locking is needed
1006 1006 */
1007 1007 while ((ht = hat->hat_ht_cached) != NULL) {
1008 1008 hat->hat_ht_cached = ht->ht_next;
1009 1009 htable_free(ht);
1010 1010 }
1011 1011
1012 1012 /*
1013 1013 * walk thru the htable hash table and free all the htables in it.
1014 1014 */
1015 1015 for (h = 0; h < hat->hat_num_hash; ++h) {
1016 1016 while ((ht = hat->hat_ht_hash[h]) != NULL) {
1017 1017 if (ht->ht_next)
1018 1018 ht->ht_next->ht_prev = ht->ht_prev;
1019 1019
1020 1020 if (ht->ht_prev) {
1021 1021 ht->ht_prev->ht_next = ht->ht_next;
1022 1022 } else {
1023 1023 ASSERT(hat->hat_ht_hash[h] == ht);
1024 1024 hat->hat_ht_hash[h] = ht->ht_next;
1025 1025 }
1026 1026 htable_free(ht);
1027 1027 }
1028 1028 }
1029 1029 }
1030 1030
1031 1031 /*
1032 1032 * Unlink an entry for a table at vaddr and level out of the existing table
1033 1033 * one level higher. We are always holding the HASH_ENTER() when doing this.
1034 1034 */
1035 1035 static void
1036 1036 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
1037 1037 {
1038 1038 uint_t entry = htable_va2entry(vaddr, higher);
1039 1039 x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
1040 1040 x86pte_t found;
1041 1041 hat_t *hat = old->ht_hat;
1042 1042
1043 1043 ASSERT(higher->ht_busy > 0);
1044 1044 ASSERT(higher->ht_valid_cnt > 0);
1045 1045 ASSERT(old->ht_valid_cnt == 0);
1046 1046 found = x86pte_cas(higher, entry, expect, 0);
1047 1047 #ifdef __xpv
1048 1048 /*
1049 1049 * This is weird, but Xen apparently automatically unlinks empty
1050 1050 * pagetables from the upper page table. So allow PTP to be 0 already.
1051 1051 */
1052 1052 if (found != expect && found != 0)
1053 1053 #else
1054 1054 if (found != expect)
1055 1055 #endif
1056 1056 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1057 1057 found, expect);
1058 1058
1059 1059 /*
1060 1060 * When a top level VLP page table entry changes, we must issue
1061 1061 * a reload of cr3 on all processors.
1062 1062 *
1063 1063 * If we don't need to do that, then we still have to INVLPG against
1064 1064 * an address covered by the inner page table, as the latest processors
1065 1065 * have TLB-like caches for non-leaf page table entries.
1066 1066 */
1067 1067 if (!(hat->hat_flags & HAT_FREEING)) {
1068 1068 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
1069 1069 DEMAP_ALL_ADDR : old->ht_vaddr);
1070 1070 }
1071 1071
1072 1072 HTABLE_DEC(higher->ht_valid_cnt);
1073 1073 }
1074 1074
1075 1075 /*
1076 1076 * Link an entry for a new table at vaddr and level into the existing table
1077 1077 * one level higher. We are always holding the HASH_ENTER() when doing this.
1078 1078 */
1079 1079 static void
1080 1080 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1081 1081 {
1082 1082 uint_t entry = htable_va2entry(vaddr, higher);
1083 1083 x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1084 1084 x86pte_t found;
1085 1085
1086 1086 ASSERT(higher->ht_busy > 0);
1087 1087
1088 1088 ASSERT(new->ht_level != mmu.max_level);
1089 1089
1090 1090 HTABLE_INC(higher->ht_valid_cnt);
1091 1091
1092 1092 found = x86pte_cas(higher, entry, 0, newptp);
1093 1093 if ((found & ~PT_REF) != 0)
1094 1094 panic("HAT: ptp not 0, found=" FMT_PTE, found);
1095 1095
1096 1096 /*
1097 1097 * When any top level VLP page table entry changes, we must issue
1098 1098 * a reload of cr3 on all processors using it.
1099 1099 * We also need to do this for the kernel hat on PAE 32 bit kernel.
1100 1100 */
1101 1101 if (
1102 1102 #ifdef __i386
1103 1103 (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
1104 1104 #endif
1105 1105 (higher->ht_flags & HTABLE_VLP))
1106 1106 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1107 1107 }
1108 1108
1109 1109 /*
1110 1110 * Release of hold on an htable. If this is the last use and the pagetable
1111 1111 * is empty we may want to free it, then recursively look at the pagetable
1112 1112 * above it. The recursion is handled by the outer while() loop.
1113 1113 *
1114 1114 * On the metal, during process exit, we don't bother unlinking the tables from
1115 1115 * upper level pagetables. They are instead handled in bulk by hat_free_end().
1116 1116 * We can't do this on the hypervisor as we need the page table to be
1117 1117 * implicitly unpinned before it goes to the free page lists. This can't
1118 1118 * happen unless we fully unlink it from the page table hierarchy.
1119 1119 */
1120 1120 void
1121 1121 htable_release(htable_t *ht)
1122 1122 {
1123 1123 uint_t hashval;
1124 1124 htable_t *shared;
1125 1125 htable_t *higher;
1126 1126 hat_t *hat;
1127 1127 uintptr_t va;
1128 1128 level_t level;
1129 1129
1130 1130 while (ht != NULL) {
1131 1131 shared = NULL;
1132 1132 for (;;) {
1133 1133 hat = ht->ht_hat;
1134 1134 va = ht->ht_vaddr;
1135 1135 level = ht->ht_level;
1136 1136 hashval = HTABLE_HASH(hat, va, level);
1137 1137
1138 1138 /*
1139 1139 * The common case is that this isn't the last use of
1140 1140 * an htable so we don't want to free the htable.
1141 1141 */
1142 1142 HTABLE_ENTER(hashval);
1143 1143 ASSERT(ht->ht_valid_cnt >= 0);
1144 1144 ASSERT(ht->ht_busy > 0);
1145 1145 if (ht->ht_valid_cnt > 0)
1146 1146 break;
1147 1147 if (ht->ht_busy > 1)
1148 1148 break;
1149 1149 ASSERT(ht->ht_lock_cnt == 0);
1150 1150
1151 1151 #if !defined(__xpv)
1152 1152 /*
1153 1153 * we always release empty shared htables
1154 1154 */
1155 1155 if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
1156 1156
1157 1157 /*
1158 1158 * don't release if in address space tear down
1159 1159 */
1160 1160 if (hat->hat_flags & HAT_FREEING)
1161 1161 break;
1162 1162
1163 1163 /*
1164 1164 * At and above max_page_level, free if it's for
1165 1165 * a boot-time kernel mapping below kernelbase.
1166 1166 */
1167 1167 if (level >= mmu.max_page_level &&
1168 1168 (hat != kas.a_hat || va >= kernelbase))
1169 1169 break;
1170 1170 }
1171 1171 #endif /* __xpv */
1172 1172
1173 1173 /*
1174 1174 * Remember if we destroy an htable that shares its PFN
1175 1175 * from elsewhere.
1176 1176 */
1177 1177 if (ht->ht_flags & HTABLE_SHARED_PFN) {
1178 1178 ASSERT(shared == NULL);
1179 1179 shared = ht->ht_shares;
1180 1180 HATSTAT_INC(hs_htable_unshared);
1181 1181 }
1182 1182
1183 1183 /*
1184 1184 * Handle release of a table and freeing the htable_t.
1185 1185 * Unlink it from the table higher (ie. ht_parent).
1186 1186 */
1187 1187 higher = ht->ht_parent;
1188 1188 ASSERT(higher != NULL);
1189 1189
1190 1190 /*
1191 1191 * Unlink the pagetable.
1192 1192 */
1193 1193 unlink_ptp(higher, ht, va);
1194 1194
1195 1195 /*
1196 1196 * remove this htable from its hash list
1197 1197 */
1198 1198 if (ht->ht_next)
1199 1199 ht->ht_next->ht_prev = ht->ht_prev;
1200 1200
1201 1201 if (ht->ht_prev) {
1202 1202 ht->ht_prev->ht_next = ht->ht_next;
1203 1203 } else {
1204 1204 ASSERT(hat->hat_ht_hash[hashval] == ht);
1205 1205 hat->hat_ht_hash[hashval] = ht->ht_next;
1206 1206 }
1207 1207 HTABLE_EXIT(hashval);
1208 1208 htable_free(ht);
1209 1209 ht = higher;
1210 1210 }
1211 1211
1212 1212 ASSERT(ht->ht_busy >= 1);
1213 1213 --ht->ht_busy;
1214 1214 HTABLE_EXIT(hashval);
1215 1215
1216 1216 /*
1217 1217 * If we released a shared htable, do a release on the htable
1218 1218 * from which it shared
1219 1219 */
1220 1220 ht = shared;
1221 1221 }
1222 1222 }
1223 1223
1224 1224 /*
1225 1225 * Find the htable for the pagetable at the given level for the given address.
1226 1226 * If found acquires a hold that eventually needs to be htable_release()d
1227 1227 */
1228 1228 htable_t *
1229 1229 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1230 1230 {
1231 1231 uintptr_t base;
1232 1232 uint_t hashval;
1233 1233 htable_t *ht = NULL;
1234 1234
1235 1235 ASSERT(level >= 0);
1236 1236 ASSERT(level <= TOP_LEVEL(hat));
1237 1237
1238 1238 if (level == TOP_LEVEL(hat)) {
1239 1239 #if defined(__amd64)
1240 1240 /*
1241 1241 * 32 bit address spaces on 64 bit kernels need to check
1242 1242 * for overflow of the 32 bit address space
1243 1243 */
1244 1244 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
1245 1245 return (NULL);
1246 1246 #endif
1247 1247 base = 0;
1248 1248 } else {
1249 1249 base = vaddr & LEVEL_MASK(level + 1);
1250 1250 }
1251 1251
1252 1252 hashval = HTABLE_HASH(hat, base, level);
1253 1253 HTABLE_ENTER(hashval);
1254 1254 for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1255 1255 if (ht->ht_hat == hat &&
1256 1256 ht->ht_vaddr == base &&
1257 1257 ht->ht_level == level)
1258 1258 break;
1259 1259 }
1260 1260 if (ht)
1261 1261 ++ht->ht_busy;
1262 1262
1263 1263 HTABLE_EXIT(hashval);
1264 1264 return (ht);
1265 1265 }
1266 1266
1267 1267 /*
1268 1268 * Acquires a hold on a known htable (from a locked hment entry).
1269 1269 */
1270 1270 void
1271 1271 htable_acquire(htable_t *ht)
1272 1272 {
1273 1273 hat_t *hat = ht->ht_hat;
1274 1274 level_t level = ht->ht_level;
1275 1275 uintptr_t base = ht->ht_vaddr;
1276 1276 uint_t hashval = HTABLE_HASH(hat, base, level);
1277 1277
1278 1278 HTABLE_ENTER(hashval);
1279 1279 #ifdef DEBUG
1280 1280 /*
1281 1281 * make sure the htable is there
1282 1282 */
1283 1283 {
1284 1284 htable_t *h;
1285 1285
1286 1286 for (h = hat->hat_ht_hash[hashval];
1287 1287 h && h != ht;
1288 1288 h = h->ht_next)
1289 1289 ;
1290 1290 ASSERT(h == ht);
1291 1291 }
1292 1292 #endif /* DEBUG */
1293 1293 ++ht->ht_busy;
1294 1294 HTABLE_EXIT(hashval);
1295 1295 }
1296 1296
1297 1297 /*
1298 1298 * Find the htable for the pagetable at the given level for the given address.
1299 1299 * If found acquires a hold that eventually needs to be htable_release()d
1300 1300 * If not found the table is created.
1301 1301 *
1302 1302 * Since we can't hold a hash table mutex during allocation, we have to
1303 1303 * drop it and redo the search on a create. Then we may have to free the newly
1304 1304 * allocated htable if another thread raced in and created it ahead of us.
1305 1305 */
1306 1306 htable_t *
1307 1307 htable_create(
1308 1308 hat_t *hat,
1309 1309 uintptr_t vaddr,
1310 1310 level_t level,
1311 1311 htable_t *shared)
1312 1312 {
1313 1313 uint_t h;
1314 1314 level_t l;
1315 1315 uintptr_t base;
1316 1316 htable_t *ht;
1317 1317 htable_t *higher = NULL;
1318 1318 htable_t *new = NULL;
1319 1319
1320 1320 if (level < 0 || level > TOP_LEVEL(hat))
1321 1321 panic("htable_create(): level %d out of range\n", level);
1322 1322
1323 1323 /*
1324 1324 * Create the page tables in top down order.
1325 1325 */
1326 1326 for (l = TOP_LEVEL(hat); l >= level; --l) {
1327 1327 new = NULL;
1328 1328 if (l == TOP_LEVEL(hat))
1329 1329 base = 0;
1330 1330 else
1331 1331 base = vaddr & LEVEL_MASK(l + 1);
1332 1332
1333 1333 h = HTABLE_HASH(hat, base, l);
1334 1334 try_again:
1335 1335 /*
1336 1336 * look up the htable at this level
1337 1337 */
1338 1338 HTABLE_ENTER(h);
1339 1339 if (l == TOP_LEVEL(hat)) {
1340 1340 ht = hat->hat_htable;
1341 1341 } else {
1342 1342 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
1343 1343 ASSERT(ht->ht_hat == hat);
1344 1344 if (ht->ht_vaddr == base &&
1345 1345 ht->ht_level == l)
1346 1346 break;
1347 1347 }
1348 1348 }
1349 1349
1350 1350 /*
1351 1351 * if we found the htable, increment its busy cnt
1352 1352 * and if we had allocated a new htable, free it.
1353 1353 */
1354 1354 if (ht != NULL) {
1355 1355 /*
1356 1356 * If we find a pre-existing shared table, it must
1357 1357 * share from the same place.
1358 1358 */
1359 1359 if (l == level && shared && ht->ht_shares &&
1360 1360 ht->ht_shares != shared) {
1361 1361 panic("htable shared from wrong place "
1362 1362 "found htable=%p shared=%p",
1363 1363 (void *)ht, (void *)shared);
1364 1364 }
1365 1365 ++ht->ht_busy;
1366 1366 HTABLE_EXIT(h);
1367 1367 if (new)
1368 1368 htable_free(new);
1369 1369 if (higher != NULL)
1370 1370 htable_release(higher);
1371 1371 higher = ht;
1372 1372
1373 1373 /*
1374 1374 * if we didn't find it on the first search
1375 1375 * allocate a new one and search again
1376 1376 */
1377 1377 } else if (new == NULL) {
1378 1378 HTABLE_EXIT(h);
1379 1379 new = htable_alloc(hat, base, l,
1380 1380 l == level ? shared : NULL);
1381 1381 goto try_again;
1382 1382
1383 1383 /*
1384 1384 * 2nd search and still not there, use "new" table
1385 1385 * Link new table into higher, when not at top level.
1386 1386 */
1387 1387 } else {
1388 1388 ht = new;
1389 1389 if (higher != NULL) {
1390 1390 link_ptp(higher, ht, base);
1391 1391 ht->ht_parent = higher;
1392 1392 }
1393 1393 ht->ht_next = hat->hat_ht_hash[h];
1394 1394 ASSERT(ht->ht_prev == NULL);
1395 1395 if (hat->hat_ht_hash[h])
1396 1396 hat->hat_ht_hash[h]->ht_prev = ht;
1397 1397 hat->hat_ht_hash[h] = ht;
1398 1398 HTABLE_EXIT(h);
1399 1399
1400 1400 /*
1401 1401 * Note we don't do htable_release(higher).
1402 1402 * That happens recursively when "new" is removed by
1403 1403 * htable_release() or htable_steal().
1404 1404 */
1405 1405 higher = ht;
1406 1406
1407 1407 /*
1408 1408 * If we just created a new shared page table we
1409 1409 * increment the shared htable's busy count, so that
1410 1410 * it can't be the victim of a steal even if it's empty.
1411 1411 */
1412 1412 if (l == level && shared) {
1413 1413 (void) htable_lookup(shared->ht_hat,
1414 1414 shared->ht_vaddr, shared->ht_level);
1415 1415 HATSTAT_INC(hs_htable_shared);
1416 1416 }
1417 1417 }
1418 1418 }
1419 1419
1420 1420 return (ht);
1421 1421 }
1422 1422
1423 1423 /*
1424 1424 * Inherit initial pagetables from the boot program. On the 64-bit
1425 1425 * hypervisor we also temporarily mark the p_index field of page table
1426 1426 * pages, so we know not to try making them writable in seg_kpm.
1427 1427 */
1428 1428 void
1429 1429 htable_attach(
1430 1430 hat_t *hat,
1431 1431 uintptr_t base,
1432 1432 level_t level,
1433 1433 htable_t *parent,
1434 1434 pfn_t pfn)
1435 1435 {
1436 1436 htable_t *ht;
1437 1437 uint_t h;
1438 1438 uint_t i;
1439 1439 x86pte_t pte;
1440 1440 x86pte_t *ptep;
1441 1441 page_t *pp;
1442 1442 extern page_t *boot_claim_page(pfn_t);
1443 1443
1444 1444 ht = htable_get_reserve();
1445 1445 if (level == mmu.max_level)
1446 1446 kas.a_hat->hat_htable = ht;
1447 1447 ht->ht_hat = hat;
1448 1448 ht->ht_parent = parent;
1449 1449 ht->ht_vaddr = base;
1450 1450 ht->ht_level = level;
1451 1451 ht->ht_busy = 1;
1452 1452 ht->ht_next = NULL;
1453 1453 ht->ht_prev = NULL;
1454 1454 ht->ht_flags = 0;
1455 1455 ht->ht_pfn = pfn;
1456 1456 ht->ht_lock_cnt = 0;
1457 1457 ht->ht_valid_cnt = 0;
1458 1458 if (parent != NULL)
1459 1459 ++parent->ht_busy;
1460 1460
1461 1461 h = HTABLE_HASH(hat, base, level);
1462 1462 HTABLE_ENTER(h);
1463 1463 ht->ht_next = hat->hat_ht_hash[h];
1464 1464 ASSERT(ht->ht_prev == NULL);
1465 1465 if (hat->hat_ht_hash[h])
1466 1466 hat->hat_ht_hash[h]->ht_prev = ht;
1467 1467 hat->hat_ht_hash[h] = ht;
1468 1468 HTABLE_EXIT(h);
1469 1469
1470 1470 /*
1471 1471 * make sure the page table physical page is not FREE
1472 1472 */
1473 1473 if (page_resv(1, KM_NOSLEEP) == 0)
1474 1474 panic("page_resv() failed in ptable alloc");
1475 1475
1476 1476 pp = boot_claim_page(pfn);
1477 1477 ASSERT(pp != NULL);
1478 1478
1479 1479 /*
1480 1480 * Page table pages that were allocated by dboot or
1481 1481 * in very early startup didn't go through boot_mapin()
1482 1482 * and so won't have vnode/offsets. Fix that here.
1483 1483 */
1484 1484 if (pp->p_vnode == NULL) {
1485 1485 /* match offset calculation in page_get_physical() */
1486 1486 u_offset_t offset = (uintptr_t)ht;
1487 1487 if (offset > kernelbase)
1488 1488 offset -= kernelbase;
1489 1489 offset <<= MMU_PAGESHIFT;
1490 1490 #if defined(__amd64)
1491 1491 offset += mmu.hole_start; /* something in VA hole */
1492 1492 #else
1493 1493 offset += 1ULL << 40; /* something > 4 Gig */
1494 1494 #endif
1495 1495 ASSERT(page_exists(&kvp, offset) == NULL);
1496 1496 (void) page_hashin(pp, &kvp, offset, NULL);
1497 1497 }
1498 1498 page_downgrade(pp);
1499 1499 #if defined(__xpv) && defined(__amd64)
1500 1500 /*
1501 1501 * Record in the page_t that is a pagetable for segkpm setup.
1502 1502 */
1503 1503 if (kpm_vbase)
1504 1504 pp->p_index = 1;
1505 1505 #endif
1506 1506
1507 1507 /*
1508 1508 * Count valid mappings and recursively attach lower level pagetables.
1509 1509 */
1510 1510 ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1511 1511 for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
1512 1512 if (mmu.pae_hat)
1513 1513 pte = ptep[i];
1514 1514 else
1515 1515 pte = ((x86pte32_t *)ptep)[i];
1516 1516 if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
1517 1517 ++ht->ht_valid_cnt;
1518 1518 if (!PTE_ISPAGE(pte, level)) {
1519 1519 htable_attach(hat, base, level - 1,
1520 1520 ht, PTE2PFN(pte, level));
1521 1521 ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1522 1522 }
1523 1523 }
1524 1524 base += LEVEL_SIZE(level);
1525 1525 if (base == mmu.hole_start)
1526 1526 base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
1527 1527 }
1528 1528
1529 1529 /*
1530 1530 * As long as all the mappings we had were below kernel base
1531 1531 * we can release the htable.
1532 1532 */
1533 1533 if (base < kernelbase)
1534 1534 htable_release(ht);
1535 1535 }
1536 1536
1537 1537 /*
1538 1538 * Walk through a given htable looking for the first valid entry. This
1539 1539 * routine takes both a starting and ending address. The starting address
1540 1540 * is required to be within the htable provided by the caller, but there is
1541 1541 * no such restriction on the ending address.
1542 1542 *
1543 1543 * If the routine finds a valid entry in the htable (at or beyond the
1544 1544 * starting address), the PTE (and its address) will be returned.
1545 1545 * This PTE may correspond to either a page or a pagetable - it is the
1546 1546 * caller's responsibility to determine which. If no valid entry is
1547 1547 * found, 0 (and invalid PTE) and the next unexamined address will be
1548 1548 * returned.
1549 1549 *
1550 1550 * The loop has been carefully coded for optimization.
1551 1551 */
1552 1552 static x86pte_t
1553 1553 htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
1554 1554 {
1555 1555 uint_t e;
1556 1556 x86pte_t found_pte = (x86pte_t)0;
1557 1557 caddr_t pte_ptr;
1558 1558 caddr_t end_pte_ptr;
1559 1559 int l = ht->ht_level;
1560 1560 uintptr_t va = *vap & LEVEL_MASK(l);
1561 1561 size_t pgsize = LEVEL_SIZE(l);
1562 1562
1563 1563 ASSERT(va >= ht->ht_vaddr);
1564 1564 ASSERT(va <= HTABLE_LAST_PAGE(ht));
1565 1565
1566 1566 /*
1567 1567 * Compute the starting index and ending virtual address
1568 1568 */
1569 1569 e = htable_va2entry(va, ht);
1570 1570
1571 1571 /*
1572 1572 * The following page table scan code knows that the valid
1573 1573 * bit of a PTE is in the lowest byte AND that x86 is little endian!!
1574 1574 */
1575 1575 pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
1576 1576 end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
1577 1577 pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
1578 1578 while (!PTE_ISVALID(*pte_ptr)) {
1579 1579 va += pgsize;
1580 1580 if (va >= eaddr)
1581 1581 break;
1582 1582 pte_ptr += mmu.pte_size;
1583 1583 ASSERT(pte_ptr <= end_pte_ptr);
1584 1584 if (pte_ptr == end_pte_ptr)
1585 1585 break;
1586 1586 }
1587 1587
1588 1588 /*
1589 1589 * if we found a valid PTE, load the entire PTE
1590 1590 */
1591 1591 if (va < eaddr && pte_ptr != end_pte_ptr)
1592 1592 found_pte = GET_PTE((x86pte_t *)pte_ptr);
1593 1593 x86pte_release_pagetable(ht);
1594 1594
1595 1595 #if defined(__amd64)
1596 1596 /*
1597 1597 * deal with VA hole on amd64
1598 1598 */
1599 1599 if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
1600 1600 va = mmu.hole_end + va - mmu.hole_start;
1601 1601 #endif /* __amd64 */
1602 1602
1603 1603 *vap = va;
1604 1604 return (found_pte);
1605 1605 }
1606 1606
1607 1607 /*
1608 1608 * Find the address and htable for the first populated translation at or
1609 1609 * above the given virtual address. The caller may also specify an upper
1610 1610 * limit to the address range to search. Uses level information to quickly
1611 1611 * skip unpopulated sections of virtual address spaces.
1612 1612 *
1613 1613 * If not found returns NULL. When found, returns the htable and virt addr
1614 1614 * and has a hold on the htable.
1615 1615 */
1616 1616 x86pte_t
1617 1617 htable_walk(
1618 1618 struct hat *hat,
1619 1619 htable_t **htp,
1620 1620 uintptr_t *vaddr,
1621 1621 uintptr_t eaddr)
1622 1622 {
1623 1623 uintptr_t va = *vaddr;
1624 1624 htable_t *ht;
1625 1625 htable_t *prev = *htp;
1626 1626 level_t l;
1627 1627 level_t max_mapped_level;
1628 1628 x86pte_t pte;
1629 1629
1630 1630 ASSERT(eaddr > va);
1631 1631
1632 1632 /*
1633 1633 * If this is a user address, then we know we need not look beyond
1634 1634 * kernelbase.
1635 1635 */
1636 1636 ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
1637 1637 eaddr == HTABLE_WALK_TO_END);
1638 1638 if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
1639 1639 eaddr = kernelbase;
1640 1640
1641 1641 /*
1642 1642 * If we're coming in with a previous page table, search it first
1643 1643 * without doing an htable_lookup(), this should be frequent.
1644 1644 */
1645 1645 if (prev) {
1646 1646 ASSERT(prev->ht_busy > 0);
1647 1647 ASSERT(prev->ht_vaddr <= va);
1648 1648 l = prev->ht_level;
1649 1649 if (va <= HTABLE_LAST_PAGE(prev)) {
1650 1650 pte = htable_scan(prev, &va, eaddr);
1651 1651
1652 1652 if (PTE_ISPAGE(pte, l)) {
1653 1653 *vaddr = va;
1654 1654 *htp = prev;
1655 1655 return (pte);
1656 1656 }
1657 1657 }
1658 1658
1659 1659 /*
1660 1660 * We found nothing in the htable provided by the caller,
1661 1661 * so fall through and do the full search
1662 1662 */
1663 1663 htable_release(prev);
1664 1664 }
1665 1665
1666 1666 /*
1667 1667 * Find the level of the largest pagesize used by this HAT.
1668 1668 */
1669 1669 if (hat->hat_ism_pgcnt > 0) {
1670 1670 max_mapped_level = mmu.umax_page_level;
1671 1671 } else {
1672 1672 max_mapped_level = 0;
1673 1673 for (l = 1; l <= mmu.max_page_level; ++l)
1674 1674 if (hat->hat_pages_mapped[l] != 0)
1675 1675 max_mapped_level = l;
1676 1676 }
1677 1677
1678 1678 while (va < eaddr && va >= *vaddr) {
1679 1679 ASSERT(!IN_VA_HOLE(va));
1680 1680
1681 1681 /*
1682 1682 * Find lowest table with any entry for given address.
1683 1683 */
1684 1684 for (l = 0; l <= TOP_LEVEL(hat); ++l) {
1685 1685 ht = htable_lookup(hat, va, l);
1686 1686 if (ht != NULL) {
1687 1687 pte = htable_scan(ht, &va, eaddr);
1688 1688 if (PTE_ISPAGE(pte, l)) {
1689 1689 *vaddr = va;
1690 1690 *htp = ht;
1691 1691 return (pte);
1692 1692 }
1693 1693 htable_release(ht);
1694 1694 break;
1695 1695 }
1696 1696
1697 1697 /*
1698 1698 * No htable at this level for the address. If there
1699 1699 * is no larger page size that could cover it, we can
1700 1700 * skip right to the start of the next page table.
1701 1701 */
1702 1702 ASSERT(l < TOP_LEVEL(hat));
1703 1703 if (l >= max_mapped_level) {
1704 1704 va = NEXT_ENTRY_VA(va, l + 1);
1705 1705 if (va >= eaddr)
1706 1706 break;
1707 1707 }
1708 1708 }
1709 1709 }
1710 1710
1711 1711 *vaddr = 0;
1712 1712 *htp = NULL;
1713 1713 return (0);
1714 1714 }
1715 1715
1716 1716 /*
1717 1717 * Find the htable and page table entry index of the given virtual address
1718 1718 * with pagesize at or below given level.
1719 1719 * If not found returns NULL. When found, returns the htable, sets
1720 1720 * entry, and has a hold on the htable.
1721 1721 */
1722 1722 htable_t *
1723 1723 htable_getpte(
1724 1724 struct hat *hat,
1725 1725 uintptr_t vaddr,
1726 1726 uint_t *entry,
1727 1727 x86pte_t *pte,
1728 1728 level_t level)
1729 1729 {
1730 1730 htable_t *ht;
1731 1731 level_t l;
1732 1732 uint_t e;
1733 1733
1734 1734 ASSERT(level <= mmu.max_page_level);
1735 1735
1736 1736 for (l = 0; l <= level; ++l) {
1737 1737 ht = htable_lookup(hat, vaddr, l);
1738 1738 if (ht == NULL)
1739 1739 continue;
1740 1740 e = htable_va2entry(vaddr, ht);
1741 1741 if (entry != NULL)
1742 1742 *entry = e;
1743 1743 if (pte != NULL)
1744 1744 *pte = x86pte_get(ht, e);
1745 1745 return (ht);
1746 1746 }
1747 1747 return (NULL);
1748 1748 }
1749 1749
1750 1750 /*
1751 1751 * Find the htable and page table entry index of the given virtual address.
1752 1752 * There must be a valid page mapped at the given address.
1753 1753 * If not found returns NULL. When found, returns the htable, sets
1754 1754 * entry, and has a hold on the htable.
1755 1755 */
1756 1756 htable_t *
1757 1757 htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
1758 1758 {
1759 1759 htable_t *ht;
1760 1760 uint_t e;
1761 1761 x86pte_t pte;
1762 1762
1763 1763 ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
1764 1764 if (ht == NULL)
1765 1765 return (NULL);
1766 1766
1767 1767 if (entry)
1768 1768 *entry = e;
1769 1769
1770 1770 if (PTE_ISPAGE(pte, ht->ht_level))
1771 1771 return (ht);
1772 1772 htable_release(ht);
1773 1773 return (NULL);
1774 1774 }
1775 1775
1776 1776
1777 1777 void
1778 1778 htable_init()
1779 1779 {
1780 1780 /*
1781 1781 * To save on kernel VA usage, we avoid debug information in 32 bit
1782 1782 * kernels.
1783 1783 */
1784 1784 #if defined(__amd64)
1785 1785 int kmem_flags = KMC_NOHASH;
1786 1786 #elif defined(__i386)
1787 1787 int kmem_flags = KMC_NOHASH | KMC_NODEBUG;
1788 1788 #endif
1789 1789
1790 1790 /*
1791 1791 * initialize kmem caches
1792 1792 */
1793 1793 htable_cache = kmem_cache_create("htable_t",
1794 1794 sizeof (htable_t), 0, NULL, NULL,
1795 1795 htable_reap, NULL, hat_memload_arena, kmem_flags);
1796 1796 }
1797 1797
1798 1798 /*
1799 1799 * get the pte index for the virtual address in the given htable's pagetable
1800 1800 */
1801 1801 uint_t
1802 1802 htable_va2entry(uintptr_t va, htable_t *ht)
1803 1803 {
1804 1804 level_t l = ht->ht_level;
1805 1805
1806 1806 ASSERT(va >= ht->ht_vaddr);
1807 1807 ASSERT(va <= HTABLE_LAST_PAGE(ht));
1808 1808 return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
1809 1809 }
1810 1810
1811 1811 /*
1812 1812 * Given an htable and the index of a pte in it, return the virtual address
1813 1813 * of the page.
1814 1814 */
1815 1815 uintptr_t
1816 1816 htable_e2va(htable_t *ht, uint_t entry)
1817 1817 {
1818 1818 level_t l = ht->ht_level;
1819 1819 uintptr_t va;
1820 1820
1821 1821 ASSERT(entry < HTABLE_NUM_PTES(ht));
1822 1822 va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));
1823 1823
1824 1824 /*
1825 1825 * Need to skip over any VA hole in top level table
1826 1826 */
1827 1827 #if defined(__amd64)
1828 1828 if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
1829 1829 va += ((mmu.hole_end - mmu.hole_start) + 1);
1830 1830 #endif
1831 1831
1832 1832 return (va);
1833 1833 }
1834 1834
1835 1835 /*
1836 1836  * The code uses compare and swap instructions to read/write PTEs to
1837 1837  * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems
1838 1838  * and a plain load or store of them will not naturally be atomic.
1839 1839 *
1840 1840 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
1841 1841 * are used to ensure that an interrupt won't overwrite a temporary mapping
1842 1842 * while it's in use. If an interrupt thread tries to access a PTE, it will
1843 1843 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
1844 1844 */
1845 1845 void
1846 1846 x86pte_cpu_init(cpu_t *cpu)
1847 1847 {
1848 1848 struct hat_cpu_info *hci;
1849 1849
1850 1850 hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
1851 1851 mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
1852 1852 cpu->cpu_hat_info = hci;
1853 1853 }
1854 1854
1855 1855 void
1856 1856 x86pte_cpu_fini(cpu_t *cpu)
1857 1857 {
1858 1858 struct hat_cpu_info *hci = cpu->cpu_hat_info;
1859 1859
1860 1860 kmem_free(hci, sizeof (*hci));
1861 1861 cpu->cpu_hat_info = NULL;
1862 1862 }
1863 1863
1864 1864 #ifdef __i386
1865 1865 /*
1866 1866 * On 32 bit kernels, loading a 64 bit PTE is a little tricky
1867 1867 */
1868 1868 x86pte_t
1869 1869 get_pte64(x86pte_t *ptr)
1870 1870 {
1871 1871 volatile uint32_t *p = (uint32_t *)ptr;
1872 1872 x86pte_t t;
1873 1873
1874 1874 ASSERT(mmu.pae_hat != 0);
1875 1875 for (;;) {
1876 1876 t = p[0];
1877 1877 t |= (uint64_t)p[1] << 32;
1878 1878 if ((t & 0xffffffff) == p[0])
1879 1879 return (t);
1880 1880 }
1881 1881 }
1882 1882 #endif /* __i386 */
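/*
 * Editorial note (not part of the webrev): the retry loop above guards
 * against a torn read.  If another thread stores a new PTE between the two
 * 32 bit loads and that store changes the low word, the re-read of p[0] no
 * longer matches and the loop goes around again, so the caller never sees a
 * value whose halves came from two different writes.  A standalone analogue
 * of the same pattern:
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	read64_split(volatile uint32_t *p)
 *	{
 *		uint64_t t;
 *
 *		for (;;) {
 *			t = p[0];
 *			t |= (uint64_t)p[1] << 32;
 *			if ((uint32_t)t == p[0])
 *				return (t);
 *		}
 *	}
 */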
1883 1883
1884 1884 /*
1885 1885 * Disable preemption and establish a mapping to the pagetable with the
1886 1886  * given pfn. This is optimized for the case where it's the same
1887 1887  * pfn we last referenced from this CPU.
1888 1888 */
1889 1889 static x86pte_t *
1890 1890 x86pte_access_pagetable(htable_t *ht, uint_t index)
1891 1891 {
1892 1892 /*
1893 1893 * VLP pagetables are contained in the hat_t
1894 1894 */
1895 1895 if (ht->ht_flags & HTABLE_VLP)
1896 1896 return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
1897 1897 return (x86pte_mapin(ht->ht_pfn, index, ht));
1898 1898 }
1899 1899
1900 1900 /*
1901 1901 * map the given pfn into the page table window.
1902 1902 */
1903 1903 /*ARGSUSED*/
1904 1904 x86pte_t *
1905 1905 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1906 1906 {
1907 1907 x86pte_t *pteptr;
1908 1908 x86pte_t pte = 0;
1909 1909 x86pte_t newpte;
1910 1910 int x;
1911 1911
1912 1912 ASSERT(pfn != PFN_INVALID);
1913 1913
1914 1914 if (!khat_running) {
1915 1915 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1916 1916 return (PT_INDEX_PTR(va, index));
1917 1917 }
1918 1918
1919 1919 /*
1920 1920 * If kpm is available, use it.
1921 1921 */
1922 1922 if (kpm_vbase)
1923 1923 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1924 1924
1925 1925 /*
1926 1926 * Disable preemption and grab the CPU's hci_mutex
1927 1927 */
1928 1928 kpreempt_disable();
1929 1929 ASSERT(CPU->cpu_hat_info != NULL);
1930 1930 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
1931 1931 x = PWIN_TABLE(CPU->cpu_id);
1932 1932 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1933 1933 #ifndef __xpv
1934 1934 if (mmu.pae_hat)
1935 1935 pte = *pteptr;
1936 1936 else
1937 1937 pte = *(x86pte32_t *)pteptr;
1938 1938 #endif
1939 1939
1940 1940 newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1941 1941
1942 1942 /*
1943 1943 * For hardware we can use a writable mapping.
1944 1944 */
1945 1945 #ifdef __xpv
1946 1946 if (IN_XPV_PANIC())
1947 1947 #endif
1948 1948 newpte |= PT_WRITABLE;
1949 1949
1950 1950 if (!PTE_EQUIV(newpte, pte)) {
1951 1951
1952 1952 #ifdef __xpv
1953 1953 if (!IN_XPV_PANIC()) {
1954 1954 xen_map(newpte, PWIN_VA(x));
1955 1955 } else
1956 1956 #endif
1957 1957 {
1958 1958 XPV_ALLOW_PAGETABLE_UPDATES();
1959 1959 if (mmu.pae_hat)
1960 1960 *pteptr = newpte;
1961 1961 else
1962 1962 *(x86pte32_t *)pteptr = newpte;
1963 1963 XPV_DISALLOW_PAGETABLE_UPDATES();
1964 1964 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
1965 1965 }
1966 1966 }
1967 1967 return (PT_INDEX_PTR(PWIN_VA(x), index));
1968 1968 }
1969 1969
1970 1970 /*
1971 1971 * Release access to a page table.
1972 1972 */
1973 1973 static void
1974 1974 x86pte_release_pagetable(htable_t *ht)
1975 1975 {
1976 1976 /*
1977 1977 * nothing to do for VLP htables
1978 1978 */
1979 1979 if (ht->ht_flags & HTABLE_VLP)
1980 1980 return;
1981 1981
1982 1982 x86pte_mapout();
1983 1983 }
1984 1984
1985 1985 void
1986 1986 x86pte_mapout(void)
1987 1987 {
1988 1988 if (kpm_vbase != NULL || !khat_running)
1989 1989 return;
1990 1990
1991 1991 /*
1992 1992 * Drop the CPU's hci_mutex and restore preemption.
1993 1993 */
1994 1994 #ifdef __xpv
1995 1995 if (!IN_XPV_PANIC()) {
1996 1996 uintptr_t va;
1997 1997
1998 1998 /*
1999 1999 * We need to always clear the mapping in case a page
2000 2000 * that was once a page table page is ballooned out.
2001 2001 */
2002 2002 va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
2003 2003 (void) HYPERVISOR_update_va_mapping(va, 0,
2004 2004 UVMF_INVLPG | UVMF_LOCAL);
2005 2005 }
2006 2006 #endif
2007 2007 mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2008 2008 kpreempt_enable();
2009 2009 }
2010 2010
2011 2011 /*
2012 2012 * Atomic retrieval of a pagetable entry
2013 2013 */
2014 2014 x86pte_t
2015 2015 x86pte_get(htable_t *ht, uint_t entry)
2016 2016 {
2017 2017 x86pte_t pte;
2018 2018 x86pte_t *ptep;
2019 2019
2020 2020 /*
2021 2021 * Be careful that loading PAE entries in 32 bit kernel is atomic.
2022 2022 */
2023 2023 ASSERT(entry < mmu.ptes_per_table);
2024 2024 ptep = x86pte_access_pagetable(ht, entry);
2025 2025 pte = GET_PTE(ptep);
2026 2026 x86pte_release_pagetable(ht);
2027 2027 return (pte);
2028 2028 }
2029 2029
2030 2030 /*
2031 2031  * Atomic unconditional set of a page table entry; it returns the previous
2032 2032 * value. For pre-existing mappings if the PFN changes, then we don't care
2033 2033 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
2034 2034 * the MOD/REF bits unchanged.
2035 2035 *
2036 2036 * If asked to overwrite a link to a lower page table with a large page
2037 2037 * mapping, this routine returns the special value of LPAGE_ERROR. This
2038 2038 * allows the upper HAT layers to retry with a smaller mapping size.
2039 2039 */
2040 2040 x86pte_t
2041 2041 x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
2042 2042 {
2043 2043 x86pte_t old;
2044 2044 x86pte_t prev;
2045 2045 x86pte_t *ptep;
2046 2046 level_t l = ht->ht_level;
2047 2047 x86pte_t pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
2048 2048 x86pte_t n;
2049 2049 uintptr_t addr = htable_e2va(ht, entry);
2050 2050 hat_t *hat = ht->ht_hat;
2051 2051
2052 2052 ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
2053 2053 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2054 2054 if (ptr == NULL)
2055 2055 ptep = x86pte_access_pagetable(ht, entry);
2056 2056 else
2057 2057 ptep = ptr;
2058 2058
2059 2059 /*
2060 2060 * Install the new PTE. If remapping the same PFN, then
2061 2061 * copy existing REF/MOD bits to new mapping.
2062 2062 */
2063 2063 do {
2064 2064 prev = GET_PTE(ptep);
2065 2065 n = new;
2066 2066 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2067 2067 n |= prev & (PT_REF | PT_MOD);
2068 2068
2069 2069 /*
2070 2070 * Another thread may have installed this mapping already,
2071 2071 * flush the local TLB and be done.
2072 2072 */
2073 2073 if (prev == n) {
2074 2074 old = new;
2075 2075 #ifdef __xpv
2076 2076 if (!IN_XPV_PANIC())
2077 2077 xen_flush_va((caddr_t)addr);
2078 2078 else
2079 2079 #endif
2080 2080 mmu_tlbflush_entry((caddr_t)addr);
2081 2081 goto done;
2082 2082 }
2083 2083
2084 2084 /*
2085 2085 * Detect if we have a collision of installing a large
2086 2086 * page mapping where there already is a lower page table.
2087 2087 */
2088 2088 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2089 2089 old = LPAGE_ERROR;
2090 2090 goto done;
2091 2091 }
2092 2092
2093 2093 XPV_ALLOW_PAGETABLE_UPDATES();
2094 2094 old = CAS_PTE(ptep, prev, n);
2095 2095 XPV_DISALLOW_PAGETABLE_UPDATES();
2096 2096 } while (old != prev);
2097 2097
2098 2098 /*
2099 2099 * Do a TLB demap if needed, ie. the old pte was valid.
2100 2100 *
2101 2101 * Note that a stale TLB writeback to the PTE here either can't happen
2102 2102 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
2103 2103 * mappings, but they were created with REF and MOD already set, so
2104 2104 * no stale writeback will happen.
2105 2105 *
2106 2106 * Segmap is the only place where remaps happen on the same pfn and for
2107 2107 * that we want to preserve the stale REF/MOD bits.
2108 2108 */
2109 2109 if (old & PT_REF)
2110 2110 hat_tlb_inval(hat, addr);
2111 2111
2112 2112 done:
2113 2113 if (ptr == NULL)
2114 2114 x86pte_release_pagetable(ht);
2115 2115 return (old);
2116 2116 }
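/*
 * Editorial example (not part of the webrev): the retry-and-preserve-REF/MOD
 * core of the loop above as a standalone userland sketch.  The __sync
 * builtin stands in for CAS_PTE(), and EX_REF/EX_MOD/EX_PFNMASK are made-up
 * masks playing the role of PT_REF/PT_MOD/PT_PADDR; this illustrates only
 * the CAS pattern, not the real PTE layout or the large page collision and
 * TLB handling.
 *
 *	#include <stdint.h>
 *
 *	#define	EX_REF		0x020ULL
 *	#define	EX_MOD		0x040ULL
 *	#define	EX_PFNMASK	(~0xfffULL)
 *
 *	static uint64_t
 *	install_pte(volatile uint64_t *ptep, uint64_t new)
 *	{
 *		uint64_t prev, n, old;
 *
 *		do {
 *			prev = *ptep;
 *			n = new;
 *			if ((prev & EX_PFNMASK) == (new & EX_PFNMASK))
 *				n |= prev & (EX_REF | EX_MOD);
 *			old = __sync_val_compare_and_swap(ptep, prev, n);
 *		} while (old != prev);
 *		return (old);
 *	}
 */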
2117 2117
2118 2118 /*
2119 2119 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2120 2120 * This is used for links between pagetables of different levels.
2121 2121 * Note we always create these links with dirty/access set, so they should
2122 2122 * never change.
2123 2123 */
2124 2124 x86pte_t
2125 2125 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2126 2126 {
2127 2127 x86pte_t pte;
2128 2128 x86pte_t *ptep;
2129 2129 #ifdef __xpv
2130 2130 /*
2131 2131 * We can't use writable pagetables for upper level tables, so fake it.
2132 2132 */
2133 2133 mmu_update_t t[2];
2134 2134 int cnt = 1;
2135 2135 int count;
2136 2136 maddr_t ma;
2137 2137
2138 2138 if (!IN_XPV_PANIC()) {
2139 2139 ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */
2140 2140 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2141 2141 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2142 2142 t[0].val = new;
2143 2143
2144 2144 #if defined(__amd64)
2145 2145 /*
2146 2146 * On the 64-bit hypervisor we need to maintain the user mode
2147 2147 * top page table too.
2148 2148 */
2149 2149 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2150 2150 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2151 2151 ht->ht_hat->hat_user_ptable), entry));
2152 2152 t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2153 2153 t[1].val = new;
2154 2154 ++cnt;
2155 2155 }
2156 2156 #endif /* __amd64 */
2157 2157
2158 2158 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2159 2159 panic("HYPERVISOR_mmu_update() failed");
2160 2160 ASSERT(count == cnt);
2161 2161 return (old);
2162 2162 }
2163 2163 #endif
2164 2164 ptep = x86pte_access_pagetable(ht, entry);
2165 2165 XPV_ALLOW_PAGETABLE_UPDATES();
2166 2166 pte = CAS_PTE(ptep, old, new);
2167 2167 XPV_DISALLOW_PAGETABLE_UPDATES();
2168 2168 x86pte_release_pagetable(ht);
2169 2169 return (pte);
2170 2170 }
2171 2171
2172 2172 /*
2173 2173 * Invalidate a page table entry as long as it currently maps something that
2174 2174 * matches the value determined by expect.
2175 2175 *
2176 2176 * Also invalidates any TLB entries and returns the previous value of the PTE.
2177 2177 */
2178 2178 x86pte_t
2179 2179 x86pte_inval(
2180 2180 htable_t *ht,
2181 2181 uint_t entry,
2182 2182 x86pte_t expect,
2183 2183 x86pte_t *pte_ptr)
2184 2184 {
2185 2185 x86pte_t *ptep;
2186 2186 x86pte_t oldpte;
2187 2187 x86pte_t found;
2188 2188
2189 2189 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2190 2190 ASSERT(ht->ht_level <= mmu.max_page_level);
2191 2191
2192 2192 if (pte_ptr != NULL)
2193 2193 ptep = pte_ptr;
2194 2194 else
2195 2195 ptep = x86pte_access_pagetable(ht, entry);
2196 2196
2197 2197 #if defined(__xpv)
2198 2198 /*
2199 2199 	 * If exit()ing, just use HYPERVISOR_mmu_update(), as we can't be racing
2200 2200 * with anything else.
2201 2201 */
2202 2202 if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
2203 2203 int count;
2204 2204 mmu_update_t t[1];
2205 2205 maddr_t ma;
2206 2206
2207 2207 oldpte = GET_PTE(ptep);
2208 2208 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2209 2209 goto done;
2210 2210 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2211 2211 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2212 2212 t[0].val = 0;
2213 2213 if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
2214 2214 panic("HYPERVISOR_mmu_update() failed");
2215 2215 ASSERT(count == 1);
2216 2216 goto done;
2217 2217 }
2218 2218 #endif /* __xpv */
2219 2219
2220 2220 /*
2221 2221 * Note that the loop is needed to handle changes due to h/w updating
2222 2222 * of PT_MOD/PT_REF.
2223 2223 */
2224 2224 do {
2225 2225 oldpte = GET_PTE(ptep);
2226 2226 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2227 2227 goto done;
2228 2228 XPV_ALLOW_PAGETABLE_UPDATES();
2229 2229 found = CAS_PTE(ptep, oldpte, 0);
2230 2230 XPV_DISALLOW_PAGETABLE_UPDATES();
2231 2231 } while (found != oldpte);
2232 2232 if (oldpte & (PT_REF | PT_MOD))
2233 2233 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2234 2234
2235 2235 done:
2236 2236 if (pte_ptr == NULL)
2237 2237 x86pte_release_pagetable(ht);
2238 2238 return (oldpte);
2239 2239 }
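/*
 * Editorial example (not part of the webrev): a sketch of how an unmap path
 * might consume the value returned by x86pte_inval() to propagate the
 * hardware REF/MOD bits to the page before the mapping is forgotten.  The
 * pp/hat_page_setattr() usage is an assumption about the caller, not code
 * from this file.
 *
 *	old_pte = x86pte_inval(ht, entry, expect, NULL);
 *	if (old_pte & PT_REF)
 *		hat_page_setattr(pp, P_REF);
 *	if (old_pte & PT_MOD)
 *		hat_page_setattr(pp, P_MOD);
 */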
2240 2240
2241 2241 /*
2242 2242  * Change a page table entry if it currently matches the value in expect.
2243 2243 */
2244 2244 x86pte_t
2245 2245 x86pte_update(
2246 2246 htable_t *ht,
2247 2247 uint_t entry,
2248 2248 x86pte_t expect,
2249 2249 x86pte_t new)
2250 2250 {
2251 2251 x86pte_t *ptep;
2252 2252 x86pte_t found;
2253 2253
2254 2254 ASSERT(new != 0);
2255 2255 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2256 2256 ASSERT(ht->ht_level <= mmu.max_page_level);
2257 2257
2258 2258 ptep = x86pte_access_pagetable(ht, entry);
2259 2259 XPV_ALLOW_PAGETABLE_UPDATES();
2260 2260 found = CAS_PTE(ptep, expect, new);
2261 2261 XPV_DISALLOW_PAGETABLE_UPDATES();
2262 2262 if (found == expect) {
2263 2263 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2264 2264
2265 2265 /*
2266 2266 * When removing write permission *and* clearing the
2267 2267 * MOD bit, check if a write happened via a stale
2268 2268 * TLB entry before the TLB shootdown finished.
2269 2269 *
2270 2270 * If it did happen, simply re-enable write permission and
2271 2271 * act like the original CAS failed.
2272 2272 */
2273 2273 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2274 2274 (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2275 2275 (GET_PTE(ptep) & PT_MOD) != 0) {
2276 2276 do {
2277 2277 found = GET_PTE(ptep);
2278 2278 XPV_ALLOW_PAGETABLE_UPDATES();
2279 2279 found =
2280 2280 CAS_PTE(ptep, found, found | PT_WRITABLE);
2281 2281 XPV_DISALLOW_PAGETABLE_UPDATES();
2282 2282 } while ((found & PT_WRITABLE) == 0);
2283 2283 }
2284 2284 }
2285 2285 x86pte_release_pagetable(ht);
2286 2286 return (found);
2287 2287 }
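/*
 * Editorial note (not part of the webrev): an illustrative timeline of the
 * race the MOD-bit re-check above closes, assuming CPU A is revoking write
 * permission while CPU B still holds a stale writable TLB entry:
 *
 *	CPU A: CAS_PTE() clears PT_WRITABLE and PT_MOD in the PTE
 *	CPU B: store through the stale TLB entry; hardware sets PT_MOD
 *	CPU A: hat_tlb_inval() completes the shootdown
 *	CPU A: sees PT_MOD set, CASes PT_WRITABLE back on, and returns a
 *	       value different from expect, so the caller treats the update
 *	       as a failed CAS and can retry
 */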
2288 2288
2289 2289 #ifndef __xpv
2290 2290 /*
2291 2291 * Copy page tables - this is just a little more complicated than the
2292 2292  * previous routines. Note that it's also not atomic! It is also never
2293 2293 * used for VLP pagetables.
2294 2294 */
2295 2295 void
2296 2296 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2297 2297 {
2298 2298 caddr_t src_va;
2299 2299 caddr_t dst_va;
2300 2300 size_t size;
2301 2301 x86pte_t *pteptr;
2302 2302 x86pte_t pte;
2303 2303
2304 2304 ASSERT(khat_running);
2305 2305 ASSERT(!(dest->ht_flags & HTABLE_VLP));
2306 2306 ASSERT(!(src->ht_flags & HTABLE_VLP));
2307 2307 ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2308 2308 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2309 2309
2310 2310 /*
2311 2311 * Acquire access to the CPU pagetable windows for the dest and source.
2312 2312 */
2313 2313 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2314 2314 if (kpm_vbase) {
2315 2315 src_va = (caddr_t)
2316 2316 PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2317 2317 } else {
2318 2318 uint_t x = PWIN_SRC(CPU->cpu_id);
2319 2319
2320 2320 /*
2321 2321 * Finish defining the src pagetable mapping
2322 2322 */
2323 2323 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2324 2324 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2325 2325 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2326 2326 if (mmu.pae_hat)
2327 2327 *pteptr = pte;
2328 2328 else
2329 2329 *(x86pte32_t *)pteptr = pte;
2330 2330 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2331 2331 }
2332 2332
2333 2333 /*
2334 2334 * now do the copy
2335 2335 */
2336 2336 size = count << mmu.pte_size_shift;
2337 2337 bcopy(src_va, dst_va, size);
2338 2338
2339 2339 x86pte_release_pagetable(dest);
2340 2340 }
2341 2341
2342 2342 #else /* __xpv */
2343 2343
2344 2344 /*
2345 2345 * The hypervisor only supports writable pagetables at level 0, so we have
2346 2346  * to install these one at a time the slow way.
2347 2347 */
2348 2348 void
2349 2349 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2350 2350 {
2351 2351 caddr_t src_va;
2352 2352 x86pte_t pte;
2353 2353
2354 2354 ASSERT(!IN_XPV_PANIC());
2355 2355 src_va = (caddr_t)x86pte_access_pagetable(src, entry);
2356 2356 while (count) {
2357 2357 if (mmu.pae_hat)
2358 2358 pte = *(x86pte_t *)src_va;
2359 2359 else
2360 2360 pte = *(x86pte32_t *)src_va;
2361 2361 if (pte != 0) {
2362 2362 set_pteval(pfn_to_pa(dest->ht_pfn), entry,
2363 2363 dest->ht_level, pte);
2364 2364 #ifdef __amd64
2365 2365 if (dest->ht_level == mmu.max_level &&
2366 2366 htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
2367 2367 set_pteval(
2368 2368 pfn_to_pa(dest->ht_hat->hat_user_ptable),
2369 2369 entry, dest->ht_level, pte);
2370 2370 #endif
2371 2371 }
2372 2372 --count;
2373 2373 ++entry;
2374 2374 src_va += mmu.pte_size;
2375 2375 }
2376 2376 x86pte_release_pagetable(src);
2377 2377 }
2378 2378 #endif /* __xpv */
2379 2379
2380 2380 /*
2381 2381 * Zero page table entries - Note this doesn't use atomic stores!
2382 2382 */
2383 2383 static void
2384 2384 x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2385 2385 {
2386 2386 caddr_t dst_va;
2387 2387 size_t size;
2388 2388 #ifdef __xpv
2389 2389 int x;
2390 2390 x86pte_t newpte;
2391 2391 #endif
2392 2392
2393 2393 /*
2394 2394 * Map in the page table to be zeroed.
2395 2395 */
2396 2396 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2397 2397 ASSERT(!(dest->ht_flags & HTABLE_VLP));
2398 2398
2399 2399 /*
2400 2400 * On the hypervisor we don't use x86pte_access_pagetable() since
2401 2401 * in this case the page is not pinned yet.
2402 2402 */
2403 2403 #ifdef __xpv
2404 2404 if (kpm_vbase == NULL) {
2405 2405 kpreempt_disable();
2406 2406 ASSERT(CPU->cpu_hat_info != NULL);
2407 2407 mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2408 2408 x = PWIN_TABLE(CPU->cpu_id);
2409 2409 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2410 2410 xen_map(newpte, PWIN_VA(x));
2411 2411 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2412 2412 } else
2413 2413 #endif
2414 2414 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2415 2415
2416 2416 size = count << mmu.pte_size_shift;
2417 2417 ASSERT(size > BLOCKZEROALIGN);
2418 2418 #ifdef __i386
2419 2419 if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
2420 2420 bzero(dst_va, size);
2421 2421 else
2422 2422 #endif
2423 2423 block_zero_no_xmm(dst_va, size);
2424 2424
2425 2425 #ifdef __xpv
2426 2426 if (kpm_vbase == NULL) {
2427 2427 xen_map(0, PWIN_VA(x));
2428 2428 mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2429 2429 kpreempt_enable();
2430 2430 } else
2431 2431 #endif
2432 2432 x86pte_release_pagetable(dest);
2433 2433 }
2434 2434
2435 2435 /*
2436 2436 * Called to ensure that all pagetables are in the system dump
2437 2437 */
2438 2438 void
2439 2439 hat_dump(void)
2440 2440 {
2441 2441 hat_t *hat;
2442 2442 uint_t h;
2443 2443 htable_t *ht;
2444 2444
2445 2445 /*
2446 2446 * Dump all page tables
2447 2447 */
2448 2448 for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2449 2449 for (h = 0; h < hat->hat_num_hash; ++h) {
2450 2450 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2451 2451 if ((ht->ht_flags & HTABLE_VLP) == 0)
2452 2452 dump_page(ht->ht_pfn);
2453 2453 }
2454 2454 }
2455 2455 }
2456 2456 }
1446 lines elided