/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/t_lock.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/vm.h>
#include <sys/cpu.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_sfmmu.h>
#include <sys/debug.h>
#include <sys/cpu_module.h>
#include <sys/mem_cage.h>

/*
 * A quick way to generate a cache consistent address to map in a page.
 * users: ppcopy, pagezero, /proc, dev/mem
 *
 * The ppmapin/ppmapout routines provide a quick way of generating a cache
 * consistent address by reserving a given amount of kernel address space.
 * The base is PPMAPBASE and its size is PPMAPSIZE.  This memory is divided
 * into x number of sets, where x is the number of colors for the virtual
 * cache.  The number of colors is how many times a page can be mapped
 * simultaneously in the cache.  For direct mapped caches this translates
 * to the number of pages in the cache.
 * Each set is assigned a group of virtual pages from the reserved memory,
 * depending on its virtual color.
 * When trying to assign a virtual address we first find the color of the
 * physical page in question (if applicable), and then look for an
 * available virtual page in the set of the appropriate color.
 */
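
/*
 * Illustration (hypothetical numbers, not tied to any particular cpu):
 * with an MMU_PAGESIZE of 8K and a shm_alignment of 64K there are 8
 * virtual colors, so ppmap_pages == 8 and nsets == PPMAPSIZE / 64K.
 * The slot for (color, set) then lives at
 * ppmap_vaddrs[color * nsets + set] and holds the virtual address
 * PPMAPBASE + color * MMU_PAGESIZE + set * shm_alignment.
 */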

#define	clsettoarray(color, set)	(((color) * nsets) + (set))

int pp_slots = 4;		/* small default, tuned by cpu module */

/* tuned by cpu module, default is "safe" */
int pp_consistent_coloring = PPAGE_STORES_POLLUTE | PPAGE_LOADS_POLLUTE;
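
/*
 * With the "safe" default above, both loads and stores are assumed to
 * leave data in the VAC, so pp_load_tlb() must honor a page's existing
 * virtual color (and fail for a page that has none); cpu modules whose
 * caches do not have this property can relax it.
 */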

static caddr_t	ppmap_vaddrs[PPMAPSIZE / MMU_PAGESIZE];
static int	nsets;			/* number of sets */
static int	ppmap_pages;		/* generate align mask */
static int	ppmap_shift;		/* set selector */

#ifdef PPDEBUG
#define		MAXCOLORS	16	/* for debug only */
static int	ppalloc_noslot = 0;	/* # of fallbacks to the kernel heap */
static int	align_hits[MAXCOLORS];
static int	pp_allocs;		/* # of ppmapin requests */
#endif /* PPDEBUG */

/*
 * There are only 64 TLB entries on spitfire, 16 on cheetah
 * (fully-associative TLB) so we allow the cpu module to tune the
 * number to use here via pp_slots.
 */
static struct ppmap_va {
	caddr_t	ppmap_slots[MAXPP_SLOTS];
} ppmap_va[NCPU];

void
ppmapinit(void)
{
	int color, nset, setsize;
	caddr_t va;

	ASSERT(pp_slots <= MAXPP_SLOTS);

	va = (caddr_t)PPMAPBASE;
	if (cache & CACHE_VAC) {
		int a;

		ppmap_pages = mmu_btop(shm_alignment);
		nsets = PPMAPSIZE / shm_alignment;
		setsize = shm_alignment;
		ppmap_shift = MMU_PAGESHIFT;
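		/*
		 * Fold log2(ppmap_pages) into ppmap_shift so that
		 * (va >> ppmap_shift) & (nsets - 1) recovers the set a
		 * ppmap va belongs to; ppmapout() depends on this.
		 */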
		a = ppmap_pages;
		while (a >>= 1)
			ppmap_shift++;
	} else {
		/*
		 * If we do not have a virtually indexed cache we simply
		 * have only one set containing all pages.
		 */
		ppmap_pages = 1;
		nsets = mmu_btop(PPMAPSIZE);
		setsize = MMU_PAGESIZE;
		ppmap_shift = MMU_PAGESHIFT;
	}
	for (color = 0; color < ppmap_pages; color++) {
		for (nset = 0; nset < nsets; nset++) {
			ppmap_vaddrs[clsettoarray(color, nset)] =
			    (caddr_t)((uintptr_t)va + (nset * setsize));
		}
		va += MMU_PAGESIZE;
	}
}

/*
 * Allocate a cache consistent virtual address to map a page, pp,
 * with protection, vprot, and map it in the MMU, using the most
 * efficient means possible.  The hint argument is a virtual address
 * which, when masked, yields a cache color to use when allocating an
 * address to map in the page.  A hint of -1 means you don't care, for
 * instance pagezero.
 *
 * Machine dependent: depends on the virtual address space layout and
 * understands that all kernel addresses have bit 31 set.
 *
 * NOTE: For sun4 platforms the meaning of the hint argument is opposite from
 * that found in other architectures.  In other architectures the hint
 * (called avoid) was used to ask ppmapin NOT to use the specified cache color.
 * This was used to avoid virtual cache thrashing in the bcopy.  Unfortunately,
 * in the case of a COW, this later caused a cache aliasing conflict.  On
 * sun4, the bcopy routine uses the block ld/st instructions, so we don't
 * have to worry about virtual cache thrashing.  In fact, by using the hint
 * to choose the right color we can almost guarantee a cache conflict will
 * not occur.
 */

caddr_t
ppmapin(page_t *pp, uint_t vprot, caddr_t hint)
{
	int color, nset, index, start;
	caddr_t va;

#ifdef PPDEBUG
	pp_allocs++;
#endif /* PPDEBUG */
	if (cache & CACHE_VAC) {
		color = sfmmu_get_ppvcolor(pp);
		if (color == -1) {
			if ((intptr_t)hint != -1L) {
				color = addr_to_vcolor(hint);
			} else {
				color = addr_to_vcolor(mmu_ptob(pp->p_pagenum));
			}
		}
	} else {
		/*
		 * For physical caches, we can pick any address we want.
		 */
		color = 0;
	}

	start = color;
	do {
		for (nset = 0; nset < nsets; nset++) {
			index = clsettoarray(color, nset);
			va = ppmap_vaddrs[index];
			if (va != NULL) {
#ifdef PPDEBUG
				align_hits[color]++;
#endif /* PPDEBUG */
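				/*
				 * Atomically claim the slot; if another
				 * cpu won the race, keep searching.
				 */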
				if (casptr(&ppmap_vaddrs[index],
				    va, NULL) == va) {
					hat_memload(kas.a_hat, va, pp,
					    vprot | HAT_NOSYNC,
					    HAT_LOAD_LOCK);
					return (va);
				}
			}
		}
		/*
		 * First pick didn't succeed, try another color.
		 */
		if (++color == ppmap_pages)
			color = 0;
	} while (color != start);

#ifdef PPDEBUG
	ppalloc_noslot++;
#endif /* PPDEBUG */

	/*
	 * No free slots; fall back to allocating a page of kernel heap.
	 */
	va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);

	hat_memload(kas.a_hat, va, pp, vprot | HAT_NOSYNC, HAT_LOAD_LOCK);

	return (va);
}
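
/*
 * A minimal usage sketch (hypothetical caller; real callers such as
 * pagezero() hold the page locked for the duration of the mapping):
 *
 *	caddr_t va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
 *	bzero(va, PAGESIZE);
 *	ppmapout(va);
 */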

void
ppmapout(caddr_t va)
{
	int color, nset, index;

	if (va >= kernelheap && va < ekernelheap) {
		/*
		 * Space came from the kernel heap; flush the page and
		 * return the space.
		 */
		hat_unload(kas.a_hat, va, PAGESIZE,
		    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));
		vmem_free(heap_arena, va, PAGESIZE);
	} else {
		/*
		 * Space came from ppmap_vaddrs[], give it back.
		 */
		color = addr_to_vcolor(va);
		ASSERT((cache & CACHE_VAC) ? (color < ppmap_pages) : 1);

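		/*
		 * ppmap_shift is log2(setsize), so the shift strips the
		 * in-set offset and the mask recovers the set index
		 * assigned in ppmapinit().
		 */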
		nset = ((uintptr_t)va >> ppmap_shift) & (nsets - 1);
		index = clsettoarray(color, nset);
		hat_unload(kas.a_hat, va, PAGESIZE,
		    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));

		ASSERT(ppmap_vaddrs[index] == NULL);
		ppmap_vaddrs[index] = va;
	}
}

#ifdef DEBUG
#define	PP_STAT_ADD(stat)	(stat)++
uint_t pload, ploadfail;
uint_t ppzero, ppzero_short;
#else
#define	PP_STAT_ADD(stat)
#endif /* DEBUG */

/*
 * Find a slot in the per CPU page copy area.  Load up a locked TLB
 * entry on the running cpu.  We don't call the hat layer to load up
 * the tte since the mapping is only temporary.  If the thread migrates
 * it'll get a TLB miss trap and the TLB/TSB miss handler will panic
 * since there is no official hat record of this mapping.
 */
static caddr_t
pp_load_tlb(processorid_t cpu, caddr_t **pslot, page_t *pp, uint_t prot)
{
	struct ppmap_va	*ppmap;
	tte_t		tte;
	caddr_t		*myslot;
	caddr_t		va;
	long		i, start, stride;
	int		vcolor;
	uint_t		flags, strict_flag;

	PP_STAT_ADD(pload);

	ppmap = &ppmap_va[cpu];
	va = (caddr_t)(PPMAP_FAST_BASE + (MMU_PAGESIZE * MAXPP_SLOTS) * cpu);
	myslot = ppmap->ppmap_slots;
	ASSERT(addr_to_vcolor(va) == 0);

	if (prot & TTE_HWWR_INT) {
		flags = PPAGE_STORE_VCOLORING | PPAGE_STORES_POLLUTE;
		strict_flag = PPAGE_STORES_POLLUTE;
	} else {
		flags = PPAGE_LOAD_VCOLORING | PPAGE_LOADS_POLLUTE;
		strict_flag = PPAGE_LOADS_POLLUTE;
	}

	/*
	 * If consistent handling is required then keep the current
	 * vcolor of the page.  Furthermore, if loads or stores can
	 * pollute the VAC then using a "new" page (unassigned vcolor)
	 * won't work and we have to return a failure.
	 */
	if (pp_consistent_coloring & flags) {
		vcolor = sfmmu_get_ppvcolor(pp);
		if ((vcolor == -1) &&
		    (pp_consistent_coloring & strict_flag))
			return (NULL);
		/* else keep the current vcolor of the page */
	} else {
		vcolor = -1;
	}

	if (vcolor != -1) {
		va += MMU_PAGESIZE * vcolor;
		start = vcolor;
		stride = ppmap_pages;	/* number of colors */
		myslot += vcolor;
	} else {
		start = 0;
		stride = 1;
	}

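	/*
	 * With a fixed vcolor, only slots of that color are candidates,
	 * hence the stride of ppmap_pages; otherwise any free slot will
	 * do and we walk them one by one.
	 */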
	for (i = start; i < pp_slots; i += stride) {
		if (*myslot == NULL) {
			if (casptr(myslot, NULL, va) == NULL)
				break;
		}
		myslot += stride;
		va += MMU_PAGESIZE * stride;
	}

	if (i >= pp_slots) {
		PP_STAT_ADD(ploadfail);
		return (NULL);
	}

	ASSERT(vcolor == -1 || addr_to_vcolor(va) == vcolor);

	/*
	 * Now we have a slot we can use, make the tte.
	 */
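	/*
	 * The entry is loaded locked (TTE_LCK_INT): the TLB must not
	 * evict it, since there is no hat/TSB record to refill it from
	 * (see the block comment above this function).
	 */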
	tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(pp->p_pagenum);
	tte.tte_intlo = TTE_PFN_INTLO(pp->p_pagenum) | TTE_CP_INT |
	    TTE_CV_INT | TTE_PRIV_INT | TTE_LCK_INT | prot;

	ASSERT(CPU->cpu_id == cpu);
	sfmmu_dtlb_ld_kva(va, &tte);

	*pslot = myslot;	/* Return ptr to the slot we used. */

	return (va);
}

static void
pp_unload_tlb(caddr_t *pslot, caddr_t va)
{
	ASSERT(*pslot == va);

	vtag_flushpage(va, (uint64_t)ksfmmup);
	*pslot = NULL;			/* release the slot */
}

/*
 * Common copy routine which attempts to use hwblkpagecopy.  If this routine
 * can't be used, failure (0) will be returned.  Otherwise, a PAGESIZE page
 * will be copied and success (1) will be returned.
 */
int
ppcopy_common(page_t *fm_pp, page_t *to_pp)
{
	caddr_t fm_va, to_va;
	caddr_t *fm_slot, *to_slot;
	processorid_t cpu;
	label_t ljb;
	int ret = 1;

	ASSERT(fm_pp != NULL && PAGE_LOCKED(fm_pp));
	ASSERT(to_pp != NULL && PAGE_LOCKED(to_pp));

	/*
	 * If we can't use VIS block loads and stores we can't use
	 * pp_load_tlb/pp_unload_tlb due to the possibility of
	 * d$ aliasing.
	 */
	if (!use_hw_bcopy && (cache & CACHE_VAC))
		return (0);

	kpreempt_disable();
	cpu = CPU->cpu_id;
	fm_va = pp_load_tlb(cpu, &fm_slot, fm_pp, 0);
	if (fm_va == NULL) {
		kpreempt_enable();
		return (0);
	}
	to_va = pp_load_tlb(cpu, &to_slot, to_pp, TTE_HWWR_INT);
	if (to_va == NULL) {
		pp_unload_tlb(fm_slot, fm_va);
		kpreempt_enable();
		return (0);
	}
	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	hwblkpagecopy(fm_va, to_va);
	no_fault();
faulted:
	ASSERT(CPU->cpu_id == cpu);
	pp_unload_tlb(fm_slot, fm_va);
	pp_unload_tlb(to_slot, to_va);
	kpreempt_enable();
	return (ret);
}

/*
 * Routine to copy kernel pages during relocation.  It will copy one
 * PAGESIZE page to another PAGESIZE page.  This function may be called
 * above LOCK_LEVEL so it should not grab any locks.
 */
void
ppcopy_kernel__relocatable(page_t *fm_pp, page_t *to_pp)
{
	uint64_t fm_pa, to_pa;
	size_t nbytes;

	fm_pa = (uint64_t)(fm_pp->p_pagenum) << MMU_PAGESHIFT;
	to_pa = (uint64_t)(to_pp->p_pagenum) << MMU_PAGESHIFT;

	nbytes = MMU_PAGESIZE;

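	/*
	 * hw_pa_bcopy32() copies 32 bytes from one physical address to
	 * another, so no temporary virtual mappings (and hence no locks)
	 * are needed here.
	 */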
	for (; nbytes > 0; fm_pa += 32, to_pa += 32, nbytes -= 32)
		hw_pa_bcopy32(fm_pa, to_pa);
}

/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp".
 *
 * Try the per cpu mapping first; if that fails, call ppmapin to
 * load it.
 *
 * Returns one on success or zero on some sort of fault while doing the copy.
 */
int
ppcopy(page_t *fm_pp, page_t *to_pp)
{
	caddr_t fm_va, to_va;
	label_t ljb;
	int ret = 1;
	boolean_t	use_kpm = B_FALSE;

	/* Try the fast path first */
	if (ppcopy_common(fm_pp, to_pp))
		return (1);

	/*
	 * Try to map using KPM if enabled and we are the cageout thread.
	 * If it fails, fall back to ppmapin/ppmapout.
	 */
	if (kpm_enable) {
		if (curthread == kcage_cageout_thread)
			use_kpm = B_TRUE;
	}

	if (use_kpm) {
		if ((fm_va = hat_kpm_mapin(fm_pp, NULL)) == NULL ||
		    (to_va = hat_kpm_mapin(to_pp, NULL)) == NULL) {
			if (fm_va != NULL)
				hat_kpm_mapout(fm_pp, NULL, fm_va);
			use_kpm = B_FALSE;
		}
	}

	if (use_kpm == B_FALSE) {
		/* do the slow path */
		fm_va = ppmapin(fm_pp, PROT_READ, (caddr_t)-1);
		to_va = ppmapin(to_pp, PROT_READ | PROT_WRITE, fm_va);
	}
	/* protect the copy for both the KPM and ppmapin paths */
	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	bcopy(fm_va, to_va, PAGESIZE);
	no_fault();
faulted:
	/* unmap */
	if (use_kpm == B_TRUE) {
		hat_kpm_mapout(fm_pp, NULL, fm_va);
		hat_kpm_mapout(to_pp, NULL, to_va);
	} else {
		ppmapout(fm_va);
		ppmapout(to_va);
	}
	return (ret);
}

/*
 * Zero the physical page given by `pp' from off to off + len,
 * without changing the reference and modified bits of the page.
 *
 * Again, we'll try the per cpu mapping first.
 */
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	caddr_t va;
	caddr_t *slot;
	int fast = 1;
	processorid_t cpu;
	extern int hwblkclr(void *, size_t);
	extern int use_hw_bzero;

	ASSERT((int)len > 0 && (int)off >= 0 && off + len <= PAGESIZE);
	ASSERT(PAGE_LOCKED(pp));

	PP_STAT_ADD(ppzero);

	if (len != MMU_PAGESIZE || !use_hw_bzero) {
		/*
		 * The fast path doesn't do anything about VAC coloring,
		 * so it is only safe when hwblkclr() is guaranteed to
		 * use the block commit h/w; otherwise fall back to a
		 * properly colored ppmapin() mapping.
		 */
		fast = 0;
		va = NULL;
		PP_STAT_ADD(ppzero_short);
	}

	kpreempt_disable();

	if (fast) {
		cpu = CPU->cpu_id;
		va = pp_load_tlb(cpu, &slot, pp, TTE_HWWR_INT);
	}

	if (va == NULL) {
		/*
		 * We get here if len != MMU_PAGESIZE, if pp_load_tlb()
		 * returned NULL, or if use_hw_bzero is disabled.
		 */
		va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
		fast = 0;
	}

	if (hwblkclr(va + off, len)) {
		/*
		 * We may not have used the block commit asi,
		 * so flush the I-$ manually.
		 */
		ASSERT(fast == 0);

		sync_icache(va + off, len);
	} else {
		/*
		 * We used the block commit asi, which flushed the I-$.
		 * However, we may still have an instruction in the
		 * pipeline; only a flush instruction will invalidate
		 * that.
		 */
		doflush(va);
	}

	if (fast) {
		ASSERT(CPU->cpu_id == cpu);
		pp_unload_tlb(slot, va);
	} else {
		ppmapout(va);
	}

	kpreempt_enable();
}