XXXX pass in cpu_pause_func via pause_cpus
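The one functional change in this review is at old line 286, in mdboot(): pause_cpus() grows a second argument so callers can supply a cpu_pause_func handler. A minimal sketch of the before/after interface, assuming the new parameter is an optional handler run on the paused CPUs (the handler type below is an assumption drawn from the changeset synopsis, not something shown in this diff):

    /* old interface: pause all other CPUs with the stock pause loop */
    void pause_cpus(cpu_t *off_cp);

    /* assumed new interface: optional cpu_pause_func-style handler */
    void pause_cpus(cpu_t *off_cp, void *(*func)(void *));

    /* mdboot() keeps the default behavior by passing a NULL handler: */
    mutex_enter(&cpu_lock);
    pause_cpus(NULL, NULL);
    mutex_exit(&cpu_lock);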
--- old/usr/src/uts/i86pc/os/machdep.c
+++ new/usr/src/uts/i86pc/os/machdep.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright (c) 2010, Intel Corporation.
27 27 * All rights reserved.
28 28 */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/t_lock.h>
32 32 #include <sys/param.h>
33 33 #include <sys/segments.h>
34 34 #include <sys/sysmacros.h>
35 35 #include <sys/signal.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/user.h>
38 38 #include <sys/mman.h>
39 39 #include <sys/vm.h>
40 40
41 41 #include <sys/disp.h>
42 42 #include <sys/class.h>
43 43
44 44 #include <sys/proc.h>
45 45 #include <sys/buf.h>
46 46 #include <sys/kmem.h>
47 47
48 48 #include <sys/reboot.h>
49 49 #include <sys/uadmin.h>
50 50 #include <sys/callb.h>
51 51
52 52 #include <sys/cred.h>
53 53 #include <sys/vnode.h>
54 54 #include <sys/file.h>
55 55
56 56 #include <sys/procfs.h>
57 57 #include <sys/acct.h>
58 58
59 59 #include <sys/vfs.h>
60 60 #include <sys/dnlc.h>
61 61 #include <sys/var.h>
62 62 #include <sys/cmn_err.h>
63 63 #include <sys/utsname.h>
64 64 #include <sys/debug.h>
65 65
66 66 #include <sys/dumphdr.h>
67 67 #include <sys/bootconf.h>
68 68 #include <sys/varargs.h>
69 69 #include <sys/promif.h>
70 70 #include <sys/modctl.h>
71 71
72 72 #include <sys/consdev.h>
73 73 #include <sys/frame.h>
74 74
75 75 #include <sys/sunddi.h>
76 76 #include <sys/ddidmareq.h>
77 77 #include <sys/psw.h>
78 78 #include <sys/regset.h>
79 79 #include <sys/privregs.h>
80 80 #include <sys/clock.h>
81 81 #include <sys/tss.h>
82 82 #include <sys/cpu.h>
83 83 #include <sys/stack.h>
84 84 #include <sys/trap.h>
85 85 #include <sys/pic.h>
86 86 #include <vm/hat.h>
87 87 #include <vm/anon.h>
88 88 #include <vm/as.h>
89 89 #include <vm/page.h>
90 90 #include <vm/seg.h>
91 91 #include <vm/seg_kmem.h>
92 92 #include <vm/seg_map.h>
93 93 #include <vm/seg_vn.h>
94 94 #include <vm/seg_kp.h>
95 95 #include <vm/hat_i86.h>
96 96 #include <sys/swap.h>
97 97 #include <sys/thread.h>
98 98 #include <sys/sysconf.h>
99 99 #include <sys/vm_machparam.h>
100 100 #include <sys/archsystm.h>
101 101 #include <sys/machsystm.h>
102 102 #include <sys/machlock.h>
103 103 #include <sys/x_call.h>
104 104 #include <sys/instance.h>
105 105
106 106 #include <sys/time.h>
107 107 #include <sys/smp_impldefs.h>
108 108 #include <sys/psm_types.h>
109 109 #include <sys/atomic.h>
110 110 #include <sys/panic.h>
111 111 #include <sys/cpuvar.h>
112 112 #include <sys/dtrace.h>
113 113 #include <sys/bl.h>
114 114 #include <sys/nvpair.h>
115 115 #include <sys/x86_archext.h>
116 116 #include <sys/pool_pset.h>
117 117 #include <sys/autoconf.h>
118 118 #include <sys/mem.h>
119 119 #include <sys/dumphdr.h>
120 120 #include <sys/compress.h>
121 121 #include <sys/cpu_module.h>
122 122 #if defined(__xpv)
123 123 #include <sys/hypervisor.h>
124 124 #include <sys/xpv_panic.h>
125 125 #endif
126 126
127 127 #include <sys/fastboot.h>
128 128 #include <sys/machelf.h>
129 129 #include <sys/kobj.h>
130 130 #include <sys/multiboot.h>
131 131
132 132 #ifdef TRAPTRACE
133 133 #include <sys/traptrace.h>
134 134 #endif /* TRAPTRACE */
135 135
136 136 #include <c2/audit.h>
137 137 #include <sys/clock_impl.h>
138 138
139 139 extern void audit_enterprom(int);
140 140 extern void audit_exitprom(int);
141 141
142 142 /*
143 143 * Tunable to enable apix PSM; if set to 0, pcplusmp PSM will be used.
144 144 */
145 145 int apix_enable = 1;
146 146
147 147 int apic_nvidia_io_max = 0; /* no. of NVIDIA i/o apics */
148 148
149 149 /*
150 150 * Occasionally the kernel knows better whether to power-off or reboot.
151 151 */
152 152 int force_shutdown_method = AD_UNKNOWN;
153 153
154 154 /*
155 155 * The panicbuf array is used to record messages and state:
156 156 */
157 157 char panicbuf[PANICBUFSIZE];
158 158
159 159 /*
160 160 * Flags to control Dynamic Reconfiguration features.
161 161 */
162 162 uint64_t plat_dr_options;
163 163
164 164 /*
165 165 * Maximum physical address for memory DR operations.
166 166 */
167 167 uint64_t plat_dr_physmax;
168 168
169 169 /*
170 170 * maxphys - used during physio
171 171 * klustsize - used for klustering by swapfs and specfs
172 172 */
173 173 int maxphys = 56 * 1024; /* XXX See vm_subr.c - max b_count in physio */
174 174 int klustsize = 56 * 1024;
175 175
176 176 caddr_t p0_va; /* Virtual address for accessing physical page 0 */
177 177
178 178 /*
179 179 * defined here, though unused on x86,
180 180 * to make kstat_fr.c happy.
181 181 */
182 182 int vac;
183 183
184 184 void debug_enter(char *);
185 185
186 186 extern void pm_cfb_check_and_powerup(void);
187 187 extern void pm_cfb_rele(void);
188 188
189 189 extern fastboot_info_t newkernel;
190 190
191 191 /*
192 192 * Machine dependent code to reboot.
193 193 * "mdep" is interpreted as a character pointer; if non-null, it is a pointer
194 194 * to a string to be used as the argument string when rebooting.
195 195 *
196 196 * "invoke_cb" is a boolean. It is set to true when mdboot() can safely
197 197 * invoke CB_CL_MDBOOT callbacks before shutting the system down, i.e. when
198 198 * we are in a normal shutdown sequence (interrupts are not blocked, the
199 199 * system is not panicking or being suspended).
200 200 */
201 201 /*ARGSUSED*/
202 202 void
203 203 mdboot(int cmd, int fcn, char *mdep, boolean_t invoke_cb)
204 204 {
205 205 processorid_t bootcpuid = 0;
206 206 static int is_first_quiesce = 1;
207 207 static int is_first_reset = 1;
208 208 int reset_status = 0;
209 209 static char fallback_str[] = "Falling back to regular reboot.\n";
210 210
211 211 if (fcn == AD_FASTREBOOT && !newkernel.fi_valid)
212 212 fcn = AD_BOOT;
213 213
214 214 if (!panicstr) {
215 215 kpreempt_disable();
216 216 if (fcn == AD_FASTREBOOT) {
217 217 mutex_enter(&cpu_lock);
218 218 if (CPU_ACTIVE(cpu_get(bootcpuid))) {
219 219 affinity_set(bootcpuid);
220 220 }
221 221 mutex_exit(&cpu_lock);
222 222 } else {
223 223 affinity_set(CPU_CURRENT);
224 224 }
225 225 }
226 226
227 227 if (force_shutdown_method != AD_UNKNOWN)
228 228 fcn = force_shutdown_method;
229 229
230 230 /*
231 231 * XXX - rconsvp is set to NULL to ensure that output messages
232 232 * are sent to the underlying "hardware" device using the
233 233 * monitor's printf routine since we are in the process of
234 234 * either rebooting or halting the machine.
235 235 */
236 236 rconsvp = NULL;
237 237
238 238 /*
239 239 * Print the reboot message now, before pausing other cpus.
240 240 * There is a race condition in the printing support that
241 241 * can deadlock multiprocessor machines.
242 242 */
243 243 if (!(fcn == AD_HALT || fcn == AD_POWEROFF))
244 244 prom_printf("rebooting...\n");
245 245
246 246 if (IN_XPV_PANIC())
247 247 reset();
248 248
249 249 /*
250 250 * We can't bring up the console from above lock level, so do it now
251 251 */
252 252 pm_cfb_check_and_powerup();
253 253
254 254 /* make sure there are no more changes to the device tree */
255 255 devtree_freeze();
256 256
257 257 if (invoke_cb)
258 258 (void) callb_execute_class(CB_CL_MDBOOT, NULL);
259 259
260 260 /*
261 261 * Clear any unresolved UEs from memory.
262 262 */
263 263 page_retire_mdboot();
264 264
265 265 #if defined(__xpv)
266 266 /*
267 267 * XXPV Should probably think some more about how we deal
268 268 * with panicking before it's really safe to panic.
269 269 * On hypervisors, we reboot very quickly.. Perhaps panic
270 270 * should only attempt to recover by rebooting if,
271 271 * say, we were able to mount the root filesystem,
272 272 * or if we successfully launched init(1m).
273 273 */
274 274 if (panicstr && proc_init == NULL)
275 275 (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
276 276 #endif
277 277 /*
278 278 * stop other cpus and raise our priority. since there is only
279 279 * one active cpu after this, and our priority will be too high
280 280 * for us to be preempted, we're essentially single threaded
281 281 * from here on out.
282 282 */
283 283 (void) spl6();
284 284 if (!panicstr) {
285 285 mutex_enter(&cpu_lock);
286 - pause_cpus(NULL);
286 + pause_cpus(NULL, NULL);
287 287 mutex_exit(&cpu_lock);
288 288 }
289 289
290 290 /*
291 291 * If the system is panicking, the preloaded kernel is valid, and
292 292 * fastreboot_onpanic has been set, and the system has been up for
293 293 * longer than fastreboot_onpanic_uptime (default to 10 minutes),
294 294 * choose Fast Reboot.
295 295 */
296 296 if (fcn == AD_BOOT && panicstr && newkernel.fi_valid &&
297 297 fastreboot_onpanic &&
298 298 (panic_lbolt - lbolt_at_boot) > fastreboot_onpanic_uptime) {
299 299 fcn = AD_FASTREBOOT;
300 300 }
301 301
302 302 /*
303 303 * Try to quiesce devices.
304 304 */
305 305 if (is_first_quiesce) {
306 306 /*
307 307 * Clear is_first_quiesce before calling quiesce_devices()
308 308 * so that if quiesce_devices() causes panics, it will not
309 309 * be invoked again.
310 310 */
311 311 is_first_quiesce = 0;
312 312
313 313 quiesce_active = 1;
314 314 quiesce_devices(ddi_root_node(), &reset_status);
315 315 if (reset_status == -1) {
316 316 if (fcn == AD_FASTREBOOT && !force_fastreboot) {
317 317 prom_printf("Driver(s) not capable of fast "
318 318 "reboot.\n");
319 319 prom_printf(fallback_str);
320 320 fastreboot_capable = 0;
321 321 fcn = AD_BOOT;
322 322 } else if (fcn != AD_FASTREBOOT)
323 323 fastreboot_capable = 0;
324 324 }
325 325 quiesce_active = 0;
326 326 }
327 327
328 328 /*
329 329 * Try to reset devices. reset_leaves() should only be called
330 330 * a) when there are no other threads that could be accessing devices,
331 331 * and
332 332 * b) on a system that's not capable of fast reboot (fastreboot_capable
333 333 * being 0), or on a system where quiesce_devices() failed to
334 334 * complete (quiesce_active being 1).
335 335 */
336 336 if (is_first_reset && (!fastreboot_capable || quiesce_active)) {
337 337 /*
338 338 * Clear is_first_reset before calling reset_devices()
339 339 * so that if reset_devices() causes panics, it will not
340 340 * be invoked again.
341 341 */
342 342 is_first_reset = 0;
343 343 reset_leaves();
344 344 }
345 345
346 346 /* Verify newkernel checksum */
347 347 if (fastreboot_capable && fcn == AD_FASTREBOOT &&
348 348 fastboot_cksum_verify(&newkernel) != 0) {
349 349 fastreboot_capable = 0;
350 350 prom_printf("Fast reboot: checksum failed for the new "
351 351 "kernel.\n");
352 352 prom_printf(fallback_str);
353 353 }
354 354
355 355 (void) spl8();
356 356
357 357 if (fastreboot_capable && fcn == AD_FASTREBOOT) {
358 358 /*
359 359 * psm_shutdown is called within fast_reboot()
360 360 */
361 361 fast_reboot();
362 362 } else {
363 363 (*psm_shutdownf)(cmd, fcn);
364 364
365 365 if (fcn == AD_HALT || fcn == AD_POWEROFF)
366 366 halt((char *)NULL);
367 367 else
368 368 prom_reboot("");
369 369 }
370 370 /*NOTREACHED*/
371 371 }
372 372
373 373 /* mdpreboot - may be called prior to mdboot while root fs still mounted */
374 374 /*ARGSUSED*/
375 375 void
376 376 mdpreboot(int cmd, int fcn, char *mdep)
377 377 {
378 378 if (fcn == AD_FASTREBOOT && !fastreboot_capable) {
379 379 fcn = AD_BOOT;
380 380 #ifdef __xpv
381 381 cmn_err(CE_WARN, "Fast reboot is not supported on xVM");
382 382 #else
383 383 cmn_err(CE_WARN,
384 384 "Fast reboot is not supported on this platform%s",
385 385 fastreboot_nosup_message());
386 386 #endif
387 387 }
388 388
389 389 if (fcn == AD_FASTREBOOT) {
390 390 fastboot_load_kernel(mdep);
391 391 if (!newkernel.fi_valid)
392 392 fcn = AD_BOOT;
393 393 }
394 394
395 395 (*psm_preshutdownf)(cmd, fcn);
396 396 }
397 397
398 398 static void
399 399 stop_other_cpus(void)
400 400 {
401 401 ulong_t s = clear_int_flag(); /* fast way to keep CPU from changing */
402 402 cpuset_t xcset;
403 403
404 404 CPUSET_ALL_BUT(xcset, CPU->cpu_id);
405 405 xc_priority(0, 0, 0, CPUSET2BV(xcset), (xc_func_t)mach_cpu_halt);
406 406 restore_int_flag(s);
407 407 }
408 408
409 409 /*
410 410 * Machine dependent abort sequence handling
411 411 */
412 412 void
413 413 abort_sequence_enter(char *msg)
414 414 {
415 415 if (abort_enable == 0) {
416 416 if (AU_ZONE_AUDITING(GET_KCTX_GZ))
417 417 audit_enterprom(0);
418 418 return;
419 419 }
420 420 if (AU_ZONE_AUDITING(GET_KCTX_GZ))
421 421 audit_enterprom(1);
422 422 debug_enter(msg);
423 423 if (AU_ZONE_AUDITING(GET_KCTX_GZ))
424 424 audit_exitprom(1);
425 425 }
426 426
427 427 /*
428 428 * Enter debugger. Called when the user types ctrl-alt-d or whenever
429 429 * code wants to enter the debugger and possibly resume later.
430 430 */
431 431 void
432 432 debug_enter(
433 433 char *msg) /* message to print, possibly NULL */
434 434 {
435 435 if (dtrace_debugger_init != NULL)
436 436 (*dtrace_debugger_init)();
437 437
438 438 if (msg)
439 439 prom_printf("%s\n", msg);
440 440
441 441 if (boothowto & RB_DEBUG)
442 442 kmdb_enter();
443 443
444 444 if (dtrace_debugger_fini != NULL)
445 445 (*dtrace_debugger_fini)();
446 446 }
447 447
448 448 void
449 449 reset(void)
450 450 {
451 451 extern void acpi_reset_system();
452 452 #if !defined(__xpv)
453 453 ushort_t *bios_memchk;
454 454
455 455 /*
456 456 * Can't use psm_map_phys or acpi_reset_system before the hat is
457 457 * initialized.
458 458 */
459 459 if (khat_running) {
460 460 bios_memchk = (ushort_t *)psm_map_phys(0x472,
461 461 sizeof (ushort_t), PROT_READ | PROT_WRITE);
462 462 if (bios_memchk)
463 463 *bios_memchk = 0x1234; /* bios memory check disable */
464 464
465 465 if (options_dip != NULL &&
466 466 ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), 0,
467 467 "efi-systab")) {
468 468 efi_reset();
469 469 }
470 470
471 471 /*
472 472 * The problem with using stubs is that we can call
473 473 * acpi_reset_system only after the kernel is up and running.
474 474 *
475 475 * We should create a global state to keep track of how far
476 476 * up the kernel is but for the time being we will depend on
477 477 * bootops. bootops cleared in startup_end().
478 478 */
479 479 if (bootops == NULL)
480 480 acpi_reset_system();
481 481 }
482 482
483 483 pc_reset();
484 484 #else
485 485 if (IN_XPV_PANIC()) {
486 486 if (khat_running && bootops == NULL) {
487 487 acpi_reset_system();
488 488 }
489 489
490 490 pc_reset();
491 491 }
492 492
493 493 (void) HYPERVISOR_shutdown(SHUTDOWN_reboot);
494 494 panic("HYPERVISOR_shutdown() failed");
495 495 #endif
496 496 /*NOTREACHED*/
497 497 }
498 498
499 499 /*
500 500 * Halt the machine and return to the monitor
501 501 */
502 502 void
503 503 halt(char *s)
504 504 {
505 505 stop_other_cpus(); /* send stop signal to other CPUs */
506 506 if (s)
507 507 prom_printf("(%s) \n", s);
508 508 prom_exit_to_mon();
509 509 /*NOTREACHED*/
510 510 }
511 511
512 512 /*
513 513 * Initiate interrupt redistribution.
514 514 */
515 515 void
516 516 i_ddi_intr_redist_all_cpus()
517 517 {
518 518 }
519 519
520 520 /*
521 521 * XXX These probably ought to live somewhere else
522 522 * XXX They are called from mem.c
523 523 */
524 524
525 525 /*
526 526 * Convert page frame number to an OBMEM page frame number
527 527 * (i.e. put in the type bits -- zero for this implementation)
528 528 */
529 529 pfn_t
530 530 impl_obmem_pfnum(pfn_t pf)
531 531 {
532 532 return (pf);
533 533 }
534 534
535 535 #ifdef NM_DEBUG
536 536 int nmi_test = 0; /* checked in intentry.s during clock int */
537 537 int nmtest = -1;
538 538 nmfunc1(arg, rp)
539 539 int arg;
540 540 struct regs *rp;
541 541 {
542 542 printf("nmi called with arg = %x, regs = %x\n", arg, rp);
543 543 nmtest += 50;
544 544 if (arg == nmtest) {
545 545 printf("ip = %x\n", rp->r_pc);
546 546 return (1);
547 547 }
548 548 return (0);
549 549 }
550 550
551 551 #endif
552 552
553 553 #include <sys/bootsvcs.h>
554 554
555 555 /* Hacked up initialization for initial kernel check out is HERE. */
556 556 /* The basic steps are: */
557 557 /* kernel bootfuncs definition/initialization for KADB */
558 558 /* kadb bootfuncs pointer initialization */
559 559 /* putchar/getchar (interrupts disabled) */
560 560
561 561 /* kadb bootfuncs pointer initialization */
562 562
563 563 int
564 564 sysp_getchar()
565 565 {
566 566 int i;
567 567 ulong_t s;
568 568
569 569 if (cons_polledio == NULL) {
570 570 /* Uh oh */
571 571 prom_printf("getchar called with no console\n");
572 572 for (;;)
573 573 /* LOOP FOREVER */;
574 574 }
575 575
576 576 s = clear_int_flag();
577 577 i = cons_polledio->cons_polledio_getchar(
578 578 cons_polledio->cons_polledio_argument);
579 579 restore_int_flag(s);
580 580 return (i);
581 581 }
582 582
583 583 void
584 584 sysp_putchar(int c)
585 585 {
586 586 ulong_t s;
587 587
588 588 /*
589 589 * We have no alternative but to drop the output on the floor.
590 590 */
591 591 if (cons_polledio == NULL ||
592 592 cons_polledio->cons_polledio_putchar == NULL)
593 593 return;
594 594
595 595 s = clear_int_flag();
596 596 cons_polledio->cons_polledio_putchar(
597 597 cons_polledio->cons_polledio_argument, c);
598 598 restore_int_flag(s);
599 599 }
600 600
601 601 int
602 602 sysp_ischar()
603 603 {
604 604 int i;
605 605 ulong_t s;
606 606
607 607 if (cons_polledio == NULL ||
608 608 cons_polledio->cons_polledio_ischar == NULL)
609 609 return (0);
610 610
611 611 s = clear_int_flag();
612 612 i = cons_polledio->cons_polledio_ischar(
613 613 cons_polledio->cons_polledio_argument);
614 614 restore_int_flag(s);
615 615 return (i);
616 616 }
617 617
618 618 int
619 619 goany(void)
620 620 {
621 621 prom_printf("Type any key to continue ");
622 622 (void) prom_getchar();
623 623 prom_printf("\n");
624 624 return (1);
625 625 }
626 626
627 627 static struct boot_syscalls kern_sysp = {
628 628 sysp_getchar, /* unchar (*getchar)(); 7 */
629 629 sysp_putchar, /* int (*putchar)(); 8 */
630 630 sysp_ischar, /* int (*ischar)(); 9 */
631 631 };
632 632
633 633 #if defined(__xpv)
634 634 int using_kern_polledio;
635 635 #endif
636 636
637 637 void
638 638 kadb_uses_kernel()
639 639 {
640 640 /*
641 641 * This routine is now totally misnamed, since it does not in fact
642 642 * control kadb's I/O; it only controls the kernel's prom_* I/O.
643 643 */
644 644 sysp = &kern_sysp;
645 645 #if defined(__xpv)
646 646 using_kern_polledio = 1;
647 647 #endif
648 648 }
649 649
650 650 /*
651 651 * the interface to the outside world
652 652 */
653 653
654 654 /*
655 655 * poll_port -- wait for a register to achieve a
656 656 * specific state. Arguments are a mask of bits we care about,
657 657 * and two sub-masks. To return normally, all the bits in the
658 658 * first sub-mask must be ON, all the bits in the second sub-
659 659 * mask must be OFF. If about 5 seconds pass without the register
660 660 * achieving the desired bit configuration, we return 1, else
661 661 * 0.
662 662 */
663 663 int
664 664 poll_port(ushort_t port, ushort_t mask, ushort_t onbits, ushort_t offbits)
665 665 {
666 666 int i;
667 667 ushort_t maskval;
668 668
669 669 for (i = 500000; i; i--) {
670 670 maskval = inb(port) & mask;
671 671 if (((maskval & onbits) == onbits) &&
672 672 ((maskval & offbits) == 0))
673 673 return (0);
674 674 drv_usecwait(10);
675 675 }
676 676 return (1);
677 677 }
678 678
679 679 /*
680 680 * set_idle_cpu is called from idle() when a CPU becomes idle.
681 681 */
682 682 /*LINTED: static unused */
683 683 static uint_t last_idle_cpu;
684 684
685 685 /*ARGSUSED*/
686 686 void
687 687 set_idle_cpu(int cpun)
688 688 {
689 689 last_idle_cpu = cpun;
690 690 (*psm_set_idle_cpuf)(cpun);
691 691 }
692 692
693 693 /*
694 694 * unset_idle_cpu is called from idle() when a CPU is no longer idle.
695 695 */
696 696 /*ARGSUSED*/
697 697 void
698 698 unset_idle_cpu(int cpun)
699 699 {
700 700 (*psm_unset_idle_cpuf)(cpun);
701 701 }
702 702
703 703 /*
704 704 * This routine is almost correct now, but not quite. It still needs the
705 705 * equivalent concept of "hres_last_tick", just like on the sparc side.
706 706 * The idea is to take a snapshot of the hi-res timer while doing the
707 707 * hrestime_adj updates under hres_lock in locore, so that the small
708 708 * interval between interrupt assertion and interrupt processing is
709 709 * accounted for correctly. Once we have this, the code below should
710 710 * be modified to subtract off hres_last_tick rather than hrtime_base.
711 711 *
712 712 * I'd have done this myself, but I don't have source to all of the
713 713 * vendor-specific hi-res timer routines (grrr...). The generic hook I
714 714 * need is something like "gethrtime_unlocked()", which would be just like
715 715 * gethrtime() but would assume that you're already holding CLOCK_LOCK().
716 716 * This is what the GET_HRTIME() macro is for on sparc (although it also
717 717 * serves the function of making time available without a function call
718 718 * so you don't take a register window overflow while traps are disabled).
719 719 */
720 720 void
721 721 pc_gethrestime(timestruc_t *tp)
722 722 {
723 723 int lock_prev;
724 724 timestruc_t now;
725 725 int nslt; /* nsec since last tick */
726 726 int adj; /* amount of adjustment to apply */
727 727
728 728 loop:
729 729 lock_prev = hres_lock;
730 730 now = hrestime;
731 731 nslt = (int)(gethrtime() - hres_last_tick);
732 732 if (nslt < 0) {
733 733 /*
734 734 * nslt < 0 means a tick came between sampling
735 735 * gethrtime() and hres_last_tick; restart the loop
736 736 */
737 737
738 738 goto loop;
739 739 }
740 740 now.tv_nsec += nslt;
741 741 if (hrestime_adj != 0) {
742 742 if (hrestime_adj > 0) {
743 743 adj = (nslt >> ADJ_SHIFT);
744 744 if (adj > hrestime_adj)
745 745 adj = (int)hrestime_adj;
746 746 } else {
747 747 adj = -(nslt >> ADJ_SHIFT);
748 748 if (adj < hrestime_adj)
749 749 adj = (int)hrestime_adj;
750 750 }
751 751 now.tv_nsec += adj;
752 752 }
753 753 while ((unsigned long)now.tv_nsec >= NANOSEC) {
754 754
755 755 /*
756 756 * We might have a large adjustment or have been in the
757 757 * debugger for a long time; take care of (at most) four
758 758 * of those missed seconds (tv_nsec is 32 bits, so
759 759 * anything >4s will be wrapping around). However,
760 760 * anything more than 2 seconds out of sync will trigger
761 761 * timedelta from clock() to go correct the time anyway,
762 762 * so do what we can, and let the big crowbar do the
763 763 * rest. A similar correction while loop exists inside
764 764 * hres_tick(); in all cases we'd like tv_nsec to
765 765 * satisfy 0 <= tv_nsec < NANOSEC to avoid confusing
766 766 * user processes, but if tv_sec's a little behind for a
767 767 * little while, that's OK; time still monotonically
768 768 * increases.
769 769 */
770 770
771 771 now.tv_nsec -= NANOSEC;
772 772 now.tv_sec++;
773 773 }
774 774 if ((hres_lock & ~1) != lock_prev)
775 775 goto loop;
776 776
777 777 *tp = now;
778 778 }
779 779
780 780 void
781 781 gethrestime_lasttick(timespec_t *tp)
782 782 {
783 783 int s;
784 784
785 785 s = hr_clock_lock();
786 786 *tp = hrestime;
787 787 hr_clock_unlock(s);
788 788 }
789 789
790 790 time_t
791 791 gethrestime_sec(void)
792 792 {
793 793 timestruc_t now;
794 794
795 795 gethrestime(&now);
796 796 return (now.tv_sec);
797 797 }
798 798
799 799 /*
800 800 * Initialize a kernel thread's stack
801 801 */
802 802
803 803 caddr_t
804 804 thread_stk_init(caddr_t stk)
805 805 {
806 806 ASSERT(((uintptr_t)stk & (STACK_ALIGN - 1)) == 0);
807 807 return (stk - SA(MINFRAME));
808 808 }
809 809
810 810 /*
811 811 * Initialize lwp's kernel stack.
812 812 */
813 813
814 814 #ifdef TRAPTRACE
815 815 /*
816 816 * There's a tricky interdependency here between use of sysenter and
817 817 * TRAPTRACE which needs recording to avoid future confusion (this is
818 818 * about the third time I've re-figured this out ..)
819 819 *
820 820 * Here's how debugging lcall works with TRAPTRACE.
821 821 *
822 822 * 1 We're in userland with a breakpoint on the lcall instruction.
823 823 * 2 We execute the instruction - the instruction pushes the userland
824 824 * %ss, %esp, %efl, %cs, %eip on the stack and zips into the kernel
825 825 * via the call gate.
826 826 * 3 The hardware raises a debug trap in kernel mode, the hardware
827 827 * pushes %efl, %cs, %eip and gets to dbgtrap via the idt.
828 828 * 4 dbgtrap pushes the error code and trapno and calls cmntrap
829 829 * 5 cmntrap finishes building a trap frame
830 830 * 6 The TRACE_REGS macros in cmntrap copy a REGSIZE worth chunk
831 831 * off the stack into the traptrace buffer.
832 832 *
833 833 * This means that the traptrace buffer contains the wrong values in
834 834 * %esp and %ss, but everything else in there is correct.
835 835 *
836 836 * Here's how debugging sysenter works with TRAPTRACE.
837 837 *
838 838 * a We're in userland with a breakpoint on the sysenter instruction.
839 839 * b We execute the instruction - the instruction pushes -nothing-
840 840 * on the stack, but sets %cs, %eip, %ss, %esp to prearranged
841 841 * values to take us to sys_sysenter, at the top of the lwp's
842 842 * stack.
843 843 * c goto 3
844 844 *
845 845 * At this point, because we got into the kernel without the requisite
846 846 * five pushes on the stack, if we didn't make extra room, we'd
847 847 * end up with the TRACE_REGS macro fetching the saved %ss and %esp
848 848 * values from negative (unmapped) stack addresses -- which really bites.
849 849 * That's why we do the '-= 8' below.
850 850 *
851 851 * XXX Note that reading "up" lwp0's stack works because t0 is declared
852 852 * right next to t0stack in locore.s
853 853 */
854 854 #endif
855 855
856 856 caddr_t
857 857 lwp_stk_init(klwp_t *lwp, caddr_t stk)
858 858 {
859 859 caddr_t oldstk;
860 860 struct pcb *pcb = &lwp->lwp_pcb;
861 861
862 862 oldstk = stk;
863 863 stk -= SA(sizeof (struct regs) + SA(MINFRAME));
864 864 #ifdef TRAPTRACE
865 865 stk -= 2 * sizeof (greg_t); /* space for phony %ss:%sp (see above) */
866 866 #endif
867 867 stk = (caddr_t)((uintptr_t)stk & ~(STACK_ALIGN - 1ul));
868 868 bzero(stk, oldstk - stk);
869 869 lwp->lwp_regs = (void *)(stk + SA(MINFRAME));
870 870
871 871 /*
872 872 * Arrange that the virtualized %fs and %gs GDT descriptors
873 873 * have a well-defined initial state (present, ring 3
874 874 * and of type data).
875 875 */
876 876 #if defined(__amd64)
877 877 if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE)
878 878 pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc;
879 879 else
880 880 pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_u32desc;
881 881 #elif defined(__i386)
882 882 pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc;
883 883 #endif /* __i386 */
884 884 lwp_installctx(lwp);
885 885 return (stk);
886 886 }
887 887
888 888 /*ARGSUSED*/
889 889 void
890 890 lwp_stk_fini(klwp_t *lwp)
891 891 {}
892 892
893 893 /*
894 894 * If we're not the panic CPU, we wait in panic_idle for reboot.
895 895 */
896 896 void
897 897 panic_idle(void)
898 898 {
899 899 splx(ipltospl(CLOCK_LEVEL));
900 900 (void) setjmp(&curthread->t_pcb);
901 901
902 902 dumpsys_helper();
903 903
904 904 #ifndef __xpv
905 905 for (;;)
906 906 i86_halt();
907 907 #else
908 908 for (;;)
909 909 ;
910 910 #endif
911 911 }
912 912
913 913 /*
914 914 * Stop the other CPUs by cross-calling them and forcing them to enter
915 915 * the panic_idle() loop above.
916 916 */
917 917 /*ARGSUSED*/
918 918 void
919 919 panic_stopcpus(cpu_t *cp, kthread_t *t, int spl)
920 920 {
921 921 processorid_t i;
922 922 cpuset_t xcset;
923 923
924 924 /*
925 925 * In the case of a Xen panic, the hypervisor has already stopped
926 926 * all of the CPUs.
927 927 */
928 928 if (!IN_XPV_PANIC()) {
929 929 (void) splzs();
930 930
931 931 CPUSET_ALL_BUT(xcset, cp->cpu_id);
932 932 xc_priority(0, 0, 0, CPUSET2BV(xcset), (xc_func_t)panic_idle);
933 933 }
934 934
935 935 for (i = 0; i < NCPU; i++) {
936 936 if (i != cp->cpu_id && cpu[i] != NULL &&
937 937 (cpu[i]->cpu_flags & CPU_EXISTS))
938 938 cpu[i]->cpu_flags |= CPU_QUIESCED;
939 939 }
940 940 }
941 941
942 942 /*
943 943 * Platform callback following each entry to panicsys().
944 944 */
945 945 /*ARGSUSED*/
946 946 void
947 947 panic_enter_hw(int spl)
948 948 {
949 949 /* Nothing to do here */
950 950 }
951 951
952 952 /*
953 953 * Platform-specific code to execute after panicstr is set: we invoke
954 954 * the PSM entry point to indicate that a panic has occurred.
955 955 */
956 956 /*ARGSUSED*/
957 957 void
958 958 panic_quiesce_hw(panic_data_t *pdp)
959 959 {
960 960 psm_notifyf(PSM_PANIC_ENTER);
961 961
962 962 cmi_panic_callback();
963 963
964 964 #ifdef TRAPTRACE
965 965 /*
966 966 * Turn off TRAPTRACE
967 967 */
968 968 TRAPTRACE_FREEZE;
969 969 #endif /* TRAPTRACE */
970 970 }
971 971
972 972 /*
973 973 * Platform callback prior to writing crash dump.
974 974 */
975 975 /*ARGSUSED*/
976 976 void
977 977 panic_dump_hw(int spl)
978 978 {
979 979 /* Nothing to do here */
980 980 }
981 981
982 982 void *
983 983 plat_traceback(void *fpreg)
984 984 {
985 985 #ifdef __xpv
986 986 if (IN_XPV_PANIC())
987 987 return (xpv_traceback(fpreg));
988 988 #endif
989 989 return (fpreg);
990 990 }
991 991
992 992 /*ARGSUSED*/
993 993 void
994 994 plat_tod_fault(enum tod_fault_type tod_bad)
995 995 {}
996 996
997 997 /*ARGSUSED*/
998 998 int
999 999 blacklist(int cmd, const char *scheme, nvlist_t *fmri, const char *class)
1000 1000 {
1001 1001 return (ENOTSUP);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * The underlying console output routines are protected by raising IPL in case
1006 1006 * we are still calling into the early boot services. Once we start calling
1007 1007 * the kernel console emulator, it will disable interrupts completely during
1008 1008 * character rendering (see sysp_putchar, for example). Refer to the comments
1009 1009 * and code in common/os/console.c for more information on these callbacks.
1010 1010 */
1011 1011 /*ARGSUSED*/
1012 1012 int
1013 1013 console_enter(int busy)
1014 1014 {
1015 1015 return (splzs());
1016 1016 }
1017 1017
1018 1018 /*ARGSUSED*/
1019 1019 void
1020 1020 console_exit(int busy, int spl)
1021 1021 {
1022 1022 splx(spl);
1023 1023 }
1024 1024
1025 1025 /*
1026 1026 * Allocate a region of virtual address space, unmapped.
1027 1027 * Stubbed out except on sparc, at least for now.
1028 1028 */
1029 1029 /*ARGSUSED*/
1030 1030 void *
1031 1031 boot_virt_alloc(void *addr, size_t size)
1032 1032 {
1033 1033 return (addr);
1034 1034 }
1035 1035
1036 1036 volatile unsigned long tenmicrodata;
1037 1037
1038 1038 void
1039 1039 tenmicrosec(void)
1040 1040 {
1041 1041 extern int gethrtime_hires;
1042 1042
1043 1043 if (gethrtime_hires) {
1044 1044 hrtime_t start, end;
1045 1045 start = end = gethrtime();
1046 1046 while ((end - start) < (10 * (NANOSEC / MICROSEC))) {
1047 1047 SMT_PAUSE();
1048 1048 end = gethrtime();
1049 1049 }
1050 1050 } else {
1051 1051 #if defined(__xpv)
1052 1052 hrtime_t newtime;
1053 1053
1054 1054 newtime = xpv_gethrtime() + 10000; /* now + 10 us */
1055 1055 while (xpv_gethrtime() < newtime)
1056 1056 SMT_PAUSE();
1057 1057 #else /* __xpv */
1058 1058 int i;
1059 1059
1060 1060 /*
1061 1061 * Artificial loop to induce delay.
1062 1062 */
1063 1063 for (i = 0; i < microdata; i++)
1064 1064 tenmicrodata = microdata;
1065 1065 #endif /* __xpv */
1066 1066 }
1067 1067 }
1068 1068
1069 1069 /*
1070 1070 * get_cpu_mstate() is passed an array of timestamps, NCMSTATES
1071 1071 * long, and it fills in the array with the time spent on cpu in
1072 1072 * each of the mstates, where time is returned in nsec.
1073 1073 *
1074 1074 * No guarantee is made that the returned values in times[] will
1075 1075 * monotonically increase on sequential calls, although this will
1076 1076 * be true in the long run. Any such guarantee must be handled by
1077 1077 * the caller, if needed. This can happen if we fail to account
1078 1078 * for elapsed time due to a generation counter conflict, yet we
1079 1079 * did account for it on a prior call (see below).
1080 1080 *
1081 1081 * The complication is that the cpu in question may be updating
1082 1082 * its microstate at the same time that we are reading it.
1083 1083 * Because the microstate is only updated when the CPU's state
1084 1084 * changes, the values in cpu_intracct[] can be indefinitely out
1085 1085 * of date. To determine true current values, it is necessary to
1086 1086 * compare the current time with cpu_mstate_start, and add the
1087 1087 * difference to times[cpu_mstate].
1088 1088 *
1089 1089 * This can be a problem if those values are changing out from
1090 1090 * under us. Because the code path in new_cpu_mstate() is
1091 1091 * performance critical, we have not added a lock to it. Instead,
1092 1092 * we have added a generation counter. Before beginning
1093 1093 * modifications, the counter is set to 0. After modifications,
1094 1094 * it is set to the old value plus one.
1095 1095 *
1096 1096 * get_cpu_mstate() will not consider the values of cpu_mstate
1097 1097 * and cpu_mstate_start to be usable unless the value of
1098 1098 * cpu_mstate_gen is both non-zero and unchanged, both before and
1099 1099 * after reading the mstate information. Note that we must
1100 1100 * protect against out-of-order loads around accesses to the
1101 1101 * generation counter. Also, this is a best effort approach in
1102 1102 * that we do not retry should the counter be found to have
1103 1103 * changed.
1104 1104 *
1105 1105 * cpu_intracct[] is used to identify time spent in each CPU
1106 1106 * mstate while handling interrupts. Such time should be reported
1107 1107 * against system time, and so is subtracted out from its
1108 1108 * corresponding cpu_acct[] time and added to
1109 1109 * cpu_acct[CMS_SYSTEM].
1110 1110 */
1111 1111
1112 1112 void
1113 1113 get_cpu_mstate(cpu_t *cpu, hrtime_t *times)
1114 1114 {
1115 1115 int i;
1116 1116 hrtime_t now, start;
1117 1117 uint16_t gen;
1118 1118 uint16_t state;
1119 1119 hrtime_t intracct[NCMSTATES];
1120 1120
1121 1121 /*
1122 1122 * Load all volatile state under the protection of membar.
1123 1123 * cpu_acct[cpu_mstate] must be loaded to avoid double counting
1124 1124 * of (now - cpu_mstate_start) by a change in CPU mstate that
1125 1125 * arrives after we make our last check of cpu_mstate_gen.
1126 1126 */
1127 1127
1128 1128 now = gethrtime_unscaled();
1129 1129 gen = cpu->cpu_mstate_gen;
1130 1130
1131 1131 membar_consumer(); /* guarantee load ordering */
1132 1132 start = cpu->cpu_mstate_start;
1133 1133 state = cpu->cpu_mstate;
1134 1134 for (i = 0; i < NCMSTATES; i++) {
1135 1135 intracct[i] = cpu->cpu_intracct[i];
1136 1136 times[i] = cpu->cpu_acct[i];
1137 1137 }
1138 1138 membar_consumer(); /* guarantee load ordering */
1139 1139
1140 1140 if (gen != 0 && gen == cpu->cpu_mstate_gen && now > start)
1141 1141 times[state] += now - start;
1142 1142
1143 1143 for (i = 0; i < NCMSTATES; i++) {
1144 1144 if (i == CMS_SYSTEM)
1145 1145 continue;
1146 1146 times[i] -= intracct[i];
1147 1147 if (times[i] < 0) {
1148 1148 intracct[i] += times[i];
1149 1149 times[i] = 0;
1150 1150 }
1151 1151 times[CMS_SYSTEM] += intracct[i];
1152 1152 scalehrtime(&times[i]);
1153 1153 }
1154 1154 scalehrtime(&times[CMS_SYSTEM]);
1155 1155 }
1156 1156
1157 1157 /*
1158 1158 * This is a version of the rdmsr instruction that allows
1159 1159 * an error code to be returned in the case of failure.
1160 1160 */
1161 1161 int
1162 1162 checked_rdmsr(uint_t msr, uint64_t *value)
1163 1163 {
1164 1164 if (!is_x86_feature(x86_featureset, X86FSET_MSR))
1165 1165 return (ENOTSUP);
1166 1166 *value = rdmsr(msr);
1167 1167 return (0);
1168 1168 }
1169 1169
1170 1170 /*
1171 1171 * This is a version of the wrmsr instruction that allows
1172 1172 * an error code to be returned in the case of failure.
1173 1173 */
1174 1174 int
1175 1175 checked_wrmsr(uint_t msr, uint64_t value)
1176 1176 {
1177 1177 if (!is_x86_feature(x86_featureset, X86FSET_MSR))
1178 1178 return (ENOTSUP);
1179 1179 wrmsr(msr, value);
1180 1180 return (0);
1181 1181 }
1182 1182
1183 1183 /*
1184 1184 * The mem driver's usual method of using hat_devload() to establish a
1185 1185 * temporary mapping will not work for foreign pages mapped into this
1186 1186 * domain or for the special hypervisor-provided pages. For the foreign
1187 1187 * pages, we often don't know which domain owns them, so we can't ask the
1188 1188 * hypervisor to set up a new mapping. For the other pages, we don't have
1189 1189 * a pfn, so we can't create a new PTE. For these special cases, we do a
1190 1190 * direct uiomove() from the existing kernel virtual address.
1191 1191 */
1192 1192 /*ARGSUSED*/
1193 1193 int
1194 1194 plat_mem_do_mmio(struct uio *uio, enum uio_rw rw)
1195 1195 {
1196 1196 #if defined(__xpv)
1197 1197 void *va = (void *)(uintptr_t)uio->uio_loffset;
1198 1198 off_t pageoff = uio->uio_loffset & PAGEOFFSET;
1199 1199 size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
1200 1200 (size_t)uio->uio_iov->iov_len);
1201 1201
1202 1202 if ((rw == UIO_READ &&
1203 1203 (va == HYPERVISOR_shared_info || va == xen_info)) ||
1204 1204 (pfn_is_foreign(hat_getpfnum(kas.a_hat, va))))
1205 1205 return (uiomove(va, nbytes, rw, uio));
1206 1206 #endif
1207 1207 return (ENOTSUP);
1208 1208 }
1209 1209
1210 1210 pgcnt_t
1211 1211 num_phys_pages()
1212 1212 {
1213 1213 pgcnt_t npages = 0;
1214 1214 struct memlist *mp;
1215 1215
1216 1216 #if defined(__xpv)
1217 1217 if (DOMAIN_IS_INITDOMAIN(xen_info))
1218 1218 return (xpv_nr_phys_pages());
1219 1219 #endif /* __xpv */
1220 1220
1221 1221 for (mp = phys_install; mp != NULL; mp = mp->ml_next)
1222 1222 npages += mp->ml_size >> PAGESHIFT;
1223 1223
1224 1224 return (npages);
1225 1225 }
1226 1226
1227 1227 /* cpu threshold for compressed dumps */
1228 1228 #ifdef _LP64
1229 1229 uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_64_MINCPU;
1230 1230 #else
1231 1231 uint_t dump_plat_mincpu_default = DUMP_PLAT_X86_32_MINCPU;
1232 1232 #endif
1233 1233
1234 1234 int
1235 1235 dump_plat_addr()
1236 1236 {
1237 1237 #ifdef __xpv
1238 1238 pfn_t pfn = mmu_btop(xen_info->shared_info) | PFN_IS_FOREIGN_MFN;
1239 1239 mem_vtop_t mem_vtop;
1240 1240 int cnt;
1241 1241
1242 1242 /*
1243 1243 * On the hypervisor, we want to dump the page with shared_info on it.
1244 1244 */
1245 1245 if (!IN_XPV_PANIC()) {
1246 1246 mem_vtop.m_as = &kas;
1247 1247 mem_vtop.m_va = HYPERVISOR_shared_info;
1248 1248 mem_vtop.m_pfn = pfn;
1249 1249 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1250 1250 cnt = 1;
1251 1251 } else {
1252 1252 cnt = dump_xpv_addr();
1253 1253 }
1254 1254 return (cnt);
1255 1255 #else
1256 1256 return (0);
1257 1257 #endif
1258 1258 }
1259 1259
1260 1260 void
1261 1261 dump_plat_pfn()
1262 1262 {
1263 1263 #ifdef __xpv
1264 1264 pfn_t pfn = mmu_btop(xen_info->shared_info) | PFN_IS_FOREIGN_MFN;
1265 1265
1266 1266 if (!IN_XPV_PANIC())
1267 1267 dumpvp_write(&pfn, sizeof (pfn));
1268 1268 else
1269 1269 dump_xpv_pfn();
1270 1270 #endif
1271 1271 }
1272 1272
1273 1273 /*ARGSUSED*/
1274 1274 int
1275 1275 dump_plat_data(void *dump_cbuf)
1276 1276 {
1277 1277 #ifdef __xpv
1278 1278 uint32_t csize;
1279 1279 int cnt;
1280 1280
1281 1281 if (!IN_XPV_PANIC()) {
1282 1282 csize = (uint32_t)compress(HYPERVISOR_shared_info, dump_cbuf,
1283 1283 PAGESIZE);
1284 1284 dumpvp_write(&csize, sizeof (uint32_t));
1285 1285 dumpvp_write(dump_cbuf, csize);
1286 1286 cnt = 1;
1287 1287 } else {
1288 1288 cnt = dump_xpv_data(dump_cbuf);
1289 1289 }
1290 1290 return (cnt);
1291 1291 #else
1292 1292 return (0);
1293 1293 #endif
1294 1294 }
1295 1295
1296 1296 /*
1297 1297 * Calculates a linear address, given the CS selector and PC values
1298 1298 * by looking up the %cs selector in the process's LDT or the CPU's GDT.
1299 1299 * proc->p_ldtlock must be held across this call.
1300 1300 */
1301 1301 int
1302 1302 linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
1303 1303 {
1304 1304 user_desc_t *descrp;
1305 1305 caddr_t baseaddr;
1306 1306 uint16_t idx = SELTOIDX(rp->r_cs);
1307 1307
1308 1308 ASSERT(rp->r_cs <= 0xFFFF);
1309 1309 ASSERT(MUTEX_HELD(&p->p_ldtlock));
1310 1310
1311 1311 if (SELISLDT(rp->r_cs)) {
1312 1312 /*
1313 1313 * Currently 64 bit processes cannot have private LDTs.
1314 1314 */
1315 1315 ASSERT(p->p_model != DATAMODEL_LP64);
1316 1316
1317 1317 if (p->p_ldt == NULL)
1318 1318 return (-1);
1319 1319
1320 1320 descrp = &p->p_ldt[idx];
1321 1321 baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1322 1322
1323 1323 /*
1324 1324 * Calculate the linear address (wraparound is not only ok,
1325 1325 * it's expected behavior). The cast to uint32_t is because
1326 1326 * LDT selectors are only allowed in 32-bit processes.
1327 1327 */
1328 1328 *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
1329 1329 rp->r_pc);
1330 1330 } else {
1331 1331 #ifdef DEBUG
1332 1332 descrp = &CPU->cpu_gdt[idx];
1333 1333 baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1334 1334 /* GDT-based descriptors' base addresses should always be 0 */
1335 1335 ASSERT(baseaddr == 0);
1336 1336 #endif
1337 1337 *linearp = (caddr_t)(uintptr_t)rp->r_pc;
1338 1338 }
1339 1339
1340 1340 return (0);
1341 1341 }
1342 1342
1343 1343 /*
1344 1344 * The implementation of dtrace_linear_pc is similar to that of
1345 1345 * linear_pc, above, but here we acquire p_ldtlock before accessing
1346 1346 * p_ldt. This implementation is used by the pid provider; we prefix
1347 1347 * it with "dtrace_" to avoid inducing spurious tracing events.
1348 1348 */
1349 1349 int
1350 1350 dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp)
1351 1351 {
1352 1352 user_desc_t *descrp;
1353 1353 caddr_t baseaddr;
1354 1354 uint16_t idx = SELTOIDX(rp->r_cs);
1355 1355
1356 1356 ASSERT(rp->r_cs <= 0xFFFF);
1357 1357
1358 1358 if (SELISLDT(rp->r_cs)) {
1359 1359 /*
1360 1360 * Currently 64 bit processes cannot have private LDTs.
1361 1361 */
1362 1362 ASSERT(p->p_model != DATAMODEL_LP64);
1363 1363
1364 1364 mutex_enter(&p->p_ldtlock);
1365 1365 if (p->p_ldt == NULL) {
1366 1366 mutex_exit(&p->p_ldtlock);
1367 1367 return (-1);
1368 1368 }
1369 1369 descrp = &p->p_ldt[idx];
1370 1370 baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1371 1371 mutex_exit(&p->p_ldtlock);
1372 1372
1373 1373 /*
1374 1374 * Calculate the linear address (wraparound is not only ok,
1375 1375 * it's expected behavior). The cast to uint32_t is because
1376 1376 * LDT selectors are only allowed in 32-bit processes.
1377 1377 */
1378 1378 *linearp = (caddr_t)(uintptr_t)(uint32_t)((uintptr_t)baseaddr +
1379 1379 rp->r_pc);
1380 1380 } else {
1381 1381 #ifdef DEBUG
1382 1382 descrp = &CPU->cpu_gdt[idx];
1383 1383 baseaddr = (caddr_t)(uintptr_t)USEGD_GETBASE(descrp);
1384 1384 /* GDT-based descriptors' base addresses should always be 0 */
1385 1385 ASSERT(baseaddr == 0);
1386 1386 #endif
1387 1387 *linearp = (caddr_t)(uintptr_t)rp->r_pc;
1388 1388 }
1389 1389
1390 1390 return (0);
1391 1391 }
1392 1392
1393 1393 /*
1394 1394 * We need to post a soft interrupt to reprogram the lbolt cyclic when
1395 1395 * switching from event to cyclic driven lbolt. The following code adds
1396 1396 * and posts the softint for x86.
1397 1397 */
1398 1398 static ddi_softint_hdl_impl_t lbolt_softint_hdl =
1399 1399 {0, NULL, NULL, NULL, 0, NULL, NULL, NULL};
1400 1400
1401 1401 void
1402 1402 lbolt_softint_add(void)
1403 1403 {
1404 1404 (void) add_avsoftintr((void *)&lbolt_softint_hdl, LOCK_LEVEL,
1405 1405 (avfunc)lbolt_ev_to_cyclic, "lbolt_ev_to_cyclic", NULL, NULL);
1406 1406 }
1407 1407
1408 1408 void
1409 1409 lbolt_softint_post(void)
1410 1410 {
1411 1411 (*setsoftint)(CBE_LOCK_PIL, lbolt_softint_hdl.ih_pending);
1412 1412 }
1413 1413
1414 1414 boolean_t
1415 1415 plat_dr_check_capability(uint64_t features)
1416 1416 {
1417 1417 return ((plat_dr_options & features) == features);
1418 1418 }
1419 1419
1420 1420 boolean_t
1421 1421 plat_dr_support_cpu(void)
1422 1422 {
1423 1423 return (plat_dr_options & PLAT_DR_FEATURE_CPU);
1424 1424 }
1425 1425
1426 1426 boolean_t
1427 1427 plat_dr_support_memory(void)
1428 1428 {
1429 1429 return (plat_dr_options & PLAT_DR_FEATURE_MEMORY);
1430 1430 }
1431 1431
1432 1432 void
1433 1433 plat_dr_enable_capability(uint64_t features)
1434 1434 {
1435 1435 atomic_or_64(&plat_dr_options, features);
1436 1436 }
1437 1437
1438 1438 void
1439 1439 plat_dr_disable_capability(uint64_t features)
1440 1440 {
1441 1441 atomic_and_64(&plat_dr_options, ~features);
1442 1442 }