XXXX introduce drv_sectohz
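
This change replaces the timeout idiom drv_usectohz(seconds * MICROSEC) with a
dedicated seconds-to-ticks helper, drv_sectohz(), at the call sites shown in
the hunks below.  As a rough illustration only (the helper's definition is not
part of these hunks, and it is assumed here to be a thin wrapper over the
existing drv_usectohz() DDI routine):

	/* Illustrative sketch, not the actual definition. */
	#define	drv_sectohz(sec)	(drv_usectohz((sec) * MICROSEC))

With the helper in place, a call such as

	(void) timeout(cpu_delayed_check_ce_errors, arg,
	    drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));

becomes

	(void) timeout(cpu_delayed_check_ce_errors, arg,
	    drv_sectohz((clock_t)cpu_ceen_delay_secs));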


2423                  *         on_trap protection)
2424                  *      4. The error is on a retired page
2425                  *
2426                  * Note: AFLT_PROT_EC is used in places other than the memory
2427                  * scrubber.  However, none of those errors should occur
2428                  * on a retired page.
2429                  */
2430                 if ((ch_flt->afsr_errs &
2431                     (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE &&
2432                     aflt->flt_prot == AFLT_PROT_EC) {
2433 
2434                         if (page_retire_check(aflt->flt_addr, NULL) == 0) {
2435                                 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2436 
2437                                 /*
2438                                  * Since we're skipping logging, we'll need
2439                                  * to schedule the re-enabling of CEEN
2440                                  */
2441                                 (void) timeout(cpu_delayed_check_ce_errors,
2442                                     (void *)(uintptr_t)aflt->flt_inst,
2443                                     drv_usectohz((clock_t)cpu_ceen_delay_secs
2444                                     * MICROSEC));
2445                                 }
2446 
2447                                 /*
2448                                  * Inform memscrubber - scrubbing induced
2449                                  * CE on a retired page.
2450                                  */
2451                                 memscrub_induced_error();
2452                                 return (0);
2453                         }
2454                 }
2455 
2456                 /*
2457                  * Perform/schedule further classification actions, but
2458                  * only if the page is healthy (we don't want bad
2459                  * pages inducing too much diagnostic activity).  If we could
2460                  * not find a page pointer then we also skip this.  If
2461                  * ce_scrub_xdiag_recirc returns nonzero then it has chosen
2462                  * to copy and recirculate the event (for further diagnostics)
2463                  * and we should not proceed to log it here.
2464                  *


2651                             ch_flt->flt_bit);
2652         }
2653 
2654         if (aflt->flt_func != NULL)
2655                 aflt->flt_func(aflt, unum);
2656 
2657         if (afar_status != AFLT_STAT_INVALID)
2658                 cpu_log_diag_info(ch_flt);
2659 
2660         /*
2661          * If we have a CEEN error, we do not reenable CEEN until after
2662          * we exit the trap handler. Otherwise, another error may
2663          * occur, causing the handler to be entered recursively.
2664          * We set a timeout to trigger in cpu_ceen_delay_secs seconds,
2665          * to try and ensure that the CPU makes progress in the face
2666          * of a CE storm.
2667          */
2668         if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2669                 (void) timeout(cpu_delayed_check_ce_errors,
2670                     (void *)(uintptr_t)aflt->flt_inst,
2671                     drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
2672         }
2673 }
2674 
2675 /*
2676  * Invoked by error_init() early in startup and therefore before
2677  * startup_errorq() is called to drain any error queues:
2678  *
2679  * startup()
2680  *   startup_end()
2681  *     error_init()
2682  *       cpu_error_init()
2683  * errorq_init()
2684  *   errorq_drain()
2685  * start_other_cpus()
2686  *
2687  * The purpose of this routine is to create error-related taskqs.  Taskqs
2688  * are used for this purpose because cpu_lock can't be grabbed from interrupt
2689  * context.
2690  */
2691 void


6055 
6056         do {
6057                 outstanding = *countp;
6058                 for (i = 0; i < outstanding; i++) {
6059                         scrub_ecache(how_many);
6060                 }
6061         } while (atomic_add_32_nv(countp, -outstanding));
6062 
6063         return (DDI_INTR_CLAIMED);
6064 }
6065 
6066 /*
6067  * Timeout function to reenable CEEN
6068  */
6069 static void
6070 cpu_delayed_check_ce_errors(void *arg)
6071 {
6072         if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg,
6073             TQ_NOSLEEP)) {
6074                 (void) timeout(cpu_delayed_check_ce_errors, arg,
6075                     drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
6076         }
6077 }
6078 
6079 /*
6080  * CE Deferred Re-enable after trap.
6081  *
6082  * When the CPU gets a disrupting trap for any of the errors
6083  * controlled by the CEEN bit, CEEN is disabled in the trap handler
6084  * immediately. To eliminate the possibility of multiple CEs causing
6085  * recursive stack overflow in the trap handler, we cannot
6086  * reenable CEEN while still running in the trap handler. Instead,
6087  * after a CE is logged on a CPU, we schedule a timeout function,
6088  * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs
6089  * seconds. This function will check whether any further CEs
6090  * have occurred on that CPU, and if none have, will reenable CEEN.
6091  *
6092  * If further CEs have occurred while CEEN is disabled, another
6093  * timeout will be scheduled. This is to ensure that the CPU can
6094  * make progress in the face of CE 'storms', and that it does not
6095  * spend all its time logging CE errors.


6144                     TIMEOUT_CEEN_CHECK, 0);
6145                 mutex_exit(&cpu_lock);
6146         } else {
6147                 /*
6148                  * When the CPU is not accepting xcalls, or
6149                  * the processor is offlined, we don't want to
6150                  * incur the extra overhead of trying to schedule the
6151                  * CE timeout indefinitely. However, we don't want to lose
6152                  * CE checking forever.
6153                  *
6154                  * Keep rescheduling the timeout, accepting the additional
6155                  * overhead as the cost of correctness in the case where we get
6156                  * a CE, disable CEEN, offline the CPU during the
6157                  * timeout interval, and then online it at some
6158                  * point in the future. This is unlikely given the short
6159                  * cpu_ceen_delay_secs.
6160                  */
6161                 mutex_exit(&cpu_lock);
6162                 (void) timeout(cpu_delayed_check_ce_errors,
6163                     (void *)(uintptr_t)cp->cpu_id,
6164                     drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
6165         }
6166 }
6167 
6168 /*
6169  * This routine will check whether CEs have occurred while
6170  * CEEN is disabled. Any CEs detected will be logged and, if
6171  * possible, scrubbed.
6172  *
6173  * The memscrubber will also use this routine to clear any errors
6174  * caused by its scrubbing with CEEN disabled.
6175  *
6176  * flag == SCRUBBER_CEEN_CHECK
6177  *              called from memscrubber, just check/scrub, no reset
6178  *              paddr   physical addr. for start of scrub pages
6179  *              vaddr   virtual addr. for scrub area
6180  *              psz     page size of area to be scrubbed
6181  *
6182  * flag == TIMEOUT_CEEN_CHECK
6183  *              timeout function has triggered, reset timeout or CEEN
6184  *




2423                  *         on_trap protection)
2424                  *      4. The error is on a retired page
2425                  *
2426                  * Note: AFLT_PROT_EC is used in places other than the memory
2427                  * scrubber.  However, none of those errors should occur
2428                  * on a retired page.
2429                  */
2430                 if ((ch_flt->afsr_errs &
2431                     (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) == C_AFSR_CE &&
2432                     aflt->flt_prot == AFLT_PROT_EC) {
2433 
2434                         if (page_retire_check(aflt->flt_addr, NULL) == 0) {
2435                                 if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2436 
2437                                 /*
2438                                  * Since we're skipping logging, we'll need
2439                                  * to schedule the re-enabling of CEEN
2440                                  */
2441                                 (void) timeout(cpu_delayed_check_ce_errors,
2442                                     (void *)(uintptr_t)aflt->flt_inst,
2443                                     drv_sectohz((clock_t)cpu_ceen_delay_secs));
2444                                 }
2445 
2446                                 /*
2447                                  * Inform memscrubber - scrubbing induced
2448                                  * CE on a retired page.
2449                                  */
2450                                 memscrub_induced_error();
2451                                 return (0);
2452                         }
2453                 }
2454 
2455                 /*
2456                  * Perform/schedule further classification actions, but
2457                  * only if the page is healthy (we don't want bad
2458                  * pages inducing too much diagnostic activity).  If we could
2459                  * not find a page pointer then we also skip this.  If
2460                  * ce_scrub_xdiag_recirc returns nonzero then it has chosen
2461                  * to copy and recirculate the event (for further diagnostics)
2462                  * and we should not proceed to log it here.
2463                  *


2650                             ch_flt->flt_bit);
2651         }
2652 
2653         if (aflt->flt_func != NULL)
2654                 aflt->flt_func(aflt, unum);
2655 
2656         if (afar_status != AFLT_STAT_INVALID)
2657                 cpu_log_diag_info(ch_flt);
2658 
2659         /*
2660          * If we have a CEEN error, we do not reenable CEEN until after
2661          * we exit the trap handler. Otherwise, another error may
2662          * occur, causing the handler to be entered recursively.
2663          * We set a timeout to trigger in cpu_ceen_delay_secs seconds,
2664          * to try and ensure that the CPU makes progress in the face
2665          * of a CE storm.
2666          */
2667         if (ch_flt->flt_trapped_ce & CE_CEEN_DEFER) {
2668                 (void) timeout(cpu_delayed_check_ce_errors,
2669                     (void *)(uintptr_t)aflt->flt_inst,
2670                     drv_sectohz((clock_t)cpu_ceen_delay_secs));
2671         }
2672 }
2673 
2674 /*
2675  * Invoked by error_init() early in startup and therefore before
2676  * startup_errorq() is called to drain any error queues:
2677  *
2678  * startup()
2679  *   startup_end()
2680  *     error_init()
2681  *       cpu_error_init()
2682  * errorq_init()
2683  *   errorq_drain()
2684  * start_other_cpus()
2685  *
2686  * The purpose of this routine is to create error-related taskqs.  Taskqs
2687  * are used for this purpose because cpu_lock can't be grabbed from interrupt
2688  * context.
2689  */
2690 void
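
The remainder of this routine is elided from the listing.  A minimal sketch of
the kind of taskq creation the comment above describes, with the taskq name,
priority, and sizing given only as assumptions for illustration:

	/*
	 * Illustration only: create a small taskq from which CEEN can
	 * later be reenabled, because cpu_lock cannot be taken from
	 * interrupt context.  Name, priority, and sizing are assumed,
	 * not taken from the real function.
	 */
	ch_check_ce_tq = taskq_create("cheetah_check_ce", 1, minclsyspri,
	    1, 1, TASKQ_PREPOPULATE);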


6054 
6055         do {
6056                 outstanding = *countp;
6057                 for (i = 0; i < outstanding; i++) {
6058                         scrub_ecache(how_many);
6059                 }
6060         } while (atomic_add_32_nv(countp, -outstanding));
6061 
6062         return (DDI_INTR_CLAIMED);
6063 }
6064 
6065 /*
6066  * Timeout function to reenable CEEN
6067  */
6068 static void
6069 cpu_delayed_check_ce_errors(void *arg)
6070 {
6071         if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg,
6072             TQ_NOSLEEP)) {
6073                 (void) timeout(cpu_delayed_check_ce_errors, arg,
6074                     drv_sectohz((clock_t)cpu_ceen_delay_secs));
6075         }
6076 }
6077 
6078 /*
6079  * CE Deferred Re-enable after trap.
6080  *
6081  * When the CPU gets a disrupting trap for any of the errors
6082  * controlled by the CEEN bit, CEEN is disabled in the trap handler
6083  * immediately. To eliminate the possibility of multiple CEs causing
6084  * recursive stack overflow in the trap handler, we cannot
6085  * reenable CEEN while still running in the trap handler. Instead,
6086  * after a CE is logged on a CPU, we schedule a timeout function,
6087  * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs
6088  * seconds. This function will check whether any further CEs
6089  * have occurred on that CPU, and if none have, will reenable CEEN.
6090  *
6091  * If further CEs have occurred while CEEN is disabled, another
6092  * timeout will be scheduled. This is to ensure that the CPU can
6093  * make progress in the face of CE 'storms', and that it does not
6094  * spend all its time logging CE errors.


6143                     TIMEOUT_CEEN_CHECK, 0);
6144                 mutex_exit(&cpu_lock);
6145         } else {
6146                 /*
6147                  * When the CPU is not accepting xcalls, or
6148                  * the processor is offlined, we don't want to
6149                  * incur the extra overhead of trying to schedule the
6150                  * CE timeout indefinitely. However, we don't want to lose
6151                  * CE checking forever.
6152                  *
6153                  * Keep rescheduling the timeout, accepting the additional
6154                  * overhead as the cost of correctness in the case where we get
6155                  * a CE, disable CEEN, offline the CPU during the
6156                  * timeout interval, and then online it at some
6157                  * point in the future. This is unlikely given the short
6158                  * cpu_ceen_delay_secs.
6159                  */
6160                 mutex_exit(&cpu_lock);
6161                 (void) timeout(cpu_delayed_check_ce_errors,
6162                     (void *)(uintptr_t)cp->cpu_id,
6163                     drv_sectohz((clock_t)cpu_ceen_delay_secs));
6164         }
6165 }
6166 
6167 /*
6168  * This routine will check whether CEs have occurred while
6169  * CEEN is disabled. Any CEs detected will be logged and, if
6170  * possible, scrubbed.
6171  *
6172  * The memscrubber will also use this routine to clear any errors
6173  * caused by its scrubbing with CEEN disabled.
6174  *
6175  * flag == SCRUBBER_CEEN_CHECK
6176  *              called from memscrubber, just check/scrub, no reset
6177  *              paddr   physical addr. for start of scrub pages
6178  *              vaddr   virtual addr. for scrub area
6179  *              psz     page size of area to be scrubbed
6180  *
6181  * flag == TIMEOUT_CEEN_CHECK
6182  *              timeout function has triggered, reset timeout or CEEN
6183  *