881 /*
882 * Attempt to claim ownership, temporarily, of every cache line that a
883 * non-responsive cpu might be using. This might kick that cpu out of
884 * this state.
885 *
886 * The return value indicates to the caller if we have exhausted all recovery
887 * techniques. If 1 is returned, it is useless to call this function again
888 * even for a different target CPU.
889 */
890 int
891 mondo_recover(uint16_t cpuid, int bn)
892 {
893 struct memseg *seg;
894 uint64_t begin_pa, end_pa, cur_pa;
895 hrtime_t begin_hrt, end_hrt;
896 int retval = 0;
897 int pages_claimed = 0;
898 cheetah_livelock_entry_t *histp;
899 uint64_t idsr;
900 
/*
 * Serialize recovery: atomically flip sendmondo_in_recover 0 -> 1.
 * If another cpu already owns the flag, spin until its recovery
 * completes and return retval (still 0, i.e. "not exhausted") so
 * the caller may retry if its target cpu is still unresponsive.
 */
901 if (cas32(&sendmondo_in_recover, 0, 1) != 0) {
902 /*
903 * Wait while recovery takes place
904 */
905 while (sendmondo_in_recover) {
906 drv_usecwait(1);
907 }
908 /*
909 * Assume we didn't claim the whole memory. If
910 * the target of this caller is not recovered,
911 * it will come back.
912 */
913 return (retval);
914 }
915 
/* Record this recovery attempt in the livelock history ring. */
916 CHEETAH_LIVELOCK_ENTRY_NEXT(histp);
917 CHEETAH_LIVELOCK_ENTRY_SET(histp, lbolt, LBOLT_WAITFREE);
918 CHEETAH_LIVELOCK_ENTRY_SET(histp, cpuid, cpuid);
919 CHEETAH_LIVELOCK_ENTRY_SET(histp, buddy, CPU->cpu_id);
920 
921 begin_hrt = gethrtime_waitfree();
/*
 * NOTE(review): original file lines 922-966 are elided from this
 * excerpt.  They presumably contain the per-memseg cache-line claim
 * loop that uses seg/begin_pa/end_pa/cur_pa/idsr, increments
 * pages_claimed, and supplies the "goto done" targeted below —
 * confirm against the full source.
 */
967 pages_claimed++;
968 }
969 }
970 
971 /*
972 * We did all we could.
973 */
974 retval = 1;
975 
976 done:
977 /*
978 * Update statistics
979 */
980 end_hrt = gethrtime_waitfree();
981 CHEETAH_LIVELOCK_STAT(recovery);
982 CHEETAH_LIVELOCK_MAXSTAT(hrt, (end_hrt - begin_hrt));
983 CHEETAH_LIVELOCK_MAXSTAT(full_claimed, pages_claimed);
984 CHEETAH_LIVELOCK_ENTRY_SET(histp, recovery_time, \
985 (end_hrt - begin_hrt));
986 
/* Release the recovery flag; we own it, so the cas must succeed. */
987 while (cas32(&sendmondo_in_recover, 1, 0) != 1)
988 ;
989 
990 return (retval);
991 }
992
993 /*
994 * This is called by the cyclic framework when this CPU becomes online
995 */
996 /*ARGSUSED*/
997 static void
998 cheetah_nudge_onln(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
999 {
1000
1001 hdlr->cyh_func = (cyc_func_t)cheetah_nudge_buddy;
1002 hdlr->cyh_level = CY_LOW_LEVEL;
1003 hdlr->cyh_arg = NULL;
1004
1005 /*
1006 * Stagger the start time
1007 */
6283 (cpu_error_regs.afsr & cpu_ce_not_deferred))
6284 set_error_enable(ec_err_enable | EN_REG_CEEN);
6285
6286 }
6287
6288 /*
6289 * Attempt a cpu logout for an error that we did not trap for, such
6290 * as a CE noticed with CEEN off. It is assumed that we are still running
6291 * on the cpu that took the error and that we cannot migrate. Returns
6292 * 0 on success, otherwise nonzero.
6293 */
6294 static int
6295 cpu_ce_delayed_ec_logout(uint64_t afar)
6296 {
6297 ch_cpu_logout_t *clop;
6298
6299 if (CPU_PRIVATE(CPU) == NULL)
6300 return (0);
6301
6302 clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout);
6303 if (cas64(&clop->clo_data.chd_afar, LOGOUT_INVALID, afar) !=
6304 LOGOUT_INVALID)
6305 return (0);
6306
6307 cpu_delayed_logout(afar, clop);
6308 return (1);
6309 }
6310
6311 /*
6312 * We got an error while CEEN was disabled. We
6313 * need to clean up after it and log whatever
6314 * information we have on the CE.
6315 */
6316 void
6317 cpu_ce_detected(ch_cpu_errors_t *cpu_error_regs, int flag)
6318 {
6319 ch_async_flt_t ch_flt;
6320 struct async_flt *aflt;
6321 char pr_reason[MAX_REASON_STRING];
6322
6323 bzero(&ch_flt, sizeof (ch_async_flt_t));
|
881 /*
882 * Attempt to claim ownership, temporarily, of every cache line that a
883 * non-responsive cpu might be using. This might kick that cpu out of
884 * this state.
885 *
886 * The return value indicates to the caller if we have exhausted all recovery
887 * techniques. If 1 is returned, it is useless to call this function again
888 * even for a different target CPU.
889 */
890 int
891 mondo_recover(uint16_t cpuid, int bn)
892 {
893 struct memseg *seg;
894 uint64_t begin_pa, end_pa, cur_pa;
895 hrtime_t begin_hrt, end_hrt;
896 int retval = 0;
897 int pages_claimed = 0;
898 cheetah_livelock_entry_t *histp;
899 uint64_t idsr;
900 
/*
 * Serialize recovery: atomically flip sendmondo_in_recover 0 -> 1.
 * If another cpu already owns the flag, spin until its recovery
 * completes and return retval (still 0, i.e. "not exhausted") so
 * the caller may retry if its target cpu is still unresponsive.
 */
901 if (atomic_cas_32(&sendmondo_in_recover, 0, 1) != 0) {
902 /*
903 * Wait while recovery takes place
904 */
905 while (sendmondo_in_recover) {
906 drv_usecwait(1);
907 }
908 /*
909 * Assume we didn't claim the whole memory. If
910 * the target of this caller is not recovered,
911 * it will come back.
912 */
913 return (retval);
914 }
915 
/* Record this recovery attempt in the livelock history ring. */
916 CHEETAH_LIVELOCK_ENTRY_NEXT(histp);
917 CHEETAH_LIVELOCK_ENTRY_SET(histp, lbolt, LBOLT_WAITFREE);
918 CHEETAH_LIVELOCK_ENTRY_SET(histp, cpuid, cpuid);
919 CHEETAH_LIVELOCK_ENTRY_SET(histp, buddy, CPU->cpu_id);
920 
921 begin_hrt = gethrtime_waitfree();
/*
 * NOTE(review): original file lines 922-966 are elided from this
 * excerpt.  They presumably contain the per-memseg cache-line claim
 * loop that uses seg/begin_pa/end_pa/cur_pa/idsr, increments
 * pages_claimed, and supplies the "goto done" targeted below —
 * confirm against the full source.
 */
967 pages_claimed++;
968 }
969 }
970 
971 /*
972 * We did all we could.
973 */
974 retval = 1;
975 
976 done:
977 /*
978 * Update statistics
979 */
980 end_hrt = gethrtime_waitfree();
981 CHEETAH_LIVELOCK_STAT(recovery);
982 CHEETAH_LIVELOCK_MAXSTAT(hrt, (end_hrt - begin_hrt));
983 CHEETAH_LIVELOCK_MAXSTAT(full_claimed, pages_claimed);
984 CHEETAH_LIVELOCK_ENTRY_SET(histp, recovery_time, \
985 (end_hrt - begin_hrt));
986 
/* Release the recovery flag; we own it, so the cas must succeed. */
987 while (atomic_cas_32(&sendmondo_in_recover, 1, 0) != 1)
988 ;
989 
990 return (retval);
991 }
992
993 /*
994 * This is called by the cyclic framework when this CPU becomes online
995 */
996 /*ARGSUSED*/
997 static void
998 cheetah_nudge_onln(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
999 {
1000
1001 hdlr->cyh_func = (cyc_func_t)cheetah_nudge_buddy;
1002 hdlr->cyh_level = CY_LOW_LEVEL;
1003 hdlr->cyh_arg = NULL;
1004
1005 /*
1006 * Stagger the start time
1007 */
6283 (cpu_error_regs.afsr & cpu_ce_not_deferred))
6284 set_error_enable(ec_err_enable | EN_REG_CEEN);
6285
6286 }
6287
6288 /*
6289 * Attempt a cpu logout for an error that we did not trap for, such
6290 * as a CE noticed with CEEN off. It is assumed that we are still running
6291 * on the cpu that took the error and that we cannot migrate. Returns
6292 * 0 on success, otherwise nonzero.
6293 */
6294 static int
6295 cpu_ce_delayed_ec_logout(uint64_t afar)
6296 {
6297 ch_cpu_logout_t *clop;
6298
6299 if (CPU_PRIVATE(CPU) == NULL)
6300 return (0);
6301
6302 clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout);
6303 if (atomic_cas_64(&clop->clo_data.chd_afar, LOGOUT_INVALID, afar) !=
6304 LOGOUT_INVALID)
6305 return (0);
6306
6307 cpu_delayed_logout(afar, clop);
6308 return (1);
6309 }
6310
6311 /*
6312 * We got an error while CEEN was disabled. We
6313 * need to clean up after it and log whatever
6314 * information we have on the CE.
6315 */
6316 void
6317 cpu_ce_detected(ch_cpu_errors_t *cpu_error_regs, int flag)
6318 {
6319 ch_async_flt_t ch_flt;
6320 struct async_flt *aflt;
6321 char pr_reason[MAX_REASON_STRING];
6322
6323 bzero(&ch_flt, sizeof (ch_async_flt_t));
|