diff options
-rw-r--r-- | drivers/pci/pci.h | 4 | ||||
-rw-r--r-- | drivers/pci/pcie/aer.c | 66 |
2 files changed, 64 insertions, 6 deletions
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index e1a28215967f..3023c68fe485 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -587,13 +587,15 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; + int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; const char *level; /* printk level */ unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ - unsigned int __pad1:5; + unsigned int root_ratelimit_print:1; /* 0=skip, 1=print */ + unsigned int __pad1:4; unsigned int multi_error_valid:1; unsigned int first_error:5; diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 5d62b82b5bc1..7886a9ad126c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -28,6 +28,7 @@ #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/kfifo.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <acpi/apei.h> #include <acpi/ghes.h> @@ -88,6 +89,10 @@ struct aer_info { u64 rootport_total_cor_errs; u64 rootport_total_fatal_errs; u64 rootport_total_nonfatal_errs; + + /* Ratelimits for errors */ + struct ratelimit_state correctable_ratelimit; + struct ratelimit_state nonfatal_ratelimit; }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ @@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev) dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL); + ratelimit_state_init(&dev->aer_info->correctable_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + ratelimit_state_init(&dev->aer_info->nonfatal_ratelimit, + DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + /* * We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER, * PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event @@ -669,6 +679,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, } } +static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) +{ + switch (severity) { + case AER_NONFATAL: + return __ratelimit(&dev->aer_info->nonfatal_ratelimit); + case AER_CORRECTABLE: + return __ratelimit(&dev->aer_info->correctable_ratelimit); + default: + return 1; /* Don't ratelimit fatal errors */ + } +} + static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; @@ -721,6 +743,9 @@ void aer_print_error(struct aer_err_info *info, int i) trace_aer_event(pci_name(dev), (info->status & ~info->mask), info->severity, info->tlp_header_valid, &info->tlp); + if (!info->ratelimit_print[i]) + return; + if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", aer_error_severity_string[info->severity]); @@ -790,6 +815,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, trace_aer_event(pci_name(dev), (status & ~mask), aer_severity, tlp_header_valid, &aer->header_log); + if (!aer_ratelimit(dev, info.severity)) + return; + layer = AER_GET_LAYER_ERROR(aer_severity, status); agent = AER_GET_AGENT(aer_severity, status); @@ -824,6 +852,18 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) e_info->dev[i] = pci_dev_get(dev); e_info->error_dev_num++; + /* + * Ratelimit AER log messages. "dev" is either the source + * identified by the root's Error Source ID or it has an unmasked + * error logged in its own AER Capability. Messages are emitted + * when "ratelimit_print[i]" is non-zero. If we will print detail + * for a downstream device, make sure we print the Error Source ID + * from the root as well. + */ + if (aer_ratelimit(dev, e_info->severity)) { + e_info->ratelimit_print[i] = 1; + e_info->root_ratelimit_print = 1; + } return 0; } @@ -918,7 +958,7 @@ static int find_device_iter(struct pci_dev *dev, void *data) * e_info->error_dev_num and e_info->dev[], based on the given information. */ static bool find_source_device(struct pci_dev *parent, - struct aer_err_info *e_info) + struct aer_err_info *e_info) { struct pci_dev *dev = parent; int result; @@ -1144,9 +1184,10 @@ static void aer_recover_work_func(struct work_struct *work) pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus, entry.devfn); if (!pdev) { - pr_err("no pci_dev for %04x:%02x:%02x.%x\n", - entry.domain, entry.bus, - PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); + pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n", + entry.domain, entry.bus, + PCI_SLOT(entry.devfn), + PCI_FUNC(entry.devfn)); continue; } pci_print_aer(pdev, entry.severity, entry.regs); @@ -1294,7 +1335,22 @@ static void aer_isr_one_error_type(struct pci_dev *root, bool found; found = find_source_device(root, info); - aer_print_source(root, info, found); + + /* + * If we're going to log error messages, we've already set + * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to + * non-zero (which enables printing) because this is either an + * ERR_FATAL or we found a device with an error logged in its AER + * Capability. + * + * If we didn't find the Error Source device, at least log the + * Requester ID from the ERR_* Message received by the Root Port or + * RCEC, ratelimited by the RP or RCEC. + */ + if (info->root_ratelimit_print || + (!found && aer_ratelimit(root, info->severity))) + aer_print_source(root, info, found); + if (found) aer_process_err_devices(info); } |