summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/pci/pci.h4
-rw-r--r--drivers/pci/pcie/aer.c66
2 files changed, 64 insertions, 6 deletions
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index e1a28215967f..3023c68fe485 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -587,13 +587,15 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
struct aer_err_info {
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+ int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
const char *level; /* printk level */
unsigned int id:16;
unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
- unsigned int __pad1:5;
+ unsigned int root_ratelimit_print:1; /* 0=skip, 1=print */
+ unsigned int __pad1:4;
unsigned int multi_error_valid:1;
unsigned int first_error:5;
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 5d62b82b5bc1..7886a9ad126c 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -28,6 +28,7 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/kfifo.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
@@ -88,6 +89,10 @@ struct aer_info {
u64 rootport_total_cor_errs;
u64 rootport_total_fatal_errs;
u64 rootport_total_nonfatal_errs;
+
+ /* Ratelimits for errors */
+ struct ratelimit_state correctable_ratelimit;
+ struct ratelimit_state nonfatal_ratelimit;
};
#define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \
@@ -379,6 +384,11 @@ void pci_aer_init(struct pci_dev *dev)
dev->aer_info = kzalloc(sizeof(*dev->aer_info), GFP_KERNEL);
+ ratelimit_state_init(&dev->aer_info->correctable_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+ ratelimit_state_init(&dev->aer_info->nonfatal_ratelimit,
+ DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
* PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
@@ -669,6 +679,18 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
}
}
+static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
+{
+ switch (severity) {
+ case AER_NONFATAL:
+ return __ratelimit(&dev->aer_info->nonfatal_ratelimit);
+ case AER_CORRECTABLE:
+ return __ratelimit(&dev->aer_info->correctable_ratelimit);
+ default:
+ return 1; /* Don't ratelimit fatal errors */
+ }
+}
+
static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
{
const char **strings;
@@ -721,6 +743,9 @@ void aer_print_error(struct aer_err_info *info, int i)
trace_aer_event(pci_name(dev), (info->status & ~info->mask),
info->severity, info->tlp_header_valid, &info->tlp);
+ if (!info->ratelimit_print[i])
+ return;
+
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
aer_error_severity_string[info->severity]);
@@ -790,6 +815,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
trace_aer_event(pci_name(dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
+ if (!aer_ratelimit(dev, info.severity))
+ return;
+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);
@@ -824,6 +852,18 @@ static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
e_info->dev[i] = pci_dev_get(dev);
e_info->error_dev_num++;
+ /*
+ * Ratelimit AER log messages. "dev" is either the source
+ * identified by the root's Error Source ID or it has an unmasked
+ * error logged in its own AER Capability. Messages are emitted
+ * when "ratelimit_print[i]" is non-zero. If we will print detail
+ * for a downstream device, make sure we print the Error Source ID
+ * from the root as well.
+ */
+ if (aer_ratelimit(dev, e_info->severity)) {
+ e_info->ratelimit_print[i] = 1;
+ e_info->root_ratelimit_print = 1;
+ }
return 0;
}
@@ -918,7 +958,7 @@ static int find_device_iter(struct pci_dev *dev, void *data)
* e_info->error_dev_num and e_info->dev[], based on the given information.
*/
static bool find_source_device(struct pci_dev *parent,
- struct aer_err_info *e_info)
+ struct aer_err_info *e_info)
{
struct pci_dev *dev = parent;
int result;
@@ -1144,9 +1184,10 @@ static void aer_recover_work_func(struct work_struct *work)
pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
entry.devfn);
if (!pdev) {
- pr_err("no pci_dev for %04x:%02x:%02x.%x\n",
- entry.domain, entry.bus,
- PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
+ pr_err_ratelimited("%04x:%02x:%02x.%x: no pci_dev found\n",
+ entry.domain, entry.bus,
+ PCI_SLOT(entry.devfn),
+ PCI_FUNC(entry.devfn));
continue;
}
pci_print_aer(pdev, entry.severity, entry.regs);
@@ -1294,7 +1335,22 @@ static void aer_isr_one_error_type(struct pci_dev *root,
bool found;
found = find_source_device(root, info);
- aer_print_source(root, info, found);
+
+ /*
+ * If we're going to log error messages, we've already set
+ * "info->root_ratelimit_print" and "info->ratelimit_print[i]" to
+ * non-zero (which enables printing) because this is either an
+ * ERR_FATAL or we found a device with an error logged in its AER
+ * Capability.
+ *
+ * If we didn't find the Error Source device, at least log the
+ * Requester ID from the ERR_* Message received by the Root Port or
+ * RCEC, ratelimited by the RP or RCEC.
+ */
+ if (info->root_ratelimit_print ||
+ (!found && aer_ratelimit(root, info->severity)))
+ aer_print_source(root, info, found);
+
if (found)
aer_process_err_devices(info);
}