summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin K. Petersen <martin.petersen@oracle.com>2025-04-28 21:48:37 -0400
committerMartin K. Petersen <martin.petersen@oracle.com>2025-04-28 21:48:37 -0400
commit6b08fe7763de9e388e6b224af2e39fa5083c44d8 (patch)
treead164e3d8991d1336927e51f82d776b9b23b41d3
parenta0d1cf505d3f3a98388c2d05fe03b126dfda3513 (diff)
parent268975a87c7b6f6b0ceb62df236c1e1b08b89379 (diff)
Merge patch series "target: Remove atomics from main IO path"
Mike Christie <michael.christie@oracle.com> says: The following patches made over Linus's tree remove the atomic use from the main IO path. There was a handful of atomic_longs used just used for stats and a couple atomics used for handling ordered commands. These patches move the stats to per cpu, and moves the ordered tracking to a per cpu counter. With the patches 8K IOPS increases by up to 33% when running fio with numjobs >= 4 and using the vhost-scsi target with virtio-scsi and virtio num_queues >= 4 (jobs and queues match, and virtqueue_size and cmd_per_lun are increased to match the total iodepth of all jobs). Link: https://lore.kernel.org/r/20250424032741.16216-1-michael.christie@oracle.com Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
-rw-r--r--drivers/target/target_core_device.c89
-rw-r--r--drivers/target/target_core_stat.c69
-rw-r--r--drivers/target/target_core_transport.c119
-rw-r--r--include/target/target_core_base.h24
4 files changed, 204 insertions, 97 deletions
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index cc2da086f96e..7bb711b24c0d 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -55,14 +55,14 @@ transport_lookup_cmd_lun(struct se_cmd *se_cmd)
rcu_read_lock();
deve = target_nacl_find_deve(nacl, se_cmd->orig_fe_lun);
if (deve) {
- atomic_long_inc(&deve->total_cmds);
+ this_cpu_inc(deve->stats->total_cmds);
if (se_cmd->data_direction == DMA_TO_DEVICE)
- atomic_long_add(se_cmd->data_length,
- &deve->write_bytes);
+ this_cpu_add(deve->stats->write_bytes,
+ se_cmd->data_length);
else if (se_cmd->data_direction == DMA_FROM_DEVICE)
- atomic_long_add(se_cmd->data_length,
- &deve->read_bytes);
+ this_cpu_add(deve->stats->read_bytes,
+ se_cmd->data_length);
if ((se_cmd->data_direction == DMA_TO_DEVICE) &&
deve->lun_access_ro) {
@@ -126,14 +126,14 @@ out_unlock:
* target_core_fabric_configfs.c:target_fabric_port_release
*/
se_cmd->se_dev = rcu_dereference_raw(se_lun->lun_se_dev);
- atomic_long_inc(&se_cmd->se_dev->num_cmds);
+ this_cpu_inc(se_cmd->se_dev->stats->total_cmds);
if (se_cmd->data_direction == DMA_TO_DEVICE)
- atomic_long_add(se_cmd->data_length,
- &se_cmd->se_dev->write_bytes);
+ this_cpu_add(se_cmd->se_dev->stats->write_bytes,
+ se_cmd->data_length);
else if (se_cmd->data_direction == DMA_FROM_DEVICE)
- atomic_long_add(se_cmd->data_length,
- &se_cmd->se_dev->read_bytes);
+ this_cpu_add(se_cmd->se_dev->stats->read_bytes,
+ se_cmd->data_length);
return ret;
}
@@ -322,6 +322,7 @@ int core_enable_device_list_for_node(
struct se_portal_group *tpg)
{
struct se_dev_entry *orig, *new;
+ int ret = 0;
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new) {
@@ -329,6 +330,12 @@ int core_enable_device_list_for_node(
return -ENOMEM;
}
+ new->stats = alloc_percpu(struct se_dev_entry_io_stats);
+ if (!new->stats) {
+ ret = -ENOMEM;
+ goto free_deve;
+ }
+
spin_lock_init(&new->ua_lock);
INIT_LIST_HEAD(&new->ua_list);
INIT_LIST_HEAD(&new->lun_link);
@@ -351,8 +358,8 @@ int core_enable_device_list_for_node(
" for dynamic -> explicit NodeACL conversion:"
" %s\n", nacl->initiatorname);
mutex_unlock(&nacl->lun_entry_mutex);
- kfree(new);
- return -EINVAL;
+ ret = -EINVAL;
+ goto free_stats;
}
if (orig->se_lun_acl != NULL) {
pr_warn_ratelimited("Detected existing explicit"
@@ -360,8 +367,8 @@ int core_enable_device_list_for_node(
" mapped_lun: %llu, failing\n",
nacl->initiatorname, mapped_lun);
mutex_unlock(&nacl->lun_entry_mutex);
- kfree(new);
- return -EINVAL;
+ ret = -EINVAL;
+ goto free_stats;
}
new->se_lun = lun;
@@ -394,6 +401,20 @@ int core_enable_device_list_for_node(
target_luns_data_has_changed(nacl, new, true);
return 0;
+
+free_stats:
+ free_percpu(new->stats);
+free_deve:
+ kfree(new);
+ return ret;
+}
+
+static void target_free_dev_entry(struct rcu_head *head)
+{
+ struct se_dev_entry *deve = container_of(head, struct se_dev_entry,
+ rcu_head);
+ free_percpu(deve->stats);
+ kfree(deve);
}
void core_disable_device_list_for_node(
@@ -443,7 +464,7 @@ void core_disable_device_list_for_node(
kref_put(&orig->pr_kref, target_pr_kref_release);
wait_for_completion(&orig->pr_comp);
- kfree_rcu(orig, rcu_head);
+ call_rcu(&orig->rcu_head, target_free_dev_entry);
core_scsi3_free_pr_reg_from_nacl(dev, nacl);
target_luns_data_has_changed(nacl, NULL, false);
@@ -679,6 +700,18 @@ static void scsi_dump_inquiry(struct se_device *dev)
pr_debug(" Type: %s ", scsi_device_type(device_type));
}
+static void target_non_ordered_release(struct percpu_ref *ref)
+{
+ struct se_device *dev = container_of(ref, struct se_device,
+ non_ordered);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+ if (!list_empty(&dev->delayed_cmd_list))
+ schedule_work(&dev->delayed_cmd_work);
+ spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
{
struct se_device *dev;
@@ -689,11 +722,13 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
if (!dev)
return NULL;
+ dev->stats = alloc_percpu(struct se_dev_io_stats);
+ if (!dev->stats)
+ goto free_device;
+
dev->queues = kcalloc(nr_cpu_ids, sizeof(*dev->queues), GFP_KERNEL);
- if (!dev->queues) {
- hba->backend->ops->free_device(dev);
- return NULL;
- }
+ if (!dev->queues)
+ goto free_stats;
dev->queue_cnt = nr_cpu_ids;
for (i = 0; i < dev->queue_cnt; i++) {
@@ -707,6 +742,10 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
INIT_WORK(&q->sq.work, target_queued_submit_work);
}
+ if (percpu_ref_init(&dev->non_ordered, target_non_ordered_release,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto free_queues;
+
dev->se_hba = hba;
dev->transport = hba->backend->ops;
dev->transport_flags = dev->transport->transport_flags_default;
@@ -791,6 +830,14 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
sizeof(dev->t10_wwn.revision));
return dev;
+
+free_queues:
+ kfree(dev->queues);
+free_stats:
+ free_percpu(dev->stats);
+free_device:
+ hba->backend->ops->free_device(dev);
+ return NULL;
}
/*
@@ -980,6 +1027,9 @@ void target_free_device(struct se_device *dev)
WARN_ON(!list_empty(&dev->dev_sep_list));
+ percpu_ref_exit(&dev->non_ordered);
+ cancel_work_sync(&dev->delayed_cmd_work);
+
if (target_dev_configured(dev)) {
dev->transport->destroy_device(dev);
@@ -1001,6 +1051,7 @@ void target_free_device(struct se_device *dev)
dev->transport->free_prot(dev);
kfree(dev->queues);
+ free_percpu(dev->stats);
dev->transport->free_device(dev);
}
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 210648a0092e..6bdf2d8bd694 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -280,30 +280,51 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
char *page)
{
struct se_device *dev = to_stat_lu_dev(item);
+ struct se_dev_io_stats *stats;
+ unsigned int cpu;
+ u32 cmds = 0;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(dev->stats, cpu);
+ cmds += stats->total_cmds;
+ }
/* scsiLuNumCommands */
- return snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&dev->num_cmds));
+ return snprintf(page, PAGE_SIZE, "%u\n", cmds);
}
static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
char *page)
{
struct se_device *dev = to_stat_lu_dev(item);
+ struct se_dev_io_stats *stats;
+ unsigned int cpu;
+ u32 bytes = 0;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(dev->stats, cpu);
+ bytes += stats->read_bytes;
+ }
/* scsiLuReadMegaBytes */
- return snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&dev->read_bytes) >> 20);
+ return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
}
static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
char *page)
{
struct se_device *dev = to_stat_lu_dev(item);
+ struct se_dev_io_stats *stats;
+ unsigned int cpu;
+ u32 bytes = 0;
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(dev->stats, cpu);
+ bytes += stats->write_bytes;
+ }
/* scsiLuWrittenMegaBytes */
- return snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&dev->write_bytes) >> 20);
+ return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
}
static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
@@ -1019,8 +1040,11 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
{
struct se_lun_acl *lacl = auth_to_lacl(item);
struct se_node_acl *nacl = lacl->se_lun_nacl;
+ struct se_dev_entry_io_stats *stats;
struct se_dev_entry *deve;
+ unsigned int cpu;
ssize_t ret;
+ u32 cmds = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1028,9 +1052,14 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
rcu_read_unlock();
return -ENODEV;
}
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(deve->stats, cpu);
+ cmds += stats->total_cmds;
+ }
+
/* scsiAuthIntrOutCommands */
- ret = snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&deve->total_cmds));
+ ret = snprintf(page, PAGE_SIZE, "%u\n", cmds);
rcu_read_unlock();
return ret;
}
@@ -1040,8 +1069,11 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
{
struct se_lun_acl *lacl = auth_to_lacl(item);
struct se_node_acl *nacl = lacl->se_lun_nacl;
+ struct se_dev_entry_io_stats *stats;
struct se_dev_entry *deve;
+ unsigned int cpu;
ssize_t ret;
+ u32 bytes = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1049,9 +1081,14 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
rcu_read_unlock();
return -ENODEV;
}
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(deve->stats, cpu);
+ bytes += stats->read_bytes;
+ }
+
/* scsiAuthIntrReadMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&deve->read_bytes) >> 20));
+ ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
rcu_read_unlock();
return ret;
}
@@ -1061,8 +1098,11 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
{
struct se_lun_acl *lacl = auth_to_lacl(item);
struct se_node_acl *nacl = lacl->se_lun_nacl;
+ struct se_dev_entry_io_stats *stats;
struct se_dev_entry *deve;
+ unsigned int cpu;
ssize_t ret;
+ u32 bytes = 0;
rcu_read_lock();
deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1070,9 +1110,14 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
rcu_read_unlock();
return -ENODEV;
}
+
+ for_each_possible_cpu(cpu) {
+ stats = per_cpu_ptr(deve->stats, cpu);
+ bytes += stats->write_bytes;
+ }
+
/* scsiAuthIntrWrittenMegaBytes */
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&deve->write_bytes) >> 20));
+ ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
rcu_read_unlock();
return ret;
}
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 05d29201b730..0a76bdfe5528 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2213,6 +2213,7 @@ static int target_write_prot_action(struct se_cmd *cmd)
static bool target_handle_task_attr(struct se_cmd *cmd)
{
struct se_device *dev = cmd->se_dev;
+ unsigned long flags;
if (dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
return false;
@@ -2225,13 +2226,10 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
*/
switch (cmd->sam_task_attr) {
case TCM_HEAD_TAG:
- atomic_inc_mb(&dev->non_ordered);
pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
cmd->t_task_cdb[0]);
return false;
case TCM_ORDERED_TAG:
- atomic_inc_mb(&dev->delayed_cmd_count);
-
pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
cmd->t_task_cdb[0]);
break;
@@ -2239,29 +2237,29 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
/*
* For SIMPLE and UNTAGGED Task Attribute commands
*/
- atomic_inc_mb(&dev->non_ordered);
-
- if (atomic_read(&dev->delayed_cmd_count) == 0)
+retry:
+ if (percpu_ref_tryget_live(&dev->non_ordered))
return false;
+
break;
}
- if (cmd->sam_task_attr != TCM_ORDERED_TAG) {
- atomic_inc_mb(&dev->delayed_cmd_count);
- /*
- * We will account for this when we dequeue from the delayed
- * list.
- */
- atomic_dec_mb(&dev->non_ordered);
+ spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+ if (cmd->sam_task_attr == TCM_SIMPLE_TAG &&
+ !percpu_ref_is_dying(&dev->non_ordered)) {
+ spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+ /* We raced with the last ordered completion so retry. */
+ goto retry;
+ } else if (!percpu_ref_is_dying(&dev->non_ordered)) {
+ percpu_ref_kill(&dev->non_ordered);
}
- spin_lock_irq(&cmd->t_state_lock);
+ spin_lock(&cmd->t_state_lock);
cmd->transport_state &= ~CMD_T_SENT;
- spin_unlock_irq(&cmd->t_state_lock);
+ spin_unlock(&cmd->t_state_lock);
- spin_lock(&dev->delayed_cmd_lock);
list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
- spin_unlock(&dev->delayed_cmd_lock);
+ spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD listn",
cmd->t_task_cdb[0], cmd->sam_task_attr);
@@ -2313,41 +2311,52 @@ void target_do_delayed_work(struct work_struct *work)
while (!dev->ordered_sync_in_progress) {
struct se_cmd *cmd;
- if (list_empty(&dev->delayed_cmd_list))
+ /*
+ * We can be woken up early/late due to races or the
+ * extra wake up we do when adding commands to the list.
+ * We check for both cases here.
+ */
+ if (list_empty(&dev->delayed_cmd_list) ||
+ !percpu_ref_is_zero(&dev->non_ordered))
break;
cmd = list_entry(dev->delayed_cmd_list.next,
struct se_cmd, se_delayed_node);
+ cmd->se_cmd_flags |= SCF_TASK_ORDERED_SYNC;
+ cmd->transport_state |= CMD_T_SENT;
- if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
- /*
- * Check if we started with:
- * [ordered] [simple] [ordered]
- * and we are now at the last ordered so we have to wait
- * for the simple cmd.
- */
- if (atomic_read(&dev->non_ordered) > 0)
- break;
-
- dev->ordered_sync_in_progress = true;
- }
+ dev->ordered_sync_in_progress = true;
list_del(&cmd->se_delayed_node);
- atomic_dec_mb(&dev->delayed_cmd_count);
spin_unlock(&dev->delayed_cmd_lock);
- if (cmd->sam_task_attr != TCM_ORDERED_TAG)
- atomic_inc_mb(&dev->non_ordered);
-
- cmd->transport_state |= CMD_T_SENT;
-
__target_execute_cmd(cmd, true);
-
spin_lock(&dev->delayed_cmd_lock);
}
spin_unlock(&dev->delayed_cmd_lock);
}
+static void transport_complete_ordered_sync(struct se_cmd *cmd)
+{
+ struct se_device *dev = cmd->se_dev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+ dev->dev_cur_ordered_id++;
+
+ pr_debug("Incremented dev_cur_ordered_id: %u for type %d\n",
+ dev->dev_cur_ordered_id, cmd->sam_task_attr);
+
+ dev->ordered_sync_in_progress = false;
+
+ if (list_empty(&dev->delayed_cmd_list))
+ percpu_ref_resurrect(&dev->non_ordered);
+ else
+ schedule_work(&dev->delayed_cmd_work);
+
+ spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
/*
* Called from I/O completion to determine which dormant/delayed
* and ordered cmds need to have their tasks added to the execution queue.
@@ -2360,30 +2369,24 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
return;
if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
- goto restart;
-
- if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
- atomic_dec_mb(&dev->non_ordered);
- dev->dev_cur_ordered_id++;
- } else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
- atomic_dec_mb(&dev->non_ordered);
- dev->dev_cur_ordered_id++;
- pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
- dev->dev_cur_ordered_id);
- } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
- spin_lock(&dev->delayed_cmd_lock);
- dev->ordered_sync_in_progress = false;
- spin_unlock(&dev->delayed_cmd_lock);
+ return;
- dev->dev_cur_ordered_id++;
- pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
- dev->dev_cur_ordered_id);
- }
cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET;
-restart:
- if (atomic_read(&dev->delayed_cmd_count) > 0)
- schedule_work(&dev->delayed_cmd_work);
+ if (cmd->se_cmd_flags & SCF_TASK_ORDERED_SYNC) {
+ transport_complete_ordered_sync(cmd);
+ return;
+ }
+
+ switch (cmd->sam_task_attr) {
+ case TCM_SIMPLE_TAG:
+ percpu_ref_put(&dev->non_ordered);
+ break;
+ case TCM_ORDERED_TAG:
+ /* All ordered should have been executed as sync */
+ WARN_ON(1);
+ break;
+ }
}
static void transport_complete_qf(struct se_cmd *cmd)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 97099a5e3f6c..a52d4967c0d3 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -157,6 +157,7 @@ enum se_cmd_flags_table {
SCF_USE_CPUID = (1 << 16),
SCF_TASK_ATTR_SET = (1 << 17),
SCF_TREAT_READ_AS_NORMAL = (1 << 18),
+ SCF_TASK_ORDERED_SYNC = (1 << 19),
};
/*
@@ -669,15 +670,19 @@ struct se_lun_acl {
struct se_ml_stat_grps ml_stat_grps;
};
+struct se_dev_entry_io_stats {
+ u32 total_cmds;
+ u32 read_bytes;
+ u32 write_bytes;
+};
+
struct se_dev_entry {
u64 mapped_lun;
u64 pr_res_key;
u64 creation_time;
bool lun_access_ro;
u32 attach_count;
- atomic_long_t total_cmds;
- atomic_long_t read_bytes;
- atomic_long_t write_bytes;
+ struct se_dev_entry_io_stats __percpu *stats;
/* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
struct kref pr_kref;
struct completion pr_comp;
@@ -800,6 +805,12 @@ struct se_device_queue {
struct se_cmd_queue sq;
};
+struct se_dev_io_stats {
+ u32 total_cmds;
+ u32 read_bytes;
+ u32 write_bytes;
+};
+
struct se_device {
/* Used for SAM Task Attribute ordering */
u32 dev_cur_ordered_id;
@@ -821,13 +832,10 @@ struct se_device {
atomic_long_t num_resets;
atomic_long_t aborts_complete;
atomic_long_t aborts_no_task;
- atomic_long_t num_cmds;
- atomic_long_t read_bytes;
- atomic_long_t write_bytes;
+ struct se_dev_io_stats __percpu *stats;
/* Active commands on this virtual SE device */
- atomic_t non_ordered;
+ struct percpu_ref non_ordered;
bool ordered_sync_in_progress;
- atomic_t delayed_cmd_count;
atomic_t dev_qf_count;
u32 export_count;
spinlock_t delayed_cmd_lock;