author     Martin K. Petersen <martin.petersen@oracle.com>    2025-04-28 21:48:37 -0400
committer  Martin K. Petersen <martin.petersen@oracle.com>    2025-04-28 21:48:37 -0400
commit     6b08fe7763de9e388e6b224af2e39fa5083c44d8 (patch)
tree       ad164e3d8991d1336927e51f82d776b9b23b41d3
parent     a0d1cf505d3f3a98388c2d05fe03b126dfda3513 (diff)
parent     268975a87c7b6f6b0ceb62df236c1e1b08b89379 (diff)
Merge patch series "target: Remove atomics from main IO path"
Mike Christie <michael.christie@oracle.com> says:
The following patches, made over Linus's tree, remove atomic use from
the main IO path. There were a handful of atomic_longs used just for
stats and a couple of atomics used for handling ordered commands. These
patches move the stats to per-CPU counters and the ordered tracking to
a per-CPU counter.
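As a rough sketch of the stats half of that change (the struct and function
names below are illustrative, not taken from the patches): the submission
path only increments counters local to the current CPU, and the configfs
stat readers sum over all possible CPUs.

#include <linux/percpu.h>
#include <linux/types.h>

/* Illustrative per-CPU I/O stats, allocated with alloc_percpu() at setup. */
struct example_io_stats {
	u32 total_cmds;
	u32 write_bytes;
};

static struct example_io_stats __percpu *example_stats;

/*
 * Hot path: only the local CPU's counters are touched, so no shared
 * cache line bounces between submission CPUs.
 */
static void example_account_write(u32 len)
{
	this_cpu_inc(example_stats->total_cmds);
	this_cpu_add(example_stats->write_bytes, len);
}

/* Slow path (stat file read): walk every possible CPU and sum. */
static u64 example_total_cmds(void)
{
	unsigned int cpu;
	u64 sum = 0;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(example_stats, cpu)->total_cmds;
	return sum;
}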
With the patches, 8K IOPS increases by up to 33% when running fio
with numjobs >= 4 against the vhost-scsi target with virtio-scsi
and virtio num_queues >= 4 (jobs and queues match, and virtqueue_size
and cmd_per_lun are increased to match the total iodepth of all
jobs).
Link: https://lore.kernel.org/r/20250424032741.16216-1-michael.christie@oracle.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
-rw-r--r--   drivers/target/target_core_device.c      89
-rw-r--r--   drivers/target/target_core_stat.c         69
-rw-r--r--   drivers/target/target_core_transport.c   119
-rw-r--r--   include/target/target_core_base.h         24
4 files changed, 204 insertions, 97 deletions
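The ordered-command tracking in the target_core_transport.c hunks below
follows the usual percpu_ref pattern. A minimal sketch under illustrative
names (not the actual patch code): SIMPLE commands take a live reference on
the fast path, an ORDERED command kills the reference and is delayed until
the release callback reports that all in-flight SIMPLE commands have
drained, and the reference is resurrected once the ordered command finishes.

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

/* Illustrative globals; in the patches these live in struct se_device. */
static struct percpu_ref example_non_ordered;
static struct work_struct example_delayed_work;	/* runs queued ordered cmds */

/* Called once the ref has been killed and every per-CPU reference dropped. */
static void example_non_ordered_release(struct percpu_ref *ref)
{
	schedule_work(&example_delayed_work);
}

/*
 * Setup, e.g. at device allocation:
 *	percpu_ref_init(&example_non_ordered, example_non_ordered_release,
 *			PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
 */

/* Fast path for SIMPLE commands: a per-CPU get, no shared atomic. */
static bool example_start_simple(void)
{
	return percpu_ref_tryget_live(&example_non_ordered);
}

/* SIMPLE command completion pairs with the tryget above. */
static void example_complete_simple(void)
{
	percpu_ref_put(&example_non_ordered);
}

/* An ORDERED command arrived: stop new SIMPLE commands and start draining. */
static void example_queue_ordered(void)
{
	if (!percpu_ref_is_dying(&example_non_ordered))
		percpu_ref_kill(&example_non_ordered);
}

/* The ORDERED command finished: reopen the fast path. */
static void example_ordered_done(void)
{
	percpu_ref_resurrect(&example_non_ordered);
}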
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index cc2da086f96e..7bb711b24c0d 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -55,14 +55,14 @@ transport_lookup_cmd_lun(struct se_cmd *se_cmd)
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, se_cmd->orig_fe_lun);
 	if (deve) {
-		atomic_long_inc(&deve->total_cmds);
+		this_cpu_inc(deve->stats->total_cmds);
 
 		if (se_cmd->data_direction == DMA_TO_DEVICE)
-			atomic_long_add(se_cmd->data_length,
-					&deve->write_bytes);
+			this_cpu_add(deve->stats->write_bytes,
+				     se_cmd->data_length);
 		else if (se_cmd->data_direction == DMA_FROM_DEVICE)
-			atomic_long_add(se_cmd->data_length,
-					&deve->read_bytes);
+			this_cpu_add(deve->stats->read_bytes,
+				     se_cmd->data_length);
 
 		if ((se_cmd->data_direction == DMA_TO_DEVICE) &&
 		    deve->lun_access_ro) {
@@ -126,14 +126,14 @@ out_unlock:
 	 * target_core_fabric_configfs.c:target_fabric_port_release
 	 */
 	se_cmd->se_dev = rcu_dereference_raw(se_lun->lun_se_dev);
-	atomic_long_inc(&se_cmd->se_dev->num_cmds);
+	this_cpu_inc(se_cmd->se_dev->stats->total_cmds);
 
 	if (se_cmd->data_direction == DMA_TO_DEVICE)
-		atomic_long_add(se_cmd->data_length,
-				&se_cmd->se_dev->write_bytes);
+		this_cpu_add(se_cmd->se_dev->stats->write_bytes,
+			     se_cmd->data_length);
 	else if (se_cmd->data_direction == DMA_FROM_DEVICE)
-		atomic_long_add(se_cmd->data_length,
-				&se_cmd->se_dev->read_bytes);
+		this_cpu_add(se_cmd->se_dev->stats->read_bytes,
+			     se_cmd->data_length);
 
 	return ret;
 }
@@ -322,6 +322,7 @@ int core_enable_device_list_for_node(
 	struct se_portal_group *tpg)
 {
 	struct se_dev_entry *orig, *new;
+	int ret = 0;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new) {
@@ -329,6 +330,12 @@ int core_enable_device_list_for_node(
 		return -ENOMEM;
 	}
 
+	new->stats = alloc_percpu(struct se_dev_entry_io_stats);
+	if (!new->stats) {
+		ret = -ENOMEM;
+		goto free_deve;
+	}
+
 	spin_lock_init(&new->ua_lock);
 	INIT_LIST_HEAD(&new->ua_list);
 	INIT_LIST_HEAD(&new->lun_link);
@@ -351,8 +358,8 @@ int core_enable_device_list_for_node(
 				" for dynamic -> explicit NodeACL conversion:"
 				" %s\n", nacl->initiatorname);
 			mutex_unlock(&nacl->lun_entry_mutex);
-			kfree(new);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto free_stats;
 		}
 		if (orig->se_lun_acl != NULL) {
 			pr_warn_ratelimited("Detected existing explicit"
@@ -360,8 +367,8 @@ int core_enable_device_list_for_node(
 				" mapped_lun: %llu, failing\n",
 				 nacl->initiatorname, mapped_lun);
 			mutex_unlock(&nacl->lun_entry_mutex);
-			kfree(new);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto free_stats;
 		}
 
 		new->se_lun = lun;
@@ -394,6 +401,20 @@ int core_enable_device_list_for_node(
 	target_luns_data_has_changed(nacl, new, true);
 
 	return 0;
+
+free_stats:
+	free_percpu(new->stats);
+free_deve:
+	kfree(new);
+	return ret;
+}
+
+static void target_free_dev_entry(struct rcu_head *head)
+{
+	struct se_dev_entry *deve = container_of(head, struct se_dev_entry,
+						 rcu_head);
+	free_percpu(deve->stats);
+	kfree(deve);
 }
 
 void core_disable_device_list_for_node(
@@ -443,7 +464,7 @@ void core_disable_device_list_for_node(
 	kref_put(&orig->pr_kref, target_pr_kref_release);
 	wait_for_completion(&orig->pr_comp);
 
-	kfree_rcu(orig, rcu_head);
+	call_rcu(&orig->rcu_head, target_free_dev_entry);
 
 	core_scsi3_free_pr_reg_from_nacl(dev, nacl);
 	target_luns_data_has_changed(nacl, NULL, false);
@@ -679,6 +700,18 @@ static void scsi_dump_inquiry(struct se_device *dev)
 	pr_debug(" Type: %s ", scsi_device_type(device_type));
 }
 
+static void target_non_ordered_release(struct percpu_ref *ref)
+{
+	struct se_device *dev = container_of(ref, struct se_device,
+					     non_ordered);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	if (!list_empty(&dev->delayed_cmd_list))
+		schedule_work(&dev->delayed_cmd_work);
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 {
 	struct se_device *dev;
@@ -689,11 +722,13 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 	if (!dev)
 		return NULL;
 
+	dev->stats = alloc_percpu(struct se_dev_io_stats);
+	if (!dev->stats)
+		goto free_device;
+
 	dev->queues = kcalloc(nr_cpu_ids, sizeof(*dev->queues), GFP_KERNEL);
-	if (!dev->queues) {
-		hba->backend->ops->free_device(dev);
-		return NULL;
-	}
+	if (!dev->queues)
+		goto free_stats;
 
 	dev->queue_cnt = nr_cpu_ids;
 	for (i = 0; i < dev->queue_cnt; i++) {
@@ -707,6 +742,10 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 		INIT_WORK(&q->sq.work, target_queued_submit_work);
 	}
 
+	if (percpu_ref_init(&dev->non_ordered, target_non_ordered_release,
+			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+		goto free_queues;
+
 	dev->se_hba = hba;
 	dev->transport = hba->backend->ops;
 	dev->transport_flags = dev->transport->transport_flags_default;
@@ -791,6 +830,14 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
 		sizeof(dev->t10_wwn.revision));
 
 	return dev;
+
+free_queues:
+	kfree(dev->queues);
+free_stats:
+	free_percpu(dev->stats);
+free_device:
+	hba->backend->ops->free_device(dev);
+	return NULL;
 }
 
 /*
@@ -980,6 +1027,9 @@ void target_free_device(struct se_device *dev)
 
 	WARN_ON(!list_empty(&dev->dev_sep_list));
 
+	percpu_ref_exit(&dev->non_ordered);
+	cancel_work_sync(&dev->delayed_cmd_work);
+
 	if (target_dev_configured(dev)) {
 		dev->transport->destroy_device(dev);
 
@@ -1001,6 +1051,7 @@ void target_free_device(struct se_device *dev)
 		dev->transport->free_prot(dev);
 
 	kfree(dev->queues);
+	free_percpu(dev->stats);
 	dev->transport->free_device(dev);
 }
 
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index 210648a0092e..6bdf2d8bd694 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -280,30 +280,51 @@ static ssize_t target_stat_lu_num_cmds_show(struct config_item *item,
 		char *page)
 {
 	struct se_device *dev = to_stat_lu_dev(item);
+	struct se_dev_io_stats *stats;
+	unsigned int cpu;
+	u32 cmds = 0;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(dev->stats, cpu);
+		cmds += stats->total_cmds;
+	}
 
 	/* scsiLuNumCommands */
-	return snprintf(page, PAGE_SIZE, "%lu\n",
-			atomic_long_read(&dev->num_cmds));
+	return snprintf(page, PAGE_SIZE, "%u\n", cmds);
 }
 
 static ssize_t target_stat_lu_read_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_device *dev = to_stat_lu_dev(item);
+	struct se_dev_io_stats *stats;
+	unsigned int cpu;
+	u32 bytes = 0;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(dev->stats, cpu);
+		bytes += stats->read_bytes;
+	}
 
 	/* scsiLuReadMegaBytes */
-	return snprintf(page, PAGE_SIZE, "%lu\n",
-			atomic_long_read(&dev->read_bytes) >> 20);
+	return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_write_mbytes_show(struct config_item *item,
 		char *page)
 {
 	struct se_device *dev = to_stat_lu_dev(item);
+	struct se_dev_io_stats *stats;
+	unsigned int cpu;
+	u32 bytes = 0;
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(dev->stats, cpu);
+		bytes += stats->write_bytes;
+	}
 
 	/* scsiLuWrittenMegaBytes */
-	return snprintf(page, PAGE_SIZE, "%lu\n",
-			atomic_long_read(&dev->write_bytes) >> 20);
+	return snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 }
 
 static ssize_t target_stat_lu_resets_show(struct config_item *item, char *page)
@@ -1019,8 +1040,11 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
 {
 	struct se_lun_acl *lacl = auth_to_lacl(item);
 	struct se_node_acl *nacl = lacl->se_lun_nacl;
+	struct se_dev_entry_io_stats *stats;
 	struct se_dev_entry *deve;
+	unsigned int cpu;
 	ssize_t ret;
+	u32 cmds = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1028,9 +1052,14 @@ static ssize_t target_stat_auth_num_cmds_show(struct config_item *item,
 		rcu_read_unlock();
 		return -ENODEV;
 	}
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(deve->stats, cpu);
+		cmds += stats->total_cmds;
+	}
+
 	/* scsiAuthIntrOutCommands */
-	ret = snprintf(page, PAGE_SIZE, "%lu\n",
-		       atomic_long_read(&deve->total_cmds));
+	ret = snprintf(page, PAGE_SIZE, "%u\n", cmds);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1040,8 +1069,11 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
 {
 	struct se_lun_acl *lacl = auth_to_lacl(item);
 	struct se_node_acl *nacl = lacl->se_lun_nacl;
+	struct se_dev_entry_io_stats *stats;
 	struct se_dev_entry *deve;
+	unsigned int cpu;
 	ssize_t ret;
+	u32 bytes = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1049,9 +1081,14 @@ static ssize_t target_stat_auth_read_mbytes_show(struct config_item *item,
 		rcu_read_unlock();
 		return -ENODEV;
 	}
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(deve->stats, cpu);
+		bytes += stats->read_bytes;
+	}
+
 	/* scsiAuthIntrReadMegaBytes */
-	ret = snprintf(page, PAGE_SIZE, "%u\n",
-		       (u32)(atomic_long_read(&deve->read_bytes) >> 20));
+	ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 	rcu_read_unlock();
 	return ret;
 }
@@ -1061,8 +1098,11 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
 {
 	struct se_lun_acl *lacl = auth_to_lacl(item);
 	struct se_node_acl *nacl = lacl->se_lun_nacl;
+	struct se_dev_entry_io_stats *stats;
 	struct se_dev_entry *deve;
+	unsigned int cpu;
 	ssize_t ret;
+	u32 bytes = 0;
 
 	rcu_read_lock();
 	deve = target_nacl_find_deve(nacl, lacl->mapped_lun);
@@ -1070,9 +1110,14 @@ static ssize_t target_stat_auth_write_mbytes_show(struct config_item *item,
 		rcu_read_unlock();
 		return -ENODEV;
 	}
+
+	for_each_possible_cpu(cpu) {
+		stats = per_cpu_ptr(deve->stats, cpu);
+		bytes += stats->write_bytes;
+	}
+
 	/* scsiAuthIntrWrittenMegaBytes */
-	ret = snprintf(page, PAGE_SIZE, "%u\n",
-		       (u32)(atomic_long_read(&deve->write_bytes) >> 20));
+	ret = snprintf(page, PAGE_SIZE, "%u\n", bytes >> 20);
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 05d29201b730..0a76bdfe5528 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2213,6 +2213,7 @@ static int target_write_prot_action(struct se_cmd *cmd)
 static bool target_handle_task_attr(struct se_cmd *cmd)
 {
 	struct se_device *dev = cmd->se_dev;
+	unsigned long flags;
 
 	if (dev->transport_flags & TRANSPORT_FLAG_PASSTHROUGH)
 		return false;
@@ -2225,13 +2226,10 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
 	 */
 	switch (cmd->sam_task_attr) {
 	case TCM_HEAD_TAG:
-		atomic_inc_mb(&dev->non_ordered);
 		pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
 			 cmd->t_task_cdb[0]);
 		return false;
 	case TCM_ORDERED_TAG:
-		atomic_inc_mb(&dev->delayed_cmd_count);
-
 		pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
 			 cmd->t_task_cdb[0]);
 		break;
@@ -2239,29 +2237,29 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
 		/*
 		 * For SIMPLE and UNTAGGED Task Attribute commands
 		 */
-		atomic_inc_mb(&dev->non_ordered);
-
-		if (atomic_read(&dev->delayed_cmd_count) == 0)
+retry:
+		if (percpu_ref_tryget_live(&dev->non_ordered))
 			return false;
+
 		break;
 	}
 
-	if (cmd->sam_task_attr != TCM_ORDERED_TAG) {
-		atomic_inc_mb(&dev->delayed_cmd_count);
-		/*
-		 * We will account for this when we dequeue from the delayed
-		 * list.
-		 */
-		atomic_dec_mb(&dev->non_ordered);
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	if (cmd->sam_task_attr == TCM_SIMPLE_TAG &&
+	    !percpu_ref_is_dying(&dev->non_ordered)) {
+		spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+		/* We raced with the last ordered completion so retry. */
+		goto retry;
+	} else if (!percpu_ref_is_dying(&dev->non_ordered)) {
+		percpu_ref_kill(&dev->non_ordered);
 	}
 
-	spin_lock_irq(&cmd->t_state_lock);
+	spin_lock(&cmd->t_state_lock);
 	cmd->transport_state &= ~CMD_T_SENT;
-	spin_unlock_irq(&cmd->t_state_lock);
+	spin_unlock(&cmd->t_state_lock);
 
-	spin_lock(&dev->delayed_cmd_lock);
 	list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
-	spin_unlock(&dev->delayed_cmd_lock);
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
 
 	pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD list\n",
 		 cmd->t_task_cdb[0], cmd->sam_task_attr);
@@ -2313,41 +2311,52 @@ void target_do_delayed_work(struct work_struct *work)
 	while (!dev->ordered_sync_in_progress) {
 		struct se_cmd *cmd;
 
-		if (list_empty(&dev->delayed_cmd_list))
+		/*
+		 * We can be woken up early/late due to races or the
+		 * extra wake up we do when adding commands to the list.
+		 * We check for both cases here.
+		 */
+		if (list_empty(&dev->delayed_cmd_list) ||
+		    !percpu_ref_is_zero(&dev->non_ordered))
 			break;
 
 		cmd = list_entry(dev->delayed_cmd_list.next,
 				 struct se_cmd, se_delayed_node);
+		cmd->se_cmd_flags |= SCF_TASK_ORDERED_SYNC;
+		cmd->transport_state |= CMD_T_SENT;
 
-		if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-			/*
-			 * Check if we started with:
-			 * [ordered] [simple] [ordered]
-			 * and we are now at the last ordered so we have to wait
-			 * for the simple cmd.
-			 */
-			if (atomic_read(&dev->non_ordered) > 0)
-				break;
-
-			dev->ordered_sync_in_progress = true;
-		}
+		dev->ordered_sync_in_progress = true;
 
 		list_del(&cmd->se_delayed_node);
-		atomic_dec_mb(&dev->delayed_cmd_count);
 		spin_unlock(&dev->delayed_cmd_lock);
 
-		if (cmd->sam_task_attr != TCM_ORDERED_TAG)
-			atomic_inc_mb(&dev->non_ordered);
-
-		cmd->transport_state |= CMD_T_SENT;
-
 		__target_execute_cmd(cmd, true);
 
 		spin_lock(&dev->delayed_cmd_lock);
 	}
 	spin_unlock(&dev->delayed_cmd_lock);
 }
 
+static void transport_complete_ordered_sync(struct se_cmd *cmd)
+{
+	struct se_device *dev = cmd->se_dev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->delayed_cmd_lock, flags);
+	dev->dev_cur_ordered_id++;
+
+	pr_debug("Incremented dev_cur_ordered_id: %u for type %d\n",
+		 dev->dev_cur_ordered_id, cmd->sam_task_attr);
+
+	dev->ordered_sync_in_progress = false;
+
+	if (list_empty(&dev->delayed_cmd_list))
+		percpu_ref_resurrect(&dev->non_ordered);
+	else
+		schedule_work(&dev->delayed_cmd_work);
+
+	spin_unlock_irqrestore(&dev->delayed_cmd_lock, flags);
+}
+
 /*
  * Called from I/O completion to determine which dormant/delayed
  * and ordered cmds need to have their tasks added to the execution queue.
@@ -2360,30 +2369,24 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
 		return;
 
 	if (!(cmd->se_cmd_flags & SCF_TASK_ATTR_SET))
-		goto restart;
-
-	if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
-		atomic_dec_mb(&dev->non_ordered);
-		dev->dev_cur_ordered_id++;
-	} else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
-		atomic_dec_mb(&dev->non_ordered);
-		dev->dev_cur_ordered_id++;
-		pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
-			 dev->dev_cur_ordered_id);
-	} else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
-		spin_lock(&dev->delayed_cmd_lock);
-		dev->ordered_sync_in_progress = false;
-		spin_unlock(&dev->delayed_cmd_lock);
+		return;
 
-		dev->dev_cur_ordered_id++;
-		pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
-			 dev->dev_cur_ordered_id);
-	}
 	cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET;
 
-restart:
-	if (atomic_read(&dev->delayed_cmd_count) > 0)
-		schedule_work(&dev->delayed_cmd_work);
+	if (cmd->se_cmd_flags & SCF_TASK_ORDERED_SYNC) {
+		transport_complete_ordered_sync(cmd);
+		return;
+	}
+
+	switch (cmd->sam_task_attr) {
+	case TCM_SIMPLE_TAG:
+		percpu_ref_put(&dev->non_ordered);
+		break;
+	case TCM_ORDERED_TAG:
+		/* All ordered should have been executed as sync */
+		WARN_ON(1);
+		break;
+	}
 }
 
 static void transport_complete_qf(struct se_cmd *cmd)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 97099a5e3f6c..a52d4967c0d3 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -157,6 +157,7 @@ enum se_cmd_flags_table {
 	SCF_USE_CPUID			= (1 << 16),
 	SCF_TASK_ATTR_SET		= (1 << 17),
 	SCF_TREAT_READ_AS_NORMAL	= (1 << 18),
+	SCF_TASK_ORDERED_SYNC		= (1 << 19),
 };
 
 /*
@@ -669,15 +670,19 @@ struct se_lun_acl {
 	struct se_ml_stat_grps	ml_stat_grps;
 };
 
+struct se_dev_entry_io_stats {
+	u32	total_cmds;
+	u32	read_bytes;
+	u32	write_bytes;
+};
+
 struct se_dev_entry {
 	u64			mapped_lun;
 	u64			pr_res_key;
 	u64			creation_time;
 	bool			lun_access_ro;
 	u32			attach_count;
-	atomic_long_t		total_cmds;
-	atomic_long_t		read_bytes;
-	atomic_long_t		write_bytes;
+	struct se_dev_entry_io_stats __percpu *stats;
 	/* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */
 	struct kref		pr_kref;
 	struct completion	pr_comp;
@@ -800,6 +805,12 @@ struct se_device_queue {
 	struct se_cmd_queue	sq;
 };
 
+struct se_dev_io_stats {
+	u32	total_cmds;
+	u32	read_bytes;
+	u32	write_bytes;
+};
+
 struct se_device {
 	/* Used for SAM Task Attribute ordering */
 	u32			dev_cur_ordered_id;
@@ -821,13 +832,10 @@ struct se_device {
 	atomic_long_t		num_resets;
 	atomic_long_t		aborts_complete;
 	atomic_long_t		aborts_no_task;
-	atomic_long_t		num_cmds;
-	atomic_long_t		read_bytes;
-	atomic_long_t		write_bytes;
+	struct se_dev_io_stats __percpu *stats;
 	/* Active commands on this virtual SE device */
-	atomic_t		non_ordered;
+	struct percpu_ref	non_ordered;
 	bool			ordered_sync_in_progress;
-	atomic_t		delayed_cmd_count;
 	atomic_t		dev_qf_count;
 	u32			export_count;
 	spinlock_t		delayed_cmd_lock;