summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/netlink/specs/netdev.yaml4
-rw-r--r--Documentation/networking/xsk-tx-metadata.rst62
-rw-r--r--drivers/net/ethernet/intel/igc/igc.h1
-rw-r--r--drivers/net/ethernet/intel/igc/igc_main.c143
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac.h2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c13
-rw-r--r--include/linux/filter.h1
-rw-r--r--include/linux/skbuff.h12
-rw-r--r--include/net/sock.h10
-rw-r--r--include/net/tcp.h7
-rw-r--r--include/net/xdp_sock.h10
-rw-r--r--include/net/xdp_sock_drv.h1
-rw-r--r--include/uapi/linux/bpf.h30
-rw-r--r--include/uapi/linux/if_xdp.h10
-rw-r--r--include/uapi/linux/netdev.h3
-rw-r--r--kernel/bpf/btf.c1
-rw-r--r--net/core/dev.c3
-rw-r--r--net/core/filter.c80
-rw-r--r--net/core/netdev-genl.c2
-rw-r--r--net/core/skbuff.c53
-rw-r--r--net/core/sock.c14
-rw-r--r--net/dsa/user.c2
-rw-r--r--net/ipv4/tcp.c6
-rw-r--r--net/ipv4/tcp_input.c2
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/socket.c2
-rw-r--r--net/xdp/xsk.c3
-rw-r--r--tools/include/uapi/linux/bpf.h30
-rw-r--r--tools/include/uapi/linux/if_xdp.h10
-rw-r--r--tools/include/uapi/linux/netdev.h3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/net_timestamping.c239
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_tracing_net.h1
-rw-r--r--tools/testing/selftests/bpf/progs/net_timestamping.c248
-rw-r--r--tools/testing/selftests/bpf/progs/setget_sockopt.c1
-rw-r--r--tools/testing/selftests/bpf/xdp_hw_metadata.c168
35 files changed, 1126 insertions, 53 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 766b82005d18..36f1152bfac3 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -70,6 +70,10 @@ definitions:
name: tx-checksum
doc:
L3 checksum HW offload is supported by the driver.
+ -
+ name: tx-launch-time-fifo
+ doc:
+ Launch time HW offload is supported by the driver.
-
name: queue-type
type: enum
diff --git a/Documentation/networking/xsk-tx-metadata.rst b/Documentation/networking/xsk-tx-metadata.rst
index e76b0cfc32f7..df53a10ccac3 100644
--- a/Documentation/networking/xsk-tx-metadata.rst
+++ b/Documentation/networking/xsk-tx-metadata.rst
@@ -50,6 +50,10 @@ The flags field enables the particular offload:
checksum. ``csum_start`` specifies byte offset of where the checksumming
should start and ``csum_offset`` specifies byte offset where the
device should store the computed checksum.
+- ``XDP_TXMD_FLAGS_LAUNCH_TIME``: requests the device to schedule the
+ packet for transmission at a pre-determined time called launch time. The
+ value of launch time is indicated by ``launch_time`` field of
+ ``union xsk_tx_metadata``.
Besides the flags above, in order to trigger the offloads, the first
packet's ``struct xdp_desc`` descriptor should set ``XDP_TX_METADATA``
@@ -65,6 +69,63 @@ In this case, when running in ``XDK_COPY`` mode, the TX checksum
is calculated on the CPU. Do not enable this option in production because
it will negatively affect performance.
+Launch Time
+===========
+
+The value of the requested launch time should be based on the device's PTP
+Hardware Clock (PHC) to ensure accuracy. AF_XDP takes a different data path
+compared to the ETF queuing discipline, which organizes packets and delays
+their transmission. Instead, AF_XDP immediately hands off the packets to
+the device driver without rearranging their order or holding them prior to
+transmission. Since the driver maintains FIFO behavior and does not perform
+packet reordering, a packet with a launch time request will block other
+packets in the same Tx Queue until it is sent. Therefore, it is recommended
+to allocate separate queue for scheduling traffic that is intended for
+future transmission.
+
+In scenarios where the launch time offload feature is disabled, the device
+driver is expected to disregard the launch time request. For correct
+interpretation and meaningful operation, the launch time should never be
+set to a value larger than the farthest programmable time in the future
+(the horizon). Different devices have different hardware limitations on the
+launch time offload feature.
+
+stmmac driver
+-------------
+
+For stmmac, TSO and launch time (TBS) features are mutually exclusive for
+each individual Tx Queue. By default, the driver configures Tx Queue 0 to
+support TSO and the rest of the Tx Queues to support TBS. The launch time
+hardware offload feature can be enabled or disabled by using the tc-etf
+command to call the driver's ndo_setup_tc() callback.
+
+The value of the launch time that is programmed in the Enhanced Normal
+Transmit Descriptors is a 32-bit value, where the most significant 8 bits
+represent the time in seconds and the remaining 24 bits represent the time
+in 256 ns increments. The programmed launch time is compared against the
+PTP time (bits[39:8]) and rolls over after 256 seconds. Therefore, the
+horizon of the launch time for dwmac4 and dwxlgmac2 is 128 seconds in the
+future.
+
+igc driver
+----------
+
+For igc, all four Tx Queues support the launch time feature. The launch
+time hardware offload feature can be enabled or disabled by using the
+tc-etf command to call the driver's ndo_setup_tc() callback. When entering
+TSN mode, the igc driver will reset the device and create a default Qbv
+schedule with a 1-second cycle time, with all Tx Queues open at all times.
+
+The value of the launch time that is programmed in the Advanced Transmit
+Context Descriptor is a relative offset to the starting time of the Qbv
+transmission window of the queue. The Frst flag of the descriptor can be
+set to schedule the packet for the next Qbv cycle. Therefore, the horizon
+of the launch time for i225 and i226 is the ending time of the next cycle
+of the Qbv transmission window of the queue. For example, when the Qbv
+cycle time is set to 1 second, the horizon of the launch time ranges
+from 1 second to 2 seconds, depending on where the Qbv cycle is currently
+running.
+
Querying Device Capabilities
============================
@@ -74,6 +135,7 @@ Refer to ``xsk-flags`` features bitmask in
- ``tx-timestamp``: device supports ``XDP_TXMD_FLAGS_TIMESTAMP``
- ``tx-checksum``: device supports ``XDP_TXMD_FLAGS_CHECKSUM``
+- ``tx-launch-time-fifo``: device supports ``XDP_TXMD_FLAGS_LAUNCH_TIME``
See ``tools/net/ynl/samples/netdev.c`` on how to query this information.
diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index b8111ad9a9a8..cd1d7b6c1782 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -579,6 +579,7 @@ struct igc_metadata_request {
struct xsk_tx_metadata *meta;
struct igc_ring *tx_ring;
u32 cmd_type;
+ u16 used_desc;
};
struct igc_q_vector {
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 84307bb7313e..3044392e8ded 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -1092,7 +1092,8 @@ static int igc_init_empty_frame(struct igc_ring *ring,
dma = dma_map_single(ring->dev, skb->data, size, DMA_TO_DEVICE);
if (dma_mapping_error(ring->dev, dma)) {
- netdev_err_once(ring->netdev, "Failed to map DMA for TX\n");
+ net_err_ratelimited("%s: DMA mapping error for empty frame\n",
+ netdev_name(ring->netdev));
return -ENOMEM;
}
@@ -1108,20 +1109,12 @@ static int igc_init_empty_frame(struct igc_ring *ring,
return 0;
}
-static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
- struct sk_buff *skb,
- struct igc_tx_buffer *first)
+static void igc_init_tx_empty_descriptor(struct igc_ring *ring,
+ struct sk_buff *skb,
+ struct igc_tx_buffer *first)
{
union igc_adv_tx_desc *desc;
u32 cmd_type, olinfo_status;
- int err;
-
- if (!igc_desc_unused(ring))
- return -EBUSY;
-
- err = igc_init_empty_frame(ring, first, skb);
- if (err)
- return err;
cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD |
@@ -1140,8 +1133,6 @@ static int igc_init_tx_empty_descriptor(struct igc_ring *ring,
ring->next_to_use++;
if (ring->next_to_use == ring->count)
ring->next_to_use = 0;
-
- return 0;
}
#define IGC_EMPTY_FRAME_SIZE 60
@@ -1567,6 +1558,40 @@ static bool igc_request_tx_tstamp(struct igc_adapter *adapter, struct sk_buff *s
return false;
}
+static int igc_insert_empty_frame(struct igc_ring *tx_ring)
+{
+ struct igc_tx_buffer *empty_info;
+ struct sk_buff *empty_skb;
+ void *data;
+ int ret;
+
+ empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+ empty_skb = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
+ if (unlikely(!empty_skb)) {
+ net_err_ratelimited("%s: skb alloc error for empty frame\n",
+ netdev_name(tx_ring->netdev));
+ return -ENOMEM;
+ }
+
+ data = skb_put(empty_skb, IGC_EMPTY_FRAME_SIZE);
+ memset(data, 0, IGC_EMPTY_FRAME_SIZE);
+
+ /* Prepare DMA mapping and Tx buffer information */
+ ret = igc_init_empty_frame(tx_ring, empty_info, empty_skb);
+ if (unlikely(ret)) {
+ dev_kfree_skb_any(empty_skb);
+ return ret;
+ }
+
+ /* Prepare advanced context descriptor for empty packet */
+ igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
+
+ /* Prepare advanced data descriptor for empty packet */
+ igc_init_tx_empty_descriptor(tx_ring, empty_skb, empty_info);
+
+ return 0;
+}
+
static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
struct igc_ring *tx_ring)
{
@@ -1586,6 +1611,7 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
* + 1 desc for skb_headlen/IGC_MAX_DATA_PER_TXD,
* + 2 desc gap to keep tail from touching head,
* + 1 desc for context descriptor,
+ * + 2 desc for inserting an empty packet for launch time,
* otherwise try next time
*/
for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
@@ -1605,24 +1631,16 @@ static netdev_tx_t igc_xmit_frame_ring(struct sk_buff *skb,
launch_time = igc_tx_launchtime(tx_ring, txtime, &first_flag, &insert_empty);
if (insert_empty) {
- struct igc_tx_buffer *empty_info;
- struct sk_buff *empty;
- void *data;
-
- empty_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
- empty = alloc_skb(IGC_EMPTY_FRAME_SIZE, GFP_ATOMIC);
- if (!empty)
- goto done;
-
- data = skb_put(empty, IGC_EMPTY_FRAME_SIZE);
- memset(data, 0, IGC_EMPTY_FRAME_SIZE);
-
- igc_tx_ctxtdesc(tx_ring, 0, false, 0, 0, 0);
-
- if (igc_init_tx_empty_descriptor(tx_ring,
- empty,
- empty_info) < 0)
- dev_kfree_skb_any(empty);
+ /* Reset the launch time if the required empty frame fails to
+ * be inserted. However, this packet is not dropped, so it
+ * "dirties" the current Qbv cycle. This ensures that the
+ * upcoming packet, which is scheduled in the next Qbv cycle,
+ * does not require an empty frame. This way, the launch time
+ * continues to function correctly despite the current failure
+ * to insert the empty frame.
+ */
+ if (igc_insert_empty_frame(tx_ring))
+ launch_time = 0;
}
done:
@@ -2953,9 +2971,48 @@ static u64 igc_xsk_fill_timestamp(void *_priv)
return *(u64 *)_priv;
}
+static void igc_xsk_request_launch_time(u64 launch_time, void *_priv)
+{
+ struct igc_metadata_request *meta_req = _priv;
+ struct igc_ring *tx_ring = meta_req->tx_ring;
+ __le32 launch_time_offset;
+ bool insert_empty = false;
+ bool first_flag = false;
+ u16 used_desc = 0;
+
+ if (!tx_ring->launchtime_enable)
+ return;
+
+ launch_time_offset = igc_tx_launchtime(tx_ring,
+ ns_to_ktime(launch_time),
+ &first_flag, &insert_empty);
+ if (insert_empty) {
+ /* Disregard the launch time request if the required empty frame
+ * fails to be inserted.
+ */
+ if (igc_insert_empty_frame(tx_ring))
+ return;
+
+ meta_req->tx_buffer =
+ &tx_ring->tx_buffer_info[tx_ring->next_to_use];
+ /* Inserting an empty packet requires two descriptors:
+ * one data descriptor and one context descriptor.
+ */
+ used_desc += 2;
+ }
+
+ /* Use one context descriptor to specify launch time and first flag. */
+ igc_tx_ctxtdesc(tx_ring, launch_time_offset, first_flag, 0, 0, 0);
+ used_desc += 1;
+
+ /* Update the number of used descriptors in this request */
+ meta_req->used_desc += used_desc;
+}
+
const struct xsk_tx_metadata_ops igc_xsk_tx_metadata_ops = {
.tmo_request_timestamp = igc_xsk_request_timestamp,
.tmo_fill_timestamp = igc_xsk_fill_timestamp,
+ .tmo_request_launch_time = igc_xsk_request_launch_time,
};
static void igc_xdp_xmit_zc(struct igc_ring *ring)
@@ -2978,7 +3035,13 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
ntu = ring->next_to_use;
budget = igc_desc_unused(ring);
- while (xsk_tx_peek_desc(pool, &xdp_desc) && budget--) {
+ /* Packets with launch time require one data descriptor and one context
+ * descriptor. When the launch time falls into the next Qbv cycle, we
+ * may need to insert an empty packet, which requires two more
+ * descriptors. Therefore, to be safe, we always ensure we have at least
+ * 4 descriptors available.
+ */
+ while (xsk_tx_peek_desc(pool, &xdp_desc) && budget >= 4) {
struct igc_metadata_request meta_req;
struct xsk_tx_metadata *meta = NULL;
struct igc_tx_buffer *bi;
@@ -2999,9 +3062,19 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
meta_req.tx_ring = ring;
meta_req.tx_buffer = bi;
meta_req.meta = meta;
+ meta_req.used_desc = 0;
xsk_tx_metadata_request(meta, &igc_xsk_tx_metadata_ops,
&meta_req);
+ /* xsk_tx_metadata_request() may have updated next_to_use */
+ ntu = ring->next_to_use;
+
+ /* xsk_tx_metadata_request() may have updated Tx buffer info */
+ bi = meta_req.tx_buffer;
+
+ /* xsk_tx_metadata_request() may use a few descriptors */
+ budget -= meta_req.used_desc;
+
tx_desc = IGC_TX_DESC(ring, ntu);
tx_desc->read.cmd_type_len = cpu_to_le32(meta_req.cmd_type);
tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
@@ -3019,9 +3092,11 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring)
ntu++;
if (ntu == ring->count)
ntu = 0;
+
+ ring->next_to_use = ntu;
+ budget--;
}
- ring->next_to_use = ntu;
if (tx_desc) {
igc_flush_tx_descriptors(ring);
xsk_tx_release(pool);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 3395188c198a..b403029b1aaa 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -106,6 +106,8 @@ struct stmmac_metadata_request {
struct stmmac_priv *priv;
struct dma_desc *tx_desc;
bool *set_ic;
+ struct dma_edesc *edesc;
+ int tbs;
};
struct stmmac_xsk_tx_complete {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 4d542f482ecb..edf4b1d651a6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2486,9 +2486,20 @@ static u64 stmmac_xsk_fill_timestamp(void *_priv)
return 0;
}
+static void stmmac_xsk_request_launch_time(u64 launch_time, void *_priv)
+{
+ struct timespec64 ts = ns_to_timespec64(launch_time);
+ struct stmmac_metadata_request *meta_req = _priv;
+
+ if (meta_req->tbs & STMMAC_TBS_EN)
+ stmmac_set_desc_tbs(meta_req->priv, meta_req->edesc, ts.tv_sec,
+ ts.tv_nsec);
+}
+
static const struct xsk_tx_metadata_ops stmmac_xsk_tx_metadata_ops = {
.tmo_request_timestamp = stmmac_xsk_request_timestamp,
.tmo_fill_timestamp = stmmac_xsk_fill_timestamp,
+ .tmo_request_launch_time = stmmac_xsk_request_launch_time,
};
static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
@@ -2572,6 +2583,8 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
meta_req.priv = priv;
meta_req.tx_desc = tx_desc;
meta_req.set_ic = &set_ic;
+ meta_req.tbs = tx_q->tbs;
+ meta_req.edesc = &tx_q->dma_entx[entry];
xsk_tx_metadata_request(meta, &stmmac_xsk_tx_metadata_ops,
&meta_req);
if (set_ic) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a3ea46281595..d36d5d5180b1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern {
void *skb_data_end;
u8 op;
u8 is_fullsock;
+ u8 is_locked_tcp_sock;
u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb2b751d274a..0b4f1889500d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,7 @@ struct skb_shared_hwtstamps {
/* Definitions for tx_flags in struct skb_shared_info */
enum {
/* generate hardware time stamp */
- SKBTX_HW_TSTAMP = 1 << 0,
+ SKBTX_HW_TSTAMP_NOBPF = 1 << 0,
/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,
@@ -489,10 +489,16 @@ enum {
/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,
+
+ /* used for bpf extension when a bpf program is loaded */
+ SKBTX_BPF = 1 << 7,
};
+#define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)
+
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
- SKBTX_SCHED_TSTAMP)
+ SKBTX_SCHED_TSTAMP | \
+ SKBTX_BPF)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
SKBTX_HW_TSTAMP_USE_CYCLES | \
SKBTX_ANY_SW_TSTAMP)
@@ -4564,7 +4570,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
- if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+ if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF))
skb_tstamp_tx(skb, NULL);
}
diff --git a/include/net/sock.h b/include/net/sock.h
index edbb870e3f86..efc031163c33 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
@@ -525,6 +526,8 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u8 sk_bpf_cb_flags;
void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -2909,6 +2912,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ced32b924a9c..9745c7f18170 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -978,10 +978,12 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
- __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
+#define TSTAMP_ACK_SK 0x1
+#define TSTAMP_ACK_BPF 0x2
+ __u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
- unused:5;
+ unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
@@ -2671,6 +2673,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_owned_by_me(sk);
}
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index bfe625b55d55..a58ae7589d12 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -110,11 +110,16 @@ struct xdp_sock {
* indicates position where checksumming should start.
* csum_offset indicates position where checksum should be stored.
*
+ * void (*tmo_request_launch_time)(u64 launch_time, void *priv)
+ * Called when AF_XDP frame requested launch time HW offload support.
+ * launch_time indicates the PTP time at which the device can schedule the
+ * packet for transmission.
*/
struct xsk_tx_metadata_ops {
void (*tmo_request_timestamp)(void *priv);
u64 (*tmo_fill_timestamp)(void *priv);
void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+ void (*tmo_request_launch_time)(u64 launch_time, void *priv);
};
#ifdef CONFIG_XDP_SOCKETS
@@ -162,6 +167,11 @@ static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
if (!meta)
return;
+ if (ops->tmo_request_launch_time)
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ ops->tmo_request_launch_time(meta->request.launch_time,
+ priv);
+
if (ops->tmo_request_timestamp)
if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
ops->tmo_request_timestamp(priv);
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 15086dcf51d8..513c8e9704f6 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -216,6 +216,7 @@ xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
#define XDP_TXMD_FLAGS_VALID ( \
XDP_TXMD_FLAGS_TIMESTAMP | \
XDP_TXMD_FLAGS_CHECKSUM | \
+ XDP_TXMD_FLAGS_LAUNCH_TIME | \
0)
static inline bool
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2acf9b336371..defa5bb881f4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6913,6 +6913,12 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
+enum {
+ SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
+ SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
+ SK_BPF_CB_TX_TIMESTAMPING
+};
+
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -7025,6 +7031,29 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing
+ * through dev layer when
+ * SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send
+ * to the nic when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when
+ * SK_BPF_CB_TX_TIMESTAMPING feature
+ * is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the
+ * same sendmsg call are acked
+ * when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall
+ * is triggered. It's used to correlate
+ * sendmsg timestamp with corresponding
+ * tskey.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7091,6 +7120,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
};
enum {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 42ec5ddaab8d..42869770776e 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -127,6 +127,12 @@ struct xdp_options {
*/
#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+/* Request launch time hardware offload. The device will schedule the packet for
+ * transmission at a pre-determined time called launch time. The value of
+ * launch time is communicated via launch_time field of struct xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2)
+
/* AF_XDP offloads request. 'request' union member is consumed by the driver
* when the packet is being transmitted. 'completion' union member is
* filled by the driver when the transmit completion arrives.
@@ -142,6 +148,10 @@ struct xsk_tx_metadata {
__u16 csum_start;
/* Offset from csum_start where checksum should be stored. */
__u16 csum_offset;
+
+ /* XDP_TXMD_FLAGS_LAUNCH_TIME */
+ /* Launch time in nanosecond against the PTP HW Clock */
+ __u64 launch_time;
} request;
struct {
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 4e82f3871473..7600bf62dbdf 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -59,10 +59,13 @@ enum netdev_xdp_rx_metadata {
* by the driver.
* @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
* driver.
+ * @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported
+ * by the driver.
*/
enum netdev_xsk_flags {
NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+ NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4,
};
enum netdev_queue_type {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9de6acddd479..551eedb87d58 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8524,6 +8524,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
diff --git a/net/core/dev.c b/net/core/dev.c
index 18064be6cf3e..8c7ee7ada6a3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4572,7 +4572,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ec162dd83c4..a0867c5b32b3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
+{
+ u32 sk_bpf_cb_flags;
+
+ if (getopt) {
+ *(u32 *)optval = sk->sk_bpf_cb_flags;
+ return 0;
+ }
+
+ sk_bpf_cb_flags = *(u32 *)optval;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+ return -EINVAL;
+
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
@@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5247,6 +5267,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
@@ -5382,6 +5405,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
case TCP_USER_TIMEOUT:
case TCP_NOTSENT_LOWAT:
case TCP_SAVE_SYN:
+ case TCP_RTO_MAX_MS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5500,6 +5524,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}
+static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
+{
+ return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+}
+
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
@@ -5650,6 +5679,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -5735,6 +5767,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
int ret, copy_len = 0;
@@ -5777,6 +5812,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
@@ -7586,6 +7624,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
u8 search_kind, search_len, copy_len, magic_len;
int ret;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
/* 2 byte is the minimal option len except TCPOPT_NOP and
* TCPOPT_EOL which are useless for the bpf prog to learn
* and this helper disallow loading them also.
@@ -10358,10 +10399,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
} \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
if (si->dst_reg == si->src_reg) \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
@@ -10446,10 +10487,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
@@ -12062,6 +12103,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
#endif
}
+__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
+ u64 flags)
+{
+ struct sk_buff *skb;
+
+ if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
+ return -EOPNOTSUPP;
+
+ if (flags)
+ return -EINVAL;
+
+ skb = skops->skb;
+ skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
+ TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
+ skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
@@ -12095,6 +12155,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
+
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -12115,6 +12179,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
.set = &bpf_kfunc_check_set_tcp_reqsk,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_ops,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -12133,7 +12202,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index c92fba65b20d..2b774183d31c 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -53,6 +53,8 @@ XDP_METADATA_KFUNC_xxx
xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
}
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7b03b64fdcb2..5b241c9e6f38 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5449,6 +5449,52 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5461,6 +5507,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
diff --git a/net/core/sock.c b/net/core/sock.c
index 0d385bf27b38..5ac445f8244b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -949,6 +949,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
return 0;
}
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 2296a4ead020..804dc7dac4f2 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -897,7 +897,7 @@ static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
{
struct dsa_switch *ds = p->dp->ds;
- if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
+ if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF))
return;
if (!ds->ops->port_txtstamp)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6a8f19a10911..08d73f17e816 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -492,10 +492,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
if (tsflags & SOF_TIMESTAMPING_TX_ACK)
- tcb->txstamp_ack = 1;
+ tcb->txstamp_ack |= TSTAMP_ACK_SK;
if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
+
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
+ bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
static bool tcp_stream_is_readable(struct sock *sk, int target)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5fddcd0bbe91..217a8747a79b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -169,6 +169,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
@@ -185,6 +186,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op,
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
sock_ops.op = bpf_op;
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
if (skb)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4b40f135432..9a3cf51eab78 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -525,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
}
@@ -570,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
sock_owned_by_me(sk);
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_ops.sk = sk;
}
diff --git a/net/socket.c b/net/socket.c
index 28bae5a94234..0545e9ea7058 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -681,7 +681,7 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags)
u8 flags = *tx_flags;
if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) {
- flags |= SKBTX_HW_TSTAMP;
+ flags |= SKBTX_HW_TSTAMP_NOBPF;
/* PTP hardware clocks can provide a free running cycle counter
* as a time base for virtual clocks. Tell driver to use the
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 0edf25973072..84bf9f1d4bf2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -742,6 +742,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
}
+
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ skb->skb_mstamp_ns = meta->request.launch_time;
}
}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2acf9b336371..defa5bb881f4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6913,6 +6913,12 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
+enum {
+ SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
+ SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
+ SK_BPF_CB_TX_TIMESTAMPING
+};
+
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -7025,6 +7031,29 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing
+ * through dev layer when
+ * SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send
+ * to the nic when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when
+ * SK_BPF_CB_TX_TIMESTAMPING feature
+ * is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the
+ * same sendmsg call are acked
+ * when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall
+ * is triggered. It's used to correlate
+ * sendmsg timestamp with corresponding
+ * tskey.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7091,6 +7120,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
};
enum {
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 42ec5ddaab8d..42869770776e 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -127,6 +127,12 @@ struct xdp_options {
*/
#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1)
+/* Request launch time hardware offload. The device will schedule the packet for
+ * transmission at a pre-determined time called launch time. The value of
+ * launch time is communicated via launch_time field of struct xsk_tx_metadata.
+ */
+#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2)
+
/* AF_XDP offloads request. 'request' union member is consumed by the driver
* when the packet is being transmitted. 'completion' union member is
* filled by the driver when the transmit completion arrives.
@@ -142,6 +148,10 @@ struct xsk_tx_metadata {
__u16 csum_start;
/* Offset from csum_start where checksum should be stored. */
__u16 csum_offset;
+
+ /* XDP_TXMD_FLAGS_LAUNCH_TIME */
+ /* Launch time in nanosecond against the PTP HW Clock */
+ __u64 launch_time;
} request;
struct {
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 4e82f3871473..7600bf62dbdf 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -59,10 +59,13 @@ enum netdev_xdp_rx_metadata {
* by the driver.
* @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the
* driver.
+ * @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported
+ * by the driver.
*/
enum netdev_xsk_flags {
NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1,
NETDEV_XSK_FLAGS_TX_CHECKSUM = 2,
+ NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4,
};
enum netdev_queue_type {
diff --git a/tools/testing/selftests/bpf/prog_tests/net_timestamping.c b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c
new file mode 100644
index 000000000000..dbfd87499b6b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c
@@ -0,0 +1,239 @@
+#include <linux/net_tstamp.h>
+#include <sys/time.h>
+#include <linux/errqueue.h>
+#include "test_progs.h"
+#include "network_helpers.h"
+#include "net_timestamping.skel.h"
+
+#define CG_NAME "/net-timestamping-test"
+#define NSEC_PER_SEC 1000000000LL
+
+static const char addr4_str[] = "127.0.0.1";
+static const char addr6_str[] = "::1";
+static struct net_timestamping *skel;
+static const int cfg_payload_len = 30;
+static struct timespec usr_ts;
+static u64 delay_tolerance_nsec = 10000000000; /* 10 seconds */
+int SK_TS_SCHED;
+int SK_TS_TXSW;
+int SK_TS_ACK;
+
+static int64_t timespec_to_ns64(struct timespec *ts)
+{
+ return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
+}
+
+static void validate_key(int tskey, int tstype)
+{
+ static int expected_tskey = -1;
+
+ if (tstype == SCM_TSTAMP_SCHED)
+ expected_tskey = cfg_payload_len - 1;
+
+ ASSERT_EQ(expected_tskey, tskey, "tskey mismatch");
+
+ expected_tskey = tskey;
+}
+
+static void validate_timestamp(struct timespec *cur, struct timespec *prev)
+{
+ int64_t cur_ns, prev_ns;
+
+ cur_ns = timespec_to_ns64(cur);
+ prev_ns = timespec_to_ns64(prev);
+
+ ASSERT_LT(cur_ns - prev_ns, delay_tolerance_nsec, "latency");
+}
+
+static void test_socket_timestamp(struct scm_timestamping *tss, int tstype,
+ int tskey)
+{
+ static struct timespec prev_ts;
+
+ validate_key(tskey, tstype);
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ validate_timestamp(&tss->ts[0], &usr_ts);
+ SK_TS_SCHED += 1;
+ break;
+ case SCM_TSTAMP_SND:
+ validate_timestamp(&tss->ts[0], &prev_ts);
+ SK_TS_TXSW += 1;
+ break;
+ case SCM_TSTAMP_ACK:
+ validate_timestamp(&tss->ts[0], &prev_ts);
+ SK_TS_ACK += 1;
+ break;
+ }
+
+ prev_ts = tss->ts[0];
+}
+
+static void test_recv_errmsg_cmsg(struct msghdr *msg)
+{
+ struct sock_extended_err *serr = NULL;
+ struct scm_timestamping *tss = NULL;
+ struct cmsghdr *cm;
+
+ for (cm = CMSG_FIRSTHDR(msg);
+ cm && cm->cmsg_len;
+ cm = CMSG_NXTHDR(msg, cm)) {
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_TIMESTAMPING) {
+ tss = (void *)CMSG_DATA(cm);
+ } else if ((cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_RECVERR) ||
+ (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_RECVERR) ||
+ (cm->cmsg_level == SOL_PACKET &&
+ cm->cmsg_type == PACKET_TX_TIMESTAMP)) {
+ serr = (void *)CMSG_DATA(cm);
+ ASSERT_EQ(serr->ee_origin, SO_EE_ORIGIN_TIMESTAMPING,
+ "cmsg type");
+ }
+
+ if (serr && tss)
+ test_socket_timestamp(tss, serr->ee_info,
+ serr->ee_data);
+ }
+}
+
+static bool socket_recv_errmsg(int fd)
+{
+ static char ctrl[1024 /* overprovision*/];
+ char data[cfg_payload_len];
+ static struct msghdr msg;
+ struct iovec entry;
+ int n = 0;
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&entry, 0, sizeof(entry));
+ memset(ctrl, 0, sizeof(ctrl));
+
+ entry.iov_base = data;
+ entry.iov_len = cfg_payload_len;
+ msg.msg_iov = &entry;
+ msg.msg_iovlen = 1;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = ctrl;
+ msg.msg_controllen = sizeof(ctrl);
+
+ n = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (n == -1)
+ ASSERT_EQ(errno, EAGAIN, "recvmsg MSG_ERRQUEUE");
+
+ if (n >= 0)
+ test_recv_errmsg_cmsg(&msg);
+
+ return n == -1;
+}
+
+static void test_socket_timestamping(int fd)
+{
+ while (!socket_recv_errmsg(fd));
+
+ ASSERT_EQ(SK_TS_SCHED, 1, "SCM_TSTAMP_SCHED");
+ ASSERT_EQ(SK_TS_TXSW, 1, "SCM_TSTAMP_SND");
+ ASSERT_EQ(SK_TS_ACK, 1, "SCM_TSTAMP_ACK");
+
+ SK_TS_SCHED = 0;
+ SK_TS_TXSW = 0;
+ SK_TS_ACK = 0;
+}
+
+static void test_tcp(int family, bool enable_socket_timestamping)
+{
+ struct net_timestamping__bss *bss;
+ char buf[cfg_payload_len];
+ int sfd = -1, cfd = -1;
+ unsigned int sock_opt;
+ struct netns_obj *ns;
+ int cg_fd;
+ int ret;
+
+ cg_fd = test__join_cgroup(CG_NAME);
+ if (!ASSERT_OK_FD(cg_fd, "join cgroup"))
+ return;
+
+ ns = netns_new("net_timestamping_ns", true);
+ if (!ASSERT_OK_PTR(ns, "create ns"))
+ goto out;
+
+ skel = net_timestamping__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open and load skel"))
+ goto out;
+
+ if (!ASSERT_OK(net_timestamping__attach(skel), "attach skel"))
+ goto out;
+
+ skel->links.skops_sockopt =
+ bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
+ if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
+ goto out;
+
+ bss = skel->bss;
+ memset(bss, 0, sizeof(*bss));
+
+ skel->bss->monitored_pid = getpid();
+
+ sfd = start_server(family, SOCK_STREAM,
+ family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
+ if (!ASSERT_OK_FD(sfd, "start_server"))
+ goto out;
+
+ cfd = connect_to_fd(sfd, 0);
+ if (!ASSERT_OK_FD(cfd, "connect_to_fd_server"))
+ goto out;
+
+ if (enable_socket_timestamping) {
+ sock_opt = SOF_TIMESTAMPING_SOFTWARE |
+ SOF_TIMESTAMPING_OPT_ID |
+ SOF_TIMESTAMPING_TX_SCHED |
+ SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_TX_ACK;
+ ret = setsockopt(cfd, SOL_SOCKET, SO_TIMESTAMPING,
+ (char *) &sock_opt, sizeof(sock_opt));
+ if (!ASSERT_OK(ret, "setsockopt SO_TIMESTAMPING"))
+ goto out;
+
+ ret = clock_gettime(CLOCK_REALTIME, &usr_ts);
+ if (!ASSERT_OK(ret, "get user time"))
+ goto out;
+ }
+
+ ret = write(cfd, buf, sizeof(buf));
+ if (!ASSERT_EQ(ret, sizeof(buf), "send to server"))
+ goto out;
+
+ if (enable_socket_timestamping)
+ test_socket_timestamping(cfd);
+
+ ASSERT_EQ(bss->nr_active, 1, "nr_active");
+ ASSERT_EQ(bss->nr_snd, 2, "nr_snd");
+ ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
+ ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
+ ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
+
+out:
+ if (sfd >= 0)
+ close(sfd);
+ if (cfd >= 0)
+ close(cfd);
+ net_timestamping__destroy(skel);
+ netns_free(ns);
+ close(cg_fd);
+}
+
+void test_net_timestamping(void)
+{
+ if (test__start_subtest("INET4: bpf timestamping"))
+ test_tcp(AF_INET, false);
+ if (test__start_subtest("INET4: bpf and socket timestamping"))
+ test_tcp(AF_INET, true);
+ if (test__start_subtest("INET6: bpf timestamping"))
+ test_tcp(AF_INET6, false);
+ if (test__start_subtest("INET6: bpf and socket timestamping"))
+ test_tcp(AF_INET6, true);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 59843b430f76..eb6ed1b7b2ef 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -49,6 +49,7 @@
#define TCP_SAVED_SYN 28
#define TCP_CA_NAME_MAX 16
#define TCP_NAGLE_OFF 1
+#define TCP_RTO_MAX_MS 44
#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
diff --git a/tools/testing/selftests/bpf/progs/net_timestamping.c b/tools/testing/selftests/bpf/progs/net_timestamping.c
new file mode 100644
index 000000000000..b4c2f0f2be11
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/net_timestamping.c
@@ -0,0 +1,248 @@
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_kfuncs.h"
+#include <errno.h>
+
+__u32 monitored_pid = 0;
+
+int nr_active;
+int nr_snd;
+int nr_passive;
+int nr_sched;
+int nr_txsw;
+int nr_ack;
+
+struct sk_stg {
+ __u64 sendmsg_ns; /* record ts when sendmsg is called */
+};
+
+struct sk_tskey {
+ u64 cookie;
+ u32 tskey;
+};
+
+struct delay_info {
+ u64 sendmsg_ns; /* record ts when sendmsg is called */
+ u32 sched_delay; /* SCHED_CB - sendmsg_ns */
+ u32 snd_sw_delay; /* SND_SW_CB - SCHED_CB */
+ u32 ack_delay; /* ACK_CB - SND_SW_CB */
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct sk_stg);
+} sk_stg_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct sk_tskey);
+ __type(value, struct delay_info);
+ __uint(max_entries, 1024);
+} time_map SEC(".maps");
+
+static u64 delay_tolerance_nsec = 10000000000; /* 10 second as an example */
+
+extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) __ksym;
+
+static int bpf_test_sockopt(void *ctx, const struct sock *sk, int expected)
+{
+ int tmp, new = SK_BPF_CB_TX_TIMESTAMPING;
+ int opt = SK_BPF_CB_FLAGS;
+ int level = SOL_SOCKET;
+
+ if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)) != expected)
+ return 1;
+
+ if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) != expected ||
+ (!expected && tmp != new))
+ return 1;
+
+ return 0;
+}
+
+static bool bpf_test_access_sockopt(void *ctx, const struct sock *sk)
+{
+ if (bpf_test_sockopt(ctx, sk, -EOPNOTSUPP))
+ return true;
+ return false;
+}
+
+static bool bpf_test_access_load_hdr_opt(struct bpf_sock_ops *skops)
+{
+ u8 opt[3] = {0};
+ int load_flags = 0;
+ int ret;
+
+ ret = bpf_load_hdr_opt(skops, opt, sizeof(opt), load_flags);
+ if (ret != -EOPNOTSUPP)
+ return true;
+
+ return false;
+}
+
+static bool bpf_test_access_cb_flags_set(struct bpf_sock_ops *skops)
+{
+ int ret;
+
+ ret = bpf_sock_ops_cb_flags_set(skops, 0);
+ if (ret != -EOPNOTSUPP)
+ return true;
+
+ return false;
+}
+
+/* In the timestamping callbacks, we're not allowed to call the following
+ * BPF CALLs for the safety concern. Return false if expected.
+ */
+static bool bpf_test_access_bpf_calls(struct bpf_sock_ops *skops,
+ const struct sock *sk)
+{
+ if (bpf_test_access_sockopt(skops, sk))
+ return true;
+
+ if (bpf_test_access_load_hdr_opt(skops))
+ return true;
+
+ if (bpf_test_access_cb_flags_set(skops))
+ return true;
+
+ return false;
+}
+
+static bool bpf_test_delay(struct bpf_sock_ops *skops, const struct sock *sk)
+{
+ struct bpf_sock_ops_kern *skops_kern;
+ u64 timestamp = bpf_ktime_get_ns();
+ struct skb_shared_info *shinfo;
+ struct delay_info dinfo = {0};
+ struct sk_tskey key = {0};
+ struct delay_info *val;
+ struct sk_buff *skb;
+ struct sk_stg *stg;
+ u64 prior_ts, delay;
+
+ if (bpf_test_access_bpf_calls(skops, sk))
+ return false;
+
+ skops_kern = bpf_cast_to_kern_ctx(skops);
+ skb = skops_kern->skb;
+ shinfo = bpf_core_cast(skb->head + skb->end, struct skb_shared_info);
+
+ key.cookie = bpf_get_socket_cookie(skops);
+ if (!key.cookie)
+ return false;
+
+ if (skops->op == BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) {
+ stg = bpf_sk_storage_get(&sk_stg_map, (void *)sk, 0, 0);
+ if (!stg)
+ return false;
+ dinfo.sendmsg_ns = stg->sendmsg_ns;
+ bpf_sock_ops_enable_tx_tstamp(skops_kern, 0);
+ key.tskey = shinfo->tskey;
+ if (!key.tskey)
+ return false;
+ bpf_map_update_elem(&time_map, &key, &dinfo, BPF_ANY);
+ return true;
+ }
+
+ key.tskey = shinfo->tskey;
+ if (!key.tskey)
+ return false;
+
+ val = bpf_map_lookup_elem(&time_map, &key);
+ if (!val)
+ return false;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_TSTAMP_SCHED_CB:
+ val->sched_delay = timestamp - val->sendmsg_ns;
+ delay = val->sched_delay;
+ break;
+ case BPF_SOCK_OPS_TSTAMP_SND_SW_CB:
+ prior_ts = val->sched_delay + val->sendmsg_ns;
+ val->snd_sw_delay = timestamp - prior_ts;
+ delay = val->snd_sw_delay;
+ break;
+ case BPF_SOCK_OPS_TSTAMP_ACK_CB:
+ prior_ts = val->snd_sw_delay + val->sched_delay + val->sendmsg_ns;
+ val->ack_delay = timestamp - prior_ts;
+ delay = val->ack_delay;
+ break;
+ }
+
+ if (delay >= delay_tolerance_nsec)
+ return false;
+
+ /* Since it's the last one, remove from the map after latency check */
+ if (skops->op == BPF_SOCK_OPS_TSTAMP_ACK_CB)
+ bpf_map_delete_elem(&time_map, &key);
+
+ return true;
+}
+
+SEC("fentry/tcp_sendmsg_locked")
+int BPF_PROG(trace_tcp_sendmsg_locked, struct sock *sk, struct msghdr *msg,
+ size_t size)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ u64 timestamp = bpf_ktime_get_ns();
+ u32 flag = sk->sk_bpf_cb_flags;
+ struct sk_stg *stg;
+
+ if (pid != monitored_pid || !flag)
+ return 0;
+
+ stg = bpf_sk_storage_get(&sk_stg_map, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!stg)
+ return 0;
+
+ stg->sendmsg_ns = timestamp;
+ nr_snd += 1;
+ return 0;
+}
+
+SEC("sockops")
+int skops_sockopt(struct bpf_sock_ops *skops)
+{
+ struct bpf_sock *bpf_sk = skops->sk;
+ const struct sock *sk;
+
+ if (!bpf_sk)
+ return 1;
+
+ sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
+ if (!sk)
+ return 1;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ nr_active += !bpf_test_sockopt(skops, sk, 0);
+ break;
+ case BPF_SOCK_OPS_TSTAMP_SENDMSG_CB:
+ if (bpf_test_delay(skops, sk))
+ nr_snd += 1;
+ break;
+ case BPF_SOCK_OPS_TSTAMP_SCHED_CB:
+ if (bpf_test_delay(skops, sk))
+ nr_sched += 1;
+ break;
+ case BPF_SOCK_OPS_TSTAMP_SND_SW_CB:
+ if (bpf_test_delay(skops, sk))
+ nr_txsw += 1;
+ break;
+ case BPF_SOCK_OPS_TSTAMP_ACK_CB:
+ if (bpf_test_delay(skops, sk))
+ nr_ack += 1;
+ break;
+ }
+
+ return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
index 6dd4318debbf..106fe430f41b 100644
--- a/tools/testing/selftests/bpf/progs/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -61,6 +61,7 @@ static const struct sockopt_test sol_tcp_tests[] = {
{ .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
{ .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS,
.expected = BPF_SOCK_OPS_ALL_CB_FLAGS, },
+ { .opt = TCP_RTO_MAX_MS, .new = 2000, .expected = 2000, },
{ .opt = 0, },
};
diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c
index 6f7b15d6c6ed..3d8de0d4c96a 100644
--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
@@ -13,6 +13,7 @@
* - UDP 9091 packets trigger TX reply
* - TX HW timestamp is requested and reported back upon completion
* - TX checksum is requested
+ * - TX launch time HW offload is requested for transmission
*/
#include <test_progs.h>
@@ -37,6 +38,15 @@
#include <time.h>
#include <unistd.h>
#include <libgen.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/ethtool.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
#include "xdp_metadata.h"
@@ -64,6 +74,18 @@ int rxq;
bool skip_tx;
__u64 last_hw_rx_timestamp;
__u64 last_xdp_rx_timestamp;
+__u64 last_launch_time;
+__u64 launch_time_delta_to_hw_rx_timestamp;
+int launch_time_queue;
+
+#define run_command(cmd, ...) \
+({ \
+ char command[1024]; \
+ memset(command, 0, sizeof(command)); \
+ snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \
+ fprintf(stderr, "Running: %s\n", command); \
+ system(command); \
+})
void test__fail(void) { /* for network_helpers.c */ }
@@ -298,6 +320,12 @@ static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
if (meta->completion.tx_timestamp) {
__u64 ref_tstamp = gettime(clock_id);
+ if (launch_time_delta_to_hw_rx_timestamp) {
+ print_tstamp_delta("HW Launch-time",
+ "HW TX-complete-time",
+ last_launch_time,
+ meta->completion.tx_timestamp);
+ }
print_tstamp_delta("HW TX-complete-time", "User TX-complete-time",
meta->completion.tx_timestamp, ref_tstamp);
print_tstamp_delta("XDP RX-time", "User TX-complete-time",
@@ -395,6 +423,17 @@ static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
xsk, ntohs(udph->check), ntohs(want_csum),
meta->request.csum_start, meta->request.csum_offset);
+ /* Set the value of launch time */
+ if (launch_time_delta_to_hw_rx_timestamp) {
+ meta->flags |= XDP_TXMD_FLAGS_LAUNCH_TIME;
+ meta->request.launch_time = last_hw_rx_timestamp +
+ launch_time_delta_to_hw_rx_timestamp;
+ last_launch_time = meta->request.launch_time;
+ print_tstamp_delta("HW RX-time", "HW Launch-time",
+ last_hw_rx_timestamp,
+ meta->request.launch_time);
+ }
+
memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
tx_desc->options |= XDP_TX_METADATA;
tx_desc->len = len;
@@ -407,6 +446,7 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t
const struct xdp_desc *rx_desc;
struct pollfd fds[rxq + 1];
__u64 comp_addr;
+ __u64 deadline;
__u64 addr;
__u32 idx = 0;
int ret;
@@ -477,9 +517,15 @@ peek:
if (ret)
printf("kick_tx ret=%d\n", ret);
- for (int j = 0; j < 500; j++) {
+ /* wait 1 second + cover launch time */
+ deadline = gettime(clock_id) +
+ NANOSEC_PER_SEC +
+ launch_time_delta_to_hw_rx_timestamp;
+ while (true) {
if (complete_tx(xsk, clock_id))
break;
+ if (gettime(clock_id) >= deadline)
+ break;
usleep(10);
}
}
@@ -608,6 +654,10 @@ static void print_usage(void)
" -h Display this help and exit\n\n"
" -m Enable multi-buffer XDP for larger MTU\n"
" -r Don't generate AF_XDP reply (rx metadata only)\n"
+ " -l Delta of launch time relative to HW RX-time in ns\n"
+ " default: 0 ns (launch time request is disabled)\n"
+ " -L Tx Queue to be enabled with launch time offload\n"
+ " default: 0 (Tx Queue 0)\n"
"Generate test packets on the other machine with:\n"
" echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
@@ -618,7 +668,7 @@ static void read_args(int argc, char *argv[])
{
int opt;
- while ((opt = getopt(argc, argv, "chmr")) != -1) {
+ while ((opt = getopt(argc, argv, "chmrl:L:")) != -1) {
switch (opt) {
case 'c':
bind_flags &= ~XDP_USE_NEED_WAKEUP;
@@ -634,6 +684,12 @@ static void read_args(int argc, char *argv[])
case 'r':
skip_tx = true;
break;
+ case 'l':
+ launch_time_delta_to_hw_rx_timestamp = atoll(optarg);
+ break;
+ case 'L':
+ launch_time_queue = atoll(optarg);
+ break;
case '?':
if (isprint(optopt))
fprintf(stderr, "Unknown option: -%c\n", optopt);
@@ -657,23 +713,118 @@ static void read_args(int argc, char *argv[])
error(-1, errno, "Invalid interface name");
}
+void clean_existing_configurations(void)
+{
+ /* Check and delete root qdisc if exists */
+ if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc mqprio 8001:'", ifname) == 0)
+ run_command("sudo tc qdisc del dev %s root", ifname);
+
+ /* Check and delete ingress qdisc if exists */
+ if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc ingress ffff:'", ifname) == 0)
+ run_command("sudo tc qdisc del dev %s ingress", ifname);
+
+ /* Check and delete ethtool filters if any exist */
+ if (run_command("sudo ethtool -n %s | grep -q 'Filter:'", ifname) == 0) {
+ run_command("sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 sudo ethtool -N %s delete >&2",
+ ifname, ifname);
+ }
+}
+
+#define MAX_TC 16
+
int main(int argc, char *argv[])
{
clockid_t clock_id = CLOCK_TAI;
+ struct bpf_program *prog;
int server_fd = -1;
+ size_t map_len = 0;
+ size_t que_len = 0;
+ char *buf = NULL;
+ char *map = NULL;
+ char *que = NULL;
+ char *tmp = NULL;
+ int tc = 0;
int ret;
int i;
- struct bpf_program *prog;
-
read_args(argc, argv);
rxq = rxq_num(ifname);
-
printf("rxq: %d\n", rxq);
+ if (launch_time_queue >= rxq || launch_time_queue < 0)
+ error(1, 0, "Invalid launch_time_queue.");
+
+ clean_existing_configurations();
+ sleep(1);
+
+ /* Enable tx and rx hardware timestamping */
hwtstamp_enable(ifname);
+ /* Prepare priority to traffic class map for tc-mqprio */
+ for (i = 0; i < MAX_TC; i++) {
+ if (i < rxq)
+ tc = i;
+
+ if (asprintf(&buf, "%d ", tc) == -1) {
+ printf("Failed to malloc buf for tc map.\n");
+ goto free_mem;
+ }
+
+ map_len += strlen(buf);
+ tmp = realloc(map, map_len + 1);
+ if (!tmp) {
+ printf("Failed to realloc tc map.\n");
+ goto free_mem;
+ }
+ map = tmp;
+ strcat(map, buf);
+ free(buf);
+ buf = NULL;
+ }
+
+ /* Prepare traffic class to hardware queue map for tc-mqprio */
+ for (i = 0; i <= tc; i++) {
+ if (asprintf(&buf, "1@%d ", i) == -1) {
+ printf("Failed to malloc buf for tc queues.\n");
+ goto free_mem;
+ }
+
+ que_len += strlen(buf);
+ tmp = realloc(que, que_len + 1);
+ if (!tmp) {
+ printf("Failed to realloc tc queues.\n");
+ goto free_mem;
+ }
+ que = tmp;
+ strcat(que, buf);
+ free(buf);
+ buf = NULL;
+ }
+
+ /* Add mqprio qdisc */
+ run_command("sudo tc qdisc add dev %s handle 8001: parent root mqprio num_tc %d map %squeues %shw 0",
+ ifname, tc + 1, map, que);
+
+ /* To test launch time, send UDP packet with VLAN priority 1 to port 9091 */
+ if (launch_time_delta_to_hw_rx_timestamp) {
+ /* Enable launch time hardware offload on launch_time_queue */
+ run_command("sudo tc qdisc replace dev %s parent 8001:%d etf offload clockid CLOCK_TAI delta 500000",
+ ifname, launch_time_queue + 1);
+ sleep(1);
+
+ /* Route incoming packet with VLAN priority 1 into launch_time_queue */
+ if (run_command("sudo ethtool -N %s flow-type ether vlan 0x2000 vlan-mask 0x1FFF action %d",
+ ifname, launch_time_queue)) {
+ run_command("sudo tc qdisc add dev %s ingress", ifname);
+ run_command("sudo tc filter add dev %s parent ffff: protocol 802.1Q flower vlan_prio 1 hw_tc %d",
+ ifname, launch_time_queue);
+ }
+
+ /* Enable VLAN tag stripping offload */
+ run_command("sudo ethtool -K %s rxvlan on", ifname);
+ }
+
rx_xsk = malloc(sizeof(struct xsk) * rxq);
if (!rx_xsk)
error(1, ENOMEM, "malloc");
@@ -733,4 +884,11 @@ int main(int argc, char *argv[])
cleanup();
if (ret)
error(1, -ret, "verify_metadata");
+
+ clean_existing_configurations();
+
+free_mem:
+ free(buf);
+ free(map);
+ free(que);
}