diff options
82 files changed, 4563 insertions, 1171 deletions
diff --git a/Documentation/devicetree/bindings/net/cirrus,cs89x0.txt b/Documentation/devicetree/bindings/net/cirrus,cs89x0.txt new file mode 100644 index 000000000000..c070076bacb9 --- /dev/null +++ b/Documentation/devicetree/bindings/net/cirrus,cs89x0.txt @@ -0,0 +1,13 @@ +* Cirrus Logic CS8900/CS8920 Network Controller + +Required properties: +- compatible : Should be "cirrus,cs8900" or "cirrus,cs8920". +- reg : Address and length of the IO space. +- interrupts : Should contain the controller interrupt line. + +Examples: + eth0: eth@10000000 { + compatible = "cirrus,cs8900"; + reg = <0x10000000 0x400>; + interrupts = <10>; + }; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 245c063ed4db..4523c8662ed2 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -127,7 +127,7 @@ static int lio_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) dev_err(&oct->pci_dev->dev, "Unknown link interface reported\n"); } - if (linfo->link.s.status) { + if (linfo->link.s.link_up) { ethtool_cmd_speed_set(ecmd, linfo->link.s.speed); ecmd->duplex = linfo->link.s.duplex; } else { @@ -222,23 +222,20 @@ static int octnet_gpio_access(struct net_device *netdev, int addr, int val) struct lio *lio = GET_LIO(netdev); struct octeon_device *oct = lio->oct_dev; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; int ret = 0; memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = OCTNET_CMD_GPIO_ACCESS; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = addr; - nctrl.ncmd.s.param3 = val; + nctrl.ncmd.s.param1 = addr; + nctrl.ncmd.s.param2 = val; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.wait_time = 100; nctrl.netpndev = (u64)netdev; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; - nparams.resp_order = OCTEON_RESP_ORDERED; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "Failed to configure gpio value\n"); return -EINVAL; @@ -303,9 +300,10 @@ octnet_mdio45_access(struct lio *lio, int op, int loc, int *value) mdio_cmd->mdio_addr = loc; if (op) mdio_cmd->value1 = *value; - mdio_cmd->value2 = lio->linfo.ifidx; octeon_swap_8B_data((u64 *)mdio_cmd, sizeof(struct oct_mdio_cmd) / 8); + sc->iq_no = lio->linfo.txpciq[0].s.q_no; + octeon_prepare_soft_command(oct_dev, sc, OPCODE_NIC, OPCODE_NIC_MDIO45, 0, 0, 0); @@ -317,7 +315,7 @@ octnet_mdio45_access(struct lio *lio, int op, int loc, int *value) retval = octeon_send_soft_command(oct_dev, sc); - if (retval) { + if (retval == IQ_SEND_FAILED) { dev_err(&oct_dev->pci_dev->dev, "octnet_mdio45_access instruction failed status: %x\n", retval); @@ -503,10 +501,10 @@ static void lio_set_msglevel(struct net_device *netdev, u32 msglvl) if ((msglvl ^ lio->msg_enable) & NETIF_MSG_HW) { if (msglvl & NETIF_MSG_HW) liquidio_set_feature(netdev, - OCTNET_CMD_VERBOSE_ENABLE); + OCTNET_CMD_VERBOSE_ENABLE, 0); else liquidio_set_feature(netdev, - OCTNET_CMD_VERBOSE_DISABLE); + OCTNET_CMD_VERBOSE_DISABLE, 0); } lio->msg_enable = msglvl; @@ -653,7 +651,7 @@ static int lio_get_intr_coalesce(struct net_device *netdev, intrmod_cfg->intrmod_mincnt_trigger; } - iq = oct->instr_queue[lio->linfo.txpciq[0]]; + iq = oct->instr_queue[lio->linfo.txpciq[0].s.q_no]; intr_coal->tx_max_coalesced_frames = iq->fill_threshold; break; @@ -722,7 +720,7 @@ static int octnet_set_intrmod_cfg(void *oct, struct oct_intrmod_cfg *intr_cfg) sc->wait_time = 1000; retval = octeon_send_soft_command(oct_dev, sc); - if (retval) { + if (retval == IQ_SEND_FAILED) { octeon_free_soft_command(oct_dev, sc); return -EINVAL; } @@ -859,7 +857,7 @@ static int lio_set_intr_coalesce(struct net_device *netdev, if ((intr_coal->tx_max_coalesced_frames >= CN6XXX_DB_MIN) && (intr_coal->tx_max_coalesced_frames <= CN6XXX_DB_MAX)) { for (j = 0; j < lio->linfo.num_txpciq; j++) { - q_no = lio->linfo.txpciq[j]; + q_no = lio->linfo.txpciq[j].s.q_no; oct->instr_queue[q_no]->fill_threshold = intr_coal->tx_max_coalesced_frames; } @@ -950,7 +948,6 @@ static int lio_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) struct octeon_device *oct = lio->oct_dev; struct oct_link_info *linfo; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; int ret = 0; /* get the link info */ @@ -978,9 +975,9 @@ static int lio_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = OCTNET_CMD_SET_SETTINGS; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.wait_time = 1000; nctrl.netpndev = (u64)netdev; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; /* Passing the parameters sent by ethtool like Speed, Autoneg & Duplex @@ -990,19 +987,17 @@ static int lio_set_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) /* Autoneg ON */ nctrl.ncmd.s.more = OCTNIC_NCMD_PHY_ON | OCTNIC_NCMD_AUTONEG_ON; - nctrl.ncmd.s.param2 = ecmd->advertising; + nctrl.ncmd.s.param1 = ecmd->advertising; } else { /* Autoneg OFF */ nctrl.ncmd.s.more = OCTNIC_NCMD_PHY_ON; - nctrl.ncmd.s.param3 = ecmd->duplex; + nctrl.ncmd.s.param2 = ecmd->duplex; - nctrl.ncmd.s.param2 = ecmd->speed; + nctrl.ncmd.s.param1 = ecmd->speed; } - nparams.resp_order = OCTEON_RESP_ORDERED; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "Failed to set settings\n"); return -1; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 655d89e8814f..d0ab97c15f4a 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -84,6 +84,8 @@ static int conf_type; module_param(conf_type, int, 0); MODULE_PARM_DESC(conf_type, "select octeon configuration 0 default 1 ovs"); +static int ptp_enable = 1; + /* Bit mask values for lio->ifstate */ #define LIO_IFSTATE_DROQ_OPS 0x01 #define LIO_IFSTATE_REGISTERED 0x02 @@ -166,6 +168,8 @@ struct octnic_gather { * received from the IP layer. */ struct octeon_sg_entry *sg; + + u64 sg_dma_ptr; }; /** This structure is used by NIC driver to store information required @@ -682,7 +686,8 @@ static inline void txqs_wake(struct net_device *netdev) int i; for (i = 0; i < netdev->num_tx_queues; i++) - netif_wake_subqueue(netdev, i); + if (__netif_subqueue_stopped(netdev, i)) + netif_wake_subqueue(netdev, i); } else { netif_wake_queue(netdev); } @@ -705,7 +710,7 @@ static void start_txq(struct net_device *netdev) { struct lio *lio = GET_LIO(netdev); - if (lio->linfo.link.s.status) { + if (lio->linfo.link.s.link_up) { txqs_start(netdev); return; } @@ -752,11 +757,14 @@ static inline int check_txq_status(struct lio *lio) /* check each sub-queue state */ for (q = 0; q < numqs; q++) { - iq = lio->linfo.txpciq[q & (lio->linfo.num_txpciq - 1)]; + iq = lio->linfo.txpciq[q % + (lio->linfo.num_txpciq)].s.q_no; if (octnet_iq_is_full(lio->oct_dev, iq)) continue; - wake_q(lio->netdev, q); - ret_val++; + if (__netif_subqueue_stopped(lio->netdev, q)) { + wake_q(lio->netdev, q); + ret_val++; + } } } else { if (octnet_iq_is_full(lio->oct_dev, lio->txq)) @@ -787,64 +795,116 @@ static inline struct list_head *list_delete_head(struct list_head *root) } /** - * \brief Delete gather list + * \brief Delete gather lists * @param lio per-network private data */ -static void delete_glist(struct lio *lio) +static void delete_glists(struct lio *lio) { struct octnic_gather *g; + int i; - do { - g = (struct octnic_gather *) - list_delete_head(&lio->glist); - if (g) { - if (g->sg) - kfree((void *)((unsigned long)g->sg - - g->adjust)); - kfree(g); - } - } while (g); + if (!lio->glist) + return; + + for (i = 0; i < lio->linfo.num_txpciq; i++) { + do { + g = (struct octnic_gather *) + list_delete_head(&lio->glist[i]); + if (g) { + if (g->sg) { + dma_unmap_single(&lio->oct_dev-> + pci_dev->dev, + g->sg_dma_ptr, + g->sg_size, + DMA_TO_DEVICE); + kfree((void *)((unsigned long)g->sg - + g->adjust)); + } + kfree(g); + } + } while (g); + } + + kfree((void *)lio->glist); } /** - * \brief Setup gather list + * \brief Setup gather lists * @param lio per-network private data */ -static int setup_glist(struct lio *lio) +static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs) { - int i; + int i, j; struct octnic_gather *g; - INIT_LIST_HEAD(&lio->glist); + lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock), + GFP_KERNEL); + if (!lio->glist_lock) + return 1; - for (i = 0; i < lio->tx_qsize; i++) { - g = kzalloc(sizeof(*g), GFP_KERNEL); - if (!g) - break; + lio->glist = kcalloc(num_iqs, sizeof(*lio->glist), + GFP_KERNEL); + if (!lio->glist) { + kfree((void *)lio->glist_lock); + return 1; + } - g->sg_size = - ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE); + for (i = 0; i < num_iqs; i++) { + int numa_node = cpu_to_node(i % num_online_cpus()); - g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); - if (!g->sg) { - kfree(g); - break; + spin_lock_init(&lio->glist_lock[i]); + + INIT_LIST_HEAD(&lio->glist[i]); + + for (j = 0; j < lio->tx_qsize; j++) { + g = kzalloc_node(sizeof(*g), GFP_KERNEL, + numa_node); + if (!g) + g = kzalloc(sizeof(*g), GFP_KERNEL); + if (!g) + break; + + g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * + OCT_SG_ENTRY_SIZE); + + g->sg = kmalloc_node(g->sg_size + 8, + GFP_KERNEL, numa_node); + if (!g->sg) + g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL); + if (!g->sg) { + kfree(g); + break; + } + + /* The gather component should be aligned on 64-bit + * boundary + */ + if (((unsigned long)g->sg) & 7) { + g->adjust = 8 - (((unsigned long)g->sg) & 7); + g->sg = (struct octeon_sg_entry *) + ((unsigned long)g->sg + g->adjust); + } + g->sg_dma_ptr = dma_map_single(&oct->pci_dev->dev, + g->sg, g->sg_size, + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, + g->sg_dma_ptr)) { + kfree((void *)((unsigned long)g->sg - + g->adjust)); + kfree(g); + break; + } + + list_add_tail(&g->list, &lio->glist[i]); } - /* The gather component should be aligned on 64-bit boundary */ - if (((unsigned long)g->sg) & 7) { - g->adjust = 8 - (((unsigned long)g->sg) & 7); - g->sg = (struct octeon_sg_entry *) - ((unsigned long)g->sg + g->adjust); + if (j != lio->tx_qsize) { + delete_glists(lio); + return 1; } - list_add_tail(&g->list, &lio->glist); } - if (i == lio->tx_qsize) - return 0; - - delete_glist(lio); - return 1; + return 0; } /** @@ -858,7 +918,7 @@ static void print_link_info(struct net_device *netdev) if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED) { struct oct_link_info *linfo = &lio->linfo; - if (linfo->link.s.status) { + if (linfo->link.s.link_up) { netif_info(lio, link, lio->netdev, "%d Mbps %s Duplex UP\n", linfo->link.s.speed, (linfo->link.s.duplex) ? "Full" : "Half"); @@ -880,13 +940,15 @@ static inline void update_link_status(struct net_device *netdev, union oct_link_status *ls) { struct lio *lio = GET_LIO(netdev); + int changed = (lio->linfo.link.u64 != ls->u64); - if ((lio->intf_open) && (lio->linfo.link.u64 != ls->u64)) { - lio->linfo.link.u64 = ls->u64; + lio->linfo.link.u64 = ls->u64; + if ((lio->intf_open) && (changed)) { print_link_info(netdev); + lio->link_changes++; - if (lio->linfo.link.s.status) { + if (lio->linfo.link.s.link_up) { netif_carrier_on(netdev); /* start_txq(netdev); */ txqs_wake(netdev); @@ -1159,18 +1221,15 @@ static void octeon_destroy_resources(struct octeon_device *oct) static void send_rx_ctrl_cmd(struct lio *lio, int start_stop) { struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); nctrl.ncmd.s.cmd = OCTNET_CMD_RX_CTL; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = start_stop; + nctrl.ncmd.s.param1 = start_stop; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.netpndev = (u64)lio->netdev; - nparams.resp_order = OCTEON_RESP_NORESPONSE; - - if (octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams) < 0) + if (octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl) < 0) netif_info(lio, rx_err, lio->netdev, "Failed to send RX Control message\n"); } @@ -1205,10 +1264,12 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx) if (atomic_read(&lio->ifstate) & LIO_IFSTATE_REGISTERED) unregister_netdev(netdev); - delete_glist(lio); + delete_glists(lio); free_netdev(netdev); + oct->props[ifidx].gmxport = -1; + oct->props[ifidx].netdev = NULL; } @@ -1230,7 +1291,8 @@ static int liquidio_stop_nic_module(struct octeon_device *oct) for (i = 0; i < oct->ifcount; i++) { lio = GET_LIO(oct->props[i].netdev); for (j = 0; j < lio->linfo.num_rxpciq; j++) - octeon_unregister_droq_ops(oct, lio->linfo.rxpciq[j]); + octeon_unregister_droq_ops(oct, + lio->linfo.rxpciq[j].s.q_no); } for (i = 0; i < oct->ifcount; i++) @@ -1326,6 +1388,16 @@ static int octeon_pci_os_setup(struct octeon_device *oct) return 0; } +static inline int skb_iq(struct lio *lio, struct sk_buff *skb) +{ + int q = 0; + + if (netif_is_multiqueue(lio->netdev)) + q = skb->queue_mapping % lio->linfo.num_txpciq; + + return q; +} + /** * \brief Check Tx queue state for a given network buffer * @param lio per-network private data @@ -1337,14 +1409,17 @@ static inline int check_txq_state(struct lio *lio, struct sk_buff *skb) if (netif_is_multiqueue(lio->netdev)) { q = skb->queue_mapping; - iq = lio->linfo.txpciq[(q & (lio->linfo.num_txpciq - 1))]; + iq = lio->linfo.txpciq[(q % (lio->linfo.num_txpciq))].s.q_no; } else { iq = lio->txq; + q = iq; } if (octnet_iq_is_full(lio->oct_dev, iq)) return 0; - wake_q(lio->netdev, q); + + if (__netif_subqueue_stopped(lio->netdev, q)) + wake_q(lio->netdev, q); return 1; } @@ -1367,7 +1442,7 @@ static void free_netbuf(void *buf) check_txq_state(lio, skb); - recv_buffer_free((struct sk_buff *)skb); + tx_buffer_free(skb); } /** @@ -1380,7 +1455,7 @@ static void free_netsgbuf(void *buf) struct sk_buff *skb; struct lio *lio; struct octnic_gather *g; - int i, frags; + int i, frags, iq; finfo = (struct octnet_buf_free_info *)buf; skb = finfo->skb; @@ -1402,17 +1477,17 @@ static void free_netsgbuf(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); + dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev, + g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE); - spin_lock(&lio->lock); - list_add_tail(&g->list, &lio->glist); - spin_unlock(&lio->lock); + iq = skb_iq(lio, skb); + spin_lock(&lio->glist_lock[iq]); + list_add_tail(&g->list, &lio->glist[iq]); + spin_unlock(&lio->glist_lock[iq]); check_txq_state(lio, skb); /* mq support: sub-queue state check */ - recv_buffer_free((struct sk_buff *)skb); + tx_buffer_free(skb); } /** @@ -1426,7 +1501,7 @@ static void free_netsgbuf_with_resp(void *buf) struct sk_buff *skb; struct lio *lio; struct octnic_gather *g; - int i, frags; + int i, frags, iq; sc = (struct octeon_soft_command *)buf; skb = (struct sk_buff *)sc->callback_arg; @@ -1450,13 +1525,14 @@ static void free_netsgbuf_with_resp(void *buf) i++; } - dma_unmap_single(&lio->oct_dev->pci_dev->dev, - finfo->dptr, g->sg_size, - DMA_TO_DEVICE); + dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev, + g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE); - spin_lock(&lio->lock); - list_add_tail(&g->list, &lio->glist); - spin_unlock(&lio->lock); + iq = skb_iq(lio, skb); + + spin_lock(&lio->glist_lock[iq]); + list_add_tail(&g->list, &lio->glist[iq]); + spin_unlock(&lio->glist_lock[iq]); /* Don't free the skb yet */ @@ -1743,14 +1819,13 @@ static void if_cfg_callback(struct octeon_device *oct, static u16 select_q(struct net_device *dev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback) { - int qindex; + u32 qindex = 0; struct lio *lio; lio = GET_LIO(dev); - /* select queue on chosen queue_mapping or core */ - qindex = skb_rx_queue_recorded(skb) ? - skb_get_rx_queue(skb) : smp_processor_id(); - return (u16)(qindex & (lio->linfo.num_txpciq - 1)); + qindex = skb_tx_hash(dev, skb); + + return (u16)(qindex % (lio->linfo.num_txpciq)); } /** Routine to push packets arriving on Octeon interface upto network layer. @@ -1759,26 +1834,27 @@ static u16 select_q(struct net_device *dev, struct sk_buff *skb, * @param len - size of total data received. * @param rh - Control header associated with the packet * @param param - additional control data with the packet + * @param arg - farg registered in droq_ops */ static void liquidio_push_packet(u32 octeon_id, void *skbuff, u32 len, union octeon_rh *rh, - void *param) + void *param, + void *arg) { struct napi_struct *napi = param; - struct octeon_device *oct = lio_get_device(octeon_id); struct sk_buff *skb = (struct sk_buff *)skbuff; struct skb_shared_hwtstamps *shhwtstamps; u64 ns; - struct net_device *netdev = - (struct net_device *)oct->props[rh->r_dh.link].netdev; + struct net_device *netdev = (struct net_device *)arg; struct octeon_droq *droq = container_of(param, struct octeon_droq, napi); if (netdev) { int packet_was_received; struct lio *lio = GET_LIO(netdev); + struct octeon_device *oct = lio->oct_dev; /* Do not proceed if the interface is not in RUNNING state. */ if (!ifstate_check(lio, LIO_IFSTATE_RUNNING)) { @@ -1789,21 +1865,54 @@ liquidio_push_packet(u32 octeon_id, skb->dev = netdev; - if (rh->r_dh.has_hwtstamp) { - /* timestamp is included from the hardware at the - * beginning of the packet. - */ - if (ifstate_check(lio, - LIO_IFSTATE_RX_TIMESTAMP_ENABLED)) { - /* Nanoseconds are in the first 64-bits - * of the packet. + skb_record_rx_queue(skb, droq->q_no); + if (likely(len > MIN_SKB_SIZE)) { + struct octeon_skb_page_info *pg_info; + unsigned char *va; + + pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + if (pg_info->page) { + /* For Paged allocation use the frags */ + va = page_address(pg_info->page) + + pg_info->page_offset; + memcpy(skb->data, va, MIN_SKB_SIZE); + skb_put(skb, MIN_SKB_SIZE); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + pg_info->page, + pg_info->page_offset + + MIN_SKB_SIZE, + len - MIN_SKB_SIZE, + LIO_RXBUFFER_SZ); + } + } else { + struct octeon_skb_page_info *pg_info = + ((struct octeon_skb_page_info *)(skb->cb)); + skb_copy_to_linear_data(skb, page_address(pg_info->page) + + pg_info->page_offset, len); + skb_put(skb, len); + put_page(pg_info->page); + } + + if (((oct->chip_id == OCTEON_CN66XX) || + (oct->chip_id == OCTEON_CN68XX)) && + ptp_enable) { + if (rh->r_dh.has_hwtstamp) { + /* timestamp is included from the hardware at + * the beginning of the packet. */ - memcpy(&ns, (skb->data), sizeof(ns)); - shhwtstamps = skb_hwtstamps(skb); - shhwtstamps->hwtstamp = - ns_to_ktime(ns + lio->ptp_adjust); + if (ifstate_check + (lio, LIO_IFSTATE_RX_TIMESTAMP_ENABLED)) { + /* Nanoseconds are in the first 64-bits + * of the packet. + */ + memcpy(&ns, (skb->data), sizeof(ns)); + shhwtstamps = skb_hwtstamps(skb); + shhwtstamps->hwtstamp = + ns_to_ktime(ns + + lio->ptp_adjust); + } + skb_pull(skb, sizeof(ns)); } - skb_pull(skb, sizeof(ns)); } skb->protocol = eth_type_trans(skb, skb->dev); @@ -1935,10 +2044,10 @@ static int liquidio_napi_poll(struct napi_struct *napi, int budget) * are for ingress packets. */ static inline int setup_io_queues(struct octeon_device *octeon_dev, - struct net_device *net_device) + int ifidx) { - static int first_time = 1; - static struct octeon_droq_ops droq_ops; + struct octeon_droq_ops droq_ops; + struct net_device *netdev; static int cpu_id; static int cpu_id_modulus; struct octeon_droq *droq; @@ -1947,23 +2056,26 @@ static inline int setup_io_queues(struct octeon_device *octeon_dev, struct lio *lio; int num_tx_descs; - lio = GET_LIO(net_device); - if (first_time) { - first_time = 0; - memset(&droq_ops, 0, sizeof(struct octeon_droq_ops)); + netdev = octeon_dev->props[ifidx].netdev; - droq_ops.fptr = liquidio_push_packet; + lio = GET_LIO(netdev); - droq_ops.poll_mode = 1; - droq_ops.napi_fn = liquidio_napi_drv_callback; - cpu_id = 0; - cpu_id_modulus = num_present_cpus(); - } + memset(&droq_ops, 0, sizeof(struct octeon_droq_ops)); + + droq_ops.fptr = liquidio_push_packet; + droq_ops.farg = (void *)netdev; + + droq_ops.poll_mode = 1; + droq_ops.napi_fn = liquidio_napi_drv_callback; + cpu_id = 0; + cpu_id_modulus = num_present_cpus(); /* set up DROQs. */ for (q = 0; q < lio->linfo.num_rxpciq; q++) { - q_no = lio->linfo.rxpciq[q]; - + q_no = lio->linfo.rxpciq[q].s.q_no; + dev_dbg(&octeon_dev->pci_dev->dev, + "setup_io_queues index:%d linfo.rxpciq.s.q_no:%d\n", + q, q_no); retval = octeon_setup_droq(octeon_dev, q_no, CFG_GET_NUM_RX_DESCS_NIC_IF (octeon_get_conf(octeon_dev), @@ -1980,7 +2092,11 @@ static inline int setup_io_queues(struct octeon_device *octeon_dev, droq = octeon_dev->droq[q_no]; napi = &droq->napi; - netif_napi_add(net_device, napi, liquidio_napi_poll, 64); + dev_dbg(&octeon_dev->pci_dev->dev, + "netif_napi_add netdev:%llx oct:%llx\n", + (u64)netdev, + (u64)octeon_dev); + netif_napi_add(netdev, napi, liquidio_napi_poll, 64); /* designate a CPU for this droq */ droq->cpu_id = cpu_id; @@ -1996,9 +2112,9 @@ static inline int setup_io_queues(struct octeon_device *octeon_dev, num_tx_descs = CFG_GET_NUM_TX_DESCS_NIC_IF(octeon_get_conf (octeon_dev), lio->ifidx); - retval = octeon_setup_iq(octeon_dev, lio->linfo.txpciq[q], - num_tx_descs, - netdev_get_tx_queue(net_device, q)); + retval = octeon_setup_iq(octeon_dev, ifidx, q, + lio->linfo.txpciq[q], num_tx_descs, + netdev_get_tx_queue(netdev, q)); if (retval) { dev_err(&octeon_dev->pci_dev->dev, " %s : Runtime IQ(TxQ) creation failed.\n", @@ -2096,7 +2212,8 @@ static int liquidio_stop(struct net_device *netdev) netif_info(lio, ifdown, lio->netdev, "Stopping interface!\n"); /* Inform that netif carrier is down */ lio->intf_open = 0; - lio->linfo.link.s.status = 0; + lio->linfo.link.s.link_up = 0; + lio->link_changes++; netif_carrier_off(netdev); @@ -2235,7 +2352,6 @@ static void liquidio_set_mcast_list(struct net_device *netdev) struct lio *lio = GET_LIO(netdev); struct octeon_device *oct = lio->oct_dev; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; struct netdev_hw_addr *ha; u64 *mc; int ret, i; @@ -2246,10 +2362,10 @@ static void liquidio_set_mcast_list(struct net_device *netdev) /* Create a ctrl pkt command to be sent to core app. */ nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = OCTNET_CMD_SET_MULTI_LIST; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = get_new_flags(netdev); - nctrl.ncmd.s.param3 = mc_count; + nctrl.ncmd.s.param1 = get_new_flags(netdev); + nctrl.ncmd.s.param2 = mc_count; nctrl.ncmd.s.more = mc_count; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.netpndev = (u64)netdev; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; @@ -2270,9 +2386,7 @@ static void liquidio_set_mcast_list(struct net_device *netdev) */ nctrl.wait_time = 0; - nparams.resp_order = OCTEON_RESP_NORESPONSE; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "DEVFLAGS change failed in core (ret: 0x%x)\n", ret); @@ -2290,19 +2404,17 @@ static int liquidio_set_mac(struct net_device *netdev, void *p) struct octeon_device *oct = lio->oct_dev; struct sockaddr *addr = (struct sockaddr *)p; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; - if ((!is_valid_ether_addr(addr->sa_data)) || - (ifstate_check(lio, LIO_IFSTATE_RUNNING))) + if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MACADDR; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = 0; + nctrl.ncmd.s.param1 = 0; nctrl.ncmd.s.more = 1; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.netpndev = (u64)netdev; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; nctrl.wait_time = 100; @@ -2311,9 +2423,7 @@ static int liquidio_set_mac(struct net_device *netdev, void *p) /* The MAC Address is presented in network byte order. */ memcpy((u8 *)&nctrl.udd[0] + 2, addr->sa_data, ETH_ALEN); - nparams.resp_order = OCTEON_RESP_ORDERED; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "MAC Address change failed\n"); return -ENOMEM; @@ -2341,7 +2451,7 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev) oct = lio->oct_dev; for (i = 0; i < lio->linfo.num_txpciq; i++) { - iq_no = lio->linfo.txpciq[i]; + iq_no = lio->linfo.txpciq[i].s.q_no; iq_stats = &oct->instr_queue[iq_no]->stats; pkts += iq_stats->tx_done; drop += iq_stats->tx_dropped; @@ -2357,7 +2467,7 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev) bytes = 0; for (i = 0; i < lio->linfo.num_rxpciq; i++) { - oq_no = lio->linfo.rxpciq[i]; + oq_no = lio->linfo.rxpciq[i].s.q_no; oq_stats = &oct->droq[oq_no]->stats; pkts += oq_stats->rx_pkts_received; drop += (oq_stats->rx_dropped + @@ -2383,7 +2493,6 @@ static int liquidio_change_mtu(struct net_device *netdev, int new_mtu) struct lio *lio = GET_LIO(netdev); struct octeon_device *oct = lio->oct_dev; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; int max_frm_size = new_mtu + OCTNET_FRM_HEADER_SIZE; int ret = 0; @@ -2403,15 +2512,13 @@ static int liquidio_change_mtu(struct net_device *netdev, int new_mtu) nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MTU; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = new_mtu; + nctrl.ncmd.s.param1 = new_mtu; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.wait_time = 100; nctrl.netpndev = (u64)netdev; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; - nparams.resp_order = OCTEON_RESP_ORDERED; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "Failed to set MTU\n"); return -1; @@ -2536,7 +2643,7 @@ static void handle_timestamp(struct octeon_device *oct, } octeon_free_soft_command(oct, sc); - recv_buffer_free(skb); + tx_buffer_free(skb); } /* \brief Send a data packet that will be timestamped @@ -2551,10 +2658,9 @@ static inline int send_nic_timestamp_pkt(struct octeon_device *oct, { int retval; struct octeon_soft_command *sc; - struct octeon_instr_ih *ih; - struct octeon_instr_rdp *rdp; struct lio *lio; int ring_doorbell; + u32 len; lio = finfo->lio; @@ -2576,14 +2682,13 @@ static inline int send_nic_timestamp_pkt(struct octeon_device *oct, sc->callback_arg = finfo->skb; sc->iq_no = ndata->q_no; - ih = (struct octeon_instr_ih *)&sc->cmd.ih; - rdp = (struct octeon_instr_rdp *)&sc->cmd.rdp; + len = (u32)((struct octeon_instr_ih2 *)(&sc->cmd.cmd2.ih2))->dlengsz; ring_doorbell = !xmit_more; retval = octeon_send_command(oct, sc->iq_no, ring_doorbell, &sc->cmd, - sc, ih->dlengsz, ndata->reqtype); + sc, len, ndata->reqtype); - if (retval) { + if (retval == IQ_SEND_FAILED) { dev_err(&oct->pci_dev->dev, "timestamp data packet failed status: %x\n", retval); octeon_free_soft_command(oct, sc); @@ -2594,68 +2699,6 @@ static inline int send_nic_timestamp_pkt(struct octeon_device *oct, return retval; } -static inline int is_ipv4(struct sk_buff *skb) -{ - return (skb->protocol == htons(ETH_P_IP)) && - (ip_hdr(skb)->version == 4); -} - -static inline int is_vlan(struct sk_buff *skb) -{ - return skb->protocol == htons(ETH_P_8021Q); -} - -static inline int is_ip_fragmented(struct sk_buff *skb) -{ - /* The Don't fragment and Reserved flag fields are ignored. - * IP is fragmented if - * - the More fragments bit is set (indicating this IP is a fragment - * with more to follow; the current offset could be 0 ). - * - ths offset field is non-zero. - */ - return (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) ? 1 : 0; -} - -static inline int is_ipv6(struct sk_buff *skb) -{ - return (skb->protocol == htons(ETH_P_IPV6)) && - (ipv6_hdr(skb)->version == 6); -} - -static inline int is_with_extn_hdr(struct sk_buff *skb) -{ - return (ipv6_hdr(skb)->nexthdr != IPPROTO_TCP) && - (ipv6_hdr(skb)->nexthdr != IPPROTO_UDP); -} - -static inline int is_tcpudp(struct sk_buff *skb) -{ - return (ip_hdr(skb)->protocol == IPPROTO_TCP) || - (ip_hdr(skb)->protocol == IPPROTO_UDP); -} - -static inline u32 get_ipv4_5tuple_tag(struct sk_buff *skb) -{ - u32 tag; - struct iphdr *iphdr = ip_hdr(skb); - - tag = crc32(0, &iphdr->protocol, 1); - tag = crc32(tag, (u8 *)&iphdr->saddr, 8); - tag = crc32(tag, skb_transport_header(skb), 4); - return tag; -} - -static inline u32 get_ipv6_5tuple_tag(struct sk_buff *skb) -{ - u32 tag; - struct ipv6hdr *ipv6hdr = ipv6_hdr(skb); - - tag = crc32(0, &ipv6hdr->nexthdr, 1); - tag = crc32(tag, (u8 *)&ipv6hdr->saddr, 32); - tag = crc32(tag, skb_transport_header(skb), 4); - return tag; -} - /** \brief Transmit networks packets to the Octeon interface * @param skbuff skbuff struct to be passed to network layer. * @param netdev pointer to network device @@ -2670,18 +2713,22 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) struct octnic_data_pkt ndata; struct octeon_device *oct; struct oct_iq_stats *stats; - int cpu = 0, status = 0; + struct octeon_instr_irh *irh; + union tx_info *tx_info; + int status = 0; int q_idx = 0, iq_no = 0; - int xmit_more; + int xmit_more, j; + u64 dptr = 0; u32 tag = 0; lio = GET_LIO(netdev); oct = lio->oct_dev; if (netif_is_multiqueue(netdev)) { - cpu = skb->queue_mapping; - q_idx = (cpu & (lio->linfo.num_txpciq - 1)); - iq_no = lio->linfo.txpciq[q_idx]; + q_idx = skb->queue_mapping; + q_idx = (q_idx % (lio->linfo.num_txpciq)); + tag = q_idx; + iq_no = lio->linfo.txpciq[q_idx].s.q_no; } else { iq_no = lio->txq; } @@ -2692,11 +2739,11 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) * transmitted. */ if (!(atomic_read(&lio->ifstate) & LIO_IFSTATE_RUNNING) || - (!lio->linfo.link.s.status) || + (!lio->linfo.link.s.link_up) || (skb->len <= 0)) { netif_info(lio, tx_err, lio->netdev, "Transmit failed link_status : %d\n", - lio->linfo.link.s.status); + lio->linfo.link.s.link_up); goto lio_xmit_failed; } @@ -2739,53 +2786,11 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) ndata.datasize = skb->len; cmdsetup.u64 = 0; - cmdsetup.s.ifidx = lio->linfo.ifidx; - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (is_ipv4(skb) && !is_ip_fragmented(skb) && is_tcpudp(skb)) { - tag = get_ipv4_5tuple_tag(skb); - - cmdsetup.s.cksum_offset = sizeof(struct ethhdr) + 1; - - if (ip_hdr(skb)->ihl > 5) - cmdsetup.s.ipv4opts_ipv6exthdr = - OCT_PKT_PARAM_IPV4OPTS; - - } else if (is_ipv6(skb)) { - tag = get_ipv6_5tuple_tag(skb); - - cmdsetup.s.cksum_offset = sizeof(struct ethhdr) + 1; - - if (is_with_extn_hdr(skb)) - cmdsetup.s.ipv4opts_ipv6exthdr = - OCT_PKT_PARAM_IPV6EXTHDR; + cmdsetup.s.iq_no = iq_no; - } else if (is_vlan(skb)) { - if (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto - == htons(ETH_P_IP) && - !is_ip_fragmented(skb) && is_tcpudp(skb)) { - tag = get_ipv4_5tuple_tag(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + cmdsetup.s.transport_csum = 1; - cmdsetup.s.cksum_offset = - sizeof(struct vlan_ethhdr) + 1; - - if (ip_hdr(skb)->ihl > 5) - cmdsetup.s.ipv4opts_ipv6exthdr = - OCT_PKT_PARAM_IPV4OPTS; - - } else if (vlan_eth_hdr(skb)->h_vlan_encapsulated_proto - == htons(ETH_P_IPV6)) { - tag = get_ipv6_5tuple_tag(skb); - - cmdsetup.s.cksum_offset = - sizeof(struct vlan_ethhdr) + 1; - - if (is_with_extn_hdr(skb)) - cmdsetup.s.ipv4opts_ipv6exthdr = - OCT_PKT_PARAM_IPV6EXTHDR; - } - } - } if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; cmdsetup.s.timestamp = 1; @@ -2793,20 +2798,20 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) if (skb_shinfo(skb)->nr_frags == 0) { cmdsetup.s.u.datasize = skb->len; - octnet_prepare_pci_cmd(&ndata.cmd, &cmdsetup, tag); + octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag); /* Offload checksum calculation for TCP/UDP packets */ - ndata.cmd.dptr = dma_map_single(&oct->pci_dev->dev, - skb->data, - skb->len, - DMA_TO_DEVICE); - if (dma_mapping_error(&oct->pci_dev->dev, ndata.cmd.dptr)) { + dptr = dma_map_single(&oct->pci_dev->dev, + skb->data, + skb->len, + DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, dptr)) { dev_err(&oct->pci_dev->dev, "%s DMA mapping error 1\n", __func__); return NETDEV_TX_BUSY; } - finfo->dptr = ndata.cmd.dptr; - + ndata.cmd.cmd2.dptr = dptr; + finfo->dptr = dptr; ndata.reqtype = REQTYPE_NORESP_NET; } else { @@ -2814,9 +2819,10 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) struct skb_frag_struct *frag; struct octnic_gather *g; - spin_lock(&lio->lock); - g = (struct octnic_gather *)list_delete_head(&lio->glist); - spin_unlock(&lio->lock); + spin_lock(&lio->glist_lock[q_idx]); + g = (struct octnic_gather *) + list_delete_head(&lio->glist[q_idx]); + spin_unlock(&lio->glist_lock[q_idx]); if (!g) { netif_info(lio, tx_err, lio->netdev, @@ -2826,7 +2832,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) cmdsetup.s.gather = 1; cmdsetup.s.u.gatherptrs = (skb_shinfo(skb)->nr_frags + 1); - octnet_prepare_pci_cmd(&ndata.cmd, &cmdsetup, tag); + octnet_prepare_pci_cmd(oct, &ndata.cmd, &cmdsetup, tag); memset(g->sg, 0, g->sg_size); @@ -2853,34 +2859,43 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev) frag->size, DMA_TO_DEVICE); + if (dma_mapping_error(&oct->pci_dev->dev, + g->sg[i >> 2].ptr[i & 3])) { + dma_unmap_single(&oct->pci_dev->dev, + g->sg[0].ptr[0], + skb->len - skb->data_len, + DMA_TO_DEVICE); + for (j = 1; j < i; j++) { + frag = &skb_shinfo(skb)->frags[j - 1]; + dma_unmap_page(&oct->pci_dev->dev, + g->sg[j >> 2].ptr[j & 3], + frag->size, + DMA_TO_DEVICE); + } + dev_err(&oct->pci_dev->dev, "%s DMA mapping error 3\n", + __func__); + return NETDEV_TX_BUSY; + } + add_sg_size(&g->sg[(i >> 2)], frag->size, (i & 3)); i++; } - ndata.cmd.dptr = dma_map_single(&oct->pci_dev->dev, - g->sg, g->sg_size, - DMA_TO_DEVICE); - if (dma_mapping_error(&oct->pci_dev->dev, ndata.cmd.dptr)) { - dev_err(&oct->pci_dev->dev, "%s DMA mapping error 3\n", - __func__); - dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0], - skb->len - skb->data_len, - DMA_TO_DEVICE); - return NETDEV_TX_BUSY; - } + dma_sync_single_for_device(&oct->pci_dev->dev, g->sg_dma_ptr, + g->sg_size, DMA_TO_DEVICE); + dptr = g->sg_dma_ptr; - finfo->dptr = ndata.cmd.dptr; + ndata.cmd.cmd2.dptr = dptr; + finfo->dptr = dptr; finfo->g = g; ndata.reqtype = REQTYPE_NORESP_NET_SG; } - if (skb_shinfo(skb)->gso_size) { - struct octeon_instr_irh *irh = - (struct octeon_instr_irh *)&ndata.cmd.irh; - union tx_info *tx_info = (union tx_info *)&ndata.cmd.ossp[0]; + irh = (struct octeon_instr_irh *)&ndata.cmd.cmd2.irh; + tx_info = (union tx_info *)&ndata.cmd.cmd2.ossp[0]; - irh->len = 1; /* to indicate that ossp[0] contains tx_info */ + if (skb_shinfo(skb)->gso_size) { tx_info->s.gso_size = skb_shinfo(skb)->gso_size; tx_info->s.gso_segs = skb_shinfo(skb)->gso_segs; } @@ -2910,9 +2925,10 @@ lio_xmit_failed: stats->tx_dropped++; netif_info(lio, tx_err, lio->netdev, "IQ%d Transmit dropped:%llu\n", iq_no, stats->tx_dropped); - dma_unmap_single(&oct->pci_dev->dev, ndata.cmd.dptr, - ndata.datasize, DMA_TO_DEVICE); - recv_buffer_free(skb); + if (dptr) + dma_unmap_single(&oct->pci_dev->dev, dptr, + ndata.datasize, DMA_TO_DEVICE); + tx_buffer_free(skb); return NETDEV_TX_OK; } @@ -2932,27 +2948,24 @@ static void liquidio_tx_timeout(struct net_device *netdev) txqs_wake(netdev); } -int liquidio_set_feature(struct net_device *netdev, int cmd) +int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1) { struct lio *lio = GET_LIO(netdev); struct octeon_device *oct = lio->oct_dev; struct octnic_ctrl_pkt nctrl; - struct octnic_ctrl_params nparams; int ret = 0; memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt)); nctrl.ncmd.u64 = 0; nctrl.ncmd.s.cmd = cmd; - nctrl.ncmd.s.param1 = lio->linfo.ifidx; - nctrl.ncmd.s.param2 = OCTNIC_LROIPV4 | OCTNIC_LROIPV6; + nctrl.ncmd.s.param1 = param1; + nctrl.iq_no = lio->linfo.txpciq[0].s.q_no; nctrl.wait_time = 100; nctrl.netpndev = (u64)netdev; nctrl.cb_fn = liquidio_link_ctrl_cmd_completion; - nparams.resp_order = OCTEON_RESP_NORESPONSE; - - ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl, nparams); + ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl); if (ret < 0) { dev_err(&oct->pci_dev->dev, "Feature change failed in core (ret: 0x%x)\n", ret); @@ -3008,10 +3021,12 @@ static int liquidio_set_features(struct net_device *netdev, return 0; if ((features & NETIF_F_LRO) && (lio->dev_capability & NETIF_F_LRO)) - liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE); + liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); else if (!(features & NETIF_F_LRO) && (lio->dev_capability & NETIF_F_LRO)) - liquidio_set_feature(netdev, OCTNET_CMD_LRO_DISABLE); + liquidio_set_feature(netdev, OCTNET_CMD_LRO_DISABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); return 0; } @@ -3082,24 +3097,27 @@ static int lio_nic_info(struct octeon_recv_info *recv_info, void *buf) { struct octeon_device *oct = (struct octeon_device *)buf; struct octeon_recv_pkt *recv_pkt = recv_info->recv_pkt; - int ifidx = 0; + int gmxport = 0; union oct_link_status *ls; int i; - if ((recv_pkt->buffer_size[0] != sizeof(*ls)) || - (recv_pkt->rh.r_nic_info.ifidx > oct->ifcount)) { + if (recv_pkt->buffer_size[0] != sizeof(*ls)) { dev_err(&oct->pci_dev->dev, "Malformed NIC_INFO, len=%d, ifidx=%d\n", recv_pkt->buffer_size[0], - recv_pkt->rh.r_nic_info.ifidx); + recv_pkt->rh.r_nic_info.gmxport); goto nic_info_err; } - ifidx = recv_pkt->rh.r_nic_info.ifidx; + gmxport = recv_pkt->rh.r_nic_info.gmxport; ls = (union oct_link_status *)get_rbd(recv_pkt->buffer_ptr[0]); octeon_swap_8B_data((u64 *)ls, (sizeof(union oct_link_status)) >> 3); - - update_link_status(oct->props[ifidx].netdev, ls); + for (i = 0; i < oct->ifcount; i++) { + if (oct->props[i].gmxport == gmxport) { + update_link_status(oct->props[i].netdev, ls); + break; + } + } nic_info_err: for (i = 0; i < recv_pkt->buffer_count; i++) @@ -3125,13 +3143,13 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) struct liquidio_if_cfg_context *ctx; struct liquidio_if_cfg_resp *resp; struct octdev_props *props; - int retval, num_iqueues, num_oqueues, q_no; - u64 q_mask; + int retval, num_iqueues, num_oqueues; int num_cpus = num_online_cpus(); union oct_nic_if_cfg if_cfg; unsigned int base_queue; unsigned int gmx_port_id; u32 resp_size, ctx_size; + u32 ifidx_or_pfnum; /* This is to handle link status changes */ octeon_register_dispatch_fn(octeon_dev, OPCODE_NIC, @@ -3167,13 +3185,14 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) CFG_GET_BASE_QUE_NIC_IF(octeon_get_conf(octeon_dev), i); gmx_port_id = CFG_GET_GMXID_NIC_IF(octeon_get_conf(octeon_dev), i); + ifidx_or_pfnum = i; if (num_iqueues > num_cpus) num_iqueues = num_cpus; if (num_oqueues > num_cpus) num_oqueues = num_cpus; dev_dbg(&octeon_dev->pci_dev->dev, "requesting config for interface %d, iqs %d, oqs %d\n", - i, num_iqueues, num_oqueues); + ifidx_or_pfnum, num_iqueues, num_oqueues); ACCESS_ONCE(ctx->cond) = 0; ctx->octeon_id = lio_get_device_id(octeon_dev); init_waitqueue_head(&ctx->wc); @@ -3183,8 +3202,11 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) if_cfg.s.num_oqueues = num_oqueues; if_cfg.s.base_queue = base_queue; if_cfg.s.gmx_port_id = gmx_port_id; + + sc->iq_no = 0; + octeon_prepare_soft_command(octeon_dev, sc, OPCODE_NIC, - OPCODE_NIC_IF_CFG, i, + OPCODE_NIC_IF_CFG, 0, if_cfg.u64, 0); sc->callback = if_cfg_callback; @@ -3192,7 +3214,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) sc->wait_time = 1000; retval = octeon_send_soft_command(octeon_dev, sc); - if (retval) { + if (retval == IQ_SEND_FAILED) { dev_err(&octeon_dev->pci_dev->dev, "iq/oq config failed status: %x\n", retval); @@ -3234,8 +3256,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) goto setup_nic_dev_fail; } - props = &octeon_dev->props[i]; - props->netdev = netdev; + SET_NETDEV_DEV(netdev, &octeon_dev->pci_dev->dev); if (num_iqueues > 1) lionetdevops.ndo_select_queue = select_q; @@ -3249,23 +3270,21 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) memset(lio, 0, sizeof(struct lio)); - lio->linfo.ifidx = resp->cfg_info.ifidx; - lio->ifidx = resp->cfg_info.ifidx; + lio->ifidx = ifidx_or_pfnum; + + props = &octeon_dev->props[i]; + props->gmxport = resp->cfg_info.linfo.gmxport; + props->netdev = netdev; lio->linfo.num_rxpciq = num_oqueues; lio->linfo.num_txpciq = num_iqueues; - q_mask = resp->cfg_info.oqmask; - /* q_mask is 0-based and already verified mask is nonzero */ for (j = 0; j < num_oqueues; j++) { - q_no = __ffs64(q_mask); - q_mask &= (~(1UL << q_no)); - lio->linfo.rxpciq[j] = q_no; + lio->linfo.rxpciq[j].u64 = + resp->cfg_info.linfo.rxpciq[j].u64; } - q_mask = resp->cfg_info.iqmask; for (j = 0; j < num_iqueues; j++) { - q_no = __ffs64(q_mask); - q_mask &= (~(1UL << q_no)); - lio->linfo.txpciq[j] = q_no; + lio->linfo.txpciq[j].u64 = + resp->cfg_info.linfo.txpciq[j].u64; } lio->linfo.hw_addr = resp->cfg_info.linfo.hw_addr; lio->linfo.gmxport = resp->cfg_info.linfo.gmxport; @@ -3274,13 +3293,15 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) lio->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); lio->dev_capability = NETIF_F_HIGHDMA - | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM - | NETIF_F_SG | NETIF_F_RXCSUM - | NETIF_F_TSO | NETIF_F_TSO6 - | NETIF_F_LRO; + | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM + | NETIF_F_SG | NETIF_F_RXCSUM + | NETIF_F_GRO + | NETIF_F_TSO | NETIF_F_TSO6 + | NETIF_F_LRO; netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); - netdev->features = lio->dev_capability; + netdev->features = (lio->dev_capability & ~NETIF_F_LRO); + netdev->vlan_features = lio->dev_capability; netdev->hw_features = lio->dev_capability; @@ -3291,7 +3312,6 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) lio->oct_dev = octeon_dev; lio->octprops = props; lio->netdev = netdev; - spin_lock_init(&lio->lock); dev_dbg(&octeon_dev->pci_dev->dev, "if%d gmx: %d hw_addr: 0x%llx\n", i, @@ -3306,23 +3326,22 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) ether_addr_copy(netdev->dev_addr, mac); - if (setup_io_queues(octeon_dev, netdev)) { + /* By default all interfaces on a single Octeon uses the same + * tx and rx queues + */ + lio->txq = lio->linfo.txpciq[0].s.q_no; + lio->rxq = lio->linfo.rxpciq[0].s.q_no; + if (setup_io_queues(octeon_dev, i)) { dev_err(&octeon_dev->pci_dev->dev, "I/O queues creation failed\n"); goto setup_nic_dev_fail; } ifstate_set(lio, LIO_IFSTATE_DROQ_OPS); - /* By default all interfaces on a single Octeon uses the same - * tx and rx queues - */ - lio->txq = lio->linfo.txpciq[0]; - lio->rxq = lio->linfo.rxpciq[0]; - lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq); lio->rx_qsize = octeon_get_rx_qsize(octeon_dev, lio->rxq); - if (setup_glist(lio)) { + if (setup_glists(octeon_dev, lio, num_iqueues)) { dev_err(&octeon_dev->pci_dev->dev, "Gather list allocation failed\n"); goto setup_nic_dev_fail; @@ -3331,10 +3350,13 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) /* Register ethtool support */ liquidio_set_ethtool_ops(netdev); - liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE); + if (netdev->features & NETIF_F_LRO) + liquidio_set_feature(netdev, OCTNET_CMD_LRO_ENABLE, + OCTNIC_LROIPV4 | OCTNIC_LROIPV6); if ((debug != -1) && (debug & NETIF_MSG_HW)) - liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE); + liquidio_set_feature(netdev, OCTNET_CMD_VERBOSE_ENABLE, + 0); /* Register the network device with the OS */ if (register_netdev(netdev)) { @@ -3346,13 +3368,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) "Setup NIC ifidx:%d mac:%02x%02x%02x%02x%02x%02x\n", i, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); netif_carrier_off(netdev); - - if (lio->linfo.link.s.status) { - netif_carrier_on(netdev); - start_txq(netdev); - } else { - netif_carrier_off(netdev); - } + lio->link_changes++; ifstate_set(lio, LIO_IFSTATE_REGISTERED); @@ -3386,7 +3402,7 @@ setup_nic_dev_fail: static int liquidio_init_nic_module(struct octeon_device *oct) { struct oct_intrmod_cfg *intrmod_cfg; - int retval = 0; + int i, retval = 0; int num_nic_ports = CFG_GET_NUM_NIC_PORTS(octeon_get_conf(oct)); dev_dbg(&oct->pci_dev->dev, "Initializing network interfaces\n"); @@ -3400,6 +3416,9 @@ static int liquidio_init_nic_module(struct octeon_device *oct) memset(oct->props, 0, sizeof(struct octdev_props) * num_nic_ports); + for (i = 0; i < MAX_OCTEON_LINKS; i++) + oct->props[i].gmxport = -1; + retval = setup_nic_devices(oct); if (retval) { dev_err(&oct->pci_dev->dev, "Setup NIC devices failed\n"); diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h index 0ac347ccc8ba..2179691efebc 100644 --- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h +++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h @@ -174,9 +174,11 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry, /*------------------------- End Scatter/Gather ---------------------------*/ #define OCTNET_FRM_PTP_HEADER_SIZE 8 -#define OCTNET_FRM_HEADER_SIZE 30 /* PTP timestamp + VLAN + Ethernet */ -#define OCTNET_MIN_FRM_SIZE (64 + OCTNET_FRM_PTP_HEADER_SIZE) +#define OCTNET_FRM_HEADER_SIZE 22 /* VLAN + Ethernet */ + +#define OCTNET_MIN_FRM_SIZE 64 + #define OCTNET_MAX_FRM_SIZE (16000 + OCTNET_FRM_HEADER_SIZE) #define OCTNET_DEFAULT_FRM_SIZE (1500 + OCTNET_FRM_HEADER_SIZE) @@ -258,19 +260,19 @@ union octnet_cmd { u64 more:6; /* How many udd words follow the command */ - u64 param1:29; + u64 reserved:29; - u64 param2:16; + u64 param1:16; - u64 param3:8; + u64 param2:8; #else - u64 param3:8; + u64 param2:8; - u64 param2:16; + u64 param1:16; - u64 param1:29; + u64 reserved:29; u64 more:6; @@ -283,8 +285,140 @@ union octnet_cmd { #define OCTNET_CMD_SIZE (sizeof(union octnet_cmd)) +/* Instruction Header (DPI - CN23xx) - for OCTEON-III models */ +struct octeon_instr_ih3 { +#ifdef __BIG_ENDIAN_BITFIELD + + /** Reserved3 */ + u64 reserved3:1; + + /** Gather indicator 1=gather*/ + u64 gather:1; + + /** Data length OR no. of entries in gather list */ + u64 dlengsz:14; + + /** Front Data size */ + u64 fsz:6; + + /** Reserved2 */ + u64 reserved2:4; + + /** PKI port kind - PKIND */ + u64 pkind:6; + + /** Reserved1 */ + u64 reserved1:32; + +#else + /** Reserved1 */ + u64 reserved1:32; + + /** PKI port kind - PKIND */ + u64 pkind:6; + + /** Reserved2 */ + u64 reserved2:4; + + /** Front Data size */ + u64 fsz:6; + + /** Data length OR no. of entries in gather list */ + u64 dlengsz:14; + + /** Gather indicator 1=gather*/ + u64 gather:1; + + /** Reserved3 */ + u64 reserved3:1; + +#endif +}; + +/* Optional PKI Instruction Header(PKI IH) - for OCTEON CN23XX models */ +/** BIG ENDIAN format. */ +struct octeon_instr_pki_ih3 { +#ifdef __BIG_ENDIAN_BITFIELD + + /** Wider bit */ + u64 w:1; + + /** Raw mode indicator 1 = RAW */ + u64 raw:1; + + /** Use Tag */ + u64 utag:1; + + /** Use QPG */ + u64 uqpg:1; + + /** Reserved2 */ + u64 reserved2:1; + + /** Parse Mode */ + u64 pm:3; + + /** Skip Length */ + u64 sl:8; + + /** Use Tag Type */ + u64 utt:1; + + /** Tag type */ + u64 tagtype:2; + + /** Reserved1 */ + u64 reserved1:2; + + /** QPG Value */ + u64 qpg:11; + + /** Tag Value */ + u64 tag:32; + +#else + + /** Tag Value */ + u64 tag:32; + + /** QPG Value */ + u64 qpg:11; + + /** Reserved1 */ + u64 reserved1:2; + + /** Tag type */ + u64 tagtype:2; + + /** Use Tag Type */ + u64 utt:1; + + /** Skip Length */ + u64 sl:8; + + /** Parse Mode */ + u64 pm:3; + + /** Reserved2 */ + u64 reserved2:1; + + /** Use QPG */ + u64 uqpg:1; + + /** Use Tag */ + u64 utag:1; + + /** Raw mode indicator 1 = RAW */ + u64 raw:1; + + /** Wider bit */ + u64 w:1; +#endif + +}; + /** Instruction Header */ -struct octeon_instr_ih { +struct octeon_instr_ih2 { #ifdef __BIG_ENDIAN_BITFIELD /** Raw mode indicator 1 = RAW */ u64 raw:1; @@ -412,10 +546,9 @@ union octeon_rh { u64 opcode:4; u64 subcode:8; u64 len:3; /** additional 64-bit words */ - u64 rid:13; - u64 reserved:4; + u64 reserved:8; u64 extra:25; - u64 ifidx:7; + u64 gmxport:16; } r_nic_info; #else u64 u64; @@ -448,10 +581,9 @@ union octeon_rh { u64 opcode:4; } r_core_drv_init; struct { - u64 ifidx:7; + u64 gmxport:16; u64 extra:25; - u64 reserved:4; - u64 rid:13; + u64 reserved:8; u64 len:3; /** additional 64-bit words */ u64 subcode:8; u64 opcode:4; @@ -461,30 +593,25 @@ union octeon_rh { #define OCT_RH_SIZE (sizeof(union octeon_rh)) -#define OCT_PKT_PARAM_IPV4OPTS 1 -#define OCT_PKT_PARAM_IPV6EXTHDR 2 - union octnic_packet_params { u32 u32; struct { #ifdef __BIG_ENDIAN_BITFIELD - u32 reserved:6; + u32 reserved:24; + u32 ip_csum:1; /* Perform IP header checksum(s) */ + /* Perform Outer transport header checksum */ + u32 transport_csum:1; + /* Find tunnel, and perform transport csum. */ u32 tnl_csum:1; - u32 ip_csum:1; - u32 ipv4opts_ipv6exthdr:2; - u32 ipsec_ops:4; - u32 tsflag:1; - u32 csoffset:9; - u32 ifidx:8; + u32 tsflag:1; /* Timestamp this packet */ + u32 ipsec_ops:4; /* IPsec operation */ #else - u32 ifidx:8; - u32 csoffset:9; - u32 tsflag:1; u32 ipsec_ops:4; - u32 ipv4opts_ipv6exthdr:2; - u32 ip_csum:1; + u32 tsflag:1; u32 tnl_csum:1; - u32 reserved:6; + u32 transport_csum:1; + u32 ip_csum:1; + u32 reserved:24; #endif } s; }; @@ -496,53 +623,90 @@ union oct_link_status { struct { #ifdef __BIG_ENDIAN_BITFIELD u64 duplex:8; - u64 status:8; u64 mtu:16; u64 speed:16; + u64 link_up:1; u64 autoneg:1; u64 interface:4; u64 pause:1; - u64 reserved:10; + u64 reserved:17; #else - u64 reserved:10; + u64 reserved:17; u64 pause:1; u64 interface:4; u64 autoneg:1; + u64 link_up:1; u64 speed:16; u64 mtu:16; - u64 status:8; u64 duplex:8; #endif } s; }; +/** The txpciq info passed to host from the firmware */ + +union oct_txpciq { + u64 u64; + + struct { +#ifdef __BIG_ENDIAN_BITFIELD + u64 q_no:8; + u64 port:8; + u64 pkind:6; + u64 use_qpg:1; + u64 qpg:11; + u64 reserved:30; +#else + u64 reserved:30; + u64 qpg:11; + u64 use_qpg:1; + u64 pkind:6; + u64 port:8; + u64 q_no:8; +#endif + } s; +}; + +/** The rxpciq info passed to host from the firmware */ + +union oct_rxpciq { + u64 u64; + + struct { +#ifdef __BIG_ENDIAN_BITFIELD + u64 q_no:8; + u64 reserved:56; +#else + u64 reserved:56; + u64 q_no:8; +#endif + } s; +}; + /** Information for a OCTEON ethernet interface shared between core & host. */ struct oct_link_info { union oct_link_status link; u64 hw_addr; #ifdef __BIG_ENDIAN_BITFIELD - u16 gmxport; - u8 rsvd[3]; - u8 num_txpciq; - u8 num_rxpciq; - u8 ifidx; + u64 gmxport:16; + u64 rsvd:32; + u64 num_txpciq:8; + u64 num_rxpciq:8; #else - u8 ifidx; - u8 num_rxpciq; - u8 num_txpciq; - u8 rsvd[3]; - u16 gmxport; + u64 num_rxpciq:8; + u64 num_txpciq:8; + u64 rsvd:32; + u64 gmxport:16; #endif - u8 txpciq[MAX_IOQS_PER_NICIF]; - u8 rxpciq[MAX_IOQS_PER_NICIF]; + union oct_txpciq txpciq[MAX_IOQS_PER_NICIF]; + union oct_rxpciq rxpciq[MAX_IOQS_PER_NICIF]; }; #define OCT_LINK_INFO_SIZE (sizeof(struct oct_link_info)) struct liquidio_if_cfg_info { - u64 ifidx; u64 iqmask; /** mask for IQs enabled for the port */ u64 oqmask; /** mask for OQs enabled for the port */ struct oct_link_info linfo; /** initial link information */ diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c index 8e23e3fad662..32900093527a 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c @@ -741,49 +741,59 @@ struct octeon_device *octeon_allocate_device(u32 pci_id, return oct; } +/* this function is only for setting up the first queue */ int octeon_setup_instr_queues(struct octeon_device *oct) { - u32 i, num_iqs = 0; + u32 num_iqs = 0; u32 num_descs = 0; + u32 iq_no = 0; + union oct_txpciq txpciq; + int numa_node = cpu_to_node(iq_no % num_online_cpus()); + num_iqs = 1; /* this causes queue 0 to be default queue */ - if (OCTEON_CN6XXX(oct)) { - num_iqs = 1; + if (OCTEON_CN6XXX(oct)) num_descs = CFG_GET_NUM_DEF_TX_DESCS(CHIP_FIELD(oct, cn6xxx, conf)); - } oct->num_iqs = 0; - for (i = 0; i < num_iqs; i++) { - oct->instr_queue[i] = + oct->instr_queue[0] = vmalloc_node(sizeof(*oct->instr_queue[0]), + numa_node); + if (!oct->instr_queue[0]) + oct->instr_queue[0] = vmalloc(sizeof(struct octeon_instr_queue)); - if (!oct->instr_queue[i]) - return 1; - - memset(oct->instr_queue[i], 0, - sizeof(struct octeon_instr_queue)); - - oct->instr_queue[i]->app_ctx = (void *)(size_t)i; - if (octeon_init_instr_queue(oct, i, num_descs)) - return 1; - - oct->num_iqs++; + if (!oct->instr_queue[0]) + return 1; + memset(oct->instr_queue[0], 0, sizeof(struct octeon_instr_queue)); + oct->instr_queue[0]->q_index = 0; + oct->instr_queue[0]->app_ctx = (void *)(size_t)0; + oct->instr_queue[0]->ifidx = 0; + txpciq.u64 = 0; + txpciq.s.q_no = iq_no; + txpciq.s.use_qpg = 0; + txpciq.s.qpg = 0; + if (octeon_init_instr_queue(oct, txpciq, num_descs)) { + /* prevent memory leak */ + vfree(oct->instr_queue[0]); + return 1; } + oct->num_iqs++; return 0; } int octeon_setup_output_queues(struct octeon_device *oct) { - u32 i, num_oqs = 0; + u32 num_oqs = 0; u32 num_descs = 0; u32 desc_size = 0; + u32 oq_no = 0; + int numa_node = cpu_to_node(oq_no % num_online_cpus()); + num_oqs = 1; /* this causes queue 0 to be default queue */ if (OCTEON_CN6XXX(oct)) { - /* CFG_GET_OQ_MAX_BASE_Q(CHIP_FIELD(oct, cn6xxx, conf)); */ - num_oqs = 1; num_descs = CFG_GET_NUM_DEF_RX_DESCS(CHIP_FIELD(oct, cn6xxx, conf)); desc_size = @@ -791,19 +801,15 @@ int octeon_setup_output_queues(struct octeon_device *oct) } oct->num_oqs = 0; + oct->droq[0] = vmalloc_node(sizeof(*oct->droq[0]), numa_node); + if (!oct->droq[0]) + oct->droq[0] = vmalloc(sizeof(*oct->droq[0])); + if (!oct->droq[0]) + return 1; - for (i = 0; i < num_oqs; i++) { - oct->droq[i] = vmalloc(sizeof(*oct->droq[i])); - if (!oct->droq[i]) - return 1; - - memset(oct->droq[i], 0, sizeof(struct octeon_droq)); - - if (octeon_init_droq(oct, i, num_descs, desc_size, NULL)) - return 1; - - oct->num_oqs++; - } + if (octeon_init_droq(oct, oq_no, num_descs, desc_size, NULL)) + return 1; + oct->num_oqs++; return 0; } diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h index 36e1f85df8c4..0950b94f8805 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h @@ -267,6 +267,7 @@ struct octdev_props { /* Each interface in the Octeon device has a network * device pointer (used for OS specific calls). */ + int gmxport; struct net_device *netdev; }; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c index 174072b3740b..59a529353f6d 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c @@ -151,22 +151,26 @@ octeon_droq_destroy_ring_buffers(struct octeon_device *oct, struct octeon_droq *droq) { u32 i; + struct octeon_skb_page_info *pg_info; for (i = 0; i < droq->max_count; i++) { - if (droq->recv_buf_list[i].buffer) { - if (droq->desc_ring) { - lio_unmap_ring_info(oct->pci_dev, - (u64)droq-> - desc_ring[i].info_ptr, - OCT_DROQ_INFO_SIZE); - lio_unmap_ring(oct->pci_dev, - (u64)droq->desc_ring[i]. - buffer_ptr, - droq->buffer_size); - } - recv_buffer_free(droq->recv_buf_list[i].buffer); - droq->recv_buf_list[i].buffer = NULL; - } + pg_info = &droq->recv_buf_list[i].pg_info; + + if (pg_info->dma) + lio_unmap_ring(oct->pci_dev, + (u64)pg_info->dma); + pg_info->dma = 0; + + if (pg_info->page) + recv_buffer_destroy(droq->recv_buf_list[i].buffer, + pg_info); + + if (droq->desc_ring && droq->desc_ring[i].info_ptr) + lio_unmap_ring_info(oct->pci_dev, + (u64)droq-> + desc_ring[i].info_ptr, + OCT_DROQ_INFO_SIZE); + droq->recv_buf_list[i].buffer = NULL; } octeon_droq_reset_indices(droq); @@ -181,11 +185,12 @@ octeon_droq_setup_ring_buffers(struct octeon_device *oct, struct octeon_droq_desc *desc_ring = droq->desc_ring; for (i = 0; i < droq->max_count; i++) { - buf = recv_buffer_alloc(oct, droq->q_no, droq->buffer_size); + buf = recv_buffer_alloc(oct, &droq->recv_buf_list[i].pg_info); if (!buf) { dev_err(&oct->pci_dev->dev, "%s buffer alloc failed\n", __func__); + droq->stats.rx_alloc_failure++; return -ENOMEM; } @@ -197,9 +202,7 @@ octeon_droq_setup_ring_buffers(struct octeon_device *oct, /* map ring buffers into memory */ desc_ring[i].info_ptr = lio_map_ring_info(droq, i); desc_ring[i].buffer_ptr = - lio_map_ring(oct->pci_dev, - droq->recv_buf_list[i].buffer, - droq->buffer_size); + lio_map_ring(droq->recv_buf_list[i].buffer); } octeon_droq_reset_indices(droq); @@ -242,6 +245,8 @@ int octeon_init_droq(struct octeon_device *oct, struct octeon_droq *droq; u32 desc_ring_size = 0, c_num_descs = 0, c_buf_size = 0; u32 c_pkts_per_intr = 0, c_refill_threshold = 0; + int orig_node = dev_to_node(&oct->pci_dev->dev); + int numa_node = cpu_to_node(q_no % num_online_cpus()); dev_dbg(&oct->pci_dev->dev, "%s[%d]\n", __func__, q_no); @@ -261,15 +266,23 @@ int octeon_init_droq(struct octeon_device *oct, struct octeon_config *conf6x = CHIP_FIELD(oct, cn6xxx, conf); c_pkts_per_intr = (u32)CFG_GET_OQ_PKTS_PER_INTR(conf6x); - c_refill_threshold = (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf6x); + c_refill_threshold = + (u32)CFG_GET_OQ_REFILL_THRESHOLD(conf6x); + } else { + return 1; } droq->max_count = c_num_descs; droq->buffer_size = c_buf_size; desc_ring_size = droq->max_count * OCT_DROQ_DESC_SIZE; + set_dev_node(&oct->pci_dev->dev, numa_node); droq->desc_ring = lio_dma_alloc(oct, desc_ring_size, (dma_addr_t *)&droq->desc_ring_dma); + set_dev_node(&oct->pci_dev->dev, orig_node); + if (!droq->desc_ring) + droq->desc_ring = lio_dma_alloc(oct, desc_ring_size, + (dma_addr_t *)&droq->desc_ring_dma); if (!droq->desc_ring) { dev_err(&oct->pci_dev->dev, @@ -283,12 +296,11 @@ int octeon_init_droq(struct octeon_device *oct, droq->max_count); droq->info_list = - cnnic_alloc_aligned_dma(oct->pci_dev, - (droq->max_count * OCT_DROQ_INFO_SIZE), - &droq->info_alloc_size, - &droq->info_base_addr, - &droq->info_list_dma); - + cnnic_numa_alloc_aligned_dma((droq->max_count * + OCT_DROQ_INFO_SIZE), + &droq->info_alloc_size, + &droq->info_base_addr, + numa_node); if (!droq->info_list) { dev_err(&oct->pci_dev->dev, "Cannot allocate memory for info list.\n"); lio_dma_free(oct, (droq->max_count * OCT_DROQ_DESC_SIZE), @@ -297,7 +309,12 @@ int octeon_init_droq(struct octeon_device *oct, } droq->recv_buf_list = (struct octeon_recv_buffer *) - vmalloc(droq->max_count * + vmalloc_node(droq->max_count * + OCT_DROQ_RECVBUF_SIZE, + numa_node); + if (!droq->recv_buf_list) + droq->recv_buf_list = (struct octeon_recv_buffer *) + vmalloc(droq->max_count * OCT_DROQ_RECVBUF_SIZE); if (!droq->recv_buf_list) { dev_err(&oct->pci_dev->dev, "Output queue recv buf list alloc failed\n"); @@ -358,6 +375,7 @@ static inline struct octeon_recv_info *octeon_create_recv_info( struct octeon_recv_pkt *recv_pkt; struct octeon_recv_info *recv_info; u32 i, bytes_left; + struct octeon_skb_page_info *pg_info; info = &droq->info_list[idx]; @@ -375,9 +393,14 @@ static inline struct octeon_recv_info *octeon_create_recv_info( bytes_left = (u32)info->length; while (buf_cnt) { - lio_unmap_ring(octeon_dev->pci_dev, - (u64)droq->desc_ring[idx].buffer_ptr, - droq->buffer_size); + { + pg_info = &droq->recv_buf_list[idx].pg_info; + + lio_unmap_ring(octeon_dev->pci_dev, + (u64)pg_info->dma); + pg_info->page = NULL; + pg_info->dma = 0; + } recv_pkt->buffer_size[i] = (bytes_left >= @@ -449,6 +472,7 @@ octeon_droq_refill(struct octeon_device *octeon_dev, struct octeon_droq *droq) void *buf = NULL; u8 *data; u32 desc_refilled = 0; + struct octeon_skb_page_info *pg_info; desc_ring = droq->desc_ring; @@ -458,13 +482,22 @@ octeon_droq_refill(struct octeon_device *octeon_dev, struct octeon_droq *droq) * the buffer, else allocate. */ if (!droq->recv_buf_list[droq->refill_idx].buffer) { - buf = recv_buffer_alloc(octeon_dev, droq->q_no, - droq->buffer_size); + pg_info = + &droq->recv_buf_list[droq->refill_idx].pg_info; + /* Either recycle the existing pages or go for + * new page alloc + */ + if (pg_info->page) + buf = recv_buffer_reuse(octeon_dev, pg_info); + else + buf = recv_buffer_alloc(octeon_dev, pg_info); /* If a buffer could not be allocated, no point in * continuing */ - if (!buf) + if (!buf) { + droq->stats.rx_alloc_failure++; break; + } droq->recv_buf_list[droq->refill_idx].buffer = buf; data = get_rbd(buf); @@ -476,11 +509,8 @@ octeon_droq_refill(struct octeon_device *octeon_dev, struct octeon_droq *droq) droq->recv_buf_list[droq->refill_idx].data = data; desc_ring[droq->refill_idx].buffer_ptr = - lio_map_ring(octeon_dev->pci_dev, - droq->recv_buf_list[droq-> - refill_idx].buffer, - droq->buffer_size); - + lio_map_ring(droq->recv_buf_list[droq-> + refill_idx].buffer); /* Reset any previous values in the length field. */ droq->info_list[droq->refill_idx].length = 0; @@ -586,6 +616,8 @@ octeon_droq_fast_process_packets(struct octeon_device *oct, for (pkt = 0; pkt < pkt_count; pkt++) { u32 pkt_len = 0; struct sk_buff *nicbuf = NULL; + struct octeon_skb_page_info *pg_info; + void *buf; info = &droq->info_list[droq->read_idx]; octeon_swap_8B_data((u64 *)info, 2); @@ -605,7 +637,6 @@ octeon_droq_fast_process_packets(struct octeon_device *oct, rh = &info->rh; total_len += (u32)info->length; - if (OPCODE_SLOW_PATH(rh)) { u32 buf_cnt; @@ -614,50 +645,44 @@ octeon_droq_fast_process_packets(struct octeon_device *oct, droq->refill_count += buf_cnt; } else { if (info->length <= droq->buffer_size) { - lio_unmap_ring(oct->pci_dev, - (u64)droq->desc_ring[ - droq->read_idx].buffer_ptr, - droq->buffer_size); pkt_len = (u32)info->length; nicbuf = droq->recv_buf_list[ droq->read_idx].buffer; + pg_info = &droq->recv_buf_list[ + droq->read_idx].pg_info; + if (recv_buffer_recycle(oct, pg_info)) + pg_info->page = NULL; droq->recv_buf_list[droq->read_idx].buffer = NULL; INCR_INDEX_BY1(droq->read_idx, droq->max_count); - skb_put(nicbuf, pkt_len); droq->refill_count++; } else { - nicbuf = octeon_fast_packet_alloc(oct, droq, - droq->q_no, - (u32) + nicbuf = octeon_fast_packet_alloc((u32) info->length); pkt_len = 0; /* nicbuf allocation can fail. We'll handle it * inside the loop. */ while (pkt_len < info->length) { - int cpy_len; + int cpy_len, idx = droq->read_idx; - cpy_len = ((pkt_len + - droq->buffer_size) > - info->length) ? + cpy_len = ((pkt_len + droq->buffer_size) + > info->length) ? ((u32)info->length - pkt_len) : droq->buffer_size; if (nicbuf) { - lio_unmap_ring(oct->pci_dev, - (u64) - droq->desc_ring - [droq->read_idx]. - buffer_ptr, - droq-> - buffer_size); octeon_fast_packet_next(droq, nicbuf, cpy_len, - droq-> - read_idx - ); + idx); + buf = droq->recv_buf_list[idx]. + buffer; + recv_buffer_fast_free(buf); + droq->recv_buf_list[idx].buffer + = NULL; + } else { + droq->stats.rx_alloc_failure++; } pkt_len += cpy_len; @@ -668,12 +693,14 @@ octeon_droq_fast_process_packets(struct octeon_device *oct, } if (nicbuf) { - if (droq->ops.fptr) + if (droq->ops.fptr) { droq->ops.fptr(oct->octeon_id, - nicbuf, pkt_len, - rh, &droq->napi); - else + nicbuf, pkt_len, + rh, &droq->napi, + droq->ops.farg); + } else { recv_buffer_free(nicbuf); + } } } @@ -681,16 +708,16 @@ octeon_droq_fast_process_packets(struct octeon_device *oct, int desc_refilled = octeon_droq_refill(oct, droq); /* Flush the droq descriptor data to memory to be sure - * that when we update the credits the data in memory - * is accurate. - */ + * that when we update the credits the data in memory + * is accurate. + */ wmb(); writel((desc_refilled), droq->pkts_credit_reg); /* make sure mmio write completes */ mmiowb(); } - } /* for ( each packet )... */ + } /* for (each packet)... */ /* Increment refill_count by the number of buffers processed. */ droq->stats.pkts_received += pkt; @@ -937,6 +964,7 @@ int octeon_unregister_droq_ops(struct octeon_device *oct, u32 q_no) spin_lock_irqsave(&droq->lock, flags); droq->ops.fptr = NULL; + droq->ops.farg = NULL; droq->ops.drop_on_max = 0; spin_unlock_irqrestore(&droq->lock, flags); @@ -949,6 +977,7 @@ int octeon_create_droq(struct octeon_device *oct, u32 desc_size, void *app_ctx) { struct octeon_droq *droq; + int numa_node = cpu_to_node(q_no % num_online_cpus()); if (oct->droq[q_no]) { dev_dbg(&oct->pci_dev->dev, "Droq already in use. Cannot create droq %d again\n", @@ -957,7 +986,9 @@ int octeon_create_droq(struct octeon_device *oct, } /* Allocate the DS for the new droq. */ - droq = vmalloc(sizeof(*droq)); + droq = vmalloc_node(sizeof(*droq), numa_node); + if (!droq) + droq = vmalloc(sizeof(*droq)); if (!droq) goto create_droq_fail; memset(droq, 0, sizeof(struct octeon_droq)); diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h index 7940ccee12d9..1ca9c4f05702 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h @@ -65,6 +65,17 @@ struct octeon_droq_info { #define OCT_DROQ_INFO_SIZE (sizeof(struct octeon_droq_info)) +struct octeon_skb_page_info { + /* DMA address for the page */ + dma_addr_t dma; + + /* Page for the rx dma **/ + struct page *page; + + /** which offset into page */ + unsigned int page_offset; +}; + /** Pointer to data buffer. * Driver keeps a pointer to the data buffer that it made available to * the Octeon device. Since the descriptor ring keeps physical (bus) @@ -77,6 +88,9 @@ struct octeon_recv_buffer { /** Data in the packet buffer. */ u8 *data; + + /** pg_info **/ + struct octeon_skb_page_info pg_info; }; #define OCT_DROQ_RECVBUF_SIZE (sizeof(struct octeon_recv_buffer)) @@ -106,6 +120,10 @@ struct oct_droq_stats { /** Num of Packets dropped due to receive path failures. */ u64 rx_dropped; + + /** Num of failures of recv_buffer_alloc() */ + u64 rx_alloc_failure; + }; #define POLL_EVENT_INTR_ARRIVED 1 @@ -213,7 +231,8 @@ struct octeon_droq_ops { * data in the buffer. The receive header gives the port * number to the caller. Function pointer is set by caller. */ - void (*fptr)(u32, void *, u32, union octeon_rh *, void *); + void (*fptr)(u32, void *, u32, union octeon_rh *, void *, void *); + void *farg; /* This function will be called by the driver for all NAPI related * events. The first param is the octeon id. The second param is the diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h index 592fe49b589d..513f8a068179 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h @@ -75,14 +75,16 @@ struct oct_iq_stats { * a Octeon device has one such structure to represent it. */ struct octeon_instr_queue { + struct octeon_device *oct_dev; + /** A spinlock to protect access to the input ring. */ spinlock_t lock; /** Flag that indicates if the queue uses 64 byte commands. */ u32 iqcmd_64B:1; - /** Queue Number. */ - u32 iq_no:5; + /** Queue info. */ + union oct_txpciq txpciq; u32 rsvd:17; @@ -147,6 +149,13 @@ struct octeon_instr_queue { /** Application context */ void *app_ctx; + + /* network stack queue index */ + int q_index; + + /*os ifidx associated with this queue */ + int ifidx; + }; /*---------------------- INSTRUCTION FORMAT ----------------------------*/ @@ -176,12 +185,12 @@ struct octeon_instr_32B { /** 64-byte instruction format. * Format of instruction for a 64-byte mode input queue. */ -struct octeon_instr_64B { +struct octeon_instr2_64B { /** Pointer where the input data is available. */ u64 dptr; /** Instruction Header. */ - u64 ih; + u64 ih2; /** Input Request Header. */ u64 irh; @@ -198,10 +207,40 @@ struct octeon_instr_64B { u64 rptr; u64 reserved; +}; + +struct octeon_instr3_64B { + /** Pointer where the input data is available. */ + u64 dptr; + + /** Instruction Header. */ + u64 ih3; + + /** Instruction Header. */ + u64 pki_ih3; + + /** Input Request Header. */ + u64 irh; + + /** opcode/subcode specific parameters */ + u64 ossp[2]; + + /** Return Data Parameters */ + u64 rdp; + + /** Pointer where the response for a RAW mode packet will be written + * by Octeon. + */ + u64 rptr; }; -#define OCT_64B_INSTR_SIZE (sizeof(struct octeon_instr_64B)) +union octeon_instr_64B { + struct octeon_instr2_64B cmd2; + struct octeon_instr3_64B cmd3; +}; + +#define OCT_64B_INSTR_SIZE (sizeof(union octeon_instr_64B)) /** The size of each buffer in soft command buffer pool */ @@ -214,7 +253,8 @@ struct octeon_soft_command { u32 size; /** Command and return status */ - struct octeon_instr_64B cmd; + union octeon_instr_64B cmd; + #define COMPLETION_WORD_INIT 0xffffffffffffffffULL u64 *status_word; @@ -268,14 +308,15 @@ void octeon_free_soft_command(struct octeon_device *oct, /** * octeon_init_instr_queue() * @param octeon_dev - pointer to the octeon device structure. - * @param iq_no - queue to be initialized (0 <= q_no <= 3). + * @param txpciq - queue to be initialized (0 <= q_no <= 3). * * Called at driver init time for each input queue. iq_conf has the * configuration parameters for the queue. * * @return Success: 0 Failure: 1 */ -int octeon_init_instr_queue(struct octeon_device *octeon_dev, u32 iq_no, +int octeon_init_instr_queue(struct octeon_device *octeon_dev, + union oct_txpciq txpciq, u32 num_descs); /** @@ -313,7 +354,8 @@ void octeon_prepare_soft_command(struct octeon_device *oct, int octeon_send_soft_command(struct octeon_device *oct, struct octeon_soft_command *sc); -int octeon_setup_iq(struct octeon_device *oct, u32 iq_no, - u32 num_descs, void *app_ctx); +int octeon_setup_iq(struct octeon_device *oct, int ifidx, + int q_index, union oct_txpciq iq_no, u32 num_descs, + void *app_ctx); #endif /* __OCTEON_IQ_H__ */ diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index cbd081981180..0ff3efc67b84 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -126,22 +126,27 @@ static inline int octeon_map_pci_barx(struct octeon_device *oct, } static inline void * -cnnic_alloc_aligned_dma(struct pci_dev *pci_dev, - u32 size, - u32 *alloc_size, - size_t *orig_ptr, - size_t *dma_addr __attribute__((unused))) +cnnic_numa_alloc_aligned_dma(u32 size, + u32 *alloc_size, + size_t *orig_ptr, + int numa_node) { int retries = 0; void *ptr = NULL; #define OCTEON_MAX_ALLOC_RETRIES 1 do { - ptr = - (void *)__get_free_pages(GFP_KERNEL, - get_order(size)); + struct page *page = NULL; + + page = alloc_pages_node(numa_node, + GFP_KERNEL, + get_order(size)); + if (!page) + page = alloc_pages(GFP_KERNEL, + get_order(size)); + ptr = (void *)page_address(page); if ((unsigned long)ptr & 0x07) { - free_pages((unsigned long)ptr, get_order(size)); + __free_pages(page, get_order(size)); ptr = NULL; /* Increment the size required if the first * attempt failed. diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index b3abe5818fd3..9c14484bfca0 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -48,11 +48,11 @@ struct lio { */ int rxq; - /** Guards the glist */ - spinlock_t lock; + /** Guards each glist */ + spinlock_t *glist_lock; - /** Linked list of gather components */ - struct list_head glist; + /** Array of gather component linked lists */ + struct list_head *glist; /** Pointer to the NIC properties for the Octeon device this network * interface is associated with. @@ -67,6 +67,9 @@ struct lio { /** Link information sent by the core application for this interface. */ struct oct_link_info linfo; + /** counter of link changes */ + u64 link_changes; + /** Size of Tx queue for this octeon device. */ u32 tx_qsize; @@ -111,8 +114,9 @@ struct lio { * \brief Enable or disable feature * @param netdev pointer to network device * @param cmd Command that just requires acknowledgment + * @param param1 Parameter to command */ -int liquidio_set_feature(struct net_device *netdev, int cmd); +int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1); /** * \brief Link control command completion callback @@ -131,14 +135,30 @@ void liquidio_link_ctrl_cmd_completion(void *nctrl_ptr); */ void liquidio_set_ethtool_ops(struct net_device *netdev); -static inline void -*recv_buffer_alloc(struct octeon_device *oct __attribute__((unused)), - u32 q_no __attribute__((unused)), u32 size) -{ #define SKB_ADJ_MASK 0x3F #define SKB_ADJ (SKB_ADJ_MASK + 1) - struct sk_buff *skb = dev_alloc_skb(size + SKB_ADJ); +#define MIN_SKB_SIZE 256 /* 8 bytes and more - 8 bytes for PTP */ +#define LIO_RXBUFFER_SZ 2048 + +static inline void +*recv_buffer_alloc(struct octeon_device *oct, + struct octeon_skb_page_info *pg_info) +{ + struct page *page; + struct sk_buff *skb; + struct octeon_skb_page_info *skb_pg_info; + + page = alloc_page(GFP_ATOMIC | __GFP_COLD); + if (unlikely(!page)) + return NULL; + + skb = dev_alloc_skb(MIN_SKB_SIZE + SKB_ADJ); + if (unlikely(!skb)) { + __free_page(page); + pg_info->page = NULL; + return NULL; + } if ((unsigned long)skb->data & SKB_ADJ_MASK) { u32 r = SKB_ADJ - ((unsigned long)skb->data & SKB_ADJ_MASK); @@ -146,11 +166,151 @@ static inline void skb_reserve(skb, r); } + skb_pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + /* Get DMA info */ + pg_info->dma = dma_map_page(&oct->pci_dev->dev, page, 0, + PAGE_SIZE, DMA_FROM_DEVICE); + + /* Mapping failed!! */ + if (dma_mapping_error(&oct->pci_dev->dev, pg_info->dma)) { + __free_page(page); + dev_kfree_skb_any((struct sk_buff *)skb); + pg_info->page = NULL; + return NULL; + } + + pg_info->page = page; + pg_info->page_offset = 0; + skb_pg_info->page = page; + skb_pg_info->page_offset = 0; + skb_pg_info->dma = pg_info->dma; + return (void *)skb; } +static inline void +*recv_buffer_fast_alloc(u32 size) +{ + struct sk_buff *skb; + struct octeon_skb_page_info *skb_pg_info; + + skb = dev_alloc_skb(size + SKB_ADJ); + if (unlikely(!skb)) + return NULL; + + if ((unsigned long)skb->data & SKB_ADJ_MASK) { + u32 r = SKB_ADJ - ((unsigned long)skb->data & SKB_ADJ_MASK); + + skb_reserve(skb, r); + } + + skb_pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + skb_pg_info->page = NULL; + skb_pg_info->page_offset = 0; + skb_pg_info->dma = 0; + + return skb; +} + +static inline int +recv_buffer_recycle(struct octeon_device *oct, void *buf) +{ + struct octeon_skb_page_info *pg_info = buf; + + if (!pg_info->page) { + dev_err(&oct->pci_dev->dev, "%s: pg_info->page NULL\n", + __func__); + return -ENOMEM; + } + + if (unlikely(page_count(pg_info->page) != 1) || + unlikely(page_to_nid(pg_info->page) != numa_node_id())) { + dma_unmap_page(&oct->pci_dev->dev, + pg_info->dma, (PAGE_SIZE << 0), + DMA_FROM_DEVICE); + pg_info->dma = 0; + pg_info->page = NULL; + pg_info->page_offset = 0; + return -ENOMEM; + } + + /* Flip to other half of the buffer */ + if (pg_info->page_offset == 0) + pg_info->page_offset = LIO_RXBUFFER_SZ; + else + pg_info->page_offset = 0; + page_ref_inc(pg_info->page); + + return 0; +} + +static inline void +*recv_buffer_reuse(struct octeon_device *oct, void *buf) +{ + struct octeon_skb_page_info *pg_info = buf, *skb_pg_info; + struct sk_buff *skb; + + skb = dev_alloc_skb(MIN_SKB_SIZE + SKB_ADJ); + if (unlikely(!skb)) { + dma_unmap_page(&oct->pci_dev->dev, + pg_info->dma, (PAGE_SIZE << 0), + DMA_FROM_DEVICE); + return NULL; + } + + if ((unsigned long)skb->data & SKB_ADJ_MASK) { + u32 r = SKB_ADJ - ((unsigned long)skb->data & SKB_ADJ_MASK); + + skb_reserve(skb, r); + } + + skb_pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + skb_pg_info->page = pg_info->page; + skb_pg_info->page_offset = pg_info->page_offset; + skb_pg_info->dma = pg_info->dma; + + return skb; +} + +static inline void +recv_buffer_destroy(void *buffer, struct octeon_skb_page_info *pg_info) +{ + struct sk_buff *skb = (struct sk_buff *)buffer; + + put_page(pg_info->page); + pg_info->dma = 0; + pg_info->page = NULL; + pg_info->page_offset = 0; + + if (skb) + dev_kfree_skb_any(skb); +} + static inline void recv_buffer_free(void *buffer) { + struct sk_buff *skb = (struct sk_buff *)buffer; + struct octeon_skb_page_info *pg_info; + + pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + + if (pg_info->page) { + put_page(pg_info->page); + pg_info->dma = 0; + pg_info->page = NULL; + pg_info->page_offset = 0; + } + + dev_kfree_skb_any((struct sk_buff *)buffer); +} + +static inline void +recv_buffer_fast_free(void *buffer) +{ + dev_kfree_skb_any((struct sk_buff *)buffer); +} + +static inline void tx_buffer_free(void *buffer) +{ dev_kfree_skb_any((struct sk_buff *)buffer); } @@ -159,7 +319,17 @@ static inline void recv_buffer_free(void *buffer) #define lio_dma_free(oct, size, virt_addr, dma_addr) \ dma_free_coherent(&oct->pci_dev->dev, size, virt_addr, dma_addr) -#define get_rbd(ptr) (((struct sk_buff *)(ptr))->data) +static inline +void *get_rbd(struct sk_buff *skb) +{ + struct octeon_skb_page_info *pg_info; + unsigned char *va; + + pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + va = page_address(pg_info->page) + pg_info->page_offset; + + return va; +} static inline u64 lio_map_ring_info(struct octeon_droq *droq, u32 i) @@ -183,33 +353,44 @@ lio_unmap_ring_info(struct pci_dev *pci_dev, } static inline u64 -lio_map_ring(struct pci_dev *pci_dev, - void *buf, u32 size) +lio_map_ring(void *buf) { dma_addr_t dma_addr; - dma_addr = dma_map_single(&pci_dev->dev, get_rbd(buf), size, - DMA_FROM_DEVICE); + struct sk_buff *skb = (struct sk_buff *)buf; + struct octeon_skb_page_info *pg_info; - BUG_ON(dma_mapping_error(&pci_dev->dev, dma_addr)); + pg_info = ((struct octeon_skb_page_info *)(skb->cb)); + if (!pg_info->page) { + pr_err("%s: pg_info->page NULL\n", __func__); + WARN_ON(1); + } + + /* Get DMA info */ + dma_addr = pg_info->dma; + if (!pg_info->dma) { + pr_err("%s: ERROR it should be already available\n", + __func__); + WARN_ON(1); + } + dma_addr += pg_info->page_offset; return (u64)dma_addr; } static inline void lio_unmap_ring(struct pci_dev *pci_dev, - u64 buf_ptr, u32 size) + u64 buf_ptr) + { - dma_unmap_single(&pci_dev->dev, - buf_ptr, size, - DMA_FROM_DEVICE); + dma_unmap_page(&pci_dev->dev, + buf_ptr, (PAGE_SIZE << 0), + DMA_FROM_DEVICE); } -static inline void *octeon_fast_packet_alloc(struct octeon_device *oct, - struct octeon_droq *droq, - u32 q_no, u32 size) +static inline void *octeon_fast_packet_alloc(u32 size) { - return recv_buffer_alloc(oct, q_no, size); + return recv_buffer_fast_alloc(size); } static inline void octeon_fast_packet_next(struct octeon_droq *droq, diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c index 1a0191549cb3..7843b8a05dcf 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c @@ -44,11 +44,11 @@ void * octeon_alloc_soft_command_resp(struct octeon_device *oct, - struct octeon_instr_64B *cmd, - size_t rdatasize) + union octeon_instr_64B *cmd, + u32 rdatasize) { struct octeon_soft_command *sc; - struct octeon_instr_ih *ih; + struct octeon_instr_ih2 *ih2; struct octeon_instr_irh *irh; struct octeon_instr_rdp *rdp; @@ -59,24 +59,25 @@ octeon_alloc_soft_command_resp(struct octeon_device *oct, return NULL; /* Copy existing command structure into the soft command */ - memcpy(&sc->cmd, cmd, sizeof(struct octeon_instr_64B)); + memcpy(&sc->cmd, cmd, sizeof(union octeon_instr_64B)); /* Add in the response related fields. Opcode and Param are already * there. */ - ih = (struct octeon_instr_ih *)&sc->cmd.ih; - ih->fsz = 40; /* irh + ossp[0] + ossp[1] + rdp + rptr = 40 bytes */ + ih2 = (struct octeon_instr_ih2 *)&sc->cmd.cmd2.ih2; + rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd2.rdp; + irh = (struct octeon_instr_irh *)&sc->cmd.cmd2.irh; + ih2->fsz = 40; /* irh + ossp[0] + ossp[1] + rdp + rptr = 40 bytes */ - irh = (struct octeon_instr_irh *)&sc->cmd.irh; irh->rflag = 1; /* a response is required */ - irh->len = 4; /* means four 64-bit words immediately follow irh */ - rdp = (struct octeon_instr_rdp *)&sc->cmd.rdp; rdp->pcie_port = oct->pcie_port; rdp->rlen = rdatasize; *sc->status_word = COMPLETION_WORD_INIT; + sc->cmd.cmd2.rptr = sc->dmarptr; + sc->wait_time = 1000; sc->timeout = jiffies + sc->wait_time; @@ -119,12 +120,11 @@ static void octnet_link_ctrl_callback(struct octeon_device *oct, static inline struct octeon_soft_command *octnic_alloc_ctrl_pkt_sc(struct octeon_device *oct, - struct octnic_ctrl_pkt *nctrl, - struct octnic_ctrl_params nparams) + struct octnic_ctrl_pkt *nctrl) { struct octeon_soft_command *sc = NULL; u8 *data; - size_t rdatasize; + u32 rdatasize; u32 uddsize = 0, datasize = 0; uddsize = (u32)(nctrl->ncmd.s.more * 8); @@ -143,7 +143,7 @@ static inline struct octeon_soft_command data = (u8 *)sc->virtdptr; - memcpy(data, &nctrl->ncmd, OCTNET_CMD_SIZE); + memcpy(data, &nctrl->ncmd, OCTNET_CMD_SIZE); octeon_swap_8B_data((u64 *)data, (OCTNET_CMD_SIZE >> 3)); @@ -152,6 +152,8 @@ static inline struct octeon_soft_command memcpy(data + OCTNET_CMD_SIZE, nctrl->udd, uddsize); } + sc->iq_no = (u32)nctrl->iq_no; + octeon_prepare_soft_command(oct, sc, OPCODE_NIC, OPCODE_NIC_CMD, 0, 0, 0); @@ -164,13 +166,12 @@ static inline struct octeon_soft_command int octnet_send_nic_ctrl_pkt(struct octeon_device *oct, - struct octnic_ctrl_pkt *nctrl, - struct octnic_ctrl_params nparams) + struct octnic_ctrl_pkt *nctrl) { int retval; struct octeon_soft_command *sc = NULL; - sc = octnic_alloc_ctrl_pkt_sc(oct, nctrl, nparams); + sc = octnic_alloc_ctrl_pkt_sc(oct, nctrl); if (!sc) { dev_err(&oct->pci_dev->dev, "%s soft command alloc failed\n", __func__); @@ -178,7 +179,7 @@ octnet_send_nic_ctrl_pkt(struct octeon_device *oct, } retval = octeon_send_soft_command(oct, sc); - if (retval) { + if (retval == IQ_SEND_FAILED) { octeon_free_soft_command(oct, sc); dev_err(&oct->pci_dev->dev, "%s soft command send failed status: %x\n", __func__, retval); diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_nic.h b/drivers/net/ethernet/cavium/liquidio/octeon_nic.h index 0238857c8105..b71a2bbe4bee 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_nic.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_nic.h @@ -52,6 +52,9 @@ struct octnic_ctrl_pkt { /** Additional data that may be needed by some commands. */ u64 udd[MAX_NCTRL_UDD]; + /** Input queue to use to send this command. */ + u64 iq_no; + /** Time to wait for Octeon software to respond to this control command. * If wait_time is 0, OSI assumes no response is expected. */ @@ -82,7 +85,7 @@ struct octnic_data_pkt { u32 datasize; /** Command to be passed to the Octeon device software. */ - struct octeon_instr_64B cmd; + union octeon_instr_64B cmd; /** Input queue to use to send this command. */ u32 q_no; @@ -94,15 +97,14 @@ struct octnic_data_pkt { */ union octnic_cmd_setup { struct { - u32 ifidx:8; - u32 cksum_offset:7; + u32 iq_no:8; u32 gather:1; u32 timestamp:1; - u32 ipv4opts_ipv6exthdr:2; u32 ip_csum:1; + u32 transport_csum:1; u32 tnl_csum:1; + u32 rsvd:19; - u32 rsvd:11; union { u32 datasize; u32 gatherptrs; @@ -113,79 +115,146 @@ union octnic_cmd_setup { }; -struct octnic_ctrl_params { - u32 resp_order; -}; - static inline int octnet_iq_is_full(struct octeon_device *oct, u32 q_no) { return ((u32)atomic_read(&oct->instr_queue[q_no]->instr_pending) >= (oct->instr_queue[q_no]->max_count - 2)); } -/** Utility function to prepare a 64B NIC instruction based on a setup command - * @param cmd - pointer to instruction to be filled in. - * @param setup - pointer to the setup structure - * @param q_no - which queue for back pressure - * - * Assumes the cmd instruction is pre-allocated, but no fields are filled in. - */ static inline void -octnet_prepare_pci_cmd(struct octeon_instr_64B *cmd, - union octnic_cmd_setup *setup, u32 tag) +octnet_prepare_pci_cmd_o2(struct octeon_device *oct, + union octeon_instr_64B *cmd, + union octnic_cmd_setup *setup, u32 tag) { - struct octeon_instr_ih *ih; + struct octeon_instr_ih2 *ih2; struct octeon_instr_irh *irh; union octnic_packet_params packet_params; + int port; - memset(cmd, 0, sizeof(struct octeon_instr_64B)); + memset(cmd, 0, sizeof(union octeon_instr_64B)); - ih = (struct octeon_instr_ih *)&cmd->ih; + ih2 = (struct octeon_instr_ih2 *)&cmd->cmd2.ih2; /* assume that rflag is cleared so therefore front data will only have - * irh and ossp[1] and ossp[2] for a total of 24 bytes + * irh and ossp[0], ossp[1] for a total of 32 bytes */ - ih->fsz = 24; + ih2->fsz = 24; + + ih2->tagtype = ORDERED_TAG; + ih2->grp = DEFAULT_POW_GRP; - ih->tagtype = ORDERED_TAG; - ih->grp = DEFAULT_POW_GRP; + port = (int)oct->instr_queue[setup->s.iq_no]->txpciq.s.port; if (tag) - ih->tag = tag; + ih2->tag = tag; else - ih->tag = LIO_DATA(setup->s.ifidx); + ih2->tag = LIO_DATA(port); - ih->raw = 1; - ih->qos = (setup->s.ifidx & 3) + 4; /* map qos based on interface */ + ih2->raw = 1; + ih2->qos = (port & 3) + 4; /* map qos based on interface */ if (!setup->s.gather) { - ih->dlengsz = setup->s.u.datasize; + ih2->dlengsz = setup->s.u.datasize; } else { - ih->gather = 1; - ih->dlengsz = setup->s.u.gatherptrs; + ih2->gather = 1; + ih2->dlengsz = setup->s.u.gatherptrs; } - irh = (struct octeon_instr_irh *)&cmd->irh; + irh = (struct octeon_instr_irh *)&cmd->cmd2.irh; irh->opcode = OPCODE_NIC; irh->subcode = OPCODE_NIC_NW_DATA; packet_params.u32 = 0; - if (setup->s.cksum_offset) { - packet_params.s.csoffset = setup->s.cksum_offset; - packet_params.s.ipv4opts_ipv6exthdr = - setup->s.ipv4opts_ipv6exthdr; + packet_params.s.ip_csum = setup->s.ip_csum; + packet_params.s.transport_csum = setup->s.transport_csum; + packet_params.s.tnl_csum = setup->s.tnl_csum; + packet_params.s.tsflag = setup->s.timestamp; + + irh->ossp = packet_params.u32; +} + +static inline void +octnet_prepare_pci_cmd_o3(struct octeon_device *oct, + union octeon_instr_64B *cmd, + union octnic_cmd_setup *setup, u32 tag) +{ + struct octeon_instr_irh *irh; + struct octeon_instr_ih3 *ih3; + struct octeon_instr_pki_ih3 *pki_ih3; + union octnic_packet_params packet_params; + int port; + + memset(cmd, 0, sizeof(union octeon_instr_64B)); + + ih3 = (struct octeon_instr_ih3 *)&cmd->cmd3.ih3; + pki_ih3 = (struct octeon_instr_pki_ih3 *)&cmd->cmd3.pki_ih3; + + /* assume that rflag is cleared so therefore front data will only have + * irh and ossp[1] and ossp[2] for a total of 24 bytes + */ + ih3->pkind = oct->instr_queue[setup->s.iq_no]->txpciq.s.pkind; + /*PKI IH*/ + ih3->fsz = 24 + 8; + + if (!setup->s.gather) { + ih3->dlengsz = setup->s.u.datasize; + } else { + ih3->gather = 1; + ih3->dlengsz = setup->s.u.gatherptrs; } + pki_ih3->w = 1; + pki_ih3->raw = 1; + pki_ih3->utag = 1; + pki_ih3->utt = 1; + pki_ih3->uqpg = oct->instr_queue[setup->s.iq_no]->txpciq.s.use_qpg; + + port = (int)oct->instr_queue[setup->s.iq_no]->txpciq.s.port; + + if (tag) + pki_ih3->tag = tag; + else + pki_ih3->tag = LIO_DATA(port); + + pki_ih3->tagtype = ORDERED_TAG; + pki_ih3->qpg = oct->instr_queue[setup->s.iq_no]->txpciq.s.qpg; + pki_ih3->pm = 0x7; /*0x7 - meant for Parse nothing, uninterpreted*/ + pki_ih3->sl = 8; /* sl will be sizeof(pki_ih3)*/ + + irh = (struct octeon_instr_irh *)&cmd->cmd3.irh; + + irh->opcode = OPCODE_NIC; + irh->subcode = OPCODE_NIC_NW_DATA; + + packet_params.u32 = 0; + packet_params.s.ip_csum = setup->s.ip_csum; + packet_params.s.transport_csum = setup->s.transport_csum; packet_params.s.tnl_csum = setup->s.tnl_csum; - packet_params.s.ifidx = setup->s.ifidx; packet_params.s.tsflag = setup->s.timestamp; irh->ossp = packet_params.u32; } +/** Utility function to prepare a 64B NIC instruction based on a setup command + * @param cmd - pointer to instruction to be filled in. + * @param setup - pointer to the setup structure + * @param q_no - which queue for back pressure + * + * Assumes the cmd instruction is pre-allocated, but no fields are filled in. + */ +static inline void +octnet_prepare_pci_cmd(struct octeon_device *oct, union octeon_instr_64B *cmd, + union octnic_cmd_setup *setup, u32 tag) +{ + if (OCTEON_CN6XXX(oct)) + octnet_prepare_pci_cmd_o2(oct, cmd, setup, tag); + else + octnet_prepare_pci_cmd_o3(oct, cmd, setup, tag); +} + /** Allocate and a soft command with space for a response immediately following * the commnad. * @param oct - octeon device pointer @@ -198,8 +267,8 @@ octnet_prepare_pci_cmd(struct octeon_instr_64B *cmd, */ void * octeon_alloc_soft_command_resp(struct octeon_device *oct, - struct octeon_instr_64B *cmd, - size_t rdatasize); + union octeon_instr_64B *cmd, + u32 rdatasize); /** Send a NIC data packet to the device * @param oct - octeon device pointer @@ -214,14 +283,11 @@ int octnet_send_nic_data_pkt(struct octeon_device *oct, /** Send a NIC control packet to the device * @param oct - octeon device pointer * @param nctrl - control structure with command, timout, and callback info - * @param nparams - response control structure - * * @returns IQ_FAILED if it failed to add to the input queue. IQ_STOP if it the * queue should be stopped, and IQ_SEND_OK if it sent okay. */ int octnet_send_nic_ctrl_pkt(struct octeon_device *oct, - struct octnic_ctrl_pkt *nctrl, - struct octnic_ctrl_params nparams); + struct octnic_ctrl_pkt *nctrl); #endif diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index 931391574048..8649677b2411 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -69,12 +69,16 @@ static inline int IQ_INSTR_MODE_64B(struct octeon_device *oct, int iq_no) /* Return 0 on success, 1 on failure */ int octeon_init_instr_queue(struct octeon_device *oct, - u32 iq_no, u32 num_descs) + union oct_txpciq txpciq, + u32 num_descs) { struct octeon_instr_queue *iq; struct octeon_iq_config *conf = NULL; + u32 iq_no = (u32)txpciq.s.q_no; u32 q_size; struct cavium_wq *db_wq; + int orig_node = dev_to_node(&oct->pci_dev->dev); + int numa_node = cpu_to_node(iq_no % num_online_cpus()); if (OCTEON_CN6XXX(oct)) conf = &(CFG_GET_IQ_CFG(CHIP_FIELD(oct, cn6xxx, conf))); @@ -95,9 +99,15 @@ int octeon_init_instr_queue(struct octeon_device *oct, q_size = (u32)conf->instr_type * num_descs; iq = oct->instr_queue[iq_no]; + iq->oct_dev = oct; + set_dev_node(&oct->pci_dev->dev, numa_node); iq->base_addr = lio_dma_alloc(oct, q_size, (dma_addr_t *)&iq->base_addr_dma); + set_dev_node(&oct->pci_dev->dev, orig_node); + if (!iq->base_addr) + iq->base_addr = lio_dma_alloc(oct, q_size, + (dma_addr_t *)&iq->base_addr_dma); if (!iq->base_addr) { dev_err(&oct->pci_dev->dev, "Cannot allocate memory for instr queue %d\n", iq_no); @@ -109,7 +119,11 @@ int octeon_init_instr_queue(struct octeon_device *oct, /* Initialize a list to holds requests that have been posted to Octeon * but has yet to be fetched by octeon */ - iq->request_list = vmalloc(sizeof(*iq->request_list) * num_descs); + iq->request_list = vmalloc_node((sizeof(*iq->request_list) * num_descs), + numa_node); + if (!iq->request_list) + iq->request_list = vmalloc(sizeof(*iq->request_list) * + num_descs); if (!iq->request_list) { lio_dma_free(oct, q_size, iq->base_addr, iq->base_addr_dma); dev_err(&oct->pci_dev->dev, "Alloc failed for IQ[%d] nr free list\n", @@ -122,7 +136,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, dev_dbg(&oct->pci_dev->dev, "IQ[%d]: base: %p basedma: %llx count: %d\n", iq_no, iq->base_addr, iq->base_addr_dma, iq->max_count); - iq->iq_no = iq_no; + iq->txpciq.u64 = txpciq.u64; iq->fill_threshold = (u32)conf->db_min; iq->fill_cnt = 0; iq->host_write_index = 0; @@ -189,26 +203,38 @@ int octeon_delete_instr_queue(struct octeon_device *oct, u32 iq_no) /* Return 0 on success, 1 on failure */ int octeon_setup_iq(struct octeon_device *oct, - u32 iq_no, + int ifidx, + int q_index, + union oct_txpciq txpciq, u32 num_descs, void *app_ctx) { + u32 iq_no = (u32)txpciq.s.q_no; + int numa_node = cpu_to_node(iq_no % num_online_cpus()); + if (oct->instr_queue[iq_no]) { dev_dbg(&oct->pci_dev->dev, "IQ is in use. Cannot create the IQ: %d again\n", iq_no); + oct->instr_queue[iq_no]->txpciq.u64 = txpciq.u64; oct->instr_queue[iq_no]->app_ctx = app_ctx; return 0; } oct->instr_queue[iq_no] = - vmalloc(sizeof(struct octeon_instr_queue)); + vmalloc_node(sizeof(struct octeon_instr_queue), numa_node); + if (!oct->instr_queue[iq_no]) + oct->instr_queue[iq_no] = + vmalloc(sizeof(struct octeon_instr_queue)); if (!oct->instr_queue[iq_no]) return 1; memset(oct->instr_queue[iq_no], 0, sizeof(struct octeon_instr_queue)); + oct->instr_queue[iq_no]->q_index = q_index; oct->instr_queue[iq_no]->app_ctx = app_ctx; - if (octeon_init_instr_queue(oct, iq_no, num_descs)) { + oct->instr_queue[iq_no]->ifidx = ifidx; + + if (octeon_init_instr_queue(oct, txpciq, num_descs)) { vfree(oct->instr_queue[iq_no]); oct->instr_queue[iq_no] = NULL; return 1; @@ -395,7 +421,7 @@ lio_process_iq_request_list(struct octeon_device *oct, case REQTYPE_SOFT_COMMAND: sc = buf; - irh = (struct octeon_instr_irh *)&sc->cmd.irh; + irh = (struct octeon_instr_irh *)&sc->cmd.cmd2.irh; if (irh->rflag) { /* We're expecting a response from Octeon. * It's up to lio_process_ordered_list() to @@ -558,7 +584,7 @@ octeon_prepare_soft_command(struct octeon_device *oct, u64 ossp1) { struct octeon_config *oct_cfg; - struct octeon_instr_ih *ih; + struct octeon_instr_ih2 *ih2; struct octeon_instr_irh *irh; struct octeon_instr_rdp *rdp; @@ -567,73 +593,69 @@ octeon_prepare_soft_command(struct octeon_device *oct, oct_cfg = octeon_get_conf(oct); - ih = (struct octeon_instr_ih *)&sc->cmd.ih; - ih->tagtype = ATOMIC_TAG; - ih->tag = LIO_CONTROL; - ih->raw = 1; - ih->grp = CFG_GET_CTRL_Q_GRP(oct_cfg); + ih2 = (struct octeon_instr_ih2 *)&sc->cmd.cmd2.ih2; + ih2->tagtype = ATOMIC_TAG; + ih2->tag = LIO_CONTROL; + ih2->raw = 1; + ih2->grp = CFG_GET_CTRL_Q_GRP(oct_cfg); if (sc->datasize) { - ih->dlengsz = sc->datasize; - ih->rs = 1; + ih2->dlengsz = sc->datasize; + ih2->rs = 1; } - irh = (struct octeon_instr_irh *)&sc->cmd.irh; + irh = (struct octeon_instr_irh *)&sc->cmd.cmd2.irh; irh->opcode = opcode; irh->subcode = subcode; /* opcode/subcode specific parameters (ossp) */ irh->ossp = irh_ossp; - sc->cmd.ossp[0] = ossp0; - sc->cmd.ossp[1] = ossp1; + sc->cmd.cmd2.ossp[0] = ossp0; + sc->cmd.cmd2.ossp[1] = ossp1; if (sc->rdatasize) { - rdp = (struct octeon_instr_rdp *)&sc->cmd.rdp; + rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd2.rdp; rdp->pcie_port = oct->pcie_port; rdp->rlen = sc->rdatasize; irh->rflag = 1; - irh->len = 4; - ih->fsz = 40; /* irh+ossp[0]+ossp[1]+rdp+rptr = 40 bytes */ + ih2->fsz = 40; /* irh+ossp[0]+ossp[1]+rdp+rptr = 40 bytes */ } else { irh->rflag = 0; - irh->len = 2; - ih->fsz = 24; /* irh + ossp[0] + ossp[1] = 24 bytes */ + ih2->fsz = 24; /* irh + ossp[0] + ossp[1] = 24 bytes */ } - - while (!(oct->io_qmask.iq & (1 << sc->iq_no))) - sc->iq_no++; } int octeon_send_soft_command(struct octeon_device *oct, struct octeon_soft_command *sc) { - struct octeon_instr_ih *ih; + struct octeon_instr_ih2 *ih2; struct octeon_instr_irh *irh; struct octeon_instr_rdp *rdp; + u32 len; - ih = (struct octeon_instr_ih *)&sc->cmd.ih; - if (ih->dlengsz) { - BUG_ON(!sc->dmadptr); - sc->cmd.dptr = sc->dmadptr; + ih2 = (struct octeon_instr_ih2 *)&sc->cmd.cmd2.ih2; + if (ih2->dlengsz) { + WARN_ON(!sc->dmadptr); + sc->cmd.cmd2.dptr = sc->dmadptr; } - - irh = (struct octeon_instr_irh *)&sc->cmd.irh; + irh = (struct octeon_instr_irh *)&sc->cmd.cmd2.irh; if (irh->rflag) { BUG_ON(!sc->dmarptr); BUG_ON(!sc->status_word); *sc->status_word = COMPLETION_WORD_INIT; - rdp = (struct octeon_instr_rdp *)&sc->cmd.rdp; + rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd2.rdp; - sc->cmd.rptr = sc->dmarptr; + sc->cmd.cmd2.rptr = sc->dmarptr; } + len = (u32)ih2->dlengsz; if (sc->wait_time) sc->timeout = jiffies + sc->wait_time; - return octeon_send_command(oct, sc->iq_no, 1, &sc->cmd, sc, - (u32)ih->dlengsz, REQTYPE_SOFT_COMMAND); + return (octeon_send_command(oct, sc->iq_no, 1, &sc->cmd, sc, + len, REQTYPE_SOFT_COMMAND)); } int octeon_setup_sc_buffer_pool(struct octeon_device *oct) diff --git a/drivers/net/ethernet/cavium/liquidio/response_manager.c b/drivers/net/ethernet/cavium/liquidio/response_manager.c index 6287a7c72b9e..e2e9103e6ebd 100644 --- a/drivers/net/ethernet/cavium/liquidio/response_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/response_manager.c @@ -85,6 +85,7 @@ int lio_process_ordered_list(struct octeon_device *octeon_dev, u32 status; u64 status64; struct octeon_instr_rdp *rdp; + u64 rptr; ordered_sc_list = &octeon_dev->response_list[OCTEON_ORDERED_SC_LIST]; @@ -102,7 +103,8 @@ int lio_process_ordered_list(struct octeon_device *octeon_dev, sc = (struct octeon_soft_command *)ordered_sc_list-> head.next; - rdp = (struct octeon_instr_rdp *)&sc->cmd.rdp; + rdp = (struct octeon_instr_rdp *)&sc->cmd.cmd2.rdp; + rptr = sc->cmd.cmd2.rptr; status = OCTEON_REQUEST_PENDING; @@ -110,7 +112,7 @@ int lio_process_ordered_list(struct octeon_device *octeon_dev, * to where rptr is pointing to */ dma_sync_single_for_cpu(&octeon_dev->pci_dev->dev, - sc->cmd.rptr, rdp->rlen, + rptr, rdp->rlen, DMA_FROM_DEVICE); status64 = *sc->status_word; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 477db477b133..c45de49dc963 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -64,6 +64,7 @@ #include <net/bonding.h> #include <net/addrconf.h> #include <asm/uaccess.h> +#include <linux/crash_dump.h> #include "cxgb4.h" #include "t4_regs.h" @@ -206,7 +207,7 @@ static int rx_dma_offset = 2; static unsigned int num_vf[NUM_OF_PF_WITH_SRIOV]; module_param_array(num_vf, uint, NULL, 0644); -MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3"); +MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3, deprecated parameter - please use the pci sysfs interface."); #endif /* TX Queue select used to determine what algorithm to use for selecting TX @@ -460,11 +461,8 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok) struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; - if (!(dev->flags & IFF_PROMISC)) { - __dev_uc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync); - if (!(dev->flags & IFF_ALLMULTI)) - __dev_mc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync); - } + __dev_uc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync); + __dev_mc_sync(dev, cxgb4_mac_sync, cxgb4_mac_unsync); return t4_set_rxmode(adapter, adapter->mbox, pi->viid, mtu, (dev->flags & IFF_PROMISC) ? 1 : 0, @@ -3735,7 +3733,8 @@ static int adap_init0(struct adapter *adap) return ret; /* Contact FW, advertising Master capability */ - ret = t4_fw_hello(adap, adap->mbox, adap->mbox, MASTER_MAY, &state); + ret = t4_fw_hello(adap, adap->mbox, adap->mbox, + is_kdump_kernel() ? MASTER_MUST : MASTER_MAY, &state); if (ret < 0) { dev_err(adap->pdev_dev, "could not connect to FW, error %d\n", ret); @@ -4366,6 +4365,11 @@ static void cfg_queues(struct adapter *adap) if (q10g > netif_get_num_default_rss_queues()) q10g = netif_get_num_default_rss_queues(); + /* Reduce memory usage in kdump environment, disable all offload. + */ + if (is_kdump_kernel()) + adap->params.offload = 0; + for_each_port(adap, i) { struct port_info *pi = adap2pinfo(adap, i); @@ -4829,6 +4833,60 @@ static int get_chip_type(struct pci_dev *pdev, u32 pl_rev) return -EINVAL; } +#ifdef CONFIG_PCI_IOV +static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs) +{ + int err = 0; + int current_vfs = pci_num_vf(pdev); + u32 pcie_fw; + void __iomem *regs; + + regs = pci_ioremap_bar(pdev, 0); + if (!regs) { + dev_err(&pdev->dev, "cannot map device registers\n"); + return -ENOMEM; + } + + pcie_fw = readl(regs + PCIE_FW_A); + iounmap(regs); + /* Check if cxgb4 is the MASTER and fw is initialized */ + if (!(pcie_fw & PCIE_FW_INIT_F) || + !(pcie_fw & PCIE_FW_MASTER_VLD_F) || + PCIE_FW_MASTER_G(pcie_fw) != 4) { + dev_warn(&pdev->dev, + "cxgb4 driver needs to be MASTER to support SRIOV\n"); + return -EOPNOTSUPP; + } + + /* If any of the VF's is already assigned to Guest OS, then + * SRIOV for the same cannot be modified + */ + if (current_vfs && pci_vfs_assigned(pdev)) { + dev_err(&pdev->dev, + "Cannot modify SR-IOV while VFs are assigned\n"); + num_vfs = current_vfs; + return num_vfs; + } + + /* Disable SRIOV when zero is passed. + * One needs to disable SRIOV before modifying it, else + * stack throws the below warning: + * " 'n' VFs already enabled. Disable before enabling 'm' VFs." + */ + if (!num_vfs) { + pci_disable_sriov(pdev); + return num_vfs; + } + + if (num_vfs != current_vfs) { + err = pci_enable_sriov(pdev, num_vfs); + if (err) + return err; + } + return num_vfs; +} +#endif + static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int func, i, err, s_qpp, qpp, num_seg; @@ -5162,11 +5220,16 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) sriov: #ifdef CONFIG_PCI_IOV - if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0) + if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0) { + dev_warn(&pdev->dev, + "Enabling SR-IOV VFs using the num_vf module " + "parameter is deprecated - please use the pci sysfs " + "interface instead.\n"); if (pci_enable_sriov(pdev, num_vf[func]) == 0) dev_info(&pdev->dev, "instantiated %u virtual functions\n", num_vf[func]); + } #endif return 0; @@ -5259,6 +5322,9 @@ static struct pci_driver cxgb4_driver = { .probe = init_one, .remove = remove_one, .shutdown = remove_one, +#ifdef CONFIG_PCI_IOV + .sriov_configure = cxgb4_iov_configure, +#endif .err_handler = &cxgb4_eeh, }; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 04fc6f6d1e25..8d9b2cb74aa2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -937,12 +937,8 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok) { struct port_info *pi = netdev_priv(dev); - if (!(dev->flags & IFF_PROMISC)) { - __dev_uc_sync(dev, cxgb4vf_mac_sync, cxgb4vf_mac_unsync); - if (!(dev->flags & IFF_ALLMULTI)) - __dev_mc_sync(dev, cxgb4vf_mac_sync, - cxgb4vf_mac_unsync); - } + __dev_uc_sync(dev, cxgb4vf_mac_sync, cxgb4vf_mac_unsync); + __dev_mc_sync(dev, cxgb4vf_mac_sync, cxgb4vf_mac_unsync); return t4vf_set_rxmode(pi->adapter, pi->viid, -1, (dev->flags & IFF_PROMISC) != 0, (dev->flags & IFF_ALLMULTI) != 0, diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index 60383040d6c6..c363b58552e9 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -53,6 +53,8 @@ #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/of.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/kernel.h> #include <linux/types.h> @@ -1895,9 +1897,17 @@ static int cs89x0_platform_remove(struct platform_device *pdev) return 0; } +static const struct __maybe_unused of_device_id cs89x0_match[] = { + { .compatible = "cirrus,cs8900", }, + { .compatible = "cirrus,cs8920", }, + { }, +}; +MODULE_DEVICE_TABLE(of, cs89x0_match); + static struct platform_driver cs89x0_driver = { .driver = { - .name = DRV_NAME, + .name = DRV_NAME, + .of_match_table = of_match_ptr(cs89x0_match), }, .remove = cs89x0_platform_remove, }; diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c index dbd4ecc205dc..963838d4fac1 100644 --- a/drivers/net/phy/mdio-mux.c +++ b/drivers/net/phy/mdio-mux.c @@ -115,6 +115,7 @@ int mdio_mux_init(struct device *dev, goto err_parent_bus; } } else { + parent_bus_node = NULL; parent_bus = mux_bus; } @@ -184,8 +185,7 @@ int mdio_mux_init(struct device *dev, put_device(&pb->mii_bus->dev); err_parent_bus: - if (!mux_bus) - of_node_put(parent_bus_node); + of_node_put(parent_bus_node); return ret_val; } EXPORT_SYMBOL_GPL(mdio_mux_init); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8cc6bf4f5ba3..4884802e0af1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1254,6 +1254,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EFAULT; } + err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun)); + if (err) { + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); + kfree_skb(skb); + return -EINVAL; + } + switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { @@ -1280,13 +1287,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, break; } - err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun)); - if (err) { - this_cpu_inc(tun->pcpu_stats->rx_frame_errors); - kfree_skb(skb); - return -EINVAL; - } - /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = msg_control; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index b4d746943bc5..32173aa9208e 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -378,23 +378,37 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) } /* holding rtnl */ -static void vrf_rt6_release(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct rt6_info *rt6_local = rtnl_dereference(vrf->rt6_local); + struct net *net = dev_net(dev); + struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); RCU_INIT_POINTER(vrf->rt6_local, NULL); synchronize_rcu(); - if (rt6) - dst_release(&rt6->dst); + /* move dev in dst's to loopback so this VRF device can be deleted + * - based on dst_ifdown + */ + if (rt6) { + dst = &rt6->dst; + dev_put(dst->dev); + dst->dev = net->loopback_dev; + dev_hold(dst->dev); + dst_release(dst); + } if (rt6_local) { if (rt6_local->rt6i_idev) in6_dev_put(rt6_local->rt6i_idev); - dst_release(&rt6_local->dst); + dst = &rt6_local->dst; + dev_put(dst->dev); + dst->dev = net->loopback_dev; + dev_hold(dst->dev); + dst_release(dst); } } @@ -449,7 +463,7 @@ out: return rc; } #else -static void vrf_rt6_release(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } @@ -518,20 +532,35 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) } /* holding rtnl */ -static void vrf_rtable_release(struct net_vrf *vrf) +static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct rtable *rth_local = rtnl_dereference(vrf->rth_local); + struct net *net = dev_net(dev); + struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); RCU_INIT_POINTER(vrf->rth_local, NULL); synchronize_rcu(); - if (rth) - dst_release(&rth->dst); + /* move dev in dst's to loopback so this VRF device can be deleted + * - based on dst_ifdown + */ + if (rth) { + dst = &rth->dst; + dev_put(dst->dev); + dst->dev = net->loopback_dev; + dev_hold(dst->dev); + dst_release(dst); + } - if (rth_local) - dst_release(&rth_local->dst); + if (rth_local) { + dst = &rth_local->dst; + dev_put(dst->dev); + dst->dev = net->loopback_dev; + dev_hold(dst->dev); + dst_release(dst); + } } static int vrf_rtable_create(struct net_device *dev) @@ -633,8 +662,8 @@ static void vrf_dev_uninit(struct net_device *dev) struct net_device *port_dev; struct list_head *iter; - vrf_rtable_release(vrf); - vrf_rt6_release(vrf); + vrf_rtable_release(dev, vrf); + vrf_rt6_release(dev, vrf); netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); @@ -669,7 +698,7 @@ static int vrf_dev_init(struct net_device *dev) return 0; out_rth: - vrf_rtable_release(vrf); + vrf_rtable_release(dev, vrf); out_stats: free_percpu(dev->dstats); dev->dstats = NULL; @@ -785,9 +814,63 @@ out: return rc; } +static struct rt6_info *vrf_ip6_route_lookup(struct net *net, + const struct net_device *dev, + struct flowi6 *fl6, + int ifindex, + int flags) +{ + struct net_vrf *vrf = netdev_priv(dev); + struct fib6_table *table = NULL; + struct rt6_info *rt6; + + rcu_read_lock(); + + /* fib6_table does not have a refcnt and can not be freed */ + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) + table = rt6->rt6i_table; + + rcu_read_unlock(); + + if (!table) + return NULL; + + return ip6_pol_route(net, table, ifindex, fl6, flags); +} + +static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, + int ifindex) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct flowi6 fl6 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, + .flowi6_iif = ifindex, + }; + struct net *net = dev_net(vrf_dev); + struct rt6_info *rt6; + + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, + RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); + if (unlikely(!rt6)) + return; + + if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) + return; + + skb_dst_set(skb, &rt6->dst); +} + static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { + int orig_iif = skb->skb_iif; + bool need_strict; + /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ @@ -798,8 +881,11 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, goto out; } - /* if packet is NDISC keep the ingress interface */ - if (!ipv6_ndisc_frame(skb)) { + /* if packet is NDISC or addressed to multicast or link-local + * then keep the ingress interface + */ + need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); + if (!ipv6_ndisc_frame(skb) && !need_strict) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; @@ -810,6 +896,9 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } + if (need_strict) + vrf_ip6_input_dst(skb, vrf_dev, orig_iif); + out: return skb; } @@ -861,13 +950,37 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - const struct flowi6 *fl6) + struct flowi6 *fl6) { + bool need_strict = rt6_need_strict(&fl6->daddr); + struct net_vrf *vrf = netdev_priv(dev); + struct net *net = dev_net(dev); struct dst_entry *dst = NULL; + struct rt6_info *rt; - if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - struct net_vrf *vrf = netdev_priv(dev); - struct rt6_info *rt; + /* send to link-local or multicast address */ + if (need_strict) { + int flags = RT6_LOOKUP_F_IFACE; + + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; + + dst_hold(dst); + return dst; + } + + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) + dst = &rt->dst; + + } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { rcu_read_lock(); @@ -880,6 +993,10 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, rcu_read_unlock(); } + /* make sure oif is set to VRF device for lookup */ + if (!need_strict) + fl6->flowi6_oif = dev->ifindex; + return dst; } #endif @@ -1011,6 +1128,20 @@ static void vrf_setup(struct net_device *dev) /* don't allow vrf devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; + + /* does not make sense for a VLAN to be added to a vrf device */ + dev->features |= NETIF_F_VLAN_CHALLENGED; + + /* enable offload features */ + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM; + dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; + + dev->hw_features = dev->features; + dev->hw_enc_features = dev->features; + + /* default to no qdisc; user can add if desired */ + dev->priv_flags |= IFF_NO_QUEUE; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e955a2859009..152421cc6f44 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -123,6 +123,10 @@ enum { MLX5_CMD_OP_DRAIN_DCT = 0x712, MLX5_CMD_OP_QUERY_DCT = 0x713, MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION = 0x714, + MLX5_CMD_OP_CREATE_XRQ = 0x717, + MLX5_CMD_OP_DESTROY_XRQ = 0x718, + MLX5_CMD_OP_QUERY_XRQ = 0x719, + MLX5_CMD_OP_ARM_XRQ = 0x71a, MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, @@ -139,6 +143,8 @@ enum { MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773, + MLX5_CMD_OP_SET_RATE_LIMIT = 0x780, + MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781, MLX5_CMD_OP_ALLOC_PD = 0x800, MLX5_CMD_OP_DEALLOC_PD = 0x801, MLX5_CMD_OP_ALLOC_UAR = 0x802, @@ -362,7 +368,8 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { }; struct mlx5_ifc_fte_match_set_misc_bits { - u8 reserved_at_0[0x20]; + u8 reserved_at_0[0x8]; + u8 source_sqn[0x18]; u8 reserved_at_20[0x10]; u8 source_port[0x10]; @@ -508,6 +515,17 @@ struct mlx5_ifc_e_switch_cap_bits { u8 reserved_at_20[0x7e0]; }; +struct mlx5_ifc_qos_cap_bits { + u8 packet_pacing[0x1]; + u8 reserved_0[0x1f]; + u8 reserved_1[0x20]; + u8 packet_pacing_max_rate[0x20]; + u8 packet_pacing_min_rate[0x20]; + u8 reserved_2[0x10]; + u8 packet_pacing_rate_table_size[0x10]; + u8 reserved_3[0x760]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -747,7 +765,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 out_of_seq_cnt[0x1]; u8 vport_counters[0x1]; - u8 reserved_at_182[0x4]; + u8 retransmission_q_counters[0x1]; + u8 reserved_at_183[0x3]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -774,7 +793,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_msg[0x5]; u8 reserved_at_1c8[0x4]; u8 max_tc[0x4]; - u8 reserved_at_1d0[0x6]; + u8 reserved_at_1d0[0x1]; + u8 dcbx[0x1]; + u8 reserved_at_1d2[0x4]; u8 rol_s[0x1]; u8 rol_g[0x1]; u8 reserved_at_1d8[0x1]; @@ -806,7 +827,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 tph[0x1]; u8 rf[0x1]; u8 dct[0x1]; - u8 reserved_at_21b[0x1]; + u8 qos[0x1]; u8 eth_net_offloads[0x1]; u8 roce[0x1]; u8 atomic[0x1]; @@ -932,7 +953,15 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_compression_timeout[0x10]; u8 cqe_compression_max_num[0x10]; - u8 reserved_at_5e0[0x220]; + u8 reserved_at_5e0[0x10]; + u8 tag_matching[0x1]; + u8 rndv_offload_rc[0x1]; + u8 rndv_offload_dc[0x1]; + u8 log_tag_matching_list_sz[0x5]; + u8 reserved_at_5e8[0x3]; + u8 log_max_xrq[0x5]; + + u8 reserved_at_5f0[0x200]; }; enum mlx5_flow_destination_type { @@ -1970,7 +1999,7 @@ struct mlx5_ifc_qpc_bits { u8 reserved_at_560[0x5]; u8 rq_type[0x3]; - u8 srqn_rmpn[0x18]; + u8 srqn_rmpn_xrqn[0x18]; u8 reserved_at_580[0x8]; u8 rmsn[0x18]; @@ -2021,6 +2050,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; struct mlx5_ifc_e_switch_cap_bits e_switch_cap; struct mlx5_ifc_vector_calc_cap_bits vector_calc_cap; + struct mlx5_ifc_qos_cap_bits qos_cap; u8 reserved_at_0[0x8000]; }; @@ -2247,8 +2277,9 @@ struct mlx5_ifc_sqc_bits { u8 reserved_at_40[0x8]; u8 cqn[0x18]; - u8 reserved_at_60[0xa0]; + u8 reserved_at_60[0x90]; + u8 packet_pacing_rate_limit_index[0x10]; u8 tis_lst_sz[0x10]; u8 reserved_at_110[0x10]; @@ -2596,7 +2627,7 @@ struct mlx5_ifc_dctc_bits { u8 reserved_at_98[0x8]; u8 reserved_at_a0[0x8]; - u8 srqn[0x18]; + u8 srqn_xrqn[0x18]; u8 reserved_at_c0[0x8]; u8 pd[0x18]; @@ -2648,6 +2679,7 @@ enum { enum { MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0, MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1, + MLX5_CQ_PERIOD_NUM_MODES }; struct mlx5_ifc_cqc_bits { @@ -2725,6 +2757,54 @@ struct mlx5_ifc_query_adapter_param_block_bits { u8 vsd_contd_psid[16][0x8]; }; +enum { + MLX5_XRQC_STATE_GOOD = 0x0, + MLX5_XRQC_STATE_ERROR = 0x1, +}; + +enum { + MLX5_XRQC_TOPOLOGY_NO_SPECIAL_TOPOLOGY = 0x0, + MLX5_XRQC_TOPOLOGY_TAG_MATCHING = 0x1, +}; + +enum { + MLX5_XRQC_OFFLOAD_RNDV = 0x1, +}; + +struct mlx5_ifc_tag_matching_topology_context_bits { + u8 log_matching_list_sz[0x4]; + u8 reserved_at_4[0xc]; + u8 append_next_index[0x10]; + + u8 sw_phase_cnt[0x10]; + u8 hw_phase_cnt[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_xrqc_bits { + u8 state[0x4]; + u8 rlkey[0x1]; + u8 reserved_at_5[0xf]; + u8 topology[0x4]; + u8 reserved_at_18[0x4]; + u8 offload[0x4]; + + u8 reserved_at_20[0x8]; + u8 user_index[0x18]; + + u8 reserved_at_40[0x8]; + u8 cqn[0x18]; + + u8 reserved_at_60[0xa0]; + + struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; + + u8 reserved_at_180[0x180]; + + struct mlx5_ifc_wq_bits wq; +}; + union mlx5_ifc_modify_field_select_resize_field_select_auto_bits { struct mlx5_ifc_modify_field_select_bits modify_field_select; struct mlx5_ifc_resize_field_select_bits resize_field_select; @@ -3147,6 +3227,30 @@ struct mlx5_ifc_rst2init_qp_in_bits { u8 reserved_at_800[0x80]; }; +struct mlx5_ifc_query_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrqc_bits xrq_context; +}; + +struct mlx5_ifc_query_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_query_xrc_srq_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -3550,7 +3654,27 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 out_of_sequence[0x20]; - u8 reserved_at_1e0[0x620]; + u8 reserved_at_1e0[0x20]; + + u8 duplicate_request[0x20]; + + u8 reserved_at_220[0x20]; + + u8 rnr_nak_retry_err[0x20]; + + u8 reserved_at_260[0x20]; + + u8 packet_seq_err[0x20]; + + u8 reserved_at_2a0[0x20]; + + u8 implied_nak_seq_err[0x20]; + + u8 reserved_at_2e0[0x20]; + + u8 local_ack_timeout_err[0x20]; + + u8 reserved_at_320[0x4e0]; }; struct mlx5_ifc_query_q_counter_in_bits { @@ -5004,6 +5128,28 @@ struct mlx5_ifc_detach_from_mcg_in_bits { u8 multicast_gid[16][0x8]; }; +struct mlx5_ifc_destroy_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_destroy_xrc_srq_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -5589,6 +5735,30 @@ struct mlx5_ifc_dealloc_flow_counter_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_create_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_create_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_xrqc_bits xrq_context; +}; + struct mlx5_ifc_create_xrc_srq_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6130,6 +6300,29 @@ struct mlx5_ifc_attach_to_mcg_in_bits { u8 multicast_gid[16][0x8]; }; +struct mlx5_ifc_arm_xrq_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_arm_xrq_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 xrqn[0x18]; + + u8 reserved_at_60[0x10]; + u8 lwm[0x10]; +}; + struct mlx5_ifc_arm_xrc_srq_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6167,7 +6360,8 @@ struct mlx5_ifc_arm_rq_out_bits { }; enum { - MLX5_ARM_RQ_IN_OP_MOD_SRQ_ = 0x1, + MLX5_ARM_RQ_IN_OP_MOD_SRQ = 0x1, + MLX5_ARM_RQ_IN_OP_MOD_XRQ = 0x2, }; struct mlx5_ifc_arm_rq_in_bits { @@ -6360,6 +6554,30 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits { u8 vxlan_udp_port[0x10]; }; +struct mlx5_ifc_set_rate_limit_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_rate_limit_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 rate_limit_index[0x10]; + + u8 reserved_at_60[0x20]; + + u8 rate_limit[0x20]; +}; + struct mlx5_ifc_access_register_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6484,12 +6702,15 @@ struct mlx5_ifc_pude_reg_bits { }; struct mlx5_ifc_ptys_reg_bits { - u8 reserved_at_0[0x8]; + u8 an_disable_cap[0x1]; + u8 an_disable_admin[0x1]; + u8 reserved_at_2[0x6]; u8 local_port[0x8]; u8 reserved_at_10[0xd]; u8 proto_mask[0x3]; - u8 reserved_at_20[0x40]; + u8 an_status[0x4]; + u8 reserved_at_24[0x3c]; u8 eth_proto_capability[0x20]; @@ -7450,4 +7671,34 @@ struct mlx5_ifc_mcia_reg_bits { u8 dword_11[0x20]; }; +struct mlx5_ifc_dcbx_param_bits { + u8 dcbx_cee_cap[0x1]; + u8 dcbx_ieee_cap[0x1]; + u8 dcbx_standby_cap[0x1]; + u8 reserved_at_0[0x5]; + u8 port_number[0x8]; + u8 reserved_at_10[0xa]; + u8 max_application_table_size[6]; + u8 reserved_at_20[0x15]; + u8 version_oper[0x3]; + u8 reserved_at_38[5]; + u8 version_admin[0x3]; + u8 willing_admin[0x1]; + u8 reserved_at_41[0x3]; + u8 pfc_cap_oper[0x4]; + u8 reserved_at_48[0x4]; + u8 pfc_cap_admin[0x4]; + u8 reserved_at_50[0x4]; + u8 num_of_tc_oper[0x4]; + u8 reserved_at_58[0x4]; + u8 num_of_tc_admin[0x4]; + u8 remote_willing[0x1]; + u8 reserved_at_61[3]; + u8 remote_pfc_cap[4]; + u8 reserved_at_68[0x14]; + u8 remote_num_of_tc[0x4]; + u8 reserved_at_80[0x18]; + u8 error[0x8]; + u8 reserved_at_a0[0x160]; +}; #endif /* MLX5_IFC_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d101e4d904ba..890158e99159 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,6 +1456,8 @@ enum netdev_priv_flags { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @ethtool_ops: Management operations + * @ndisc_ops: Includes callbacks for different IPv6 neighbour + * discovery handling. Necessary for e.g. 6LoWPAN. * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * @@ -1483,8 +1485,7 @@ enum netdev_priv_flags { * @perm_addr: Permanent hw address * @addr_assign_type: Hw address assignment type * @addr_len: Hardware address length - * @neigh_priv_len; Used in neigh_alloc(), - * initialized only in atm/clip.c + * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share @@ -1673,6 +1674,9 @@ struct net_device { #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; #endif +#if IS_ENABLED(CONFIG_IPV6) + const struct ndisc_ops *ndisc_ops; +#endif const struct header_ops *header_ops; diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h new file mode 100644 index 000000000000..562a65e8bcc0 --- /dev/null +++ b/include/linux/ptr_ring.h @@ -0,0 +1,393 @@ +/* + * Definitions for the 'struct ptr_ring' datastructure. + * + * Author: + * Michael S. Tsirkin <mst@redhat.com> + * + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This is a limited-size FIFO maintaining pointers in FIFO order, with + * one CPU producing entries and another consuming entries from a FIFO. + * + * This implementation tries to minimize cache-contention when there is a + * single producer and a single consumer CPU. + */ + +#ifndef _LINUX_PTR_RING_H +#define _LINUX_PTR_RING_H 1 + +#ifdef __KERNEL__ +#include <linux/spinlock.h> +#include <linux/cache.h> +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <asm/errno.h> +#endif + +struct ptr_ring { + int producer ____cacheline_aligned_in_smp; + spinlock_t producer_lock; + int consumer ____cacheline_aligned_in_smp; + spinlock_t consumer_lock; + /* Shared consumer/producer data */ + /* Read-only by both the producer and the consumer */ + int size ____cacheline_aligned_in_smp; /* max entries in queue */ + void **queue; +}; + +/* Note: callers invoking this in a loop must use a compiler barrier, + * for example cpu_relax(). If ring is ever resized, callers must hold + * producer_lock - see e.g. ptr_ring_full. Otherwise, if callers don't hold + * producer_lock, the next call to __ptr_ring_produce may fail. + */ +static inline bool __ptr_ring_full(struct ptr_ring *r) +{ + return r->queue[r->producer]; +} + +static inline bool ptr_ring_full(struct ptr_ring *r) +{ + bool ret; + + spin_lock(&r->producer_lock); + ret = __ptr_ring_full(r); + spin_unlock(&r->producer_lock); + + return ret; +} + +static inline bool ptr_ring_full_irq(struct ptr_ring *r) +{ + bool ret; + + spin_lock_irq(&r->producer_lock); + ret = __ptr_ring_full(r); + spin_unlock_irq(&r->producer_lock); + + return ret; +} + +static inline bool ptr_ring_full_any(struct ptr_ring *r) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&r->producer_lock, flags); + ret = __ptr_ring_full(r); + spin_unlock_irqrestore(&r->producer_lock, flags); + + return ret; +} + +static inline bool ptr_ring_full_bh(struct ptr_ring *r) +{ + bool ret; + + spin_lock_bh(&r->producer_lock); + ret = __ptr_ring_full(r); + spin_unlock_bh(&r->producer_lock); + + return ret; +} + +/* Note: callers invoking this in a loop must use a compiler barrier, + * for example cpu_relax(). Callers must hold producer_lock. + */ +static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) +{ + if (r->queue[r->producer]) + return -ENOSPC; + + r->queue[r->producer++] = ptr; + if (unlikely(r->producer >= r->size)) + r->producer = 0; + return 0; +} + +static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) +{ + int ret; + + spin_lock(&r->producer_lock); + ret = __ptr_ring_produce(r, ptr); + spin_unlock(&r->producer_lock); + + return ret; +} + +static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) +{ + int ret; + + spin_lock_irq(&r->producer_lock); + ret = __ptr_ring_produce(r, ptr); + spin_unlock_irq(&r->producer_lock); + + return ret; +} + +static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&r->producer_lock, flags); + ret = __ptr_ring_produce(r, ptr); + spin_unlock_irqrestore(&r->producer_lock, flags); + + return ret; +} + +static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) +{ + int ret; + + spin_lock_bh(&r->producer_lock); + ret = __ptr_ring_produce(r, ptr); + spin_unlock_bh(&r->producer_lock); + + return ret; +} + +/* Note: callers invoking this in a loop must use a compiler barrier, + * for example cpu_relax(). Callers must take consumer_lock + * if they dereference the pointer - see e.g. PTR_RING_PEEK_CALL. + * If ring is never resized, and if the pointer is merely + * tested, there's no need to take the lock - see e.g. __ptr_ring_empty. + */ +static inline void *__ptr_ring_peek(struct ptr_ring *r) +{ + return r->queue[r->consumer]; +} + +/* Note: callers invoking this in a loop must use a compiler barrier, + * for example cpu_relax(). Callers must take consumer_lock + * if the ring is ever resized - see e.g. ptr_ring_empty. + */ +static inline bool __ptr_ring_empty(struct ptr_ring *r) +{ + return !__ptr_ring_peek(r); +} + +static inline bool ptr_ring_empty(struct ptr_ring *r) +{ + bool ret; + + spin_lock(&r->consumer_lock); + ret = __ptr_ring_empty(r); + spin_unlock(&r->consumer_lock); + + return ret; +} + +static inline bool ptr_ring_empty_irq(struct ptr_ring *r) +{ + bool ret; + + spin_lock_irq(&r->consumer_lock); + ret = __ptr_ring_empty(r); + spin_unlock_irq(&r->consumer_lock); + + return ret; +} + +static inline bool ptr_ring_empty_any(struct ptr_ring *r) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&r->consumer_lock, flags); + ret = __ptr_ring_empty(r); + spin_unlock_irqrestore(&r->consumer_lock, flags); + + return ret; +} + +static inline bool ptr_ring_empty_bh(struct ptr_ring *r) +{ + bool ret; + + spin_lock_bh(&r->consumer_lock); + ret = __ptr_ring_empty(r); + spin_unlock_bh(&r->consumer_lock); + + return ret; +} + +/* Must only be called after __ptr_ring_peek returned !NULL */ +static inline void __ptr_ring_discard_one(struct ptr_ring *r) +{ + r->queue[r->consumer++] = NULL; + if (unlikely(r->consumer >= r->size)) + r->consumer = 0; +} + +static inline void *__ptr_ring_consume(struct ptr_ring *r) +{ + void *ptr; + + ptr = __ptr_ring_peek(r); + if (ptr) + __ptr_ring_discard_one(r); + + return ptr; +} + +static inline void *ptr_ring_consume(struct ptr_ring *r) +{ + void *ptr; + + spin_lock(&r->consumer_lock); + ptr = __ptr_ring_consume(r); + spin_unlock(&r->consumer_lock); + + return ptr; +} + +static inline void *ptr_ring_consume_irq(struct ptr_ring *r) +{ + void *ptr; + + spin_lock_irq(&r->consumer_lock); + ptr = __ptr_ring_consume(r); + spin_unlock_irq(&r->consumer_lock); + + return ptr; +} + +static inline void *ptr_ring_consume_any(struct ptr_ring *r) +{ + unsigned long flags; + void *ptr; + + spin_lock_irqsave(&r->consumer_lock, flags); + ptr = __ptr_ring_consume(r); + spin_unlock_irqrestore(&r->consumer_lock, flags); + + return ptr; +} + +static inline void *ptr_ring_consume_bh(struct ptr_ring *r) +{ + void *ptr; + + spin_lock_bh(&r->consumer_lock); + ptr = __ptr_ring_consume(r); + spin_unlock_bh(&r->consumer_lock); + + return ptr; +} + +/* Cast to structure type and call a function without discarding from FIFO. + * Function must return a value. + * Callers must take consumer_lock. + */ +#define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) + +#define PTR_RING_PEEK_CALL(r, f) ({ \ + typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ + \ + spin_lock(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ + spin_unlock(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v; \ +}) + +#define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ + typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ + \ + spin_lock_irq(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ + spin_unlock_irq(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v; \ +}) + +#define PTR_RING_PEEK_CALL_BH(r, f) ({ \ + typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ + \ + spin_lock_bh(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ + spin_unlock_bh(&(r)->consumer_lock); \ + __PTR_RING_PEEK_CALL_v; \ +}) + +#define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ + typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ + unsigned long __PTR_RING_PEEK_CALL_f;\ + \ + spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ + __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ + spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ + __PTR_RING_PEEK_CALL_v; \ +}) + +static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) +{ + return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); +} + +static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) +{ + r->queue = __ptr_ring_init_queue_alloc(size, gfp); + if (!r->queue) + return -ENOMEM; + + r->size = size; + r->producer = r->consumer = 0; + spin_lock_init(&r->producer_lock); + spin_lock_init(&r->consumer_lock); + + return 0; +} + +static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, + void (*destroy)(void *)) +{ + unsigned long flags; + int producer = 0; + void **queue = __ptr_ring_init_queue_alloc(size, gfp); + void **old; + void *ptr; + + if (!queue) + return -ENOMEM; + + spin_lock_irqsave(&(r)->producer_lock, flags); + + while ((ptr = ptr_ring_consume(r))) + if (producer < size) + queue[producer++] = ptr; + else if (destroy) + destroy(ptr); + + r->size = size; + r->producer = producer; + r->consumer = 0; + old = r->queue; + r->queue = queue; + + spin_unlock_irqrestore(&(r)->producer_lock, flags); + + kfree(old); + + return 0; +} + +static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) +{ + void *ptr; + + if (destroy) + while ((ptr = ptr_ring_consume(r))) + destroy(ptr); + kfree(r->queue); +} + +#endif /* _LINUX_PTR_RING_H */ diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c006cc900c44..2daece8979f7 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -89,8 +89,9 @@ void net_inc_egress_queue(void); void net_dec_egress_queue(void); #endif -extern void rtnetlink_init(void); -extern void __rtnl_unlock(void); +void rtnetlink_init(void); +void __rtnl_unlock(void); +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); #define ASSERT_RTNL() do { \ if (unlikely(!rtnl_is_locked())) { \ diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h new file mode 100644 index 000000000000..678bfbf78ac4 --- /dev/null +++ b/include/linux/skb_array.h @@ -0,0 +1,169 @@ +/* + * Definitions for the 'struct skb_array' datastructure. + * + * Author: + * Michael S. Tsirkin <mst@redhat.com> + * + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * Limited-size FIFO of skbs. Can be used more or less whenever + * sk_buff_head can be used, except you need to know the queue size in + * advance. + * Implemented as a type-safe wrapper around ptr_ring. + */ + +#ifndef _LINUX_SKB_ARRAY_H +#define _LINUX_SKB_ARRAY_H 1 + +#ifdef __KERNEL__ +#include <linux/ptr_ring.h> +#include <linux/skbuff.h> +#include <linux/if_vlan.h> +#endif + +struct skb_array { + struct ptr_ring ring; +}; + +/* Might be slightly faster than skb_array_full below, but callers invoking + * this in a loop must use a compiler barrier, for example cpu_relax(). + */ +static inline bool __skb_array_full(struct skb_array *a) +{ + return __ptr_ring_full(&a->ring); +} + +static inline bool skb_array_full(struct skb_array *a) +{ + return ptr_ring_full(&a->ring); +} + +static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) +{ + return ptr_ring_produce(&a->ring, skb); +} + +static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) +{ + return ptr_ring_produce_irq(&a->ring, skb); +} + +static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) +{ + return ptr_ring_produce_bh(&a->ring, skb); +} + +static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) +{ + return ptr_ring_produce_any(&a->ring, skb); +} + +/* Might be slightly faster than skb_array_empty below, but only safe if the + * array is never resized. Also, callers invoking this in a loop must take care + * to use a compiler barrier, for example cpu_relax(). + */ +static inline bool __skb_array_empty(struct skb_array *a) +{ + return !__ptr_ring_peek(&a->ring); +} + +static inline bool skb_array_empty(struct skb_array *a) +{ + return ptr_ring_empty(&a->ring); +} + +static inline bool skb_array_empty_bh(struct skb_array *a) +{ + return ptr_ring_empty_bh(&a->ring); +} + +static inline bool skb_array_empty_irq(struct skb_array *a) +{ + return ptr_ring_empty_irq(&a->ring); +} + +static inline bool skb_array_empty_any(struct skb_array *a) +{ + return ptr_ring_empty_any(&a->ring); +} + +static inline struct sk_buff *skb_array_consume(struct skb_array *a) +{ + return ptr_ring_consume(&a->ring); +} + +static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) +{ + return ptr_ring_consume_irq(&a->ring); +} + +static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) +{ + return ptr_ring_consume_any(&a->ring); +} + +static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) +{ + return ptr_ring_consume_bh(&a->ring); +} + +static inline int __skb_array_len_with_tag(struct sk_buff *skb) +{ + if (likely(skb)) { + int len = skb->len; + + if (skb_vlan_tag_present(skb)) + len += VLAN_HLEN; + + return len; + } else { + return 0; + } +} + +static inline int skb_array_peek_len(struct skb_array *a) +{ + return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); +} + +static inline int skb_array_peek_len_irq(struct skb_array *a) +{ + return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); +} + +static inline int skb_array_peek_len_bh(struct skb_array *a) +{ + return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); +} + +static inline int skb_array_peek_len_any(struct skb_array *a) +{ + return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); +} + +static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp) +{ + return ptr_ring_init(&a->ring, size, gfp); +} + +void __skb_array_destroy_skb(void *ptr) +{ + kfree_skb(ptr); +} + +int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) +{ + return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); +} + +static inline void skb_array_cleanup(struct skb_array *a) +{ + ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); +} + +#endif /* _LINUX_SKB_ARRAY_H */ diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index da84cf920b78..5ab4c9901ccc 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -141,6 +141,16 @@ struct lowpan_dev { u8 priv[0] __aligned(sizeof(void *)); }; +struct lowpan_802154_neigh { + __le16 short_addr; +}; + +static inline +struct lowpan_802154_neigh *lowpan_802154_neigh(void *neigh_priv) +{ + return neigh_priv; +} + static inline struct lowpan_dev *lowpan_dev(const struct net_device *dev) { @@ -244,6 +254,12 @@ static inline bool lowpan_fetch_skb(struct sk_buff *skb, void *data, return false; } +static inline bool lowpan_802154_is_valid_src_short_addr(__le16 addr) +{ + /* First bit of addr is multicast, reserved or 802.15.4 specific */ + return !(addr & cpu_to_le16(0x8000)); +} + static inline void lowpan_push_hc_data(u8 **hc_ptr, const void *data, const size_t len) { diff --git a/include/net/act_api.h b/include/net/act_api.h index db218a12efb5..fb82b5b5d9e7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -155,8 +155,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct tc_action *a); int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index); u32 tcf_hash_new_index(struct tc_action_net *tn); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind); +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind); int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action *a, int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 730d856683e5..9826d3a9464c 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -94,6 +94,16 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags); + +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft); + static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 54c779416eec..f55bf3d294aa 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -76,6 +76,8 @@ static inline struct dst_entry *ip6_route_output(struct net *net, struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, int flags); +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int ifindex, struct flowi6 *fl6, int flags); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index dbf444428437..9222678426a1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -132,6 +132,7 @@ struct ip_tunnel { int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; + bool ignore_df; }; #define TUNNEL_CSUM __cpu_to_be16(0x01) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 34f33eb96a5e..f8a416ec674c 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -38,7 +38,7 @@ struct l3mdev_ops { /* IPv6 ops */ struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, - const struct flowi6 *fl6); + struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -139,7 +139,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6); +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -225,7 +225,7 @@ static inline int l3mdev_get_saddr(struct net *net, int ifindex, } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, const struct flowi6 *fl6) +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) { return NULL; } diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 2d8edaad29cb..be1fe2283254 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,6 +35,7 @@ enum { ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ ND_OPT_DNSSL = 31, /* RFC6106 */ + ND_OPT_6CO = 34, /* RFC6775 */ __ND_OPT_MAX }; @@ -53,11 +54,21 @@ enum { #include <net/neighbour.h> +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(val, level, fmt, ...) \ +do { \ + if (val <= ND_DEBUG) \ + net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ +} while (0) + struct ctl_table; struct inet6_dev; struct net_device; struct net_proto_family; struct sk_buff; +struct prefix_info; extern struct neigh_table nd_tbl; @@ -99,20 +110,201 @@ struct ndisc_options { #endif struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + struct nd_opt_hdr *nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR + 1]; +#endif }; -#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] -#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] -#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] -#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] -#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] -#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts); +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad); + +#define NDISC_OPS_REDIRECT_DATA_SPACE 2 + +/* + * This structure defines the hooks for IPv6 neighbour discovery. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * int (*is_useropt)(u8 nd_opt_type): + * This function is called when IPv6 decide RA userspace options. if + * this function returns 1 then the option given by nd_opt_type will + * be handled as userspace option additional to the IPv6 options. + * + * int (*parse_options)(const struct net_device *dev, + * struct nd_opt_hdr *nd_opt, + * struct ndisc_options *ndopts): + * This function is called while parsing ndisc ops and put each position + * as pointer into ndopts. If this function return unequal 0, then this + * function took care about the ndisc option, if 0 then the IPv6 ndisc + * option parser will take care about that option. + * + * void (*update)(const struct net_device *dev, struct neighbour *n, + * u32 flags, u8 icmp6_type, + * const struct ndisc_options *ndopts): + * This function is called when IPv6 ndisc updates the neighbour cache + * entry. Additional options which can be updated may be previously + * parsed by parse_opts callback and accessible over ndopts parameter. + * + * int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, + * struct neighbour *neigh, u8 *ha_buf, + * u8 **ha): + * This function is called when the necessary option space will be + * calculated before allocating a skb. The parameters neigh, ha_buf + * abd ha are available on NDISC_REDIRECT messages only. + * + * void (*fill_addr_option)(const struct net_device *dev, + * struct sk_buff *skb, u8 icmp6_type, + * const u8 *ha): + * This function is called when the skb will finally fill the option + * fields inside skb. NOTE: this callback should fill the option + * fields to the skb which are previously indicated by opt_space + * parameter. That means the decision to add such option should + * not lost between these two callbacks, e.g. protected by interface + * up state. + * + * void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, + * const struct prefix_info *pinfo, + * struct inet6_dev *in6_dev, + * struct in6_addr *addr, + * int addr_type, u32 addr_flags, + * bool sllao, bool tokenized, + * __u32 valid_lft, u32 prefered_lft, + * bool dev_addr_generated): + * This function is called when a RA messages is received with valid + * PIO option fields and an IPv6 address will be added to the interface + * for autoconfiguration. The parameter dev_addr_generated reports about + * if the address was based on dev->dev_addr or not. This can be used + * to add a second address if link-layer operates with two link layer + * addresses. E.g. 802.15.4 6LoWPAN. + */ +struct ndisc_ops { + int (*is_useropt)(u8 nd_opt_type); + int (*parse_options)(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts); + void (*update)(const struct net_device *dev, struct neighbour *n, + u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts); + int (*opt_addr_space)(const struct net_device *dev, u8 icmp6_type, + struct neighbour *neigh, u8 *ha_buf, + u8 **ha); + void (*fill_addr_option)(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha); + void (*prefix_rcv_add_addr)(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft, + bool dev_addr_generated); +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline int ndisc_ops_is_useropt(const struct net_device *dev, + u8 nd_opt_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->is_useropt) + return dev->ndisc_ops->is_useropt(nd_opt_type); + else + return 0; +} + +static inline int ndisc_ops_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + if (dev->ndisc_ops && dev->ndisc_ops->parse_options) + return dev->ndisc_ops->parse_options(dev, nd_opt, ndopts); + else + return 0; +} + +static inline void ndisc_ops_update(const struct net_device *dev, + struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (dev->ndisc_ops && dev->ndisc_ops->update) + dev->ndisc_ops->update(dev, n, flags, icmp6_type, ndopts); +} + +static inline int ndisc_ops_opt_addr_space(const struct net_device *dev, + u8 icmp6_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space && + icmp6_type != NDISC_REDIRECT) + return dev->ndisc_ops->opt_addr_space(dev, icmp6_type, NULL, + NULL, NULL); + else + return 0; +} + +static inline int ndisc_ops_redirect_opt_addr_space(const struct net_device *dev, + struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + if (dev->ndisc_ops && dev->ndisc_ops->opt_addr_space) + return dev->ndisc_ops->opt_addr_space(dev, NDISC_REDIRECT, + neigh, ha_buf, ha); + else + return 0; +} + +static inline void ndisc_ops_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, + u8 icmp6_type) +{ + if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option && + icmp6_type != NDISC_REDIRECT) + dev->ndisc_ops->fill_addr_option(dev, skb, icmp6_type, NULL); +} + +static inline void ndisc_ops_fill_redirect_addr_option(const struct net_device *dev, + struct sk_buff *skb, + const u8 *ha) +{ + if (dev->ndisc_ops && dev->ndisc_ops->fill_addr_option) + dev->ndisc_ops->fill_addr_option(dev, skb, NDISC_REDIRECT, ha); +} + +static inline void ndisc_ops_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + if (dev->ndisc_ops && dev->ndisc_ops->prefix_rcv_add_addr) + dev->ndisc_ops->prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, + addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); +} +#endif + /* * Return the padding between the option length and the start of the * link addr. Currently only IP-over-InfiniBand needs this, although @@ -127,23 +319,48 @@ static inline int ndisc_addr_option_pad(unsigned short type) } } -static inline int ndisc_opt_addr_space(struct net_device *dev) +static inline int __ndisc_opt_addr_space(unsigned char addr_len, int pad) { - return NDISC_OPT_SPACE(dev->addr_len + - ndisc_addr_option_pad(dev->type)); + return NDISC_OPT_SPACE(addr_len + pad); } -static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, - struct net_device *dev) +#if IS_ENABLED(CONFIG_IPV6) +static inline int ndisc_opt_addr_space(struct net_device *dev, u8 icmp6_type) +{ + return __ndisc_opt_addr_space(dev->addr_len, + ndisc_addr_option_pad(dev->type)) + + ndisc_ops_opt_addr_space(dev, icmp6_type); +} + +static inline int ndisc_redirect_opt_addr_space(struct net_device *dev, + struct neighbour *neigh, + u8 *ops_data_buf, + u8 **ops_data) +{ + return __ndisc_opt_addr_space(dev->addr_len, + ndisc_addr_option_pad(dev->type)) + + ndisc_ops_redirect_opt_addr_space(dev, neigh, ops_data_buf, + ops_data); +} +#endif + +static inline u8 *__ndisc_opt_addr_data(struct nd_opt_hdr *p, + unsigned char addr_len, int prepad) { u8 *lladdr = (u8 *)(p + 1); int lladdrlen = p->nd_opt_len << 3; - int prepad = ndisc_addr_option_pad(dev->type); - if (lladdrlen != ndisc_opt_addr_space(dev)) + if (lladdrlen != __ndisc_opt_addr_space(addr_len, prepad)) return NULL; return lladdr + prepad; } +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + return __ndisc_opt_addr_data(p, dev->addr_len, + ndisc_addr_option_pad(dev->type)); +} + static inline u32 ndisc_hashfn(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { const u32 *p32 = pkey; @@ -194,6 +411,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target); int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir); +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts); /* * IGMP diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9a0d177884c6..4f7cee8344c4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -683,19 +683,21 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } -static inline void __qdisc_reset_queue(struct Qdisc *sch, - struct sk_buff_head *list) +static inline void __qdisc_reset_queue(struct sk_buff_head *list) { /* * We do not know the backlog in bytes of this list, it * is up to the caller to correct it */ - __skb_queue_purge(list); + if (!skb_queue_empty(list)) { + rtnl_kfree_skbs(list->next, list->prev); + __skb_queue_head_init(list); + } } static inline void qdisc_reset_queue(struct Qdisc *sch) { - __qdisc_reset_queue(sch, &sch->q); + __qdisc_reset_queue(&sch->q); sch->qstats.backlog = 0; } @@ -716,6 +718,12 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, return old; } +static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) +{ + rtnl_kfree_skbs(skb, skb); + qdisc_qstats_drop(sch); +} + static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) { kfree_skb(skb); diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index af4de90ba27d..1046f5515174 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -113,6 +113,7 @@ enum { IFLA_GRE_ENCAP_SPORT, IFLA_GRE_ENCAP_DPORT, IFLA_GRE_COLLECT_METADATA, + IFLA_GRE_IGNORE_DF, __IFLA_GRE_MAX, }; diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index 97ecc27aeca6..a67caee11929 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -12,6 +12,10 @@ static inline bool lowpan_is_ll(const struct net_device *dev, return lowpan_dev(dev)->lltype == lltype; } +extern const struct ndisc_ops lowpan_ndisc_ops; + +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev); + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index e44f3bf2dd42..12d131ab2324 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := core.o iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o ndisc.o 6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o #rfc6282 nhcs diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7a240b3eaed1..5945f7e19c67 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <net/6lowpan.h> +#include <net/addrconf.h> #include "6lowpan_i.h" @@ -33,6 +34,8 @@ int lowpan_register_netdevice(struct net_device *dev, for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) lowpan_dev(dev)->ctx.table[i].id = i; + dev->ndisc_ops = &lowpan_ndisc_ops; + ret = register_netdevice(dev); if (ret < 0) return ret; @@ -72,16 +75,61 @@ void lowpan_unregister_netdev(struct net_device *dev) } EXPORT_SYMBOL(lowpan_unregister_netdev); +int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) +{ + struct wpan_dev *wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + /* Set short_addr autoconfiguration if short_addr is present only */ + if (!lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + return -1; + + /* For either address format, all zero addresses MUST NOT be used */ + if (wpan_dev->pan_id == cpu_to_le16(0x0000) && + wpan_dev->short_addr == cpu_to_le16(0x0000)) + return -1; + + /* Alternatively, if no PAN ID is known, 16 zero bits may be used */ + if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)) + memset(eui, 0, 2); + else + ieee802154_le16_to_be16(eui, &wpan_dev->pan_id); + + /* The "Universal/Local" (U/L) bit shall be set to zero */ + eui[0] &= ~2; + eui[2] = 0; + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[5] = 0; + ieee802154_le16_to_be16(&eui[6], &wpan_dev->short_addr); + return 0; +} + static int lowpan_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct inet6_dev *idev; + struct in6_addr addr; int i; if (dev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; + idev = __in6_dev_get(dev); + if (!idev) + return NOTIFY_DONE; + switch (event) { + case NETDEV_UP: + case NETDEV_CHANGE: + /* (802.15.4 6LoWPAN short address slaac handling */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && + addrconf_ifid_802154_6lowpan(addr.s6_addr + 8, dev) == 0) { + __ipv6_addr_set_half(&addr.s6_addr32[0], + htonl(0xFE800000), 0); + addrconf_add_linklocal(idev, &addr, 0); + } + break; case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, @@ -112,8 +160,6 @@ static int __init lowpan_module_init(void) return ret; } - request_module_nowait("ipv6"); - request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index acbaa3db493b..24915e0bb9ea 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -245,6 +245,41 @@ static const struct file_operations lowpan_context_fops = { .release = single_release, }; +static int lowpan_short_addr_get(void *data, u64 *val) +{ + struct wpan_dev *wdev = data; + + rtnl_lock(); + *val = le16_to_cpu(wdev->short_addr); + rtnl_unlock(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, + NULL, "0x%04llx\n"); + +static int lowpan_dev_debugfs_802154_init(const struct net_device *dev, + struct lowpan_dev *ldev) +{ + struct dentry *dentry, *root; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + root = debugfs_create_dir("ieee802154", ldev->iface_debugfs); + if (!root) + return -EINVAL; + + dentry = debugfs_create_file("short_addr", 0444, root, + lowpan_802154_dev(dev)->wdev->ieee802154_ptr, + &lowpan_short_addr_fops); + if (!dentry) + return -EINVAL; + + return 0; +} + int lowpan_dev_debugfs_init(struct net_device *dev) { struct lowpan_dev *ldev = lowpan_dev(dev); @@ -272,6 +307,10 @@ int lowpan_dev_debugfs_init(struct net_device *dev) goto remove_root; } + ret = lowpan_dev_debugfs_802154_init(dev, ldev); + if (ret < 0) + goto remove_root; + return 0; remove_root: diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 8501dd532fe1..79f1fa22509a 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -761,22 +761,75 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = { [LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11, }; -static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr, + const struct lowpan_iphc_ctx *ctx, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], &extended_addr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const struct lowpan_iphc_ctx *ctx, const unsigned char *lladdr, bool sam) { struct in6_addr tmp = {}; u8 dam; - /* check for SAM/DAM = 11 */ - memcpy(&tmp.s6_addr[8], lladdr, 8); - /* second bit-flip (Universe/Local) is done according RFC2464 */ - tmp.s6_addr[8] ^= 0x02; - /* context information are always used */ - ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); - if (ipv6_addr_equal(&tmp, ipaddr)) { - dam = LOWPAN_IPHC_DAM_11; - goto out; + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_ctx_802154_lladdr(ipaddr, ctx, + lladdr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; + default: + /* check for SAM/DAM = 11 */ + memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN); + /* second bit-flip (Universe/Local) is done according RFC2464 */ + tmp.s6_addr[8] ^= 0x02; + /* context information are always used */ + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + if (ipv6_addr_equal(&tmp, ipaddr)) { + dam = LOWPAN_IPHC_DAM_11; + goto out; + } + break; } memset(&tmp, 0, sizeof(tmp)); @@ -813,28 +866,85 @@ out: return dam; } -static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr, +static inline bool +lowpan_iphc_compress_802154_lladdr(const struct in6_addr *ipaddr, + const void *lladdr) +{ + const struct ieee802154_addr *addr = lladdr; + unsigned char extended_addr[EUI64_ADDR_LEN]; + bool lladdr_compress = false; + struct in6_addr tmp = {}; + + switch (addr->mode) { + case IEEE802154_ADDR_LONG: + ieee802154_le64_to_be64(&extended_addr, &addr->extended_addr); + if (is_addr_mac_addr_based(ipaddr, extended_addr)) + lladdr_compress = true; + break; + case IEEE802154_ADDR_SHORT: + /* fe:80::ff:fe00:XXXX + * \__/ + * short_addr + * + * Universe/Local bit is zero. + */ + tmp.s6_addr[0] = 0xFE; + tmp.s6_addr[1] = 0x80; + tmp.s6_addr[11] = 0xFF; + tmp.s6_addr[12] = 0xFE; + ieee802154_le16_to_be16(&tmp.s6_addr16[7], + &addr->short_addr); + if (ipv6_addr_equal(&tmp, ipaddr)) + lladdr_compress = true; + break; + default: + /* should never handled and filtered by 802154 6lowpan */ + WARN_ON_ONCE(1); + break; + } + + return lladdr_compress; +} + +static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev, + const struct in6_addr *ipaddr, const unsigned char *lladdr, bool sam) { - u8 dam = LOWPAN_IPHC_DAM_00; + u8 dam = LOWPAN_IPHC_DAM_01; - if (is_addr_mac_addr_based(ipaddr, lladdr)) { - dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ - pr_debug("address compression 0 bits\n"); - } else if (lowpan_is_iid_16_bit_compressable(ipaddr)) { + switch (lowpan_dev(dev)->lltype) { + case LOWPAN_LLTYPE_IEEE802154: + if (lowpan_iphc_compress_802154_lladdr(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + default: + if (is_addr_mac_addr_based(ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + pr_debug("address compression 0 bits\n"); + goto out; + } + break; + } + + if (lowpan_is_iid_16_bit_compressable(ipaddr)) { /* compress IID to 16 bits xxxx::XXXX */ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[7], 2); dam = LOWPAN_IPHC_DAM_10; /* 16-bits */ raw_dump_inline(NULL, "Compressed ipv6 addr is (16 bits)", *hc_ptr - 2, 2); - } else { - /* do not compress IID => xxxx::IID */ - lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); - dam = LOWPAN_IPHC_DAM_01; /* 64-bits */ - raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", - *hc_ptr - 8, 8); + goto out; } + /* do not compress IID => xxxx::IID */ + lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr16[4], 8); + raw_dump_inline(NULL, "Compressed ipv6 addr is (64 bits)", + *hc_ptr - 8, 8); + +out: + if (sam) return lowpan_iphc_dam_to_sam_value[dam]; else @@ -1013,9 +1123,6 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc0 = LOWPAN_DISPATCH_IPHC; iphc1 = 0; - raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN); - raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN); - raw_dump_table(__func__, "sending raw skb network uncompressed packet", skb->data, skb->len); @@ -1088,14 +1195,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, iphc1 |= LOWPAN_IPHC_SAC; } else { if (sci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->saddr, &sci_entry, saddr, true); iphc1 |= LOWPAN_IPHC_SAC; } else { if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->saddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->saddr, saddr, true); pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n", @@ -1123,14 +1231,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, } } else { if (dci) { - iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr, + iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, dev, + &hdr->daddr, &dci_entry, daddr, false); iphc1 |= LOWPAN_IPHC_DAC; } else { if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL && lowpan_is_linklocal_zero_padded(hdr->daddr)) { - iphc1 |= lowpan_compress_addr_64(&hc_ptr, + iphc1 |= lowpan_compress_addr_64(&hc_ptr, dev, &hdr->daddr, daddr, false); pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n", diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c new file mode 100644 index 000000000000..ae1d4199aa4c --- /dev/null +++ b/net/6lowpan/ndisc.c @@ -0,0 +1,234 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de> + */ + +#include <net/6lowpan.h> +#include <net/addrconf.h> +#include <net/ndisc.h> + +#include "6lowpan_i.h" + +static int lowpan_ndisc_is_useropt(u8 nd_opt_type) +{ + return nd_opt_type == ND_OPT_6CO; +} + +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) +#define NDISC_802154_SHORT_ADDR_LENGTH 1 +static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_len) { + case NDISC_802154_SHORT_ADDR_LENGTH: + if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) + ND_PRINTK(2, warn, + "%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); + else + ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; + return 1; + default: + /* all others will be handled by ndisc IPv6 option parsing */ + return 0; + } +} + +static int lowpan_ndisc_parse_options(const struct net_device *dev, + struct nd_opt_hdr *nd_opt, + struct ndisc_options *ndopts) +{ + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + return lowpan_ndisc_parse_802154_options(dev, nd_opt, ndopts); + default: + return 0; + } +} + +static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, + u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + u8 *lladdr_short = NULL; + + switch (icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + if (ndopts->nd_802154_opts_src_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + case NDISC_REDIRECT: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + if (ndopts->nd_802154_opts_tgt_lladdr) { + lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, + IEEE802154_SHORT_ADDR_LEN, 0); + if (!lladdr_short) { + ND_PRINTK(2, warn, + "NA: invalid short link-layer address length\n"); + return; + } + } + break; + default: + break; + } + + write_lock_bh(&n->lock); + if (lladdr_short) + ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); + else + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + write_unlock_bh(&n->lock); +} + +static void lowpan_ndisc_update(const struct net_device *dev, + struct neighbour *n, u32 flags, u8 icmp6_type, + const struct ndisc_options *ndopts) +{ + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + /* react on overrides only. TODO check if this is really right. */ + if (flags & NEIGH_UPDATE_F_OVERRIDE) + lowpan_ndisc_802154_update(n, flags, icmp6_type, ndopts); +} + +static int lowpan_ndisc_opt_addr_space(const struct net_device *dev, + u8 icmp6_type, struct neighbour *neigh, + u8 *ha_buf, u8 **ha) +{ + struct lowpan_802154_neigh *n; + struct wpan_dev *wpan_dev; + int addr_space = 0; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return 0; + + switch (icmp6_type) { + case NDISC_REDIRECT: + n = lowpan_802154_neigh(neighbour_priv(neigh)); + + read_lock_bh(&neigh->lock); + if (lowpan_802154_is_valid_src_short_addr(n->short_addr)) { + memcpy(ha_buf, &n->short_addr, + IEEE802154_SHORT_ADDR_LEN); + read_unlock_bh(&neigh->lock); + addr_space += __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + *ha = ha_buf; + } + read_unlock_bh(&neigh->lock); + break; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_ROUTER_SOLICITATION: + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) + addr_space = __ndisc_opt_addr_space(IEEE802154_SHORT_ADDR_LEN, 0); + break; + default: + break; + } + + return addr_space; +} + +static void lowpan_ndisc_fill_addr_option(const struct net_device *dev, + struct sk_buff *skb, u8 icmp6_type, + const u8 *ha) +{ + struct wpan_dev *wpan_dev; + __be16 short_addr; + u8 opt_type; + + if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154)) + return; + + switch (icmp6_type) { + case NDISC_REDIRECT: + if (ha) { + ieee802154_le16_to_be16(&short_addr, ha); + __ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, + &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } + return; + case NDISC_NEIGHBOUR_ADVERTISEMENT: + opt_type = ND_OPT_TARGET_LL_ADDR; + break; + case NDISC_ROUTER_SOLICITATION: + case NDISC_NEIGHBOUR_SOLICITATION: + opt_type = ND_OPT_SOURCE_LL_ADDR; + break; + default: + return; + } + + wpan_dev = lowpan_802154_dev(dev)->wdev->ieee802154_ptr; + + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + ieee802154_le16_to_be16(&short_addr, + &wpan_dev->short_addr); + __ndisc_fill_addr_option(skb, opt_type, &short_addr, + IEEE802154_SHORT_ADDR_LEN, 0); + } +} + +static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, + struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + struct in6_addr *addr, + int addr_type, u32 addr_flags, + bool sllao, bool tokenized, + __u32 valid_lft, + u32 prefered_lft, + bool dev_addr_generated) +{ + int err; + + /* generates short based address for RA PIO's */ + if (lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154) && dev_addr_generated && + !addrconf_ifid_802154_6lowpan(addr->s6_addr + 8, dev)) { + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + addr, addr_type, addr_flags, + sllao, tokenized, valid_lft, + prefered_lft); + if (err) + ND_PRINTK(2, warn, + "RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); + } +} +#endif + +const struct ndisc_ops lowpan_ndisc_ops = { + .is_useropt = lowpan_ndisc_is_useropt, +#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + .parse_options = lowpan_ndisc_parse_options, + .update = lowpan_ndisc_update, + .opt_addr_space = lowpan_ndisc_opt_addr_space, + .fill_addr_option = lowpan_ndisc_fill_addr_option, + .prefix_rcv_add_addr = lowpan_ndisc_prefix_rcv_add_addr, +#endif +}; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d69c4644f8f2..eb49ca24274a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -71,9 +71,31 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +static struct sk_buff *defer_kfree_skb_list; +void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) +{ + if (head && tail) { + tail->next = defer_kfree_skb_list; + defer_kfree_skb_list = head; + } +} +EXPORT_SYMBOL(rtnl_kfree_skbs); + void __rtnl_unlock(void) { + struct sk_buff *head = defer_kfree_skb_list; + + defer_kfree_skb_list = NULL; + mutex_unlock(&rtnl_mutex); + + while (head) { + struct sk_buff *next = head->next; + + kfree_skb(head); + cond_resched(); + head = next; + } } void rtnl_unlock(void) diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 4e2b30894224..8c004a0c8d64 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -81,11 +81,21 @@ static int lowpan_stop(struct net_device *dev) return 0; } +static int lowpan_neigh_construct(struct neighbour *n) +{ + struct lowpan_802154_neigh *neigh = lowpan_802154_neigh(neighbour_priv(n)); + + /* default no short_addr is available for a neighbour */ + neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + return 0; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, + .ndo_neigh_construct = lowpan_neigh_construct, }; static void lowpan_setup(struct net_device *ldev) @@ -150,6 +160,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, wdev->needed_headroom; ldev->needed_tailroom = wdev->needed_tailroom; + ldev->neigh_priv_len = sizeof(struct lowpan_802154_neigh); + ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154); if (ret < 0) { dev_put(wdev); diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index e459afd16bb3..dbb476d7d38f 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -9,6 +9,7 @@ */ #include <net/6lowpan.h> +#include <net/ndisc.h> #include <net/ieee802154_netdev.h> #include <net/mac802154.h> @@ -17,19 +18,9 @@ #define LOWPAN_FRAG1_HEAD_SIZE 0x4 #define LOWPAN_FRAGN_HEAD_SIZE 0x5 -/* don't save pan id, it's intra pan */ -struct lowpan_addr { - u8 mode; - union { - /* IPv6 needs big endian here */ - __be64 extended_addr; - __be16 short_addr; - } u; -}; - struct lowpan_addr_info { - struct lowpan_addr daddr; - struct lowpan_addr saddr; + struct ieee802154_addr daddr; + struct ieee802154_addr saddr; }; static inline struct @@ -48,12 +39,14 @@ lowpan_addr_info *lowpan_skb_priv(const struct sk_buff *skb) * RAW/DGRAM sockets. */ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, - unsigned short type, const void *_daddr, - const void *_saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { - const u8 *saddr = _saddr; - const u8 *daddr = _daddr; - struct lowpan_addr_info *info; + struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; + struct lowpan_addr_info *info = lowpan_skb_priv(skb); + struct lowpan_802154_neigh *llneigh = NULL; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct neighbour *n; /* TODO: * if this package isn't ipv6 one, where should it be routed? @@ -61,21 +54,50 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev, if (type != ETH_P_IPV6) return 0; - if (!saddr) - saddr = ldev->dev_addr; + /* intra-pan communication */ + info->saddr.pan_id = wpan_dev->pan_id; + info->daddr.pan_id = info->saddr.pan_id; - raw_dump_inline(__func__, "saddr", (unsigned char *)saddr, 8); - raw_dump_inline(__func__, "daddr", (unsigned char *)daddr, 8); + if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { + info->daddr.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + __le16 short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); + + n = neigh_lookup(&nd_tbl, &hdr->daddr, ldev); + if (n) { + llneigh = lowpan_802154_neigh(neighbour_priv(n)); + read_lock_bh(&n->lock); + short_addr = llneigh->short_addr; + read_unlock_bh(&n->lock); + } - info = lowpan_skb_priv(skb); + if (llneigh && + lowpan_802154_is_valid_src_short_addr(short_addr)) { + info->daddr.short_addr = short_addr; + info->daddr.mode = IEEE802154_ADDR_SHORT; + } else { + info->daddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->daddr.extended_addr, + daddr); + } - /* TODO: Currently we only support extended_addr */ - info->daddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->daddr.u.extended_addr, daddr, - sizeof(info->daddr.u.extended_addr)); - info->saddr.mode = IEEE802154_ADDR_LONG; - memcpy(&info->saddr.u.extended_addr, saddr, - sizeof(info->daddr.u.extended_addr)); + if (n) + neigh_release(n); + } + + if (!saddr) { + if (lowpan_802154_is_valid_src_short_addr(wpan_dev->short_addr)) { + info->saddr.mode = IEEE802154_ADDR_SHORT; + info->saddr.short_addr = wpan_dev->short_addr; + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + info->saddr.extended_addr = wpan_dev->extended_addr; + } + } else { + info->saddr.mode = IEEE802154_ADDR_LONG; + ieee802154_be64_to_le64(&info->saddr.extended_addr, saddr); + } return 0; } @@ -209,47 +231,26 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, u16 *dgram_size, u16 *dgram_offset) { struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; - struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; - void *daddr, *saddr; memcpy(&info, lowpan_skb_priv(skb), sizeof(info)); - /* TODO: Currently we only support extended_addr */ - daddr = &info.daddr.u.extended_addr; - saddr = &info.saddr.u.extended_addr; - *dgram_size = skb->len; - lowpan_header_compress(skb, ldev, daddr, saddr); + lowpan_header_compress(skb, ldev, &info.daddr, &info.saddr); /* dgram_offset = (saved bytes after compression) + lowpan header len */ *dgram_offset = (*dgram_size - skb->len) + skb_network_header_len(skb); cb->type = IEEE802154_FC_TYPE_DATA; - /* prepare wpan address data */ - sa.mode = IEEE802154_ADDR_LONG; - sa.pan_id = wpan_dev->pan_id; - sa.extended_addr = ieee802154_devaddr_from_raw(saddr); - - /* intra-PAN communications */ - da.pan_id = sa.pan_id; - - /* if the destination address is the broadcast address, use the - * corresponding short address - */ - if (!memcmp(daddr, ldev->broadcast, EUI64_ADDR_LEN)) { - da.mode = IEEE802154_ADDR_SHORT; - da.short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); + if (info.daddr.mode == IEEE802154_ADDR_SHORT && + ieee802154_is_broadcast_short_addr(info.daddr.short_addr)) cb->ackreq = false; - } else { - da.mode = IEEE802154_ADDR_LONG; - da.extended_addr = ieee802154_devaddr_from_raw(daddr); + else cb->ackreq = wpan_dev->ackreq; - } - return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da, - &sa, 0); + return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, + &info.daddr, &info.saddr, 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4d2025f7ec57..0f8ca3fca00a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -841,17 +841,19 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct net_device *dev, +static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], struct ip_tunnel_parm *parms) { + struct ip_tunnel *t = netdev_priv(dev); + memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_GRE; if (!data) - return; + return 0; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); @@ -880,16 +882,26 @@ static void ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) + if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { + if (t->ignore_df) + return -EINVAL; parms->iph.frag_off = htons(IP_DF); + } if (data[IFLA_GRE_COLLECT_METADATA]) { - struct ip_tunnel *t = netdev_priv(dev); - t->collect_md = true; if (dev->type == ARPHRD_IPGRE) dev->type = ARPHRD_NONE; } + + if (data[IFLA_GRE_IGNORE_DF]) { + if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) + && (parms->iph.frag_off & htons(IP_DF))) + return -EINVAL; + t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); + } + + return 0; } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -960,16 +972,19 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_newlink(dev, tb, &p); } @@ -978,16 +993,19 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - int err = ip_tunnel_encap_setup(t, &ipencap); + err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p); + if (err < 0) + return err; return ip_tunnel_changelink(dev, tb, &p); } @@ -1024,6 +1042,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_GRE_IGNORE_DF */ + nla_total_size(1) + 0; } @@ -1057,6 +1077,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df)) + goto nla_put_failure; + if (t->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; @@ -1084,6 +1107,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d8f5e0a269f5..95649ebd2874 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -682,7 +682,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off&htons(IP_DF)); max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 47f837a58e0a..6c8fc3f96b11 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2254,7 +2254,7 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return ERR_PTR(-EACCES); /* Add default multicast route */ - if (!(dev->flags & IFF_LOOPBACK)) + if (!(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev)) addrconf_add_mroute(dev); return idev; @@ -2333,12 +2333,109 @@ static bool is_addr_mode_generate_stable(struct inet6_dev *idev) idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM; } +int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, + const struct prefix_info *pinfo, + struct inet6_dev *in6_dev, + const struct in6_addr *addr, int addr_type, + u32 addr_flags, bool sllao, bool tokenized, + __u32 valid_lft, u32 prefered_lft) +{ + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); + int create = 0, update_lft = 0; + + if (!ifp && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !net->ipv6.devconf_all->forwarding && sllao) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, addr, NULL, + pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags, valid_lft, + prefered_lft); + + if (IS_ERR_OR_NULL(ifp)) + return -1; + + update_lft = 0; + create = 1; + spin_lock_bh(&ifp->lock); + ifp->flags |= IFA_F_MANAGETEMPADDR; + ifp->cstamp = jiffies; + ifp->tokenized = tokenized; + spin_unlock_bh(&ifp->lock); + addrconf_dad_start(ifp); + } + + if (ifp) { + u32 flags; + unsigned long now; + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock_bh(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && !create && stored_lft) { + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock_bh(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock_bh(&ifp->lock); + + manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, + create, now); + + in6_ifa_put(ifp); + addrconf_verify(); + } + + return 0; +} +EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); + void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; __u32 valid_lft; __u32 prefered_lft; - int addr_type; + int addr_type, err; u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); @@ -2432,10 +2529,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ if (pinfo->autoconf && in6_dev->cnf.autoconf) { - struct inet6_ifaddr *ifp; struct in6_addr addr; - int create = 0, update_lft = 0; - bool tokenized = false; + bool tokenized = false, dev_addr_generated = false; if (pinfo->prefix_len == 64) { memcpy(&addr, &pinfo->prefix, 8); @@ -2453,106 +2548,36 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) goto ok; } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { - in6_dev_put(in6_dev); - return; + goto put; + } else { + dev_addr_generated = true; } goto ok; } net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n", pinfo->prefix_len); - in6_dev_put(in6_dev); - return; + goto put; ok: + err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, + &addr, addr_type, + addr_flags, sllao, + tokenized, valid_lft, + prefered_lft); + if (err) + goto put; - ifp = ipv6_get_ifaddr(net, &addr, dev, 1); - - if (!ifp && valid_lft) { - int max_addresses = in6_dev->cnf.max_addresses; - -#ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && - !net->ipv6.devconf_all->forwarding && sllao) - addr_flags |= IFA_F_OPTIMISTIC; -#endif - - /* Do not allow to create too much of autoconfigured - * addresses; this would be too easy way to crash kernel. - */ - if (!max_addresses || - ipv6_count_addresses(in6_dev) < max_addresses) - ifp = ipv6_add_addr(in6_dev, &addr, NULL, - pinfo->prefix_len, - addr_type&IPV6_ADDR_SCOPE_MASK, - addr_flags, valid_lft, - prefered_lft); - - if (IS_ERR_OR_NULL(ifp)) { - in6_dev_put(in6_dev); - return; - } - - update_lft = 0; - create = 1; - spin_lock_bh(&ifp->lock); - ifp->flags |= IFA_F_MANAGETEMPADDR; - ifp->cstamp = jiffies; - ifp->tokenized = tokenized; - spin_unlock_bh(&ifp->lock); - addrconf_dad_start(ifp); - } - - if (ifp) { - u32 flags; - unsigned long now; - u32 stored_lft; - - /* update lifetime (RFC2462 5.5.3 e) */ - spin_lock_bh(&ifp->lock); - now = jiffies; - if (ifp->valid_lft > (now - ifp->tstamp) / HZ) - stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; - else - stored_lft = 0; - if (!update_lft && !create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - - if (update_lft) { - ifp->valid_lft = valid_lft; - ifp->prefered_lft = prefered_lft; - ifp->tstamp = now; - flags = ifp->flags; - ifp->flags &= ~IFA_F_DEPRECATED; - spin_unlock_bh(&ifp->lock); - - if (!(flags&IFA_F_TENTATIVE)) - ipv6_ifa_notify(0, ifp); - } else - spin_unlock_bh(&ifp->lock); - - manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft, - create, now); - - in6_ifa_put(ifp); - addrconf_verify(); - } + /* Ignore error case here because previous prefix add addr was + * successful which will be notified. + */ + ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr, + addr_type, addr_flags, sllao, + tokenized, valid_lft, + prefered_lft, + dev_addr_generated); } inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); +put: in6_dev_put(in6_dev); } @@ -2947,8 +2972,8 @@ static void init_loopback(struct net_device *dev) } } -static void addrconf_add_linklocal(struct inet6_dev *idev, - const struct in6_addr *addr, u32 flags) +void addrconf_add_linklocal(struct inet6_dev *idev, + const struct in6_addr *addr, u32 flags) { struct inet6_ifaddr *ifp; u32 addr_flags = flags | IFA_F_PERMANENT; @@ -2967,6 +2992,7 @@ static void addrconf_add_linklocal(struct inet6_dev *idev, in6_ifa_put(ifp); } } +EXPORT_SYMBOL_GPL(addrconf_add_linklocal); static bool ipv6_reserved_interfaceid(struct in6_addr address) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 40454bfb534e..e32a72fb9982 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -587,7 +587,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = l3mdev_fib_oif(skb->dev); + fl6.flowi6_oif = skb->dev->ifindex; fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index b3d00be484d4..ec9efbcdad35 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -34,12 +34,12 @@ static void ila_csum_do_neutral(struct ila_addr *iaddr, if (p->locator_match.v64) { diff = p->csum_diff; } else { - diff = compute_csum_diff8((__be32 *)iaddr, - (__be32 *)&p->locator); + diff = compute_csum_diff8((__be32 *)&p->locator, + (__be32 *)iaddr); } fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? - ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG); + CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); diff = csum_add(diff, fval); @@ -140,8 +140,8 @@ void ila_init_saved_csum(struct ila_params *p) return; p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, - (__be32 *)&p->locator); + (__be32 *)&p->locator, + (__be32 *)&p->locator_match); } static int __init ila_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c245895a3d41..fe65cdc28a45 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -73,15 +73,6 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); @@ -150,11 +141,10 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, + int data_len, int pad) { - int pad = ndisc_addr_option_pad(skb->dev->type); - int data_len = skb->dev->addr_len; - int space = ndisc_opt_addr_space(skb->dev); + int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; @@ -171,6 +161,23 @@ static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data) if (space > 0) memset(opt, 0, space); } +EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); + +static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, + void *data, u8 icmp6_type) +{ + __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, + ndisc_addr_option_pad(skb->dev->type)); + ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); +} + +static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, + void *ha, + const u8 *ops_data) +{ + ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); + ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); +} static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) @@ -185,24 +192,28 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, return cur <= end && cur->nd_opt_type == type ? cur : NULL; } -static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +static inline int ndisc_is_useropt(const struct net_device *dev, + struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_RDNSS || - opt->nd_opt_type == ND_OPT_DNSSL; + opt->nd_opt_type == ND_OPT_DNSSL || + ndisc_ops_is_useropt(dev, opt->nd_opt_type); } -static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, +static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, + struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); - } while (cur < end && !ndisc_is_useropt(cur)); - return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; + } while (cur < end && !ndisc_is_useropt(dev, cur)); + return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL; } -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, + u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; @@ -217,6 +228,8 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; + if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) + goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: @@ -243,7 +256,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, break; #endif default: - if (ndisc_is_useropt(nd_opt)) { + if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; @@ -260,6 +273,7 @@ struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, nd_opt->nd_opt_len); } } +next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } @@ -509,7 +523,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (!dev->addr_len) inc_opt = 0; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -528,8 +543,8 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, - dev->dev_addr); - + dev->dev_addr, + NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } @@ -574,7 +589,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, + NDISC_NEIGHBOUR_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -590,7 +606,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_NEIGHBOUR_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -626,7 +643,7 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, } #endif if (send_sllao) - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) @@ -641,7 +658,8 @@ void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, - dev->dev_addr); + dev->dev_addr, + NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } @@ -702,6 +720,15 @@ static int pndisc_is_router(const void *pkey, return ret; } +void ndisc_update(const struct net_device *dev, struct neighbour *neigh, + const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, + struct ndisc_options *ndopts) +{ + neigh_update(neigh, lladdr, new, flags); + /* report ndisc ops about neighbour update */ + ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); +} + static void ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); @@ -738,7 +765,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND options\n"); return; } @@ -856,9 +883,10 @@ have_ifp: neigh = __neigh_lookup(&nd_tbl, saddr, dev, !inc || lladdr || !dev->addr_len); if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, + NDISC_NEIGHBOUR_SOLICITATION, &ndopts); if (neigh || !dev->header_ops) { ndisc_send_na(dev, saddr, &msg->target, !!is_router, true, (ifp != NULL && inc), inc); @@ -911,7 +939,7 @@ static void ndisc_recv_na(struct sk_buff *skb) idev->cnf.drop_unsolicited_na) return; - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, warn, "NS: invalid ND option\n"); return; } @@ -967,12 +995,13 @@ static void ndisc_recv_na(struct sk_buff *skb) goto out; } - neigh_update(neigh, lladdr, + ndisc_update(dev, neigh, lladdr, msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0), + NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts); if ((old_flags & ~neigh->flags) & NTF_ROUTER) { /* @@ -1017,7 +1046,7 @@ static void ndisc_recv_rs(struct sk_buff *skb) goto out; /* Parse ND options */ - if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) { ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n"); goto out; } @@ -1031,10 +1060,11 @@ static void ndisc_recv_rs(struct sk_buff *skb) neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); if (neigh) { - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + NEIGH_UPDATE_F_OVERRIDE_ISROUTER, + NDISC_ROUTER_SOLICITATION, &ndopts); neigh_release(neigh); } out: @@ -1135,7 +1165,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - if (!ndisc_parse_options(opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) { ND_PRINTK(2, warn, "RA: invalid ND options\n"); return; } @@ -1329,11 +1359,12 @@ skip_linkparms: goto out; } } - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER); + NEIGH_UPDATE_F_ISROUTER, + NDISC_ROUTER_ADVERTISEMENT, &ndopts); } if (!ipv6_accept_ra(in6_dev)) { @@ -1421,7 +1452,8 @@ skip_routeinfo: struct nd_opt_hdr *p; for (p = ndopts.nd_useropts; p; - p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + p = ndisc_next_useropt(skb->dev, p, + ndopts.nd_useropts_end)) { ndisc_ra_useropt(skb, p); } } @@ -1459,7 +1491,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) return; } - if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) + if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts)) return; if (!ndopts.nd_opts_rh) { @@ -1504,7 +1536,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) struct dst_entry *dst; struct flowi6 fl6; int rd_len; - u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, + ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; int oif = l3mdev_fib_oif(dev); bool ret; @@ -1563,7 +1596,9 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) memcpy(ha_buf, neigh->ha, dev->addr_len); read_unlock_bh(&neigh->lock); ha = ha_buf; - optlen += ndisc_opt_addr_space(dev); + optlen += ndisc_redirect_opt_addr_space(dev, neigh, + ops_data_buf, + &ops_data); } else read_unlock_bh(&neigh->lock); @@ -1594,7 +1629,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) */ if (ha) - ndisc_fill_addr_option(buff, ND_OPT_TARGET_LL_ADDR, ha); + ndisc_fill_redirect_addr_option(buff, ha, ops_data); /* * build redirect option and copy skb over to the new packet. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c6ae6f9b5fe3..9e1516785dac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1042,8 +1042,8 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return pcpu_rt; } -static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, - struct flowi6 *fl6, int flags) +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt; @@ -1139,6 +1139,7 @@ redo_rt6_select: } } +EXPORT_SYMBOL_GPL(ip6_pol_route); static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) @@ -2200,7 +2201,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * first-hop router for the specified ICMP Destination Address. */ - if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) { + if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } @@ -2235,12 +2236,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * We have finally decided to accept it. */ - neigh_update(neigh, lladdr, NUD_STALE, + ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| - NEIGH_UPDATE_F_ISROUTER)) - ); + NEIGH_UPDATE_F_ISROUTER)), + NDISC_REDIRECT, &ndopts); nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index fc3598a922b0..37d674e6f8a9 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1033,6 +1033,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); + size_t headroom, linear; struct sk_buff *skb; struct iucv_message txmsg = {0}; struct cmsghdr *cmsg; @@ -1110,20 +1111,31 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, * this is fine for SOCK_SEQPACKET (unless we want to support * segmented records using the MSG_EOR flag), but * for SOCK_STREAM we might want to improve it in future */ - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb = sock_alloc_send_skb(sk, - len + sizeof(struct af_iucv_trans_hdr) + ETH_HLEN, - noblock, &err); - else - skb = sock_alloc_send_skb(sk, len, noblock, &err); + headroom = (iucv->transport == AF_IUCV_TRANS_HIPER) + ? sizeof(struct af_iucv_trans_hdr) + ETH_HLEN : 0; + if (headroom + len < PAGE_SIZE) { + linear = len; + } else { + /* In nonlinear "classic" iucv skb, + * reserve space for iucv_array + */ + if (iucv->transport != AF_IUCV_TRANS_HIPER) + headroom += sizeof(struct iucv_array) * + (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = sock_alloc_send_pskb(sk, headroom + linear, len - linear, + noblock, &err, 0); if (!skb) goto out; - if (iucv->transport == AF_IUCV_TRANS_HIPER) - skb_reserve(skb, sizeof(struct af_iucv_trans_hdr) + ETH_HLEN); - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); + if (err) goto fail; - } /* wait if outstanding messages for iucv path has reached */ timeo = sock_sndtimeo(sk, noblock); @@ -1148,49 +1160,67 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, atomic_dec(&iucv->msg_sent); goto fail; } - goto release; - } - skb_queue_tail(&iucv->send_skb_q, skb); - - if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) - && skb->len <= 7) { - err = iucv_send_iprm(iucv->path, &txmsg, skb); + } else { /* Classic VM IUCV transport */ + skb_queue_tail(&iucv->send_skb_q, skb); + + if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) && + skb->len <= 7) { + err = iucv_send_iprm(iucv->path, &txmsg, skb); + + /* on success: there is no message_complete callback */ + /* for an IPRMDATA msg; remove skb from send queue */ + if (err == 0) { + skb_unlink(skb, &iucv->send_skb_q); + kfree_skb(skb); + } - /* on success: there is no message_complete callback - * for an IPRMDATA msg; remove skb from send queue */ - if (err == 0) { - skb_unlink(skb, &iucv->send_skb_q); - kfree_skb(skb); + /* this error should never happen since the */ + /* IUCV_IPRMDATA path flag is set... sever path */ + if (err == 0x15) { + pr_iucv->path_sever(iucv->path, NULL); + skb_unlink(skb, &iucv->send_skb_q); + err = -EPIPE; + goto fail; + } + } else if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + /* skip iucv_array lying in the headroom */ + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + err = pr_iucv->message_send(iucv->path, &txmsg, + IUCV_IPBUFLST, 0, + (void *)iba, skb->len); + } else { /* non-IPRM Linear skb */ + err = pr_iucv->message_send(iucv->path, &txmsg, + 0, 0, (void *)skb->data, skb->len); } - - /* this error should never happen since the - * IUCV_IPRMDATA path flag is set... sever path */ - if (err == 0x15) { - pr_iucv->path_sever(iucv->path, NULL); + if (err) { + if (err == 3) { + user_id[8] = 0; + memcpy(user_id, iucv->dst_user_id, 8); + appl_id[8] = 0; + memcpy(appl_id, iucv->dst_name, 8); + pr_err( + "Application %s on z/VM guest %s exceeds message limit\n", + appl_id, user_id); + err = -EAGAIN; + } else { + err = -EPIPE; + } skb_unlink(skb, &iucv->send_skb_q); - err = -EPIPE; goto fail; } - } else - err = pr_iucv->message_send(iucv->path, &txmsg, 0, 0, - (void *) skb->data, skb->len); - if (err) { - if (err == 3) { - user_id[8] = 0; - memcpy(user_id, iucv->dst_user_id, 8); - appl_id[8] = 0; - memcpy(appl_id, iucv->dst_name, 8); - pr_err("Application %s on z/VM guest %s" - " exceeds message limit\n", - appl_id, user_id); - err = -EAGAIN; - } else - err = -EPIPE; - skb_unlink(skb, &iucv->send_skb_q); - goto fail; } -release: release_sock(sk); return len; @@ -1201,42 +1231,32 @@ out: return err; } -/* iucv_fragment_skb() - Fragment a single IUCV message into multiple skb's - * - * Locking: must be called with message_q.lock held - */ -static int iucv_fragment_skb(struct sock *sk, struct sk_buff *skb, int len) +static struct sk_buff *alloc_iucv_recv_skb(unsigned long len) { - int dataleft, size, copied = 0; - struct sk_buff *nskb; - - dataleft = len; - while (dataleft) { - if (dataleft >= sk->sk_rcvbuf / 4) - size = sk->sk_rcvbuf / 4; - else - size = dataleft; - - nskb = alloc_skb(size, GFP_ATOMIC | GFP_DMA); - if (!nskb) - return -ENOMEM; - - /* copy target class to control buffer of new skb */ - IUCV_SKB_CB(nskb)->class = IUCV_SKB_CB(skb)->class; - - /* copy data fragment */ - memcpy(nskb->data, skb->data + copied, size); - copied += size; - dataleft -= size; - - skb_reset_transport_header(nskb); - skb_reset_network_header(nskb); - nskb->len = size; + size_t headroom, linear; + struct sk_buff *skb; + int err; - skb_queue_tail(&iucv_sk(sk)->backlog_skb_q, nskb); + if (len < PAGE_SIZE) { + headroom = 0; + linear = len; + } else { + headroom = sizeof(struct iucv_array) * (MAX_SKB_FRAGS + 1); + linear = PAGE_SIZE - headroom; + } + skb = alloc_skb_with_frags(headroom + linear, len - linear, + 0, &err, GFP_ATOMIC | GFP_DMA); + WARN_ONCE(!skb, + "alloc of recv iucv skb len=%lu failed with errcode=%d\n", + len, err); + if (skb) { + if (headroom) + skb_reserve(skb, headroom); + skb_put(skb, linear); + skb->len = len; + skb->data_len = len - linear; } - - return 0; + return skb; } /* iucv_process_message() - Receive a single outstanding IUCV message @@ -1263,31 +1283,32 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, skb->len = 0; } } else { - rc = pr_iucv->message_receive(path, msg, + if (skb_is_nonlinear(skb)) { + struct iucv_array *iba = (struct iucv_array *)skb->head; + int i; + + iba[0].address = (u32)(addr_t)skb->data; + iba[0].length = (u32)skb_headlen(skb); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + iba[i + 1].address = + (u32)(addr_t)skb_frag_address(frag); + iba[i + 1].length = (u32)skb_frag_size(frag); + } + rc = pr_iucv->message_receive(path, msg, + IUCV_IPBUFLST, + (void *)iba, len, NULL); + } else { + rc = pr_iucv->message_receive(path, msg, msg->flags & IUCV_IPRMDATA, skb->data, len, NULL); + } if (rc) { kfree_skb(skb); return; } - /* we need to fragment iucv messages for SOCK_STREAM only; - * for SOCK_SEQPACKET, it is only relevant if we support - * record segmentation using MSG_EOR (see also recvmsg()) */ - if (sk->sk_type == SOCK_STREAM && - skb->truesize >= sk->sk_rcvbuf / 4) { - rc = iucv_fragment_skb(sk, skb, len); - kfree_skb(skb); - skb = NULL; - if (rc) { - pr_iucv->path_sever(path, NULL); - return; - } - skb = skb_dequeue(&iucv_sk(sk)->backlog_skb_q); - } else { - skb_reset_transport_header(skb); - skb_reset_network_header(skb); - skb->len = len; - } + WARN_ON_ONCE(skb->len != len); } IUCV_SKB_CB(skb)->offset = 0; @@ -1306,7 +1327,7 @@ static void iucv_process_message_q(struct sock *sk) struct sock_msg_q *p, *n; list_for_each_entry_safe(p, n, &iucv->message_q.list, list) { - skb = alloc_skb(iucv_msg_length(&p->msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(&p->msg)); if (!skb) break; iucv_process_message(sk, skb, p->path, &p->msg); @@ -1801,7 +1822,7 @@ static void iucv_callback_rx(struct iucv_path *path, struct iucv_message *msg) if (len > sk->sk_rcvbuf) goto save_message; - skb = alloc_skb(iucv_msg_length(msg), GFP_ATOMIC | GFP_DMA); + skb = alloc_iucv_recv_skb(iucv_msg_length(msg)); if (!skb) goto save_message; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index 7da97809a7e8..d90e4ef09e85 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); */ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - const struct flowi6 *fl6) + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b6db56ec8117..f8c61d2a7963 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -224,8 +224,8 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index) } EXPORT_SYMBOL(tcf_hash_search); -int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, - int bind) +bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind) { struct tcf_hashinfo *hinfo = tn->hinfo; struct tcf_common *p = NULL; @@ -235,9 +235,9 @@ int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, p->tcfc_refcnt++; a->priv = p; a->hinfo = hinfo; - return 1; + return true; } - return 0; + return false; } EXPORT_SYMBOL(tcf_hash_check); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 02f5a8ba95d7..b7fa96926c90 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -423,7 +423,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, u16 ife_type = 0; u8 *daddr = NULL; u8 *saddr = NULL; - int ret = 0, exists = 0; + bool exists = false; + int ret = 0; int err; err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8998a3594e86..6148e323ed93 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -97,7 +97,8 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, struct tcf_ipt *ipt; struct xt_entry_target *td, *t; char *tname; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; u32 hook = 0; u32 index = 0; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 787751a7981a..5b135d357e1e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -62,7 +62,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; - int ret, ok_push = 0, exists = 0; + int ret, ok_push = 0; + bool exists = false; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index be5fbb51cfed..318328d34d12 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -86,8 +86,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; struct tcf_defact *d; + bool exists = false; + int ret = 0, err; char *defdata; - int ret = 0, err, exists = 0; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7e2bc3c2b6da..53d1486cddf7 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -69,7 +69,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_skbedit *d; u32 flags = 0, *priority = NULL, *mark = NULL; u16 *queue_mapping = NULL; - int ret = 0, err, exists = 0; + bool exists = false; + int ret = 0, err; if (nla == NULL) return -EINVAL; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b075d50e0fc3..db9b7ed570ba 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,8 +77,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; - int ret = 0, exists = 0; - int err; + bool exists = false; + int ret = 0, err; if (!nla) return -EINVAL; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 04e0b0583e00..789b69ee9e51 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -375,11 +375,11 @@ static void choke_reset(struct Qdisc *sch) q->head = (q->head + 1) & q->tab_mask; if (!skb) continue; - qdisc_qstats_backlog_dec(sch, skb); - --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } + sch->q.qlen = 0; + sch->qstats.backlog = 0; memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); @@ -455,7 +455,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); --sch->q.qlen; - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped); q->head = 0; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index dddf3bb65a32..c5bc424e3b3c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -174,7 +174,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f49c81e91acd..6eb06674f778 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -514,17 +514,25 @@ out: return skb; } +static void fq_flow_purge(struct fq_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; + flow->qlen = 0; +} + static void fq_reset(struct Qdisc *sch) { struct fq_sched_data *q = qdisc_priv(sch); struct rb_root *root; - struct sk_buff *skb; struct rb_node *p; struct fq_flow *f; unsigned int idx; - while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL) - kfree_skb(skb); + sch->q.qlen = 0; + sch->qstats.backlog = 0; + + fq_flow_purge(&q->internal); if (!q->fq_root) return; @@ -535,8 +543,7 @@ static void fq_reset(struct Qdisc *sch) f = container_of(p, struct fq_flow, fq_node); rb_erase(p, root); - while ((skb = fq_dequeue_head(sch, f)) != NULL) - kfree_skb(skb); + fq_flow_purge(f); kmem_cache_free(fq_flow_cachep, f); } @@ -737,7 +744,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!skb) break; drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); drop_count++; } qdisc_tree_reduce_backlog(sch, drop_count, drop_len); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a302e8ef5498..2dc0a849515a 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -336,6 +336,12 @@ begin: return skb; } +static void fq_codel_flow_purge(struct fq_codel_flow *flow) +{ + rtnl_kfree_skbs(flow->head, flow->tail); + flow->head = NULL; +} + static void fq_codel_reset(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -346,18 +352,13 @@ static void fq_codel_reset(struct Qdisc *sch) for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; - while (flow->head) { - struct sk_buff *skb = dequeue_head(flow); - - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - } - + fq_codel_flow_purge(flow); INIT_LIST_HEAD(&flow->flowchain); codel_vars_init(&flow->cvars); } memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); sch->q.qlen = 0; + sch->qstats.backlog = 0; q->memory_usage = 0; } @@ -433,7 +434,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) struct sk_buff *skb = fq_codel_dequeue(sch); q->cstats.drop_len += qdisc_pkt_len(skb); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); q->cstats.drop_count++; } qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0c9cb516f2e3..773b632e1e33 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -493,7 +493,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(qdisc, band2list(priv, prio)); + __qdisc_reset_queue(band2list(priv, prio)); priv->bitmap = 0; qdisc->qstats.backlog = 0; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c51791848a38..c44593b8e65a 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -464,7 +464,7 @@ static void hhf_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } static void *hhf_zalloc(size_t sz) @@ -574,7 +574,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, prev_backlog - sch->qstats.backlog); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 07dcd2933f01..a454605ab5cb 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -957,7 +957,7 @@ static void htb_reset(struct Qdisc *sch) } } qdisc_watchdog_cancel(&q->watchdog); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); sch->q.qlen = 0; sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); @@ -1231,7 +1231,7 @@ static void htb_destroy(struct Qdisc *sch) htb_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); - __skb_queue_purge(&q->direct_queue); + __qdisc_reset_queue(&q->direct_queue); } static int htb_delete(struct Qdisc *sch, unsigned long arg) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 876df13c745a..e271967439bf 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -368,9 +368,7 @@ static void tfifo_reset(struct Qdisc *sch) struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); - skb->next = NULL; - skb->prev = NULL; - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 71ae3b9629f9..912a46a5d02e 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -234,7 +234,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); - qdisc_drop(skb, sch); + rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a2e0b855d1c8..57d118b41cad 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -520,7 +520,7 @@ sfq_reset(struct Qdisc *sch) struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) - kfree_skb(skb); + rtnl_kfree_skbs(skb, skb); } /* diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 57e460be4692..31b9f9c52974 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_TIPC) := tipc.o tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ - name_distr.o subscr.o name_table.o net.o \ + name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ server.o socket.o diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f7c983be33..64f4004a6fac 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -73,4 +73,5 @@ int tipc_addr_node_valid(u32 addr); int tipc_in_scope(u32 domain, u32 addr); int tipc_addr_scope(u32 domain); char *tipc_addr_string_fill(char *string, u32 addr); + #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 6f11c62bc8f9..9a70e1d744d2 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1,7 +1,7 @@ /* * net/tipc/bearer.c: TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * @@ -39,6 +39,7 @@ #include "bearer.h" #include "link.h" #include "discover.h" +#include "monitor.h" #include "bcast.h" #include "netlink.h" @@ -313,6 +314,10 @@ restart: rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); + + if (tipc_mon_create(net, bearer_id)) + return -ENOMEM; + pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, tipc_addr_string_fill(addr_string, disc_domain), priority); @@ -348,6 +353,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) tipc_disc_delete(b->link_req); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); + tipc_mon_delete(net, bearer_id); } int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index f686e41b5abb..0d337c7b6fad 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -1,7 +1,7 @@ /* * net/tipc/bearer.h: Include file for TIPC bearer code * - * Copyright (c) 1996-2006, 2013-2014, Ericsson AB + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * diff --git a/net/tipc/core.c b/net/tipc/core.c index fe1b062c4f18..236b043a4156 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -57,6 +57,7 @@ static int __net_init tipc_init_net(struct net *net) tn->net_id = 4711; tn->own_addr = 0; + tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); spin_lock_init(&tn->node_list_lock); diff --git a/net/tipc/core.h b/net/tipc/core.h index eff58dc53aa1..a1845fb27d80 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -66,11 +66,13 @@ struct tipc_bc_base; struct tipc_link; struct tipc_name_table; struct tipc_server; +struct tipc_monitor; #define TIPC_MOD_VER "2.0.0" -#define NODE_HTABLE_SIZE 512 -#define MAX_BEARERS 3 +#define NODE_HTABLE_SIZE 512 +#define MAX_BEARERS 3 +#define TIPC_DEF_MON_THRESHOLD 32 extern int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; @@ -88,6 +90,10 @@ struct tipc_net { u32 num_nodes; u32 num_links; + /* Neighbor monitoring list */ + struct tipc_monitor *monitors[MAX_BEARERS]; + int mon_threshold; + /* Bearer list */ struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; @@ -126,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline unsigned int tipc_hashfn(u32 addr) +{ + return addr & (NODE_HTABLE_SIZE - 1); +} + static inline u16 mod(u16 x) { return x & 0xffffu; diff --git a/net/tipc/link.c b/net/tipc/link.c index a904ccd5a93a..03f8bdf70d8f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "discover.h" #include "netlink.h" +#include "monitor.h" #include <linux/pkt_sched.h> @@ -95,6 +96,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') + * @mon_state: cookie with information needed by link monitor * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -138,6 +140,7 @@ struct tipc_link { char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; + struct tipc_mon_state mon_state; u16 rst_cnt; /* Failover/synch */ @@ -708,18 +711,25 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) bool setup = false; u16 bc_snt = l->bc_sndlink->snd_nxt - 1; u16 bc_acked = l->bc_rcvlink->acked; - - link_profile_stats(l); + struct tipc_mon_state *mstate = &l->mon_state; switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: - if (l->silent_intv_cnt > l->abort_limit) - return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); mtyp = STATE_MSG; + link_profile_stats(l); + tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id); + if (mstate->reset || (l->silent_intv_cnt > l->abort_limit)) + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); state = bc_acked != bc_snt; - probe = l->silent_intv_cnt; - l->silent_intv_cnt++; + state |= l->bc_rcvlink->rcv_unacked; + state |= l->rcv_unacked; + state |= !skb_queue_empty(&l->transmq); + state |= !skb_queue_empty(&l->deferdq); + probe = mstate->probing; + probe |= l->silent_intv_cnt; + if (probe || mstate->monitoring) + l->silent_intv_cnt++; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; @@ -830,6 +840,7 @@ void tipc_link_reset(struct tipc_link *l) l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; + memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); } @@ -1238,6 +1249,9 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, struct tipc_msg *hdr; struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); + struct tipc_mon_state *mstate = &l->mon_state; + int dlen = 0; + void *data; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1250,12 +1264,13 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, - TIPC_MAX_IF_NAME, l->addr, + tipc_max_domain_size, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return; hdr = buf_msg(skb); + data = msg_data(hdr); msg_set_session(hdr, l->session); msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); @@ -1271,14 +1286,18 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (mtyp == STATE_MSG) { msg_set_seq_gap(hdr, rcvgap); - msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); + tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); + msg_set_size(hdr, INT_H_SIZE + dlen); + skb_trim(skb, INT_H_SIZE + dlen); l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - strcpy(msg_data(hdr), l->if_name); + strcpy(data, l->if_name); + msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME); + skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME); } if (probe) l->stats.sent_probes++; @@ -1371,7 +1390,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); u16 rcv_nxt = l->rcv_nxt; + u16 dlen = msg_data_sz(hdr); int mtyp = msg_type(hdr); + void *data; char *if_name; int rc = 0; @@ -1381,6 +1402,10 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); + skb_linearize(skb); + hdr = buf_msg(skb); + data = msg_data(hdr); + switch (mtyp) { case RESET_MSG: @@ -1391,8 +1416,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* fall thru' */ case ACTIVATE_MSG: - skb_linearize(skb); - hdr = buf_msg(skb); /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; @@ -1400,7 +1423,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); + strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) @@ -1448,6 +1471,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = TIPC_LINK_UP_EVT; break; } + tipc_mon_rcv(l->net, data, dlen, l->addr, + &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c new file mode 100644 index 000000000000..87d4efedd09f --- /dev/null +++ b/net/tipc/monitor.c @@ -0,0 +1,651 @@ +/* + * net/tipc/monitor.c + * + * Copyright (c) 2016, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "monitor.h" + +#define MAX_MON_DOMAIN 64 +#define MON_TIMEOUT 120000 +#define MAX_PEER_DOWN_EVENTS 4 + +/* struct tipc_mon_domain: domain record to be transferred between peers + * @len: actual size of domain record + * @gen: current generation of sender's domain + * @ack_gen: most recent generation of self's domain acked by peer + * @member_cnt: number of domain member nodes described in this record + * @up_map: bit map indicating which of the members the sender considers up + * @members: identity of the domain members + */ +struct tipc_mon_domain { + u16 len; + u16 gen; + u16 ack_gen; + u16 member_cnt; + u64 up_map; + u32 members[MAX_MON_DOMAIN]; +}; + +/* struct tipc_peer: state of a peer node and its domain + * @addr: tipc node identity of peer + * @head_map: shows which other nodes currently consider peer 'up' + * @domain: most recent domain record from peer + * @hash: position in hashed lookup list + * @list: position in linked list, in circular ascending order by 'addr' + * @applied: number of reported domain members applied on this monitor list + * @is_up: peer is up as seen from this node + * @is_head: peer is assigned domain head as seen from this node + * @is_local: peer is in local domain and should be continuously monitored + * @down_cnt: - numbers of other peers which have reported this on lost + */ +struct tipc_peer { + u32 addr; + struct tipc_mon_domain *domain; + struct hlist_node hash; + struct list_head list; + u8 applied; + u8 down_cnt; + bool is_up; + bool is_head; + bool is_local; +}; + +struct tipc_monitor { + struct hlist_head peers[NODE_HTABLE_SIZE]; + int peer_cnt; + struct tipc_peer *self; + rwlock_t lock; + struct tipc_mon_domain cache; + u16 list_gen; + u16 dom_gen; + struct net *net; + struct timer_list timer; + unsigned long timer_intv; +}; + +static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) +{ + return tipc_net(net)->monitors[bearer_id]; +} + +const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); + +/* dom_rec_len(): actual length of domain record for transport + */ +static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) +{ + return ((void *)&dom->members - (void *)dom) + (mcnt * sizeof(u32)); +} + +/* dom_size() : calculate size of own domain based on number of peers + */ +static int dom_size(int peers) +{ + int i = 0; + + while ((i * i) < peers) + i++; + return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN; +} + +static void map_set(u64 *up_map, int i, unsigned int v) +{ + *up_map &= ~(1 << i); + *up_map |= (v << i); +} + +static int map_get(u64 up_map, int i) +{ + return (up_map & (1 << i)) >> i; +} + +static struct tipc_peer *peer_prev(struct tipc_peer *peer) +{ + return list_last_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_nxt(struct tipc_peer *peer) +{ + return list_first_entry(&peer->list, struct tipc_peer, list); +} + +static struct tipc_peer *peer_head(struct tipc_peer *peer) +{ + while (!peer->is_head) + peer = peer_prev(peer); + return peer; +} + +static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) +{ + struct tipc_peer *peer; + unsigned int thash = tipc_hashfn(addr); + + hlist_for_each_entry(peer, &mon->peers[thash], hash) { + if (peer->addr == addr) + return peer; + } + return NULL; +} + +static struct tipc_peer *get_self(struct net *net, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + + return mon->self; +} + +static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) +{ + struct tipc_net *tn = tipc_net(net); + + return mon->peer_cnt > tn->mon_threshold; +} + +/* mon_identify_lost_members() : - identify amd mark potentially lost members + */ +static void mon_identify_lost_members(struct tipc_peer *peer, + struct tipc_mon_domain *dom_bef, + int applied_bef) +{ + struct tipc_peer *member = peer; + struct tipc_mon_domain *dom_aft = peer->domain; + int applied_aft = peer->applied; + int i; + + for (i = 0; i < applied_bef; i++) { + member = peer_nxt(member); + + /* Do nothing if self or peer already see member as down */ + if (!member->is_up || !map_get(dom_bef->up_map, i)) + continue; + + /* Loss of local node must be detected by active probing */ + if (member->is_local) + continue; + + /* Start probing if member was removed from applied domain */ + if (!applied_aft || (applied_aft < i)) { + member->down_cnt = 1; + continue; + } + + /* Member loss is confirmed if it is still in applied domain */ + if (!map_get(dom_aft->up_map, i)) + member->down_cnt++; + } +} + +/* mon_apply_domain() : match a peer's domain record against monitor list + */ +static void mon_apply_domain(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + struct tipc_mon_domain *dom = peer->domain; + struct tipc_peer *member; + u32 addr; + int i; + + if (!dom || !peer->is_up) + return; + + /* Scan across domain members and match against monitor list */ + peer->applied = 0; + member = peer_nxt(peer); + for (i = 0; i < dom->member_cnt; i++) { + addr = dom->members[i]; + if (addr != member->addr) + return; + peer->applied++; + member = peer_nxt(member); + } +} + +/* mon_update_local_domain() : update after peer addition/removal/up/down + */ +static void mon_update_local_domain(struct tipc_monitor *mon) +{ + struct tipc_peer *self = mon->self; + struct tipc_mon_domain *cache = &mon->cache; + struct tipc_mon_domain *dom = self->domain; + struct tipc_peer *peer = self; + u64 prev_up_map = dom->up_map; + u16 member_cnt, i; + bool diff; + + /* Update local domain size based on current size of cluster */ + member_cnt = dom_size(mon->peer_cnt) - 1; + self->applied = member_cnt; + + /* Update native and cached outgoing local domain records */ + dom->len = dom_rec_len(dom, member_cnt); + diff = dom->member_cnt != member_cnt; + dom->member_cnt = member_cnt; + for (i = 0; i < member_cnt; i++) { + peer = peer_nxt(peer); + diff |= dom->members[i] != peer->addr; + dom->members[i] = peer->addr; + map_set(&dom->up_map, i, peer->is_up); + cache->members[i] = htonl(peer->addr); + } + diff |= dom->up_map != prev_up_map; + if (!diff) + return; + dom->gen = ++mon->dom_gen; + cache->len = htons(dom->len); + cache->gen = htons(dom->gen); + cache->member_cnt = htons(member_cnt); + cache->up_map = cpu_to_be64(dom->up_map); + mon_apply_domain(mon, self); +} + +/* mon_update_neighbors() : update preceding neighbors of added/removed peer + */ +static void mon_update_neighbors(struct tipc_monitor *mon, + struct tipc_peer *peer) +{ + int dz, i; + + dz = dom_size(mon->peer_cnt); + for (i = 0; i < dz; i++) { + mon_apply_domain(mon, peer); + peer = peer_prev(peer); + } +} + +/* mon_assign_roles() : reassign peer roles after a network change + * The monitor list is consistent at this stage; i.e., each peer is monitoring + * a set of domain members as matched between domain record and the monitor list + */ +static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) +{ + struct tipc_peer *peer = peer_nxt(head); + struct tipc_peer *self = mon->self; + int i = 0; + + for (; peer != self; peer = peer_nxt(peer)) { + peer->is_local = false; + + /* Update domain member */ + if (i++ < head->applied) { + peer->is_head = false; + if (head == self) + peer->is_local = true; + continue; + } + /* Assign next domain head */ + if (!peer->is_up) + continue; + if (peer->is_head) + break; + head = peer; + head->is_head = true; + i = 0; + } + mon->list_gen++; +} + +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *prev, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) + goto exit; + prev = peer_prev(peer); + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + mon->peer_cnt--; + head = peer_head(prev); + if (head == self) + mon_update_local_domain(mon); + mon_update_neighbors(mon, prev); + + /* Revert to full-mesh monitoring if we reach threshold */ + if (!tipc_mon_is_active(net, mon)) { + list_for_each_entry(peer, &self->list, list) { + kfree(peer->domain); + peer->domain = NULL; + peer->applied = 0; + } + } + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, + struct tipc_peer **peer) +{ + struct tipc_peer *self = mon->self; + struct tipc_peer *cur, *prev, *p; + + p = kzalloc(sizeof(*p), GFP_ATOMIC); + *peer = p; + if (!p) + return false; + p->addr = addr; + + /* Add new peer to lookup list */ + INIT_LIST_HEAD(&p->list); + hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); + + /* Sort new peer into iterator list, in ascending circular order */ + prev = self; + list_for_each_entry(cur, &self->list, list) { + if ((addr > prev->addr) && (addr < cur->addr)) + break; + if (((addr < cur->addr) || (addr > prev->addr)) && + (prev->addr > cur->addr)) + break; + prev = cur; + } + list_add_tail(&p->list, &cur->list); + mon->peer_cnt++; + mon_update_neighbors(mon, p); + return true; +} + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) + goto exit; + peer->is_up = true; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *head; + struct tipc_mon_domain *dom; + int applied; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer) { + pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); + goto exit; + } + applied = peer->applied; + peer->applied = 0; + dom = peer->domain; + peer->domain = NULL; + if (peer->is_head) + mon_identify_lost_members(peer, dom, applied); + kfree(dom); + peer->is_up = false; + peer->is_head = false; + peer->is_local = false; + peer->down_cnt = 0; + head = peer_head(peer); + if (head == self) + mon_update_local_domain(mon); + mon_assign_roles(mon, head); +exit: + write_unlock_bh(&mon->lock); +} + +/* tipc_mon_rcv - process monitor domain event message + */ +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *arrv_dom = data; + struct tipc_mon_domain dom_bef; + struct tipc_mon_domain *dom; + struct tipc_peer *peer; + u16 new_member_cnt = ntohs(arrv_dom->member_cnt); + int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); + u16 new_gen = ntohs(arrv_dom->gen); + u16 acked_gen = ntohs(arrv_dom->ack_gen); + bool probing = state->probing; + int i, applied_bef; + + state->probing = false; + if (!dlen) + return; + + /* Sanity check received domain record */ + if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) { + pr_warn_ratelimited("Received illegal domain record\n"); + return; + } + + /* Synch generation numbers with peer if link just came up */ + if (!state->synched) { + state->peer_gen = new_gen - 1; + state->acked_gen = acked_gen; + state->synched = true; + } + + if (more(acked_gen, state->acked_gen)) + state->acked_gen = acked_gen; + + /* Drop duplicate unless we are waiting for a probe response */ + if (!more(new_gen, state->peer_gen) && !probing) + return; + + write_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (!peer || !peer->is_up) + goto exit; + + /* Peer is confirmed, stop any ongoing probing */ + peer->down_cnt = 0; + + /* Task is done for duplicate record */ + if (!more(new_gen, state->peer_gen)) + goto exit; + + state->peer_gen = new_gen; + + /* Cache current domain record for later use */ + dom_bef.member_cnt = 0; + dom = peer->domain; + if (dom) + memcpy(&dom_bef, dom, dom->len); + + /* Transform and store received domain record */ + if (!dom || (dom->len < new_dlen)) { + kfree(dom); + dom = kmalloc(new_dlen, GFP_ATOMIC); + peer->domain = dom; + if (!dom) + goto exit; + } + dom->len = new_dlen; + dom->gen = new_gen; + dom->member_cnt = new_member_cnt; + dom->up_map = be64_to_cpu(arrv_dom->up_map); + for (i = 0; i < new_member_cnt; i++) + dom->members[i] = ntohl(arrv_dom->members[i]); + + /* Update peers affected by this domain record */ + applied_bef = peer->applied; + mon_apply_domain(mon, peer); + mon_identify_lost_members(peer, &dom_bef, applied_bef); + mon_assign_roles(mon, peer_head(peer)); +exit: + write_unlock_bh(&mon->lock); +} + +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_mon_domain *dom = data; + u16 gen = mon->dom_gen; + u16 len; + + if (!tipc_mon_is_active(net, mon)) + return; + + /* Send only a dummy record with ack if peer has acked our last sent */ + if (likely(state->acked_gen == gen)) { + len = dom_rec_len(dom, 0); + *dlen = len; + dom->len = htons(len); + dom->gen = htons(gen); + dom->ack_gen = htons(state->peer_gen); + dom->member_cnt = 0; + return; + } + /* Send the full record */ + read_lock_bh(&mon->lock); + len = ntohs(mon->cache.len); + *dlen = len; + memcpy(data, &mon->cache, len); + read_unlock_bh(&mon->lock); + dom->ack_gen = htons(state->peer_gen); +} + +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id) +{ + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *peer; + + /* Used cached state if table has not changed */ + if (!state->probing && + (state->list_gen == mon->list_gen) && + (state->acked_gen == mon->dom_gen)) + return; + + read_lock_bh(&mon->lock); + peer = get_peer(mon, addr); + if (peer) { + state->probing = state->acked_gen != mon->dom_gen; + state->probing |= peer->down_cnt; + state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; + state->monitoring = peer->is_local; + state->monitoring |= peer->is_head; + state->list_gen = mon->list_gen; + } + read_unlock_bh(&mon->lock); +} + +static void mon_timeout(unsigned long m) +{ + struct tipc_monitor *mon = (void *)m; + struct tipc_peer *self; + int best_member_cnt = dom_size(mon->peer_cnt) - 1; + + write_lock_bh(&mon->lock); + self = mon->self; + if (self && (best_member_cnt != self->applied)) { + mon_update_local_domain(mon); + mon_assign_roles(mon, self); + } + write_unlock_bh(&mon->lock); + mod_timer(&mon->timer, jiffies + mon->timer_intv); +} + +int tipc_mon_create(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon; + struct tipc_peer *self; + struct tipc_mon_domain *dom; + + if (tn->monitors[bearer_id]) + return 0; + + mon = kzalloc(sizeof(*mon), GFP_ATOMIC); + self = kzalloc(sizeof(*self), GFP_ATOMIC); + dom = kzalloc(sizeof(*dom), GFP_ATOMIC); + if (!mon || !self || !dom) { + kfree(mon); + kfree(self); + kfree(dom); + return -ENOMEM; + } + tn->monitors[bearer_id] = mon; + rwlock_init(&mon->lock); + mon->net = net; + mon->peer_cnt = 1; + mon->self = self; + self->domain = dom; + self->addr = tipc_own_addr(net); + self->is_up = true; + self->is_head = true; + INIT_LIST_HEAD(&self->list); + setup_timer(&mon->timer, mon_timeout, (unsigned long)mon); + mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); + mod_timer(&mon->timer, jiffies + mon->timer_intv); + return 0; +} + +void tipc_mon_delete(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_monitor *mon = tipc_monitor(net, bearer_id); + struct tipc_peer *self = get_self(net, bearer_id); + struct tipc_peer *peer, *tmp; + + write_lock_bh(&mon->lock); + tn->monitors[bearer_id] = NULL; + list_for_each_entry_safe(peer, tmp, &self->list, list) { + list_del(&peer->list); + hlist_del(&peer->hash); + kfree(peer->domain); + kfree(peer); + } + mon->self = NULL; + write_unlock_bh(&mon->lock); + del_timer_sync(&mon->timer); + kfree(self->domain); + kfree(self); + kfree(mon); +} diff --git a/net/tipc/monitor.h b/net/tipc/monitor.h new file mode 100644 index 000000000000..598459cbed5d --- /dev/null +++ b/net/tipc/monitor.h @@ -0,0 +1,73 @@ +/* + * net/tipc/monitor.h + * + * Copyright (c) 2015, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_MONITOR_H +#define _TIPC_MONITOR_H + +/* struct tipc_mon_state: link instance's cache of monitor list and domain state + * @list_gen: current generation of this node's monitor list + * @gen: current generation of this node's local domain + * @peer_gen: most recent domain generation received from peer + * @acked_gen: most recent generation of self's domain acked by peer + * @monitoring: this peer endpoint should continuously monitored + * @probing: peer endpoint should be temporarily probed for potential loss + * @synched: domain record's generation has been synched with peer after reset + */ +struct tipc_mon_state { + u16 list_gen; + u16 peer_gen; + u16 acked_gen; + bool monitoring :1; + bool probing :1; + bool reset :1; + bool synched :1; +}; + +int tipc_mon_create(struct net *net, int bearer_id); +void tipc_mon_delete(struct net *net, int bearer_id); + +void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id); +void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id); +void tipc_mon_prep(struct net *net, void *data, int *dlen, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, + struct tipc_mon_state *state, int bearer_id); +void tipc_mon_get_state(struct net *net, u32 addr, + struct tipc_mon_state *state, + int bearer_id); +void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id); + +extern const int tipc_max_domain_size; +#endif diff --git a/net/tipc/node.c b/net/tipc/node.c index d6a490f991a4..a3fc0a3f4077 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -40,6 +40,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "monitor.h" #include "discover.h" #include "netlink.h" @@ -205,17 +206,6 @@ u16 tipc_node_get_capabilities(struct net *net, u32 addr) return caps; } -/* - * A trivial power-of-two bitmask technique is used for speed, since this - * operation is done for every incoming TIPC packet. The number of hash table - * entries has been chosen so that no hash chain exceeds 8 nodes and will - * usually be much smaller (typically only a single node). - */ -static unsigned int tipc_hashfn(u32 addr) -{ - return addr & (NODE_HTABLE_SIZE - 1); -} - static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); @@ -279,6 +269,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) u32 addr = 0; u32 flags = n->action_flags; u32 link_id = 0; + u32 bearer_id; struct list_head *publ_list; if (likely(!flags)) { @@ -288,6 +279,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) addr = n->addr; link_id = n->link_id; + bearer_id = link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | @@ -301,13 +293,16 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, addr); - if (flags & TIPC_NOTIFY_LINK_UP) + if (flags & TIPC_NOTIFY_LINK_UP) { + tipc_mon_peer_up(net, addr, bearer_id); tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, TIPC_NODE_SCOPE, link_id, addr); - - if (flags & TIPC_NOTIFY_LINK_DOWN) + } + if (flags & TIPC_NOTIFY_LINK_DOWN) { + tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, link_id, addr); + } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) @@ -691,6 +686,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) struct tipc_link *l = le->link; struct tipc_media_addr *maddr; struct sk_buff_head xmitq; + int old_bearer_id = bearer_id; if (!l) return; @@ -710,6 +706,8 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_link_fsm_evt(l, LINK_RESET_EVT); } tipc_node_write_unlock(n); + if (delete) + tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); } diff --git a/tools/virtio/ringtest/Makefile b/tools/virtio/ringtest/Makefile index 6ba745529833..50e086c6a7b6 100644 --- a/tools/virtio/ringtest/Makefile +++ b/tools/virtio/ringtest/Makefile @@ -1,6 +1,6 @@ all: -all: ring virtio_ring_0_9 virtio_ring_poll virtio_ring_inorder +all: ring virtio_ring_0_9 virtio_ring_poll virtio_ring_inorder ptr_ring CFLAGS += -Wall CFLAGS += -pthread -O2 -ggdb @@ -8,6 +8,7 @@ LDFLAGS += -pthread -O2 -ggdb main.o: main.c main.h ring.o: ring.c main.h +ptr_ring.o: ptr_ring.c main.h ../../../include/linux/ptr_ring.h virtio_ring_0_9.o: virtio_ring_0_9.c main.h virtio_ring_poll.o: virtio_ring_poll.c virtio_ring_0_9.c main.h virtio_ring_inorder.o: virtio_ring_inorder.c virtio_ring_0_9.c main.h @@ -15,11 +16,13 @@ ring: ring.o main.o virtio_ring_0_9: virtio_ring_0_9.o main.o virtio_ring_poll: virtio_ring_poll.o main.o virtio_ring_inorder: virtio_ring_inorder.o main.o +ptr_ring: ptr_ring.o main.o clean: -rm main.o -rm ring.o ring -rm virtio_ring_0_9.o virtio_ring_0_9 -rm virtio_ring_poll.o virtio_ring_poll -rm virtio_ring_inorder.o virtio_ring_inorder + -rm ptr_ring.o ptr_ring .PHONY: all clean diff --git a/tools/virtio/ringtest/ptr_ring.c b/tools/virtio/ringtest/ptr_ring.c new file mode 100644 index 000000000000..74abd746ae91 --- /dev/null +++ b/tools/virtio/ringtest/ptr_ring.c @@ -0,0 +1,192 @@ +#define _GNU_SOURCE +#include "main.h" +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <malloc.h> +#include <assert.h> +#include <errno.h> +#include <limits.h> + +#define SMP_CACHE_BYTES 64 +#define cache_line_size() SMP_CACHE_BYTES +#define ____cacheline_aligned_in_smp __attribute__ ((aligned (SMP_CACHE_BYTES))) +#define unlikely(x) (__builtin_expect(!!(x), 0)) +#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a)) +typedef pthread_spinlock_t spinlock_t; + +typedef int gfp_t; +static void *kzalloc(unsigned size, gfp_t gfp) +{ + void *p = memalign(64, size); + if (!p) + return p; + memset(p, 0, size); + + return p; +} + +static void kfree(void *p) +{ + if (p) + free(p); +} + +static void spin_lock_init(spinlock_t *lock) +{ + int r = pthread_spin_init(lock, 0); + assert(!r); +} + +static void spin_lock(spinlock_t *lock) +{ + int ret = pthread_spin_lock(lock); + assert(!ret); +} + +static void spin_unlock(spinlock_t *lock) +{ + int ret = pthread_spin_unlock(lock); + assert(!ret); +} + +static void spin_lock_bh(spinlock_t *lock) +{ + spin_lock(lock); +} + +static void spin_unlock_bh(spinlock_t *lock) +{ + spin_unlock(lock); +} + +static void spin_lock_irq(spinlock_t *lock) +{ + spin_lock(lock); +} + +static void spin_unlock_irq(spinlock_t *lock) +{ + spin_unlock(lock); +} + +static void spin_lock_irqsave(spinlock_t *lock, unsigned long f) +{ + spin_lock(lock); +} + +static void spin_unlock_irqrestore(spinlock_t *lock, unsigned long f) +{ + spin_unlock(lock); +} + +#include "../../../include/linux/ptr_ring.h" + +static unsigned long long headcnt, tailcnt; +static struct ptr_ring array ____cacheline_aligned_in_smp; + +/* implemented by ring */ +void alloc_ring(void) +{ + int ret = ptr_ring_init(&array, ring_size, 0); + assert(!ret); +} + +/* guest side */ +int add_inbuf(unsigned len, void *buf, void *datap) +{ + int ret; + + ret = __ptr_ring_produce(&array, buf); + if (ret >= 0) { + ret = 0; + headcnt++; + } + + return ret; +} + +/* + * ptr_ring API provides no way for producer to find out whether a given + * buffer was consumed. Our tests merely require that a successful get_buf + * implies that add_inbuf succeed in the past, and that add_inbuf will succeed, + * fake it accordingly. + */ +void *get_buf(unsigned *lenp, void **bufp) +{ + void *datap; + + if (tailcnt == headcnt || __ptr_ring_full(&array)) + datap = NULL; + else { + datap = "Buffer\n"; + ++tailcnt; + } + + return datap; +} + +void poll_used(void) +{ + void *b; + + do { + if (tailcnt == headcnt || __ptr_ring_full(&array)) { + b = NULL; + barrier(); + } else { + b = "Buffer\n"; + } + } while (!b); +} + +void disable_call() +{ + assert(0); +} + +bool enable_call() +{ + assert(0); +} + +void kick_available(void) +{ + assert(0); +} + +/* host side */ +void disable_kick() +{ + assert(0); +} + +bool enable_kick() +{ + assert(0); +} + +void poll_avail(void) +{ + void *b; + + do { + barrier(); + b = __ptr_ring_peek(&array); + } while (!b); +} + +bool use_buf(unsigned *lenp, void **bufp) +{ + void *ptr; + + ptr = __ptr_ring_consume(&array); + + return ptr; +} + +void call_used(void) +{ + assert(0); +} |