We can increase the efficiency of the rx path by using buffers to receive
packets, then building SKBs around them just before passing them into the
network stack. In contrast, preallocating SKBs too early reduces CPU cache
efficiency.
Performance is slightly increased, but the changes allow further
optimizations later on.
Signed-off-by: Sieng Piaw Liew <liew.s.piaw@gmail.com>
[improve code format]
Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
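To make the conversion concrete, below is a minimal sketch of the receive pattern the patch switches to: packets land in plain page fragments, and an SKB is only built around a fragment right before it enters the stack. The helper name rx_one_packet and its parameters are hypothetical and only illustrate the idea; it assumes the caller owns a fragment of frag_size bytes whose packet data was DMA'd at buf + NET_SKB_PAD and has already been synced for the CPU.

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Illustrative only: hand a received fragment to the network stack by
 * building an SKB around it at the last moment (no data copy). */
static int rx_one_packet(struct net_device *dev, unsigned char *buf,
			 unsigned int frag_size, unsigned int len)
{
	struct sk_buff *skb;

	skb = build_skb(buf, frag_size);
	if (unlikely(!skb)) {
		skb_free_frag(buf);
		dev->stats.rx_dropped++;
		return -ENOMEM;
	}

	/* the hardware wrote the packet after NET_SKB_PAD bytes of headroom */
	skb_reserve(skb, NET_SKB_PAD);
	skb_put(skb, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_receive_skb(skb);
	return 0;
}

A matching refill path would allocate each fragment with netdev_alloc_frag(), sized along the lines of ENETSW_FRAG_SIZE(), and DMA-map buf + NET_SKB_PAD, which is what the hunks below do.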
#define ENETSW_TAG_SIZE 6
#define ENETSW_MTU_OVERHEAD (VLAN_ETH_HLEN + VLAN_HLEN + \
ENETSW_TAG_SIZE)
+#define ENETSW_FRAG_SIZE(x) (SKB_DATA_ALIGN(NET_SKB_PAD + x + \
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))))
/* default number of descriptor */
#define ENETSW_DEF_RX_DESC 64
/* next dirty rx descriptor to refill */
int rx_dirty_desc;
- /* size of allocated rx skbs */
- unsigned int rx_skb_size;
+ /* size of allocated rx buffer */
+ unsigned int rx_buf_size;
- /* list of skb given to hw for rx */
- struct sk_buff **rx_skb;
+ /* size of allocated rx frag */
+ unsigned int rx_frag_size;
- /* used when rx skb allocation failed, so we defer rx queue
+ /* list of buffer given to hw for rx */
+ unsigned char **rx_buf;
+
+ /* used when rx buffer allocation failed, so we defer rx queue
* refill */
struct timer_list rx_timeout;
while (priv->rx_desc_count < priv->rx_ring_size) {
struct bcm6368_enetsw_desc *desc;
- struct sk_buff *skb;
- dma_addr_t p;
int desc_idx;
u32 len_stat;
desc_idx = priv->rx_dirty_desc;
desc = &priv->rx_desc_cpu[desc_idx];
- if (!priv->rx_skb[desc_idx]) {
- skb = netdev_alloc_skb(dev, priv->rx_skb_size);
- if (!skb)
+ if (!priv->rx_buf[desc_idx]) {
+ unsigned char *buf =
+ netdev_alloc_frag(priv->rx_frag_size);
+
+ if (unlikely(!buf))
break;
- priv->rx_skb[desc_idx] = skb;
- p = dma_map_single(&priv->pdev->dev, skb->data,
- priv->rx_skb_size,
- DMA_FROM_DEVICE);
- desc->address = p;
+
+ priv->rx_buf[desc_idx] = buf;
+ desc->address = dma_map_single(&priv->pdev->dev,
+ buf + NET_SKB_PAD,
+ priv->rx_buf_size,
+ DMA_FROM_DEVICE);
- len_stat = priv->rx_skb_size << DMADESC_LENGTH_SHIFT;
+ len_stat = priv->rx_buf_size << DMADESC_LENGTH_SHIFT;
len_stat |= DMADESC_OWNER_MASK;
if (priv->rx_dirty_desc == priv->rx_ring_size - 1) {
len_stat |= DMADESC_WRAP_MASK;
do {
struct bcm6368_enetsw_desc *desc;
+ unsigned int frag_size;
int desc_idx;
u32 len_stat;
unsigned int len;
- skb = priv->rx_skb[desc_idx];
+ buf = priv->rx_buf[desc_idx];
len = (len_stat & DMADESC_LENGTH_MASK)
>> DMADESC_LENGTH_SHIFT;
/* don't include FCS */
len -= 4;
if (len < priv->copybreak) {
- nskb = netdev_alloc_skb(dev, len);
- if (!nskb) {
+ unsigned int nfrag_size = ENETSW_FRAG_SIZE(len);
+ unsigned char *nbuf = napi_alloc_frag(nfrag_size);
+
+ if (unlikely(!nbuf)) {
/* forget packet, just rearm desc */
dev->stats.rx_dropped++;
continue;
dma_sync_single_for_cpu(kdev, desc->address,
len, DMA_FROM_DEVICE);
- memcpy(nskb->data, skb->data, len);
+ memcpy(nbuf + NET_SKB_PAD, buf + NET_SKB_PAD, len);
dma_sync_single_for_device(kdev, desc->address,
len, DMA_FROM_DEVICE);
+ buf = nbuf;
+ frag_size = nfrag_size;
- dma_unmap_single(&priv->pdev->dev, desc->address,
- priv->rx_skb_size, DMA_FROM_DEVICE);
- priv->rx_skb[desc_idx] = NULL;
+ dma_unmap_single(kdev, desc->address,
+ priv->rx_buf_size, DMA_FROM_DEVICE);
+ priv->rx_buf[desc_idx] = NULL;
+ frag_size = priv->rx_frag_size;
+ }
+
+ skb = build_skb(buf, frag_size);
+ if (unlikely(!skb)) {
+ skb_free_frag(buf);
+ dev->stats.rx_dropped++;
+ continue;
+ }
+
+ skb_reserve(skb, NET_SKB_PAD);
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, dev);
dev->stats.rx_packets++;
priv->tx_skb = kzalloc(sizeof(struct sk_buff *) * priv->tx_ring_size,
GFP_KERNEL);
if (!priv->tx_skb) {
- dev_err(kdev, "cannot allocate rx skb queue\n");
+ dev_err(kdev, "cannot allocate tx skb queue\n");
ret = -ENOMEM;
goto out_free_tx_ring;
}
priv->tx_curr_desc = 0;
spin_lock_init(&priv->tx_lock);
- /* init & fill rx ring with skbs */
- priv->rx_skb = kzalloc(sizeof(struct sk_buff *) * priv->rx_ring_size,
+ /* init & fill rx ring with buffers */
+ priv->rx_buf = kzalloc(sizeof(unsigned char *) * priv->rx_ring_size,
- if (!priv->rx_skb) {
- dev_err(kdev, "cannot allocate rx skb queue\n");
+ if (!priv->rx_buf) {
+ dev_err(kdev, "cannot allocate rx buffer queue\n");
ret = -ENOMEM;
goto out_free_tx_skb;
}
DMA_BUFALLOC_REG(priv->rx_chan));
if (bcm6368_enetsw_refill_rx(dev)) {
- dev_err(kdev, "cannot allocate rx skb queue\n");
+ dev_err(kdev, "cannot allocate rx buffer queue\n");
ret = -ENOMEM;
goto out;
}
for (i = 0; i < priv->rx_ring_size; i++) {
struct bcm6368_enetsw_desc *desc;
continue;
desc = &priv->rx_desc_cpu[i];
- dma_unmap_single(kdev, desc->address, priv->rx_skb_size,
+ dma_unmap_single(kdev, desc->address, priv->rx_buf_size,
- kfree_skb(priv->rx_skb[i]);
+ skb_free_frag(priv->rx_buf[i]);
out_free_tx_skb:
kfree(priv->tx_skb);
/* force reclaim of all tx buffers */
bcm6368_enetsw_tx_reclaim(dev, 1);
- /* free the rx skb ring */
+ /* free the rx buffer ring */
for (i = 0; i < priv->rx_ring_size; i++) {
struct bcm6368_enetsw_desc *desc;
continue;
desc = &priv->rx_desc_cpu[i];
- dma_unmap_single_attrs(kdev, desc->address, priv->rx_skb_size,
+ dma_unmap_single_attrs(kdev, desc->address, priv->rx_buf_size,
DMA_FROM_DEVICE,
DMA_ATTR_SKIP_CPU_SYNC);
- kfree_skb(priv->rx_skb[i]);
+ skb_free_frag(priv->rx_buf[i]);
}
/* free remaining allocated memory */
kfree(priv->tx_skb);
dma_free_coherent(kdev, priv->rx_desc_alloc_size,
priv->rx_desc_cpu, priv->rx_desc_dma);
dev_info(dev, "random mac %pM\n", ndev->dev_addr);
}
dev_info(dev, "random mac %pM\n", ndev->dev_addr);
}
- priv->rx_skb_size = ALIGN(ndev->mtu + ENETSW_MTU_OVERHEAD,
+ priv->rx_buf_size = ALIGN(ndev->mtu + ENETSW_MTU_OVERHEAD,
+ priv->rx_frag_size = ENETSW_FRAG_SIZE(priv->rx_buf_size);
+
priv->num_clocks = of_clk_get_parent_count(node);
if (priv->num_clocks) {
priv->clock = devm_kcalloc(dev, priv->num_clocks,