[gve] Add driver for Google Virtual Ethernet NIC

The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in Google
Cloud instances.  There is essentially zero documentation available
beyond the mostly uncommented source code in the Linux kernel.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
Michael Brown 2024-07-24 14:30:58 +01:00
parent 5a9f476d4f
commit c7b76e3adc
4 changed files with 2313 additions and 0 deletions

View File

@ -29,6 +29,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
/* Corrupt every N received PeerDist packets */
#define PEERBLK_CORRUPT_RATE 0
/* Experience virtual machine migration on every N watchdog checks */
#define VM_MIGRATED_RATE 0
#include <config/local/fault.h>
#endif /* CONFIG_FAULT_H */

1607
src/drivers/net/gve.c Normal file

File diff suppressed because it is too large Load Diff

702
src/drivers/net/gve.h Normal file
View File

@ -0,0 +1,702 @@
#ifndef _GVE_H
#define _GVE_H
/** @file
*
* Google Virtual Ethernet network driver
*
* The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in
* Google Cloud instances. There is essentially zero documentation
* available beyond the mostly uncommented source code in the Linux
* kernel.
*/
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#include <stdint.h>
#include <ipxe/dma.h>
#include <ipxe/pci.h>
#include <ipxe/in.h>
#include <ipxe/uaccess.h>
#include <ipxe/process.h>
#include <ipxe/retry.h>
struct gve_nic;
/**
 * A Google Cloud MAC address
 *
 * Google Cloud locally assigned MAC addresses encode the local IPv4
 * address in the trailing 32 bits, presumably as a performance
 * optimisation to allow ARP resolution to be skipped by a suitably
 * aware network stack.
 *
 * The structure is packed so that the IPv4 address immediately
 * follows the two reserved leading bytes, with no padding.
 */
struct google_mac {
/** Reserved (the two bytes preceding the IPv4 address) */
uint8_t reserved[2];
/** Local IPv4 address */
struct in_addr in;
} __attribute__ (( packed ));
/** Page size (4kB) */
#define GVE_PAGE_SIZE 0x1000
/**
 * Address alignment
 *
 * All DMA data structure base addresses seem to need to be aligned to
 * a page boundary.  (This is not documented anywhere, but is inferred
 * from existing source code and experimentation.)
 */
#define GVE_ALIGN GVE_PAGE_SIZE
/**
 * Length alignment
 *
 * All DMA data structure lengths seem to need to be aligned to a
 * multiple of 64 bytes.  (This is not documented anywhere, but is
 * inferred from existing source code and experimentation.)
 */
#define GVE_LEN_ALIGN 64
/**
 * Maximum number of pages per queue (must be a power of two)
 *
 * This also fixes the number of entries in the page address list
 * (struct gve_pages).
 */
#define GVE_QPL_MAX 16
/** Configuration BAR */
#define GVE_CFG_BAR PCI_BASE_ADDRESS_0
/**
 * Configuration BAR size
 *
 * All registers within the configuration BAR are big-endian.  The
 * GVE_CFG_xxx values below are register offsets within this BAR;
 * registers are 32 bits wide unless noted otherwise.
 */
#define GVE_CFG_SIZE 0x1000
/** Device status */
#define GVE_CFG_DEVSTAT 0x0000
#define GVE_CFG_DEVSTAT_RESET 0x00000010UL /**< Device is reset */
/** Driver status */
#define GVE_CFG_DRVSTAT 0x0004
#define GVE_CFG_DRVSTAT_RUN 0x00000001UL /**< Run admin queue */
/** Maximum time to wait for reset (in milliseconds) */
#define GVE_RESET_MAX_WAIT_MS 500
/** Admin queue page frame number (for older devices) */
#define GVE_CFG_ADMIN_PFN 0x0010
/** Admin queue doorbell */
#define GVE_CFG_ADMIN_DB 0x0014
/** Admin queue event counter */
#define GVE_CFG_ADMIN_EVT 0x0018
/** Driver version (8-bit register) */
#define GVE_CFG_VERSION 0x001f
/** Admin queue base address high 32 bits */
#define GVE_CFG_ADMIN_BASE_HI 0x0020
/** Admin queue base address low 32 bits */
#define GVE_CFG_ADMIN_BASE_LO 0x0024
/** Admin queue length (16-bit register) */
#define GVE_CFG_ADMIN_LEN 0x0028
/** Doorbell BAR */
#define GVE_DB_BAR PCI_BASE_ADDRESS_2
/**
 * Admin queue entry header
 *
 * All values within admin queue entries are big-endian.
 *
 * This 8-byte header is the first member of every admin queue
 * command structure.
 */
struct gve_admin_header {
/** Reserved */
uint8_t reserved[3];
/** Operation code (one of the GVE_ADMIN_xxx command values) */
uint8_t opcode;
/** Status */
uint32_t status;
} __attribute__ (( packed ));
/** Command succeeded (value of the header status field) */
#define GVE_ADMIN_STATUS_OK 0x00000001
/**
 * Simple admin command
 *
 * A command whose only parameter is a single 32-bit ID.
 *
 * NOTE(review): presumably used for the commands that have no
 * dedicated structure in this header (e.g. unregister/destroy);
 * confirm against the driver source.
 */
struct gve_admin_simple {
/** Header */
struct gve_admin_header hdr;
/** ID */
uint32_t id;
} __attribute__ (( packed ));
/** Describe device command (opcode) */
#define GVE_ADMIN_DESCRIBE 0x0001
/**
 * Describe device command
 *
 * Provides the device with the address and maximum length of a
 * buffer for the device descriptor (struct gve_device_descriptor).
 */
struct gve_admin_describe {
/** Header */
struct gve_admin_header hdr;
/** Descriptor buffer address */
uint64_t addr;
/** Descriptor version */
uint32_t ver;
/** Descriptor maximum length */
uint32_t len;
} __attribute__ (( packed ));
/** Device descriptor version */
#define GVE_ADMIN_DESCRIBE_VER 1
/**
 * Device descriptor
 *
 * Layout of the buffer described by the "describe device" command.
 * The meaning of the reserved regions is not known to this header.
 */
struct gve_device_descriptor {
/** Reserved */
uint8_t reserved_a[10];
/** Number of transmit queue entries */
uint16_t tx_count;
/** Number of receive queue entries */
uint16_t rx_count;
/** Reserved */
uint8_t reserved_b[2];
/** Maximum transmit unit */
uint16_t mtu;
/** Number of event counters */
uint16_t counters;
/** Reserved */
uint8_t reserved_c[4];
/** MAC address */
struct google_mac mac;
/** Reserved */
uint8_t reserved_d[10];
} __attribute__ (( packed ));
/** Configure device resources command (opcode) */
#define GVE_ADMIN_CONFIGURE 0x0002
/**
 * Configure device resources command
 *
 * Tells the device where the event counter array and the interrupt
 * channel array are located, and how many entries each contains.
 */
struct gve_admin_configure {
/** Header */
struct gve_admin_header hdr;
/** Event counter array */
uint64_t events;
/** IRQ doorbell address */
uint64_t irqs;
/** Number of event counters */
uint32_t num_events;
/** Number of IRQ doorbells */
uint32_t num_irqs;
/** IRQ doorbell stride */
uint32_t irq_stride;
} __attribute__ (( packed ));
/** Register page list command (opcode) */
#define GVE_ADMIN_REGISTER 0x0003
/**
 * Register page list command
 *
 * Registers a queue page list with the device, identified by a
 * driver-chosen page list ID.
 */
struct gve_admin_register {
/** Header */
struct gve_admin_header hdr;
/** Page list ID */
uint32_t id;
/** Number of pages */
uint32_t count;
/** Address list address */
uint64_t addr;
/** Page size */
uint64_t size;
} __attribute__ (( packed ));
/**
 * Page list
 *
 * An array of GVE_QPL_MAX 64-bit page addresses.
 */
struct gve_pages {
/** Page address */
uint64_t addr[GVE_QPL_MAX];
} __attribute__ (( packed ));
/** Unregister page list command (opcode) */
#define GVE_ADMIN_UNREGISTER 0x0004
/** Create transmit queue command (opcode) */
#define GVE_ADMIN_CREATE_TX 0x0005
/**
 * Create transmit queue command
 *
 * Supplies the addresses of the queue resources and descriptor ring,
 * along with the queue page list and notification channel to use.
 */
struct gve_admin_create_tx {
/** Header */
struct gve_admin_header hdr;
/** Queue ID */
uint32_t id;
/** Reserved */
uint8_t reserved_a[4];
/** Queue resources address */
uint64_t res;
/** Descriptor ring address */
uint64_t desc;
/** Queue page list ID */
uint32_t qpl_id;
/** Notification channel ID */
uint32_t notify_id;
} __attribute__ (( packed ));
/** Create receive queue command (opcode) */
#define GVE_ADMIN_CREATE_RX 0x0006
/**
 * Create receive queue command
 *
 * Supplies the addresses of the queue resources, completion ring and
 * descriptor ring, along with the queue page list, notification
 * channel and packet buffer size to use.
 */
struct gve_admin_create_rx {
/** Header */
struct gve_admin_header hdr;
/** Queue ID */
uint32_t id;
/** Index (NOTE(review): semantics not evident from this header) */
uint32_t index;
/** Reserved */
uint8_t reserved_a[4];
/** Notification channel ID */
uint32_t notify_id;
/** Queue resources address */
uint64_t res;
/** Completion ring address */
uint64_t cmplt;
/** Descriptor ring address */
uint64_t desc;
/** Queue page list ID */
uint32_t qpl_id;
/** Reserved */
uint8_t reserved_b[2];
/** Packet buffer size */
uint16_t bufsz;
} __attribute__ (( packed ));
/** Destroy transmit queue command (opcode) */
#define GVE_ADMIN_DESTROY_TX 0x0007
/** Destroy receive queue command (opcode) */
#define GVE_ADMIN_DESTROY_RX 0x0008
/** Deconfigure device resources command (opcode) */
#define GVE_ADMIN_DECONFIGURE 0x0009
/**
 * An admin queue command
 *
 * A union of all command layouts.  Every layout begins with the
 * common header, and the padding member fixes the size of each
 * admin queue entry at 64 bytes.
 */
union gve_admin_command {
/** Header */
struct gve_admin_header hdr;
/** Simple command */
struct gve_admin_simple simple;
/** Describe device */
struct gve_admin_describe desc;
/** Configure device resources */
struct gve_admin_configure conf;
/** Register page list */
struct gve_admin_register reg;
/** Create transmit queue */
struct gve_admin_create_tx create_tx;
/** Create receive queue */
struct gve_admin_create_rx create_rx;
/** Padding (to 64 bytes) */
uint8_t pad[64];
};
/**
 * Number of admin queue commands
 *
 * This is theoretically a policy decision.  However, older revisions
 * of the hardware seem to have only the "admin queue page frame
 * number" register and no "admin queue length" register, with the
 * implication that the admin queue must be exactly one page in
 * length.
 *
 * Choose to use a one page (4kB) admin queue for both older and newer
 * versions of the hardware, to minimise variability.
 *
 * With 64-byte commands, this gives 64 entries.
 */
#define GVE_ADMIN_COUNT ( GVE_PAGE_SIZE / sizeof ( union gve_admin_command ) )
/**
 * Admin queue
 *
 * Driver-side state for the ring of admin queue commands.
 */
struct gve_admin {
/** Commands (array of admin queue entries) */
union gve_admin_command *cmd;
/** Producer counter */
uint32_t prod;
/** DMA mapping */
struct dma_mapping map;
};
/**
 * Scratch buffer for admin queue commands
 *
 * A single DMA buffer that is interpreted either as a device
 * descriptor or as a page address list.
 */
struct gve_scratch {
/** Buffer contents */
union {
/** Device descriptor */
struct gve_device_descriptor desc;
/** Page address list */
struct gve_pages pages;
} *buf;
/** DMA mapping */
struct dma_mapping map;
};
/**
 * An event counter
 *
 * Written by the device to indicate completions.  The device chooses
 * which counter to use for each transmit queue, and stores the index
 * of the chosen counter in the queue resources.
 *
 * The counter is declared volatile since it is updated by the device
 * rather than by the driver.
 */
struct gve_event {
/** Number of events that have occurred */
volatile uint32_t count;
} __attribute__ (( packed ));
/**
 * Maximum number of event counters
 *
 * We tell the device how many event counters we have provided via the
 * "configure device resources" admin queue command.  The device will
 * accept being given only a single counter, but will subsequently
 * fail to create a receive queue.
 *
 * There is, of course, no documentation indicating how many event
 * counters actually need to be provided.  In the absence of evidence
 * to the contrary, assume that 16 counters (i.e. the smallest number
 * we can allocate, given the length alignment constraint on
 * allocations) will be sufficient.
 */
#define GVE_EVENT_MAX ( GVE_LEN_ALIGN / sizeof ( struct gve_event ) )
/**
 * Event counter array
 *
 * Driver-side state for the array of event counters.
 */
struct gve_events {
/** Event counters */
struct gve_event *event;
/** DMA mapping */
struct dma_mapping map;
/** Actual number of event counters */
unsigned int count;
};
/**
 * An interrupt channel
 *
 * Padded to 64 bytes in total.
 */
struct gve_irq {
/** Interrupt doorbell index (within doorbell BAR) */
uint32_t db_idx;
/** Reserved */
uint8_t reserved[60];
} __attribute__ (( packed ));
/**
 * Number of interrupt channels
 *
 * We tell the device how many interrupt channels we have provided via
 * the "configure device resources" admin queue command.  The device
 * will accept being given zero interrupt channels, but will
 * subsequently fail to create more than a single queue (either
 * transmit or receive).
 *
 * There is, of course, no documentation indicating how many interrupt
 * channels actually need to be provided.  In the absence of evidence
 * to the contrary, assume that two channels (one for transmit, one
 * for receive) will be sufficient.
 */
#define GVE_IRQ_COUNT 2
/**
 * Interrupt channel array
 *
 * Driver-side state for the array of interrupt channels.
 */
struct gve_irqs {
/** Interrupt channels */
struct gve_irq *irq;
/** DMA mapping */
struct dma_mapping map;
/** Interrupt doorbells (one pointer per interrupt channel) */
volatile uint32_t *db[GVE_IRQ_COUNT];
};
/**
 * Disable interrupts
 *
 * NOTE(review): presumably a value written to an interrupt doorbell;
 * confirm against the driver source.
 */
#define GVE_IRQ_DISABLE 0x40000000UL
/**
 * Queue resources
 *
 * Written by the device to indicate the indices of the chosen event
 * counter and descriptor doorbell register.
 *
 * This appears to be a largely pointless data structure: the relevant
 * information is static for the lifetime of the queue and could
 * trivially have been returned in the response for the "create
 * transmit/receive queue" command, instead of requiring yet another
 * page-aligned coherent DMA buffer allocation.
 *
 * Padded to 64 bytes in total.
 */
struct gve_resources {
/** Descriptor doorbell index (within doorbell BAR) */
uint32_t db_idx;
/** Event counter index (within event counter array) */
uint32_t evt_idx;
/** Reserved */
uint8_t reserved[56];
} __attribute__ (( packed ));
/**
 * Queue data buffer size
 *
 * In theory, we may specify the size of receive buffers.  However,
 * the original version of the device seems not to have a parameter
 * for this, and assumes the use of half-page (2kB) buffers.  Choose
 * to use this as the buffer size, on the assumption that older
 * devices will not support any other buffer size.
 */
#define GVE_BUF_SIZE ( GVE_PAGE_SIZE / 2 )
/** Number of data buffers per page (i.e. two half-page buffers) */
#define GVE_BUF_PER_PAGE ( GVE_PAGE_SIZE / GVE_BUF_SIZE )
/**
 * Queue page list
 *
 * The device uses preregistered pages for fast-path DMA operations
 * (i.e. transmit and receive buffers).  A list of device addresses
 * for each page must be registered before the transmit or receive
 * queue is created, and cannot subsequently be modified.
 *
 * The Linux driver allocates pages as DMA_TO_DEVICE or
 * DMA_FROM_DEVICE as appropriate, and uses dma_sync_single_for_cpu()
 * etc to ensure that data is copied to/from bounce buffers as needed.
 *
 * Unfortunately there is no such sync operation available within our
 * DMA API, since we are constrained by the limitations imposed by
 * EFI_PCI_IO_PROTOCOL.  There is no way to synchronise a buffer
 * without also [un]mapping it, and no way to force the reuse of the
 * same device address for a subsequent remapping.  We are therefore
 * constrained to use only DMA-coherent buffers, since this is the
 * only way we can repeatedly reuse the same device address.
 *
 * Newer versions of the gVNIC device support "raw DMA addressing
 * (RDA)", which is essentially a prebuilt queue page list covering
 * the whole of the guest address space.  Unfortunately we cannot rely
 * on this, since older versions will not support it.
 *
 * Experimentation suggests that the device will accept a request to
 * create a queue page list covering the whole of the guest address
 * space via two giant "pages" of 2^63 bytes each.  However,
 * experimentation also suggests that the device will accept any old
 * garbage value as the "page size".  In the total absence of any
 * documentation, it is probably unsafe to conclude that the device is
 * bothering to look at or respect the "page size" parameter: it is
 * most likely just presuming the use of 4kB pages.
 */
struct gve_qpl {
/** Page addresses */
userptr_t data;
/** Page mapping */
struct dma_mapping map;
/** Number of pages */
unsigned int count;
/** Queue page list ID */
unsigned int id;
};
/**
 * Maximum number of transmit buffers
 *
 * This is a policy decision.
 */
#define GVE_TX_FILL 8
/** Transmit queue page list ID */
#define GVE_TX_QPL 0x18ae5458
/** Transmit queue interrupt channel */
#define GVE_TX_IRQ 0
/**
 * A transmit or receive buffer descriptor
 *
 * Embedded in both the transmit and the receive descriptors.
 */
struct gve_buffer {
/** Address (within queue page list address space) */
uint64_t addr;
} __attribute__ (( packed ));
/**
 * A transmit packet descriptor
 *
 * The first 8 bytes of a transmit descriptor.
 */
struct gve_tx_packet {
/** Type (one of the GVE_TX_TYPE_xxx values) */
uint8_t type;
/** Reserved */
uint8_t reserved_a[2];
/** Number of descriptors in this packet */
uint8_t count;
/** Total length of this packet */
uint16_t total;
/** Length of this descriptor */
uint16_t len;
} __attribute__ (( packed ));
/**
 * A transmit descriptor
 *
 * A packet descriptor followed by a buffer descriptor (16 bytes in
 * total).
 */
struct gve_tx_descriptor {
/** Packet descriptor */
struct gve_tx_packet pkt;
/** Buffer descriptor */
struct gve_buffer buf;
} __attribute__ (( packed ));
/** Start of packet transmit descriptor type */
#define GVE_TX_TYPE_START 0x00
/** Continuation of packet transmit descriptor type */
#define GVE_TX_TYPE_CONT 0x20
/**
 * Maximum number of receive buffers
 *
 * This is a policy decision.
 */
#define GVE_RX_FILL 16
/** Receive queue page list ID */
#define GVE_RX_QPL 0x18ae5258
/** Receive queue interrupt channel */
#define GVE_RX_IRQ 1
/**
 * A receive descriptor
 *
 * Consists solely of a buffer descriptor.
 */
struct gve_rx_descriptor {
/** Buffer descriptor */
struct gve_buffer buf;
} __attribute__ (( packed ));
/**
 * A receive packet descriptor
 *
 * The final four bytes of a receive completion descriptor.
 */
struct gve_rx_packet {
/** Length */
uint16_t len;
/** Flags (GVE_RXF_xxx) */
uint8_t flags;
/** Sequence number (see GVE_RX_SEQ_MASK) */
uint8_t seq;
} __attribute__ (( packed ));
/** Receive error */
#define GVE_RXF_ERROR 0x08
/** Receive packet continues into next descriptor */
#define GVE_RXF_MORE 0x20
/** Receive sequence number mask */
#define GVE_RX_SEQ_MASK 0x07
/**
 * A receive completion descriptor
 *
 * A 64-byte structure in which only the trailing packet descriptor
 * is used by this driver.
 */
struct gve_rx_completion {
/** Reserved */
uint8_t reserved[60];
/** Packet descriptor */
struct gve_rx_packet pkt;
} __attribute__ (( packed ));
/** Padding at the start of all received packets (in bytes) */
#define GVE_RX_PAD 2
/**
 * A descriptor queue
 *
 * Driver-side state shared by the transmit and receive queues.  The
 * differences between the two are captured by the queue type.
 */
struct gve_queue {
/** Descriptor ring */
userptr_t desc;
/** Completion ring */
userptr_t cmplt;
/** Queue resources */
struct gve_resources *res;
/** Queue type */
const struct gve_queue_type *type;
/** Number of descriptors (must be a power of two) */
unsigned int count;
/** Maximum fill level (must be a power of two) */
unsigned int fill;
/** Descriptor mapping */
struct dma_mapping desc_map;
/** Completion mapping */
struct dma_mapping cmplt_map;
/** Queue resources mapping */
struct dma_mapping res_map;
/** Doorbell register */
volatile uint32_t *db;
/** Event counter */
struct gve_event *event;
/** Producer counter */
uint32_t prod;
/** Consumer counter */
uint32_t cons;
/** Queue page list */
struct gve_qpl qpl;
};
/**
 * A descriptor queue type
 *
 * Static parameters distinguishing a transmit queue from a receive
 * queue.
 */
struct gve_queue_type {
/** Name */
const char *name;
/**
 * Populate command parameters to create queue
 *
 * @v queue		Descriptor queue
 * @v cmd		Admin queue command
 */
void ( * param ) ( struct gve_queue *queue,
union gve_admin_command *cmd );
/** Queue page list ID */
uint32_t qpl;
/** Interrupt channel */
uint8_t irq;
/** Maximum fill level */
uint8_t fill;
/** Descriptor size */
uint8_t desc_len;
/** Completion size */
uint8_t cmplt_len;
/** Command to create queue (admin queue opcode) */
uint8_t create;
/** Command to destroy queue (admin queue opcode) */
uint8_t destroy;
};
/**
 * A Google Virtual Ethernet NIC
 *
 * Per-device driver state.
 */
struct gve_nic {
/** Configuration registers */
void *cfg;
/** Doorbell registers */
void *db;
/** PCI revision */
uint8_t revision;
/** Network device */
struct net_device *netdev;
/** DMA device */
struct dma_device *dma;
/** Admin queue */
struct gve_admin admin;
/** Interrupt channels */
struct gve_irqs irqs;
/** Event counters */
struct gve_events events;
/** Scratch buffer */
struct gve_scratch scratch;
/** Transmit queue */
struct gve_queue tx;
/** Receive queue */
struct gve_queue rx;
/** Transmit I/O buffers (one slot per transmit buffer) */
struct io_buffer *tx_iobuf[GVE_TX_FILL];
/** Receive sequence number */
unsigned int seq;
/** Startup process */
struct process startup;
/** Startup process retry counter */
unsigned int retries;
/** Reset recovery watchdog timer */
struct retry_timer watchdog;
/** Reset recovery recorded activity counter */
uint32_t activity;
};
/** Maximum time to wait for admin queue commands (in milliseconds) */
#define GVE_ADMIN_MAX_WAIT_MS 500
/** Maximum number of times to reattempt device reset */
#define GVE_RESET_MAX_RETRY 5
/** Time between reset recovery checks (one second, in timer ticks) */
#define GVE_WATCHDOG_TIMEOUT ( 1 * TICKS_PER_SEC )
#endif /* _GVE_H */

View File

@ -224,6 +224,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#define ERRFILE_ecam ( ERRFILE_DRIVER | 0x00d30000 )
#define ERRFILE_pcibridge ( ERRFILE_DRIVER | 0x00d40000 )
#define ERRFILE_mnpnet ( ERRFILE_DRIVER | 0x00d50000 )
#define ERRFILE_gve ( ERRFILE_DRIVER | 0x00d60000 )
#define ERRFILE_aoe ( ERRFILE_NET | 0x00000000 )
#define ERRFILE_arp ( ERRFILE_NET | 0x00010000 )