/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
#define VFIO_API_VERSION 0
/* Kernel & User level defines for VFIO IOCTLs. */
#define VFIO_TYPE1_IOMMU 1
#define VFIO_SPAPR_TCE_IOMMU 2
#define VFIO_TYPE1v2_IOMMU 3
* IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping). This
* capability is subject to change as groups are added or removed.
#define VFIO_DMA_CC_IOMMU 4
/* Check if EEH is supported */
#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */
#define VFIO_SPAPR_TCE_v2_IOMMU 7
* The No-IOMMU IOMMU offers no translation or isolation for devices and
* supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU
* code will taint the host kernel and should be used with extreme caution.
#define VFIO_NOIOMMU_IOMMU 8
* The IOCTL interface is designed for extensibility by embedding the
* structure length (argsz) and flags into structures passed between
* kernel and userspace. We therefore use the _IO() macro for these
* defines to avoid implicitly embedding a size into the ioctl request.
* As structure fields are added, argsz will increase to match and flag
* bits will be defined to indicate additional fields with valid data.
* It's *always* the caller's responsibility to indicate the size of
* the structure passed by setting argsz appropriately.
* For extension of INFO ioctls, VFIO makes use of a capability chain
* designed after PCI/e capabilities. A flag bit indicates whether
* this capability chain is supported and a field defined in the fixed
* structure defines the offset of the first capability in the chain.
* This field is only valid when the corresponding bit in the flags
* bitmap is set. This offset field is relative to the start of the
* INFO buffer, as is the next field within each capability header.
* The id within the header is a shared address space per INFO ioctl,
* while the version field is specific to the capability id. The
* contents following the header are specific to the capability id.
struct vfio_info_cap_header {
__u16 id; /* Identifies capability */
__u16 version; /* Version specific to the capability ID */
__u32 next; /* Offset of next capability */
* Callers of INFO ioctls passing insufficiently sized buffers will see
* the capability chain flag bit set, a zero value for the first capability
* offset (if available within the provided argsz), and argsz will be
* updated to report the necessary buffer size. For compatibility, the
* INFO ioctl will not report error in this case, but the capability chain
/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
* VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
* Report the version of the VFIO API. This allows us to bump the entire
* API version should we later need to add or change features in incompatible
* Return: VFIO_API_VERSION
#define VFIO_GET_API_VERSION _IO(VFIO_TYPE, VFIO_BASE + 0)
* VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
* Check whether an extension is supported.
* Return: 0 if not supported, 1 (or some other positive integer) if supported.
#define VFIO_CHECK_EXTENSION _IO(VFIO_TYPE, VFIO_BASE + 1)
* VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
* Set the iommu to the given type. The type must be supported by an
* iommu driver as verified by calling CHECK_EXTENSION using the same
* type. A group must be set to this file descriptor before this
* ioctl is available. The IOMMU interfaces enabled by this call are
* specific to the value set.
* Return: 0 on success, -errno on failure
* Availability: When VFIO group attached
#define VFIO_SET_IOMMU _IO(VFIO_TYPE, VFIO_BASE + 2)
/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
* VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
* struct vfio_group_status)
* Retrieve information about the group. Fills in provided
* struct vfio_group_info. Caller sets argsz.
* Return: 0 on succes, -errno on failure.
struct vfio_group_status {
#define VFIO_GROUP_FLAGS_VIABLE (1 << 0)
#define VFIO_GROUP_FLAGS_CONTAINER_SET (1 << 1)
#define VFIO_GROUP_GET_STATUS _IO(VFIO_TYPE, VFIO_BASE + 3)
* VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
* Set the container for the VFIO group to the open VFIO file
* descriptor provided. Groups may only belong to a single
* container. Containers may, at their discretion, support multiple
* groups. Only when a container is set are all of the interfaces
* of the VFIO file descriptor and the VFIO group file descriptor
* Return: 0 on success, -errno on failure.
#define VFIO_GROUP_SET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 4)
* VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
* Remove the group from the attached container. This is the
* opposite of the SET_CONTAINER call and returns the group to
* an initial state. All device file descriptors must be released
* prior to calling this interface. When removing the last group
* from a container, the IOMMU will be disabled and all state lost,
* effectively also returning the VFIO file descriptor to an initial
* Return: 0 on success, -errno on failure.
* Availability: When attached to container
#define VFIO_GROUP_UNSET_CONTAINER _IO(VFIO_TYPE, VFIO_BASE + 5)
* VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
* Return a new file descriptor for the device object described by
* the provided string. The string should match a device listed in
* the devices subdirectory of the IOMMU group sysfs entry. The
* group containing the device must already be added to this context.
* Return: new file descriptor on success, -errno on failure.
* Availability: When attached to container
#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6)
/* --------------- IOCTLs for DEVICE file descriptors --------------- */
* VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
* struct vfio_device_info)
* Retrieve information about the device. Fills in provided
* struct vfio_device_info. Caller sets argsz.
* Return: 0 on success, -errno on failure.
struct vfio_device_info {
#define VFIO_DEVICE_FLAGS_RESET (1 << 0) /* Device supports reset */
#define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
#define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
#define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */
__u32 num_regions; /* Max region index + 1 */
__u32 num_irqs; /* Max IRQ index + 1 */
__u32 cap_offset; /* Offset within info struct of first cap */
#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
* Vendor driver using Mediated device framework should provide device_api
* attribute in supported type attribute groups. Device API string should be one
* of the following corresponding to device flags in vfio_device_info structure.
#define VFIO_DEVICE_API_PCI_STRING "vfio-pci"
#define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform"
#define VFIO_DEVICE_API_AMBA_STRING "vfio-amba"
#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
#define VFIO_DEVICE_API_AP_STRING "vfio-ap"
* The following capabilities are unique to s390 zPCI devices. Their contents
* are further-defined in vfio_zdev.h
#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1
#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2
#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3
#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4
* VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
* struct vfio_region_info)
* Retrieve information about a device region. Caller provides
* struct vfio_region_info with index value set. Caller sets argsz.
* Implementation of region mapping is bus driver specific. This is
* intended to describe MMIO, I/O port, as well as bus specific
* regions (ex. PCI config space). Zero sized regions may be used
* to describe unimplemented regions (ex. unimplemented PCI BARs).
* Return: 0 on success, -errno on failure.
struct vfio_region_info {
#define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */
#define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */
#define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */
#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */
__u32 index; /* Region index */
__u32 cap_offset; /* Offset within info struct of first cap */
__u64 size; /* Region size (bytes) */
__u64 offset; /* Region offset from start of device fd */
#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8)
* The sparse mmap capability allows finer granularity of specifying areas
* within a region with mmap support. When specified, the user should only
* mmap the offset ranges specified by the areas array. mmaps outside of the
* areas specified may fail (such as the range covering a PCI MSI-X table) or
* may result in improper device behavior.
* The structures below define version 1 of this capability.
#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1
struct vfio_region_sparse_mmap_area {
__u64 offset; /* Offset of mmap'able area within region */
__u64 size; /* Size of mmap'able area */
struct vfio_region_info_cap_sparse_mmap {
struct vfio_info_cap_header header;
struct vfio_region_sparse_mmap_area areas[];
* The device specific type capability allows regions unique to a specific
* device or class of devices to be exposed. This helps solve the problem for
* vfio bus drivers of defining which region indexes correspond to which region
* on the device, without needing to resort to static indexes, as done by
* vfio-pci. For instance, if we were to go back in time, we might remove
* VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes
* greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd
* make a "VGA" device specific type to describe the VGA access space. This
* means that non-VGA devices wouldn't need to waste this index, and thus the
* address space associated with it due to implementation of device file
* descriptor offsets in vfio-pci.
* The current implementation is now part of the user ABI, so we can't use this
* for VGA, but there are other upcoming use cases, such as opregions for Intel
* IGD devices and framebuffers for vGPU devices. We missed VGA, but we'll
* use this for future additions.
* The structure below defines version 1 of this capability.
#define VFIO_REGION_INFO_CAP_TYPE 2
struct vfio_region_info_cap_type {
struct vfio_info_cap_header header;
__u32 type; /* global per bus driver */
__u32 subtype; /* type specific */
* List of region types, global per bus driver.
* If you introduce a new type, please add it here.
/* PCI region type containing a PCI vendor part */
#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE (1 << 31)
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
#define VFIO_REGION_TYPE_GFX (1)
#define VFIO_REGION_TYPE_CCW (2)
#define VFIO_REGION_TYPE_MIGRATION (3)
/* sub-types for VFIO_REGION_TYPE_PCI_* */
/* 8086 vendor PCI sub-types */
#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3)
/* 10de vendor PCI sub-types */
* NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1)
/* 1014 vendor PCI sub-types */
* IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
* to do TLB invalidation on a GPU.
#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1)
/* sub-types for VFIO_REGION_TYPE_GFX */
#define VFIO_REGION_SUBTYPE_GFX_EDID (1)
* struct vfio_region_gfx_edid - EDID region layout.
* Set display link state and EDID blob.
* The EDID blob has monitor information such as brand, name, serial
* number, physical size, supported video modes and more.
* This special region allows userspace (typically qemu) set a virtual
* EDID for the virtual monitor, which allows a flexible display
* For the edid blob spec look here:
* https://en.wikipedia.org/wiki/Extended_Display_Identification_Data
* On linux systems you can find the EDID blob in sysfs:
* /sys/class/drm/${card}/${connector}/edid
* You can use the edid-decode ulility (comes with xorg-x11-utils) to
* @edid_offset: location of the edid blob, relative to the
* start of the region (readonly).
* @edid_max_size: max size of the edid blob (readonly).
* @edid_size: actual edid size (read/write).
* @link_state: display link state (read/write).
* VFIO_DEVICE_GFX_LINK_STATE_UP: Monitor is turned on.
* VFIO_DEVICE_GFX_LINK_STATE_DOWN: Monitor is turned off.
* @max_xres: max display width (0 == no limitation, readonly).
* @max_yres: max display height (0 == no limitation, readonly).
* (1) set link-state to down.
* (2) update edid blob and size.
* (3) set link-state to up.
struct vfio_region_gfx_edid {
#define VFIO_DEVICE_GFX_LINK_STATE_UP 1
#define VFIO_DEVICE_GFX_LINK_STATE_DOWN 2
/* sub-types for VFIO_REGION_TYPE_CCW */
#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1)
#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2)
#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
/* sub-types for VFIO_REGION_TYPE_MIGRATION */
#define VFIO_REGION_SUBTYPE_MIGRATION (1)
* The structure vfio_device_migration_info is placed at the 0th offset of
* the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
* migration information. Field accesses from this structure are only supported
* at their native width and alignment. Otherwise, the result is undefined and
* vendor drivers should return an error.
* device_state: (read/write)
* - The user application writes to this field to inform the vendor driver
* about the device state to be transitioned to.
* - The vendor driver should take the necessary actions to change the
* device state. After successful transition to a given state, the
* vendor driver should return success on write(device_state, state)
* system call. If the device state transition fails, the vendor driver
* should return an appropriate -errno for the fault condition.
* - On the user application side, if the device state transition fails,
* that is, if write(device_state, state) returns an error, read
* device_state again to determine the current state of the device from
* - The vendor driver should return previous state of the device unless
* the vendor driver has encountered an internal error, in which case
* the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
* - The user application must use the device reset ioctl to recover the
* device from VFIO_DEVICE_STATE_ERROR state. If the device is
* indicated to be in a valid device state by reading device_state, the
* user application may attempt to transition the device to any valid
* state reachable from the current state or terminate itself.
* device_state consists of 3 bits:
* - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
* it indicates the _STOP state. When the device state is changed to
* _STOP, driver should stop the device before write() returns.
* - If bit 1 is set, it indicates the _SAVING state, which means that the
* driver should start gathering device state information that will be
* provided to the VFIO user application to save the device's state.
* - If bit 2 is set, it indicates the _RESUMING state, which means that
* the driver should prepare to resume the device. Data provided through
* the migration region should be used to resume the device.
* Bits 3 - 31 are reserved for future use. To preserve them, the user
* application should perform a read-modify-write operation on this
* field when modifying the specified bits.
* 000b => Device Stopped, not saving or resuming
* 001b => Device running, which is the default state
* 010b => Stop the device & save the device state, stop-and-copy state
* 011b => Device running and save the device state, pre-copy state
* 100b => Device stopped and the device state is resuming
* _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
* (100b) (001b) (011b) (010b) (000b)
* 0. Running or default state
* 1. Normal Shutdown (optional)
* |------------------------------------->|
* 2. Save the state or suspend
* |------------------------->|---------->|
* 3. Save the state during live migration
* |----------->|------------>|---------->|
* 0. Default state of VFIO device is _RUNNING when the user application starts.
* 1. During normal shutdown of the user application, the user application may
* optionally change the VFIO device state from _RUNNING to _STOP. This
* transition is optional. The vendor driver must support this transition but
* 2. When the user application saves state or suspends the application, the
* device state transitions from _RUNNING to stop-and-copy and then to _STOP.
* On state transition from _RUNNING to stop-and-copy, driver must stop the
* device, save the device state and send it to the application through the
* migration region. The sequence to be followed for such transition is given
* 3. In live migration of user application, the state transitions from _RUNNING
* to pre-copy, to stop-and-copy, and to _STOP.
* On state transition from _RUNNING to pre-copy, the driver should start
* gathering the device state while the application is still running and send
* the device state data to application through the migration region.
* On state transition from pre-copy to stop-and-copy, the driver must stop
* the device, save the device state and send it to the user application
* through the migration region.
* Vendor drivers must support the pre-copy state even for implementations
* where no data is provided to the user before the stop-and-copy state. The
* user must not be required to consume all migration data before the device
* transitions to a new state, including the stop-and-copy state.
* The sequence to be followed for above two transitions is given below.
* 4. To start the resuming phase, the device state should be transitioned from