mirror of
https://github.com/edk2-porting/linux-next.git
synced 2024-12-12 07:14:03 +08:00
irqchip/genirq updates for 5.20:
* Core code update: - Non-SMP IRQ affinity fixes, allowing UP kernel to behave similarly to SMP ones for the purpose of interrupt affinity - Let irq_set_chip_handler_name_locked() take a const struct irq_chip * - Tidy-up the NOMAP irqdomain API variant - Teach action_show() to use for_each_action_of_desc() - Make irq_chip_request_resources_parent() allow the parent callback to be optional - Remove dynamic allocations from populate_parent_alloc_arg() * New drivers: - Merge the long awaited IRQ support for the LoongArch architecture, with the provisional ACPICA update (to be reverted once the official support lands) - New Renesas RZ/G2L IRQC driver, equipped with its companion GPIO driver * Driver updates - Optimise the hot path operations for the SiFive PLIC, trading the locking for per-CPU priority masking masking operations which are apparently faster - Work around broken PLIC implementations that deal pretty badly with edge-triggered interrupts. Flag two implementations as affected. - Simplify the irq-stm32-exti driver, particularly the table that remaps the interrupts from exti to the GIC, reducing the memory usage - Convert the ocelot irq_chip to being immutable - Check ioremap() return value in the MIPS GIC driver - Move MMP driver init function declarations into the common .h - The obligatory typo fixes -----BEGIN PGP SIGNATURE----- iQJDBAABCgAtFiEEn9UcU+C1Yxj9lZw9I9DQutE9ekMFAmLhi0EPHG1hekBrZXJu ZWwub3JnAAoJECPQ0LrRPXpDI+wP/2BPABqCwu7JAmue8hHtpGweVkEBNulaS1K+ 1v/ElU8E1P8ppn1AVmu0lwgmAWiTtPuVWT21+AUbfOjQQ/MYKkegkH4KLmK93qSi Dn3MEiYv8WYsEV4yrJ7Il7fuuzr1iHIhIfhg3tMxDAJx47lzZH0o8nVoNFqXD1Ro WUFab/qTAOxJ/I53R9nrpx/yRf5dVRFUAZznrabYKpc/CiD/X+RLcHkbiybbRERu n3xwEtGMA2faCUgifKhsXoNUaW9mZLaufoF/F3J3Dyt7jNB/TTmdnxqWo6e4/rtd +Ut0MlH0W7bUdHGiVl1E90hDQ3yuBykUpKlTfMoYWOxeTqAx0bPYjGIuh6ajrAIy +fXWcK89KimOGB+aLC0cR5YrzvShHnH1G2qlrQg3pAXporigAXfZvzhnouRBsVKO RfnQHNsHSQHXTEu1u2HjMt7AXtmy/SoJENuPPUrtLfojg8b3aupwOvPLVx7w1Ok0 
5TKZ2yhOHdskfr3lCPisVPKK0KZ+QZhDdBkd319JkxR5/iA/tzMeMTzKslruhd2U Ug6hYhKNE2kKkBBBMcEVCHAuuq94DnU/q6l458UTSkkBmvq5cMMSz5Fs0kMwGFRc q/DncpKQnrPKGrwiilj1uGgOWO2vec8KfMJUYtKzSM/QELbKUvF7yzZeIjUQxiDO KqlWNczX =E5fZ -----END PGP SIGNATURE----- Merge tag 'irqchip-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms into irq/core Pull irqchip/genirq updates from Marc Zyngier: * Core code update: - Non-SMP IRQ affinity fixes, allowing UP kernel to behave similarly to SMP ones for the purpose of interrupt affinity - Let irq_set_chip_handler_name_locked() take a const struct irq_chip * - Tidy-up the NOMAP irqdomain API variant - Teach action_show() to use for_each_action_of_desc() - Make irq_chip_request_resources_parent() allow the parent callback to be optional - Remove dynamic allocations from populate_parent_alloc_arg() * New drivers: - Merge the long awaited IRQ support for the LoongArch architecture, with the provisional ACPICA update (to be reverted once the official support lands) - New Renesas RZ/G2L IRQC driver, equipped with its companion GPIO driver * Driver updates - Optimise the hot path operations for the SiFive PLIC, trading the locking for per-CPU priority masking masking operations which are apparently faster - Work around broken PLIC implementations that deal pretty badly with edge-triggered interrupts. Flag two implementations as affected. - Simplify the irq-stm32-exti driver, particularly the table that remaps the interrupts from exti to the GIC, reducing the memory usage - Convert the ocelot irq_chip to being immutable - Check ioremap() return value in the MIPS GIC driver - Move MMP driver init function declarations into the common .h - The obligatory typo fixes Link: https://lore.kernel.org/all/20220727192356.1860546-1-maz@kernel.org
This commit is contained in:
commit
779fda86bd
@ -526,6 +526,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
||||
/sys/devices/system/cpu/vulnerabilities/srbds
|
||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
|
@ -17,3 +17,4 @@ are configurable at compile, boot or run time.
|
||||
special-register-buffer-data-sampling.rst
|
||||
core-scheduling.rst
|
||||
l1d_flush.rst
|
||||
processor_mmio_stale_data.rst
|
||||
|
246
Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
Normal file
246
Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
Normal file
@ -0,0 +1,246 @@
|
||||
=========================================
|
||||
Processor MMIO Stale Data Vulnerabilities
|
||||
=========================================
|
||||
|
||||
Processor MMIO Stale Data Vulnerabilities are a class of memory-mapped I/O
|
||||
(MMIO) vulnerabilities that can expose data. The sequences of operations for
|
||||
exposing data range from simple to very complex. Because most of the
|
||||
vulnerabilities require the attacker to have access to MMIO, many environments
|
||||
are not affected. System environments using virtualization where MMIO access is
|
||||
provided to untrusted guests may need mitigation. These vulnerabilities are
|
||||
not transient execution attacks. However, these vulnerabilities may propagate
|
||||
stale data into core fill buffers where the data can subsequently be inferred
|
||||
by an unmitigated transient execution attack. Mitigation for these
|
||||
vulnerabilities includes a combination of microcode update and software
|
||||
changes, depending on the platform and usage model. Some of these mitigations
|
||||
are similar to those used to mitigate Microarchitectural Data Sampling (MDS) or
|
||||
those used to mitigate Special Register Buffer Data Sampling (SRBDS).
|
||||
|
||||
Data Propagators
|
||||
================
|
||||
Propagators are operations that result in stale data being copied or moved from
|
||||
one microarchitectural buffer or register to another. Processor MMIO Stale Data
|
||||
Vulnerabilities are operations that may result in stale data being directly
|
||||
read into an architectural, software-visible state or sampled from a buffer or
|
||||
register.
|
||||
|
||||
Fill Buffer Stale Data Propagator (FBSDP)
|
||||
-----------------------------------------
|
||||
Stale data may propagate from fill buffers (FB) into the non-coherent portion
|
||||
of the uncore on some non-coherent writes. Fill buffer propagation by itself
|
||||
does not make stale data architecturally visible. Stale data must be propagated
|
||||
to a location where it is subject to reading or sampling.
|
||||
|
||||
Sideband Stale Data Propagator (SSDP)
|
||||
-------------------------------------
|
||||
The sideband stale data propagator (SSDP) is limited to the client (including
|
||||
Intel Xeon server E3) uncore implementation. The sideband response buffer is
|
||||
shared by all client cores. For non-coherent reads that go to sideband
|
||||
destinations, the uncore logic returns 64 bytes of data to the core, including
|
||||
both requested data and unrequested stale data, from a transaction buffer and
|
||||
the sideband response buffer. As a result, stale data from the sideband
|
||||
response and transaction buffers may now reside in a core fill buffer.
|
||||
|
||||
Primary Stale Data Propagator (PSDP)
|
||||
------------------------------------
|
||||
The primary stale data propagator (PSDP) is limited to the client (including
|
||||
Intel Xeon server E3) uncore implementation. Similar to the sideband response
|
||||
buffer, the primary response buffer is shared by all client cores. For some
|
||||
processors, MMIO primary reads will return 64 bytes of data to the core fill
|
||||
buffer including both requested data and unrequested stale data. This is
|
||||
similar to the sideband stale data propagator.
|
||||
|
||||
Vulnerabilities
|
||||
===============
|
||||
Device Register Partial Write (DRPW) (CVE-2022-21166)
|
||||
-----------------------------------------------------
|
||||
Some endpoint MMIO registers incorrectly handle writes that are smaller than
|
||||
the register size. Instead of aborting the write or only copying the correct
|
||||
subset of bytes (for example, 2 bytes for a 2-byte write), more bytes than
|
||||
specified by the write transaction may be written to the register. On
|
||||
processors affected by FBSDP, this may expose stale data from the fill buffers
|
||||
of the core that created the write transaction.
|
||||
|
||||
Shared Buffers Data Sampling (SBDS) (CVE-2022-21125)
|
||||
----------------------------------------------------
|
||||
After propagators may have moved data around the uncore and copied stale data
|
||||
into client core fill buffers, processors affected by MFBDS can leak data from
|
||||
the fill buffer. It is limited to the client (including Intel Xeon server E3)
|
||||
uncore implementation.
|
||||
|
||||
Shared Buffers Data Read (SBDR) (CVE-2022-21123)
|
||||
------------------------------------------------
|
||||
It is similar to Shared Buffer Data Sampling (SBDS) except that the data is
|
||||
directly read into the architectural software-visible state. It is limited to
|
||||
the client (including Intel Xeon server E3) uncore implementation.
|
||||
|
||||
Affected Processors
|
||||
===================
|
||||
Not all the CPUs are affected by all the variants. For instance, most
|
||||
processors for the server market (excluding Intel Xeon E3 processors) are
|
||||
impacted by only Device Register Partial Write (DRPW).
|
||||
|
||||
Below is the list of affected Intel processors [#f1]_:
|
||||
|
||||
=================== ============ =========
|
||||
Common name Family_Model Steppings
|
||||
=================== ============ =========
|
||||
HASWELL_X 06_3FH 2,4
|
||||
SKYLAKE_L 06_4EH 3
|
||||
BROADWELL_X 06_4FH All
|
||||
SKYLAKE_X 06_55H 3,4,6,7,11
|
||||
BROADWELL_D 06_56H 3,4,5
|
||||
SKYLAKE 06_5EH 3
|
||||
ICELAKE_X 06_6AH 4,5,6
|
||||
ICELAKE_D 06_6CH 1
|
||||
ICELAKE_L 06_7EH 5
|
||||
ATOM_TREMONT_D 06_86H All
|
||||
LAKEFIELD 06_8AH 1
|
||||
KABYLAKE_L 06_8EH 9 to 12
|
||||
ATOM_TREMONT 06_96H 1
|
||||
ATOM_TREMONT_L 06_9CH 0
|
||||
KABYLAKE 06_9EH 9 to 13
|
||||
COMETLAKE 06_A5H 2,3,5
|
||||
COMETLAKE_L 06_A6H 0,1
|
||||
ROCKETLAKE 06_A7H 1
|
||||
=================== ============ =========
|
||||
|
||||
If a CPU is in the affected processor list, but not affected by a variant, it
|
||||
is indicated by new bits in MSR IA32_ARCH_CAPABILITIES. As described in a later
|
||||
section, mitigation largely remains the same for all the variants, i.e. to
|
||||
clear the CPU fill buffers via VERW instruction.
|
||||
|
||||
New bits in MSRs
|
||||
================
|
||||
Newer processors and microcode update on existing affected processors added new
|
||||
bits to IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate
|
||||
specific variants of Processor MMIO Stale Data vulnerabilities and mitigation
|
||||
capability.
|
||||
|
||||
MSR IA32_ARCH_CAPABILITIES
|
||||
--------------------------
|
||||
Bit 13 - SBDR_SSDP_NO - When set, processor is not affected by either the
|
||||
Shared Buffers Data Read (SBDR) vulnerability or the sideband stale
|
||||
data propagator (SSDP).
|
||||
Bit 14 - FBSDP_NO - When set, processor is not affected by the Fill Buffer
|
||||
Stale Data Propagator (FBSDP).
|
||||
Bit 15 - PSDP_NO - When set, processor is not affected by Primary Stale Data
|
||||
Propagator (PSDP).
|
||||
Bit 17 - FB_CLEAR - When set, VERW instruction will overwrite CPU fill buffer
|
||||
values as part of MD_CLEAR operations. Processors that do not
|
||||
enumerate MDS_NO (meaning they are affected by MDS) but that do
|
||||
enumerate support for both L1D_FLUSH and MD_CLEAR implicitly enumerate
|
||||
FB_CLEAR as part of their MD_CLEAR support.
|
||||
Bit 18 - FB_CLEAR_CTRL - Processor supports read and write to MSR
|
||||
IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]. On such processors, the FB_CLEAR_DIS
|
||||
bit can be set to cause the VERW instruction to not perform the
|
||||
FB_CLEAR action. Not all processors that support FB_CLEAR will support
|
||||
FB_CLEAR_CTRL.
|
||||
|
||||
MSR IA32_MCU_OPT_CTRL
|
||||
---------------------
|
||||
Bit 3 - FB_CLEAR_DIS - When set, VERW instruction does not perform the FB_CLEAR
|
||||
action. This may be useful to reduce the performance impact of FB_CLEAR in
|
||||
cases where system software deems it warranted (for example, when performance
|
||||
is more critical, or the untrusted software has no MMIO access). Note that
|
||||
FB_CLEAR_DIS has no impact on enumeration (for example, it does not change
|
||||
FB_CLEAR or MD_CLEAR enumeration) and it may not be supported on all processors
|
||||
that enumerate FB_CLEAR.
|
||||
|
||||
Mitigation
|
||||
==========
|
||||
Like MDS, all variants of Processor MMIO Stale Data vulnerabilities have the
|
||||
same mitigation strategy to force the CPU to clear the affected buffers before
|
||||
an attacker can extract the secrets.
|
||||
|
||||
This is achieved by using the otherwise unused and obsolete VERW instruction in
|
||||
combination with a microcode update. The microcode clears the affected CPU
|
||||
buffers when the VERW instruction is executed.
|
||||
|
||||
The kernel reuses the MDS function to invoke the buffer clearing:
|
||||
|
||||
mds_clear_cpu_buffers()
|
||||
|
||||
On MDS affected CPUs, the kernel already invokes CPU buffer clear on
|
||||
kernel/userspace, hypervisor/guest and C-state (idle) transitions. No
|
||||
additional mitigation is needed on such CPUs.
|
||||
|
||||
For CPUs not affected by MDS or TAA, mitigation is needed only for the attacker
|
||||
with MMIO capability. Therefore, VERW is not required for kernel/userspace. For
|
||||
the virtualization case, VERW is only needed at VMENTER for a guest with MMIO
|
||||
capability.
|
||||
|
||||
Mitigation points
|
||||
-----------------
|
||||
Return to user space
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
Same mitigation as MDS when affected by MDS/TAA, otherwise no mitigation
|
||||
needed.
|
||||
|
||||
C-State transition
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
Control register writes by CPU during C-state transition can propagate data
|
||||
from fill buffer to uncore buffers. Execute VERW before C-state transition to
|
||||
clear CPU fill buffers.
|
||||
|
||||
Guest entry point
|
||||
^^^^^^^^^^^^^^^^^
|
||||
Same mitigation as MDS when processor is also affected by MDS/TAA, otherwise
|
||||
execute VERW at VMENTER only for MMIO capable guests. On CPUs not affected by
|
||||
MDS/TAA, guest without MMIO access cannot extract secrets using Processor MMIO
|
||||
Stale Data vulnerabilities, so there is no need to execute VERW for such guests.
|
||||
|
||||
Mitigation control on the kernel command line
|
||||
---------------------------------------------
|
||||
The kernel command line allows controlling the Processor MMIO Stale Data
|
||||
mitigations at boot time with the option "mmio_stale_data=". The valid
|
||||
arguments for this option are:
|
||||
|
||||
========== =================================================================
|
||||
full If the CPU is vulnerable, enable mitigation; CPU buffer clearing
|
||||
on exit to userspace and when entering a VM. Idle transitions are
|
||||
protected as well. It does not automatically disable SMT.
|
||||
full,nosmt Same as full, with SMT disabled on vulnerable CPUs. This is the
|
||||
complete mitigation.
|
||||
off Disables mitigation completely.
|
||||
========== =================================================================
|
||||
|
||||
If the CPU is affected and mmio_stale_data=off is not supplied on the kernel
|
||||
command line, then the kernel selects the appropriate mitigation.
|
||||
|
||||
Mitigation status information
|
||||
-----------------------------
|
||||
The Linux kernel provides a sysfs interface to enumerate the current
|
||||
vulnerability status of the system: whether the system is vulnerable, and
|
||||
which mitigations are active. The relevant sysfs file is:
|
||||
|
||||
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
||||
|
||||
The possible values in this file are:
|
||||
|
||||
.. list-table::
|
||||
|
||||
* - 'Not affected'
|
||||
- The processor is not vulnerable
|
||||
* - 'Vulnerable'
|
||||
- The processor is vulnerable, but no mitigation is enabled
|
||||
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||
- The processor is vulnerable, but microcode is not updated. The
|
||||
mitigation is enabled on a best effort basis.
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||
enabled.
|
||||
|
||||
If the processor is vulnerable then the following information is appended to
|
||||
the above information:
|
||||
|
||||
======================== ===========================================
|
||||
'SMT vulnerable' SMT is enabled
|
||||
'SMT disabled' SMT is disabled
|
||||
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
||||
======================== ===========================================
|
||||
|
||||
References
|
||||
----------
|
||||
.. [#f1] Affected Processors
|
||||
https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html
|
@ -2469,7 +2469,6 @@
|
||||
|
||||
protected: nVHE-based mode with support for guests whose
|
||||
state is kept private from the host.
|
||||
Not valid if the kernel is running in EL2.
|
||||
|
||||
Defaults to VHE/nVHE based on hardware support. Setting
|
||||
mode to "protected" will disable kexec and hibernation
|
||||
@ -3176,6 +3175,7 @@
|
||||
srbds=off [X86,INTEL]
|
||||
no_entry_flush [PPC]
|
||||
no_uaccess_flush [PPC]
|
||||
mmio_stale_data=off [X86]
|
||||
|
||||
Exceptions:
|
||||
This does not have any effect on
|
||||
@ -3197,6 +3197,7 @@
|
||||
Equivalent to: l1tf=flush,nosmt [X86]
|
||||
mds=full,nosmt [X86]
|
||||
tsx_async_abort=full,nosmt [X86]
|
||||
mmio_stale_data=full,nosmt [X86]
|
||||
|
||||
mminit_loglevel=
|
||||
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
||||
@ -3206,6 +3207,40 @@
|
||||
log everything. Information is printed at KERN_DEBUG
|
||||
so loglevel=8 may also need to be specified.
|
||||
|
||||
mmio_stale_data=
|
||||
[X86,INTEL] Control mitigation for the Processor
|
||||
MMIO Stale Data vulnerabilities.
|
||||
|
||||
Processor MMIO Stale Data is a class of
|
||||
vulnerabilities that may expose data after an MMIO
|
||||
operation. Exposed data could originate or end in
|
||||
the same CPU buffers as affected by MDS and TAA.
|
||||
Therefore, similar to MDS and TAA, the mitigation
|
||||
is to clear the affected CPU buffers.
|
||||
|
||||
This parameter controls the mitigation. The
|
||||
options are:
|
||||
|
||||
full - Enable mitigation on vulnerable CPUs
|
||||
|
||||
full,nosmt - Enable mitigation and disable SMT on
|
||||
vulnerable CPUs.
|
||||
|
||||
off - Unconditionally disable mitigation
|
||||
|
||||
On MDS or TAA affected machines,
|
||||
mmio_stale_data=off can be prevented by an active
|
||||
MDS or TAA mitigation as these vulnerabilities are
|
||||
mitigated with the same mechanism so in order to
|
||||
disable this mitigation, you need to specify
|
||||
mds=off and tsx_async_abort=off too.
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
mmio_stale_data=full.
|
||||
|
||||
For details see:
|
||||
Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
|
||||
|
||||
module.sig_enforce
|
||||
[KNL] When CONFIG_MODULE_SIG is set, this means that
|
||||
modules without (valid) signatures will fail to load.
|
||||
|
@ -40,9 +40,8 @@ properties:
|
||||
value to be used for converting remote channel measurements to
|
||||
temperature.
|
||||
$ref: /schemas/types.yaml#/definitions/int32
|
||||
items:
|
||||
minimum: -128
|
||||
maximum: 127
|
||||
minimum: -128
|
||||
maximum: 127
|
||||
|
||||
ti,beta-compensation:
|
||||
description:
|
||||
|
@ -0,0 +1,134 @@
|
||||
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
|
||||
%YAML 1.2
|
||||
---
|
||||
$id: http://devicetree.org/schemas/interrupt-controller/renesas,rzg2l-irqc.yaml#
|
||||
$schema: http://devicetree.org/meta-schemas/core.yaml#
|
||||
|
||||
title: Renesas RZ/G2L (and alike SoCs) Interrupt Controller (IA55)
|
||||
|
||||
maintainers:
|
||||
- Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
|
||||
- Geert Uytterhoeven <geert+renesas@glider.be>
|
||||
|
||||
description: |
|
||||
IA55 performs various interrupt controls including synchronization for the external
|
||||
interrupts of NMI, IRQ, and GPIOINT and the interrupts of the built-in peripheral
|
||||
interrupts output by each IP. And it notifies the interrupt to the GIC
|
||||
- IRQ sense select for 8 external interrupts, mapped to 8 GIC SPI interrupts
|
||||
- GPIO pins used as external interrupt input pins, mapped to 32 GIC SPI interrupts
|
||||
- NMI edge select (NMI is not treated as an NMI exception and supports falling edge and
|
||||
rising edge detection interrupts)
|
||||
|
||||
allOf:
|
||||
- $ref: /schemas/interrupt-controller.yaml#
|
||||
|
||||
properties:
|
||||
compatible:
|
||||
items:
|
||||
- enum:
|
||||
- renesas,r9a07g044-irqc # RZ/G2{L,LC}
|
||||
- renesas,r9a07g054-irqc # RZ/V2L
|
||||
- const: renesas,rzg2l-irqc
|
||||
|
||||
'#interrupt-cells':
|
||||
description: The first cell should contain external interrupt number (IRQ0-7) and the
|
||||
second cell is used to specify the flag.
|
||||
const: 2
|
||||
|
||||
'#address-cells':
|
||||
const: 0
|
||||
|
||||
interrupt-controller: true
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
||||
interrupts:
|
||||
maxItems: 41
|
||||
|
||||
clocks:
|
||||
maxItems: 2
|
||||
|
||||
clock-names:
|
||||
items:
|
||||
- const: clk
|
||||
- const: pclk
|
||||
|
||||
power-domains:
|
||||
maxItems: 1
|
||||
|
||||
resets:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- '#interrupt-cells'
|
||||
- '#address-cells'
|
||||
- interrupt-controller
|
||||
- reg
|
||||
- interrupts
|
||||
- clocks
|
||||
- clock-names
|
||||
- power-domains
|
||||
- resets
|
||||
|
||||
unevaluatedProperties: false
|
||||
|
||||
examples:
|
||||
- |
|
||||
#include <dt-bindings/interrupt-controller/arm-gic.h>
|
||||
#include <dt-bindings/clock/r9a07g044-cpg.h>
|
||||
|
||||
irqc: interrupt-controller@110a0000 {
|
||||
compatible = "renesas,r9a07g044-irqc", "renesas,rzg2l-irqc";
|
||||
reg = <0x110a0000 0x10000>;
|
||||
#interrupt-cells = <2>;
|
||||
#address-cells = <0>;
|
||||
interrupt-controller;
|
||||
interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 444 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 445 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 446 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 447 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 448 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 449 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 450 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 451 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 452 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 453 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 454 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 455 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 456 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 457 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 458 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 459 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 460 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 461 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 462 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 463 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 464 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 465 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 466 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 467 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 468 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 469 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 470 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 471 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 472 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 473 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 474 IRQ_TYPE_LEVEL_HIGH>,
|
||||
<GIC_SPI 475 IRQ_TYPE_LEVEL_HIGH>;
|
||||
clocks = <&cpg CPG_MOD R9A07G044_IA55_CLK>,
|
||||
<&cpg CPG_MOD R9A07G044_IA55_PCLK>;
|
||||
clock-names = "clk", "pclk";
|
||||
power-domains = <&cpg>;
|
||||
resets = <&cpg R9A07G044_IA55_RESETN>;
|
||||
};
|
@ -26,9 +26,14 @@ description:
|
||||
with priority below this threshold will not cause the PLIC to raise its
|
||||
interrupt line leading to the context.
|
||||
|
||||
While the PLIC supports both edge-triggered and level-triggered interrupts,
|
||||
interrupt handlers are oblivious to this distinction and therefore it is not
|
||||
specified in the PLIC device-tree binding.
|
||||
The PLIC supports both edge-triggered and level-triggered interrupts. For
|
||||
edge-triggered interrupts, the RISC-V PLIC spec allows two responses to edges
|
||||
seen while an interrupt handler is active; the PLIC may either queue them or
|
||||
ignore them. In the first case, handlers are oblivious to the trigger type, so
|
||||
it is not included in the interrupt specifier. In the second case, software
|
||||
needs to know the trigger type, so it can reorder the interrupt flow to avoid
|
||||
missing interrupts. This special handling is needed by at least the Renesas
|
||||
RZ/Five SoC (AX45MP AndesCore with a NCEPLIC100) and the T-HEAD C900 PLIC.
|
||||
|
||||
While the RISC-V ISA doesn't specify a memory layout for the PLIC, the
|
||||
"sifive,plic-1.0.0" device is a concrete implementation of the PLIC that
|
||||
@ -47,6 +52,10 @@ maintainers:
|
||||
properties:
|
||||
compatible:
|
||||
oneOf:
|
||||
- items:
|
||||
- enum:
|
||||
- renesas,r9a07g043-plic
|
||||
- const: andestech,nceplic100
|
||||
- items:
|
||||
- enum:
|
||||
- sifive,fu540-c000-plic
|
||||
@ -64,8 +73,7 @@ properties:
|
||||
'#address-cells':
|
||||
const: 0
|
||||
|
||||
'#interrupt-cells':
|
||||
const: 1
|
||||
'#interrupt-cells': true
|
||||
|
||||
interrupt-controller: true
|
||||
|
||||
@ -82,6 +90,12 @@ properties:
|
||||
description:
|
||||
Specifies how many external interrupts are supported by this controller.
|
||||
|
||||
clocks: true
|
||||
|
||||
power-domains: true
|
||||
|
||||
resets: true
|
||||
|
||||
required:
|
||||
- compatible
|
||||
- '#address-cells'
|
||||
@ -91,6 +105,47 @@ required:
|
||||
- interrupts-extended
|
||||
- riscv,ndev
|
||||
|
||||
allOf:
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
enum:
|
||||
- andestech,nceplic100
|
||||
- thead,c900-plic
|
||||
|
||||
then:
|
||||
properties:
|
||||
'#interrupt-cells':
|
||||
const: 2
|
||||
|
||||
else:
|
||||
properties:
|
||||
'#interrupt-cells':
|
||||
const: 1
|
||||
|
||||
- if:
|
||||
properties:
|
||||
compatible:
|
||||
contains:
|
||||
const: renesas,r9a07g043-plic
|
||||
|
||||
then:
|
||||
properties:
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
power-domains:
|
||||
maxItems: 1
|
||||
|
||||
resets:
|
||||
maxItems: 1
|
||||
|
||||
required:
|
||||
- clocks
|
||||
- power-domains
|
||||
- resets
|
||||
|
||||
additionalProperties: false
|
||||
|
||||
examples:
|
||||
|
@ -30,6 +30,7 @@ properties:
|
||||
- socionext,uniphier-ld11-aidet
|
||||
- socionext,uniphier-ld20-aidet
|
||||
- socionext,uniphier-pxs3-aidet
|
||||
- socionext,uniphier-nx1-aidet
|
||||
|
||||
reg:
|
||||
maxItems: 1
|
||||
|
@ -47,6 +47,17 @@ properties:
|
||||
gpio-ranges:
|
||||
maxItems: 1
|
||||
|
||||
interrupt-controller: true
|
||||
|
||||
'#interrupt-cells':
|
||||
const: 2
|
||||
description:
|
||||
The first cell contains the global GPIO port index, constructed using the
|
||||
RZG2L_GPIO() helper macro in <dt-bindings/pinctrl/rzg2l-pinctrl.h> and the
|
||||
second cell is used to specify the flag.
|
||||
E.g. "interrupts = <RZG2L_GPIO(43, 0) IRQ_TYPE_EDGE_FALLING>;" if P43_0 is
|
||||
being used as an interrupt.
|
||||
|
||||
clocks:
|
||||
maxItems: 1
|
||||
|
||||
@ -110,6 +121,8 @@ required:
|
||||
- gpio-controller
|
||||
- '#gpio-cells'
|
||||
- gpio-ranges
|
||||
- interrupt-controller
|
||||
- '#interrupt-cells'
|
||||
- clocks
|
||||
- power-domains
|
||||
- resets
|
||||
@ -126,6 +139,8 @@ examples:
|
||||
gpio-controller;
|
||||
#gpio-cells = <2>;
|
||||
gpio-ranges = <&pinctrl 0 0 392>;
|
||||
interrupt-controller;
|
||||
#interrupt-cells = <2>;
|
||||
clocks = <&cpg CPG_MOD R9A07G044_GPIO_HCLK>;
|
||||
resets = <&cpg R9A07G044_GPIO_RSTN>,
|
||||
<&cpg R9A07G044_GPIO_PORT_RESETN>,
|
||||
|
@ -13,8 +13,8 @@ disappeared as of Linux 3.0.
|
||||
|
||||
There are two places where extended attributes can be found. The first
|
||||
place is between the end of each inode entry and the beginning of the
|
||||
next inode entry. For example, if inode.i\_extra\_isize = 28 and
|
||||
sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes
|
||||
next inode entry. For example, if inode.i_extra_isize = 28 and
|
||||
sb.inode_size = 256, then there are 256 - (128 + 28) = 100 bytes
|
||||
available for in-inode extended attribute storage. The second place
|
||||
where extended attributes can be found is in the block pointed to by
|
||||
``inode.i_file_acl``. As of Linux 3.11, it is not possible for this
|
||||
@ -38,8 +38,8 @@ Extended attributes, when stored after the inode, have a header
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_magic
|
||||
- __le32
|
||||
- h_magic
|
||||
- Magic number for identification, 0xEA020000. This value is set by the
|
||||
Linux driver, though e2fsprogs doesn't seem to check it(?)
|
||||
|
||||
@ -55,28 +55,28 @@ The beginning of an extended attribute block is in
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_magic
|
||||
- __le32
|
||||
- h_magic
|
||||
- Magic number for identification, 0xEA020000.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- h\_refcount
|
||||
- __le32
|
||||
- h_refcount
|
||||
- Reference count.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- h\_blocks
|
||||
- __le32
|
||||
- h_blocks
|
||||
- Number of disk blocks used.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- h\_hash
|
||||
- __le32
|
||||
- h_hash
|
||||
- Hash value of all attributes.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- h\_checksum
|
||||
- __le32
|
||||
- h_checksum
|
||||
- Checksum of the extended attribute block.
|
||||
* - 0x14
|
||||
- \_\_u32
|
||||
- h\_reserved[3]
|
||||
- __u32
|
||||
- h_reserved[3]
|
||||
- Zero.
|
||||
|
||||
The checksum is calculated against the FS UUID, the 64-bit block number
|
||||
@ -100,46 +100,46 @@ Attributes stored inside an inode do not need be stored in sorted order.
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_u8
|
||||
- e\_name\_len
|
||||
- __u8
|
||||
- e_name_len
|
||||
- Length of name.
|
||||
* - 0x1
|
||||
- \_\_u8
|
||||
- e\_name\_index
|
||||
- __u8
|
||||
- e_name_index
|
||||
- Attribute name index. There is a discussion of this below.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- e\_value\_offs
|
||||
- __le16
|
||||
- e_value_offs
|
||||
- Location of this attribute's value on the disk block where it is stored.
|
||||
Multiple attributes can share the same value. For an inode attribute
|
||||
this value is relative to the start of the first entry; for a block this
|
||||
value is relative to the start of the block (i.e. the header).
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- e\_value\_inum
|
||||
- __le32
|
||||
- e_value_inum
|
||||
- The inode where the value is stored. Zero indicates the value is in the
|
||||
same block as this entry. This field is only used if the
|
||||
INCOMPAT\_EA\_INODE feature is enabled.
|
||||
INCOMPAT_EA_INODE feature is enabled.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- e\_value\_size
|
||||
- __le32
|
||||
- e_value_size
|
||||
- Length of attribute value.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- e\_hash
|
||||
- __le32
|
||||
- e_hash
|
||||
- Hash value of attribute name and attribute value. The kernel doesn't
|
||||
update the hash for in-inode attributes, so for that case this value
|
||||
must be zero, because e2fsck validates any non-zero hash regardless of
|
||||
where the xattr lives.
|
||||
* - 0x10
|
||||
- char
|
||||
- e\_name[e\_name\_len]
|
||||
- e_name[e_name_len]
|
||||
- Attribute name. Does not include trailing NULL.
|
||||
|
||||
Attribute values can follow the end of the entry table. There appears to
|
||||
be a requirement that they be aligned to 4-byte boundaries. The values
|
||||
are stored starting at the end of the block and grow towards the
|
||||
xattr\_header/xattr\_entry table. When the two collide, the overflow is
|
||||
xattr_header/xattr_entry table. When the two collide, the overflow is
|
||||
put into a separate disk block. If the disk block fills up, the
|
||||
filesystem returns -ENOSPC.
|
||||
|
||||
@ -167,15 +167,15 @@ the key name. Here is a map of name index values to key prefixes:
|
||||
* - 1
|
||||
- “user.”
|
||||
* - 2
|
||||
- “system.posix\_acl\_access”
|
||||
- “system.posix_acl_access”
|
||||
* - 3
|
||||
- “system.posix\_acl\_default”
|
||||
- “system.posix_acl_default”
|
||||
* - 4
|
||||
- “trusted.”
|
||||
* - 6
|
||||
- “security.”
|
||||
* - 7
|
||||
- “system.” (inline\_data only?)
|
||||
- “system.” (inline_data only?)
|
||||
* - 8
|
||||
- “system.richacl” (SuSE kernels only?)
|
||||
|
||||
|
@ -23,7 +23,7 @@ means that a block group addresses 32 gigabytes instead of 128 megabytes,
|
||||
also shrinking the amount of file system overhead for metadata.
|
||||
|
||||
The administrator can set a block cluster size at mkfs time (which is
|
||||
stored in the s\_log\_cluster\_size field in the superblock); from then
|
||||
stored in the s_log_cluster_size field in the superblock); from then
|
||||
on, the block bitmaps track clusters, not individual blocks. This means
|
||||
that block groups can be several gigabytes in size (instead of just
|
||||
128MiB); however, the minimum allocation unit becomes a cluster, not a
|
||||
|
@ -9,15 +9,15 @@ group.
|
||||
The inode bitmap records which entries in the inode table are in use.
|
||||
|
||||
As with most bitmaps, one bit represents the usage status of one data
|
||||
block or inode table entry. This implies a block group size of 8 \*
|
||||
number\_of\_bytes\_in\_a\_logical\_block.
|
||||
block or inode table entry. This implies a block group size of 8 *
|
||||
number_of_bytes_in_a_logical_block.
|
||||
|
||||
NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts
|
||||
of the kernel and e2fsprogs code pretend that the block bitmap contains
|
||||
zeros (i.e. all blocks in the group are free). However, it is not
|
||||
necessarily the case that no blocks are in use -- if ``meta_bg`` is set,
|
||||
the bitmaps and group descriptor live inside the group. Unfortunately,
|
||||
ext2fs\_test\_block\_bitmap2() will return '0' for those locations,
|
||||
ext2fs_test_block_bitmap2() will return '0' for those locations,
|
||||
which produces confusing debugfs output.
|
||||
|
||||
Inode Table
|
||||
|
@ -56,39 +56,39 @@ established that the super block and the group descriptor table, if
|
||||
present, will be at the beginning of the block group. The bitmaps and
|
||||
the inode table can be anywhere, and it is quite possible for the
|
||||
bitmaps to come after the inode table, or for both to be in different
|
||||
groups (flex\_bg). Leftover space is used for file data blocks, indirect
|
||||
groups (flex_bg). Leftover space is used for file data blocks, indirect
|
||||
block maps, extent tree blocks, and extended attributes.
|
||||
|
||||
Flexible Block Groups
|
||||
---------------------
|
||||
|
||||
Starting in ext4, there is a new feature called flexible block groups
|
||||
(flex\_bg). In a flex\_bg, several block groups are tied together as one
|
||||
(flex_bg). In a flex_bg, several block groups are tied together as one
|
||||
logical block group; the bitmap spaces and the inode table space in the
|
||||
first block group of the flex\_bg are expanded to include the bitmaps
|
||||
and inode tables of all other block groups in the flex\_bg. For example,
|
||||
if the flex\_bg size is 4, then group 0 will contain (in order) the
|
||||
first block group of the flex_bg are expanded to include the bitmaps
|
||||
and inode tables of all other block groups in the flex_bg. For example,
|
||||
if the flex_bg size is 4, then group 0 will contain (in order) the
|
||||
superblock, group descriptors, data block bitmaps for groups 0-3, inode
|
||||
bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining
|
||||
space in group 0 is for file data. The effect of this is to group the
|
||||
block group metadata close together for faster loading, and to enable
|
||||
large files to be continuous on disk. Backup copies of the superblock
|
||||
and group descriptors are always at the beginning of block groups, even
|
||||
if flex\_bg is enabled. The number of block groups that make up a
|
||||
flex\_bg is given by 2 ^ ``sb.s_log_groups_per_flex``.
|
||||
if flex_bg is enabled. The number of block groups that make up a
|
||||
flex_bg is given by 2 ^ ``sb.s_log_groups_per_flex``.
|
||||
|
||||
Meta Block Groups
|
||||
-----------------
|
||||
|
||||
Without the option META\_BG, for safety concerns, all block group
|
||||
Without the option META_BG, for safety concerns, all block group
|
||||
descriptor copies are kept in the first block group. Given the default
|
||||
128MiB(2^27 bytes) block group size and 64-byte group descriptors, ext4
|
||||
can have at most 2^27/64 = 2^21 block groups. This limits the entire
|
||||
filesystem size to 2^21 * 2^27 = 2^48 bytes or 256TiB.
|
||||
|
||||
The solution to this problem is to use the metablock group feature
|
||||
(META\_BG), which is already in ext3 for all 2.6 releases. With the
|
||||
META\_BG feature, ext4 filesystems are partitioned into many metablock
|
||||
(META_BG), which is already in ext3 for all 2.6 releases. With the
|
||||
META_BG feature, ext4 filesystems are partitioned into many metablock
|
||||
groups. Each metablock group is a cluster of block groups whose group
|
||||
descriptor structures can be stored in a single disk block. For ext4
|
||||
filesystems with 4 KB block size, a single metablock group partition
|
||||
@ -110,7 +110,7 @@ bytes, a meta-block group contains 32 block groups for filesystems with
|
||||
a 1KB block size, and 128 block groups for filesystems with a 4KB
|
||||
blocksize. Filesystems can either be created using this new block group
|
||||
descriptor layout, or existing filesystems can be resized on-line, and
|
||||
the field s\_first\_meta\_bg in the superblock will indicate the first
|
||||
the field s_first_meta_bg in the superblock will indicate the first
|
||||
block group using this new layout.
|
||||
|
||||
Please see an important note about ``BLOCK_UNINIT`` in the section about
|
||||
@ -121,15 +121,15 @@ Lazy Block Group Initialization
|
||||
|
||||
A new feature for ext4 is a set of three block group descriptor flags that
|
||||
enable mkfs to skip initializing other parts of the block group
|
||||
metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean
|
||||
metadata. Specifically, the INODE_UNINIT and BLOCK_UNINIT flags mean
|
||||
that the inode and block bitmaps for that group can be calculated and
|
||||
therefore the on-disk bitmap blocks are not initialized. This is
|
||||
generally the case for an empty block group or a block group containing
|
||||
only fixed-location block group metadata. The INODE\_ZEROED flag means
|
||||
only fixed-location block group metadata. The INODE_ZEROED flag means
|
||||
that the inode table has been initialized; mkfs will unset this flag and
|
||||
rely on the kernel to initialize the inode tables in the background.
|
||||
|
||||
By not writing zeroes to the bitmaps and inode table, mkfs time is
|
||||
reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM,
|
||||
but the dumpe2fs output prints this as “uninit\_bg”. They are the same
|
||||
reduced considerably. Note the feature flag is RO_COMPAT_GDT_CSUM,
|
||||
but the dumpe2fs output prints this as “uninit_bg”. They are the same
|
||||
thing.
|
||||
|
@ -1,7 +1,7 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| i.i\_block Offset | Where It Points |
|
||||
| i.i_block Offset | Where It Points |
|
||||
+=====================+==============================================================================================================================================================================================================================+
|
||||
| 0 to 11 | Direct map to file blocks 0 to 11. |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
@ -4,7 +4,7 @@ Checksums
|
||||
---------
|
||||
|
||||
Starting in early 2012, metadata checksums were added to all major ext4
|
||||
and jbd2 data structures. The associated feature flag is metadata\_csum.
|
||||
and jbd2 data structures. The associated feature flag is metadata_csum.
|
||||
The desired checksum algorithm is indicated in the superblock, though as
|
||||
of October 2012 the only supported algorithm is crc32c. Some data
|
||||
structures did not have space to fit a full 32-bit checksum, so only the
|
||||
@ -20,7 +20,7 @@ encounters directory blocks that lack sufficient empty space to add a
|
||||
checksum, it will request that you run ``e2fsck -D`` to have the
|
||||
directories rebuilt with checksums. This has the added benefit of
|
||||
removing slack space from the directory files and rebalancing the htree
|
||||
indexes. If you \_ignore\_ this step, your directories will not be
|
||||
indexes. If you _ignore_ this step, your directories will not be
|
||||
protected by a checksum!
|
||||
|
||||
The following table describes the data elements that go into each type
|
||||
@ -35,39 +35,39 @@ of checksum. The checksum function is whatever the superblock describes
|
||||
- Length
|
||||
- Ingredients
|
||||
* - Superblock
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- The entire superblock up to the checksum field. The UUID lives inside
|
||||
the superblock.
|
||||
* - MMP
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + the entire MMP block up to the checksum field.
|
||||
* - Extended Attributes
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + the entire extended attribute block. The checksum field is set to
|
||||
zero.
|
||||
* - Directory Entries
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + inode number + inode generation + the directory block up to the
|
||||
fake entry enclosing the checksum field.
|
||||
* - HTREE Nodes
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + inode number + inode generation + all valid extents + HTREE tail.
|
||||
The checksum field is set to zero.
|
||||
* - Extents
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + inode number + inode generation + the entire extent block up to
|
||||
the checksum field.
|
||||
* - Bitmaps
|
||||
- \_\_le32 or \_\_le16
|
||||
- __le32 or __le16
|
||||
- UUID + the entire bitmap. Checksums are stored in the group descriptor,
|
||||
and truncated if the group descriptor size is 32 bytes (i.e. ^64bit)
|
||||
* - Inodes
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- UUID + inode number + inode generation + the entire inode. The checksum
|
||||
field is set to zero. Each inode has its own checksum.
|
||||
* - Group Descriptors
|
||||
- \_\_le16
|
||||
- If metadata\_csum, then UUID + group number + the entire descriptor;
|
||||
else if gdt\_csum, then crc16(UUID + group number + the entire
|
||||
- __le16
|
||||
- If metadata_csum, then UUID + group number + the entire descriptor;
|
||||
else if gdt_csum, then crc16(UUID + group number + the entire
|
||||
descriptor). In all cases, only the lower 16 bits are stored.
|
||||
|
||||
|
@ -42,24 +42,24 @@ is at most 263 bytes long, though on disk you'll need to reference
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- inode
|
||||
- Number of the inode that this directory entry points to.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- rec\_len
|
||||
- __le16
|
||||
- rec_len
|
||||
- Length of this directory entry. Must be a multiple of 4.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- name\_len
|
||||
- __le16
|
||||
- name_len
|
||||
- Length of the file name.
|
||||
* - 0x8
|
||||
- char
|
||||
- name[EXT4\_NAME\_LEN]
|
||||
- name[EXT4_NAME_LEN]
|
||||
- File name.
|
||||
|
||||
Since file names cannot be longer than 255 bytes, the new directory
|
||||
entry format shortens the name\_len field and uses the space for a file
|
||||
entry format shortens the name_len field and uses the space for a file
|
||||
type flag, probably to avoid having to load every inode during directory
|
||||
tree traversal. This format is ``ext4_dir_entry_2``, which is at most
|
||||
263 bytes long, though on disk you'll need to reference
|
||||
@ -74,24 +74,24 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- inode
|
||||
- Number of the inode that this directory entry points to.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- rec\_len
|
||||
- __le16
|
||||
- rec_len
|
||||
- Length of this directory entry.
|
||||
* - 0x6
|
||||
- \_\_u8
|
||||
- name\_len
|
||||
- __u8
|
||||
- name_len
|
||||
- Length of the file name.
|
||||
* - 0x7
|
||||
- \_\_u8
|
||||
- file\_type
|
||||
- __u8
|
||||
- file_type
|
||||
- File type code, see ftype_ table below.
|
||||
* - 0x8
|
||||
- char
|
||||
- name[EXT4\_NAME\_LEN]
|
||||
- name[EXT4_NAME_LEN]
|
||||
- File name.
|
||||
|
||||
.. _ftype:
|
||||
@ -137,19 +137,19 @@ entry uses this extension, it may be up to 271 bytes.
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- hash
|
||||
- The hash of the directory name
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- minor\_hash
|
||||
- __le32
|
||||
- minor_hash
|
||||
- The minor hash of the directory name
|
||||
|
||||
|
||||
In order to add checksums to these classic directory blocks, a phony
|
||||
``struct ext4_dir_entry`` is placed at the end of each leaf block to
|
||||
hold the checksum. The directory entry is 12 bytes long. The inode
|
||||
number and name\_len fields are set to zero to fool old software into
|
||||
number and name_len fields are set to zero to fool old software into
|
||||
ignoring an apparently empty directory entry, and the checksum is stored
|
||||
in the place where the name normally goes. The structure is
|
||||
``struct ext4_dir_entry_tail``:
|
||||
@ -163,24 +163,24 @@ in the place where the name normally goes. The structure is
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- det\_reserved\_zero1
|
||||
- __le32
|
||||
- det_reserved_zero1
|
||||
- Inode number, which must be zero.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- det\_rec\_len
|
||||
- __le16
|
||||
- det_rec_len
|
||||
- Length of this directory entry, which must be 12.
|
||||
* - 0x6
|
||||
- \_\_u8
|
||||
- det\_reserved\_zero2
|
||||
- __u8
|
||||
- det_reserved_zero2
|
||||
- Length of the file name, which must be zero.
|
||||
* - 0x7
|
||||
- \_\_u8
|
||||
- det\_reserved\_ft
|
||||
- __u8
|
||||
- det_reserved_ft
|
||||
- File type, which must be 0xDE.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- det\_checksum
|
||||
- __le32
|
||||
- det_checksum
|
||||
- Directory leaf block checksum.
|
||||
|
||||
The leaf directory block checksum is calculated against the FS UUID, the
|
||||
@ -194,7 +194,7 @@ Hash Tree Directories
|
||||
A linear array of directory entries isn't great for performance, so a
|
||||
new feature was added to ext3 to provide a faster (but peculiar)
|
||||
balanced tree keyed off a hash of the directory entry name. If the
|
||||
EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a
|
||||
EXT4_INDEX_FL (0x1000) flag is set in the inode, this directory uses a
|
||||
hashed btree (htree) to organize and find directory entries. For
|
||||
backwards read-only compatibility with ext2, this tree is actually
|
||||
hidden inside the directory file, masquerading as “empty” directory data
|
||||
@ -206,14 +206,14 @@ rest of the directory block is empty so that it moves on.
|
||||
The root of the tree always lives in the first data block of the
|
||||
directory. By ext2 custom, the '.' and '..' entries must appear at the
|
||||
beginning of this first block, so they are put here as two
|
||||
``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of
|
||||
``struct ext4_dir_entry_2`` s and not stored in the tree. The rest of
|
||||
the root node contains metadata about the tree and finally a hash->block
|
||||
map to find nodes that are lower in the htree. If
|
||||
``dx_root.info.indirect_levels`` is non-zero then the htree has two
|
||||
levels; the data block pointed to by the root node's map is an interior
|
||||
node, which is indexed by a minor hash. Interior nodes in this tree
|
||||
contain a zeroed out ``struct ext4_dir_entry_2`` followed by a
|
||||
minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear
|
||||
minor_hash->block map to find leaf nodes. Leaf nodes contain a linear
|
||||
array of all ``struct ext4_dir_entry_2``; all of these entries
|
||||
(presumably) hash to the same value. If there is an overflow, the
|
||||
entries simply overflow into the next leaf node, and the
|
||||
@ -245,83 +245,83 @@ of a data block:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- dot.inode
|
||||
- inode number of this directory.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- dot.rec\_len
|
||||
- __le16
|
||||
- dot.rec_len
|
||||
- Length of this record, 12.
|
||||
* - 0x6
|
||||
- u8
|
||||
- dot.name\_len
|
||||
- dot.name_len
|
||||
- Length of the name, 1.
|
||||
* - 0x7
|
||||
- u8
|
||||
- dot.file\_type
|
||||
- dot.file_type
|
||||
- File type of this entry, 0x2 (directory) (if the feature flag is set).
|
||||
* - 0x8
|
||||
- char
|
||||
- dot.name[4]
|
||||
- “.\\0\\0\\0”
|
||||
- “.\0\0\0”
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- dotdot.inode
|
||||
- inode number of parent directory.
|
||||
* - 0x10
|
||||
- \_\_le16
|
||||
- dotdot.rec\_len
|
||||
- block\_size - 12. The record length is long enough to cover all htree
|
||||
- __le16
|
||||
- dotdot.rec_len
|
||||
- block_size - 12. The record length is long enough to cover all htree
|
||||
data.
|
||||
* - 0x12
|
||||
- u8
|
||||
- dotdot.name\_len
|
||||
- dotdot.name_len
|
||||
- Length of the name, 2.
|
||||
* - 0x13
|
||||
- u8
|
||||
- dotdot.file\_type
|
||||
- dotdot.file_type
|
||||
- File type of this entry, 0x2 (directory) (if the feature flag is set).
|
||||
* - 0x14
|
||||
- char
|
||||
- dotdot\_name[4]
|
||||
- “..\\0\\0”
|
||||
- dotdot_name[4]
|
||||
- “..\0\0”
|
||||
* - 0x18
|
||||
- \_\_le32
|
||||
- struct dx\_root\_info.reserved\_zero
|
||||
- __le32
|
||||
- struct dx_root_info.reserved_zero
|
||||
- Zero.
|
||||
* - 0x1C
|
||||
- u8
|
||||
- struct dx\_root\_info.hash\_version
|
||||
- struct dx_root_info.hash_version
|
||||
- Hash type, see dirhash_ table below.
|
||||
* - 0x1D
|
||||
- u8
|
||||
- struct dx\_root\_info.info\_length
|
||||
- struct dx_root_info.info_length
|
||||
- Length of the tree information, 0x8.
|
||||
* - 0x1E
|
||||
- u8
|
||||
- struct dx\_root\_info.indirect\_levels
|
||||
- Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR
|
||||
- struct dx_root_info.indirect_levels
|
||||
- Depth of the htree. Cannot be larger than 3 if the INCOMPAT_LARGEDIR
|
||||
feature is set; cannot be larger than 2 otherwise.
|
||||
* - 0x1F
|
||||
- u8
|
||||
- struct dx\_root\_info.unused\_flags
|
||||
- struct dx_root_info.unused_flags
|
||||
-
|
||||
* - 0x20
|
||||
- \_\_le16
|
||||
- __le16
|
||||
- limit
|
||||
- Maximum number of dx\_entries that can follow this header, plus 1 for
|
||||
- Maximum number of dx_entries that can follow this header, plus 1 for
|
||||
the header itself.
|
||||
* - 0x22
|
||||
- \_\_le16
|
||||
- __le16
|
||||
- count
|
||||
- Actual number of dx\_entries that follow this header, plus 1 for the
|
||||
- Actual number of dx_entries that follow this header, plus 1 for the
|
||||
header itself.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- block
|
||||
- The block number (within the directory file) that goes with hash=0.
|
||||
* - 0x28
|
||||
- struct dx\_entry
|
||||
- struct dx_entry
|
||||
- entries[0]
|
||||
- As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
|
||||
|
||||
@ -362,38 +362,38 @@ also the full length of a data block:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- fake.inode
|
||||
- Zero, to make it look like this entry is not in use.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- fake.rec\_len
|
||||
- The size of the block, in order to hide all of the dx\_node data.
|
||||
- __le16
|
||||
- fake.rec_len
|
||||
- The size of the block, in order to hide all of the dx_node data.
|
||||
* - 0x6
|
||||
- u8
|
||||
- name\_len
|
||||
- name_len
|
||||
- Zero. There is no name for this “unused” directory entry.
|
||||
* - 0x7
|
||||
- u8
|
||||
- file\_type
|
||||
- file_type
|
||||
- Zero. There is no file type for this “unused” directory entry.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- __le16
|
||||
- limit
|
||||
- Maximum number of dx\_entries that can follow this header, plus 1 for
|
||||
- Maximum number of dx_entries that can follow this header, plus 1 for
|
||||
the header itself.
|
||||
* - 0xA
|
||||
- \_\_le16
|
||||
- __le16
|
||||
- count
|
||||
- Actual number of dx\_entries that follow this header, plus 1 for the
|
||||
- Actual number of dx_entries that follow this header, plus 1 for the
|
||||
header itself.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- block
|
||||
- The block number (within the directory file) that goes with the lowest
|
||||
hash value of this block. This value is stored in the parent block.
|
||||
* - 0x10
|
||||
- struct dx\_entry
|
||||
- struct dx_entry
|
||||
- entries[0]
|
||||
- As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
|
||||
|
||||
@ -410,11 +410,11 @@ long:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- hash
|
||||
- Hash code.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- __le32
|
||||
- block
|
||||
- Block number (within the directory file, not filesystem blocks) of the
|
||||
next node in the htree.
|
||||
@ -423,13 +423,13 @@ long:
|
||||
author.)
|
||||
|
||||
If metadata checksums are enabled, the last 8 bytes of the directory
|
||||
block (precisely the length of one dx\_entry) are used to store a
|
||||
block (precisely the length of one dx_entry) are used to store a
|
||||
``struct dx_tail``, which contains the checksum. The ``limit`` and
|
||||
``count`` entries in the dx\_root/dx\_node structures are adjusted as
|
||||
necessary to fit the dx\_tail into the block. If there is no space for
|
||||
the dx\_tail, the user is notified to run e2fsck -D to rebuild the
|
||||
``count`` entries in the dx_root/dx_node structures are adjusted as
|
||||
necessary to fit the dx_tail into the block. If there is no space for
|
||||
the dx_tail, the user is notified to run e2fsck -D to rebuild the
|
||||
directory index (which will ensure that there's space for the checksum).
|
||||
The dx\_tail structure is 8 bytes long and looks like this:
|
||||
The dx_tail structure is 8 bytes long and looks like this:
|
||||
|
||||
.. list-table::
|
||||
:widths: 8 8 24 40
|
||||
@ -441,13 +441,13 @@ The dx\_tail structure is 8 bytes long and looks like this:
|
||||
- Description
|
||||
* - 0x0
|
||||
- u32
|
||||
- dt\_reserved
|
||||
- dt_reserved
|
||||
- Zero.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- dt\_checksum
|
||||
- __le32
|
||||
- dt_checksum
|
||||
- Checksum of the htree directory block.
|
||||
|
||||
The checksum is calculated against the FS UUID, the htree index header
|
||||
(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in
|
||||
use, and the tail block (dx\_tail).
|
||||
(dx_root or dx_node), all of the htree indices (dx_entry) that are in
|
||||
use, and the tail block (dx_tail).
|
||||
|
@ -5,14 +5,14 @@ Large Extended Attribute Values
|
||||
|
||||
To enable ext4 to store extended attribute values that do not fit in the
|
||||
inode or in the single extended attribute block attached to an inode,
|
||||
the EA\_INODE feature allows us to store the value in the data blocks of
|
||||
the EA_INODE feature allows us to store the value in the data blocks of
|
||||
a regular file inode. This “EA inode” is linked only from the extended
|
||||
attribute name index and must not appear in a directory entry. The
|
||||
inode's i\_atime field is used to store a checksum of the xattr value;
|
||||
and i\_ctime/i\_version store a 64-bit reference count, which enables
|
||||
inode's i_atime field is used to store a checksum of the xattr value;
|
||||
and i_ctime/i_version store a 64-bit reference count, which enables
|
||||
sharing of large xattr values between multiple owning inodes. For
|
||||
backward compatibility with older versions of this feature, the
|
||||
i\_mtime/i\_generation *may* store a back-reference to the inode number
|
||||
and i\_generation of the **one** owning inode (in cases where the EA
|
||||
i_mtime/i_generation *may* store a back-reference to the inode number
|
||||
and i_generation of the **one** owning inode (in cases where the EA
|
||||
inode is not referenced by multiple inodes) to verify that the EA inode
|
||||
is the correct one being accessed.
|
||||
|
@ -7,34 +7,34 @@ Each block group on the filesystem has one of these descriptors
|
||||
associated with it. As noted in the Layout section above, the group
|
||||
descriptors (if present) are the second item in the block group. The
|
||||
standard configuration is for each block group to contain a full copy of
|
||||
the block group descriptor table unless the sparse\_super feature flag
|
||||
the block group descriptor table unless the sparse_super feature flag
|
||||
is set.
|
||||
|
||||
Notice how the group descriptor records the location of both bitmaps and
|
||||
the inode table (i.e. they can float). This means that within a block
|
||||
group, the only data structures with fixed locations are the superblock
|
||||
and the group descriptor table. The flex\_bg mechanism uses this
|
||||
and the group descriptor table. The flex_bg mechanism uses this
|
||||
property to group several block groups into a flex group and lay out all
|
||||
of the groups' bitmaps and inode tables into one long run in the first
|
||||
group of the flex group.
|
||||
|
||||
If the meta\_bg feature flag is set, then several block groups are
|
||||
grouped together into a meta group. Note that in the meta\_bg case,
|
||||
If the meta_bg feature flag is set, then several block groups are
|
||||
grouped together into a meta group. Note that in the meta_bg case,
|
||||
however, the first and last two block groups within the larger meta
|
||||
group contain only group descriptors for the groups inside the meta
|
||||
group.
|
||||
|
||||
flex\_bg and meta\_bg do not appear to be mutually exclusive features.
|
||||
flex_bg and meta_bg do not appear to be mutually exclusive features.
|
||||
|
||||
In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the
|
||||
block group descriptor was only 32 bytes long and therefore ends at
|
||||
bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the
|
||||
bg_checksum. On an ext4 filesystem with the 64bit feature enabled, the
|
||||
block group descriptor expands to at least the 64 bytes described below;
|
||||
the size is stored in the superblock.
|
||||
|
||||
If gdt\_csum is set and metadata\_csum is not set, the block group
|
||||
If gdt_csum is set and metadata_csum is not set, the block group
|
||||
checksum is the crc16 of the FS UUID, the group number, and the group
|
||||
descriptor structure. If metadata\_csum is set, then the block group
|
||||
descriptor structure. If metadata_csum is set, then the block group
|
||||
checksum is the lower 16 bits of the checksum of the FS UUID, the group
|
||||
number, and the group descriptor structure. Both block and inode bitmap
|
||||
checksums are calculated against the FS UUID, the group number, and the
|
||||
@ -51,59 +51,59 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- bg\_block\_bitmap\_lo
|
||||
- __le32
|
||||
- bg_block_bitmap_lo
|
||||
- Lower 32-bits of location of block bitmap.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- bg\_inode\_bitmap\_lo
|
||||
- __le32
|
||||
- bg_inode_bitmap_lo
|
||||
- Lower 32-bits of location of inode bitmap.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- bg\_inode\_table\_lo
|
||||
- __le32
|
||||
- bg_inode_table_lo
|
||||
- Lower 32-bits of location of inode table.
|
||||
* - 0xC
|
||||
- \_\_le16
|
||||
- bg\_free\_blocks\_count\_lo
|
||||
- __le16
|
||||
- bg_free_blocks_count_lo
|
||||
- Lower 16-bits of free block count.
|
||||
* - 0xE
|
||||
- \_\_le16
|
||||
- bg\_free\_inodes\_count\_lo
|
||||
- __le16
|
||||
- bg_free_inodes_count_lo
|
||||
- Lower 16-bits of free inode count.
|
||||
* - 0x10
|
||||
- \_\_le16
|
||||
- bg\_used\_dirs\_count\_lo
|
||||
- __le16
|
||||
- bg_used_dirs_count_lo
|
||||
- Lower 16-bits of directory count.
|
||||
* - 0x12
|
||||
- \_\_le16
|
||||
- bg\_flags
|
||||
- __le16
|
||||
- bg_flags
|
||||
- Block group flags. See the bgflags_ table below.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- bg\_exclude\_bitmap\_lo
|
||||
- __le32
|
||||
- bg_exclude_bitmap_lo
|
||||
- Lower 32-bits of location of snapshot exclusion bitmap.
|
||||
* - 0x18
|
||||
- \_\_le16
|
||||
- bg\_block\_bitmap\_csum\_lo
|
||||
- __le16
|
||||
- bg_block_bitmap_csum_lo
|
||||
- Lower 16-bits of the block bitmap checksum.
|
||||
* - 0x1A
|
||||
- \_\_le16
|
||||
- bg\_inode\_bitmap\_csum\_lo
|
||||
- __le16
|
||||
- bg_inode_bitmap_csum_lo
|
||||
- Lower 16-bits of the inode bitmap checksum.
|
||||
* - 0x1C
|
||||
- \_\_le16
|
||||
- bg\_itable\_unused\_lo
|
||||
- __le16
|
||||
- bg_itable_unused_lo
|
||||
- Lower 16-bits of unused inode count. If set, we needn't scan past the
|
||||
``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the
|
||||
``(sb.s_inodes_per_group - gdt.bg_itable_unused)`` th entry in the
|
||||
inode table for this group.
|
||||
* - 0x1E
|
||||
- \_\_le16
|
||||
- bg\_checksum
|
||||
- Group descriptor checksum; crc16(sb\_uuid+group\_num+bg\_desc) if the
|
||||
RO\_COMPAT\_GDT\_CSUM feature is set, or
|
||||
crc32c(sb\_uuid+group\_num+bg\_desc) & 0xFFFF if the
|
||||
RO\_COMPAT\_METADATA\_CSUM feature is set. The bg\_checksum
|
||||
field in bg\_desc is skipped when calculating crc16 checksum,
|
||||
- __le16
|
||||
- bg_checksum
|
||||
- Group descriptor checksum; crc16(sb_uuid+group_num+bg_desc) if the
|
||||
RO_COMPAT_GDT_CSUM feature is set, or
|
||||
crc32c(sb_uuid+group_num+bg_desc) & 0xFFFF if the
|
||||
RO_COMPAT_METADATA_CSUM feature is set. The bg_checksum
|
||||
field in bg_desc is skipped when calculating crc16 checksum,
|
||||
and set to zero if crc32c checksum is used.
|
||||
* -
|
||||
-
|
||||
@ -111,48 +111,48 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
|
||||
- These fields only exist if the 64bit feature is enabled and s_desc_size
|
||||
> 32.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- bg\_block\_bitmap\_hi
|
||||
- __le32
|
||||
- bg_block_bitmap_hi
|
||||
- Upper 32-bits of location of block bitmap.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- bg\_inode\_bitmap\_hi
|
||||
- __le32
|
||||
- bg_inode_bitmap_hi
|
||||
- Upper 32-bits of location of inode bitmap.
|
||||
* - 0x28
|
||||
- \_\_le32
|
||||
- bg\_inode\_table\_hi
|
||||
- __le32
|
||||
- bg_inode_table_hi
|
||||
- Upper 32-bits of location of inode table.
|
||||
* - 0x2C
|
||||
- \_\_le16
|
||||
- bg\_free\_blocks\_count\_hi
|
||||
- __le16
|
||||
- bg_free_blocks_count_hi
|
||||
- Upper 16-bits of free block count.
|
||||
* - 0x2E
|
||||
- \_\_le16
|
||||
- bg\_free\_inodes\_count\_hi
|
||||
- __le16
|
||||
- bg_free_inodes_count_hi
|
||||
- Upper 16-bits of free inode count.
|
||||
* - 0x30
|
||||
- \_\_le16
|
||||
- bg\_used\_dirs\_count\_hi
|
||||
- __le16
|
||||
- bg_used_dirs_count_hi
|
||||
- Upper 16-bits of directory count.
|
||||
* - 0x32
|
||||
- \_\_le16
|
||||
- bg\_itable\_unused\_hi
|
||||
- __le16
|
||||
- bg_itable_unused_hi
|
||||
- Upper 16-bits of unused inode count.
|
||||
* - 0x34
|
||||
- \_\_le32
|
||||
- bg\_exclude\_bitmap\_hi
|
||||
- __le32
|
||||
- bg_exclude_bitmap_hi
|
||||
- Upper 32-bits of location of snapshot exclusion bitmap.
|
||||
* - 0x38
|
||||
- \_\_le16
|
||||
- bg\_block\_bitmap\_csum\_hi
|
||||
- __le16
|
||||
- bg_block_bitmap_csum_hi
|
||||
- Upper 16-bits of the block bitmap checksum.
|
||||
* - 0x3A
|
||||
- \_\_le16
|
||||
- bg\_inode\_bitmap\_csum\_hi
|
||||
- __le16
|
||||
- bg_inode_bitmap_csum_hi
|
||||
- Upper 16-bits of the inode bitmap checksum.
|
||||
* - 0x3C
|
||||
- \_\_u32
|
||||
- bg\_reserved
|
||||
- __u32
|
||||
- bg_reserved
|
||||
- Padding to 64 bytes.
|
||||
|
||||
.. _bgflags:
|
||||
@ -166,8 +166,8 @@ Block group flags can be any combination of the following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT).
|
||||
- inode table and bitmap are not initialized (EXT4_BG_INODE_UNINIT).
|
||||
* - 0x2
|
||||
- block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT).
|
||||
- block bitmap is not initialized (EXT4_BG_BLOCK_UNINIT).
|
||||
* - 0x4
|
||||
- inode table is zeroed (EXT4\_BG\_INODE\_ZEROED).
|
||||
- inode table is zeroed (EXT4_BG_INODE_ZEROED).
|
||||
|
@ -1,6 +1,6 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
The Contents of inode.i\_block
|
||||
The Contents of inode.i_block
|
||||
------------------------------
|
||||
|
||||
Depending on the type of file an inode describes, the 60 bytes of
|
||||
@ -47,7 +47,7 @@ In ext4, the file to logical block map has been replaced with an extent
|
||||
tree. Under the old scheme, allocating a contiguous run of 1,000 blocks
|
||||
requires an indirect block to map all 1,000 entries; with extents, the
|
||||
mapping is reduced to a single ``struct ext4_extent`` with
|
||||
``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate
|
||||
``ee_len = 1000``. If flex_bg is enabled, it is possible to allocate
|
||||
very large files with a single extent, at a considerable reduction in
|
||||
metadata block use, and some improvement in disk efficiency. The inode
|
||||
must have the extents flag (0x80000) set for this feature to be in
|
||||
@ -76,28 +76,28 @@ which is 12 bytes long:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- eh\_magic
|
||||
- __le16
|
||||
- eh_magic
|
||||
- Magic number, 0xF30A.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- eh\_entries
|
||||
- __le16
|
||||
- eh_entries
|
||||
- Number of valid entries following the header.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- eh\_max
|
||||
- __le16
|
||||
- eh_max
|
||||
- Maximum number of entries that could follow the header.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- eh\_depth
|
||||
- __le16
|
||||
- eh_depth
|
||||
- Depth of this extent node in the extent tree. 0 = this extent node
|
||||
points to data blocks; otherwise, this extent node points to other
|
||||
extent nodes. The extent tree can be at most 5 levels deep: a logical
|
||||
block number can be at most ``2^32``, and the smallest ``n`` that
|
||||
satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- eh\_generation
|
||||
- __le32
|
||||
- eh_generation
|
||||
- Generation of the tree. (Used by Lustre, but not standard ext4).
|
||||
|
||||
Internal nodes of the extent tree, also known as index nodes, are
|
||||
@ -112,22 +112,22 @@ recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- ei\_block
|
||||
- __le32
|
||||
- ei_block
|
||||
- This index node covers file blocks from 'block' onward.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- ei\_leaf\_lo
|
||||
- __le32
|
||||
- ei_leaf_lo
|
||||
- Lower 32-bits of the block number of the extent node that is the next
|
||||
level lower in the tree. The tree node pointed to can be either another
|
||||
internal node or a leaf node, described below.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- ei\_leaf\_hi
|
||||
- __le16
|
||||
- ei_leaf_hi
|
||||
- Upper 16-bits of the previous field.
|
||||
* - 0xA
|
||||
- \_\_u16
|
||||
- ei\_unused
|
||||
- __u16
|
||||
- ei_unused
|
||||
-
|
||||
|
||||
Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
|
||||
@ -142,24 +142,24 @@ and are also 12 bytes long:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- ee\_block
|
||||
- __le32
|
||||
- ee_block
|
||||
- First file block number that this extent covers.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- ee\_len
|
||||
- __le16
|
||||
- ee_len
|
||||
- Number of blocks covered by extent. If the value of this field is <=
|
||||
32768, the extent is initialized. If the value of the field is > 32768,
|
||||
the extent is uninitialized and the actual extent length is ``ee_len`` -
|
||||
32768. Therefore, the maximum length of an initialized extent is 32768
|
||||
blocks, and the maximum length of an uninitialized extent is 32767.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- ee\_start\_hi
|
||||
- __le16
|
||||
- ee_start_hi
|
||||
- Upper 16-bits of the block number to which this extent points.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- ee\_start\_lo
|
||||
- __le32
|
||||
- ee_start_lo
|
||||
- Lower 32-bits of the block number to which this extent points.
|
||||
|
||||
Prior to the introduction of metadata checksums, the extent header +
|
||||
@ -182,8 +182,8 @@ including) the checksum itself.
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- eb\_checksum
|
||||
- __le32
|
||||
- eb_checksum
|
||||
- Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock)
|
||||
|
||||
Inline Data
|
||||
|
@ -11,12 +11,12 @@ file is smaller than 60 bytes, then the data are stored inline in
|
||||
attribute space, then it might be found as an extended attribute
|
||||
“system.data” within the inode body (“ibody EA”). This of course
|
||||
constrains the amount of extended attributes one can attach to an inode.
|
||||
If the data size increases beyond i\_block + ibody EA, a regular block
|
||||
If the data size increases beyond i_block + ibody EA, a regular block
|
||||
is allocated and the contents moved to that block.
|
||||
|
||||
Pending a change to compact the extended attribute key used to store
|
||||
inline data, one ought to be able to store 160 bytes of data in a
|
||||
256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to
|
||||
256-byte inode (as of June 2015, when i_extra_isize is 28). Prior to
|
||||
that, the limit was 156 bytes due to inefficient use of inode space.
|
||||
|
||||
The inline data feature requires the presence of an extended attribute
|
||||
@ -25,12 +25,12 @@ for “system.data”, even if the attribute value is zero length.
|
||||
Inline Directories
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The first four bytes of i\_block are the inode number of the parent
|
||||
The first four bytes of i_block are the inode number of the parent
|
||||
directory. Following that is a 56-byte space for an array of directory
|
||||
entries; see ``struct ext4_dir_entry``. If there is a “system.data”
|
||||
attribute in the inode body, the EA value is an array of
|
||||
``struct ext4_dir_entry`` as well. Note that for inline directories, the
|
||||
i\_block and EA space are treated as separate dirent blocks; directory
|
||||
i_block and EA space are treated as separate dirent blocks; directory
|
||||
entries cannot span the two.
|
||||
|
||||
Inline directory entries are not checksummed, as the inode checksum
|
||||
|
@ -38,138 +38,138 @@ The inode table entry is laid out in ``struct ext4_inode``.
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- i\_mode
|
||||
- __le16
|
||||
- i_mode
|
||||
- File mode. See the table i_mode_ below.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- i\_uid
|
||||
- __le16
|
||||
- i_uid
|
||||
- Lower 16-bits of Owner UID.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- i\_size\_lo
|
||||
- __le32
|
||||
- i_size_lo
|
||||
- Lower 32-bits of size in bytes.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- i\_atime
|
||||
- Last access time, in seconds since the epoch. However, if the EA\_INODE
|
||||
- __le32
|
||||
- i_atime
|
||||
- Last access time, in seconds since the epoch. However, if the EA_INODE
|
||||
inode flag is set, this inode stores an extended attribute value and
|
||||
this field contains the checksum of the value.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- i\_ctime
|
||||
- __le32
|
||||
- i_ctime
|
||||
- Last inode change time, in seconds since the epoch. However, if the
|
||||
EA\_INODE inode flag is set, this inode stores an extended attribute
|
||||
EA_INODE inode flag is set, this inode stores an extended attribute
|
||||
value and this field contains the lower 32 bits of the attribute value's
|
||||
reference count.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- i\_mtime
|
||||
- __le32
|
||||
- i_mtime
|
||||
- Last data modification time, in seconds since the epoch. However, if the
|
||||
EA\_INODE inode flag is set, this inode stores an extended attribute
|
||||
EA_INODE inode flag is set, this inode stores an extended attribute
|
||||
value and this field contains the number of the inode that owns the
|
||||
extended attribute.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- i\_dtime
|
||||
- __le32
|
||||
- i_dtime
|
||||
- Deletion Time, in seconds since the epoch.
|
||||
* - 0x18
|
||||
- \_\_le16
|
||||
- i\_gid
|
||||
- __le16
|
||||
- i_gid
|
||||
- Lower 16-bits of GID.
|
||||
* - 0x1A
|
||||
- \_\_le16
|
||||
- i\_links\_count
|
||||
- __le16
|
||||
- i_links_count
|
||||
- Hard link count. Normally, ext4 does not permit an inode to have more
|
||||
than 65,000 hard links. This applies to files as well as directories,
|
||||
which means that there cannot be more than 64,998 subdirectories in a
|
||||
directory (each subdirectory's '..' entry counts as a hard link, as does
|
||||
the '.' entry in the directory itself). With the DIR\_NLINK feature
|
||||
the '.' entry in the directory itself). With the DIR_NLINK feature
|
||||
enabled, ext4 supports more than 64,998 subdirectories by setting this
|
||||
field to 1 to indicate that the number of hard links is not known.
|
||||
* - 0x1C
|
||||
- \_\_le32
|
||||
- i\_blocks\_lo
|
||||
- Lower 32-bits of “block” count. If the huge\_file feature flag is not
|
||||
- __le32
|
||||
- i_blocks_lo
|
||||
- Lower 32-bits of “block” count. If the huge_file feature flag is not
|
||||
set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks
|
||||
on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in
|
||||
on disk. If huge_file is set and EXT4_HUGE_FILE_FL is NOT set in
|
||||
``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi
|
||||
<< 32)`` 512-byte blocks on disk. If huge\_file is set and
|
||||
EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file
|
||||
<< 32)`` 512-byte blocks on disk. If huge_file is set and
|
||||
EXT4_HUGE_FILE_FL IS set in ``inode.i_flags``, then this file
|
||||
consumes ``i_blocks_lo + (i_blocks_hi << 32)`` filesystem blocks on
|
||||
disk.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- i\_flags
|
||||
- __le32
|
||||
- i_flags
|
||||
- Inode flags. See the table i_flags_ below.
|
||||
* - 0x24
|
||||
- 4 bytes
|
||||
- i\_osd1
|
||||
- i_osd1
|
||||
- See the table i_osd1_ for more details.
|
||||
* - 0x28
|
||||
- 60 bytes
|
||||
- i\_block[EXT4\_N\_BLOCKS=15]
|
||||
- Block map or extent tree. See the section “The Contents of inode.i\_block”.
|
||||
- i_block[EXT4_N_BLOCKS=15]
|
||||
- Block map or extent tree. See the section “The Contents of inode.i_block”.
|
||||
* - 0x64
|
||||
- \_\_le32
|
||||
- i\_generation
|
||||
- __le32
|
||||
- i_generation
|
||||
- File version (for NFS).
|
||||
* - 0x68
|
||||
- \_\_le32
|
||||
- i\_file\_acl\_lo
|
||||
- __le32
|
||||
- i_file_acl_lo
|
||||
- Lower 32-bits of extended attribute block. ACLs are of course one of
|
||||
many possible extended attributes; I think the name of this field is a
|
||||
result of the first use of extended attributes being for ACLs.
|
||||
* - 0x6C
|
||||
- \_\_le32
|
||||
- i\_size\_high / i\_dir\_acl
|
||||
- __le32
|
||||
- i_size_high / i_dir_acl
|
||||
- Upper 32-bits of file/directory size. In ext2/3 this field was named
|
||||
i\_dir\_acl, though it was usually set to zero and never used.
|
||||
i_dir_acl, though it was usually set to zero and never used.
|
||||
* - 0x70
|
||||
- \_\_le32
|
||||
- i\_obso\_faddr
|
||||
- __le32
|
||||
- i_obso_faddr
|
||||
- (Obsolete) fragment address.
|
||||
* - 0x74
|
||||
- 12 bytes
|
||||
- i\_osd2
|
||||
- i_osd2
|
||||
- See the table i_osd2_ for more details.
|
||||
* - 0x80
|
||||
- \_\_le16
|
||||
- i\_extra\_isize
|
||||
- __le16
|
||||
- i_extra_isize
|
||||
- Size of this inode - 128. Alternately, the size of the extended inode
|
||||
fields beyond the original ext2 inode, including this field.
|
||||
* - 0x82
|
||||
- \_\_le16
|
||||
- i\_checksum\_hi
|
||||
- __le16
|
||||
- i_checksum_hi
|
||||
- Upper 16-bits of the inode checksum.
|
||||
* - 0x84
|
||||
- \_\_le32
|
||||
- i\_ctime\_extra
|
||||
- __le32
|
||||
- i_ctime_extra
|
||||
- Extra change time bits. This provides sub-second precision. See Inode
|
||||
Timestamps section.
|
||||
* - 0x88
|
||||
- \_\_le32
|
||||
- i\_mtime\_extra
|
||||
- __le32
|
||||
- i_mtime_extra
|
||||
- Extra modification time bits. This provides sub-second precision.
|
||||
* - 0x8C
|
||||
- \_\_le32
|
||||
- i\_atime\_extra
|
||||
- __le32
|
||||
- i_atime_extra
|
||||
- Extra access time bits. This provides sub-second precision.
|
||||
* - 0x90
|
||||
- \_\_le32
|
||||
- i\_crtime
|
||||
- __le32
|
||||
- i_crtime
|
||||
- File creation time, in seconds since the epoch.
|
||||
* - 0x94
|
||||
- \_\_le32
|
||||
- i\_crtime\_extra
|
||||
- __le32
|
||||
- i_crtime_extra
|
||||
- Extra file creation time bits. This provides sub-second precision.
|
||||
* - 0x98
|
||||
- \_\_le32
|
||||
- i\_version\_hi
|
||||
- __le32
|
||||
- i_version_hi
|
||||
- Upper 32-bits for version number.
|
||||
* - 0x9C
|
||||
- \_\_le32
|
||||
- i\_projid
|
||||
- __le32
|
||||
- i_projid
|
||||
- Project ID.
|
||||
|
||||
.. _i_mode:
|
||||
@ -183,45 +183,45 @@ The ``i_mode`` value is a combination of the following flags:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- S\_IXOTH (Others may execute)
|
||||
- S_IXOTH (Others may execute)
|
||||
* - 0x2
|
||||
- S\_IWOTH (Others may write)
|
||||
- S_IWOTH (Others may write)
|
||||
* - 0x4
|
||||
- S\_IROTH (Others may read)
|
||||
- S_IROTH (Others may read)
|
||||
* - 0x8
|
||||
- S\_IXGRP (Group members may execute)
|
||||
- S_IXGRP (Group members may execute)
|
||||
* - 0x10
|
||||
- S\_IWGRP (Group members may write)
|
||||
- S_IWGRP (Group members may write)
|
||||
* - 0x20
|
||||
- S\_IRGRP (Group members may read)
|
||||
- S_IRGRP (Group members may read)
|
||||
* - 0x40
|
||||
- S\_IXUSR (Owner may execute)
|
||||
- S_IXUSR (Owner may execute)
|
||||
* - 0x80
|
||||
- S\_IWUSR (Owner may write)
|
||||
- S_IWUSR (Owner may write)
|
||||
* - 0x100
|
||||
- S\_IRUSR (Owner may read)
|
||||
- S_IRUSR (Owner may read)
|
||||
* - 0x200
|
||||
- S\_ISVTX (Sticky bit)
|
||||
- S_ISVTX (Sticky bit)
|
||||
* - 0x400
|
||||
- S\_ISGID (Set GID)
|
||||
- S_ISGID (Set GID)
|
||||
* - 0x800
|
||||
- S\_ISUID (Set UID)
|
||||
- S_ISUID (Set UID)
|
||||
* -
|
||||
- These are mutually-exclusive file types:
|
||||
* - 0x1000
|
||||
- S\_IFIFO (FIFO)
|
||||
- S_IFIFO (FIFO)
|
||||
* - 0x2000
|
||||
- S\_IFCHR (Character device)
|
||||
- S_IFCHR (Character device)
|
||||
* - 0x4000
|
||||
- S\_IFDIR (Directory)
|
||||
- S_IFDIR (Directory)
|
||||
* - 0x6000
|
||||
- S\_IFBLK (Block device)
|
||||
- S_IFBLK (Block device)
|
||||
* - 0x8000
|
||||
- S\_IFREG (Regular file)
|
||||
- S_IFREG (Regular file)
|
||||
* - 0xA000
|
||||
- S\_IFLNK (Symbolic link)
|
||||
- S_IFLNK (Symbolic link)
|
||||
* - 0xC000
|
||||
- S\_IFSOCK (Socket)
|
||||
- S_IFSOCK (Socket)
|
||||
|
||||
.. _i_flags:
|
||||
|
||||
@ -234,56 +234,56 @@ The ``i_flags`` field is a combination of these values:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented)
|
||||
- This file requires secure deletion (EXT4_SECRM_FL). (not implemented)
|
||||
* - 0x2
|
||||
- This file should be preserved, should undeletion be desired
|
||||
(EXT4\_UNRM\_FL). (not implemented)
|
||||
(EXT4_UNRM_FL). (not implemented)
|
||||
* - 0x4
|
||||
- File is compressed (EXT4\_COMPR\_FL). (not really implemented)
|
||||
- File is compressed (EXT4_COMPR_FL). (not really implemented)
|
||||
* - 0x8
|
||||
- All writes to the file must be synchronous (EXT4\_SYNC\_FL).
|
||||
- All writes to the file must be synchronous (EXT4_SYNC_FL).
|
||||
* - 0x10
|
||||
- File is immutable (EXT4\_IMMUTABLE\_FL).
|
||||
- File is immutable (EXT4_IMMUTABLE_FL).
|
||||
* - 0x20
|
||||
- File can only be appended (EXT4\_APPEND\_FL).
|
||||
- File can only be appended (EXT4_APPEND_FL).
|
||||
* - 0x40
|
||||
- The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL).
|
||||
- The dump(1) utility should not dump this file (EXT4_NODUMP_FL).
|
||||
* - 0x80
|
||||
- Do not update access time (EXT4\_NOATIME\_FL).
|
||||
- Do not update access time (EXT4_NOATIME_FL).
|
||||
* - 0x100
|
||||
- Dirty compressed file (EXT4\_DIRTY\_FL). (not used)
|
||||
- Dirty compressed file (EXT4_DIRTY_FL). (not used)
|
||||
* - 0x200
|
||||
- File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used)
|
||||
- File has one or more compressed clusters (EXT4_COMPRBLK_FL). (not used)
|
||||
* - 0x400
|
||||
- Do not compress file (EXT4\_NOCOMPR\_FL). (not used)
|
||||
- Do not compress file (EXT4_NOCOMPR_FL). (not used)
|
||||
* - 0x800
|
||||
- Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was
|
||||
EXT4\_ECOMPR\_FL (compression error), which was never used.
|
||||
- Encrypted inode (EXT4_ENCRYPT_FL). This bit value previously was
|
||||
EXT4_ECOMPR_FL (compression error), which was never used.
|
||||
* - 0x1000
|
||||
- Directory has hashed indexes (EXT4\_INDEX\_FL).
|
||||
- Directory has hashed indexes (EXT4_INDEX_FL).
|
||||
* - 0x2000
|
||||
- AFS magic directory (EXT4\_IMAGIC\_FL).
|
||||
- AFS magic directory (EXT4_IMAGIC_FL).
|
||||
* - 0x4000
|
||||
- File data must always be written through the journal
|
||||
(EXT4\_JOURNAL\_DATA\_FL).
|
||||
(EXT4_JOURNAL_DATA_FL).
|
||||
* - 0x8000
|
||||
- File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4)
|
||||
- File tail should not be merged (EXT4_NOTAIL_FL). (not used by ext4)
|
||||
* - 0x10000
|
||||
- All directory entry data should be written synchronously (see
|
||||
``dirsync``) (EXT4\_DIRSYNC\_FL).
|
||||
``dirsync``) (EXT4_DIRSYNC_FL).
|
||||
* - 0x20000
|
||||
- Top of directory hierarchy (EXT4\_TOPDIR\_FL).
|
||||
- Top of directory hierarchy (EXT4_TOPDIR_FL).
|
||||
* - 0x40000
|
||||
- This is a huge file (EXT4\_HUGE\_FILE\_FL).
|
||||
- This is a huge file (EXT4_HUGE_FILE_FL).
|
||||
* - 0x80000
|
||||
- Inode uses extents (EXT4\_EXTENTS\_FL).
|
||||
- Inode uses extents (EXT4_EXTENTS_FL).
|
||||
* - 0x100000
|
||||
- Verity protected file (EXT4\_VERITY\_FL).
|
||||
- Verity protected file (EXT4_VERITY_FL).
|
||||
* - 0x200000
|
||||
- Inode stores a large extended attribute value in its data blocks
|
||||
(EXT4\_EA\_INODE\_FL).
|
||||
(EXT4_EA_INODE_FL).
|
||||
* - 0x400000
|
||||
- This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL).
|
||||
- This file has blocks allocated past EOF (EXT4_EOFBLOCKS_FL).
|
||||
(deprecated)
|
||||
* - 0x01000000
|
||||
- Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline)
|
||||
@ -294,21 +294,21 @@ The ``i_flags`` field is a combination of these values:
|
||||
- Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in
|
||||
mainline)
|
||||
* - 0x10000000
|
||||
- Inode has inline data (EXT4\_INLINE\_DATA\_FL).
|
||||
- Inode has inline data (EXT4_INLINE_DATA_FL).
|
||||
* - 0x20000000
|
||||
- Create children with the same project ID (EXT4\_PROJINHERIT\_FL).
|
||||
- Create children with the same project ID (EXT4_PROJINHERIT_FL).
|
||||
* - 0x80000000
|
||||
- Reserved for ext4 library (EXT4\_RESERVED\_FL).
|
||||
- Reserved for ext4 library (EXT4_RESERVED_FL).
|
||||
* -
|
||||
- Aggregate flags:
|
||||
* - 0x705BDFFF
|
||||
- User-visible flags.
|
||||
* - 0x604BC0FF
|
||||
- User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and
|
||||
EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's
|
||||
EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of
|
||||
- User-modifiable flags. Note that while EXT4_JOURNAL_DATA_FL and
|
||||
EXT4_EXTENTS_FL can be set with setattr, they are not in the kernel's
|
||||
EXT4_FL_USER_MODIFIABLE mask, since it needs to handle the setting of
|
||||
these flags in a special manner and they are masked out of the set of
|
||||
flags that are saved directly to i\_flags.
|
||||
flags that are saved directly to i_flags.
|
||||
|
||||
.. _i_osd1:
|
||||
|
||||
@ -325,9 +325,9 @@ Linux:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- l\_i\_version
|
||||
- Inode version. However, if the EA\_INODE inode flag is set, this inode
|
||||
- __le32
|
||||
- l_i_version
|
||||
- Inode version. However, if the EA_INODE inode flag is set, this inode
|
||||
stores an extended attribute value and this field contains the upper 32
|
||||
bits of the attribute value's reference count.
|
||||
|
||||
@ -342,8 +342,8 @@ Hurd:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_i\_translator
|
||||
- __le32
|
||||
- h_i_translator
|
||||
- ??
|
||||
|
||||
Masix:
|
||||
@ -357,8 +357,8 @@ Masix:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- m\_i\_reserved
|
||||
- __le32
|
||||
- m_i_reserved
|
||||
- ??
|
||||
|
||||
.. _i_osd2:
|
||||
@ -376,30 +376,30 @@ Linux:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- l\_i\_blocks\_high
|
||||
- __le16
|
||||
- l_i_blocks_high
|
||||
- Upper 16-bits of the block count. Please see the note attached to
|
||||
i\_blocks\_lo.
|
||||
i_blocks_lo.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- l\_i\_file\_acl\_high
|
||||
- __le16
|
||||
- l_i_file_acl_high
|
||||
- Upper 16-bits of the extended attribute block (historically, the file
|
||||
ACL location). See the Extended Attributes section below.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- l\_i\_uid\_high
|
||||
- __le16
|
||||
- l_i_uid_high
|
||||
- Upper 16-bits of the Owner UID.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- l\_i\_gid\_high
|
||||
- __le16
|
||||
- l_i_gid_high
|
||||
- Upper 16-bits of the GID.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- l\_i\_checksum\_lo
|
||||
- __le16
|
||||
- l_i_checksum_lo
|
||||
- Lower 16-bits of the inode checksum.
|
||||
* - 0xA
|
||||
- \_\_le16
|
||||
- l\_i\_reserved
|
||||
- __le16
|
||||
- l_i_reserved
|
||||
- Unused.
|
||||
|
||||
Hurd:
|
||||
@ -413,24 +413,24 @@ Hurd:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- h\_i\_reserved1
|
||||
- __le16
|
||||
- h_i_reserved1
|
||||
- ??
|
||||
* - 0x2
|
||||
- \_\_u16
|
||||
- h\_i\_mode\_high
|
||||
- __u16
|
||||
- h_i_mode_high
|
||||
- Upper 16-bits of the file mode.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- h\_i\_uid\_high
|
||||
- __le16
|
||||
- h_i_uid_high
|
||||
- Upper 16-bits of the Owner UID.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- h\_i\_gid\_high
|
||||
- __le16
|
||||
- h_i_gid_high
|
||||
- Upper 16-bits of the GID.
|
||||
* - 0x8
|
||||
- \_\_u32
|
||||
- h\_i\_author
|
||||
- __u32
|
||||
- h_i_author
|
||||
- Author code?
|
||||
|
||||
Masix:
|
||||
@ -444,17 +444,17 @@ Masix:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- h\_i\_reserved1
|
||||
- __le16
|
||||
- h_i_reserved1
|
||||
- ??
|
||||
* - 0x2
|
||||
- \_\_u16
|
||||
- m\_i\_file\_acl\_high
|
||||
- __u16
|
||||
- m_i_file_acl_high
|
||||
- Upper 16-bits of the extended attribute block (historically, the file
|
||||
ACL location).
|
||||
* - 0x4
|
||||
- \_\_u32
|
||||
- m\_i\_reserved2[2]
|
||||
- __u32
|
||||
- m_i_reserved2[2]
|
||||
- ??
|
||||
|
||||
Inode Size
|
||||
@ -466,11 +466,11 @@ In ext2 and ext3, the inode structure size was fixed at 128 bytes
|
||||
on-disk inode at format time for all inodes in the filesystem to provide
|
||||
space beyond the end of the original ext2 inode. The on-disk inode
|
||||
record size is recorded in the superblock as ``s_inode_size``. The
|
||||
number of bytes actually used by struct ext4\_inode beyond the original
|
||||
number of bytes actually used by struct ext4_inode beyond the original
|
||||
128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each
|
||||
inode, which allows struct ext4\_inode to grow for a new kernel without
|
||||
inode, which allows struct ext4_inode to grow for a new kernel without
|
||||
having to upgrade all of the on-disk inodes. Access to fields beyond
|
||||
EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within
|
||||
EXT2_GOOD_OLD_INODE_SIZE should be verified to be within
|
||||
``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as
|
||||
of August 2019) the inode structure is 160 bytes
|
||||
(``i_extra_isize = 32``). The extra space between the end of the inode
|
||||
@ -516,7 +516,7 @@ creation time (crtime); this field is 64-bits wide and decoded in the
|
||||
same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible
|
||||
through the regular stat() interface, though debugfs will report them.
|
||||
|
||||
We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)).
|
||||
We use the 32-bit signed time value plus (2^32 * (extra epoch bits)).
|
||||
In other words:
|
||||
|
||||
.. list-table::
|
||||
@ -525,8 +525,8 @@ In other words:
|
||||
|
||||
* - Extra epoch bits
|
||||
- MSB of 32-bit time
|
||||
- Adjustment for signed 32-bit to 64-bit tv\_sec
|
||||
- Decoded 64-bit tv\_sec
|
||||
- Adjustment for signed 32-bit to 64-bit tv_sec
|
||||
- Decoded 64-bit tv_sec
|
||||
- valid time range
|
||||
* - 0 0
|
||||
- 1
|
||||
|
@ -63,8 +63,8 @@ Generally speaking, the journal has this format:
|
||||
:header-rows: 1
|
||||
|
||||
* - Superblock
|
||||
- descriptor\_block (data\_blocks or revocation\_block) [more data or
|
||||
revocations] commit\_block
|
||||
- descriptor_block (data_blocks or revocation_block) [more data or
|
||||
revocations] commit_block
|
||||
- [more transactions...]
|
||||
* -
|
||||
- One transaction
|
||||
@ -93,8 +93,8 @@ superblock.
|
||||
* - 1024 bytes of padding
|
||||
- ext4 Superblock
|
||||
- Journal Superblock
|
||||
- descriptor\_block (data\_blocks or revocation\_block) [more data or
|
||||
revocations] commit\_block
|
||||
- descriptor_block (data_blocks or revocation_block) [more data or
|
||||
revocations] commit_block
|
||||
- [more transactions...]
|
||||
* -
|
||||
-
|
||||
@ -117,17 +117,17 @@ Every block in the journal starts with a common 12-byte header
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- h\_magic
|
||||
- __be32
|
||||
- h_magic
|
||||
- jbd2 magic number, 0xC03B3998.
|
||||
* - 0x4
|
||||
- \_\_be32
|
||||
- h\_blocktype
|
||||
- __be32
|
||||
- h_blocktype
|
||||
- Description of what this block contains. See the jbd2_blocktype_ table
|
||||
below.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- h\_sequence
|
||||
- __be32
|
||||
- h_sequence
|
||||
- The transaction ID that goes with this block.
|
||||
|
||||
.. _jbd2_blocktype:
|
||||
@ -177,99 +177,99 @@ which is 1024 bytes long:
|
||||
-
|
||||
- Static information describing the journal.
|
||||
* - 0x0
|
||||
- journal\_header\_t (12 bytes)
|
||||
- s\_header
|
||||
- journal_header_t (12 bytes)
|
||||
- s_header
|
||||
- Common header identifying this as a superblock.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- s\_blocksize
|
||||
- __be32
|
||||
- s_blocksize
|
||||
- Journal device block size.
|
||||
* - 0x10
|
||||
- \_\_be32
|
||||
- s\_maxlen
|
||||
- __be32
|
||||
- s_maxlen
|
||||
- Total number of blocks in this journal.
|
||||
* - 0x14
|
||||
- \_\_be32
|
||||
- s\_first
|
||||
- __be32
|
||||
- s_first
|
||||
- First block of log information.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- Dynamic information describing the current state of the log.
|
||||
* - 0x18
|
||||
- \_\_be32
|
||||
- s\_sequence
|
||||
- __be32
|
||||
- s_sequence
|
||||
- First commit ID expected in log.
|
||||
* - 0x1C
|
||||
- \_\_be32
|
||||
- s\_start
|
||||
- __be32
|
||||
- s_start
|
||||
- Block number of the start of log. Contrary to the comments, this field
|
||||
being zero does not imply that the journal is clean!
|
||||
* - 0x20
|
||||
- \_\_be32
|
||||
- s\_errno
|
||||
- Error value, as set by jbd2\_journal\_abort().
|
||||
- __be32
|
||||
- s_errno
|
||||
- Error value, as set by jbd2_journal_abort().
|
||||
* -
|
||||
-
|
||||
-
|
||||
- The remaining fields are only valid in a v2 superblock.
|
||||
* - 0x24
|
||||
- \_\_be32
|
||||
- s\_feature\_compat
|
||||
- __be32
|
||||
- s_feature_compat
|
||||
- Compatible feature set. See the table jbd2_compat_ below.
|
||||
* - 0x28
|
||||
- \_\_be32
|
||||
- s\_feature\_incompat
|
||||
- __be32
|
||||
- s_feature_incompat
|
||||
- Incompatible feature set. See the table jbd2_incompat_ below.
|
||||
* - 0x2C
|
||||
- \_\_be32
|
||||
- s\_feature\_ro\_compat
|
||||
- __be32
|
||||
- s_feature_ro_compat
|
||||
- Read-only compatible feature set. There aren't any of these currently.
|
||||
* - 0x30
|
||||
- \_\_u8
|
||||
- s\_uuid[16]
|
||||
- __u8
|
||||
- s_uuid[16]
|
||||
- 128-bit uuid for journal. This is compared against the copy in the ext4
|
||||
super block at mount time.
|
||||
* - 0x40
|
||||
- \_\_be32
|
||||
- s\_nr\_users
|
||||
- __be32
|
||||
- s_nr_users
|
||||
- Number of file systems sharing this journal.
|
||||
* - 0x44
|
||||
- \_\_be32
|
||||
- s\_dynsuper
|
||||
- __be32
|
||||
- s_dynsuper
|
||||
- Location of dynamic super block copy. (Not used?)
|
||||
* - 0x48
|
||||
- \_\_be32
|
||||
- s\_max\_transaction
|
||||
- __be32
|
||||
- s_max_transaction
|
||||
- Limit of journal blocks per transaction. (Not used?)
|
||||
* - 0x4C
|
||||
- \_\_be32
|
||||
- s\_max\_trans\_data
|
||||
- __be32
|
||||
- s_max_trans_data
|
||||
- Limit of data blocks per transaction. (Not used?)
|
||||
* - 0x50
|
||||
- \_\_u8
|
||||
- s\_checksum\_type
|
||||
- __u8
|
||||
- s_checksum_type
|
||||
- Checksum algorithm used for the journal. See jbd2_checksum_type_ for
|
||||
more info.
|
||||
* - 0x51
|
||||
- \_\_u8[3]
|
||||
- s\_padding2
|
||||
- __u8[3]
|
||||
- s_padding2
|
||||
-
|
||||
* - 0x54
|
||||
- \_\_be32
|
||||
- s\_num\_fc\_blocks
|
||||
- __be32
|
||||
- s_num_fc_blocks
|
||||
- Number of fast commit blocks in the journal.
|
||||
* - 0x58
|
||||
- \_\_u32
|
||||
- s\_padding[42]
|
||||
- __u32
|
||||
- s_padding[42]
|
||||
-
|
||||
* - 0xFC
|
||||
- \_\_be32
|
||||
- s\_checksum
|
||||
- __be32
|
||||
- s_checksum
|
||||
- Checksum of the entire superblock, with this field set to zero.
|
||||
* - 0x100
|
||||
- \_\_u8
|
||||
- s\_users[16\*48]
|
||||
- __u8
|
||||
- s_users[16*48]
|
||||
- ids of all file systems sharing the log. e2fsprogs/Linux don't allow
|
||||
shared external journals, but I imagine Lustre (or ocfs2?), which use
|
||||
the jbd2 code, might.
|
||||
@ -286,7 +286,7 @@ The journal compat features are any combination of the following:
|
||||
- Description
|
||||
* - 0x1
|
||||
- Journal maintains checksums on the data blocks.
|
||||
(JBD2\_FEATURE\_COMPAT\_CHECKSUM)
|
||||
(JBD2_FEATURE_COMPAT_CHECKSUM)
|
||||
|
||||
.. _jbd2_incompat:
|
||||
|
||||
@ -299,23 +299,23 @@ The journal incompat features are any combination of the following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE)
|
||||
- Journal has block revocation records. (JBD2_FEATURE_INCOMPAT_REVOKE)
|
||||
* - 0x2
|
||||
- Journal can deal with 64-bit block numbers.
|
||||
(JBD2\_FEATURE\_INCOMPAT\_64BIT)
|
||||
(JBD2_FEATURE_INCOMPAT_64BIT)
|
||||
* - 0x4
|
||||
- Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT)
|
||||
- Journal commits asynchronously. (JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)
|
||||
* - 0x8
|
||||
- This journal uses v2 of the checksum on-disk format. Each journal
|
||||
metadata block gets its own checksum, and the block tags in the
|
||||
descriptor table contain checksums for each of the data blocks in the
|
||||
journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2)
|
||||
journal. (JBD2_FEATURE_INCOMPAT_CSUM_V2)
|
||||
* - 0x10
|
||||
- This journal uses v3 of the checksum on-disk format. This is the same as
|
||||
v2, but the journal block tag size is fixed regardless of the size of
|
||||
block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3)
|
||||
block numbers. (JBD2_FEATURE_INCOMPAT_CSUM_V3)
|
||||
* - 0x20
|
||||
- Journal has fast commit blocks. (JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT)
|
||||
- Journal has fast commit blocks. (JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
|
||||
|
||||
.. _jbd2_checksum_type:
|
||||
|
||||
@ -355,11 +355,11 @@ Descriptor blocks consume at least 36 bytes, but use a full block:
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- journal\_header\_t
|
||||
- journal_header_t
|
||||
- (open coded)
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- struct journal\_block\_tag\_s
|
||||
- struct journal_block_tag_s
|
||||
- open coded array[]
|
||||
- Enough tags either to fill up the block or to describe all the data
|
||||
blocks that follow this descriptor block.
|
||||
@ -367,7 +367,7 @@ Descriptor blocks consume at least 36 bytes, but use a full block:
|
||||
Journal block tags have any of the following formats, depending on which
|
||||
journal feature and block tag flags are set.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is
|
||||
If JBD2_FEATURE_INCOMPAT_CSUM_V3 is set, the journal block tag is
|
||||
defined as ``struct journal_block_tag3_s``, which looks like the
|
||||
following. The size is 16 or 32 bytes.
|
||||
|
||||
@ -380,24 +380,24 @@ following. The size is 16 or 32 bytes.
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_blocknr
|
||||
- __be32
|
||||
- t_blocknr
|
||||
- Lower 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* - 0x4
|
||||
- \_\_be32
|
||||
- t\_flags
|
||||
- __be32
|
||||
- t_flags
|
||||
- Flags that go with the descriptor. See the table jbd2_tag_flags_ for
|
||||
more info.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- t\_blocknr\_high
|
||||
- __be32
|
||||
- t_blocknr_high
|
||||
- Upper 32-bits of the location of where the corresponding data block
|
||||
should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is
|
||||
should end up on disk. This is zero if JBD2_FEATURE_INCOMPAT_64BIT is
|
||||
not enabled.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- t\_checksum
|
||||
- __be32
|
||||
- t_checksum
|
||||
- Checksum of the journal UUID, the sequence number, and the data block.
|
||||
* -
|
||||
-
|
||||
@ -433,7 +433,7 @@ The journal tag flags are any combination of the following:
|
||||
* - 0x8
|
||||
- This is the last tag in this descriptor block.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag
|
||||
If JBD2_FEATURE_INCOMPAT_CSUM_V3 is NOT set, the journal block tag
|
||||
is defined as ``struct journal_block_tag_s``, which looks like the
|
||||
following. The size is 8, 12, 24, or 28 bytes:
|
||||
|
||||
@ -446,18 +446,18 @@ following. The size is 8, 12, 24, or 28 bytes:
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_blocknr
|
||||
- __be32
|
||||
- t_blocknr
|
||||
- Lower 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* - 0x4
|
||||
- \_\_be16
|
||||
- t\_checksum
|
||||
- __be16
|
||||
- t_checksum
|
||||
- Checksum of the journal UUID, the sequence number, and the data block.
|
||||
Note that only the lower 16 bits are stored.
|
||||
* - 0x6
|
||||
- \_\_be16
|
||||
- t\_flags
|
||||
- __be16
|
||||
- t_flags
|
||||
- Flags that go with the descriptor. See the table jbd2_tag_flags_ for
|
||||
more info.
|
||||
* -
|
||||
@ -466,8 +466,8 @@ following. The size is 8, 12, 24, or 28 bytes:
|
||||
- This next field is only present if the super block indicates support for
|
||||
64-bit block numbers.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- t\_blocknr\_high
|
||||
- __be32
|
||||
- t_blocknr_high
|
||||
- Upper 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* -
|
||||
@ -483,8 +483,8 @@ following. The size is 8, 12, 24, or 28 bytes:
|
||||
``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
|
||||
field.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
|
||||
If JBD2_FEATURE_INCOMPAT_CSUM_V2 or
|
||||
JBD2_FEATURE_INCOMPAT_CSUM_V3 are set, the end of the block is a
|
||||
``struct jbd2_journal_block_tail``, which looks like this:
|
||||
|
||||
.. list-table::
|
||||
@ -496,8 +496,8 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_checksum
|
||||
- __be32
|
||||
- t_checksum
|
||||
- Checksum of the journal UUID + the descriptor block, with this field set
|
||||
to zero.
|
||||
|
||||
@ -538,25 +538,25 @@ length, but use a full block:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- journal\_header\_t
|
||||
- r\_header
|
||||
- journal_header_t
|
||||
- r_header
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- r\_count
|
||||
- __be32
|
||||
- r_count
|
||||
- Number of bytes used in this block.
|
||||
* - 0x10
|
||||
- \_\_be32 or \_\_be64
|
||||
- __be32 or __be64
|
||||
- blocks[0]
|
||||
- Blocks to revoke.
|
||||
|
||||
After r\_count is a linear array of block numbers that are effectively
|
||||
After r_count is a linear array of block numbers that are effectively
|
||||
revoked by this transaction. The size of each block number is 8 bytes if
|
||||
the superblock advertises 64-bit block number support, or 4 bytes
|
||||
otherwise.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
|
||||
If JBD2_FEATURE_INCOMPAT_CSUM_V2 or
|
||||
JBD2_FEATURE_INCOMPAT_CSUM_V3 are set, the end of the revocation
|
||||
block is a ``struct jbd2_journal_revoke_tail``, which has this format:
|
||||
|
||||
.. list-table::
|
||||
@ -568,8 +568,8 @@ block is a ``struct jbd2_journal_revoke_tail``, which has this format:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- r\_checksum
|
||||
- __be32
|
||||
- r_checksum
|
||||
- Checksum of the journal UUID + revocation block
|
||||
|
||||
Commit Block
|
||||
@ -592,38 +592,38 @@ bytes long (but uses a full block):
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- journal\_header\_s
|
||||
- journal_header_s
|
||||
- (open coded)
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- unsigned char
|
||||
- h\_chksum\_type
|
||||
- h_chksum_type
|
||||
- The type of checksum to use to verify the integrity of the data blocks
|
||||
in the transaction. See jbd2_checksum_type_ for more info.
|
||||
* - 0xD
|
||||
- unsigned char
|
||||
- h\_chksum\_size
|
||||
- h_chksum_size
|
||||
- The number of bytes used by the checksum. Most likely 4.
|
||||
* - 0xE
|
||||
- unsigned char
|
||||
- h\_padding[2]
|
||||
- h_padding[2]
|
||||
-
|
||||
* - 0x10
|
||||
- \_\_be32
|
||||
- h\_chksum[JBD2\_CHECKSUM\_BYTES]
|
||||
- __be32
|
||||
- h_chksum[JBD2_CHECKSUM_BYTES]
|
||||
- 32 bytes of space to store checksums. If
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3
|
||||
JBD2_FEATURE_INCOMPAT_CSUM_V2 or JBD2_FEATURE_INCOMPAT_CSUM_V3
|
||||
are set, the first ``__be32`` is the checksum of the journal UUID and
|
||||
the entire commit block, with this field zeroed. If
|
||||
JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the
|
||||
JBD2_FEATURE_COMPAT_CHECKSUM is set, the first ``__be32`` is the
|
||||
crc32 of all the blocks already written to the transaction.
|
||||
* - 0x30
|
||||
- \_\_be64
|
||||
- h\_commit\_sec
|
||||
- __be64
|
||||
- h_commit_sec
|
||||
- The time that the transaction was committed, in seconds since the epoch.
|
||||
* - 0x38
|
||||
- \_\_be32
|
||||
- h\_commit\_nsec
|
||||
- __be32
|
||||
- h_commit_nsec
|
||||
- Nanoseconds component of the above timestamp.
|
||||
|
||||
Fast commits
|
||||
|
@ -7,8 +7,8 @@ Multiple mount protection (MMP) is a feature that protects the
|
||||
filesystem against multiple hosts trying to use the filesystem
|
||||
simultaneously. When a filesystem is opened (for mounting, or fsck,
|
||||
etc.), the MMP code running on the node (call it node A) checks a
|
||||
sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the
|
||||
open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then
|
||||
sequence number. If the sequence number is EXT4_MMP_SEQ_CLEAN, the
|
||||
open continues. If the sequence number is EXT4_MMP_SEQ_FSCK, then
|
||||
fsck is (hopefully) running, and open fails immediately. Otherwise, the
|
||||
open code will wait for twice the specified MMP check interval and check
|
||||
the sequence number again. If the sequence number has changed, then the
|
||||
@ -40,38 +40,38 @@ The MMP structure (``struct mmp_struct``) is as follows:
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- mmp\_magic
|
||||
- __le32
|
||||
- mmp_magic
|
||||
- Magic number for MMP, 0x004D4D50 (“MMP”).
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- mmp\_seq
|
||||
- __le32
|
||||
- mmp_seq
|
||||
- Sequence number, updated periodically.
|
||||
* - 0x8
|
||||
- \_\_le64
|
||||
- mmp\_time
|
||||
- __le64
|
||||
- mmp_time
|
||||
- Time that the MMP block was last updated.
|
||||
* - 0x10
|
||||
- char[64]
|
||||
- mmp\_nodename
|
||||
- mmp_nodename
|
||||
- Hostname of the node that opened the filesystem.
|
||||
* - 0x50
|
||||
- char[32]
|
||||
- mmp\_bdevname
|
||||
- mmp_bdevname
|
||||
- Block device name of the filesystem.
|
||||
* - 0x70
|
||||
- \_\_le16
|
||||
- mmp\_check\_interval
|
||||
- __le16
|
||||
- mmp_check_interval
|
||||
- The MMP re-check interval, in seconds.
|
||||
* - 0x72
|
||||
- \_\_le16
|
||||
- mmp\_pad1
|
||||
- __le16
|
||||
- mmp_pad1
|
||||
- Zero.
|
||||
* - 0x74
|
||||
- \_\_le32[226]
|
||||
- mmp\_pad2
|
||||
- __le32[226]
|
||||
- mmp_pad2
|
||||
- Zero.
|
||||
* - 0x3FC
|
||||
- \_\_le32
|
||||
- mmp\_checksum
|
||||
- __le32
|
||||
- mmp_checksum
|
||||
- Checksum of the MMP block.
|
||||
|
@ -7,7 +7,7 @@ An ext4 file system is split into a series of block groups. To reduce
|
||||
performance difficulties due to fragmentation, the block allocator tries
|
||||
very hard to keep each file's blocks within the same group, thereby
|
||||
reducing seek times. The size of a block group is specified in
|
||||
``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 \*
|
||||
``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 *
|
||||
``block_size_in_bytes``. With the default block size of 4KiB, each group
|
||||
will contain 32,768 blocks, for a length of 128MiB. The number of block
|
||||
groups is the size of the device divided by the size of a block group.
|
||||
|
@ -34,7 +34,7 @@ ext4 reserves some inodes for special features, as follows:
|
||||
* - 10
|
||||
- Replica inode, used for some non-upstream feature?
|
||||
* - 11
|
||||
- Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock.
|
||||
- Traditional first non-reserved inode. Usually this is the lost+found directory. See s_first_ino in the superblock.
|
||||
|
||||
Note that there are also some inodes allocated from non-reserved inode numbers
|
||||
for other filesystem features which are not referenced from standard directory
|
||||
@ -47,9 +47,9 @@ hierarchy. These are generally referenced from the superblock. They are:
|
||||
* - Superblock field
|
||||
- Description
|
||||
|
||||
* - s\_lpf\_ino
|
||||
* - s_lpf_ino
|
||||
- Inode number of lost+found directory.
|
||||
* - s\_prj\_quota\_inum
|
||||
* - s_prj_quota_inum
|
||||
- Inode number of quota file tracking project quotas
|
||||
* - s\_orphan\_file\_inum
|
||||
* - s_orphan_file_inum
|
||||
- Inode number of file tracking orphan inodes.
|
||||
|
@ -7,7 +7,7 @@ The superblock records various information about the enclosing
|
||||
filesystem, such as block counts, inode counts, supported features,
|
||||
maintenance information, and more.
|
||||
|
||||
If the sparse\_super feature flag is set, redundant copies of the
|
||||
If the sparse_super feature flag is set, redundant copies of the
|
||||
superblock and group descriptors are kept only in the groups whose group
|
||||
number is either 0 or a power of 3, 5, or 7. If the flag is not set,
|
||||
redundant copies are kept in all groups.
|
||||
@ -27,107 +27,107 @@ The ext4 superblock is laid out as follows in
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- s\_inodes\_count
|
||||
- __le32
|
||||
- s_inodes_count
|
||||
- Total inode count.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- s\_blocks\_count\_lo
|
||||
- __le32
|
||||
- s_blocks_count_lo
|
||||
- Total block count.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- s\_r\_blocks\_count\_lo
|
||||
- __le32
|
||||
- s_r_blocks_count_lo
|
||||
- This number of blocks can only be allocated by the super-user.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- s\_free\_blocks\_count\_lo
|
||||
- __le32
|
||||
- s_free_blocks_count_lo
|
||||
- Free block count.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- s\_free\_inodes\_count
|
||||
- __le32
|
||||
- s_free_inodes_count
|
||||
- Free inode count.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- s\_first\_data\_block
|
||||
- __le32
|
||||
- s_first_data_block
|
||||
- First data block. This must be at least 1 for 1k-block filesystems and
|
||||
is typically 0 for all other block sizes.
|
||||
* - 0x18
|
||||
- \_\_le32
|
||||
- s\_log\_block\_size
|
||||
- Block size is 2 ^ (10 + s\_log\_block\_size).
|
||||
- __le32
|
||||
- s_log_block_size
|
||||
- Block size is 2 ^ (10 + s_log_block_size).
|
||||
* - 0x1C
|
||||
- \_\_le32
|
||||
- s\_log\_cluster\_size
|
||||
- Cluster size is 2 ^ (10 + s\_log\_cluster\_size) blocks if bigalloc is
|
||||
enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size.
|
||||
- __le32
|
||||
- s_log_cluster_size
|
||||
- Cluster size is 2 ^ (10 + s_log_cluster_size) blocks if bigalloc is
|
||||
enabled. Otherwise s_log_cluster_size must equal s_log_block_size.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- s\_blocks\_per\_group
|
||||
- __le32
|
||||
- s_blocks_per_group
|
||||
- Blocks per group.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- s\_clusters\_per\_group
|
||||
- __le32
|
||||
- s_clusters_per_group
|
||||
- Clusters per group, if bigalloc is enabled. Otherwise
|
||||
s\_clusters\_per\_group must equal s\_blocks\_per\_group.
|
||||
s_clusters_per_group must equal s_blocks_per_group.
|
||||
* - 0x28
|
||||
- \_\_le32
|
||||
- s\_inodes\_per\_group
|
||||
- __le32
|
||||
- s_inodes_per_group
|
||||
- Inodes per group.
|
||||
* - 0x2C
|
||||
- \_\_le32
|
||||
- s\_mtime
|
||||
- __le32
|
||||
- s_mtime
|
||||
- Mount time, in seconds since the epoch.
|
||||
* - 0x30
|
||||
- \_\_le32
|
||||
- s\_wtime
|
||||
- __le32
|
||||
- s_wtime
|
||||
- Write time, in seconds since the epoch.
|
||||
* - 0x34
|
||||
- \_\_le16
|
||||
- s\_mnt\_count
|
||||
- __le16
|
||||
- s_mnt_count
|
||||
- Number of mounts since the last fsck.
|
||||
* - 0x36
|
||||
- \_\_le16
|
||||
- s\_max\_mnt\_count
|
||||
- __le16
|
||||
- s_max_mnt_count
|
||||
- Number of mounts beyond which a fsck is needed.
|
||||
* - 0x38
|
||||
- \_\_le16
|
||||
- s\_magic
|
||||
- __le16
|
||||
- s_magic
|
||||
- Magic signature, 0xEF53
|
||||
* - 0x3A
|
||||
- \_\_le16
|
||||
- s\_state
|
||||
- __le16
|
||||
- s_state
|
||||
- File system state. See super_state_ for more info.
|
||||
* - 0x3C
|
||||
- \_\_le16
|
||||
- s\_errors
|
||||
- __le16
|
||||
- s_errors
|
||||
- Behaviour when detecting errors. See super_errors_ for more info.
|
||||
* - 0x3E
|
||||
- \_\_le16
|
||||
- s\_minor\_rev\_level
|
||||
- __le16
|
||||
- s_minor_rev_level
|
||||
- Minor revision level.
|
||||
* - 0x40
|
||||
- \_\_le32
|
||||
- s\_lastcheck
|
||||
- __le32
|
||||
- s_lastcheck
|
||||
- Time of last check, in seconds since the epoch.
|
||||
* - 0x44
|
||||
- \_\_le32
|
||||
- s\_checkinterval
|
||||
- __le32
|
||||
- s_checkinterval
|
||||
- Maximum time between checks, in seconds.
|
||||
* - 0x48
|
||||
- \_\_le32
|
||||
- s\_creator\_os
|
||||
- __le32
|
||||
- s_creator_os
|
||||
- Creator OS. See the table super_creator_ for more info.
|
||||
* - 0x4C
|
||||
- \_\_le32
|
||||
- s\_rev\_level
|
||||
- __le32
|
||||
- s_rev_level
|
||||
- Revision level. See the table super_revision_ for more info.
|
||||
* - 0x50
|
||||
- \_\_le16
|
||||
- s\_def\_resuid
|
||||
- __le16
|
||||
- s_def_resuid
|
||||
- Default uid for reserved blocks.
|
||||
* - 0x52
|
||||
- \_\_le16
|
||||
- s\_def\_resgid
|
||||
- __le16
|
||||
- s_def_resgid
|
||||
- Default gid for reserved blocks.
|
||||
* -
|
||||
-
|
||||
@ -143,50 +143,50 @@ The ext4 superblock is laid out as follows in
|
||||
about a feature in either the compatible or incompatible feature set, it
|
||||
must abort and not try to meddle with things it doesn't understand...
|
||||
* - 0x54
|
||||
- \_\_le32
|
||||
- s\_first\_ino
|
||||
- __le32
|
||||
- s_first_ino
|
||||
- First non-reserved inode.
|
||||
* - 0x58
|
||||
- \_\_le16
|
||||
- s\_inode\_size
|
||||
- __le16
|
||||
- s_inode_size
|
||||
- Size of inode structure, in bytes.
|
||||
* - 0x5A
|
||||
- \_\_le16
|
||||
- s\_block\_group\_nr
|
||||
- __le16
|
||||
- s_block_group_nr
|
||||
- Block group # of this superblock.
|
||||
* - 0x5C
|
||||
- \_\_le32
|
||||
- s\_feature\_compat
|
||||
- __le32
|
||||
- s_feature_compat
|
||||
- Compatible feature set flags. Kernel can still read/write this fs even
|
||||
if it doesn't understand a flag; fsck should not do that. See the
|
||||
super_compat_ table for more info.
|
||||
* - 0x60
|
||||
- \_\_le32
|
||||
- s\_feature\_incompat
|
||||
- __le32
|
||||
- s_feature_incompat
|
||||
- Incompatible feature set. If the kernel or fsck doesn't understand one
|
||||
of these bits, it should stop. See the super_incompat_ table for more
|
||||
info.
|
||||
* - 0x64
|
||||
- \_\_le32
|
||||
- s\_feature\_ro\_compat
|
||||
- __le32
|
||||
- s_feature_ro_compat
|
||||
- Readonly-compatible feature set. If the kernel doesn't understand one of
|
||||
these bits, it can still mount read-only. See the super_rocompat_ table
|
||||
for more info.
|
||||
* - 0x68
|
||||
- \_\_u8
|
||||
- s\_uuid[16]
|
||||
- __u8
|
||||
- s_uuid[16]
|
||||
- 128-bit UUID for volume.
|
||||
* - 0x78
|
||||
- char
|
||||
- s\_volume\_name[16]
|
||||
- s_volume_name[16]
|
||||
- Volume label.
|
||||
* - 0x88
|
||||
- char
|
||||
- s\_last\_mounted[64]
|
||||
- s_last_mounted[64]
|
||||
- Directory where filesystem was last mounted.
|
||||
* - 0xC8
|
||||
- \_\_le32
|
||||
- s\_algorithm\_usage\_bitmap
|
||||
- __le32
|
||||
- s_algorithm_usage_bitmap
|
||||
- For compression (Not used in e2fsprogs/Linux)
|
||||
* -
|
||||
-
|
||||
@ -194,18 +194,18 @@ The ext4 superblock is laid out as follows in
|
||||
- Performance hints. Directory preallocation should only happen if the
|
||||
EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
|
||||
* - 0xCC
|
||||
- \_\_u8
|
||||
- s\_prealloc\_blocks
|
||||
- __u8
|
||||
- s_prealloc_blocks
|
||||
- #. of blocks to try to preallocate for ... files? (Not used in
|
||||
e2fsprogs/Linux)
|
||||
* - 0xCD
|
||||
- \_\_u8
|
||||
- s\_prealloc\_dir\_blocks
|
||||
- __u8
|
||||
- s_prealloc_dir_blocks
|
||||
- #. of blocks to preallocate for directories. (Not used in
|
||||
e2fsprogs/Linux)
|
||||
* - 0xCE
|
||||
- \_\_le16
|
||||
- s\_reserved\_gdt\_blocks
|
||||
- __le16
|
||||
- s_reserved_gdt_blocks
|
||||
- Number of reserved GDT entries for future filesystem expansion.
|
||||
* -
|
||||
-
|
||||
@ -213,281 +213,281 @@ The ext4 superblock is laid out as follows in
|
||||
- Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is
|
||||
set.
|
||||
* - 0xD0
|
||||
- \_\_u8
|
||||
- s\_journal\_uuid[16]
|
||||
- __u8
|
||||
- s_journal_uuid[16]
|
||||
- UUID of journal superblock
|
||||
* - 0xE0
|
||||
- \_\_le32
|
||||
- s\_journal\_inum
|
||||
- __le32
|
||||
- s_journal_inum
|
||||
- inode number of journal file.
|
||||
* - 0xE4
|
||||
- \_\_le32
|
||||
- s\_journal\_dev
|
||||
- __le32
|
||||
- s_journal_dev
|
||||
- Device number of journal file, if the external journal feature flag is
|
||||
set.
|
||||
* - 0xE8
|
||||
- \_\_le32
|
||||
- s\_last\_orphan
|
||||
- __le32
|
||||
- s_last_orphan
|
||||
- Start of list of orphaned inodes to delete.
|
||||
* - 0xEC
|
||||
- \_\_le32
|
||||
- s\_hash\_seed[4]
|
||||
- __le32
|
||||
- s_hash_seed[4]
|
||||
- HTREE hash seed.
|
||||
* - 0xFC
|
||||
- \_\_u8
|
||||
- s\_def\_hash\_version
|
||||
- __u8
|
||||
- s_def_hash_version
|
||||
- Default hash algorithm to use for directory hashes. See super_def_hash_
|
||||
for more info.
|
||||
* - 0xFD
|
||||
- \_\_u8
|
||||
- s\_jnl\_backup\_type
|
||||
- If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the
|
||||
- __u8
|
||||
- s_jnl_backup_type
|
||||
- If this value is 0 or EXT3_JNL_BACKUP_BLOCKS (1), then the
|
||||
``s_jnl_blocks`` field contains a duplicate copy of the inode's
|
||||
``i_block[]`` array and ``i_size``.
|
||||
* - 0xFE
|
||||
- \_\_le16
|
||||
- s\_desc\_size
|
||||
- __le16
|
||||
- s_desc_size
|
||||
- Size of group descriptors, in bytes, if the 64bit incompat feature flag
|
||||
is set.
|
||||
* - 0x100
|
||||
- \_\_le32
|
||||
- s\_default\_mount\_opts
|
||||
- __le32
|
||||
- s_default_mount_opts
|
||||
- Default mount options. See the super_mountopts_ table for more info.
|
||||
* - 0x104
|
||||
- \_\_le32
|
||||
- s\_first\_meta\_bg
|
||||
- First metablock block group, if the meta\_bg feature is enabled.
|
||||
- __le32
|
||||
- s_first_meta_bg
|
||||
- First metablock block group, if the meta_bg feature is enabled.
|
||||
* - 0x108
|
||||
- \_\_le32
|
||||
- s\_mkfs\_time
|
||||
- __le32
|
||||
- s_mkfs_time
|
||||
- When the filesystem was created, in seconds since the epoch.
|
||||
* - 0x10C
|
||||
- \_\_le32
|
||||
- s\_jnl\_blocks[17]
|
||||
- __le32
|
||||
- s_jnl_blocks[17]
|
||||
- Backup copy of the journal inode's ``i_block[]`` array in the first 15
|
||||
elements and i\_size\_high and i\_size in the 16th and 17th elements,
|
||||
elements and i_size_high and i_size in the 16th and 17th elements,
|
||||
respectively.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set.
|
||||
* - 0x150
|
||||
- \_\_le32
|
||||
- s\_blocks\_count\_hi
|
||||
- __le32
|
||||
- s_blocks_count_hi
|
||||
- High 32-bits of the block count.
|
||||
* - 0x154
|
||||
- \_\_le32
|
||||
- s\_r\_blocks\_count\_hi
|
||||
- __le32
|
||||
- s_r_blocks_count_hi
|
||||
- High 32-bits of the reserved block count.
|
||||
* - 0x158
|
||||
- \_\_le32
|
||||
- s\_free\_blocks\_count\_hi
|
||||
- __le32
|
||||
- s_free_blocks_count_hi
|
||||
- High 32-bits of the free block count.
|
||||
* - 0x15C
|
||||
- \_\_le16
|
||||
- s\_min\_extra\_isize
|
||||
- __le16
|
||||
- s_min_extra_isize
|
||||
- All inodes have at least # bytes.
|
||||
* - 0x15E
|
||||
- \_\_le16
|
||||
- s\_want\_extra\_isize
|
||||
- __le16
|
||||
- s_want_extra_isize
|
||||
- New inodes should reserve # bytes.
|
||||
* - 0x160
|
||||
- \_\_le32
|
||||
- s\_flags
|
||||
- __le32
|
||||
- s_flags
|
||||
- Miscellaneous flags. See the super_flags_ table for more info.
|
||||
* - 0x164
|
||||
- \_\_le16
|
||||
- s\_raid\_stride
|
||||
- __le16
|
||||
- s_raid_stride
|
||||
- RAID stride. This is the number of logical blocks read from or written
|
||||
to the disk before moving to the next disk. This affects the placement
|
||||
of filesystem metadata, which will hopefully make RAID storage faster.
|
||||
* - 0x166
|
||||
- \_\_le16
|
||||
- s\_mmp\_interval
|
||||
- __le16
|
||||
- s_mmp_interval
|
||||
- #. seconds to wait in multi-mount prevention (MMP) checking. In theory,
|
||||
MMP is a mechanism to record in the superblock which host and device
|
||||
have mounted the filesystem, in order to prevent multiple mounts. This
|
||||
feature does not seem to be implemented...
|
||||
* - 0x168
|
||||
- \_\_le64
|
||||
- s\_mmp\_block
|
||||
- __le64
|
||||
- s_mmp_block
|
||||
- Block # for multi-mount protection data.
|
||||
* - 0x170
|
||||
- \_\_le32
|
||||
- s\_raid\_stripe\_width
|
||||
- __le32
|
||||
- s_raid_stripe_width
|
||||
- RAID stripe width. This is the number of logical blocks read from or
|
||||
written to the disk before coming back to the current disk. This is used
|
||||
by the block allocator to try to reduce the number of read-modify-write
|
||||
operations in a RAID5/6.
|
||||
* - 0x174
|
||||
- \_\_u8
|
||||
- s\_log\_groups\_per\_flex
|
||||
- __u8
|
||||
- s_log_groups_per_flex
|
||||
- Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``.
|
||||
* - 0x175
|
||||
- \_\_u8
|
||||
- s\_checksum\_type
|
||||
- __u8
|
||||
- s_checksum_type
|
||||
- Metadata checksum algorithm type. The only valid value is 1 (crc32c).
|
||||
* - 0x176
|
||||
- \_\_le16
|
||||
- s\_reserved\_pad
|
||||
- __le16
|
||||
- s_reserved_pad
|
||||
-
|
||||
* - 0x178
|
||||
- \_\_le64
|
||||
- s\_kbytes\_written
|
||||
- __le64
|
||||
- s_kbytes_written
|
||||
- Number of KiB written to this filesystem over its lifetime.
|
||||
* - 0x180
|
||||
- \_\_le32
|
||||
- s\_snapshot\_inum
|
||||
- __le32
|
||||
- s_snapshot_inum
|
||||
- inode number of active snapshot. (Not used in e2fsprogs/Linux.)
|
||||
* - 0x184
|
||||
- \_\_le32
|
||||
- s\_snapshot\_id
|
||||
- __le32
|
||||
- s_snapshot_id
|
||||
- Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.)
|
||||
* - 0x188
|
||||
- \_\_le64
|
||||
- s\_snapshot\_r\_blocks\_count
|
||||
- __le64
|
||||
- s_snapshot_r_blocks_count
|
||||
- Number of blocks reserved for active snapshot's future use. (Not used in
|
||||
e2fsprogs/Linux.)
|
||||
* - 0x190
|
||||
- \_\_le32
|
||||
- s\_snapshot\_list
|
||||
- __le32
|
||||
- s_snapshot_list
|
||||
- inode number of the head of the on-disk snapshot list. (Not used in
|
||||
e2fsprogs/Linux.)
|
||||
* - 0x194
|
||||
- \_\_le32
|
||||
- s\_error\_count
|
||||
- __le32
|
||||
- s_error_count
|
||||
- Number of errors seen.
|
||||
* - 0x198
|
||||
- \_\_le32
|
||||
- s\_first\_error\_time
|
||||
- __le32
|
||||
- s_first_error_time
|
||||
- First time an error happened, in seconds since the epoch.
|
||||
* - 0x19C
|
||||
- \_\_le32
|
||||
- s\_first\_error\_ino
|
||||
- __le32
|
||||
- s_first_error_ino
|
||||
- inode involved in first error.
|
||||
* - 0x1A0
|
||||
- \_\_le64
|
||||
- s\_first\_error\_block
|
||||
- __le64
|
||||
- s_first_error_block
|
||||
- Number of block involved in first error.
|
||||
* - 0x1A8
|
||||
- \_\_u8
|
||||
- s\_first\_error\_func[32]
|
||||
- __u8
|
||||
- s_first_error_func[32]
|
||||
- Name of function where the error happened.
|
||||
* - 0x1C8
|
||||
- \_\_le32
|
||||
- s\_first\_error\_line
|
||||
- __le32
|
||||
- s_first_error_line
|
||||
- Line number where error happened.
|
||||
* - 0x1CC
|
||||
- \_\_le32
|
||||
- s\_last\_error\_time
|
||||
- __le32
|
||||
- s_last_error_time
|
||||
- Time of most recent error, in seconds since the epoch.
|
||||
* - 0x1D0
|
||||
- \_\_le32
|
||||
- s\_last\_error\_ino
|
||||
- __le32
|
||||
- s_last_error_ino
|
||||
- inode involved in most recent error.
|
||||
* - 0x1D4
|
||||
- \_\_le32
|
||||
- s\_last\_error\_line
|
||||
- __le32
|
||||
- s_last_error_line
|
||||
- Line number where most recent error happened.
|
||||
* - 0x1D8
|
||||
- \_\_le64
|
||||
- s\_last\_error\_block
|
||||
- __le64
|
||||
- s_last_error_block
|
||||
- Number of block involved in most recent error.
|
||||
* - 0x1E0
|
||||
- \_\_u8
|
||||
- s\_last\_error\_func[32]
|
||||
- __u8
|
||||
- s_last_error_func[32]
|
||||
- Name of function where the most recent error happened.
|
||||
* - 0x200
|
||||
- \_\_u8
|
||||
- s\_mount\_opts[64]
|
||||
- __u8
|
||||
- s_mount_opts[64]
|
||||
- ASCIIZ string of mount options.
|
||||
* - 0x240
|
||||
- \_\_le32
|
||||
- s\_usr\_quota\_inum
|
||||
- __le32
|
||||
- s_usr_quota_inum
|
||||
- Inode number of user `quota <quota>`__ file.
|
||||
* - 0x244
|
||||
- \_\_le32
|
||||
- s\_grp\_quota\_inum
|
||||
- __le32
|
||||
- s_grp_quota_inum
|
||||
- Inode number of group `quota <quota>`__ file.
|
||||
* - 0x248
|
||||
- \_\_le32
|
||||
- s\_overhead\_blocks
|
||||
- __le32
|
||||
- s_overhead_blocks
|
||||
- Overhead blocks/clusters in fs. (Huh? This field is always zero, which
|
||||
means that the kernel calculates it dynamically.)
|
||||
* - 0x24C
|
||||
- \_\_le32
|
||||
- s\_backup\_bgs[2]
|
||||
- Block groups containing superblock backups (if sparse\_super2)
|
||||
- __le32
|
||||
- s_backup_bgs[2]
|
||||
- Block groups containing superblock backups (if sparse_super2)
|
||||
* - 0x254
|
||||
- \_\_u8
|
||||
- s\_encrypt\_algos[4]
|
||||
- __u8
|
||||
- s_encrypt_algos[4]
|
||||
- Encryption algorithms in use. There can be up to four algorithms in use
|
||||
at any time; valid algorithm codes are given in the super_encrypt_ table
|
||||
below.
|
||||
* - 0x258
|
||||
- \_\_u8
|
||||
- s\_encrypt\_pw\_salt[16]
|
||||
- __u8
|
||||
- s_encrypt_pw_salt[16]
|
||||
- Salt for the string2key algorithm for encryption.
|
||||
* - 0x268
|
||||
- \_\_le32
|
||||
- s\_lpf\_ino
|
||||
- __le32
|
||||
- s_lpf_ino
|
||||
- Inode number of lost+found
|
||||
* - 0x26C
|
||||
- \_\_le32
|
||||
- s\_prj\_quota\_inum
|
||||
- __le32
|
||||
- s_prj_quota_inum
|
||||
- Inode that tracks project quotas.
|
||||
* - 0x270
|
||||
- \_\_le32
|
||||
- s\_checksum\_seed
|
||||
- Checksum seed used for metadata\_csum calculations. This value is
|
||||
crc32c(~0, $orig\_fs\_uuid).
|
||||
- __le32
|
||||
- s_checksum_seed
|
||||
- Checksum seed used for metadata_csum calculations. This value is
|
||||
crc32c(~0, $orig_fs_uuid).
|
||||
* - 0x274
|
||||
- \_\_u8
|
||||
- s\_wtime_hi
|
||||
- __u8
|
||||
- s_wtime_hi
|
||||
- Upper 8 bits of the s_wtime field.
|
||||
* - 0x275
|
||||
- \_\_u8
|
||||
- s\_mtime_hi
|
||||
- __u8
|
||||
- s_mtime_hi
|
||||
- Upper 8 bits of the s_mtime field.
|
||||
* - 0x276
|
||||
- \_\_u8
|
||||
- s\_mkfs_time_hi
|
||||
- __u8
|
||||
- s_mkfs_time_hi
|
||||
- Upper 8 bits of the s_mkfs_time field.
|
||||
* - 0x277
|
||||
- \_\_u8
|
||||
- s\_lastcheck_hi
|
||||
- __u8
|
||||
- s_lastcheck_hi
|
||||
- Upper 8 bits of the s_lastcheck field.
|
||||
* - 0x278
|
||||
- \_\_u8
|
||||
- s\_first_error_time_hi
|
||||
- __u8
|
||||
- s_first_error_time_hi
|
||||
- Upper 8 bits of the s_first_error_time field.
|
||||
* - 0x279
|
||||
- \_\_u8
|
||||
- s\_last_error_time_hi
|
||||
- __u8
|
||||
- s_last_error_time_hi
|
||||
- Upper 8 bits of the s_last_error_time field.
|
||||
* - 0x27A
|
||||
- \_\_u8
|
||||
- s\_pad[2]
|
||||
- __u8
|
||||
- s_pad[2]
|
||||
- Zero padding.
|
||||
* - 0x27C
|
||||
- \_\_le16
|
||||
- s\_encoding
|
||||
- __le16
|
||||
- s_encoding
|
||||
- Filename charset encoding.
|
||||
* - 0x27E
|
||||
- \_\_le16
|
||||
- s\_encoding_flags
|
||||
- __le16
|
||||
- s_encoding_flags
|
||||
- Filename charset encoding flags.
|
||||
* - 0x280
|
||||
- \_\_le32
|
||||
- s\_orphan\_file\_inum
|
||||
- __le32
|
||||
- s_orphan_file_inum
|
||||
- Orphan file inode number.
|
||||
* - 0x284
|
||||
- \_\_le32
|
||||
- s\_reserved[94]
|
||||
- __le32
|
||||
- s_reserved[94]
|
||||
- Padding to the end of the block.
|
||||
* - 0x3FC
|
||||
- \_\_le32
|
||||
- s\_checksum
|
||||
- __le32
|
||||
- s_checksum
|
||||
- Superblock checksum.
|
||||
|
||||
.. _super_state:
|
||||
@ -574,44 +574,44 @@ following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Directory preallocation (COMPAT\_DIR\_PREALLOC).
|
||||
- Directory preallocation (COMPAT_DIR_PREALLOC).
|
||||
* - 0x2
|
||||
- “imagic inodes”. Not clear from the code what this does
|
||||
(COMPAT\_IMAGIC\_INODES).
|
||||
(COMPAT_IMAGIC_INODES).
|
||||
* - 0x4
|
||||
- Has a journal (COMPAT\_HAS\_JOURNAL).
|
||||
- Has a journal (COMPAT_HAS_JOURNAL).
|
||||
* - 0x8
|
||||
- Supports extended attributes (COMPAT\_EXT\_ATTR).
|
||||
- Supports extended attributes (COMPAT_EXT_ATTR).
|
||||
* - 0x10
|
||||
- Has reserved GDT blocks for filesystem expansion
|
||||
(COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER.
|
||||
(COMPAT_RESIZE_INODE). Requires RO_COMPAT_SPARSE_SUPER.
|
||||
* - 0x20
|
||||
- Has directory indices (COMPAT\_DIR\_INDEX).
|
||||
- Has directory indices (COMPAT_DIR_INDEX).
|
||||
* - 0x40
|
||||
- “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized
|
||||
block groups? (COMPAT\_LAZY\_BG)
|
||||
block groups? (COMPAT_LAZY_BG)
|
||||
* - 0x80
|
||||
- “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE).
|
||||
- “Exclude inode”. Not used. (COMPAT_EXCLUDE_INODE).
|
||||
* - 0x100
|
||||
- “Exclude bitmap”. Seems to be used to indicate the presence of
|
||||
snapshot-related exclude bitmaps? Not defined in kernel or used in
|
||||
e2fsprogs (COMPAT\_EXCLUDE\_BITMAP).
|
||||
e2fsprogs (COMPAT_EXCLUDE_BITMAP).
|
||||
* - 0x200
|
||||
- Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs
|
||||
- Sparse Super Block, v2. If this flag is set, the SB field s_backup_bgs
|
||||
points to the two block groups that contain backup superblocks
|
||||
(COMPAT\_SPARSE\_SUPER2).
|
||||
(COMPAT_SPARSE_SUPER2).
|
||||
* - 0x400
|
||||
- Fast commits supported. Although fast commits blocks are
|
||||
backward incompatible, fast commit blocks are not always
|
||||
present in the journal. If fast commit blocks are present in
|
||||
the journal, JBD2 incompat feature
|
||||
(JBD2\_FEATURE\_INCOMPAT\_FAST\_COMMIT) gets
|
||||
set (COMPAT\_FAST\_COMMIT).
|
||||
(JBD2_FEATURE_INCOMPAT_FAST_COMMIT) gets
|
||||
set (COMPAT_FAST_COMMIT).
|
||||
* - 0x1000
|
||||
- Orphan file allocated. This is the special file for more efficient
|
||||
tracking of unlinked but still open inodes. When there may be any
|
||||
entries in the file, we additionally set proper rocompat feature
|
||||
(RO\_COMPAT\_ORPHAN\_PRESENT).
|
||||
(RO_COMPAT_ORPHAN_PRESENT).
|
||||
|
||||
.. _super_incompat:
|
||||
|
||||
@ -625,45 +625,45 @@ following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Compression (INCOMPAT\_COMPRESSION).
|
||||
- Compression (INCOMPAT_COMPRESSION).
|
||||
* - 0x2
|
||||
- Directory entries record the file type. See ext4\_dir\_entry\_2 below
|
||||
(INCOMPAT\_FILETYPE).
|
||||
- Directory entries record the file type. See ext4_dir_entry_2 below
|
||||
(INCOMPAT_FILETYPE).
|
||||
* - 0x4
|
||||
- Filesystem needs recovery (INCOMPAT\_RECOVER).
|
||||
- Filesystem needs recovery (INCOMPAT_RECOVER).
|
||||
* - 0x8
|
||||
- Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV).
|
||||
- Filesystem has a separate journal device (INCOMPAT_JOURNAL_DEV).
|
||||
* - 0x10
|
||||
- Meta block groups. See the earlier discussion of this feature
|
||||
(INCOMPAT\_META\_BG).
|
||||
(INCOMPAT_META_BG).
|
||||
* - 0x40
|
||||
- Files in this filesystem use extents (INCOMPAT\_EXTENTS).
|
||||
- Files in this filesystem use extents (INCOMPAT_EXTENTS).
|
||||
* - 0x80
|
||||
- Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT).
|
||||
- Enable a filesystem size of 2^64 blocks (INCOMPAT_64BIT).
|
||||
* - 0x100
|
||||
- Multiple mount protection (INCOMPAT\_MMP).
|
||||
- Multiple mount protection (INCOMPAT_MMP).
|
||||
* - 0x200
|
||||
- Flexible block groups. See the earlier discussion of this feature
|
||||
(INCOMPAT\_FLEX\_BG).
|
||||
(INCOMPAT_FLEX_BG).
|
||||
* - 0x400
|
||||
- Inodes can be used to store large extended attribute values
|
||||
(INCOMPAT\_EA\_INODE).
|
||||
(INCOMPAT_EA_INODE).
|
||||
* - 0x1000
|
||||
- Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?)
|
||||
- Data in directory entry (INCOMPAT_DIRDATA). (Not implemented?)
|
||||
* - 0x2000
|
||||
- Metadata checksum seed is stored in the superblock. This feature enables
|
||||
the administrator to change the UUID of a metadata\_csum filesystem
|
||||
the administrator to change the UUID of a metadata_csum filesystem
|
||||
while the filesystem is mounted; without it, the checksum definition
|
||||
requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED).
|
||||
requires all metadata blocks to be rewritten (INCOMPAT_CSUM_SEED).
|
||||
* - 0x4000
|
||||
- Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to
|
||||
- Large directory >2GB or 3-level htree (INCOMPAT_LARGEDIR). Prior to
|
||||
this feature, directories could not be larger than 4GiB and could not
|
||||
have an htree more than 2 levels deep. If this feature is enabled,
|
||||
directories can be larger than 4GiB and have a maximum htree depth of 3.
|
||||
* - 0x8000
|
||||
- Data in inode (INCOMPAT\_INLINE\_DATA).
|
||||
- Data in inode (INCOMPAT_INLINE_DATA).
|
||||
* - 0x10000
|
||||
- Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT).
|
||||
- Encrypted inodes are present on the filesystem. (INCOMPAT_ENCRYPT).
|
||||
|
||||
.. _super_rocompat:
|
||||
|
||||
@ -678,54 +678,54 @@ the following:
|
||||
- Description
|
||||
* - 0x1
|
||||
- Sparse superblocks. See the earlier discussion of this feature
|
||||
(RO\_COMPAT\_SPARSE\_SUPER).
|
||||
(RO_COMPAT_SPARSE_SUPER).
|
||||
* - 0x2
|
||||
- This filesystem has been used to store a file greater than 2GiB
|
||||
(RO\_COMPAT\_LARGE\_FILE).
|
||||
(RO_COMPAT_LARGE_FILE).
|
||||
* - 0x4
|
||||
- Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR).
|
||||
- Not used in kernel or e2fsprogs (RO_COMPAT_BTREE_DIR).
|
||||
* - 0x8
|
||||
- This filesystem has files whose sizes are represented in units of
|
||||
logical blocks, not 512-byte sectors. This implies a very large file
|
||||
indeed! (RO\_COMPAT\_HUGE\_FILE)
|
||||
indeed! (RO_COMPAT_HUGE_FILE)
|
||||
* - 0x10
|
||||
- Group descriptors have checksums. In addition to detecting corruption,
|
||||
this is useful for lazy formatting with uninitialized groups
|
||||
(RO\_COMPAT\_GDT\_CSUM).
|
||||
(RO_COMPAT_GDT_CSUM).
|
||||
* - 0x20
|
||||
- Indicates that the old ext3 32,000 subdirectory limit no longer applies
|
||||
(RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1
|
||||
(RO_COMPAT_DIR_NLINK). A directory's i_links_count will be set to 1
|
||||
if it is incremented past 64,999.
|
||||
* - 0x40
|
||||
- Indicates that large inodes exist on this filesystem
|
||||
(RO\_COMPAT\_EXTRA\_ISIZE).
|
||||
(RO_COMPAT_EXTRA_ISIZE).
|
||||
* - 0x80
|
||||
- This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT).
|
||||
- This filesystem has a snapshot (RO_COMPAT_HAS_SNAPSHOT).
|
||||
* - 0x100
|
||||
- `Quota <Quota>`__ (RO\_COMPAT\_QUOTA).
|
||||
- `Quota <Quota>`__ (RO_COMPAT_QUOTA).
|
||||
* - 0x200
|
||||
- This filesystem supports “bigalloc”, which means that file extents are
|
||||
tracked in units of clusters (of blocks) instead of blocks
|
||||
(RO\_COMPAT\_BIGALLOC).
|
||||
(RO_COMPAT_BIGALLOC).
|
||||
* - 0x400
|
||||
- This filesystem supports metadata checksumming.
|
||||
(RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though
|
||||
GDT\_CSUM must not be set)
|
||||
(RO_COMPAT_METADATA_CSUM; implies RO_COMPAT_GDT_CSUM, though
|
||||
GDT_CSUM must not be set)
|
||||
* - 0x800
|
||||
- Filesystem supports replicas. This feature is neither in the kernel nor
|
||||
e2fsprogs. (RO\_COMPAT\_REPLICA)
|
||||
e2fsprogs. (RO_COMPAT_REPLICA)
|
||||
* - 0x1000
|
||||
- Read-only filesystem image; the kernel will not mount this image
|
||||
read-write and most tools will refuse to write to the image.
|
||||
(RO\_COMPAT\_READONLY)
|
||||
(RO_COMPAT_READONLY)
|
||||
* - 0x2000
|
||||
- Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT)
|
||||
- Filesystem tracks project quotas. (RO_COMPAT_PROJECT)
|
||||
* - 0x8000
|
||||
- Verity inodes may be present on the filesystem. (RO\_COMPAT\_VERITY)
|
||||
- Verity inodes may be present on the filesystem. (RO_COMPAT_VERITY)
|
||||
* - 0x10000
|
||||
- Indicates orphan file may have valid orphan entries and thus we need
|
||||
to clean them up when mounting the filesystem
|
||||
(RO\_COMPAT\_ORPHAN\_PRESENT).
|
||||
(RO_COMPAT_ORPHAN_PRESENT).
|
||||
|
||||
.. _super_def_hash:
|
||||
|
||||
@ -761,36 +761,36 @@ The ``s_default_mount_opts`` field is any combination of the following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0001
|
||||
- Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG)
|
||||
- Print debugging info upon (re)mount. (EXT4_DEFM_DEBUG)
|
||||
* - 0x0002
|
||||
- New files take the gid of the containing directory (instead of the fsgid
|
||||
of the current process). (EXT4\_DEFM\_BSDGROUPS)
|
||||
of the current process). (EXT4_DEFM_BSDGROUPS)
|
||||
* - 0x0004
|
||||
- Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER)
|
||||
- Support userspace-provided extended attributes. (EXT4_DEFM_XATTR_USER)
|
||||
* - 0x0008
|
||||
- Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL)
|
||||
- Support POSIX access control lists (ACLs). (EXT4_DEFM_ACL)
|
||||
* - 0x0010
|
||||
- Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16)
|
||||
- Do not support 32-bit UIDs. (EXT4_DEFM_UID16)
|
||||
* - 0x0020
|
||||
- All data and metadata are committed to the journal.
|
||||
(EXT4\_DEFM\_JMODE\_DATA)
|
||||
(EXT4_DEFM_JMODE_DATA)
|
||||
* - 0x0040
|
||||
- All data are flushed to the disk before metadata are committed to the
|
||||
journal. (EXT4\_DEFM\_JMODE\_ORDERED)
|
||||
journal. (EXT4_DEFM_JMODE_ORDERED)
|
||||
* - 0x0060
|
||||
- Data ordering is not preserved; data may be written after the metadata
|
||||
has been written. (EXT4\_DEFM\_JMODE\_WBACK)
|
||||
has been written. (EXT4_DEFM_JMODE_WBACK)
|
||||
* - 0x0100
|
||||
- Disable write flushes. (EXT4\_DEFM\_NOBARRIER)
|
||||
- Disable write flushes. (EXT4_DEFM_NOBARRIER)
|
||||
* - 0x0200
|
||||
- Track which blocks in a filesystem are metadata and therefore should not
|
||||
be used as data blocks. This option will be enabled by default on 3.18,
|
||||
hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY)
|
||||
hopefully. (EXT4_DEFM_BLOCK_VALIDITY)
|
||||
* - 0x0400
|
||||
- Enable DISCARD support, where the storage device is told about blocks
|
||||
becoming unused. (EXT4\_DEFM\_DISCARD)
|
||||
becoming unused. (EXT4_DEFM_DISCARD)
|
||||
* - 0x0800
|
||||
- Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC)
|
||||
- Disable delayed allocation. (EXT4_DEFM_NODELALLOC)
|
||||
|
||||
.. _super_flags:
|
||||
|
||||
@ -820,12 +820,12 @@ The ``s_encrypt_algos`` list can contain any of the following:
|
||||
* - Value
|
||||
- Description
|
||||
* - 0
|
||||
- Invalid algorithm (ENCRYPTION\_MODE\_INVALID).
|
||||
- Invalid algorithm (ENCRYPTION_MODE_INVALID).
|
||||
* - 1
|
||||
- 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS).
|
||||
- 256-bit AES in XTS mode (ENCRYPTION_MODE_AES_256_XTS).
|
||||
* - 2
|
||||
- 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM).
|
||||
- 256-bit AES in GCM mode (ENCRYPTION_MODE_AES_256_GCM).
|
||||
* - 3
|
||||
- 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC).
|
||||
- 256-bit AES in CBC mode (ENCRYPTION_MODE_AES_256_CBC).
|
||||
|
||||
Total size of the superblock is 1024 bytes.
|
||||
|
@ -45,10 +45,12 @@ Name Alias Usage Preserved
|
||||
``$r23``-``$r31`` ``$s0``-``$s8`` Static registers Yes
|
||||
================= =============== =================== ============
|
||||
|
||||
Note: The register ``$r21`` is reserved in the ELF psABI, but used by the Linux
|
||||
kernel for storing the percpu base address. It normally has no ABI name, but is
|
||||
called ``$u0`` in the kernel. You may also see ``$v0`` or ``$v1`` in some old code,
|
||||
however they are deprecated aliases of ``$a0`` and ``$a1`` respectively.
|
||||
.. Note::
|
||||
The register ``$r21`` is reserved in the ELF psABI, but used by the Linux
|
||||
kernel for storing the percpu base address. It normally has no ABI name,
|
||||
but is called ``$u0`` in the kernel. You may also see ``$v0`` or ``$v1``
|
||||
in some old code, however they are deprecated aliases of ``$a0`` and ``$a1``
|
||||
respectively.
|
||||
|
||||
FPRs
|
||||
----
|
||||
@ -69,8 +71,9 @@ Name Alias Usage Preserved
|
||||
``$f24``-``$f31`` ``$fs0``-``$fs7`` Static registers Yes
|
||||
================= ================== =================== ============
|
||||
|
||||
Note: You may see ``$fv0`` or ``$fv1`` in some old code, however they are deprecated
|
||||
aliases of ``$fa0`` and ``$fa1`` respectively.
|
||||
.. Note::
|
||||
You may see ``$fv0`` or ``$fv1`` in some old code, however they are
|
||||
deprecated aliases of ``$fa0`` and ``$fa1`` respectively.
|
||||
|
||||
VRs
|
||||
----
|
||||
|
@ -145,12 +145,16 @@ Documentation of Loongson's LS7A chipset:
|
||||
|
||||
https://github.com/loongson/LoongArch-Documentation/releases/latest/download/Loongson-7A1000-usermanual-2.00-EN.pdf (in English)
|
||||
|
||||
Note: CPUINTC is CSR.ECFG/CSR.ESTAT and its interrupt controller described
|
||||
in Section 7.4 of "LoongArch Reference Manual, Vol 1"; LIOINTC is "Legacy I/O
|
||||
Interrupts" described in Section 11.1 of "Loongson 3A5000 Processor Reference
|
||||
Manual"; EIOINTC is "Extended I/O Interrupts" described in Section 11.2 of
|
||||
"Loongson 3A5000 Processor Reference Manual"; HTVECINTC is "HyperTransport
|
||||
Interrupts" described in Section 14.3 of "Loongson 3A5000 Processor Reference
|
||||
Manual"; PCH-PIC/PCH-MSI is "Interrupt Controller" described in Section 5 of
|
||||
"Loongson 7A1000 Bridge User Manual"; PCH-LPC is "LPC Interrupts" described in
|
||||
Section 24.3 of "Loongson 7A1000 Bridge User Manual".
|
||||
.. Note::
|
||||
- CPUINTC is CSR.ECFG/CSR.ESTAT and its interrupt controller described
|
||||
in Section 7.4 of "LoongArch Reference Manual, Vol 1";
|
||||
- LIOINTC is "Legacy I/O Interrupts" described in Section 11.1 of
|
||||
"Loongson 3A5000 Processor Reference Manual";
|
||||
- EIOINTC is "Extended I/O Interrupts" described in Section 11.2 of
|
||||
"Loongson 3A5000 Processor Reference Manual";
|
||||
- HTVECINTC is "HyperTransport Interrupts" described in Section 14.3 of
|
||||
"Loongson 3A5000 Processor Reference Manual";
|
||||
- PCH-PIC/PCH-MSI is "Interrupt Controller" described in Section 5 of
|
||||
"Loongson 7A1000 Bridge User Manual";
|
||||
- PCH-LPC is "LPC Interrupts" described in Section 24.3 of
|
||||
"Loongson 7A1000 Bridge User Manual".
|
||||
|
@ -2925,6 +2925,43 @@ plpmtud_probe_interval - INTEGER
|
||||
|
||||
Default: 0
|
||||
|
||||
reconf_enable - BOOLEAN
|
||||
Enable or disable extension of Stream Reconfiguration functionality
|
||||
specified in RFC6525. This extension provides the ability to "reset"
|
||||
a stream, and it includes the Parameters of "Outgoing/Incoming SSN
|
||||
Reset", "SSN/TSN Reset" and "Add Outgoing/Incoming Streams".
|
||||
|
||||
- 1: Enable extension.
|
||||
- 0: Disable extension.
|
||||
|
||||
Default: 0
|
||||
|
||||
intl_enable - BOOLEAN
|
||||
Enable or disable extension of User Message Interleaving functionality
|
||||
specified in RFC8260. This extension allows the interleaving of user
|
||||
messages sent on different streams. With this feature enabled, I-DATA
|
||||
chunk will replace DATA chunk to carry user messages if also supported
|
||||
by the peer. Note that to use this feature, one needs to set this option
|
||||
to 1 and also needs to set socket options SCTP_FRAGMENT_INTERLEAVE to 2
|
||||
and SCTP_INTERLEAVING_SUPPORTED to 1.
|
||||
|
||||
- 1: Enable extension.
|
||||
- 0: Disable extension.
|
||||
|
||||
Default: 0
|
||||
|
||||
ecn_enable - BOOLEAN
|
||||
Control use of Explicit Congestion Notification (ECN) by SCTP.
|
||||
Like in TCP, ECN is used only when both ends of the SCTP connection
|
||||
indicate support for it. This feature is useful in avoiding losses
|
||||
due to congestion by allowing supporting routers to signal congestion
|
||||
before having to drop packets.
|
||||
|
||||
1: Enable ecn.
|
||||
0: Disable ecn.
|
||||
|
||||
Default: 1
|
||||
|
||||
|
||||
``/proc/sys/net/core/*``
|
||||
========================
|
||||
|
@ -104,7 +104,7 @@ Whenever possible, use the PHY side RGMII delay for these reasons:
|
||||
|
||||
* PHY device drivers in PHYLIB being reusable by nature, being able to
|
||||
configure correctly a specified delay enables more designs with similar delay
|
||||
requirements to be operate correctly
|
||||
requirements to be operated correctly
|
||||
|
||||
For cases where the PHY is not capable of providing this delay, but the
|
||||
Ethernet MAC driver is capable of doing so, the correct phy_interface_t value
|
||||
|
@ -46,10 +46,11 @@ LA64中每个寄存器为64位宽。 ``$r0`` 的内容总是固定为0,而其
|
||||
``$r23``-``$r31`` ``$s0``-``$s8`` 静态寄存器 是
|
||||
================= =============== =================== ==========
|
||||
|
||||
注意:``$r21``寄存器在ELF psABI中保留未使用,但是在Linux内核用于保存每CPU
|
||||
变量基地址。该寄存器没有ABI命名,不过在内核中称为``$u0``。在一些遗留代码
|
||||
中有时可能见到``$v0``和``$v1``,它们是``$a0``和``$a1``的别名,属于已经废弃
|
||||
的用法。
|
||||
.. note::
|
||||
注意: ``$r21`` 寄存器在ELF psABI中保留未使用,但是在Linux内核用于保
|
||||
存每CPU变量基地址。该寄存器没有ABI命名,不过在内核中称为 ``$u0`` 。在
|
||||
一些遗留代码中有时可能见到 ``$v0`` 和 ``$v1`` ,它们是 ``$a0`` 和
|
||||
``$a1`` 的别名,属于已经废弃的用法。
|
||||
|
||||
浮点寄存器
|
||||
----------
|
||||
@ -68,8 +69,9 @@ LA64中每个寄存器为64位宽。 ``$r0`` 的内容总是固定为0,而其
|
||||
``$f24``-``$f31`` ``$fs0``-``$fs7`` 静态寄存器 是
|
||||
================= ================== =================== ==========
|
||||
|
||||
注意:在一些遗留代码中有时可能见到 ``$v0`` 和 ``$v1`` ,它们是 ``$a0``
|
||||
和 ``$a1`` 的别名,属于已经废弃的用法。
|
||||
.. note::
|
||||
注意:在一些遗留代码中有时可能见到 ``$fv0`` 和 ``$fv1`` ,它们是
|
||||
``$fa0`` 和 ``$fa1`` 的别名,属于已经废弃的用法。
|
||||
|
||||
|
||||
向量寄存器
|
||||
|
@ -147,9 +147,11 @@ PCH-LPC::
|
||||
|
||||
https://github.com/loongson/LoongArch-Documentation/releases/latest/download/Loongson-7A1000-usermanual-2.00-EN.pdf (英文版)
|
||||
|
||||
注:CPUINTC即《龙芯架构参考手册卷一》第7.4节所描述的CSR.ECFG/CSR.ESTAT寄存器及其中断
|
||||
控制逻辑;LIOINTC即《龙芯3A5000处理器使用手册》第11.1节所描述的“传统I/O中断”;EIOINTC
|
||||
即《龙芯3A5000处理器使用手册》第11.2节所描述的“扩展I/O中断”;HTVECINTC即《龙芯3A5000
|
||||
处理器使用手册》第14.3节所描述的“HyperTransport中断”;PCH-PIC/PCH-MSI即《龙芯7A1000桥
|
||||
片用户手册》第5章所描述的“中断控制器”;PCH-LPC即《龙芯7A1000桥片用户手册》第24.3节所
|
||||
描述的“LPC中断”。
|
||||
.. note::
|
||||
- CPUINTC:即《龙芯架构参考手册卷一》第7.4节所描述的CSR.ECFG/CSR.ESTAT寄存器及其
|
||||
中断控制逻辑;
|
||||
- LIOINTC:即《龙芯3A5000处理器使用手册》第11.1节所描述的“传统I/O中断”;
|
||||
- EIOINTC:即《龙芯3A5000处理器使用手册》第11.2节所描述的“扩展I/O中断”;
|
||||
- HTVECINTC:即《龙芯3A5000处理器使用手册》第14.3节所描述的“HyperTransport中断”;
|
||||
- PCH-PIC/PCH-MSI:即《龙芯7A1000桥片用户手册》第5章所描述的“中断控制器”;
|
||||
- PCH-LPC:即《龙芯7A1000桥片用户手册》第24.3节所描述的“LPC中断”。
|
||||
|
@ -9276,6 +9276,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git
|
||||
F: Documentation/devicetree/bindings/i2c/i2c.txt
|
||||
F: Documentation/i2c/
|
||||
F: drivers/i2c/*
|
||||
F: include/dt-bindings/i2c/i2c.h
|
||||
F: include/linux/i2c-dev.h
|
||||
F: include/linux/i2c-smbus.h
|
||||
F: include/linux/i2c.h
|
||||
@ -9291,6 +9292,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git
|
||||
F: Documentation/devicetree/bindings/i2c/
|
||||
F: drivers/i2c/algos/
|
||||
F: drivers/i2c/busses/
|
||||
F: include/dt-bindings/i2c/
|
||||
|
||||
I2C-TAOS-EVM DRIVER
|
||||
M: Jean Delvare <jdelvare@suse.com>
|
||||
@ -10872,7 +10874,6 @@ F: arch/riscv/include/asm/kvm*
|
||||
F: arch/riscv/include/uapi/asm/kvm*
|
||||
F: arch/riscv/kvm/
|
||||
F: tools/testing/selftests/kvm/*/riscv/
|
||||
F: tools/testing/selftests/kvm/riscv/
|
||||
|
||||
KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
|
||||
M: Christian Borntraeger <borntraeger@linux.ibm.com>
|
||||
@ -13801,6 +13802,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
|
||||
F: Documentation/devicetree/bindings/net/
|
||||
F: drivers/connector/
|
||||
F: drivers/net/
|
||||
F: include/dt-bindings/net/
|
||||
F: include/linux/etherdevice.h
|
||||
F: include/linux/fcdevice.h
|
||||
F: include/linux/fddidevice.h
|
||||
@ -19305,7 +19307,7 @@ R: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
|
||||
R: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||
R: Jan Dabros <jsd@semihalf.com>
|
||||
L: linux-i2c@vger.kernel.org
|
||||
S: Maintained
|
||||
S: Supported
|
||||
F: drivers/i2c/busses/i2c-designware-*
|
||||
|
||||
SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER
|
||||
|
2
Makefile
2
Makefile
@ -2,7 +2,7 @@
|
||||
VERSION = 5
|
||||
PATCHLEVEL = 19
|
||||
SUBLEVEL = 0
|
||||
EXTRAVERSION = -rc2
|
||||
EXTRAVERSION = -rc3
|
||||
NAME = Superb Owl
|
||||
|
||||
# *DOCUMENTATION*
|
||||
|
@ -60,7 +60,7 @@ int irq_select_affinity(unsigned int irq)
|
||||
cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
|
||||
last_cpu = cpu;
|
||||
|
||||
cpumask_copy(irq_data_get_affinity_mask(data), cpumask_of(cpu));
|
||||
irq_data_update_affinity(data, cpumask_of(cpu));
|
||||
chip->irq_set_affinity(data, cpumask_of(cpu), false);
|
||||
return 0;
|
||||
}
|
||||
|
@ -120,26 +120,31 @@
|
||||
port@0 {
|
||||
reg = <0>;
|
||||
label = "lan1";
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
port@1 {
|
||||
reg = <1>;
|
||||
label = "lan2";
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
port@2 {
|
||||
reg = <2>;
|
||||
label = "lan3";
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
port@3 {
|
||||
reg = <3>;
|
||||
label = "lan4";
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
port@4 {
|
||||
reg = <4>;
|
||||
label = "lan5";
|
||||
phy-mode = "internal";
|
||||
};
|
||||
|
||||
port@5 {
|
||||
|
@ -40,7 +40,7 @@ config ARCH_HIP04
|
||||
select HAVE_ARM_ARCH_TIMER
|
||||
select MCPM if SMP
|
||||
select MCPM_QUAD_CLUSTER if SMP
|
||||
select GENERIC_IRQ_EFFECTIVE_AFF_MASK
|
||||
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
|
||||
help
|
||||
Support for Hisilicon HiP04 SoC family
|
||||
|
||||
|
@ -5,13 +5,13 @@
|
||||
#include <linux/platform_data/pxa_sdhci.h>
|
||||
|
||||
extern void mmp2_timer_init(void);
|
||||
extern void __init mmp2_init_icu(void);
|
||||
extern void __init mmp2_init_irq(void);
|
||||
extern void mmp2_clear_pmic_int(void);
|
||||
|
||||
#include <linux/i2c.h>
|
||||
#include <linux/platform_data/i2c-pxa.h>
|
||||
#include <linux/platform_data/dma-mmp_tdma.h>
|
||||
#include <linux/irqchip/mmp.h>
|
||||
|
||||
#include "devices.h"
|
||||
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include <linux/reboot.h>
|
||||
|
||||
extern void pxa168_timer_init(void);
|
||||
extern void __init icu_init_irq(void);
|
||||
extern void __init pxa168_init_irq(void);
|
||||
extern void pxa168_restart(enum reboot_mode, const char *);
|
||||
extern void pxa168_clear_keypad_wakeup(void);
|
||||
@ -18,6 +17,7 @@ extern void pxa168_clear_keypad_wakeup(void);
|
||||
#include <linux/pxa168_eth.h>
|
||||
#include <linux/platform_data/mv_usb.h>
|
||||
#include <linux/soc/mmp/cputype.h>
|
||||
#include <linux/irqchip/mmp.h>
|
||||
|
||||
#include "devices.h"
|
||||
|
||||
|
@ -3,13 +3,13 @@
|
||||
#define __ASM_MACH_PXA910_H
|
||||
|
||||
extern void pxa910_timer_init(void);
|
||||
extern void __init icu_init_irq(void);
|
||||
extern void __init pxa910_init_irq(void);
|
||||
|
||||
#include <linux/i2c.h>
|
||||
#include <linux/platform_data/i2c-pxa.h>
|
||||
#include <linux/platform_data/mtd-nand-pxa3xx.h>
|
||||
#include <video/mmp_disp.h>
|
||||
#include <linux/irqchip/mmp.h>
|
||||
|
||||
#include "devices.h"
|
||||
|
||||
|
@ -362,11 +362,6 @@ struct kvm_vcpu_arch {
|
||||
struct arch_timer_cpu timer_cpu;
|
||||
struct kvm_pmu pmu;
|
||||
|
||||
/*
|
||||
* Anything that is not used directly from assembly code goes
|
||||
* here.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Guest registers we preserve during guest debugging.
|
||||
*
|
||||
|
@ -113,6 +113,9 @@ static __always_inline bool has_vhe(void)
|
||||
/*
|
||||
* Code only run in VHE/NVHE hyp context can assume VHE is present or
|
||||
* absent. Otherwise fall back to caps.
|
||||
* This allows the compiler to discard VHE-specific code from the
|
||||
* nVHE object, reducing the number of external symbol references
|
||||
* needed to link.
|
||||
*/
|
||||
if (is_vhe_hyp_code())
|
||||
return true;
|
||||
|
@ -1974,15 +1974,7 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
|
||||
#ifdef CONFIG_KVM
|
||||
static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
|
||||
{
|
||||
if (kvm_get_mode() != KVM_MODE_PROTECTED)
|
||||
return false;
|
||||
|
||||
if (is_kernel_in_hyp_mode()) {
|
||||
pr_warn("Protected KVM not available with VHE\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return kvm_get_mode() == KVM_MODE_PROTECTED;
|
||||
}
|
||||
#endif /* CONFIG_KVM */
|
||||
|
||||
@ -3109,7 +3101,6 @@ void cpu_set_feature(unsigned int num)
|
||||
WARN_ON(num >= MAX_CPU_FEATURES);
|
||||
elf_hwcap |= BIT(num);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cpu_set_feature);
|
||||
|
||||
bool cpu_have_feature(unsigned int num)
|
||||
{
|
||||
|
@ -102,7 +102,6 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
|
||||
* x19-x29 per the AAPCS, and we created frame records upon entry, so we need
|
||||
* to restore x0-x8, x29, and x30.
|
||||
*/
|
||||
ftrace_common_return:
|
||||
/* Restore function arguments */
|
||||
ldp x0, x1, [sp]
|
||||
ldp x2, x3, [sp, #S_X2]
|
||||
|
@ -77,6 +77,66 @@ static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the address the callsite must branch to in order to reach '*addr'.
|
||||
*
|
||||
* Due to the limited range of 'BL' instructions, modules may be placed too far
|
||||
* away to branch directly and must use a PLT.
|
||||
*
|
||||
* Returns true when '*addr' contains a reachable target address, or has been
|
||||
* modified to contain a PLT address. Returns false otherwise.
|
||||
*/
|
||||
static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
|
||||
struct module *mod,
|
||||
unsigned long *addr)
|
||||
{
|
||||
unsigned long pc = rec->ip;
|
||||
long offset = (long)*addr - (long)pc;
|
||||
struct plt_entry *plt;
|
||||
|
||||
/*
|
||||
* When the target is within range of the 'BL' instruction, use 'addr'
|
||||
* as-is and branch to that directly.
|
||||
*/
|
||||
if (offset >= -SZ_128M && offset < SZ_128M)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* When the target is outside of the range of a 'BL' instruction, we
|
||||
* must use a PLT to reach it. We can only place PLTs for modules, and
|
||||
* only when module PLT support is built-in.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* 'mod' is only set at module load time, but if we end up
|
||||
* dealing with an out-of-range condition, we can assume it
|
||||
* is due to a module being loaded far away from the kernel.
|
||||
*
|
||||
* NOTE: __module_text_address() must be called with preemption
|
||||
* disabled, but we can rely on ftrace_lock to ensure that 'mod'
|
||||
* retains its validity throughout the remainder of this code.
|
||||
*/
|
||||
if (!mod) {
|
||||
preempt_disable();
|
||||
mod = __module_text_address(pc);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
if (WARN_ON(!mod))
|
||||
return false;
|
||||
|
||||
plt = get_ftrace_plt(mod, *addr);
|
||||
if (!plt) {
|
||||
pr_err("ftrace: no module PLT for %ps\n", (void *)*addr);
|
||||
return false;
|
||||
}
|
||||
|
||||
*addr = (unsigned long)plt;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Turn on the call to ftrace_caller() in instrumented function
|
||||
*/
|
||||
@ -84,40 +144,9 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
|
||||
{
|
||||
unsigned long pc = rec->ip;
|
||||
u32 old, new;
|
||||
long offset = (long)pc - (long)addr;
|
||||
|
||||
if (offset < -SZ_128M || offset >= SZ_128M) {
|
||||
struct module *mod;
|
||||
struct plt_entry *plt;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* On kernels that support module PLTs, the offset between the
|
||||
* branch instruction and its target may legally exceed the
|
||||
* range of an ordinary relative 'bl' opcode. In this case, we
|
||||
* need to branch via a trampoline in the module.
|
||||
*
|
||||
* NOTE: __module_text_address() must be called with preemption
|
||||
* disabled, but we can rely on ftrace_lock to ensure that 'mod'
|
||||
* retains its validity throughout the remainder of this code.
|
||||
*/
|
||||
preempt_disable();
|
||||
mod = __module_text_address(pc);
|
||||
preempt_enable();
|
||||
|
||||
if (WARN_ON(!mod))
|
||||
return -EINVAL;
|
||||
|
||||
plt = get_ftrace_plt(mod, addr);
|
||||
if (!plt) {
|
||||
pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
addr = (unsigned long)plt;
|
||||
}
|
||||
if (!ftrace_find_callable_addr(rec, NULL, &addr))
|
||||
return -EINVAL;
|
||||
|
||||
old = aarch64_insn_gen_nop();
|
||||
new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
|
||||
@ -132,6 +161,11 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
|
||||
unsigned long pc = rec->ip;
|
||||
u32 old, new;
|
||||
|
||||
if (!ftrace_find_callable_addr(rec, NULL, &old_addr))
|
||||
return -EINVAL;
|
||||
if (!ftrace_find_callable_addr(rec, NULL, &addr))
|
||||
return -EINVAL;
|
||||
|
||||
old = aarch64_insn_gen_branch_imm(pc, old_addr,
|
||||
AARCH64_INSN_BRANCH_LINK);
|
||||
new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
|
||||
@ -181,54 +215,15 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
|
||||
unsigned long addr)
|
||||
{
|
||||
unsigned long pc = rec->ip;
|
||||
bool validate = true;
|
||||
u32 old = 0, new;
|
||||
long offset = (long)pc - (long)addr;
|
||||
|
||||
if (offset < -SZ_128M || offset >= SZ_128M) {
|
||||
u32 replaced;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* 'mod' is only set at module load time, but if we end up
|
||||
* dealing with an out-of-range condition, we can assume it
|
||||
* is due to a module being loaded far away from the kernel.
|
||||
*/
|
||||
if (!mod) {
|
||||
preempt_disable();
|
||||
mod = __module_text_address(pc);
|
||||
preempt_enable();
|
||||
|
||||
if (WARN_ON(!mod))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The instruction we are about to patch may be a branch and
|
||||
* link instruction that was redirected via a PLT entry. In
|
||||
* this case, the normal validation will fail, but we can at
|
||||
* least check that we are dealing with a branch and link
|
||||
* instruction that points into the right module.
|
||||
*/
|
||||
if (aarch64_insn_read((void *)pc, &replaced))
|
||||
return -EFAULT;
|
||||
|
||||
if (!aarch64_insn_is_bl(replaced) ||
|
||||
!within_module(pc + aarch64_get_branch_offset(replaced),
|
||||
mod))
|
||||
return -EINVAL;
|
||||
|
||||
validate = false;
|
||||
} else {
|
||||
old = aarch64_insn_gen_branch_imm(pc, addr,
|
||||
AARCH64_INSN_BRANCH_LINK);
|
||||
}
|
||||
if (!ftrace_find_callable_addr(rec, mod, &addr))
|
||||
return -EINVAL;
|
||||
|
||||
old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
|
||||
new = aarch64_insn_gen_nop();
|
||||
|
||||
return ftrace_modify_code(pc, old, new, validate);
|
||||
return ftrace_modify_code(pc, old, new, true);
|
||||
}
|
||||
|
||||
void arch_ftrace_update_code(int command)
|
||||
|
@ -303,14 +303,13 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
|
||||
early_fixmap_init();
|
||||
early_ioremap_init();
|
||||
|
||||
/*
|
||||
* Initialise the static keys early as they may be enabled by the
|
||||
* cpufeature code, early parameters, and DT setup.
|
||||
*/
|
||||
jump_label_init();
|
||||
|
||||
setup_machine_fdt(__fdt_pointer);
|
||||
|
||||
/*
|
||||
* Initialise the static keys early as they may be enabled by the
|
||||
* cpufeature code and early parameters.
|
||||
*/
|
||||
jump_label_init();
|
||||
parse_early_param();
|
||||
|
||||
/*
|
||||
|
@ -1230,6 +1230,9 @@ bool kvm_arch_timer_get_input_level(int vintid)
|
||||
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
|
||||
struct arch_timer_context *timer;
|
||||
|
||||
if (WARN(!vcpu, "No vcpu context!\n"))
|
||||
return false;
|
||||
|
||||
if (vintid == vcpu_vtimer(vcpu)->irq.irq)
|
||||
timer = vcpu_vtimer(vcpu);
|
||||
else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
|
||||
|
@ -150,8 +150,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
|
||||
if (ret)
|
||||
goto out_free_stage2_pgd;
|
||||
|
||||
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL))
|
||||
if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_stage2_pgd;
|
||||
}
|
||||
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
|
||||
|
||||
kvm_vgic_early_init(kvm);
|
||||
@ -2271,7 +2273,11 @@ static int __init early_kvm_mode_cfg(char *arg)
|
||||
return -EINVAL;
|
||||
|
||||
if (strcmp(arg, "protected") == 0) {
|
||||
kvm_mode = KVM_MODE_PROTECTED;
|
||||
if (!is_kernel_in_hyp_mode())
|
||||
kvm_mode = KVM_MODE_PROTECTED;
|
||||
else
|
||||
pr_warn_once("Protected KVM not available with VHE\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -80,6 +80,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
|
||||
vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED;
|
||||
vcpu->arch.flags |= KVM_ARM64_FP_HOST;
|
||||
|
||||
vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED;
|
||||
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
|
||||
vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED;
|
||||
|
||||
@ -93,6 +94,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
|
||||
* operations. Do this for ZA as well for now for simplicity.
|
||||
*/
|
||||
if (system_supports_sme()) {
|
||||
vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED;
|
||||
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
|
||||
vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED;
|
||||
|
||||
|
@ -314,15 +314,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
|
||||
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
|
||||
enum kvm_pgtable_prot prot)
|
||||
{
|
||||
hyp_assert_lock_held(&host_kvm.lock);
|
||||
|
||||
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
|
||||
}
|
||||
|
||||
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
|
||||
{
|
||||
hyp_assert_lock_held(&host_kvm.lock);
|
||||
|
||||
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
|
||||
addr, size, &host_s2_pool, owner_id);
|
||||
}
|
||||
|
@ -243,15 +243,9 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
|
||||
case SYS_ID_AA64MMFR2_EL1:
|
||||
return get_pvm_id_aa64mmfr2(vcpu);
|
||||
default:
|
||||
/*
|
||||
* Should never happen because all cases are covered in
|
||||
* pvm_sys_reg_descs[].
|
||||
*/
|
||||
WARN_ON(1);
|
||||
break;
|
||||
/* Unhandled ID register, RAZ */
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
|
||||
@ -332,6 +326,16 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu,
|
||||
/* Mark the specified system register as an AArch64 feature id register. */
|
||||
#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 }
|
||||
|
||||
/*
|
||||
* sys_reg_desc initialiser for architecturally unallocated cpufeature ID
|
||||
* register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
|
||||
* (1 <= crm < 8, 0 <= Op2 < 8).
|
||||
*/
|
||||
#define ID_UNALLOCATED(crm, op2) { \
|
||||
Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
|
||||
.access = pvm_access_id_aarch64, \
|
||||
}
|
||||
|
||||
/* Mark the specified system register as Read-As-Zero/Write-Ignored */
|
||||
#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi }
|
||||
|
||||
@ -375,24 +379,46 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
|
||||
AARCH32(SYS_MVFR0_EL1),
|
||||
AARCH32(SYS_MVFR1_EL1),
|
||||
AARCH32(SYS_MVFR2_EL1),
|
||||
ID_UNALLOCATED(3,3),
|
||||
AARCH32(SYS_ID_PFR2_EL1),
|
||||
AARCH32(SYS_ID_DFR1_EL1),
|
||||
AARCH32(SYS_ID_MMFR5_EL1),
|
||||
ID_UNALLOCATED(3,7),
|
||||
|
||||
/* AArch64 ID registers */
|
||||
/* CRm=4 */
|
||||
AARCH64(SYS_ID_AA64PFR0_EL1),
|
||||
AARCH64(SYS_ID_AA64PFR1_EL1),
|
||||
ID_UNALLOCATED(4,2),
|
||||
ID_UNALLOCATED(4,3),
|
||||
AARCH64(SYS_ID_AA64ZFR0_EL1),
|
||||
ID_UNALLOCATED(4,5),
|
||||
ID_UNALLOCATED(4,6),
|
||||
ID_UNALLOCATED(4,7),
|
||||
AARCH64(SYS_ID_AA64DFR0_EL1),
|
||||
AARCH64(SYS_ID_AA64DFR1_EL1),
|
||||
ID_UNALLOCATED(5,2),
|
||||
ID_UNALLOCATED(5,3),
|
||||
AARCH64(SYS_ID_AA64AFR0_EL1),
|
||||
AARCH64(SYS_ID_AA64AFR1_EL1),
|
||||
ID_UNALLOCATED(5,6),
|
||||
ID_UNALLOCATED(5,7),
|
||||
AARCH64(SYS_ID_AA64ISAR0_EL1),
|
||||
AARCH64(SYS_ID_AA64ISAR1_EL1),
|
||||
AARCH64(SYS_ID_AA64ISAR2_EL1),
|
||||
ID_UNALLOCATED(6,3),
|
||||
ID_UNALLOCATED(6,4),
|
||||
ID_UNALLOCATED(6,5),
|
||||
ID_UNALLOCATED(6,6),
|
||||
ID_UNALLOCATED(6,7),
|
||||
AARCH64(SYS_ID_AA64MMFR0_EL1),
|
||||
AARCH64(SYS_ID_AA64MMFR1_EL1),
|
||||
AARCH64(SYS_ID_AA64MMFR2_EL1),
|
||||
ID_UNALLOCATED(7,3),
|
||||
ID_UNALLOCATED(7,4),
|
||||
ID_UNALLOCATED(7,5),
|
||||
ID_UNALLOCATED(7,6),
|
||||
ID_UNALLOCATED(7,7),
|
||||
|
||||
/* Scalable Vector Registers are restricted. */
|
||||
|
||||
|
@ -429,11 +429,11 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_spending,
|
||||
NULL, vgic_uaccess_write_spending, 1,
|
||||
vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1,
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_cpending,
|
||||
NULL, vgic_uaccess_write_cpending, 1,
|
||||
vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1,
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
|
||||
vgic_mmio_read_active, vgic_mmio_write_sactive,
|
||||
|
@ -353,42 +353,6 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len)
|
||||
{
|
||||
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
|
||||
u32 value = 0;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* pending state of interrupt is latched in pending_latch variable.
|
||||
* Userspace will save and restore pending state and line_level
|
||||
* separately.
|
||||
* Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
|
||||
* for handling of ISPENDR and ICPENDR.
|
||||
*/
|
||||
for (i = 0; i < len * 8; i++) {
|
||||
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
|
||||
bool state = irq->pending_latch;
|
||||
|
||||
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
|
||||
int err;
|
||||
|
||||
err = irq_get_irqchip_state(irq->host_irq,
|
||||
IRQCHIP_STATE_PENDING,
|
||||
&state);
|
||||
WARN_ON(err);
|
||||
}
|
||||
|
||||
if (state)
|
||||
value |= (1U << i);
|
||||
|
||||
vgic_put_irq(vcpu->kvm, irq);
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len,
|
||||
unsigned long val)
|
||||
@ -666,7 +630,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_spending,
|
||||
vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
|
||||
vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_cpending,
|
||||
@ -750,7 +714,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_spending,
|
||||
vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
|
||||
vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
|
||||
VGIC_ACCESS_32bit),
|
||||
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
|
||||
vgic_mmio_read_pending, vgic_mmio_write_cpending,
|
||||
|
@ -226,8 +226,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len)
|
||||
static unsigned long __read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len,
|
||||
bool is_user)
|
||||
{
|
||||
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
|
||||
u32 value = 0;
|
||||
@ -239,6 +240,15 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
unsigned long flags;
|
||||
bool val;
|
||||
|
||||
/*
|
||||
* When used from userspace with a GICv3 model:
|
||||
*
|
||||
* Pending state of interrupt is latched in pending_latch
|
||||
* variable. Userspace will save and restore pending state
|
||||
* and line_level separately.
|
||||
* Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
|
||||
* for handling of ISPENDR and ICPENDR.
|
||||
*/
|
||||
raw_spin_lock_irqsave(&irq->irq_lock, flags);
|
||||
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
|
||||
int err;
|
||||
@ -248,10 +258,20 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
IRQCHIP_STATE_PENDING,
|
||||
&val);
|
||||
WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
|
||||
} else if (vgic_irq_is_mapped_level(irq)) {
|
||||
} else if (!is_user && vgic_irq_is_mapped_level(irq)) {
|
||||
val = vgic_get_phys_line_level(irq);
|
||||
} else {
|
||||
val = irq_is_pending(irq);
|
||||
switch (vcpu->kvm->arch.vgic.vgic_model) {
|
||||
case KVM_DEV_TYPE_ARM_VGIC_V3:
|
||||
if (is_user) {
|
||||
val = irq->pending_latch;
|
||||
break;
|
||||
}
|
||||
fallthrough;
|
||||
default:
|
||||
val = irq_is_pending(irq);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
value |= ((u32)val << i);
|
||||
@ -263,6 +283,18 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
return value;
|
||||
}
|
||||
|
||||
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len)
|
||||
{
|
||||
return __read_pending(vcpu, addr, len, false);
|
||||
}
|
||||
|
||||
unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len)
|
||||
{
|
||||
return __read_pending(vcpu, addr, len, true);
|
||||
}
|
||||
|
||||
static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
|
||||
{
|
||||
return (vgic_irq_is_sgi(irq->intid) &&
|
||||
|
@ -149,6 +149,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
|
||||
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len);
|
||||
|
||||
unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len);
|
||||
|
||||
void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
|
||||
gpa_t addr, unsigned int len,
|
||||
unsigned long val);
|
||||
|
@ -66,7 +66,7 @@ static void flush_context(void)
|
||||
* the next context-switch, we broadcast TLB flush + I-cache
|
||||
* invalidation over the inner shareable domain on rollover.
|
||||
*/
|
||||
kvm_call_hyp(__kvm_flush_vm_context);
|
||||
kvm_call_hyp(__kvm_flush_vm_context);
|
||||
}
|
||||
|
||||
static bool check_update_reserved_vmid(u64 vmid, u64 newvmid)
|
||||
|
@ -218,8 +218,6 @@ SYM_FUNC_ALIAS(__dma_flush_area, __pi___dma_flush_area)
|
||||
*/
|
||||
SYM_FUNC_START(__pi___dma_map_area)
|
||||
add x1, x0, x1
|
||||
cmp w2, #DMA_FROM_DEVICE
|
||||
b.eq __pi_dcache_inval_poc
|
||||
b __pi_dcache_clean_poc
|
||||
SYM_FUNC_END(__pi___dma_map_area)
|
||||
SYM_FUNC_ALIAS(__dma_map_area, __pi___dma_map_area)
|
||||
|
@ -834,7 +834,7 @@ iosapic_unregister_intr (unsigned int gsi)
|
||||
if (iosapic_intr_info[irq].count == 0) {
|
||||
#ifdef CONFIG_SMP
|
||||
/* Clear affinity */
|
||||
cpumask_setall(irq_get_affinity_mask(irq));
|
||||
irq_data_update_affinity(irq_get_irq_data(irq), cpu_all_mask);
|
||||
#endif
|
||||
/* Clear the interrupt information */
|
||||
iosapic_intr_info[irq].dest = 0;
|
||||
|
@ -57,8 +57,8 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
|
||||
void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
|
||||
{
|
||||
if (irq < NR_IRQS) {
|
||||
cpumask_copy(irq_get_affinity_mask(irq),
|
||||
cpumask_of(cpu_logical_id(hwid)));
|
||||
irq_data_update_affinity(irq_get_irq_data(irq),
|
||||
cpumask_of(cpu_logical_id(hwid)));
|
||||
irq_redir[irq] = (char) (redir & 0xff);
|
||||
}
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ static int ia64_set_msi_irq_affinity(struct irq_data *idata,
|
||||
msg.data = data;
|
||||
|
||||
pci_write_msi_msg(irq, &msg);
|
||||
cpumask_copy(irq_data_get_affinity_mask(idata), cpumask_of(cpu));
|
||||
irq_data_update_affinity(idata, cpumask_of(cpu));
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -132,7 +132,7 @@ static int dmar_msi_set_affinity(struct irq_data *data,
|
||||
msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
|
||||
|
||||
dmar_msi_write(irq, &msg);
|
||||
cpumask_copy(irq_data_get_affinity_mask(data), mask);
|
||||
irq_data_update_affinity(data, mask);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
config LOONGARCH
|
||||
bool
|
||||
default y
|
||||
select ACPI_GENERIC_GSI if ACPI
|
||||
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
|
||||
select ARCH_BINFMT_ELF_STATE
|
||||
select ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
|
@ -31,6 +31,148 @@ static inline bool acpi_has_cpu_in_madt(void)
|
||||
|
||||
extern struct list_head acpi_wakeup_device_list;
|
||||
|
||||
/*
|
||||
* Temporary definitions until the core ACPICA code gets updated (see
|
||||
* 1656837932-18257-1-git-send-email-lvjianmin@loongson.cn and its
|
||||
* follow-ups for the "rationale").
|
||||
*
|
||||
* Once the "legal reasons" are cleared and the code is merged,
|
||||
* this can be dropped entirely.
|
||||
*/
|
||||
#if (ACPI_CA_VERSION == 0x20220331 && !defined(LOONGARCH_ACPICA_EXT))
|
||||
|
||||
#define LOONGARCH_ACPICA_EXT 1
|
||||
|
||||
#define ACPI_MADT_TYPE_CORE_PIC 17
|
||||
#define ACPI_MADT_TYPE_LIO_PIC 18
|
||||
#define ACPI_MADT_TYPE_HT_PIC 19
|
||||
#define ACPI_MADT_TYPE_EIO_PIC 20
|
||||
#define ACPI_MADT_TYPE_MSI_PIC 21
|
||||
#define ACPI_MADT_TYPE_BIO_PIC 22
|
||||
#define ACPI_MADT_TYPE_LPC_PIC 23
|
||||
|
||||
/* Values for Version field above */
|
||||
|
||||
enum acpi_madt_core_pic_version {
|
||||
ACPI_MADT_CORE_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_CORE_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_CORE_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_lio_pic_version {
|
||||
ACPI_MADT_LIO_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_LIO_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_LIO_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_eio_pic_version {
|
||||
ACPI_MADT_EIO_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_EIO_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_EIO_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_ht_pic_version {
|
||||
ACPI_MADT_HT_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_HT_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_HT_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_bio_pic_version {
|
||||
ACPI_MADT_BIO_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_BIO_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_BIO_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_msi_pic_version {
|
||||
ACPI_MADT_MSI_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_MSI_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_MSI_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
enum acpi_madt_lpc_pic_version {
|
||||
ACPI_MADT_LPC_PIC_VERSION_NONE = 0,
|
||||
ACPI_MADT_LPC_PIC_VERSION_V1 = 1,
|
||||
ACPI_MADT_LPC_PIC_VERSION_RESERVED = 2 /* 2 and greater are reserved */
|
||||
};
|
||||
|
||||
#pragma pack(1)
|
||||
|
||||
/* Core Interrupt Controller */
|
||||
|
||||
struct acpi_madt_core_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u32 processor_id;
|
||||
u32 core_id;
|
||||
u32 flags;
|
||||
};
|
||||
|
||||
/* Legacy I/O Interrupt Controller */
|
||||
|
||||
struct acpi_madt_lio_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u64 address;
|
||||
u16 size;
|
||||
u8 cascade[2];
|
||||
u32 cascade_map[2];
|
||||
};
|
||||
|
||||
/* Extended I/O Interrupt Controller */
|
||||
|
||||
struct acpi_madt_eio_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u8 cascade;
|
||||
u8 node;
|
||||
u64 node_map;
|
||||
};
|
||||
|
||||
/* HT Interrupt Controller */
|
||||
|
||||
struct acpi_madt_ht_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u64 address;
|
||||
u16 size;
|
||||
u8 cascade[8];
|
||||
};
|
||||
|
||||
/* Bridge I/O Interrupt Controller */
|
||||
|
||||
struct acpi_madt_bio_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u64 address;
|
||||
u16 size;
|
||||
u16 id;
|
||||
u16 gsi_base;
|
||||
};
|
||||
|
||||
/* MSI Interrupt Controller */
|
||||
|
||||
struct acpi_madt_msi_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u64 msg_address;
|
||||
u32 start;
|
||||
u32 count;
|
||||
};
|
||||
|
||||
/* LPC Interrupt Controller */
|
||||
|
||||
struct acpi_madt_lpc_pic {
|
||||
struct acpi_subtable_header header;
|
||||
u8 version;
|
||||
u64 address;
|
||||
u16 size;
|
||||
u8 cascade;
|
||||
};
|
||||
|
||||
#pragma pack()
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* !CONFIG_ACPI */
|
||||
|
||||
#define ACPI_TABLE_UPGRADE_MAX_PHYS ARCH_LOW_ADDRESS_LIMIT
|
||||
|
@ -35,9 +35,6 @@ static inline bool on_irq_stack(int cpu, unsigned long sp)
|
||||
return (low <= sp && sp <= high);
|
||||
}
|
||||
|
||||
int get_ipi_irq(void);
|
||||
int get_pmc_irq(void);
|
||||
int get_timer_irq(void);
|
||||
void spurious_interrupt(void);
|
||||
|
||||
#define NR_IRQS_LEGACY 16
|
||||
@ -48,6 +45,14 @@ void arch_trigger_cpumask_backtrace(const struct cpumask *mask, bool exclude_sel
|
||||
#define MAX_IO_PICS 2
|
||||
#define NR_IRQS (64 + (256 * MAX_IO_PICS))
|
||||
|
||||
struct acpi_vector_group {
|
||||
int node;
|
||||
int pci_segment;
|
||||
struct irq_domain *parent;
|
||||
};
|
||||
extern struct acpi_vector_group pch_group[MAX_IO_PICS];
|
||||
extern struct acpi_vector_group msi_group[MAX_IO_PICS];
|
||||
|
||||
#define CORES_PER_EIO_NODE 4
|
||||
|
||||
#define LOONGSON_CPU_UART0_VEC 10 /* CPU UART0 */
|
||||
@ -79,15 +84,6 @@ void arch_trigger_cpumask_backtrace(const struct cpumask *mask, bool exclude_sel
|
||||
extern int find_pch_pic(u32 gsi);
|
||||
extern int eiointc_get_node(int id);
|
||||
|
||||
static inline void eiointc_enable(void)
|
||||
{
|
||||
uint64_t misc;
|
||||
|
||||
misc = iocsr_read64(LOONGARCH_IOCSR_MISC_FUNC);
|
||||
misc |= IOCSR_MISC_FUNC_EXT_IOI_EN;
|
||||
iocsr_write64(misc, LOONGARCH_IOCSR_MISC_FUNC);
|
||||
}
|
||||
|
||||
struct acpi_madt_lio_pic;
|
||||
struct acpi_madt_eio_pic;
|
||||
struct acpi_madt_ht_pic;
|
||||
@ -95,21 +91,29 @@ struct acpi_madt_bio_pic;
|
||||
struct acpi_madt_msi_pic;
|
||||
struct acpi_madt_lpc_pic;
|
||||
|
||||
struct irq_domain *loongarch_cpu_irq_init(void);
|
||||
|
||||
struct irq_domain *liointc_acpi_init(struct irq_domain *parent,
|
||||
int liointc_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_lio_pic *acpi_liointc);
|
||||
struct irq_domain *eiointc_acpi_init(struct irq_domain *parent,
|
||||
int eiointc_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_eio_pic *acpi_eiointc);
|
||||
|
||||
struct irq_domain *htvec_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_ht_pic *acpi_htvec);
|
||||
struct irq_domain *pch_lpc_acpi_init(struct irq_domain *parent,
|
||||
int pch_lpc_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_lpc_pic *acpi_pchlpc);
|
||||
struct irq_domain *pch_msi_acpi_init(struct irq_domain *parent,
|
||||
#if IS_ENABLED(CONFIG_LOONGSON_PCH_MSI)
|
||||
int pch_msi_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_msi_pic *acpi_pchmsi);
|
||||
struct irq_domain *pch_pic_acpi_init(struct irq_domain *parent,
|
||||
#else
|
||||
static inline int pch_msi_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_msi_pic *acpi_pchmsi)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
int pch_pic_acpi_init(struct irq_domain *parent,
|
||||
struct acpi_madt_bio_pic *acpi_pchpic);
|
||||
int find_pch_pic(u32 gsi);
|
||||
struct fwnode_handle *get_pch_msi_handle(int pci_segment);
|
||||
|
||||
extern struct acpi_madt_lio_pic *acpi_liointc;
|
||||
extern struct acpi_madt_eio_pic *acpi_eiointc[MAX_IO_PICS];
|
||||
@ -119,11 +123,10 @@ extern struct acpi_madt_lpc_pic *acpi_pchlpc;
|
||||
extern struct acpi_madt_msi_pic *acpi_pchmsi[MAX_IO_PICS];
|
||||
extern struct acpi_madt_bio_pic *acpi_pchpic[MAX_IO_PICS];
|
||||
|
||||
extern struct irq_domain *cpu_domain;
|
||||
extern struct irq_domain *liointc_domain;
|
||||
extern struct irq_domain *pch_lpc_domain;
|
||||
extern struct irq_domain *pch_msi_domain[MAX_IO_PICS];
|
||||
extern struct irq_domain *pch_pic_domain[MAX_IO_PICS];
|
||||
extern struct fwnode_handle *cpuintc_handle;
|
||||
extern struct fwnode_handle *liointc_handle;
|
||||
extern struct fwnode_handle *pch_lpc_handle;
|
||||
extern struct fwnode_handle *pch_pic_handle[MAX_IO_PICS];
|
||||
|
||||
extern irqreturn_t loongson3_ipi_interrupt(int irq, void *dev);
|
||||
|
||||
|
@ -25,7 +25,6 @@ EXPORT_SYMBOL(acpi_pci_disabled);
|
||||
int acpi_strict = 1; /* We have no workarounds on LoongArch */
|
||||
int num_processors;
|
||||
int disabled_cpus;
|
||||
enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
|
||||
|
||||
u64 acpi_saved_sp;
|
||||
|
||||
@ -33,70 +32,6 @@ u64 acpi_saved_sp;
|
||||
|
||||
#define PREFIX "ACPI: "
|
||||
|
||||
int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
|
||||
{
|
||||
if (irqp != NULL)
|
||||
*irqp = acpi_register_gsi(NULL, gsi, -1, -1);
|
||||
return (*irqp >= 0) ? 0 : -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
|
||||
|
||||
int acpi_isa_irq_to_gsi(unsigned int isa_irq, u32 *gsi)
|
||||
{
|
||||
if (gsi)
|
||||
*gsi = isa_irq;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* success: return IRQ number (>=0)
|
||||
* failure: return < 0
|
||||
*/
|
||||
int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
|
||||
{
|
||||
struct irq_fwspec fwspec;
|
||||
|
||||
switch (gsi) {
|
||||
case GSI_MIN_CPU_IRQ ... GSI_MAX_CPU_IRQ:
|
||||
fwspec.fwnode = liointc_domain->fwnode;
|
||||
fwspec.param[0] = gsi - GSI_MIN_CPU_IRQ;
|
||||
fwspec.param_count = 1;
|
||||
|
||||
return irq_create_fwspec_mapping(&fwspec);
|
||||
|
||||
case GSI_MIN_LPC_IRQ ... GSI_MAX_LPC_IRQ:
|
||||
if (!pch_lpc_domain)
|
||||
return -EINVAL;
|
||||
|
||||
fwspec.fwnode = pch_lpc_domain->fwnode;
|
||||
fwspec.param[0] = gsi - GSI_MIN_LPC_IRQ;
|
||||
fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
|
||||
fwspec.param_count = 2;
|
||||
|
||||
return irq_create_fwspec_mapping(&fwspec);
|
||||
|
||||
case GSI_MIN_PCH_IRQ ... GSI_MAX_PCH_IRQ:
|
||||
if (!pch_pic_domain[0])
|
||||
return -EINVAL;
|
||||
|
||||
fwspec.fwnode = pch_pic_domain[0]->fwnode;
|
||||
fwspec.param[0] = gsi - GSI_MIN_PCH_IRQ;
|
||||
fwspec.param[1] = IRQ_TYPE_LEVEL_HIGH;
|
||||
fwspec.param_count = 2;
|
||||
|
||||
return irq_create_fwspec_mapping(&fwspec);
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(acpi_register_gsi);
|
||||
|
||||
void acpi_unregister_gsi(u32 gsi)
|
||||
{
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
|
||||
|
||||
void __init __iomem * __acpi_map_table(unsigned long phys, unsigned long size)
|
||||
{
|
||||
|
||||
|
@ -25,12 +25,8 @@ DEFINE_PER_CPU(unsigned long, irq_stack);
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
|
||||
EXPORT_PER_CPU_SYMBOL(irq_stat);
|
||||
|
||||
struct irq_domain *cpu_domain;
|
||||
struct irq_domain *liointc_domain;
|
||||
struct irq_domain *pch_lpc_domain;
|
||||
struct irq_domain *pch_msi_domain[MAX_IO_PICS];
|
||||
struct irq_domain *pch_pic_domain[MAX_IO_PICS];
|
||||
|
||||
struct acpi_vector_group pch_group[MAX_IO_PICS];
|
||||
struct acpi_vector_group msi_group[MAX_IO_PICS];
|
||||
/*
|
||||
* 'what should we do if we get a hw irq event on an illegal vector'.
|
||||
* each architecture has to answer this themselves.
|
||||
@ -56,6 +52,51 @@ int arch_show_interrupts(struct seq_file *p, int prec)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __init early_pci_mcfg_parse(struct acpi_table_header *header)
|
||||
{
|
||||
struct acpi_table_mcfg *mcfg;
|
||||
struct acpi_mcfg_allocation *mptr;
|
||||
int i, n;
|
||||
|
||||
if (header->length < sizeof(struct acpi_table_mcfg))
|
||||
return -EINVAL;
|
||||
|
||||
n = (header->length - sizeof(struct acpi_table_mcfg)) /
|
||||
sizeof(struct acpi_mcfg_allocation);
|
||||
mcfg = (struct acpi_table_mcfg *)header;
|
||||
mptr = (struct acpi_mcfg_allocation *) &mcfg[1];
|
||||
|
||||
for (i = 0; i < n; i++, mptr++) {
|
||||
msi_group[i].pci_segment = mptr->pci_segment;
|
||||
pch_group[i].node = msi_group[i].node = (mptr->address >> 44) & 0xf;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __init init_vec_parent_group(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MAX_IO_PICS; i++) {
|
||||
msi_group[i].pci_segment = -1;
|
||||
msi_group[i].node = -1;
|
||||
pch_group[i].node = -1;
|
||||
}
|
||||
|
||||
acpi_table_parse(ACPI_SIG_MCFG, early_pci_mcfg_parse);
|
||||
}
|
||||
|
||||
static int __init get_ipi_irq(void)
|
||||
{
|
||||
struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
|
||||
|
||||
if (d)
|
||||
return irq_create_mapping(d, EXCCODE_IPI - EXCCODE_INT_START);
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
void __init init_IRQ(void)
|
||||
{
|
||||
int i;
|
||||
@ -69,9 +110,12 @@ void __init init_IRQ(void)
|
||||
clear_csr_ecfg(ECFG0_IM);
|
||||
clear_csr_estat(ESTATF_IP);
|
||||
|
||||
init_vec_parent_group();
|
||||
irqchip_init();
|
||||
#ifdef CONFIG_SMP
|
||||
ipi_irq = EXCCODE_IPI - EXCCODE_INT_START;
|
||||
ipi_irq = get_ipi_irq();
|
||||
if (ipi_irq < 0)
|
||||
panic("IPI IRQ mapping failed\n");
|
||||
irq_set_percpu_devid(ipi_irq);
|
||||
r = request_percpu_irq(ipi_irq, loongson3_ipi_interrupt, "IPI", &ipi_dummy_dev);
|
||||
if (r < 0)
|
||||
|
@ -123,6 +123,16 @@ void sync_counter(void)
|
||||
csr_write64(-init_timeval, LOONGARCH_CSR_CNTC);
|
||||
}
|
||||
|
||||
static int get_timer_irq(void)
|
||||
{
|
||||
struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
|
||||
|
||||
if (d)
|
||||
return irq_create_mapping(d, EXCCODE_TIMER - EXCCODE_INT_START);
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
int constant_clockevent_init(void)
|
||||
{
|
||||
unsigned int irq;
|
||||
@ -132,7 +142,9 @@ int constant_clockevent_init(void)
|
||||
struct clock_event_device *cd;
|
||||
static int timer_irq_installed = 0;
|
||||
|
||||
irq = EXCCODE_TIMER - EXCCODE_INT_START;
|
||||
irq = get_timer_irq();
|
||||
if (irq < 0)
|
||||
pr_err("Failed to map irq %d (timer)\n", irq);
|
||||
|
||||
cd = &per_cpu(constant_clockevent_device, cpu);
|
||||
|
||||
|
@ -101,6 +101,7 @@ SECTIONS
|
||||
|
||||
STABS_DEBUG
|
||||
DWARF_DEBUG
|
||||
ELF_DETAILS
|
||||
|
||||
.gptab.sdata : {
|
||||
*(.gptab.data)
|
||||
|
@ -263,7 +263,7 @@ static int next_cpu_for_irq(struct irq_data *data)
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
int cpu;
|
||||
struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
const struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
int weight = cpumask_weight(mask);
|
||||
struct octeon_ciu_chip_data *cd = irq_data_get_irq_chip_data(data);
|
||||
|
||||
@ -758,7 +758,7 @@ static void octeon_irq_cpu_offline_ciu(struct irq_data *data)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
cpumask_t new_affinity;
|
||||
struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
const struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
|
||||
if (!cpumask_test_cpu(cpu, mask))
|
||||
return;
|
||||
|
@ -7,8 +7,9 @@
|
||||
#define NR_MIPS_CPU_IRQS 8
|
||||
#define NR_MAX_CHAINED_IRQS 40 /* Chained IRQs means those not directly used by devices */
|
||||
#define NR_IRQS (NR_IRQS_LEGACY + NR_MIPS_CPU_IRQS + NR_MAX_CHAINED_IRQS + 256)
|
||||
|
||||
#define MAX_IO_PICS 1
|
||||
#define MIPS_CPU_IRQ_BASE NR_IRQS_LEGACY
|
||||
#define GSI_MIN_CPU_IRQ 0
|
||||
|
||||
#include <asm/mach-generic/irq.h>
|
||||
|
||||
|
@ -315,7 +315,7 @@ unsigned long txn_affinity_addr(unsigned int irq, int cpu)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
struct irq_data *d = irq_get_irq_data(irq);
|
||||
cpumask_copy(irq_data_get_affinity_mask(d), cpumask_of(cpu));
|
||||
irq_data_update_affinity(d, cpumask_of(cpu));
|
||||
#endif
|
||||
|
||||
return per_cpu(cpu_data, cpu).txn_addr;
|
||||
|
@ -364,8 +364,13 @@ config RISCV_ISA_SVPBMT
|
||||
select RISCV_ALTERNATIVE
|
||||
default y
|
||||
help
|
||||
Adds support to dynamically detect the presence of the SVPBMT extension
|
||||
(Supervisor-mode: page-based memory types) and enable its usage.
|
||||
Adds support to dynamically detect the presence of the SVPBMT
|
||||
ISA-extension (Supervisor-mode: page-based memory types) and
|
||||
enable its usage.
|
||||
|
||||
The memory type for a page contains a combination of attributes
|
||||
that indicate the cacheability, idempotency, and ordering
|
||||
properties for access to that page.
|
||||
|
||||
The SVPBMT extension is only available on 64-bit CPUs.
|
||||
|
||||
|
@ -35,6 +35,7 @@ config ERRATA_SIFIVE_CIP_1200
|
||||
|
||||
config ERRATA_THEAD
|
||||
bool "T-HEAD errata"
|
||||
depends on !XIP_KERNEL
|
||||
select RISCV_ALTERNATIVE
|
||||
help
|
||||
All T-HEAD errata Kconfig depend on this Kconfig. Disabling
|
||||
|
@ -192,6 +192,15 @@
|
||||
riscv,ndev = <186>;
|
||||
};
|
||||
|
||||
pdma: dma-controller@3000000 {
|
||||
compatible = "sifive,fu540-c000-pdma", "sifive,pdma0";
|
||||
reg = <0x0 0x3000000 0x0 0x8000>;
|
||||
interrupt-parent = <&plic>;
|
||||
interrupts = <5 6>, <7 8>, <9 10>, <11 12>;
|
||||
dma-channels = <4>;
|
||||
#dma-cells = <1>;
|
||||
};
|
||||
|
||||
clkcfg: clkcfg@20002000 {
|
||||
compatible = "microchip,mpfs-clkcfg";
|
||||
reg = <0x0 0x20002000 0x0 0x1000>, <0x0 0x3E001000 0x0 0x1000>;
|
||||
|
@ -293,7 +293,6 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
|
||||
unsigned int stage)
|
||||
{
|
||||
u32 cpu_req_feature = cpufeature_probe(stage);
|
||||
u32 cpu_apply_feature = 0;
|
||||
struct alt_entry *alt;
|
||||
u32 tmp;
|
||||
|
||||
@ -307,10 +306,8 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
|
||||
}
|
||||
|
||||
tmp = (1U << alt->errata_id);
|
||||
if (cpu_req_feature & tmp) {
|
||||
if (cpu_req_feature & tmp)
|
||||
patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
|
||||
cpu_apply_feature |= tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -97,7 +97,7 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
|
||||
* We ran out of VMIDs so we increment vmid_version and
|
||||
* start assigning VMIDs from 1.
|
||||
*
|
||||
* This also means existing VMIDs assignement to all Guest
|
||||
* This also means existing VMIDs assignment to all Guest
|
||||
* instances is invalid and we have to force VMID re-assignment
|
||||
* for all Guest instances. The Guest instances that were not
|
||||
* running will automatically pick-up new VMIDs because will
|
||||
|
@ -230,16 +230,17 @@ void migrate_irqs(void)
|
||||
struct irq_data *data = irq_get_irq_data(irq);
|
||||
|
||||
if (irq_data_get_node(data) == cpu) {
|
||||
struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
const struct cpumask *mask = irq_data_get_affinity_mask(data);
|
||||
unsigned int newcpu = cpumask_any_and(mask,
|
||||
cpu_online_mask);
|
||||
if (newcpu >= nr_cpu_ids) {
|
||||
pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n",
|
||||
irq, cpu);
|
||||
|
||||
cpumask_setall(mask);
|
||||
irq_set_affinity(irq, cpu_all_mask);
|
||||
} else {
|
||||
irq_set_affinity(irq, mask);
|
||||
}
|
||||
irq_set_affinity(irq, mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -124,6 +124,51 @@ static u64 get_cc_mask(void)
|
||||
return BIT_ULL(gpa_width - 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* The TDX module spec states that #VE may be injected for a limited set of
|
||||
* reasons:
|
||||
*
|
||||
* - Emulation of the architectural #VE injection on EPT violation;
|
||||
*
|
||||
* - As a result of guest TD execution of a disallowed instruction,
|
||||
* a disallowed MSR access, or CPUID virtualization;
|
||||
*
|
||||
* - A notification to the guest TD about anomalous behavior;
|
||||
*
|
||||
* The last one is opt-in and is not used by the kernel.
|
||||
*
|
||||
* The Intel Software Developer's Manual describes cases when instruction
|
||||
* length field can be used in section "Information for VM Exits Due to
|
||||
* Instruction Execution".
|
||||
*
|
||||
* For TDX, it ultimately means GET_VEINFO provides reliable instruction length
|
||||
* information if #VE occurred due to instruction execution, but not for EPT
|
||||
* violations.
|
||||
*/
|
||||
static int ve_instr_len(struct ve_info *ve)
|
||||
{
|
||||
switch (ve->exit_reason) {
|
||||
case EXIT_REASON_HLT:
|
||||
case EXIT_REASON_MSR_READ:
|
||||
case EXIT_REASON_MSR_WRITE:
|
||||
case EXIT_REASON_CPUID:
|
||||
case EXIT_REASON_IO_INSTRUCTION:
|
||||
/* It is safe to use ve->instr_len for #VE due to instructions */
|
||||
return ve->instr_len;
|
||||
case EXIT_REASON_EPT_VIOLATION:
|
||||
/*
|
||||
* For EPT violations, ve->instr_len is not defined. For those,
|
||||
* the kernel must decode instructions manually and should not
|
||||
* be using this function.
|
||||
*/
|
||||
WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
|
||||
return 0;
|
||||
default:
|
||||
WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
|
||||
return ve->instr_len;
|
||||
}
|
||||
}
|
||||
|
||||
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
|
||||
{
|
||||
struct tdx_hypercall_args args = {
|
||||
@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
|
||||
return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
|
||||
}
|
||||
|
||||
static bool handle_halt(void)
|
||||
static int handle_halt(struct ve_info *ve)
|
||||
{
|
||||
/*
|
||||
* Since non safe halt is mainly used in CPU offlining
|
||||
@ -158,9 +203,9 @@ static bool handle_halt(void)
|
||||
const bool do_sti = false;
|
||||
|
||||
if (__halt(irq_disabled, do_sti))
|
||||
return false;
|
||||
return -EIO;
|
||||
|
||||
return true;
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
void __cpuidle tdx_safe_halt(void)
|
||||
@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void)
|
||||
WARN_ONCE(1, "HLT instruction emulation failed\n");
|
||||
}
|
||||
|
||||
static bool read_msr(struct pt_regs *regs)
|
||||
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
struct tdx_hypercall_args args = {
|
||||
.r10 = TDX_HYPERCALL_STANDARD,
|
||||
@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs)
|
||||
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
|
||||
*/
|
||||
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
|
||||
return false;
|
||||
return -EIO;
|
||||
|
||||
regs->ax = lower_32_bits(args.r11);
|
||||
regs->dx = upper_32_bits(args.r11);
|
||||
return true;
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
static bool write_msr(struct pt_regs *regs)
|
||||
static int write_msr(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
struct tdx_hypercall_args args = {
|
||||
.r10 = TDX_HYPERCALL_STANDARD,
|
||||
@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs)
|
||||
* can be found in TDX Guest-Host-Communication Interface
|
||||
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
|
||||
*/
|
||||
return !__tdx_hypercall(&args, 0);
|
||||
if (__tdx_hypercall(&args, 0))
|
||||
return -EIO;
|
||||
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
static bool handle_cpuid(struct pt_regs *regs)
|
||||
static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
struct tdx_hypercall_args args = {
|
||||
.r10 = TDX_HYPERCALL_STANDARD,
|
||||
@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs)
|
||||
*/
|
||||
if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
|
||||
regs->ax = regs->bx = regs->cx = regs->dx = 0;
|
||||
return true;
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs)
|
||||
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
|
||||
*/
|
||||
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
|
||||
return false;
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
|
||||
@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs)
|
||||
regs->cx = args.r14;
|
||||
regs->dx = args.r15;
|
||||
|
||||
return true;
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
static bool mmio_read(int size, unsigned long addr, unsigned long *val)
|
||||
@ -283,10 +331,10 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
|
||||
EPT_WRITE, addr, val);
|
||||
}
|
||||
|
||||
static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
|
||||
static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
unsigned long *reg, val, vaddr;
|
||||
char buffer[MAX_INSN_SIZE];
|
||||
unsigned long *reg, val;
|
||||
struct insn insn = {};
|
||||
enum mmio_type mmio;
|
||||
int size, extend_size;
|
||||
@ -294,34 +342,49 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
|
||||
|
||||
/* Only in-kernel MMIO is supported */
|
||||
if (WARN_ON_ONCE(user_mode(regs)))
|
||||
return false;
|
||||
return -EFAULT;
|
||||
|
||||
if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
|
||||
return false;
|
||||
return -EFAULT;
|
||||
|
||||
if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
|
||||
return false;
|
||||
return -EINVAL;
|
||||
|
||||
mmio = insn_decode_mmio(&insn, &size);
|
||||
if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
|
||||
return false;
|
||||
return -EINVAL;
|
||||
|
||||
if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
|
||||
reg = insn_get_modrm_reg_ptr(&insn, regs);
|
||||
if (!reg)
|
||||
return false;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ve->instr_len = insn.length;
|
||||
/*
|
||||
* Reject EPT violation #VEs that split pages.
|
||||
*
|
||||
* MMIO accesses are supposed to be naturally aligned and therefore
|
||||
* never cross page boundaries. Seeing split page accesses indicates
|
||||
* a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
|
||||
*
|
||||
* load_unaligned_zeropad() will recover using exception fixups.
|
||||
*/
|
||||
vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
|
||||
if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
|
||||
return -EFAULT;
|
||||
|
||||
/* Handle writes first */
|
||||
switch (mmio) {
|
||||
case MMIO_WRITE:
|
||||
memcpy(&val, reg, size);
|
||||
return mmio_write(size, ve->gpa, val);
|
||||
if (!mmio_write(size, ve->gpa, val))
|
||||
return -EIO;
|
||||
return insn.length;
|
||||
case MMIO_WRITE_IMM:
|
||||
val = insn.immediate.value;
|
||||
return mmio_write(size, ve->gpa, val);
|
||||
if (!mmio_write(size, ve->gpa, val))
|
||||
return -EIO;
|
||||
return insn.length;
|
||||
case MMIO_READ:
|
||||
case MMIO_READ_ZERO_EXTEND:
|
||||
case MMIO_READ_SIGN_EXTEND:
|
||||
@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
|
||||
* decoded or handled properly. It was likely not using io.h
|
||||
* helpers or accessed MMIO accidentally.
|
||||
*/
|
||||
return false;
|
||||
return -EINVAL;
|
||||
default:
|
||||
WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
|
||||
return false;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Handle reads */
|
||||
if (!mmio_read(size, ve->gpa, &val))
|
||||
return false;
|
||||
return -EIO;
|
||||
|
||||
switch (mmio) {
|
||||
case MMIO_READ:
|
||||
@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
|
||||
default:
|
||||
/* All other cases have to be covered by the first switch() */
|
||||
WARN_ON_ONCE(1);
|
||||
return false;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (extend_size)
|
||||
memset(reg, extend_val, extend_size);
|
||||
memcpy(reg, &val, size);
|
||||
return true;
|
||||
return insn.length;
|
||||
}
|
||||
|
||||
static bool handle_in(struct pt_regs *regs, int size, int port)
|
||||
@ -421,13 +484,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port)
|
||||
*
|
||||
* Return True on success or False on failure.
|
||||
*/
|
||||
static bool handle_io(struct pt_regs *regs, u32 exit_qual)
|
||||
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
u32 exit_qual = ve->exit_qual;
|
||||
int size, port;
|
||||
bool in;
|
||||
bool in, ret;
|
||||
|
||||
if (VE_IS_IO_STRING(exit_qual))
|
||||
return false;
|
||||
return -EIO;
|
||||
|
||||
in = VE_IS_IO_IN(exit_qual);
|
||||
size = VE_GET_IO_SIZE(exit_qual);
|
||||
@ -435,9 +499,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
|
||||
|
||||
|
||||
if (in)
|
||||
return handle_in(regs, size, port);
|
||||
ret = handle_in(regs, size, port);
|
||||
else
|
||||
return handle_out(regs, size, port);
|
||||
ret = handle_out(regs, size, port);
|
||||
if (!ret)
|
||||
return -EIO;
|
||||
|
||||
return ve_instr_len(ve);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
|
||||
__init bool tdx_early_handle_ve(struct pt_regs *regs)
|
||||
{
|
||||
struct ve_info ve;
|
||||
int insn_len;
|
||||
|
||||
tdx_get_ve_info(&ve);
|
||||
|
||||
if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
|
||||
return false;
|
||||
|
||||
return handle_io(regs, ve.exit_qual);
|
||||
insn_len = handle_io(regs, &ve);
|
||||
if (insn_len < 0)
|
||||
return false;
|
||||
|
||||
regs->ip += insn_len;
|
||||
return true;
|
||||
}
|
||||
|
||||
void tdx_get_ve_info(struct ve_info *ve)
|
||||
@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve)
|
||||
ve->instr_info = upper_32_bits(out.r10);
|
||||
}
|
||||
|
||||
/* Handle the user initiated #VE */
|
||||
static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
|
||||
/*
|
||||
* Handle the user initiated #VE.
|
||||
*
|
||||
* On success, returns the number of bytes RIP should be incremented (>=0)
|
||||
* or -errno on error.
|
||||
*/
|
||||
static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
switch (ve->exit_reason) {
|
||||
case EXIT_REASON_CPUID:
|
||||
return handle_cpuid(regs);
|
||||
return handle_cpuid(regs, ve);
|
||||
default:
|
||||
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
|
||||
return false;
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle the kernel #VE */
|
||||
static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
|
||||
/*
|
||||
* Handle the kernel #VE.
|
||||
*
|
||||
* On success, returns the number of bytes RIP should be incremented (>=0)
|
||||
* or -errno on error.
|
||||
*/
|
||||
static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
switch (ve->exit_reason) {
|
||||
case EXIT_REASON_HLT:
|
||||
return handle_halt();
|
||||
return handle_halt(ve);
|
||||
case EXIT_REASON_MSR_READ:
|
||||
return read_msr(regs);
|
||||
return read_msr(regs, ve);
|
||||
case EXIT_REASON_MSR_WRITE:
|
||||
return write_msr(regs);
|
||||
return write_msr(regs, ve);
|
||||
case EXIT_REASON_CPUID:
|
||||
return handle_cpuid(regs);
|
||||
return handle_cpuid(regs, ve);
|
||||
case EXIT_REASON_EPT_VIOLATION:
|
||||
return handle_mmio(regs, ve);
|
||||
case EXIT_REASON_IO_INSTRUCTION:
|
||||
return handle_io(regs, ve->exit_qual);
|
||||
return handle_io(regs, ve);
|
||||
default:
|
||||
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
|
||||
return false;
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
|
||||
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
|
||||
{
|
||||
bool ret;
|
||||
int insn_len;
|
||||
|
||||
if (user_mode(regs))
|
||||
ret = virt_exception_user(regs, ve);
|
||||
insn_len = virt_exception_user(regs, ve);
|
||||
else
|
||||
ret = virt_exception_kernel(regs, ve);
|
||||
insn_len = virt_exception_kernel(regs, ve);
|
||||
if (insn_len < 0)
|
||||
return false;
|
||||
|
||||
/* After successful #VE handling, move the IP */
|
||||
if (ret)
|
||||
regs->ip += ve->instr_len;
|
||||
regs->ip += insn_len;
|
||||
|
||||
return ret;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool tdx_tlb_flush_required(bool private)
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <linux/io.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/sev.h>
|
||||
#include <asm/hypervisor.h>
|
||||
#include <asm/hyperv-tlfs.h>
|
||||
#include <asm/mshyperv.h>
|
||||
@ -405,6 +406,11 @@ void __init hyperv_init(void)
|
||||
}
|
||||
|
||||
if (hv_isolation_type_snp()) {
|
||||
/* Negotiate GHCB Version. */
|
||||
if (!hv_ghcb_negotiate_protocol())
|
||||
hv_ghcb_terminate(SEV_TERM_SET_GEN,
|
||||
GHCB_SEV_ES_PROT_UNSUPPORTED);
|
||||
|
||||
hv_ghcb_pg = alloc_percpu(union hv_ghcb *);
|
||||
if (!hv_ghcb_pg)
|
||||
goto free_vp_assist_page;
|
||||
|
@ -192,7 +192,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
|
||||
struct pci_dev *dev;
|
||||
struct hv_interrupt_entry out_entry, *stored_entry;
|
||||
struct irq_cfg *cfg = irqd_cfg(data);
|
||||
cpumask_t *affinity;
|
||||
const cpumask_t *affinity;
|
||||
int cpu;
|
||||
u64 status;
|
||||
|
||||
|
@ -53,6 +53,8 @@ union hv_ghcb {
|
||||
} hypercall;
|
||||
} __packed __aligned(HV_HYP_PAGE_SIZE);
|
||||
|
||||
static u16 hv_ghcb_version __ro_after_init;
|
||||
|
||||
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
|
||||
{
|
||||
union hv_ghcb *hv_ghcb;
|
||||
@ -96,12 +98,85 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
|
||||
return status;
|
||||
}
|
||||
|
||||
static inline u64 rd_ghcb_msr(void)
|
||||
{
|
||||
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
|
||||
}
|
||||
|
||||
static inline void wr_ghcb_msr(u64 val)
|
||||
{
|
||||
native_wrmsrl(MSR_AMD64_SEV_ES_GHCB, val);
|
||||
}
|
||||
|
||||
static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code,
|
||||
u64 exit_info_1, u64 exit_info_2)
|
||||
{
|
||||
/* Fill in protocol and format specifiers */
|
||||
ghcb->protocol_version = hv_ghcb_version;
|
||||
ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
|
||||
|
||||
ghcb_set_sw_exit_code(ghcb, exit_code);
|
||||
ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
|
||||
ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
|
||||
|
||||
VMGEXIT();
|
||||
|
||||
if (ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0))
|
||||
return ES_VMM_ERROR;
|
||||
else
|
||||
return ES_OK;
|
||||
}
|
||||
|
||||
void hv_ghcb_terminate(unsigned int set, unsigned int reason)
|
||||
{
|
||||
u64 val = GHCB_MSR_TERM_REQ;
|
||||
|
||||
/* Tell the hypervisor what went wrong. */
|
||||
val |= GHCB_SEV_TERM_REASON(set, reason);
|
||||
|
||||
/* Request Guest Termination from Hypervisor */
|
||||
wr_ghcb_msr(val);
|
||||
VMGEXIT();
|
||||
|
||||
while (true)
|
||||
asm volatile("hlt\n" : : : "memory");
|
||||
}
|
||||
|
||||
bool hv_ghcb_negotiate_protocol(void)
|
||||
{
|
||||
u64 ghcb_gpa;
|
||||
u64 val;
|
||||
|
||||
/* Save ghcb page gpa. */
|
||||
ghcb_gpa = rd_ghcb_msr();
|
||||
|
||||
/* Do the GHCB protocol version negotiation */
|
||||
wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
|
||||
VMGEXIT();
|
||||
val = rd_ghcb_msr();
|
||||
|
||||
if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
|
||||
return false;
|
||||
|
||||
if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
|
||||
GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
|
||||
return false;
|
||||
|
||||
hv_ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val),
|
||||
GHCB_PROTOCOL_MAX);
|
||||
|
||||
/* Write ghcb page back after negotiating protocol. */
|
||||
wr_ghcb_msr(ghcb_gpa);
|
||||
VMGEXIT();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void hv_ghcb_msr_write(u64 msr, u64 value)
|
||||
{
|
||||
union hv_ghcb *hv_ghcb;
|
||||
void **ghcb_base;
|
||||
unsigned long flags;
|
||||
struct es_em_ctxt ctxt;
|
||||
|
||||
if (!hv_ghcb_pg)
|
||||
return;
|
||||
@ -120,8 +195,7 @@ void hv_ghcb_msr_write(u64 msr, u64 value)
|
||||
ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value));
|
||||
ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value));
|
||||
|
||||
if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
|
||||
SVM_EXIT_MSR, 1, 0))
|
||||
if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 1, 0))
|
||||
pr_warn("Fail to write msr via ghcb %llx.\n", msr);
|
||||
|
||||
local_irq_restore(flags);
|
||||
@ -133,7 +207,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
|
||||
union hv_ghcb *hv_ghcb;
|
||||
void **ghcb_base;
|
||||
unsigned long flags;
|
||||
struct es_em_ctxt ctxt;
|
||||
|
||||
/* Check size of union hv_ghcb here. */
|
||||
BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE);
|
||||
@ -152,8 +225,7 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
|
||||
}
|
||||
|
||||
ghcb_set_rcx(&hv_ghcb->ghcb, msr);
|
||||
if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
|
||||
SVM_EXIT_MSR, 0, 0))
|
||||
if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 0, 0))
|
||||
pr_warn("Fail to read msr via ghcb %llx.\n", msr);
|
||||
else
|
||||
*value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax)
|
||||
|
@ -446,5 +446,6 @@
|
||||
#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
|
||||
#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
|
||||
#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
|
||||
#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
|
||||
|
||||
#endif /* _ASM_X86_CPUFEATURES_H */
|
||||
|
@ -4,9 +4,6 @@
|
||||
|
||||
#include <asm/e820/types.h>
|
||||
|
||||
struct device;
|
||||
struct resource;
|
||||
|
||||
extern struct e820_table *e820_table;
|
||||
extern struct e820_table *e820_table_kexec;
|
||||
extern struct e820_table *e820_table_firmware;
|
||||
@ -46,8 +43,6 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn);
|
||||
|
||||
extern int e820__get_entry_type(u64 start, u64 end);
|
||||
|
||||
extern void remove_e820_regions(struct device *dev, struct resource *avail);
|
||||
|
||||
/*
|
||||
* Returns true iff the specified range [start,end) is completely contained inside
|
||||
* the ISA region.
|
||||
|
@ -1047,14 +1047,77 @@ struct kvm_x86_msr_filter {
|
||||
};
|
||||
|
||||
enum kvm_apicv_inhibit {
|
||||
|
||||
/********************************************************************/
|
||||
/* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
|
||||
/********************************************************************/
|
||||
|
||||
/*
|
||||
* APIC acceleration is disabled by a module parameter
|
||||
* and/or not supported in hardware.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_DISABLE,
|
||||
|
||||
/*
|
||||
* APIC acceleration is inhibited because AutoEOI feature is
|
||||
* being used by a HyperV guest.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_HYPERV,
|
||||
APICV_INHIBIT_REASON_NESTED,
|
||||
APICV_INHIBIT_REASON_IRQWIN,
|
||||
APICV_INHIBIT_REASON_PIT_REINJ,
|
||||
APICV_INHIBIT_REASON_X2APIC,
|
||||
APICV_INHIBIT_REASON_BLOCKIRQ,
|
||||
|
||||
/*
|
||||
* APIC acceleration is inhibited because the userspace didn't yet
|
||||
* enable the kernel/split irqchip.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_ABSENT,
|
||||
|
||||
/* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
|
||||
* (out of band, debug measure of blocking all interrupts on this vCPU)
|
||||
* was enabled, to avoid AVIC/APICv bypassing it.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_BLOCKIRQ,
|
||||
|
||||
/*
|
||||
* For simplicity, the APIC acceleration is inhibited
|
||||
* the first time either APIC ID or APIC base are changed by the guest
|
||||
* from their reset values.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
|
||||
APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
|
||||
|
||||
/******************************************************/
|
||||
/* INHIBITs that are relevant only to the AMD's AVIC. */
|
||||
/******************************************************/
|
||||
|
||||
/*
|
||||
* AVIC is inhibited on a vCPU because it runs a nested guest.
|
||||
*
|
||||
* This is needed because unlike APICv, the peers of this vCPU
|
||||
* cannot use the doorbell mechanism to signal interrupts via AVIC when
|
||||
* a vCPU runs nested.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_NESTED,
|
||||
|
||||
/*
|
||||
* On SVM, the wait for the IRQ window is implemented with pending vIRQ,
|
||||
* which cannot be injected when the AVIC is enabled, thus AVIC
|
||||
* is inhibited while KVM waits for IRQ window.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_IRQWIN,
|
||||
|
||||
/*
|
||||
* PIT (i8254) 're-inject' mode, relies on EOI intercept,
|
||||
* which AVIC doesn't support for edge triggered interrupts.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_PIT_REINJ,
|
||||
|
||||
/*
|
||||
* AVIC is inhibited because the guest has x2apic in its CPUID.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_X2APIC,
|
||||
|
||||
/*
|
||||
* AVIC is disabled because SEV doesn't support it.
|
||||
*/
|
||||
APICV_INHIBIT_REASON_SEV,
|
||||
};
|
||||
|
||||
|
@ -179,9 +179,13 @@ int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
|
||||
#ifdef CONFIG_AMD_MEM_ENCRYPT
|
||||
void hv_ghcb_msr_write(u64 msr, u64 value);
|
||||
void hv_ghcb_msr_read(u64 msr, u64 *value);
|
||||
bool hv_ghcb_negotiate_protocol(void);
|
||||
void hv_ghcb_terminate(unsigned int set, unsigned int reason);
|
||||
#else
|
||||
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
|
||||
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
|
||||
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
|
||||
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
|
||||
#endif
|
||||
|
||||
extern bool hv_isolation_type_snp(void);
|
||||
|
@ -116,6 +116,30 @@
|
||||
* Not susceptible to
|
||||
* TSX Async Abort (TAA) vulnerabilities.
|
||||
*/
|
||||
#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /*
|
||||
* Not susceptible to SBDR and SSDP
|
||||
* variants of Processor MMIO stale data
|
||||
* vulnerabilities.
|
||||
*/
|
||||
#define ARCH_CAP_FBSDP_NO BIT(14) /*
|
||||
* Not susceptible to FBSDP variant of
|
||||
* Processor MMIO stale data
|
||||
* vulnerabilities.
|
||||
*/
|
||||
#define ARCH_CAP_PSDP_NO BIT(15) /*
|
||||
* Not susceptible to PSDP variant of
|
||||
* Processor MMIO stale data
|
||||
* vulnerabilities.
|
||||
*/
|
||||
#define ARCH_CAP_FB_CLEAR BIT(17) /*
|
||||
* VERW clears CPU fill buffer
|
||||
* even on MDS_NO CPUs.
|
||||
*/
|
||||
#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /*
|
||||
* MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
|
||||
* bit available to control VERW
|
||||
* behavior.
|
||||
*/
|
||||
|
||||
#define MSR_IA32_FLUSH_CMD 0x0000010b
|
||||
#define L1D_FLUSH BIT(0) /*
|
||||
@ -133,6 +157,7 @@
|
||||
#define MSR_IA32_MCU_OPT_CTRL 0x00000123
|
||||
#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
|
||||
#define RTM_ALLOW BIT(1) /* TSX development mode */
|
||||
#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
|
||||
|
||||
#define MSR_IA32_SYSENTER_CS 0x00000174
|
||||
#define MSR_IA32_SYSENTER_ESP 0x00000175
|
||||
|
@ -269,6 +269,8 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
|
||||
|
||||
#include <asm/segment.h>
|
||||
|
||||
/**
|
||||
|
@ -69,6 +69,8 @@ void pcibios_scan_specific_bus(int busn);
|
||||
|
||||
/* pci-irq.c */
|
||||
|
||||
struct pci_dev;
|
||||
|
||||
struct irq_info {
|
||||
u8 bus, devfn; /* Bus, device and function */
|
||||
struct {
|
||||
@ -246,3 +248,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
|
||||
# define x86_default_pci_init_irq NULL
|
||||
# define x86_default_pci_fixup_irqs NULL
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_PCI) && defined(CONFIG_ACPI)
|
||||
extern bool pci_use_e820;
|
||||
#else
|
||||
#define pci_use_e820 false
|
||||
#endif
|
||||
|
@ -108,19 +108,16 @@ extern unsigned long _brk_end;
|
||||
void *extend_brk(size_t size, size_t align);
|
||||
|
||||
/*
|
||||
* Reserve space in the brk section. The name must be unique within the file,
|
||||
* and somewhat descriptive. The size is in bytes.
|
||||
* Reserve space in the .brk section, which is a block of memory from which the
|
||||
* caller is allowed to allocate very early (before even memblock is available)
|
||||
* by calling extend_brk(). All allocated memory will be eventually converted
|
||||
* to memblock. Any leftover unallocated memory will be freed.
|
||||
*
|
||||
* The allocation is done using inline asm (rather than using a section
|
||||
* attribute on a normal variable) in order to allow the use of @nobits, so
|
||||
* that it doesn't take up any space in the vmlinux file.
|
||||
* The size is in bytes.
|
||||
*/
|
||||
#define RESERVE_BRK(name, size) \
|
||||
asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \
|
||||
".brk." #name ":\n\t" \
|
||||
".skip " __stringify(size) "\n\t" \
|
||||
".size .brk." #name ", " __stringify(size) "\n\t" \
|
||||
".popsection\n\t")
|
||||
#define RESERVE_BRK(name, size) \
|
||||
__section(".bss..brk") __aligned(1) __used \
|
||||
static char __brk_##name[size]
|
||||
|
||||
extern void probe_roms(void);
|
||||
#ifdef __i386__
|
||||
@ -133,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
|
||||
|
||||
#endif /* __i386__ */
|
||||
#endif /* _SETUP */
|
||||
#else
|
||||
#define RESERVE_BRK(name,sz) \
|
||||
.pushsection .brk_reservation,"aw",@nobits; \
|
||||
.brk.name: \
|
||||
1: .skip sz; \
|
||||
.size .brk.name,.-1b; \
|
||||
|
||||
#else /* __ASSEMBLY */
|
||||
|
||||
.macro __RESERVE_BRK name, size
|
||||
.pushsection .bss..brk, "aw"
|
||||
SYM_DATA_START(__brk_\name)
|
||||
.skip \size
|
||||
SYM_DATA_END(__brk_\name)
|
||||
.popsection
|
||||
.endm
|
||||
|
||||
#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_SETUP_H */
|
||||
|
@ -36,10 +36,6 @@ KCSAN_SANITIZE := n
|
||||
|
||||
OBJECT_FILES_NON_STANDARD_test_nx.o := y
|
||||
|
||||
ifdef CONFIG_FRAME_POINTER
|
||||
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
|
||||
endif
|
||||
|
||||
# If instrumentation of this dir is enabled, boot hangs during first second.
|
||||
# Probably could be more selective here, but note that files related to irqs,
|
||||
# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
|
||||
|
@ -41,8 +41,10 @@ static void __init spectre_v2_select_mitigation(void);
|
||||
static void __init ssb_select_mitigation(void);
|
||||
static void __init l1tf_select_mitigation(void);
|
||||
static void __init mds_select_mitigation(void);
|
||||
static void __init mds_print_mitigation(void);
|
||||
static void __init md_clear_update_mitigation(void);
|
||||
static void __init md_clear_select_mitigation(void);
|
||||
static void __init taa_select_mitigation(void);
|
||||
static void __init mmio_select_mitigation(void);
|
||||
static void __init srbds_select_mitigation(void);
|
||||
static void __init l1d_flush_select_mitigation(void);
|
||||
|
||||
@ -85,6 +87,10 @@ EXPORT_SYMBOL_GPL(mds_idle_clear);
|
||||
*/
|
||||
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
|
||||
|
||||
/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
|
||||
DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
|
||||
EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
|
||||
|
||||
void __init check_bugs(void)
|
||||
{
|
||||
identify_boot_cpu();
|
||||
@ -117,17 +123,10 @@ void __init check_bugs(void)
|
||||
spectre_v2_select_mitigation();
|
||||
ssb_select_mitigation();
|
||||
l1tf_select_mitigation();
|
||||
mds_select_mitigation();
|
||||
taa_select_mitigation();
|
||||
md_clear_select_mitigation();
|
||||
srbds_select_mitigation();
|
||||
l1d_flush_select_mitigation();
|
||||
|
||||
/*
|
||||
* As MDS and TAA mitigations are inter-related, print MDS
|
||||
* mitigation until after TAA mitigation selection is done.
|
||||
*/
|
||||
mds_print_mitigation();
|
||||
|
||||
arch_smt_update();
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
@ -267,14 +266,6 @@ static void __init mds_select_mitigation(void)
|
||||
}
|
||||
}
|
||||
|
||||
static void __init mds_print_mitigation(void)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
|
||||
return;
|
||||
|
||||
pr_info("%s\n", mds_strings[mds_mitigation]);
|
||||
}
|
||||
|
||||
static int __init mds_cmdline(char *str)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_MDS))
|
||||
@ -329,7 +320,7 @@ static void __init taa_select_mitigation(void)
|
||||
/* TSX previously disabled by tsx=off */
|
||||
if (!boot_cpu_has(X86_FEATURE_RTM)) {
|
||||
taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
|
||||
goto out;
|
||||
return;
|
||||
}
|
||||
|
||||
if (cpu_mitigations_off()) {
|
||||
@ -343,7 +334,7 @@ static void __init taa_select_mitigation(void)
|
||||
*/
|
||||
if (taa_mitigation == TAA_MITIGATION_OFF &&
|
||||
mds_mitigation == MDS_MITIGATION_OFF)
|
||||
goto out;
|
||||
return;
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
|
||||
taa_mitigation = TAA_MITIGATION_VERW;
|
||||
@ -375,18 +366,6 @@ static void __init taa_select_mitigation(void)
|
||||
|
||||
if (taa_nosmt || cpu_mitigations_auto_nosmt())
|
||||
cpu_smt_disable(false);
|
||||
|
||||
/*
|
||||
* Update MDS mitigation, if necessary, as the mds_user_clear is
|
||||
* now enabled for TAA mitigation.
|
||||
*/
|
||||
if (mds_mitigation == MDS_MITIGATION_OFF &&
|
||||
boot_cpu_has_bug(X86_BUG_MDS)) {
|
||||
mds_mitigation = MDS_MITIGATION_FULL;
|
||||
mds_select_mitigation();
|
||||
}
|
||||
out:
|
||||
pr_info("%s\n", taa_strings[taa_mitigation]);
|
||||
}
|
||||
|
||||
static int __init tsx_async_abort_parse_cmdline(char *str)
|
||||
@ -410,6 +389,151 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
|
||||
}
|
||||
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
|
||||
|
||||
enum mmio_mitigations {
|
||||
MMIO_MITIGATION_OFF,
|
||||
MMIO_MITIGATION_UCODE_NEEDED,
|
||||
MMIO_MITIGATION_VERW,
|
||||
};
|
||||
|
||||
/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
|
||||
static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
|
||||
static bool mmio_nosmt __ro_after_init = false;
|
||||
|
||||
static const char * const mmio_strings[] = {
|
||||
[MMIO_MITIGATION_OFF] = "Vulnerable",
|
||||
[MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
|
||||
[MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
|
||||
};
|
||||
|
||||
/*
 * Pick the Processor MMIO Stale Data mitigation.
 *
 * The mitigation is forced off when the CPU is not affected or when
 * mitigations are globally disabled. Otherwise the relevant buffer-clearing
 * static keys are enabled, the final state is derived from the available
 * microcode support, and SMT is disabled if that was requested.
 */
static void __init mmio_select_mitigation(void)
{
	bool clear_on_user_exit, ucode_ok;
	u64 arch_cap;

	if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
	    cpu_mitigations_off())
		mmio_mitigation = MMIO_MITIGATION_OFF;

	/* Either forced off above or turned off on the command line. */
	if (mmio_mitigation == MMIO_MITIGATION_OFF)
		return;

	arch_cap = x86_read_arch_cap_msr();

	/*
	 * CPUs that are additionally affected by MDS or TAA get the buffer
	 * clear for both host and VMM. All others only need it for the VMM.
	 */
	clear_on_user_exit = boot_cpu_has_bug(X86_BUG_MDS) ||
			     (boot_cpu_has_bug(X86_BUG_TAA) &&
			      boot_cpu_has(X86_FEATURE_RTM));
	if (clear_on_user_exit)
		static_branch_enable(&mds_user_clear);
	else
		static_branch_enable(&mmio_stale_data_clear);

	/*
	 * When Fill Buffer data can reach the uncore buffers, the Fill
	 * buffers have to be cleared on idle no matter what the SMT state is.
	 */
	if (!(arch_cap & ARCH_CAP_FBSDP_NO))
		static_branch_enable(&mds_idle_clear);

	/*
	 * Microcode check: Fill buffer clearing is enumerated either by an
	 * explicit FB_CLEAR, or, on MDS affected systems, by MD_CLEAR and
	 * L1D_FLUSH both being present.
	 */
	ucode_ok = (arch_cap & ARCH_CAP_FB_CLEAR) ||
		   (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
		    boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
		    !(arch_cap & ARCH_CAP_MDS_NO));

	mmio_mitigation = ucode_ok ? MMIO_MITIGATION_VERW :
				     MMIO_MITIGATION_UCODE_NEEDED;

	if (mmio_nosmt || cpu_mitigations_auto_nosmt())
		cpu_smt_disable(false);
}
|
||||
|
||||
/*
 * Parse the "mmio_stale_data=" early parameter.
 *
 * Accepted values are "off", "full" and "full,nosmt"; any other value is
 * ignored. The option has no effect on CPUs without the MMIO Stale Data bug.
 *
 * Returns 0, or -EINVAL when no value was supplied on an affected CPU.
 */
static int __init mmio_stale_data_parse_cmdline(char *str)
{
	bool full_nosmt;

	if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
		return 0;

	if (!str)
		return -EINVAL;

	if (strcmp(str, "off") == 0) {
		mmio_mitigation = MMIO_MITIGATION_OFF;
		return 0;
	}

	full_nosmt = strcmp(str, "full,nosmt") == 0;
	if (full_nosmt || strcmp(str, "full") == 0) {
		mmio_mitigation = MMIO_MITIGATION_VERW;
		/* Only the ",nosmt" variant touches the SMT request. */
		if (full_nosmt)
			mmio_nosmt = true;
	}

	return 0;
}
|
||||
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "" fmt
|
||||
|
||||
/*
 * Reconcile the MDS, TAA and MMIO Stale Data mitigations once each has been
 * selected, then print the resulting state of every bug the CPU has.
 *
 * Does nothing at all when mitigations are globally off.
 */
static void __init md_clear_update_mitigation(void)
{
	if (cpu_mitigations_off())
		return;

	/*
	 * With mds_user_clear enabled, a mitigation that is still marked off
	 * on an affected CPU is switched on and its selection is run again.
	 */
	if (static_key_enabled(&mds_user_clear)) {
		if (boot_cpu_has_bug(X86_BUG_MDS) &&
		    mds_mitigation == MDS_MITIGATION_OFF) {
			mds_mitigation = MDS_MITIGATION_FULL;
			mds_select_mitigation();
		}

		if (boot_cpu_has_bug(X86_BUG_TAA) &&
		    taa_mitigation == TAA_MITIGATION_OFF) {
			taa_mitigation = TAA_MITIGATION_VERW;
			taa_select_mitigation();
		}

		if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) &&
		    mmio_mitigation == MMIO_MITIGATION_OFF) {
			mmio_mitigation = MMIO_MITIGATION_VERW;
			mmio_select_mitigation();
		}
	}

	/* Report the final state for each bug present on this CPU. */
	if (boot_cpu_has_bug(X86_BUG_MDS))
		pr_info("MDS: %s\n", mds_strings[mds_mitigation]);

	if (boot_cpu_has_bug(X86_BUG_TAA))
		pr_info("TAA: %s\n", taa_strings[taa_mitigation]);

	if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
		pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
}
|
||||
|
||||
/*
 * Run mitigation selection for MDS, TAA and MMIO Stale Data, in that order.
 *
 * The three mitigations depend on one another, so updating and printing
 * them is deferred to md_clear_update_mitigation(), which runs only after
 * all three selections have completed.
 */
static void __init md_clear_select_mitigation(void)
{
	mds_select_mitigation();
	taa_select_mitigation();
	mmio_select_mitigation();

	md_clear_update_mitigation();
}
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "SRBDS: " fmt
|
||||
|
||||
@ -478,11 +602,13 @@ static void __init srbds_select_mitigation(void)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Check to see if this is one of the MDS_NO systems supporting
|
||||
* TSX that are only exposed to SRBDS when TSX is enabled.
|
||||
* Check to see if this is one of the MDS_NO systems supporting TSX that
|
||||
* are only exposed to SRBDS when TSX is enabled or when CPU is affected
|
||||
* by Processor MMIO Stale Data vulnerability.
|
||||
*/
|
||||
ia32_cap = x86_read_arch_cap_msr();
|
||||
if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
|
||||
if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
|
||||
!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
|
||||
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
|
||||
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
|
||||
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
|
||||
@ -1116,6 +1242,8 @@ static void update_indir_branch_cond(void)
|
||||
/* Update the static key controlling the MDS CPU buffer clear in idle */
|
||||
static void update_mds_branch_idle(void)
|
||||
{
|
||||
u64 ia32_cap = x86_read_arch_cap_msr();
|
||||
|
||||
/*
|
||||
* Enable the idle clearing if SMT is active on CPUs which are
|
||||
* affected only by MSBDS and not any other MDS variant.
|
||||
@ -1127,14 +1255,17 @@ static void update_mds_branch_idle(void)
|
||||
if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
|
||||
return;
|
||||
|
||||
if (sched_smt_active())
|
||||
if (sched_smt_active()) {
|
||||
static_branch_enable(&mds_idle_clear);
|
||||
else
|
||||
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
|
||||
(ia32_cap & ARCH_CAP_FBSDP_NO)) {
|
||||
static_branch_disable(&mds_idle_clear);
|
||||
}
|
||||
}
|
||||
|
||||
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
|
||||
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
|
||||
#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
|
||||
|
||||
void cpu_bugs_smt_update(void)
|
||||
{
|
||||
@ -1179,6 +1310,16 @@ void cpu_bugs_smt_update(void)
|
||||
break;
|
||||
}
|
||||
|
||||
switch (mmio_mitigation) {
|
||||
case MMIO_MITIGATION_VERW:
|
||||
case MMIO_MITIGATION_UCODE_NEEDED:
|
||||
if (sched_smt_active())
|
||||
pr_warn_once(MMIO_MSG_SMT);
|
||||
break;
|
||||
case MMIO_MITIGATION_OFF:
|
||||
break;
|
||||
}
|
||||
|
||||
mutex_unlock(&spec_ctrl_mutex);
|
||||
}
|
||||
|
||||
@ -1781,6 +1922,20 @@ static ssize_t tsx_async_abort_show_state(char *buf)
|
||||
sched_smt_active() ? "vulnerable" : "disabled");
|
||||
}
|
||||
|
||||
static ssize_t mmio_stale_data_show_state(char *buf)
|
||||
{
|
||||
if (mmio_mitigation == MMIO_MITIGATION_OFF)
|
||||
return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
|
||||
return sysfs_emit(buf, "%s; SMT Host state unknown\n",
|
||||
mmio_strings[mmio_mitigation]);
|
||||
}
|
||||
|
||||
return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
|
||||
sched_smt_active() ? "vulnerable" : "disabled");
|
||||
}
|
||||
|
||||
static char *stibp_state(void)
|
||||
{
|
||||
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
|
||||
@ -1881,6 +2036,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
|
||||
case X86_BUG_SRBDS:
|
||||
return srbds_show_state(buf);
|
||||
|
||||
case X86_BUG_MMIO_STALE_DATA:
|
||||
return mmio_stale_data_show_state(buf);
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -1932,4 +2090,9 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *
|
||||
{
|
||||
return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
|
||||
}
|
||||
|
||||
ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
|
||||
{
|
||||
return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
|
||||
}
|
||||
#endif
|
||||
|
@ -1211,18 +1211,42 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
|
||||
X86_FEATURE_ANY, issues)
|
||||
|
||||
#define SRBDS BIT(0)
|
||||
/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
|
||||
#define MMIO BIT(1)
|
||||
/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
|
||||
#define MMIO_SBDS BIT(2)
|
||||
|
||||
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
|
||||
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
|
||||
BIT(7) | BIT(0xB), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS),
|
||||
VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
|
||||
VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
|
||||
VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
|
||||
VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
|
||||
VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
|
||||
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
|
||||
VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
|
||||
{}
|
||||
};
|
||||
|
||||
@ -1243,6 +1267,13 @@ u64 x86_read_arch_cap_msr(void)
|
||||
return ia32_cap;
|
||||
}
|
||||
|
||||
/*
 * Return true only when @ia32_cap has all three of FBSDP_NO, PSDP_NO and
 * SBDR_SSDP_NO set; a single missing bit means "not immune".
 */
static bool arch_cap_mmio_immune(u64 ia32_cap)
{
	if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
		return false;

	if (!(ia32_cap & ARCH_CAP_PSDP_NO))
		return false;

	return !!(ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
}
|
||||
|
||||
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
|
||||
{
|
||||
u64 ia32_cap = x86_read_arch_cap_msr();
|
||||
@ -1296,12 +1327,27 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
|
||||
/*
|
||||
* SRBDS affects CPUs which support RDRAND or RDSEED and are listed
|
||||
* in the vulnerability blacklist.
|
||||
*
|
||||
* Some of the implications and mitigation of Shared Buffers Data
|
||||
* Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
|
||||
* SRBDS.
|
||||
*/
|
||||
if ((cpu_has(c, X86_FEATURE_RDRAND) ||
|
||||
cpu_has(c, X86_FEATURE_RDSEED)) &&
|
||||
cpu_matches(cpu_vuln_blacklist, SRBDS))
|
||||
cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
|
||||
setup_force_cpu_bug(X86_BUG_SRBDS);
|
||||
|
||||
/*
|
||||
* Processor MMIO Stale Data bug enumeration
|
||||
*
|
||||
* Affected CPU list is generally enough to enumerate the vulnerability,
|
||||
* but for virtualization case check for ARCH_CAP MSR bits also, VMM may
|
||||
* not want the guest to enumerate the bug.
|
||||
*/
|
||||
if (cpu_matches(cpu_vuln_blacklist, MMIO) &&
|
||||
!arch_cap_mmio_immune(ia32_cap))
|
||||
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
|
||||
|
||||
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
|
||||
return;
|
||||
|
||||
|
@ -175,6 +175,7 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
|
||||
|
||||
jmp ftrace_epilogue
|
||||
SYM_FUNC_END(ftrace_caller);
|
||||
STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
|
||||
|
||||
SYM_FUNC_START(ftrace_epilogue)
|
||||
/*
|
||||
@ -282,6 +283,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
|
||||
jmp ftrace_epilogue
|
||||
|
||||
SYM_FUNC_END(ftrace_regs_caller)
|
||||
STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
|
||||
|
||||
|
||||
#else /* ! CONFIG_DYNAMIC_FTRACE */
|
||||
@ -311,10 +313,14 @@ trace:
|
||||
jmp ftrace_stub
|
||||
SYM_FUNC_END(__fentry__)
|
||||
EXPORT_SYMBOL(__fentry__)
|
||||
STACK_FRAME_NON_STANDARD_FP(__fentry__)
|
||||
|
||||
#endif /* CONFIG_DYNAMIC_FTRACE */
|
||||
|
||||
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
||||
SYM_FUNC_START(return_to_handler)
|
||||
SYM_CODE_START(return_to_handler)
|
||||
UNWIND_HINT_EMPTY
|
||||
ANNOTATE_NOENDBR
|
||||
subq $16, %rsp
|
||||
|
||||
/* Save the return values */
|
||||
@ -339,7 +345,6 @@ SYM_FUNC_START(return_to_handler)
|
||||
int3
|
||||
.Ldo_rop:
|
||||
mov %rdi, (%rsp)
|
||||
UNWIND_HINT_FUNC
|
||||
RET
|
||||
SYM_FUNC_END(return_to_handler)
|
||||
SYM_CODE_END(return_to_handler)
|
||||
#endif
|
||||
|
@ -1,7 +1,8 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <linux/dev_printk.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/printk.h>
|
||||
#include <asm/e820/api.h>
|
||||
#include <asm/pci_x86.h>
|
||||
|
||||
static void resource_clip(struct resource *res, resource_size_t start,
|
||||
resource_size_t end)
|
||||
@ -24,14 +25,14 @@ static void resource_clip(struct resource *res, resource_size_t start,
|
||||
res->start = end + 1;
|
||||
}
|
||||
|
||||
void remove_e820_regions(struct device *dev, struct resource *avail)
|
||||
static void remove_e820_regions(struct resource *avail)
|
||||
{
|
||||
int i;
|
||||
struct e820_entry *entry;
|
||||
u64 e820_start, e820_end;
|
||||
struct resource orig = *avail;
|
||||
|
||||
if (!(avail->flags & IORESOURCE_MEM))
|
||||
if (!pci_use_e820)
|
||||
return;
|
||||
|
||||
for (i = 0; i < e820_table->nr_entries; i++) {
|
||||
@ -41,7 +42,7 @@ void remove_e820_regions(struct device *dev, struct resource *avail)
|
||||
|
||||
resource_clip(avail, e820_start, e820_end);
|
||||
if (orig.start != avail->start || orig.end != avail->end) {
|
||||
dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
|
||||
pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
|
||||
&orig, avail, e820_start, e820_end);
|
||||
orig = *avail;
|
||||
}
|
||||
@ -55,6 +56,9 @@ void arch_remove_reservations(struct resource *avail)
|
||||
* the low 1MB unconditionally, as this area is needed for some ISA
|
||||
* cards requiring a memory range, e.g. the i82365 PCMCIA controller.
|
||||
*/
|
||||
if (avail->flags & IORESOURCE_MEM)
|
||||
if (avail->flags & IORESOURCE_MEM) {
|
||||
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
|
||||
|
||||
remove_e820_regions(avail);
|
||||
}
|
||||
}
|
||||
|
@ -67,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536);
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Range of the BSS area. The size of the BSS area is determined
|
||||
* at link time, with RESERVE_BRK() facility reserving additional
|
||||
* chunks.
|
||||
*/
|
||||
unsigned long _brk_start = (unsigned long)__brk_base;
|
||||
unsigned long _brk_end = (unsigned long)__brk_base;
|
||||
|
||||
|
@ -385,10 +385,10 @@ SECTIONS
|
||||
__end_of_kernel_reserve = .;
|
||||
|
||||
. = ALIGN(PAGE_SIZE);
|
||||
.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
|
||||
.brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) {
|
||||
__brk_base = .;
|
||||
. += 64 * 1024; /* 64k alignment slop space */
|
||||
*(.brk_reservation) /* areas brk users have reserved */
|
||||
*(.bss..brk) /* areas brk users have reserved */
|
||||
__brk_limit = .;
|
||||
}
|
||||
|
||||
|
@ -2039,6 +2039,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
|
||||
}
|
||||
}
|
||||
|
||||
static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
|
||||
{
|
||||
struct kvm *kvm = apic->vcpu->kvm;
|
||||
|
||||
if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
|
||||
return;
|
||||
|
||||
if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
|
||||
return;
|
||||
|
||||
kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
|
||||
}
|
||||
|
||||
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
||||
{
|
||||
int ret = 0;
|
||||
@ -2047,10 +2060,12 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
|
||||
|
||||
switch (reg) {
|
||||
case APIC_ID: /* Local APIC ID */
|
||||
if (!apic_x2apic_mode(apic))
|
||||
if (!apic_x2apic_mode(apic)) {
|
||||
kvm_apic_set_xapic_id(apic, val >> 24);
|
||||
else
|
||||
kvm_lapic_xapic_id_updated(apic);
|
||||
} else {
|
||||
ret = 1;
|
||||
}
|
||||
break;
|
||||
|
||||
case APIC_TASKPRI:
|
||||
@ -2336,8 +2351,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
|
||||
MSR_IA32_APICBASE_BASE;
|
||||
|
||||
if ((value & MSR_IA32_APICBASE_ENABLE) &&
|
||||
apic->base_address != APIC_DEFAULT_PHYS_BASE)
|
||||
pr_warn_once("APIC base relocation is unsupported by KVM");
|
||||
apic->base_address != APIC_DEFAULT_PHYS_BASE) {
|
||||
kvm_set_apicv_inhibit(apic->vcpu->kvm,
|
||||
APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
|
||||
}
|
||||
}
|
||||
|
||||
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
|
||||
@ -2648,6 +2665,8 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
|
||||
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
|
||||
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
|
||||
}
|
||||
} else {
|
||||
kvm_lapic_xapic_id_updated(vcpu->arch.apic);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -3411,7 +3411,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
|
||||
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
|
||||
i << 30, PT32_ROOT_LEVEL, true);
|
||||
mmu->pae_root[i] = root | PT_PRESENT_MASK |
|
||||
shadow_me_mask;
|
||||
shadow_me_value;
|
||||
}
|
||||
mmu->root.hpa = __pa(mmu->pae_root);
|
||||
} else {
|
||||
|
@ -291,58 +291,91 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
|
||||
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
|
||||
u32 icrl, u32 icrh, u32 index)
|
||||
{
|
||||
u32 dest, apic_id;
|
||||
struct kvm_vcpu *vcpu;
|
||||
u32 l1_physical_id, dest;
|
||||
struct kvm_vcpu *target_vcpu;
|
||||
int dest_mode = icrl & APIC_DEST_MASK;
|
||||
int shorthand = icrl & APIC_SHORT_MASK;
|
||||
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
|
||||
u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
|
||||
|
||||
if (shorthand != APIC_DEST_NOSHORT)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* The AVIC incomplete IPI #vmexit info provides index into
|
||||
* the physical APIC ID table, which can be used to derive
|
||||
* guest physical APIC ID.
|
||||
*/
|
||||
if (dest_mode == APIC_DEST_PHYSICAL) {
|
||||
apic_id = index;
|
||||
} else {
|
||||
if (!apic_x2apic_mode(source)) {
|
||||
/* For xAPIC logical mode, the index is for logical APIC table. */
|
||||
apic_id = avic_logical_id_table[index] & 0x1ff;
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Assuming vcpu ID is the same as physical apic ID,
|
||||
* and use it to retrieve the target vCPU.
|
||||
*/
|
||||
vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
|
||||
if (!vcpu)
|
||||
return -EINVAL;
|
||||
|
||||
if (apic_x2apic_mode(vcpu->arch.apic))
|
||||
if (apic_x2apic_mode(source))
|
||||
dest = icrh;
|
||||
else
|
||||
dest = GET_APIC_DEST_FIELD(icrh);
|
||||
|
||||
/*
|
||||
* Try matching the destination APIC ID with the vCPU.
|
||||
*/
|
||||
if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
|
||||
vcpu->arch.apic->irr_pending = true;
|
||||
svm_complete_interrupt_delivery(vcpu,
|
||||
icrl & APIC_MODE_MASK,
|
||||
icrl & APIC_INT_LEVELTRIG,
|
||||
icrl & APIC_VECTOR_MASK);
|
||||
return 0;
|
||||
if (dest_mode == APIC_DEST_PHYSICAL) {
|
||||
/* broadcast destination, use slow path */
|
||||
if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
|
||||
return -EINVAL;
|
||||
if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
|
||||
return -EINVAL;
|
||||
|
||||
l1_physical_id = dest;
|
||||
|
||||
if (WARN_ON_ONCE(l1_physical_id != index))
|
||||
return -EINVAL;
|
||||
|
||||
} else {
|
||||
u32 bitmap, cluster;
|
||||
int logid_index;
|
||||
|
||||
if (apic_x2apic_mode(source)) {
|
||||
/* 16 bit dest mask, 16 bit cluster id */
|
||||
bitmap = dest & 0xFFFF0000;
|
||||
cluster = (dest >> 16) << 4;
|
||||
} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
|
||||
/* 8 bit dest mask*/
|
||||
bitmap = dest;
|
||||
cluster = 0;
|
||||
} else {
|
||||
/* 4 bit dest mask, 4 bit cluster id */
|
||||
bitmap = dest & 0xF;
|
||||
cluster = (dest >> 4) << 2;
|
||||
}
|
||||
|
||||
if (unlikely(!bitmap))
|
||||
/* guest bug: nobody to send the logical interrupt to */
|
||||
return 0;
|
||||
|
||||
if (!is_power_of_2(bitmap))
|
||||
/* multiple logical destinations, use slow path */
|
||||
return -EINVAL;
|
||||
|
||||
logid_index = cluster + __ffs(bitmap);
|
||||
|
||||
if (apic_x2apic_mode(source)) {
|
||||
l1_physical_id = logid_index;
|
||||
} else {
|
||||
u32 *avic_logical_id_table =
|
||||
page_address(kvm_svm->avic_logical_id_table_page);
|
||||
|
||||
u32 logid_entry = avic_logical_id_table[logid_index];
|
||||
|
||||
if (WARN_ON_ONCE(index != logid_index))
|
||||
return -EINVAL;
|
||||
|
||||
/* guest bug: non existing/reserved logical destination */
|
||||
if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
|
||||
return 0;
|
||||
|
||||
l1_physical_id = logid_entry &
|
||||
AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
|
||||
if (unlikely(!target_vcpu))
|
||||
/* guest bug: non existing vCPU is a target of this IPI*/
|
||||
return 0;
|
||||
|
||||
target_vcpu->arch.apic->irr_pending = true;
|
||||
svm_complete_interrupt_delivery(target_vcpu,
|
||||
icrl & APIC_MODE_MASK,
|
||||
icrl & APIC_INT_LEVELTRIG,
|
||||
icrl & APIC_VECTOR_MASK);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
|
||||
@ -508,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u64 *old, *new;
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
u32 id = kvm_xapic_id(vcpu->arch.apic);
|
||||
|
||||
if (vcpu->vcpu_id == id)
|
||||
return 0;
|
||||
|
||||
old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
|
||||
new = avic_get_physical_id_entry(vcpu, id);
|
||||
if (!new || !old)
|
||||
return 1;
|
||||
|
||||
/* We need to move physical_id_entry to new offset */
|
||||
*new = *old;
|
||||
*old = 0ULL;
|
||||
to_svm(vcpu)->avic_physical_id_cache = new;
|
||||
|
||||
/*
|
||||
* Also update the guest physical APIC ID in the logical
|
||||
* APIC ID table entry if already setup the LDR.
|
||||
*/
|
||||
if (svm->ldr_reg)
|
||||
avic_handle_ldr_update(vcpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
@ -555,10 +559,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
|
||||
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
|
||||
|
||||
switch (offset) {
|
||||
case APIC_ID:
|
||||
if (avic_handle_apic_id_update(vcpu))
|
||||
return 0;
|
||||
break;
|
||||
case APIC_LDR:
|
||||
if (avic_handle_ldr_update(vcpu))
|
||||
return 0;
|
||||
@ -650,8 +650,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
|
||||
|
||||
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (avic_handle_apic_id_update(vcpu) != 0)
|
||||
return;
|
||||
avic_handle_dfr_update(vcpu);
|
||||
avic_handle_ldr_update(vcpu);
|
||||
}
|
||||
@ -910,7 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
|
||||
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
|
||||
BIT(APICV_INHIBIT_REASON_X2APIC) |
|
||||
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
|
||||
BIT(APICV_INHIBIT_REASON_SEV);
|
||||
BIT(APICV_INHIBIT_REASON_SEV) |
|
||||
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
|
||||
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
|
||||
|
||||
return supported & BIT(reason);
|
||||
}
|
||||
@ -946,7 +946,7 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
{
|
||||
u64 entry;
|
||||
int h_physical_id = kvm_cpu_get_apicid(cpu);
|
||||
@ -978,7 +978,7 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
|
||||
}
|
||||
|
||||
void __avic_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
void avic_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u64 entry;
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
@ -997,25 +997,6 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
|
||||
}
|
||||
|
||||
static void avic_vcpu_load(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
int cpu = get_cpu();
|
||||
|
||||
WARN_ON(cpu != vcpu->cpu);
|
||||
|
||||
__avic_vcpu_load(vcpu, cpu);
|
||||
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
/* Wrapper that calls __avic_vcpu_put() with preemption disabled. */
static void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	__avic_vcpu_put(vcpu);
	preempt_enable();
}
|
||||
|
||||
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -1042,7 +1023,7 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
|
||||
vmcb_mark_dirty(vmcb, VMCB_AVIC);
|
||||
|
||||
if (activated)
|
||||
avic_vcpu_load(vcpu);
|
||||
avic_vcpu_load(vcpu, vcpu->cpu);
|
||||
else
|
||||
avic_vcpu_put(vcpu);
|
||||
|
||||
@ -1075,5 +1056,5 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
|
||||
if (!kvm_vcpu_apicv_active(vcpu))
|
||||
return;
|
||||
|
||||
avic_vcpu_load(vcpu);
|
||||
avic_vcpu_load(vcpu, vcpu->cpu);
|
||||
}
|
||||
|
@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
|
||||
struct kvm_vcpu *vcpu = &svm->vcpu;
|
||||
struct vmcb *vmcb01 = svm->vmcb01.ptr;
|
||||
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
|
||||
u32 pause_count12;
|
||||
u32 pause_thresh12;
|
||||
|
||||
/*
|
||||
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
|
||||
@ -671,27 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
|
||||
if (!nested_vmcb_needs_vls_intercept(svm))
|
||||
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
|
||||
|
||||
pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
|
||||
pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
|
||||
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
|
||||
/* use guest values since host doesn't use them */
|
||||
vmcb02->control.pause_filter_count =
|
||||
svm->pause_filter_enabled ?
|
||||
svm->nested.ctl.pause_filter_count : 0;
|
||||
/* use guest values since host doesn't intercept PAUSE */
|
||||
vmcb02->control.pause_filter_count = pause_count12;
|
||||
vmcb02->control.pause_filter_thresh = pause_thresh12;
|
||||
|
||||
vmcb02->control.pause_filter_thresh =
|
||||
svm->pause_threshold_enabled ?
|
||||
svm->nested.ctl.pause_filter_thresh : 0;
|
||||
|
||||
} else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
|
||||
/* use host values when guest doesn't use them */
|
||||
} else {
|
||||
/* start from host values otherwise */
|
||||
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
|
||||
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
|
||||
} else {
|
||||
/*
|
||||
* Intercept every PAUSE otherwise and
|
||||
* ignore both host and guest values
|
||||
*/
|
||||
vmcb02->control.pause_filter_count = 0;
|
||||
vmcb02->control.pause_filter_thresh = 0;
|
||||
|
||||
/* ... but ensure filtering is disabled if so requested. */
|
||||
if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
|
||||
if (!pause_count12)
|
||||
vmcb02->control.pause_filter_count = 0;
|
||||
if (!pause_thresh12)
|
||||
vmcb02->control.pause_filter_thresh = 0;
|
||||
}
|
||||
}
|
||||
|
||||
nested_svm_transition_tlb_flush(vcpu);
|
||||
@ -951,8 +951,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
|
||||
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
|
||||
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
|
||||
|
||||
if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
|
||||
if (!kvm_pause_in_guest(vcpu->kvm)) {
|
||||
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
|
||||
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
|
||||
|
||||
}
|
||||
|
||||
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
|
||||
|
||||
|
@ -921,7 +921,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
|
||||
struct vmcb_control_area *control = &svm->vmcb->control;
|
||||
int old = control->pause_filter_count;
|
||||
|
||||
if (kvm_pause_in_guest(vcpu->kvm) || !old)
|
||||
if (kvm_pause_in_guest(vcpu->kvm))
|
||||
return;
|
||||
|
||||
control->pause_filter_count = __grow_ple_window(old,
|
||||
@ -942,7 +942,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
|
||||
struct vmcb_control_area *control = &svm->vmcb->control;
|
||||
int old = control->pause_filter_count;
|
||||
|
||||
if (kvm_pause_in_guest(vcpu->kvm) || !old)
|
||||
if (kvm_pause_in_guest(vcpu->kvm))
|
||||
return;
|
||||
|
||||
control->pause_filter_count =
|
||||
@ -1400,13 +1400,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
indirect_branch_prediction_barrier();
|
||||
}
|
||||
if (kvm_vcpu_apicv_active(vcpu))
|
||||
__avic_vcpu_load(vcpu, cpu);
|
||||
avic_vcpu_load(vcpu, cpu);
|
||||
}
|
||||
|
||||
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (kvm_vcpu_apicv_active(vcpu))
|
||||
__avic_vcpu_put(vcpu);
|
||||
avic_vcpu_put(vcpu);
|
||||
|
||||
svm_prepare_host_switch(vcpu);
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user