diff options
299 files changed, 26193 insertions, 4648 deletions
diff --git a/Documentation/devicetree/bindings/dma/dma.txt b/Documentation/devicetree/bindings/dma/dma.txt index 82104271e754..6312fb00ce8d 100644 --- a/Documentation/devicetree/bindings/dma/dma.txt +++ b/Documentation/devicetree/bindings/dma/dma.txt @@ -31,6 +31,34 @@ Example: dma-requests = <127>; }; +* DMA router + +DMA routers are transparent IP blocks used to route DMA request lines from +devices to the DMA controller. Some SoCs (like TI DRA7x) have more peripherals +integrated with DMA requests than what the DMA controller can handle directly. + +Required property: +- dma-masters: phandle of the DMA controller or list of phandles for + the DMA controllers the router can direct the signal to. +- #dma-cells: Must be at least 1. Used to provide DMA router specific + information. See DMA client binding below for more + details. + +Optional properties: +- dma-requests: Number of incoming request lines the router can handle. +- In the node pointed by the dma-masters: + - dma-requests: The router driver might need to look for this in order + to configure the routing. + +Example: + sdma_xbar: dma-router@4a002b78 { + compatible = "ti,dra7-dma-crossbar"; + reg = <0x4a002b78 0xfc>; + #dma-cells = <1>; + dma-requests = <205>; + ti,dma-safe-map = <0>; + dma-masters = <&sdma>; + }; * DMA client diff --git a/Documentation/devicetree/bindings/dma/mv-xor.txt b/Documentation/devicetree/bindings/dma/mv-xor.txt index 7c6cb7fcecd2..cc29c35266e2 100644 --- a/Documentation/devicetree/bindings/dma/mv-xor.txt +++ b/Documentation/devicetree/bindings/dma/mv-xor.txt @@ -1,7 +1,7 @@ * Marvell XOR engines Required properties: -- compatible: Should be "marvell,orion-xor" +- compatible: Should be "marvell,orion-xor" or "marvell,armada-380-xor" - reg: Should contain registers location and length (two sets) the first set is the low registers, the second set the high registers for the XOR engine. 
diff --git a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt index ecbc96ad36f8..ccd52d6a231a 100644 --- a/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt +++ b/Documentation/devicetree/bindings/dma/sirfsoc-dma.txt @@ -3,7 +3,8 @@ See dma.txt first Required properties: -- compatible: Should be "sirf,prima2-dmac" or "sirf,marco-dmac" +- compatible: Should be "sirf,prima2-dmac", "sirf,atlas7-dmac" or + "sirf,atlas7-dmac-v2" - reg: Should contain DMA registers location and length. - interrupts: Should contain one interrupt shared by all channel - #dma-cells: must be <1>. used to represent the number of integer diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt index 9cdcba24d7c3..d13c136cef8c 100644 --- a/Documentation/devicetree/bindings/dma/sun6i-dma.txt +++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt @@ -4,7 +4,10 @@ This driver follows the generic DMA bindings defined in dma.txt. Required properties: -- compatible: Must be "allwinner,sun6i-a31-dma" or "allwinner,sun8i-a23-dma" +- compatible: Must be one of + "allwinner,sun6i-a31-dma" + "allwinner,sun8i-a23-dma" + "allwinner,sun8i-h3-dma" - reg: Should contain the registers base address and length - interrupts: Should contain a reference to the interrupt used by this device - clocks: Should contain a reference to the parent AHB clock diff --git a/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt new file mode 100644 index 000000000000..63a48928f3a8 --- /dev/null +++ b/Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt @@ -0,0 +1,52 @@ +Texas Instruments DMA Crossbar (DMA request router) + +Required properties: +- compatible: "ti,dra7-dma-crossbar" for DRA7xx DMA crossbar +- reg: Memory map for accessing module +- #dma-cells: Should be set to <1>. 
+ Clients should use the crossbar request number (input) +- dma-requests: Number of DMA requests the crossbar can receive +- dma-masters: phandle pointing to the DMA controller + +The DMA controller node needs to have the following properties: +- dma-requests: Number of DMA requests the controller can handle + +Optional properties: +- ti,dma-safe-map: Safe routing value for unused request lines + +Example: + +/* DMA controller */ +sdma: dma-controller@4a056000 { + compatible = "ti,omap4430-sdma"; + reg = <0x4a056000 0x1000>; + interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>; + #dma-cells = <1>; + dma-channels = <32>; + dma-requests = <127>; +}; + +/* DMA crossbar */ +sdma_xbar: dma-router@4a002b78 { + compatible = "ti,dra7-dma-crossbar"; + reg = <0x4a002b78 0xfc>; + #dma-cells = <1>; + dma-requests = <205>; + ti,dma-safe-map = <0>; + dma-masters = <&sdma>; +}; + +/* DMA client */ +uart1: serial@4806a000 { + compatible = "ti,omap4-uart"; + reg = <0x4806a000 0x100>; + interrupts-extended = <&gic GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; + ti,hwmods = "uart1"; + clock-frequency = <48000000>; + status = "disabled"; + dmas = <&sdma_xbar 49>, <&sdma_xbar 50>; + dma-names = "tx", "rx"; +}; diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt index 05d2280190f1..ca67b0f04c6e 100644 --- a/Documentation/dmaengine/provider.txt +++ b/Documentation/dmaengine/provider.txt @@ -345,11 +345,12 @@ where to put them) that abstracts it away. * DMA_CTRL_ACK - - Undocumented feature - - No one really has an idea of what it's about, besides being - related to reusing the DMA transaction descriptors or having - additional transactions added to it in the async-tx API - - Useless in the case of the slave API + - If set, the transfer can be reused after being completed. + - There is a guarantee the transfer won't be freed until it is acked + by async_tx_ack(). 
+ - As a consequence, if a device driver wants to skip the dma_map_sg() and + dma_unmap_sg() in between 2 transfers, because the DMA'd data wasn't used, + it can resubmit the transfer right after its completion. General Design Notes -------------------- diff --git a/Documentation/dmaengine/pxa_dma.txt b/Documentation/dmaengine/pxa_dma.txt new file mode 100644 index 000000000000..413ef9cfaa4d --- /dev/null +++ b/Documentation/dmaengine/pxa_dma.txt @@ -0,0 +1,153 @@ +PXA/MMP - DMA Slave controller +============================== + +Constraints +----------- + a) Transfers hot queuing + A driver submitting a transfer and issuing it should be granted the transfer + is queued even on a running DMA channel. + This implies that the queuing doesn't wait for the previous transfer end, + and that the descriptor chaining is not only done in the irq/tasklet code + triggered by the end of the transfer. + A transfer which is submitted and issued on a phy doesn't wait for a phy to + stop and restart, but is submitted on a "running channel". The other + drivers, especially mmp_pdma waited for the phy to stop before relaunching + a new transfer. + + b) All transfers having asked for confirmation should be signaled + Any issued transfer with DMA_PREP_INTERRUPT should trigger a callback call. + This implies that even if an irq/tasklet is triggered by end of tx1, but + at the time of irq/dma tx2 is already finished, tx1->complete() and + tx2->complete() should be called. + + c) Channel running state + A driver should be able to query if a channel is running or not. For the + multimedia case, such as video capture, if a transfer is submitted and then + a check of the DMA channel reports a "stopped channel", the transfer should + not be issued until the next "start of frame interrupt", hence the need to + know if a channel is in running or stopped state. + + d) Bandwidth guarantee + The PXA architecture has 4 levels of DMAs priorities : high, normal, low. 
+ The high priorities get twice as much bandwidth as the normal, which get twice + as much as the low priorities. + A driver should be able to request a priority, especially the real-time + ones such as pxa_camera with (big) throughputs. + +Design +------ + a) Virtual channels + Same concept as in sa11x0 driver, ie. a driver was assigned a "virtual + channel" linked to the requestor line, and the physical DMA channel is + assigned on the fly when the transfer is issued. + + b) Transfer anatomy for a scatter-gather transfer + +------------+-----+---------------+----------------+-----------------+ + | desc-sg[0] | ... | desc-sg[last] | status updater | finisher/linker | + +------------+-----+---------------+----------------+-----------------+ + + This structure is pointed by dma->sg_cpu. + The descriptors are used as follows : + - desc-sg[i]: i-th descriptor, transferring the i-th sg + element to the video buffer scatter gather + - status updater + Transfers a single u32 to a well known dma coherent memory to leave + a trace that this transfer is done. The "well known" is unique per + physical channel, meaning that a read of this value will tell which + is the last finished transfer at that point in time. + - finisher: has ddadr=DADDR_STOP, dcmd=ENDIRQEN + - linker: has ddadr= desc-sg[0] of next transfer, dcmd=0 + + c) Transfers hot-chaining + Suppose the running chain is : + Buffer 1 Buffer 2 + +---------+----+---+ +----+----+----+---+ + | d0 | .. | dN | l | | d0 | .. | dN | f | + +---------+----+-|-+ ^----+----+----+---+ + | | + +----+ + + After a call to dmaengine_submit(b3), the chain will look like : + Buffer 1 Buffer 2 Buffer 3 + +---------+----+---+ +----+----+----+---+ +----+----+----+---+ + | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f | + +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+ + | | | | + +----+ +----+ + new_link + + If while new_link was created the DMA channel stopped, it is _not_ + restarted. 
Hot-chaining doesn't break the assumption that + dma_async_issue_pending() is to be used to ensure the transfer is actually started. + + One exception to this rule : + - if Buffer1 and Buffer2 had all their addresses 8 bytes aligned + - and if Buffer3 has at least one address not 4 bytes aligned + - then hot-chaining cannot happen, as the channel must be stopped, the + "align bit" must be set, and the channel restarted. As a consequence, + such a transfer tx_submit() will be queued on the submitted queue; + this specific case only arises if the DMA is already running in aligned mode. + + d) Transfers completion updater + Each time a transfer is completed on a channel, an interrupt might be + generated or not, up to the client's request. But in each case, the last + descriptor of a transfer, the "status updater", will write the latest + transfer being completed into the physical channel's completion mark. + + This will speed up residue calculation, for large transfers such as video + buffers which hold around 6k descriptors or more. This also allows without + any lock to find out what is the latest completed transfer in a running + DMA chain. + + e) Transfers completion, irq and tasklet + When a transfer flagged as "DMA_PREP_INTERRUPT" is finished, the dma irq + is raised. Upon this interrupt, a tasklet is scheduled for the physical + channel. + The tasklet is responsible for : + - reading the physical channel last updater mark + - calling all the transfer callbacks of finished transfers, based on + that mark, and each transfer flags. + If a transfer is completed while this handling is done, a dma irq will + be raised, and the tasklet will be scheduled once again, having a new + updater mark. + + f) Residue + Residue granularity will be descriptor based. The issued but not completed + transfers will be scanned for all of their descriptors against the + currently running descriptor. 
+ + g) Most complicated case of driver's tx queues + The most tricky situation is when : + - there are not "acked" transfers (tx0) + - a driver submitted an aligned tx1, not chained + - a driver submitted an aligned tx2 => tx2 is cold chained to tx1 + - a driver issued tx1+tx2 => channel is running in aligned mode + - a driver submitted an aligned tx3 => tx3 is hot-chained + - a driver submitted an unaligned tx4 => tx4 is put in submitted queue, + not chained + - a driver issued tx4 => tx4 is put in issued queue, not chained + - a driver submitted an aligned tx5 => tx5 is put in submitted queue, not + chained + - a driver submitted an aligned tx6 => tx6 is put in submitted queue, + cold chained to tx5 + + This translates into (after tx4 is issued) : + - issued queue + +-----+ +-----+ +-----+ +-----+ + | tx1 | | tx2 | | tx3 | | tx4 | + +---|-+ ^---|-+ ^-----+ +-----+ + | | | | + +---+ +---+ + - submitted queue + +-----+ +-----+ + | tx5 | | tx6 | + +---|-+ ^-----+ + | | + +---+ + - completed queue : empty + - allocated queue : tx0 + + It should be noted that after tx3 is completed, the channel is stopped, and + restarted in "unaligned mode" to handle tx4. 
+ +Author: Robert Jarzmik <robert.jarzmik@free.fr> diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 5a5a05582b58..8146e9fd5ffc 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -236,10 +236,10 @@ Removed Mount Options Name Removed ---- ------- - delaylog/nodelaylog v3.20 - ihashsize v3.20 - irixsgid v3.20 - osyncisdsync/osyncisosync v3.20 + delaylog/nodelaylog v4.0 + ihashsize v4.0 + irixsgid v4.0 + osyncisdsync/osyncisosync v4.0 sysctls @@ -346,5 +346,5 @@ Removed Sysctls Name Removed ---- ------- - fs.xfs.xfsbufd_centisec v3.20 - fs.xfs.age_buffer_centisecs v3.20 + fs.xfs.xfsbufd_centisec v4.0 + fs.xfs.age_buffer_centisecs v4.0 diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt new file mode 100644 index 000000000000..b91443f577dc --- /dev/null +++ b/Documentation/nvdimm/btt.txt @@ -0,0 +1,283 @@ +BTT - Block Translation Table +============================= + + +1. Introduction +--------------- + +Persistent memory based storage is able to perform IO at byte (or more +accurately, cache line) granularity. However, we often want to expose such +storage as traditional block devices. The block drivers for persistent memory +will do exactly this. However, they do not provide any atomicity guarantees. +Traditional SSDs typically provide protection against torn sectors in hardware, +using stored energy in capacitors to complete in-flight block writes, or perhaps +in firmware. We don't have this luxury with persistent memory - if a write is in +progress, and we experience a power failure, the block will contain a mix of old +and new data. Applications may not be prepared to handle such a scenario. + +The Block Translation Table (BTT) provides atomic sector update semantics for +persistent memory devices, so that applications that rely on sector writes not +being torn can continue to do so. 
The BTT manifests itself as a stacked block +device, and reserves a portion of the underlying storage for its metadata. At +the heart of it, is an indirection table that re-maps all the blocks on the +volume. It can be thought of as an extremely simple file system that only +provides atomic sector updates. + + +2. Static Layout +---------------- + +The underlying storage on which a BTT can be laid out is not limited in any way. +The BTT, however, splits the available space into chunks of up to 512 GiB, +called "Arenas". + +Each arena follows the same layout for its metadata, and all references in an +arena are internal to it (with the exception of one field that points to the +next arena). The following depicts the "On-disk" metadata layout: + + + Backing Store +-------> Arena ++---------------+ | +------------------+ +| | | | Arena info block | +| Arena 0 +---+ | 4K | +| 512G | +------------------+ +| | | | ++---------------+ | | +| | | | +| Arena 1 | | Data Blocks | +| 512G | | | +| | | | ++---------------+ | | +| . | | | +| . | | | +| . | | | +| | | | +| | | | ++---------------+ +------------------+ + | | + | BTT Map | + | | + | | + +------------------+ + | | + | BTT Flog | + | | + +------------------+ + | Info block copy | + | 4K | + +------------------+ + + +3. Theory of Operation +---------------------- + + +a. The BTT Map +-------------- + +The map is a simple lookup/indirection table that maps an LBA to an internal +block. Each map entry is 32 bits. The two most significant bits are special +flags, and the remaining form the internal block number. + +Bit Description +31 - 30 : Error and Zero flags - Used in the following way: + Bit Description + 31 30 + ----------------------------------------------------------------------- + 00 Initial state. 
Reads return zeroes; Premap = Postmap + 01 Zero state: Reads return zeroes + 10 Error state: Reads fail; Writes clear 'E' bit + 11 Normal Block – has valid postmap + + +29 - 0 : Mappings to internal 'postmap' blocks + + +Some of the terminology that will be subsequently used: + +External LBA : LBA as made visible to upper layers. +ABA : Arena Block Address - Block offset/number within an arena +Premap ABA : The block offset into an arena, which was decided upon by range + checking the External LBA +Postmap ABA : The block number in the "Data Blocks" area obtained after + indirection from the map +nfree : The number of free blocks that are maintained at any given time. + This is the number of concurrent writes that can happen to the + arena. + + +For example, after adding a BTT, we surface a disk of 1024G. We get a read for +the external LBA at 768G. This falls into the second arena, and of the 512G +worth of blocks that this arena contributes, this block is at 256G. Thus, the +premap ABA is 256G. We now refer to the map, and find out the mapping for block +'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64. + + +b. The BTT Flog +--------------- + +The BTT provides sector atomicity by making every write an "allocating write", +i.e. every write goes to a "free" block. A running list of free blocks is +maintained in the form of the BTT flog. 'Flog' is a combination of the words +"free list" and "log". The flog contains 'nfree' entries, and an entry contains: + +lba : The premap ABA that is being written to +old_map : The old postmap ABA - after 'this' write completes, this will be a + free block. +new_map : The new postmap ABA. The map will be updated to reflect this + lba->postmap_aba mapping, but we log it here in case we have to + recover. +seq : Sequence number to mark which of the 2 sections of this flog entry is + valid/newest. It cycles between 01->10->11->01 (binary) under normal + operation, with 00 indicating an uninitialized state. 
+lba' : alternate lba entry +old_map': alternate old postmap entry +new_map': alternate new postmap entry +seq' : alternate sequence number. + +Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also +padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are +done such that for any entry being written, it: +a. overwrites the 'old' section in the entry based on sequence numbers +b. writes the 'new' section such that the sequence number is written last. + + +c. The concept of lanes +----------------------- + +While 'nfree' describes the number of IOs an arena can process +concurrently, 'nlanes' is the number of IOs the BTT device as a whole can +process. + nlanes = min(nfree, num_cpus) +A lane number is obtained at the start of any IO, and is used for indexing into +all the on-disk and in-memory data structures for the duration of the IO. If +there are more CPUs than the max number of available lanes, then lanes are +protected by spinlocks. + + +d. In-memory data structure: Read Tracking Table (RTT) +------------------------------------------------------ + +Consider a case where we have two threads, one doing reads and the other, +writes. We can hit a condition where the writer thread grabs a free block to do +a new IO, but the (slow) reader thread is still reading from it. In other words, +the reader consulted a map entry, and started reading the corresponding block. A +writer started writing to the same external LBA, and finished the write updating +the map for that external LBA to point to its new postmap ABA. At this point the +internal, postmap block that the reader is (still) reading has been inserted +into the list of free blocks. If another write comes in for the same LBA, it can +grab this free block, and start writing to it, causing the reader to read +incorrect data. To prevent this, we introduce the RTT. + +The RTT is a simple, per arena table with 'nfree' entries. 
Every reader inserts +into rtt[lane_number], the postmap ABA it is reading, and clears it after the +read is complete. Every writer thread, after grabbing a free block, checks the +RTT for its presence. If the postmap free block is in the RTT, it waits till the +reader clears the RTT entry, and only then starts writing to it. + + +e. In-memory data structure: map locks +-------------------------------------- + +Consider a case where two writer threads are writing to the same LBA. There can +be a race in the following sequence of steps: + +free[lane] = map[premap_aba] +map[premap_aba] = postmap_aba + +Both threads can update their respective free[lane] with the same old, freed +postmap_aba. This has made the layout inconsistent by losing a free entry, and +at the same time, duplicating another free entry for two lanes. + +To solve this, we could have a single map lock (per arena) that has to be taken +before performing the above sequence, but we feel that could be too contentious. +Instead we use an array of (nfree) map_locks that is indexed by +(premap_aba modulo nfree). + + +f. Reconstruction from the Flog +------------------------------- + +On startup, we analyze the BTT flog to create our list of free blocks. We walk +through all the entries, and for each lane, of the set of two possible +'sections', we always look at the most recent one only (based on the sequence +number). The reconstruction rules/steps are simple: +- Read map[log_entry.lba]. +- If log_entry.new matches the map entry, then log_entry.old is free. +- If log_entry.new does not match the map entry, then log_entry.new is free. + (This case can only be caused by power-fails/unsafe shutdowns) + + +g. Summarizing - Read and Write flows +------------------------------------- + +Read: + +1. Convert external LBA to arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Read map to get the entry for this pre-map ABA +4. Enter post-map ABA into RTT[lane] +5. 
If TRIM flag set in map, return zeroes, and end IO (go to step 8) +6. If ERROR flag set in map, end IO with EIO (go to step 8) +7. Read data from this block +8. Remove post-map ABA entry from RTT[lane] +9. Release lane (and lane_lock) + +Write: + +1. Convert external LBA to Arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Use lane to index into in-memory free list and obtain a new block, next flog + index, next sequence number +4. Scan the RTT to check if free block is present, and spin/wait if it is. +5. Write data to this free block +6. Read map to get the existing post-map ABA entry for this pre-map ABA +7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num] +8. Write new post-map ABA into map. +9. Write old post-map entry into the free list +10. Calculate next sequence number and write into the free list entry +11. Release lane (and lane_lock) + + +4. Error Handling +================= + +An arena would be in an error state if any of the metadata is corrupted +irrecoverably, either due to a bug or a media error. The following conditions +indicate an error: +- Info block checksum does not match (and recovering from the copy also fails) +- All internal available blocks are not uniquely and entirely addressed by the + sum of mapped blocks and free blocks (from the BTT flog). +- Rebuilding free list from the flog reveals missing/duplicate/impossible + entries +- A map entry is out of bounds + +If any of these error conditions are encountered, the arena is put into a read +only state using a flag in the info block. + + +5. In-kernel usage +================== + +Any block driver that supports byte granularity IO to the storage may register +with the BTT. 
It will have to provide the rw_bytes interface in its +block_device_operations struct: + + int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw); + +It may register with the BTT after it adds its own gendisk, using btt_init: + + struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize, + u32 lbasize, u8 uuid[], int maxlane); + +note that maxlane is the maximum amount of concurrency the driver wishes to +allow the BTT to use. + +The BTT 'disk' appears as a stacked block device that grabs the underlying block +device in the O_EXCL mode. + +When the driver wishes to remove the backing disk, it should similarly call +btt_fini using the same struct btt* handle that was provided to it by btt_init. + + void btt_fini(struct btt *btt); + diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt new file mode 100644 index 000000000000..197a0b6b0582 --- /dev/null +++ b/Documentation/nvdimm/nvdimm.txt @@ -0,0 +1,808 @@ + LIBNVDIMM: Non-Volatile Devices + libnvdimm - kernel / libndctl - userspace helper library + linux-nvdimm@lists.01.org + v13 + + + Glossary + Overview + Supporting Documents + Git Trees + LIBNVDIMM PMEM and BLK + Why BLK? + PMEM vs BLK + BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX + Example NVDIMM Platform + LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API + LIBNDCTL: Context + libndctl: instantiate a new library context example + LIBNVDIMM/LIBNDCTL: Bus + libnvdimm: control class device in /sys/class + libnvdimm: bus + libndctl: bus enumeration example + LIBNVDIMM/LIBNDCTL: DIMM (NMEM) + libnvdimm: DIMM (NMEM) + libndctl: DIMM enumeration example + LIBNVDIMM/LIBNDCTL: Region + libnvdimm: region + libndctl: region enumeration example + Why Not Encode the Region Type into the Region Name? + How Do I Determine the Major Type of a Region? + LIBNVDIMM/LIBNDCTL: Namespace + libnvdimm: namespace + libndctl: namespace enumeration example + libndctl: namespace creation example + Why the Term "namespace"? 
+ LIBNVDIMM/LIBNDCTL: Block Translation Table "btt" + libnvdimm: btt layout + libndctl: btt creation example + Summary LIBNDCTL Diagram + + +Glossary +-------- + +PMEM: A system-physical-address range where writes are persistent. A +block device composed of PMEM is capable of DAX. A PMEM address range +may span an interleave of several DIMMs. + +BLK: A set of one or more programmable memory mapped apertures provided +by a DIMM to access its media. This indirection precludes the +performance benefit of interleaving, but enables DIMM-bounded failure +modes. + +DPA: DIMM Physical Address, is a DIMM-relative offset. With one DIMM in +the system there would be a 1:1 system-physical-address:DPA association. +Once more DIMMs are added a memory controller interleave must be +decoded to determine the DPA associated with a given +system-physical-address. BLK capacity always has a 1:1 relationship +with a single-DIMM's DPA range. + +DAX: File system extensions to bypass the page cache and block layer to +mmap persistent memory, from a PMEM block device, directly into a +process address space. + +BTT: Block Translation Table: Persistent memory is byte addressable. +Existing software may have an expectation that the power-fail-atomicity +of writes is at least one sector, 512 bytes. The BTT is an indirection +table with atomic update semantics to front a PMEM/BLK block device +driver and present arbitrary atomic sector sizes. + +LABEL: Metadata stored on a DIMM device that partitions and identifies +(persistently names) storage between PMEM and BLK. It also partitions +BLK storage to host BTTs with different parameters per BLK-partition. +Note that traditional partition tables, GPT/MBR, are layered on top of a +BLK or PMEM device. + + +Overview +-------- + +The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely, +PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM +and BLK mode access. 
These three modes of operation are described by +the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6. While the LIBNVDIMM +implementation is generic and supports pre-NFIT platforms, it was guided +by the superset of capabilities needed to support this ACPI 6 definition +for NVDIMM resources. The bulk of the kernel implementation is in place +to handle the case where DPA accessible via PMEM is aliased with DPA +accessible via BLK. When that occurs a LABEL is needed to reserve DPA +for exclusive access via one mode at a time. + +Supporting Documents +ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf +NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf +DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf +Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf + +Git Trees +LIBNVDIMM: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git +LIBNDCTL: https://github.com/pmem/ndctl.git +PMEM: https://github.com/01org/prd + + +LIBNVDIMM PMEM and BLK +---------------------- + +Prior to the arrival of the NFIT, non-volatile memory was described to a +system in various ad-hoc ways. Usually only the bare minimum was +provided, namely, a single system-physical-address range where writes +are expected to be durable after a system power loss. Now, the NFIT +specification standardizes not only the description of PMEM, but also +BLK and platform message-passing entry points for control and +configuration. + +For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block +device driver: + + 1. PMEM (nd_pmem.ko): Drives a system-physical-address range. This + range is contiguous in system memory and may be interleaved (hardware + memory controller striped) across multiple DIMMs. When interleaved the + platform may optionally provide details of which DIMMs are participating + in the interleave. 
+ + Note that while LIBNVDIMM describes system-physical-address ranges that may + alias with BLK access as ND_NAMESPACE_PMEM ranges and those without + alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no + distinction. The different device-types are an implementation detail + that userspace can exploit to implement policies like "only interface + with address ranges from certain DIMMs". It is worth noting that when + aliasing is present and a DIMM lacks a label, then no block device can + be created by default as userspace needs to do at least one allocation + of DPA to the PMEM range. In contrast ND_NAMESPACE_IO ranges, once + registered, can be immediately attached to nd_pmem. + + 2. BLK (nd_blk.ko): This driver performs I/O using a set of platform + defined apertures. A set of apertures will all access just one DIMM. + Multiple windows allow multiple concurrent accesses, much like + tagged-command-queuing, and would likely be used by different threads or + different CPUs. + + The NFIT specification defines a standard format for a BLK-aperture, but + the spec also allows for vendor specific layouts, and non-NFIT BLK + implementations may use other designs for BLK I/O. For this reason "nd_blk" + calls back into platform-specific code to perform the I/O. One such + implementation is defined in the "Driver Writer's Guide" and "DSM + Interface Example". + + +Why BLK? +-------- + +While PMEM provides direct byte-addressable CPU-load/store access to +NVDIMM storage, it does not provide the best system RAS (recovery, +availability, and serviceability) model. An access to a corrupted +system-physical-address causes a cpu exception while an access +to a corrupted address through a BLK-aperture causes that block window +to raise an error status in a register. The latter is more aligned with +the standard error model that host-bus-adapter attached disks present. 
+Also, if an administrator ever wants to replace a memory it is easier to +service a system at DIMM module boundaries. Compare this to PMEM where +data could be interleaved in an opaque hardware specific manner across +several DIMMs. + +PMEM vs BLK +BLK-apertures solve this RAS problem, but their presence is also the +major contributing factor to the complexity of the ND subsystem. They +complicate the implementation because PMEM and BLK alias in DPA space. +Any given DIMM's DPA-range may contribute to one or more +system-physical-address sets of interleaved DIMMs, *and* may also be +accessed in its entirety through its BLK-aperture. Accessing a DPA +through a system-physical-address while simultaneously accessing the +same DPA through a BLK-aperture has undefined results. For this reason, +DIMMs with this dual interface configuration include a DSM function to +store/retrieve a LABEL. The LABEL effectively partitions the DPA-space +into exclusive system-physical-address and BLK-aperture accessible +regions. For simplicity a DIMM is allowed a PMEM "region" per each +interleave set in which it is a member. The remaining DPA space can be +carved into an arbitrary number of BLK devices with discontiguous +extents. + +BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX +-------------------------------------------------- + +One of the few +reasons to allow multiple BLK namespaces per REGION is so that each +BLK-namespace can be configured with a BTT with unique atomic sector +sizes. While a PMEM device can host a BTT the LABEL specification does +not provide for a sector size to be specified for a PMEM namespace. +This is due to the expectation that the primary usage model for PMEM is +via DAX, and the BTT is incompatible with DAX. However, for the cases +where an application or filesystem still needs atomic sector update +guarantees it can register a BTT on a PMEM device or partition. 
See +LIBNVDIMM/NDCTL: Block Translation Table "btt" + + +Example NVDIMM Platform +----------------------- + +For the remainder of this document the following diagram will be +referenced for any example sysfs layouts. + + + (a) (b) DIMM BLK-REGION + +-------------------+--------+--------+--------+ ++------+ | pm0.0 | blk2.0 | pm1.0 | blk2.1 | 0 region2 +| imc0 +--+- - - region0- - - +--------+ +--------+ ++--+---+ | pm0.0 | blk3.0 | pm1.0 | blk3.1 | 1 region3 + | +-------------------+--------v v--------+ ++--+---+ | | +| cpu0 | region1 ++--+---+ | | + | +----------------------------^ ^--------+ ++--+---+ | blk4.0 | pm1.0 | blk4.0 | 2 region4 +| imc1 +--+----------------------------| +--------+ ++------+ | blk5.0 | pm1.0 | blk5.0 | 3 region5 + +----------------------------+--------+--------+ + +In this platform we have four DIMMs and two memory controllers in one +socket. Each unique interface (BLK or PMEM) to DPA space is identified +by a region device with a dynamically assigned id (REGION0 - REGION5). + + 1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A + single PMEM namespace is created in the REGION0-SPA-range that spans + DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that + interleaved system-physical-address range is reclaimed as BLK-aperture + accessed space starting at DPA-offset (a) into each DIMM. In that + reclaimed space we create two BLK-aperture "namespaces" from REGION2 and + REGION3 where "blk2.0" and "blk3.0" are just human readable names that + could be set to any user-desired name in the LABEL. + + 2. In the last portion of DIMM0 and DIMM1 we have an interleaved + system-physical-address range, REGION1, that spans those two DIMMs as + well as DIMM2 and DIMM3. Some of REGION1 allocated to a PMEM namespace + named "pm1.0" the rest is reclaimed in 4 BLK-aperture namespaces (for + each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and + "blk5.0". + + 3. 
The portion of DIMM2 and DIMM3 that do not participate in the REGION1 + interleaved system-physical-address range (i.e. the DPA addresses below + offset (b)) are also included in the "blk4.0" and "blk5.0" namespaces. + Note, that this example shows that BLK-aperture namespaces don't need to + be contiguous in DPA-space. + + This bus is provided by the kernel under the device + /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and + the nfit_test.ko module is loaded. This not only tests LIBNVDIMM but the + acpi_nfit.ko driver as well. + + +LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API +---------------------------------------------------- + +What follows is a description of the LIBNVDIMM sysfs layout and a +corresponding object hierarchy diagram as viewed through the LIBNDCTL +api. The example sysfs paths and diagrams are relative to the Example +NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit +test. + +LIBNDCTL: Context +Every api call in the LIBNDCTL library requires a context that holds the +logging parameters and other library instance state. The library is +based on the libabc template: +https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git/ + +LIBNDCTL: instantiate a new library context example + + struct ndctl_ctx *ctx; + + if (ndctl_new(&ctx) == 0) + return ctx; + else + return NULL; + +LIBNVDIMM/LIBNDCTL: Bus +------------------- + +A bus has a 1:1 relationship with an NFIT. The current expectation for +ACPI based systems is that there is only ever one platform-global NFIT. +That said, it is trivial to register multiple NFITs, the specification +does not preclude it. The infrastructure supports multiple busses and +we use this capability to test multiple NFIT configurations in the +unit test. + +LIBNVDIMM: control class device in /sys/class + +This character device accepts DSM messages to be passed to a DIMM +identified by its NFIT handle.
+ + /sys/class/nd/ndctl0 + |-- dev + |-- device -> ../../../ndbus0 + |-- subsystem -> ../../../../../../../class/nd + + + +LIBNVDIMM: bus + + struct nvdimm_bus *nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc); + + /sys/devices/platform/nfit_test.0/ndbus0 + |-- commands + |-- nd + |-- nfit + |-- nmem0 + |-- nmem1 + |-- nmem2 + |-- nmem3 + |-- power + |-- provider + |-- region0 + |-- region1 + |-- region2 + |-- region3 + |-- region4 + |-- region5 + |-- uevent + `-- wait_probe + +LIBNDCTL: bus enumeration example +Find the bus handle that describes the bus from Example NVDIMM Platform + + static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx, + const char *provider) + { + struct ndctl_bus *bus; + + ndctl_bus_foreach(ctx, bus) + if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0) + return bus; + + return NULL; + } + + bus = get_bus_by_provider(ctx, "nfit_test.0"); + + +LIBNVDIMM/LIBNDCTL: DIMM (NMEM) +--------------------------- + +The DIMM device provides a character device for sending commands to +hardware, and it is a container for LABELs. If the DIMM is defined by +NFIT then an optional 'nfit' attribute sub-directory is available to add +NFIT-specifics. + +Note that the kernel device name for "DIMMs" is "nmemX". The NFIT +describes these devices via "Memory Device to System Physical Address +Range Mapping Structure", and there is no requirement that they actually +be physical DIMMs, so we use a more generic name. 
+ +LIBNVDIMM: DIMM (NMEM) + + struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask); + + /sys/devices/platform/nfit_test.0/ndbus0 + |-- nmem0 + | |-- available_slots + | |-- commands + | |-- dev + | |-- devtype + | |-- driver -> ../../../../../bus/nd/drivers/nvdimm + | |-- modalias + | |-- nfit + | | |-- device + | | |-- format + | | |-- handle + | | |-- phys_id + | | |-- rev_id + | | |-- serial + | | `-- vendor + | |-- state + | |-- subsystem -> ../../../../../bus/nd + | `-- uevent + |-- nmem1 + [..] + + +LIBNDCTL: DIMM enumeration example + +Note, in this example we are assuming NFIT-defined DIMMs which are +identified by an "nfit_handle" a 32-bit value where: +Bit 3:0 DIMM number within the memory channel +Bit 7:4 memory channel number +Bit 11:8 memory controller ID +Bit 15:12 socket ID (within scope of a Node controller if node controller is present) +Bit 27:16 Node Controller ID +Bit 31:28 Reserved + + static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus, + unsigned int handle) + { + struct ndctl_dimm *dimm; + + ndctl_dimm_foreach(bus, dimm) + if (ndctl_dimm_get_handle(dimm) == handle) + return dimm; + + return NULL; + } + + #define DIMM_HANDLE(n, s, i, c, d) \ + (((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \ + | ((c & 0xf) << 4) | (d & 0xf)) + + dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0)); + +LIBNVDIMM/LIBNDCTL: Region +---------------------- + +A generic REGION device is registered for each PMEM range or BLK-aperture +set. Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture +sets on the "nfit_test.0" bus. The primary role of regions is to be a +container of "mappings". A mapping is a tuple of <DIMM, +DPA-start-offset, length>. + +LIBNVDIMM provides a built-in driver for these REGION devices.
This driver +is responsible for reconciling the aliased DPA mappings across all +regions, parsing the LABEL, if present, and then emitting NAMESPACE +devices with the resolved/exclusive DPA-boundaries for the nd_pmem or +nd_blk device driver to consume. + +In addition to the generic attributes of "mapping"s, "interleave_ways" +and "size" the REGION device also exports some convenience attributes. +"nstype" indicates the integer type of namespace-device this region +emits, "devtype" duplicates the DEVTYPE variable stored by udev at the +'add' event, "modalias" duplicates the MODALIAS variable stored by udev +at the 'add' event, and finally, the optional "spa_index" is provided in +the case where the region is defined by a SPA. + +LIBNVDIMM: region + + struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); + struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); + + /sys/devices/platform/nfit_test.0/ndbus0 + |-- region0 + | |-- available_size + | |-- btt0 + | |-- btt_seed + | |-- devtype + | |-- driver -> ../../../../../bus/nd/drivers/nd_region + | |-- init_namespaces + | |-- mapping0 + | |-- mapping1 + | |-- mappings + | |-- modalias + | |-- namespace0.0 + | |-- namespace_seed + | |-- numa_node + | |-- nfit + | | `-- spa_index + | |-- nstype + | |-- set_cookie + | |-- size + | |-- subsystem -> ../../../../../bus/nd + | `-- uevent + |-- region1 + [..] + +LIBNDCTL: region enumeration example + +Sample region retrieval routines based on NFIT-unique data like +"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for +BLK. 
+ + static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus, + unsigned int spa_index) + { + struct ndctl_region *region; + + ndctl_region_foreach(bus, region) { + if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM) + continue; + if (ndctl_region_get_spa_index(region) == spa_index) + return region; + } + return NULL; + } + + static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus, + unsigned int handle) + { + struct ndctl_region *region; + + ndctl_region_foreach(bus, region) { + struct ndctl_mapping *map; + + if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK) + continue; + ndctl_mapping_foreach(region, map) { + struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map); + + if (ndctl_dimm_get_handle(dimm) == handle) + return region; + } + } + return NULL; + } + + +Why Not Encode the Region Type into the Region Name? +---------------------------------------------------- + +At first glance it seems since NFIT defines just PMEM and BLK interface +types that we should simply name REGION devices with something derived +from those type names. However, the ND subsystem explicitly keeps the +REGION name generic and expects userspace to always consider the +region-attributes for 4 reasons: + + 1. There are already more than two REGION and "namespace" types. For + PMEM there are two subtypes. As mentioned previously we have PMEM where + the constituent DIMM devices are known and anonymous PMEM. For BLK + regions the NFIT specification already anticipates vendor specific + implementations. The exact distinction of what a region contains is in + the region-attributes not the region-name or the region-devtype. + + 2. A region with zero child-namespaces is a possible configuration. For + example, the NFIT allows for a DCR to be published without a + corresponding BLK-aperture. This equates to a DIMM that can only accept + control/configuration messages, but no i/o through a descendant block + device. 
Again, this "type" is advertised in the attributes ('mappings' + == 0) and the name does not tell you much. + + 3. What if a third major interface type arises in the future? Outside + of vendor specific implementations, it's not difficult to envision a + third class of interface type beyond BLK and PMEM. With a generic name + for the REGION level of the device-hierarchy old userspace + implementations can still make sense of new kernel advertised + region-types. Userspace can always rely on the generic region + attributes like "mappings", "size", etc and the expected child devices + named "namespace". This generic format of the device-model hierarchy + allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and + future-proof. + + 4. There are more robust mechanisms for determining the major type of a + region than a device name. See the next section, How Do I Determine the + Major Type of a Region? + +How Do I Determine the Major Type of a Region? +---------------------------------------------- + +Outside of the blanket recommendation of "use libndctl", or simply +looking at the kernel header (/usr/include/linux/ndctl.h) to decode the +"nstype" integer attribute, here are some other options. + + 1. module alias lookup: + + The whole point of region/namespace device type differentiation is to + decide which block-device driver will attach to a given LIBNVDIMM namespace. + One can simply use the modalias to lookup the resulting module. It's + important to note that this method is robust in the presence of a + vendor-specific driver down the road. If a vendor-specific + implementation wants to supplant the standard nd_blk driver it can with + minimal impact to the rest of LIBNVDIMM. + + In fact, a vendor may also want to have a vendor-specific region-driver + (outside of nd_region). For example, if a vendor defined its own LABEL + format it would need its own region driver to parse that LABEL and emit + the resulting namespaces. 
The output from module resolution is more + accurate than a region-name or region-devtype. + + 2. udev: + + The kernel "devtype" is registered in the udev database + # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0 + P: /devices/platform/nfit_test.0/ndbus0/region0 + E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0 + E: DEVTYPE=nd_pmem + E: MODALIAS=nd:t2 + E: SUBSYSTEM=nd + + # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4 + P: /devices/platform/nfit_test.0/ndbus0/region4 + E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4 + E: DEVTYPE=nd_blk + E: MODALIAS=nd:t3 + E: SUBSYSTEM=nd + + ...and is available as a region attribute, but keep in mind that the + "devtype" does not indicate sub-type variations and scripts should + really be understanding the other attributes. + + 3. type specific attributes: + + As it currently stands a BLK-aperture region will never have a + "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region. A + BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM + that does not allow I/O. A PMEM region with a "mappings" value of zero + is a simple system-physical-address range. + + +LIBNVDIMM/LIBNDCTL: Namespace +------------------------- + +A REGION, after resolving DPA aliasing and LABEL specified boundaries, +surfaces one or more "namespace" devices. The arrival of a "namespace" +device currently triggers either the nd_blk or nd_pmem driver to load +and register a disk/block device. + +LIBNVDIMM: namespace +Here is a sample layout from the three major types of NAMESPACE where +namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid' +attribute), namespace2.0 represents a BLK namespace (note it has a +'sector_size' attribute), and namespace6.0 represents an anonymous +PMEM namespace (note that it has no 'uuid' attribute due to not supporting a +LABEL).
+ + /sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0 + |-- alt_name + |-- devtype + |-- dpa_extents + |-- force_raw + |-- modalias + |-- numa_node + |-- resource + |-- size + |-- subsystem -> ../../../../../../bus/nd + |-- type + |-- uevent + `-- uuid + /sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0 + |-- alt_name + |-- devtype + |-- dpa_extents + |-- force_raw + |-- modalias + |-- numa_node + |-- sector_size + |-- size + |-- subsystem -> ../../../../../../bus/nd + |-- type + |-- uevent + `-- uuid + /sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0 + |-- block + | `-- pmem0 + |-- devtype + |-- driver -> ../../../../../../bus/nd/drivers/pmem + |-- force_raw + |-- modalias + |-- numa_node + |-- resource + |-- size + |-- subsystem -> ../../../../../../bus/nd + |-- type + `-- uevent + +LIBNDCTL: namespace enumeration example +Namespaces are indexed relative to their parent region, example below. +These indexes are mostly static from boot to boot, but subsystem makes +no guarantees in this regard. For a static namespace identifier use its +'uuid' attribute. + +static struct ndctl_namespace *get_namespace_by_id(struct ndctl_region *region, + unsigned int id) +{ + struct ndctl_namespace *ndns; + + ndctl_namespace_foreach(region, ndns) + if (ndctl_namespace_get_id(ndns) == id) + return ndns; + + return NULL; +} + +LIBNDCTL: namespace creation example +Idle namespaces are automatically created by the kernel if a given +region has enough available capacity to create a new namespace. +Namespace instantiation involves finding an idle namespace and +configuring it. For the most part the setting of namespace attributes +can occur in any order, the only constraint is that 'uuid' must be set +before 'size'. This enables the kernel to track DPA allocations +internally with a static identifier. 
+ +static int configure_namespace(struct ndctl_region *region, + struct ndctl_namespace *ndns, + struct namespace_parameters *parameters) +{ + char devname[50]; + + snprintf(devname, sizeof(devname), "namespace%d.%d", + ndctl_region_get_id(region), parameters->id); + + ndctl_namespace_set_alt_name(ndns, devname); + /* 'uuid' must be set prior to setting size! */ + ndctl_namespace_set_uuid(ndns, parameters->uuid); + ndctl_namespace_set_size(ndns, parameters->size); + /* unlike pmem namespaces, blk namespaces have a sector size */ + if (parameters->lbasize) + ndctl_namespace_set_sector_size(ndns, parameters->lbasize); + ndctl_namespace_enable(ndns); +} + + +Why the Term "namespace"? + + 1. Why not "volume" for instance? "volume" ran the risk of confusing ND + as a volume manager like device-mapper. + + 2. The term originated to describe the sub-devices that can be created + within a NVME controller (see the nvme specification: + http://www.nvmexpress.org/specifications/), and NFIT namespaces are + meant to parallel the capabilities and configurability of + NVME-namespaces. + + +LIBNVDIMM/LIBNDCTL: Block Translation Table "btt" +--------------------------------------------- + +A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked +block device driver that fronts either the whole block device or a +partition of a block device emitted by either a PMEM or BLK NAMESPACE. + +LIBNVDIMM: btt layout +Every region will start out with at least one BTT device which is the +seed device. To activate it set the "namespace", "uuid", and +"sector_size" attributes and then bind the device to the nd_pmem or +nd_blk driver depending on the region type.
+ + /sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/ + |-- namespace + |-- delete + |-- devtype + |-- modalias + |-- numa_node + |-- sector_size + |-- subsystem -> ../../../../../bus/nd + |-- uevent + `-- uuid + +LIBNDCTL: btt creation example +Similar to namespaces an idle BTT device is automatically created per +region. Each time this "seed" btt device is configured and enabled a new +seed is created. Creating a BTT configuration involves two steps of +finding an idle BTT and assigning it to consume a PMEM or BLK namespace. + + static struct ndctl_btt *get_idle_btt(struct ndctl_region *region) + { + struct ndctl_btt *btt; + + ndctl_btt_foreach(region, btt) + if (!ndctl_btt_is_enabled(btt) + && !ndctl_btt_is_configured(btt)) + return btt; + + return NULL; + } + + static int configure_btt(struct ndctl_region *region, + struct btt_parameters *parameters) + { + btt = get_idle_btt(region); + + ndctl_btt_set_uuid(btt, parameters->uuid); + ndctl_btt_set_sector_size(btt, parameters->sector_size); + ndctl_btt_set_namespace(btt, parameters->ndns); + /* turn off raw mode device */ + ndctl_namespace_disable(parameters->ndns); + /* turn on btt access */ + ndctl_btt_enable(btt); + } + +Once instantiated a new inactive btt seed device will appear underneath +the region. + +Once a "namespace" is removed from a BTT that instance of the BTT device +will be deleted or otherwise reset to default values. This deletion is +only at the device model level. In order to destroy a BTT the "info +block" needs to be destroyed. Note, that to destroy a BTT the media +needs to be written in raw mode. By default, the kernel will autodetect +the presence of a BTT and disable raw mode. This autodetect behavior +can be suppressed by enabling raw mode for the namespace via the +ndctl_namespace_set_raw_mode() api.
+ + +Summary LIBNDCTL Diagram +------------------------ + +For the given example above, here is the view of the objects as seen by the LIBNDCTL api: + +---+ + |CTX| +---------+ +--------------+ +---------------+ + +-+-+ +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" | + | | +---------+ +--------------+ +---------------+ ++-------+ | | +---------+ +--------------+ +---------------+ +| DIMM0 <-+ | +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" | ++-------+ | | | +---------+ +--------------+ +---------------+ +| DIMM1 <-+ +-v--+ | +---------+ +--------------+ +---------------+ ++-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6 "blk2.0" | +| DIMM2 <-+ +----+ | +---------+ | +--------------+ +----------------------+ ++-------+ | | +-> NAMESPACE2.1 +--> ND5 "blk2.1" | BTT2 | +| DIMM3 <-+ | +--------------+ +----------------------+ ++-------+ | +---------+ +--------------+ +---------------+ + +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4 "blk3.0" | + | +---------+ | +--------------+ +----------------------+ + | +-> NAMESPACE3.1 +--> ND3 "blk3.1" | BTT1 | + | +--------------+ +----------------------+ + | +---------+ +--------------+ +---------------+ + +-> REGION4 +---> NAMESPACE4.0 +--> ND2 "blk4.0" | + | +---------+ +--------------+ +---------------+ + | +---------+ +--------------+ +----------------------+ + +-> REGION5 +---> NAMESPACE5.0 +--> ND1 "blk5.0" | BTT0 | + +---------+ +--------------+ +---------------+------+ + + diff --git a/MAINTAINERS b/MAINTAINERS index 6aedd5072323..0e6b09150aad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6102,6 +6102,39 @@ M: Sasha Levin <sasha.levin@oracle.com> S: Maintained F: tools/lib/lockdep/ +LIBNVDIMM: NON-VOLATILE MEMORY DEVICE SUBSYSTEM +M: Dan Williams <dan.j.williams@intel.com> +L: linux-nvdimm@lists.01.org +Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ +S: Supported +F: drivers/nvdimm/* +F: include/linux/nd.h +F: include/linux/libnvdimm.h +F: include/uapi/linux/ndctl.h + +LIBNVDIMM BLK: MMIO-APERTURE 
DRIVER +M: Ross Zwisler <ross.zwisler@linux.intel.com> +L: linux-nvdimm@lists.01.org +Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ +S: Supported +F: drivers/nvdimm/blk.c +F: drivers/nvdimm/region_devs.c +F: drivers/acpi/nfit* + +LIBNVDIMM BTT: BLOCK TRANSLATION TABLE +M: Vishal Verma <vishal.l.verma@intel.com> +L: linux-nvdimm@lists.01.org +Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ +S: Supported +F: drivers/nvdimm/btt* + +LIBNVDIMM PMEM: PERSISTENT MEMORY DRIVER +M: Ross Zwisler <ross.zwisler@linux.intel.com> +L: linux-nvdimm@lists.01.org +Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ +S: Supported +F: drivers/nvdimm/pmem.c + LINUX FOR IBM pSERIES (RS/6000) M: Paul Mackerras <paulus@au.ibm.com> W: http://www.ibm.com/linux/ltc/projects/ppc @@ -8174,6 +8207,7 @@ T: git git://github.com/hzhuang1/linux.git T: git git://github.com/rjarzmik/linux.git S: Maintained F: arch/arm/mach-pxa/ +F: drivers/dma/pxa* F: drivers/pcmcia/pxa2xx* F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/udc/pxa2* @@ -8362,12 +8396,6 @@ S: Maintained F: Documentation/blockdev/ramdisk.txt F: drivers/block/brd.c -PERSISTENT MEMORY DRIVER -M: Ross Zwisler <ross.zwisler@linux.intel.com> -L: linux-nvdimm@lists.01.org -S: Supported -F: drivers/block/pmem.c - RANDOM NUMBER DRIVER M: "Theodore Ts'o" <tytso@mit.edu> S: Maintained @@ -8986,6 +9014,7 @@ S: Supported F: kernel/seccomp.c F: include/uapi/linux/seccomp.h F: include/linux/seccomp.h +F: tools/testing/selftests/seccomp/* K: \bsecure_computing K: \bTIF_SECCOMP\b @@ -10389,11 +10418,15 @@ S: Maintained F: Documentation/filesystems/ubifs.txt F: fs/ubifs/ -UCLINUX (AND M68KNOMMU) +UCLINUX (M68KNOMMU AND COLDFIRE) M: Greg Ungerer <gerg@uclinux.org> W: http://www.uclinux.org/ +L: linux-m68k@lists.linux-m68k.org L: uclinux-dev@uclinux.org (subscribers-only) +T: git git://git.kernel.org/pub/scm/linux/kernel/git/gerg/m68knommu.git S: Maintained +F: arch/m68k/coldfire/ +F: arch/m68k/68*/ F: arch/m68k/*/*_no.* F: 
arch/m68k/include/asm/*_no.* @@ -10786,6 +10819,12 @@ F: drivers/vfio/ F: include/linux/vfio.h F: include/uapi/linux/vfio.h +VFIO PLATFORM DRIVER +M: Baptiste Reynal <b.reynal@virtualopensystems.com> +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/platform/ + VIDEOBUF2 FRAMEWORK M: Pawel Osciak <pawel@osciak.com> M: Marek Szyprowski <m.szyprowski@samsung.com> diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index ab21e0d58278..9d4aa18f2a82 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -158,6 +158,7 @@ static __init int is_reserve_region(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_CODE: case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: return 0; default: break; diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 47e962f7ed5a..caae3f4e4341 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -1222,6 +1222,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, flags |= IORESOURCE_DISABLED; break; + case EFI_PERSISTENT_MEMORY: + name = "Persistent Memory"; + break; + case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: case EFI_RUNTIME_SERVICES_DATA: diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 5b7791dd3965..096731049538 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -96,3 +96,6 @@ EXPORT_SYMBOL(ia64_ivt); /* mcount is defined in assembly */ EXPORT_SYMBOL(_mcount); #endif + +#include <asm/cacheflush.h> +EXPORT_SYMBOL_GPL(flush_icache_range); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index dd5801eb4c69..2889412e03eb 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -2117,8 +2117,7 @@ ia64_mca_late_init(void) register_hotcpu_notifier(&mca_cpu_notifier); /* Setup the CMCI/P vector and handler */ - init_timer(&cmc_poll_timer); - cmc_poll_timer.function = ia64_mca_cmc_poll; + setup_timer(&cmc_poll_timer, ia64_mca_cmc_poll, 0UL); /* 
Unmask/enable the vector */ cmc_polling_enabled = 0; @@ -2129,8 +2128,7 @@ ia64_mca_late_init(void) #ifdef CONFIG_ACPI /* Setup the CPEI/P vector and handler */ cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI); - init_timer(&cpe_poll_timer); - cpe_poll_timer.function = ia64_mca_cpe_poll; + setup_timer(&cpe_poll_timer, ia64_mca_cpe_poll, 0UL); { unsigned int irq; diff --git a/arch/m68k/68000/m68EZ328.c b/arch/m68k/68000/m68EZ328.c index 21952906e9e2..e6ab321f93f8 100644 --- a/arch/m68k/68000/m68EZ328.c +++ b/arch/m68k/68000/m68EZ328.c @@ -62,8 +62,7 @@ void __init config_BSP(char *command, int len) #ifdef CONFIG_UCSIMM printk(KERN_INFO "uCsimm serial string [%s]\n",getserialnum()); p = cs8900a_hwaddr = gethwaddr(0); - printk(KERN_INFO "uCsimm hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n", - p[0], p[1], p[2], p[3], p[4], p[5]); + printk(KERN_INFO "uCsimm hwaddr %pM\n", p); p = getbenv("APPEND"); if (p) strcpy(p,command); diff --git a/arch/m68k/68000/m68VZ328.c b/arch/m68k/68000/m68VZ328.c index 0e5e5a10a021..1154bdb220a0 100644 --- a/arch/m68k/68000/m68VZ328.c +++ b/arch/m68k/68000/m68VZ328.c @@ -152,8 +152,7 @@ static void __init init_hardware(char *command, int size) printk(KERN_INFO "uCdimm serial string [%s]\n", getserialnum()); p = cs8900a_hwaddr = gethwaddr(0); - printk(KERN_INFO "uCdimm hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n", - p[0], p[1], p[2], p[3], p[4], p[5]); + printk(KERN_INFO "uCdimm hwaddr %pM\n", p); p = getbenv("APPEND"); if (p) strcpy(p, command); diff --git a/arch/m68k/68360/config.c b/arch/m68k/68360/config.c index fd1f948c7129..b65fe4eed38e 100644 --- a/arch/m68k/68360/config.c +++ b/arch/m68k/68360/config.c @@ -154,8 +154,7 @@ void __init config_BSP(char *command, int len) #if defined(CONFIG_UCQUICC) && 0 printk(KERN_INFO "uCquicc serial string [%s]\n",getserialnum()); p = scc1_hwaddr = gethwaddr(0); - printk(KERN_INFO "uCquicc hwaddr %.2x:%.2x:%.2x:%.2x:%.2x:%.2x\n", - p[0], p[1], p[2], p[3], p[4], p[5]); + printk(KERN_INFO "uCquicc hwaddr 
%pM\n", p); p = getbenv("APPEND"); if (p) diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index 6e67847f5272..28a9885e3a37 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -44,23 +44,9 @@ config HOSTFS If you'd like to be able to work with files stored on the host, say Y or M here; otherwise say N. -config HPPFS - tristate "HoneyPot ProcFS" - depends on PROC_FS - help - hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc - entries to be overridden, removed, or fabricated from the host. - Its purpose is to allow a UML to appear to be a physical machine - by removing or changing anything in /proc which gives away the - identity of a UML. - - See <http://user-mode-linux.sf.net/old/hppfs.html> for more information. - - You only need this if you are setting up a UML honeypot. Otherwise, - it is safe to say 'N' here. - config MCONSOLE bool "Management console" + depends on PROC_FS default y help The user mode linux management console is a low-level interface to diff --git a/arch/um/Makefile b/arch/um/Makefile index 17d4460b1af3..098ab3333e7c 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -68,9 +68,10 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ KBUILD_AFLAGS += $(ARCH_INCLUDE) -USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\ - $(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \ - $(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 -idirafter include +USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ + $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \ + -D_FILE_OFFSET_BITS=64 -idirafter include \ + -D__KERNEL__ -D__UM_HOST__ #This will adjust *FLAGS accordingly to the platform. 
include $(ARCH_DIR)/Makefile-os-$(OS) diff --git a/arch/um/drivers/harddog_user.c b/arch/um/drivers/harddog_user.c index f99b32a4dbff..3aa8b0d52a48 100644 --- a/arch/um/drivers/harddog_user.c +++ b/arch/um/drivers/harddog_user.c @@ -9,8 +9,8 @@ #include <os.h> struct dog_data { - int stdin; - int stdout; + int stdin_fd; + int stdout_fd; int close_me[2]; }; @@ -18,11 +18,11 @@ static void pre_exec(void *d) { struct dog_data *data = d; - dup2(data->stdin, 0); - dup2(data->stdout, 1); - dup2(data->stdout, 2); - close(data->stdin); - close(data->stdout); + dup2(data->stdin_fd, 0); + dup2(data->stdout_fd, 1); + dup2(data->stdout_fd, 2); + close(data->stdin_fd); + close(data->stdout_fd); close(data->close_me[0]); close(data->close_me[1]); } @@ -49,8 +49,8 @@ int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock) goto out_close_in; } - data.stdin = out_fds[0]; - data.stdout = in_fds[1]; + data.stdin_fd = out_fds[0]; + data.stdout_fd = in_fds[1]; data.close_me[0] = out_fds[1]; data.close_me[1] = in_fds[0]; diff --git a/arch/um/drivers/mconsole.h b/arch/um/drivers/mconsole.h index 8b22535c62ce..44af7379ea19 100644 --- a/arch/um/drivers/mconsole.h +++ b/arch/um/drivers/mconsole.h @@ -7,7 +7,7 @@ #ifndef __MCONSOLE_H__ #define __MCONSOLE_H__ -#ifndef __KERNEL__ +#ifdef __UM_HOST__ #include <stdint.h> #define u32 uint32_t #endif diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c index cd14157b556d..e697a4136707 100644 --- a/arch/um/drivers/net_user.c +++ b/arch/um/drivers/net_user.c @@ -166,7 +166,7 @@ int net_sendto(int fd, void *buf, int len, void *to, int sock_len) struct change_pre_exec_data { int close_me; - int stdout; + int stdout_fd; }; static void change_pre_exec(void *arg) @@ -174,7 +174,7 @@ static void change_pre_exec(void *arg) struct change_pre_exec_data *data = arg; close(data->close_me); - dup2(data->stdout, 1); + dup2(data->stdout_fd, 1); } static int change_tramp(char **argv, char *output, int output_len) @@ -189,7 +189,7 @@ static 
int change_tramp(char **argv, char *output, int output_len) return err; } pe_data.close_me = fds[0]; - pe_data.stdout = fds[1]; + pe_data.stdout_fd = fds[1]; pid = run_helper(change_pre_exec, &pe_data, argv); if (pid > 0) /* Avoid hang as we won't get data in failure case. */ diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c index 55c290d925f3..0d6b66c64a81 100644 --- a/arch/um/drivers/slip_user.c +++ b/arch/um/drivers/slip_user.c @@ -55,8 +55,8 @@ static int set_up_tty(int fd) } struct slip_pre_exec_data { - int stdin; - int stdout; + int stdin_fd; + int stdout_fd; int close_me; }; @@ -64,9 +64,9 @@ static void slip_pre_exec(void *arg) { struct slip_pre_exec_data *data = arg; - if (data->stdin >= 0) - dup2(data->stdin, 0); - dup2(data->stdout, 1); + if (data->stdin_fd >= 0) + dup2(data->stdin_fd, 0); + dup2(data->stdout_fd, 1); if (data->close_me >= 0) close(data->close_me); } @@ -85,8 +85,8 @@ static int slip_tramp(char **argv, int fd) } err = 0; - pe_data.stdin = fd; - pe_data.stdout = fds[1]; + pe_data.stdin_fd = fd; + pe_data.stdout_fd = fds[1]; pe_data.close_me = fds[0]; err = run_helper(slip_pre_exec, &pe_data, argv); if (err < 0) diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c index c999d187abb9..98b6a41a254e 100644 --- a/arch/um/drivers/slirp_user.c +++ b/arch/um/drivers/slirp_user.c @@ -20,18 +20,18 @@ static int slirp_user_init(void *data, void *dev) } struct slirp_pre_exec_data { - int stdin; - int stdout; + int stdin_fd; + int stdout_fd; }; static void slirp_pre_exec(void *arg) { struct slirp_pre_exec_data *data = arg; - if (data->stdin != -1) - dup2(data->stdin, 0); - if (data->stdout != -1) - dup2(data->stdout, 1); + if (data->stdin_fd != -1) + dup2(data->stdin_fd, 0); + if (data->stdout_fd != -1) + dup2(data->stdout_fd, 1); } static int slirp_tramp(char **argv, int fd) @@ -39,8 +39,8 @@ static int slirp_tramp(char **argv, int fd) struct slirp_pre_exec_data pe_data; int pid; - pe_data.stdin = fd; - 
pe_data.stdout = fd; + pe_data.stdin_fd = fd; + pe_data.stdout_fd = fd; pid = run_helper(slirp_pre_exec, &pe_data, argv); return pid; diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index b7df3ae9be51..3d63ff6f583f 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -21,7 +21,6 @@ generic-y += param.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h -generic-y += sections.h generic-y += switch_to.h generic-y += topology.h generic-y += trace_clock.h diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h index cb9b3c47ca8e..2966adbbdf6c 100644 --- a/arch/um/include/asm/ptrace-generic.h +++ b/arch/um/include/asm/ptrace-generic.h @@ -8,7 +8,6 @@ #ifndef __ASSEMBLY__ -#include <asm/ptrace-abi.h> #include <sysdep/ptrace.h> struct pt_regs { @@ -37,7 +36,7 @@ extern int putreg(struct task_struct *child, int regno, unsigned long value); extern int arch_copy_tls(struct task_struct *new); extern void clear_flushed_tls(struct task_struct *task); -extern void syscall_trace_enter(struct pt_regs *regs); +extern int syscall_trace_enter(struct pt_regs *regs); extern void syscall_trace_leave(struct pt_regs *regs); #endif diff --git a/arch/um/include/asm/sections.h b/arch/um/include/asm/sections.h new file mode 100644 index 000000000000..cafcf684d947 --- /dev/null +++ b/arch/um/include/asm/sections.h @@ -0,0 +1,9 @@ +#ifndef __UM_SECTIONS_H +#define __UM_SECTIONS_H + +#include <asm-generic/sections.h> + +extern char __binary_start[]; +extern char __syscall_stub_start[], __syscall_stub_end[]; + +#endif diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index b30c85b141d9..53968aaf76f9 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -10,7 +10,7 @@ #include <asm/types.h> #include <asm/page.h> -#include <asm/uaccess.h> +#include <asm/segment.h> struct thread_info { struct task_struct *task; /* main task structure */ diff 
--git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 3f22fbf7ca1d..3705620ca298 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -1,178 +1,52 @@ /* * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2015 Richard Weinberger (richard@nod.at) * Licensed under the GPL */ #ifndef __UM_UACCESS_H #define __UM_UACCESS_H -/* thread_info has a mm_segment_t in it, so put the definition up here */ -typedef struct { - unsigned long seg; -} mm_segment_t; - -#include <linux/thread_info.h> -#include <linux/errno.h> -#include <asm/processor.h> +#include <asm/thread_info.h> #include <asm/elf.h> -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - -/* - * The fs value determines whether argument validity checking should be - * performed or not. If get_fs() == USER_DS, checking is performed, with - * get_fs() == KERNEL_DS, checking is bypassed. - * - * For historical reasons, these macros are grossly misnamed. - */ - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -#define USER_DS MAKE_MM_SEG(TASK_SIZE) - -#define get_ds() (KERNEL_DS) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) - -#define segment_eq(a, b) ((a).seg == (b).seg) - #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ (((unsigned long) (addr) + (size)) < TASK_SIZE)) -#define __access_ok_vsyscall(type, addr, size) \ - ((type == VERIFY_READ) && \ - ((unsigned long) (addr) >= FIXADDR_USER_START) && \ +#define __access_ok_vsyscall(addr, size) \ + (((unsigned long) (addr) >= FIXADDR_USER_START) && \ ((unsigned long) (addr) + (size) <= FIXADDR_USER_END) && \ ((unsigned long) (addr) + (size) >= (unsigned long)(addr))) #define __addr_range_nowrap(addr, size) \ ((unsigned long) (addr) <= ((unsigned long) (addr) + (size))) -#define access_ok(type, addr, size) \ - (__addr_range_nowrap(addr, size) && \ - 
(__under_task_size(addr, size) || \ - __access_ok_vsyscall(type, addr, size) || \ - segment_eq(get_fs(), KERNEL_DS))) - -extern int copy_from_user(void *to, const void __user *from, int n); -extern int copy_to_user(void __user *to, const void *from, int n); - -/* - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ - -extern int strncpy_from_user(char *dst, const char __user *src, int count); - -/* - * __clear_user: - Zero a block of memory in user space, with less checking. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. Caller must check - * the specified block with access_ok() before calling this function. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -extern int __clear_user(void __user *mem, int len); - -/* - * clear_user: - Zero a block of memory in user space. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -extern int clear_user(void __user *mem, int len); - -/* - * strlen_user: - Get the size of a string in user space. - * @str: The string to measure. - * @n: The maximum valid length - * - * Get the size of a NUL-terminated string in user space. 
- * - * Returns the size of the string INCLUDING the terminating NUL. - * On exception, returns 0. - * If the string is too long, returns a value greater than @n. - */ -extern int strnlen_user(const void __user *str, int len); - -#define __copy_from_user(to, from, n) copy_from_user(to, from, n) - -#define __copy_to_user(to, from, n) copy_to_user(to, from, n) - +extern long __copy_from_user(void *to, const void __user *from, unsigned long n); +extern long __copy_to_user(void __user *to, const void *from, unsigned long n); +extern long __strncpy_from_user(char *dst, const char __user *src, long count); +extern long __strnlen_user(const void __user *str, long len); +extern unsigned long __clear_user(void __user *mem, unsigned long len); +static inline int __access_ok(unsigned long addr, unsigned long size); + +/* Teach asm-generic/uaccess.h that we have C functions for these. */ +#define __access_ok __access_ok +#define __clear_user __clear_user +#define __copy_to_user __copy_to_user +#define __copy_from_user __copy_from_user +#define __strnlen_user __strnlen_user +#define __strncpy_from_user __strncpy_from_user #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user -#define __get_user(x, ptr) \ -({ \ - const __typeof__(*(ptr)) __user *__private_ptr = (ptr); \ - __typeof__(x) __private_val; \ - int __private_ret = -EFAULT; \ - (x) = (__typeof__(*(__private_ptr)))0; \ - if (__copy_from_user((__force void *)&__private_val, (__private_ptr),\ - sizeof(*(__private_ptr))) == 0) { \ - (x) = (__typeof__(*(__private_ptr))) __private_val; \ - __private_ret = 0; \ - } \ - __private_ret; \ -}) - -#define get_user(x, ptr) \ -({ \ - const __typeof__((*(ptr))) __user *private_ptr = (ptr); \ - (access_ok(VERIFY_READ, private_ptr, sizeof(*private_ptr)) ? 
\ - __get_user(x, private_ptr) : ((x) = (__typeof__(*ptr))0, -EFAULT)); \ -}) - -#define __put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __user *__private_ptr = ptr; \ - __typeof__(*(__private_ptr)) __private_val; \ - int __private_ret = -EFAULT; \ - __private_val = (__typeof__(*(__private_ptr))) (x); \ - if (__copy_to_user((__private_ptr), &__private_val, \ - sizeof(*(__private_ptr))) == 0) { \ - __private_ret = 0; \ - } \ - __private_ret; \ -}) - -#define put_user(x, ptr) \ -({ \ - __typeof__(*(ptr)) __user *private_ptr = (ptr); \ - (access_ok(VERIFY_WRITE, private_ptr, sizeof(*private_ptr)) ? \ - __put_user(x, private_ptr) : -EFAULT); \ -}) - -#define strlen_user(str) strnlen_user(str, ~0U >> 1) +#include <asm-generic/uaccess.h> -struct exception_table_entry +static inline int __access_ok(unsigned long addr, unsigned long size) { - unsigned long insn; - unsigned long fixup; -}; + return __addr_range_nowrap(addr, size) && + (__under_task_size(addr, size) || + __access_ok_vsyscall(addr, size) || + segment_eq(get_fs(), KERNEL_DS)); +} #endif diff --git a/arch/um/include/shared/init.h b/arch/um/include/shared/init.h index b3906f860a87..233e2593eee0 100644 --- a/arch/um/include/shared/init.h +++ b/arch/um/include/shared/init.h @@ -40,28 +40,8 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -#ifndef __KERNEL__ -#ifndef __section -# define __section(S) __attribute__ ((__section__(#S))) -#endif - -#if __GNUC__ == 3 - -#if __GNUC_MINOR__ >= 3 -# define __used __attribute__((__used__)) -#else -# define __used __attribute__((__unused__)) -#endif - -#else -#if __GNUC__ == 4 -# define __used __attribute__((__used__)) -#endif -#endif - -#else #include <linux/compiler.h> -#endif + /* These are for everybody (although not all archs will actually discard it in modules) */ #define __init __section(.init.text) @@ -131,7 +111,7 @@ extern struct uml_param __uml_setup_start, __uml_setup_end; #define __uml_postsetup_call __used __section(.uml.postsetup.init) 
#define __uml_exit_call __used __section(.uml.exitcall.exit) -#ifndef __KERNEL__ +#ifdef __UM_HOST__ #define __define_initcall(level,fn) \ static initcall_t __initcall_##fn __used \ diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index d824528f6f62..ad3fa3ae6d34 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -301,4 +301,6 @@ extern int get_pty(void); /* sys-$ARCH/task_size.c */ extern unsigned long os_get_top_address(void); +long syscall(long number, ...); + #endif diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index cef068563336..4cff19f6207a 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -17,7 +17,7 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) /* This is to get size_t */ -#ifdef __KERNEL__ +#ifndef __UM_HOST__ #include <linux/types.h> #else #include <stddef.h> diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 543c04756939..232b22307fdd 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(os_makedev); EXPORT_SYMBOL(add_sigio_fd); EXPORT_SYMBOL(ignore_sigio_fd); EXPORT_SYMBOL(sigio_broken); + +EXPORT_SYMBOL(syscall); diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 9034fc8056b4..4c9861b421fd 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -8,6 +8,7 @@ #include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> +#include <asm/sections.h> #include <as-layout.h> #include <init.h> #include <kern.h> @@ -55,8 +56,6 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, } } -extern int __syscall_stub_start; - /** * setup_physmem() - Setup physical memory for UML * @start: Start address of the physical kernel memory, @@ -110,8 +109,8 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, * Special kludge - This page will be mapped in to userspace processes * from physmem_fd, so it needs to be 
written out there. */ - os_seek_file(physmem_fd, __pa(&__syscall_stub_start)); - os_write_file(physmem_fd, &__syscall_stub_start, PAGE_SIZE); + os_seek_file(physmem_fd, __pa(__syscall_stub_start)); + os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); os_fsync_file(physmem_fd); bootmap_size = init_bootmem(pfn, pfn + delta); diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 174ee5017264..6a826cbb15c4 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/tracehook.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> void user_enable_single_step(struct task_struct *child) { @@ -131,7 +132,7 @@ static void send_sigtrap(struct task_struct *tsk, struct uml_pt_regs *regs, * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check */ -void syscall_trace_enter(struct pt_regs *regs) +int syscall_trace_enter(struct pt_regs *regs) { audit_syscall_entry(UPT_SYSCALL_NR(&regs->regs), UPT_SYSCALL_ARG1(&regs->regs), @@ -140,9 +141,9 @@ void syscall_trace_enter(struct pt_regs *regs) UPT_SYSCALL_ARG4(&regs->regs)); if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; + return 0; - tracehook_report_syscall_entry(regs); + return tracehook_report_syscall_entry(regs); } void syscall_trace_leave(struct pt_regs *regs) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 94abdcc1d6ad..fda1deba1757 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -8,12 +8,11 @@ #include <linux/slab.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> +#include <asm/sections.h> #include <as-layout.h> #include <os.h> #include <skas.h> -extern int __syscall_stub_start; - static int init_stub_pte(struct mm_struct *mm, unsigned long proc, unsigned long kernel) { @@ -93,7 +92,7 @@ void uml_setup_stubs(struct mm_struct *mm) int err, ret; ret = init_stub_pte(mm, STUB_CODE, - (unsigned long) &__syscall_stub_start);
+ (unsigned long) __syscall_stub_start); if (ret) goto out; @@ -101,7 +100,7 @@ void uml_setup_stubs(struct mm_struct *mm) if (ret) goto out; - mm->context.stub_pages[0] = virt_to_page(&__syscall_stub_start); + mm->context.stub_pages[0] = virt_to_page(__syscall_stub_start); mm->context.stub_pages[1] = virt_to_page(mm->context.id.stack); /* dup_mmap already holds mmap_sem */ diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index c0681e097432..d9ec0068b623 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -18,7 +18,10 @@ void handle_syscall(struct uml_pt_regs *r) long result; int syscall; - syscall_trace_enter(regs); + if (syscall_trace_enter(regs)) { + result = -ENOSYS; + goto out; + } /* * This should go in the declaration of syscall, but when I do that, @@ -34,6 +37,7 @@ void handle_syscall(struct uml_pt_regs *r) result = -ENOSYS; else result = EXECUTE_SYSCALL(syscall, regs); +out: PT_REGS_SET_SYSCALL_RETURN(regs, result); syscall_trace_leave(regs); diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 4ffb644d6c07..85ac8adb069b 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -87,10 +87,10 @@ static int do_op_one_page(unsigned long addr, int len, int is_write, return n; } -static int buffer_op(unsigned long addr, int len, int is_write, - int (*op)(unsigned long, int, void *), void *arg) +static long buffer_op(unsigned long addr, int len, int is_write, + int (*op)(unsigned long, int, void *), void *arg) { - int size, remain, n; + long size, remain, n; size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); remain = len; @@ -139,18 +139,16 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int copy_from_user(void *to, const void __user *from, int n) +long __copy_from_user(void *to, const void __user *from, unsigned long n) { if (segment_eq(get_fs(), KERNEL_DS)) { memcpy(to, (__force void*)from, n); return 0; } 
- return access_ok(VERIFY_READ, from, n) ? - buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to): - n; + return buffer_op((unsigned long) from, n, 0, copy_chunk_from_user, &to); } -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(__copy_from_user); static int copy_chunk_to_user(unsigned long to, int len, void *arg) { @@ -161,18 +159,16 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg) return 0; } -int copy_to_user(void __user *to, const void *from, int n) +long __copy_to_user(void __user *to, const void *from, unsigned long n) { if (segment_eq(get_fs(), KERNEL_DS)) { memcpy((__force void *) to, from, n); return 0; } - return access_ok(VERIFY_WRITE, to, n) ? - buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from) : - n; + return buffer_op((unsigned long) to, n, 1, copy_chunk_to_user, &from); } -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(__copy_to_user); static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) { @@ -188,9 +184,9 @@ static int strncpy_chunk_from_user(unsigned long from, int len, void *arg) return 0; } -int strncpy_from_user(char *dst, const char __user *src, int count) +long __strncpy_from_user(char *dst, const char __user *src, long count) { - int n; + long n; char *ptr = dst; if (segment_eq(get_fs(), KERNEL_DS)) { @@ -198,16 +194,13 @@ int strncpy_from_user(char *dst, const char __user *src, int count) return strnlen(dst, count); } - if (!access_ok(VERIFY_READ, src, 1)) - return -EFAULT; - n = buffer_op((unsigned long) src, count, 0, strncpy_chunk_from_user, &ptr); if (n != 0) return -EFAULT; return strnlen(dst, count); } -EXPORT_SYMBOL(strncpy_from_user); +EXPORT_SYMBOL(__strncpy_from_user); static int clear_chunk(unsigned long addr, int len, void *unused) { @@ -215,22 +208,16 @@ static int clear_chunk(unsigned long addr, int len, void *unused) return 0; } -int __clear_user(void __user *mem, int len) -{ - return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL); -} - -int 
clear_user(void __user *mem, int len) +unsigned long __clear_user(void __user *mem, unsigned long len) { if (segment_eq(get_fs(), KERNEL_DS)) { memset((__force void*)mem, 0, len); return 0; } - return access_ok(VERIFY_WRITE, mem, len) ? - buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL) : len; + return buffer_op((unsigned long) mem, len, 1, clear_chunk, NULL); } -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); static int strnlen_chunk(unsigned long str, int len, void *arg) { @@ -244,7 +231,7 @@ static int strnlen_chunk(unsigned long str, int len, void *arg) return 0; } -int strnlen_user(const void __user *str, int len) +long __strnlen_user(const void __user *str, long len) { int count = 0, n; @@ -256,4 +243,4 @@ int strnlen_user(const void __user *str, int len) return count + 1; return 0; } -EXPORT_SYMBOL(strnlen_user); +EXPORT_SYMBOL(__strnlen_user); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 47ff9b7f3e5d..557232f758b6 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -220,6 +220,11 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } + else if (!is_user && address < TASK_SIZE) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx", + address, ip); + } if (SEGV_IS_FIXABLE(&fi)) err = handle_page_fault(address, ip, is_write, is_user, diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 07f798f4bcee..16630e75f056 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -248,8 +248,6 @@ EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -extern char __binary_start; - int __init linux_main(int argc, char **argv) { unsigned long avail, diff; @@ -294,7 +292,7 @@ int __init linux_main(int argc, char **argv) physmem_size += UML_ROUND_UP(brk_start) - UML_ROUND_UP(&_end); } - uml_physmem = (unsigned 
long) &__binary_start & PAGE_MASK; + uml_physmem = (unsigned long) __binary_start & PAGE_MASK; /* Reserve up to 4M after the current brk */ uml_reserved = ROUND_4M(brk_start) + (1 << 22); diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c index 14126d9176aa..c2e6e1dad876 100644 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ b/arch/um/os-Linux/drivers/tuntap_user.c @@ -47,7 +47,7 @@ static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, } struct tuntap_pre_exec_data { - int stdout; + int stdout_fd; int close_me; }; @@ -55,7 +55,7 @@ static void tuntap_pre_exec(void *arg) { struct tuntap_pre_exec_data *data = arg; - dup2(data->stdout, 1); + dup2(data->stdout_fd, 1); close(data->close_me); } @@ -74,7 +74,7 @@ static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, sprintf(version_buf, "%d", UML_NET_VERSION); - data.stdout = remote; + data.stdout_fd = remote; data.close_me = me; pid = run_helper(tuntap_pre_exec, &data, argv); diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 08d90fba952c..26e0164895e4 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -13,6 +13,7 @@ #include <sys/socket.h> #include <sys/stat.h> #include <sys/un.h> +#include <sys/types.h> #include <os.h> static void copy_stat(struct uml_stat *dst, const struct stat64 *src) diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 7b605e4dfffa..036d0dbc7b52 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -112,9 +112,11 @@ void timer_init(void) void set_sigstack(void *sig_stack, int size) { - stack_t stack = ((stack_t) { .ss_flags = 0, - .ss_sp = (__ptr_t) sig_stack, - .ss_size = size - sizeof(void *) }); + stack_t stack = { + .ss_flags = 0, + .ss_sp = sig_stack, + .ss_size = size - sizeof(void *) + }; if (sigaltstack(&stack, NULL) != 0) panic("enabling signal stack failed, errno = %d\n", errno); diff --git a/arch/um/os-Linux/skas/mem.c 
b/arch/um/os-Linux/skas/mem.c index e7f8c945a573..35015e3e1e87 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -18,7 +18,7 @@ #include <sysdep/ptrace.h> #include <sysdep/stub.h> -extern unsigned long batch_syscall_stub, __syscall_stub_start; +extern char batch_syscall_stub[], __syscall_stub_start[]; extern void wait_stub_done(int pid); @@ -38,8 +38,8 @@ static int __init init_syscall_regs(void) { get_safe_registers(syscall_regs, NULL); syscall_regs[REGS_IP_INDEX] = STUB_CODE + - ((unsigned long) &batch_syscall_stub - - (unsigned long) &__syscall_stub_start); + ((unsigned long) batch_syscall_stub - + (unsigned long) __syscall_stub_start); return 0; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 7a9777570a62..3dddedba3a07 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -174,7 +174,7 @@ static void handle_trap(int pid, struct uml_pt_regs *regs, handle_syscall(regs); } -extern int __syscall_stub_start; +extern char __syscall_stub_start[]; static int userspace_tramp(void *stack) { @@ -197,7 +197,7 @@ static int userspace_tramp(void *stack) * This has a pte, but it can't be mapped in with the usual * tlb_flush mechanism because this is part of that mechanism */ - fd = phys_mapping(to_phys(&__syscall_stub_start), &offset); + fd = phys_mapping(to_phys(__syscall_stub_start), &offset); addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); if (addr == MAP_FAILED) { @@ -223,7 +223,7 @@ static int userspace_tramp(void *stack) unsigned long v = STUB_CODE + (unsigned long) stub_segv_handler - - (unsigned long) &__syscall_stub_start; + (unsigned long) __syscall_stub_start; set_sigstack((void *) STUB_DATA, UM_KERN_PAGE_SIZE); sigemptyset(&sa.sa_mask); @@ -447,7 +447,7 @@ static int __init init_thread_regs(void) /* Set parent's instruction pointer to start of clone-stub */ thread_regs[REGS_IP_INDEX] = STUB_CODE + (unsigned long) 
stub_clone_handler - - (unsigned long) &__syscall_stub_start; + (unsigned long) __syscall_stub_start; thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE - sizeof(void *); #ifdef __SIGNAL_FRAMESIZE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4fcf0ade7e91..d05a42357ef0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,6 +27,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -1419,6 +1420,9 @@ source "mm/Kconfig" config X86_PMEM_LEGACY bool "Support non-standard NVDIMMs and ADR protected memory" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used by the Intel Sandy Bridge-EP reference BIOS as protected memory. diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 48304b89b601..2c82bd150d43 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params, e820_type = E820_NVS; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; + default: continue; } diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index b6f7457d12e4..9bf3ea14b9f0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -4,6 +4,7 @@ /* Caches aren't brain-dead on the intel. 
*/ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> +#include <asm/uaccess.h> /* * The set_memory_* API can be used to change various attributes of a virtual @@ -108,4 +109,75 @@ static inline int rodata_test(void) } #endif +#ifdef ARCH_HAS_NOCACHE_UACCESS + +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). 
+ */ + wmb(); + pcommit_sfence(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ +#ifdef CONFIG_X86_64 + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +#else + return false; +#endif +} +#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ +extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); +extern void arch_wmb_pmem(void); + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} +#endif + #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 83ec9b1d77cc..cc9c61bc1abe 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -248,6 +248,12 @@ static inline void flush_write_buffers(void) #endif } +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return (void __force __pmem *) ioremap_cache(offset, size); +} + #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 960a8a9dc4ab..0f457e6eab18 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -32,6 +32,7 @@ #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 +#define E820_PMEM 7 /* * This is a non-standardized way to represent ADR or NVDIMM regions that diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c8dda42cb6a3..a102564d08eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PMEM: case E820_PRAM: printk(KERN_CONT "persistent (type %u)", type); break; @@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; 
- case E820_PRAM: return "Persistent RAM"; + case E820_PRAM: return "Persistent Memory (legacy)"; + case E820_PMEM: return "Persistent Memory"; default: return "reserved"; } } +static bool do_mark_busy(u32 type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory like device memory, i.e. reserve it + * for exclusive use of a driver + */ + switch (type) { + case E820_RESERVED: + case E820_PRAM: + case E820_PMEM: + return false; + default: + return true; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -952,9 +974,7 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (((e820.map[i].type != E820_RESERVED) && - (e820.map[i].type != E820_PRAM)) || - res->start < (1ULL<<20)) { + if (do_mark_busy(e820.map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3420c874ddc5..64f90f53bb85 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -1,53 +1,82 @@ /* * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. 
*/ -#include <linux/memblock.h> #include <linux/platform_device.h> -#include <linux/slab.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> #include <asm/e820.h> -#include <asm/page_types.h> -#include <asm/setup.h> -static __init void register_pmem_device(struct resource *res) +static void e820_pmem_release(struct device *dev) { - struct platform_device *pdev; - int error; + struct nvdimm_bus *nvdimm_bus = dev->platform_data; - pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); - if (!pdev) - return; + if (nvdimm_bus) + nvdimm_bus_unregister(nvdimm_bus); +} - error = platform_device_add_resources(pdev, res, 1); - if (error) - goto out_put_pdev; +static struct platform_device e820_pmem = { + .name = "e820_pmem", + .id = -1, + .dev = { + .release = e820_pmem_release, + }, +}; - error = platform_device_add(pdev); - if (error) - goto out_put_pdev; - return; +static const struct attribute_group *e820_pmem_attribute_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; -out_put_pdev: - dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); - platform_device_put(pdev); -} +static const struct attribute_group *e820_pmem_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + NULL, +}; -static __init int register_pmem_devices(void) +static __init int register_e820_pmem(void) { - int i; + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &e820_pmem.dev; + struct nvdimm_bus *nvdimm_bus; + int rc, i; + + rc = platform_device_register(&e820_pmem); + if (rc) + return rc; + + nd_desc.attr_groups = e820_pmem_attribute_groups; + nd_desc.provider_name = "e820"; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + dev->platform_data = nvdimm_bus; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + struct resource res = { + .flags = IORESOURCE_MEM, + .start = ei->addr, + .end = ei->addr + ei->size - 1, + }; + struct nd_region_desc 
ndr_desc; + + if (ei->type != E820_PRAM) + continue; - if (ei->type == E820_PRAM) { - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - register_pmem_device(&res); - } + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = &res; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = NUMA_NO_NODE; + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err; } return 0; + + err: + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + platform_device_unregister(&e820_pmem); + return -ENXIO; } -device_initcall(register_pmem_devices); +device_initcall(register_e820_pmem); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c1c382c58c60..cfba30f27392 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -174,6 +174,9 @@ static void __init do_add_efi_memmap(void) case EFI_UNUSABLE_MEMORY: e820_type = E820_UNUSABLE; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; default: /* * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index 4b181b74454f..ee940185e89f 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/in6.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 0a656b727b1a..548197212a45 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -200,8 +200,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; -#define task_pt_regs(t) (&(t)->thread.regs) - struct task_struct; extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 2a206d2b14ab..233ee09c1ce8 100644 --- 
a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -28,6 +28,8 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() #define cpu_relax_lowlatency() cpu_relax() +#define task_pt_regs(t) (&(t)->thread.regs) + #include <asm/processor-generic.h> #endif diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h index 45183fcd10b6..41dd5e1f3cd7 100644 --- a/arch/x86/um/asm/segment.h +++ b/arch/x86/um/asm/segment.h @@ -7,4 +7,12 @@ extern int host_gdt_entry_tls_min; #define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +typedef struct { + unsigned long seg; +} mm_segment_t; + +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) +#define KERNEL_DS MAKE_MM_SEG(~0UL) +#define USER_DS MAKE_MM_SEG(TASK_SIZE) + #endif diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c index 5c0b711d2433..9701a4fd7bf2 100644 --- a/arch/x86/um/ldt.c +++ b/arch/x86/um/ldt.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <os.h> #include <skas.h> diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index f40281e5d6a2..744afdc18cf3 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -7,8 +7,7 @@ */ #include <linux/mm.h> -#include <asm/page.h> -#include <asm/mman.h> +#include <asm/elf.h> static struct vm_area_struct gate_vma; diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index f8fecaddcc0d..7642e2e2aa61 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -1,6 +1,5 @@ #include <linux/mm.h> -#include <asm/page.h> -#include <asm/mman.h> +#include <asm/elf.h> const char *arch_vma_name(struct vm_area_struct *vma) { diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ce3dd4f36f3f..a29756f2d940 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <asm/uaccess.h> 
+#include <asm/ptrace-abi.h> #include <skas.h> extern int arch_switch_tls(struct task_struct *to); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 3b52bf0b418a..a629694ee750 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -11,6 +11,7 @@ #define __FRAME_OFFSETS #include <asm/ptrace.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> /* * determines which flags the user has access to. diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index 27cce00c6b30..a682db13df23 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -1,7 +1,7 @@ #ifndef _SYSDEP_TLS_H #define _SYSDEP_TLS_H -# ifndef __KERNEL__ +#ifdef __UM_HOST__ /* Change name to avoid conflicts with the original one from <asm/ldt.h>, which * may be named user_desc (but in 2.4 and in header matching its API was named @@ -22,11 +22,11 @@ typedef struct um_dup_user_desc { #endif } user_desc_t; -# else /* __KERNEL__ */ +#else /* __UM_HOST__ */ typedef struct user_desc user_desc_t; -# endif /* __KERNEL__ */ +#endif /* __UM_HOST__ */ extern int os_set_thread_area(user_desc_t *info, int pid); extern int os_get_thread_area(user_desc_t *info, int pid); diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 592491d1d70d..06934a8a4872 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -541,7 +541,8 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, */ /* x86-64 should always use SA_RESTORER. 
*/ if (ksig->ka.sa.sa_flags & SA_RESTORER) - err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode); + err |= __put_user((void *)ksig->ka.sa.sa_restorer, + &frame->pretcode); else /* could use a vstub here */ return err; diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index adb08eb5c22a..e6552275320b 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -6,6 +6,7 @@ */ #include <linux/sched.h> +#include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 80ffa5b9982d..48e38584d5c1 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> #include <os.h> #include <skas.h> #include <sysdep/tls.h> diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c index d22363cb854e..3ad714373d7f 100644 --- a/arch/x86/um/tls_64.c +++ b/arch/x86/um/tls_64.c @@ -1,4 +1,5 @@ #include <linux/sched.h> +#include <asm/ptrace-abi.h> void clear_flushed_tls(struct task_struct *task) { diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 916cda4cd5b4..237c6831e095 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -10,6 +10,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <asm/page.h> +#include <asm/elf.h> #include <linux/init.h> static unsigned int __read_mostly vdso_enabled = 1; diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 55b6f15dac90..dda653ce7b24 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_put_request; } - ret = -EFAULT; - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + ret = blk_fill_sghdr_rq(q, rq, hdr, mode); + if (ret < 0) goto out_free_cdb; ret = 0; diff --git a/drivers/Kconfig b/drivers/Kconfig index c0cc96bab9e7..6e973b8e3a3b 100644 --- 
a/drivers/Kconfig +++ b/drivers/Kconfig @@ -182,4 +182,6 @@ source "drivers/thunderbolt/Kconfig" source "drivers/android/Kconfig" +source "drivers/nvdimm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 9a02fb7c5106..b64b49f6e01b 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ obj-y += base/ block/ misc/ mfd/ nfc/ +obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 35da507411a0..f15db002be8e 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -386,6 +386,32 @@ config ACPI_REDUCED_HARDWARE_ONLY If you are unsure what to do, do not enable this option. +config ACPI_NFIT + tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select LIBNVDIMM + help + Infrastructure to probe ACPI 6 compliant platforms for + NVDIMMs (NFIT) and register a libnvdimm device tree. In + addition to storage devices this also enables libnvdimm to pass + ACPI._DSM messages for platform/dimm configuration. + + To compile this driver as a module, choose M here: + the module will be called nfit. + +config ACPI_NFIT_DEBUG + bool "NFIT DSM debug" + depends on ACPI_NFIT + depends on DYNAMIC_DEBUG + default n + help + Enabling this option causes the nfit driver to dump the + input and output buffers of _DSM operations on the ACPI0012 + device and its children. This can be very verbose, so leave + it disabled unless you are debugging a hardware / firmware + issue. 
+ source "drivers/acpi/apei/Kconfig" config ACPI_EXTLOG diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 73d840bef455..8321430d7f24 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-y += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o +obj-$(CONFIG_ACPI_NFIT) += nfit.o obj-y += acpi_memhotplug.o obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o obj-$(CONFIG_ACPI_BATTERY) += battery.o diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c new file mode 100644 index 000000000000..2161fa178c8d --- /dev/null +++ b/drivers/acpi/nfit.c @@ -0,0 +1,1587 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/list_sort.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ndctl.h> +#include <linux/list.h> +#include <linux/acpi.h> +#include <linux/sort.h> +#include <linux/io.h> +#include "nfit.h" + +/* + * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is + * irrelevant. 
+ */ +#include <asm-generic/io-64-nonatomic-hi-lo.h> + +static bool force_enable_dimms; +module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); + +static u8 nfit_uuid[NFIT_UUID_MAX][16]; + +const u8 *to_nfit_uuid(enum nfit_uuids id) +{ + return nfit_uuid[id]; +} +EXPORT_SYMBOL(to_nfit_uuid); + +static struct acpi_nfit_desc *to_acpi_nfit_desc( + struct nvdimm_bus_descriptor *nd_desc) +{ + return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); +} + +static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + + /* + * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct + * acpi_device. + */ + if (!nd_desc->provider_name + || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0) + return NULL; + + return to_acpi_device(acpi_desc->dev); +} + +static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) +{ + struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc); + const struct nd_cmd_desc *desc = NULL; + union acpi_object in_obj, in_buf, *out_obj; + struct device *dev = acpi_desc->dev; + const char *cmd_name, *dimm_name; + unsigned long dsm_mask; + acpi_handle handle; + const u8 *uuid; + u32 offset; + int rc, i; + + if (nvdimm) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + struct acpi_device *adev = nfit_mem->adev; + + if (!adev) + return -ENOTTY; + dimm_name = nvdimm_name(nvdimm); + cmd_name = nvdimm_cmd_name(cmd); + dsm_mask = nfit_mem->dsm_mask; + desc = nd_cmd_dimm_desc(cmd); + uuid = to_nfit_uuid(NFIT_DEV_DIMM); + handle = adev->handle; + } else { + struct acpi_device *adev = to_acpi_dev(acpi_desc); + + cmd_name = nvdimm_bus_cmd_name(cmd); + dsm_mask = nd_desc->dsm_mask; + desc = nd_cmd_bus_desc(cmd); + uuid = to_nfit_uuid(NFIT_DEV_BUS); + handle = adev->handle; + dimm_name = "bus"; + } + + 
if (!desc || (cmd && (desc->out_num + desc->in_num == 0))) + return -ENOTTY; + + if (!test_bit(cmd, &dsm_mask)) + return -ENOTTY; + + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_BUFFER; + in_buf.buffer.pointer = buf; + in_buf.buffer.length = 0; + + /* libnvdimm has already validated the input envelope */ + for (i = 0; i < desc->in_num; i++) + in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc, + i, buf); + + if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { + dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__, + dimm_name, cmd_name, in_buf.buffer.length); + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, + 4, in_buf.buffer.pointer, min_t(u32, 128, + in_buf.buffer.length), true); + } + + out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj); + if (!out_obj) { + dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name, + cmd_name); + return -EINVAL; + } + + if (out_obj->package.type != ACPI_TYPE_BUFFER) { + dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n", + __func__, dimm_name, cmd_name, out_obj->type); + rc = -EINVAL; + goto out; + } + + if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { + dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, + dimm_name, cmd_name, out_obj->buffer.length); + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, + 4, out_obj->buffer.pointer, min_t(u32, 128, + out_obj->buffer.length), true); + } + + for (i = 0, offset = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, + (u32 *) out_obj->buffer.pointer); + + if (offset + out_size > out_obj->buffer.length) { + dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + break; + } + + if (in_buf.buffer.length + offset + out_size > buf_len) { + dev_dbg(dev, "%s:%s output overrun cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + rc = -ENXIO; + goto out; + } + memcpy(buf + 
in_buf.buffer.length + offset, + out_obj->buffer.pointer + offset, out_size); + offset += out_size; + } + if (offset + in_buf.buffer.length < buf_len) { + if (i >= 1) { + /* + * status valid, return the number of bytes left + * unfilled in the output buffer + */ + rc = buf_len - offset - in_buf.buffer.length; + } else { + dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n", + __func__, dimm_name, cmd_name, buf_len, + offset); + rc = -ENXIO; + } + } else + rc = 0; + + out: + ACPI_FREE(out_obj); + + return rc; +} + +static const char *spa_type_name(u16 type) +{ + static const char *to_name[] = { + [NFIT_SPA_VOLATILE] = "volatile", + [NFIT_SPA_PM] = "pmem", + [NFIT_SPA_DCR] = "dimm-control-region", + [NFIT_SPA_BDW] = "block-data-window", + [NFIT_SPA_VDISK] = "volatile-disk", + [NFIT_SPA_VCD] = "volatile-cd", + [NFIT_SPA_PDISK] = "persistent-disk", + [NFIT_SPA_PCD] = "persistent-cd", + + }; + + if (type > NFIT_SPA_PCD) + return "unknown"; + + return to_name[type]; +} + +static int nfit_spa_type(struct acpi_nfit_system_address *spa) +{ + int i; + + for (i = 0; i < NFIT_UUID_MAX; i++) + if (memcmp(to_nfit_uuid(i), spa->range_guid, 16) == 0) + return i; + return -1; +} + +static bool add_spa(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct device *dev = acpi_desc->dev; + struct nfit_spa *nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa), + GFP_KERNEL); + + if (!nfit_spa) + return false; + INIT_LIST_HEAD(&nfit_spa->list); + nfit_spa->spa = spa; + list_add_tail(&nfit_spa->list, &acpi_desc->spas); + dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__, + spa->range_index, + spa_type_name(nfit_spa_type(spa))); + return true; +} + +static bool add_memdev(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_memory_map *memdev) +{ + struct device *dev = acpi_desc->dev; + struct nfit_memdev *nfit_memdev = devm_kzalloc(dev, + sizeof(*nfit_memdev), GFP_KERNEL); + + if (!nfit_memdev) + return false; + INIT_LIST_HEAD(&nfit_memdev->list); 
+ nfit_memdev->memdev = memdev; + list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); + dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n", + __func__, memdev->device_handle, memdev->range_index, + memdev->region_index); + return true; +} + +static bool add_dcr(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_control_region *dcr) +{ + struct device *dev = acpi_desc->dev; + struct nfit_dcr *nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr), + GFP_KERNEL); + + if (!nfit_dcr) + return false; + INIT_LIST_HEAD(&nfit_dcr->list); + nfit_dcr->dcr = dcr; + list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs); + dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__, + dcr->region_index, dcr->windows); + return true; +} + +static bool add_bdw(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_data_region *bdw) +{ + struct device *dev = acpi_desc->dev; + struct nfit_bdw *nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw), + GFP_KERNEL); + + if (!nfit_bdw) + return false; + INIT_LIST_HEAD(&nfit_bdw->list); + nfit_bdw->bdw = bdw; + list_add_tail(&nfit_bdw->list, &acpi_desc->bdws); + dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__, + bdw->region_index, bdw->windows); + return true; +} + +static bool add_idt(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_interleave *idt) +{ + struct device *dev = acpi_desc->dev; + struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt), + GFP_KERNEL); + + if (!nfit_idt) + return false; + INIT_LIST_HEAD(&nfit_idt->list); + nfit_idt->idt = idt; + list_add_tail(&nfit_idt->list, &acpi_desc->idts); + dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__, + idt->interleave_index, idt->line_count); + return true; +} + +static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, + const void *end) +{ + struct device *dev = acpi_desc->dev; + struct acpi_nfit_header *hdr; + void *err = ERR_PTR(-ENOMEM); + + if (table >= end) + return NULL; + + hdr = table; + switch (hdr->type) { + case 
ACPI_NFIT_TYPE_SYSTEM_ADDRESS: + if (!add_spa(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_MEMORY_MAP: + if (!add_memdev(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_CONTROL_REGION: + if (!add_dcr(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_DATA_REGION: + if (!add_bdw(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_INTERLEAVE: + if (!add_idt(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_FLUSH_ADDRESS: + dev_dbg(dev, "%s: flush\n", __func__); + break; + case ACPI_NFIT_TYPE_SMBIOS: + dev_dbg(dev, "%s: smbios\n", __func__); + break; + default: + dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type); + break; + } + + return table + hdr->length; +} + +static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem) +{ + u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle; + u16 dcr = nfit_mem->dcr->region_index; + struct nfit_spa *nfit_spa; + + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + u16 range_index = nfit_spa->spa->range_index; + int type = nfit_spa_type(nfit_spa->spa); + struct nfit_memdev *nfit_memdev; + + if (type != NFIT_SPA_BDW) + continue; + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + if (nfit_memdev->memdev->range_index != range_index) + continue; + if (nfit_memdev->memdev->device_handle != device_handle) + continue; + if (nfit_memdev->memdev->region_index != dcr) + continue; + + nfit_mem->spa_bdw = nfit_spa->spa; + return; + } + } + + dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n", + nfit_mem->spa_dcr->range_index); + nfit_mem->bdw = NULL; +} + +static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa) +{ + u16 dcr = __to_nfit_memdev(nfit_mem)->region_index; + struct nfit_memdev *nfit_memdev; + struct nfit_dcr *nfit_dcr; + struct nfit_bdw *nfit_bdw; + struct nfit_idt *nfit_idt; + u16 idt_idx, 
range_index; + + list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { + if (nfit_dcr->dcr->region_index != dcr) + continue; + nfit_mem->dcr = nfit_dcr->dcr; + break; + } + + if (!nfit_mem->dcr) { + dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n", + spa->range_index, __to_nfit_memdev(nfit_mem) + ? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR"); + return -ENODEV; + } + + /* + * We've found enough to create an nvdimm, optionally + * find an associated BDW + */ + list_add(&nfit_mem->list, &acpi_desc->dimms); + + list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) { + if (nfit_bdw->bdw->region_index != dcr) + continue; + nfit_mem->bdw = nfit_bdw->bdw; + break; + } + + if (!nfit_mem->bdw) + return 0; + + nfit_mem_find_spa_bdw(acpi_desc, nfit_mem); + + if (!nfit_mem->spa_bdw) + return 0; + + range_index = nfit_mem->spa_bdw->range_index; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + if (nfit_memdev->memdev->range_index != range_index || + nfit_memdev->memdev->region_index != dcr) + continue; + nfit_mem->memdev_bdw = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_bdw = nfit_idt->idt; + break; + } + break; + } + + return 0; +} + +static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_mem *nfit_mem, *found; + struct nfit_memdev *nfit_memdev; + int type = nfit_spa_type(spa); + u16 dcr; + + switch (type) { + case NFIT_SPA_DCR: + case NFIT_SPA_PM: + break; + default: + return 0; + } + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + int rc; + + if (nfit_memdev->memdev->range_index != spa->range_index) + continue; + found = NULL; + dcr = nfit_memdev->memdev->region_index; + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) + if (__to_nfit_memdev(nfit_mem)->region_index == dcr) { + found = nfit_mem; + break; + } + 
+ if (found) + nfit_mem = found; + else { + nfit_mem = devm_kzalloc(acpi_desc->dev, + sizeof(*nfit_mem), GFP_KERNEL); + if (!nfit_mem) + return -ENOMEM; + INIT_LIST_HEAD(&nfit_mem->list); + } + + if (type == NFIT_SPA_DCR) { + struct nfit_idt *nfit_idt; + u16 idt_idx; + + /* multiple dimms may share a SPA when interleaved */ + nfit_mem->spa_dcr = spa; + nfit_mem->memdev_dcr = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_dcr = nfit_idt->idt; + break; + } + } else { + /* + * A single dimm may belong to multiple SPA-PM + * ranges, record at least one in addition to + * any SPA-DCR range. + */ + nfit_mem->memdev_pmem = nfit_memdev->memdev; + } + + if (found) + continue; + + rc = nfit_mem_add(acpi_desc, nfit_mem, spa); + if (rc) + return rc; + } + + return 0; +} + +static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b) +{ + struct nfit_mem *a = container_of(_a, typeof(*a), list); + struct nfit_mem *b = container_of(_b, typeof(*b), list); + u32 handleA, handleB; + + handleA = __to_nfit_memdev(a)->device_handle; + handleB = __to_nfit_memdev(b)->device_handle; + if (handleA < handleB) + return -1; + else if (handleA > handleB) + return 1; + return 0; +} + +static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_spa *nfit_spa; + + /* + * For each SPA-DCR or SPA-PMEM address range find its + * corresponding MEMDEV(s). From each MEMDEV find the + * corresponding DCR. Then, if we're operating on a SPA-DCR, + * try to find a SPA-BDW and a corresponding BDW that references + * the DCR. Throw it all into an nfit_mem object. Note, that + * BDWs are optional. 
+ */ + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + int rc; + + rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa); + if (rc) + return rc; + } + + list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); + + return 0; +} + +static ssize_t revision_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + return sprintf(buf, "%d\n", acpi_desc->nfit->header.revision); +} +static DEVICE_ATTR_RO(revision); + +static struct attribute *acpi_nfit_attributes[] = { + &dev_attr_revision.attr, + NULL, +}; + +static struct attribute_group acpi_nfit_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_attributes, +}; + +const struct attribute_group *acpi_nfit_attribute_groups[] = { + &nvdimm_bus_attribute_group, + &acpi_nfit_attribute_group, + NULL, +}; +EXPORT_SYMBOL_GPL(acpi_nfit_attribute_groups); + +static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + + return __to_nfit_memdev(nfit_mem); +} + +static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + + return nfit_mem->dcr; +} + +static ssize_t handle_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); + + return sprintf(buf, "%#x\n", memdev->device_handle); +} +static DEVICE_ATTR_RO(handle); + +static ssize_t phys_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); + + return sprintf(buf, "%#x\n", memdev->physical_id); +} +static DEVICE_ATTR_RO(phys_id); + +static ssize_t vendor_show(struct device *dev, + struct 
device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->vendor_id); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t rev_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->revision_id); +} +static DEVICE_ATTR_RO(rev_id); + +static ssize_t device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->device_id); +} +static DEVICE_ATTR_RO(device); + +static ssize_t format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->code); +} +static DEVICE_ATTR_RO(format); + +static ssize_t serial_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->serial_number); +} +static DEVICE_ATTR_RO(serial); + +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u16 flags = to_nfit_memdev(dev)->flags; + + return sprintf(buf, "%s%s%s%s%s\n", + flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "", + flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "", + flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "", + flags & ACPI_NFIT_MEM_ARMED ? "arm " : "", + flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? 
"smart " : ""); +} +static DEVICE_ATTR_RO(flags); + +static struct attribute *acpi_nfit_dimm_attributes[] = { + &dev_attr_handle.attr, + &dev_attr_phys_id.attr, + &dev_attr_vendor.attr, + &dev_attr_device.attr, + &dev_attr_format.attr, + &dev_attr_serial.attr, + &dev_attr_rev_id.attr, + &dev_attr_flags.attr, + NULL, +}; + +static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (to_nfit_dcr(dev)) + return a->mode; + else + return 0; +} + +static struct attribute_group acpi_nfit_dimm_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_dimm_attributes, + .is_visible = acpi_nfit_dimm_attr_visible, +}; + +static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { + &nvdimm_attribute_group, + &nd_device_attribute_group, + &acpi_nfit_dimm_attribute_group, + NULL, +}; + +static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, + u32 device_handle) +{ + struct nfit_mem *nfit_mem; + + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) + if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) + return nfit_mem->nvdimm; + + return NULL; +} + +static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem, u32 device_handle) +{ + struct acpi_device *adev, *adev_dimm; + struct device *dev = acpi_desc->dev; + const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM); + unsigned long long sta; + int i, rc = -ENODEV; + acpi_status status; + + nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en; + adev = to_acpi_dev(acpi_desc); + if (!adev) + return 0; + + adev_dimm = acpi_find_child_device(adev, device_handle, false); + nfit_mem->adev = adev_dimm; + if (!adev_dimm) { + dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n", + device_handle); + return force_enable_dimms ? 
0 : -ENODEV; + } + + status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta); + if (status == AE_NOT_FOUND) { + dev_dbg(dev, "%s missing _STA, assuming enabled...\n", + dev_name(&adev_dimm->dev)); + rc = 0; + } else if (ACPI_FAILURE(status)) + dev_err(dev, "%s failed to retrieve_STA, disabling...\n", + dev_name(&adev_dimm->dev)); + else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0) + dev_info(dev, "%s disabled by firmware\n", + dev_name(&adev_dimm->dev)); + else + rc = 0; + + for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++) + if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i)) + set_bit(i, &nfit_mem->dsm_mask); + + return force_enable_dimms ? 0 : rc; +} + +static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_mem *nfit_mem; + int dimm_count = 0; + + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct nvdimm *nvdimm; + unsigned long flags = 0; + u32 device_handle; + u16 mem_flags; + int rc; + + device_handle = __to_nfit_memdev(nfit_mem)->device_handle; + nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); + if (nvdimm) { + /* + * If for some reason we find multiple DCRs the + * first one wins + */ + dev_err(acpi_desc->dev, "duplicate DCR detected: %s\n", + nvdimm_name(nvdimm)); + continue; + } + + if (nfit_mem->bdw && nfit_mem->memdev_pmem) + flags |= NDD_ALIASING; + + mem_flags = __to_nfit_memdev(nfit_mem)->flags; + if (mem_flags & ACPI_NFIT_MEM_ARMED) + flags |= NDD_UNARMED; + + rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); + if (rc) + continue; + + nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, + acpi_nfit_dimm_attribute_groups, + flags, &nfit_mem->dsm_mask); + if (!nvdimm) + return -ENOMEM; + + nfit_mem->nvdimm = nvdimm; + dimm_count++; + + if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) + continue; + + dev_info(acpi_desc->dev, "%s: failed: %s%s%s%s\n", + nvdimm_name(nvdimm), + mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? 
"save " : "", + mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "", + mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "", + mem_flags & ACPI_NFIT_MEM_ARMED ? "arm " : ""); + + } + + return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); +} + +static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + const u8 *uuid = to_nfit_uuid(NFIT_DEV_BUS); + struct acpi_device *adev; + int i; + + adev = to_acpi_dev(acpi_desc); + if (!adev) + return; + + for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++) + if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i)) + set_bit(i, &nd_desc->dsm_mask); +} + +static ssize_t range_index_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); + + return sprintf(buf, "%d\n", nfit_spa->spa->range_index); +} +static DEVICE_ATTR_RO(range_index); + +static struct attribute *acpi_nfit_region_attributes[] = { + &dev_attr_range_index.attr, + NULL, +}; + +static struct attribute_group acpi_nfit_region_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_region_attributes, +}; + +static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_mapping_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + &acpi_nfit_region_attribute_group, + NULL, +}; + +/* enough info to uniquely specify an interleave set */ +struct nfit_set_info { + struct nfit_set_info_map { + u64 region_offset; + u32 serial_number; + u32 pad; + } mapping[0]; +}; + +static size_t sizeof_nfit_set_info(int num_mappings) +{ + return sizeof(struct nfit_set_info) + + num_mappings * sizeof(struct nfit_set_info_map); +} + +static int cmp_map(const void *m0, const void *m1) +{ + const struct nfit_set_info_map *map0 = m0; + const struct nfit_set_info_map *map1 = m1; + + return 
memcmp(&map0->region_offset, &map1->region_offset, + sizeof(u64)); +} + +/* Retrieve the nth entry referencing this spa */ +static struct acpi_nfit_memory_map *memdev_from_spa( + struct acpi_nfit_desc *acpi_desc, u16 range_index, int n) +{ + struct nfit_memdev *nfit_memdev; + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) + if (nfit_memdev->memdev->range_index == range_index) + if (n-- == 0) + return nfit_memdev->memdev; + return NULL; +} + +static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc, + struct acpi_nfit_system_address *spa) +{ + int i, spa_type = nfit_spa_type(spa); + struct device *dev = acpi_desc->dev; + struct nd_interleave_set *nd_set; + u16 nr = ndr_desc->num_mappings; + struct nfit_set_info *info; + + if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE) + /* pass */; + else + return 0; + + nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) + return -ENOMEM; + + info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL); + if (!info) + return -ENOMEM; + for (i = 0; i < nr; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nfit_set_info_map *map = &info->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, + spa->range_index, i); + + if (!memdev || !nfit_mem->dcr) { + dev_err(dev, "%s: failed to find DCR\n", __func__); + return -ENODEV; + } + + map->region_offset = memdev->region_offset; + map->serial_number = nfit_mem->dcr->serial_number; + } + + sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map), + cmp_map, NULL); + nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + ndr_desc->nd_set = nd_set; + devm_kfree(dev, info); + + return 0; +} + +static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio) +{ + struct acpi_nfit_interleave *idt = mmio->idt; + u32 
sub_line_offset, line_index, line_offset; + u64 line_no, table_skip_count, table_offset; + + line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset); + table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index); + line_offset = idt->line_offset[line_index] + * mmio->line_size; + table_offset = table_skip_count * mmio->table_size; + + return mmio->base_offset + line_offset + table_offset + sub_line_offset; +} + +static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + u64 offset = nfit_blk->stat_offset + mmio->size * bw; + + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + return readq(mmio->base + offset); +} + +static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, + resource_size_t dpa, unsigned int len, unsigned int write) +{ + u64 cmd, offset; + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + + enum { + BCW_OFFSET_MASK = (1ULL << 48)-1, + BCW_LEN_SHIFT = 48, + BCW_LEN_MASK = (1ULL << 8) - 1, + BCW_CMD_SHIFT = 56, + }; + + cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK; + len = len >> L1_CACHE_SHIFT; + cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT; + cmd |= ((u64) write) << BCW_CMD_SHIFT; + + offset = nfit_blk->cmd_offset + mmio->size * bw; + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + writeq(cmd, mmio->base + offset); + /* FIXME: conditionally perform read-back if mandated by firmware */ +} + +static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, + resource_size_t dpa, void *iobuf, size_t len, int rw, + unsigned int lane) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + unsigned int copied = 0; + u64 base_offset; + int rc; + + base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES + + lane * mmio->size; + /* TODO: non-temporal access, flush hints, cache management etc... 
*/ + write_blk_ctl(nfit_blk, lane, dpa, len, rw); + while (len) { + unsigned int c; + u64 offset; + + if (mmio->num_lines) { + u32 line_offset; + + offset = to_interleave_offset(base_offset + copied, + mmio); + div_u64_rem(offset, mmio->line_size, &line_offset); + c = min_t(size_t, len, mmio->line_size - line_offset); + } else { + offset = base_offset + nfit_blk->bdw_offset; + c = len; + } + + if (rw) + memcpy(mmio->aperture + offset, iobuf + copied, c); + else + memcpy(iobuf + copied, mmio->aperture + offset, c); + + copied += c; + len -= c; + } + rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0; + return rc; +} + +static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr, + resource_size_t dpa, void *iobuf, u64 len, int rw) +{ + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + struct nd_region *nd_region = nfit_blk->nd_region; + unsigned int lane, copied = 0; + int rc = 0; + + lane = nd_region_acquire_lane(nd_region); + while (len) { + u64 c = min(len, mmio->size); + + rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied, + iobuf + copied, c, rw, lane); + if (rc) + break; + + copied += c; + len -= c; + } + nd_region_release_lane(nd_region, lane); + + return rc; +} + +static void nfit_spa_mapping_release(struct kref *kref) +{ + struct nfit_spa_mapping *spa_map = to_spa_map(kref); + struct acpi_nfit_system_address *spa = spa_map->spa; + struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index); + iounmap(spa_map->iomem); + release_mem_region(spa->address, spa->length); + list_del(&spa_map->list); + kfree(spa_map); +} + +static struct nfit_spa_mapping *find_spa_mapping( + struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + list_for_each_entry(spa_map, 
&acpi_desc->spa_maps, list) + if (spa_map->spa == spa) + return spa_map; + + return NULL; +} + +static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + mutex_lock(&acpi_desc->spa_map_mutex); + spa_map = find_spa_mapping(acpi_desc, spa); + + if (spa_map) + kref_put(&spa_map->kref, nfit_spa_mapping_release); + mutex_unlock(&acpi_desc->spa_map_mutex); +} + +static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + resource_size_t start = spa->address; + resource_size_t n = spa->length; + struct nfit_spa_mapping *spa_map; + struct resource *res; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + + spa_map = find_spa_mapping(acpi_desc, spa); + if (spa_map) { + kref_get(&spa_map->kref); + return spa_map->iomem; + } + + spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL); + if (!spa_map) + return NULL; + + INIT_LIST_HEAD(&spa_map->list); + spa_map->spa = spa; + kref_init(&spa_map->kref); + spa_map->acpi_desc = acpi_desc; + + res = request_mem_region(start, n, dev_name(acpi_desc->dev)); + if (!res) + goto err_mem; + + /* TODO: cacheability based on the spa type */ + spa_map->iomem = ioremap_nocache(start, n); + if (!spa_map->iomem) + goto err_map; + + list_add_tail(&spa_map->list, &acpi_desc->spa_maps); + return spa_map->iomem; + + err_map: + release_mem_region(start, n); + err_mem: + kfree(spa_map); + return NULL; +} + +/** + * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges + * @nvdimm_bus: NFIT-bus that provided the spa table entry + * @nfit_spa: spa table to map + * + * In the case where block-data-window apertures and + * dimm-control-regions are interleaved they will end up sharing a + * single request_mem_region() + ioremap() for the address range. 
In + * the style of devm nfit_spa_map() mappings are automatically dropped + * when all region devices referencing the same mapping are disabled / + * unbound. + */ +static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + void __iomem *iomem; + + mutex_lock(&acpi_desc->spa_map_mutex); + iomem = __nfit_spa_map(acpi_desc, spa); + mutex_unlock(&acpi_desc->spa_map_mutex); + + return iomem; +} + +static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio, + struct acpi_nfit_interleave *idt, u16 interleave_ways) +{ + if (idt) { + mmio->num_lines = idt->line_count; + mmio->line_size = idt->line_size; + if (interleave_ways == 0) + return -ENXIO; + mmio->table_size = mmio->num_lines * interleave_ways + * mmio->line_size; + } + + return 0; +} + +static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk_mmio *mmio; + struct nfit_blk *nfit_blk; + struct nfit_mem *nfit_mem; + struct nvdimm *nvdimm; + int rc; + + nvdimm = nd_blk_region_to_dimm(ndbr); + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) { + dev_dbg(dev, "%s: missing%s%s%s\n", __func__, + nfit_mem ? "" : " nfit_mem", + nfit_mem->dcr ? "" : " dcr", + nfit_mem->bdw ? 
"" : " bdw"); + return -ENXIO; + } + + nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL); + if (!nfit_blk) + return -ENOMEM; + nd_blk_region_set_provider_data(ndbr, nfit_blk); + nfit_blk->nd_region = to_nd_region(dev); + + /* map block aperture memory */ + nfit_blk->bdw_offset = nfit_mem->bdw->offset; + mmio = &nfit_blk->mmio[BDW]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->bdw->size; + mmio->base_offset = nfit_mem->memdev_bdw->region_offset; + mmio->idt = nfit_mem->idt_bdw; + mmio->spa = nfit_mem->spa_bdw; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw, + nfit_mem->memdev_bdw->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init bdw interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + /* map block control memory */ + nfit_blk->cmd_offset = nfit_mem->dcr->command_offset; + nfit_blk->stat_offset = nfit_mem->dcr->status_offset; + mmio = &nfit_blk->mmio[DCR]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map dcr\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->dcr->window_size; + mmio->base_offset = nfit_mem->memdev_dcr->region_offset; + mmio->idt = nfit_mem->idt_dcr; + mmio->spa = nfit_mem->spa_dcr; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr, + nfit_mem->memdev_dcr->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init dcr interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + if (mmio->line_size == 0) + return 0; + + if ((u32) nfit_blk->cmd_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "cmd_offset crosses interleave boundary\n"); + return -ENXIO; + } else if ((u32) nfit_blk->stat_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "stat_offset crosses interleave boundary\n"); + 
return -ENXIO; + } + + return 0; +} + +static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + int i; + + if (!nfit_blk) + return; /* never enabled */ + + /* auto-free BLK spa mappings */ + for (i = 0; i < 2; i++) { + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i]; + + if (mmio->base) + nfit_spa_unmap(acpi_desc, mmio->spa); + } + nd_blk_region_set_provider_data(ndbr, NULL); + /* devm will free nfit_blk */ +} + +static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, + struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, + struct acpi_nfit_memory_map *memdev, + struct acpi_nfit_system_address *spa) +{ + struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, + memdev->device_handle); + struct nd_blk_region_desc *ndbr_desc; + struct nfit_mem *nfit_mem; + int blk_valid = 0; + + if (!nvdimm) { + dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n", + spa->range_index, memdev->device_handle); + return -ENODEV; + } + + nd_mapping->nvdimm = nvdimm; + switch (nfit_spa_type(spa)) { + case NFIT_SPA_PM: + case NFIT_SPA_VOLATILE: + nd_mapping->start = memdev->address; + nd_mapping->size = memdev->region_size; + break; + case NFIT_SPA_DCR: + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem || !nfit_mem->bdw) { + dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n", + spa->range_index, nvdimm_name(nvdimm)); + } else { + nd_mapping->size = nfit_mem->bdw->capacity; + nd_mapping->start = nfit_mem->bdw->start_address; + ndr_desc->num_lanes = nfit_mem->bdw->windows; + blk_valid = 1; + } + + ndr_desc->nd_mapping = nd_mapping; + ndr_desc->num_mappings = blk_valid; + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr_desc->enable = acpi_nfit_blk_region_enable; + 
ndbr_desc->disable = acpi_nfit_blk_region_disable; + ndbr_desc->do_io = acpi_desc->blk_do_io; + if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc)) + return -ENOMEM; + break; + } + + return 0; +} + +static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, + struct nfit_spa *nfit_spa) +{ + static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; + struct acpi_nfit_system_address *spa = nfit_spa->spa; + struct nd_blk_region_desc ndbr_desc; + struct nd_region_desc *ndr_desc; + struct nfit_memdev *nfit_memdev; + struct nvdimm_bus *nvdimm_bus; + struct resource res; + int count = 0, rc; + + if (spa->range_index == 0) { + dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n", + __func__); + return 0; + } + + memset(&res, 0, sizeof(res)); + memset(&nd_mappings, 0, sizeof(nd_mappings)); + memset(&ndbr_desc, 0, sizeof(ndbr_desc)); + res.start = spa->address; + res.end = res.start + spa->length - 1; + ndr_desc = &ndbr_desc.ndr_desc; + ndr_desc->res = &res; + ndr_desc->provider_data = nfit_spa; + ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) + ndr_desc->numa_node = acpi_map_pxm_to_online_node( + spa->proximity_domain); + else + ndr_desc->numa_node = NUMA_NO_NODE; + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; + struct nd_mapping *nd_mapping; + + if (memdev->range_index != spa->range_index) + continue; + if (count >= ND_MAX_MAPPINGS) { + dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n", + spa->range_index, ND_MAX_MAPPINGS); + return -ENXIO; + } + nd_mapping = &nd_mappings[count++]; + rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, + memdev, spa); + if (rc) + return rc; + } + + ndr_desc->nd_mapping = nd_mappings; + ndr_desc->num_mappings = count; + rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); + if (rc) + return rc; + + nvdimm_bus = acpi_desc->nvdimm_bus; + if 
(nfit_spa_type(spa) == NFIT_SPA_PM) { + if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) + return -ENOMEM; + } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { + if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc)) + return -ENOMEM; + } + return 0; +} + +static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_spa *nfit_spa; + + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + int rc = acpi_nfit_register_region(acpi_desc, nfit_spa); + + if (rc) + return rc; + } + return 0; +} + +int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) +{ + struct device *dev = acpi_desc->dev; + const void *end; + u8 *data; + int rc; + + INIT_LIST_HEAD(&acpi_desc->spa_maps); + INIT_LIST_HEAD(&acpi_desc->spas); + INIT_LIST_HEAD(&acpi_desc->dcrs); + INIT_LIST_HEAD(&acpi_desc->bdws); + INIT_LIST_HEAD(&acpi_desc->idts); + INIT_LIST_HEAD(&acpi_desc->memdevs); + INIT_LIST_HEAD(&acpi_desc->dimms); + mutex_init(&acpi_desc->spa_map_mutex); + + data = (u8 *) acpi_desc->nfit; + end = data + sz; + data += sizeof(struct acpi_table_nfit); + while (!IS_ERR_OR_NULL(data)) + data = add_table(acpi_desc, data, end); + + if (IS_ERR(data)) { + dev_dbg(dev, "%s: nfit table parsing error: %ld\n", __func__, + PTR_ERR(data)); + return PTR_ERR(data); + } + + if (nfit_mem_init(acpi_desc) != 0) + return -ENOMEM; + + acpi_nfit_init_dsms(acpi_desc); + + rc = acpi_nfit_register_dimms(acpi_desc); + if (rc) + return rc; + + return acpi_nfit_register_regions(acpi_desc); +} +EXPORT_SYMBOL_GPL(acpi_nfit_init); + +static int acpi_nfit_add(struct acpi_device *adev) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct device *dev = &adev->dev; + struct acpi_table_header *tbl; + acpi_status status = AE_OK; + acpi_size sz; + int rc; + + status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz); + if (ACPI_FAILURE(status)) { + dev_err(dev, "failed to find NFIT\n"); + return -ENXIO; + } + + acpi_desc = devm_kzalloc(dev, 
sizeof(*acpi_desc), GFP_KERNEL); + if (!acpi_desc) + return -ENOMEM; + + dev_set_drvdata(dev, acpi_desc); + acpi_desc->dev = dev; + acpi_desc->nfit = (struct acpi_table_nfit *) tbl; + acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io; + nd_desc = &acpi_desc->nd_desc; + nd_desc->provider_name = "ACPI.NFIT"; + nd_desc->ndctl = acpi_nfit_ctl; + nd_desc->attr_groups = acpi_nfit_attribute_groups; + + acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc); + if (!acpi_desc->nvdimm_bus) + return -ENXIO; + + rc = acpi_nfit_init(acpi_desc, sz); + if (rc) { + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return rc; + } + return 0; +} + +static int acpi_nfit_remove(struct acpi_device *adev) +{ + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev); + + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return 0; +} + +static const struct acpi_device_id acpi_nfit_ids[] = { + { "ACPI0012", 0 }, + { "", 0 }, +}; +MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids); + +static struct acpi_driver acpi_nfit_driver = { + .name = KBUILD_MODNAME, + .ids = acpi_nfit_ids, + .ops = { + .add = acpi_nfit_add, + .remove = acpi_nfit_remove, + }, +}; + +static __init int nfit_init(void) +{ + BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40); + BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 56); + BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48); + BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20); + BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9); + BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80); + BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40); + + acpi_str_to_uuid(UUID_VOLATILE_MEMORY, nfit_uuid[NFIT_SPA_VOLATILE]); + acpi_str_to_uuid(UUID_PERSISTENT_MEMORY, nfit_uuid[NFIT_SPA_PM]); + acpi_str_to_uuid(UUID_CONTROL_REGION, nfit_uuid[NFIT_SPA_DCR]); + acpi_str_to_uuid(UUID_DATA_REGION, nfit_uuid[NFIT_SPA_BDW]); + acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_VDISK]); + acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_CD, 
nfit_uuid[NFIT_SPA_VCD]); + acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_PDISK]); + acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]); + acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]); + acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]); + + return acpi_bus_register_driver(&acpi_nfit_driver); +} + +static __exit void nfit_exit(void) +{ + acpi_bus_unregister_driver(&acpi_nfit_driver); +} + +module_init(nfit_init); +module_exit(nfit_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h new file mode 100644 index 000000000000..81f2e8c5a79c --- /dev/null +++ b/drivers/acpi/nfit.h @@ -0,0 +1,158 @@ +/* + * NVDIMM Firmware Interface Table - NFIT + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __NFIT_H__ +#define __NFIT_H__ +#include <linux/libnvdimm.h> +#include <linux/types.h> +#include <linux/uuid.h> +#include <linux/acpi.h> +#include <acpi/acuuid.h> + +#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba" +#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" +#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ + | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ + | ACPI_NFIT_MEM_ARMED) + +enum nfit_uuids { + NFIT_SPA_VOLATILE, + NFIT_SPA_PM, + NFIT_SPA_DCR, + NFIT_SPA_BDW, + NFIT_SPA_VDISK, + NFIT_SPA_VCD, + NFIT_SPA_PDISK, + NFIT_SPA_PCD, + NFIT_DEV_BUS, + NFIT_DEV_DIMM, + NFIT_UUID_MAX, +}; + +struct nfit_spa { + struct acpi_nfit_system_address *spa; + struct list_head list; +}; + +struct nfit_dcr { + struct acpi_nfit_control_region *dcr; + struct list_head list; +}; + +struct nfit_bdw { + struct acpi_nfit_data_region *bdw; + struct list_head list; +}; + +struct nfit_idt { + struct acpi_nfit_interleave *idt; + struct list_head list; +}; + +struct nfit_memdev { + struct acpi_nfit_memory_map *memdev; + struct list_head list; +}; + +/* assembled tables for a given dimm/memory-device */ +struct nfit_mem { + struct nvdimm *nvdimm; + struct acpi_nfit_memory_map *memdev_dcr; + struct acpi_nfit_memory_map *memdev_pmem; + struct acpi_nfit_memory_map *memdev_bdw; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_data_region *bdw; + struct acpi_nfit_system_address *spa_dcr; + struct acpi_nfit_system_address *spa_bdw; + struct acpi_nfit_interleave *idt_dcr; + struct acpi_nfit_interleave *idt_bdw; + struct list_head list; + struct acpi_device *adev; + unsigned long dsm_mask; +}; + +struct acpi_nfit_desc { + struct nvdimm_bus_descriptor nd_desc; + struct acpi_table_nfit *nfit; + struct mutex spa_map_mutex; + struct list_head spa_maps; + struct list_head memdevs; + struct list_head dimms; + struct list_head spas; + struct list_head dcrs; + struct list_head bdws; + struct list_head idts; + struct 
nvdimm_bus *nvdimm_bus; + struct device *dev; + unsigned long dimm_dsm_force_en; + int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); +}; + +enum nd_blk_mmio_selector { + BDW, + DCR, +}; + +struct nfit_blk { + struct nfit_blk_mmio { + union { + void __iomem *base; + void *aperture; + }; + u64 size; + u64 base_offset; + u32 line_size; + u32 num_lines; + u32 table_size; + struct acpi_nfit_interleave *idt; + struct acpi_nfit_system_address *spa; + } mmio[2]; + struct nd_region *nd_region; + u64 bdw_offset; /* post interleave offset */ + u64 stat_offset; + u64 cmd_offset; +}; + +struct nfit_spa_mapping { + struct acpi_nfit_desc *acpi_desc; + struct acpi_nfit_system_address *spa; + struct list_head list; + struct kref kref; + void __iomem *iomem; +}; + +static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref) +{ + return container_of(kref, struct nfit_spa_mapping, kref); +} + +static inline struct acpi_nfit_memory_map *__to_nfit_memdev( + struct nfit_mem *nfit_mem) +{ + if (nfit_mem->memdev_dcr) + return nfit_mem->memdev_dcr; + return nfit_mem->memdev_pmem; +} + +static inline struct acpi_nfit_desc *to_acpi_desc( + struct nvdimm_bus_descriptor *nd_desc) +{ + return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); +} + +const u8 *to_nfit_uuid(enum nfit_uuids id); +int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz); +extern const struct attribute_group *acpi_nfit_attribute_groups[]; +#endif /* __NFIT_H__ */ diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 1333cbdc3ea2..acaa3b4ea504 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -29,6 +29,8 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <linux/numa.h> +#include <linux/nodemask.h> +#include <linux/topology.h> #define PREFIX "ACPI: " @@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node) int acpi_map_pxm_to_node(int pxm) { - int node = pxm_to_node_map[pxm]; + int node; + + if (pxm < 0 || pxm 
>= MAX_PXM_DOMAINS) + return NUMA_NO_NODE; + + node = pxm_to_node_map[pxm]; if (node == NUMA_NO_NODE) { if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) @@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm) return node; } +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. + */ +int acpi_map_pxm_to_online_node(int pxm) +{ + int node, n, dist, min_dist; + + node = acpi_map_pxm_to_node(pxm); + + if (node == NUMA_NO_NODE) + node = 0; + + if (!node_online(node)) { + min_dist = INT_MAX; + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + node = n; + } + } + } + + return node; +} +EXPORT_SYMBOL(acpi_map_pxm_to_online_node); + static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { @@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle) int pxm; pxm = acpi_get_pxm(handle); - if (pxm < 0 || pxm >= MAX_PXM_DOMAINS) - return NUMA_NO_NODE; return acpi_map_pxm_to_node(pxm); } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 3ccef9eba6f9..1b8094d4d7af 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -404,18 +404,6 @@ config BLK_DEV_RAM_DAX and will prevent RAM block device backing store memory from being allocated from highmem (only a problem for highmem systems). 
-config BLK_DEV_PMEM - tristate "Persistent memory block device support" - depends on HAS_IOMEM - help - Saying Y here will allow you to use a contiguous range of reserved - memory as one or more persistent block devices. - - To compile this driver as a module, choose M here: the module will be - called 'pmem'. - - If unsure, say N. - config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media" depends on !UML diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 9cc6c18a1c7e..02b688d1438d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o -obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e5112714188f..34338d7438f5 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -193,6 +193,13 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } +static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->tags = NULL; +} + static int nvme_admin_init_request(void *data, struct request *req, unsigned int hctx_idx, unsigned int rq_idx, unsigned int numa_node) @@ -606,7 +613,10 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, return; } if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - req->errors = status; + if (cmd_rq->ctx == CMD_CTX_CANCELLED) + req->errors = -EINTR; + else + req->errors = status; } else { req->errors = nvme_error_status(status); } @@ -1161,12 +1171,13 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) { - struct nvme_command c = { - .identify.opcode = 
nvme_admin_identify, - .identify.cns = cpu_to_le32(1), - }; + struct nvme_command c = { }; int error; + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); if (!*id) return -ENOMEM; @@ -1181,12 +1192,13 @@ int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, struct nvme_id_ns **id) { - struct nvme_command c = { - .identify.opcode = nvme_admin_identify, - .identify.nsid = cpu_to_le32(nsid), - }; + struct nvme_command c = { }; int error; + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); if (!*id) return -ENOMEM; @@ -1230,14 +1242,14 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) { - struct nvme_command c = { - .common.opcode = nvme_admin_get_log_page, - .common.nsid = cpu_to_le32(0xFFFFFFFF), - .common.cdw10[0] = cpu_to_le32( + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | NVME_LOG_SMART), - }; - int error; *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); if (!*log) @@ -1606,6 +1618,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_admin_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; @@ -1648,6 +1661,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) } if (!blk_get_queue(dev->admin_q)) { nvme_dev_remove_admin(dev); + dev->admin_q = NULL; return 
-ENODEV; } } else @@ -2349,19 +2363,20 @@ static int nvme_dev_add(struct nvme_dev *dev) } kfree(ctrl); - dev->tagset.ops = &nvme_mq_ops; - dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev_to_node(dev->dev); - dev->tagset.queue_depth = + if (!dev->tagset.tags) { + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(dev->dev); + dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); - dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; - dev->tagset.driver_data = dev; - - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; + dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + if (blk_mq_alloc_tag_set(&dev->tagset)) + return 0; + } schedule_work(&dev->scan_work); return 0; } @@ -2734,8 +2749,10 @@ static void nvme_free_dev(struct kref *kref) put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); - blk_mq_free_tag_set(&dev->tagset); - blk_put_queue(dev->admin_q); + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + if (dev->admin_q) + blk_put_queue(dev->admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2866,6 +2883,9 @@ static int nvme_dev_start(struct nvme_dev *dev) free_tags: nvme_dev_remove_admin(dev); + blk_put_queue(dev->admin_q); + dev->admin_q = NULL; + dev->queues[0]->tags = NULL; disable: nvme_disable_queue(dev, 0); nvme_dev_list_remove(dev); @@ -2907,25 +2927,43 @@ static int nvme_dev_resume(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } else { nvme_unfreeze_queues(dev); - schedule_work(&dev->scan_work); + nvme_dev_add(dev); nvme_set_irq_hints(dev); } return 0; } +static void nvme_dead_ctrl(struct nvme_dev *dev) +{ + dev_warn(dev->dev, "Device failed to resume\n"); + kref_get(&dev->kref); + if 
(IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", + dev->instance))) { + dev_err(dev->dev, + "Failed to start controller remove task\n"); + kref_put(&dev->kref, nvme_free_dev); + } +} + static void nvme_dev_reset(struct nvme_dev *dev) { + bool in_probe = work_busy(&dev->probe_work); + nvme_dev_shutdown(dev); - if (nvme_dev_resume(dev)) { - dev_warn(dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } + + /* Synchronize with device probe so that work will see failure status + * and exit gracefully without trying to schedule another reset */ + flush_work(&dev->probe_work); + + /* Fail this device if reset occurred during probe to avoid + * infinite initialization loops. */ + if (in_probe) { + nvme_dead_ctrl(dev); + return; } + /* Schedule device resume asynchronously so the reset work is available + * to cleanup errors that may occur during reinitialization */ + schedule_work(&dev->probe_work); } static void nvme_reset_failed_dev(struct work_struct *ws) @@ -2957,6 +2995,7 @@ static int nvme_reset(struct nvme_dev *dev) if (!ret) { flush_work(&dev->reset_work); + flush_work(&dev->probe_work); return 0; } @@ -3053,26 +3092,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_async_probe(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - int result; - result = nvme_dev_start(dev); - if (result) - goto reset; - - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto reset; - - nvme_set_irq_hints(dev); - return; - reset: - spin_lock(&dev_list_lock); - if (!work_busy(&dev->reset_work)) { - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); - } - spin_unlock(&dev_list_lock); + if (nvme_dev_resume(dev) &&
!work_busy(&dev->reset_work)) + nvme_dead_ctrl(dev); } static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) @@ -3104,8 +3126,8 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->reset_work); flush_work(&dev->scan_work); device_remove_file(dev->device, &dev_attr_reset_controller); - nvme_dev_shutdown(dev); nvme_dev_remove(dev); + nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 713fc9ff1149..2126842fb6e8 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,13 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum order of pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); +/* * The LRU mechanism to clean the lists of persistent grants needs to * be executed periodically. The time interval between consecutive executions * of the purge mechanism is set in ms. 
@@ -1438,6 +1445,12 @@ static int __init xen_blkif_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER; + } + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index f620b5d3f77c..8ccc49d01c8e 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -44,6 +44,7 @@ #include <xen/interface/io/blkif.h> #include <xen/interface/io/protocols.h> +extern unsigned int xen_blkif_max_ring_order; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -248,7 +249,7 @@ struct backend_info; #define PERSISTENT_GNT_WAS_ACTIVE 1 /* Number of requests that we can fit in a ring */ -#define XEN_BLKIF_REQS 32 +#define XEN_BLKIF_REQS_PER_PAGE 32 struct persistent_gnt { struct page *page; @@ -320,6 +321,7 @@ struct xen_blkif { struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; + unsigned int nr_ring_pages; }; struct seg_buf { diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 6ab69ad61ee1..deb3f001791f 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -25,6 +25,7 @@ /* Enlarge the array size in order to fully show blkback name. 
*/ #define BLKBACK_NAME_LEN (20) +#define RINGREF_NAME_LEN (20) struct backend_info { struct xenbus_device *dev; @@ -124,8 +125,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; - struct pending_req *req, *n; - int i, j; BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -151,55 +150,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - - for (i = 0; i < XEN_BLKIF_REQS; i++) { - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - goto fail; - list_add_tail(&req->free_list, - &blkif->pending_free); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - req->segments[j] = kzalloc(sizeof(*req->segments[0]), - GFP_KERNEL); - if (!req->segments[j]) - goto fail; - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), - GFP_KERNEL); - if (!req->indirect_pages[j]) - goto fail; - } - } spin_lock_init(&blkif->pending_free_lock); init_waitqueue_head(&blkif->pending_free_wq); init_waitqueue_head(&blkif->shutdown_wq); return blkif; - -fail: - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - if (!req->segments[j]) - break; - kfree(req->segments[j]); - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - if (!req->indirect_pages[j]) - break; - kfree(req->indirect_pages[j]); - } - kfree(req); - } - - kmem_cache_free(xen_blkif_cachep, blkif); - - return ERR_PTR(-ENOMEM); } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, - unsigned int evtchn) +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, + unsigned int nr_grefs, unsigned int evtchn) { int err; @@ -207,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, if (blkif->irq) return 0; - err = 
xenbus_map_ring_valloc(blkif->be->dev, &gref, 1, + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, &blkif->blk_ring); if (err < 0) return err; @@ -217,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, { struct blkif_sring *sring; sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs); break; } default: @@ -312,7 +271,7 @@ static void xen_blkif_free(struct xen_blkif *blkif) i++; } - WARN_ON(i != XEN_BLKIF_REQS); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); kmem_cache_free(xen_blkif_cachep, blkif); } @@ -597,6 +556,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev, if (err) goto fail; + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u", + xen_blkif_max_ring_order); + if (err) + pr_warn("%s write out 'max-ring-page-order' failed\n", __func__); + err = xenbus_switch_state(dev, XenbusStateInitWait); if (err) goto fail; @@ -860,22 +824,66 @@ again: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; - unsigned long ring_ref; - unsigned int evtchn; + unsigned int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int evtchn, nr_grefs, ring_page_order; unsigned int pers_grants; char protocol[64] = ""; - int err; + struct pending_req *req, *n; + int err, i, 
j; pr_debug("%s %s\n", __func__, dev->otherend); - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", - &ring_ref, "event-channel", "%u", &evtchn, NULL); - if (err) { - xenbus_dev_fatal(dev, err, - "reading %s/ring-ref and event-channel", + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + &evtchn); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dev->otherend); return err; } + pr_info("event-channel %u\n", evtchn); + + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", + &ring_page_order); + if (err != 1) { + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", + "%u", &ring_ref[0]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", + dev->otherend); + return err; + } + nr_grefs = 1; + pr_info("%s:using single page: ring-ref %d\n", dev->otherend, + ring_ref[0]); + } else { + unsigned int i; + + if (ring_page_order > xen_blkif_max_ring_order) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", + dev->otherend, ring_page_order, + xen_blkif_max_ring_order); + return err; + } + + nr_grefs = 1 << ring_page_order; + for (i = 0; i < nr_grefs; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + "%u", &ring_ref[i]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/%s", + dev->otherend, ring_ref_name); + return err; + } + pr_info("ring-ref%u: %u\n", i, ring_ref[i]); + } + } be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", @@ -900,20 +908,55 @@ static int connect_ring(struct backend_info *be) be->blkif->vbd.feature_gnt_persistent = pers_grants; be->blkif->vbd.overflow_max_grants = 0; + be->blkif->nr_ring_pages = nr_grefs; - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", - ring_ref, 
evtchn, be->blkif->blk_protocol, protocol, + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", + nr_grefs, evtchn, be->blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); + for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto fail; + list_add_tail(&req->free_list, &be->blkif->pending_free); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); + if (!req->segments[j]) + goto fail; + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), + GFP_KERNEL); + if (!req->indirect_pages[j]) + goto fail; + } + } + /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, evtchn); + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); if (err) { - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", - ring_ref, evtchn); + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; } return 0; + +fail: + list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_del(&req->free_list); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + if (!req->segments[j]) + break; + kfree(req->segments[j]); + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + if (!req->indirect_pages[j]) + break; + kfree(req->indirect_pages[j]); + } + kfree(req); + } + return -ENOMEM; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2c61cf8c6f61..fc770b7d3beb 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -98,7 +98,21 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +/* + * Maximum order 
of pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +static unsigned int xen_blkif_max_ring_order; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); + +#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages) +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES) +/* + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define it as 20 to stay consistent with the backend. + */ +#define RINGREF_NAME_LEN (20) /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -114,13 +128,14 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref; + int ring_ref[XENBUS_MAX_RING_PAGES]; + unsigned int nr_ring_pages; struct blkif_front_ring ring; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_RING_SIZE]; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head grants; struct list_head indirect_pages; unsigned int persistent_gnts_c; @@ -139,8 +154,6 @@ static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) #define GRANT_INVALID_REF 0 #define PARTS_PER_DISK 16 @@ -170,7 +183,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE); + BUG_ON(free >= BLK_RING_SIZE(info)); info->shadow_free = info->shadow[free].req.u.rw.id; info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; @@ -983,7 +996,7 @@ static void blkif_free(struct blkfront_info *info, int
suspend) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring @@ -1033,12 +1046,15 @@ free_shadow: flush_work(&info->work); /* Free resources associated with old device channel. */ - if (info->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref, 0, - (unsigned long)info->ring.sring); - info->ring_ref = GRANT_INVALID_REF; - info->ring.sring = NULL; + for (i = 0; i < info->nr_ring_pages; i++) { + if (info->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref[i], 0, 0); + info->ring_ref[i] = GRANT_INVALID_REF; + } } + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + info->ring.sring = NULL; + if (info->irq) unbind_from_irqhandler(info->irq, info); info->evtchn = info->irq = 0; @@ -1157,7 +1173,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * never have given to it (we stamp it up to BLK_RING_SIZE - * look in get_id_from_freelist. 
*/ - if (id >= BLK_RING_SIZE) { + if (id >= BLK_RING_SIZE(info)) { WARN(1, "%s: response to %s has incorrect id (%ld)\n", info->gd->disk_name, op_name(bret->operation), id); /* We can't safely get the 'struct request' as @@ -1245,26 +1261,30 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) { struct blkif_sring *sring; - grant_ref_t gref; - int err; + int err, i; + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE; + grant_ref_t gref[XENBUS_MAX_RING_PAGES]; - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, + get_order(ring_size)); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + FRONT_RING_INIT(&info->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref); + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_page((unsigned long)sring); + free_pages((unsigned long)sring, get_order(ring_size)); info->ring.sring = NULL; goto fail; } - info->ring_ref = gref; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = gref[i]; err = xenbus_alloc_evtchn(dev, &info->evtchn); if (err) @@ -1292,7 +1312,18 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err; + int err, i; + unsigned int max_page_order = 0; + unsigned int ring_page_order = 0; + + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "max-ring-page-order", "%u", &max_page_order); + if (err != 1) + info->nr_ring_pages = 1; + else { + ring_page_order = min(xen_blkif_max_ring_order, max_page_order); + info->nr_ring_pages = 1 << ring_page_order; + } /* Create shared ring, alloc event channel. 
*/ err = setup_blkring(dev, info); @@ -1306,11 +1337,32 @@ again: goto destroy_blkring; } - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + err = xenbus_printf(xbt, dev->nodename, + "ring-page-order", "%u", ring_page_order); + if (err) { + message = "writing ring-page-order"; + goto abort_transaction; + } + + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dev->nodename, ring_ref_name, + "%u", info->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } } err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", info->evtchn); @@ -1338,6 +1390,9 @@ again: goto destroy_blkring; } + for (i = 0; i < BLK_RING_SIZE(info); i++) + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1361,7 +1416,7 @@ again: static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { - int err, vdevice, i; + int err, vdevice; struct blkfront_info *info; /* FIXME: Use dynamic device id if this is not set. */ @@ -1422,21 +1477,10 @@ static int blkfront_probe(struct xenbus_device *dev, info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); - for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Front end dir is a number, which is used as the id. 
*/ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); dev_set_drvdata(&dev->dev, info); - err = talk_to_blkback(dev, info); - if (err) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); - return err; - } - return 0; } @@ -1476,10 +1520,10 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE; i++) + for (i = 0; i < BLK_RING_SIZE(info); i++) info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; rc = blkfront_setup_indirect(info); if (rc) { @@ -1491,7 +1535,7 @@ static int blkif_recover(struct blkfront_info *info) blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* Not in use? */ if (!copy[i].request) continue; @@ -1697,7 +1741,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) segs = info->max_indirect_segments; } - err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1707,7 +1751,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) * grants, we need to allocate a set of pages that can be * used for mapping indirect grefs */ - int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&info->indirect_pages)); for (i = 0; i < num; i++) { @@ -1718,7 +1762,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { info->shadow[i].grants_used = kzalloc( sizeof(info->shadow[i].grants_used[0]) * segs, GFP_NOIO); @@ -1740,7 +1784,7 
@@ static int blkfront_setup_indirect(struct blkfront_info *info) return 0; out_of_memory: - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { kfree(info->shadow[i].grants_used); info->shadow[i].grants_used = NULL; kfree(info->shadow[i].sg); @@ -1906,8 +1950,15 @@ static void blkback_changed(struct xenbus_device *dev, dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); switch (backend_state) { - case XenbusStateInitialising: case XenbusStateInitWait: + if (dev->state != XenbusStateInitialising) + break; + if (talk_to_blkback(dev, info)) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + break; + } + case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: case XenbusStateReconfigured: @@ -2091,6 +2142,12 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER); + xen_blkif_max_ring_order = 0; + } + if (!xen_has_pv_disk_devices()) return -ENODEV; diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index bda2cb06dc7a..88d474b78076 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -162,6 +162,17 @@ config MX3_IPU_IRQS To avoid bloating the irq_desc[] array we allocate a sufficient number of IRQ slots and map them dynamically to specific sources. +config PXA_DMA + bool "PXA DMA support" + depends on (ARCH_MMP || ARCH_PXA) + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + Support the DMA engine for PXA. It is also compatible with MMP PDMA + platform. The internal DMA IP of all PXA variants is supported, with + 16 to 32 channels for peripheral to memory or memory to memory + transfers. 
+ config TXX9_DMAC tristate "Toshiba TXx9 SoC DMA support" depends on MACH_TX49XX || MACH_TX39XX @@ -245,6 +256,9 @@ config TI_EDMA Enable support for the TI EDMA controller. This DMA engine is found on TI DaVinci and AM33xx parts. +config TI_DMA_CROSSBAR + bool + config ARCH_HAS_ASYNC_TX_FIND_CHANNEL bool @@ -330,6 +344,7 @@ config DMA_OMAP depends on ARCH_OMAP select DMA_ENGINE select DMA_VIRTUAL_CHANNELS + select TI_DMA_CROSSBAR if SOC_DRA7XX config DMA_BCM2835 tristate "BCM2835 DMA engine support" diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 69f77d5ba53b..6a4d6f2827da 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/ obj-$(CONFIG_IMX_SDMA) += imx-sdma.o obj-$(CONFIG_IMX_DMA) += imx-dma.o obj-$(CONFIG_MXS_DMA) += mxs-dma.o +obj-$(CONFIG_PXA_DMA) += pxa_dma.o obj-$(CONFIG_TIMB_DMA) += timb_dma.o obj-$(CONFIG_SIRF_DMA) += sirf-dma.o obj-$(CONFIG_TI_EDMA) += edma.o @@ -38,6 +39,7 @@ obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o obj-$(CONFIG_DMA_SA11X0) += sa11x0-dma.o obj-$(CONFIG_MMP_TDMA) += mmp_tdma.o obj-$(CONFIG_DMA_OMAP) += omap-dma.o +obj-$(CONFIG_TI_DMA_CROSSBAR) += ti-dma-crossbar.o obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o obj-$(CONFIG_MMP_PDMA) += mmp_pdma.o obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 49d396ec06e5..5de3cf453f35 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -474,7 +474,7 @@ static void pl08x_terminate_phy_chan(struct pl08x_driver_data *pl08x, u32 val = readl(ch->reg_config); val &= ~(PL080_CONFIG_ENABLE | PL080_CONFIG_ERR_IRQ_MASK | - PL080_CONFIG_TC_IRQ_MASK); + PL080_CONFIG_TC_IRQ_MASK); writel(val, ch->reg_config); diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 57b2141ddddc..59892126d175 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -247,6 +247,10 @@ static void atc_dostart(struct at_dma_chan *atchan, struct at_desc 
*first) channel_writel(atchan, CTRLA, 0); channel_writel(atchan, CTRLB, 0); channel_writel(atchan, DSCR, first->txd.phys); + channel_writel(atchan, SPIP, ATC_SPIP_HOLE(first->src_hole) | + ATC_SPIP_BOUNDARY(first->boundary)); + channel_writel(atchan, DPIP, ATC_DPIP_HOLE(first->dst_hole) | + ATC_DPIP_BOUNDARY(first->boundary)); dma_writel(atdma, CHER, atchan->mask); vdbg_dump_regs(atchan); @@ -635,6 +639,104 @@ static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx) } /** + * atc_prep_dma_interleaved - prepare memory to memory interleaved operation + * @chan: the channel to prepare operation on + * @xt: Interleaved transfer template + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_dma_interleaved(struct dma_chan *chan, + struct dma_interleaved_template *xt, + unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct data_chunk *first = xt->sgl; + struct at_desc *desc = NULL; + size_t xfer_count; + unsigned int dwidth; + u32 ctrla; + u32 ctrlb; + size_t len = 0; + int i; + + dev_info(chan2dev(chan), + "%s: src=0x%08x, dest=0x%08x, numf=%d, frame_size=%d, flags=0x%lx\n", + __func__, xt->src_start, xt->dst_start, xt->numf, + xt->frame_size, flags); + + if (unlikely(!xt || xt->numf != 1 || !xt->frame_size)) + return NULL; + + /* + * The controller can only "skip" X bytes every Y bytes, so we + * need to make sure we are given a template that fits that + * description, i.e. a template with chunks that always have the + * same size, with the same ICGs.
+ */ + for (i = 0; i < xt->frame_size; i++) { + struct data_chunk *chunk = xt->sgl + i; + + if ((chunk->size != xt->sgl->size) || + (dmaengine_get_dst_icg(xt, chunk) != dmaengine_get_dst_icg(xt, first)) || + (dmaengine_get_src_icg(xt, chunk) != dmaengine_get_src_icg(xt, first))) { + dev_err(chan2dev(chan), + "%s: the controller can transfer only identical chunks\n", + __func__); + return NULL; + } + + len += chunk->size; + } + + dwidth = atc_get_xfer_width(xt->src_start, + xt->dst_start, len); + + xfer_count = len >> dwidth; + if (xfer_count > ATC_BTSIZE_MAX) { + dev_err(chan2dev(chan), "%s: buffer is too big\n", __func__); + return NULL; + } + + ctrla = ATC_SRC_WIDTH(dwidth) | + ATC_DST_WIDTH(dwidth); + + ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN + | ATC_SRC_ADDR_MODE_INCR + | ATC_DST_ADDR_MODE_INCR + | ATC_SRC_PIP + | ATC_DST_PIP + | ATC_FC_MEM2MEM; + + /* create the transfer */ + desc = atc_desc_get(atchan); + if (!desc) { + dev_err(chan2dev(chan), + "%s: couldn't allocate our descriptor\n", __func__); + return NULL; + } + + desc->lli.saddr = xt->src_start; + desc->lli.daddr = xt->dst_start; + desc->lli.ctrla = ctrla | xfer_count; + desc->lli.ctrlb = ctrlb; + + desc->boundary = first->size >> dwidth; + desc->dst_hole = (dmaengine_get_dst_icg(xt, first) >> dwidth) + 1; + desc->src_hole = (dmaengine_get_src_icg(xt, first) >> dwidth) + 1; + + desc->txd.cookie = -EBUSY; + desc->total_len = desc->len = len; + desc->tx_width = dwidth; + + /* set end-of-link to the last link descriptor of list*/ + set_desc_eol(desc); + + desc->txd.flags = flags; /* client is in control of this ack */ + + return &desc->txd; +} + +/** * atc_prep_dma_memcpy - prepare a memcpy operation * @chan: the channel to prepare operation on * @dest: operation virtual destination address @@ -1609,6 +1711,7 @@ static int __init at_dma_probe(struct platform_device *pdev) /* setup platform data for each SoC */ dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask); dma_cap_set(DMA_SG, 
at91sam9rl_config.cap_mask); + dma_cap_set(DMA_INTERLEAVE, at91sam9g45_config.cap_mask); dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask); dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask); dma_cap_set(DMA_SG, at91sam9g45_config.cap_mask); @@ -1713,6 +1816,9 @@ static int __init at_dma_probe(struct platform_device *pdev) atdma->dma_common.dev = &pdev->dev; /* set prep routines based on capability */ + if (dma_has_cap(DMA_INTERLEAVE, atdma->dma_common.cap_mask)) + atdma->dma_common.device_prep_interleaved_dma = atc_prep_dma_interleaved; + if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask)) atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy; diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h index 2727ca560572..bc8d5ebedd19 100644 --- a/drivers/dma/at_hdmac_regs.h +++ b/drivers/dma/at_hdmac_regs.h @@ -196,6 +196,11 @@ struct at_desc { size_t len; u32 tx_width; size_t total_len; + + /* Interleaved data */ + size_t boundary; + size_t dst_hole; + size_t src_hole; }; static inline struct at_desc * diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 7992164ea9ec..cf1213de7865 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -235,6 +235,10 @@ struct at_xdmac_lld { dma_addr_t mbr_sa; /* Source Address Member */ dma_addr_t mbr_da; /* Destination Address Member */ u32 mbr_cfg; /* Configuration Register */ + u32 mbr_bc; /* Block Control Register */ + u32 mbr_ds; /* Data Stride Register */ + u32 mbr_sus; /* Source Microblock Stride Register */ + u32 mbr_dus; /* Destination Microblock Stride Register */ }; @@ -358,6 +362,8 @@ static void at_xdmac_start_xfer(struct at_xdmac_chan *atchan, if (at_xdmac_chan_is_cyclic(atchan)) { reg = AT_XDMAC_CNDC_NDVIEW_NDV1; at_xdmac_chan_write(atchan, AT_XDMAC_CC, first->lld.mbr_cfg); + } else if (first->lld.mbr_ubc & AT_XDMAC_MBR_UBC_NDV3) { + reg = AT_XDMAC_CNDC_NDVIEW_NDV3; } else { /* * No need to write AT_XDMAC_CC reg, it will be done when the @@ -465,6 +471,33 @@ 
static struct at_xdmac_desc *at_xdmac_get_desc(struct at_xdmac_chan *atchan) return desc; } +static void at_xdmac_queue_desc(struct dma_chan *chan, + struct at_xdmac_desc *prev, + struct at_xdmac_desc *desc) +{ + if (!prev || !desc) + return; + + prev->lld.mbr_nda = desc->tx_dma_desc.phys; + prev->lld.mbr_ubc |= AT_XDMAC_MBR_UBC_NDE; + + dev_dbg(chan2dev(chan), "%s: chain lld: prev=0x%p, mbr_nda=%pad\n", + __func__, prev, &prev->lld.mbr_nda); +} + +static inline void at_xdmac_increment_block_count(struct dma_chan *chan, + struct at_xdmac_desc *desc) +{ + if (!desc) + return; + + desc->lld.mbr_bc++; + + dev_dbg(chan2dev(chan), + "%s: incrementing the block count of the desc 0x%p\n", + __func__, desc); +} + static struct dma_chan *at_xdmac_xlate(struct of_phandle_args *dma_spec, struct of_dma *of_dma) { @@ -656,19 +689,14 @@ at_xdmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2 /* next descriptor view */ | AT_XDMAC_MBR_UBC_NDEN /* next descriptor dst parameter update */ | AT_XDMAC_MBR_UBC_NSEN /* next descriptor src parameter update */ - | (i == sg_len - 1 ? 0 : AT_XDMAC_MBR_UBC_NDE) /* descriptor fetch */ | (len >> fixed_dwidth); /* microblock length */ dev_dbg(chan2dev(chan), "%s: lld: mbr_sa=%pad, mbr_da=%pad, mbr_ubc=0x%08x\n", __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc); /* Chain lld. 
*/ - if (prev) { - prev->lld.mbr_nda = desc->tx_dma_desc.phys; - dev_dbg(chan2dev(chan), - "%s: chain lld: prev=0x%p, mbr_nda=%pad\n", - __func__, prev, &prev->lld.mbr_nda); - } + if (prev) + at_xdmac_queue_desc(chan, prev, desc); prev = desc; if (!first) @@ -748,7 +776,6 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV1 | AT_XDMAC_MBR_UBC_NDEN | AT_XDMAC_MBR_UBC_NSEN - | AT_XDMAC_MBR_UBC_NDE | period_len >> at_xdmac_get_dwidth(desc->lld.mbr_cfg); dev_dbg(chan2dev(chan), @@ -756,12 +783,8 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc); /* Chain lld. */ - if (prev) { - prev->lld.mbr_nda = desc->tx_dma_desc.phys; - dev_dbg(chan2dev(chan), - "%s: chain lld: prev=0x%p, mbr_nda=%pad\n", - __func__, prev, &prev->lld.mbr_nda); - } + if (prev) + at_xdmac_queue_desc(chan, prev, desc); prev = desc; if (!first) @@ -783,6 +806,215 @@ at_xdmac_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, return &first->tx_dma_desc; } +static inline u32 at_xdmac_align_width(struct dma_chan *chan, dma_addr_t addr) +{ + u32 width; + + /* + * Check address alignment to select the greater data width we + * can use. + * + * Some XDMAC implementations don't provide dword transfer, in + * this case selecting dword has the same behavior as + * selecting word transfers. 
+ */ + if (!(addr & 7)) { + width = AT_XDMAC_CC_DWIDTH_DWORD; + dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__); + } else if (!(addr & 3)) { + width = AT_XDMAC_CC_DWIDTH_WORD; + dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__); + } else if (!(addr & 1)) { + width = AT_XDMAC_CC_DWIDTH_HALFWORD; + dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__); + } else { + width = AT_XDMAC_CC_DWIDTH_BYTE; + dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__); + } + + return width; +} + +static struct at_xdmac_desc * +at_xdmac_interleaved_queue_desc(struct dma_chan *chan, + struct at_xdmac_chan *atchan, + struct at_xdmac_desc *prev, + dma_addr_t src, dma_addr_t dst, + struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + struct at_xdmac_desc *desc; + u32 dwidth; + unsigned long flags; + size_t ublen; + /* + * WARNING: The channel configuration is set here since there is no + * dmaengine_slave_config call in this case. Moreover we don't know the + * direction, it involves we can't dynamically set the source and dest + * interface so we have to use the same one. Only interface 0 allows EBI + * access. Hopefully we can access DDR through both ports (at least on + * SAMA5D4x), so we can use the same interface for source and dest, + * that solves the fact we don't know the direction. 
+ */ + u32 chan_cc = AT_XDMAC_CC_DIF(0) + | AT_XDMAC_CC_SIF(0) + | AT_XDMAC_CC_MBSIZE_SIXTEEN + | AT_XDMAC_CC_TYPE_MEM_TRAN; + + dwidth = at_xdmac_align_width(chan, src | dst | chunk->size); + if (chunk->size >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) { + dev_dbg(chan2dev(chan), + "%s: chunk too big (%d, max size %lu)...\n", + __func__, chunk->size, + AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth); + return NULL; + } + + if (prev) + dev_dbg(chan2dev(chan), + "Adding items at the end of desc 0x%p\n", prev); + + if (xt->src_inc) { + if (xt->src_sgl) + chan_cc |= AT_XDMAC_CC_SAM_UBS_DS_AM; + else + chan_cc |= AT_XDMAC_CC_SAM_INCREMENTED_AM; + } + + if (xt->dst_inc) { + if (xt->dst_sgl) + chan_cc |= AT_XDMAC_CC_DAM_UBS_DS_AM; + else + chan_cc |= AT_XDMAC_CC_DAM_INCREMENTED_AM; + } + + spin_lock_irqsave(&atchan->lock, flags); + desc = at_xdmac_get_desc(atchan); + spin_unlock_irqrestore(&atchan->lock, flags); + if (!desc) { + dev_err(chan2dev(chan), "can't get descriptor\n"); + return NULL; + } + + chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth); + + ublen = chunk->size >> dwidth; + + desc->lld.mbr_sa = src; + desc->lld.mbr_da = dst; + desc->lld.mbr_sus = dmaengine_get_src_icg(xt, chunk); + desc->lld.mbr_dus = dmaengine_get_dst_icg(xt, chunk); + + desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3 + | AT_XDMAC_MBR_UBC_NDEN + | AT_XDMAC_MBR_UBC_NSEN + | ublen; + desc->lld.mbr_cfg = chan_cc; + + dev_dbg(chan2dev(chan), + "%s: lld: mbr_sa=0x%08x, mbr_da=0x%08x, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n", + __func__, desc->lld.mbr_sa, desc->lld.mbr_da, + desc->lld.mbr_ubc, desc->lld.mbr_cfg); + + /* Chain lld. 
*/ + if (prev) + at_xdmac_queue_desc(chan, prev, desc); + + return desc; +} + +static struct dma_async_tx_descriptor * +at_xdmac_prep_interleaved(struct dma_chan *chan, + struct dma_interleaved_template *xt, + unsigned long flags) +{ + struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); + struct at_xdmac_desc *prev = NULL, *first = NULL; + struct data_chunk *chunk, *prev_chunk = NULL; + dma_addr_t dst_addr, src_addr; + size_t dst_skip, src_skip, len = 0; + size_t prev_dst_icg = 0, prev_src_icg = 0; + int i; + + if (!xt || (xt->numf != 1) || (xt->dir != DMA_MEM_TO_MEM)) + return NULL; + + dev_dbg(chan2dev(chan), "%s: src=0x%08x, dest=0x%08x, numf=%d, frame_size=%d, flags=0x%lx\n", + __func__, xt->src_start, xt->dst_start, xt->numf, + xt->frame_size, flags); + + src_addr = xt->src_start; + dst_addr = xt->dst_start; + + for (i = 0; i < xt->frame_size; i++) { + struct at_xdmac_desc *desc; + size_t src_icg, dst_icg; + + chunk = xt->sgl + i; + + dst_icg = dmaengine_get_dst_icg(xt, chunk); + src_icg = dmaengine_get_src_icg(xt, chunk); + + src_skip = chunk->size + src_icg; + dst_skip = chunk->size + dst_icg; + + dev_dbg(chan2dev(chan), + "%s: chunk size=%d, src icg=%d, dst icg=%d\n", + __func__, chunk->size, src_icg, dst_icg); + + /* + * Handle the case where we just have the same + * transfer to setup, we can just increase the + * block number and reuse the same descriptor. 
+ */ + if (prev_chunk && prev && + (prev_chunk->size == chunk->size) && + (prev_src_icg == src_icg) && + (prev_dst_icg == dst_icg)) { + dev_dbg(chan2dev(chan), + "%s: same configuration that the previous chunk, merging the descriptors...\n", + __func__); + at_xdmac_increment_block_count(chan, prev); + continue; + } + + desc = at_xdmac_interleaved_queue_desc(chan, atchan, + prev, + src_addr, dst_addr, + xt, chunk); + if (!desc) { + list_splice_init(&first->descs_list, + &atchan->free_descs_list); + return NULL; + } + + if (!first) + first = desc; + + dev_dbg(chan2dev(chan), "%s: add desc 0x%p to descs_list 0x%p\n", + __func__, desc, first); + list_add_tail(&desc->desc_node, &first->descs_list); + + if (xt->src_sgl) + src_addr += src_skip; + + if (xt->dst_sgl) + dst_addr += dst_skip; + + len += chunk->size; + prev_chunk = chunk; + prev_dst_icg = dst_icg; + prev_src_icg = src_icg; + prev = desc; + } + + first->tx_dma_desc.cookie = -EBUSY; + first->tx_dma_desc.flags = flags; + first->xfer_size = len; + + return &first->tx_dma_desc; +} + static struct dma_async_tx_descriptor * at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, size_t len, unsigned long flags) @@ -814,24 +1046,7 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, if (unlikely(!len)) return NULL; - /* - * Check address alignment to select the greater data width we can use. - * Some XDMAC implementations don't provide dword transfer, in this - * case selecting dword has the same behavior as selecting word transfers. 
- */ - if (!((src_addr | dst_addr) & 7)) { - dwidth = AT_XDMAC_CC_DWIDTH_DWORD; - dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__); - } else if (!((src_addr | dst_addr) & 3)) { - dwidth = AT_XDMAC_CC_DWIDTH_WORD; - dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__); - } else if (!((src_addr | dst_addr) & 1)) { - dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD; - dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__); - } else { - dwidth = AT_XDMAC_CC_DWIDTH_BYTE; - dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__); - } + dwidth = at_xdmac_align_width(chan, src_addr | dst_addr); /* Prepare descriptors. */ while (remaining_size) { @@ -861,19 +1076,8 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, dev_dbg(chan2dev(chan), "%s: xfer_size=%zu\n", __func__, xfer_size); /* Check remaining length and change data width if needed. */ - if (!((src_addr | dst_addr | xfer_size) & 7)) { - dwidth = AT_XDMAC_CC_DWIDTH_DWORD; - dev_dbg(chan2dev(chan), "%s: dwidth: double word\n", __func__); - } else if (!((src_addr | dst_addr | xfer_size) & 3)) { - dwidth = AT_XDMAC_CC_DWIDTH_WORD; - dev_dbg(chan2dev(chan), "%s: dwidth: word\n", __func__); - } else if (!((src_addr | dst_addr | xfer_size) & 1)) { - dwidth = AT_XDMAC_CC_DWIDTH_HALFWORD; - dev_dbg(chan2dev(chan), "%s: dwidth: half word\n", __func__); - } else if ((src_addr | dst_addr | xfer_size) & 1) { - dwidth = AT_XDMAC_CC_DWIDTH_BYTE; - dev_dbg(chan2dev(chan), "%s: dwidth: byte\n", __func__); - } + dwidth = at_xdmac_align_width(chan, + src_addr | dst_addr | xfer_size); chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth); ublen = xfer_size >> dwidth; @@ -884,7 +1088,6 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV2 | AT_XDMAC_MBR_UBC_NDEN | AT_XDMAC_MBR_UBC_NSEN - | (remaining_size ? 
AT_XDMAC_MBR_UBC_NDE : 0) | ublen; desc->lld.mbr_cfg = chan_cc; @@ -893,12 +1096,8 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, __func__, &desc->lld.mbr_sa, &desc->lld.mbr_da, desc->lld.mbr_ubc, desc->lld.mbr_cfg); /* Chain lld. */ - if (prev) { - prev->lld.mbr_nda = desc->tx_dma_desc.phys; - dev_dbg(chan2dev(chan), - "%s: chain lld: prev=0x%p, mbr_nda=0x%08x\n", - __func__, prev, prev->lld.mbr_nda); - } + if (prev) + at_xdmac_queue_desc(chan, prev, desc); prev = desc; if (!first) @@ -915,6 +1114,93 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, return &first->tx_dma_desc; } +static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan, + struct at_xdmac_chan *atchan, + dma_addr_t dst_addr, + size_t len, + int value) +{ + struct at_xdmac_desc *desc; + unsigned long flags; + size_t ublen; + u32 dwidth; + /* + * WARNING: The channel configuration is set here since there is no + * dmaengine_slave_config call in this case. Moreover we don't know the + * direction, it involves we can't dynamically set the source and dest + * interface so we have to use the same one. Only interface 0 allows EBI + * access. Hopefully we can access DDR through both ports (at least on + * SAMA5D4x), so we can use the same interface for source and dest, + * that solves the fact we don't know the direction. 
+ */ + u32 chan_cc = AT_XDMAC_CC_DAM_INCREMENTED_AM + | AT_XDMAC_CC_SAM_INCREMENTED_AM + | AT_XDMAC_CC_DIF(0) + | AT_XDMAC_CC_SIF(0) + | AT_XDMAC_CC_MBSIZE_SIXTEEN + | AT_XDMAC_CC_MEMSET_HW_MODE + | AT_XDMAC_CC_TYPE_MEM_TRAN; + + dwidth = at_xdmac_align_width(chan, dst_addr); + + if (len >= (AT_XDMAC_MBR_UBC_UBLEN_MAX << dwidth)) { + dev_err(chan2dev(chan), + "%s: Transfer too large, aborting...\n", + __func__); + return NULL; + } + + spin_lock_irqsave(&atchan->lock, flags); + desc = at_xdmac_get_desc(atchan); + spin_unlock_irqrestore(&atchan->lock, flags); + if (!desc) { + dev_err(chan2dev(chan), "can't get descriptor\n"); + return NULL; + } + + chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth); + + ublen = len >> dwidth; + + desc->lld.mbr_da = dst_addr; + desc->lld.mbr_ds = value; + desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3 + | AT_XDMAC_MBR_UBC_NDEN + | AT_XDMAC_MBR_UBC_NSEN + | ublen; + desc->lld.mbr_cfg = chan_cc; + + dev_dbg(chan2dev(chan), + "%s: lld: mbr_da=0x%08x, mbr_ds=0x%08x, mbr_ubc=0x%08x, mbr_cfg=0x%08x\n", + __func__, desc->lld.mbr_da, desc->lld.mbr_ds, desc->lld.mbr_ubc, + desc->lld.mbr_cfg); + + return desc; +} + +struct dma_async_tx_descriptor * +at_xdmac_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value, + size_t len, unsigned long flags) +{ + struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); + struct at_xdmac_desc *desc; + + dev_dbg(chan2dev(chan), "%s: dest=0x%08x, len=%d, pattern=0x%x, flags=0x%lx\n", + __func__, dest, len, value, flags); + + if (unlikely(!len)) + return NULL; + + desc = at_xdmac_memset_create_desc(chan, atchan, dest, len, value); + list_add_tail(&desc->desc_node, &desc->descs_list); + + desc->tx_dma_desc.cookie = -EBUSY; + desc->tx_dma_desc.flags = flags; + desc->xfer_size = len; + + return &desc->tx_dma_desc; +} + static enum dma_status at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, struct dma_tx_state *txstate) @@ -1445,7 +1731,9 @@ static int at_xdmac_probe(struct platform_device *pdev) } 
dma_cap_set(DMA_CYCLIC, atxdmac->dma.cap_mask); + dma_cap_set(DMA_INTERLEAVE, atxdmac->dma.cap_mask); dma_cap_set(DMA_MEMCPY, atxdmac->dma.cap_mask); + dma_cap_set(DMA_MEMSET, atxdmac->dma.cap_mask); dma_cap_set(DMA_SLAVE, atxdmac->dma.cap_mask); /* * Without DMA_PRIVATE the driver is not able to allocate more than @@ -1458,7 +1746,9 @@ static int at_xdmac_probe(struct platform_device *pdev) atxdmac->dma.device_tx_status = at_xdmac_tx_status; atxdmac->dma.device_issue_pending = at_xdmac_issue_pending; atxdmac->dma.device_prep_dma_cyclic = at_xdmac_prep_dma_cyclic; + atxdmac->dma.device_prep_interleaved_dma = at_xdmac_prep_interleaved; atxdmac->dma.device_prep_dma_memcpy = at_xdmac_prep_dma_memcpy; + atxdmac->dma.device_prep_dma_memset = at_xdmac_prep_dma_memset; atxdmac->dma.device_prep_slave_sg = at_xdmac_prep_slave_sg; atxdmac->dma.device_config = at_xdmac_device_config; atxdmac->dma.device_pause = at_xdmac_device_pause; diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 3ddfd1f6c23c..4a4cce15f25d 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -267,6 +267,13 @@ static void dma_chan_put(struct dma_chan *chan) /* This channel is not in use anymore, free it */ if (!chan->client_count && chan->device->device_free_chan_resources) chan->device->device_free_chan_resources(chan); + + /* If the channel is used via a DMA request router, free the mapping */ + if (chan->router && chan->router->route_free) { + chan->router->route_free(chan->router->dev, chan->route_data); + chan->router = NULL; + chan->route_data = NULL; + } } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) @@ -536,7 +543,7 @@ static struct dma_chan *private_candidate(const dma_cap_mask_t *mask, } /** - * dma_request_slave_channel - try to get specific channel exclusively + * dma_get_slave_channel - try to get specific channel exclusively * @chan: target channel */ struct dma_chan *dma_get_slave_channel(struct dma_chan *chan) @@ -648,7 +655,7 
@@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, EXPORT_SYMBOL_GPL(__dma_request_channel); /** - * dma_request_slave_channel - try to allocate an exclusive slave channel + * dma_request_slave_channel_reason - try to allocate an exclusive slave channel * @dev: pointer to client device structure * @name: slave channel name * @@ -836,6 +843,8 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_pq); BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) && !device->device_prep_dma_pq_val); + BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && + !device->device_prep_dma_memset); BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && !device->device_prep_dma_interrupt); BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) && diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index 24e5290faa32..57ff46284f15 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -1364,7 +1364,7 @@ static int __init ep93xx_dma_probe(struct platform_device *pdev) return ret; } -static struct platform_device_id ep93xx_dma_driver_ids[] = { +static const struct platform_device_id ep93xx_dma_driver_ids[] = { { "ep93xx-dma-m2p", 0 }, { "ep93xx-dma-m2m", 1 }, { }, diff --git a/drivers/dma/fsl-edma.c b/drivers/dma/fsl-edma.c index 09e2842d15ec..915eec3cc279 100644 --- a/drivers/dma/fsl-edma.c +++ b/drivers/dma/fsl-edma.c @@ -881,10 +881,6 @@ static int fsl_edma_probe(struct platform_device *pdev) } - ret = fsl_edma_irq_init(pdev, fsl_edma); - if (ret) - return ret; - fsl_edma->big_endian = of_property_read_bool(np, "big-endian"); INIT_LIST_HEAD(&fsl_edma->dma_dev.channels); @@ -900,6 +896,11 @@ static int fsl_edma_probe(struct platform_device *pdev) fsl_edma_chan_mux(fsl_chan, 0, false); } + edma_writel(fsl_edma, ~0, fsl_edma->membase + EDMA_INTR); + ret = fsl_edma_irq_init(pdev, fsl_edma); + if (ret) + return ret; + dma_cap_set(DMA_PRIVATE, fsl_edma->dma_dev.cap_mask); dma_cap_set(DMA_SLAVE, fsl_edma->dma_dev.cap_mask); 
dma_cap_set(DMA_CYCLIC, fsl_edma->dma_dev.cap_mask); diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index eed405976ea9..865501fcc67d 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -193,7 +193,7 @@ struct imxdma_filter_data { int request; }; -static struct platform_device_id imx_dma_devtype[] = { +static const struct platform_device_id imx_dma_devtype[] = { { .name = "imx1-dma", .driver_data = IMX1_DMA, diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 62bbd79338e0..77b6aab04f47 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -420,7 +420,7 @@ static struct sdma_driver_data sdma_imx6q = { .script_addrs = &sdma_script_imx6q, }; -static struct platform_device_id sdma_devtypes[] = { +static const struct platform_device_id sdma_devtypes[] = { { .name = "imx25-sdma", .driver_data = (unsigned long)&sdma_imx25, diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 1c56001df676..fbaf1ead2597 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -19,6 +19,7 @@ #include <linux/dma-mapping.h> #include <linux/spinlock.h> #include <linux/interrupt.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/memory.h> #include <linux/clk.h> @@ -30,6 +31,11 @@ #include "dmaengine.h" #include "mv_xor.h" +enum mv_xor_mode { + XOR_MODE_IN_REG, + XOR_MODE_IN_DESC, +}; + static void mv_xor_issue_pending(struct dma_chan *chan); #define to_mv_xor_chan(chan) \ @@ -56,18 +62,30 @@ static void mv_desc_init(struct mv_xor_desc_slot *desc, hw_desc->byte_count = byte_count; } -static void mv_desc_set_next_desc(struct mv_xor_desc_slot *desc, - u32 next_desc_addr) +static void mv_desc_set_mode(struct mv_xor_desc_slot *desc) { struct mv_xor_desc *hw_desc = desc->hw_desc; - BUG_ON(hw_desc->phy_next_desc); - hw_desc->phy_next_desc = next_desc_addr; + + switch (desc->type) { + case DMA_XOR: + case DMA_INTERRUPT: + hw_desc->desc_command |= XOR_DESC_OPERATION_XOR; + break; + case DMA_MEMCPY: + 
hw_desc->desc_command |= XOR_DESC_OPERATION_MEMCPY; + break; + default: + BUG(); + return; + } } -static void mv_desc_clear_next_desc(struct mv_xor_desc_slot *desc) +static void mv_desc_set_next_desc(struct mv_xor_desc_slot *desc, + u32 next_desc_addr) { struct mv_xor_desc *hw_desc = desc->hw_desc; - hw_desc->phy_next_desc = 0; + BUG_ON(hw_desc->phy_next_desc); + hw_desc->phy_next_desc = next_desc_addr; } static void mv_desc_set_src_addr(struct mv_xor_desc_slot *desc, @@ -104,7 +122,7 @@ static u32 mv_chan_get_intr_cause(struct mv_xor_chan *chan) return intr_cause; } -static void mv_xor_device_clear_eoc_cause(struct mv_xor_chan *chan) +static void mv_chan_clear_eoc_cause(struct mv_xor_chan *chan) { u32 val; @@ -114,14 +132,14 @@ static void mv_xor_device_clear_eoc_cause(struct mv_xor_chan *chan) writel_relaxed(val, XOR_INTR_CAUSE(chan)); } -static void mv_xor_device_clear_err_status(struct mv_xor_chan *chan) +static void mv_chan_clear_err_status(struct mv_xor_chan *chan) { u32 val = 0xFFFF0000 >> (chan->idx * 16); writel_relaxed(val, XOR_INTR_CAUSE(chan)); } -static void mv_set_mode(struct mv_xor_chan *chan, - enum dma_transaction_type type) +static void mv_chan_set_mode(struct mv_xor_chan *chan, + enum dma_transaction_type type) { u32 op_mode; u32 config = readl_relaxed(XOR_CONFIG(chan)); @@ -144,6 +162,25 @@ static void mv_set_mode(struct mv_xor_chan *chan, config &= ~0x7; config |= op_mode; + if (IS_ENABLED(__BIG_ENDIAN)) + config |= XOR_DESCRIPTOR_SWAP; + else + config &= ~XOR_DESCRIPTOR_SWAP; + + writel_relaxed(config, XOR_CONFIG(chan)); + chan->current_type = type; +} + +static void mv_chan_set_mode_to_desc(struct mv_xor_chan *chan) +{ + u32 op_mode; + u32 config = readl_relaxed(XOR_CONFIG(chan)); + + op_mode = XOR_OPERATION_MODE_IN_DESC; + + config &= ~0x7; + config |= op_mode; + #if defined(__BIG_ENDIAN) config |= XOR_DESCRIPTOR_SWAP; #else @@ -151,7 +188,6 @@ static void mv_set_mode(struct mv_xor_chan *chan, #endif writel_relaxed(config, XOR_CONFIG(chan)); 
- chan->current_type = type; } static void mv_chan_activate(struct mv_xor_chan *chan) @@ -171,28 +207,13 @@ static char mv_chan_is_busy(struct mv_xor_chan *chan) return (state == 1) ? 1 : 0; } -/** - * mv_xor_free_slots - flags descriptor slots for reuse - * @slot: Slot to free - * Caller must hold &mv_chan->lock while calling this function - */ -static void mv_xor_free_slots(struct mv_xor_chan *mv_chan, - struct mv_xor_desc_slot *slot) -{ - dev_dbg(mv_chan_to_devp(mv_chan), "%s %d slot %p\n", - __func__, __LINE__, slot); - - slot->slot_used = 0; - -} - /* - * mv_xor_start_new_chain - program the engine to operate on new chain headed by - * sw_desc + * mv_chan_start_new_chain - program the engine to operate on new + * chain headed by sw_desc * Caller must hold &mv_chan->lock while calling this function */ -static void mv_xor_start_new_chain(struct mv_xor_chan *mv_chan, - struct mv_xor_desc_slot *sw_desc) +static void mv_chan_start_new_chain(struct mv_xor_chan *mv_chan, + struct mv_xor_desc_slot *sw_desc) { dev_dbg(mv_chan_to_devp(mv_chan), "%s %d: sw_desc %p\n", __func__, __LINE__, sw_desc); @@ -205,8 +226,9 @@ static void mv_xor_start_new_chain(struct mv_xor_chan *mv_chan, } static dma_cookie_t -mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, - struct mv_xor_chan *mv_chan, dma_cookie_t cookie) +mv_desc_run_tx_complete_actions(struct mv_xor_desc_slot *desc, + struct mv_xor_chan *mv_chan, + dma_cookie_t cookie) { BUG_ON(desc->async_tx.cookie < 0); @@ -230,93 +252,110 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, } static int -mv_xor_clean_completed_slots(struct mv_xor_chan *mv_chan) +mv_chan_clean_completed_slots(struct mv_xor_chan *mv_chan) { struct mv_xor_desc_slot *iter, *_iter; dev_dbg(mv_chan_to_devp(mv_chan), "%s %d\n", __func__, __LINE__); list_for_each_entry_safe(iter, _iter, &mv_chan->completed_slots, - completed_node) { + node) { - if (async_tx_test_ack(&iter->async_tx)) { - list_del(&iter->completed_node); - 
mv_xor_free_slots(mv_chan, iter); - } + if (async_tx_test_ack(&iter->async_tx)) + list_move_tail(&iter->node, &mv_chan->free_slots); } return 0; } static int -mv_xor_clean_slot(struct mv_xor_desc_slot *desc, - struct mv_xor_chan *mv_chan) +mv_desc_clean_slot(struct mv_xor_desc_slot *desc, + struct mv_xor_chan *mv_chan) { dev_dbg(mv_chan_to_devp(mv_chan), "%s %d: desc %p flags %d\n", __func__, __LINE__, desc, desc->async_tx.flags); - list_del(&desc->chain_node); + /* the client is allowed to attach dependent operations * until 'ack' is set */ - if (!async_tx_test_ack(&desc->async_tx)) { + if (!async_tx_test_ack(&desc->async_tx)) /* move this slot to the completed_slots */ - list_add_tail(&desc->completed_node, &mv_chan->completed_slots); - return 0; - } + list_move_tail(&desc->node, &mv_chan->completed_slots); + else + list_move_tail(&desc->node, &mv_chan->free_slots); - mv_xor_free_slots(mv_chan, desc); return 0; } /* This function must be called with the mv_xor_chan spinlock held */ -static void mv_xor_slot_cleanup(struct mv_xor_chan *mv_chan) +static void mv_chan_slot_cleanup(struct mv_xor_chan *mv_chan) { struct mv_xor_desc_slot *iter, *_iter; dma_cookie_t cookie = 0; int busy = mv_chan_is_busy(mv_chan); u32 current_desc = mv_chan_get_current_desc(mv_chan); - int seen_current = 0; + int current_cleaned = 0; + struct mv_xor_desc *hw_desc; dev_dbg(mv_chan_to_devp(mv_chan), "%s %d\n", __func__, __LINE__); dev_dbg(mv_chan_to_devp(mv_chan), "current_desc %x\n", current_desc); - mv_xor_clean_completed_slots(mv_chan); + mv_chan_clean_completed_slots(mv_chan); /* free completed slots from the chain starting with * the oldest descriptor */ list_for_each_entry_safe(iter, _iter, &mv_chan->chain, - chain_node) { - prefetch(_iter); - prefetch(&_iter->async_tx); + node) { - /* do not advance past the current descriptor loaded into the - * hardware channel, subsequent descriptors are either in - * process or have not been submitted - */ - if (seen_current) - break; + /* clean 
finished descriptors */ + hw_desc = iter->hw_desc; + if (hw_desc->status & XOR_DESC_SUCCESS) { + cookie = mv_desc_run_tx_complete_actions(iter, mv_chan, + cookie); - /* stop the search if we reach the current descriptor and the - * channel is busy - */ - if (iter->async_tx.phys == current_desc) { - seen_current = 1; - if (busy) + /* done processing desc, clean slot */ + mv_desc_clean_slot(iter, mv_chan); + + /* break if we did cleaned the current */ + if (iter->async_tx.phys == current_desc) { + current_cleaned = 1; break; + } + } else { + if (iter->async_tx.phys == current_desc) { + current_cleaned = 0; + break; + } } - - cookie = mv_xor_run_tx_complete_actions(iter, mv_chan, cookie); - - if (mv_xor_clean_slot(iter, mv_chan)) - break; } if ((busy == 0) && !list_empty(&mv_chan->chain)) { - struct mv_xor_desc_slot *chain_head; - chain_head = list_entry(mv_chan->chain.next, - struct mv_xor_desc_slot, - chain_node); - - mv_xor_start_new_chain(mv_chan, chain_head); + if (current_cleaned) { + /* + * current descriptor cleaned and removed, run + * from list head + */ + iter = list_entry(mv_chan->chain.next, + struct mv_xor_desc_slot, + node); + mv_chan_start_new_chain(mv_chan, iter); + } else { + if (!list_is_last(&iter->node, &mv_chan->chain)) { + /* + * descriptors are still waiting after + * current, trigger them + */ + iter = list_entry(iter->node.next, + struct mv_xor_desc_slot, + node); + mv_chan_start_new_chain(mv_chan, iter); + } else { + /* + * some descriptors are still waiting + * to be cleaned + */ + tasklet_schedule(&mv_chan->irq_tasklet); + } + } } if (cookie > 0) @@ -328,56 +367,35 @@ static void mv_xor_tasklet(unsigned long data) struct mv_xor_chan *chan = (struct mv_xor_chan *) data; spin_lock_bh(&chan->lock); - mv_xor_slot_cleanup(chan); + mv_chan_slot_cleanup(chan); spin_unlock_bh(&chan->lock); } static struct mv_xor_desc_slot * -mv_xor_alloc_slot(struct mv_xor_chan *mv_chan) +mv_chan_alloc_slot(struct mv_xor_chan *mv_chan) { - struct mv_xor_desc_slot 
*iter, *_iter; - int retry = 0; + struct mv_xor_desc_slot *iter; - /* start search from the last allocated descrtiptor - * if a contiguous allocation can not be found start searching - * from the beginning of the list - */ -retry: - if (retry == 0) - iter = mv_chan->last_used; - else - iter = list_entry(&mv_chan->all_slots, - struct mv_xor_desc_slot, - slot_node); - - list_for_each_entry_safe_continue( - iter, _iter, &mv_chan->all_slots, slot_node) { - - prefetch(_iter); - prefetch(&_iter->async_tx); - if (iter->slot_used) { - /* give up after finding the first busy slot - * on the second pass through the list - */ - if (retry) - break; - continue; - } + spin_lock_bh(&mv_chan->lock); + + if (!list_empty(&mv_chan->free_slots)) { + iter = list_first_entry(&mv_chan->free_slots, + struct mv_xor_desc_slot, + node); + + list_move_tail(&iter->node, &mv_chan->allocated_slots); + + spin_unlock_bh(&mv_chan->lock); /* pre-ack descriptor */ async_tx_ack(&iter->async_tx); - - iter->slot_used = 1; - INIT_LIST_HEAD(&iter->chain_node); iter->async_tx.cookie = -EBUSY; - mv_chan->last_used = iter; - mv_desc_clear_next_desc(iter); return iter; } - if (!retry++) - goto retry; + + spin_unlock_bh(&mv_chan->lock); /* try to free some slots if the allocation fails */ tasklet_schedule(&mv_chan->irq_tasklet); @@ -403,14 +421,14 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx) cookie = dma_cookie_assign(tx); if (list_empty(&mv_chan->chain)) - list_add_tail(&sw_desc->chain_node, &mv_chan->chain); + list_move_tail(&sw_desc->node, &mv_chan->chain); else { new_hw_chain = 0; old_chain_tail = list_entry(mv_chan->chain.prev, struct mv_xor_desc_slot, - chain_node); - list_add_tail(&sw_desc->chain_node, &mv_chan->chain); + node); + list_move_tail(&sw_desc->node, &mv_chan->chain); dev_dbg(mv_chan_to_devp(mv_chan), "Append to last desc %pa\n", &old_chain_tail->async_tx.phys); @@ -431,7 +449,7 @@ mv_xor_tx_submit(struct dma_async_tx_descriptor *tx) } if (new_hw_chain) - 
mv_xor_start_new_chain(mv_chan, sw_desc); + mv_chan_start_new_chain(mv_chan, sw_desc); spin_unlock_bh(&mv_chan->lock); @@ -463,26 +481,20 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan) dma_async_tx_descriptor_init(&slot->async_tx, chan); slot->async_tx.tx_submit = mv_xor_tx_submit; - INIT_LIST_HEAD(&slot->chain_node); - INIT_LIST_HEAD(&slot->slot_node); + INIT_LIST_HEAD(&slot->node); dma_desc = mv_chan->dma_desc_pool; slot->async_tx.phys = dma_desc + idx * MV_XOR_SLOT_SIZE; slot->idx = idx++; spin_lock_bh(&mv_chan->lock); mv_chan->slots_allocated = idx; - list_add_tail(&slot->slot_node, &mv_chan->all_slots); + list_add_tail(&slot->node, &mv_chan->free_slots); spin_unlock_bh(&mv_chan->lock); } - if (mv_chan->slots_allocated && !mv_chan->last_used) - mv_chan->last_used = list_entry(mv_chan->all_slots.next, - struct mv_xor_desc_slot, - slot_node); - dev_dbg(mv_chan_to_devp(mv_chan), - "allocated %d descriptor slots last_used: %p\n", - mv_chan->slots_allocated, mv_chan->last_used); + "allocated %d descriptor slots\n", + mv_chan->slots_allocated); return mv_chan->slots_allocated ? 
: -ENOMEM; } @@ -503,16 +515,17 @@ mv_xor_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src, "%s src_cnt: %d len: %u dest %pad flags: %ld\n", __func__, src_cnt, len, &dest, flags); - spin_lock_bh(&mv_chan->lock); - sw_desc = mv_xor_alloc_slot(mv_chan); + sw_desc = mv_chan_alloc_slot(mv_chan); if (sw_desc) { sw_desc->type = DMA_XOR; sw_desc->async_tx.flags = flags; mv_desc_init(sw_desc, dest, len, flags); + if (mv_chan->op_in_desc == XOR_MODE_IN_DESC) + mv_desc_set_mode(sw_desc); while (src_cnt--) mv_desc_set_src_addr(sw_desc, src_cnt, src[src_cnt]); } - spin_unlock_bh(&mv_chan->lock); + dev_dbg(mv_chan_to_devp(mv_chan), "%s sw_desc %p async_tx %p \n", __func__, sw_desc, &sw_desc->async_tx); @@ -556,25 +569,29 @@ static void mv_xor_free_chan_resources(struct dma_chan *chan) spin_lock_bh(&mv_chan->lock); - mv_xor_slot_cleanup(mv_chan); + mv_chan_slot_cleanup(mv_chan); list_for_each_entry_safe(iter, _iter, &mv_chan->chain, - chain_node) { + node) { in_use_descs++; - list_del(&iter->chain_node); + list_move_tail(&iter->node, &mv_chan->free_slots); } list_for_each_entry_safe(iter, _iter, &mv_chan->completed_slots, - completed_node) { + node) { in_use_descs++; - list_del(&iter->completed_node); + list_move_tail(&iter->node, &mv_chan->free_slots); + } + list_for_each_entry_safe(iter, _iter, &mv_chan->allocated_slots, + node) { + in_use_descs++; + list_move_tail(&iter->node, &mv_chan->free_slots); } list_for_each_entry_safe_reverse( - iter, _iter, &mv_chan->all_slots, slot_node) { - list_del(&iter->slot_node); + iter, _iter, &mv_chan->free_slots, node) { + list_del(&iter->node); kfree(iter); mv_chan->slots_allocated--; } - mv_chan->last_used = NULL; dev_dbg(mv_chan_to_devp(mv_chan), "%s slots_allocated %d\n", __func__, mv_chan->slots_allocated); @@ -603,13 +620,13 @@ static enum dma_status mv_xor_status(struct dma_chan *chan, return ret; spin_lock_bh(&mv_chan->lock); - mv_xor_slot_cleanup(mv_chan); + mv_chan_slot_cleanup(mv_chan); 
spin_unlock_bh(&mv_chan->lock); return dma_cookie_status(chan, cookie, txstate); } -static void mv_dump_xor_regs(struct mv_xor_chan *chan) +static void mv_chan_dump_regs(struct mv_xor_chan *chan) { u32 val; @@ -632,8 +649,8 @@ static void mv_dump_xor_regs(struct mv_xor_chan *chan) dev_err(mv_chan_to_devp(chan), "error addr 0x%08x\n", val); } -static void mv_xor_err_interrupt_handler(struct mv_xor_chan *chan, - u32 intr_cause) +static void mv_chan_err_interrupt_handler(struct mv_xor_chan *chan, + u32 intr_cause) { if (intr_cause & XOR_INT_ERR_DECODE) { dev_dbg(mv_chan_to_devp(chan), "ignoring address decode error\n"); @@ -643,7 +660,7 @@ static void mv_xor_err_interrupt_handler(struct mv_xor_chan *chan, dev_err(mv_chan_to_devp(chan), "error on chan %d. intr cause 0x%08x\n", chan->idx, intr_cause); - mv_dump_xor_regs(chan); + mv_chan_dump_regs(chan); WARN_ON(1); } @@ -655,11 +672,11 @@ static irqreturn_t mv_xor_interrupt_handler(int irq, void *data) dev_dbg(mv_chan_to_devp(chan), "intr cause %x\n", intr_cause); if (intr_cause & XOR_INTR_ERRORS) - mv_xor_err_interrupt_handler(chan, intr_cause); + mv_chan_err_interrupt_handler(chan, intr_cause); tasklet_schedule(&chan->irq_tasklet); - mv_xor_device_clear_eoc_cause(chan); + mv_chan_clear_eoc_cause(chan); return IRQ_HANDLED; } @@ -678,7 +695,7 @@ static void mv_xor_issue_pending(struct dma_chan *chan) * Perform a transaction to verify the HW works. 
*/ -static int mv_xor_memcpy_self_test(struct mv_xor_chan *mv_chan) +static int mv_chan_memcpy_self_test(struct mv_xor_chan *mv_chan) { int i, ret; void *src, *dest; @@ -787,7 +804,7 @@ out: #define MV_XOR_NUM_SRC_TEST 4 /* must be <= 15 */ static int -mv_xor_xor_self_test(struct mv_xor_chan *mv_chan) +mv_chan_xor_self_test(struct mv_xor_chan *mv_chan) { int i, src_idx, ret; struct page *dest; @@ -951,7 +968,7 @@ static int mv_xor_channel_remove(struct mv_xor_chan *mv_chan) static struct mv_xor_chan * mv_xor_channel_add(struct mv_xor_device *xordev, struct platform_device *pdev, - int idx, dma_cap_mask_t cap_mask, int irq) + int idx, dma_cap_mask_t cap_mask, int irq, int op_in_desc) { int ret = 0; struct mv_xor_chan *mv_chan; @@ -963,6 +980,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev, mv_chan->idx = idx; mv_chan->irq = irq; + mv_chan->op_in_desc = op_in_desc; dma_dev = &mv_chan->dmadev; @@ -1014,7 +1032,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev, mv_chan); /* clear errors before enabling interrupts */ - mv_xor_device_clear_err_status(mv_chan); + mv_chan_clear_err_status(mv_chan); ret = request_irq(mv_chan->irq, mv_xor_interrupt_handler, 0, dev_name(&pdev->dev), mv_chan); @@ -1023,32 +1041,37 @@ mv_xor_channel_add(struct mv_xor_device *xordev, mv_chan_unmask_interrupts(mv_chan); - mv_set_mode(mv_chan, DMA_XOR); + if (mv_chan->op_in_desc == XOR_MODE_IN_DESC) + mv_chan_set_mode_to_desc(mv_chan); + else + mv_chan_set_mode(mv_chan, DMA_XOR); spin_lock_init(&mv_chan->lock); INIT_LIST_HEAD(&mv_chan->chain); INIT_LIST_HEAD(&mv_chan->completed_slots); - INIT_LIST_HEAD(&mv_chan->all_slots); + INIT_LIST_HEAD(&mv_chan->free_slots); + INIT_LIST_HEAD(&mv_chan->allocated_slots); mv_chan->dmachan.device = dma_dev; dma_cookie_init(&mv_chan->dmachan); list_add_tail(&mv_chan->dmachan.device_node, &dma_dev->channels); if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { - ret = mv_xor_memcpy_self_test(mv_chan); + ret = mv_chan_memcpy_self_test(mv_chan); 
dev_dbg(&pdev->dev, "memcpy self test returned %d\n", ret); if (ret) goto err_free_irq; } if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { - ret = mv_xor_xor_self_test(mv_chan); + ret = mv_chan_xor_self_test(mv_chan); dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); if (ret) goto err_free_irq; } - dev_info(&pdev->dev, "Marvell XOR: ( %s%s%s)\n", + dev_info(&pdev->dev, "Marvell XOR (%s): ( %s%s%s)\n", + mv_chan->op_in_desc ? "Descriptor Mode" : "Registers Mode", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : ""); @@ -1097,6 +1120,13 @@ mv_xor_conf_mbus_windows(struct mv_xor_device *xordev, writel(0, base + WINDOW_OVERRIDE_CTRL(1)); } +static const struct of_device_id mv_xor_dt_ids[] = { + { .compatible = "marvell,orion-xor", .data = (void *)XOR_MODE_IN_REG }, + { .compatible = "marvell,armada-380-xor", .data = (void *)XOR_MODE_IN_DESC }, + {}, +}; +MODULE_DEVICE_TABLE(of, mv_xor_dt_ids); + static int mv_xor_probe(struct platform_device *pdev) { const struct mbus_dram_target_info *dram; @@ -1104,6 +1134,7 @@ static int mv_xor_probe(struct platform_device *pdev) struct mv_xor_platform_data *pdata = dev_get_platdata(&pdev->dev); struct resource *res; int i, ret; + int op_in_desc; dev_notice(&pdev->dev, "Marvell shared XOR driver\n"); @@ -1148,11 +1179,15 @@ static int mv_xor_probe(struct platform_device *pdev) if (pdev->dev.of_node) { struct device_node *np; int i = 0; + const struct of_device_id *of_id = + of_match_device(mv_xor_dt_ids, + &pdev->dev); for_each_child_of_node(pdev->dev.of_node, np) { struct mv_xor_chan *chan; dma_cap_mask_t cap_mask; int irq; + op_in_desc = (int)of_id->data; dma_cap_zero(cap_mask); if (of_property_read_bool(np, "dmacap,memcpy")) @@ -1169,7 +1204,7 @@ static int mv_xor_probe(struct platform_device *pdev) } chan = mv_xor_channel_add(xordev, pdev, i, - cap_mask, irq); + cap_mask, irq, op_in_desc); if 
(IS_ERR(chan)) { ret = PTR_ERR(chan); irq_dispose_mapping(irq); @@ -1198,7 +1233,8 @@ static int mv_xor_probe(struct platform_device *pdev) } chan = mv_xor_channel_add(xordev, pdev, i, - cd->cap_mask, irq); + cd->cap_mask, irq, + XOR_MODE_IN_REG); if (IS_ERR(chan)) { ret = PTR_ERR(chan); goto err_channel_add; @@ -1244,14 +1280,6 @@ static int mv_xor_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF -static const struct of_device_id mv_xor_dt_ids[] = { - { .compatible = "marvell,orion-xor", }, - {}, -}; -MODULE_DEVICE_TABLE(of, mv_xor_dt_ids); -#endif - static struct platform_driver mv_xor_driver = { .probe = mv_xor_probe, .remove = mv_xor_remove, diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h index 91958dba39a2..b7455b42137b 100644 --- a/drivers/dma/mv_xor.h +++ b/drivers/dma/mv_xor.h @@ -19,7 +19,7 @@ #include <linux/dmaengine.h> #include <linux/interrupt.h> -#define MV_XOR_POOL_SIZE PAGE_SIZE +#define MV_XOR_POOL_SIZE (MV_XOR_SLOT_SIZE * 3072) #define MV_XOR_SLOT_SIZE 64 #define MV_XOR_THRESHOLD 1 #define MV_XOR_MAX_CHANNELS 2 @@ -30,7 +30,13 @@ /* Values for the XOR_CONFIG register */ #define XOR_OPERATION_MODE_XOR 0 #define XOR_OPERATION_MODE_MEMCPY 2 +#define XOR_OPERATION_MODE_IN_DESC 7 #define XOR_DESCRIPTOR_SWAP BIT(14) +#define XOR_DESC_SUCCESS 0x40000000 + +#define XOR_DESC_OPERATION_XOR (0 << 24) +#define XOR_DESC_OPERATION_CRC32C (1 << 24) +#define XOR_DESC_OPERATION_MEMCPY (2 << 24) #define XOR_DESC_DMA_OWNED BIT(31) #define XOR_DESC_EOD_INT_EN BIT(31) @@ -88,13 +94,14 @@ struct mv_xor_device { * @mmr_base: memory mapped register base * @idx: the index of the xor channel * @chain: device chain view of the descriptors + * @free_slots: free slots usable by the channel + * @allocated_slots: slots allocated by the driver * @completed_slots: slots completed by HW but still need to be acked * @device: parent device * @common: common dmaengine channel object members - * @last_used: place holder for allocation to continue from where 
it left off - * @all_slots: complete domain of slots usable by the channel * @slots_allocated: records the actual size of the descriptor slot pool * @irq_tasklet: bottom half where mv_xor_slot_cleanup runs + * @op_in_desc: new mode of driver, each op is written to descriptor. */ struct mv_xor_chan { int pending; @@ -105,16 +112,17 @@ struct mv_xor_chan { int irq; enum dma_transaction_type current_type; struct list_head chain; + struct list_head free_slots; + struct list_head allocated_slots; struct list_head completed_slots; dma_addr_t dma_desc_pool; void *dma_desc_pool_virt; size_t pool_size; struct dma_device dmadev; struct dma_chan dmachan; - struct mv_xor_desc_slot *last_used; - struct list_head all_slots; int slots_allocated; struct tasklet_struct irq_tasklet; + int op_in_desc; char dummy_src[MV_XOR_MIN_BYTE_COUNT]; char dummy_dst[MV_XOR_MIN_BYTE_COUNT]; dma_addr_t dummy_src_addr, dummy_dst_addr; @@ -122,9 +130,7 @@ struct mv_xor_chan { /** * struct mv_xor_desc_slot - software descriptor - * @slot_node: node on the mv_xor_chan.all_slots list - * @chain_node: node on the mv_xor_chan.chain list - * @completed_node: node on the mv_xor_chan.completed_slots list + * @node: node on the mv_xor_chan lists * @hw_desc: virtual address of the hardware descriptor chain * @phys: hardware address of the hardware descriptor chain * @slot_used: slot in use or not @@ -133,12 +139,9 @@ struct mv_xor_chan { * @async_tx: support for the async_tx api */ struct mv_xor_desc_slot { - struct list_head slot_node; - struct list_head chain_node; - struct list_head completed_node; + struct list_head node; enum dma_transaction_type type; void *hw_desc; - u16 slot_used; u16 idx; struct dma_async_tx_descriptor async_tx; }; diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index 829ec686dac3..60de35251da5 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -170,7 +170,7 @@ static struct mxs_dma_type mxs_dma_types[] = { } }; -static struct platform_device_id mxs_dma_ids[] 
= { +static const struct platform_device_id mxs_dma_ids[] = { { .name = "imx23-dma-apbh", .driver_data = (kernel_ulong_t) &mxs_dma_types[0], diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c index 88b77c98365d..2b5a198ac77e 100644 --- a/drivers/dma/nbpfaxi.c +++ b/drivers/dma/nbpfaxi.c @@ -1455,7 +1455,7 @@ static int nbpf_remove(struct platform_device *pdev) return 0; } -static struct platform_device_id nbpf_ids[] = { +static const struct platform_device_id nbpf_ids[] = { {"nbpfaxi64dmac1b4", (kernel_ulong_t)&nbpf_cfg[NBPF1B4]}, {"nbpfaxi64dmac1b8", (kernel_ulong_t)&nbpf_cfg[NBPF1B8]}, {"nbpfaxi64dmac1b16", (kernel_ulong_t)&nbpf_cfg[NBPF1B16]}, diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index cbd4a8aff120..1e1f2986eba8 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -45,6 +45,50 @@ static struct of_dma *of_dma_find_controller(struct of_phandle_args *dma_spec) } /** + * of_dma_router_xlate - translation function for router devices + * @dma_spec: pointer to DMA specifier as found in the device tree + * @of_dma: pointer to DMA controller data (router information) + * + * The function creates new dma_spec to be passed to the router driver's + * of_dma_route_allocate() function to prepare a dma_spec which will be used + * to request channel from the real DMA controller. 
+ */ +static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct dma_chan *chan; + struct of_dma *ofdma_target; + struct of_phandle_args dma_spec_target; + void *route_data; + + /* translate the request for the real DMA controller */ + memcpy(&dma_spec_target, dma_spec, sizeof(dma_spec_target)); + route_data = ofdma->of_dma_route_allocate(&dma_spec_target, ofdma); + if (IS_ERR(route_data)) + return NULL; + + ofdma_target = of_dma_find_controller(&dma_spec_target); + if (!ofdma_target) + return NULL; + + chan = ofdma_target->of_dma_xlate(&dma_spec_target, ofdma_target); + if (chan) { + chan->router = ofdma->dma_router; + chan->route_data = route_data; + } else { + ofdma->dma_router->route_free(ofdma->dma_router->dev, + route_data); + } + + /* + * Need to put the node back since the ofdma->of_dma_route_allocate + * has taken it for generating the new, translated dma_spec + */ + of_node_put(dma_spec_target.np); + return chan; +} + +/** * of_dma_controller_register - Register a DMA controller to DT DMA helpers * @np: device node of DMA controller * @of_dma_xlate: translation function which converts a phandle @@ -110,6 +154,51 @@ void of_dma_controller_free(struct device_node *np) EXPORT_SYMBOL_GPL(of_dma_controller_free); /** + * of_dma_router_register - Register a DMA router to DT DMA helpers as a + * controller + * @np: device node of DMA router + * @of_dma_route_allocate: setup function for the router which needs to + * modify the dma_spec for the DMA controller to + * use and to set up the requested route. + * @dma_router: pointer to dma_router structure to be used when + * the route needs to be freed up. + * + * Returns 0 on success or appropriate errno value on error. + * + * Allocated memory should be freed with appropriate of_dma_controller_free() + * call. 
+ */ +int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + struct of_dma *ofdma; + + if (!np || !of_dma_route_allocate || !dma_router) { + pr_err("%s: not enough information provided\n", __func__); + return -EINVAL; + } + + ofdma = kzalloc(sizeof(*ofdma), GFP_KERNEL); + if (!ofdma) + return -ENOMEM; + + ofdma->of_node = np; + ofdma->of_dma_xlate = of_dma_router_xlate; + ofdma->of_dma_route_allocate = of_dma_route_allocate; + ofdma->dma_router = dma_router; + + /* Now queue of_dma controller structure in list */ + mutex_lock(&of_dma_lock); + list_add_tail(&ofdma->of_dma_controllers, &of_dma_list); + mutex_unlock(&of_dma_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(of_dma_router_register); + +/** * of_dma_match_channel - Check if a DMA specifier matches name * @np: device node to look for DMA channels * @name: channel name to be matched diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index 167dbaf65742..249445c8a4c6 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -22,6 +22,9 @@ #include "virt-dma.h" +#define OMAP_SDMA_REQUESTS 127 +#define OMAP_SDMA_CHANNELS 32 + struct omap_dmadev { struct dma_device ddev; spinlock_t lock; @@ -31,9 +34,10 @@ struct omap_dmadev { const struct omap_dma_reg *reg_map; struct omap_system_dma_plat_info *plat; bool legacy; + unsigned dma_requests; spinlock_t irq_lock; uint32_t irq_enable_mask; - struct omap_chan *lch_map[32]; + struct omap_chan *lch_map[OMAP_SDMA_CHANNELS]; }; struct omap_chan { @@ -362,7 +366,7 @@ static void omap_dma_start_sg(struct omap_chan *c, struct omap_desc *d, struct omap_sg *sg = d->sg + idx; unsigned cxsa, cxei, cxfi; - if (d->dir == DMA_DEV_TO_MEM) { + if (d->dir == DMA_DEV_TO_MEM || d->dir == DMA_MEM_TO_MEM) { cxsa = CDSA; cxei = CDEI; cxfi = CDFI; @@ -408,7 +412,7 @@ static void omap_dma_start_desc(struct omap_chan *c) if (dma_omap1()) omap_dma_chan_write(c, CCR2, 
d->ccr >> 16); - if (d->dir == DMA_DEV_TO_MEM) { + if (d->dir == DMA_DEV_TO_MEM || d->dir == DMA_MEM_TO_MEM) { cxsa = CSSA; cxei = CSEI; cxfi = CSFI; @@ -589,6 +593,7 @@ static void omap_dma_free_chan_resources(struct dma_chan *chan) omap_free_dma(c->dma_ch); dev_dbg(od->ddev.dev, "freeing channel for %u\n", c->dma_sig); + c->dma_sig = 0; } static size_t omap_dma_sg_size(struct omap_sg *sg) @@ -948,6 +953,51 @@ static struct dma_async_tx_descriptor *omap_dma_prep_dma_cyclic( return vchan_tx_prep(&c->vc, &d->vd, flags); } +static struct dma_async_tx_descriptor *omap_dma_prep_dma_memcpy( + struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + size_t len, unsigned long tx_flags) +{ + struct omap_chan *c = to_omap_dma_chan(chan); + struct omap_desc *d; + uint8_t data_type; + + d = kzalloc(sizeof(*d) + sizeof(d->sg[0]), GFP_ATOMIC); + if (!d) + return NULL; + + data_type = __ffs((src | dest | len)); + if (data_type > CSDP_DATA_TYPE_32) + data_type = CSDP_DATA_TYPE_32; + + d->dir = DMA_MEM_TO_MEM; + d->dev_addr = src; + d->fi = 0; + d->es = data_type; + d->sg[0].en = len / BIT(data_type); + d->sg[0].fn = 1; + d->sg[0].addr = dest; + d->sglen = 1; + d->ccr = c->ccr; + d->ccr |= CCR_DST_AMODE_POSTINC | CCR_SRC_AMODE_POSTINC; + + d->cicr = CICR_DROP_IE; + if (tx_flags & DMA_PREP_INTERRUPT) + d->cicr |= CICR_FRAME_IE; + + d->csdp = data_type; + + if (dma_omap1()) { + d->cicr |= CICR_TOUT_IE; + d->csdp |= CSDP_DST_PORT_EMIFF | CSDP_SRC_PORT_EMIFF; + } else { + d->csdp |= CSDP_DST_PACKED | CSDP_SRC_PACKED; + d->cicr |= CICR_MISALIGNED_ERR_IE | CICR_TRANS_ERR_IE; + d->csdp |= CSDP_DST_BURST_64 | CSDP_SRC_BURST_64; + } + + return vchan_tx_prep(&c->vc, &d->vd, tx_flags); +} + static int omap_dma_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg) { struct omap_chan *c = to_omap_dma_chan(chan); @@ -1037,7 +1087,7 @@ static int omap_dma_resume(struct dma_chan *chan) return 0; } -static int omap_dma_chan_init(struct omap_dmadev *od, int dma_sig) +static int 
omap_dma_chan_init(struct omap_dmadev *od) { struct omap_chan *c; @@ -1046,7 +1096,6 @@ static int omap_dma_chan_init(struct omap_dmadev *od, int dma_sig) return -ENOMEM; c->reg_map = od->reg_map; - c->dma_sig = dma_sig; c->vc.desc_free = omap_dma_desc_free; vchan_init(&c->vc, &od->ddev); INIT_LIST_HEAD(&c->node); @@ -1094,12 +1143,14 @@ static int omap_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_SLAVE, od->ddev.cap_mask); dma_cap_set(DMA_CYCLIC, od->ddev.cap_mask); + dma_cap_set(DMA_MEMCPY, od->ddev.cap_mask); od->ddev.device_alloc_chan_resources = omap_dma_alloc_chan_resources; od->ddev.device_free_chan_resources = omap_dma_free_chan_resources; od->ddev.device_tx_status = omap_dma_tx_status; od->ddev.device_issue_pending = omap_dma_issue_pending; od->ddev.device_prep_slave_sg = omap_dma_prep_slave_sg; od->ddev.device_prep_dma_cyclic = omap_dma_prep_dma_cyclic; + od->ddev.device_prep_dma_memcpy = omap_dma_prep_dma_memcpy; od->ddev.device_config = omap_dma_slave_config; od->ddev.device_pause = omap_dma_pause; od->ddev.device_resume = omap_dma_resume; @@ -1116,8 +1167,17 @@ static int omap_dma_probe(struct platform_device *pdev) tasklet_init(&od->task, omap_dma_sched, (unsigned long)od); - for (i = 0; i < 127; i++) { - rc = omap_dma_chan_init(od, i); + od->dma_requests = OMAP_SDMA_REQUESTS; + if (pdev->dev.of_node && of_property_read_u32(pdev->dev.of_node, + "dma-requests", + &od->dma_requests)) { + dev_info(&pdev->dev, + "Missing dma-requests property, using %u.\n", + OMAP_SDMA_REQUESTS); + } + + for (i = 0; i < OMAP_SDMA_CHANNELS; i++) { + rc = omap_dma_chan_init(od); if (rc) { omap_dma_free(od); return rc; @@ -1208,10 +1268,14 @@ static struct platform_driver omap_dma_driver = { bool omap_dma_filter_fn(struct dma_chan *chan, void *param) { if (chan->device->dev->driver == &omap_dma_driver.driver) { + struct omap_dmadev *od = to_omap_dma_dev(chan->device); struct omap_chan *c = to_omap_dma_chan(chan); unsigned req = *(unsigned *)param; - return req == 
c->dma_sig; + if (req <= od->dma_requests) { + c->dma_sig = req; + return true; + } } return false; } diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 340f9e607cd8..f513f77b1d85 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -1424,8 +1424,8 @@ static int pl330_submit_req(struct pl330_thread *thrd, goto xfer_exit; if (ret > pl330->mcbufsz / 2) { - dev_info(pl330->ddma.dev, "%s:%d Trying increasing mcbufsz\n", - __func__, __LINE__); + dev_info(pl330->ddma.dev, "%s:%d Try increasing mcbufsz (%i/%i)\n", + __func__, __LINE__, ret, pl330->mcbufsz / 2); ret = -ENOMEM; goto xfer_exit; } @@ -2584,12 +2584,14 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst, { struct dma_pl330_desc *desc; struct dma_pl330_chan *pch = to_pchan(chan); - struct pl330_dmac *pl330 = pch->dmac; + struct pl330_dmac *pl330; int burst; if (unlikely(!pch || !len)) return NULL; + pl330 = pch->dmac; + desc = __pl330_prep_dma_memcpy(pch, dst, src, len); if (!desc) return NULL; diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c new file mode 100644 index 000000000000..ddcbbf5cd9e9 --- /dev/null +++ b/drivers/dma/pxa_dma.c @@ -0,0 +1,1467 @@ +/* + * Copyright 2015 Robert Jarzmik <robert.jarzmik@free.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/err.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/interrupt.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/dmaengine.h> +#include <linux/platform_device.h> +#include <linux/device.h> +#include <linux/platform_data/mmp_dma.h> +#include <linux/dmapool.h> +#include <linux/of_device.h> +#include <linux/of_dma.h> +#include <linux/of.h> +#include <linux/dma/pxa-dma.h> + +#include "dmaengine.h" +#include "virt-dma.h" + +#define DCSR(n) (0x0000 + ((n) << 2)) +#define DALGN(n) 0x00a0 +#define DINT 0x00f0 +#define DDADR(n) (0x0200 + ((n) << 4)) +#define DSADR(n) (0x0204 + ((n) << 4)) +#define DTADR(n) (0x0208 + ((n) << 4)) +#define DCMD(n) (0x020c + ((n) << 4)) + +#define PXA_DCSR_RUN BIT(31) /* Run Bit (read / write) */ +#define PXA_DCSR_NODESC BIT(30) /* No-Descriptor Fetch (read / write) */ +#define PXA_DCSR_STOPIRQEN BIT(29) /* Stop Interrupt Enable (R/W) */ +#define PXA_DCSR_REQPEND BIT(8) /* Request Pending (read-only) */ +#define PXA_DCSR_STOPSTATE BIT(3) /* Stop State (read-only) */ +#define PXA_DCSR_ENDINTR BIT(2) /* End Interrupt (read / write) */ +#define PXA_DCSR_STARTINTR BIT(1) /* Start Interrupt (read / write) */ +#define PXA_DCSR_BUSERR BIT(0) /* Bus Error Interrupt (read / write) */ + +#define PXA_DCSR_EORIRQEN BIT(28) /* End of Receive IRQ Enable (R/W) */ +#define PXA_DCSR_EORJMPEN BIT(27) /* Jump to next descriptor on EOR */ +#define PXA_DCSR_EORSTOPEN BIT(26) /* STOP on an EOR */ +#define PXA_DCSR_SETCMPST BIT(25) /* Set Descriptor Compare Status */ +#define PXA_DCSR_CLRCMPST BIT(24) /* Clear Descriptor Compare Status */ +#define PXA_DCSR_CMPST BIT(10) /* The Descriptor Compare Status */ +#define PXA_DCSR_EORINTR BIT(9) /* The end of Receive */ + +#define DRCMR_MAPVLD BIT(7) /* Map Valid (read / write) */ +#define DRCMR_CHLNUM 0x1f /* mask for Channel Number (read / write) */ + +#define DDADR_DESCADDR 0xfffffff0 /* Address of next descriptor (mask) */ 
+#define DDADR_STOP BIT(0) /* Stop (read / write) */ + +#define PXA_DCMD_INCSRCADDR BIT(31) /* Source Address Increment Setting. */ +#define PXA_DCMD_INCTRGADDR BIT(30) /* Target Address Increment Setting. */ +#define PXA_DCMD_FLOWSRC BIT(29) /* Flow Control by the source. */ +#define PXA_DCMD_FLOWTRG BIT(28) /* Flow Control by the target. */ +#define PXA_DCMD_STARTIRQEN BIT(22) /* Start Interrupt Enable */ +#define PXA_DCMD_ENDIRQEN BIT(21) /* End Interrupt Enable */ +#define PXA_DCMD_ENDIAN BIT(18) /* Device Endian-ness. */ +#define PXA_DCMD_BURST8 (1 << 16) /* 8 byte burst */ +#define PXA_DCMD_BURST16 (2 << 16) /* 16 byte burst */ +#define PXA_DCMD_BURST32 (3 << 16) /* 32 byte burst */ +#define PXA_DCMD_WIDTH1 (1 << 14) /* 1 byte width */ +#define PXA_DCMD_WIDTH2 (2 << 14) /* 2 byte width (HalfWord) */ +#define PXA_DCMD_WIDTH4 (3 << 14) /* 4 byte width (Word) */ +#define PXA_DCMD_LENGTH 0x01fff /* length mask (max = 8K - 1) */ + +#define PDMA_ALIGNMENT 3 +#define PDMA_MAX_DESC_BYTES (PXA_DCMD_LENGTH & ~((1 << PDMA_ALIGNMENT) - 1)) + +struct pxad_desc_hw { + u32 ddadr; /* Points to the next descriptor + flags */ + u32 dsadr; /* DSADR value for the current transfer */ + u32 dtadr; /* DTADR value for the current transfer */ + u32 dcmd; /* DCMD value for the current transfer */ +} __aligned(16); + +struct pxad_desc_sw { + struct virt_dma_desc vd; /* Virtual descriptor */ + int nb_desc; /* Number of hw. 
descriptors */ + size_t len; /* Number of bytes xfered */ + dma_addr_t first; /* First descriptor's addr */ + + /* At least one descriptor has an src/dst address not multiple of 8 */ + bool misaligned; + bool cyclic; + struct dma_pool *desc_pool; /* Channel's used allocator */ + + struct pxad_desc_hw *hw_desc[]; /* DMA coherent descriptors */ +}; + +struct pxad_phy { + int idx; + void __iomem *base; + struct pxad_chan *vchan; +}; + +struct pxad_chan { + struct virt_dma_chan vc; /* Virtual channel */ + u32 drcmr; /* Requestor of the channel */ + enum pxad_chan_prio prio; /* Required priority of phy */ + /* + * At least one desc_sw in submitted or issued transfers on this channel + * has one address such as: addr % 8 != 0. This implies the DALGN + * setting on the phy. + */ + bool misaligned; + struct dma_slave_config cfg; /* Runtime config */ + + /* protected by vc->lock */ + struct pxad_phy *phy; + struct dma_pool *desc_pool; /* Descriptors pool */ +}; + +struct pxad_device { + struct dma_device slave; + int nr_chans; + void __iomem *base; + struct pxad_phy *phys; + spinlock_t phy_lock; /* Phy association */ +#ifdef CONFIG_DEBUG_FS + struct dentry *dbgfs_root; + struct dentry *dbgfs_state; + struct dentry **dbgfs_chan; +#endif +}; + +#define tx_to_pxad_desc(tx) \ + container_of(tx, struct pxad_desc_sw, async_tx) +#define to_pxad_chan(dchan) \ + container_of(dchan, struct pxad_chan, vc.chan) +#define to_pxad_dev(dmadev) \ + container_of(dmadev, struct pxad_device, slave) +#define to_pxad_sw_desc(_vd) \ + container_of((_vd), struct pxad_desc_sw, vd) + +#define _phy_readl_relaxed(phy, _reg) \ + readl_relaxed((phy)->base + _reg((phy)->idx)) +#define phy_readl_relaxed(phy, _reg) \ + ({ \ + u32 _v; \ + _v = readl_relaxed((phy)->base + _reg((phy)->idx)); \ + dev_vdbg(&phy->vchan->vc.chan.dev->device, \ + "%s(): readl(%s): 0x%08x\n", __func__, #_reg, \ + _v); \ + _v; \ + }) +#define phy_writel(phy, val, _reg) \ + do { \ + writel((val), (phy)->base + _reg((phy)->idx)); \ + 
dev_vdbg(&phy->vchan->vc.chan.dev->device, \ + "%s(): writel(0x%08x, %s)\n", \ + __func__, (u32)(val), #_reg); \ + } while (0) +#define phy_writel_relaxed(phy, val, _reg) \ + do { \ + writel_relaxed((val), (phy)->base + _reg((phy)->idx)); \ + dev_vdbg(&phy->vchan->vc.chan.dev->device, \ + "%s(): writel_relaxed(0x%08x, %s)\n", \ + __func__, (u32)(val), #_reg); \ + } while (0) + +static unsigned int pxad_drcmr(unsigned int line) +{ + if (line < 64) + return 0x100 + line * 4; + return 0x1000 + line * 4; +} + +/* + * Debug fs + */ +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/seq_file.h> + +static int dbg_show_requester_chan(struct seq_file *s, void *p) +{ + int pos = 0; + struct pxad_phy *phy = s->private; + int i; + u32 drcmr; + + pos += seq_printf(s, "DMA channel %d requester :\n", phy->idx); + for (i = 0; i < 70; i++) { + drcmr = readl_relaxed(phy->base + pxad_drcmr(i)); + if ((drcmr & DRCMR_CHLNUM) == phy->idx) + pos += seq_printf(s, "\tRequester %d (MAPVLD=%d)\n", i, + !!(drcmr & DRCMR_MAPVLD)); + } + return pos; +} + +static inline int dbg_burst_from_dcmd(u32 dcmd) +{ + int burst = (dcmd >> 16) & 0x3; + + return burst ? 4 << burst : 0; +} + +static int is_phys_valid(unsigned long addr) +{ + return pfn_valid(__phys_to_pfn(addr)); +} + +#define PXA_DCSR_STR(flag) (dcsr & PXA_DCSR_##flag ? #flag" " : "") +#define PXA_DCMD_STR(flag) (dcmd & PXA_DCMD_##flag ? 
#flag" " : "") + +static int dbg_show_descriptors(struct seq_file *s, void *p) +{ + struct pxad_phy *phy = s->private; + int i, max_show = 20, burst, width; + u32 dcmd; + unsigned long phys_desc, ddadr; + struct pxad_desc_hw *desc; + + phys_desc = ddadr = _phy_readl_relaxed(phy, DDADR); + + seq_printf(s, "DMA channel %d descriptors :\n", phy->idx); + seq_printf(s, "[%03d] First descriptor unknown\n", 0); + for (i = 1; i < max_show && is_phys_valid(phys_desc); i++) { + desc = phys_to_virt(phys_desc); + dcmd = desc->dcmd; + burst = dbg_burst_from_dcmd(dcmd); + width = (1 << ((dcmd >> 14) & 0x3)) >> 1; + + seq_printf(s, "[%03d] Desc at %08lx(virt %p)\n", + i, phys_desc, desc); + seq_printf(s, "\tDDADR = %08x\n", desc->ddadr); + seq_printf(s, "\tDSADR = %08x\n", desc->dsadr); + seq_printf(s, "\tDTADR = %08x\n", desc->dtadr); + seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n", + dcmd, + PXA_DCMD_STR(INCSRCADDR), PXA_DCMD_STR(INCTRGADDR), + PXA_DCMD_STR(FLOWSRC), PXA_DCMD_STR(FLOWTRG), + PXA_DCMD_STR(STARTIRQEN), PXA_DCMD_STR(ENDIRQEN), + PXA_DCMD_STR(ENDIAN), burst, width, + dcmd & PXA_DCMD_LENGTH); + phys_desc = desc->ddadr; + } + if (i == max_show) + seq_printf(s, "[%03d] Desc at %08lx ... max display reached\n", + i, phys_desc); + else + seq_printf(s, "[%03d] Desc at %08lx is %s\n", + i, phys_desc, phys_desc == DDADR_STOP ? 
+ "DDADR_STOP" : "invalid"); + + return 0; +} + +static int dbg_show_chan_state(struct seq_file *s, void *p) +{ + struct pxad_phy *phy = s->private; + u32 dcsr, dcmd; + int burst, width; + static const char * const str_prio[] = { + "high", "normal", "low", "invalid" + }; + + dcsr = _phy_readl_relaxed(phy, DCSR); + dcmd = _phy_readl_relaxed(phy, DCMD); + burst = dbg_burst_from_dcmd(dcmd); + width = (1 << ((dcmd >> 14) & 0x3)) >> 1; + + seq_printf(s, "DMA channel %d\n", phy->idx); + seq_printf(s, "\tPriority : %s\n", + str_prio[(phy->idx & 0xf) / 4]); + seq_printf(s, "\tUnaligned transfer bit: %s\n", + _phy_readl_relaxed(phy, DALGN) & BIT(phy->idx) ? + "yes" : "no"); + seq_printf(s, "\tDCSR = %08x (%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", + dcsr, PXA_DCSR_STR(RUN), PXA_DCSR_STR(NODESC), + PXA_DCSR_STR(STOPIRQEN), PXA_DCSR_STR(EORIRQEN), + PXA_DCSR_STR(EORJMPEN), PXA_DCSR_STR(EORSTOPEN), + PXA_DCSR_STR(SETCMPST), PXA_DCSR_STR(CLRCMPST), + PXA_DCSR_STR(CMPST), PXA_DCSR_STR(EORINTR), + PXA_DCSR_STR(REQPEND), PXA_DCSR_STR(STOPSTATE), + PXA_DCSR_STR(ENDINTR), PXA_DCSR_STR(STARTINTR), + PXA_DCSR_STR(BUSERR)); + + seq_printf(s, "\tDCMD = %08x (%s%s%s%s%s%s%sburst=%d width=%d len=%d)\n", + dcmd, + PXA_DCMD_STR(INCSRCADDR), PXA_DCMD_STR(INCTRGADDR), + PXA_DCMD_STR(FLOWSRC), PXA_DCMD_STR(FLOWTRG), + PXA_DCMD_STR(STARTIRQEN), PXA_DCMD_STR(ENDIRQEN), + PXA_DCMD_STR(ENDIAN), burst, width, dcmd & PXA_DCMD_LENGTH); + seq_printf(s, "\tDSADR = %08x\n", _phy_readl_relaxed(phy, DSADR)); + seq_printf(s, "\tDTADR = %08x\n", _phy_readl_relaxed(phy, DTADR)); + seq_printf(s, "\tDDADR = %08x\n", _phy_readl_relaxed(phy, DDADR)); + + return 0; +} + +static int dbg_show_state(struct seq_file *s, void *p) +{ + struct pxad_device *pdev = s->private; + + /* basic device status */ + seq_puts(s, "DMA engine status\n"); + seq_printf(s, "\tChannel number: %d\n", pdev->nr_chans); + + return 0; +} + +#define DBGFS_FUNC_DECL(name) \ +static int dbg_open_##name(struct inode *inode, struct file *file) \ +{ \ + 
return single_open(file, dbg_show_##name, inode->i_private); \ +} \ +static const struct file_operations dbg_fops_##name = { \ + .owner = THIS_MODULE, \ + .open = dbg_open_##name, \ + .llseek = seq_lseek, \ + .read = seq_read, \ + .release = single_release, \ +} + +DBGFS_FUNC_DECL(state); +DBGFS_FUNC_DECL(chan_state); +DBGFS_FUNC_DECL(descriptors); +DBGFS_FUNC_DECL(requester_chan); + +static struct dentry *pxad_dbg_alloc_chan(struct pxad_device *pdev, + int ch, struct dentry *chandir) +{ + char chan_name[11]; + struct dentry *chan, *chan_state = NULL, *chan_descr = NULL; + struct dentry *chan_reqs = NULL; + void *dt; + + scnprintf(chan_name, sizeof(chan_name), "%d", ch); + chan = debugfs_create_dir(chan_name, chandir); + dt = (void *)&pdev->phys[ch]; + + if (chan) + chan_state = debugfs_create_file("state", 0400, chan, dt, + &dbg_fops_chan_state); + if (chan_state) + chan_descr = debugfs_create_file("descriptors", 0400, chan, dt, + &dbg_fops_descriptors); + if (chan_descr) + chan_reqs = debugfs_create_file("requesters", 0400, chan, dt, + &dbg_fops_requester_chan); + if (!chan_reqs) + goto err_state; + + return chan; + +err_state: + debugfs_remove_recursive(chan); + return NULL; +} + +static void pxad_init_debugfs(struct pxad_device *pdev) +{ + int i; + struct dentry *chandir; + + pdev->dbgfs_root = debugfs_create_dir(dev_name(pdev->slave.dev), NULL); + if (IS_ERR(pdev->dbgfs_root) || !pdev->dbgfs_root) + goto err_root; + + pdev->dbgfs_state = debugfs_create_file("state", 0400, pdev->dbgfs_root, + pdev, &dbg_fops_state); + if (!pdev->dbgfs_state) + goto err_state; + + pdev->dbgfs_chan = + kmalloc_array(pdev->nr_chans, sizeof(*pdev->dbgfs_state), + GFP_KERNEL); + if (!pdev->dbgfs_chan) + goto err_alloc; + + chandir = debugfs_create_dir("channels", pdev->dbgfs_root); + if (!chandir) + goto err_chandir; + + for (i = 0; i < pdev->nr_chans; i++) { + pdev->dbgfs_chan[i] = pxad_dbg_alloc_chan(pdev, i, chandir); + if (!pdev->dbgfs_chan[i]) + goto err_chans; + } + + return; 
+err_chans: +err_chandir: + kfree(pdev->dbgfs_chan); +err_alloc: +err_state: + debugfs_remove_recursive(pdev->dbgfs_root); +err_root: + pr_err("pxad: debugfs is not available\n"); +} + +static void pxad_cleanup_debugfs(struct pxad_device *pdev) +{ + debugfs_remove_recursive(pdev->dbgfs_root); +} +#else +static inline void pxad_init_debugfs(struct pxad_device *pdev) {} +static inline void pxad_cleanup_debugfs(struct pxad_device *pdev) {} +#endif + +/* + * In the transition phase where legacy pxa handling is done at the same time as + * mmp_dma, the DMA physical channel split between the 2 DMA providers is done + * through legacy_reserved. Legacy code reserves DMA channels by settings + * corresponding bits in legacy_reserved. + */ +static u32 legacy_reserved; +static u32 legacy_unavailable; + +static struct pxad_phy *lookup_phy(struct pxad_chan *pchan) +{ + int prio, i; + struct pxad_device *pdev = to_pxad_dev(pchan->vc.chan.device); + struct pxad_phy *phy, *found = NULL; + unsigned long flags; + + /* + * dma channel priorities + * ch 0 - 3, 16 - 19 <--> (0) + * ch 4 - 7, 20 - 23 <--> (1) + * ch 8 - 11, 24 - 27 <--> (2) + * ch 12 - 15, 28 - 31 <--> (3) + */ + + spin_lock_irqsave(&pdev->phy_lock, flags); + for (prio = pchan->prio; prio >= PXAD_PRIO_HIGHEST; prio--) { + for (i = 0; i < pdev->nr_chans; i++) { + if (prio != (i & 0xf) >> 2) + continue; + if ((i < 32) && (legacy_reserved & BIT(i))) + continue; + phy = &pdev->phys[i]; + if (!phy->vchan) { + phy->vchan = pchan; + found = phy; + if (i < 32) + legacy_unavailable |= BIT(i); + goto out_unlock; + } + } + } + +out_unlock: + spin_unlock_irqrestore(&pdev->phy_lock, flags); + dev_dbg(&pchan->vc.chan.dev->device, + "%s(): phy=%p(%d)\n", __func__, found, + found ? 
found->idx : -1); + + return found; +} + +static void pxad_free_phy(struct pxad_chan *chan) +{ + struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device); + unsigned long flags; + u32 reg; + int i; + + dev_dbg(&chan->vc.chan.dev->device, + "%s(): freeing\n", __func__); + if (!chan->phy) + return; + + /* clear the channel mapping in DRCMR */ + reg = pxad_drcmr(chan->drcmr); + writel_relaxed(0, chan->phy->base + reg); + + spin_lock_irqsave(&pdev->phy_lock, flags); + for (i = 0; i < 32; i++) + if (chan->phy == &pdev->phys[i]) + legacy_unavailable &= ~BIT(i); + chan->phy->vchan = NULL; + chan->phy = NULL; + spin_unlock_irqrestore(&pdev->phy_lock, flags); +} + +static bool is_chan_running(struct pxad_chan *chan) +{ + u32 dcsr; + struct pxad_phy *phy = chan->phy; + + if (!phy) + return false; + dcsr = phy_readl_relaxed(phy, DCSR); + return dcsr & PXA_DCSR_RUN; +} + +static bool is_running_chan_misaligned(struct pxad_chan *chan) +{ + u32 dalgn; + + BUG_ON(!chan->phy); + dalgn = phy_readl_relaxed(chan->phy, DALGN); + return dalgn & (BIT(chan->phy->idx)); +} + +static void phy_enable(struct pxad_phy *phy, bool misaligned) +{ + u32 reg, dalgn; + + if (!phy->vchan) + return; + + dev_dbg(&phy->vchan->vc.chan.dev->device, + "%s(); phy=%p(%d) misaligned=%d\n", __func__, + phy, phy->idx, misaligned); + + reg = pxad_drcmr(phy->vchan->drcmr); + writel_relaxed(DRCMR_MAPVLD | phy->idx, phy->base + reg); + + dalgn = phy_readl_relaxed(phy, DALGN); + if (misaligned) + dalgn |= BIT(phy->idx); + else + dalgn &= ~BIT(phy->idx); + phy_writel_relaxed(phy, dalgn, DALGN); + + phy_writel(phy, PXA_DCSR_STOPIRQEN | PXA_DCSR_ENDINTR | + PXA_DCSR_BUSERR | PXA_DCSR_RUN, DCSR); +} + +static void phy_disable(struct pxad_phy *phy) +{ + u32 dcsr; + + if (!phy) + return; + + dcsr = phy_readl_relaxed(phy, DCSR); + dev_dbg(&phy->vchan->vc.chan.dev->device, + "%s(): phy=%p(%d)\n", __func__, phy, phy->idx); + phy_writel(phy, dcsr & ~PXA_DCSR_RUN & ~PXA_DCSR_STOPIRQEN, DCSR); +} + +static void 
pxad_launch_chan(struct pxad_chan *chan, + struct pxad_desc_sw *desc) +{ + dev_dbg(&chan->vc.chan.dev->device, + "%s(): desc=%p\n", __func__, desc); + if (!chan->phy) { + chan->phy = lookup_phy(chan); + if (!chan->phy) { + dev_dbg(&chan->vc.chan.dev->device, + "%s(): no free dma channel\n", __func__); + return; + } + } + + /* + * Program the descriptor's address into the DMA controller, + * then start the DMA transaction + */ + phy_writel(chan->phy, desc->first, DDADR); + phy_enable(chan->phy, chan->misaligned); +} + +static void set_updater_desc(struct pxad_desc_sw *sw_desc, + unsigned long flags) +{ + struct pxad_desc_hw *updater = + sw_desc->hw_desc[sw_desc->nb_desc - 1]; + dma_addr_t dma = sw_desc->hw_desc[sw_desc->nb_desc - 2]->ddadr; + + updater->ddadr = DDADR_STOP; + updater->dsadr = dma; + updater->dtadr = dma + 8; + updater->dcmd = PXA_DCMD_WIDTH4 | PXA_DCMD_BURST32 | + (PXA_DCMD_LENGTH & sizeof(u32)); + if (flags & DMA_PREP_INTERRUPT) + updater->dcmd |= PXA_DCMD_ENDIRQEN; +} + +static bool is_desc_completed(struct virt_dma_desc *vd) +{ + struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd); + struct pxad_desc_hw *updater = + sw_desc->hw_desc[sw_desc->nb_desc - 1]; + + return updater->dtadr != (updater->dsadr + 8); +} + +static void pxad_desc_chain(struct virt_dma_desc *vd1, + struct virt_dma_desc *vd2) +{ + struct pxad_desc_sw *desc1 = to_pxad_sw_desc(vd1); + struct pxad_desc_sw *desc2 = to_pxad_sw_desc(vd2); + dma_addr_t dma_to_chain; + + dma_to_chain = desc2->first; + desc1->hw_desc[desc1->nb_desc - 1]->ddadr = dma_to_chain; +} + +static bool pxad_try_hotchain(struct virt_dma_chan *vc, + struct virt_dma_desc *vd) +{ + struct virt_dma_desc *vd_last_issued = NULL; + struct pxad_chan *chan = to_pxad_chan(&vc->chan); + + /* + * Attempt to hot chain the tx if the phy is still running. This is + * considered successful only if either the channel is still running + * after the chaining, or if the chained transfer is completed after + * having been hot chained. 
+ * A change of alignment is not allowed, and forbids hotchaining. + */ + if (is_chan_running(chan)) { + BUG_ON(list_empty(&vc->desc_issued)); + + if (!is_running_chan_misaligned(chan) && + to_pxad_sw_desc(vd)->misaligned) + return false; + + vd_last_issued = list_entry(vc->desc_issued.prev, + struct virt_dma_desc, node); + pxad_desc_chain(vd_last_issued, vd); + if (is_chan_running(chan) || is_desc_completed(vd_last_issued)) + return true; + } + + return false; +} + +static unsigned int clear_chan_irq(struct pxad_phy *phy) +{ + u32 dcsr; + u32 dint = readl(phy->base + DINT); + + if (!(dint & BIT(phy->idx))) + return PXA_DCSR_RUN; + + /* clear irq */ + dcsr = phy_readl_relaxed(phy, DCSR); + phy_writel(phy, dcsr, DCSR); + if ((dcsr & PXA_DCSR_BUSERR) && (phy->vchan)) + dev_warn(&phy->vchan->vc.chan.dev->device, + "%s(chan=%p): PXA_DCSR_BUSERR\n", + __func__, &phy->vchan); + + return dcsr & ~PXA_DCSR_RUN; +} + +static irqreturn_t pxad_chan_handler(int irq, void *dev_id) +{ + struct pxad_phy *phy = dev_id; + struct pxad_chan *chan = phy->vchan; + struct virt_dma_desc *vd, *tmp; + unsigned int dcsr; + unsigned long flags; + + BUG_ON(!chan); + + dcsr = clear_chan_irq(phy); + if (dcsr & PXA_DCSR_RUN) + return IRQ_NONE; + + spin_lock_irqsave(&chan->vc.lock, flags); + list_for_each_entry_safe(vd, tmp, &chan->vc.desc_issued, node) { + dev_dbg(&chan->vc.chan.dev->device, + "%s(): checking txd %p[%x]: completed=%d\n", + __func__, vd, vd->tx.cookie, is_desc_completed(vd)); + if (is_desc_completed(vd)) { + list_del(&vd->node); + vchan_cookie_complete(vd); + } else { + break; + } + } + + if (dcsr & PXA_DCSR_STOPSTATE) { + dev_dbg(&chan->vc.chan.dev->device, + "%s(): channel stopped, submitted_empty=%d issued_empty=%d", + __func__, + list_empty(&chan->vc.desc_submitted), + list_empty(&chan->vc.desc_issued)); + phy_writel_relaxed(phy, dcsr & ~PXA_DCSR_STOPIRQEN, DCSR); + + if (list_empty(&chan->vc.desc_issued)) { + chan->misaligned = + !list_empty(&chan->vc.desc_submitted); + } 
else { + vd = list_first_entry(&chan->vc.desc_issued, + struct virt_dma_desc, node); + pxad_launch_chan(chan, to_pxad_sw_desc(vd)); + } + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + + return IRQ_HANDLED; +} + +static irqreturn_t pxad_int_handler(int irq, void *dev_id) +{ + struct pxad_device *pdev = dev_id; + struct pxad_phy *phy; + u32 dint = readl(pdev->base + DINT); + int i, ret = IRQ_NONE; + + while (dint) { + i = __ffs(dint); + dint &= (dint - 1); + phy = &pdev->phys[i]; + if ((i < 32) && (legacy_reserved & BIT(i))) + continue; + if (pxad_chan_handler(irq, phy) == IRQ_HANDLED) + ret = IRQ_HANDLED; + } + + return ret; +} + +static int pxad_alloc_chan_resources(struct dma_chan *dchan) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device); + + if (chan->desc_pool) + return 1; + + chan->desc_pool = dma_pool_create(dma_chan_name(dchan), + pdev->slave.dev, + sizeof(struct pxad_desc_hw), + __alignof__(struct pxad_desc_hw), + 0); + if (!chan->desc_pool) { + dev_err(&chan->vc.chan.dev->device, + "%s(): unable to allocate descriptor pool\n", + __func__); + return -ENOMEM; + } + + return 1; +} + +static void pxad_free_chan_resources(struct dma_chan *dchan) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + + vchan_free_chan_resources(&chan->vc); + dma_pool_destroy(chan->desc_pool); + chan->desc_pool = NULL; + +} + +static void pxad_free_desc(struct virt_dma_desc *vd) +{ + int i; + dma_addr_t dma; + struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd); + + BUG_ON(sw_desc->nb_desc == 0); + for (i = sw_desc->nb_desc - 1; i >= 0; i--) { + if (i > 0) + dma = sw_desc->hw_desc[i - 1]->ddadr; + else + dma = sw_desc->first; + dma_pool_free(sw_desc->desc_pool, + sw_desc->hw_desc[i], dma); + } + sw_desc->nb_desc = 0; + kfree(sw_desc); +} + +static struct pxad_desc_sw * +pxad_alloc_desc(struct pxad_chan *chan, unsigned int nb_hw_desc) +{ + struct pxad_desc_sw *sw_desc; + dma_addr_t dma; + int i; + + sw_desc = 
kzalloc(sizeof(*sw_desc) + + nb_hw_desc * sizeof(struct pxad_desc_hw *), + GFP_NOWAIT); + if (!sw_desc) + return NULL; + sw_desc->desc_pool = chan->desc_pool; + + for (i = 0; i < nb_hw_desc; i++) { + sw_desc->hw_desc[i] = dma_pool_alloc(sw_desc->desc_pool, + GFP_NOWAIT, &dma); + if (!sw_desc->hw_desc[i]) { + dev_err(&chan->vc.chan.dev->device, + "%s(): Couldn't allocate the %dth hw_desc from dma_pool %p\n", + __func__, i, sw_desc->desc_pool); + goto err; + } + + if (i == 0) + sw_desc->first = dma; + else + sw_desc->hw_desc[i - 1]->ddadr = dma; + sw_desc->nb_desc++; + } + + return sw_desc; +err: + pxad_free_desc(&sw_desc->vd); + return NULL; +} + +static dma_cookie_t pxad_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct virt_dma_chan *vc = to_virt_chan(tx->chan); + struct pxad_chan *chan = to_pxad_chan(&vc->chan); + struct virt_dma_desc *vd_chained = NULL, + *vd = container_of(tx, struct virt_dma_desc, tx); + dma_cookie_t cookie; + unsigned long flags; + + set_updater_desc(to_pxad_sw_desc(vd), tx->flags); + + spin_lock_irqsave(&vc->lock, flags); + cookie = dma_cookie_assign(tx); + + if (list_empty(&vc->desc_submitted) && pxad_try_hotchain(vc, vd)) { + list_move_tail(&vd->node, &vc->desc_issued); + dev_dbg(&chan->vc.chan.dev->device, + "%s(): txd %p[%x]: submitted (hot linked)\n", + __func__, vd, cookie); + goto out; + } + + /* + * Fallback to placing the tx in the submitted queue + */ + if (!list_empty(&vc->desc_submitted)) { + vd_chained = list_entry(vc->desc_submitted.prev, + struct virt_dma_desc, node); + /* + * Only chain the descriptors if no new misalignment is + * introduced. If a new misalignment is chained, let the channel + * stop, and be relaunched in misalign mode from the irq + * handler. + */ + if (chan->misaligned || !to_pxad_sw_desc(vd)->misaligned) + pxad_desc_chain(vd_chained, vd); + else + vd_chained = NULL; + } + dev_dbg(&chan->vc.chan.dev->device, + "%s(): txd %p[%x]: submitted (%s linked)\n", + __func__, vd, cookie, vd_chained ? 
"cold" : "not"); + list_move_tail(&vd->node, &vc->desc_submitted); + chan->misaligned |= to_pxad_sw_desc(vd)->misaligned; + +out: + spin_unlock_irqrestore(&vc->lock, flags); + return cookie; +} + +static void pxad_issue_pending(struct dma_chan *dchan) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct virt_dma_desc *vd_first; + unsigned long flags; + + spin_lock_irqsave(&chan->vc.lock, flags); + if (list_empty(&chan->vc.desc_submitted)) + goto out; + + vd_first = list_first_entry(&chan->vc.desc_submitted, + struct virt_dma_desc, node); + dev_dbg(&chan->vc.chan.dev->device, + "%s(): txd %p[%x]", __func__, vd_first, vd_first->tx.cookie); + + vchan_issue_pending(&chan->vc); + if (!pxad_try_hotchain(&chan->vc, vd_first)) + pxad_launch_chan(chan, to_pxad_sw_desc(vd_first)); +out: + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +static inline struct dma_async_tx_descriptor * +pxad_tx_prep(struct virt_dma_chan *vc, struct virt_dma_desc *vd, + unsigned long tx_flags) +{ + struct dma_async_tx_descriptor *tx; + struct pxad_chan *chan = container_of(vc, struct pxad_chan, vc); + + tx = vchan_tx_prep(vc, vd, tx_flags); + tx->tx_submit = pxad_tx_submit; + dev_dbg(&chan->vc.chan.dev->device, + "%s(): vc=%p txd=%p[%x] flags=0x%lx\n", __func__, + vc, vd, vd->tx.cookie, + tx_flags); + + return tx; +} + +static void pxad_get_config(struct pxad_chan *chan, + enum dma_transfer_direction dir, + u32 *dcmd, u32 *dev_src, u32 *dev_dst) +{ + u32 maxburst = 0, dev_addr = 0; + enum dma_slave_buswidth width = DMA_SLAVE_BUSWIDTH_UNDEFINED; + + *dcmd = 0; + if (chan->cfg.direction == DMA_DEV_TO_MEM) { + maxburst = chan->cfg.src_maxburst; + width = chan->cfg.src_addr_width; + dev_addr = chan->cfg.src_addr; + *dev_src = dev_addr; + *dcmd |= PXA_DCMD_INCTRGADDR | PXA_DCMD_FLOWSRC; + } + if (chan->cfg.direction == DMA_MEM_TO_DEV) { + maxburst = chan->cfg.dst_maxburst; + width = chan->cfg.dst_addr_width; + dev_addr = chan->cfg.dst_addr; + *dev_dst = dev_addr; + *dcmd |= 
PXA_DCMD_INCSRCADDR | PXA_DCMD_FLOWTRG; + } + if (chan->cfg.direction == DMA_MEM_TO_MEM) + *dcmd |= PXA_DCMD_BURST32 | PXA_DCMD_INCTRGADDR | + PXA_DCMD_INCSRCADDR; + + dev_dbg(&chan->vc.chan.dev->device, + "%s(): dev_addr=0x%x maxburst=%d width=%d dir=%d\n", + __func__, dev_addr, maxburst, width, dir); + + if (width == DMA_SLAVE_BUSWIDTH_1_BYTE) + *dcmd |= PXA_DCMD_WIDTH1; + else if (width == DMA_SLAVE_BUSWIDTH_2_BYTES) + *dcmd |= PXA_DCMD_WIDTH2; + else if (width == DMA_SLAVE_BUSWIDTH_4_BYTES) + *dcmd |= PXA_DCMD_WIDTH4; + + if (maxburst == 8) + *dcmd |= PXA_DCMD_BURST8; + else if (maxburst == 16) + *dcmd |= PXA_DCMD_BURST16; + else if (maxburst == 32) + *dcmd |= PXA_DCMD_BURST32; + + /* FIXME: drivers should be ported over to use the filter + * function. Once that's done, the following two lines can + * be removed. + */ + if (chan->cfg.slave_id) + chan->drcmr = chan->cfg.slave_id; +} + +static struct dma_async_tx_descriptor * +pxad_prep_memcpy(struct dma_chan *dchan, + dma_addr_t dma_dst, dma_addr_t dma_src, + size_t len, unsigned long flags) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct pxad_desc_sw *sw_desc; + struct pxad_desc_hw *hw_desc; + u32 dcmd; + unsigned int i, nb_desc = 0; + size_t copy; + + if (!dchan || !len) + return NULL; + + dev_dbg(&chan->vc.chan.dev->device, + "%s(): dma_dst=0x%lx dma_src=0x%lx len=%zu flags=%lx\n", + __func__, (unsigned long)dma_dst, (unsigned long)dma_src, + len, flags); + pxad_get_config(chan, DMA_MEM_TO_MEM, &dcmd, NULL, NULL); + + nb_desc = DIV_ROUND_UP(len, PDMA_MAX_DESC_BYTES); + sw_desc = pxad_alloc_desc(chan, nb_desc + 1); + if (!sw_desc) + return NULL; + sw_desc->len = len; + + if (!IS_ALIGNED(dma_src, 1 << PDMA_ALIGNMENT) || + !IS_ALIGNED(dma_dst, 1 << PDMA_ALIGNMENT)) + sw_desc->misaligned = true; + + i = 0; + do { + hw_desc = sw_desc->hw_desc[i++]; + copy = min_t(size_t, len, PDMA_MAX_DESC_BYTES); + hw_desc->dcmd = dcmd | (PXA_DCMD_LENGTH & copy); + hw_desc->dsadr = dma_src; + hw_desc->dtadr = dma_dst; 
+ len -= copy; + dma_src += copy; + dma_dst += copy; + } while (len); + set_updater_desc(sw_desc, flags); + + return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags); +} + +static struct dma_async_tx_descriptor * +pxad_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_transfer_direction dir, + unsigned long flags, void *context) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct pxad_desc_sw *sw_desc; + size_t len, avail; + struct scatterlist *sg; + dma_addr_t dma; + u32 dcmd, dsadr = 0, dtadr = 0; + unsigned int nb_desc = 0, i, j = 0; + + if ((sgl == NULL) || (sg_len == 0)) + return NULL; + + pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr); + dev_dbg(&chan->vc.chan.dev->device, + "%s(): dir=%d flags=%lx\n", __func__, dir, flags); + + for_each_sg(sgl, sg, sg_len, i) + nb_desc += DIV_ROUND_UP(sg_dma_len(sg), PDMA_MAX_DESC_BYTES); + sw_desc = pxad_alloc_desc(chan, nb_desc + 1); + if (!sw_desc) + return NULL; + + for_each_sg(sgl, sg, sg_len, i) { + dma = sg_dma_address(sg); + avail = sg_dma_len(sg); + sw_desc->len += avail; + + do { + len = min_t(size_t, avail, PDMA_MAX_DESC_BYTES); + if (dma & 0x7) + sw_desc->misaligned = true; + + sw_desc->hw_desc[j]->dcmd = + dcmd | (PXA_DCMD_LENGTH & len); + sw_desc->hw_desc[j]->dsadr = dsadr ? dsadr : dma; + sw_desc->hw_desc[j++]->dtadr = dtadr ? 
dtadr : dma; + + dma += len; + avail -= len; + } while (avail); + } + set_updater_desc(sw_desc, flags); + + return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags); +} + +static struct dma_async_tx_descriptor * +pxad_prep_dma_cyclic(struct dma_chan *dchan, + dma_addr_t buf_addr, size_t len, size_t period_len, + enum dma_transfer_direction dir, unsigned long flags) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct pxad_desc_sw *sw_desc; + struct pxad_desc_hw **phw_desc; + dma_addr_t dma; + u32 dcmd, dsadr = 0, dtadr = 0; + unsigned int nb_desc = 0; + + if (!dchan || !len || !period_len) + return NULL; + if ((dir != DMA_DEV_TO_MEM) && (dir != DMA_MEM_TO_DEV)) { + dev_err(&chan->vc.chan.dev->device, + "Unsupported direction for cyclic DMA\n"); + return NULL; + } + /* the buffer length must be a multiple of period_len */ + if (len % period_len != 0 || period_len > PDMA_MAX_DESC_BYTES || + !IS_ALIGNED(period_len, 1 << PDMA_ALIGNMENT)) + return NULL; + + pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr); + dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH | period_len); + dev_dbg(&chan->vc.chan.dev->device, + "%s(): buf_addr=0x%lx len=%zu period=%zu dir=%d flags=%lx\n", + __func__, (unsigned long)buf_addr, len, period_len, dir, flags); + + nb_desc = DIV_ROUND_UP(period_len, PDMA_MAX_DESC_BYTES); + nb_desc *= DIV_ROUND_UP(len, period_len); + sw_desc = pxad_alloc_desc(chan, nb_desc + 1); + if (!sw_desc) + return NULL; + sw_desc->cyclic = true; + sw_desc->len = len; + + phw_desc = sw_desc->hw_desc; + dma = buf_addr; + do { + phw_desc[0]->dsadr = dsadr ? dsadr : dma; + phw_desc[0]->dtadr = dtadr ? 
dtadr : dma; + phw_desc[0]->dcmd = dcmd; + phw_desc++; + dma += period_len; + len -= period_len; + } while (len); + set_updater_desc(sw_desc, flags); + + return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags); +} + +static int pxad_config(struct dma_chan *dchan, + struct dma_slave_config *cfg) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + + if (!dchan) + return -EINVAL; + + chan->cfg = *cfg; + return 0; +} + +static int pxad_terminate_all(struct dma_chan *dchan) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device); + struct virt_dma_desc *vd = NULL; + unsigned long flags; + struct pxad_phy *phy; + LIST_HEAD(head); + + dev_dbg(&chan->vc.chan.dev->device, + "%s(): vchan %p: terminate all\n", __func__, &chan->vc); + + spin_lock_irqsave(&chan->vc.lock, flags); + vchan_get_all_descriptors(&chan->vc, &head); + + list_for_each_entry(vd, &head, node) { + dev_dbg(&chan->vc.chan.dev->device, + "%s(): cancelling txd %p[%x] (completed=%d)", __func__, + vd, vd->tx.cookie, is_desc_completed(vd)); + } + + phy = chan->phy; + if (phy) { + phy_disable(chan->phy); + pxad_free_phy(chan); + chan->phy = NULL; + spin_lock(&pdev->phy_lock); + phy->vchan = NULL; + spin_unlock(&pdev->phy_lock); + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + vchan_dma_desc_free_list(&chan->vc, &head); + + return 0; +} + +static unsigned int pxad_residue(struct pxad_chan *chan, + dma_cookie_t cookie) +{ + struct virt_dma_desc *vd = NULL; + struct pxad_desc_sw *sw_desc = NULL; + struct pxad_desc_hw *hw_desc = NULL; + u32 curr, start, len, end, residue = 0; + unsigned long flags; + bool passed = false; + int i; + + /* + * If the channel does not have a phy pointer anymore, it has already + * been completed. Therefore, its residue is 0. 
+ */ + if (!chan->phy) + return 0; + + spin_lock_irqsave(&chan->vc.lock, flags); + + vd = vchan_find_desc(&chan->vc, cookie); + if (!vd) + goto out; + + sw_desc = to_pxad_sw_desc(vd); + if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR) + curr = phy_readl_relaxed(chan->phy, DSADR); + else + curr = phy_readl_relaxed(chan->phy, DTADR); + + for (i = 0; i < sw_desc->nb_desc - 1; i++) { + hw_desc = sw_desc->hw_desc[i]; + if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR) + start = hw_desc->dsadr; + else + start = hw_desc->dtadr; + len = hw_desc->dcmd & PXA_DCMD_LENGTH; + end = start + len; + + /* + * 'passed' will be latched once we found the descriptor + * which lies inside the boundaries of the curr + * pointer. All descriptors that occur in the list + * _after_ we found that partially handled descriptor + * are still to be processed and are hence added to the + * residual bytes counter. + */ + + if (passed) { + residue += len; + } else if (curr >= start && curr <= end) { + residue += end - curr; + passed = true; + } + } + if (!passed) + residue = sw_desc->len; + +out: + spin_unlock_irqrestore(&chan->vc.lock, flags); + dev_dbg(&chan->vc.chan.dev->device, + "%s(): txd %p[%x] sw_desc=%p: %d\n", + __func__, vd, cookie, sw_desc, residue); + return residue; +} + +static enum dma_status pxad_tx_status(struct dma_chan *dchan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct pxad_chan *chan = to_pxad_chan(dchan); + enum dma_status ret; + + ret = dma_cookie_status(dchan, cookie, txstate); + if (likely(txstate && (ret != DMA_ERROR))) + dma_set_residue(txstate, pxad_residue(chan, cookie)); + + return ret; +} + +static void pxad_free_channels(struct dma_device *dmadev) +{ + struct pxad_chan *c, *cn; + + list_for_each_entry_safe(c, cn, &dmadev->channels, + vc.chan.device_node) { + list_del(&c->vc.chan.device_node); + tasklet_kill(&c->vc.task); + } +} + +static int pxad_remove(struct platform_device *op) +{ + struct pxad_device *pdev = 
platform_get_drvdata(op); + + pxad_cleanup_debugfs(pdev); + pxad_free_channels(&pdev->slave); + dma_async_device_unregister(&pdev->slave); + return 0; +} + +static int pxad_init_phys(struct platform_device *op, + struct pxad_device *pdev, + unsigned int nb_phy_chans) +{ + int irq0, irq, nr_irq = 0, i, ret; + struct pxad_phy *phy; + + irq0 = platform_get_irq(op, 0); + if (irq0 < 0) + return irq0; + + pdev->phys = devm_kcalloc(&op->dev, nb_phy_chans, + sizeof(pdev->phys[0]), GFP_KERNEL); + if (!pdev->phys) + return -ENOMEM; + + for (i = 0; i < nb_phy_chans; i++) + if (platform_get_irq(op, i) > 0) + nr_irq++; + + for (i = 0; i < nb_phy_chans; i++) { + phy = &pdev->phys[i]; + phy->base = pdev->base; + phy->idx = i; + irq = platform_get_irq(op, i); + if ((nr_irq > 1) && (irq > 0)) + ret = devm_request_irq(&op->dev, irq, + pxad_chan_handler, + IRQF_SHARED, "pxa-dma", phy); + if ((nr_irq == 1) && (i == 0)) + ret = devm_request_irq(&op->dev, irq0, + pxad_int_handler, + IRQF_SHARED, "pxa-dma", pdev); + if (ret) { + dev_err(pdev->slave.dev, + "%s(): can't request irq %d:%d\n", __func__, + irq, ret); + return ret; + } + } + + return 0; +} + +static const struct of_device_id const pxad_dt_ids[] = { + { .compatible = "marvell,pdma-1.0", }, + {} +}; +MODULE_DEVICE_TABLE(of, pxad_dt_ids); + +static struct dma_chan *pxad_dma_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct pxad_device *d = ofdma->of_dma_data; + struct dma_chan *chan; + + chan = dma_get_any_slave_channel(&d->slave); + if (!chan) + return NULL; + + to_pxad_chan(chan)->drcmr = dma_spec->args[0]; + to_pxad_chan(chan)->prio = dma_spec->args[1]; + + return chan; +} + +static int pxad_init_dmadev(struct platform_device *op, + struct pxad_device *pdev, + unsigned int nr_phy_chans) +{ + int ret; + unsigned int i; + struct pxad_chan *c; + + pdev->nr_chans = nr_phy_chans; + INIT_LIST_HEAD(&pdev->slave.channels); + pdev->slave.device_alloc_chan_resources = pxad_alloc_chan_resources; + 
pdev->slave.device_free_chan_resources = pxad_free_chan_resources; + pdev->slave.device_tx_status = pxad_tx_status; + pdev->slave.device_issue_pending = pxad_issue_pending; + pdev->slave.device_config = pxad_config; + pdev->slave.device_terminate_all = pxad_terminate_all; + + if (op->dev.coherent_dma_mask) + dma_set_mask(&op->dev, op->dev.coherent_dma_mask); + else + dma_set_mask(&op->dev, DMA_BIT_MASK(32)); + + ret = pxad_init_phys(op, pdev, nr_phy_chans); + if (ret) + return ret; + + for (i = 0; i < nr_phy_chans; i++) { + c = devm_kzalloc(&op->dev, sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + c->vc.desc_free = pxad_free_desc; + vchan_init(&c->vc, &pdev->slave); + } + + return dma_async_device_register(&pdev->slave); +} + +static int pxad_probe(struct platform_device *op) +{ + struct pxad_device *pdev; + const struct of_device_id *of_id; + struct mmp_dma_platdata *pdata = dev_get_platdata(&op->dev); + struct resource *iores; + int ret, dma_channels = 0; + const enum dma_slave_buswidth widths = + DMA_SLAVE_BUSWIDTH_1_BYTE | DMA_SLAVE_BUSWIDTH_2_BYTES | + DMA_SLAVE_BUSWIDTH_4_BYTES; + + pdev = devm_kzalloc(&op->dev, sizeof(*pdev), GFP_KERNEL); + if (!pdev) + return -ENOMEM; + + spin_lock_init(&pdev->phy_lock); + + iores = platform_get_resource(op, IORESOURCE_MEM, 0); + pdev->base = devm_ioremap_resource(&op->dev, iores); + if (IS_ERR(pdev->base)) + return PTR_ERR(pdev->base); + + of_id = of_match_device(pxad_dt_ids, &op->dev); + if (of_id) + of_property_read_u32(op->dev.of_node, "#dma-channels", + &dma_channels); + else if (pdata && pdata->dma_channels) + dma_channels = pdata->dma_channels; + else + dma_channels = 32; /* default 32 channel */ + + dma_cap_set(DMA_SLAVE, pdev->slave.cap_mask); + dma_cap_set(DMA_MEMCPY, pdev->slave.cap_mask); + dma_cap_set(DMA_CYCLIC, pdev->slave.cap_mask); + dma_cap_set(DMA_PRIVATE, pdev->slave.cap_mask); + pdev->slave.device_prep_dma_memcpy = pxad_prep_memcpy; + pdev->slave.device_prep_slave_sg = pxad_prep_slave_sg; + 
pdev->slave.device_prep_dma_cyclic = pxad_prep_dma_cyclic; + + pdev->slave.copy_align = PDMA_ALIGNMENT; + pdev->slave.src_addr_widths = widths; + pdev->slave.dst_addr_widths = widths; + pdev->slave.directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM); + pdev->slave.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + + pdev->slave.dev = &op->dev; + ret = pxad_init_dmadev(op, pdev, dma_channels); + if (ret) { + dev_err(pdev->slave.dev, "unable to register\n"); + return ret; + } + + if (op->dev.of_node) { + /* Device-tree DMA controller registration */ + ret = of_dma_controller_register(op->dev.of_node, + pxad_dma_xlate, pdev); + if (ret < 0) { + dev_err(pdev->slave.dev, + "of_dma_controller_register failed\n"); + return ret; + } + } + + platform_set_drvdata(op, pdev); + pxad_init_debugfs(pdev); + dev_info(pdev->slave.dev, "initialized %d channels\n", dma_channels); + return 0; +} + +static const struct platform_device_id pxad_id_table[] = { + { "pxa-dma", }, + { }, +}; + +static struct platform_driver pxad_driver = { + .driver = { + .name = "pxa-dma", + .of_match_table = pxad_dt_ids, + }, + .id_table = pxad_id_table, + .probe = pxad_probe, + .remove = pxad_remove, +}; + +bool pxad_filter_fn(struct dma_chan *chan, void *param) +{ + struct pxad_chan *c = to_pxad_chan(chan); + struct pxad_param *p = param; + + if (chan->device->dev->driver != &pxad_driver.driver) + return false; + + c->drcmr = p->drcmr; + c->prio = p->prio; + + return true; +} +EXPORT_SYMBOL_GPL(pxad_filter_fn); + +int pxad_toggle_reserved_channel(int legacy_channel) +{ + if (legacy_unavailable & (BIT(legacy_channel))) + return -EBUSY; + legacy_reserved ^= BIT(legacy_channel); + return 0; +} +EXPORT_SYMBOL_GPL(pxad_toggle_reserved_channel); + +module_platform_driver(pxad_driver); + +MODULE_DESCRIPTION("Marvell PXA Peripheral DMA Driver"); +MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/s3c24xx-dma.c b/drivers/dma/s3c24xx-dma.c 
index 01dcaf21b988..17ccdfd28f37 100644 --- a/drivers/dma/s3c24xx-dma.c +++ b/drivers/dma/s3c24xx-dma.c @@ -1168,7 +1168,7 @@ static struct soc_data soc_s3c2443 = { .has_clocks = true, }; -static struct platform_device_id s3c24xx_dma_driver_ids[] = { +static const struct platform_device_id s3c24xx_dma_driver_ids[] = { { .name = "s3c2410-dma", .driver_data = (kernel_ulong_t)&soc_s3c2410, diff --git a/drivers/dma/sh/rcar-dmac.c b/drivers/dma/sh/rcar-dmac.c index e0302c784ba4..7820d07e7bee 100644 --- a/drivers/dma/sh/rcar-dmac.c +++ b/drivers/dma/sh/rcar-dmac.c @@ -183,7 +183,7 @@ struct rcar_dmac { unsigned int n_channels; struct rcar_dmac_chan *channels; - unsigned long modules[256 / BITS_PER_LONG]; + DECLARE_BITMAP(modules, 256); }; #define to_rcar_dmac(d) container_of(d, struct rcar_dmac, engine) diff --git a/drivers/dma/sh/shdma-r8a73a4.c b/drivers/dma/sh/shdma-r8a73a4.c index 4fb99970a3ea..96ea3828c3eb 100644 --- a/drivers/dma/sh/shdma-r8a73a4.c +++ b/drivers/dma/sh/shdma-r8a73a4.c @@ -11,7 +11,7 @@ #include "shdma-arm.h" -const unsigned int dma_ts_shift[] = SH_DMAE_TS_SHIFT; +static const unsigned int dma_ts_shift[] = SH_DMAE_TS_SHIFT; static const struct sh_dmae_slave_config dma_slaves[] = { { diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c index a1afda43b8ef..8c5186cc9f63 100644 --- a/drivers/dma/sirf-dma.c +++ b/drivers/dma/sirf-dma.c @@ -23,8 +23,13 @@ #include "dmaengine.h" +#define SIRFSOC_DMA_VER_A7V1 1 +#define SIRFSOC_DMA_VER_A7V2 2 +#define SIRFSOC_DMA_VER_A6 4 + #define SIRFSOC_DMA_DESCRIPTORS 16 #define SIRFSOC_DMA_CHANNELS 16 +#define SIRFSOC_DMA_TABLE_NUM 256 #define SIRFSOC_DMA_CH_ADDR 0x00 #define SIRFSOC_DMA_CH_XLEN 0x04 @@ -35,15 +40,44 @@ #define SIRFSOC_DMA_CH_VALID 0x140 #define SIRFSOC_DMA_CH_INT 0x144 #define SIRFSOC_DMA_INT_EN 0x148 -#define SIRFSOC_DMA_INT_EN_CLR 0x14C +#define SIRFSOC_DMA_INT_EN_CLR 0x14C #define SIRFSOC_DMA_CH_LOOP_CTRL 0x150 -#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR 0x15C +#define 
SIRFSOC_DMA_CH_LOOP_CTRL_CLR 0x154 +#define SIRFSOC_DMA_WIDTH_ATLAS7 0x10 +#define SIRFSOC_DMA_VALID_ATLAS7 0x14 +#define SIRFSOC_DMA_INT_ATLAS7 0x18 +#define SIRFSOC_DMA_INT_EN_ATLAS7 0x1c +#define SIRFSOC_DMA_LOOP_CTRL_ATLAS7 0x20 +#define SIRFSOC_DMA_CUR_DATA_ADDR 0x34 +#define SIRFSOC_DMA_MUL_ATLAS7 0x38 +#define SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7 0x158 +#define SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7 0x15C +#define SIRFSOC_DMA_IOBG_SCMD_EN 0x800 +#define SIRFSOC_DMA_EARLY_RESP_SET 0x818 +#define SIRFSOC_DMA_EARLY_RESP_CLR 0x81C #define SIRFSOC_DMA_MODE_CTRL_BIT 4 #define SIRFSOC_DMA_DIR_CTRL_BIT 5 +#define SIRFSOC_DMA_MODE_CTRL_BIT_ATLAS7 2 +#define SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7 3 +#define SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7 4 +#define SIRFSOC_DMA_TAB_NUM_ATLAS7 7 +#define SIRFSOC_DMA_CHAIN_INT_BIT_ATLAS7 5 +#define SIRFSOC_DMA_CHAIN_FLAG_SHIFT_ATLAS7 25 +#define SIRFSOC_DMA_CHAIN_ADDR_SHIFT 32 + +#define SIRFSOC_DMA_INT_FINI_INT_ATLAS7 BIT(0) +#define SIRFSOC_DMA_INT_CNT_INT_ATLAS7 BIT(1) +#define SIRFSOC_DMA_INT_PAU_INT_ATLAS7 BIT(2) +#define SIRFSOC_DMA_INT_LOOP_INT_ATLAS7 BIT(3) +#define SIRFSOC_DMA_INT_INV_INT_ATLAS7 BIT(4) +#define SIRFSOC_DMA_INT_END_INT_ATLAS7 BIT(5) +#define SIRFSOC_DMA_INT_ALL_ATLAS7 0x3F /* xlen and dma_width register is in 4 bytes boundary */ #define SIRFSOC_DMA_WORD_LEN 4 +#define SIRFSOC_DMA_XLEN_MAX_V1 0x800 +#define SIRFSOC_DMA_XLEN_MAX_V2 0x1000 struct sirfsoc_dma_desc { struct dma_async_tx_descriptor desc; @@ -56,7 +90,9 @@ struct sirfsoc_dma_desc { int width; /* DMA width */ int dir; bool cyclic; /* is loop DMA? */ + bool chain; /* is chain DMA? 
*/ u32 addr; /* DMA buffer address */ + u64 chain_table[SIRFSOC_DMA_TABLE_NUM]; /* chain tbl */ }; struct sirfsoc_dma_chan { @@ -87,10 +123,25 @@ struct sirfsoc_dma { void __iomem *base; int irq; struct clk *clk; - bool is_marco; + int type; + void (*exec_desc)(struct sirfsoc_dma_desc *sdesc, + int cid, int burst_mode, void __iomem *base); struct sirfsoc_dma_regs regs_save; }; +struct sirfsoc_dmadata { + void (*exec)(struct sirfsoc_dma_desc *sdesc, + int cid, int burst_mode, void __iomem *base); + int type; +}; + +enum sirfsoc_dma_chain_flag { + SIRFSOC_DMA_CHAIN_NORMAL = 0x01, + SIRFSOC_DMA_CHAIN_PAUSE = 0x02, + SIRFSOC_DMA_CHAIN_LOOP = 0x03, + SIRFSOC_DMA_CHAIN_END = 0x04 +}; + #define DRV_NAME "sirfsoc_dma" static int sirfsoc_dma_runtime_suspend(struct device *dev); @@ -109,48 +160,105 @@ static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c) return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]); } +static void sirfsoc_dma_execute_hw_a7v2(struct sirfsoc_dma_desc *sdesc, + int cid, int burst_mode, void __iomem *base) +{ + if (sdesc->chain) { + /* DMA v2 HW chain mode */ + writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) | + (sdesc->chain << + SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) | + (0x8 << SIRFSOC_DMA_TAB_NUM_ATLAS7) | 0x3, + base + SIRFSOC_DMA_CH_CTRL); + } else { + /* DMA v2 legacy mode */ + writel_relaxed(sdesc->xlen, base + SIRFSOC_DMA_CH_XLEN); + writel_relaxed(sdesc->ylen, base + SIRFSOC_DMA_CH_YLEN); + writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_ATLAS7); + writel_relaxed((sdesc->width*((sdesc->ylen+1)>>1)), + base + SIRFSOC_DMA_MUL_ATLAS7); + writel_relaxed((sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT_ATLAS7) | + (sdesc->chain << + SIRFSOC_DMA_CHAIN_CTRL_BIT_ATLAS7) | + 0x3, base + SIRFSOC_DMA_CH_CTRL); + } + writel_relaxed(sdesc->chain ? 
SIRFSOC_DMA_INT_END_INT_ATLAS7 : + (SIRFSOC_DMA_INT_FINI_INT_ATLAS7 | + SIRFSOC_DMA_INT_LOOP_INT_ATLAS7), + base + SIRFSOC_DMA_INT_EN_ATLAS7); + writel(sdesc->addr, base + SIRFSOC_DMA_CH_ADDR); + if (sdesc->cyclic) + writel(0x10001, base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7); +} + +static void sirfsoc_dma_execute_hw_a7v1(struct sirfsoc_dma_desc *sdesc, + int cid, int burst_mode, void __iomem *base) +{ + writel_relaxed(1, base + SIRFSOC_DMA_IOBG_SCMD_EN); + writel_relaxed((1 << cid), base + SIRFSOC_DMA_EARLY_RESP_SET); + writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4); + writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) | + (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT), + base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL); + writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN); + writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN); + writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) | + (1 << cid), base + SIRFSOC_DMA_INT_EN); + writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR); + if (sdesc->cyclic) { + writel((1 << cid) | 1 << (cid + 16) | + readl_relaxed(base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7), + base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7); + } + +} + +static void sirfsoc_dma_execute_hw_a6(struct sirfsoc_dma_desc *sdesc, + int cid, int burst_mode, void __iomem *base) +{ + writel_relaxed(sdesc->width, base + SIRFSOC_DMA_WIDTH_0 + cid * 4); + writel_relaxed(cid | (burst_mode << SIRFSOC_DMA_MODE_CTRL_BIT) | + (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT), + base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL); + writel_relaxed(sdesc->xlen, base + cid * 0x10 + SIRFSOC_DMA_CH_XLEN); + writel_relaxed(sdesc->ylen, base + cid * 0x10 + SIRFSOC_DMA_CH_YLEN); + writel_relaxed(readl_relaxed(base + SIRFSOC_DMA_INT_EN) | + (1 << cid), base + SIRFSOC_DMA_INT_EN); + writel(sdesc->addr >> 2, base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR); + if (sdesc->cyclic) { + writel((1 << cid) | 1 << (cid + 16) | + readl_relaxed(base + 
SIRFSOC_DMA_CH_LOOP_CTRL), + base + SIRFSOC_DMA_CH_LOOP_CTRL); + } + +} + /* Execute all queued DMA descriptors */ static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan) { struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan); int cid = schan->chan.chan_id; struct sirfsoc_dma_desc *sdesc = NULL; + void __iomem *base; /* * lock has been held by functions calling this, so we don't hold * lock again */ - + base = sdma->base; sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc, - node); + node); /* Move the first queued descriptor to active list */ list_move_tail(&sdesc->node, &schan->active); - /* Start the DMA transfer */ - writel_relaxed(sdesc->width, sdma->base + SIRFSOC_DMA_WIDTH_0 + - cid * 4); - writel_relaxed(cid | (schan->mode << SIRFSOC_DMA_MODE_CTRL_BIT) | - (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT), - sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL); - writel_relaxed(sdesc->xlen, sdma->base + cid * 0x10 + - SIRFSOC_DMA_CH_XLEN); - writel_relaxed(sdesc->ylen, sdma->base + cid * 0x10 + - SIRFSOC_DMA_CH_YLEN); - writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) | - (1 << cid), sdma->base + SIRFSOC_DMA_INT_EN); + if (sdma->type == SIRFSOC_DMA_VER_A7V2) + cid = 0; - /* - * writel has an implict memory write barrier to make sure data is - * flushed into memory before starting DMA - */ - writel(sdesc->addr >> 2, sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR); + /* Start the DMA transfer */ + sdma->exec_desc(sdesc, cid, schan->mode, base); - if (sdesc->cyclic) { - writel((1 << cid) | 1 << (cid + 16) | - readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + if (sdesc->cyclic) schan->happened_cyclic = schan->completed_cyclic = 0; - } } /* Interrupt handler */ @@ -160,27 +268,65 @@ static irqreturn_t sirfsoc_dma_irq(int irq, void *data) struct sirfsoc_dma_chan *schan; struct sirfsoc_dma_desc *sdesc = NULL; u32 is; + bool chain; int ch; + void __iomem *reg; + + switch (sdma->type) { + 
case SIRFSOC_DMA_VER_A6: + case SIRFSOC_DMA_VER_A7V1: + is = readl(sdma->base + SIRFSOC_DMA_CH_INT); + reg = sdma->base + SIRFSOC_DMA_CH_INT; + while ((ch = fls(is) - 1) >= 0) { + is &= ~(1 << ch); + writel_relaxed(1 << ch, reg); + schan = &sdma->channels[ch]; + spin_lock(&schan->lock); + sdesc = list_first_entry(&schan->active, + struct sirfsoc_dma_desc, node); + if (!sdesc->cyclic) { + /* Execute queued descriptors */ + list_splice_tail_init(&schan->active, + &schan->completed); + dma_cookie_complete(&sdesc->desc); + if (!list_empty(&schan->queued)) + sirfsoc_dma_execute(schan); + } else + schan->happened_cyclic++; + spin_unlock(&schan->lock); + } + break; - is = readl(sdma->base + SIRFSOC_DMA_CH_INT); - while ((ch = fls(is) - 1) >= 0) { - is &= ~(1 << ch); - writel_relaxed(1 << ch, sdma->base + SIRFSOC_DMA_CH_INT); - schan = &sdma->channels[ch]; + case SIRFSOC_DMA_VER_A7V2: + is = readl(sdma->base + SIRFSOC_DMA_INT_ATLAS7); + reg = sdma->base + SIRFSOC_DMA_INT_ATLAS7; + writel_relaxed(SIRFSOC_DMA_INT_ALL_ATLAS7, reg); + schan = &sdma->channels[0]; spin_lock(&schan->lock); - - sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, - node); + sdesc = list_first_entry(&schan->active, + struct sirfsoc_dma_desc, node); if (!sdesc->cyclic) { - /* Execute queued descriptors */ - list_splice_tail_init(&schan->active, &schan->completed); - if (!list_empty(&schan->queued)) - sirfsoc_dma_execute(schan); - } else + chain = sdesc->chain; + if ((chain && (is & SIRFSOC_DMA_INT_END_INT_ATLAS7)) || + (!chain && + (is & SIRFSOC_DMA_INT_FINI_INT_ATLAS7))) { + /* Execute queued descriptors */ + list_splice_tail_init(&schan->active, + &schan->completed); + dma_cookie_complete(&sdesc->desc); + if (!list_empty(&schan->queued)) + sirfsoc_dma_execute(schan); + } + } else if (sdesc->cyclic && (is & + SIRFSOC_DMA_INT_LOOP_INT_ATLAS7)) schan->happened_cyclic++; spin_unlock(&schan->lock); + break; + + default: + break; } /* Schedule tasklet */ @@ -227,16 +373,15 @@ static void 
sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma) schan->chan.completed_cookie = last_cookie; spin_unlock_irqrestore(&schan->lock, flags); } else { - /* for cyclic channel, desc is always in active list */ - sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, - node); - - if (!sdesc || (sdesc && !sdesc->cyclic)) { - /* without active cyclic DMA */ + if (list_empty(&schan->active)) { spin_unlock_irqrestore(&schan->lock, flags); continue; } + /* for cyclic channel, desc is always in active list */ + sdesc = list_first_entry(&schan->active, + struct sirfsoc_dma_desc, node); + /* cyclic DMA */ happened_cyclic = schan->happened_cyclic; spin_unlock_irqrestore(&schan->lock, flags); @@ -307,20 +452,32 @@ static int sirfsoc_dma_terminate_all(struct dma_chan *chan) spin_lock_irqsave(&schan->lock, flags); - if (!sdma->is_marco) { - writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) & - ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN); - writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL) - & ~((1 << cid) | 1 << (cid + 16)), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); - } else { + switch (sdma->type) { + case SIRFSOC_DMA_VER_A7V1: writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_INT_EN_CLR); writel_relaxed((1 << cid) | 1 << (cid + 16), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_CLR); + sdma->base + + SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7); + writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID); + break; + case SIRFSOC_DMA_VER_A7V2: + writel_relaxed(0, sdma->base + SIRFSOC_DMA_INT_EN_ATLAS7); + writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7); + writel_relaxed(0, sdma->base + SIRFSOC_DMA_VALID_ATLAS7); + break; + case SIRFSOC_DMA_VER_A6: + writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) & + ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN); + writel_relaxed(readl_relaxed(sdma->base + + SIRFSOC_DMA_CH_LOOP_CTRL) & + ~((1 << cid) | 1 << (cid + 16)), + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + writel_relaxed(1 << 
cid, sdma->base + SIRFSOC_DMA_CH_VALID); + break; + default: + break; } - writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID); - list_splice_tail_init(&schan->active, &schan->free); list_splice_tail_init(&schan->queued, &schan->free); @@ -338,13 +495,25 @@ static int sirfsoc_dma_pause_chan(struct dma_chan *chan) spin_lock_irqsave(&schan->lock, flags); - if (!sdma->is_marco) - writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL) - & ~((1 << cid) | 1 << (cid + 16)), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); - else + switch (sdma->type) { + case SIRFSOC_DMA_VER_A7V1: writel_relaxed((1 << cid) | 1 << (cid + 16), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_CLR); + sdma->base + + SIRFSOC_DMA_CH_LOOP_CTRL_CLR_ATLAS7); + break; + case SIRFSOC_DMA_VER_A7V2: + writel_relaxed(0, sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7); + break; + case SIRFSOC_DMA_VER_A6: + writel_relaxed(readl_relaxed(sdma->base + + SIRFSOC_DMA_CH_LOOP_CTRL) & + ~((1 << cid) | 1 << (cid + 16)), + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + break; + + default: + break; + } spin_unlock_irqrestore(&schan->lock, flags); @@ -359,14 +528,25 @@ static int sirfsoc_dma_resume_chan(struct dma_chan *chan) unsigned long flags; spin_lock_irqsave(&schan->lock, flags); - - if (!sdma->is_marco) - writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL) - | ((1 << cid) | 1 << (cid + 16)), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); - else + switch (sdma->type) { + case SIRFSOC_DMA_VER_A7V1: writel_relaxed((1 << cid) | 1 << (cid + 16), - sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL_ATLAS7); + break; + case SIRFSOC_DMA_VER_A7V2: + writel_relaxed(0x10001, + sdma->base + SIRFSOC_DMA_LOOP_CTRL_ATLAS7); + break; + case SIRFSOC_DMA_VER_A6: + writel_relaxed(readl_relaxed(sdma->base + + SIRFSOC_DMA_CH_LOOP_CTRL) | + ((1 << cid) | 1 << (cid + 16)), + sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL); + break; + + default: + break; + } spin_unlock_irqrestore(&schan->lock, flags); @@ 
-473,14 +653,31 @@ sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie, spin_lock_irqsave(&schan->lock, flags); - sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, - node); - dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) * - (sdesc->width * SIRFSOC_DMA_WORD_LEN); + if (list_empty(&schan->active)) { + ret = dma_cookie_status(chan, cookie, txstate); + dma_set_residue(txstate, 0); + spin_unlock_irqrestore(&schan->lock, flags); + return ret; + } + sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc, node); + if (sdesc->cyclic) + dma_request_bytes = (sdesc->xlen + 1) * (sdesc->ylen + 1) * + (sdesc->width * SIRFSOC_DMA_WORD_LEN); + else + dma_request_bytes = sdesc->xlen * SIRFSOC_DMA_WORD_LEN; ret = dma_cookie_status(chan, cookie, txstate); - dma_pos = readl_relaxed(sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR) - << 2; + + if (sdma->type == SIRFSOC_DMA_VER_A7V2) + cid = 0; + + if (sdma->type == SIRFSOC_DMA_VER_A7V2) { + dma_pos = readl_relaxed(sdma->base + SIRFSOC_DMA_CUR_DATA_ADDR); + } else { + dma_pos = readl_relaxed( + sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR) << 2; + } + residue = dma_request_bytes - (dma_pos - sdesc->addr); dma_set_residue(txstate, residue); @@ -647,6 +844,7 @@ static int sirfsoc_dma_probe(struct platform_device *op) struct dma_device *dma; struct sirfsoc_dma *sdma; struct sirfsoc_dma_chan *schan; + struct sirfsoc_dmadata *data; struct resource res; ulong regs_start, regs_size; u32 id; @@ -657,9 +855,11 @@ static int sirfsoc_dma_probe(struct platform_device *op) dev_err(dev, "Memory exhausted!\n"); return -ENOMEM; } - - if (of_device_is_compatible(dn, "sirf,marco-dmac")) - sdma->is_marco = true; + data = (struct sirfsoc_dmadata *) + (of_match_device(op->dev.driver->of_match_table, + &op->dev)->data); + sdma->exec_desc = data->exec; + sdma->type = data->type; if (of_property_read_u32(dn, "cell-index", &id)) { dev_err(dev, "Fail to get DMAC index\n"); @@ -816,6 +1016,8 @@ static int 
sirfsoc_dma_pm_suspend(struct device *dev) struct sirfsoc_dma_chan *schan; int ch; int ret; + int count; + u32 int_offset; /* * if we were runtime-suspended before, resume to enable clock @@ -827,11 +1029,19 @@ static int sirfsoc_dma_pm_suspend(struct device *dev) return ret; } + if (sdma->type == SIRFSOC_DMA_VER_A7V2) { + count = 1; + int_offset = SIRFSOC_DMA_INT_EN_ATLAS7; + } else { + count = SIRFSOC_DMA_CHANNELS; + int_offset = SIRFSOC_DMA_INT_EN; + } + /* * DMA controller will lose all registers while suspending * so we need to save registers for active channels */ - for (ch = 0; ch < SIRFSOC_DMA_CHANNELS; ch++) { + for (ch = 0; ch < count; ch++) { schan = &sdma->channels[ch]; if (list_empty(&schan->active)) continue; @@ -841,7 +1051,7 @@ static int sirfsoc_dma_pm_suspend(struct device *dev) save->ctrl[ch] = readl_relaxed(sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_CTRL); } - save->interrupt_en = readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN); + save->interrupt_en = readl_relaxed(sdma->base + int_offset); /* Disable clock */ sirfsoc_dma_runtime_suspend(dev); @@ -857,14 +1067,27 @@ static int sirfsoc_dma_pm_resume(struct device *dev) struct sirfsoc_dma_chan *schan; int ch; int ret; + int count; + u32 int_offset; + u32 width_offset; /* Enable clock before accessing register */ ret = sirfsoc_dma_runtime_resume(dev); if (ret < 0) return ret; - writel_relaxed(save->interrupt_en, sdma->base + SIRFSOC_DMA_INT_EN); - for (ch = 0; ch < SIRFSOC_DMA_CHANNELS; ch++) { + if (sdma->type == SIRFSOC_DMA_VER_A7V2) { + count = 1; + int_offset = SIRFSOC_DMA_INT_EN_ATLAS7; + width_offset = SIRFSOC_DMA_WIDTH_ATLAS7; + } else { + count = SIRFSOC_DMA_CHANNELS; + int_offset = SIRFSOC_DMA_INT_EN; + width_offset = SIRFSOC_DMA_WIDTH_0; + } + + writel_relaxed(save->interrupt_en, sdma->base + int_offset); + for (ch = 0; ch < count; ch++) { schan = &sdma->channels[ch]; if (list_empty(&schan->active)) continue; @@ -872,15 +1095,21 @@ static int sirfsoc_dma_pm_resume(struct device *dev) struct 
sirfsoc_dma_desc, node); writel_relaxed(sdesc->width, - sdma->base + SIRFSOC_DMA_WIDTH_0 + ch * 4); + sdma->base + width_offset + ch * 4); writel_relaxed(sdesc->xlen, sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_XLEN); writel_relaxed(sdesc->ylen, sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_YLEN); writel_relaxed(save->ctrl[ch], sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_CTRL); - writel_relaxed(sdesc->addr >> 2, - sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR); + if (sdma->type == SIRFSOC_DMA_VER_A7V2) { + writel_relaxed(sdesc->addr, + sdma->base + SIRFSOC_DMA_CH_ADDR); + } else { + writel_relaxed(sdesc->addr >> 2, + sdma->base + ch * 0x10 + SIRFSOC_DMA_CH_ADDR); + + } } /* if we were runtime-suspended before, suspend again */ @@ -896,9 +1125,25 @@ static const struct dev_pm_ops sirfsoc_dma_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(sirfsoc_dma_pm_suspend, sirfsoc_dma_pm_resume) }; +struct sirfsoc_dmadata sirfsoc_dmadata_a6 = { + .exec = sirfsoc_dma_execute_hw_a6, + .type = SIRFSOC_DMA_VER_A6, +}; + +struct sirfsoc_dmadata sirfsoc_dmadata_a7v1 = { + .exec = sirfsoc_dma_execute_hw_a7v1, + .type = SIRFSOC_DMA_VER_A7V1, +}; + +struct sirfsoc_dmadata sirfsoc_dmadata_a7v2 = { + .exec = sirfsoc_dma_execute_hw_a7v2, + .type = SIRFSOC_DMA_VER_A7V2, +}; + static const struct of_device_id sirfsoc_dma_match[] = { - { .compatible = "sirf,prima2-dmac", }, - { .compatible = "sirf,marco-dmac", }, + { .compatible = "sirf,prima2-dmac", .data = &sirfsoc_dmadata_a6,}, + { .compatible = "sirf,atlas7-dmac", .data = &sirfsoc_dmadata_a7v1,}, + { .compatible = "sirf,atlas7-dmac-v2", .data = &sirfsoc_dmadata_a7v2,}, {}, }; @@ -925,7 +1170,7 @@ static void __exit sirfsoc_dma_exit(void) subsys_initcall(sirfsoc_dma_init); module_exit(sirfsoc_dma_exit); -MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>, " - "Barry Song <baohua.song@csr.com>"); +MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>"); +MODULE_AUTHOR("Barry Song <baohua.song@csr.com>"); MODULE_DESCRIPTION("SIRFSOC DMA control driver"); 
MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c index 11e536586812..842ff97c2cfb 100644 --- a/drivers/dma/sun6i-dma.c +++ b/drivers/dma/sun6i-dma.c @@ -891,9 +891,21 @@ static struct sun6i_dma_config sun8i_a23_dma_cfg = { .nr_max_vchans = 37, }; +/* + * The H3 has 12 physical channels, a maximum DRQ port id of 27, + * and a total of 34 usable source and destination endpoints. + */ + +static struct sun6i_dma_config sun8i_h3_dma_cfg = { + .nr_max_channels = 12, + .nr_max_requests = 27, + .nr_max_vchans = 34, +}; + static const struct of_device_id sun6i_dma_match[] = { { .compatible = "allwinner,sun6i-a31-dma", .data = &sun6i_a31_dma_cfg }, { .compatible = "allwinner,sun8i-a23-dma", .data = &sun8i_a23_dma_cfg }, + { .compatible = "allwinner,sun8i-h3-dma", .data = &sun8i_h3_dma_cfg }, { /* sentinel */ } }; diff --git a/drivers/dma/ti-dma-crossbar.c b/drivers/dma/ti-dma-crossbar.c new file mode 100644 index 000000000000..24f5ca2356bf --- /dev/null +++ b/drivers/dma/ti-dma-crossbar.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2015 Texas Instruments Incorporated - http://www.ti.com + * Author: Peter Ujfalusi <peter.ujfalusi@ti.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/io.h> +#include <linux/idr.h> +#include <linux/of_address.h> +#include <linux/of_device.h> +#include <linux/of_dma.h> + +#define TI_XBAR_OUTPUTS 127 +#define TI_XBAR_INPUTS 256 + +static DEFINE_IDR(map_idr); + +struct ti_dma_xbar_data { + void __iomem *iomem; + + struct dma_router dmarouter; + + u16 safe_val; /* Value to rest the crossbar lines */ + u32 xbar_requests; /* number of DMA requests connected to XBAR */ + u32 dma_requests; /* number of DMA requests forwarded to DMA */ +}; + +struct ti_dma_xbar_map { + u16 xbar_in; + int xbar_out; +}; + +static inline void ti_dma_xbar_write(void __iomem *iomem, int xbar, u16 val) +{ + writew_relaxed(val, iomem + (xbar * 2)); +} + +static void ti_dma_xbar_free(struct device *dev, void *route_data) +{ + struct ti_dma_xbar_data *xbar = dev_get_drvdata(dev); + struct ti_dma_xbar_map *map = route_data; + + dev_dbg(dev, "Unmapping XBAR%u (was routed to %d)\n", + map->xbar_in, map->xbar_out); + + ti_dma_xbar_write(xbar->iomem, map->xbar_out, xbar->safe_val); + idr_remove(&map_idr, map->xbar_out); + kfree(map); +} + +static void *ti_dma_xbar_route_allocate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct platform_device *pdev = of_find_device_by_node(ofdma->of_node); + struct ti_dma_xbar_data *xbar = platform_get_drvdata(pdev); + struct ti_dma_xbar_map *map; + + if (dma_spec->args[0] >= xbar->xbar_requests) { + dev_err(&pdev->dev, "Invalid XBAR request number: %d\n", + dma_spec->args[0]); + return ERR_PTR(-EINVAL); + } + + /* The of_node_put() will be done in the core for the node */ + dma_spec->np = of_parse_phandle(ofdma->of_node, "dma-masters", 0); + if (!dma_spec->np) { + dev_err(&pdev->dev, "Can't get DMA master\n"); + return ERR_PTR(-EINVAL); + } + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) { + of_node_put(dma_spec->np); + return ERR_PTR(-ENOMEM); + } + + map->xbar_out 
= idr_alloc(&map_idr, NULL, 0, xbar->dma_requests, + GFP_KERNEL); + map->xbar_in = (u16)dma_spec->args[0]; + + /* The DMA request is 1 based in sDMA */ + dma_spec->args[0] = map->xbar_out + 1; + + dev_dbg(&pdev->dev, "Mapping XBAR%u to DMA%d\n", + map->xbar_in, map->xbar_out); + + ti_dma_xbar_write(xbar->iomem, map->xbar_out, map->xbar_in); + + return map; +} + +static int ti_dma_xbar_probe(struct platform_device *pdev) +{ + struct device_node *node = pdev->dev.of_node; + struct device_node *dma_node; + struct ti_dma_xbar_data *xbar; + struct resource *res; + u32 safe_val; + void __iomem *iomem; + int i, ret; + + if (!node) + return -ENODEV; + + xbar = devm_kzalloc(&pdev->dev, sizeof(*xbar), GFP_KERNEL); + if (!xbar) + return -ENOMEM; + + dma_node = of_parse_phandle(node, "dma-masters", 0); + if (!dma_node) { + dev_err(&pdev->dev, "Can't get DMA master node\n"); + return -ENODEV; + } + + if (of_property_read_u32(dma_node, "dma-requests", + &xbar->dma_requests)) { + dev_info(&pdev->dev, + "Missing XBAR output information, using %u.\n", + TI_XBAR_OUTPUTS); + xbar->dma_requests = TI_XBAR_OUTPUTS; + } + of_node_put(dma_node); + + if (of_property_read_u32(node, "dma-requests", &xbar->xbar_requests)) { + dev_info(&pdev->dev, + "Missing XBAR input information, using %u.\n", + TI_XBAR_INPUTS); + xbar->xbar_requests = TI_XBAR_INPUTS; + } + + if (!of_property_read_u32(node, "ti,dma-safe-map", &safe_val)) + xbar->safe_val = (u16)safe_val; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + iomem = devm_ioremap_resource(&pdev->dev, res); + if (!iomem) + return -ENOMEM; + + xbar->iomem = iomem; + + xbar->dmarouter.dev = &pdev->dev; + xbar->dmarouter.route_free = ti_dma_xbar_free; + + platform_set_drvdata(pdev, xbar); + + /* Reset the crossbar */ + for (i = 0; i < xbar->dma_requests; i++) + ti_dma_xbar_write(xbar->iomem, i, xbar->safe_val); + + ret = of_dma_router_register(node, ti_dma_xbar_route_allocate, + &xbar->dmarouter); + if (ret) { 
+ /* Restore the defaults for the crossbar */ + for (i = 0; i < xbar->dma_requests; i++) + ti_dma_xbar_write(xbar->iomem, i, i); + } + + return ret; +} + +static const struct of_device_id ti_dma_xbar_match[] = { + { .compatible = "ti,dra7-dma-crossbar" }, + {}, +}; + +static struct platform_driver ti_dma_xbar_driver = { + .driver = { + .name = "ti-dma-crossbar", + .of_match_table = of_match_ptr(ti_dma_xbar_match), + }, + .probe = ti_dma_xbar_probe, +}; + +int omap_dmaxbar_init(void) +{ + return platform_driver_register(&ti_dma_xbar_driver); +} +arch_initcall(omap_dmaxbar_init); diff --git a/drivers/dma/virt-dma.c b/drivers/dma/virt-dma.c index 6f80432a3f0a..7d2c17d8d30f 100644 --- a/drivers/dma/virt-dma.c +++ b/drivers/dma/virt-dma.c @@ -29,7 +29,7 @@ dma_cookie_t vchan_tx_submit(struct dma_async_tx_descriptor *tx) spin_lock_irqsave(&vc->lock, flags); cookie = dma_cookie_assign(tx); - list_add_tail(&vd->node, &vc->desc_submitted); + list_move_tail(&vd->node, &vc->desc_submitted); spin_unlock_irqrestore(&vc->lock, flags); dev_dbg(vc->chan.device->dev, "vchan %p: txd %p[%x]: submitted\n", @@ -83,8 +83,10 @@ static void vchan_complete(unsigned long arg) cb_data = vd->tx.callback_param; list_del(&vd->node); - - vc->desc_free(vd); + if (async_tx_test_ack(&vd->tx)) + list_add(&vd->node, &vc->desc_allocated); + else + vc->desc_free(vd); if (cb) cb(cb_data); @@ -96,9 +98,13 @@ void vchan_dma_desc_free_list(struct virt_dma_chan *vc, struct list_head *head) while (!list_empty(head)) { struct virt_dma_desc *vd = list_first_entry(head, struct virt_dma_desc, node); - list_del(&vd->node); - dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd); - vc->desc_free(vd); + if (async_tx_test_ack(&vd->tx)) { + list_move_tail(&vd->node, &vc->desc_allocated); + } else { + dev_dbg(vc->chan.device->dev, "txd %p: freeing\n", vd); + list_del(&vd->node); + vc->desc_free(vd); + } } } EXPORT_SYMBOL_GPL(vchan_dma_desc_free_list); @@ -108,6 +114,7 @@ void vchan_init(struct virt_dma_chan *vc, 
struct dma_device *dmadev) dma_cookie_init(&vc->chan); spin_lock_init(&vc->lock); + INIT_LIST_HEAD(&vc->desc_allocated); INIT_LIST_HEAD(&vc->desc_submitted); INIT_LIST_HEAD(&vc->desc_issued); INIT_LIST_HEAD(&vc->desc_completed); diff --git a/drivers/dma/virt-dma.h b/drivers/dma/virt-dma.h index 181b95267866..189e75dbcb15 100644 --- a/drivers/dma/virt-dma.h +++ b/drivers/dma/virt-dma.h @@ -29,6 +29,7 @@ struct virt_dma_chan { spinlock_t lock; /* protected by vc.lock */ + struct list_head desc_allocated; struct list_head desc_submitted; struct list_head desc_issued; struct list_head desc_completed; @@ -55,11 +56,16 @@ static inline struct dma_async_tx_descriptor *vchan_tx_prep(struct virt_dma_chan struct virt_dma_desc *vd, unsigned long tx_flags) { extern dma_cookie_t vchan_tx_submit(struct dma_async_tx_descriptor *); + unsigned long flags; dma_async_tx_descriptor_init(&vd->tx, &vc->chan); vd->tx.flags = tx_flags; vd->tx.tx_submit = vchan_tx_submit; + spin_lock_irqsave(&vc->lock, flags); + list_add_tail(&vd->node, &vc->desc_allocated); + spin_unlock_irqrestore(&vc->lock, flags); + return &vd->tx; } @@ -122,7 +128,8 @@ static inline struct virt_dma_desc *vchan_next_desc(struct virt_dma_chan *vc) } /** - * vchan_get_all_descriptors - obtain all submitted and issued descriptors + * vchan_get_all_descriptors - obtain all allocated, submitted and issued + * descriptors * vc: virtual channel to get descriptors from * head: list of descriptors found * @@ -134,6 +141,7 @@ static inline struct virt_dma_desc *vchan_next_desc(struct virt_dma_chan *vc) static inline void vchan_get_all_descriptors(struct virt_dma_chan *vc, struct list_head *head) { + list_splice_tail_init(&vc->desc_allocated, head); list_splice_tail_init(&vc->desc_submitted, head); list_splice_tail_init(&vc->desc_issued, head); list_splice_tail_init(&vc->desc_completed, head); @@ -141,11 +149,14 @@ static inline void vchan_get_all_descriptors(struct virt_dma_chan *vc, static inline void 
vchan_free_chan_resources(struct virt_dma_chan *vc) { + struct virt_dma_desc *vd; unsigned long flags; LIST_HEAD(head); spin_lock_irqsave(&vc->lock, flags); vchan_get_all_descriptors(vc, &head); + list_for_each_entry(vd, &head, node) + async_tx_clear_ack(&vd->tx); spin_unlock_irqrestore(&vc->lock, flags); vchan_dma_desc_free_list(vc, &head); diff --git a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c index f52e37502254..620fd55ec766 100755..100644 --- a/drivers/dma/xgene-dma.c +++ b/drivers/dma/xgene-dma.c @@ -124,32 +124,8 @@ #define XGENE_DMA_DESC_ELERR_POS 46 #define XGENE_DMA_DESC_RTYPE_POS 56 #define XGENE_DMA_DESC_LERR_POS 60 -#define XGENE_DMA_DESC_FLYBY_POS 4 #define XGENE_DMA_DESC_BUFLEN_POS 48 #define XGENE_DMA_DESC_HOENQ_NUM_POS 48 - -#define XGENE_DMA_DESC_NV_SET(m) \ - (((u64 *)(m))[0] |= XGENE_DMA_DESC_NV_BIT) -#define XGENE_DMA_DESC_IN_SET(m) \ - (((u64 *)(m))[0] |= XGENE_DMA_DESC_IN_BIT) -#define XGENE_DMA_DESC_RTYPE_SET(m, v) \ - (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_RTYPE_POS)) -#define XGENE_DMA_DESC_BUFADDR_SET(m, v) \ - (((u64 *)(m))[0] |= (v)) -#define XGENE_DMA_DESC_BUFLEN_SET(m, v) \ - (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_BUFLEN_POS)) -#define XGENE_DMA_DESC_C_SET(m) \ - (((u64 *)(m))[1] |= XGENE_DMA_DESC_C_BIT) -#define XGENE_DMA_DESC_FLYBY_SET(m, v) \ - (((u64 *)(m))[2] |= ((v) << XGENE_DMA_DESC_FLYBY_POS)) -#define XGENE_DMA_DESC_MULTI_SET(m, v, i) \ - (((u64 *)(m))[2] |= ((u64)(v) << (((i) + 1) * 8))) -#define XGENE_DMA_DESC_DR_SET(m) \ - (((u64 *)(m))[2] |= XGENE_DMA_DESC_DR_BIT) -#define XGENE_DMA_DESC_DST_ADDR_SET(m, v) \ - (((u64 *)(m))[3] |= (v)) -#define XGENE_DMA_DESC_H0ENQ_NUM_SET(m, v) \ - (((u64 *)(m))[3] |= ((u64)(v) << XGENE_DMA_DESC_HOENQ_NUM_POS)) #define XGENE_DMA_DESC_ELERR_RD(m) \ (((m) >> XGENE_DMA_DESC_ELERR_POS) & 0x3) #define XGENE_DMA_DESC_LERR_RD(m) \ @@ -158,14 +134,7 @@ (((elerr) << 4) | (lerr)) /* X-Gene DMA descriptor empty s/w signature */ -#define XGENE_DMA_DESC_EMPTY_INDEX 0 #define 
XGENE_DMA_DESC_EMPTY_SIGNATURE ~0ULL -#define XGENE_DMA_DESC_SET_EMPTY(m) \ - (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] = \ - XGENE_DMA_DESC_EMPTY_SIGNATURE) -#define XGENE_DMA_DESC_IS_EMPTY(m) \ - (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] == \ - XGENE_DMA_DESC_EMPTY_SIGNATURE) /* X-Gene DMA configurable parameters defines */ #define XGENE_DMA_RING_NUM 512 @@ -184,7 +153,7 @@ #define XGENE_DMA_XOR_ALIGNMENT 6 /* 64 Bytes */ #define XGENE_DMA_MAX_XOR_SRC 5 #define XGENE_DMA_16K_BUFFER_LEN_CODE 0x0 -#define XGENE_DMA_INVALID_LEN_CODE 0x7800 +#define XGENE_DMA_INVALID_LEN_CODE 0x7800000000000000ULL /* X-Gene DMA descriptor error codes */ #define ERR_DESC_AXI 0x01 @@ -214,10 +183,10 @@ #define ERR_DESC_SRC_INT 0xB /* X-Gene DMA flyby operation code */ -#define FLYBY_2SRC_XOR 0x8 -#define FLYBY_3SRC_XOR 0x9 -#define FLYBY_4SRC_XOR 0xA -#define FLYBY_5SRC_XOR 0xB +#define FLYBY_2SRC_XOR 0x80 +#define FLYBY_3SRC_XOR 0x90 +#define FLYBY_4SRC_XOR 0xA0 +#define FLYBY_5SRC_XOR 0xB0 /* X-Gene DMA SW descriptor flags */ #define XGENE_DMA_FLAG_64B_DESC BIT(0) @@ -238,10 +207,10 @@ dev_err(chan->dev, "%s: " fmt, chan->name, ##arg) struct xgene_dma_desc_hw { - u64 m0; - u64 m1; - u64 m2; - u64 m3; + __le64 m0; + __le64 m1; + __le64 m2; + __le64 m3; }; enum xgene_dma_ring_cfgsize { @@ -388,18 +357,11 @@ static bool is_pq_enabled(struct xgene_dma *pdma) return !(val & XGENE_DMA_PQ_DISABLE_MASK); } -static void xgene_dma_cpu_to_le64(u64 *desc, int count) -{ - int i; - - for (i = 0; i < count; i++) - desc[i] = cpu_to_le64(desc[i]); -} - -static u16 xgene_dma_encode_len(u32 len) +static u64 xgene_dma_encode_len(size_t len) { return (len < XGENE_DMA_MAX_BYTE_CNT) ? 
- len : XGENE_DMA_16K_BUFFER_LEN_CODE; + ((u64)len << XGENE_DMA_DESC_BUFLEN_POS) : + XGENE_DMA_16K_BUFFER_LEN_CODE; } static u8 xgene_dma_encode_xor_flyby(u32 src_cnt) @@ -424,34 +386,50 @@ static u32 xgene_dma_ring_desc_cnt(struct xgene_dma_ring *ring) return XGENE_DMA_RING_DESC_CNT(ring_state); } -static void xgene_dma_set_src_buffer(void *ext8, size_t *len, +static void xgene_dma_set_src_buffer(__le64 *ext8, size_t *len, dma_addr_t *paddr) { size_t nbytes = (*len < XGENE_DMA_MAX_BYTE_CNT) ? *len : XGENE_DMA_MAX_BYTE_CNT; - XGENE_DMA_DESC_BUFADDR_SET(ext8, *paddr); - XGENE_DMA_DESC_BUFLEN_SET(ext8, xgene_dma_encode_len(nbytes)); + *ext8 |= cpu_to_le64(*paddr); + *ext8 |= cpu_to_le64(xgene_dma_encode_len(nbytes)); *len -= nbytes; *paddr += nbytes; } -static void xgene_dma_invalidate_buffer(void *ext8) +static void xgene_dma_invalidate_buffer(__le64 *ext8) { - XGENE_DMA_DESC_BUFLEN_SET(ext8, XGENE_DMA_INVALID_LEN_CODE); + *ext8 |= cpu_to_le64(XGENE_DMA_INVALID_LEN_CODE); } -static void *xgene_dma_lookup_ext8(u64 *desc, int idx) +static __le64 *xgene_dma_lookup_ext8(struct xgene_dma_desc_hw *desc, int idx) { - return (idx % 2) ? 
(desc + idx - 1) : (desc + idx + 1); + switch (idx) { + case 0: + return &desc->m1; + case 1: + return &desc->m0; + case 2: + return &desc->m3; + case 3: + return &desc->m2; + default: + pr_err("Invalid dma descriptor index\n"); + } + + return NULL; } -static void xgene_dma_init_desc(void *desc, u16 dst_ring_num) +static void xgene_dma_init_desc(struct xgene_dma_desc_hw *desc, + u16 dst_ring_num) { - XGENE_DMA_DESC_C_SET(desc); /* Coherent IO */ - XGENE_DMA_DESC_IN_SET(desc); - XGENE_DMA_DESC_H0ENQ_NUM_SET(desc, dst_ring_num); - XGENE_DMA_DESC_RTYPE_SET(desc, XGENE_DMA_RING_OWNER_DMA); + desc->m0 |= cpu_to_le64(XGENE_DMA_DESC_IN_BIT); + desc->m0 |= cpu_to_le64((u64)XGENE_DMA_RING_OWNER_DMA << + XGENE_DMA_DESC_RTYPE_POS); + desc->m1 |= cpu_to_le64(XGENE_DMA_DESC_C_BIT); + desc->m3 |= cpu_to_le64((u64)dst_ring_num << + XGENE_DMA_DESC_HOENQ_NUM_POS); } static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, @@ -459,7 +437,7 @@ static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len) { - void *desc1, *desc2; + struct xgene_dma_desc_hw *desc1, *desc2; int i; /* Get 1st descriptor */ @@ -467,23 +445,21 @@ static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num); /* Set destination address */ - XGENE_DMA_DESC_DR_SET(desc1); - XGENE_DMA_DESC_DST_ADDR_SET(desc1, dst); + desc1->m2 |= cpu_to_le64(XGENE_DMA_DESC_DR_BIT); + desc1->m3 |= cpu_to_le64(dst); /* Set 1st source address */ - xgene_dma_set_src_buffer(desc1 + 8, &len, &src); + xgene_dma_set_src_buffer(&desc1->m1, &len, &src); - if (len <= 0) { - desc2 = NULL; - goto skip_additional_src; - } + if (!len) + return; /* * We need to split this source buffer, * and need to use 2nd descriptor */ desc2 = &desc_sw->desc2; - XGENE_DMA_DESC_NV_SET(desc1); + desc1->m0 |= cpu_to_le64(XGENE_DMA_DESC_NV_BIT); /* Set 2nd to 5th source address */ for (i = 0; i < 4 && len; i++) @@ -496,12 +472,6 @@ static void 
xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, /* Updated flag that we have prepared 64B descriptor */ desc_sw->flags |= XGENE_DMA_FLAG_64B_DESC; - -skip_additional_src: - /* Hardware stores descriptor in little endian format */ - xgene_dma_cpu_to_le64(desc1, 4); - if (desc2) - xgene_dma_cpu_to_le64(desc2, 4); } static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan, @@ -510,7 +480,7 @@ static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan, u32 src_cnt, size_t *nbytes, const u8 *scf) { - void *desc1, *desc2; + struct xgene_dma_desc_hw *desc1, *desc2; size_t len = *nbytes; int i; @@ -521,28 +491,24 @@ static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan, xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num); /* Set destination address */ - XGENE_DMA_DESC_DR_SET(desc1); - XGENE_DMA_DESC_DST_ADDR_SET(desc1, *dst); + desc1->m2 |= cpu_to_le64(XGENE_DMA_DESC_DR_BIT); + desc1->m3 |= cpu_to_le64(*dst); /* We have multiple source addresses, so need to set NV bit*/ - XGENE_DMA_DESC_NV_SET(desc1); + desc1->m0 |= cpu_to_le64(XGENE_DMA_DESC_NV_BIT); /* Set flyby opcode */ - XGENE_DMA_DESC_FLYBY_SET(desc1, xgene_dma_encode_xor_flyby(src_cnt)); + desc1->m2 |= cpu_to_le64(xgene_dma_encode_xor_flyby(src_cnt)); /* Set 1st to 5th source addresses */ for (i = 0; i < src_cnt; i++) { len = *nbytes; - xgene_dma_set_src_buffer((i == 0) ? (desc1 + 8) : + xgene_dma_set_src_buffer((i == 0) ? 
&desc1->m1 : xgene_dma_lookup_ext8(desc2, i - 1), &len, &src[i]); - XGENE_DMA_DESC_MULTI_SET(desc1, scf[i], i); + desc1->m2 |= cpu_to_le64((scf[i] << ((i + 1) * 8))); } - /* Hardware stores descriptor in little endian format */ - xgene_dma_cpu_to_le64(desc1, 4); - xgene_dma_cpu_to_le64(desc2, 4); - /* Update meta data */ *nbytes = len; *dst += XGENE_DMA_MAX_BYTE_CNT; @@ -738,7 +704,7 @@ static int xgene_chan_xfer_request(struct xgene_dma_ring *ring, * xgene_chan_xfer_ld_pending - push any pending transactions to hw * @chan : X-Gene DMA channel * - * LOCKING: must hold chan->desc_lock + * LOCKING: must hold chan->lock */ static void xgene_chan_xfer_ld_pending(struct xgene_dma_chan *chan) { @@ -808,7 +774,8 @@ static void xgene_dma_cleanup_descriptors(struct xgene_dma_chan *chan) desc_hw = &ring->desc_hw[ring->head]; /* Check if this descriptor has been completed */ - if (unlikely(XGENE_DMA_DESC_IS_EMPTY(desc_hw))) + if (unlikely(le64_to_cpu(desc_hw->m0) == + XGENE_DMA_DESC_EMPTY_SIGNATURE)) break; if (++ring->head == ring->slots) @@ -842,7 +809,7 @@ static void xgene_dma_cleanup_descriptors(struct xgene_dma_chan *chan) iowrite32(-1, ring->cmd); /* Mark this hw descriptor as processed */ - XGENE_DMA_DESC_SET_EMPTY(desc_hw); + desc_hw->m0 = cpu_to_le64(XGENE_DMA_DESC_EMPTY_SIGNATURE); xgene_dma_run_tx_complete_actions(chan, desc_sw); @@ -889,7 +856,7 @@ static int xgene_dma_alloc_chan_resources(struct dma_chan *dchan) * @chan: X-Gene DMA channel * @list: the list to free * - * LOCKING: must hold chan->desc_lock + * LOCKING: must hold chan->lock */ static void xgene_dma_free_desc_list(struct xgene_dma_chan *chan, struct list_head *list) @@ -900,15 +867,6 @@ static void xgene_dma_free_desc_list(struct xgene_dma_chan *chan, xgene_dma_clean_descriptor(chan, desc); } -static void xgene_dma_free_tx_desc_list(struct xgene_dma_chan *chan, - struct list_head *list) -{ - struct xgene_dma_desc_sw *desc, *_desc; - - list_for_each_entry_safe(desc, _desc, list, node) - 
xgene_dma_clean_descriptor(chan, desc); -} - static void xgene_dma_free_chan_resources(struct dma_chan *dchan) { struct xgene_dma_chan *chan = to_dma_chan(dchan); @@ -985,7 +943,7 @@ fail: if (!first) return NULL; - xgene_dma_free_tx_desc_list(chan, &first->tx_list); + xgene_dma_free_desc_list(chan, &first->tx_list); return NULL; } @@ -1093,7 +1051,7 @@ fail: if (!first) return NULL; - xgene_dma_free_tx_desc_list(chan, &first->tx_list); + xgene_dma_free_desc_list(chan, &first->tx_list); return NULL; } @@ -1141,7 +1099,7 @@ fail: if (!first) return NULL; - xgene_dma_free_tx_desc_list(chan, &first->tx_list); + xgene_dma_free_desc_list(chan, &first->tx_list); return NULL; } @@ -1218,7 +1176,7 @@ fail: if (!first) return NULL; - xgene_dma_free_tx_desc_list(chan, &first->tx_list); + xgene_dma_free_desc_list(chan, &first->tx_list); return NULL; } @@ -1316,7 +1274,6 @@ static void xgene_dma_setup_ring(struct xgene_dma_ring *ring) { void *ring_cfg = ring->state; u64 addr = ring->desc_paddr; - void *desc; u32 i, val; ring->slots = ring->size / XGENE_DMA_RING_WQ_DESC_SIZE; @@ -1358,8 +1315,10 @@ static void xgene_dma_setup_ring(struct xgene_dma_ring *ring) /* Set empty signature to DMA Rx ring descriptors */ for (i = 0; i < ring->slots; i++) { + struct xgene_dma_desc_hw *desc; + desc = &ring->desc_hw[i]; - XGENE_DMA_DESC_SET_EMPTY(desc); + desc->m0 = cpu_to_le64(XGENE_DMA_DESC_EMPTY_SIGNATURE); } /* Enable DMA Rx ring interrupt */ diff --git a/drivers/md/md.c b/drivers/md/md.c index 8d9f89b4519d..df92d30ca054 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2628,13 +2628,14 @@ errors_show(struct md_rdev *rdev, char *page) static ssize_t errors_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&rdev->corrected_errors, n); - return len; - } - return -EINVAL; + unsigned int n; + int rv; + + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + 
atomic_set(&rdev->corrected_errors, n); + return len; } static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); @@ -2651,13 +2652,16 @@ slot_show(struct md_rdev *rdev, char *page) static ssize_t slot_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; + int slot; int err; - int slot = simple_strtoul(buf, &e, 10); + if (strncmp(buf, "none", 4)==0) slot = -1; - else if (e==buf || (*e && *e!= '\n')) - return -EINVAL; + else { + err = kstrtouint(buf, 10, (unsigned int *)&slot); + if (err < 0) + return err; + } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also * updating the 'rd%d' link, and communicating @@ -3542,12 +3546,12 @@ layout_show(struct mddev *mddev, char *page) static ssize_t layout_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; int err; - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) return err; @@ -3591,12 +3595,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; + unsigned int n; int err; - unsigned long n = simple_strtoul(buf, &e, 10); - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtouint(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) @@ -3643,12 +3647,12 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + unsigned long n; int err; - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); - if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = kstrtoul(buf, 10, &n); + if (err < 0) + return err; err = mddev_lock(mddev); if (err) @@ -3686,19 +3690,24 @@ resync_start_show(struct mddev *mddev, char *page) static 
ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + unsigned long long n; int err; - char *e; - unsigned long long n = simple_strtoull(buf, &e, 10); + + if (cmd_match(buf, "none")) + n = MaxSector; + else { + err = kstrtoull(buf, 10, &n); + if (err < 0) + return err; + if (n != (sector_t)n) + return -EINVAL; + } err = mddev_lock(mddev); if (err) return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) err = -EBUSY; - else if (cmd_match(buf, "none")) - n = MaxSector; - else if (!*buf || (*e && *e != '\n')) - err = -EINVAL; if (!err) { mddev->recovery_cp = n; @@ -3934,14 +3943,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) { static ssize_t max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long n = simple_strtoul(buf, &e, 10); + unsigned int n; + int rv; - if (*buf && (*e == 0 || *e == '\n')) { - atomic_set(&mddev->max_corr_read_errors, n); - return len; - } - return -EINVAL; + rv = kstrtouint(buf, 10, &n); + if (rv < 0) + return rv; + atomic_set(&mddev->max_corr_read_errors, n); + return len; } static struct md_sysfs_entry max_corr_read_errors = @@ -4003,8 +4012,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) else rdev = md_import_device(dev, -1, -1); - if (IS_ERR(rdev)) + if (IS_ERR(rdev)) { + mddev_unlock(mddev); return PTR_ERR(rdev); + } err = bind_rdev_to_array(rdev, mddev); out: if (err) @@ -4298,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page) static ssize_t sync_min_store(struct mddev *mddev, const char *buf, size_t len) { - int min; - char *e; + unsigned int min; + int rv; + if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_min = 0; - return len; + min = 0; + } else { + rv = kstrtouint(buf, 10, &min); + if (rv < 0) + return rv; + if (min == 0) + return -EINVAL; } - min = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || min <= 0) - return -EINVAL; mddev->sync_speed_min = 
min; return len; } @@ -4324,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page) static ssize_t sync_max_store(struct mddev *mddev, const char *buf, size_t len) { - int max; - char *e; + unsigned int max; + int rv; + if (strncmp(buf, "system", 6)==0) { - mddev->sync_speed_max = 0; - return len; + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; } - max = simple_strtoul(buf, &e, 10); - if (buf == e || (*e && *e != '\n') || max <= 0) - return -EINVAL; mddev->sync_speed_max = max; return len; } @@ -4515,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old; + unsigned long long old, new; int err; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); @@ -4557,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - char *e; - unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old; + unsigned long long old, new; int err; - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); @@ -4604,11 +4623,13 @@ static ssize_t reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; - char *e; + unsigned long long new; int err; - unsigned long long new = simple_strtoull(buf, &e, 10); - if (buf == e || (*e && *e != '\n')) + err = kstrtoull(buf, 10, &new); + if (err < 0) + return err; + if (new != (sector_t)new) return -EINVAL; err = mddev_lock(mddev); if (err) @@ -5157,6 +5178,7 @@ int md_run(struct mddev *mddev) mddev_detach(mddev); if 
(mddev->private) pers->free(mddev, mddev->private); + mddev->private = NULL; module_put(pers->owner); bitmap_destroy(mddev); return err; @@ -5292,6 +5314,7 @@ static void md_clean(struct mddev *mddev) mddev->changed = 0; mddev->degraded = 0; mddev->safemode = 0; + mddev->private = NULL; mddev->merge_check_needed = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; @@ -5364,6 +5387,7 @@ static void __md_stop(struct mddev *mddev) mddev->pers = NULL; spin_unlock(&mddev->lock); pers->free(mddev, mddev->private); + mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); @@ -6373,7 +6397,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->ctime != info->ctime || mddev->level != info->level || /* mddev->layout != info->layout || */ - !mddev->persistent != info->not_persistent|| + mddev->persistent != !info->not_persistent || mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) @@ -8104,6 +8128,15 @@ void md_check_recovery(struct mddev *mddev) int spares = 0; if (mddev->ro) { + struct md_rdev *rdev; + if (!mddev->external && mddev->in_sync) + /* 'Blocked' flag not needed as failed devices + * will be recorded if array switched to read/write. + * Leaving it set will prevent the device + * from being removed. 
+ */ + rdev_for_each(rdev, mddev) + clear_bit(Blocked, &rdev->flags); /* On a read-only array we can: * - remove failed devices * - add already-in_sync devices if the array itself @@ -9011,13 +9044,7 @@ static int get_ro(char *buffer, struct kernel_param *kp) } static int set_ro(const char *val, struct kernel_param *kp) { - char *e; - int num = simple_strtoul(val, &e, 10); - if (*val && (*e == '\0' || *e == '\n')) { - start_readonly = num; - return 0; - } - return -EINVAL; + return kstrtouint(val, 10, (unsigned int *)&start_readonly); } module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 188d8e9a6bdc..940f2f365461 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2099,17 +2099,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; - - for (j=0; j < vcnt ; j++) { - tbio->bi_io_vec[j].bv_offset = 0; - tbio->bi_io_vec[j].bv_len = PAGE_SIZE; - - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } tbio->bi_end_io = end_sync_write; + bio_copy_data(tbio, fbio); + d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); @@ -2124,17 +2117,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * that are active */ for (i = 0; i < conf->copies; i++) { - int j, d; + int d; tbio = r10_bio->devs[i].repl_bio; if (!tbio || !tbio->bi_end_io) continue; if (r10_bio->devs[i].bio->bi_end_io != end_sync_write && r10_bio->devs[i].bio != fbio) - for (j = 0; j < vcnt; j++) - memcpy(page_address(tbio->bi_io_vec[j].bv_page), - page_address(fbio->bi_io_vec[j].bv_page), - PAGE_SIZE); + bio_copy_data(tbio, fbio); d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, diff --git 
a/drivers/md/raid5.c b/drivers/md/raid5.c index b6793d2e051f..59e44e99eef3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf, int hash) { int size; - bool do_wakeup = false; + unsigned long do_wakeup = 0; + int i = 0; unsigned long flags; if (hash == NR_STRIPE_HASH_LOCKS) { @@ -365,15 +366,21 @@ static void release_inactive_stripe_list(struct r5conf *conf, !list_empty(list)) atomic_dec(&conf->empty_inactive_list_nr); list_splice_tail_init(list, conf->inactive_list + hash); - do_wakeup = true; + do_wakeup |= 1 << hash; spin_unlock_irqrestore(conf->hash_locks + hash, flags); } size--; hash--; } + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + if (do_wakeup & (1 << i)) + wake_up(&conf->wait_for_stripe[i]); + } + if (do_wakeup) { - wake_up(&conf->wait_for_stripe); + if (atomic_read(&conf->active_stripes) == 0) + wake_up(&conf->wait_for_quiescent); if (conf->retry_read_aligned) md_wakeup_thread(conf->mddev->thread); } @@ -667,15 +674,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, spin_lock_irq(conf->hash_locks + hash); do { - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { sh = get_free_stripe(conf, hash); - if (!sh && llist_empty(&conf->released_stripes) && - !test_bit(R5_DID_ALLOC, &conf->cache_state)) + if (!sh && !test_bit(R5_DID_ALLOC, + &conf->cache_state)) set_bit(R5_ALLOC_MORE, &conf->cache_state); } @@ -684,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); - wait_event_lock_irq( - conf->wait_for_stripe, + wait_event_exclusive_cmd( + conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < 
(conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), - *(conf->hash_locks + hash)); + spin_unlock_irq(conf->hash_locks + hash), + spin_lock_irq(conf->hash_locks + hash)); clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); } else { @@ -716,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector, } } while (sh == NULL); + if (!list_empty(conf->inactive_list + hash)) + wake_up(&conf->wait_for_stripe[hash]); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -2177,7 +2188,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { lock_device_hash_lock(conf, hash); - wait_event_cmd(conf->wait_for_stripe, + wait_event_exclusive_cmd(conf->wait_for_stripe[hash], !list_empty(conf->inactive_list + hash), unlock_device_hash_lock(conf, hash), lock_device_hash_lock(conf, hash)); @@ -4760,7 +4771,7 @@ static void raid5_align_endio(struct bio *bi, int error) raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return; } @@ -4855,7 +4866,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_iter.bi_sector += rdev->data_offset; spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, conf->device_lock); atomic_inc(&conf->active_aligned_reads); @@ -5699,7 +5710,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) bio_endio(raid_bio, 0); } if (atomic_dec_and_test(&conf->active_aligned_reads)) - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); return handled; } @@ -6433,7 +6444,10 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); - init_waitqueue_head(&conf->wait_for_stripe); + 
init_waitqueue_head(&conf->wait_for_quiescent); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { + init_waitqueue_head(&conf->wait_for_stripe[i]); + } init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->hold_list); @@ -7466,7 +7480,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) * active stripes can drain */ conf->quiesce = 2; - wait_event_cmd(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, unlock_all_device_hash_locks_irq(conf), @@ -7480,7 +7494,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) case 0: /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; - wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); break; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 896d603ad0da..02c3bf8fbfe7 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -511,7 +511,8 @@ struct r5conf { struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; - wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_quiescent; + wait_queue_head_t wait_for_stripe[NR_STRIPE_HASH_LOCKS]; wait_queue_head_t wait_for_overlap; unsigned long cache_state; #define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig new file mode 100644 index 000000000000..72226acb5c0f --- /dev/null +++ b/drivers/nvdimm/Kconfig @@ -0,0 +1,68 @@ +menuconfig LIBNVDIMM + tristate "NVDIMM (Non-Volatile Memory Device) Support" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + help + Generic support for non-volatile memory devices including + ACPI-6-NFIT defined resources. 
On platforms that define an + NFIT, or otherwise can discover NVDIMM resources, a libnvdimm + bus is registered to advertise PMEM (persistent memory) + namespaces (/dev/pmemX) and BLK (sliding mmio window(s)) + namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a + memory resource that may span multiple DIMMs and support DAX + (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control + region which exposes an mmio register set for windowed access + mode to non-volatile memory. + +if LIBNVDIMM + +config BLK_DEV_PMEM + tristate "PMEM: Persistent memory block device support" + default LIBNVDIMM + depends on HAS_IOMEM + select ND_BTT if BTT + help + Memory ranges for PMEM are described by either an NFIT + (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a + non-standard OEM-specific E820 memory type (type-12, see + CONFIG_X86_PMEM_LEGACY), or it is manually specified by the + 'memmap=nn[KMG]!ss[KMG]' kernel command line (see + Documentation/kernel-parameters.txt). This driver converts + these persistent memory ranges into block devices that are + capable of DAX (direct-access) file system mappings. See + Documentation/nvdimm/nvdimm.txt for more details. + + Say Y if you want to use an NVDIMM + +config ND_BLK + tristate "BLK: Block data window (aperture) device support" + default LIBNVDIMM + select ND_BTT if BTT + help + Support NVDIMMs, or other devices, that implement a BLK-mode + access capability. BLK-mode access uses memory-mapped-i/o + apertures to access persistent media. + + Say Y if your platform firmware emits an ACPI.NFIT table + (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode + capabilities. 
+ +config ND_BTT + tristate + +config BTT + bool "BTT: Block Translation Table (atomic sector updates)" + default y if LIBNVDIMM + help + The Block Translation Table (BTT) provides atomic sector + update semantics for persistent memory devices, so that + applications that rely on sector writes not being torn (a + guarantee that typical disks provide) can continue to do so. + The BTT manifests itself as an alternate personality for an + NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX, + ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys, + etc...). + + Select Y if unsure + +endif diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile new file mode 100644 index 000000000000..594bb97c867a --- /dev/null +++ b/drivers/nvdimm/Makefile @@ -0,0 +1,20 @@ +obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o +obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o + +nd_pmem-y := pmem.o + +nd_btt-y := btt.o + +nd_blk-y := blk.o + +libnvdimm-y := core.o +libnvdimm-y += bus.o +libnvdimm-y += dimm_devs.o +libnvdimm-y += dimm.o +libnvdimm-y += region_devs.o +libnvdimm-y += region.o +libnvdimm-y += namespace_devs.o +libnvdimm-y += label.o +libnvdimm-$(CONFIG_BTT) += btt_devs.o diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c new file mode 100644 index 000000000000..4f97b248c236 --- /dev/null +++ b/drivers/nvdimm/blk.c @@ -0,0 +1,384 @@ +/* + * NVDIMM Block Window Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/nd.h> +#include <linux/sizes.h> +#include "nd.h" + +struct nd_blk_device { + struct request_queue *queue; + struct gendisk *disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + size_t disk_size; + u32 sector_size; + u32 internal_lbasize; +}; + +static int nd_blk_major; + +static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev) +{ + return blk_dev->nsblk->lbasize - blk_dev->sector_size; +} + +static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, + resource_size_t ns_offset, unsigned int len) +{ + int i; + + for (i = 0; i < nsblk->num_resources; i++) { + if (ns_offset < resource_size(nsblk->res[i])) { + if (ns_offset + len > resource_size(nsblk->res[i])) { + dev_WARN_ONCE(&nsblk->common.dev, 1, + "illegal request\n"); + return SIZE_MAX; + } + return nsblk->res[i]->start + ns_offset; + } + ns_offset -= resource_size(nsblk->res[i]); + } + + dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n"); + return SIZE_MAX; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + unsigned int len = nd_blk_meta_size(blk_dev); + resource_size_t dev_offset, ns_offset; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + int err = 0; + + nsblk = blk_dev->nsblk; + ndbr = blk_dev->ndbr; + ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size; + dev_offset = to_dev_offset(nsblk, ns_offset, len); + if (dev_offset == SIZE_MAX) + return -EIO; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *iobuf; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, 
bv.bv_len); + iobuf = kmap_atomic(bv.bv_page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset, + cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + len -= cur_len; + dev_offset += cur_len; + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); + } + + return err; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + return 0; +} +#endif + +static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset, ns_offset; + int err = 0; + void *iobuf; + u64 lba; + + while (len) { + unsigned int cur_len; + + /* + * If we don't have an integrity payload, we don't have to + * split the bvec into sectors, as this would cause unnecessary + * Block Window setup/move steps. the do_io routine is capable + * of handling len <= PAGE_SIZE. + */ + cur_len = bip ? 
min(len, blk_dev->sector_size) : len; + + lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size); + ns_offset = lba * blk_dev->internal_lbasize; + dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len); + if (dev_offset == SIZE_MAX) + return -EIO; + + iobuf = kmap_atomic(page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + if (bip) { + err = nd_blk_rw_integrity(blk_dev, bip, lba, rw); + if (err) + return err; + } + len -= cur_len; + off += cur_len; + sector += blk_dev->sector_size >> SECTOR_SHIFT; + } + + return err; +} + +static void nd_blk_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + struct bio_integrity_payload *bip; + struct nd_blk_device *blk_dev; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0, rw; + bool do_acct; + + /* + * bio_integrity_enabled also checks if the bio already has an + * integrity payload attached. If it does, we *don't* do a + * bio_integrity_prep here - the payload has been generated by + * another kernel subsystem, and we just pass it through. + */ + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + err = -EIO; + goto out; + } + + bip = bio_integrity(bio); + blk_dev = disk->private_data; + rw = bio_data_dir(bio); + do_acct = nd_iostat_start(bio, &start); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len, + bvec.bv_offset, rw, iter.bi_sector); + if (err) { + dev_info(&blk_dev->nsblk->common.dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? 
"READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + break; + } + } + if (do_acct) + nd_iostat_end(bio, start); + + out: + bio_endio(bio, err); +} + +static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *iobuf, size_t n, int rw) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim); + struct nd_namespace_blk *nsblk = blk_dev->nsblk; + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset; + + dev_offset = to_dev_offset(nsblk, offset, n); + + if (unlikely(offset + n > blk_dev->disk_size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (dev_offset == SIZE_MAX) + return -EIO; + + return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw); +} + +static const struct block_device_operations nd_blk_fops = { + .owner = THIS_MODULE, + .revalidate_disk = nvdimm_revalidate_disk, +}; + +static int nd_blk_attach_disk(struct nd_namespace_common *ndns, + struct nd_blk_device *blk_dev) +{ + resource_size_t available_disk_size; + struct gendisk *disk; + u64 internal_nlba; + + internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize); + available_disk_size = internal_nlba * blk_dev->sector_size; + + blk_dev->queue = blk_alloc_queue(GFP_KERNEL); + if (!blk_dev->queue) + return -ENOMEM; + + blk_queue_make_request(blk_dev->queue, nd_blk_make_request); + blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX); + blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY); + blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue); + + disk = blk_dev->disk = alloc_disk(0); + if (!disk) { + blk_cleanup_queue(blk_dev->queue); + return -ENOMEM; + } + + disk->driverfs_dev = &ndns->dev; + disk->major = nd_blk_major; + disk->first_minor = 0; + disk->fops = &nd_blk_fops; + disk->private_data = blk_dev; + disk->queue = blk_dev->queue; + disk->flags = GENHD_FL_EXT_DEVT; + nvdimm_namespace_disk_name(ndns, 
disk->disk_name); + set_capacity(disk, 0); + add_disk(disk); + + if (nd_blk_meta_size(blk_dev)) { + int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev)); + + if (rc) { + del_gendisk(disk); + put_disk(disk); + blk_cleanup_queue(blk_dev->queue); + return rc; + } + } + + set_capacity(disk, available_disk_size >> SECTOR_SHIFT); + revalidate_disk(disk); + return 0; +} + +static int nd_blk_probe(struct device *dev) +{ + struct nd_namespace_common *ndns; + struct nd_namespace_blk *nsblk; + struct nd_blk_device *blk_dev; + int rc; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL); + if (!blk_dev) + return -ENOMEM; + + nsblk = to_nd_namespace_blk(&ndns->dev); + blk_dev->disk_size = nvdimm_namespace_capacity(ndns); + blk_dev->ndbr = to_nd_blk_region(dev->parent); + blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); + blk_dev->internal_lbasize = roundup(nsblk->lbasize, + INT_LBASIZE_ALIGNMENT); + blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 
4096 : 512); + dev_set_drvdata(dev, blk_dev); + + ndns->rw_bytes = nd_blk_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, blk_dev) == 0) { + /* we'll come back as btt-blk */ + rc = -ENXIO; + } else + rc = nd_blk_attach_disk(ndns, blk_dev); + if (rc) + kfree(blk_dev); + return rc; +} + +static void nd_blk_detach_disk(struct nd_blk_device *blk_dev) +{ + del_gendisk(blk_dev->disk); + put_disk(blk_dev->disk); + blk_cleanup_queue(blk_dev->queue); +} + +static int nd_blk_remove(struct device *dev) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + else + nd_blk_detach_disk(blk_dev); + kfree(blk_dev); + + return 0; +} + +static struct nd_device_driver nd_blk_driver = { + .probe = nd_blk_probe, + .remove = nd_blk_remove, + .drv = { + .name = "nd_blk", + }, + .type = ND_DRIVER_NAMESPACE_BLK, +}; + +static int __init nd_blk_init(void) +{ + int rc; + + rc = register_blkdev(0, "nd_blk"); + if (rc < 0) + return rc; + + nd_blk_major = rc; + rc = nd_driver_register(&nd_blk_driver); + + if (rc < 0) + unregister_blkdev(nd_blk_major, "nd_blk"); + + return rc; +} + +static void __exit nd_blk_exit(void) +{ + driver_unregister(&nd_blk_driver.drv); + unregister_blkdev(nd_blk_major, "nd_blk"); +} + +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK); +module_init(nd_blk_init); +module_exit(nd_blk_exit); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c new file mode 100644 index 000000000000..411c7b2bb37a --- /dev/null +++ b/drivers/nvdimm/btt.c @@ -0,0 +1,1479 @@ +/* + * Block Translation Table + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/hdreg.h> +#include <linux/genhd.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/fs.h> +#include <linux/nd.h> +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static int btt_major; + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_read_bytes(ndns, offset, buf, n); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_write_bytes(ndns, offset, buf, n); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb)); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + WARN_ON(!super); + return arena_read_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int 
__btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. + */ + mapping &= MAP_LBA_MASK; + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + WARN_ONCE(1, "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT; + e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT; + ze = (z_flag << 1) + e_flag; + postmap = raw_mapping & MAP_LBA_MASK; + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. 
Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_read_pair(struct arena_info *arena, u32 lane, + struct log_entry *ent) +{ + WARN_ON(!ent); + return arena_read_bytes(arena, + arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, + 2 * LOG_ENT_SIZE); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, &a->flags); +} + +static void 
btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * If ent[0].seq is 0 (the first time the pair is seen), it sets + * ent[0].seq to 1 in the caller's copy and reports slot 0. + * Otherwise, it returns which of the entries was the older one, or + * -EINVAL if the two sequence numbers are equal or sum to more than 5. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct log_entry *ent) +{ + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (ent[0].seq == 0) { + ent[0].seq = cpu_to_le32(1); + return 0; + } + + if (ent[0].seq == ent[1].seq) + return -EINVAL; + if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) + return -EINVAL; + + if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { + if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) + old = 0; + else + old = 1; + } else { + if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. 
+ */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_entry log[2]; + + ret = btt_log_read_pair(arena, lane, log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(log); + if (old_ent < 0 || old_ent > 1) { + dev_info(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log[0].seq, log[1].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent) +{ + int ret; + /* + * Ignore the padding in log_entry for calculating log_half. + * The entry is 'committed' when we write the sequence number, + * and we want to ensure that that is the last thing written. 
+ * We don't bother writing the padding as that would be extra + * media wear and write amplification + */ + unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; + u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); + void *src = ent; + + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + arena->freelist[lane].block = le32_to_cpu(ent->old_map); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + int ret; + u32 i; + struct log_entry log, zerolog; + + memset(&zerolog, 0, sizeof(zerolog)); + + for (i = 0; i < arena->nfree; i++) { + log.lba = cpu_to_le32(i); + log.old_map = 
cpu_to_le32(arena->external_nlba + i); + log.new_map = cpu_to_le32(arena->external_nlba + i); + log.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &log); + if (ret) + return ret; + ret = __btt_log_write(arena, i, 1, &zerolog); + if (ret) + return ret; + } + + return 0; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int old, new, ret; + u32 i, map_entry; + struct log_entry log_new, log_old; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT); + if (old < 0) + return old; + + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = le32_to_cpu(log_new.old_map); + + /* This implies a newly created or untouched flog entry */ + if (log_new.old_map == log_new.new_map) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL); + if (ret) + return ret; + if ((le32_to_cpu(log_new.new_map) != map_entry) && + (le32_to_cpu(log_new.old_map) == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. 
+ */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0); + if (ret) + return ret; + } + + } + + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = 1; + arena->version_minor = 1; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), + BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + 
arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function checks if the metadata layout is valid and error free + */ +static int arena_is_valid(struct arena_info *arena, struct btt_sb *super, + u8 *uuid, u32 lbasize) +{ + u64 checksum; + + if (memcmp(super->uuid, uuid, 16)) + return 0; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_btt_sb_checksum(super)) + return 0; + super->checksum = cpu_to_le64(checksum); + + if (lbasize != le32_to_cpu(super->external_lbasize)) + return 0; + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(to_dev(arena), "Found arena with an error flag\n"); + + return 1; +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 
0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) : + (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!arena_is_valid(arena, super, btt->nd_btt->uuid, + btt->lbasize)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_info(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + 
return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { + free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. + */ +static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid) +{ + int ret; + struct btt_sb *super; + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + memcpy(super->uuid, uuid, 16); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = 
cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + super->checksum = cpu_to_le64(nd_btt_sb_checksum(super)); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena, btt->nd_btt->uuid); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +static u32 btt_meta_size(struct btt *btt) +{ + return btt->lbasize - btt->sector_size; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. 
+ */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int btt_rw_integrity(struct 
btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + unsigned int len = btt_meta_size(btt); + u64 meta_nsoff; + int ret = 0; + + if (bip == NULL) + return 0; + + meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *mem; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, bv.bv_len); + mem = kmap_atomic(bv.bv_page); + if (rw) + ret = arena_write_bytes(arena, meta_nsoff, + mem + bv.bv_offset, cur_len); + else + ret = arena_read_bytes(arena, meta_nsoff, + mem + bv.bv_offset, cur_len); + + kunmap_atomic(mem); + if (ret) + return ret; + + len -= cur_len; + meta_nsoff += cur_len; + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); + } + + return ret; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + return 0; +} +#endif + +static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int off, sector_t sector, + unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. 
+ */ + while (1) { + u32 new_map; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &t_flag, + &e_flag); + if (ret) + goto out_rtt; + + if (postmap == new_map) + break; + + postmap = new_map; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) + goto out_rtt; + + if (bip) { + ret = btt_rw_integrity(btt, bip, arena, postmap, READ); + if (ret) + goto out_rtt; + } + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, + sector_t sector, struct page *page, unsigned int off, + unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } + + ret = btt_data_write(arena, new_postmap, page, off, cur_len); + if (ret) + goto out_lane; + + if (bip) { + ret = 
btt_rw_integrity(btt, bip, arena, new_postmap, + WRITE); + if (ret) + goto out_lane; + } + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int len, unsigned int off, + int rw, sector_t sector) +{ + int ret; + + if (rw == READ) { + ret = btt_read_pg(btt, bip, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, bip, sector, page, off, len); + } + + return ret; +} + +static void btt_make_request(struct request_queue *q, struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct btt *btt = q->queuedata; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0, rw; + bool do_acct; + + /* + * bio_integrity_enabled also checks if the bio already has an + * integrity payload attached. If it does, we *don't* do a + * bio_integrity_prep here - the payload has been generated by + * another kernel subsystem, and we just pass it through. 
+ */ + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + err = -EIO; + goto out; + } + + do_acct = nd_iostat_start(bio, &start); + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + /* Make sure len is in multiples of sector size. */ + /* XXX is this right? */ + BUG_ON(len < btt->sector_size); + BUG_ON(len % btt->sector_size); + + err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, + rw, iter.bi_sector); + if (err) { + dev_info(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? "READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + break; + } + } + if (do_acct) + nd_iostat_end(bio, start); + +out: + bio_endio(bio, err); +} + +static int btt_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct btt *btt = bdev->bd_disk->private_data; + + btt_do_bvec(btt, NULL, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, 0); + return 0; +} + + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .rw_page = btt_rw_page, + .getgeo = btt_getgeo, + .revalidate_disk = nvdimm_revalidate_disk, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* create a new disk and request queue for btt */ + btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + if (!btt->btt_queue) + return -ENOMEM; + + btt->btt_disk = alloc_disk(0); + if (!btt->btt_disk) { + blk_cleanup_queue(btt->btt_queue); + return -ENOMEM; + } + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->driverfs_dev = &btt->nd_btt->dev; + btt->btt_disk->major = btt_major; + 
btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + btt->btt_disk->queue = btt->btt_queue; + btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + + blk_queue_make_request(btt->btt_queue, btt_make_request); + blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); + blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); + btt->btt_queue->queuedata = btt; + + set_capacity(btt->btt_disk, 0); + add_disk(btt->btt_disk); + if (btt_meta_size(btt)) { + int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); + + if (rc) { + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); + return rc; + } + } + set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); + revalidate_disk(btt->btt_disk); + + return 0; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + blk_integrity_unregister(btt->btt_disk); + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. 
+ */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, u8 *uuid, struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct device *dev = &nd_btt->dev; + + btt = kzalloc(sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + goto out_free; + } + + if (btt->init_state != INIT_READY && nd_region->ro) { + dev_info(dev, "%s is read-only, unable to init btt metadata\n", + dev_name(&nd_region->dev)); + goto out_free; + } else if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + goto out_free; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + goto out_free; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + goto out_free; + } + + btt_debugfs_init(btt); + + return btt; + + out_free: + kfree(btt); + return NULL; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. 
+ */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + kfree(btt); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt *btt; + size_t rawsize; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) + return -ENODEV; + + rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K; + if (rawsize < ARENA_MIN_SIZE) { + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + + btt_major = register_blkdev(0, "btt"); + if (btt_major < 0) + return btt_major; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) { + rc = -ENXIO; + goto err_debugfs; + } + + return 0; + + err_debugfs: + unregister_blkdev(btt_major, "btt"); + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); + unregister_blkdev(btt_major, "btt"); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h new file mode 100644 index 000000000000..75b0d80a6bd9 --- /dev/null +++ b/drivers/nvdimm/btt.h @@ -0,0 +1,185 @@ +/* + * Block Translation Table library + * Copyright (c) 
2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_BTT_H +#define _LINUX_BTT_H + +#include <linux/types.h> + +#define BTT_SIG_LEN 16 +#define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; + __le64 padding[2]; +}; + +struct btt_sb { + u8 signature[BTT_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le32 external_lbasize; + __le32 external_nlba; + __le32 internal_lbasize; + __le32 internal_nlba; + __le32 nfree; + __le32 infosize; + __le64 nextoff; + __le64 dataoff; + __le64 mapoff; + __le64 logoff; + __le64 info2off; + u8 padding[3968]; + __le64 checksum; +}; + +struct free_entry { + u32 block; + u8 sub; + u8 seq; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 
cacheline_padding[L1_CACHE_BYTES]; + }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. + * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. 
+ */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; +}; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @btt_queue: Pointer to the request queue for the BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. 
+ * @sector_size: The Linux sector size - 512 or 4096 + * @lanes: Per-lane spinlocks + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + */ +struct btt { + struct gendisk *btt_disk; + struct request_queue *btt_queue; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; +}; +#endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c new file mode 100644 index 000000000000..6ac8c0fea3ec --- /dev/null +++ b/drivers/nvdimm/btt_devs.c @@ -0,0 +1,425 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/genhd.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "btt.h" +#include "nd.h" + +static void __nd_btt_detach_ndns(struct nd_btt *nd_btt) +{ + struct nd_namespace_common *ndns = nd_btt->ndns; + + dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex) + || ndns->claim != &nd_btt->dev, + "%s: invalid claim\n", __func__); + ndns->claim = NULL; + nd_btt->ndns = NULL; + put_device(&ndns->dev); +} + +static void nd_btt_detach_ndns(struct nd_btt *nd_btt) +{ + struct nd_namespace_common *ndns = nd_btt->ndns; + + if (!ndns) + return; + get_device(&ndns->dev); + device_lock(&ndns->dev); + __nd_btt_detach_ndns(nd_btt); + device_unlock(&ndns->dev); + put_device(&ndns->dev); +} + +static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns) +{ + if (ndns->claim) + return false; + dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex) + || nd_btt->ndns, + "%s: invalid claim\n", __func__); + ndns->claim = &nd_btt->dev; + nd_btt->ndns = ndns; + get_device(&ndns->dev); + return true; +} + +static bool nd_btt_attach_ndns(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns) +{ + bool claimed; + + device_lock(&ndns->dev); + claimed = __nd_btt_attach_ndns(nd_btt, ndns); + device_unlock(&ndns->dev); + return claimed; +} + +static void nd_btt_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + dev_dbg(dev, "%s\n", __func__); + nd_btt_detach_ndns(nd_btt); + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + kfree(nd_btt->uuid); + kfree(nd_btt); +} + +static struct device_type nd_btt_device_type = { + .name = "nd_btt", + .release = nd_btt_release, +}; + +bool is_nd_btt(struct device *dev) +{ + return dev->type == &nd_btt_device_type; +} +EXPORT_SYMBOL(is_nd_btt); + +struct nd_btt 
*to_nd_btt(struct device *dev) +{ + struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev); + + WARN_ON(!is_nd_btt(dev)); + return nd_btt; +} +EXPORT_SYMBOL(to_nd_btt); + +static const unsigned long btt_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize, + btt_lbasize_supported); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_btt->uuid) + return sprintf(buf, "%pUb\n", nd_btt->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_btt->ndns + ? 
dev_name(&nd_btt->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static int namespace_match(struct device *dev, void *data) +{ + char *name = data; + + return strcmp(name, dev_name(dev)) == 0; +} + +static bool is_nd_btt_idle(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver) + return false; + return true; +} + +static ssize_t __namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + struct nd_namespace_common *ndns; + struct device *found; + char *name; + + if (dev->driver) { + dev_dbg(dev, "%s: -EBUSY\n", __func__); + return -EBUSY; + } + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + strim(name); + + if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0) + /* pass */; + else { + len = -EINVAL; + goto out; + } + + ndns = nd_btt->ndns; + if (strcmp(name, "") == 0) { + /* detach the namespace and destroy / reset the btt device */ + nd_btt_detach_ndns(nd_btt); + if (is_nd_btt_idle(dev)) + nd_device_unregister(dev, ND_ASYNC); + else { + nd_btt->lbasize = 0; + kfree(nd_btt->uuid); + nd_btt->uuid = NULL; + } + goto out; + } else if (ndns) { + dev_dbg(dev, "namespace already set to: %s\n", + dev_name(&ndns->dev)); + len = -EBUSY; + goto out; + } + + found = device_find_child(dev->parent, name, namespace_match); + if (!found) { + dev_dbg(dev, "'%s' not found under %s\n", name, + dev_name(dev->parent)); + len = -ENODEV; + goto out; + } + + ndns = to_ndns(found); + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { + dev_dbg(dev, "%s too small to host btt\n", name); + len = -ENXIO; + goto out_attach; + } + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev)); + if (!nd_btt_attach_ndns(nd_btt, ndns)) { + dev_dbg(dev, "%s already claimed\n", + dev_name(&ndns->dev)); + len = -EBUSY; + } + + 
out_attach: + put_device(&ndns->dev); /* from device_find_child */ + out: + kfree(name); + return len; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + ssize_t rc; + + nvdimm_bus_lock(dev); + device_lock(dev); + rc = __namespace_store(dev, attr, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static struct attribute *nd_btt_attributes[] = { + &dev_attr_sector_size.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + NULL, +}; + +static struct attribute_group nd_btt_attribute_group = { + .attrs = nd_btt_attributes, +}; + +static const struct attribute_group *nd_btt_attribute_groups[] = { + &nd_btt_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static struct device *__nd_btt_create(struct nd_region *nd_region, + unsigned long lbasize, u8 *uuid, + struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt; + struct device *dev; + + nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL); + if (!nd_btt) + return NULL; + + nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL); + if (nd_btt->id < 0) { + kfree(nd_btt); + return NULL; + } + + nd_btt->lbasize = lbasize; + if (uuid) + uuid = kmemdup(uuid, 16, GFP_KERNEL); + nd_btt->uuid = uuid; + dev = &nd_btt->dev; + dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); + dev->parent = &nd_region->dev; + dev->type = &nd_btt_device_type; + dev->groups = nd_btt_attribute_groups; + device_initialize(&nd_btt->dev); + if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) { + dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n", + __func__, dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +struct device *nd_btt_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_btt_create(nd_region, 0, 
NULL, NULL); + + if (dev) + __nd_device_register(dev); + return dev; +} + +/* + * nd_btt_sb_checksum: compute checksum for btt info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). + */ +u64 nd_btt_sb_checksum(struct btt_sb *btt_sb) +{ + u64 sum; + __le64 sum_save; + + sum_save = btt_sb->checksum; + btt_sb->checksum = 0; + sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1); + btt_sb->checksum = sum_save; + return sum; +} +EXPORT_SYMBOL(nd_btt_sb_checksum); + +static int __nd_btt_probe(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns, struct btt_sb *btt_sb) +{ + u64 checksum; + + if (!btt_sb || !ndns || !nd_btt) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb))) + return -ENXIO; + + if (nvdimm_namespace_capacity(ndns) < SZ_16M) + return -ENXIO; + + if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0) + return -ENODEV; + + checksum = le64_to_cpu(btt_sb->checksum); + btt_sb->checksum = 0; + if (checksum != nd_btt_sb_checksum(btt_sb)) + return -ENODEV; + btt_sb->checksum = cpu_to_le64(checksum); + + nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); + nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL); + if (!nd_btt->uuid) + return -ENOMEM; + + __nd_device_register(&nd_btt->dev); + + return 0; +} + +int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + int rc; + struct device *dev; + struct btt_sb *btt_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + nvdimm_bus_lock(&ndns->dev); + dev = __nd_btt_create(nd_region, 0, NULL, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dev) + return -ENOMEM; + dev_set_drvdata(dev, drvdata); + btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL); + rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb); + kfree(btt_sb); + dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__, + rc == 0 ? 
dev_name(dev) : "<none>"); + if (rc < 0) { + __nd_btt_detach_ndns(to_nd_btt(dev)); + put_device(dev); + } + + return rc; +} +EXPORT_SYMBOL(nd_btt_probe); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c new file mode 100644 index 000000000000..8eb22c0ca7ce --- /dev/null +++ b/drivers/nvdimm/bus.c @@ -0,0 +1,730 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/fcntl.h> +#include <linux/async.h> +#include <linux/genhd.h> +#include <linux/ndctl.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +int nvdimm_major; +static int nvdimm_bus_major; +static struct class *nd_class; + +static int to_nd_device_type(struct device *dev) +{ + if (is_nvdimm(dev)) + return ND_DEVICE_DIMM; + else if (is_nd_pmem(dev)) + return ND_DEVICE_REGION_PMEM; + else if (is_nd_blk(dev)) + return ND_DEVICE_REGION_BLK; + else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent)) + return nd_region_to_nstype(to_nd_region(dev->parent)); + + return 0; +} + +static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + /* + * Ensure that region devices always have their numa node set as + * early as possible. 
+ */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); + return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, + to_nd_device_type(dev)); +} + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(drv); + + return test_bit(to_nd_device_type(dev), &nd_drv->type); +} + +static struct module *to_bus_provider(struct device *dev) +{ + /* pin bus providers while regions are enabled */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + return nvdimm_bus->module; + } + return NULL; +} + +static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + nvdimm_bus->probe_active++; + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) + wake_up(&nvdimm_bus->probe_wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static int nvdimm_bus_probe(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + if (!try_module_get(provider)) + return -ENXIO; + + nvdimm_bus_probe_start(nvdimm_bus); + rc = nd_drv->probe(dev); + if (rc == 0) + nd_region_probe_success(nvdimm_bus, dev); + else + nd_region_disable(nvdimm_bus, dev); + nvdimm_bus_probe_end(nvdimm_bus); + + dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + + if (rc != 0) + module_put(provider); + return rc; +} + +static int nvdimm_bus_remove(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + rc = 
nd_drv->remove(dev); + nd_region_disable(nvdimm_bus, dev); + + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + module_put(provider); + return rc; +} + +static struct bus_type nvdimm_bus_type = { + .name = "nd", + .uevent = nvdimm_bus_uevent, + .match = nvdimm_bus_match, + .probe = nvdimm_bus_probe, + .remove = nvdimm_bus_remove, +}; + +static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain); + +void nd_synchronize(void) +{ + async_synchronize_full_domain(&nd_async_domain); +} +EXPORT_SYMBOL_GPL(nd_synchronize); + +static void nd_async_device_register(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + if (device_add(dev) != 0) { + dev_err(dev, "%s: failed\n", __func__); + put_device(dev); + } + put_device(dev); +} + +static void nd_async_device_unregister(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + /* flush bus operations before delete */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + + device_unregister(dev); + put_device(dev); +} + +void __nd_device_register(struct device *dev) +{ + dev->bus = &nvdimm_bus_type; + get_device(dev); + async_schedule_domain(nd_async_device_register, dev, + &nd_async_domain); +} + +void nd_device_register(struct device *dev) +{ + device_initialize(dev); + __nd_device_register(dev); +} +EXPORT_SYMBOL(nd_device_register); + +void nd_device_unregister(struct device *dev, enum nd_async_mode mode) +{ + switch (mode) { + case ND_ASYNC: + get_device(dev); + async_schedule_domain(nd_async_device_unregister, dev, + &nd_async_domain); + break; + case ND_SYNC: + nd_synchronize(); + device_unregister(dev); + break; + } +} +EXPORT_SYMBOL(nd_device_unregister); + +/** + * __nd_driver_register() - register a region or a namespace driver + * @nd_drv: driver to register + * @owner: automatically set by nd_driver_register() macro + * @mod_name: automatically set by nd_driver_register() macro + */ +int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, + 
const char *mod_name) +{ + struct device_driver *drv = &nd_drv->drv; + + if (!nd_drv->type) { + pr_debug("driver type bitmask not set (%pf)\n", + __builtin_return_address(0)); + return -EINVAL; + } + + if (!nd_drv->probe || !nd_drv->remove) { + pr_debug("->probe() and ->remove() must be specified\n"); + return -EINVAL; + } + + drv->bus = &nvdimm_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL(__nd_driver_register); + +int nvdimm_revalidate_disk(struct gendisk *disk) +{ + struct device *dev = disk->driverfs_dev; + struct nd_region *nd_region = to_nd_region(dev->parent); + const char *pol = nd_region->ro ? "only" : "write"; + + if (nd_region->ro == get_disk_ro(disk)) + return 0; + + dev_info(dev, "%s read-%s, marking %s read-%s\n", + dev_name(&nd_region->dev), pol, disk->disk_name, pol); + set_disk_ro(disk, nd_region->ro); + + return 0; + +} +EXPORT_SYMBOL(nvdimm_revalidate_disk); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n", + to_nd_device_type(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static struct attribute *nd_device_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_devtype.attr, + NULL, +}; + +/** + * nd_device_attribute_group - generic attributes for all devices on an nd bus + */ +struct attribute_group nd_device_attribute_group = { + .attrs = nd_device_attributes, +}; +EXPORT_SYMBOL_GPL(nd_device_attribute_group); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + NULL, +}; + +static umode_t 
nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + return a->mode; +} + +/** + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; +EXPORT_SYMBOL_GPL(nd_numa_attribute_group); + +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); + struct device *dev; + + dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus, + "ndctl%d", nvdimm_bus->id); + + if (IS_ERR(dev)) { + dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n", + nvdimm_bus->id, PTR_ERR(dev)); + return PTR_ERR(dev); + } + return 0; +} + +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); +} + +static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_SMART] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_SMART_THRESHOLD] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_DIMM_FLAGS] = { + .out_num = 2, + .out_sizes = { 4, 4 }, + }, + [ND_CMD_GET_CONFIG_SIZE] = { + .out_num = 3, + .out_sizes = { 4, 4, 4, }, + }, + [ND_CMD_GET_CONFIG_DATA] = { + .in_num = 2, + .in_sizes = { 4, 4, }, + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, + [ND_CMD_SET_CONFIG_DATA] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_VENDOR] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs)) + return &__nd_cmd_dimm_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc); + +static const struct nd_cmd_desc __nd_cmd_bus_descs[] = { + [ND_CMD_IMPLEMENTED] = { 
}, + [ND_CMD_ARS_CAP] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 2, + .out_sizes = { 4, 4, }, + }, + [ND_CMD_ARS_START] = { + .in_num = 4, + .in_sizes = { 8, 8, 2, 6, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_ARS_STATUS] = { + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs)) + return &__nd_cmd_bus_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_bus_desc); + +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf) +{ + if (idx >= desc->in_num) + return UINT_MAX; + + if (desc->in_sizes[idx] < UINT_MAX) + return desc->in_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) { + struct nd_cmd_set_config_hdr *hdr = buf; + + return hdr->in_length; + } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) { + struct nd_cmd_vendor_hdr *hdr = buf; + + return hdr->in_length; + } + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_in_size); + +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field) +{ + if (idx >= desc->out_num) + return UINT_MAX; + + if (desc->out_sizes[idx] < UINT_MAX) + return desc->out_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1) + return in_field[1]; + else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) + return out_field[1]; + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1) + return ND_CMD_ARS_STATUS_MAX; + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_out_size); + +void wait_nvdimm_bus_probe_idle(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + do { + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(&nvdimm_bus->dev); + wait_event(nvdimm_bus->probe_wait, + nvdimm_bus->probe_active == 0); + nvdimm_bus_lock(&nvdimm_bus->dev); + } while (true); +} + +/* set_config requires an 
idle interleave set */ +static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) + return 0; + + nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); + wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev); + + if (atomic_read(&nvdimm->busy)) + return -EBUSY; + return 0; +} + +static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, + int read_only, unsigned int ioctl_cmd, unsigned long arg) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + size_t buf_len = 0, in_len = 0, out_len = 0; + static char out_env[ND_CMD_MAX_ENVELOPE]; + static char in_env[ND_CMD_MAX_ENVELOPE]; + const struct nd_cmd_desc *desc = NULL; + unsigned int cmd = _IOC_NR(ioctl_cmd); + void __user *p = (void __user *) arg; + struct device *dev = &nvdimm_bus->dev; + const char *cmd_name, *dimm_name; + unsigned long dsm_mask; + void *buf; + int rc, i; + + if (nvdimm) { + desc = nd_cmd_dimm_desc(cmd); + cmd_name = nvdimm_cmd_name(cmd); + dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0; + dimm_name = dev_name(&nvdimm->dev); + } else { + desc = nd_cmd_bus_desc(cmd); + cmd_name = nvdimm_bus_cmd_name(cmd); + dsm_mask = nd_desc->dsm_mask; + dimm_name = "bus"; + } + + if (!desc || (desc->out_num + desc->in_num == 0) || + !test_bit(cmd, &dsm_mask)) + return -ENOTTY; + + /* fail write commands (when read-only) */ + if (read_only) + switch (ioctl_cmd) { + case ND_IOCTL_VENDOR: + case ND_IOCTL_SET_CONFIG_DATA: + case ND_IOCTL_ARS_START: + dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n", + nvdimm ? 
nvdimm_cmd_name(cmd) + : nvdimm_bus_cmd_name(cmd)); + return -EPERM; + default: + break; + } + + /* process an input envelope */ + for (i = 0; i < desc->in_num; i++) { + u32 in_size, copy; + + in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env); + if (in_size == UINT_MAX) { + dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -ENXIO; + } + if (!access_ok(VERIFY_READ, p + in_len, in_size)) + return -EFAULT; + if (in_len < sizeof(in_env)) + copy = min_t(u32, sizeof(in_env) - in_len, in_size); + else + copy = 0; + if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) + return -EFAULT; + in_len += in_size; + } + + /* process an output envelope */ + for (i = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, + (u32 *) in_env, (u32 *) out_env); + u32 copy; + + if (out_size == UINT_MAX) { + dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -EFAULT; + } + if (!access_ok(VERIFY_WRITE, p + in_len + out_len, out_size)) + return -EFAULT; + if (out_len < sizeof(out_env)) + copy = min_t(u32, sizeof(out_env) - out_len, out_size); + else + copy = 0; + if (copy && copy_from_user(&out_env[out_len], + p + in_len + out_len, copy)) + return -EFAULT; + out_len += out_size; + } + + buf_len = out_len + in_len; + if (!access_ok(VERIFY_WRITE, p, sizeof(buf_len))) + return -EFAULT; + + if (buf_len > ND_IOCTL_MAX_BUFLEN) { + dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__, + dimm_name, cmd_name, buf_len, + ND_IOCTL_MAX_BUFLEN); + return -EINVAL; + } + + buf = vmalloc(buf_len); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, p, buf_len)) { + rc = -EFAULT; + goto out; + } + + nvdimm_bus_lock(&nvdimm_bus->dev); + rc = nd_cmd_clear_to_send(nvdimm, cmd); + if (rc) + goto out_unlock; + + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len); + if (rc < 0) + goto out_unlock; + if (copy_to_user(p, buf, buf_len)) + rc = 
-EFAULT; + out_unlock: + nvdimm_bus_unlock(&nvdimm_bus->dev); + out: + vfree(buf); + return rc; +} + +static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long id = (long) file->private_data; + int rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + if (nvdimm_bus->id == id) { + rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg); + break; + } + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int match_dimm(struct device *dev, void *data) +{ + long id = (long) data; + + if (is_nvdimm(dev)) { + struct nvdimm *nvdimm = to_nvdimm(dev); + + return nvdimm->id == id; + } + + return 0; +} + +static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + struct device *dev = device_find_child(&nvdimm_bus->dev, + file->private_data, match_dimm); + struct nvdimm *nvdimm; + + if (!dev) + continue; + + nvdimm = to_nvdimm(dev); + rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg); + put_device(dev); + break; + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int nd_open(struct inode *inode, struct file *file) +{ + long minor = iminor(inode); + + file->private_data = (void *) minor; + return 0; +} + +static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = nd_ioctl, + .compat_ioctl = nd_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = nvdimm_ioctl, + .compat_ioctl = nvdimm_ioctl, + .llseek = noop_llseek, +}; + +int __init 
nvdimm_bus_init(void) +{ + int rc; + + rc = bus_register(&nvdimm_bus_type); + if (rc) + return rc; + + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); + if (rc < 0) + goto err_bus_chrdev; + nvdimm_bus_major = rc; + + rc = register_chrdev(0, "dimmctl", &nvdimm_fops); + if (rc < 0) + goto err_dimm_chrdev; + nvdimm_major = rc; + + nd_class = class_create(THIS_MODULE, "nd"); + if (IS_ERR(nd_class)) + goto err_class; + + return 0; + + err_class: + unregister_chrdev(nvdimm_major, "dimmctl"); + err_dimm_chrdev: + unregister_chrdev(nvdimm_bus_major, "ndctl"); + err_bus_chrdev: + bus_unregister(&nvdimm_bus_type); + + return rc; +} + +void nvdimm_bus_exit(void) +{ + class_destroy(nd_class); + unregister_chrdev(nvdimm_bus_major, "ndctl"); + unregister_chrdev(nvdimm_major, "dimmctl"); + bus_unregister(&nvdimm_bus_type); +} diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c new file mode 100644 index 000000000000..cb62ec6a12d0 --- /dev/null +++ b/drivers/nvdimm/core.c @@ -0,0 +1,465 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/libnvdimm.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include "nd-core.h" +#include "nd.h" + +LIST_HEAD(nvdimm_bus_list); +DEFINE_MUTEX(nvdimm_bus_list_mutex); +static DEFINE_IDA(nd_ida); + +void nvdimm_bus_lock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_lock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_lock); + +void nvdimm_bus_unlock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_unlock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_unlock); + +bool is_nvdimm_bus_locked(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + return mutex_is_locked(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(is_nvdimm_bus_locked); + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? 
le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} +EXPORT_SYMBOL_GPL(nd_fletcher64); + +static void nvdimm_bus_release(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + ida_simple_remove(&nd_ida, nvdimm_bus->id); + kfree(nvdimm_bus); +} + +struct nvdimm_bus *to_nvdimm_bus(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release); + return nvdimm_bus; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus); + +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return nvdimm_bus->nd_desc; +} +EXPORT_SYMBOL_GPL(to_nd_desc); + +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) +{ + struct device *dev; + + for (dev = nd_dev; dev; dev = dev->parent) + if (dev->release == nvdimm_bus_release) + break; + dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n"); + if (dev) + return to_nvdimm_bus(dev); + return NULL; +} + +static bool is_uuid_sep(char sep) +{ + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') + return true; + return false; +} + +static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf, + size_t len) +{ + const char *str = buf; + u8 uuid[16]; + int i; + + for (i = 0; i < 16; i++) { + if (!isxdigit(str[0]) || !isxdigit(str[1])) { + dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n", + __func__, i, str - buf, str[0], + str + 1 - buf, str[1]); + return -EINVAL; + } + + uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); + str += 2; + if (is_uuid_sep(*str)) + str++; + } + + memcpy(uuid_out, uuid, sizeof(uuid)); + return 0; +} + +/** + * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes + * @dev: container device for the uuid property + * @uuid_out: uuid buffer to replace + * @buf: raw sysfs buffer to 
parse + * + * Enforce that uuids can only be changed while the device is disabled + * (driver detached) + * LOCKING: expects device_lock() is held on entry + */ +int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, + size_t len) +{ + u8 uuid[16]; + int rc; + + if (dev->driver) + return -EBUSY; + + rc = nd_uuid_parse(dev, uuid, buf, len); + if (rc) + return rc; + + kfree(*uuid_out); + *uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL); + if (!(*uuid_out)) + return -ENOMEM; + + return 0; +} + +ssize_t nd_sector_size_show(unsigned long current_lbasize, + const unsigned long *supported, char *buf) +{ + ssize_t len = 0; + int i; + + for (i = 0; supported[i]; i++) + if (current_lbasize == supported[i]) + len += sprintf(buf + len, "[%ld] ", supported[i]); + else + len += sprintf(buf + len, "%ld ", supported[i]); + len += sprintf(buf + len, "\n"); + return len; +} + +ssize_t nd_sector_size_store(struct device *dev, const char *buf, + unsigned long *current_lbasize, const unsigned long *supported) +{ + unsigned long lbasize; + int rc, i; + + if (dev->driver) + return -EBUSY; + + rc = kstrtoul(buf, 0, &lbasize); + if (rc) + return rc; + + for (i = 0; supported[i]; i++) + if (lbasize == supported[i]) + break; + + if (supported[i]) { + *current_lbasize = lbasize; + return 0; + } else { + return -EINVAL; + } +} + +void __nd_iostat_start(struct bio *bio, unsigned long *start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + + *start = jiffies; + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); +} +EXPORT_SYMBOL(__nd_iostat_start); + +void nd_iostat_end(struct bio *bio, unsigned long start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + unsigned long duration = jiffies - start; + const int rw = bio_data_dir(bio); + int cpu = 
part_stat_lock(); + + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); +} +EXPORT_SYMBOL(nd_iostat_end); + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cmd, len = 0; + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct device *parent = nvdimm_bus->dev.parent; + + if (nd_desc->provider_name) + return nd_desc->provider_name; + else if (parent) + return dev_name(parent); + else + return "unknown"; +} + +static ssize_t provider_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus)); +} +static DEVICE_ATTR_RO(provider); + +static int flush_namespaces(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + return 0; +} + +static int flush_regions_dimms(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + device_for_each_child(dev, NULL, flush_namespaces); + return 0; +} + +static ssize_t wait_probe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + nd_synchronize(); + device_for_each_child(dev, NULL, flush_regions_dimms); + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR_RO(wait_probe); + +static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_commands.attr, + &dev_attr_wait_probe.attr, + &dev_attr_provider.attr, + NULL, +}; + +struct attribute_group nvdimm_bus_attribute_group = 
{ + .attrs = nvdimm_bus_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); + +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nd_desc, struct module *module) +{ + struct nvdimm_bus *nvdimm_bus; + int rc; + + nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL); + if (!nvdimm_bus) + return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); + init_waitqueue_head(&nvdimm_bus->probe_wait); + nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + mutex_init(&nvdimm_bus->reconfig_mutex); + if (nvdimm_bus->id < 0) { + kfree(nvdimm_bus); + return NULL; + } + nvdimm_bus->nd_desc = nd_desc; + nvdimm_bus->module = module; + nvdimm_bus->dev.parent = parent; + nvdimm_bus->dev.release = nvdimm_bus_release; + nvdimm_bus->dev.groups = nd_desc->attr_groups; + dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id); + rc = device_register(&nvdimm_bus->dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc); + goto err; + } + + rc = nvdimm_bus_create_ndctl(nvdimm_bus); + if (rc) + goto err; + + mutex_lock(&nvdimm_bus_list_mutex); + list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list); + mutex_unlock(&nvdimm_bus_list_mutex); + + return nvdimm_bus; + err: + put_device(&nvdimm_bus->dev); + return NULL; +} +EXPORT_SYMBOL_GPL(__nvdimm_bus_register); + +static int child_unregister(struct device *dev, void *data) +{ + /* + * the singular ndctl class device per bus needs to be + * "device_destroy"ed, so skip it here + * + * i.e. 
remove classless children + */ + if (dev->class) + /* pass */; + else + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) +{ + if (!nvdimm_bus) + return; + + mutex_lock(&nvdimm_bus_list_mutex); + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + + nd_synchronize(); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + nvdimm_bus_destroy_ndctl(nvdimm_bus); + + device_unregister(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int nd_pi_nop_generate_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + struct blk_integrity integrity = { + .name = "ND-PI-NOP", + .generate_fn = nd_pi_nop_generate_verify, + .verify_fn = nd_pi_nop_generate_verify, + .tuple_size = meta_size, + .tag_size = meta_size, + }; + int ret; + + if (meta_size == 0) + return 0; + + ret = blk_integrity_register(disk, &integrity); + if (ret) + return ret; + + blk_queue_max_integrity_segments(disk->queue, 1); + + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#endif + +static __init int libnvdimm_init(void) +{ + int rc; + + rc = nvdimm_bus_init(); + if (rc) + return rc; + rc = nvdimm_init(); + if (rc) + goto err_dimm; + rc = nd_region_init(); + if (rc) + goto err_region; + return 0; + err_region: + nvdimm_exit(); + err_dimm: + nvdimm_bus_exit(); + return rc; +} + +static __exit void libnvdimm_exit(void) +{ + WARN_ON(!list_empty(&nvdimm_bus_list)); + nd_region_exit(); + nvdimm_exit(); + nvdimm_bus_exit(); +} + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +subsys_initcall(libnvdimm_init); +module_exit(libnvdimm_exit); diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c new 
file mode 100644 index 000000000000..71d12bb67339 --- /dev/null +++ b/drivers/nvdimm/dimm.c @@ -0,0 +1,102 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "label.h" +#include "nd.h" + +static int nvdimm_probe(struct device *dev) +{ + struct nvdimm_drvdata *ndd; + int rc; + + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); + if (!ndd) + return -ENOMEM; + + dev_set_drvdata(dev, ndd); + ndd->dpa.name = dev_name(dev); + ndd->ns_current = -1; + ndd->ns_next = -1; + ndd->dpa.start = 0; + ndd->dpa.end = -1; + ndd->dev = dev; + get_device(dev); + kref_init(&ndd->kref); + + rc = nvdimm_init_nsarea(ndd); + if (rc) + goto err; + + rc = nvdimm_init_config_data(ndd); + if (rc) + goto err; + + dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size); + + nvdimm_bus_lock(dev); + ndd->ns_current = nd_label_validate(ndd); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_current); + nd_label_copy(ndd, to_next_namespace_index(ndd), + to_current_namespace_index(ndd)); + rc = nd_label_reserve_dpa(ndd); + nvdimm_bus_unlock(dev); + + if (rc) + goto err; + + return 0; + + err: + put_ndd(ndd); + return rc; +} + +static int nvdimm_remove(struct device *dev) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + + nvdimm_bus_lock(dev); + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + put_ndd(ndd); + 
+ return 0; +} + +static struct nd_device_driver nvdimm_driver = { + .probe = nvdimm_probe, + .remove = nvdimm_remove, + .drv = { + .name = "nvdimm", + }, + .type = ND_DRIVER_DIMM, +}; + +int __init nvdimm_init(void) +{ + return nd_driver_register(&nvdimm_driver); +} + +void nvdimm_exit(void) +{ + driver_unregister(&nvdimm_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c new file mode 100644 index 000000000000..c05eb807d674 --- /dev/null +++ b/drivers/nvdimm/dimm_devs.c @@ -0,0 +1,551 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/vmalloc.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static DEFINE_IDA(dimm_ida); + +/* + * Retrieve bus and dimm handle and return if this bus supports + * get_config_data commands + */ +static int __validate_dimm(struct nvdimm_drvdata *ndd) +{ + struct nvdimm *nvdimm; + + if (!ndd) + return -EINVAL; + + nvdimm = to_nvdimm(ndd->dev); + + if (!nvdimm->dsm_mask) + return -ENXIO; + if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask)) + return -ENXIO; + + return 0; +} + +static int validate_dimm(struct nvdimm_drvdata *ndd) +{ + int rc = __validate_dimm(ndd); + + if (rc && ndd) + dev_dbg(ndd->dev, "%pf: %s error: %d\n", + __builtin_return_address(0), __func__, rc); + return rc; +} + +/** + * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area + * @nvdimm: dimm to initialize + */ +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) +{ + struct nd_cmd_get_config_size *cmd = &ndd->nsarea; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + + if (rc) + return rc; + + if (cmd->config_size) + return 0; /* already valid */ + + memset(cmd, 0, sizeof(*cmd)); + nd_desc = nvdimm_bus->nd_desc; + return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd)); +} + +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nd_cmd_get_config_data_hdr *cmd; + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + u32 max_cmd_size, config_size; + size_t offset; + + if (rc) + return rc; + + if (ndd->data) + return 0; + + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0 + || ndd->nsarea.config_size < ND_LABEL_MIN_SIZE) { + 
dev_dbg(ndd->dev, "failed to init config data area: (%d:%d)\n", + ndd->nsarea.max_xfer, ndd->nsarea.config_size); + return -ENXIO; + } + + ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL); + if (!ndd->data) + ndd->data = vmalloc(ndd->nsarea.config_size); + + if (!ndd->data) + return -ENOMEM; + + max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + nd_desc = nvdimm_bus->nd_desc; + for (config_size = ndd->nsarea.config_size, offset = 0; + config_size; config_size -= cmd->in_length, + offset += cmd->in_length) { + cmd->in_length = min(config_size, max_cmd_size); + cmd->in_offset = offset; + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_DATA, cmd, + cmd->in_length + sizeof(*cmd)); + if (rc || cmd->status) { + rc = -ENXIO; + break; + } + memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length); + } + dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc); + kfree(cmd); + + return rc; +} + +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len) +{ + int rc = validate_dimm(ndd); + size_t max_cmd_size, buf_offset; + struct nd_cmd_set_config_hdr *cmd; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + if (rc) + return rc; + + if (!ndd->data) + return -ENXIO; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, PAGE_SIZE, len); + max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; len -= cmd->in_length, + buf_offset += cmd->in_length) { + size_t cmd_size; + u32 *status; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length); + + /* status is output in 
the last 4-bytes of the command buffer */ + cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32); + status = ((void *) cmd) + cmd_size - sizeof(u32); + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_SET_CONFIG_DATA, cmd, cmd_size); + if (rc || *status) { + rc = rc ? rc : -ENXIO; + break; + } + } + kfree(cmd); + + return rc; +} + +static void nvdimm_release(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + ida_simple_remove(&dimm_ida, nvdimm->id); + kfree(nvdimm); +} + +static struct device_type nvdimm_device_type = { + .name = "nvdimm", + .release = nvdimm_release, +}; + +bool is_nvdimm(struct device *dev) +{ + return dev->type == &nvdimm_device_type; +} + +struct nvdimm *to_nvdimm(struct device *dev) +{ + struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev); + + WARN_ON(!is_nvdimm(dev)); + return nvdimm; +} +EXPORT_SYMBOL_GPL(to_nvdimm); + +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) +{ + struct nd_region *nd_region = &ndbr->nd_region; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->nvdimm; +} +EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); + +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + return dev_get_drvdata(&nvdimm->dev); +} +EXPORT_SYMBOL(to_ndd); + +void nvdimm_drvdata_release(struct kref *kref) +{ + struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref); + struct device *dev = ndd->dev; + struct resource *res, *_r; + + dev_dbg(dev, "%s\n", __func__); + + nvdimm_bus_lock(dev); + for_each_dpa_resource_safe(ndd, res, _r) + nvdimm_free_dpa(ndd, res); + nvdimm_bus_unlock(dev); + + if (ndd->data && is_vmalloc_addr(ndd->data)) + vfree(ndd->data); + else + kfree(ndd->data); + kfree(ndd); + put_device(dev); +} + +void get_ndd(struct nvdimm_drvdata *ndd) +{ + kref_get(&ndd->kref); +} + +void put_ndd(struct nvdimm_drvdata *ndd) +{ + if (ndd) + 
kref_put(&ndd->kref, nvdimm_drvdata_release); +} + +const char *nvdimm_name(struct nvdimm *nvdimm) +{ + return dev_name(&nvdimm->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_name); + +void *nvdimm_provider_data(struct nvdimm *nvdimm) +{ + if (nvdimm) + return nvdimm->provider_data; + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_provider_data); + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int cmd, len = 0; + + if (!nvdimm->dsm_mask) + return sprintf(buf, "\n"); + + for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * The state may be in the process of changing, userspace should + * quiesce probing if it wants a static answer + */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy) + ? 
"active" : "idle"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t available_slots_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + ssize_t rc; + u32 nfree; + + if (!ndd) + return -ENXIO; + + nvdimm_bus_lock(dev); + nfree = nd_label_nfree(ndd); + if (nfree - 1 > nfree) { + dev_WARN_ONCE(dev, 1, "we ate our last label?\n"); + nfree = 0; + } else + nfree--; + rc = sprintf(buf, "%d\n", nfree); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(available_slots); + +static struct attribute *nvdimm_attributes[] = { + &dev_attr_state.attr, + &dev_attr_commands.attr, + &dev_attr_available_slots.attr, + NULL, +}; + +struct attribute_group nvdimm_attribute_group = { + .attrs = nvdimm_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_attribute_group); + +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask) +{ + struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); + struct device *dev; + + if (!nvdimm) + return NULL; + + nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL); + if (nvdimm->id < 0) { + kfree(nvdimm); + return NULL; + } + nvdimm->provider_data = provider_data; + nvdimm->flags = flags; + nvdimm->dsm_mask = dsm_mask; + atomic_set(&nvdimm->busy, 0); + dev = &nvdimm->dev; + dev_set_name(dev, "nmem%d", nvdimm->id); + dev->parent = &nvdimm_bus->dev; + dev->type = &nvdimm_device_type; + dev->devt = MKDEV(nvdimm_major, nvdimm->id); + dev->groups = groups; + nd_device_register(dev); + + return nvdimm; +} +EXPORT_SYMBOL_GPL(nvdimm_create); + +/** + * nd_blk_available_dpa - account the unused dpa of BLK region + * @nd_mapping: container of dpa-resource-root + labels + * + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. 
+ */ +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_end, busy = 0, available; + struct resource *res; + + if (!ndd) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + for_each_dpa_resource(ndd, res) + if (res->start >= nd_mapping->start && res->start < map_end) { + resource_size_t end = min(map_end, res->end); + + busy += end - res->start + 1; + } else if (res->end >= nd_mapping->start + && res->end <= map_end) { + busy += res->end - nd_mapping->start; + } else if (nd_mapping->start > res->start + && nd_mapping->start < res->end) { + /* total eclipse of the BLK region mapping */ + busy += nd_mapping->size; + } + + available = map_end - nd_mapping->start + 1; + if (busy < available) + return available - busy; + return 0; +} + +/** + * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa + * @nd_mapping: container of dpa-resource-root + labels + * @nd_region: constrain available space check to this reference region + * @overlap: calculate available space assuming this level of overlap + * + * Validate that a PMEM label, if present, aligns with the start of an + * interleave set and truncate the available size at the lowest BLK + * overlap point. + * + * The expectation is that this routine is called multiple times as it + * probes for the largest BLK encroachment for any single member DIMM of + * the interleave set. Once that value is determined the PMEM-limit for + * the set can be established. 
+ */ +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap) +{ + resource_size_t map_start, map_end, busy = 0, available, blk_start; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + const char *reason; + + if (!ndd) + return 0; + + map_start = nd_mapping->start; + map_end = map_start + nd_mapping->size - 1; + blk_start = max(map_start, map_end + 1 - *overlap); + for_each_dpa_resource(ndd, res) + if (res->start >= map_start && res->start < map_end) { + if (strncmp(res->name, "blk", 3) == 0) + blk_start = min(blk_start, res->start); + else if (res->start != map_start) { + reason = "misaligned to iset"; + goto err; + } else { + if (busy) { + reason = "duplicate overlapping PMEM reservations?"; + goto err; + } + busy += resource_size(res); + continue; + } + } else if (res->end >= map_start && res->end <= map_end) { + if (strncmp(res->name, "blk", 3) == 0) { + /* + * If a BLK allocation overlaps the start of + * PMEM the entire interleave set may now only + * be used for BLK. + */ + blk_start = map_start; + } else { + reason = "misaligned to iset"; + goto err; + } + } else if (map_start > res->start && map_start < res->end) { + /* total eclipse of the mapping */ + busy += nd_mapping->size; + blk_start = map_start; + } + + *overlap = map_end + 1 - blk_start; + available = blk_start - map_start; + if (busy < available) + return available - busy; + return 0; + + err: + /* + * Something is wrong, PMEM must align with the start of the + * interleave set, and there can only be one allocation per set. 
+ */ + nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); + return 0; +} + +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res) +{ + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + kfree(res->name); + __release_region(&ndd->dpa, res->start, resource_size(res)); +} + +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n) +{ + char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL); + struct resource *res; + + if (!name) + return NULL; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + res = __request_region(&ndd->dpa, start, n, name, 0); + if (!res) + kfree(name); + return res; +} + +/** + * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id + * @nvdimm: container of dpa-resource-root + labels + * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid> + */ +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id) +{ + resource_size_t allocated = 0; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + allocated += resource_size(res); + + return allocated; +} + +static int count_dimms(struct device *dev, void *c) +{ + int *count = c; + + if (is_nvdimm(dev)) + (*count)++; + return 0; +} + +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) +{ + int count = 0; + /* Flush any possible dimm registration failures */ + nd_synchronize(); + + device_for_each_child(&nvdimm_bus->dev, &count, count_dimms); + dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count); + if (count != dimm_count) + return -ENXIO; + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c new file mode 100644 index 000000000000..96526dcfdd37 --- /dev/null +++ b/drivers/nvdimm/label.c @@ -0,0 +1,927 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. 
All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static u32 best_seq(u32 a, u32 b) +{ + a &= NSINDEX_SEQ_MASK; + b &= NSINDEX_SEQ_MASK; + + if (a == 0 || a == b) + return b; + else if (b == 0) + return a; + else if (nd_inc_seq(a) == b) + return b; + else + return a; +} + +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) +{ + u32 index_span; + + if (ndd->nsindex_size) + return ndd->nsindex_size; + + /* + * The minimum index space is 512 bytes, with that amount of + * index we can describe ~1400 labels which is less than a byte + * of overhead per label. Round up to a byte of overhead per + * label and determine the size of the index region. Yes, this + * starts to waste space at larger config_sizes, but it's + * unlikely we'll ever see anything but 128K. + */ + index_span = ndd->nsarea.config_size / 129; + index_span /= NSINDEX_ALIGN * 2; + ndd->nsindex_size = index_span * NSINDEX_ALIGN; + + return ndd->nsindex_size; +} + +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +{ + return ndd->nsarea.config_size / 129; +} + +int nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * On media label format consists of two index blocks followed + * by an array of labels. None of these structures are ever + * updated in place. A sequence number tracks the current + * active index and the next one to write, while labels are + * written to free slots. 
+ * + * +------------+ + * | | + * | nsindex0 | + * | | + * +------------+ + * | | + * | nsindex1 | + * | | + * +------------+ + * | label0 | + * +------------+ + * | label1 | + * +------------+ + * | | + * ....nslot... + * | | + * +------------+ + * | labelN | + * +------------+ + */ + struct nd_namespace_index *nsindex[] = { + to_namespace_index(ndd, 0), + to_namespace_index(ndd, 1), + }; + const int num_index = ARRAY_SIZE(nsindex); + struct device *dev = ndd->dev; + bool valid[2] = { 0 }; + int i, num_valid = 0; + u32 seq; + + for (i = 0; i < num_index; i++) { + u32 nslot; + u8 sig[NSINDEX_SIG_LEN]; + u64 sum_save, sum, size; + + memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN); + if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) { + dev_dbg(dev, "%s: nsindex%d signature invalid\n", + __func__, i); + continue; + } + sum_save = __le64_to_cpu(nsindex[i]->checksum); + nsindex[i]->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1); + nsindex[i]->checksum = __cpu_to_le64(sum_save); + if (sum != sum_save) { + dev_dbg(dev, "%s: nsindex%d checksum invalid\n", + __func__, i); + continue; + } + + seq = __le32_to_cpu(nsindex[i]->seq); + if ((seq & NSINDEX_SEQ_MASK) == 0) { + dev_dbg(dev, "%s: nsindex%d sequence: %#x invalid\n", + __func__, i, seq); + continue; + } + + /* sanity check the index against expected values */ + if (__le64_to_cpu(nsindex[i]->myoff) + != i * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "%s: nsindex%d myoff: %#llx invalid\n", + __func__, i, (unsigned long long) + __le64_to_cpu(nsindex[i]->myoff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->otheroff) + != (!i) * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "%s: nsindex%d otheroff: %#llx invalid\n", + __func__, i, (unsigned long long) + __le64_to_cpu(nsindex[i]->otheroff)); + continue; + } + + size = __le64_to_cpu(nsindex[i]->mysize); + if (size > sizeof_namespace_index(ndd) + || size < sizeof(struct nd_namespace_index)) { + dev_dbg(dev, "%s: 
nsindex%d mysize: %#llx invalid\n", + __func__, i, size); + continue; + } + + nslot = __le32_to_cpu(nsindex[i]->nslot); + if (nslot * sizeof(struct nd_namespace_label) + + 2 * sizeof_namespace_index(ndd) + > ndd->nsarea.config_size) { + dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n", + __func__, i, nslot, + ndd->nsarea.config_size); + continue; + } + valid[i] = true; + num_valid++; + } + + switch (num_valid) { + case 0: + break; + case 1: + for (i = 0; i < num_index; i++) + if (valid[i]) + return i; + /* can't have num_valid > 0 but valid[] = { false, false } */ + WARN_ON(1); + break; + default: + /* pick the best index... */ + seq = best_seq(__le32_to_cpu(nsindex[0]->seq), + __le32_to_cpu(nsindex[1]->seq)); + if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK)) + return 1; + else + return 0; + break; + } + + return -1; +} + +void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, + struct nd_namespace_index *src) +{ + if (dst && src) + /* pass */; + else + return; + + memcpy(dst, src, sizeof_namespace_index(ndd)); +} + +static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) +{ + void *base = to_namespace_index(ndd, 0); + + return base + 2 * sizeof_namespace_index(ndd); +} + +static int to_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return nd_label - nd_label_base(ndd); +} + +#define for_each_clear_bit_le(bit, addr, size) \ + for ((bit) = find_next_zero_bit_le((addr), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1)) + +/** + * preamble_index - common variable initialization for nd_label_* routines + * @ndd: dimm container for the relevant label set + * @idx: namespace_index index + * @nsindex_out: on return set to the currently active namespace index + * @free: on return set to the free label bitmap in the index + * @nslot: on return set to the number of slots in the label space + */ +static bool 
preamble_index(struct nvdimm_drvdata *ndd, int idx, + struct nd_namespace_index **nsindex_out, + unsigned long **free, u32 *nslot) +{ + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, idx); + if (nsindex == NULL) + return false; + + *free = (unsigned long *) nsindex->free; + *nslot = __le32_to_cpu(nsindex->nslot); + *nsindex_out = nsindex; + + return true; +} + +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags) +{ + if (!label_id || !uuid) + return NULL; + snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb", + flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid); + return label_id->id; +} + +static bool preamble_current(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_current, nsindex, + free, nslot); +} + +static bool preamble_next(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_next, nsindex, + free, nslot); +} + +static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot) +{ + /* check that we are written where we expect to be written */ + if (slot != __le32_to_cpu(nd_label->slot)) + return false; + + /* check that DPA allocations are page aligned */ + if ((__le64_to_cpu(nd_label->dpa) + | __le64_to_cpu(nd_label->rawsize)) % SZ_4K) + return false; + + return true; +} + +int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; /* no label, nothing to reserve */ + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + struct nd_region *nd_region = NULL; + u8 label_uuid[NSLABEL_UUID_LEN]; + struct nd_label_id label_id; + struct resource *res; + u32 flags; + + nd_label = nd_label_base(ndd) + slot; + + if (!slot_valid(nd_label, slot)) + continue; + + 
memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); + flags = __le32_to_cpu(nd_label->flags); + nd_label_gen_id(&label_id, label_uuid, flags); + res = nvdimm_allocate_dpa(ndd, &label_id, + __le64_to_cpu(nd_label->dpa), + __le64_to_cpu(nd_label->rawsize)); + nd_dbg_dpa(nd_region, ndd, res, "reserve\n"); + if (!res) + return -EBUSY; + } + + return 0; +} + +int nd_label_active_count(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + int count = 0; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + + if (!slot_valid(nd_label, slot)) { + u32 label_slot = __le32_to_cpu(nd_label->slot); + u64 size = __le64_to_cpu(nd_label->rawsize); + u64 dpa = __le64_to_cpu(nd_label->dpa); + + dev_dbg(ndd->dev, + "%s: slot%d invalid slot: %d dpa: %llx size: %llx\n", + __func__, slot, label_slot, dpa, size); + continue; + } + count++; + } + return count; +} + +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return NULL; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + if (!slot_valid(nd_label, slot)) + continue; + + if (n-- == 0) + return nd_label_base(ndd) + slot; + } + + return NULL; +} + +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return UINT_MAX; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + slot = find_next_bit_le(free, nslot, 0); + if (slot == nslot) + return UINT_MAX; + + clear_bit_le(slot, free); + + return slot; +} + +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 
slot) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return false; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (slot < nslot) + return !test_and_set_bit_le(slot, free); + return false; +} + +u32 nd_label_nfree(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return nvdimm_num_label_slots(ndd); + + return bitmap_weight(free, nslot); +} + +static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, + unsigned long flags) +{ + struct nd_namespace_index *nsindex; + unsigned long offset; + u64 checksum; + u32 nslot; + int rc; + + nsindex = to_namespace_index(ndd, index); + if (flags & ND_NSINDEX_INIT) + nslot = nvdimm_num_label_slots(ndd); + else + nslot = __le32_to_cpu(nsindex->nslot); + + memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); + nsindex->flags = __cpu_to_le32(0); + nsindex->seq = __cpu_to_le32(seq); + offset = (unsigned long) nsindex + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->myoff = __cpu_to_le64(offset); + nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd)); + offset = (unsigned long) to_namespace_index(ndd, + nd_label_next_nsindex(index)) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->otheroff = __cpu_to_le64(offset); + offset = (unsigned long) nd_label_base(ndd) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->labeloff = __cpu_to_le64(offset); + nsindex->nslot = __cpu_to_le32(nslot); + nsindex->major = __cpu_to_le16(1); + nsindex->minor = __cpu_to_le16(1); + nsindex->checksum = __cpu_to_le64(0); + if (flags & ND_NSINDEX_INIT) { + unsigned long *free = (unsigned long *) nsindex->free; + u32 nfree = ALIGN(nslot, BITS_PER_LONG); + int last_bits, i; + + memset(nsindex->free, 0xff, nfree / 8); + for (i = 0, last_bits = nfree - nslot; i 
< last_bits; i++) + clear_bit_le(nslot + i, free); + } + checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1); + nsindex->checksum = __cpu_to_le64(checksum); + rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff), + nsindex, sizeof_namespace_index(ndd)); + if (rc < 0) + return rc; + + if (flags & ND_NSINDEX_INIT) + return 0; + + /* copy the index we just wrote to the new 'next' */ + WARN_ON(index != ndd->ns_next); + nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex); + ndd->ns_current = nd_label_next_nsindex(ndd->ns_current); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_next); + WARN_ON(ndd->ns_current == ndd->ns_next); + + return 0; +} + +static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return (unsigned long) nd_label + - (unsigned long) to_namespace_index(ndd, 0); +} + +static int __pmem_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, + int pos) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *victim_label; + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + size_t offset; + int rc; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + /* allocate and write the label to the staging (next) index */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + return -ENXIO; + dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); + + nd_label = nd_label_base(ndd) + slot; + memset(nd_label, 0, sizeof(struct nd_namespace_label)); + memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN); + if (nspm->alt_name) + memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN); + nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_UPDATING); + nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); + nd_label->position = __cpu_to_le16(pos); 
+ nd_label->isetcookie = __cpu_to_le64(cookie); + rawsize = div_u64(resource_size(&nspm->nsio.res), + nd_region->ndr_mappings); + nd_label->rawsize = __cpu_to_le64(rawsize); + nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->slot = __cpu_to_le32(slot); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof(struct nd_namespace_label)); + if (rc < 0) + return rc; + + /* Garbage collect the previous label */ + victim_label = nd_mapping->labels[0]; + if (victim_label) { + slot = to_slot(ndd, victim_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc < 0) + return rc; + + nd_mapping->labels[0] = nd_label; + + return 0; +} + +static void del_label(struct nd_mapping *nd_mapping, int l) +{ + struct nd_namespace_label *next_label, *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + unsigned int slot; + int j; + + nd_label = nd_mapping->labels[l]; + slot = to_slot(ndd, nd_label); + dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); + + for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) + nd_mapping->labels[j] = next_label; + nd_mapping->labels[j] = NULL; +} + +static bool is_old_resource(struct resource *res, struct resource **list, int n) +{ + int i; + + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + for (i = 0; i < n; i++) + if (res == list[i]) + return true; + return false; +} + +static struct resource *to_resource(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + struct resource *res; + + for_each_dpa_resource(ndd, res) { + if (res->start != __le64_to_cpu(nd_label->dpa)) + continue; + if (resource_size(res) != __le64_to_cpu(nd_label->rawsize)) + continue; + return res; + } + + return NULL; +} + +/* + * 1/ Account all the labels that can be freed after this 
update + * 2/ Allocate and write the label to the staging (next) index + * 3/ Record the resources in the namespace device + */ +static int __blk_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, + int num_labels) +{ + int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free, *victim_map = NULL; + struct resource *res, **old_res_list; + struct nd_label_id label_id; + u8 uuid[NSLABEL_UUID_LEN]; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + old_res_list = nsblk->res; + nfree = nd_label_nfree(ndd); + old_num_resources = nsblk->num_resources; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + + /* + * We need to loop over the old resources a few times, which seems a + * bit inefficient, but we need to know that we have the label + * space before we start mutating the tracking structures. + * Otherwise the recovery method of last resort for userspace is + * disable and re-enable the parent region. 
+ */ + alloc = 0; + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + if (!is_old_resource(res, old_res_list, old_num_resources)) + alloc++; + } + + victims = 0; + if (old_num_resources) { + /* convert old local-label-map to dimm-slot victim-map */ + victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long), + GFP_KERNEL); + if (!victim_map) + return -ENOMEM; + + /* mark unused labels for garbage collection */ + for_each_clear_bit_le(slot, free, nslot) { + nd_label = nd_label_base(ndd) + slot; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + res = to_resource(ndd, nd_label); + if (res && is_old_resource(res, old_res_list, + old_num_resources)) + continue; + slot = to_slot(ndd, nd_label); + set_bit(slot, victim_map); + victims++; + } + } + + /* don't allow updates that consume the last label */ + if (nfree - alloc < 0 || nfree - alloc + victims < 1) { + dev_info(&nsblk->common.dev, "insufficient label space\n"); + kfree(victim_map); + return -ENOSPC; + } + /* from here on we need to abort on error */ + + + /* assign all resources to the namespace before writing the labels */ + nsblk->res = NULL; + nsblk->num_resources = 0; + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) { + rc = -ENOMEM; + goto abort; + } + } + + for (i = 0; i < nsblk->num_resources; i++) { + size_t offset; + + res = nsblk->res[i]; + if (is_old_resource(res, old_res_list, old_num_resources)) + continue; /* carry-over */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + goto abort; + dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); + + nd_label = nd_label_base(ndd) + slot; + memset(nd_label, 0, sizeof(struct nd_namespace_label)); + memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN); + if (nsblk->alt_name) + memcpy(nd_label->name, nsblk->alt_name, + NSLABEL_NAME_LEN); 
+ nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL); + nd_label->nlabel = __cpu_to_le16(0); /* N/A */ + nd_label->position = __cpu_to_le16(0); /* N/A */ + nd_label->isetcookie = __cpu_to_le64(0); /* N/A */ + nd_label->dpa = __cpu_to_le64(res->start); + nd_label->rawsize = __cpu_to_le64(resource_size(res)); + nd_label->lbasize = __cpu_to_le64(nsblk->lbasize); + nd_label->slot = __cpu_to_le32(slot); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof(struct nd_namespace_label)); + if (rc < 0) + goto abort; + } + + /* free up now unused slots in the new index */ + for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) { + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + nd_label_free_slot(ndd, slot); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc) + goto abort; + + /* + * Now that the on-dimm labels are up to date, fix up the tracking + * entries in nd_mapping->labels + */ + nlabel = 0; + for_each_label(l, nd_label, nd_mapping->labels) { + nlabel++; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + nlabel--; + del_label(nd_mapping, l); + l--; /* retry with the new label at this index */ + } + if (nlabel + nsblk->num_resources > num_labels) { + /* + * Bug, we can't end up with more resources than + * available labels + */ + WARN_ON_ONCE(1); + rc = -ENXIO; + goto out; + } + + for_each_clear_bit_le(slot, free, nslot) { + nd_label = nd_label_base(ndd) + slot; + memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0) + continue; + res = to_resource(ndd, nd_label); + res->flags &= ~DPA_RESOURCE_ADJUSTED; + dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n", + l, slot); + nd_mapping->labels[l++] = nd_label; + } + nd_mapping->labels[l] = NULL; + + out: + kfree(old_res_list); + 
kfree(victim_map); + return rc; + + abort: + /* + * 1/ repair the allocated label bitmap in the index + * 2/ restore the resource list + */ + nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd)); + kfree(nsblk->res); + nsblk->res = old_res_list; + nsblk->num_resources = old_num_resources; + old_res_list = NULL; + goto out; +} + +static int init_labels(struct nd_mapping *nd_mapping, int num_labels) +{ + int i, l, old_num_labels = 0; + struct nd_namespace_index *nsindex; + struct nd_namespace_label *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *); + + for_each_label(l, nd_label, nd_mapping->labels) + old_num_labels++; + + /* + * We need to preserve all the old labels for the mapping so + * they can be garbage collected after writing the new labels. + */ + if (num_labels > old_num_labels) { + struct nd_namespace_label **labels; + + labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); + if (!labels) + return -ENOMEM; + nd_mapping->labels = labels; + } + if (!nd_mapping->labels) + return -ENOMEM; + + for (i = old_num_labels; i <= num_labels; i++) + nd_mapping->labels[i] = NULL; + + if (ndd->ns_current == -1 || ndd->ns_next == -1) + /* pass */; + else + return max(num_labels, old_num_labels); + + nsindex = to_namespace_index(ndd, 0); + memset(nsindex, 0, ndd->nsarea.config_size); + for (i = 0; i < 2; i++) { + int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + + if (rc) + return rc; + } + ndd->ns_next = 1; + ndd->ns_current = 0; + + return max(num_labels, old_num_labels); +} + +static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + u8 label_uuid[NSLABEL_UUID_LEN]; + int l, num_freed = 0; + unsigned long *free; + u32 nslot, slot; + + if (!uuid) + return 0; + + /* no index || no labels == nothing to delete */ + if 
(!preamble_next(ndd, &nsindex, &free, &nslot) + || !nd_mapping->labels) + return 0; + + for_each_label(l, nd_label, nd_mapping->labels) { + memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + slot = to_slot(ndd, nd_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + del_label(nd_mapping, l); + num_freed++; + l--; /* retry with new label at this index */ + } + + if (num_freed > l) { + /* + * num_freed will only ever be > l when we delete the last + * label + */ + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + } + + return nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); +} + +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + if (size == 0) { + rc = del_labels(nd_mapping, nspm->uuid); + if (rc) + return rc; + continue; + } + + rc = init_labels(nd_mapping, 1); + if (rc < 0) + return rc; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i); + if (rc) + return rc; + } + + return 0; +} + +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct resource *res; + int count = 0; + + if (size == 0) + return del_labels(nd_mapping, nsblk->uuid); + + for_each_dpa_resource(to_ndd(nd_mapping), res) + count++; + + count = init_labels(nd_mapping, count); + if (count < 0) + return count; + + return __blk_label_update(nd_region, nd_mapping, nsblk, count); +} diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h new file mode 100644 index 000000000000..a59ef6eef2a3 --- /dev/null +++ b/drivers/nvdimm/label.h @@ -0,0 +1,141 
@@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LABEL_H__ +#define __LABEL_H__ + +#include <linux/ndctl.h> +#include <linux/sizes.h> +#include <linux/io.h> + +enum { + NSINDEX_SIG_LEN = 16, + NSINDEX_ALIGN = 256, + NSINDEX_SEQ_MASK = 0x3, + NSLABEL_UUID_LEN = 16, + NSLABEL_NAME_LEN = 64, + NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */ + NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */ + NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */ + NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */ + BTT_ALIGN = 4096, /* all btt structures */ + BTTINFO_SIG_LEN = 16, + BTTINFO_UUID_LEN = 16, + BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */ + BTTINFO_MAJOR_VERSION = 1, + ND_LABEL_MIN_SIZE = 512 * 129, /* see sizeof_namespace_index() */ + ND_LABEL_ID_SIZE = 50, + ND_NSINDEX_INIT = 0x1, +}; + /* The 15 signature characters plus the explicit NUL make exactly NSINDEX_SIG_LEN (16) bytes, the size of the sig field in struct nd_namespace_index; the compiler appends one further terminator, so sizeof this array is 17. */ +static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0"; + +/** + * struct nd_namespace_index - label set superblock + * @sig: NAMESPACE_INDEX\0 + * @flags: placeholder + * @seq: sequence number for this index + * @myoff: offset of this index in label area + * @mysize: size of this index struct + * @otheroff: offset of other index + * @labeloff: offset of first label slot + * @nslot: total number of label slots + * @major: label area major version + * @minor: label area minor version + * @checksum: fletcher64 of all fields + * @free[0]: bitmap, nlabel bits + * + * The size of free[] is rounded up so the total struct size is a + * multiple of NSINDEX_ALIGN bytes.
Any bits this allocates beyond + * nlabel bits must be zero. + */ +struct nd_namespace_index { + u8 sig[NSINDEX_SIG_LEN]; + __le32 flags; + __le32 seq; + __le64 myoff; + __le64 mysize; + __le64 otheroff; + __le64 labeloff; + __le32 nslot; + __le16 major; + __le16 minor; + __le64 checksum; + u8 free[0]; +}; + +/** + * struct nd_namespace_label - namespace superblock + * @uuid: UUID per RFC 4122 + * @name: optional name (NUL-terminated) + * @flags: see NSLABEL_FLAG_* + * @nlabel: num labels to describe this ns + * @position: label's position in set + * @isetcookie: interleave set cookie + * @lbasize: LBA size in bytes or 0 for pmem + * @dpa: DPA of NVM range on this DIMM + * @rawsize: size of namespace + * @slot: slot of this label in label area + * @unused: must be zero + */ +struct nd_namespace_label { + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 isetcookie; + __le64 lbasize; + __le64 dpa; + __le64 rawsize; + __le32 slot; + __le32 unused; +}; + +/** + * struct nd_label_id - identifier string for dpa allocation + * @id: "{blk|pmem}-<namespace uuid>" + */ +struct nd_label_id { + char id[ND_LABEL_ID_SIZE]; +}; + +/* + * If the 'best' index is invalid, so is the 'next' index.
Otherwise, + * the next index is MOD(index+1, 2) + */ +static inline int nd_label_next_nsindex(int index) +{ + if (index < 0) /* negative input means no valid index; hand back -1 as the next index too */ + return -1; + + return (index + 1) % 2; /* 0 -> 1, 1 -> 0 */ +} + +struct nvdimm_drvdata; +int nd_label_validate(struct nvdimm_drvdata *ndd); +void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, + struct nd_namespace_index *src); +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); +int nd_label_active_count(struct nvdimm_drvdata *ndd); +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); +u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +struct nd_region; +struct nd_namespace_pmem; +struct nd_namespace_blk; +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size); +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size); +#endif /* __LABEL_H__ */ diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c new file mode 100644 index 000000000000..fef0dd80d4ad --- /dev/null +++ b/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,1870 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ */ +#include <linux/module.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + /* release callback of namespace_io_device_type: frees the nd_namespace_io obtained from @dev */ +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + /* release callback of namespace_pmem_device_type: frees alt_name and uuid, then the namespace object itself */ +static void namespace_pmem_release(struct device *dev) +{ + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + kfree(nspm->alt_name); + kfree(nspm->uuid); + kfree(nspm); +} + /* release callback of namespace_blk_device_type: returns a non-negative id to the parent region ns_ida, then frees alt_name, uuid, the res array and the namespace object */ +static void namespace_blk_release(struct device *dev) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (nsblk->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nsblk->id); + kfree(nsblk->alt_name); + kfree(nsblk->uuid); + kfree(nsblk->res); + kfree(nsblk); +} + /* The three device types below double as type tags: the is_namespace_*() helpers compare dev->type against their addresses. */ +static struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, +}; + +static struct device_type namespace_pmem_device_type = { + .name = "nd_namespace_pmem", + .release = namespace_pmem_release, +}; + +static struct device_type namespace_blk_device_type = { + .name = "nd_namespace_blk", + .release = namespace_blk_release, +}; + /* true when @dev is non-NULL and its type is namespace_pmem_device_type */ +static bool is_namespace_pmem(struct device *dev) +{ + return dev ? dev->type == &namespace_pmem_device_type : false; +} + /* true when @dev is non-NULL and its type is namespace_blk_device_type */ +static bool is_namespace_blk(struct device *dev) +{ + return dev ? dev->type == &namespace_blk_device_type : false; +} + /* true when @dev is non-NULL and its type is namespace_io_device_type */ +static bool is_namespace_io(struct device *dev) +{ + return dev ?
dev->type == &namespace_io_device_type : false; +} + /* Format the disk name of @ndns into @name and return @name: pmem<region id> for pmem and io namespaces, ndblk<region id>.<namespace id> for blk namespaces, each followed by an s suffix when the namespace is claimed by a btt. Returns NULL for any other device type. sprintf() is unbounded here, so the caller must supply a large enough buffer. */ +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = ""; + + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) + sprintf(name, "pmem%d%s", nd_region->id, suffix); + else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + /* sysfs show: prints the value of nd_region_to_nstype() for the parent region as a decimal */ +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + /* Replace the alt_name of a pmem or blk namespace with the strim()-ed contents of @buf. Returns @len on success, -ENXIO for other namespace types, -EBUSY while a driver is bound or the namespace is claimed, -EINVAL when the trimmed name plus terminator does not fit in NSLABEL_NAME_LEN, and -ENOMEM on allocation failure. NOTE(review): kmemdup() copies len + 1 bytes from @buf, so this presumably relies on the sysfs buffer being readable one byte past @len - confirm. */ +static ssize_t __alt_name_store(struct device *dev, const char *buf, + const size_t len) +{ + char *input, *pos, *alt_name, **ns_altname; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = &nspm->alt_name; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = &nsblk->alt_name; + } else + return -ENXIO; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + input = kmemdup(buf, len + 1, GFP_KERNEL); + if (!input) + return -ENOMEM; + + input[len] = '\0'; + pos = strim(input); + if (strlen(pos) + 1 > NSLABEL_NAME_LEN) { + rc = -EINVAL; + goto out; + } + + alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL); + if (!alt_name) { + rc = -ENOMEM; + goto out; + } + kfree(*ns_altname); + *ns_altname = alt_name; + sprintf(*ns_altname, "%s", pos); + rc = len; + +out: + kfree(input); + return rc; +} + /* Sum the sizes of the DPA resources on mapping[0] whose name equals the blk label id generated from nsblk->uuid; returns 0 when no uuid is set */ +static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) +{ + struct
nd_region *nd_region = to_nd_region(nsblk->common.dev.parent); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + resource_size_t size = 0; + struct resource *res; + + if (!nsblk->uuid) + return 0; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + size += resource_size(res); + return size; +} + /* Returns true only when the blk namespace has a uuid, a non-zero lbasize and dimm data on mapping[0], none of its DPA resources is flagged DPA_RESOURCE_ADJUSTED, the number of matching resources equals nsblk->num_resources, and every nsblk->res[] pointer is still present in the dimm resource list. */ +static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + int count, i; + + if (!nsblk->uuid || !nsblk->lbasize || !ndd) + return false; + + count = 0; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + /* + * Resources with unacknowledged adjustments indicate a + * failure to update labels + */ + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + count++; + } + + /* These values match after a successful label update */ + if (count != nsblk->num_resources) + return false; + + for (i = 0; i < nsblk->num_resources; i++) { + struct resource *found = NULL; + + for_each_dpa_resource(ndd, res) + if (res == nsblk->res[i]) { + found = res; + break; + } + /* stale resource */ + if (!found) + return false; + } + + return true; +} + /* Runs __nd_namespace_blk_validate() under the nvdimm bus lock. Despite the resource_size_t return type and the local named size, the value returned is the boolean validation result (1 or 0), not a byte count. */ +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + resource_size_t size; + + nvdimm_bus_lock(&nsblk->common.dev); + size = __nd_namespace_blk_validate(nsblk); + nvdimm_bus_unlock(&nsblk->common.dev); + + return size; +} +EXPORT_SYMBOL(nd_namespace_blk_validate); + + /* Rewrite the labels of namespace @dev after one of its attributes changed. Warns once and returns 0 without writing when a driver is bound or the namespace is claimed; returns -ENXIO for device types other than pmem and blk. */ +static int nd_namespace_label_update(struct nd_region *nd_region, + struct device *dev) +{ + dev_WARN_ONCE(dev, dev->driver
|| to_ndns(dev)->claim, + "namespace must be idle during label update\n"); + if (dev->driver || to_ndns(dev)->claim) + return 0; + + /* + * Only allow label writes that will result in a valid namespace + * or deletion of an existing namespace. + */ + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + resource_size_t size = resource_size(&nspm->nsio.res); + + if (size == 0 && nspm->uuid) + /* delete allocation */; + else if (!nspm->uuid) + return 0; + + return nd_pmem_namespace_label_update(nd_region, nspm, size); + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + resource_size_t size = nd_namespace_blk_size(nsblk); + + if (size == 0 && nsblk->uuid) + /* delete allocation */; + else if (!nsblk->uuid || !nsblk->lbasize) + return 0; + + return nd_blk_namespace_label_update(nd_region, nsblk, size); + } else + return -ENXIO; +} + /* sysfs store for alt_name: takes the device lock and then the nvdimm bus lock, calls wait_nvdimm_bus_probe_idle(), stores the new name and, if that succeeded, updates the labels. Returns @len on success or the negative errno from either step. */ +static ssize_t alt_name_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __alt_name_store(dev, buf, len); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? rc : len; +} + /* sysfs show for alt_name: prints the stored name followed by a newline, or just a newline when none is set; -ENXIO for namespace types other than pmem and blk. No lock is taken in this function. */ +static ssize_t alt_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *ns_altname; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = nspm->alt_name; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = nsblk->alt_name; + } else + return -ENXIO; + + return sprintf(buf, "%s\n", ns_altname ?
ns_altname : ""); +} +static DEVICE_ATTR_RW(alt_name); + /* Release @n bytes from the resources named @label_id on one mapping. Each pass picks the last matching resource in iteration order: while @n covers the whole resource it is freed and the scan repeats; otherwise that resource is shrunk by the remainder (a blk resource gives up its low end, any other its high end) and flagged DPA_RESOURCE_ADJUSTED. Returns 0, also when the matching resources run out before @n is consumed, or the adjust_resource() error. */ +static int scan_free(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int rc = 0; + + while (n) { + struct resource *res, *last; + resource_size_t new_start; + + last = NULL; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + last = res; + res = last; + if (!res) + return 0; + + if (n >= resource_size(res)) { + n -= resource_size(res); + nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc); + nvdimm_free_dpa(ndd, res); + /* retry with last resource deleted */ + continue; + } + + /* + * Keep BLK allocations relegated to high DPA as much as + * possible + */ + if (is_blk) + new_start = res->start + n; + else + new_start = res->start; + + rc = adjust_resource(res, new_start, resource_size(res) - n); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); + break; + } + + return rc; +} + +/** + * shrink_dpa_allocation - for each dimm in region free n bytes for label_id + * @nd_region: the set of dimms to reclaim @n bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to release + * + * Assumes resources are ordered. Starting from the end try to + * adjust_resource() the allocation to @n, but if @n is larger than the + * allocation delete it and find the 'new' last allocation in the label + * set.
+ */ +static int shrink_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_free(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + /* Make a single allocation of @n bytes for @label_id on this mapping: ending at the top of the mapping for a blk label id, starting at the base of the mapping otherwise. Follows the remaining-bytes convention of scan_allocate(): returns 0 on success and @n when nvdimm_allocate_dpa() fails. */ +static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, + struct nd_region *nd_region, struct nd_mapping *nd_mapping, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t first_dpa; + struct resource *res; + int rc = 0; + + /* allocate blk from highest dpa first */ + if (is_blk) + first_dpa = nd_mapping->start + nd_mapping->size - n; + else + first_dpa = nd_mapping->start; + + /* first resource allocation for this label-id or dimm */ + res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n); + if (!res) + rc = -EBUSY; + + nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc); + return rc ? n : 0; +} + +static bool space_valid(bool is_pmem, bool is_reserve, + struct nd_label_id *label_id, struct resource *res) +{ + /* + * For BLK-space any space is valid, for PMEM-space, it must be + * contiguous with an existing allocation unless we are + * reserving pmem.
+ */ + if (is_reserve || !is_pmem) + return true; + if (!res || strcmp(res->name, label_id->id) == 0) + return true; + return false; +} + /* Position of a free range relative to the resource being examined in scan_allocate(). ALLOC_ERR is 0 so that a plain !loc test means that no usable range has been found yet. */ +enum alloc_loc { + ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER, +}; + /* Try to allocate @n bytes for @label_id out of the free space of one mapping, growing neighbouring resources of the same label id where possible. Returns the number of bytes that could NOT be allocated, so 0 means complete success. */ +static resource_size_t scan_allocate(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + const resource_size_t to_allocate = n; + struct resource *res; + int first; + + retry: + first = 0; + for_each_dpa_resource(ndd, res) { + resource_size_t allocate, available = 0, free_start, free_end; + struct resource *next = res->sibling, *new_res = NULL; + enum alloc_loc loc = ALLOC_ERR; + const char *action; + int rc = 0; + + /* ignore resources outside this nd_mapping */ + if (res->start > mapping_end) + continue; + if (res->end < nd_mapping->start) + continue; + + /* space at the beginning of the mapping */ + if (!first++ && res->start > nd_mapping->start) { + free_start = nd_mapping->start; + available = res->start - free_start; + if (space_valid(is_pmem, is_reserve, label_id, NULL)) + loc = ALLOC_BEFORE; + } + + /* space between allocations */ + if (!loc && next) { + free_start = res->start + resource_size(res); + free_end = min(mapping_end, next->start - 1); + if (space_valid(is_pmem, is_reserve, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_MID; + } + } + + /* space at the end of the mapping */ + if (!loc && !next) { + free_start = res->start + resource_size(res); + free_end = mapping_end; + if (space_valid(is_pmem, is_reserve, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_AFTER; + } + } + + if (!loc || !available) + continue; + allocate =
min(available, n); + switch (loc) { + case ALLOC_BEFORE: + if (strcmp(res->name, label_id->id) == 0) { + /* adjust current resource up */ + if (is_pmem && !is_reserve) + return n; + rc = adjust_resource(res, res->start - allocate, + resource_size(res) + allocate); + action = "cur grow up"; + } else + action = "allocate"; + break; + case ALLOC_MID: + if (strcmp(next->name, label_id->id) == 0) { + /* adjust next resource up */ + if (is_pmem && !is_reserve) + return n; + rc = adjust_resource(next, next->start + - allocate, resource_size(next) + + allocate); + new_res = next; + action = "next grow up"; + } else if (strcmp(res->name, label_id->id) == 0) { + action = "grow down"; + } else + action = "allocate"; + break; + case ALLOC_AFTER: + if (strcmp(res->name, label_id->id) == 0) + action = "grow down"; + else + action = "allocate"; + break; + default: + return n; + } + /* action is both the debug label and the dispatch key: the two grow up cases were already applied inside the switch, the allocate and grow down cases are carried out here */ + if (strcmp(action, "allocate") == 0) { + /* BLK allocate bottom up */ + if (!is_pmem) + free_start += available - allocate; + else if (!is_reserve && free_start != nd_mapping->start) + return n; + + new_res = nvdimm_allocate_dpa(ndd, label_id, + free_start, allocate); + if (!new_res) + rc = -EBUSY; + } else if (strcmp(action, "grow down") == 0) { + /* adjust current resource down */ + rc = adjust_resource(res, res->start, resource_size(res) + + allocate); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + } + + if (!new_res) + new_res = res; + + nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n", + action, loc, rc); + + if (rc) + return n; + + n -= allocate; + if (n) { + /* + * Retry scan with newly inserted resources. + * For example, if we did an ALLOC_BEFORE + * insertion there may also have been space + * available for an ALLOC_AFTER insertion, so we + * need to check this same resource again + */ + goto retry; + } else + return 0; + } + + /* + * If we allocated nothing in the BLK case it may be because we are in + * an initial "pmem-reserve pass".
Only do an initial BLK allocation + * when none of the DPA space is reserved. + */ + if ((is_pmem || !ndd->dpa.child) && n == to_allocate) + return init_dpa_allocation(label_id, nd_region, nd_mapping, n); + return n; +} + /* Coalesce resources of @label_id that are siblings and contiguous in address: the second one is freed and the first is extended over it, then the scan restarts. A no-op for pmem label ids. Returns 0 or the adjust_resource() error. */ +static int merge_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + if (strncmp("pmem", label_id->id, 4) == 0) + return 0; + retry: + for_each_dpa_resource(ndd, res) { + int rc; + struct resource *next = res->sibling; + resource_size_t end = res->start + resource_size(res); + + if (!next || strcmp(res->name, label_id->id) != 0 + || strcmp(next->name, label_id->id) != 0 + || end != next->start) + continue; + end += resource_size(next); + nvdimm_free_dpa(ndd, next); + rc = adjust_resource(res, res->start, end - res->start); + nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc); + if (rc) + return rc; + res->flags |= DPA_RESOURCE_ADJUSTED; + goto retry; + } + + return 0; +} + /* device_for_each_child() callback, @data is the nvdimm of interest. For a pmem region with a mapping on that dimm, allocate all of the pmem DPA reported available for the first such mapping under the pmem-reserve label id. Returns 0 when there is nothing to do or the reservation was complete, -ENXIO when part of it could not be allocated. */ +static int __reserve_free_pmem(struct device *dev, void *data) +{ + struct nvdimm *nvdimm = data; + struct nd_region *nd_region; + struct nd_label_id label_id; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region->ndr_mappings == 0) + return 0; + + memset(&label_id, 0, sizeof(label_id)); + strcat(label_id.id, "pmem-reserve"); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t n, rem = 0; + + if (nd_mapping->nvdimm != nvdimm) + continue; + + n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem); + if (n == 0) + return 0; + rem = scan_allocate(nd_region, nd_mapping, &label_id, n); + dev_WARN_ONCE(&nd_region->dev, rem, + "pmem reserve underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + return rem ?
-ENXIO : 0; + } + + return 0; +} + /* Free every DPA resource named pmem-reserve on the dimm behind @nd_mapping; @nvdimm_bus is not used in this function */ +static void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *_res; + + for_each_dpa_resource_safe(ndd, res, _res) + if (strcmp(res->name, "pmem-reserve") == 0) + nvdimm_free_dpa(ndd, res); +} + /* Run __reserve_free_pmem() over the children of the bus device for the dimm behind @nd_mapping; if any child reports an error the reservations made so far on this dimm are released again and the error is returned */ +static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int rc; + + rc = device_for_each_child(&nvdimm_bus->dev, nvdimm, + __reserve_free_pmem); + if (rc) + release_free_pmem(nvdimm_bus, nd_mapping); + return rc; +} + +/** + * grow_dpa_allocation - for each dimm allocate n bytes for @label_id + * @nd_region: the set of dimms to allocate @n more bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to add to the existing allocation + * + * Assumes resources are ordered. For BLK regions, first consume + * BLK-only available DPA free space, then consume PMEM-aliased DPA + * space starting at the highest DPA. For PMEM regions start + * allocations from the start of an interleave set and end at the first + * BLK allocation or the end of the interleave set, whichever comes + * first.
+ */ +static int grow_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t rem = n; + int rc, j; + + /* + * In the BLK case try once with all unallocated PMEM + * reserved, and once without + */ + for (j = is_pmem; j < 2; j++) { + bool blk_only = j == 0; + + if (blk_only) { + rc = reserve_free_pmem(nvdimm_bus, nd_mapping); + if (rc) + return rc; + } + rem = scan_allocate(nd_region, nd_mapping, + label_id, rem); + if (blk_only) + release_free_pmem(nvdimm_bus, nd_mapping); + + /* try again and allow encroachments into PMEM */ + if (rem == 0) + break; + } + + dev_WARN_ONCE(&nd_region->dev, rem, + "allocation underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + if (rem) + return -ENXIO; + + rc = merge_dpa(nd_region, nd_mapping, label_id); + if (rc) + return rc; + } + + return 0; +} + /* Make the namespace resource span @size bytes from the region base address. With @size == 0 the end becomes start - 1, which resource_size() reports as an empty resource. */ +static void nd_namespace_pmem_set_size(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + struct resource *res = &nspm->nsio.res; + + res->start = nd_region->ndr_start; + res->end = nd_region->ndr_start + size - 1; +} + /* Resize the DPA allocation of namespace @dev to @val bytes in total, spread evenly over the mappings of the parent region. Returns 0 on success or when the size is unchanged, -EBUSY while a driver is bound or the namespace is claimed, -ENXIO without a uuid, without mappings or with a dimm that has no driver data, -EINVAL when @val is not a multiple of 4K times the number of mappings, -ENOSPC when @val exceeds the available plus already allocated capacity, or the error from the shrink or grow step. */ +static ssize_t __size_store(struct device *dev, unsigned long long val) +{ + resource_size_t allocated = 0, available = 0; + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_mapping *nd_mapping; + struct nvdimm_drvdata *ndd; + struct nd_label_id label_id; + u32 flags = 0, remainder; + u8 *uuid = NULL; + int rc, i; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk =
to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; + } + + /* + * We need a uuid for the allocation-label and dimm(s) on which + * to store the label. + */ + if (!uuid || nd_region->ndr_mappings == 0) + return -ENXIO; + + div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); + if (remainder) { + dev_dbg(dev, "%llu is not %dK aligned\n", val, + (SZ_4K * nd_region->ndr_mappings) / SZ_1K); + return -EINVAL; + } + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + ndd = to_ndd(nd_mapping); + + /* + * All dimms in an interleave set, or the base dimm for a blk + * region, need to be enabled for the size to be changed. + */ + if (!ndd) + return -ENXIO; + + allocated += nvdimm_allocated_dpa(ndd, &label_id); + } + available = nd_region_available_dpa(nd_region); + + if (val > available + allocated) + return -ENOSPC; + + if (val == allocated) + return 0; + /* up to here val and allocated are totals over all mappings; from here on both are per-mapping amounts */ + val = div_u64(val, nd_region->ndr_mappings); + allocated = div_u64(allocated, nd_region->ndr_mappings); + if (val < allocated) + rc = shrink_dpa_allocation(nd_region, &label_id, + allocated - val); + else + rc = grow_dpa_allocation(nd_region, &label_id, val - allocated); + + if (rc) + return rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + nd_namespace_pmem_set_size(nd_region, nspm, + val * nd_region->ndr_mappings); + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + /* + * Try to delete the namespace if we deleted all of its + * allocation, this is not the seed device for the + * region, and it is not actively claimed by a btt + * instance.
+ */ + if (val == 0 && nd_region->ns_seed != dev + && !nsblk->common.claim) + nd_device_unregister(dev, ND_ASYNC); + } + + return rc; +} + /* sysfs store for size: parses @buf with kstrtoull() (base auto-detected), then under the device lock and the nvdimm bus lock resizes the allocation and, if that succeeded, updates the labels. When both steps return 0 and the requested size is 0 the uuid of the namespace is freed and cleared, which is how a namespace is deleted. Returns @len or a negative errno. */ +static ssize_t size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + unsigned long long val; + u8 **uuid = NULL; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __size_store(dev, val); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = &nsblk->uuid; + } + + if (rc == 0 && val == 0 && uuid) { + /* setting size zero == 'delete namespace' */ + kfree(*uuid); + *uuid = NULL; + } + + dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0 + ? "fail" : "success", rc); + + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ?
rc : len; +} + /* Capacity of @ndns in bytes: the size of the embedded resource for pmem and io namespaces, the sum of the matching DPA resources for blk. Warns once and returns 0 for an unknown namespace type. Takes no lock itself; nvdimm_namespace_capacity() is the variant that takes the nvdimm bus lock. */ +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + struct device *dev = &ndns->dev; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return resource_size(&nspm->nsio.res); + } else if (is_namespace_blk(dev)) { + return nd_namespace_blk_size(to_nd_namespace_blk(dev)); + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return resource_size(&nsio->res); + } else + WARN_ONCE(1, "unknown namespace type\n"); + return 0; +} + /* __nvdimm_namespace_capacity() evaluated under the nvdimm bus lock */ +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + resource_size_t size; + + nvdimm_bus_lock(&ndns->dev); + size = __nvdimm_namespace_capacity(ndns); + nvdimm_bus_unlock(&ndns->dev); + + return size; +} +EXPORT_SYMBOL(nvdimm_namespace_capacity); + /* sysfs show for size: prints the namespace capacity in bytes as a decimal number */ +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long) + nvdimm_namespace_capacity(to_ndns(dev))); +} +static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); + /* sysfs show for uuid: prints the uuid with the %pUb format, or an empty line when none is set; -ENXIO for namespace types other than pmem and blk */ +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + } else + return -ENXIO; + + if (uuid) + return sprintf(buf, "%pUb\n", uuid); + return sprintf(buf, "\n"); +} + +/** + * namespace_update_uuid - check for a unique uuid and whether we're "renaming" + * @nd_region: parent region so we can update all dimms in the set + * @dev: namespace type for generating label_id + * @new_uuid: incoming uuid + * @old_uuid: reference to the uuid storage location in the namespace object + */ +static int namespace_update_uuid(struct nd_region *nd_region, + struct device *dev, u8 *new_uuid, u8 **old_uuid) +{ +
u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0; + struct nd_label_id old_label_id; + struct nd_label_id new_label_id; + int i; + + if (!nd_is_uuid_unique(dev, new_uuid)) + return -EINVAL; + + if (*old_uuid == NULL) + goto out; + + /* + * If we've already written a label with this uuid, then it's + * too late to rename because we can't reliably update the uuid + * without losing the old namespace. Userspace must delete this + * namespace to abandon the old uuid. + */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + /* + * This check by itself is sufficient because old_uuid + * would be NULL above if this uuid did not exist in the + * currently written set. + * + * FIXME: can we delete uuid with zero dpa allocated? + */ + if (nd_mapping->labels) + return -EBUSY; + } + + nd_label_gen_id(&old_label_id, *old_uuid, flags); + nd_label_gen_id(&new_label_id, new_uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, old_label_id.id) == 0) + sprintf((void *) res->name, "%s", + new_label_id.id); + } + kfree(*old_uuid); + out: /* on success the namespace object takes ownership of new_uuid; on the error returns above the caller still owns it */ + *old_uuid = new_uuid; + return 0; +} + /* sysfs store for uuid: under the device lock and the nvdimm bus lock, refuses with -EBUSY when the namespace is claimed, parses @buf with nd_uuid_store(), installs the new uuid and then updates the labels. The parsed uuid is freed here only when parsing or installing it failed. Returns @len or a negative errno. */ +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + u8 *uuid = NULL; + ssize_t rc = 0; + u8 **ns_uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_uuid = &nsblk->uuid; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc =
nd_uuid_store(dev, &uuid, buf, len); + if (rc >= 0) + rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + else + kfree(uuid); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + /* sysfs show for resource: prints the start address of the namespace resource in hex for pmem and io namespaces; -ENXIO for any other type or when the resource is empty */ +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct resource *res; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + res = &nspm->nsio.res; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + res = &nsio->res; + } else + return -ENXIO; + + /* no address to convey if the namespace has no allocation */ + if (resource_size(res) == 0) + return -ENXIO; + return sprintf(buf, "%#llx\n", (unsigned long long) res->start); +} +static DEVICE_ATTR_RO(resource); + /* zero-terminated list of the sector sizes handed to the nd_sector_size_show() and nd_sector_size_store() helpers */ +static const unsigned long ns_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + /* sysfs show for sector_size, blk namespaces only (-ENXIO otherwise); nsblk is computed before the type check but only dereferenced after it */ +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + if (!is_namespace_blk(dev)) + return -ENXIO; + + return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf); +} + /* sysfs store for sector_size, blk namespaces only: under the device lock and the nvdimm bus lock, refuses with -EBUSY when claimed, lets nd_sector_size_store() update nsblk->lbasize, then updates the labels. Returns @len when the final rc is 0, otherwise rc. */ +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc = 0; + + if (!is_namespace_blk(dev)) + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, + ns_lbasize_supported); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s: result:
%zd %s: %s%s", __func__, + rc, rc < 0 ? "tried" : "wrote", buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + /* sysfs show for dpa_extents: under the nvdimm bus lock, counts the DPA resources on all mappings of the parent region whose name equals the label id of this namespace; prints 0 when the namespace has no uuid or is neither pmem nor blk */ +static ssize_t dpa_extents_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_label_id label_id; + int count = 0, i; + u8 *uuid = NULL; + u32 flags = 0; + + nvdimm_bus_lock(dev); + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + flags = 0; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; + } + + if (!uuid) + goto out; + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + count++; + } + out: + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(dpa_extents); + /* sysfs show for holder: under the device lock, prints the device name of the claiming device, or an empty line when the namespace is unclaimed */ +static ssize_t holder_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + rc = sprintf(buf, "%s\n", ndns->claim ?
dev_name(ndns->claim) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(holder); + +static ssize_t force_raw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool force_raw; + int rc = strtobool(buf, &force_raw); + + if (rc) + return rc; + + to_ndns(dev)->force_raw = force_raw; + return len; +} + +static ssize_t force_raw_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_ndns(dev)->force_raw); +} +static DEVICE_ATTR_RW(force_raw); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + &dev_attr_size.attr, + &dev_attr_uuid.attr, + &dev_attr_holder.attr, + &dev_attr_resource.attr, + &dev_attr_alt_name.attr, + &dev_attr_force_raw.attr, + &dev_attr_sector_size.attr, + &dev_attr_dpa_extents.attr, + NULL, +}; + +static umode_t namespace_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (a == &dev_attr_resource.attr) { + if (is_namespace_blk(dev)) + return 0; + return a->mode; + } + + if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { + if (a == &dev_attr_size.attr) + return S_IWUSR | S_IRUGO; + + if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr) + return 0; + + return a->mode; + } + + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr + || a == &dev_attr_holder.attr + || a == &dev_attr_force_raw.attr) + return a->mode; + + return 0; +} + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, + .is_visible = namespace_visible, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) +{ + struct nd_btt *nd_btt = is_nd_btt(dev) ? 
to_nd_btt(dev) : NULL; + struct nd_namespace_common *ndns; + resource_size_t size; + + if (nd_btt) { + ndns = nd_btt->ndns; + if (!ndns) + return ERR_PTR(-ENODEV); + + /* + * Flush any in-progess probes / removals in the driver + * for the raw personality of this namespace. + */ + device_lock(&ndns->dev); + device_unlock(&ndns->dev); + if (ndns->dev.driver) { + dev_dbg(&ndns->dev, "is active, can't bind %s\n", + dev_name(&nd_btt->dev)); + return ERR_PTR(-EBUSY); + } + if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev, + "host (%s) vs claim (%s) mismatch\n", + dev_name(&nd_btt->dev), + dev_name(ndns->claim))) + return ERR_PTR(-ENXIO); + } else { + ndns = to_ndns(dev); + if (ndns->claim) { + dev_dbg(dev, "claimed by %s, failing probe\n", + dev_name(ndns->claim)); + + return ERR_PTR(-ENXIO); + } + } + + size = nvdimm_namespace_capacity(ndns); + if (size < ND_MIN_NAMESPACE_SIZE) { + dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n", + &size, ND_MIN_NAMESPACE_SIZE); + return ERR_PTR(-ENODEV); + } + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (!nspm->uuid) { + dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__); + return ERR_PTR(-ENODEV); + } + } else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + if (!nd_namespace_blk_validate(nsblk)) + return ERR_PTR(-ENODEV); + } + + return ndns; +} +EXPORT_SYMBOL(nvdimm_namespace_common_probe); + +static struct device **create_namespace_io(struct nd_region *nd_region) +{ + struct nd_namespace_io *nsio; + struct device *dev, **devs; + struct resource *res; + + nsio = kzalloc(sizeof(*nsio), GFP_KERNEL); + if (!nsio) + return NULL; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) { + kfree(nsio); + return NULL; + } + + dev = &nsio->common.dev; + dev->type = &namespace_io_device_type; + dev->parent = &nd_region->dev; + res = &nsio->res; + res->name = 
dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + res->start = nd_region->ndr_start; + res->end = res->start + nd_region->ndr_size - 1; + + devs[0] = dev; + return devs; +} + +static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, + u64 cookie, u16 pos) +{ + struct nd_namespace_label *found = NULL; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + bool found_uuid = false; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + u16 position = __le16_to_cpu(nd_label->position); + u16 nlabel = __le16_to_cpu(nd_label->nlabel); + + if (isetcookie != cookie) + continue; + + if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + + if (found_uuid) { + dev_dbg(to_ndd(nd_mapping)->dev, + "%s duplicate entry for uuid\n", + __func__); + return false; + } + found_uuid = true; + if (nlabel != nd_region->ndr_mappings) + continue; + if (position != pos) + continue; + found = nd_label; + break; + } + if (found) + break; + } + return found != NULL; +} + +static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) +{ + struct nd_namespace_label *select = NULL; + int i; + + if (!pmem_id) + return -ENODEV; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + u64 hw_start, hw_end, pmem_start, pmem_end; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) + if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) + break; + + if (!nd_label) { + WARN_ON(1); + return -EINVAL; + } + + select = nd_label; + /* + * Check that this label is compliant with the dpa + * range published in NFIT + */ + hw_start = nd_mapping->start; + hw_end = hw_start + nd_mapping->size; + pmem_start = __le64_to_cpu(select->dpa); + pmem_end = pmem_start + __le64_to_cpu(select->rawsize); + if 
(pmem_start == hw_start && pmem_end <= hw_end) + /* pass */; + else + return -EINVAL; + + nd_mapping->labels[0] = select; + nd_mapping->labels[1] = NULL; + } + return 0; +} + +/** + * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * @nd_region: region with mappings to validate + */ +static int find_pmem_label_set(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region); + struct nd_namespace_label *nd_label; + u8 select_id[NSLABEL_UUID_LEN]; + resource_size_t size = 0; + u8 *pmem_id = NULL; + int rc = -ENODEV, l; + u16 i; + + if (cookie == 0) + return -ENXIO; + + /* + * Find a complete set of labels by uuid. By definition we can start + * with any mapping as the reference label + */ + for_each_label(l, nd_label, nd_region->mapping[0].labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + + if (isetcookie != cookie) + continue; + + for (i = 0; nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, + cookie, i)) + break; + if (i < nd_region->ndr_mappings) { + /* + * Give up if we don't find an instance of a + * uuid at each position (from 0 to + * nd_region->ndr_mappings - 1), or if we find a + * dimm with two instances of the same uuid. + */ + rc = -EINVAL; + goto err; + } else if (pmem_id) { + /* + * If there is more than one valid uuid set, we + * need userspace to clean this up. + */ + rc = -EBUSY; + goto err; + } + memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); + pmem_id = select_id; + } + + /* + * Fix up each mapping's 'labels' to have the validated pmem label for + * that position at labels[0], and NULL at labels[1]. In the process, + * check that the namespace aligns with interleave-set. We know + * that it does not overlap with any blk namespaces by virtue of + * the dimm being enabled (i.e. nd_label_reserve_dpa() + * succeeded). 
+ */ + rc = select_pmem_id(nd_region, pmem_id); + if (rc) + goto err; + + /* Calculate total size and populate namespace properties from label0 */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *label0 = nd_mapping->labels[0]; + + size += __le64_to_cpu(label0->rawsize); + if (__le16_to_cpu(label0->position) != 0) + continue; + WARN_ON(nspm->alt_name || nspm->uuid); + nspm->alt_name = kmemdup((void __force *) label0->name, + NSLABEL_NAME_LEN, GFP_KERNEL); + nspm->uuid = kmemdup((void __force *) label0->uuid, + NSLABEL_UUID_LEN, GFP_KERNEL); + } + + if (!nspm->alt_name || !nspm->uuid) { + rc = -ENOMEM; + goto err; + } + + nd_namespace_pmem_set_size(nd_region, nspm, size); + + return 0; + err: + switch (rc) { + case -EINVAL: + dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); + break; + case -ENODEV: + dev_dbg(&nd_region->dev, "%s: label not found\n", __func__); + break; + default: + dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n", + __func__, rc); + break; + } + return rc; +} + +static struct device **create_namespace_pmem(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct device *dev, **devs; + struct resource *res; + int rc; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + rc = find_pmem_label_set(nd_region, nspm); + if (rc == -ENODEV) { + int i; + + /* Pass, try to permit namespace creation... */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + /* Publish a zero-sized namespace for userspace to configure. 
*/ + nd_namespace_pmem_set_size(nd_region, nspm, 0); + + rc = 0; + } else if (rc) + goto err; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) + goto err; + + devs[0] = dev; + return devs; + + err: + namespace_pmem_release(&nspm->nsio.common.dev); + return NULL; +} + +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start) +{ + struct nd_label_id label_id; + struct resource *res; + + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + res = krealloc(nsblk->res, + sizeof(void *) * (nsblk->num_resources + 1), + GFP_KERNEL); + if (!res) + return NULL; + nsblk->res = (struct resource **) res; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0 + && res->start == start) { + nsblk->res[nsblk->num_resources++] = res; + return res; + } + return NULL; +} + +static struct device *nd_namespace_blk_create(struct nd_region *nd_region) +{ + struct nd_namespace_blk *nsblk; + struct device *dev; + + if (!is_nd_blk(&nd_region->dev)) + return NULL; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return NULL; + + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nsblk->id < 0) { + kfree(nsblk); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + + return &nsblk->common.dev; +} + +void nd_region_create_blk_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->ns_seed) + dev_err(&nd_region->dev, "failed to create blk namespace\n"); + else + nd_device_register(nd_region->ns_seed); +} + 
+void nd_region_create_btt_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->btt_seed = nd_btt_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->btt_seed) + dev_err(&nd_region->dev, "failed to create btt namespace\n"); +} + +static struct device **create_namespace_blk(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nd_namespace_label *nd_label; + struct device *dev, **devs = NULL; + struct nd_namespace_blk *nsblk; + struct nvdimm_drvdata *ndd; + int i, l, count = 0; + struct resource *res; + + if (nd_region->ndr_mappings == 0) + return NULL; + + ndd = to_ndd(nd_mapping); + for_each_label(l, nd_label, nd_mapping->labels) { + u32 flags = __le32_to_cpu(nd_label->flags); + char *name[NSLABEL_NAME_LEN]; + struct device **__devs; + + if (flags & NSLABEL_FLAG_LOCAL) + /* pass */; + else + continue; + + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + if (memcmp(nsblk->uuid, nd_label->uuid, + NSLABEL_UUID_LEN) == 0) { + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->common.dev)); + break; + } + } + if (i < count) + continue; + __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); + if (!__devs) + goto err; + memcpy(__devs, devs, sizeof(dev) * count); + kfree(devs); + devs = __devs; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + dev_set_name(dev, "namespace%d.%d", nd_region->id, count); + devs[count++] = dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto err; + 
memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->common.dev)); + } + + dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", + __func__, count, count == 1 ? "" : "s"); + + if (count == 0) { + /* Publish a zero-sized namespace for userspace to configure. */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + devs = kcalloc(2, sizeof(dev), GFP_KERNEL); + if (!devs) + goto err; + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + devs[count++] = dev; + } + + return devs; + +err: + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + namespace_blk_release(&nsblk->common.dev); + } + kfree(devs); + return NULL; +} + +static int init_active_labels(struct nd_region *nd_region) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int count, j; + + /* + * If the dimm is disabled then prevent the region from + * being activated if it aliases DPA. 
+ */ + if (!ndd) { + if ((nvdimm->flags & NDD_ALIASING) == 0) + return 0; + dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", + dev_name(&nd_mapping->nvdimm->dev)); + return -ENXIO; + } + nd_mapping->ndd = ndd; + atomic_inc(&nvdimm->busy); + get_ndd(ndd); + + count = nd_label_active_count(ndd); + dev_dbg(ndd->dev, "%s: %d\n", __func__, count); + if (!count) + continue; + nd_mapping->labels = kcalloc(count + 1, sizeof(void *), + GFP_KERNEL); + if (!nd_mapping->labels) + return -ENOMEM; + for (j = 0; j < count; j++) { + struct nd_namespace_label *label; + + label = nd_label_active(ndd, j); + nd_mapping->labels[j] = label; + } + } + + return 0; +} + +int nd_region_register_namespaces(struct nd_region *nd_region, int *err) +{ + struct device **devs = NULL; + int i, rc = 0, type; + + *err = 0; + nvdimm_bus_lock(&nd_region->dev); + rc = init_active_labels(nd_region); + if (rc) { + nvdimm_bus_unlock(&nd_region->dev); + return rc; + } + + type = nd_region_to_nstype(nd_region); + switch (type) { + case ND_DEVICE_NAMESPACE_IO: + devs = create_namespace_io(nd_region); + break; + case ND_DEVICE_NAMESPACE_PMEM: + devs = create_namespace_pmem(nd_region); + break; + case ND_DEVICE_NAMESPACE_BLK: + devs = create_namespace_blk(nd_region); + break; + default: + break; + } + nvdimm_bus_unlock(&nd_region->dev); + + if (!devs) + return -ENODEV; + + for (i = 0; devs[i]; i++) { + struct device *dev = devs[i]; + int id; + + if (type == ND_DEVICE_NAMESPACE_BLK) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(dev); + id = ida_simple_get(&nd_region->ns_ida, 0, 0, + GFP_KERNEL); + nsblk->id = id; + } else + id = i; + + if (id < 0) + break; + dev_set_name(dev, "namespace%d.%d", nd_region->id, id); + dev->groups = nd_namespace_attribute_groups; + nd_device_register(dev); + } + if (i) + nd_region->ns_seed = devs[0]; + + if (devs[i]) { + int j; + + for (j = i; devs[j]; j++) { + struct device *dev = devs[j]; + + device_initialize(dev); + put_device(dev); + } + *err = 
j - i; + /* + * All of the namespaces we tried to register failed, so + * fail region activation. + */ + if (*err == 0) + rc = -ENODEV; + } + kfree(devs); + + if (rc == -ENODEV) + return rc; + + return i; +} diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h new file mode 100644 index 000000000000..e1970c71ad1c --- /dev/null +++ b/drivers/nvdimm/nd-core.h @@ -0,0 +1,83 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ND_CORE_H__ +#define __ND_CORE_H__ +#include <linux/libnvdimm.h> +#include <linux/device.h> +#include <linux/libnvdimm.h> +#include <linux/sizes.h> +#include <linux/mutex.h> +#include <linux/nd.h> + +extern struct list_head nvdimm_bus_list; +extern struct mutex nvdimm_bus_list_mutex; +extern int nvdimm_major; + +struct nvdimm_bus { + struct nvdimm_bus_descriptor *nd_desc; + wait_queue_head_t probe_wait; + struct module *module; + struct list_head list; + struct device dev; + int id, probe_active; + struct mutex reconfig_mutex; +}; + +struct nvdimm { + unsigned long flags; + void *provider_data; + unsigned long *dsm_mask; + struct device dev; + atomic_t busy; + int id; +}; + +bool is_nvdimm(struct device *dev); +bool is_nd_pmem(struct device *dev); +bool is_nd_blk(struct device *dev); +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); +int __init nvdimm_bus_init(void); +void nvdimm_bus_exit(void); +void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); +struct nd_region; +void nd_region_create_blk_seed(struct nd_region 
*nd_region); +void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); +void nd_synchronize(void); +int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus); +void __nd_device_register(struct device *dev); +int nd_match_dimm(struct device *dev, void *data); +struct nd_label_id; +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags); +bool nd_is_uuid_unique(struct device *dev, u8 *uuid); +struct nd_region; +struct nvdimm_drvdata; +struct nd_mapping; +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap); +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region); +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id); +struct nd_mapping; +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start); +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); +void get_ndd(struct nvdimm_drvdata *ndd); +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +#endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h new file mode 100644 index 000000000000..c41f53e74277 --- /dev/null +++ b/drivers/nvdimm/nd.h @@ -0,0 +1,220 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ND_H__ +#define __ND_H__ +#include <linux/libnvdimm.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/ndctl.h> +#include <linux/types.h> +#include "label.h" + +enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, + SECTOR_SHIFT = 9, + INT_LBASIZE_ALIGNMENT = 64, +}; + +struct nvdimm_drvdata { + struct device *dev; + int nsindex_size; + struct nd_cmd_get_config_size nsarea; + void *data; + int ns_current, ns_next; + struct resource dpa; + struct kref kref; +}; + +struct nd_region_namespaces { + int count; + int active; +}; + +static inline struct nd_namespace_index *to_namespace_index( + struct nvdimm_drvdata *ndd, int i) +{ + if (i < 0) + return NULL; + + return ndd->data + sizeof_namespace_index(ndd) * i; +} + +static inline struct nd_namespace_index *to_current_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_current); +} + +static inline struct nd_namespace_index *to_next_namespace_index( + struct nvdimm_drvdata *ndd) +{ + return to_namespace_index(ndd, ndd->ns_next); +} + +#define nd_dbg_dpa(r, d, res, fmt, arg...) \ + dev_dbg((r) ? &(r)->dev : (d)->dev, "%s: %.13s: %#llx @ %#llx " fmt, \ + (r) ? dev_name((d)->dev) : "", res ? res->name : "null", \ + (unsigned long long) (res ? resource_size(res) : 0), \ + (unsigned long long) (res ? res->start : 0), ##arg) + +#define for_each_label(l, label, labels) \ + for (l = 0; (label = labels ? 
labels[l] : NULL); l++) + +#define for_each_dpa_resource(ndd, res) \ + for (res = (ndd)->dpa.child; res; res = res->sibling) + +#define for_each_dpa_resource_safe(ndd, res, next) \ + for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ + res; res = next, next = next ? next->sibling : NULL) + +struct nd_percpu_lane { + int count; + spinlock_t lock; +}; + +struct nd_region { + struct device dev; + struct ida ns_ida; + struct ida btt_ida; + struct device *ns_seed; + struct device *btt_seed; + u16 ndr_mappings; + u64 ndr_size; + u64 ndr_start; + int id, num_lanes, ro, numa_node; + void *provider_data; + struct nd_interleave_set *nd_set; + struct nd_percpu_lane __percpu *lane; + struct nd_mapping mapping[0]; +}; + +struct nd_blk_region { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + void *blk_provider_data; + struct nd_region nd_region; +}; + +/* + * Lookup next in the repeating sequence of 01, 10, and 11. 
+ */ +static inline unsigned nd_inc_seq(unsigned seq) +{ + static const unsigned next[] = { 0, 2, 3, 1 }; + + return next[seq & 3]; +} + +struct btt; +struct nd_btt { + struct device dev; + struct nd_namespace_common *ndns; + struct btt *btt; + unsigned long lbasize; + u8 *uuid; + int id; +}; + +enum nd_async_mode { + ND_SYNC, + ND_ASYNC, +}; + +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size); +void wait_nvdimm_bus_probe_idle(struct device *dev); +void nd_device_register(struct device *dev); +void nd_device_unregister(struct device *dev, enum nd_async_mode mode); +int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, + size_t len); +ssize_t nd_sector_size_show(unsigned long current_lbasize, + const unsigned long *supported, char *buf); +ssize_t nd_sector_size_store(struct device *dev, const char *buf, + unsigned long *current_lbasize, const unsigned long *supported); +int __init nvdimm_init(void); +int __init nd_region_init(void); +void nvdimm_exit(void); +void nd_region_exit(void); +struct nvdimm; +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping); +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len); +struct nd_btt *to_nd_btt(struct device *dev); +struct btt_sb; +u64 nd_btt_sb_checksum(struct btt_sb *btt_sb); +#if IS_ENABLED(CONFIG_BTT) +int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata); +bool is_nd_btt(struct device *dev); +struct device *nd_btt_create(struct nd_region *nd_region); +#else +static inline nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + return -ENODEV; +} + +static inline bool is_nd_btt(struct device *dev) +{ + return false; +} + +static inline struct device *nd_btt_create(struct nd_region *nd_region) +{ + return NULL; +} + +#endif +struct nd_region *to_nd_region(struct device *dev); +int nd_region_to_nstype(struct 
nd_region *nd_region); +int nd_region_register_namespaces(struct nd_region *nd_region, int *err); +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); +void nvdimm_bus_lock(struct device *dev); +void nvdimm_bus_unlock(struct device *dev); +bool is_nvdimm_bus_locked(struct device *dev); +int nvdimm_revalidate_disk(struct gendisk *disk); +void nvdimm_drvdata_release(struct kref *kref); +void put_ndd(struct nvdimm_drvdata *ndd); +int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd); +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res); +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n); +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); +int nd_blk_region_init(struct nd_region *nd_region); +void __nd_iostat_start(struct bio *bio, unsigned long *start); +static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + if (!blk_queue_io_stat(disk->queue)) + return false; + + __nd_iostat_start(bio, start); + return true; +} +void nd_iostat_end(struct bio *bio, unsigned long start); +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk); +#endif /* __ND_H__ */ diff --git a/drivers/block/pmem.c b/drivers/nvdimm/pmem.c index 095dfaadcaa5..ade9eb917a4d 100644 --- a/drivers/block/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -1,7 +1,7 @@ /* * Persistent Memory Driver * - * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2014-2015, Intel Corporation. * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. 
* Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. * @@ -23,8 +23,9 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/slab.h> - -#define PMEM_MINORS 16 +#include <linux/pmem.h> +#include <linux/nd.h> +#include "nd.h" struct pmem_device { struct request_queue *pmem_queue; @@ -32,12 +33,11 @@ struct pmem_device { /* One contiguous memory region per device */ phys_addr_t phys_addr; - void *virt_addr; + void __pmem *virt_addr; size_t size; }; static int pmem_major; -static atomic_t pmem_index; static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, int rw, @@ -45,13 +45,14 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, { void *mem = kmap_atomic(page); size_t pmem_off = sector << 9; + void __pmem *pmem_addr = pmem->virt_addr + pmem_off; if (rw == READ) { - memcpy(mem + off, pmem->virt_addr + pmem_off, len); + memcpy_from_pmem(mem + off, pmem_addr, len); flush_dcache_page(page); } else { flush_dcache_page(page); - memcpy(pmem->virt_addr + pmem_off, mem + off, len); + memcpy_to_pmem(pmem_addr, mem + off, len); } kunmap_atomic(mem); @@ -59,31 +60,24 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, static void pmem_make_request(struct request_queue *q, struct bio *bio) { - struct block_device *bdev = bio->bi_bdev; - struct pmem_device *pmem = bdev->bd_disk->private_data; - int rw; + bool do_acct; + unsigned long start; struct bio_vec bvec; - sector_t sector; struct bvec_iter iter; - int err = 0; - - if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) { - err = -EIO; - goto out; - } - - BUG_ON(bio->bi_rw & REQ_DISCARD); + struct block_device *bdev = bio->bi_bdev; + struct pmem_device *pmem = bdev->bd_disk->private_data; - rw = bio_data_dir(bio); - sector = bio->bi_iter.bi_sector; - bio_for_each_segment(bvec, bio, iter) { + do_acct = nd_iostat_start(bio, &start); + bio_for_each_segment(bvec, bio, iter) pmem_do_bvec(pmem, bvec.bv_page, 
bvec.bv_len, bvec.bv_offset, - rw, sector); - sector += bvec.bv_len >> 9; - } + bio_data_dir(bio), iter.bi_sector); + if (do_acct) + nd_iostat_end(bio, start); -out: - bio_endio(bio, err); + if (bio_data_dir(bio)) + wmb_pmem(); + + bio_endio(bio, 0); } static int pmem_rw_page(struct block_device *bdev, sector_t sector, @@ -106,7 +100,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector, if (!pmem) return -ENODEV; - *kaddr = pmem->virt_addr + offset; + /* FIXME convert DAX to comprehend that this mapping has a lifetime */ + *kaddr = (void __force *) pmem->virt_addr + offset; *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; return pmem->size - offset; @@ -116,124 +111,165 @@ static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, .direct_access = pmem_direct_access, + .revalidate_disk = nvdimm_revalidate_disk, }; -static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) +static struct pmem_device *pmem_alloc(struct device *dev, + struct resource *res, int id) { struct pmem_device *pmem; - struct gendisk *disk; - int idx, err; - err = -ENOMEM; pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); if (!pmem) - goto out; + return ERR_PTR(-ENOMEM); pmem->phys_addr = res->start; pmem->size = resource_size(res); + if (!arch_has_pmem_api()) + dev_warn(dev, "unable to guarantee persistence of writes\n"); + + if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) { + dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", + &pmem->phys_addr, pmem->size); + kfree(pmem); + return ERR_PTR(-EBUSY); + } - err = -EINVAL; - if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { - dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); - goto out_free_dev; + pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size); + if (!pmem->virt_addr) { + release_mem_region(pmem->phys_addr, pmem->size); + kfree(pmem); + return ERR_PTR(-ENXIO); } - /* 
- * Map the memory as write-through, as we can't write back the contents - * of the CPU caches in case of a crash. - */ - err = -ENOMEM; - pmem->virt_addr = ioremap_wt(pmem->phys_addr, pmem->size); - if (!pmem->virt_addr) - goto out_release_region; + return pmem; +} + +static void pmem_detach_disk(struct pmem_device *pmem) +{ + del_gendisk(pmem->pmem_disk); + put_disk(pmem->pmem_disk); + blk_cleanup_queue(pmem->pmem_queue); +} + +static int pmem_attach_disk(struct nd_namespace_common *ndns, + struct pmem_device *pmem) +{ + struct gendisk *disk; pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); if (!pmem->pmem_queue) - goto out_unmap; + return -ENOMEM; blk_queue_make_request(pmem->pmem_queue, pmem_make_request); - blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); + blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue); - disk = alloc_disk(PMEM_MINORS); - if (!disk) - goto out_free_queue; - - idx = atomic_inc_return(&pmem_index) - 1; + disk = alloc_disk(0); + if (!disk) { + blk_cleanup_queue(pmem->pmem_queue); + return -ENOMEM; + } disk->major = pmem_major; - disk->first_minor = PMEM_MINORS * idx; + disk->first_minor = 0; disk->fops = &pmem_fops; disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", idx); - disk->driverfs_dev = dev; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + disk->driverfs_dev = &ndns->dev; set_capacity(disk, pmem->size >> 9); pmem->pmem_disk = disk; add_disk(disk); + revalidate_disk(disk); - return pmem; + return 0; +} -out_free_queue: - blk_cleanup_queue(pmem->pmem_queue); -out_unmap: - iounmap(pmem->virt_addr); -out_release_region: - release_mem_region(pmem->phys_addr, pmem->size); -out_free_dev: - kfree(pmem); -out: - return ERR_PTR(err); +static int pmem_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t 
size, int rw) +{ + struct pmem_device *pmem = dev_get_drvdata(ndns->claim); + + if (unlikely(offset + size > pmem->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) + memcpy_from_pmem(buf, pmem->virt_addr + offset, size); + else { + memcpy_to_pmem(pmem->virt_addr + offset, buf, size); + wmb_pmem(); + } + + return 0; } static void pmem_free(struct pmem_device *pmem) { - del_gendisk(pmem->pmem_disk); - put_disk(pmem->pmem_disk); - blk_cleanup_queue(pmem->pmem_queue); - iounmap(pmem->virt_addr); + memunmap_pmem(pmem->virt_addr); release_mem_region(pmem->phys_addr, pmem->size); kfree(pmem); } -static int pmem_probe(struct platform_device *pdev) +static int nd_pmem_probe(struct device *dev) { + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns; + struct nd_namespace_io *nsio; struct pmem_device *pmem; - struct resource *res; - - if (WARN_ON(pdev->num_resources > 1)) - return -ENXIO; + int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); - pmem = pmem_alloc(&pdev->dev, res); + nsio = to_nd_namespace_io(&ndns->dev); + pmem = pmem_alloc(dev, &nsio->res, nd_region->id); if (IS_ERR(pmem)) return PTR_ERR(pmem); - platform_set_drvdata(pdev, pmem); - - return 0; + dev_set_drvdata(dev, pmem); + ndns->rw_bytes = pmem_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, pmem) == 0) { + /* we'll come back as btt-pmem */ + rc = -ENXIO; + } else + rc = pmem_attach_disk(ndns, pmem); + if (rc) + pmem_free(pmem); + return rc; } -static int pmem_remove(struct platform_device *pdev) +static int nd_pmem_remove(struct device *dev) { - struct pmem_device *pmem = platform_get_drvdata(pdev); + struct pmem_device *pmem = dev_get_drvdata(dev); + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + 
else + pmem_detach_disk(pmem); pmem_free(pmem); + return 0; } -static struct platform_driver pmem_driver = { - .probe = pmem_probe, - .remove = pmem_remove, - .driver = { - .owner = THIS_MODULE, - .name = "pmem", +MODULE_ALIAS("pmem"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); +static struct nd_device_driver nd_pmem_driver = { + .probe = nd_pmem_probe, + .remove = nd_pmem_remove, + .drv = { + .name = "nd_pmem", }, + .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM, }; static int __init pmem_init(void) @@ -244,16 +280,19 @@ static int __init pmem_init(void) if (pmem_major < 0) return pmem_major; - error = platform_driver_register(&pmem_driver); - if (error) + error = nd_driver_register(&nd_pmem_driver); + if (error) { unregister_blkdev(pmem_major, "pmem"); - return error; + return error; + } + + return 0; } module_init(pmem_init); static void pmem_exit(void) { - platform_driver_unregister(&pmem_driver); + driver_unregister(&nd_pmem_driver.drv); unregister_blkdev(pmem_major, "pmem"); } module_exit(pmem_exit); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c new file mode 100644 index 000000000000..f28f78ccff19 --- /dev/null +++ b/drivers/nvdimm/region.c @@ -0,0 +1,114 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/nd.h> +#include "nd.h" + +static int nd_region_probe(struct device *dev) +{ + int err, rc; + static unsigned long once; + struct nd_region_namespaces *num_ns; + struct nd_region *nd_region = to_nd_region(dev); + + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + + rc = nd_blk_region_init(nd_region); + if (rc) + return rc; + + rc = nd_region_register_namespaces(nd_region, &err); + num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); + if (!num_ns) + return -ENOMEM; + + if (rc < 0) + return rc; + + num_ns->active = rc; + num_ns->count = rc + err; + dev_set_drvdata(dev, num_ns); + + if (rc && err && rc == err) + return -ENODEV; + + nd_region->btt_seed = nd_btt_create(nd_region); + if (err == 0) + return 0; + + /* + * Given multiple namespaces per region, we do not want to + * disable all the successfully registered peer namespaces upon + * a single registration failure. If userspace is missing a + * namespace that it expects it can disable/re-enable the region + * to retry discovery after correcting the failure. + * <regionX>/namespaces returns the current + * "<async-registered>/<total>" namespace count. + */ + dev_err(dev, "failed to register %d namespace%s, continuing...\n", + err, err == 1 ? 
"" : "s"); + return 0; +} + +static int child_unregister(struct device *dev, void *data) +{ + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +static int nd_region_remove(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + /* flush attribute readers and disable */ + nvdimm_bus_lock(dev); + nd_region->ns_seed = NULL; + nd_region->btt_seed = NULL; + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + + device_for_each_child(dev, NULL, child_unregister); + return 0; +} + +static struct nd_device_driver nd_region_driver = { + .probe = nd_region_probe, + .remove = nd_region_remove, + .drv = { + .name = "nd_region", + }, + .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM, +}; + +int __init nd_region_init(void) +{ + return nd_driver_register(&nd_region_driver); +} + +void nd_region_exit(void) +{ + driver_unregister(&nd_region_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c new file mode 100644 index 000000000000..a5233422f9dc --- /dev/null +++ b/drivers/nvdimm/region_devs.c @@ -0,0 +1,787 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +static DEFINE_IDA(region_ida); + +static void nd_region_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + u16 i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + put_device(&nvdimm->dev); + } + free_percpu(nd_region->lane); + ida_simple_remove(&region_ida, nd_region->id); + if (is_nd_blk(dev)) + kfree(to_nd_blk_region(dev)); + else + kfree(nd_region); +} + +static struct device_type nd_blk_device_type = { + .name = "nd_blk", + .release = nd_region_release, +}; + +static struct device_type nd_pmem_device_type = { + .name = "nd_pmem", + .release = nd_region_release, +}; + +static struct device_type nd_volatile_device_type = { + .name = "nd_volatile", + .release = nd_region_release, +}; + +bool is_nd_pmem(struct device *dev) +{ + return dev ? dev->type == &nd_pmem_device_type : false; +} + +bool is_nd_blk(struct device *dev) +{ + return dev ?
dev->type == &nd_blk_device_type : false; +} + +struct nd_region *to_nd_region(struct device *dev) +{ + struct nd_region *nd_region = container_of(dev, struct nd_region, dev); + + WARN_ON(dev->type->release != nd_region_release); + return nd_region; +} +EXPORT_SYMBOL_GPL(to_nd_region); + +struct nd_blk_region *to_nd_blk_region(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + WARN_ON(!is_nd_blk(dev)); + return container_of(nd_region, struct nd_blk_region, nd_region); +} +EXPORT_SYMBOL_GPL(to_nd_blk_region); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr) +{ + return ndbr->blk_provider_data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_provider_data); + +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data) +{ + ndbr->blk_provider_data = data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); + +/** + * nd_region_to_nstype() - region to an integer namespace type + * @nd_region: region-device to interrogate + * + * This is the 'nstype' attribute of a region as well, an input to the + * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match + * namespace devices with namespace drivers. 
+ */ +int nd_region_to_nstype(struct nd_region *nd_region) +{ + if (is_nd_pmem(&nd_region->dev)) { + u16 i, alias; + + for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (nvdimm->flags & NDD_ALIASING) + alias++; + } + if (alias) + return ND_DEVICE_NAMESPACE_PMEM; + else + return ND_DEVICE_NAMESPACE_IO; + } else if (is_nd_blk(&nd_region->dev)) { + return ND_DEVICE_NAMESPACE_BLK; + } + + return 0; +} +EXPORT_SYMBOL(nd_region_to_nstype); + +static int is_uuid_busy(struct device *dev, void *data) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + u8 *uuid = data; + + switch (nd_region_to_nstype(nd_region)) { + case ND_DEVICE_NAMESPACE_PMEM: { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + if (!nspm->uuid) + break; + if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + break; + } + case ND_DEVICE_NAMESPACE_BLK: { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + if (!nsblk->uuid) + break; + if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + break; + } + default: + break; + } + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, u8 *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = 
to_nd_region(dev); + unsigned long long size = 0; + + if (is_nd_pmem(dev)) { + size = nd_region->ndr_size; + } else if (nd_region->ndr_mappings == 1) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + size = nd_mapping->size; + } + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t mappings_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ndr_mappings); +} +static DEVICE_ATTR_RO(mappings); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t set_cookie_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (is_nd_pmem(dev) && nd_set) + /* pass, should be precluded by region_visible */; + else + return -ENXIO; + + return sprintf(buf, "%#llx\n", nd_set->cookie); +} +static DEVICE_ATTR_RO(set_cookie); + +resource_size_t nd_region_available_dpa(struct nd_region *nd_region) +{ + resource_size_t blk_max_overlap = 0, available, overlap; + int i; + + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + retry: + available = 0; + overlap = blk_max_overlap; + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + if (is_nd_pmem(&nd_region->dev)) { + available += nd_pmem_available_dpa(nd_region, + nd_mapping, &overlap); + if (overlap > blk_max_overlap) { + blk_max_overlap = overlap; + goto retry; + } + } else if (is_nd_blk(&nd_region->dev)) { + available += 
nd_blk_available_dpa(nd_mapping); + } + } + + return available; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + /* + * Flush in-flight updates and grab a snapshot of the available + * size. Of course, this value is potentially invalidated the + * memory nvdimm_bus_lock() is dropped, but that's userspace's + * problem to not race itself. + */ + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_available_dpa(nd_region); + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t init_namespaces_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region_namespaces *num_ns = dev_get_drvdata(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (num_ns) + rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count); + else + rc = -ENXIO; + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(init_namespaces); + +static ssize_t namespace_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(namespace_seed); + +static ssize_t btt_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->btt_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(btt_seed); + +static ssize_t read_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + 
struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ro); +} + +static ssize_t read_only_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool ro; + int rc = strtobool(buf, &ro); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + + nd_region->ro = ro; + return len; +} +static DEVICE_ATTR_RW(read_only); + +static struct attribute *nd_region_attributes[] = { + &dev_attr_size.attr, + &dev_attr_nstype.attr, + &dev_attr_mappings.attr, + &dev_attr_btt_seed.attr, + &dev_attr_read_only.attr, + &dev_attr_set_cookie.attr, + &dev_attr_available_size.attr, + &dev_attr_namespace_seed.attr, + &dev_attr_init_namespaces.attr, + NULL, +}; + +static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + int type = nd_region_to_nstype(nd_region); + + if (a != &dev_attr_set_cookie.attr + && a != &dev_attr_available_size.attr) + return a->mode; + + if ((type == ND_DEVICE_NAMESPACE_PMEM + || type == ND_DEVICE_NAMESPACE_BLK) + && a == &dev_attr_available_size.attr) + return a->mode; + else if (is_nd_pmem(dev) && nd_set) + return a->mode; + + return 0; +} + +struct attribute_group nd_region_attribute_group = { + .attrs = nd_region_attributes, + .is_visible = region_visible, +}; +EXPORT_SYMBOL_GPL(nd_region_attribute_group); + +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (nd_set) + return nd_set->cookie; + return 0; +} + +/* + * Upon successful probe/remove, take/release a reference on the + * associated interleave set (if present), and plant new btt + namespace + * seeds. Also, on the removal of a BLK region, notify the provider to + * disable the region. 
+ */ +static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, + struct device *dev, bool probe) +{ + struct nd_region *nd_region; + + if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) { + int i; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = nd_mapping->ndd; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + put_ndd(ndd); + nd_mapping->ndd = NULL; + if (ndd) + atomic_dec(&nvdimm->busy); + } + + if (is_nd_pmem(dev)) + return; + + to_nd_blk_region(dev)->disable(nvdimm_bus, dev); + } + if (dev->parent && is_nd_blk(dev->parent) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->ns_seed == dev) + nd_region_create_blk_seed(nd_region); + nvdimm_bus_unlock(dev); + } + if (is_nd_btt(dev) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->btt_seed == dev) + nd_region_create_btt_seed(nd_region); + nvdimm_bus_unlock(dev); + } +} + +void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, true); +} + +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, false); +} + +static ssize_t mappingN(struct device *dev, char *buf, int n) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_mapping *nd_mapping; + struct nvdimm *nvdimm; + + if (n >= nd_region->ndr_mappings) + return -ENXIO; + nd_mapping = &nd_region->mapping[n]; + nvdimm = nd_mapping->nvdimm; + + return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size); +} + +#define REGION_MAPPING(idx) \ +static ssize_t mapping##idx##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return mappingN(dev, buf, idx); \ +} \ 
+static DEVICE_ATTR_RO(mapping##idx) + +/* + * 32 should be enough for a while, even in the presence of socket + * interleave a 32-way interleave set is a degenerate case. + */ +REGION_MAPPING(0); +REGION_MAPPING(1); +REGION_MAPPING(2); +REGION_MAPPING(3); +REGION_MAPPING(4); +REGION_MAPPING(5); +REGION_MAPPING(6); +REGION_MAPPING(7); +REGION_MAPPING(8); +REGION_MAPPING(9); +REGION_MAPPING(10); +REGION_MAPPING(11); +REGION_MAPPING(12); +REGION_MAPPING(13); +REGION_MAPPING(14); +REGION_MAPPING(15); +REGION_MAPPING(16); +REGION_MAPPING(17); +REGION_MAPPING(18); +REGION_MAPPING(19); +REGION_MAPPING(20); +REGION_MAPPING(21); +REGION_MAPPING(22); +REGION_MAPPING(23); +REGION_MAPPING(24); +REGION_MAPPING(25); +REGION_MAPPING(26); +REGION_MAPPING(27); +REGION_MAPPING(28); +REGION_MAPPING(29); +REGION_MAPPING(30); +REGION_MAPPING(31); + +static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nd_region *nd_region = to_nd_region(dev); + + if (n < nd_region->ndr_mappings) + return a->mode; + return 0; +} + +static struct attribute *mapping_attributes[] = { + &dev_attr_mapping0.attr, + &dev_attr_mapping1.attr, + &dev_attr_mapping2.attr, + &dev_attr_mapping3.attr, + &dev_attr_mapping4.attr, + &dev_attr_mapping5.attr, + &dev_attr_mapping6.attr, + &dev_attr_mapping7.attr, + &dev_attr_mapping8.attr, + &dev_attr_mapping9.attr, + &dev_attr_mapping10.attr, + &dev_attr_mapping11.attr, + &dev_attr_mapping12.attr, + &dev_attr_mapping13.attr, + &dev_attr_mapping14.attr, + &dev_attr_mapping15.attr, + &dev_attr_mapping16.attr, + &dev_attr_mapping17.attr, + &dev_attr_mapping18.attr, + &dev_attr_mapping19.attr, + &dev_attr_mapping20.attr, + &dev_attr_mapping21.attr, + &dev_attr_mapping22.attr, + &dev_attr_mapping23.attr, + &dev_attr_mapping24.attr, + &dev_attr_mapping25.attr, + &dev_attr_mapping26.attr, + &dev_attr_mapping27.attr, + &dev_attr_mapping28.attr, + &dev_attr_mapping29.attr, + 
&dev_attr_mapping30.attr, + &dev_attr_mapping31.attr, + NULL, +}; + +struct attribute_group nd_mapping_attribute_group = { + .is_visible = mapping_visible, + .attrs = mapping_attributes, +}; +EXPORT_SYMBOL_GPL(nd_mapping_attribute_group); + +int nd_blk_region_init(struct nd_region *nd_region) +{ + struct device *dev = &nd_region->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!is_nd_blk(dev)) + return 0; + + if (nd_region->ndr_mappings < 1) { + dev_err(dev, "invalid BLK region\n"); + return -ENXIO; + } + + return to_nd_blk_region(dev)->enable(nvdimm_bus, dev); +} + +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. 
+ */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + cpu = get_cpu(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = get_cpu(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + put_cpu(); + } + put_cpu(); +} +EXPORT_SYMBOL(nd_region_release_lane); + +static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc, struct device_type *dev_type, + const char *caller) +{ + struct nd_region *nd_region; + struct device *dev; + void *region_buf; + unsigned int i; + int ro = 0; + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", + caller, dev_name(&nvdimm->dev), i); + + return NULL; + } + + if (nvdimm->flags & NDD_UNARMED) + ro = 1; + } + + if (dev_type == &nd_blk_device_type) { + struct nd_blk_region_desc *ndbr_desc; + struct nd_blk_region *ndbr; + + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + if (ndbr) { + nd_region = &ndbr->nd_region; + ndbr->enable = ndbr_desc->enable; + ndbr->disable = ndbr_desc->disable; + ndbr->do_io = 
ndbr_desc->do_io; + } + region_buf = ndbr; + } else { + nd_region = kzalloc(sizeof(struct nd_region) + + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + region_buf = nd_region; + } + + if (!region_buf) + return NULL; + nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; + } + + memcpy(nd_region->mapping, ndr_desc->nd_mapping, + sizeof(struct nd_mapping) * ndr_desc->num_mappings); + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + get_device(&nvdimm->dev); + } + nd_region->ndr_mappings = ndr_desc->num_mappings; + nd_region->provider_data = ndr_desc->provider_data; + nd_region->nd_set = ndr_desc->nd_set; + nd_region->num_lanes = ndr_desc->num_lanes; + nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; + ida_init(&nd_region->ns_ida); + ida_init(&nd_region->btt_ida); + dev = &nd_region->dev; + dev_set_name(dev, "region%d", nd_region->id); + dev->parent = &nvdimm_bus->dev; + dev->type = dev_type; + dev->groups = ndr_desc->attr_groups; + nd_region->ndr_size = resource_size(ndr_desc->res); + nd_region->ndr_start = ndr_desc->res->start; + nd_device_register(dev); + + return nd_region; + + err_percpu: + ida_simple_remove(&region_ida, nd_region->id); + err_id: + kfree(region_buf); + return NULL; +} + +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); + +struct nd_region
*nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + if (ndr_desc->num_mappings > 1) + return NULL; + ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES); + return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); + +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e9851add6f4e..964ad572aaee 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1056,19 +1056,21 @@ struct vfio_devices { static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) { struct vfio_devices *devs = data; - struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); - - if (pci_drv != &vfio_pci_driver) - return -EBUSY; + struct vfio_device *device; if (devs->cur_index == devs->max_index) return -ENOSPC; - devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev); - if (!devs->devices[devs->cur_index]) + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) return -EINVAL; - devs->cur_index++; + if (pci_dev_driver(pdev) != &vfio_pci_driver) { + vfio_device_put(device); + return -EBUSY; + } + + devs->devices[devs->cur_index++] = device; return 0; } diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index 9a4403e2a36c..bb30128782aa 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -1,6 +1,6 @@ config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && EVENTFD && ARM + depends on VFIO && EVENTFD && (ARM || ARM64) select VFIO_VIRQFD help Support for platform devices with VFIO. 
This is required to make @@ -18,3 +18,5 @@ config VFIO_AMBA framework. If you don't know what to do here, say N. + +source "drivers/vfio/platform/reset/Kconfig" diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile index 81de144c0eaa..9ce8afe28450 100644 --- a/drivers/vfio/platform/Makefile +++ b/drivers/vfio/platform/Makefile @@ -2,7 +2,9 @@ vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o +obj-$(CONFIG_VFIO_PLATFORM) += reset/ vfio-amba-y := vfio_amba.o obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o +obj-$(CONFIG_VFIO_AMBA) += reset/ diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig new file mode 100644 index 000000000000..746b96b0003b --- /dev/null +++ b/drivers/vfio/platform/reset/Kconfig @@ -0,0 +1,7 @@ +config VFIO_PLATFORM_CALXEDAXGMAC_RESET + tristate "VFIO support for calxeda xgmac reset" + depends on VFIO_PLATFORM + help + Enables the VFIO platform driver to handle reset for Calxeda xgmac + + If you don't know what to do here, say N. diff --git a/drivers/vfio/platform/reset/Makefile b/drivers/vfio/platform/reset/Makefile new file mode 100644 index 000000000000..2a486af9f8fa --- /dev/null +++ b/drivers/vfio/platform/reset/Makefile @@ -0,0 +1,5 @@ +vfio-platform-calxedaxgmac-y := vfio_platform_calxedaxgmac.o + +ccflags-y += -Idrivers/vfio/platform + +obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c new file mode 100644 index 000000000000..619dc7d22082 --- /dev/null +++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -0,0 +1,86 @@ +/* + * VFIO platform driver specialized for Calxeda xgmac reset + * reset code is inherited from calxeda xgmac native driver + * + * Copyright 2010-2011 Calxeda, Inc. + * Copyright (c) 2015 Linaro Ltd. 
+ * www.linaro.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Eric Auger <eric.auger@linaro.org>" +#define DRIVER_DESC "Reset support for Calxeda xgmac vfio platform device" + +#define CALXEDAXGMAC_COMPAT "calxeda,hb-xgmac" + +/* XGMAC Register definitions */ +#define XGMAC_CONTROL 0x00000000 /* MAC Configuration */ + +/* DMA Control and Status Registers */ +#define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */ +#define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */ + +/* DMA Control registe defines */ +#define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */ +#define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */ + +/* Common MAC defines */ +#define MAC_ENABLE_TX 0x00000008 /* Transmitter Enable */ +#define MAC_ENABLE_RX 0x00000004 /* Receiver Enable */ + +static inline void xgmac_mac_disable(void __iomem *ioaddr) +{ + u32 value = readl(ioaddr + XGMAC_DMA_CONTROL); + + value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR); + writel(value, ioaddr + XGMAC_DMA_CONTROL); + + value = readl(ioaddr + XGMAC_CONTROL); + value &= ~(MAC_ENABLE_TX | MAC_ENABLE_RX); + writel(value, ioaddr + XGMAC_CONTROL); +} + +int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev) +{ + struct 
vfio_platform_region reg = vdev->regions[0]; + + if (!reg.ioaddr) { + reg.ioaddr = + ioremap_nocache(reg.addr, reg.size); + if (!reg.ioaddr) + return -ENOMEM; + } + + /* disable IRQ */ + writel(0, reg.ioaddr + XGMAC_DMA_INTR_ENA); + + /* Disable the MAC core */ + xgmac_mac_disable(reg.ioaddr); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_calxedaxgmac_reset); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index abcff7a1aa66..e43efb5e92bf 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -25,6 +25,44 @@ static DEFINE_MUTEX(driver_lock); +static const struct vfio_platform_reset_combo reset_lookup_table[] = { + { + .compat = "calxeda,hb-xgmac", + .reset_function_name = "vfio_platform_calxedaxgmac_reset", + .module_name = "vfio-platform-calxedaxgmac", + }, +}; + +static void vfio_platform_get_reset(struct vfio_platform_device *vdev, + struct device *dev) +{ + const char *compat; + int (*reset)(struct vfio_platform_device *); + int ret, i; + + ret = device_property_read_string(dev, "compatible", &compat); + if (ret) + return; + + for (i = 0 ; i < ARRAY_SIZE(reset_lookup_table); i++) { + if (!strcmp(reset_lookup_table[i].compat, compat)) { + request_module(reset_lookup_table[i].module_name); + reset = __symbol_get( + reset_lookup_table[i].reset_function_name); + if (reset) { + vdev->reset = reset; + return; + } + } + } +} + +static void vfio_platform_put_reset(struct vfio_platform_device *vdev) +{ + if (vdev->reset) + symbol_put_addr(vdev->reset); +} + static int vfio_platform_regions_init(struct vfio_platform_device *vdev) { int cnt = 0, i; @@ -100,6 +138,8 @@ static void vfio_platform_release(void *device_data) mutex_lock(&driver_lock); if (!(--vdev->refcnt)) { + if (vdev->reset) + vdev->reset(vdev); 
vfio_platform_regions_cleanup(vdev); vfio_platform_irq_cleanup(vdev); } @@ -127,6 +167,9 @@ static int vfio_platform_open(void *device_data) ret = vfio_platform_irq_init(vdev); if (ret) goto err_irq; + + if (vdev->reset) + vdev->reset(vdev); } vdev->refcnt++; @@ -159,6 +202,8 @@ static long vfio_platform_ioctl(void *device_data, if (info.argsz < minsz) return -EINVAL; + if (vdev->reset) + vdev->flags |= VFIO_DEVICE_FLAGS_RESET; info.flags = vdev->flags; info.num_regions = vdev->num_regions; info.num_irqs = vdev->num_irqs; @@ -252,8 +297,12 @@ static long vfio_platform_ioctl(void *device_data, return ret; - } else if (cmd == VFIO_DEVICE_RESET) - return -EINVAL; + } else if (cmd == VFIO_DEVICE_RESET) { + if (vdev->reset) + return vdev->reset(vdev); + else + return -EINVAL; + } return -ENOTTY; } @@ -502,6 +551,8 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, return ret; } + vfio_platform_get_reset(vdev, dev); + mutex_init(&vdev->igate); return 0; @@ -513,8 +564,11 @@ struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) struct vfio_platform_device *vdev; vdev = vfio_del_group_dev(dev); - if (vdev) + + if (vdev) { + vfio_platform_put_reset(vdev); iommu_group_put(dev->iommu_group); + } return vdev; } diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 5d31e0473406..1c9b3d59543c 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -67,6 +67,13 @@ struct vfio_platform_device { struct resource* (*get_resource)(struct vfio_platform_device *vdev, int i); int (*get_irq)(struct vfio_platform_device *vdev, int i); + int (*reset)(struct vfio_platform_device *vdev); +}; + +struct vfio_platform_reset_combo { + const char *compat; + const char *reset_function_name; + const char *module_name; }; extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, diff --git a/drivers/vfio/vfio.c 
b/drivers/vfio/vfio.c index e1278fe04b1e..2fb29dfeffbd 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -661,18 +661,29 @@ int vfio_add_group_dev(struct device *dev, EXPORT_SYMBOL_GPL(vfio_add_group_dev); /** - * Get a reference to the vfio_device for a device that is known to - * be bound to a vfio driver. The driver implicitly holds a - * vfio_device reference between vfio_add_group_dev and - * vfio_del_group_dev. We can therefore use drvdata to increment - * that reference from the struct device. This additional - * reference must be released by calling vfio_device_put. + * Get a reference to the vfio_device for a device. Even if the + * caller thinks they own the device, they could be racing with a + * release call path, so we can't trust drvdata for the shortcut. + * Go the long way around, from the iommu_group to the vfio_group + * to the vfio_device. */ struct vfio_device *vfio_device_get_from_dev(struct device *dev) { - struct vfio_device *device = dev_get_drvdata(dev); + struct iommu_group *iommu_group; + struct vfio_group *group; + struct vfio_device *device; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return NULL; - vfio_device_get(device); + group = vfio_group_get_from_iommu(iommu_group); + iommu_group_put(iommu_group); + if (!group) + return NULL; + + device = vfio_group_get_device(group, dev); + vfio_group_put(group); return device; } diff --git a/fs/Makefile b/fs/Makefile index cb92fd4c3172..cb20e4bf2303 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ -obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_TRACING) += tracefs/ diff --git a/fs/block_dev.c b/fs/block_dev.c index f04c873a7365..4fe10f93db8a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct 
inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } -inline struct block_device *I_BDEV(struct inode *inode) +struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } @@ -377,7 +377,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); } @@ -408,7 +408,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index df9932b00d08..1ce06c849a86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ec2ee477f8ba..b0b093b6afec 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 614aaa1969bd..802fabb30e15 
100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if 
(IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. @@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). 
+ * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -934,9 +953,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1034,7 +1054,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0f11ebc92f02..54114b4887dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1864,8 +1867,9 @@ 
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f364e1d8d3d..80a9aefb0c46 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1619,10 +1619,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1698,6 +1695,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1735,7 +1733,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -3458,6 +3456,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
bytes, u64 write_bytes) void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3515,6 +3514,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -4050,6 +4052,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4065,10 +4068,12 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4111,11 +4116,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. 
*/ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4132,6 +4143,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8f8ed7d20bac..ac3e81da6d4e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. 
- */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. 
- */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - 
} - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. - */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. 
*/ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == 
BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + 
qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. 
+ */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto 
free_head_ref; } head_ref->extent_op = extent_op; @@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info 
*fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5eb0892396d0..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref share the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste of memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /* data/tree ref use list, stored in ref_head->ref_list. */ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; @@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * Make qgroup skip the given root.
+ * This is for snapshot, as btrfs_qgroup_inherit() will manually + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0573848c7333..862fbc206755 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_error(root->fs_info, ret, "kobj add dev failed"); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0bccf18dc1dc..3f43bfea3684 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; @@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret =
-ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -2320,8 +2319,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2494,7 +2497,6 @@ int open_ctree(struct super_block *sb, seqlock_init(&fs_info->profiles_lock); init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2797,8 +2799,8 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; @@ -2834,8 +2836,8 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + 
!extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); @@ -2874,10 +2876,22 @@ retry_root_backup: btrfs_close_extra_devices(fs_devices, 1); + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } + ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -3055,6 +3069,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3725,6 +3742,7 @@ void close_ctree(struct btrfs_root *root) } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4053,6 +4071,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4068,11 +4087,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 
0ec3acd14cbf..38b76cc02f48 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. 
- */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - 
extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else { BUG(); } @@ -2323,28 +2293,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + if (list_empty(&head->ref_list)) + return NULL; - /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. - */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->action == BTRFS_ADD_DELAYED_REF) - return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); - } - return last; + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2352,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. 
If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2429,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2443,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2864,9 +2811,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2905,7 +2849,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2934,11 +2877,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3693,7 +3632,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += 
bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3721,7 +3661,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3975,6 +3918,9 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4088,7 +4034,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -4102,24 +4048,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * Reserve space in the system space info for the chunk tree updates needed + * to allocate or remove a chunk: num_devs device items to update and one + * chunk item to add or remove. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve.
+ */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -4130,7 +4095,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -5188,6 +5167,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). 
+ */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -6092,11 +6089,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -6110,10 +6106,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6294,7 +6292,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6356,18 +6353,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; 
- + ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6393,7 +6378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -7303,13 +7288,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. */ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7391,14 +7369,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7755,12 +7725,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify related function to add related node/leaf to dirty_extent_root, + * for later qgroup accounting. + * + * Currently, this function does nothing.
+ */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7783,13 +7759,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7858,6 +7827,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including complete shared node) + * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7920,7 +7891,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7931,16 +7906,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -8151,7 +8116,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if 
(!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8533,24 +8500,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling. This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8604,14 +8553,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -9562,6 +9503,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. 
+ */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9569,6 +9523,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c374e1e71e5f..02d05817cbdf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -4490,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b072e17479aa..795d754327a7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 
0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9dbe5b548fa6..fb5a6b1c62a6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, { int ret = 0; struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; if (!path) { ret = -ENOMEM; @@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, } if (block_group) { + locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); @@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - mutex_unlock(&trans->transaction->cache_write_mutex); - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); 
- if (block_group) - mutex_unlock(&trans->transaction->cache_write_mutex); - fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bb013672aee..855935f6671a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4986,24 +4986,40 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that didn't get their end io callbacks called yet or they are still + * in progress ((extent_io.c:end_bio_extent_readpage()). This means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. 
+ */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1c22c6518504..c86b835da7a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -553,8 +553,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -1318,7 +1318,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be @@ -1368,7 +1368,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -2271,10 +2271,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = 
memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2282,13 +2279,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2413,8 +2425,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_inode; } - d_invalidate(dentry); - down_write(&root->fs_info->subvol_sem); err = may_destroy_subvol(dest); @@ -2508,7 +2518,7 @@ out_up_write: out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { - shrink_dcache_sb(root->fs_info->sb); + d_invalidate(dentry); btrfs_invalidate_inodes(dest); d_delete(dentry); ASSERT(dest->send_in_progress == 0); @@ -2879,12 +2889,19 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2892,10 +2909,11 @@ static int 
extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; /* * btrfs_clone() can't handle extents in the same file @@ -2910,11 +2928,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, btrfs_double_lock(src, loff, dst, dst_loff, len); - ret = extent_same_check_offsets(src, loff, len); + ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, len); + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); if (ret) goto out_unlock; @@ -2927,7 +2945,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff); out_unlock: btrfs_double_unlock(src, loff, dst, dst_loff, len); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 760c4a5e096b..89656d799ff6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 
+502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. + */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -844,6 +851,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd805..7176cc0fe43f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. 
Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3d6546581bb9..d5f1f033b7a0 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. 
*/ @@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct ulist *tmp; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1356,239 +1388,86 @@ out: return ret; } -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; + int ret = 0; -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). 
+ */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); + if (ret < 0) + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; + return ret; } -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - spin_lock(&fs_info->qgroup_op_lock); - p = &fs_info->qgroup_op_tree.rb_node; while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } - } - rb_link_node(&oper->n, parent, p); - 
rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. 
- */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; - } - } - - ret = insert_qgroup_oper(fs_info, oper); - if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; -} - -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *tmp; - int sign = 0; - int ret = 0; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, - oper->num_bytes, sign); -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. 
*/ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. - */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1603,29 +1482,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. 
- */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1644,161 +1504,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. 
If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. - */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. 
*/ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. 
+ * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). + * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1810,423 +1555,191 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. 
- */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; - - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. 
- */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } + } if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. 
- */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. 
- * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, 
&elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. 
Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); - if (ret < 0) - goto out; + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. + * Bump qgroup_seq to avoid seq overlap */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - - - /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. - */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. 
- */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = SEQ_LIST_INIT(elem); - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. 
- */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. 
- */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). 
+ */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. 
- */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2637,15 +2150,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2695,33 +2206,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = 
btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2735,7 +2228,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; @@ -2743,12 +2235,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2764,7 +2250,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2774,8 +2260,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c5242aa9a4b2..6387dcfa354c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. 
- * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 
bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74b24b01d574..827951fbf7fc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? -ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, 
block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ab5811545a98..9f2feabe99f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +{ + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); + struct scrub_ctx *sctx = sparity->sctx; + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + static void scrub_parity_bio_endio(struct bio *bio, int error) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - struct scrub_ctx *sctx = sparity->sctx; if (error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -3589,6 +3601,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) { + ret = -ENOMEM; + goto out; + } } ++fs_info->scrub_workers_refcnt; out: @@ -3601,6 +3620,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } 
WARN_ON(fs_info->scrub_workers_refcnt < 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a1216f9b4917..aa72bfd28f7d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1158,6 +1159,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. 
+ * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; 
while (*p) { parent = *p; @@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3353,8 +3386,40 @@ out: return ret; } +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3374,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). 
*/ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { - ret = 1; + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3420,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3589,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3609,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. 
*/ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4508,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9e66f5e724db..cd7ef34d2dce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. 
*/ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. 
+ * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. 
+ */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } } + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); + } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; + path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. 
*/ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - if (!(sb->s_flags & MS_RDONLY)) { - int ret; - down_read(&fs_info->cleanup_work_sem); - ret = btrfs_orphan_cleanup(new_root); - up_read(&fs_info->cleanup_work_sem); - if (ret) - return ERR_PTR(ret); - } - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && d_inode(sb->s_root) == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. 
+ * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; + char *buf, *dst, *sep; - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; - - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. 
- */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = 
ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. 
*/ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e8a4c86d274d..603b0cc2b9bb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct 
btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if 
(fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if 
(!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. + * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + 
btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3a4bbed723fd..6392527bcc15 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index c32a7ba76bca..846d277b1901 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include "../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist 
*new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't have the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old
roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - 
BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5628e25250c0..c0f18e7266b6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -225,12 +225,14 @@ loop: cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.pending_csums = 0; 
cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -509,6 +511,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { @@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. 
- */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Record old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting.
+ */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2123,6 +2161,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0b24755596ba..eb09c2067fa8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -102,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_unlock(&BTRFS_I(inode)->lock); } +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. 
+ */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9578..a4b9c8b2d35a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d04968374e9d..1ce80c1c4eb6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b2778a..91feb2bdefee 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, 
&ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604bbe..a01a2c45825f 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int 
ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 53af23f2c087..4b438b4c8c91 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * Todo: This won't be enough. What if the same device + * comes back (with new uuid and) with its mapper path? + * But for now, this does help as mostly an admin will + * either use mapper or non mapper path throughout. 
+ */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1067,15 +1132,31 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. 
If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. + */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } if (search_list == &trans->transaction->pending_chunks) { @@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - 
btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -3908,9 +3996,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3965,6 +4053,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -4045,15 +4134,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4065,6 +4145,35 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. 
However before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not complete yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'. + */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans, device, &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4079,6 +4188,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -6072,6 +6191,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6191,10 +6312,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, 
dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6722,3 +6844,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cedae0356558..95842a909e7f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -535,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif @@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, out: i_mmap_unlock_read(mapping); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks 
+ * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. + */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions. + */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) + complete_unwritten(&bh, !error); out: if (error == -ENOMEM) @@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. 
*/ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3a0a6c6406d0..3b57c9f83c9b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -28,12 +28,12 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac517f15741c..bc313ac5d3fa 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -192,15 +192,27 @@ out: } #ifdef CONFIG_FS_DAX +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); /* Is this the right get_block? 
*/ } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f8a8d4ee7459..41f8e55afcd1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -656,18 +656,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. 
+ */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile deleted file mode 100644 index 3a982bd975d2..000000000000 --- a/fs/hppfs/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# -# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL -# - -obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c deleted file mode 100644 index 2867837909a9..000000000000 --- a/fs/hppfs/hppfs.c +++ /dev/null @@ -1,765 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/ctype.h> -#include <linux/dcache.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/statfs.h> -#include <linux/types.h> -#include <linux/pid_namespace.h> -#include <linux/namei.h> -#include <asm/uaccess.h> -#include <os.h> - -static struct inode *get_inode(struct super_block *, struct dentry *); - -struct hppfs_data { - struct list_head list; - char contents[PAGE_SIZE - sizeof(struct list_head)]; -}; - -struct hppfs_private { - struct file *proc_file; - int host_fd; - loff_t len; - struct hppfs_data *contents; -}; - -struct hppfs_inode_info { - struct dentry *proc_dentry; - struct inode vfs_inode; -}; - -static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) -{ - return container_of(inode, struct hppfs_inode_info, vfs_inode); -} - -#define HPPFS_SUPER_MAGIC 0xb00000ee - -static const struct super_operations hppfs_sbops; - -static int is_pid(struct dentry *dentry) -{ - struct super_block *sb; - int i; - - sb = dentry->d_sb; - if (dentry->d_parent != sb->s_root) - 
return 0; - - for (i = 0; i < dentry->d_name.len; i++) { - if (!isdigit(dentry->d_name.name[i])) - return 0; - } - return 1; -} - -static char *dentry_name(struct dentry *dentry, int extra) -{ - struct dentry *parent; - char *root, *name; - const char *seg_name; - int len, seg_len, root_len; - - len = 0; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) - len += strlen("pid") + 1; - else len += parent->d_name.len + 1; - parent = parent->d_parent; - } - - root = "proc"; - root_len = strlen(root); - len += root_len; - name = kmalloc(len + extra + 1, GFP_KERNEL); - if (name == NULL) - return NULL; - - name[len] = '\0'; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) { - seg_name = "pid"; - seg_len = strlen(seg_name); - } - else { - seg_name = parent->d_name.name; - seg_len = parent->d_name.len; - } - - len -= seg_len + 1; - name[len] = '/'; - memcpy(&name[len + 1], seg_name, seg_len); - parent = parent->d_parent; - } - memcpy(name, root, root_len); - return name; -} - -static int file_removed(struct dentry *dentry, const char *file) -{ - char *host_file; - int extra, fd; - - extra = 0; - if (file != NULL) - extra += strlen(file) + 1; - - host_file = dentry_name(dentry, extra + strlen("/remove")); - if (host_file == NULL) { - printk(KERN_ERR "file_removed : allocation failed\n"); - return -ENOMEM; - } - - if (file != NULL) { - strcat(host_file, "/"); - strcat(host_file, file); - } - strcat(host_file, "/remove"); - - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - kfree(host_file); - if (fd > 0) { - os_close_file(fd); - return 1; - } - return 0; -} - -static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *proc_dentry, *parent; - struct qstr *name = &dentry->d_name; - struct inode *inode; - int err, deleted; - - deleted = file_removed(dentry, NULL); - if (deleted < 0) - return ERR_PTR(deleted); - else if (deleted) - return 
ERR_PTR(-ENOENT); - - parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&d_inode(parent)->i_mutex); - proc_dentry = lookup_one_len(name->name, parent, name->len); - mutex_unlock(&d_inode(parent)->i_mutex); - - if (IS_ERR(proc_dentry)) - return proc_dentry; - - err = -ENOMEM; - inode = get_inode(ino->i_sb, proc_dentry); - if (!inode) - goto out; - - d_add(dentry, inode); - return NULL; - - out: - return ERR_PTR(err); -} - -static const struct inode_operations hppfs_file_iops = { -}; - -static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, - loff_t *ppos, int is_user) -{ - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - ssize_t n; - - read = file_inode(file)->i_fop->read; - - if (!is_user) - set_fs(KERNEL_DS); - - n = (*read)(file, buf, count, &file->f_pos); - - if (!is_user) - set_fs(USER_DS); - - if (ppos) - *ppos = file->f_pos; - return n; -} - -static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) -{ - ssize_t n; - int cur, err; - char *new_buf; - - n = -ENOMEM; - new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (new_buf == NULL) { - printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); - goto out; - } - n = 0; - while (count > 0) { - cur = min_t(ssize_t, count, PAGE_SIZE); - err = os_read_file(fd, new_buf, cur); - if (err < 0) { - printk(KERN_ERR "hppfs_read : read failed, " - "errno = %d\n", err); - n = err; - goto out_free; - } else if (err == 0) - break; - - if (copy_to_user(buf, new_buf, err)) { - n = -EFAULT; - goto out_free; - } - n += err; - count -= err; - } - out_free: - kfree(new_buf); - out: - return n; -} - -static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - struct hppfs_private *hppfs = file->private_data; - struct hppfs_data *data; - loff_t off; - int err; - - if (hppfs->contents != NULL) { - int rem; - - if (*ppos >= hppfs->len) - return 0; - - data = hppfs->contents; - off = *ppos; - while (off >= sizeof(data->contents)) { - data = 
list_entry(data->list.next, struct hppfs_data, - list); - off -= sizeof(data->contents); - } - - if (off + count > hppfs->len) - count = hppfs->len - off; - rem = copy_to_user(buf, &data->contents[off], count); - *ppos += count - rem; - if (rem > 0) - return -EFAULT; - } else if (hppfs->host_fd != -1) { - err = os_seek_file(hppfs->host_fd, *ppos); - if (err) { - printk(KERN_ERR "hppfs_read : seek failed, " - "errno = %d\n", err); - return err; - } - err = hppfs_read_file(hppfs->host_fd, buf, count); - if (err < 0) { - printk(KERN_ERR "hppfs_read: read failed: %d\n", err); - return err; - } - count = err; - if (count > 0) - *ppos += count; - } - else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); - - return count; -} - -static ssize_t hppfs_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - write = file_inode(proc_file)->i_fop->write; - return (*write)(proc_file, buf, len, ppos); -} - -static int open_host_sock(char *host_file, int *filter_out) -{ - char *end; - int fd; - - end = &host_file[strlen(host_file)]; - strcpy(end, "/rw"); - *filter_out = 1; - fd = os_connect_socket(host_file); - if (fd > 0) - return fd; - - strcpy(end, "/r"); - *filter_out = 0; - fd = os_connect_socket(host_file); - return fd; -} - -static void free_contents(struct hppfs_data *head) -{ - struct hppfs_data *data; - struct list_head *ele, *next; - - if (head == NULL) - return; - - list_for_each_safe(ele, next, &head->list) { - data = list_entry(ele, struct hppfs_data, list); - kfree(data); - } - kfree(head); -} - -static struct hppfs_data *hppfs_get_data(int fd, int filter, - struct file *proc_file, - struct file *hppfs_file, - loff_t *size_out) -{ - struct hppfs_data *data, *new, *head; - int n, err; - - err = -ENOMEM; - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == 
NULL) { - printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); - goto failed; - } - - INIT_LIST_HEAD(&data->list); - - head = data; - *size_out = 0; - - if (filter) { - while ((n = read_proc(proc_file, data->contents, - sizeof(data->contents), NULL, 0)) > 0) - os_write_file(fd, data->contents, n); - err = os_shutdown_socket(fd, 0, 1); - if (err) { - printk(KERN_ERR "hppfs_get_data : failed to shut down " - "socket\n"); - goto failed_free; - } - } - while (1) { - n = os_read_file(fd, data->contents, sizeof(data->contents)); - if (n < 0) { - err = n; - printk(KERN_ERR "hppfs_get_data : read failed, " - "errno = %d\n", err); - goto failed_free; - } else if (n == 0) - break; - - *size_out += n; - - if (n < sizeof(data->contents)) - break; - - new = kmalloc(sizeof(*data), GFP_KERNEL); - if (new == 0) { - printk(KERN_ERR "hppfs_get_data : data allocation " - "failed\n"); - err = -ENOMEM; - goto failed_free; - } - - INIT_LIST_HEAD(&new->list); - list_add(&new->list, &data->list); - data = new; - } - return head; - - failed_free: - free_contents(head); - failed: - return ERR_PTR(err); -} - -static struct hppfs_private *hppfs_data(void) -{ - struct hppfs_private *data; - - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return data; - - *data = ((struct hppfs_private ) { .host_fd = -1, - .len = -1, - .contents = NULL } ); - return data; -} - -static int file_mode(int fmode) -{ - if (fmode == (FMODE_READ | FMODE_WRITE)) - return O_RDWR; - if (fmode == FMODE_READ) - return O_RDONLY; - if (fmode == FMODE_WRITE) - return O_WRONLY; - return 0; -} - -static int hppfs_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - char *host_file; - int err, fd, type, filter; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - host_file = dentry_name(file->f_path.dentry, strlen("/rw")); - if (host_file == NULL) - goto out_free2; - - path.mnt = 
inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free1; - - type = os_file_type(host_file); - if (type == OS_TYPE_FILE) { - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - if (fd >= 0) - data->host_fd = fd; - else - printk(KERN_ERR "hppfs_open : failed to open '%s', " - "errno = %d\n", host_file, -fd); - - data->contents = NULL; - } else if (type == OS_TYPE_DIR) { - fd = open_host_sock(host_file, &filter); - if (fd > 0) { - data->contents = hppfs_get_data(fd, filter, - data->proc_file, - file, &data->len); - if (!IS_ERR(data->contents)) - data->host_fd = fd; - } else - printk(KERN_ERR "hppfs_open : failed to open a socket " - "in '%s', errno = %d\n", host_file, -fd); - } - kfree(host_file); - - file->private_data = data; - return 0; - - out_free1: - kfree(host_file); - out_free2: - free_contents(data->contents); - kfree(data); - out: - return err; -} - -static int hppfs_dir_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - int err; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - path.mnt = inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free; - - file->private_data = data; - return 0; - - out_free: - kfree(data); - out: - return err; -} - -static loff_t hppfs_llseek(struct file *file, loff_t off, int where) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - loff_t (*llseek)(struct file *, loff_t, int); - loff_t ret; - - llseek = file_inode(proc_file)->i_fop->llseek; - if (llseek != NULL) { - ret = (*llseek)(proc_file, off, where); - if (ret < 0) - return 
ret; - } - - return default_llseek(file, off, where); -} - -static int hppfs_release(struct inode *inode, struct file *file) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - if (proc_file) - fput(proc_file); - kfree(data); - return 0; -} - -static const struct file_operations hppfs_file_fops = { - .owner = NULL, - .llseek = hppfs_llseek, - .read = hppfs_read, - .write = hppfs_write, - .open = hppfs_open, - .release = hppfs_release, -}; - -struct hppfs_dirent { - struct dir_context ctx; - struct dir_context *caller; - struct dentry *dentry; -}; - -static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, - loff_t offset, u64 inode, unsigned int type) -{ - struct hppfs_dirent *dirent = - container_of(ctx, struct hppfs_dirent, ctx); - - if (file_removed(dirent->dentry, name)) - return 0; - - dirent->caller->pos = dirent->ctx.pos; - return !dir_emit(dirent->caller, name, size, inode, type); -} - -static int hppfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - struct hppfs_dirent d = { - .ctx.actor = hppfs_filldir, - .caller = ctx, - .dentry = file->f_path.dentry - }; - int err; - proc_file->f_pos = ctx->pos; - err = iterate_dir(proc_file, &d.ctx); - ctx->pos = d.ctx.pos; - return err; -} - -static const struct file_operations hppfs_dir_fops = { - .owner = NULL, - .iterate = hppfs_readdir, - .open = hppfs_dir_open, - .llseek = default_llseek, - .release = hppfs_release, -}; - -static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) -{ - sf->f_blocks = 0; - sf->f_bfree = 0; - sf->f_bavail = 0; - sf->f_files = 0; - sf->f_ffree = 0; - sf->f_type = HPPFS_SUPER_MAGIC; - return 0; -} - -static struct inode *hppfs_alloc_inode(struct super_block *sb) -{ - struct hppfs_inode_info *hi; - - hi = kmalloc(sizeof(*hi), GFP_KERNEL); - if (!hi) - return NULL; - - hi->proc_dentry = NULL; - 
inode_init_once(&hi->vfs_inode); - return &hi->vfs_inode; -} - -void hppfs_evict_inode(struct inode *ino) -{ - clear_inode(ino); - dput(HPPFS_I(ino)->proc_dentry); - mntput(ino->i_sb->s_fs_info); -} - -static void hppfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(HPPFS_I(inode)); -} - -static void hppfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hppfs_i_callback); -} - -static const struct super_operations hppfs_sbops = { - .alloc_inode = hppfs_alloc_inode, - .destroy_inode = hppfs_destroy_inode, - .evict_inode = hppfs_evict_inode, - .statfs = hppfs_statfs, -}; - -static int hppfs_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer, - buflen); -} - -static const char *hppfs_follow_link(struct dentry *dentry, void **cookie) -{ - struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - - return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie); -} - -static void hppfs_put_link(struct inode *inode, void *cookie) -{ - struct inode *proc_inode = d_inode(HPPFS_I(inode)->proc_dentry); - - if (proc_inode->i_op->put_link) - proc_inode->i_op->put_link(proc_inode, cookie); -} - -static const struct inode_operations hppfs_dir_iops = { - .lookup = hppfs_lookup, -}; - -static const struct inode_operations hppfs_link_iops = { - .readlink = hppfs_readlink, - .follow_link = hppfs_follow_link, - .put_link = hppfs_put_link, -}; - -static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) -{ - struct inode *proc_ino = d_inode(dentry); - struct inode *inode = new_inode(sb); - - if (!inode) { - dput(dentry); - return NULL; - } - - if (d_is_dir(dentry)) { - inode->i_op = &hppfs_dir_iops; - inode->i_fop = &hppfs_dir_fops; - } else if (d_is_symlink(dentry)) { - inode->i_op = &hppfs_link_iops; - inode->i_fop = 
&hppfs_file_fops; - } else { - inode->i_op = &hppfs_file_iops; - inode->i_fop = &hppfs_file_fops; - } - - HPPFS_I(inode)->proc_dentry = dentry; - - inode->i_uid = proc_ino->i_uid; - inode->i_gid = proc_ino->i_gid; - inode->i_atime = proc_ino->i_atime; - inode->i_mtime = proc_ino->i_mtime; - inode->i_ctime = proc_ino->i_ctime; - inode->i_ino = proc_ino->i_ino; - inode->i_mode = proc_ino->i_mode; - set_nlink(inode, proc_ino->i_nlink); - inode->i_size = proc_ino->i_size; - inode->i_blocks = proc_ino->i_blocks; - - return inode; -} - -static int hppfs_fill_super(struct super_block *sb, void *d, int silent) -{ - struct inode *root_inode; - struct vfsmount *proc_mnt; - int err = -ENOENT; - - proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); - if (IS_ERR(proc_mnt)) - goto out; - - sb->s_blocksize = 1024; - sb->s_blocksize_bits = 10; - sb->s_magic = HPPFS_SUPER_MAGIC; - sb->s_op = &hppfs_sbops; - sb->s_fs_info = proc_mnt; - - err = -ENOMEM; - root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) - goto out_mntput; - - return 0; - - out_mntput: - mntput(proc_mnt); - out: - return(err); -} - -static struct dentry *hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data) -{ - return mount_nodev(type, flags, data, hppfs_fill_super); -} - -static struct file_system_type hppfs_type = { - .owner = THIS_MODULE, - .name = "hppfs", - .mount = hppfs_read_super, - .kill_sb = kill_anon_super, - .fs_flags = 0, -}; -MODULE_ALIAS_FS("hppfs"); - -static int __init init_hppfs(void) -{ - return register_filesystem(&hppfs_type); -} - -static void __exit exit_hppfs(void) -{ - unregister_filesystem(&hppfs_type); -} - -module_init(init_hppfs) -module_exit(exit_hppfs) -MODULE_LICENSE("GPL"); diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155be8..52b492721603 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -538,6 +538,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, 
const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 516162be1398..f9e9ffe6fb46 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -149,13 +149,27 @@ xfs_alloc_compute_aligned( { xfs_agblock_t bno; xfs_extlen_t len; + xfs_extlen_t diff; /* Trim busy sections out of found extent */ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + /* + * If we have a largish extent that happens to start before min_agbno, + * see if we can shift it into range... + */ + if (bno < args->min_agbno && bno + len > args->min_agbno) { + diff = args->min_agbno - bno; + if (len > diff) { + bno += diff; + len -= diff; + } + } + if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); - xfs_extlen_t diff = aligned_bno - bno; + + diff = aligned_bno - bno; *resbno = aligned_bno; *reslen = diff >= len ? 0 : len - diff; @@ -795,9 +809,13 @@ xfs_alloc_find_best_extent( * The good extent is closer than this one. 
*/ if (!dir) { + if (*sbnoa > args->max_agbno) + goto out_use_good; if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { + if (*sbnoa < args->min_agbno) + goto out_use_good; if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near( dofirst = prandom_u32() & 1; #endif + /* handle unitialized agbno range so caller doesn't have to */ + if (!args->min_agbno && !args->max_agbno) + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; + ASSERT(args->min_agbno <= args->max_agbno); + + /* clamp agbno to the range if it's outside */ + if (args->agbno < args->min_agbno) + args->agbno = args->min_agbno; + if (args->agbno > args->max_agbno) + args->agbno = args->max_agbno; + restart: bno_cur_lt = NULL; bno_cur_gt = NULL; @@ -976,6 +1005,8 @@ restart: &ltbnoa, &ltlena); if (ltlena < args->minlen) continue; + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) + continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); @@ -1096,11 +1127,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, &ltbnoa, &ltlena); - if (ltlena >= args->minlen) + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; - if (!i) { + if (!i || ltbnoa < args->min_agbno) { xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); bno_cur_lt = NULL; @@ -1112,11 +1143,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, &gtbnoa, &gtlena); - if (gtlena >= args->minlen) + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; - if (!i) { + if (!i || gtbnoa > args->max_agbno) { xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); bno_cur_gt = NULL; @@ -1216,6 +1247,7 @@ restart: ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); 
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, @@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels( xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_perag *pag, + xfs_extlen_t need) { - xfs_extlen_t need, delta = 0; + xfs_extlen_t delta = 0; - need = XFS_MIN_FREELIST_PAG(pag, mp); if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; @@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +unsigned int +xfs_alloc_min_freelist( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + unsigned int min_free; + + /* space needed by-bno freespace btree */ + min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + mp->m_ag_maxlevels); + /* space needed by-size freespace btree */ + min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + mp->m_ag_maxlevels); + + return min_free; +} + +/* + * Check if the operation we are fixing up the freelist for should go ahead or + * not. If we are freeing blocks, we always allow it, otherwise the allocation + * is dependent on whether the size and shape of free space available will + * permit the requested allocation to take place. + */ +static bool +xfs_alloc_space_available( + struct xfs_alloc_arg *args, + xfs_extlen_t min_free, + int flags) +{ + struct xfs_perag *pag = args->pag; + xfs_extlen_t longest; + int available; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return true; + + /* do we have enough contiguous free space for the allocation? */ + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + return false; + + /* do have enough free space remaining for the allocation? 
*/ + available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + min_free - args->total); + if (available < (int)args->minleft) + return false; + + return true; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ STATIC int /* error */ xfs_alloc_fix_freelist( - xfs_alloc_arg_t *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + struct xfs_alloc_arg *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ { - xfs_buf_t *agbp; /* agf buffer pointer */ - xfs_agf_t *agf; /* a.g. freespace structure pointer */ - xfs_buf_t *agflbp;/* agfl buffer pointer */ - xfs_agblock_t bno; /* freelist block */ - xfs_extlen_t delta; /* new blocks needed in freelist */ - int error; /* error result code */ - xfs_extlen_t longest;/* longest extent in allocation group */ - xfs_mount_t *mp; /* file system mount point structure */ - xfs_extlen_t need; /* total blocks needed in freelist */ - xfs_perag_t *pag; /* per-ag information structure */ - xfs_alloc_arg_t targs; /* local allocation arguments */ - xfs_trans_t *tp; /* transaction pointer */ - - mp = args->mp; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag = args->pag; + struct xfs_trans *tp = args->tp; + struct xfs_buf *agbp = NULL; + struct xfs_buf *agflbp = NULL; + struct xfs_alloc_arg targs; /* local allocation arguments */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t need; /* total blocks needed in freelist */ + int error; - pag = args->pag; - tp = args->tp; if (!pag->pagf_init) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; if (!pag->pagf_init) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - } else - agbp = NULL; + } /* - * If this is a metadata 
preferred pag and we are user data - * then try somewhere else if we are not being asked to - * try harder at this point + * If this is a metadata preferred pag and we are user data then try + * somewhere else if we are not being asked to try harder at this + * point */ if (pag->pagf_metadata && args->userdata && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - need = XFS_MIN_FREELIST_PAG(pag, mp); - longest = xfs_alloc_longest_free_extent(mp, pag); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; - } - } + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. */ - if (agbp == NULL) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; - if (agbp == NULL) { + if (!agbp) { + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; + if (!agbp) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; - } - } - /* - * Figure out how many blocks we should have in the freelist. - */ - agf = XFS_BUF_TO_AGF(agbp); - need = XFS_MIN_FREELIST(agf, mp); - /* - * If there isn't enough total or single-extent, reject it. - */ - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? 
(longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + goto out_no_agbp; } } + + /* If there isn't enough total space or single-extent, reject it. */ + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; + /* * Make the freelist shorter if it's too long. + * + * Note that from this point onwards, we will always release the agf and + * agfl buffers on error. This handles the case where we error out and + * the buffers are clean or may not have been joined to the transaction + * and hence need to be released manually. If they have been joined to + * the transaction, then xfs_trans_brelse() will handle them + * appropriately based on the recursion count and dirty state of the + * buffer. + * + * XXX (dgc): When we have lots of free space, does this buy us + * anything other than extra overhead when we need to put more blocks + * back on the free list? Maybe we should only do this when space is + * getting low or the AGFL is more than half full? */ - while (be32_to_cpu(agf->agf_flcount) > need) { - xfs_buf_t *bp; + while (pag->pagf_flcount > need) { + struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) - return error; - if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) - return error; + goto out_agbp_relse; + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + if (error) + goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - /* - * Initialize the args structure. 
- */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; @@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) - return error; - /* - * Make the freelist longer if it's too short. - */ - while (be32_to_cpu(agf->agf_flcount) < need) { + error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + if (error) + goto out_agbp_relse; + + /* Make the freelist longer if it's too short. */ + while (pag->pagf_flcount < need) { targs.agbno = 0; - targs.maxlen = need - be32_to_cpu(agf->agf_flcount); - /* - * Allocate as many blocks as possible at once. - */ - if ((error = xfs_alloc_ag_vextent(&targs))) { - xfs_trans_brelse(tp, agflbp); - return error; - } + targs.maxlen = need - pag->pagf_flcount; + + /* Allocate as many blocks as possible at once. */ + error = xfs_alloc_ag_vextent(&targs); + if (error) + goto out_agflbp_relse; + /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist( if (targs.agbno == NULLAGBLOCK) { if (flags & XFS_ALLOC_FLAG_FREEING) break; - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; + goto out_agflbp_relse; } /* * Put each allocated block on the list. 
@@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist( error = xfs_alloc_put_freelist(tp, agbp, agflbp, bno, 0); if (error) - return error; + goto out_agflbp_relse; } } xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; + +out_agflbp_relse: + xfs_trans_brelse(tp, agflbp); +out_agbp_relse: + if (agbp) + xfs_trans_brelse(tp, agbp); +out_no_agbp: + args->agbp = NULL; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d1b4b6a5c894..ca1c8168373a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg { xfs_extlen_t total; /* total blocks needed in xaction */ xfs_extlen_t alignment; /* align answer to multiple of this */ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ + xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ @@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ -/* - * Find the length of the longest extent in an AG. 
- */ -xfs_extlen_t -xfs_alloc_longest_free_extent(struct xfs_mount *mp, +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag, xfs_extlen_t need); +unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0a472fbe06d4..3349c9a1e845 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -266,7 +266,7 @@ xfs_attr_set( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -276,7 +276,7 @@ xfs_attr_set( XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(args.trans); return error; } @@ -320,8 +320,7 @@ xfs_attr_set( xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); } - err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES); + err2 = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error ? error : err2; @@ -383,16 +382,14 @@ xfs_attr_set( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -462,7 +459,7 @@ xfs_attr_remove( error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } @@ -501,16 +498,14 @@ xfs_attr_remove( * Commit the last in the sequence of transactions. 
*/ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f1026e86dabc..63e05b663380 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ - int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork( tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); if (error) goto trans_cancel; - cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { @@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; bmap_cancel: xfs_bmap_cancel(&flist); trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (*blen < longest) *blen = longest; @@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten( error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); - bma->logflags |= tmp_logflags; + /* + * Log the inode core unconditionally in the unwritten extent conversion + * path because the conversion might not have done so (e.g., if the + * extent count hasn't changed). We need to make sure the inode is dirty + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. 
+ */ + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -5918,7 +5924,7 @@ xfs_bmap_split_extent( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -5936,10 +5942,9 @@ xfs_bmap_split_extent( if (error) goto out; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - + return xfs_trans_commit(tp); out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4daaa662337b..a0ae572051de 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -170,7 +170,7 @@ typedef struct xfs_sb { __uint32_t sb_features_log_incompat; __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ @@ -256,7 +256,7 @@ typedef struct xfs_dsb { __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; + __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ @@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature( } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ + XFS_SB_FEAT_INCOMPAT_SPINODES) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && 
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); +} + /* * end of superblock version macros */ @@ -758,19 +766,6 @@ typedef struct xfs_agfl { #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ @@ -1216,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INODES_PER_HOLEMASK_BIT \ + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* - * Data record structure + * The on-disk inode record structure has two formats. The original "full" + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount + * and replaces the 3 high-order freecount bytes wth the holemask and inode + * count. + * + * The holemask of the sparse record format allows an inode chunk to have holes + * that refer to blocks not owned by the inode record. This facilitates inode + * allocation in the event of severe free space fragmentation. 
*/ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ + union { + struct { + __be32 ir_freecount; /* count of free inodes */ + } f; + struct { + __be16 ir_holemask;/* hole mask for sparse chunks */ + __u8 ir_count; /* total inode count */ + __u8 ir_freecount; /* count of free inodes */ + } sp; + } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ + __uint16_t ir_holemask; /* hole mask for sparse chunks */ + __uint8_t ir_count; /* total inode count */ + __uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; +static inline bool xfs_inobt_issparse(uint16_t holemask) +{ + /* non-zero holemask represents a sparse rec. */ + return holemask; +} /* * Key structure @@ -1453,8 +1476,8 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca19f..89689c6a43e2 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ /* * Minimum and maximum sizes need for growth checks. 
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1c9e75521250..66efc702452a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -65,6 +65,8 @@ xfs_inobt_lookup( int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_holemask = 0; + cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); @@ -82,7 +84,14 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); + rec.inobt.ir_u.sp.ir_count = irec->ir_count; + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); + } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -100,12 +109,27 @@ xfs_inobt_get_rec( int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + if (error || *stat == 0) + return error; + + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); + irec->ir_count = rec->inobt.ir_u.sp.ir_count; + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; + } else { + /* + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded + * values for full inode chunks. 
+ */ + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; + irec->ir_count = XFS_INODES_PER_CHUNK; + irec->ir_freecount = + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } - return error; + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + + return 0; } /* @@ -114,10 +138,14 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, + __uint16_t holemask, + __uint8_t count, __int32_t freecount, xfs_inofree_t free, int *stat) { + cur->bc_rec.i.ir_holemask = holemask; + cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); @@ -154,7 +182,9 @@ xfs_inobt_insert( } ASSERT(i == 0); - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, + XFS_INODES_PER_CHUNK, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -220,6 +250,7 @@ xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, + int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -275,7 +306,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; @@ -347,6 +378,214 @@ xfs_ialloc_inode_init( } /* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. 
This allows for cluster sized, cluster aligned block allocation + * without need to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. 
+ */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. 
If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. 
+ */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.fsbno = NULLFSBLOCK; + +#ifdef DEBUG + /* randomly do sparse inode allocations */ + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + do_sparse = prandom_u32() & 1; +#endif /* * Locking will ensure that we don't have two callers in here @@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc( agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + args.mp->m_ialloc_blks; + if (do_sparse) + goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc( * subsequent requests. 
*/ args.minalignslop = 0; - } else - args.fsbno = NULLFSBLOCK; + } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* @@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { +sparse_alloc: + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + ASSERT(newlen <= XFS_INODES_PER_CHUNK); + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. 
*/ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, - args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + args.agbno, args.len, prandom_u32()); if (error) return error; @@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. + */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. 
+ */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } - /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, @@ -645,7 +990,7 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. */ - ineed = mp->m_ialloc_blks; + ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; @@ -732,6 +1077,27 @@ xfs_ialloc_get_rec( } /* + * Return the offset of the first free inode in the record. If the inode chunk + * is sparsely allocated, we convert the record holemask to inode granularity + * and mask off the unallocated regions from the inode free mask. 
+ */ +STATIC int +xfs_inobt_first_free_inode( + struct xfs_inobt_rec_incore *rec) +{ + xfs_inofree_t realfree; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(rec->ir_holemask)) + return xfs_lowbit64(rec->ir_free); + + realfree = xfs_inobt_irec_to_allocmask(rec); + realfree &= rec->ir_free; + + return xfs_lowbit64(realfree); +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -961,7 +1327,7 @@ newino: } alloc_inode: - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1210,7 +1576,7 @@ xfs_dialloc_ag( if (error) goto error_cur; - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1439,6 +1805,83 @@ out_error: return error; } +/* + * Free the blocks of an inode chunk. We must consider that the inode chunk + * might be sparse and only free the regions that are allocated as part of the + * chunk. 
+ */ +STATIC void +xfs_difree_inode_chunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *rec, + struct xfs_bmap_free *flist) +{ + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); + int startidx, endidx; + int nextbit; + xfs_agblock_t agbno; + int contigblk; + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + + if (!xfs_inobt_issparse(rec->ir_holemask)) { + /* not sparse, calculate extent info directly */ + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), + mp->m_ialloc_blks, flist, mp); + return; + } + + /* holemask is only 16-bits (fits in an unsigned long) */ + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); + holemask[0] = rec->ir_holemask; + + /* + * Find contiguous ranges of zeroes (i.e., allocated regions) in the + * holemask and convert the start/end index of each range to an extent. + * We start with the start and end index both pointing at the first 0 in + * the mask. + */ + startidx = endidx = find_first_zero_bit(holemask, + XFS_INOBT_HOLEMASK_BITS); + nextbit = startidx + 1; + while (startidx < XFS_INOBT_HOLEMASK_BITS) { + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, + nextbit); + /* + * If the next zero bit is contiguous, update the end index of + * the current range and continue. + */ + if (nextbit != XFS_INOBT_HOLEMASK_BITS && + nextbit == endidx + 1) { + endidx = nextbit; + goto next; + } + + /* + * nextbit is not contiguous with the current end index. Convert + * the current start/end to an extent and add it to the free + * list. 
+ */ + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + contigblk = ((endidx - startidx + 1) * + XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + flist, mp); + + /* reset range to current bit and carry on... */ + startidx = endidx = nextbit; + +next: + nextbit++; + } +} + STATIC int xfs_difree_inobt( struct xfs_mount *mp, @@ -1446,8 +1889,7 @@ xfs_difree_inobt( struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_bmap_free *flist, - int *deleted, - xfs_ino_t *first_ino, + struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); @@ -1501,20 +1943,23 @@ xfs_difree_inobt( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes eligible for removal + * When an inode chunk is free, it becomes eligible for removal. Don't + * remove the chunk if the block size is large enough for multiple inode + * chunks (that might not be free). */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == mp->m_ialloc_inos)) { - - *deleted = 1; - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + xic->deleted = 1; + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. 
*/ - ilen = mp->m_ialloc_inos; + ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1530,11 +1975,9 @@ xfs_difree_inobt( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_difree_inode_chunk(mp, agno, &rec, flist); } else { - *deleted = 0; + xic->deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1599,7 +2042,9 @@ xfs_difree_finobt( */ XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, + ibtrec->ir_count, + ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; @@ -1634,8 +2079,13 @@ xfs_difree_finobt( * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. + * + * Note that we currently can't free chunks when the block size is large + * enough for multiple chunks. Leave the finobt record to remain in sync + * with the inobt. */ - if (rec.ir_freecount == mp->m_ialloc_inos && + if (rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && !(mp->m_flags & XFS_MOUNT_IKEEP)) { error = xfs_btree_delete(cur, &i); if (error) @@ -1671,8 +2121,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted,/* set if inode cluster was deleted */ - xfs_ino_t *first_ino)/* first inode in deleted cluster */ + struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ @@ -1723,8 +2172,7 @@ xfs_difree( /* * Fix up the inode allocation btree. 
*/ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, - &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 100007d56449..6e450df2979b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -28,6 +28,13 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +struct xfs_icluster { + bool deleted; /* record is deleted */ + xfs_ino_t first_ino; /* first inode number */ + uint64_t alloc; /* inode phys. allocation bitmap for + * sparse chunks */ +}; + /* Calculate and return the number of filesystem blocks per inode cluster */ static inline int xfs_icluster_size_fsb( @@ -44,8 +51,7 @@ xfs_icluster_size_fsb( static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (struct xfs_dinode *) - (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); + return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } /* @@ -90,8 +96,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino); /* first inode in deleted cluster */ + struct xfs_icluster *ifree); /* cluster info if deleted */ /* * Return the location of the inode in imap, for mapping it into a buffer. 
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, * Inode chunk initialisation routine */ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, - struct list_head *buffer_list, + struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964c465ca69c..674ad8f760be 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec->inobt.ir_u.sp.ir_holemask = + cpu_to_be16(cur->bc_rec.i.ir_holemask); + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec->inobt.ir_u.f.ir_freecount = + cpu_to_be32(cur->bc_rec.i.ir_freecount); + } rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } @@ -418,3 +427,85 @@ xfs_inobt_maxrecs( return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } + +/* + * Convert the inode record holemask to an inode allocation bitmap. The inode + * allocation bitmap is inode granularity and specifies whether an inode is + * physically allocated on disk (not whether the inode is considered allocated + * or free by the fs). + * + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. + */ +uint64_t +xfs_inobt_irec_to_allocmask( + struct xfs_inobt_rec_incore *rec) +{ + uint64_t bitmap = 0; + uint64_t inodespbit; + int nextbit; + uint allocbitmap; + + /* + * The holemask has 16-bits for a 64 inode record. 
Therefore each + * holemask bit represents multiple inodes. Create a mask of bits to set + * in the allocmask for each holemask bit. + */ + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; + + /* + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask + * anything beyond the 16 holemask bits since this casts to a larger + * type. + */ + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); + + /* + * allocbitmap is the inverted holemask so every set bit represents + * allocated inodes. To expand from 16-bit holemask granularity to + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target + * bitmap for every holemask bit. + */ + nextbit = xfs_next_bit(&allocbitmap, 1, 0); + while (nextbit != -1) { + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); + + bitmap |= (inodespbit << + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); + + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); + } + + return bitmap; +} + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. 
+ */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c2d0..bd88453217ce 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +/* ir_holemask to inode allocation bitmap conversion */ +uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); + +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 002b6b3a1988..6526e7696184 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -46,8 +46,7 @@ xfs_inobp_check( j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); + dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", @@ -86,8 +85,7 @@ xfs_inode_buf_verify( int di_ok; xfs_dinode_t *dip; - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); + dip = 
xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -186,7 +184,7 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dc4bfc5d88fc..df9851c46b5c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -174,6 +174,27 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (xfs_sb_version_hassparseinodes(sbp)) { + uint32_t align; + + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -374,7 +395,7 @@ __xfs_sb_from_disk( be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; - to->sb_pad = 0; + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* Convert on-disk flags to in-memory flags? 
*/ @@ -516,7 +537,7 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); - to->sb_pad = 0; + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); } } @@ -689,6 +710,11 @@ xfs_sb_mount_common( mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + mp->m_ialloc_min_blks = sbp->sb_spino_align; + else + mp->m_ialloc_min_blks = mp->m_ialloc_blks; } /* @@ -792,12 +818,12 @@ xfs_sync_sb( tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8dda4b321343..5be529707903 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer count in superblock */ /* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* * Field values for xfs_trans_mod_sb. 
*/ #define XFS_TRANS_SB_ICOUNT 0x00000001 diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 2d5bdfce6d8f..797815012c0e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,9 +73,9 @@ struct xfs_trans_resv { * 2 trees * (2 blocks/level * max depth - 1) * block size */ #define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) /* * Per-directory log reservation for any directory change. diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c4579334d..41e0428d8175 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -67,7 +67,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * XFS_AG_MAXLEVELS(mp)) + (2 * (mp)->m_ag_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e5099f268032..3859f5e27a4d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -145,7 +145,7 @@ xfs_setfilesize( isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return 0; } @@ -155,7 +155,7 @@ xfs_setfilesize( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } STATIC int @@ -1348,7 +1348,7 @@ __xfs_get_blocks( sector_t 
iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1413,6 +1413,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1507,49 +1508,29 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, loff_t offset, - ssize_t size, - void *private) + ssize_t size) { - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_ioend *ioend = private; - - trace_xfs_gbmap_direct_endio(ip, offset, size, - ioend ? 
ioend->io_type : 0, NULL); + struct xfs_mount *mp = XFS_I(inode)->i_mount; - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; - } - - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) goto out_end_io; /* @@ -1586,10 +1567,10 @@ xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&ip->i_flags_lock); + spin_lock(&XFS_I(inode)->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); + spin_unlock(&XFS_I(inode)->i_flags_lock); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1606,6 +1587,98 @@ out_end_io: return; } +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. 
+ */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; + + /* + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); + +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); +} + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1613,16 +1686,11 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (iov_iter_rw(iter) == WRITE) { - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + 
return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0137a4..86afd1ac7895 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 3fbf167cfb4c..2bb959ada45b 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -394,7 +394,6 @@ xfs_attr_inactive( { struct xfs_trans *trans; struct xfs_mount *mp; - int cancel_flags = 0; int lock_mode = XFS_ILOCK_SHARED; int error = 0; @@ -423,7 +422,6 @@ xfs_attr_inactive( goto out_cancel; lock_mode = XFS_ILOCK_EXCL; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; xfs_ilock(dp, lock_mode); if (!XFS_IFORK_Q(dp)) @@ -435,8 +433,14 @@ xfs_attr_inactive( */ xfs_trans_ijoin(trans, dp, 0); - /* invalidate and truncate the attribute fork extents */ - if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + /* + * Invalidate and truncate the attribute fork extents. Make sure the + * fork actually has attributes as otherwise the invalidation has no + * blocks to read and returns an error. In this case, just do the fork + * removal below. 
+ */ + if (xfs_inode_hasattr(dp) && + dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -449,12 +453,12 @@ xfs_attr_inactive( /* Reset the attribute fork - this also destroys the in-core fork */ xfs_attr_fork_remove(dp, trans); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(trans); xfs_iunlock(dp, lock_mode); return error; out_cancel: - xfs_trans_cancel(trans, cancel_flags); + xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ if (dp->i_afp) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a52bbd3abc7d..0f34886cf726 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,28 +75,20 @@ xfs_bmap_finish( xfs_efi_log_item_t *efi; /* extent free intention */ int error; /* error return value */ xfs_bmap_free_item_t *free; /* free extent item */ - struct xfs_trans_res tres; /* new log reservation */ xfs_mount_t *mp; /* filesystem mount structure */ xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { *committed = 0; return 0; } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - tres.tr_logres = ntp->t_log_res; - tres.tr_logcount = ntp->t_log_count; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; + error = xfs_trans_roll(tp, NULL); *committed = 1; /* * We have a new transaction, so we should return committed=1, @@ -105,19 +97,10 @@ xfs_bmap_finish( if (error) return error; - /* - * transaction commit worked ok 
so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - error = xfs_trans_reserve(ntp, &tres, 0, 0); - if (error) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + if ((error = xfs_free_extent(*tp, free->xbfi_startblock, free->xbfi_blockcount))) { /* * The bmap free list will be cleaned up at a @@ -127,7 +110,7 @@ xfs_bmap_finish( * happens, since this transaction may not be * dirty yet. */ - mp = ntp->t_mountp; + mp = (*tp)->t_mountp; if (!XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, (error == -EFSCORRUPTED) ? @@ -135,7 +118,7 @@ xfs_bmap_finish( SHUTDOWN_META_IO_ERROR); return error; } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); xfs_bmap_del_free(flist, NULL, free); } @@ -878,7 +861,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return -EAGAIN; } } @@ -886,7 +869,7 @@ xfs_free_eofblocks( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -908,12 +891,9 @@ xfs_free_eofblocks( * If we get an error at this point we simply don't * bother truncating the file. */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (!error) xfs_inode_clear_eofblocks_tag(ip); } @@ -1026,7 +1006,7 @@ xfs_alloc_file_space( * Free the transaction structure. 
*/ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1053,7 +1033,7 @@ xfs_alloc_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -1289,7 +1284,7 @@ xfs_free_file_space( * Free the transaction structure. 
*/ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1320,7 +1315,7 @@ xfs_free_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1330,7 +1325,7 @@ xfs_free_file_space( error0: xfs_bmap_cancel(&free_list); error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out; } @@ -1462,7 +1457,7 @@ xfs_shift_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } @@ -1492,13 +1487,13 @@ xfs_shift_file_space( if (error) goto out; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); } return error; out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -1718,7 +1713,7 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_unlock; } @@ -1901,7 +1896,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); @@ -1915,6 +1910,6 @@ out_unlock: goto out; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1790b00bea7a..a4b7d92e946c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1419,9 +1419,9 @@ xfs_buf_submit_wait( return error; } -xfs_caddr_t +void * xfs_buf_offset( - xfs_buf_t *bp, + struct xfs_buf *bp, size_t offset) { struct page *page; @@ -1431,7 +1431,7 @@ xfs_buf_offset( offset += 
bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); + return page_address(page) + (offset & (PAGE_SIZE-1)); } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 75ff5d5a7d2e..331c1ccf8264 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) /* Buffer Utility Routines */ -extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); +extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 02c01bbbc789..4143dc75dca4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -568,8 +568,6 @@ xfs_qm_dqread( struct xfs_buf *bp; struct xfs_trans *tp = NULL; int error; - int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -617,7 +615,6 @@ xfs_qm_dqread( XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; } /* @@ -632,7 +629,6 @@ xfs_qm_dqread( * allocate (ENOENT). 
*/ trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; goto error1; } @@ -670,7 +666,7 @@ xfs_qm_dqread( xfs_trans_brelse(tp, bp); if (tp) { - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; } @@ -680,7 +676,7 @@ xfs_qm_dqread( error1: if (tp) - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 338e50bbfd1e..74d0e5966ebc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -127,7 +127,7 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, @@ -146,7 +146,7 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 64); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c0394ed126fc..4ed3042a0f16 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,10 +21,10 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, inst_t *ra); + const char *filename, int linenum, void *ra); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, inst_t *ra); + int linenum, void *ra); extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cb7fe64cdbfa..adc8f8fdd145 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -239,7 +239,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (__psint_t)(void*)efip; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; 
atomic_set(&efip->efi_next_extent, 0); atomic_set(&efip->efi_refcount, 2); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7c62fca53e2f..874507de3485 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,14 +80,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -97,7 +98,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -109,20 +111,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! 
*/ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return status; @@ -139,7 +148,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -161,7 +170,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* @@ -285,7 +294,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -379,7 +388,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -673,7 +686,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -759,8 +772,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. 
DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -843,7 +859,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1064,17 +1080,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1455,48 +1460,83 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. 
*/ STATIC int -xfs_filemap_fault( +xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return error; + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; } -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ STATIC int -xfs_filemap_page_mkwrite( +xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct xfs_inode *ip = XFS_I(file_inode(vma->vm_file)); + int ret; + + trace_xfs_filemap_fault(ip); - trace_xfs_filemap_page_mkwrite(ip); + /* DAX can shortcut the normal fault path on write faults! 
*/ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip))) + return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = filemap_fault(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - return error; + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; } const struct file_operations xfs_file_operations = { @@ -1527,9 +1567,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index da82f1cb4b9b..c4c130f9bfb6 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -196,7 +196,8 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb7e8a29dfb6..9b3438a7680f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -101,7 +101,9 @@ xfs_fs_geometry( (xfs_sb_version_hasftype(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? 
+ XFS_FSOP_GEOM_FLAGS_SPINODES : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -201,7 +203,7 @@ xfs_growfs_data_private( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -489,7 +491,7 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) return error; @@ -557,7 +559,7 @@ xfs_growfs_data_private( return saved_error ? saved_error : error; error0: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 539a85fddbc2..3da9f4da4f3d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -905,7 +905,6 @@ xfs_dir_ialloc( { xfs_trans_t *tp; - xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; @@ -954,8 +953,6 @@ xfs_dir_ialloc( * to succeed the second time. */ if (ialloc_context) { - struct xfs_trans_res tres; - /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -964,12 +961,6 @@ xfs_dir_ialloc( * allocation group. */ xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. 
- */ - tres.tr_logres = xfs_trans_get_log_res(tp); - tres.tr_logcount = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next @@ -985,35 +976,9 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - code = xfs_trans_reserve(tp, &tres, 0, 0); /* * Re-attach the quota info that we detached from prev trx. @@ -1025,7 +990,7 @@ xfs_dir_ialloc( if (code) { xfs_buf_relse(ialloc_context); - *tpp = ntp; + *tpp = tp; *ipp = NULL; return code; } @@ -1127,7 +1092,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; prid_t prid; struct xfs_dquot *udqp = NULL; @@ -1164,8 +1128,6 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* * Initially assume that the file does not exist and * reserve the resources for that case. 
If that is not @@ -1183,10 +1145,9 @@ xfs_create( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1217,7 +1178,7 @@ xfs_create( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } /* @@ -1235,7 +1196,7 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); - goto out_trans_abort; + goto out_trans_cancel; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -1269,7 +1230,7 @@ xfs_create( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1282,10 +1243,8 @@ xfs_create( out_bmap_cancel: xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1317,7 +1276,6 @@ xfs_create_tmpfile( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1350,10 +1308,8 @@ xfs_create_tmpfile( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1365,7 +1321,7 @@ xfs_create_tmpfile( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -1381,9 +1337,9 @@ xfs_create_tmpfile( ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) - goto out_trans_abort; + goto 
out_trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1394,10 +1350,8 @@ xfs_create_tmpfile( *ipp = ip; return 0; - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1427,7 +1381,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; int resblks; @@ -1447,17 +1400,14 @@ xfs_link( goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto error_return; - } xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1486,19 +1436,19 @@ xfs_link( if (sip->i_d.di_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) - goto abort_return; + goto error_return; } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) - goto abort_return; + goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); if (error) - goto abort_return; + goto error_return; /* * If this is a synchronous mount, make sure that the @@ -1512,15 +1462,13 @@ xfs_link( error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); - goto abort_return; + goto error_return; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; error_return: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); 
std_return: return error; } @@ -1555,7 +1503,6 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1613,29 +1560,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. - */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1756,7 +1681,7 @@ xfs_inactive_truncate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1777,7 +1702,7 @@ xfs_inactive_truncate( ASSERT(ip->i_d.di_nextents == 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error_unlock; @@ -1785,7 +1710,7 @@ xfs_inactive_truncate( return 0; error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1835,7 +1760,7 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(tp); return error; } @@ -1855,7 +1780,7 @@ xfs_inactive_ifree( __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); 
return error; } @@ -1874,7 +1799,7 @@ xfs_inactive_ifree( if (error) xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); @@ -2235,28 +2160,42 @@ xfs_iunlink_remove( */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; int inodes_per_cluster; int nbufs; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); blks_per_cluster = xfs_icluster_size_fsb(mp); inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; nbufs = mp->m_ialloc_blks / blks_per_cluster; for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. 
+ */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2414,8 +2353,7 @@ xfs_ifree( xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2431,7 +2369,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + error = xfs_difree(tp, ip->i_ino, flist, &xic); if (error) return error; @@ -2448,8 +2386,8 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) - error = xfs_ifree_cluster(ip, tp, first_ino); + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } @@ -2536,7 +2474,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; uint resblks; @@ -2557,7 +2494,6 @@ xfs_remove( tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * We try to get the real space reservation first, @@ -2576,7 +2512,6 @@ xfs_remove( } if (error) { ASSERT(error != -ENOSPC); - cancel_flags = 0; goto out_trans_cancel; } @@ -2588,7 +2523,6 @@ xfs_remove( /* * If we're removing a directory perform some additional validation. 
*/ - cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2644,7 +2578,7 @@ xfs_remove( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto std_return; @@ -2656,7 +2590,7 @@ xfs_remove( out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -2730,11 +2664,11 @@ xfs_finish_rename( error = xfs_bmap_finish(&tp, free_list, &committed); if (error) { xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); } /* @@ -2855,7 +2789,7 @@ xfs_cross_rename( out_trans_abort: xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -2915,7 +2849,6 @@ xfs_rename( int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - int cancel_flags = 0; int spaceres; int error; @@ -2951,7 +2884,6 @@ xfs_rename( } if (error) goto out_trans_cancel; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes @@ -3022,10 +2954,8 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == -ENOSPC) - goto out_bmap_cancel; if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3033,7 +2963,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } else { /* target_ip != NULL */ /* @@ -3065,7 +2995,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if 
(error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3076,7 +3006,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; if (src_is_directory) { /* @@ -3084,7 +3014,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } /* target_ip != NULL */ @@ -3101,7 +3031,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3127,7 +3057,7 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3142,7 +3072,7 @@ xfs_rename( error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; /* * For whiteouts, we need to bump the link count on the whiteout inode. 
@@ -3156,10 +3086,10 @@ xfs_rename( ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; error = xfs_iunlink_remove(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); /* @@ -3180,12 +3110,10 @@ xfs_rename( IRELE(wip); return error; -out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); if (wip) IRELE(wip); return error; @@ -3464,7 +3392,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 87f67c6b654c..ea7d85af5310 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -336,7 +336,7 @@ xfs_set_dmattrs( tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -346,7 +346,7 @@ xfs_set_dmattrs( ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; } @@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans( return tp; out_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return ERR_PTR(error); } @@ -1253,7 +1253,7 @@ xfs_ioctl_setattr( else ip->i_d.di_extsize = 0; - code = xfs_trans_commit(tp, 0); + code = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. 
@@ -1265,7 +1265,7 @@ xfs_ioctl_setattr( return code; error_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); @@ -1338,11 +1338,11 @@ xfs_ioc_setxflags( error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_write; } - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_write: mnt_drop_write_file(filp); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 38e633bad8c2..1f86033171c8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -183,7 +183,7 @@ xfs_iomap_write_direct( * Check for running out of space, note: need lock to return */ if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -213,7 +213,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -236,7 +236,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } @@ -690,7 +690,7 @@ xfs_iomap_write_allocate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, nres, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -760,7 +760,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; @@ -791,7 +791,7 @@ xfs_iomap_write_allocate( trans_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -853,7 +853,7 
@@ xfs_iomap_write_unwritten( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -890,7 +890,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -914,7 +914,7 @@ xfs_iomap_write_unwritten( error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 7f51f39f8acc..766b23f86ce9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -699,7 +699,7 @@ xfs_setattr_nonsize( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -730,7 +730,7 @@ xfs_setattr_nonsize( return 0; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); @@ -752,7 +752,6 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); @@ -848,7 +847,11 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. 
*/ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); @@ -858,7 +861,6 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -898,7 +900,7 @@ xfs_setattr_size( if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -925,16 +927,14 @@ xfs_setattr_size( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } @@ -981,7 +981,7 @@ xfs_vn_update_time( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1003,7 +1003,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1188,22 +1188,22 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else 
- inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 80429891dc9b..f41b0c3fddab 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk( } irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + *icount = irec->ir_count - irec->ir_freecount; } return 0; @@ -415,6 +415,8 @@ xfs_bulkstat( goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; @@ -447,13 +449,15 @@ xfs_bulkstat( * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. 
*/ - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < r.ir_count) { xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + icount += r.ir_count - r.ir_freecount; } error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { @@ -599,8 +603,7 @@ xfs_inumbers( agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = - XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; if (++bufidx == bcount) { long written; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7c7842c85a08..85f883dd6207 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,26 +32,12 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef __uint32_t inst_t; /* an instruction */ - typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - #include "xfs_types.h" #include "kmem.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcc7cfabb787..08d4fe46f0fa 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,7 +109,7 @@ xlog_ungrant_log_space( STATIC void xlog_verify_dest_ptr( struct xlog *log, - 
char *ptr); + void *ptr); STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -513,7 +513,7 @@ xfs_log_done( struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -526,14 +526,11 @@ xfs_log_done( (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -541,7 +538,6 @@ xfs_log_done( * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -553,6 +549,7 @@ xfs_log_done( ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -1447,7 +1444,7 @@ xlog_alloc_log( iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1602,7 +1599,7 @@ xlog_pack_data( int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -3664,7 +3661,7 @@ xlog_ticket_alloc( void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3767,9 +3764,8 @@ xlog_verify_iclog( xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3788,9 +3784,9 @@ xlog_verify_iclog( if (iclog->ic_header.h_magicno != 
cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3798,20 +3794,19 @@ xlog_verify_iclog( /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,13 +3824,13 @@ xlog_verify_iclog( (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 
84e0deb95abd..fa27aaec72cb 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* - * Macros, structures, prototypes for interface to the log manager. - */ - -/* - * Flags to xfs_log_done() - */ -#define XFS_LOG_REL_PERM_RESERV 0x1 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -138,7 +129,7 @@ struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags); + bool regrant); int _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_forced); @@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, int flags); + xfs_lsn_t *commit_lsn, bool regrant); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 45cc0ce18adf..abc2ccbff739 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -624,7 +624,7 @@ restart: spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. 
*/ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); if (commit_lsn == -1) goto out_abort; @@ -773,14 +773,10 @@ xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, - int flags) + bool regrant) { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; - int log_flags = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; /* lock out background commit */ down_read(&cil->xc_ctx_lock); @@ -795,7 +791,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = tp->t_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); /* @@ -809,7 +805,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index db7cbdeb2b42..1c87c8abfbed 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -409,7 +409,7 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG - char *l_iclog_bak[XLOG_MAX_ICLOGS]; + void *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4f5784f85a5b..01dd228ca05e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_put_bp( * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. 
*/ -STATIC xfs_caddr_t +STATIC char * xlog_align( struct xlog *log, xfs_daddr_t blk_no, @@ -203,7 +203,7 @@ xlog_bread( xfs_daddr_t blk_no, int nbblks, struct xfs_buf *bp, - xfs_caddr_t *offset) + char **offset) { int error; @@ -225,9 +225,9 @@ xlog_bread_offset( xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ struct xfs_buf *bp, - xfs_caddr_t offset) + char *offset) { - xfs_caddr_t orig_offset = bp->b_addr; + char *orig_offset = bp->b_addr; int orig_len = BBTOB(bp->b_length); int error, error2; @@ -396,7 +396,7 @@ xlog_find_cycle_start( xfs_daddr_t *last_blk, uint cycle) { - xfs_caddr_t offset; + char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; @@ -443,7 +443,7 @@ xlog_find_verify_cycle( uint cycle; xfs_buf_t *bp; xfs_daddr_t bufblks; - xfs_caddr_t buf = NULL; + char *buf = NULL; int error = 0; /* @@ -509,7 +509,7 @@ xlog_find_verify_log_record( { xfs_daddr_t i; xfs_buf_t *bp; - xfs_caddr_t offset = NULL; + char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; @@ -616,7 +616,7 @@ xlog_find_head( xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; @@ -891,7 +891,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; - xfs_caddr_t offset = NULL; + char *offset = NULL; xfs_buf_t *bp; int error, i, found; xfs_daddr_t umount_data_blk; @@ -1099,7 +1099,7 @@ xlog_find_zeroed( xfs_daddr_t *blk_no) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; @@ -1199,7 +1199,7 @@ bp_err: STATIC void xlog_add_record( struct xlog *log, - xfs_caddr_t buf, + char *buf, int cycle, int block, int tail_cycle, @@ -1227,7 +1227,7 @@ xlog_write_log_records( int tail_cycle, int tail_block) { - xfs_caddr_t offset; + char *offset; 
xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; @@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer( return -EFSCORRUPTED; } - buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, - next_unlinked_offset); + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* @@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer( * have to leave the inode in a consistent state for whoever * reads it next.... */ - xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_dinode_calc_crc(mp, xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } @@ -2503,8 +2502,8 @@ xlog_recover_inode_pass2( xfs_buf_t *bp; xfs_dinode_t *dip; int len; - xfs_caddr_t src; - xfs_caddr_t dest; + char *src; + char *dest; int error; int attr_index; uint fields; @@ -2546,7 +2545,7 @@ xlog_recover_inode_pass2( goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks @@ -2885,7 +2884,7 @@ xlog_recover_dquot_pass2( return error; ASSERT(bp); - ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); /* * If the dquot has an LSN in it, recover the dquot only if it's less @@ -3068,12 +3067,22 @@ xlog_recover_do_icreate_pass2( return -EINVAL; } - /* existing allocation is fixed value */ - ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == mp->m_ialloc_blks); - if (count != mp->m_ialloc_inos || - length != mp->m_ialloc_blks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + /* + * The inode chunk is either full or sparse and we only support + * m_ialloc_min_blks sized sparse allocations at this time. 
+ */ + if (length != mp->m_ialloc_blks && + length != mp->m_ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); return -EINVAL; } @@ -3091,8 +3100,8 @@ xlog_recover_do_icreate_pass2( XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) return 0; - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); + xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -3364,17 +3373,17 @@ STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; + char *ptr, *old_ptr; int old_len; if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + + ptr = (char *)&trans->r_theader + sizeof(xfs_trans_header_t) - len; memcpy(ptr, dp, len); return 0; @@ -3410,12 +3419,12 @@ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xfs_inode_log_format_t *in_f; /* any will do */ xlog_recover_item_t *item; - xfs_caddr_t ptr; + char *ptr; if (!len) return 0; @@ -3504,7 +3513,7 @@ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, unsigned int len, unsigned int flags, int pass) @@ -3611,8 +3620,8 @@ xlog_recover_process_ophdr( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, - xfs_caddr_t dp, - xfs_caddr_t end, + char *dp, + char *end, int pass) { struct xlog_recover *trans; @@ -3661,11 +3670,11 @@ xlog_recover_process_data( struct xlog *log, 
struct hlist_head rhash[], struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, int pass) { struct xlog_op_header *ohead; - xfs_caddr_t end; + char *end; int num_logops; int error; @@ -3751,11 +3760,11 @@ xlog_recover_process_efi( } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; abort_error: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -3857,13 +3866,13 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; @@ -4010,7 +4019,7 @@ xlog_recover_process_iunlinks( STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { __le32 crc; @@ -4040,7 +4049,7 @@ xlog_unpack_data_crc( STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { int i, j, k; @@ -4122,7 +4131,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t offset; + char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f23fbdfb365..461e791efad7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -725,6 +725,22 @@ xfs_mountfs( } /* + * If enabled, sparse inode chunk alignment is expected to match the + * cluster size. Full inode chunk alignment must match the chunk size, + * but that is checked on sb read verification... 
+ */ + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + mp->m_sb.sb_spino_align != + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + xfs_warn(mp, + "Sparse inode block alignment (%u) must match cluster size (%llu).", + mp->m_sb.sb_spino_align, + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + error = -EINVAL; + goto out_remove_uuid; + } + + /* * Set inode alignment fields */ xfs_set_inoalignment(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c995a2ccb6f..7999e91cd49a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -101,6 +101,8 @@ typedef struct xfs_mount { __uint64_t m_flags; /* global mount flags */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode + * allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ @@ -179,6 +181,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. 
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 981a657eca39..ab4a6066f7ca 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -306,7 +306,7 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_iolock; } @@ -321,7 +321,7 @@ xfs_fs_commit_blocks( } xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5538468c7f63..eac9549efd52 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -756,7 +756,7 @@ xfs_qm_qino_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -764,8 +764,7 @@ xfs_qm_qino_alloc( error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } } @@ -796,7 +795,7 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9a25c9275fb3..3640c6e896af 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile( tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile( error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { - 
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } ASSERT(ip->i_d.di_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); @@ -437,7 +436,7 @@ xfs_qm_scall_setqlim( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_rele; } @@ -548,7 +547,7 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_rele: xfs_qm_dqrele(dqp); @@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_trans_commit(tp); } @@ -605,7 +603,7 @@ xfs_qm_log_quotaoff( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } @@ -624,7 +622,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. 
*/ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5376dd406ba2..ce6506adab7b 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -55,7 +55,6 @@ struct xfs_trans; typedef struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_blk_res_used; /* blks used from the reservation */ ulong qt_ino_res; /* inode reserved on a dquot */ ulong qt_ino_res_used; /* inodes used from the reservation */ long qt_bcount_delta; /* dquot blk count changes */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f2079b6911cc..f4e8c06eee26 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,7 +780,6 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - int cancelflags = 0; xfs_trans_t *tp; tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); @@ -792,7 +791,6 @@ xfs_growfs_rt_alloc( resblks, 0); if (error) goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ @@ -804,7 +802,6 @@ xfs_growfs_rt_alloc( * Allocate blocks to the bitmap file. */ nmap = 1; - cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); @@ -818,14 +815,13 @@ xfs_growfs_rt_alloc( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. 
*/ - cancelflags = 0; for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { @@ -851,7 +847,7 @@ xfs_growfs_rt_alloc( if (bp == NULL) { error = -EIO; error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); goto error; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); @@ -859,7 +855,7 @@ error_cancel: /* * Commit the transaction. */ - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto error; } @@ -973,7 +969,6 @@ xfs_growfs_rt( bmbno < nrbmblocks; bmbno++) { xfs_trans_t *tp; - int cancelflags = 0; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1015,7 +1010,6 @@ xfs_growfs_rt( mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - cancelflags |= XFS_TRANS_ABORT; /* * Get the summary inode into the transaction. */ @@ -1062,7 +1056,7 @@ xfs_growfs_rt( nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) { error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); break; } /* @@ -1076,7 +1070,7 @@ error_cancel: mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 858e1e62bbaa..1fb16562c159 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. 
* @@ -363,6 +365,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -452,8 +458,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -474,6 +480,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1507,6 +1514,20 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3df411eadb86..4be27b0210af 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -104,7 +104,7 @@ xfs_readlink_bmap( cur_chunk += sizeof(struct xfs_dsymlink_hdr); } - memcpy(link + offset, bp->b_addr, byte_cnt); + memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; @@ -224,7 +223,6 @@ xfs_symlink( return error; tp = 
xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -239,10 +237,8 @@ xfs_symlink( resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -394,7 +390,7 @@ xfs_symlink( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -407,9 +403,8 @@ xfs_symlink( out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt( /* * Commit the transaction containing extent freeing and EFDs. 
*/ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error_unlock; @@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt( error_bmap_cancel: xfs_bmap_cancel(&free_list); error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 615781bf4ee5..8d916d33d93d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) +TRACE_EVENT(xfs_irec_merge_pre, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + __field(xfs_agino_t, nagino) + __field(uint16_t, nholemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + __entry->nagino = nagino; + __entry->nholemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->agino, __entry->holemask, __entry->nagino, + __entry->nholemask) +) + +TRACE_EVENT(xfs_irec_merge_post, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask), + TP_ARGS(mp, agno, agino, holemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + ), + TP_printk("dev %d:%d agno %d 
inobt (%u:0x%x)", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->agno, __entry->agino, + __entry->holemask) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 220ef2c906b2..0582a27107d4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -113,7 +113,7 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -xfs_trans_t * +STATIC xfs_trans_t * xfs_trans_dup( xfs_trans_t *tp) { @@ -251,14 +251,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - int log_flags; - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -744,7 +737,7 @@ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags) + bool abort) { struct xfs_log_item_desc *lidp, *next; @@ -755,7 +748,7 @@ xfs_trans_free_items( if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); - if (flags & XFS_TRANS_ABORT) + if (abort) lip->li_flags |= XFS_LI_ABORTED; lip->li_ops->iop_unlock(lip); @@ -892,27 +885,17 @@ xfs_trans_committed_bulk( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ -int -xfs_trans_commit( +static int +__xfs_trans_commit( struct xfs_trans *tp, - uint flags) + bool regrant) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; int error = 0; - int log_flags = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. 
- */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } - - /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the * transaction and free the transaction structure. @@ -936,7 +919,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -964,18 +947,25 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); return error; } +int +xfs_trans_commit( + struct xfs_trans *tp) +{ + return __xfs_trans_commit(tp, false); +} + /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -986,29 +976,22 @@ out_unreserve: */ void xfs_trans_cancel( - xfs_trans_t *tp, - int flags) + struct xfs_trans *tp) { - int log_flags; - xfs_mount_t *mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); /* - * See if the caller is being too lazy to figure out if - * the transaction really needs an abort. - */ - if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) - flags &= ~XFS_TRANS_ABORT; - /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect * corruption and decide to give up. 
*/ - if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item_desc *lidp; list_for_each_entry(lidp, &tp->t_items, lid_trans) @@ -1018,27 +1001,20 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - } + if (tp->t_ticket) + xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); } /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when - * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * committed with xfs_trans_commit(), but we still want as soon * as possible to let chunks of it go to the log. So we commit the * chunk we've been working on and get a new transaction to continue. */ @@ -1055,7 +1031,8 @@ xfs_trans_roll( * Ensure that the inode is always logged. */ trans = *tpp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + if (dp) + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* * Copy the critical parameters from one trans to the next. @@ -1071,20 +1048,13 @@ xfs_trans_roll( * is in progress. The caller takes the responsibility to cancel * the duplicate transaction that gets returned. 
*/ - error = xfs_trans_commit(trans, 0); + error = __xfs_trans_commit(trans, true); if (error) return error; trans = *tpp; /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(trans->t_ticket); - - - /* * Reserve space in the log for th next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1100,6 +1070,7 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, 0); + if (dp) + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b5bc1ab3c4da..3b21b4e5e467 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,8 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) -#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) @@ -153,7 +151,6 @@ typedef struct xfs_trans { */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); -void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 
573aefb5a573..1098cf490189 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next( { struct xfs_log_item *lip = cur->item; - if ((__psint_t)lip & 1) + if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); @@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear( list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) - ((__psint_t)cur->item | 1); + ((uintptr_t)cur->item | 1); } } @@ -287,7 +287,7 @@ xfs_ail_splice( * find the place in the AIL where the items belong. */ lip = cur ? cur->item : NULL; - if (!lip || (__psint_t) lip & 1) + if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 76a16df55ef7..ce78534a047e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo( xfs_trans_t *ntp) { xfs_dqtrx_t *oq, *nq; - int i,j; + int i, j; xfs_dqtrx_t *oqa, *nqa; + ulong blk_res_used; if (!otp->t_dqinfo) return; @@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo( * Because the quota blk reservation is carried forward, * it is also necessary to carry forward the DQ_DIRTY flag. */ - if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + if (otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { oqa = otp->t_dqinfo->dqs[j]; nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + blk_res_used = 0; + if (oqa[i].qt_dquot == NULL) break; oq = &oqa[i]; nq = &nqa[i]; + if (oq->qt_blk_res && oq->qt_bcount_delta > 0) + blk_res_used = oq->qt_bcount_delta; + nq->qt_dquot = oq->qt_dquot; nq->qt_bcount_delta = nq->qt_icount_delta = 0; nq->qt_rtbcount_delta = 0; @@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo( /* * Transfer whatever is left of the reservations. 
*/ - nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; - oq->qt_blk_res = oq->qt_blk_res_used; + nq->qt_blk_res = oq->qt_blk_res - blk_res_used; + oq->qt_blk_res = blk_res_used; nq->qt_rtblk_res = oq->qt_rtblk_res - oq->qt_rtblk_res_used; @@ -239,10 +245,6 @@ xfs_trans_mod_dquot( * disk blocks used. */ case XFS_TRANS_DQ_BCOUNT: - if (qtrx->qt_blk_res && delta > 0) { - qtrx->qt_blk_res_used += (ulong)delta; - ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); - } qtrx->qt_bcount_delta += delta; break; @@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { - if (qtrx->qt_blk_res > - qtrx->qt_blk_res_used) + ulong blk_res_used = 0; + + if (qtrx->qt_bcount_delta > 0) + blk_res_used = qtrx->qt_bcount_delta; + + if (qtrx->qt_blk_res != blk_res_used) { + if (qtrx->qt_blk_res > blk_res_used) dqp->q_res_bcount -= (xfs_qcnt_t) (qtrx->qt_blk_res - - qtrx->qt_blk_res_used); + blk_res_used); else dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res_used - + (blk_res_used - qtrx->qt_blk_res); } } else { diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd1281862ad7..1b736294558a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); + bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c187817471fb..1618cdfb38c7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -261,8 +261,13 @@ extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA +int 
acpi_map_pxm_to_online_node(int pxm); int acpi_get_node(acpi_handle handle); #else +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 05be2352fef8..26fc8bc77f85 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -21,6 +21,7 @@ # define __rcu __attribute__((noderef, address_space(4))) #else # define __rcu +# define __pmem __attribute__((noderef, address_space(5))) #endif extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); @@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __cond_lock(x,c) (c) # define __percpu # define __rcu +# define __pmem #endif /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h new file mode 100644 index 000000000000..3edc99294bf6 --- /dev/null +++ b/include/linux/dma/pxa-dma.h @@ -0,0 +1,27 @@ +#ifndef _PXA_DMA_H_ +#define _PXA_DMA_H_ + +enum pxad_chan_prio { + PXAD_PRIO_HIGHEST = 0, + PXAD_PRIO_NORMAL, + PXAD_PRIO_LOW, + PXAD_PRIO_LOWEST, +}; + +struct pxad_param { + unsigned int drcmr; + enum pxad_chan_prio prio; +}; + +struct dma_chan; + +#ifdef CONFIG_PXA_DMA +bool pxad_filter_fn(struct dma_chan *chan, void *param); +#else +static inline bool pxad_filter_fn(struct dma_chan *chan, void *param) +{ + return false; +} +#endif + +#endif /* _PXA_DMA_H_ */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index ad419757241f..e2f5eb419976 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -65,6 +65,7 @@ enum dma_transaction_type { DMA_PQ, DMA_XOR_VAL, DMA_PQ_VAL, + DMA_MEMSET, DMA_INTERRUPT, DMA_SG, DMA_PRIVATE, @@ -122,10 +123,18 @@ enum dma_transfer_direction { * chunk and before first src/dst address for next chunk. 
* Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false. * Ignored for src(assumed 0), if src_inc is true and src_sgl is false. + * @dst_icg: Number of bytes to jump after last dst address of this + * chunk and before the first dst address for next chunk. + * Ignored if dst_inc is true and dst_sgl is false. + * @src_icg: Number of bytes to jump after last src address of this + * chunk and before the first src address for next chunk. + * Ignored if src_inc is true and src_sgl is false. */ struct data_chunk { size_t size; size_t icg; + size_t dst_icg; + size_t src_icg; }; /** @@ -222,6 +231,16 @@ struct dma_chan_percpu { }; /** + * struct dma_router - DMA router structure + * @dev: pointer to the DMA router device + * @route_free: function to be called when the route can be disconnected + */ +struct dma_router { + struct device *dev; + void (*route_free)(struct device *dev, void *route_data); +}; + +/** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL * @cookie: last cookie value returned to client @@ -232,6 +251,8 @@ struct dma_chan_percpu { * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table + * @router: pointer to the DMA router structure + * @route_data: channel specific data for the router * @private: private data for certain client-channel associations */ struct dma_chan { @@ -247,6 +268,11 @@ struct dma_chan { struct dma_chan_percpu __percpu *local; int client_count; int table_count; + + /* DMA router */ + struct dma_router *router; + void *route_data; + void *private; }; @@ -570,6 +596,7 @@ struct dma_tx_state { * @copy_align: alignment shift for memcpy operations * @xor_align: alignment shift for xor operations * @pq_align: alignment shift for pq operations + * @fill_align: alignment shift for memset operations * @dev_id: 
unique device ID * @dev: struct device reference for dma mapping api * @src_addr_widths: bit mask of src addr widths the device supports @@ -588,6 +615,7 @@ struct dma_tx_state { * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation * @device_prep_dma_pq_val: prepares a pqzero_sum operation + * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. @@ -620,6 +648,7 @@ struct dma_device { u8 copy_align; u8 xor_align; u8 pq_align; + u8 fill_align; #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; @@ -650,6 +679,9 @@ struct dma_device { struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, enum sum_check_flags *pqres, unsigned long flags); + struct dma_async_tx_descriptor *(*device_prep_dma_memset)( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_sg)( @@ -745,6 +777,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( return chan->device->device_prep_interleaved_dma(chan, xt, flags); } +static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( + struct dma_chan *chan, dma_addr_t dest, int value, size_t len, + unsigned long flags) +{ + if (!chan || !chan->device) + return NULL; + + return chan->device->device_prep_dma_memset(chan, dest, value, + len, flags); +} + static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( struct dma_chan *chan, struct scatterlist *dst_sg, unsigned int dst_nents, @@ -820,6 +863,12 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, return 
dmaengine_check_align(dev->pq_align, off1, off2, len); } +static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, + size_t off2, size_t len) +{ + return dmaengine_check_align(dev->fill_align, off1, off2, len); +} + static inline void dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) { @@ -874,6 +923,33 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) BUG(); } +static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg, + size_t dir_icg) +{ + if (inc) { + if (dir_icg) + return dir_icg; + else if (sgl) + return icg; + } + + return 0; +} + +static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl, + chunk->icg, chunk->dst_icg); +} + +static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt, + struct data_chunk *chunk) +{ + return dmaengine_get_icg(xt->src_inc, xt->src_sgl, + chunk->icg, chunk->src_icg); +} + /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE diff --git a/include/linux/efi.h b/include/linux/efi.h index 5f19efe4eb3f..85ef051ac6fb 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -85,7 +85,8 @@ typedef struct { #define EFI_MEMORY_MAPPED_IO 11 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE 12 #define EFI_PAL_CODE 13 -#define EFI_MAX_MEMORY_TYPE 14 +#define EFI_PERSISTENT_MEMORY 14 +#define EFI_MAX_MEMORY_TYPE 15 /* Attribute values: */ #define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ diff --git a/include/linux/fs.h b/include/linux/fs.h index e351da4a934f..3f1a84635da8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); 
#define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 @@ -2655,9 +2656,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); +int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t, + dax_iodone_t); int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) +#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod) +#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod) #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h new file mode 100644 index 000000000000..75e3af01ee32 --- /dev/null +++ b/include/linux/libnvdimm.h @@ -0,0 +1,151 @@ +/* + * libnvdimm - Non-volatile-memory Devices Subsystem + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __LIBNVDIMM_H__ +#define __LIBNVDIMM_H__ +#include <linux/kernel.h> +#include <linux/sizes.h> +#include <linux/types.h> + +enum { + /* when a dimm supports both PMEM and BLK access a label is required */ + NDD_ALIASING = 1 << 0, + /* unarmed memory devices may not persist writes */ + NDD_UNARMED = 1 << 1, + + /* need to set a limit somewhere, but yes, this is likely overkill */ + ND_IOCTL_MAX_BUFLEN = SZ_4M, + ND_CMD_MAX_ELEM = 4, + ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_ARS_STATUS_MAX = SZ_4K, + ND_MAX_MAPPINGS = 32, + + /* mark newly adjusted resources as requiring a label update */ + DPA_RESOURCE_ADJUSTED = 1 << 0, +}; + +extern struct attribute_group nvdimm_bus_attribute_group; +extern struct attribute_group nvdimm_attribute_group; +extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; +extern struct attribute_group nd_region_attribute_group; +extern struct attribute_group nd_mapping_attribute_group; + +struct nvdimm; +struct nvdimm_bus_descriptor; +typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len); + +struct nd_namespace_label; +struct nvdimm_drvdata; +struct nd_mapping { + struct nvdimm *nvdimm; + struct nd_namespace_label **labels; + u64 start; + u64 size; + /* + * @ndd is for private use at region enable / disable time for + * get_ndd() + put_ndd(), all other nd_mapping to ndd + * conversions use to_ndd() which respects enabled state of the + * nvdimm. 
+ */ + struct nvdimm_drvdata *ndd; +}; + +struct nvdimm_bus_descriptor { + const struct attribute_group **attr_groups; + unsigned long dsm_mask; + char *provider_name; + ndctl_fn ndctl; +}; + +struct nd_cmd_desc { + int in_num; + int out_num; + u32 in_sizes[ND_CMD_MAX_ELEM]; + int out_sizes[ND_CMD_MAX_ELEM]; +}; + +struct nd_interleave_set { + u64 cookie; +}; + +struct nd_region_desc { + struct resource *res; + struct nd_mapping *nd_mapping; + u16 num_mappings; + const struct attribute_group **attr_groups; + struct nd_interleave_set *nd_set; + void *provider_data; + int num_lanes; + int numa_node; +}; + +struct nvdimm_bus; +struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc, struct module *module); +#define nvdimm_bus_register(parent, desc) \ + __nvdimm_bus_register(parent, desc, THIS_MODULE) +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm *to_nvdimm(struct device *dev); +struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); +const char *nvdimm_name(struct nvdimm *nvdimm); +void *nvdimm_provider_data(struct nvdimm *nvdimm); +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags, + 
unsigned long *dsm_mask); +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf); +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field); +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); +u64 nd_fletcher64(void *addr, size_t len, bool le); +#endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h new file mode 100644 index 000000000000..507e47c86737 --- /dev/null +++ b/include/linux/nd.h @@ -0,0 +1,151 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __LINUX_ND_H__ +#define __LINUX_ND_H__ +#include <linux/fs.h> +#include <linux/ndctl.h> +#include <linux/device.h> + +struct nd_device_driver { + struct device_driver drv; + unsigned long type; + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); +}; + +static inline struct nd_device_driver *to_nd_device_driver( + struct device_driver *drv) +{ + return container_of(drv, struct nd_device_driver, drv); +}; + +/** + * struct nd_namespace_common - core infrastructure of a namespace + * @force_raw: ignore other personalities for the namespace (e.g. btt) + * @dev: device model node + * @claim: when set, another personality has taken ownership of the namespace + * @rw_bytes: access the raw namespace capacity with byte-aligned transfers + */ +struct nd_namespace_common { + int force_raw; + struct device dev; + struct device *claim; + int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, + void *buf, size_t size, int rw); +}; + +static inline struct nd_namespace_common *to_ndns(struct device *dev) +{ + return container_of(dev, struct nd_namespace_common, dev); +} + +/** + * struct nd_namespace_io - infrastructure for loading an nd_pmem instance + * @dev: namespace device created by the nd region driver + * @res: struct resource conversion of a NFIT SPA table + */ +struct nd_namespace_io { + struct nd_namespace_common common; + struct resource res; +}; + +/** + * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory + * @nsio: device and system physical address range to drive + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace uuid supplied in the dimm label + */ +struct nd_namespace_pmem { + struct nd_namespace_io nsio; + char *alt_name; + u8 *uuid; +}; + +/** + * struct nd_namespace_blk - namespace for dimm-bounded persistent memory + * @alt_name: namespace name supplied in the dimm label + * @uuid: namespace uuid supplied in the dimm label + * @id: ida allocated id + * 
@lbasize: blk namespaces have a native sector size when btt not present + * @num_resources: number of dpa extents to claim + * @res: discontiguous dpa extents for given dimm + */ +struct nd_namespace_blk { + struct nd_namespace_common common; + char *alt_name; + u8 *uuid; + int id; + unsigned long lbasize; + int num_resources; + struct resource **res; +}; + +static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +{ + return container_of(dev, struct nd_namespace_io, common.dev); +} + +static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return container_of(nsio, struct nd_namespace_pmem, nsio); +} + +static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev) +{ + return container_of(dev, struct nd_namespace_blk, common.dev); +} + +/** + * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace + * @ndns: device to read + * @offset: namespace-relative starting offset + * @buf: buffer to fill + * @size: transfer length + * + * @buf is up-to-date upon return from this routine. + */ +static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, READ); +} + +/** + * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace + * @ndns: device to write + * @offset: namespace-relative starting offset + * @buf: buffer to drain + * @size: transfer length + * + * NVDIMM Namespace disks do not implement sectors internally. Depending on + * the @ndns, the contents of @buf may be in cpu cache, platform buffers, + * or on backing memory media upon return from this routine. Flushing + * to media is handled internal to the @ndns driver, if at all. 
+ */ +static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size) +{ + return ndns->rw_bytes(ndns, offset, buf, size, WRITE); +} + +#define MODULE_ALIAS_ND_DEVICE(type) \ + MODULE_ALIAS("nd:t" __stringify(type) "*") +#define ND_DEVICE_MODALIAS_FMT "nd:t%d" + +int __must_check __nd_driver_register(struct nd_device_driver *nd_drv, + struct module *module, const char *mod_name); +#define nd_driver_register(driver) \ + __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +#endif /* __LINUX_ND_H__ */ diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index 56bc026c143f..98ba7525929e 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -23,6 +23,9 @@ struct of_dma { struct device_node *of_node; struct dma_chan *(*of_dma_xlate) (struct of_phandle_args *, struct of_dma *); + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *); + struct dma_router *dma_router; void *of_dma_data; }; @@ -37,12 +40,20 @@ extern int of_dma_controller_register(struct device_node *np, (struct of_phandle_args *, struct of_dma *), void *data); extern void of_dma_controller_free(struct device_node *np); + +extern int of_dma_router_register(struct device_node *np, + void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router); +#define of_dma_router_free of_dma_controller_free + extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name); extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma); extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec, struct of_dma *ofdma); + #else static inline int of_dma_controller_register(struct device_node *np, struct dma_chan *(*of_dma_xlate) @@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int of_dma_router_register(struct device_node *np, + 
void *(*of_dma_route_allocate) + (struct of_phandle_args *, struct of_dma *), + struct dma_router *dma_router) +{ + return -ENODEV; +} + +#define of_dma_router_free of_dma_controller_free + static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np, const char *name) { diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h deleted file mode 100644 index 471fffebbeb4..000000000000 --- a/include/linux/platform_data/dma-rcar-audmapp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * This is for Renesas R-Car Audio-DMAC-peri-peri. - * - * Copyright (C) 2014 Renesas Electronics Corporation - * Copyright (C) 2014 Kuninori Morimoto <kuninori.morimoto.gx@renesas.com> - * - * This file is based on the include/linux/sh_dma.h - * - * Header for the new SH dmaengine driver - * - * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef SH_AUDMAPP_H -#define SH_AUDMAPP_H - -#include <linux/dmaengine.h> - -struct audmapp_slave_config { - int slave_id; - dma_addr_t src; - dma_addr_t dst; - u32 chcr; -}; - -struct audmapp_pdata { - struct audmapp_slave_config *slave; - int slave_num; -}; - -#endif /* SH_AUDMAPP_H */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h new file mode 100644 index 000000000000..d2114045a6c4 --- /dev/null +++ b/include/linux/pmem.h @@ -0,0 +1,152 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __PMEM_H__ +#define __PMEM_H__ + +#include <linux/io.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#include <asm/cacheflush.h> +#else +static inline void arch_wmb_pmem(void) +{ + BUG(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} + +static inline void __pmem *arch_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return NULL; +} + +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + BUG(); +} +#endif + +/* + * Architectures that define ARCH_HAS_PMEM_API must provide + * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(), + * arch_wmb_pmem(), and __arch_has_wmb_pmem(). + */ + +static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) +{ + memcpy(dst, (void __force const *) src, size); +} + +static inline void memunmap_pmem(void __pmem *addr) +{ + iounmap((void __force __iomem *) addr); +} + +/** + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * + * For a given cpu implementation within an architecture it is possible + * that wmb_pmem() resolves to a nop. In the case this returns + * false, pmem api users are unable to ensure durability and may want to + * fall back to a different data consistency model, or otherwise notify + * the user. + */ +static inline bool arch_has_wmb_pmem(void) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) + return __arch_has_wmb_pmem(); + return false; +} + +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); +} + +/* + * These defaults seek to offer decent performance and minimize the + * window between i/o completion and writes being durable on media. 
+ * However, it is undefined / architecture specific whether + * default_memremap_pmem + default_memcpy_to_pmem is sufficient for + * making data durable relative to i/o completion. + */ +static void default_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t size) +{ + memcpy((void __force *) dst, src, size); +} + +static void __pmem *default_memremap_pmem(resource_size_t offset, + unsigned long size) +{ + return (void __pmem __force *)ioremap_wt(offset, size); +} + +/** + * memremap_pmem - map physical persistent memory for pmem api + * @offset: physical address of persistent memory + * @size: size of the mapping + * + * Establish a mapping of the architecture specific memory type expected + * by memcpy_to_pmem() and wmb_pmem(). For example, it may be + * the case that an uncacheable or writethrough mapping is sufficient, + * or a writeback mapping provided memcpy_to_pmem() and + * wmb_pmem() arrange for the data to be written through the + * cache to persistent media. + */ +static inline void __pmem *memremap_pmem(resource_size_t offset, + unsigned long size) +{ + if (arch_has_pmem_api()) + return arch_memremap_pmem(offset, size); + return default_memremap_pmem(offset, size); +} + +/** + * memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Perform a memory copy that results in the destination of the copy + * being effectively evicted from, or never written to, the processor + * cache hierarchy after the copy completes. After memcpy_to_pmem() + * data may still reside in cpu or platform buffers, so this operation + * must be followed by a wmb_pmem(). 
+ */ +static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) +{ + if (arch_has_pmem_api()) + arch_memcpy_to_pmem(dst, src, n); + else + default_memcpy_to_pmem(dst, src, n); +} + +/** + * wmb_pmem - synchronize writes to persistent memory + * + * After a series of memcpy_to_pmem() operations this drains data from + * cpu write buffers and any platform (memory controller) buffers to + * ensure that written data is durable on persistent memory media. + */ +static inline void wmb_pmem(void) +{ + if (arch_has_pmem_api()) + arch_wmb_pmem(); +} +#endif /* __PMEM_H__ */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 9de2fdc8b5e4..a99f0e5243e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -153,30 +153,8 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) -/* - * The KMALLOC_LOOP_LOW is the definition for the for loop index start number - * to create the kmalloc_caches object in create_kmalloc_caches(). The first - * and the second are 96 and 192. You can see that in the kmalloc_index(), if - * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64, - * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't - * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW. - */ -#if KMALLOC_MIN_SIZE <= 32 -#define KMALLOC_LOOP_LOW 1 -#elif KMALLOC_MIN_SIZE <= 64 -#define KMALLOC_LOOP_LOW 2 -#else -#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW -#endif - #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -/* - * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used. - * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be - * initialized. 
- */ -#define KMALLOC_LOOP_LOW 1 #endif /* diff --git a/include/linux/wait.h b/include/linux/wait.h index d69ac4ecc88b..1e1bf9f963a9 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -358,6 +358,19 @@ do { \ __ret; \ }) +#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ + cmd1; schedule(); cmd2) +/* + * Just like wait_event_cmd(), except it sets exclusive flag + */ +#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 7f79cf459591..0b73af9be12f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1117,61 +1117,6 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, TP_ARGS(wq) ); -#define show_oper_type(type) \ - __print_symbolic(type, \ - { BTRFS_QGROUP_OPER_ADD_EXCL, "OPER_ADD_EXCL" }, \ - { BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" }, \ - { BTRFS_QGROUP_OPER_SUB_EXCL, "OPER_SUB_EXCL" }, \ - { BTRFS_QGROUP_OPER_SUB_SHARED, "OPER_SUB_SHARED" }) - -DECLARE_EVENT_CLASS(btrfs_qgroup_oper, - - TP_PROTO(struct btrfs_qgroup_operation *oper), - - TP_ARGS(oper), - - TP_STRUCT__entry( - __field( u64, ref_root ) - __field( u64, bytenr ) - __field( u64, num_bytes ) - __field( u64, seq ) - __field( int, type ) - __field( u64, elem_seq ) - ), - - TP_fast_assign( - __entry->ref_root = oper->ref_root; - __entry->bytenr = oper->bytenr, - __entry->num_bytes = oper->num_bytes; - __entry->seq = oper->seq; - __entry->type = oper->type; - __entry->elem_seq = oper->elem.seq; - ), - - TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, " - "seq = %llu, elem.seq = %llu, type = %s", - (unsigned long 
long)__entry->ref_root, - (unsigned long long)__entry->bytenr, - (unsigned long long)__entry->num_bytes, - (unsigned long long)__entry->seq, - (unsigned long long)__entry->elem_seq, - show_oper_type(__entry->type)) -); - -DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account, - - TP_PROTO(struct btrfs_qgroup_operation *oper), - - TP_ARGS(oper) -); - -DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref, - - TP_PROTO(struct btrfs_qgroup_operation *oper), - - TP_ARGS(oper) -); - #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index c1c23f19d4a2..1ff9942718fe 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -272,6 +272,7 @@ header-y += ncp_fs.h header-y += ncp.h header-y += ncp_mount.h header-y += ncp_no.h +header-y += ndctl.h header-y += neighbour.h header-y += netconf.h header-y += netdevice.h diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h new file mode 100644 index 000000000000..2b94ea2287bb --- /dev/null +++ b/include/uapi/linux/ndctl.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU Lesser General Public License, + * version 2.1, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. 
+ */ +#ifndef __NDCTL_H__ +#define __NDCTL_H__ + +#include <linux/types.h> + +struct nd_cmd_smart { + __u32 status; + __u8 data[128]; +} __packed; + +struct nd_cmd_smart_threshold { + __u32 status; + __u8 data[8]; +} __packed; + +struct nd_cmd_dimm_flags { + __u32 status; + __u32 flags; +} __packed; + +struct nd_cmd_get_config_size { + __u32 status; + __u32 config_size; + __u32 max_xfer; +} __packed; + +struct nd_cmd_get_config_data_hdr { + __u32 in_offset; + __u32 in_length; + __u32 status; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_set_config_hdr { + __u32 in_offset; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_hdr { + __u32 opcode; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_tail { + __u32 status; + __u32 out_length; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_ars_cap { + __u64 address; + __u64 length; + __u32 status; + __u32 max_ars_out; +} __packed; + +struct nd_cmd_ars_start { + __u64 address; + __u64 length; + __u16 type; + __u8 reserved[6]; + __u32 status; +} __packed; + +struct nd_cmd_ars_status { + __u32 status; + __u32 out_length; + __u64 address; + __u64 length; + __u16 type; + __u32 num_records; + struct nd_ars_record { + __u32 handle; + __u32 flags; + __u64 err_address; + __u64 mask; + } __packed records[0]; +} __packed; + +enum { + ND_CMD_IMPLEMENTED = 0, + + /* bus commands */ + ND_CMD_ARS_CAP = 1, + ND_CMD_ARS_START = 2, + ND_CMD_ARS_STATUS = 3, + + /* per-dimm commands */ + ND_CMD_SMART = 1, + ND_CMD_SMART_THRESHOLD = 2, + ND_CMD_DIMM_FLAGS = 3, + ND_CMD_GET_CONFIG_SIZE = 4, + ND_CMD_GET_CONFIG_DATA = 5, + ND_CMD_SET_CONFIG_DATA = 6, + ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7, + ND_CMD_VENDOR_EFFECT_LOG = 8, + ND_CMD_VENDOR = 9, +}; + +static inline const char *nvdimm_bus_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_ARS_CAP] = "ars_cap", + [ND_CMD_ARS_START] = "ars_start", + [ND_CMD_ARS_STATUS] = "ars_status", + }; + + if (cmd < ARRAY_SIZE(names) && 
names[cmd]) + return names[cmd]; + return "unknown"; +} + +static inline const char *nvdimm_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_SMART] = "smart", + [ND_CMD_SMART_THRESHOLD] = "smart_thresh", + [ND_CMD_DIMM_FLAGS] = "flags", + [ND_CMD_GET_CONFIG_SIZE] = "get_size", + [ND_CMD_GET_CONFIG_DATA] = "get_data", + [ND_CMD_SET_CONFIG_DATA] = "set_data", + [ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size", + [ND_CMD_VENDOR_EFFECT_LOG] = "effect_log", + [ND_CMD_VENDOR] = "vendor", + }; + + if (cmd < ARRAY_SIZE(names) && names[cmd]) + return names[cmd]; + return "unknown"; +} + +#define ND_IOCTL 'N' + +#define ND_IOCTL_SMART _IOWR(ND_IOCTL, ND_CMD_SMART,\ + struct nd_cmd_smart) + +#define ND_IOCTL_SMART_THRESHOLD _IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\ + struct nd_cmd_smart_threshold) + +#define ND_IOCTL_DIMM_FLAGS _IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\ + struct nd_cmd_dimm_flags) + +#define ND_IOCTL_GET_CONFIG_SIZE _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\ + struct nd_cmd_get_config_size) + +#define ND_IOCTL_GET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\ + struct nd_cmd_get_config_data_hdr) + +#define ND_IOCTL_SET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\ + struct nd_cmd_set_config_hdr) + +#define ND_IOCTL_VENDOR _IOWR(ND_IOCTL, ND_CMD_VENDOR,\ + struct nd_cmd_vendor_hdr) + +#define ND_IOCTL_ARS_CAP _IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\ + struct nd_cmd_ars_cap) + +#define ND_IOCTL_ARS_START _IOWR(ND_IOCTL, ND_CMD_ARS_START,\ + struct nd_cmd_ars_start) + +#define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ + struct nd_cmd_ars_status) + +#define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ +#define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */ +#define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */ +#define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */ +#define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */ +#define 
ND_DEVICE_NAMESPACE_BLK 6 /* BLK namespace (may alias with PMEM) */ + +enum nd_driver_flags { + ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM, + ND_DRIVER_REGION_PMEM = 1 << ND_DEVICE_REGION_PMEM, + ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK, + ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO, + ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, + ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, +}; + +enum { + ND_MIN_NAMESPACE_SIZE = 0x00400000, +}; +#endif /* __NDCTL_H__ */ diff --git a/lib/Kconfig b/lib/Kconfig index 34e332b8d326..3a2ef67db6c7 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -528,4 +528,7 @@ source "lib/fonts/Kconfig" config ARCH_HAS_SG_CHAIN def_bool n +config ARCH_HAS_PMEM_API + bool + endmenu diff --git a/lib/kobject.c b/lib/kobject.c index 75ee63834fd1..2e3bd01964a9 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -545,6 +545,7 @@ out: kfree(devpath); return error; } +EXPORT_SYMBOL_GPL(kobject_move); /** * kobject_del - unlink kobject from hierarchy. diff --git a/mm/slab_common.c b/mm/slab_common.c index 9f8d71f78404..983b78694c46 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -855,6 +855,12 @@ void __init setup_kmalloc_cache_index_table(void) } } +static void new_kmalloc_cache(int idx, unsigned long flags) +{ + kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_info[idx].size, flags); +} + /* * Create the kmalloc array. 
Some of the regular kmalloc arrays * may already have been created because they were needed to @@ -864,25 +870,19 @@ void __init create_kmalloc_caches(unsigned long flags) { int i; - for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache( - kmalloc_info[i].name, - kmalloc_info[i].size, - flags); - } + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { + if (!kmalloc_caches[i]) + new_kmalloc_cache(i, flags); /* - * "i == 2" is the "kmalloc-192" case which is the last special - * case for initialization and it's the point to jump to - * allocate the minimize size of the object. In slab allocator, - * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4 - * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is - * defined, it may be larger than 2^5 and here is also the - * trick to skip the empty gap. + * Caches that are not of the two-to-the-power-of size. + * These have to be created immediately after the + * earlier power of two caches */ - if (i == 2) - i = (KMALLOC_SHIFT_LOW - 1); + if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) + new_kmalloc_cache(1, flags); + if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) + new_kmalloc_cache(2, flags); } /* Kmalloc array is now usable */ diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild new file mode 100644 index 000000000000..8e9b64520ec1 --- /dev/null +++ b/tools/testing/nvdimm/Kbuild @@ -0,0 +1,40 @@ +ldflags-y += --wrap=ioremap_cache +ldflags-y += --wrap=ioremap_nocache +ldflags-y += --wrap=iounmap +ldflags-y += --wrap=__request_region +ldflags-y += --wrap=__release_region + +DRIVERS := ../../../drivers +NVDIMM_SRC := $(DRIVERS)/nvdimm +ACPI_SRC := $(DRIVERS)/acpi + +obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o +obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_ACPI_NFIT) += nfit.o + +nfit-y := $(ACPI_SRC)/nfit.o 
+nfit-y += config_check.o + +nd_pmem-y := $(NVDIMM_SRC)/pmem.o +nd_pmem-y += config_check.o + +nd_btt-y := $(NVDIMM_SRC)/btt.o +nd_btt-y += config_check.o + +nd_blk-y := $(NVDIMM_SRC)/blk.o +nd_blk-y += config_check.o + +libnvdimm-y := $(NVDIMM_SRC)/core.o +libnvdimm-y += $(NVDIMM_SRC)/bus.o +libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o +libnvdimm-y += $(NVDIMM_SRC)/dimm.o +libnvdimm-y += $(NVDIMM_SRC)/region_devs.o +libnvdimm-y += $(NVDIMM_SRC)/region.o +libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o +libnvdimm-y += $(NVDIMM_SRC)/label.o +libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o +libnvdimm-y += config_check.o + +obj-m += test/ diff --git a/tools/testing/nvdimm/Makefile b/tools/testing/nvdimm/Makefile new file mode 100644 index 000000000000..3dfe024b4e7e --- /dev/null +++ b/tools/testing/nvdimm/Makefile @@ -0,0 +1,7 @@ +KDIR ?= ../../../ + +default: + $(MAKE) -C $(KDIR) M=$$PWD + +install: default + $(MAKE) -C $(KDIR) M=$$PWD modules_install diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c new file mode 100644 index 000000000000..f2c7615554eb --- /dev/null +++ b/tools/testing/nvdimm/config_check.c @@ -0,0 +1,15 @@ +#include <linux/kconfig.h> +#include <linux/bug.h> + +void check(void) +{ + /* + * These kconfig symbols must be set to "m" for nfit_test to + * load and operate. 
+ */ + BUILD_BUG_ON(!IS_MODULE(CONFIG_LIBNVDIMM)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT)); +} diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild new file mode 100644 index 000000000000..9241064970fe --- /dev/null +++ b/tools/testing/nvdimm/test/Kbuild @@ -0,0 +1,8 @@ +ccflags-y := -I$(src)/../../../../drivers/nvdimm/ +ccflags-y += -I$(src)/../../../../drivers/acpi/ + +obj-m += nfit_test.o +obj-m += nfit_test_iomap.o + +nfit_test-y := nfit.o +nfit_test_iomap-y := iomap.o diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c new file mode 100644 index 000000000000..c85a6f6ba559 --- /dev/null +++ b/tools/testing/nvdimm/test/iomap.c @@ -0,0 +1,151 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/rculist.h> +#include <linux/export.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/io.h> +#include "nfit_test.h" + +static LIST_HEAD(iomap_head); + +static struct iomap_ops { + nfit_test_lookup_fn nfit_test_lookup; + struct list_head list; +} iomap_ops = { + .list = LIST_HEAD_INIT(iomap_ops.list), +}; + +void nfit_test_setup(nfit_test_lookup_fn lookup) +{ + iomap_ops.nfit_test_lookup = lookup; + list_add_rcu(&iomap_ops.list, &iomap_head); +} +EXPORT_SYMBOL(nfit_test_setup); + +void nfit_test_teardown(void) +{ + list_del_rcu(&iomap_ops.list); + synchronize_rcu(); +} +EXPORT_SYMBOL(nfit_test_teardown); + +static struct nfit_test_resource *get_nfit_res(resource_size_t resource) +{ + struct iomap_ops *ops; + + ops = list_first_or_null_rcu(&iomap_head, typeof(*ops), list); + if (ops) + return ops->nfit_test_lookup(resource); + return NULL; +} + +void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, + void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res(offset); + rcu_read_unlock(); + if (nfit_res) + return (void __iomem *) nfit_res->buf + offset + - nfit_res->res->start; + return fallback_fn(offset, size); +} + +void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size) +{ + return __nfit_test_ioremap(offset, size, ioremap_cache); +} +EXPORT_SYMBOL(__wrap_ioremap_cache); + +void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size) +{ + return __nfit_test_ioremap(offset, size, ioremap_nocache); +} +EXPORT_SYMBOL(__wrap_ioremap_nocache); + +void __wrap_iounmap(volatile void __iomem *addr) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res((unsigned long) addr); + rcu_read_unlock(); + if (nfit_res) + return; + return iounmap(addr); +} +EXPORT_SYMBOL(__wrap_iounmap); + +struct resource 
*__wrap___request_region(struct resource *parent, + resource_size_t start, resource_size_t n, const char *name, + int flags) +{ + struct nfit_test_resource *nfit_res; + + if (parent == &iomem_resource) { + rcu_read_lock(); + nfit_res = get_nfit_res(start); + rcu_read_unlock(); + if (nfit_res) { + struct resource *res = nfit_res->res + 1; + + if (start + n > nfit_res->res->start + + resource_size(nfit_res->res)) { + pr_debug("%s: start: %llx n: %llx overflow: %pr\n", + __func__, start, n, + nfit_res->res); + return NULL; + } + + res->start = start; + res->end = start + n - 1; + res->name = name; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + pr_debug("%s: %pr\n", __func__, res); + return res; + } + } + return __request_region(parent, start, n, name, flags); +} +EXPORT_SYMBOL(__wrap___request_region); + +void __wrap___release_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct nfit_test_resource *nfit_res; + + if (parent == &iomem_resource) { + rcu_read_lock(); + nfit_res = get_nfit_res(start); + rcu_read_unlock(); + if (nfit_res) { + struct resource *res = nfit_res->res + 1; + + if (start != res->start || resource_size(res) != n) + pr_info("%s: start: %llx n: %llx mismatch: %pr\n", + __func__, start, n, res); + else + memset(res, 0, sizeof(*res)); + return; + } + } + __release_region(parent, start, n); +} +EXPORT_SYMBOL(__wrap___release_region); + +MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c new file mode 100644 index 000000000000..4b69b8368de0 --- /dev/null +++ b/tools/testing/nvdimm/test/nfit.c @@ -0,0 +1,1116 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/platform_device.h> +#include <linux/dma-mapping.h> +#include <linux/libnvdimm.h> +#include <linux/vmalloc.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/ndctl.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <nfit.h> +#include <nd.h> +#include "nfit_test.h" + +/* + * Generate an NFIT table to describe the following topology: + * + * BUS0: Interleaved PMEM regions, and aliasing with BLK regions + * + * (a) (b) DIMM BLK-REGION + * +----------+--------------+----------+---------+ + * +------+ | blk2.0 | pm0.0 | blk2.1 | pm1.0 | 0 region2 + * | imc0 +--+- - - - - region0 - - - -+----------+ + + * +--+---+ | blk3.0 | pm0.0 | blk3.1 | pm1.0 | 1 region3 + * | +----------+--------------v----------v v + * +--+---+ | | + * | cpu0 | region1 + * +--+---+ | | + * | +-------------------------^----------^ ^ + * +--+---+ | blk4.0 | pm1.0 | 2 region4 + * | imc1 +--+-------------------------+----------+ + + * +------+ | blk5.0 | pm1.0 | 3 region5 + * +-------------------------+----------+-+-------+ + * + * *) In this layout we have four dimms and two memory controllers in one + * socket. Each unique interface (BLK or PMEM) to DPA space + * is identified by a region device with a dynamically assigned id. + * + * *) The first portion of dimm0 and dimm1 are interleaved as REGION0. + * A single PMEM namespace "pm0.0" is created using half of the + * REGION0 SPA-range. REGION0 spans dimm0 and dimm1. PMEM namespaces + * allocate from the bottom of a region. The unallocated + * portion of REGION0 aliases with REGION2 and REGION3.
That + * unallocated capacity is reclaimed as BLK namespaces ("blk2.0" and + * "blk3.0") starting at the base of each DIMM to offset (a) in those + * DIMMs. "pm0.0", "blk2.0" and "blk3.0" are free-form readable + * names that can be assigned to a namespace. + * + * *) In the last portion of dimm0 and dimm1 we have an interleaved + * SPA range, REGION1, that spans those two dimms as well as dimm2 + * and dimm3. Some of REGION1 is allocated to a PMEM namespace named + * "pm1.0", the rest is reclaimed in 4 BLK namespaces (for each + * dimm in the interleave set), "blk2.1", "blk3.1", "blk4.0", and + * "blk5.0". + * + * *) The portion of dimm2 and dimm3 that do not participate in the + * REGION1 interleaved SPA range (i.e. the DPA addresses below offset + * (b)) are also included in the "blk4.0" and "blk5.0" namespaces. + * Note, that BLK namespaces need not be contiguous in DPA-space, and + * can consume aliased capacity from multiple interleave sets. + * + * BUS1: Legacy NVDIMM (single contiguous range) + * + * region2 + * +---------------------+ + * |---------------------| + * || pm2.0 || + * |---------------------| + * +---------------------+ + * + * *) An NFIT-table may describe a simple system-physical-address range + * with no BLK aliasing. This type of region may optionally + * reference an NVDIMM.
+ */ +enum { + NUM_PM = 2, + NUM_DCR = 4, + NUM_BDW = NUM_DCR, + NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW, + NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */, + DIMM_SIZE = SZ_32M, + LABEL_SIZE = SZ_128K, + SPA0_SIZE = DIMM_SIZE, + SPA1_SIZE = DIMM_SIZE*2, + SPA2_SIZE = DIMM_SIZE, + BDW_SIZE = 64 << 8, + DCR_SIZE = 12, + NUM_NFITS = 2, /* permit testing multiple NFITs per system */ +}; + +struct nfit_test_dcr { + __le64 bdw_addr; + __le32 bdw_status; + __u8 aperature[BDW_SIZE]; +}; + +#define NFIT_DIMM_HANDLE(node, socket, imc, chan, dimm) \ + (((node & 0xfff) << 16) | ((socket & 0xf) << 12) \ + | ((imc & 0xf) << 8) | ((chan & 0xf) << 4) | (dimm & 0xf)) + +static u32 handle[NUM_DCR] = { + [0] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 0), + [1] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 1), + [2] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 0), + [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1), +}; + +struct nfit_test { + struct acpi_nfit_desc acpi_desc; + struct platform_device pdev; + struct list_head resources; + void *nfit_buf; + dma_addr_t nfit_dma; + size_t nfit_size; + int num_dcr; + int num_pm; + void **dimm; + dma_addr_t *dimm_dma; + void **label; + dma_addr_t *label_dma; + void **spa_set; + dma_addr_t *spa_set_dma; + struct nfit_test_dcr **dcr; + dma_addr_t *dcr_dma; + int (*alloc)(struct nfit_test *t); + void (*setup)(struct nfit_test *t); +}; + +static struct nfit_test *to_nfit_test(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + + return container_of(pdev, struct nfit_test, pdev); +} + +static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) +{ + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + int i, rc; + + if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + return -ENXIO; + + /* lookup label space for the given dimm */ 
+ for (i = 0; i < ARRAY_SIZE(handle); i++) + if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i]) + break; + if (i >= ARRAY_SIZE(handle)) + return -ENXIO; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: { + struct nd_cmd_get_config_size *nd_cmd = buf; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + nd_cmd->status = 0; + nd_cmd->config_size = LABEL_SIZE; + nd_cmd->max_xfer = SZ_4K; + rc = 0; + break; + } + case ND_CMD_GET_CONFIG_DATA: { + struct nd_cmd_get_config_data_hdr *nd_cmd = buf; + unsigned int len, offset = nd_cmd->in_offset; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) + return -EINVAL; + + nd_cmd->status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(nd_cmd->out_buf, t->label[i] + offset, len); + rc = buf_len - sizeof(*nd_cmd) - len; + break; + } + case ND_CMD_SET_CONFIG_DATA: { + struct nd_cmd_set_config_hdr *nd_cmd = buf; + unsigned int len, offset = nd_cmd->in_offset; + u32 *status; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) + return -EINVAL; + + status = buf + nd_cmd->in_length + sizeof(*nd_cmd); + *status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(t->label[i] + offset, nd_cmd->in_buf, len); + rc = buf_len - sizeof(*nd_cmd) - (len + 4); + break; + } + default: + return -ENOTTY; + } + + return rc; +} + +static DEFINE_SPINLOCK(nfit_test_lock); +static struct nfit_test *instances[NUM_NFITS]; + +static void release_nfit_res(void *data) +{ + struct nfit_test_resource *nfit_res = data; + struct resource *res = nfit_res->res; + + spin_lock(&nfit_test_lock); + list_del(&nfit_res->list); + spin_unlock(&nfit_test_lock); + + if (is_vmalloc_addr(nfit_res->buf)) + vfree(nfit_res->buf); + else + dma_free_coherent(nfit_res->dev, resource_size(res), + nfit_res->buf, res->start); + kfree(res); 
+ kfree(nfit_res); +} + +static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, + void *buf) +{ + struct device *dev = &t->pdev.dev; + struct resource *res = kzalloc(sizeof(*res) * 2, GFP_KERNEL); + struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res), + GFP_KERNEL); + int rc; + + if (!res || !buf || !nfit_res) + goto err; + rc = devm_add_action(dev, release_nfit_res, nfit_res); + if (rc) + goto err; + INIT_LIST_HEAD(&nfit_res->list); + memset(buf, 0, size); + nfit_res->dev = dev; + nfit_res->buf = buf; + nfit_res->res = res; + res->start = *dma; + res->end = *dma + size - 1; + res->name = "NFIT"; + spin_lock(&nfit_test_lock); + list_add(&nfit_res->list, &t->resources); + spin_unlock(&nfit_test_lock); + + return nfit_res->buf; + err: + if (buf && !is_vmalloc_addr(buf)) + dma_free_coherent(dev, size, buf, *dma); + else if (buf) + vfree(buf); + kfree(res); + kfree(nfit_res); + return NULL; +} + +static void *test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma) +{ + void *buf = vmalloc(size); + + *dma = (unsigned long) buf; + return __test_alloc(t, size, dma, buf); +} + +static void *test_alloc_coherent(struct nfit_test *t, size_t size, + dma_addr_t *dma) +{ + struct device *dev = &t->pdev.dev; + void *buf = dma_alloc_coherent(dev, size, dma, GFP_KERNEL); + + return __test_alloc(t, size, dma, buf); +} + +static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(instances); i++) { + struct nfit_test_resource *n, *nfit_res = NULL; + struct nfit_test *t = instances[i]; + + if (!t) + continue; + spin_lock(&nfit_test_lock); + list_for_each_entry(n, &t->resources, list) { + if (addr >= n->res->start && (addr < n->res->start + + resource_size(n->res))) { + nfit_res = n; + break; + } else if (addr >= (unsigned long) n->buf + && (addr < (unsigned long) n->buf + + resource_size(n->res))) { + nfit_res = n; + break; + } + } + spin_unlock(&nfit_test_lock); + if (nfit_res) + return 
nfit_res; + } + + return NULL; +} + +static int nfit_test0_alloc(struct nfit_test *t) +{ + size_t nfit_size = sizeof(struct acpi_table_nfit) + + sizeof(struct acpi_nfit_system_address) * NUM_SPA + + sizeof(struct acpi_nfit_memory_map) * NUM_MEM + + sizeof(struct acpi_nfit_control_region) * NUM_DCR + + sizeof(struct acpi_nfit_data_region) * NUM_BDW; + int i; + + t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); + if (!t->nfit_buf) + return -ENOMEM; + t->nfit_size = nfit_size; + + t->spa_set[0] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[0]); + if (!t->spa_set[0]) + return -ENOMEM; + + t->spa_set[1] = test_alloc_coherent(t, SPA1_SIZE, &t->spa_set_dma[1]); + if (!t->spa_set[1]) + return -ENOMEM; + + for (i = 0; i < NUM_DCR; i++) { + t->dimm[i] = test_alloc(t, DIMM_SIZE, &t->dimm_dma[i]); + if (!t->dimm[i]) + return -ENOMEM; + + t->label[i] = test_alloc(t, LABEL_SIZE, &t->label_dma[i]); + if (!t->label[i]) + return -ENOMEM; + sprintf(t->label[i], "label%d", i); + } + + for (i = 0; i < NUM_DCR; i++) { + t->dcr[i] = test_alloc(t, LABEL_SIZE, &t->dcr_dma[i]); + if (!t->dcr[i]) + return -ENOMEM; + } + + return 0; +} + +static int nfit_test1_alloc(struct nfit_test *t) +{ + size_t nfit_size = sizeof(struct acpi_table_nfit) + + sizeof(struct acpi_nfit_system_address) + + sizeof(struct acpi_nfit_memory_map) + + sizeof(struct acpi_nfit_control_region); + + t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); + if (!t->nfit_buf) + return -ENOMEM; + t->nfit_size = nfit_size; + + t->spa_set[0] = test_alloc_coherent(t, SPA2_SIZE, &t->spa_set_dma[0]); + if (!t->spa_set[0]) + return -ENOMEM; + + return 0; +} + +static void nfit_test_init_header(struct acpi_table_nfit *nfit, size_t size) +{ + memcpy(nfit->header.signature, ACPI_SIG_NFIT, 4); + nfit->header.length = size; + nfit->header.revision = 1; + memcpy(nfit->header.oem_id, "LIBND", 6); + memcpy(nfit->header.oem_table_id, "TEST", 5); + nfit->header.oem_revision = 1; + memcpy(nfit->header.asl_compiler_id, "TST", 4); + 
nfit->header.asl_compiler_revision = 1; +} + +static void nfit_test0_setup(struct nfit_test *t) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct acpi_nfit_memory_map *memdev; + void *nfit_buf = t->nfit_buf; + size_t size = t->nfit_size; + struct acpi_nfit_system_address *spa; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_data_region *bdw; + unsigned int offset; + + nfit_test_init_header(nfit_buf, size); + + /* + * spa0 (interleave first half of dimm0 and dimm1, note storage + * does not actually alias the related block-data-window + * regions) + */ + spa = nfit_buf + sizeof(struct acpi_table_nfit); + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 0+1; + spa->address = t->spa_set_dma[0]; + spa->length = SPA0_SIZE; + + /* + * spa1 (interleave last half of the 4 DIMMS, note storage + * does not actually alias the related block-data-window + * regions) + */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa); + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 1+1; + spa->address = t->spa_set_dma[1]; + spa->length = SPA1_SIZE; + + /* spa2 (dcr0) dimm0 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 2; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 2+1; + spa->address = t->dcr_dma[0]; + spa->length = DCR_SIZE; + + /* spa3 (dcr1) dimm1 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 3; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 3+1; + spa->address = t->dcr_dma[1]; + spa->length = DCR_SIZE; 
+ + /* spa4 (dcr2) dimm2 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 4; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 4+1; + spa->address = t->dcr_dma[2]; + spa->length = DCR_SIZE; + + /* spa5 (dcr3) dimm3 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 5; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 5+1; + spa->address = t->dcr_dma[3]; + spa->length = DCR_SIZE; + + /* spa6 (bdw for dcr0) dimm0 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 6; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 6+1; + spa->address = t->dimm_dma[0]; + spa->length = DIMM_SIZE; + + /* spa7 (bdw for dcr1) dimm1 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 7; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 7+1; + spa->address = t->dimm_dma[1]; + spa->length = DIMM_SIZE; + + /* spa8 (bdw for dcr2) dimm2 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 8; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 8+1; + spa->address = t->dimm_dma[2]; + spa->length = DIMM_SIZE; + + /* spa9 (bdw for dcr3) dimm3 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 9; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 9+1; + spa->address = t->dimm_dma[3]; + 
spa->length = DIMM_SIZE; + + offset = sizeof(struct acpi_table_nfit) + sizeof(*spa) * 10; + /* mem-region0 (spa0, dimm0) */ + memdev = nfit_buf + offset; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 0+1; + memdev->region_index = 0+1; + memdev->region_size = SPA0_SIZE/2; + memdev->region_offset = t->spa_set_dma[0]; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 2; + + /* mem-region1 (spa0, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map); + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 0+1; + memdev->region_index = 1+1; + memdev->region_size = SPA0_SIZE/2; + memdev->region_offset = t->spa_set_dma[0] + SPA0_SIZE/2; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 2; + + /* mem-region2 (spa1, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 1; + memdev->range_index = 1+1; + memdev->region_index = 0+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1]; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region3 (spa1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 1; + memdev->range_index = 1+1; + memdev->region_index = 1+1; + memdev->region_size = 
SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region4 (spa1, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 4; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 1+1; + memdev->region_index = 2+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + 2*SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region5 (spa1, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 1+1; + memdev->region_index = 3+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + 3*SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region6 (spa/dcr0, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 6; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 2+1; + memdev->region_index = 0+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region7 (spa/dcr1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 7; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + 
memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 3+1; + memdev->region_index = 1+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region8 (spa/dcr2, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 8; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 4+1; + memdev->region_index = 2+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region9 (spa/dcr3, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 9; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 5+1; + memdev->region_index = 3+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region10 (spa/bdw0, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 10; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 6+1; + memdev->region_index = 0+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region11 (spa/bdw1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 11; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + 
memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 7+1; + memdev->region_index = 1+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region12 (spa/bdw2, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 12; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 8+1; + memdev->region_index = 2+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region13 (spa/dcr3, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 13; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 9+1; + memdev->region_index = 3+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + offset = offset + sizeof(struct acpi_nfit_memory_map) * 14; + /* dcr-descriptor0 */ + dcr = nfit_buf + offset; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 0+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[0]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor1 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region); + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 
1+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[1]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor2 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 2; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 2+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[2]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor3 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 3; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 3+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[3]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + offset = offset + sizeof(struct acpi_nfit_control_region) * 4; + /* bdw0 (spa/dcr0, dimm0) */ + bdw = nfit_buf + offset; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 0+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + /* bdw1 (spa/dcr1, dimm1) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region); + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 1+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + 
bdw->start_address = 0; + + /* bdw2 (spa/dcr2, dimm2) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 2; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 2+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + /* bdw3 (spa/dcr3, dimm3) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 3; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 3+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + acpi_desc = &t->acpi_desc; + set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + nd_desc = &acpi_desc->nd_desc; + nd_desc->ndctl = nfit_test_ctl; +} + +static void nfit_test1_setup(struct nfit_test *t) +{ + size_t size = t->nfit_size, offset; + void *nfit_buf = t->nfit_buf; + struct acpi_nfit_memory_map *memdev; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_system_address *spa; + + nfit_test_init_header(nfit_buf, size); + + offset = sizeof(struct acpi_table_nfit); + /* spa0 (flat range with no bdw aliasing) */ + spa = nfit_buf + offset; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 0+1; + spa->address = t->spa_set_dma[0]; + spa->length = SPA2_SIZE; + + offset += sizeof(*spa); + /* mem-region0 (spa0, dimm0) */ + memdev = nfit_buf + offset; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = 0; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 0+1; + 
memdev->region_index = 0+1; + memdev->region_size = SPA2_SIZE; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + memdev->flags = ACPI_NFIT_MEM_SAVE_FAILED | ACPI_NFIT_MEM_RESTORE_FAILED + | ACPI_NFIT_MEM_FLUSH_FAILED | ACPI_NFIT_MEM_HEALTH_OBSERVED + | ACPI_NFIT_MEM_ARMED; + + offset += sizeof(*memdev); + /* dcr-descriptor0 */ + dcr = nfit_buf + offset; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 0+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~0; + dcr->code = 0x201; + dcr->windows = 0; + dcr->window_size = 0; + dcr->command_offset = 0; + dcr->command_size = 0; + dcr->status_offset = 0; + dcr->status_size = 0; +} + +static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw) +{ + struct nfit_blk *nfit_blk = ndbr->blk_provider_data; + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + struct nd_region *nd_region = &ndbr->nd_region; + unsigned int lane; + + lane = nd_region_acquire_lane(nd_region); + if (rw) + memcpy(mmio->base + dpa, iobuf, len); + else + memcpy(iobuf, mmio->base + dpa, len); + nd_region_release_lane(nd_region, lane); + + return 0; +} + +static int nfit_test_probe(struct platform_device *pdev) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct device *dev = &pdev->dev; + struct nfit_test *nfit_test; + int rc; + + nfit_test = to_nfit_test(&pdev->dev); + + /* common alloc */ + if (nfit_test->num_dcr) { + int num = nfit_test->num_dcr; + + nfit_test->dimm = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->dimm_dma = devm_kcalloc(dev, num, sizeof(dma_addr_t), + GFP_KERNEL); + nfit_test->label = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->label_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + 
nfit_test->dcr = devm_kcalloc(dev, num, + sizeof(struct nfit_test_dcr *), GFP_KERNEL); + nfit_test->dcr_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + if (nfit_test->dimm && nfit_test->dimm_dma && nfit_test->label + && nfit_test->label_dma && nfit_test->dcr + && nfit_test->dcr_dma) + /* pass */; + else + return -ENOMEM; + } + + if (nfit_test->num_pm) { + int num = nfit_test->num_pm; + + nfit_test->spa_set = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->spa_set_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + if (nfit_test->spa_set && nfit_test->spa_set_dma) + /* pass */; + else + return -ENOMEM; + } + + /* per-nfit specific alloc */ + if (nfit_test->alloc(nfit_test)) + return -ENOMEM; + + nfit_test->setup(nfit_test); + acpi_desc = &nfit_test->acpi_desc; + acpi_desc->dev = &pdev->dev; + acpi_desc->nfit = nfit_test->nfit_buf; + acpi_desc->blk_do_io = nfit_test_blk_do_io; + nd_desc = &acpi_desc->nd_desc; + nd_desc->attr_groups = acpi_nfit_attribute_groups; + acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc); + if (!acpi_desc->nvdimm_bus) + return -ENXIO; + + rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size); + if (rc) { + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return rc; + } + + return 0; +} + +static int nfit_test_remove(struct platform_device *pdev) +{ + struct nfit_test *nfit_test = to_nfit_test(&pdev->dev); + struct acpi_nfit_desc *acpi_desc = &nfit_test->acpi_desc; + + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + + return 0; +} + +static void nfit_test_release(struct device *dev) +{ + struct nfit_test *nfit_test = to_nfit_test(dev); + + kfree(nfit_test); +} + +static const struct platform_device_id nfit_test_id[] = { + { KBUILD_MODNAME }, + { }, +}; + +static struct platform_driver nfit_test_driver = { + .probe = nfit_test_probe, + .remove = nfit_test_remove, + .driver = { + .name = KBUILD_MODNAME, + }, + .id_table = nfit_test_id, +}; + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define 
CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +static __init int nfit_test_init(void) +{ + int rc, i; + + nfit_test_setup(nfit_test_lookup); + + for (i = 0; i < NUM_NFITS; i++) { + struct nfit_test *nfit_test; + struct platform_device *pdev; + static int once; + + nfit_test = kzalloc(sizeof(*nfit_test), GFP_KERNEL); + if (!nfit_test) { + rc = -ENOMEM; + goto err_register; + } + INIT_LIST_HEAD(&nfit_test->resources); + switch (i) { + case 0: + nfit_test->num_pm = NUM_PM; + nfit_test->num_dcr = NUM_DCR; + nfit_test->alloc = nfit_test0_alloc; + nfit_test->setup = nfit_test0_setup; + break; + case 1: + nfit_test->num_pm = 1; + nfit_test->alloc = nfit_test1_alloc; + nfit_test->setup = nfit_test1_setup; + break; + default: + rc = -EINVAL; + goto err_register; + } + pdev = &nfit_test->pdev; + pdev->name = KBUILD_MODNAME; + pdev->id = i; + pdev->dev.release = nfit_test_release; + rc = platform_device_register(pdev); + if (rc) { + put_device(&pdev->dev); + goto err_register; + } + + rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) + goto err_register; + + instances[i] = nfit_test; + + if (!once++) { + dma_addr_t dma; + void *buf; + + buf = dma_alloc_coherent(&pdev->dev, SZ_128M, &dma, + GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + dev_warn(&pdev->dev, "need 128M of free cma\n"); + goto err_register; + } + dma_free_coherent(&pdev->dev, SZ_128M, buf, dma); + } + } + + rc = platform_driver_register(&nfit_test_driver); + if (rc) + goto err_register; + return 0; + + err_register: + for (i = 0; i < NUM_NFITS; i++) + if (instances[i]) + platform_device_unregister(&instances[i]->pdev); + nfit_test_teardown(); + return rc; +} + +static __exit void nfit_test_exit(void) +{ + int i; + + platform_driver_unregister(&nfit_test_driver); + for (i = 0; i < NUM_NFITS; i++) + platform_device_unregister(&instances[i]->pdev); + nfit_test_teardown(); +} + +module_init(nfit_test_init); +module_exit(nfit_test_exit); +MODULE_LICENSE("GPL 
v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h new file mode 100644 index 000000000000..96c5e16d7db9 --- /dev/null +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -0,0 +1,29 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __NFIT_TEST_H__ +#define __NFIT_TEST_H__ + +struct nfit_test_resource { + struct list_head list; + struct resource *res; + struct device *dev; + void *buf; +}; + +typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t); +void __iomem *__wrap_ioremap_nocache(resource_size_t offset, + unsigned long size); +void __wrap_iounmap(volatile void __iomem *addr); +void nfit_test_setup(nfit_test_lookup_fn lookup); +void nfit_test_teardown(void); +#endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 95abddcd7839..24ae9e829e9a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -4,6 +4,7 @@ TARGETS += efivarfs TARGETS += exec TARGETS += firmware TARGETS += ftrace +TARGETS += futex TARGETS += kcmp TARGETS += memfd TARGETS += memory-hotplug @@ -12,13 +13,18 @@ TARGETS += mqueue TARGETS += net TARGETS += powerpc TARGETS += ptrace +TARGETS += seccomp TARGETS += size TARGETS += sysctl +ifneq (1, $(quicktest)) TARGETS += timers +endif TARGETS += user TARGETS += vm TARGETS += x86 #Please keep the TARGETS list alphabetically sorted +# Run "make quicktest=1 run_tests" or +# "make quicktest=1 kselftest from top 
level Makefile TARGETS_HOTPLUG = cpu-hotplug TARGETS_HOTPLUG += memory-hotplug @@ -27,7 +33,7 @@ TARGETS_HOTPLUG += memory-hotplug # Makefile to avoid test build failures when test # Makefile doesn't have explicit build rules. ifeq (1,$(MAKELEVEL)) -undefine LDFLAGS +override LDFLAGS = override MAKEFLAGS = endif diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 4edb7d0da29b..6b76bfdc847e 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -1,6 +1,6 @@ CFLAGS = -Wall BINARIES = execveat -DEPS = execveat.symlink execveat.denatured script subdir +DEPS = execveat.symlink execveat.denatured script all: $(BINARIES) $(DEPS) subdir: diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile index 346720639d1d..0acbeca47225 100644 --- a/tools/testing/selftests/ftrace/Makefile +++ b/tools/testing/selftests/ftrace/Makefile @@ -1,6 +1,7 @@ all: TEST_PROGS := ftracetest +TEST_DIRS := test.d/ include ../lib.mk diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile new file mode 100644 index 000000000000..6a1752956283 --- /dev/null +++ b/tools/testing/selftests/futex/Makefile @@ -0,0 +1,29 @@ +SUBDIRS := functional + +TEST_PROGS := run.sh + +.PHONY: all clean +all: + for DIR in $(SUBDIRS); do $(MAKE) -C $$DIR $@ ; done + +include ../lib.mk + +override define RUN_TESTS + ./run.sh +endef + +override define INSTALL_RULE + mkdir -p $(INSTALL_PATH) + install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) + + @for SUBDIR in $(SUBDIRS); do \ + $(MAKE) -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \ + done; +endef + +override define EMIT_TESTS + echo "./run.sh" +endef + +clean: + for DIR in $(SUBDIRS); do $(MAKE) -C $$DIR $@ ; done diff --git a/tools/testing/selftests/futex/README b/tools/testing/selftests/futex/README new file mode 100644 index 000000000000..3224a049b196 --- /dev/null 
+++ b/tools/testing/selftests/futex/README @@ -0,0 +1,62 @@ +Futex Test +========== +Futex Test is intended to thoroughly test the Linux kernel futex system call +API. + +Functional tests shall test the documented behavior of the futex operation +code under test. This includes checking for proper behavior under normal use, +odd corner cases, regression tests, and abject abuse and misuse. + +Futextest will also provide example implementations of mutual exclusion +primitives. These can be used as is in user applications or can serve as +examples for system libraries. These will likely be added to either a new lib/ +directory or purely as header files under include/; I'm leaning toward the +latter. + +Quick Start +----------- +# make +# ./run.sh + +Design and Implementation Goals +------------------------------- +o Tests should be as self contained as is practical so as to facilitate sharing + the individual tests on mailing list discussions and bug reports. +o The build system shall remain as simple as possible, avoiding any archive or + shared object building and linking. +o Where possible, any helper functions or other package-wide code shall be + implemented in header files, avoiding the need to compile intermediate object + files. +o External dependencies shall remain as minimal as possible. Currently gcc + and glibc are the only dependencies. +o Tests return 0 for success and < 0 for failure. + +Output Formatting +----------------- +Test output shall be easily parsable by both human and machine. Title and +results are printed to stdout, while intermediate ERROR or FAIL messages are +sent to stderr. Tests shall support the -c option to print PASS, FAIL, and +ERROR strings in color for easy visual parsing.
Output shall conform to the +following format: + +test_name: Description of the test + Arguments: arg1=val1 #units specified for clarity where appropriate + ERROR: Description of unexpected error + FAIL: Reason for test failure + # FIXME: Perhaps an " INFO: informational message" option would be + # useful here. Using -v to toggle them on and off, as with -c. + # there may be multiple ERROR or FAIL messages +Result: (PASS|FAIL|ERROR) + +Naming +------ +o FIXME: decide on a sane test naming scheme. Currently the tests are named + based on the primary futex operation they test. Eventually this will become a + problem as we intend to write multiple tests which collide in this namespace. + Perhaps something like "wait-wake-1" "wait-wake-2" is adequate, leaving the + detailed description in the test source and the output. + +Coding Style +------------ +o The Futex Test project adheres to the coding standards set forth by the Linux + kernel as defined in the Linux source Documentation/CodingStyle.
diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore new file mode 100644 index 000000000000..a09f57061902 --- /dev/null +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -0,0 +1,7 @@ +futex_requeue_pi +futex_requeue_pi_mismatched_ops +futex_requeue_pi_signal_restart +futex_wait_private_mapped_file +futex_wait_timeout +futex_wait_uninitialized_heap +futex_wait_wouldblock diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile new file mode 100644 index 000000000000..9d6b75ef7b5d --- /dev/null +++ b/tools/testing/selftests/futex/functional/Makefile @@ -0,0 +1,25 @@ +INCLUDES := -I../include -I../../ +CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) +LDFLAGS := $(LDFLAGS) -pthread -lrt + +HEADERS := ../include/futextest.h +TARGETS := \ + futex_wait_timeout \ + futex_wait_wouldblock \ + futex_requeue_pi \ + futex_requeue_pi_signal_restart \ + futex_requeue_pi_mismatched_ops \ + futex_wait_uninitialized_heap \ + futex_wait_private_mapped_file + +TEST_PROGS := $(TARGETS) run.sh + +.PHONY: all clean +all: $(TARGETS) + +$(TARGETS): $(HEADERS) + +include ../../lib.mk + +clean: + rm -f $(TARGETS) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c new file mode 100644 index 000000000000..3da06ad23996 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -0,0 +1,409 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2006-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * DESCRIPTION + * This test excercises the futex syscall op codes needed for requeuing + * priority inheritance aware POSIX condition variables and mutexes. + * + * AUTHORS + * Sripathi Kodi <sripathik@in.ibm.com> + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2008-Jan-13: Initial version by Sripathi Kodi <sripathik@in.ibm.com> + * 2009-Nov-6: futex test adaptation by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#include <errno.h> +#include <limits.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <string.h> +#include "atomic.h" +#include "futextest.h" +#include "logging.h" + +#define MAX_WAKE_ITERS 1000 +#define THREAD_MAX 10 +#define SIGNAL_PERIOD_US 100 + +atomic_t waiters_blocked = ATOMIC_INITIALIZER; +atomic_t waiters_woken = ATOMIC_INITIALIZER; + +futex_t f1 = FUTEX_INITIALIZER; +futex_t f2 = FUTEX_INITIALIZER; +futex_t wake_complete = FUTEX_INITIALIZER; + +/* Test option defaults */ +static long timeout_ns; +static int broadcast; +static int owner; +static int locked; + +struct thread_arg { + long id; + struct timespec *timeout; + int lock; + int ret; +}; +#define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 } + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -b Broadcast wakeup (all waiters)\n"); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -l Lock the pi futex across requeue\n"); + printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n"); + printf(" -t N Timeout in nanoseconds (default: 0)\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, + int policy, int prio) +{ + int ret; + struct sched_param schedp; + pthread_attr_t attr; + + pthread_attr_init(&attr); + memset(&schedp, 0, sizeof(schedp)); + + ret = 
pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); + if (ret) { + error("pthread_attr_setinheritsched\n", ret); + return -1; + } + + ret = pthread_attr_setschedpolicy(&attr, policy); + if (ret) { + error("pthread_attr_setschedpolicy\n", ret); + return -1; + } + + schedp.sched_priority = prio; + ret = pthread_attr_setschedparam(&attr, &schedp); + if (ret) { + error("pthread_attr_setschedparam\n", ret); + return -1; + } + + ret = pthread_create(pth, &attr, func, arg); + if (ret) { + error("pthread_create\n", ret); + return -1; + } + return 0; +} + + +void *waiterfn(void *arg) +{ + struct thread_arg *args = (struct thread_arg *)arg; + futex_t old_val; + + info("Waiter %ld: running\n", args->id); + /* Each thread sleeps for a different amount of time + * This is to avoid races, because we don't lock the + * external mutex here */ + usleep(1000 * (long)args->id); + + old_val = f1; + atomic_inc(&waiters_blocked); + info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n", + &f1, f1, &f2); + args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout, + FUTEX_PRIVATE_FLAG); + + info("waiter %ld woke with %d %s\n", args->id, args->ret, + args->ret < 0 ? 
strerror(errno) : ""); + atomic_inc(&waiters_woken); + if (args->ret < 0) { + if (args->timeout && errno == ETIMEDOUT) + args->ret = 0; + else { + args->ret = RET_ERROR; + error("futex_wait_requeue_pi\n", errno); + } + futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); + } + futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + + info("Waiter %ld: exiting with %d\n", args->id, args->ret); + pthread_exit((void *)&args->ret); +} + +void *broadcast_wakerfn(void *arg) +{ + struct thread_arg *args = (struct thread_arg *)arg; + int nr_requeue = INT_MAX; + int task_count = 0; + futex_t old_val; + int nr_wake = 1; + int i = 0; + + info("Waker: waiting for waiters to block\n"); + while (waiters_blocked.val < THREAD_MAX) + usleep(1000); + usleep(1000); + + info("Waker: Calling broadcast\n"); + if (args->lock) { + info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2); + futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); + } + continue_requeue: + old_val = f1; + args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue, + FUTEX_PRIVATE_FLAG); + if (args->ret < 0) { + args->ret = RET_ERROR; + error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + } else if (++i < MAX_WAKE_ITERS) { + task_count += args->ret; + if (task_count < THREAD_MAX - waiters_woken.val) + goto continue_requeue; + } else { + error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n", + 0, MAX_WAKE_ITERS, task_count, THREAD_MAX); + args->ret = RET_ERROR; + } + + futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); + + if (args->lock) + futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + + if (args->ret > 0) + args->ret = task_count; + + info("Waker: exiting with %d\n", args->ret); + pthread_exit((void *)&args->ret); +} + +void *signal_wakerfn(void *arg) +{ + struct thread_arg *args = (struct thread_arg *)arg; + unsigned int old_val; + int nr_requeue = 0; + int task_count = 0; + int nr_wake = 1; + int i = 0; + + info("Waker: waiting for waiters to block\n"); + while (waiters_blocked.val < 
THREAD_MAX) + usleep(1000); + usleep(1000); + + while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) { + info("task_count: %d, waiters_woken: %d\n", + task_count, waiters_woken.val); + if (args->lock) { + info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", + f2, &f2); + futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); + } + info("Waker: Calling signal\n"); + /* cond_signal */ + old_val = f1; + args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, + nr_wake, nr_requeue, + FUTEX_PRIVATE_FLAG); + if (args->ret < 0) + args->ret = -errno; + info("futex: %x\n", f2); + if (args->lock) { + info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", + f2, &f2); + futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + } + info("futex: %x\n", f2); + if (args->ret < 0) { + error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + args->ret = RET_ERROR; + break; + } + + task_count += args->ret; + usleep(SIGNAL_PERIOD_US); + i++; + /* we have to loop at least THREAD_MAX times */ + if (i > MAX_WAKE_ITERS + THREAD_MAX) { + error("max signaling iterations (%d) reached, giving up on pending waiters.\n", + 0, MAX_WAKE_ITERS + THREAD_MAX); + args->ret = RET_ERROR; + break; + } + } + + futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG); + + if (args->ret >= 0) + args->ret = task_count; + + info("Waker: exiting with %d\n", args->ret); + info("Waker: waiters_woken: %d\n", waiters_woken.val); + pthread_exit((void *)&args->ret); +} + +void *third_party_blocker(void *arg) +{ + struct thread_arg *args = (struct thread_arg *)arg; + int ret2 = 0; + + args->ret = futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG); + if (args->ret) + goto out; + args->ret = futex_wait(&wake_complete, wake_complete, NULL, + FUTEX_PRIVATE_FLAG); + ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + + out: + if (args->ret || ret2) { + error("third_party_blocker() futex error", 0); + args->ret = RET_ERROR; + } + + pthread_exit((void *)&args->ret); +} + +int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns) +{ + void 
*(*wakerfn)(void *) = signal_wakerfn; + struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER; + struct thread_arg waker_arg = THREAD_ARG_INITIALIZER; + pthread_t waiter[THREAD_MAX], waker, blocker; + struct timespec ts, *tsp = NULL; + struct thread_arg args[THREAD_MAX]; + int *waiter_ret; + int i, ret = RET_PASS; + + if (timeout_ns) { + time_t secs; + + info("timeout_ns = %ld\n", timeout_ns); + ret = clock_gettime(CLOCK_MONOTONIC, &ts); + secs = (ts.tv_nsec + timeout_ns) / 1000000000; + ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000; + ts.tv_sec += secs; + info("ts.tv_sec = %ld\n", ts.tv_sec); + info("ts.tv_nsec = %ld\n", ts.tv_nsec); + tsp = &ts; + } + + if (broadcast) + wakerfn = broadcast_wakerfn; + + if (third_party_owner) { + if (create_rt_thread(&blocker, third_party_blocker, + (void *)&blocker_arg, SCHED_FIFO, 1)) { + error("Creating third party blocker thread failed\n", + errno); + ret = RET_ERROR; + goto out; + } + } + + atomic_set(&waiters_woken, 0); + for (i = 0; i < THREAD_MAX; i++) { + args[i].id = i; + args[i].timeout = tsp; + info("Starting thread %d\n", i); + if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i], + SCHED_FIFO, 1)) { + error("Creating waiting thread failed\n", errno); + ret = RET_ERROR; + goto out; + } + } + waker_arg.lock = lock; + if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg, + SCHED_FIFO, 1)) { + error("Creating waker thread failed\n", errno); + ret = RET_ERROR; + goto out; + } + + /* Wait for threads to finish */ + /* Store the first error or failure encountered in waiter_ret */ + waiter_ret = &args[0].ret; + for (i = 0; i < THREAD_MAX; i++) + pthread_join(waiter[i], + *waiter_ret ? 
NULL : (void **)&waiter_ret); + + if (third_party_owner) + pthread_join(blocker, NULL); + pthread_join(waker, NULL); + +out: + if (!ret) { + if (*waiter_ret) + ret = *waiter_ret; + else if (waker_arg.ret < 0) + ret = waker_arg.ret; + else if (blocker_arg.ret) + ret = blocker_arg.ret; + } + + return ret; +} + +int main(int argc, char *argv[]) +{ + int c, ret; + + while ((c = getopt(argc, argv, "bchlot:v:")) != -1) { + switch (c) { + case 'b': + broadcast = 1; + break; + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'l': + locked = 1; + break; + case 'o': + owner = 1; + locked = 0; + break; + case 't': + timeout_ns = atoi(optarg); + break; + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: Test requeue functionality\n", basename(argv[0])); + printf("\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n", + broadcast, locked, owner, timeout_ns); + + /* + * FIXME: unit_test is obsolete now that we parse options and the + * various style of runs are done by run.sh - simplify the code and move + * unit_test into main() + */ + ret = unit_test(broadcast, locked, owner, timeout_ns); + + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c new file mode 100644 index 000000000000..d5e4f2c4da2a --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c @@ -0,0 +1,135 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * DESCRIPTION + * 1. Block a thread using FUTEX_WAIT + * 2. Attempt to use FUTEX_CMP_REQUEUE_PI on the futex from 1. + * 3. The kernel must detect the mismatch and return -EINVAL. + * + * AUTHOR + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#include <errno.h> +#include <getopt.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include "futextest.h" +#include "logging.h" + +futex_t f1 = FUTEX_INITIALIZER; +futex_t f2 = FUTEX_INITIALIZER; +int child_ret = 0; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void *blocking_child(void *arg) +{ + child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG); + if (child_ret < 0) { + child_ret = -errno; + error("futex_wait\n", errno); + } + return (void *)&child_ret; +} + +int main(int argc, char *argv[]) +{ + int ret = RET_PASS; + pthread_t child; + int c; + + while ((c = getopt(argc, argv, "chv:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: Detect mismatched requeue_pi operations\n", + basename(argv[0])); + + if (pthread_create(&child, NULL, blocking_child, NULL)) { + error("pthread_create\n", errno); + ret = RET_ERROR; + goto out; + } + /* Allow the child to block in the kernel. */ + sleep(1); + + /* + * The kernel should detect the waiter did not setup the + * q->requeue_pi_key and return -EINVAL. If it does not, + * it likely gave the lock to the child, which is now hung + * in the kernel. 
+ */ + ret = futex_cmp_requeue_pi(&f1, f1, &f2, 1, 0, FUTEX_PRIVATE_FLAG); + if (ret < 0) { + if (errno == EINVAL) { + /* + * The kernel correctly detected the mismatched + * requeue_pi target and aborted. Wake the child with + * FUTEX_WAKE. + */ + ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG); + if (ret == 1) { + ret = RET_PASS; + } else if (ret < 0) { + error("futex_wake\n", errno); + ret = RET_ERROR; + } else { + error("futex_wake did not wake the child\n", 0); + ret = RET_ERROR; + } + } else { + error("futex_cmp_requeue_pi\n", errno); + ret = RET_ERROR; + } + } else if (ret > 0) { + fail("futex_cmp_requeue_pi failed to detect the mismatch\n"); + ret = RET_FAIL; + } else { + error("futex_cmp_requeue_pi found no waiters\n", 0); + ret = RET_ERROR; + } + + pthread_join(child, NULL); + + if (!ret) + ret = child_ret; + + out: + /* If the kernel crashes, we shouldn't return at all. */ + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c new file mode 100644 index 000000000000..7f0c756993af --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -0,0 +1,223 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2006-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * This test exercises the futex_wait_requeue_pi() signal handling both + * before and after the requeue. The first should be restarted by the + * kernel. The latter should return EWOULDBLOCK to the waiter. 
+ * + * AUTHORS + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2008-May-5: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#include <errno.h> +#include <getopt.h> +#include <limits.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "atomic.h" +#include "futextest.h" +#include "logging.h" + +#define DELAY_US 100 + +futex_t f1 = FUTEX_INITIALIZER; +futex_t f2 = FUTEX_INITIALIZER; +atomic_t requeued = ATOMIC_INITIALIZER; + +int waiter_ret = 0; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg, + int policy, int prio) +{ + struct sched_param schedp; + pthread_attr_t attr; + int ret; + + pthread_attr_init(&attr); + memset(&schedp, 0, sizeof(schedp)); + + ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED); + if (ret) { + error("pthread_attr_setinheritsched\n", ret); + return -1; + } + + ret = pthread_attr_setschedpolicy(&attr, policy); + if (ret) { + error("pthread_attr_setschedpolicy\n", ret); + return -1; + } + + schedp.sched_priority = prio; + ret = pthread_attr_setschedparam(&attr, &schedp); + if (ret) { + error("pthread_attr_setschedparam\n", ret); + return -1; + } + + ret = pthread_create(pth, &attr, func, arg); + if (ret) { + error("pthread_create\n", ret); + return -1; + } + return 0; +} + +void handle_signal(int signo) +{ + info("signal received %s requeue\n", + requeued.val ? 
"after" : "prior to"); +} + +void *waiterfn(void *arg) +{ + unsigned int old_val; + int res; + + waiter_ret = RET_PASS; + + info("Waiter running\n"); + info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + old_val = f1; + res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL, + FUTEX_PRIVATE_FLAG); + if (!requeued.val || errno != EWOULDBLOCK) { + fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n", + res, strerror(errno)); + info("w2:futex: %x\n", f2); + if (!res) + futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + waiter_ret = RET_FAIL; + } + + info("Waiter exiting with %d\n", waiter_ret); + pthread_exit(NULL); +} + + +int main(int argc, char *argv[]) +{ + unsigned int old_val; + struct sigaction sa; + pthread_t waiter; + int c, res, ret = RET_PASS; + + while ((c = getopt(argc, argv, "chv:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: Test signal handling during requeue_pi\n", + basename(argv[0])); + printf("\tArguments: <none>\n"); + + sa.sa_handler = handle_signal; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + if (sigaction(SIGUSR1, &sa, NULL)) { + error("sigaction\n", errno); + exit(1); + } + + info("m1:f2: %x\n", f2); + info("Creating waiter\n"); + res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1); + if (res) { + error("Creating waiting thread failed", res); + ret = RET_ERROR; + goto out; + } + + info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2); + info("m2:f2: %x\n", f2); + futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG); + info("m3:f2: %x\n", f2); + + while (1) { + /* + * signal the waiter before requeue, waiter should automatically + * restart futex_wait_requeue_pi() in the kernel. Wait for the + * waiter to block on f1 again. 
+ */ + info("Issuing SIGUSR1 to waiter\n"); + pthread_kill(waiter, SIGUSR1); + usleep(DELAY_US); + + info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n"); + old_val = f1; + res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0, + FUTEX_PRIVATE_FLAG); + /* + * If res is non-zero, we either requeued the waiter or hit an + * error, break out and handle it. If it is zero, then the + * signal may have hit before the the waiter was blocked on f1. + * Try again. + */ + if (res > 0) { + atomic_set(&requeued, 1); + break; + } else if (res > 0) { + error("FUTEX_CMP_REQUEUE_PI failed\n", errno); + ret = RET_ERROR; + break; + } + } + info("m4:f2: %x\n", f2); + + /* + * Signal the waiter after requeue, waiter should return from + * futex_wait_requeue_pi() with EWOULDBLOCK. Join the thread here so the + * futex_unlock_pi() can't happen before the signal wakeup is detected + * in the kernel. + */ + info("Issuing SIGUSR1 to waiter\n"); + pthread_kill(waiter, SIGUSR1); + info("Waiting for waiter to return\n"); + pthread_join(waiter, NULL); + + info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2); + futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG); + info("m5:f2: %x\n", f2); + + out: + if (ret == RET_PASS && waiter_ret) + ret = waiter_ret; + + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c new file mode 100644 index 000000000000..5f687f247454 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c @@ -0,0 +1,125 @@ +/****************************************************************************** + * + * Copyright FUJITSU LIMITED 2010 + * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the 
License, or + * (at your option) any later version. + * + * DESCRIPTION + * Internally, Futex has two handling mode, anon and file. The private file + * mapping is special. At first it behave as file, but after write anything + * it behave as anon. This test is intent to test such case. + * + * AUTHOR + * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + * HISTORY + * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + *****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <syscall.h> +#include <unistd.h> +#include <errno.h> +#include <linux/futex.h> +#include <pthread.h> +#include <libgen.h> +#include <signal.h> + +#include "logging.h" +#include "futextest.h" + +#define PAGE_SZ 4096 + +char pad[PAGE_SZ] = {1}; +futex_t val = 1; +char pad2[PAGE_SZ] = {1}; + +#define WAKE_WAIT_US 3000000 +struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0}; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void *thr_futex_wait(void *arg) +{ + int ret; + + info("futex wait\n"); + ret = futex_wait(&val, 1, &wait_timeout, 0); + if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) { + error("futex error.\n", errno); + print_result(RET_ERROR); + exit(RET_ERROR); + } + + if (ret && errno == ETIMEDOUT) + fail("waiter timedout\n"); + + info("futex_wait: ret = %d, errno = %d\n", ret, errno); + + return NULL; +} + +int main(int argc, char **argv) +{ + pthread_t thr; + int ret = RET_PASS; + int res; + int c; + + while ((c = getopt(argc, argv, "chv:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: 
Test the futex value of private file mappings in FUTEX_WAIT\n", + basename(argv[0])); + + ret = pthread_create(&thr, NULL, thr_futex_wait, NULL); + if (ret < 0) { + fprintf(stderr, "pthread_create error\n"); + ret = RET_ERROR; + goto out; + } + + info("wait a while\n"); + usleep(WAKE_WAIT_US); + val = 2; + res = futex_wake(&val, 1, 0); + info("futex_wake %d\n", res); + if (res != 1) { + fail("FUTEX_WAKE didn't find the waiting thread.\n"); + ret = RET_FAIL; + } + + info("join\n"); + pthread_join(thr, NULL); + + out: + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c new file mode 100644 index 000000000000..ab428ca894de --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c @@ -0,0 +1,86 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * Block on a futex and wait for timeout. 
+ * + * AUTHOR + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include "futextest.h" +#include "logging.h" + +static long timeout_ns = 100000; /* 100us default timeout */ + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -t N Timeout in nanoseconds (default: 100,000)\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +int main(int argc, char *argv[]) +{ + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; + int res, ret = RET_PASS; + int c; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 't': + timeout_ns = atoi(optarg); + break; + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + printf("\tArguments: timeout=%ldns\n", timeout_ns); + + /* initialize timeout */ + to.tv_sec = 0; + to.tv_nsec = timeout_ns; + + info("Calling futex_wait on f1: %u @ %p\n", f1, &f1); + res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != ETIMEDOUT) { + fail("futex_wait returned %d\n", ret < 0 ? 
errno : ret); + ret = RET_FAIL; + } + + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c new file mode 100644 index 000000000000..fe7aee96844b --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c @@ -0,0 +1,124 @@ +/****************************************************************************** + * + * Copyright FUJITSU LIMITED 2010 + * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * Wait on uninitialized heap. It shold be zero and FUTEX_WAIT should + * return immediately. This test is intent to test zero page handling in + * futex. 
+ * + * AUTHOR + * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + * HISTORY + * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> + * + *****************************************************************************/ + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <syscall.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> +#include <linux/futex.h> +#include <libgen.h> + +#include "logging.h" +#include "futextest.h" + +#define WAIT_US 5000000 + +static int child_blocked = 1; +static int child_ret; +void *buf; + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +void *wait_thread(void *arg) +{ + int res; + + child_ret = RET_PASS; + res = futex_wait(buf, 1, NULL, 0); + child_blocked = 0; + + if (res != 0 && errno != EWOULDBLOCK) { + error("futex failure\n", errno); + child_ret = RET_ERROR; + } + pthread_exit(NULL); +} + +int main(int argc, char **argv) +{ + int c, ret = RET_PASS; + long page_size; + pthread_t thr; + + while ((c = getopt(argc, argv, "chv:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + page_size = sysconf(_SC_PAGESIZE); + + buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (buf == (void *)-1) { + error("mmap\n", errno); + exit(1); + } + + printf("%s: Test the uninitialized futex value in FUTEX_WAIT\n", + basename(argv[0])); + + + ret = pthread_create(&thr, NULL, wait_thread, NULL); + if (ret) { + error("pthread_create\n", errno); + ret = RET_ERROR; + goto out; + } + + info("waiting %dus for child to return\n", WAIT_US); + 
usleep(WAIT_US); + + ret = child_ret; + if (child_blocked) { + fail("child blocked in kernel\n"); + ret = RET_FAIL; + } + + out: + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c new file mode 100644 index 000000000000..b6b027448825 --- /dev/null +++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c @@ -0,0 +1,79 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * Test if FUTEX_WAIT op returns -EWOULDBLOCK if the futex value differs + * from the expected one. + * + * AUTHOR + * Gowrishankar <gowrishankar.m@in.ibm.com> + * + * HISTORY + * 2009-Nov-14: Initial version by Gowrishankar <gowrishankar.m@in.ibm.com> + * + *****************************************************************************/ + +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include "futextest.h" +#include "logging.h" + +#define timeout_ns 100000 + +void usage(char *prog) +{ + printf("Usage: %s\n", prog); + printf(" -c Use color\n"); + printf(" -h Display this help message\n"); + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", + VQUIET, VCRITICAL, VINFO); +} + +int main(int argc, char *argv[]) +{ + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; + int res, ret = RET_PASS; + int c; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { + case 'c': + log_color(1); + break; + case 'h': + usage(basename(argv[0])); + exit(0); + case 
'v': + log_verbosity(atoi(optarg)); + break; + default: + usage(basename(argv[0])); + exit(1); + } + } + + printf("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); + + info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != EWOULDBLOCK) { + fail("futex_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; + } + + print_result(ret); + return ret; +} diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh new file mode 100755 index 000000000000..e87dbe2a0b0d --- /dev/null +++ b/tools/testing/selftests/futex/functional/run.sh @@ -0,0 +1,79 @@ +#!/bin/sh + +############################################################################### +# +# Copyright © International Business Machines Corp., 2009 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# DESCRIPTION +# Run tests in the current directory. +# +# AUTHOR +# Darren Hart <dvhart@linux.intel.com> +# +# HISTORY +# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com> +# 2010-Jan-6: Add futex_wait_uninitialized_heap and futex_wait_private_mapped_file +# by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> +# +############################################################################### + +# Test for a color capable console +if [ -z "$USE_COLOR" ]; then + tput setf 7 + if [ $? 
-eq 0 ]; then + USE_COLOR=1 + tput sgr0 + fi +fi +if [ "$USE_COLOR" -eq 1 ]; then + COLOR="-c" +fi + + +echo +# requeue pi testing +# without timeouts +./futex_requeue_pi $COLOR +./futex_requeue_pi $COLOR -b +./futex_requeue_pi $COLOR -b -l +./futex_requeue_pi $COLOR -b -o +./futex_requeue_pi $COLOR -l +./futex_requeue_pi $COLOR -o +# with timeouts +./futex_requeue_pi $COLOR -b -l -t 5000 +./futex_requeue_pi $COLOR -l -t 5000 +./futex_requeue_pi $COLOR -b -l -t 500000 +./futex_requeue_pi $COLOR -l -t 500000 +./futex_requeue_pi $COLOR -b -t 5000 +./futex_requeue_pi $COLOR -t 5000 +./futex_requeue_pi $COLOR -b -t 500000 +./futex_requeue_pi $COLOR -t 500000 +./futex_requeue_pi $COLOR -b -o -t 5000 +./futex_requeue_pi $COLOR -l -t 5000 +./futex_requeue_pi $COLOR -b -o -t 500000 +./futex_requeue_pi $COLOR -l -t 500000 +# with long timeout +./futex_requeue_pi $COLOR -b -l -t 2000000000 +./futex_requeue_pi $COLOR -l -t 2000000000 + + +echo +./futex_requeue_pi_mismatched_ops $COLOR + +echo +./futex_requeue_pi_signal_restart $COLOR + +echo +./futex_wait_timeout $COLOR + +echo +./futex_wait_wouldblock $COLOR + +echo +./futex_wait_uninitialized_heap $COLOR +./futex_wait_private_mapped_file $COLOR diff --git a/tools/testing/selftests/futex/include/atomic.h b/tools/testing/selftests/futex/include/atomic.h new file mode 100644 index 000000000000..f861da3e31ab --- /dev/null +++ b/tools/testing/selftests/futex/include/atomic.h @@ -0,0 +1,83 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * DESCRIPTION + * GCC atomic builtin wrappers + * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html + * + * AUTHOR + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2009-Nov-17: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#ifndef _ATOMIC_H +#define _ATOMIC_H + +typedef struct { + volatile int val; +} atomic_t; + +#define ATOMIC_INITIALIZER { 0 } + +/** + * atomic_cmpxchg() - Atomic compare and exchange + * @uaddr: The address of the futex to be modified + * @oldval: The expected value of the futex + * @newval: The new value to try and assign the futex + * + * Return the old value of addr->val. + */ +static inline int +atomic_cmpxchg(atomic_t *addr, int oldval, int newval) +{ + return __sync_val_compare_and_swap(&addr->val, oldval, newval); +} + +/** + * atomic_inc() - Atomic incrememnt + * @addr: Address of the variable to increment + * + * Return the new value of addr->val. + */ +static inline int +atomic_inc(atomic_t *addr) +{ + return __sync_add_and_fetch(&addr->val, 1); +} + +/** + * atomic_dec() - Atomic decrement + * @addr: Address of the variable to decrement + * + * Return the new value of addr-val. + */ +static inline int +atomic_dec(atomic_t *addr) +{ + return __sync_sub_and_fetch(&addr->val, 1); +} + +/** + * atomic_set() - Atomic set + * @addr: Address of the variable to set + * @newval: New value for the atomic_t + * + * Return the new value of addr->val. 
+ */ +static inline int +atomic_set(atomic_t *addr, int newval) +{ + addr->val = newval; + return newval; +} + +#endif diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h new file mode 100644 index 000000000000..b98c3aba7102 --- /dev/null +++ b/tools/testing/selftests/futex/include/futextest.h @@ -0,0 +1,266 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * Glibc independent futex library for testing kernel functionality. + * + * AUTHOR + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#ifndef _FUTEXTEST_H +#define _FUTEXTEST_H + +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <linux/futex.h> + +typedef volatile u_int32_t futex_t; +#define FUTEX_INITIALIZER 0 + +/* Define the newer op codes if the system header file is not up to date. 
*/ +#ifndef FUTEX_WAIT_BITSET +#define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +#define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_WAIT_REQUEUE_PI +#define FUTEX_WAIT_REQUEUE_PI 11 +#endif +#ifndef FUTEX_CMP_REQUEUE_PI +#define FUTEX_CMP_REQUEUE_PI 12 +#endif +#ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_REQUEUE_PI_PRIVATE +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#endif + +/** + * futex() - SYS_futex syscall wrapper + * @uaddr: address of first futex + * @op: futex op code + * @val: typically expected value of uaddr, but varies by op + * @timeout: typically an absolute struct timespec (except where noted + * otherwise). Overloaded by some ops + * @uaddr2: address of second futex for some ops\ + * @val3: varies by op + * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG + * + * futex() is used by all the following futex op wrappers. It can also be + * used for misuse and abuse testing. Generally, the specific op wrappers + * should be used instead. It is a macro instead of an static inline function as + * some of the types over overloaded (timeout is used for nr_requeue for + * example). + * + * These argument descriptions are the defaults for all + * like-named arguments in the following wrappers except where noted below. 
+ */ +#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ + syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) + +/** + * futex_wait() - block on uaddr with optional timeout + * @timeout: relative timeout + */ +static inline int +futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags) +{ + return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); +} + +/** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake: wake up to this many tasks + */ +static inline int +futex_wake(futex_t *uaddr, int nr_wake, int opflags) +{ + return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); +} + +/** + * futex_wait_bitset() - block on uaddr with bitset + * @bitset: bitset to be used with futex_wake_bitset + */ +static inline int +futex_wait_bitset(futex_t *uaddr, futex_t val, struct timespec *timeout, + u_int32_t bitset, int opflags) +{ + return futex(uaddr, FUTEX_WAIT_BITSET, val, timeout, NULL, bitset, + opflags); +} + +/** + * futex_wake_bitset() - wake one or more tasks blocked on uaddr with bitset + * @bitset: bitset to compare with that used in futex_wait_bitset + */ +static inline int +futex_wake_bitset(futex_t *uaddr, int nr_wake, u_int32_t bitset, int opflags) +{ + return futex(uaddr, FUTEX_WAKE_BITSET, nr_wake, NULL, NULL, bitset, + opflags); +} + +/** + * futex_lock_pi() - block on uaddr as a PI mutex + * @detect: whether (1) or not (0) to perform deadlock detection + */ +static inline int +futex_lock_pi(futex_t *uaddr, struct timespec *timeout, int detect, + int opflags) +{ + return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags); +} + +/** + * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter + */ +static inline int +futex_unlock_pi(futex_t *uaddr, int opflags) +{ + return futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags); +} + +/** + * futex_wake_op() - FIXME: COME UP WITH A GOOD ONE LINE DESCRIPTION + */ +static inline int 
+futex_wake_op(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_wake2, + int wake_op, int opflags) +{ + return futex(uaddr, FUTEX_WAKE_OP, nr_wake, nr_wake2, uaddr2, wake_op, + opflags); +} + +/** + * futex_requeue() - requeue without expected value comparison, deprecated + * @nr_wake: wake up to this many tasks + * @nr_requeue: requeue up to this many tasks + * + * Due to its inherently racy implementation, futex_requeue() is deprecated in + * favor of futex_cmp_requeue(). + */ +static inline int +futex_requeue(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_requeue, + int opflags) +{ + return futex(uaddr, FUTEX_REQUEUE, nr_wake, nr_requeue, uaddr2, 0, + opflags); +} + +/** + * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 + * @nr_wake: wake up to this many tasks + * @nr_requeue: requeue up to this many tasks + */ +static inline int +futex_cmp_requeue(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake, + int nr_requeue, int opflags) +{ + return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, + val, opflags); +} + +/** + * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2 + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * + * This is the first half of the requeue_pi mechanism. It shall always be + * paired with futex_cmp_requeue_pi(). 
+ */ +static inline int +futex_wait_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, + struct timespec *timeout, int opflags) +{ + return futex(uaddr, FUTEX_WAIT_REQUEUE_PI, val, timeout, uaddr2, 0, + opflags); +} + +/** + * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2 (PI aware) + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * @nr_wake: wake up to this many tasks + * @nr_requeue: requeue up to this many tasks + */ +static inline int +futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake, + int nr_requeue, int opflags) +{ + return futex(uaddr, FUTEX_CMP_REQUEUE_PI, nr_wake, nr_requeue, uaddr2, + val, opflags); +} + +/** + * futex_cmpxchg() - atomic compare and exchange + * @uaddr: The address of the futex to be modified + * @oldval: The expected value of the futex + * @newval: The new value to try and assign the futex + * + * Implement cmpxchg using gcc atomic builtins. + * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html + * + * Return the old futex value. + */ +static inline u_int32_t +futex_cmpxchg(futex_t *uaddr, u_int32_t oldval, u_int32_t newval) +{ + return __sync_val_compare_and_swap(uaddr, oldval, newval); +} + +/** + * futex_dec() - atomic decrement of the futex value + * @uaddr: The address of the futex to be modified + * + * Return the new futex value. + */ +static inline u_int32_t +futex_dec(futex_t *uaddr) +{ + return __sync_sub_and_fetch(uaddr, 1); +} + +/** + * futex_inc() - atomic increment of the futex value + * @uaddr: the address of the futex to be modified + * + * Return the new futex value. + */ +static inline u_int32_t +futex_inc(futex_t *uaddr) +{ + return __sync_add_and_fetch(uaddr, 1); +} + +/** + * futex_set() - atomic decrement of the futex value + * @uaddr: the address of the futex to be modified + * @newval: New value for the atomic_t + * + * Return the new futex value. 
+ */ +static inline u_int32_t +futex_set(futex_t *uaddr, u_int32_t newval) +{ + *uaddr = newval; + return newval; +} + +#endif diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h new file mode 100644 index 000000000000..014aa01197af --- /dev/null +++ b/tools/testing/selftests/futex/include/logging.h @@ -0,0 +1,153 @@ +/****************************************************************************** + * + * Copyright © International Business Machines Corp., 2009 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * DESCRIPTION + * Glibc independent futex library for testing kernel functionality. + * + * AUTHOR + * Darren Hart <dvhart@linux.intel.com> + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com> + * + *****************************************************************************/ + +#ifndef _LOGGING_H +#define _LOGGING_H + +#include <string.h> +#include <unistd.h> +#include <linux/futex.h> +#include "kselftest.h" + +/* + * Define PASS, ERROR, and FAIL strings with and without color escape + * sequences, default to no color. 
+ */ +#define ESC 0x1B, '[' +#define BRIGHT '1' +#define GREEN '3', '2' +#define YELLOW '3', '3' +#define RED '3', '1' +#define ESCEND 'm' +#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND +#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND +#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND +#define RESET_COLOR ESC, '0', 'm' +static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S', + RESET_COLOR, 0}; +static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R', + RESET_COLOR, 0}; +static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L', + RESET_COLOR, 0}; +static const char INFO_NORMAL[] = " INFO"; +static const char PASS_NORMAL[] = " PASS"; +static const char ERROR_NORMAL[] = "ERROR"; +static const char FAIL_NORMAL[] = " FAIL"; +const char *INFO = INFO_NORMAL; +const char *PASS = PASS_NORMAL; +const char *ERROR = ERROR_NORMAL; +const char *FAIL = FAIL_NORMAL; + +/* Verbosity setting for INFO messages */ +#define VQUIET 0 +#define VCRITICAL 1 +#define VINFO 2 +#define VMAX VINFO +int _verbose = VCRITICAL; + +/* Functional test return codes */ +#define RET_PASS 0 +#define RET_ERROR -1 +#define RET_FAIL -2 + +/** + * log_color() - Use colored output for PASS, ERROR, and FAIL strings + * @use_color: use color (1) or not (0) + */ +void log_color(int use_color) +{ + if (use_color) { + PASS = PASS_COLOR; + ERROR = ERROR_COLOR; + FAIL = FAIL_COLOR; + } else { + PASS = PASS_NORMAL; + ERROR = ERROR_NORMAL; + FAIL = FAIL_NORMAL; + } +} + +/** + * log_verbosity() - Set verbosity of test output + * @verbose: Enable (1) verbose output or not (0) + * + * Currently setting verbose=1 will enable INFO messages and 0 will disable + * them. FAIL and ERROR messages are always displayed. 
+ */ +void log_verbosity(int level) +{ + if (level > VMAX) + level = VMAX; + else if (level < 0) + level = 0; + _verbose = level; +} + +/** + * print_result() - Print standard PASS | ERROR | FAIL results + * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL + * + * print_result() is primarily intended for functional tests. + */ +void print_result(int ret) +{ + const char *result = "Unknown return code"; + + switch (ret) { + case RET_PASS: + ksft_inc_pass_cnt(); + result = PASS; + break; + case RET_ERROR: + result = ERROR; + break; + case RET_FAIL: + ksft_inc_fail_cnt(); + result = FAIL; + break; + } + printf("Result: %s\n", result); +} + +/* log level macros */ +#define info(message, vargs...) \ +do { \ + if (_verbose >= VINFO) \ + fprintf(stderr, "\t%s: "message, INFO, ##vargs); \ +} while (0) + +#define error(message, err, args...) \ +do { \ + if (_verbose >= VCRITICAL) {\ + if (err) \ + fprintf(stderr, "\t%s: %s: "message, \ + ERROR, strerror(err), ##args); \ + else \ + fprintf(stderr, "\t%s: "message, ERROR, ##args); \ + } \ +} while (0) + +#define fail(message, args...) \ +do { \ + if (_verbose >= VCRITICAL) \ + fprintf(stderr, "\t%s: "message, FAIL, ##args); \ +} while (0) + +#endif diff --git a/tools/testing/selftests/futex/run.sh b/tools/testing/selftests/futex/run.sh new file mode 100755 index 000000000000..4126312ad64e --- /dev/null +++ b/tools/testing/selftests/futex/run.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +############################################################################### +# +# Copyright © International Business Machines Corp., 2009 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# DESCRIPTION +# Run all tests under the functional, performance, and stress directories. +# Format and summarize the results. 
+# +# AUTHOR +# Darren Hart <dvhart@linux.intel.com> +# +# HISTORY +# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com> +# +############################################################################### + +# Test for a color capable shell and pass the result to the subdir scripts +USE_COLOR=0 +tput setf 7 +if [ $? -eq 0 ]; then + USE_COLOR=1 + tput sgr0 +fi +export USE_COLOR + +(cd functional; ./run.sh) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 572c8888167a..ef1c80d67ac7 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -13,6 +13,13 @@ #include <stdlib.h> #include <unistd.h> +/* define kselftest exit codes */ +#define KSFT_PASS 0 +#define KSFT_FAIL 1 +#define KSFT_XFAIL 2 +#define KSFT_XPASS 3 +#define KSFT_SKIP 4 + /* counters */ struct ksft_count { unsigned int ksft_pass; @@ -40,23 +47,23 @@ static inline void ksft_print_cnts(void) static inline int ksft_exit_pass(void) { - exit(0); + exit(KSFT_PASS); } static inline int ksft_exit_fail(void) { - exit(1); + exit(KSFT_FAIL); } static inline int ksft_exit_xfail(void) { - exit(2); + exit(KSFT_XFAIL); } static inline int ksft_exit_xpass(void) { - exit(3); + exit(KSFT_XPASS); } static inline int ksft_exit_skip(void) { - exit(4); + exit(KSFT_SKIP); } #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 2194155ae62a..ee412bab7ed4 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -13,6 +13,9 @@ run_tests: all define INSTALL_RULE mkdir -p $(INSTALL_PATH) + @for TEST_DIR in $(TEST_DIRS); do\ + cp -r $$TEST_DIR $(INSTALL_PATH); \ + done; install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) endef diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile index 95580a97326e..5e35c9c50b72 100644 --- a/tools/testing/selftests/mount/Makefile +++ 
b/tools/testing/selftests/mount/Makefile @@ -9,7 +9,12 @@ unprivileged-remount-test: unprivileged-remount-test.c include ../lib.mk TEST_PROGS := unprivileged-remount-test -override RUN_TESTS := if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi +override RUN_TESTS := if [ -f /proc/self/uid_map ] ; \ + then \ + ./unprivileged-remount-test ; \ + else \ + echo "WARN: No /proc/self/uid_map exist, test skipped." ; \ + fi override EMIT_TESTS := echo "$(RUN_TESTS)" clean: diff --git a/tools/testing/selftests/seccomp/.gitignore b/tools/testing/selftests/seccomp/.gitignore new file mode 100644 index 000000000000..346d83ca8069 --- /dev/null +++ b/tools/testing/selftests/seccomp/.gitignore @@ -0,0 +1 @@ +seccomp_bpf diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile new file mode 100644 index 000000000000..8401e87e34e1 --- /dev/null +++ b/tools/testing/selftests/seccomp/Makefile @@ -0,0 +1,10 @@ +TEST_PROGS := seccomp_bpf +CFLAGS += -Wl,-no-as-needed -Wall +LDFLAGS += -lpthread + +all: $(TEST_PROGS) + +include ../lib.mk + +clean: + $(RM) $(TEST_PROGS) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c new file mode 100644 index 000000000000..c5abe7fd7590 --- /dev/null +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -0,0 +1,2109 @@ +/* + * Copyright (c) 2012 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by the GPLv2 license. + * + * Test code for seccomp bpf. 
+ */ + +#include <asm/siginfo.h> +#define __have_siginfo_t 1 +#define __have_sigval_t 1 +#define __have_sigevent_t 1 + +#include <errno.h> +#include <linux/filter.h> +#include <sys/prctl.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <linux/prctl.h> +#include <linux/ptrace.h> +#include <linux/seccomp.h> +#include <poll.h> +#include <pthread.h> +#include <semaphore.h> +#include <signal.h> +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <linux/elf.h> +#include <sys/uio.h> + +#define _GNU_SOURCE +#include <unistd.h> +#include <sys/syscall.h> + +#include "test_harness.h" + +#ifndef PR_SET_PTRACER +# define PR_SET_PTRACER 0x59616d61 +#endif + +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 +#endif + +#ifndef PR_SECCOMP_EXT +#define PR_SECCOMP_EXT 43 +#endif + +#ifndef SECCOMP_EXT_ACT +#define SECCOMP_EXT_ACT 1 +#endif + +#ifndef SECCOMP_EXT_ACT_TSYNC +#define SECCOMP_EXT_ACT_TSYNC 1 +#endif + +#ifndef SECCOMP_MODE_STRICT +#define SECCOMP_MODE_STRICT 1 +#endif + +#ifndef SECCOMP_MODE_FILTER +#define SECCOMP_MODE_FILTER 2 +#endif + +#ifndef SECCOMP_RET_KILL +#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ + +/* Masks for the return value sections. 
*/ +#define SECCOMP_RET_ACTION 0x7fff0000U +#define SECCOMP_RET_DATA 0x0000ffffU + +struct seccomp_data { + int nr; + __u32 arch; + __u64 instruction_pointer; + __u64 args[6]; +}; +#endif + +#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n])) + +#define SIBLING_EXIT_UNKILLED 0xbadbeef +#define SIBLING_EXIT_FAILURE 0xbadface +#define SIBLING_EXIT_NEWPRIVS 0xbadfeed + +TEST(mode_strict_support) +{ + long ret; + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SECCOMP"); + } + syscall(__NR_exit, 1); +} + +TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL) +{ + long ret; + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SECCOMP"); + } + syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER, + NULL, NULL, NULL); + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +/* Note! This doesn't test no new privs behavior */ +TEST(no_new_privs_support) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } +} + +/* Tests kernel support by checking for a copy_from_user() fault on * NULL. 
*/ +TEST(mode_filter_support) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!"); + } +} + +TEST(mode_filter_without_nnp) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0); + ASSERT_LE(0, ret) { + TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS"); + } + errno = 0; + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + /* Succeeds with CAP_SYS_ADMIN, fails without */ + /* TODO(wad) check caps not euid */ + if (geteuid()) { + EXPECT_EQ(-1, ret); + EXPECT_EQ(EACCES, errno); + } else { + EXPECT_EQ(0, ret); + } +} + +#define MAX_INSNS_PER_PATH 32768 + +TEST(filter_size_limits) +{ + int i; + int count = BPF_MAXINSNS + 1; + struct sock_filter allow[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter *filter; + struct sock_fprog prog = { }; + long ret; + + filter = calloc(count, sizeof(*filter)); + ASSERT_NE(NULL, filter); + + for (i = 0; i < count; i++) + filter[i] = allow[0]; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + prog.filter = filter; + prog.len = count; + + /* Too many filter instructions in a single filter. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_NE(0, ret) { + TH_LOG("Installing %d insn filter was allowed", prog.len); + } + + /* One less is okay, though. 
*/ + prog.len -= 1; + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Installing %d insn filter wasn't allowed", prog.len); + } +} + +TEST(filter_chain_limits) +{ + int i; + int count = BPF_MAXINSNS; + struct sock_filter allow[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter *filter; + struct sock_fprog prog = { }; + long ret; + + filter = calloc(count, sizeof(*filter)); + ASSERT_NE(NULL, filter); + + for (i = 0; i < count; i++) + filter[i] = allow[0]; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + prog.filter = filter; + prog.len = 1; + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + prog.len = count; + + /* Too many total filter instructions. */ + for (i = 0; i < MAX_INSNS_PER_PATH; i++) { + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + if (ret != 0) + break; + } + ASSERT_NE(0, ret) { + TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)", + i, count, i * (count + 4)); + } +} + +TEST(mode_filter_cannot_move_to_strict) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); +} + + +TEST(mode_filter_get_seccomp) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + EXPECT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, 
SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0); + EXPECT_EQ(2, ret); +} + + +TEST(ALLOW_all) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); +} + +TEST(empty_prog) +{ + struct sock_filter filter[] = { + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); +} + +TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, 0x10000000U), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + EXPECT_EQ(0, syscall(__NR_getpid)) { + TH_LOG("getpid() shouldn't ever return"); + } +} + +/* return code >= 0x80000000 is unused. 
*/ +TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, 0x90000000U), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + EXPECT_EQ(0, syscall(__NR_getpid)) { + TH_LOG("getpid() shouldn't ever return"); + } +} + +TEST_SIGNAL(KILL_all, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); +} + +TEST_SIGNAL(KILL_one, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_SIGNAL(KILL_one_arg_one, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + /* Only both with lower 32-bit for now. 
*/ + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + pid_t pid = getpid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(pid, syscall(__NR_getpid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid, 0x0C0FFEE)); +} + +TEST_SIGNAL(KILL_one_arg_six, SIGSYS) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + /* Only both with lower 32-bit for now. */ + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + pid_t pid = getpid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(pid, syscall(__NR_getpid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid, 1, 2, 3, 4, 5, 0x0C0FFEE)); +} + +/* TODO(wad) add 64-bit versus 32-bit arg tests. 
*/ +TEST(arg_out_of_range) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); +} + +TEST(ERRNO_valid) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(-1, read(0, NULL, 0)); + EXPECT_EQ(E2BIG, errno); +} + +TEST(ERRNO_zero) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* "errno" of 0 is ok. 
*/ + EXPECT_EQ(0, read(0, NULL, 0)); +} + +TEST(ERRNO_capped) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(-1, read(0, NULL, 0)); + EXPECT_EQ(4095, errno); +} + +FIXTURE_DATA(TRAP) { + struct sock_fprog prog; +}; + +FIXTURE_SETUP(TRAP) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + memset(&self->prog, 0, sizeof(self->prog)); + self->prog.filter = malloc(sizeof(filter)); + ASSERT_NE(NULL, self->prog.filter); + memcpy(self->prog.filter, filter, sizeof(filter)); + self->prog.len = (unsigned short)ARRAY_SIZE(filter); +} + +FIXTURE_TEARDOWN(TRAP) +{ + if (self->prog.filter) + free(self->prog.filter); +} + +TEST_F_SIGNAL(TRAP, dfl, SIGSYS) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog); + ASSERT_EQ(0, ret); + syscall(__NR_getpid); +} + +/* Ensure that SIGSYS overrides SIG_IGN */ +TEST_F_SIGNAL(TRAP, ign, SIGSYS) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + signal(SIGSYS, SIG_IGN); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog); + ASSERT_EQ(0, ret); + syscall(__NR_getpid); +} + +static struct siginfo TRAP_info; +static volatile int 
TRAP_nr; +static void TRAP_action(int nr, siginfo_t *info, void *void_context) +{ + memcpy(&TRAP_info, info, sizeof(TRAP_info)); + TRAP_nr = nr; +} + +TEST_F(TRAP, handler) +{ + int ret, test; + struct sigaction act; + sigset_t mask; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + sigaddset(&mask, SIGSYS); + + act.sa_sigaction = &TRAP_action; + act.sa_flags = SA_SIGINFO; + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret) { + TH_LOG("sigaction failed"); + } + ret = sigprocmask(SIG_UNBLOCK, &mask, NULL); + ASSERT_EQ(0, ret) { + TH_LOG("sigprocmask failed"); + } + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog); + ASSERT_EQ(0, ret); + TRAP_nr = 0; + memset(&TRAP_info, 0, sizeof(TRAP_info)); + /* Expect the registers to be rolled back. (nr = error) may vary + * based on arch. */ + ret = syscall(__NR_getpid); + /* Silence gcc warning about volatile. */ + test = TRAP_nr; + EXPECT_EQ(SIGSYS, test); + struct local_sigsys { + void *_call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } *sigsys = (struct local_sigsys *) +#ifdef si_syscall + &(TRAP_info.si_call_addr); +#else + &TRAP_info.si_pid; +#endif + EXPECT_EQ(__NR_getpid, sigsys->_syscall); + /* Make sure arch is non-zero. 
*/ + EXPECT_NE(0, sigsys->_arch); + EXPECT_NE(0, (unsigned long)sigsys->_call_addr); +} + +FIXTURE_DATA(precedence) { + struct sock_fprog allow; + struct sock_fprog trace; + struct sock_fprog error; + struct sock_fprog trap; + struct sock_fprog kill; +}; + +FIXTURE_SETUP(precedence) +{ + struct sock_filter allow_insns[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter trace_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE), + }; + struct sock_filter error_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO), + }; + struct sock_filter trap_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP), + }; + struct sock_filter kill_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + }; + + memset(self, 0, sizeof(*self)); +#define FILTER_ALLOC(_x) \ + self->_x.filter = malloc(sizeof(_x##_insns)); \ + ASSERT_NE(NULL, self->_x.filter); \ + memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \ + self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns) + FILTER_ALLOC(allow); + FILTER_ALLOC(trace); + FILTER_ALLOC(error); + FILTER_ALLOC(trap); + FILTER_ALLOC(kill); +} + +FIXTURE_TEARDOWN(precedence) +{ +#define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter) + FILTER_FREE(allow); + FILTER_FREE(trace); + FILTER_FREE(error); + FILTER_FREE(trap); + 
FILTER_FREE(kill); +} + +TEST_F(precedence, allow_ok) +{ + pid_t parent, res = 0; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + res = syscall(__NR_getppid); + EXPECT_EQ(parent, res); +} + +TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS) +{ + pid_t parent, res = 0; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + res = syscall(__NR_getppid); + EXPECT_EQ(parent, res); + /* getpid() should never return. 
*/ + res = syscall(__NR_getpid); + EXPECT_EQ(0, res); +} + +TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. 
*/ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_F(precedence, errno_is_third) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_F(precedence, errno_is_third_in_any_order) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + /* Should work just fine. 
*/ + EXPECT_EQ(parent, syscall(__NR_getppid)); + EXPECT_EQ(0, syscall(__NR_getpid)); +} + +TEST_F(precedence, trace_is_fourth) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* No ptracer */ + EXPECT_EQ(-1, syscall(__NR_getpid)); +} + +TEST_F(precedence, trace_is_fourth_in_any_order) +{ + pid_t parent; + long ret; + + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* No ptracer */ + EXPECT_EQ(-1, syscall(__NR_getpid)); +} + +#ifndef PTRACE_O_TRACESECCOMP +#define PTRACE_O_TRACESECCOMP 0x00000080 +#endif + +/* Catch the Ubuntu 12.04 value error. */ +#if PTRACE_EVENT_SECCOMP != 7 +#undef PTRACE_EVENT_SECCOMP +#endif + +#ifndef PTRACE_EVENT_SECCOMP +#define PTRACE_EVENT_SECCOMP 7 +#endif + +#define IS_SECCOMP_EVENT(status) ((status >> 16) == PTRACE_EVENT_SECCOMP) +bool tracer_running; +void tracer_stop(int sig) +{ + tracer_running = false; +} + +typedef void tracer_func_t(struct __test_metadata *_metadata, + pid_t tracee, int status, void *args); + +void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee, + tracer_func_t tracer_func, void *args) +{ + int ret = -1; + struct sigaction action = { + .sa_handler = tracer_stop, + }; + + /* Allow external shutdown. 
*/ + tracer_running = true; + ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL)); + + errno = 0; + while (ret == -1 && errno != EINVAL) + ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0); + ASSERT_EQ(0, ret) { + kill(tracee, SIGKILL); + } + /* Wait for attach stop */ + wait(NULL); + + ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, PTRACE_O_TRACESECCOMP); + ASSERT_EQ(0, ret) { + TH_LOG("Failed to set PTRACE_O_TRACESECCOMP"); + kill(tracee, SIGKILL); + } + ptrace(PTRACE_CONT, tracee, NULL, 0); + + /* Unblock the tracee */ + ASSERT_EQ(1, write(fd, "A", 1)); + ASSERT_EQ(0, close(fd)); + + /* Run until we're shut down. Must assert to stop execution. */ + while (tracer_running) { + int status; + + if (wait(&status) != tracee) + continue; + if (WIFSIGNALED(status) || WIFEXITED(status)) + /* Child is dead. Time to go. */ + return; + + /* Make sure this is a seccomp event. */ + ASSERT_EQ(true, IS_SECCOMP_EVENT(status)); + + tracer_func(_metadata, tracee, status, args); + + ret = ptrace(PTRACE_CONT, tracee, NULL, NULL); + ASSERT_EQ(0, ret); + } + /* Directly report the status of our test harness results. */ + syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE); +} + +/* Common tracer setup/teardown functions. */ +void cont_handler(int num) +{ } +pid_t setup_trace_fixture(struct __test_metadata *_metadata, + tracer_func_t func, void *args) +{ + char sync; + int pipefd[2]; + pid_t tracer_pid; + pid_t tracee = getpid(); + + /* Setup a pipe for clean synchronization. 
*/ + ASSERT_EQ(0, pipe(pipefd)); + + /* Fork a child which we'll promote to tracer */ + tracer_pid = fork(); + ASSERT_LE(0, tracer_pid); + signal(SIGALRM, cont_handler); + if (tracer_pid == 0) { + close(pipefd[0]); + tracer(_metadata, pipefd[1], tracee, func, args); + syscall(__NR_exit, 0); + } + close(pipefd[1]); + prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0); + read(pipefd[0], &sync, 1); + close(pipefd[0]); + + return tracer_pid; +} +void teardown_trace_fixture(struct __test_metadata *_metadata, + pid_t tracer) +{ + if (tracer) { + int status; + /* + * Extract the exit code from the other process and + * adopt it for ourselves in case its asserts failed. + */ + ASSERT_EQ(0, kill(tracer, SIGUSR1)); + ASSERT_EQ(tracer, waitpid(tracer, &status, 0)); + if (WEXITSTATUS(status)) + _metadata->passed = 0; + } +} + +/* "poke" tracer arguments and function. */ +struct tracer_args_poke_t { + unsigned long poke_addr; +}; + +void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status, + void *args) +{ + int ret; + unsigned long msg; + struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args; + + ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); + EXPECT_EQ(0, ret); + /* If this fails, don't try to recover. */ + ASSERT_EQ(0x1001, msg) { + kill(tracee, SIGKILL); + } + /* + * Poke in the message. + * Registers are not touched to try to keep this relatively arch + * agnostic. 
+ */ + ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001); + EXPECT_EQ(0, ret); +} + +FIXTURE_DATA(TRACE_poke) { + struct sock_fprog prog; + pid_t tracer; + long poked; + struct tracer_args_poke_t tracer_args; +}; + +FIXTURE_SETUP(TRACE_poke) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + self->poked = 0; + memset(&self->prog, 0, sizeof(self->prog)); + self->prog.filter = malloc(sizeof(filter)); + ASSERT_NE(NULL, self->prog.filter); + memcpy(self->prog.filter, filter, sizeof(filter)); + self->prog.len = (unsigned short)ARRAY_SIZE(filter); + + /* Set up tracer args. */ + self->tracer_args.poke_addr = (unsigned long)&self->poked; + + /* Launch tracer. */ + self->tracer = setup_trace_fixture(_metadata, tracer_poke, + &self->tracer_args); +} + +FIXTURE_TEARDOWN(TRACE_poke) +{ + teardown_trace_fixture(_metadata, self->tracer); + if (self->prog.filter) + free(self->prog.filter); +} + +TEST_F(TRACE_poke, read_has_side_effects) +{ + ssize_t ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + EXPECT_EQ(0, self->poked); + ret = read(-1, NULL, 0); + EXPECT_EQ(-1, ret); + EXPECT_EQ(0x1001, self->poked); +} + +TEST_F(TRACE_poke, getpid_runs_normally) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + EXPECT_EQ(0, self->poked); + EXPECT_NE(0, syscall(__NR_getpid)); + EXPECT_EQ(0, self->poked); +} + +#if defined(__x86_64__) +# define ARCH_REGS struct user_regs_struct +# define SYSCALL_NUM orig_rax +# define SYSCALL_RET rax +#elif defined(__i386__) +# define ARCH_REGS struct user_regs_struct +# 
define SYSCALL_NUM orig_eax +# define SYSCALL_RET eax +#elif defined(__arm__) +# define ARCH_REGS struct pt_regs +# define SYSCALL_NUM ARM_r7 +# define SYSCALL_RET ARM_r0 +#elif defined(__aarch64__) +# define ARCH_REGS struct user_pt_regs +# define SYSCALL_NUM regs[8] +# define SYSCALL_RET regs[0] +#else +# error "Do not know how to find your architecture's registers and syscalls" +#endif + +/* Architecture-specific syscall fetching routine. */ +int get_syscall(struct __test_metadata *_metadata, pid_t tracee) +{ + struct iovec iov; + ARCH_REGS regs; + + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) { + TH_LOG("PTRACE_GETREGSET failed"); + return -1; + } + + return regs.SYSCALL_NUM; +} + +/* Architecture-specific syscall changing routine. */ +void change_syscall(struct __test_metadata *_metadata, + pid_t tracee, int syscall) +{ + struct iovec iov; + int ret; + ARCH_REGS regs; + + iov.iov_base = ®s; + iov.iov_len = sizeof(regs); + ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov); + EXPECT_EQ(0, ret); + +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) + { + regs.SYSCALL_NUM = syscall; + } + +#elif defined(__arm__) +# ifndef PTRACE_SET_SYSCALL +# define PTRACE_SET_SYSCALL 23 +# endif + { + ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall); + EXPECT_EQ(0, ret); + } + +#else + ASSERT_EQ(1, 0) { + TH_LOG("How is the syscall changed on this architecture?"); + } +#endif + + /* If syscall is skipped, change return value. */ + if (syscall == -1) + regs.SYSCALL_RET = 1; + + ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov); + EXPECT_EQ(0, ret); +} + +void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee, + int status, void *args) +{ + int ret; + unsigned long msg; + + /* Make sure we got the right message. */ + ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); + EXPECT_EQ(0, ret); + + switch (msg) { + case 0x1002: + /* change getpid to getppid. 
*/ + change_syscall(_metadata, tracee, __NR_getppid); + break; + case 0x1003: + /* skip gettid. */ + change_syscall(_metadata, tracee, -1); + break; + case 0x1004: + /* do nothing (allow getppid) */ + break; + default: + EXPECT_EQ(0, msg) { + TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg); + kill(tracee, SIGKILL); + } + } + +} + +FIXTURE_DATA(TRACE_syscall) { + struct sock_fprog prog; + pid_t tracer, mytid, mypid, parent; +}; + +FIXTURE_SETUP(TRACE_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + memset(&self->prog, 0, sizeof(self->prog)); + self->prog.filter = malloc(sizeof(filter)); + ASSERT_NE(NULL, self->prog.filter); + memcpy(self->prog.filter, filter, sizeof(filter)); + self->prog.len = (unsigned short)ARRAY_SIZE(filter); + + /* Prepare some testable syscall results. */ + self->mytid = syscall(__NR_gettid); + ASSERT_GT(self->mytid, 0); + ASSERT_NE(self->mytid, 1) { + TH_LOG("Running this test as init is not supported. :)"); + } + + self->mypid = getpid(); + ASSERT_GT(self->mypid, 0); + ASSERT_EQ(self->mytid, self->mypid); + + self->parent = getppid(); + ASSERT_GT(self->parent, 0); + ASSERT_NE(self->parent, self->mypid); + + /* Launch tracer. 
*/ + self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL); +} + +FIXTURE_TEARDOWN(TRACE_syscall) +{ + teardown_trace_fixture(_metadata, self->tracer); + if (self->prog.filter) + free(self->prog.filter); +} + +TEST_F(TRACE_syscall, syscall_allowed) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + /* getppid works as expected (no changes). */ + EXPECT_EQ(self->parent, syscall(__NR_getppid)); + EXPECT_NE(self->mypid, syscall(__NR_getppid)); +} + +TEST_F(TRACE_syscall, syscall_redirected) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + /* getpid has been redirected to getppid as expected. */ + EXPECT_EQ(self->parent, syscall(__NR_getpid)); + EXPECT_NE(self->mypid, syscall(__NR_getpid)); +} + +TEST_F(TRACE_syscall, syscall_dropped) +{ + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); + ASSERT_EQ(0, ret); + + /* gettid has been skipped and an altered return value stored. 
*/ + EXPECT_EQ(1, syscall(__NR_gettid)); + EXPECT_NE(self->mytid, syscall(__NR_gettid)); +} + +#ifndef __NR_seccomp +# if defined(__i386__) +# define __NR_seccomp 354 +# elif defined(__x86_64__) +# define __NR_seccomp 317 +# elif defined(__arm__) +# define __NR_seccomp 383 +# elif defined(__aarch64__) +# define __NR_seccomp 277 +# else +# warning "seccomp syscall number unknown for this architecture" +# define __NR_seccomp 0xffff +# endif +#endif + +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_FLAG_FILTER_TSYNC +#define SECCOMP_FLAG_FILTER_TSYNC 1 +#endif + +#ifndef seccomp +int seccomp(unsigned int op, unsigned int flags, struct sock_fprog *filter) +{ + errno = 0; + return syscall(__NR_seccomp, op, flags, filter); +} +#endif + +TEST(seccomp_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Reject insane operation. */ + ret = seccomp(-1, 0, &prog); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Did not reject crazy op value!"); + } + + /* Reject strict with flags or pointer. */ + ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Did not reject mode strict with flags!"); + } + ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Did not reject mode strict with uargs!"); + } + + /* Reject insane args for filter. 
*/ + ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Did not reject crazy filter flags!"); + } + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL); + EXPECT_EQ(EFAULT, errno) { + TH_LOG("Did not reject NULL filter!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog); + EXPECT_EQ(0, errno) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s", + strerror(errno)); + } +} + +TEST(seccomp_syscall_mode_lock) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog); + EXPECT_EQ(0, ret) { + TH_LOG("Could not install filter!"); + } + + /* Make sure neither entry point will switch to strict. */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Switched to mode strict!"); + } + + ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL); + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Switched to mode strict!"); + } +} + +TEST(TSYNC_first) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &prog); + EXPECT_EQ(0, ret) { + TH_LOG("Could not install initial filter with TSYNC!"); + } +} + +#define TSYNC_SIBLINGS 2 +struct tsync_sibling { + pthread_t tid; + pid_t system_tid; + sem_t *started; + pthread_cond_t *cond; + pthread_mutex_t *mutex; + int diverge; + int num_waits; + struct sock_fprog *prog; + struct 
__test_metadata *metadata; +}; + +FIXTURE_DATA(TSYNC) { + struct sock_fprog root_prog, apply_prog; + struct tsync_sibling sibling[TSYNC_SIBLINGS]; + sem_t started; + pthread_cond_t cond; + pthread_mutex_t mutex; + int sibling_count; +}; + +FIXTURE_SETUP(TSYNC) +{ + struct sock_filter root_filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter apply_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + memset(&self->root_prog, 0, sizeof(self->root_prog)); + memset(&self->apply_prog, 0, sizeof(self->apply_prog)); + memset(&self->sibling, 0, sizeof(self->sibling)); + self->root_prog.filter = malloc(sizeof(root_filter)); + ASSERT_NE(NULL, self->root_prog.filter); + memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter)); + self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter); + + self->apply_prog.filter = malloc(sizeof(apply_filter)); + ASSERT_NE(NULL, self->apply_prog.filter); + memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter)); + self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter); + + self->sibling_count = 0; + pthread_mutex_init(&self->mutex, NULL); + pthread_cond_init(&self->cond, NULL); + sem_init(&self->started, 0, 0); + self->sibling[0].tid = 0; + self->sibling[0].cond = &self->cond; + self->sibling[0].started = &self->started; + self->sibling[0].mutex = &self->mutex; + self->sibling[0].diverge = 0; + self->sibling[0].num_waits = 1; + self->sibling[0].prog = &self->root_prog; + self->sibling[0].metadata = _metadata; + self->sibling[1].tid = 0; + self->sibling[1].cond = &self->cond; + self->sibling[1].started = &self->started; + self->sibling[1].mutex = &self->mutex; + self->sibling[1].diverge = 0; + self->sibling[1].prog = &self->root_prog; + self->sibling[1].num_waits = 1; + self->sibling[1].metadata = 
_metadata; +} + +FIXTURE_TEARDOWN(TSYNC) +{ + int sib = 0; + + if (self->root_prog.filter) + free(self->root_prog.filter); + if (self->apply_prog.filter) + free(self->apply_prog.filter); + + for ( ; sib < self->sibling_count; ++sib) { + struct tsync_sibling *s = &self->sibling[sib]; + void *status; + + if (!s->tid) + continue; + if (pthread_kill(s->tid, 0)) { + pthread_cancel(s->tid); + pthread_join(s->tid, &status); + } + } + pthread_mutex_destroy(&self->mutex); + pthread_cond_destroy(&self->cond); + sem_destroy(&self->started); +} + +void *tsync_sibling(void *data) +{ + long ret = 0; + struct tsync_sibling *me = data; + + me->system_tid = syscall(__NR_gettid); + + pthread_mutex_lock(me->mutex); + if (me->diverge) { + /* Just re-apply the root prog to fork the tree */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, + me->prog, 0, 0); + } + sem_post(me->started); + /* Return outside of started so parent notices failures. */ + if (ret) { + pthread_mutex_unlock(me->mutex); + return (void *)SIBLING_EXIT_FAILURE; + } + do { + pthread_cond_wait(me->cond, me->mutex); + me->num_waits = me->num_waits - 1; + } while (me->num_waits); + pthread_mutex_unlock(me->mutex); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (!ret) + return (void *)SIBLING_EXIT_NEWPRIVS; + read(0, NULL, 0); + return (void *)SIBLING_EXIT_UNKILLED; +} + +void tsync_start_sibling(struct tsync_sibling *sibling) +{ + pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling); +} + +TEST_F(TSYNC, siblings_fail_prctl) +{ + long ret; + void *status; + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not 
support PR_SET_NO_NEW_PRIVS!"); + } + + /* Check prctl failure detection by requesting sib 0 diverge. */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog); + ASSERT_EQ(0, ret) { + TH_LOG("setting filter failed"); + } + + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + /* Signal the threads to clean up*/ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure diverging sibling failed to call prctl. */ + pthread_join(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status); + pthread_join(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); +} + +TEST_F(TSYNC, two_siblings_with_ancestor) +{ + long ret; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Could install filter on all threads!"); + } + /* Tell the siblings to test the policy */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + /* Ensure they are both killed and don't exit cleanly. 
*/ + pthread_join(self->sibling[0].tid, &status); + EXPECT_EQ(0x0, (long)status); + pthread_join(self->sibling[1].tid, &status); + EXPECT_EQ(0x0, (long)status); +} + +TEST_F(TSYNC, two_sibling_want_nnp) +{ + void *status; + + /* start siblings before any prctl() operations */ + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + /* Tell the siblings to test no policy */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both upset about lacking nnp. */ + pthread_join(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status); + pthread_join(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status); +} + +TEST_F(TSYNC, two_siblings_with_no_filter) +{ + long ret; + void *status; + + /* start siblings before any prctl() operations */ + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Could install filter on all threads!"); + } + + /* Tell the siblings to test the policy */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both killed and don't exit cleanly. 
*/ + pthread_join(self->sibling[0].tid, &status); + EXPECT_EQ(0x0, (long)status); + pthread_join(self->sibling[1].tid, &status); + EXPECT_EQ(0x0, (long)status); +} + +TEST_F(TSYNC, two_siblings_with_one_divergence) +{ + long ret; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(self->sibling[0].system_tid, ret) { + TH_LOG("Did not fail on diverged sibling."); + } + + /* Wake the threads */ + pthread_mutex_lock(&self->mutex); + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + + /* Ensure they are both unkilled. */ + pthread_join(self->sibling[0].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); + pthread_join(self->sibling[1].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); +} + +TEST_F(TSYNC, two_siblings_not_under_filter) +{ + long ret, sib; + void *status; + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* + * Sibling 0 will have its own seccomp policy + * and Sibling 1 will not be under seccomp at + * all. Sibling 1 will enter seccomp and 0 + * will cause failure. 
+ */ + self->sibling[0].diverge = 1; + tsync_start_sibling(&self->sibling[0]); + tsync_start_sibling(&self->sibling[1]); + + while (self->sibling_count < TSYNC_SIBLINGS) { + sem_wait(&self->started); + self->sibling_count++; + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!"); + } + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(ret, self->sibling[0].system_tid) { + TH_LOG("Did not fail on diverged sibling."); + } + sib = 1; + if (ret == self->sibling[0].system_tid) + sib = 0; + + pthread_mutex_lock(&self->mutex); + + /* Increment the other siblings num_waits so we can clean up + * the one we just saw. + */ + self->sibling[!sib].num_waits += 1; + + /* Signal the thread to clean up*/ + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + pthread_join(self->sibling[sib].tid, &status); + EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status); + /* Poll for actual task death. pthread_join doesn't guarantee it. */ + while (!kill(self->sibling[sib].system_tid, 0)) + sleep(0.1); + /* Switch to the remaining sibling */ + sib = !sib; + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(0, ret) { + TH_LOG("Expected the remaining sibling to sync"); + }; + + pthread_mutex_lock(&self->mutex); + + /* If remaining sibling didn't have a chance to wake up during + * the first broadcast, manually reduce the num_waits now. + */ + if (self->sibling[sib].num_waits > 1) + self->sibling[sib].num_waits = 1; + ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) { + TH_LOG("cond broadcast non-zero"); + } + pthread_mutex_unlock(&self->mutex); + pthread_join(self->sibling[sib].tid, &status); + EXPECT_EQ(0, (long)status); + /* Poll for actual task death. pthread_join doesn't guarantee it. 
*/ + while (!kill(self->sibling[sib].system_tid, 0)) + sleep(0.1); + + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FLAG_FILTER_TSYNC, + &self->apply_prog); + ASSERT_EQ(0, ret); /* just us chickens */ +} + +/* Make sure restarted syscalls are seen directly as "restart_syscall". */ +TEST(syscall_restart) +{ + long ret; + unsigned long msg; + pid_t child_pid; + int pipefd[2]; + int status; + siginfo_t info = { }; + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + +#ifdef __NR_sigreturn + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0), +#endif + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_poll, 4, 0), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0), + + /* Allow __NR_write for easy logging. */ + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100), /* poll */ + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200), /* restart */ + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, pipe(pipefd)); + + child_pid = fork(); + ASSERT_LE(0, child_pid); + if (child_pid == 0) { + /* Child uses EXPECT not ASSERT to deliver status correctly. */ + char buf = ' '; + struct pollfd fds = { + .fd = pipefd[0], + .events = POLLIN, + }; + + /* Attach parent as tracer and stop. 
*/ + EXPECT_EQ(0, ptrace(PTRACE_TRACEME)); + EXPECT_EQ(0, raise(SIGSTOP)); + + EXPECT_EQ(0, close(pipefd[1])); + + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Failed to install filter!"); + } + + EXPECT_EQ(1, read(pipefd[0], &buf, 1)) { + TH_LOG("Failed to read() sync from parent"); + } + EXPECT_EQ('.', buf) { + TH_LOG("Failed to get sync data from read()"); + } + + /* Start poll to be interrupted. */ + errno = 0; + EXPECT_EQ(1, poll(&fds, 1, -1)) { + TH_LOG("Call to poll() failed (errno %d)", errno); + } + + /* Read final sync from parent. */ + EXPECT_EQ(1, read(pipefd[0], &buf, 1)) { + TH_LOG("Failed final read() from parent"); + } + EXPECT_EQ('!', buf) { + TH_LOG("Failed to get final data from read()"); + } + + /* Directly report the status of our test harness results. */ + syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS + : EXIT_FAILURE); + } + EXPECT_EQ(0, close(pipefd[0])); + + /* Attach to child, setup options, and release. */ + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + ASSERT_EQ(true, WIFSTOPPED(status)); + ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL, + PTRACE_O_TRACESECCOMP)); + ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0)); + ASSERT_EQ(1, write(pipefd[1], ".", 1)); + + /* Wait for poll() to start. */ + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + ASSERT_EQ(true, WIFSTOPPED(status)); + ASSERT_EQ(SIGTRAP, WSTOPSIG(status)); + ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16)); + ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg)); + ASSERT_EQ(0x100, msg); + EXPECT_EQ(__NR_poll, get_syscall(_metadata, child_pid)); + + /* Might as well check siginfo for sanity while we're here. 
*/ + ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info)); + ASSERT_EQ(SIGTRAP, info.si_signo); + ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code); + EXPECT_EQ(0, info.si_errno); + EXPECT_EQ(getuid(), info.si_uid); + /* Verify signal delivery came from child (seccomp-triggered). */ + EXPECT_EQ(child_pid, info.si_pid); + + /* Interrupt poll with SIGSTOP (which we'll need to handle). */ + ASSERT_EQ(0, kill(child_pid, SIGSTOP)); + ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0)); + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + ASSERT_EQ(true, WIFSTOPPED(status)); + ASSERT_EQ(SIGSTOP, WSTOPSIG(status)); + /* Verify signal delivery came from parent now. */ + ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info)); + EXPECT_EQ(getpid(), info.si_pid); + + /* Restart poll with SIGCONT, which triggers restart_syscall. */ + ASSERT_EQ(0, kill(child_pid, SIGCONT)); + ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0)); + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + ASSERT_EQ(true, WIFSTOPPED(status)); + ASSERT_EQ(SIGCONT, WSTOPSIG(status)); + ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0)); + + /* Wait for restart_syscall() to start. */ + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + ASSERT_EQ(true, WIFSTOPPED(status)); + ASSERT_EQ(SIGTRAP, WSTOPSIG(status)); + ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16)); + ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg)); + ASSERT_EQ(0x200, msg); + ret = get_syscall(_metadata, child_pid); +#if defined(__arm__) + /* FIXME: ARM does not expose true syscall in registers. */ + EXPECT_EQ(__NR_poll, ret); +#else + EXPECT_EQ(__NR_restart_syscall, ret); +#endif + + /* Write again to end poll. 
*/ + ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0)); + ASSERT_EQ(1, write(pipefd[1], "!", 1)); + EXPECT_EQ(0, close(pipefd[1])); + + ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0)); + if (WIFSIGNALED(status) || WEXITSTATUS(status)) + _metadata->passed = 0; +} + +/* + * TODO: + * - add microbenchmarks + * - expand NNP testing + * - better arch-specific TRACE and TRAP handlers. + * - endianness checking when appropriate + * - 64-bit arg prodding + * - arch value testing (x86 modes especially) + * - ... + */ + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/seccomp/test_harness.h b/tools/testing/selftests/seccomp/test_harness.h new file mode 100644 index 000000000000..977a6afc4489 --- /dev/null +++ b/tools/testing/selftests/seccomp/test_harness.h @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2012 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by the GPLv2 license. + * + * test_harness.h: simple C unit test helper. + * + * Usage: + * #include "test_harness.h" + * TEST(standalone_test) { + * do_some_stuff; + * EXPECT_GT(10, stuff) { + * stuff_state_t state; + * enumerate_stuff_state(&state); + * TH_LOG("expectation failed with state: %s", state.msg); + * } + * more_stuff; + * ASSERT_NE(some_stuff, NULL) TH_LOG("how did it happen?!"); + * last_stuff; + * EXPECT_EQ(0, last_stuff); + * } + * + * FIXTURE(my_fixture) { + * mytype_t *data; + * int awesomeness_level; + * }; + * FIXTURE_SETUP(my_fixture) { + * self->data = mytype_new(); + * ASSERT_NE(NULL, self->data); + * } + * FIXTURE_TEARDOWN(my_fixture) { + * mytype_free(self->data); + * } + * TEST_F(my_fixture, data_is_good) { + * EXPECT_EQ(1, is_my_data_good(self->data)); + * } + * + * TEST_HARNESS_MAIN + * + * API inspired by code.google.com/p/googletest + */ +#ifndef TEST_HARNESS_H_ +#define TEST_HARNESS_H_ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +/* All 
exported functionality should be declared through this macro. */ +#define TEST_API(x) _##x + +/* + * Exported APIs + */ + +/* TEST(name) { implementation } + * Defines a test by name. + * Names must be unique and tests must not be run in parallel. The + * implementation containing block is a function and scoping should be treated + * as such. Returning early may be performed with a bare "return;" statement. + * + * EXPECT_* and ASSERT_* are valid in a TEST() { } context. + */ +#define TEST TEST_API(TEST) + +/* TEST_SIGNAL(name, signal) { implementation } + * Defines a test by name and the expected terminating signal. + * Names must be unique and tests must not be run in parallel. The + * implementation containing block is a function and scoping should be treated + * as such. Returning early may be performed with a bare "return;" statement. + * + * EXPECT_* and ASSERT_* are valid in a TEST() { } context. + */ +#define TEST_SIGNAL TEST_API(TEST_SIGNAL) + +/* FIXTURE(datatype name) { + * type property1; + * ... + * }; + * Defines the data provided to TEST_F()-defined tests as |self|. It should be + * populated and cleaned up using FIXTURE_SETUP and FIXTURE_TEARDOWN. + */ +#define FIXTURE TEST_API(FIXTURE) + +/* FIXTURE_DATA(datatype name) + * This call may be used when the type of the fixture data + * is needed. In general, this should not be needed unless + * the |self| is being passed to a helper directly. + */ +#define FIXTURE_DATA TEST_API(FIXTURE_DATA) + +/* FIXTURE_SETUP(fixture name) { implementation } + * Populates the required "setup" function for a fixture. An instance of the + * datatype defined with _FIXTURE_DATA will be exposed as |self| for the + * implementation. + * + * ASSERT_* are valid for use in this context and will preempt the execution + * of any dependent fixture tests. + * + * A bare "return;" statement may be used to return early. 
+ */ +#define FIXTURE_SETUP TEST_API(FIXTURE_SETUP) + +/* FIXTURE_TEARDOWN(fixture name) { implementation } + * Populates the required "teardown" function for a fixture. An instance of the + * datatype defined with _FIXTURE_DATA will be exposed as |self| for the + * implementation to clean up. + * + * A bare "return;" statement may be used to return early. + */ +#define FIXTURE_TEARDOWN TEST_API(FIXTURE_TEARDOWN) + +/* TEST_F(fixture, name) { implementation } + * Defines a test that depends on a fixture (e.g., is part of a test case). + * Very similar to TEST() except that |self| is the setup instance of fixture's + * datatype exposed for use by the implementation. + */ +#define TEST_F TEST_API(TEST_F) + +#define TEST_F_SIGNAL TEST_API(TEST_F_SIGNAL) + +/* Use once to append a main() to the test file. E.g., + * TEST_HARNESS_MAIN + */ +#define TEST_HARNESS_MAIN TEST_API(TEST_HARNESS_MAIN) + +/* + * Operators for use in TEST and TEST_F. + * ASSERT_* calls will stop test execution immediately. + * EXPECT_* calls will emit a failure warning, note it, and continue. 
+ */ + +/* ASSERT_EQ(expected, measured): expected == measured */ +#define ASSERT_EQ TEST_API(ASSERT_EQ) +/* ASSERT_NE(expected, measured): expected != measured */ +#define ASSERT_NE TEST_API(ASSERT_NE) +/* ASSERT_LT(expected, measured): expected < measured */ +#define ASSERT_LT TEST_API(ASSERT_LT) +/* ASSERT_LE(expected, measured): expected <= measured */ +#define ASSERT_LE TEST_API(ASSERT_LE) +/* ASSERT_GT(expected, measured): expected > measured */ +#define ASSERT_GT TEST_API(ASSERT_GT) +/* ASSERT_GE(expected, measured): expected >= measured */ +#define ASSERT_GE TEST_API(ASSERT_GE) +/* ASSERT_NULL(measured): NULL == measured */ +#define ASSERT_NULL TEST_API(ASSERT_NULL) +/* ASSERT_TRUE(measured): measured != 0 */ +#define ASSERT_TRUE TEST_API(ASSERT_TRUE) +/* ASSERT_FALSE(measured): measured == 0 */ +#define ASSERT_FALSE TEST_API(ASSERT_FALSE) +/* ASSERT_STREQ(expected, measured): !strcmp(expected, measured) */ +#define ASSERT_STREQ TEST_API(ASSERT_STREQ) +/* ASSERT_STRNE(expected, measured): strcmp(expected, measured) */ +#define ASSERT_STRNE TEST_API(ASSERT_STRNE) +/* EXPECT_EQ(expected, measured): expected == measured */ +#define EXPECT_EQ TEST_API(EXPECT_EQ) +/* EXPECT_NE(expected, measured): expected != measured */ +#define EXPECT_NE TEST_API(EXPECT_NE) +/* EXPECT_LT(expected, measured): expected < measured */ +#define EXPECT_LT TEST_API(EXPECT_LT) +/* EXPECT_LE(expected, measured): expected <= measured */ +#define EXPECT_LE TEST_API(EXPECT_LE) +/* EXPECT_GT(expected, measured): expected > measured */ +#define EXPECT_GT TEST_API(EXPECT_GT) +/* EXPECT_GE(expected, measured): expected >= measured */ +#define EXPECT_GE TEST_API(EXPECT_GE) +/* EXPECT_NULL(measured): NULL == measured */ +#define EXPECT_NULL TEST_API(EXPECT_NULL) +/* EXPECT_TRUE(measured): 0 != measured */ +#define EXPECT_TRUE TEST_API(EXPECT_TRUE) +/* EXPECT_FALSE(measured): 0 == measured */ +#define EXPECT_FALSE TEST_API(EXPECT_FALSE) +/* EXPECT_STREQ(expected, measured): !strcmp(expected, 
measured) */ +#define EXPECT_STREQ TEST_API(EXPECT_STREQ) +/* EXPECT_STRNE(expected, measured): strcmp(expected, measured) */ +#define EXPECT_STRNE TEST_API(EXPECT_STRNE) + +/* TH_LOG(format, ...) + * Optional debug logging function available for use in tests. + * Logging may be enabled or disabled by defining TH_LOG_ENABLED. + * E.g., #define TH_LOG_ENABLED 1 + * If no definition is provided, logging is enabled by default. + */ +#define TH_LOG TEST_API(TH_LOG) + +/* + * Internal implementation. + * + */ + +/* Utilities exposed to the test definitions */ +#ifndef TH_LOG_STREAM +# define TH_LOG_STREAM stderr +#endif + +#ifndef TH_LOG_ENABLED +# define TH_LOG_ENABLED 1 +#endif + +#define _TH_LOG(fmt, ...) do { \ + if (TH_LOG_ENABLED) \ + __TH_LOG(fmt, ##__VA_ARGS__); \ +} while (0) + +/* Unconditional logger for internal use. */ +#define __TH_LOG(fmt, ...) \ + fprintf(TH_LOG_STREAM, "%s:%d:%s:" fmt "\n", \ + __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__) + +/* Defines the test function and creates the registration stub. */ +#define _TEST(test_name) __TEST_IMPL(test_name, -1) + +#define _TEST_SIGNAL(test_name, signal) __TEST_IMPL(test_name, signal) + +#define __TEST_IMPL(test_name, _signal) \ + static void test_name(struct __test_metadata *_metadata); \ + static struct __test_metadata _##test_name##_object = \ + { name: "global." #test_name, \ + fn: &test_name, termsig: _signal }; \ + static void __attribute__((constructor)) _register_##test_name(void) \ + { \ + __register_test(&_##test_name##_object); \ + } \ + static void test_name( \ + struct __test_metadata __attribute__((unused)) *_metadata) + +/* Wraps the struct name so we have one less argument to pass around. */ +#define _FIXTURE_DATA(fixture_name) struct _test_data_##fixture_name + +/* Called once per fixture to setup the data and register. 
*/ +#define _FIXTURE(fixture_name) \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_data(void) \ + { \ + __fixture_count++; \ + } \ + _FIXTURE_DATA(fixture_name) + +/* Prepares the setup function for the fixture. |_metadata| is included + * so that ASSERT_* work as a convenience. + */ +#define _FIXTURE_SETUP(fixture_name) \ + void fixture_name##_setup( \ + struct __test_metadata __attribute__((unused)) *_metadata, \ + _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) +#define _FIXTURE_TEARDOWN(fixture_name) \ + void fixture_name##_teardown( \ + struct __test_metadata __attribute__((unused)) *_metadata, \ + _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + +/* Emits test registration and helpers for fixture-based test + * cases. + * TODO(wad) register fixtures on dedicated test lists. + */ +#define _TEST_F(fixture_name, test_name) \ + __TEST_F_IMPL(fixture_name, test_name, -1) + +#define _TEST_F_SIGNAL(fixture_name, test_name, signal) \ + __TEST_F_IMPL(fixture_name, test_name, signal) + +#define __TEST_F_IMPL(fixture_name, test_name, signal) \ + static void fixture_name##_##test_name( \ + struct __test_metadata *_metadata, \ + _FIXTURE_DATA(fixture_name) *self); \ + static inline void wrapper_##fixture_name##_##test_name( \ + struct __test_metadata *_metadata) \ + { \ + /* fixture data is alloced, setup, and torn down per call. */ \ + _FIXTURE_DATA(fixture_name) self; \ + memset(&self, 0, sizeof(_FIXTURE_DATA(fixture_name))); \ + fixture_name##_setup(_metadata, &self); \ + /* Let setup failure terminate early. */ \ + if (!_metadata->passed) \ + return; \ + fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_teardown(_metadata, &self); \ + } \ + static struct __test_metadata \ + _##fixture_name##_##test_name##_object = { \ + name: #fixture_name "." 
#test_name, \ + fn: &wrapper_##fixture_name##_##test_name, \ + termsig: signal, \ + }; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##test_name(void) \ + { \ + __register_test(&_##fixture_name##_##test_name##_object); \ + } \ + static void fixture_name##_##test_name( \ + struct __test_metadata __attribute__((unused)) *_metadata, \ + _FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + +/* Exports a simple wrapper to run the test harness. */ +#define _TEST_HARNESS_MAIN \ + static void __attribute__((constructor)) \ + __constructor_order_last(void) \ + { \ + if (!__constructor_order) \ + __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \ + } \ + int main(int argc, char **argv) { \ + return test_harness_run(argc, argv); \ + } + +#define _ASSERT_EQ(_expected, _seen) \ + __EXPECT(_expected, _seen, ==, 1) +#define _ASSERT_NE(_expected, _seen) \ + __EXPECT(_expected, _seen, !=, 1) +#define _ASSERT_LT(_expected, _seen) \ + __EXPECT(_expected, _seen, <, 1) +#define _ASSERT_LE(_expected, _seen) \ + __EXPECT(_expected, _seen, <=, 1) +#define _ASSERT_GT(_expected, _seen) \ + __EXPECT(_expected, _seen, >, 1) +#define _ASSERT_GE(_expected, _seen) \ + __EXPECT(_expected, _seen, >=, 1) +#define _ASSERT_NULL(_seen) \ + __EXPECT(NULL, _seen, ==, 1) + +#define _ASSERT_TRUE(_seen) \ + _ASSERT_NE(0, _seen) +#define _ASSERT_FALSE(_seen) \ + _ASSERT_EQ(0, _seen) +#define _ASSERT_STREQ(_expected, _seen) \ + __EXPECT_STR(_expected, _seen, ==, 1) +#define _ASSERT_STRNE(_expected, _seen) \ + __EXPECT_STR(_expected, _seen, !=, 1) + +#define _EXPECT_EQ(_expected, _seen) \ + __EXPECT(_expected, _seen, ==, 0) +#define _EXPECT_NE(_expected, _seen) \ + __EXPECT(_expected, _seen, !=, 0) +#define _EXPECT_LT(_expected, _seen) \ + __EXPECT(_expected, _seen, <, 0) +#define _EXPECT_LE(_expected, _seen) \ + __EXPECT(_expected, _seen, <=, 0) +#define _EXPECT_GT(_expected, _seen) \ + __EXPECT(_expected, _seen, >, 0) +#define _EXPECT_GE(_expected, _seen) \ + 
__EXPECT(_expected, _seen, >=, 0) + +#define _EXPECT_NULL(_seen) \ + __EXPECT(NULL, _seen, ==, 0) +#define _EXPECT_TRUE(_seen) \ + _EXPECT_NE(0, _seen) +#define _EXPECT_FALSE(_seen) \ + _EXPECT_EQ(0, _seen) + +#define _EXPECT_STREQ(_expected, _seen) \ + __EXPECT_STR(_expected, _seen, ==, 0) +#define _EXPECT_STRNE(_expected, _seen) \ + __EXPECT_STR(_expected, _seen, !=, 0) + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) + +/* Support an optional handler after and ASSERT_* or EXPECT_*. The approach is + * not thread-safe, but it should be fine in most sane test scenarios. + * + * Using __bail(), which optionally abort()s, is the easiest way to early + * return while still providing an optional block to the API consumer. + */ +#define OPTIONAL_HANDLER(_assert) \ + for (; _metadata->trigger; _metadata->trigger = __bail(_assert)) + +#define __EXPECT(_expected, _seen, _t, _assert) do { \ + /* Avoid multiple evaluation of the cases */ \ + __typeof__(_expected) __exp = (_expected); \ + __typeof__(_seen) __seen = (_seen); \ + if (!(__exp _t __seen)) { \ + unsigned long long __exp_print = 0; \ + unsigned long long __seen_print = 0; \ + /* Avoid casting complaints the scariest way we can. */ \ + memcpy(&__exp_print, &__exp, sizeof(__exp)); \ + memcpy(&__seen_print, &__seen, sizeof(__seen)); \ + __TH_LOG("Expected %s (%llu) %s %s (%llu)", \ + #_expected, __exp_print, #_t, \ + #_seen, __seen_print); \ + _metadata->passed = 0; \ + /* Ensure the optional handler is triggered */ \ + _metadata->trigger = 1; \ + } \ +} while (0); OPTIONAL_HANDLER(_assert) + +#define __EXPECT_STR(_expected, _seen, _t, _assert) do { \ + const char *__exp = (_expected); \ + const char *__seen = (_seen); \ + if (!(strcmp(__exp, __seen) _t 0)) { \ + __TH_LOG("Expected '%s' %s '%s'.", __exp, #_t, __seen); \ + _metadata->passed = 0; \ + _metadata->trigger = 1; \ + } \ +} while (0); OPTIONAL_HANDLER(_assert) + +/* Contains all the information for test execution and status checking. 
*/ +struct __test_metadata { + const char *name; + void (*fn)(struct __test_metadata *); + int termsig; + int passed; + int trigger; /* extra handler after the evaluation */ + struct __test_metadata *prev, *next; +}; + +/* Storage for the (global) tests to be run. */ +static struct __test_metadata *__test_list; +static unsigned int __test_count; +static unsigned int __fixture_count; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +/* + * Since constructors are called in reverse order, reverse the test + * list so tests are run in source declaration order. + * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html + * However, it seems not all toolchains do this correctly, so use + * __constructor_order to detect which direction is called first + * and adjust list building logic to get things running in the right + * direction. + */ +static inline void __register_test(struct __test_metadata *t) +{ + __test_count++; + /* Circular linked list where only prev is circular. */ + if (__test_list == NULL) { + __test_list = t; + t->next = NULL; + t->prev = t; + return; + } + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { + t->next = NULL; + t->prev = __test_list->prev; + t->prev->next = t; + __test_list->prev = t; + } else { + t->next = __test_list; + t->next->prev = t; + t->prev = t; + __test_list = t; + } +} + +static inline int __bail(int for_realz) +{ + if (for_realz) + abort(); + return 0; +} + +void __run_test(struct __test_metadata *t) +{ + pid_t child_pid; + int status; + + t->passed = 1; + t->trigger = 0; + printf("[ RUN ] %s\n", t->name); + child_pid = fork(); + if (child_pid < 0) { + printf("ERROR SPAWNING TEST CHILD\n"); + t->passed = 0; + } else if (child_pid == 0) { + t->fn(t); + _exit(t->passed); + } else { + /* TODO(wad) add timeout support. */ + waitpid(child_pid, &status, 0); + if (WIFEXITED(status)) { + t->passed = t->termsig == -1 ? 
WEXITSTATUS(status) : 0; + if (t->termsig != -1) { + fprintf(TH_LOG_STREAM, + "%s: Test exited normally " + "instead of by signal (code: %d)\n", + t->name, + WEXITSTATUS(status)); + } + } else if (WIFSIGNALED(status)) { + t->passed = 0; + if (WTERMSIG(status) == SIGABRT) { + fprintf(TH_LOG_STREAM, + "%s: Test terminated by assertion\n", + t->name); + } else if (WTERMSIG(status) == t->termsig) { + t->passed = 1; + } else { + fprintf(TH_LOG_STREAM, + "%s: Test terminated unexpectedly " + "by signal %d\n", + t->name, + WTERMSIG(status)); + } + } else { + fprintf(TH_LOG_STREAM, + "%s: Test ended in some other way [%u]\n", + t->name, + status); + } + } + printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); +} + +static int test_harness_run(int __attribute__((unused)) argc, + char __attribute__((unused)) **argv) +{ + struct __test_metadata *t; + int ret = 0; + unsigned int count = 0; + unsigned int pass_count = 0; + + /* TODO(wad) add optional arguments similar to gtest. */ + printf("[==========] Running %u tests from %u test cases.\n", + __test_count, __fixture_count + 1); + for (t = __test_list; t; t = t->next) { + count++; + __run_test(t); + if (t->passed) + pass_count++; + else + ret = 1; + } + printf("[==========] %u / %u tests passed.\n", pass_count, count); + printf("[ %s ]\n", (ret ? 
"FAILED" : "PASSED")); + return ret; +} + +static void __attribute__((constructor)) __constructor_order_first(void) +{ + if (!__constructor_order) + __constructor_order = _CONSTRUCTOR_ORDER_FORWARD; +} + +#endif /* TEST_HARNESS_H_ */ diff --git a/tools/testing/selftests/timers/.gitignore b/tools/testing/selftests/timers/.gitignore new file mode 100644 index 000000000000..ced998151bc4 --- /dev/null +++ b/tools/testing/selftests/timers/.gitignore @@ -0,0 +1,18 @@ +alarmtimer-suspend +change_skew +clocksource-switch +inconsistency-check +leap-a-day +leapcrash +mqueue-lat +nanosleep +nsleep-lat +posix_timers +raw_skew +rtctest +set-2038 +set-tai +set-timer-lat +skew_consistency +threadtest +valid-adjtimex diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index aaffbde1d5ee..72cacf5383dd 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -57,7 +57,7 @@ static inline int ksft_exit_fail(void) #define NSEC_PER_SEC 1000000000ULL -#define UNREASONABLE_LAT (NSEC_PER_SEC * 4) /* hopefully we resume in 4secs */ +#define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */ #define SUSPEND_SECS 15 int alarmcount; @@ -152,7 +152,11 @@ int main(void) alarm_clock_id++) { alarmcount = 0; - timer_create(alarm_clock_id, &se, &tm1); + if (timer_create(alarm_clock_id, &se, &tm1) == -1) { + printf("timer_create failled, %s unspported?\n", + clockstring(alarm_clock_id)); + break; + } clock_gettime(alarm_clock_id, &start_time); printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id), @@ -172,7 +176,7 @@ int main(void) while (alarmcount < 10) { int ret; - sleep(1); + sleep(3); ret = system("echo mem > /sys/power/state"); if (ret) break; diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index a5ce9534eb15..231b9a031f6a 100644 --- a/tools/testing/selftests/vm/Makefile +++ 
b/tools/testing/selftests/vm/Makefile @@ -1,7 +1,12 @@ # Makefile for vm selftests CFLAGS = -Wall -BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest +BINARIES = compaction_test +BINARIES += hugepage-mmap +BINARIES += hugepage-shm +BINARIES += hugetlbfstest +BINARIES += map_hugetlb +BINARIES += thuge-gen BINARIES += transhuge-stress all: $(BINARIES) diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c new file mode 100644 index 000000000000..932ff577ffc0 --- /dev/null +++ b/tools/testing/selftests/vm/compaction_test.c @@ -0,0 +1,225 @@ +/* + * + * A test for the patch "Allow compaction of unevictable pages". + * With this patch we should be able to allocate at least 1/4 + * of RAM in huge pages. Without the patch much less is + * allocated. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> + +#define MAP_SIZE 1048576 + +struct map_list { + void *map; + struct map_list *next; +}; + +int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) +{ + char buffer[256] = {0}; + char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; + FILE *cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + + *memfree = atoll(buffer); + cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; + cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + *hugepagesize = atoll(buffer); + + return 0; +} + +int prereq(void) +{ + char allowed; + int fd; + + fd = open("/proc/sys/vm/compact_unevictable_allowed", + O_RDONLY | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + return -1; + } + + if 
(read(fd, &allowed, sizeof(char)) != sizeof(char)) { + perror("Failed to read from\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + close(fd); + return -1; + } + + close(fd); + if (allowed == '1') + return 0; + + return -1; +} + +int check_compaction(unsigned long mem_free, unsigned int hugepage_size) +{ + int fd; + int compaction_index = 0; + char initial_nr_hugepages[10] = {0}; + char nr_hugepages[10] = {0}; + + /* We want to test with 80% of available memory. Else, OOM killer comes + in to play */ + mem_free = mem_free * 0.8; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open /proc/sys/vm/nr_hugepages"); + return -1; + } + + if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { + perror("Failed to read from /proc/sys/vm/nr_hugepages"); + goto close_fd; + } + + /* Start with the initial condition of 0 huge pages*/ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + perror("Failed to write to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Request a large number of huge pages. 
The Kernel will allocate + as much as it can */ + if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { + perror("Failed to write to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + perror("Failed to read from /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + /* We should have been able to request at least 1/3 rd of the memory in + huge pages */ + compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); + + if (compaction_index > 3) { + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" + "as huge pages\n", compaction_index); + goto close_fd; + } + + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + + if (write(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) + != strlen(initial_nr_hugepages)) { + perror("Failed to write to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + close(fd); + return 0; + + close_fd: + close(fd); + printf("Not OK. 
Compaction test failed."); + return -1; +} + + +int main(int argc, char **argv) +{ + struct rlimit lim; + struct map_list *list, *entry; + size_t page_size, i; + void *map = NULL; + unsigned long mem_free = 0; + unsigned long hugepage_size = 0; + unsigned long mem_fragmentable = 0; + + if (prereq() != 0) { + printf("Either the sysctl compact_unevictable_allowed is not\n" + "set to 1 or couldn't read the proc file.\n" + "Skipping the test\n"); + return 0; + } + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &lim)) { + perror("Failed to set rlimit:\n"); + return -1; + } + + page_size = getpagesize(); + + list = NULL; + + if (read_memory_info(&mem_free, &hugepage_size) != 0) { + printf("ERROR: Cannot read meminfo\n"); + return -1; + } + + mem_fragmentable = mem_free * 0.8 / 1024; + + while (mem_fragmentable > 0) { + map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); + if (map == MAP_FAILED) + break; + + entry = malloc(sizeof(struct map_list)); + if (!entry) { + munmap(map, MAP_SIZE); + break; + } + entry->map = map; + entry->next = list; + list = entry; + + /* Write something (in this case the address of the map) to + * ensure that KSM can't merge the mapped pages + */ + for (i = 0; i < MAP_SIZE; i += page_size) + *(unsigned long *)(map + i) = (unsigned long)map + i; + + mem_fragmentable--; + } + + for (entry = list; entry != NULL; entry = entry->next) { + munmap(entry->map, MAP_SIZE); + if (!entry->next) + break; + entry = entry->next; + } + + if (check_compaction(mem_free, hugepage_size) == 0) + return 0; + + return -1; +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index c87b6812300d..49ece11ff7fd 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -90,4 +90,16 @@ fi umount $mnt rm -rf $mnt echo $nr_hugepgs > /proc/sys/vm/nr_hugepages + +echo "-----------------------" 
+echo "running compaction_test" +echo "-----------------------" +./compaction_test +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + exit $exitcode diff --git a/tools/testing/selftests/x86/trivial_64bit_program.c b/tools/testing/selftests/x86/trivial_64bit_program.c index b994946c40fb..05c6a41b3671 100644 --- a/tools/testing/selftests/x86/trivial_64bit_program.c +++ b/tools/testing/selftests/x86/trivial_64bit_program.c @@ -1,5 +1,5 @@ /* - * Trivial program to check that we have a valid 32-bit build environment. + * Trivial program to check that we have a valid 64-bit build environment. * Copyright (c) 2015 Andy Lutomirski * GPL v2 */ |