468 files changed, 14555 insertions, 6832 deletions
diff --git a/Documentation/assoc_array.txt b/Documentation/assoc_array.txt new file mode 100644 index 00000000000..f4faec0f66e --- /dev/null +++ b/Documentation/assoc_array.txt @@ -0,0 +1,574 @@ + ======================================== + GENERIC ASSOCIATIVE ARRAY IMPLEMENTATION + ======================================== + +Contents: + + - Overview. + + - The public API. + - Edit script. + - Operations table. + - Manipulation functions. + - Access functions. + - Index key form. + + - Internal workings. + - Basic internal tree layout. + - Shortcuts. + - Splitting and collapsing nodes. + - Non-recursive iteration. + - Simultaneous alteration and iteration. + + +======== +OVERVIEW +======== + +This associative array implementation is an object container with the following +properties: + + (1) Objects are opaque pointers. The implementation does not care where they + point (if anywhere) or what they point to (if anything). + + [!] NOTE: Pointers to objects _must_ be zero in the least significant bit. + + (2) Objects do not need to contain linkage blocks for use by the array. This + permits an object to be located in multiple arrays simultaneously. + Rather, the array is made up of metadata blocks that point to objects. + + (3) Objects require index keys to locate them within the array. + + (4) Index keys must be unique. Inserting an object with the same key as one + already in the array will replace the old object. + + (5) Index keys can be of any length and can be of different lengths. + + (6) Index keys should encode the length early on, before any variation due to + length is seen. + + (7) Index keys can include a hash to scatter objects throughout the array. + + (8) The array can be iterated over. The objects will not necessarily come out in + key order. + + (9) The array can be iterated over whilst it is being modified, provided the + RCU readlock is being held by the iterator. Note, however, under these + circumstances, some objects may be seen more than once. If this is a + problem, the iterator should lock against modification. Objects will not + be missed, however, unless deleted. + +(10) Objects in the array can be looked up by means of their index key. + +(11) Objects can be looked up whilst the array is being modified, provided the + RCU readlock is being held by the thread doing the look up. + +The implementation uses a tree of 16-pointer nodes internally that are indexed +on each level by nibbles from the index key in the same manner as in a radix +tree. To improve memory efficiency, shortcuts can be emplaced to skip over +what would otherwise be a series of single-occupancy nodes. Further, nodes +pack leaf object pointers into spare space in the node rather than making an +extra branch until such time as an object needs to be added to a full node. + + +============== +THE PUBLIC API +============== + +The public API can be found in <linux/assoc_array.h>. The associative array is +rooted on the following structure: + + struct assoc_array { + ... + }; + +The code is selected by enabling CONFIG_ASSOCIATIVE_ARRAY. + + +EDIT SCRIPT +----------- + +The insertion and deletion functions produce an 'edit script' that can later be +applied to effect the changes without risking ENOMEM. This retains the +preallocated metadata blocks that will be installed in the internal tree and +keeps track of the metadata blocks that will be removed from the tree when the +script is applied. 
+ +This is also used to keep track of dead blocks and dead objects after the +script has been applied so that they can be freed later. The freeing is done +after an RCU grace period has passed - thus allowing access functions to +proceed under the RCU read lock. + +The script appears outside of the API as a pointer of the type: + + struct assoc_array_edit; + +There are two functions for dealing with the script: + + (1) Apply an edit script. + + void assoc_array_apply_edit(struct assoc_array_edit *edit); + + This will perform the edit functions, interpolating various write barriers + to permit accesses under the RCU read lock to continue. The edit script + will then be passed to call_rcu() to free it and any dead stuff it points + to. + + (2) Cancel an edit script. + + void assoc_array_cancel_edit(struct assoc_array_edit *edit); + + This frees the edit script and all preallocated memory immediately. If + this was for insertion, the new object is _not_ released by this function, + but must rather be released by the caller. + +These functions are guaranteed not to fail. + + +OPERATIONS TABLE +---------------- + +Various functions take a table of operations: + + struct assoc_array_ops { + ... + }; + +This points to a number of methods, all of which need to be provided: + + (1) Get a chunk of index key from caller data: + + unsigned long (*get_key_chunk)(const void *index_key, int level); + + This should return a chunk of caller-supplied index key starting at the + *bit* position given by the level argument. The level argument will be a + multiple of ASSOC_ARRAY_KEY_CHUNK_SIZE and the function should return + ASSOC_ARRAY_KEY_CHUNK_SIZE bits. No error is possible. + + + (2) Get a chunk of an object's index key. + + unsigned long (*get_object_key_chunk)(const void *object, int level); + + As with the previous function, but gets its data from an object in the array + rather than from a caller-supplied index key. + + + (3) See if this is the object we're looking for. + + bool (*compare_object)(const void *object, const void *index_key); + + Compare the object against an index key and return true if it matches and + false if it doesn't. + + + (4) Diff the index keys of two objects. + + int (*diff_objects)(const void *a, const void *b); + + Return the bit position at which the index keys of two objects differ or + -1 if they are the same. + + + (5) Free an object. + + void (*free_object)(void *object); + + Free the specified object. Note that this may be called an RCU grace + period after assoc_array_apply_edit() was called, so synchronize_rcu() may + be necessary on module unloading. + + +MANIPULATION FUNCTIONS +---------------------- + +There are a number of functions for manipulating an associative array: + + (1) Initialise an associative array. + + void assoc_array_init(struct assoc_array *array); + + This initialises the base structure for an associative array. It can't + fail. + + + (2) Insert/replace an object in an associative array. + + struct assoc_array_edit * + assoc_array_insert(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + void *object); + + This inserts the given object into the array. Note that the least + significant bit of the pointer must be zero as it's used to type-mark + pointers internally. + + If an object already exists for that key then it will be replaced with the + new object and the old one will be freed automatically. 
+ + The index_key argument should hold index key information and is + passed to the methods in the ops table when they are called. + + This function makes no alteration to the array itself, but rather returns + an edit script that must be applied. -ENOMEM is returned in the case of + an out-of-memory error. + + The caller should lock exclusively against other modifiers of the array. + + + (3) Delete an object from an associative array. + + struct assoc_array_edit * + assoc_array_delete(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key); + + This deletes an object that matches the specified data from the array. + + The index_key argument should hold index key information and is + passed to the methods in the ops table when they are called. + + This function makes no alteration to the array itself, but rather returns + an edit script that must be applied. -ENOMEM is returned in the case of + an out-of-memory error. NULL will be returned if the specified object is + not found within the array. + + The caller should lock exclusively against other modifiers of the array. + + + (4) Delete all objects from an associative array. + + struct assoc_array_edit * + assoc_array_clear(struct assoc_array *array, + const struct assoc_array_ops *ops); + + This deletes all the objects from an associative array and leaves it + completely empty. + + This function makes no alteration to the array itself, but rather returns + an edit script that must be applied. -ENOMEM is returned in the case of + an out-of-memory error. + + The caller should lock exclusively against other modifiers of the array. + + + (5) Destroy an associative array, deleting all objects. + + void assoc_array_destroy(struct assoc_array *array, + const struct assoc_array_ops *ops); + + This destroys the contents of the associative array and leaves it + completely empty. It is not permitted for another thread to be traversing + the array under the RCU read lock at the same time as this function is + destroying it as no RCU deferral is performed on memory release - + something that would require memory to be allocated. + + The caller should lock exclusively against other modifiers and accessors + of the array. + + + (6) Garbage collect an associative array. + + int assoc_array_gc(struct assoc_array *array, + const struct assoc_array_ops *ops, + bool (*iterator)(void *object, void *iterator_data), + void *iterator_data); + + This iterates over the objects in an associative array and passes each one + to iterator(). If iterator() returns true, the object is kept. If it + returns false, the object will be freed. If the iterator() function + returns true, it must perform any appropriate refcount incrementing on the + object before returning. + + The internal tree will be packed down if possible as part of the iteration + to reduce the number of nodes in it. + + The iterator_data is passed directly to iterator() and is otherwise + ignored by the function. + + The function will return 0 if successful and -ENOMEM if there wasn't + enough memory. + + It is possible for other threads to iterate over or search the array under + the RCU read lock whilst this function is in progress. The caller should + lock exclusively against other modifiers of the array. + + +ACCESS FUNCTIONS +---------------- + +There are two functions for accessing an associative array: + + (1) Iterate over all the objects in an associative array. 
+ + int assoc_array_iterate(const struct assoc_array *array, + int (*iterator)(const void *object, + void *iterator_data), + void *iterator_data); + + This passes each object in the array to the iterator callback function. + iterator_data is private data for that function. + + This may be used on an array at the same time as the array is being + modified, provided the RCU read lock is held. Under such circumstances, + it is possible for the iteration function to see some objects twice. If + this is a problem, then modification should be locked against. The + iteration algorithm should not, however, miss any objects. + + The function will return 0 if no objects were in the array or else it will + return the result of the last iterator function called. Iteration stops + immediately if any call to the iteration function results in a non-zero + return. + + + (2) Find an object in an associative array. + + void *assoc_array_find(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key); + + This walks through the array's internal tree directly to the object + specified by the index key. + + This may be used on an array at the same time as the array is being + modified, provided the RCU read lock is held. + + The function will return the object if found or will return NULL if the + object was not found. + + +INDEX KEY FORM +-------------- + +The index key can be of any form, but since the algorithms aren't told how long +the key is, it is strongly recommended that the index key includes its length +very early on before any variation due to the length would have an effect on +comparisons. + +This will cause leaves with different length keys to scatter away from each +other - and those with the same length keys to cluster together. + +It is also recommended that the index key begin with a hash of the rest of the +key to maximise scattering throughout keyspace. + +The better the scattering, the wider and lower the internal tree will be. + +Poor scattering isn't too much of a problem as there are shortcuts and nodes +can contain mixtures of leaves and metadata pointers. + +The index key is read in machine-word-sized chunks. Each chunk is subdivided into +one nibble (4 bits) per level, so on a 32-bit CPU this is good for 8 levels and +on a 64-bit CPU, 16 levels. Unless the scattering is really poor, it is +unlikely that more than one word of any particular index key will have to be +used. + + +================= +INTERNAL WORKINGS +================= + +The associative array data structure has an internal tree. This tree is +constructed of two types of metadata blocks: nodes and shortcuts. + +A node is an array of slots. Each slot can contain one of four things: + + (*) A NULL pointer, indicating that the slot is empty. + + (*) A pointer to an object (a leaf). + + (*) A pointer to a node at the next level. + + (*) A pointer to a shortcut. + + +BASIC INTERNAL TREE LAYOUT +-------------------------- + +Ignoring shortcuts for the moment, the nodes form a multilevel tree. The index +key space is strictly subdivided by the nodes in the tree and nodes occur on +fixed levels. 
For example: + + Level: 0 1 2 3 + =============== =============== =============== =============== + NODE D + NODE B NODE C +------>+---+ + +------>+---+ +------>+---+ | | 0 | + NODE A | | 0 | | | 0 | | +---+ + +---+ | +---+ | +---+ | : : + | 0 | | : : | : : | +---+ + +---+ | +---+ | +---+ | | f | + | 1 |---+ | 3 |---+ | 7 |---+ +---+ + +---+ +---+ +---+ + : : : : | 8 |---+ + +---+ +---+ +---+ | NODE E + | e |---+ | f | : : +------>+---+ + +---+ | +---+ +---+ | 0 | + | f | | | f | +---+ + +---+ | +---+ : : + | NODE F +---+ + +------>+---+ | f | + | 0 | NODE G +---+ + +---+ +------>+---+ + : : | | 0 | + +---+ | +---+ + | 6 |---+ : : + +---+ +---+ + : : | f | + +---+ +---+ + | f | + +---+ + +In the above example, there are 7 nodes (A-G), each with 16 slots (0-f). +Assuming no other metadata nodes in the tree, the key space is divided thusly: + + KEY PREFIX NODE + ========== ==== + 137* D + 138* E + 13[0-69-f]* C + 1[0-24-f]* B + e6* G + e[0-57-f]* F + [02-df]* A + +So, for instance, keys with the following example index keys will be found in +the appropriate nodes: + + INDEX KEY PREFIX NODE + =============== ======= ==== + 13694892892489 13 C + 13795289025897 137 D + 13889dde88793 138 E + 138bbb89003093 138 E + 1394879524789 13 C + 1458952489 1 B + 9431809de993ba - A + b4542910809cd - A + e5284310def98 e F + e68428974237 e6 G + e7fffcbd443 e F + f3842239082 - A + +To save memory, if a node can hold all the leaves in its portion of keyspace, +then the node will have all those leaves in it and will not have any metadata +pointers - even if some of those leaves would like to be in the same slot. + +A node can contain a heterogeneous mix of leaves and metadata pointers. +Metadata pointers must be in the slots that match their subdivisions of key +space. The leaves can be in any slot not occupied by a metadata pointer. It +is guaranteed that none of the leaves in a node will match a slot occupied by a +metadata pointer. If the metadata pointer is there, any leaf whose key matches +the metadata key prefix must be in the subtree that the metadata pointer points +to. + +In the above example list of index keys, node A will contain: + + SLOT CONTENT INDEX KEY (PREFIX) + ==== =============== ================== + 1 PTR TO NODE B 1* + any LEAF 9431809de993ba + any LEAF b4542910809cd + e PTR TO NODE F e* + any LEAF f3842239082 + +and node B: + + 3 PTR TO NODE C 13* + any LEAF 1458952489 + + +SHORTCUTS +--------- + +Shortcuts are metadata records that jump over a piece of keyspace. A shortcut +is a replacement for a series of single-occupancy nodes ascending through the +levels. Shortcuts exist to save memory and to speed up traversal. + +It is possible for the root of the tree to be a shortcut - say, for example, +the tree contains at least 17 nodes all with key prefix '1111'. The insertion +algorithm will insert a shortcut to skip over the '1111' keyspace in a single +bound and get to the fourth level where these actually become different. + + +SPLITTING AND COLLAPSING NODES +------------------------------ + +Each node has a maximum capacity of 16 leaves and metadata pointers. If the +insertion algorithm finds that it is trying to insert a 17th object into a +node, that node will be split such that at least two leaves that have a common +key segment at that level end up in a separate node rooted on that slot for +that common key segment. + +If the leaves in a full node and the leaf that is being inserted are +sufficiently similar, then a shortcut will be inserted into the tree. 
+ +When the number of objects in the subtree rooted at a node falls to 16 or +fewer, then the subtree will be collapsed down to a single node - and this will +ripple towards the root if possible. + + +NON-RECURSIVE ITERATION +----------------------- + +Each node and shortcut contains a back pointer to its parent and the number of +the slot in that parent that points to it. Non-recursive iteration uses these to +proceed rootwards through the tree, going to slot N + 1 in the parent node to +make sure progress is made without the need for a stack. + +The backpointers, however, make simultaneous alteration and iteration tricky. + + +SIMULTANEOUS ALTERATION AND ITERATION +------------------------------------- + +There are a number of cases to consider: + + (1) Simple insert/replace. This involves simply replacing a NULL or old + matching leaf pointer with the pointer to the new leaf after a barrier. + The metadata blocks don't change otherwise. An old leaf won't be freed + until after the RCU grace period. + + (2) Simple delete. This involves just clearing an old matching leaf. The + metadata blocks don't change otherwise. The old leaf won't be freed until + after the RCU grace period. + + (3) Insertion replacing part of a subtree that we haven't yet entered. This + may involve replacement of part of that subtree - but that won't affect + the iteration as we won't have reached the pointer to it yet and the + ancestry blocks are not replaced (the layout of those does not change). + + (4) Insertion replacing nodes that we're actively processing. This isn't a + problem as we've passed the anchoring pointer and won't switch onto the + new layout until we follow the back pointers - at which point we've + already examined the leaves in the replaced node (we iterate over all the + leaves in a node before following any of its metadata pointers). + + We might, however, re-see some leaves that have been split out into a new + branch that's in a slot further along than we were at. + + (5) Insertion replacing nodes that we're processing a dependent branch of. + This won't affect us until we follow the back pointers. Similar to (4). + + (6) Deletion collapsing a branch under us. This doesn't affect us because the + back pointers will get us back to the parent of the new node before we + could see the new node. The entire collapsed subtree is thrown away + unchanged - and will still be rooted on the same slot, so we shouldn't + process it a second time as we'll go back to slot + 1. + +Note: + + (*) Under some circumstances, we need to simultaneously change the parent + pointer and the parent slot pointer on a node (say, for example, we + inserted another node before it and moved it up a level). We cannot do + this without locking against a read - so we have to replace that node too. + + However, when we're changing a shortcut into a node this isn't a problem + as shortcuts only have one slot and so the parent slot number isn't used + when traversing backwards over one. This means that it's okay to change + the slot number first - provided suitable barriers are used to make sure + the parent slot number is read after the back pointer. + +Obsolete blocks and leaves are freed up after an RCU grace period has passed, +so as long as anyone doing walking or iteration holds the RCU read lock, the +old superstructure should not go away on them. 
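As a concrete usage sketch of the API documented above - illustrative only: 'struct foo', the foo_* helpers, foo_ops and the single-machine-word hash key are hypothetical and not part of the kernel API; the assoc_array_* calls and ASSOC_ARRAY_KEY_CHUNK_SIZE are the ones documented here, and the -ENOMEM failure is assumed to arrive as an ERR_PTR(), hence the IS_ERR() check:

	#include <linux/assoc_array.h>
	#include <linux/bitops.h>
	#include <linux/err.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		unsigned long hash;	/* index key: one machine word */
	};

	/* Return ASSOC_ARRAY_KEY_CHUNK_SIZE bits of key starting at bit
	   'level'.  With a one-word key, only level 0 carries information. */
	static unsigned long foo_get_key_chunk(const void *index_key, int level)
	{
		return level == 0 ? *(const unsigned long *)index_key : 0;
	}

	static unsigned long foo_get_object_key_chunk(const void *object, int level)
	{
		return level == 0 ? ((const struct foo *)object)->hash : 0;
	}

	/* A real client would usually compare more than a bare hash here. */
	static bool foo_compare_object(const void *object, const void *index_key)
	{
		return ((const struct foo *)object)->hash ==
			*(const unsigned long *)index_key;
	}

	/* Bit position at which the two keys differ, or -1 if identical. */
	static int foo_diff_objects(const void *a, const void *b)
	{
		unsigned long x = ((const struct foo *)a)->hash ^
				  ((const struct foo *)b)->hash;
		return x ? __ffs(x) : -1;
	}

	static void foo_free_object(void *object)
	{
		kfree(object);	/* may run an RCU grace period after apply */
	}

	static const struct assoc_array_ops foo_ops = {
		.get_key_chunk		= foo_get_key_chunk,
		.get_object_key_chunk	= foo_get_object_key_chunk,
		.compare_object		= foo_compare_object,
		.diff_objects		= foo_diff_objects,
		.free_object		= foo_free_object,
	};

	/* Caller must hold an exclusive lock against other modifiers.  A
	   kmalloc'd pointer satisfies the zero-least-significant-bit rule. */
	static int foo_add(struct assoc_array *array, struct foo *obj)
	{
		struct assoc_array_edit *edit;

		edit = assoc_array_insert(array, &foo_ops, &obj->hash, obj);
		if (IS_ERR(edit))
			return PTR_ERR(edit);		/* -ENOMEM */
		assoc_array_apply_edit(edit);		/* cannot fail */
		return 0;
	}

	/* Lookup may run concurrently with modification under the RCU read
	   lock; a real caller would take a reference on the object before
	   dropping the lock, as the array may free it afterwards. */
	static struct foo *foo_find(struct assoc_array *array, unsigned long hash)
	{
		struct foo *obj;

		rcu_read_lock();
		obj = assoc_array_find(array, &foo_ops, &hash);
		rcu_read_unlock();
		return obj;
	}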
diff --git a/Documentation/devicetree/bindings/dma/atmel-dma.txt b/Documentation/devicetree/bindings/dma/atmel-dma.txt index e1f343c7a34..f69bcf5a634 100644 --- a/Documentation/devicetree/bindings/dma/atmel-dma.txt +++ b/Documentation/devicetree/bindings/dma/atmel-dma.txt @@ -28,7 +28,7 @@ The three cells in order are: dependent: - bit 7-0: peripheral identifier for the hardware handshaking interface. The identifier can be different for tx and rx. - - bit 11-8: FIFO configuration. 0 for half FIFO, 1 for ALAP, 1 for ASAP. + - bit 11-8: FIFO configuration. 0 for half FIFO, 1 for ALAP, 2 for ASAP. Example: diff --git a/Documentation/devicetree/bindings/i2c/trivial-devices.txt b/Documentation/devicetree/bindings/i2c/trivial-devices.txt index ad6a73852f0..f1fb26eed0e 100644 --- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt +++ b/Documentation/devicetree/bindings/i2c/trivial-devices.txt @@ -15,6 +15,7 @@ adi,adt7461 +/-1C TDM Extended Temp Range I.C adt7461 +/-1C TDM Extended Temp Range I.C at,24c08 i2c serial eeprom (24cxx) atmel,24c02 i2c serial eeprom (24cxx) +atmel,at97sc3204t i2c trusted platform module (TPM) catalyst,24c32 i2c serial eeprom dallas,ds1307 64 x 8, Serial, I2C Real-Time Clock dallas,ds1338 I2C RTC with 56-Byte NV RAM @@ -44,6 +45,7 @@ mc,rv3029c2 Real Time Clock Module with I2C-Bus national,lm75 I2C TEMP SENSOR national,lm80 Serial Interface ACPI-Compatible Microprocessor System Hardware Monitor national,lm92 ±0.33°C Accurate, 12-Bit + Sign Temperature Sensor and Thermal Window Comparator with Two-Wire Interface +nuvoton,npct501 i2c trusted platform module (TPM) nxp,pca9556 Octal SMBus and I2C registered interface nxp,pca9557 8-bit I2C-bus and SMBus I/O port with reset nxp,pcf8563 Real-time clock/calendar @@ -61,3 +63,4 @@ taos,tsl2550 Ambient Light Sensor with SMBUS/Two Wire Serial Interface ti,tsc2003 I2C Touch-Screen Controller ti,tmp102 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface ti,tmp275 Digital Temperature Sensor +winbond,wpct301 i2c trusted platform module (TPM) diff --git a/Documentation/devicetree/bindings/powerpc/fsl/dma.txt b/Documentation/devicetree/bindings/powerpc/fsl/dma.txt index 2a4b4bce611..7fc1b010fa7 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/dma.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/dma.txt @@ -1,33 +1,30 @@ -* Freescale 83xx DMA Controller +* Freescale DMA Controllers -Freescale PowerPC 83xx have on chip general purpose DMA controllers. +** Freescale Elo DMA Controller + This is a little-endian 4-channel DMA controller, used in Freescale mpc83xx + series chips such as mpc8315, mpc8349, mpc8379 etc. Required properties: -- compatible : compatible list, contains 2 entries, first is - "fsl,CHIP-dma", where CHIP is the processor - (mpc8349, mpc8360, etc.) and the second is - "fsl,elo-dma" -- reg : <registers mapping for DMA general status reg> -- ranges : Should be defined as specified in 1) to describe the - DMA controller channels. +- compatible : must include "fsl,elo-dma" +- reg : DMA General Status Register, i.e. DGSR which contains + status for all the 4 DMA channels +- ranges : describes the mapping between the address space of the + DMA channels and the address space of the DMA controller - cell-index : controller index. 
0 for controller @ 0x8100 -- interrupts : <interrupt mapping for DMA IRQ> +- interrupts : interrupt specifier for DMA IRQ - interrupt-parent : optional, if needed for interrupt mapping - - DMA channel nodes: - - compatible : compatible list, contains 2 entries, first is - "fsl,CHIP-dma-channel", where CHIP is the processor - (mpc8349, mpc8350, etc.) and the second is - "fsl,elo-dma-channel". However, see note below. - - reg : <registers mapping for channel> - - cell-index : dma channel index starts at 0. + - compatible : must include "fsl,elo-dma-channel" + However, see note below. + - reg : DMA channel specific registers + - cell-index : DMA channel index starts at 0. Optional properties: - - interrupts : <interrupt mapping for DMA channel IRQ> - (on 83xx this is expected to be identical to - the interrupts property of the parent node) + - interrupts : interrupt specifier for DMA channel IRQ + (on 83xx this is expected to be identical to + the interrupts property of the parent node) - interrupt-parent : optional, if needed for interrupt mapping Example: @@ -70,30 +67,27 @@ Example: }; }; -* Freescale 85xx/86xx DMA Controller - -Freescale PowerPC 85xx/86xx have on chip general purpose DMA controllers. +** Freescale EloPlus DMA Controller + This is a 4-channel DMA controller with extended addresses and chaining, + mainly used in Freescale mpc85xx/86xx, Pxxx and BSC series chips, such as + mpc8540, mpc8641, p4080, bsc9131 etc. Required properties: -- compatible : compatible list, contains 2 entries, first is - "fsl,CHIP-dma", where CHIP is the processor - (mpc8540, mpc8540, etc.) and the second is - "fsl,eloplus-dma" -- reg : <registers mapping for DMA general status reg> +- compatible : must include "fsl,eloplus-dma" +- reg : DMA General Status Register, i.e. DGSR which contains + status for all the 4 DMA channels - cell-index : controller index. 0 for controller @ 0x21000, 1 for controller @ 0xc000 -- ranges : Should be defined as specified in 1) to describe the - DMA controller channels. +- ranges : describes the mapping between the address space of the + DMA channels and the address space of the DMA controller - DMA channel nodes: - - compatible : compatible list, contains 2 entries, first is - "fsl,CHIP-dma-channel", where CHIP is the processor - (mpc8540, mpc8560, etc.) and the second is - "fsl,eloplus-dma-channel". However, see note below. - - cell-index : dma channel index starts at 0. - - reg : <registers mapping for channel> - - interrupts : <interrupt mapping for DMA channel IRQ> + - compatible : must include "fsl,eloplus-dma-channel" + However, see note below. + - cell-index : DMA channel index starts at 0. + - reg : DMA channel specific registers + - interrupts : interrupt specifier for DMA channel IRQ - interrupt-parent : optional, if needed for interrupt mapping Example: @@ -134,6 +128,76 @@ Example: }; }; +** Freescale Elo3 DMA Controller + DMA controller which has the same function as EloPlus, except that Elo3 has 8 + channels while EloPlus has only 4; it is used in Freescale Txxx and Bxxx + series chips, such as t1040, t4240, b4860. + +Required properties: + +- compatible : must include "fsl,elo3-dma" +- reg : contains two entries for DMA General Status Registers, + i.e. 
DGSR0 which includes status for channel 1~4, and + DGSR1 for channel 5~8 +- ranges : describes the mapping between the address space of the + DMA channels and the address space of the DMA controller + +- DMA channel nodes: + - compatible : must include "fsl,eloplus-dma-channel" + - reg : DMA channel specific registers + - interrupts : interrupt specifier for DMA channel IRQ + - interrupt-parent : optional, if needed for interrupt mapping + +Example: +dma@100300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,elo3-dma"; + reg = <0x100300 0x4>, + <0x100600 0x4>; + ranges = <0x0 0x100100 0x500>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + interrupts = <28 2 0 0>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + interrupts = <29 2 0 0>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + interrupts = <30 2 0 0>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + interrupts = <31 2 0 0>; + }; + dma-channel@300 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x300 0x80>; + interrupts = <76 2 0 0>; + }; + dma-channel@380 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x380 0x80>; + interrupts = <77 2 0 0>; + }; + dma-channel@400 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x400 0x80>; + interrupts = <78 2 0 0>; + }; + dma-channel@480 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x480 0x80>; + interrupts = <79 2 0 0>; + }; +}; + Note on DMA channel compatible properties: The compatible property must say "fsl,elo-dma-channel" or "fsl,eloplus-dma-channel" to be used by the Elo DMA driver (fsldma). Any DMA channel used by fsldma cannot be used by another diff --git a/Documentation/dmatest.txt b/Documentation/dmatest.txt index a2b5663eae2..dd77a81bdb8 100644 --- a/Documentation/dmatest.txt +++ b/Documentation/dmatest.txt @@ -15,39 +15,48 @@ be built as module or inside kernel. Let's consider those cases. Part 2 - When dmatest is built as a module... -After mounting debugfs and loading the module, the /sys/kernel/debug/dmatest -folder with nodes will be created. There are two important files located. First -is the 'run' node that controls run and stop phases of the test, and the second -one, 'results', is used to get the test case results. - -Note that in this case test will not run on load automatically. - Example of usage: + % modprobe dmatest channel=dma0chan0 timeout=2000 iterations=1 run=1 + +...or: + % modprobe dmatest % echo dma0chan0 > /sys/module/dmatest/parameters/channel % echo 2000 > /sys/module/dmatest/parameters/timeout % echo 1 > /sys/module/dmatest/parameters/iterations - % echo 1 > /sys/kernel/debug/dmatest/run + % echo 1 > /sys/module/dmatest/parameters/run + +...or on the kernel command line: + + dmatest.channel=dma0chan0 dmatest.timeout=2000 dmatest.iterations=1 dmatest.run=1 Hint: available channel list could be extracted by running the following command: % ls -1 /sys/class/dma/ -After a while you will start to get messages about current status or error like -in the original code. +Once started, a message like "dmatest: Started 1 threads using dma0chan0" is +emitted. After that, only test failure messages are reported until the test +stops. Note that running a new test will not stop any in-progress test. -The following command should return actual state of the test. - % cat /sys/kernel/debug/dmatest/run - -To wait for test done the user may perform a busy loop that checks the state. 
- - % while [ $(cat /sys/kernel/debug/dmatest/run) = "Y" ] - > do - > echo -n "." - > sleep 1 - > done - > echo +The following command returns the state of the test. + % cat /sys/module/dmatest/parameters/run + +To wait for test completion, userspace can poll 'run' until it is false, or use +the wait parameter. Specifying 'wait=1' when loading the module causes module +initialization to pause until a test run has completed, while reading +/sys/module/dmatest/parameters/wait waits for any running test to complete +before returning. For example, the following scripts wait for 42 tests +to complete before exiting. Note that if 'iterations' is set to 'infinite' then +waiting is disabled. + +Example: + % modprobe dmatest run=1 iterations=42 wait=1 + % modprobe -r dmatest +...or: + % modprobe dmatest run=1 iterations=42 + % cat /sys/module/dmatest/parameters/wait + % modprobe -r dmatest Part 3 - When built-in in the kernel... @@ -62,21 +71,22 @@ case. You always could check them at run-time by running Part 4 - Gathering the test results -The module provides a storage for the test results in the memory. The gathered -data could be used after test is done. +Test results are printed to the kernel log buffer with the format: -The special file 'results' in the debugfs represents gathered data of the in -progress test. The messages collected are printed to the kernel log as well. +"dmatest: result <channel>: <test id>: '<error msg>' with src_off=<val> dst_off=<val> len=<val> (<err code>)" Example of output: - % cat /sys/kernel/debug/dmatest/results - dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0) + % dmesg | tail -n 1 + dmatest: result dma0chan0-copy0: #1: No errors with src_off=0x7bf dst_off=0x8ad len=0x3fea (0) The message format is unified across the different types of errors. A number in the parens represents additional information, e.g. error code, error counter, -or status. +or status. A test thread also emits a summary line at completion listing the +number of tests executed, number that failed, and a result code. -Comparison between buffers is stored to the dedicated structure. +Example: + % dmesg | tail -n 1 + dmatest: dma0chan0-copy0: summary 1 test, 0 failures 1000 iops 100000 KB/s (0) -Note that the verify result is now accessible only via file 'results' in the -debugfs. +The details of a data miscompare error are also emitted, but do not follow the +above format. diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt index 9dae5940743..5dd282dda55 100644 --- a/Documentation/filesystems/btrfs.txt +++ b/Documentation/filesystems/btrfs.txt @@ -70,6 +70,12 @@ Unless otherwise specified, all options default to off. See comments at the top of fs/btrfs/check-integrity.c for more info. + commit=<seconds> + Set the interval of periodic commit, 30 seconds by default. Higher + values defer data being synced to permanent storage with obvious + consequences when the system crashes. The upper bound is not forced, + but a warning is printed if it's more than 300 seconds (5 minutes). + compress compress=<type> compress-force @@ -154,7 +160,11 @@ Unless otherwise specified, all options default to off. Currently this scans a list of several previous tree roots and tries to use the first readable. - skip_balance + rescan_uuid_tree + Force check and rebuild procedure of the UUID tree. This should not + normally be needed. + + skip_balance Skip automatic resume of interrupted balance operation after mount. 
May be resumed with "btrfs balance resume." @@ -234,24 +244,14 @@ available from the git repository at the following location: These include the following tools: -mkfs.btrfs: create a filesystem - -btrfsctl: control program to create snapshots and subvolumes: +* mkfs.btrfs: create a filesystem - mount /dev/sda2 /mnt - btrfsctl -s new_subvol_name /mnt - btrfsctl -s snapshot_of_default /mnt/default - btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name - btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol - ls /mnt - default snapshot_of_a_snapshot snapshot_of_new_subvol - new_subvol_name snapshot_of_default +* btrfs: a single tool to manage the filesystems; refer to the manpage for more details - Snapshots and subvolumes cannot be deleted right now, but you can - rm -rf all the files and directories inside them. +* 'btrfsck' or 'btrfs check': do a consistency check of the filesystem -btrfsck: do a limited check of the FS extent trees. +Other tools for specific tasks: -btrfs-debug-tree: print all of the FS metadata in text form. Example: +* btrfs-convert: in-place conversion from ext2/3/4 filesystems - btrfs-debug-tree /dev/sda2 >& big_output_file +* btrfs-image: dump filesystem metadata for debugging diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9ca3e74a10e..50680a59a2f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1190,15 +1190,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted. owned by uid=0. ima_hash= [IMA] - Format: { "sha1" | "md5" } + Format: { md5 | sha1 | rmd160 | sha256 | sha384 + | sha512 | ... } default: "sha1" + The list of supported hash algorithms is defined + in crypto/hash_info.h. + ima_tcb [IMA] Load a policy which meets the needs of the Trusted Computing Base. This means IMA will measure all programs exec'd, files mmap'd for exec, and all files opened for read by uid=0. + ima_template= [IMA] + Select one of the defined IMA measurement template formats. + Formats: { "ima" | "ima-ng" } + Default: "ima-ng" + init= [KNL] Format: <full_path> Run specified binary instead of /sbin/init as init diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt index 0f54333b0ff..b6ce00b2be9 100644 --- a/Documentation/power/runtime_pm.txt +++ b/Documentation/power/runtime_pm.txt @@ -547,13 +547,11 @@ helper functions described in Section 4. In that case, pm_runtime_resume() should be used. Of course, for this purpose the device's runtime PM has to be enabled earlier by calling pm_runtime_enable(). -If the device bus type's or driver's ->probe() callback runs -pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts, -they will fail returning -EAGAIN, because the device's usage counter is -incremented by the driver core before executing ->probe(). Still, it may be -desirable to suspend the device as soon as ->probe() has finished, so the driver -core uses pm_runtime_put_sync() to invoke the subsystem-level idle callback for -the device at that time. +It may be desirable to suspend the device once ->probe() has finished. +Therefore the driver core uses the asynchronous pm_request_idle() to submit a +request to execute the subsystem-level idle callback for the device at that +time. A driver that makes use of the runtime autosuspend feature may want to +update the last busy mark before returning from ->probe(). 
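A minimal sketch of that recommendation in practice - foo_probe() and the 1000 ms delay are hypothetical; the pm_runtime_* helpers are the existing runtime PM API:

	#include <linux/platform_device.h>
	#include <linux/pm_runtime.h>

	static int foo_probe(struct platform_device *pdev)
	{
		/* ... normal resource acquisition and device setup ... */

		pm_runtime_set_autosuspend_delay(&pdev->dev, 1000);	/* ms */
		pm_runtime_use_autosuspend(&pdev->dev);
		pm_runtime_enable(&pdev->dev);

		/* Refresh the last-busy timestamp so that the idle request
		   issued by the driver core after ->probe() does not
		   autosuspend the device earlier than intended. */
		pm_runtime_mark_last_busy(&pdev->dev);
		return 0;
	}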
Moreover, the driver core prevents runtime PM callbacks from racing with the bus notifier callback in __device_release_driver(), which is necessary, because the @@ -656,7 +654,7 @@ out the following operations: __pm_runtime_disable() with 'false' as the second argument for every device right before executing the subsystem-level .suspend_late() callback for it. - * During system resume it calls pm_runtime_enable() and pm_runtime_put_sync() + * During system resume it calls pm_runtime_enable() and pm_runtime_put() for every device right after executing the subsystem-level .resume_early() callback and right after executing the subsystem-level .resume() callback for it, respectively. diff --git a/Documentation/security/00-INDEX index 414235c1fcf..45c82fd3e9d 100644 --- a/Documentation/security/00-INDEX +++ b/Documentation/security/00-INDEX @@ -22,3 +22,5 @@ keys.txt - description of the kernel key retention service. tomoyo.txt - documentation on the TOMOYO Linux Security Module. +IMA-templates.txt + - documentation on the template management mechanism for IMA. diff --git a/Documentation/security/IMA-templates.txt b/Documentation/security/IMA-templates.txt new file mode 100644 index 00000000000..a777e5f1df5 --- /dev/null +++ b/Documentation/security/IMA-templates.txt @@ -0,0 +1,87 @@ + IMA Template Management Mechanism + + +==== INTRODUCTION ==== + +The original 'ima' template is fixed length, containing the filedata hash +and pathname. The filedata hash is limited to 20 bytes (md5/sha1). +The pathname is a null terminated string, limited to 255 characters. +To overcome these limitations and to add additional file metadata, it is +necessary to extend the current version of IMA by defining additional +templates. For example, information that could possibly be reported includes +the inode UID/GID or the LSM labels of both the inode and of the process +that is accessing it. + +However, the main problem with introducing this feature is that, each time +a new template is defined, the functions that generate and display +the measurements list would include the code for handling a new format +and, thus, would significantly grow over time. + +The proposed solution solves this problem by separating the template +management from the remaining IMA code. The core of this solution is the +definition of two new data structures: a template descriptor, to determine +which information should be included in the measurement list; and a template +field, to generate and display data of a given type. + +Managing templates with these structures is very simple. To support +a new data type, developers define the field identifier and implement +two functions, init() and show(), respectively to generate and display +measurement entries. Defining a new template descriptor requires +specifying the template format, a string of field identifiers separated +by the '|' character. While in the current implementation it is possible +to define new template descriptors only by adding their definition in the +template specific code (ima_template.c), in a future version it will be +possible to register a new template on a running kernel by supplying to IMA +the desired format string. In this version, IMA initializes at boot time +all defined template descriptors by translating the format into an array +of template fields structures taken from the set of the supported ones. 
+ +After the initialization step, IMA will call ima_alloc_init_template() +(new function defined within the patches for the new template management +mechanism) to generate a new measurement entry by using the template +descriptor chosen through the kernel configuration or through the newly +introduced 'ima_template=' kernel command line parameter. It is during this +phase that the advantages of the new architecture are clearly shown: +the latter function will not contain specific code to handle a given template +but, instead, it simply calls the init() method of the template fields +associated with the chosen template descriptor and stores the result (pointer +to allocated data and data length) in the measurement entry structure. + +The same mechanism is employed to display measurement entries. +The functions ima[_ascii]_measurements_show() retrieve, for each entry, +the template descriptor used to produce that entry and call the show() +method for each item of the array of template fields structures. + + + +==== SUPPORTED TEMPLATE FIELDS AND DESCRIPTORS ==== + +The following is the list of supported template fields +('<identifier>': description) that can be used to define new template +descriptors by adding their identifier to the format string +(support for more data types will be added later): + + - 'd': the digest of the event (i.e. the digest of a measured file), + calculated with the SHA1 or MD5 hash algorithm; + - 'n': the name of the event (i.e. the file name), with size up to 255 bytes; + - 'd-ng': the digest of the event, calculated with an arbitrary hash + algorithm (field format: [<hash algo>:]digest, where the digest + prefix is shown only if the hash algorithm is not SHA1 or MD5); + - 'n-ng': the name of the event, without size limitations. + + +Below is the list of defined template descriptors: + - "ima": its format is 'd|n'; + - "ima-ng" (default): its format is 'd-ng|n-ng'. + + + +==== USE ==== + +To specify the template descriptor to be used to generate measurement entries, +currently the following methods are supported: + + - select a template descriptor among those supported in the kernel + configuration ('ima-ng' is the default choice); + - specify a template descriptor name from the kernel command line through + the 'ima_template=' parameter. diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 7b4145d0045..a4c33f1a7c6 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -865,15 +865,14 @@ encountered: calling processes has a searchable link to the key from one of its keyrings. There are three functions for dealing with these: - key_ref_t make_key_ref(const struct key *key, - unsigned long possession); + key_ref_t make_key_ref(const struct key *key, bool possession); struct key *key_ref_to_ptr(const key_ref_t key_ref); - unsigned long is_key_possessed(const key_ref_t key_ref); + bool is_key_possessed(const key_ref_t key_ref); The first function constructs a key reference from a key pointer and - possession information (which must be 0 or 1 and not any other value). + possession information (which must be true or false). The second function retrieves the key pointer from a reference and the third retrieves the possession flag. @@ -961,14 +960,17 @@ payload contents" for more information. the argument will not be parsed. 
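To make the bool-based possession helpers from the keys.txt hunk above concrete, a sketch - example() is hypothetical; the three helpers are the ones documented there:

	#include <linux/key.h>

	static bool example(struct key *key)
	{
		/* Build a reference that records possession... */
		key_ref_t ref = make_key_ref(key, true);

		/* ...and take it apart again. */
		struct key *k = key_ref_to_ptr(ref);	/* == key */
		bool possessed = is_key_possessed(ref);	/* == true */

		return k == key && possessed;
	}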
-(*) Extra references can be made to a key by calling the following function: +(*) Extra references can be made to a key by calling one of the following + functions: + struct key *__key_get(struct key *key); struct key *key_get(struct key *key); - These need to be disposed of by calling key_put() when they've been - finished with. The key pointer passed in will be returned. If the pointer - is NULL or CONFIG_KEYS is not set then the key will not be dereferenced and - no increment will take place. + Keys so referenced will need to be disposed of by calling key_put() when + they've been finished with. The key pointer passed in will be returned. + + In the case of key_get(), if the pointer is NULL or CONFIG_KEYS is not set + then the key will not be dereferenced and no increment will take place. (*) A key's serial number can be obtained by calling: diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock index 7521d367f21..6dea4fd5c96 100644 --- a/Documentation/vm/split_page_table_lock +++ b/Documentation/vm/split_page_table_lock @@ -63,9 +63,9 @@ levels. PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table allocation and pgtable_pmd_page_dtor() on freeing. -Allocation usually happens in pmd_alloc_one(), freeing in pmd_free(), but -make sure you cover all PMD table allocation / freeing paths: i.e X86_PAE -preallocate few PMDs on pgd_alloc(). +Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and +pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing +paths: i.e. X86_PAE preallocates a few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. diff --git a/MAINTAINERS b/MAINTAINERS index 63f30484932..8285ed4676b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4065,6 +4065,7 @@ F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c F: drivers/hid/hid-hyperv.c F: drivers/hv/ +F: drivers/input/serio/hyperv-keyboard.c F: drivers/net/hyperv/ F: drivers/scsi/storvsc_drv.c F: drivers/video/hyperv_fb.c @@ -7515,9 +7516,10 @@ SELINUX SECURITY MODULE M: Stephen Smalley <sds@tycho.nsa.gov> M: James Morris <james.l.morris@oracle.com> M: Eric Paris <eparis@parisplace.org> +M: Paul Moore <paul@paul-moore.com> L: selinux@tycho.nsa.gov (subscribers-only, general discussion) W: http://selinuxproject.org -T: git git://git.infradead.org/users/eparis/selinux.git +T: git git://git.infradead.org/users/pcmoore/selinux S: Supported F: include/linux/selinux* F: security/selinux/ @@ -8664,6 +8666,7 @@ F: drivers/media/usb/tm6000/ TPM DEVICE DRIVER M: Leonidas Da Silva Barbosa <leosilva@linux.vnet.ibm.com> M: Ashley Lai <ashley@ashleylai.com> +M: Peter Huewe <peterhuewe@gmx.de> M: Rajiv Andrade <mail@srajiv.net> W: http://tpmdd.sourceforge.net M: Marcel Selhorst <tpmdd@selhorst.net> @@ -9522,8 +9525,8 @@ F: drivers/xen/*swiotlb* XFS FILESYSTEM P: Silicon Graphics Inc +M: Dave Chinner <dchinner@fromorbit.com> M: Ben Myers <bpm@sgi.com> -M: Alex Elder <elder@kernel.org> M: xfs@oss.sgi.com L: xfs@oss.sgi.com W: http://oss.sgi.com/projects/xfs diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 135c674eaf9..d39dc9b95a2 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -16,8 +16,8 @@ config ALPHA select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE + select GENERIC_CLOCKEVENTS select GENERIC_SMP_IDLE_THREAD - select GENERIC_CMOS_UPDATE select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select 
HAVE_MOD_ARCH_SPECIFIC @@ -488,6 +488,20 @@ config VGA_HOSE which always have multiple hoses, and whose consoles support it. +config ALPHA_QEMU + bool "Run under QEMU emulation" + depends on !ALPHA_GENERIC + ---help--- + Assume the presence of special features supported by QEMU PALcode + that reduce the overhead of system emulation. + + Generic kernels will auto-detect QEMU. But when building a + system-specific kernel, the assumption is that we want to + eliminate as many runtime tests as possible. + + If unsure, say N. + + config ALPHA_SRM bool "Use SRM as bootloader" if ALPHA_CABRIOLET || ALPHA_AVANTI_CH || ALPHA_EB64P || ALPHA_PC164 || ALPHA_TAKARA || ALPHA_EB164 || ALPHA_ALCOR || ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_NAUTILUS || ALPHA_NONAME depends on TTY @@ -572,6 +586,30 @@ config NUMA Access). This option is for configuring high-end multiprocessor server machines. If in doubt, say N. +config ALPHA_WTINT + bool "Use WTINT" if ALPHA_SRM || ALPHA_GENERIC + default y if ALPHA_QEMU + default n if ALPHA_EV5 || ALPHA_EV56 || (ALPHA_EV4 && !ALPHA_LCA) + default n if !ALPHA_SRM && !ALPHA_GENERIC + default y if SMP + ---help--- + The Wait for Interrupt (WTINT) PALcall attempts to put the CPU + to sleep until the next interrupt. This may reduce the power + consumed and the heat produced by the computer. However, it has + the side effect of making the cycle counter unreliable as a timing + device across the sleep. + + For emulation under QEMU, definitely say Y here, as we have other + mechanisms for measuring time than the cycle counter. + + For EV4 (but not LCA), EV5 and EV56 systems, or for systems running + MILO, sleep mode is not supported so you might as well say N here. + + For SMP systems we cannot use the cycle counter for timing anyway, + so you might as well say Y here. + + If unsure, say N. + config NODES_SHIFT int default "7" @@ -613,9 +651,41 @@ config VERBOSE_MCHECK_ON Take the default (1) unless you want more control or more info. +choice + prompt "Timer interrupt frequency (HZ)?" + default HZ_128 if ALPHA_QEMU + default HZ_1200 if ALPHA_RAWHIDE + default HZ_1024 + ---help--- + The frequency at which timer interrupts occur. A high frequency + minimizes latency, whereas a low frequency minimizes overhead of + process accounting. The latter effect is especially significant + when being run under QEMU. + + Note that some Alpha hardware cannot change the interrupt frequency + of the timer. If unsure, say 1024 (or 1200 for Rawhide). 
+ + config HZ_32 + bool "32 Hz" + config HZ_64 + bool "64 Hz" + config HZ_128 + bool "128 Hz" + config HZ_256 + bool "256 Hz" + config HZ_1024 + bool "1024 Hz" + config HZ_1200 + bool "1200 Hz" +endchoice + config HZ - int - default 1200 if ALPHA_RAWHIDE + int + default 32 if HZ_32 + default 64 if HZ_64 + default 128 if HZ_128 + default 256 if HZ_256 + default 1200 if HZ_1200 default 1024 source "drivers/pci/Kconfig" diff --git a/arch/alpha/include/asm/machvec.h b/arch/alpha/include/asm/machvec.h index 72dbf235927..75cb3641ed2 100644 --- a/arch/alpha/include/asm/machvec.h +++ b/arch/alpha/include/asm/machvec.h @@ -33,6 +33,7 @@ struct alpha_machine_vector int nr_irqs; int rtc_port; + int rtc_boot_cpu_only; unsigned int max_asn; unsigned long max_isa_dma_address; unsigned long irq_probe_mask; @@ -95,9 +96,6 @@ struct alpha_machine_vector struct _alpha_agp_info *(*agp_info)(void); - unsigned int (*rtc_get_time)(struct rtc_time *); - int (*rtc_set_time)(struct rtc_time *); - const char *vector_name; /* NUMA information */ @@ -126,13 +124,19 @@ extern struct alpha_machine_vector alpha_mv; #ifdef CONFIG_ALPHA_GENERIC extern int alpha_using_srm; +extern int alpha_using_qemu; #else -#ifdef CONFIG_ALPHA_SRM -#define alpha_using_srm 1 -#else -#define alpha_using_srm 0 -#endif +# ifdef CONFIG_ALPHA_SRM +# define alpha_using_srm 1 +# else +# define alpha_using_srm 0 +# endif +# ifdef CONFIG_ALPHA_QEMU +# define alpha_using_qemu 1 +# else +# define alpha_using_qemu 0 +# endif #endif /* GENERIC */ -#endif +#endif /* __KERNEL__ */ #endif /* __ALPHA_MACHVEC_H */ diff --git a/arch/alpha/include/asm/pal.h b/arch/alpha/include/asm/pal.h index 6fcd2b5b08f..5422a47646f 100644 --- a/arch/alpha/include/asm/pal.h +++ b/arch/alpha/include/asm/pal.h @@ -89,6 +89,7 @@ __CALL_PAL_W1(wrmces, unsigned long); __CALL_PAL_RW2(wrperfmon, unsigned long, unsigned long, unsigned long); __CALL_PAL_W1(wrusp, unsigned long); __CALL_PAL_W1(wrvptptr, unsigned long); +__CALL_PAL_RW1(wtint, unsigned long, unsigned long); /* * TB routines.. @@ -111,5 +112,75 @@ __CALL_PAL_W1(wrvptptr, unsigned long); #define tbiap() __tbi(-1, /* no second argument */) #define tbia() __tbi(-2, /* no second argument */) +/* + * QEMU Cserv routines.. 
+ */ + +static inline unsigned long +qemu_get_walltime(void) +{ + register unsigned long v0 __asm__("$0"); + register unsigned long a0 __asm__("$16") = 3; + + asm("call_pal %2 # cserve get_time" + : "=r"(v0), "+r"(a0) + : "i"(PAL_cserve) + : "$17", "$18", "$19", "$20", "$21"); + + return v0; +} + +static inline unsigned long +qemu_get_alarm(void) +{ + register unsigned long v0 __asm__("$0"); + register unsigned long a0 __asm__("$16") = 4; + + asm("call_pal %2 # cserve get_alarm" + : "=r"(v0), "+r"(a0) + : "i"(PAL_cserve) + : "$17", "$18", "$19", "$20", "$21"); + + return v0; +} + +static inline void +qemu_set_alarm_rel(unsigned long expire) +{ + register unsigned long a0 __asm__("$16") = 5; + register unsigned long a1 __asm__("$17") = expire; + + asm volatile("call_pal %2 # cserve set_alarm_rel" + : "+r"(a0), "+r"(a1) + : "i"(PAL_cserve) + : "$0", "$18", "$19", "$20", "$21"); +} + +static inline void +qemu_set_alarm_abs(unsigned long expire) +{ + register unsigned long a0 __asm__("$16") = 6; + register unsigned long a1 __asm__("$17") = expire; + + asm volatile("call_pal %2 # cserve set_alarm_abs" + : "+r"(a0), "+r"(a1) + : "i"(PAL_cserve) + : "$0", "$18", "$19", "$20", "$21"); +} + +static inline unsigned long +qemu_get_vmtime(void) +{ + register unsigned long v0 __asm__("$0"); + register unsigned long a0 __asm__("$16") = 7; + + asm("call_pal %2 # cserve get_time" + : "=r"(v0), "+r"(a0) + : "i"(PAL_cserve) + : "$17", "$18", "$19", "$20", "$21"); + + return v0; +} + #endif /* !__ASSEMBLY__ */ #endif /* __ALPHA_PAL_H */ diff --git a/arch/alpha/include/asm/rtc.h b/arch/alpha/include/asm/rtc.h index d70408d3667..f71c3b0ed36 100644 --- a/arch/alpha/include/asm/rtc.h +++ b/arch/alpha/include/asm/rtc.h @@ -1,12 +1 @@ -#ifndef _ALPHA_RTC_H -#define _ALPHA_RTC_H - -#if defined(CONFIG_ALPHA_MARVEL) && defined(CONFIG_SMP) \ - || defined(CONFIG_ALPHA_GENERIC) -# define get_rtc_time alpha_mv.rtc_get_time -# define set_rtc_time alpha_mv.rtc_set_time -#endif - #include <asm-generic/rtc.h> - -#endif diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h index b02b8a28294..c2911f59170 100644 --- a/arch/alpha/include/asm/string.h +++ b/arch/alpha/include/asm/string.h @@ -22,15 +22,27 @@ extern void * __memcpy(void *, const void *, size_t); #define __HAVE_ARCH_MEMSET extern void * __constant_c_memset(void *, unsigned long, size_t); +extern void * ___memset(void *, int, size_t); extern void * __memset(void *, int, size_t); extern void * memset(void *, int, size_t); -#define memset(s, c, n) \ -(__builtin_constant_p(c) \ - ? (__builtin_constant_p(n) && (c) == 0 \ - ? __builtin_memset((s),0,(n)) \ - : __constant_c_memset((s),0x0101010101010101UL*(unsigned char)(c),(n))) \ - : __memset((s),(c),(n))) +/* For gcc 3.x, we cannot have the inline function named "memset" because + the __builtin_memset will attempt to resolve to the inline as well, + leading to a "sorry" about unimplemented recursive inlining. 
*/ +extern inline void *__memset(void *s, int c, size_t n) +{ + if (__builtin_constant_p(c)) { + if (__builtin_constant_p(n)) { + return __builtin_memset(s, c, n); + } else { + unsigned long c8 = (c & 0xff) * 0x0101010101010101UL; + return __constant_c_memset(s, c8, n); + } + } + return ___memset(s, c, n); +} + +#define memset __memset #define __HAVE_ARCH_STRCPY extern char * strcpy(char *,const char *); diff --git a/arch/alpha/include/uapi/asm/pal.h b/arch/alpha/include/uapi/asm/pal.h index 3c0ce08e5f5..dfc8140b908 100644 --- a/arch/alpha/include/uapi/asm/pal.h +++ b/arch/alpha/include/uapi/asm/pal.h @@ -46,6 +46,7 @@ #define PAL_rdusp 58 #define PAL_whami 60 #define PAL_retsys 61 +#define PAL_wtint 62 #define PAL_rti 63 diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile index 84ec46b38f7..0d54650e78f 100644 --- a/arch/alpha/kernel/Makefile +++ b/arch/alpha/kernel/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_PCI) += pci.o pci_iommu.o pci-sysfs.o obj-$(CONFIG_SRM_ENV) += srm_env.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_RTC_DRV_ALPHA) += rtc.o ifdef CONFIG_ALPHA_GENERIC diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c index 89566b346c0..f4c7ab6f43b 100644 --- a/arch/alpha/kernel/alpha_ksyms.c +++ b/arch/alpha/kernel/alpha_ksyms.c @@ -40,6 +40,7 @@ EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(__memset); +EXPORT_SYMBOL(___memset); EXPORT_SYMBOL(__memsetw); EXPORT_SYMBOL(__constant_c_memset); EXPORT_SYMBOL(copy_page); diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c index 28e4429596f..1c8625cb0e2 100644 --- a/arch/alpha/kernel/irq_alpha.c +++ b/arch/alpha/kernel/irq_alpha.c @@ -66,21 +66,7 @@ do_entInt(unsigned long type, unsigned long vector, break; case 1: old_regs = set_irq_regs(regs); -#ifdef CONFIG_SMP - { - long cpu; - - smp_percpu_timer_interrupt(regs); - cpu = smp_processor_id(); - if (cpu != boot_cpuid) { - kstat_incr_irqs_this_cpu(RTC_IRQ, irq_to_desc(RTC_IRQ)); - } else { - handle_irq(RTC_IRQ); - } - } -#else handle_irq(RTC_IRQ); -#endif set_irq_regs(old_regs); return; case 2: @@ -228,7 +214,7 @@ process_mcheck_info(unsigned long vector, unsigned long la_ptr, */ struct irqaction timer_irqaction = { - .handler = timer_interrupt, + .handler = rtc_timer_interrupt, .name = "timer", }; diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h index 7fa62488bd1..f54bdf658cd 100644 --- a/arch/alpha/kernel/machvec_impl.h +++ b/arch/alpha/kernel/machvec_impl.h @@ -43,10 +43,7 @@ #define CAT1(x,y) x##y #define CAT(x,y) CAT1(x,y) -#define DO_DEFAULT_RTC \ - .rtc_port = 0x70, \ - .rtc_get_time = common_get_rtc_time, \ - .rtc_set_time = common_set_rtc_time +#define DO_DEFAULT_RTC .rtc_port = 0x70 #define DO_EV4_MMU \ .max_asn = EV4_MAX_ASN, \ diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index d821b17047e..c52e7f0ee5f 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -83,6 +83,8 @@ struct alpha_pmu_t { long pmc_left[3]; /* Subroutine for allocation of PMCs. Enforces constraints. */ int (*check_constraints)(struct perf_event **, unsigned long *, int); + /* Subroutine for checking validity of a raw event for this PMU. 
*/ + int (*raw_event_valid)(u64 config); }; /* @@ -203,6 +205,12 @@ success: } +static int ev67_raw_event_valid(u64 config) +{ + return config >= EV67_CYCLES && config < EV67_LAST_ET; +}; + + static const struct alpha_pmu_t ev67_pmu = { .event_map = ev67_perfmon_event_map, .max_events = ARRAY_SIZE(ev67_perfmon_event_map), @@ -211,7 +219,8 @@ static const struct alpha_pmu_t ev67_pmu = { .pmc_count_mask = {EV67_PCTR_0_COUNT_MASK, EV67_PCTR_1_COUNT_MASK, 0}, .pmc_max_period = {(1UL<<20) - 1, (1UL<<20) - 1, 0}, .pmc_left = {16, 4, 0}, - .check_constraints = ev67_check_constraints + .check_constraints = ev67_check_constraints, + .raw_event_valid = ev67_raw_event_valid, }; @@ -609,7 +618,9 @@ static int __hw_perf_event_init(struct perf_event *event) } else if (attr->type == PERF_TYPE_HW_CACHE) { return -EOPNOTSUPP; } else if (attr->type == PERF_TYPE_RAW) { - ev = attr->config & 0xff; + if (!alpha_pmu->raw_event_valid(attr->config)) + return -EINVAL; + ev = attr->config; } else { return -EOPNOTSUPP; } diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index f2360a74e5d..1941a07b581 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -46,6 +46,23 @@ void (*pm_power_off)(void) = machine_power_off; EXPORT_SYMBOL(pm_power_off); +#ifdef CONFIG_ALPHA_WTINT +/* + * Sleep the CPU. + * EV6, LCA45 and QEMU know how to power down, skipping N timer interrupts. + */ +void arch_cpu_idle(void) +{ + wtint(0); + local_irq_enable(); +} + +void arch_cpu_idle_dead(void) +{ + wtint(INT_MAX); +} +#endif /* ALPHA_WTINT */ + struct halt_info { int mode; char *restart_cmd; diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h index d3e52d3fd59..da2d6ec9c37 100644 --- a/arch/alpha/kernel/proto.h +++ b/arch/alpha/kernel/proto.h @@ -135,17 +135,15 @@ extern void unregister_srm_console(void); /* smp.c */ extern void setup_smp(void); extern void handle_ipi(struct pt_regs *); -extern void smp_percpu_timer_interrupt(struct pt_regs *); /* bios32.c */ /* extern void reset_for_srm(void); */ /* time.c */ -extern irqreturn_t timer_interrupt(int irq, void *dev); +extern irqreturn_t rtc_timer_interrupt(int irq, void *dev); +extern void init_clockevent(void); extern void common_init_rtc(void); extern unsigned long est_cycle_freq; -extern unsigned int common_get_rtc_time(struct rtc_time *time); -extern int common_set_rtc_time(struct rtc_time *time); /* smc37c93x.c */ extern void SMC93x_Init(void); diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c new file mode 100644 index 00000000000..c8d284d8521 --- /dev/null +++ b/arch/alpha/kernel/rtc.c @@ -0,0 +1,323 @@ +/* + * linux/arch/alpha/kernel/rtc.c + * + * Copyright (C) 1991, 1992, 1995, 1999, 2000 Linus Torvalds + * + * This file contains date handling. + */ +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/mc146818rtc.h> +#include <linux/bcd.h> +#include <linux/rtc.h> +#include <linux/platform_device.h> + +#include <asm/rtc.h> + +#include "proto.h" + + +/* + * Support for the RTC device. + * + * We don't want to use the rtc-cmos driver, because we don't want to support + * alarms, as that would be indistinguishable from timer interrupts. + * + * Further, generic code is really, really tied to a 1900 epoch. This is + * true in __get_rtc_time as well as the users of struct rtc_time e.g. + * rtc_tm_to_time. Thankfully all of the other epochs in use are later + * than 1900, and so it's easy to adjust. 
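+ *
+ * As a concrete walk-through of the fixup in alpha_rtc_read_time
+ * below: a Digital UNIX box using epoch 1952 stores 45 in the year
+ * register for 1997.  The generic __get_rtc_time maps that to
+ * tm_year = 145 (i.e. 2045); the fixup undoes the century step
+ * (145 -> 45), adds 1952 - 1900 = 52 to get 97, and 1900 + 97
+ * gives 1997 as intended.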
+ */ + +static unsigned long rtc_epoch; + +static int __init +specify_epoch(char *str) +{ + unsigned long epoch = simple_strtoul(str, NULL, 0); + if (epoch < 1900) + printk("Ignoring invalid user specified epoch %lu\n", epoch); + else + rtc_epoch = epoch; + return 1; +} +__setup("epoch=", specify_epoch); + +static void __init +init_rtc_epoch(void) +{ + int epoch, year, ctrl; + + if (rtc_epoch != 0) { + /* The epoch was specified on the command-line. */ + return; + } + + /* Detect the epoch in use on this computer. */ + ctrl = CMOS_READ(RTC_CONTROL); + year = CMOS_READ(RTC_YEAR); + if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + year = bcd2bin(year); + + /* PC-like is standard; used for year >= 70 */ + epoch = 1900; + if (year < 20) { + epoch = 2000; + } else if (year >= 20 && year < 48) { + /* NT epoch */ + epoch = 1980; + } else if (year >= 48 && year < 70) { + /* Digital UNIX epoch */ + epoch = 1952; + } + rtc_epoch = epoch; + + printk(KERN_INFO "Using epoch %d for rtc year %d\n", epoch, year); +} + +static int +alpha_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + __get_rtc_time(tm); + + /* Adjust for non-default epochs. It's easier to depend on the + generic __get_rtc_time and adjust the epoch here than create + a copy of __get_rtc_time with the edits we need. */ + if (rtc_epoch != 1900) { + int year = tm->tm_year; + /* Undo the century adjustment made in __get_rtc_time. */ + if (year >= 100) + year -= 100; + year += rtc_epoch - 1900; + /* Redo the century adjustment with the epoch in place. */ + if (year <= 69) + year += 100; + tm->tm_year = year; + } + + return rtc_valid_tm(tm); +} + +static int +alpha_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct rtc_time xtm; + + if (rtc_epoch != 1900) { + xtm = *tm; + xtm.tm_year -= rtc_epoch - 1900; + tm = &xtm; + } + + return __set_rtc_time(tm); +} + +static int +alpha_rtc_set_mmss(struct device *dev, unsigned long nowtime) +{ + int retval = 0; + int real_seconds, real_minutes, cmos_minutes; + unsigned char save_control, save_freq_select; + + /* Note: This code only updates minutes and seconds. Comments + indicate this was to avoid messing with unknown time zones, + and with the epoch nonsense described above. In order for + this to work, the existing clock cannot be off by more than + 15 minutes. + + ??? This choice may be out of date. The x86 port does + not have problems with timezones, and the epoch processing has + now been fixed in alpha_rtc_set_time. + + In either case, one can always force a full rtc update with + the userland hwclock program, so surely 15 minute accuracy + is no real burden. */ + + /* In order to set the CMOS clock precisely, we have to be called + 500 ms after the second nowtime has started, because when + nowtime is written into the registers of the CMOS clock, it will + jump to the next second precisely 500 ms later. Check the Motorola + MC146818A or Dallas DS12887 data sheet for details. 
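+
+	  As a worked example of the half-hour correction applied
+	  below: if the CMOS clock reads minute 50 and the new time is
+	  minute 20, the difference is 30, ((30 + 15) / 30) & 1 == 1,
+	  so 30 is added and real_minutes becomes 50 again -- only the
+	  seconds are rewritten, which is the intended behaviour for a
+	  half-hour time zone offset.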
*/ + + /* irq are locally disabled here */ + spin_lock(&rtc_lock); + /* Tell the clock it's being set */ + save_control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + + /* Stop and reset prescaler */ + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + cmos_minutes = CMOS_READ(RTC_MINUTES); + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + cmos_minutes = bcd2bin(cmos_minutes); + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) { + /* correct for half hour time zone */ + real_minutes += 30; + } + real_minutes %= 60; + + if (abs(real_minutes - cmos_minutes) < 30) { + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); + } + CMOS_WRITE(real_seconds,RTC_SECONDS); + CMOS_WRITE(real_minutes,RTC_MINUTES); + } else { + printk_once(KERN_NOTICE + "set_rtc_mmss: can't update from %d to %d\n", + cmos_minutes, real_minutes); + retval = -1; + } + + /* The following flags have to be released exactly in this order, + * otherwise the DS12887 (popular MC146818A clone with integrated + * battery and quartz) will not reset the oscillator and will not + * update precisely 500 ms later. You won't find this mentioned in + * the Dallas Semiconductor data sheets, but who believes data + * sheets anyway ... -- Markus Kuhn + */ + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + spin_unlock(&rtc_lock); + + return retval; +} + +static int +alpha_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case RTC_EPOCH_READ: + return put_user(rtc_epoch, (unsigned long __user *)arg); + case RTC_EPOCH_SET: + if (arg < 1900) + return -EINVAL; + rtc_epoch = arg; + return 0; + default: + return -ENOIOCTLCMD; + } +} + +static const struct rtc_class_ops alpha_rtc_ops = { + .read_time = alpha_rtc_read_time, + .set_time = alpha_rtc_set_time, + .set_mmss = alpha_rtc_set_mmss, + .ioctl = alpha_rtc_ioctl, +}; + +/* + * Similarly, except do the actual CMOS access on the boot cpu only. + * This requires marshalling the data across an interprocessor call. 
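+ *
+ * The marshalling pattern below is the usual way to funnel a call
+ * to one particular CPU: pack the argument and a return slot into a
+ * single object, hand its address through the void *data cookie,
+ * and invoke
+ *
+ *	smp_call_function_single(boot_cpuid, fn, &obj, 1);
+ *
+ * where the final wait=1 argument guarantees the return slot has
+ * been filled in by the time the call returns.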
+ */ + +#if defined(CONFIG_SMP) && \ + (defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_MARVEL)) +# define HAVE_REMOTE_RTC 1 + +union remote_data { + struct rtc_time *tm; + unsigned long now; + long retval; +}; + +static void +do_remote_read(void *data) +{ + union remote_data *x = data; + x->retval = alpha_rtc_read_time(NULL, x->tm); +} + +static int +remote_read_time(struct device *dev, struct rtc_time *tm) +{ + union remote_data x; + if (smp_processor_id() != boot_cpuid) { + x.tm = tm; + smp_call_function_single(boot_cpuid, do_remote_read, &x, 1); + return x.retval; + } + return alpha_rtc_read_time(NULL, tm); +} + +static void +do_remote_set(void *data) +{ + union remote_data *x = data; + x->retval = alpha_rtc_set_time(NULL, x->tm); +} + +static int +remote_set_time(struct device *dev, struct rtc_time *tm) +{ + union remote_data x; + if (smp_processor_id() != boot_cpuid) { + x.tm = tm; + smp_call_function_single(boot_cpuid, do_remote_set, &x, 1); + return x.retval; + } + return alpha_rtc_set_time(NULL, tm); +} + +static void +do_remote_mmss(void *data) +{ + union remote_data *x = data; + x->retval = alpha_rtc_set_mmss(NULL, x->now); +} + +static int +remote_set_mmss(struct device *dev, unsigned long now) +{ + union remote_data x; + if (smp_processor_id() != boot_cpuid) { + x.now = now; + smp_call_function_single(boot_cpuid, do_remote_mmss, &x, 1); + return x.retval; + } + return alpha_rtc_set_mmss(NULL, now); +} + +static const struct rtc_class_ops remote_rtc_ops = { + .read_time = remote_read_time, + .set_time = remote_set_time, + .set_mmss = remote_set_mmss, + .ioctl = alpha_rtc_ioctl, +}; +#endif + +static int __init +alpha_rtc_init(void) +{ + const struct rtc_class_ops *ops; + struct platform_device *pdev; + struct rtc_device *rtc; + const char *name; + + init_rtc_epoch(); + name = "rtc-alpha"; + ops = &alpha_rtc_ops; + +#ifdef HAVE_REMOTE_RTC + if (alpha_mv.rtc_boot_cpu_only) + ops = &remote_rtc_ops; +#endif + + pdev = platform_device_register_simple(name, -1, NULL, 0); + rtc = devm_rtc_device_register(&pdev->dev, name, ops, THIS_MODULE); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + platform_set_drvdata(pdev, rtc); + return 0; +} +device_initcall(alpha_rtc_init); diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 9e3107cc5eb..b20af76f12c 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -115,10 +115,17 @@ unsigned long alpha_agpgart_size = DEFAULT_AGP_APER_SIZE; #ifdef CONFIG_ALPHA_GENERIC struct alpha_machine_vector alpha_mv; +#endif + +#ifndef alpha_using_srm int alpha_using_srm; EXPORT_SYMBOL(alpha_using_srm); #endif +#ifndef alpha_using_qemu +int alpha_using_qemu; +#endif + static struct alpha_machine_vector *get_sysvec(unsigned long, unsigned long, unsigned long); static struct alpha_machine_vector *get_sysvec_byname(const char *); @@ -529,11 +536,15 @@ setup_arch(char **cmdline_p) atomic_notifier_chain_register(&panic_notifier_list, &alpha_panic_block); -#ifdef CONFIG_ALPHA_GENERIC +#ifndef alpha_using_srm /* Assume that we've booted from SRM if we haven't booted from MILO. Detect the later by looking for "MILO" in the system serial nr. */ alpha_using_srm = strncmp((const char *)hwrpb->ssn, "MILO", 4) != 0; #endif +#ifndef alpha_using_qemu + /* Similarly, look for QEMU. 
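+	   Like the MILO check above, this is a substring test on the
+	   system serial number in the HWRPB: under QEMU the SSN
+	   contains the string "QEMU", and strstr() returns NULL
+	   otherwise, so the != 0 comparison yields a clean 0/1 flag.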
*/ + alpha_using_qemu = strstr((const char *)hwrpb->ssn, "QEMU") != 0; +#endif /* If we are using SRM, we want to allow callbacks as early as possible, so do this NOW, and then @@ -1207,6 +1218,7 @@ show_cpuinfo(struct seq_file *f, void *slot) char *systype_name; char *sysvariation_name; int nr_processors; + unsigned long timer_freq; cpu_index = (unsigned) (cpu->type - 1); cpu_name = "Unknown"; @@ -1218,6 +1230,12 @@ show_cpuinfo(struct seq_file *f, void *slot) nr_processors = get_nr_processors(cpu, hwrpb->nr_processors); +#if CONFIG_HZ == 1024 || CONFIG_HZ == 1200 + timer_freq = (100UL * hwrpb->intr_freq) / 4096; +#else + timer_freq = 100UL * CONFIG_HZ; +#endif + seq_printf(f, "cpu\t\t\t: Alpha\n" "cpu model\t\t: %s\n" "cpu variation\t\t: %ld\n" @@ -1243,8 +1261,7 @@ show_cpuinfo(struct seq_file *f, void *slot) (char*)hwrpb->ssn, est_cycle_freq ? : hwrpb->cycle_freq, est_cycle_freq ? "est." : "", - hwrpb->intr_freq / 4096, - (100 * hwrpb->intr_freq / 4096) % 100, + timer_freq / 100, timer_freq % 100, hwrpb->pagesize, hwrpb->pa_bits, hwrpb->max_asn, diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 9dbbcb3b914..99ac36d5de4 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -138,9 +138,11 @@ smp_callin(void) /* Get our local ticker going. */ smp_setup_percpu_timer(cpuid); + init_clockevent(); /* Call platform-specific callin, if specified */ - if (alpha_mv.smp_callin) alpha_mv.smp_callin(); + if (alpha_mv.smp_callin) + alpha_mv.smp_callin(); /* All kernel threads share the same mm context. */ atomic_inc(&init_mm.mm_count); @@ -498,35 +500,6 @@ smp_cpus_done(unsigned int max_cpus) ((bogosum + 2500) / (5000/HZ)) % 100); } - -void -smp_percpu_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs; - int cpu = smp_processor_id(); - unsigned long user = user_mode(regs); - struct cpuinfo_alpha *data = &cpu_data[cpu]; - - old_regs = set_irq_regs(regs); - - /* Record kernel PC. */ - profile_tick(CPU_PROFILING); - - if (!--data->prof_counter) { - /* We need to make like a normal interrupt -- otherwise - timer interrupts ignore the global interrupt lock, - which would be a Bad Thing. 
*/ - irq_enter(); - - update_process_times(user); - - data->prof_counter = data->prof_multiplier; - - irq_exit(); - } - set_irq_regs(old_regs); -} - int setup_profiling_timer(unsigned int multiplier) { diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 5a0af11b3a6..608f2a7fa0a 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -224,8 +224,6 @@ struct alpha_machine_vector jensen_mv __initmv = { .machine_check = jensen_machine_check, .max_isa_dma_address = ALPHA_MAX_ISA_DMA_ADDRESS, .rtc_port = 0x170, - .rtc_get_time = common_get_rtc_time, - .rtc_set_time = common_set_rtc_time, .nr_irqs = 16, .device_interrupt = jensen_device_interrupt, diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index c92e389ff21..f21d61fab67 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -22,7 +22,6 @@ #include <asm/hwrpb.h> #include <asm/tlbflush.h> #include <asm/vga.h> -#include <asm/rtc.h> #include "proto.h" #include "err_impl.h" @@ -400,57 +399,6 @@ marvel_init_rtc(void) init_rtc_irq(); } -struct marvel_rtc_time { - struct rtc_time *time; - int retval; -}; - -#ifdef CONFIG_SMP -static void -smp_get_rtc_time(void *data) -{ - struct marvel_rtc_time *mrt = data; - mrt->retval = __get_rtc_time(mrt->time); -} - -static void -smp_set_rtc_time(void *data) -{ - struct marvel_rtc_time *mrt = data; - mrt->retval = __set_rtc_time(mrt->time); -} -#endif - -static unsigned int -marvel_get_rtc_time(struct rtc_time *time) -{ -#ifdef CONFIG_SMP - struct marvel_rtc_time mrt; - - if (smp_processor_id() != boot_cpuid) { - mrt.time = time; - smp_call_function_single(boot_cpuid, smp_get_rtc_time, &mrt, 1); - return mrt.retval; - } -#endif - return __get_rtc_time(time); -} - -static int -marvel_set_rtc_time(struct rtc_time *time) -{ -#ifdef CONFIG_SMP - struct marvel_rtc_time mrt; - - if (smp_processor_id() != boot_cpuid) { - mrt.time = time; - smp_call_function_single(boot_cpuid, smp_set_rtc_time, &mrt, 1); - return mrt.retval; - } -#endif - return __set_rtc_time(time); -} - static void marvel_smp_callin(void) { @@ -492,8 +440,7 @@ struct alpha_machine_vector marvel_ev7_mv __initmv = { .vector_name = "MARVEL/EV7", DO_EV7_MMU, .rtc_port = 0x70, - .rtc_get_time = marvel_get_rtc_time, - .rtc_set_time = marvel_set_rtc_time, + .rtc_boot_cpu_only = 1, DO_MARVEL_IO, .machine_check = marvel_machine_check, .max_isa_dma_address = ALPHA_MAX_ISA_DMA_ADDRESS, diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index ea339503655..ee39cee8064 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -3,13 +3,7 @@ * * Copyright (C) 1991, 1992, 1995, 1999, 2000 Linus Torvalds * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update + * This file contains the clocksource time handling. * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1997-01-09 Adrian Sun @@ -21,9 +15,6 @@ * 1999-04-16 Thorsten Kranzkowski (dl8bcu@gmx.net) * fixed algorithm in do_gettimeofday() for calculating the precise time * from processor cycle counter (now taking lost_ticks into account) - * 2000-08-13 Jan-Benedict Glaw <jbglaw@lug-owl.de> - * Fixed time_init to be aware of epoches != 1900. 
This prevents - * booting up in 2048 for me;) Code is stolen from rtc.c. * 2003-06-03 R. Scott Bailey <scott.bailey@eds.com> * Tighten sanity in time_init from 1% (10,000 PPM) to 250 PPM */ @@ -46,40 +37,19 @@ #include <asm/uaccess.h> #include <asm/io.h> #include <asm/hwrpb.h> -#include <asm/rtc.h> #include <linux/mc146818rtc.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/clocksource.h> +#include <linux/clockchips.h> #include "proto.h" #include "irq_impl.h" -static int set_rtc_mmss(unsigned long); - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#define TICK_SIZE (tick_nsec / 1000) - -/* - * Shift amount by which scaled_ticks_per_cycle is scaled. Shifting - * by 48 gives us 16 bits for HZ while keeping the accuracy good even - * for large CPU clock rates. - */ -#define FIX_SHIFT 48 - -/* lump static variables together for more efficient access: */ -static struct { - /* cycle counter last time it got invoked */ - __u32 last_time; - /* ticks/cycle * 2^48 */ - unsigned long scaled_ticks_per_cycle; - /* partial unused tick */ - unsigned long partial_tick; -} state; - unsigned long est_cycle_freq; #ifdef CONFIG_IRQ_WORK @@ -108,109 +78,156 @@ static inline __u32 rpcc(void) return __builtin_alpha_rpcc(); } -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} -void read_persistent_clock(struct timespec *ts) + +/* + * The RTC as a clock_event_device primitive. + */ + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ce); + +irqreturn_t +rtc_timer_interrupt(int irq, void *dev) { - unsigned int year, mon, day, hour, min, sec, epoch; - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - - if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hour = bcd2bin(hour); - day = bcd2bin(day); - mon = bcd2bin(mon); - year = bcd2bin(year); - } + int cpu = smp_processor_id(); + struct clock_event_device *ce = &per_cpu(cpu_ce, cpu); - /* PC-like is standard; used for year >= 70 */ - epoch = 1900; - if (year < 20) - epoch = 2000; - else if (year >= 20 && year < 48) - /* NT epoch */ - epoch = 1980; - else if (year >= 48 && year < 70) - /* Digital UNIX epoch */ - epoch = 1952; + /* Don't run the hook for UNUSED or SHUTDOWN. */ + if (likely(ce->mode == CLOCK_EVT_MODE_PERIODIC)) + ce->event_handler(ce); - printk(KERN_INFO "Using epoch = %d\n", epoch); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); + } - if ((year += epoch) < 1970) - year += 100; + return IRQ_HANDLED; +} - ts->tv_sec = mktime(year, mon, day, hour, min, sec); - ts->tv_nsec = 0; +static void +rtc_ce_set_mode(enum clock_event_mode mode, struct clock_event_device *ce) +{ + /* The mode member of CE is updated in generic code. + Since we only support periodic events, nothing to do. */ +} + +static int +rtc_ce_set_next_event(unsigned long evt, struct clock_event_device *ce) +{ + /* This hook is for oneshot mode, which we don't support. 
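+	   Since init_rtc_clockevent below advertises only
+	   CLOCK_EVT_FEAT_PERIODIC, the clockevents core should never
+	   call this; -EINVAL is the conventional "cannot program this
+	   event" answer if it ever does.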
*/ + return -EINVAL; } +static void __init +init_rtc_clockevent(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *ce = &per_cpu(cpu_ce, cpu); + + *ce = (struct clock_event_device){ + .name = "rtc", + .features = CLOCK_EVT_FEAT_PERIODIC, + .rating = 100, + .cpumask = cpumask_of(cpu), + .set_mode = rtc_ce_set_mode, + .set_next_event = rtc_ce_set_next_event, + }; + clockevents_config_and_register(ce, CONFIG_HZ, 0, 0); +} + /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "xtime_update()" routine every clocktick + * The QEMU clock as a clocksource primitive. */ -irqreturn_t timer_interrupt(int irq, void *dev) + +static cycle_t +qemu_cs_read(struct clocksource *cs) { - unsigned long delta; - __u32 now; - long nticks; + return qemu_get_vmtime(); +} -#ifndef CONFIG_SMP - /* Not SMP, do kernel PC profiling here. */ - profile_tick(CPU_PROFILING); -#endif +static struct clocksource qemu_cs = { + .name = "qemu", + .rating = 400, + .read = qemu_cs_read, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .max_idle_ns = LONG_MAX +}; - /* - * Calculate how many ticks have passed since the last update, - * including any previous partial leftover. Save any resulting - * fraction for the next pass. - */ - now = rpcc(); - delta = now - state.last_time; - state.last_time = now; - delta = delta * state.scaled_ticks_per_cycle + state.partial_tick; - state.partial_tick = delta & ((1UL << FIX_SHIFT) - 1); - nticks = delta >> FIX_SHIFT; - if (nticks) - xtime_update(nticks); +/* + * The QEMU alarm as a clock_event_device primitive. + */ - if (test_irq_work_pending()) { - clear_irq_work_pending(); - irq_work_run(); - } +static void +qemu_ce_set_mode(enum clock_event_mode mode, struct clock_event_device *ce) +{ + /* The mode member of CE is updated for us in generic code. + Just make sure that the event is disabled. */ + qemu_set_alarm_abs(0); +} -#ifndef CONFIG_SMP - while (nticks--) - update_process_times(user_mode(get_irq_regs())); -#endif +static int +qemu_ce_set_next_event(unsigned long evt, struct clock_event_device *ce) +{ + qemu_set_alarm_rel(evt); + return 0; +} +static irqreturn_t +qemu_timer_interrupt(int irq, void *dev) +{ + int cpu = smp_processor_id(); + struct clock_event_device *ce = &per_cpu(cpu_ce, cpu); + + ce->event_handler(ce); return IRQ_HANDLED; } +static void __init +init_qemu_clockevent(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *ce = &per_cpu(cpu_ce, cpu); + + *ce = (struct clock_event_device){ + .name = "qemu", + .features = CLOCK_EVT_FEAT_ONESHOT, + .rating = 400, + .cpumask = cpumask_of(cpu), + .set_mode = qemu_ce_set_mode, + .set_next_event = qemu_ce_set_next_event, + }; + + clockevents_config_and_register(ce, NSEC_PER_SEC, 1000, LONG_MAX); +} + + void __init common_init_rtc(void) { - unsigned char x; + unsigned char x, sel = 0; /* Reset periodic interrupt frequency. */ - x = CMOS_READ(RTC_FREQ_SELECT) & 0x3f; - /* Test includes known working values on various platforms - where 0x26 is wrong; we refuse to change those. */ - if (x != 0x26 && x != 0x25 && x != 0x19 && x != 0x06) { - printk("Setting RTC_FREQ to 1024 Hz (%x)\n", x); - CMOS_WRITE(0x26, RTC_FREQ_SELECT); +#if CONFIG_HZ == 1024 || CONFIG_HZ == 1200 + x = CMOS_READ(RTC_FREQ_SELECT) & 0x3f; + /* Test includes known working values on various platforms + where 0x26 is wrong; we refuse to change those. 
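+
+	   The divider arithmetic in the other branch below works
+	   because periodic rate n on the MC146818 yields
+	   32768 >> (n - 1) Hz, hence n = ffs(32768 / HZ): HZ=1024
+	   gives ffs(32) = 6 (the classic 0x26 selector), HZ=256
+	   gives ffs(128) = 8, and HZ=64 gives ffs(512) = 10.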
*/ + if (x != 0x26 && x != 0x25 && x != 0x19 && x != 0x06) { + sel = RTC_REF_CLCK_32KHZ + 6; } +#elif CONFIG_HZ == 256 || CONFIG_HZ == 128 || CONFIG_HZ == 64 || CONFIG_HZ == 32 + sel = RTC_REF_CLCK_32KHZ + __builtin_ffs(32768 / CONFIG_HZ); +#else +# error "Unknown HZ from arch/alpha/Kconfig" +#endif + if (sel) { + printk(KERN_INFO "Setting RTC_FREQ to %d Hz (%x)\n", + CONFIG_HZ, sel); + CMOS_WRITE(sel, RTC_FREQ_SELECT); + } /* Turn on periodic interrupts. */ x = CMOS_READ(RTC_CONTROL); @@ -233,16 +250,37 @@ common_init_rtc(void) init_rtc_irq(); } -unsigned int common_get_rtc_time(struct rtc_time *time) -{ - return __get_rtc_time(time); -} + +#ifndef CONFIG_ALPHA_WTINT +/* + * The RPCC as a clocksource primitive. + * + * While we have free-running timecounters running on all CPUs, and we make + * a half-hearted attempt in init_rtc_rpcc_info to sync the timecounter + * with the wall clock, that initialization isn't kept up-to-date across + * different time counters in SMP mode. Therefore we can only use this + * method when there's only one CPU enabled. + * + * When using the WTINT PALcall, the RPCC may shift to a lower frequency, + * or stop altogether, while waiting for the interrupt. Therefore we cannot + * use this method when WTINT is in use. + */ -int common_set_rtc_time(struct rtc_time *time) +static cycle_t read_rpcc(struct clocksource *cs) { - return __set_rtc_time(time); + return rpcc(); } +static struct clocksource clocksource_rpcc = { + .name = "rpcc", + .rating = 300, + .read = read_rpcc, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS +}; +#endif /* ALPHA_WTINT */ + + /* Validate a computed cycle counter result against the known bounds for the given processor core. There's too much brokenness in the way of timing hardware for any one method to work everywhere. :-( @@ -353,33 +391,6 @@ rpcc_after_update_in_progress(void) return rpcc(); } -#ifndef CONFIG_SMP -/* Until and unless we figure out how to get cpu cycle counters - in sync and keep them there, we can't use the rpcc. */ -static cycle_t read_rpcc(struct clocksource *cs) -{ - cycle_t ret = (cycle_t)rpcc(); - return ret; -} - -static struct clocksource clocksource_rpcc = { - .name = "rpcc", - .rating = 300, - .read = read_rpcc, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS -}; - -static inline void register_rpcc_clocksource(long cycle_freq) -{ - clocksource_register_hz(&clocksource_rpcc, cycle_freq); -} -#else /* !CONFIG_SMP */ -static inline void register_rpcc_clocksource(long cycle_freq) -{ -} -#endif /* !CONFIG_SMP */ - void __init time_init(void) { @@ -387,6 +398,15 @@ time_init(void) unsigned long cycle_freq, tolerance; long diff; + if (alpha_using_qemu) { + clocksource_register_hz(&qemu_cs, NSEC_PER_SEC); + init_qemu_clockevent(); + + timer_irqaction.handler = qemu_timer_interrupt; + init_rtc_irq(); + return; + } + /* Calibrate CPU clock -- attempt #1. */ if (!est_cycle_freq) est_cycle_freq = validate_cc_value(calibrate_cc_with_pit()); @@ -421,100 +441,25 @@ time_init(void) "and unable to estimate a proper value!\n"); } - /* From John Bowman <bowman@math.ualberta.ca>: allow the values - to settle, as the Update-In-Progress bit going low isn't good - enough on some hardware. 2ms is our guess; we haven't found - bogomips yet, but this is close on a 500Mhz box. 
*/ - __delay(1000000); - - - if (HZ > (1<<16)) { - extern void __you_loose (void); - __you_loose(); - } - - register_rpcc_clocksource(cycle_freq); - - state.last_time = cc1; - state.scaled_ticks_per_cycle - = ((unsigned long) HZ << FIX_SHIFT) / cycle_freq; - state.partial_tick = 0L; + /* See above for restrictions on using clocksource_rpcc. */ +#ifndef CONFIG_ALPHA_WTINT + if (hwrpb->nr_processors == 1) + clocksource_register_hz(&clocksource_rpcc, cycle_freq); +#endif /* Startup the timer source. */ alpha_mv.init_rtc(); + init_rtc_clockevent(); } -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be - * called 500 ms after the second nowtime has started, because when - * nowtime is written into the registers of the CMOS clock, it will - * jump to the next second precisely 500 ms later. Check the Motorola - * MC146818A or Dallas DS12887 data sheet for details. - * - * BUG: This routine does not handle hour overflow properly; it just - * sets the minutes. Usually you won't notice until after reboot! - */ - - -static int -set_rtc_mmss(unsigned long nowtime) +/* Initialize the clock_event_device for secondary cpus. */ +#ifdef CONFIG_SMP +void __init +init_clockevent(void) { - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char save_control, save_freq_select; - - /* irq are locally disabled here */ - spin_lock(&rtc_lock); - /* Tell the clock it's being set */ - save_control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); - - /* Stop and reset prescaler */ - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - cmos_minutes = bcd2bin(cmos_minutes); - - /* - * since we're only adjusting minutes and seconds, - * don't interfere with hour overflow. This avoids - * messing with unknown time zones but requires your - * RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) { - /* correct for half hour time zone */ - real_minutes += 30; - } - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) < 30) { - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - real_seconds = bin2bcd(real_seconds); - real_minutes = bin2bcd(real_minutes); - } - CMOS_WRITE(real_seconds,RTC_SECONDS); - CMOS_WRITE(real_minutes,RTC_MINUTES); - } else { - printk_once(KERN_NOTICE - "set_rtc_mmss: can't update from %d to %d\n", - cmos_minutes, real_minutes); - retval = -1; - } - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... 
-- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - spin_unlock(&rtc_lock); - - return retval; + if (alpha_using_qemu) + init_qemu_clockevent(); + else + init_rtc_clockevent(); } +#endif diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index bd0665cdc84..9c4c189eb22 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -241,6 +241,21 @@ do_entIF(unsigned long type, struct pt_regs *regs) (const char *)(data[1] | (long)data[2] << 32), data[0]); } +#ifdef CONFIG_ALPHA_WTINT + if (type == 4) { + /* If CALL_PAL WTINT is totally unsupported by the + PALcode, e.g. MILO, "emulate" it by overwriting + the insn. */ + unsigned int *pinsn + = (unsigned int *) regs->pc - 1; + if (*pinsn == PAL_wtint) { + *pinsn = 0x47e01400; /* mov 0,$0 */ + imb(); + regs->r0 = 0; + return; + } + } +#endif /* ALPHA_WTINT */ die_if_kernel((type == 1 ? "Kernel Bug" : "Instruction fault"), regs, type, NULL); } diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index ffb19b7da99..ff3c10721ca 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -130,7 +130,7 @@ csum_partial_cfu_aligned(const unsigned long __user *src, unsigned long *dst, *dst = word | tmp; checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -185,7 +185,7 @@ csum_partial_cfu_dest_aligned(const unsigned long __user *src, *dst = word | tmp; checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -242,7 +242,7 @@ csum_partial_cfu_src_aligned(const unsigned long __user *src, stq_u(partial_dest | second_dest, dst); out: checksum += carry; - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -325,7 +325,7 @@ csum_partial_cfu_unaligned(const unsigned long __user * src, stq_u(partial_dest | word | second_dest, dst); checksum += carry; } - if (err) *errp = err; + if (err && errp) *errp = err; return checksum; } @@ -339,7 +339,7 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, if (len) { if (!access_ok(VERIFY_READ, src, len)) { - *errp = -EFAULT; + if (errp) *errp = -EFAULT; memset(dst, 0, len); return sum; } diff --git a/arch/alpha/lib/ev6-memset.S b/arch/alpha/lib/ev6-memset.S index d8b94e1c7fc..356bb2fdd70 100644 --- a/arch/alpha/lib/ev6-memset.S +++ b/arch/alpha/lib/ev6-memset.S @@ -30,14 +30,15 @@ .set noat .set noreorder .text + .globl memset .globl __memset + .globl ___memset .globl __memsetw .globl __constant_c_memset - .globl memset - .ent __memset + .ent ___memset .align 5 -__memset: +___memset: .frame $30,0,$26,0 .prologue 0 @@ -227,7 +228,7 @@ end_b: nop nop ret $31,($26),1 # L0 : - .end __memset + .end ___memset /* * This is the original body of code, prior to replication and @@ -594,4 +595,5 @@ end_w: .end __memsetw -memset = __memset +memset = ___memset +__memset = ___memset diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S index 311b8cfc691..76ccc6d1f36 100644 --- a/arch/alpha/lib/memset.S +++ b/arch/alpha/lib/memset.S @@ -19,11 +19,13 @@ .text .globl memset .globl __memset + .globl ___memset .globl __memsetw .globl __constant_c_memset - .ent __memset + + .ent ___memset .align 5 -__memset: +___memset: .frame $30,0,$26,0 .prologue 0 @@ -103,7 +105,7 @@ within_one_quad: end: ret $31,($26),1 /* E1 */ - .end __memset + .end ___memset .align 5 .ent __memsetw @@ -121,4 +123,5 @@ __memsetw: .end __memsetw -memset = __memset +memset = 
___memset +__memset = ___memset diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 214b698cefe..c1f1a7eee95 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,7 +25,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KGDB - select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) select HAVE_ARCH_TRACEHOOK select HAVE_BPF_JIT select HAVE_CONTEXT_TRACKING @@ -1496,6 +1496,7 @@ config HAVE_ARM_ARCH_TIMER bool "Architected timer support" depends on CPU_V7 select ARM_ARCH_TIMER + select GENERIC_CLOCKEVENTS help This option enables support for the ARM architected timer @@ -1719,7 +1720,6 @@ config AEABI config OABI_COMPAT bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)" depends on AEABI && !THUMB2_KERNEL - default y help This option preserves the old syscall interface along with the new (ARM EABI) one. It also provides a compatibility layer to @@ -1727,11 +1727,16 @@ config OABI_COMPAT in memory differs between the legacy ABI and the new ARM EABI (only for non "thumb" binaries). This option adds a tiny overhead to all syscalls and produces a slightly larger kernel. + + The seccomp filter system will not be available when this is + selected, since there is no way yet to sensibly distinguish + between calling conventions during filtering. + If you know you'll be using only pure EABI user space then you can say N here. If this option is not selected and you attempt to execute a legacy ABI binary then the result will be UNPREDICTABLE (in fact it can be predicted that it won't work - at all). If in doubt say Y. + at all). If in doubt say N. config ARCH_HAS_HOLES_MEMORYMODEL bool diff --git a/arch/arm/common/edma.c b/arch/arm/common/edma.c index 8e1a0245907..41bca32409f 100644 --- a/arch/arm/common/edma.c +++ b/arch/arm/common/edma.c @@ -404,7 +404,7 @@ static irqreturn_t dma_irq_handler(int irq, void *data) BIT(slot)); if (edma_cc[ctlr]->intr_data[channel].callback) edma_cc[ctlr]->intr_data[channel].callback( - channel, DMA_COMPLETE, + channel, EDMA_DMA_COMPLETE, edma_cc[ctlr]->intr_data[channel].data); } } while (sh_ipr); @@ -459,7 +459,7 @@ static irqreturn_t dma_ccerr_handler(int irq, void *data) callback) { edma_cc[ctlr]->intr_data[k]. 
callback(k, - DMA_CC_ERROR, + EDMA_DMA_CC_ERROR, edma_cc[ctlr]->intr_data [k].data); } diff --git a/arch/arm/include/asm/hardware/iop3xx-adma.h b/arch/arm/include/asm/hardware/iop3xx-adma.h index 9b28f1243bd..240b29ef17d 100644 --- a/arch/arm/include/asm/hardware/iop3xx-adma.h +++ b/arch/arm/include/asm/hardware/iop3xx-adma.h @@ -393,36 +393,6 @@ static inline int iop_chan_zero_sum_slot_count(size_t len, int src_cnt, return slot_cnt; } -static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc) -{ - return 0; -} - -static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc, - struct iop_adma_chan *chan) -{ - union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; - - switch (chan->device->id) { - case DMA0_ID: - case DMA1_ID: - return hw_desc.dma->dest_addr; - case AAU_ID: - return hw_desc.aau->dest_addr; - default: - BUG(); - } - return 0; -} - - -static inline u32 iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc, - struct iop_adma_chan *chan) -{ - BUG(); - return 0; -} - static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc, struct iop_adma_chan *chan) { diff --git a/arch/arm/include/asm/hardware/iop_adma.h b/arch/arm/include/asm/hardware/iop_adma.h index 122f86d8c99..250760e0810 100644 --- a/arch/arm/include/asm/hardware/iop_adma.h +++ b/arch/arm/include/asm/hardware/iop_adma.h @@ -82,8 +82,6 @@ struct iop_adma_chan { * @slot_cnt: total slots used in an transaction (group of operations) * @slots_per_op: number of slots per operation * @idx: pool index - * @unmap_src_cnt: number of xor sources - * @unmap_len: transaction bytecount * @tx_list: list of descriptors that are associated with one operation * @async_tx: support for the async_tx api * @group_list: list of slots that make up a multi-descriptor transaction @@ -99,8 +97,6 @@ struct iop_adma_desc_slot { u16 slot_cnt; u16 slots_per_op; u16 idx; - u16 unmap_src_cnt; - size_t unmap_len; struct list_head tx_list; struct dma_async_tx_descriptor async_tx; union { diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 4dd21457ef9..9ecccc86504 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -226,7 +226,14 @@ static inline phys_addr_t __virt_to_phys(unsigned long x) static inline unsigned long __phys_to_virt(phys_addr_t x) { unsigned long t; - __pv_stub(x, t, "sub", __PV_BITS_31_24); + + /* + * 'unsigned long' cast discard upper word when + * phys_addr_t is 64 bit, and makes sure that inline + * assembler expression receives 32 bit argument + * in place where 'r' 32 bit operand is expected. + */ + __pv_stub((unsigned long) x, t, "sub", __PV_BITS_31_24); return t; } diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 7801866e626..11d59b32fb8 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -508,6 +508,7 @@ __fixup_smp: teq r0, #0x0 @ '0' on actual UP A9 hardware beq __fixup_smp_on_up @ So its an A9 UP ldr r0, [r0, #4] @ read SCU Config +ARM_BE8(rev r0, r0) @ byteswap if big endian and r0, r0, #0x3 @ number of CPUs teq r0, #0x0 @ is 1? 
movne pc, lr @@ -644,7 +645,11 @@ ARM_BE8(rev16 ip, ip) bcc 1b bx lr #else +#ifdef CONFIG_CPU_ENDIAN_BE8 + moveq r0, #0x00004000 @ set bit 22, mov to mvn instruction +#else moveq r0, #0x400000 @ set bit 22, mov to mvn instruction +#endif b 2f 1: ldr ip, [r7, r3] #ifdef CONFIG_CPU_ENDIAN_BE8 @@ -653,7 +658,7 @@ ARM_BE8(rev16 ip, ip) tst ip, #0x000f0000 @ check the rotation field orrne ip, ip, r6, lsl #24 @ mask in offset bits 31-24 biceq ip, ip, #0x00004000 @ clear bit 22 - orreq ip, ip, r0, lsl #24 @ mask in offset bits 7-0 + orreq ip, ip, r0 @ mask in offset bits 7-0 #else bic ip, ip, #0x000000ff tst ip, #0xf00 @ check the rotation field diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 6125f259b7b..dbf0923e8d7 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -856,7 +856,7 @@ static void __init kuser_init(void *vectors) memcpy(vectors + 0xfe0, vectors + 0xfe8, 4); } #else -static void __init kuser_init(void *vectors) +static inline void __init kuser_init(void *vectors) { } #endif diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 371958370de..580906989db 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -334,6 +334,17 @@ out: return err; } +static phys_addr_t kvm_kaddr_to_phys(void *kaddr) +{ + if (!is_vmalloc_addr(kaddr)) { + BUG_ON(!virt_addr_valid(kaddr)); + return __pa(kaddr); + } else { + return page_to_phys(vmalloc_to_page(kaddr)) + + offset_in_page(kaddr); + } +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -345,16 +356,27 @@ out: */ int create_hyp_mappings(void *from, void *to) { - unsigned long phys_addr = virt_to_phys(from); + phys_addr_t phys_addr; + unsigned long virt_addr; unsigned long start = KERN_TO_HYP((unsigned long)from); unsigned long end = KERN_TO_HYP((unsigned long)to); - /* Check for a valid kernel memory mapping */ - if (!virt_addr_valid(from) || !virt_addr_valid(to - 1)) - return -EINVAL; + start = start & PAGE_MASK; + end = PAGE_ALIGN(end); - return __create_hyp_mappings(hyp_pgd, start, end, - __phys_to_pfn(phys_addr), PAGE_HYP); + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { + int err; + + phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); + err = __create_hyp_mappings(hyp_pgd, virt_addr, + virt_addr + PAGE_SIZE, + __phys_to_pfn(phys_addr), + PAGE_HYP); + if (err) + return err; + } + + return 0; } /** diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index e0c68d5bb7d..52886b89706 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -10,7 +10,7 @@ UNWIND( .fnstart ) and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset -#if __LINUX_ARM_ARCH__ >= 7 +#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP) .arch_extension mp ALT_SMP(W(pldw) [r1]) ALT_UP(W(nop)) diff --git a/arch/arm/mach-iop13xx/include/mach/adma.h b/arch/arm/mach-iop13xx/include/mach/adma.h index 6d3782d85a9..a86fd0ed775 100644 --- a/arch/arm/mach-iop13xx/include/mach/adma.h +++ b/arch/arm/mach-iop13xx/include/mach/adma.h @@ -218,20 +218,6 @@ iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op) #define iop_chan_pq_slot_count iop_chan_xor_slot_count #define iop_chan_pq_zero_sum_slot_count iop_chan_xor_slot_count -static inline u32 iop_desc_get_dest_addr(struct iop_adma_desc_slot *desc, - struct iop_adma_chan *chan) -{ - struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; - return hw_desc->dest_addr; -} - -static inline u32 
iop_desc_get_qdest_addr(struct iop_adma_desc_slot *desc, - struct iop_adma_chan *chan) -{ - struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; - return hw_desc->q_dest_addr; -} - static inline u32 iop_desc_get_byte_count(struct iop_adma_desc_slot *desc, struct iop_adma_chan *chan) { @@ -350,18 +336,6 @@ iop_desc_init_pq(struct iop_adma_desc_slot *desc, int src_cnt, hw_desc->desc_ctrl = u_desc_ctrl.value; } -static inline int iop_desc_is_pq(struct iop_adma_desc_slot *desc) -{ - struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; - union { - u32 value; - struct iop13xx_adma_desc_ctrl field; - } u_desc_ctrl; - - u_desc_ctrl.value = hw_desc->desc_ctrl; - return u_desc_ctrl.field.pq_xfer_en; -} - static inline void iop_desc_init_pq_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt, unsigned long flags) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 78eeeca78f5..580ef2de82d 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -558,8 +558,8 @@ static void __init build_mem_type_table(void) mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_WB; break; } - printk("Memory policy: ECC %sabled, Data cache %s\n", - ecc_mask ? "en" : "dis", cp->policy); + pr_info("Memory policy: %sData cache %s\n", + ecc_mask ? "ECC enabled, " : "", cp->policy); for (i = 0; i < ARRAY_SIZE(mem_types); i++) { struct mem_type *t = &mem_types[i]; diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 5c668b7a31f..55764a7ef1f 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -18,6 +18,7 @@ #include <asm/mach/arch.h> #include <asm/cputype.h> #include <asm/mpu.h> +#include <asm/procinfo.h> #include "mm.h" diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 60920f62fdf..bd1781979a3 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -92,7 +92,7 @@ ENDPROC(cpu_v7_dcache_clean_area) /* Suspend/resume support: derived from arch/arm/mach-s5pv210/sleep.S */ .globl cpu_v7_suspend_size -.equ cpu_v7_suspend_size, 4 * 8 +.equ cpu_v7_suspend_size, 4 * 9 #ifdef CONFIG_ARM_CPU_SUSPEND ENTRY(cpu_v7_do_suspend) stmfd sp!, {r4 - r10, lr} @@ -101,13 +101,17 @@ ENTRY(cpu_v7_do_suspend) stmia r0!, {r4 - r5} #ifdef CONFIG_MMU mrc p15, 0, r6, c3, c0, 0 @ Domain ID +#ifdef CONFIG_ARM_LPAE + mrrc p15, 1, r5, r7, c2 @ TTB 1 +#else mrc p15, 0, r7, c2, c0, 1 @ TTB 1 +#endif mrc p15, 0, r11, c2, c0, 2 @ TTB control register #endif mrc p15, 0, r8, c1, c0, 0 @ Control register mrc p15, 0, r9, c1, c0, 1 @ Auxiliary control register mrc p15, 0, r10, c1, c0, 2 @ Co-processor access control - stmia r0, {r6 - r11} + stmia r0, {r5 - r11} ldmfd sp!, {r4 - r10, pc} ENDPROC(cpu_v7_do_suspend) @@ -118,16 +122,19 @@ ENTRY(cpu_v7_do_resume) ldmia r0!, {r4 - r5} mcr p15, 0, r4, c13, c0, 0 @ FCSE/PID mcr p15, 0, r5, c13, c0, 3 @ User r/o thread ID - ldmia r0, {r6 - r11} + ldmia r0, {r5 - r11} #ifdef CONFIG_MMU mcr p15, 0, ip, c8, c7, 0 @ invalidate TLBs mcr p15, 0, r6, c3, c0, 0 @ Domain ID -#ifndef CONFIG_ARM_LPAE +#ifdef CONFIG_ARM_LPAE + mcrr p15, 0, r1, ip, c2 @ TTB 0 + mcrr p15, 1, r5, r7, c2 @ TTB 1 +#else ALT_SMP(orr r1, r1, #TTB_FLAGS_SMP) ALT_UP(orr r1, r1, #TTB_FLAGS_UP) -#endif mcr p15, 0, r1, c2, c0, 0 @ TTB 0 mcr p15, 0, r7, c2, c0, 1 @ TTB 1 +#endif mcr p15, 0, r11, c2, c0, 2 @ TTB control register ldr r4, =PRRR @ PRRR ldr r5, =NMRR @ NMRR diff --git a/arch/avr32/boot/u-boot/head.S b/arch/avr32/boot/u-boot/head.S index 4488fa27fe9..2ffc298f061 100644 --- a/arch/avr32/boot/u-boot/head.S +++ b/arch/avr32/boot/u-boot/head.S @@ -8,6 +8,8 @@ * published by the Free Software Foundation. 
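 *
 * (Note on the hunk below: the status-register and stack-pointer
 * setup are moved ahead of the tag-table check so that, if the
 * magic number is wrong, the path to panic() already runs on a
 * valid stack; panic() is reached via "lddpc pc, panic_addr"
 * because it may lie out of range of a relative bral.)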
*/ #include <asm/setup.h> +#include <asm/thread_info.h> +#include <asm/sysreg.h> /* * The kernel is loaded where we want it to be and all caches @@ -20,11 +22,6 @@ .section .init.text,"ax" .global _start _start: - /* Check if the boot loader actually provided a tag table */ - lddpc r0, magic_number - cp.w r12, r0 - brne no_tag_table - /* Initialize .bss */ lddpc r2, bss_start_addr lddpc r3, end_addr @@ -34,6 +31,25 @@ _start: cp r2, r3 brlo 1b + /* Initialize status register */ + lddpc r0, init_sr + mtsr SYSREG_SR, r0 + + /* Set initial stack pointer */ + lddpc sp, stack_addr + sub sp, -THREAD_SIZE + +#ifdef CONFIG_FRAME_POINTER + /* Mark last stack frame */ + mov lr, 0 + mov r7, 0 +#endif + + /* Check if the boot loader actually provided a tag table */ + lddpc r0, magic_number + cp.w r12, r0 + brne no_tag_table + /* * Save the tag table address for later use. This must be done * _after_ .bss has been initialized... @@ -53,8 +69,15 @@ bss_start_addr: .long __bss_start end_addr: .long _end +init_sr: + .long 0x007f0000 /* Supervisor mode, everything masked */ +stack_addr: + .long init_thread_union +panic_addr: + .long panic no_tag_table: sub r12, pc, (. - 2f) - bral panic + /* branch to panic() which can be far away with that construct */ + lddpc pc, panic_addr 2: .asciz "Boot loader didn't provide correct magic number\n" diff --git a/arch/avr32/include/asm/kprobes.h b/arch/avr32/include/asm/kprobes.h index 996cb656474..45f563ed73f 100644 --- a/arch/avr32/include/asm/kprobes.h +++ b/arch/avr32/include/asm/kprobes.h @@ -16,6 +16,7 @@ typedef u16 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xd673 /* breakpoint */ #define MAX_INSN_SIZE 2 +#define MAX_STACK_SIZE 64 /* 32 would probably be OK */ #define kretprobe_blacklist_size 0 @@ -26,6 +27,19 @@ struct arch_specific_insn { kprobe_opcode_t insn[MAX_INSN_SIZE]; }; +struct prev_kprobe { + struct kprobe *kp; + unsigned int status; +}; + +/* per-cpu kprobe control block */ +struct kprobe_ctlblk { + unsigned int kprobe_status; + struct prev_kprobe prev_kprobe; + struct pt_regs jprobe_saved_regs; + char jprobes_stack[MAX_STACK_SIZE]; +}; + extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); diff --git a/arch/avr32/include/uapi/asm/Kbuild b/arch/avr32/include/uapi/asm/Kbuild index 3b85eaddf52..08d8a3d76ea 100644 --- a/arch/avr32/include/uapi/asm/Kbuild +++ b/arch/avr32/include/uapi/asm/Kbuild @@ -2,35 +2,35 @@ include include/uapi/asm-generic/Kbuild.asm header-y += auxvec.h -header-y += bitsperlong.h header-y += byteorder.h header-y += cachectl.h -header-y += errno.h -header-y += fcntl.h -header-y += ioctl.h -header-y += ioctls.h -header-y += ipcbuf.h -header-y += kvm_para.h -header-y += mman.h header-y += msgbuf.h header-y += param.h -header-y += poll.h header-y += posix_types.h header-y += ptrace.h -header-y += resource.h header-y += sembuf.h header-y += setup.h header-y += shmbuf.h header-y += sigcontext.h -header-y += siginfo.h header-y += signal.h header-y += socket.h header-y += sockios.h header-y += stat.h -header-y += statfs.h header-y += swab.h header-y += termbits.h header-y += termios.h header-y += types.h header-y += unistd.h +generic-y += bitsperlong.h +generic-y += errno.h +generic-y += fcntl.h +generic-y += ioctl.h +generic-y += ioctls.h +generic-y += ipcbuf.h +generic-y += kvm_para.h +generic-y += mman.h generic-y += param.h +generic-y += poll.h +generic-y += resource.h +generic-y += siginfo.h +generic-y += statfs.h diff 
--git a/arch/avr32/include/uapi/asm/auxvec.h b/arch/avr32/include/uapi/asm/auxvec.h index d5dd435bf8f..4f02da3ffef 100644 --- a/arch/avr32/include/uapi/asm/auxvec.h +++ b/arch/avr32/include/uapi/asm/auxvec.h @@ -1,4 +1,4 @@ -#ifndef __ASM_AVR32_AUXVEC_H -#define __ASM_AVR32_AUXVEC_H +#ifndef _UAPI__ASM_AVR32_AUXVEC_H +#define _UAPI__ASM_AVR32_AUXVEC_H -#endif /* __ASM_AVR32_AUXVEC_H */ +#endif /* _UAPI__ASM_AVR32_AUXVEC_H */ diff --git a/arch/avr32/include/uapi/asm/bitsperlong.h b/arch/avr32/include/uapi/asm/bitsperlong.h deleted file mode 100644 index 6dc0bb0c13b..00000000000 --- a/arch/avr32/include/uapi/asm/bitsperlong.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/bitsperlong.h> diff --git a/arch/avr32/include/uapi/asm/byteorder.h b/arch/avr32/include/uapi/asm/byteorder.h index 50abc21619a..71242f0d39c 100644 --- a/arch/avr32/include/uapi/asm/byteorder.h +++ b/arch/avr32/include/uapi/asm/byteorder.h @@ -1,9 +1,9 @@ /* * AVR32 endian-conversion functions. */ -#ifndef __ASM_AVR32_BYTEORDER_H -#define __ASM_AVR32_BYTEORDER_H +#ifndef _UAPI__ASM_AVR32_BYTEORDER_H +#define _UAPI__ASM_AVR32_BYTEORDER_H #include <linux/byteorder/big_endian.h> -#endif /* __ASM_AVR32_BYTEORDER_H */ +#endif /* _UAPI__ASM_AVR32_BYTEORDER_H */ diff --git a/arch/avr32/include/uapi/asm/cachectl.h b/arch/avr32/include/uapi/asm/cachectl.h index 4faf1ce6006..573a9584dd5 100644 --- a/arch/avr32/include/uapi/asm/cachectl.h +++ b/arch/avr32/include/uapi/asm/cachectl.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_CACHECTL_H -#define __ASM_AVR32_CACHECTL_H +#ifndef _UAPI__ASM_AVR32_CACHECTL_H +#define _UAPI__ASM_AVR32_CACHECTL_H /* * Operations that can be performed through the cacheflush system call @@ -8,4 +8,4 @@ /* Clean the data cache, then invalidate the icache */ #define CACHE_IFLUSH 0 -#endif /* __ASM_AVR32_CACHECTL_H */ +#endif /* _UAPI__ASM_AVR32_CACHECTL_H */ diff --git a/arch/avr32/include/uapi/asm/errno.h b/arch/avr32/include/uapi/asm/errno.h deleted file mode 100644 index 558a7249f06..00000000000 --- a/arch/avr32/include/uapi/asm/errno.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_ERRNO_H -#define __ASM_AVR32_ERRNO_H - -#include <asm-generic/errno.h> - -#endif /* __ASM_AVR32_ERRNO_H */ diff --git a/arch/avr32/include/uapi/asm/fcntl.h b/arch/avr32/include/uapi/asm/fcntl.h deleted file mode 100644 index 14c0c4402b1..00000000000 --- a/arch/avr32/include/uapi/asm/fcntl.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_FCNTL_H -#define __ASM_AVR32_FCNTL_H - -#include <asm-generic/fcntl.h> - -#endif /* __ASM_AVR32_FCNTL_H */ diff --git a/arch/avr32/include/uapi/asm/ioctl.h b/arch/avr32/include/uapi/asm/ioctl.h deleted file mode 100644 index c8472c1398e..00000000000 --- a/arch/avr32/include/uapi/asm/ioctl.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_IOCTL_H -#define __ASM_AVR32_IOCTL_H - -#include <asm-generic/ioctl.h> - -#endif /* __ASM_AVR32_IOCTL_H */ diff --git a/arch/avr32/include/uapi/asm/ioctls.h b/arch/avr32/include/uapi/asm/ioctls.h deleted file mode 100644 index 909cf66feaf..00000000000 --- a/arch/avr32/include/uapi/asm/ioctls.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_IOCTLS_H -#define __ASM_AVR32_IOCTLS_H - -#include <asm-generic/ioctls.h> - -#endif /* __ASM_AVR32_IOCTLS_H */ diff --git a/arch/avr32/include/uapi/asm/ipcbuf.h b/arch/avr32/include/uapi/asm/ipcbuf.h deleted file mode 100644 index 84c7e51cb6d..00000000000 --- a/arch/avr32/include/uapi/asm/ipcbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipcbuf.h> diff --git a/arch/avr32/include/uapi/asm/kvm_para.h 
b/arch/avr32/include/uapi/asm/kvm_para.h deleted file mode 100644 index 14fab8f0b95..00000000000 --- a/arch/avr32/include/uapi/asm/kvm_para.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/kvm_para.h> diff --git a/arch/avr32/include/uapi/asm/mman.h b/arch/avr32/include/uapi/asm/mman.h deleted file mode 100644 index 8eebf89f5ab..00000000000 --- a/arch/avr32/include/uapi/asm/mman.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/mman.h> diff --git a/arch/avr32/include/uapi/asm/msgbuf.h b/arch/avr32/include/uapi/asm/msgbuf.h index ac18bc4da7f..9eae6effad1 100644 --- a/arch/avr32/include/uapi/asm/msgbuf.h +++ b/arch/avr32/include/uapi/asm/msgbuf.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_MSGBUF_H -#define __ASM_AVR32_MSGBUF_H +#ifndef _UAPI__ASM_AVR32_MSGBUF_H +#define _UAPI__ASM_AVR32_MSGBUF_H /* * The msqid64_ds structure for i386 architecture. @@ -28,4 +28,4 @@ struct msqid64_ds { unsigned long __unused5; }; -#endif /* __ASM_AVR32_MSGBUF_H */ +#endif /* _UAPI__ASM_AVR32_MSGBUF_H */ diff --git a/arch/avr32/include/uapi/asm/poll.h b/arch/avr32/include/uapi/asm/poll.h deleted file mode 100644 index c98509d3149..00000000000 --- a/arch/avr32/include/uapi/asm/poll.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/poll.h> diff --git a/arch/avr32/include/uapi/asm/posix_types.h b/arch/avr32/include/uapi/asm/posix_types.h index 9ba9e749b3f..5b813a8abf0 100644 --- a/arch/avr32/include/uapi/asm/posix_types.h +++ b/arch/avr32/include/uapi/asm/posix_types.h @@ -5,8 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#ifndef __ASM_AVR32_POSIX_TYPES_H -#define __ASM_AVR32_POSIX_TYPES_H +#ifndef _UAPI__ASM_AVR32_POSIX_TYPES_H +#define _UAPI__ASM_AVR32_POSIX_TYPES_H /* * This file is generally used by user-level software, so you need to @@ -34,4 +34,4 @@ typedef unsigned short __kernel_old_dev_t; #include <asm-generic/posix_types.h> -#endif /* __ASM_AVR32_POSIX_TYPES_H */ +#endif /* _UAPI__ASM_AVR32_POSIX_TYPES_H */ diff --git a/arch/avr32/include/uapi/asm/resource.h b/arch/avr32/include/uapi/asm/resource.h deleted file mode 100644 index c6dd101472b..00000000000 --- a/arch/avr32/include/uapi/asm/resource.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_RESOURCE_H -#define __ASM_AVR32_RESOURCE_H - -#include <asm-generic/resource.h> - -#endif /* __ASM_AVR32_RESOURCE_H */ diff --git a/arch/avr32/include/uapi/asm/sembuf.h b/arch/avr32/include/uapi/asm/sembuf.h index e472216e0c9..6c6f7cf1e75 100644 --- a/arch/avr32/include/uapi/asm/sembuf.h +++ b/arch/avr32/include/uapi/asm/sembuf.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_SEMBUF_H -#define __ASM_AVR32_SEMBUF_H +#ifndef _UAPI__ASM_AVR32_SEMBUF_H +#define _UAPI__ASM_AVR32_SEMBUF_H /* * The semid64_ds structure for AVR32 architecture. 
@@ -22,4 +22,4 @@ struct semid64_ds { unsigned long __unused4; }; -#endif /* __ASM_AVR32_SEMBUF_H */ +#endif /* _UAPI__ASM_AVR32_SEMBUF_H */ diff --git a/arch/avr32/include/uapi/asm/setup.h b/arch/avr32/include/uapi/asm/setup.h index e58aa9356fa..a654df7dba4 100644 --- a/arch/avr32/include/uapi/asm/setup.h +++ b/arch/avr32/include/uapi/asm/setup.h @@ -13,5 +13,4 @@ #define COMMAND_LINE_SIZE 256 - #endif /* _UAPI__ASM_AVR32_SETUP_H__ */ diff --git a/arch/avr32/include/uapi/asm/shmbuf.h b/arch/avr32/include/uapi/asm/shmbuf.h index c62fba41739..b94cf8b60b7 100644 --- a/arch/avr32/include/uapi/asm/shmbuf.h +++ b/arch/avr32/include/uapi/asm/shmbuf.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_SHMBUF_H -#define __ASM_AVR32_SHMBUF_H +#ifndef _UAPI__ASM_AVR32_SHMBUF_H +#define _UAPI__ASM_AVR32_SHMBUF_H /* * The shmid64_ds structure for i386 architecture. @@ -39,4 +39,4 @@ struct shminfo64 { unsigned long __unused4; }; -#endif /* __ASM_AVR32_SHMBUF_H */ +#endif /* _UAPI__ASM_AVR32_SHMBUF_H */ diff --git a/arch/avr32/include/uapi/asm/sigcontext.h b/arch/avr32/include/uapi/asm/sigcontext.h index e04062b5f39..27e56bf6377 100644 --- a/arch/avr32/include/uapi/asm/sigcontext.h +++ b/arch/avr32/include/uapi/asm/sigcontext.h @@ -5,8 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#ifndef __ASM_AVR32_SIGCONTEXT_H -#define __ASM_AVR32_SIGCONTEXT_H +#ifndef _UAPI__ASM_AVR32_SIGCONTEXT_H +#define _UAPI__ASM_AVR32_SIGCONTEXT_H struct sigcontext { unsigned long oldmask; @@ -31,4 +31,4 @@ struct sigcontext { unsigned long r0; }; -#endif /* __ASM_AVR32_SIGCONTEXT_H */ +#endif /* _UAPI__ASM_AVR32_SIGCONTEXT_H */ diff --git a/arch/avr32/include/uapi/asm/siginfo.h b/arch/avr32/include/uapi/asm/siginfo.h deleted file mode 100644 index 5ee93f40a8a..00000000000 --- a/arch/avr32/include/uapi/asm/siginfo.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _AVR32_SIGINFO_H -#define _AVR32_SIGINFO_H - -#include <asm-generic/siginfo.h> - -#endif diff --git a/arch/avr32/include/uapi/asm/signal.h b/arch/avr32/include/uapi/asm/signal.h index 1b77a93eff5..ffe8c770caf 100644 --- a/arch/avr32/include/uapi/asm/signal.h +++ b/arch/avr32/include/uapi/asm/signal.h @@ -118,5 +118,4 @@ typedef struct sigaltstack { size_t ss_size; } stack_t; - #endif /* _UAPI__ASM_AVR32_SIGNAL_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 43993642143..cbf902e4cd9 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_SOCKET_H -#define __ASM_AVR32_SOCKET_H +#ifndef _UAPI__ASM_AVR32_SOCKET_H +#define _UAPI__ASM_AVR32_SOCKET_H #include <asm/sockios.h> @@ -78,4 +78,4 @@ #define SO_MAX_PACING_RATE 47 -#endif /* __ASM_AVR32_SOCKET_H */ +#endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/sockios.h b/arch/avr32/include/uapi/asm/sockios.h index 0802d742f97..d0478545353 100644 --- a/arch/avr32/include/uapi/asm/sockios.h +++ b/arch/avr32/include/uapi/asm/sockios.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_SOCKIOS_H -#define __ASM_AVR32_SOCKIOS_H +#ifndef _UAPI__ASM_AVR32_SOCKIOS_H +#define _UAPI__ASM_AVR32_SOCKIOS_H /* Socket-level I/O control calls. 
*/ #define FIOSETOWN 0x8901 @@ -10,4 +10,4 @@ #define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ #define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ -#endif /* __ASM_AVR32_SOCKIOS_H */ +#endif /* _UAPI__ASM_AVR32_SOCKIOS_H */ diff --git a/arch/avr32/include/uapi/asm/stat.h b/arch/avr32/include/uapi/asm/stat.h index e72881e1023..c06acef7fce 100644 --- a/arch/avr32/include/uapi/asm/stat.h +++ b/arch/avr32/include/uapi/asm/stat.h @@ -5,8 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#ifndef __ASM_AVR32_STAT_H -#define __ASM_AVR32_STAT_H +#ifndef _UAPI__ASM_AVR32_STAT_H +#define _UAPI__ASM_AVR32_STAT_H struct __old_kernel_stat { unsigned short st_dev; @@ -76,4 +76,4 @@ struct stat64 { unsigned long __unused2; }; -#endif /* __ASM_AVR32_STAT_H */ +#endif /* _UAPI__ASM_AVR32_STAT_H */ diff --git a/arch/avr32/include/uapi/asm/statfs.h b/arch/avr32/include/uapi/asm/statfs.h deleted file mode 100644 index 2961bd18c50..00000000000 --- a/arch/avr32/include/uapi/asm/statfs.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_AVR32_STATFS_H -#define __ASM_AVR32_STATFS_H - -#include <asm-generic/statfs.h> - -#endif /* __ASM_AVR32_STATFS_H */ diff --git a/arch/avr32/include/uapi/asm/swab.h b/arch/avr32/include/uapi/asm/swab.h index 14cc737bbca..1a03549e7dc 100644 --- a/arch/avr32/include/uapi/asm/swab.h +++ b/arch/avr32/include/uapi/asm/swab.h @@ -1,8 +1,8 @@ /* * AVR32 byteswapping functions. */ -#ifndef __ASM_AVR32_SWAB_H -#define __ASM_AVR32_SWAB_H +#ifndef _UAPI__ASM_AVR32_SWAB_H +#define _UAPI__ASM_AVR32_SWAB_H #include <linux/types.h> #include <linux/compiler.h> @@ -32,4 +32,4 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 val) #define __arch_swab32 __arch_swab32 #endif -#endif /* __ASM_AVR32_SWAB_H */ +#endif /* _UAPI__ASM_AVR32_SWAB_H */ diff --git a/arch/avr32/include/uapi/asm/termbits.h b/arch/avr32/include/uapi/asm/termbits.h index 366adc5ebb1..32789ccb38f 100644 --- a/arch/avr32/include/uapi/asm/termbits.h +++ b/arch/avr32/include/uapi/asm/termbits.h @@ -1,5 +1,5 @@ -#ifndef __ASM_AVR32_TERMBITS_H -#define __ASM_AVR32_TERMBITS_H +#ifndef _UAPI__ASM_AVR32_TERMBITS_H +#define _UAPI__ASM_AVR32_TERMBITS_H #include <linux/posix_types.h> @@ -193,4 +193,4 @@ struct ktermios { #define TCSADRAIN 1 #define TCSAFLUSH 2 -#endif /* __ASM_AVR32_TERMBITS_H */ +#endif /* _UAPI__ASM_AVR32_TERMBITS_H */ diff --git a/arch/avr32/include/uapi/asm/termios.h b/arch/avr32/include/uapi/asm/termios.h index b8ef8ea6335..c8a0081556c 100644 --- a/arch/avr32/include/uapi/asm/termios.h +++ b/arch/avr32/include/uapi/asm/termios.h @@ -46,5 +46,4 @@ struct termio { /* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - #endif /* _UAPI__ASM_AVR32_TERMIOS_H */ diff --git a/arch/avr32/include/uapi/asm/types.h b/arch/avr32/include/uapi/asm/types.h index bb34ad349df..7c986c4e99b 100644 --- a/arch/avr32/include/uapi/asm/types.h +++ b/arch/avr32/include/uapi/asm/types.h @@ -5,4 +5,9 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ +#ifndef _UAPI__ASM_AVR32_TYPES_H +#define _UAPI__ASM_AVR32_TYPES_H + #include <asm-generic/int-ll64.h> + +#endif /* _UAPI__ASM_AVR32_TYPES_H */ diff --git a/arch/avr32/include/uapi/asm/unistd.h b/arch/avr32/include/uapi/asm/unistd.h index 3eaa68753ad..8822bf46ddc 100644 --- a/arch/avr32/include/uapi/asm/unistd.h +++ b/arch/avr32/include/uapi/asm/unistd.h @@ -301,5 +301,4 @@ #define __NR_eventfd 281 #define __NR_setns 283 - #endif /* _UAPI__ASM_AVR32_UNISTD_H */ diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S index 9899d3cc6f0..7301f4806bb 100644 --- a/arch/avr32/kernel/entry-avr32b.S +++ b/arch/avr32/kernel/entry-avr32b.S @@ -401,9 +401,10 @@ handle_critical: /* We should never get here... */ bad_return: sub r12, pc, (. - 1f) - bral panic + lddpc pc, 2f .align 2 1: .asciz "Return from critical exception!" +2: .long panic .align 1 do_bus_error_write: diff --git a/arch/avr32/kernel/head.S b/arch/avr32/kernel/head.S index 6163bd0acb9..59eae6dfbed 100644 --- a/arch/avr32/kernel/head.S +++ b/arch/avr32/kernel/head.S @@ -10,33 +10,13 @@ #include <linux/linkage.h> #include <asm/page.h> -#include <asm/thread_info.h> -#include <asm/sysreg.h> .section .init.text,"ax" .global kernel_entry kernel_entry: - /* Initialize status register */ - lddpc r0, init_sr - mtsr SYSREG_SR, r0 - - /* Set initial stack pointer */ - lddpc sp, stack_addr - sub sp, -THREAD_SIZE - -#ifdef CONFIG_FRAME_POINTER - /* Mark last stack frame */ - mov lr, 0 - mov r7, 0 -#endif - /* Start the show */ lddpc pc, kernel_start_addr .align 2 -init_sr: - .long 0x007f0000 /* Supervisor mode, everything masked */ -stack_addr: - .long init_thread_union kernel_start_addr: .long start_kernel diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index d43daf192b2..4c530a82fc4 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1992,7 +1992,7 @@ sba_connect_bus(struct pci_bus *bus) if (PCI_CONTROLLER(bus)->iommu) return; - handle = PCI_CONTROLLER(bus)->acpi_handle; + handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion); if (!handle) return; diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 80775f55f03..71fbaaa495c 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -95,7 +95,7 @@ struct iospace_resource { }; struct pci_controller { - void *acpi_handle; + struct acpi_device *companion; void *iommu; int segment; int node; /* nearest node with memory or -1 for global allocation */ diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 5a9ff1c3c3e..cb592773c78 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2166,12 +2166,6 @@ static const struct file_operations pfm_file_ops = { .flush = pfm_flush }; -static int -pfmfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]", @@ -2179,7 +2173,7 @@ static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) } static const struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = pfmfs_dname, }; diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 2326790b7d8..9e4938d8ca4 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -436,9 +436,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) if (!controller) return NULL; - 
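The ia64 perfmon.c hunk above swaps a local d_delete callback that unconditionally returned 1 for the generic always_delete_dentry() helper, which does exactly that. The resulting shape, sketched for a hypothetical pseudo-filesystem (the fs name and dname format are illustrative, not the kernel's):

    #include <linux/dcache.h>

    static char *examplefs_dname(struct dentry *dentry, char *buffer, int buflen)
    {
        return dynamic_dname(dentry, buffer, buflen, "example:[%lu]",
                             dentry->d_inode->i_ino);
    }

    static const struct dentry_operations examplefs_dentry_operations = {
        .d_delete = always_delete_dentry,   /* never cache unused dentries */
        .d_dname  = examplefs_dname,
    };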
controller->acpi_handle = device->handle; + controller->companion = device; - pxm = acpi_get_pxm(controller->acpi_handle); + pxm = acpi_get_pxm(device->handle); #ifdef CONFIG_NUMA if (pxm >= 0) controller->node = pxm_to_node(pxm); @@ -489,7 +489,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { struct pci_controller *controller = bridge->bus->sysdata; - ACPI_HANDLE_SET(&bridge->dev, controller->acpi_handle); + ACPI_COMPANION_SET(&bridge->dev, controller->companion); return 0; } diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index b1725398b5a..0640739cc20 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -132,7 +132,7 @@ sn_get_bussoft_ptr(struct pci_bus *bus) struct acpi_resource_vendor_typed *vendor; - handle = PCI_CONTROLLER(bus)->acpi_handle; + handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion); status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { @@ -360,7 +360,7 @@ sn_acpi_get_pcidev_info(struct pci_dev *dev, struct pcidev_info **pcidev_info, acpi_status status; struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - rootbus_handle = PCI_CONTROLLER(dev)->acpi_handle; + rootbus_handle = acpi_device_handle(PCI_CONTROLLER(dev)->companion); status = acpi_evaluate_integer(rootbus_handle, METHOD_NAME__SEG, NULL, &segment); if (ACPI_SUCCESS(status)) { diff --git a/arch/parisc/include/asm/socket.h b/arch/parisc/include/asm/socket.h new file mode 100644 index 00000000000..748016cb122 --- /dev/null +++ b/arch/parisc/include/asm/socket.h @@ -0,0 +1,11 @@ +#ifndef _ASM_SOCKET_H +#define _ASM_SOCKET_H + +#include <uapi/asm/socket.h> + +/* O_NONBLOCK clashes with the bits used for socket types. Therefore we + * have to define SOCK_NONBLOCK to a different value here. + */ +#define SOCK_NONBLOCK 0x40000000 + +#endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 63f4dd0b49c..4006964d8e1 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -4,14 +4,11 @@ /* * User space memory access functions */ -#include <asm/processor.h> #include <asm/page.h> #include <asm/cache.h> #include <asm/errno.h> #include <asm-generic/uaccess-unaligned.h> -#include <linux/sched.h> - #define VERIFY_READ 0 #define VERIFY_WRITE 1 @@ -36,43 +33,12 @@ extern int __get_user_bad(void); extern int __put_kernel_bad(void); extern int __put_user_bad(void); - -/* - * Test whether a block of memory is a valid user space address. - * Returns 0 if the range is valid, nonzero otherwise. - */ -static inline int __range_not_ok(unsigned long addr, unsigned long size, - unsigned long limit) +static inline long access_ok(int type, const void __user * addr, + unsigned long size) { - unsigned long __newaddr = addr + size; - return (__newaddr < addr || __newaddr > limit || size > limit); + return 1; } -/** - * access_ok: - Checks if a user space pointer is valid - * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that - * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe - * to write to a block, it is always safe to read from it. - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context only. This function may sleep. - * - * Checks if a pointer to a block of memory in user space is valid. 
- * - * Returns true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - * - * Note that, depending on architecture, this function probably just - * checks that the pointer is in the user space range - after calling - * this function, memory access functions may still return -EFAULT. - */ -#define access_ok(type, addr, size) \ -( __chk_user_ptr(addr), \ - !__range_not_ok((unsigned long) (__force void *) (addr), \ - size, user_addr_max()) \ -) - #define put_user __put_user #define get_user __get_user @@ -253,11 +219,7 @@ extern long lstrnlen_user(const char __user *,long); /* * Complex access routines -- macros */ -#ifdef CONFIG_COMPAT -#define user_addr_max() (TASK_SIZE) -#else -#define user_addr_max() (DEFAULT_TASK_SIZE) -#endif +#define user_addr_max() (~0UL) #define strnlen_user lstrnlen_user #define strlen_user(str) lstrnlen_user(str, 0x7fffffffL) diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 7c614d01f1f..f33113a6141 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -1,5 +1,5 @@ -#ifndef _ASM_SOCKET_H -#define _ASM_SOCKET_H +#ifndef _UAPI_ASM_SOCKET_H +#define _UAPI_ASM_SOCKET_H #include <asm/sockios.h> @@ -77,9 +77,4 @@ #define SO_MAX_PACING_RATE 0x4048 -/* O_NONBLOCK clashes with the bits used for socket types. Therefore we - * have to define SOCK_NONBLOCK to a different value here. - */ -#define SOCK_NONBLOCK 0x40000000 - -#endif /* _ASM_SOCKET_H */ +#endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index b5507ec06b8..413dc176929 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -161,7 +161,7 @@ static inline void prefetch_dst(const void *addr) /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words * per loop. This code is derived from glibc. */ -static inline unsigned long copy_dstaligned(unsigned long dst, +static noinline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len) { /* gcc complains that a2 and a3 may be uninitialized, but actually @@ -276,7 +276,7 @@ handle_store_error: /* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR. * In case of an access fault the faulty address can be read from the per_cpu * exception data struct. */ -static unsigned long pa_memcpy_internal(void *dstp, const void *srcp, +static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp, unsigned long len) { register unsigned long src, dst, t1, t2, t3; @@ -529,7 +529,7 @@ long probe_kernel_read(void *dst, const void *src, size_t size) { unsigned long addr = (unsigned long)src; - if (size < 0 || addr < PAGE_SIZE) + if (addr < PAGE_SIZE) return -EFAULT; /* check for I/O space F_EXTEND(0xfff00000) access as well? 
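The parisc uaccess.h change above drops the range-checking access_ok() in favour of one that always succeeds, and widens user_addr_max() to ~0UL: on parisc the kernel and user space live in separate address spaces, so there is no kernel region a user pointer could alias, and a bad pointer simply faults and is handled through the exception tables. An illustrative caller (hypothetical helper; any -EFAULT now comes from the fixup path, not from the pointer check):

    /* sketch: access_ok() is a formality on this architecture */
    static long read_user_word(const unsigned long __user *p, unsigned long *out)
    {
        if (!access_ok(VERIFY_READ, p, sizeof(*p)))
            return -EFAULT;               /* never taken on parisc */
        return get_user(*out, p);         /* faults are caught here */
    }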
*/ diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 7584a5df0fa..9d08c71a967 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -282,16 +282,34 @@ bad_area: #endif switch (code) { case 15: /* Data TLB miss fault/Data page fault */ + /* send SIGSEGV when outside of vma */ + if (!vma || + address < vma->vm_start || address > vma->vm_end) { + si.si_signo = SIGSEGV; + si.si_code = SEGV_MAPERR; + break; + } + + /* send SIGSEGV for wrong permissions */ + if ((vma->vm_flags & acc_type) != acc_type) { + si.si_signo = SIGSEGV; + si.si_code = SEGV_ACCERR; + break; + } + + /* probably address is outside of mapped file */ + /* fall through */ case 17: /* NA data TLB miss / page fault */ case 18: /* Unaligned access - PCXS only */ si.si_signo = SIGBUS; - si.si_code = BUS_ADRERR; + si.si_code = (code == 18) ? BUS_ADRALN : BUS_ADRERR; break; case 16: /* Non-access instruction TLB miss fault */ case 26: /* PCXL: Data memory access rights trap */ default: si.si_signo = SIGSEGV; - si.si_code = SEGV_MAPERR; + si.si_code = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR; + break; } si.si_errno = 0; si.si_addr = (void __user *) address; diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 607acf54a42..8a2463670a5 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -111,6 +111,7 @@ endif endif CFLAGS-$(CONFIG_PPC64) := -mtraceback=no -mcall-aixdesc +CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,-mminimal-toc) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions) CFLAGS-$(CONFIG_PPC32) := -ffixed-r2 $(MULTIPLEWORD) diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi index 4c617bf8cdb..4f6e48277c4 100644 --- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi @@ -223,13 +223,13 @@ reg = <0xe2000 0x1000>; }; -/include/ "qoriq-dma-0.dtsi" +/include/ "elo3-dma-0.dtsi" dma@100300 { fsl,iommu-parent = <&pamu0>; fsl,liodn-reg = <&guts 0x580>; /* DMA1LIODNR */ }; -/include/ "qoriq-dma-1.dtsi" +/include/ "elo3-dma-1.dtsi" dma@101300 { fsl,iommu-parent = <&pamu0>; fsl,liodn-reg = <&guts 0x584>; /* DMA2LIODNR */ diff --git a/arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi b/arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi new file mode 100644 index 00000000000..3c210e0d520 --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/elo3-dma-0.dtsi @@ -0,0 +1,82 @@ +/* + * QorIQ Elo3 DMA device tree stub [ controller @ offset 0x100000 ] + * + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
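The arch/parisc/mm/fault.c hunk above refines which signal user space sees for a failed access: an address outside any VMA is SIGSEGV with SEGV_MAPERR, a permission mismatch is SIGSEGV with SEGV_ACCERR, an unaligned access (trap 18) is SIGBUS with BUS_ADRALN, and trap 26 now reports SEGV_ACCERR rather than MAPERR. The selection logic condensed into a standalone sketch (trap numbers as in the hunk; in_vma/perms_ok stand in for the VMA tests):

    #include <signal.h>

    static void pick_signal(int code, int in_vma, int perms_ok,
                            int *signo, int *sicode)
    {
        switch (code) {
        case 15:                          /* data TLB miss / page fault */
            if (!in_vma) {
                *signo = SIGSEGV; *sicode = SEGV_MAPERR; break;
            }
            if (!perms_ok) {
                *signo = SIGSEGV; *sicode = SEGV_ACCERR; break;
            }
            /* likely past the end of a mapped file: fall through */
        case 17: case 18:
            *signo = SIGBUS;
            *sicode = (code == 18) ? BUS_ADRALN : BUS_ADRERR;
            break;
        case 16: case 26: default:
            *signo = SIGSEGV;
            *sicode = (code == 26) ? SEGV_ACCERR : SEGV_MAPERR;
            break;
        }
    }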
+ * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +dma0: dma@100300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,elo3-dma"; + reg = <0x100300 0x4>, + <0x100600 0x4>; + ranges = <0x0 0x100100 0x500>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + interrupts = <28 2 0 0>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + interrupts = <29 2 0 0>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + interrupts = <30 2 0 0>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + interrupts = <31 2 0 0>; + }; + dma-channel@300 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x300 0x80>; + interrupts = <76 2 0 0>; + }; + dma-channel@380 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x380 0x80>; + interrupts = <77 2 0 0>; + }; + dma-channel@400 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x400 0x80>; + interrupts = <78 2 0 0>; + }; + dma-channel@480 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x480 0x80>; + interrupts = <79 2 0 0>; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi b/arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi new file mode 100644 index 00000000000..cccf3bb3822 --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/elo3-dma-1.dtsi @@ -0,0 +1,82 @@ +/* + * QorIQ Elo3 DMA device tree stub [ controller @ offset 0x101000 ] + * + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. 
+ * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +dma1: dma@101300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,elo3-dma"; + reg = <0x101300 0x4>, + <0x101600 0x4>; + ranges = <0x0 0x101100 0x500>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + interrupts = <32 2 0 0>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + interrupts = <33 2 0 0>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + interrupts = <34 2 0 0>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + interrupts = <35 2 0 0>; + }; + dma-channel@300 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x300 0x80>; + interrupts = <80 2 0 0>; + }; + dma-channel@380 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x380 0x80>; + interrupts = <81 2 0 0>; + }; + dma-channel@400 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x400 0x80>; + interrupts = <82 2 0 0>; + }; + dma-channel@480 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x480 0x80>; + interrupts = <83 2 0 0>; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi index 510afa362de..4143a9733cd 100644 --- a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi @@ -387,8 +387,8 @@ reg = <0xea000 0x4000>; }; -/include/ "qoriq-dma-0.dtsi" -/include/ "qoriq-dma-1.dtsi" +/include/ "elo3-dma-0.dtsi" +/include/ "elo3-dma-1.dtsi" /include/ "qoriq-espi-0.dtsi" spi@110000 { diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig new file mode 100644 index 00000000000..62771e0adb7 --- /dev/null +++ b/arch/powerpc/configs/pseries_le_defconfig @@ -0,0 +1,352 @@ +CONFIG_PPC64=y +CONFIG_ALTIVEC=y +CONFIG_VSX=y +CONFIG_SMP=y +CONFIG_NR_CPUS=2048 +CONFIG_CPU_LITTLE_ENDIAN=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_IRQ_DOMAIN_DEBUG=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CPUSETS=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_BLK_DEV_INITRD=y +# CONFIG_COMPAT_BRK is not set +CONFIG_PROFILING=y +CONFIG_OPROFILE=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_PPC_SPLPAR=y +CONFIG_SCANLOG=m +CONFIG_PPC_SMLPAR=y +CONFIG_DTL=y +# CONFIG_PPC_PMAC is not set +CONFIG_RTAS_FLASH=m +CONFIG_IBMEBUS=y +CONFIG_HZ_100=y +CONFIG_BINFMT_MISC=m +CONFIG_PPC_TRANSACTIONAL_MEM=y 
+CONFIG_KEXEC=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_CMA=y +CONFIG_PPC_64K_PAGES=y +CONFIG_PPC_SUBPAGE_PROT=y +CONFIG_SCHED_SMT=y +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_RPA=m +CONFIG_HOTPLUG_PCI_RPA_DLPAR=m +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_XFRM_USER=m +CONFIG_NET_KEY=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_NET_IPIP=y +CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +# CONFIG_IPV6 is not set +CONFIG_NETFILTER=y +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_UDPLITE=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_PROC_DEVICETREE=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_BLK_DEV_FD=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_VIRTIO_BLK=m +CONFIG_IDE=y +CONFIG_BLK_DEV_IDECD=y +CONFIG_BLK_DEV_GENERIC=y +CONFIG_BLK_DEV_AMD74XX=y +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=y +CONFIG_BLK_DEV_SR=y +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_BE2ISCSI=m +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_IBMVSCSI=y +CONFIG_SCSI_IBMVFC=m +CONFIG_SCSI_SYM53C8XX_2=y +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 +CONFIG_SCSI_IPR=y +CONFIG_SCSI_QLA_FC=m +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=m +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_ATA=y +# CONFIG_ATA_SFF is not set +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=y +CONFIG_MD_RAID0=y +CONFIG_MD_RAID1=y +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m 
+CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_UEVENT=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_NETCONSOLE=y +CONFIG_NETPOLL_TRAP=y +CONFIG_TUN=m +CONFIG_VIRTIO_NET=m +CONFIG_VORTEX=y +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +CONFIG_PCNET32=y +CONFIG_TIGON3=y +CONFIG_CHELSIO_T1=m +CONFIG_BE2NET=m +CONFIG_S2IO=m +CONFIG_IBMVETH=y +CONFIG_EHEA=y +CONFIG_E100=y +CONFIG_E1000=y +CONFIG_E1000E=y +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_MLX4_EN=m +CONFIG_MYRI10GE=m +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPPOE=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +CONFIG_INPUT_EVDEV=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_PCSPKR=m +# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_ICOM=m +CONFIG_SERIAL_JSM=m +CONFIG_HVC_CONSOLE=y +CONFIG_HVC_RTAS=y +CONFIG_HVCS=m +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IBM_BSR=m +CONFIG_GEN_RTC=y +CONFIG_RAW_DRIVER=y +CONFIG_MAX_RAW_DEVS=1024 +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_OF=y +CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G=y +CONFIG_FB_RADEON=y +CONFIG_FB_IBM_GXT4500=y +CONFIG_LCD_PLATFORM=m +# CONFIG_VGA_CONSOLE is not set +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_LOGO=y +CONFIG_HID_GYRATION=y +CONFIG_HID_PANTHERLORD=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SUNPLUS=y +CONFIG_USB_HIDDEV=y +CONFIG_USB=y +CONFIG_USB_MON=m +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_HCD_PPC_OF is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_STORAGE=m +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_EHCA=m +CONFIG_INFINIBAND_CXGB3=m +CONFIG_INFINIBAND_CXGB4=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_ISER=m +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT2_FS_XIP=y +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_REISERFS_FS=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +CONFIG_XFS_FS=m +CONFIG_XFS_POSIX_ACL=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_NILFS2_FS=m +CONFIG_AUTOFS4_FS=m +CONFIG_FUSE_FS=m +CONFIG_ISO9660_FS=y +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_HUGETLBFS=y +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_PSTORE=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFSD=m +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_CIFS=m +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_CRC_T10DIF=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_LOCKUP_DETECTOR=y +CONFIG_LATENCYTOP=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_CODE_PATCHING_SELFTEST=y +CONFIG_FTR_FIXUP_SELFTEST=y +CONFIG_MSI_BITMAP_SELFTEST=y +CONFIG_XMON=y +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_HMAC=y 
+CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_LZO=m +# CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_CRYPTO_DEV_NX=y +CONFIG_CRYPTO_DEV_NX_ENCRYPT=m diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index cc0655a702a..935b5e7a143 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -31,6 +31,8 @@ extern unsigned long randomize_et_dyn(unsigned long base); #define ELF_ET_DYN_BASE (randomize_et_dyn(0x20000000)) +#define ELF_CORE_EFLAGS (is_elf2_task() ? 2 : 0) + /* * Our registers are always unsigned longs, whether we're a 32 bit * process or 64 bit, on either a 64 bit or 32 bit kernel. @@ -86,6 +88,8 @@ typedef elf_vrregset_t elf_fpxregset_t; #ifdef __powerpc64__ # define SET_PERSONALITY(ex) \ do { \ + if (((ex).e_flags & 0x3) == 2) \ + set_thread_flag(TIF_ELF2ABI); \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ set_thread_flag(TIF_32BIT); \ else \ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 0c7f2bfcf13..d8b600b3f05 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -403,6 +403,8 @@ static inline unsigned long cmo_get_page_size(void) extern long pSeries_enable_reloc_on_exc(void); extern long pSeries_disable_reloc_on_exc(void); +extern long pseries_big_endian_exceptions(void); + #else #define pSeries_enable_reloc_on_exc() do {} while (0) diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index a63b045e707..12c32c5f533 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -287,6 +287,32 @@ static inline long disable_reloc_on_exceptions(void) { return plpar_set_mode(0, 3, 0, 0); } +/* + * Take exceptions in big endian mode on this partition + * + * Note: this call has a partition wide scope and can take a while to complete. + * If it returns H_LONG_BUSY_* it should be retried periodically until it + * returns H_SUCCESS. + */ +static inline long enable_big_endian_exceptions(void) +{ + /* mflags = 0: big endian exceptions */ + return plpar_set_mode(0, 4, 0, 0); +} + +/* + * Take exceptions in little endian mode on this partition + * + * Note: this call has a partition wide scope and can take a while to complete. + * If it returns H_LONG_BUSY_* it should be retried periodically until it + * returns H_SUCCESS. + */ +static inline long enable_little_endian_exceptions(void) +{ + /* mflags = 1: little endian exceptions */ + return plpar_set_mode(1, 4, 0, 0); +} + static inline long plapr_set_ciabr(unsigned long ciabr) { return plpar_set_mode(0, 1, ciabr, 0); diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 98da78e0c2c..084e0807db9 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -33,6 +33,7 @@ extern int boot_cpuid; extern int spinning_secondaries; extern void cpu_die(void); +extern int cpu_to_chip_id(int cpu); #ifdef CONFIG_SMP @@ -112,7 +113,6 @@ static inline struct cpumask *cpu_core_mask(int cpu) } extern int cpu_to_core_id(int cpu); -extern int cpu_to_chip_id(int cpu); /* Since OpenPIC has only 4 IPIs, we use slightly different message numbers. 
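The powerpc elf.h hunk above is where the new ABI gets detected: the low two bits of the ELF header's e_flags carry the PPC64 ABI version, with 2 meaning ELFv2, and SET_PERSONALITY latches that into TIF_ELF2ABI so later code can simply ask is_elf2_task() (core dumps echo the value back via ELF_CORE_EFLAGS). The same test applied to a binary on disk, as a userspace sketch (assumes the file's endianness matches the host; a robust tool would check e_ident first):

    #include <elf.h>
    #include <stdio.h>

    /* Returns e_flags & 3: 2 => ELFv2, else ELFv1-style; -1 on error. */
    static int ppc64_abi_version(const char *path)
    {
        Elf64_Ehdr eh;
        FILE *f = fopen(path, "rb");

        if (!f)
            return -1;
        if (fread(&eh, sizeof(eh), 1, f) != 1) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return eh.e_flags & 0x3;    /* the same mask SET_PERSONALITY uses */
    }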
* diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 8fd6cf6dcee..9854c564ac5 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -105,6 +105,9 @@ static inline struct thread_info *current_thread_info(void) #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation for stack store? */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ +#if defined(CONFIG_PPC64) +#define TIF_ELF2ABI 18 /* function descriptors must die! */ +#endif /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -183,6 +186,12 @@ static inline bool test_thread_local_flags(unsigned int flags) #define is_32bit_task() (1) #endif +#if defined(CONFIG_PPC64) +#define is_elf2_task() (test_thread_flag(TIF_ELF2ABI)) +#else +#define is_elf2_task() (0) +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 67130206534..4bd687d5e7a 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -686,6 +686,15 @@ void eeh_save_bars(struct eeh_dev *edev) for (i = 0; i < 16; i++) eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); + + /* + * For PCI bridges including root port, we need enable bus + * master explicitly. Otherwise, it can't fetch IODA table + * entries correctly. So we cache the bit in advance so that + * we can restore it after reset, either PHB range or PE range. + */ + if (edev->mode & EEH_DEV_BRIDGE) + edev->config_space[1] |= PCI_COMMAND_MASTER; } /** diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index d27c5afc90a..72d748b56c8 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -74,8 +74,13 @@ static int eeh_event_handler(void * dummy) pe = event->pe; if (pe) { eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", - pe->phb->global_number, pe->addr); + if (pe->type & EEH_PE_PHB) + pr_info("EEH: Detected error on PHB#%d\n", + pe->phb->global_number); + else + pr_info("EEH: Detected PCI bus error on " + "PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); eeh_handle_event(pe); eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } else { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 75c2d100998..3386d8ab7eb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -858,17 +858,21 @@ void show_regs(struct pt_regs * regs) printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); -#ifdef CONFIG_PPC64 - printk("SOFTE: %ld\n", regs->softe); -#endif trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) - printk("CFAR: "REG"\n", regs->orig_gpr3); - if (trap == 0x300 || trap == 0x600) + printk("CFAR: "REG" ", regs->orig_gpr3); + if (trap == 0x200 || trap == 0x300 || trap == 0x600) #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - printk("DEAR: "REG", ESR: "REG"\n", regs->dar, regs->dsisr); + printk("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); #else - printk("DAR: "REG", DSISR: %08lx\n", regs->dar, regs->dsisr); + printk("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); +#endif +#ifdef CONFIG_PPC64 + printk("SOFTE: %ld ", regs->softe); +#endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (MSR_TM_ACTIVE(regs->msr)) + printk("\nPACATMSCRATCH: %016llx ", get_paca()->tm_scratch); #endif for (i = 0; i < 32; i++) { @@ -887,9 +891,6 @@ 
void show_regs(struct pt_regs * regs) printk("NIP ["REG"] %pS\n", regs->nip, (void *)regs->nip); printk("LR ["REG"] %pS\n", regs->link, (void *)regs->link); #endif -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - printk("PACATMSCRATCH [%llx]\n", get_paca()->tm_scratch); -#endif show_stack(current, (unsigned long *) regs->gpr[1]); if (!user_mode(regs)) show_instructions(regs); @@ -1086,25 +1087,45 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) regs->msr = MSR_USER; #else if (!is_32bit_task()) { - unsigned long entry, toc; + unsigned long entry; - /* start is a relocated pointer to the function descriptor for - * the elf _start routine. The first entry in the function - * descriptor is the entry address of _start and the second - * entry is the TOC value we need to use. - */ - __get_user(entry, (unsigned long __user *)start); - __get_user(toc, (unsigned long __user *)start+1); + if (is_elf2_task()) { + /* Look ma, no function descriptors! */ + entry = start; - /* Check whether the e_entry function descriptor entries - * need to be relocated before we can use them. - */ - if (load_addr != 0) { - entry += load_addr; - toc += load_addr; + /* + * Ulrich says: + * The latest iteration of the ABI requires that when + * calling a function (at its global entry point), + * the caller must ensure r12 holds the entry point + * address (so that the function can quickly + * establish addressability). + */ + regs->gpr[12] = start; + /* Make sure that's restored on entry to userspace. */ + set_thread_flag(TIF_RESTOREALL); + } else { + unsigned long toc; + + /* start is a relocated pointer to the function + * descriptor for the elf _start routine. The first + * entry in the function descriptor is the entry + * address of _start and the second entry is the TOC + * value we need to use. + */ + __get_user(entry, (unsigned long __user *)start); + __get_user(toc, (unsigned long __user *)start+1); + + /* Check whether the e_entry function descriptor entries + * need to be relocated before we can use them. + */ + if (load_addr != 0) { + entry += load_addr; + toc += load_addr; + } + regs->gpr[2] = toc; } regs->nip = entry; - regs->gpr[2] = toc; regs->msr = MSR_USER64; } else { regs->nip = start; diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f3a47098fb8..fa0ad8aafbc 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -777,6 +777,26 @@ int of_get_ibm_chip_id(struct device_node *np) return -1; } +/** + * cpu_to_chip_id - Return the cpus chip-id + * @cpu: The logical cpu number. + * + * Return the value of the ibm,chip-id property corresponding to the given + * logical cpu number. If the chip-id can not be found, returns -1. 
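The start_thread() rework above captures the practical difference between the two ABIs. Under ELFv1, e_entry points at a function descriptor whose first doubleword is the real entry address and whose second is the TOC pointer to load into r2; under ELFv2, e_entry is the code address itself, and the convention instead requires r12 to hold that address so the function can establish its own TOC. Reduced to a sketch (descriptor layout per the ELFv1 ABI; the real code fetches the descriptor with __get_user and relocates entry/toc by load_addr, as the hunk shows):

    struct ppc64_elfv1_opd {
        unsigned long entry;    /* address of the first instruction */
        unsigned long toc;      /* r2 value for this function */
        unsigned long env;      /* environment pointer, unused by C */
    };

    static void resolve_initial_regs(unsigned long e_entry, int elfv2,
                                     unsigned long *nip, unsigned long *r2,
                                     unsigned long *r12)
    {
        if (elfv2) {
            *nip = e_entry;     /* code address, no descriptor */
            *r12 = e_entry;     /* callee derives its TOC from r12 */
        } else {
            const struct ppc64_elfv1_opd *opd =
                (const struct ppc64_elfv1_opd *)e_entry;
            *nip = opd->entry;
            *r2  = opd->toc;
        }
    }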
+ */ +int cpu_to_chip_id(int cpu) +{ + struct device_node *np; + + np = of_get_cpu_node(cpu, NULL); + if (!np) + return -1; + + of_node_put(np); + return of_get_ibm_chip_id(np); +} +EXPORT_SYMBOL(cpu_to_chip_id); + #ifdef CONFIG_PPC_PSERIES /* * Fix up the uninitialized fields in a new device node: diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 749778e0a69..1844298f5ea 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -457,7 +457,15 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, if (copy_vsx_to_user(&frame->mc_vsregs, current)) return 1; msr |= MSR_VSX; - } + } else if (!ctx_has_vsx_region) + /* + * With a small context structure we can't hold the VSX + * registers, hence clear the MSR value to indicate the state + * was not saved. + */ + msr &= ~MSR_VSX; + + #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE /* save spe registers */ diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index b3c615764c9..e66f67b8b9e 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -701,12 +701,6 @@ badframe: int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { - /* Handler is *really* a pointer to the function descriptor for - * the signal routine. The first entry in the function - * descriptor is the entry address of signal and the second - * entry is the TOC value we need to use. - */ - func_descr_t __user *funct_desc_ptr; struct rt_sigframe __user *frame; unsigned long newsp = 0; long err = 0; @@ -766,19 +760,32 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info, goto badframe; regs->link = (unsigned long) &frame->tramp[0]; } - funct_desc_ptr = (func_descr_t __user *) ka->sa.sa_handler; /* Allocate a dummy caller frame for the signal handler. */ newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE; err |= put_user(regs->gpr[1], (unsigned long __user *)newsp); /* Set up "regs" so we "return" to the signal handler. */ - err |= get_user(regs->nip, &funct_desc_ptr->entry); + if (is_elf2_task()) { + regs->nip = (unsigned long) ka->sa.sa_handler; + regs->gpr[12] = regs->nip; + } else { + /* Handler is *really* a pointer to the function descriptor for + * the signal routine. The first entry in the function + * descriptor is the entry address of signal and the second + * entry is the TOC value we need to use. + */ + func_descr_t __user *funct_desc_ptr = + (func_descr_t __user *) ka->sa.sa_handler; + + err |= get_user(regs->nip, &funct_desc_ptr->entry); + err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); + } + /* enter the signal handler in native-endian mode */ regs->msr &= ~MSR_LE; regs->msr |= (MSR_KERNEL & MSR_LE); regs->gpr[1] = newsp; - err |= get_user(regs->gpr[2], &funct_desc_ptr->toc); regs->gpr[3] = signr; regs->result = 0; if (ka->sa.sa_flags & SA_SIGINFO) { diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 930cd8af350..a3b64f3bf9a 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -597,22 +597,6 @@ out: return id; } -/* Return the value of the chip-id property corresponding - * to the given logical cpu. 
- */ -int cpu_to_chip_id(int cpu) -{ - struct device_node *np; - - np = of_get_cpu_node(cpu, NULL); - if (!np) - return -1; - - of_node_put(np); - return of_get_ibm_chip_id(np); -} -EXPORT_SYMBOL(cpu_to_chip_id); - /* Helper routines for cpu to core mapping */ int cpu_core_index_of_thread(int cpu) { diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 192b051df97..b3b144121cc 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -213,8 +213,6 @@ static u64 scan_dispatch_log(u64 stop_tb) if (i == be64_to_cpu(vpa->dtl_idx)) return 0; while (i < be64_to_cpu(vpa->dtl_idx)) { - if (dtl_consumer) - dtl_consumer(dtl, i); dtb = be64_to_cpu(dtl->timebase); tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) + be32_to_cpu(dtl->ready_to_enqueue_time); @@ -227,6 +225,8 @@ static u64 scan_dispatch_log(u64 stop_tb) } if (dtb > stop_tb) break; + if (dtl_consumer) + dtl_consumer(dtl, i); stolen += tb_delta; ++i; ++dtl; diff --git a/arch/powerpc/kernel/vdso64/sigtramp.S b/arch/powerpc/kernel/vdso64/sigtramp.S index 45ea281e9a2..542c6f422e4 100644 --- a/arch/powerpc/kernel/vdso64/sigtramp.S +++ b/arch/powerpc/kernel/vdso64/sigtramp.S @@ -142,6 +142,13 @@ V_FUNCTION_END(__kernel_sigtramp_rt64) /* Size of CR reg in DWARF unwind info. */ #define CRSIZE 4 +/* Offset of CR reg within a full word. */ +#ifdef __LITTLE_ENDIAN__ +#define CROFF 0 +#else +#define CROFF (RSIZE - CRSIZE) +#endif + /* This is the offset of the VMX reg pointer. */ #define VREGS 48*RSIZE+33*8 @@ -181,7 +188,14 @@ V_FUNCTION_END(__kernel_sigtramp_rt64) rsave (31, 31*RSIZE); \ rsave (67, 32*RSIZE); /* ap, used as temp for nip */ \ rsave (65, 36*RSIZE); /* lr */ \ - rsave (70, 38*RSIZE + (RSIZE - CRSIZE)) /* cr */ + rsave (68, 38*RSIZE + CROFF); /* cr fields */ \ + rsave (69, 38*RSIZE + CROFF); \ + rsave (70, 38*RSIZE + CROFF); \ + rsave (71, 38*RSIZE + CROFF); \ + rsave (72, 38*RSIZE + CROFF); \ + rsave (73, 38*RSIZE + CROFF); \ + rsave (74, 38*RSIZE + CROFF); \ + rsave (75, 38*RSIZE + CROFF) /* Describe where the FP regs are saved. */ #define EH_FRAME_FP \ diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index e7d0c88f621..76a64821f4a 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1419,7 +1419,7 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) /* needed to ensure proper operation of coherent allocations * later, in case driver doesn't set it explicitly */ - dma_set_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64)); + dma_coerce_mask_and_coherent(&viodev->dev, DMA_BIT_MASK(64)); } /* register with generic device framework */ diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 6936547018b..c5f734e20b0 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -123,6 +123,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct mm_struct *mm = current->mm; unsigned long addr, len, end; unsigned long next; + unsigned long flags; pgd_t *pgdp; int nr = 0; @@ -156,7 +157,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * So long as we atomically load page table pointers versus teardown, * we can follow the address down to the the page and take a ref on it. 
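The __get_user_pages_fast() change that starts here and completes just below swaps local_irq_disable()/local_irq_enable() for local_irq_save()/local_irq_restore(). The distinction matters once the function can be reached with interrupts already disabled: an unconditional enable on the way out would wrongly switch them back on. The general pattern:

    unsigned long flags;

    local_irq_save(flags);          /* remembers whether IRQs were on */
    /* ... walk page tables, take page references ... */
    local_irq_restore(flags);       /* restores the saved state;
                                       NOT local_irq_enable() */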
*/ - local_irq_disable(); + local_irq_save(flags); pgdp = pgd_offset(mm, addr); do { @@ -179,7 +180,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, break; } while (pgdp++, addr = next, addr != end); - local_irq_enable(); + local_irq_restore(flags); return nr; } diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 3e99c149271..7ce9cf3b698 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -258,7 +258,7 @@ static bool slice_scan_available(unsigned long addr, slice = GET_HIGH_SLICE_INDEX(addr); *boundary_addr = (slice + end) ? ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!(available.high_slices & (1u << slice)); + return !!(available.high_slices & (1ul << slice)); } } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index c2a566fb8bb..132f8726a25 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -403,3 +403,14 @@ config PPC_DOORBELL default n endmenu + +config CPU_LITTLE_ENDIAN + bool "Build little endian kernel" + default n + help + This option selects whether a big endian or little endian kernel will + be built. + + Note that if cross compiling a little endian kernel, + CROSS_COMPILE must point to a toolchain capable of targeting + little endian powerpc. diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 8844628915d..1cb160dc160 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -19,6 +19,7 @@ #include <asm/io.h> #include <asm/prom.h> #include <asm/machdep.h> +#include <asm/smp.h> struct powernv_rng { diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 7fbc25b1813..ccb633e077b 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -189,8 +189,9 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) struct eeh_dev *edev; struct eeh_pe pe; struct pci_dn *pdn = PCI_DN(dn); - const u32 *class_code, *vendor_id, *device_id; - const u32 *regs; + const __be32 *classp, *vendorp, *devicep; + u32 class_code; + const __be32 *regs; u32 pcie_flags; int enable = 0; int ret; @@ -201,22 +202,24 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) return NULL; /* Retrieve class/vendor/device IDs */ - class_code = of_get_property(dn, "class-code", NULL); - vendor_id = of_get_property(dn, "vendor-id", NULL); - device_id = of_get_property(dn, "device-id", NULL); + classp = of_get_property(dn, "class-code", NULL); + vendorp = of_get_property(dn, "vendor-id", NULL); + devicep = of_get_property(dn, "device-id", NULL); /* Skip for bad OF node or PCI-ISA bridge */ - if (!class_code || !vendor_id || !device_id) + if (!classp || !vendorp || !devicep) return NULL; if (dn->type && !strcmp(dn->type, "isa")) return NULL; + class_code = of_read_number(classp, 1); + /* * Update class code and mode of eeh device. We need * correctly reflects that current device is root port * or PCIe switch downstream port. 
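The one-character slice.c fix above deserves a note: high_slices is a 64-bit mask, but (1u << slice) is computed in 32-bit unsigned int, so a slice index of 32 or more is undefined behaviour and in practice tests the wrong bit. The shifted literal has to carry the width of the mask. A runnable demonstration (assumes a 64-bit long, as on ppc64):

    #include <stdio.h>

    int main(void)
    {
        unsigned long mask = 1ul << 40;

        /* Broken: (1u << 40) shifts a 32-bit value by >= its width (UB).
         * if (mask & (1u << 40)) ...
         */

        /* Correct: the ul suffix makes the shift 64 bits wide. */
        printf("%d\n", !!(mask & (1ul << 40)));    /* prints 1 */
        return 0;
    }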
*/ - edev->class_code = *class_code; + edev->class_code = class_code; edev->pcie_cap = pseries_eeh_find_cap(dn, PCI_CAP_ID_EXP); edev->mode &= 0xFFFFFF00; if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { @@ -243,12 +246,12 @@ static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) /* Initialize the fake PE */ memset(&pe, 0, sizeof(struct eeh_pe)); pe.phb = edev->phb; - pe.config_addr = regs[0]; + pe.config_addr = of_read_number(regs, 1); /* Enable EEH on the device */ ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); if (!ret) { - edev->config_addr = regs[0]; + edev->config_addr = of_read_number(regs, 1); /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 356bc75ca74..4fca3def9db 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -245,6 +245,23 @@ static void pSeries_lpar_hptab_clear(void) &(ptes[j].pteh), &(ptes[j].ptel)); } } + +#ifdef __LITTLE_ENDIAN__ + /* Reset exceptions to big endian */ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + + rc = pseries_big_endian_exceptions(); + /* + * At this point it is unlikely panic() will get anything + * out to the user, but at least this will stop us from + * continuing on further and creating an even more + * difficult to debug situation. + */ + if (rc) + panic("Could not enable big endian exceptions"); + } +#endif } /* diff --git a/arch/powerpc/platforms/pseries/rng.c b/arch/powerpc/platforms/pseries/rng.c index a702f1c0824..72a102758d4 100644 --- a/arch/powerpc/platforms/pseries/rng.c +++ b/arch/powerpc/platforms/pseries/rng.c @@ -13,6 +13,7 @@ #include <linux/of.h> #include <asm/archrandom.h> #include <asm/machdep.h> +#include <asm/plpar_wrappers.h> static int pseries_get_random_long(unsigned long *v) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 1f97e2b87a6..c1f19085870 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -442,6 +442,32 @@ static void pSeries_machine_kexec(struct kimage *image) } #endif +#ifdef __LITTLE_ENDIAN__ +long pseries_big_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_big_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} + +static long pseries_little_endian_exceptions(void) +{ + long rc; + + while (1) { + rc = enable_little_endian_exceptions(); + if (!H_IS_LONG_BUSY(rc)) + return rc; + mdelay(get_longbusy_msecs(rc)); + } +} +#endif + static void __init pSeries_setup_arch(void) { panic_timeout = 10; @@ -698,6 +724,22 @@ static int __init pSeries_probe(void) /* Now try to figure out if we are running on LPAR */ of_scan_flat_dt(pseries_probe_fw_features, NULL); +#ifdef __LITTLE_ENDIAN__ + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + long rc; + /* + * Tell the hypervisor that we want our exceptions to + * be taken in little endian mode. If this fails we don't + * want to use BUG() because it will trigger an exception. 
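Stepping back to the eeh_pseries.c changes above: they belong to the same little-endian effort as the surrounding hunks. Device-tree property values are always stored big-endian, so an LE kernel must type them as __be32 and convert with of_read_number() (or be32_to_cpu()) instead of dereferencing them raw. The shape of the fix, for a hypothetical property (node and property names are illustrative):

    #include <linux/of.h>

    static u32 read_example_cells(struct device_node *dn)
    {
        const __be32 *prop = of_get_property(dn, "example-cells", NULL);

        if (!prop)
            return 0;
        /* of_read_number(prop, 1) == be32_to_cpu(*prop) */
        return of_read_number(prop, 1);
    }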
+ */ + rc = pseries_little_endian_exceptions(); + if (rc) { + ppc_md.progress("H_SET_MODE LE exception fail", 0); + panic("Could not enable little endian exceptions"); + } + } +#endif + if (firmware_has_feature(FW_FEATURE_LPAR)) hpte_init_lpar(); else diff --git a/arch/powerpc/platforms/wsp/chroma.c b/arch/powerpc/platforms/wsp/chroma.c index 8ef53bc2e70..aaa46b35371 100644 --- a/arch/powerpc/platforms/wsp/chroma.c +++ b/arch/powerpc/platforms/wsp/chroma.c @@ -15,6 +15,7 @@ #include <linux/of.h> #include <linux/smp.h> #include <linux/time.h> +#include <linux/of_fdt.h> #include <asm/machdep.h> #include <asm/udbg.h> diff --git a/arch/powerpc/platforms/wsp/h8.c b/arch/powerpc/platforms/wsp/h8.c index d18e6cc19df..a3c87f39575 100644 --- a/arch/powerpc/platforms/wsp/h8.c +++ b/arch/powerpc/platforms/wsp/h8.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/of.h> #include <linux/io.h> +#include <linux/of_address.h> #include "wsp.h" diff --git a/arch/powerpc/platforms/wsp/ics.c b/arch/powerpc/platforms/wsp/ics.c index 2d3b1dd9571..9cd92e64502 100644 --- a/arch/powerpc/platforms/wsp/ics.c +++ b/arch/powerpc/platforms/wsp/ics.c @@ -18,6 +18,8 @@ #include <linux/smp.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> #include <asm/io.h> #include <asm/irq.h> diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c index cb565bf9365..3f672980793 100644 --- a/arch/powerpc/platforms/wsp/opb_pic.c +++ b/arch/powerpc/platforms/wsp/opb_pic.c @@ -15,6 +15,8 @@ #include <linux/of.h> #include <linux/slab.h> #include <linux/time.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> #include <asm/reg_a2.h> #include <asm/irq.h> diff --git a/arch/powerpc/platforms/wsp/psr2.c b/arch/powerpc/platforms/wsp/psr2.c index 508ec8282b9..a87b414c766 100644 --- a/arch/powerpc/platforms/wsp/psr2.c +++ b/arch/powerpc/platforms/wsp/psr2.c @@ -15,6 +15,7 @@ #include <linux/of.h> #include <linux/smp.h> #include <linux/time.h> +#include <linux/of_fdt.h> #include <asm/machdep.h> #include <asm/udbg.h> diff --git a/arch/powerpc/platforms/wsp/scom_wsp.c b/arch/powerpc/platforms/wsp/scom_wsp.c index 8928507affe..6538b4de34f 100644 --- a/arch/powerpc/platforms/wsp/scom_wsp.c +++ b/arch/powerpc/platforms/wsp/scom_wsp.c @@ -14,6 +14,7 @@ #include <linux/of.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/of_address.h> #include <asm/cputhreads.h> #include <asm/reg_a2.h> diff --git a/arch/powerpc/platforms/wsp/wsp.c b/arch/powerpc/platforms/wsp/wsp.c index ddb6efe8891..58cd1f00e1e 100644 --- a/arch/powerpc/platforms/wsp/wsp.c +++ b/arch/powerpc/platforms/wsp/wsp.c @@ -13,6 +13,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/time.h> +#include <linux/of_address.h> #include <asm/scom.h> diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 7d7443283a9..947b5c417e8 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -15,7 +15,7 @@ struct pci_sysdata { int domain; /* PCI domain */ int node; /* NUMA node */ #ifdef CONFIG_ACPI - void *acpi; /* ACPI-specific data */ + struct acpi_device *companion; /* ACPI companion device */ #endif #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index b93e09a0fa2..37813b5ddc3 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -147,6 +147,8 @@ #define 
MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_CORE_C1_RES 0x00000660 + #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index daff69e2115..1185fe7a7f4 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -296,4 +296,4 @@ static struct kernel_param_ops audit_param_ops = { .get = param_get_bool, }; -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a7cccb6d7fe..c96314abd14 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if PAGETABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { + struct page *page = virt_to_page(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table @@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - tlb_remove_page(tlb, virt_to_page(pmd)); + pgtable_pmd_page_dtor(page); + tlb_remove_page(tlb, page); } #if PAGETABLE_LEVELS > 3 @@ -209,7 +211,7 @@ static int preallocate_pmds(pmd_t *pmds[]) if (!pmd) failed = true; if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { - free_page((unsigned long)pmds[i]); + free_page((unsigned long)pmd); pmd = NULL; failed = true; } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 7fb24e53d4c..4f25ec07755 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -518,7 +518,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) sd = &info->sd; sd->domain = domain; sd->node = node; - sd->acpi = device->handle; + sd->companion = device; /* * Maybe the desired pci bus has been already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. 
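Two distinct fixes sit in the x86 pgtable.c hunk above. ___pmd_free_tlb() now calls pgtable_pmd_page_dtor(), tearing down the split-PMD lock state before the page goes to the TLB batch and pairing with the pgtable_pmd_page_ctor() call on the allocation side. And preallocate_pmds() fixes an outright bug: on ctor failure it freed pmds[i], which has not been assigned yet at that point in the loop, rather than the page just allocated into the local pmd. The corrected loop, condensed (illustrative fragment, not the full function):

    for (i = 0; i < PREALLOCATED_PMDS; i++) {
        pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);

        if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
            free_page((unsigned long)pmd);  /* the new page, not pmds[i] */
            pmd = NULL;
        }
        if (!pmd)
            failed = true;
        pmds[i] = pmd;                      /* only assigned here */
    }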
@@ -589,7 +589,7 @@ int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { struct pci_sysdata *sd = bridge->bus->sysdata; - ACPI_HANDLE_SET(&bridge->dev, sd->acpi); + ACPI_COMPANION_SET(&bridge->dev, sd->companion); return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 862f458d476..cdc629cf075 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -171,9 +171,12 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq, - unsigned int rw_flags) +static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int rw_flags) { + if (blk_queue_io_stat(q)) + rw_flags |= REQ_IO_STAT; + rq->mq_ctx = ctx; rq->cmd_flags = rw_flags; ctx->rq_dispatched[rw_is_sync(rw_flags)]++; @@ -197,7 +200,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); if (rq) { - blk_mq_rq_ctx_init(ctx, rq, rw); + blk_mq_rq_ctx_init(q, ctx, rq, rw); break; } else if (!(gfp & __GFP_WAIT)) break; @@ -718,6 +721,8 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, { struct blk_mq_ctx *ctx = rq->mq_ctx; + trace_block_rq_insert(hctx->queue, rq); + list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); @@ -921,7 +926,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_getrq(q, bio, rw); rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); if (likely(rq)) - blk_mq_rq_ctx_init(ctx, rq, rw); + blk_mq_rq_ctx_init(q, ctx, rq, rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); @@ -1377,6 +1382,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, q->queue_hw_ctx = hctxs; q->mq_ops = reg->ops; + q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; blk_queue_make_request(q, blk_mq_make_request); blk_queue_rq_timed_out(q, reg->ops->timeout); diff --git a/block/partitions/efi.c b/block/partitions/efi.c index a8287b49d06..dc51f467a56 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -96,6 +96,7 @@ * - Code works, detects all the partitions. * ************************************************************/ +#include <linux/kernel.h> #include <linux/crc32.h> #include <linux/ctype.h> #include <linux/math64.h> @@ -715,8 +716,8 @@ int efi_partition(struct parsed_partitions *state) efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ - label_max = min(sizeof(info->volname) - 1, - sizeof(ptes[i].partition_name)); + label_max = min(ARRAY_SIZE(info->volname) - 1, + ARRAY_SIZE(ptes[i].partition_name)); info->volname[label_max] = 0; while (label_count < label_max) { u8 c = ptes[i].partition_name[label_count] & 0xff; diff --git a/crypto/Kconfig b/crypto/Kconfig index 71f337aefa3..4ae5734fb47 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1402,6 +1402,9 @@ config CRYPTO_USER_API_SKCIPHER This option enables the user-spaces interface for symmetric key cipher algorithms. 
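The block/partitions/efi.c change earlier in this run is the classic
bytes-versus-elements fix: partition_name is an array of 16-bit characters,
so bounding the copy with sizeof() over-counts by a factor of two and lets
the UTF-16 conversion loop read past the end of the 36-entry name.  A short
illustrative sketch (sizes chosen to match; use() is a placeholder):

	u16 name[36];
	size_t i;

	/* sizeof(name) counts bytes (72 here); ARRAY_SIZE(name) counts
	 * elements (36), which is what a per-character loop needs */
	for (i = 0; i < ARRAY_SIZE(name); i++)
		use(name[i]);
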
+config CRYPTO_HASH_INFO + bool + source "drivers/crypto/Kconfig" source crypto/asymmetric_keys/Kconfig diff --git a/crypto/Makefile b/crypto/Makefile index 80019ba8da3..b3a7e807e08 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o obj-$(CONFIG_XOR_BLOCKS) += xor.o obj-$(CONFIG_ASYNC_CORE) += async_tx/ obj-$(CONFIG_ASYMMETRIC_KEY_TYPE) += asymmetric_keys/ +obj-$(CONFIG_CRYPTO_HASH_INFO) += hash_info.o diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 6d2c2ea1255..03a6eb95ab5 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -12,6 +12,8 @@ if ASYMMETRIC_KEY_TYPE config ASYMMETRIC_PUBLIC_KEY_SUBTYPE tristate "Asymmetric public-key crypto algorithm subtype" select MPILIB + select PUBLIC_KEY_ALGO_RSA + select CRYPTO_HASH_INFO help This option provides support for asymmetric public key type handling. If signature generation and/or verification are to be used, @@ -20,8 +22,8 @@ config ASYMMETRIC_PUBLIC_KEY_SUBTYPE config PUBLIC_KEY_ALGO_RSA tristate "RSA public-key algorithm" - depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE select MPILIB_EXTRA + select MPILIB help This option enables support for the RSA algorithm (PKCS#1, RFC3447). diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c index cf807654d22..b77eb530478 100644 --- a/crypto/asymmetric_keys/asymmetric_type.c +++ b/crypto/asymmetric_keys/asymmetric_type.c @@ -209,6 +209,7 @@ struct key_type key_type_asymmetric = { .match = asymmetric_key_match, .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, + .def_lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index cb2e29180a8..97eb001960b 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -22,29 +22,25 @@ MODULE_LICENSE("GPL"); -const char *const pkey_algo[PKEY_ALGO__LAST] = { +const char *const pkey_algo_name[PKEY_ALGO__LAST] = { [PKEY_ALGO_DSA] = "DSA", [PKEY_ALGO_RSA] = "RSA", }; -EXPORT_SYMBOL_GPL(pkey_algo); +EXPORT_SYMBOL_GPL(pkey_algo_name); -const char *const pkey_hash_algo[PKEY_HASH__LAST] = { - [PKEY_HASH_MD4] = "md4", - [PKEY_HASH_MD5] = "md5", - [PKEY_HASH_SHA1] = "sha1", - [PKEY_HASH_RIPE_MD_160] = "rmd160", - [PKEY_HASH_SHA256] = "sha256", - [PKEY_HASH_SHA384] = "sha384", - [PKEY_HASH_SHA512] = "sha512", - [PKEY_HASH_SHA224] = "sha224", +const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST] = { +#if defined(CONFIG_PUBLIC_KEY_ALGO_RSA) || \ + defined(CONFIG_PUBLIC_KEY_ALGO_RSA_MODULE) + [PKEY_ALGO_RSA] = &RSA_public_key_algorithm, +#endif }; -EXPORT_SYMBOL_GPL(pkey_hash_algo); +EXPORT_SYMBOL_GPL(pkey_algo); -const char *const pkey_id_type[PKEY_ID_TYPE__LAST] = { +const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = { [PKEY_ID_PGP] = "PGP", [PKEY_ID_X509] = "X509", }; -EXPORT_SYMBOL_GPL(pkey_id_type); +EXPORT_SYMBOL_GPL(pkey_id_type_name); /* * Provide a part of a description of the key for /proc/keys. @@ -56,7 +52,7 @@ static void public_key_describe(const struct key *asymmetric_key, if (key) seq_printf(m, "%s.%s", - pkey_id_type[key->id_type], key->algo->name); + pkey_id_type_name[key->id_type], key->algo->name); } /* @@ -78,21 +74,45 @@ EXPORT_SYMBOL_GPL(public_key_destroy); /* * Verify a signature using a public key. 
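 * (Note on the rework below, sketched from its body: if the key's cached
 * algo pointer is NULL, the function now falls back to the pkey_algo[]
 * table indexed by pk->pkey_algo, returning -ENOPKG when no matching
 * implementation is built in.)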
*/ -static int public_key_verify_signature(const struct key *key, - const struct public_key_signature *sig) +int public_key_verify_signature(const struct public_key *pk, + const struct public_key_signature *sig) { - const struct public_key *pk = key->payload.data; + const struct public_key_algorithm *algo; + + BUG_ON(!pk); + BUG_ON(!pk->mpi[0]); + BUG_ON(!pk->mpi[1]); + BUG_ON(!sig); + BUG_ON(!sig->digest); + BUG_ON(!sig->mpi[0]); + + algo = pk->algo; + if (!algo) { + if (pk->pkey_algo >= PKEY_ALGO__LAST) + return -ENOPKG; + algo = pkey_algo[pk->pkey_algo]; + if (!algo) + return -ENOPKG; + } - if (!pk->algo->verify_signature) + if (!algo->verify_signature) return -ENOTSUPP; - if (sig->nr_mpi != pk->algo->n_sig_mpi) { + if (sig->nr_mpi != algo->n_sig_mpi) { pr_debug("Signature has %u MPI not %u\n", - sig->nr_mpi, pk->algo->n_sig_mpi); + sig->nr_mpi, algo->n_sig_mpi); return -EINVAL; } - return pk->algo->verify_signature(pk, sig); + return algo->verify_signature(pk, sig); +} +EXPORT_SYMBOL_GPL(public_key_verify_signature); + +static int public_key_verify_signature_2(const struct key *key, + const struct public_key_signature *sig) +{ + const struct public_key *pk = key->payload.data; + return public_key_verify_signature(pk, sig); } /* @@ -103,6 +123,6 @@ struct asymmetric_key_subtype public_key_subtype = { .name = "public_key", .describe = public_key_describe, .destroy = public_key_destroy, - .verify_signature = public_key_verify_signature, + .verify_signature = public_key_verify_signature_2, }; EXPORT_SYMBOL_GPL(public_key_subtype); diff --git a/crypto/asymmetric_keys/public_key.h b/crypto/asymmetric_keys/public_key.h index 5e5e3562689..5c37a22a063 100644 --- a/crypto/asymmetric_keys/public_key.h +++ b/crypto/asymmetric_keys/public_key.h @@ -28,3 +28,9 @@ struct public_key_algorithm { }; extern const struct public_key_algorithm RSA_public_key_algorithm; + +/* + * public_key.c + */ +extern int public_key_verify_signature(const struct public_key *pk, + const struct public_key_signature *sig); diff --git a/crypto/asymmetric_keys/rsa.c b/crypto/asymmetric_keys/rsa.c index 4a6a0696f8a..90a17f59ba2 100644 --- a/crypto/asymmetric_keys/rsa.c +++ b/crypto/asymmetric_keys/rsa.c @@ -73,13 +73,13 @@ static const struct { size_t size; } RSA_ASN1_templates[PKEY_HASH__LAST] = { #define _(X) { RSA_digest_info_##X, sizeof(RSA_digest_info_##X) } - [PKEY_HASH_MD5] = _(MD5), - [PKEY_HASH_SHA1] = _(SHA1), - [PKEY_HASH_RIPE_MD_160] = _(RIPE_MD_160), - [PKEY_HASH_SHA256] = _(SHA256), - [PKEY_HASH_SHA384] = _(SHA384), - [PKEY_HASH_SHA512] = _(SHA512), - [PKEY_HASH_SHA224] = _(SHA224), + [HASH_ALGO_MD5] = _(MD5), + [HASH_ALGO_SHA1] = _(SHA1), + [HASH_ALGO_RIPE_MD_160] = _(RIPE_MD_160), + [HASH_ALGO_SHA256] = _(SHA256), + [HASH_ALGO_SHA384] = _(SHA384), + [HASH_ALGO_SHA512] = _(SHA512), + [HASH_ALGO_SHA224] = _(SHA224), #undef _ }; diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index facbf26bc6b..29893162497 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -47,6 +47,8 @@ void x509_free_certificate(struct x509_certificate *cert) kfree(cert->subject); kfree(cert->fingerprint); kfree(cert->authority); + kfree(cert->sig.digest); + mpi_free(cert->sig.rsa.s); kfree(cert); } } @@ -152,33 +154,33 @@ int x509_note_pkey_algo(void *context, size_t hdrlen, return -ENOPKG; /* Unsupported combination */ case OID_md4WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_MD5; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + 
ctx->cert->sig.pkey_hash_algo = HASH_ALGO_MD5; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha1WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_SHA1; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA1; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha256WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_SHA256; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA256; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha384WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_SHA384; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA384; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha512WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_SHA512; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA512; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; case OID_sha224WithRSAEncryption: - ctx->cert->sig_hash_algo = PKEY_HASH_SHA224; - ctx->cert->sig_pkey_algo = PKEY_ALGO_RSA; + ctx->cert->sig.pkey_hash_algo = HASH_ALGO_SHA224; + ctx->cert->sig.pkey_algo = PKEY_ALGO_RSA; break; } @@ -203,8 +205,8 @@ int x509_note_signature(void *context, size_t hdrlen, return -EINVAL; } - ctx->cert->sig = value; - ctx->cert->sig_size = vlen; + ctx->cert->raw_sig = value; + ctx->cert->raw_sig_size = vlen; return 0; } @@ -343,8 +345,9 @@ int x509_extract_key_data(void *context, size_t hdrlen, if (ctx->last_oid != OID_rsaEncryption) return -ENOPKG; - /* There seems to be an extraneous 0 byte on the front of the data */ - ctx->cert->pkey_algo = PKEY_ALGO_RSA; + ctx->cert->pub->pkey_algo = PKEY_ALGO_RSA; + + /* Discard the BIT STRING metadata */ ctx->key = value + 1; ctx->key_size = vlen - 1; return 0; diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index f86dc5fcc4a..87d9cc26f63 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -9,6 +9,7 @@ * 2 of the Licence, or (at your option) any later version. 
*/ +#include <linux/time.h> #include <crypto/public_key.h> struct x509_certificate { @@ -20,13 +21,11 @@ struct x509_certificate { char *authority; /* Authority key fingerprint as hex */ struct tm valid_from; struct tm valid_to; - enum pkey_algo pkey_algo : 8; /* Public key algorithm */ - enum pkey_algo sig_pkey_algo : 8; /* Signature public key algorithm */ - enum pkey_hash_algo sig_hash_algo : 8; /* Signature hash algorithm */ const void *tbs; /* Signed data */ - size_t tbs_size; /* Size of signed data */ - const void *sig; /* Signature data */ - size_t sig_size; /* Size of sigature */ + unsigned tbs_size; /* Size of signed data */ + unsigned raw_sig_size; /* Size of sigature */ + const void *raw_sig; /* Signature data */ + struct public_key_signature sig; /* Signature parameters */ }; /* @@ -34,3 +33,10 @@ struct x509_certificate { */ extern void x509_free_certificate(struct x509_certificate *cert); extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen); + +/* + * x509_public_key.c + */ +extern int x509_get_sig_params(struct x509_certificate *cert); +extern int x509_check_signature(const struct public_key *pub, + struct x509_certificate *cert); diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 06007f0e880..f83300b6e8c 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -18,85 +18,162 @@ #include <linux/asn1_decoder.h> #include <keys/asymmetric-subtype.h> #include <keys/asymmetric-parser.h> +#include <keys/system_keyring.h> #include <crypto/hash.h> #include "asymmetric_keys.h" #include "public_key.h" #include "x509_parser.h" -static const -struct public_key_algorithm *x509_public_key_algorithms[PKEY_ALGO__LAST] = { - [PKEY_ALGO_DSA] = NULL, -#if defined(CONFIG_PUBLIC_KEY_ALGO_RSA) || \ - defined(CONFIG_PUBLIC_KEY_ALGO_RSA_MODULE) - [PKEY_ALGO_RSA] = &RSA_public_key_algorithm, -#endif -}; +/* + * Find a key in the given keyring by issuer and authority. + */ +static struct key *x509_request_asymmetric_key( + struct key *keyring, + const char *signer, size_t signer_len, + const char *authority, size_t auth_len) +{ + key_ref_t key; + char *id; + + /* Construct an identifier. */ + id = kmalloc(signer_len + 2 + auth_len + 1, GFP_KERNEL); + if (!id) + return ERR_PTR(-ENOMEM); + + memcpy(id, signer, signer_len); + id[signer_len + 0] = ':'; + id[signer_len + 1] = ' '; + memcpy(id + signer_len + 2, authority, auth_len); + id[signer_len + 2 + auth_len] = 0; + + pr_debug("Look up: \"%s\"\n", id); + + key = keyring_search(make_key_ref(keyring, 1), + &key_type_asymmetric, id); + if (IS_ERR(key)) + pr_debug("Request for module key '%s' err %ld\n", + id, PTR_ERR(key)); + kfree(id); + + if (IS_ERR(key)) { + switch (PTR_ERR(key)) { + /* Hide some search errors */ + case -EACCES: + case -ENOTDIR: + case -EAGAIN: + return ERR_PTR(-ENOKEY); + default: + return ERR_CAST(key); + } + } + + pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); + return key_ref_to_ptr(key); +} /* - * Check the signature on a certificate using the provided public key + * Set up the signature parameters in an X.509 certificate. This involves + * digesting the signed data and extracting the signature. 
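+ * (Per the body below: the call is idempotent, returning 0 straight away
+ * once cert->sig.rsa.s has been populated, and the shash descriptor is
+ * carved out of the same allocation as the digest it fills in.)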
*/ -static int x509_check_signature(const struct public_key *pub, - const struct x509_certificate *cert) +int x509_get_sig_params(struct x509_certificate *cert) { - struct public_key_signature *sig; struct crypto_shash *tfm; struct shash_desc *desc; size_t digest_size, desc_size; + void *digest; int ret; pr_devel("==>%s()\n", __func__); - + + if (cert->sig.rsa.s) + return 0; + + cert->sig.rsa.s = mpi_read_raw_data(cert->raw_sig, cert->raw_sig_size); + if (!cert->sig.rsa.s) + return -ENOMEM; + cert->sig.nr_mpi = 1; + /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[cert->sig_hash_algo], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[cert->sig.pkey_hash_algo], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); digest_size = crypto_shash_digestsize(tfm); - /* We allocate the hash operational data storage on the end of our - * context data. + /* We allocate the hash operational data storage on the end of the + * digest storage space. */ ret = -ENOMEM; - sig = kzalloc(sizeof(*sig) + desc_size + digest_size, GFP_KERNEL); - if (!sig) - goto error_no_sig; + digest = kzalloc(digest_size + desc_size, GFP_KERNEL); + if (!digest) + goto error; - sig->pkey_hash_algo = cert->sig_hash_algo; - sig->digest = (u8 *)sig + sizeof(*sig) + desc_size; - sig->digest_size = digest_size; + cert->sig.digest = digest; + cert->sig.digest_size = digest_size; - desc = (void *)sig + sizeof(*sig); - desc->tfm = tfm; - desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc = digest + digest_size; + desc->tfm = tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; ret = crypto_shash_init(desc); if (ret < 0) goto error; + might_sleep(); + ret = crypto_shash_finup(desc, cert->tbs, cert->tbs_size, digest); +error: + crypto_free_shash(tfm); + pr_devel("<==%s() = %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(x509_get_sig_params); - ret = -ENOMEM; - sig->rsa.s = mpi_read_raw_data(cert->sig, cert->sig_size); - if (!sig->rsa.s) - goto error; +/* + * Check the signature on a certificate using the provided public key + */ +int x509_check_signature(const struct public_key *pub, + struct x509_certificate *cert) +{ + int ret; - ret = crypto_shash_finup(desc, cert->tbs, cert->tbs_size, sig->digest); - if (ret < 0) - goto error_mpi; + pr_devel("==>%s()\n", __func__); - ret = pub->algo->verify_signature(pub, sig); + ret = x509_get_sig_params(cert); + if (ret < 0) + return ret; + ret = public_key_verify_signature(pub, &cert->sig); pr_debug("Cert Verification: %d\n", ret); + return ret; +} +EXPORT_SYMBOL_GPL(x509_check_signature); -error_mpi: - mpi_free(sig->rsa.s); -error: - kfree(sig); -error_no_sig: - crypto_free_shash(tfm); +/* + * Check the new certificate against the ones in the trust keyring. If one of + * those is the signing key and validates the new certificate, then mark the + * new certificate as being trusted. + * + * Return 0 if the new certificate was successfully validated, 1 if we couldn't + * find a matching parent certificate in the trusted list and an error if there + * is a matching certificate but the signature check fails. 
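+ * (x509_key_preparse() further down depends on this convention: it sets
+ * prep->trusted only when 0 comes back from here.)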
+ */ +static int x509_validate_trust(struct x509_certificate *cert, + struct key *trust_keyring) +{ + const struct public_key *pk; + struct key *key; + int ret = 1; - pr_devel("<==%s() = %d\n", __func__, ret); + key = x509_request_asymmetric_key(trust_keyring, + cert->issuer, strlen(cert->issuer), + cert->authority, + strlen(cert->authority)); + if (!IS_ERR(key)) { + pk = key->payload.data; + ret = x509_check_signature(pk, cert); + } return ret; } @@ -106,7 +183,6 @@ error_no_sig: static int x509_key_preparse(struct key_preparsed_payload *prep) { struct x509_certificate *cert; - struct tm now; size_t srlen, sulen; char *desc = NULL; int ret; @@ -117,7 +193,18 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) pr_devel("Cert Issuer: %s\n", cert->issuer); pr_devel("Cert Subject: %s\n", cert->subject); - pr_devel("Cert Key Algo: %s\n", pkey_algo[cert->pkey_algo]); + + if (cert->pub->pkey_algo >= PKEY_ALGO__LAST || + cert->sig.pkey_algo >= PKEY_ALGO__LAST || + cert->sig.pkey_hash_algo >= PKEY_HASH__LAST || + !pkey_algo[cert->pub->pkey_algo] || + !pkey_algo[cert->sig.pkey_algo] || + !hash_algo_name[cert->sig.pkey_hash_algo]) { + ret = -ENOPKG; + goto error_free_cert; + } + + pr_devel("Cert Key Algo: %s\n", pkey_algo_name[cert->pub->pkey_algo]); pr_devel("Cert Valid From: %04ld-%02d-%02d %02d:%02d:%02d\n", cert->valid_from.tm_year + 1900, cert->valid_from.tm_mon + 1, cert->valid_from.tm_mday, cert->valid_from.tm_hour, @@ -127,61 +214,29 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) cert->valid_to.tm_mday, cert->valid_to.tm_hour, cert->valid_to.tm_min, cert->valid_to.tm_sec); pr_devel("Cert Signature: %s + %s\n", - pkey_algo[cert->sig_pkey_algo], - pkey_hash_algo[cert->sig_hash_algo]); + pkey_algo_name[cert->sig.pkey_algo], + hash_algo_name[cert->sig.pkey_hash_algo]); - if (!cert->fingerprint || !cert->authority) { - pr_warn("Cert for '%s' must have SubjKeyId and AuthKeyId extensions\n", + if (!cert->fingerprint) { + pr_warn("Cert for '%s' must have a SubjKeyId extension\n", cert->subject); ret = -EKEYREJECTED; goto error_free_cert; } - time_to_tm(CURRENT_TIME.tv_sec, 0, &now); - pr_devel("Now: %04ld-%02d-%02d %02d:%02d:%02d\n", - now.tm_year + 1900, now.tm_mon + 1, now.tm_mday, - now.tm_hour, now.tm_min, now.tm_sec); - if (now.tm_year < cert->valid_from.tm_year || - (now.tm_year == cert->valid_from.tm_year && - (now.tm_mon < cert->valid_from.tm_mon || - (now.tm_mon == cert->valid_from.tm_mon && - (now.tm_mday < cert->valid_from.tm_mday || - (now.tm_mday == cert->valid_from.tm_mday && - (now.tm_hour < cert->valid_from.tm_hour || - (now.tm_hour == cert->valid_from.tm_hour && - (now.tm_min < cert->valid_from.tm_min || - (now.tm_min == cert->valid_from.tm_min && - (now.tm_sec < cert->valid_from.tm_sec - ))))))))))) { - pr_warn("Cert %s is not yet valid\n", cert->fingerprint); - ret = -EKEYREJECTED; - goto error_free_cert; - } - if (now.tm_year > cert->valid_to.tm_year || - (now.tm_year == cert->valid_to.tm_year && - (now.tm_mon > cert->valid_to.tm_mon || - (now.tm_mon == cert->valid_to.tm_mon && - (now.tm_mday > cert->valid_to.tm_mday || - (now.tm_mday == cert->valid_to.tm_mday && - (now.tm_hour > cert->valid_to.tm_hour || - (now.tm_hour == cert->valid_to.tm_hour && - (now.tm_min > cert->valid_to.tm_min || - (now.tm_min == cert->valid_to.tm_min && - (now.tm_sec > cert->valid_to.tm_sec - ))))))))))) { - pr_warn("Cert %s has expired\n", cert->fingerprint); - ret = -EKEYEXPIRED; - goto error_free_cert; - } - - cert->pub->algo = 
x509_public_key_algorithms[cert->pkey_algo]; + cert->pub->algo = pkey_algo[cert->pub->pkey_algo]; cert->pub->id_type = PKEY_ID_X509; - /* Check the signature on the key */ - if (strcmp(cert->fingerprint, cert->authority) == 0) { - ret = x509_check_signature(cert->pub, cert); + /* Check the signature on the key if it appears to be self-signed */ + if (!cert->authority || + strcmp(cert->fingerprint, cert->authority) == 0) { + ret = x509_check_signature(cert->pub, cert); /* self-signed */ if (ret < 0) goto error_free_cert; + } else { + ret = x509_validate_trust(cert, system_trusted_keyring); + if (!ret) + prep->trusted = 1; } /* Propose a description */ @@ -237,3 +292,6 @@ static void __exit x509_key_exit(void) module_init(x509_key_init); module_exit(x509_key_exit); + +MODULE_DESCRIPTION("X.509 certificate parser"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c index 9e62feffb37..f8c0b8dbeb7 100644 --- a/crypto/async_tx/async_memcpy.c +++ b/crypto/async_tx/async_memcpy.c @@ -50,33 +50,36 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, &dest, 1, &src, 1, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; + struct dmaengine_unmap_data *unmap = NULL; - if (device && is_dma_copy_aligned(device, src_offset, dest_offset, len)) { - dma_addr_t dma_dest, dma_src; + if (device) + unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOIO); + + if (unmap && is_dma_copy_aligned(device, src_offset, dest_offset, len)) { unsigned long dma_prep_flags = 0; if (submit->cb_fn) dma_prep_flags |= DMA_PREP_INTERRUPT; if (submit->flags & ASYNC_TX_FENCE) dma_prep_flags |= DMA_PREP_FENCE; - dma_dest = dma_map_page(device->dev, dest, dest_offset, len, - DMA_FROM_DEVICE); - - dma_src = dma_map_page(device->dev, src, src_offset, len, - DMA_TO_DEVICE); - - tx = device->device_prep_dma_memcpy(chan, dma_dest, dma_src, - len, dma_prep_flags); - if (!tx) { - dma_unmap_page(device->dev, dma_dest, len, - DMA_FROM_DEVICE); - dma_unmap_page(device->dev, dma_src, len, - DMA_TO_DEVICE); - } + + unmap->to_cnt = 1; + unmap->addr[0] = dma_map_page(device->dev, src, src_offset, len, + DMA_TO_DEVICE); + unmap->from_cnt = 1; + unmap->addr[1] = dma_map_page(device->dev, dest, dest_offset, len, + DMA_FROM_DEVICE); + unmap->len = len; + + tx = device->device_prep_dma_memcpy(chan, unmap->addr[1], + unmap->addr[0], len, + dma_prep_flags); } if (tx) { pr_debug("%s: (async) len: %zu\n", __func__, len); + + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); } else { void *dest_buf, *src_buf; @@ -96,6 +99,8 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, async_tx_sync_epilog(submit); } + dmaengine_unmap_put(unmap); + return tx; } EXPORT_SYMBOL_GPL(async_memcpy); diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 91d5d385899..d05327caf69 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -46,49 +46,24 @@ static struct page *pq_scribble_page; * do_async_gen_syndrome - asynchronously calculate P and/or Q */ static __async_inline struct dma_async_tx_descriptor * -do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks, - const unsigned char *scfs, unsigned int offset, int disks, - size_t len, dma_addr_t *dma_src, +do_async_gen_syndrome(struct dma_chan *chan, + const unsigned char *scfs, int disks, + struct dmaengine_unmap_data *unmap, + enum dma_ctrl_flags dma_flags, struct async_submit_ctl *submit) { struct 
dma_async_tx_descriptor *tx = NULL; struct dma_device *dma = chan->device; - enum dma_ctrl_flags dma_flags = 0; enum async_tx_flags flags_orig = submit->flags; dma_async_tx_callback cb_fn_orig = submit->cb_fn; dma_async_tx_callback cb_param_orig = submit->cb_param; int src_cnt = disks - 2; - unsigned char coefs[src_cnt]; unsigned short pq_src_cnt; dma_addr_t dma_dest[2]; int src_off = 0; - int idx; - int i; - /* DMAs use destinations as sources, so use BIDIRECTIONAL mapping */ - if (P(blocks, disks)) - dma_dest[0] = dma_map_page(dma->dev, P(blocks, disks), offset, - len, DMA_BIDIRECTIONAL); - else - dma_flags |= DMA_PREP_PQ_DISABLE_P; - if (Q(blocks, disks)) - dma_dest[1] = dma_map_page(dma->dev, Q(blocks, disks), offset, - len, DMA_BIDIRECTIONAL); - else - dma_flags |= DMA_PREP_PQ_DISABLE_Q; - - /* convert source addresses being careful to collapse 'empty' - * sources and update the coefficients accordingly - */ - for (i = 0, idx = 0; i < src_cnt; i++) { - if (blocks[i] == NULL) - continue; - dma_src[idx] = dma_map_page(dma->dev, blocks[i], offset, len, - DMA_TO_DEVICE); - coefs[idx] = scfs[i]; - idx++; - } - src_cnt = idx; + if (submit->flags & ASYNC_TX_FENCE) + dma_flags |= DMA_PREP_FENCE; while (src_cnt > 0) { submit->flags = flags_orig; @@ -100,28 +75,25 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks, if (src_cnt > pq_src_cnt) { submit->flags &= ~ASYNC_TX_ACK; submit->flags |= ASYNC_TX_FENCE; - dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP; submit->cb_fn = NULL; submit->cb_param = NULL; } else { - dma_flags &= ~DMA_COMPL_SKIP_DEST_UNMAP; submit->cb_fn = cb_fn_orig; submit->cb_param = cb_param_orig; if (cb_fn_orig) dma_flags |= DMA_PREP_INTERRUPT; } - if (submit->flags & ASYNC_TX_FENCE) - dma_flags |= DMA_PREP_FENCE; - /* Since we have clobbered the src_list we are committed - * to doing this asynchronously. Drivers force forward - * progress in case they can not provide a descriptor + /* Drivers force forward progress in case they can not provide + * a descriptor */ for (;;) { + dma_dest[0] = unmap->addr[disks - 2]; + dma_dest[1] = unmap->addr[disks - 1]; tx = dma->device_prep_dma_pq(chan, dma_dest, - &dma_src[src_off], + &unmap->addr[src_off], pq_src_cnt, - &coefs[src_off], len, + &scfs[src_off], unmap->len, dma_flags); if (likely(tx)) break; @@ -129,6 +101,7 @@ do_async_gen_syndrome(struct dma_chan *chan, struct page **blocks, dma_async_issue_pending(chan); } + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); submit->depend_tx = tx; @@ -188,10 +161,6 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, * set to NULL those buffers will be replaced with the raid6_zero_page * in the synchronous path and omitted in the hardware-asynchronous * path. - * - * 'blocks' note: if submit->scribble is NULL then the contents of - * 'blocks' may be overwritten to perform address conversions - * (dma_map_page() or page_address()). */ struct dma_async_tx_descriptor * async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, @@ -202,26 +171,69 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, &P(blocks, disks), 2, blocks, src_cnt, len); struct dma_device *device = chan ? 
chan->device : NULL; - dma_addr_t *dma_src = NULL; + struct dmaengine_unmap_data *unmap = NULL; BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks))); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) blocks; + if (device) + unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (dma_src && device && + if (unmap && (src_cnt <= dma_maxpq(device, 0) || dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && is_dma_pq_aligned(device, offset, 0, len)) { + struct dma_async_tx_descriptor *tx; + enum dma_ctrl_flags dma_flags = 0; + unsigned char coefs[src_cnt]; + int i, j; + /* run the p+q asynchronously */ pr_debug("%s: (async) disks: %d len: %zu\n", __func__, disks, len); - return do_async_gen_syndrome(chan, blocks, raid6_gfexp, offset, - disks, len, dma_src, submit); + + /* convert source addresses being careful to collapse 'empty' + * sources and update the coefficients accordingly + */ + unmap->len = len; + for (i = 0, j = 0; i < src_cnt; i++) { + if (blocks[i] == NULL) + continue; + unmap->addr[j] = dma_map_page(device->dev, blocks[i], offset, + len, DMA_TO_DEVICE); + coefs[j] = raid6_gfexp[i]; + unmap->to_cnt++; + j++; + } + + /* + * DMAs use destinations as sources, + * so use BIDIRECTIONAL mapping + */ + unmap->bidi_cnt++; + if (P(blocks, disks)) + unmap->addr[j++] = dma_map_page(device->dev, P(blocks, disks), + offset, len, DMA_BIDIRECTIONAL); + else { + unmap->addr[j++] = 0; + dma_flags |= DMA_PREP_PQ_DISABLE_P; + } + + unmap->bidi_cnt++; + if (Q(blocks, disks)) + unmap->addr[j++] = dma_map_page(device->dev, Q(blocks, disks), + offset, len, DMA_BIDIRECTIONAL); + else { + unmap->addr[j++] = 0; + dma_flags |= DMA_PREP_PQ_DISABLE_Q; + } + + tx = do_async_gen_syndrome(chan, coefs, j, unmap, dma_flags, submit); + dmaengine_unmap_put(unmap); + return tx; } + dmaengine_unmap_put(unmap); + /* run the pq synchronously */ pr_debug("%s: (sync) disks: %d len: %zu\n", __func__, disks, len); @@ -277,50 +289,60 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, struct dma_async_tx_descriptor *tx; unsigned char coefs[disks-2]; enum dma_ctrl_flags dma_flags = submit->cb_fn ? 
DMA_PREP_INTERRUPT : 0; - dma_addr_t *dma_src = NULL; - int src_cnt = 0; + struct dmaengine_unmap_data *unmap = NULL; BUG_ON(disks < 4); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) blocks; + if (device) + unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (dma_src && device && disks <= dma_maxpq(device, 0) && + if (unmap && disks <= dma_maxpq(device, 0) && is_dma_pq_aligned(device, offset, 0, len)) { struct device *dev = device->dev; - dma_addr_t *pq = &dma_src[disks-2]; - int i; + dma_addr_t pq[2]; + int i, j = 0, src_cnt = 0; pr_debug("%s: (async) disks: %d len: %zu\n", __func__, disks, len); - if (!P(blocks, disks)) + + unmap->len = len; + for (i = 0; i < disks-2; i++) + if (likely(blocks[i])) { + unmap->addr[j] = dma_map_page(dev, blocks[i], + offset, len, + DMA_TO_DEVICE); + coefs[j] = raid6_gfexp[i]; + unmap->to_cnt++; + src_cnt++; + j++; + } + + if (!P(blocks, disks)) { + pq[0] = 0; dma_flags |= DMA_PREP_PQ_DISABLE_P; - else + } else { pq[0] = dma_map_page(dev, P(blocks, disks), offset, len, DMA_TO_DEVICE); - if (!Q(blocks, disks)) + unmap->addr[j++] = pq[0]; + unmap->to_cnt++; + } + if (!Q(blocks, disks)) { + pq[1] = 0; dma_flags |= DMA_PREP_PQ_DISABLE_Q; - else + } else { pq[1] = dma_map_page(dev, Q(blocks, disks), offset, len, DMA_TO_DEVICE); + unmap->addr[j++] = pq[1]; + unmap->to_cnt++; + } if (submit->flags & ASYNC_TX_FENCE) dma_flags |= DMA_PREP_FENCE; - for (i = 0; i < disks-2; i++) - if (likely(blocks[i])) { - dma_src[src_cnt] = dma_map_page(dev, blocks[i], - offset, len, - DMA_TO_DEVICE); - coefs[src_cnt] = raid6_gfexp[i]; - src_cnt++; - } - for (;;) { - tx = device->device_prep_dma_pq_val(chan, pq, dma_src, + tx = device->device_prep_dma_pq_val(chan, pq, + unmap->addr, src_cnt, coefs, len, pqres, @@ -330,6 +352,8 @@ async_syndrome_val(struct page **blocks, unsigned int offset, int disks, async_tx_quiesce(&submit->depend_tx); dma_async_issue_pending(chan); } + + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); return tx; diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c index a9f08a6a582..934a8498149 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -26,6 +26,7 @@ #include <linux/dma-mapping.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> +#include <linux/dmaengine.h> static struct dma_async_tx_descriptor * async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, @@ -34,35 +35,45 @@ async_sum_product(struct page *dest, struct page **srcs, unsigned char *coef, struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, &dest, 1, srcs, 2, len); struct dma_device *dma = chan ? 
chan->device : NULL; + struct dmaengine_unmap_data *unmap = NULL; const u8 *amul, *bmul; u8 ax, bx; u8 *a, *b, *c; - if (dma) { - dma_addr_t dma_dest[2]; - dma_addr_t dma_src[2]; + if (dma) + unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO); + + if (unmap) { struct device *dev = dma->dev; + dma_addr_t pq[2]; struct dma_async_tx_descriptor *tx; enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; if (submit->flags & ASYNC_TX_FENCE) dma_flags |= DMA_PREP_FENCE; - dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); - dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE); - dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE); - tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 2, coef, + unmap->addr[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE); + unmap->addr[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE); + unmap->to_cnt = 2; + + unmap->addr[2] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); + unmap->bidi_cnt = 1; + /* engine only looks at Q, but expects it to follow P */ + pq[1] = unmap->addr[2]; + + unmap->len = len; + tx = dma->device_prep_dma_pq(chan, pq, unmap->addr, 2, coef, len, dma_flags); if (tx) { + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); + dmaengine_unmap_put(unmap); return tx; } /* could not get a descriptor, unmap and fall through to * the synchronous path */ - dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE); - dma_unmap_page(dev, dma_src[1], len, DMA_TO_DEVICE); + dmaengine_unmap_put(unmap); } /* run the operation synchronously */ @@ -89,23 +100,38 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len, struct dma_chan *chan = async_tx_find_channel(submit, DMA_PQ, &dest, 1, &src, 1, len); struct dma_device *dma = chan ? 
chan->device : NULL; + struct dmaengine_unmap_data *unmap = NULL; const u8 *qmul; /* Q multiplier table */ u8 *d, *s; - if (dma) { + if (dma) + unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO); + + if (unmap) { dma_addr_t dma_dest[2]; - dma_addr_t dma_src[1]; struct device *dev = dma->dev; struct dma_async_tx_descriptor *tx; enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P; if (submit->flags & ASYNC_TX_FENCE) dma_flags |= DMA_PREP_FENCE; - dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); - dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE); - tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef, - len, dma_flags); + unmap->addr[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE); + unmap->to_cnt++; + unmap->addr[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL); + dma_dest[1] = unmap->addr[1]; + unmap->bidi_cnt++; + unmap->len = len; + + /* this looks funny, but the engine looks for Q at + * dma_dest[1] and ignores dma_dest[0] as a dest + * due to DMA_PREP_PQ_DISABLE_P + */ + tx = dma->device_prep_dma_pq(chan, dma_dest, unmap->addr, + 1, &coef, len, dma_flags); + if (tx) { + dma_set_unmap(tx, unmap); + dmaengine_unmap_put(unmap); async_tx_submit(chan, tx, submit); return tx; } @@ -113,8 +139,7 @@ async_mult(struct page *dest, struct page *src, u8 coef, size_t len, /* could not get a descriptor, unmap and fall through to * the synchronous path */ - dma_unmap_page(dev, dma_dest[1], len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, dma_src[0], len, DMA_TO_DEVICE); + dmaengine_unmap_put(unmap); } /* no channel available, or failed to allocate a descriptor, so diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index 7be34248b45..39ea4791a3c 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -128,7 +128,7 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, } device->device_issue_pending(chan); } else { - if (dma_wait_for_async_tx(depend_tx) != DMA_SUCCESS) + if (dma_wait_for_async_tx(depend_tx) != DMA_COMPLETE) panic("%s: DMA error waiting for depend_tx\n", __func__); tx->tx_submit(tx); @@ -280,7 +280,7 @@ void async_tx_quiesce(struct dma_async_tx_descriptor **tx) * we are referring to the correct operation */ BUG_ON(async_tx_test_ack(*tx)); - if (dma_wait_for_async_tx(*tx) != DMA_SUCCESS) + if (dma_wait_for_async_tx(*tx) != DMA_COMPLETE) panic("%s: DMA error waiting for transaction\n", __func__); async_tx_ack(*tx); diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 8ade0a0481c..3c562f5a60b 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -33,48 +33,31 @@ /* do_async_xor - dma map the pages and perform the xor with an engine */ static __async_inline struct dma_async_tx_descriptor * -do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, - unsigned int offset, int src_cnt, size_t len, dma_addr_t *dma_src, +do_async_xor(struct dma_chan *chan, struct dmaengine_unmap_data *unmap, struct async_submit_ctl *submit) { struct dma_device *dma = chan->device; struct dma_async_tx_descriptor *tx = NULL; - int src_off = 0; - int i; dma_async_tx_callback cb_fn_orig = submit->cb_fn; void *cb_param_orig = submit->cb_param; enum async_tx_flags flags_orig = submit->flags; - enum dma_ctrl_flags dma_flags; - int xor_src_cnt = 0; - dma_addr_t dma_dest; - - /* map the dest bidrectional in case it is re-used as a source */ - dma_dest = dma_map_page(dma->dev, dest, offset, len, DMA_BIDIRECTIONAL); - for (i = 0; i < src_cnt; i++) { - 
/* only map the dest once */ - if (!src_list[i]) - continue; - if (unlikely(src_list[i] == dest)) { - dma_src[xor_src_cnt++] = dma_dest; - continue; - } - dma_src[xor_src_cnt++] = dma_map_page(dma->dev, src_list[i], offset, - len, DMA_TO_DEVICE); - } - src_cnt = xor_src_cnt; + enum dma_ctrl_flags dma_flags = 0; + int src_cnt = unmap->to_cnt; + int xor_src_cnt; + dma_addr_t dma_dest = unmap->addr[unmap->to_cnt]; + dma_addr_t *src_list = unmap->addr; while (src_cnt) { + dma_addr_t tmp; + submit->flags = flags_orig; - dma_flags = 0; xor_src_cnt = min(src_cnt, (int)dma->max_xor); - /* if we are submitting additional xors, leave the chain open, - * clear the callback parameters, and leave the destination - * buffer mapped + /* if we are submitting additional xors, leave the chain open + * and clear the callback parameters */ if (src_cnt > xor_src_cnt) { submit->flags &= ~ASYNC_TX_ACK; submit->flags |= ASYNC_TX_FENCE; - dma_flags = DMA_COMPL_SKIP_DEST_UNMAP; submit->cb_fn = NULL; submit->cb_param = NULL; } else { @@ -85,12 +68,18 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, dma_flags |= DMA_PREP_INTERRUPT; if (submit->flags & ASYNC_TX_FENCE) dma_flags |= DMA_PREP_FENCE; - /* Since we have clobbered the src_list we are committed - * to doing this asynchronously. Drivers force forward progress - * in case they can not provide a descriptor + + /* Drivers force forward progress in case they can not provide a + * descriptor */ - tx = dma->device_prep_dma_xor(chan, dma_dest, &dma_src[src_off], - xor_src_cnt, len, dma_flags); + tmp = src_list[0]; + if (src_list > unmap->addr) + src_list[0] = dma_dest; + tx = dma->device_prep_dma_xor(chan, dma_dest, src_list, + xor_src_cnt, unmap->len, + dma_flags); + src_list[0] = tmp; + if (unlikely(!tx)) async_tx_quiesce(&submit->depend_tx); @@ -99,22 +88,21 @@ do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, while (unlikely(!tx)) { dma_async_issue_pending(chan); tx = dma->device_prep_dma_xor(chan, dma_dest, - &dma_src[src_off], - xor_src_cnt, len, + src_list, + xor_src_cnt, unmap->len, dma_flags); } + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); submit->depend_tx = tx; if (src_cnt > xor_src_cnt) { /* drop completed sources */ src_cnt -= xor_src_cnt; - src_off += xor_src_cnt; - /* use the intermediate result a source */ - dma_src[--src_off] = dma_dest; src_cnt++; + src_list += xor_src_cnt - 1; } else break; } @@ -189,22 +177,40 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, struct dma_chan *chan = async_tx_find_channel(submit, DMA_XOR, &dest, 1, src_list, src_cnt, len); - dma_addr_t *dma_src = NULL; + struct dma_device *device = chan ? 
chan->device : NULL; + struct dmaengine_unmap_data *unmap = NULL; BUG_ON(src_cnt <= 1); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) src_list; + if (device) + unmap = dmaengine_get_unmap_data(device->dev, src_cnt+1, GFP_NOIO); + + if (unmap && is_dma_xor_aligned(device, offset, 0, len)) { + struct dma_async_tx_descriptor *tx; + int i, j; - if (dma_src && chan && is_dma_xor_aligned(chan->device, offset, 0, len)) { /* run the xor asynchronously */ pr_debug("%s (async): len: %zu\n", __func__, len); - return do_async_xor(chan, dest, src_list, offset, src_cnt, len, - dma_src, submit); + unmap->len = len; + for (i = 0, j = 0; i < src_cnt; i++) { + if (!src_list[i]) + continue; + unmap->to_cnt++; + unmap->addr[j++] = dma_map_page(device->dev, src_list[i], + offset, len, DMA_TO_DEVICE); + } + + /* map it bidirectional as it may be re-used as a source */ + unmap->addr[j] = dma_map_page(device->dev, dest, offset, len, + DMA_BIDIRECTIONAL); + unmap->bidi_cnt = 1; + + tx = do_async_xor(chan, unmap, submit); + dmaengine_unmap_put(unmap); + return tx; } else { + dmaengine_unmap_put(unmap); /* run the xor synchronously */ pr_debug("%s (sync): len: %zu\n", __func__, len); WARN_ONCE(chan, "%s: no space for dma address conversion\n", @@ -268,16 +274,14 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, struct dma_chan *chan = xor_val_chan(submit, dest, src_list, src_cnt, len); struct dma_device *device = chan ? chan->device : NULL; struct dma_async_tx_descriptor *tx = NULL; - dma_addr_t *dma_src = NULL; + struct dmaengine_unmap_data *unmap = NULL; BUG_ON(src_cnt <= 1); - if (submit->scribble) - dma_src = submit->scribble; - else if (sizeof(dma_addr_t) <= sizeof(struct page *)) - dma_src = (dma_addr_t *) src_list; + if (device) + unmap = dmaengine_get_unmap_data(device->dev, src_cnt, GFP_NOIO); - if (dma_src && device && src_cnt <= device->max_xor && + if (unmap && src_cnt <= device->max_xor && is_dma_xor_aligned(device, offset, 0, len)) { unsigned long dma_prep_flags = 0; int i; @@ -288,11 +292,15 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, dma_prep_flags |= DMA_PREP_INTERRUPT; if (submit->flags & ASYNC_TX_FENCE) dma_prep_flags |= DMA_PREP_FENCE; - for (i = 0; i < src_cnt; i++) - dma_src[i] = dma_map_page(device->dev, src_list[i], - offset, len, DMA_TO_DEVICE); - tx = device->device_prep_dma_xor_val(chan, dma_src, src_cnt, + for (i = 0; i < src_cnt; i++) { + unmap->addr[i] = dma_map_page(device->dev, src_list[i], + offset, len, DMA_TO_DEVICE); + unmap->to_cnt++; + } + unmap->len = len; + + tx = device->device_prep_dma_xor_val(chan, unmap->addr, src_cnt, len, result, dma_prep_flags); if (unlikely(!tx)) { @@ -301,11 +309,11 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, while (!tx) { dma_async_issue_pending(chan); tx = device->device_prep_dma_xor_val(chan, - dma_src, src_cnt, len, result, + unmap->addr, src_cnt, len, result, dma_prep_flags); } } - + dma_set_unmap(tx, unmap); async_tx_submit(chan, tx, submit); } else { enum async_tx_flags flags_orig = submit->flags; @@ -327,6 +335,7 @@ async_xor_val(struct page *dest, struct page **src_list, unsigned int offset, async_tx_sync_epilog(submit); submit->flags = flags_orig; } + dmaengine_unmap_put(unmap); return tx; } diff --git a/crypto/async_tx/raid6test.c b/crypto/async_tx/raid6test.c index 4a92bac744d..dad95f45b88 100644 --- a/crypto/async_tx/raid6test.c +++ 
b/crypto/async_tx/raid6test.c @@ -28,7 +28,7 @@ #undef pr #define pr(fmt, args...) pr_info("raid6test: " fmt, ##args) -#define NDISKS 16 /* Including P and Q */ +#define NDISKS 64 /* Including P and Q */ static struct page *dataptrs[NDISKS]; static addr_conv_t addr_conv[NDISKS]; @@ -219,6 +219,14 @@ static int raid6_test(void) err += test(11, &tests); err += test(12, &tests); } + + /* the 24 disk case is special for ioatdma as it is the boudary point + * at which it needs to switch from 8-source ops to 16-source + * ops for continuation (assumes DMA_HAS_PQ_CONTINUE is not set) + */ + if (NDISKS > 24) + err += test(24, &tests); + err += test(NDISKS, &tests); pr("\n"); diff --git a/crypto/hash_info.c b/crypto/hash_info.c new file mode 100644 index 00000000000..3e7ff46f26e --- /dev/null +++ b/crypto/hash_info.c @@ -0,0 +1,56 @@ +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <linux/export.h> +#include <crypto/hash_info.h> + +const char *const hash_algo_name[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = "md4", + [HASH_ALGO_MD5] = "md5", + [HASH_ALGO_SHA1] = "sha1", + [HASH_ALGO_RIPE_MD_160] = "rmd160", + [HASH_ALGO_SHA256] = "sha256", + [HASH_ALGO_SHA384] = "sha384", + [HASH_ALGO_SHA512] = "sha512", + [HASH_ALGO_SHA224] = "sha224", + [HASH_ALGO_RIPE_MD_128] = "rmd128", + [HASH_ALGO_RIPE_MD_256] = "rmd256", + [HASH_ALGO_RIPE_MD_320] = "rmd320", + [HASH_ALGO_WP_256] = "wp256", + [HASH_ALGO_WP_384] = "wp384", + [HASH_ALGO_WP_512] = "wp512", + [HASH_ALGO_TGR_128] = "tgr128", + [HASH_ALGO_TGR_160] = "tgr160", + [HASH_ALGO_TGR_192] = "tgr192", +}; +EXPORT_SYMBOL_GPL(hash_algo_name); + +const int hash_digest_size[HASH_ALGO__LAST] = { + [HASH_ALGO_MD4] = MD5_DIGEST_SIZE, + [HASH_ALGO_MD5] = MD5_DIGEST_SIZE, + [HASH_ALGO_SHA1] = SHA1_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_160] = RMD160_DIGEST_SIZE, + [HASH_ALGO_SHA256] = SHA256_DIGEST_SIZE, + [HASH_ALGO_SHA384] = SHA384_DIGEST_SIZE, + [HASH_ALGO_SHA512] = SHA512_DIGEST_SIZE, + [HASH_ALGO_SHA224] = SHA224_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_128] = RMD128_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_256] = RMD256_DIGEST_SIZE, + [HASH_ALGO_RIPE_MD_320] = RMD320_DIGEST_SIZE, + [HASH_ALGO_WP_256] = WP256_DIGEST_SIZE, + [HASH_ALGO_WP_384] = WP384_DIGEST_SIZE, + [HASH_ALGO_WP_512] = WP512_DIGEST_SIZE, + [HASH_ALGO_TGR_128] = TGR128_DIGEST_SIZE, + [HASH_ALGO_TGR_160] = TGR160_DIGEST_SIZE, + [HASH_ALGO_TGR_192] = TGR192_DIGEST_SIZE, +}; +EXPORT_SYMBOL_GPL(hash_digest_size); diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index c95df0b8c88..5d9248526d7 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -235,17 +235,6 @@ config ACPI_INITRD_TABLE_OVERRIDE initrd, therefore it's safe to say Y. See Documentation/acpi/initrd_table_override.txt for details -config ACPI_BLACKLIST_YEAR - int "Disable ACPI for systems before Jan 1st this year" if X86_32 - default 0 - help - Enter a 4-digit year, e.g., 2001, to disable ACPI by default - on platforms with DMI BIOS date before January 1st that year. - "acpi=force" can be used to override this mechanism. - - Enter 0 to disable this mechanism and allow ACPI to - run by default no matter what the year. 
(default) - config ACPI_DEBUG bool "Debug Statements" default n diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index b9f0d5f4bba..8711e379716 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -56,7 +56,6 @@ static int ac_sleep_before_get_state_ms; struct acpi_ac { struct power_supply charger; - struct acpi_device *adev; struct platform_device *pdev; unsigned long long state; }; @@ -70,8 +69,9 @@ struct acpi_ac { static int acpi_ac_get_state(struct acpi_ac *ac) { acpi_status status; + acpi_handle handle = ACPI_HANDLE(&ac->pdev->dev); - status = acpi_evaluate_integer(ac->adev->handle, "_PSR", NULL, + status = acpi_evaluate_integer(handle, "_PSR", NULL, &ac->state); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, @@ -119,6 +119,7 @@ static enum power_supply_property ac_props[] = { static void acpi_ac_notify_handler(acpi_handle handle, u32 event, void *data) { struct acpi_ac *ac = data; + struct acpi_device *adev; if (!ac) return; @@ -141,10 +142,11 @@ static void acpi_ac_notify_handler(acpi_handle handle, u32 event, void *data) msleep(ac_sleep_before_get_state_ms); acpi_ac_get_state(ac); - acpi_bus_generate_netlink_event(ac->adev->pnp.device_class, + adev = ACPI_COMPANION(&ac->pdev->dev); + acpi_bus_generate_netlink_event(adev->pnp.device_class, dev_name(&ac->pdev->dev), event, (u32) ac->state); - acpi_notifier_call_chain(ac->adev, event, (u32) ac->state); + acpi_notifier_call_chain(adev, event, (u32) ac->state); kobject_uevent(&ac->charger.dev->kobj, KOBJ_CHANGE); } @@ -178,8 +180,8 @@ static int acpi_ac_probe(struct platform_device *pdev) if (!pdev) return -EINVAL; - result = acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev); - if (result) + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) return -ENODEV; ac = kzalloc(sizeof(struct acpi_ac), GFP_KERNEL); @@ -188,7 +190,6 @@ static int acpi_ac_probe(struct platform_device *pdev) strcpy(acpi_device_name(adev), ACPI_AC_DEVICE_NAME); strcpy(acpi_device_class(adev), ACPI_AC_CLASS); - ac->adev = adev; ac->pdev = pdev; platform_set_drvdata(pdev, ac); diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index d3961014aad..6745fe137b9 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -163,6 +163,15 @@ static const struct acpi_device_id acpi_lpss_device_ids[] = { { "80860F41", (unsigned long)&byt_i2c_dev_desc }, { "INT33B2", }, + { "INT3430", (unsigned long)&lpt_dev_desc }, + { "INT3431", (unsigned long)&lpt_dev_desc }, + { "INT3432", (unsigned long)&lpt_dev_desc }, + { "INT3433", (unsigned long)&lpt_dev_desc }, + { "INT3434", (unsigned long)&lpt_uart_dev_desc }, + { "INT3435", (unsigned long)&lpt_uart_dev_desc }, + { "INT3436", (unsigned long)&lpt_sdio_dev_desc }, + { "INT3437", }, + { } }; diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 8a4cfc7e71f..dbfe49e5fd6 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -111,7 +111,7 @@ int acpi_create_platform_device(struct acpi_device *adev, pdevinfo.id = -1; pdevinfo.res = resources; pdevinfo.num_res = count; - pdevinfo.acpi_node.handle = adev->handle; + pdevinfo.acpi_node.companion = adev; pdev = platform_device_register_full(&pdevinfo); if (IS_ERR(pdev)) { dev_err(&adev->dev, "platform device creation failed: %ld\n", diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c index fb848378d58..078c4f7fe2d 100644 --- a/drivers/acpi/blacklist.c +++ b/drivers/acpi/blacklist.c @@ -75,39 +75,6 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = { {""} }; -#if 
CONFIG_ACPI_BLACKLIST_YEAR - -static int __init blacklist_by_year(void) -{ - int year; - - /* Doesn't exist? Likely an old system */ - if (!dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL)) { - printk(KERN_ERR PREFIX "no DMI BIOS year, " - "acpi=force is required to enable ACPI\n" ); - return 1; - } - /* 0? Likely a buggy new BIOS */ - if (year == 0) { - printk(KERN_ERR PREFIX "DMI BIOS year==0, " - "assuming ACPI-capable machine\n" ); - return 0; - } - if (year < CONFIG_ACPI_BLACKLIST_YEAR) { - printk(KERN_ERR PREFIX "BIOS age (%d) fails cutoff (%d), " - "acpi=force is required to enable ACPI\n", - year, CONFIG_ACPI_BLACKLIST_YEAR); - return 1; - } - return 0; -} -#else -static inline int blacklist_by_year(void) -{ - return 0; -} -#endif - int __init acpi_blacklisted(void) { int i = 0; @@ -166,8 +133,6 @@ int __init acpi_blacklisted(void) } } - blacklisted += blacklist_by_year(); - dmi_check_system(acpi_osi_dmi_table); return blacklisted; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index d42b2fb5a7e..b3480cf7db1 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -22,16 +22,12 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <linux/device.h> +#include <linux/acpi.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm_qos.h> #include <linux/pm_runtime.h> -#include <acpi/acpi.h> -#include <acpi/acpi_bus.h> -#include <acpi/acpi_drivers.h> - #include "internal.h" #define _COMPONENT ACPI_POWER_COMPONENT @@ -548,7 +544,7 @@ static int acpi_dev_pm_get_state(struct device *dev, struct acpi_device *adev, */ int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p, int d_max_in) { - acpi_handle handle = DEVICE_ACPI_HANDLE(dev); + acpi_handle handle = ACPI_HANDLE(dev); struct acpi_device *adev; int ret, d_min, d_max; @@ -656,7 +652,7 @@ int acpi_pm_device_run_wake(struct device *phys_dev, bool enable) if (!device_run_wake(phys_dev)) return -EINVAL; - handle = DEVICE_ACPI_HANDLE(phys_dev); + handle = ACPI_HANDLE(phys_dev); if (!handle || acpi_bus_get_device(handle, &adev)) { dev_dbg(phys_dev, "ACPI handle without context in %s!\n", __func__); @@ -700,7 +696,7 @@ int acpi_pm_device_sleep_wake(struct device *dev, bool enable) if (!device_can_wakeup(dev)) return -EINVAL; - handle = DEVICE_ACPI_HANDLE(dev); + handle = ACPI_HANDLE(dev); if (!handle || acpi_bus_get_device(handle, &adev)) { dev_dbg(dev, "ACPI handle without context in %s!\n", __func__); return -ENODEV; @@ -722,7 +718,7 @@ int acpi_pm_device_sleep_wake(struct device *dev, bool enable) */ struct acpi_device *acpi_dev_pm_get_node(struct device *dev) { - acpi_handle handle = DEVICE_ACPI_HANDLE(dev); + acpi_handle handle = ACPI_HANDLE(dev); struct acpi_device *adev; return handle && !acpi_bus_get_device(handle, &adev) ? 
adev : NULL; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d5309fd4945..ba5b56db9d2 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -173,9 +173,10 @@ static void start_transaction(struct acpi_ec *ec) static void advance_transaction(struct acpi_ec *ec, u8 status) { unsigned long flags; - struct transaction *t = ec->curr; + struct transaction *t; spin_lock_irqsave(&ec->lock, flags); + t = ec->curr; if (!t) goto unlock; if (t->wlen > t->wi) { diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 10f0f40587b..a22a295edb6 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -197,30 +197,28 @@ static void acpi_physnode_link_name(char *buf, unsigned int node_id) int acpi_bind_one(struct device *dev, acpi_handle handle) { - struct acpi_device *acpi_dev; - acpi_status status; + struct acpi_device *acpi_dev = NULL; struct acpi_device_physical_node *physical_node, *pn; char physical_node_name[PHYSICAL_NODE_NAME_SIZE]; struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; - if (ACPI_HANDLE(dev)) { + if (ACPI_COMPANION(dev)) { if (handle) { - dev_warn(dev, "ACPI handle is already set\n"); + dev_warn(dev, "ACPI companion already set\n"); return -EINVAL; } else { - handle = ACPI_HANDLE(dev); + acpi_dev = ACPI_COMPANION(dev); } + } else { + acpi_bus_get_device(handle, &acpi_dev); } - if (!handle) + if (!acpi_dev) return -EINVAL; + get_device(&acpi_dev->dev); get_device(dev); - status = acpi_bus_get_device(handle, &acpi_dev); - if (ACPI_FAILURE(status)) - goto err; - physical_node = kzalloc(sizeof(*physical_node), GFP_KERNEL); if (!physical_node) { retval = -ENOMEM; @@ -242,10 +240,11 @@ int acpi_bind_one(struct device *dev, acpi_handle handle) dev_warn(dev, "Already associated with ACPI node\n"); kfree(physical_node); - if (ACPI_HANDLE(dev) != handle) + if (ACPI_COMPANION(dev) != acpi_dev) goto err; put_device(dev); + put_device(&acpi_dev->dev); return 0; } if (pn->node_id == node_id) { @@ -259,8 +258,8 @@ int acpi_bind_one(struct device *dev, acpi_handle handle) list_add(&physical_node->node, physnode_list); acpi_dev->physical_node_count++; - if (!ACPI_HANDLE(dev)) - ACPI_HANDLE_SET(dev, acpi_dev->handle); + if (!ACPI_COMPANION(dev)) + ACPI_COMPANION_SET(dev, acpi_dev); acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, @@ -283,27 +282,21 @@ int acpi_bind_one(struct device *dev, acpi_handle handle) return 0; err: - ACPI_HANDLE_SET(dev, NULL); + ACPI_COMPANION_SET(dev, NULL); put_device(dev); + put_device(&acpi_dev->dev); return retval; } EXPORT_SYMBOL_GPL(acpi_bind_one); int acpi_unbind_one(struct device *dev) { + struct acpi_device *acpi_dev = ACPI_COMPANION(dev); struct acpi_device_physical_node *entry; - struct acpi_device *acpi_dev; - acpi_status status; - if (!ACPI_HANDLE(dev)) + if (!acpi_dev) return 0; - status = acpi_bus_get_device(ACPI_HANDLE(dev), &acpi_dev); - if (ACPI_FAILURE(status)) { - dev_err(dev, "Oops, ACPI handle corrupt in %s()\n", __func__); - return -EINVAL; - } - mutex_lock(&acpi_dev->physical_node_lock); list_for_each_entry(entry, &acpi_dev->physical_node_list, node) @@ -316,9 +309,10 @@ int acpi_unbind_one(struct device *dev) acpi_physnode_link_name(physnode_name, entry->node_id); sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name); sysfs_remove_link(&dev->kobj, "firmware_node"); - ACPI_HANDLE_SET(dev, NULL); - /* acpi_bind_one() increase refcnt by one. */ + ACPI_COMPANION_SET(dev, NULL); + /* Drop references taken by acpi_bind_one(). 
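+ * Both the physical device and its ACPI companion were pinned with
+ * get_device() in acpi_bind_one(), hence the pair of put_device()
+ * calls that follows.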
*/ put_device(dev); + put_device(&acpi_dev->dev); kfree(entry); break; } @@ -328,6 +322,15 @@ int acpi_unbind_one(struct device *dev) } EXPORT_SYMBOL_GPL(acpi_unbind_one); +void acpi_preset_companion(struct device *dev, acpi_handle parent, u64 addr) +{ + struct acpi_device *adev; + + if (!acpi_bus_get_device(acpi_get_child(parent, addr), &adev)) + ACPI_COMPANION_SET(dev, adev); +} +EXPORT_SYMBOL_GPL(acpi_preset_companion); + static int acpi_platform_notify(struct device *dev) { struct acpi_bus_type *type = acpi_get_bus_type(dev); diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 56f05869b08..0703bff5e60 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -575,6 +575,7 @@ static int acpi_pci_root_add(struct acpi_device *device, dev_err(&device->dev, "Bus %04x:%02x not present in PCI namespace\n", root->segment, (unsigned int)root->secondary.start); + device->driver_data = NULL; result = -ENODEV; goto end; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 55f9dedbbf9..15daa21fcd0 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -289,24 +289,17 @@ void acpi_bus_device_eject(void *data, u32 ost_src) { struct acpi_device *device = data; acpi_handle handle = device->handle; - struct acpi_scan_handler *handler; u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; int error; lock_device_hotplug(); mutex_lock(&acpi_scan_lock); - handler = device->handler; - if (!handler || !handler->hotplug.enabled) { - put_device(&device->dev); - goto err_support; - } - if (ost_src == ACPI_NOTIFY_EJECT_REQUEST) acpi_evaluate_hotplug_ost(handle, ACPI_NOTIFY_EJECT_REQUEST, ACPI_OST_SC_EJECT_IN_PROGRESS, NULL); - if (handler->hotplug.mode == AHM_CONTAINER) + if (device->handler && device->handler->hotplug.mode == AHM_CONTAINER) kobject_uevent(&device->dev.kobj, KOBJ_OFFLINE); error = acpi_scan_hot_remove(device); @@ -411,8 +404,7 @@ static void acpi_hotplug_notify_cb(acpi_handle handle, u32 type, void *data) break; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); - status = acpi_bus_get_device(handle, &adev); - if (ACPI_FAILURE(status)) + if (acpi_bus_get_device(handle, &adev)) goto err_out; get_device(&adev->dev); @@ -1997,6 +1989,7 @@ static int acpi_bus_scan_fixed(void) if (result) return result; + device->flags.match_driver = true; result = device_attach(&device->dev); if (result < 0) return result; @@ -2013,6 +2006,7 @@ static int acpi_bus_scan_fixed(void) if (result) return result; + device->flags.match_driver = true; result = device_attach(&device->dev); } diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index 18dbdff4656..995e91bcb97 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -82,13 +82,6 @@ static bool allow_duplicates; module_param(allow_duplicates, bool, 0644); /* - * Some BIOSes claim they use minimum backlight at boot, - * and this may bring dimming screen after boot - */ -static bool use_bios_initial_backlight = 1; -module_param(use_bios_initial_backlight, bool, 0644); - -/* * For Windows 8 systems: if set ture and the GPU driver has * registered a backlight interface, skip registering ACPI video's. 
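acpi_preset_companion(), added above, folds the acpi_get_child() lookup and the companion assignment into a single call, replacing the open-coded ACPI_HANDLE_SET() pattern used by callers such as libata further below. A minimal usage sketch, assuming a hypothetical bus driver that addresses its children by _ADR (foo_bind_child and its parameters are illustrative, not part of the patch):

        /* Bind 'child' to the ACPI device found at address 'addr' under
         * 'parent'.  If no matching ACPI device exists, the companion is
         * simply left unset. */
        static void foo_bind_child(struct device *child, acpi_handle parent,
                                   u64 addr)
        {
                acpi_preset_companion(child, parent, addr);
        }
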
*/ @@ -406,12 +399,6 @@ static int __init video_set_bqc_offset(const struct dmi_system_id *d) return 0; } -static int video_ignore_initial_backlight(const struct dmi_system_id *d) -{ - use_bios_initial_backlight = 0; - return 0; -} - static struct dmi_system_id video_dmi_table[] __initdata = { /* * Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121 @@ -456,54 +443,6 @@ static struct dmi_system_id video_dmi_table[] __initdata = { DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 7720"), }, }, - { - .callback = video_ignore_initial_backlight, - .ident = "HP Folio 13-2000", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP Folio 13 - 2000 Notebook PC"), - }, - }, - { - .callback = video_ignore_initial_backlight, - .ident = "Fujitsu E753", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "FUJITSU"), - DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK E753"), - }, - }, - { - .callback = video_ignore_initial_backlight, - .ident = "HP Pavilion dm4", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dm4 Notebook PC"), - }, - }, - { - .callback = video_ignore_initial_backlight, - .ident = "HP Pavilion g6 Notebook PC", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion g6 Notebook PC"), - }, - }, - { - .callback = video_ignore_initial_backlight, - .ident = "HP 1000 Notebook PC", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP 1000 Notebook PC"), - }, - }, - { - .callback = video_ignore_initial_backlight, - .ident = "HP Pavilion m4", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion m4 Notebook PC"), - }, - }, {} }; @@ -839,20 +778,18 @@ acpi_video_init_brightness(struct acpi_video_device *device) if (!device->cap._BQC) goto set_level; - if (use_bios_initial_backlight) { - level = acpi_video_bqc_value_to_level(device, level_old); - /* - * On some buggy laptops, _BQC returns an uninitialized - * value when invoked for the first time, i.e. - * level_old is invalid (no matter whether it's a level - * or an index). Set the backlight to max_level in this case. - */ - for (i = 2; i < br->count; i++) - if (level == br->levels[i]) - break; - if (i == br->count || !level) - level = max_level; - } + level = acpi_video_bqc_value_to_level(device, level_old); + /* + * On some buggy laptops, _BQC returns an uninitialized + * value when invoked for the first time, i.e. + * level_old is invalid (no matter whether it's a level + * or an index). Set the backlight to max_level in this case. 
+ */ + for (i = 2; i < br->count; i++) + if (level == br->levels[i]) + break; + if (i == br->count || !level) + level = max_level; set_level: result = acpi_video_device_lcd_set_level(device, level); diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index ab714d2ad97..4372cfa883c 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -185,7 +185,7 @@ void ata_acpi_bind_port(struct ata_port *ap) if (libata_noacpi || ap->flags & ATA_FLAG_ACPI_SATA || !host_handle) return; - ACPI_HANDLE_SET(&ap->tdev, acpi_get_child(host_handle, ap->port_no)); + acpi_preset_companion(&ap->tdev, host_handle, ap->port_no); if (ata_acpi_gtm(ap, &ap->__acpi_init_gtm) == 0) ap->pflags |= ATA_PFLAG_INIT_GTM_VALID; @@ -222,7 +222,7 @@ void ata_acpi_bind_dev(struct ata_device *dev) parent_handle = port_handle; } - ACPI_HANDLE_SET(&dev->tdev, acpi_get_child(parent_handle, adr)); + acpi_preset_companion(&dev->tdev, parent_handle, adr); register_hotplug_dock_device(ata_dev_acpi_handle(dev), &ata_acpi_dev_dock_ops, dev, NULL, NULL); diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c index 853f610af28..e88690ebfd8 100644 --- a/drivers/ata/pata_arasan_cf.c +++ b/drivers/ata/pata_arasan_cf.c @@ -396,8 +396,7 @@ dma_xfer(struct arasan_cf_dev *acdev, dma_addr_t src, dma_addr_t dest, u32 len) struct dma_async_tx_descriptor *tx; struct dma_chan *chan = acdev->dma_chan; dma_cookie_t cookie; - unsigned long flags = DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP | - DMA_COMPL_SKIP_DEST_UNMAP; + unsigned long flags = DMA_PREP_INTERRUPT; int ret = 0; tx = chan->device->device_prep_dma_memcpy(chan, dest, src, len, flags); diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 47051cd2511..3a94b799f16 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -432,7 +432,7 @@ struct platform_device *platform_device_register_full( goto err_alloc; pdev->dev.parent = pdevinfo->parent; - ACPI_HANDLE_SET(&pdev->dev, pdevinfo->acpi_node.handle); + ACPI_COMPANION_SET(&pdev->dev, pdevinfo->acpi_node.companion); if (pdevinfo->dma_mask) { /* @@ -463,7 +463,7 @@ struct platform_device *platform_device_register_full( ret = platform_device_add(pdev); if (ret) { err: - ACPI_HANDLE_SET(&pdev->dev, NULL); + ACPI_COMPANION_SET(&pdev->dev, NULL); kfree(pdev->dev.dma_mask); err_alloc: diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c12e9b9556b..1b41fca3d65 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1350,6 +1350,9 @@ static int device_prepare(struct device *dev, pm_message_t state) device_unlock(dev); + if (error) + pm_runtime_put(dev); + return error; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index b5d842370cc..ea192ec029c 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -223,7 +223,7 @@ static void null_softirq_done_fn(struct request *rq) blk_end_request_all(rq, 0); } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP static void null_ipi_cmd_end_io(void *data) { @@ -260,7 +260,7 @@ static void null_cmd_end_ipi(struct nullb_cmd *cmd) put_cpu(); } -#endif /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#endif /* CONFIG_SMP */ static inline void null_handle_cmd(struct nullb_cmd *cmd) { @@ -270,7 +270,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) end_cmd(cmd); break; case NULL_IRQ_SOFTIRQ: -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP null_cmd_end_ipi(cmd); #else end_cmd(cmd); 
@@ -571,7 +571,7 @@ static int __init null_init(void) { unsigned int i; -#if !defined(CONFIG_SMP) || !defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#if !defined(CONFIG_SMP) if (irqmode == NULL_IRQ_SOFTIRQ) { pr_warn("null_blk: softirq completions not available.\n"); pr_warn("null_blk: using direct completions.\n"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 588479d58f5..6a680d4de7f 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -199,15 +199,16 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) spin_lock_irqsave(&vblk->vq_lock, flags); if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) { + virtqueue_kick(vblk->vq); spin_unlock_irqrestore(&vblk->vq_lock, flags); blk_mq_stop_hw_queue(hctx); - virtqueue_kick(vblk->vq); return BLK_MQ_RQ_QUEUE_BUSY; } - spin_unlock_irqrestore(&vblk->vq_lock, flags); if (last) virtqueue_kick(vblk->vq); + + spin_unlock_irqrestore(&vblk->vq_lock, flags); return BLK_MQ_RQ_QUEUE_OK; } diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 94c0c74434e..1a65838888c 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -33,6 +33,15 @@ config TCG_TIS from within Linux. To compile this driver as a module, choose M here; the module will be called tpm_tis. +config TCG_TIS_I2C_ATMEL + tristate "TPM Interface Specification 1.2 Interface (I2C - Atmel)" + depends on I2C + ---help--- + If you have an Atmel I2C TPM security chip say Yes and it will be + accessible from within Linux. + To compile this driver as a module, choose M here; the module will + be called tpm_tis_i2c_atmel. + config TCG_TIS_I2C_INFINEON tristate "TPM Interface Specification 1.2 Interface (I2C - Infineon)" depends on I2C @@ -42,7 +51,17 @@ config TCG_TIS_I2C_INFINEON Specification 0.20 say Yes and it will be accessible from within Linux. To compile this driver as a module, choose M here; the module - will be called tpm_tis_i2c_infineon. + will be called tpm_i2c_infineon. + +config TCG_TIS_I2C_NUVOTON + tristate "TPM Interface Specification 1.2 Interface (I2C - Nuvoton)" + depends on I2C + ---help--- + If you have a TPM security chip with an I2C interface from + Nuvoton Technology Corp. say Yes and it will be accessible + from within Linux. + To compile this driver as a module, choose M here; the module + will be called tpm_i2c_nuvoton. config TCG_NSC tristate "National Semiconductor TPM Interface" @@ -82,14 +101,14 @@ config TCG_IBMVTPM as a module, choose M here; the module will be called tpm_ibmvtpm. config TCG_ST33_I2C - tristate "STMicroelectronics ST33 I2C TPM" - depends on I2C - depends on GPIOLIB - ---help--- - If you have a TPM security chip from STMicroelectronics working with - an I2C bus say Yes and it will be accessible from within Linux. - To compile this driver as a module, choose M here; the module will be - called tpm_stm_st33_i2c. + tristate "STMicroelectronics ST33 I2C TPM" + depends on I2C + depends on GPIOLIB + ---help--- + If you have a TPM security chip from STMicroelectronics working with + an I2C bus say Yes and it will be accessible from within Linux. + To compile this driver as a module, choose M here; the module will be + called tpm_stm_st33_i2c. config TCG_XEN tristate "XEN TPM Interface" diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index eb41ff97d0a..b80a4000dae 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -2,17 +2,20 @@ # Makefile for the kernel tpm device drivers. 
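The virtio_blk reordering above is subtle enough to restate in one piece: the host notification must go out before the hardware queue is stopped, and the 'last' kick must happen before vq_lock is dropped so it cannot be reordered against a concurrent submission. A condensed view of the patched queue-full path (a sketch of the result, not the full function):

        spin_lock_irqsave(&vblk->vq_lock, flags);
        if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
                /* Ring full: flush what is already queued to the host... */
                virtqueue_kick(vblk->vq);
                spin_unlock_irqrestore(&vblk->vq_lock, flags);
                /* ...then stop the queue until a completion restarts it. */
                blk_mq_stop_hw_queue(hctx);
                return BLK_MQ_RQ_QUEUE_BUSY;
        }
        if (last)
                virtqueue_kick(vblk->vq);
        spin_unlock_irqrestore(&vblk->vq_lock, flags);
        return BLK_MQ_RQ_QUEUE_OK;
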
# obj-$(CONFIG_TCG_TPM) += tpm.o +tpm-y := tpm-interface.o +tpm-$(CONFIG_ACPI) += tpm_ppi.o + ifdef CONFIG_ACPI - obj-$(CONFIG_TCG_TPM) += tpm_bios.o - tpm_bios-objs += tpm_eventlog.o tpm_acpi.o tpm_ppi.o + tpm-y += tpm_eventlog.o tpm_acpi.o else ifdef CONFIG_TCG_IBMVTPM - obj-$(CONFIG_TCG_TPM) += tpm_bios.o - tpm_bios-objs += tpm_eventlog.o tpm_of.o + tpm-y += tpm_eventlog.o tpm_of.o endif endif obj-$(CONFIG_TCG_TIS) += tpm_tis.o +obj-$(CONFIG_TCG_TIS_I2C_ATMEL) += tpm_i2c_atmel.o obj-$(CONFIG_TCG_TIS_I2C_INFINEON) += tpm_i2c_infineon.o +obj-$(CONFIG_TCG_TIS_I2C_NUVOTON) += tpm_i2c_nuvoton.o obj-$(CONFIG_TCG_NSC) += tpm_nsc.o obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm-interface.c index e3c974a6c52..6ae41d33763 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm-interface.c @@ -10,13 +10,13 @@ * Maintained by: <tpmdd-devel@lists.sourceforge.net> * * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org + * Specifications at www.trustedcomputinggroup.org * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. - * + * * Note, the TPM chip is not interrupt driven (only polling) * and can have very long timeouts (minutes!). Hence the unusual * calls to msleep. @@ -371,13 +371,14 @@ static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf, return -ENODATA; if (count > bufsiz) { dev_err(chip->dev, - "invalid count value %x %zx \n", count, bufsiz); + "invalid count value %x %zx\n", count, bufsiz); return -E2BIG; } mutex_lock(&chip->tpm_mutex); - if ((rc = chip->vendor.send(chip, (u8 *) buf, count)) < 0) { + rc = chip->vendor.send(chip, (u8 *) buf, count); + if (rc < 0) { dev_err(chip->dev, "tpm_transmit: tpm_send: error %zd\n", rc); goto out; @@ -444,7 +445,7 @@ static ssize_t transmit_cmd(struct tpm_chip *chip, struct tpm_cmd_t *cmd, { int err; - len = tpm_transmit(chip,(u8 *) cmd, len); + len = tpm_transmit(chip, (u8 *) cmd, len); if (len < 0) return len; else if (len < TPM_HEADER_SIZE) @@ -658,7 +659,7 @@ static int tpm_continue_selftest(struct tpm_chip *chip) return rc; } -ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr, +ssize_t tpm_show_enabled(struct device *dev, struct device_attribute *attr, char *buf) { cap_t cap; @@ -674,7 +675,7 @@ ssize_t tpm_show_enabled(struct device * dev, struct device_attribute * attr, } EXPORT_SYMBOL_GPL(tpm_show_enabled); -ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr, +ssize_t tpm_show_active(struct device *dev, struct device_attribute *attr, char *buf) { cap_t cap; @@ -690,7 +691,7 @@ ssize_t tpm_show_active(struct device * dev, struct device_attribute * attr, } EXPORT_SYMBOL_GPL(tpm_show_active); -ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr, +ssize_t tpm_show_owned(struct device *dev, struct device_attribute *attr, char *buf) { cap_t cap; @@ -706,8 +707,8 @@ ssize_t tpm_show_owned(struct device * dev, struct device_attribute * attr, } EXPORT_SYMBOL_GPL(tpm_show_owned); -ssize_t tpm_show_temp_deactivated(struct device * dev, - struct device_attribute * attr, char *buf) +ssize_t tpm_show_temp_deactivated(struct device *dev, + struct device_attribute *attr, char *buf) { cap_t cap; ssize_t rc; @@ -769,10 +770,10 @@ static int __tpm_pcr_read(struct 
tpm_chip *chip, int pcr_idx, u8 *res_buf) /** * tpm_pcr_read - read a pcr value - * @chip_num: tpm idx # or ANY + * @chip_num: tpm idx # or ANY * @pcr_idx: pcr idx to retrieve - * @res_buf: TPM_PCR value - * size of res_buf is 20 bytes (or NULL if you don't care) + * @res_buf: TPM_PCR value + * size of res_buf is 20 bytes (or NULL if you don't care) * * The TPM driver should be built-in, but for whatever reason it * isn't, protect against the chip disappearing, by incrementing @@ -794,9 +795,9 @@ EXPORT_SYMBOL_GPL(tpm_pcr_read); /** * tpm_pcr_extend - extend pcr value with hash - * @chip_num: tpm idx # or AN& + * @chip_num: tpm idx # or AN& * @pcr_idx: pcr idx to extend - * @hash: hash value used to extend pcr value + * @hash: hash value used to extend pcr value * * The TPM driver should be built-in, but for whatever reason it * isn't, protect against the chip disappearing, by incrementing @@ -847,8 +848,7 @@ int tpm_do_selftest(struct tpm_chip *chip) unsigned long duration; struct tpm_cmd_t cmd; - duration = tpm_calc_ordinal_duration(chip, - TPM_ORD_CONTINUE_SELFTEST); + duration = tpm_calc_ordinal_duration(chip, TPM_ORD_CONTINUE_SELFTEST); loops = jiffies_to_msecs(duration) / delay_msec; @@ -965,12 +965,12 @@ ssize_t tpm_show_pubek(struct device *dev, struct device_attribute *attr, if (err) goto out; - /* + /* ignore header 10 bytes algorithm 32 bits (1 == RSA ) encscheme 16 bits sigscheme 16 bits - parameters (RSA 12->bytes: keybit, #primes, expbit) + parameters (RSA 12->bytes: keybit, #primes, expbit) keylenbytes 32 bits 256 byte modulus ignore checksum 20 bytes @@ -1020,43 +1020,33 @@ ssize_t tpm_show_caps(struct device *dev, struct device_attribute *attr, str += sprintf(str, "Manufacturer: 0x%x\n", be32_to_cpu(cap.manufacturer_id)); - rc = tpm_getcap(dev, CAP_VERSION_1_1, &cap, - "attempting to determine the 1.1 version"); - if (rc) - return 0; - str += sprintf(str, - "TCG version: %d.%d\nFirmware version: %d.%d\n", - cap.tpm_version.Major, cap.tpm_version.Minor, - cap.tpm_version.revMajor, cap.tpm_version.revMinor); - return str - buf; -} -EXPORT_SYMBOL_GPL(tpm_show_caps); - -ssize_t tpm_show_caps_1_2(struct device * dev, - struct device_attribute * attr, char *buf) -{ - cap_t cap; - ssize_t rc; - char *str = buf; - - rc = tpm_getcap(dev, TPM_CAP_PROP_MANUFACTURER, &cap, - "attempting to determine the manufacturer"); - if (rc) - return 0; - str += sprintf(str, "Manufacturer: 0x%x\n", - be32_to_cpu(cap.manufacturer_id)); + /* Try to get a TPM version 1.2 TPM_CAP_VERSION_INFO */ rc = tpm_getcap(dev, CAP_VERSION_1_2, &cap, "attempting to determine the 1.2 version"); - if (rc) - return 0; - str += sprintf(str, - "TCG version: %d.%d\nFirmware version: %d.%d\n", - cap.tpm_version_1_2.Major, cap.tpm_version_1_2.Minor, - cap.tpm_version_1_2.revMajor, - cap.tpm_version_1_2.revMinor); + if (!rc) { + str += sprintf(str, + "TCG version: %d.%d\nFirmware version: %d.%d\n", + cap.tpm_version_1_2.Major, + cap.tpm_version_1_2.Minor, + cap.tpm_version_1_2.revMajor, + cap.tpm_version_1_2.revMinor); + } else { + /* Otherwise just use TPM_STRUCT_VER */ + rc = tpm_getcap(dev, CAP_VERSION_1_1, &cap, + "attempting to determine the 1.1 version"); + if (rc) + return 0; + str += sprintf(str, + "TCG version: %d.%d\nFirmware version: %d.%d\n", + cap.tpm_version.Major, + cap.tpm_version.Minor, + cap.tpm_version.revMajor, + cap.tpm_version.revMinor); + } + return str - buf; } -EXPORT_SYMBOL_GPL(tpm_show_caps_1_2); +EXPORT_SYMBOL_GPL(tpm_show_caps); ssize_t tpm_show_durations(struct device *dev, struct 
device_attribute *attr, char *buf) @@ -1102,8 +1092,8 @@ ssize_t tpm_store_cancel(struct device *dev, struct device_attribute *attr, } EXPORT_SYMBOL_GPL(tpm_store_cancel); -static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask, bool check_cancel, - bool *canceled) +static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask, + bool check_cancel, bool *canceled) { u8 status = chip->vendor.status(chip); @@ -1170,38 +1160,25 @@ EXPORT_SYMBOL_GPL(wait_for_tpm_stat); */ int tpm_open(struct inode *inode, struct file *file) { - int minor = iminor(inode); - struct tpm_chip *chip = NULL, *pos; - - rcu_read_lock(); - list_for_each_entry_rcu(pos, &tpm_chip_list, list) { - if (pos->vendor.miscdev.minor == minor) { - chip = pos; - get_device(chip->dev); - break; - } - } - rcu_read_unlock(); - - if (!chip) - return -ENODEV; + struct miscdevice *misc = file->private_data; + struct tpm_chip *chip = container_of(misc, struct tpm_chip, + vendor.miscdev); if (test_and_set_bit(0, &chip->is_open)) { dev_dbg(chip->dev, "Another process owns this TPM\n"); - put_device(chip->dev); return -EBUSY; } chip->data_buffer = kzalloc(TPM_BUFSIZE, GFP_KERNEL); if (chip->data_buffer == NULL) { clear_bit(0, &chip->is_open); - put_device(chip->dev); return -ENOMEM; } atomic_set(&chip->data_pending, 0); file->private_data = chip; + get_device(chip->dev); return 0; } EXPORT_SYMBOL_GPL(tpm_open); @@ -1463,7 +1440,6 @@ void tpm_dev_vendor_release(struct tpm_chip *chip) chip->vendor.release(chip->dev); clear_bit(chip->dev_num, dev_mask); - kfree(chip->vendor.miscdev.name); } EXPORT_SYMBOL_GPL(tpm_dev_vendor_release); @@ -1487,7 +1463,7 @@ void tpm_dev_release(struct device *dev) EXPORT_SYMBOL_GPL(tpm_dev_release); /* - * Called from tpm_<specific>.c probe function only for devices + * Called from tpm_<specific>.c probe function only for devices * the driver has determined it should claim. 
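The tpm_open() rewrite above no longer walks a global chip list matching minor numbers; it relies on the misc-device core, which stores the struct miscdevice pointer in file->private_data before invoking the driver's open handler. The pattern in isolation (a sketch; example_open is illustrative):

        static int example_open(struct inode *inode, struct file *file)
        {
                /* Set by the misc core before this handler runs. */
                struct miscdevice *misc = file->private_data;
                struct tpm_chip *chip = container_of(misc, struct tpm_chip,
                                                     vendor.miscdev);

                /* ...per-open setup, then publish the chip so that the
                 * read/write handlers can retrieve it directly. */
                file->private_data = chip;
                return 0;
        }
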
Prior to calling * this function the specific probe function has called pci_enable_device * upon errant exit from this function specific probe function should call @@ -1496,17 +1472,13 @@ EXPORT_SYMBOL_GPL(tpm_dev_release); struct tpm_chip *tpm_register_hardware(struct device *dev, const struct tpm_vendor_specific *entry) { -#define DEVNAME_SIZE 7 - - char *devname; struct tpm_chip *chip; /* Driver specific per-device data */ chip = kzalloc(sizeof(*chip), GFP_KERNEL); - devname = kmalloc(DEVNAME_SIZE, GFP_KERNEL); - if (chip == NULL || devname == NULL) - goto out_free; + if (chip == NULL) + return NULL; mutex_init(&chip->buffer_mutex); mutex_init(&chip->tpm_mutex); @@ -1531,8 +1503,9 @@ struct tpm_chip *tpm_register_hardware(struct device *dev, set_bit(chip->dev_num, dev_mask); - scnprintf(devname, DEVNAME_SIZE, "%s%d", "tpm", chip->dev_num); - chip->vendor.miscdev.name = devname; + scnprintf(chip->devname, sizeof(chip->devname), "%s%d", "tpm", + chip->dev_num); + chip->vendor.miscdev.name = chip->devname; chip->vendor.miscdev.parent = dev; chip->dev = get_device(dev); @@ -1558,7 +1531,7 @@ struct tpm_chip *tpm_register_hardware(struct device *dev, goto put_device; } - chip->bios_dir = tpm_bios_log_setup(devname); + chip->bios_dir = tpm_bios_log_setup(chip->devname); /* Make chip available */ spin_lock(&driver_lock); @@ -1571,7 +1544,6 @@ put_device: put_device(chip->dev); out_free: kfree(chip); - kfree(devname); return NULL; } EXPORT_SYMBOL_GPL(tpm_register_hardware); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index a7bfc176ed4..f3284787219 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -59,8 +59,6 @@ extern ssize_t tpm_show_pcrs(struct device *, struct device_attribute *attr, char *); extern ssize_t tpm_show_caps(struct device *, struct device_attribute *attr, char *); -extern ssize_t tpm_show_caps_1_2(struct device *, struct device_attribute *attr, - char *); extern ssize_t tpm_store_cancel(struct device *, struct device_attribute *attr, const char *, size_t); extern ssize_t tpm_show_enabled(struct device *, struct device_attribute *attr, @@ -122,6 +120,7 @@ struct tpm_chip { struct device *dev; /* Device stuff */ int dev_num; /* /dev/tpm# */ + char devname[7]; unsigned long is_open; /* only one allowed */ int time_expired; diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c index 99d6820c611..c9a528d25d2 100644 --- a/drivers/char/tpm/tpm_atmel.c +++ b/drivers/char/tpm/tpm_atmel.c @@ -202,7 +202,7 @@ static int __init init_atmel(void) have_region = (atmel_request_region - (tpm_atmel.base, region_size, "tpm_atmel0") == NULL) ? 0 : 1; + (base, region_size, "tpm_atmel0") == NULL) ? 
0 : 1; pdev = platform_device_register_simple("tpm_atmel", -1, NULL, 0); if (IS_ERR(pdev)) { diff --git a/drivers/char/tpm/tpm_eventlog.c b/drivers/char/tpm/tpm_eventlog.c index 84ddc557b8f..59f7cb28260 100644 --- a/drivers/char/tpm/tpm_eventlog.c +++ b/drivers/char/tpm/tpm_eventlog.c @@ -406,7 +406,6 @@ out_tpm: out: return NULL; } -EXPORT_SYMBOL_GPL(tpm_bios_log_setup); void tpm_bios_log_teardown(struct dentry **lst) { @@ -415,5 +414,3 @@ void tpm_bios_log_teardown(struct dentry **lst) for (i = 0; i < 3; i++) securityfs_remove(lst[i]); } -EXPORT_SYMBOL_GPL(tpm_bios_log_teardown); -MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c new file mode 100644 index 00000000000..c3cd7fe481a --- /dev/null +++ b/drivers/char/tpm/tpm_i2c_atmel.c @@ -0,0 +1,284 @@ +/* + * ATMEL I2C TPM AT97SC3204T + * + * Copyright (C) 2012 V Lab Technologies + * Teddy Reed <teddy@prosauce.org> + * Copyright (C) 2013, Obsidian Research Corp. + * Jason Gunthorpe <jgunthorpe@obsidianresearch.com> + * Device driver for ATMEL I2C TPMs. + * + * Teddy Reed determined the basic I2C command flow, unlike other I2C TPM + * devices the raw TCG formatted TPM command data is written via I2C and then + * raw TCG formatted TPM command data is returned via I2C. + * + * TGC status/locality/etc functions seen in the LPC implementation do not + * seem to be present. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/>. + */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <linux/i2c.h> +#include "tpm.h" + +#define I2C_DRIVER_NAME "tpm_i2c_atmel" + +#define TPM_I2C_SHORT_TIMEOUT 750 /* ms */ +#define TPM_I2C_LONG_TIMEOUT 2000 /* 2 sec */ + +#define ATMEL_STS_OK 1 + +struct priv_data { + size_t len; + /* This is the amount we read on the first try. 25 was chosen to fit a + * fair number of read responses in the buffer so a 2nd retry can be + * avoided in small message cases. */ + u8 buffer[sizeof(struct tpm_output_header) + 25]; +}; + +static int i2c_atmel_send(struct tpm_chip *chip, u8 *buf, size_t len) +{ + struct priv_data *priv = chip->vendor.priv; + struct i2c_client *client = to_i2c_client(chip->dev); + s32 status; + + priv->len = 0; + + if (len <= 2) + return -EIO; + + status = i2c_master_send(client, buf, len); + + dev_dbg(chip->dev, + "%s(buf=%*ph len=%0zx) -> sts=%d\n", __func__, + (int)min_t(size_t, 64, len), buf, len, status); + return status; +} + +static int i2c_atmel_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct priv_data *priv = chip->vendor.priv; + struct i2c_client *client = to_i2c_client(chip->dev); + struct tpm_output_header *hdr = + (struct tpm_output_header *)priv->buffer; + u32 expected_len; + int rc; + + if (priv->len == 0) + return -EIO; + + /* Get the message size from the message header, if we didn't get the + * whole message in read_status then we need to re-read the + * message. 
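The Atmel driver buffers an initial speculative read in read_status and then decides from the TCG header whether a second read is required. The header layout it relies on (as declared in tpm.h; repeated here only for illustration) carries a big-endian total length at byte offset 2:

        struct tpm_output_header {
                __be16  tag;
                __be32  length;         /* total response size, incl. header */
                __be32  return_code;
        } __packed;

        /* Sketch: bytes still outstanding after an early read of 'have'
         * bytes into 'buf' (tpm_bytes_missing is an illustrative name). */
        static u32 tpm_bytes_missing(const u8 *buf, size_t have)
        {
                const struct tpm_output_header *hdr = (const void *)buf;
                u32 total = be32_to_cpu(hdr->length);

                return have >= total ? 0 : total - have;
        }
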
*/ + expected_len = be32_to_cpu(hdr->length); + if (expected_len > count) + return -ENOMEM; + + if (priv->len >= expected_len) { + dev_dbg(chip->dev, + "%s early(buf=%*ph count=%0zx) -> ret=%d\n", __func__, + (int)min_t(size_t, 64, expected_len), buf, count, + expected_len); + memcpy(buf, priv->buffer, expected_len); + return expected_len; + } + + rc = i2c_master_recv(client, buf, expected_len); + dev_dbg(chip->dev, + "%s reread(buf=%*ph count=%0zx) -> ret=%d\n", __func__, + (int)min_t(size_t, 64, expected_len), buf, count, + expected_len); + return rc; +} + +static void i2c_atmel_cancel(struct tpm_chip *chip) +{ + dev_err(chip->dev, "TPM operation cancellation was requested, but is not supported"); +} + +static u8 i2c_atmel_read_status(struct tpm_chip *chip) +{ + struct priv_data *priv = chip->vendor.priv; + struct i2c_client *client = to_i2c_client(chip->dev); + int rc; + + /* The TPM fails the I2C read until it is ready, so we do the entire + * transfer here and buffer it locally. This way the common code can + * properly handle the timeouts. */ + priv->len = 0; + memset(priv->buffer, 0, sizeof(priv->buffer)); + + + /* Once the TPM has completed the command the command remains readable + * until another command is issued. */ + rc = i2c_master_recv(client, priv->buffer, sizeof(priv->buffer)); + dev_dbg(chip->dev, + "%s: sts=%d", __func__, rc); + if (rc <= 0) + return 0; + + priv->len = rc; + + return ATMEL_STS_OK; +} + +static const struct file_operations i2c_atmel_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); +static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL); +static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); + +static struct attribute *i2c_atmel_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + &dev_attr_durations.attr, + &dev_attr_timeouts.attr, + NULL, +}; + +static struct attribute_group i2c_atmel_attr_grp = { + .attrs = i2c_atmel_attrs +}; + +static bool i2c_atmel_req_canceled(struct tpm_chip *chip, u8 status) +{ + return 0; +} + +static const struct tpm_vendor_specific i2c_atmel = { + .status = i2c_atmel_read_status, + .recv = i2c_atmel_recv, + .send = i2c_atmel_send, + .cancel = i2c_atmel_cancel, + .req_complete_mask = ATMEL_STS_OK, + .req_complete_val = ATMEL_STS_OK, + .req_canceled = i2c_atmel_req_canceled, + .attr_group = &i2c_atmel_attr_grp, + .miscdev.fops = &i2c_atmel_ops, +}; + +static int i2c_atmel_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int rc; + struct tpm_chip *chip; + struct device *dev = &client->dev; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) + return -ENODEV; + + chip = tpm_register_hardware(dev, &i2c_atmel); + if (!chip) { + dev_err(dev, "%s() error in tpm_register_hardware\n", __func__); + return -ENODEV; + } + + 
chip->vendor.priv = devm_kzalloc(dev, sizeof(struct priv_data), + GFP_KERNEL); + + /* Default timeouts */ + chip->vendor.timeout_a = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + chip->vendor.timeout_b = msecs_to_jiffies(TPM_I2C_LONG_TIMEOUT); + chip->vendor.timeout_c = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + chip->vendor.timeout_d = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + chip->vendor.irq = 0; + + /* There is no known way to probe for this device, and all version + * information seems to be read via TPM commands. Thus we rely on the + * TPM startup process in the common code to detect the device. */ + if (tpm_get_timeouts(chip)) { + rc = -ENODEV; + goto out_err; + } + + if (tpm_do_selftest(chip)) { + rc = -ENODEV; + goto out_err; + } + + return 0; + +out_err: + tpm_dev_vendor_release(chip); + tpm_remove_hardware(chip->dev); + return rc; +} + +static int i2c_atmel_remove(struct i2c_client *client) +{ + struct device *dev = &(client->dev); + struct tpm_chip *chip = dev_get_drvdata(dev); + + if (chip) + tpm_dev_vendor_release(chip); + tpm_remove_hardware(dev); + kfree(chip); + return 0; +} + +static const struct i2c_device_id i2c_atmel_id[] = { + {I2C_DRIVER_NAME, 0}, + {} +}; +MODULE_DEVICE_TABLE(i2c, i2c_atmel_id); + +#ifdef CONFIG_OF +static const struct of_device_id i2c_atmel_of_match[] = { + {.compatible = "atmel,at97sc3204t"}, + {}, +}; +MODULE_DEVICE_TABLE(of, i2c_atmel_of_match); +#endif + +static SIMPLE_DEV_PM_OPS(i2c_atmel_pm_ops, tpm_pm_suspend, tpm_pm_resume); + +static struct i2c_driver i2c_atmel_driver = { + .id_table = i2c_atmel_id, + .probe = i2c_atmel_probe, + .remove = i2c_atmel_remove, + .driver = { + .name = I2C_DRIVER_NAME, + .owner = THIS_MODULE, + .pm = &i2c_atmel_pm_ops, + .of_match_table = of_match_ptr(i2c_atmel_of_match), + }, +}; + +module_i2c_driver(i2c_atmel_driver); + +MODULE_AUTHOR("Jason Gunthorpe <jgunthorpe@obsidianresearch.com>"); +MODULE_DESCRIPTION("Atmel TPM I2C Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_i2c_infineon.c b/drivers/char/tpm/tpm_i2c_infineon.c index b8735de8ce9..fefd2aa5c81 100644 --- a/drivers/char/tpm/tpm_i2c_infineon.c +++ b/drivers/char/tpm/tpm_i2c_infineon.c @@ -581,7 +581,7 @@ static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); -static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL); static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); @@ -685,7 +685,6 @@ out_vendor: chip->dev->release = NULL; chip->release = NULL; tpm_dev.client = NULL; - dev_set_drvdata(chip->dev, chip); out_err: return rc; } @@ -766,7 +765,6 @@ static int tpm_tis_i2c_remove(struct i2c_client *client) chip->dev->release = NULL; chip->release = NULL; tpm_dev.client = NULL; - dev_set_drvdata(chip->dev, chip); return 0; } diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c new file mode 100644 index 00000000000..6276fea01ff --- /dev/null +++ b/drivers/char/tpm/tpm_i2c_nuvoton.c @@ -0,0 +1,710 @@ +/****************************************************************************** + * Nuvoton TPM I2C Device Driver Interface for WPCT301/NPCT501, + * based on the TCG TPM Interface Spec version 1.2. 
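devm_kzalloc() can return NULL, and chip->vendor.priv is dereferenced on the first transfer; a defensive variant of the priv allocation in the Atmel probe above, assuming the same error-unwind path, would read:

        chip->vendor.priv = devm_kzalloc(dev, sizeof(struct priv_data),
                                         GFP_KERNEL);
        if (!chip->vendor.priv) {
                rc = -ENOMEM;
                goto out_err;   /* unwinds via tpm_dev_vendor_release() */
        }
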
+ * Specifications at www.trustedcomputinggroup.org + * + * Copyright (C) 2011, Nuvoton Technology Corporation. + * Dan Morav <dan.morav@nuvoton.com> + * Copyright (C) 2013, Obsidian Research Corp. + * Jason Gunthorpe <jgunthorpe@obsidianresearch.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/>. + * + * Nuvoton contact information: APC.Support@nuvoton.com + *****************************************************************************/ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/wait.h> +#include <linux/i2c.h> +#include "tpm.h" + +/* I2C interface offsets */ +#define TPM_STS 0x00 +#define TPM_BURST_COUNT 0x01 +#define TPM_DATA_FIFO_W 0x20 +#define TPM_DATA_FIFO_R 0x40 +#define TPM_VID_DID_RID 0x60 +/* TPM command header size */ +#define TPM_HEADER_SIZE 10 +#define TPM_RETRY 5 +/* + * I2C bus device maximum buffer size w/o counting I2C address or command + * i.e. max size required for I2C write is 34 = addr, command, 32 bytes data + */ +#define TPM_I2C_MAX_BUF_SIZE 32 +#define TPM_I2C_RETRY_COUNT 32 +#define TPM_I2C_BUS_DELAY 1 /* msec */ +#define TPM_I2C_RETRY_DELAY_SHORT 2 /* msec */ +#define TPM_I2C_RETRY_DELAY_LONG 10 /* msec */ + +#define I2C_DRIVER_NAME "tpm_i2c_nuvoton" + +struct priv_data { + unsigned int intrs; +}; + +static s32 i2c_nuvoton_read_buf(struct i2c_client *client, u8 offset, u8 size, + u8 *data) +{ + s32 status; + + status = i2c_smbus_read_i2c_block_data(client, offset, size, data); + dev_dbg(&client->dev, + "%s(offset=%u size=%u data=%*ph) -> sts=%d\n", __func__, + offset, size, (int)size, data, status); + return status; +} + +static s32 i2c_nuvoton_write_buf(struct i2c_client *client, u8 offset, u8 size, + u8 *data) +{ + s32 status; + + status = i2c_smbus_write_i2c_block_data(client, offset, size, data); + dev_dbg(&client->dev, + "%s(offset=%u size=%u data=%*ph) -> sts=%d\n", __func__, + offset, size, (int)size, data, status); + return status; +} + +#define TPM_STS_VALID 0x80 +#define TPM_STS_COMMAND_READY 0x40 +#define TPM_STS_GO 0x20 +#define TPM_STS_DATA_AVAIL 0x10 +#define TPM_STS_EXPECT 0x08 +#define TPM_STS_RESPONSE_RETRY 0x02 +#define TPM_STS_ERR_VAL 0x07 /* bit2...bit0 reads always 0 */ + +#define TPM_I2C_SHORT_TIMEOUT 750 /* ms */ +#define TPM_I2C_LONG_TIMEOUT 2000 /* 2 sec */ + +/* read TPM_STS register */ +static u8 i2c_nuvoton_read_status(struct tpm_chip *chip) +{ + struct i2c_client *client = to_i2c_client(chip->dev); + s32 status; + u8 data; + + status = i2c_nuvoton_read_buf(client, TPM_STS, 1, &data); + if (status <= 0) { + dev_err(chip->dev, "%s() error return %d\n", __func__, + status); + data = TPM_STS_ERR_VAL; + } + + return data; +} + +/* write byte to TPM_STS register */ +static s32 i2c_nuvoton_write_status(struct i2c_client *client, u8 data) +{ + s32 status; + int i; + + /* this causes the current command to be aborted */ + for (i = 
0, status = -1; i < TPM_I2C_RETRY_COUNT && status < 0; i++) { + status = i2c_nuvoton_write_buf(client, TPM_STS, 1, &data); + msleep(TPM_I2C_BUS_DELAY); + } + return status; +} + +/* write commandReady to TPM_STS register */ +static void i2c_nuvoton_ready(struct tpm_chip *chip) +{ + struct i2c_client *client = to_i2c_client(chip->dev); + s32 status; + + /* this causes the current command to be aborted */ + status = i2c_nuvoton_write_status(client, TPM_STS_COMMAND_READY); + if (status < 0) + dev_err(chip->dev, + "%s() fail to write TPM_STS.commandReady\n", __func__); +} + +/* read burstCount field from TPM_STS register + * return -1 on fail to read */ +static int i2c_nuvoton_get_burstcount(struct i2c_client *client, + struct tpm_chip *chip) +{ + unsigned long stop = jiffies + chip->vendor.timeout_d; + s32 status; + int burst_count = -1; + u8 data; + + /* wait for burstcount to be non-zero */ + do { + /* in I2C burstCount is 1 byte */ + status = i2c_nuvoton_read_buf(client, TPM_BURST_COUNT, 1, + &data); + if (status > 0 && data > 0) { + burst_count = min_t(u8, TPM_I2C_MAX_BUF_SIZE, data); + break; + } + msleep(TPM_I2C_BUS_DELAY); + } while (time_before(jiffies, stop)); + + return burst_count; +} + +/* + * WPCT301/NPCT501 SINT# supports only dataAvail + * any call to this function which is not waiting for dataAvail will + * set queue to NULL to avoid waiting for interrupt + */ +static bool i2c_nuvoton_check_status(struct tpm_chip *chip, u8 mask, u8 value) +{ + u8 status = i2c_nuvoton_read_status(chip); + return (status != TPM_STS_ERR_VAL) && ((status & mask) == value); +} + +static int i2c_nuvoton_wait_for_stat(struct tpm_chip *chip, u8 mask, u8 value, + u32 timeout, wait_queue_head_t *queue) +{ + if (chip->vendor.irq && queue) { + s32 rc; + DEFINE_WAIT(wait); + struct priv_data *priv = chip->vendor.priv; + unsigned int cur_intrs = priv->intrs; + + enable_irq(chip->vendor.irq); + rc = wait_event_interruptible_timeout(*queue, + cur_intrs != priv->intrs, + timeout); + if (rc > 0) + return 0; + /* At this point we know that the SINT pin is asserted, so we + * do not need to do i2c_nuvoton_check_status */ + } else { + unsigned long ten_msec, stop; + bool status_valid; + + /* check current status */ + status_valid = i2c_nuvoton_check_status(chip, mask, value); + if (status_valid) + return 0; + + /* use polling to wait for the event */ + ten_msec = jiffies + msecs_to_jiffies(TPM_I2C_RETRY_DELAY_LONG); + stop = jiffies + timeout; + do { + if (time_before(jiffies, ten_msec)) + msleep(TPM_I2C_RETRY_DELAY_SHORT); + else + msleep(TPM_I2C_RETRY_DELAY_LONG); + status_valid = i2c_nuvoton_check_status(chip, mask, + value); + if (status_valid) + return 0; + } while (time_before(jiffies, stop)); + } + dev_err(chip->dev, "%s(%02x, %02x) -> timeout\n", __func__, mask, + value); + return -ETIMEDOUT; +} + +/* wait for dataAvail field to be set in the TPM_STS register */ +static int i2c_nuvoton_wait_for_data_avail(struct tpm_chip *chip, u32 timeout, + wait_queue_head_t *queue) +{ + return i2c_nuvoton_wait_for_stat(chip, + TPM_STS_DATA_AVAIL | TPM_STS_VALID, + TPM_STS_DATA_AVAIL | TPM_STS_VALID, + timeout, queue); +} + +/* Read @count bytes into @buf from TPM_RD_FIFO register */ +static int i2c_nuvoton_recv_data(struct i2c_client *client, + struct tpm_chip *chip, u8 *buf, size_t count) +{ + s32 rc; + int burst_count, bytes2read, size = 0; + + while (size < count && + i2c_nuvoton_wait_for_data_avail(chip, + chip->vendor.timeout_c, + &chip->vendor.read_queue) == 0) { + burst_count = i2c_nuvoton_get_burstcount(client, 
chip); + if (burst_count < 0) { + dev_err(chip->dev, + "%s() fail to read burstCount=%d\n", __func__, + burst_count); + return -EIO; + } + bytes2read = min_t(size_t, burst_count, count - size); + rc = i2c_nuvoton_read_buf(client, TPM_DATA_FIFO_R, + bytes2read, &buf[size]); + if (rc < 0) { + dev_err(chip->dev, + "%s() fail on i2c_nuvoton_read_buf()=%d\n", + __func__, rc); + return -EIO; + } + dev_dbg(chip->dev, "%s(%d):", __func__, bytes2read); + size += bytes2read; + } + + return size; +} + +/* Read TPM command results */ +static int i2c_nuvoton_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct device *dev = chip->dev; + struct i2c_client *client = to_i2c_client(dev); + s32 rc; + int expected, status, burst_count, retries, size = 0; + + if (count < TPM_HEADER_SIZE) { + i2c_nuvoton_ready(chip); /* return to idle */ + dev_err(dev, "%s() count < header size\n", __func__); + return -EIO; + } + for (retries = 0; retries < TPM_RETRY; retries++) { + if (retries > 0) { + /* if this is not the first trial, set responseRetry */ + i2c_nuvoton_write_status(client, + TPM_STS_RESPONSE_RETRY); + } + /* + * read first available (> 10 bytes), including: + * tag, paramsize, and result + */ + status = i2c_nuvoton_wait_for_data_avail( + chip, chip->vendor.timeout_c, &chip->vendor.read_queue); + if (status != 0) { + dev_err(dev, "%s() timeout on dataAvail\n", __func__); + size = -ETIMEDOUT; + continue; + } + burst_count = i2c_nuvoton_get_burstcount(client, chip); + if (burst_count < 0) { + dev_err(dev, "%s() fail to get burstCount\n", __func__); + size = -EIO; + continue; + } + size = i2c_nuvoton_recv_data(client, chip, buf, + burst_count); + if (size < TPM_HEADER_SIZE) { + dev_err(dev, "%s() fail to read header\n", __func__); + size = -EIO; + continue; + } + /* + * convert number of expected bytes field from big endian 32 bit + * to machine native + */ + expected = be32_to_cpu(*(__be32 *) (buf + 2)); + if (expected > count) { + dev_err(dev, "%s() expected > count\n", __func__); + size = -EIO; + continue; + } + rc = i2c_nuvoton_recv_data(client, chip, &buf[size], + expected - size); + size += rc; + if (rc < 0 || size < expected) { + dev_err(dev, "%s() fail to read remainder of result\n", + __func__); + size = -EIO; + continue; + } + if (i2c_nuvoton_wait_for_stat( + chip, TPM_STS_VALID | TPM_STS_DATA_AVAIL, + TPM_STS_VALID, chip->vendor.timeout_c, + NULL)) { + dev_err(dev, "%s() error left over data\n", __func__); + size = -ETIMEDOUT; + continue; + } + break; + } + i2c_nuvoton_ready(chip); + dev_dbg(chip->dev, "%s() -> %d\n", __func__, size); + return size; +} + +/* + * Send TPM command. 
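Stripped of the I2C plumbing, i2c_nuvoton_recv() above is a replay loop: any framing failure sets TPM_STS.responseRetry, which makes the TPM retransmit the same response, bounded by TPM_RETRY attempts. Its skeleton (read_one_response stands in for the body shown above and is not a real function):

        for (retries = 0; retries < TPM_RETRY; retries++) {
                if (retries > 0)   /* ask the TPM to replay the response */
                        i2c_nuvoton_write_status(client,
                                                 TPM_STS_RESPONSE_RETRY);
                size = read_one_response(chip, buf, count);
                if (size >= 0)
                        break;     /* got a complete, sane response */
        }
        i2c_nuvoton_ready(chip);   /* always return the TIS to idle */
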
+ * + * If interrupts are used (signaled by an irq set in the vendor structure) + * tpm.c can skip polling for the data to be available as the interrupt is + * waited for here + */ +static int i2c_nuvoton_send(struct tpm_chip *chip, u8 *buf, size_t len) +{ + struct device *dev = chip->dev; + struct i2c_client *client = to_i2c_client(dev); + u32 ordinal; + size_t count = 0; + int burst_count, bytes2write, retries, rc = -EIO; + + for (retries = 0; retries < TPM_RETRY; retries++) { + i2c_nuvoton_ready(chip); + if (i2c_nuvoton_wait_for_stat(chip, TPM_STS_COMMAND_READY, + TPM_STS_COMMAND_READY, + chip->vendor.timeout_b, NULL)) { + dev_err(dev, "%s() timeout on commandReady\n", + __func__); + rc = -EIO; + continue; + } + rc = 0; + while (count < len - 1) { + burst_count = i2c_nuvoton_get_burstcount(client, + chip); + if (burst_count < 0) { + dev_err(dev, "%s() fail get burstCount\n", + __func__); + rc = -EIO; + break; + } + bytes2write = min_t(size_t, burst_count, + len - 1 - count); + rc = i2c_nuvoton_write_buf(client, TPM_DATA_FIFO_W, + bytes2write, &buf[count]); + if (rc < 0) { + dev_err(dev, "%s() fail i2cWriteBuf\n", + __func__); + break; + } + dev_dbg(dev, "%s(%d):", __func__, bytes2write); + count += bytes2write; + rc = i2c_nuvoton_wait_for_stat(chip, + TPM_STS_VALID | + TPM_STS_EXPECT, + TPM_STS_VALID | + TPM_STS_EXPECT, + chip->vendor.timeout_c, + NULL); + if (rc < 0) { + dev_err(dev, "%s() timeout on Expect\n", + __func__); + rc = -ETIMEDOUT; + break; + } + } + if (rc < 0) + continue; + + /* write last byte */ + rc = i2c_nuvoton_write_buf(client, TPM_DATA_FIFO_W, 1, + &buf[count]); + if (rc < 0) { + dev_err(dev, "%s() fail to write last byte\n", + __func__); + rc = -EIO; + continue; + } + dev_dbg(dev, "%s(last): %02x", __func__, buf[count]); + rc = i2c_nuvoton_wait_for_stat(chip, + TPM_STS_VALID | TPM_STS_EXPECT, + TPM_STS_VALID, + chip->vendor.timeout_c, NULL); + if (rc) { + dev_err(dev, "%s() timeout on Expect to clear\n", + __func__); + rc = -ETIMEDOUT; + continue; + } + break; + } + if (rc < 0) { + /* retries == TPM_RETRY */ + i2c_nuvoton_ready(chip); + return rc; + } + /* execute the TPM command */ + rc = i2c_nuvoton_write_status(client, TPM_STS_GO); + if (rc < 0) { + dev_err(dev, "%s() fail to write Go\n", __func__); + i2c_nuvoton_ready(chip); + return rc; + } + ordinal = be32_to_cpu(*((__be32 *) (buf + 6))); + rc = i2c_nuvoton_wait_for_data_avail(chip, + tpm_calc_ordinal_duration(chip, + ordinal), + &chip->vendor.read_queue); + if (rc) { + dev_err(dev, "%s() timeout command duration\n", __func__); + i2c_nuvoton_ready(chip); + return rc; + } + + dev_dbg(dev, "%s() -> %zd\n", __func__, len); + return len; +} + +static bool i2c_nuvoton_req_canceled(struct tpm_chip *chip, u8 status) +{ + return (status == TPM_STS_COMMAND_READY); +} + +static const struct file_operations i2c_nuvoton_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); +static DEVICE_ATTR(durations, S_IRUGO, 
tpm_show_durations, NULL); +static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); + +static struct attribute *i2c_nuvoton_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + &dev_attr_durations.attr, + &dev_attr_timeouts.attr, + NULL, +}; + +static struct attribute_group i2c_nuvoton_attr_grp = { + .attrs = i2c_nuvoton_attrs +}; + +static const struct tpm_vendor_specific tpm_i2c = { + .status = i2c_nuvoton_read_status, + .recv = i2c_nuvoton_recv, + .send = i2c_nuvoton_send, + .cancel = i2c_nuvoton_ready, + .req_complete_mask = TPM_STS_DATA_AVAIL | TPM_STS_VALID, + .req_complete_val = TPM_STS_DATA_AVAIL | TPM_STS_VALID, + .req_canceled = i2c_nuvoton_req_canceled, + .attr_group = &i2c_nuvoton_attr_grp, + .miscdev.fops = &i2c_nuvoton_ops, +}; + +/* The only purpose for the handler is to signal to any waiting threads that + * the interrupt is currently being asserted. The driver does not do any + * processing triggered by interrupts, and the chip provides no way to mask at + * the source (plus that would be slow over I2C). Run the IRQ as a one-shot, + * this means it cannot be shared. */ +static irqreturn_t i2c_nuvoton_int_handler(int dummy, void *dev_id) +{ + struct tpm_chip *chip = dev_id; + struct priv_data *priv = chip->vendor.priv; + + priv->intrs++; + wake_up(&chip->vendor.read_queue); + disable_irq_nosync(chip->vendor.irq); + return IRQ_HANDLED; +} + +static int get_vid(struct i2c_client *client, u32 *res) +{ + static const u8 vid_did_rid_value[] = { 0x50, 0x10, 0xfe }; + u32 temp; + s32 rc; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -ENODEV; + rc = i2c_nuvoton_read_buf(client, TPM_VID_DID_RID, 4, (u8 *)&temp); + if (rc < 0) + return rc; + + /* check WPCT301 values - ignore RID */ + if (memcmp(&temp, vid_did_rid_value, sizeof(vid_did_rid_value))) { + /* + * f/w rev 2.81 has an issue where the VID_DID_RID is not + * reporting the right value. so give it another chance at + * offset 0x20 (FIFO_W). 
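The interrupt handler above pairs with the waiter in i2c_nuvoton_wait_for_stat(): the handler only bumps a counter and masks the line, while the waiter snapshots the counter, unmasks, and sleeps until the count moves. The handshake in isolation (a sketch of the waiter side):

        /* Runs with the IRQ still disabled by the previous handler. */
        unsigned int seen = priv->intrs;

        enable_irq(chip->vendor.irq);
        rc = wait_event_interruptible_timeout(chip->vendor.read_queue,
                                              seen != priv->intrs, timeout);

Because the handler calls disable_irq_nosync() before returning, the level-triggered SINT# line cannot storm between commands, which is why the IRQ is requested as a non-shared one-shot.
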
+ */ + rc = i2c_nuvoton_read_buf(client, TPM_DATA_FIFO_W, 4, + (u8 *) (&temp)); + if (rc < 0) + return rc; + + /* check WPCT301 values - ignore RID */ + if (memcmp(&temp, vid_did_rid_value, + sizeof(vid_did_rid_value))) + return -ENODEV; + } + + *res = temp; + return 0; +} + +static int i2c_nuvoton_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int rc; + struct tpm_chip *chip; + struct device *dev = &client->dev; + u32 vid = 0; + + rc = get_vid(client, &vid); + if (rc) + return rc; + + dev_info(dev, "VID: %04X DID: %02X RID: %02X\n", (u16) vid, + (u8) (vid >> 16), (u8) (vid >> 24)); + + chip = tpm_register_hardware(dev, &tpm_i2c); + if (!chip) { + dev_err(dev, "%s() error in tpm_register_hardware\n", __func__); + return -ENODEV; + } + + chip->vendor.priv = devm_kzalloc(dev, sizeof(struct priv_data), + GFP_KERNEL); + init_waitqueue_head(&chip->vendor.read_queue); + init_waitqueue_head(&chip->vendor.int_queue); + + /* Default timeouts */ + chip->vendor.timeout_a = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + chip->vendor.timeout_b = msecs_to_jiffies(TPM_I2C_LONG_TIMEOUT); + chip->vendor.timeout_c = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + chip->vendor.timeout_d = msecs_to_jiffies(TPM_I2C_SHORT_TIMEOUT); + + /* + * I2C intfcaps (interrupt capabilitieis) in the chip are hard coded to: + * TPM_INTF_INT_LEVEL_LOW | TPM_INTF_DATA_AVAIL_INT + * The IRQ should be set in the i2c_board_info (which is done + * automatically in of_i2c_register_devices, for device tree users */ + chip->vendor.irq = client->irq; + + if (chip->vendor.irq) { + dev_dbg(dev, "%s() chip-vendor.irq\n", __func__); + rc = devm_request_irq(dev, chip->vendor.irq, + i2c_nuvoton_int_handler, + IRQF_TRIGGER_LOW, + chip->vendor.miscdev.name, + chip); + if (rc) { + dev_err(dev, "%s() Unable to request irq: %d for use\n", + __func__, chip->vendor.irq); + chip->vendor.irq = 0; + } else { + /* Clear any pending interrupt */ + i2c_nuvoton_ready(chip); + /* - wait for TPM_STS==0xA0 (stsValid, commandReady) */ + rc = i2c_nuvoton_wait_for_stat(chip, + TPM_STS_COMMAND_READY, + TPM_STS_COMMAND_READY, + chip->vendor.timeout_b, + NULL); + if (rc == 0) { + /* + * TIS is in ready state + * write dummy byte to enter reception state + * TPM_DATA_FIFO_W <- rc (0) + */ + rc = i2c_nuvoton_write_buf(client, + TPM_DATA_FIFO_W, + 1, (u8 *) (&rc)); + if (rc < 0) + goto out_err; + /* TPM_STS <- 0x40 (commandReady) */ + i2c_nuvoton_ready(chip); + } else { + /* + * timeout_b reached - command was + * aborted. 
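get_vid() identifies the part by raw byte comparison rather than integer equality, which sidesteps host byte order: only the first three bytes read from TPM_VID_DID_RID (VID low/high, DID) are compared and the revision byte is ignored. Equivalent standalone form (is_nuvoton_vid is an illustrative name):

        static bool is_nuvoton_vid(const u8 *vid_did_rid)
        {
                static const u8 expect[] = { 0x50, 0x10, 0xfe };

                /* VID 0x1050, DID 0xfe; byte 3 (RID) is don't-care. */
                return memcmp(vid_did_rid, expect, sizeof(expect)) == 0;
        }
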
TIS should now be in idle state - + * only TPM_STS_VALID should be set + */ + if (i2c_nuvoton_read_status(chip) != + TPM_STS_VALID) { + rc = -EIO; + goto out_err; + } + } + } + } + + if (tpm_get_timeouts(chip)) { + rc = -ENODEV; + goto out_err; + } + + if (tpm_do_selftest(chip)) { + rc = -ENODEV; + goto out_err; + } + + return 0; + +out_err: + tpm_dev_vendor_release(chip); + tpm_remove_hardware(chip->dev); + return rc; +} + +static int i2c_nuvoton_remove(struct i2c_client *client) +{ + struct device *dev = &(client->dev); + struct tpm_chip *chip = dev_get_drvdata(dev); + + if (chip) + tpm_dev_vendor_release(chip); + tpm_remove_hardware(dev); + kfree(chip); + return 0; +} + + +static const struct i2c_device_id i2c_nuvoton_id[] = { + {I2C_DRIVER_NAME, 0}, + {} +}; +MODULE_DEVICE_TABLE(i2c, i2c_nuvoton_id); + +#ifdef CONFIG_OF +static const struct of_device_id i2c_nuvoton_of_match[] = { + {.compatible = "nuvoton,npct501"}, + {.compatible = "winbond,wpct301"}, + {}, +}; +MODULE_DEVICE_TABLE(of, i2c_nuvoton_of_match); +#endif + +static SIMPLE_DEV_PM_OPS(i2c_nuvoton_pm_ops, tpm_pm_suspend, tpm_pm_resume); + +static struct i2c_driver i2c_nuvoton_driver = { + .id_table = i2c_nuvoton_id, + .probe = i2c_nuvoton_probe, + .remove = i2c_nuvoton_remove, + .driver = { + .name = I2C_DRIVER_NAME, + .owner = THIS_MODULE, + .pm = &i2c_nuvoton_pm_ops, + .of_match_table = of_match_ptr(i2c_nuvoton_of_match), + }, +}; + +module_i2c_driver(i2c_nuvoton_driver); + +MODULE_AUTHOR("Dan Morav (dan.morav@nuvoton.com)"); +MODULE_DESCRIPTION("Nuvoton TPM I2C Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 5bb8e2ddd3b..a0d6ceb5d00 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -584,7 +584,7 @@ static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); -static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); static struct attribute *stm_tpm_attrs[] = { @@ -746,8 +746,6 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) tpm_get_timeouts(chip); - i2c_set_clientdata(client, chip); - dev_info(chip->dev, "TPM I2C Initialized\n"); return 0; _irq_set: @@ -807,24 +805,18 @@ static int tpm_st33_i2c_remove(struct i2c_client *client) #ifdef CONFIG_PM_SLEEP /* * tpm_st33_i2c_pm_suspend suspend the TPM device - * Added: Work around when suspend and no tpm application is running, suspend - * may fail because chip->data_buffer is not set (only set in tpm_open in Linux - * TPM core) * @param: client, the i2c_client drescription (TPM I2C description). * @param: mesg, the power management message. * @return: 0 in case of success. 
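Both new I2C drivers wire power management identically: system sleep maps straight onto the TPM core's tpm_pm_suspend()/tpm_pm_resume(), with no driver-local state to save. The entire wiring is one macro plus one pointer (example_pm_ops is a placeholder name):

        static SIMPLE_DEV_PM_OPS(example_pm_ops, tpm_pm_suspend,
                                 tpm_pm_resume);

        /* ...and in the i2c_driver definition:
         *         .driver.pm = &example_pm_ops,
         */
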
*/ static int tpm_st33_i2c_pm_suspend(struct device *dev) { - struct tpm_chip *chip = dev_get_drvdata(dev); struct st33zp24_platform_data *pin_infos = dev->platform_data; int ret = 0; if (power_mgt) { gpio_set_value(pin_infos->io_lpcpd, 0); } else { - if (chip->data_buffer == NULL) - chip->data_buffer = pin_infos->tpm_i2c_buffer[0]; ret = tpm_pm_suspend(dev); } return ret; @@ -849,8 +841,6 @@ static int tpm_st33_i2c_pm_resume(struct device *dev) TPM_STS_VALID) == TPM_STS_VALID, chip->vendor.timeout_b); } else { - if (chip->data_buffer == NULL) - chip->data_buffer = pin_infos->tpm_i2c_buffer[0]; ret = tpm_pm_resume(dev); if (!ret) tpm_do_selftest(chip); diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index 56b07c35a13..2783a42aa73 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -98,7 +98,7 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) if (count < len) { dev_err(ibmvtpm->dev, - "Invalid size in recv: count=%ld, crq_size=%d\n", + "Invalid size in recv: count=%zd, crq_size=%d\n", count, len); return -EIO; } @@ -136,7 +136,7 @@ static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) if (count > ibmvtpm->rtce_size) { dev_err(ibmvtpm->dev, - "Invalid size in send: count=%ld, rtce_size=%d\n", + "Invalid size in send: count=%zd, rtce_size=%d\n", count, ibmvtpm->rtce_size); return -EIO; } @@ -419,7 +419,7 @@ static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); -static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL); static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); diff --git a/drivers/char/tpm/tpm_ppi.c b/drivers/char/tpm/tpm_ppi.c index 2168d15bc72..8e562dc6560 100644 --- a/drivers/char/tpm/tpm_ppi.c +++ b/drivers/char/tpm/tpm_ppi.c @@ -452,12 +452,8 @@ int tpm_add_ppi(struct kobject *parent) { return sysfs_create_group(parent, &ppi_attr_grp); } -EXPORT_SYMBOL_GPL(tpm_add_ppi); void tpm_remove_ppi(struct kobject *parent) { sysfs_remove_group(parent, &ppi_attr_grp); } -EXPORT_SYMBOL_GPL(tpm_remove_ppi); - -MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 5796d0157ce..1b74459c072 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -448,7 +448,7 @@ static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, NULL); -static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps_1_2, NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL); static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c index 94c280d36e8..c8ff4df8177 100644 --- a/drivers/char/tpm/xen-tpmfront.c +++ b/drivers/char/tpm/xen-tpmfront.c @@ -351,8 +351,6 @@ static int tpmfront_probe(struct xenbus_device *dev, tpm_get_timeouts(priv->chip); - dev_set_drvdata(&dev->dev, priv->chip); - return rv; } diff --git a/drivers/cpufreq/cpufreq_conservative.c 
b/drivers/cpufreq/cpufreq_conservative.c index 218460fcd2e..25a70d06c5b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -68,6 +68,9 @@ static void cs_check_cpu(int cpu, unsigned int load) dbs_info->requested_freq += get_freq_target(cs_tuners, policy); + if (dbs_info->requested_freq > policy->max) + dbs_info->requested_freq = policy->max; + __cpufreq_driver_target(policy, dbs_info->requested_freq, CPUFREQ_RELATION_H); return; diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 0806c31e576..e6be63561fa 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -328,10 +328,6 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy, dbs_data->cdata->gov_dbs_timer); } - /* - * conservative does not implement micro like ondemand - * governor, thus we are bound to jiffes/HZ - */ if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { cs_dbs_info->down_skip = 0; cs_dbs_info->enable = 1; diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index be6d14307aa..a0acd0bfba4 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -53,6 +53,7 @@ static unsigned int omap_getspeed(unsigned int cpu) static int omap_target(struct cpufreq_policy *policy, unsigned int index) { + int r, ret; struct dev_pm_opp *opp; unsigned long freq, volt = 0, volt_old = 0, tol = 0; unsigned int old_freq, new_freq; diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index dd2874ec192..446687cc233 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -89,14 +89,15 @@ config AT_HDMAC Support the Atmel AHB DMA controller. config FSL_DMA - tristate "Freescale Elo and Elo Plus DMA support" + tristate "Freescale Elo series DMA support" depends on FSL_SOC select DMA_ENGINE select ASYNC_TX_ENABLE_CHANNEL_SWITCH ---help--- - Enable support for the Freescale Elo and Elo Plus DMA controllers. - The Elo is the DMA controller on some 82xx and 83xx parts, and the - Elo Plus is the DMA controller on 85xx and 86xx parts. + Enable support for the Freescale Elo series DMA controllers. + The Elo is the DMA controller on some mpc82xx and mpc83xx parts, the + EloPlus is on mpc85xx and mpc86xx and Pxxx parts, and the Elo3 is on + some Txxx and Bxxx parts. 
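
The cpufreq_conservative hunk above plugs a real hole: the governor ratchets
dbs_info->requested_freq up by a fixed step on every busy sample, so without
the clamp it can creep past policy->max, and __cpufreq_driver_target() would
then be asked for a frequency the policy forbids. The invariant is easy to see
in isolation; a toy userspace model of the ratchet (all names hypothetical):

    #include <stdio.h>

    struct policy { unsigned int min, max; };

    static unsigned int ratchet(const struct policy *p,
                                unsigned int requested,
                                unsigned int freq_step)
    {
            requested += freq_step;
            if (requested > p->max)   /* the fix: clamp before targeting */
                    requested = p->max;
            return requested;
    }

    int main(void)
    {
            struct policy p = { .min = 300000, .max = 1200000 };
            unsigned int req = 1150000;

            req = ratchet(&p, req, 100000);
            printf("requested=%u\n", req);    /* 1200000, not 1250000 */
            return 0;
    }
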
config MPC512X_DMA tristate "Freescale MPC512x built-in DMA engine support" diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index e51a9832ef0..16a2aa28f85 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -1164,42 +1164,12 @@ static void pl08x_free_txd(struct pl08x_driver_data *pl08x, kfree(txd); } -static void pl08x_unmap_buffers(struct pl08x_txd *txd) -{ - struct device *dev = txd->vd.tx.chan->device->dev; - struct pl08x_sg *dsg; - - if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->vd.tx.flags & DMA_COMPL_SRC_UNMAP_SINGLE) - list_for_each_entry(dsg, &txd->dsg_list, node) - dma_unmap_single(dev, dsg->src_addr, dsg->len, - DMA_TO_DEVICE); - else { - list_for_each_entry(dsg, &txd->dsg_list, node) - dma_unmap_page(dev, dsg->src_addr, dsg->len, - DMA_TO_DEVICE); - } - } - if (!(txd->vd.tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->vd.tx.flags & DMA_COMPL_DEST_UNMAP_SINGLE) - list_for_each_entry(dsg, &txd->dsg_list, node) - dma_unmap_single(dev, dsg->dst_addr, dsg->len, - DMA_FROM_DEVICE); - else - list_for_each_entry(dsg, &txd->dsg_list, node) - dma_unmap_page(dev, dsg->dst_addr, dsg->len, - DMA_FROM_DEVICE); - } -} - static void pl08x_desc_free(struct virt_dma_desc *vd) { struct pl08x_txd *txd = to_pl08x_txd(&vd->tx); struct pl08x_dma_chan *plchan = to_pl08x_chan(vd->tx.chan); - if (!plchan->slave) - pl08x_unmap_buffers(txd); - + dma_descriptor_unmap(txd); if (!txd->done) pl08x_release_mux(plchan); @@ -1252,7 +1222,7 @@ static enum dma_status pl08x_dma_tx_status(struct dma_chan *chan, size_t bytes = 0; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; /* @@ -1267,7 +1237,7 @@ static enum dma_status pl08x_dma_tx_status(struct dma_chan *chan, spin_lock_irqsave(&plchan->vc.lock, flags); ret = dma_cookie_status(chan, cookie, txstate); - if (ret != DMA_SUCCESS) { + if (ret != DMA_COMPLETE) { vd = vchan_find_desc(&plchan->vc, cookie); if (vd) { /* On the issued list, so hasn't been processed yet */ @@ -2138,8 +2108,7 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) writel(0x000000FF, pl08x->base + PL080_ERR_CLEAR); writel(0x000000FF, pl08x->base + PL080_TC_CLEAR); - ret = request_irq(adev->irq[0], pl08x_irq, IRQF_DISABLED, - DRIVER_NAME, pl08x); + ret = request_irq(adev->irq[0], pl08x_irq, 0, DRIVER_NAME, pl08x); if (ret) { dev_err(&adev->dev, "%s failed to request interrupt %d\n", __func__, adev->irq[0]); diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index c787f38a186..e2c04dc81e2 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -344,31 +344,7 @@ atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc) /* move myself to free_list */ list_move(&desc->desc_node, &atchan->free_list); - /* unmap dma addresses (not on slave channels) */ - if (!atchan->chan_common.private) { - struct device *parent = chan2parent(&atchan->chan_common); - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(parent, - desc->lli.daddr, - desc->len, DMA_FROM_DEVICE); - else - dma_unmap_page(parent, - desc->lli.daddr, - desc->len, DMA_FROM_DEVICE); - } - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(parent, - desc->lli.saddr, - desc->len, DMA_TO_DEVICE); - else - dma_unmap_page(parent, - desc->lli.saddr, - desc->len, DMA_TO_DEVICE); - } - } - + dma_descriptor_unmap(txd); /* for cyclic transfers, * no 
need to replay callback function while stopping */ if (!atc_chan_is_cyclic(atchan)) { @@ -1102,7 +1078,7 @@ atc_tx_status(struct dma_chan *chan, int bytes = 0; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; /* * There's no point calculating the residue if there's diff --git a/drivers/dma/coh901318.c b/drivers/dma/coh901318.c index 31011d2a26f..3c6716e0b78 100644 --- a/drivers/dma/coh901318.c +++ b/drivers/dma/coh901318.c @@ -2369,7 +2369,7 @@ coh901318_tx_status(struct dma_chan *chan, dma_cookie_t cookie, enum dma_status ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; dma_set_residue(txstate, coh901318_get_bytes_left(chan)); @@ -2694,7 +2694,7 @@ static int __init coh901318_probe(struct platform_device *pdev) if (irq < 0) return irq; - err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, IRQF_DISABLED, + err = devm_request_irq(&pdev->dev, irq, dma_irq_handler, 0, "coh901318", base); if (err) return err; diff --git a/drivers/dma/cppi41.c b/drivers/dma/cppi41.c index 7c82b92f9b1..c29dacff66f 100644 --- a/drivers/dma/cppi41.c +++ b/drivers/dma/cppi41.c @@ -141,6 +141,9 @@ struct cppi41_dd { const struct chan_queues *queues_rx; const struct chan_queues *queues_tx; struct chan_queues td_queue; + + /* context for suspend/resume */ + unsigned int dma_tdfdq; }; #define FIST_COMPLETION_QUEUE 93 @@ -263,6 +266,15 @@ static u32 pd_trans_len(u32 val) return val & ((1 << (DESC_LENGTH_BITS_NUM + 1)) - 1); } +static u32 cppi41_pop_desc(struct cppi41_dd *cdd, unsigned queue_num) +{ + u32 desc; + + desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(queue_num)); + desc &= ~0x1f; + return desc; +} + static irqreturn_t cppi41_irq(int irq, void *data) { struct cppi41_dd *cdd = data; @@ -300,8 +312,7 @@ static irqreturn_t cppi41_irq(int irq, void *data) q_num = __fls(val); val &= ~(1 << q_num); q_num += 32 * i; - desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(q_num)); - desc &= ~0x1f; + desc = cppi41_pop_desc(cdd, q_num); c = desc_to_chan(cdd, desc); if (WARN_ON(!c)) { pr_err("%s() q %d desc %08x\n", __func__, @@ -353,7 +364,7 @@ static enum dma_status cppi41_dma_tx_status(struct dma_chan *chan, /* lock */ ret = dma_cookie_status(chan, cookie, txstate); - if (txstate && ret == DMA_SUCCESS) + if (txstate && ret == DMA_COMPLETE) txstate->residue = c->residue; /* unlock */ @@ -517,15 +528,6 @@ static void cppi41_compute_td_desc(struct cppi41_desc *d) d->pd0 = DESC_TYPE_TEARD << DESC_TYPE; } -static u32 cppi41_pop_desc(struct cppi41_dd *cdd, unsigned queue_num) -{ - u32 desc; - - desc = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(queue_num)); - desc &= ~0x1f; - return desc; -} - static int cppi41_tear_down_chan(struct cppi41_channel *c) { struct cppi41_dd *cdd = c->cdd; @@ -561,36 +563,26 @@ static int cppi41_tear_down_chan(struct cppi41_channel *c) c->td_retry = 100; } - if (!c->td_seen) { - unsigned td_comp_queue; + if (!c->td_seen || !c->td_desc_seen) { - if (c->is_tx) - td_comp_queue = cdd->td_queue.complete; - else - td_comp_queue = c->q_comp_num; + desc_phys = cppi41_pop_desc(cdd, cdd->td_queue.complete); + if (!desc_phys) + desc_phys = cppi41_pop_desc(cdd, c->q_comp_num); - desc_phys = cppi41_pop_desc(cdd, td_comp_queue); - if (desc_phys) { - __iormb(); + if (desc_phys == c->desc_phys) { + c->td_desc_seen = 1; + + } else if (desc_phys == td_desc_phys) { + u32 pd0; - if (desc_phys == td_desc_phys) { - u32 pd0; - pd0 = td->pd0; - WARN_ON((pd0 >> DESC_TYPE) != DESC_TYPE_TEARD); - 
WARN_ON(!c->is_tx && !(pd0 & TD_DESC_IS_RX)); - WARN_ON((pd0 & 0x1f) != c->port_num); - } else { - WARN_ON_ONCE(1); - } - c->td_seen = 1; - } - } - if (!c->td_desc_seen) { - desc_phys = cppi41_pop_desc(cdd, c->q_comp_num); - if (desc_phys) { __iormb(); - WARN_ON(c->desc_phys != desc_phys); - c->td_desc_seen = 1; + pd0 = td->pd0; + WARN_ON((pd0 >> DESC_TYPE) != DESC_TYPE_TEARD); + WARN_ON(!c->is_tx && !(pd0 & TD_DESC_IS_RX)); + WARN_ON((pd0 & 0x1f) != c->port_num); + c->td_seen = 1; + } else if (desc_phys) { + WARN_ON_ONCE(1); } } c->td_retry--; @@ -609,7 +601,7 @@ static int cppi41_tear_down_chan(struct cppi41_channel *c) WARN_ON(!c->td_retry); if (!c->td_desc_seen) { - desc_phys = cppi_readl(cdd->qmgr_mem + QMGR_QUEUE_D(c->q_num)); + desc_phys = cppi41_pop_desc(cdd, c->q_num); WARN_ON(!desc_phys); } @@ -674,14 +666,14 @@ static void cleanup_chans(struct cppi41_dd *cdd) } } -static int cppi41_add_chans(struct platform_device *pdev, struct cppi41_dd *cdd) +static int cppi41_add_chans(struct device *dev, struct cppi41_dd *cdd) { struct cppi41_channel *cchan; int i; int ret; u32 n_chans; - ret = of_property_read_u32(pdev->dev.of_node, "#dma-channels", + ret = of_property_read_u32(dev->of_node, "#dma-channels", &n_chans); if (ret) return ret; @@ -719,7 +711,7 @@ err: return -ENOMEM; } -static void purge_descs(struct platform_device *pdev, struct cppi41_dd *cdd) +static void purge_descs(struct device *dev, struct cppi41_dd *cdd) { unsigned int mem_decs; int i; @@ -731,7 +723,7 @@ static void purge_descs(struct platform_device *pdev, struct cppi41_dd *cdd) cppi_writel(0, cdd->qmgr_mem + QMGR_MEMBASE(i)); cppi_writel(0, cdd->qmgr_mem + QMGR_MEMCTRL(i)); - dma_free_coherent(&pdev->dev, mem_decs, cdd->cd, + dma_free_coherent(dev, mem_decs, cdd->cd, cdd->descs_phys); } } @@ -741,19 +733,19 @@ static void disable_sched(struct cppi41_dd *cdd) cppi_writel(0, cdd->sched_mem + DMA_SCHED_CTRL); } -static void deinit_cpii41(struct platform_device *pdev, struct cppi41_dd *cdd) +static void deinit_cppi41(struct device *dev, struct cppi41_dd *cdd) { disable_sched(cdd); - purge_descs(pdev, cdd); + purge_descs(dev, cdd); cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM0_BASE); cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM0_BASE); - dma_free_coherent(&pdev->dev, QMGR_SCRATCH_SIZE, cdd->qmgr_scratch, + dma_free_coherent(dev, QMGR_SCRATCH_SIZE, cdd->qmgr_scratch, cdd->scratch_phys); } -static int init_descs(struct platform_device *pdev, struct cppi41_dd *cdd) +static int init_descs(struct device *dev, struct cppi41_dd *cdd) { unsigned int desc_size; unsigned int mem_decs; @@ -777,7 +769,7 @@ static int init_descs(struct platform_device *pdev, struct cppi41_dd *cdd) reg |= ilog2(ALLOC_DECS_NUM) - 5; BUILD_BUG_ON(DESCS_AREAS != 1); - cdd->cd = dma_alloc_coherent(&pdev->dev, mem_decs, + cdd->cd = dma_alloc_coherent(dev, mem_decs, &cdd->descs_phys, GFP_KERNEL); if (!cdd->cd) return -ENOMEM; @@ -813,12 +805,12 @@ static void init_sched(struct cppi41_dd *cdd) cppi_writel(reg, cdd->sched_mem + DMA_SCHED_CTRL); } -static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd) +static int init_cppi41(struct device *dev, struct cppi41_dd *cdd) { int ret; BUILD_BUG_ON(QMGR_SCRATCH_SIZE > ((1 << 14) - 1)); - cdd->qmgr_scratch = dma_alloc_coherent(&pdev->dev, QMGR_SCRATCH_SIZE, + cdd->qmgr_scratch = dma_alloc_coherent(dev, QMGR_SCRATCH_SIZE, &cdd->scratch_phys, GFP_KERNEL); if (!cdd->qmgr_scratch) return -ENOMEM; @@ -827,7 +819,7 @@ static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd) 
cppi_writel(QMGR_SCRATCH_SIZE, cdd->qmgr_mem + QMGR_LRAM_SIZE); cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM1_BASE); - ret = init_descs(pdev, cdd); + ret = init_descs(dev, cdd); if (ret) goto err_td; @@ -835,7 +827,7 @@ static int init_cppi41(struct platform_device *pdev, struct cppi41_dd *cdd) init_sched(cdd); return 0; err_td: - deinit_cpii41(pdev, cdd); + deinit_cppi41(dev, cdd); return ret; } @@ -914,11 +906,11 @@ static const struct of_device_id cppi41_dma_ids[] = { }; MODULE_DEVICE_TABLE(of, cppi41_dma_ids); -static const struct cppi_glue_infos *get_glue_info(struct platform_device *pdev) +static const struct cppi_glue_infos *get_glue_info(struct device *dev) { const struct of_device_id *of_id; - of_id = of_match_node(cppi41_dma_ids, pdev->dev.of_node); + of_id = of_match_node(cppi41_dma_ids, dev->of_node); if (!of_id) return NULL; return of_id->data; @@ -927,11 +919,12 @@ static const struct cppi_glue_infos *get_glue_info(struct platform_device *pdev) static int cppi41_dma_probe(struct platform_device *pdev) { struct cppi41_dd *cdd; + struct device *dev = &pdev->dev; const struct cppi_glue_infos *glue_info; int irq; int ret; - glue_info = get_glue_info(pdev); + glue_info = get_glue_info(dev); if (!glue_info) return -EINVAL; @@ -946,14 +939,14 @@ static int cppi41_dma_probe(struct platform_device *pdev) cdd->ddev.device_issue_pending = cppi41_dma_issue_pending; cdd->ddev.device_prep_slave_sg = cppi41_dma_prep_slave_sg; cdd->ddev.device_control = cppi41_dma_control; - cdd->ddev.dev = &pdev->dev; + cdd->ddev.dev = dev; INIT_LIST_HEAD(&cdd->ddev.channels); cpp41_dma_info.dma_cap = cdd->ddev.cap_mask; - cdd->usbss_mem = of_iomap(pdev->dev.of_node, 0); - cdd->ctrl_mem = of_iomap(pdev->dev.of_node, 1); - cdd->sched_mem = of_iomap(pdev->dev.of_node, 2); - cdd->qmgr_mem = of_iomap(pdev->dev.of_node, 3); + cdd->usbss_mem = of_iomap(dev->of_node, 0); + cdd->ctrl_mem = of_iomap(dev->of_node, 1); + cdd->sched_mem = of_iomap(dev->of_node, 2); + cdd->qmgr_mem = of_iomap(dev->of_node, 3); if (!cdd->usbss_mem || !cdd->ctrl_mem || !cdd->sched_mem || !cdd->qmgr_mem) { @@ -961,31 +954,31 @@ static int cppi41_dma_probe(struct platform_device *pdev) goto err_remap; } - pm_runtime_enable(&pdev->dev); - ret = pm_runtime_get_sync(&pdev->dev); - if (ret) + pm_runtime_enable(dev); + ret = pm_runtime_get_sync(dev); + if (ret < 0) goto err_get_sync; cdd->queues_rx = glue_info->queues_rx; cdd->queues_tx = glue_info->queues_tx; cdd->td_queue = glue_info->td_queue; - ret = init_cppi41(pdev, cdd); + ret = init_cppi41(dev, cdd); if (ret) goto err_init_cppi; - ret = cppi41_add_chans(pdev, cdd); + ret = cppi41_add_chans(dev, cdd); if (ret) goto err_chans; - irq = irq_of_parse_and_map(pdev->dev.of_node, 0); + irq = irq_of_parse_and_map(dev->of_node, 0); if (!irq) goto err_irq; cppi_writel(USBSS_IRQ_PD_COMP, cdd->usbss_mem + USBSS_IRQ_ENABLER); ret = request_irq(irq, glue_info->isr, IRQF_SHARED, - dev_name(&pdev->dev), cdd); + dev_name(dev), cdd); if (ret) goto err_irq; cdd->irq = irq; @@ -994,7 +987,7 @@ static int cppi41_dma_probe(struct platform_device *pdev) if (ret) goto err_dma_reg; - ret = of_dma_controller_register(pdev->dev.of_node, + ret = of_dma_controller_register(dev->of_node, cppi41_dma_xlate, &cpp41_dma_info); if (ret) goto err_of; @@ -1009,11 +1002,11 @@ err_irq: cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR); cleanup_chans(cdd); err_chans: - deinit_cpii41(pdev, cdd); + deinit_cppi41(dev, cdd); err_init_cppi: - pm_runtime_put(&pdev->dev); + pm_runtime_put(dev); err_get_sync: - 
pm_runtime_disable(&pdev->dev); + pm_runtime_disable(dev); iounmap(cdd->usbss_mem); iounmap(cdd->ctrl_mem); iounmap(cdd->sched_mem); @@ -1033,7 +1026,7 @@ static int cppi41_dma_remove(struct platform_device *pdev) cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR); free_irq(cdd->irq, cdd); cleanup_chans(cdd); - deinit_cpii41(pdev, cdd); + deinit_cppi41(&pdev->dev, cdd); iounmap(cdd->usbss_mem); iounmap(cdd->ctrl_mem); iounmap(cdd->sched_mem); @@ -1044,12 +1037,53 @@ static int cppi41_dma_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM_SLEEP +static int cppi41_suspend(struct device *dev) +{ + struct cppi41_dd *cdd = dev_get_drvdata(dev); + + cdd->dma_tdfdq = cppi_readl(cdd->ctrl_mem + DMA_TDFDQ); + cppi_writel(0, cdd->usbss_mem + USBSS_IRQ_CLEARR); + disable_sched(cdd); + + return 0; +} + +static int cppi41_resume(struct device *dev) +{ + struct cppi41_dd *cdd = dev_get_drvdata(dev); + struct cppi41_channel *c; + int i; + + for (i = 0; i < DESCS_AREAS; i++) + cppi_writel(cdd->descs_phys, cdd->qmgr_mem + QMGR_MEMBASE(i)); + + list_for_each_entry(c, &cdd->ddev.channels, chan.device_node) + if (!c->is_tx) + cppi_writel(c->q_num, c->gcr_reg + RXHPCRA0); + + init_sched(cdd); + + cppi_writel(cdd->dma_tdfdq, cdd->ctrl_mem + DMA_TDFDQ); + cppi_writel(cdd->scratch_phys, cdd->qmgr_mem + QMGR_LRAM0_BASE); + cppi_writel(QMGR_SCRATCH_SIZE, cdd->qmgr_mem + QMGR_LRAM_SIZE); + cppi_writel(0, cdd->qmgr_mem + QMGR_LRAM1_BASE); + + cppi_writel(USBSS_IRQ_PD_COMP, cdd->usbss_mem + USBSS_IRQ_ENABLER); + + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(cppi41_pm_ops, cppi41_suspend, cppi41_resume); + static struct platform_driver cpp41_dma_driver = { .probe = cppi41_dma_probe, .remove = cppi41_dma_remove, .driver = { .name = "cppi41-dma-engine", .owner = THIS_MODULE, + .pm = &cppi41_pm_ops, .of_match_table = of_match_ptr(cppi41_dma_ids), }, }; diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c index b0c0c8268d4..94c380f0753 100644 --- a/drivers/dma/dma-jz4740.c +++ b/drivers/dma/dma-jz4740.c @@ -491,7 +491,7 @@ static enum dma_status jz4740_dma_tx_status(struct dma_chan *c, unsigned long flags; status = dma_cookie_status(c, cookie, state); - if (status == DMA_SUCCESS || !state) + if (status == DMA_COMPLETE || !state) return status; spin_lock_irqsave(&chan->vchan.lock, flags); diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 9162ac80c18..ea806bdc12e 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -65,6 +65,7 @@ #include <linux/acpi.h> #include <linux/acpi_dma.h> #include <linux/of_dma.h> +#include <linux/mempool.h> static DEFINE_MUTEX(dma_list_mutex); static DEFINE_IDR(dma_idr); @@ -901,98 +902,132 @@ void dma_async_device_unregister(struct dma_device *device) } EXPORT_SYMBOL(dma_async_device_unregister); -/** - * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses - * @chan: DMA channel to offload copy to - * @dest: destination address (virtual) - * @src: source address (virtual) - * @len: length - * - * Both @dest and @src must be mappable to a bus address according to the - * DMA mapping API rules for streaming mappings. - * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages). 
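
The rename that recurs through these DMA drivers, DMA_SUCCESS becoming
DMA_COMPLETE, is mechanical, but the surrounding idiom is worth spelling out:
a driver's tx_status hook first lets the generic cookie machinery answer, and
only computes a residue when the transfer is still in flight. A hedged sketch
of that hook, modelled on the coh901318 change above (foo_get_residue() is
hypothetical):

    static enum dma_status foo_tx_status(struct dma_chan *chan,
                                         dma_cookie_t cookie,
                                         struct dma_tx_state *txstate)
    {
            enum dma_status ret;

            ret = dma_cookie_status(chan, cookie, txstate);
            if (ret == DMA_COMPLETE)        /* formerly DMA_SUCCESS */
                    return ret;             /* done: no residue to report */

            /* still queued or running: report the bytes left */
            dma_set_residue(txstate, foo_get_residue(chan));
            return ret;
    }

dma_cookie_status() fills in the last/used cookies when txstate is non-NULL;
callers that pass txstate == NULL only want the status, which is why drivers
such as dma-jz4740 above short-circuit on !state as well.
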
- */ -dma_cookie_t -dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, - void *src, size_t len) -{ - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - dma_addr_t dma_dest, dma_src; - dma_cookie_t cookie; - unsigned long flags; +struct dmaengine_unmap_pool { + struct kmem_cache *cache; + const char *name; + mempool_t *pool; + size_t size; +}; - dma_src = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE); - dma_dest = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE); - flags = DMA_CTRL_ACK | - DMA_COMPL_SRC_UNMAP_SINGLE | - DMA_COMPL_DEST_UNMAP_SINGLE; - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); +#define __UNMAP_POOL(x) { .size = x, .name = "dmaengine-unmap-" __stringify(x) } +static struct dmaengine_unmap_pool unmap_pool[] = { + __UNMAP_POOL(2), + #if IS_ENABLED(CONFIG_ASYNC_TX_DMA) + __UNMAP_POOL(16), + __UNMAP_POOL(128), + __UNMAP_POOL(256), + #endif +}; - if (!tx) { - dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); - dma_unmap_single(dev->dev, dma_dest, len, DMA_FROM_DEVICE); - return -ENOMEM; +static struct dmaengine_unmap_pool *__get_unmap_pool(int nr) +{ + int order = get_count_order(nr); + + switch (order) { + case 0 ... 1: + return &unmap_pool[0]; + case 2 ... 4: + return &unmap_pool[1]; + case 5 ... 7: + return &unmap_pool[2]; + case 8: + return &unmap_pool[3]; + default: + BUG(); + return NULL; } +} - tx->callback = NULL; - cookie = tx->tx_submit(tx); +static void dmaengine_unmap(struct kref *kref) +{ + struct dmaengine_unmap_data *unmap = container_of(kref, typeof(*unmap), kref); + struct device *dev = unmap->dev; + int cnt, i; + + cnt = unmap->to_cnt; + for (i = 0; i < cnt; i++) + dma_unmap_page(dev, unmap->addr[i], unmap->len, + DMA_TO_DEVICE); + cnt += unmap->from_cnt; + for (; i < cnt; i++) + dma_unmap_page(dev, unmap->addr[i], unmap->len, + DMA_FROM_DEVICE); + cnt += unmap->bidi_cnt; + for (; i < cnt; i++) { + if (unmap->addr[i] == 0) + continue; + dma_unmap_page(dev, unmap->addr[i], unmap->len, + DMA_BIDIRECTIONAL); + } + mempool_free(unmap, __get_unmap_pool(cnt)->pool); +} - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); +void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap) +{ + if (unmap) + kref_put(&unmap->kref, dmaengine_unmap); +} +EXPORT_SYMBOL_GPL(dmaengine_unmap_put); - return cookie; +static void dmaengine_destroy_unmap_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(unmap_pool); i++) { + struct dmaengine_unmap_pool *p = &unmap_pool[i]; + + if (p->pool) + mempool_destroy(p->pool); + p->pool = NULL; + if (p->cache) + kmem_cache_destroy(p->cache); + p->cache = NULL; + } } -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. 
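
The unmap pool above is the heart of this series: instead of every driver
open-coding dma_unmap_single()/dma_unmap_page() calls keyed off DMA_COMPL_*
flags, a transfer now carries one refcounted dmaengine_unmap_data object drawn
from a size-class mempool (2, 16, 128 or 256 addresses; the larger classes
only with CONFIG_ASYNC_TX_DMA). addr[] is filled in order: to_cnt source
mappings, then from_cnt destination mappings, then bidi_cnt bidirectional
ones, and the final kref_put() unmaps them all with the matching direction.
The client-side lifecycle, sketched against the pg_to_pg rewrite below with
error checking trimmed:

    struct dmaengine_unmap_data *um;

    um = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOIO);
    if (!um)
            return -ENOMEM;

    um->len = len;
    um->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len,
                               DMA_TO_DEVICE);
    um->to_cnt = 1;                 /* addr[0..to_cnt) -> TO_DEVICE */
    um->addr[1] = dma_map_page(dev->dev, dst_pg, dst_off, len,
                               DMA_FROM_DEVICE);
    um->from_cnt = 1;               /* next from_cnt -> FROM_DEVICE */

    tx = dev->device_prep_dma_memcpy(chan, um->addr[1], um->addr[0],
                                     len, DMA_CTRL_ACK);
    if (!tx) {
            dmaengine_unmap_put(um);        /* last ref: unmaps both */
            return -ENOMEM;
    }
    dma_set_unmap(tx, um);          /* descriptor takes a reference */
    cookie = tx->tx_submit(tx);
    dmaengine_unmap_put(um);        /* drop ours; completion drops the last */
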
- * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -dma_cookie_t -dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, - unsigned int offset, void *kdata, size_t len) +static int __init dmaengine_init_unmap_pool(void) { - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - dma_addr_t dma_dest, dma_src; - dma_cookie_t cookie; - unsigned long flags; + int i; - dma_src = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE); - dma_dest = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE); - flags = DMA_CTRL_ACK | DMA_COMPL_SRC_UNMAP_SINGLE; - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); + for (i = 0; i < ARRAY_SIZE(unmap_pool); i++) { + struct dmaengine_unmap_pool *p = &unmap_pool[i]; + size_t size; - if (!tx) { - dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); - dma_unmap_page(dev->dev, dma_dest, len, DMA_FROM_DEVICE); - return -ENOMEM; + size = sizeof(struct dmaengine_unmap_data) + + sizeof(dma_addr_t) * p->size; + + p->cache = kmem_cache_create(p->name, size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!p->cache) + break; + p->pool = mempool_create_slab_pool(1, p->cache); + if (!p->pool) + break; } - tx->callback = NULL; - cookie = tx->tx_submit(tx); + if (i == ARRAY_SIZE(unmap_pool)) + return 0; - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); + dmaengine_destroy_unmap_pool(); + return -ENOMEM; +} - return cookie; +struct dmaengine_unmap_data * +dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) +{ + struct dmaengine_unmap_data *unmap; + + unmap = mempool_alloc(__get_unmap_pool(nr)->pool, flags); + if (!unmap) + return NULL; + + memset(unmap, 0, sizeof(*unmap)); + kref_init(&unmap->kref); + unmap->dev = dev; + + return unmap; } -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); +EXPORT_SYMBOL(dmaengine_get_unmap_data); /** * dma_async_memcpy_pg_to_pg - offloaded copy from page to page @@ -1015,24 +1050,33 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, { struct dma_device *dev = chan->device; struct dma_async_tx_descriptor *tx; - dma_addr_t dma_dest, dma_src; + struct dmaengine_unmap_data *unmap; dma_cookie_t cookie; unsigned long flags; - dma_src = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE); - dma_dest = dma_map_page(dev->dev, dest_pg, dest_off, len, - DMA_FROM_DEVICE); + unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOIO); + if (!unmap) + return -ENOMEM; + + unmap->to_cnt = 1; + unmap->from_cnt = 1; + unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len, + DMA_TO_DEVICE); + unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len, + DMA_FROM_DEVICE); + unmap->len = len; flags = DMA_CTRL_ACK; - tx = dev->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, flags); + tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0], + len, flags); if (!tx) { - dma_unmap_page(dev->dev, dma_src, len, DMA_TO_DEVICE); - dma_unmap_page(dev->dev, dma_dest, len, DMA_FROM_DEVICE); + dmaengine_unmap_put(unmap); return -ENOMEM; } - tx->callback = NULL; + dma_set_unmap(tx, unmap); cookie = tx->tx_submit(tx); + dmaengine_unmap_put(unmap); preempt_disable(); __this_cpu_add(chan->local->bytes_transferred, len); @@ -1043,6 +1087,52 @@ dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, } EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); +/** + * dma_async_memcpy_buf_to_buf - offloaded copy 
between virtual addresses + * @chan: DMA channel to offload copy to + * @dest: destination address (virtual) + * @src: source address (virtual) + * @len: length + * + * Both @dest and @src must be mappable to a bus address according to the + * DMA mapping API rules for streaming mappings. + * Both @dest and @src must stay memory resident (kernel memory or locked + * user space pages). + */ +dma_cookie_t +dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, + void *src, size_t len) +{ + return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest), + (unsigned long) dest & ~PAGE_MASK, + virt_to_page(src), + (unsigned long) src & ~PAGE_MASK, len); +} +EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); + +/** + * dma_async_memcpy_buf_to_pg - offloaded copy from address to page + * @chan: DMA channel to offload copy to + * @page: destination page + * @offset: offset in page to copy to + * @kdata: source address (virtual) + * @len: length + * + * Both @page/@offset and @kdata must be mappable to a bus address according + * to the DMA mapping API rules for streaming mappings. + * Both @page/@offset and @kdata must stay memory resident (kernel memory or + * locked user space pages) + */ +dma_cookie_t +dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, + unsigned int offset, void *kdata, size_t len) +{ + return dma_async_memcpy_pg_to_pg(chan, page, offset, + virt_to_page(kdata), + (unsigned long) kdata & ~PAGE_MASK, len); +} +EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); + void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { @@ -1062,7 +1152,7 @@ dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); if (!tx) - return DMA_SUCCESS; + return DMA_COMPLETE; while (tx->cookie == -EBUSY) { if (time_after_eq(jiffies, dma_sync_wait_timeout)) { @@ -1116,6 +1206,10 @@ EXPORT_SYMBOL_GPL(dma_run_dependencies); static int __init dma_bus_init(void) { + int err = dmaengine_init_unmap_pool(); + + if (err) + return err; return class_register(&dma_devclass); } arch_initcall(dma_bus_init); diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 92f796cdc6a..20f9a3aaf92 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -8,6 +8,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/dmaengine.h> @@ -19,10 +21,6 @@ #include <linux/random.h> #include <linux/slab.h> #include <linux/wait.h> -#include <linux/ctype.h> -#include <linux/debugfs.h> -#include <linux/uaccess.h> -#include <linux/seq_file.h> static unsigned int test_buf_size = 16384; module_param(test_buf_size, uint, S_IRUGO | S_IWUSR); @@ -68,92 +66,13 @@ module_param(timeout, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(timeout, "Transfer Timeout in msec (default: 3000), " "Pass -1 for infinite timeout"); -/* Maximum amount of mismatched bytes in buffer to print */ -#define MAX_ERROR_COUNT 32 - -/* - * Initialization patterns. All bytes in the source buffer has bit 7 - * set, all bytes in the destination buffer has bit 7 cleared. - * - * Bit 6 is set for all bytes which are to be copied by the DMA - * engine. Bit 5 is set for all bytes which are to be overwritten by - * the DMA engine. - * - * The remaining bits are the inverse of a counter which increments by - * one for each byte address. 
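
After the rewrite just above, dma_async_memcpy_buf_to_buf() and
dma_async_memcpy_buf_to_pg() become thin wrappers: a kernel virtual address in
the linear map is split into its page and intra-page offset and forwarded to
the pg_to_pg variant, so all three entry points share one mapping and unmap
path. The split itself is plain mask arithmetic, shown here as a standalone
toy (a PAGE_SHIFT of 12 and a made-up address are assumed):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
            uintptr_t addr = 0x12345678;    /* hypothetical address */

            /* what virt_to_page() resolves, as a raw page base */
            uintptr_t base   = addr & PAGE_MASK;
            /* the offset handed to dma_async_memcpy_pg_to_pg() */
            uintptr_t offset = addr & ~PAGE_MASK;

            printf("base=%#lx offset=%#lx\n",   /* 0x12345000, 0x678 */
                   (unsigned long)base, (unsigned long)offset);
            return 0;
    }
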
- */ -#define PATTERN_SRC 0x80 -#define PATTERN_DST 0x00 -#define PATTERN_COPY 0x40 -#define PATTERN_OVERWRITE 0x20 -#define PATTERN_COUNT_MASK 0x1f - -enum dmatest_error_type { - DMATEST_ET_OK, - DMATEST_ET_MAP_SRC, - DMATEST_ET_MAP_DST, - DMATEST_ET_PREP, - DMATEST_ET_SUBMIT, - DMATEST_ET_TIMEOUT, - DMATEST_ET_DMA_ERROR, - DMATEST_ET_DMA_IN_PROGRESS, - DMATEST_ET_VERIFY, - DMATEST_ET_VERIFY_BUF, -}; - -struct dmatest_verify_buffer { - unsigned int index; - u8 expected; - u8 actual; -}; - -struct dmatest_verify_result { - unsigned int error_count; - struct dmatest_verify_buffer data[MAX_ERROR_COUNT]; - u8 pattern; - bool is_srcbuf; -}; - -struct dmatest_thread_result { - struct list_head node; - unsigned int n; - unsigned int src_off; - unsigned int dst_off; - unsigned int len; - enum dmatest_error_type type; - union { - unsigned long data; - dma_cookie_t cookie; - enum dma_status status; - int error; - struct dmatest_verify_result *vr; - }; -}; - -struct dmatest_result { - struct list_head node; - char *name; - struct list_head results; -}; - -struct dmatest_info; - -struct dmatest_thread { - struct list_head node; - struct dmatest_info *info; - struct task_struct *task; - struct dma_chan *chan; - u8 **srcs; - u8 **dsts; - enum dma_transaction_type type; - bool done; -}; +static bool noverify; +module_param(noverify, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(noverify, "Disable random data setup and verification"); -struct dmatest_chan { - struct list_head node; - struct dma_chan *chan; - struct list_head threads; -}; +static bool verbose; +module_param(verbose, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(verbose, "Enable \"success\" result messages (default: off)"); /** * struct dmatest_params - test parameters. @@ -177,6 +96,7 @@ struct dmatest_params { unsigned int xor_sources; unsigned int pq_sources; int timeout; + bool noverify; }; /** @@ -184,7 +104,7 @@ struct dmatest_params { * @params: test parameters * @lock: access protection to the fields of this structure */ -struct dmatest_info { +static struct dmatest_info { /* Test parameters */ struct dmatest_params params; @@ -192,16 +112,95 @@ struct dmatest_info { struct list_head channels; unsigned int nr_channels; struct mutex lock; + bool did_init; +} test_info = { + .channels = LIST_HEAD_INIT(test_info.channels), + .lock = __MUTEX_INITIALIZER(test_info.lock), +}; + +static int dmatest_run_set(const char *val, const struct kernel_param *kp); +static int dmatest_run_get(char *val, const struct kernel_param *kp); +static struct kernel_param_ops run_ops = { + .set = dmatest_run_set, + .get = dmatest_run_get, +}; +static bool dmatest_run; +module_param_cb(run, &run_ops, &dmatest_run, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(run, "Run the test (default: false)"); + +/* Maximum amount of mismatched bytes in buffer to print */ +#define MAX_ERROR_COUNT 32 + +/* + * Initialization patterns. All bytes in the source buffer has bit 7 + * set, all bytes in the destination buffer has bit 7 cleared. + * + * Bit 6 is set for all bytes which are to be copied by the DMA + * engine. Bit 5 is set for all bytes which are to be overwritten by + * the DMA engine. + * + * The remaining bits are the inverse of a counter which increments by + * one for each byte address. 
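
The pattern scheme restated above encodes provenance in every byte: bit 7
distinguishes source (set) from destination (clear), bit 6 marks bytes the
engine must copy, bit 5 bytes it may overwrite, and the low five bits hold the
inverted buffer index, so the verifier can tell "not copied" from
"overwritten" from a plain mismatch. Working one byte through it (buffer
index 3, inside the copied region):

    #include <stdio.h>

    #define PATTERN_SRC        0x80
    #define PATTERN_DST        0x00
    #define PATTERN_COPY       0x40
    #define PATTERN_COUNT_MASK 0x1f

    int main(void)
    {
            unsigned int i = 3;

            /* source byte in [src_off, src_off + len) */
            unsigned char src = PATTERN_SRC | PATTERN_COPY |
                                (~i & PATTERN_COUNT_MASK);
            /* destination byte before the transfer */
            unsigned char dst = PATTERN_DST | (~i & PATTERN_COUNT_MASK);

            printf("src=0x%02x dst=0x%02x\n", src, dst);  /* 0xdc 0x1c */
            return 0;
    }

After a correct copy the destination byte reads 0xdc as well; finding 0x1c
there tells the verifier the engine never wrote that byte.
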
+ */ +#define PATTERN_SRC 0x80 +#define PATTERN_DST 0x00 +#define PATTERN_COPY 0x40 +#define PATTERN_OVERWRITE 0x20 +#define PATTERN_COUNT_MASK 0x1f - /* debugfs related stuff */ - struct dentry *root; +struct dmatest_thread { + struct list_head node; + struct dmatest_info *info; + struct task_struct *task; + struct dma_chan *chan; + u8 **srcs; + u8 **dsts; + enum dma_transaction_type type; + bool done; +}; - /* Test results */ - struct list_head results; - struct mutex results_lock; +struct dmatest_chan { + struct list_head node; + struct dma_chan *chan; + struct list_head threads; }; -static struct dmatest_info test_info; +static DECLARE_WAIT_QUEUE_HEAD(thread_wait); +static bool wait; + +static bool is_threaded_test_run(struct dmatest_info *info) +{ + struct dmatest_chan *dtc; + + list_for_each_entry(dtc, &info->channels, node) { + struct dmatest_thread *thread; + + list_for_each_entry(thread, &dtc->threads, node) { + if (!thread->done) + return true; + } + } + + return false; +} + +static int dmatest_wait_get(char *val, const struct kernel_param *kp) +{ + struct dmatest_info *info = &test_info; + struct dmatest_params *params = &info->params; + + if (params->iterations) + wait_event(thread_wait, !is_threaded_test_run(info)); + wait = true; + return param_get_bool(val, kp); +} + +static struct kernel_param_ops wait_ops = { + .get = dmatest_wait_get, + .set = param_set_bool, +}; +module_param_cb(wait, &wait_ops, &wait, S_IRUGO); +MODULE_PARM_DESC(wait, "Wait for tests to complete (default: false)"); static bool dmatest_match_channel(struct dmatest_params *params, struct dma_chan *chan) @@ -223,7 +222,7 @@ static unsigned long dmatest_random(void) { unsigned long buf; - get_random_bytes(&buf, sizeof(buf)); + prandom_bytes(&buf, sizeof(buf)); return buf; } @@ -262,9 +261,31 @@ static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len, } } -static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs, - unsigned int start, unsigned int end, unsigned int counter, - u8 pattern, bool is_srcbuf) +static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index, + unsigned int counter, bool is_srcbuf) +{ + u8 diff = actual ^ pattern; + u8 expected = pattern | (~counter & PATTERN_COUNT_MASK); + const char *thread_name = current->comm; + + if (is_srcbuf) + pr_warn("%s: srcbuf[0x%x] overwritten! Expected %02x, got %02x\n", + thread_name, index, expected, actual); + else if ((pattern & PATTERN_COPY) + && (diff & (PATTERN_COPY | PATTERN_OVERWRITE))) + pr_warn("%s: dstbuf[0x%x] not copied! Expected %02x, got %02x\n", + thread_name, index, expected, actual); + else if (diff & PATTERN_SRC) + pr_warn("%s: dstbuf[0x%x] was copied! Expected %02x, got %02x\n", + thread_name, index, expected, actual); + else + pr_warn("%s: dstbuf[0x%x] mismatch! 
Expected %02x, got %02x\n", + thread_name, index, expected, actual); +} + +static unsigned int dmatest_verify(u8 **bufs, unsigned int start, + unsigned int end, unsigned int counter, u8 pattern, + bool is_srcbuf) { unsigned int i; unsigned int error_count = 0; @@ -272,7 +293,6 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs, u8 expected; u8 *buf; unsigned int counter_orig = counter; - struct dmatest_verify_buffer *vb; for (; (buf = *bufs); bufs++) { counter = counter_orig; @@ -280,12 +300,9 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs, actual = buf[i]; expected = pattern | (~counter & PATTERN_COUNT_MASK); if (actual != expected) { - if (error_count < MAX_ERROR_COUNT && vr) { - vb = &vr->data[error_count]; - vb->index = i; - vb->expected = expected; - vb->actual = actual; - } + if (error_count < MAX_ERROR_COUNT) + dmatest_mismatch(actual, pattern, i, + counter, is_srcbuf); error_count++; } counter++; @@ -293,7 +310,7 @@ static unsigned int dmatest_verify(struct dmatest_verify_result *vr, u8 **bufs, } if (error_count > MAX_ERROR_COUNT) - pr_warning("%s: %u errors suppressed\n", + pr_warn("%s: %u errors suppressed\n", current->comm, error_count - MAX_ERROR_COUNT); return error_count; @@ -313,20 +330,6 @@ static void dmatest_callback(void *arg) wake_up_all(done->wait); } -static inline void unmap_src(struct device *dev, dma_addr_t *addr, size_t len, - unsigned int count) -{ - while (count--) - dma_unmap_single(dev, addr[count], len, DMA_TO_DEVICE); -} - -static inline void unmap_dst(struct device *dev, dma_addr_t *addr, size_t len, - unsigned int count) -{ - while (count--) - dma_unmap_single(dev, addr[count], len, DMA_BIDIRECTIONAL); -} - static unsigned int min_odd(unsigned int x, unsigned int y) { unsigned int val = min(x, y); @@ -334,172 +337,49 @@ static unsigned int min_odd(unsigned int x, unsigned int y) return val % 2 ? 
val : val - 1; } -static char *verify_result_get_one(struct dmatest_verify_result *vr, - unsigned int i) +static void result(const char *err, unsigned int n, unsigned int src_off, + unsigned int dst_off, unsigned int len, unsigned long data) { - struct dmatest_verify_buffer *vb = &vr->data[i]; - u8 diff = vb->actual ^ vr->pattern; - static char buf[512]; - char *msg; - - if (vr->is_srcbuf) - msg = "srcbuf overwritten!"; - else if ((vr->pattern & PATTERN_COPY) - && (diff & (PATTERN_COPY | PATTERN_OVERWRITE))) - msg = "dstbuf not copied!"; - else if (diff & PATTERN_SRC) - msg = "dstbuf was copied!"; - else - msg = "dstbuf mismatch!"; - - snprintf(buf, sizeof(buf) - 1, "%s [0x%x] Expected %02x, got %02x", msg, - vb->index, vb->expected, vb->actual); - - return buf; + pr_info("%s: result #%u: '%s' with src_off=0x%x dst_off=0x%x len=0x%x (%lu)", + current->comm, n, err, src_off, dst_off, len, data); } -static char *thread_result_get(const char *name, - struct dmatest_thread_result *tr) +static void dbg_result(const char *err, unsigned int n, unsigned int src_off, + unsigned int dst_off, unsigned int len, + unsigned long data) { - static const char * const messages[] = { - [DMATEST_ET_OK] = "No errors", - [DMATEST_ET_MAP_SRC] = "src mapping error", - [DMATEST_ET_MAP_DST] = "dst mapping error", - [DMATEST_ET_PREP] = "prep error", - [DMATEST_ET_SUBMIT] = "submit error", - [DMATEST_ET_TIMEOUT] = "test timed out", - [DMATEST_ET_DMA_ERROR] = - "got completion callback (DMA_ERROR)", - [DMATEST_ET_DMA_IN_PROGRESS] = - "got completion callback (DMA_IN_PROGRESS)", - [DMATEST_ET_VERIFY] = "errors", - [DMATEST_ET_VERIFY_BUF] = "verify errors", - }; - static char buf[512]; - - snprintf(buf, sizeof(buf) - 1, - "%s: #%u: %s with src_off=0x%x ""dst_off=0x%x len=0x%x (%lu)", - name, tr->n, messages[tr->type], tr->src_off, tr->dst_off, - tr->len, tr->data); - - return buf; + pr_debug("%s: result #%u: '%s' with src_off=0x%x dst_off=0x%x len=0x%x (%lu)", + current->comm, n, err, src_off, dst_off, len, data); } -static int thread_result_add(struct dmatest_info *info, - struct dmatest_result *r, enum dmatest_error_type type, - unsigned int n, unsigned int src_off, unsigned int dst_off, - unsigned int len, unsigned long data) -{ - struct dmatest_thread_result *tr; - - tr = kzalloc(sizeof(*tr), GFP_KERNEL); - if (!tr) - return -ENOMEM; - - tr->type = type; - tr->n = n; - tr->src_off = src_off; - tr->dst_off = dst_off; - tr->len = len; - tr->data = data; +#define verbose_result(err, n, src_off, dst_off, len, data) ({ \ + if (verbose) \ + result(err, n, src_off, dst_off, len, data); \ + else \ + dbg_result(err, n, src_off, dst_off, len, data); \ +}) - mutex_lock(&info->results_lock); - list_add_tail(&tr->node, &r->results); - mutex_unlock(&info->results_lock); - - if (tr->type == DMATEST_ET_OK) - pr_debug("%s\n", thread_result_get(r->name, tr)); - else - pr_warn("%s\n", thread_result_get(r->name, tr)); - - return 0; -} - -static unsigned int verify_result_add(struct dmatest_info *info, - struct dmatest_result *r, unsigned int n, - unsigned int src_off, unsigned int dst_off, unsigned int len, - u8 **bufs, int whence, unsigned int counter, u8 pattern, - bool is_srcbuf) +static unsigned long long dmatest_persec(s64 runtime, unsigned int val) { - struct dmatest_verify_result *vr; - unsigned int error_count; - unsigned int buf_off = is_srcbuf ? 
src_off : dst_off; - unsigned int start, end; - - if (whence < 0) { - start = 0; - end = buf_off; - } else if (whence > 0) { - start = buf_off + len; - end = info->params.buf_size; - } else { - start = buf_off; - end = buf_off + len; - } + unsigned long long per_sec = 1000000; - vr = kmalloc(sizeof(*vr), GFP_KERNEL); - if (!vr) { - pr_warn("dmatest: No memory to store verify result\n"); - return dmatest_verify(NULL, bufs, start, end, counter, pattern, - is_srcbuf); - } - - vr->pattern = pattern; - vr->is_srcbuf = is_srcbuf; - - error_count = dmatest_verify(vr, bufs, start, end, counter, pattern, - is_srcbuf); - if (error_count) { - vr->error_count = error_count; - thread_result_add(info, r, DMATEST_ET_VERIFY_BUF, n, src_off, - dst_off, len, (unsigned long)vr); - return error_count; - } - - kfree(vr); - return 0; -} - -static void result_free(struct dmatest_info *info, const char *name) -{ - struct dmatest_result *r, *_r; - - mutex_lock(&info->results_lock); - list_for_each_entry_safe(r, _r, &info->results, node) { - struct dmatest_thread_result *tr, *_tr; - - if (name && strcmp(r->name, name)) - continue; - - list_for_each_entry_safe(tr, _tr, &r->results, node) { - if (tr->type == DMATEST_ET_VERIFY_BUF) - kfree(tr->vr); - list_del(&tr->node); - kfree(tr); - } + if (runtime <= 0) + return 0; - kfree(r->name); - list_del(&r->node); - kfree(r); + /* drop precision until runtime is 32-bits */ + while (runtime > UINT_MAX) { + runtime >>= 1; + per_sec <<= 1; } - mutex_unlock(&info->results_lock); + per_sec *= val; + do_div(per_sec, runtime); + return per_sec; } -static struct dmatest_result *result_init(struct dmatest_info *info, - const char *name) +static unsigned long long dmatest_KBs(s64 runtime, unsigned long long len) { - struct dmatest_result *r; - - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r) { - r->name = kstrdup(name, GFP_KERNEL); - INIT_LIST_HEAD(&r->results); - mutex_lock(&info->results_lock); - list_add_tail(&r->node, &info->results); - mutex_unlock(&info->results_lock); - } - return r; + return dmatest_persec(runtime, len >> 10); } /* @@ -525,7 +405,6 @@ static int dmatest_func(void *data) struct dmatest_params *params; struct dma_chan *chan; struct dma_device *dev; - const char *thread_name; unsigned int src_off, dst_off, len; unsigned int error_count; unsigned int failed_tests = 0; @@ -538,9 +417,10 @@ static int dmatest_func(void *data) int src_cnt; int dst_cnt; int i; - struct dmatest_result *result; + ktime_t ktime; + s64 runtime = 0; + unsigned long long total_len = 0; - thread_name = current->comm; set_freezable(); ret = -ENOMEM; @@ -570,10 +450,6 @@ static int dmatest_func(void *data) } else goto err_thread_type; - result = result_init(info, thread_name); - if (!result) - goto err_srcs; - thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL); if (!thread->srcs) goto err_srcs; @@ -597,17 +473,17 @@ static int dmatest_func(void *data) set_user_nice(current, 10); /* - * src buffers are freed by the DMAEngine code with dma_unmap_single() - * dst buffers are freed by ourselves below + * src and dst buffers are freed by ourselves below */ - flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT - | DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SRC_UNMAP_SINGLE; + flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT; + ktime = ktime_get(); while (!kthread_should_stop() && !(params->iterations && total_tests >= params->iterations)) { struct dma_async_tx_descriptor *tx = NULL; - dma_addr_t dma_srcs[src_cnt]; - dma_addr_t dma_dsts[dst_cnt]; + struct dmaengine_unmap_data *um; + dma_addr_t srcs[src_cnt]; + 
dma_addr_t *dsts; u8 align = 0; total_tests++; @@ -626,81 +502,103 @@ static int dmatest_func(void *data) break; } - len = dmatest_random() % params->buf_size + 1; + if (params->noverify) { + len = params->buf_size; + src_off = 0; + dst_off = 0; + } else { + len = dmatest_random() % params->buf_size + 1; + len = (len >> align) << align; + if (!len) + len = 1 << align; + src_off = dmatest_random() % (params->buf_size - len + 1); + dst_off = dmatest_random() % (params->buf_size - len + 1); + + src_off = (src_off >> align) << align; + dst_off = (dst_off >> align) << align; + + dmatest_init_srcs(thread->srcs, src_off, len, + params->buf_size); + dmatest_init_dsts(thread->dsts, dst_off, len, + params->buf_size); + } + len = (len >> align) << align; if (!len) len = 1 << align; - src_off = dmatest_random() % (params->buf_size - len + 1); - dst_off = dmatest_random() % (params->buf_size - len + 1); + total_len += len; - src_off = (src_off >> align) << align; - dst_off = (dst_off >> align) << align; - - dmatest_init_srcs(thread->srcs, src_off, len, params->buf_size); - dmatest_init_dsts(thread->dsts, dst_off, len, params->buf_size); + um = dmaengine_get_unmap_data(dev->dev, src_cnt+dst_cnt, + GFP_KERNEL); + if (!um) { + failed_tests++; + result("unmap data NULL", total_tests, + src_off, dst_off, len, ret); + continue; + } + um->len = params->buf_size; for (i = 0; i < src_cnt; i++) { - u8 *buf = thread->srcs[i] + src_off; - - dma_srcs[i] = dma_map_single(dev->dev, buf, len, - DMA_TO_DEVICE); - ret = dma_mapping_error(dev->dev, dma_srcs[i]); + unsigned long buf = (unsigned long) thread->srcs[i]; + struct page *pg = virt_to_page(buf); + unsigned pg_off = buf & ~PAGE_MASK; + + um->addr[i] = dma_map_page(dev->dev, pg, pg_off, + um->len, DMA_TO_DEVICE); + srcs[i] = um->addr[i] + src_off; + ret = dma_mapping_error(dev->dev, um->addr[i]); if (ret) { - unmap_src(dev->dev, dma_srcs, len, i); - thread_result_add(info, result, - DMATEST_ET_MAP_SRC, - total_tests, src_off, dst_off, - len, ret); + dmaengine_unmap_put(um); + result("src mapping error", total_tests, + src_off, dst_off, len, ret); failed_tests++; continue; } + um->to_cnt++; } /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */ + dsts = &um->addr[src_cnt]; for (i = 0; i < dst_cnt; i++) { - dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i], - params->buf_size, - DMA_BIDIRECTIONAL); - ret = dma_mapping_error(dev->dev, dma_dsts[i]); + unsigned long buf = (unsigned long) thread->dsts[i]; + struct page *pg = virt_to_page(buf); + unsigned pg_off = buf & ~PAGE_MASK; + + dsts[i] = dma_map_page(dev->dev, pg, pg_off, um->len, + DMA_BIDIRECTIONAL); + ret = dma_mapping_error(dev->dev, dsts[i]); if (ret) { - unmap_src(dev->dev, dma_srcs, len, src_cnt); - unmap_dst(dev->dev, dma_dsts, params->buf_size, - i); - thread_result_add(info, result, - DMATEST_ET_MAP_DST, - total_tests, src_off, dst_off, - len, ret); + dmaengine_unmap_put(um); + result("dst mapping error", total_tests, + src_off, dst_off, len, ret); failed_tests++; continue; } + um->bidi_cnt++; } if (thread->type == DMA_MEMCPY) tx = dev->device_prep_dma_memcpy(chan, - dma_dsts[0] + dst_off, - dma_srcs[0], len, - flags); + dsts[0] + dst_off, + srcs[0], len, flags); else if (thread->type == DMA_XOR) tx = dev->device_prep_dma_xor(chan, - dma_dsts[0] + dst_off, - dma_srcs, src_cnt, + dsts[0] + dst_off, + srcs, src_cnt, len, flags); else if (thread->type == DMA_PQ) { dma_addr_t dma_pq[dst_cnt]; for (i = 0; i < dst_cnt; i++) - dma_pq[i] = dma_dsts[i] + dst_off; - tx = 
dev->device_prep_dma_pq(chan, dma_pq, dma_srcs, + dma_pq[i] = dsts[i] + dst_off; + tx = dev->device_prep_dma_pq(chan, dma_pq, srcs, src_cnt, pq_coefs, len, flags); } if (!tx) { - unmap_src(dev->dev, dma_srcs, len, src_cnt); - unmap_dst(dev->dev, dma_dsts, params->buf_size, - dst_cnt); - thread_result_add(info, result, DMATEST_ET_PREP, - total_tests, src_off, dst_off, - len, 0); + dmaengine_unmap_put(um); + result("prep error", total_tests, src_off, + dst_off, len, ret); msleep(100); failed_tests++; continue; @@ -712,9 +610,9 @@ static int dmatest_func(void *data) cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { - thread_result_add(info, result, DMATEST_ET_SUBMIT, - total_tests, src_off, dst_off, - len, cookie); + dmaengine_unmap_put(um); + result("submit error", total_tests, src_off, + dst_off, len, ret); msleep(100); failed_tests++; continue; @@ -735,59 +633,59 @@ static int dmatest_func(void *data) * free it this time?" dancing. For now, just * leave it dangling. */ - thread_result_add(info, result, DMATEST_ET_TIMEOUT, - total_tests, src_off, dst_off, - len, 0); + dmaengine_unmap_put(um); + result("test timed out", total_tests, src_off, dst_off, + len, 0); failed_tests++; continue; - } else if (status != DMA_SUCCESS) { - enum dmatest_error_type type = (status == DMA_ERROR) ? - DMATEST_ET_DMA_ERROR : DMATEST_ET_DMA_IN_PROGRESS; - thread_result_add(info, result, type, - total_tests, src_off, dst_off, - len, status); + } else if (status != DMA_COMPLETE) { + dmaengine_unmap_put(um); + result(status == DMA_ERROR ? + "completion error status" : + "completion busy status", total_tests, src_off, + dst_off, len, ret); failed_tests++; continue; } - /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */ - unmap_dst(dev->dev, dma_dsts, params->buf_size, dst_cnt); + dmaengine_unmap_put(um); - error_count = 0; + if (params->noverify) { + verbose_result("test passed", total_tests, src_off, + dst_off, len, 0); + continue; + } - pr_debug("%s: verifying source buffer...\n", thread_name); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->srcs, -1, + pr_debug("%s: verifying source buffer...\n", current->comm); + error_count = dmatest_verify(thread->srcs, 0, src_off, 0, PATTERN_SRC, true); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->srcs, 0, - src_off, PATTERN_SRC | PATTERN_COPY, true); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->srcs, 1, - src_off + len, PATTERN_SRC, true); - - pr_debug("%s: verifying dest buffer...\n", thread_name); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->dsts, -1, + error_count += dmatest_verify(thread->srcs, src_off, + src_off + len, src_off, + PATTERN_SRC | PATTERN_COPY, true); + error_count += dmatest_verify(thread->srcs, src_off + len, + params->buf_size, src_off + len, + PATTERN_SRC, true); + + pr_debug("%s: verifying dest buffer...\n", current->comm); + error_count += dmatest_verify(thread->dsts, 0, dst_off, 0, PATTERN_DST, false); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->dsts, 0, - src_off, PATTERN_SRC | PATTERN_COPY, false); - error_count += verify_result_add(info, result, total_tests, - src_off, dst_off, len, thread->dsts, 1, - dst_off + len, PATTERN_DST, false); + error_count += dmatest_verify(thread->dsts, dst_off, + dst_off + len, src_off, + PATTERN_SRC | PATTERN_COPY, false); + error_count += 
dmatest_verify(thread->dsts, dst_off + len, + params->buf_size, dst_off + len, + PATTERN_DST, false); if (error_count) { - thread_result_add(info, result, DMATEST_ET_VERIFY, - total_tests, src_off, dst_off, - len, error_count); + result("data error", total_tests, src_off, dst_off, + len, error_count); failed_tests++; } else { - thread_result_add(info, result, DMATEST_ET_OK, - total_tests, src_off, dst_off, - len, 0); + verbose_result("test passed", total_tests, src_off, + dst_off, len, 0); } } + runtime = ktime_us_delta(ktime_get(), ktime); ret = 0; for (i = 0; thread->dsts[i]; i++) @@ -802,20 +700,17 @@ err_srcbuf: err_srcs: kfree(pq_coefs); err_thread_type: - pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", - thread_name, total_tests, failed_tests, ret); + pr_info("%s: summary %u tests, %u failures %llu iops %llu KB/s (%d)\n", + current->comm, total_tests, failed_tests, + dmatest_persec(runtime, total_tests), + dmatest_KBs(runtime, total_len), ret); /* terminate all transfers on specified channels */ if (ret) dmaengine_terminate_all(chan); thread->done = true; - - if (params->iterations > 0) - while (!kthread_should_stop()) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait_dmatest_exit); - interruptible_sleep_on(&wait_dmatest_exit); - } + wake_up(&thread_wait); return ret; } @@ -828,9 +723,10 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) list_for_each_entry_safe(thread, _thread, &dtc->threads, node) { ret = kthread_stop(thread->task); - pr_debug("dmatest: thread %s exited with status %d\n", - thread->task->comm, ret); + pr_debug("thread %s exited with status %d\n", + thread->task->comm, ret); list_del(&thread->node); + put_task_struct(thread->task); kfree(thread); } @@ -861,27 +757,27 @@ static int dmatest_add_threads(struct dmatest_info *info, for (i = 0; i < params->threads_per_chan; i++) { thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { - pr_warning("dmatest: No memory for %s-%s%u\n", - dma_chan_name(chan), op, i); - + pr_warn("No memory for %s-%s%u\n", + dma_chan_name(chan), op, i); break; } thread->info = info; thread->chan = dtc->chan; thread->type = type; smp_wmb(); - thread->task = kthread_run(dmatest_func, thread, "%s-%s%u", + thread->task = kthread_create(dmatest_func, thread, "%s-%s%u", dma_chan_name(chan), op, i); if (IS_ERR(thread->task)) { - pr_warning("dmatest: Failed to run thread %s-%s%u\n", - dma_chan_name(chan), op, i); + pr_warn("Failed to create thread %s-%s%u\n", + dma_chan_name(chan), op, i); kfree(thread); break; } /* srcbuf and dstbuf are allocated by the thread itself */ - + get_task_struct(thread->task); list_add_tail(&thread->node, &dtc->threads); + wake_up_process(thread->task); } return i; @@ -897,7 +793,7 @@ static int dmatest_add_channel(struct dmatest_info *info, dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); + pr_warn("No memory for %s\n", dma_chan_name(chan)); return -ENOMEM; } @@ -917,7 +813,7 @@ static int dmatest_add_channel(struct dmatest_info *info, thread_count += cnt > 0 ? 
cnt : 0; } - pr_info("dmatest: Started %u threads using %s\n", + pr_info("Started %u threads using %s\n", thread_count, dma_chan_name(chan)); list_add_tail(&dtc->node, &info->channels); @@ -937,20 +833,20 @@ static bool filter(struct dma_chan *chan, void *param) return true; } -static int __run_threaded_test(struct dmatest_info *info) +static void request_channels(struct dmatest_info *info, + enum dma_transaction_type type) { dma_cap_mask_t mask; - struct dma_chan *chan; - struct dmatest_params *params = &info->params; - int err = 0; dma_cap_zero(mask); - dma_cap_set(DMA_MEMCPY, mask); + dma_cap_set(type, mask); for (;;) { + struct dmatest_params *params = &info->params; + struct dma_chan *chan; + chan = dma_request_channel(mask, filter, params); if (chan) { - err = dmatest_add_channel(info, chan); - if (err) { + if (dmatest_add_channel(info, chan)) { dma_release_channel(chan); break; /* add_channel failed, punt */ } @@ -960,22 +856,30 @@ static int __run_threaded_test(struct dmatest_info *info) info->nr_channels >= params->max_channels) break; /* we have all we need */ } - return err; } -#ifndef MODULE -static int run_threaded_test(struct dmatest_info *info) +static void run_threaded_test(struct dmatest_info *info) { - int ret; + struct dmatest_params *params = &info->params; - mutex_lock(&info->lock); - ret = __run_threaded_test(info); - mutex_unlock(&info->lock); - return ret; + /* Copy test parameters */ + params->buf_size = test_buf_size; + strlcpy(params->channel, strim(test_channel), sizeof(params->channel)); + strlcpy(params->device, strim(test_device), sizeof(params->device)); + params->threads_per_chan = threads_per_chan; + params->max_channels = max_channels; + params->iterations = iterations; + params->xor_sources = xor_sources; + params->pq_sources = pq_sources; + params->timeout = timeout; + params->noverify = noverify; + + request_channels(info, DMA_MEMCPY); + request_channels(info, DMA_XOR); + request_channels(info, DMA_PQ); } -#endif -static void __stop_threaded_test(struct dmatest_info *info) +static void stop_threaded_test(struct dmatest_info *info) { struct dmatest_chan *dtc, *_dtc; struct dma_chan *chan; @@ -984,203 +888,86 @@ static void __stop_threaded_test(struct dmatest_info *info) list_del(&dtc->node); chan = dtc->chan; dmatest_cleanup_channel(dtc); - pr_debug("dmatest: dropped channel %s\n", dma_chan_name(chan)); + pr_debug("dropped channel %s\n", dma_chan_name(chan)); dma_release_channel(chan); } info->nr_channels = 0; } -static void stop_threaded_test(struct dmatest_info *info) +static void restart_threaded_test(struct dmatest_info *info, bool run) { - mutex_lock(&info->lock); - __stop_threaded_test(info); - mutex_unlock(&info->lock); -} - -static int __restart_threaded_test(struct dmatest_info *info, bool run) -{ - struct dmatest_params *params = &info->params; + /* we might be called early to set run=, defer running until all + * parameters have been evaluated + */ + if (!info->did_init) + return; /* Stop any running test first */ - __stop_threaded_test(info); - - if (run == false) - return 0; - - /* Clear results from previous run */ - result_free(info, NULL); - - /* Copy test parameters */ - params->buf_size = test_buf_size; - strlcpy(params->channel, strim(test_channel), sizeof(params->channel)); - strlcpy(params->device, strim(test_device), sizeof(params->device)); - params->threads_per_chan = threads_per_chan; - params->max_channels = max_channels; - params->iterations = iterations; - params->xor_sources = xor_sources; - params->pq_sources = pq_sources; 
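
run_threaded_test() now snapshots the module parameters once and then asks for
channels of each capability in turn (MEMCPY, XOR, PQ). The allocation idiom it
uses is the standard dmaengine client pattern: build a capability mask, then
call dma_request_channel() in a loop until the filter stops matching. A
minimal sketch (the filter here accepts everything; real code would match on
channel or device name, as dmatest's filter does):

    #include <linux/dmaengine.h>

    static bool any_chan(struct dma_chan *chan, void *param)
    {
            return true;
    }

    static void grab_memcpy_channels(void)
    {
            dma_cap_mask_t mask;
            struct dma_chan *chan;

            dma_cap_zero(mask);
            dma_cap_set(DMA_MEMCPY, mask);

            for (;;) {
                    chan = dma_request_channel(mask, any_chan, NULL);
                    if (!chan)
                            break;  /* no more matching channels */
                    /* ... spawn workers on chan; hand it back with
                     * dma_release_channel(chan) when finished ... */
            }
    }
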
- params->timeout = timeout; + stop_threaded_test(info); /* Run test with new parameters */ - return __run_threaded_test(info); -} - -static bool __is_threaded_test_run(struct dmatest_info *info) -{ - struct dmatest_chan *dtc; - - list_for_each_entry(dtc, &info->channels, node) { - struct dmatest_thread *thread; - - list_for_each_entry(thread, &dtc->threads, node) { - if (!thread->done) - return true; - } - } - - return false; + run_threaded_test(info); } -static ssize_t dtf_read_run(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static int dmatest_run_get(char *val, const struct kernel_param *kp) { - struct dmatest_info *info = file->private_data; - char buf[3]; + struct dmatest_info *info = &test_info; mutex_lock(&info->lock); - - if (__is_threaded_test_run(info)) { - buf[0] = 'Y'; + if (is_threaded_test_run(info)) { + dmatest_run = true; } else { - __stop_threaded_test(info); - buf[0] = 'N'; + stop_threaded_test(info); + dmatest_run = false; } - mutex_unlock(&info->lock); - buf[1] = '\n'; - buf[2] = 0x00; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t dtf_write_run(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct dmatest_info *info = file->private_data; - char buf[16]; - bool bv; - int ret = 0; - if (copy_from_user(buf, user_buf, min(count, (sizeof(buf) - 1)))) - return -EFAULT; - - if (strtobool(buf, &bv) == 0) { - mutex_lock(&info->lock); - - if (__is_threaded_test_run(info)) - ret = -EBUSY; - else - ret = __restart_threaded_test(info, bv); - - mutex_unlock(&info->lock); - } - - return ret ? ret : count; + return param_get_bool(val, kp); } -static const struct file_operations dtf_run_fops = { - .read = dtf_read_run, - .write = dtf_write_run, - .open = simple_open, - .llseek = default_llseek, -}; - -static int dtf_results_show(struct seq_file *sf, void *data) +static int dmatest_run_set(const char *val, const struct kernel_param *kp) { - struct dmatest_info *info = sf->private; - struct dmatest_result *result; - struct dmatest_thread_result *tr; - unsigned int i; + struct dmatest_info *info = &test_info; + int ret; - mutex_lock(&info->results_lock); - list_for_each_entry(result, &info->results, node) { - list_for_each_entry(tr, &result->results, node) { - seq_printf(sf, "%s\n", - thread_result_get(result->name, tr)); - if (tr->type == DMATEST_ET_VERIFY_BUF) { - for (i = 0; i < tr->vr->error_count; i++) { - seq_printf(sf, "\t%s\n", - verify_result_get_one(tr->vr, i)); - } - } - } + mutex_lock(&info->lock); + ret = param_set_bool(val, kp); + if (ret) { + mutex_unlock(&info->lock); + return ret; } - mutex_unlock(&info->results_lock); - return 0; -} - -static int dtf_results_open(struct inode *inode, struct file *file) -{ - return single_open(file, dtf_results_show, inode->i_private); -} - -static const struct file_operations dtf_results_fops = { - .open = dtf_results_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int dmatest_register_dbgfs(struct dmatest_info *info) -{ - struct dentry *d; - - d = debugfs_create_dir("dmatest", NULL); - if (IS_ERR(d)) - return PTR_ERR(d); - if (!d) - goto err_root; + if (is_threaded_test_run(info)) + ret = -EBUSY; + else if (dmatest_run) + restart_threaded_test(info, dmatest_run); - info->root = d; - - /* Run or stop threaded test */ - debugfs_create_file("run", S_IWUSR | S_IRUGO, info->root, info, - &dtf_run_fops); - - /* Results of test in progress */ - debugfs_create_file("results", S_IRUGO, info->root, 
info, - &dtf_results_fops); - - return 0; + mutex_unlock(&info->lock); -err_root: - pr_err("dmatest: Failed to initialize debugfs\n"); - return -ENOMEM; + return ret; } static int __init dmatest_init(void) { struct dmatest_info *info = &test_info; - int ret; - - memset(info, 0, sizeof(*info)); + struct dmatest_params *params = &info->params; - mutex_init(&info->lock); - INIT_LIST_HEAD(&info->channels); + if (dmatest_run) { + mutex_lock(&info->lock); + run_threaded_test(info); + mutex_unlock(&info->lock); + } - mutex_init(&info->results_lock); - INIT_LIST_HEAD(&info->results); + if (params->iterations && wait) + wait_event(thread_wait, !is_threaded_test_run(info)); - ret = dmatest_register_dbgfs(info); - if (ret) - return ret; + /* module parameters are stable, init-time tests are started, + * let userspace take over 'run' control + */ + info->did_init = true; -#ifdef MODULE return 0; -#else - return run_threaded_test(info); -#endif } /* when compiled-in wait for drivers to load first */ late_initcall(dmatest_init); @@ -1189,9 +976,9 @@ static void __exit dmatest_exit(void) { struct dmatest_info *info = &test_info; - debugfs_remove_recursive(info->root); + mutex_lock(&info->lock); stop_threaded_test(info); - result_free(info, NULL); + mutex_unlock(&info->lock); } module_exit(dmatest_exit); diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 89eb89f2228..7516be4677c 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -85,10 +85,6 @@ static struct device *chan2dev(struct dma_chan *chan) { return &chan->dev->device; } -static struct device *chan2parent(struct dma_chan *chan) -{ - return chan->dev->device.parent; -} static struct dw_desc *dwc_first_active(struct dw_dma_chan *dwc) { @@ -311,26 +307,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc, list_splice_init(&desc->tx_list, &dwc->free_list); list_move(&desc->desc_node, &dwc->free_list); - if (!is_slave_direction(dwc->direction)) { - struct device *parent = chan2parent(&dwc->chan); - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(parent, desc->lli.dar, - desc->total_len, DMA_FROM_DEVICE); - else - dma_unmap_page(parent, desc->lli.dar, - desc->total_len, DMA_FROM_DEVICE); - } - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(parent, desc->lli.sar, - desc->total_len, DMA_TO_DEVICE); - else - dma_unmap_page(parent, desc->lli.sar, - desc->total_len, DMA_TO_DEVICE); - } - } - + dma_descriptor_unmap(txd); spin_unlock_irqrestore(&dwc->lock, flags); if (callback) @@ -1098,13 +1075,13 @@ dwc_tx_status(struct dma_chan *chan, enum dma_status ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; dwc_scan_descriptors(to_dw_dma(chan->device), dwc); ret = dma_cookie_status(chan, cookie, txstate); - if (ret != DMA_SUCCESS) + if (ret != DMA_COMPLETE) dma_set_residue(txstate, dwc_get_residue(dwc)); if (dwc->paused && ret == DMA_IN_PROGRESS) diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index bef8a368c8d..2539ea0cbc6 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -46,14 +46,21 @@ #define EDMA_CHANS 64 #endif /* CONFIG_ARCH_DAVINCI_DA8XX */ -/* Max of 16 segments per channel to conserve PaRAM slots */ -#define MAX_NR_SG 16 +/* + * Max of 20 segments per channel to conserve PaRAM slots + * Also note that MAX_NR_SG should be at least the number of periods + * that are required for ASoC, otherwise DMA 
prep calls will + fail. Today davinci-pcm is the only user of this driver and + requires at least 17 slots, so we set the default to 20. + */ +#define MAX_NR_SG 20 #define EDMA_MAX_SLOTS MAX_NR_SG #define EDMA_DESCRIPTORS 16 struct edma_desc { struct virt_dma_desc vdesc; struct list_head node; + int cyclic; int absync; int pset_nr; int processed; @@ -167,8 +174,13 @@ static void edma_execute(struct edma_chan *echan) * then setup a link to the dummy slot, this results in all future * events being absorbed and that's OK because we're done */ - if (edesc->processed == edesc->pset_nr) - edma_link(echan->slot[nslots-1], echan->ecc->dummy_slot); + if (edesc->processed == edesc->pset_nr) { + if (edesc->cyclic) + edma_link(echan->slot[nslots-1], echan->slot[1]); + else + edma_link(echan->slot[nslots-1], + echan->ecc->dummy_slot); + } edma_resume(echan->ch_num); @@ -250,6 +262,117 @@ static int edma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, return ret; } +/* + * A PaRAM set configuration abstraction used by other modes + * @chan: Channel whose PaRAM set we're configuring + * @pset: PaRAM set to initialize and set up + * @src_addr: Source address of the DMA + * @dst_addr: Destination address of the DMA + * @burst: Burst size, in units of dev_width + * @dev_width: Device (FIFO) bus width, in bytes + * @dma_length: Total length of the DMA transfer + * @direction: Direction of the transfer + */ +static int edma_config_pset(struct dma_chan *chan, struct edmacc_param *pset, + dma_addr_t src_addr, dma_addr_t dst_addr, u32 burst, + enum dma_slave_buswidth dev_width, unsigned int dma_length, + enum dma_transfer_direction direction) +{ + struct edma_chan *echan = to_edma_chan(chan); + struct device *dev = chan->device->dev; + int acnt, bcnt, ccnt, cidx; + int src_bidx, dst_bidx, src_cidx, dst_cidx; + int absync; + + acnt = dev_width; + /* + * If the maxburst is equal to the fifo width, use + * A-synced transfers. This allows for large contiguous + * buffer transfers using only one PaRAM set. + */ + if (burst == 1) { + /* + * For the A-sync case, bcnt and ccnt are the remainder + * and quotient respectively of the division of + * (dma_length / acnt) by (SZ_64K - 1). This is so + * that in case bcnt overflows, we have ccnt to use. + * Note: In A-sync transfers only, bcntrld is used, but it + * only applies for sg_dma_len(sg) >= SZ_64K. + * In this case, the approach adopted is: bcnt for the + * first frame will be the remainder below. Then for + * every successive frame, bcnt will be SZ_64K-1. This + * is assured as bcntrld = 0xffff at the end of this function. + */ + absync = false; + ccnt = dma_length / acnt / (SZ_64K - 1); + bcnt = dma_length / acnt - ccnt * (SZ_64K - 1); + /* + * If bcnt is non-zero, we have a remainder and hence an + * extra frame to transfer, so increment ccnt. + */ + if (bcnt) + ccnt++; + else + bcnt = SZ_64K - 1; + cidx = acnt; + } else { + /* + * If maxburst is greater than the fifo address_width, + * use AB-synced transfers where A count is the fifo + * address_width and B count is the maxburst. In this + * case, we are limited to transfers of C count frames + * of (address_width * maxburst) where C count is limited + * to SZ_64K-1. This places an upper bound on the length + * of an SG segment that can be handled. 
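A worked example of the A-sync split described in the comment above, using assumed values (4-byte FIFO width, 300000-byte buffer; not taken from the patch). Here dma_length / acnt = 75000 words, which the quotient/remainder step breaks into a short first frame plus one full SZ_64K - 1 frame:

        /* Sketch only -- the numeric values are assumptions for illustration. */
        unsigned int acnt = 4, dma_length = 300000;
        unsigned int ccnt = dma_length / acnt / (SZ_64K - 1);        /* 75000 / 65535 = 1 */
        unsigned int bcnt = dma_length / acnt - ccnt * (SZ_64K - 1); /* 75000 - 65535 = 9465 */
        if (bcnt)
                ccnt++;                 /* remainder frame: ccnt becomes 2 */
        else
                bcnt = SZ_64K - 1;
        /* Frame 0 carries bcnt = 9465 words; bcntrld = 0xffff then reloads
         * SZ_64K - 1 for each following frame, totalling 75000 words. */
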
+ */ + absync = true; + bcnt = burst; + ccnt = dma_length / (acnt * bcnt); + if (ccnt > (SZ_64K - 1)) { + dev_err(dev, "Exceeded max SG segment size\n"); + return -EINVAL; + } + cidx = acnt * bcnt; + } + + if (direction == DMA_MEM_TO_DEV) { + src_bidx = acnt; + src_cidx = cidx; + dst_bidx = 0; + dst_cidx = 0; + } else if (direction == DMA_DEV_TO_MEM) { + src_bidx = 0; + src_cidx = 0; + dst_bidx = acnt; + dst_cidx = cidx; + } else { + dev_err(dev, "%s: direction not implemented yet\n", __func__); + return -EINVAL; + } + + pset->opt = EDMA_TCC(EDMA_CHAN_SLOT(echan->ch_num)); + /* Configure A or AB synchronized transfers */ + if (absync) + pset->opt |= SYNCDIM; + + pset->src = src_addr; + pset->dst = dst_addr; + + pset->src_dst_bidx = (dst_bidx << 16) | src_bidx; + pset->src_dst_cidx = (dst_cidx << 16) | src_cidx; + + pset->a_b_cnt = bcnt << 16 | acnt; + pset->ccnt = ccnt; + /* + * Only time when (bcntrld) auto reload is required is for + * A-sync case, and in this case, a requirement of reload value + * of SZ_64K-1 only is assured. 'link' is initially set to NULL + * and then later will be populated by edma_execute. + */ + pset->link_bcntrld = 0xffffffff; + return absync; +} + static struct dma_async_tx_descriptor *edma_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction direction, @@ -258,23 +381,21 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg( struct edma_chan *echan = to_edma_chan(chan); struct device *dev = chan->device->dev; struct edma_desc *edesc; - dma_addr_t dev_addr; + dma_addr_t src_addr = 0, dst_addr = 0; enum dma_slave_buswidth dev_width; u32 burst; struct scatterlist *sg; - int acnt, bcnt, ccnt, src, dst, cidx; - int src_bidx, dst_bidx, src_cidx, dst_cidx; - int i, nslots; + int i, nslots, ret; if (unlikely(!echan || !sgl || !sg_len)) return NULL; if (direction == DMA_DEV_TO_MEM) { - dev_addr = echan->cfg.src_addr; + src_addr = echan->cfg.src_addr; dev_width = echan->cfg.src_addr_width; burst = echan->cfg.src_maxburst; } else if (direction == DMA_MEM_TO_DEV) { - dev_addr = echan->cfg.dst_addr; + dst_addr = echan->cfg.dst_addr; dev_width = echan->cfg.dst_addr_width; burst = echan->cfg.dst_maxburst; } else { @@ -307,7 +428,6 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg( if (echan->slot[i] < 0) { kfree(edesc); dev_err(dev, "Failed to allocate slot\n"); - kfree(edesc); return NULL; } } @@ -315,64 +435,21 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg( /* Configure PaRAM sets for each SG */ for_each_sg(sgl, sg, sg_len, i) { - - acnt = dev_width; - - /* - * If the maxburst is equal to the fifo width, use - * A-synced transfers. This allows for large contiguous - * buffer transfers using only one PaRAM set. - */ - if (burst == 1) { - edesc->absync = false; - ccnt = sg_dma_len(sg) / acnt / (SZ_64K - 1); - bcnt = sg_dma_len(sg) / acnt - ccnt * (SZ_64K - 1); - if (bcnt) - ccnt++; - else - bcnt = SZ_64K - 1; - cidx = acnt; - /* - * If maxburst is greater than the fifo address_width, - * use AB-synced transfers where A count is the fifo - * address_width and B count is the maxburst. In this - * case, we are limited to transfers of C count frames - * of (address_width * maxburst) where C count is limited - * to SZ_64K-1. This places an upper bound on the length - * of an SG segment that can be handled. 
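For context, a slave-SG client reaches this prep callback through the generic dmaengine wrappers. A minimal sketch, assuming a memory-to-device channel; fifo_phys, sgl and sg_len are hypothetical values, not part of this patch:

        struct dma_slave_config cfg = {
                .direction = DMA_MEM_TO_DEV,
                .dst_addr = fifo_phys,          /* assumed device FIFO address */
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
                .dst_maxburst = 8,              /* burst > 1 selects AB-synced psets */
        };
        struct dma_async_tx_descriptor *txd;

        dmaengine_slave_config(chan, &cfg);
        txd = dmaengine_prep_slave_sg(chan, sgl, sg_len, DMA_MEM_TO_DEV,
                                      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (txd) {
                dmaengine_submit(txd);
                dma_async_issue_pending(chan);
        }
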
- */ - } else { - edesc->absync = true; - bcnt = burst; - ccnt = sg_dma_len(sg) / (acnt * bcnt); - if (ccnt > (SZ_64K - 1)) { - dev_err(dev, "Exceeded max SG segment size\n"); - kfree(edesc); - return NULL; - } - cidx = acnt * bcnt; + /* Get address for each SG */ + if (direction == DMA_DEV_TO_MEM) + dst_addr = sg_dma_address(sg); + else + src_addr = sg_dma_address(sg); + + ret = edma_config_pset(chan, &edesc->pset[i], src_addr, + dst_addr, burst, dev_width, + sg_dma_len(sg), direction); + if (ret < 0) { + kfree(edesc); + return NULL; } - if (direction == DMA_MEM_TO_DEV) { - src = sg_dma_address(sg); - dst = dev_addr; - src_bidx = acnt; - src_cidx = cidx; - dst_bidx = 0; - dst_cidx = 0; - } else { - src = dev_addr; - dst = sg_dma_address(sg); - src_bidx = 0; - src_cidx = 0; - dst_bidx = acnt; - dst_cidx = cidx; - } - - edesc->pset[i].opt = EDMA_TCC(EDMA_CHAN_SLOT(echan->ch_num)); - /* Configure A or AB synchronized transfers */ - if (edesc->absync) - edesc->pset[i].opt |= SYNCDIM; + edesc->absync = ret; /* If this is the last in a current SG set of transactions, enable interrupts so that next set is processed */ @@ -382,17 +459,138 @@ static struct dma_async_tx_descriptor *edma_prep_slave_sg( /* If this is the last set, enable completion interrupt flag */ if (i == sg_len - 1) edesc->pset[i].opt |= TCINTEN; + } - edesc->pset[i].src = src; - edesc->pset[i].dst = dst; + return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags); +} - edesc->pset[i].src_dst_bidx = (dst_bidx << 16) | src_bidx; - edesc->pset[i].src_dst_cidx = (dst_cidx << 16) | src_cidx; +static struct dma_async_tx_descriptor *edma_prep_dma_cyclic( + struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, + size_t period_len, enum dma_transfer_direction direction, + unsigned long tx_flags, void *context) +{ + struct edma_chan *echan = to_edma_chan(chan); + struct device *dev = chan->device->dev; + struct edma_desc *edesc; + dma_addr_t src_addr, dst_addr; + enum dma_slave_buswidth dev_width; + u32 burst; + int i, ret, nslots; + + if (unlikely(!echan || !buf_len || !period_len)) + return NULL; + + if (direction == DMA_DEV_TO_MEM) { + src_addr = echan->cfg.src_addr; + dst_addr = buf_addr; + dev_width = echan->cfg.src_addr_width; + burst = echan->cfg.src_maxburst; + } else if (direction == DMA_MEM_TO_DEV) { + src_addr = buf_addr; + dst_addr = echan->cfg.dst_addr; + dev_width = echan->cfg.dst_addr_width; + burst = echan->cfg.dst_maxburst; + } else { + dev_err(dev, "%s: bad direction?\n", __func__); + return NULL; + } + + if (dev_width == DMA_SLAVE_BUSWIDTH_UNDEFINED) { + dev_err(dev, "Undefined slave buswidth\n"); + return NULL; + } + + if (unlikely(buf_len % period_len)) { + dev_err(dev, "Period should be multiple of Buffer length\n"); + return NULL; + } + + nslots = (buf_len / period_len) + 1; + + /* + * Cyclic DMA users such as audio cannot tolerate delays introduced + * by cases where the number of periods is more than the maximum + * number of SGs the EDMA driver can handle at a time. For DMA types + * such as Slave SGs, such delays are tolerable and synchronized, + * but the synchronization is difficult to achieve with Cyclic and + * cannot be guaranteed, so we error out early. 
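Note the two constraints the code above enforces: buf_len must be an exact multiple of period_len, and the period count plus the extra linking slot must fit within MAX_NR_SG. A cyclic client would then look roughly like this sketch (period_elapsed, pcm and the sizes are assumed names, not from the patch):

        struct dma_async_tx_descriptor *txd;

        /* Four periods; buf_len is an exact multiple of period_len. */
        txd = dmaengine_prep_dma_cyclic(chan, buf_dma, 4 * period_bytes,
                                        period_bytes, DMA_MEM_TO_DEV,
                                        DMA_PREP_INTERRUPT);
        if (txd) {
                txd->callback = period_elapsed; /* invoked once per period */
                txd->callback_param = pcm;
                dmaengine_submit(txd);
                dma_async_issue_pending(chan);  /* loops until terminated */
        }
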
+ */ + if (nslots > MAX_NR_SG) + return NULL; + + edesc = kzalloc(sizeof(*edesc) + nslots * + sizeof(edesc->pset[0]), GFP_ATOMIC); + if (!edesc) { + dev_dbg(dev, "Failed to allocate a descriptor\n"); + return NULL; + } + + edesc->cyclic = 1; + edesc->pset_nr = nslots; + + dev_dbg(dev, "%s: nslots=%d\n", __func__, nslots); + dev_dbg(dev, "%s: period_len=%d\n", __func__, period_len); + dev_dbg(dev, "%s: buf_len=%d\n", __func__, buf_len); + + for (i = 0; i < nslots; i++) { + /* Allocate a PaRAM slot, if needed */ + if (echan->slot[i] < 0) { + echan->slot[i] = + edma_alloc_slot(EDMA_CTLR(echan->ch_num), + EDMA_SLOT_ANY); + if (echan->slot[i] < 0) { + dev_err(dev, "Failed to allocate slot\n"); + return NULL; + } + } + + if (i == nslots - 1) { + memcpy(&edesc->pset[i], &edesc->pset[0], + sizeof(edesc->pset[0])); + break; + } + + ret = edma_config_pset(chan, &edesc->pset[i], src_addr, + dst_addr, burst, dev_width, period_len, + direction); + if (ret < 0) + return NULL; - edesc->pset[i].a_b_cnt = bcnt << 16 | acnt; - edesc->pset[i].ccnt = ccnt; - edesc->pset[i].link_bcntrld = 0xffffffff; + if (direction == DMA_DEV_TO_MEM) + dst_addr += period_len; + else + src_addr += period_len; + dev_dbg(dev, "%s: Configure period %d of buf:\n", __func__, i); + dev_dbg(dev, + "\n pset[%d]:\n" + " chnum\t%d\n" + " slot\t%d\n" + " opt\t%08x\n" + " src\t%08x\n" + " dst\t%08x\n" + " abcnt\t%08x\n" + " ccnt\t%08x\n" + " bidx\t%08x\n" + " cidx\t%08x\n" + " lkrld\t%08x\n", + i, echan->ch_num, echan->slot[i], + edesc->pset[i].opt, + edesc->pset[i].src, + edesc->pset[i].dst, + edesc->pset[i].a_b_cnt, + edesc->pset[i].ccnt, + edesc->pset[i].src_dst_bidx, + edesc->pset[i].src_dst_cidx, + edesc->pset[i].link_bcntrld); + + edesc->absync = ret; + + /* + * Enable interrupts for every period because callback + * has to be called for every period. 
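Since TCINTEN is set on every pset, each period raises a completion interrupt; under the virt-dma helpers the handler only has to forward the descriptor, as in this sketch (the demo_* names are assumptions):

        struct demo_desc {
                struct virt_dma_desc vdesc;
                bool cyclic;
        };

        /* Called from the controller's per-period completion interrupt. */
        static void demo_period_complete(struct demo_desc *d)
        {
                if (d->cyclic)
                        vchan_cyclic_callback(&d->vdesc); /* one client callback per period */
                else
                        vchan_cookie_complete(&d->vdesc); /* terminal completion */
        }
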
+ */ + edesc->pset[i].opt |= TCINTEN; } return vchan_tx_prep(&echan->vchan, &edesc->vdesc, tx_flags); @@ -406,30 +604,34 @@ static void edma_callback(unsigned ch_num, u16 ch_status, void *data) unsigned long flags; struct edmacc_param p; - /* Pause the channel */ - edma_pause(echan->ch_num); + edesc = echan->edesc; + + /* Pause the channel for non-cyclic */ + if (!edesc || (edesc && !edesc->cyclic)) + edma_pause(echan->ch_num); switch (ch_status) { - case DMA_COMPLETE: + case EDMA_DMA_COMPLETE: spin_lock_irqsave(&echan->vchan.lock, flags); - edesc = echan->edesc; if (edesc) { - if (edesc->processed == edesc->pset_nr) { + if (edesc->cyclic) { + vchan_cyclic_callback(&edesc->vdesc); + } else if (edesc->processed == edesc->pset_nr) { dev_dbg(dev, "Transfer complete, stopping channel %d\n", ch_num); edma_stop(echan->ch_num); vchan_cookie_complete(&edesc->vdesc); + edma_execute(echan); } else { dev_dbg(dev, "Intermediate transfer complete on channel %d\n", ch_num); + edma_execute(echan); } - - edma_execute(echan); } spin_unlock_irqrestore(&echan->vchan.lock, flags); break; - case DMA_CC_ERROR: + case EDMA_DMA_CC_ERROR: spin_lock_irqsave(&echan->vchan.lock, flags); edma_read_slot(EDMA_CHAN_SLOT(echan->slot[0]), &p); @@ -579,7 +781,7 @@ static enum dma_status edma_tx_status(struct dma_chan *chan, unsigned long flags; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS || !txstate) + if (ret == DMA_COMPLETE || !txstate) return ret; spin_lock_irqsave(&echan->vchan.lock, flags); @@ -619,6 +821,7 @@ static void edma_dma_init(struct edma_cc *ecc, struct dma_device *dma, struct device *dev) { dma->device_prep_slave_sg = edma_prep_slave_sg; + dma->device_prep_dma_cyclic = edma_prep_dma_cyclic; dma->device_alloc_chan_resources = edma_alloc_chan_resources; dma->device_free_chan_resources = edma_free_chan_resources; dma->device_issue_pending = edma_issue_pending; diff --git a/drivers/dma/ep93xx_dma.c b/drivers/dma/ep93xx_dma.c index 591cd8c63ab..cb4bf682a70 100644 --- a/drivers/dma/ep93xx_dma.c +++ b/drivers/dma/ep93xx_dma.c @@ -733,28 +733,6 @@ static void ep93xx_dma_advance_work(struct ep93xx_dma_chan *edmac) spin_unlock_irqrestore(&edmac->lock, flags); } -static void ep93xx_dma_unmap_buffers(struct ep93xx_dma_desc *desc) -{ - struct device *dev = desc->txd.chan->device->dev; - - if (!(desc->txd.flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (desc->txd.flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(dev, desc->src_addr, desc->size, - DMA_TO_DEVICE); - else - dma_unmap_page(dev, desc->src_addr, desc->size, - DMA_TO_DEVICE); - } - if (!(desc->txd.flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (desc->txd.flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(dev, desc->dst_addr, desc->size, - DMA_FROM_DEVICE); - else - dma_unmap_page(dev, desc->dst_addr, desc->size, - DMA_FROM_DEVICE); - } -} - static void ep93xx_dma_tasklet(unsigned long data) { struct ep93xx_dma_chan *edmac = (struct ep93xx_dma_chan *)data; @@ -787,13 +765,7 @@ static void ep93xx_dma_tasklet(unsigned long data) /* Now we can release all the chained descriptors */ list_for_each_entry_safe(desc, d, &list, node) { - /* - * For the memcpy channels the API requires us to unmap the - * buffers unless requested otherwise. 
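The ep93xx hunk above shows the pattern this series applies driver by driver: the hand-rolled dma_unmap_single()/dma_unmap_page() branches keyed off the DMA_COMPL_* flags collapse into a single dma_descriptor_unmap() call, since the core now carries the mapping state on the descriptor itself. A sketch of the resulting completion path (demo name assumed; the exact ordering varies per driver):

        static void demo_complete_descriptor(struct dma_async_tx_descriptor *txd)
        {
                dma_cookie_complete(txd);       /* mark the cookie done */
                dma_descriptor_unmap(txd);      /* drop core-owned unmap data, if any */
                if (txd->callback)
                        txd->callback(txd->callback_param);
                dma_run_dependencies(txd);      /* start dependent operations */
        }
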
- */ - if (!edmac->chan.private) - ep93xx_dma_unmap_buffers(desc); - + dma_descriptor_unmap(&desc->txd); ep93xx_dma_desc_put(edmac, desc); } diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 61517dd0d0b..7086a16a55f 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -870,22 +870,7 @@ static void fsldma_cleanup_descriptor(struct fsldma_chan *chan, /* Run any dependencies */ dma_run_dependencies(txd); - /* Unmap the dst buffer, if requested */ - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(dev, dst, len, DMA_FROM_DEVICE); - else - dma_unmap_page(dev, dst, len, DMA_FROM_DEVICE); - } - - /* Unmap the src buffer, if requested */ - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(dev, src, len, DMA_TO_DEVICE); - else - dma_unmap_page(dev, src, len, DMA_TO_DEVICE); - } - + dma_descriptor_unmap(txd); #ifdef FSL_DMA_LD_DEBUG chan_dbg(chan, "LD %p free\n", desc); #endif @@ -1255,7 +1240,9 @@ static int fsl_dma_chan_probe(struct fsldma_device *fdev, WARN_ON(fdev->feature != chan->feature); chan->dev = fdev->dev; - chan->id = ((res.start - 0x100) & 0xfff) >> 7; + chan->id = (res.start & 0xfff) < 0x300 ? + ((res.start - 0x100) & 0xfff) >> 7 : + ((res.start - 0x200) & 0xfff) >> 7; if (chan->id >= FSL_DMA_MAX_CHANS_PER_DEVICE) { dev_err(fdev->dev, "too many channels for device\n"); err = -EINVAL; @@ -1428,6 +1415,7 @@ static int fsldma_of_remove(struct platform_device *op) } static const struct of_device_id fsldma_of_ids[] = { + { .compatible = "fsl,elo3-dma", }, { .compatible = "fsl,eloplus-dma", }, { .compatible = "fsl,elo-dma", }, {} @@ -1449,7 +1437,7 @@ static struct platform_driver fsldma_of_driver = { static __init int fsldma_init(void) { - pr_info("Freescale Elo / Elo Plus DMA driver\n"); + pr_info("Freescale Elo series DMA driver\n"); return platform_driver_register(&fsldma_of_driver); } @@ -1461,5 +1449,5 @@ static void __exit fsldma_exit(void) subsys_initcall(fsldma_init); module_exit(fsldma_exit); -MODULE_DESCRIPTION("Freescale Elo / Elo Plus DMA driver"); +MODULE_DESCRIPTION("Freescale Elo series DMA driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index f5c38791fc7..1ffc24484d2 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -112,7 +112,7 @@ struct fsldma_chan_regs { }; struct fsldma_chan; -#define FSL_DMA_MAX_CHANS_PER_DEVICE 4 +#define FSL_DMA_MAX_CHANS_PER_DEVICE 8 struct fsldma_device { void __iomem *regs; /* DGSR register base */ diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index 55852c02679..6f9ac2022ab 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -572,9 +572,11 @@ static int imxdma_xfer_desc(struct imxdma_desc *d) imx_dmav1_writel(imxdma, d->len, DMA_CNTR(imxdmac->channel)); - dev_dbg(imxdma->dev, "%s channel: %d dest=0x%08x src=0x%08x " - "dma_length=%d\n", __func__, imxdmac->channel, - d->dest, d->src, d->len); + dev_dbg(imxdma->dev, + "%s channel: %d dest=0x%08llx src=0x%08llx dma_length=%zu\n", + __func__, imxdmac->channel, + (unsigned long long)d->dest, + (unsigned long long)d->src, d->len); break; /* Cyclic transfer is the same as slave_sg with special sg configuration. 
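The DMA_SUCCESS to DMA_COMPLETE rename recurs through the remaining drivers in this series; from the client side, polling a cookie now reads as in this sketch:

        enum dma_status status;

        status = dmaengine_tx_status(chan, cookie, NULL);
        if (status == DMA_COMPLETE)             /* formerly DMA_SUCCESS */
                dev_dbg(dev, "transfer finished\n");
        else if (status == DMA_ERROR)
                dev_err(dev, "transfer failed\n");
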
*/ @@ -586,20 +588,22 @@ static int imxdma_xfer_desc(struct imxdma_desc *d) imx_dmav1_writel(imxdma, imxdmac->ccr_from_device, DMA_CCR(imxdmac->channel)); - dev_dbg(imxdma->dev, "%s channel: %d sg=%p sgcount=%d " - "total length=%d dev_addr=0x%08x (dev2mem)\n", - __func__, imxdmac->channel, d->sg, d->sgcount, - d->len, imxdmac->per_address); + dev_dbg(imxdma->dev, + "%s channel: %d sg=%p sgcount=%d total length=%zu dev_addr=0x%08llx (dev2mem)\n", + __func__, imxdmac->channel, + d->sg, d->sgcount, d->len, + (unsigned long long)imxdmac->per_address); } else if (d->direction == DMA_MEM_TO_DEV) { imx_dmav1_writel(imxdma, imxdmac->per_address, DMA_DAR(imxdmac->channel)); imx_dmav1_writel(imxdma, imxdmac->ccr_to_device, DMA_CCR(imxdmac->channel)); - dev_dbg(imxdma->dev, "%s channel: %d sg=%p sgcount=%d " - "total length=%d dev_addr=0x%08x (mem2dev)\n", - __func__, imxdmac->channel, d->sg, d->sgcount, - d->len, imxdmac->per_address); + dev_dbg(imxdma->dev, + "%s channel: %d sg=%p sgcount=%d total length=%zu dev_addr=0x%08llx (mem2dev)\n", + __func__, imxdmac->channel, + d->sg, d->sgcount, d->len, + (unsigned long long)imxdmac->per_address); } else { dev_err(imxdma->dev, "%s channel: %d bad dma mode\n", __func__, imxdmac->channel); @@ -771,7 +775,7 @@ static int imxdma_alloc_chan_resources(struct dma_chan *chan) desc->desc.tx_submit = imxdma_tx_submit; /* txd.flags will be overwritten in prep funcs */ desc->desc.flags = DMA_CTRL_ACK; - desc->status = DMA_SUCCESS; + desc->status = DMA_COMPLETE; list_add_tail(&desc->node, &imxdmac->ld_free); imxdmac->descs_allocated++; @@ -870,7 +874,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic( int i; unsigned int periods = buf_len / period_len; - dev_dbg(imxdma->dev, "%s channel: %d buf_len=%d period_len=%d\n", + dev_dbg(imxdma->dev, "%s channel: %d buf_len=%zu period_len=%zu\n", __func__, imxdmac->channel, buf_len, period_len); if (list_empty(&imxdmac->ld_free) || @@ -926,8 +930,9 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_memcpy( struct imxdma_engine *imxdma = imxdmac->imxdma; struct imxdma_desc *desc; - dev_dbg(imxdma->dev, "%s channel: %d src=0x%x dst=0x%x len=%d\n", - __func__, imxdmac->channel, src, dest, len); + dev_dbg(imxdma->dev, "%s channel: %d src=0x%llx dst=0x%llx len=%zu\n", + __func__, imxdmac->channel, (unsigned long long)src, + (unsigned long long)dest, len); if (list_empty(&imxdmac->ld_free) || imxdma_chan_is_doing_cyclic(imxdmac)) @@ -956,9 +961,10 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_interleaved( struct imxdma_engine *imxdma = imxdmac->imxdma; struct imxdma_desc *desc; - dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%x dst_start=0x%x\n" - " src_sgl=%s dst_sgl=%s numf=%d frame_size=%d\n", __func__, - imxdmac->channel, xt->src_start, xt->dst_start, + dev_dbg(imxdma->dev, "%s channel: %d src_start=0x%llx dst_start=0x%llx\n" + " src_sgl=%s dst_sgl=%s numf=%zu frame_size=%zu\n", __func__, + imxdmac->channel, (unsigned long long)xt->src_start, + (unsigned long long) xt->dst_start, xt->src_sgl ? "true" : "false", xt->dst_sgl ? 
"true" : "false", xt->numf, xt->frame_size); diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index c1fd504cae2..c75679d4202 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -638,7 +638,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac) if (error) sdmac->status = DMA_ERROR; else - sdmac->status = DMA_SUCCESS; + sdmac->status = DMA_COMPLETE; dma_cookie_complete(&sdmac->desc); if (sdmac->desc.callback) @@ -1089,8 +1089,8 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg( param &= ~BD_CONT; } - dev_dbg(sdma->dev, "entry %d: count: %d dma: 0x%08x %s%s\n", - i, count, sg->dma_address, + dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n", + i, count, (u64)sg->dma_address, param & BD_WRAP ? "wrap" : "", param & BD_INTR ? " intr" : ""); @@ -1163,8 +1163,8 @@ static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic( if (i + 1 == num_periods) param |= BD_WRAP; - dev_dbg(sdma->dev, "entry %d: count: %d dma: 0x%08x %s%s\n", - i, period_len, dma_addr, + dev_dbg(sdma->dev, "entry %d: count: %d dma: %#llx %s%s\n", + i, period_len, (u64)dma_addr, param & BD_WRAP ? "wrap" : "", param & BD_INTR ? " intr" : ""); diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c index a975ebebea8..1aab8130efa 100644 --- a/drivers/dma/intel_mid_dma.c +++ b/drivers/dma/intel_mid_dma.c @@ -309,7 +309,7 @@ static void midc_descriptor_complete(struct intel_mid_dma_chan *midc, callback_txd(param_txd); } if (midc->raw_tfr) { - desc->status = DMA_SUCCESS; + desc->status = DMA_COMPLETE; if (desc->lli != NULL) { pci_pool_free(desc->lli_pool, desc->lli, desc->lli_phys); @@ -481,7 +481,7 @@ static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan, enum dma_status ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret != DMA_SUCCESS) { + if (ret != DMA_COMPLETE) { spin_lock_bh(&midc->lock); midc_scan_descriptors(to_middma_device(chan->device), midc); spin_unlock_bh(&midc->lock); diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 5ff6fc1819d..1a49c777607 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -531,21 +531,6 @@ static void ioat1_cleanup_event(unsigned long data) writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); } -void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, - size_t len, struct ioat_dma_descriptor *hw) -{ - struct pci_dev *pdev = chan->device->pdev; - size_t offset = len - hw->size; - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) - ioat_unmap(pdev, hw->dst_addr - offset, len, - PCI_DMA_FROMDEVICE, flags, 1); - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) - ioat_unmap(pdev, hw->src_addr - offset, len, - PCI_DMA_TODEVICE, flags, 0); -} - dma_addr_t ioat_get_current_completion(struct ioat_chan_common *chan) { dma_addr_t phys_complete; @@ -602,7 +587,7 @@ static void __cleanup(struct ioat_dma_chan *ioat, dma_addr_t phys_complete) dump_desc_dbg(ioat, desc); if (tx->cookie) { dma_cookie_complete(tx); - ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw); + dma_descriptor_unmap(tx); ioat->active -= desc->hw->tx_cnt; if (tx->callback) { tx->callback(tx->callback_param); @@ -733,7 +718,7 @@ ioat_dma_tx_status(struct dma_chan *c, dma_cookie_t cookie, enum dma_status ret; ret = dma_cookie_status(c, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; device->cleanup_fn((unsigned long) c); @@ -833,8 +818,7 @@ int ioat_dma_self_test(struct ioatdma_device *device) dma_src = dma_map_single(dev, src, 
IOAT_TEST_SIZE, DMA_TO_DEVICE); dma_dest = dma_map_single(dev, dest, IOAT_TEST_SIZE, DMA_FROM_DEVICE); - flags = DMA_COMPL_SKIP_SRC_UNMAP | DMA_COMPL_SKIP_DEST_UNMAP | - DMA_PREP_INTERRUPT; + flags = DMA_PREP_INTERRUPT; tx = device->common.device_prep_dma_memcpy(dma_chan, dma_dest, dma_src, IOAT_TEST_SIZE, flags); if (!tx) { @@ -859,7 +843,7 @@ int ioat_dma_self_test(struct ioatdma_device *device) if (tmo == 0 || dma->device_tx_status(dma_chan, cookie, NULL) - != DMA_SUCCESS) { + != DMA_COMPLETE) { dev_err(dev, "Self-test copy timed out, disabling\n"); err = -ENODEV; goto unmap_dma; @@ -885,8 +869,7 @@ static char ioat_interrupt_style[32] = "msix"; module_param_string(ioat_interrupt_style, ioat_interrupt_style, sizeof(ioat_interrupt_style), 0644); MODULE_PARM_DESC(ioat_interrupt_style, - "set ioat interrupt style: msix (default), " - "msix-single-vector, msi, intx)"); + "set ioat interrupt style: msix (default), msi, intx"); /** * ioat_dma_setup_interrupts - setup interrupt handler @@ -904,8 +887,6 @@ int ioat_dma_setup_interrupts(struct ioatdma_device *device) if (!strcmp(ioat_interrupt_style, "msix")) goto msix; - if (!strcmp(ioat_interrupt_style, "msix-single-vector")) - goto msix_single_vector; if (!strcmp(ioat_interrupt_style, "msi")) goto msi; if (!strcmp(ioat_interrupt_style, "intx")) @@ -920,10 +901,8 @@ msix: device->msix_entries[i].entry = i; err = pci_enable_msix(pdev, device->msix_entries, msixcnt); - if (err < 0) + if (err) goto msi; - if (err > 0) - goto msix_single_vector; for (i = 0; i < msixcnt; i++) { msix = &device->msix_entries[i]; @@ -937,29 +916,13 @@ msix: chan = ioat_chan_by_index(device, j); devm_free_irq(dev, msix->vector, chan); } - goto msix_single_vector; + goto msi; } } intrctrl |= IOAT_INTRCTRL_MSIX_VECTOR_CONTROL; device->irq_mode = IOAT_MSIX; goto done; -msix_single_vector: - msix = &device->msix_entries[0]; - msix->entry = 0; - err = pci_enable_msix(pdev, device->msix_entries, 1); - if (err) - goto msi; - - err = devm_request_irq(dev, msix->vector, ioat_dma_do_interrupt, 0, - "ioat-msix", device); - if (err) { - pci_disable_msix(pdev); - goto msi; - } - device->irq_mode = IOAT_MSIX_SINGLE; - goto done; - msi: err = pci_enable_msi(pdev); if (err) @@ -971,7 +934,7 @@ msi: pci_disable_msi(pdev); goto intx; } - device->irq_mode = IOAT_MSIX; + device->irq_mode = IOAT_MSI; goto done; intx: diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index 54fb7b9ff9a..11fb877ddca 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -52,7 +52,6 @@ enum ioat_irq_mode { IOAT_NOIRQ = 0, IOAT_MSIX, - IOAT_MSIX_SINGLE, IOAT_MSI, IOAT_INTX }; @@ -83,7 +82,6 @@ struct ioatdma_device { struct pci_pool *completion_pool; #define MAX_SED_POOLS 5 struct dma_pool *sed_hw_pool[MAX_SED_POOLS]; - struct kmem_cache *sed_pool; struct dma_device common; u8 version; struct msix_entry msix_entries[4]; @@ -342,16 +340,6 @@ static inline bool is_ioat_bug(unsigned long err) return !!err; } -static inline void ioat_unmap(struct pci_dev *pdev, dma_addr_t addr, size_t len, - int direction, enum dma_ctrl_flags flags, bool dst) -{ - if ((dst && (flags & DMA_COMPL_DEST_UNMAP_SINGLE)) || - (!dst && (flags & DMA_COMPL_SRC_UNMAP_SINGLE))) - pci_unmap_single(pdev, addr, len, direction); - else - pci_unmap_page(pdev, addr, len, direction); -} - int ioat_probe(struct ioatdma_device *device); int ioat_register(struct ioatdma_device *device); int ioat1_dma_probe(struct ioatdma_device *dev, int dca); @@ -363,8 +351,6 @@ void ioat_init_channel(struct ioatdma_device *device, struct 
ioat_chan_common *chan, int idx); enum dma_status ioat_dma_tx_status(struct dma_chan *c, dma_cookie_t cookie, struct dma_tx_state *txstate); -void ioat_dma_unmap(struct ioat_chan_common *chan, enum dma_ctrl_flags flags, - size_t len, struct ioat_dma_descriptor *hw); bool ioat_cleanup_preamble(struct ioat_chan_common *chan, dma_addr_t *phys_complete); void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type); diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index b925e1b1d13..5d3affe7e97 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -148,7 +148,7 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete) tx = &desc->txd; dump_desc_dbg(ioat, desc); if (tx->cookie) { - ioat_dma_unmap(chan, tx->flags, desc->len, desc->hw); + dma_descriptor_unmap(tx); dma_cookie_complete(tx); if (tx->callback) { tx->callback(tx->callback_param); diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h index 212d584fe42..470292767e6 100644 --- a/drivers/dma/ioat/dma_v2.h +++ b/drivers/dma/ioat/dma_v2.h @@ -157,7 +157,6 @@ static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr) int ioat2_dma_probe(struct ioatdma_device *dev, int dca); int ioat3_dma_probe(struct ioatdma_device *dev, int dca); -void ioat3_dma_remove(struct ioatdma_device *dev); struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase); struct dca_provider *ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase); int ioat2_check_space_lock(struct ioat2_dma_chan *ioat, int num_descs); diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index d8ececaf1b5..820817e97e6 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -67,6 +67,8 @@ #include "dma.h" #include "dma_v2.h" +extern struct kmem_cache *ioat3_sed_cache; + /* ioat hardware assumes at least two sources for raid operations */ #define src_cnt_to_sw(x) ((x) + 2) #define src_cnt_to_hw(x) ((x) - 2) @@ -87,22 +89,8 @@ static const u8 pq_idx_to_field[] = { 1, 4, 5, 0, 1, 2, 4, 5 }; static const u8 pq16_idx_to_field[] = { 1, 4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6 }; -/* - * technically sources 1 and 2 do not require SED, but the op will have - * at least 9 descriptors so that's irrelevant. 
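ioat3_alloc_sed() below now draws from ioat3_sed_cache, a single file-scope slab cache created once at module init (in pci.c, later in this patch) instead of the old per-device sed_pool. A sketch of that setup; demo_init is an assumed name:

        struct kmem_cache *ioat3_sed_cache;     /* shared by all ioat3 devices */

        static int __init demo_init(void)
        {
                ioat3_sed_cache = KMEM_CACHE(ioat_sed_ent, 0);
                if (!ioat3_sed_cache)
                        return -ENOMEM;
                return 0;
        }
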
- */ -static const u8 pq16_idx_to_sed[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1 }; - static void ioat3_eh(struct ioat2_dma_chan *ioat); -static dma_addr_t xor_get_src(struct ioat_raw_descriptor *descs[2], int idx) -{ - struct ioat_raw_descriptor *raw = descs[xor_idx_to_desc >> idx & 1]; - - return raw->field[xor_idx_to_field[idx]]; -} - static void xor_set_src(struct ioat_raw_descriptor *descs[2], dma_addr_t addr, u32 offset, int idx) { @@ -135,12 +123,6 @@ static void pq_set_src(struct ioat_raw_descriptor *descs[2], pq->coef[idx] = coef; } -static int sed_get_pq16_pool_idx(int src_cnt) -{ - - return pq16_idx_to_sed[src_cnt]; -} - static bool is_jf_ioat(struct pci_dev *pdev) { switch (pdev->device) { @@ -272,7 +254,7 @@ ioat3_alloc_sed(struct ioatdma_device *device, unsigned int hw_pool) struct ioat_sed_ent *sed; gfp_t flags = __GFP_ZERO | GFP_ATOMIC; - sed = kmem_cache_alloc(device->sed_pool, flags); + sed = kmem_cache_alloc(ioat3_sed_cache, flags); if (!sed) return NULL; @@ -280,7 +262,7 @@ ioat3_alloc_sed(struct ioatdma_device *device, unsigned int hw_pool) sed->hw = dma_pool_alloc(device->sed_hw_pool[hw_pool], flags, &sed->dma); if (!sed->hw) { - kmem_cache_free(device->sed_pool, sed); + kmem_cache_free(ioat3_sed_cache, sed); return NULL; } @@ -293,165 +275,7 @@ static void ioat3_free_sed(struct ioatdma_device *device, struct ioat_sed_ent *s return; dma_pool_free(device->sed_hw_pool[sed->hw_pool], sed->hw, sed->dma); - kmem_cache_free(device->sed_pool, sed); -} - -static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat, - struct ioat_ring_ent *desc, int idx) -{ - struct ioat_chan_common *chan = &ioat->base; - struct pci_dev *pdev = chan->device->pdev; - size_t len = desc->len; - size_t offset = len - desc->hw->size; - struct dma_async_tx_descriptor *tx = &desc->txd; - enum dma_ctrl_flags flags = tx->flags; - - switch (desc->hw->ctl_f.op) { - case IOAT_OP_COPY: - if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */ - ioat_dma_unmap(chan, flags, len, desc->hw); - break; - case IOAT_OP_XOR_VAL: - case IOAT_OP_XOR: { - struct ioat_xor_descriptor *xor = desc->xor; - struct ioat_ring_ent *ext; - struct ioat_xor_ext_descriptor *xor_ex = NULL; - int src_cnt = src_cnt_to_sw(xor->ctl_f.src_cnt); - struct ioat_raw_descriptor *descs[2]; - int i; - - if (src_cnt > 5) { - ext = ioat2_get_ring_ent(ioat, idx + 1); - xor_ex = ext->xor_ex; - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - descs[0] = (struct ioat_raw_descriptor *) xor; - descs[1] = (struct ioat_raw_descriptor *) xor_ex; - for (i = 0; i < src_cnt; i++) { - dma_addr_t src = xor_get_src(descs, i); - - ioat_unmap(pdev, src - offset, len, - PCI_DMA_TODEVICE, flags, 0); - } - - /* dest is a source in xor validate operations */ - if (xor->ctl_f.op == IOAT_OP_XOR_VAL) { - ioat_unmap(pdev, xor->dst_addr - offset, len, - PCI_DMA_TODEVICE, flags, 1); - break; - } - } - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) - ioat_unmap(pdev, xor->dst_addr - offset, len, - PCI_DMA_FROMDEVICE, flags, 1); - break; - } - case IOAT_OP_PQ_VAL: - case IOAT_OP_PQ: { - struct ioat_pq_descriptor *pq = desc->pq; - struct ioat_ring_ent *ext; - struct ioat_pq_ext_descriptor *pq_ex = NULL; - int src_cnt = src_cnt_to_sw(pq->ctl_f.src_cnt); - struct ioat_raw_descriptor *descs[2]; - int i; - - if (src_cnt > 3) { - ext = ioat2_get_ring_ent(ioat, idx + 1); - pq_ex = ext->pq_ex; - } - - /* in the 'continue' case don't unmap the dests as sources */ - if (dmaf_p_disabled_continue(flags)) - src_cnt--; - else if (dmaf_continue(flags)) - src_cnt -= 3; - - if (!(flags 
& DMA_COMPL_SKIP_SRC_UNMAP)) { - descs[0] = (struct ioat_raw_descriptor *) pq; - descs[1] = (struct ioat_raw_descriptor *) pq_ex; - for (i = 0; i < src_cnt; i++) { - dma_addr_t src = pq_get_src(descs, i); - - ioat_unmap(pdev, src - offset, len, - PCI_DMA_TODEVICE, flags, 0); - } - - /* the dests are sources in pq validate operations */ - if (pq->ctl_f.op == IOAT_OP_XOR_VAL) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, - len, PCI_DMA_TODEVICE, flags, 0); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, - len, PCI_DMA_TODEVICE, flags, 0); - break; - } - } - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - } - break; - } - case IOAT_OP_PQ_16S: - case IOAT_OP_PQ_VAL_16S: { - struct ioat_pq_descriptor *pq = desc->pq; - int src_cnt = src16_cnt_to_sw(pq->ctl_f.src_cnt); - struct ioat_raw_descriptor *descs[4]; - int i; - - /* in the 'continue' case don't unmap the dests as sources */ - if (dmaf_p_disabled_continue(flags)) - src_cnt--; - else if (dmaf_continue(flags)) - src_cnt -= 3; - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - descs[0] = (struct ioat_raw_descriptor *)pq; - descs[1] = (struct ioat_raw_descriptor *)(desc->sed->hw); - descs[2] = (struct ioat_raw_descriptor *)(&desc->sed->hw->b[0]); - for (i = 0; i < src_cnt; i++) { - dma_addr_t src = pq16_get_src(descs, i); - - ioat_unmap(pdev, src - offset, len, - PCI_DMA_TODEVICE, flags, 0); - } - - /* the dests are sources in pq validate operations */ - if (pq->ctl_f.op == IOAT_OP_XOR_VAL) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, - len, PCI_DMA_TODEVICE, - flags, 0); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, - len, PCI_DMA_TODEVICE, - flags, 0); - break; - } - } - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - if (!(flags & DMA_PREP_PQ_DISABLE_P)) - ioat_unmap(pdev, pq->p_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - if (!(flags & DMA_PREP_PQ_DISABLE_Q)) - ioat_unmap(pdev, pq->q_addr - offset, len, - PCI_DMA_BIDIRECTIONAL, flags, 1); - } - break; - } - default: - dev_err(&pdev->dev, "%s: unknown op type: %#x\n", - __func__, desc->hw->ctl_f.op); - } + kmem_cache_free(ioat3_sed_cache, sed); } static bool desc_has_ext(struct ioat_ring_ent *desc) @@ -577,7 +401,7 @@ static void __cleanup(struct ioat2_dma_chan *ioat, dma_addr_t phys_complete) tx = &desc->txd; if (tx->cookie) { dma_cookie_complete(tx); - ioat3_dma_unmap(ioat, desc, idx + i); + dma_descriptor_unmap(tx); if (tx->callback) { tx->callback(tx->callback_param); tx->callback = NULL; @@ -807,7 +631,7 @@ ioat3_tx_status(struct dma_chan *c, dma_cookie_t cookie, enum dma_status ret; ret = dma_cookie_status(c, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; ioat3_cleanup(ioat); @@ -1129,9 +953,6 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result, u8 op; int i, s, idx, num_descs; - /* this function only handles src_cnt 9 - 16 */ - BUG_ON(src_cnt < 9); - /* this function is only called with 9-16 sources */ op = result ? 
IOAT_OP_PQ_VAL_16S : IOAT_OP_PQ_16S; @@ -1159,8 +980,7 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result, descs[0] = (struct ioat_raw_descriptor *) pq; - desc->sed = ioat3_alloc_sed(device, - sed_get_pq16_pool_idx(src_cnt)); + desc->sed = ioat3_alloc_sed(device, (src_cnt-2) >> 3); if (!desc->sed) { dev_err(to_dev(chan), "%s: no free sed entries\n", __func__); @@ -1218,13 +1038,21 @@ __ioat3_prep_pq16_lock(struct dma_chan *c, enum sum_check_flags *result, return &desc->txd; } +static int src_cnt_flags(unsigned int src_cnt, unsigned long flags) +{ + if (dmaf_p_disabled_continue(flags)) + return src_cnt + 1; + else if (dmaf_continue(flags)) + return src_cnt + 3; + else + return src_cnt; +} + static struct dma_async_tx_descriptor * ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, unsigned long flags) { - struct dma_device *dma = chan->device; - /* specify valid address for disabled result */ if (flags & DMA_PREP_PQ_DISABLE_P) dst[0] = dst[1]; @@ -1244,7 +1072,7 @@ ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, single_source_coef[0] = scf[0]; single_source_coef[1] = 0; - return (src_cnt > 8) && (dma->max_pq > 8) ? + return src_cnt_flags(src_cnt, flags) > 8 ? __ioat3_prep_pq16_lock(chan, NULL, dst, single_source, 2, single_source_coef, len, flags) : @@ -1252,7 +1080,7 @@ ioat3_prep_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, single_source_coef, len, flags); } else { - return (src_cnt > 8) && (dma->max_pq > 8) ? + return src_cnt_flags(src_cnt, flags) > 8 ? __ioat3_prep_pq16_lock(chan, NULL, dst, src, src_cnt, scf, len, flags) : __ioat3_prep_pq_lock(chan, NULL, dst, src, src_cnt, @@ -1265,8 +1093,6 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, enum sum_check_flags *pqres, unsigned long flags) { - struct dma_device *dma = chan->device; - /* specify valid address for disabled result */ if (flags & DMA_PREP_PQ_DISABLE_P) pq[0] = pq[1]; @@ -1278,7 +1104,7 @@ ioat3_prep_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, */ *pqres = 0; - return (src_cnt > 8) && (dma->max_pq > 8) ? + return src_cnt_flags(src_cnt, flags) > 8 ? __ioat3_prep_pq16_lock(chan, pqres, pq, src, src_cnt, scf, len, flags) : __ioat3_prep_pq_lock(chan, pqres, pq, src, src_cnt, scf, len, @@ -1289,7 +1115,6 @@ static struct dma_async_tx_descriptor * ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags) { - struct dma_device *dma = chan->device; unsigned char scf[src_cnt]; dma_addr_t pq[2]; @@ -1298,7 +1123,7 @@ ioat3_prep_pqxor(struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, flags |= DMA_PREP_PQ_DISABLE_Q; pq[1] = dst; /* specify valid address for disabled result */ - return (src_cnt > 8) && (dma->max_pq > 8) ? + return src_cnt_flags(src_cnt, flags) > 8 ? 
__ioat3_prep_pq16_lock(chan, NULL, pq, src, src_cnt, scf, len, flags) : __ioat3_prep_pq_lock(chan, NULL, pq, src, src_cnt, scf, len, @@ -1310,7 +1135,6 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, size_t len, enum sum_check_flags *result, unsigned long flags) { - struct dma_device *dma = chan->device; unsigned char scf[src_cnt]; dma_addr_t pq[2]; @@ -1324,8 +1148,7 @@ ioat3_prep_pqxor_val(struct dma_chan *chan, dma_addr_t *src, flags |= DMA_PREP_PQ_DISABLE_Q; pq[1] = pq[0]; /* specify valid address for disabled result */ - - return (src_cnt > 8) && (dma->max_pq > 8) ? + return src_cnt_flags(src_cnt, flags) > 8 ? __ioat3_prep_pq16_lock(chan, result, pq, &src[1], src_cnt - 1, scf, len, flags) : __ioat3_prep_pq_lock(chan, result, pq, &src[1], src_cnt - 1, @@ -1444,9 +1267,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) DMA_TO_DEVICE); tx = dma->device_prep_dma_xor(dma_chan, dest_dma, dma_srcs, IOAT_NUM_SRC_TEST, PAGE_SIZE, - DMA_PREP_INTERRUPT | - DMA_COMPL_SKIP_SRC_UNMAP | - DMA_COMPL_SKIP_DEST_UNMAP); + DMA_PREP_INTERRUPT); if (!tx) { dev_err(dev, "Self-test xor prep failed\n"); @@ -1468,7 +1289,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) { + if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) { dev_err(dev, "Self-test xor timed out\n"); err = -ENODEV; goto dma_unmap; @@ -1507,9 +1328,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) DMA_TO_DEVICE); tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, - &xor_val_result, DMA_PREP_INTERRUPT | - DMA_COMPL_SKIP_SRC_UNMAP | - DMA_COMPL_SKIP_DEST_UNMAP); + &xor_val_result, DMA_PREP_INTERRUPT); if (!tx) { dev_err(dev, "Self-test zero prep failed\n"); err = -ENODEV; @@ -1530,7 +1349,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) { + if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) { dev_err(dev, "Self-test validate timed out\n"); err = -ENODEV; goto dma_unmap; @@ -1545,6 +1364,8 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) goto free_resources; } + memset(page_address(dest), 0, PAGE_SIZE); + /* test for non-zero parity sum */ op = IOAT_OP_XOR_VAL; @@ -1554,9 +1375,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) DMA_TO_DEVICE); tx = dma->device_prep_dma_xor_val(dma_chan, dma_srcs, IOAT_NUM_SRC_TEST + 1, PAGE_SIZE, - &xor_val_result, DMA_PREP_INTERRUPT | - DMA_COMPL_SKIP_SRC_UNMAP | - DMA_COMPL_SKIP_DEST_UNMAP); + &xor_val_result, DMA_PREP_INTERRUPT); if (!tx) { dev_err(dev, "Self-test 2nd zero prep failed\n"); err = -ENODEV; @@ -1577,7 +1396,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device) tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) { + if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_COMPLETE) { dev_err(dev, "Self-test 2nd validate timed out\n"); err = -ENODEV; goto dma_unmap; @@ -1630,52 +1449,36 @@ static int ioat3_dma_self_test(struct ioatdma_device *device) static int ioat3_irq_reinit(struct ioatdma_device *device) { - int msixcnt = device->common.chancnt; struct pci_dev *pdev = device->pdev; - int i; - struct msix_entry *msix; 
- struct ioat_chan_common *chan; - int err = 0; + int irq = pdev->irq, i; + + if (!is_bwd_ioat(pdev)) + return 0; switch (device->irq_mode) { case IOAT_MSIX: + for (i = 0; i < device->common.chancnt; i++) { + struct msix_entry *msix = &device->msix_entries[i]; + struct ioat_chan_common *chan; - for (i = 0; i < msixcnt; i++) { - msix = &device->msix_entries[i]; chan = ioat_chan_by_index(device, i); devm_free_irq(&pdev->dev, msix->vector, chan); } pci_disable_msix(pdev); break; - - case IOAT_MSIX_SINGLE: - msix = &device->msix_entries[0]; - chan = ioat_chan_by_index(device, 0); - devm_free_irq(&pdev->dev, msix->vector, chan); - pci_disable_msix(pdev); - break; - case IOAT_MSI: - chan = ioat_chan_by_index(device, 0); - devm_free_irq(&pdev->dev, pdev->irq, chan); pci_disable_msi(pdev); - break; - + /* fall through */ case IOAT_INTX: - chan = ioat_chan_by_index(device, 0); - devm_free_irq(&pdev->dev, pdev->irq, chan); + devm_free_irq(&pdev->dev, irq, device); break; - default: return 0; } - device->irq_mode = IOAT_NOIRQ; - err = ioat_dma_setup_interrupts(device); - - return err; + return ioat_dma_setup_interrupts(device); } static int ioat3_reset_hw(struct ioat_chan_common *chan) @@ -1718,14 +1521,12 @@ static int ioat3_reset_hw(struct ioat_chan_common *chan) } err = ioat2_reset_sync(chan, msecs_to_jiffies(200)); - if (err) { - dev_err(&pdev->dev, "Failed to reset!\n"); - return err; - } - - if (device->irq_mode != IOAT_NOIRQ && is_bwd_ioat(pdev)) + if (!err) err = ioat3_irq_reinit(device); + if (err) + dev_err(&pdev->dev, "Failed to reset: %d\n", err); + return err; } @@ -1835,21 +1636,15 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) char pool_name[14]; int i; - /* allocate sw descriptor pool for SED */ - device->sed_pool = kmem_cache_create("ioat_sed", - sizeof(struct ioat_sed_ent), 0, 0, NULL); - if (!device->sed_pool) - return -ENOMEM; - for (i = 0; i < MAX_SED_POOLS; i++) { snprintf(pool_name, 14, "ioat_hw%d_sed", i); /* allocate SED DMA pool */ - device->sed_hw_pool[i] = dma_pool_create(pool_name, + device->sed_hw_pool[i] = dmam_pool_create(pool_name, &pdev->dev, SED_SIZE * (i + 1), 64, 0); if (!device->sed_hw_pool[i]) - goto sed_pool_cleanup; + return -ENOMEM; } } @@ -1875,28 +1670,4 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) device->dca = ioat3_dca_init(pdev, device->reg_base); return 0; - -sed_pool_cleanup: - if (device->sed_pool) { - int i; - kmem_cache_destroy(device->sed_pool); - - for (i = 0; i < MAX_SED_POOLS; i++) - if (device->sed_hw_pool[i]) - dma_pool_destroy(device->sed_hw_pool[i]); - } - - return -ENOMEM; -} - -void ioat3_dma_remove(struct ioatdma_device *device) -{ - if (device->sed_pool) { - int i; - kmem_cache_destroy(device->sed_pool); - - for (i = 0; i < MAX_SED_POOLS; i++) - if (device->sed_hw_pool[i]) - dma_pool_destroy(device->sed_hw_pool[i]); - } } diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c index 2c8d560e633..1d051cd045d 100644 --- a/drivers/dma/ioat/pci.c +++ b/drivers/dma/ioat/pci.c @@ -123,6 +123,7 @@ module_param(ioat_dca_enabled, int, 0644); MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)"); struct kmem_cache *ioat2_cache; +struct kmem_cache *ioat3_sed_cache; #define DRV_NAME "ioatdma" @@ -207,9 +208,6 @@ static void ioat_remove(struct pci_dev *pdev) if (!device) return; - if (device->version >= IOAT_VER_3_0) - ioat3_dma_remove(device); - dev_err(&pdev->dev, "Removing dma and dca services\n"); if (device->dca) { unregister_dca_provider(device->dca, &pdev->dev); @@ -221,7 
+219,7 @@ static void ioat_remove(struct pci_dev *pdev) static int __init ioat_init_module(void) { - int err; + int err = -ENOMEM; pr_info("%s: Intel(R) QuickData Technology Driver %s\n", DRV_NAME, IOAT_DMA_VERSION); @@ -231,9 +229,21 @@ static int __init ioat_init_module(void) if (!ioat2_cache) return -ENOMEM; + ioat3_sed_cache = KMEM_CACHE(ioat_sed_ent, 0); + if (!ioat3_sed_cache) + goto err_ioat2_cache; + err = pci_register_driver(&ioat_pci_driver); if (err) - kmem_cache_destroy(ioat2_cache); + goto err_ioat3_cache; + + return 0; + + err_ioat3_cache: + kmem_cache_destroy(ioat3_sed_cache); + + err_ioat2_cache: + kmem_cache_destroy(ioat2_cache); return err; } diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index dd8b44a56e5..c56137bc386 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -61,80 +61,6 @@ static void iop_adma_free_slots(struct iop_adma_desc_slot *slot) } } -static void -iop_desc_unmap(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) -{ - struct dma_async_tx_descriptor *tx = &desc->async_tx; - struct iop_adma_desc_slot *unmap = desc->group_head; - struct device *dev = &iop_chan->device->pdev->dev; - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = tx->flags; - u32 src_cnt; - dma_addr_t addr; - dma_addr_t dest; - - src_cnt = unmap->unmap_src_cnt; - dest = iop_desc_get_dest_addr(unmap, iop_chan); - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - enum dma_data_direction dir; - - if (src_cnt > 1) /* is xor? */ - dir = DMA_BIDIRECTIONAL; - else - dir = DMA_FROM_DEVICE; - - dma_unmap_page(dev, dest, len, dir); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - while (src_cnt--) { - addr = iop_desc_get_src_addr(unmap, iop_chan, src_cnt); - if (addr == dest) - continue; - dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); - } - } - desc->group_head = NULL; -} - -static void -iop_desc_unmap_pq(struct iop_adma_chan *iop_chan, struct iop_adma_desc_slot *desc) -{ - struct dma_async_tx_descriptor *tx = &desc->async_tx; - struct iop_adma_desc_slot *unmap = desc->group_head; - struct device *dev = &iop_chan->device->pdev->dev; - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = tx->flags; - u32 src_cnt = unmap->unmap_src_cnt; - dma_addr_t pdest = iop_desc_get_dest_addr(unmap, iop_chan); - dma_addr_t qdest = iop_desc_get_qdest_addr(unmap, iop_chan); - int i; - - if (tx->flags & DMA_PREP_CONTINUE) - src_cnt -= 3; - - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP) && !desc->pq_check_result) { - dma_unmap_page(dev, pdest, len, DMA_BIDIRECTIONAL); - dma_unmap_page(dev, qdest, len, DMA_BIDIRECTIONAL); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - dma_addr_t addr; - - for (i = 0; i < src_cnt; i++) { - addr = iop_desc_get_src_addr(unmap, iop_chan, i); - dma_unmap_page(dev, addr, len, DMA_TO_DEVICE); - } - if (desc->pq_check_result) { - dma_unmap_page(dev, pdest, len, DMA_TO_DEVICE); - dma_unmap_page(dev, qdest, len, DMA_TO_DEVICE); - } - } - - desc->group_head = NULL; -} - - static dma_cookie_t iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, struct iop_adma_chan *iop_chan, dma_cookie_t cookie) @@ -152,15 +78,9 @@ iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, if (tx->callback) tx->callback(tx->callback_param); - /* unmap dma addresses - * (unmap_single vs unmap_page?) 
- */ - if (desc->group_head && desc->unmap_len) { - if (iop_desc_is_pq(desc)) - iop_desc_unmap_pq(iop_chan, desc); - else - iop_desc_unmap(iop_chan, desc); - } + dma_descriptor_unmap(tx); + if (desc->group_head) + desc->group_head = NULL; } /* run dependent operations */ @@ -591,7 +511,6 @@ iop_adma_prep_dma_interrupt(struct dma_chan *chan, unsigned long flags) if (sw_desc) { grp_start = sw_desc->group_head; iop_desc_init_interrupt(grp_start, iop_chan); - grp_start->unmap_len = 0; sw_desc->async_tx.flags = flags; } spin_unlock_bh(&iop_chan->lock); @@ -623,8 +542,6 @@ iop_adma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dma_dest, iop_desc_set_byte_count(grp_start, iop_chan, len); iop_desc_set_dest_addr(grp_start, iop_chan, dma_dest); iop_desc_set_memcpy_src_addr(grp_start, dma_src); - sw_desc->unmap_src_cnt = 1; - sw_desc->unmap_len = len; sw_desc->async_tx.flags = flags; } spin_unlock_bh(&iop_chan->lock); @@ -657,8 +574,6 @@ iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest, iop_desc_init_xor(grp_start, src_cnt, flags); iop_desc_set_byte_count(grp_start, iop_chan, len); iop_desc_set_dest_addr(grp_start, iop_chan, dma_dest); - sw_desc->unmap_src_cnt = src_cnt; - sw_desc->unmap_len = len; sw_desc->async_tx.flags = flags; while (src_cnt--) iop_desc_set_xor_src_addr(grp_start, src_cnt, @@ -694,8 +609,6 @@ iop_adma_prep_dma_xor_val(struct dma_chan *chan, dma_addr_t *dma_src, grp_start->xor_check_result = result; pr_debug("\t%s: grp_start->xor_check_result: %p\n", __func__, grp_start->xor_check_result); - sw_desc->unmap_src_cnt = src_cnt; - sw_desc->unmap_len = len; sw_desc->async_tx.flags = flags; while (src_cnt--) iop_desc_set_zero_sum_src_addr(grp_start, src_cnt, @@ -748,8 +661,6 @@ iop_adma_prep_dma_pq(struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, dst[0] = dst[1] & 0x7; iop_desc_set_pq_addr(g, dst); - sw_desc->unmap_src_cnt = src_cnt; - sw_desc->unmap_len = len; sw_desc->async_tx.flags = flags; for (i = 0; i < src_cnt; i++) iop_desc_set_pq_src_addr(g, i, src[i], scf[i]); @@ -804,8 +715,6 @@ iop_adma_prep_dma_pq_val(struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, g->pq_check_result = pqres; pr_debug("\t%s: g->pq_check_result: %p\n", __func__, g->pq_check_result); - sw_desc->unmap_src_cnt = src_cnt+2; - sw_desc->unmap_len = len; sw_desc->async_tx.flags = flags; while (src_cnt--) iop_desc_set_pq_zero_sum_src_addr(g, src_cnt, @@ -864,7 +773,7 @@ static enum dma_status iop_adma_status(struct dma_chan *chan, int ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; iop_adma_slot_cleanup(iop_chan); @@ -983,7 +892,7 @@ static int iop_adma_memcpy_self_test(struct iop_adma_device *device) msleep(1); if (iop_adma_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test copy timed out, disabling\n"); err = -ENODEV; @@ -1083,7 +992,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) msleep(8); if (iop_adma_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test xor timed out, disabling\n"); err = -ENODEV; @@ -1129,7 +1038,7 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device) iop_adma_issue_pending(dma_chan); msleep(8); - if (iop_adma_status(dma_chan, cookie, NULL) != DMA_SUCCESS) { + if (iop_adma_status(dma_chan, cookie, NULL) != DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test zero sum timed out, disabling\n"); err = -ENODEV; @@ -1158,7 +1067,7 @@ 
iop_adma_xor_val_self_test(struct iop_adma_device *device) iop_adma_issue_pending(dma_chan); msleep(8); - if (iop_adma_status(dma_chan, cookie, NULL) != DMA_SUCCESS) { + if (iop_adma_status(dma_chan, cookie, NULL) != DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test non-zero sum timed out, disabling\n"); err = -ENODEV; @@ -1254,7 +1163,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device) msleep(8); if (iop_adma_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dev, "Self-test pq timed out, disabling\n"); err = -ENODEV; goto free_resources; @@ -1291,7 +1200,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device) msleep(8); if (iop_adma_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dev, "Self-test pq-zero-sum timed out, disabling\n"); err = -ENODEV; goto free_resources; @@ -1323,7 +1232,7 @@ iop_adma_pq_zero_sum_self_test(struct iop_adma_device *device) msleep(8); if (iop_adma_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dev, "Self-test !pq-zero-sum timed out, disabling\n"); err = -ENODEV; goto free_resources; diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index cb9c0bc317e..128ca143486 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -1232,8 +1232,10 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list); descnew = desc; - dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n", - irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf); + dev_dbg(dev, "IDMAC irq %d, dma %#llx, next dma %#llx, current %d, curbuf %#x\n", + irq, (u64)sg_dma_address(*sg), + sgnext ? 
(u64)sg_dma_address(sgnext) : 0, + ichan->active_buffer, curbuf); /* Find the descriptor of sgnext */ sgnew = idmac_sg_next(ichan, &descnew, *sg); diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c index a2c330f5f95..e26075408e9 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -344,7 +344,7 @@ static enum dma_status k3_dma_tx_status(struct dma_chan *chan, size_t bytes = 0; ret = dma_cookie_status(&c->vc.chan, cookie, state); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; spin_lock_irqsave(&c->vc.lock, flags); @@ -693,7 +693,7 @@ static int k3_dma_probe(struct platform_device *op) irq = platform_get_irq(op, 0); ret = devm_request_irq(&op->dev, irq, - k3_dma_int_handler, IRQF_DISABLED, DRIVER_NAME, d); + k3_dma_int_handler, 0, DRIVER_NAME, d); if (ret) return ret; diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c index ff8d7827f8c..dcb1e05149a 100644 --- a/drivers/dma/mmp_pdma.c +++ b/drivers/dma/mmp_pdma.c @@ -798,8 +798,7 @@ static void dma_do_tasklet(unsigned long data) * move the descriptors to a temporary list so we can drop * the lock during the entire cleanup operation */ - list_del(&desc->node); - list_add(&desc->node, &chain_cleanup); + list_move(&desc->node, &chain_cleanup); /* * Look for the first list entry which has the ENDIRQEN flag @@ -863,7 +862,7 @@ static int mmp_pdma_chan_init(struct mmp_pdma_device *pdev, if (irq) { ret = devm_request_irq(pdev->dev, irq, - mmp_pdma_chan_handler, IRQF_DISABLED, "pdma", phy); + mmp_pdma_chan_handler, 0, "pdma", phy); if (ret) { dev_err(pdev->dev, "channel request irq fail!\n"); return ret; @@ -970,7 +969,7 @@ static int mmp_pdma_probe(struct platform_device *op) /* all chan share one irq, demux inside */ irq = platform_get_irq(op, 0); ret = devm_request_irq(pdev->dev, irq, - mmp_pdma_int_handler, IRQF_DISABLED, "pdma", pdev); + mmp_pdma_int_handler, 0, "pdma", pdev); if (ret) return ret; } diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c index d3b6358e5a2..3ddacc14a73 100644 --- a/drivers/dma/mmp_tdma.c +++ b/drivers/dma/mmp_tdma.c @@ -62,6 +62,11 @@ #define TDCR_BURSTSZ_16B (0x3 << 6) #define TDCR_BURSTSZ_32B (0x6 << 6) #define TDCR_BURSTSZ_64B (0x7 << 6) +#define TDCR_BURSTSZ_SQU_1B (0x5 << 6) +#define TDCR_BURSTSZ_SQU_2B (0x6 << 6) +#define TDCR_BURSTSZ_SQU_4B (0x0 << 6) +#define TDCR_BURSTSZ_SQU_8B (0x1 << 6) +#define TDCR_BURSTSZ_SQU_16B (0x3 << 6) #define TDCR_BURSTSZ_SQU_32B (0x7 << 6) #define TDCR_BURSTSZ_128B (0x5 << 6) #define TDCR_DSTDIR_MSK (0x3 << 4) /* Dst Direction */ @@ -158,7 +163,7 @@ static void mmp_tdma_disable_chan(struct mmp_tdma_chan *tdmac) /* disable irq */ writel(0, tdmac->reg_base + TDIMR); - tdmac->status = DMA_SUCCESS; + tdmac->status = DMA_COMPLETE; } static void mmp_tdma_resume_chan(struct mmp_tdma_chan *tdmac) @@ -228,8 +233,31 @@ static int mmp_tdma_config_chan(struct mmp_tdma_chan *tdmac) return -EINVAL; } } else if (tdmac->type == PXA910_SQU) { - tdcr |= TDCR_BURSTSZ_SQU_32B; tdcr |= TDCR_SSPMOD; + + switch (tdmac->burst_sz) { + case 1: + tdcr |= TDCR_BURSTSZ_SQU_1B; + break; + case 2: + tdcr |= TDCR_BURSTSZ_SQU_2B; + break; + case 4: + tdcr |= TDCR_BURSTSZ_SQU_4B; + break; + case 8: + tdcr |= TDCR_BURSTSZ_SQU_8B; + break; + case 16: + tdcr |= TDCR_BURSTSZ_SQU_16B; + break; + case 32: + tdcr |= TDCR_BURSTSZ_SQU_32B; + break; + default: + dev_err(tdmac->dev, "mmp_tdma: unknown burst size.\n"); + return -EINVAL; + } } writel(tdcr, tdmac->reg_base + TDCR); @@ -324,7 +352,7 @@ static int mmp_tdma_alloc_chan_resources(struct dma_chan *chan) if 
(tdmac->irq) { ret = devm_request_irq(tdmac->dev, tdmac->irq, - mmp_tdma_chan_handler, IRQF_DISABLED, "tdma", tdmac); + mmp_tdma_chan_handler, 0, "tdma", tdmac); if (ret) return ret; } @@ -365,7 +393,7 @@ static struct dma_async_tx_descriptor *mmp_tdma_prep_dma_cyclic( int num_periods = buf_len / period_len; int i = 0, buf = 0; - if (tdmac->status != DMA_SUCCESS) + if (tdmac->status != DMA_COMPLETE) return NULL; if (period_len > TDMA_MAX_XFER_BYTES) { @@ -499,7 +527,7 @@ static int mmp_tdma_chan_init(struct mmp_tdma_device *tdev, tdmac->idx = idx; tdmac->type = type; tdmac->reg_base = (unsigned long)tdev->base + idx * 4; - tdmac->status = DMA_SUCCESS; + tdmac->status = DMA_COMPLETE; tdev->tdmac[tdmac->idx] = tdmac; tasklet_init(&tdmac->tasklet, dma_do_tasklet, (unsigned long)tdmac); @@ -554,7 +582,7 @@ static int mmp_tdma_probe(struct platform_device *pdev) if (irq_num != chan_num) { irq = platform_get_irq(pdev, 0); ret = devm_request_irq(&pdev->dev, irq, - mmp_tdma_int_handler, IRQF_DISABLED, "tdma", tdev); + mmp_tdma_int_handler, 0, "tdma", tdev); if (ret) return ret; } diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 536dcb8ba5f..7807f0ef4e2 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -60,14 +60,6 @@ static u32 mv_desc_get_dest_addr(struct mv_xor_desc_slot *desc) return hw_desc->phy_dest_addr; } -static u32 mv_desc_get_src_addr(struct mv_xor_desc_slot *desc, - int src_idx) -{ - struct mv_xor_desc *hw_desc = desc->hw_desc; - return hw_desc->phy_src_addr[mv_phy_src_idx(src_idx)]; -} - - static void mv_desc_set_byte_count(struct mv_xor_desc_slot *desc, u32 byte_count) { @@ -278,42 +270,9 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, desc->async_tx.callback( desc->async_tx.callback_param); - /* unmap dma addresses - * (unmap_single vs unmap_page?) - */ - if (desc->group_head && desc->unmap_len) { - struct mv_xor_desc_slot *unmap = desc->group_head; - struct device *dev = mv_chan_to_devp(mv_chan); - u32 len = unmap->unmap_len; - enum dma_ctrl_flags flags = desc->async_tx.flags; - u32 src_cnt; - dma_addr_t addr; - dma_addr_t dest; - - src_cnt = unmap->unmap_src_cnt; - dest = mv_desc_get_dest_addr(unmap); - if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - enum dma_data_direction dir; - - if (src_cnt > 1) /* is xor ? 
*/ - dir = DMA_BIDIRECTIONAL; - else - dir = DMA_FROM_DEVICE; - dma_unmap_page(dev, dest, len, dir); - } - - if (!(flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - while (src_cnt--) { - addr = mv_desc_get_src_addr(unmap, - src_cnt); - if (addr == dest) - continue; - dma_unmap_page(dev, addr, len, - DMA_TO_DEVICE); - } - } + dma_descriptor_unmap(&desc->async_tx); + if (desc->group_head) desc->group_head = NULL; - } } /* run dependent operations */ @@ -749,7 +708,7 @@ static enum dma_status mv_xor_status(struct dma_chan *chan, enum dma_status ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) { + if (ret == DMA_COMPLETE) { mv_xor_clean_completed_slots(mv_chan); return ret; } @@ -874,7 +833,7 @@ static int mv_xor_memcpy_self_test(struct mv_xor_chan *mv_chan) msleep(1); if (mv_xor_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test copy timed out, disabling\n"); err = -ENODEV; @@ -968,7 +927,7 @@ mv_xor_xor_self_test(struct mv_xor_chan *mv_chan) msleep(8); if (mv_xor_status(dma_chan, cookie, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { dev_err(dma_chan->device->dev, "Self-test xor timed out, disabling\n"); err = -ENODEV; @@ -1076,10 +1035,7 @@ mv_xor_channel_add(struct mv_xor_device *xordev, } mv_chan->mmr_base = xordev->xor_base; - if (!mv_chan->mmr_base) { - ret = -ENOMEM; - goto err_free_dma; - } + mv_chan->mmr_high_base = xordev->xor_high_base; tasklet_init(&mv_chan->irq_tasklet, mv_xor_tasklet, (unsigned long) mv_chan); @@ -1138,7 +1094,7 @@ static void mv_xor_conf_mbus_windows(struct mv_xor_device *xordev, const struct mbus_dram_target_info *dram) { - void __iomem *base = xordev->xor_base; + void __iomem *base = xordev->xor_high_base; u32 win_enable = 0; int i; diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h index 06b067f24c9..d0749229c87 100644 --- a/drivers/dma/mv_xor.h +++ b/drivers/dma/mv_xor.h @@ -34,13 +34,13 @@ #define XOR_OPERATION_MODE_MEMCPY 2 #define XOR_DESCRIPTOR_SWAP BIT(14) -#define XOR_CURR_DESC(chan) (chan->mmr_base + 0x210 + (chan->idx * 4)) -#define XOR_NEXT_DESC(chan) (chan->mmr_base + 0x200 + (chan->idx * 4)) -#define XOR_BYTE_COUNT(chan) (chan->mmr_base + 0x220 + (chan->idx * 4)) -#define XOR_DEST_POINTER(chan) (chan->mmr_base + 0x2B0 + (chan->idx * 4)) -#define XOR_BLOCK_SIZE(chan) (chan->mmr_base + 0x2C0 + (chan->idx * 4)) -#define XOR_INIT_VALUE_LOW(chan) (chan->mmr_base + 0x2E0) -#define XOR_INIT_VALUE_HIGH(chan) (chan->mmr_base + 0x2E4) +#define XOR_CURR_DESC(chan) (chan->mmr_high_base + 0x10 + (chan->idx * 4)) +#define XOR_NEXT_DESC(chan) (chan->mmr_high_base + 0x00 + (chan->idx * 4)) +#define XOR_BYTE_COUNT(chan) (chan->mmr_high_base + 0x20 + (chan->idx * 4)) +#define XOR_DEST_POINTER(chan) (chan->mmr_high_base + 0xB0 + (chan->idx * 4)) +#define XOR_BLOCK_SIZE(chan) (chan->mmr_high_base + 0xC0 + (chan->idx * 4)) +#define XOR_INIT_VALUE_LOW(chan) (chan->mmr_high_base + 0xE0) +#define XOR_INIT_VALUE_HIGH(chan) (chan->mmr_high_base + 0xE4) #define XOR_CONFIG(chan) (chan->mmr_base + 0x10 + (chan->idx * 4)) #define XOR_ACTIVATION(chan) (chan->mmr_base + 0x20 + (chan->idx * 4)) @@ -50,11 +50,11 @@ #define XOR_ERROR_ADDR(chan) (chan->mmr_base + 0x60) #define XOR_INTR_MASK_VALUE 0x3F5 -#define WINDOW_BASE(w) (0x250 + ((w) << 2)) -#define WINDOW_SIZE(w) (0x270 + ((w) << 2)) -#define WINDOW_REMAP_HIGH(w) (0x290 + ((w) << 2)) -#define WINDOW_BAR_ENABLE(chan) (0x240 + ((chan) << 2)) -#define WINDOW_OVERRIDE_CTRL(chan) (0x2A0 + ((chan) << 2)) +#define WINDOW_BASE(w) (0x50 + ((w) << 2)) 
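The mv_xor hunk above is one instance of a tree-wide cleanup visible throughout this diff: iop-adma, mv_xor, ppc4xx/adma, timb_dma and txx9dmac all lose their open-coded dma_unmap_page()/dma_unmap_single() completion loops in favour of one call to the new core helper dma_descriptor_unmap(), and the same files pick up the DMA_SUCCESS -> DMA_COMPLETE status rename. A rough sketch of the helper and the bookkeeping it relies on, modelled on the dmaengine headers of this period (the exact field layout is an assumption):

    /* Sketch, not the verbatim kernel definition. */
    struct dmaengine_unmap_data {
            u8 to_cnt;              /* entries mapped DMA_TO_DEVICE */
            u8 from_cnt;            /* entries mapped DMA_FROM_DEVICE */
            u8 bidi_cnt;            /* entries mapped DMA_BIDIRECTIONAL */
            struct device *dev;     /* device the addresses were mapped for */
            struct kref kref;       /* final put performs the dma_unmap_*() calls */
            size_t len;
            dma_addr_t addr[0];     /* to, then from, then bidi addresses */
    };

    static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx)
    {
            /* Drop the reference the submitter attached; the kref
             * release walks addr[] and unmaps, so drivers no longer
             * track source/destination counts themselves.
             */
            if (tx->unmap) {
                    dmaengine_unmap_put(tx->unmap);
                    tx->unmap = NULL;
            }
    }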
+#define WINDOW_SIZE(w) (0x70 + ((w) << 2)) +#define WINDOW_REMAP_HIGH(w) (0x90 + ((w) << 2)) +#define WINDOW_BAR_ENABLE(chan) (0x40 + ((chan) << 2)) +#define WINDOW_OVERRIDE_CTRL(chan) (0xA0 + ((chan) << 2)) struct mv_xor_device { void __iomem *xor_base; @@ -82,6 +82,7 @@ struct mv_xor_chan { int pending; spinlock_t lock; /* protects the descriptor slot pool */ void __iomem *mmr_base; + void __iomem *mmr_high_base; unsigned int idx; int irq; enum dma_transaction_type current_type; diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index ccd13df841d..ead491346da 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -27,6 +27,7 @@ #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_dma.h> +#include <linux/list.h> #include <asm/irq.h> @@ -57,6 +58,9 @@ (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x050 : 0x110) + (n) * 0x70) #define HW_APBHX_CHn_SEMA(d, n) \ (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x080 : 0x140) + (n) * 0x70) +#define HW_APBHX_CHn_BAR(d, n) \ + (((dma_is_apbh(d) && apbh_is_old(d)) ? 0x070 : 0x130) + (n) * 0x70) +#define HW_APBX_CHn_DEBUG1(d, n) (0x150 + (n) * 0x70) /* * ccw bits definitions @@ -115,7 +119,9 @@ struct mxs_dma_chan { int desc_count; enum dma_status status; unsigned int flags; + bool reset; #define MXS_DMA_SG_LOOP (1 << 0) +#define MXS_DMA_USE_SEMAPHORE (1 << 1) }; #define MXS_DMA_CHANNELS 16 @@ -201,12 +207,47 @@ static void mxs_dma_reset_chan(struct mxs_dma_chan *mxs_chan) struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; int chan_id = mxs_chan->chan.chan_id; - if (dma_is_apbh(mxs_dma) && apbh_is_old(mxs_dma)) + /* + * mxs dma channel resets can cause a channel stall. To recover from a + * channel stall, we have to reset the whole DMA engine. To avoid this, + * we use cyclic DMA with semaphores, that are enhanced in + * mxs_dma_int_handler. To reset the channel, we can simply stop writing + * into the semaphore counter. + */ + if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE && + mxs_chan->flags & MXS_DMA_SG_LOOP) { + mxs_chan->reset = true; + } else if (dma_is_apbh(mxs_dma) && apbh_is_old(mxs_dma)) { writel(1 << (chan_id + BP_APBH_CTRL0_RESET_CHANNEL), mxs_dma->base + HW_APBHX_CTRL0 + STMP_OFFSET_REG_SET); - else + } else { + unsigned long elapsed = 0; + const unsigned long max_wait = 50000; /* 50ms */ + void __iomem *reg_dbg1 = mxs_dma->base + + HW_APBX_CHn_DEBUG1(mxs_dma, chan_id); + + /* + * On i.MX28 APBX, the DMA channel can stop working if we reset + * the channel while it is in READ_FLUSH (0x08) state. + * We wait here until we leave the state. Then we trigger the + * reset. Waiting a maximum of 50ms, the kernel shouldn't crash + * because of this. 
+ */ + while ((readl(reg_dbg1) & 0xf) == 0x8 && elapsed < max_wait) { + udelay(100); + elapsed += 100; + } + + if (elapsed >= max_wait) + dev_err(&mxs_chan->mxs_dma->pdev->dev, + "Failed waiting for the DMA channel %d to leave state READ_FLUSH, trying to reset channel in READ_FLUSH state now\n", + chan_id); + writel(1 << (chan_id + BP_APBHX_CHANNEL_CTRL_RESET_CHANNEL), mxs_dma->base + HW_APBHX_CHANNEL_CTRL + STMP_OFFSET_REG_SET); + } + + mxs_chan->status = DMA_COMPLETE; } static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan) @@ -219,12 +260,21 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan) mxs_dma->base + HW_APBHX_CHn_NXTCMDAR(mxs_dma, chan_id)); /* write 1 to SEMA to kick off the channel */ - writel(1, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id)); + if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE && + mxs_chan->flags & MXS_DMA_SG_LOOP) { + /* A cyclic DMA consists of at least 2 segments, so initialize + * the semaphore with 2 so we have enough time to add 1 to the + * semaphore if we need to */ + writel(2, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id)); + } else { + writel(1, mxs_dma->base + HW_APBHX_CHn_SEMA(mxs_dma, chan_id)); + } + mxs_chan->reset = false; } static void mxs_dma_disable_chan(struct mxs_dma_chan *mxs_chan) { - mxs_chan->status = DMA_SUCCESS; + mxs_chan->status = DMA_COMPLETE; } static void mxs_dma_pause_chan(struct mxs_dma_chan *mxs_chan) @@ -272,58 +322,88 @@ static void mxs_dma_tasklet(unsigned long data) mxs_chan->desc.callback(mxs_chan->desc.callback_param); } +static int mxs_dma_irq_to_chan(struct mxs_dma_engine *mxs_dma, int irq) +{ + int i; + + for (i = 0; i != mxs_dma->nr_channels; ++i) + if (mxs_dma->mxs_chans[i].chan_irq == irq) + return i; + + return -EINVAL; +} + static irqreturn_t mxs_dma_int_handler(int irq, void *dev_id) { struct mxs_dma_engine *mxs_dma = dev_id; - u32 stat1, stat2; + struct mxs_dma_chan *mxs_chan; + u32 completed; + u32 err; + int chan = mxs_dma_irq_to_chan(mxs_dma, irq); + + if (chan < 0) + return IRQ_NONE; /* completion status */ - stat1 = readl(mxs_dma->base + HW_APBHX_CTRL1); - stat1 &= MXS_DMA_CHANNELS_MASK; - writel(stat1, mxs_dma->base + HW_APBHX_CTRL1 + STMP_OFFSET_REG_CLR); + completed = readl(mxs_dma->base + HW_APBHX_CTRL1); + completed = (completed >> chan) & 0x1; + + /* Clear interrupt */ + writel((1 << chan), + mxs_dma->base + HW_APBHX_CTRL1 + STMP_OFFSET_REG_CLR); /* error status */ - stat2 = readl(mxs_dma->base + HW_APBHX_CTRL2); - writel(stat2, mxs_dma->base + HW_APBHX_CTRL2 + STMP_OFFSET_REG_CLR); + err = readl(mxs_dma->base + HW_APBHX_CTRL2); + err &= (1 << (MXS_DMA_CHANNELS + chan)) | (1 << chan); + + /* + * error status bit is in the upper 16 bits, error irq bit in the lower + * 16 bits. We transform it into a simpler error code: + * err: 0x00 = no error, 0x01 = TERMINATION, 0x02 = BUS_ERROR + */ + err = (err >> (MXS_DMA_CHANNELS + chan)) + (err >> chan); + + /* Clear error irq */ + writel((1 << chan), + mxs_dma->base + HW_APBHX_CTRL2 + STMP_OFFSET_REG_CLR); /* * When both completion and error of termination bits set at the * same time, we do not take it as an error. IOW, it only becomes - * an error we need to handle here in case of either it's (1) a bus - * error or (2) a termination error with no completion. + * an error we need to handle here in case of either it's a bus + * error or a termination error with no completion. 0x01 is termination + * error, so we can subtract err & completed to get the real error case. 
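The interrupt-handler rewrite in this mxs-dma hunk is what makes the semaphore-based channel stop work: a cyclic channel is armed with a semaphore count of 2 and topped up by 1 on every period interrupt, so mxs_dma_reset_chan() can stop it without a hardware reset simply by marking it complete and withholding the top-up until the count runs out. Condensed into a sketch (example_service_cyclic() is a hypothetical distillation of the handler logic around it):

    static void example_service_cyclic(struct mxs_dma_engine *mxs_dma,
                                       struct mxs_dma_chan *mxs_chan,
                                       int chan)
    {
            /* A channel marked for reset was set to DMA_COMPLETE by
             * mxs_dma_reset_chan(), so it gets no top-up and its
             * semaphore count simply drains.
             */
            if (mxs_chan->status == DMA_COMPLETE)
                    return;

            if (mxs_chan->flags & MXS_DMA_SG_LOOP) {
                    mxs_chan->status = DMA_IN_PROGRESS;
                    if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE)
                            writel(1, mxs_dma->base +
                                   HW_APBHX_CHn_SEMA(mxs_dma, chan));
            }
    }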
*/ - stat2 = ((stat2 >> MXS_DMA_CHANNELS) & stat2) | /* (1) */ - (~(stat2 >> MXS_DMA_CHANNELS) & stat2 & ~stat1); /* (2) */ - - /* combine error and completion status for checking */ - stat1 = (stat2 << MXS_DMA_CHANNELS) | stat1; - while (stat1) { - int channel = fls(stat1) - 1; - struct mxs_dma_chan *mxs_chan = - &mxs_dma->mxs_chans[channel % MXS_DMA_CHANNELS]; - - if (channel >= MXS_DMA_CHANNELS) { - dev_dbg(mxs_dma->dma_device.dev, - "%s: error in channel %d\n", __func__, - channel - MXS_DMA_CHANNELS); - mxs_chan->status = DMA_ERROR; - mxs_dma_reset_chan(mxs_chan); - } else { - if (mxs_chan->flags & MXS_DMA_SG_LOOP) - mxs_chan->status = DMA_IN_PROGRESS; - else - mxs_chan->status = DMA_SUCCESS; - } + err -= err & completed; - stat1 &= ~(1 << channel); + mxs_chan = &mxs_dma->mxs_chans[chan]; - if (mxs_chan->status == DMA_SUCCESS) - dma_cookie_complete(&mxs_chan->desc); + if (err) { + dev_dbg(mxs_dma->dma_device.dev, + "%s: error in channel %d\n", __func__, + chan); + mxs_chan->status = DMA_ERROR; + mxs_dma_reset_chan(mxs_chan); + } else if (mxs_chan->status != DMA_COMPLETE) { + if (mxs_chan->flags & MXS_DMA_SG_LOOP) { + mxs_chan->status = DMA_IN_PROGRESS; + if (mxs_chan->flags & MXS_DMA_USE_SEMAPHORE) + writel(1, mxs_dma->base + + HW_APBHX_CHn_SEMA(mxs_dma, chan)); + } else { + mxs_chan->status = DMA_COMPLETE; + } + } - /* schedule tasklet on this channel */ - tasklet_schedule(&mxs_chan->tasklet); + if (mxs_chan->status == DMA_COMPLETE) { + if (mxs_chan->reset) + return IRQ_HANDLED; + dma_cookie_complete(&mxs_chan->desc); } + /* schedule tasklet on this channel */ + tasklet_schedule(&mxs_chan->tasklet); + return IRQ_HANDLED; } @@ -523,6 +603,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( mxs_chan->status = DMA_IN_PROGRESS; mxs_chan->flags |= MXS_DMA_SG_LOOP; + mxs_chan->flags |= MXS_DMA_USE_SEMAPHORE; if (num_periods > NUM_CCW) { dev_err(mxs_dma->dma_device.dev, @@ -554,6 +635,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic( ccw->bits |= CCW_IRQ; ccw->bits |= CCW_HALT_ON_TERM; ccw->bits |= CCW_TERM_FLUSH; + ccw->bits |= CCW_DEC_SEM; ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ? 
MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ, COMMAND); @@ -599,8 +681,24 @@ static enum dma_status mxs_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie, struct dma_tx_state *txstate) { struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan); + struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma; + u32 residue = 0; + + if (mxs_chan->status == DMA_IN_PROGRESS && + mxs_chan->flags & MXS_DMA_SG_LOOP) { + struct mxs_dma_ccw *last_ccw; + u32 bar; + + last_ccw = &mxs_chan->ccw[mxs_chan->desc_count - 1]; + residue = last_ccw->xfer_bytes + last_ccw->bufaddr; + + bar = readl(mxs_dma->base + + HW_APBHX_CHn_BAR(mxs_dma, chan->chan_id)); + residue -= bar; + } - dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, 0); + dma_set_tx_state(txstate, chan->completed_cookie, chan->cookie, + residue); return mxs_chan->status; } diff --git a/drivers/dma/omap-dma.c b/drivers/dma/omap-dma.c index ec3fc4fd916..2f66cf4e54f 100644 --- a/drivers/dma/omap-dma.c +++ b/drivers/dma/omap-dma.c @@ -248,7 +248,7 @@ static enum dma_status omap_dma_tx_status(struct dma_chan *chan, unsigned long flags; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS || !txstate) + if (ret == DMA_COMPLETE || !txstate) return ret; spin_lock_irqsave(&c->vc.lock, flags); diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index df8b10fd172..cdf0483b8f2 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -2268,6 +2268,8 @@ static void pl330_tasklet(unsigned long data) list_move_tail(&desc->node, &pch->dmac->desc_pool); } + dma_descriptor_unmap(&desc->txd); + if (callback) { spin_unlock_irqrestore(&pch->lock, flags); callback(callback_param); @@ -2314,7 +2316,7 @@ bool pl330_filter(struct dma_chan *chan, void *param) return false; peri_id = chan->private; - return *peri_id == (unsigned)param; + return *peri_id == (unsigned long)param; } EXPORT_SYMBOL(pl330_filter); @@ -2926,16 +2928,23 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) amba_set_drvdata(adev, pdmac); - irq = adev->irq[0]; - ret = request_irq(irq, pl330_irq_handler, 0, - dev_name(&adev->dev), pi); - if (ret) - return ret; + for (i = 0; i < AMBA_NR_IRQS; i++) { + irq = adev->irq[i]; + if (irq) { + ret = devm_request_irq(&adev->dev, irq, + pl330_irq_handler, 0, + dev_name(&adev->dev), pi); + if (ret) + return ret; + } else { + break; + } + } pi->pcfg.periph_id = adev->periphid; ret = pl330_add(pi); if (ret) - goto probe_err1; + return ret; INIT_LIST_HEAD(&pdmac->desc_pool); spin_lock_init(&pdmac->pool_lock); @@ -3033,8 +3042,6 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) return 0; probe_err3: - amba_set_drvdata(adev, NULL); - /* Idle the DMAC */ list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels, chan.device_node) { @@ -3048,8 +3055,6 @@ probe_err3: } probe_err2: pl330_del(pi); -probe_err1: - free_irq(irq, pi); return ret; } @@ -3059,7 +3064,6 @@ static int pl330_remove(struct amba_device *adev) struct dma_pl330_dmac *pdmac = amba_get_drvdata(adev); struct dma_pl330_chan *pch, *_p; struct pl330_info *pi; - int irq; if (!pdmac) return 0; @@ -3068,7 +3072,6 @@ static int pl330_remove(struct amba_device *adev) of_dma_controller_free(adev->dev.of_node); dma_async_device_unregister(&pdmac->ddma); - amba_set_drvdata(adev, NULL); /* Idle the DMAC */ list_for_each_entry_safe(pch, _p, &pdmac->ddma.channels, @@ -3086,9 +3089,6 @@ static int pl330_remove(struct amba_device *adev) pl330_del(pi); - irq = adev->irq[0]; - free_irq(irq, pi); - return 0; } diff --git a/drivers/dma/ppc4xx/adma.c 
b/drivers/dma/ppc4xx/adma.c index e24b5ef486b..8da48c6b2a3 100644 --- a/drivers/dma/ppc4xx/adma.c +++ b/drivers/dma/ppc4xx/adma.c @@ -804,218 +804,6 @@ static void ppc440spe_desc_set_link(struct ppc440spe_adma_chan *chan, } /** - * ppc440spe_desc_get_src_addr - extract the source address from the descriptor - */ -static u32 ppc440spe_desc_get_src_addr(struct ppc440spe_adma_desc_slot *desc, - struct ppc440spe_adma_chan *chan, int src_idx) -{ - struct dma_cdb *dma_hw_desc; - struct xor_cb *xor_hw_desc; - - switch (chan->device->id) { - case PPC440SPE_DMA0_ID: - case PPC440SPE_DMA1_ID: - dma_hw_desc = desc->hw_desc; - /* May have 0, 1, 2, or 3 sources */ - switch (dma_hw_desc->opc) { - case DMA_CDB_OPC_NO_OP: - case DMA_CDB_OPC_DFILL128: - return 0; - case DMA_CDB_OPC_DCHECK128: - if (unlikely(src_idx)) { - printk(KERN_ERR "%s: try to get %d source for" - " DCHECK128\n", __func__, src_idx); - BUG(); - } - return le32_to_cpu(dma_hw_desc->sg1l); - case DMA_CDB_OPC_MULTICAST: - case DMA_CDB_OPC_MV_SG1_SG2: - if (unlikely(src_idx > 2)) { - printk(KERN_ERR "%s: try to get %d source from" - " DMA descr\n", __func__, src_idx); - BUG(); - } - if (src_idx) { - if (le32_to_cpu(dma_hw_desc->sg1u) & - DMA_CUED_XOR_WIN_MSK) { - u8 region; - - if (src_idx == 1) - return le32_to_cpu( - dma_hw_desc->sg1l) + - desc->unmap_len; - - region = (le32_to_cpu( - dma_hw_desc->sg1u)) >> - DMA_CUED_REGION_OFF; - - region &= DMA_CUED_REGION_MSK; - switch (region) { - case DMA_RXOR123: - return le32_to_cpu( - dma_hw_desc->sg1l) + - (desc->unmap_len << 1); - case DMA_RXOR124: - return le32_to_cpu( - dma_hw_desc->sg1l) + - (desc->unmap_len * 3); - case DMA_RXOR125: - return le32_to_cpu( - dma_hw_desc->sg1l) + - (desc->unmap_len << 2); - default: - printk(KERN_ERR - "%s: try to" - " get src3 for region %02x" - "PPC440SPE_DESC_RXOR12?\n", - __func__, region); - BUG(); - } - } else { - printk(KERN_ERR - "%s: try to get %d" - " source for non-cued descr\n", - __func__, src_idx); - BUG(); - } - } - return le32_to_cpu(dma_hw_desc->sg1l); - default: - printk(KERN_ERR "%s: unknown OPC 0x%02x\n", - __func__, dma_hw_desc->opc); - BUG(); - } - return le32_to_cpu(dma_hw_desc->sg1l); - case PPC440SPE_XOR_ID: - /* May have up to 16 sources */ - xor_hw_desc = desc->hw_desc; - return xor_hw_desc->ops[src_idx].l; - } - return 0; -} - -/** - * ppc440spe_desc_get_dest_addr - extract the destination address from the - * descriptor - */ -static u32 ppc440spe_desc_get_dest_addr(struct ppc440spe_adma_desc_slot *desc, - struct ppc440spe_adma_chan *chan, int idx) -{ - struct dma_cdb *dma_hw_desc; - struct xor_cb *xor_hw_desc; - - switch (chan->device->id) { - case PPC440SPE_DMA0_ID: - case PPC440SPE_DMA1_ID: - dma_hw_desc = desc->hw_desc; - - if (likely(!idx)) - return le32_to_cpu(dma_hw_desc->sg2l); - return le32_to_cpu(dma_hw_desc->sg3l); - case PPC440SPE_XOR_ID: - xor_hw_desc = desc->hw_desc; - return xor_hw_desc->cbtal; - } - return 0; -} - -/** - * ppc440spe_desc_get_src_num - extract the number of source addresses from - * the descriptor - */ -static u32 ppc440spe_desc_get_src_num(struct ppc440spe_adma_desc_slot *desc, - struct ppc440spe_adma_chan *chan) -{ - struct dma_cdb *dma_hw_desc; - struct xor_cb *xor_hw_desc; - - switch (chan->device->id) { - case PPC440SPE_DMA0_ID: - case PPC440SPE_DMA1_ID: - dma_hw_desc = desc->hw_desc; - - switch (dma_hw_desc->opc) { - case DMA_CDB_OPC_NO_OP: - case DMA_CDB_OPC_DFILL128: - return 0; - case DMA_CDB_OPC_DCHECK128: - return 1; - case DMA_CDB_OPC_MV_SG1_SG2: - case DMA_CDB_OPC_MULTICAST: - /* - * Only 
for RXOR operations we have more than - * one source - */ - if (le32_to_cpu(dma_hw_desc->sg1u) & - DMA_CUED_XOR_WIN_MSK) { - /* RXOR op, there are 2 or 3 sources */ - if (((le32_to_cpu(dma_hw_desc->sg1u) >> - DMA_CUED_REGION_OFF) & - DMA_CUED_REGION_MSK) == DMA_RXOR12) { - /* RXOR 1-2 */ - return 2; - } else { - /* RXOR 1-2-3/1-2-4/1-2-5 */ - return 3; - } - } - return 1; - default: - printk(KERN_ERR "%s: unknown OPC 0x%02x\n", - __func__, dma_hw_desc->opc); - BUG(); - } - case PPC440SPE_XOR_ID: - /* up to 16 sources */ - xor_hw_desc = desc->hw_desc; - return xor_hw_desc->cbc & XOR_CDCR_OAC_MSK; - default: - BUG(); - } - return 0; -} - -/** - * ppc440spe_desc_get_dst_num - get the number of destination addresses in - * this descriptor - */ -static u32 ppc440spe_desc_get_dst_num(struct ppc440spe_adma_desc_slot *desc, - struct ppc440spe_adma_chan *chan) -{ - struct dma_cdb *dma_hw_desc; - - switch (chan->device->id) { - case PPC440SPE_DMA0_ID: - case PPC440SPE_DMA1_ID: - /* May be 1 or 2 destinations */ - dma_hw_desc = desc->hw_desc; - switch (dma_hw_desc->opc) { - case DMA_CDB_OPC_NO_OP: - case DMA_CDB_OPC_DCHECK128: - return 0; - case DMA_CDB_OPC_MV_SG1_SG2: - case DMA_CDB_OPC_DFILL128: - return 1; - case DMA_CDB_OPC_MULTICAST: - if (desc->dst_cnt == 2) - return 2; - else - return 1; - default: - printk(KERN_ERR "%s: unknown OPC 0x%02x\n", - __func__, dma_hw_desc->opc); - BUG(); - } - case PPC440SPE_XOR_ID: - /* Always only 1 destination */ - return 1; - default: - BUG(); - } - return 0; -} - -/** * ppc440spe_desc_get_link - get the address of the descriptor that * follows this one */ @@ -1707,43 +1495,6 @@ static void ppc440spe_adma_free_slots(struct ppc440spe_adma_desc_slot *slot, } } -static void ppc440spe_adma_unmap(struct ppc440spe_adma_chan *chan, - struct ppc440spe_adma_desc_slot *desc) -{ - u32 src_cnt, dst_cnt; - dma_addr_t addr; - - /* - * get the number of sources & destination - * included in this descriptor and unmap - * them all - */ - src_cnt = ppc440spe_desc_get_src_num(desc, chan); - dst_cnt = ppc440spe_desc_get_dst_num(desc, chan); - - /* unmap destinations */ - if (!(desc->async_tx.flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - while (dst_cnt--) { - addr = ppc440spe_desc_get_dest_addr( - desc, chan, dst_cnt); - dma_unmap_page(chan->device->dev, - addr, desc->unmap_len, - DMA_FROM_DEVICE); - } - } - - /* unmap sources */ - if (!(desc->async_tx.flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - while (src_cnt--) { - addr = ppc440spe_desc_get_src_addr( - desc, chan, src_cnt); - dma_unmap_page(chan->device->dev, - addr, desc->unmap_len, - DMA_TO_DEVICE); - } - } -} - /** * ppc440spe_adma_run_tx_complete_actions - call functions to be called * upon completion @@ -1767,26 +1518,7 @@ static dma_cookie_t ppc440spe_adma_run_tx_complete_actions( desc->async_tx.callback( desc->async_tx.callback_param); - /* unmap dma addresses - * (unmap_single vs unmap_page?) 
- * - * actually, ppc's dma_unmap_page() functions are empty, so - * the following code is just for the sake of completeness - */ - if (chan && chan->needs_unmap && desc->group_head && - desc->unmap_len) { - struct ppc440spe_adma_desc_slot *unmap = - desc->group_head; - /* assume 1 slot per op always */ - u32 slot_count = unmap->slot_cnt; - - /* Run through the group list and unmap addresses */ - for (i = 0; i < slot_count; i++) { - BUG_ON(!unmap); - ppc440spe_adma_unmap(chan, unmap); - unmap = unmap->hw_next; - } - } + dma_descriptor_unmap(&desc->async_tx); } /* run dependent operations */ @@ -3893,7 +3625,7 @@ static enum dma_status ppc440spe_adma_tx_status(struct dma_chan *chan, ppc440spe_chan = to_ppc440spe_adma_chan(chan); ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; ppc440spe_adma_slot_cleanup(ppc440spe_chan); diff --git a/drivers/dma/sa11x0-dma.c b/drivers/dma/sa11x0-dma.c index 461a91ab70b..ab26d46bbe1 100644 --- a/drivers/dma/sa11x0-dma.c +++ b/drivers/dma/sa11x0-dma.c @@ -436,7 +436,7 @@ static enum dma_status sa11x0_dma_tx_status(struct dma_chan *chan, enum dma_status ret; ret = dma_cookie_status(&c->vc.chan, cookie, state); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; if (!state) diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index d94ab592cc1..2e7b394def8 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -724,7 +724,7 @@ static enum dma_status shdma_tx_status(struct dma_chan *chan, * If we don't find cookie on the queue, it has been aborted and we have * to report error */ - if (status != DMA_SUCCESS) { + if (status != DMA_COMPLETE) { struct shdma_desc *sdesc; status = DMA_ERROR; list_for_each_entry(sdesc, &schan->ld_queue, node) diff --git a/drivers/dma/sh/shdmac.c b/drivers/dma/sh/shdmac.c index 1069e8869f2..0d765c0e21e 100644 --- a/drivers/dma/sh/shdmac.c +++ b/drivers/dma/sh/shdmac.c @@ -685,7 +685,7 @@ MODULE_DEVICE_TABLE(of, sh_dmae_of_match); static int sh_dmae_probe(struct platform_device *pdev) { const struct sh_dmae_pdata *pdata; - unsigned long irqflags = IRQF_DISABLED, + unsigned long irqflags = 0, chan_flag[SH_DMAE_MAX_CHANNELS] = {}; int errirq, chan_irq[SH_DMAE_MAX_CHANNELS]; int err, i, irq_cnt = 0, irqres = 0, irq_cap = 0; @@ -838,7 +838,7 @@ static int sh_dmae_probe(struct platform_device *pdev) IORESOURCE_IRQ_SHAREABLE) chan_flag[irq_cnt] = IRQF_SHARED; else - chan_flag[irq_cnt] = IRQF_DISABLED; + chan_flag[irq_cnt] = 0; dev_dbg(&pdev->dev, "Found IRQ %d for channel %d\n", i, irq_cnt); diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 82d2b97ad94..b8c031b7de4 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -14,6 +14,7 @@ #include <linux/platform_device.h> #include <linux/clk.h> #include <linux/delay.h> +#include <linux/log2.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/err.h> @@ -2626,7 +2627,7 @@ static enum dma_status d40_tx_status(struct dma_chan *chan, } ret = dma_cookie_status(chan, cookie, txstate); - if (ret != DMA_SUCCESS) + if (ret != DMA_COMPLETE) dma_set_residue(txstate, stedma40_residue(chan)); if (d40_is_paused(d40c)) @@ -2796,8 +2797,8 @@ static int d40_set_runtime_config(struct dma_chan *chan, src_addr_width > DMA_SLAVE_BUSWIDTH_8_BYTES || dst_addr_width <= DMA_SLAVE_BUSWIDTH_UNDEFINED || dst_addr_width > DMA_SLAVE_BUSWIDTH_8_BYTES || - ((src_addr_width > 1) && (src_addr_width & 1)) || - ((dst_addr_width > 1) && (dst_addr_width & 1))) + 
!is_power_of_2(src_addr_width) || + !is_power_of_2(dst_addr_width)) return -EINVAL; cfg->src_info.data_width = src_addr_width; diff --git a/drivers/dma/tegra20-apb-dma.c b/drivers/dma/tegra20-apb-dma.c index 5d4986e5f5f..73654e33f13 100644 --- a/drivers/dma/tegra20-apb-dma.c +++ b/drivers/dma/tegra20-apb-dma.c @@ -570,7 +570,7 @@ static void handle_once_dma_done(struct tegra_dma_channel *tdc, list_del(&sgreq->node); if (sgreq->last_sg) { - dma_desc->dma_status = DMA_SUCCESS; + dma_desc->dma_status = DMA_COMPLETE; dma_cookie_complete(&dma_desc->txd); if (!dma_desc->cb_count) list_add_tail(&dma_desc->cb_node, &tdc->cb_desc); @@ -768,7 +768,7 @@ static enum dma_status tegra_dma_tx_status(struct dma_chan *dc, unsigned int residual; ret = dma_cookie_status(dc, cookie, txstate); - if (ret == DMA_SUCCESS) + if (ret == DMA_COMPLETE) return ret; spin_lock_irqsave(&tdc->lock, flags); @@ -1018,7 +1018,7 @@ static struct dma_async_tx_descriptor *tegra_dma_prep_slave_sg( return &dma_desc->txd; } -struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic( +static struct dma_async_tx_descriptor *tegra_dma_prep_dma_cyclic( struct dma_chan *dc, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_transfer_direction direction, unsigned long flags, void *context) diff --git a/drivers/dma/timb_dma.c b/drivers/dma/timb_dma.c index 28af214fce0..4506a7b4f97 100644 --- a/drivers/dma/timb_dma.c +++ b/drivers/dma/timb_dma.c @@ -154,38 +154,6 @@ static bool __td_dma_done_ack(struct timb_dma_chan *td_chan) return done; } -static void __td_unmap_desc(struct timb_dma_chan *td_chan, const u8 *dma_desc, - bool single) -{ - dma_addr_t addr; - int len; - - addr = (dma_desc[7] << 24) | (dma_desc[6] << 16) | (dma_desc[5] << 8) | - dma_desc[4]; - - len = (dma_desc[3] << 8) | dma_desc[2]; - - if (single) - dma_unmap_single(chan2dev(&td_chan->chan), addr, len, - DMA_TO_DEVICE); - else - dma_unmap_page(chan2dev(&td_chan->chan), addr, len, - DMA_TO_DEVICE); -} - -static void __td_unmap_descs(struct timb_dma_desc *td_desc, bool single) -{ - struct timb_dma_chan *td_chan = container_of(td_desc->txd.chan, - struct timb_dma_chan, chan); - u8 *descs; - - for (descs = td_desc->desc_list; ; descs += TIMB_DMA_DESC_SIZE) { - __td_unmap_desc(td_chan, descs, single); - if (descs[0] & 0x02) - break; - } -} - static int td_fill_desc(struct timb_dma_chan *td_chan, u8 *dma_desc, struct scatterlist *sg, bool last) { @@ -293,10 +261,7 @@ static void __td_finish(struct timb_dma_chan *td_chan) list_move(&td_desc->desc_node, &td_chan->free_list); - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) - __td_unmap_descs(td_desc, - txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE); - + dma_descriptor_unmap(txd); /* * The API requires that no submissions are done from a * callback, so we don't need to drop the lock here diff --git a/drivers/dma/txx9dmac.c b/drivers/dma/txx9dmac.c index 71e8e775189..bae6c29f550 100644 --- a/drivers/dma/txx9dmac.c +++ b/drivers/dma/txx9dmac.c @@ -419,30 +419,7 @@ txx9dmac_descriptor_complete(struct txx9dmac_chan *dc, list_splice_init(&desc->tx_list, &dc->free_list); list_move(&desc->desc_node, &dc->free_list); - if (!ds) { - dma_addr_t dmaaddr; - if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { - dmaaddr = is_dmac64(dc) ? 
- desc->hwdesc.DAR : desc->hwdesc32.DAR; - if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) - dma_unmap_single(chan2parent(&dc->chan), - dmaaddr, desc->len, DMA_FROM_DEVICE); - else - dma_unmap_page(chan2parent(&dc->chan), - dmaaddr, desc->len, DMA_FROM_DEVICE); - } - if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { - dmaaddr = is_dmac64(dc) ? - desc->hwdesc.SAR : desc->hwdesc32.SAR; - if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) - dma_unmap_single(chan2parent(&dc->chan), - dmaaddr, desc->len, DMA_TO_DEVICE); - else - dma_unmap_page(chan2parent(&dc->chan), - dmaaddr, desc->len, DMA_TO_DEVICE); - } - } - + dma_descriptor_unmap(txd); /* * The API requires that no submissions are done from a * callback, so we don't need to drop the lock here @@ -962,8 +939,8 @@ txx9dmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, enum dma_status ret; ret = dma_cookie_status(chan, cookie, txstate); - if (ret == DMA_SUCCESS) - return DMA_SUCCESS; + if (ret == DMA_COMPLETE) + return DMA_COMPLETE; spin_lock_bh(&dc->lock); txx9dmac_scan_descriptors(dc); diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 7dd44615029..4e10b10d3dd 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -13,6 +13,7 @@ #include <linux/acpi_gpio.h> #include <linux/idr.h> #include <linux/slab.h> +#include <linux/acpi.h> #define CREATE_TRACE_POINTS #include <trace/events/gpio.h> diff --git a/drivers/gpu/drm/i915/intel_acpi.c b/drivers/gpu/drm/i915/intel_acpi.c index 43959edd429..dfff0907f70 100644 --- a/drivers/gpu/drm/i915/intel_acpi.c +++ b/drivers/gpu/drm/i915/intel_acpi.c @@ -196,7 +196,7 @@ static bool intel_dsm_pci_probe(struct pci_dev *pdev) acpi_handle dhandle; int ret; - dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); + dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return false; diff --git a/drivers/gpu/drm/i915/intel_opregion.c b/drivers/gpu/drm/i915/intel_opregion.c index 1b2f41c3f19..6d69a9bad86 100644 --- a/drivers/gpu/drm/i915/intel_opregion.c +++ b/drivers/gpu/drm/i915/intel_opregion.c @@ -638,7 +638,7 @@ static void intel_didl_outputs(struct drm_device *dev) u32 temp; int i = 0; - handle = DEVICE_ACPI_HANDLE(&dev->pdev->dev); + handle = ACPI_HANDLE(&dev->pdev->dev); if (!handle || acpi_bus_get_device(handle, &acpi_dev)) return; diff --git a/drivers/gpu/drm/nouveau/core/subdev/mxm/base.c b/drivers/gpu/drm/nouveau/core/subdev/mxm/base.c index e286e132c7e..129120473f6 100644 --- a/drivers/gpu/drm/nouveau/core/subdev/mxm/base.c +++ b/drivers/gpu/drm/nouveau/core/subdev/mxm/base.c @@ -116,7 +116,7 @@ mxm_shadow_dsm(struct nouveau_mxm *mxm, u8 version) acpi_handle handle; int ret; - handle = DEVICE_ACPI_HANDLE(&device->pdev->dev); + handle = ACPI_HANDLE(&device->pdev->dev); if (!handle) return false; diff --git a/drivers/gpu/drm/nouveau/nouveau_acpi.c b/drivers/gpu/drm/nouveau/nouveau_acpi.c index 07273a2ae62..95c74045404 100644 --- a/drivers/gpu/drm/nouveau/nouveau_acpi.c +++ b/drivers/gpu/drm/nouveau/nouveau_acpi.c @@ -256,7 +256,7 @@ static int nouveau_dsm_pci_probe(struct pci_dev *pdev) acpi_handle dhandle; int retval = 0; - dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); + dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return false; @@ -414,7 +414,7 @@ bool nouveau_acpi_rom_supported(struct pci_dev *pdev) if (!nouveau_dsm_priv.dsm_detected && !nouveau_dsm_priv.optimus_detected) return false; - dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); + dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return false; @@ -448,7 +448,7 @@ nouveau_acpi_edid(struct drm_device *dev, struct drm_connector *connector) 
return NULL; } - handle = DEVICE_ACPI_HANDLE(&dev->pdev->dev); + handle = ACPI_HANDLE(&dev->pdev->dev); if (!handle) return NULL; diff --git a/drivers/gpu/drm/radeon/radeon_acpi.c b/drivers/gpu/drm/radeon/radeon_acpi.c index 10f98c7742d..98a9074b306 100644 --- a/drivers/gpu/drm/radeon/radeon_acpi.c +++ b/drivers/gpu/drm/radeon/radeon_acpi.c @@ -369,7 +369,7 @@ int radeon_atif_handler(struct radeon_device *rdev, return NOTIFY_DONE; /* Check pending SBIOS requests */ - handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev); + handle = ACPI_HANDLE(&rdev->pdev->dev); count = radeon_atif_get_sbios_requests(handle, &req); if (count <= 0) @@ -556,7 +556,7 @@ int radeon_acpi_pcie_notify_device_ready(struct radeon_device *rdev) struct radeon_atcs *atcs = &rdev->atcs; /* Get the device handle */ - handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev); + handle = ACPI_HANDLE(&rdev->pdev->dev); if (!handle) return -EINVAL; @@ -596,7 +596,7 @@ int radeon_acpi_pcie_performance_request(struct radeon_device *rdev, u32 retry = 3; /* Get the device handle */ - handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev); + handle = ACPI_HANDLE(&rdev->pdev->dev); if (!handle) return -EINVAL; @@ -699,7 +699,7 @@ int radeon_acpi_init(struct radeon_device *rdev) int ret; /* Get the device handle */ - handle = DEVICE_ACPI_HANDLE(&rdev->pdev->dev); + handle = ACPI_HANDLE(&rdev->pdev->dev); /* No need to proceed if we're sure that ATIF is not supported */ if (!ASIC_IS_AVIVO(rdev) || !rdev->bios || !handle) diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c b/drivers/gpu/drm/radeon/radeon_atpx_handler.c index 6153ec18943..9d302eaeea1 100644 --- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c +++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c @@ -8,8 +8,7 @@ */ #include <linux/vga_switcheroo.h> #include <linux/slab.h> -#include <acpi/acpi.h> -#include <acpi/acpi_bus.h> +#include <linux/acpi.h> #include <linux/pci.h> #include "radeon_acpi.h" @@ -447,7 +446,7 @@ static bool radeon_atpx_pci_probe_handle(struct pci_dev *pdev) acpi_handle dhandle, atpx_handle; acpi_status status; - dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); + dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) return false; @@ -493,7 +492,7 @@ static int radeon_atpx_init(void) */ static int radeon_atpx_get_client_id(struct pci_dev *pdev) { - if (radeon_atpx_priv.dhandle == DEVICE_ACPI_HANDLE(&pdev->dev)) + if (radeon_atpx_priv.dhandle == ACPI_HANDLE(&pdev->dev)) return VGA_SWITCHEROO_IGD; else return VGA_SWITCHEROO_DIS; diff --git a/drivers/gpu/drm/radeon/radeon_bios.c b/drivers/gpu/drm/radeon/radeon_bios.c index c155d6f3fa6..b3633d9a531 100644 --- a/drivers/gpu/drm/radeon/radeon_bios.c +++ b/drivers/gpu/drm/radeon/radeon_bios.c @@ -185,7 +185,7 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev) return false; while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != NULL) { - dhandle = DEVICE_ACPI_HANDLE(&pdev->dev); + dhandle = ACPI_HANDLE(&pdev->dev); if (!dhandle) continue; diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index ae48d18ee31..5f7e55f4b7f 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -1008,7 +1008,7 @@ static int i2c_hid_probe(struct i2c_client *client, hid->hid_get_raw_report = i2c_hid_get_raw_report; hid->hid_output_raw_report = i2c_hid_output_raw_report; hid->dev.parent = &client->dev; - ACPI_HANDLE_SET(&hid->dev, ACPI_HANDLE(&client->dev)); + ACPI_COMPANION_SET(&hid->dev, ACPI_COMPANION(&client->dev)); hid->bus = BUS_I2C; hid->version = le16_to_cpu(ihid->hdesc.bcdVersion); 
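The radeon, nouveau, i915, hid, ide and i2c hunks above all belong to a single ACPI conversion: DEVICE_ACPI_HANDLE() gives way to the plain ACPI_HANDLE(), and child devices now record the typed struct acpi_device companion through ACPI_COMPANION_SET() rather than a raw acpi_handle, which is what lets i2c-core name ACPI-enumerated clients via acpi_dev_name(). A minimal sketch of the propagation pattern; example_adopt_companion() is a hypothetical helper, not a kernel API:

    #include <linux/acpi.h>
    #include <linux/device.h>

    /* Copy the parent's ACPI companion to a newly created child
     * device, in the style of the i2c-hid change above.
     */
    static void example_adopt_companion(struct device *parent,
                                        struct device *child)
    {
            struct acpi_device *adev = ACPI_COMPANION(parent); /* may be NULL */

            if (adev)
                    ACPI_COMPANION_SET(child, adev);
    }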
hid->vendor = le16_to_cpu(ihid->hdesc.wVendorID); diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 5923cfa390c..d74c0b34248 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -615,6 +615,22 @@ void i2c_unlock_adapter(struct i2c_adapter *adapter) } EXPORT_SYMBOL_GPL(i2c_unlock_adapter); +static void i2c_dev_set_name(struct i2c_adapter *adap, + struct i2c_client *client) +{ + struct acpi_device *adev = ACPI_COMPANION(&client->dev); + + if (adev) { + dev_set_name(&client->dev, "i2c-%s", acpi_dev_name(adev)); + return; + } + + /* For 10-bit clients, add an arbitrary offset to avoid collisions */ + dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), + client->addr | ((client->flags & I2C_CLIENT_TEN) + ? 0xa000 : 0)); +} + /** * i2c_new_device - instantiate an i2c device * @adap: the adapter managing the device @@ -671,12 +687,9 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info) client->dev.bus = &i2c_bus_type; client->dev.type = &i2c_client_type; client->dev.of_node = info->of_node; - ACPI_HANDLE_SET(&client->dev, info->acpi_node.handle); + ACPI_COMPANION_SET(&client->dev, info->acpi_node.companion); - /* For 10-bit clients, add an arbitrary offset to avoid collisions */ - dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap), - client->addr | ((client->flags & I2C_CLIENT_TEN) - ? 0xa000 : 0)); + i2c_dev_set_name(adap, client); status = device_register(&client->dev); if (status) goto out_err; @@ -1100,7 +1113,7 @@ static acpi_status acpi_i2c_add_device(acpi_handle handle, u32 level, return AE_OK; memset(&info, 0, sizeof(info)); - info.acpi_node.handle = handle; + info.acpi_node.companion = adev; info.irq = -1; INIT_LIST_HEAD(&resource_list); diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 140c8ef5052..d9e1f7ccfe6 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -7,6 +7,7 @@ * Copyright (C) 2006 Hannes Reinecke */ +#include <linux/acpi.h> #include <linux/ata.h> #include <linux/delay.h> #include <linux/device.h> @@ -19,8 +20,6 @@ #include <linux/dmi.h> #include <linux/module.h> -#include <acpi/acpi_bus.h> - #define REGS_PER_GTF 7 struct GTM_buffer { @@ -128,7 +127,7 @@ static int ide_get_dev_handle(struct device *dev, acpi_handle *handle, DEBPRINT("ENTER: pci %02x:%02x.%01x\n", bus, devnum, func); - dev_handle = DEVICE_ACPI_HANDLE(dev); + dev_handle = ACPI_HANDLE(dev); if (!dev_handle) { DEBPRINT("no acpi handle for device\n"); goto err; diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3226ce98fb1..cbd4e9abc47 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1,7 +1,7 @@ /* * intel_idle.c - native hardware idle loop for modern Intel processors * - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2013, Intel Corporation. 
* Len Brown <len.brown@intel.com> * * This program is free software; you can redistribute it and/or modify it @@ -329,6 +329,22 @@ static struct cpuidle_state atom_cstates[] __initdata = { { .enter = NULL } }; +static struct cpuidle_state avn_cstates[CPUIDLE_STATE_MAX] = { + { + .name = "C1-AVN", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle }, + { + .name = "C6-AVN", + .desc = "MWAIT 0x51", + .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 15, + .target_residency = 45, + .enter = &intel_idle }, +}; /** * intel_idle @@ -462,6 +478,11 @@ static const struct idle_cpu idle_cpu_hsw = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_avn = { + .state_table = avn_cstates, + .disable_promotion_to_c1e = true, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -483,6 +504,7 @@ static const struct x86_cpu_id intel_idle_ids[] = { ICPU(0x3f, idle_cpu_hsw), ICPU(0x45, idle_cpu_hsw), ICPU(0x46, idle_cpu_hsw), + ICPU(0x4D, idle_cpu_avn), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); diff --git a/drivers/md/md.c b/drivers/md/md.c index 8766eabb001..b6b7a2866c9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, @@ -130,7 +130,7 @@ static ctl_table raid_table[] = { { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { .procname = "raid", .maxlen = 0, @@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { .procname = "dev", .maxlen = 0, @@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) goto retry; } -static inline int mddev_lock(struct mddev * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + static inline int mddev_is_locked(struct mddev *mddev) { return mutex_is_locked(&mddev->reconfig_mutex); @@ -2978,7 +2986,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) for_each_mddev(mddev, tmp) { struct md_rdev *rdev2; - mddev_lock(mddev); + mddev_lock_nointr(mddev); rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && @@ -2994,7 +3002,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. 
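md.c now distinguishes two ways of taking the reconfiguration mutex: the original mddev_lock(), newly tagged __must_check, stays interruptible for user-initiated paths where returning -EINTR is acceptable, while the added mddev_lock_nointr() serves internal paths (device stop, overlap checks, ioctl completion waits) that must not fail part-way through. A sketch of the intended split; both example callers are hypothetical:

    /* User-triggered path: a pending signal may abort the operation. */
    static ssize_t example_sysfs_store(struct mddev *mddev,
                                       const char *buf, size_t len)
    {
            int err = mddev_lock(mddev);    /* may return -EINTR */

            if (err)
                    return err;
            /* ... apply the new setting under reconfig_mutex ... */
            mddev_unlock(mddev);
            return len;
    }

    /* Internal path: must make progress even when signalled. */
    static void example_stop_path(struct mddev *mddev)
    {
            mddev_lock_nointr(mddev);       /* cannot fail; may sleep */
            /* ... tear down under reconfig_mutex ... */
            mddev_unlock(mddev);
    }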
@@ -3580,6 +3588,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } + blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); @@ -5258,7 +5267,7 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { - mddev_lock(mddev); + mddev_lock_nointr(mddev); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -5291,20 +5300,35 @@ EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev) { + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } err = -EBUSY; goto out; } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. So abort - */ - mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } if (mddev->pers) { __md_stop_writes(mddev); @@ -5315,7 +5339,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; + err = 0; } out: mutex_unlock(&mddev->open_mutex); @@ -5331,20 +5355,34 @@ static int do_md_stop(struct mddev * mddev, int mode, { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > !!bdev || - mddev->sysfs_active) { + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. 
So abort - */ - mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } if (mddev->pers) { @@ -6551,7 +6589,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - mddev_lock(mddev); + mddev_lock_nointr(mddev); } } else { err = -EROFS; @@ -7361,9 +7399,6 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; for_each_mddev(mddev2, tmp) { @@ -7388,7 +7423,7 @@ void md_do_sync(struct md_thread *thread) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" @@ -7464,7 +7499,7 @@ void md_do_sync(struct md_thread *thread) last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; @@ -7501,7 +7536,8 @@ void md_do_sync(struct md_thread *thread) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - while (j >= mddev->resync_max && !kthread_should_stop()) { + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. @@ -7509,17 +7545,18 @@ void md_do_sync(struct md_thread *thread) flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -7556,10 +7593,8 @@ void md_do_sync(struct md_thread *thread) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -7582,11 +7617,12 @@ void md_do_sync(struct md_thread *thread) } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); @@ -7640,16 +7676,6 @@ void md_do_sync(struct md_thread *thread) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... 
exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); @@ -7894,6 +7920,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* success...*/ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index af6681b1977..1e5a540995e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -66,7 +66,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector); static void lower_barrier(struct r1conf *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) } #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; + sector_t start_next_window = r1_bio->start_next_window; + sector_t bi_sector = bio->bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) bio->bi_phys_segments--; done = (bio->bi_phys_segments == 0); spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * make_request() might be waiting for + * bi_phys_segments to decrease + */ + wake_up(&conf->wait_barrier); } else done = 1; @@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf); + allow_barrier(conf, start_next_window, bi_sector); } } @@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) * there is no normal IO happening. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 - static void raise_barrier(struct r1conf *conf) { spin_lock_irq(&conf->resync_lock); @@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) /* block any new IO from starting */ conf->barrier++; - /* Now wait for all pending IO to complete */ + /* We must wait for all of these conditions: + * A: while the array is in the frozen state + * B: while barrier >= RESYNC_DEPTH, meaning resync has reached + * the maximum allowed depth. + * C: next_resync + RESYNC_SECTORS > start_next_window, meaning the + * next resync will reach the window in which normal bios are + * being handled. + */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !conf->array_frozen && + conf->barrier < RESYNC_DEPTH && + (conf->start_next_window >= + conf->next_resync + RESYNC_SECTORS), conf->resync_lock); spin_unlock_irq(&conf->resync_lock); @@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r1conf *conf) +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) { + bool wait = false; + + if (conf->array_frozen || !bio) + wait = true; + else if (conf->barrier && bio_data_dir(bio) == WRITE) { + if (conf->next_resync < RESYNC_WINDOW_SECTORS) + wait = true; + else if ((conf->next_resync - RESYNC_WINDOW_SECTORS + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector)) + wait = false; + else + wait = true; + } + + return wait; +} + +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +{ + sector_t sector = 0; + spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + if (need_to_wait_for_sync(conf, bio)) { conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending @@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) * count down. */ wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && + !conf->array_frozen && + (!conf->barrier || + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && current->bio_list && - !bio_list_empty(current->bio_list)), + !bio_list_empty(current->bio_list))), conf->resync_lock); conf->nr_waiting--; } + + if (bio && bio_data_dir(bio) == WRITE) { + if (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector) { + if (conf->start_next_window == MaxSector) + conf->start_next_window = + conf->next_resync + + NEXT_NORMALIO_DISTANCE; + + if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) + <= bio->bi_sector) + conf->next_window_requests++; + else + conf->current_window_requests++; + } + if (bio->bi_sector >= conf->start_next_window) + sector = conf->start_next_window; + } + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return sector; } -static void allow_barrier(struct r1conf *conf) +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector) { unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; + if (start_next_window) { + if (start_next_window == conf->start_next_window) { + if (conf->start_next_window + NEXT_NORMALIO_DISTANCE + <= bi_sector) + conf->next_window_requests--; + else + conf->current_window_requests--; + } else + conf->current_window_requests--; + + if (!conf->current_window_requests) { + if (conf->next_window_requests) { + conf->current_window_requests = + conf->next_window_requests; + conf->next_window_requests = 0; + conf->start_next_window += + NEXT_NORMALIO_DISTANCE; + } else + conf->start_next_window = MaxSector; + } + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); }
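With the constants above expanded: RESYNC_BLOCK_SIZE is 64KiB, so RESYNC_WINDOW is 64KiB * 32 = 2MiB, RESYNC_WINDOW_SECTORS is 4096 512-byte sectors, and NEXT_NORMALIO_DISTANCE is 12288 sectors. The sketch below is a standalone userspace model of the current/next window bookkeeping that the new wait_barrier()/allow_barrier() pair maintains; the struct, helper names and the scenario in main() are invented for illustration, and all locking, wait queues and bio handling are omitted.

    /*
     * Standalone model of the raid1 resync window accounting shown above.
     * Plain integers stand in for the conf fields; illustrative only,
     * not the kernel implementation.
     */
    #include <assert.h>
    #include <stdio.h>

    #define SECTOR_SHIFT            9
    #define RESYNC_BLOCK_SIZE       (64 * 1024)
    #define RESYNC_DEPTH            32
    #define RESYNC_WINDOW           (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) /* 2 MiB */
    #define RESYNC_WINDOW_SECTORS   (RESYNC_WINDOW >> SECTOR_SHIFT)    /* 4096  */
    #define NEXT_NORMALIO_DISTANCE  (3 * RESYNC_WINDOW_SECTORS)        /* 12288 */
    #define MAX_SECTOR              (~0ULL)   /* stands in for MaxSector */

    struct model {
        unsigned long long start_next_window; /* MAX_SECTOR: no window open */
        int current_window_requests;
        int next_window_requests;
        unsigned long long next_resync;
    };

    /* What wait_barrier() does for a WRITE well ahead of the resync point. */
    static unsigned long long issue_write(struct model *m,
                                          unsigned long long sector)
    {
        if (m->next_resync + NEXT_NORMALIO_DISTANCE <= sector) {
            if (m->start_next_window == MAX_SECTOR)
                m->start_next_window =
                    m->next_resync + NEXT_NORMALIO_DISTANCE;
            if (m->start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
                m->next_window_requests++;
            else
                m->current_window_requests++;
        }
        return sector >= m->start_next_window ? m->start_next_window : 0;
    }

    /* What allow_barrier() does when that write completes. */
    static void complete_write(struct model *m,
                               unsigned long long start_next_window,
                               unsigned long long sector)
    {
        if (!start_next_window)
            return;
        if (start_next_window == m->start_next_window &&
            m->start_next_window + NEXT_NORMALIO_DISTANCE <= sector)
            m->next_window_requests--;
        else
            m->current_window_requests--;
        if (!m->current_window_requests) {
            if (m->next_window_requests) {
                /* promote the "next" window to "current" */
                m->current_window_requests = m->next_window_requests;
                m->next_window_requests = 0;
                m->start_next_window += NEXT_NORMALIO_DISTANCE;
            } else
                m->start_next_window = MAX_SECTOR; /* nothing in flight */
        }
    }

    int main(void)
    {
        struct model m = { MAX_SECTOR, 0, 0, 0 };
        unsigned long long w1 = issue_write(&m, 20000); /* current window */
        unsigned long long w2 = issue_write(&m, 30000); /* next window    */

        printf("start=%llu cur=%d next=%d\n", m.start_next_window,
               m.current_window_requests, m.next_window_requests);
        complete_write(&m, w1, 20000);  /* window advances by 12288 */
        printf("start=%llu cur=%d next=%d\n", m.start_next_window,
               m.current_window_requests, m.next_window_requests);
        complete_write(&m, w2, 30000);  /* last write gone: window closes */
        assert(m.start_next_window == MAX_SECTOR);
        return 0;
    }

The property the model exercises is the one raise_barrier() now waits on: once current_window_requests drains to zero, the window either advances by NEXT_NORMALIO_DISTANCE, adopting the writers already counted against the next window, or collapses back to MaxSector when no such writes remain.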
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quiet. - * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+extra + * We wait until nr_pending matches nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for @@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) * we continue. */ spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; + conf->array_frozen = 1; wait_event_lock_irq_cmd(conf->wait_barrier, conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, @@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; + conf->array_frozen = 0; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } @@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) int first_clone; int sectors_handled; int max_sectors; + sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); bitmap = mddev->bitmap; @@ -1163,6 +1247,7 @@ read_again: disks = conf->raid_disks * 2; retry_write: + r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1231,14 +1316,24 @@ read_again: if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; + sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf); + allow_barrier(conf, start_next_window, bio->bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); + /* + * We must make sure the multiple r1bios of a bio all have + * the same value of bi_phys_segments + */ + if (bio->bi_phys_segments && old && + old != start_next_window) + /* Wait for the former r1bio(s) to complete */ + wait_event(conf->wait_barrier, + bio->bi_phys_segments == 1); goto retry_write; } @@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + + conf->next_resync = 0; + conf->start_next_window = MaxSector; } static int raid1_spare_active(struct mddev *mddev) @@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->start_next_window = MaxSector; + conf->current_window_requests = conf->next_window_requests = 0; + err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) atomic_read(&bitmap->behind_writes) == 0); } - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) @@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) wake_up(&conf->wait_barrier); break; case 1: - raise_barrier(conf); + freeze_array(conf, 0); break; case 0: - lower_barrier(conf); + unfreeze_array(conf); break; } } @@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) mddev->new_chunk_sectors = 0; conf = setup_conf(mddev); if (!IS_ERR(conf)) - conf->barrier = 1; + /* Array must appear to be quiesced */ + conf->array_frozen = 1; return conf; } return ERR_PTR(-EINVAL);
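Note that stop() and raid1_quiesce() above now freeze the array instead of raising the resync barrier. A minimal model of the predicate freeze_array() sleeps on follows; the struct and the counter values are invented for illustration and stand in for conf->nr_pending and conf->nr_queued.

    /*
     * Illustration of the freeze_array()/unfreeze_array() contract: the
     * array counts as quiesced once every pending request is accounted
     * for on the retry queue.  Hypothetical userspace stand-in.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct model {
        bool array_frozen;
        int nr_pending;  /* requests issued against the array      */
        int nr_queued;   /* failed requests parked for the daemon  */
    };

    /* Predicate freeze_array(conf, extra) waits on under resync_lock. */
    static bool quiesced(const struct model *m, int extra)
    {
        return m->nr_pending == m->nr_queued + extra;
    }

    int main(void)
    {
        struct model m = { .array_frozen = true,
                           .nr_pending = 3, .nr_queued = 2 };

        /* A caller that is itself one pending, failed request: */
        printf("freeze with extra=1: %s\n", quiesced(&m, 1) ? "ok" : "wait");
        /* A caller with no request of its own would still wait: */
        printf("freeze with extra=0: %s\n", quiesced(&m, 0) ? "ok" : "wait");
        return 0;
    }

The extra argument exists because, as the comment above explains, freeze_array() runs in the context of one failed normal IO request that is itself still counted in nr_pending; such callers pass extra == 1, while paths with no request of their own, such as stop() here, pass 0.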
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0ff3715fb7e..9bebca7bff2 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -41,6 +41,19 @@ struct r1conf { */ sector_t next_resync; + /* When raid1 starts resync, we divide the array into four partitions + * |---------|--------------|---------------------|-------------| + * next_resync start_next_window end_window + * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE + * end_window = start_next_window + NEXT_NORMALIO_DISTANCE + * current_window_requests means the count of normal IO between + * start_next_window and end_window. + * next_window_requests means the count of normal IO after end_window. + */ + sector_t start_next_window; + int current_window_requests; + int next_window_requests; + spinlock_t device_lock; /* list of 'struct r1bio' that need to be processed by raid1d, @@ -65,6 +78,7 @@ struct r1conf { int nr_waiting; int nr_queued; int barrier; + int array_frozen; /* Set to 1 if a full sync is needed, (fresh device added). * Cleared when a sync completes. @@ -111,6 +125,7 @@ struct r1bio { * in this BehindIO request */ sector_t sector; + sector_t start_next_window; int sectors; unsigned long state; struct mddev *mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7c3508abb5e..c504e8389e6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + allow_barrier(conf); + return sectors_done; + } conf->reshape_safe = mddev->reshape_position; allow_barrier(conf); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7f0e17a27ae..47da0af6322 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) return &conf->stripe_hashtbl[hash]; } +static inline int stripe_hash_locks_hash(sector_t sect) +{ + return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + local_irq_disable(); + spin_lock(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS; i; i--) + spin_unlock(conf->hash_locks + i - 1); + local_irq_enable(); +} + /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and * a bio could span several devices.
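These helpers split the stripe cache into NR_STRIPE_HASH_LOCKS (eight) independently locked partitions keyed by stripe sector, so releases and lookups on unrelated stripes stop serializing on device_lock alone. A standalone sketch of the mapping, assuming 4KiB pages so that STRIPE_SHIFT (PAGE_SHIFT - 9 in raid5.h) works out to 3; values are illustrative:

    /*
     * Sketch of the sector -> hash-lock mapping introduced above.
     * Consecutive 4KiB stripes land on consecutive locks, spreading
     * contention across the eight hash locks.  Illustrative only.
     */
    #include <stdio.h>

    #define STRIPE_SHIFT            3   /* PAGE_SHIFT - 9 with 4KiB pages */
    #define NR_STRIPE_HASH_LOCKS    8
    #define STRIPE_HASH_LOCKS_MASK  (NR_STRIPE_HASH_LOCKS - 1)

    static int stripe_hash_locks_hash(unsigned long long sect)
    {
        return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
    }

    int main(void)
    {
        /* eight consecutive 4KiB stripes map to eight different locks */
        for (unsigned long long s = 0; s < 64; s += 8)
            printf("sector %llu -> hash %d\n", s, stripe_hash_locks_hash(s));
        return 0;
    }

The lock order is fixed: a hash lock is always taken before device_lock, and lock_all_device_hash_locks_irq() acquires the eight hash locks in ascending order with hash_locks[0] as the spin_lock_nest_lock() reference, which is what keeps lockdep from flagging the nested acquisition of eight locks of the same class.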
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) } } -static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); @@ -278,23 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + if (!test_bit(STRIPE_EXPANDING, &sh->state)) + list_add_tail(&sh->lru, temp_inactive_list); } } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { if (atomic_dec_and_test(&sh->count)) - do_release_stripe(conf, sh); + do_release_stripe(conf, sh, temp_inactive_list); +} + +/* + * @hash could be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list + * holds one list per hash lock + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * a given time. Adding stripes only takes the device lock, while deleting + * stripes only takes the hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) +{ + int size; + bool do_wakeup = false; + unsigned long flags; + + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; + } + + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } } /* should hold conf->device_lock already */ -static int release_stripe_list(struct r5conf *conf) +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) { struct stripe_head *sh; int count = 0; @@ -303,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); while (head) { + int hash; + sh = llist_entry(head, struct stripe_head, release_list); head = llist_next(head); /* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); /* * Don't worry the bit is set here, because if the bit is set * again, the count is always > 1. This is true for * STRIPE_ON_UNPLUG_LIST bit too.
*/ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); count++; } @@ -324,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; + struct list_head list; + int hash; bool wakeup; - if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; wakeup = llist_add(&sh->release_list, &conf->released_stripes); if (wakeup) @@ -336,8 +424,11 @@ slow_path: local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { - do_release_stripe(conf, sh); + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); spin_unlock(&conf->device_lock); + release_inactive_stripe_list(conf, &list, hash); } local_irq_restore(flags); } @@ -362,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(struct r5conf *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh = NULL; struct list_head *first; - if (list_empty(&conf->inactive_list)) + if (list_empty(conf->inactive_list + hash)) goto out; - first = conf->inactive_list.next; + first = (conf->inactive_list + hash)->next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); out: return sh; } @@ -416,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { struct r5conf *conf = sh->raid_conf; - int i; + int i, seq; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -426,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector); remove_hash(sh); - +retry: + seq = read_seqcount_begin(&conf->gen_lock); sh->generation = conf->generation - previous; sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; @@ -448,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) dev->flags = 0; raid5_build_block(sh, i, previous); } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; insert_hash(conf, sh); sh->cpu = smp_processor_id(); } @@ -552,29 +649,31 @@ get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; + int hash = stripe_hash_locks_hash(sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - spin_lock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0 || noquiesce, - conf->device_lock); + *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) - sh = get_free_stripe(conf); + sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock); + wait_event_lock_irq( + conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes * 3 / 4) + || !conf->inactive_blocked), + *(conf->hash_locks + hash)); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); @@ -585,9 +684,11 @@ get_active_stripe(struct r5conf *conf, sector_t sector, && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); } else { + spin_lock(&conf->device_lock); if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); if (list_empty(&sh->lru) && + !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state) && !test_bit(STRIPE_EXPANDING, &sh->state)) BUG(); list_del_init(&sh->lru); @@ -595,6 +696,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, sh->group->stripes_cnt--; sh->group = NULL; } + spin_unlock(&conf->device_lock); } } } while (sh == NULL); @@ -602,7 +704,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (sh) atomic_inc(&sh->count); - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -758,7 +860,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) - bi->bi_rw |= REQ_FLUSH; + bi->bi_rw |= REQ_NOMERGE; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1582,7 +1684,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static int grow_one_stripe(struct r5conf *conf) +static int grow_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); @@ -1598,6 +1700,7 @@ static int grow_one_stripe(struct r5conf *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->hash_lock_index = hash; /* we just created an active stripe so... 
*/ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -1610,6 +1713,7 @@ static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); + int hash; if (conf->mddev->gendisk) sprintf(conf->cache_name[0], @@ -1627,9 +1731,13 @@ static int grow_stripes(struct r5conf *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (num--) { + if (!grow_one_stripe(conf, hash)) return 1; + conf->max_nr_stripes++; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; + } return 0; } @@ -1687,6 +1795,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) int err; struct kmem_cache *sc; int i; + int hash, cnt; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ @@ -1726,19 +1835,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) * OK, we have enough stripes, start collecting inactive * stripes and copying them over */ + hash = 0; + cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); for(i=0; i<conf->pool_size; i++) nsh->dev[i].page = osh->dev[i].page; for( ; i<newsize; i++) nsh->dev[i].page = NULL; + nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } } kmem_cache_destroy(conf->slab_cache); @@ -1797,13 +1916,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) return err; } -static int drop_one_stripe(struct r5conf *conf) +static int drop_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); @@ -1815,8 +1934,10 @@ static int drop_one_stripe(struct r5conf *conf) static void shrink_stripes(struct r5conf *conf) { - while (drop_one_stripe(conf)) - ; + int hash; + for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) + while (drop_one_stripe(conf, hash)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -1921,6 +2042,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), bdn); else retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; if (retry) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); @@ -3900,7 +4024,8 @@ static void raid5_activate_delayed(struct r5conf *conf) } } -static void activate_bit_delay(struct r5conf *conf) +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) { /* device_lock is held */ struct list_head head; @@ -3908,9 +4033,11 @@ static void activate_bit_delay(struct r5conf 
*conf) list_del_init(&conf->bitmap_list); while (!list_empty(&head)) { struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; list_del_init(&sh->lru); atomic_inc(&sh->count); - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); } } @@ -3926,7 +4053,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 1; if (conf->quiesce) return 1; - if (list_empty_careful(&conf->inactive_list)) + if (atomic_read(&conf->empty_inactive_list_nr)) return 1; return 0; @@ -4256,6 +4383,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct raid5_plug_cb { struct blk_plug_cb cb; struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; }; static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) @@ -4266,6 +4394,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; int cnt = 0; + int hash; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4283,11 +4412,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_RELEASE_LIST could be set here. In that * case, the count is always > 1 here */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); cnt++; } spin_unlock_irq(&conf->device_lock); } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); if (mddev->queue) trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); @@ -4308,8 +4440,12 @@ static void release_stripe_plug(struct mddev *mddev, cb = container_of(blk_cb, struct raid5_plug_cb, cb); - if (cb->list.next == NULL) + if (cb->list.next == NULL) { + int i; INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); @@ -4692,14 +4828,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); @@ -4782,7 +4923,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... 
*/ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; @@ -4790,13 +4934,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } +ret: return reshape_sectors; } @@ -4954,27 +5101,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } static int handle_active_stripes(struct r5conf *conf, int group, - struct r5worker *worker) + struct r5worker *worker, + struct list_head *temp_inactive_list) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; - int i, batch_size = 0; + int i, batch_size = 0, hash; + bool release_inactive = false; while (batch_size < MAX_STRIPE_BATCH && (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; - if (batch_size == 0) - return batch_size; + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) + return batch_size; + release_inactive = true; + } spin_unlock_irq(&conf->device_lock); + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); cond_resched(); spin_lock_irq(&conf->device_lock); - for (i = 0; i < batch_size; i++) - __release_stripe(conf, batch[i]); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } return batch_size; } @@ -4995,9 +5160,10 @@ static void raid5_do_work(struct work_struct *work) while (1) { int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, worker->temp_inactive_list); - batch_size = handle_active_stripes(conf, group_id, worker); + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); worker->working = false; if (!batch_size && !released) break; @@ -5036,7 +5202,7 @@ static void raid5d(struct md_thread *thread) struct bio *bio; int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, conf->temp_inactive_list); if ( !list_empty(&conf->bitmap_list)) { @@ -5046,7 +5212,7 @@ static void raid5d(struct md_thread *thread) bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; - activate_bit_delay(conf); + activate_bit_delay(conf, conf->temp_inactive_list); } raid5_activate_delayed(conf); @@ -5060,7 +5226,8 @@ static void raid5d(struct md_thread *thread) handled++; } - batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); if (!batch_size && !released) break; handled += batch_size; @@ -5096,22 +5263,29 @@ 
raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; int err; + int hash; if (size <= 16 || size > 32768) return -EINVAL; + hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) + if (drop_one_stripe(conf, hash)) conf->max_nr_stripes--; else break; + hash--; + if (hash < 0) + hash = NR_STRIPE_HASH_LOCKS - 1; } err = md_allow_write(mddev); if (err) return err; + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) + if (grow_one_stripe(conf, hash)) conf->max_nr_stripes++; else break; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; } return 0; }
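The resize loops above, like grow_stripes() and shrink_stripes() earlier in the patch, walk the hash space round-robin so that each hash's share of the stripe cache stays within one stripe of the others. A standalone model of that placement, tracking counts only; the 256 is the usual NR_STRIPES default and the scenario is invented:

    /*
     * Model of the round-robin stripe placement used by the grow and
     * shrink paths above: per-hash counts never differ by more than
     * one.  Standalone sketch, not kernel code.
     */
    #include <assert.h>
    #include <stdio.h>

    #define NR_STRIPE_HASH_LOCKS 8

    static int count[NR_STRIPE_HASH_LOCKS];
    static int max_nr_stripes;

    static void grow_one(void)
    {
        count[max_nr_stripes % NR_STRIPE_HASH_LOCKS]++; /* next hash in line */
        max_nr_stripes++;
    }

    static void shrink_one(void)
    {
        max_nr_stripes--;
        count[max_nr_stripes % NR_STRIPE_HASH_LOCKS]--; /* reverse order */
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 256; i++)   /* assumed NR_STRIPES default */
            grow_one();
        for (i = 0; i < 100; i++)
            shrink_one();
        for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
            printf("hash %d: %d stripes\n", i, count[i]);
        /* the per-hash counts stay balanced to within one stripe */
        for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
            assert(count[i] - count[0] <= 1 && count[0] - count[i] <= 1);
        return 0;
    }

Shrinking retraces the grow sequence in reverse, starting from (max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS, which is why each while loop above re-derives its starting hash from max_nr_stripes before iterating.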
conf; group->workers = workers + i * cnt; for (j = 0; j < cnt; j++) { - group->workers[j].group = group; - INIT_WORK(&group->workers[j].work, raid5_do_work); + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); } } @@ -5444,6 +5633,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct md_rdev *rdev; struct disk_info *disk; char pers_name[6]; + int i; + int group_cnt, worker_cnt_per_group; + struct r5worker_group *new_group; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -5478,7 +5670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf == NULL) goto abort; /* Don't enable multi-threading by default*/ - if (alloc_thread_groups(conf, 0)) + if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, + &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_group; + } else goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); @@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + /* We init hash_locks[0] separately to that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. 
@@ -5488,7 +5685,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -5514,6 +5710,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + /* We init hash_locks[0] separately so that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. + */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + conf->level = mddev->new_level; if (raid5_alloc_percpu(conf) != 0) goto abort; @@ -5554,7 +5765,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) else conf->max_degraded = 1; conf->algorithm = mddev->new_layout; - conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; @@ -5563,7 +5773,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, NR_STRIPES)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -6369,12 +6580,18 @@ static int raid5_start_reshape(struct mddev *mddev) if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; smp_wmb(); + conf->generation--; conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -6462,27 +6679,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) break; case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain */ conf->quiesce = 2; - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock); + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); conf->quiesce = 1; - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); break; } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index b42e6b462ed..01ad8ae8f57 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -205,6 +205,7 @@ struct stripe_head { short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ @@ -367,9 +368,18 @@ struct disk_info { struct md_rdev *rdev, *replacement; }; +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + struct r5worker { struct work_struct work; struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; bool working; }; @@ -382,6 +392,8 @@ struct r5worker_group { struct r5conf { struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; int level, algorithm; @@ -462,7 +474,8 @@ struct r5conf { * Free stripes pool */ atomic_t active_stripes; - struct list_head inactive_list; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; @@ -477,6 +490,7 @@ struct r5conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; int worker_cnt_per_group; diff --git a/drivers/media/platform/m2m-deinterlace.c b/drivers/media/platform/m2m-deinterlace.c index 36513e89641..65cab70fefc 100644 --- a/drivers/media/platform/m2m-deinterlace.c +++ b/drivers/media/platform/m2m-deinterlace.c @@ -341,8 +341,7 @@ static void deinterlace_issue_dma(struct deinterlace_ctx *ctx, int op, ctx->xt->dir = DMA_MEM_TO_MEM; ctx->xt->src_sgl = false; ctx->xt->dst_sgl = true; - flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT | - DMA_COMPL_SKIP_DEST_UNMAP | DMA_COMPL_SKIP_SRC_UNMAP; + flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT; tx = dmadev->device_prep_interleaved_dma(chan, ctx->xt, flags); if (tx == NULL) { diff --git a/drivers/media/platform/timblogiw.c b/drivers/media/platform/timblogiw.c index 6a74ce040d2..ccdadd623a3 100644 --- a/drivers/media/platform/timblogiw.c +++ b/drivers/media/platform/timblogiw.c @@ -565,7 +565,7 @@ static void buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb) desc = dmaengine_prep_slave_sg(fh->chan, buf->sg, sg_elems, DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); + DMA_PREP_INTERRUPT); if (!desc) { spin_lock_irq(&fh->queue_lock); list_del_init(&vb->queue); diff --git a/drivers/misc/carma/carma-fpga.c b/drivers/misc/carma/carma-fpga.c index 08b18f3f526..9e2b985293f 100644 --- a/drivers/misc/carma/carma-fpga.c +++ b/drivers/misc/carma/carma-fpga.c @@ -633,8 +633,7 @@ static int data_submit_dma(struct fpga_device *priv, struct data_buf *buf) struct dma_async_tx_descriptor *tx; dma_cookie_t cookie; dma_addr_t dst, src; - unsigned long dma_flags = DMA_COMPL_SKIP_DEST_UNMAP | - DMA_COMPL_SKIP_SRC_UNMAP; + unsigned long dma_flags = 0; dst_sg = buf->vb.sglist; dst_nents = buf->vb.sglen; diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index ef8956568c3..157b570ba34 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -308,8 +308,7 @@ static void sdio_acpi_set_handle(struct sdio_func *func) struct mmc_host *host = func->card->host; u64 addr = (host->slotno << 16) | func->num; - ACPI_HANDLE_SET(&func->dev, - acpi_get_child(ACPI_HANDLE(host->parent), addr)); + acpi_preset_companion(&func->dev, ACPI_HANDLE(host->parent), addr); } #else static inline void sdio_acpi_set_handle(struct sdio_func *func) {} diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c index 
d78a97d4153..59f08c44abd 100644 --- a/drivers/mtd/nand/atmel_nand.c +++ b/drivers/mtd/nand/atmel_nand.c @@ -375,8 +375,7 @@ static int atmel_nand_dma_op(struct mtd_info *mtd, void *buf, int len, dma_dev = host->dma_chan->device; - flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP | - DMA_COMPL_SKIP_DEST_UNMAP; + flags = DMA_CTRL_ACK | DMA_PREP_INTERRUPT; phys_addr = dma_map_single(dma_dev->dev, p, len, dir); if (dma_mapping_error(dma_dev->dev, phys_addr)) { diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index 3dc1a7564d8..8b2752263db 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -573,8 +573,6 @@ static int dma_xfer(struct fsmc_nand_data *host, void *buffer, int len, dma_dev = chan->device; dma_addr = dma_map_single(dma_dev->dev, buffer, len, direction); - flags |= DMA_COMPL_SKIP_SRC_UNMAP | DMA_COMPL_SKIP_DEST_UNMAP; - if (direction == DMA_TO_DEVICE) { dma_src = dma_addr; dma_dst = host->data_pa; diff --git a/drivers/net/ethernet/micrel/ks8842.c b/drivers/net/ethernet/micrel/ks8842.c index 0951f7aca1e..822616e3c37 100644 --- a/drivers/net/ethernet/micrel/ks8842.c +++ b/drivers/net/ethernet/micrel/ks8842.c @@ -459,8 +459,7 @@ static int ks8842_tx_frame_dma(struct sk_buff *skb, struct net_device *netdev) sg_dma_len(&ctl->sg) += 4 - sg_dma_len(&ctl->sg) % 4; ctl->adesc = dmaengine_prep_slave_sg(ctl->chan, - &ctl->sg, 1, DMA_MEM_TO_DEV, - DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); + &ctl->sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT); if (!ctl->adesc) return NETDEV_TX_BUSY; @@ -571,8 +570,7 @@ static int __ks8842_start_new_rx_dma(struct net_device *netdev) sg_dma_len(sg) = DMA_BUFFER_SIZE; ctl->adesc = dmaengine_prep_slave_sg(ctl->chan, - sg, 1, DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP); + sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT); if (!ctl->adesc) goto out; diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 12a9e83c008..d0222f13d15 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -1034,10 +1034,9 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset, struct dma_chan *chan = qp->dma_chan; struct dma_device *device; size_t pay_off, buff_off; - dma_addr_t src, dest; + struct dmaengine_unmap_data *unmap; dma_cookie_t cookie; void *buf = entry->buf; - unsigned long flags; entry->len = len; @@ -1045,35 +1044,49 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset, goto err; if (len < copy_bytes) - goto err1; + goto err_wait; device = chan->device; pay_off = (size_t) offset & ~PAGE_MASK; buff_off = (size_t) buf & ~PAGE_MASK; if (!is_dma_copy_aligned(device, pay_off, buff_off, len)) - goto err1; + goto err_wait; - dest = dma_map_single(device->dev, buf, len, DMA_FROM_DEVICE); - if (dma_mapping_error(device->dev, dest)) - goto err1; + unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT); + if (!unmap) + goto err_wait; - src = dma_map_single(device->dev, offset, len, DMA_TO_DEVICE); - if (dma_mapping_error(device->dev, src)) - goto err2; + unmap->len = len; + unmap->addr[0] = dma_map_page(device->dev, virt_to_page(offset), + pay_off, len, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev, unmap->addr[0])) + goto err_get_unmap; + + unmap->to_cnt = 1; - flags = DMA_COMPL_DEST_UNMAP_SINGLE | DMA_COMPL_SRC_UNMAP_SINGLE | - DMA_PREP_INTERRUPT; - txd = device->device_prep_dma_memcpy(chan, dest, src, len, flags); + unmap->addr[1] = dma_map_page(device->dev, virt_to_page(buf), + buff_off, len, 
DMA_FROM_DEVICE); + if (dma_mapping_error(device->dev, unmap->addr[1])) + goto err_get_unmap; + + unmap->from_cnt = 1; + + txd = device->device_prep_dma_memcpy(chan, unmap->addr[1], + unmap->addr[0], len, + DMA_PREP_INTERRUPT); if (!txd) - goto err3; + goto err_get_unmap; txd->callback = ntb_rx_copy_callback; txd->callback_param = entry; + dma_set_unmap(txd, unmap); cookie = dmaengine_submit(txd); if (dma_submit_error(cookie)) - goto err3; + goto err_set_unmap; + + dmaengine_unmap_put(unmap); qp->last_cookie = cookie; @@ -1081,11 +1094,11 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset, return; -err3: - dma_unmap_single(device->dev, src, len, DMA_TO_DEVICE); -err2: - dma_unmap_single(device->dev, dest, len, DMA_FROM_DEVICE); -err1: +err_set_unmap: + dmaengine_unmap_put(unmap); +err_get_unmap: + dmaengine_unmap_put(unmap); +err_wait: /* If the callbacks come out of order, the writing of the index to the * last completed will be out of order. This may result in the * receive stalling forever. @@ -1245,12 +1258,12 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, struct dma_chan *chan = qp->dma_chan; struct dma_device *device; size_t dest_off, buff_off; - dma_addr_t src, dest; + struct dmaengine_unmap_data *unmap; + dma_addr_t dest; dma_cookie_t cookie; void __iomem *offset; size_t len = entry->len; void *buf = entry->buf; - unsigned long flags; offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index; hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header); @@ -1273,28 +1286,41 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, if (!is_dma_copy_aligned(device, buff_off, dest_off, len)) goto err; - src = dma_map_single(device->dev, buf, len, DMA_TO_DEVICE); - if (dma_mapping_error(device->dev, src)) + unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT); + if (!unmap) goto err; - flags = DMA_COMPL_SRC_UNMAP_SINGLE | DMA_PREP_INTERRUPT; - txd = device->device_prep_dma_memcpy(chan, dest, src, len, flags); + unmap->len = len; + unmap->addr[0] = dma_map_page(device->dev, virt_to_page(buf), + buff_off, len, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev, unmap->addr[0])) + goto err_get_unmap; + + unmap->to_cnt = 1; + + txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len, + DMA_PREP_INTERRUPT); if (!txd) - goto err1; + goto err_get_unmap; txd->callback = ntb_tx_copy_callback; txd->callback_param = entry; + dma_set_unmap(txd, unmap); cookie = dmaengine_submit(txd); if (dma_submit_error(cookie)) - goto err1; + goto err_set_unmap; + + dmaengine_unmap_put(unmap); dma_async_issue_pending(chan); qp->tx_async++; return; -err1: - dma_unmap_single(device->dev, src, len, DMA_TO_DEVICE); +err_set_unmap: + dmaengine_unmap_put(unmap); +err_get_unmap: + dmaengine_unmap_put(unmap); err: ntb_memcpy_tx(entry, offset); qp->tx_memcpy++; diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 1ce8ee054f1..a94d850ae22 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -367,7 +367,7 @@ int acpi_get_hp_hw_control_from_firmware(struct pci_dev *pdev, u32 flags) string = (struct acpi_buffer){ ACPI_ALLOCATE_BUFFER, NULL }; } - handle = DEVICE_ACPI_HANDLE(&pdev->dev); + handle = ACPI_HANDLE(&pdev->dev); if (!handle) { /* * This hotplug controller was not listed in the ACPI name diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index 26100f510b1..1592dbe4f90 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -176,7 +176,6 @@ u8 
acpiphp_get_latch_status(struct acpiphp_slot *slot); u8 acpiphp_get_adapter_status(struct acpiphp_slot *slot); /* variables */ -extern bool acpiphp_debug; extern bool acpiphp_disabled; #endif /* _ACPIPHP_H */ diff --git a/drivers/pci/hotplug/pciehp_acpi.c b/drivers/pci/hotplug/pciehp_acpi.c index ead7c534095..cff7cadfc2e 100644 --- a/drivers/pci/hotplug/pciehp_acpi.c +++ b/drivers/pci/hotplug/pciehp_acpi.c @@ -54,7 +54,7 @@ int pciehp_acpi_slot_detection_check(struct pci_dev *dev) { if (slot_detection_mode != PCIEHP_DETECT_ACPI) return 0; - if (acpi_pci_detect_ejectable(DEVICE_ACPI_HANDLE(&dev->dev))) + if (acpi_pci_detect_ejectable(ACPI_HANDLE(&dev->dev))) return 0; return -ENODEV; } @@ -96,7 +96,7 @@ static int __init dummy_probe(struct pcie_device *dev) dup_slot_id++; } list_add_tail(&slot->list, &dummy_slots); - handle = DEVICE_ACPI_HANDLE(&pdev->dev); + handle = ACPI_HANDLE(&pdev->dev); if (!acpi_slot_detected && acpi_pci_detect_ejectable(handle)) acpi_slot_detected = 1; return -ENODEV; /* dummy driver always returns error */ diff --git a/drivers/pci/hotplug/sgi_hotplug.c b/drivers/pci/hotplug/sgi_hotplug.c index b2781dfe60e..5b05a68cca6 100644 --- a/drivers/pci/hotplug/sgi_hotplug.c +++ b/drivers/pci/hotplug/sgi_hotplug.c @@ -9,6 +9,7 @@ * Work to add BIOS PROM support was completed by Mike Habeck. */ +#include <linux/acpi.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> @@ -29,7 +30,6 @@ #include <asm/sn/sn_feature_sets.h> #include <asm/sn/sn_sal.h> #include <asm/sn/types.h> -#include <linux/acpi.h> #include <asm/sn/acpi.h> #include "../pci.h" @@ -414,7 +414,7 @@ static int enable_slot(struct hotplug_slot *bss_hotplug_slot) acpi_handle rethandle; acpi_status ret; - phandle = PCI_CONTROLLER(slot->pci_bus)->acpi_handle; + phandle = acpi_device_handle(PCI_CONTROLLER(slot->pci_bus)->companion); if (acpi_bus_get_device(phandle, &pdevice)) { dev_dbg(&slot->pci_bus->self->dev, @@ -495,7 +495,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot) /* free the ACPI resources for the slot */ if (SN_ACPI_BASE_SUPPORT() && - PCI_CONTROLLER(slot->pci_bus)->acpi_handle) { + PCI_CONTROLLER(slot->pci_bus)->companion) { unsigned long long adr; struct acpi_device *device; acpi_handle phandle; @@ -504,7 +504,7 @@ static int disable_slot(struct hotplug_slot *bss_hotplug_slot) acpi_status ret; /* Get the rootbus node pointer */ - phandle = PCI_CONTROLLER(slot->pci_bus)->acpi_handle; + phandle = acpi_device_handle(PCI_CONTROLLER(slot->pci_bus)->companion); acpi_scan_lock_acquire(); /* diff --git a/drivers/pci/ioapic.c b/drivers/pci/ioapic.c index 1b90579b233..50ce6809829 100644 --- a/drivers/pci/ioapic.c +++ b/drivers/pci/ioapic.c @@ -37,7 +37,7 @@ static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent) char *type; struct resource *res; - handle = DEVICE_ACPI_HANDLE(&dev->dev); + handle = ACPI_HANDLE(&dev->dev); if (!handle) return -EINVAL; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index dfd1f59de72..f166126e28d 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -173,14 +173,14 @@ static pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) static bool acpi_pci_power_manageable(struct pci_dev *dev) { - acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + acpi_handle handle = ACPI_HANDLE(&dev->dev); return handle ? 
acpi_bus_power_manageable(handle) : false; } static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { - acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + acpi_handle handle = ACPI_HANDLE(&dev->dev); static const u8 state_conv[] = { [PCI_D0] = ACPI_STATE_D0, [PCI_D1] = ACPI_STATE_D1, @@ -217,7 +217,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) static bool acpi_pci_can_wakeup(struct pci_dev *dev) { - acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + acpi_handle handle = ACPI_HANDLE(&dev->dev); return handle ? acpi_bus_can_wakeup(handle) : false; } diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index edaed6f4da6..d51f45aa669 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -263,7 +263,7 @@ device_has_dsm(struct device *dev) acpi_handle handle; struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - handle = DEVICE_ACPI_HANDLE(dev); + handle = ACPI_HANDLE(dev); if (!handle) return FALSE; @@ -295,7 +295,7 @@ acpilabel_show(struct device *dev, struct device_attribute *attr, char *buf) acpi_handle handle; int length; - handle = DEVICE_ACPI_HANDLE(dev); + handle = ACPI_HANDLE(dev); if (!handle) return -1; @@ -316,7 +316,7 @@ acpiindex_show(struct device *dev, struct device_attribute *attr, char *buf) acpi_handle handle; int length; - handle = DEVICE_ACPI_HANDLE(dev); + handle = ACPI_HANDLE(dev); if (!handle) return -1; diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index 605a9be5512..b9429fbf1cd 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -519,7 +519,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) gmux_data->power_state = VGA_SWITCHEROO_ON; - gmux_data->dhandle = DEVICE_ACPI_HANDLE(&pnp->dev); + gmux_data->dhandle = ACPI_HANDLE(&pnp->dev); if (!gmux_data->dhandle) { pr_err("Cannot find acpi handle for pnp device %s\n", dev_name(&pnp->dev)); diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 747826d9905..14655a0f043 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -89,7 +89,7 @@ static int pnpacpi_set_resources(struct pnp_dev *dev) pnp_dbg(&dev->dev, "set resources\n"); - handle = DEVICE_ACPI_HANDLE(&dev->dev); + handle = ACPI_HANDLE(&dev->dev); if (!handle || acpi_bus_get_device(handle, &acpi_dev)) { dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); return -ENODEV; @@ -122,7 +122,7 @@ static int pnpacpi_disable_resources(struct pnp_dev *dev) dev_dbg(&dev->dev, "disable resources\n"); - handle = DEVICE_ACPI_HANDLE(&dev->dev); + handle = ACPI_HANDLE(&dev->dev); if (!handle || acpi_bus_get_device(handle, &acpi_dev)) { dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); return 0; @@ -144,7 +144,7 @@ static bool pnpacpi_can_wakeup(struct pnp_dev *dev) struct acpi_device *acpi_dev; acpi_handle handle; - handle = DEVICE_ACPI_HANDLE(&dev->dev); + handle = ACPI_HANDLE(&dev->dev); if (!handle || acpi_bus_get_device(handle, &acpi_dev)) { dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); return false; @@ -159,7 +159,7 @@ static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t state) acpi_handle handle; int error = 0; - handle = DEVICE_ACPI_HANDLE(&dev->dev); + handle = ACPI_HANDLE(&dev->dev); if (!handle || acpi_bus_get_device(handle, &acpi_dev)) { dev_dbg(&dev->dev, "ACPI device not found in %s!\n", __func__); return 0; @@ -194,7 +194,7 @@ static int pnpacpi_suspend(struct pnp_dev *dev, pm_message_t 
state) static int pnpacpi_resume(struct pnp_dev *dev) { struct acpi_device *acpi_dev; - acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + acpi_handle handle = ACPI_HANDLE(&dev->dev); int error = 0; if (!handle || acpi_bus_get_device(handle, &acpi_dev)) { diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 15f166a470a..00773022211 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -626,7 +626,7 @@ comment "Platform RTC drivers" config RTC_DRV_CMOS tristate "PC-style 'CMOS'" - depends on X86 || ALPHA || ARM || M32R || ATARI || PPC || MIPS || SPARC64 + depends on X86 || ARM || M32R || ATARI || PPC || MIPS || SPARC64 default y if X86 help Say "yes" here to get direct support for the real time clock @@ -643,6 +643,14 @@ config RTC_DRV_CMOS This driver can also be built as a module. If so, the module will be called rtc-cmos. +config RTC_DRV_ALPHA + bool "Alpha PC-style CMOS" + depends on ALPHA + default y + help + Direct support for the real-time clock found on every Alpha + system, specifically MC146818 compatibles. If in doubt, say Y. + config RTC_DRV_VRTC tristate "Virtual RTC for Intel MID platforms" depends on X86_INTEL_MID diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 8b2cd8a5a2f..c0da95e9570 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -428,6 +428,14 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) return 0; } +static void at91_rtc_shutdown(struct platform_device *pdev) +{ + /* Disable all interrupts */ + at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM | + AT91_RTC_SECEV | AT91_RTC_TIMEV | + AT91_RTC_CALEV); +} + #ifdef CONFIG_PM_SLEEP /* AT91RM9200 RTC Power management control */ @@ -466,6 +474,7 @@ static SIMPLE_DEV_PM_OPS(at91_rtc_pm_ops, at91_rtc_suspend, at91_rtc_resume); static struct platform_driver at91_rtc_driver = { .remove = __exit_p(at91_rtc_remove), + .shutdown = at91_rtc_shutdown, .driver = { .name = "at91_rtc", .owner = THIS_MODULE, diff --git a/drivers/spi/spi-dw-mid.c b/drivers/spi/spi-dw-mid.c index b9f0192758d..6d207afec8c 100644 --- a/drivers/spi/spi-dw-mid.c +++ b/drivers/spi/spi-dw-mid.c @@ -150,7 +150,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change) &dws->tx_sgl, 1, DMA_MEM_TO_DEV, - DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP); + DMA_PREP_INTERRUPT); txdesc->callback = dw_spi_dma_done; txdesc->callback_param = dws; @@ -173,7 +173,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change) &dws->rx_sgl, 1, DMA_DEV_TO_MEM, - DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP); + DMA_PREP_INTERRUPT); rxdesc->callback = dw_spi_dma_done; rxdesc->callback_param = dws; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8d85ddc4601..18cc625d887 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -357,6 +357,19 @@ struct spi_device *spi_alloc_device(struct spi_master *master) } EXPORT_SYMBOL_GPL(spi_alloc_device); +static void spi_dev_set_name(struct spi_device *spi) +{ + struct acpi_device *adev = ACPI_COMPANION(&spi->dev); + + if (adev) { + dev_set_name(&spi->dev, "spi-%s", acpi_dev_name(adev)); + return; + } + + dev_set_name(&spi->dev, "%s.%u", dev_name(&spi->master->dev), + spi->chip_select); +} + /** * spi_add_device - Add spi_device allocated with spi_alloc_device * @spi: spi_device to register @@ -383,9 +396,7 @@ int spi_add_device(struct spi_device *spi) } /* Set the bus ID string */ - dev_set_name(&spi->dev, "%s.%u", dev_name(&spi->master->dev), - spi->chip_select); - + spi_dev_set_name(spi); /* We 
need to make sure there's no other device with this * chipselect **BEFORE** we call setup(), else we'll trash @@ -1144,7 +1155,7 @@ static acpi_status acpi_spi_add_device(acpi_handle handle, u32 level, return AE_NO_MEMORY; } - ACPI_HANDLE_SET(&spi->dev, handle); + ACPI_COMPANION_SET(&spi->dev, adev); spi->irq = -1; INIT_LIST_HEAD(&resource_list); diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 537750261aa..7d8103cd3e2 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -1433,7 +1433,7 @@ static void work_fn_rx(struct work_struct *work) desc = s->desc_rx[new]; if (dma_async_is_tx_complete(s->chan_rx, s->active_rx, NULL, NULL) != - DMA_SUCCESS) { + DMA_COMPLETE) { /* Handle incomplete DMA receive */ struct dma_chan *chan = s->chan_rx; struct shdma_desc *sh_desc = container_of(desc, diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 06cec635e70..a7c04e24ca4 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -5501,6 +5501,6 @@ acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, if (!hub) return NULL; - return DEVICE_ACPI_HANDLE(&hub->ports[port1 - 1]->dev); + return ACPI_HANDLE(&hub->ports[port1 - 1]->dev); } #endif diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c index 255c14464bf..4e243c37f17 100644 --- a/drivers/usb/core/usb-acpi.c +++ b/drivers/usb/core/usb-acpi.c @@ -173,7 +173,7 @@ static int usb_acpi_find_device(struct device *dev, acpi_handle *handle) } /* root hub's parent is the usb hcd. */ - parent_handle = DEVICE_ACPI_HANDLE(dev->parent); + parent_handle = ACPI_HANDLE(dev->parent); *handle = acpi_get_child(parent_handle, udev->portnum); if (!*handle) return -ENODEV; @@ -194,7 +194,7 @@ static int usb_acpi_find_device(struct device *dev, acpi_handle *handle) raw_port_num = usb_hcd_find_raw_port_number(hcd, port_num); - *handle = acpi_get_child(DEVICE_ACPI_HANDLE(&udev->dev), + *handle = acpi_get_child(ACPI_HANDLE(&udev->dev), raw_port_num); if (!*handle) return -ENODEV; diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index d15f6e80479..188825122aa 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -59,12 +59,12 @@ static int xen_add_device(struct device *dev) add.flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI - handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); + handle = ACPI_HANDLE(&pci_dev->dev); if (!handle && pci_dev->bus->bridge) - handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); + handle = ACPI_HANDLE(pci_dev->bus->bridge); #ifdef CONFIG_PCI_IOV if (!handle && pci_dev->is_virtfn) - handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); + handle = ACPI_HANDLE(physfn->bus->bridge); #endif if (handle) { acpi_status status; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f039b104a98..b03dd23feda 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -43,23 +43,6 @@ #include "fid.h" /** - * v9fs_dentry_delete - called when dentry refcount equals 0 - * @dentry: dentry in question - * - * By returning 1 here we should remove cacheing of unused - * dentry components. 
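An aside on the conversions above: DEVICE_ACPI_HANDLE() becomes ACPI_HANDLE(), and spi.c stops stashing a raw handle with ACPI_HANDLE_SET() in favour of ACPI_COMPANION_SET(). The common idea is that a device now records its ACPI companion as a struct acpi_device and derives the handle from it on demand. A minimal sketch of the pattern, with a hypothetical enumeration callback; the error handling is illustrative only:

	static acpi_status example_enumerate(acpi_handle handle, struct device *dev)
	{
		struct acpi_device *adev;

		/* resolve the handle to its struct acpi_device once... */
		if (acpi_bus_get_device(handle, &adev))
			return AE_NOT_FOUND;

		/* ...and attach the companion rather than the raw handle */
		ACPI_COMPANION_SET(dev, adev);

		/* ACPI_HANDLE() now resolves through the companion */
		WARN_ON(ACPI_HANDLE(dev) != handle);
		return AE_OK;
	}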
- * - */ - -static int v9fs_dentry_delete(const struct dentry *dentry) -{ - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); - - return 1; -} - -/** * v9fs_cached_dentry_delete - called when dentry refcount equals 0 * @dentry: dentry in question * @@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { - .d_delete = v9fs_dentry_delete, + .d_delete = always_delete_dentry, .d_release = v9fs_dentry_release, }; @@ -80,6 +80,8 @@ struct kioctx { struct percpu_ref users; atomic_t dead; + struct percpu_ref reqs; + unsigned long user_id; struct __percpu kioctx_cpu *cpu; @@ -107,7 +109,6 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct rcu_head rcu_head; struct work_struct free_work; struct { @@ -250,8 +251,10 @@ static void aio_free_ring(struct kioctx *ctx) put_aio_ring_file(ctx); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) { kfree(ctx->ring_pages); + ctx->ring_pages = NULL; + } } static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) @@ -463,26 +466,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) return cancel(kiocb); } -static void free_ioctx_rcu(struct rcu_head *head) +static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + struct kioctx *ctx = container_of(work, struct kioctx, free_work); + pr_debug("freeing %p\n", ctx); + + aio_free_ring(ctx); free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } +static void free_ioctx_reqs(struct percpu_ref *ref) +{ + struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + /* * When this function runs, the kioctx has been removed from the "hash table" * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct work_struct *work) +static void free_ioctx_users(struct percpu_ref *ref) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - struct aio_ring *ring; + struct kioctx *ctx = container_of(ref, struct kioctx, users); struct kiocb *req; - unsigned cpu, avail; - DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -496,54 +507,8 @@ static void free_ioctx(struct work_struct *work) spin_unlock_irq(&ctx->ctx_lock); - for_each_possible_cpu(cpu) { - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - - atomic_add(kcpu->reqs_available, &ctx->reqs_available); - kcpu->reqs_available = 0; - } - - while (1) { - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - - ring = kmap_atomic(ctx->ring_pages[0]); - avail = (ring->head <= ring->tail) - ? ring->tail - ring->head - : ctx->nr_events - ring->head + ring->tail; - - atomic_add(avail, &ctx->reqs_available); - ring->head = ring->tail; - kunmap_atomic(ring); - - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) - break; - - schedule(); - } - finish_wait(&ctx->wait, &wait); - - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); - - aio_free_ring(ctx); - - pr_debug("freeing %p\n", ctx); - - /* - * Here the call_rcu() is between the wait_event() for reqs_active to - * hit 0, and freeing the ioctx. - * - * aio_complete() decrements reqs_active, but it has to touch the ioctx - * after to issue a wakeup so we use rcu. 
- */ - call_rcu(&ctx->rcu_head, free_ioctx_rcu); -} - -static void free_ioctx_ref(struct percpu_ref *ref) -{ - struct kioctx *ctx = container_of(ref, struct kioctx, users); - - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + percpu_ref_kill(&ctx->reqs); + percpu_ref_put(&ctx->reqs); } static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) @@ -602,6 +567,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } } +static void aio_nr_sub(unsigned nr) +{ + spin_lock(&aio_nr_lock); + if (WARN_ON(aio_nr - nr > aio_nr)) + aio_nr = 0; + else + aio_nr -= nr; + spin_unlock(&aio_nr_lock); +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -639,8 +614,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - if (percpu_ref_init(&ctx->users, free_ioctx_ref)) - goto out_freectx; + if (percpu_ref_init(&ctx->users, free_ioctx_users)) + goto err; + + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + goto err; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); @@ -651,10 +629,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freeref; + goto err; if (aio_setup_ring(ctx) < 0) - goto out_freepcpu; + goto err; atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); @@ -666,7 +644,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); - goto out_cleanup; + err = -EAGAIN; + goto err; } aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); @@ -675,23 +654,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err = ioctx_add_table(ctx, mm); if (err) - goto out_cleanup_put; + goto err_cleanup; pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; -out_cleanup_put: - percpu_ref_put(&ctx->users); -out_cleanup: - err = -EAGAIN; - aio_free_ring(ctx); -out_freepcpu: +err_cleanup: + aio_nr_sub(ctx->max_reqs); +err: free_percpu(ctx->cpu); -out_freeref: + free_percpu(ctx->reqs.pcpu_count); free_percpu(ctx->users.pcpu_count); -out_freectx: - put_aio_ring_file(ctx); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); @@ -726,10 +700,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) * -EAGAIN with no ioctxs actually in use (as far as userspace * could tell). */ - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); + aio_nr_sub(ctx->max_reqs); if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); @@ -861,6 +832,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; + percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; return req; out_put: @@ -930,12 +903,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) return; } - /* - * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after incrementing reqs_available. 
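The aio rework in these hunks replaces the rcu_head plus wait-loop teardown with two chained percpu refcounts: ctx->users counts lookups of the context, ctx->reqs counts in-flight kiocbs, and the context is only freed from a workqueue once both have dropped to zero. A condensed sketch of the lifetime chain, using hypothetical names and assuming the 3.13-era percpu_ref API (percpu_ref_init() registers a release callback that runs once the count hits zero after percpu_ref_kill()):

	struct ctx {
		struct percpu_ref users;	/* held by submitters/lookups */
		struct percpu_ref reqs;		/* held by in-flight requests */
		struct work_struct free_work;
	};

	static void ctx_free(struct work_struct *work)
	{
		struct ctx *ctx = container_of(work, struct ctx, free_work);

		/* stands in for aio_free_ring(), free_percpu(), kmem_cache_free() */
		kfree(ctx);
	}

	static void ctx_reqs_zero(struct percpu_ref *ref)	/* last request gone */
	{
		struct ctx *ctx = container_of(ref, struct ctx, reqs);

		INIT_WORK(&ctx->free_work, ctx_free);
		schedule_work(&ctx->free_work);	/* callback may run in irq context */
	}

	static void ctx_users_zero(struct percpu_ref *ref)	/* context killed */
	{
		struct ctx *ctx = container_of(ref, struct ctx, users);

		percpu_ref_kill(&ctx->reqs);	/* no new requests can start... */
		percpu_ref_put(&ctx->reqs);	/* ...and drop the initial reference */
	}

Each request then takes percpu_ref_get(&ctx->reqs) at allocation and puts it on completion, which is exactly what the aio_get_req() and aio_complete() hunks add.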
- */ - rcu_read_lock(); - if (iocb->ki_list.next) { unsigned long flags; @@ -1010,7 +977,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - rcu_read_unlock(); + percpu_ref_put(&ctx->reqs); } EXPORT_SYMBOL(aio_complete); @@ -1421,6 +1388,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); + percpu_ref_put(&ctx->reqs); kiocb_free(req); return ret; } @@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs); static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, - unsigned short max_sectors) + unsigned int max_sectors) { int retried_segments = 0; struct bio_vec *bvec; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index f9d5094e102..aa976eced2d 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -9,12 +9,17 @@ config BTRFS_FS select XOR_BLOCKS help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. + Btrfs is a general purpose copy-on-write filesystem with extents, + writable snapshotting, support for multiple devices and many more + features focused on fault tolerance, repair and easy administration. - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. + The filesystem disk format is no longer unstable, and it's not + expected to change unless there are strong reasons to do so. If there + is a format change, file systems with an unchanged format will + continue to be mountable and usable by newer kernels. + + For more information, please see the web pages at + http://btrfs.wiki.kernel.org. To compile this file system support as a module, choose M here. The module will be called btrfs. diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 8aec751fa46..c1e0b0caf9c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -495,6 +495,7 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) spin_lock_irq(&workers->lock); if (workers->stopping) { spin_unlock_irq(&workers->lock); + ret = -EINVAL; goto fail_kthread; } list_add_tail(&worker->worker_list, &workers->idle_list); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e0aab445697..b50764bef14 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -77,6 +77,15 @@ * the integrity of (super)-block write requests, do not * enable the config option BTRFS_FS_CHECK_INTEGRITY to * include and compile the integrity check tool. + * + * Expect millions of lines of information in the kernel log with an + * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the + * kernel config to at least 26 (which is 64MB). Usually the value is + * limited to 21 (which is 2MB) in init/Kconfig. 
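An aside on the __bio_add_page() parameter widening just above (the matching queue_max_sectors() fix lands in fs/btrfs/volumes.c later in this series): queue_max_sectors() returns unsigned int, so storing the result in an unsigned short silently truncates any limit above USHRT_MAX sectors. A worked illustration of the failure mode:

	unsigned int q_max = 131072;	/* a 64 MiB limit in 512-byte sectors */
	unsigned short narrow = q_max;	/* 131072 & 0xffff == 0: limit collapses to 0 */
	unsigned int wide = q_max;	/* 131072 preserved by the widened type */

With the limit truncated to zero, every page addition would appear to exceed max_sectors.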
The file needs to be + * changed like this before LOG_BUF_SHIFT can be set to a high value: + * config LOG_BUF_SHIFT + * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + * range 12 30 */ #include <linux/sched.h> @@ -124,6 +133,7 @@ #define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 #define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 #define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 +#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 struct btrfsic_dev_state; struct btrfsic_state; @@ -3015,6 +3025,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) (rw & WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; + u64 cur_bytenr; int bio_is_patched; char **mapped_datav; @@ -3033,6 +3044,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) GFP_NOFS); if (!mapped_datav) goto leave; + cur_bytenr = dev_bytenr; for (i = 0; i < bio->bi_vcnt; i++) { BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); @@ -3044,16 +3056,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio) kfree(mapped_datav); goto leave; } - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) printk(KERN_INFO - "#%u: page=%p, len=%u, offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - bio->bi_io_vec[i].bv_len, + "#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bio->bi_io_vec[i].bv_len, bio->bi_io_vec[i].bv_offset); + cur_bytenr += bio->bi_io_vec[i].bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9aeb2759a6..54ab86127f7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3613,9 +3613,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -int btrfs_csum_truncate(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u64 isize); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ @@ -3744,9 +3741,6 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); -int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace, - u64 start, u64 end, int skip_pinned, - int modified); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 342f9fd411e..2cfc3dfff64 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -366,7 +366,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->tgtdev = tgt_device; printk_in_rcu(KERN_INFO - "btrfs: dev_replace from %s (devid %llu) to %s) started\n", + "btrfs: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? 
"<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c4ed0bb3da..8072cfa8a3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3517,7 +3517,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) int btrfs_commit_super(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - int ret; mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); @@ -3531,25 +3530,7 @@ int btrfs_commit_super(struct btrfs_root *root) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - ret = btrfs_write_and_wait_transaction(NULL, root); - if (ret) { - btrfs_error(root->fs_info, ret, - "Failed to sync btree inode to disk."); - return ret; - } - - ret = write_ctree_super(NULL, root, 0); - return ret; + return btrfs_commit_transaction(trans, root); } int close_ctree(struct btrfs_root *root) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 856bc2b2192..8e457fca0a0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1980,6 +1980,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; + ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); /* we can't repair anything in raid56 yet */ @@ -2036,6 +2037,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, @@ -2057,12 +2061,12 @@ static int clean_io_failure(u64 start, struct page *page) u64 private; u64 private_failure; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; int did_repair = 0; int ret; - struct inode *inode = page->mapping->host; private = 0; ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, @@ -2085,6 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page) did_repair = 1; goto out; } + if (fs_info->sb->s_flags & MS_RDONLY) + goto out; spin_lock(&BTRFS_I(inode)->io_tree.lock); state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, @@ -2094,7 +2100,6 @@ static int clean_io_failure(u64 start, struct page *page) if (state && state->start <= failrec->start && state->end >= failrec->start + failrec->len - 1) { - fs_info = BTRFS_I(inode)->root->fs_info; num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da8d2f696ac..f1a77449d03 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2129,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, old->extent_offset, fs_info, path, record_one_backref, old); - BUG_ON(ret < 0 && ret != -ENOENT); + if (ret < 0 && ret != -ENOENT) + return false; /* no backref to be processed for this extent */ if (!old->count) { @@ -6186,8 +6187,7 @@ insert: 
write_unlock(&em_tree->lock); out: - if (em) - trace_btrfs_get_extent(root, em); + trace_btrfs_get_extent(root, em); if (path) btrfs_free_path(path); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 25a8f3812f1..69582d5b69d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -638,6 +638,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) WARN_ON(nr < 0); } } + list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); } @@ -803,7 +804,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len < start) { + if (ordered->file_offset + ordered->len <= start) { btrfs_put_ordered_extent(ordered); break; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2544805544f..561e2f16ba3 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -938,8 +938,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) BTRFS_DEV_STAT_CORRUPTION_ERRS); } - if (sctx->readonly && !sctx->is_dev_replace) - goto did_not_correct_error; + if (sctx->readonly) { + ASSERT(!sctx->is_dev_replace); + goto out; + } if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 57c16b46afb..c6a872a8a46 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1480,7 +1480,7 @@ static void do_async_commit(struct work_struct *work) * We've got freeze protection passed with the transaction. * Tell lockdep about it. */ - if (ac->newtrans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_acquire_read( &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 0, 1, _THIS_IP_); @@ -1521,7 +1521,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. 
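The btrfs_wait_ordered_range() change above is a half-open interval fix: an ordered extent covers the byte range [file_offset, file_offset + len), so an extent whose end equals start lies entirely before the waited-for range and the loop must stop. With concrete numbers:

	/*
	 * An ordered extent with file_offset = 0, len = 4096 covers bytes
	 * [0, 4096); byte 4096 itself is not covered.  For a wait that
	 * begins at start = 4096:
	 *
	 *	old test: file_offset + len <  start  ->  4096 <  4096  -> false,
	 *		  the extent was treated as overlapping and waited on
	 *	new test: file_offset + len <= start  ->  4096 <= 4096  -> true,
	 *		  the loop breaks, since there is no overlap
	 */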
*/ - if (trans->type < TRANS_JOIN_NOLOCK) + if (ac->newtrans->type & __TRANS_FREEZABLE) rwsem_release( &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 744553c83fe..9f7fc51ca33 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3697,7 +3697,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags)) { + &BTRFS_I(inode)->runtime_flags) || + inode_only == LOG_INODE_EXISTS) { if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; @@ -3801,7 +3802,7 @@ log_extents: err = ret; goto out_unlock; } - } else { + } else if (inode_only == LOG_INODE_ALL) { struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0db63709786..92303f42baa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5394,7 +5394,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, { struct bio_vec *prev; struct request_queue *q = bdev_get_queue(bdev); - unsigned short max_sectors = queue_max_sectors(q); + unsigned int max_sectors = queue_max_sectors(q); struct bvec_merge_data bvm = { .bi_bdev = bdev, .bi_sector = sector, diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 277bd1be21f..e081acbac2e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -56,29 +56,28 @@ static void configfs_d_iput(struct dentry * dentry, struct configfs_dirent *sd = dentry->d_fsdata; if (sd) { - BUG_ON(sd->s_dentry != dentry); /* Coordinate with configfs_readdir */ spin_lock(&configfs_dirent_lock); - sd->s_dentry = NULL; + /* Coordinate with configfs_attach_attr(), which will increase + * sd->s_count and update sd->s_dentry to the newly allocated + * dentry. Only set sd->s_dentry to NULL when this dentry is + * the only sd owner; otherwise configfs_d_iput() may run just + * after configfs_attach_attr() and set sd->s_dentry to NULL + * even though it is still in use. + */ + if (atomic_read(&sd->s_count) <= 2) + sd->s_dentry = NULL; + spin_unlock(&configfs_dirent_lock); configfs_put(sd); } iput(inode); } -/* - * We _must_ delete our dentries on last dput, as the chain-to-parent - * behavior is required to clear the parents of default_groups. 
- */ -static int configfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - const struct dentry_operations configfs_dentry_ops = { .d_iput = configfs_d_iput, - /* simple_delete_dentry() isn't exported */ - .d_delete = configfs_d_delete, + .d_delete = always_delete_dentry, }; #ifdef CONFIG_LOCKDEP @@ -426,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + spin_lock(&configfs_dirent_lock); dentry->d_fsdata = configfs_get(sd); sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, configfs_init_file); if (error) { diff --git a/fs/coredump.c b/fs/coredump.c index 62406b6959b..bc3fbcd3255 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) while (nr) { if (dump_interrupted()) return 0; - n = vfs_write(file, addr, nr, &pos); + n = __kernel_write(file, addr, nr, &pos); if (n <= 0) return 0; file->f_pos = pos; @@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->written & (align - 1); if (align & (align - 1)) - return -EINVAL; - return mod ? dump_skip(cprm, align - mod) : 0; + return 0; + return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); diff --git a/fs/dcache.c b/fs/dcache.c index 0a38ef8d7f0..4bdb300b16e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; -/** - * read_seqbegin_or_lock - begin a sequence number check or locking block - * @lock: sequence lock - * @seq : sequence number to be checked - * - * First try it once optimistically without taking the lock. If that fails, - * take the lock. The sequence number is also used as a marker for deciding - * whether to be a reader (even) or writer (odd). - * N.B. seq must be initialized to an even number to begin with. - */ -static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) -{ - if (!(*seq & 1)) /* Even */ - *seq = read_seqbegin(lock); - else /* Odd */ - read_seqlock_excl(lock); -} - -static inline int need_seqretry(seqlock_t *lock, int seq) -{ - return !(seq & 1) && read_seqretry(lock, seq); -} - -static inline void done_seqretry(seqlock_t *lock, int seq) -{ - if (seq & 1) - read_sequnlock_excl(lock); -} - /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try @@ -125,8 +96,6 @@ static inline void done_seqretry(seqlock_t *lock, int seq) * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ -#define D_HASHBITS d_hash_shift -#define D_HASHMASK d_hash_mask static unsigned int d_hash_mask __read_mostly; static unsigned int d_hash_shift __read_mostly; @@ -137,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> D_HASHBITS); - return dentry_hashtable + (hash & D_HASHMASK); + hash = hash + (hash >> d_hash_shift); + return dentry_hashtable + (hash & d_hash_mask); } /* Statistics gathering. 
*/ @@ -469,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) { list_del(&dentry->d_u.d_child); /* - * Inform try_to_ascend() that we are no longer attached to the + * Inform d_walk() that we are no longer attached to the * dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; @@ -1069,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb) } EXPORT_SYMBOL(shrink_dcache_sb); -/* - * This tries to ascend one level of parenthood, but - * we can race with renaming, so we need to re-check - * the parenthood after dropping the lock and check - * that the sequence number still matches. - */ -static struct dentry *try_to_ascend(struct dentry *old, unsigned seq) -{ - struct dentry *new = old->d_parent; - - rcu_read_lock(); - spin_unlock(&old->d_lock); - spin_lock(&new->d_lock); - - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (new != old->d_parent || - (old->d_flags & DCACHE_DENTRY_KILLED) || - need_seqretry(&rename_lock, seq)) { - spin_unlock(&new->d_lock); - new = NULL; - } - rcu_read_unlock(); - return new; -} - /** * enum d_walk_ret - action to take during tree walk * @D_WALK_CONTINUE: continue walk @@ -1185,9 +1126,24 @@ resume: */ if (this_parent != parent) { struct dentry *child = this_parent; - this_parent = try_to_ascend(this_parent, seq); - if (!this_parent) + this_parent = child->d_parent; + + rcu_read_lock(); + spin_unlock(&child->d_lock); + spin_lock(&this_parent->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (this_parent != child->d_parent || + (child->d_flags & DCACHE_DENTRY_KILLED) || + need_seqretry(&rename_lock, seq)) { + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); goto rename_retry; + } + rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index a8766b880c0..becc725a195 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -/* - * Retaining negative dentries for an in-memory filesystem just wastes - * memory and lookup time: arrange for them to be deleted immediately. 
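With try_to_ascend() gone, d_walk() re-checks the parent inline, but the walk still leans on the read_seqbegin_or_lock()/need_seqretry()/done_seqretry() trio whose dcache-local copies are deleted at the top of this hunk (they presumably move to a shared seqlock header). The general shape of that retry pattern, sketched from the deleted helpers' semantics:

	static void walk_with_rename_retry(void)
	{
		int seq = 0;				/* must start even */
	again:
		read_seqbegin_or_lock(&rename_lock, &seq);

		/* ... lockless tree walk; renames may race with us ... */

		if (need_seqretry(&rename_lock, seq)) {
			seq = 1;			/* odd: take the lock this pass */
			goto again;
		}
		done_seqretry(&rename_lock, seq);	/* unlocks only if locked */
	}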
- */ -static int efivarfs_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - static struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, - .d_delete = efivarfs_delete_dentry, + .d_delete = always_delete_dentry, }; static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) diff --git a/fs/exec.c b/fs/exec.c index 977319fd77f..7ea097f6b34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1380,10 +1380,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - retval = audit_bprm(bprm); - if (retval) - return retval; - retval = -ENOENT; retry: read_lock(&binfmt_lock); @@ -1431,6 +1427,7 @@ static int exec_binprm(struct linux_binprm *bprm) ret = search_binary_handler(bprm); if (ret >= 0) { + audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e66a8009aff..c8420f7e4db 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1899,7 +1899,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) gi->nhash = 0; } /* Skip entries for other sb and dead entries */ - } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref)); + } while (gi->sdp != gi->gl->gl_sbd || + __lockref_is_dead(&gi->gl->gl_lockref)); return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1615df16cf4..7119504159f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1171,8 +1171,11 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (d != NULL) dentry = d; if (dentry->d_inode) { - if (!(*opened & FILE_OPENED)) + if (!(*opened & FILE_OPENED)) { + if (d == NULL) + dget(dentry); return finish_no_open(file, dentry); + } dput(d); return 0; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index c8423d6de6c..2a6ba06bee6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl) static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); - memcpy(&gen, lvb_bits, sizeof(uint32_t)); + memcpy(&gen, lvb_bits, sizeof(__le32)); *lvb_gen = le32_to_cpu(gen); } static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, char *lvb_bits) { - uint32_t gen; + __le32 gen; memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); gen = cpu_to_le32(lvb_gen); - memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); + memcpy(ls->ls_control_lvb, &gen, sizeof(__le32)); } static int all_jid_bits_clear(char *lvb) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 453b50eadde..98236d0df3c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct buffer_head *bh; struct page *page; void *kaddr, *ptr; - struct gfs2_quota q, *qp; + struct gfs2_quota q; int err, nbytes; u64 size; @@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; err = -EIO; - qp = &q; - qp->qu_value = be64_to_cpu(qp->qu_value); - qp->qu_value += change; - qp->qu_value = cpu_to_be64(qp->qu_value); - qd->qd_qb.qb_value = qp->qu_value; + be64_add_cpu(&q.qu_value, change); + qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_warn = qp->qu_warn; + q.qu_warn = 
cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_limit = qp->qu_limit; + q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); - qd->qd_qb.qb_value = qp->qu_value; + q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + qd->qd_qb.qb_value = q.qu_value; } } /* Write the quota into the quota file on disk */ - ptr = qp; + ptr = &q; nbytes = sizeof(struct gfs2_quota); get_a_page: page = find_or_create_page(mapping, index, GFP_NOFS); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 4d83abdd563..c8d6161bd68 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; } - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); @@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd) if (rgd->rd_flags & GFS2_RDF_UPTODATE) return 0; - if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) + if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) return gfs2_rgrp_bh_get(rgd); rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 25437280a20..db23ce1bd90 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) -static int hostfs_d_delete(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations hostfs_dentry_ops = { - .d_delete = hostfs_d_delete, -}; - /* Changed in hostfs_args before the kernel starts running */ static char *root_ino = ""; static int append = 0; @@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; - sb->s_d_op = &hostfs_dentry_ops; + sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as <NULL> by sprintf: avoid that. */ diff --git a/fs/libfs.c b/fs/libfs.c index 5de06947ba5..a1844244246 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs); * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ -static int simple_delete_dentry(const struct dentry *dentry) +int always_delete_dentry(const struct dentry *dentry) { return 1; } +EXPORT_SYMBOL(always_delete_dentry); + +const struct dentry_operations simple_dentry_operations = { + .d_delete = always_delete_dentry, +}; +EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. 
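The libfs hunk above is what enables the conversions scattered through this series: always_delete_dentry() and a ready-made simple_dentry_operations are now exported, so hostfs, proc, efivarfs, configfs, 9p and the namespace files can all drop their private "return 1" d_delete copies. For a filesystem with no other dentry operations the conversion is a one-liner (examplefs is hypothetical):

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... usual superblock setup ... */
		sb->s_d_op = &simple_dentry_operations;	/* never cache unused dentries */
		return 0;
	}

A filesystem that needs extra operations keeps its own table and points .d_delete at always_delete_dentry, as the configfs and ns_dentry_operations hunks do.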
This is trivial - if the dentry didn't already @@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry) */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - static const struct dentry_operations simple_dentry_operations = { - .d_delete = simple_delete_dentry, - }; - if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) diff --git a/fs/namei.c b/fs/namei.c index e029a4cbff7..8f77a8cea28 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2435,6 +2435,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 088de1355e9..ee7237f99f5 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -141,8 +141,8 @@ xdr_error: \ static void next_decode_page(struct nfsd4_compoundargs *argp) { - argp->pagelist++; argp->p = page_address(argp->pagelist[0]); + argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; @@ -1229,6 +1229,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) len -= pages * PAGE_SIZE; argp->p = (__be32 *)page_address(argp->pagelist[0]); + argp->pagelist++; argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); } argp->p += XDR_QUADLEN(len); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 94b5f5d2bfe..7eea63cada1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp) } /* - * Set various file attributes. - * N.B. After this call fhp needs an fh_put + * Go over the attributes and take care of the small differences between + * NFS semantics and what Linux expects. */ -__be32 -nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, - int check_guard, time_t guardtime) +static void +nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - struct dentry *dentry; - struct inode *inode; - int accmode = NFSD_MAY_SATTR; - umode_t ftype = 0; - __be32 err; - int host_err; - int size_change = 0; - - if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) - accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; - if (iap->ia_valid & ATTR_SIZE) - ftype = S_IFREG; - - /* Get inode */ - err = fh_verify(rqstp, fhp, ftype, accmode); - if (err) - goto out; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - - /* Ignore any mode updates on symlinks */ - if (S_ISLNK(inode->i_mode)) - iap->ia_valid &= ~ATTR_MODE; - - if (!iap->ia_valid) - goto out; - /* * NFSv2 does not differentiate between "set-[ac]time-to-now" * which only requires access, and "set-[ac]time-to-X" which @@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, * convert to "set to now" instead of "set to explicit time" * * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. It is only - * valid if the filesystem does not define it's own i_op->setattr. + * it is not an interface that we should be using. */ #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) #define MAX_TOUCH_TIME_ERROR (30*60) @@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid &= ~BOTH_TIME_SET; } } - - /* - * The size case is special. - * It changes the file as well as the attributes. 
- */ - if (iap->ia_valid & ATTR_SIZE) { - if (iap->ia_size < inode->i_size) { - err = nfsd_permission(rqstp, fhp->fh_export, dentry, - NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); - if (err) - goto out; - } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserr; - - size_change = 1; - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) { - put_write_access(inode); - goto out_nfserr; - } - } /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { @@ -415,32 +361,111 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } } +} - /* Change the attributes. */ +static __be32 +nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct iattr *iap) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + int host_err; - iap->ia_valid |= ATTR_CTIME; + if (iap->ia_size < inode->i_size) { + __be32 err; - err = nfserr_notsync; - if (!check_guard || guardtime == inode->i_ctime.tv_sec) { - host_err = nfsd_break_lease(inode); - if (host_err) - goto out_nfserr; - fh_lock(fhp); + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE); + if (err) + return err; + } - host_err = notify_change(dentry, iap, NULL); - err = nfserrno(host_err); - fh_unlock(fhp); + host_err = get_write_access(inode); + if (host_err) + goto out_nfserrno; + + host_err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (host_err) + goto out_put_write_access; + return 0; + +out_put_write_access: + put_write_access(inode); +out_nfserrno: + return nfserrno(host_err); +} + +/* + * Set various file attributes. After this call fhp needs an fh_put. + */ +__be32 +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = NFSD_MAY_SATTR; + umode_t ftype = 0; + __be32 err; + int host_err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Ignore any mode updates on symlinks */ + if (S_ISLNK(inode->i_mode)) + iap->ia_valid &= ~ATTR_MODE; + + if (!iap->ia_valid) + goto out; + + nfsd_sanitize_attrs(inode, iap); + + /* + * The size case is special, it changes the file in addition to the + * attributes. 
+ */ + if (iap->ia_valid & ATTR_SIZE) { + err = nfsd_get_write_access(rqstp, fhp, iap); + if (err) + goto out; + size_change = 1; } + + iap->ia_valid |= ATTR_CTIME; + + if (check_guard && guardtime != inode->i_ctime.tv_sec) { + err = nfserr_notsync; + goto out_put_write_access; + } + + host_err = nfsd_break_lease(inode); + if (host_err) + goto out_put_write_access_nfserror; + + fh_lock(fhp); + host_err = notify_change(dentry, iap, NULL); + fh_unlock(fhp); + +out_put_write_access_nfserror: + err = nfserrno(host_err); +out_put_write_access: if (size_change) put_write_access(inode); if (!err) commit_metadata(fhp); out: return err; - -out_nfserr: - err = nfserrno(host_err); - goto out; } #if defined(CONFIG_NFSD_V2_ACL) || \ diff --git a/fs/proc/base.c b/fs/proc/base.c index 1485e38daaa..03c8d747be4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - kloginuid = make_kuid(file->f_cred->user_ns, loginuid); - if (!uid_valid(kloginuid)) { - length = -EINVAL; - goto out_free_page; + + /* is userspace trying to explicitly UNSET the loginuid? */ + if (loginuid == AUDIT_UID_UNSET) { + kloginuid = INVALID_UID; + } else { + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } } length = audit_set_loginuid(kloginuid); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 737e15615b0..cca93b6fb9a 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -175,22 +175,6 @@ static const struct inode_operations proc_link_inode_operations = { }; /* - * As some entries in /proc are volatile, we want to - * get rid of unused dentries. This could be made - * smarter: we could keep a "volatile" flag in the - * inode to indicate which ones to keep. - */ -static int proc_delete_dentry(const struct dentry * dentry) -{ - return 1; -} - -static const struct dentry_operations proc_dentry_operations = -{ - .d_delete = proc_delete_dentry, -}; - -/* * Don't create negative dentries here, return -ENOENT by hand * instead. */ @@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_dentry_operations); + d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, inode); return NULL; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 49a7fff2e83..9ae46b87470 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = { .setattr = proc_setattr, }; -static int ns_delete_dentry(const struct dentry *dentry) -{ - /* Don't cache namespace inodes when not in use */ - return 1; -} - static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; @@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) const struct dentry_operations ns_dentry_operations = { - .d_delete = ns_delete_dentry, + .d_delete = always_delete_dentry, .d_dname = ns_dname, }; diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index c70111ebefd..b6fa8657dcb 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -25,6 +25,78 @@ config SQUASHFS If unsure, say N. +choice + prompt "File decompression options" + depends on SQUASHFS + help + Squashfs now supports two options for decompressing file + data. 
Traditionally Squashfs has decompressed into an + intermediate buffer and then memcpy'd it into the page cache. + Squashfs now supports the ability to decompress directly into + the page cache. + + If unsure, select "Decompress file data into an intermediate buffer". + +config SQUASHFS_FILE_CACHE + bool "Decompress file data into an intermediate buffer" + help + Decompress file data into an intermediate buffer and then + memcpy it into the page cache. + +config SQUASHFS_FILE_DIRECT + bool "Decompress files directly into the page cache" + help + Directly decompress file data into the page cache. + Doing so can significantly improve performance because + it eliminates a memcpy and it also removes the lock contention + on the single buffer. + +endchoice + +choice + prompt "Decompressor parallelisation options" + depends on SQUASHFS + help + Squashfs now supports three parallelisation options for + decompression. Each one exhibits various trade-offs between + decompression performance and CPU and memory usage. + + If in doubt, select "Single threaded decompression". + +config SQUASHFS_DECOMP_SINGLE + bool "Single threaded decompression" + help + Traditionally Squashfs has used single-threaded decompression. + Only one block (data or metadata) can be decompressed at any + one time. This limits CPU and memory usage to a minimum. + +config SQUASHFS_DECOMP_MULTI + bool "Use multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor, but that gives + poor performance on parallel I/O workloads on multi-CPU + machines due to waiting on decompressor availability. + + If you have a parallel I/O workload and your system has enough memory, + using this option may improve overall I/O performance. + + This decompressor implementation uses up to two parallel + decompressors per core. It allocates decompressors dynamically, + on demand. + +config SQUASHFS_DECOMP_MULTI_PERCPU + bool "Use percpu multiple decompressors for parallel I/O" + help + By default Squashfs uses a single decompressor, but that gives + poor performance on parallel I/O workloads on multi-CPU + machines due to waiting on decompressor availability. + + This decompressor implementation uses a maximum of one + decompressor per core. It uses percpu variables to ensure + decompression is load-balanced across the cores. 
+ +endchoice + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 110b0476f3b..4132520b4ff 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,6 +5,11 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o +squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 41d108ecc9b..0cea9b9236d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -36,6 +36,7 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" /* * Read the metadata block length, this is stored in the first two @@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with compression * algorithms). */ -int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength, int pages) +int squashfs_read_data(struct super_block *sb, u64 index, int length, + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; int offset = index & ((1 << msblk->devblksize_log2) - 1); u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, page = 0, avail; + int bytes, compressed, b = 0, k = 0, avail, i; - bh = kcalloc(((srclength + msblk->devblksize - 1) + bh = kcalloc(((output->length + msblk->devblksize - 1) >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); if (bh == NULL) return -ENOMEM; @@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, *next_index = index + length; TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", - index, compressed ? "" : "un", length, srclength); + index, compressed ? "" : "un", length, output->length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto read_failure; @@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, TRACE("Block @ 0x%llx, %scompressed size %d\n", index, compressed ? "" : "un", length); - if (length < 0 || length > srclength || + if (length < 0 || length > output->length || (index + length) > msblk->bytes_used) goto block_release; @@ -158,9 +159,15 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, ll_rw_block(READ, b - 1, bh + 1); } + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + if (compressed) { - length = squashfs_decompress(msblk, buffer, bh, b, offset, - length, srclength, pages); + length = squashfs_decompress(msblk, bh, b, offset, length, + output); if (length < 0) goto read_failure; } else { @@ -168,22 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, * Block is uncompressed. 
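The squashfs_read_data() rework above routes all output through a struct squashfs_page_actor, so the caller decides whether decompressed data lands in cache buffers or, with SQUASHFS_FILE_DIRECT, directly in page-cache pages. From the caller's side the protocol is: build an actor over an array of page-sized buffers, pass it to squashfs_read_data(), then free it. A condensed sketch following the squashfs_read_table() hunk nearby:

	/* data[] holds 'pages' buffers of PAGE_CACHE_SIZE bytes each */
	struct squashfs_page_actor *actor;
	int res;

	actor = squashfs_page_actor_init(data, pages, length);
	if (actor == NULL)
		return -ENOMEM;

	res = squashfs_read_data(sb, block,
				 length | SQUASHFS_COMPRESSED_BIT_BLOCK,
				 NULL, actor);
	kfree(actor);

Inside squashfs_read_data() the producer no longer indexes a page array directly; it walks the actor with squashfs_first_page(), squashfs_next_page() and a closing squashfs_finish_page(), as the uncompressed-copy loop above shows.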
*/ int in, pg_offset = 0; + void *data = squashfs_first_page(output); for (bytes = length; k < b; k++) { in = min(bytes, msblk->devblksize - offset); bytes -= in; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto block_release; while (in) { if (pg_offset == PAGE_CACHE_SIZE) { - page++; + data = squashfs_next_page(output); pg_offset = 0; } avail = min_t(int, in, PAGE_CACHE_SIZE - pg_offset); - memcpy(buffer[page] + pg_offset, - bh[k]->b_data + offset, avail); + memcpy(data + pg_offset, bh[k]->b_data + offset, + avail); in -= avail; pg_offset += avail; offset += avail; @@ -191,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; put_bh(bh[k]); } + squashfs_finish_page(output); } kfree(bh); diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index af0b7380259..1cb70a0b216 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -56,6 +56,7 @@ #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" +#include "page_actor.h" /* * Look-up block in cache, and increment usage count. If not in cache, read @@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->error = 0; spin_unlock(&cache->lock); - entry->length = squashfs_read_data(sb, entry->data, - block, length, &entry->next_index, - cache->block_size, cache->pages); + entry->length = squashfs_read_data(sb, block, length, + &entry->next_index, entry->actor); spin_lock(&cache->lock); @@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) kfree(cache->entry[i].data[j]); kfree(cache->entry[i].data); } + kfree(cache->entry[i].actor); } kfree(cache->entry); @@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries, goto cleanup; } } + + entry->actor = squashfs_page_actor_init(entry->data, + cache->pages, 0); + if (entry->actor == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } } return cache; @@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; int i, res; void *table, *buffer, **data; + struct squashfs_page_actor *actor; table = buffer = kmalloc(length, GFP_KERNEL); if (table == NULL) @@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length) goto failed; } + actor = squashfs_page_actor_init(data, pages, length); + if (actor == NULL) { + res = -ENOMEM; + goto failed2; + } + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; - res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); + res = squashfs_read_data(sb, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor); kfree(data); + kfree(actor); if (res < 0) goto failed; return table; +failed2: + kfree(data); failed: kfree(table); return ERR_PTR(res); diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 3f6271d86ab..ac22fe73b0a 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -30,6 +30,7 @@ #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" +#include "page_actor.h" /* * This file (and decompressor.h) implements a decompressor framework for @@ -37,29 +38,29 @@ */ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { - NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 + NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; #ifndef CONFIG_SQUASHFS_LZO static const struct 
squashfs_decompressor squashfs_lzo_comp_ops = { - NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 + NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 }; #endif #ifndef CONFIG_SQUASHFS_XZ static const struct squashfs_decompressor squashfs_xz_comp_ops = { - NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 + NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 }; #endif #ifndef CONFIG_SQUASHFS_ZLIB static const struct squashfs_decompressor squashfs_zlib_comp_ops = { - NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 + NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 }; #endif static const struct squashfs_decompressor squashfs_unknown_comp_ops = { - NULL, NULL, NULL, 0, "unknown", 0 + NULL, NULL, NULL, NULL, 0, "unknown", 0 }; static const struct squashfs_decompressor *decompressor[] = { @@ -83,10 +84,11 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id) } -void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) +static void *get_comp_opts(struct super_block *sb, unsigned short flags) { struct squashfs_sb_info *msblk = sb->s_fs_info; - void *strm, *buffer = NULL; + void *buffer = NULL, *comp_opts; + struct squashfs_page_actor *actor = NULL; int length = 0; /* @@ -94,23 +96,46 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) */ if (SQUASHFS_COMP_OPTS(flags)) { buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); - if (buffer == NULL) - return ERR_PTR(-ENOMEM); + if (buffer == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } + + actor = squashfs_page_actor_init(&buffer, 1, 0); + if (actor == NULL) { + comp_opts = ERR_PTR(-ENOMEM); + goto out; + } - length = squashfs_read_data(sb, &buffer, - sizeof(struct squashfs_super_block), 0, NULL, - PAGE_CACHE_SIZE, 1); + length = squashfs_read_data(sb, + sizeof(struct squashfs_super_block), 0, NULL, actor); if (length < 0) { - strm = ERR_PTR(length); - goto finished; + comp_opts = ERR_PTR(length); + goto out; } } - strm = msblk->decompressor->init(msblk, buffer, length); + comp_opts = squashfs_comp_opts(msblk, buffer, length); -finished: +out: + kfree(actor); kfree(buffer); + return comp_opts; +} + + +void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + void *stream, *comp_opts = get_comp_opts(sb, flags); + + if (IS_ERR(comp_opts)) + return comp_opts; + + stream = squashfs_decompressor_create(msblk, comp_opts); + if (IS_ERR(stream)) + kfree(comp_opts); - return strm; + return stream; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index 330073e2902..af098532180 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -24,28 +24,22 @@ */ struct squashfs_decompressor { - void *(*init)(struct squashfs_sb_info *, void *, int); + void *(*init)(struct squashfs_sb_info *, void *); + void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); - int (*decompress)(struct squashfs_sb_info *, void **, - struct buffer_head **, int, int, int, int, int); + int (*decompress)(struct squashfs_sb_info *, void *, + struct buffer_head **, int, int, int, + struct squashfs_page_actor *); int id; char *name; int supported; }; -static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, - void *s) +static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int length) { - if (msblk->decompressor) - msblk->decompressor->free(s); -} - -static inline int squashfs_decompress(struct squashfs_sb_info *msblk, - void **buffer, struct 
buffer_head **bh, int b, int offset, int length,
-	int srclength, int pages)
-{
-	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
-		length, srclength, pages);
+	return msblk->decompressor->comp_opts ?
+		msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
 }
 
 #ifdef CONFIG_SQUASHFS_XZ
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
new file mode 100644
index 00000000000..d6008a63647
--- /dev/null
+++ b/fs/squashfs/decompressor_multi.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2013
+ * Minchan Kim <minchan@kernel.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression in the
+ * decompressor framework.
+ */
+
+
+/*
+ * The limit is twice the number of online CPUs because a CPU can
+ * issue a new I/O request while it is waiting for a previous one
+ * to complete.
+ */
+#define MAX_DECOMPRESSOR	(num_online_cpus() * 2)
+
+
+int squashfs_max_decompressors(void)
+{
+	return MAX_DECOMPRESSOR;
+}
+
+
+struct squashfs_stream {
+	void			*comp_opts;
+	struct list_head	strm_list;
+	struct mutex		mutex;
+	int			avail_decomp;
+	wait_queue_head_t	wait;
+};
+
+
+struct decomp_stream {
+	void *stream;
+	struct list_head list;
+};
+
+
+static void put_decomp_stream(struct decomp_stream *decomp_strm,
+				struct squashfs_stream *stream)
+{
+	mutex_lock(&stream->mutex);
+	list_add(&decomp_strm->list, &stream->strm_list);
+	mutex_unlock(&stream->mutex);
+	wake_up(&stream->wait);
+}
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+				void *comp_opts)
+{
+	struct squashfs_stream *stream;
+	struct decomp_stream *decomp_strm = NULL;
+	int err = -ENOMEM;
+
+	stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+	if (!stream)
+		goto out;
+
+	stream->comp_opts = comp_opts;
+	mutex_init(&stream->mutex);
+	INIT_LIST_HEAD(&stream->strm_list);
+	init_waitqueue_head(&stream->wait);
+
+	/*
+	 * Allocate one decompressor up front as the default, so that
+	 * if a later dynamic allocation fails we can always fall back
+	 * to it and the filesystem keeps working.
+	 */
+	decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+	if (!decomp_strm)
+		goto out;
+
+	decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+	if (IS_ERR(decomp_strm->stream)) {
+		err = PTR_ERR(decomp_strm->stream);
+		goto out;
+	}
+
+	list_add(&decomp_strm->list, &stream->strm_list);
+	stream->avail_decomp = 1;
+	return stream;
+
+out:
+	kfree(decomp_strm);
+	kfree(stream);
+	return ERR_PTR(err);
+}
+
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+	struct squashfs_stream *stream = msblk->stream;
+	if (stream) {
+		struct decomp_stream *decomp_strm;
+
+		while (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+						struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			msblk->decompressor->free(decomp_strm->stream);
+			kfree(decomp_strm);
+			stream->avail_decomp--;
+		}
+		WARN_ON(stream->avail_decomp);
+		kfree(stream->comp_opts);
+		kfree(stream);
+	}
+}
+
+
+static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
+					struct squashfs_stream *stream)
+{
+	struct decomp_stream *decomp_strm;
+
+	while (1) {
+		mutex_lock(&stream->mutex);
+
+		/* There is an available decomp_stream */
+		if (!list_empty(&stream->strm_list)) {
+			decomp_strm = list_entry(stream->strm_list.prev,
+				struct decomp_stream, list);
+			list_del(&decomp_strm->list);
+			mutex_unlock(&stream->mutex);
+			break;
+		}
+
+		/*
+		 * No decompressor is available and we have already
+		 * allocated the maximum, so wait for another user to
+		 * release one.
+		 */
+		if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+			goto wait;
+
+		/* Allocate a new decompressor */
+		decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+		if (!decomp_strm)
+			goto wait;
+
+		decomp_strm->stream = msblk->decompressor->init(msblk,
+						stream->comp_opts);
+		if (IS_ERR(decomp_strm->stream)) {
+			kfree(decomp_strm);
+			goto wait;
+		}
+
+		stream->avail_decomp++;
+		WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+
+		mutex_unlock(&stream->mutex);
+		break;
wait:
+		/*
+		 * If memory is tight, wait for another user to release
+		 * a decompressor rather than pressuring the VM: a new
+		 * allocation here could cause page cache thrashing.
+		 */
+		mutex_unlock(&stream->mutex);
+		wait_event(stream->wait,
+			!list_empty(&stream->strm_list));
+	}
+
+	return decomp_strm;
+}
+
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+	int b, int offset, int length, struct squashfs_page_actor *output)
+{
+	int res;
+	struct squashfs_stream *stream = msblk->stream;
+	struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
+	res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
+		bh, b, offset, length, output);
+	put_decomp_stream(decomp_stream, stream);
+	if (res < 0)
+		ERROR("%s decompression failed, data probably corrupt\n",
+			msblk->decompressor->name);
+	return res;
+}
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644
index 00000000000..23a9c28ad8e
--- /dev/null
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements multi-threaded decompression using percpu + * variables, one thread per cpu core. + */ + +struct squashfs_stream { + void *stream; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu; + int err, cpu; + + percpu = alloc_percpu(struct squashfs_stream); + if (percpu == NULL) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + } + + kfree(comp_opts); + return (__force void *) percpu; + +out: + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + if (!IS_ERR_OR_NULL(stream->stream)) + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream; + int cpu; + + if (msblk->stream) { + for_each_possible_cpu(cpu) { + stream = per_cpu_ptr(percpu, cpu); + msblk->decompressor->free(stream->stream); + } + free_percpu(percpu); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; + struct squashfs_stream *stream = get_cpu_ptr(percpu); + int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + put_cpu_ptr(stream); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return num_possible_cpus(); +} diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c new file mode 100644 index 00000000000..a6c75929a00 --- /dev/null +++ b/fs/squashfs/decompressor_single.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "decompressor.h" +#include "squashfs.h" + +/* + * This file implements single-threaded decompression in the + * decompressor framework + */ + +struct squashfs_stream { + void *stream; + struct mutex mutex; +}; + +void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, + void *comp_opts) +{ + struct squashfs_stream *stream; + int err = -ENOMEM; + + stream = kmalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto out; + + stream->stream = msblk->decompressor->init(msblk, comp_opts); + if (IS_ERR(stream->stream)) { + err = PTR_ERR(stream->stream); + goto out; + } + + kfree(comp_opts); + mutex_init(&stream->mutex); + return stream; + +out: + kfree(stream); + return ERR_PTR(err); +} + +void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +{ + struct squashfs_stream *stream = msblk->stream; + + if (stream) { + msblk->decompressor->free(stream->stream); + kfree(stream); + } +} + +int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, + int b, int offset, int length, struct squashfs_page_actor *output) +{ + int res; + struct squashfs_stream *stream = msblk->stream; + + mutex_lock(&stream->mutex); + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + mutex_unlock(&stream->mutex); + + if (res < 0) + ERROR("%s decompression failed, data probably corrupt\n", + msblk->decompressor->name); + + return res; +} + +int squashfs_max_decompressors(void) +{ + return 1; +} diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 8ca62c28fe1..e5c9689062b 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) return le32_to_cpu(size); } - -static int squashfs_readpage(struct file *file, struct page *page) +/* Copy data into page cache */ +void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer, + int bytes, int offset) { struct inode *inode = page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - int bytes, i, offset = 0, sparse = 0; - struct squashfs_cache_entry *buffer = NULL; void *pageaddr; - - int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; - int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); - int start_index = page->index & ~mask; - int end_index = start_index | mask; - int file_end = i_size_read(inode) >> msblk->block_log; - - TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", - page->index, squashfs_i(inode)->start); - - if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) - goto out; - - if (index < file_end || squashfs_i(inode)->fragment_block == - SQUASHFS_INVALID_BLK) { - /* - * Reading a datablock from disk. Need to read block list - * to get location and block size. - */ - u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) - goto error_out; - - if (bsize == 0) { /* hole */ - bytes = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - sparse = 1; - } else { - /* - * Read and decompress datablock. 
- */ - buffer = squashfs_get_datablock(inode->i_sb, - block, bsize); - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x" - "\n", block, bsize); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = buffer->length; - } - } else { - /* - * Datablock is stored inside a fragment (tail-end packed - * block). - */ - buffer = squashfs_get_fragment(inode->i_sb, - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - - if (buffer->error) { - ERROR("Unable to read page, block %llx, size %x\n", - squashfs_i(inode)->fragment_block, - squashfs_i(inode)->fragment_size); - squashfs_cache_put(buffer); - goto error_out; - } - bytes = i_size_read(inode) & (msblk->block_size - 1); - offset = squashfs_i(inode)->fragment_offset; - } + int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int start_index = page->index & ~mask, end_index = start_index | mask; /* * Loop copying datablock into pages. As the datablock likely covers @@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page) for (i = start_index; i <= end_index && bytes > 0; i++, bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { struct page *push_page; - int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0; TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); @@ -475,11 +413,75 @@ skip_page: if (i != page->index) page_cache_release(push_page); } +} + +/* Read datablock stored packed inside a fragment (tail-end packed block) */ +static int squashfs_readpage_fragment(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + else + squashfs_copy_cache(page, buffer, i_size_read(inode) & + (msblk->block_size - 1), + squashfs_i(inode)->fragment_offset); + + squashfs_cache_put(buffer); + return res; +} - if (!sparse) - squashfs_cache_put(buffer); +static int squashfs_readpage_sparse(struct page *page, int index, int file_end) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes = index == file_end ? 
+ (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + squashfs_copy_cache(page, NULL, bytes, 0); return 0; +} + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int file_end = i_size_read(inode) >> msblk->block_log; + int res; + void *pageaddr; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) + res = squashfs_readpage_sparse(page, index, file_end); + else + res = squashfs_readpage_block(page, block, bsize); + } else + res = squashfs_readpage_fragment(page); + + if (!res) + return 0; error_out: SetPageError(page); diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c new file mode 100644 index 00000000000..f2310d2a201 --- /dev/null +++ b/fs/squashfs/file_cache.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* Read separately compressed datablock and memcopy into page cache */ +int squashfs_readpage_block(struct page *page, u64 block, int bsize) +{ + struct inode *i = page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int res = buffer->error; + + if (res) + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + else + squashfs_copy_cache(page, buffer, buffer->length, 0); + + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c new file mode 100644 index 00000000000..2943b2bfae4 --- /dev/null +++ b/fs/squashfs/file_direct.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+	int pages, struct page **page);
+
+/* Read separately compressed datablock directly into page cache */
+int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+{
+	struct inode *inode = target_page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+
+	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int start_index = target_page->index & ~mask;
+	int end_index = start_index | mask;
+	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+	struct page **page;
+	struct squashfs_page_actor *actor;
+	void *pageaddr;
+
+	if (end_index > file_end)
+		end_index = file_end;
+
+	pages = end_index - start_index + 1;
+
+	page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+	if (page == NULL)
+		return res;
+
+	/*
+	 * Create a "page actor" which will kmap and kunmap the
+	 * page cache pages appropriately within the decompressor
+	 */
+	actor = squashfs_page_actor_init_special(page, pages, 0);
+	if (actor == NULL)
+		goto out;
+
+	/* Try to grab all the pages covered by the Squashfs block */
+	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+		page[i] = (n == target_page->index) ? target_page :
+			grab_cache_page_nowait(target_page->mapping, n);
+
+		if (page[i] == NULL) {
+			missing_pages++;
+			continue;
+		}
+
+		if (PageUptodate(page[i])) {
+			unlock_page(page[i]);
+			page_cache_release(page[i]);
+			page[i] = NULL;
+			missing_pages++;
+		}
+	}
+
+	if (missing_pages) {
+		/*
+		 * Couldn't get one or more pages.  Either this page has
+		 * been VM reclaimed while others are still in the page
+		 * cache and uptodate, or we're racing with another
+		 * thread in squashfs_readpage also trying to grab them.
+		 * Fall back to using an intermediate buffer.
+		 */
+		res = squashfs_read_cache(target_page, block, bsize, pages,
+			page);
+		goto out;
+	}
+
+	/* Decompress directly into the page cache buffers */
+	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+	if (res < 0)
+		goto mark_errored;
+
+	/* Last page may have trailing bytes not filled */
+	bytes = res % PAGE_CACHE_SIZE;
+	if (bytes) {
+		pageaddr = kmap_atomic(page[pages - 1]);
+		memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+		kunmap_atomic(pageaddr);
+	}
+
+	/* Mark pages as uptodate, unlock and release */
+	for (i = 0; i < pages; i++) {
+		flush_dcache_page(page[i]);
+		SetPageUptodate(page[i]);
+		unlock_page(page[i]);
+		if (page[i] != target_page)
+			page_cache_release(page[i]);
+	}
+
+	kfree(actor);
+	kfree(page);
+
+	return 0;
+
+mark_errored:
+	/* Decompression failed, mark pages as errored. 
Target_page is + * dealt with by the caller + */ + for (i = 0; i < pages; i++) { + if (page[i] == target_page) + continue; + flush_dcache_page(page[i]); + SetPageError(page[i]); + unlock_page(page[i]); + page_cache_release(page[i]); + } + +out: + kfree(actor); + kfree(page); + return res; +} + + +static int squashfs_read_cache(struct page *target_page, u64 block, int bsize, + int pages, struct page **page) +{ + struct inode *i = target_page->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb, + block, bsize); + int bytes = buffer->length, res = buffer->error, n, offset = 0; + void *pageaddr; + + if (res) { + ERROR("Unable to read page, block %llx, size %x\n", block, + bsize); + goto out; + } + + for (n = 0; n < pages && bytes > 0; n++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + int avail = min_t(int, bytes, PAGE_CACHE_SIZE); + + if (page[n] == NULL) + continue; + + pageaddr = kmap_atomic(page[n]); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr); + flush_dcache_page(page[n]); + SetPageUptodate(page[n]); + unlock_page(page[n]); + if (page[n] != target_page) + page_cache_release(page[n]); + } + +out: + squashfs_cache_put(buffer); + return res; +} diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 00f4dfc5f08..244b9fbfff7 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -31,13 +31,14 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_lzo { void *input; void *output; }; -static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) +static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); @@ -74,22 +75,16 @@ static void lzo_free(void *strm) } -static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { - struct squashfs_lzo *stream = msblk->stream; - void *buff = stream->input; + struct squashfs_lzo *stream = strm; + void *buff = stream->input, *data; int avail, i, bytes = length, res; - size_t out_len = srclength; - - mutex_lock(&msblk->read_data_mutex); + size_t out_len = output->length; for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - avail = min(bytes, msblk->devblksize - offset); memcpy(buff, bh[i]->b_data + offset, avail); buff += avail; @@ -104,24 +99,24 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto failed; res = bytes = (int)out_len; - for (i = 0, buff = stream->output; bytes && i < pages; i++) { - avail = min_t(int, bytes, PAGE_CACHE_SIZE); - memcpy(buffer[i], buff, avail); - buff += avail; - bytes -= avail; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } else { + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } } + squashfs_finish_page(output); - mutex_unlock(&msblk->read_data_mutex); return res; -block_release: - for (; i < b; i++) - put_bh(bh[i]); - failed: - 
mutex_unlock(&msblk->read_data_mutex); - - ERROR("lzo decompression failed, data probably corrupt\n"); return -EIO; } diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c new file mode 100644 index 00000000000..5a1c11f5644 --- /dev/null +++ b/fs/squashfs/page_actor.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include "page_actor.h" + +/* + * This file contains implementations of page_actor for decompressing into + * an intermediate buffer, and for decompressing directly into the + * page cache. + * + * Calling code should avoid sleeping between calls to squashfs_first_page() + * and squashfs_finish_page(). + */ + +/* Implementation of page_actor for decompressing into intermediate buffer */ +static void *cache_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->buffer[0]; +} + +static void *cache_next_page(struct squashfs_page_actor *actor) +{ + if (actor->next_page == actor->pages) + return NULL; + + return actor->buffer[actor->next_page++]; +} + +static void cache_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} + +struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->buffer = buffer; + actor->pages = pages; + actor->next_page = 0; + actor->squashfs_first_page = cache_first_page; + actor->squashfs_next_page = cache_next_page; + actor->squashfs_finish_page = cache_finish_page; + return actor; +} + +/* Implementation of page_actor for decompressing directly into page cache. */ +static void *direct_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->pageaddr = kmap_atomic(actor->page[0]); +} + +static void *direct_next_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); + + return actor->pageaddr = actor->next_page == actor->pages ? NULL : + kmap_atomic(actor->page[actor->next_page++]); +} + +static void direct_finish_page(struct squashfs_page_actor *actor) +{ + if (actor->pageaddr) + kunmap_atomic(actor->pageaddr); +} + +struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + actor->pageaddr = NULL; + actor->squashfs_first_page = direct_first_page; + actor->squashfs_next_page = direct_next_page; + actor->squashfs_finish_page = direct_finish_page; + return actor; +} diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h new file mode 100644 index 00000000000..26dd82008b8 --- /dev/null +++ b/fs/squashfs/page_actor.h @@ -0,0 +1,81 @@ +#ifndef PAGE_ACTOR_H +#define PAGE_ACTOR_H +/* + * Copyright (c) 2013 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
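+ *
+ * A "page actor" abstracts the decompressors' output buffer, so the
+ * same decompressor code can write either into a table of kmalloc'd
+ * buffers or directly into kmapped page cache pages.  A minimal
+ * sketch of the calling pattern (illustrative only; "actor" stands
+ * for any initialised struct squashfs_page_actor):
+ *
+ *	void *data = squashfs_first_page(actor);
+ *	while (data != NULL) {
+ *		... write up to PAGE_CACHE_SIZE bytes at data ...
+ *		data = squashfs_next_page(actor);
+ *	}
+ *	squashfs_finish_page(actor);
+ *
+ * Callers should not sleep between squashfs_first_page() and
+ * squashfs_finish_page(), as the direct variant keeps the current
+ * page mapped with kmap_atomic().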
+ */ + +#ifndef CONFIG_SQUASHFS_FILE_DIRECT +struct squashfs_page_actor { + void **page; + int pages; + int length; + int next_page; +}; + +static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, + int pages, int length) +{ + struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); + + if (actor == NULL) + return NULL; + + actor->length = length ? : pages * PAGE_CACHE_SIZE; + actor->page = page; + actor->pages = pages; + actor->next_page = 0; + return actor; +} + +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + actor->next_page = 1; + return actor->page[0]; +} + +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->next_page == actor->pages ? NULL : + actor->page[actor->next_page++]; +} + +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + /* empty */ +} +#else +struct squashfs_page_actor { + union { + void **buffer; + struct page **page; + }; + void *pageaddr; + void *(*squashfs_first_page)(struct squashfs_page_actor *); + void *(*squashfs_next_page)(struct squashfs_page_actor *); + void (*squashfs_finish_page)(struct squashfs_page_actor *); + int pages; + int length; + int next_page; +}; + +extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int); +extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page + **, int, int); +static inline void *squashfs_first_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_first_page(actor); +} +static inline void *squashfs_next_page(struct squashfs_page_actor *actor) +{ + return actor->squashfs_next_page(actor); +} +static inline void squashfs_finish_page(struct squashfs_page_actor *actor) +{ + actor->squashfs_finish_page(actor); +} +#endif +#endif diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index d1266516ed0..9e1bb79f7e6 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -28,8 +28,8 @@ #define WARNING(s, args...) 
pr_warning("SQUASHFS: "s, ## args) /* block.c */ -extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int, int); +extern int squashfs_read_data(struct super_block *, u64, int, u64 *, + struct squashfs_page_actor *); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); @@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int); /* decompressor.c */ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); -extern void *squashfs_decompressor_init(struct super_block *, unsigned short); +extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); + +/* decompressor_xxx.c */ +extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); +extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, + int, int, int, struct squashfs_page_actor *); +extern int squashfs_max_decompressors(void); /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, @@ -59,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); extern __le64 *squashfs_read_fragment_index_table(struct super_block *, u64, u64, unsigned int); +/* file.c */ +void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int, + int); + +/* file_xxx.c */ +extern int squashfs_readpage_block(struct page *, u64, int); + /* id.c */ extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 52934a22f29..1da565cb50c 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -50,6 +50,7 @@ struct squashfs_cache_entry { wait_queue_head_t wait_queue; struct squashfs_cache *cache; void **data; + struct squashfs_page_actor *actor; }; struct squashfs_sb_info { @@ -63,10 +64,9 @@ struct squashfs_sb_info { __le64 *id_table; __le64 *fragment_index; __le64 *xattr_id_table; - struct mutex read_data_mutex; struct mutex meta_index_mutex; struct meta_index *meta_index; - void *stream; + struct squashfs_stream *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60553a9053c..202df6312d4 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); - mutex_init(&msblk->read_data_mutex); mutex_init(&msblk->meta_index_mutex); /* @@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; /* Allocate read_page block */ - msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + msblk->read_page = squashfs_cache_init("data", + squashfs_max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { ERROR("Failed to allocate read_page block\n"); goto failed_mount; } - msblk->stream = squashfs_decompressor_init(sb, flags); + msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; @@ -336,7 +336,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); 
squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_free(msblk, msblk->stream); + squashfs_decompressor_destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -383,7 +383,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_free(sbi, sbi->stream); + squashfs_decompressor_destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 1760b7d108f..c609624e4b8 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -32,44 +32,70 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" struct squashfs_xz { struct xz_dec *state; struct xz_buf buf; }; -struct comp_opts { +struct disk_comp_opts { __le32 dictionary_size; __le32 flags; }; -static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, - int len) +struct comp_opts { + int dict_size; +}; + +static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) { - struct comp_opts *comp_opts = buff; - struct squashfs_xz *stream; - int dict_size = msblk->block_size; - int err, n; + struct disk_comp_opts *comp_opts = buff; + struct comp_opts *opts; + int err = 0, n; + + opts = kmalloc(sizeof(*opts), GFP_KERNEL); + if (opts == NULL) { + err = -ENOMEM; + goto out2; + } if (comp_opts) { /* check compressor options are the expected length */ if (len < sizeof(*comp_opts)) { err = -EIO; - goto failed; + goto out; } - dict_size = le32_to_cpu(comp_opts->dictionary_size); + opts->dict_size = le32_to_cpu(comp_opts->dictionary_size); /* the dictionary size should be 2^n or 2^n+2^(n+1) */ - n = ffs(dict_size) - 1; - if (dict_size != (1 << n) && dict_size != (1 << n) + + n = ffs(opts->dict_size) - 1; + if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) + (1 << (n + 1))) { err = -EIO; - goto failed; + goto out; } - } + } else + /* use defaults */ + opts->dict_size = max_t(int, msblk->block_size, + SQUASHFS_METADATA_SIZE); + + return opts; + +out: + kfree(opts); +out2: + return ERR_PTR(err); +} + - dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); +static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct comp_opts *comp_opts = buff; + struct squashfs_xz *stream; + int err; stream = kmalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) { @@ -77,7 +103,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, goto failed; } - stream->state = xz_dec_init(XZ_PREALLOC, dict_size); + stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size); if (stream->state == NULL) { kfree(stream); err = -ENOMEM; @@ -103,42 +129,37 @@ static void squashfs_xz_free(void *strm) } -static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { enum xz_ret xz_err; - int avail, total = 0, k = 0, page = 0; - struct squashfs_xz *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + int avail, total = 0, k = 0; + struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); stream->buf.in_pos = 0; 
stream->buf.in_size = 0; stream->buf.out_pos = 0; stream->buf.out_size = PAGE_CACHE_SIZE; - stream->buf.out = buffer[page++]; + stream->buf.out = squashfs_first_page(output); do { if (stream->buf.in_pos == stream->buf.in_size && k < b) { avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->buf.in = bh[k]->b_data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; } - if (stream->buf.out_pos == stream->buf.out_size - && page < pages) { - stream->buf.out = buffer[page++]; - stream->buf.out_pos = 0; - total += PAGE_CACHE_SIZE; + if (stream->buf.out_pos == stream->buf.out_size) { + stream->buf.out = squashfs_next_page(output); + if (stream->buf.out != NULL) { + stream->buf.out_pos = 0; + total += PAGE_CACHE_SIZE; + } } xz_err = xz_dec_run(stream->state, &stream->buf); @@ -147,23 +168,14 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (xz_err == XZ_OK); - if (xz_err != XZ_STREAM_END) { - ERROR("xz_dec_run error, data probably corrupt\n"); - goto release_mutex; - } - - if (k < b) { - ERROR("xz_uncompress error, input remaining\n"); - goto release_mutex; - } + squashfs_finish_page(output); - total += stream->buf.out_pos; - mutex_unlock(&msblk->read_data_mutex); - return total; + if (xz_err != XZ_STREAM_END || k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return total + stream->buf.out_pos; +out: for (; k < b; k++) put_bh(bh[k]); @@ -172,6 +184,7 @@ release_mutex: const struct squashfs_decompressor squashfs_xz_comp_ops = { .init = squashfs_xz_init, + .comp_opts = squashfs_xz_comp_opts, .free = squashfs_xz_free, .decompress = squashfs_xz_uncompress, .id = XZ_COMPRESSION, diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 55d918fd2d8..8727caba688 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -32,8 +32,9 @@ #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" +#include "page_actor.h" -static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) +static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); if (stream == NULL) @@ -61,44 +62,37 @@ static void zlib_free(void *strm) } -static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, - struct buffer_head **bh, int b, int offset, int length, int srclength, - int pages) +static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0; - int k = 0, page = 0; - z_stream *stream = msblk->stream; - - mutex_lock(&msblk->read_data_mutex); + int zlib_err, zlib_init = 0, k = 0; + z_stream *stream = strm; - stream->avail_out = 0; + stream->avail_out = PAGE_CACHE_SIZE; + stream->next_out = squashfs_first_page(output); stream->avail_in = 0; do { if (stream->avail_in == 0 && k < b) { int avail = min(length, msblk->devblksize - offset); length -= avail; - wait_on_buffer(bh[k]); - if (!buffer_uptodate(bh[k])) - goto release_mutex; - stream->next_in = bh[k]->b_data + offset; stream->avail_in = avail; offset = 0; } - if (stream->avail_out == 0 && page < pages) { - stream->next_out = buffer[page++]; - stream->avail_out = PAGE_CACHE_SIZE; + if (stream->avail_out == 0) { + stream->next_out = squashfs_next_page(output); + if (stream->next_out != NULL) 
+ stream->avail_out = PAGE_CACHE_SIZE; } if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - ERROR("zlib_inflateInit returned unexpected " - "result 0x%x, srclength %d\n", - zlib_err, srclength); - goto release_mutex; + squashfs_finish_page(output); + goto out; } zlib_init = 1; } @@ -109,29 +103,21 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, put_bh(bh[k++]); } while (zlib_err == Z_OK); - if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } + squashfs_finish_page(output); - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) { - ERROR("zlib_inflate error, data probably corrupt\n"); - goto release_mutex; - } + if (zlib_err != Z_STREAM_END) + goto out; - if (k < b) { - ERROR("zlib_uncompress error, data remaining\n"); - goto release_mutex; - } + zlib_err = zlib_inflateEnd(stream); + if (zlib_err != Z_OK) + goto out; - length = stream->total_out; - mutex_unlock(&msblk->read_data_mutex); - return length; + if (k < b) + goto out; -release_mutex: - mutex_unlock(&msblk->read_data_mutex); + return stream->total_out; +out: for (; k < b; k++) put_bh(bh[k]); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 1c02da8bb7d..3ef11b22e75 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ + int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork( if (rsvd) tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) - goto error1; + goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. 
@@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork( } ASSERT(ip->i_d.di_anextents == 0); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); switch (ip->i_d.di_format) { @@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork( default: ASSERT(0); error = XFS_ERROR(EINVAL); - goto error1; + goto trans_cancel; } ASSERT(ip->i_afp == NULL); @@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork( if (logflags) xfs_trans_log_inode(tp, ip, logflags); if (error) - goto error2; + goto bmap_cancel; if (!xfs_sb_version_hasattr(&mp->m_sb) || (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { __int64_t sbfields = 0; @@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) - goto error2; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); -error2: + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: xfs_bmap_cancel(&flist); -error1: +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da88f167af7..02df7b408a2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -41,6 +41,7 @@ #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB @@ -718,8 +719,22 @@ xfs_mountfs( * Set the inode cluster size. * This may still be overridden by the file system * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. */ mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + xfs_info(mp, "Using inode cluster size of %d bytes", + mp->m_inode_cluster_size); + } /* * Set inode alignment fields diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d8101a10d8..a466c5e5826 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -112,7 +112,7 @@ typedef struct xfs_mount { __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 1bba7f60d94..50c3f561428 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,12 +111,14 @@ xfs_trans_log_inode( /* * First time we log the inode in a transaction, bump the inode change - * counter if it is configured for this to occur. + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. 
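+	 * Hence the open-coded increment of i_version below rather
+	 * than a call to inode_inc_iversion().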
*/ if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && IS_I_VERSION(VFS_I(ip))) { - inode_inc_iversion(VFS_I(ip)); - ip->i_d.di_changecount = VFS_I(ip)->i_version; + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; flags |= XFS_ILOG_CORE; } diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index d53d9f0627a..2fd59c0dae6 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -385,8 +385,7 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), - XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + xfs_calc_buf_res(1, 0) + xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels, 0) + diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 89c60b0f640..7b2de026a4f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -431,9 +431,9 @@ static inline acpi_handle acpi_get_child(acpi_handle handle, u64 addr) { return acpi_find_child(handle, addr, false); } +void acpi_preset_companion(struct device *dev, acpi_handle parent, u64 addr); int acpi_is_root_bridge(acpi_handle); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); -#define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)ACPI_HANDLE(dev)) int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h new file mode 100644 index 00000000000..e1e5a3e5dd1 --- /dev/null +++ b/include/crypto/hash_info.h @@ -0,0 +1,40 @@ +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#ifndef _CRYPTO_HASH_INFO_H +#define _CRYPTO_HASH_INFO_H + +#include <crypto/sha.h> +#include <crypto/md5.h> + +#include <uapi/linux/hash_info.h> + +/* not defined in include/crypto/ */ +#define RMD128_DIGEST_SIZE 16 +#define RMD160_DIGEST_SIZE 20 +#define RMD256_DIGEST_SIZE 32 +#define RMD320_DIGEST_SIZE 40 + +/* not defined in include/crypto/ */ +#define WP512_DIGEST_SIZE 64 +#define WP384_DIGEST_SIZE 48 +#define WP256_DIGEST_SIZE 32 + +/* not defined in include/crypto/ */ +#define TGR128_DIGEST_SIZE 16 +#define TGR160_DIGEST_SIZE 20 +#define TGR192_DIGEST_SIZE 24 + +extern const char *const hash_algo_name[HASH_ALGO__LAST]; +extern const int hash_digest_size[HASH_ALGO__LAST]; + +#endif /* _CRYPTO_HASH_INFO_H */ diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index f5b0224c996..fc09732613a 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -15,6 +15,7 @@ #define _LINUX_PUBLIC_KEY_H #include <linux/mpi.h> +#include <crypto/hash_info.h> enum pkey_algo { PKEY_ALGO_DSA, @@ -22,21 +23,11 @@ enum pkey_algo { PKEY_ALGO__LAST }; -extern const char *const pkey_algo[PKEY_ALGO__LAST]; +extern const char *const pkey_algo_name[PKEY_ALGO__LAST]; +extern const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST]; -enum pkey_hash_algo { - PKEY_HASH_MD4, - PKEY_HASH_MD5, - PKEY_HASH_SHA1, - PKEY_HASH_RIPE_MD_160, - PKEY_HASH_SHA256, - PKEY_HASH_SHA384, - PKEY_HASH_SHA512, - PKEY_HASH_SHA224, - PKEY_HASH__LAST -}; - -extern const char *const pkey_hash_algo[PKEY_HASH__LAST]; +/* asymmetric key implementation supports only up to SHA224 */ +#define PKEY_HASH__LAST (HASH_ALGO_SHA224 + 1) enum pkey_id_type { PKEY_ID_PGP, /* OpenPGP generated key ID */ @@ -44,7 +35,7 @@ enum pkey_id_type { PKEY_ID_TYPE__LAST }; -extern const char *const pkey_id_type[PKEY_ID_TYPE__LAST]; +extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST]; /* * Cryptographic data for the public-key subtype of the asymmetric key type. @@ -59,6 +50,7 @@ struct public_key { #define PKEY_CAN_DECRYPT 0x02 #define PKEY_CAN_SIGN 0x04 #define PKEY_CAN_VERIFY 0x08 + enum pkey_algo pkey_algo : 8; enum pkey_id_type id_type : 8; union { MPI mpi[5]; @@ -88,7 +80,8 @@ struct public_key_signature { u8 *digest; u8 digest_size; /* Number of bytes in digest */ u8 nr_mpi; /* Occupancy of mpi[] */ - enum pkey_hash_algo pkey_hash_algo : 8; + enum pkey_algo pkey_algo : 8; + enum hash_algo pkey_hash_algo : 8; union { MPI mpi[2]; struct { diff --git a/include/keys/big_key-type.h b/include/keys/big_key-type.h new file mode 100644 index 00000000000..d69bc8af329 --- /dev/null +++ b/include/keys/big_key-type.h @@ -0,0 +1,25 @@ +/* Big capacity key type. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _KEYS_BIG_KEY_TYPE_H +#define _KEYS_BIG_KEY_TYPE_H + +#include <linux/key-type.h> + +extern struct key_type key_type_big_key; + +extern int big_key_instantiate(struct key *key, struct key_preparsed_payload *prep); +extern void big_key_revoke(struct key *key); +extern void big_key_destroy(struct key *key); +extern void big_key_describe(const struct key *big_key, struct seq_file *m); +extern long big_key_read(const struct key *key, char __user *buffer, size_t buflen); + +#endif /* _KEYS_BIG_KEY_TYPE_H */ diff --git a/include/keys/keyring-type.h b/include/keys/keyring-type.h index cf49159b0e3..fca5c62340a 100644 --- a/include/keys/keyring-type.h +++ b/include/keys/keyring-type.h @@ -1,6 +1,6 @@ /* Keyring key type * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -13,19 +13,6 @@ #define _KEYS_KEYRING_TYPE_H #include <linux/key.h> -#include <linux/rcupdate.h> - -/* - * the keyring payload contains a list of the keys to which the keyring is - * subscribed - */ -struct keyring_list { - struct rcu_head rcu; /* RCU deletion hook */ - unsigned short maxkeys; /* max keys this list can hold */ - unsigned short nkeys; /* number of keys currently held */ - unsigned short delkey; /* key to be unlinked by RCU */ - struct key __rcu *keys[0]; -}; - +#include <linux/assoc_array.h> #endif /* _KEYS_KEYRING_TYPE_H */ diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h new file mode 100644 index 00000000000..8dabc399bd1 --- /dev/null +++ b/include/keys/system_keyring.h @@ -0,0 +1,23 @@ +/* System keyring containing trusted public keys. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _KEYS_SYSTEM_KEYRING_H +#define _KEYS_SYSTEM_KEYRING_H + +#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING + +#include <linux/key.h> + +extern struct key *system_trusted_keyring; + +#endif + +#endif /* _KEYS_SYSTEM_KEYRING_H */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b0972c4ce81..d9099b15b47 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -44,6 +44,20 @@ #include <acpi/acpi_numa.h> #include <asm/acpi.h> +static inline acpi_handle acpi_device_handle(struct acpi_device *adev) +{ + return adev ? 
adev->handle : NULL; +} + +#define ACPI_COMPANION(dev) ((dev)->acpi_node.companion) +#define ACPI_COMPANION_SET(dev, adev) ACPI_COMPANION(dev) = (adev) +#define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) + +static inline const char *acpi_dev_name(struct acpi_device *adev) +{ + return dev_name(&adev->dev); +} + enum acpi_irq_model_id { ACPI_IRQ_MODEL_PIC = 0, ACPI_IRQ_MODEL_IOAPIC, @@ -401,6 +415,15 @@ static inline bool acpi_driver_match_device(struct device *dev, #define acpi_disabled 1 +#define ACPI_COMPANION(dev) (NULL) +#define ACPI_COMPANION_SET(dev, adev) do { } while (0) +#define ACPI_HANDLE(dev) (NULL) + +static inline const char *acpi_dev_name(struct acpi_device *adev) +{ + return NULL; +} + static inline void acpi_early_init(void) { } static inline int early_acpi_boot_init(void) diff --git a/include/linux/assoc_array.h b/include/linux/assoc_array.h new file mode 100644 index 00000000000..9a193b84238 --- /dev/null +++ b/include/linux/assoc_array.h @@ -0,0 +1,92 @@ +/* Generic associative array implementation. + * + * See Documentation/assoc_array.txt for information. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_ASSOC_ARRAY_H +#define _LINUX_ASSOC_ARRAY_H + +#ifdef CONFIG_ASSOCIATIVE_ARRAY + +#include <linux/types.h> + +#define ASSOC_ARRAY_KEY_CHUNK_SIZE BITS_PER_LONG /* Key data retrieved in chunks of this size */ + +/* + * Generic associative array. + */ +struct assoc_array { + struct assoc_array_ptr *root; /* The node at the root of the tree */ + unsigned long nr_leaves_on_tree; +}; + +/* + * Operations on objects and index keys for use by array manipulation routines. + */ +struct assoc_array_ops { + /* Method to get a chunk of an index key from caller-supplied data */ + unsigned long (*get_key_chunk)(const void *index_key, int level); + + /* Method to get a piece of an object's index key */ + unsigned long (*get_object_key_chunk)(const void *object, int level); + + /* Is this the object we're looking for? */ + bool (*compare_object)(const void *object, const void *index_key); + + /* How different are two objects, to a bit position in their keys? (or + * -1 if they're the same) + */ + int (*diff_objects)(const void *a, const void *b); + + /* Method to free an object. */ + void (*free_object)(void *object); +}; + +/* + * Access and manipulation functions. 
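+ *
+ * Note that insertion, deletion and clearing do not modify the array
+ * directly: they return an edit script that the caller subsequently
+ * commits with assoc_array_apply_edit() or discards with
+ * assoc_array_cancel_edit().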
+ */ +struct assoc_array_edit; + +static inline void assoc_array_init(struct assoc_array *array) +{ + array->root = NULL; + array->nr_leaves_on_tree = 0; +} + +extern int assoc_array_iterate(const struct assoc_array *array, + int (*iterator)(const void *object, + void *iterator_data), + void *iterator_data); +extern void *assoc_array_find(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key); +extern void assoc_array_destroy(struct assoc_array *array, + const struct assoc_array_ops *ops); +extern struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + void *object); +extern void assoc_array_insert_set_object(struct assoc_array_edit *edit, + void *object); +extern struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key); +extern struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, + const struct assoc_array_ops *ops); +extern void assoc_array_apply_edit(struct assoc_array_edit *edit); +extern void assoc_array_cancel_edit(struct assoc_array_edit *edit); +extern int assoc_array_gc(struct assoc_array *array, + const struct assoc_array_ops *ops, + bool (*iterator)(void *object, void *iterator_data), + void *iterator_data); + +#endif /* CONFIG_ASSOCIATIVE_ARRAY */ +#endif /* _LINUX_ASSOC_ARRAY_H */ diff --git a/include/linux/assoc_array_priv.h b/include/linux/assoc_array_priv.h new file mode 100644 index 00000000000..711275e6681 --- /dev/null +++ b/include/linux/assoc_array_priv.h @@ -0,0 +1,182 @@ +/* Private definitions for the generic associative array implementation. + * + * See Documentation/assoc_array.txt for information. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_ASSOC_ARRAY_PRIV_H +#define _LINUX_ASSOC_ARRAY_PRIV_H + +#ifdef CONFIG_ASSOCIATIVE_ARRAY + +#include <linux/assoc_array.h> + +#define ASSOC_ARRAY_FAN_OUT 16 /* Number of slots per node */ +#define ASSOC_ARRAY_FAN_MASK (ASSOC_ARRAY_FAN_OUT - 1) +#define ASSOC_ARRAY_LEVEL_STEP (ilog2(ASSOC_ARRAY_FAN_OUT)) +#define ASSOC_ARRAY_LEVEL_STEP_MASK (ASSOC_ARRAY_LEVEL_STEP - 1) +#define ASSOC_ARRAY_KEY_CHUNK_MASK (ASSOC_ARRAY_KEY_CHUNK_SIZE - 1) +#define ASSOC_ARRAY_KEY_CHUNK_SHIFT (ilog2(BITS_PER_LONG)) + +/* + * Undefined type representing a pointer with type information in the bottom + * two bits. + */ +struct assoc_array_ptr; + +/* + * An N-way node in the tree. + * + * Each slot contains one of four things: + * + * (1) Nothing (NULL). + * + * (2) A leaf object (pointer type 0). + * + * (3) A next-level node (pointer type 1, subtype 0). + * + * (4) A shortcut (pointer type 1, subtype 1). + * + * The tree is optimised for search-by-ID, but permits reasonable iteration + * also. + * + * The tree is navigated by constructing an index key consisting of an array of + * segments, where each segment is ilog2(ASSOC_ARRAY_FAN_OUT) bits in size. + * + * The segments correspond to levels of the tree (the first segment is used at + * level 0, the second at level 1, etc.). 
+ */ +struct assoc_array_node { + struct assoc_array_ptr *back_pointer; + u8 parent_slot; + struct assoc_array_ptr *slots[ASSOC_ARRAY_FAN_OUT]; + unsigned long nr_leaves_on_branch; +}; + +/* + * A shortcut through the index space out to where a collection of nodes/leaves + * with the same IDs live. + */ +struct assoc_array_shortcut { + struct assoc_array_ptr *back_pointer; + int parent_slot; + int skip_to_level; + struct assoc_array_ptr *next_node; + unsigned long index_key[]; +}; + +/* + * Preallocation cache. + */ +struct assoc_array_edit { + struct rcu_head rcu; + struct assoc_array *array; + const struct assoc_array_ops *ops; + const struct assoc_array_ops *ops_for_excised_subtree; + struct assoc_array_ptr *leaf; + struct assoc_array_ptr **leaf_p; + struct assoc_array_ptr *dead_leaf; + struct assoc_array_ptr *new_meta[3]; + struct assoc_array_ptr *excised_meta[1]; + struct assoc_array_ptr *excised_subtree; + struct assoc_array_ptr **set_backpointers[ASSOC_ARRAY_FAN_OUT]; + struct assoc_array_ptr *set_backpointers_to; + struct assoc_array_node *adjust_count_on; + long adjust_count_by; + struct { + struct assoc_array_ptr **ptr; + struct assoc_array_ptr *to; + } set[2]; + struct { + u8 *p; + u8 to; + } set_parent_slot[1]; + u8 segment_cache[ASSOC_ARRAY_FAN_OUT + 1]; +}; + +/* + * Internal tree member pointers are marked in the bottom one or two bits to + * indicate what type they are so that we don't have to look behind every + * pointer to see what it points to. + * + * We provide functions to test type annotations and to create and translate + * the annotated pointers. + */ +#define ASSOC_ARRAY_PTR_TYPE_MASK 0x1UL +#define ASSOC_ARRAY_PTR_LEAF_TYPE 0x0UL /* Points to leaf (or nowhere) */ +#define ASSOC_ARRAY_PTR_META_TYPE 0x1UL /* Points to node or shortcut */ +#define ASSOC_ARRAY_PTR_SUBTYPE_MASK 0x2UL +#define ASSOC_ARRAY_PTR_NODE_SUBTYPE 0x0UL +#define ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE 0x2UL + +static inline bool assoc_array_ptr_is_meta(const struct assoc_array_ptr *x) +{ + return (unsigned long)x & ASSOC_ARRAY_PTR_TYPE_MASK; +} +static inline bool assoc_array_ptr_is_leaf(const struct assoc_array_ptr *x) +{ + return !assoc_array_ptr_is_meta(x); +} +static inline bool assoc_array_ptr_is_shortcut(const struct assoc_array_ptr *x) +{ + return (unsigned long)x & ASSOC_ARRAY_PTR_SUBTYPE_MASK; +} +static inline bool assoc_array_ptr_is_node(const struct assoc_array_ptr *x) +{ + return !assoc_array_ptr_is_shortcut(x); +} + +static inline void *assoc_array_ptr_to_leaf(const struct assoc_array_ptr *x) +{ + return (void *)((unsigned long)x & ~ASSOC_ARRAY_PTR_TYPE_MASK); +} + +static inline +unsigned long __assoc_array_ptr_to_meta(const struct assoc_array_ptr *x) +{ + return (unsigned long)x & + ~(ASSOC_ARRAY_PTR_SUBTYPE_MASK | ASSOC_ARRAY_PTR_TYPE_MASK); +} +static inline +struct assoc_array_node *assoc_array_ptr_to_node(const struct assoc_array_ptr *x) +{ + return (struct assoc_array_node *)__assoc_array_ptr_to_meta(x); +} +static inline +struct assoc_array_shortcut *assoc_array_ptr_to_shortcut(const struct assoc_array_ptr *x) +{ + return (struct assoc_array_shortcut *)__assoc_array_ptr_to_meta(x); +} + +static inline +struct assoc_array_ptr *__assoc_array_x_to_ptr(const void *p, unsigned long t) +{ + return (struct assoc_array_ptr *)((unsigned long)p | t); +} +static inline +struct assoc_array_ptr *assoc_array_leaf_to_ptr(const void *p) +{ + return __assoc_array_x_to_ptr(p, ASSOC_ARRAY_PTR_LEAF_TYPE); +} +static inline +struct assoc_array_ptr *assoc_array_node_to_ptr(const struct 
assoc_array_node *p) +{ + return __assoc_array_x_to_ptr( + p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_NODE_SUBTYPE); +} +static inline +struct assoc_array_ptr *assoc_array_shortcut_to_ptr(const struct assoc_array_shortcut *p) +{ + return __assoc_array_x_to_ptr( + p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE); +} + +#endif /* CONFIG_ASSOCIATIVE_ARRAY */ +#endif /* _LINUX_ASSOC_ARRAY_PRIV_H */ diff --git a/include/linux/audit.h b/include/linux/audit.h index 729a4d165bc..a40641954c2 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -73,6 +73,8 @@ struct audit_field { void *lsm_rule; }; +extern int is_audit_feature_set(int which); + extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); @@ -207,7 +209,7 @@ static inline int audit_get_sessionid(struct task_struct *tsk) extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); -extern int __audit_bprm(struct linux_binprm *bprm); +extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); @@ -236,11 +238,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } -static inline int audit_bprm(struct linux_binprm *bprm) +static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) - return __audit_bprm(bprm); - return 0; + __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { @@ -367,10 +368,8 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } -static inline int audit_bprm(struct linux_binprm *bprm) -{ - return 0; -} +static inline void audit_bprm(struct linux_binprm *bprm) +{ } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f26ec20f635..1b135d49b27 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -505,6 +505,9 @@ struct request_queue { (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) +#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_SAME_COMP)) + static inline void queue_lockdep_assert_held(struct request_queue *q) { if (q->queue_lock) diff --git a/include/linux/device.h b/include/linux/device.h index b025925df7f..952b01033c3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -644,9 +644,11 @@ struct device_dma_parameters { unsigned long segment_boundary_mask; }; +struct acpi_device; + struct acpi_dev_node { #ifdef CONFIG_ACPI - void *handle; + struct acpi_device *companion; #endif }; @@ -790,14 +792,6 @@ static inline struct device *kobj_to_dev(struct kobject *kobj) return container_of(kobj, struct device, kobj); } -#ifdef CONFIG_ACPI -#define ACPI_HANDLE(dev) ((dev)->acpi_node.handle) -#define ACPI_HANDLE_SET(dev, _handle_) (dev)->acpi_node.handle = (_handle_) -#else -#define ACPI_HANDLE(dev) (NULL) -#define ACPI_HANDLE_SET(dev, _handle_) do { } while (0) -#endif - /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> diff --git 
a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 0bc72753410..41cf0c39928 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -45,13 +45,13 @@ static inline int dma_submit_error(dma_cookie_t cookie) /** * enum dma_status - DMA transaction status - * @DMA_SUCCESS: transaction completed successfully + * @DMA_COMPLETE: transaction completed * @DMA_IN_PROGRESS: transaction not yet processed * @DMA_PAUSED: transaction is paused * @DMA_ERROR: transaction failed */ enum dma_status { - DMA_SUCCESS, + DMA_COMPLETE, DMA_IN_PROGRESS, DMA_PAUSED, DMA_ERROR, @@ -171,12 +171,6 @@ struct dma_interleaved_template { * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client * acknowledges receipt, i.e. has a chance to establish any dependency * chains - * @DMA_COMPL_SKIP_SRC_UNMAP - set to disable dma-unmapping the source buffer(s) - * @DMA_COMPL_SKIP_DEST_UNMAP - set to disable dma-unmapping the destination(s) - * @DMA_COMPL_SRC_UNMAP_SINGLE - set to do the source dma-unmapping as single - * (if not set, do the source dma-unmapping as page) - * @DMA_COMPL_DEST_UNMAP_SINGLE - set to do the destination dma-unmapping as single - * (if not set, do the destination dma-unmapping as page) * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as @@ -188,14 +182,10 @@ struct dma_interleaved_template { enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), DMA_CTRL_ACK = (1 << 1), - DMA_COMPL_SKIP_SRC_UNMAP = (1 << 2), - DMA_COMPL_SKIP_DEST_UNMAP = (1 << 3), - DMA_COMPL_SRC_UNMAP_SINGLE = (1 << 4), - DMA_COMPL_DEST_UNMAP_SINGLE = (1 << 5), - DMA_PREP_PQ_DISABLE_P = (1 << 6), - DMA_PREP_PQ_DISABLE_Q = (1 << 7), - DMA_PREP_CONTINUE = (1 << 8), - DMA_PREP_FENCE = (1 << 9), + DMA_PREP_PQ_DISABLE_P = (1 << 2), + DMA_PREP_PQ_DISABLE_Q = (1 << 3), + DMA_PREP_CONTINUE = (1 << 4), + DMA_PREP_FENCE = (1 << 5), }; /** @@ -413,6 +403,17 @@ void dma_chan_cleanup(struct kref *kref); typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); + +struct dmaengine_unmap_data { + u8 to_cnt; + u8 from_cnt; + u8 bidi_cnt; + struct device *dev; + struct kref kref; + size_t len; + dma_addr_t addr[0]; +}; + /** * struct dma_async_tx_descriptor - async transaction descriptor * ---dma generic offload fields--- @@ -438,6 +439,7 @@ struct dma_async_tx_descriptor { dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); dma_async_tx_callback callback; void *callback_param; + struct dmaengine_unmap_data *unmap; #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH struct dma_async_tx_descriptor *next; struct dma_async_tx_descriptor *parent; @@ -445,6 +447,40 @@ struct dma_async_tx_descriptor { #endif }; +#ifdef CONFIG_DMA_ENGINE +static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx, + struct dmaengine_unmap_data *unmap) +{ + kref_get(&unmap->kref); + tx->unmap = unmap; +} + +struct dmaengine_unmap_data * +dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags); +void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap); +#else +static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx, + struct dmaengine_unmap_data *unmap) +{ +} +static inline struct dmaengine_unmap_data * +dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) +{ + return NULL; +} +static inline void dmaengine_unmap_put(struct dmaengine_unmap_data 
*unmap) +{ +} +#endif + +static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx) +{ + if (tx->unmap) { + dmaengine_unmap_put(tx->unmap); + tx->unmap = NULL; + } +} + #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH static inline void txd_lock(struct dma_async_tx_descriptor *txd) { @@ -979,10 +1015,10 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, { if (last_complete <= last_used) { if ((cookie <= last_complete) || (cookie > last_used)) - return DMA_SUCCESS; + return DMA_COMPLETE; } else { if ((cookie <= last_complete) && (cookie > last_used)) - return DMA_SUCCESS; + return DMA_COMPLETE; } return DMA_IN_PROGRESS; } @@ -1013,11 +1049,11 @@ static inline struct dma_chan *dma_find_channel(enum dma_transaction_type tx_typ } static inline enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) { - return DMA_SUCCESS; + return DMA_COMPLETE; } static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { - return DMA_SUCCESS; + return DMA_COMPLETE; } static inline void dma_issue_pending_all(void) { diff --git a/include/linux/fs.h b/include/linux/fs.h index bf5d574ebdf..121f11f001c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2622,7 +2622,9 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping, extern int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); +extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index acd2010328f..9649ff0c63f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); int PageHuge(struct page *page); +int PageHeadHuge(struct page *page_head); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -69,7 +70,6 @@ int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); bool is_hugepage_active(struct page *page); -void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); @@ -104,6 +104,11 @@ static inline int PageHuge(struct page *page) return 0; } +static inline int PageHeadHuge(struct page *page_head) +{ + return 0; +} + static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } @@ -140,9 +145,6 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) #define isolate_huge_page(p, l) false #define putback_active_hugepage(p) do {} while (0) #define is_hugepage_active(x) false -static inline void copy_huge_page(struct page *dst, struct page *src) -{ -} static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 518a53afb9e..a74c3a84dfd 100644 --- 
a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,6 +45,7 @@ struct key_preparsed_payload { const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ + bool trusted; /* True if key is trusted */ }; typedef int (*request_key_actor_t)(struct key_construction *key, @@ -63,6 +64,11 @@ struct key_type { */ size_t def_datalen; + /* Default key search algorithm. */ + unsigned def_lookup_type; +#define KEYRING_SEARCH_LOOKUP_DIRECT 0x0000 /* Direct lookup by description. */ +#define KEYRING_SEARCH_LOOKUP_ITERATE 0x0001 /* Iterative search. */ + /* vet a description */ int (*vet_description)(const char *description); diff --git a/include/linux/key.h b/include/linux/key.h index 4dfde1161c5..80d677483e3 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -22,6 +22,7 @@ #include <linux/sysctl.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/assoc_array.h> #ifdef __KERNEL__ #include <linux/uidgid.h> @@ -82,6 +83,12 @@ struct key_owner; struct keyring_list; struct keyring_name; +struct keyring_index_key { + struct key_type *type; + const char *description; + size_t desc_len; +}; + /*****************************************************************************/ /* * key reference with possession attribute handling @@ -99,7 +106,7 @@ struct keyring_name; typedef struct __key_reference_with_attributes *key_ref_t; static inline key_ref_t make_key_ref(const struct key *key, - unsigned long possession) + bool possession) { return (key_ref_t) ((unsigned long) key | possession); } @@ -109,7 +116,7 @@ static inline struct key *key_ref_to_ptr(const key_ref_t key_ref) return (struct key *) ((unsigned long) key_ref & ~1UL); } -static inline unsigned long is_key_possessed(const key_ref_t key_ref) +static inline bool is_key_possessed(const key_ref_t key_ref) { return (unsigned long) key_ref & 1UL; } @@ -129,7 +136,6 @@ struct key { struct list_head graveyard_link; struct rb_node serial_node; }; - struct key_type *type; /* type of key */ struct rw_semaphore sem; /* change vs change sem */ struct key_user *user; /* owner of this key */ void *security; /* security data for this key */ @@ -162,13 +168,21 @@ struct key { #define KEY_FLAG_NEGATIVE 5 /* set if key is negative */ #define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ #define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ +#define KEY_FLAG_TRUSTED 8 /* set if key is trusted */ +#define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ - /* the description string - * - this is used to match a key against search criteria - * - this should be a printable string + /* the key type and key description string + * - the desc is used to match a key against search criteria + * - it should be a printable string * - eg: for krb5 AFS, this might be "afs@REDHAT.COM" */ - char *description; + union { + struct keyring_index_key index_key; + struct { + struct key_type *type; /* type of key */ + char *description; + }; + }; /* type specific data * - this is used by the keyring type to index the name @@ -185,11 +199,14 @@ struct key { * whatever */ union { - unsigned long value; - void __rcu *rcudata; - void *data; - struct keyring_list __rcu *subscriptions; - } payload; + union { + unsigned long value; + void __rcu *rcudata; + void *data; + void *data2[2]; + } payload; + struct assoc_array keys; + }; }; extern struct key *key_alloc(struct key_type *type, @@ -203,18 +220,23 @@ extern 
struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ +#define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); extern void key_put(struct key *key); -static inline struct key *key_get(struct key *key) +static inline struct key *__key_get(struct key *key) { - if (key) - atomic_inc(&key->usage); + atomic_inc(&key->usage); return key; } +static inline struct key *key_get(struct key *key) +{ + return key ? __key_get(key) : key; +} + static inline void key_ref_put(key_ref_t key_ref) { key_put(key_ref_to_ptr(key_ref)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0548eb201e0..1cedd000cf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1318,7 +1318,6 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #if USE_SPLIT_PTE_PTLOCKS #if BLOATED_SPINLOCKS -void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1327,7 +1326,6 @@ static inline spinlock_t *ptlock_ptr(struct page *page) return page->ptl; } #else /* BLOATED_SPINLOCKS */ -static inline void ptlock_cache_init(void) {} static inline bool ptlock_alloc(struct page *page) { return true; @@ -1380,17 +1378,10 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } -static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline void pgtable_init(void) -{ - ptlock_cache_init(); - pgtable_cache_init(); -} - static inline bool pgtable_page_ctor(struct page *page) { inc_zone_page_state(page, NR_PAGETABLE); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 10f5a7272b8..bd299418a93 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -44,18 +44,22 @@ struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ + union { + struct address_space *mapping; /* If low bit clear, points to + * inode address_space, or NULL. + * If page mapped as anonymous + * memory, low bit is set, and + * it points to anon_vma object: + * see PAGE_MAPPING_ANON below. + */ + void *s_mem; /* slab first object */ + }; + /* Second double word */ struct { union { pgoff_t index; /* Our offset within mapping. */ - void *freelist; /* slub/slob first free object */ + void *freelist; /* sl[aou]b first free object */ bool pfmemalloc; /* If set by the page allocator, * ALLOC_NO_WATERMARKS was set * and the low watermark was not @@ -65,9 +69,6 @@ struct page { * this page is only used to * free other pages. */ -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - pgtable_t pmd_huge_pte; /* protected by page->ptl */ -#endif }; union { @@ -114,6 +115,7 @@ struct page { }; atomic_t _count; /* Usage count, see below. 
*/ }; + unsigned int active; /* SLAB */ }; }; @@ -135,6 +137,12 @@ struct page { struct list_head list; /* slobs list of pages */ struct slab *slab_page; /* slab fields */ + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#endif }; /* Remainder is not double word aligned */ diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index d006f0ca60f..5a462c4e500 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -27,7 +27,7 @@ static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) while (!pci_is_root_bus(pbus)) pbus = pbus->parent; - return DEVICE_ACPI_HANDLE(pbus->bridge); + return ACPI_HANDLE(pbus->bridge); } static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) @@ -39,7 +39,7 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) else dev = &pbus->self->dev; - return DEVICE_ACPI_HANDLE(dev); + return ACPI_HANDLE(dev); } void acpi_pci_add_bus(struct pci_bus *bus); diff --git a/include/linux/platform_data/edma.h b/include/linux/platform_data/edma.h index 179fb91bb5f..f50821cb64b 100644 --- a/include/linux/platform_data/edma.h +++ b/include/linux/platform_data/edma.h @@ -67,10 +67,10 @@ struct edmacc_param { #define ITCCHEN BIT(23) /* ch_status parameter of callback function: possible values */ -#define DMA_COMPLETE 1 -#define DMA_CC_ERROR 2 -#define DMA_TC1_ERROR 3 -#define DMA_TC2_ERROR 4 +#define EDMA_DMA_COMPLETE 1 +#define EDMA_DMA_CC_ERROR 2 +#define EDMA_DMA_TC1_ERROR 3 +#define EDMA_DMA_TC2_ERROR 4 enum address_mode { INCR = 0, diff --git a/include/linux/security.h b/include/linux/security.h index 9d37e2b9d3e..5623a7f965b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1052,17 +1052,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @xfrm_policy_delete_security: * @ctx contains the xfrm_sec_ctx. * Authorize deletion of xp->security. - * @xfrm_state_alloc_security: + * @xfrm_state_alloc: * @x contains the xfrm_state being added to the Security Association * Database by the XFRM system. * @sec_ctx contains the security context information being provided by * the user-level SA generation program (e.g., setkey or racoon). - * @secid contains the secid from which to take the mls portion of the context. * Allocate a security structure to the x->security field; the security * field is initialized to NULL when the xfrm_state is allocated. Set the - * context to correspond to either sec_ctx or polsec, with the mls portion - * taken from secid in the latter case. - * Return 0 if operation was successful (memory to allocate, legal context). + * context to correspond to sec_ctx. Return 0 if operation was successful + * (memory to allocate, legal context). + * @xfrm_state_alloc_acquire: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @polsec contains the policy's security context. + * @secid contains the secid from which to take the mls portion of the + * context. + * Allocate a security structure to the x->security field; the security + * field is initialized to NULL when the xfrm_state is allocated. Set the + * context to correspond to secid. Return 0 if operation was successful + * (memory to allocate, legal context). * @xfrm_state_free_security: * @x contains the xfrm_state. * Deallocate x->security. 
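The hook documentation above describes splitting the old xfrm_state_alloc_security method in two. As a minimal sketch of the LSM side of that split (not part of the patch; the my_* names and empty bodies are hypothetical, only the signatures are taken from the struct security_operations change in the next hunk):

    /* Hypothetical LSM stubs for the two split hooks; illustrative only. */
    static int my_xfrm_state_alloc(struct xfrm_state *x,
                                   struct xfrm_user_sec_ctx *sec_ctx)
    {
            /* Build x->security purely from the user-supplied sec_ctx. */
            return 0;
    }

    static int my_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                           struct xfrm_sec_ctx *polsec,
                                           u32 secid)
    {
            /* Build x->security from the policy's context polsec, taking
             * the MLS portion from secid. */
            return 0;
    }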
@@ -1679,9 +1687,11 @@ struct security_operations { int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx); void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx); int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx); - int (*xfrm_state_alloc_security) (struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx, - u32 secid); + int (*xfrm_state_alloc) (struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx); + int (*xfrm_state_alloc_acquire) (struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, + u32 secid); void (*xfrm_state_free_security) (struct xfrm_state *x); int (*xfrm_state_delete_security) (struct xfrm_state *x); int (*xfrm_policy_lookup) (struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1e8a8b6e837..cf87a24c0f9 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -354,6 +354,35 @@ static inline void read_sequnlock_excl(seqlock_t *sl) spin_unlock(&sl->lock); } +/** + * read_seqbegin_or_lock - begin a sequence number check or locking block + * @lock: sequence lock + * @seq: sequence number to be checked + * + * First try it once optimistically without taking the lock. If that fails, + * take the lock. The sequence number is also used as a marker for deciding + * whether to be a reader (even) or writer (odd). + * N.B. seq must be initialized to an even number to begin with. + */ +static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) +{ + if (!(*seq & 1)) /* Even */ + *seq = read_seqbegin(lock); + else /* Odd */ + read_seqlock_excl(lock); +} + +static inline int need_seqretry(seqlock_t *lock, int seq) +{ + return !(seq & 1) && read_seqretry(lock, seq); +} + +static inline void done_seqretry(seqlock_t *lock, int seq) +{ + if (seq & 1) + read_sequnlock_excl(lock); +} + static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); diff --git a/include/linux/slab.h b/include/linux/slab.h index 74f105847d1..c2bba248fa6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -53,7 +53,14 @@ * } * rcu_read_unlock(); * - * See also the comment on struct slab_rcu in mm/slab.c. + * This is useful if we need to approach a kernel structure obliquely, + * from its address obtained without the usual locking. We can lock + * the structure to stabilize it and check it's still at the given address, + * only if we can be sure that the memory has not been meanwhile reused + * for some other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. 
*/ #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e9346b4f1ef..09bfffb08a5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -27,8 +27,8 @@ struct kmem_cache { size_t colour; /* cache colouring range */ unsigned int colour_off; /* colour offset */ - struct kmem_cache *slabp_cache; - unsigned int slab_size; + struct kmem_cache *freelist_cache; + unsigned int freelist_size; /* constructor func */ void (*ctor)(void *obj); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cc0b67eada4..f56bfa9e452 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -11,7 +11,7 @@ enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_FASTPATH, /* Free to cpu slub */ + FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4db29859464..4836ba3c1cd 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -27,6 +27,12 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + + /* Register of per-UID persistent keyrings for this namespace */ +#ifdef CONFIG_PERSISTENT_KEYRINGS + struct key *persistent_keyring_register; + struct rw_semaphore persistent_keyring_register_sem; +#endif }; extern struct user_namespace init_user_ns; diff --git a/include/linux/wait.h b/include/linux/wait.h index 61939ba30aa..eaa00b10aba 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -278,6 +278,31 @@ do { \ __ret; \ }) +#define __wait_event_cmd(wq, condition, cmd1, cmd2) \ + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + cmd1; schedule(); cmd2) + +/** + * wait_event_cmd - sleep until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @cmd1: the command to be executed before sleep + * @cmd2: the command to be executed after sleep + * + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the + * @condition evaluates to true. The @condition is checked each time + * the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. 
+ */ +#define wait_event_cmd(wq, condition, cmd1, cmd2) \ +do { \ + if (condition) \ + break; \ + __wait_event_cmd(wq, condition, cmd1, cmd2); \ +} while (0) + #define __wait_event_interruptible(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f18b3b76e01..4832d75dcba 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -162,12 +162,14 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, { EXTENT_FLAG_LOGGING, "LOGGING" }, \ { EXTENT_FLAG_FILLING, "FILLING" }) -TRACE_EVENT(btrfs_get_extent, +TRACE_EVENT_CONDITION(btrfs_get_extent, TP_PROTO(struct btrfs_root *root, struct extent_map *map), TP_ARGS(root, map), + TP_CONDITION(map), + TP_STRUCT__entry( __field( u64, root_objectid ) __field( u64, start ) diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index db0b825b481..44b05a09f19 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -68,6 +68,9 @@ #define AUDIT_MAKE_EQUIV 1015 /* Append to watched tree */ #define AUDIT_TTY_GET 1016 /* Get TTY auditing status */ #define AUDIT_TTY_SET 1017 /* Set TTY auditing status */ +#define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */ +#define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */ +#define AUDIT_FEATURE_CHANGE 1020 /* audit log listing feature changes */ #define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */ #define AUDIT_USER_AVC 1107 /* We filter this differently */ @@ -357,6 +360,12 @@ enum { #define AUDIT_PERM_READ 4 #define AUDIT_PERM_ATTR 8 +/* MAX_AUDIT_MESSAGE_LENGTH is set in audit:lib/libaudit.h as: + * 8970 // PATH_MAX*2+CONTEXT_SIZE*2+11+256+1 + * max header+body+trailer: 44 + 29 + 32 + 262 + 7 + pad + */ +#define AUDIT_MESSAGE_TEXT_MAX 8560 + struct audit_status { __u32 mask; /* Bit mask for valid entries */ __u32 enabled; /* 1 = enabled, 0 = disabled */ @@ -368,11 +377,28 @@ struct audit_status { __u32 backlog; /* messages waiting in queue */ }; +struct audit_features { +#define AUDIT_FEATURE_VERSION 1 + __u32 vers; + __u32 mask; /* which bits we are dealing with */ + __u32 features; /* which feature to enable/disable */ + __u32 lock; /* which features to lock */ +}; + +#define AUDIT_FEATURE_ONLY_UNSET_LOGINUID 0 +#define AUDIT_FEATURE_LOGINUID_IMMUTABLE 1 +#define AUDIT_LAST_FEATURE AUDIT_FEATURE_LOGINUID_IMMUTABLE + +#define audit_feature_valid(x) ((x) >= 0 && (x) <= AUDIT_LAST_FEATURE) +#define AUDIT_FEATURE_TO_MASK(x) (1 << ((x) & 31)) /* mask for __u32 */ + struct audit_tty_status { __u32 enabled; /* 1 = enabled, 0 = disabled */ __u32 log_passwd; /* 1 = enabled, 0 = disabled */ }; +#define AUDIT_UID_UNSET (unsigned int)-1 + /* audit_rule_data supports filter rules with both integer and string * fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and * AUDIT_LIST_RULES requests. diff --git a/include/uapi/linux/hash_info.h b/include/uapi/linux/hash_info.h new file mode 100644 index 00000000000..ca18c45f830 --- /dev/null +++ b/include/uapi/linux/hash_info.h @@ -0,0 +1,37 @@ +/* + * Hash Info: Hash algorithms information + * + * Copyright (c) 2013 Dmitry Kasatkin <d.kasatkin@samsung.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#ifndef _UAPI_LINUX_HASH_INFO_H +#define _UAPI_LINUX_HASH_INFO_H + +enum hash_algo { + HASH_ALGO_MD4, + HASH_ALGO_MD5, + HASH_ALGO_SHA1, + HASH_ALGO_RIPE_MD_160, + HASH_ALGO_SHA256, + HASH_ALGO_SHA384, + HASH_ALGO_SHA512, + HASH_ALGO_SHA224, + HASH_ALGO_RIPE_MD_128, + HASH_ALGO_RIPE_MD_256, + HASH_ALGO_RIPE_MD_320, + HASH_ALGO_WP_256, + HASH_ALGO_WP_384, + HASH_ALGO_WP_512, + HASH_ALGO_TGR_128, + HASH_ALGO_TGR_160, + HASH_ALGO_TGR_192, + HASH_ALGO__LAST +}; + +#endif /* _UAPI_LINUX_HASH_INFO_H */ diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index c9b7f4faf97..840cb990abe 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -56,5 +56,6 @@ #define KEYCTL_REJECT 19 /* reject a partially constructed key */ #define KEYCTL_INSTANTIATE_IOV 20 /* instantiate a partially constructed key */ #define KEYCTL_INVALIDATE 21 /* invalidate a key */ +#define KEYCTL_GET_PERSISTENT 22 /* get a user's persistent keyring */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index fe1a5406d4d..f7cf7f35114 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -16,6 +16,7 @@ #define _MD_P_H #include <linux/types.h> +#include <asm/byteorder.h> /* * RAID superblock. diff --git a/init/Kconfig b/init/Kconfig index 3fc8a2f2fac..79383d3aa5d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -301,20 +301,6 @@ config AUDIT_TREE depends on AUDITSYSCALL select FSNOTIFY -config AUDIT_LOGINUID_IMMUTABLE - bool "Make audit loginuid immutable" - depends on AUDIT - help - The config option toggles if a task setting its loginuid requires - CAP_SYS_AUDITCONTROL or if that task should require no special permissions - but should instead only allow setting its loginuid if it was never - previously set. On systems which use systemd or a similar central - process to restart login services this should be set to true. On older - systems in which an admin would typically have to directly stop and - start processes this should be set to false. Setting this to true allows - one to drop potentially dangerous capabilites from the login tasks, - but may not be backwards compatible with older init systems. - source "kernel/irq/Kconfig" source "kernel/time/Kconfig" @@ -1669,6 +1655,18 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. 
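Tying the keyctl.h hunk above to userspace: the new KEYCTL_GET_PERSISTENT command is what a program would invoke to fetch (creating on demand) a user's persistent keyring, on a kernel built with CONFIG_PERSISTENT_KEYRINGS. A hedged sketch, not part of the patch, assuming the keyctl(2) conventions that uid -1 means the caller's own UID and that KEY_SPEC_SESSION_KEYRING (-3) names the session keyring as the destination:

    /* Illustrative only: look up the caller's persistent keyring and
     * link it into the session keyring via the raw keyctl syscall. */
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define KEYCTL_GET_PERSISTENT 22 /* from the keyctl.h hunk above */
    #define KEY_SPEC_SESSION_KEYRING -3

    int main(void)
    {
            long id = syscall(__NR_keyctl, KEYCTL_GET_PERSISTENT,
                              (unsigned long)-1, KEY_SPEC_SESSION_KEYRING);
            if (id < 0) {
                    perror("keyctl(KEYCTL_GET_PERSISTENT)");
                    return 1;
            }
            printf("persistent keyring: %ld\n", id);
            return 0;
    }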
+ menuconfig MODULES bool "Enable loadable module support" option modules @@ -1742,6 +1740,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING select KEYS select CRYPTO select ASYMMETRIC_KEY_TYPE diff --git a/init/main.c b/init/main.c index 01573fdfa18..febc511e078 100644 --- a/init/main.c +++ b/init/main.c @@ -476,7 +476,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_init(); + pgtable_cache_init(); vmalloc_init(); } diff --git a/ipc/shm.c b/ipc/shm.c index d69739610fd..7a51443a51d 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -208,15 +208,18 @@ static void shm_open(struct vm_area_struct *vma) */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { + struct file *shm_file; + + shm_file = shp->shm_file; + shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid(ns, shp); shm_unlock(shp); - if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0, shp->mlock_user); + if (!is_file_hugepages(shm_file)) + shmem_lock(shm_file, 0, shp->mlock_user); else if (shp->mlock_user) - user_shm_unlock(file_inode(shp->shm_file)->i_size, - shp->mlock_user); - fput (shp->shm_file); + user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); + fput(shm_file); ipc_rcu_putref(shp, shm_rcu_free); } @@ -974,15 +977,25 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); - err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && - !uid_eq(euid, shp->shm_perm.cuid)) + !uid_eq(euid, shp->shm_perm.cuid)) { + err = -EPERM; goto out_unlock0; - if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) + } + if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) { + err = -EPERM; goto out_unlock0; + } } shm_file = shp->shm_file; + + /* check if shm_destroy() is tearing down shp */ + if (shm_file == NULL) { + err = -EIDRM; + goto out_unlock0; + } + if (is_file_hugepages(shm_file)) goto out_unlock0; @@ -1101,6 +1114,14 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto out_unlock; ipc_lock_object(&shp->shm_perm); + + /* check if shm_destroy() is tearing down shp */ + if (shp->shm_file == NULL) { + ipc_unlock_object(&shp->shm_perm); + err = -EIDRM; + goto out_unlock; + } + path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; diff --git a/kernel/Makefile b/kernel/Makefile index 09a9c94f42b..bbaf7d59c1b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,8 +41,9 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -122,19 +123,52 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### +# +# Roll all the X.509 certificates that we can find together and pull them into +# the kernel so that they get loaded into the system trusted keyring during +# boot. 
# -# Pull the signing certificate and any extra certificates into the kernel +# We look in the source root and the build root for all files whose name ends +# in ".x509". Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates. # +############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 +X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ + $(or $(realpath $(CERT)),$(CERT)))) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif + +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/system_certificates.o: $(obj)/x509_certificate_list -quiet_cmd_touch = TOUCH $@ - cmd_touch = touch $@ +quiet_cmd_x509certs = CERTS $@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") -extra_certificates: - $(call cmd,touch) +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list + $(call if_changed,x509certs) -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +targets += $(obj)/.x509.list +$(obj)/.x509.list: + @echo $(X509_CERTIFICATES) >$@ +clean-files := x509_certificate_list .x509.list +endif + +ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/audit.c b/kernel/audit.c index 7b0e23a740c..906ae5a0233 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif -#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -140,6 +139,17 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); +static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, + .mask = -1, + .features = 0, + .lock = 0,}; + +static char *audit_feature_names[2] = { + "only_unset_loginuid", + "loginuid_immutable", +}; + + /* Serialize requests from userspace. 
*/ DEFINE_MUTEX(audit_cmd_mutex); @@ -584,6 +594,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: + case AUDIT_GET_FEATURE: + case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -613,7 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); - if (!audit_enabled) { + if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return rc; } @@ -628,6 +640,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return rc; } +int is_audit_feature_set(int i) +{ + return af.features & AUDIT_FEATURE_TO_MASK(i); +} + + +static int audit_get_feature(struct sk_buff *skb) +{ + u32 seq; + + seq = nlmsg_hdr(skb)->nlmsg_seq; + + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, + &af, sizeof(af)); + + return 0; +} + +static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, + u32 old_lock, u32 new_lock, int res) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", + audit_feature_names[which], !!old_feature, !!new_feature, + !!old_lock, !!new_lock, res); + audit_log_end(ab); +} + +static int audit_set_feature(struct sk_buff *skb) +{ + struct audit_features *uaf; + int i; + + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + uaf = nlmsg_data(nlmsg_hdr(skb)); + + /* if there is ever a version 2 we should handle that here */ + + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + new_lock = (uaf->lock | af.lock) & feature; + old_lock = af.lock & feature; + + /* are we changing a locked feature? 
*/ + if ((af.lock & feature) && (new_feature != old_feature)) { + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 0); + return -EPERM; + } + } + /* nothing invalid, do the changes */ + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + old_lock = af.lock & feature; + new_lock = (uaf->lock | af.lock) & feature; + + if (new_feature != old_feature) + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 1); + + if (new_feature) + af.features |= feature; + else + af.features &= ~feature; + af.lock |= new_lock; + } + + return 0; +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -659,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) switch (msg_type) { case AUDIT_GET: + memset(&status_set, 0, sizeof(status_set)); status_set.enabled = audit_enabled; status_set.failure = audit_failure; status_set.pid = audit_pid; @@ -670,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) &status_set, sizeof(status_set)); break; case AUDIT_SET: - if (nlh->nlmsg_len < sizeof(struct audit_status)) + if (nlmsg_len(nlh) < sizeof(struct audit_status)) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { @@ -699,6 +800,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit); break; + case AUDIT_GET_FEATURE: + err = audit_get_feature(skb); + if (err) + return err; + break; + case AUDIT_SET_FEATURE: + err = audit_set_feature(skb); + if (err) + return err; + break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: @@ -715,7 +826,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.1024s'", + audit_log_format(ab, " msg='%.*s'", + AUDIT_MESSAGE_TEXT_MAX, (char *)data); else { int size; @@ -818,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk = current; spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty != 0; + s.enabled = tsk->signal->audit_tty; s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); @@ -832,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; @@ -1067,13 +1179,6 @@ static void wait_for_auditd(unsigned long sleep_time) remove_wait_queue(&audit_backlog_wait, &wait); } -/* Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. 
If there is no associated task, tsk - * should be NULL. */ - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1389,7 +1494,7 @@ void audit_log_session_info(struct audit_buffer *ab) u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); + audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) @@ -1536,6 +1641,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } } + /* log the audit_names record type */ + audit_log_format(ab, " nametype="); + switch(n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, "NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, "PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, "DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, "CREATE"); + break; + default: + audit_log_format(ab, "UNKNOWN"); + break; + } + audit_log_fcaps(ab, n); audit_log_end(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 123c9b7c397..b779642b29a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -197,6 +197,9 @@ struct audit_context { int fd; int flags; } mmap; + struct { + int argc; + } execve; }; int fds[2]; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7aee8be7fb..51f3fd4c1ed 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + case AUDIT_INODE: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -423,7 +424,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9845cb32b60..90594c9f755 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -95,13 +95,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_execve { - struct audit_aux_data d; - int argc; - int envc; - struct mm_struct *mm; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -121,12 +114,6 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; -struct audit_aux_data_capset { - struct audit_aux_data d; - pid_t pid; - struct audit_cap_data cap; -}; - struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -566,7 +553,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = (name->ino == f->val); + result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { @@ -943,8 +930,10 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. 
*/ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) + if (state == AUDIT_DISABLED) { + clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); return 0; + } if (!(context = audit_alloc_context(state))) { kfree(key); @@ -1149,20 +1138,16 @@ static int audit_log_single_execve_arg(struct audit_context *context, } static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab, - struct audit_aux_data_execve *axi) + struct audit_buffer **ab) { int i, len; size_t len_sent = 0; const char __user *p; char *buf; - if (axi->mm != current->mm) - return; /* execve failed, no additional info */ - - p = (const char __user *)axi->mm->arg_start; + p = (const char __user *)current->mm->arg_start; - audit_log_format(*ab, "argc=%d", axi->argc); + audit_log_format(*ab, "argc=%d", context->execve.argc); /* * we need some kernel buffer to hold the userspace args. Just @@ -1176,7 +1161,7 @@ static void audit_log_execve_info(struct audit_context *context, return; } - for (i = 0; i < axi->argc; i++) { + for (i = 0; i < context->execve.argc; i++) { len = audit_log_single_execve_arg(context, ab, i, &len_sent, p, buf); if (len <= 0) @@ -1279,6 +1264,9 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; } + case AUDIT_EXECVE: { + audit_log_execve_info(context, &ab); + break; } } audit_log_end(ab); } @@ -1325,11 +1313,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts switch (aux->type) { - case AUDIT_EXECVE: { - struct audit_aux_data_execve *axi = (void *)aux; - audit_log_execve_info(context, &ab, axi); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1964,6 +1947,43 @@ int auditsc_get_stamp(struct audit_context *ctx, /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE is set, never allow a change */ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, unsigned int sessionid, + int rc) +{ + struct audit_buffer *ab; + uid_t uid, ologinuid, nloginuid; + + uid = from_kuid(&init_user_ns, task_uid(current)); + ologinuid = from_kuid(&init_user_ns, koldloginuid); + nloginuid = from_kuid(&init_user_ns, kloginuid); + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " + "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, + nloginuid, oldsessionid, sessionid, !rc); + audit_log_end(ab); +} + /** * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value @@ -1975,37 +1995,26 @@ static atomic_t session_id = ATOMIC_INIT(0); int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - struct audit_context *context = task->audit_context; - unsigned int sessionid; 
+ unsigned int oldsessionid, sessionid = (unsigned int)-1; + kuid_t oldloginuid; + int rc; -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (audit_loginuid_set(task)) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); - sessionid = atomic_inc_return(&session_id); - if (context && context->in_syscall) { - struct audit_buffer *ab; + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) + sessionid = atomic_inc_return(&session_id); - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u" - " old ses=%u new ses=%u", - task->pid, - from_kuid(&init_user_ns, task_uid(task)), - from_kuid(&init_user_ns, task->loginuid), - from_kuid(&init_user_ns, loginuid), - task->sessionid, sessionid); - audit_log_end(ab); - } - } task->sessionid = sessionid; task->loginuid = loginuid; - return 0; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; } /** @@ -2126,22 +2135,12 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int __audit_bprm(struct linux_binprm *bprm) +void __audit_bprm(struct linux_binprm *bprm) { - struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->argc = bprm->argc; - ax->envc = bprm->envc; - ax->mm = bprm->mm; - ax->d.type = AUDIT_EXECVE; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_EXECVE; + context->execve.argc = bprm->argc; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0839bcd48c..4c62513fe19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -895,11 +895,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1486,7 +1481,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 4a9a86d12c8..00000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,12 +0,0 @@ -#include <linux/export.h> - -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - - .section ".init.data","aw" - -GLOBAL(modsign_certificate_list) - .incbin "signing_key.x509" - .incbin "extra_certificates" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e..00000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ - pr_notice("Initialise module verification\n"); - - modsign_keyring = keyring_alloc(".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(modsign_keyring)) - panic("Can't allocate module signing keyring\n"); - - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading module verification certificates\n"); - - end = modsign_certificate_list_end; - p = modsign_certificate_list; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(modsign_keyring, 1), - "asymmetric", - NULL, - p, - plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(key)) - pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - else - pr_notice("MODSIGN: Loaded cert '%s'\n", - key_ref_to_ptr(key)->description); - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d0..915e123a430 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@ * 2 of the Licence, or (at your option) any later version. 
*/ -extern struct key *modsign_keyring; - extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5e..be5b8fac4bd 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@ #include <crypto/public_key.h> #include <crypto/hash.h> #include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> #include "module-internal.h" /* @@ -28,7 +29,7 @@ */ struct module_signature { u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 hash; /* Digest algorithm [enum hash_algo] */ u8 id_type; /* Key identifier type [enum pkey_id_type] */ u8 signer_len; /* Length of signer's name */ u8 key_id_len; /* Length of key identifier */ @@ -39,7 +40,7 @@ struct module_signature { /* * Digest the module contents. */ -static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +static struct public_key_signature *mod_make_digest(enum hash_algo hash, const void *mod, unsigned long modlen) { @@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(modsign_keyring, 1), + key = keyring_search(make_key_ref(system_trusted_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", @@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !pkey_hash_algo[ms.hash]) + !hash_algo_name[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 10c22cae83a..b38109e204a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; diff --git a/kernel/power/user.c b/kernel/power/user.c index 24850270c80..98d357584cd 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? 
swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; + data->free_bitmaps = false; error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 00000000000..4aef390671c --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,10 @@ +#include <linux/export.h> +#include <linux/init.h> + + __INITRODATA + + .globl VMLINUX_SYMBOL(system_certificate_list) +VMLINUX_SYMBOL(system_certificate_list): + .incbin "kernel/x509_certificate_list" + .globl VMLINUX_SYMBOL(system_certificate_list_end) +VMLINUX_SYMBOL(system_certificate_list_end): diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 00000000000..564dd93430a --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,105 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const u8 system_certificate_list_end[]; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ + pr_notice("Initialise system trusted keyring\n"); + + system_trusted_keyring = + keyring_alloc(".system_keyring", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(system_trusted_keyring)) + panic("Can't allocate system trusted keyring\n"); + + set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading compiled-in X.509 certificates\n"); + + end = system_certificate_list_end; + p = system_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. 
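The loop below walks a blob of concatenated DER certificates using only the four header bytes of each one. As a standalone sketch of that length arithmetic (assuming the long-form two-byte length, tag 0x30 followed by 0x82, which holds for any certificate over 255 bytes; note that this sketch rejects a mismatch in either header byte):

    #include <stddef.h>

    /* Size in bytes of the DER certificate at p, or 0 if the header is
     * not the expected SEQUENCE-with-16-bit-length form or the body
     * would overrun the remaining space.
     */
    static size_t der_cert_size(const unsigned char *p, size_t remaining)
    {
            size_t plen;

            if (remaining < 4)
                    return 0;
            if (p[0] != 0x30 || p[1] != 0x82)
                    return 0;
            plen = ((size_t)p[2] << 8) | p[3];      /* big-endian body length */
            return plen + 4 <= remaining ? plen + 4 : 0;
    }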
+ */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), + "asymmetric", + NULL, + p, + plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_TRUSTED); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_system_certificate_list); diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e6..a3a0dbfda32 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,10 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, +#ifdef CONFIG_PERSISTENT_KEYRINGS + .persistent_keyring_register_sem = + __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba5..240fb62cf39 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,6 +101,9 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); +#ifdef CONFIG_PERSISTENT_KEYRINGS + init_rwsem(&ns->persistent_keyring_register_sem); +#endif return 0; } @@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); ns = parent; diff --git a/lib/Kconfig b/lib/Kconfig index 06dc74200a5..991c98bc4a3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -322,6 +322,20 @@ config TEXTSEARCH_FSM config BTREE boolean +config ASSOCIATIVE_ARRAY + bool + help + Generic associative array. Can be searched and iterated over whilst + it is being modified. It is also reasonably quick to search and + modify. The algorithms are non-recursive, and the trees are highly + capacious. + + See: + + Documentation/assoc_array.txt + + for more information. + config HAS_IOMEM boolean depends on !NO_IOMEM diff --git a/lib/Makefile b/lib/Makefile index d480a8c9238..b46065fd67a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,6 +47,7 @@ CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o +obj-$(CONFIG_ASSOCIATIVE_ARRAY) += assoc_array.o obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o diff --git a/lib/assoc_array.c b/lib/assoc_array.c new file mode 100644 index 00000000000..17edeaf1918 --- /dev/null +++ b/lib/assoc_array.c @@ -0,0 +1,1746 @@ +/* Generic associative array implementation. + * + * See Documentation/assoc_array.txt for information. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version.
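Before the implementation proper, it may help to see what a client hands in through the operations table. The following hypothetical client (an illustration only, not part of the patch) keys its objects by a single unsigned long that each object carries, so one key chunk always suffices; real users derive chunks from longer, possibly hashed keys:

    #include <linux/assoc_array.h>
    #include <linux/bitops.h>
    #include <linux/slab.h>

    struct my_object {
            unsigned long   key;
            /* payload ... */
    };

    static unsigned long my_get_key_chunk(const void *index_key, int level)
    {
            return *(const unsigned long *)index_key;       /* single chunk */
    }

    static unsigned long my_get_object_key_chunk(const void *object, int level)
    {
            return ((const struct my_object *)object)->key;
    }

    static bool my_compare_object(const void *object, const void *index_key)
    {
            return ((const struct my_object *)object)->key ==
                    *(const unsigned long *)index_key;
    }

    static int my_diff_objects(const void *a, const void *b)
    {
            unsigned long diff = ((const struct my_object *)a)->key ^
                                 ((const struct my_object *)b)->key;

            return diff ? __ffs(diff) : -1; /* first differing bit, or -1 */
    }

    static void my_free_object(void *object)
    {
            kfree(object);
    }

    static const struct assoc_array_ops my_ops = {
            .get_key_chunk          = my_get_key_chunk,
            .get_object_key_chunk   = my_get_object_key_chunk,
            .compare_object         = my_compare_object,
            .diff_objects           = my_diff_objects,
            .free_object            = my_free_object,
    };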
+ */ +//#define DEBUG +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/assoc_array_priv.h> + +/* + * Iterate over an associative array. The caller must hold the RCU read lock + * or better. + */ +static int assoc_array_subtree_iterate(const struct assoc_array_ptr *root, + const struct assoc_array_ptr *stop, + int (*iterator)(const void *leaf, + void *iterator_data), + void *iterator_data) +{ + const struct assoc_array_shortcut *shortcut; + const struct assoc_array_node *node; + const struct assoc_array_ptr *cursor, *ptr, *parent; + unsigned long has_meta; + int slot, ret; + + cursor = root; + +begin_node: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + cursor = ACCESS_ONCE(shortcut->next_node); + } + + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + slot = 0; + + /* We perform two passes of each node. + * + * The first pass does all the leaves in this node. This means we + * don't miss any leaves if the node is split up by insertion whilst + * we're iterating over the branches rooted here (we may, however, see + * some leaves twice). + */ + has_meta = 0; + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + has_meta |= (unsigned long)ptr; + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + smp_read_barrier_depends(); + + /* Invoke the callback */ + ret = iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data); + if (ret) + return ret; + } + } + + /* The second pass attends to all the metadata pointers. If we follow + * one of these we may find that we don't come back here, but rather go + * back to a replacement node with the leaves in a different layout. + * + * We are guaranteed to make progress, however, as the slot number for + * a particular portion of the key space cannot change - and we + * continue at the back pointer + 1. + */ + if (!(has_meta & ASSOC_ARRAY_PTR_META_TYPE)) + goto finished_node; + slot = 0; + +continue_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (assoc_array_ptr_is_meta(ptr)) { + cursor = ptr; + goto begin_node; + } + } + +finished_node: + /* Move up to the parent (may need to skip back over a shortcut) */ + parent = ACCESS_ONCE(node->back_pointer); + slot = node->parent_slot; + if (parent == stop) + return 0; + + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + smp_read_barrier_depends(); + cursor = parent; + parent = ACCESS_ONCE(shortcut->back_pointer); + slot = shortcut->parent_slot; + if (parent == stop) + return 0; + } + + /* Ascend to next slot in parent node */ + cursor = parent; + slot++; + goto continue_node; +} + +/** + * assoc_array_iterate - Pass all objects in the array to a callback + * @array: The array to iterate over. + * @iterator: The callback function. + * @iterator_data: Private data for the callback function. + * + * Iterate over all the objects in an associative array. Each one will be + * presented to the iterator function. 
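Continuing the hypothetical client above, a minimal counting pass might be written as follows; the rest of this comment block explains why such a pass can see an object twice while the array is being modified:

    /* Illustrative only: count the objects currently visible. */
    static int my_count_one(const void *object, void *iterator_data)
    {
            (*(size_t *)iterator_data)++;
            return 0;       /* non-zero would stop the iteration */
    }

    static size_t my_count(struct assoc_array *array)
    {
            size_t n = 0;

            rcu_read_lock();
            assoc_array_iterate(array, my_count_one, &n);
            rcu_read_unlock();
            return n;       /* may overcount under concurrent edits */
    }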
+ * + * If the array is being modified concurrently with the iteration then it is + * possible that some objects in the array will be passed to the iterator + * callback more than once - though every object should be passed at least + * once. If this is undesirable then the caller must lock against modification + * for the duration of this function. + * + * The function will return 0 if no objects were in the array or else it will + * return the result of the last iterator function called. Iteration stops + * immediately if any call to the iteration function results in a non-zero + * return. + * + * The caller should hold the RCU read lock or better if concurrent + * modification is possible. + */ +int assoc_array_iterate(const struct assoc_array *array, + int (*iterator)(const void *object, + void *iterator_data), + void *iterator_data) +{ + struct assoc_array_ptr *root = ACCESS_ONCE(array->root); + + if (!root) + return 0; + return assoc_array_subtree_iterate(root, NULL, iterator, iterator_data); +} + +enum assoc_array_walk_status { + assoc_array_walk_tree_empty, + assoc_array_walk_found_terminal_node, + assoc_array_walk_found_wrong_shortcut, +}; + +struct assoc_array_walk_result { + struct { + struct assoc_array_node *node; /* Node in which leaf might be found */ + int level; + int slot; + } terminal_node; + struct { + struct assoc_array_shortcut *shortcut; + int level; + int sc_level; + unsigned long sc_segments; + unsigned long dissimilarity; + } wrong_shortcut; +}; + +/* + * Navigate through the internal tree looking for the closest node to the key. + */ +static enum assoc_array_walk_status +assoc_array_walk(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *ptr; + unsigned long sc_segments, dissimilarity; + unsigned long segments; + int level, sc_level, next_sc_level; + int slot; + + pr_devel("-->%s()\n", __func__); + + cursor = ACCESS_ONCE(array->root); + if (!cursor) + return assoc_array_walk_tree_empty; + + level = 0; + + /* Use segments from the key for the new leaf to navigate through the + * internal tree, skipping through nodes and shortcuts that are on + * route to the destination. Eventually we'll come to a slot that is + * either empty or contains a leaf at which point we've found a node in + * which the leaf we're looking for might be found or into which it + * should be inserted. + */ +jumped: + segments = ops->get_key_chunk(index_key, level); + pr_devel("segments[%d]: %lx\n", level, segments); + + if (assoc_array_ptr_is_shortcut(cursor)) + goto follow_shortcut; + +consider_node: + node = assoc_array_ptr_to_node(cursor); + smp_read_barrier_depends(); + + slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + slot &= ASSOC_ARRAY_FAN_MASK; + ptr = ACCESS_ONCE(node->slots[slot]); + + pr_devel("consider slot %x [ix=%d type=%lu]\n", + slot, level, (unsigned long)ptr & 3); + + if (!assoc_array_ptr_is_meta(ptr)) { + /* The node doesn't have a node/shortcut pointer in the slot + * corresponding to the index key that we have to follow.
+ */ + result->terminal_node.node = node; + result->terminal_node.level = level; + result->terminal_node.slot = slot; + pr_devel("<--%s() = terminal_node\n", __func__); + return assoc_array_walk_found_terminal_node; + } + + if (assoc_array_ptr_is_node(ptr)) { + /* There is a pointer to a node in the slot corresponding to + * this index key segment, so we need to follow it. + */ + cursor = ptr; + level += ASSOC_ARRAY_LEVEL_STEP; + if ((level & ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) + goto consider_node; + goto jumped; + } + + /* There is a shortcut in the slot corresponding to the index key + * segment. We follow the shortcut if its partial index key matches + * this leaf's. Otherwise we need to split the shortcut. + */ + cursor = ptr; +follow_shortcut: + shortcut = assoc_array_ptr_to_shortcut(cursor); + smp_read_barrier_depends(); + pr_devel("shortcut to %d\n", shortcut->skip_to_level); + sc_level = level + ASSOC_ARRAY_LEVEL_STEP; + BUG_ON(sc_level > shortcut->skip_to_level); + + do { + /* Check the leaf against the shortcut's index key a word at a + * time, trimming the final word (the shortcut stores the index + * key completely from the root to the shortcut's target). + */ + if ((sc_level & ASSOC_ARRAY_KEY_CHUNK_MASK) == 0) + segments = ops->get_key_chunk(index_key, sc_level); + + sc_segments = shortcut->index_key[sc_level >> ASSOC_ARRAY_KEY_CHUNK_SHIFT]; + dissimilarity = segments ^ sc_segments; + + if (round_up(sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE) > shortcut->skip_to_level) { + /* Trim segments that are beyond the shortcut */ + int shift = shortcut->skip_to_level & ASSOC_ARRAY_KEY_CHUNK_MASK; + dissimilarity &= ~(ULONG_MAX << shift); + next_sc_level = shortcut->skip_to_level; + } else { + next_sc_level = sc_level + ASSOC_ARRAY_KEY_CHUNK_SIZE; + next_sc_level = round_down(next_sc_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + } + + if (dissimilarity != 0) { + /* This shortcut points elsewhere */ + result->wrong_shortcut.shortcut = shortcut; + result->wrong_shortcut.level = level; + result->wrong_shortcut.sc_level = sc_level; + result->wrong_shortcut.sc_segments = sc_segments; + result->wrong_shortcut.dissimilarity = dissimilarity; + return assoc_array_walk_found_wrong_shortcut; + } + + sc_level = next_sc_level; + } while (sc_level < shortcut->skip_to_level); + + /* The shortcut matches the leaf's index to this point. */ + cursor = ACCESS_ONCE(shortcut->next_node); + if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) { + level = sc_level; + goto jumped; + } else { + level = sc_level; + goto consider_node; + } +} + +/** + * assoc_array_find - Find an object by index key + * @array: The associative array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Find an object in an associative array by walking through the internal tree + * to the node that should contain the object and then searching the leaves + * there. NULL is returned if the requested object was not found in the array. + * + * The caller must hold the RCU read lock or better. 
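A minimal lookup wrapper for the hypothetical single-word-key client, honouring that rule, might be:

    /* Illustrative only: test membership without retaining the object. */
    static bool my_contains(struct assoc_array *array, unsigned long key)
    {
            bool found;

            rcu_read_lock();
            found = assoc_array_find(array, &my_ops, &key) != NULL;
            rcu_read_unlock();      /* the object may be freed after this */
            return found;
    }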
*/ +void *assoc_array_find(const struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_walk_result result; + const struct assoc_array_node *node; + const struct assoc_array_ptr *ptr; + const void *leaf; + int slot; + + if (assoc_array_walk(array, ops, index_key, &result) != + assoc_array_walk_found_terminal_node) + return NULL; + + node = result.terminal_node.node; + smp_read_barrier_depends(); + + /* If the target key is available to us, it has to be pointed to by + * the terminal node. + */ + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + if (ptr && assoc_array_ptr_is_leaf(ptr)) { + /* We need a barrier between the read of the pointer + * and dereferencing the pointer - but only if we are + * actually going to dereference it. + */ + leaf = assoc_array_ptr_to_leaf(ptr); + smp_read_barrier_depends(); + if (ops->compare_object(leaf, index_key)) + return (void *)leaf; + } + } + + return NULL; +} + +/* + * Destructively iterate over an associative array. The caller must prevent + * other simultaneous accesses. + */ +static void assoc_array_destroy_subtree(struct assoc_array_ptr *root, + const struct assoc_array_ops *ops) +{ + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *cursor, *parent = NULL; + int slot = -1; + + pr_devel("-->%s()\n", __func__); + + cursor = root; + if (!cursor) { + pr_devel("empty\n"); + return; + } + +move_to_meta: + if (assoc_array_ptr_is_shortcut(cursor)) { + /* Descend through a shortcut */ + pr_devel("[%d] shortcut\n", slot); + BUG_ON(!assoc_array_ptr_is_shortcut(cursor)); + shortcut = assoc_array_ptr_to_shortcut(cursor); + BUG_ON(shortcut->back_pointer != parent); + BUG_ON(slot != -1 && shortcut->parent_slot != slot); + parent = cursor; + cursor = shortcut->next_node; + slot = -1; + BUG_ON(!assoc_array_ptr_is_node(cursor)); + } + + pr_devel("[%d] node\n", slot); + node = assoc_array_ptr_to_node(cursor); + BUG_ON(node->back_pointer != parent); + BUG_ON(slot != -1 && node->parent_slot != slot); + slot = 0; + +continue_node: + pr_devel("Node %p [back=%p]\n", node, node->back_pointer); + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_ptr *ptr = node->slots[slot]; + if (!ptr) + continue; + if (assoc_array_ptr_is_meta(ptr)) { + parent = cursor; + cursor = ptr; + goto move_to_meta; + } + + if (ops) { + pr_devel("[%d] free leaf\n", slot); + ops->free_object(assoc_array_ptr_to_leaf(ptr)); + } + } + + parent = node->back_pointer; + slot = node->parent_slot; + pr_devel("free node\n"); + kfree(node); + if (!parent) + return; /* Done */ + + /* Move back up to the parent (may need to free a shortcut on + * the way up) */ + if (assoc_array_ptr_is_shortcut(parent)) { + shortcut = assoc_array_ptr_to_shortcut(parent); + BUG_ON(shortcut->next_node != cursor); + cursor = parent; + parent = shortcut->back_pointer; + slot = shortcut->parent_slot; + pr_devel("free shortcut\n"); + kfree(shortcut); + if (!parent) + return; + + BUG_ON(!assoc_array_ptr_is_node(parent)); + } + + /* Ascend to next slot in parent node */ + pr_devel("ascend to %p[%d]\n", parent, slot); + cursor = parent; + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; +} + +/** + * assoc_array_destroy - Destroy an associative array + * @array: The array to destroy. + * @ops: The operations to use. + * + * Discard all metadata and free all objects in an associative array.
The + * array will be empty and ready to use again upon completion. This function + * cannot fail. + * + * The caller must prevent all other accesses whilst this takes place as no + * attempt is made to adjust pointers gracefully to permit RCU readlock-holding + * accesses to continue. On the other hand, no memory allocation is required. + */ +void assoc_array_destroy(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + assoc_array_destroy_subtree(array->root, ops); + array->root = NULL; +} + +/* + * Handle insertion into an empty tree. + */ +static bool assoc_array_insert_in_empty_tree(struct assoc_array_edit *edit) +{ + struct assoc_array_node *new_n0; + + pr_devel("-->%s()\n", __func__); + + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[0]; + edit->adjust_count_on = new_n0; + edit->set[0].ptr = &edit->array->root; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + + pr_devel("<--%s() = ok [no root]\n", __func__); + return true; +} + +/* + * Handle insertion into a terminal node. + */ +static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + const void *index_key, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0; + struct assoc_array_node *node, *new_n0, *new_n1, *side; + struct assoc_array_ptr *ptr; + unsigned long dissimilarity, base_seg, blank; + size_t keylen; + bool have_meta; + int level, diff; + int slot, next_slot, free_slot, i, j; + + node = result->terminal_node.node; + level = result->terminal_node.level; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = result->terminal_node.slot; + + pr_devel("-->%s()\n", __func__); + + /* We arrived at a node which doesn't have an onward node or shortcut + * pointer that we have to follow. This means that (a) the leaf we + * want must go here (either by insertion or replacement) or (b) we + * need to split this node and insert in one of the fragments. + */ + free_slot = -1; + + /* Firstly, we have to check the leaves in this node to see if there's + * a matching one we should replace in place. + */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (!ptr) { + free_slot = i; + continue; + } + if (ops->compare_object(assoc_array_ptr_to_leaf(ptr), index_key)) { + pr_devel("replace in slot %d\n", i); + edit->leaf_p = &node->slots[i]; + edit->dead_leaf = node->slots[i]; + pr_devel("<--%s() = ok [replace]\n", __func__); + return true; + } + } + + /* If there is a free slot in this node then we can just insert the + * leaf here. + */ + if (free_slot >= 0) { + pr_devel("insert in free slot %d\n", free_slot); + edit->leaf_p = &node->slots[free_slot]; + edit->adjust_count_on = node; + pr_devel("<--%s() = ok [insert]\n", __func__); + return true; + } + + /* The node has no spare slots - so we're either going to have to split + * it or insert another node before it. + * + * Whatever, we're going to need at least two new nodes - so allocate + * those now. We may also need a new shortcut, but we deal with that + * when we need it. 
+ */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + new_n1 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n1) + return false; + edit->new_meta[1] = assoc_array_node_to_ptr(new_n1); + + /* We need to find out how similar the leaves are. */ + pr_devel("no spare slots\n"); + have_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + edit->segment_cache[i] = 0xff; + have_meta = true; + continue; + } + base_seg = ops->get_object_key_chunk( + assoc_array_ptr_to_leaf(ptr), level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + if (have_meta) { + pr_devel("have meta\n"); + goto split_node; + } + + /* The node contains only leaves */ + dissimilarity = 0; + base_seg = edit->segment_cache[0]; + for (i = 1; i < ASSOC_ARRAY_FAN_OUT; i++) + dissimilarity |= edit->segment_cache[i] ^ base_seg; + + pr_devel("only leaves; dissimilarity=%lx\n", dissimilarity); + + if ((dissimilarity & ASSOC_ARRAY_FAN_MASK) == 0) { + /* The old leaves all cluster in the same slot. We will need + * to insert a shortcut if the new node wants to cluster with them. + */ + if ((edit->segment_cache[ASSOC_ARRAY_FAN_OUT] ^ base_seg) == 0) + goto all_leaves_cluster_together; + + /* Otherwise we can just insert a new node ahead of the old + * one. + */ + goto present_leaves_cluster_but_not_new_leaf; + } + +split_node: + pr_devel("split node\n"); + + /* We need to split the current node; we know that the node doesn't + * simply contain a full set of leaves that cluster together (it + * contains meta pointers and/or non-clustering leaves). + * + * We need to expel at least two leaves out of a set consisting of the + * leaves in the node and the new leaf. + * + * We need a new node (n0) to replace the current one and a new node to + * take the expelled nodes (n1). + */ + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + +do_split_node: + pr_devel("do_split_node\n"); + + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->nr_leaves_on_branch = 0; + + /* Begin by finding two matching leaves. There have to be at least two + * that match - even if there are meta pointers - because any leaf that + * would match a slot with a meta pointer in it must be somewhere + * behind that meta pointer and cannot be here. Further, given N + * remaining leaf slots, we now have N+1 leaves to go in them. 
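The segment_cache values compared in the passes below were derived with the same shift-and-mask pattern the walker uses; schematically (fan-out is 16, one nibble of key per level):

    /* Which of the 16 slots a key chunk selects at a given level (a bit
     * position that is always a multiple of ASSOC_ARRAY_LEVEL_STEP).
     */
    static int slot_for(unsigned long segments, int level)
    {
            return (segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK)) &
                    ASSOC_ARRAY_FAN_MASK;
    }

    /* e.g. a chunk of 0xA5C selects slot 0xC at level 0, slot 0x5 at
     * level 4 and slot 0xA at level 8.
     */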
+ */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + slot = edit->segment_cache[i]; + if (slot != 0xff) + for (j = i + 1; j < ASSOC_ARRAY_FAN_OUT + 1; j++) + if (edit->segment_cache[j] == slot) + goto found_slot_for_multiple_occupancy; + } +found_slot_for_multiple_occupancy: + pr_devel("same slot: %x %x [%02x]\n", i, j, slot); + BUG_ON(i >= ASSOC_ARRAY_FAN_OUT); + BUG_ON(j >= ASSOC_ARRAY_FAN_OUT + 1); + BUG_ON(slot >= ASSOC_ARRAY_FAN_OUT); + + new_n1->parent_slot = slot; + + /* Metadata pointers cannot change slot */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + if (assoc_array_ptr_is_meta(node->slots[i])) + new_n0->slots[i] = node->slots[i]; + else + new_n0->slots[i] = NULL; + BUG_ON(new_n0->slots[slot] != NULL); + new_n0->slots[slot] = assoc_array_node_to_ptr(new_n1); + + /* Filter the leaf pointers between the new nodes */ + free_slot = -1; + next_slot = 0; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (assoc_array_ptr_is_meta(node->slots[i])) + continue; + if (edit->segment_cache[i] == slot) { + new_n1->slots[next_slot++] = node->slots[i]; + new_n1->nr_leaves_on_branch++; + } else { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + new_n0->slots[free_slot] = node->slots[i]; + } + } + + pr_devel("filtered: f=%x n=%x\n", free_slot, next_slot); + + if (edit->segment_cache[ASSOC_ARRAY_FAN_OUT] != slot) { + do { + free_slot++; + } while (new_n0->slots[free_slot] != NULL); + edit->leaf_p = &new_n0->slots[free_slot]; + edit->adjust_count_on = new_n0; + } else { + edit->leaf_p = &new_n1->slots[next_slot++]; + edit->adjust_count_on = new_n1; + } + + BUG_ON(next_slot <= 1); + + edit->set_backpointers_to = assoc_array_node_to_ptr(new_n0); + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + if (edit->segment_cache[i] == 0xff) { + ptr = node->slots[i]; + BUG_ON(assoc_array_ptr_is_leaf(ptr)); + if (assoc_array_ptr_is_node(ptr)) { + side = assoc_array_ptr_to_node(ptr); + edit->set_backpointers[i] = &side->back_pointer; + } else { + shortcut = assoc_array_ptr_to_shortcut(ptr); + edit->set_backpointers[i] = &shortcut->back_pointer; + } + } + } + + ptr = node->back_pointer; + if (!ptr) + edit->set[0].ptr = &edit->array->root; + else if (assoc_array_ptr_is_node(ptr)) + edit->set[0].ptr = &assoc_array_ptr_to_node(ptr)->slots[node->parent_slot]; + else + edit->set[0].ptr = &assoc_array_ptr_to_shortcut(ptr)->next_node; + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [split node]\n", __func__); + return true; + +present_leaves_cluster_but_not_new_leaf: + /* All the old leaves cluster in the same slot, but the new leaf wants + * to go into a different slot, so we create a new node to hold the new + * leaf and a pointer to a new node holding all the old leaves. 
+ */ + pr_devel("present leaves cluster but not new leaf\n"); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = edit->segment_cache[0]; + new_n1->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) + new_n1->slots[i] = node->slots[i]; + + new_n0->slots[edit->segment_cache[0]] = assoc_array_node_to_ptr(new_n0); + edit->leaf_p = &new_n0->slots[edit->segment_cache[ASSOC_ARRAY_FAN_OUT]]; + + edit->set[0].ptr = &assoc_array_ptr_to_node(node->back_pointer)->slots[node->parent_slot]; + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + edit->excised_meta[0] = assoc_array_node_to_ptr(node); + pr_devel("<--%s() = ok [insert node before]\n", __func__); + return true; + +all_leaves_cluster_together: + /* All the leaves, new and old, want to cluster together in this node + * in the same slot, so we have to replace this node with a shortcut to + * skip over the identical parts of the key and then place a pair of + * nodes, one inside the other, at the end of the shortcut and + * distribute the keys between them. + * + * Firstly we need to work out where the leaves start diverging as a + * bit position into their keys so that we know how big the shortcut + * needs to be. + * + * We only need to make a single pass of N of the N+1 leaves because if + * any keys differ between themselves at bit X then at least one of + * them must also differ with the base key at bit X or before. + */ + pr_devel("all leaves cluster together\n"); + diff = INT_MAX; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + int x = ops->diff_objects(assoc_array_ptr_to_leaf(edit->leaf), + assoc_array_ptr_to_leaf(node->slots[i])); + if (x < diff) { + BUG_ON(x < 0); + diff = x; + } + } + BUG_ON(diff == INT_MAX); + BUG_ON(diff < level + ASSOC_ARRAY_LEVEL_STEP); + + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s0); + + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = node->back_pointer; + new_s0->parent_slot = node->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + new_n1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_n1->parent_slot = -1; /* Need to calculate this */ + + new_s0->skip_to_level = level = diff & ~ASSOC_ARRAY_LEVEL_STEP_MASK; + pr_devel("skip_to_level = %d [diff %d]\n", level, diff); + BUG_ON(level <= 0); + + for (i = 0; i < keylen; i++) + new_s0->index_key[i] = + ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); + + blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); + new_s0->index_key[keylen - 1] &= ~blank; + + /* This now reduces to a node splitting exercise for which we'll need + * to regenerate the disparity table. 
+ */ + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + base_seg = ops->get_object_key_chunk(assoc_array_ptr_to_leaf(ptr), + level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[i] = base_seg & ASSOC_ARRAY_FAN_MASK; + } + + base_seg = ops->get_key_chunk(index_key, level); + base_seg >>= level & ASSOC_ARRAY_KEY_CHUNK_MASK; + edit->segment_cache[ASSOC_ARRAY_FAN_OUT] = base_seg & ASSOC_ARRAY_FAN_MASK; + goto do_split_node; +} + +/* + * Handle insertion into the middle of a shortcut. + */ +static bool assoc_array_insert_mid_shortcut(struct assoc_array_edit *edit, + const struct assoc_array_ops *ops, + struct assoc_array_walk_result *result) +{ + struct assoc_array_shortcut *shortcut, *new_s0, *new_s1; + struct assoc_array_node *node, *new_n0, *side; + unsigned long sc_segments, dissimilarity, blank; + size_t keylen; + int level, sc_level, diff; + int sc_slot; + + shortcut = result->wrong_shortcut.shortcut; + level = result->wrong_shortcut.level; + sc_level = result->wrong_shortcut.sc_level; + sc_segments = result->wrong_shortcut.sc_segments; + dissimilarity = result->wrong_shortcut.dissimilarity; + + pr_devel("-->%s(ix=%d dis=%lx scix=%d)\n", + __func__, level, dissimilarity, sc_level); + + /* We need to split a shortcut and insert a node between the two + * pieces. Zero-length pieces will be dispensed with entirely. + * + * First of all, we need to find out in which level the first + * difference was. + */ + diff = __ffs(dissimilarity); + diff &= ~ASSOC_ARRAY_LEVEL_STEP_MASK; + diff += sc_level & ~ASSOC_ARRAY_KEY_CHUNK_MASK; + pr_devel("diff=%d\n", diff); + + if (!shortcut->back_pointer) { + edit->set[0].ptr = &edit->array->root; + } else if (assoc_array_ptr_is_node(shortcut->back_pointer)) { + node = assoc_array_ptr_to_node(shortcut->back_pointer); + edit->set[0].ptr = &node->slots[shortcut->parent_slot]; + } else { + BUG(); + } + + edit->excised_meta[0] = assoc_array_shortcut_to_ptr(shortcut); + + /* Create a new node now since we're going to need it anyway */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + return false; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + edit->adjust_count_on = new_n0; + + /* Insert a new shortcut before the new node if this segment isn't of + * zero length - otherwise we just connect the new node directly to the + * parent. 
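A worked example of the divergence arithmetic computed above (assuming 64-bit key chunks, so the chunk mask is 63 and the nibble mask is 3): if the walk is at sc_level 70 and the XOR of the caller's key chunk with the shortcut's stored chunk is 0x100, i.e. the first difference is at bit 8 of that chunk, then

    __ffs(dissimilarity)                  =  8
    rounded down to a nibble boundary     =  8
    plus the chunk base (70 & ~63 = 64)   = 72

so the shortcut is split at level 72, a whole number of nibbles into the index key.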
+ */ + level += ASSOC_ARRAY_LEVEL_STEP; + if (diff > level) { + pr_devel("pre-shortcut %d...%d\n", level, diff); + keylen = round_up(diff, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s0 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s0) + return false; + edit->new_meta[1] = assoc_array_shortcut_to_ptr(new_s0); + edit->set[0].to = assoc_array_shortcut_to_ptr(new_s0); + new_s0->back_pointer = shortcut->back_pointer; + new_s0->parent_slot = shortcut->parent_slot; + new_s0->next_node = assoc_array_node_to_ptr(new_n0); + new_s0->skip_to_level = diff; + + new_n0->back_pointer = assoc_array_shortcut_to_ptr(new_s0); + new_n0->parent_slot = 0; + + memcpy(new_s0->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + blank = ULONG_MAX << (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, diff, blank); + new_s0->index_key[keylen - 1] &= ~blank; + } else { + pr_devel("no pre-shortcut\n"); + edit->set[0].to = assoc_array_node_to_ptr(new_n0); + new_n0->back_pointer = shortcut->back_pointer; + new_n0->parent_slot = shortcut->parent_slot; + } + + side = assoc_array_ptr_to_node(shortcut->next_node); + new_n0->nr_leaves_on_branch = side->nr_leaves_on_branch; + + /* We need to know which slot in the new node is going to take a + * metadata pointer. + */ + sc_slot = sc_segments >> (diff & ASSOC_ARRAY_KEY_CHUNK_MASK); + sc_slot &= ASSOC_ARRAY_FAN_MASK; + + pr_devel("new slot %lx >> %d -> %d\n", + sc_segments, diff & ASSOC_ARRAY_KEY_CHUNK_MASK, sc_slot); + + /* Determine whether we need to follow the new node with a replacement + * for the current shortcut. We could in theory reuse the current + * shortcut if its parent slot number doesn't change - but that's a + * 1-in-16 chance so not worth expending the code upon. + */ + level = diff + ASSOC_ARRAY_LEVEL_STEP; + if (level < shortcut->skip_to_level) { + pr_devel("post-shortcut %d...%d\n", level, shortcut->skip_to_level); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + + new_s1 = kzalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s1) + return false; + edit->new_meta[2] = assoc_array_shortcut_to_ptr(new_s1); + + new_s1->back_pointer = assoc_array_node_to_ptr(new_n0); + new_s1->parent_slot = sc_slot; + new_s1->next_node = shortcut->next_node; + new_s1->skip_to_level = shortcut->skip_to_level; + + new_n0->slots[sc_slot] = assoc_array_shortcut_to_ptr(new_s1); + + memcpy(new_s1->index_key, shortcut->index_key, + keylen * sizeof(unsigned long)); + + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_shortcut_to_ptr(new_s1); + } else { + pr_devel("no post-shortcut\n"); + + /* We don't have to replace the pointed-to node as long as we + * use memory barriers to make sure the parent slot number is + * changed before the back pointer (the parent slot number is + * irrelevant to the old parent shortcut). + */ + new_n0->slots[sc_slot] = shortcut->next_node; + edit->set_parent_slot[0].p = &side->parent_slot; + edit->set_parent_slot[0].to = sc_slot; + edit->set[1].ptr = &side->back_pointer; + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + } + + /* Install the new leaf in a spare slot in the new node. 
*/ + if (sc_slot == 0) + edit->leaf_p = &new_n0->slots[1]; + else + edit->leaf_p = &new_n0->slots[0]; + + pr_devel("<--%s() = ok [split shortcut]\n", __func__); + return true; +} + +/** + * assoc_array_insert - Script insertion of an object into an associative array + * @array: The array to insert into. + * @ops: The operations to use. + * @index_key: The key to insert at. + * @object: The object to insert. + * + * Precalculate and preallocate a script for the insertion or replacement of an + * object in an associative array. This results in an edit script that can + * either be applied or cancelled. + * + * The function returns a pointer to an edit script or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key, + void *object) +{ + struct assoc_array_walk_result result; + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + /* The leaf pointer we're given must not have the bottom bit set as we + * use those for type-marking the pointer. NULL pointers are also not + * allowed as they indicate an empty slot but we have to allow them + * here as they can be updated later. + */ + BUG_ON(assoc_array_ptr_is_meta(object)); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->leaf = assoc_array_leaf_to_ptr(object); + edit->adjust_count_by = 1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_tree_empty: + /* Allocate a root node if there isn't one yet */ + if (!assoc_array_insert_in_empty_tree(edit)) + goto enomem; + return edit; + + case assoc_array_walk_found_terminal_node: + /* We found a node that doesn't have a node/shortcut pointer in + * the slot corresponding to the index key that we have to + * follow. + */ + if (!assoc_array_insert_into_terminal_node(edit, ops, index_key, + &result)) + goto enomem; + return edit; + + case assoc_array_walk_found_wrong_shortcut: + /* We found a shortcut that didn't match our key in a slot we + * needed to follow. + */ + if (!assoc_array_insert_mid_shortcut(edit, ops, &result)) + goto enomem; + return edit; + } + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_insert_set_object - Set the new object pointer in an edit script + * @edit: The edit script to modify. + * @object: The object pointer to set. + * + * Change the object to be inserted in an edit script. The object pointed to + * by the old object is not freed. This must be done prior to applying the + * script. + */ +void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object) +{ + BUG_ON(!object); + edit->leaf = assoc_array_leaf_to_ptr(object); +} + +struct assoc_array_delete_collapse_context { + struct assoc_array_node *node; + const void *skip_leaf; + int slot; +}; + +/* + * Subtree collapse to node iterator.
+ */ +static int assoc_array_delete_collapse_iterator(const void *leaf, + void *iterator_data) +{ + struct assoc_array_delete_collapse_context *collapse = iterator_data; + + if (leaf == collapse->skip_leaf) + return 0; + + BUG_ON(collapse->slot >= ASSOC_ARRAY_FAN_OUT); + + collapse->node->slots[collapse->slot++] = assoc_array_leaf_to_ptr(leaf); + return 0; +} + +/** + * assoc_array_delete - Script deletion of an object from an associative array + * @array: The array to search. + * @ops: The operations to use. + * @index_key: The key to the object. + * + * Precalculate and preallocate a script for the deletion of an object from an + * associative array. This results in an edit script that can either be + * applied or cancelled. + * + * The function returns a pointer to an edit script if the object was found, + * NULL if the object was not found or -ENOMEM. + * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, + const struct assoc_array_ops *ops, + const void *index_key) +{ + struct assoc_array_delete_collapse_context collapse; + struct assoc_array_walk_result result; + struct assoc_array_node *node, *new_n0; + struct assoc_array_edit *edit; + struct assoc_array_ptr *ptr; + bool has_meta; + int slot, i; + + pr_devel("-->%s()\n", __func__); + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->adjust_count_by = -1; + + switch (assoc_array_walk(array, ops, index_key, &result)) { + case assoc_array_walk_found_terminal_node: + /* We found a node that should contain the leaf we've been + * asked to remove - *if* it's in the tree. + */ + pr_devel("terminal_node\n"); + node = result.terminal_node.node; + + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (ptr && + assoc_array_ptr_is_leaf(ptr) && + ops->compare_object(assoc_array_ptr_to_leaf(ptr), + index_key)) + goto found_leaf; + } + case assoc_array_walk_tree_empty: + case assoc_array_walk_found_wrong_shortcut: + default: + assoc_array_cancel_edit(edit); + pr_devel("not found\n"); + return NULL; + } + +found_leaf: + BUG_ON(array->nr_leaves_on_tree <= 0); + + /* In the simplest form of deletion we just clear the slot and release + * the leaf after a suitable interval. + */ + edit->dead_leaf = node->slots[slot]; + edit->set[0].ptr = &node->slots[slot]; + edit->set[0].to = NULL; + edit->adjust_count_on = node; + + /* If that concludes erasure of the last leaf, then delete the entire + * internal array. + */ + if (array->nr_leaves_on_tree == 1) { + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->adjust_count_on = NULL; + edit->excised_subtree = array->root; + pr_devel("all gone\n"); + return edit; + } + + /* However, we'd also like to clear up some metadata blocks if we + * possibly can. + * + * We go for a simple algorithm of: if this node has FAN_OUT or fewer + * leaves in it, then attempt to collapse it - and attempt to + * recursively collapse up the tree. + * + * We could also try and collapse in partially filled subtrees to take + * up space in this node. 
+ */ + if (node->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + struct assoc_array_node *parent, *grandparent; + struct assoc_array_ptr *ptr; + + /* First of all, we need to know if this node has metadata so + * that we don't try collapsing if all the leaves are already + * here. + */ + has_meta = false; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + ptr = node->slots[i]; + if (assoc_array_ptr_is_meta(ptr)) { + has_meta = true; + break; + } + } + + pr_devel("leaves: %ld [m=%d]\n", + node->nr_leaves_on_branch - 1, has_meta); + + /* Look further up the tree to see if we can collapse this node + * into a more proximal node too. + */ + parent = node; + collapse_up: + pr_devel("collapse subtree: %ld\n", parent->nr_leaves_on_branch); + + ptr = parent->back_pointer; + if (!ptr) + goto do_collapse; + if (assoc_array_ptr_is_shortcut(ptr)) { + struct assoc_array_shortcut *s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->back_pointer; + if (!ptr) + goto do_collapse; + } + + grandparent = assoc_array_ptr_to_node(ptr); + if (grandparent->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT + 1) { + parent = grandparent; + goto collapse_up; + } + + do_collapse: + /* There's no point collapsing if the original node has no meta + * pointers to discard and if we didn't merge into one of that + * node's ancestry. + */ + if (has_meta || parent != node) { + node = parent; + + /* Create a new node to collapse into */ + new_n0 = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n0) + goto enomem; + edit->new_meta[0] = assoc_array_node_to_ptr(new_n0); + + new_n0->back_pointer = node->back_pointer; + new_n0->parent_slot = node->parent_slot; + new_n0->nr_leaves_on_branch = node->nr_leaves_on_branch; + edit->adjust_count_on = new_n0; + + collapse.node = new_n0; + collapse.skip_leaf = assoc_array_ptr_to_leaf(edit->dead_leaf); + collapse.slot = 0; + assoc_array_subtree_iterate(assoc_array_node_to_ptr(node), + node->back_pointer, + assoc_array_delete_collapse_iterator, + &collapse); + pr_devel("collapsed %d,%lu\n", collapse.slot, new_n0->nr_leaves_on_branch); + BUG_ON(collapse.slot != new_n0->nr_leaves_on_branch - 1); + + if (!node->back_pointer) { + edit->set[1].ptr = &array->root; + } else if (assoc_array_ptr_is_leaf(node->back_pointer)) { + BUG(); + } else if (assoc_array_ptr_is_node(node->back_pointer)) { + struct assoc_array_node *p = + assoc_array_ptr_to_node(node->back_pointer); + edit->set[1].ptr = &p->slots[node->parent_slot]; + } else if (assoc_array_ptr_is_shortcut(node->back_pointer)) { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(node->back_pointer); + edit->set[1].ptr = &s->next_node; + } + edit->set[1].to = assoc_array_node_to_ptr(new_n0); + edit->excised_subtree = assoc_array_node_to_ptr(node); + } + } + + return edit; + +enomem: + /* Clean up after an out of memory error */ + pr_devel("enomem\n"); + assoc_array_cancel_edit(edit); + return ERR_PTR(-ENOMEM); +} + +/** + * assoc_array_clear - Script deletion of all objects from an associative array + * @array: The array to clear. + * @ops: The operations to use. + * + * Precalculate and preallocate a script for the deletion of all the objects + * from an associative array. This results in an edit script that can either + * be applied or cancelled. + * + * The function returns a pointer to an edit script if there are objects to be + * deleted, NULL if there are no objects in the array or -ENOMEM. 
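assoc_array_delete() above follows the same three-way convention (script, NULL, or an error pointer), so a deletion wrapper for the hypothetical client might look like this (the caller is assumed to hold whatever lock serialises modifications):

    /* Illustrative only: delete one key, tolerating its absence. */
    static int my_remove(struct assoc_array *array, unsigned long key)
    {
            struct assoc_array_edit *edit;

            edit = assoc_array_delete(array, &my_ops, &key);
            if (IS_ERR(edit))
                    return PTR_ERR(edit);   /* -ENOMEM */
            if (!edit)
                    return -ENOENT;         /* no such object */
            assoc_array_apply_edit(edit);   /* cannot fail; frees via RCU */
            return 0;
    }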
+ * + * The caller should lock against other modifications and must continue to hold + * the lock until assoc_array_apply_edit() has been called. + * + * Accesses to the tree may take place concurrently with this function, + * provided they hold the RCU read lock. + */ +struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, + const struct assoc_array_ops *ops) +{ + struct assoc_array_edit *edit; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return NULL; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return ERR_PTR(-ENOMEM); + edit->array = array; + edit->ops = ops; + edit->set[1].ptr = &array->root; + edit->set[1].to = NULL; + edit->excised_subtree = array->root; + edit->ops_for_excised_subtree = ops; + pr_devel("all gone\n"); + return edit; +} + +/* + * Handle the deferred destruction after an applied edit. + */ +static void assoc_array_rcu_cleanup(struct rcu_head *head) +{ + struct assoc_array_edit *edit = + container_of(head, struct assoc_array_edit, rcu); + int i; + + pr_devel("-->%s()\n", __func__); + + if (edit->dead_leaf) + edit->ops->free_object(assoc_array_ptr_to_leaf(edit->dead_leaf)); + for (i = 0; i < ARRAY_SIZE(edit->excised_meta); i++) + if (edit->excised_meta[i]) + kfree(assoc_array_ptr_to_node(edit->excised_meta[i])); + + if (edit->excised_subtree) { + BUG_ON(assoc_array_ptr_is_leaf(edit->excised_subtree)); + if (assoc_array_ptr_is_node(edit->excised_subtree)) { + struct assoc_array_node *n = + assoc_array_ptr_to_node(edit->excised_subtree); + n->back_pointer = NULL; + } else { + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(edit->excised_subtree); + s->back_pointer = NULL; + } + assoc_array_destroy_subtree(edit->excised_subtree, + edit->ops_for_excised_subtree); + } + + kfree(edit); +} + +/** + * assoc_array_apply_edit - Apply an edit script to an associative array + * @edit: The script to apply. + * + * Apply an edit script to an associative array to effect an insertion, + * deletion or clearance. As the edit script includes preallocated memory, + * this is guaranteed not to fail. + * + * The edit script, dead objects and dead metadata will be scheduled for + * destruction after an RCU grace period to permit those doing read-only + * accesses on the array to continue to do so under the RCU read lock whilst + * the edit is taking place. 
+ */
+void assoc_array_apply_edit(struct assoc_array_edit *edit)
+{
+	struct assoc_array_shortcut *shortcut;
+	struct assoc_array_node *node;
+	struct assoc_array_ptr *ptr;
+	int i;
+
+	pr_devel("-->%s()\n", __func__);
+
+	smp_wmb();
+	if (edit->leaf_p)
+		*edit->leaf_p = edit->leaf;
+
+	smp_wmb();
+	for (i = 0; i < ARRAY_SIZE(edit->set_parent_slot); i++)
+		if (edit->set_parent_slot[i].p)
+			*edit->set_parent_slot[i].p = edit->set_parent_slot[i].to;
+
+	smp_wmb();
+	for (i = 0; i < ARRAY_SIZE(edit->set_backpointers); i++)
+		if (edit->set_backpointers[i])
+			*edit->set_backpointers[i] = edit->set_backpointers_to;
+
+	smp_wmb();
+	for (i = 0; i < ARRAY_SIZE(edit->set); i++)
+		if (edit->set[i].ptr)
+			*edit->set[i].ptr = edit->set[i].to;
+
+	if (edit->array->root == NULL) {
+		edit->array->nr_leaves_on_tree = 0;
+	} else if (edit->adjust_count_on) {
+		node = edit->adjust_count_on;
+		for (;;) {
+			node->nr_leaves_on_branch += edit->adjust_count_by;
+
+			ptr = node->back_pointer;
+			if (!ptr)
+				break;
+			if (assoc_array_ptr_is_shortcut(ptr)) {
+				shortcut = assoc_array_ptr_to_shortcut(ptr);
+				ptr = shortcut->back_pointer;
+				if (!ptr)
+					break;
+			}
+			BUG_ON(!assoc_array_ptr_is_node(ptr));
+			node = assoc_array_ptr_to_node(ptr);
+		}
+
+		edit->array->nr_leaves_on_tree += edit->adjust_count_by;
+	}
+
+	call_rcu(&edit->rcu, assoc_array_rcu_cleanup);
+}
+
+/**
+ * assoc_array_cancel_edit - Discard an edit script.
+ * @edit: The script to discard.
+ *
+ * Free an edit script and all the preallocated data it holds without making
+ * any changes to the associative array it was intended for.
+ *
+ * NOTE! In the case of an insertion script, this does _not_ release the leaf
+ * that was to be inserted. That is left to the caller.
+ */
+void assoc_array_cancel_edit(struct assoc_array_edit *edit)
+{
+	struct assoc_array_ptr *ptr;
+	int i;
+
+	pr_devel("-->%s()\n", __func__);
+
+	/* Clean up after an out of memory error */
+	for (i = 0; i < ARRAY_SIZE(edit->new_meta); i++) {
+		ptr = edit->new_meta[i];
+		if (ptr) {
+			if (assoc_array_ptr_is_node(ptr))
+				kfree(assoc_array_ptr_to_node(ptr));
+			else
+				kfree(assoc_array_ptr_to_shortcut(ptr));
+		}
+	}
+	kfree(edit);
+}
+
+/**
+ * assoc_array_gc - Garbage collect an associative array.
+ * @array: The array to clean.
+ * @ops: The operations to use.
+ * @iterator: A callback function to pass judgement on each object.
+ * @iterator_data: Private data for the callback function.
+ *
+ * Collect garbage from an associative array and pack down the internal tree
+ * to save memory.
+ *
+ * The iterator function is asked to pass judgement upon each object in the
+ * array. If it returns false, the object is discarded; if it returns true,
+ * the object is kept and the iterator must increment the object's usage count
+ * (or do whatever it needs to do to retain it) before returning.
+ *
+ * This function returns 0 if successful or -ENOMEM if out of memory. In the
+ * latter case, the array is not changed.
+ *
+ * The caller should lock against other modifications and must continue to hold
+ * the lock until assoc_array_apply_edit() has been called.
+ *
+ * Accesses to the tree may take place concurrently with this function,
+ * provided they hold the RCU read lock.
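+ *
+ * A sketch of an iterator callback (the object type, its fields and its
+ * usage counter are hypothetical; the important part is that an object
+ * that is to be kept gains a reference before the callback returns true):
+ *
+ *	static bool my_gc_iterator(void *object, void *iterator_data)
+ *	{
+ *		struct my_object *obj = object;
+ *
+ *		if (obj->dead)
+ *			return false;
+ *		atomic_inc(&obj->usage);
+ *		return true;
+ *	}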
+ */ +int assoc_array_gc(struct assoc_array *array, + const struct assoc_array_ops *ops, + bool (*iterator)(void *object, void *iterator_data), + void *iterator_data) +{ + struct assoc_array_shortcut *shortcut, *new_s; + struct assoc_array_node *node, *new_n; + struct assoc_array_edit *edit; + struct assoc_array_ptr *cursor, *ptr; + struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; + unsigned long nr_leaves_on_tree; + int keylen, slot, nr_free, next_slot, i; + + pr_devel("-->%s()\n", __func__); + + if (!array->root) + return 0; + + edit = kzalloc(sizeof(struct assoc_array_edit), GFP_KERNEL); + if (!edit) + return -ENOMEM; + edit->array = array; + edit->ops = ops; + edit->ops_for_excised_subtree = ops; + edit->set[0].ptr = &array->root; + edit->excised_subtree = array->root; + + new_root = new_parent = NULL; + new_ptr_pp = &new_root; + cursor = array->root; + +descend: + /* If this point is a shortcut, then we need to duplicate it and + * advance the target cursor. + */ + if (assoc_array_ptr_is_shortcut(cursor)) { + shortcut = assoc_array_ptr_to_shortcut(cursor); + keylen = round_up(shortcut->skip_to_level, ASSOC_ARRAY_KEY_CHUNK_SIZE); + keylen >>= ASSOC_ARRAY_KEY_CHUNK_SHIFT; + new_s = kmalloc(sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long), GFP_KERNEL); + if (!new_s) + goto enomem; + pr_devel("dup shortcut %p -> %p\n", shortcut, new_s); + memcpy(new_s, shortcut, (sizeof(struct assoc_array_shortcut) + + keylen * sizeof(unsigned long))); + new_s->back_pointer = new_parent; + new_s->parent_slot = shortcut->parent_slot; + *new_ptr_pp = new_parent = assoc_array_shortcut_to_ptr(new_s); + new_ptr_pp = &new_s->next_node; + cursor = shortcut->next_node; + } + + /* Duplicate the node at this position */ + node = assoc_array_ptr_to_node(cursor); + new_n = kzalloc(sizeof(struct assoc_array_node), GFP_KERNEL); + if (!new_n) + goto enomem; + pr_devel("dup node %p -> %p\n", node, new_n); + new_n->back_pointer = new_parent; + new_n->parent_slot = node->parent_slot; + *new_ptr_pp = new_parent = assoc_array_node_to_ptr(new_n); + new_ptr_pp = NULL; + slot = 0; + +continue_node: + /* Filter across any leaves and gc any subtrees */ + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = node->slots[slot]; + if (!ptr) + continue; + + if (assoc_array_ptr_is_leaf(ptr)) { + if (iterator(assoc_array_ptr_to_leaf(ptr), + iterator_data)) + /* The iterator will have done any reference + * counting on the object for us. + */ + new_n->slots[slot] = ptr; + continue; + } + + new_ptr_pp = &new_n->slots[slot]; + cursor = ptr; + goto descend; + } + + pr_devel("-- compress node %p --\n", new_n); + + /* Count up the number of empty slots in this node and work out the + * subtree leaf count. 
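+	 * These counts drive the compression step below: a child branch is
+	 * folded into this node when child->nr_leaves_on_branch <= nr_free + 1,
+	 * the "+ 1" accounting for the slot that the child's own pointer
+	 * vacates.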
+ */ + new_n->nr_leaves_on_branch = 0; + nr_free = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = new_n->slots[slot]; + if (!ptr) + nr_free++; + else if (assoc_array_ptr_is_leaf(ptr)) + new_n->nr_leaves_on_branch++; + } + pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); + + /* See what we can fold in */ + next_slot = 0; + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + struct assoc_array_shortcut *s; + struct assoc_array_node *child; + + ptr = new_n->slots[slot]; + if (!ptr || assoc_array_ptr_is_leaf(ptr)) + continue; + + s = NULL; + if (assoc_array_ptr_is_shortcut(ptr)) { + s = assoc_array_ptr_to_shortcut(ptr); + ptr = s->next_node; + } + + child = assoc_array_ptr_to_node(ptr); + new_n->nr_leaves_on_branch += child->nr_leaves_on_branch; + + if (child->nr_leaves_on_branch <= nr_free + 1) { + /* Fold the child node into this one */ + pr_devel("[%d] fold node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + + /* We would already have reaped an intervening shortcut + * on the way back up the tree. + */ + BUG_ON(s); + + new_n->slots[slot] = NULL; + nr_free++; + if (slot < next_slot) + next_slot = slot; + for (i = 0; i < ASSOC_ARRAY_FAN_OUT; i++) { + struct assoc_array_ptr *p = child->slots[i]; + if (!p) + continue; + BUG_ON(assoc_array_ptr_is_meta(p)); + while (new_n->slots[next_slot]) + next_slot++; + BUG_ON(next_slot >= ASSOC_ARRAY_FAN_OUT); + new_n->slots[next_slot++] = p; + nr_free--; + } + kfree(child); + } else { + pr_devel("[%d] retain node %lu/%d [nx %d]\n", + slot, child->nr_leaves_on_branch, nr_free + 1, + next_slot); + } + } + + pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); + + nr_leaves_on_tree = new_n->nr_leaves_on_branch; + + /* Excise this node if it is singly occupied by a shortcut */ + if (nr_free == ASSOC_ARRAY_FAN_OUT - 1) { + for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) + if ((ptr = new_n->slots[slot])) + break; + + if (assoc_array_ptr_is_meta(ptr) && + assoc_array_ptr_is_shortcut(ptr)) { + pr_devel("excise node %p with 1 shortcut\n", new_n); + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_n->back_pointer; + slot = new_n->parent_slot; + kfree(new_n); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + + if (assoc_array_ptr_is_shortcut(new_parent)) { + /* We can discard any preceding shortcut also */ + struct assoc_array_shortcut *s = + assoc_array_ptr_to_shortcut(new_parent); + + pr_devel("excise preceding shortcut\n"); + + new_parent = new_s->back_pointer = s->back_pointer; + slot = new_s->parent_slot = s->parent_slot; + kfree(s); + if (!new_parent) { + new_s->back_pointer = NULL; + new_s->parent_slot = 0; + new_root = ptr; + goto gc_complete; + } + } + + new_s->back_pointer = new_parent; + new_s->parent_slot = slot; + new_n = assoc_array_ptr_to_node(new_parent); + new_n->slots[slot] = ptr; + goto ascend_old_tree; + } + } + + /* Excise any shortcuts we might encounter that point to nodes that + * only contain leaves. 
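+	 * A shortcut stops earning its keep once the branch below it has
+	 * compressed down to at most ASSOC_ARRAY_FAN_OUT leaves, so in that
+	 * case the node is spliced directly into the shortcut's slot in its
+	 * parent.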
+ */ + ptr = new_n->back_pointer; + if (!ptr) + goto gc_complete; + + if (assoc_array_ptr_is_shortcut(ptr)) { + new_s = assoc_array_ptr_to_shortcut(ptr); + new_parent = new_s->back_pointer; + slot = new_s->parent_slot; + + if (new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { + struct assoc_array_node *n; + + pr_devel("excise shortcut\n"); + new_n->back_pointer = new_parent; + new_n->parent_slot = slot; + kfree(new_s); + if (!new_parent) { + new_root = assoc_array_node_to_ptr(new_n); + goto gc_complete; + } + + n = assoc_array_ptr_to_node(new_parent); + n->slots[slot] = assoc_array_node_to_ptr(new_n); + } + } else { + new_parent = ptr; + } + new_n = assoc_array_ptr_to_node(new_parent); + +ascend_old_tree: + ptr = node->back_pointer; + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + slot = shortcut->parent_slot; + cursor = shortcut->back_pointer; + } else { + slot = node->parent_slot; + cursor = ptr; + } + BUG_ON(!ptr); + node = assoc_array_ptr_to_node(cursor); + slot++; + goto continue_node; + +gc_complete: + edit->set[0].to = new_root; + assoc_array_apply_edit(edit); + edit->array->nr_leaves_on_tree = nr_leaves_on_tree; + return 0; + +enomem: + pr_devel("enomem\n"); + assoc_array_destroy_subtree(new_root, edit->ops); + kfree(edit); + return -ENOMEM; +} diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index 657979f71be..bf076d281d4 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -121,3 +121,6 @@ void mpi_free(MPI a) kfree(a); } EXPORT_SYMBOL_GPL(mpi_free); + +MODULE_DESCRIPTION("Multiprecision maths library"); +MODULE_LICENSE("GPL"); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7d57af21f49..dee6cf4e6d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -476,40 +476,6 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) return 0; } -static void copy_gigantic_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - struct hstate *h = page_hstate(src); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -736,6 +702,23 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. 
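+ *
+ * It tells these cases apart by the compound page destructor:
+ * free_huge_page is only ever installed as the dtor of hugetlbfs pages.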
+ */ +int PageHeadHuge(struct page *page_head) +{ + compound_page_dtor *dtor; + + if (!PageHead(page_head)) + return 0; + + dtor = get_compound_page_dtor(page_head); + + return dtor == free_huge_page; +} +EXPORT_SYMBOL_GPL(PageHeadHuge); + pgoff_t __basepage_index(struct page *page) { struct page *page_head = compound_head(page); diff --git a/mm/memory.c b/mm/memory.c index 0409e8f43fa..5d9025f3b3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4272,13 +4272,6 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS -static struct kmem_cache *page_ptl_cachep; -void __init ptlock_cache_init(void) -{ - page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, - SLAB_PANIC, NULL); -} - bool ptlock_alloc(struct page *page) { spinlock_t *ptl; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c4403cdf343..eca4a312912 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2950,7 +2950,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) return; } - p += snprintf(p, maxlen, policy_modes[mode]); + p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); diff --git a/mm/migrate.c b/mm/migrate.c index 316e720a202..bb940045fe8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -442,6 +442,54 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* + * Gigantic pages are so large that we do not guarantee that page++ pointer + * arithmetic will work across the entire page. We need something more + * specialized. + */ +static void __copy_gigantic_page(struct page *dst, struct page *src, + int nr_pages) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < nr_pages; ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +static void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + int nr_pages; + + if (PageHuge(src)) { + /* hugetlbfs page */ + struct hstate *h = page_hstate(src); + nr_pages = pages_per_huge_page(h); + + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { + __copy_gigantic_page(dst, src, nr_pages); + return; + } + } else { + /* thp page */ + BUG_ON(!PageTransHuge(src)); + nr_pages = hpage_nr_pages(src); + } + + for (i = 0; i < nr_pages; i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + +/* * Copy the page to its new location */ void migrate_page_copy(struct page *newpage, struct page *page) diff --git a/mm/slab.c b/mm/slab.c index 0c8967bb201..eb043bf05f4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -164,72 +164,6 @@ static bool pfmemalloc_active __read_mostly; /* - * kmem_bufctl_t: - * - * Bufctl's are used for linking objs within a slab - * linked offsets. - * - * This implementation relies on "struct page" for locating the cache & - * slab an object belongs to. - * This allows the bufctl structure to be small (one int), but limits - * the number of objects a slab (not a cache) can contain when off-slab - * bufctls are used. The limit is the size of the largest general cache - * that does not use off-slab slabs. - * For 32bit archs with 4 kB pages, is this 56. - * This is not serious, as it is only for large objects, when it is unwise - * to have too many per slab. - * Note: This limit can be raised by introducing a general cache whose size - * is less than 512 (PAGE_SIZE<<3), but greater than 256. 
- */ - -typedef unsigned int kmem_bufctl_t; -#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) -#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) -#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) -#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) - -/* - * struct slab_rcu - * - * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to - * arrange for kmem_freepages to be called via RCU. This is useful if - * we need to approach a kernel structure obliquely, from its address - * obtained without the usual locking. We can lock the structure to - * stabilize it and check it's still at the given address, only if we - * can be sure that the memory has not been meanwhile reused for some - * other kind of object (which our subsystem's lock might corrupt). - * - * rcu_read_lock before reading the address, then rcu_read_unlock after - * taking the spinlock within the structure expected at that address. - */ -struct slab_rcu { - struct rcu_head head; - struct kmem_cache *cachep; - void *addr; -}; - -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - union { - struct { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; - }; - struct slab_rcu __slab_cover_slab_rcu; - }; -}; - -/* * struct array_cache * * Purpose: @@ -456,18 +390,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } -static inline struct slab *virt_to_slab(const void *obj) -{ - struct page *page = virt_to_head_page(obj); - - VM_BUG_ON(!PageSlab(page)); - return page->slab_page; -} - -static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, unsigned int idx) { - return slab->s_mem + cache->size * idx; + return page->s_mem + cache->size * idx; } /* @@ -477,9 +403,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct page *page, void *obj) { - u32 offset = (obj - slab->s_mem); + u32 offset = (obj - page->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } @@ -641,7 +567,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); + return ALIGN(nr_objs * sizeof(unsigned int), align); } /* @@ -660,8 +586,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - The struct slab - * - One kmem_bufctl_t for each object + * - One unsigned int for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * @@ -674,8 +599,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, mgmt_size = 0; nr_objs = slab_size / buffer_size; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; } else { /* * Ignore padding for the initial guess. 
The padding @@ -685,8 +608,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into the memory allocation when taking the padding * into account. */ - nr_objs = (slab_size - sizeof(struct slab)) / - (buffer_size + sizeof(kmem_bufctl_t)); + nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); /* * This calculated number will be either the right @@ -696,9 +618,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, > slab_size) nr_objs--; - if (nr_objs > SLAB_LIMIT) - nr_objs = SLAB_LIMIT; - mgmt_size = slab_mgmt_size(nr_objs, align); } *num = nr_objs; @@ -829,10 +748,8 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } -static inline bool is_slab_pfmemalloc(struct slab *slabp) +static inline bool is_slab_pfmemalloc(struct page *page) { - struct page *page = virt_to_page(slabp->s_mem); - return PageSlabPfmemalloc(page); } @@ -841,23 +758,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { struct kmem_cache_node *n = cachep->node[numa_mem_id()]; - struct slab *slabp; + struct page *page; unsigned long flags; if (!pfmemalloc_active) return; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_full, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_partial, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_partial, lru) + if (is_slab_pfmemalloc(page)) goto out; - list_for_each_entry(slabp, &n->slabs_free, list) - if (is_slab_pfmemalloc(slabp)) + list_for_each_entry(page, &n->slabs_free, lru) + if (is_slab_pfmemalloc(page)) goto out; pfmemalloc_active = false; @@ -897,8 +814,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, */ n = cachep->node[numa_mem_id()]; if (!list_empty(&n->slabs_free) && force_refill) { - struct slab *slabp = virt_to_slab(objp); - ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); + struct page *page = virt_to_head_page(objp); + ClearPageSlabPfmemalloc(page); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -1099,8 +1016,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - struct slab *slabp = virt_to_slab(objp); - int nodeid = slabp->nodeid; + int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1111,7 +1027,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. 
*/ - if (likely(slabp->nodeid == node)) + if (likely(nodeid == node)) return 0; n = cachep->node[node]; @@ -1512,6 +1428,8 @@ void __init kmem_cache_init(void) { int i; + BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < + sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; setup_node_pointer(kmem_cache); @@ -1687,7 +1605,7 @@ static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { struct kmem_cache_node *n; - struct slab *slabp; + struct page *page; unsigned long flags; int node; @@ -1706,15 +1624,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(slabp, &n->slabs_full, list) { + list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) + list_for_each_entry(page, &n->slabs_free, lru) num_slabs++; free_objects += n->free_objects; @@ -1736,19 +1654,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. */ -static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, + int nodeid) { struct page *page; int nr_pages; - int i; - -#ifndef CONFIG_MMU - /* - * Nommu uses slab's for process anonymous memory allocations, and thus - * requires __GFP_COMP to properly refcount higher order allocations - */ - flags |= __GFP_COMP; -#endif flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1772,12 +1682,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - for (i = 0; i < nr_pages; i++) { - __SetPageSlab(page + i); - - if (page->pfmemalloc) - SetPageSlabPfmemalloc(page + i); - } + __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1789,17 +1696,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) kmemcheck_mark_unallocated_pages(page, nr_pages); } - return page_address(page); + return page; } /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, void *addr) +static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - unsigned long i = (1 << cachep->gfporder); - struct page *page = virt_to_page(addr); - const unsigned long nr_freed = i; + const unsigned long nr_freed = (1 << cachep->gfporder); kmemcheck_free_shadow(page, cachep->gfporder); @@ -1809,27 +1714,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); - while (i--) { - BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); - page++; - } + + BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page_mapcount_reset(page); + page->mapping = NULL; memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); + __free_memcg_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) { - struct slab_rcu *slab_rcu = (struct slab_rcu *)head; - struct kmem_cache *cachep = slab_rcu->cachep; + struct kmem_cache *cachep; + struct page *page; - kmem_freepages(cachep, slab_rcu->addr); - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->slabp_cache, slab_rcu); + page = container_of(head, struct page, rcu_head); + cachep = page->slab_cache; + + kmem_freepages(cachep, page); } #if DEBUG @@ -1978,19 +1884,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct slab *slabp = virt_to_slab(objp); + struct page *page = virt_to_head_page(objp); unsigned int objnr; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); if (objnr) { - objp = index_to_obj(cachep, slabp, objnr - 1); + objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, slabp, objnr + 1); + objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -2001,11 +1907,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -2030,7 +1937,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab } } #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) { } #endif @@ -2044,23 +1952,34 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. 
 */
-static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+static void slab_destroy(struct kmem_cache *cachep, struct page *page)
 {
-	void *addr = slabp->s_mem - slabp->colouroff;
+	void *freelist;
 
-	slab_destroy_debugcheck(cachep, slabp);
+	freelist = page->freelist;
+	slab_destroy_debugcheck(cachep, page);
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
-		struct slab_rcu *slab_rcu;
+		struct rcu_head *head;
+
+		/*
+		 * RCU free overloads the RCU head over the LRU.
+		 * slab_page has been overloaded over the LRU, but
+		 * it is not used from this point on, so we can use
+		 * it safely.
+		 */
+		head = (void *)&page->rcu_head;
+		call_rcu(head, kmem_rcu_free);
 
-		slab_rcu = (struct slab_rcu *)slabp;
-		slab_rcu->cachep = cachep;
-		slab_rcu->addr = addr;
-		call_rcu(&slab_rcu->head, kmem_rcu_free);
 	} else {
-		kmem_freepages(cachep, addr);
-		if (OFF_SLAB(cachep))
-			kmem_cache_free(cachep->slabp_cache, slabp);
+		kmem_freepages(cachep, page);
 	}
+
+	/*
+	 * From now on, we don't use the freelist, although the actual
+	 * page may be freed later, in an RCU context.
+	 */
+	if (OFF_SLAB(cachep))
+		kmem_cache_free(cachep->freelist_cache, freelist);
 }
 
 /**
@@ -2097,8 +2016,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			 * use off-slab slabs. Needed to avoid a possible
 			 * looping condition in cache_grow().
 			 */
-			offslab_limit = size - sizeof(struct slab);
-			offslab_limit /= sizeof(kmem_bufctl_t);
+			offslab_limit = size;
+			offslab_limit /= sizeof(unsigned int);
 
 			if (num > offslab_limit)
 				break;
@@ -2220,7 +2139,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 int
 __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 {
-	size_t left_over, slab_size, ralign;
+	size_t left_over, freelist_size, ralign;
 	gfp_t gfp;
 	int err;
 	size_t size = cachep->size;
@@ -2339,22 +2258,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	if (!cachep->num)
 		return -E2BIG;
 
-	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
-			  + sizeof(struct slab), cachep->align);
+	freelist_size =
+		ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
 	 * move it on-slab. This is at the expense of any extra colouring.
 	 */
-	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
+	if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
 		flags &= ~CFLGS_OFF_SLAB;
-		left_over -= slab_size;
+		left_over -= freelist_size;
 	}
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		slab_size =
-			cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+		freelist_size = cachep->num * sizeof(unsigned int);
 
 #ifdef CONFIG_PAGE_POISONING
 	/* If we're going to use the generic kernel_map_pages()
@@ -2371,16 +2289,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	if (cachep->colour_off < cachep->align)
 		cachep->colour_off = cachep->align;
 	cachep->colour = left_over / cachep->colour_off;
-	cachep->slab_size = slab_size;
+	cachep->freelist_size = freelist_size;
 	cachep->flags = flags;
-	cachep->allocflags = 0;
+	cachep->allocflags = __GFP_COMP;
 	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
 		cachep->allocflags |= GFP_DMA;
 	cachep->size = size;
 	cachep->reciprocal_buffer_size = reciprocal_value(size);
 
 	if (flags & CFLGS_OFF_SLAB) {
-		cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
+		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
 		/*
 		 * This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than @@ -2388,7 +2306,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); + BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); } err = setup_cpu_cache(cachep, gfp); @@ -2494,7 +2412,7 @@ static int drain_freelist(struct kmem_cache *cache, { struct list_head *p; int nr_freed; - struct slab *slabp; + struct page *page; nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -2506,18 +2424,18 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - slabp = list_entry(p, struct slab, list); + page = list_entry(p, struct page, lru); #if DEBUG - BUG_ON(slabp->inuse); + BUG_ON(page->active); #endif - list_del(&slabp->list); + list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked * to the cache. */ n->free_objects -= cache->num; spin_unlock_irq(&n->list_lock); - slab_destroy(cache, slabp); + slab_destroy(cache, page); nr_freed++; } out: @@ -2600,52 +2518,42 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * descriptors in kmem_cache_create, we search through the malloc_sizes array. * If we are creating a malloc_sizes cache here it would not be visible to * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have slabp_cache same as the original cache. + * Hence we cannot have freelist_cache same as the original cache. */ -static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, - int colour_off, gfp_t local_flags, - int nodeid) +static void *alloc_slabmgmt(struct kmem_cache *cachep, + struct page *page, int colour_off, + gfp_t local_flags, int nodeid) { - struct slab *slabp; + void *freelist; + void *addr = page_address(page); if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - slabp = kmem_cache_alloc_node(cachep->slabp_cache, + freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - /* - * If the first object in the slab is leaked (it's allocated - * but no one has a reference to it), we want to make sure - * kmemleak does not treat the ->s_mem pointer as a reference - * to the object. Otherwise we will not report the leak. - */ - kmemleak_scan_area(&slabp->list, sizeof(struct list_head), - local_flags); - if (!slabp) + if (!freelist) return NULL; } else { - slabp = objp + colour_off; - colour_off += cachep->slab_size; + freelist = addr + colour_off; + colour_off += cachep->freelist_size; } - slabp->inuse = 0; - slabp->colouroff = colour_off; - slabp->s_mem = objp + colour_off; - slabp->nodeid = nodeid; - slabp->free = 0; - return slabp; + page->active = 0; + page->s_mem = addr + colour_off; + return freelist; } -static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +static inline unsigned int *slab_freelist(struct page *page) { - return (kmem_bufctl_t *) (slabp + 1); + return (unsigned int *)(page->freelist); } static void cache_init_objs(struct kmem_cache *cachep, - struct slab *slabp) + struct page *page) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, slabp, i); + void *objp = index_to_obj(cachep, page, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2681,9 +2589,8 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_bufctl(slabp)[i] = i + 1; + slab_freelist(page)[i] = i; } - slab_bufctl(slabp)[i - 1] = BUFCTL_END; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2696,41 +2603,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, int nodeid) { - void *objp = index_to_obj(cachep, slabp, slabp->free); - kmem_bufctl_t next; + void *objp; - slabp->inuse++; - next = slab_bufctl(slabp)[slabp->free]; + objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); + page->active++; #if DEBUG - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); #endif - slabp->free = next; return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, +static void slab_put_obj(struct kmem_cache *cachep, struct page *page, void *objp, int nodeid) { - unsigned int objnr = obj_to_index(cachep, slabp, objp); - + unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG + unsigned int i; + /* Verify that the slab belongs to the intended node */ - WARN_ON(slabp->nodeid != nodeid); + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); + /* Verify double free bug */ + for (i = page->active; i < cachep->num; i++) { + if (slab_freelist(page)[i] == objnr) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } } #endif - slab_bufctl(slabp)[objnr] = slabp->free; - slabp->free = objnr; - slabp->inuse--; + page->active--; + slab_freelist(page)[page->active] = objnr; } /* @@ -2738,23 +2645,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, * for the slab allocator to be able to lookup the cache and slab of a * virtual address for kfree, ksize, and slab debugging. */ -static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, - void *addr) +static void slab_map_pages(struct kmem_cache *cache, struct page *page, + void *freelist) { - int nr_pages; - struct page *page; - - page = virt_to_page(addr); - - nr_pages = 1; - if (likely(!PageCompound(page))) - nr_pages <<= cache->gfporder; - - do { - page->slab_cache = cache; - page->slab_page = slab; - page++; - } while (--nr_pages); + page->slab_cache = cache; + page->freelist = freelist; } /* @@ -2762,9 +2657,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, * kmem_cache_alloc() when there are no active objs left in a cache. */ static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, void *objp) + gfp_t flags, int nodeid, struct page *page) { - struct slab *slabp; + void *freelist; size_t offset; gfp_t local_flags; struct kmem_cache_node *n; @@ -2805,20 +2700,20 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!objp) - objp = kmem_getpages(cachep, local_flags, nodeid); - if (!objp) + if (!page) + page = kmem_getpages(cachep, local_flags, nodeid); + if (!page) goto failed; /* Get slab management. 
*/ - slabp = alloc_slabmgmt(cachep, objp, offset, + freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!slabp) + if (!freelist) goto opps1; - slab_map_pages(cachep, slabp, objp); + slab_map_pages(cachep, page, freelist); - cache_init_objs(cachep, slabp); + cache_init_objs(cachep, page); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2826,13 +2721,13 @@ static int cache_grow(struct kmem_cache *cachep, spin_lock(&n->list_lock); /* Make slab active. */ - list_add_tail(&slabp->list, &(n->slabs_free)); + list_add_tail(&page->lru, &(n->slabs_free)); STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); return 1; opps1: - kmem_freepages(cachep, objp); + kmem_freepages(cachep, page); failed: if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2880,9 +2775,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct page *page; unsigned int objnr; - struct slab *slabp; + struct page *page; BUG_ON(virt_to_cache(objp) != cachep); @@ -2890,8 +2784,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_head_page(objp); - slabp = page->slab_page; - if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); *dbg_redzone1(cachep, objp) = RED_INACTIVE; @@ -2900,14 +2792,11 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, slabp, objp); + objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); + BUG_ON(objp != index_to_obj(cachep, page, objnr)); -#ifdef CONFIG_DEBUG_SLAB_LEAK - slab_bufctl(slabp)[objnr] = BUFCTL_FREE; -#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2924,33 +2813,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, return objp; } -static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) -{ - kmem_bufctl_t i; - int entries = 0; - - /* Check slab's freelist to see if this obj is there. */ - for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { - entries++; - if (entries > cachep->num || i >= cachep->num) - goto bad; - } - if (entries != cachep->num - slabp->inuse) { -bad: - printk(KERN_ERR "slab: Internal list corruption detected in " - "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n", - cachep->name, cachep->num, slabp, slabp->inuse, - print_tainted()); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, - sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), - 1); - BUG(); - } -} #else #define kfree_debugcheck(x) do { } while(0) #define cache_free_debugcheck(x,objp,z) (objp) -#define check_slabp(x,y) do { } while(0) #endif static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, @@ -2989,7 +2854,7 @@ retry: while (batchcount > 0) { struct list_head *entry; - struct slab *slabp; + struct page *page; /* Get slab alloc is to come from. 
 		 */
 		entry = n->slabs_partial.next;
 		if (entry == &n->slabs_partial) {
 			n->free_touched = 1;
 			entry = n->slabs_free.next;
 			if (entry == &n->slabs_free)
 				goto must_grow;
 		}
 
-		slabp = list_entry(entry, struct slab, list);
-		check_slabp(cachep, slabp);
+		page = list_entry(entry, struct page, lru);
 		check_spinlock_acquired(cachep);
 
 		/*
@@ -3008,24 +2872,23 @@ retry:
 		 * there must be at least one object available for
 		 * allocation.
 		 */
-		BUG_ON(slabp->inuse >= cachep->num);
+		BUG_ON(page->active >= cachep->num);
 
-		while (slabp->inuse < cachep->num && batchcount--) {
+		while (page->active < cachep->num && batchcount--) {
 			STATS_INC_ALLOCED(cachep);
 			STATS_INC_ACTIVE(cachep);
 			STATS_SET_HIGH(cachep);
 
-			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+			ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
 									node));
 		}
-		check_slabp(cachep, slabp);
 
 		/* move slabp to correct slabp list: */
-		list_del(&slabp->list);
-		if (slabp->free == BUFCTL_END)
-			list_add(&slabp->list, &n->slabs_full);
+		list_del(&page->lru);
+		if (page->active == cachep->num)
+			list_add(&page->lru, &n->slabs_full);
 		else
-			list_add(&slabp->list, &n->slabs_partial);
+			list_add(&page->lru, &n->slabs_partial);
 	}
 
 must_grow:
@@ -3097,16 +2960,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-	{
-		struct slab *slabp;
-		unsigned objnr;
-
-		slabp = virt_to_head_page(objp)->slab_page;
-		objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
-		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
-	}
-#endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
@@ -3248,18 +3101,20 @@ retry:
 			 * We may trigger various forms of reclaim on the allowed
 			 * set and go into memory reserves if necessary.
*/ + struct page *page; + if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_mem_id()); + page = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); - if (obj) { + if (page) { /* * Insert into the appropriate per node queues */ - nid = page_to_nid(virt_to_page(obj)); - if (cache_grow(cache, flags, nid, obj)) { + nid = page_to_nid(page); + if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (!obj) @@ -3288,7 +3143,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; void *obj; int x; @@ -3308,26 +3163,24 @@ retry: goto must_grow; } - slabp = list_entry(entry, struct slab, list); + page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); - check_slabp(cachep, slabp); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(slabp->inuse == cachep->num); + BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, slabp, nodeid); - check_slabp(cachep, slabp); + obj = slab_get_obj(cachep, page, nodeid); n->free_objects--; /* move slabp to correct slabp list: */ - list_del(&slabp->list); + list_del(&page->lru); - if (slabp->free == BUFCTL_END) - list_add(&slabp->list, &n->slabs_full); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); else - list_add(&slabp->list, &n->slabs_partial); + list_add(&page->lru, &n->slabs_partial); spin_unlock(&n->list_lock); goto done; @@ -3477,23 +3330,21 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp; - struct slab *slabp; + struct page *page; clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; - slabp = virt_to_slab(objp); + page = virt_to_head_page(objp); n = cachep->node[node]; - list_del(&slabp->list); + list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - check_slabp(cachep, slabp); - slab_put_obj(cachep, slabp, objp, node); + slab_put_obj(cachep, page, objp, node); STATS_DEC_ACTIVE(cachep); n->free_objects++; - check_slabp(cachep, slabp); /* fixup slab chains */ - if (slabp->inuse == 0) { + if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; /* No need to drop any previously held @@ -3502,16 +3353,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, slabp); + slab_destroy(cachep, page); } else { - list_add(&slabp->list, &n->slabs_free); + list_add(&page->lru, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
*/ - list_add_tail(&slabp->list, &n->slabs_partial); + list_add_tail(&page->lru, &n->slabs_partial); } } } @@ -3551,10 +3402,10 @@ free_done: p = n->slabs_free.next; while (p != &(n->slabs_free)) { - struct slab *slabp; + struct page *page; - slabp = list_entry(p, struct slab, list); - BUG_ON(slabp->inuse); + page = list_entry(p, struct page, lru); + BUG_ON(page->active); i++; p = p->next; @@ -4158,7 +4009,7 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct slab *slabp; + struct page *page; unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs = 0; @@ -4178,23 +4029,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) { - if (slabp->inuse != cachep->num && !error) + list_for_each_entry(page, &n->slabs_full, lru) { + if (page->active != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_partial, list) { - if (slabp->inuse == cachep->num && !error) - error = "slabs_partial inuse accounting error"; - if (!slabp->inuse && !error) - error = "slabs_partial/inuse accounting error"; - active_objs += slabp->inuse; + list_for_each_entry(page, &n->slabs_partial, lru) { + if (page->active == cachep->num && !error) + error = "slabs_partial accounting error"; + if (!page->active && !error) + error = "slabs_partial accounting error"; + active_objs += page->active; active_slabs++; } - list_for_each_entry(slabp, &n->slabs_free, list) { - if (slabp->inuse && !error) - error = "slabs_free/inuse accounting error"; + list_for_each_entry(page, &n->slabs_free, lru) { + if (page->active && !error) + error = "slabs_free accounting error"; num_slabs++; } free_objects += n->free_objects; @@ -4346,15 +4197,27 @@ static inline int add_caller(unsigned long *n, unsigned long v) return 1; } -static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) +static void handle_slab(unsigned long *n, struct kmem_cache *c, + struct page *page) { void *p; - int i; + int i, j; + if (n[0] == n[1]) return; - for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { - if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) + for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { + bool active = true; + + for (j = page->active; j < c->num; j++) { + /* Skip freed item */ + if (slab_freelist(page)[j] == i) { + active = false; + break; + } + } + if (!active) continue; + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) return; } @@ -4379,7 +4242,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); - struct slab *slabp; + struct page *page; struct kmem_cache_node *n; const char *name; unsigned long *x = m->private; @@ -4403,10 +4266,10 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(slabp, &n->slabs_full, list) - handle_slab(x, cachep, slabp); - list_for_each_entry(slabp, &n->slabs_partial, list) - handle_slab(x, cachep, slabp); + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); spin_unlock_irq(&n->list_lock); } name = cachep->name; diff --git a/mm/slub.c b/mm/slub.c index 7e8bd8d828b..545a170ebf9 100644 
--- a/mm/slub.c +++ b/mm/slub.c @@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 @@ -933,6 +933,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -1217,8 +1227,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size, /* * Enable debugging if selected on the kernel commandline. */ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + if (slub_debug && (!slub_debug_slabs || (name && + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; return flags; @@ -1260,13 +1270,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); +} + static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} + void *object) +{ + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, + flags & gfp_allowed_mask); +} -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} #endif /* CONFIG_SLUB_DEBUG */ @@ -2829,8 +2856,8 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) @@ -3272,7 +3299,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -3336,7 +3363,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); + kfree_hook(x); __free_memcg_kmem_pages(page, compound_order(page)); return; } diff --git a/mm/swap.c b/mm/swap.c index 7a9f80d451f..84b26aaabd0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { - /* - * hugetlbfs pages cannot be split from under us. 
If this is a - * hugetlbfs page, check refcount on head page and release the page if - * the refcount becomes zero. - */ - if (PageHuge(page)) { - page = compound_head(page); - if (put_page_testzero(page)) - __put_compound_page(page); - - return; - } - if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -111,14 +98,31 @@ static void put_compound_page(struct page *page) * still hot on arches that do not support * this_cpu_cmpxchg_double(). */ - if (PageSlab(page_head)) { - if (PageTail(page)) { + if (PageSlab(page_head) || PageHeadHuge(page_head)) { + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); + atomic_dec(&page->_mapcount); if (put_page_testzero(page_head)) VM_BUG_ON(1); - - atomic_dec(&page->_mapcount); - goto skip_lock_tail; + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + return; } else + /* + * __split_huge_page_refcount + * run before us, "page" was a + * THP tail. The split + * page_head has been freed + * and reallocated as slab or + * hugetlbfs page of smaller + * order (only possible if + * reallocated as slab on + * x86). + */ goto skip_lock; } /* @@ -132,8 +136,27 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); skip_lock: - if (put_page_testzero(page_head)) - __put_single_page(page_head); + if (put_page_testzero(page_head)) { + /* + * The head page may have been + * freed and reallocated as a + * compound page of smaller + * order and then freed again. + * All we know is that it + * cannot have become: a THP + * page, a compound page of + * higher order, a tail page. + * That is because we still + * hold the refcount of the + * split THP tail and + * page_head was the THP head + * before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } out_put_single: if (put_page_testzero(page)) __put_single_page(page); @@ -155,7 +178,6 @@ out_put_single: VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); -skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ + unsigned long flags; bool got = false; - struct page *page_head; - - /* - * If this is a hugetlbfs page it cannot be split under us. Simply - * increment refcount for the head page. - */ - if (PageHuge(page)) { - page_head = compound_head(page); - atomic_inc(&page_head->_count); - got = true; - } else { - unsigned long flags; + struct page *page_head = compound_trans_head(page); - page_head = compound_trans_head(page); - if (likely(page != page_head && - get_page_unless_zero(page_head))) { - - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; - } - } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. 
- */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head) || PageHeadHuge(page_head)) { if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON(!PageHead(page_head)); __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) + return true; + } else { + /* + * __split_huge_page_refcount run + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } return got; } diff --git a/net/Kconfig b/net/Kconfig index 0715db64a5c..d334678c0bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -224,7 +224,7 @@ source "net/hsr/Kconfig" config RPS boolean - depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS + depends on SMP && SYSFS default y config RFS_ACCEL @@ -235,7 +235,7 @@ config RFS_ACCEL config XPS boolean - depends on SMP && USE_GENERIC_SMP_HELPERS + depends on SMP default y config NETPRIO_CGROUP diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3dc0c6cf02a..c4638e6f023 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1425,7 +1425,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) do { if (dma_async_is_tx_complete(tp->ucopy.dma_chan, last_issued, &done, - &used) == DMA_SUCCESS) { + &used) == DMA_COMPLETE) { /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); break; @@ -1433,7 +1433,7 @@ static void tcp_service_net_dma(struct sock *sk, bool wait) struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_async_wait_queue)) && (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { + used) == DMA_COMPLETE)) { __skb_dequeue(&sk->sk_async_wait_queue); kfree_skb(skb); } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d0d14a04dce..bf04b30a788 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -471,15 +471,6 @@ struct rpc_filelist { umode_t mode; }; -static int rpc_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static const struct dentry_operations rpc_dentry_operations = { - .d_delete = rpc_delete_dentry, -}; - static struct inode * rpc_get_inode(struct super_block *sb, umode_t mode) { @@ -1266,7 +1257,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &rpc_dentry_operations; + sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); diff --git a/scripts/asn1_compiler.c b/scripts/asn1_compiler.c index db0e5cd34c7..91c4117637a 100644 --- a/scripts/asn1_compiler.c +++ b/scripts/asn1_compiler.c @@ -1353,6 +1353,8 @@ static void 
render_out_of_line_list(FILE *out) render_opcode(out, "ASN1_OP_END_SET_OF%s,\n", act); render_opcode(out, "_jump_target(%u),\n", entry); break; + default: + break; } if (e->action) render_opcode(out, "_action(ACT_%s),\n", diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 61090e0ff61..9c981003037 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3289,6 +3289,7 @@ sub process { } } if (!defined $suppress_whiletrailers{$linenr} && + defined($stat) && defined($cond) && $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); diff --git a/security/Makefile b/security/Makefile index c26c81e9257..a5918e01a4f 100644 --- a/security/Makefile +++ b/security/Makefile @@ -16,7 +16,6 @@ obj-$(CONFIG_MMU) += min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o obj-$(CONFIG_SECURITYFS) += inode.o -# Must precede capability.o in order to stack properly. obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o obj-$(CONFIG_AUDIT) += lsm_audit.o diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 031d2d9dd69..89c78658031 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -111,7 +111,6 @@ static const char *const aa_audit_type[] = { static void audit_pre(struct audit_buffer *ab, void *ca) { struct common_audit_data *sa = ca; - struct task_struct *tsk = sa->aad->tsk ? sa->aad->tsk : current; if (aa_g_audit_header) { audit_log_format(ab, "apparmor="); @@ -132,11 +131,6 @@ static void audit_pre(struct audit_buffer *ab, void *ca) if (sa->aad->profile) { struct aa_profile *profile = sa->aad->profile; - pid_t pid; - rcu_read_lock(); - pid = rcu_dereference(tsk->real_parent)->pid; - rcu_read_unlock(); - audit_log_format(ab, " parent=%d", pid); if (profile->ns != root_ns) { audit_log_format(ab, " namespace="); audit_log_untrustedstring(ab, profile->ns->base.hname); @@ -149,12 +143,6 @@ static void audit_pre(struct audit_buffer *ab, void *ca) audit_log_format(ab, " name="); audit_log_untrustedstring(ab, sa->aad->name); } - - if (sa->aad->tsk) { - audit_log_format(ab, " pid=%d comm=", tsk->pid); - audit_log_untrustedstring(ab, tsk->comm); - } - } /** @@ -212,7 +200,7 @@ int aa_audit(int type, struct aa_profile *profile, gfp_t gfp, if (sa->aad->type == AUDIT_APPARMOR_KILL) (void)send_sig_info(SIGKILL, NULL, - sa->aad->tsk ? sa->aad->tsk : current); + sa->u.tsk ? 
sa->u.tsk : current); if (sa->aad->type == AUDIT_APPARMOR_ALLOWED) return complain_error(sa->aad->error); diff --git a/security/apparmor/capability.c b/security/apparmor/capability.c index 84d1f5f5387..1101c6f64bb 100644 --- a/security/apparmor/capability.c +++ b/security/apparmor/capability.c @@ -53,8 +53,7 @@ static void audit_cb(struct audit_buffer *ab, void *va) /** * audit_caps - audit a capability - * @profile: profile confining task (NOT NULL) - * @task: task capability test was performed against (NOT NULL) + * @profile: profile being tested for confinement (NOT NULL) * @cap: capability tested * @error: error code returned by test * @@ -63,8 +62,7 @@ static void audit_cb(struct audit_buffer *ab, void *va) * * Returns: 0 or sa->error on success, error code on failure */ -static int audit_caps(struct aa_profile *profile, struct task_struct *task, - int cap, int error) +static int audit_caps(struct aa_profile *profile, int cap, int error) { struct audit_cache *ent; int type = AUDIT_APPARMOR_AUTO; @@ -73,7 +71,6 @@ static int audit_caps(struct aa_profile *profile, struct task_struct *task, sa.type = LSM_AUDIT_DATA_CAP; sa.aad = &aad; sa.u.cap = cap; - sa.aad->tsk = task; sa.aad->op = OP_CAPABLE; sa.aad->error = error; @@ -124,8 +121,7 @@ static int profile_capable(struct aa_profile *profile, int cap) /** * aa_capable - test permission to use capability - * @task: task doing capability test against (NOT NULL) - * @profile: profile confining @task (NOT NULL) + * @profile: profile being tested against (NOT NULL) * @cap: capability to be tested * @audit: whether an audit record should be generated * @@ -133,8 +129,7 @@ static int profile_capable(struct aa_profile *profile, int cap) * * Returns: 0 on success, or else an error code. */ -int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap, - int audit) +int aa_capable(struct aa_profile *profile, int cap, int audit) { int error = profile_capable(profile, cap); @@ -144,5 +139,5 @@ int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap, return error; } - return audit_caps(profile, task, cap, error); + return audit_caps(profile, cap, error); } diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 26c607c971f..452567d3a08 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -50,23 +50,21 @@ void aa_free_domain_entries(struct aa_domain *domain) /** * may_change_ptraced_domain - check if can change profile on ptraced task - * @task: task we want to change profile of (NOT NULL) * @to_profile: profile to change to (NOT NULL) * - * Check if the task is ptraced and if so if the tracing task is allowed + * Check if current is ptraced and if so if the tracing task is allowed * to trace the new domain * * Returns: %0 or error if change not allowed */ -static int may_change_ptraced_domain(struct task_struct *task, - struct aa_profile *to_profile) +static int may_change_ptraced_domain(struct aa_profile *to_profile) { struct task_struct *tracer; struct aa_profile *tracerp = NULL; int error = 0; rcu_read_lock(); - tracer = ptrace_parent(task); + tracer = ptrace_parent(current); if (tracer) /* released below */ tracerp = aa_get_task_profile(tracer); @@ -75,7 +73,7 @@ static int may_change_ptraced_domain(struct task_struct *task, if (!tracer || unconfined(tracerp)) goto out; - error = aa_may_ptrace(tracer, tracerp, to_profile, PTRACE_MODE_ATTACH); + error = aa_may_ptrace(tracerp, to_profile, PTRACE_MODE_ATTACH); out: rcu_read_unlock(); @@ -477,7 +475,7 @@ int 
apparmor_bprm_set_creds(struct linux_binprm *bprm) } if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { - error = may_change_ptraced_domain(current, new_profile); + error = may_change_ptraced_domain(new_profile); if (error) { aa_put_profile(new_profile); goto audit; @@ -690,7 +688,7 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest) } } - error = may_change_ptraced_domain(current, hat); + error = may_change_ptraced_domain(hat); if (error) { info = "ptraced"; error = -EPERM; @@ -829,7 +827,7 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec, } /* check if tracing task is allowed to trace target domain */ - error = may_change_ptraced_domain(current, target); + error = may_change_ptraced_domain(target); if (error) { info = "ptrace prevents transition"; goto audit; diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index 30e8d768725..ba3dfd17f23 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -109,7 +109,6 @@ struct apparmor_audit_data { void *profile; const char *name; const char *info; - struct task_struct *tsk; union { void *target; struct { diff --git a/security/apparmor/include/capability.h b/security/apparmor/include/capability.h index 2e7c9d6a2f3..fc3fa381d85 100644 --- a/security/apparmor/include/capability.h +++ b/security/apparmor/include/capability.h @@ -4,7 +4,7 @@ * This file contains AppArmor capability mediation definitions. * * Copyright (C) 1998-2008 Novell/SUSE - * Copyright 2009-2010 Canonical Ltd. + * Copyright 2009-2013 Canonical Ltd. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -38,8 +38,7 @@ struct aa_caps { extern struct aa_fs_entry aa_fs_entry_caps[]; -int aa_capable(struct task_struct *task, struct aa_profile *profile, int cap, - int audit); +int aa_capable(struct aa_profile *profile, int cap, int audit); static inline void aa_free_cap_rules(struct aa_caps *caps) { diff --git a/security/apparmor/include/ipc.h b/security/apparmor/include/ipc.h index aeda0fbc8b2..288ca76e2fb 100644 --- a/security/apparmor/include/ipc.h +++ b/security/apparmor/include/ipc.h @@ -19,8 +19,8 @@ struct aa_profile; -int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer, - struct aa_profile *tracee, unsigned int mode); +int aa_may_ptrace(struct aa_profile *tracer, struct aa_profile *tracee, + unsigned int mode); int aa_ptrace(struct task_struct *tracer, struct task_struct *tracee, unsigned int mode); diff --git a/security/apparmor/ipc.c b/security/apparmor/ipc.c index c51d2266587..777ac1c4725 100644 --- a/security/apparmor/ipc.c +++ b/security/apparmor/ipc.c @@ -54,15 +54,14 @@ static int aa_audit_ptrace(struct aa_profile *profile, /** * aa_may_ptrace - test if tracer task can trace the tracee - * @tracer_task: task who will do the tracing (NOT NULL) * @tracer: profile of the task doing the tracing (NOT NULL) * @tracee: task to be traced * @mode: whether PTRACE_MODE_READ || PTRACE_MODE_ATTACH * * Returns: %0 else error code if permission denied or error */ -int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer, - struct aa_profile *tracee, unsigned int mode) +int aa_may_ptrace(struct aa_profile *tracer, struct aa_profile *tracee, + unsigned int mode) { /* TODO: currently only based on capability, not extended ptrace * rules, @@ -72,7 +71,7 @@ int aa_may_ptrace(struct task_struct *tracer_task, struct aa_profile *tracer, 
if (unconfined(tracer) || tracer == tracee) return 0; /* log this capability request */ - return aa_capable(tracer_task, tracer, CAP_SYS_PTRACE, 1); + return aa_capable(tracer, CAP_SYS_PTRACE, 1); } /** @@ -101,7 +100,7 @@ int aa_ptrace(struct task_struct *tracer, struct task_struct *tracee, if (!unconfined(tracer_p)) { struct aa_profile *tracee_p = aa_get_task_profile(tracee); - error = aa_may_ptrace(tracer, tracer_p, tracee_p, mode); + error = aa_may_ptrace(tracer_p, tracee_p, mode); error = aa_audit_ptrace(tracer_p, tracee_p, error); aa_put_profile(tracee_p); diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index fb99e18123b..4257b7e2796 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -145,7 +145,7 @@ static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, if (!error) { profile = aa_cred_profile(cred); if (!unconfined(profile)) - error = aa_capable(current, profile, cap, audit); + error = aa_capable(profile, cap, audit); } return error; } diff --git a/security/capability.c b/security/capability.c index dbeb9bc27b2..8b4f24ae433 100644 --- a/security/capability.c +++ b/security/capability.c @@ -777,9 +777,15 @@ static int cap_xfrm_policy_delete_security(struct xfrm_sec_ctx *ctx) return 0; } -static int cap_xfrm_state_alloc_security(struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx, - u32 secid) +static int cap_xfrm_state_alloc(struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static int cap_xfrm_state_alloc_acquire(struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, + u32 secid) { return 0; } @@ -1101,7 +1107,8 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, xfrm_policy_clone_security); set_to_cap_if_null(ops, xfrm_policy_free_security); set_to_cap_if_null(ops, xfrm_policy_delete_security); - set_to_cap_if_null(ops, xfrm_state_alloc_security); + set_to_cap_if_null(ops, xfrm_state_alloc); + set_to_cap_if_null(ops, xfrm_state_alloc_acquire); set_to_cap_if_null(ops, xfrm_state_free_security); set_to_cap_if_null(ops, xfrm_state_delete_security); set_to_cap_if_null(ops, xfrm_policy_lookup); diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 0b759e17a13..77ca965ab68 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -13,7 +13,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/err.h> +#include <linux/sched.h> #include <linux/rbtree.h> +#include <linux/cred.h> #include <linux/key-type.h> #include <linux/digsig.h> @@ -21,21 +23,29 @@ static struct key *keyring[INTEGRITY_KEYRING_MAX]; +#ifdef CONFIG_IMA_TRUSTED_KEYRING +static const char *keyring_name[INTEGRITY_KEYRING_MAX] = { + ".evm", + ".module", + ".ima", +}; +#else static const char *keyring_name[INTEGRITY_KEYRING_MAX] = { "_evm", "_module", "_ima", }; +#endif int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, - const char *digest, int digestlen) + const char *digest, int digestlen) { if (id >= INTEGRITY_KEYRING_MAX) return -EINVAL; if (!keyring[id]) { keyring[id] = - request_key(&key_type_keyring, keyring_name[id], NULL); + request_key(&key_type_keyring, keyring_name[id], NULL); if (IS_ERR(keyring[id])) { int err = PTR_ERR(keyring[id]); pr_err("no %s keyring: %d\n", keyring_name[id], err); @@ -44,9 +54,10 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, } } - switch (sig[0]) { + switch (sig[1]) { case 1: - return digsig_verify(keyring[id], sig, siglen, + /* v1 API expects
signature without xattr type */ + return digsig_verify(keyring[id], sig + 1, siglen - 1, digest, digestlen); case 2: return asymmetric_verify(keyring[id], sig, siglen, @@ -55,3 +66,21 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, return -EOPNOTSUPP; } + +int integrity_init_keyring(const unsigned int id) +{ + const struct cred *cred = current_cred(); + const struct user_struct *user = cred->user; + + keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0), + KGIDT_INIT(0), cred, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, user->uid_keyring); + if (!IS_ERR(keyring[id])) + set_bit(KEY_FLAG_TRUSTED_ONLY, &keyring[id]->flags); + else + pr_info("Can't allocate %s keyring (%ld)\n", + keyring_name[id], PTR_ERR(keyring[id])); + return 0; +} diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index b4754667659..9eae4809006 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -20,17 +20,6 @@ #include "integrity.h" /* - * signature format v2 - for using with asymmetric keys - */ -struct signature_v2_hdr { - uint8_t version; /* signature format version */ - uint8_t hash_algo; /* Digest algorithm [enum pkey_hash_algo] */ - uint32_t keyid; /* IMA key identifier - not X509/PGP specific*/ - uint16_t sig_size; /* signature size */ - uint8_t sig[0]; /* signature payload */ -} __packed; - -/* * Request an asymmetric key. */ static struct key *request_asymmetric_key(struct key *keyring, uint32_t keyid) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index af9b6852f4e..336b3ddfe63 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -123,7 +123,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, goto out; } - xattr_len = rc - 1; + xattr_len = rc; /* check value type */ switch (xattr_data->type) { @@ -143,7 +143,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, if (rc) break; rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM, - xattr_data->digest, xattr_len, + (const char *)xattr_data, xattr_len, calc.digest, sizeof(calc.digest)); if (!rc) { /* we probably want to replace rsa with hmac here */ diff --git a/security/integrity/evm/evm_posix_acl.c b/security/integrity/evm/evm_posix_acl.c index b1753e98bf9..46408b9e62e 100644 --- a/security/integrity/evm/evm_posix_acl.c +++ b/security/integrity/evm/evm_posix_acl.c @@ -11,8 +11,9 @@ #include <linux/module.h> #include <linux/xattr.h> +#include <linux/evm.h> -int posix_xattr_acl(char *xattr) +int posix_xattr_acl(const char *xattr) { int xattr_len = strlen(xattr); diff --git a/security/integrity/iint.c b/security/integrity/iint.c index 74522dbd10a..c49d3f14cbe 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -70,6 +70,8 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode) static void iint_free(struct integrity_iint_cache *iint) { + kfree(iint->ima_hash); + iint->ima_hash = NULL; iint->version = 0; iint->flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 39196abaff0..dad8d4ca243 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -9,6 +9,7 @@ config IMA select CRYPTO_HMAC select CRYPTO_MD5 select CRYPTO_SHA1 + select CRYPTO_HASH_INFO select TCG_TPM if HAS_IOMEM && !UML select TCG_TIS if TCG_TPM && X86 select 
TCG_IBMVTPM if TCG_TPM && PPC64 @@ -45,6 +46,69 @@ config IMA_LSM_RULES help Disabling this option will disregard LSM based policy rules. +choice + prompt "Default template" + default IMA_NG_TEMPLATE + depends on IMA + help + Select the default IMA measurement template. + + The original 'ima' measurement list template contains a + hash, defined as 20 bytes, and a null terminated pathname, + limited to 255 characters. The 'ima-ng' measurement list + template permits both larger hash digests and longer + pathnames. + + config IMA_TEMPLATE + bool "ima" + config IMA_NG_TEMPLATE + bool "ima-ng (default)" + config IMA_SIG_TEMPLATE + bool "ima-sig" +endchoice + +config IMA_DEFAULT_TEMPLATE + string + depends on IMA + default "ima" if IMA_TEMPLATE + default "ima-ng" if IMA_NG_TEMPLATE + default "ima-sig" if IMA_SIG_TEMPLATE + +choice + prompt "Default integrity hash algorithm" + default IMA_DEFAULT_HASH_SHA1 + depends on IMA + help + Select the default hash algorithm used for the measurement + list, integrity appraisal and audit log. The compiled default + hash algorithm can be overwritten using the kernel command + line 'ima_hash=' option. + + config IMA_DEFAULT_HASH_SHA1 + bool "SHA1 (default)" + depends on CRYPTO_SHA1 + + config IMA_DEFAULT_HASH_SHA256 + bool "SHA256" + depends on CRYPTO_SHA256 && !IMA_TEMPLATE + + config IMA_DEFAULT_HASH_SHA512 + bool "SHA512" + depends on CRYPTO_SHA512 && !IMA_TEMPLATE + + config IMA_DEFAULT_HASH_WP512 + bool "WP512" + depends on CRYPTO_WP512 && !IMA_TEMPLATE +endchoice + +config IMA_DEFAULT_HASH + string + depends on IMA + default "sha1" if IMA_DEFAULT_HASH_SHA1 + default "sha256" if IMA_DEFAULT_HASH_SHA256 + default "sha512" if IMA_DEFAULT_HASH_SHA512 + default "wp512" if IMA_DEFAULT_HASH_WP512 + config IMA_APPRAISE bool "Appraise integrity measurements" depends on IMA @@ -59,3 +123,11 @@ config IMA_APPRAISE For more information on integrity appraisal refer to: <http://linux-ima.sourceforge.net> If unsure, say N. + +config IMA_TRUSTED_KEYRING + bool "Require all keys on the _ima keyring be signed" + depends on IMA_APPRAISE && SYSTEM_TRUSTED_KEYRING + default y + help + This option requires that all keys added to the _ima + keyring be signed by a key on the system trusted keyring. 
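A note on the two string defaults above: CONFIG_IMA_DEFAULT_TEMPLATE and CONFIG_IMA_DEFAULT_HASH carry nothing but names. They are resolved at boot by string lookup, in ima_template_desc_current() and hash_setup() later in this series. The stand-alone C sketch below illustrates that lookup-and-split pattern; the {name, fmt} pairs are copied from defined_templates[] in the new ima_template.c, but struct template_desc, lookup_desc() and main() are illustrative stand-ins, not the kernel implementation.

  #include <stdio.h>
  #include <string.h>

  /* Stand-in for the kernel's defined_templates[] table; the
   * values below are the ones ima_template.c defines. */
  struct template_desc {
          const char *name; /* e.g. the CONFIG_IMA_DEFAULT_TEMPLATE string */
          const char *fmt;  /* '|'-separated template field identifiers */
  };

  static const struct template_desc templates[] = {
          { "ima",     "d|n" },
          { "ima-ng",  "d-ng|n-ng" },
          { "ima-sig", "d-ng|n-ng|sig" },
  };

  /* Resolve a configured name to a descriptor, in the same spirit
   * as lookup_template_desc(). */
  static const struct template_desc *lookup_desc(const char *name)
  {
          size_t i;

          for (i = 0; i < sizeof(templates) / sizeof(templates[0]); i++)
                  if (strcmp(templates[i].name, name) == 0)
                          return &templates[i];
          return NULL; /* unknown name: the kernel keeps its default */
  }

  int main(void)
  {
          const struct template_desc *desc = lookup_desc("ima-ng");
          char fmt[32];
          char *field;

          if (!desc)
                  return 1;

          /* Tokenize the format on '|', as template_desc_init_fields()
           * does with strsep() on a writable copy. */
          snprintf(fmt, sizeof(fmt), "%s", desc->fmt);
          for (field = strtok(fmt, "|"); field; field = strtok(NULL, "|"))
                  printf("field: %s\n", field); /* d-ng, then n-ng */

          return 0;
  }

The same by-name resolution applies to the hash default: hash_setup() walks hash_algo_name[] for an exact match and leaves ima_hash_algo at its compiled-in value when nothing matches. The legacy 'ima' format is fixed at a 20-byte digest plus a bounded pathname, which is why the SHA256, SHA512 and WP512 choices above are guarded by !IMA_TEMPLATE.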
diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile index 56dfee7cbf6..d79263d2fdb 100644 --- a/security/integrity/ima/Makefile +++ b/security/integrity/ima/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IMA) += ima.o ima-y := ima_fs.o ima_queue.o ima_init.o ima_main.o ima_crypto.o ima_api.o \ - ima_policy.o + ima_policy.o ima_template.o ima_template_lib.o ima-$(CONFIG_IMA_APPRAISE) += ima_appraise.o diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index b3dd616560f..bf03c6a16cc 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -36,23 +36,48 @@ enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; #define IMA_HASH_BITS 9 #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) +#define IMA_TEMPLATE_FIELD_ID_MAX_LEN 16 +#define IMA_TEMPLATE_NUM_FIELDS_MAX 15 + +#define IMA_TEMPLATE_IMA_NAME "ima" +#define IMA_TEMPLATE_IMA_FMT "d|n" + /* set during initialization */ extern int ima_initialized; extern int ima_used_chip; -extern char *ima_hash; +extern int ima_hash_algo; extern int ima_appraise; -/* IMA inode template definition */ -struct ima_template_data { - u8 digest[IMA_DIGEST_SIZE]; /* sha1/md5 measurement hash */ - char file_name[IMA_EVENT_NAME_LEN_MAX + 1]; /* name + \0 */ +/* IMA template field data definition */ +struct ima_field_data { + u8 *data; + u32 len; +}; + +/* IMA template field definition */ +struct ima_template_field { + const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN]; + int (*field_init) (struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len, struct ima_field_data *field_data); + void (*field_show) (struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data); +}; + +/* IMA template descriptor definition */ +struct ima_template_desc { + char *name; + char *fmt; + int num_fields; + struct ima_template_field **fields; }; struct ima_template_entry { - u8 digest[IMA_DIGEST_SIZE]; /* sha1 or md5 measurement hash */ - const char *template_name; - int template_len; - struct ima_template_data template; + u8 digest[TPM_DIGEST_SIZE]; /* sha1 or md5 measurement hash */ + struct ima_template_desc *template_desc; /* template descriptor */ + u32 template_data_len; + struct ima_field_data template_data[0]; /* template related data */ }; struct ima_queue_entry { @@ -69,13 +94,21 @@ int ima_fs_init(void); void ima_fs_cleanup(void); int ima_inode_alloc(struct inode *inode); int ima_add_template_entry(struct ima_template_entry *entry, int violation, - const char *op, struct inode *inode); -int ima_calc_file_hash(struct file *file, char *digest); -int ima_calc_buffer_hash(const void *data, int len, char *digest); -int ima_calc_boot_aggregate(char *digest); -void ima_add_violation(struct inode *inode, const unsigned char *filename, + const char *op, struct inode *inode, + const unsigned char *filename); +int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash); +int ima_calc_field_array_hash(struct ima_field_data *field_data, int num_fields, + struct ima_digest_data *hash); +int __init ima_calc_boot_aggregate(struct ima_digest_data *hash); +void ima_add_violation(struct file *file, const unsigned char *filename, const char *op, const char *cause); int ima_init_crypto(void); +void ima_putc(struct seq_file *m, void *data, int datalen); +void ima_print_digest(struct seq_file *m, u8 *digest, int size); +struct ima_template_desc *ima_template_desc_current(void); +int
ima_init_template(void); /* * used to protect h_table and sha_table @@ -98,14 +131,21 @@ static inline unsigned long ima_hash_key(u8 *digest) int ima_get_action(struct inode *inode, int mask, int function); int ima_must_measure(struct inode *inode, int mask, int function); int ima_collect_measurement(struct integrity_iint_cache *iint, - struct file *file); + struct file *file, + struct evm_ima_xattr_data **xattr_value, + int *xattr_len); void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, - const unsigned char *filename); + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len); void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename); +int ima_alloc_init_template(struct integrity_iint_cache *iint, + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len, struct ima_template_entry **entry); int ima_store_template(struct ima_template_entry *entry, int violation, - struct inode *inode); -void ima_template_show(struct seq_file *m, void *e, enum ima_show_type show); + struct inode *inode, const unsigned char *filename); const char *ima_d_path(struct path *path, char **pathbuf); /* rbtree tree calls to lookup, insert, delete @@ -131,17 +171,25 @@ void ima_delete_rules(void); #ifdef CONFIG_IMA_APPRAISE int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, - struct file *file, const unsigned char *filename); + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len); int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func); void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file); enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, int func); +void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_digest_data *hash); +int ima_read_xattr(struct dentry *dentry, + struct evm_ima_xattr_data **xattr_value); #else static inline int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, struct file *file, - const unsigned char *filename) + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len) { return INTEGRITY_UNKNOWN; } @@ -162,6 +210,19 @@ static inline enum integrity_status ima_get_cache_status(struct integrity_iint_c { return INTEGRITY_UNKNOWN; } + +static inline void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, + int xattr_len, + struct ima_digest_data *hash) +{ +} + +static inline int ima_read_xattr(struct dentry *dentry, + struct evm_ima_xattr_data **xattr_value) +{ + return 0; +} + #endif /* LSM based policy rules require audit */ diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 1c03e8f1e0e..0e7540863fc 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -18,9 +18,46 @@ #include <linux/fs.h> #include <linux/xattr.h> #include <linux/evm.h> +#include <crypto/hash_info.h> #include "ima.h" -static const char *IMA_TEMPLATE_NAME = "ima"; +/* + * ima_alloc_init_template - create and initialize a new template entry + */ +int ima_alloc_init_template(struct integrity_iint_cache *iint, + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len, struct ima_template_entry **entry) +{ + struct ima_template_desc *template_desc = ima_template_desc_current(); + int i, result = 0; + + *entry = 
kzalloc(sizeof(**entry) + template_desc->num_fields * + sizeof(struct ima_field_data), GFP_NOFS); + if (!*entry) + return -ENOMEM; + + for (i = 0; i < template_desc->num_fields; i++) { + struct ima_template_field *field = template_desc->fields[i]; + u32 len; + + result = field->field_init(iint, file, filename, + xattr_value, xattr_len, + &((*entry)->template_data[i])); + if (result != 0) + goto out; + + len = (*entry)->template_data[i].len; + (*entry)->template_data_len += sizeof(len); + (*entry)->template_data_len += len; + } + (*entry)->template_desc = template_desc; + return 0; +out: + kfree(*entry); + *entry = NULL; + return result; +} /* * ima_store_template - store ima template measurements @@ -39,28 +76,34 @@ static const char *IMA_TEMPLATE_NAME = "ima"; * Returns 0 on success, error code otherwise */ int ima_store_template(struct ima_template_entry *entry, - int violation, struct inode *inode) + int violation, struct inode *inode, + const unsigned char *filename) { const char *op = "add_template_measure"; const char *audit_cause = "hashing_error"; + char *template_name = entry->template_desc->name; int result; - - memset(entry->digest, 0, sizeof(entry->digest)); - entry->template_name = IMA_TEMPLATE_NAME; - entry->template_len = sizeof(entry->template); + struct { + struct ima_digest_data hdr; + char digest[TPM_DIGEST_SIZE]; + } hash; if (!violation) { - result = ima_calc_buffer_hash(&entry->template, - entry->template_len, - entry->digest); + int num_fields = entry->template_desc->num_fields; + + /* this function uses default algo */ + hash.hdr.algo = HASH_ALGO_SHA1; + result = ima_calc_field_array_hash(&entry->template_data[0], + num_fields, &hash.hdr); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, - entry->template_name, op, + template_name, op, audit_cause, result, 0); return result; } + memcpy(entry->digest, hash.hdr.digest, hash.hdr.length); } - result = ima_add_template_entry(entry, violation, op, inode); + result = ima_add_template_entry(entry, violation, op, inode, filename); return result; } @@ -71,24 +114,24 @@ int ima_store_template(struct ima_template_entry *entry, * By extending the PCR with 0xFF's instead of with zeroes, the PCR * value is invalidated. 
*/ -void ima_add_violation(struct inode *inode, const unsigned char *filename, +void ima_add_violation(struct file *file, const unsigned char *filename, const char *op, const char *cause) { struct ima_template_entry *entry; + struct inode *inode = file->f_dentry->d_inode; int violation = 1; int result; /* can overflow, only indicator */ atomic_long_inc(&ima_htable.violations); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { + result = ima_alloc_init_template(NULL, file, filename, + NULL, 0, &entry); + if (result < 0) { result = -ENOMEM; goto err_out; } - memset(&entry->template, 0, sizeof(entry->template)); - strncpy(entry->template.file_name, filename, IMA_EVENT_NAME_LEN_MAX); - result = ima_store_template(entry, violation, inode); + result = ima_store_template(entry, violation, inode, filename); if (result < 0) kfree(entry); err_out: @@ -138,20 +181,42 @@ int ima_must_measure(struct inode *inode, int mask, int function) * Return 0 on success, error code otherwise */ int ima_collect_measurement(struct integrity_iint_cache *iint, - struct file *file) + struct file *file, + struct evm_ima_xattr_data **xattr_value, + int *xattr_len) { struct inode *inode = file_inode(file); const char *filename = file->f_dentry->d_name.name; int result = 0; + struct { + struct ima_digest_data hdr; + char digest[IMA_MAX_DIGEST_SIZE]; + } hash; + + if (xattr_value) + *xattr_len = ima_read_xattr(file->f_dentry, xattr_value); if (!(iint->flags & IMA_COLLECTED)) { u64 i_version = file_inode(file)->i_version; - iint->ima_xattr.type = IMA_XATTR_DIGEST; - result = ima_calc_file_hash(file, iint->ima_xattr.digest); + /* use default hash algorithm */ + hash.hdr.algo = ima_hash_algo; + + if (xattr_value) + ima_get_hash_algo(*xattr_value, *xattr_len, &hash.hdr); + + result = ima_calc_file_hash(file, &hash.hdr); if (!result) { - iint->version = i_version; - iint->flags |= IMA_COLLECTED; + int length = sizeof(hash.hdr) + hash.hdr.length; + void *tmpbuf = krealloc(iint->ima_hash, length, + GFP_NOFS); + if (tmpbuf) { + iint->ima_hash = tmpbuf; + memcpy(iint->ima_hash, &hash, length); + iint->version = i_version; + iint->flags |= IMA_COLLECTED; + } else + result = -ENOMEM; } } if (result) @@ -177,7 +242,9 @@ int ima_collect_measurement(struct integrity_iint_cache *iint, * Must be called with iint->mutex held. */ void ima_store_measurement(struct integrity_iint_cache *iint, - struct file *file, const unsigned char *filename) + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len) { const char *op = "add_template_measure"; const char *audit_cause = "ENOMEM"; @@ -189,19 +256,15 @@ void ima_store_measurement(struct integrity_iint_cache *iint, if (iint->flags & IMA_MEASURED) return; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { + result = ima_alloc_init_template(iint, file, filename, + xattr_value, xattr_len, &entry); + if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, 0); return; } - memset(&entry->template, 0, sizeof(entry->template)); - memcpy(entry->template.digest, iint->ima_xattr.digest, IMA_DIGEST_SIZE); - strcpy(entry->template.file_name, - (strlen(filename) > IMA_EVENT_NAME_LEN_MAX) ? 
- file->f_dentry->d_name.name : filename); - result = ima_store_template(entry, violation, inode); + result = ima_store_template(entry, violation, inode, filename); if (!result || result == -EEXIST) iint->flags |= IMA_MEASURED; if (result < 0) @@ -212,14 +275,16 @@ void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename) { struct audit_buffer *ab; - char hash[(IMA_DIGEST_SIZE * 2) + 1]; + char hash[(iint->ima_hash->length * 2) + 1]; + const char *algo_name = hash_algo_name[iint->ima_hash->algo]; + char algo_hash[sizeof(hash) + strlen(algo_name) + 2]; int i; if (iint->flags & IMA_AUDITED) return; - for (i = 0; i < IMA_DIGEST_SIZE; i++) - hex_byte_pack(hash + (i * 2), iint->ima_xattr.digest[i]); + for (i = 0; i < iint->ima_hash->length; i++) + hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]); hash[i * 2] = '\0'; ab = audit_log_start(current->audit_context, GFP_KERNEL, @@ -230,7 +295,8 @@ void ima_audit_measurement(struct integrity_iint_cache *iint, audit_log_format(ab, "file="); audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash="); - audit_log_untrustedstring(ab, hash); + snprintf(algo_hash, sizeof(algo_hash), "%s:%s", algo_name, hash); + audit_log_untrustedstring(ab, algo_hash); audit_log_task_info(ab, current); audit_log_end(ab); diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 2d4becab891..46353ee517f 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -15,6 +15,7 @@ #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> +#include <crypto/hash_info.h> #include "ima.h" @@ -43,19 +44,31 @@ int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func) } static int ima_fix_xattr(struct dentry *dentry, - struct integrity_iint_cache *iint) + struct integrity_iint_cache *iint) { - iint->ima_xattr.type = IMA_XATTR_DIGEST; - return __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA, - (u8 *)&iint->ima_xattr, - sizeof(iint->ima_xattr), 0); + int rc, offset; + u8 algo = iint->ima_hash->algo; + + if (algo <= HASH_ALGO_SHA1) { + offset = 1; + iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; + } else { + offset = 0; + iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; + iint->ima_hash->xattr.ng.algo = algo; + } + rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA, + &iint->ima_hash->xattr.data[offset], + (sizeof(iint->ima_hash->xattr) - offset) + + iint->ima_hash->length, 0); + return rc; } /* Return specific func appraised cached result */ enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, int func) { - switch(func) { + switch (func) { case MMAP_CHECK: return iint->ima_mmap_status; case BPRM_CHECK: @@ -71,7 +84,7 @@ enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint, static void ima_set_cache_status(struct integrity_iint_cache *iint, int func, enum integrity_status status) { - switch(func) { + switch (func) { case MMAP_CHECK: iint->ima_mmap_status = status; break; @@ -90,7 +103,7 @@ static void ima_set_cache_status(struct integrity_iint_cache *iint, static void ima_cache_flags(struct integrity_iint_cache *iint, int func) { - switch(func) { + switch (func) { case MMAP_CHECK: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; @@ -107,6 +120,50 @@ static void ima_cache_flags(struct integrity_iint_cache *iint, int func) } } +void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_digest_data *hash) +{ + struct signature_v2_hdr 
*sig; + + if (!xattr_value || xattr_len < 2) + return; + + switch (xattr_value->type) { + case EVM_IMA_XATTR_DIGSIG: + sig = (typeof(sig))xattr_value; + if (sig->version != 2 || xattr_len <= sizeof(*sig)) + return; + hash->algo = sig->hash_algo; + break; + case IMA_XATTR_DIGEST_NG: + hash->algo = xattr_value->digest[0]; + break; + case IMA_XATTR_DIGEST: + /* this is for backward compatibility */ + if (xattr_len == 21) { + unsigned int zero = 0; + if (!memcmp(&xattr_value->digest[16], &zero, 4)) + hash->algo = HASH_ALGO_MD5; + else + hash->algo = HASH_ALGO_SHA1; + } else if (xattr_len == 17) + hash->algo = HASH_ALGO_MD5; + break; + } +} + +int ima_read_xattr(struct dentry *dentry, + struct evm_ima_xattr_data **xattr_value) +{ + struct inode *inode = dentry->d_inode; + + if (!inode->i_op->getxattr) + return 0; + + return vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value, + 0, GFP_NOFS); +} + /* * ima_appraise_measurement - appraise file measurement * @@ -116,23 +173,22 @@ static void ima_cache_flags(struct integrity_iint_cache *iint, int func) * Return 0 on success, error code otherwise */ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, - struct file *file, const unsigned char *filename) + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len) { struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; - struct evm_ima_xattr_data *xattr_value = NULL; enum integrity_status status = INTEGRITY_UNKNOWN; const char *op = "appraise_data"; char *cause = "unknown"; - int rc; + int rc = xattr_len, hash_start = 0; if (!ima_appraise) return 0; if (!inode->i_op->getxattr) return INTEGRITY_UNKNOWN; - rc = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)&xattr_value, - 0, GFP_NOFS); if (rc <= 0) { if (rc && rc != -ENODATA) goto out; @@ -153,14 +209,25 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, goto out; } switch (xattr_value->type) { + case IMA_XATTR_DIGEST_NG: + /* first byte contains algorithm id */ + hash_start = 1; case IMA_XATTR_DIGEST: if (iint->flags & IMA_DIGSIG_REQUIRED) { cause = "IMA signature required"; status = INTEGRITY_FAIL; break; } - rc = memcmp(xattr_value->digest, iint->ima_xattr.digest, - IMA_DIGEST_SIZE); + if (xattr_len - sizeof(xattr_value->type) - hash_start >= + iint->ima_hash->length) + /* xattr length may be longer. 
md5 hash in previous + version occupied 20 bytes in xattr, instead of 16 + */ + rc = memcmp(&xattr_value->digest[hash_start], + iint->ima_hash->digest, + iint->ima_hash->length); + else + rc = -EINVAL; if (rc) { cause = "invalid-hash"; status = INTEGRITY_FAIL; @@ -171,9 +238,9 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, case EVM_IMA_XATTR_DIGSIG: iint->flags |= IMA_DIGSIG; rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, - xattr_value->digest, rc - 1, - iint->ima_xattr.digest, - IMA_DIGEST_SIZE); + (const char *)xattr_value, rc, + iint->ima_hash->digest, + iint->ima_hash->length); if (rc == -EOPNOTSUPP) { status = INTEGRITY_UNKNOWN; } else if (rc) { @@ -203,7 +270,6 @@ out: ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); - kfree(xattr_value); return status; } @@ -219,7 +285,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) if (iint->flags & IMA_DIGSIG) return; - rc = ima_collect_measurement(iint, file); + rc = ima_collect_measurement(iint, file, NULL, NULL); if (rc < 0) return; @@ -315,3 +381,14 @@ int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name) } return result; } + +#ifdef CONFIG_IMA_TRUSTED_KEYRING +static int __init init_ima_keyring(void) +{ + int ret; + + ret = integrity_init_keyring(INTEGRITY_KEYRING_IMA); + return 0; +} +late_initcall(init_ima_keyring); +#endif diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index a02e0791cf1..676e0292dfe 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -20,6 +20,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <crypto/hash.h> +#include <crypto/hash_info.h> #include "ima.h" static struct crypto_shash *ima_shash_tfm; @@ -28,31 +29,58 @@ int ima_init_crypto(void) { long rc; - ima_shash_tfm = crypto_alloc_shash(ima_hash, 0, 0); + ima_shash_tfm = crypto_alloc_shash(hash_algo_name[ima_hash_algo], 0, 0); if (IS_ERR(ima_shash_tfm)) { rc = PTR_ERR(ima_shash_tfm); - pr_err("Can not allocate %s (reason: %ld)\n", ima_hash, rc); + pr_err("Can not allocate %s (reason: %ld)\n", + hash_algo_name[ima_hash_algo], rc); return rc; } return 0; } +static struct crypto_shash *ima_alloc_tfm(enum hash_algo algo) +{ + struct crypto_shash *tfm = ima_shash_tfm; + int rc; + + if (algo != ima_hash_algo && algo < HASH_ALGO__LAST) { + tfm = crypto_alloc_shash(hash_algo_name[algo], 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + pr_err("Can not allocate %s (reason: %d)\n", + hash_algo_name[algo], rc); + } + } + return tfm; +} + +static void ima_free_tfm(struct crypto_shash *tfm) +{ + if (tfm != ima_shash_tfm) + crypto_free_shash(tfm); +} + /* * Calculate the MD5/SHA1 file digest */ -int ima_calc_file_hash(struct file *file, char *digest) +static int ima_calc_file_hash_tfm(struct file *file, + struct ima_digest_data *hash, + struct crypto_shash *tfm) { loff_t i_size, offset = 0; char *rbuf; int rc, read = 0; struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(ima_shash_tfm)]; + char ctx[crypto_shash_descsize(tfm)]; } desc; - desc.shash.tfm = ima_shash_tfm; + desc.shash.tfm = tfm; desc.shash.flags = 0; + hash->length = crypto_shash_digestsize(tfm); + rc = crypto_shash_init(&desc.shash); if (rc != 0) return rc; @@ -85,27 +113,83 @@ int ima_calc_file_hash(struct file *file, char *digest) } kfree(rbuf); if (!rc) - rc = crypto_shash_final(&desc.shash, digest); + rc = crypto_shash_final(&desc.shash, hash->digest); if (read) file->f_mode &= ~FMODE_READ; out: 
return rc; } +int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) +{ + struct crypto_shash *tfm; + int rc; + + tfm = ima_alloc_tfm(hash->algo); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + rc = ima_calc_file_hash_tfm(file, hash, tfm); + + ima_free_tfm(tfm); + + return rc; +} + /* - * Calculate the hash of a given buffer + * Calculate the hash of template data */ -int ima_calc_buffer_hash(const void *data, int len, char *digest) +static int ima_calc_field_array_hash_tfm(struct ima_field_data *field_data, + int num_fields, + struct ima_digest_data *hash, + struct crypto_shash *tfm) { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(ima_shash_tfm)]; + char ctx[crypto_shash_descsize(tfm)]; } desc; + int rc, i; - desc.shash.tfm = ima_shash_tfm; + desc.shash.tfm = tfm; desc.shash.flags = 0; - return crypto_shash_digest(&desc.shash, data, len, digest); + hash->length = crypto_shash_digestsize(tfm); + + rc = crypto_shash_init(&desc.shash); + if (rc != 0) + return rc; + + for (i = 0; i < num_fields; i++) { + rc = crypto_shash_update(&desc.shash, + (const u8 *) &field_data[i].len, + sizeof(field_data[i].len)); + rc = crypto_shash_update(&desc.shash, field_data[i].data, + field_data[i].len); + if (rc) + break; + } + + if (!rc) + rc = crypto_shash_final(&desc.shash, hash->digest); + + return rc; +} + +int ima_calc_field_array_hash(struct ima_field_data *field_data, int num_fields, + struct ima_digest_data *hash) +{ + struct crypto_shash *tfm; + int rc; + + tfm = ima_alloc_tfm(hash->algo); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + rc = ima_calc_field_array_hash_tfm(field_data, num_fields, hash, tfm); + + ima_free_tfm(tfm); + + return rc; } static void __init ima_pcrread(int idx, u8 *pcr) @@ -120,16 +204,17 @@ static void __init ima_pcrread(int idx, u8 *pcr) /* * Calculate the boot aggregate hash */ -int __init ima_calc_boot_aggregate(char *digest) +static int __init ima_calc_boot_aggregate_tfm(char *digest, + struct crypto_shash *tfm) { - u8 pcr_i[IMA_DIGEST_SIZE]; + u8 pcr_i[TPM_DIGEST_SIZE]; int rc, i; struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(ima_shash_tfm)]; + char ctx[crypto_shash_descsize(tfm)]; } desc; - desc.shash.tfm = ima_shash_tfm; + desc.shash.tfm = tfm; desc.shash.flags = 0; rc = crypto_shash_init(&desc.shash); @@ -140,9 +225,26 @@ int __init ima_calc_boot_aggregate(char *digest) for (i = TPM_PCR0; i < TPM_PCR8; i++) { ima_pcrread(i, pcr_i); /* now accumulate with current aggregate */ - rc = crypto_shash_update(&desc.shash, pcr_i, IMA_DIGEST_SIZE); + rc = crypto_shash_update(&desc.shash, pcr_i, TPM_DIGEST_SIZE); } if (!rc) crypto_shash_final(&desc.shash, digest); return rc; } + +int __init ima_calc_boot_aggregate(struct ima_digest_data *hash) +{ + struct crypto_shash *tfm; + int rc; + + tfm = ima_alloc_tfm(hash->algo); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + hash->length = crypto_shash_digestsize(tfm); + rc = ima_calc_boot_aggregate_tfm(hash->digest, tfm); + + ima_free_tfm(tfm); + + return rc; +} diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 38477c9c341..d47a7c86a21 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -88,8 +88,7 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos) * against concurrent list-extension */ rcu_read_lock(); - qe = list_entry_rcu(qe->later.next, - struct ima_queue_entry, later); + qe = list_entry_rcu(qe->later.next, struct ima_queue_entry, later); rcu_read_unlock(); (*pos)++; @@ -100,7 +99,7 
@@ static void ima_measurements_stop(struct seq_file *m, void *v) { } -static void ima_putc(struct seq_file *m, void *data, int datalen) +void ima_putc(struct seq_file *m, void *data, int datalen) { while (datalen--) seq_putc(m, *(char *)data++); @@ -111,6 +110,7 @@ static void ima_putc(struct seq_file *m, void *data, int datalen) * char[20]=template digest * 32bit-le=template name size * char[n]=template name + * [eventdata length] * eventdata[n]=template specific data */ static int ima_measurements_show(struct seq_file *m, void *v) @@ -120,6 +120,7 @@ static int ima_measurements_show(struct seq_file *m, void *v) struct ima_template_entry *e; int namelen; u32 pcr = CONFIG_IMA_MEASURE_PCR_IDX; + int i; /* get entry */ e = qe->entry; @@ -134,18 +135,25 @@ static int ima_measurements_show(struct seq_file *m, void *v) ima_putc(m, &pcr, sizeof pcr); /* 2nd: template digest */ - ima_putc(m, e->digest, IMA_DIGEST_SIZE); + ima_putc(m, e->digest, TPM_DIGEST_SIZE); /* 3rd: template name size */ - namelen = strlen(e->template_name); + namelen = strlen(e->template_desc->name); ima_putc(m, &namelen, sizeof namelen); /* 4th: template name */ - ima_putc(m, (void *)e->template_name, namelen); + ima_putc(m, e->template_desc->name, namelen); + + /* 5th: template length (except for 'ima' template) */ + if (strcmp(e->template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) + ima_putc(m, &e->template_data_len, + sizeof(e->template_data_len)); - /* 5th: template specific data */ - ima_template_show(m, (struct ima_template_data *)&e->template, - IMA_SHOW_BINARY); + /* 6th: template specific data */ + for (i = 0; i < e->template_desc->num_fields; i++) { + e->template_desc->fields[i]->field_show(m, IMA_SHOW_BINARY, + &e->template_data[i]); + } return 0; } @@ -168,41 +176,21 @@ static const struct file_operations ima_measurements_ops = { .release = seq_release, }; -static void ima_print_digest(struct seq_file *m, u8 *digest) +void ima_print_digest(struct seq_file *m, u8 *digest, int size) { int i; - for (i = 0; i < IMA_DIGEST_SIZE; i++) + for (i = 0; i < size; i++) seq_printf(m, "%02x", *(digest + i)); } -void ima_template_show(struct seq_file *m, void *e, enum ima_show_type show) -{ - struct ima_template_data *entry = e; - int namelen; - - switch (show) { - case IMA_SHOW_ASCII: - ima_print_digest(m, entry->digest); - seq_printf(m, " %s\n", entry->file_name); - break; - case IMA_SHOW_BINARY: - ima_putc(m, entry->digest, IMA_DIGEST_SIZE); - - namelen = strlen(entry->file_name); - ima_putc(m, &namelen, sizeof namelen); - ima_putc(m, entry->file_name, namelen); - default: - break; - } -} - /* print in ascii */ static int ima_ascii_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; struct ima_template_entry *e; + int i; /* get entry */ e = qe->entry; @@ -213,14 +201,21 @@ static int ima_ascii_measurements_show(struct seq_file *m, void *v) seq_printf(m, "%2d ", CONFIG_IMA_MEASURE_PCR_IDX); /* 2nd: SHA1 template hash */ - ima_print_digest(m, e->digest); + ima_print_digest(m, e->digest, TPM_DIGEST_SIZE); /* 3rd: template name */ - seq_printf(m, " %s ", e->template_name); + seq_printf(m, " %s", e->template_desc->name); /* 4th: template specific data */ - ima_template_show(m, (struct ima_template_data *)&e->template, - IMA_SHOW_ASCII); + for (i = 0; i < e->template_desc->num_fields; i++) { + seq_puts(m, " "); + if (e->template_data[i].len == 0) + continue; + + e->template_desc->fields[i]->field_show(m, IMA_SHOW_ASCII, + &e->template_data[i]);
+ } + seq_puts(m, "\n"); return 0; } diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index 162ea723db3..15f34bd40ab 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -18,6 +18,7 @@ #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/err.h> +#include <crypto/hash_info.h> #include "ima.h" /* name for boot aggregate entry */ @@ -42,28 +43,38 @@ int ima_used_chip; static void __init ima_add_boot_aggregate(void) { struct ima_template_entry *entry; + struct integrity_iint_cache tmp_iint, *iint = &tmp_iint; const char *op = "add_boot_aggregate"; const char *audit_cause = "ENOMEM"; int result = -ENOMEM; - int violation = 1; + int violation = 0; + struct { + struct ima_digest_data hdr; + char digest[TPM_DIGEST_SIZE]; + } hash; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto err_out; + memset(iint, 0, sizeof(*iint)); + memset(&hash, 0, sizeof(hash)); + iint->ima_hash = &hash.hdr; + iint->ima_hash->algo = HASH_ALGO_SHA1; + iint->ima_hash->length = SHA1_DIGEST_SIZE; - memset(&entry->template, 0, sizeof(entry->template)); - strncpy(entry->template.file_name, boot_aggregate_name, - IMA_EVENT_NAME_LEN_MAX); if (ima_used_chip) { - violation = 0; - result = ima_calc_boot_aggregate(entry->template.digest); + result = ima_calc_boot_aggregate(&hash.hdr); if (result < 0) { audit_cause = "hashing_error"; kfree(entry); goto err_out; } } - result = ima_store_template(entry, violation, NULL); + + result = ima_alloc_init_template(iint, NULL, boot_aggregate_name, + NULL, 0, &entry); + if (result < 0) + return; + + result = ima_store_template(entry, violation, NULL, + boot_aggregate_name); if (result < 0) kfree(entry); return; @@ -74,7 +85,7 @@ err_out: int __init ima_init(void) { - u8 pcr_i[IMA_DIGEST_SIZE]; + u8 pcr_i[TPM_DIGEST_SIZE]; int rc; ima_used_chip = 0; @@ -88,6 +99,10 @@ int __init ima_init(void) rc = ima_init_crypto(); if (rc) return rc; + rc = ima_init_template(); + if (rc != 0) + return rc; + ima_add_boot_aggregate(); /* boot aggregate must be first entry */ ima_init_policy(); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index e9508d5bbfc..149ee1119f8 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/xattr.h> #include <linux/ima.h> +#include <crypto/hash_info.h> #include "ima.h" @@ -35,11 +36,33 @@ int ima_appraise = IMA_APPRAISE_ENFORCE; int ima_appraise; #endif -char *ima_hash = "sha1"; +int ima_hash_algo = HASH_ALGO_SHA1; +static int hash_setup_done; + static int __init hash_setup(char *str) { - if (strncmp(str, "md5", 3) == 0) - ima_hash = "md5"; + struct ima_template_desc *template_desc = ima_template_desc_current(); + int i; + + if (hash_setup_done) + return 1; + + if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) { + if (strncmp(str, "sha1", 4) == 0) + ima_hash_algo = HASH_ALGO_SHA1; + else if (strncmp(str, "md5", 3) == 0) + ima_hash_algo = HASH_ALGO_MD5; + goto out; + } + + for (i = 0; i < HASH_ALGO__LAST; i++) { + if (strcmp(str, hash_algo_name[i]) == 0) { + ima_hash_algo = i; + break; + } + } +out: + hash_setup_done = 1; return 1; } __setup("ima_hash=", hash_setup); @@ -92,10 +115,9 @@ out: pathname = dentry->d_name.name; if (send_tomtou) - ima_add_violation(inode, pathname, - "invalid_pcr", "ToMToU"); + ima_add_violation(file, pathname, "invalid_pcr", "ToMToU"); if (send_writers) - ima_add_violation(inode, pathname, + 
ima_add_violation(file, pathname, "invalid_pcr", "open_writers"); kfree(pathbuf); } @@ -144,9 +166,12 @@ static int process_measurement(struct file *file, const char *filename, { struct inode *inode = file_inode(file); struct integrity_iint_cache *iint; + struct ima_template_desc *template_desc = ima_template_desc_current(); char *pathbuf = NULL; const char *pathname = NULL; int rc = -ENOMEM, action, must_appraise, _func; + struct evm_ima_xattr_data *xattr_value = NULL, **xattr_ptr = NULL; + int xattr_len = 0; if (!ima_initialized || !S_ISREG(inode->i_mode)) return 0; @@ -185,7 +210,13 @@ static int process_measurement(struct file *file, const char *filename, goto out_digsig; } - rc = ima_collect_measurement(iint, file); + if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) { + if (action & IMA_APPRAISE_SUBMASK) + xattr_ptr = &xattr_value; + } else + xattr_ptr = &xattr_value; + + rc = ima_collect_measurement(iint, file, xattr_ptr, &xattr_len); if (rc != 0) goto out_digsig; @@ -194,9 +225,11 @@ static int process_measurement(struct file *file, const char *filename, pathname = (const char *)file->f_dentry->d_name.name; if (action & IMA_MEASURE) - ima_store_measurement(iint, file, pathname); + ima_store_measurement(iint, file, pathname, + xattr_value, xattr_len); if (action & IMA_APPRAISE_SUBMASK) - rc = ima_appraise_measurement(_func, iint, file, pathname); + rc = ima_appraise_measurement(_func, iint, file, pathname, + xattr_value, xattr_len); if (action & IMA_AUDIT) ima_audit_measurement(iint, pathname); kfree(pathbuf); @@ -205,6 +238,7 @@ out_digsig: rc = -EACCES; out: mutex_unlock(&inode->i_mutex); + kfree(xattr_value); if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; return 0; @@ -244,9 +278,9 @@ int ima_file_mmap(struct file *file, unsigned long prot) int ima_bprm_check(struct linux_binprm *bprm) { return process_measurement(bprm->file, - (strcmp(bprm->filename, bprm->interp) == 0) ? - bprm->filename : bprm->interp, - MAY_EXEC, BPRM_CHECK); + (strcmp(bprm->filename, bprm->interp) == 0) ? 
+ bprm->filename : bprm->interp, + MAY_EXEC, BPRM_CHECK); } /** @@ -263,8 +297,8 @@ int ima_file_check(struct file *file, int mask) { ima_rdwr_violation_check(file); return process_measurement(file, NULL, - mask & (MAY_READ | MAY_WRITE | MAY_EXEC), - FILE_CHECK); + mask & (MAY_READ | MAY_WRITE | MAY_EXEC), + FILE_CHECK); } EXPORT_SYMBOL_GPL(ima_file_check); @@ -294,6 +328,7 @@ static int __init init_ima(void) { int error; + hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); if (!error) ima_initialized = 1; diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 399433ad614..a9c3d3cd199 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -73,7 +73,6 @@ static struct ima_rule_entry default_rules[] = { {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC}, - {.action = DONT_MEASURE,.fsmagic = RAMFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = DEVPTS_SUPER_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = BINFMTFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC}, diff --git a/security/integrity/ima/ima_queue.c b/security/integrity/ima/ima_queue.c index ff63fe00c19..d85e99761f4 100644 --- a/security/integrity/ima/ima_queue.c +++ b/security/integrity/ima/ima_queue.c @@ -50,7 +50,7 @@ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value) key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { - rc = memcmp(qe->entry->digest, digest_value, IMA_DIGEST_SIZE); + rc = memcmp(qe->entry->digest, digest_value, TPM_DIGEST_SIZE); if (rc == 0) { ret = qe; break; @@ -104,9 +104,10 @@ static int ima_pcr_extend(const u8 *hash) * and extend the pcr. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, - const char *op, struct inode *inode) + const char *op, struct inode *inode, + const unsigned char *filename) { - u8 digest[IMA_DIGEST_SIZE]; + u8 digest[TPM_DIGEST_SIZE]; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; @@ -141,8 +142,7 @@ int ima_add_template_entry(struct ima_template_entry *entry, int violation, } out: mutex_unlock(&ima_extend_list_mutex); - integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, - entry->template.file_name, + integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c new file mode 100644 index 00000000000..4e5da990630 --- /dev/null +++ b/security/integrity/ima/ima_template.c @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2013 Politecnico di Torino, Italy + * TORSEC group -- http://security.polito.it + * + * Author: Roberto Sassu <roberto.sassu@polito.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_template.c + * Helpers to manage template descriptors. 
+ */ +#include <crypto/hash_info.h> + +#include "ima.h" +#include "ima_template_lib.h" + +static struct ima_template_desc defined_templates[] = { + {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, + {.name = "ima-ng",.fmt = "d-ng|n-ng"}, + {.name = "ima-sig",.fmt = "d-ng|n-ng|sig"}, +}; + +static struct ima_template_field supported_fields[] = { + {.field_id = "d",.field_init = ima_eventdigest_init, + .field_show = ima_show_template_digest}, + {.field_id = "n",.field_init = ima_eventname_init, + .field_show = ima_show_template_string}, + {.field_id = "d-ng",.field_init = ima_eventdigest_ng_init, + .field_show = ima_show_template_digest_ng}, + {.field_id = "n-ng",.field_init = ima_eventname_ng_init, + .field_show = ima_show_template_string}, + {.field_id = "sig",.field_init = ima_eventsig_init, + .field_show = ima_show_template_sig}, +}; + +static struct ima_template_desc *ima_template; +static struct ima_template_desc *lookup_template_desc(const char *name); + +static int __init ima_template_setup(char *str) +{ + struct ima_template_desc *template_desc; + int template_len = strlen(str); + + /* + * Verify that a template with the supplied name exists. + * If not, use CONFIG_IMA_DEFAULT_TEMPLATE. + */ + template_desc = lookup_template_desc(str); + if (!template_desc) + return 1; + + /* + * Verify whether the current hash algorithm is supported + * by the 'ima' template. + */ + if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && + ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { + pr_err("IMA: template does not support hash alg\n"); + return 1; + } + + ima_template = template_desc; + return 1; +} +__setup("ima_template=", ima_template_setup); + +static struct ima_template_desc *lookup_template_desc(const char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(defined_templates); i++) { + if (strcmp(defined_templates[i].name, name) == 0) + return defined_templates + i; + } + + return NULL; +} + +static struct ima_template_field *lookup_template_field(const char *field_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(supported_fields); i++) + if (strncmp(supported_fields[i].field_id, field_id, + IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) + return &supported_fields[i]; + return NULL; +} + +static int template_fmt_size(char *template_fmt) +{ + char c; + int template_fmt_len = strlen(template_fmt); + int i = 0, j = 0; + + while (i < template_fmt_len) { + c = template_fmt[i]; + if (c == '|') + j++; + i++; + } + + return j + 1; +} + +static int template_desc_init_fields(char *template_fmt, + struct ima_template_field ***fields, + int *num_fields) +{ + char *c, *template_fmt_ptr = template_fmt; + int template_num_fields = template_fmt_size(template_fmt); + int i, result = 0; + + if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) + return -EINVAL; + + *fields = kzalloc(template_num_fields * sizeof(*fields), GFP_KERNEL); + if (*fields == NULL) { + result = -ENOMEM; + goto out; + } + for (i = 0; (c = strsep(&template_fmt_ptr, "|")) != NULL && + i < template_num_fields; i++) { + struct ima_template_field *f = lookup_template_field(c); + + if (!f) { + result = -ENOENT; + goto out; + } + (*fields)[i] = f; + } + *num_fields = i; + return 0; +out: + kfree(*fields); + *fields = NULL; + return result; +} + +static int init_defined_templates(void) +{ + int i = 0; + int result = 0; + + /* Init defined templates. 
*/ + for (i = 0; i < ARRAY_SIZE(defined_templates); i++) { + struct ima_template_desc *template = &defined_templates[i]; + + result = template_desc_init_fields(template->fmt, + &(template->fields), + &(template->num_fields)); + if (result < 0) + return result; + } + return result; +} + +struct ima_template_desc *ima_template_desc_current(void) +{ + if (!ima_template) + ima_template = + lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); + return ima_template; +} + +int ima_init_template(void) +{ + int result; + + result = init_defined_templates(); + if (result < 0) + return result; + + return 0; +} diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c new file mode 100644 index 00000000000..6d66ad6ed26 --- /dev/null +++ b/security/integrity/ima/ima_template_lib.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2013 Politecnico di Torino, Italy + * TORSEC group -- http://security.polito.it + * + * Author: Roberto Sassu <roberto.sassu@polito.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_template_lib.c + * Library of supported template fields. + */ +#include <crypto/hash_info.h> + +#include "ima_template_lib.h" + +static bool ima_template_hash_algo_allowed(u8 algo) +{ + if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) + return true; + + return false; +} + +enum data_formats { + DATA_FMT_DIGEST = 0, + DATA_FMT_DIGEST_WITH_ALGO, + DATA_FMT_EVENT_NAME, + DATA_FMT_STRING, + DATA_FMT_HEX +}; + +static int ima_write_template_field_data(const void *data, const u32 datalen, + enum data_formats datafmt, + struct ima_field_data *field_data) +{ + u8 *buf, *buf_ptr; + u32 buflen; + + switch (datafmt) { + case DATA_FMT_EVENT_NAME: + buflen = IMA_EVENT_NAME_LEN_MAX + 1; + break; + case DATA_FMT_STRING: + buflen = datalen + 1; + break; + default: + buflen = datalen; + } + + buf = kzalloc(buflen, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memcpy(buf, data, datalen); + + /* + * Replace all space characters with underscore for event names and + * strings. This avoids filenames with spaces, or ones that end with + * the suffix ' (deleted)', being split into multiple template fields + * when a measurements list is parsed (the space is the delimiter + * character for measurements lists in ASCII format).
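+ * For example, the file name "foo bar (deleted)" is recorded as
+ * "foo_bar_(deleted)" and so stays a single field.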
+ */ + if (datafmt == DATA_FMT_EVENT_NAME || datafmt == DATA_FMT_STRING) { + for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) + if (*buf_ptr == ' ') + *buf_ptr = '_'; + } + + field_data->data = buf; + field_data->len = buflen; + return 0; +} + +static void ima_show_template_data_ascii(struct seq_file *m, + enum ima_show_type show, + enum data_formats datafmt, + struct ima_field_data *field_data) +{ + u8 *buf_ptr = field_data->data; + u32 buflen = field_data->len; + + switch (datafmt) { + case DATA_FMT_DIGEST_WITH_ALGO: + buf_ptr = strnchr(field_data->data, buflen, ':'); + if (buf_ptr != field_data->data) + seq_printf(m, "%s", field_data->data); + + /* skip ':' and '\0' */ + buf_ptr += 2; + buflen -= buf_ptr - field_data->data; + /* fall through */ + case DATA_FMT_DIGEST: + case DATA_FMT_HEX: + if (!buflen) + break; + ima_print_digest(m, buf_ptr, buflen); + break; + case DATA_FMT_STRING: + seq_printf(m, "%s", buf_ptr); + break; + default: + break; + } +} + +static void ima_show_template_data_binary(struct seq_file *m, + enum ima_show_type show, + enum data_formats datafmt, + struct ima_field_data *field_data) +{ + ima_putc(m, &field_data->len, sizeof(u32)); + if (!field_data->len) + return; + ima_putc(m, field_data->data, field_data->len); +} + +static void ima_show_template_field_data(struct seq_file *m, + enum ima_show_type show, + enum data_formats datafmt, + struct ima_field_data *field_data) +{ + switch (show) { + case IMA_SHOW_ASCII: + ima_show_template_data_ascii(m, show, datafmt, field_data); + break; + case IMA_SHOW_BINARY: + ima_show_template_data_binary(m, show, datafmt, field_data); + break; + default: + break; + } +} + +void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data) +{ + ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); +} + +void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data) +{ + ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, + field_data); +} + +void ima_show_template_string(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data) +{ + ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); +} + +void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data) +{ + ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); +} + +static int ima_eventdigest_init_common(u8 *digest, u32 digestsize, u8 hash_algo, + struct ima_field_data *field_data, + bool size_limit) +{ + /* + * digest formats: + * - DATA_FMT_DIGEST: digest + * - DATA_FMT_DIGEST_WITH_ALGO: [<hash algo>] + ':' + '\0' + digest, + * where <hash algo> is provided if the hash algorithm is not + * SHA1 or MD5 + */ + u8 buffer[CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; + enum data_formats fmt = DATA_FMT_DIGEST; + u32 offset = 0; + + if (!size_limit) { + fmt = DATA_FMT_DIGEST_WITH_ALGO; + if (hash_algo < HASH_ALGO__LAST) + offset += snprintf(buffer, CRYPTO_MAX_ALG_NAME + 1, + "%s", hash_algo_name[hash_algo]); + buffer[offset] = ':'; + offset += 2; + } + + if (digest) + memcpy(buffer + offset, digest, digestsize); + else + /* + * If digest is NULL, the event being recorded is a violation. + * Make room for the digest by increasing the offset by + * IMA_DIGEST_SIZE. + */ + offset += IMA_DIGEST_SIZE; + + return ima_write_template_field_data(buffer, offset + digestsize, + fmt, field_data); +} + +/* + * This function writes the digest of an event (with size limit).
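+ * It backs the 'd' field of the original 'ima' template, so only SHA1 and
+ * MD5 digests can be copied out directly; for any other algorithm the file
+ * digest is recalculated (falling back to SHA1 if the configured algorithm
+ * is itself incompatible).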
+ */ +int ima_eventdigest_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data) +{ + struct { + struct ima_digest_data hdr; + char digest[IMA_MAX_DIGEST_SIZE]; + } hash; + u8 *cur_digest = NULL; + u32 cur_digestsize = 0; + struct inode *inode; + int result; + + memset(&hash, 0, sizeof(hash)); + + if (!iint) /* recording a violation. */ + goto out; + + if (ima_template_hash_algo_allowed(iint->ima_hash->algo)) { + cur_digest = iint->ima_hash->digest; + cur_digestsize = iint->ima_hash->length; + goto out; + } + + if (!file) /* missing info to re-calculate the digest */ + return -EINVAL; + + inode = file_inode(file); + hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? + ima_hash_algo : HASH_ALGO_SHA1; + result = ima_calc_file_hash(file, &hash.hdr); + if (result) { + integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, + filename, "collect_data", + "failed", result, 0); + return result; + } + cur_digest = hash.hdr.digest; + cur_digestsize = hash.hdr.length; +out: + return ima_eventdigest_init_common(cur_digest, cur_digestsize, -1, + field_data, true); +} + +/* + * This function writes the digest of an event (without size limit). + */ +int ima_eventdigest_ng_init(struct integrity_iint_cache *iint, + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len, struct ima_field_data *field_data) +{ + u8 *cur_digest = NULL, hash_algo = HASH_ALGO__LAST; + u32 cur_digestsize = 0; + + /* If iint is NULL, we are recording a violation. */ + if (!iint) + goto out; + + cur_digest = iint->ima_hash->digest; + cur_digestsize = iint->ima_hash->length; + + hash_algo = iint->ima_hash->algo; +out: + return ima_eventdigest_init_common(cur_digest, cur_digestsize, + hash_algo, field_data, false); +} + +static int ima_eventname_init_common(struct integrity_iint_cache *iint, + struct file *file, + const unsigned char *filename, + struct ima_field_data *field_data, + bool size_limit) +{ + const char *cur_filename = NULL; + u32 cur_filename_len = 0; + enum data_formats fmt = size_limit ? + DATA_FMT_EVENT_NAME : DATA_FMT_STRING; + + BUG_ON(filename == NULL && file == NULL); + + if (filename) { + cur_filename = filename; + cur_filename_len = strlen(filename); + + if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) + goto out; + } + + if (file) { + cur_filename = file->f_dentry->d_name.name; + cur_filename_len = strlen(cur_filename); + } else + /* + * Truncate filename if the latter is too long and + * the file descriptor is not available. + */ + cur_filename_len = IMA_EVENT_NAME_LEN_MAX; +out: + return ima_write_template_field_data(cur_filename, cur_filename_len, + fmt, field_data); +} + +/* + * This function writes the name of an event (with size limit). + */ +int ima_eventname_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data) +{ + return ima_eventname_init_common(iint, file, filename, + field_data, true); +} + +/* + * This function writes the name of an event (without size limit). 
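+ * It backs the 'n-ng' field, so unlike the original 'n' field the name is
+ * not truncated to IMA_EVENT_NAME_LEN_MAX.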
+ */ +int ima_eventname_ng_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data) +{ + return ima_eventname_init_common(iint, file, filename, + field_data, false); +} + +/* + * ima_eventsig_init - include the file signature as part of the template data + */ +int ima_eventsig_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data) +{ + enum data_formats fmt = DATA_FMT_HEX; + int rc = 0; + + if ((!xattr_value) || (xattr_value->type != EVM_IMA_XATTR_DIGSIG)) + goto out; + + rc = ima_write_template_field_data(xattr_value, xattr_len, fmt, + field_data); +out: + return rc; +} diff --git a/security/integrity/ima/ima_template_lib.h b/security/integrity/ima/ima_template_lib.h new file mode 100644 index 00000000000..63f6b52cb1c --- /dev/null +++ b/security/integrity/ima/ima_template_lib.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2013 Politecnico di Torino, Italy + * TORSEC group -- http://security.polito.it + * + * Author: Roberto Sassu <roberto.sassu@polito.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * File: ima_template_lib.h + * Header for the library of supported template fields. + */ +#ifndef __LINUX_IMA_TEMPLATE_LIB_H +#define __LINUX_IMA_TEMPLATE_LIB_H + +#include <linux/seq_file.h> +#include "ima.h" + +void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data); +void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data); +void ima_show_template_string(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data); +void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, + struct ima_field_data *field_data); +int ima_eventdigest_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data); +int ima_eventname_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data); +int ima_eventdigest_ng_init(struct integrity_iint_cache *iint, + struct file *file, const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, + int xattr_len, struct ima_field_data *field_data); +int ima_eventname_ng_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data); +int ima_eventsig_init(struct integrity_iint_cache *iint, struct file *file, + const unsigned char *filename, + struct evm_ima_xattr_data *xattr_value, int xattr_len, + struct ima_field_data *field_data); +#endif /* __LINUX_IMA_TEMPLATE_LIB_H */ diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index c42fb7a70de..b9e7c133734 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -54,25 +54,57 @@ enum evm_ima_xattr_type { IMA_XATTR_DIGEST = 0x01, EVM_XATTR_HMAC, EVM_IMA_XATTR_DIGSIG, + IMA_XATTR_DIGEST_NG, }; struct evm_ima_xattr_data { u8 type; 
u8 digest[SHA1_DIGEST_SIZE]; -} __attribute__((packed)); +} __packed; + +#define IMA_MAX_DIGEST_SIZE 64 + +struct ima_digest_data { + u8 algo; + u8 length; + union { + struct { + u8 unused; + u8 type; + } sha1; + struct { + u8 type; + u8 algo; + } ng; + u8 data[2]; + } xattr; + u8 digest[0]; +} __packed; + +/* + * signature format v2 - for use with asymmetric keys + */ +struct signature_v2_hdr { + uint8_t type; /* xattr type */ + uint8_t version; /* signature format version */ + uint8_t hash_algo; /* Digest algorithm [enum pkey_hash_algo] */ + uint32_t keyid; /* IMA key identifier - not X509/PGP specific */ + uint16_t sig_size; /* signature size */ + uint8_t sig[0]; /* signature payload */ +} __packed; /* integrity data associated with an inode */ struct integrity_iint_cache { - struct rb_node rb_node; /* rooted in integrity_iint_tree */ + struct rb_node rb_node; /* rooted in integrity_iint_tree */ struct inode *inode; /* back pointer to inode in question */ u64 version; /* track inode changes */ unsigned long flags; - struct evm_ima_xattr_data ima_xattr; enum integrity_status ima_file_status:4; enum integrity_status ima_mmap_status:4; enum integrity_status ima_bprm_status:4; enum integrity_status ima_module_status:4; enum integrity_status evm_status:4; + struct ima_digest_data *ima_hash; }; /* rbtree tree calls to lookup, insert, delete @@ -89,7 +121,7 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode); #ifdef CONFIG_INTEGRITY_SIGNATURE int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, - const char *digest, int digestlen); + const char *digest, int digestlen); #else @@ -105,12 +137,19 @@ static inline int integrity_digsig_verify(const unsigned int id, #ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS int asymmetric_verify(struct key *keyring, const char *sig, int siglen, const char *data, int datalen); + +int integrity_init_keyring(const unsigned int id); #else static inline int asymmetric_verify(struct key *keyring, const char *sig, int siglen, const char *data, int datalen) { return -EOPNOTSUPP; } + +static inline int integrity_init_keyring(const unsigned int id) +{ + return 0; +} #endif #ifdef CONFIG_INTEGRITY_AUDIT diff --git a/security/keys/Kconfig b/security/keys/Kconfig index a90d6d300db..a4f3f8c48d6 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -4,6 +4,7 @@ config KEYS bool "Enable access key retention support" + select ASSOCIATIVE_ARRAY help This option provides support for retaining authentication tokens and access keys in the kernel. @@ -19,6 +20,34 @@ If you are unsure as to whether this is required, answer N. +config PERSISTENT_KEYRINGS + bool "Enable register of persistent per-UID keyrings" + depends on KEYS + help + This option provides a register of persistent per-UID keyrings, + primarily aimed at Kerberos key storage. The keyrings are persistent + in the sense that they stay around after all processes of that UID + have exited, not that they survive the machine being rebooted. + + A particular keyring may be accessed by either the user whose keyring + it is or by a process with administrative privileges. The active + LSMs get to rule on which admin-level processes get to access the + cache. + + Keyrings are created and added into the register upon demand and get + removed if they expire (a default timeout is set upon creation).
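For context, userspace reaches this register through the new KEYCTL_GET_PERSISTENT operation wired up in keyctl.c and compat.c below. A minimal sketch of a caller, assuming a libkeyutils new enough to expose the keyctl_get_persistent() wrapper (the raw keyctl(KEYCTL_GET_PERSISTENT, uid, dest) form is equivalent):

	#include <stdio.h>
	#include <keyutils.h>

	int main(void)
	{
		/* Fetch (creating on demand) the caller's persistent keyring
		 * and link it into the session keyring; uid -1 means the
		 * caller's own real UID.  Assumes a libkeyutils that wraps
		 * KEYCTL_GET_PERSISTENT. */
		long id = keyctl_get_persistent(-1, KEY_SPEC_SESSION_KEYRING);

		if (id < 0) {
			perror("keyctl_get_persistent");
			return 1;
		}
		printf("persistent keyring: %ld\n", id);
		return 0;
	}

Build with -lkeyutils; the keyring then survives until its expiry timeout elapses after the user's last process exits.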
+ +config BIG_KEYS + bool "Large payload keys" + depends on KEYS + depends on TMPFS + help + This option provides support for holding large keys within the kernel + (for example Kerberos ticket caches). The data may be stored out to + swapspace by tmpfs. + + If you are unsure as to whether this is required, answer N. + config TRUSTED_KEYS tristate "TRUSTED KEYS" depends on KEYS && TCG_TPM diff --git a/security/keys/Makefile b/security/keys/Makefile index 504aaa00838..dfb3a7beded 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -18,9 +18,11 @@ obj-y := \ obj-$(CONFIG_KEYS_COMPAT) += compat.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o +obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o # # Key types # +obj-$(CONFIG_BIG_KEYS) += big_key.o obj-$(CONFIG_TRUSTED_KEYS) += trusted.o obj-$(CONFIG_ENCRYPTED_KEYS) += encrypted-keys/ diff --git a/security/keys/big_key.c b/security/keys/big_key.c new file mode 100644 index 00000000000..7f44c3207a9 --- /dev/null +++ b/security/keys/big_key.c @@ -0,0 +1,207 @@ +/* Large capacity key type + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/file.h> +#include <linux/shmem_fs.h> +#include <linux/err.h> +#include <keys/user-type.h> +#include <keys/big_key-type.h> + +MODULE_LICENSE("GPL"); + +/* + * If the data is under this limit, there's no point creating a shm file to + * hold it as the permanently resident metadata for the shmem fs will be at + * least as large as the data. + */ +#define BIG_KEY_FILE_THRESHOLD (sizeof(struct inode) + sizeof(struct dentry)) + +/* + * big_key defined keys take an arbitrary string as the description and an + * arbitrary blob of data as the payload + */ +struct key_type key_type_big_key = { + .name = "big_key", + .def_lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, + .instantiate = big_key_instantiate, + .match = user_match, + .revoke = big_key_revoke, + .destroy = big_key_destroy, + .describe = big_key_describe, + .read = big_key_read, +}; + +/* + * Instantiate a big key + */ +int big_key_instantiate(struct key *key, struct key_preparsed_payload *prep) +{ + struct path *path = (struct path *)&key->payload.data2; + struct file *file; + ssize_t written; + size_t datalen = prep->datalen; + int ret; + + ret = -EINVAL; + if (datalen <= 0 || datalen > 1024 * 1024 || !prep->data) + goto error; + + /* Set an arbitrary quota */ + ret = key_payload_reserve(key, 16); + if (ret < 0) + goto error; + + key->type_data.x[1] = datalen; + + if (datalen > BIG_KEY_FILE_THRESHOLD) { + /* Create a shmem file to store the data in. This will permit the data + * to be swapped out if needed. + * + * TODO: Encrypt the stored data with a temporary key. 
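+ * Until then the payload is written to the tmpfs file in the clear,
+ * and may therefore end up on swap.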
+ */ + file = shmem_file_setup("", datalen, 0); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err_quota; + } + + written = kernel_write(file, prep->data, prep->datalen, 0); + if (written != datalen) { + ret = written; + if (written >= 0) + ret = -ENOMEM; + goto err_fput; + } + + /* Pin the mount and dentry to the key so that we can open it again + * later + */ + *path = file->f_path; + path_get(path); + fput(file); + } else { + /* Just store the data in a buffer */ + void *data = kmalloc(datalen, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto err_quota; + } + + key->payload.data = memcpy(data, prep->data, prep->datalen); + } + return 0; + +err_fput: + fput(file); +err_quota: + key_payload_reserve(key, 0); +error: + return ret; +} + +/* + * dispose of the links from a revoked keyring + * - called with the key sem write-locked + */ +void big_key_revoke(struct key *key) +{ + struct path *path = (struct path *)&key->payload.data2; + + /* clear the quota */ + key_payload_reserve(key, 0); + if (key_is_instantiated(key) && key->type_data.x[1] > BIG_KEY_FILE_THRESHOLD) + vfs_truncate(path, 0); +} + +/* + * dispose of the data dangling from the corpse of a big_key key + */ +void big_key_destroy(struct key *key) +{ + if (key->type_data.x[1] > BIG_KEY_FILE_THRESHOLD) { + struct path *path = (struct path *)&key->payload.data2; + path_put(path); + path->mnt = NULL; + path->dentry = NULL; + } else { + kfree(key->payload.data); + key->payload.data = NULL; + } +} + +/* + * describe the big_key key + */ +void big_key_describe(const struct key *key, struct seq_file *m) +{ + unsigned long datalen = key->type_data.x[1]; + + seq_puts(m, key->description); + + if (key_is_instantiated(key)) + seq_printf(m, ": %lu [%s]", + datalen, + datalen > BIG_KEY_FILE_THRESHOLD ? "file" : "buff"); +} + +/* + * read the key data + * - the key's semaphore is read-locked + */ +long big_key_read(const struct key *key, char __user *buffer, size_t buflen) +{ + unsigned long datalen = key->type_data.x[1]; + long ret; + + if (!buffer || buflen < datalen) + return datalen; + + if (datalen > BIG_KEY_FILE_THRESHOLD) { + struct path *path = (struct path *)&key->payload.data2; + struct file *file; + loff_t pos; + + file = dentry_open(path, O_RDONLY, current_cred()); + if (IS_ERR(file)) + return PTR_ERR(file); + + pos = 0; + ret = vfs_read(file, buffer, datalen, &pos); + fput(file); + if (ret >= 0 && ret != datalen) + ret = -EIO; + } else { + ret = datalen; + if (copy_to_user(buffer, key->payload.data, datalen) != 0) + ret = -EFAULT; + } + + return ret; +} + +/* + * Module stuff + */ +static int __init big_key_init(void) +{ + return register_key_type(&key_type_big_key); +} + +static void __exit big_key_cleanup(void) +{ + unregister_key_type(&key_type_big_key); +} + +module_init(big_key_init); +module_exit(big_key_cleanup); diff --git a/security/keys/compat.c b/security/keys/compat.c index d65fa7fa29b..bbd32c729db 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -138,6 +138,9 @@ asmlinkage long compat_sys_keyctl(u32 option, case KEYCTL_INVALIDATE: return keyctl_invalidate_key(arg2); + case KEYCTL_GET_PERSISTENT: + return keyctl_get_persistent(arg2, arg3); + default: return -EOPNOTSUPP; } diff --git a/security/keys/gc.c b/security/keys/gc.c index d67c97bb102..d3222b6d7d5 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -131,50 +131,6 @@ void key_gc_keytype(struct key_type *ktype) } /* - * Garbage collect pointers from a keyring. - * - * Not called with any locks held. 
The keyring's key struct will not be - * deallocated under us as only our caller may deallocate it. - */ -static void key_gc_keyring(struct key *keyring, time_t limit) -{ - struct keyring_list *klist; - int loop; - - kenter("%x", key_serial(keyring)); - - if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - goto dont_gc; - - /* scan the keyring looking for dead keys */ - rcu_read_lock(); - klist = rcu_dereference(keyring->payload.subscriptions); - if (!klist) - goto unlock_dont_gc; - - loop = klist->nkeys; - smp_rmb(); - for (loop--; loop >= 0; loop--) { - struct key *key = rcu_dereference(klist->keys[loop]); - if (key_is_dead(key, limit)) - goto do_gc; - } - -unlock_dont_gc: - rcu_read_unlock(); -dont_gc: - kleave(" [no gc]"); - return; - -do_gc: - rcu_read_unlock(); - - keyring_gc(keyring, limit); - kleave(" [gc]"); -} - -/* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) @@ -392,8 +348,7 @@ found_unreferenced_key: */ found_keyring: spin_unlock(&key_serial_lock); - kdebug("scan keyring %d", key->serial); - key_gc_keyring(key, limit); + keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and diff --git a/security/keys/internal.h b/security/keys/internal.h index d4f1468b9b5..80b2aac4f50 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -89,42 +89,53 @@ extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); extern int __key_link_begin(struct key *keyring, - const struct key_type *type, - const char *description, - unsigned long *_prealloc); + const struct keyring_index_key *index_key, + struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); -extern void __key_link(struct key *keyring, struct key *key, - unsigned long *_prealloc); +extern void __key_link(struct key *key, struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, - struct key_type *type, - unsigned long prealloc); + const struct keyring_index_key *index_key, + struct assoc_array_edit *edit); -extern key_ref_t __keyring_search_one(key_ref_t keyring_ref, - const struct key_type *type, - const char *description, - key_perm_t perm); +extern key_ref_t find_key_to_update(key_ref_t keyring_ref, + const struct keyring_index_key *index_key); extern struct key *keyring_search_instkey(struct key *keyring, key_serial_t target_id); +extern int iterate_over_keyring(const struct key *keyring, + int (*func)(const struct key *key, void *data), + void *data); + typedef int (*key_match_func_t)(const struct key *, const void *); +struct keyring_search_context { + struct keyring_index_key index_key; + const struct cred *cred; + key_match_func_t match; + const void *match_data; + unsigned flags; +#define KEYRING_SEARCH_LOOKUP_TYPE 0x0001 /* [as type->def_lookup_type] */ +#define KEYRING_SEARCH_NO_STATE_CHECK 0x0002 /* Skip state checks */ +#define KEYRING_SEARCH_DO_STATE_CHECK 0x0004 /* Override NO_STATE_CHECK */ +#define KEYRING_SEARCH_NO_UPDATE_TIME 0x0008 /* Don't update times */ +#define KEYRING_SEARCH_NO_CHECK_PERM 0x0010 /* Don't check permissions */ +#define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0020 /* Give an error on excessive depth */ + + int (*iterator)(const void *object, void *iterator_data); + + /* Internal stuff */ + int skipped_ret; + bool possessed; + key_ref_t result; + struct timespec now; +}; + extern key_ref_t 
keyring_search_aux(key_ref_t keyring_ref, - const struct cred *cred, - struct key_type *type, - const void *description, - key_match_func_t match, - bool no_state_check); - -extern key_ref_t search_my_process_keyrings(struct key_type *type, - const void *description, - key_match_func_t match, - bool no_state_check, - const struct cred *cred); -extern key_ref_t search_process_keyrings(struct key_type *type, - const void *description, - key_match_func_t match, - const struct cred *cred); + struct keyring_search_context *ctx); + +extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); +extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); @@ -202,7 +213,7 @@ extern struct key *key_get_instantiation_authkey(key_serial_t target_id); /* * Determine whether a key is dead. */ -static inline bool key_is_dead(struct key *key, time_t limit) +static inline bool key_is_dead(const struct key *key, time_t limit) { return key->flags & ((1 << KEY_FLAG_DEAD) | @@ -244,6 +255,15 @@ extern long keyctl_invalidate_key(key_serial_t); extern long keyctl_instantiate_key_common(key_serial_t, const struct iovec *, unsigned, size_t, key_serial_t); +#ifdef CONFIG_PERSISTENT_KEYRINGS +extern long keyctl_get_persistent(uid_t, key_serial_t); +extern unsigned persistent_keyring_expiry; +#else +static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) +{ + return -EOPNOTSUPP; +} +#endif /* * Debugging key validation diff --git a/security/keys/key.c b/security/keys/key.c index 8fb7c7bd465..55d110f0ace 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -242,8 +242,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, } } - desclen = strlen(desc) + 1; - quotalen = desclen + type->def_datalen; + desclen = strlen(desc); + quotalen = desclen + 1 + type->def_datalen; /* get hold of the key tracking for this user */ user = key_user_lookup(uid); @@ -277,7 +277,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, goto no_memory_2; if (desc) { - key->description = kmemdup(desc, desclen, GFP_KERNEL); + key->index_key.desc_len = desclen; + key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL); if (!key->description) goto no_memory_3; } @@ -285,7 +286,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, atomic_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); - key->type = type; + key->index_key.type = type; key->user = user; key->quotalen = quotalen; key->datalen = type->def_datalen; @@ -299,6 +300,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; + if (flags & KEY_ALLOC_TRUSTED) + key->flags |= 1 << KEY_FLAG_TRUSTED; memset(&key->type_data, 0, sizeof(key->type_data)); @@ -408,7 +411,7 @@ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, - unsigned long *_prealloc) + struct assoc_array_edit **_edit) { int ret, awaken; @@ -435,7 +438,7 @@ static int __key_instantiate_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring) - __key_link(keyring, key, _prealloc); + __key_link(key, _edit); /* disable the authorisation key */ if (authkey) @@ -475,7 +478,7 @@ int key_instantiate_and_link(struct key *key, struct key *authkey) { struct key_preparsed_payload prep; - unsigned long 
prealloc; + struct assoc_array_edit *edit; int ret; memset(&prep, 0, sizeof(prep)); @@ -489,17 +492,15 @@ int key_instantiate_and_link(struct key *key, } if (keyring) { - ret = __key_link_begin(keyring, key->type, key->description, - &prealloc); + ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_free_preparse; } - ret = __key_instantiate_and_link(key, &prep, keyring, authkey, - &prealloc); + ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); if (keyring) - __key_link_end(keyring, key->type, prealloc); + __key_link_end(keyring, &key->index_key, edit); error_free_preparse: if (key->type->preparse) @@ -537,7 +538,7 @@ int key_reject_and_link(struct key *key, struct key *keyring, struct key *authkey) { - unsigned long prealloc; + struct assoc_array_edit *edit; struct timespec now; int ret, awaken, link_ret = 0; @@ -548,8 +549,7 @@ int key_reject_and_link(struct key *key, ret = -EBUSY; if (keyring) - link_ret = __key_link_begin(keyring, key->type, - key->description, &prealloc); + link_ret = __key_link_begin(keyring, &key->index_key, &edit); mutex_lock(&key_construction_mutex); @@ -557,9 +557,10 @@ int key_reject_and_link(struct key *key, if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); + key->type_data.reject_error = -error; + smp_wmb(); set_bit(KEY_FLAG_NEGATIVE, &key->flags); set_bit(KEY_FLAG_INSTANTIATED, &key->flags); - key->type_data.reject_error = -error; now = current_kernel_time(); key->expiry = now.tv_sec + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -571,7 +572,7 @@ int key_reject_and_link(struct key *key, /* and link it into the destination keyring */ if (keyring && link_ret == 0) - __key_link(keyring, key, &prealloc); + __key_link(key, &edit); /* disable the authorisation key */ if (authkey) @@ -581,7 +582,7 @@ int key_reject_and_link(struct key *key, mutex_unlock(&key_construction_mutex); if (keyring) - __key_link_end(keyring, key->type, prealloc); + __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) @@ -645,7 +646,7 @@ found: /* this races with key_put(), but that doesn't matter since key_put() * doesn't actually change the key */ - atomic_inc(&key->usage); + __key_get(key); error: spin_unlock(&key_serial_lock); @@ -780,25 +781,27 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_perm_t perm, unsigned long flags) { - unsigned long prealloc; + struct keyring_index_key index_key = { + .description = description, + }; struct key_preparsed_payload prep; + struct assoc_array_edit *edit; const struct cred *cred = current_cred(); - struct key_type *ktype; struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; /* look up the key type to see if it's one of the registered kernel * types */ - ktype = key_type_lookup(type); - if (IS_ERR(ktype)) { + index_key.type = key_type_lookup(type); + if (IS_ERR(index_key.type)) { key_ref = ERR_PTR(-ENODEV); goto error; } key_ref = ERR_PTR(-EINVAL); - if (!ktype->match || !ktype->instantiate || - (!description && !ktype->preparse)) + if (!index_key.type->match || !index_key.type->instantiate || + (!index_key.description && !index_key.type->preparse)) goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); @@ -812,21 +815,28 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, memset(&prep, 0, sizeof(prep)); prep.data = payload; prep.datalen = plen; - prep.quotalen = ktype->def_datalen; - if (ktype->preparse) { - 
ret = ktype->preparse(&prep); + prep.quotalen = index_key.type->def_datalen; + prep.trusted = flags & KEY_ALLOC_TRUSTED; + if (index_key.type->preparse) { + ret = index_key.type->preparse(&prep); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_put_type; } - if (!description) - description = prep.description; + if (!index_key.description) + index_key.description = prep.description; key_ref = ERR_PTR(-EINVAL); - if (!description) + if (!index_key.description) goto error_free_prep; } + index_key.desc_len = strlen(index_key.description); + + key_ref = ERR_PTR(-EPERM); + if (!prep.trusted && test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags)) + goto error_free_prep; + flags |= prep.trusted ? KEY_ALLOC_TRUSTED : 0; - ret = __key_link_begin(keyring, ktype, description, &prealloc); + ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; @@ -844,10 +854,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, * key of the same type and description in the destination keyring and * update that instead if possible */ - if (ktype->update) { - key_ref = __keyring_search_one(keyring_ref, ktype, description, - 0); - if (!IS_ERR(key_ref)) + if (index_key.type->update) { + key_ref = find_key_to_update(keyring_ref, &index_key); + if (key_ref) goto found_matching_key; } @@ -856,23 +865,24 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; - if (ktype->read) + if (index_key.type->read) perm |= KEY_POS_READ; - if (ktype == &key_type_keyring || ktype->update) + if (index_key.type == &key_type_keyring || + index_key.type->update) perm |= KEY_POS_WRITE; } /* allocate a new key */ - key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred, - perm, flags); + key = key_alloc(index_key.type, index_key.description, + cred->fsuid, cred->fsgid, cred, perm, flags); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; } /* instantiate it and link it into the target keyring */ - ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &prealloc); + ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); @@ -882,12 +892,12 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: - __key_link_end(keyring, ktype, prealloc); + __key_link_end(keyring, &index_key, edit); error_free_prep: - if (ktype->preparse) - ktype->free_preparse(&prep); + if (index_key.type->preparse) + index_key.type->free_preparse(&prep); error_put_type: - key_type_put(ktype); + key_type_put(index_key.type); error: return key_ref; @@ -895,7 +905,7 @@ error: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ - __key_link_end(keyring, ktype, prealloc); + __key_link_end(keyring, &index_key, edit); key_ref = __key_update(key_ref, &prep); goto error_free_prep; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 33cfd27b4de..cee72ce6422 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1667,6 +1667,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_INVALIDATE: return keyctl_invalidate_key((key_serial_t) arg2); + case KEYCTL_GET_PERSISTENT: + return keyctl_get_persistent((uid_t)arg2, (key_serial_t)arg3); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c 
b/security/keys/keyring.c index 6ece7f2e570..69f0cb7bab7 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1,6 +1,6 @@ /* Keyring handling * - * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -17,25 +17,11 @@ #include <linux/seq_file.h> #include <linux/err.h> #include <keys/keyring-type.h> +#include <keys/user-type.h> +#include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include "internal.h" -#define rcu_dereference_locked_keyring(keyring) \ - (rcu_dereference_protected( \ - (keyring)->payload.subscriptions, \ - rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) - -#define rcu_deref_link_locked(klist, index, keyring) \ - (rcu_dereference_protected( \ - (klist)->keys[index], \ - rwsem_is_locked((struct rw_semaphore *)&(keyring)->sem))) - -#define MAX_KEYRING_LINKS \ - min_t(size_t, USHRT_MAX - 1, \ - ((PAGE_SIZE - sizeof(struct keyring_list)) / sizeof(struct key *))) - -#define KEY_LINK_FIXQUOTA 1UL - /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. @@ -47,6 +33,28 @@ */ #define KEYRING_NAME_HASH_SIZE (1 << 5) +/* + * We mark pointers we pass to the associative array with bit 1 set if + * they're keyrings and clear otherwise. + */ +#define KEYRING_PTR_SUBTYPE 0x2UL + +static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) +{ + return (unsigned long)x & KEYRING_PTR_SUBTYPE; +} +static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) +{ + void *object = assoc_array_ptr_to_leaf(x); + return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); +} +static inline void *keyring_key_to_ptr(struct key *key) +{ + if (key->type == &key_type_keyring) + return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); + return key; +} + static struct list_head keyring_name_hash[KEYRING_NAME_HASH_SIZE]; static DEFINE_RWLOCK(keyring_name_lock); @@ -67,7 +75,6 @@ static inline unsigned keyring_hash(const char *desc) */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); -static int keyring_match(const struct key *keyring, const void *criterion); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); @@ -76,9 +83,9 @@ static long keyring_read(const struct key *keyring, struct key_type key_type_keyring = { .name = "keyring", - .def_datalen = sizeof(struct keyring_list), + .def_datalen = 0, .instantiate = keyring_instantiate, - .match = keyring_match, + .match = user_match, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, @@ -127,6 +134,7 @@ static int keyring_instantiate(struct key *keyring, ret = -EINVAL; if (prep->datalen == 0) { + assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); ret = 0; @@ -136,15 +144,226 @@ static int keyring_instantiate(struct key *keyring, } /* - * Match keyrings on their name + * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd + * fold the carry back too, but that requires inline asm. 
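+ * That is, the top 32 bits of the 96-bit product are added back into the
+ * low 64 bits rather than discarded; only the carry out of that final
+ * addition is lost.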
+ */ +static u64 mult_64x32_and_fold(u64 x, u32 y) +{ + u64 hi = (u64)(u32)(x >> 32) * y; + u64 lo = (u64)(u32)(x) * y; + return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); +} + +/* + * Hash a key type and description. + */ +static unsigned long hash_key_type_and_desc(const struct keyring_index_key *index_key) +{ + const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; + const unsigned long level_mask = ASSOC_ARRAY_LEVEL_STEP_MASK; + const char *description = index_key->description; + unsigned long hash, type; + u32 piece; + u64 acc; + int n, desc_len = index_key->desc_len; + + type = (unsigned long)index_key->type; + + acc = mult_64x32_and_fold(type, desc_len + 13); + acc = mult_64x32_and_fold(acc, 9207); + for (;;) { + n = desc_len; + if (n <= 0) + break; + if (n > 4) + n = 4; + piece = 0; + memcpy(&piece, description, n); + description += n; + desc_len -= n; + acc = mult_64x32_and_fold(acc, piece); + acc = mult_64x32_and_fold(acc, 9207); + } + + /* Fold the hash down to 32 bits if need be. */ + hash = acc; + if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) + hash ^= acc >> 32; + + /* Squidge all the keyrings into a separate part of the tree to + * ordinary keys by making sure the lowest level segment in the hash is + * zero for keyrings and non-zero otherwise. + */ + if (index_key->type != &key_type_keyring && (hash & level_mask) == 0) + return hash | (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; + if (index_key->type == &key_type_keyring && (hash & level_mask) != 0) + return (hash + (hash << level_shift)) & ~level_mask; + return hash; +} + +/* + * Build the next index key chunk. + * + * On 32-bit systems the index key is laid out as: + * + * 0 4 5 9... + * hash desclen typeptr desc[] + * + * On 64-bit systems: + * + * 0 8 9 17... + * hash desclen typeptr desc[] + * + * We return it one word-sized chunk at a time. 
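+ * Level 0 thus always yields the hash; level 1 packs the bottom of the
+ * type pointer with the description length; later chunks return the
+ * description bytes (with the top byte of the type pointer woven in).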
*/ -static int keyring_match(const struct key *keyring, const void *description) +static unsigned long keyring_get_key_chunk(const void *data, int level) +{ + const struct keyring_index_key *index_key = data; + unsigned long chunk = 0; + long offset = 0; + int desc_len = index_key->desc_len, n = sizeof(chunk); + + level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; + switch (level) { + case 0: + return hash_key_type_and_desc(index_key); + case 1: + return ((unsigned long)index_key->type << 8) | desc_len; + case 2: + if (desc_len == 0) + return (u8)((unsigned long)index_key->type >> + (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8)); + n--; + offset = 1; + default: + offset += sizeof(chunk) - 1; + offset += (level - 3) * sizeof(chunk); + if (offset >= desc_len) + return 0; + desc_len -= offset; + if (desc_len > n) + desc_len = n; + offset += desc_len; + do { + chunk <<= 8; + chunk |= ((u8*)index_key->description)[--offset]; + } while (--desc_len > 0); + + if (level == 2) { + chunk <<= 8; + chunk |= (u8)((unsigned long)index_key->type >> + (ASSOC_ARRAY_KEY_CHUNK_SIZE - 8)); + } + return chunk; + } +} + +static unsigned long keyring_get_object_key_chunk(const void *object, int level) +{ + const struct key *key = keyring_ptr_to_key(object); + return keyring_get_key_chunk(&key->index_key, level); +} + +static bool keyring_compare_object(const void *object, const void *data) { - return keyring->description && - strcmp(keyring->description, description) == 0; + const struct keyring_index_key *index_key = data; + const struct key *key = keyring_ptr_to_key(object); + + return key->index_key.type == index_key->type && + key->index_key.desc_len == index_key->desc_len && + memcmp(key->index_key.description, index_key->description, + index_key->desc_len) == 0; } /* + * Compare the index keys of a pair of objects and determine the bit position + * at which they differ - if they differ. + */ +static int keyring_diff_objects(const void *_a, const void *_b) +{ + const struct key *key_a = keyring_ptr_to_key(_a); + const struct key *key_b = keyring_ptr_to_key(_b); + const struct keyring_index_key *a = &key_a->index_key; + const struct keyring_index_key *b = &key_b->index_key; + unsigned long seg_a, seg_b; + int level, i; + + level = 0; + seg_a = hash_key_type_and_desc(a); + seg_b = hash_key_type_and_desc(b); + if ((seg_a ^ seg_b) != 0) + goto differ; + + /* The number of bits contributed by the hash is controlled by a + * constant in the assoc_array headers. Everything else thereafter we + * can deal with as being machine word-size dependent. 
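+ * The level counter below advances in bytes, so the returned position is
+ * level * 8 plus the first bit that differs within the mismatching word.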
+ */ + level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; + seg_a = a->desc_len; + seg_b = b->desc_len; + if ((seg_a ^ seg_b) != 0) + goto differ; + + /* The next bit may not work on big endian */ + level++; + seg_a = (unsigned long)a->type; + seg_b = (unsigned long)b->type; + if ((seg_a ^ seg_b) != 0) + goto differ; + + level += sizeof(unsigned long); + if (a->desc_len == 0) + goto same; + + i = 0; + if (((unsigned long)a->description | (unsigned long)b->description) & + (sizeof(unsigned long) - 1)) { + do { + seg_a = *(unsigned long *)(a->description + i); + seg_b = *(unsigned long *)(b->description + i); + if ((seg_a ^ seg_b) != 0) + goto differ_plus_i; + i += sizeof(unsigned long); + } while (i < (a->desc_len & (sizeof(unsigned long) - 1))); + } + + for (; i < a->desc_len; i++) { + seg_a = *(unsigned char *)(a->description + i); + seg_b = *(unsigned char *)(b->description + i); + if ((seg_a ^ seg_b) != 0) + goto differ_plus_i; + } + +same: + return -1; + +differ_plus_i: + level += i; +differ: + i = level * 8 + __ffs(seg_a ^ seg_b); + return i; +} + +/* + * Free an object after stripping the keyring flag off of the pointer. + */ +static void keyring_free_object(void *object) +{ + key_put(keyring_ptr_to_key(object)); +} + +/* + * Operations for keyring management by the index-tree routines. + */ +static const struct assoc_array_ops keyring_assoc_array_ops = { + .get_key_chunk = keyring_get_key_chunk, + .get_object_key_chunk = keyring_get_object_key_chunk, + .compare_object = keyring_compare_object, + .diff_objects = keyring_diff_objects, + .free_object = keyring_free_object, +}; + +/* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * @@ -155,9 +374,6 @@ static int keyring_match(const struct key *keyring, const void *description) */ static void keyring_destroy(struct key *keyring) { - struct keyring_list *klist; - int loop; - if (keyring->description) { write_lock(&keyring_name_lock); @@ -168,12 +384,7 @@ static void keyring_destroy(struct key *keyring) write_unlock(&keyring_name_lock); } - klist = rcu_access_pointer(keyring->payload.subscriptions); - if (klist) { - for (loop = klist->nkeys - 1; loop >= 0; loop--) - key_put(rcu_access_pointer(klist->keys[loop])); - kfree(klist); - } + assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* @@ -181,76 +392,88 @@ static void keyring_destroy(struct key *keyring) */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { - struct keyring_list *klist; - if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_instantiated(keyring)) { - rcu_read_lock(); - klist = rcu_dereference(keyring->payload.subscriptions); - if (klist) - seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys); + if (keyring->keys.nr_leaves_on_tree != 0) + seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); - rcu_read_unlock(); } } +struct keyring_read_iterator_context { + size_t qty; + size_t count; + key_serial_t __user *buffer; +}; + +static int keyring_read_iterator(const void *object, void *data) +{ + struct keyring_read_iterator_context *ctx = data; + const struct key *key = keyring_ptr_to_key(object); + int ret; + + kenter("{%s,%d},,{%zu/%zu}", + key->type->name, key->serial, ctx->count, ctx->qty); + + if (ctx->count >= ctx->qty) + return 1; + + ret = put_user(key->serial, ctx->buffer); + if (ret < 0) + return ret; + ctx->buffer++; + ctx->count += sizeof(key->serial); + return 0; +} + /* * Read a list of key IDs 
from the keyring's contents in binary form * - * The keyring's semaphore is read-locked by the caller. + * The keyring's semaphore is read-locked by the caller. This prevents someone + * from modifying it under us - which could cause us to read key IDs multiple + * times. */ static long keyring_read(const struct key *keyring, char __user *buffer, size_t buflen) { - struct keyring_list *klist; - struct key *key; - size_t qty, tmp; - int loop, ret; + struct keyring_read_iterator_context ctx; + unsigned long nr_keys; + int ret; - ret = 0; - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* calculate how much data we could return */ - qty = klist->nkeys * sizeof(key_serial_t); - - if (buffer && buflen > 0) { - if (buflen > qty) - buflen = qty; - - /* copy the IDs of the subscribed keys into the - * buffer */ - ret = -EFAULT; - - for (loop = 0; loop < klist->nkeys; loop++) { - key = rcu_deref_link_locked(klist, loop, - keyring); - - tmp = sizeof(key_serial_t); - if (tmp > buflen) - tmp = buflen; - - if (copy_to_user(buffer, - &key->serial, - tmp) != 0) - goto error; - - buflen -= tmp; - if (buflen == 0) - break; - buffer += tmp; - } - } + kenter("{%d},,%zu", key_serial(keyring), buflen); + + if (buflen & (sizeof(key_serial_t) - 1)) + return -EINVAL; + + nr_keys = keyring->keys.nr_leaves_on_tree; + if (nr_keys == 0) + return 0; - ret = qty; + /* Calculate how much data we could return */ + ctx.qty = nr_keys * sizeof(key_serial_t); + + if (!buffer || !buflen) + return ctx.qty; + + if (buflen > ctx.qty) + ctx.qty = buflen; + + /* Copy the IDs of the subscribed keys into the buffer */ + ctx.buffer = (key_serial_t __user *)buffer; + ctx.count = 0; + ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); + if (ret < 0) { + kleave(" = %d [iterate]", ret); + return ret; } -error: - return ret; + kleave(" = %zu [ok]", ctx.count); + return ctx.count; } /* @@ -277,227 +500,361 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, } EXPORT_SYMBOL(keyring_alloc); -/** - * keyring_search_aux - Search a keyring tree for a key matching some criteria - * @keyring_ref: A pointer to the keyring with possession indicator. - * @cred: The credentials to use for permissions checks. - * @type: The type of key to search for. - * @description: Parameter for @match. - * @match: Function to rule on whether or not a key is the one required. - * @no_state_check: Don't check if a matching key is bad - * - * Search the supplied keyring tree for a key that matches the criteria given. - * The root keyring and any linked keyrings must grant Search permission to the - * caller to be searchable and keys can only be found if they too grant Search - * to the caller. The possession flag on the root keyring pointer controls use - * of the possessor bits in permissions checking of the entire tree. In - * addition, the LSM gets to forbid keyring searches and key matches. - * - * The search is performed as a breadth-then-depth search up to the prescribed - * limit (KEYRING_SEARCH_MAX_DEPTH). - * - * Keys are matched to the type provided and are then filtered by the match - * function, which is given the description to use in any way it sees fit. The - * match function may use any attributes of a key that it wishes to to - * determine the match. Normally the match function from the key type would be - * used. - * - * RCU is used to prevent the keyring key lists from disappearing without the - * need to take lots of locks. 
- * - * Returns a pointer to the found key and increments the key usage count if - * successful; -EAGAIN if no matching keys were found, or if expired or revoked - * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the - * specified keyring wasn't a keyring. - * - * In the case of a successful return, the possession attribute from - * @keyring_ref is propagated to the returned key reference. +/* + * Iteration function to consider each key found. */ -key_ref_t keyring_search_aux(key_ref_t keyring_ref, - const struct cred *cred, - struct key_type *type, - const void *description, - key_match_func_t match, - bool no_state_check) +static int keyring_search_iterator(const void *object, void *iterator_data) { - struct { - /* Need a separate keylist pointer for RCU purposes */ - struct key *keyring; - struct keyring_list *keylist; - int kix; - } stack[KEYRING_SEARCH_MAX_DEPTH]; - - struct keyring_list *keylist; - struct timespec now; - unsigned long possessed, kflags; - struct key *keyring, *key; - key_ref_t key_ref; - long err; - int sp, nkeys, kix; + struct keyring_search_context *ctx = iterator_data; + const struct key *key = keyring_ptr_to_key(object); + unsigned long kflags = key->flags; - keyring = key_ref_to_ptr(keyring_ref); - possessed = is_key_possessed(keyring_ref); - key_check(keyring); + kenter("{%d}", key->serial); - /* top keyring must have search permission to begin the search */ - err = key_task_permission(keyring_ref, cred, KEY_SEARCH); - if (err < 0) { - key_ref = ERR_PTR(err); - goto error; + /* ignore keys not of this type */ + if (key->type != ctx->index_key.type) { + kleave(" = 0 [!type]"); + return 0; } - key_ref = ERR_PTR(-ENOTDIR); - if (keyring->type != &key_type_keyring) - goto error; + /* skip invalidated, revoked and expired keys */ + if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { + if (kflags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED))) { + ctx->result = ERR_PTR(-EKEYREVOKED); + kleave(" = %d [invrev]", ctx->skipped_ret); + goto skipped; + } - rcu_read_lock(); + if (key->expiry && ctx->now.tv_sec >= key->expiry) { + ctx->result = ERR_PTR(-EKEYEXPIRED); + kleave(" = %d [expire]", ctx->skipped_ret); + goto skipped; + } + } - now = current_kernel_time(); - err = -EAGAIN; - sp = 0; - - /* firstly we should check to see if this top-level keyring is what we - * are looking for */ - key_ref = ERR_PTR(-EAGAIN); - kflags = keyring->flags; - if (keyring->type == type && match(keyring, description)) { - key = keyring; - if (no_state_check) - goto found; + /* keys that don't match */ + if (!ctx->match(key, ctx->match_data)) { + kleave(" = 0 [!match]"); + return 0; + } - /* check it isn't negative and hasn't expired or been - * revoked */ - if (kflags & (1 << KEY_FLAG_REVOKED)) - goto error_2; - if (key->expiry && now.tv_sec >= key->expiry) - goto error_2; - key_ref = ERR_PTR(key->type_data.reject_error); - if (kflags & (1 << KEY_FLAG_NEGATIVE)) - goto error_2; - goto found; + /* key must have search permissions */ + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && + key_task_permission(make_key_ref(key, ctx->possessed), + ctx->cred, KEY_SEARCH) < 0) { + ctx->result = ERR_PTR(-EACCES); + kleave(" = %d [!perm]", ctx->skipped_ret); + goto skipped; } - /* otherwise, the top keyring must not be revoked, expired, or - * negatively instantiated if we are to search it */ - key_ref = ERR_PTR(-EAGAIN); - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_NEGATIVE)) || - (keyring->expiry && now.tv_sec >= 
keyring->expiry)) - goto error_2; - - /* start processing a new keyring */ -descend: - kflags = keyring->flags; - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - goto not_this_keyring; + if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { + /* we set a different error code if we pass a negative key */ + if (kflags & (1 << KEY_FLAG_NEGATIVE)) { + smp_rmb(); + ctx->result = ERR_PTR(key->type_data.reject_error); + kleave(" = %d [neg]", ctx->skipped_ret); + goto skipped; + } + } - keylist = rcu_dereference(keyring->payload.subscriptions); - if (!keylist) - goto not_this_keyring; + /* Found */ + ctx->result = make_key_ref(key, ctx->possessed); + kleave(" = 1 [found]"); + return 1; - /* iterate through the keys in this keyring first */ - nkeys = keylist->nkeys; - smp_rmb(); - for (kix = 0; kix < nkeys; kix++) { - key = rcu_dereference(keylist->keys[kix]); - kflags = key->flags; +skipped: + return ctx->skipped_ret; +} - /* ignore keys not of this type */ - if (key->type != type) - continue; +/* + * Search inside a keyring for a key. We can search by walking to it + * directly based on its index-key or we can iterate over the entire + * tree looking for it, based on the match function. + */ +static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) +{ + if ((ctx->flags & KEYRING_SEARCH_LOOKUP_TYPE) == + KEYRING_SEARCH_LOOKUP_DIRECT) { + const void *object; + + object = assoc_array_find(&keyring->keys, + &keyring_assoc_array_ops, + &ctx->index_key); + return object ? ctx->iterator(object, ctx) : 0; + } + return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); +} - /* skip invalidated, revoked and expired keys */ - if (!no_state_check) { - if (kflags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED))) - continue; +/* + * Search a tree of keyrings that point to other keyrings up to the maximum + * depth. + */ +static bool search_nested_keyrings(struct key *keyring, + struct keyring_search_context *ctx) +{ + struct { + struct key *keyring; + struct assoc_array_node *node; + int slot; + } stack[KEYRING_SEARCH_MAX_DEPTH]; - if (key->expiry && now.tv_sec >= key->expiry) - continue; - } + struct assoc_array_shortcut *shortcut; + struct assoc_array_node *node; + struct assoc_array_ptr *ptr; + struct key *key; + int sp = 0, slot; - /* keys that don't match */ - if (!match(key, description)) - continue; + kenter("{%d},{%s,%s}", + keyring->serial, + ctx->index_key.type->name, + ctx->index_key.description); - /* key must have search permissions */ - if (key_task_permission(make_key_ref(key, possessed), - cred, KEY_SEARCH) < 0) - continue; + if (ctx->index_key.description) + ctx->index_key.desc_len = strlen(ctx->index_key.description); - if (no_state_check) + /* Check to see if this top-level keyring is what we are looking for + * and whether it is valid or not. 
+	 */
+	if (ctx->flags & KEYRING_SEARCH_LOOKUP_ITERATE ||
+	    keyring_compare_object(keyring, &ctx->index_key)) {
+		ctx->skipped_ret = 2;
+		ctx->flags |= KEYRING_SEARCH_DO_STATE_CHECK;
+		switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) {
+		case 1:
			goto found;
-
-		/* we set a different error code if we pass a negative key */
-		if (kflags & (1 << KEY_FLAG_NEGATIVE)) {
-			err = key->type_data.reject_error;
-			continue;
+		case 2:
+			return false;
+		default:
+			break;
		}
+	}
+
+	ctx->skipped_ret = 0;
+	if (ctx->flags & KEYRING_SEARCH_NO_STATE_CHECK)
+		ctx->flags &= ~KEYRING_SEARCH_DO_STATE_CHECK;

+	/* Start processing a new keyring */
+descend_to_keyring:
+	kdebug("descend to %d", keyring->serial);
+	if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) |
+			      (1 << KEY_FLAG_REVOKED)))
+		goto not_this_keyring;
+
+	/* Search through the keys in this keyring before searching its
+	 * subtrees.
+	 */
+	if (search_keyring(keyring, ctx))
		goto found;
-	}

-	/* search through the keyrings nested in this one */
-	kix = 0;
-ascend:
-	nkeys = keylist->nkeys;
-	smp_rmb();
-	for (; kix < nkeys; kix++) {
-		key = rcu_dereference(keylist->keys[kix]);
-		if (key->type != &key_type_keyring)
-			continue;
+	/* Then manually iterate through the keyrings nested in this one.
+	 *
+	 * Start from the root node of the index tree.  Because of the way the
+	 * hash function has been set up, keyrings cluster on the leftmost
+	 * branch of the root node (root slot 0) or in the root node itself.
+	 * Non-keyrings avoid the leftmost branch of the root entirely (root
+	 * slots 1-15).
+	 */
+	ptr = ACCESS_ONCE(keyring->keys.root);
+	if (!ptr)
+		goto not_this_keyring;

-		/* recursively search nested keyrings
-		 * - only search keyrings for which we have search permission
+	if (assoc_array_ptr_is_shortcut(ptr)) {
+		/* If the root is a shortcut, either the keyring only contains
+		 * keyring pointers (everything clusters behind root slot 0) or
+		 * doesn't contain any keyring pointers.
		 */
-		if (sp >= KEYRING_SEARCH_MAX_DEPTH)
+		shortcut = assoc_array_ptr_to_shortcut(ptr);
+		smp_read_barrier_depends();
+		if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0)
+			goto not_this_keyring;
+
+		ptr = ACCESS_ONCE(shortcut->next_node);
+		node = assoc_array_ptr_to_node(ptr);
+		goto begin_node;
+	}
+
+	node = assoc_array_ptr_to_node(ptr);
+	smp_read_barrier_depends();
+
+	ptr = node->slots[0];
+	if (!assoc_array_ptr_is_meta(ptr))
+		goto begin_node;
+
+descend_to_node:
+	/* Descend to a more distal node in this keyring's content tree and go
+	 * through that.
+ */ + kdebug("descend"); + if (assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + smp_read_barrier_depends(); + ptr = ACCESS_ONCE(shortcut->next_node); + BUG_ON(!assoc_array_ptr_is_node(ptr)); + node = assoc_array_ptr_to_node(ptr); + } + +begin_node: + kdebug("begin_node"); + smp_read_barrier_depends(); + slot = 0; +ascend_to_node: + /* Go through the slots in a node */ + for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { + ptr = ACCESS_ONCE(node->slots[slot]); + + if (assoc_array_ptr_is_meta(ptr) && node->back_pointer) + goto descend_to_node; + + if (!keyring_ptr_is_keyring(ptr)) continue; - if (key_task_permission(make_key_ref(key, possessed), - cred, KEY_SEARCH) < 0) + key = keyring_ptr_to_key(ptr); + + if (sp >= KEYRING_SEARCH_MAX_DEPTH) { + if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { + ctx->result = ERR_PTR(-ELOOP); + return false; + } + goto not_this_keyring; + } + + /* Search a nested keyring */ + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && + key_task_permission(make_key_ref(key, ctx->possessed), + ctx->cred, KEY_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; - stack[sp].keylist = keylist; - stack[sp].kix = kix; + stack[sp].node = node; + stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; - goto descend; + goto descend_to_keyring; } - /* the keyring we're looking at was disqualified or didn't contain a - * matching key */ + /* We've dealt with all the slots in the current node, so now we need + * to ascend to the parent and continue processing there. + */ + ptr = ACCESS_ONCE(node->back_pointer); + slot = node->parent_slot; + + if (ptr && assoc_array_ptr_is_shortcut(ptr)) { + shortcut = assoc_array_ptr_to_shortcut(ptr); + smp_read_barrier_depends(); + ptr = ACCESS_ONCE(shortcut->back_pointer); + slot = shortcut->parent_slot; + } + if (!ptr) + goto not_this_keyring; + node = assoc_array_ptr_to_node(ptr); + smp_read_barrier_depends(); + slot++; + + /* If we've ascended to the root (zero backpointer), we must have just + * finished processing the leftmost branch rather than the root slots - + * so there can't be any more keyrings for us to find. + */ + if (node->back_pointer) { + kdebug("ascend %d", slot); + goto ascend_to_node; + } + + /* The keyring we're looking at was disqualified or didn't contain a + * matching key. 
+	 */
not_this_keyring:
-	if (sp > 0) {
-		/* resume the processing of a keyring higher up in the tree */
-		sp--;
-		keyring = stack[sp].keyring;
-		keylist = stack[sp].keylist;
-		kix = stack[sp].kix + 1;
-		goto ascend;
+	kdebug("not_this_keyring %d", sp);
+	if (sp <= 0) {
+		kleave(" = false");
+		return false;
	}

-	key_ref = ERR_PTR(err);
-	goto error_2;
+	/* Resume the processing of a keyring higher up in the tree */
+	sp--;
+	keyring = stack[sp].keyring;
+	node = stack[sp].node;
+	slot = stack[sp].slot + 1;
+	kdebug("ascend to %d [%d]", keyring->serial, slot);
+	goto ascend_to_node;

-	/* we found a viable match */
+	/* We found a viable match */
found:
-	atomic_inc(&key->usage);
-	key->last_used_at = now.tv_sec;
-	keyring->last_used_at = now.tv_sec;
-	while (sp > 0)
-		stack[--sp].keyring->last_used_at = now.tv_sec;
+	key = key_ref_to_ptr(ctx->result);
	key_check(key);
-	key_ref = make_key_ref(key, possessed);
-error_2:
+	if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) {
+		key->last_used_at = ctx->now.tv_sec;
+		keyring->last_used_at = ctx->now.tv_sec;
+		while (sp > 0)
+			stack[--sp].keyring->last_used_at = ctx->now.tv_sec;
+	}
+	kleave(" = true");
+	return true;
+}
+
+/**
+ * keyring_search_aux - Search a keyring tree for a key matching some criteria
+ * @keyring_ref: A pointer to the keyring with possession indicator.
+ * @ctx: The keyring search context.
+ *
+ * Search the supplied keyring tree for a key that matches the criteria given.
+ * The root keyring and any linked keyrings must grant Search permission to the
+ * caller to be searchable and keys can only be found if they too grant Search
+ * to the caller.  The possession flag on the root keyring pointer controls use
+ * of the possessor bits in permissions checking of the entire tree.  In
+ * addition, the LSM gets to forbid keyring searches and key matches.
+ *
+ * The search is performed as a breadth-then-depth search up to the prescribed
+ * limit (KEYRING_SEARCH_MAX_DEPTH).
+ *
+ * Keys are matched to the type provided and are then filtered by the match
+ * function, which is given the description to use in any way it sees fit.  The
+ * match function may use any attributes of a key that it wishes to
+ * determine the match.  Normally the match function from the key type would be
+ * used.
+ *
+ * RCU can be used to prevent the keyring key lists from disappearing without
+ * the need to take lots of locks.
+ *
+ * Returns a pointer to the found key and increments the key usage count if
+ * successful; -EAGAIN if no matching keys were found, or if expired or revoked
+ * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the
+ * specified keyring wasn't a keyring.
+ *
+ * In the case of a successful return, the possession attribute from
+ * @keyring_ref is propagated to the returned key reference.
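+ *
+ * For illustration, callers set the context up along the lines of
+ * keyring_search() below (a sketch of the expected usage, not a new API):
+ *
+ *	struct keyring_search_context ctx = {
+ *		.index_key.type		= type,
+ *		.index_key.description	= description,
+ *		.cred			= current_cred(),
+ *		.match			= type->match,
+ *		.match_data		= description,
+ *		.flags			= (type->def_lookup_type |
+ *					   KEYRING_SEARCH_DO_STATE_CHECK),
+ *	};
+ *	key_ref_t ref = keyring_search_aux(keyring_ref, &ctx);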
+ */ +key_ref_t keyring_search_aux(key_ref_t keyring_ref, + struct keyring_search_context *ctx) +{ + struct key *keyring; + long err; + + ctx->iterator = keyring_search_iterator; + ctx->possessed = is_key_possessed(keyring_ref); + ctx->result = ERR_PTR(-EAGAIN); + + keyring = key_ref_to_ptr(keyring_ref); + key_check(keyring); + + if (keyring->type != &key_type_keyring) + return ERR_PTR(-ENOTDIR); + + if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { + err = key_task_permission(keyring_ref, ctx->cred, KEY_SEARCH); + if (err < 0) + return ERR_PTR(err); + } + + rcu_read_lock(); + ctx->now = current_kernel_time(); + if (search_nested_keyrings(keyring, ctx)) + __key_get(key_ref_to_ptr(ctx->result)); rcu_read_unlock(); -error: - return key_ref; + return ctx->result; } /** @@ -507,77 +864,73 @@ error: * @description: The name of the keyring we want to find. * * As keyring_search_aux() above, but using the current task's credentials and - * type's default matching function. + * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description) { - if (!type->match) + struct keyring_search_context ctx = { + .index_key.type = type, + .index_key.description = description, + .cred = current_cred(), + .match = type->match, + .match_data = description, + .flags = (type->def_lookup_type | + KEYRING_SEARCH_DO_STATE_CHECK), + }; + + if (!ctx.match) return ERR_PTR(-ENOKEY); - return keyring_search_aux(keyring, current->cred, - type, description, type->match, false); + return keyring_search_aux(keyring, &ctx); } EXPORT_SYMBOL(keyring_search); /* - * Search the given keyring only (no recursion). + * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the - * permission is granted to search the keyring as no check is made here. - * - * RCU is used to make it unnecessary to lock the keyring key list here. + * permission is granted to modify the keyring as no check is made here. The + * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if - * successful and returns -ENOKEY if not found. Revoked keys and keys not - * providing the requested permission are skipped over. + * successful and returns NULL if not found. Revoked and invalidated keys are + * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. 
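+ *
+ * A minimal call sketch, assuming a hypothetical "user"-type key named "foo"
+ * (the caller holds the keyring semaphore as stated above):
+ *
+ *	struct keyring_index_key index_key = {
+ *		.type		= &key_type_user,
+ *		.description	= "foo",
+ *		.desc_len	= 3,
+ *	};
+ *	key_ref_t ref = find_key_to_update(keyring_ref, &index_key);
+ *	if (ref)
+ *		... update the key behind ref ...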
 */
-key_ref_t __keyring_search_one(key_ref_t keyring_ref,
-			       const struct key_type *ktype,
-			       const char *description,
-			       key_perm_t perm)
+key_ref_t find_key_to_update(key_ref_t keyring_ref,
+			     const struct keyring_index_key *index_key)
{
-	struct keyring_list *klist;
-	unsigned long possessed;
	struct key *keyring, *key;
-	int nkeys, loop;
+	const void *object;

	keyring = key_ref_to_ptr(keyring_ref);
-	possessed = is_key_possessed(keyring_ref);

-	rcu_read_lock();
+	kenter("{%d},{%s,%s}",
+	       keyring->serial, index_key->type->name, index_key->description);

-	klist = rcu_dereference(keyring->payload.subscriptions);
-	if (klist) {
-		nkeys = klist->nkeys;
-		smp_rmb();
-		for (loop = 0; loop < nkeys ; loop++) {
-			key = rcu_dereference(klist->keys[loop]);
-			if (key->type == ktype &&
-			    (!key->type->match ||
-			     key->type->match(key, description)) &&
-			    key_permission(make_key_ref(key, possessed),
-					   perm) == 0 &&
-			    !(key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-					    (1 << KEY_FLAG_REVOKED)))
-			    )
-				goto found;
-		}
-	}
+	object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops,
+				  index_key);

-	rcu_read_unlock();
-	return ERR_PTR(-ENOKEY);
+	if (object)
+		goto found;
+
+	kleave(" = NULL");
+	return NULL;

found:
-	atomic_inc(&key->usage);
-	keyring->last_used_at = key->last_used_at =
-		current_kernel_time().tv_sec;
-	rcu_read_unlock();
-	return make_key_ref(key, possessed);
+	key = keyring_ptr_to_key(object);
+	if (key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+			  (1 << KEY_FLAG_REVOKED))) {
+		kleave(" = NULL [x]");
+		return NULL;
+	}
+	__key_get(key);
+	kleave(" = {%d}", key->serial);
+	return make_key_ref(key, is_key_possessed(keyring_ref));
}

/*
@@ -640,6 +993,19 @@ out:
	return keyring;
}

+static int keyring_detect_cycle_iterator(const void *object,
+					 void *iterator_data)
+{
+	struct keyring_search_context *ctx = iterator_data;
+	const struct key *key = keyring_ptr_to_key(object);
+
+	kenter("{%d}", key->serial);
+
+	BUG_ON(key != ctx->match_data);
+	ctx->result = ERR_PTR(-EDEADLK);
+	return 1;
+}
+
/*
 * See if a cycle will be created by inserting acyclic tree B in acyclic
 * tree A at the topmost level (ie: as a direct child of A).
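The effect of this rewrite is observable from userspace: a link that would
close a loop fails with EDEADLK, and over-deep nesting with ELOOP.  A small
demonstration, assuming the libkeyutils wrappers are available (build with
-lkeyutils); linking a keyring to itself is the shortest possible cycle:

	#include <stdio.h>
	#include <keyutils.h>

	int main(void)
	{
		/* Create a keyring attached to the session keyring */
		key_serial_t kr = add_key("keyring", "demo", NULL, 0,
					  KEY_SPEC_SESSION_KEYRING);
		if (kr < 0) {
			perror("add_key");
			return 1;
		}

		/* Linking a keyring into itself is a one-link cycle */
		if (keyctl_link(kr, kr) < 0)
			perror("keyctl_link");	/* expected to fail with EDEADLK */
		return 0;
	}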
@@ -649,116 +1015,39 @@ out:
 */
static int keyring_detect_cycle(struct key *A, struct key *B)
{
-	struct {
-		struct keyring_list *keylist;
-		int kix;
-	} stack[KEYRING_SEARCH_MAX_DEPTH];
-
-	struct keyring_list *keylist;
-	struct key *subtree, *key;
-	int sp, nkeys, kix, ret;
+	struct keyring_search_context ctx = {
+		.index_key	= A->index_key,
+		.match_data	= A,
+		.iterator	= keyring_detect_cycle_iterator,
+		.flags		= (KEYRING_SEARCH_LOOKUP_DIRECT |
+				   KEYRING_SEARCH_NO_STATE_CHECK |
+				   KEYRING_SEARCH_NO_UPDATE_TIME |
+				   KEYRING_SEARCH_NO_CHECK_PERM |
+				   KEYRING_SEARCH_DETECT_TOO_DEEP),
+	};

	rcu_read_lock();
-
-	ret = -EDEADLK;
-	if (A == B)
-		goto cycle_detected;
-
-	subtree = B;
-	sp = 0;
-
-	/* start processing a new keyring */
-descend:
-	if (test_bit(KEY_FLAG_REVOKED, &subtree->flags))
-		goto not_this_keyring;
-
-	keylist = rcu_dereference(subtree->payload.subscriptions);
-	if (!keylist)
-		goto not_this_keyring;
-	kix = 0;
-
-ascend:
-	/* iterate through the remaining keys in this keyring */
-	nkeys = keylist->nkeys;
-	smp_rmb();
-	for (; kix < nkeys; kix++) {
-		key = rcu_dereference(keylist->keys[kix]);
-
-		if (key == A)
-			goto cycle_detected;
-
-		/* recursively check nested keyrings */
-		if (key->type == &key_type_keyring) {
-			if (sp >= KEYRING_SEARCH_MAX_DEPTH)
-				goto too_deep;
-
-			/* stack the current position */
-			stack[sp].keylist = keylist;
-			stack[sp].kix = kix;
-			sp++;
-
-			/* begin again with the new keyring */
-			subtree = key;
-			goto descend;
-		}
-	}
-
-	/* the keyring we're looking at was disqualified or didn't contain a
-	 * matching key */
-not_this_keyring:
-	if (sp > 0) {
-		/* resume the checking of a keyring higher up in the tree */
-		sp--;
-		keylist = stack[sp].keylist;
-		kix = stack[sp].kix + 1;
-		goto ascend;
-	}
-
-	ret = 0; /* no cycles detected */
-
-error:
+	search_nested_keyrings(B, &ctx);
	rcu_read_unlock();
-	return ret;
-
-too_deep:
-	ret = -ELOOP;
-	goto error;
-
-cycle_detected:
-	ret = -EDEADLK;
-	goto error;
-}
-
-/*
- * Dispose of a keyring list after the RCU grace period, freeing the unlinked
- * key
- */
-static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
-{
-	struct keyring_list *klist =
-		container_of(rcu, struct keyring_list, rcu);
-
-	if (klist->delkey != USHRT_MAX)
-		key_put(rcu_access_pointer(klist->keys[klist->delkey]));
-	kfree(klist);
+	return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result);
}

/*
 * Preallocate memory so that a key can be linked into a keyring.
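+ *
+ * Callers are expected to follow the pattern key_link() uses further down,
+ * sketched here:
+ *
+ *	struct assoc_array_edit *edit;
+ *	ret = __key_link_begin(keyring, &key->index_key, &edit);
+ *	if (ret == 0) {
+ *		ret = __key_link_check_live_key(keyring, key);
+ *		if (ret == 0)
+ *			__key_link(key, &edit);
+ *		__key_link_end(keyring, &key->index_key, edit);
+ *	}
+ *
+ * __key_link() applies the edit and sets it to NULL, so the subsequent
+ * __key_link_end() only cancels an edit that was never applied.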
 */
-int __key_link_begin(struct key *keyring, const struct key_type *type,
-		     const char *description, unsigned long *_prealloc)
+int __key_link_begin(struct key *keyring,
+		     const struct keyring_index_key *index_key,
+		     struct assoc_array_edit **_edit)
	__acquires(&keyring->sem)
	__acquires(&keyring_serialise_link_sem)
{
-	struct keyring_list *klist, *nklist;
-	unsigned long prealloc;
-	unsigned max;
-	time_t lowest_lru;
-	size_t size;
-	int loop, lru, ret;
+	struct assoc_array_edit *edit;
+	int ret;
+
+	kenter("%d,%s,%s,",
+	       keyring->serial, index_key->type->name, index_key->description);

-	kenter("%d,%s,%s,", key_serial(keyring), type->name, description);
+	BUG_ON(index_key->desc_len == 0);

	if (keyring->type != &key_type_keyring)
		return -ENOTDIR;

@@ -771,100 +1060,39 @@ int __key_link_begin(struct key *keyring, const struct key_type *type,

	/* serialise link/link calls to prevent parallel calls causing a cycle
	 * when linking two keyrings in opposite orders */
-	if (type == &key_type_keyring)
+	if (index_key->type == &key_type_keyring)
		down_write(&keyring_serialise_link_sem);

-	klist = rcu_dereference_locked_keyring(keyring);
-
-	/* see if there's a matching key we can displace */
-	lru = -1;
-	if (klist && klist->nkeys > 0) {
-		lowest_lru = TIME_T_MAX;
-		for (loop = klist->nkeys - 1; loop >= 0; loop--) {
-			struct key *key = rcu_deref_link_locked(klist, loop,
-								keyring);
-			if (key->type == type &&
-			    strcmp(key->description, description) == 0) {
-				/* Found a match - we'll replace the link with
-				 * one to the new key.  We record the slot
-				 * position.
-				 */
-				klist->delkey = loop;
-				prealloc = 0;
-				goto done;
-			}
-			if (key->last_used_at < lowest_lru) {
-				lowest_lru = key->last_used_at;
-				lru = loop;
-			}
-		}
-	}
-
-	/* If the keyring is full then do an LRU discard */
-	if (klist &&
-	    klist->nkeys == klist->maxkeys &&
-	    klist->maxkeys >= MAX_KEYRING_LINKS) {
-		kdebug("LRU discard %d\n", lru);
-		klist->delkey = lru;
-		prealloc = 0;
-		goto done;
-	}
-
-	/* check that we aren't going to overrun the user's quota */
-	ret = key_payload_reserve(keyring,
-				  keyring->datalen + KEYQUOTA_LINK_BYTES);
-	if (ret < 0)
+	/* Create an edit script that will insert/replace the key in the
+	 * keyring tree.
+	 */
+	edit = assoc_array_insert(&keyring->keys,
+				  &keyring_assoc_array_ops,
+				  index_key,
+				  NULL);
+	if (IS_ERR(edit)) {
+		ret = PTR_ERR(edit);
		goto error_sem;
+	}

-	if (klist && klist->nkeys < klist->maxkeys) {
-		/* there's sufficient slack space to append directly */
-		klist->delkey = klist->nkeys;
-		prealloc = KEY_LINK_FIXQUOTA;
-	} else {
-		/* grow the key list */
-		max = 4;
-		if (klist) {
-			max += klist->maxkeys;
-			if (max > MAX_KEYRING_LINKS)
-				max = MAX_KEYRING_LINKS;
-			BUG_ON(max <= klist->maxkeys);
-		}
-
-		size = sizeof(*klist) + sizeof(struct key *) * max;
-
-		ret = -ENOMEM;
-		nklist = kmalloc(size, GFP_KERNEL);
-		if (!nklist)
-			goto error_quota;
-
-		nklist->maxkeys = max;
-		if (klist) {
-			memcpy(nklist->keys, klist->keys,
-			       sizeof(struct key *) * klist->nkeys);
-			nklist->delkey = klist->nkeys;
-			nklist->nkeys = klist->nkeys + 1;
-			klist->delkey = USHRT_MAX;
-		} else {
-			nklist->nkeys = 1;
-			nklist->delkey = 0;
-		}
-
-		/* add the key into the new space */
-		RCU_INIT_POINTER(nklist->keys[nklist->delkey], NULL);
-		prealloc = (unsigned long)nklist | KEY_LINK_FIXQUOTA;
+	/* If we're not replacing a link in-place then we're going to need some
+	 * extra quota.
+	 */
+	if (!edit->dead_leaf) {
+		ret = key_payload_reserve(keyring,
+					  keyring->datalen + KEYQUOTA_LINK_BYTES);
+		if (ret < 0)
+			goto error_cancel;
	}

-done:
-	*_prealloc = prealloc;
+	*_edit = edit;
	kleave(" = 0");
	return 0;

-error_quota:
-	/* undo the quota changes */
-	key_payload_reserve(keyring,
-			    keyring->datalen - KEYQUOTA_LINK_BYTES);
+error_cancel:
+	assoc_array_cancel_edit(edit);
error_sem:
-	if (type == &key_type_keyring)
+	if (index_key->type == &key_type_keyring)
		up_write(&keyring_serialise_link_sem);
error_krsem:
	up_write(&keyring->sem);

@@ -895,60 +1123,12 @@ int __key_link_check_live_key(struct key *keyring, struct key *key)
 * holds at most one link to any given key of a particular type+description
 * combination.
 */
-void __key_link(struct key *keyring, struct key *key,
-		unsigned long *_prealloc)
+void __key_link(struct key *key, struct assoc_array_edit **_edit)
{
-	struct keyring_list *klist, *nklist;
-	struct key *discard;
-
-	nklist = (struct keyring_list *)(*_prealloc & ~KEY_LINK_FIXQUOTA);
-	*_prealloc = 0;
-
-	kenter("%d,%d,%p", keyring->serial, key->serial, nklist);
-
-	klist = rcu_dereference_locked_keyring(keyring);
-
-	atomic_inc(&key->usage);
-	keyring->last_used_at = key->last_used_at =
-		current_kernel_time().tv_sec;
-
-	/* there's a matching key we can displace or an empty slot in a newly
-	 * allocated list we can fill */
-	if (nklist) {
-		kdebug("reissue %hu/%hu/%hu",
-		       nklist->delkey, nklist->nkeys, nklist->maxkeys);
-
-		RCU_INIT_POINTER(nklist->keys[nklist->delkey], key);
-
-		rcu_assign_pointer(keyring->payload.subscriptions, nklist);
-
-		/* dispose of the old keyring list and, if there was one, the
-		 * displaced key */
-		if (klist) {
-			kdebug("dispose %hu/%hu/%hu",
-			       klist->delkey, klist->nkeys, klist->maxkeys);
-			call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
-		}
-	} else if (klist->delkey < klist->nkeys) {
-		kdebug("replace %hu/%hu/%hu",
-		       klist->delkey, klist->nkeys, klist->maxkeys);
-
-		discard = rcu_dereference_protected(
-			klist->keys[klist->delkey],
-			rwsem_is_locked(&keyring->sem));
-		rcu_assign_pointer(klist->keys[klist->delkey], key);
-		/* The garbage collector will take care of RCU
-		 * synchronisation */
-		key_put(discard);
-	} else {
-		/* there's sufficient slack space to append directly */
-		kdebug("append %hu/%hu/%hu",
-		       klist->delkey, klist->nkeys, klist->maxkeys);
-
-		RCU_INIT_POINTER(klist->keys[klist->delkey], key);
-		smp_wmb();
-		klist->nkeys++;
-	}
+	__key_get(key);
+	assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key));
+	assoc_array_apply_edit(*_edit);
+	*_edit = NULL;
}

/*
@@ -956,24 +1136,22 @@ void __key_link(struct key *keyring, struct key *key,
 *
 * Must be called with __key_link_begin() having been called.
*/ -void __key_link_end(struct key *keyring, struct key_type *type, - unsigned long prealloc) +void __key_link_end(struct key *keyring, + const struct keyring_index_key *index_key, + struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_sem) { - BUG_ON(type == NULL); - BUG_ON(type->name == NULL); - kenter("%d,%s,%lx", keyring->serial, type->name, prealloc); + BUG_ON(index_key->type == NULL); + kenter("%d,%s,", keyring->serial, index_key->type->name); - if (type == &key_type_keyring) + if (index_key->type == &key_type_keyring) up_write(&keyring_serialise_link_sem); - if (prealloc) { - if (prealloc & KEY_LINK_FIXQUOTA) - key_payload_reserve(keyring, - keyring->datalen - - KEYQUOTA_LINK_BYTES); - kfree((struct keyring_list *)(prealloc & ~KEY_LINK_FIXQUOTA)); + if (edit && !edit->dead_leaf) { + key_payload_reserve(keyring, + keyring->datalen - KEYQUOTA_LINK_BYTES); + assoc_array_cancel_edit(edit); } up_write(&keyring->sem); } @@ -1000,20 +1178,28 @@ void __key_link_end(struct key *keyring, struct key_type *type, */ int key_link(struct key *keyring, struct key *key) { - unsigned long prealloc; + struct assoc_array_edit *edit; int ret; + kenter("{%d,%d}", keyring->serial, atomic_read(&keyring->usage)); + key_check(keyring); key_check(key); - ret = __key_link_begin(keyring, key->type, key->description, &prealloc); + if (test_bit(KEY_FLAG_TRUSTED_ONLY, &keyring->flags) && + !test_bit(KEY_FLAG_TRUSTED, &key->flags)) + return -EPERM; + + ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret == 0) { + kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage)); ret = __key_link_check_live_key(keyring, key); if (ret == 0) - __key_link(keyring, key, &prealloc); - __key_link_end(keyring, key->type, prealloc); + __key_link(key, &edit); + __key_link_end(keyring, &key->index_key, edit); } + kleave(" = %d {%d,%d}", ret, keyring->serial, atomic_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); @@ -1037,90 +1223,37 @@ EXPORT_SYMBOL(key_link); */ int key_unlink(struct key *keyring, struct key *key) { - struct keyring_list *klist, *nklist; - int loop, ret; + struct assoc_array_edit *edit; + int ret; key_check(keyring); key_check(key); - ret = -ENOTDIR; if (keyring->type != &key_type_keyring) - goto error; + return -ENOTDIR; down_write(&keyring->sem); - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* search the keyring for the key */ - for (loop = 0; loop < klist->nkeys; loop++) - if (rcu_access_pointer(klist->keys[loop]) == key) - goto key_is_present; + edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, + &key->index_key); + if (IS_ERR(edit)) { + ret = PTR_ERR(edit); + goto error; } - - up_write(&keyring->sem); ret = -ENOENT; - goto error; - -key_is_present: - /* we need to copy the key list for RCU purposes */ - nklist = kmalloc(sizeof(*klist) + - sizeof(struct key *) * klist->maxkeys, - GFP_KERNEL); - if (!nklist) - goto nomem; - nklist->maxkeys = klist->maxkeys; - nklist->nkeys = klist->nkeys - 1; - - if (loop > 0) - memcpy(&nklist->keys[0], - &klist->keys[0], - loop * sizeof(struct key *)); - - if (loop < nklist->nkeys) - memcpy(&nklist->keys[loop], - &klist->keys[loop + 1], - (nklist->nkeys - loop) * sizeof(struct key *)); - - /* adjust the user's quota */ - key_payload_reserve(keyring, - keyring->datalen - KEYQUOTA_LINK_BYTES); - - rcu_assign_pointer(keyring->payload.subscriptions, nklist); - - up_write(&keyring->sem); - - /* schedule for later cleanup */ - klist->delkey = loop; - 
call_rcu(&klist->rcu, keyring_unlink_rcu_disposal); + if (edit == NULL) + goto error; + assoc_array_apply_edit(edit); + key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); ret = 0; error: - return ret; -nomem: - ret = -ENOMEM; up_write(&keyring->sem); - goto error; + return ret; } EXPORT_SYMBOL(key_unlink); -/* - * Dispose of a keyring list after the RCU grace period, releasing the keys it - * links to. - */ -static void keyring_clear_rcu_disposal(struct rcu_head *rcu) -{ - struct keyring_list *klist; - int loop; - - klist = container_of(rcu, struct keyring_list, rcu); - - for (loop = klist->nkeys - 1; loop >= 0; loop--) - key_put(rcu_access_pointer(klist->keys[loop])); - - kfree(klist); -} - /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. @@ -1131,33 +1264,25 @@ static void keyring_clear_rcu_disposal(struct rcu_head *rcu) */ int keyring_clear(struct key *keyring) { - struct keyring_list *klist; + struct assoc_array_edit *edit; int ret; - ret = -ENOTDIR; - if (keyring->type == &key_type_keyring) { - /* detach the pointer block with the locks held */ - down_write(&keyring->sem); - - klist = rcu_dereference_locked_keyring(keyring); - if (klist) { - /* adjust the quota */ - key_payload_reserve(keyring, - sizeof(struct keyring_list)); - - rcu_assign_pointer(keyring->payload.subscriptions, - NULL); - } - - up_write(&keyring->sem); + if (keyring->type != &key_type_keyring) + return -ENOTDIR; - /* free the keys after the locks have been dropped */ - if (klist) - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); + down_write(&keyring->sem); + edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); + if (IS_ERR(edit)) { + ret = PTR_ERR(edit); + } else { + if (edit) + assoc_array_apply_edit(edit); + key_payload_reserve(keyring, 0); ret = 0; } + up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); @@ -1169,111 +1294,68 @@ EXPORT_SYMBOL(keyring_clear); */ static void keyring_revoke(struct key *keyring) { - struct keyring_list *klist; + struct assoc_array_edit *edit; + + edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); + if (!IS_ERR(edit)) { + if (edit) + assoc_array_apply_edit(edit); + key_payload_reserve(keyring, 0); + } +} + +static bool keyring_gc_select_iterator(void *object, void *iterator_data) +{ + struct key *key = keyring_ptr_to_key(object); + time_t *limit = iterator_data; - klist = rcu_dereference_locked_keyring(keyring); + if (key_is_dead(key, *limit)) + return false; + key_get(key); + return true; +} - /* adjust the quota */ - key_payload_reserve(keyring, 0); +static int keyring_gc_check_iterator(const void *object, void *iterator_data) +{ + const struct key *key = keyring_ptr_to_key(object); + time_t *limit = iterator_data; - if (klist) { - rcu_assign_pointer(keyring->payload.subscriptions, NULL); - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); - } + key_check(key); + return key_is_dead(key, *limit); } /* - * Collect garbage from the contents of a keyring, replacing the old list with - * a new one with the pointers all shuffled down. + * Garbage collect pointers from a keyring. * - * Dead keys are classed as oned that are flagged as being dead or are revoked, - * expired or negative keys that were revoked or expired before the specified - * limit. + * Not called with any locks held. The keyring's key struct will not be + * deallocated under us as only our caller may deallocate it. 
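+ *
+ * The collection happens in two passes: first a scan under the RCU read lock
+ * with keyring_gc_check_iterator() to see whether any dead keys are present
+ * at all, and only if so a sweep with assoc_array_gc() under the write-locked
+ * keyring semaphore, with keyring_gc_select_iterator() choosing which keys
+ * survive (pinning each survivor with key_get()).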
*/ void keyring_gc(struct key *keyring, time_t limit) { - struct keyring_list *klist, *new; - struct key *key; - int loop, keep, max; - - kenter("{%x,%s}", key_serial(keyring), keyring->description); - - down_write(&keyring->sem); - - klist = rcu_dereference_locked_keyring(keyring); - if (!klist) - goto no_klist; - - /* work out how many subscriptions we're keeping */ - keep = 0; - for (loop = klist->nkeys - 1; loop >= 0; loop--) - if (!key_is_dead(rcu_deref_link_locked(klist, loop, keyring), - limit)) - keep++; - - if (keep == klist->nkeys) - goto just_return; - - /* allocate a new keyring payload */ - max = roundup(keep, 4); - new = kmalloc(sizeof(struct keyring_list) + max * sizeof(struct key *), - GFP_KERNEL); - if (!new) - goto nomem; - new->maxkeys = max; - new->nkeys = 0; - new->delkey = 0; - - /* install the live keys - * - must take care as expired keys may be updated back to life - */ - keep = 0; - for (loop = klist->nkeys - 1; loop >= 0; loop--) { - key = rcu_deref_link_locked(klist, loop, keyring); - if (!key_is_dead(key, limit)) { - if (keep >= max) - goto discard_new; - RCU_INIT_POINTER(new->keys[keep++], key_get(key)); - } - } - new->nkeys = keep; - - /* adjust the quota */ - key_payload_reserve(keyring, - sizeof(struct keyring_list) + - KEYQUOTA_LINK_BYTES * keep); + int result; - if (keep == 0) { - rcu_assign_pointer(keyring->payload.subscriptions, NULL); - kfree(new); - } else { - rcu_assign_pointer(keyring->payload.subscriptions, new); - } + kenter("%x{%s}", keyring->serial, keyring->description ?: ""); - up_write(&keyring->sem); + if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED))) + goto dont_gc; - call_rcu(&klist->rcu, keyring_clear_rcu_disposal); - kleave(" [yes]"); - return; - -discard_new: - new->nkeys = keep; - keyring_clear_rcu_disposal(&new->rcu); - up_write(&keyring->sem); - kleave(" [discard]"); - return; - -just_return: - up_write(&keyring->sem); - kleave(" [no dead]"); - return; + /* scan the keyring looking for dead keys */ + rcu_read_lock(); + result = assoc_array_iterate(&keyring->keys, + keyring_gc_check_iterator, &limit); + rcu_read_unlock(); + if (result == true) + goto do_gc; -no_klist: - up_write(&keyring->sem); - kleave(" [no_klist]"); +dont_gc: + kleave(" [no gc]"); return; -nomem: +do_gc: + down_write(&keyring->sem); + assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, + keyring_gc_select_iterator, &limit); up_write(&keyring->sem); - kleave(" [oom]"); + kleave(" [gc]"); } diff --git a/security/keys/persistent.c b/security/keys/persistent.c new file mode 100644 index 00000000000..0ad3ee28378 --- /dev/null +++ b/security/keys/persistent.c @@ -0,0 +1,167 @@ +/* General persistent per-UID keyrings register + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/user_namespace.h> +#include "internal.h" + +unsigned persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ + +/* + * Create the persistent keyring register for the current user namespace. + * + * Called with the namespace's sem locked for writing. 
+ */ +static int key_create_persistent_register(struct user_namespace *ns) +{ + struct key *reg = keyring_alloc(".persistent_register", + KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(reg)) + return PTR_ERR(reg); + + ns->persistent_keyring_register = reg; + return 0; +} + +/* + * Create the persistent keyring for the specified user. + * + * Called with the namespace's sem locked for writing. + */ +static key_ref_t key_create_persistent(struct user_namespace *ns, kuid_t uid, + struct keyring_index_key *index_key) +{ + struct key *persistent; + key_ref_t reg_ref, persistent_ref; + + if (!ns->persistent_keyring_register) { + long err = key_create_persistent_register(ns); + if (err < 0) + return ERR_PTR(err); + } else { + reg_ref = make_key_ref(ns->persistent_keyring_register, true); + persistent_ref = find_key_to_update(reg_ref, index_key); + if (persistent_ref) + return persistent_ref; + } + + persistent = keyring_alloc(index_key->description, + uid, INVALID_GID, current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, + ns->persistent_keyring_register); + if (IS_ERR(persistent)) + return ERR_CAST(persistent); + + return make_key_ref(persistent, true); +} + +/* + * Get the persistent keyring for a specific UID and link it to the nominated + * keyring. + */ +static long key_get_persistent(struct user_namespace *ns, kuid_t uid, + key_ref_t dest_ref) +{ + struct keyring_index_key index_key; + struct key *persistent; + key_ref_t reg_ref, persistent_ref; + char buf[32]; + long ret; + + /* Look in the register if it exists */ + index_key.type = &key_type_keyring; + index_key.description = buf; + index_key.desc_len = sprintf(buf, "_persistent.%u", from_kuid(ns, uid)); + + if (ns->persistent_keyring_register) { + reg_ref = make_key_ref(ns->persistent_keyring_register, true); + down_read(&ns->persistent_keyring_register_sem); + persistent_ref = find_key_to_update(reg_ref, &index_key); + up_read(&ns->persistent_keyring_register_sem); + + if (persistent_ref) + goto found; + } + + /* It wasn't in the register, so we'll need to create it. We might + * also need to create the register. + */ + down_write(&ns->persistent_keyring_register_sem); + persistent_ref = key_create_persistent(ns, uid, &index_key); + up_write(&ns->persistent_keyring_register_sem); + if (!IS_ERR(persistent_ref)) + goto found; + + return PTR_ERR(persistent_ref); + +found: + ret = key_task_permission(persistent_ref, current_cred(), KEY_LINK); + if (ret == 0) { + persistent = key_ref_to_ptr(persistent_ref); + ret = key_link(key_ref_to_ptr(dest_ref), persistent); + if (ret == 0) { + key_set_timeout(persistent, persistent_keyring_expiry); + ret = persistent->serial; + } + } + + key_ref_put(persistent_ref); + return ret; +} + +/* + * Get the persistent keyring for a specific UID and link it to the nominated + * keyring. + */ +long keyctl_get_persistent(uid_t _uid, key_serial_t destid) +{ + struct user_namespace *ns = current_user_ns(); + key_ref_t dest_ref; + kuid_t uid; + long ret; + + /* -1 indicates the current user */ + if (_uid == (uid_t)-1) { + uid = current_uid(); + } else { + uid = make_kuid(ns, _uid); + if (!uid_valid(uid)) + return -EINVAL; + + /* You can only see your own persistent cache if you're not + * sufficiently privileged. 
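+		 * (CAP_SETUID in the user namespace lifts this restriction.)
+		 *
+		 * From userspace the whole operation surfaces as the new
+		 * keyctl(KEYCTL_GET_PERSISTENT, uid, dest_keyring) call;
+		 * recent keyutils is assumed to wrap it as well, e.g.:
+		 *
+		 *	key_serial_t id;
+		 *	id = keyctl_get_persistent(-1, KEY_SPEC_SESSION_KEYRING);
+		 *
+		 * where a uid of -1 means "the current UID".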
+ */ + if (!uid_eq(uid, current_uid()) && + !uid_eq(uid, current_euid()) && + !ns_capable(ns, CAP_SETUID)) + return -EPERM; + } + + /* There must be a destination keyring */ + dest_ref = lookup_user_key(destid, KEY_LOOKUP_CREATE, KEY_WRITE); + if (IS_ERR(dest_ref)) + return PTR_ERR(dest_ref); + if (key_ref_to_ptr(dest_ref)->type != &key_type_keyring) { + ret = -ENOTDIR; + goto out_put_dest; + } + + ret = key_get_persistent(ns, uid, dest_ref); + +out_put_dest: + key_ref_put(dest_ref); + return ret; +} diff --git a/security/keys/proc.c b/security/keys/proc.c index 217b6855e81..88e9a466940 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -182,7 +182,6 @@ static void proc_keys_stop(struct seq_file *p, void *v) static int proc_keys_show(struct seq_file *m, void *v) { - const struct cred *cred = current_cred(); struct rb_node *_p = v; struct key *key = rb_entry(_p, struct key, serial_node); struct timespec now; @@ -191,15 +190,23 @@ static int proc_keys_show(struct seq_file *m, void *v) char xbuf[12]; int rc; + struct keyring_search_context ctx = { + .index_key.type = key->type, + .index_key.description = key->description, + .cred = current_cred(), + .match = lookup_user_key_possessed, + .match_data = key, + .flags = (KEYRING_SEARCH_NO_STATE_CHECK | + KEYRING_SEARCH_LOOKUP_DIRECT), + }; + key_ref = make_key_ref(key, 0); /* determine if the key is possessed by this process (a test we can * skip if the key does not indicate the possessor can view it */ if (key->perm & KEY_POS_VIEW) { - skey_ref = search_my_process_keyrings(key->type, key, - lookup_user_key_possessed, - true, cred); + skey_ref = search_my_process_keyrings(&ctx); if (!IS_ERR(skey_ref)) { key_ref_put(skey_ref); key_ref = make_key_ref(key, 1); @@ -211,7 +218,7 @@ static int proc_keys_show(struct seq_file *m, void *v) * - the caller holds a spinlock, and thus the RCU read lock, making our * access to __current_cred() safe */ - rc = key_task_permission(key_ref, cred, KEY_VIEW); + rc = key_task_permission(key_ref, ctx.cred, KEY_VIEW); if (rc < 0) return 0; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 42defae1e16..0cf8a130a26 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -235,7 +235,7 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { - atomic_inc(&keyring->usage); + __key_get(keyring); } /* install the keyring */ @@ -319,11 +319,7 @@ void key_fsgid_changed(struct task_struct *tsk) * In the case of a successful return, the possession attribute is set on the * returned key reference. 
*/ -key_ref_t search_my_process_keyrings(struct key_type *type, - const void *description, - key_match_func_t match, - bool no_state_check, - const struct cred *cred) +key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx) { key_ref_t key_ref, ret, err; @@ -339,10 +335,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type, err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ - if (cred->thread_keyring) { + if (ctx->cred->thread_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->thread_keyring, 1), - cred, type, description, match, no_state_check); + make_key_ref(ctx->cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; @@ -358,10 +353,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (cred->process_keyring) { + if (ctx->cred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->process_keyring, 1), - cred, type, description, match, no_state_check); + make_key_ref(ctx->cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; @@ -379,11 +373,11 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (cred->session_keyring) { + if (ctx->cred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( - make_key_ref(rcu_dereference(cred->session_keyring), 1), - cred, type, description, match, no_state_check); + make_key_ref(rcu_dereference(ctx->cred->session_keyring), 1), + ctx); rcu_read_unlock(); if (!IS_ERR(key_ref)) @@ -402,10 +396,10 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } } /* or search the user-session keyring */ - else if (cred->user->session_keyring) { + else if (ctx->cred->user->session_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->user->session_keyring, 1), - cred, type, description, match, no_state_check); + make_key_ref(ctx->cred->user->session_keyring, 1), + ctx); if (!IS_ERR(key_ref)) goto found; @@ -437,18 +431,14 @@ found: * * Return same as search_my_process_keyrings(). 
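+ *
+ * If the caller is a request_key() helper, the function below also recurses
+ * into the requesting process's keyrings, temporarily swapping ctx->cred over
+ * to the requester's credentials (rka->cred) for the nested call and
+ * restoring it afterwards.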
*/ -key_ref_t search_process_keyrings(struct key_type *type, - const void *description, - key_match_func_t match, - const struct cred *cred) +key_ref_t search_process_keyrings(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; might_sleep(); - key_ref = search_my_process_keyrings(type, description, match, - false, cred); + key_ref = search_my_process_keyrings(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; @@ -457,18 +447,21 @@ key_ref_t search_process_keyrings(struct key_type *type, * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ - if (cred->request_key_auth && - cred == current_cred() && - type != &key_type_request_key_auth + if (ctx->cred->request_key_auth && + ctx->cred == current_cred() && + ctx->index_key.type != &key_type_request_key_auth ) { + const struct cred *cred = ctx->cred; + /* defend against the auth key being revoked */ down_read(&cred->request_key_auth->sem); - if (key_validate(cred->request_key_auth) == 0) { - rka = cred->request_key_auth->payload.data; + if (key_validate(ctx->cred->request_key_auth) == 0) { + rka = ctx->cred->request_key_auth->payload.data; - key_ref = search_process_keyrings(type, description, - match, rka->cred); + ctx->cred = rka->cred; + key_ref = search_process_keyrings(ctx); + ctx->cred = cred; up_read(&cred->request_key_auth->sem); @@ -522,19 +515,23 @@ int lookup_user_key_possessed(const struct key *key, const void *target) key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, key_perm_t perm) { + struct keyring_search_context ctx = { + .match = lookup_user_key_possessed, + .flags = (KEYRING_SEARCH_NO_STATE_CHECK | + KEYRING_SEARCH_LOOKUP_DIRECT), + }; struct request_key_auth *rka; - const struct cred *cred; struct key *key; key_ref_t key_ref, skey_ref; int ret; try_again: - cred = get_current_cred(); + ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: - if (!cred->thread_keyring) { + if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; @@ -546,13 +543,13 @@ try_again: goto reget_creds; } - key = cred->thread_keyring; - atomic_inc(&key->usage); + key = ctx.cred->thread_keyring; + __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: - if (!cred->process_keyring) { + if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; @@ -564,13 +561,13 @@ try_again: goto reget_creds; } - key = cred->process_keyring; - atomic_inc(&key->usage); + key = ctx.cred->process_keyring; + __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!cred->session_keyring) { + if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -580,13 +577,13 @@ try_again: ret = join_session_keyring(NULL); else ret = install_session_keyring( - cred->user->session_keyring); + ctx.cred->user->session_keyring); if (ret < 0) goto error; goto reget_creds; - } else if (cred->session_keyring == - cred->user->session_keyring && + } else if (ctx.cred->session_keyring == + ctx.cred->user->session_keyring && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) @@ -595,33 +592,33 @@ try_again: } rcu_read_lock(); - key = rcu_dereference(cred->session_keyring); - atomic_inc(&key->usage); + key = rcu_dereference(ctx.cred->session_keyring); + __key_get(key); 
	rcu_read_unlock();
	key_ref = make_key_ref(key, 1);
	break;

case KEY_SPEC_USER_KEYRING:
-	if (!cred->user->uid_keyring) {
+	if (!ctx.cred->user->uid_keyring) {
		ret = install_user_keyrings();
		if (ret < 0)
			goto error;
	}

-	key = cred->user->uid_keyring;
-	atomic_inc(&key->usage);
+	key = ctx.cred->user->uid_keyring;
+	__key_get(key);
	key_ref = make_key_ref(key, 1);
	break;

case KEY_SPEC_USER_SESSION_KEYRING:
-	if (!cred->user->session_keyring) {
+	if (!ctx.cred->user->session_keyring) {
		ret = install_user_keyrings();
		if (ret < 0)
			goto error;
	}

-	key = cred->user->session_keyring;
-	atomic_inc(&key->usage);
+	key = ctx.cred->user->session_keyring;
+	__key_get(key);
	key_ref = make_key_ref(key, 1);
	break;

@@ -631,29 +628,29 @@ try_again:
	goto error;

case KEY_SPEC_REQKEY_AUTH_KEY:
-	key = cred->request_key_auth;
+	key = ctx.cred->request_key_auth;
	if (!key)
		goto error;

-	atomic_inc(&key->usage);
+	__key_get(key);
	key_ref = make_key_ref(key, 1);
	break;

case KEY_SPEC_REQUESTOR_KEYRING:
-	if (!cred->request_key_auth)
+	if (!ctx.cred->request_key_auth)
		goto error;

-	down_read(&cred->request_key_auth->sem);
+	down_read(&ctx.cred->request_key_auth->sem);
	if (test_bit(KEY_FLAG_REVOKED,
-		     &cred->request_key_auth->flags)) {
+		     &ctx.cred->request_key_auth->flags)) {
		key_ref = ERR_PTR(-EKEYREVOKED);
		key = NULL;
	} else {
-		rka = cred->request_key_auth->payload.data;
+		rka = ctx.cred->request_key_auth->payload.data;
		key = rka->dest_keyring;
-		atomic_inc(&key->usage);
+		__key_get(key);
	}
-	up_read(&cred->request_key_auth->sem);
+	up_read(&ctx.cred->request_key_auth->sem);
	if (!key)
		goto error;
	key_ref = make_key_ref(key, 1);

@@ -673,9 +670,13 @@ try_again:
	key_ref = make_key_ref(key, 0);

	/* check to see if we possess the key */
-	skey_ref = search_process_keyrings(key->type, key,
-					   lookup_user_key_possessed,
-					   cred);
+	ctx.index_key.type = key->type;
+	ctx.index_key.description = key->description;
+	ctx.index_key.desc_len = strlen(key->description);
+	ctx.match_data = key;
+	kdebug("check possessed");
+	skey_ref = search_process_keyrings(&ctx);
+	kdebug("possessed=%p", skey_ref);

	if (!IS_ERR(skey_ref)) {
		key_put(key);

@@ -715,14 +716,14 @@ try_again:
		goto invalid_key;

	/* check the permissions */
-	ret = key_task_permission(key_ref, cred, perm);
+	ret = key_task_permission(key_ref, ctx.cred, perm);
	if (ret < 0)
		goto invalid_key;

	key->last_used_at = current_kernel_time().tv_sec;

error:
-	put_cred(cred);
+	put_cred(ctx.cred);
	return key_ref;

invalid_key:
@@ -733,7 +734,7 @@ invalid_key:
	/* if we attempted to install a keyring, then it may have caused new
	 * creds to be installed */
reget_creds:
-	put_cred(cred);
+	put_cred(ctx.cred);
	goto try_again;
}

@@ -856,3 +857,13 @@ void key_change_session_keyring(struct callback_head *twork)
	commit_creds(new);
}
+
+/*
+ * Make sure that root's user and user-session keyrings exist.
+ */
+static int __init init_root_keyring(void)
+{
+	return install_user_keyrings();
+}
+
+late_initcall(init_root_keyring);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index c411f9bb156..381411941cc 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -345,33 +345,34 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 * May return a key that's already under construction instead if there was a
 * race between two threads calling request_key().
*/ -static int construct_alloc_key(struct key_type *type, - const char *description, +static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, struct key **_key) { - const struct cred *cred = current_cred(); - unsigned long prealloc; + struct assoc_array_edit *edit; struct key *key; key_perm_t perm; key_ref_t key_ref; int ret; - kenter("%s,%s,,,", type->name, description); + kenter("%s,%s,,,", + ctx->index_key.type->name, ctx->index_key.description); *_key = NULL; mutex_lock(&user->cons_lock); perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; - if (type->read) + if (ctx->index_key.type->read) perm |= KEY_POS_READ; - if (type == &key_type_keyring || type->update) + if (ctx->index_key.type == &key_type_keyring || + ctx->index_key.type->update) perm |= KEY_POS_WRITE; - key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, + key = key_alloc(ctx->index_key.type, ctx->index_key.description, + ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, perm, flags); if (IS_ERR(key)) goto alloc_failed; @@ -379,8 +380,7 @@ static int construct_alloc_key(struct key_type *type, set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { - ret = __key_link_begin(dest_keyring, type, description, - &prealloc); + ret = __key_link_begin(dest_keyring, &ctx->index_key, &edit); if (ret < 0) goto link_prealloc_failed; } @@ -390,16 +390,16 @@ static int construct_alloc_key(struct key_type *type, * waited for locks */ mutex_lock(&key_construction_mutex); - key_ref = search_process_keyrings(type, description, type->match, cred); + key_ref = search_process_keyrings(ctx); if (!IS_ERR(key_ref)) goto key_already_present; if (dest_keyring) - __key_link(dest_keyring, key, &prealloc); + __key_link(key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) - __key_link_end(dest_keyring, type, prealloc); + __key_link_end(dest_keyring, &ctx->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); @@ -414,8 +414,8 @@ key_already_present: if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) - __key_link(dest_keyring, key, &prealloc); - __key_link_end(dest_keyring, type, prealloc); + __key_link(key, &edit); + __key_link_end(dest_keyring, &ctx->index_key, edit); if (ret < 0) goto link_check_failed; } @@ -444,8 +444,7 @@ alloc_failed: /* * Commence key construction. 
*/ -static struct key *construct_key_and_link(struct key_type *type, - const char *description, +static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, @@ -464,8 +463,7 @@ static struct key *construct_key_and_link(struct key_type *type, construct_get_dest_keyring(&dest_keyring); - ret = construct_alloc_key(type, description, dest_keyring, flags, user, - &key); + ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { @@ -529,17 +527,24 @@ struct key *request_key_and_link(struct key_type *type, struct key *dest_keyring, unsigned long flags) { - const struct cred *cred = current_cred(); + struct keyring_search_context ctx = { + .index_key.type = type, + .index_key.description = description, + .cred = current_cred(), + .match = type->match, + .match_data = description, + .flags = KEYRING_SEARCH_LOOKUP_DIRECT, + }; struct key *key; key_ref_t key_ref; int ret; kenter("%s,%s,%p,%zu,%p,%p,%lx", - type->name, description, callout_info, callout_len, aux, - dest_keyring, flags); + ctx.index_key.type->name, ctx.index_key.description, + callout_info, callout_len, aux, dest_keyring, flags); /* search all the process keyrings for a key */ - key_ref = search_process_keyrings(type, description, type->match, cred); + key_ref = search_process_keyrings(&ctx); if (!IS_ERR(key_ref)) { key = key_ref_to_ptr(key_ref); @@ -562,9 +567,8 @@ struct key *request_key_and_link(struct key_type *type, if (!callout_info) goto error; - key = construct_key_and_link(type, description, callout_info, - callout_len, aux, dest_keyring, - flags); + key = construct_key_and_link(&ctx, callout_info, callout_len, + aux, dest_keyring, flags); } error: @@ -592,8 +596,10 @@ int wait_for_key_construction(struct key *key, bool intr) intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret < 0) return ret; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + smp_rmb(); return key->type_data.reject_error; + } return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 85730d5a5a5..7495a93b4b9 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include "internal.h" +#include <keys/user-type.h> static int request_key_auth_instantiate(struct key *, struct key_preparsed_payload *); @@ -222,32 +223,26 @@ error_alloc: } /* - * See if an authorisation key is associated with a particular key. - */ -static int key_get_instantiation_authkey_match(const struct key *key, - const void *_id) -{ - struct request_key_auth *rka = key->payload.data; - key_serial_t id = (key_serial_t)(unsigned long) _id; - - return rka->target_key->serial == id; -} - -/* * Search the current process's keyrings for the authorisation key for * instantiation of a key. 
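+ *
+ * No bespoke match function is needed any more: an authorisation key is
+ * described by the target key's serial number rendered in hex, so the type's
+ * default direct lookup can find it.  The description is built as in the code
+ * below:
+ *
+ *	char desc[16];
+ *	sprintf(desc, "%x", target_id);
+ *
+ * so that, for example, target serial 0x12ab is looked up as "12ab".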
*/ struct key *key_get_instantiation_authkey(key_serial_t target_id) { - const struct cred *cred = current_cred(); + char description[16]; + struct keyring_search_context ctx = { + .index_key.type = &key_type_request_key_auth, + .index_key.description = description, + .cred = current_cred(), + .match = user_match, + .match_data = description, + .flags = KEYRING_SEARCH_LOOKUP_DIRECT, + }; struct key *authkey; key_ref_t authkey_ref; - authkey_ref = search_process_keyrings( - &key_type_request_key_auth, - (void *) (unsigned long) target_id, - key_get_instantiation_authkey_match, - cred); + sprintf(description, "%x", target_id); + + authkey_ref = search_process_keyrings(&ctx); if (IS_ERR(authkey_ref)) { authkey = ERR_CAST(authkey_ref); diff --git a/security/keys/sysctl.c b/security/keys/sysctl.c index ee32d181764..8c0af08760c 100644 --- a/security/keys/sysctl.c +++ b/security/keys/sysctl.c @@ -61,5 +61,16 @@ ctl_table key_sysctls[] = { .extra1 = (void *) &zero, .extra2 = (void *) &max, }, +#ifdef CONFIG_PERSISTENT_KEYRINGS + { + .procname = "persistent_keyring_expiry", + .data = &persistent_keyring_expiry, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *) &zero, + .extra2 = (void *) &max, + }, +#endif { } }; diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index 55dc8893918..faa2caeb593 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -25,14 +25,15 @@ static int logon_vet_description(const char *desc); * arbitrary blob of data as the payload */ struct key_type key_type_user = { - .name = "user", - .instantiate = user_instantiate, - .update = user_update, - .match = user_match, - .revoke = user_revoke, - .destroy = user_destroy, - .describe = user_describe, - .read = user_read, + .name = "user", + .def_lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, + .instantiate = user_instantiate, + .update = user_update, + .match = user_match, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = user_describe, + .read = user_read, }; EXPORT_SYMBOL_GPL(key_type_user); @@ -45,6 +46,7 @@ EXPORT_SYMBOL_GPL(key_type_user); */ struct key_type key_type_logon = { .name = "logon", + .def_lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .instantiate = user_instantiate, .update = user_update, .match = user_match, diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 234bc2ab450..9a62045e628 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -397,7 +397,8 @@ void common_lsm_audit(struct common_audit_data *a, if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ - ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC); + ab = audit_log_start(current->audit_context, GFP_ATOMIC | __GFP_NOWARN, + AUDIT_AVC); if (ab == NULL) return; diff --git a/security/security.c b/security/security.c index 4dc31f4f270..15b6928592e 100644 --- a/security/security.c +++ b/security/security.c @@ -1340,22 +1340,17 @@ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) return security_ops->xfrm_policy_delete_security(ctx); } -int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +int security_xfrm_state_alloc(struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx) { - return security_ops->xfrm_state_alloc_security(x, sec_ctx, 0); + return security_ops->xfrm_state_alloc(x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { - if (!polsec) - 
return 0; - /* - * We want the context to be taken from secid which is usually - * from the sock. - */ - return security_ops->xfrm_state_alloc_security(x, NULL, secid); + return security_ops->xfrm_state_alloc_acquire(x, polsec, secid); } int security_xfrm_state_delete(struct xfrm_state *x) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c540795fb3f..794c3ca49ea 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -95,7 +95,9 @@ #include "audit.h" #include "avc_ss.h" -#define NUM_SEL_MNT_OPTS 5 +#define SB_TYPE_FMT "%s%s%s" +#define SB_SUBTYPE(sb) (sb->s_subtype && sb->s_subtype[0]) +#define SB_TYPE_ARGS(sb) sb->s_type->name, SB_SUBTYPE(sb) ? "." : "", SB_SUBTYPE(sb) ? sb->s_subtype : "" extern struct security_operations *security_ops; @@ -139,12 +141,28 @@ static struct kmem_cache *sel_inode_cache; * This function checks the SECMARK reference counter to see if any SECMARK * targets are currently configured, if the reference counter is greater than * zero SECMARK is considered to be enabled. Returns true (1) if SECMARK is - * enabled, false (0) if SECMARK is disabled. + * enabled, false (0) if SECMARK is disabled. If the always_check_network + * policy capability is enabled, SECMARK is always considered enabled. * */ static int selinux_secmark_enabled(void) { - return (atomic_read(&selinux_secmark_refcount) > 0); + return (selinux_policycap_alwaysnetwork || atomic_read(&selinux_secmark_refcount)); +} + +/** + * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled + * + * Description: + * This function checks if NetLabel or labeled IPSEC is enabled. Returns true + * (1) if any are enabled or false (0) if neither are enabled. If the + * always_check_network policy capability is enabled, peer labeling + * is always considered enabled. + * + */ +static int selinux_peerlbl_enabled(void) +{ + return (selinux_policycap_alwaysnetwork || netlbl_enabled() || selinux_xfrm_enabled()); } /* @@ -309,8 +327,11 @@ enum { Opt_defcontext = 3, Opt_rootcontext = 4, Opt_labelsupport = 5, + Opt_nextmntopt = 6, }; +#define NUM_SEL_MNT_OPTS (Opt_nextmntopt - 1) + static const match_table_t tokens = { {Opt_context, CONTEXT_STR "%s"}, {Opt_fscontext, FSCONTEXT_STR "%s"}, @@ -355,6 +376,29 @@ static int may_context_mount_inode_relabel(u32 sid, return rc; } +static int selinux_is_sblabel_mnt(struct super_block *sb) +{ + struct superblock_security_struct *sbsec = sb->s_security; + + if (sbsec->behavior == SECURITY_FS_USE_XATTR || + sbsec->behavior == SECURITY_FS_USE_TRANS || + sbsec->behavior == SECURITY_FS_USE_TASK) + return 1; + + /* Special handling for sysfs. Is genfs but also has setxattr handler*/ + if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0) + return 1; + + /* + * Special handling for rootfs. Is genfs but supports + * setting SELinux context on in-core inodes. + */ + if (strncmp(sb->s_type->name, "rootfs", sizeof("rootfs")) == 0) + return 1; + + return 0; +} + static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; @@ -369,8 +413,8 @@ static int sb_finish_set_opts(struct super_block *sb) the first boot of the SELinux kernel before we have assigned xattr values to the filesystem. 
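Both predicates above follow the same shape: a feature's checks are considered enabled either because the feature is genuinely in use (SECMARK targets configured, NetLabel or labeled IPsec active) or because the always_check_network policy capability forces them on unconditionally. A hedged sketch of that capability-override pattern (userspace C, illustrative names):

    #include <stdbool.h>
    #include <stdio.h>

    static bool policycap_alwaysnetwork;  /* loaded from the policy */
    static int secmark_refcount;          /* actual users of SECMARK */

    static bool secmark_enabled(void)
    {
            /* forced on by the capability, or genuinely in use */
            return policycap_alwaysnetwork || secmark_refcount > 0;
    }

    int main(void)
    {
            policycap_alwaysnetwork = true;
            printf("secmark: %d\n", secmark_enabled());
            return 0;
    }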
*/ if (!root_inode->i_op->getxattr) { - printk(KERN_WARNING "SELinux: (dev %s, type %s) has no " - "xattr support\n", sb->s_id, sb->s_type->name); + printk(KERN_WARNING "SELinux: (dev %s, type "SB_TYPE_FMT") has no " + "xattr support\n", sb->s_id, SB_TYPE_ARGS(sb)); rc = -EOPNOTSUPP; goto out; } @@ -378,35 +422,27 @@ static int sb_finish_set_opts(struct super_block *sb) if (rc < 0 && rc != -ENODATA) { if (rc == -EOPNOTSUPP) printk(KERN_WARNING "SELinux: (dev %s, type " - "%s) has no security xattr handler\n", - sb->s_id, sb->s_type->name); + SB_TYPE_FMT") has no security xattr handler\n", + sb->s_id, SB_TYPE_ARGS(sb)); else printk(KERN_WARNING "SELinux: (dev %s, type " - "%s) getxattr errno %d\n", sb->s_id, - sb->s_type->name, -rc); + SB_TYPE_FMT") getxattr errno %d\n", sb->s_id, + SB_TYPE_ARGS(sb), -rc); goto out; } } - sbsec->flags |= (SE_SBINITIALIZED | SE_SBLABELSUPP); - if (sbsec->behavior > ARRAY_SIZE(labeling_behaviors)) - printk(KERN_ERR "SELinux: initialized (dev %s, type %s), unknown behavior\n", - sb->s_id, sb->s_type->name); + printk(KERN_ERR "SELinux: initialized (dev %s, type "SB_TYPE_FMT"), unknown behavior\n", + sb->s_id, SB_TYPE_ARGS(sb)); else - printk(KERN_DEBUG "SELinux: initialized (dev %s, type %s), %s\n", - sb->s_id, sb->s_type->name, + printk(KERN_DEBUG "SELinux: initialized (dev %s, type "SB_TYPE_FMT"), %s\n", + sb->s_id, SB_TYPE_ARGS(sb), labeling_behaviors[sbsec->behavior-1]); - if (sbsec->behavior == SECURITY_FS_USE_GENFS || - sbsec->behavior == SECURITY_FS_USE_MNTPOINT || - sbsec->behavior == SECURITY_FS_USE_NONE || - sbsec->behavior > ARRAY_SIZE(labeling_behaviors)) - sbsec->flags &= ~SE_SBLABELSUPP; - - /* Special handling for sysfs. Is genfs but also has setxattr handler*/ - if (strncmp(sb->s_type->name, "sysfs", sizeof("sysfs")) == 0) - sbsec->flags |= SE_SBLABELSUPP; + sbsec->flags |= SE_SBINITIALIZED; + if (selinux_is_sblabel_mnt(sb)) + sbsec->flags |= SBLABEL_MNT; /* Initialize the root inode. 
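The SB_TYPE_FMT/SB_TYPE_ARGS pair used in these messages prints a filesystem type as "base.subtype" when a subtype is present and as the bare base name otherwise, always consuming exactly three format arguments. A self-contained sketch of the convention (these macros take the strings directly rather than a super_block, purely for illustration):

    #include <stdio.h>

    #define SB_TYPE_FMT "%s%s%s"
    #define SB_HAS_SUB(sub) ((sub) && (sub)[0])
    #define SB_TYPE_ARGS(name, sub) \
            (name), SB_HAS_SUB(sub) ? "." : "", SB_HAS_SUB(sub) ? (sub) : ""

    int main(void)
    {
            printf("type " SB_TYPE_FMT "\n", SB_TYPE_ARGS("fuse", "sshfs"));
            printf("type " SB_TYPE_FMT "\n", SB_TYPE_ARGS("ext4", (char *)0));
            return 0;
    }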
*/ rc = inode_doinit_with_dentry(root_inode, root); @@ -460,15 +496,18 @@ static int selinux_get_mnt_opts(const struct super_block *sb, if (!ss_initialized) return -EINVAL; + /* make sure we always check enough bits to cover the mask */ + BUILD_BUG_ON(SE_MNTMASK >= (1 << NUM_SEL_MNT_OPTS)); + tmp = sbsec->flags & SE_MNTMASK; /* count the number of mount options for this sb */ - for (i = 0; i < 8; i++) { + for (i = 0; i < NUM_SEL_MNT_OPTS; i++) { if (tmp & 0x01) opts->num_mnt_opts++; tmp >>= 1; } /* Check if the Label support flag is set */ - if (sbsec->flags & SE_SBLABELSUPP) + if (sbsec->flags & SBLABEL_MNT) opts->num_mnt_opts++; opts->mnt_opts = kcalloc(opts->num_mnt_opts, sizeof(char *), GFP_ATOMIC); @@ -515,9 +554,9 @@ static int selinux_get_mnt_opts(const struct super_block *sb, opts->mnt_opts[i] = context; opts->mnt_opts_flags[i++] = ROOTCONTEXT_MNT; } - if (sbsec->flags & SE_SBLABELSUPP) { + if (sbsec->flags & SBLABEL_MNT) { opts->mnt_opts[i] = NULL; - opts->mnt_opts_flags[i++] = SE_SBLABELSUPP; + opts->mnt_opts_flags[i++] = SBLABEL_MNT; } BUG_ON(i != opts->num_mnt_opts); @@ -561,7 +600,6 @@ static int selinux_set_mnt_opts(struct super_block *sb, const struct cred *cred = current_cred(); int rc = 0, i; struct superblock_security_struct *sbsec = sb->s_security; - const char *name = sb->s_type->name; struct inode *inode = sbsec->sb->s_root->d_inode; struct inode_security_struct *root_isec = inode->i_security; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; @@ -614,14 +652,14 @@ static int selinux_set_mnt_opts(struct super_block *sb, for (i = 0; i < num_opts; i++) { u32 sid; - if (flags[i] == SE_SBLABELSUPP) + if (flags[i] == SBLABEL_MNT) continue; rc = security_context_to_sid(mount_options[i], strlen(mount_options[i]), &sid); if (rc) { printk(KERN_WARNING "SELinux: security_context_to_sid" - "(%s) failed for (dev %s, type %s) errno=%d\n", - mount_options[i], sb->s_id, name, rc); + "(%s) failed for (dev %s, type "SB_TYPE_FMT") errno=%d\n", + mount_options[i], sb->s_id, SB_TYPE_ARGS(sb), rc); goto out; } switch (flags[i]) { @@ -685,9 +723,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, * Determine the labeling behavior to use for this * filesystem type. */ - rc = security_fs_use((sbsec->flags & SE_SBPROC) ? - "proc" : sb->s_type->name, - &sbsec->behavior, &sbsec->sid); + rc = security_fs_use(sb); if (rc) { printk(KERN_WARNING "%s: security_fs_use(%s) returned %d\n", @@ -770,7 +806,8 @@ out: out_double_mount: rc = -EINVAL; printk(KERN_WARNING "SELinux: mount invalid. 
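Deriving NUM_SEL_MNT_OPTS from Opt_nextmntopt and adding the BUILD_BUG_ON above tie the bit-counting loop to the enum, so adding a mount option can no longer silently leave SE_MNTMASK wider than the loop scans. A compile-time sketch of the same guard using C11 static_assert (enum and mask values mirror the patch but are otherwise illustrative):

    #include <assert.h>
    #include <stdio.h>

    enum { OPT_CONTEXT = 1, OPT_FSCONTEXT, OPT_DEFCONTEXT, OPT_ROOTCONTEXT,
           OPT_LABELSUPPORT, OPT_NEXTMNTOPT };

    #define NUM_MNT_OPTS (OPT_NEXTMNTOPT - 1)
    #define MNTMASK      0x0f

    /* every mask bit must lie below the option count */
    static_assert(MNTMASK < (1 << NUM_MNT_OPTS), "mask wider than option count");

    int main(void)
    {
            int i, count = 0, tmp = MNTMASK;

            for (i = 0; i < NUM_MNT_OPTS; i++, tmp >>= 1)
                    count += tmp & 1;
            printf("%d options set\n", count);
            return 0;
    }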
Same superblock, different " - "security settings for (dev %s, type %s)\n", sb->s_id, name); + "security settings for (dev %s, type "SB_TYPE_FMT")\n", sb->s_id, + SB_TYPE_ARGS(sb)); goto out; } @@ -1037,7 +1074,7 @@ static void selinux_write_opts(struct seq_file *m, case DEFCONTEXT_MNT: prefix = DEFCONTEXT_STR; break; - case SE_SBLABELSUPP: + case SBLABEL_MNT: seq_putc(m, ','); seq_puts(m, LABELSUPP_STR); continue; @@ -1649,7 +1686,7 @@ static int may_create(struct inode *dir, if (rc) return rc; - if (!newsid || !(sbsec->flags & SE_SBLABELSUPP)) { + if (!newsid || !(sbsec->flags & SBLABEL_MNT)) { rc = security_transition_sid(sid, dsec->sid, tclass, &dentry->d_name, &newsid); if (rc) @@ -2437,14 +2474,14 @@ static int selinux_sb_remount(struct super_block *sb, void *data) u32 sid; size_t len; - if (flags[i] == SE_SBLABELSUPP) + if (flags[i] == SBLABEL_MNT) continue; len = strlen(mount_options[i]); rc = security_context_to_sid(mount_options[i], len, &sid); if (rc) { printk(KERN_WARNING "SELinux: security_context_to_sid" - "(%s) failed for (dev %s, type %s) errno=%d\n", - mount_options[i], sb->s_id, sb->s_type->name, rc); + "(%s) failed for (dev %s, type "SB_TYPE_FMT") errno=%d\n", + mount_options[i], sb->s_id, SB_TYPE_ARGS(sb), rc); goto out_free_opts; } rc = -EINVAL; @@ -2482,8 +2519,8 @@ out_free_secdata: return rc; out_bad_option: printk(KERN_WARNING "SELinux: unable to change security options " - "during remount (dev %s, type=%s)\n", sb->s_id, - sb->s_type->name); + "during remount (dev %s, type "SB_TYPE_FMT")\n", sb->s_id, + SB_TYPE_ARGS(sb)); goto out_free_opts; } @@ -2606,7 +2643,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, if ((sbsec->flags & SE_SBINITIALIZED) && (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) newsid = sbsec->mntpoint_sid; - else if (!newsid || !(sbsec->flags & SE_SBLABELSUPP)) { + else if (!newsid || !(sbsec->flags & SBLABEL_MNT)) { rc = security_transition_sid(sid, dsec->sid, inode_mode_to_security_class(inode->i_mode), qstr, &newsid); @@ -2628,7 +2665,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, isec->initialized = 1; } - if (!ss_initialized || !(sbsec->flags & SE_SBLABELSUPP)) + if (!ss_initialized || !(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (name) @@ -2830,7 +2867,7 @@ static int selinux_inode_setxattr(struct dentry *dentry, const char *name, return selinux_inode_setotherxattr(dentry, name); sbsec = inode->i_sb->s_security; - if (!(sbsec->flags & SE_SBLABELSUPP)) + if (!(sbsec->flags & SBLABEL_MNT)) return -EOPNOTSUPP; if (!inode_owner_or_capable(inode)) @@ -3791,8 +3828,12 @@ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid) u32 nlbl_sid; u32 nlbl_type; - selinux_skb_xfrm_sid(skb, &xfrm_sid); - selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); + err = selinux_skb_xfrm_sid(skb, &xfrm_sid); + if (unlikely(err)) + return -EACCES; + err = selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type, &nlbl_sid); + if (unlikely(err)) + return -EACCES; err = security_net_peersid_resolve(nlbl_sid, nlbl_type, xfrm_sid, sid); if (unlikely(err)) { @@ -4246,7 +4287,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) return selinux_sock_rcv_skb_compat(sk, skb, family); secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); + peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return 0; @@ -4628,7 +4669,7 @@ static unsigned int 
selinux_ip_forward(struct sk_buff *skb, int ifindex, secmark_active = selinux_secmark_enabled(); netlbl_active = netlbl_enabled(); - peerlbl_active = netlbl_active || selinux_xfrm_enabled(); + peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; @@ -4780,7 +4821,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex, return NF_ACCEPT; #endif secmark_active = selinux_secmark_enabled(); - peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); + peerlbl_active = selinux_peerlbl_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; @@ -5784,7 +5825,8 @@ static struct security_operations selinux_ops = { .xfrm_policy_clone_security = selinux_xfrm_policy_clone, .xfrm_policy_free_security = selinux_xfrm_policy_free, .xfrm_policy_delete_security = selinux_xfrm_policy_delete, - .xfrm_state_alloc_security = selinux_xfrm_state_alloc, + .xfrm_state_alloc = selinux_xfrm_state_alloc, + .xfrm_state_alloc_acquire = selinux_xfrm_state_alloc_acquire, .xfrm_state_free_security = selinux_xfrm_state_free, .xfrm_state_delete_security = selinux_xfrm_state_delete, .xfrm_policy_lookup = selinux_xfrm_policy_lookup, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index aa47bcabb5f..b1dfe104945 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -58,8 +58,8 @@ struct superblock_security_struct { u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */ - unsigned int behavior; /* labeling behavior */ - unsigned char flags; /* which mount options were specified */ + unsigned short behavior; /* labeling behavior */ + unsigned short flags; /* which mount options were specified */ struct mutex lock; struct list_head isec_head; spinlock_t isec_lock; diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 8fd8e18ea34..fe341ae3700 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -45,14 +45,15 @@ /* Mask for just the mount related flags */ #define SE_MNTMASK 0x0f /* Super block security struct flags for mount options */ +/* BE CAREFUL, these need to be the low order bits for selinux_get_mnt_opts */ #define CONTEXT_MNT 0x01 #define FSCONTEXT_MNT 0x02 #define ROOTCONTEXT_MNT 0x04 #define DEFCONTEXT_MNT 0x08 +#define SBLABEL_MNT 0x10 /* Non-mount related flags */ -#define SE_SBINITIALIZED 0x10 -#define SE_SBPROC 0x20 -#define SE_SBLABELSUPP 0x40 +#define SE_SBINITIALIZED 0x0100 +#define SE_SBPROC 0x0200 #define CONTEXT_STR "context=" #define FSCONTEXT_STR "fscontext=" @@ -68,12 +69,15 @@ extern int selinux_enabled; enum { POLICYDB_CAPABILITY_NETPEER, POLICYDB_CAPABILITY_OPENPERM, + POLICYDB_CAPABILITY_REDHAT1, + POLICYDB_CAPABILITY_ALWAYSNETWORK, __POLICYDB_CAPABILITY_MAX }; #define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) extern int selinux_policycap_netpeer; extern int selinux_policycap_openperm; +extern int selinux_policycap_alwaysnetwork; /* * type_datum properties @@ -172,8 +176,7 @@ int security_get_allow_unknown(void); #define SECURITY_FS_USE_NATIVE 7 /* use native label support */ #define SECURITY_FS_USE_MAX 7 /* Highest SECURITY_FS_USE_XXX */ -int security_fs_use(const char *fstype, unsigned int *behavior, - u32 *sid); +int security_fs_use(struct super_block *sb); int security_genfs_sid(const char *fstype, char *name, u16 sclass, u32 *sid); diff --git 
a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 6713f04e30b..0dec76c64cf 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -10,29 +10,21 @@ #include <net/flow.h> int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx); + struct xfrm_user_sec_ctx *uctx); int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp); void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx); int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx); int selinux_xfrm_state_alloc(struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx, u32 secid); + struct xfrm_user_sec_ctx *uctx); +int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, u32 secid); void selinux_xfrm_state_free(struct xfrm_state *x); int selinux_xfrm_state_delete(struct xfrm_state *x); int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, - struct xfrm_policy *xp, const struct flowi *fl); - -/* - * Extract the security blob from the sock (it's actually on the socket) - */ -static inline struct inode_security_struct *get_sock_isec(struct sock *sk) -{ - if (!sk->sk_socket) - return NULL; - - return SOCK_INODE(sk->sk_socket)->i_security; -} + struct xfrm_policy *xp, + const struct flowi *fl); #ifdef CONFIG_SECURITY_NETWORK_XFRM extern atomic_t selinux_xfrm_refcount; @@ -42,10 +34,10 @@ static inline int selinux_xfrm_enabled(void) return (atomic_read(&selinux_xfrm_refcount) > 0); } -int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb, - struct common_audit_data *ad); -int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct common_audit_data *ad, u8 proto); +int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad); +int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad, u8 proto); int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall); static inline void selinux_xfrm_notify_policyload(void) @@ -64,19 +56,21 @@ static inline int selinux_xfrm_enabled(void) return 0; } -static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, - struct common_audit_data *ad) +static inline int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad) { return 0; } -static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct common_audit_data *ad, u8 proto) +static inline int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad, + u8 proto) { return 0; } -static inline int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall) +static inline int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, + int ckall) { *sid = SECSID_NULL; return 0; @@ -87,10 +81,9 @@ static inline void selinux_xfrm_notify_policyload(void) } #endif -static inline void selinux_skb_xfrm_sid(struct sk_buff *skb, u32 *sid) +static inline int selinux_skb_xfrm_sid(struct sk_buff *skb, u32 *sid) { - int err = selinux_xfrm_decode_session(skb, sid, 0); - BUG_ON(err); + return selinux_xfrm_decode_session(skb, sid, 0); } #endif /* _SELINUX_XFRM_H_ */ diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index da4b8b23328..6235d052338 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -442,8 +442,7 @@ int selinux_netlbl_socket_connect(struct sock *sk, 
struct sockaddr *addr) sksec->nlbl_state != NLBL_CONNLABELED) return 0; - local_bh_disable(); - bh_lock_sock_nested(sk); + lock_sock(sk); /* connected sockets are allowed to disconnect when the address family * is set to AF_UNSPEC, if that is what is happening we want to reset @@ -464,7 +463,6 @@ int selinux_netlbl_socket_connect(struct sock *sk, struct sockaddr *addr) sksec->nlbl_state = NLBL_CONNLABELED; socket_connect_return: - bh_unlock_sock(sk); - local_bh_enable(); + release_sock(sk); return rc; } diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index c5454c0477c..03a72c32afd 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -166,6 +166,7 @@ static void sel_netnode_insert(struct sel_netnode *node) break; default: BUG(); + return; } /* we need to impose a limit on the growth of the hash table so check @@ -225,6 +226,7 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) break; default: BUG(); + ret = -EINVAL; } if (ret != 0) goto out; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 855e464e92e..332ac8a80cf 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -116,6 +116,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] = { AUDIT_MAKE_EQUIV, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, { AUDIT_TTY_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ }, { AUDIT_TTY_SET, NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT }, + { AUDIT_GET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_READ }, + { AUDIT_SET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE }, }; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index ff427733c29..5122affe06a 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -44,7 +44,9 @@ /* Policy capability filenames */ static char *policycap_names[] = { "network_peer_controls", - "open_perms" + "open_perms", + "redhat1", + "always_check_network" }; unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index 30f119b1d1e..820313a04d4 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -213,7 +213,12 @@ netlbl_import_failure: } #endif /* CONFIG_NETLABEL */ -int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2) +/* + * Check to see if all the bits set in e2 are also set in e1. Optionally, + * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed + * last_e2bit. 
+ */ +int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit) { struct ebitmap_node *n1, *n2; int i; @@ -223,14 +228,25 @@ int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2) n1 = e1->node; n2 = e2->node; + while (n1 && n2 && (n1->startbit <= n2->startbit)) { if (n1->startbit < n2->startbit) { n1 = n1->next; continue; } - for (i = 0; i < EBITMAP_UNIT_NUMS; i++) { + for (i = EBITMAP_UNIT_NUMS - 1; (i >= 0) && !n2->maps[i]; ) + i--; /* Skip trailing NULL map entries */ + if (last_e2bit && (i >= 0)) { + u32 lastsetbit = n2->startbit + i * EBITMAP_UNIT_SIZE + + __fls(n2->maps[i]); + if (lastsetbit > last_e2bit) + return 0; + } + + while (i >= 0) { if ((n1->maps[i] & n2->maps[i]) != n2->maps[i]) return 0; + i--; } n1 = n1->next; diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h index 922f8afa89d..712c8a7b8e8 100644 --- a/security/selinux/ss/ebitmap.h +++ b/security/selinux/ss/ebitmap.h @@ -16,7 +16,13 @@ #include <net/netlabel.h> -#define EBITMAP_UNIT_NUMS ((32 - sizeof(void *) - sizeof(u32)) \ +#ifdef CONFIG_64BIT +#define EBITMAP_NODE_SIZE 64 +#else +#define EBITMAP_NODE_SIZE 32 +#endif + +#define EBITMAP_UNIT_NUMS ((EBITMAP_NODE_SIZE-sizeof(void *)-sizeof(u32))\ / sizeof(unsigned long)) #define EBITMAP_UNIT_SIZE BITS_PER_LONG #define EBITMAP_SIZE (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE) @@ -117,7 +123,7 @@ static inline void ebitmap_node_clr_bit(struct ebitmap_node *n, int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2); int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src); -int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2); +int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit); int ebitmap_get_bit(struct ebitmap *e, unsigned long bit); int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value); void ebitmap_destroy(struct ebitmap *e); diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index 40de8d3f208..c85bc1ec040 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -160,8 +160,6 @@ void mls_sid_to_context(struct context *context, int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; - struct ebitmap_node *node; - int i; if (!l->sens || l->sens > p->p_levels.nprim) return 0; @@ -170,19 +168,13 @@ int mls_level_isvalid(struct policydb *p, struct mls_level *l) if (!levdatum) return 0; - ebitmap_for_each_positive_bit(&l->cat, node, i) { - if (i > p->p_cats.nprim) - return 0; - if (!ebitmap_get_bit(&levdatum->level->cat, i)) { - /* - * Category may not be associated with - * sensitivity. - */ - return 0; - } - } - - return 1; + /* + * Return 1 iff all the bits set in l->cat are also be set in + * levdatum->level->cat and no bit in l->cat is larger than + * p->p_cats.nprim. 
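The last_e2bit check above only has to inspect the final node of e2: after skipping trailing empty map words, the highest set bit is the node's startbit plus the word offset plus __fls() of the last non-empty word. A userspace sketch of that arithmetic (GCC's __builtin_clzl stands in for the kernel's __fls(); the printed value assumes 64-bit longs):

    #include <stdio.h>

    #define UNITS 6
    #define UNIT_BITS (8 * (int)sizeof(unsigned long))

    static int fls_ul(unsigned long x)    /* index of highest set bit, x != 0 */
    {
            return UNIT_BITS - 1 - __builtin_clzl(x);
    }

    int main(void)
    {
            unsigned long maps[UNITS] = { 0x1, 0x80, 0, 0, 0, 0 };
            unsigned int startbit = 256;
            int i;

            for (i = UNITS - 1; i >= 0 && !maps[i]; )
                    i--;                  /* skip trailing empty units */
            if (i >= 0)
                    printf("highest set bit: %u\n",
                           startbit + i * UNIT_BITS + fls_ul(maps[i]));
            return 0;
    }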
+ */ + return ebitmap_contains(&levdatum->level->cat, &l->cat, + p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) diff --git a/security/selinux/ss/mls_types.h b/security/selinux/ss/mls_types.h index 03bed52a805..e9364877413 100644 --- a/security/selinux/ss/mls_types.h +++ b/security/selinux/ss/mls_types.h @@ -35,7 +35,7 @@ static inline int mls_level_eq(struct mls_level *l1, struct mls_level *l2) static inline int mls_level_dom(struct mls_level *l1, struct mls_level *l2) { return ((l1->sens >= l2->sens) && - ebitmap_contains(&l1->cat, &l2->cat)); + ebitmap_contains(&l1->cat, &l2->cat, 0)); } #define mls_level_incomp(l1, l2) \ diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index c8adde3aff8..f6195ebde3c 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -3203,9 +3203,8 @@ static int range_write_helper(void *key, void *data, void *ptr) static int range_write(struct policydb *p, void *fp) { - size_t nel; __le32 buf[1]; - int rc; + int rc, nel; struct policy_data pd; pd.p = p; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index b4feecc3fe0..ee470a0b5c2 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -72,6 +72,7 @@ int selinux_policycap_netpeer; int selinux_policycap_openperm; +int selinux_policycap_alwaysnetwork; static DEFINE_RWLOCK(policy_rwlock); @@ -1812,6 +1813,8 @@ static void security_load_policycaps(void) POLICYDB_CAPABILITY_NETPEER); selinux_policycap_openperm = ebitmap_get_bit(&policydb.policycaps, POLICYDB_CAPABILITY_OPENPERM); + selinux_policycap_alwaysnetwork = ebitmap_get_bit(&policydb.policycaps, + POLICYDB_CAPABILITY_ALWAYSNETWORK); } static int security_preserve_bools(struct policydb *p); @@ -2323,43 +2326,74 @@ out: /** * security_fs_use - Determine how to handle labeling for a filesystem. - * @fstype: filesystem type - * @behavior: labeling behavior - * @sid: SID for filesystem (superblock) + * @sb: superblock in question */ -int security_fs_use( - const char *fstype, - unsigned int *behavior, - u32 *sid) +int security_fs_use(struct super_block *sb) { int rc = 0; struct ocontext *c; + struct superblock_security_struct *sbsec = sb->s_security; + const char *fstype = sb->s_type->name; + const char *subtype = (sb->s_subtype && sb->s_subtype[0]) ? sb->s_subtype : NULL; + struct ocontext *base = NULL; read_lock(&policy_rwlock); - c = policydb.ocontexts[OCON_FSUSE]; - while (c) { - if (strcmp(fstype, c->u.name) == 0) + for (c = policydb.ocontexts[OCON_FSUSE]; c; c = c->next) { + char *sub; + int baselen; + + baselen = strlen(fstype); + + /* if base does not match, this is not the one */ + if (strncmp(fstype, c->u.name, baselen)) + continue; + + /* if there is no subtype, this is the one! */ + if (!subtype) + break; + + /* skip past the base in this entry */ + sub = c->u.name + baselen; + + /* entry is only a base. save it. keep looking for subtype */ + if (sub[0] == '\0') { + base = c; + continue; + } + + /* entry is not followed by a subtype, so it is not a match */ + if (sub[0] != '.') + continue; + + /* whew, we found a subtype of this fstype */ + sub++; /* move past '.' 
*/ + + /* exact match of fstype AND subtype */ + if (!strcmp(subtype, sub)) break; - c = c->next; } + /* in case we had found an fstype match but no subtype match */ + if (!c) + c = base; + if (c) { - *behavior = c->v.behavior; + sbsec->behavior = c->v.behavior; if (!c->sid[0]) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if (rc) goto out; } - *sid = c->sid[0]; + sbsec->sid = c->sid[0]; } else { - rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, sid); + rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, &sbsec->sid); if (rc) { - *behavior = SECURITY_FS_USE_NONE; + sbsec->behavior = SECURITY_FS_USE_NONE; rc = 0; } else { - *behavior = SECURITY_FS_USE_GENFS; + sbsec->behavior = SECURITY_FS_USE_GENFS; } } diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index d0308188621..a91d205ec0c 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -56,7 +56,7 @@ atomic_t selinux_xfrm_refcount = ATOMIC_INIT(0); /* - * Returns true if an LSM/SELinux context + * Returns true if the context is an LSM/SELinux context. */ static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx) { @@ -66,7 +66,7 @@ static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx) } /* - * Returns true if the xfrm contains a security blob for SELinux + * Returns true if the xfrm contains a security blob for SELinux. */ static inline int selinux_authorizable_xfrm(struct xfrm_state *x) { @@ -74,48 +74,111 @@ static inline int selinux_authorizable_xfrm(struct xfrm_state *x) } /* - * LSM hook implementation that authorizes that a flow can use - * a xfrm policy rule. + * Allocates a xfrm_sec_state and populates it using the supplied security + * xfrm_user_sec_ctx context. */ -int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) +static int selinux_xfrm_alloc_user(struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *uctx) { int rc; - u32 sel_sid; + const struct task_security_struct *tsec = current_security(); + struct xfrm_sec_ctx *ctx = NULL; + u32 str_len; - /* Context sid is either set to label or ANY_ASSOC */ - if (ctx) { - if (!selinux_authorizable_ctx(ctx)) - return -EINVAL; - - sel_sid = ctx->ctx_sid; - } else - /* - * All flows should be treated as polmatch'ing an - * otherwise applicable "non-labeled" policy. This - * would prevent inadvertent "leaks". - */ - return 0; + if (ctxp == NULL || uctx == NULL || + uctx->ctx_doi != XFRM_SC_DOI_LSM || + uctx->ctx_alg != XFRM_SC_ALG_SELINUX) + return -EINVAL; - rc = avc_has_perm(fl_secid, sel_sid, SECCLASS_ASSOCIATION, - ASSOCIATION__POLMATCH, - NULL); + str_len = uctx->ctx_len; + if (str_len >= PAGE_SIZE) + return -ENOMEM; - if (rc == -EACCES) - return -ESRCH; + ctx = kmalloc(sizeof(*ctx) + str_len + 1, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->ctx_doi = XFRM_SC_DOI_LSM; + ctx->ctx_alg = XFRM_SC_ALG_SELINUX; + ctx->ctx_len = str_len; + memcpy(ctx->ctx_str, &uctx[1], str_len); + ctx->ctx_str[str_len] = '\0'; + rc = security_context_to_sid(ctx->ctx_str, str_len, &ctx->ctx_sid); + if (rc) + goto err; + + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, NULL); + if (rc) + goto err; + + *ctxp = ctx; + atomic_inc(&selinux_xfrm_refcount); + return 0; + +err: + kfree(ctx); return rc; } /* + * Free the xfrm_sec_ctx structure. + */ +static void selinux_xfrm_free(struct xfrm_sec_ctx *ctx) +{ + if (!ctx) + return; + + atomic_dec(&selinux_xfrm_refcount); + kfree(ctx); +} + +/* + * Authorize the deletion of a labeled SA or policy rule. 
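The scan in security_fs_use() above implements a two-level match: an entry naming exactly "fstype.subtype" wins, and a bare "fstype" entry is remembered as a fallback in case no subtype entry exists. A compact userspace model of the rule (the entry table is made up for illustration):

    #include <stdio.h>
    #include <string.h>

    static const char *entries[] = { "fuse", "fuse.sshfs", "ext4" };

    static const char *fs_use_lookup(const char *fstype, const char *subtype)
    {
            const char *base = NULL;
            size_t baselen = strlen(fstype);
            size_t i;

            for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
                    const char *name = entries[i];
                    const char *sub;

                    if (strncmp(fstype, name, baselen))
                            continue;            /* base does not match */
                    sub = name + baselen;
                    if (!subtype) {
                            if (!*sub)
                                    return name; /* bare base is the match */
                            continue;
                    }
                    if (!*sub) {
                            base = name;         /* remember the bare base */
                            continue;
                    }
                    if (*sub == '.' && !strcmp(subtype, sub + 1))
                            return name;         /* exact base.subtype match */
            }
            return base;
    }

    int main(void)
    {
            printf("%s\n", fs_use_lookup("fuse", "sshfs"));      /* fuse.sshfs */
            printf("%s\n", fs_use_lookup("fuse", "glusterfs"));  /* fuse */
            return 0;
    }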
+ */ +static int selinux_xfrm_delete(struct xfrm_sec_ctx *ctx) +{ + const struct task_security_struct *tsec = current_security(); + + if (!ctx) + return 0; + + return avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, + NULL); +} + +/* + * LSM hook implementation that authorizes that a flow can use a xfrm policy + * rule. + */ +int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) +{ + int rc; + + /* All flows should be treated as polmatch'ing an otherwise applicable + * "non-labeled" policy. This would prevent inadvertent "leaks". */ + if (!ctx) + return 0; + + /* Context sid is either set to label or ANY_ASSOC */ + if (!selinux_authorizable_ctx(ctx)) + return -EINVAL; + + rc = avc_has_perm(fl_secid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__POLMATCH, NULL); + return (rc == -EACCES ? -ESRCH : rc); +} + +/* * LSM hook implementation that authorizes that a state matches * the given policy, flow combo. */ - -int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl) +int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, + struct xfrm_policy *xp, + const struct flowi *fl) { u32 state_sid; - int rc; if (!xp->security) if (x->security) @@ -138,187 +201,80 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy * if (fl->flowi_secid != state_sid) return 0; - rc = avc_has_perm(fl->flowi_secid, state_sid, SECCLASS_ASSOCIATION, - ASSOCIATION__SENDTO, - NULL)? 0:1; - - /* - * We don't need a separate SA Vs. policy polmatch check - * since the SA is now of the same label as the flow and - * a flow Vs. policy polmatch check had already happened - * in selinux_xfrm_policy_lookup() above. - */ - - return rc; + /* We don't need a separate SA Vs. policy polmatch check since the SA + * is now of the same label as the flow and a flow Vs. policy polmatch + * check had already happened in selinux_xfrm_policy_lookup() above. */ + return (avc_has_perm(fl->flowi_secid, state_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, + NULL) ? 0 : 1); } /* * LSM hook implementation that checks and/or returns the xfrm sid for the * incoming packet. 
*/ - int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall) { + u32 sid_session = SECSID_NULL; struct sec_path *sp; - *sid = SECSID_NULL; - if (skb == NULL) - return 0; + goto out; sp = skb->sp; if (sp) { - int i, sid_set = 0; + int i; - for (i = sp->len-1; i >= 0; i--) { + for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; if (selinux_authorizable_xfrm(x)) { struct xfrm_sec_ctx *ctx = x->security; - if (!sid_set) { - *sid = ctx->ctx_sid; - sid_set = 1; - + if (sid_session == SECSID_NULL) { + sid_session = ctx->ctx_sid; if (!ckall) - break; - } else if (*sid != ctx->ctx_sid) + goto out; + } else if (sid_session != ctx->ctx_sid) { + *sid = SECSID_NULL; return -EINVAL; + } } } } - return 0; -} - -/* - * Security blob allocation for xfrm_policy and xfrm_state - * CTX does not have a meaningful value on input - */ -static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *uctx, u32 sid) -{ - int rc = 0; - const struct task_security_struct *tsec = current_security(); - struct xfrm_sec_ctx *ctx = NULL; - char *ctx_str = NULL; - u32 str_len; - - BUG_ON(uctx && sid); - - if (!uctx) - goto not_from_user; - - if (uctx->ctx_alg != XFRM_SC_ALG_SELINUX) - return -EINVAL; - - str_len = uctx->ctx_len; - if (str_len >= PAGE_SIZE) - return -ENOMEM; - - *ctxp = ctx = kmalloc(sizeof(*ctx) + - str_len + 1, - GFP_KERNEL); - - if (!ctx) - return -ENOMEM; - - ctx->ctx_doi = uctx->ctx_doi; - ctx->ctx_len = str_len; - ctx->ctx_alg = uctx->ctx_alg; - - memcpy(ctx->ctx_str, - uctx+1, - str_len); - ctx->ctx_str[str_len] = 0; - rc = security_context_to_sid(ctx->ctx_str, - str_len, - &ctx->ctx_sid); - - if (rc) - goto out; - - /* - * Does the subject have permission to set security context? - */ - rc = avc_has_perm(tsec->sid, ctx->ctx_sid, - SECCLASS_ASSOCIATION, - ASSOCIATION__SETCONTEXT, NULL); - if (rc) - goto out; - - return rc; - -not_from_user: - rc = security_sid_to_context(sid, &ctx_str, &str_len); - if (rc) - goto out; - - *ctxp = ctx = kmalloc(sizeof(*ctx) + - str_len, - GFP_ATOMIC); - - if (!ctx) { - rc = -ENOMEM; - goto out; - } - - ctx->ctx_doi = XFRM_SC_DOI_LSM; - ctx->ctx_alg = XFRM_SC_ALG_SELINUX; - ctx->ctx_sid = sid; - ctx->ctx_len = str_len; - memcpy(ctx->ctx_str, - ctx_str, - str_len); - - goto out2; - out: - *ctxp = NULL; - kfree(ctx); -out2: - kfree(ctx_str); - return rc; + *sid = sid_session; + return 0; } /* - * LSM hook implementation that allocs and transfers uctx spec to - * xfrm_policy. + * LSM hook implementation that allocs and transfers uctx spec to xfrm_policy. */ int selinux_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx) { - int err; - - BUG_ON(!uctx); - - err = selinux_xfrm_sec_ctx_alloc(ctxp, uctx, 0); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); - - return err; + return selinux_xfrm_alloc_user(ctxp, uctx); } - /* - * LSM hook implementation that copies security data structure from old to - * new for policy cloning. + * LSM hook implementation that copies security data structure from old to new + * for policy cloning. 
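Rewritten this way, selinux_xfrm_decode_session() walks the security path from the innermost transform outward, adopts the SID of the first labeled SA, and, when ckall is set, fails if any other labeled SA disagrees. A userspace sketch of the rule, with the SAs reduced to an array of SIDs (0 meaning unlabeled):

    #include <stdio.h>

    #define SECSID_NULL 0u

    static int decode_session(const unsigned int *xfrm_sids, int len,
                              unsigned int *sid, int ckall)
    {
            unsigned int session = SECSID_NULL;
            int i;

            for (i = len - 1; i >= 0; i--) {
                    if (!xfrm_sids[i])
                            continue;            /* unlabeled transform */
                    if (session == SECSID_NULL) {
                            session = xfrm_sids[i];
                            if (!ckall)
                                    break;
                    } else if (session != xfrm_sids[i]) {
                            *sid = SECSID_NULL;
                            return -1;           /* inconsistent labels */
                    }
            }
            *sid = session;
            return 0;
    }

    int main(void)
    {
            unsigned int sids[] = { 42, 0, 42 };
            unsigned int sid;

            if (!decode_session(sids, 3, &sid, 1))
                    printf("session sid %u\n", sid);
            return 0;
    }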
*/ int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { struct xfrm_sec_ctx *new_ctx; - if (old_ctx) { - new_ctx = kmalloc(sizeof(*old_ctx) + old_ctx->ctx_len, - GFP_ATOMIC); - if (!new_ctx) - return -ENOMEM; + if (!old_ctx) + return 0; + + new_ctx = kmemdup(old_ctx, sizeof(*old_ctx) + old_ctx->ctx_len, + GFP_ATOMIC); + if (!new_ctx) + return -ENOMEM; + atomic_inc(&selinux_xfrm_refcount); + *new_ctxp = new_ctx; - memcpy(new_ctx, old_ctx, sizeof(*new_ctx)); - memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len); - atomic_inc(&selinux_xfrm_refcount); - *new_ctxp = new_ctx; - } return 0; } @@ -327,8 +283,7 @@ int selinux_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, */ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { - atomic_dec(&selinux_xfrm_refcount); - kfree(ctx); + selinux_xfrm_free(ctx); } /* @@ -336,31 +291,55 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx) */ int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { - const struct task_security_struct *tsec = current_security(); - - if (!ctx) - return 0; + return selinux_xfrm_delete(ctx); +} - return avc_has_perm(tsec->sid, ctx->ctx_sid, - SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, - NULL); +/* + * LSM hook implementation that allocates a xfrm_sec_state, populates it using + * the supplied security context, and assigns it to the xfrm_state. + */ +int selinux_xfrm_state_alloc(struct xfrm_state *x, + struct xfrm_user_sec_ctx *uctx) +{ + return selinux_xfrm_alloc_user(&x->security, uctx); } /* - * LSM hook implementation that allocs and transfers sec_ctx spec to - * xfrm_state. + * LSM hook implementation that allocates a xfrm_sec_state and populates based + * on a secid. */ -int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx, - u32 secid) +int selinux_xfrm_state_alloc_acquire(struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, u32 secid) { - int err; + int rc; + struct xfrm_sec_ctx *ctx; + char *ctx_str = NULL; + int str_len; + + if (!polsec) + return 0; - BUG_ON(!x); + if (secid == 0) + return -EINVAL; - err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, secid); - if (err == 0) - atomic_inc(&selinux_xfrm_refcount); - return err; + rc = security_sid_to_context(secid, &ctx_str, &str_len); + if (rc) + return rc; + + ctx = kmalloc(sizeof(*ctx) + str_len, GFP_ATOMIC); + if (!ctx) + return -ENOMEM; + + ctx->ctx_doi = XFRM_SC_DOI_LSM; + ctx->ctx_alg = XFRM_SC_ALG_SELINUX; + ctx->ctx_sid = secid; + ctx->ctx_len = str_len; + memcpy(ctx->ctx_str, ctx_str, str_len); + kfree(ctx_str); + + x->security = ctx; + atomic_inc(&selinux_xfrm_refcount); + return 0; } /* @@ -368,24 +347,15 @@ int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uct */ void selinux_xfrm_state_free(struct xfrm_state *x) { - atomic_dec(&selinux_xfrm_refcount); - kfree(x->security); + selinux_xfrm_free(x->security); } - /* - * LSM hook implementation that authorizes deletion of labeled SAs. - */ +/* + * LSM hook implementation that authorizes deletion of labeled SAs. 
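The clone path now duplicates the header and the trailing context string in one kmemdup() call, which works because ctx_str is a flexible trailing array and sizeof(*old_ctx) + old_ctx->ctx_len covers both. A userspace analogue using malloc()+memcpy() (structure names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct sec_ctx {
            unsigned int len;
            char str[];                   /* flexible array member */
    };

    static struct sec_ctx *ctx_clone(const struct sec_ctx *old)
    {
            size_t sz = sizeof(*old) + old->len;
            struct sec_ctx *dup = malloc(sz);

            if (dup)
                    memcpy(dup, old, sz); /* header and string in one copy */
            return dup;
    }

    int main(void)
    {
            struct sec_ctx *c = malloc(sizeof(*c) + 6);
            struct sec_ctx *d;

            c->len = 6;
            memcpy(c->str, "system", 6);
            d = ctx_clone(c);
            printf("%.6s\n", d->str);
            free(c);
            free(d);
            return 0;
    }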
+ */ int selinux_xfrm_state_delete(struct xfrm_state *x) { - const struct task_security_struct *tsec = current_security(); - struct xfrm_sec_ctx *ctx = x->security; - - if (!ctx) - return 0; - - return avc_has_perm(tsec->sid, ctx->ctx_sid, - SECCLASS_ASSOCIATION, ASSOCIATION__SETCONTEXT, - NULL); + return selinux_xfrm_delete(x->security); } /* @@ -395,14 +365,12 @@ int selinux_xfrm_state_delete(struct xfrm_state *x) * we need to check for unlabelled access since this may not have * gone thru the IPSec process. */ -int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, - struct common_audit_data *ad) +int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad) { - int i, rc = 0; - struct sec_path *sp; - u32 sel_sid = SECINITSID_UNLABELED; - - sp = skb->sp; + int i; + struct sec_path *sp = skb->sp; + u32 peer_sid = SECINITSID_UNLABELED; if (sp) { for (i = 0; i < sp->len; i++) { @@ -410,23 +378,17 @@ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, if (x && selinux_authorizable_xfrm(x)) { struct xfrm_sec_ctx *ctx = x->security; - sel_sid = ctx->ctx_sid; + peer_sid = ctx->ctx_sid; break; } } } - /* - * This check even when there's no association involved is - * intended, according to Trent Jaeger, to make sure a - * process can't engage in non-ipsec communication unless - * explicitly allowed by policy. - */ - - rc = avc_has_perm(isec_sid, sel_sid, SECCLASS_ASSOCIATION, - ASSOCIATION__RECVFROM, ad); - - return rc; + /* This check even when there's no association involved is intended, + * according to Trent Jaeger, to make sure a process can't engage in + * non-IPsec communication unless explicitly allowed by policy. */ + return avc_has_perm(sk_sid, peer_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, ad); } /* @@ -436,49 +398,38 @@ int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb, * If we do have a authorizable security association, then it has already been * checked in the selinux_xfrm_state_pol_flow_match hook above. */ -int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb, - struct common_audit_data *ad, u8 proto) +int selinux_xfrm_postroute_last(u32 sk_sid, struct sk_buff *skb, + struct common_audit_data *ad, u8 proto) { struct dst_entry *dst; - int rc = 0; - - dst = skb_dst(skb); - - if (dst) { - struct dst_entry *dst_test; - - for (dst_test = dst; dst_test != NULL; - dst_test = dst_test->child) { - struct xfrm_state *x = dst_test->xfrm; - - if (x && selinux_authorizable_xfrm(x)) - goto out; - } - } switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: case IPPROTO_COMP: - /* - * We should have already seen this packet once before - * it underwent xfrm(s). No need to subject it to the - * unlabeled check. - */ - goto out; + /* We should have already seen this packet once before it + * underwent xfrm(s). No need to subject it to the unlabeled + * check. */ + return 0; default: break; } - /* - * This check even when there's no association involved is - * intended, according to Trent Jaeger, to make sure a - * process can't engage in non-ipsec communication unless - * explicitly allowed by policy. 
- */ + dst = skb_dst(skb); + if (dst) { + struct dst_entry *iter; - rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, - ASSOCIATION__SENDTO, ad); -out: - return rc; + for (iter = dst; iter != NULL; iter = iter->child) { + struct xfrm_state *x = iter->xfrm; + + if (x && selinux_authorizable_xfrm(x)) + return 0; + } + } + + /* This check even when there's no association involved is intended, + * according to Trent Jaeger, to make sure a process can't engage in + * non-IPsec communication unless explicitly allowed by policy. */ + return avc_has_perm(sk_sid, SECINITSID_UNLABELED, + SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, ad); } diff --git a/security/smack/smack.h b/security/smack/smack.h index 076b8e8a51a..364cc64fce7 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -177,9 +177,13 @@ struct smk_port_label { #define SMACK_CIPSO_MAXCATNUM 184 /* 23 * 8 */ /* - * Flag for transmute access + * Flags for untraditional access modes. + * It shouldn't be necessary to avoid conflicts with definitions + * in fs.h, but do so anyway. */ -#define MAY_TRANSMUTE 64 +#define MAY_TRANSMUTE 0x00001000 /* Controls directory labeling */ +#define MAY_LOCK 0x00002000 /* Locks should be writes, but ... */ + /* * Just to make the common cases easier to deal with */ @@ -188,9 +192,9 @@ struct smk_port_label { #define MAY_NOT 0 /* - * Number of access types used by Smack (rwxat) + * Number of access types used by Smack (rwxatl) */ -#define SMK_NUM_ACCESS_TYPE 5 +#define SMK_NUM_ACCESS_TYPE 6 /* SMACK data */ struct smack_audit_data { diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index b3b59b1e93d..14293cd9b1e 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -84,6 +84,8 @@ int log_policy = SMACK_AUDIT_DENIED; * * Do the object check first because that is more * likely to differ. + * + * Allowing write access implies allowing locking. */ int smk_access_entry(char *subject_label, char *object_label, struct list_head *rule_list) @@ -99,6 +101,11 @@ int smk_access_entry(char *subject_label, char *object_label, } } + /* + * MAY_WRITE implies MAY_LOCK. 
+ */ + if ((may & MAY_WRITE) == MAY_WRITE) + may |= MAY_LOCK; return may; } @@ -245,6 +252,7 @@ out_audit: static inline void smack_str_from_perm(char *string, int access) { int i = 0; + if (access & MAY_READ) string[i++] = 'r'; if (access & MAY_WRITE) @@ -255,6 +263,8 @@ static inline void smack_str_from_perm(char *string, int access) string[i++] = 'a'; if (access & MAY_TRANSMUTE) string[i++] = 't'; + if (access & MAY_LOCK) + string[i++] = 'l'; string[i] = '\0'; } /** diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 8825375cc03..b0be893ad44 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -185,7 +185,7 @@ static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode) smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); smk_ad_setfield_u_tsk(&ad, ctp); - rc = smk_curacc(skp->smk_known, MAY_READWRITE, &ad); + rc = smk_curacc(skp->smk_known, mode, &ad); return rc; } @@ -1146,7 +1146,7 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd, * @file: the object * @cmd: unused * - * Returns 0 if current has write access, error code otherwise + * Returns 0 if current has lock access, error code otherwise */ static int smack_file_lock(struct file *file, unsigned int cmd) { @@ -1154,7 +1154,7 @@ static int smack_file_lock(struct file *file, unsigned int cmd) smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH); smk_ad_setfield_u_fs_path(&ad, file->f_path); - return smk_curacc(file->f_security, MAY_WRITE, &ad); + return smk_curacc(file->f_security, MAY_LOCK, &ad); } /** @@ -1178,8 +1178,13 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, switch (cmd) { case F_GETLK: + break; case F_SETLK: case F_SETLKW: + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH); + smk_ad_setfield_u_fs_path(&ad, file->f_path); + rc = smk_curacc(file->f_security, MAY_LOCK, &ad); + break; case F_SETOWN: case F_SETSIG: smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH); diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 80f4b4a4572..160aa08e3cd 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -139,7 +139,7 @@ const char *smack_cipso_option = SMACK_CIPSO_OPTION; * SMK_LOADLEN: Smack rule length */ #define SMK_OACCESS "rwxa" -#define SMK_ACCESS "rwxat" +#define SMK_ACCESS "rwxatl" #define SMK_OACCESSLEN (sizeof(SMK_OACCESS) - 1) #define SMK_ACCESSLEN (sizeof(SMK_ACCESS) - 1) #define SMK_OLOADLEN (SMK_LABELLEN + SMK_LABELLEN + SMK_OACCESSLEN) @@ -282,6 +282,10 @@ static int smk_perm_from_str(const char *string) case 'T': perm |= MAY_TRANSMUTE; break; + case 'l': + case 'L': + perm |= MAY_LOCK; + break; default: return perm; } @@ -452,7 +456,7 @@ static ssize_t smk_write_rules_list(struct file *file, const char __user *buf, /* * Minor hack for backward compatibility */ - if (count != SMK_OLOADLEN && count != SMK_LOADLEN) + if (count < SMK_OLOADLEN || count > SMK_LOADLEN) return -EINVAL; } else { if (count >= PAGE_SIZE) { @@ -592,6 +596,8 @@ static void smk_rule_show(struct seq_file *s, struct smack_rule *srp, int max) seq_putc(s, 'a'); if (srp->smk_access & MAY_TRANSMUTE) seq_putc(s, 't'); + if (srp->smk_access & MAY_LOCK) + seq_putc(s, 'l'); seq_putc(s, '\n'); } diff --git a/sound/soc/davinci/davinci-pcm.c b/sound/soc/davinci/davinci-pcm.c index fa64cd85204..fb5d107f560 100644 --- a/sound/soc/davinci/davinci-pcm.c +++ b/sound/soc/davinci/davinci-pcm.c @@ -238,7 +238,7 @@ static void davinci_pcm_dma_irq(unsigned link, u16 ch_status, void *data) print_buf_info(prtd->ram_channel, "i ram_channel"); 
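The new 'l' access mode introduced in the Smack hunks above is deliberately weaker than write: smk_access_entry() grants MAY_LOCK to any rule that grants MAY_WRITE, while a rule may also grant lock alone, so taking a file lock no longer demands write access. A toy illustration of the bit derivation (constants copied from smack.h):

    #include <stdio.h>

    #define MAY_READ      0x1
    #define MAY_WRITE     0x2
    #define MAY_TRANSMUTE 0x00001000
    #define MAY_LOCK      0x00002000

    static int access_entry(int may)
    {
            if ((may & MAY_WRITE) == MAY_WRITE)
                    may |= MAY_LOCK;      /* MAY_WRITE implies MAY_LOCK */
            return may;
    }

    int main(void)
    {
            printf("w -> %#x\n", access_entry(MAY_WRITE)); /* write and lock */
            printf("l -> %#x\n", access_entry(MAY_LOCK));  /* lock only */
            return 0;
    }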
pr_debug("davinci_pcm: link=%d, status=0x%x\n", link, ch_status); - if (unlikely(ch_status != DMA_COMPLETE)) + if (unlikely(ch_status != EDMA_DMA_COMPLETE)) return; if (snd_pcm_running(substream)) { diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fe702076ca4..9d77f13c2d2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2,7 +2,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel turbo-capable processors. * - * Copyright (c) 2012 Intel Corporation. + * Copyright (c) 2013 Intel Corporation. * Len Brown <len.brown@intel.com> * * This program is free software; you can redistribute it and/or modify it @@ -47,6 +47,8 @@ unsigned int skip_c1; unsigned int do_nhm_cstates; unsigned int do_snb_cstates; unsigned int do_c8_c9_c10; +unsigned int do_slm_cstates; +unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; unsigned int units = 1000000000; /* Ghz etc */ @@ -81,6 +83,8 @@ double rapl_joule_counter_range; #define RAPL_DRAM (1 << 3) #define RAPL_PKG_PERF_STATUS (1 << 4) #define RAPL_DRAM_PERF_STATUS (1 << 5) +#define RAPL_PKG_POWER_INFO (1 << 6) +#define RAPL_CORE_POLICY (1 << 7) #define TJMAX_DEFAULT 100 #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -96,7 +100,7 @@ struct thread_data { unsigned long long tsc; unsigned long long aperf; unsigned long long mperf; - unsigned long long c1; /* derived */ + unsigned long long c1; unsigned long long extra_msr64; unsigned long long extra_delta64; unsigned long long extra_msr32; @@ -266,7 +270,7 @@ void print_header(void) outp += sprintf(outp, " MSR 0x%03X", extra_msr_offset64); if (do_nhm_cstates) outp += sprintf(outp, " %%c1"); - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %%c3"); if (do_nhm_cstates) outp += sprintf(outp, " %%c6"); @@ -280,9 +284,9 @@ void print_header(void) if (do_snb_cstates) outp += sprintf(outp, " %%pc2"); - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %%pc3"); - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %%pc6"); if (do_snb_cstates) outp += sprintf(outp, " %%pc7"); @@ -480,7 +484,7 @@ int format_counters(struct thread_data *t, struct core_data *c, if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) goto done; - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %6.2f", 100.0 * c->c3/t->tsc); if (do_nhm_cstates) outp += sprintf(outp, " %6.2f", 100.0 * c->c6/t->tsc); @@ -499,9 +503,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc2/t->tsc); - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc3/t->tsc); - if (do_nhm_cstates) + if (do_nhm_cstates && !do_slm_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc6/t->tsc); if (do_snb_cstates) outp += sprintf(outp, " %6.2f", 100.0 * p->pc7/t->tsc); @@ -648,17 +652,24 @@ delta_thread(struct thread_data *new, struct thread_data *old, } - /* - * As counter collection is not atomic, - * it is possible for mperf's non-halted cycles + idle states - * to exceed TSC's all cycles: show c1 = 0% in that case. 
- */ - if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > old->tsc) - old->c1 = 0; - else { - /* normal case, derive c1 */ - old->c1 = old->tsc - old->mperf - core_delta->c3 + if (use_c1_residency_msr) { + /* + * Some models have a dedicated C1 residency MSR, + * which should be more accurate than the derivation below. + */ + } else { + /* + * As counter collection is not atomic, + * it is possible for mperf's non-halted cycles + idle states + * to exceed TSC's all cycles: show c1 = 0% in that case. + */ + if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > old->tsc) + old->c1 = 0; + else { + /* normal case, derive c1 */ + old->c1 = old->tsc - old->mperf - core_delta->c3 - core_delta->c6 - core_delta->c7; + } } if (old->mperf == 0) { @@ -872,13 +883,21 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, extra_msr_offset64, &t->extra_msr64)) return -5; + if (use_c1_residency_msr) { + if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) + return -6; + } + /* collect core counters only for 1st thread in core */ if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) return 0; - if (do_nhm_cstates) { + if (do_nhm_cstates && !do_slm_cstates) { if (get_msr(cpu, MSR_CORE_C3_RESIDENCY, &c->c3)) return -6; + } + + if (do_nhm_cstates) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; } @@ -898,7 +917,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; - if (do_nhm_cstates) { + if (do_nhm_cstates && !do_slm_cstates) { if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) return -9; if (get_msr(cpu, MSR_PKG_C6_RESIDENCY, &p->pc6)) @@ -977,7 +996,7 @@ void print_verbose_header(void) ratio, bclk, ratio * bclk); get_msr(0, MSR_IA32_POWER_CTL, &msr); - fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E: %sabled)\n", + fprintf(stderr, "cpu0: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", msr, msr & 0x2 ? "EN" : "DIS"); if (!do_ivt_turbo_ratio_limit) @@ -1046,25 +1065,28 @@ print_nhm_turbo_ratio_limits: switch(msr & 0x7) { case 0: - fprintf(stderr, "pc0"); + fprintf(stderr, do_slm_cstates ? "no pkg states" : "pc0"); break; case 1: - fprintf(stderr, do_snb_cstates ? "pc2" : "pc0"); + fprintf(stderr, do_slm_cstates ? "no pkg states" : do_snb_cstates ? "pc2" : "pc0"); break; case 2: - fprintf(stderr, do_snb_cstates ? "pc6-noret" : "pc3"); + fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc6-noret" : "pc3"); break; case 3: - fprintf(stderr, "pc6"); + fprintf(stderr, do_slm_cstates ? "invalid" : "pc6"); break; case 4: - fprintf(stderr, "pc7"); + fprintf(stderr, do_slm_cstates ? "pc4" : "pc7"); break; case 5: - fprintf(stderr, do_snb_cstates ? "pc7s" : "invalid"); + fprintf(stderr, do_slm_cstates ? "invalid" : do_snb_cstates ? "pc7s" : "invalid"); + break; + case 6: + fprintf(stderr, do_slm_cstates ? "pc6" : "invalid"); break; case 7: - fprintf(stderr, "unlimited"); + fprintf(stderr, do_slm_cstates ? 
"pc7" : "unlimited"); break; default: fprintf(stderr, "invalid"); @@ -1460,6 +1482,8 @@ int has_nehalem_turbo_ratio_limit(unsigned int family, unsigned int model) case 0x3F: /* HSW */ case 0x45: /* HSW */ case 0x46: /* HSW */ + case 0x37: /* BYT */ + case 0x4D: /* AVN */ return 1; case 0x2E: /* Nehalem-EX Xeon - Beckton */ case 0x2F: /* Westmere-EX Xeon - Eagleton */ @@ -1532,14 +1556,33 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +double get_tdp(model) +{ + unsigned long long msr; + + if (do_rapl & RAPL_PKG_POWER_INFO) + if (!get_msr(0, MSR_PKG_POWER_INFO, &msr)) + return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; + + switch (model) { + case 0x37: + case 0x4D: + return 30.0; + default: + return 135.0; + } +} + + /* * rapl_probe() * - * sets do_rapl + * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units */ void rapl_probe(unsigned int family, unsigned int model) { unsigned long long msr; + unsigned int time_unit; double tdp; if (!genuine_intel) @@ -1555,11 +1598,15 @@ void rapl_probe(unsigned int family, unsigned int model) case 0x3F: /* HSW */ case 0x45: /* HSW */ case 0x46: /* HSW */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_GFX; + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; break; case 0x2D: case 0x3E: - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS; + do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + break; + case 0x37: /* BYT */ + case 0x4D: /* AVN */ + do_rapl = RAPL_PKG | RAPL_CORES ; break; default: return; @@ -1570,19 +1617,22 @@ void rapl_probe(unsigned int family, unsigned int model) return; rapl_power_units = 1.0 / (1 << (msr & 0xF)); - rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); + if (model == 0x37) + rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000; + else + rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - /* get TDP to determine energy counter range */ - if (get_msr(0, MSR_PKG_POWER_INFO, &msr)) - return; + time_unit = msr >> 16 & 0xF; + if (time_unit == 0) + time_unit = 0xA; - tdp = ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; + rapl_time_units = 1.0 / (1 << (time_unit)); - rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; + tdp = get_tdp(model); + rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (verbose) - fprintf(stderr, "RAPL: %.0f sec. Joule Counter Range\n", rapl_joule_counter_range); + fprintf(stderr, "RAPL: %.0f sec. 
Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); return; } @@ -1668,7 +1718,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; int cpu; - double local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units; if (!do_rapl) return 0; @@ -1686,23 +1735,13 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) return -1; - local_rapl_power_units = 1.0 / (1 << (msr & 0xF)); - local_rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - local_rapl_time_units = 1.0 / (1 << (msr >> 16 & 0xF)); - - if (local_rapl_power_units != rapl_power_units) - fprintf(stderr, "cpu%d, ERROR: Power units mis-match\n", cpu); - if (local_rapl_energy_units != rapl_energy_units) - fprintf(stderr, "cpu%d, ERROR: Energy units mis-match\n", cpu); - if (local_rapl_time_units != rapl_time_units) - fprintf(stderr, "cpu%d, ERROR: Time units mis-match\n", cpu); - if (verbose) { fprintf(stderr, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx " "(%f Watts, %f Joules, %f sec.)\n", cpu, msr, - local_rapl_power_units, local_rapl_energy_units, local_rapl_time_units); + rapl_power_units, rapl_energy_units, rapl_time_units); } - if (do_rapl & RAPL_PKG) { + if (do_rapl & RAPL_PKG_POWER_INFO) { + if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) return -5; @@ -1714,6 +1753,9 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); + } + if (do_rapl & RAPL_PKG) { + if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; @@ -1749,12 +1791,16 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) print_power_limit_msr(cpu, msr, "DRAM Limit"); } - if (do_rapl & RAPL_CORES) { + if (do_rapl & RAPL_CORE_POLICY) { if (verbose) { if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; fprintf(stderr, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); + } + } + if (do_rapl & RAPL_CORES) { + if (verbose) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; @@ -1813,10 +1859,48 @@ int has_c8_c9_c10(unsigned int family, unsigned int model) } +int is_slm(unsigned int family, unsigned int model) +{ + if (!genuine_intel) + return 0; + switch (model) { + case 0x37: /* BYT */ + case 0x4D: /* AVN */ + return 1; + } + return 0; +} + +#define SLM_BCLK_FREQS 5 +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0}; + +double slm_bclk(void) +{ + unsigned long long msr = 3; + unsigned int i; + double freq; + + if (get_msr(0, MSR_FSB_FREQ, &msr)) + fprintf(stderr, "SLM BCLK: unknown\n"); + + i = msr & 0xf; + if (i >= SLM_BCLK_FREQS) { + fprintf(stderr, "SLM BCLK[%d] invalid\n", i); + msr = 3; + } + freq = slm_freq_table[i]; + + fprintf(stderr, "SLM BCLK: %.1f Mhz\n", freq); + + return freq; +} + double discover_bclk(unsigned int family, unsigned int model) { if (is_snb(family, model)) return 100.00; + else if (is_slm(family, model)) + return slm_bclk(); else return 133.33; } @@ -1873,7 +1957,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk fprintf(stderr, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, target_c_local); - if (target_c_local < 85 || target_c_local > 120) + if (target_c_local < 85 || target_c_local > 127) goto guess; tcc_activation_temp = target_c_local; @@ -1970,6 +2054,7 @@ void check_cpuid() do_smi = do_nhm_cstates; do_snb_cstates = is_snb(family, model); 
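slm_bclk() above derives the bus clock from the low nibble of MSR_FSB_FREQ via a five-entry table. Note that the hunk resets msr to 3 when the index is out of range but still indexes the table with the stale value; the sketch below clamps the index itself, which appears to be the intent (falling back to the 100 MHz entry):

    #include <stdio.h>

    #define SLM_BCLK_FREQS 5
    static const double slm_freq_table[SLM_BCLK_FREQS] = {
            83.3, 100.0, 133.3, 116.7, 80.0
    };

    static double slm_bclk(unsigned long long msr, int msr_ok)
    {
            unsigned int i;

            if (!msr_ok)
                    msr = 3;              /* unreadable MSR: assume 100 MHz */
            i = msr & 0xf;
            if (i >= SLM_BCLK_FREQS)
                    i = 3;                /* out-of-range index: clamp */
            return slm_freq_table[i];
    }

    int main(void)
    {
            printf("%.1f MHz\n", slm_bclk(0, 0));
            return 0;
    }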
do_c8_c9_c10 = has_c8_c9_c10(family, model); + do_slm_cstates = is_slm(family, model); bclk = discover_bclk(family, model); do_nehalem_turbo_ratio_limit = has_nehalem_turbo_ratio_limit(family, model); @@ -2331,7 +2416,7 @@ int main(int argc, char **argv) cmdline(argc, argv); if (verbose) - fprintf(stderr, "turbostat v3.4 April 17, 2013" + fprintf(stderr, "turbostat v3.5 April 26, 2013" " - Len Brown <lenb@kernel.org>\n"); turbostat_init(); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 662f34c3287..a0aa84b5941 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1615,8 +1615,9 @@ EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { - return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page, - offset, len); + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); + + return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); } EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
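The kvm_clear_guest_page() change swaps the direct empty_zero_page reference for the address of ZERO_PAGE(0), which also works on architectures where the zero page is only reachable through the page-table helpers. The underlying idea, clearing guest memory by writing from a shared all-zero source page, looks roughly like this userspace sketch (all names illustrative):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096
    static const unsigned char zero_page[PAGE_SIZE]; /* stands in for ZERO_PAGE(0) */

    static int write_guest(unsigned char *guest, int offset,
                           const void *src, int len)
    {
            memcpy(guest + offset, src, len);
            return 0;
    }

    static int clear_guest_page(unsigned char *guest, int offset, int len)
    {
            /* clear by copying from the shared zero page */
            return write_guest(guest, offset, zero_page, len);
    }

    int main(void)
    {
            unsigned char page[PAGE_SIZE];

            memset(page, 0xff, sizeof(page));
            clear_guest_page(page, 128, 64);
            printf("%d %d\n", page[127], page[128]); /* 255 0 */
            return 0;
    }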